Optimize MDEC encoder, use ffmpeg DCT implementation
This commit is contained in:
parent
87b0fe3f2a
commit
302989badf
|
@ -1,6 +1,6 @@
|
|||
project('psxavenc', 'c', default_options: ['c_std=c11'])
|
||||
|
||||
add_project_arguments('-D_POSIX_C_SOURCE=201112L', language : 'c')
|
||||
add_project_arguments('-D_POSIX_C_SOURCE=201112L', '-ffast-math', language : 'c')
|
||||
|
||||
conf_data = configuration_data()
|
||||
conf_data.set('VERSION', '"' + run_command('git', '-C', meson.project_source_root(), 'describe', '--tags', '--always', '--dirty', '--match=v*', check: true).stdout().strip() + '"')
|
||||
|
|
|
@ -34,6 +34,7 @@ freely, subject to the following restrictions:
|
|||
|
||||
#include <libavutil/opt.h>
|
||||
#include <libavcodec/avcodec.h>
|
||||
#include <libavcodec/avdct.h>
|
||||
#include <libavformat/avformat.h>
|
||||
#include <libswscale/swscale.h>
|
||||
#include <libswresample/swresample.h>
|
||||
|
@ -65,7 +66,11 @@ typedef struct {
|
|||
int uncomp_hwords_used;
|
||||
int quant_scale;
|
||||
int quant_scale_sum;
|
||||
float *dct_block_lists[6];
|
||||
|
||||
uint32_t *huffman_encoding_map;
|
||||
int16_t *coeff_clamp_map;
|
||||
int16_t *dct_block_lists[6];
|
||||
AVDCT *dct_context;
|
||||
} vid_encoder_state_t;
|
||||
|
||||
typedef struct {
|
||||
|
@ -142,5 +147,7 @@ void encode_file_str(settings_t *settings, FILE *output);
|
|||
void encode_file_sbs(settings_t *settings, FILE *output);
|
||||
|
||||
// mdec.c
|
||||
bool init_encoder_state(settings_t *settings);
|
||||
void destroy_encoder_state(settings_t *settings);
|
||||
void encode_frame_bs(uint8_t *video_frame, settings_t *settings);
|
||||
void encode_sector_str(uint8_t *video_frames, uint8_t *output, settings_t *settings);
|
||||
|
|
|
@ -299,6 +299,7 @@ void encode_file_str(settings_t *settings, FILE *output) {
|
|||
fprintf(stderr, "Frame size: %.2f sectors\n", frame_size);
|
||||
}
|
||||
|
||||
init_encoder_state(settings);
|
||||
settings->state_vid.frame_output = malloc(2016 * (int)ceil(frame_size));
|
||||
settings->state_vid.frame_index = 0;
|
||||
settings->state_vid.frame_data_offset = 0;
|
||||
|
@ -362,9 +363,11 @@ void encode_file_str(settings_t *settings, FILE *output) {
|
|||
}
|
||||
|
||||
free(settings->state_vid.frame_output);
|
||||
destroy_encoder_state(settings);
|
||||
}
|
||||
|
||||
void encode_file_sbs(settings_t *settings, FILE *output) {
|
||||
init_encoder_state(settings);
|
||||
settings->state_vid.frame_output = malloc(settings->alignment);
|
||||
settings->state_vid.frame_data_offset = 0;
|
||||
settings->state_vid.frame_max_size = settings->alignment;
|
||||
|
@ -385,4 +388,5 @@ void encode_file_sbs(settings_t *settings, FILE *output) {
|
|||
}
|
||||
|
||||
free(settings->state_vid.frame_output);
|
||||
destroy_encoder_state(settings);
|
||||
}
|
||||
|
|
182
psxavenc/mdec.c
182
psxavenc/mdec.c
|
@ -24,11 +24,6 @@ freely, subject to the following restrictions:
|
|||
|
||||
#include "common.h"
|
||||
|
||||
// high 8 bits = bit count
|
||||
// low 24 bits = value
|
||||
uint32_t huffman_encoding_map[0x10000];
|
||||
bool dct_done_init = false;
|
||||
|
||||
#define MAKE_HUFFMAN_PAIR(zeroes, value) (((zeroes)<<10)|((+(value))&0x3FF)),(((zeroes)<<10)|((-(value))&0x3FF))
|
||||
const struct {
|
||||
int c_bits;
|
||||
|
@ -195,17 +190,24 @@ const int16_t dct_scale_table[8*8] = {
|
|||
+0x18F8, -0x471D, +0x6A6D, -0x7D8B, +0x7D8A, -0x6A6E, +0x471C, -0x18F9,
|
||||
};
|
||||
|
||||
static void init_dct_data(void)
|
||||
static void init_dct_data(vid_encoder_state_t *state)
|
||||
{
|
||||
for(int i = 0; i <= 0xFFFF; i++) {
|
||||
huffman_encoding_map[i] = ((6+16)<<24)|((0x01<<16)|(i));
|
||||
// high 8 bits = bit count
|
||||
// low 24 bits = value
|
||||
state->huffman_encoding_map[i] = ((6+16)<<24)|((0x01<<16)|(i));
|
||||
|
||||
int16_t coeff = (int16_t)i;
|
||||
if (coeff < -0x200) { coeff = -0x200; }
|
||||
if (coeff > +0x1FF) { coeff = +0x1FF; }
|
||||
state->coeff_clamp_map[i] = coeff&0x3FF;
|
||||
}
|
||||
|
||||
for(int i = 0; i < sizeof(huffman_lookup)/sizeof(huffman_lookup[0]); i++) {
|
||||
int bits = huffman_lookup[i].c_bits+1;
|
||||
uint32_t base_value = huffman_lookup[i].c_value;
|
||||
huffman_encoding_map[huffman_lookup[i].u_hword_pos] = (bits<<24)|(base_value<<1)|0;
|
||||
huffman_encoding_map[huffman_lookup[i].u_hword_neg] = (bits<<24)|(base_value<<1)|1;
|
||||
state->huffman_encoding_map[huffman_lookup[i].u_hword_pos] = (bits<<24)|(base_value<<1)|0;
|
||||
state->huffman_encoding_map[huffman_lookup[i].u_hword_neg] = (bits<<24)|(base_value<<1)|1;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -296,80 +298,71 @@ static bool encode_ac_value(vid_encoder_state_t *state, uint16_t value)
|
|||
// Use an escape
|
||||
return encode_bits(state, 6+16, (0x01<<16)|(0xFFFF&(uint32_t)value));
|
||||
#else
|
||||
uint32_t outword = huffman_encoding_map[value];
|
||||
uint32_t outword = state->huffman_encoding_map[value];
|
||||
return encode_bits(state, outword>>24, outword&0xFFFFFF);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void transform_dct_block(vid_encoder_state_t *state, float *block)
|
||||
static void transform_dct_block(vid_encoder_state_t *state, int16_t *block)
|
||||
{
|
||||
#if 0
|
||||
// Apply DCT to block
|
||||
float midblock[8*8];
|
||||
int midblock[8*8];
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
for (int j = 0; j < 8; j++) {
|
||||
float v = 0.0f;
|
||||
int v = 0;
|
||||
for(int k = 0; k < 8; k++) {
|
||||
v += block[8*j+k] * (float)dct_scale_table[8*i+k] / (float)(1 << 16);
|
||||
v += (int)block[8*j+k] * (int)dct_scale_table[8*i+k] / 8;
|
||||
}
|
||||
midblock[8*i+j] = v;
|
||||
midblock[8*i+j] = (v + 0xFFF) >> 13;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < 8; i++) {
|
||||
for (int j = 0; j < 8; j++) {
|
||||
float v = 0.0f;
|
||||
int v = 0;
|
||||
for(int k = 0; k < 8; k++) {
|
||||
v += midblock[8*j+k] * (float)dct_scale_table[8*i+k] / (float)(1 << 16);
|
||||
v += (int)midblock[8*j+k] * (int)dct_scale_table[8*i+k];
|
||||
}
|
||||
block[8*i+j] = v;
|
||||
block[8*i+j] = (int16_t)((v + 0xFFF) >> 13);
|
||||
}
|
||||
}
|
||||
#else
|
||||
state->dct_context->fdct(block);
|
||||
#endif
|
||||
}
|
||||
|
||||
static bool encode_dct_block(vid_encoder_state_t *state, float *block)
|
||||
// https://stackoverflow.com/a/60011209
|
||||
//#define DIVIDE_ROUNDED(n, d) (((n) >= 0) ? (((n) + (d)/2) / (d)) : (((n) - (d)/2) / (d)))
|
||||
// Divides n by d and rounds the quotient to the nearest integer. The division
// is carried out in double precision and the result goes through round(), so
// halfway cases are rounded away from zero. Each argument is evaluated once.
#define DIVIDE_ROUNDED(n, d) ((int)round((double)(n) / (double)(d)))
|
||||
|
||||
static bool encode_dct_block(vid_encoder_state_t *state, const int16_t *block, const int16_t *quant_table)
|
||||
{
|
||||
int16_t coeffs[64];
|
||||
float scale = 8.0f / (float)state->quant_scale;
|
||||
int dc = DIVIDE_ROUNDED(block[0], quant_table[0]);
|
||||
dc = state->coeff_clamp_map[dc&0xFFFF];
|
||||
|
||||
for (int i = 0; i < 64; i++) {
|
||||
// The DC coefficient is not affected by the quantization scale.
|
||||
float x = block[i];
|
||||
if (i) { x *= scale; }
|
||||
|
||||
int v = (int)roundf(x / (float)quant_dec[i]);
|
||||
if (v < -0x200) { v = -0x200; }
|
||||
if (v > +0x1FF) { v = +0x1FF; }
|
||||
coeffs[i] = v;
|
||||
}
|
||||
|
||||
if (!encode_bits(state, 10, coeffs[0]&0x3FF)) {
|
||||
if (!encode_bits(state, 10, dc)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Build RLE output
|
||||
uint16_t zero_rle_data[8*8];
|
||||
int zero_rle_words = 0;
|
||||
for (int i = 1, zeroes = 0; i < 64; i++) {
|
||||
int ri = dct_zagzig_table[i];
|
||||
//int ri = dct_zigzag_table[i];
|
||||
if (coeffs[ri] == 0) {
|
||||
int ac = DIVIDE_ROUNDED(block[ri], quant_table[ri]);
|
||||
ac = state->coeff_clamp_map[ac&0xFFFF];
|
||||
|
||||
if (ac == 0) {
|
||||
zeroes++;
|
||||
} else {
|
||||
zero_rle_data[zero_rle_words++] = (zeroes<<10)|(coeffs[ri]&0x3FF);
|
||||
if (!encode_ac_value(state, (zeroes<<10)|ac)) {
|
||||
return false;
|
||||
}
|
||||
zeroes = 0;
|
||||
state->uncomp_hwords_used += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Now Huffman-code the data
|
||||
for (int i = 0; i < zero_rle_words; i++) {
|
||||
if (!encode_ac_value(state, zero_rle_data[i])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
//fprintf(stderr, "dc %08X rles %2d\n", coeffs[0], zero_rle_words);
|
||||
//assert(coeffs[0] >= -0x200); assert(coeffs[0] < +0x200);
|
||||
//fprintf(stderr, "dc %08X rles %2d\n", dc, zero_rle_words);
|
||||
//assert(dc >= -0x200); assert(dc < +0x200);
|
||||
|
||||
// Store end of block
|
||||
if (!encode_bits(state, 2, 0x2)) {
|
||||
|
@ -404,6 +397,61 @@ static int reduce_dct_block(vid_encoder_state_t *state, int32_t *block, int32_t
|
|||
}
|
||||
#endif
|
||||
|
||||
// Allocates and fills the video encoder state held in settings->state_vid:
// the Huffman encoding map, the coefficient clamp map, the six DCT block
// lists (sized from settings->video_width / video_height) and the libavcodec
// DCT context.
//
// Returns true on success, or right away if the state is already initialized
// (huffman_encoding_map is non-NULL). Returns false if an allocation fails;
// in that case every buffer allocated by this call is freed and its pointer
// reset to NULL, so the state is never left half-built and a later call can
// retry instead of being mistaken for "already initialized".
bool init_encoder_state(settings_t *settings)
{
	// A non-NULL Huffman map marks the state as already initialized.
	if (settings->state_vid.huffman_encoding_map) {
		return true;
	}

	int dct_block_count_x = (settings->video_width+15)/16;
	int dct_block_count_y = (settings->video_height+15)/16;
	// Each list holds one 8x8 block of int16_t per 16x16 pixel area.
	// Computed as size_t so the byte count is not narrowed into an int.
	size_t dct_block_size = (size_t)dct_block_count_x*(size_t)dct_block_count_y*sizeof(int16_t)*8*8;
	// Number of block lists this call has allocated so far (for cleanup).
	int lists_allocated = 0;

	settings->state_vid.huffman_encoding_map = malloc(0x10000*sizeof(uint32_t));
	settings->state_vid.coeff_clamp_map = malloc(0x10000*sizeof(int16_t));
	if (!settings->state_vid.huffman_encoding_map || !settings->state_vid.coeff_clamp_map) {
		goto fail;
	}
	init_dct_data(&(settings->state_vid));

	for (int i = 0; i < 6; i++) {
		settings->state_vid.dct_block_lists[i] = malloc(dct_block_size);
		if (!settings->state_vid.dct_block_lists[i]) {
			goto fail;
		}
		lists_allocated++;
	}

	// The DCT context is allocated last so that the failure path below only
	// ever has to release malloc()ed memory.
	settings->state_vid.dct_context = avcodec_dct_alloc();
	if (!settings->state_vid.dct_context) {
		goto fail;
	}
	// NOTE(review): the return value of avcodec_dct_init() is not checked,
	// as in the previous version of this function.
	avcodec_dct_init(settings->state_vid.dct_context);

	return true;

fail:
	for (int i = 0; i < lists_allocated; i++) {
		free(settings->state_vid.dct_block_lists[i]);
		settings->state_vid.dct_block_lists[i] = NULL;
	}
	free(settings->state_vid.coeff_clamp_map);
	settings->state_vid.coeff_clamp_map = NULL;
	// Clearing the Huffman map also clears the "already initialized" marker.
	free(settings->state_vid.huffman_encoding_map);
	settings->state_vid.huffman_encoding_map = NULL;
	return false;
}
|
||||
|
||||
// Releases the buffers and the DCT context stored in settings->state_vid and
// resets each pointer to NULL afterwards. Pointers that are already NULL are
// skipped harmlessly, so the function may be called on a state whose pointers
// were all NULL to begin with, and may be called more than once.
void destroy_encoder_state(settings_t *settings)
{
	// free() and av_free() both accept NULL, so no guards are needed.
	free(settings->state_vid.huffman_encoding_map);
	settings->state_vid.huffman_encoding_map = NULL;

	free(settings->state_vid.coeff_clamp_map);
	settings->state_vid.coeff_clamp_map = NULL;

	av_free(settings->state_vid.dct_context);
	settings->state_vid.dct_context = NULL;

	// Release every list on its own instead of keying the whole loop off
	// slot 0, so no slot can be leaked when slot 0 happens to be NULL.
	for (int i = 0; i < 6; i++) {
		free(settings->state_vid.dct_block_lists[i]);
		settings->state_vid.dct_block_lists[i] = NULL;
	}
}
|
||||
|
||||
void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
|
||||
{
|
||||
int pitch = settings->video_width;
|
||||
|
@ -415,21 +463,11 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
|
|||
uint8_t *y_plane = video_frame;
|
||||
uint8_t *c_plane = y_plane + (settings->video_width*settings->video_height);
|
||||
|
||||
if (!dct_done_init) {
|
||||
init_dct_data();
|
||||
dct_done_init = true;
|
||||
}
|
||||
assert(settings->state_vid.huffman_encoding_map);
|
||||
|
||||
int dct_block_count_x = (settings->video_width+15)/16;
|
||||
int dct_block_count_y = (settings->video_height+15)/16;
|
||||
|
||||
if (settings->state_vid.dct_block_lists[0] == NULL) {
|
||||
int dct_block_size = dct_block_count_x*dct_block_count_y*sizeof(float)*8*8;
|
||||
for (int i = 0; i < 6; i++) {
|
||||
settings->state_vid.dct_block_lists[i] = malloc(dct_block_size);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: non-16x16-aligned videos
|
||||
assert((settings->video_width % 16) == 0);
|
||||
assert((settings->video_height % 16) == 0);
|
||||
|
@ -439,7 +477,7 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
|
|||
for(int fy = 0; fy < dct_block_count_y; fy++) {
|
||||
// Order: Cr Cb [Y1|Y2\nY3|Y4]
|
||||
int block_offs = 64 * (fy*dct_block_count_x + fx);
|
||||
float *blocks[6] = {
|
||||
int16_t *blocks[6] = {
|
||||
settings->state_vid.dct_block_lists[0] + block_offs,
|
||||
settings->state_vid.dct_block_lists[1] + block_offs,
|
||||
settings->state_vid.dct_block_lists[2] + block_offs,
|
||||
|
@ -456,12 +494,12 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
|
|||
int lx = fx*16 + x;
|
||||
int ly = fy*16 + y;
|
||||
|
||||
blocks[0][k] = (float)c_plane[pitch*cy + 2*cx + 0] - 128.0f;
|
||||
blocks[1][k] = (float)c_plane[pitch*cy + 2*cx + 1] - 128.0f;
|
||||
blocks[2][k] = (float)y_plane[pitch*(ly+0) + (lx+0)] - 128.0f;
|
||||
blocks[3][k] = (float)y_plane[pitch*(ly+0) + (lx+8)] - 128.0f;
|
||||
blocks[4][k] = (float)y_plane[pitch*(ly+8) + (lx+0)] - 128.0f;
|
||||
blocks[5][k] = (float)y_plane[pitch*(ly+8) + (lx+8)] - 128.0f;
|
||||
blocks[0][k] = (int16_t)c_plane[pitch*cy + 2*cx + 0] - 128;
|
||||
blocks[1][k] = (int16_t)c_plane[pitch*cy + 2*cx + 1] - 128;
|
||||
blocks[2][k] = (int16_t)y_plane[pitch*(ly+0) + (lx+0)] - 128;
|
||||
blocks[3][k] = (int16_t)y_plane[pitch*(ly+0) + (lx+8)] - 128;
|
||||
blocks[4][k] = (int16_t)y_plane[pitch*(ly+8) + (lx+0)] - 128;
|
||||
blocks[5][k] = (int16_t)y_plane[pitch*(ly+8) + (lx+8)] - 128;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -482,6 +520,14 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
|
|||
settings->state_vid.quant_scale < 64;
|
||||
settings->state_vid.quant_scale++
|
||||
) {
|
||||
int16_t quant_table[8*8];
|
||||
|
||||
// The DC coefficient's quantization scale is always 8.
|
||||
quant_table[0] = quant_dec[0] * 8;
|
||||
for (int i = 1; i < 64; i++) {
|
||||
quant_table[i] = quant_dec[i] * settings->state_vid.quant_scale;
|
||||
}
|
||||
|
||||
memset(settings->state_vid.frame_output, 0, settings->state_vid.frame_max_size);
|
||||
|
||||
settings->state_vid.bits_value = 0;
|
||||
|
@ -494,7 +540,7 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
|
|||
for(int fy = 0; ok && (fy < dct_block_count_y); fy++) {
|
||||
// Order: Cr Cb [Y1|Y2\nY3|Y4]
|
||||
int block_offs = 64 * (fy*dct_block_count_x + fx);
|
||||
float *blocks[6] = {
|
||||
int16_t *blocks[6] = {
|
||||
settings->state_vid.dct_block_lists[0] + block_offs,
|
||||
settings->state_vid.dct_block_lists[1] + block_offs,
|
||||
settings->state_vid.dct_block_lists[2] + block_offs,
|
||||
|
@ -504,7 +550,7 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
|
|||
};
|
||||
|
||||
for(int i = 0; ok && (i < 6); i++) {
|
||||
ok = encode_dct_block(&(settings->state_vid), blocks[i]);
|
||||
ok = encode_dct_block(&(settings->state_vid), blocks[i], quant_table);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -374,6 +374,9 @@ int main(int argc, char **argv) {
|
|||
settings.video_frames = NULL;
|
||||
settings.video_frame_count = 0;
|
||||
|
||||
settings.state_vid.huffman_encoding_map = NULL;
|
||||
settings.state_vid.coeff_clamp_map = NULL;
|
||||
settings.state_vid.dct_context = NULL;
|
||||
for(int i = 0; i < 6; i++) {
|
||||
settings.state_vid.dct_block_lists[i] = NULL;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue