diff --git a/meson.build b/meson.build index 500703d..99b6249 100644 --- a/meson.build +++ b/meson.build @@ -1,6 +1,6 @@ project('psxavenc', 'c', default_options: ['c_std=c11']) -add_project_arguments('-D_POSIX_C_SOURCE=201112L', language : 'c') +add_project_arguments('-D_POSIX_C_SOURCE=201112L', '-ffast-math', language : 'c') conf_data = configuration_data() conf_data.set('VERSION', '"' + run_command('git', '-C', meson.project_source_root(), 'describe', '--tags', '--always', '--dirty', '--match=v*', check: true).stdout().strip() + '"') diff --git a/psxavenc/common.h b/psxavenc/common.h index bc75f6f..343448a 100644 --- a/psxavenc/common.h +++ b/psxavenc/common.h @@ -34,6 +34,7 @@ freely, subject to the following restrictions: #include #include +#include #include #include #include @@ -65,7 +66,11 @@ typedef struct { int uncomp_hwords_used; int quant_scale; int quant_scale_sum; - float *dct_block_lists[6]; + + uint32_t *huffman_encoding_map; + int16_t *coeff_clamp_map; + int16_t *dct_block_lists[6]; + AVDCT *dct_context; } vid_encoder_state_t; typedef struct { @@ -142,5 +147,7 @@ void encode_file_str(settings_t *settings, FILE *output); void encode_file_sbs(settings_t *settings, FILE *output); // mdec.c +bool init_encoder_state(settings_t *settings); +void destroy_encoder_state(settings_t *settings); void encode_frame_bs(uint8_t *video_frame, settings_t *settings); void encode_sector_str(uint8_t *video_frames, uint8_t *output, settings_t *settings); diff --git a/psxavenc/filefmt.c b/psxavenc/filefmt.c index 0b8ef35..247ca19 100644 --- a/psxavenc/filefmt.c +++ b/psxavenc/filefmt.c @@ -299,6 +299,7 @@ void encode_file_str(settings_t *settings, FILE *output) { fprintf(stderr, "Frame size: %.2f sectors\n", frame_size); } + init_encoder_state(settings); settings->state_vid.frame_output = malloc(2016 * (int)ceil(frame_size)); settings->state_vid.frame_index = 0; settings->state_vid.frame_data_offset = 0; @@ -362,9 +363,11 @@ void encode_file_str(settings_t *settings, FILE *output) { } free(settings->state_vid.frame_output); + destroy_encoder_state(settings); } void encode_file_sbs(settings_t *settings, FILE *output) { + init_encoder_state(settings); settings->state_vid.frame_output = malloc(settings->alignment); settings->state_vid.frame_data_offset = 0; settings->state_vid.frame_max_size = settings->alignment; @@ -385,4 +388,5 @@ void encode_file_sbs(settings_t *settings, FILE *output) { } free(settings->state_vid.frame_output); + destroy_encoder_state(settings); } diff --git a/psxavenc/mdec.c b/psxavenc/mdec.c index c88964e..6abffea 100644 --- a/psxavenc/mdec.c +++ b/psxavenc/mdec.c @@ -24,11 +24,6 @@ freely, subject to the following restrictions: #include "common.h" -// high 8 bits = bit count -// low 24 bits = value -uint32_t huffman_encoding_map[0x10000]; -bool dct_done_init = false; - #define MAKE_HUFFMAN_PAIR(zeroes, value) (((zeroes)<<10)|((+(value))&0x3FF)),(((zeroes)<<10)|((-(value))&0x3FF)) const struct { int c_bits; @@ -195,17 +190,24 @@ const int16_t dct_scale_table[8*8] = { +0x18F8, -0x471D, +0x6A6D, -0x7D8B, +0x7D8A, -0x6A6E, +0x471C, -0x18F9, }; -static void init_dct_data(void) +static void init_dct_data(vid_encoder_state_t *state) { for(int i = 0; i <= 0xFFFF; i++) { - huffman_encoding_map[i] = ((6+16)<<24)|((0x01<<16)|(i)); + // high 8 bits = bit count + // low 24 bits = value + state->huffman_encoding_map[i] = ((6+16)<<24)|((0x01<<16)|(i)); + + int16_t coeff = (int16_t)i; + if (coeff < -0x200) { coeff = -0x200; } + if (coeff > +0x1FF) { coeff = +0x1FF; } + state->coeff_clamp_map[i] = coeff&0x3FF; } for(int i = 0; i < sizeof(huffman_lookup)/sizeof(huffman_lookup[0]); i++) { int bits = huffman_lookup[i].c_bits+1; uint32_t base_value = huffman_lookup[i].c_value; - huffman_encoding_map[huffman_lookup[i].u_hword_pos] = (bits<<24)|(base_value<<1)|0; - huffman_encoding_map[huffman_lookup[i].u_hword_neg] = (bits<<24)|(base_value<<1)|1; + state->huffman_encoding_map[huffman_lookup[i].u_hword_pos] = (bits<<24)|(base_value<<1)|0; + state->huffman_encoding_map[huffman_lookup[i].u_hword_neg] = (bits<<24)|(base_value<<1)|1; } } @@ -296,80 +298,71 @@ static bool encode_ac_value(vid_encoder_state_t *state, uint16_t value) // Use an escape return encode_bits(state, 6+16, (0x01<<16)|(0xFFFF&(uint32_t)value)); #else - uint32_t outword = huffman_encoding_map[value]; + uint32_t outword = state->huffman_encoding_map[value]; return encode_bits(state, outword>>24, outword&0xFFFFFF); #endif } -static void transform_dct_block(vid_encoder_state_t *state, float *block) +static void transform_dct_block(vid_encoder_state_t *state, int16_t *block) { +#if 0 // Apply DCT to block - float midblock[8*8]; + int midblock[8*8]; for (int i = 0; i < 8; i++) { for (int j = 0; j < 8; j++) { - float v = 0.0f; + int v = 0; for(int k = 0; k < 8; k++) { - v += block[8*j+k] * (float)dct_scale_table[8*i+k] / (float)(1 << 16); + v += (int)block[8*j+k] * (int)dct_scale_table[8*i+k] / 8; } - midblock[8*i+j] = v; + midblock[8*i+j] = (v + 0xFFF) >> 13; } } for (int i = 0; i < 8; i++) { for (int j = 0; j < 8; j++) { - float v = 0.0f; + int v = 0; for(int k = 0; k < 8; k++) { - v += midblock[8*j+k] * (float)dct_scale_table[8*i+k] / (float)(1 << 16); + v += (int)midblock[8*j+k] * (int)dct_scale_table[8*i+k]; } - block[8*i+j] = v; + block[8*i+j] = (int16_t)((v + 0xFFF) >> 13); } } +#else + state->dct_context->fdct(block); +#endif } -static bool encode_dct_block(vid_encoder_state_t *state, float *block) +// https://stackoverflow.com/a/60011209 +//#define DIVIDE_ROUNDED(n, d) (((n) >= 0) ? (((n) + (d)/2) / (d)) : (((n) - (d)/2) / (d))) +#define DIVIDE_ROUNDED(n, d) ((int)round((double)(n) / (double)(d))) + +static bool encode_dct_block(vid_encoder_state_t *state, const int16_t *block, const int16_t *quant_table) { - int16_t coeffs[64]; - float scale = 8.0f / (float)state->quant_scale; + int dc = DIVIDE_ROUNDED(block[0], quant_table[0]); + dc = state->coeff_clamp_map[dc&0xFFFF]; - for (int i = 0; i < 64; i++) { - // The DC coefficient is not affected by the quantization scale. - float x = block[i]; - if (i) { x *= scale; } - - int v = (int)roundf(x / (float)quant_dec[i]); - if (v < -0x200) { v = -0x200; } - if (v > +0x1FF) { v = +0x1FF; } - coeffs[i] = v; - } - - if (!encode_bits(state, 10, coeffs[0]&0x3FF)) { + if (!encode_bits(state, 10, dc)) { return false; } - // Build RLE output - uint16_t zero_rle_data[8*8]; - int zero_rle_words = 0; for (int i = 1, zeroes = 0; i < 64; i++) { int ri = dct_zagzig_table[i]; - //int ri = dct_zigzag_table[i]; - if (coeffs[ri] == 0) { + int ac = DIVIDE_ROUNDED(block[ri], quant_table[ri]); + ac = state->coeff_clamp_map[ac&0xFFFF]; + + if (ac == 0) { zeroes++; } else { - zero_rle_data[zero_rle_words++] = (zeroes<<10)|(coeffs[ri]&0x3FF); + if (!encode_ac_value(state, (zeroes<<10)|ac)) { + return false; + } zeroes = 0; state->uncomp_hwords_used += 1; } } - // Now Huffman-code the data - for (int i = 0; i < zero_rle_words; i++) { - if (!encode_ac_value(state, zero_rle_data[i])) { - return false; - } - } - - //fprintf(stderr, "dc %08X rles %2d\n", coeffs[0], zero_rle_words); - //assert(coeffs[0] >= -0x200); assert(coeffs[0] < +0x200); + //fprintf(stderr, "dc %08X rles %2d\n", dc, zero_rle_words); + //assert(dc >= -0x200); assert(dc < +0x200); // Store end of block if (!encode_bits(state, 2, 0x2)) { @@ -404,6 +397,61 @@ static int reduce_dct_block(vid_encoder_state_t *state, int32_t *block, int32_t } #endif +bool init_encoder_state(settings_t *settings) +{ + if (settings->state_vid.huffman_encoding_map) { + return true; + } + + settings->state_vid.huffman_encoding_map = malloc(0x10000*sizeof(uint32_t)); + settings->state_vid.coeff_clamp_map = malloc(0x10000*sizeof(int16_t)); + if (!settings->state_vid.huffman_encoding_map || !settings->state_vid.coeff_clamp_map) { + return false; + } + init_dct_data(&(settings->state_vid)); + + settings->state_vid.dct_context = avcodec_dct_alloc(); + if (!settings->state_vid.dct_context) { + return false; + } + avcodec_dct_init(settings->state_vid.dct_context); + + int dct_block_count_x = (settings->video_width+15)/16; + int dct_block_count_y = (settings->video_height+15)/16; + + int dct_block_size = dct_block_count_x*dct_block_count_y*sizeof(int16_t)*8*8; + for (int i = 0; i < 6; i++) { + settings->state_vid.dct_block_lists[i] = malloc(dct_block_size); + if (!settings->state_vid.dct_block_lists[i]) { + return false; + } + } + + return true; +} + +void destroy_encoder_state(settings_t *settings) +{ + if (settings->state_vid.huffman_encoding_map) { + free(settings->state_vid.huffman_encoding_map); + settings->state_vid.huffman_encoding_map = NULL; + } + if (settings->state_vid.coeff_clamp_map) { + free(settings->state_vid.coeff_clamp_map); + settings->state_vid.coeff_clamp_map = NULL; + } + if (settings->state_vid.dct_context) { + av_free(settings->state_vid.dct_context); + settings->state_vid.dct_context = NULL; + } + if (settings->state_vid.dct_block_lists[0]) { + for (int i = 0; i < 6; i++) { + free(settings->state_vid.dct_block_lists[i]); + settings->state_vid.dct_block_lists[i] = NULL; + } + } +} + void encode_frame_bs(uint8_t *video_frame, settings_t *settings) { int pitch = settings->video_width; @@ -415,21 +463,11 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings) uint8_t *y_plane = video_frame; uint8_t *c_plane = y_plane + (settings->video_width*settings->video_height); - if (!dct_done_init) { - init_dct_data(); - dct_done_init = true; - } + assert(settings->state_vid.huffman_encoding_map); int dct_block_count_x = (settings->video_width+15)/16; int dct_block_count_y = (settings->video_height+15)/16; - if (settings->state_vid.dct_block_lists[0] == NULL) { - int dct_block_size = dct_block_count_x*dct_block_count_y*sizeof(float)*8*8; - for (int i = 0; i < 6; i++) { - settings->state_vid.dct_block_lists[i] = malloc(dct_block_size); - } - } - // TODO: non-16x16-aligned videos assert((settings->video_width % 16) == 0); assert((settings->video_height % 16) == 0); @@ -439,7 +477,7 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings) for(int fy = 0; fy < dct_block_count_y; fy++) { // Order: Cr Cb [Y1|Y2\nY3|Y4] int block_offs = 64 * (fy*dct_block_count_x + fx); - float *blocks[6] = { + int16_t *blocks[6] = { settings->state_vid.dct_block_lists[0] + block_offs, settings->state_vid.dct_block_lists[1] + block_offs, settings->state_vid.dct_block_lists[2] + block_offs, @@ -456,12 +494,12 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings) int lx = fx*16 + x; int ly = fy*16 + y; - blocks[0][k] = (float)c_plane[pitch*cy + 2*cx + 0] - 128.0f; - blocks[1][k] = (float)c_plane[pitch*cy + 2*cx + 1] - 128.0f; - blocks[2][k] = (float)y_plane[pitch*(ly+0) + (lx+0)] - 128.0f; - blocks[3][k] = (float)y_plane[pitch*(ly+0) + (lx+8)] - 128.0f; - blocks[4][k] = (float)y_plane[pitch*(ly+8) + (lx+0)] - 128.0f; - blocks[5][k] = (float)y_plane[pitch*(ly+8) + (lx+8)] - 128.0f; + blocks[0][k] = (int16_t)c_plane[pitch*cy + 2*cx + 0] - 128; + blocks[1][k] = (int16_t)c_plane[pitch*cy + 2*cx + 1] - 128; + blocks[2][k] = (int16_t)y_plane[pitch*(ly+0) + (lx+0)] - 128; + blocks[3][k] = (int16_t)y_plane[pitch*(ly+0) + (lx+8)] - 128; + blocks[4][k] = (int16_t)y_plane[pitch*(ly+8) + (lx+0)] - 128; + blocks[5][k] = (int16_t)y_plane[pitch*(ly+8) + (lx+8)] - 128; } } @@ -482,6 +520,14 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings) settings->state_vid.quant_scale < 64; settings->state_vid.quant_scale++ ) { + int16_t quant_table[8*8]; + + // The DC coefficient's quantization scale is always 8. + quant_table[0] = quant_dec[0] * 8; + for (int i = 1; i < 64; i++) { + quant_table[i] = quant_dec[i] * settings->state_vid.quant_scale; + } + memset(settings->state_vid.frame_output, 0, settings->state_vid.frame_max_size); settings->state_vid.bits_value = 0; @@ -494,7 +540,7 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings) for(int fy = 0; ok && (fy < dct_block_count_y); fy++) { // Order: Cr Cb [Y1|Y2\nY3|Y4] int block_offs = 64 * (fy*dct_block_count_x + fx); - float *blocks[6] = { + int16_t *blocks[6] = { settings->state_vid.dct_block_lists[0] + block_offs, settings->state_vid.dct_block_lists[1] + block_offs, settings->state_vid.dct_block_lists[2] + block_offs, @@ -504,7 +550,7 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings) }; for(int i = 0; ok && (i < 6); i++) { - ok = encode_dct_block(&(settings->state_vid), blocks[i]); + ok = encode_dct_block(&(settings->state_vid), blocks[i], quant_table); } } } diff --git a/psxavenc/psxavenc.c b/psxavenc/psxavenc.c index 5c96d30..89b58da 100644 --- a/psxavenc/psxavenc.c +++ b/psxavenc/psxavenc.c @@ -374,6 +374,9 @@ int main(int argc, char **argv) { settings.video_frames = NULL; settings.video_frame_count = 0; + settings.state_vid.huffman_encoding_map = NULL; + settings.state_vid.coeff_clamp_map = NULL; + settings.state_vid.dct_context = NULL; for(int i = 0; i < 6; i++) { settings.state_vid.dct_block_lists[i] = NULL; }