Optimize MDEC encoder, use ffmpeg DCT implementation

2023-11-04 10:34:46 +01:00 · 2023-11-04 10:34:46 +01:00 · 302989badf
parent 87b0fe3f2a
commit 302989badf
5 changed files with 130 additions and 70 deletions
--- a/meson.build
+++ b/meson.build
@ -1,6 +1,6 @@
 project('psxavenc', 'c', default_options: ['c_std=c11'])

-add_project_arguments('-D_POSIX_C_SOURCE=201112L', language : 'c')
+add_project_arguments('-D_POSIX_C_SOURCE=201112L', '-ffast-math', language : 'c')

 conf_data = configuration_data()
 conf_data.set('VERSION', '"' + run_command('git', '-C', meson.project_source_root(), 'describe', '--tags', '--always', '--dirty', '--match=v*', check: true).stdout().strip() + '"')
--- a/psxavenc/common.h
+++ b/psxavenc/common.h
@ -34,6 +34,7 @@ freely, subject to the following restrictions:

 #include <libavutil/opt.h>
 #include <libavcodec/avcodec.h>
+#include <libavcodec/avdct.h>
 #include <libavformat/avformat.h>
 #include <libswscale/swscale.h>
 #include <libswresample/swresample.h>
@ -65,7 +66,11 @@ typedef struct {
 	int uncomp_hwords_used;
 	int quant_scale;
 	int quant_scale_sum;
-	float *dct_block_lists[6];
+
+	uint32_t *huffman_encoding_map;
+	int16_t *coeff_clamp_map;
+	int16_t *dct_block_lists[6];
+	AVDCT *dct_context;
 } vid_encoder_state_t;

 typedef struct {
@ -142,5 +147,7 @@ void encode_file_str(settings_t *settings, FILE *output);
 void encode_file_sbs(settings_t *settings, FILE *output);

 // mdec.c
+bool init_encoder_state(settings_t *settings);
+void destroy_encoder_state(settings_t *settings);
 void encode_frame_bs(uint8_t *video_frame, settings_t *settings);
 void encode_sector_str(uint8_t *video_frames, uint8_t *output, settings_t *settings);
--- a/psxavenc/filefmt.c
+++ b/psxavenc/filefmt.c
@ -299,6 +299,7 @@ void encode_file_str(settings_t *settings, FILE *output) {
 		fprintf(stderr, "Frame size: %.2f sectors\n", frame_size);
 	}

+	init_encoder_state(settings);
 	settings->state_vid.frame_output = malloc(2016 * (int)ceil(frame_size));
 	settings->state_vid.frame_index = 0;
 	settings->state_vid.frame_data_offset = 0;
@ -362,9 +363,11 @@ void encode_file_str(settings_t *settings, FILE *output) {
 	}

 	free(settings->state_vid.frame_output);
+	destroy_encoder_state(settings);
 }

 void encode_file_sbs(settings_t *settings, FILE *output) {
+	init_encoder_state(settings);
 	settings->state_vid.frame_output = malloc(settings->alignment);
 	settings->state_vid.frame_data_offset = 0;
 	settings->state_vid.frame_max_size = settings->alignment;
@ -385,4 +388,5 @@ void encode_file_sbs(settings_t *settings, FILE *output) {
 	}

 	free(settings->state_vid.frame_output);
+	destroy_encoder_state(settings);
 }
--- a/psxavenc/mdec.c
+++ b/psxavenc/mdec.c
@ -24,11 +24,6 @@ freely, subject to the following restrictions:

 #include "common.h"

-// high 8 bits = bit count
-// low 24 bits = value
-uint32_t huffman_encoding_map[0x10000];
-bool dct_done_init = false;
-
 #define MAKE_HUFFMAN_PAIR(zeroes, value) (((zeroes)<<10)|((+(value))&0x3FF)),(((zeroes)<<10)|((-(value))&0x3FF))
 const struct {
 	int c_bits;
@ -195,17 +190,24 @@ const int16_t dct_scale_table[8*8] = {
 	+0x18F8, -0x471D, +0x6A6D, -0x7D8B, +0x7D8A, -0x6A6E, +0x471C, -0x18F9,
 };

-static void init_dct_data(void)
+static void init_dct_data(vid_encoder_state_t *state)
 {
 	for(int i = 0; i <= 0xFFFF; i++) {
-		huffman_encoding_map[i] = ((6+16)<<24)|((0x01<<16)|(i));
+		// high 8 bits = bit count
+		// low 24 bits = value
+		state->huffman_encoding_map[i] = ((6+16)<<24)|((0x01<<16)|(i));
+
+		int16_t coeff = (int16_t)i;
+		if (coeff < -0x200) { coeff = -0x200; }
+		if (coeff > +0x1FF) { coeff = +0x1FF; }
+		state->coeff_clamp_map[i] = coeff&0x3FF;
 	}

 	for(int i = 0; i < sizeof(huffman_lookup)/sizeof(huffman_lookup[0]); i++) {
 		int bits = huffman_lookup[i].c_bits+1;
 		uint32_t base_value = huffman_lookup[i].c_value;
-		huffman_encoding_map[huffman_lookup[i].u_hword_pos] = (bits<<24)|(base_value<<1)|0;
-		huffman_encoding_map[huffman_lookup[i].u_hword_neg] = (bits<<24)|(base_value<<1)|1;
+		state->huffman_encoding_map[huffman_lookup[i].u_hword_pos] = (bits<<24)|(base_value<<1)|0;
+		state->huffman_encoding_map[huffman_lookup[i].u_hword_neg] = (bits<<24)|(base_value<<1)|1;
 	}

 }
@ -296,80 +298,71 @@ static bool encode_ac_value(vid_encoder_state_t *state, uint16_t value)
 	// Use an escape
 	return encode_bits(state, 6+16, (0x01<<16)|(0xFFFF&(uint32_t)value));
 #else
-	uint32_t outword = huffman_encoding_map[value];
+	uint32_t outword = state->huffman_encoding_map[value];
 	return encode_bits(state, outword>>24, outword&0xFFFFFF);
 #endif
 }

-static void transform_dct_block(vid_encoder_state_t *state, float *block)
+static void transform_dct_block(vid_encoder_state_t *state, int16_t *block)
 {
+#if 0
 	// Apply DCT to block
-	float midblock[8*8];
+	int midblock[8*8];

 	for (int i = 0; i < 8; i++) {
 	for (int j = 0; j < 8; j++) {
-		float v = 0.0f;
+		int v = 0;
 		for(int k = 0; k < 8; k++) {
-			v += block[8*j+k] * (float)dct_scale_table[8*i+k] / (float)(1 << 16);
+			v += (int)block[8*j+k] * (int)dct_scale_table[8*i+k] / 8;
 		}
-		midblock[8*i+j] = v;
+		midblock[8*i+j] = (v + 0xFFF) >> 13;
 	}
 	}
 	for (int i = 0; i < 8; i++) {
 	for (int j = 0; j < 8; j++) {
-		float v = 0.0f;
+		int v = 0;
 		for(int k = 0; k < 8; k++) {
-			v += midblock[8*j+k] * (float)dct_scale_table[8*i+k] / (float)(1 << 16);
+			v += (int)midblock[8*j+k] * (int)dct_scale_table[8*i+k];
 		}
-		block[8*i+j] = v;
+		block[8*i+j] = (int16_t)((v + 0xFFF) >> 13);
 	}
 	}
+#else
+	state->dct_context->fdct(block);
+#endif
 }

-static bool encode_dct_block(vid_encoder_state_t *state, float *block)
+// https://stackoverflow.com/a/60011209
+//#define DIVIDE_ROUNDED(n, d) (((n) >= 0) ? (((n) + (d)/2) / (d)) : (((n) - (d)/2) / (d)))
+#define DIVIDE_ROUNDED(n, d) ((int)round((double)(n) / (double)(d)))
+
+static bool encode_dct_block(vid_encoder_state_t *state, const int16_t *block, const int16_t *quant_table)
 {
-	int16_t coeffs[64];
-	float scale = 8.0f / (float)state->quant_scale;
+	int dc = DIVIDE_ROUNDED(block[0], quant_table[0]);
+	dc = state->coeff_clamp_map[dc&0xFFFF];

-	for (int i = 0; i < 64; i++) {
-		// The DC coefficient is not affected by the quantization scale.
-		float x = block[i];
-		if (i) { x *= scale; }
-
-		int v = (int)roundf(x / (float)quant_dec[i]);
-		if (v < -0x200) { v = -0x200; }
-		if (v > +0x1FF) { v = +0x1FF; }
-		coeffs[i] = v;
-	}
-
-	if (!encode_bits(state, 10, coeffs[0]&0x3FF)) {
+	if (!encode_bits(state, 10, dc)) {
 		return false;
 	}

-	// Build RLE output
-	uint16_t zero_rle_data[8*8];
-	int zero_rle_words = 0;
 	for (int i = 1, zeroes = 0; i < 64; i++) {
 		int ri = dct_zagzig_table[i];
-		//int ri = dct_zigzag_table[i];
-		if (coeffs[ri] == 0) {
+		int ac = DIVIDE_ROUNDED(block[ri], quant_table[ri]);
+		ac = state->coeff_clamp_map[ac&0xFFFF];
+
+		if (ac == 0) {
 			zeroes++;
 		} else {
-			zero_rle_data[zero_rle_words++] = (zeroes<<10)|(coeffs[ri]&0x3FF);
+			if (!encode_ac_value(state, (zeroes<<10)|ac)) {
+				return false;
+			}
 			zeroes = 0;
 			state->uncomp_hwords_used += 1;
 		}
 	}

-	// Now Huffman-code the data
-	for (int i = 0; i < zero_rle_words; i++) {
-		if (!encode_ac_value(state, zero_rle_data[i])) {
-			return false;
-		}
-	}
-
-	//fprintf(stderr, "dc %08X rles %2d\n", coeffs[0], zero_rle_words);
-	//assert(coeffs[0] >= -0x200); assert(coeffs[0] <  +0x200);
+	//fprintf(stderr, "dc %08X rles %2d\n", dc, zero_rle_words);
+	//assert(dc >= -0x200); assert(dc <  +0x200);

 	// Store end of block
 	if (!encode_bits(state, 2, 0x2)) {
@ -404,6 +397,61 @@ static int reduce_dct_block(vid_encoder_state_t *state, int32_t *block, int32_t
 }
 #endif

+// Allocates and initializes everything the MDEC encoder keeps in
+// settings->state_vid: the Huffman and coefficient clamp lookup tables, the
+// ffmpeg DCT context and the six per-frame DCT block lists (sized from
+// settings->video_width and settings->video_height).
+//
+// Calling it again while the state is already initialized is a no-op.
+// Returns true on success. On failure, everything allocated so far is
+// released through destroy_encoder_state() and false is returned, so a
+// later call starts again from a clean state instead of reporting success
+// on a half-built one.
+bool init_encoder_state(settings_t *settings)
+{
+	vid_encoder_state_t *state = &(settings->state_vid);
+
+	// The Huffman map is allocated first and freed on every failure path,
+	// so a non-NULL pointer means a previous call fully succeeded.
+	if (state->huffman_encoding_map) {
+		return true;
+	}
+
+	state->huffman_encoding_map = malloc(0x10000*sizeof(uint32_t));
+	state->coeff_clamp_map = malloc(0x10000*sizeof(int16_t));
+	if (!state->huffman_encoding_map || !state->coeff_clamp_map) {
+		goto fail;
+	}
+	init_dct_data(state);
+
+	state->dct_context = avcodec_dct_alloc();
+	if (!state->dct_context) {
+		goto fail;
+	}
+	avcodec_dct_init(state->dct_context);
+
+	int dct_block_count_x = (settings->video_width+15)/16;
+	int dct_block_count_y = (settings->video_height+15)/16;
+
+	// One 8x8 block of int16_t per 16x16 macroblock in each list. The size is
+	// computed and kept in size_t so it is never narrowed through an int.
+	size_t dct_block_size = (size_t)dct_block_count_x*(size_t)dct_block_count_y*sizeof(int16_t)*8*8;
+	for (int i = 0; i < 6; i++) {
+		state->dct_block_lists[i] = malloc(dct_block_size);
+		if (!state->dct_block_lists[i]) {
+			goto fail;
+		}
+	}
+
+	return true;
+
+fail:
+	// Roll back partial allocations; this also resets the pointers to NULL.
+	destroy_encoder_state(settings);
+	return false;
+}
+
+// Releases everything init_encoder_state() stores in settings->state_vid and
+// resets each pointer to NULL so the state can be initialized again later.
+// Safe to call on a partially initialized or already destroyed state,
+// provided unused pointers are NULL.
+void destroy_encoder_state(settings_t *settings)
+{
+	vid_encoder_state_t *state = &(settings->state_vid);
+
+	// free() and av_free() both accept NULL, so no guards are needed.
+	free(state->huffman_encoding_map);
+	state->huffman_encoding_map = NULL;
+
+	free(state->coeff_clamp_map);
+	state->coeff_clamp_map = NULL;
+
+	// The context comes from avcodec_dct_alloc(), so it is released with
+	// av_free() rather than free().
+	av_free(state->dct_context);
+	state->dct_context = NULL;
+
+	// Free each list on its own instead of keying the whole loop off list 0,
+	// so no list is leaked when only some of them were allocated.
+	for (int i = 0; i < 6; i++) {
+		free(state->dct_block_lists[i]);
+		state->dct_block_lists[i] = NULL;
+	}
+}
+
 void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
 {
 	int pitch = settings->video_width;
@ -415,21 +463,11 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
 	uint8_t *y_plane = video_frame;
 	uint8_t *c_plane = y_plane + (settings->video_width*settings->video_height);

-	if (!dct_done_init) {
-		init_dct_data();
-		dct_done_init = true;
-	}
+	assert(settings->state_vid.huffman_encoding_map);

 	int dct_block_count_x = (settings->video_width+15)/16;
 	int dct_block_count_y = (settings->video_height+15)/16;

-	if (settings->state_vid.dct_block_lists[0] == NULL) {
-		int dct_block_size = dct_block_count_x*dct_block_count_y*sizeof(float)*8*8;
-		for (int i = 0; i < 6; i++) {
-			settings->state_vid.dct_block_lists[i] = malloc(dct_block_size);
-		}
-	}
-
 	// TODO: non-16x16-aligned videos
 	assert((settings->video_width % 16) == 0);
 	assert((settings->video_height % 16) == 0);
@ -439,7 +477,7 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
 	for(int fy = 0; fy < dct_block_count_y; fy++) {
 		// Order: Cr Cb [Y1|Y2\nY3|Y4]
 		int block_offs = 64 * (fy*dct_block_count_x + fx);
-		float *blocks[6] = {
+		int16_t *blocks[6] = {
 			settings->state_vid.dct_block_lists[0] + block_offs,
 			settings->state_vid.dct_block_lists[1] + block_offs,
 			settings->state_vid.dct_block_lists[2] + block_offs,
@ -456,12 +494,12 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
 			int lx = fx*16 + x;
 			int ly = fy*16 + y;

-			blocks[0][k] = (float)c_plane[pitch*cy + 2*cx + 0] - 128.0f;
-			blocks[1][k] = (float)c_plane[pitch*cy + 2*cx + 1] - 128.0f;
-			blocks[2][k] = (float)y_plane[pitch*(ly+0) + (lx+0)] - 128.0f;
-			blocks[3][k] = (float)y_plane[pitch*(ly+0) + (lx+8)] - 128.0f;
-			blocks[4][k] = (float)y_plane[pitch*(ly+8) + (lx+0)] - 128.0f;
-			blocks[5][k] = (float)y_plane[pitch*(ly+8) + (lx+8)] - 128.0f;
+			blocks[0][k] = (int16_t)c_plane[pitch*cy + 2*cx + 0] - 128;
+			blocks[1][k] = (int16_t)c_plane[pitch*cy + 2*cx + 1] - 128;
+			blocks[2][k] = (int16_t)y_plane[pitch*(ly+0) + (lx+0)] - 128;
+			blocks[3][k] = (int16_t)y_plane[pitch*(ly+0) + (lx+8)] - 128;
+			blocks[4][k] = (int16_t)y_plane[pitch*(ly+8) + (lx+0)] - 128;
+			blocks[5][k] = (int16_t)y_plane[pitch*(ly+8) + (lx+8)] - 128;
 		}
 		}

@ -482,6 +520,14 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
 		settings->state_vid.quant_scale < 64;
 		settings->state_vid.quant_scale++
 	) {
+		int16_t quant_table[8*8];
+
+		// The DC coefficient's quantization scale is always 8.
+		quant_table[0] = quant_dec[0] * 8;
+		for (int i = 1; i < 64; i++) {
+			quant_table[i] = quant_dec[i] * settings->state_vid.quant_scale;
+		}
+
 		memset(settings->state_vid.frame_output, 0, settings->state_vid.frame_max_size);

 		settings->state_vid.bits_value = 0;
@ -494,7 +540,7 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
 		for(int fy = 0; ok && (fy < dct_block_count_y); fy++) {
 			// Order: Cr Cb [Y1|Y2\nY3|Y4]
 			int block_offs = 64 * (fy*dct_block_count_x + fx);
-			float *blocks[6] = {
+			int16_t *blocks[6] = {
 				settings->state_vid.dct_block_lists[0] + block_offs,
 				settings->state_vid.dct_block_lists[1] + block_offs,
 				settings->state_vid.dct_block_lists[2] + block_offs,
@ -504,7 +550,7 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
 			};

 			for(int i = 0; ok && (i < 6); i++) {
-				ok = encode_dct_block(&(settings->state_vid), blocks[i]);
+				ok = encode_dct_block(&(settings->state_vid), blocks[i], quant_table);
 			}
 		}
 		}
--- a/psxavenc/psxavenc.c
+++ b/psxavenc/psxavenc.c
@ -374,6 +374,9 @@ int main(int argc, char **argv) {
 	settings.video_frames = NULL;
 	settings.video_frame_count = 0;

+	settings.state_vid.huffman_encoding_map = NULL;
+	settings.state_vid.coeff_clamp_map = NULL;
+	settings.state_vid.dct_context = NULL;
 	for(int i = 0; i < 6; i++) {
 		settings.state_vid.dct_block_lists[i] = NULL;
 	}