Optimize MDEC encoder, use ffmpeg DCT implementation

This commit is contained in:
spicyjpeg 2023-11-04 10:34:46 +01:00 committed by Adrian Siekierka
parent 87b0fe3f2a
commit 302989badf
5 changed files with 130 additions and 70 deletions

View File

@ -1,6 +1,6 @@
project('psxavenc', 'c', default_options: ['c_std=c11'])
add_project_arguments('-D_POSIX_C_SOURCE=201112L', language : 'c')
add_project_arguments('-D_POSIX_C_SOURCE=201112L', '-ffast-math', language : 'c')
conf_data = configuration_data()
conf_data.set('VERSION', '"' + run_command('git', '-C', meson.project_source_root(), 'describe', '--tags', '--always', '--dirty', '--match=v*', check: true).stdout().strip() + '"')

View File

@ -34,6 +34,7 @@ freely, subject to the following restrictions:
#include <libavutil/opt.h>
#include <libavcodec/avcodec.h>
#include <libavcodec/avdct.h>
#include <libavformat/avformat.h>
#include <libswscale/swscale.h>
#include <libswresample/swresample.h>
@ -65,7 +66,11 @@ typedef struct {
int uncomp_hwords_used;
int quant_scale;
int quant_scale_sum;
float *dct_block_lists[6];
uint32_t *huffman_encoding_map;
int16_t *coeff_clamp_map;
int16_t *dct_block_lists[6];
AVDCT *dct_context;
} vid_encoder_state_t;
typedef struct {
@ -142,5 +147,7 @@ void encode_file_str(settings_t *settings, FILE *output);
void encode_file_sbs(settings_t *settings, FILE *output);
// mdec.c
bool init_encoder_state(settings_t *settings);
void destroy_encoder_state(settings_t *settings);
void encode_frame_bs(uint8_t *video_frame, settings_t *settings);
void encode_sector_str(uint8_t *video_frames, uint8_t *output, settings_t *settings);

View File

@ -299,6 +299,7 @@ void encode_file_str(settings_t *settings, FILE *output) {
fprintf(stderr, "Frame size: %.2f sectors\n", frame_size);
}
init_encoder_state(settings);
settings->state_vid.frame_output = malloc(2016 * (int)ceil(frame_size));
settings->state_vid.frame_index = 0;
settings->state_vid.frame_data_offset = 0;
@ -362,9 +363,11 @@ void encode_file_str(settings_t *settings, FILE *output) {
}
free(settings->state_vid.frame_output);
destroy_encoder_state(settings);
}
void encode_file_sbs(settings_t *settings, FILE *output) {
init_encoder_state(settings);
settings->state_vid.frame_output = malloc(settings->alignment);
settings->state_vid.frame_data_offset = 0;
settings->state_vid.frame_max_size = settings->alignment;
@ -385,4 +388,5 @@ void encode_file_sbs(settings_t *settings, FILE *output) {
}
free(settings->state_vid.frame_output);
destroy_encoder_state(settings);
}

View File

@ -24,11 +24,6 @@ freely, subject to the following restrictions:
#include "common.h"
// high 8 bits = bit count
// low 24 bits = value
uint32_t huffman_encoding_map[0x10000];
bool dct_done_init = false;
#define MAKE_HUFFMAN_PAIR(zeroes, value) (((zeroes)<<10)|((+(value))&0x3FF)),(((zeroes)<<10)|((-(value))&0x3FF))
const struct {
int c_bits;
@ -195,17 +190,24 @@ const int16_t dct_scale_table[8*8] = {
+0x18F8, -0x471D, +0x6A6D, -0x7D8B, +0x7D8A, -0x6A6E, +0x471C, -0x18F9,
};
static void init_dct_data(void)
static void init_dct_data(vid_encoder_state_t *state)
{
for(int i = 0; i <= 0xFFFF; i++) {
huffman_encoding_map[i] = ((6+16)<<24)|((0x01<<16)|(i));
// high 8 bits = bit count
// low 24 bits = value
state->huffman_encoding_map[i] = ((6+16)<<24)|((0x01<<16)|(i));
int16_t coeff = (int16_t)i;
if (coeff < -0x200) { coeff = -0x200; }
if (coeff > +0x1FF) { coeff = +0x1FF; }
state->coeff_clamp_map[i] = coeff&0x3FF;
}
for(int i = 0; i < sizeof(huffman_lookup)/sizeof(huffman_lookup[0]); i++) {
int bits = huffman_lookup[i].c_bits+1;
uint32_t base_value = huffman_lookup[i].c_value;
huffman_encoding_map[huffman_lookup[i].u_hword_pos] = (bits<<24)|(base_value<<1)|0;
huffman_encoding_map[huffman_lookup[i].u_hword_neg] = (bits<<24)|(base_value<<1)|1;
state->huffman_encoding_map[huffman_lookup[i].u_hword_pos] = (bits<<24)|(base_value<<1)|0;
state->huffman_encoding_map[huffman_lookup[i].u_hword_neg] = (bits<<24)|(base_value<<1)|1;
}
}
@ -296,80 +298,71 @@ static bool encode_ac_value(vid_encoder_state_t *state, uint16_t value)
// Use an escape
return encode_bits(state, 6+16, (0x01<<16)|(0xFFFF&(uint32_t)value));
#else
uint32_t outword = huffman_encoding_map[value];
uint32_t outword = state->huffman_encoding_map[value];
return encode_bits(state, outword>>24, outword&0xFFFFFF);
#endif
}
static void transform_dct_block(vid_encoder_state_t *state, float *block)
static void transform_dct_block(vid_encoder_state_t *state, int16_t *block)
{
#if 0
// Apply DCT to block
float midblock[8*8];
int midblock[8*8];
for (int i = 0; i < 8; i++) {
for (int j = 0; j < 8; j++) {
float v = 0.0f;
int v = 0;
for(int k = 0; k < 8; k++) {
v += block[8*j+k] * (float)dct_scale_table[8*i+k] / (float)(1 << 16);
v += (int)block[8*j+k] * (int)dct_scale_table[8*i+k] / 8;
}
midblock[8*i+j] = v;
midblock[8*i+j] = (v + 0xFFF) >> 13;
}
}
for (int i = 0; i < 8; i++) {
for (int j = 0; j < 8; j++) {
float v = 0.0f;
int v = 0;
for(int k = 0; k < 8; k++) {
v += midblock[8*j+k] * (float)dct_scale_table[8*i+k] / (float)(1 << 16);
v += (int)midblock[8*j+k] * (int)dct_scale_table[8*i+k];
}
block[8*i+j] = v;
block[8*i+j] = (int16_t)((v + 0xFFF) >> 13);
}
}
#else
state->dct_context->fdct(block);
#endif
}
static bool encode_dct_block(vid_encoder_state_t *state, float *block)
// https://stackoverflow.com/a/60011209
//#define DIVIDE_ROUNDED(n, d) (((n) >= 0) ? (((n) + (d)/2) / (d)) : (((n) - (d)/2) / (d)))
#define DIVIDE_ROUNDED(n, d) ((int)round((double)(n) / (double)(d)))
static bool encode_dct_block(vid_encoder_state_t *state, const int16_t *block, const int16_t *quant_table)
{
int16_t coeffs[64];
float scale = 8.0f / (float)state->quant_scale;
int dc = DIVIDE_ROUNDED(block[0], quant_table[0]);
dc = state->coeff_clamp_map[dc&0xFFFF];
for (int i = 0; i < 64; i++) {
// The DC coefficient is not affected by the quantization scale.
float x = block[i];
if (i) { x *= scale; }
int v = (int)roundf(x / (float)quant_dec[i]);
if (v < -0x200) { v = -0x200; }
if (v > +0x1FF) { v = +0x1FF; }
coeffs[i] = v;
}
if (!encode_bits(state, 10, coeffs[0]&0x3FF)) {
if (!encode_bits(state, 10, dc)) {
return false;
}
// Build RLE output
uint16_t zero_rle_data[8*8];
int zero_rle_words = 0;
for (int i = 1, zeroes = 0; i < 64; i++) {
int ri = dct_zagzig_table[i];
//int ri = dct_zigzag_table[i];
if (coeffs[ri] == 0) {
int ac = DIVIDE_ROUNDED(block[ri], quant_table[ri]);
ac = state->coeff_clamp_map[ac&0xFFFF];
if (ac == 0) {
zeroes++;
} else {
zero_rle_data[zero_rle_words++] = (zeroes<<10)|(coeffs[ri]&0x3FF);
if (!encode_ac_value(state, (zeroes<<10)|ac)) {
return false;
}
zeroes = 0;
state->uncomp_hwords_used += 1;
}
}
// Now Huffman-code the data
for (int i = 0; i < zero_rle_words; i++) {
if (!encode_ac_value(state, zero_rle_data[i])) {
return false;
}
}
//fprintf(stderr, "dc %08X rles %2d\n", coeffs[0], zero_rle_words);
//assert(coeffs[0] >= -0x200); assert(coeffs[0] < +0x200);
//fprintf(stderr, "dc %08X rles %2d\n", dc, zero_rle_words);
//assert(dc >= -0x200); assert(dc < +0x200);
// Store end of block
if (!encode_bits(state, 2, 0x2)) {
@ -404,6 +397,61 @@ static int reduce_dct_block(vid_encoder_state_t *state, int32_t *block, int32_t
}
#endif
/*
 * Release everything init_encoder_state() may have allocated and reset the
 * corresponding pointers to NULL.
 *
 * Safe to call on a partially initialised state (as long as the unused
 * pointers are NULL) and safe to call more than once. Defined ahead of
 * init_encoder_state() so that its failure path can reuse it.
 */
void destroy_encoder_state(settings_t *settings)
{
	// free(NULL) is a no-op, so no guards are needed here.
	free(settings->state_vid.huffman_encoding_map);
	settings->state_vid.huffman_encoding_map = NULL;

	free(settings->state_vid.coeff_clamp_map);
	settings->state_vid.coeff_clamp_map = NULL;

	// The DCT context is allocated by avcodec_dct_alloc() and must be
	// released through av_free() rather than free().
	if (settings->state_vid.dct_context) {
		av_free(settings->state_vid.dct_context);
		settings->state_vid.dct_context = NULL;
	}

	// Free every slot individually instead of keying off slot 0, so a
	// partially populated list is still fully released.
	for (int i = 0; i < 6; i++) {
		free(settings->state_vid.dct_block_lists[i]);
		settings->state_vid.dct_block_lists[i] = NULL;
	}
}

/*
 * Allocate and initialise the video encoder state: the Huffman and
 * coefficient clamp lookup tables, the DCT context and the six per-frame
 * DCT block lists (sized from settings->video_width/video_height rounded up
 * to whole 16x16 macroblocks).
 *
 * Returns true on success, or immediately if the state has already been
 * initialised. Returns false if any allocation or the DCT setup fails; in
 * that case everything allocated so far is released again, so the state is
 * left empty and a later call starts from scratch instead of reporting
 * success on a half-built state.
 *
 * The pointers in settings->state_vid must be NULL before the first call.
 */
bool init_encoder_state(settings_t *settings)
{
	if (settings->state_vid.huffman_encoding_map) {
		return true;
	}

	settings->state_vid.huffman_encoding_map =
		malloc(0x10000 * sizeof *settings->state_vid.huffman_encoding_map);
	settings->state_vid.coeff_clamp_map =
		malloc(0x10000 * sizeof *settings->state_vid.coeff_clamp_map);
	if (!settings->state_vid.huffman_encoding_map || !settings->state_vid.coeff_clamp_map) {
		goto fail;
	}

	init_dct_data(&(settings->state_vid));

	settings->state_vid.dct_context = avcodec_dct_alloc();
	if (!settings->state_vid.dct_context) {
		goto fail;
	}
	// avcodec_dct_init() reports failure with a negative error code.
	if (avcodec_dct_init(settings->state_vid.dct_context) < 0) {
		goto fail;
	}

	int dct_block_count_x = (settings->video_width+15)/16;
	int dct_block_count_y = (settings->video_height+15)/16;
	// Keep the byte count in size_t; it was previously narrowed to int.
	size_t dct_block_size =
		(size_t)dct_block_count_x * (size_t)dct_block_count_y * sizeof(int16_t) * 8 * 8;

	for (int i = 0; i < 6; i++) {
		settings->state_vid.dct_block_lists[i] = malloc(dct_block_size);
		if (!settings->state_vid.dct_block_lists[i]) {
			goto fail;
		}
	}

	return true;

fail:
	// Undo any partial setup so the early-return guard above cannot mistake
	// a failed initialisation for a completed one.
	destroy_encoder_state(settings);
	return false;
}
void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
{
int pitch = settings->video_width;
@ -415,21 +463,11 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
uint8_t *y_plane = video_frame;
uint8_t *c_plane = y_plane + (settings->video_width*settings->video_height);
if (!dct_done_init) {
init_dct_data();
dct_done_init = true;
}
assert(settings->state_vid.huffman_encoding_map);
int dct_block_count_x = (settings->video_width+15)/16;
int dct_block_count_y = (settings->video_height+15)/16;
if (settings->state_vid.dct_block_lists[0] == NULL) {
int dct_block_size = dct_block_count_x*dct_block_count_y*sizeof(float)*8*8;
for (int i = 0; i < 6; i++) {
settings->state_vid.dct_block_lists[i] = malloc(dct_block_size);
}
}
// TODO: non-16x16-aligned videos
assert((settings->video_width % 16) == 0);
assert((settings->video_height % 16) == 0);
@ -439,7 +477,7 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
for(int fy = 0; fy < dct_block_count_y; fy++) {
// Order: Cr Cb [Y1|Y2\nY3|Y4]
int block_offs = 64 * (fy*dct_block_count_x + fx);
float *blocks[6] = {
int16_t *blocks[6] = {
settings->state_vid.dct_block_lists[0] + block_offs,
settings->state_vid.dct_block_lists[1] + block_offs,
settings->state_vid.dct_block_lists[2] + block_offs,
@ -456,12 +494,12 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
int lx = fx*16 + x;
int ly = fy*16 + y;
blocks[0][k] = (float)c_plane[pitch*cy + 2*cx + 0] - 128.0f;
blocks[1][k] = (float)c_plane[pitch*cy + 2*cx + 1] - 128.0f;
blocks[2][k] = (float)y_plane[pitch*(ly+0) + (lx+0)] - 128.0f;
blocks[3][k] = (float)y_plane[pitch*(ly+0) + (lx+8)] - 128.0f;
blocks[4][k] = (float)y_plane[pitch*(ly+8) + (lx+0)] - 128.0f;
blocks[5][k] = (float)y_plane[pitch*(ly+8) + (lx+8)] - 128.0f;
blocks[0][k] = (int16_t)c_plane[pitch*cy + 2*cx + 0] - 128;
blocks[1][k] = (int16_t)c_plane[pitch*cy + 2*cx + 1] - 128;
blocks[2][k] = (int16_t)y_plane[pitch*(ly+0) + (lx+0)] - 128;
blocks[3][k] = (int16_t)y_plane[pitch*(ly+0) + (lx+8)] - 128;
blocks[4][k] = (int16_t)y_plane[pitch*(ly+8) + (lx+0)] - 128;
blocks[5][k] = (int16_t)y_plane[pitch*(ly+8) + (lx+8)] - 128;
}
}
@ -482,6 +520,14 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
settings->state_vid.quant_scale < 64;
settings->state_vid.quant_scale++
) {
int16_t quant_table[8*8];
// The DC coefficient's quantization scale is always 8.
quant_table[0] = quant_dec[0] * 8;
for (int i = 1; i < 64; i++) {
quant_table[i] = quant_dec[i] * settings->state_vid.quant_scale;
}
memset(settings->state_vid.frame_output, 0, settings->state_vid.frame_max_size);
settings->state_vid.bits_value = 0;
@ -494,7 +540,7 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
for(int fy = 0; ok && (fy < dct_block_count_y); fy++) {
// Order: Cr Cb [Y1|Y2\nY3|Y4]
int block_offs = 64 * (fy*dct_block_count_x + fx);
float *blocks[6] = {
int16_t *blocks[6] = {
settings->state_vid.dct_block_lists[0] + block_offs,
settings->state_vid.dct_block_lists[1] + block_offs,
settings->state_vid.dct_block_lists[2] + block_offs,
@ -504,7 +550,7 @@ void encode_frame_bs(uint8_t *video_frame, settings_t *settings)
};
for(int i = 0; ok && (i < 6); i++) {
ok = encode_dct_block(&(settings->state_vid), blocks[i]);
ok = encode_dct_block(&(settings->state_vid), blocks[i], quant_table);
}
}
}

View File

@ -374,6 +374,9 @@ int main(int argc, char **argv) {
settings.video_frames = NULL;
settings.video_frame_count = 0;
settings.state_vid.huffman_encoding_map = NULL;
settings.state_vid.coeff_clamp_map = NULL;
settings.state_vid.dct_context = NULL;
for(int i = 0; i < 6; i++) {
settings.state_vid.dct_block_lists[i] = NULL;
}