/*
 * Files: psxavenc/psxavenc/filefmt.c
 * (644 lines, 19 KiB, C)
 */

/*
psxavenc: MDEC video + SPU/XA-ADPCM audio encoder frontend
Copyright (c) 2019, 2020 Adrian "asie" Siekierka
Copyright (c) 2019 Ben "GreaseMonkey" Russell
Copyright (c) 2023, 2025 spicyjpeg
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <libpsxav.h>
#include "args.h"
#include "decoding.h"
#include "mdec.h"
static time_t start_time = 0;
static time_t last_progress_update = 0;
static time_t get_elapsed_time(void) {
time_t t;
if (start_time > 0) {
t = time(NULL) - start_time;
} else {
t = 0;
start_time = time(NULL);
}
if (t <= last_progress_update)
return 0;
last_progress_update = t;
return t;
}
static psx_audio_xa_settings_t args_to_libpsxav_xa_audio(const args_t *args) {
psx_audio_xa_settings_t settings;
settings.bits_per_sample = args->audio_bit_depth;
settings.frequency = args->audio_frequency;
settings.stereo = (args->audio_channels == 2);
settings.file_number = args->audio_xa_file;
settings.channel_number = args->audio_xa_channel;
if (args->format == FORMAT_XACD || args->format == FORMAT_STRCD)
settings.format = PSX_AUDIO_XA_FORMAT_XACD;
else
settings.format = PSX_AUDIO_XA_FORMAT_XA;
return settings;
};
static void init_sector_buffer_video(const args_t *args, uint8_t *sector, int lba) {
psx_cdrom_sector_xa_subheader_t *subheader = NULL;
if (args->format == FORMAT_STRCD) {
psx_cdrom_init_sector((psx_cdrom_sector_t *)sector, lba, PSX_CDROM_SECTOR_TYPE_MODE2_FORM1);
subheader = ((psx_cdrom_sector_t *)sector)->mode2.subheader;
} else if (args->format == FORMAT_STR) {
subheader = (psx_cdrom_sector_xa_subheader_t *)sector;
}
if (subheader != NULL) {
subheader->file = args->audio_xa_file;
subheader->channel = args->audio_xa_channel & PSX_CDROM_SECTOR_XA_CHANNEL_MASK;
subheader->submode = PSX_CDROM_SECTOR_XA_SUBMODE_DATA | PSX_CDROM_SECTOR_XA_SUBMODE_RT;
subheader->coding = 0;
memcpy(subheader + 1, subheader, sizeof(psx_cdrom_sector_xa_subheader_t));
}
}
#define VAG_HEADER_SIZE 0x30
static void write_vag_header(const args_t *args, int size_per_channel, uint8_t *header) {
memset(header, 0, VAG_HEADER_SIZE);
// Magic
header[0x00] = 'V';
header[0x01] = 'A';
header[0x02] = 'G';
if (args->format == FORMAT_VAGI)
header[0x03] = 'i';
else
header[0x03] = 'p';
// Version (big-endian)
header[0x04] = 0x00;
header[0x05] = 0x00;
header[0x06] = 0x00;
header[0x07] = 0x20;
// Interleave (little-endian)
if (args->format == FORMAT_VAGI) {
header[0x08] = (uint8_t)args->audio_interleave;
header[0x09] = (uint8_t)(args->audio_interleave >> 8);
header[0x0A] = (uint8_t)(args->audio_interleave >> 16);
header[0x0B] = (uint8_t)(args->audio_interleave >> 24);
}
// Length of data for each channel (big-endian)
header[0x0C] = (uint8_t)(size_per_channel >> 24);
header[0x0D] = (uint8_t)(size_per_channel >> 16);
header[0x0E] = (uint8_t)(size_per_channel >> 8);
header[0x0F] = (uint8_t)size_per_channel;
// Sample rate (big-endian)
header[0x10] = (uint8_t)(args->audio_frequency >> 24);
header[0x11] = (uint8_t)(args->audio_frequency >> 16);
header[0x12] = (uint8_t)(args->audio_frequency >> 8);
header[0x13] = (uint8_t)args->audio_frequency;
// Number of channels (little-endian)
header[0x1E] = (uint8_t)args->audio_channels;
header[0x1F] = 0x00;
// Filename
int name_offset = strlen(args->output_file);
while (
name_offset > 0 &&
args->output_file[name_offset - 1] != '/' &&
args->output_file[name_offset - 1] != '\\'
)
name_offset--;
strncpy((char*)(header + 0x20), &args->output_file[name_offset], 16);
}
// The functions below are some peak spaghetti code I would rewrite if that
// didn't also require scrapping the rest of the codebase. -- spicyjpeg
void encode_file_xa(const args_t *args, decoder_t *decoder, FILE *output) {
psx_audio_xa_settings_t xa_settings = args_to_libpsxav_xa_audio(args);
int audio_samples_per_sector = psx_audio_xa_get_samples_per_sector(xa_settings);
psx_audio_encoder_state_t audio_state;
memset(&audio_state, 0, sizeof(psx_audio_encoder_state_t));
int sector_count = 0;
for (; ensure_av_data(decoder, audio_samples_per_sector * args->audio_channels, 0); sector_count++) {
int samples_length = decoder->audio_sample_count / args->audio_channels;
if (samples_length > audio_samples_per_sector)
samples_length = audio_samples_per_sector;
uint8_t sector[PSX_CDROM_SECTOR_SIZE];
int length = psx_audio_xa_encode(
xa_settings,
&audio_state,
decoder->audio_samples,
samples_length,
sector_count,
sector
);
if (decoder->end_of_input)
psx_audio_xa_encode_finalize(xa_settings, sector, length);
retire_av_data(decoder, samples_length * args->audio_channels, 0);
fwrite(sector, length, 1, output);
time_t t = get_elapsed_time();
if (!(args->flags & FLAG_HIDE_PROGRESS) && t) {
fprintf(
stderr,
"\rLBA: %6d | Encoding speed: %5.2fx",
sector_count,
(double)(sector_count * audio_samples_per_sector) / (double)(args->audio_frequency * t)
);
}
}
}
void encode_file_spu(const args_t *args, decoder_t *decoder, FILE *output) {
psx_audio_encoder_channel_state_t audio_state;
memset(&audio_state, 0, sizeof(psx_audio_encoder_channel_state_t));
// The header must be written after the data as we don't yet know the
// number of audio samples.
if (args->format == FORMAT_VAG)
fseek(output, VAG_HEADER_SIZE, SEEK_SET);
uint8_t block[PSX_AUDIO_SPU_BLOCK_SIZE];
int block_count = 0;
if (!(args->flags & FLAG_SPU_NO_LEADING_DUMMY)) {
// Insert leading silent block
memset(block, 0, PSX_AUDIO_SPU_BLOCK_SIZE);
fwrite(block, PSX_AUDIO_SPU_BLOCK_SIZE, 1, output);
block_count++;
}
int loop_start_block = -1;
if (args->audio_loop_point >= 0)
loop_start_block = block_count + (args->audio_loop_point * args->audio_frequency) / (PSX_AUDIO_SPU_SAMPLES_PER_BLOCK * 1000);
for (; ensure_av_data(decoder, PSX_AUDIO_SPU_SAMPLES_PER_BLOCK, 0); block_count++) {
int samples_length = decoder->audio_sample_count;
if (samples_length > PSX_AUDIO_SPU_SAMPLES_PER_BLOCK)
samples_length = PSX_AUDIO_SPU_SAMPLES_PER_BLOCK;
int length = psx_audio_spu_encode(
&audio_state,
decoder->audio_samples,
samples_length,
1,
block
);
if (block_count == loop_start_block)
block[1] |= PSX_AUDIO_SPU_LOOP_START;
if ((args->flags & FLAG_SPU_LOOP_END) && decoder->end_of_input)
block[1] |= PSX_AUDIO_SPU_LOOP_REPEAT;
retire_av_data(decoder, samples_length, 0);
fwrite(block, length, 1, output);
time_t t = get_elapsed_time();
if (!(args->flags & FLAG_HIDE_PROGRESS) && t) {
fprintf(
stderr,
"\rBlock: %6d | Encoding speed: %5.2fx",
block_count,
(double)(block_count * PSX_AUDIO_SPU_SAMPLES_PER_BLOCK) / (double)(args->audio_frequency * t)
);
}
}
if (!(args->flags & FLAG_SPU_LOOP_END)) {
// Insert trailing looping block
memset(block, 0, PSX_AUDIO_SPU_BLOCK_SIZE);
block[1] = PSX_AUDIO_SPU_LOOP_START | PSX_AUDIO_SPU_LOOP_END;
fwrite(block, PSX_AUDIO_SPU_BLOCK_SIZE, 1, output);
block_count++;
}
int overflow = (block_count * PSX_AUDIO_SPU_BLOCK_SIZE) % args->alignment;
if (overflow) {
for (int i = 0; i < (args->alignment - overflow); i++)
fputc(0, output);
}
if (args->format == FORMAT_VAG) {
uint8_t header[VAG_HEADER_SIZE];
write_vag_header(args, block_count * PSX_AUDIO_SPU_BLOCK_SIZE, header);
fseek(output, 0, SEEK_SET);
fwrite(header, VAG_HEADER_SIZE, 1, output);
}
}
void encode_file_spui(const args_t *args, decoder_t *decoder, FILE *output) {
int audio_samples_per_chunk = args->audio_interleave / PSX_AUDIO_SPU_BLOCK_SIZE * PSX_AUDIO_SPU_SAMPLES_PER_BLOCK;
// NOTE: since the interleaved .vag format is not standardized, some tools
// (such as vgmstream) will not properly play files with interleave < 2048,
// alignment != 2048 or channels != 2.
int chunk_size = args->audio_interleave * args->audio_channels + args->alignment - 1;
chunk_size -= chunk_size % args->alignment;
int header_size = VAG_HEADER_SIZE + args->alignment - 1;
header_size -= header_size % args->alignment;
if (args->format == FORMAT_VAGI)
fseek(output, header_size, SEEK_SET);
int audio_state_size = sizeof(psx_audio_encoder_channel_state_t) * args->audio_channels;
psx_audio_encoder_channel_state_t *audio_state = malloc(audio_state_size);
memset(audio_state, 0, audio_state_size);
uint8_t *chunk = malloc(chunk_size);
int chunk_count = 0;
for (; ensure_av_data(decoder, audio_samples_per_chunk * args->audio_channels, 0); chunk_count++) {
int samples_length = decoder->audio_sample_count / args->audio_channels;
if (samples_length > audio_samples_per_chunk)
samples_length = audio_samples_per_chunk;
memset(chunk, 0, chunk_size);
uint8_t *chunk_ptr = chunk;
// Insert leading silent block
if (chunk_count == 0 && !(args->flags & FLAG_SPU_NO_LEADING_DUMMY)) {
chunk_ptr += PSX_AUDIO_SPU_BLOCK_SIZE;
samples_length -= PSX_AUDIO_SPU_SAMPLES_PER_BLOCK;
}
for (int ch = 0; ch < args->audio_channels; ch++, chunk_ptr += args->audio_interleave) {
int length = psx_audio_spu_encode(
audio_state + ch,
decoder->audio_samples + ch,
samples_length,
args->audio_channels,
chunk_ptr
);
if (length > 0) {
uint8_t *last_block = chunk_ptr + length - PSX_AUDIO_SPU_BLOCK_SIZE;
if (args->flags & FLAG_SPU_LOOP_END) {
last_block[1] = PSX_AUDIO_SPU_LOOP_REPEAT;
} else if (decoder->end_of_input) {
// HACK: the trailing block should in theory be appended to
// the existing data, but it's easier to just zerofill and
// repurpose the last encoded block
memset(last_block, 0, PSX_AUDIO_SPU_BLOCK_SIZE);
last_block[1] = PSX_AUDIO_SPU_LOOP_START | PSX_AUDIO_SPU_LOOP_END;
}
}
}
retire_av_data(decoder, samples_length * args->audio_channels, 0);
fwrite(chunk, chunk_size, 1, output);
time_t t = get_elapsed_time();
if (!(args->flags & FLAG_HIDE_PROGRESS) && t) {
fprintf(
stderr,
"\rChunk: %6d | Encoding speed: %5.2fx",
chunk_count,
(double)(chunk_count * audio_samples_per_chunk) / (double)(args->audio_frequency * t)
);
}
}
free(audio_state);
free(chunk);
if (args->format == FORMAT_VAGI) {
uint8_t *header = malloc(header_size);
memset(header, 0, header_size);
write_vag_header(args, chunk_count * args->audio_interleave, header);
fseek(output, 0, SEEK_SET);
fwrite(header, header_size, 1, output);
free(header);
}
}
void encode_file_str(const args_t *args, decoder_t *decoder, FILE *output) {
psx_audio_xa_settings_t xa_settings = args_to_libpsxav_xa_audio(args);
int sector_size = psx_audio_xa_get_buffer_size_per_sector(xa_settings);
int interleave;
int audio_samples_per_sector;
int video_sectors_per_block;
if (decoder->state.audio_stream != NULL) {
// 1/N audio, (N-1)/N video
interleave = psx_audio_xa_get_sector_interleave(xa_settings) * args->str_cd_speed;
audio_samples_per_sector = psx_audio_xa_get_samples_per_sector(xa_settings);
video_sectors_per_block = interleave - 1;
if (!(args->flags & FLAG_QUIET))
fprintf(
stderr,
"Interleave: %d/%d audio, %d/%d video\n",
interleave - video_sectors_per_block,
interleave,
video_sectors_per_block,
interleave
);
} else {
// 0/1 audio, 1/1 video
interleave = 1;
audio_samples_per_sector = 0;
video_sectors_per_block = 1;
}
psx_audio_encoder_state_t audio_state;
memset(&audio_state, 0, sizeof(psx_audio_encoder_state_t));
mdec_encoder_t encoder;
init_mdec_encoder(&encoder, args->video_codec, args->video_width, args->video_height);
// e.g. 15fps = (150*7/8/15) = 8.75 blocks per frame
encoder.state.frame_block_base_overflow = (75 * args->str_cd_speed) * video_sectors_per_block * args->str_fps_den;
encoder.state.frame_block_overflow_den = interleave * args->str_fps_num;
double frame_size = (double)encoder.state.frame_block_base_overflow / (double)encoder.state.frame_block_overflow_den;
if (!(args->flags & FLAG_QUIET))
fprintf(stderr, "Frame size: %.2f sectors\n", frame_size);
encoder.state.frame_output = malloc(2016 * (int)ceil(frame_size));
encoder.state.frame_index = 0;
encoder.state.frame_data_offset = 0;
encoder.state.frame_max_size = 0;
encoder.state.frame_block_overflow_num = 0;
encoder.state.quant_scale_sum = 0;
// FIXME: this needs an extra frame to prevent A/V desync
int frames_needed = (int) ceil((double)video_sectors_per_block / frame_size);
if (frames_needed < 2)
frames_needed = 2;
int sector_count = 0;
for (; !decoder->end_of_input || encoder.state.frame_data_offset < encoder.state.frame_max_size; sector_count++) {
ensure_av_data(decoder, audio_samples_per_sector * args->audio_channels, frames_needed);
uint8_t sector[PSX_CDROM_SECTOR_SIZE];
bool is_video_sector;
if (audio_samples_per_sector == 0)
is_video_sector = true;
else if (args->flags & FLAG_STR_TRAILING_AUDIO)
is_video_sector = (sector_count % interleave) < video_sectors_per_block;
else
is_video_sector = (sector_count % interleave) > 0;
if (is_video_sector) {
init_sector_buffer_video(args, sector, sector_count);
int frames_used = encode_sector_str(
&encoder,
args->format,
args->str_video_id,
decoder->video_frames,
sector
);
psx_cdrom_calculate_checksums((psx_cdrom_sector_t *)sector, PSX_CDROM_SECTOR_TYPE_MODE2_FORM1);
retire_av_data(decoder, 0, frames_used);
} else {
int samples_length = decoder->audio_sample_count / args->audio_channels;
if (samples_length > audio_samples_per_sector)
samples_length = audio_samples_per_sector;
// FIXME: this is an extremely hacky way to handle audio tracks
// shorter than the video track
if (!samples_length)
video_sectors_per_block++;
int length = psx_audio_xa_encode(
xa_settings,
&audio_state,
decoder->audio_samples,
samples_length,
sector_count,
sector
);
if (decoder->end_of_input)
psx_audio_xa_encode_finalize(xa_settings, sector, length);
retire_av_data(decoder, samples_length * args->audio_channels, 0);
}
fwrite(sector, sector_size, 1, output);
time_t t = get_elapsed_time();
if (!(args->flags & FLAG_HIDE_PROGRESS) && t) {
fprintf(
stderr,
"\rFrame: %4d | LBA: %6d | Avg. q. scale: %5.2f | Encoding speed: %5.2fx",
encoder.state.frame_index,
sector_count,
(double)encoder.state.quant_scale_sum / (double)encoder.state.frame_index,
(double)(encoder.state.frame_index * args->str_fps_den) / (double)(t * args->str_fps_num)
);
}
}
free(encoder.state.frame_output);
destroy_mdec_encoder(&encoder);
}
void encode_file_strspu(const args_t *args, decoder_t *decoder, FILE *output) {
int interleave;
int audio_samples_per_sector;
int video_sectors_per_block;
if (decoder->state.audio_stream != NULL) {
assert(false); // TODO: implement
if (!(args->flags & FLAG_QUIET))
fprintf(
stderr,
"Interleave: %d/%d audio, %d/%d video\n",
interleave - video_sectors_per_block,
interleave,
video_sectors_per_block,
interleave
);
} else {
// 0/1 audio, 1/1 video
interleave = 1;
audio_samples_per_sector = 0;
video_sectors_per_block = 1;
}
mdec_encoder_t encoder;
init_mdec_encoder(&encoder, args->video_codec, args->video_width, args->video_height);
// e.g. 15fps = (150*7/8/15) = 8.75 blocks per frame
encoder.state.frame_block_base_overflow = (75 * args->str_cd_speed) * video_sectors_per_block * args->str_fps_den;
encoder.state.frame_block_overflow_den = interleave * args->str_fps_num;
double frame_size = (double)encoder.state.frame_block_base_overflow / (double)encoder.state.frame_block_overflow_den;
if (!(args->flags & FLAG_QUIET))
fprintf(stderr, "Frame size: %.2f sectors\n", frame_size);
encoder.state.frame_output = malloc(2016 * (int)ceil(frame_size));
encoder.state.frame_index = 0;
encoder.state.frame_data_offset = 0;
encoder.state.frame_max_size = 0;
encoder.state.frame_block_overflow_num = 0;
encoder.state.quant_scale_sum = 0;
// FIXME: this needs an extra frame to prevent A/V desync
int frames_needed = (int) ceil((double)video_sectors_per_block / frame_size);
if (frames_needed < 2)
frames_needed = 2;
int sector_count = 0;
for (; !decoder->end_of_input || encoder.state.frame_data_offset < encoder.state.frame_max_size; sector_count++) {
ensure_av_data(decoder, audio_samples_per_sector * args->audio_channels, frames_needed);
uint8_t sector[2048];
bool is_video_sector;
if (audio_samples_per_sector == 0)
is_video_sector = true;
else if (args->flags & FLAG_STR_TRAILING_AUDIO)
is_video_sector = (sector_count % interleave) < video_sectors_per_block;
else
is_video_sector = (sector_count % interleave) > 0;
if (is_video_sector) {
init_sector_buffer_video(args, sector, sector_count);
int frames_used = encode_sector_str(
&encoder,
args->format,
args->str_video_id,
decoder->video_frames,
sector
);
retire_av_data(decoder, 0, frames_used);
} else {
int samples_length = decoder->audio_sample_count / args->audio_channels;
if (samples_length > audio_samples_per_sector)
samples_length = audio_samples_per_sector;
// FIXME: this is an extremely hacky way to handle audio tracks
// shorter than the video track
if (!samples_length)
video_sectors_per_block++;
assert(false); // TODO: implement
retire_av_data(decoder, samples_length * args->audio_channels, 0);
}
fwrite(sector, 2048, 1, output);
time_t t = get_elapsed_time();
if (!(args->flags & FLAG_HIDE_PROGRESS) && t) {
fprintf(
stderr,
"\rFrame: %4d | LBA: %6d | Avg. q. scale: %5.2f | Encoding speed: %5.2fx",
encoder.state.frame_index,
sector_count,
(double)encoder.state.quant_scale_sum / (double)encoder.state.frame_index,
(double)(encoder.state.frame_index * args->str_fps_den) / (double)(t * args->str_fps_num)
);
}
}
free(encoder.state.frame_output);
destroy_mdec_encoder(&encoder);
}
void encode_file_sbs(const args_t *args, decoder_t *decoder, FILE *output) {
mdec_encoder_t encoder;
init_mdec_encoder(&encoder, args->video_codec, args->video_width, args->video_height);
encoder.state.frame_output = malloc(args->alignment);
encoder.state.frame_data_offset = 0;
encoder.state.frame_max_size = args->alignment;
encoder.state.quant_scale_sum = 0;
for (int j = 0; ensure_av_data(decoder, 0, 1); j++) {
encode_frame_bs(&encoder, decoder->video_frames);
retire_av_data(decoder, 0, 1);
fwrite(encoder.state.frame_output, args->alignment, 1, output);
time_t t = get_elapsed_time();
if (!(args->flags & FLAG_HIDE_PROGRESS) && t) {
fprintf(
stderr,
"\rFrame: %4d | Avg. q. scale: %5.2f | Encoding speed: %5.2fx",
j,
(double)encoder.state.quant_scale_sum / (double)j,
(double)(j * args->str_fps_den) / (double)(t * args->str_fps_num)
);
}
}
free(encoder.state.frame_output);
destroy_mdec_encoder(&encoder);
}