/* libpsxav: MDEC video + SPU/XA-ADPCM audio library Copyright (c) 2019, 2020 Adrian "asie" Siekierka Copyright (c) 2019 Ben "GreaseMonkey" Russell Copyright (c) 2023 spicyjpeg This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. */ #include #include #include "libpsxav.h" #define SHIFT_RANGE_4BPS 12 #define SHIFT_RANGE_8BPS 8 #define ADPCM_FILTER_COUNT 5 #define XA_ADPCM_FILTER_COUNT 4 #define SPU_ADPCM_FILTER_COUNT 5 static const int16_t filter_k1[ADPCM_FILTER_COUNT] = {0, 60, 115, 98, 122}; static const int16_t filter_k2[ADPCM_FILTER_COUNT] = {0, 0, -52, -55, -60}; static int find_min_shift(const psx_audio_encoder_channel_state_t *state, int16_t *samples, int sample_limit, int pitch, int filter, int shift_range) { // Assumption made: // // There is value in shifting right one step further to allow the nibbles to clip. // However, given a possible shift value, there is no value in shifting one step less. // // Having said that, this is not a completely accurate model of the encoder, // so maybe we will need to shift one step less. // int prev1 = state->prev1; int prev2 = state->prev2; int k1 = filter_k1[filter]; int k2 = filter_k2[filter]; int right_shift = 0; int32_t s_min = 0; int32_t s_max = 0; for (int i = 0; i < 28; i++) { int32_t raw_sample = (i >= sample_limit) ? 0 : samples[i * pitch]; int32_t previous_values = (k1*prev1 + k2*prev2 + (1<<5))>>6; int32_t sample = raw_sample - previous_values; if (sample < s_min) { s_min = sample; } if (sample > s_max) { s_max = sample; } prev2 = prev1; prev1 = raw_sample; } while(right_shift < shift_range && (s_max>>right_shift) > (+0x7FFF >> shift_range)) { right_shift += 1; }; while(right_shift < shift_range && (s_min>>right_shift) < (-0x8000 >> shift_range)) { right_shift += 1; }; int min_shift = shift_range - right_shift; assert(0 <= min_shift && min_shift <= shift_range); return min_shift; } static uint8_t attempt_to_encode(psx_audio_encoder_channel_state_t *outstate, const psx_audio_encoder_channel_state_t *instate, int16_t *samples, int sample_limit, int pitch, uint8_t *data, int data_shift, int data_pitch, int filter, int sample_shift, int shift_range) { uint8_t sample_mask = 0xFFFF >> shift_range; uint8_t nondata_mask = ~(sample_mask << data_shift); int min_shift = sample_shift; int k1 = filter_k1[filter]; int k2 = filter_k2[filter]; uint8_t hdr = (min_shift & 0x0F) | (filter << 4); if (outstate != instate) { memcpy(outstate, instate, sizeof(psx_audio_encoder_channel_state_t)); } outstate->mse = 0; for (int i = 0; i < 28; i++) { int32_t sample = ((i >= sample_limit) ? 0 : samples[i * pitch]) + outstate->qerr; int32_t previous_values = (k1*outstate->prev1 + k2*outstate->prev2 + (1<<5))>>6; int32_t sample_enc = sample - previous_values; sample_enc <<= min_shift; sample_enc += (1<<(shift_range-1)); sample_enc >>= shift_range; if(sample_enc < (-0x8000 >> shift_range)) { sample_enc = -0x8000 >> shift_range; } if(sample_enc > (+0x7FFF >> shift_range)) { sample_enc = +0x7FFF >> shift_range; } sample_enc &= sample_mask; int32_t sample_dec = (int16_t) ((sample_enc & sample_mask) << shift_range); sample_dec >>= min_shift; sample_dec += previous_values; if (sample_dec > +0x7FFF) { sample_dec = +0x7FFF; } if (sample_dec < -0x8000) { sample_dec = -0x8000; } int64_t sample_error = sample_dec - sample; assert(sample_error < (1<<30)); assert(sample_error > -(1<<30)); data[i * data_pitch] = (data[i * data_pitch] & nondata_mask) | (sample_enc << data_shift); // FIXME: dithering is hard to predict //outstate->qerr += sample_error; outstate->mse += ((uint64_t)sample_error) * (uint64_t)sample_error; outstate->prev2 = outstate->prev1; outstate->prev1 = sample_dec; } return hdr; } static uint8_t encode(psx_audio_encoder_channel_state_t *state, int16_t *samples, int sample_limit, int pitch, uint8_t *data, int data_shift, int data_pitch, int filter_count, int shift_range) { psx_audio_encoder_channel_state_t proposed; int64_t best_mse = ((int64_t)1<<(int64_t)50); int best_filter = 0; int best_sample_shift = 0; for (int filter = 0; filter < filter_count; filter++) { int true_min_shift = find_min_shift(state, samples, sample_limit, pitch, filter, shift_range); // Testing has shown that the optimal shift can be off the true minimum shift // by 1 in *either* direction. // This is NOT the case when dither is used. int min_shift = true_min_shift - 1; int max_shift = true_min_shift + 1; if (min_shift < 0) { min_shift = 0; } if (max_shift > shift_range) { max_shift = shift_range; } for (int sample_shift = min_shift; sample_shift <= max_shift; sample_shift++) { // ignore header here attempt_to_encode( &proposed, state, samples, sample_limit, pitch, data, data_shift, data_pitch, filter, sample_shift, shift_range); if (best_mse > proposed.mse) { best_mse = proposed.mse; best_filter = filter; best_sample_shift = sample_shift; } } } // now go with the encoder return attempt_to_encode( state, state, samples, sample_limit, pitch, data, data_shift, data_pitch, best_filter, best_sample_shift, shift_range); } static void encode_block_xa(int16_t *audio_samples, int audio_samples_limit, uint8_t *data, psx_audio_xa_settings_t settings, psx_audio_encoder_state_t *state) { if (settings.bits_per_sample == 4) { if (settings.stereo) { data[0] = encode(&(state->left), audio_samples, audio_samples_limit, 2, data + 0x10, 0, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_4BPS); data[1] = encode(&(state->right), audio_samples + 1, audio_samples_limit, 2, data + 0x10, 4, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_4BPS); data[2] = encode(&(state->left), audio_samples + 56, audio_samples_limit - 28, 2, data + 0x11, 0, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_4BPS); data[3] = encode(&(state->right), audio_samples + 56 + 1, audio_samples_limit - 28, 2, data + 0x11, 4, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_4BPS); data[8] = encode(&(state->left), audio_samples + 56*2, audio_samples_limit - 28*2, 2, data + 0x12, 0, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_4BPS); data[9] = encode(&(state->right), audio_samples + 56*2 + 1, audio_samples_limit - 28*2, 2, data + 0x12, 4, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_4BPS); data[10] = encode(&(state->left), audio_samples + 56*3, audio_samples_limit - 28*3, 2, data + 0x13, 0, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_4BPS); data[11] = encode(&(state->right), audio_samples + 56*3 + 1, audio_samples_limit - 28*3, 2, data + 0x13, 4, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_4BPS); } else { data[0] = encode(&(state->left), audio_samples, audio_samples_limit, 1, data + 0x10, 0, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_4BPS); data[1] = encode(&(state->left), audio_samples + 28, audio_samples_limit - 28, 1, data + 0x10, 4, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_4BPS); data[2] = encode(&(state->left), audio_samples + 28*2, audio_samples_limit - 28*2, 1, data + 0x11, 0, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_4BPS); data[3] = encode(&(state->left), audio_samples + 28*3, audio_samples_limit - 28*3, 1, data + 0x11, 4, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_4BPS); data[8] = encode(&(state->left), audio_samples + 28*4, audio_samples_limit - 28*4, 1, data + 0x12, 0, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_4BPS); data[9] = encode(&(state->left), audio_samples + 28*5, audio_samples_limit - 28*5, 1, data + 0x12, 4, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_4BPS); data[10] = encode(&(state->left), audio_samples + 28*6, audio_samples_limit - 28*6, 1, data + 0x13, 0, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_4BPS); data[11] = encode(&(state->left), audio_samples + 28*7, audio_samples_limit - 28*7, 1, data + 0x13, 4, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_4BPS); } } else { if (settings.stereo) { data[0] = encode(&(state->left), audio_samples, audio_samples_limit, 2, data + 0x10, 0, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_8BPS); data[1] = encode(&(state->right), audio_samples + 1, audio_samples_limit, 2, data + 0x11, 0, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_8BPS); data[2] = encode(&(state->left), audio_samples + 56, audio_samples_limit - 28, 2, data + 0x12, 0, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_8BPS); data[3] = encode(&(state->right), audio_samples + 56 + 1, audio_samples_limit - 28, 2, data + 0x13, 0, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_8BPS); } else { data[0] = encode(&(state->left), audio_samples, audio_samples_limit, 1, data + 0x10, 0, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_8BPS); data[1] = encode(&(state->left), audio_samples + 28, audio_samples_limit - 28, 1, data + 0x11, 0, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_8BPS); data[2] = encode(&(state->left), audio_samples + 28*2, audio_samples_limit - 28*2, 1, data + 0x12, 0, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_8BPS); data[3] = encode(&(state->left), audio_samples + 28*3, audio_samples_limit - 28*3, 1, data + 0x13, 0, 4, XA_ADPCM_FILTER_COUNT, SHIFT_RANGE_8BPS); } } } uint32_t psx_audio_xa_get_buffer_size(psx_audio_xa_settings_t settings, int sample_count) { int sample_pitch = psx_audio_xa_get_samples_per_sector(settings); int xa_sectors = ((sample_count + sample_pitch - 1) / sample_pitch); int xa_sector_size = psx_audio_xa_get_buffer_size_per_sector(settings); return xa_sectors * xa_sector_size; } uint32_t psx_audio_spu_get_buffer_size(int sample_count) { return ((sample_count + 27) / 28) << 4; } uint32_t psx_audio_xa_get_buffer_size_per_sector(psx_audio_xa_settings_t settings) { return settings.format == PSX_AUDIO_XA_FORMAT_XA ? 2336 : 2352; } uint32_t psx_audio_spu_get_buffer_size_per_block(void) { return 16; } uint32_t psx_audio_xa_get_samples_per_sector(psx_audio_xa_settings_t settings) { return (((settings.bits_per_sample == 8) ? 112 : 224) >> (settings.stereo ? 1 : 0)) * 18; } uint32_t psx_audio_spu_get_samples_per_block(void) { return 28; } uint32_t psx_audio_xa_get_sector_interleave(psx_audio_xa_settings_t settings) { // 1/2 interleave for 37800 Hz 8-bit stereo at 1x speed int interleave = settings.stereo ? 2 : 4; if (settings.frequency == PSX_AUDIO_XA_FREQ_SINGLE) { interleave <<= 1; } if (settings.bits_per_sample == 4) { interleave <<= 1; } return interleave; } static void psx_audio_xa_encode_init_sector(uint8_t *buffer, psx_audio_xa_settings_t settings) { if (settings.format == PSX_AUDIO_XA_FORMAT_XACD) { memset(buffer, 0, 2352); memset(buffer+0x001, 0xFF, 10); buffer[0x00F] = 0x02; } else { memset(buffer + 0x10, 0, 2336); } buffer[0x010] = settings.file_number; buffer[0x011] = settings.channel_number & 0x1F; buffer[0x012] = 0x24 | 0x40; buffer[0x013] = (settings.stereo ? 1 : 0) | (settings.frequency >= PSX_AUDIO_XA_FREQ_DOUBLE ? 0 : 4) | (settings.bits_per_sample >= 8 ? 16 : 0); memcpy(buffer + 0x014, buffer + 0x010, 4); } int psx_audio_xa_encode(psx_audio_xa_settings_t settings, psx_audio_encoder_state_t *state, int16_t* samples, int sample_count, uint8_t *output) { int sample_jump = (settings.bits_per_sample == 8) ? 112 : 224; int i, j; int xa_sector_size = settings.format == PSX_AUDIO_XA_FORMAT_XA ? 2336 : 2352; int xa_offset = 2352 - xa_sector_size; uint8_t init_sector = 1; if (settings.stereo) { sample_count <<= 1; } for (i = 0, j = 0; i < sample_count || ((j % 18) != 0); i += sample_jump, j++) { uint8_t *sector_data = output + ((j/18) * xa_sector_size) - xa_offset; uint8_t *block_data = sector_data + 0x18 + ((j%18) * 0x80); if (init_sector) { psx_audio_xa_encode_init_sector(sector_data, settings); init_sector = 0; } encode_block_xa(samples + i, sample_count - i, block_data, settings, state); memcpy(block_data + 4, block_data, 4); memcpy(block_data + 12, block_data + 8, 4); if ((j+1)%18 == 0) { psx_cdrom_calculate_checksums(sector_data, PSX_CDROM_SECTOR_TYPE_MODE2_FORM2); init_sector = 1; } } return (((j + 17) / 18) * xa_sector_size); } void psx_audio_xa_encode_finalize(psx_audio_xa_settings_t settings, uint8_t *output, int output_length) { if (output_length >= 2336) { output[output_length - 2352 + 0x12] |= 0x80; output[output_length - 2352 + 0x18] |= 0x80; } } int psx_audio_xa_encode_simple(psx_audio_xa_settings_t settings, int16_t* samples, int sample_count, uint8_t *output) { psx_audio_encoder_state_t state; memset(&state, 0, sizeof(psx_audio_encoder_state_t)); int length = psx_audio_xa_encode(settings, &state, samples, sample_count, output); psx_audio_xa_encode_finalize(settings, output, length); return length; } int psx_audio_spu_encode(psx_audio_encoder_channel_state_t *state, int16_t* samples, int sample_count, int pitch, uint8_t *output) { uint8_t prebuf[28]; uint8_t *buffer = output; for (int i = 0; i < sample_count; i += 28, buffer += 16) { buffer[0] = encode(state, samples + i * pitch, sample_count - i, pitch, prebuf, 0, 1, SPU_ADPCM_FILTER_COUNT, SHIFT_RANGE_4BPS); buffer[1] = 0; for (int j = 0; j < 28; j+=2) { buffer[2 + (j>>1)] = (prebuf[j] & 0x0F) | (prebuf[j+1] << 4); } } return buffer - output; } int psx_audio_spu_encode_simple(int16_t* samples, int sample_count, uint8_t *output, int loop_start) { psx_audio_encoder_channel_state_t state; memset(&state, 0, sizeof(psx_audio_encoder_channel_state_t)); int length = psx_audio_spu_encode(&state, samples, sample_count, 1, output); if (length >= 32) { if (loop_start < 0) { //output[1] = PSX_AUDIO_SPU_LOOP_START; output[length - 16 + 1] = PSX_AUDIO_SPU_LOOP_END; } else { psx_audio_spu_set_flag_at_sample(output, loop_start, PSX_AUDIO_SPU_LOOP_START); output[length - 16 + 1] = PSX_AUDIO_SPU_LOOP_REPEAT; } } else if (length >= 16) { output[1] = PSX_AUDIO_SPU_LOOP_START | PSX_AUDIO_SPU_LOOP_END; if (loop_start >= 0) output[1] |= PSX_AUDIO_SPU_LOOP_REPEAT; } return length; } void psx_audio_spu_set_flag_at_sample(uint8_t* spu_data, int sample_pos, int flag) { int buffer_pos = (sample_pos / 28) << 4; spu_data[buffer_pos + 1] = flag; }