#pragma once
#include "gte_instruction.hpp"

// GTE Overview
// GTE Data Register Summary (cop2r0-31)
//
// `mtc2` (Move To Coprocessor 2): Sets a Data Register (0–31).
// `mfc2` (Move From Coprocessor 2): Gets a Data Register (0–31).
//
//  | cop2r0-1   | 3xS16 | VXY0,VZ0            | Vector 0 (X,Y,Z)                      |
//  | cop2r2-3   | 3xS16 | VXY1,VZ1            | Vector 1 (X,Y,Z)                      |
//  | cop2r4-5   | 3xS16 | VXY2,VZ2            | Vector 2 (X,Y,Z)                      |
//  | cop2r6     | 4xU8  | RGBC                | Color/code value                      |
//  | cop2r7     | 1xU16 | OTZ                 | Average Z value (for Ordering Table)  |
//  | cop2r8     | 1xS16 | IR0                 | 16bit Accumulator (Interpolate)       |
//  | cop2r9-11  | 3xS16 | IR1,IR2,IR3         | 16bit Accumulator (Vector)            |
//  | cop2r12-15 | 6xS16 | SXY0,SXY1,SXY2,SXYP | Screen XY-coordinate FIFO  (3 stages) |
//  | cop2r16-19 | 4xU16 | SZ0,SZ1,SZ2,SZ3     | Screen Z-coordinate FIFO   (4 stages) |
//  | cop2r20-22 | 12xU8 | RGB0,RGB1,RGB2      | Color CRGB-code/color FIFO (3 stages) |
//  | cop2r23    | 4xU8  | (RES1)              | Prohibited                            |
//  | cop2r24    | 1xS32 | MAC0                | 32bit Maths Accumulators (Value)      |
//  | cop2r25-27 | 3xS32 | MAC1,MAC2,MAC3      | 32bit Maths Accumulators (Vector)     |
//  | cop2r28-29 | 1xU15 | IRGB,ORGB           | Convert RGB Color (48bit vs 15bit)    |
//  | cop2r30-31 | 2xS32 | LZCS,LZCR           | Count Leading-Zeroes/Ones (sign bits) |
//
// GTE Control Register Summary (cop2r32-63)
// ctc2 (Copy To Control Coprocessor 2): Sets a Control Register (cnt0–31).
// cfc2 (Copy From Control Coprocessor 2): Gets a Control Register (cnt0–31).
//
//  | cop2r32-36 9xS16 RT11RT12,..,RT33 | Rotation matrix     (3x3)         | cnt0-4   |
//  | cop2r37-39 3x 32 TRX,TRY,TRZ      | Translation vector  (X,Y,Z)       | cnt5-7   |
//  | cop2r40-44 9xS16 L11L12,..,L33    | Light source matrix (3x3)         | cnt8-12  |
//  | cop2r45-47 3x 32 RBK,GBK,BBK      | Background color    (R,G,B)       | cnt13-15 |
//  | cop2r48-52 9xS16 LR1LR2,..,LB3    | Light color matrix source (3x3)   | cnt16-20 |
//  | cop2r53-55 3x 32 RFC,GFC,BFC      | Far color           (R,G,B)       | cnt21-23 |
//  | cop2r56-57 2x 32 OFX,OFY          | Screen offset       (X,Y)         | cnt24-25 | (1bit sign, 15bit integer, 16bit fraction)
//  | cop2r58 BuggyU16 H                | Projection plane distance.        | cnt26    | (0bit sign, 16bit integer, 0bit fraction)
//  | cop2r59      S16 DQA              | Depth cueing parameter A (coeff)  | cnt27    |
//  | cop2r60       32 DQB              | Depth cueing parameter B (offset) | cnt28    |
//  | cop2r61-62 2xS16 ZSF3,ZSF4        | Average Z scale factors           | cnt29-30 |
//  | cop2r63      U20 FLAG             | Returns any calculation errors    | cnt31    |


namespace JabyEngine {
    namespace GTE {
        // NOTE(review): presumably the capacity (in entries) of the internal matrix stack used by
        // push_matrix()/pop_matrix() - the stack itself is not defined in this header, TODO confirm
        static constexpr auto StackSize = 16;

        /*
            matrix: The matrix to set

            Sets the 3x3 constant rotation matrix and the parallel transfer vector from matrix
            (Only declared here - the definition is not part of this header)
        */
        void set_matrix(const MATRIX& matrix);

        /*
            returns: The current matrix

            Gets the current 3x3 constant rotation matrix and the parallel transfer vector
            (Only declared here - the definition is not part of this header)
        */
        MATRIX get_matrix();

        /*
            RotTrans

            Perform coordinate transformation using a rotation matrix
            input:  Input vector
            output: Output vector
            flag:   flag output
        */
        static void rot_trans(const SVECTOR& input, VECTOR& output, int32_t& flag) {
            ldv0(input);
            rt();
            stlvnl(output);
            stflg(flag);
        }

        /*
            ScaleMatrix

            m: The matrix to scale (input/output)
            v: The scale vector (input)

            returns: m
            Scales m by v. The components of v are fixed point decimals in which 1.0 represents 4096

            v.x is used with index 0, v.y with index 1 and v.z with index 2 of m.
            The index is passed unchanged to ldclmv/stclmv
        */
        static ROTMATRIX& scale_matrix(ROTMATRIX& m, const VECTOR& v) {
            // Multiplies one part of the matrix (selected by index) with factor and writes it back
            const auto scale_part = [](int32_t factor, ROTMATRIX& target, size_t index) {
                ldir0(factor);          // factor goes to IR0 (cop2r8)
                ldclmv(target, index);  // matrix values go to IR1-IR3 (cop2r9-11)
                gpf12();                // gte_gpf12
                stclmv(target, index);  // result goes back into the matrix
            };

            scale_part(v.x, m, 0);
            scale_part(v.y, m, 1);
            scale_part(v.z, m, 2);
            return m;
        }

        /*
            SetRotMatrix

            Sets a 3x3 matrix as the constant rotation matrix.
            matrix: The rotation matrix to set

            Five 32bit words are read from matrix (byte offsets 0 to 16) and written to the
            control registers 0-4 (RT11RT12,..,RT33).
            NOTE(review): this reads 20 bytes - assumes ROTMATRIX is at least that large, TODO confirm
        */
        static void set_rot_matrix(const ROTMATRIX& matrix) {
            // This has to be a single asm statement: $12-$14 carry values from the loads to the
            // ctc2 instructions and the compiler gives no guarantee for the content of clobbered
            // registers between two separate asm statements.
            // The "memory" clobber is required because the asm reads matrix through a pointer,
            // so all pending writes to it must have happened before the sequence runs.
            __asm__ volatile(
                "lw   $12, 0(%0)\n\t"
                "lw   $13, 4(%0)\n\t"
                "ctc2 $12, $0\n\t"
                "ctc2 $13, $1\n\t"
                "lw   $12, 8(%0)\n\t"
                "lw   $13, 12(%0)\n\t"
                "lw   $14, 16(%0)\n\t"
                "ctc2 $12, $2\n\t"
                "ctc2 $13, $3\n\t"
                "ctc2 $14, $4"
                :
                : "r"(&matrix)
                : "$12", "$13", "$14", "memory"
            );
        }

        /*
            GetRotMatrix

            Writes the current 3x3 constant rotation matrix to matrix
            matrix: Receives the rotation matrix

            The control registers 0-4 (RT11RT12,..,RT33) are read and stored as five 32bit words
            to matrix (byte offsets 0 to 16).
            NOTE(review): this writes 20 bytes - assumes ROTMATRIX is at least that large, TODO confirm
        */
        static void get_rot_matrix(ROTMATRIX &matrix) {
            // This has to be a single asm statement: $12-$14 carry values from the cfc2
            // instructions to the stores and the compiler gives no guarantee for the content of
            // clobbered registers between two separate asm statements.
            // The "memory" clobber is required because the asm writes matrix through a pointer
            // that is only passed as input, otherwise the compiler does not know that matrix changed.
            __asm__ volatile(
                "cfc2 $12, $0\n\t"
                "cfc2 $13, $1\n\t"
                "sw   $12, 0(%0)\n\t"
                "sw   $13, 4(%0)\n\t"
                "cfc2 $12, $2\n\t"
                "cfc2 $13, $3\n\t"
                "cfc2 $14, $4\n\t"
                "sw   $12, 8(%0)\n\t"
                "sw   $13, 12(%0)\n\t"
                "sw   $14, 16(%0)"
                :
                : "r"(&matrix)
                : "$12", "$13", "$14", "memory"
            );
        }

        /*
            SetTransMatrix

            Sets the constant parallel transfer vector
            vector: The transfer vector to set

            Three 32bit words are read from vector (byte offsets 0, 4 and 8) and written to the
            control registers 5-7 (TRX,TRY,TRZ).
        */
        static void set_trans_vector(const TRANSFERVECTOR& vector)  {
            // This has to be a single asm statement: $12-$14 carry values from the loads to the
            // ctc2 instructions and the compiler gives no guarantee for the content of clobbered
            // registers between two separate asm statements.
            // The "memory" clobber is required because the asm reads vector through a pointer,
            // so all pending writes to it must have happened before the sequence runs.
            __asm__ volatile(
                "lw   $12, 0(%0)\n\t"
                "lw   $13, 4(%0)\n\t"
                "ctc2 $12, $5\n\t"
                "lw   $14, 8(%0)\n\t"
                "ctc2 $13, $6\n\t"
                "ctc2 $14, $7"
                :
                : "r"(&vector)
                : "$12", "$13", "$14", "memory"
            );
        }

        /*
            GetTransMatrix

            Writes the current constant parallel transfer vector to vector
            vector: Receives the transfer vector

            The control registers 5-7 (TRX,TRY,TRZ) are read and stored as three 32bit words
            to vector (byte offsets 0, 4 and 8).
        */
        static void get_trans_vector(TRANSFERVECTOR& vector) {
            // This has to be a single asm statement: $12-$14 carry values from the cfc2
            // instructions to the stores and the compiler gives no guarantee for the content of
            // clobbered registers between two separate asm statements.
            // The "memory" clobber is required because the asm writes vector through a pointer
            // that is only passed as input, otherwise the compiler does not know that vector changed.
            __asm__ volatile(
                "cfc2 $14, $7\n\t"
                "cfc2 $13, $6\n\t"
                "sw   $14, 8(%0)\n\t"
                "cfc2 $12, $5\n\t"
                "sw   $13, 4(%0)\n\t"
                "sw   $12, 0(%0)"
                :
                : "r"(&vector)
                : "$12", "$13", "$14", "memory"
            );
        }
        
        /*
            ApplyMatrix

            m0: The matrix to apply
            v0: The vector the matrix is applied to
            v1: Receives the result
            returns: v1

            Applies the matrix to the vector.
            m0 is installed with set_matrix first, so the function destroys the
            constant rotation matrix and transfer vector
        */
        static SVECTOR& apply_matrix(const MATRIX& m0, const SVECTOR& v0, SVECTOR& v1) {
            set_matrix(m0);

            GTE::ldv0(v0);
            GTE::rt();
            GTE::stsv(v1);
            return v1;
        }

        /*
            ApplyMatrix (GPU::Vertex version)

            m0: The matrix to apply
            v0: The vertex the matrix is applied to
            v1: Receives the result
            returns: v1

            Same as apply_matrix for SVECTOR but works on GPU::Vertex.
            m0 is installed with set_matrix first, so the function destroys the
            constant rotation matrix and transfer vector
        */
        static GPU::Vertex& apply_matrix(const MATRIX& m0, const GPU::Vertex& v0, GPU::Vertex& v1) {
            set_matrix(m0);

            GTE::ldgv0(v0);
            GTE::rt();
            GTE::stgv(v1);
            return v1;
        }

        /*
            MulMatrix0

            m0: first input
            m1: second input
            result: receives the result of the multiplication
            returns: result

            Multiplies the two matrices m0 and m1.
            The function destroys the constant rotation matrix
            (Only declared here - the definition is not part of this header)
        */
        ROTMATRIX& multiply_matrix(const ROTMATRIX& m0, const ROTMATRIX& m1, ROTMATRIX& result);

        /*
            CompMatrix

            m0: first input
            m1: second input
            result: receives the composition of m0 and m1
            returns: result

            The rotation of result is multiply_matrix(m0.rotation, m1.rotation).
            The transfer vector of result is obtained by installing m0.transfer as the constant
            transfer vector and transforming m1.transfer with rt.

            The function overwrites the constant transfer vector (set_trans_vector) and
            multiply_matrix destroys the constant rotation matrix.
            result may be the same object as m1 (MATRIX::comp calls it that way)
        */
        static MATRIX& comp_matrix(const MATRIX& m0, const MATRIX& m1, MATRIX& result) {
            // The transfer vectors are accessed as VECTOR by the load/store helpers
            const VECTOR& m1_transfer     = reinterpret_cast<const VECTOR&>(m1.transfer);
            VECTOR&       result_transfer = reinterpret_cast<VECTOR&>(result.transfer);

            multiply_matrix(m0.rotation, m1.rotation, result.rotation);
            set_trans_vector(m0.transfer);
            GTE::ldlv0(m1_transfer);
            GTE::rt();
            GTE::stlvnl(result_transfer);
            return result;
        }

        /*
            Pushes the current matrix (rotation and parallel) to an internal stack
            (Only declared here - the definition is not part of this header)
        */
        void push_matrix();

        /*
            matrix: The matrix that replaces the current one

            Pushes the current matrix (rotation and parallel) to an internal stack
            and replaces the current matrix (rotation and parallel) with matrix
        */
        void push_matrix_and_set(const MATRIX& matrix);

        /*
            Restores the previous stored matrix (rotation and parallel)

            NOTE(review): get_and_pop_matrix presumably also returns a matrix involved in the pop -
            which one (replaced or restored) is not visible from this header, TODO confirm
        */
        MATRIX get_and_pop_matrix();
        void   pop_matrix();

        /*
            SetGeomOffset(ofx,ofy)

            Load GTE-offset.
            off_x: Screen offset X
            off_y: Screen offset Y

            Both values are shifted left by 16 bits before they are written to the control
            registers 24 (OFX) and 25 (OFY), which hold the offset with a 16bit fraction.
        */
        static void set_geom_offset(int32_t off_x, int32_t off_y) {
            // This has to be a single asm statement: $12/$13 carry the shifted values to the
            // ctc2 instructions and the compiler gives no guarantee for the content of clobbered
            // registers between two separate asm statements.
            __asm__ volatile(
                "sll  $12, %0, 16\n\t"
                "sll  $13, %1, 16\n\t"
                "ctc2 $12, $24\n\t"
                "ctc2 $13, $25"
                :
                : "r"(off_x), "r"(off_y)
                : "$12", "$13"
            );
        }

        /*
            Reads the GTE-offset.
            off_x: Receives screen offset X
            off_y: Receives screen offset Y

            The control registers 24 (OFX) and 25 (OFY) are read and shifted right by 16 bits,
            which removes the fraction part that set_geom_offset added.
        */
        static void get_geom_offset(int32_t &off_x, int32_t &off_y) {
            int32_t ofx_fixed;
            int32_t ofy_fixed;

            __asm__ volatile (
                "cfc2 %0, $24\n"
                "cfc2 %1, $25"
                : "=r" (ofx_fixed), "=r" (ofy_fixed)
            );

            off_x = (ofx_fixed >> 16);
            off_y = (ofy_fixed >> 16);
        }

        /*
            SetGeomScreen(h)

            Load distance from viewpoint to screen.
            h: The distance, written to control register 26 (H)
        */
        static void set_geom_screen(int32_t h) {
            __asm__ volatile(
                "ctc2 %0, $26"
                :
                : "r"(h)
            );
        }

        /*
            GetGeomScreen()

            Get distance from viewpoint to screen.
            returns: The value of control register 26 (H) in the range 0 to 0xFFFF

            H is an unsigned 16bit register, but reading it delivers the value sign-expanded
            to 32bit (0x8000..0xFFFF would be returned as negative numbers). The upper bits
            are masked so that the value passed to set_geom_screen is returned.
        */
        static int32_t get_geom_screen() {
            int32_t h;

            __asm__ volatile("cfc2 %0, $26" : "=r"(h));
            return h & 0xFFFF;  // undo the sign expansion of the hardware read
        }

        // Implementations for the MATRIX struct
        /*
            matrix: The matrix used as first input of comp_matrix
            returns: this matrix

            Replaces this matrix with the result of comp_matrix(matrix, this, this)
        */
        inline MATRIX& MATRIX :: comp(const MATRIX& matrix) {
            MATRIX& self = *this;

            return comp_matrix(matrix, self, self);
        }

        /*
            vertex: The vertex to transform (input/output)
            returns: vertex

            Applies this matrix to vertex in place (see apply_matrix)
        */
        inline GPU::Vertex& MATRIX :: apply_to(GPU::Vertex& vertex) const {
            const MATRIX& self = *this;

            return apply_matrix(self, vertex, vertex);
        }

        /*
            vertex: The vertex to transform (not modified)
            returns: The transformed vertex

            Applies this matrix to a copy of vertex (see apply_matrix)
        */
        inline GPU::Vertex MATRIX :: apply_to(const GPU::Vertex& vertex) const {
            GPU::Vertex transformed;

            apply_matrix(*this, vertex, transformed);
            return transformed;
        }
    }
}