From 524188bb7aa08692a688ea7911f757a298913108 Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Fri, 8 Sep 2023 02:28:51 +0200 Subject: [PATCH] Refactor more GX2 code to use LatteReg.h --- src/Cafe/CafeSystem.cpp | 2 +- src/Cafe/HW/Latte/Core/FetchShader.cpp | 4 +- src/Cafe/HW/Latte/ISA/LatteReg.h | 305 ++++++++++++++++++++- src/Cafe/HW/Latte/ISA/RegDefines.h | 2 - src/Cafe/OS/libs/gx2/GX2.cpp | 7 +- src/Cafe/OS/libs/gx2/GX2.h | 6 - src/Cafe/OS/libs/gx2/GX2_Command.cpp | 4 +- src/Cafe/OS/libs/gx2/GX2_Shader.cpp | 196 ++++++++++++- src/Cafe/OS/libs/gx2/GX2_Shader.h | 58 ++-- src/Cafe/OS/libs/gx2/GX2_shader_legacy.cpp | 214 +-------------- 10 files changed, 536 insertions(+), 262 deletions(-) diff --git a/src/Cafe/CafeSystem.cpp b/src/Cafe/CafeSystem.cpp index 8c2344ce..93ced948 100644 --- a/src/Cafe/CafeSystem.cpp +++ b/src/Cafe/CafeSystem.cpp @@ -487,7 +487,7 @@ namespace CafeSystem #if BOOST_OS_WINDOWS std::string GetWindowsNamedVersion(uint32& buildNumber) { - static char productName[256]; + char productName[256]; HKEY hKey; DWORD dwType = REG_SZ; DWORD dwSize = sizeof(productName); diff --git a/src/Cafe/HW/Latte/Core/FetchShader.cpp b/src/Cafe/HW/Latte/Core/FetchShader.cpp index b4beba4e..6c9893f9 100644 --- a/src/Cafe/HW/Latte/Core/FetchShader.cpp +++ b/src/Cafe/HW/Latte/Core/FetchShader.cpp @@ -228,13 +228,13 @@ void _fetchShaderDecompiler_parseInstruction_VTX_SEMANTIC(LatteFetchShader* pars else if (srcSelX == LatteClauseInstruction_VTX::SRC_SEL::SEL_Y) { // use alu divisor 1 - attribGroup->attrib[groupAttribIndex].aluDivisor = (sint32)contextRegister[mmVGT_INSTANCE_STEP_RATE_0 + 0]; + attribGroup->attrib[groupAttribIndex].aluDivisor = (sint32)contextRegister[Latte::REGADDR::VGT_INSTANCE_STEP_RATE_0]; cemu_assert_debug(attribGroup->attrib[groupAttribIndex].aluDivisor > 0); } else if (srcSelX == LatteClauseInstruction_VTX::SRC_SEL::SEL_Z) { // use alu divisor 2 - attribGroup->attrib[groupAttribIndex].aluDivisor = (sint32)contextRegister[mmVGT_INSTANCE_STEP_RATE_0 + 1]; + attribGroup->attrib[groupAttribIndex].aluDivisor = (sint32)contextRegister[Latte::REGADDR::VGT_INSTANCE_STEP_RATE_1]; cemu_assert_debug(attribGroup->attrib[groupAttribIndex].aluDivisor > 0); } } diff --git a/src/Cafe/HW/Latte/ISA/LatteReg.h b/src/Cafe/HW/Latte/ISA/LatteReg.h index e539902f..7f0cf7c9 100644 --- a/src/Cafe/HW/Latte/ISA/LatteReg.h +++ b/src/Cafe/HW/Latte/ISA/LatteReg.h @@ -381,6 +381,9 @@ namespace Latte PA_SC_GENERIC_SCISSOR_TL = 0xA090, PA_SC_GENERIC_SCISSOR_BR = 0xA091, + SQ_VTX_SEMANTIC_0 = 0xA0E0, + SQ_VTX_SEMANTIC_31 = 0xA0FF, + VGT_MULTI_PRIM_IB_RESET_INDX = 0xA103, SX_ALPHA_TEST_CONTROL = 0xA104, CB_BLEND_RED = 0xA105, @@ -398,6 +401,10 @@ namespace Latte PA_CL_VPORT_ZSCALE = 0xA113, PA_CL_VPORT_ZOFFSET = 0xA114, + SPI_VS_OUT_ID_0 = 0xA185, + + SPI_VS_OUT_CONFIG = 0xA1B1, + CB_BLEND0_CONTROL = 0xA1E0, // first CB_BLEND7_CONTROL = 0xA1E7, // last @@ -408,7 +415,23 @@ namespace Latte PA_CL_CLIP_CNTL = 0xA204, PA_SU_SC_MODE_CNTL = 0xA205, PA_CL_VTE_CNTL = 0xA206, - + PA_CL_VS_OUT_CNTL = 0xA207, + + // shader program descriptors: + SQ_PGM_START_PS = 0xA210, + SQ_PGM_RESOURCES_PS = 0xA214, + SQ_PGM_EXPORTS_PS = 0xA215, + SQ_PGM_START_VS = 0xA216, + SQ_PGM_RESOURCES_VS = 0xA21A, + SQ_PGM_START_GS = 0xA21B, + SQ_PGM_RESOURCES_GS = 0xA21F, + SQ_PGM_START_ES = 0xA220, + SQ_PGM_RESOURCES_ES = 0xA224, + SQ_PGM_START_FS = 0xA225, + SQ_PGM_RESOURCES_FS = 0xA229, + + SQ_VTX_SEMANTIC_CLEAR = 0xA238, + PA_SU_POINT_SIZE = 0xA280, PA_SU_POINT_MINMAX = 0xA281, @@ -416,6 +439,35 @@ namespace Latte VGT_DMA_INDEX_TYPE = 0xA29F, // todo - verify offset + VGT_PRIMITIVEID_EN = 0xA2A1, + + VGT_MULTI_PRIM_IB_RESET_EN = 0xA2A5, + + VGT_INSTANCE_STEP_RATE_0 = 0xA2A8, + VGT_INSTANCE_STEP_RATE_1 = 0xA2A9, + + VGT_STRMOUT_BUFFER_SIZE_0 = 0xA2B4, + VGT_STRMOUT_VTX_STRIDE_0 = 0xA2B5, + VGT_STRMOUT_BUFFER_BASE_0 = 0xA2B6, + VGT_STRMOUT_BUFFER_OFFSET_0 = 0xA2B7, + VGT_STRMOUT_BUFFER_SIZE_1 = 0xA2B8, + VGT_STRMOUT_VTX_STRIDE_1 = 0xA2B9, + VGT_STRMOUT_BUFFER_BASE_1 = 0xA2BA, + VGT_STRMOUT_BUFFER_OFFSET_1 = 0xA2BB, + VGT_STRMOUT_BUFFER_SIZE_2 = 0xA2BC, + VGT_STRMOUT_VTX_STRIDE_2 = 0xA2BD, + VGT_STRMOUT_BUFFER_BASE_2 = 0xA2BE, + VGT_STRMOUT_BUFFER_OFFSET_2 = 0xA2BF, + VGT_STRMOUT_BUFFER_SIZE_3 = 0xA2C0, + VGT_STRMOUT_VTX_STRIDE_3 = 0xA2C1, + VGT_STRMOUT_BUFFER_BASE_3 = 0xA2C2, + VGT_STRMOUT_BUFFER_OFFSET_3 = 0xA2C3, + VGT_STRMOUT_BASE_OFFSET_0 = 0xA2C4, + VGT_STRMOUT_BASE_OFFSET_1 = 0xA2C5, + VGT_STRMOUT_BASE_OFFSET_2 = 0xA2C6, + VGT_STRMOUT_BASE_OFFSET_3 = 0xA2C7, + VGT_STRMOUT_BUFFER_EN = 0xA2C8, + // HiZ early stencil test? DB_SRESULTS_COMPARE_STATE0 = 0xA34A, DB_SRESULTS_COMPARE_STATE1 = 0xA34B, @@ -842,6 +894,12 @@ float get_##__regname() const \ LATTE_BITFIELD_BOOL(VTX_W0_FMT, 10); }; + struct LATTE_PA_CL_VS_OUT_CNTL : LATTEREG // 0xA207 + { + LATTE_BITFIELD(CLIP_DIST_ENA_MASK, 0, 8); + LATTE_BITFIELD(CULL_DIST_ENA_MASK, 8, 8); + }; + struct LATTE_PA_SU_POINT_SIZE : LATTEREG // 0xA280 { LATTE_BITFIELD(HEIGHT, 0, 16); @@ -909,6 +967,54 @@ float get_##__regname() const \ LATTE_BITFIELD_FULL_TYPED(INDEX_TYPE, E_INDEX_TYPE); }; + struct LATTE_VGT_PRIMITIVEID_EN : LATTEREG // 0xA2A1 + { + LATTE_BITFIELD_BOOL(PRIMITIVEID_EN, 0); + }; + + struct LATTE_VGT_MULTI_PRIM_IB_RESET_EN : LATTEREG // 0xA2A5 + { + LATTE_BITFIELD_BOOL(RESET_EN, 0); + }; + + struct LATTE_VGT_INSTANCE_STEP_RATE_X : LATTEREG // 0xA2A8-0xA2A9 + { + LATTE_BITFIELD_FULL_TYPED(STEP_RATE, uint32); + }; + + struct LATTE_VGT_STRMOUT_BUFFER_SIZE_X : LATTEREG // 0xA2B4 + index * 4 + { + LATTE_BITFIELD_FULL_TYPED(SIZE, uint32); + }; + + struct LATTE_VGT_STRMOUT_STRIDE_X : LATTEREG // 0xA2B5 + index * 4 + { + LATTE_BITFIELD_FULL_TYPED(STRIDE, uint32); + }; + + struct LATTE_VGT_STRMOUT_BUFFER_BASE_X : LATTEREG // 0xA2B6 + index * 4 + { + LATTE_BITFIELD_FULL_TYPED(BASE, uint32); + }; + + struct LATTE_VGT_STRMOUT_BUFFER_OFFSET_X : LATTEREG // 0xA2B7 + index * 4 + { + LATTE_BITFIELD_FULL_TYPED(BUFFER_OFFSET, uint32); + }; + + struct LATTE_VGT_STRMOUT_BASE_OFFSET_X : LATTEREG // 0xA2C4-0xA2C7 + { + LATTE_BITFIELD_FULL_TYPED(BASE_OFFSET, uint32); + }; + + struct LATTE_VGT_STRMOUT_BUFFER_EN : LATTEREG // 0xA2C8 + { + LATTE_BITFIELD_BOOL(BUFFER_ENABLE_0, 0); + LATTE_BITFIELD_BOOL(BUFFER_ENABLE_1, 1); + LATTE_BITFIELD_BOOL(BUFFER_ENABLE_2, 2); + LATTE_BITFIELD_BOOL(BUFFER_ENABLE_3, 3); + }; + struct LATTE_PA_SU_POLY_OFFSET_CLAMP : LATTEREG // 0xA37F { LATTE_BITFIELD_FLOAT(CLAMP); @@ -934,6 +1040,16 @@ float get_##__regname() const \ LATTE_BITFIELD_FLOAT(OFFSET); }; + struct LATTE_SQ_VTX_SEMANTIC_CLEAR : LATTEREG // 0xA238 + { + LATTE_BITFIELD_FULL_TYPED(CLEAR_MASK, uint32); // probably a bitmask + }; + + struct LATTE_SQ_VTX_SEMANTIC_X : LATTEREG // 0xA0E0 - 0xA0FF + { + LATTE_BITFIELD(SEMANTIC_ID, 0, 8); + }; + struct LATTE_SQ_TEX_RESOURCE_WORD0_N : LATTEREG // 0xE000 + index * 7 { LATTE_BITFIELD_TYPED(DIM, 0, 3, E_DIM); @@ -1154,6 +1270,65 @@ float get_##__regname() const \ LATTE_BITFIELD_TYPED(TYPE, 31, 1, E_SAMPLER_TYPE); }; + struct LATTE_SQ_PGM_START_X : LATTEREG // 0xA210 / 0xA216 / 0xA21B / 0xA220 / 0xA225 + { + LATTE_BITFIELD_FULL_TYPED(PGM_START, uint32); + }; + + struct LATTE_SQ_PGM_RESOURCES_PS : LATTEREG // 0xA214 + { + LATTE_BITFIELD(NUM_GPRS, 0, 8); + LATTE_BITFIELD(NUM_STACK_ENTRIES, 8, 8); + LATTE_BITFIELD_BOOL(DX10_CLAMP, 21); // if true, CLAMP modifier in shaders will return 0 for NaN + LATTE_BITFIELD(FETCH_CACHE_LINES, 24, 3); + LATTE_BITFIELD_BOOL(UNCACHED_FIRST_INST, 28); + LATTE_BITFIELD_BOOL(CLAMP_CONSTS, 31); + }; + + struct LATTE_SQ_PGM_RESOURCES_VS : LATTEREG // 0xA21A + { + LATTE_BITFIELD(NUM_GPRS, 0, 8); + LATTE_BITFIELD(NUM_STACK_ENTRIES, 8, 8); + LATTE_BITFIELD_BOOL(DX10_CLAMP, 21); // if true, CLAMP modifier in shaders will return 0 for NaN + LATTE_BITFIELD(FETCH_CACHE_LINES, 24, 3); + LATTE_BITFIELD_BOOL(UNCACHED_FIRST_INST, 28); + }; + + struct LATTE_SQ_PGM_RESOURCES_GS : LATTEREG // 0xA21F + { + LATTE_BITFIELD(NUM_GPRS, 0, 8); + LATTE_BITFIELD(NUM_STACK_ENTRIES, 8, 8); + LATTE_BITFIELD_BOOL(DX10_CLAMP, 21); // if true, CLAMP modifier in shaders will return 0 for NaN + }; + + struct LATTE_SQ_PGM_RESOURCES_ES : LATTEREG // 0xA224 + { + LATTE_BITFIELD(NUM_GPRS, 0, 8); + LATTE_BITFIELD(NUM_STACK_ENTRIES, 8, 8); + LATTE_BITFIELD_BOOL(DX10_CLAMP, 21); // if true, CLAMP modifier in shaders will return 0 for NaN + }; + + struct LATTE_SQ_PGM_RESOURCES_FS : LATTEREG // 0xA229 + { + LATTE_BITFIELD(NUM_GPRS, 0, 8); + LATTE_BITFIELD(NUM_STACK_ENTRIES, 8, 8); + LATTE_BITFIELD_BOOL(DX10_CLAMP, 21); // if true, CLAMP modifier in shaders will return 0 for NaN + }; + + struct LATTE_SQ_XX_ITEMSIZE : LATTEREG // 0xA227 - 0xA2XX + { + // used by: + // SQ_ESGS_RING_ITEMSIZE + // SQ_GSVS_RING_ITEMSIZE + // SQ_ESTMP_RING_ITEMSIZE + // SQ_GSTMP_RING_ITEMSIZE + // SQ_VSTMP_RING_ITEMSIZE + // SQ_PSTMP_RING_ITEMSIZE + // SQ_FBUF_RING_ITEMSIZE + // SQ_REDUC_RING_ITEMSIZE + LATTE_BITFIELD(ITEMSIZE, 0, 15); + }; + struct LATTE_PA_SU_SC_MODE_CNTL : LATTEREG // 0xA205 { enum class E_FRONTFACE @@ -1185,7 +1360,32 @@ float get_##__regname() const \ LATTE_BITFIELD_BOOL(OFFSET_PARA_ENABLED, 13); // offset enable for lines and points? // additional fields? }; -} + + struct LATTE_SPI_VS_OUT_CONFIG : LATTEREG // 0xA1B1 + { + LATTE_BITFIELD_BOOL(VS_PER_COMPONENT, 0); + LATTE_BITFIELD(VS_EXPORT_COUNT, 1, 5); + LATTE_BITFIELD_BOOL(EXPORTS_FOG, 8); + LATTE_BITFIELD(VS_OUT_FOG_VEC_ADDR, 9, 5); + }; + + struct LATTE_SPI_VS_OUT_ID_N : LATTEREG // 0xA185 - 0xA18E(?) - 0xA1B2 - 0xA1B3 + { + uint8 get_SEMANTIC(sint32 index) + { + cemu_assert_debug(index < 4); + return (uint8)((v >> (index * 8)) & 0xFF); + } + + void set_SEMANTIC(sint32 index, uint8 value) + { + cemu_assert_debug(index < 4); + v &= ~(0xFF << (index * 8)); + v |= (value & 0xFF) << (index * 8); + } + }; + +}; struct _LatteRegisterSetTextureUnit { @@ -1219,6 +1419,16 @@ struct _LatteRegisterSetSamplerBorderColor static_assert(sizeof(_LatteRegisterSetSamplerBorderColor) == 16); +struct _LatteRegisterSetStreamoutBuffer +{ + Latte::LATTE_VGT_STRMOUT_BUFFER_SIZE_X SIZE; + Latte::LATTE_VGT_STRMOUT_STRIDE_X STRIDE; + Latte::LATTE_VGT_STRMOUT_BUFFER_BASE_X BASE; + Latte::LATTE_VGT_STRMOUT_BUFFER_OFFSET_X BUFFER_OFFSET; +}; + +static_assert(sizeof(_LatteRegisterSetStreamoutBuffer) == 16); + struct LatteContextRegister { uint8 padding0[0x08958]; @@ -1235,7 +1445,9 @@ struct LatteContextRegister uint8 padding_2823C[4]; /* +0x28240 */ Latte::LATTE_PA_SC_GENERIC_SCISSOR_TL PA_SC_GENERIC_SCISSOR_TL; /* +0x28244 */ Latte::LATTE_PA_SC_GENERIC_SCISSOR_BR PA_SC_GENERIC_SCISSOR_BR; - uint8 padding_28248[0x2840C - 0x28248]; + uint8 padding_28248[0x28380 - 0x28248]; + /* +0x28380 */ Latte::LATTE_SQ_VTX_SEMANTIC_X SQ_VTX_SEMANTIC_X[32]; + /* +0x28400 */ uint8 padding_28400[0x2840C - 0x28400]; /* +0x2840C */ Latte::LATTE_VGT_MULTI_PRIM_IB_RESET_INDX VGT_MULTI_PRIM_IB_RESET_INDX; /* +0x28410 */ Latte::LATTE_SX_ALPHA_TEST_CONTROL SX_ALPHA_TEST_CONTROL; /* +0x28414 */ Latte::LATTE_CB_BLEND_RED CB_BLEND_RED; @@ -1253,7 +1465,15 @@ struct LatteContextRegister /* +0x2844C */ Latte::LATTE_PA_CL_VPORT_ZSCALE PA_CL_VPORT_ZSCALE; /* +0x28450 */ Latte::LATTE_PA_CL_VPORT_ZOFFSET PA_CL_VPORT_ZOFFSET; - uint8 padding_28450[0x28780 - 0x28454]; + uint8 padding_28450[0x28614 - 0x28454]; + + /* +0x28614 */ Latte::LATTE_SPI_VS_OUT_ID_N LATTE_SPI_VS_OUT_ID_N[10]; + + uint8 padding_2863C[0x286C4 - 0x2863C]; + + /* +0x286C4 */ Latte::LATTE_SPI_VS_OUT_CONFIG SPI_VS_OUT_CONFIG; + + uint8 padding_286C8[0x28780 - 0x286C8]; /* +0x28780 */ Latte::LATTE_CB_BLENDN_CONTROL CB_BLENDN_CONTROL[8]; @@ -1266,9 +1486,44 @@ struct LatteContextRegister /* +0x28810 */ Latte::LATTE_PA_CL_CLIP_CNTL PA_CL_CLIP_CNTL; /* +0x28814 */ Latte::LATTE_PA_SU_SC_MODE_CNTL PA_SU_SC_MODE_CNTL; /* +0x28818 */ Latte::LATTE_PA_CL_VTE_CNTL PA_CL_VTE_CNTL; + /* +0x2881C */ Latte::LATTE_PA_CL_VS_OUT_CNTL PA_CL_VS_OUT_CNTL; - uint8 padding_2881C[0x28A00 - 0x2881C]; + uint8 padding_2881C[0x28840 - 0x28820]; + /* +0x28840 */ Latte::LATTE_SQ_PGM_START_X SQ_PGM_START_PS; + /* +0x28844 */ uint32 ukn28844; // PS size + /* +0x28848 */ uint32 ukn28848; + /* +0x2884C */ uint32 ukn2884C; + /* +0x28850 */ Latte::LATTE_SQ_PGM_RESOURCES_PS SQ_PGM_RESOURCES_PS; + /* +0x28854 */ uint32 ukn28854; // SQ_PGM_EXPORTS_PS + /* +0x28858 */ Latte::LATTE_SQ_PGM_START_X SQ_PGM_START_VS; + /* +0x2885C */ uint32 ukn2885C; // VS size + /* +0x28860 */ uint32 ukn28860; + /* +0x28864 */ uint32 ukn28864; + /* +0x28868 */ Latte::LATTE_SQ_PGM_RESOURCES_VS SQ_PGM_RESOURCES_VS; + /* +0x2886C */ Latte::LATTE_SQ_PGM_START_X SQ_PGM_START_GS; + /* +0x28870 */ uint32 ukn28870; // GS size + /* +0x28874 */ uint32 ukn28874; + /* +0x28878 */ uint32 ukn28878; + /* +0x2887C */ Latte::LATTE_SQ_PGM_RESOURCES_GS SQ_PGM_RESOURCES_GS; + /* +0x28880 */ Latte::LATTE_SQ_PGM_START_X SQ_PGM_START_ES; + /* +0x28884 */ uint32 ukn28884; // ES size + /* +0x28888 */ uint32 ukn28888; + /* +0x2888C */ uint32 ukn2888C; + /* +0x28890 */ Latte::LATTE_SQ_PGM_RESOURCES_ES SQ_PGM_RESOURCES_ES; + /* +0x28894 */ Latte::LATTE_SQ_PGM_START_X SQ_PGM_START_FS; + /* +0x28898 */ uint32 ukn28898; // FS size + /* +0x2889C */ uint32 ukn2889C; + /* +0x288A0 */ uint32 ukn288A0; + /* +0x288A4 */ Latte::LATTE_SQ_PGM_RESOURCES_FS SQ_PGM_RESOURCES_FS; + /* +0x288A8 */ Latte::LATTE_SQ_XX_ITEMSIZE SQ_ESGS_RING_ITEMSIZE; + /* +0x288AC */ Latte::LATTE_SQ_XX_ITEMSIZE SQ_GSVS_RING_ITEMSIZE; + /* +0x288B0 */ Latte::LATTE_SQ_XX_ITEMSIZE SQ_ESTMP_RING_ITEMSIZE; + /* +0x288B4 */ Latte::LATTE_SQ_XX_ITEMSIZE SQ_GSTMP_RING_ITEMSIZE; + /* +0x288B8 */ Latte::LATTE_SQ_XX_ITEMSIZE SQ_VSTMP_RING_ITEMSIZE; + uint8 padding_288BC[0x288E0 - 0x288BC]; + /* +0x288E0 */ Latte::LATTE_SQ_VTX_SEMANTIC_CLEAR SQ_VTX_SEMANTIC_CLEAR; + uint8 padding_288E4[0x28A00 - 0x288E4]; /* +0x28A00 */ Latte::LATTE_PA_SU_POINT_SIZE PA_SU_POINT_SIZE; /* +0x28A04 */ Latte::LATTE_PA_SU_POINT_MINMAX PA_SU_POINT_MINMAX; @@ -1279,8 +1534,24 @@ struct LatteContextRegister uint8 padding_28A44[0x28A7C - 0x28A44]; /* +0x28A7C */ Latte::LATTE_VGT_DMA_INDEX_TYPE VGT_DMA_INDEX_TYPE; + /* +0x28A80 */ uint32 ukn28A80; + /* +0x28A84 */ Latte::LATTE_VGT_PRIMITIVEID_EN VGT_PRIMITIVEID_EN; + /* +0x28A88 */ uint32 ukn28A88; + /* +0x28A8C */ uint32 ukn28A8C; + /* +0x28A90 */ uint32 ukn28A90; + /* +0x28A94 */ Latte::LATTE_VGT_MULTI_PRIM_IB_RESET_EN VGT_MULTI_PRIM_IB_RESET_EN; + /* +0x28A98 */ uint32 ukn28A98; + /* +0x28A9C */ uint32 ukn28A9C; + /* +0x28AA0 */ Latte::LATTE_VGT_INSTANCE_STEP_RATE_X VGT_INSTANCE_STEP_RATE_0; + /* +0x28AA4 */ Latte::LATTE_VGT_INSTANCE_STEP_RATE_X VGT_INSTANCE_STEP_RATE_1; - uint8 padding_28A80[0x28DFC - 0x28A80]; + uint8 padding_28AA8[0x28AD0 - 0x28AA8]; + + /* +0x28AD0 */ _LatteRegisterSetStreamoutBuffer VGT_STRMOUT_BUFFER_X[4]; + /* +0x28B10 */ Latte::LATTE_VGT_STRMOUT_BASE_OFFSET_X VGT_STRMOUT_BASE_OFFSET_X[4]; + /* +0x28B20 */ Latte::LATTE_VGT_STRMOUT_BUFFER_EN VGT_STRMOUT_BUFFER_EN; + + uint8 padding_28B24[0x28DFC - 0x28B24]; /* +0x28DFC */ Latte::LATTE_PA_SU_POLY_OFFSET_CLAMP PA_SU_POLY_OFFSET_CLAMP; /* +0x28E00 */ Latte::LATTE_PA_SU_POLY_OFFSET_FRONT_SCALE PA_SU_POLY_OFFSET_FRONT_SCALE; @@ -1334,6 +1605,13 @@ static_assert(offsetof(LatteContextRegister, CB_TARGET_MASK) == Latte::REGADDR:: static_assert(offsetof(LatteContextRegister, PA_SC_GENERIC_SCISSOR_TL) == Latte::REGADDR::PA_SC_GENERIC_SCISSOR_TL * 4); static_assert(offsetof(LatteContextRegister, PA_SC_GENERIC_SCISSOR_BR) == Latte::REGADDR::PA_SC_GENERIC_SCISSOR_BR * 4); static_assert(offsetof(LatteContextRegister, VGT_MULTI_PRIM_IB_RESET_INDX) == Latte::REGADDR::VGT_MULTI_PRIM_IB_RESET_INDX * 4); +static_assert(offsetof(LatteContextRegister, VGT_PRIMITIVEID_EN) == Latte::REGADDR::VGT_PRIMITIVEID_EN * 4); +static_assert(offsetof(LatteContextRegister, VGT_MULTI_PRIM_IB_RESET_EN) == Latte::REGADDR::VGT_MULTI_PRIM_IB_RESET_EN * 4); +static_assert(offsetof(LatteContextRegister, VGT_INSTANCE_STEP_RATE_0) == Latte::REGADDR::VGT_INSTANCE_STEP_RATE_0 * 4); +static_assert(offsetof(LatteContextRegister, VGT_INSTANCE_STEP_RATE_1) == Latte::REGADDR::VGT_INSTANCE_STEP_RATE_1 * 4); +static_assert(offsetof(LatteContextRegister, VGT_STRMOUT_BUFFER_X) == Latte::REGADDR::VGT_STRMOUT_BUFFER_SIZE_0 * 4); +static_assert(offsetof(LatteContextRegister, VGT_STRMOUT_BASE_OFFSET_X) == Latte::REGADDR::VGT_STRMOUT_BASE_OFFSET_0 * 4); +static_assert(offsetof(LatteContextRegister, VGT_STRMOUT_BUFFER_EN) == Latte::REGADDR::VGT_STRMOUT_BUFFER_EN * 4); static_assert(offsetof(LatteContextRegister, SX_ALPHA_TEST_CONTROL) == Latte::REGADDR::SX_ALPHA_TEST_CONTROL * 4); static_assert(offsetof(LatteContextRegister, DB_STENCILREFMASK) == Latte::REGADDR::DB_STENCILREFMASK * 4); static_assert(offsetof(LatteContextRegister, DB_STENCILREFMASK_BF) == Latte::REGADDR::DB_STENCILREFMASK_BF * 4); @@ -1351,6 +1629,7 @@ static_assert(offsetof(LatteContextRegister, PA_CL_VPORT_ZOFFSET) == Latte::REGA static_assert(offsetof(LatteContextRegister, PA_CL_CLIP_CNTL) == Latte::REGADDR::PA_CL_CLIP_CNTL * 4); static_assert(offsetof(LatteContextRegister, PA_SU_SC_MODE_CNTL) == Latte::REGADDR::PA_SU_SC_MODE_CNTL * 4); static_assert(offsetof(LatteContextRegister, PA_CL_VTE_CNTL) == Latte::REGADDR::PA_CL_VTE_CNTL * 4); +static_assert(offsetof(LatteContextRegister, PA_CL_VS_OUT_CNTL) == Latte::REGADDR::PA_CL_VS_OUT_CNTL * 4); static_assert(offsetof(LatteContextRegister, PA_SU_POINT_SIZE) == Latte::REGADDR::PA_SU_POINT_SIZE * 4); static_assert(offsetof(LatteContextRegister, PA_SU_POINT_MINMAX) == Latte::REGADDR::PA_SU_POINT_MINMAX * 4); static_assert(offsetof(LatteContextRegister, CB_BLENDN_CONTROL) == Latte::REGADDR::CB_BLEND0_CONTROL * 4); @@ -1363,7 +1642,21 @@ static_assert(offsetof(LatteContextRegister, PA_SU_POLY_OFFSET_FRONT_SCALE) == L static_assert(offsetof(LatteContextRegister, PA_SU_POLY_OFFSET_FRONT_OFFSET) == Latte::REGADDR::PA_SU_POLY_OFFSET_FRONT_OFFSET * 4); static_assert(offsetof(LatteContextRegister, PA_SU_POLY_OFFSET_BACK_SCALE) == Latte::REGADDR::PA_SU_POLY_OFFSET_BACK_SCALE * 4); static_assert(offsetof(LatteContextRegister, PA_SU_POLY_OFFSET_BACK_OFFSET) == Latte::REGADDR::PA_SU_POLY_OFFSET_BACK_OFFSET * 4); +static_assert(offsetof(LatteContextRegister, SQ_VTX_SEMANTIC_X) == Latte::REGADDR::SQ_VTX_SEMANTIC_0 * 4); +static_assert(offsetof(LatteContextRegister, SQ_VTX_SEMANTIC_CLEAR) == Latte::REGADDR::SQ_VTX_SEMANTIC_CLEAR * 4); static_assert(offsetof(LatteContextRegister, SQ_TEX_START_PS) == Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_PS * 4); static_assert(offsetof(LatteContextRegister, SQ_TEX_START_VS) == Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_VS * 4); static_assert(offsetof(LatteContextRegister, SQ_TEX_START_GS) == Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_GS * 4); static_assert(offsetof(LatteContextRegister, SQ_TEX_SAMPLER) == Latte::REGADDR::SQ_TEX_SAMPLER_WORD0_0 * 4); +static_assert(offsetof(LatteContextRegister, SQ_PGM_START_PS) == Latte::REGADDR::SQ_PGM_START_PS * 4); +static_assert(offsetof(LatteContextRegister, SQ_PGM_RESOURCES_PS) == Latte::REGADDR::SQ_PGM_RESOURCES_PS * 4); +static_assert(offsetof(LatteContextRegister, SQ_PGM_START_VS) == Latte::REGADDR::SQ_PGM_START_VS * 4); +static_assert(offsetof(LatteContextRegister, SQ_PGM_RESOURCES_VS) == Latte::REGADDR::SQ_PGM_RESOURCES_VS * 4); +static_assert(offsetof(LatteContextRegister, SQ_PGM_START_FS) == Latte::REGADDR::SQ_PGM_START_FS * 4); +static_assert(offsetof(LatteContextRegister, SQ_PGM_RESOURCES_FS) == Latte::REGADDR::SQ_PGM_RESOURCES_FS * 4); +static_assert(offsetof(LatteContextRegister, SQ_PGM_START_ES) == Latte::REGADDR::SQ_PGM_START_ES * 4); +static_assert(offsetof(LatteContextRegister, SQ_PGM_RESOURCES_ES) == Latte::REGADDR::SQ_PGM_RESOURCES_ES * 4); +static_assert(offsetof(LatteContextRegister, SQ_PGM_START_GS) == Latte::REGADDR::SQ_PGM_START_GS * 4); +static_assert(offsetof(LatteContextRegister, SQ_PGM_RESOURCES_GS) == Latte::REGADDR::SQ_PGM_RESOURCES_GS * 4); +static_assert(offsetof(LatteContextRegister, SPI_VS_OUT_CONFIG) == Latte::REGADDR::SPI_VS_OUT_CONFIG * 4); +static_assert(offsetof(LatteContextRegister, LATTE_SPI_VS_OUT_ID_N) == Latte::REGADDR::SPI_VS_OUT_ID_0 * 4); \ No newline at end of file diff --git a/src/Cafe/HW/Latte/ISA/RegDefines.h b/src/Cafe/HW/Latte/ISA/RegDefines.h index d28739c0..b99c2126 100644 --- a/src/Cafe/HW/Latte/ISA/RegDefines.h +++ b/src/Cafe/HW/Latte/ISA/RegDefines.h @@ -50,8 +50,6 @@ #define mmVGT_PRIMITIVEID_EN 0xA2A1 #define mmVGT_VTX_CNT_EN 0xA2AE #define mmVGT_REUSE_OFF 0xA2AD -#define mmVGT_INSTANCE_STEP_RATE_0 0xA2A8 -#define mmVGT_INSTANCE_STEP_RATE_1 0xA2A9 #define mmVGT_MAX_VTX_INDX 0xA100 #define mmVGT_MIN_VTX_INDX 0xA101 #define mmVGT_INDX_OFFSET 0xA102 diff --git a/src/Cafe/OS/libs/gx2/GX2.cpp b/src/Cafe/OS/libs/gx2/GX2.cpp index 5a1f5520..8c3fbc64 100644 --- a/src/Cafe/OS/libs/gx2/GX2.cpp +++ b/src/Cafe/OS/libs/gx2/GX2.cpp @@ -396,12 +396,7 @@ void gx2_load() osLib_addFunction("gx2", "GX2GetCurrentScanBuffer", gx2Export_GX2GetCurrentScanBuffer); // shader stuff - osLib_addFunction("gx2", "GX2GetVertexShaderGPRs", gx2Export_GX2GetVertexShaderGPRs); - osLib_addFunction("gx2", "GX2GetVertexShaderStackEntries", gx2Export_GX2GetVertexShaderStackEntries); - osLib_addFunction("gx2", "GX2GetPixelShaderGPRs", gx2Export_GX2GetPixelShaderGPRs); - osLib_addFunction("gx2", "GX2GetPixelShaderStackEntries", gx2Export_GX2GetPixelShaderStackEntries); - osLib_addFunction("gx2", "GX2SetFetchShader", gx2Export_GX2SetFetchShader); - osLib_addFunction("gx2", "GX2SetVertexShader", gx2Export_GX2SetVertexShader); + //osLib_addFunction("gx2", "GX2SetVertexShader", gx2Export_GX2SetVertexShader); osLib_addFunction("gx2", "GX2SetPixelShader", gx2Export_GX2SetPixelShader); osLib_addFunction("gx2", "GX2SetGeometryShader", gx2Export_GX2SetGeometryShader); osLib_addFunction("gx2", "GX2SetComputeShader", gx2Export_GX2SetComputeShader); diff --git a/src/Cafe/OS/libs/gx2/GX2.h b/src/Cafe/OS/libs/gx2/GX2.h index c9607ee4..b8a3f919 100644 --- a/src/Cafe/OS/libs/gx2/GX2.h +++ b/src/Cafe/OS/libs/gx2/GX2.h @@ -20,12 +20,6 @@ void gx2_load(); // shader -void gx2Export_GX2SetFetchShader(PPCInterpreter_t* hCPU); -void gx2Export_GX2GetVertexShaderGPRs(PPCInterpreter_t* hCPU); -void gx2Export_GX2GetVertexShaderStackEntries(PPCInterpreter_t* hCPU); -void gx2Export_GX2GetPixelShaderGPRs(PPCInterpreter_t* hCPU); -void gx2Export_GX2GetPixelShaderStackEntries(PPCInterpreter_t* hCPU); -void gx2Export_GX2SetVertexShader(PPCInterpreter_t* hCPU); void gx2Export_GX2SetPixelShader(PPCInterpreter_t* hCPU); void gx2Export_GX2SetGeometryShader(PPCInterpreter_t* hCPU); void gx2Export_GX2SetComputeShader(PPCInterpreter_t* hCPU); diff --git a/src/Cafe/OS/libs/gx2/GX2_Command.cpp b/src/Cafe/OS/libs/gx2/GX2_Command.cpp index 8d584190..6da19741 100644 --- a/src/Cafe/OS/libs/gx2/GX2_Command.cpp +++ b/src/Cafe/OS/libs/gx2/GX2_Command.cpp @@ -263,7 +263,7 @@ namespace GX2 if (patchType == GX2_PATCH_TYPE::VERTEX_SHADER) { - GX2VertexShader_t* vertexShader = (GX2VertexShader_t*)obj; + GX2VertexShader* vertexShader = (GX2VertexShader*)obj; displayData[patchOffset / 4 + 2] = memory_virtualToPhysical(vertexShader->GetProgramAddr()) >> 8; } else if (patchType == GX2_PATCH_TYPE::PIXEL_SHADER) @@ -273,7 +273,7 @@ namespace GX2 } else if (patchType == GX2_PATCH_TYPE::FETCH_SHADER) { - GX2FetchShader_t* fetchShader = (GX2FetchShader_t*)obj; + GX2FetchShader* fetchShader = (GX2FetchShader*)obj; displayData[patchOffset / 4 + 2] = memory_virtualToPhysical(fetchShader->GetProgramAddr()) >> 8; } else if (patchType == GX2_PATCH_TYPE::GEOMETRY_COPY_SHADER) diff --git a/src/Cafe/OS/libs/gx2/GX2_Shader.cpp b/src/Cafe/OS/libs/gx2/GX2_Shader.cpp index c63688eb..ad17dc49 100644 --- a/src/Cafe/OS/libs/gx2/GX2_Shader.cpp +++ b/src/Cafe/OS/libs/gx2/GX2_Shader.cpp @@ -3,6 +3,7 @@ #include "GX2_Shader.h" #include "Cafe/HW/Latte/Core/LatteConst.h" #include "Cafe/HW/Latte/Core/LattePM4.h" +#include "Cafe/HW/Latte/ISA/LatteReg.h" #include "Cafe/HW/Latte/ISA/LatteInstructions.h" uint32 memory_getVirtualOffsetFromPointer(void* ptr); // remove once we updated everything to MEMPTR @@ -70,9 +71,9 @@ namespace GX2 static_assert(sizeof(betype) == 0x4); // calculate size of CF program subpart, includes alignment padding for clause instructions - size_t _calcFetchShaderCFCodeSize(uint32 attributeCount, GX2FetchShader_t::FetchShaderType fetchShaderType, uint32 tessellationMode) + size_t _calcFetchShaderCFCodeSize(uint32 attributeCount, GX2FetchShader::FetchShaderType fetchShaderType, uint32 tessellationMode) { - cemu_assert_debug(fetchShaderType == GX2FetchShader_t::FetchShaderType::NO_TESSELATION); + cemu_assert_debug(fetchShaderType == GX2FetchShader::FetchShaderType::NO_TESSELATION); cemu_assert_debug(tessellationMode == 0); uint32 numCFInstructions = ((attributeCount + 15) / 16) + 1; // one VTX clause can have up to 16 instructions + final CF instruction is RETURN size_t cfSize = numCFInstructions * 8; @@ -80,16 +81,16 @@ namespace GX2 return cfSize; } - size_t _calcFetchShaderClauseCodeSize(uint32 attributeCount, GX2FetchShader_t::FetchShaderType fetchShaderType, uint32 tessellationMode) + size_t _calcFetchShaderClauseCodeSize(uint32 attributeCount, GX2FetchShader::FetchShaderType fetchShaderType, uint32 tessellationMode) { - cemu_assert_debug(fetchShaderType == GX2FetchShader_t::FetchShaderType::NO_TESSELATION); + cemu_assert_debug(fetchShaderType == GX2FetchShader::FetchShaderType::NO_TESSELATION); cemu_assert_debug(tessellationMode == 0); uint32 numClauseInstructions = attributeCount; size_t clauseSize = numClauseInstructions * 16; return clauseSize; } - void _writeFetchShaderCFCode(void* programBufferOut, uint32 attributeCount, GX2FetchShader_t::FetchShaderType fetchShaderType, uint32 tessellationMode) + void _writeFetchShaderCFCode(void* programBufferOut, uint32 attributeCount, GX2FetchShader::FetchShaderType fetchShaderType, uint32 tessellationMode) { LatteCFInstruction* cfInstructionWriter = (LatteCFInstruction*)programBufferOut; uint32 attributeIndex = 0; @@ -111,7 +112,7 @@ namespace GX2 memcpy(cfInstructionWriter, &returnInstr, sizeof(LatteCFInstruction)); } - void _writeFetchShaderVTXCode(GX2FetchShader_t* fetchShader, void* programOut, uint32 attributeCount, GX2AttribDescription* attributeDescription, GX2FetchShader_t::FetchShaderType fetchShaderType, uint32 tessellationMode) + void _writeFetchShaderVTXCode(GX2FetchShader* fetchShader, void* programOut, uint32 attributeCount, GX2AttribDescription* attributeDescription, GX2FetchShader::FetchShaderType fetchShaderType, uint32 tessellationMode) { uint8* writePtr = (uint8*)programOut; // one instruction per attribute (hardcoded into _writeFetchShaderCFCode) @@ -151,7 +152,7 @@ namespace GX2 bool divisorFound = false; for (uint32 i = 0; i < numDivisors; i++) { - if (_swapEndianU32(fetchShader->divisors[i]) == attrAluDivisor) + if (fetchShader->divisors[i] == attrAluDivisor) { srcSelX = i != 0 ? 2 : 1; divisorFound = true; @@ -168,7 +169,7 @@ namespace GX2 else { srcSelX = numDivisors != 0 ? 2 : 1; - fetchShader->divisors[numDivisors] = _swapEndianU32(attrAluDivisor); + fetchShader->divisors[numDivisors] = attrAluDivisor; numDivisors++; fetchShader->divisorCount = _swapEndianU32(numDivisors); } @@ -213,9 +214,9 @@ namespace GX2 } } - uint32 GX2CalcFetchShaderSizeEx(uint32 attributeCount, GX2FetchShader_t::FetchShaderType fetchShaderType, uint32 tessellationMode) + uint32 GX2CalcFetchShaderSizeEx(uint32 attributeCount, GX2FetchShader::FetchShaderType fetchShaderType, uint32 tessellationMode) { - cemu_assert_debug(fetchShaderType == GX2FetchShader_t::FetchShaderType::NO_TESSELATION); // other types are todo + cemu_assert_debug(fetchShaderType == GX2FetchShader::FetchShaderType::NO_TESSELATION); // other types are todo cemu_assert_debug(tessellationMode == 0); // other modes are todo uint32 finalSize = @@ -225,9 +226,9 @@ namespace GX2 return finalSize; } - void GX2InitFetchShaderEx(GX2FetchShader_t* fetchShader, void* programBufferOut, uint32 attributeCount, GX2AttribDescription* attributeDescription, GX2FetchShader_t::FetchShaderType fetchShaderType, uint32 tessellationMode) + void GX2InitFetchShaderEx(GX2FetchShader* fetchShader, void* programBufferOut, uint32 attributeCount, GX2AttribDescription* attributeDescription, GX2FetchShader::FetchShaderType fetchShaderType, uint32 tessellationMode) { - cemu_assert_debug(fetchShaderType == GX2FetchShader_t::FetchShaderType::NO_TESSELATION); + cemu_assert_debug(fetchShaderType == GX2FetchShader::FetchShaderType::NO_TESSELATION); cemu_assert_debug(tessellationMode == 0); /* @@ -238,7 +239,7 @@ namespace GX2 [CLAUSES] */ - memset(fetchShader, 0x00, sizeof(GX2FetchShader_t)); + memset(fetchShader, 0x00, sizeof(GX2FetchShader)); fetchShader->attribCount = _swapEndianU32(attributeCount); fetchShader->shaderPtr = (MPTR)_swapEndianU32(memory_getVirtualOffsetFromPointer(programBufferOut)); @@ -251,14 +252,181 @@ namespace GX2 shaderOutput += _calcFetchShaderClauseCodeSize(attributeCount, fetchShaderType, tessellationMode); uint32 shaderSize = (uint32)(shaderOutput - shaderStart); - cemu_assert_debug(shaderSize == GX2CalcFetchShaderSizeEx(attributeCount, GX2FetchShader_t::FetchShaderType::NO_TESSELATION, tessellationMode)); + cemu_assert_debug(shaderSize == GX2CalcFetchShaderSizeEx(attributeCount, GX2FetchShader::FetchShaderType::NO_TESSELATION, tessellationMode)); fetchShader->shaderSize = _swapEndianU32((uint32)(shaderOutput - shaderStart)); + + fetchShader->reg_SQ_PGM_RESOURCES_FS = Latte::LATTE_SQ_PGM_RESOURCES_FS().set_NUM_GPRS(2); // todo - affected by tesselation params? + } + + uint32 GX2GetVertexShaderGPRs(GX2VertexShader* vertexShader) + { + return vertexShader->regs.SQ_PGM_RESOURCES_VS.value().get_NUM_GPRS(); + } + + uint32 GX2GetVertexShaderStackEntries(GX2VertexShader* vertexShader) + { + return vertexShader->regs.SQ_PGM_RESOURCES_VS.value().get_NUM_STACK_ENTRIES(); + } + + uint32 GX2GetPixelShaderGPRs(GX2PixelShader_t* pixelShader) + { + return _swapEndianU32(pixelShader->regs[0])&0xFF; + } + + uint32 GX2GetPixelShaderStackEntries(GX2PixelShader_t* pixelShader) + { + return (_swapEndianU32(pixelShader->regs[0]>>8))&0xFF; + } + + void GX2SetFetchShader(GX2FetchShader* fetchShaderPtr) + { + GX2ReserveCmdSpace(11); + cemu_assert_debug((_swapEndianU32(fetchShaderPtr->shaderPtr) & 0xFF) == 0); + + gx2WriteGather_submit( + // setup fetch shader + pm4HeaderType3(IT_SET_CONTEXT_REG, 1+5), + Latte::REGADDR::SQ_PGM_START_FS-0xA000, + _swapEndianU32(fetchShaderPtr->shaderPtr)>>8, + _swapEndianU32(fetchShaderPtr->shaderSize)>>3, + 0x10000, // ukn (ring buffer size?) + 0x10000, // ukn (ring buffer size?) + fetchShaderPtr->reg_SQ_PGM_RESOURCES_FS, + + // write instance step + pm4HeaderType3(IT_SET_CONTEXT_REG, 1+2), + Latte::REGADDR::VGT_INSTANCE_STEP_RATE_0-0xA000, + fetchShaderPtr->divisors[0], + fetchShaderPtr->divisors[1]); + } + + void GX2SetVertexShader(GX2VertexShader* vertexShader) + { + GX2ReserveCmdSpace(100); + + MPTR shaderProgramAddr; + uint32 shaderProgramSize; + if (vertexShader->shaderPtr) + { + // without R API + shaderProgramAddr = vertexShader->shaderPtr.GetMPTR(); + shaderProgramSize = vertexShader->shaderSize; + } + else + { + shaderProgramAddr = vertexShader->rBuffer.GetVirtualAddr(); + shaderProgramSize = vertexShader->rBuffer.GetSize(); + } + + cemu_assert_debug(shaderProgramAddr != 0); + cemu_assert_debug(shaderProgramSize != 0); + + if (vertexShader->shaderMode == GX2_SHADER_MODE::GEOMETRY_SHADER) + { + // in geometry shader mode the vertex shader is written to _ES register and almost all vs control registers are set by GX2SetGeometryShader + gx2WriteGather_submit( + pm4HeaderType3(IT_SET_CONTEXT_REG, 6), + Latte::REGADDR::SQ_PGM_START_ES-0xA000, + memory_virtualToPhysical(shaderProgramAddr)>>8, + shaderProgramSize>>3, + 0x100000, + 0x100000, + vertexShader->regs.SQ_PGM_RESOURCES_VS); // SQ_PGM_RESOURCES_VS/SQ_PGM_RESOURCES_ES + } + else + { + gx2WriteGather_submit( + /* vertex shader program */ + pm4HeaderType3(IT_SET_CONTEXT_REG, 6), + Latte::REGADDR::SQ_PGM_START_VS-0xA000, + memory_virtualToPhysical(shaderProgramAddr)>>8, // physical address + shaderProgramSize>>3, + 0x100000, + 0x100000, + vertexShader->regs.SQ_PGM_RESOURCES_VS, // SQ_PGM_RESOURCES_VS/ES + /* primitive id enable */ + pm4HeaderType3(IT_SET_CONTEXT_REG, 2), + Latte::REGADDR::VGT_PRIMITIVEID_EN-0xA000, + vertexShader->regs.VGT_PRIMITIVEID_EN, + /* output config */ + pm4HeaderType3(IT_SET_CONTEXT_REG, 2), + Latte::REGADDR::SPI_VS_OUT_CONFIG-0xA000, + vertexShader->regs.SPI_VS_OUT_CONFIG, + /* PA_CL_VS_OUT_CNTL */ + pm4HeaderType3(IT_SET_CONTEXT_REG, 2), + Latte::REGADDR::PA_CL_VS_OUT_CNTL-0xA000, + vertexShader->regs.PA_CL_VS_OUT_CNTL + ); + + cemu_assert_debug(vertexShader->regs.SPI_VS_OUT_CONFIG.value().get_VS_PER_COMPONENT() == false); // not handled on the GPU side + + uint32 numOutputIds = vertexShader->regs.vsOutIdTableSize; + numOutputIds = std::min(numOutputIds, 0xA); + gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numOutputIds)); + gx2WriteGather_submitU32AsBE(Latte::REGADDR::SPI_VS_OUT_ID_0-0xA000); + for(uint32 i=0; iregs.LATTE_SPI_VS_OUT_ID_N[i].value().getRawValue()); + + // todo: SQ_PGM_CF_OFFSET_VS + // todo: VGT_STRMOUT_BUFFER_EN + // stream out + if (vertexShader->usesStreamOut != 0) + { + // stride 0 + gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 2), + Latte::REGADDR::VGT_STRMOUT_VTX_STRIDE_0-0xA000, + vertexShader->streamOutVertexStride[0]>>2, + // stride 1 + pm4HeaderType3(IT_SET_CONTEXT_REG, 2), + Latte::REGADDR::VGT_STRMOUT_VTX_STRIDE_1-0xA000, + vertexShader->streamOutVertexStride[1]>>2, + // stride 2 + pm4HeaderType3(IT_SET_CONTEXT_REG, 2), + Latte::REGADDR::VGT_STRMOUT_VTX_STRIDE_2-0xA000, + vertexShader->streamOutVertexStride[2]>>2, + // stride 3 + pm4HeaderType3(IT_SET_CONTEXT_REG, 2), + Latte::REGADDR::VGT_STRMOUT_VTX_STRIDE_3-0xA000, + vertexShader->streamOutVertexStride[3]>>2); + } + } + // update semantic table + uint32 vsSemanticTableSize = vertexShader->regs.semanticTableSize; + if (vsSemanticTableSize > 0) + { + gx2WriteGather_submit( + pm4HeaderType3(IT_SET_CONTEXT_REG, 1+1), + Latte::REGADDR::SQ_VTX_SEMANTIC_CLEAR-0xA000, + 0xFFFFFFFF); + if (vsSemanticTableSize == 0) + { + gx2WriteGather_submit( + pm4HeaderType3(IT_SET_CONTEXT_REG, 1+1), + Latte::REGADDR::SQ_VTX_SEMANTIC_0-0xA000, + 0xFFFFFFFF); + } + else + { + uint32* vsSemanticTable = (uint32*)vertexShader->regs.SQ_VTX_SEMANTIC_N; + vsSemanticTableSize = std::min(vsSemanticTableSize, 32); + gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+vsSemanticTableSize)); + gx2WriteGather_submitU32AsBE(Latte::REGADDR::SQ_VTX_SEMANTIC_0-0xA000); + gx2WriteGather_submitU32AsLEArray(vsSemanticTable, vsSemanticTableSize); + } + } } void GX2ShaderInit() { cafeExportRegister("gx2", GX2CalcFetchShaderSizeEx, LogType::GX2); cafeExportRegister("gx2", GX2InitFetchShaderEx, LogType::GX2); + + cafeExportRegister("gx2", GX2GetVertexShaderGPRs, LogType::GX2); + cafeExportRegister("gx2", GX2GetVertexShaderStackEntries, LogType::GX2); + cafeExportRegister("gx2", GX2GetPixelShaderGPRs, LogType::GX2); + cafeExportRegister("gx2", GX2GetPixelShaderStackEntries, LogType::GX2); + cafeExportRegister("gx2", GX2SetFetchShader, LogType::GX2); + cafeExportRegister("gx2", GX2SetVertexShader, LogType::GX2); } } \ No newline at end of file diff --git a/src/Cafe/OS/libs/gx2/GX2_Shader.h b/src/Cafe/OS/libs/gx2/GX2_Shader.h index 1d1c79cc..960bdf95 100644 --- a/src/Cafe/OS/libs/gx2/GX2_Shader.h +++ b/src/Cafe/OS/libs/gx2/GX2_Shader.h @@ -2,7 +2,7 @@ #include "Cafe/HW/Latte/ISA/LatteReg.h" #include "GX2_Streamout.h" -struct GX2FetchShader_t +struct GX2FetchShader { enum class FetchShaderType : uint32 { @@ -10,12 +10,12 @@ struct GX2FetchShader_t }; /* +0x00 */ betype fetchShaderType; - /* +0x04 */ uint32 _regs[1]; + /* +0x04 */ betype reg_SQ_PGM_RESOURCES_FS; /* +0x08 */ uint32 shaderSize; /* +0x0C */ MPTR shaderPtr; /* +0x10 */ uint32 attribCount; /* +0x14 */ uint32 divisorCount; - /* +0x18 */ uint32 divisors[2]; + /* +0x18 */ uint32be divisors[2]; MPTR GetProgramAddr() const { @@ -23,8 +23,8 @@ struct GX2FetchShader_t } }; -static_assert(sizeof(GX2FetchShader_t) == 0x20); -static_assert(sizeof(betype) == 4); +static_assert(sizeof(GX2FetchShader) == 0x20); +static_assert(sizeof(betype) == 4); namespace GX2 { @@ -32,19 +32,43 @@ namespace GX2 void GX2ShaderInit(); } -// code below still needs to be modernized (use betype, enum classes) +// code below still needs to be modernized (use betype, enum classes, move to namespace) +// deprecated, use GX2_SHADER_MODE enum class instead #define GX2_SHADER_MODE_UNIFORM_REGISTER 0 #define GX2_SHADER_MODE_UNIFORM_BLOCK 1 #define GX2_SHADER_MODE_GEOMETRY_SHADER 2 #define GX2_SHADER_MODE_COMPUTE_SHADER 3 -struct GX2VertexShader_t +enum class GX2_SHADER_MODE : uint32 { - /* +0x000 */ uint32 regs[52]; - /* +0x0D0 */ uint32 shaderSize; - /* +0x0D4 */ MPTR shaderPtr; - /* +0x0D8 */ uint32 shaderMode; // GX2_SHADER_MODE_* + UNIFORM_REGISTER = 0, + UNIFORM_BLOCK = 1, + GEOMETRY_SHADER = 2, + COMPUTE_SHADER = 3, +}; + +struct GX2VertexShader +{ + /* +0x000 */ + struct + { + /* +0x00 */ betype SQ_PGM_RESOURCES_VS; // compatible with SQ_PGM_RESOURCES_ES + /* +0x04 */ betype VGT_PRIMITIVEID_EN; + /* +0x08 */ betype SPI_VS_OUT_CONFIG; + /* +0x0C */ uint32be vsOutIdTableSize; + /* +0x10 */ betype LATTE_SPI_VS_OUT_ID_N[10]; + /* +0x38 */ betype PA_CL_VS_OUT_CNTL; + /* +0x3C */ uint32be uknReg15; // ? + /* +0x40 */ uint32be semanticTableSize; + /* +0x44 */ betype SQ_VTX_SEMANTIC_N[32]; + /* +0xC4 */ uint32be uknReg49; // ? + /* +0xC8 */ uint32be uknReg50; // vgt_vertex_reuse_block_cntl + /* +0xCC */ uint32be uknReg51; // vgt_hos_reuse_depth + }regs; + /* +0x0D0 */ uint32be shaderSize; + /* +0x0D4 */ MEMPTR shaderPtr; + /* +0x0D8 */ betype shaderMode; /* +0x0DC */ uint32 uniformBlockCount; /* +0x0E0 */ MPTR uniformBlockInfo; /* +0x0E4 */ uint32 uniformVarCount; @@ -57,20 +81,20 @@ struct GX2VertexShader_t /* +0x100 */ MPTR samplerInfo; /* +0x104 */ uint32 attribCount; /* +0x108 */ MPTR attribInfo; - /* +0x10C */ uint32 ringItemsize; // for GS - /* +0x110 */ uint32 usesStreamOut; - /* +0x114 */ uint32 streamOutVertexStride[GX2_MAX_STREAMOUT_BUFFERS]; + /* +0x10C */ uint32be ringItemsize; // for GS + /* +0x110 */ uint32be usesStreamOut; + /* +0x114 */ uint32be streamOutVertexStride[GX2_MAX_STREAMOUT_BUFFERS]; /* +0x124 */ GX2RBuffer rBuffer; MPTR GetProgramAddr() const { - if (_swapEndianU32(this->shaderPtr) != MPTR_NULL) - return _swapEndianU32(this->shaderPtr); + if (this->shaderPtr) + return this->shaderPtr.GetMPTR(); return this->rBuffer.GetVirtualAddr(); } }; -static_assert(sizeof(GX2VertexShader_t) == 0x134); +static_assert(sizeof(GX2VertexShader) == 0x134); typedef struct _GX2PixelShader { diff --git a/src/Cafe/OS/libs/gx2/GX2_shader_legacy.cpp b/src/Cafe/OS/libs/gx2/GX2_shader_legacy.cpp index 845292fe..1cb61a7e 100644 --- a/src/Cafe/OS/libs/gx2/GX2_shader_legacy.cpp +++ b/src/Cafe/OS/libs/gx2/GX2_shader_legacy.cpp @@ -8,204 +8,6 @@ #include "GX2.h" #include "GX2_Shader.h" -void gx2Export_GX2SetFetchShader(PPCInterpreter_t* hCPU) -{ - cemuLog_log(LogType::GX2, "GX2SetFetchShader(0x{:08x})", hCPU->gpr[3]); - GX2ReserveCmdSpace(11); - GX2FetchShader_t* fetchShaderPtr = (GX2FetchShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]); - cemu_assert_debug((_swapEndianU32(fetchShaderPtr->shaderPtr) & 0xFF) == 0); - - gx2WriteGather_submit( - // setup fetch shader - pm4HeaderType3(IT_SET_CONTEXT_REG, 1+5), - mmSQ_PGM_START_FS-0xA000, - _swapEndianU32(fetchShaderPtr->shaderPtr)>>8, // pointer divided by 256 - _swapEndianU32(fetchShaderPtr->shaderSize)>>3, // size divided by 8 - 0x10000, // ukn (ring buffer size?) - 0x10000, // ukn (ring buffer size?) - *(uint32be*)&(fetchShaderPtr->_regs[0]), - - // write instance step - pm4HeaderType3(IT_SET_CONTEXT_REG, 1+2), - mmVGT_INSTANCE_STEP_RATE_0-0xA000, - *(uint32be*)&(fetchShaderPtr->divisors[0]), - *(uint32be*)&(fetchShaderPtr->divisors[1])); - - osLib_returnFromFunction(hCPU, 0); -} - -void gx2Export_GX2GetVertexShaderGPRs(PPCInterpreter_t* hCPU) -{ - cemuLog_log(LogType::GX2, "GX2GetVertexShaderGPRs(0x{:08x})", hCPU->gpr[3]); - GX2VertexShader_t* vertexShader = (GX2VertexShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]); - uint8 numGPRs = _swapEndianU32(vertexShader->regs[0])&0xFF; - osLib_returnFromFunction(hCPU, numGPRs); -} - -void gx2Export_GX2GetVertexShaderStackEntries(PPCInterpreter_t* hCPU) -{ - cemuLog_log(LogType::GX2, "GX2GetVertexShaderStackEntries(0x{:08x})", hCPU->gpr[3]); - GX2VertexShader_t* vertexShader = (GX2VertexShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]); - uint8 stackEntries = (_swapEndianU32(vertexShader->regs[0])>>8)&0xFF; - osLib_returnFromFunction(hCPU, stackEntries); -} - -void gx2Export_GX2GetPixelShaderGPRs(PPCInterpreter_t* hCPU) -{ - cemuLog_log(LogType::GX2, "GX2GetPixelShaderGPRs(0x{:08x})", hCPU->gpr[3]); - GX2PixelShader_t* pixelShader = (GX2PixelShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]); - uint8 stackEntries = (_swapEndianU32(pixelShader->regs[0]))&0xFF; - osLib_returnFromFunction(hCPU, stackEntries); -} - -void gx2Export_GX2GetPixelShaderStackEntries(PPCInterpreter_t* hCPU) -{ - cemuLog_log(LogType::GX2, "GX2GetPixelShaderStackEntries(0x{:08x})", hCPU->gpr[3]); - GX2PixelShader_t* pixelShader = (GX2PixelShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]); - uint8 numGPRs = (_swapEndianU32(pixelShader->regs[0]>>8))&0xFF; - osLib_returnFromFunction(hCPU, numGPRs); -} - -void gx2Export_GX2SetVertexShader(PPCInterpreter_t* hCPU) -{ - cemuLog_log(LogType::GX2, "GX2SetVertexShader(0x{:08x})", hCPU->gpr[3]); - GX2ReserveCmdSpace(100); - - GX2VertexShader_t* vertexShader = (GX2VertexShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]); - - MPTR shaderProgramAddr; - uint32 shaderProgramSize; - - if( _swapEndianU32(vertexShader->shaderPtr) != MPTR_NULL ) - { - // without R API - shaderProgramAddr = _swapEndianU32(vertexShader->shaderPtr); - shaderProgramSize = _swapEndianU32(vertexShader->shaderSize); - } - else - { - shaderProgramAddr = vertexShader->rBuffer.GetVirtualAddr(); - shaderProgramSize = vertexShader->rBuffer.GetSize(); - } - - cemu_assert_debug(shaderProgramAddr != 0); - cemu_assert_debug(shaderProgramSize != 0); - - if( _swapEndianU32(vertexShader->shaderMode) == GX2_SHADER_MODE_GEOMETRY_SHADER ) - { - // in geometry shader mode the vertex shader is written to _ES register and almost all vs control registers are set by GX2SetGeometryShader - gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 6)); - gx2WriteGather_submitU32AsBE(mmSQ_PGM_START_ES-0xA000); - gx2WriteGather_submitU32AsBE(memory_virtualToPhysical(shaderProgramAddr)>>8); - gx2WriteGather_submitU32AsBE(shaderProgramSize>>3); - gx2WriteGather_submitU32AsBE(0x100000); - gx2WriteGather_submitU32AsBE(0x100000); - gx2WriteGather_submitU32AsBE(_swapEndianU32(vertexShader->regs[0])); // unknown - } - else - { - gx2WriteGather_submit( - /* vertex shader program */ - pm4HeaderType3(IT_SET_CONTEXT_REG, 6), - mmSQ_PGM_START_VS-0xA000, - memory_virtualToPhysical(shaderProgramAddr)>>8, // physical address - shaderProgramSize>>3, // size - 0x100000, - 0x100000, - _swapEndianU32(vertexShader->regs[0]), // unknown - /* primitive id enable */ - pm4HeaderType3(IT_SET_CONTEXT_REG, 2), - mmVGT_PRIMITIVEID_EN-0xA000, - _swapEndianU32(vertexShader->regs[1]), - /* output config */ - pm4HeaderType3(IT_SET_CONTEXT_REG, 2), - mmSPI_VS_OUT_CONFIG-0xA000, - _swapEndianU32(vertexShader->regs[2])); - - if( (_swapEndianU32(vertexShader->regs[2]) & 1) != 0 ) - debugBreakpoint(); // per-component flag? - - // ukn - gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2)); - gx2WriteGather_submitU32AsBE(mmPA_CL_VS_OUT_CNTL-0xA000); - gx2WriteGather_submitU32AsBE(_swapEndianU32(vertexShader->regs[14])); - - uint32 numOutputIds = _swapEndianU32(vertexShader->regs[3]); - numOutputIds = std::min(numOutputIds, 0xA); - gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numOutputIds)); - gx2WriteGather_submitU32AsBE(mmSPI_VS_OUT_ID_0-0xA000); - for(uint32 i=0; iregs[4+i])); - } - - /* - VS _regs[]: - 0 ? - 1 mmVGT_PRIMITIVEID_EN (?) - 2 mmSPI_VS_OUT_CONFIG - 3 Number of used SPI_VS_OUT_ID_* entries - 4 - 13 SPI_VS_OUT_ID_0 - SPI_VS_OUT_ID_9 - 14 pa_cl_vs_out_cntl - ... - 17 - ?? semantic table entry (input) - - ... - 50 vgt_vertex_reuse_block_cntl - 51 vgt_hos_reuse_depth - */ - - // todo: mmSQ_PGM_CF_OFFSET_VS - // todo: mmVGT_STRMOUT_BUFFER_EN - // stream out - if( _swapEndianU32(vertexShader->usesStreamOut) != 0 ) - { - // stride 0 - gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2)); - gx2WriteGather_submitU32AsBE(mmVGT_STRMOUT_VTX_STRIDE_0-0xA000); - gx2WriteGather_submitU32AsBE(_swapEndianU32(vertexShader->streamOutVertexStride[0])>>2); - // stride 1 - gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2)); - gx2WriteGather_submitU32AsBE(mmVGT_STRMOUT_VTX_STRIDE_1-0xA000); - gx2WriteGather_submitU32AsBE(_swapEndianU32(vertexShader->streamOutVertexStride[1])>>2); - // stride 2 - gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2)); - gx2WriteGather_submitU32AsBE(mmVGT_STRMOUT_VTX_STRIDE_2-0xA000); - gx2WriteGather_submitU32AsBE(_swapEndianU32(vertexShader->streamOutVertexStride[2])>>2); - // stride 3 - gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2)); - gx2WriteGather_submitU32AsBE(mmVGT_STRMOUT_VTX_STRIDE_3-0xA000); - gx2WriteGather_submitU32AsBE(_swapEndianU32(vertexShader->streamOutVertexStride[3])>>2); - } - } - // update semantic table - uint32 vsSemanticTableSize = _swapEndianU32(vertexShader->regs[0x40/4]); - if( vsSemanticTableSize > 0 ) - { - gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+1)); - gx2WriteGather_submitU32AsBE(mmSQ_VTX_SEMANTIC_CLEAR-0xA000); - gx2WriteGather_submitU32AsBE(0xFFFFFFFF); - if( vsSemanticTableSize == 0 ) - { - // todo: Figure out how this is done on real SW/HW (some vertex shaders don't have a semantic table) - gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+1)); - gx2WriteGather_submitU32AsBE(mmSQ_VTX_SEMANTIC_0-0xA000); - gx2WriteGather_submitU32AsBE(0xFFFFFFFF); - } - else - { - uint32* vsSemanticTable = vertexShader->regs+(0x44/4); - vsSemanticTableSize = std::min(vsSemanticTableSize, 0x20); - gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+vsSemanticTableSize)); - gx2WriteGather_submitU32AsBE(mmSQ_VTX_SEMANTIC_0-0xA000); - for(uint32 i=0; igpr[3]); @@ -415,14 +217,14 @@ void gx2Export_GX2SetGeometryShader(PPCInterpreter_t* hCPU) osLib_returnFromFunction(hCPU, 0); } -struct GX2ComputeShader_t +struct GX2ComputeShader { /* +0x00 */ uint32be regs[12]; /* +0x30 */ uint32be programSize; /* +0x34 */ uint32be programPtr; - /* +0x38 */ uint32 ukn38; - /* +0x3C */ uint32 ukn3C; - /* +0x40 */ uint32 ukn40[8]; + /* +0x38 */ uint32be ukn38; + /* +0x3C */ uint32be ukn3C; + /* +0x40 */ uint32be ukn40[8]; /* +0x60 */ uint32be workgroupSizeX; /* +0x64 */ uint32be workgroupSizeY; /* +0x68 */ uint32be workgroupSizeZ; @@ -431,13 +233,13 @@ struct GX2ComputeShader_t /* +0x74 */ GX2RBuffer rBuffer; }; -static_assert(offsetof(GX2ComputeShader_t, programSize) == 0x30); -static_assert(offsetof(GX2ComputeShader_t, workgroupSizeX) == 0x60); -static_assert(offsetof(GX2ComputeShader_t, rBuffer) == 0x74); +static_assert(offsetof(GX2ComputeShader, programSize) == 0x30); +static_assert(offsetof(GX2ComputeShader, workgroupSizeX) == 0x60); +static_assert(offsetof(GX2ComputeShader, rBuffer) == 0x74); void gx2Export_GX2SetComputeShader(PPCInterpreter_t* hCPU) { - ppcDefineParamTypePtr(computeShader, GX2ComputeShader_t, 0); + ppcDefineParamTypePtr(computeShader, GX2ComputeShader, 0); cemuLog_log(LogType::GX2, "GX2SetComputeShader(0x{:08x})", hCPU->gpr[3]); MPTR shaderPtr;