From 8ce3f834c43f6cf87b03795f1ea328adc6acdc63 Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Sun, 12 Mar 2023 08:33:31 +0100 Subject: [PATCH] Latte: Optimize shader decompiler output --- .../LatteDecompilerAnalyzer.cpp | 96 +-- .../LatteDecompilerEmitGLSL.cpp | 767 +++++++++--------- .../LatteDecompilerInternal.h | 5 +- 3 files changed, 408 insertions(+), 460 deletions(-) diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index dd179ac0..d7a6eca6 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -76,75 +76,6 @@ void _remapUniformAccess(LatteDecompilerShaderContext* shaderContext, bool isReg list_uniformMapping.emplace_back(newMapping); } -/* - * Checks for register collisions and marks the instructions accordingly - * startIndex is the first instruction of the group - * endIndex is inclusive the last instruction of the same group - */ -void _analyzeALUInstructionGroupForRegisterCollision(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, sint32 startIndex, sint32 endIndex) -{ - uint8 registerChannelWriteMask[(LATTE_NUM_GPR *4+7)/8] = {0}; - - struct - { - uint8 gprIndex; - uint8 channel; - }registerBackupEntries[5]; - sint32 registerBackupCount = 0; - - for(sint32 i=startIndex; i<=endIndex; i++) - { - LatteDecompilerALUInstruction& aluInstruction = cfInstruction->instructionsALU[i]; - // ignore NOP instruction - if( aluInstruction.isOP3 == false && aluInstruction.opcode == ALU_OP2_INST_NOP ) - continue; - if( aluInstruction.destElem > 3 ) - debugBreakpoint(); - registerChannelWriteMask[(aluInstruction.destGpr * 4 + aluInstruction.destElem) / 8] |= (1 << ((aluInstruction.destGpr * 4 + aluInstruction.destElem) % 8)); - // check if any previously written register is read - for(sint32 f=0; f<3; f++) - { - if( GPU7_ALU_SRC_IS_GPR(aluInstruction.sourceOperand[f].sel) == false ) - continue; - sint32 gprIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction.sourceOperand[f].sel); - if( aluInstruction.sourceOperand[f].chan > 3 ) - debugBreakpoint(); - if( (registerChannelWriteMask[(gprIndex*4+aluInstruction.sourceOperand[f].chan)/8]&(1<<((gprIndex*4+aluInstruction.sourceOperand[f].chan)%8))) != 0 ) - { - // register is overwritten by same or previous instruction, mark register backup for this instruction - // check if this register already has a backup - bool hasBackup = false; - for(sint32 t=0; tinstructionsALU) { // ignore NOP instruction - if( aluInstruction.isOP3 == false && aluInstruction.opcode == ALU_OP2_INST_NOP ) + if( !aluInstruction.isOP3 && aluInstruction.opcode == ALU_OP2_INST_NOP ) continue; // check for CUBE instruction - if( aluInstruction.isOP3 == false && aluInstruction.opcode == ALU_OP2_INST_CUBE ) + if( !aluInstruction.isOP3 && aluInstruction.opcode == ALU_OP2_INST_CUBE ) { shaderContext->analyzer.hasRedcCUBE = true; } @@ -305,7 +236,7 @@ void LatteDecompiler_analyzeALUClause(LatteDecompilerShaderContext* shaderContex // relative register file accesses are tricky because the range of possible indices is unknown // worst case we have to load the full file (256 * 16 byte entries) - // but here we track all access indices so the analyzer can make guesstimates about the actual size when there are relative accesses + // by tracking the accessed base indices the shader analyzer can determine bounds for the potentially accessed ranges shaderContext->analyzer.uniformRegisterAccess = true; if (aluInstruction.sourceOperand[f].rel) @@ -355,30 +286,9 @@ void LatteDecompiler_analyzeALUClause(LatteDecompilerShaderContext* shaderContex } } if( aluInstruction.destRel != 0 ) - { shaderContext->analyzer.usesRelativeGPRWrite = true; - } shaderContext->analyzer.gprUseMask[aluInstruction.destGpr/8] |= (1<<(aluInstruction.destGpr%8)); } - // check for register collisions inside instruction groups (registers that are overwritten while being read) - sint32 currentGroupIndex = 0; - sint32 currentGroupStartIndex = 0; - for(uint32 i=0; iinstructionsALU.size(); i++) - { - LatteDecompilerALUInstruction& aluInstruction = cfInstruction->instructionsALU[i]; - if( aluInstruction.instructionGroupIndex != currentGroupIndex ) - { - cemu_assert_debug(i != 0); // first group cant end at first instruction - _analyzeALUInstructionGroupForRegisterCollision(shaderContext, cfInstruction, currentGroupStartIndex, i-1); - // start next group - currentGroupIndex = aluInstruction.instructionGroupIndex; - currentGroupStartIndex = i; - } - } - if( currentGroupStartIndex < (sint32)cfInstruction->instructionsALU.size() ) - { - _analyzeALUInstructionGroupForRegisterCollision(shaderContext, cfInstruction, currentGroupStartIndex, (uint32)cfInstruction->instructionsALU.size()-1); - } } // analyze TEX CF instruction and all instructions within the TEX clause diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitGLSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitGLSL.cpp index 1e1d853f..19c2cd87 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitGLSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitGLSL.cpp @@ -17,6 +17,7 @@ #include "util/helpers/StringBuf.h" #include +#include #define _CRLF "\r\n" @@ -180,7 +181,258 @@ static char* _getRegisterVarName(LatteDecompilerShaderContext* shaderContext, ui return tempStr; } -sint32 _getVertexShaderOutParamSemanticId(uint32* contextRegisters, sint32 index) // deprecated - moved to psInputTable +static void _appendRegisterTypeSuffix(StringBuf* src, sint32 dataType) +{ + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->add("i"); + else if (dataType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) + src->add("ui"); + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->add("f"); + else + cemu_assert_unimplemented(); +} + +// appends x/y/z/w +static void _appendChannel(StringBuf* src, sint32 channelIndex) +{ + cemu_assert_debug(channelIndex >= 0 && channelIndex <= 3); + switch (channelIndex) + { + case 0: + src->add("x"); + return; + case 1: + src->add("y"); + return; + case 2: + src->add("z"); + return; + case 3: + src->add("w"); + return; + } +} + +// appends .x/.y/.z/.w +static void _appendChannelAccess(StringBuf* src, sint32 channelIndex) +{ + cemu_assert_debug(channelIndex >= 0 && channelIndex <= 3); + switch (channelIndex) + { + case 0: + src->add(".x"); + return; + case 1: + src->add(".y"); + return; + case 2: + src->add(".z"); + return; + case 3: + src->add(".w"); + return; + } +} + +static void _appendPVPS(LatteDecompilerShaderContext* shaderContext, StringBuf* src, uint32 groupIndex, uint8 aluUnit) +{ + cemu_assert_debug(aluUnit < 5); + if (aluUnit == 4) + { + src->addFmt("PS{}", (groupIndex & 1)); + _appendRegisterTypeSuffix(src, shaderContext->typeTracker.defaultDataType); + return; + } + src->addFmt("PV{}", (groupIndex & 1)); + _appendRegisterTypeSuffix(src, shaderContext->typeTracker.defaultDataType); + _appendChannel(src, aluUnit); +} + +// tracks PV/PS and register backups +struct ALUClauseTemporariesState +{ + struct PVPSAlias + { + enum class LOCATION_TYPE : uint8 + { + LOCATION_NONE, + LOCATION_GPR, + LOCATION_PVPS, + }; + + LOCATION_TYPE location{ LOCATION_TYPE::LOCATION_NONE }; + uint8 index; // GPR index or temporary index + uint8 aluUnit; // x,y,z,w (or 5 for PS) + + void SetLocationGPR(uint8 gprIndex, uint8 channel) + { + cemu_assert_debug(channel < 4); + this->location = LOCATION_TYPE::LOCATION_GPR; + this->index = gprIndex; + this->aluUnit = channel; + } + + void SetLocationPSPVTemporary(uint8 aluUnit, uint32 groupIndex) + { + cemu_assert_debug(aluUnit < 5); + this->location = LOCATION_TYPE::LOCATION_PVPS; + this->index = groupIndex & 1; + this->aluUnit = aluUnit; + } + }; + + struct GPRTemporary + { + uint8 gprIndex; + uint8 channel; + uint8 backupVarIndex; + }; + + void TrackGroupOutputPVPS(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstr, size_t numInstr) + { + // unset current + for (auto& it : m_pvps) + it.location = PVPSAlias::LOCATION_TYPE::LOCATION_NONE; + for (size_t i = 0; i < numInstr; i++) + { + LatteDecompilerALUInstruction& inst = aluInstr[i]; + if (!inst.isOP3 && inst.opcode == ALU_OP2_INST_NOP) + continue; // skip NOP instruction + + if (inst.writeMask == 0) + { + // map to temporary + m_pvps[inst.aluUnit].SetLocationPSPVTemporary(inst.aluUnit, aluInstr->instructionGroupIndex); + } + else + { + // map to GPR + if(inst.destRel == 0) // is PV/PS set for indexed writes? + m_pvps[inst.aluUnit].SetLocationGPR(inst.destGpr, inst.destElem); + } + } + } + + bool HasPVPS(uint8 aluUnitIndex) const + { + cemu_assert_debug(aluUnitIndex < 5); + return m_pvps[aluUnitIndex].location != PVPSAlias::LOCATION_TYPE::LOCATION_NONE; + } + + void EmitPVPSAccess(LatteDecompilerShaderContext* shaderContext, uint8 aluUnitIndex, uint32 currentGroupIndex) const + { + switch (m_pvps[aluUnitIndex].location) + { + case PVPSAlias::LOCATION_TYPE::LOCATION_GPR: + { + sint32 temporaryIndex = GetTemporaryForGPR(m_pvps[aluUnitIndex].index, m_pvps[aluUnitIndex].aluUnit); + if (temporaryIndex < 0) + { + shaderContext->shaderSource->add(_getRegisterVarName(shaderContext, m_pvps[aluUnitIndex].index, -1)); + _appendChannelAccess(shaderContext->shaderSource, m_pvps[aluUnitIndex].aluUnit); + } + else + { + // use temporary instead of GPR + shaderContext->shaderSource->addFmt("backupReg{}", temporaryIndex); + _appendRegisterTypeSuffix(shaderContext->shaderSource, shaderContext->typeTracker.defaultDataType); + } + break; + } + case PVPSAlias::LOCATION_TYPE::LOCATION_PVPS: + _appendPVPS(shaderContext, shaderContext->shaderSource, currentGroupIndex-1, m_pvps[aluUnitIndex].aluUnit); + break; + default: + cemuLog_log(LogType::Force, "Shader {:016x} accesses PV/PS without writing to it", shaderContext->shaderBaseHash); + cemu_assert_suspicious(); + break; + } + } + + /* + * Check for GPR channels which are modified before they are read within the same group + * These registers need to be copied to a temporary + */ + void CreateGPRTemporaries(LatteDecompilerShaderContext* shaderContext, std::span aluInstructions) + { + uint8 registerChannelWriteMask[(LATTE_NUM_GPR * 4 + 7) / 8] = { 0 }; + + m_gprTemporaries.clear(); + for (auto& aluInstruction : aluInstructions) + { + // ignore NOP instructions + if (aluInstruction.isOP3 == false && aluInstruction.opcode == ALU_OP2_INST_NOP) + continue; + cemu_assert_debug(aluInstruction.destElem <= 3); + // check if any previously written register is read + for (sint32 f = 0; f < 3; f++) + { + uint32 readGPRIndex; + uint32 readGPRChannel; + if (GPU7_ALU_SRC_IS_GPR(aluInstruction.sourceOperand[f].sel)) + { + readGPRIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction.sourceOperand[f].sel); + cemu_assert_debug(aluInstruction.sourceOperand[f].chan <= 3); + readGPRChannel = aluInstruction.sourceOperand[f].chan; + } + else if (GPU7_ALU_SRC_IS_PV(aluInstruction.sourceOperand[f].sel) || GPU7_ALU_SRC_IS_PS(aluInstruction.sourceOperand[f].sel)) + { + uint8 aluUnitIndex = 0; + if (GPU7_ALU_SRC_IS_PV(aluInstruction.sourceOperand[f].sel)) + aluUnitIndex = aluInstruction.sourceOperand[f].chan; + else + aluUnitIndex = 4; + // if aliased to a GPR, then consider it a GPR read + if(m_pvps[aluUnitIndex].location != PVPSAlias::LOCATION_TYPE::LOCATION_GPR) + continue; + readGPRIndex = m_pvps[aluUnitIndex].index; + readGPRChannel = m_pvps[aluUnitIndex].aluUnit; + } + else + continue; + // track GPR read + if ((registerChannelWriteMask[(readGPRIndex * 4 + aluInstruction.sourceOperand[f].chan) / 8] & (1 << ((readGPRIndex * 4 + aluInstruction.sourceOperand[f].chan) % 8))) != 0) + { + // register is overwritten by previous instruction, a temporary variable is required + if (GetTemporaryForGPR(readGPRIndex, readGPRChannel) < 0) + m_gprTemporaries.emplace_back(readGPRIndex, readGPRChannel, m_gprTemporaries.size()); + } + } + // track write + if (aluInstruction.writeMask != 0) + registerChannelWriteMask[(aluInstruction.destGpr * 4 + aluInstruction.destElem) / 8] |= (1 << ((aluInstruction.destGpr * 4 + aluInstruction.destElem) % 8)); + } + // output code to move GPRs into temporaries + StringBuf* src = shaderContext->shaderSource; + for (auto& it : m_gprTemporaries) + { + src->addFmt("backupReg{}", it.backupVarIndex); + _appendRegisterTypeSuffix(src, shaderContext->typeTracker.defaultDataType); + src->add(" = "); + src->add(_getRegisterVarName(shaderContext, it.gprIndex)); + _appendChannelAccess(src, it.channel); + src->add(";" _CRLF); + } + } + + // returns -1 if none present + sint32 GetTemporaryForGPR(uint8 gprIndex, uint8 channel) const + { + for (auto& it : m_gprTemporaries) + { + if (it.gprIndex == gprIndex && it.channel == channel) + return (sint32)it.backupVarIndex; + } + return -1; + } + +private: + PVPSAlias m_pvps[5]{}; + boost::container::small_vector m_gprTemporaries; +}; + +sint32 _getVertexShaderOutParamSemanticId(uint32* contextRegisters, sint32 index) // deprecated - move to LatteShaderPSInputTable { uint32 vsSemanticId = (contextRegisters[mmSPI_VS_OUT_ID_0 + (index / 4)] >> (8 * (index % 4))) & 0xFF; // check if export exists since exports are generated based on PS inputs @@ -209,38 +461,6 @@ bool _isReductionInstruction(LatteDecompilerALUInstruction* aluInstruction) return aluInstruction->isOP3 == false && (aluInstruction->opcode == ALU_OP2_INST_DOT4 || aluInstruction->opcode == ALU_OP2_INST_DOT4_IEEE || aluInstruction->opcode == ALU_OP2_INST_CUBE); } -void _appendRegisterTypeSuffix(StringBuf* src, sint32 dataType) -{ - if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->add("i"); - else if (dataType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) - src->add("ui"); - else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->add("f"); - else - cemu_assert_unimplemented(); -} - -void _appendChannelAccess(StringBuf* src, sint32 channelIndex) -{ - cemu_assert_debug(channelIndex >= 0 && channelIndex <= 3); - switch (channelIndex) - { - case 0: - src->add(".x"); - return; - case 1: - src->add(".y"); - return; - case 2: - src->add(".z"); - return; - case 3: - src->add(".w"); - return; - } -} - /* * Writes the name of the output variable and channel * E.g. R5f.x or tempf.x if writeMask is 0 @@ -252,17 +472,10 @@ void _emitInstructionOutputVariableName(LatteDecompilerShaderContext* shaderCont if( aluInstruction->writeMask == 0 ) { // does not output to GPR - if( aluInstruction->aluUnit < 4 && _isReductionInstruction(aluInstruction) == false ) + if( !_isReductionInstruction(aluInstruction) ) { - // output to PV - src->addFmt("PV{}", (aluInstruction->instructionGroupIndex&1)); - _appendRegisterTypeSuffix(src, outputDataType); - } - else if( aluInstruction->aluUnit == 4 && _isReductionInstruction(aluInstruction) == false ) - { - // output to PS - src->addFmt("PS{}", (aluInstruction->instructionGroupIndex&1)); - _appendRegisterTypeSuffix(src, outputDataType); + // output to PV/PS + _appendPVPS(shaderContext, src, aluInstruction->instructionGroupIndex, aluInstruction->aluUnit); return; } else @@ -275,7 +488,7 @@ void _emitInstructionOutputVariableName(LatteDecompilerShaderContext* shaderCont } else { - // output to GPR + // output to GPR. Aliasing to PV/PS happens at the end of the group src->add(_getRegisterVarName(shaderContext, aluInstruction->destGpr, aluInstruction->destRel==0?-1:aluInstruction->indexMode)); _appendChannelAccess(src, aluInstruction->destElem); } @@ -283,23 +496,7 @@ void _emitInstructionOutputVariableName(LatteDecompilerShaderContext* shaderCont void _emitInstructionPVPSOutputVariableName(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction) { - StringBuf* src = shaderContext->shaderSource; - if( aluInstruction->aluUnit == 4 ) - { - // output to PS - src->addFmt("PS{}", aluInstruction->instructionGroupIndex&1); - } - else - { - // output to PV - src->addFmt("PV{}", aluInstruction->instructionGroupIndex&1); - } - sint32 outputDataType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); - _appendRegisterTypeSuffix(src, outputDataType); - if( aluInstruction->aluUnit < 4 ) - { - _appendChannelAccess(src, aluInstruction->aluUnit); - } + _appendPVPS(shaderContext, shaderContext->shaderSource, aluInstruction->instructionGroupIndex, aluInstruction->aluUnit); } void _emitRegisterAccessCode(LatteDecompilerShaderContext* shaderContext, sint32 gprIndex, sint32 channel0, sint32 channel1, sint32 channel2, sint32 channel3, sint32 dataType = -1) @@ -348,14 +545,13 @@ void _emitALURegisterInputAccessCode(LatteDecompilerShaderContext* shaderContext { StringBuf* src = shaderContext->shaderSource; sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); - if( GPU7_ALU_SRC_IS_GPR(aluInstruction->sourceOperand[operandIndex].sel) == false ) - debugBreakpoint(); - sint32 gprIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction->sourceOperand[operandIndex].sel); - - if( aluInstruction->sourceOperand[operandIndex].requiredRegisterBackup ) + cemu_assert_debug(GPU7_ALU_SRC_IS_GPR(aluInstruction->sourceOperand[operandIndex].sel)); + sint32 gprIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + sint32 temporaryIndex = shaderContext->aluPVPSState->GetTemporaryForGPR(gprIndex, aluInstruction->sourceOperand[operandIndex].chan); + if(temporaryIndex >= 0) { // access via backup variable - src->addFmt("backupReg{}", aluInstruction->sourceOperand[operandIndex].registerBackupIndex); + src->addFmt("backupReg{}", temporaryIndex); _appendRegisterTypeSuffix(src, currentRegisterElementType); } else @@ -365,21 +561,11 @@ void _emitALURegisterInputAccessCode(LatteDecompilerShaderContext* shaderContext } } -void _emitPVAccessCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex) +void _emitPVPSAccessCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex, uint8 aluUnitIndex) { - StringBuf* src = shaderContext->shaderSource; - sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); - src->addFmt("PV{}", (aluInstruction->instructionGroupIndex&1)^1); - _appendRegisterTypeSuffix(src, currentRegisterElementType); - _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); -} - -void _emitPSAccessCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex) -{ - StringBuf* src = shaderContext->shaderSource; - sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); - src->addFmt("PS{}", (aluInstruction->instructionGroupIndex&1)^1); - _appendRegisterTypeSuffix(src, currentRegisterElementType); + cemu_assert_debug(aluInstruction->instructionGroupIndex > 0); // PV/PS is uninitialized for group 0 + // PV/PS vars are currently always using the default type (shaderContext->typeTracker.defaultDataType) + shaderContext->aluPVPSState->EmitPVPSAccess(shaderContext, aluUnitIndex, aluInstruction->instructionGroupIndex); } /* @@ -679,26 +865,18 @@ void _emitOperandInputCode(LatteDecompilerShaderContext* shaderContext, LatteDec src->add("0"); else if( requiredType == LATTE_DECOMPILER_DTYPE_FLOAT ) src->add("0.0"); - else - debugBreakpoint(); } else if( GPU7_ALU_SRC_IS_CONST_1F(aluInstruction->sourceOperand[operandIndex].sel) ) { - if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) - src->add("floatBitsToInt(1.0)"); - else if( requiredType == LATTE_DECOMPILER_DTYPE_FLOAT ) - src->add("1.0"); - else - debugBreakpoint(); + _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + src->add("1.0"); + _emitTypeConversionSuffix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); } else if( GPU7_ALU_SRC_IS_CONST_0_5F(aluInstruction->sourceOperand[operandIndex].sel) ) { - if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) - src->add("floatBitsToInt(0.5)"); - else if( requiredType == LATTE_DECOMPILER_DTYPE_FLOAT ) - src->add("0.5"); - else - debugBreakpoint(); + _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + src->add("0.5"); + _emitTypeConversionSuffix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); } else if( GPU7_ALU_SRC_IS_CONST_1I(aluInstruction->sourceOperand[operandIndex].sel) ) { @@ -707,41 +885,36 @@ void _emitOperandInputCode(LatteDecompilerShaderContext* shaderContext, LatteDec else if (requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) src->add("uint(1)"); else - debugBreakpoint(); + cemu_assert_suspicious(); } else if( GPU7_ALU_SRC_IS_CONST_M1I(aluInstruction->sourceOperand[operandIndex].sel) ) { if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) src->add("int(-1)"); else - debugBreakpoint(); + cemu_assert_suspicious(); } else if( GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[operandIndex].sel) ) { if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) - src->addFmt("0x{:08x}", aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]); + src->addFmt("0x{:x}", aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]); else if( requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT ) - src->addFmt("uint(0x{:08x})", aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]); + src->addFmt("uint(0x{:x})", aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]); else if (requiredType == LATTE_DECOMPILER_DTYPE_FLOAT) { uint32 constVal = aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]; - if (constVal == 0x3f800000) - src->add("1.0"); - else if (constVal == 0x3fc00000) - src->add("1.5"); - else if (constVal == 0x3e800000) - src->add("0.25"); - else if (constVal == 0x80000000) - src->add("-0.0"); - else if (constVal == 0x40000000) - src->add("2.0"); - else if (constVal == 0x40800000) - src->add("4.0"); + sint32 exponent = (constVal >> 23) & 0xFF; + exponent -= 127; + if ((constVal & 0xFF) == 0 && exponent >= -10 && exponent <= 10) + { + char floatAsStr[32]; + size_t floatAsStrLen = fmt::format_to_n(floatAsStr, 32, "{:#}", *(float*)&constVal).size; + cemu_assert_debug(floatAsStrLen >= 3); // shortest possible form is "0.0" + src->add(std::string_view(floatAsStr, floatAsStrLen)); + } else - src->addFmt("intBitsToFloat(0x{:08x})", aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]); + src->addFmt("intBitsToFloat(0x{:08x})", constVal); } - else - debugBreakpoint(); } else if( GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel) ) { @@ -755,34 +928,20 @@ void _emitOperandInputCode(LatteDecompilerShaderContext* shaderContext, LatteDec else if( GPU7_ALU_SRC_IS_PV(aluInstruction->sourceOperand[operandIndex].sel) ) { sint32 currentPVDataType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); - if( requiredType == currentPVDataType ) - { - _emitPVAccessCode(shaderContext, aluInstruction, operandIndex); - } - else - { - _emitTypeConversionPrefix(shaderContext, currentPVDataType, requiredType); - _emitPVAccessCode(shaderContext, aluInstruction, operandIndex); - _emitTypeConversionSuffix(shaderContext, currentPVDataType, requiredType); - } + _emitTypeConversionPrefix(shaderContext, currentPVDataType, requiredType); + _emitPVPSAccessCode(shaderContext, aluInstruction, operandIndex, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffix(shaderContext, currentPVDataType, requiredType); } else if( GPU7_ALU_SRC_IS_PS(aluInstruction->sourceOperand[operandIndex].sel) ) { sint32 currentPSDataType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); - if( requiredType == currentPSDataType ) - { - _emitPSAccessCode(shaderContext, aluInstruction, operandIndex); - } - else - { - _emitTypeConversionPrefix(shaderContext, currentPSDataType, requiredType); - _emitPSAccessCode(shaderContext, aluInstruction, operandIndex); - _emitTypeConversionSuffix(shaderContext, currentPSDataType, requiredType); - } + _emitTypeConversionPrefix(shaderContext, currentPSDataType, requiredType); + _emitPVPSAccessCode(shaderContext, aluInstruction, operandIndex, 4); + _emitTypeConversionSuffix(shaderContext, currentPSDataType, requiredType); } else { - debug_printf("Unsupported operand sel 0x%x\n", aluInstruction->sourceOperand[operandIndex].sel); + cemuLog_log(LogType::Force, "Unsupported shader ALU operand sel 0x%x\n", aluInstruction->sourceOperand[operandIndex].sel); debugBreakpoint(); } @@ -793,7 +952,6 @@ void _emitOperandInputCode(LatteDecompilerShaderContext* shaderContext, LatteDec if( requiredTypeOut != requiredType ) _emitTypeConversionSuffix(shaderContext, requiredType, requiredTypeOut); - } void _emitTypeConversionPrefix(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType) @@ -838,13 +996,34 @@ void _emitALUOperationBinary(LatteDecompilerShaderContext* shaderContext, LatteD src->add(";" _CRLF); } +static bool _isSameGPROperand(LatteDecompilerALUInstruction* aluInstruction, sint32 opIndexA, sint32 opIndexB) +{ + if (aluInstruction->sourceOperand[opIndexA].sel != aluInstruction->sourceOperand[opIndexB].sel) + return false; + if (!GPU7_ALU_SRC_IS_GPR(aluInstruction->sourceOperand[opIndexA].sel)) + return false; + if (aluInstruction->sourceOperand[opIndexA].chan != aluInstruction->sourceOperand[opIndexB].chan) + return false; + if (aluInstruction->sourceOperand[opIndexA].abs != aluInstruction->sourceOperand[opIndexB].abs) + return false; + if (aluInstruction->sourceOperand[opIndexA].neg != aluInstruction->sourceOperand[opIndexB].neg) + return false; + if (aluInstruction->sourceOperand[opIndexA].rel != aluInstruction->sourceOperand[opIndexB].rel) + return false; + return true; +} + +static bool _operandHasModifiers(LatteDecompilerALUInstruction* aluInstruction, sint32 opIndex) +{ + return aluInstruction->sourceOperand[opIndex].abs != 0 || aluInstruction->sourceOperand[opIndex].neg != 0; +} + void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, LatteDecompilerALUInstruction* aluInstruction) { StringBuf* src = shaderContext->shaderSource; - sint32 outputType; // data type of output + sint32 outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); // data type of output if( aluInstruction->opcode == ALU_OP2_INST_MOV ) { - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); bool requiresFloatMove = false; requiresFloatMove = aluInstruction->sourceOperand[0].abs != 0 || aluInstruction->sourceOperand[0].neg != 0; if( requiresFloatMove ) @@ -867,12 +1046,8 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat } else if( aluInstruction->opcode == ALU_OP2_INST_MOVA_FLOOR ) { - if( aluInstruction->writeMask != 0 ) - debugBreakpoint(); - if( aluInstruction->omod != 0 ) - debugBreakpoint(); // omod is handled after instruction code, check what affect it has on AR - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); - + cemu_assert_debug(aluInstruction->writeMask == 0); + cemu_assert_debug(aluInstruction->omod == 0); src->add("tempResultf = "); _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); src->add(";" _CRLF); @@ -897,12 +1072,8 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat } else if( aluInstruction->opcode == ALU_OP2_INST_MOVA_INT ) { - if( aluInstruction->writeMask != 0 ) - debugBreakpoint(); - if( aluInstruction->omod != 0 ) - debugBreakpoint(); // omod is handled after instruction code, check what affect it has on AR - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); - + cemu_assert_debug(aluInstruction->writeMask == 0); + cemu_assert_debug(aluInstruction->omod == 0); src->add("tempResulti = "); _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); src->add(";" _CRLF); @@ -932,7 +1103,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat else if( aluInstruction->opcode == ALU_OP2_INST_MUL ) { // 0*anything is always 0 - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); @@ -978,7 +1148,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat } else if (aluInstruction->opcode == ALU_OP2_INST_RECIP_IEEE) { - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); @@ -999,7 +1168,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat // -INF becomes -0.0 src->add("else if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); // assign result to output - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); @@ -1028,7 +1196,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat src->add("else if( isinf(tempResultf) == true && (floatBitsToInt(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); } // assign result to output - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); @@ -1041,7 +1208,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat aluInstruction->opcode == ALU_OP2_INST_MAX_DX10 || aluInstruction->opcode == ALU_OP2_INST_MIN_DX10 ) { - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); @@ -1065,7 +1231,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat aluInstruction->opcode == ALU_OP2_INST_FRACT || aluInstruction->opcode == ALU_OP2_INST_TRUNC ) { - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); @@ -1091,11 +1256,9 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat src->add("tempResultf = log2(tempResultf);" _CRLF); if( aluInstruction->opcode == ALU_OP2_INST_LOG_CLAMPED ) { - // note: apparently -INF < 0.0 does not resolve to true src->add("if( isinf(tempResultf) == true ) tempResultf = -3.40282347E+38F;" _CRLF); } // assign result to output - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); @@ -1105,7 +1268,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat } else if( aluInstruction->opcode == ALU_OP2_INST_RNDNE ) { - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); @@ -1118,7 +1280,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat } else if( aluInstruction->opcode == ALU_OP2_INST_EXP_IEEE ) { - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); @@ -1131,7 +1292,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat } else if( aluInstruction->opcode == ALU_OP2_INST_SQRT_IEEE ) { - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); @@ -1145,7 +1305,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat else if( aluInstruction->opcode == ALU_OP2_INST_SIN || aluInstruction->opcode == ALU_OP2_INST_COS ) { - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); @@ -1161,7 +1320,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat } else if( aluInstruction->opcode == ALU_OP2_INST_FLT_TO_INT ) { - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); @@ -1174,7 +1332,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat } else if( aluInstruction->opcode == ALU_OP2_INST_FLT_TO_UINT ) { - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT, outputType); @@ -1187,12 +1344,10 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat } else if( aluInstruction->opcode == ALU_OP2_INST_INT_TO_FLOAT ) { - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); - src->add("float"); - src->add("("); + src->add("float("); _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); src->add(")"); _emitTypeConversionSuffix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); @@ -1200,12 +1355,10 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat } else if( aluInstruction->opcode == ALU_OP2_INST_UINT_TO_FLOAT ) { - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); - src->add("float"); - src->add("("); + src->add("float("); _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT); src->add(")"); _emitTypeConversionSuffix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); @@ -1219,7 +1372,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat _emitALUOperationBinary(shaderContext, aluInstruction, " ^ "); else if( aluInstruction->opcode == ALU_OP2_INST_NOT_INT ) { - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); @@ -1234,7 +1386,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat else if( aluInstruction->opcode == ALU_OP2_INST_MAX_INT || aluInstruction->opcode == ALU_OP2_INST_MIN_INT ) { // not verified - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); if( aluInstruction->opcode == ALU_OP2_INST_MAX_INT ) src->add(" = max("); @@ -1262,7 +1413,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat _emitALUOperationBinary(shaderContext, aluInstruction, " >> "); else if( aluInstruction->opcode == ALU_OP2_INST_ASHR_INT ) { - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); @@ -1277,7 +1427,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat aluInstruction->opcode == ALU_OP2_INST_SETNE || aluInstruction->opcode == ALU_OP2_INST_SETE ) { - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); @@ -1303,7 +1452,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat { if( aluInstruction->omod != 0 ) debugBreakpoint(); - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); @@ -1318,7 +1466,7 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat else if( aluInstruction->opcode == ALU_OP2_INST_SETGE_DX10 ) src->add(" >= "); _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add(")?int(0xFFFFFFFF):int(0x0))"); + src->add(")?-1:0)"); _emitTypeConversionSuffix(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); src->add(";"); src->add(_CRLF); @@ -1328,7 +1476,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat aluInstruction->opcode == ALU_OP2_INST_SETGT_INT || aluInstruction->opcode == ALU_OP2_INST_SETGE_INT ) { - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); @@ -1343,7 +1490,7 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat else if( aluInstruction->opcode == ALU_OP2_INST_SETGE_INT ) src->add(" >= "); _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); - src->add(")?int(0xFFFFFFFF):int(0x0)"); + src->add(")?-1:0"); _emitTypeConversionSuffix(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); src->add(";" _CRLF); } @@ -1351,7 +1498,6 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat aluInstruction->opcode == ALU_OP2_INST_SETGT_UINT ) { // todo: Unsure if the result is unsigned or signed - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); @@ -1375,9 +1521,7 @@ void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE_INT || aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT_INT ) { - if( aluInstruction->writeMask != 0 ) - debugBreakpoint(); - + cemu_assert_debug(aluInstruction->writeMask == 0); bool isIntPred = (aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE_INT) || (aluInstruction->opcode == ALU_OP2_INST_PRED_SETE_INT) || (aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE_INT) || (aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT_INT); src->add("predResult"); @@ -1463,19 +1607,41 @@ void _emitALUOP3InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat { StringBuf* src = shaderContext->shaderSource; cemu_assert_debug(aluInstruction->destRel == 0); // todo - sint32 outputType; // data type of output + sint32 outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); + + /* check for common no-op or mov-like instructions */ + if (aluInstruction->opcode == ALU_OP3_INST_CMOVGE || + aluInstruction->opcode == ALU_OP3_INST_CMOVE || + aluInstruction->opcode == ALU_OP3_INST_CMOVGT || + aluInstruction->opcode == ALU_OP3_INST_CNDE_INT || + aluInstruction->opcode == ALU_OP3_INST_CNDGT_INT || + aluInstruction->opcode == ALU_OP3_INST_CMOVGE_INT) + { + if (_isSameGPROperand(aluInstruction, 1, 2) && !_operandHasModifiers(aluInstruction, 1)) + { + // the condition is irrelevant as both operands are the same + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, outputType); + src->add(";" _CRLF); + return; + } + } + + + /* generic handlers */ if( aluInstruction->opcode == ALU_OP3_INST_MULADD || aluInstruction->opcode == ALU_OP3_INST_MULADD_D2 || aluInstruction->opcode == ALU_OP3_INST_MULADD_M2 || aluInstruction->opcode == ALU_OP3_INST_MULADD_M4 || aluInstruction->opcode == ALU_OP3_INST_MULADD_IEEE ) { - // todo: Figure out the exact difference between normal MULADD and MULADD_IEEE - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); + // todo: The difference between MULADD and MULADD IEEE is that the former has 0*anything=0 rule similar to MUL/MUL_IEEE? _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); - src->add("("); + if (aluInstruction->opcode != ALU_OP3_INST_MULADD) // avoid unnecessary parenthesis to improve code readability slightly + src->add("("); bool useDefaultMul = false; if (GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[1].sel) || @@ -1503,7 +1669,8 @@ void _emitALUOP3InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat src->add(" + "); _emitOperandInputCode(shaderContext, aluInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); - src->add(")"); + if(aluInstruction->opcode != ALU_OP3_INST_MULADD) + src->add(")"); if( aluInstruction->opcode == ALU_OP3_INST_MULADD_D2 ) src->add("/2.0"); else if( aluInstruction->opcode == ALU_OP3_INST_MULADD_M2 ) @@ -1516,13 +1683,11 @@ void _emitALUOP3InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat else if(aluInstruction->opcode == ALU_OP3_INST_CNDE_INT || aluInstruction->opcode == ALU_OP3_INST_CNDGT_INT || aluInstruction->opcode == ALU_OP3_INST_CMOVGE_INT) { bool requiresFloatResult = (aluInstruction->sourceOperand[1].neg != 0) || (aluInstruction->sourceOperand[2].neg != 0); - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, requiresFloatResult?LATTE_DECOMPILER_DTYPE_FLOAT:LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); src->add("(("); _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); - //src->add(" == 0)?("); if (aluInstruction->opcode == ALU_OP3_INST_CNDE_INT) src->add(" == "); else if (aluInstruction->opcode == ALU_OP3_INST_CNDGT_INT) @@ -1542,17 +1707,16 @@ void _emitALUOP3InstructionCode(LatteDecompilerShaderContext* shaderContext, Lat aluInstruction->opcode == ALU_OP3_INST_CMOVE || aluInstruction->opcode == ALU_OP3_INST_CMOVGT ) { - outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); _emitInstructionOutputVariableName(shaderContext, aluInstruction); src->add(" = "); _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); src->add("(("); _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); - if( aluInstruction->opcode == ALU_OP3_INST_CMOVE ) + if (aluInstruction->opcode == ALU_OP3_INST_CMOVE) src->add(" == "); - else if( aluInstruction->opcode == ALU_OP3_INST_CMOVGE ) + else if (aluInstruction->opcode == ALU_OP3_INST_CMOVGE) src->add(" >= "); - else if( aluInstruction->opcode == ALU_OP3_INST_CMOVGT ) + else if (aluInstruction->opcode == ALU_OP3_INST_CMOVGT) src->add(" > "); src->add("0.0)?("); _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); @@ -1692,36 +1856,20 @@ void _emitALUReductionInstructionCode(LatteDecompilerShaderContext* shaderContex cemu_assert_unimplemented(); } -void _emitALUClauseRegisterBackupCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, sint32 startIndex, sint32 instructionGroupIndex) +void _emitALUClauseRegisterBackupCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, sint32 startIndex) { - StringBuf* src = shaderContext->shaderSource; - for (size_t i = (uint32)startIndex; i < cfInstruction->instructionsALU.size(); i++) + sint32 instructionGroupIndex = cfInstruction->instructionsALU[startIndex].instructionGroupIndex; + size_t groupSize = 1; + while ((startIndex + groupSize) < cfInstruction->instructionsALU.size()) { - LatteDecompilerALUInstruction& aluInstruction = cfInstruction->instructionsALU[i]; - if( aluInstruction.instructionGroupIndex != instructionGroupIndex ) - break; // end of current group - // ignore NOP instruction - if( aluInstruction.isOP3 == false && aluInstruction.opcode == ALU_OP2_INST_NOP ) - continue; - // check if there are any operand input registers that need to be kept intact - for(sint32 f=0; f<3; f++) - { - if( GPU7_ALU_SRC_IS_GPR(aluInstruction.sourceOperand[f].sel) == false ) - continue; - if( aluInstruction.sourceOperand[f].requiredRegisterBackup == false ) - continue; - sint32 gprIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction.sourceOperand[f].sel); - if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) - src->addFmt("backupReg{}i = ", aluInstruction.sourceOperand[f].registerBackupIndex); - else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) - src->addFmt("backupReg{}f = ", aluInstruction.sourceOperand[f].registerBackupIndex); - src->add(_getRegisterVarName(shaderContext, gprIndex)); - _appendChannelAccess(src, aluInstruction.sourceOperand[f].chan); - src->add(";" _CRLF); - } + if (instructionGroupIndex != cfInstruction->instructionsALU[startIndex + groupSize].instructionGroupIndex) + break; + groupSize++; } + shaderContext->aluPVPSState->CreateGPRTemporaries(shaderContext, { cfInstruction->instructionsALU.data() + startIndex, groupSize }); } +/* bool _isPVUsedInNextGroup(LatteDecompilerCFInstruction* cfInstruction, sint32 startIndex, sint32 pvUnit) { sint32 currentGroupIndex = cfInstruction->instructionsALU[startIndex].instructionGroupIndex; @@ -1778,6 +1926,7 @@ bool _isPVUsedInNextGroup(LatteDecompilerCFInstruction* cfInstruction, sint32 st } return false; } +*/ void _emitVec3(LatteDecompilerShaderContext* shaderContext, uint32 dataType, LatteDecompilerALUInstruction* aluInst0, sint32 opIdx0, LatteDecompilerALUInstruction* aluInst1, sint32 opIdx1, LatteDecompilerALUInstruction* aluInst2, sint32 opIdx2) { @@ -1819,103 +1968,33 @@ void _emitGPRVectorAssignment(LatteDecompilerShaderContext* shaderContext, Latte src->add(" = "); } -/* - * Analyze 3 instructions from the same group and generate vectorized/optimized code if possible - * Return true if code was generated - */ -bool _handleInstructionPattern3(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, sint32 startIndex, LatteDecompilerALUInstruction** instructionBlock) -{ - // check if omod and clamp is zero - if (instructionBlock[0]->omod != 0 || instructionBlock[0]->destClamp != 0) - return false; - if (instructionBlock[1]->omod != 0 || instructionBlock[1]->destClamp != 0) - return false; - if (instructionBlock[2]->omod != 0 || instructionBlock[2]->destClamp != 0) - return false; - // check if destination operand is the same register - // todo: Also check for PV - if (instructionBlock[0]->writeMask == 0 || - instructionBlock[1]->writeMask == 0 || - instructionBlock[2]->writeMask == 0 - ) - return false; - if (instructionBlock[0]->destGpr != instructionBlock[1]->destGpr || instructionBlock[1]->destGpr != instructionBlock[2]->destGpr) - return false; - if (instructionBlock[0]->destRel != 0 || instructionBlock[1]->destRel != 0 || instructionBlock[2]->destRel != 0) - return false; - // make sure all instructions are of type OP2 - //if (instructionBlock[0]->isOP3 || instructionBlock[1]->isOP3 || instructionBlock[2]->isOP3) - // return false; - if (instructionBlock[0]->isOP3 != instructionBlock[1]->isOP3 || instructionBlock[1]->isOP3 != instructionBlock[2]->isOP3) - return false; - // check if operation is the same - if (instructionBlock[0]->opcode != instructionBlock[1]->opcode || instructionBlock[1]->opcode != instructionBlock[2]->opcode) - return false; - // at this point we know the following conditions are true for the instruction block: - // All output to the same GPR - // All share the same OP2 operation - // omod and destClamp is 0 - StringBuf* src = shaderContext->shaderSource; - if (instructionBlock[0]->isOP3 == false && instructionBlock[0]->opcode == ALU_OP2_INST_MOV) - { - uint32 outputType = _getALUInstructionOutputDataType(shaderContext, instructionBlock[0]); - _emitGPRVectorAssignment(shaderContext, instructionBlock, 3); - _emitVec3(shaderContext, outputType, instructionBlock[0], 0, instructionBlock[1], 0, instructionBlock[2], 0); - src->add(";" _CRLF); - } - else if (instructionBlock[0]->isOP3 == false && instructionBlock[0]->opcode == ALU_OP2_INST_ADD) - { - uint32 outputType = _getALUInstructionOutputDataType(shaderContext, instructionBlock[0]); - _emitGPRVectorAssignment(shaderContext, instructionBlock, 3); - _emitTypeConversionPrefix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); - _emitVec3(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, instructionBlock[0], 0, instructionBlock[1], 0, instructionBlock[2], 0); - src->add(" + "); - _emitVec3(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, instructionBlock[0], 1, instructionBlock[1], 1, instructionBlock[2], 1); - _emitTypeConversionSuffix(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); - src->add(";" _CRLF); - } - else - return false; - // handle PV/PS - for (sint32 i = 0; i < 3; i++) - { - LatteDecompilerALUInstruction* aluInstruction = instructionBlock[i]; - bool skipPV = false; - if (aluInstruction->writeMask == 0) - skipPV = true; // no write -> Value already written to PV/PS - if (aluInstruction->aluUnit < 4 && _isPVUsedInNextGroup(cfInstruction, startIndex, aluInstruction->aluUnit) == false) - skipPV = true; - if (skipPV == false) - { - _emitInstructionPVPSOutputVariableName(shaderContext, aluInstruction); - src->add(" = "); - _emitInstructionOutputVariableName(shaderContext, aluInstruction); - src->add(";" _CRLF); - } - // todo: We can vectorize this part as well - } - return true; -} - void _emitALUClauseCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) { + ALUClauseTemporariesState pvpsState; + shaderContext->aluPVPSState = &pvpsState; StringBuf* src = shaderContext->shaderSource; LatteDecompilerALUInstruction* aluRedcInstruction[4]; + size_t groupStartIndex = 0; for(size_t i=0; iinstructionsALU.size(); i++) { LatteDecompilerALUInstruction& aluInstruction = cfInstruction->instructionsALU[i]; if( aluInstruction.indexInGroup == 0 ) { src->addFmt("// {}" _CRLF, aluInstruction.instructionGroupIndex); - // create register backup code for this group (if required) - _emitALUClauseRegisterBackupCode(shaderContext, cfInstruction, i, aluInstruction.instructionGroupIndex); + // apply PV/PS updates for previous group + if (i > 0) + { + pvpsState.TrackGroupOutputPVPS(shaderContext, cfInstruction->instructionsALU.data() + groupStartIndex, i - groupStartIndex); + } + groupStartIndex = i; + // backup registers which are read after being written + _emitALUClauseRegisterBackupCode(shaderContext, cfInstruction, i); } // detect reduction instructions and use a special handler bool isReductionOperation = _isReductionInstruction(&aluInstruction); if( isReductionOperation ) { - if( (i+4)> cfInstruction->instructionsALU.size()) - debugBreakpoint(); + cemu_assert_debug((i + 4) <= cfInstruction->instructionsALU.size()); aluRedcInstruction[0] = &aluInstruction; aluRedcInstruction[1] = &cfInstruction->instructionsALU[i + 1]; aluRedcInstruction[2] = &cfInstruction->instructionsALU[i + 2]; @@ -1929,35 +2008,9 @@ void _emitALUClauseCode(LatteDecompilerShaderContext* shaderContext, LatteDecomp if( aluRedcInstruction[0]->destClamp != aluRedcInstruction[1]->destClamp || aluRedcInstruction[1]->destClamp != aluRedcInstruction[2]->destClamp || aluRedcInstruction[2]->destClamp != aluRedcInstruction[3]->destClamp ) debugBreakpoint(); _emitALUReductionInstructionCode(shaderContext, aluRedcInstruction); - i += 3; // skip the three following instructions that are part of the reduction operation + i += 3; // skip the instructions that are part of the reduction operation } - // detect and handle common instruction patterns (for example, vectorizable operations) - if (isReductionOperation == false) - { - sint32 remainingInstructions = cfInstruction->instructionsALU.size() - i; - LatteDecompilerALUInstruction* instructionBlock[4]; - // scan for instruction patterns of length 4 - // todo - // scan for instruction patterns of length 3 - if (remainingInstructions >= 3) - { - instructionBlock[0] = &aluInstruction; - instructionBlock[1] = &cfInstruction->instructionsALU[i + 1]; - instructionBlock[2] = &cfInstruction->instructionsALU[i + 2]; - if (instructionBlock[0]->instructionGroupIndex == instructionBlock[1]->instructionGroupIndex && instructionBlock[1]->instructionGroupIndex == instructionBlock[2]->instructionGroupIndex) - { - if (_handleInstructionPattern3(shaderContext, cfInstruction, i, instructionBlock)) - { - i += 2; - continue; - } - } - } - // scan for instruction patterns of length 2 - // todo - } - // handle instruction (if not a reduction operation, else it would already have been handled above) - if( isReductionOperation == false ) + else /* not a reduction operation */ { if( aluInstruction.isOP3 ) { @@ -2003,7 +2056,7 @@ void _emitALUClauseCode(LatteDecompilerShaderContext* shaderContext, LatteDecomp } else { - debugBreakpoint(); + cemu_assert_unimplemented(); } } // handle clamp @@ -2025,56 +2078,40 @@ void _emitALUClauseCode(LatteDecompilerShaderContext* shaderContext, LatteDecomp } else { - debugBreakpoint(); + cemu_assert_unimplemented(); } } - // set PV or PS - if( isReductionOperation == false ) - { - // make sure the result wasn't already written to PV/PS - bool skipPV = false; - if (aluInstruction.writeMask == 0) - skipPV = true; // no write -> Value already written to PV/PS - if (aluInstruction.aluUnit < 4 && _isPVUsedInNextGroup(cfInstruction, i, aluInstruction.aluUnit) == false) - skipPV = true; - if(skipPV == false) - { - _emitInstructionPVPSOutputVariableName(shaderContext, &aluInstruction); - src->add(" = "); - _emitInstructionOutputVariableName(shaderContext, &aluInstruction); - src->add(";" _CRLF); - } - } - else + // handle result broadcasting for reduction instructions + if( isReductionOperation ) { // reduction operations set all four PV components (todo: Needs further research. According to AMD docs, dot4 only sets PV.x? update: Unlike DOT4, CUBE sets all PV elements accordingly to their GPR output?) if( aluRedcInstruction[0]->opcode == ALU_OP2_INST_CUBE ) { // CUBE - for(sint32 f=0; f<4; f++) + for (sint32 f = 0; f < 4; f++) { + if (aluRedcInstruction[f]->writeMask != 0) + continue; _emitInstructionPVPSOutputVariableName(shaderContext, aluRedcInstruction[f]); src->add(" = "); - _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[f]); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[0]); src->add(";" _CRLF); } } else { // DOT4, DOT4_IEEE, etc. + // reduction operation result is only set for output in redc[0], we also need to update redc[1] to redc[3] for(sint32 f=0; f<4; f++) - { - _emitInstructionPVPSOutputVariableName(shaderContext, aluRedcInstruction[f]); - src->add(" = "); - _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[0]); - src->add(";" _CRLF); - } - // reduction operations are only set for output in redc[0], we also need to update redc[1] to redc[3] - for(sint32 f=1; f<4; f++) { if( aluRedcInstruction[f]->writeMask == 0 ) - continue; - _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[f]); + _emitInstructionPVPSOutputVariableName(shaderContext, aluRedcInstruction[f]); + else + { + if (f == 0) + continue; + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[f]); + } src->add(" = "); _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[0]); src->add(";" _CRLF); @@ -2082,6 +2119,7 @@ void _emitALUClauseCode(LatteDecompilerShaderContext* shaderContext, LatteDecomp } } } + shaderContext->aluPVPSState = nullptr; } /* @@ -3834,6 +3872,9 @@ void LatteDecompiler_emitGLSLHelperFunctions(LatteDecompilerShaderContext* shade fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){return mix(0.0, a*b, (a != 0.0) && (b != 0.0));}" _CRLF); // compiles faster on Nvidia and also results in lower RAM usage (OpenGL) else fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ if( a == 0.0 || b == 0.0 ) return 0.0; return a*b; }" _CRLF); + + // DXKV-like: fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return (b==0.0 ? 0.0 : a) * (a==0.0 ? 0.0 : b); }" _CRLF); + } else { @@ -3847,7 +3888,7 @@ void LatteDecompiler_emitAttributeImport(LatteDecompilerShaderContext* shaderCon { auto src = shaderContext->shaderSource; - static const char* dsMappingTableFloat[6] = { "int(attrDecoder.x)", "int(attrDecoder.y)", "int(attrDecoder.z)", "int(attrDecoder.w)", "floatBitsToInt(0.0)", "floatBitsToInt(1.0)" }; + static const char* dsMappingTableFloat[6] = { "int(attrDecoder.x)", "int(attrDecoder.y)", "int(attrDecoder.z)", "int(attrDecoder.w)", /*"floatBitsToInt(0.0)"*/ "0", /*"floatBitsToInt(1.0)"*/ "0x3f800000" }; static const char* dsMappingTableInt[6] = { "int(attrDecoder.x)", "int(attrDecoder.y)", "int(attrDecoder.z)", "int(attrDecoder.w)", "0", "1" }; // get register index based on vtx semantic table @@ -3964,13 +4005,13 @@ void LatteDecompiler_emitGLSLShader(LatteDecompilerShaderContext* shaderContext, src->addFmt("float backupReg0f, backupReg1f, backupReg2f, backupReg3f, backupReg4f;" _CRLF); if (shaderContext->typeTracker.genIntReg) { - src->addFmt("ivec4 PV0i = ivec4(0), PV1i = ivec4(0);" _CRLF); + src->addFmt("int PV0ix = 0, PV0iy = 0, PV0iz = 0, PV0iw = 0, PV1ix = 0, PV1iy = 0, PV1iz = 0, PV1iw = 0;" _CRLF); src->addFmt("int PS0i = 0, PS1i = 0;" _CRLF); src->addFmt("ivec4 tempi = ivec4(0);" _CRLF); } if (shaderContext->typeTracker.genFloatReg) { - src->addFmt("vec4 PV0f = vec4(0.0), PV1f = vec4(0.0);" _CRLF); + src->addFmt("float PV0fx = 0.0, PV0fy = 0.0, PV0fz = 0.0, PV0fw = 0.0, PV1fx = 0.0, PV1fy = 0.0, PV1fz = 0.0, PV1fw = 0.0;" _CRLF); src->addFmt("float PS0f = 0.0, PS1f = 0.0;" _CRLF); src->addFmt("vec4 tempf = vec4(0.0);" _CRLF); } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h index 49bd3a64..53fb61ef 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h @@ -25,9 +25,6 @@ struct LatteDecompilerALUInstruction uint8 abs{}; uint8 neg{}; uint8 chan{}; - // register backup information (used for instruction groups where the same register is read and written) - bool requiredRegisterBackup{}; - uint8 registerBackupIndex{}; // index of the used register backup variable (at the beginning of the group the register value is copied to the temporary register with this index) }sourceOperand[3]; union { @@ -214,7 +211,7 @@ struct LatteDecompilerShaderContext // emitter bool hasUniformVarBlock; sint32 currentBindingPointVK{}; - + struct ALUClauseTemporariesState* aluPVPSState{nullptr}; // misc std::vector list_subroutines; };