PPCRec: Fixes and optimizations + rework FRES/FRSQRTE

This commit is contained in:
Exzap 2024-10-23 08:36:12 +02:00
parent 002a03df3d
commit 608757dbeb
13 changed files with 408 additions and 354 deletions

View File

@ -32,7 +32,7 @@ espresso_frsqrte_entry_t frsqrteLookupTable[32] =
{0x20c1000, 0x35e},{0x1f12000, 0x332},{0x1d79000, 0x30a},{0x1bf4000, 0x2e6},
};
double frsqrte_espresso(double input)
ATTR_MS_ABI double frsqrte_espresso(double input)
{
unsigned long long x = *(unsigned long long*)&input;
@ -111,7 +111,7 @@ espresso_fres_entry_t fresLookupTable[32] =
{0x88400, 0x11a}, {0x65000, 0x11a}, {0x41c00, 0x108}, {0x20c00, 0x106}
};
double fres_espresso(double input)
ATTR_MS_ABI double fres_espresso(double input)
{
// based on testing we know that fres uses only the first 15 bits of the mantissa
// seee eeee eeee mmmm mmmm mmmm mmmx xxxx .... (s = sign, e = exponent, m = mantissa, x = not used)

View File

@ -191,8 +191,8 @@ inline double roundTo25BitAccuracy(double d)
return *(double*)&v;
}
double fres_espresso(double input);
double frsqrte_espresso(double input);
ATTR_MS_ABI double fres_espresso(double input);
ATTR_MS_ABI double frsqrte_espresso(double input);
void fcmpu_espresso(PPCInterpreter_t* hCPU, int crfD, double a, double b);

View File

@ -601,8 +601,10 @@ void PPCRecompilerX64Gen_imlInstruction_atomic_cmp_store(PPCRecFunction_t* PPCRe
void PPCRecompilerX64Gen_imlInstruction_call_imm(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction)
{
// the register allocator takes care of spilling volatile registers and moving parameters to the right registers, so we don't need to do any special handling here
x64GenContext->emitter->SUB_qi8(X86_REG_RSP, 0x28); // reserve enough space for any parameters while keeping stack alignment of 16 intact
x64GenContext->emitter->MOV_qi64(X86_REG_RAX, imlInstruction->op_call_imm.callAddress);
x64GenContext->emitter->CALL_q(X86_REG_RAX);
x64GenContext->emitter->ADD_qi8(X86_REG_RSP, 0x28);
}
bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction)

View File

@ -780,18 +780,6 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r(PPCRecFunction_t* PPCRecFunction
// move to FPR register
x64Gen_movq_xmmReg_reg64(x64GenContext, regR, REG_RESV_TEMP);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_BOTTOM_FRES_TO_BOTTOM_AND_TOP )
{
// move register to XMM15
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regA);
// call assembly routine to calculate accurate FRES result in XMM15
x64Gen_mov_reg64_imm64(x64GenContext, REG_RESV_TEMP, (uint64)recompiler_fres);
x64Gen_call_reg64(x64GenContext, REG_RESV_TEMP);
// copy result to bottom and top half of result register
x64Gen_movddup_xmmReg_xmmReg(x64GenContext, regR, REG_RESV_FPR_TEMP);
}
else if (imlInstruction->operation == PPCREC_IML_OP_FPR_BOTTOM_RECIPROCAL_SQRT)
{
// move register to XMM15

View File

@ -363,7 +363,6 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
operation == PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM_AND_TOP ||
operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_AND_TOP_SWAPPED ||
operation == PPCREC_IML_OP_ASSIGN ||
operation == PPCREC_IML_OP_FPR_BOTTOM_FRES_TO_BOTTOM_AND_TOP ||
operation == PPCREC_IML_OP_FPR_NEGATE_PAIR ||
operation == PPCREC_IML_OP_FPR_ABS_PAIR ||
operation == PPCREC_IML_OP_FPR_FRES_PAIR ||

View File

@ -143,7 +143,6 @@ enum
PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM, // leave top of destination untouched
PPCREC_IML_OP_FPR_COPY_BOTTOM_AND_TOP_SWAPPED,
PPCREC_IML_OP_FPR_EXPAND_BOTTOM32_TO_BOTTOM64_AND_TOP64, // expand bottom f32 to f64 in bottom and top half
PPCREC_IML_OP_FPR_BOTTOM_FRES_TO_BOTTOM_AND_TOP, // calculate reciprocal with Espresso accuracy of source bottom half and write result to destination bottom and top half
PPCREC_IML_OP_FPR_FCMPO_BOTTOM, // deprecated
PPCREC_IML_OP_FPR_FCMPU_BOTTOM, // deprecated
PPCREC_IML_OP_FPR_FCMPU_TOP, // deprecated

File diff suppressed because it is too large Load Diff

View File

@ -17,9 +17,19 @@ public:
m_regBitmask &= ~((uint64)1 << index);
}
void SetAllAvailable()
{
m_regBitmask = ~0ull;
}
bool HasAllAvailable() const
{
return m_regBitmask == ~0ull;
}
bool IsAvailable(uint32 index) const
{
return (m_regBitmask & (1 << index)) != 0;
return (m_regBitmask & ((uint64)1 << index)) != 0;
}
IMLPhysRegisterSet& operator&=(const IMLPhysRegisterSet& other)

View File

@ -67,38 +67,30 @@ boost::container::small_vector<raLivenessRange*, 128> raLivenessRange::GetAllSub
return subranges;
}
void raLivenessRange::GetAllowedRegistersExRecursive(raLivenessRange* range, uint32 iterationIndex, IMLPhysRegisterSet& allowedRegs)
{
range->lastIterationIndex = iterationIndex;
for (auto& it : range->list_fixedRegRequirements)
allowedRegs &= it.allowedReg;
// check successors
if (range->subrangeBranchTaken && range->subrangeBranchTaken->lastIterationIndex != iterationIndex)
GetAllowedRegistersExRecursive(range->subrangeBranchTaken, iterationIndex, allowedRegs);
if (range->subrangeBranchNotTaken && range->subrangeBranchNotTaken->lastIterationIndex != iterationIndex)
GetAllowedRegistersExRecursive(range->subrangeBranchNotTaken, iterationIndex, allowedRegs);
// check predecessors
for (auto& prev : range->previousRanges)
{
if (prev->lastIterationIndex != iterationIndex)
GetAllowedRegistersExRecursive(prev, iterationIndex, allowedRegs);
}
};
bool raLivenessRange::GetAllowedRegistersEx(IMLPhysRegisterSet& allowedRegisters)
{
if(interval2.ExtendsPreviousSegment() || interval2.ExtendsIntoNextSegment())
{
auto clusterRanges = GetAllSubrangesInCluster();
bool hasAnyRequirement = false;
for(auto& subrange : clusterRanges)
{
if(subrange->list_fixedRegRequirements.empty())
continue;
allowedRegisters = subrange->list_fixedRegRequirements.front().allowedReg;
hasAnyRequirement = true;
break;
}
if(!hasAnyRequirement)
return false;
for(auto& subrange : clusterRanges)
{
for(auto& fixedRegLoc : subrange->list_fixedRegRequirements)
allowedRegisters &= fixedRegLoc.allowedReg;
}
}
else
{
// local check only, slightly faster
if(list_fixedRegRequirements.empty())
return false;
allowedRegisters = list_fixedRegRequirements.front().allowedReg;
for(auto& fixedRegLoc : list_fixedRegRequirements)
allowedRegisters &= fixedRegLoc.allowedReg;
}
return true;
uint32 iterationIndex = PPCRecRA_getNextIterationIndex();
allowedRegisters.SetAllAvailable();
GetAllowedRegistersExRecursive(this, iterationIndex, allowedRegisters);
return !allowedRegisters.HasAllAvailable();
}
IMLPhysRegisterSet raLivenessRange::GetAllowedRegisters(IMLPhysRegisterSet regPool)
@ -424,6 +416,14 @@ void PPCRecRA_debugValidateSubrange(raLivenessRange* range)
cemu_assert_debug(range->list_locations.front().index >= range->interval2.start.GetInstructionIndexEx());
cemu_assert_debug(range->list_locations.back().index <= range->interval2.end.GetInstructionIndexEx());
}
// validate fixed reg requirements
if (!range->list_fixedRegRequirements.empty())
{
cemu_assert_debug(range->list_fixedRegRequirements.front().pos >= range->interval2.start);
cemu_assert_debug(range->list_fixedRegRequirements.back().pos <= range->interval2.end);
for(sint32 i = 0; i < (sint32)range->list_fixedRegRequirements.size()-1; i++)
cemu_assert_debug(range->list_fixedRegRequirements[i].pos < range->list_fixedRegRequirements[i+1].pos);
}
}
#else
@ -563,7 +563,7 @@ raLivenessRange* PPCRecRA_splitLocalSubrange2(ppcImlGenContext_t* ppcImlGenConte
for (sint32 i = 0; i < subrange->list_fixedRegRequirements.size(); i++)
{
raFixedRegRequirement* fixedReg = subrange->list_fixedRegRequirements.data() + i;
if (tailInterval.ContainsInstructionIndex(fixedReg->pos.GetInstructionIndex()))
if (tailInterval.ContainsEdge(fixedReg->pos))
{
tailSubrange->list_fixedRegRequirements.push_back(*fixedReg);
}
@ -572,7 +572,7 @@ raLivenessRange* PPCRecRA_splitLocalSubrange2(ppcImlGenContext_t* ppcImlGenConte
for (sint32 i = 0; i < subrange->list_fixedRegRequirements.size(); i++)
{
raFixedRegRequirement* fixedReg = subrange->list_fixedRegRequirements.data() + i;
if (!headInterval.ContainsInstructionIndex(fixedReg->pos.GetInstructionIndex()))
if (!headInterval.ContainsEdge(fixedReg->pos))
{
subrange->list_fixedRegRequirements.resize(i);
break;

View File

@ -335,6 +335,9 @@ struct raLivenessRange
void SetPhysicalRegister(sint32 physicalRegister);
void SetPhysicalRegisterForCluster(sint32 physicalRegister);
void UnsetPhysicalRegister() { physicalRegister = -1; }
private:
void GetAllowedRegistersExRecursive(raLivenessRange* range, uint32 iterationIndex, IMLPhysRegisterSet& allowedRegs);
};
raLivenessRange* PPCRecRA_createSubrange2(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment, IMLRegID virtualRegister, IMLName name, raInstructionEdge startPosition, raInstructionEdge endPosition);

View File

@ -181,9 +181,6 @@ PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PP
}
}
// if(range.startAddress < 0x0202fa3C || range.startAddress > 0x0202FA7C)
// return nullptr; // DEBUG
PPCRecFunction_t* ppcRecFunc = new PPCRecFunction_t();
ppcRecFunc->ppcAddress = range.startAddress;
ppcRecFunc->ppcSize = range.length;
@ -340,15 +337,6 @@ bool PPCRecompiler_ApplyIMLPasses(ppcImlGenContext_t& ppcImlGenContext)
//PPCRecompiler_reorderConditionModifyInstructions(&ppcImlGenContext);
//PPCRecompiler_removeRedundantCRUpdates(&ppcImlGenContext);
// if(ppcImlGenContext.debug_entryPPCAddress >= 0x0240B7F8 && ppcImlGenContext.debug_entryPPCAddress < 0x0240C0AC)
// {
// IMLDebug_Dump(&ppcImlGenContext);
// __debugbreak();
// }
// else if(ppcImlGenContext.debug_entryPPCAddress >= 0x0240B7F8)
// return false;
return true;
}

View File

@ -1513,7 +1513,7 @@ bool PPCRecompilerImlGen_DCBZ(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
ppcImlGenContext->emitInst().make_r_r_r(PPCREC_IML_OP_ADD, regMemResEA, regA, regB);
else
ppcImlGenContext->emitInst().make_r_r(PPCREC_IML_OP_ASSIGN, regMemResEA, regB);
ppcImlGenContext->emitInst().make_r_s32(PPCREC_IML_OP_AND, regMemResEA, ~31);
ppcImlGenContext->emitInst().make_r_r_s32(PPCREC_IML_OP_AND, regMemResEA, regMemResEA, ~31);
// zero out the cacheline
for(sint32 i = 0; i < 32; i += 4)
ppcImlGenContext->emitInst().make_memory_r(regZero, regMemResEA, i, 32, false);

View File

@ -4,6 +4,9 @@
#include "PPCRecompilerIml.h"
#include "Cafe/GameProfile/GameProfile.h"
ATTR_MS_ABI double frsqrte_espresso(double input);
ATTR_MS_ABI double fres_espresso(double input);
IMLReg _GetRegCR(ppcImlGenContext_t* ppcImlGenContext, uint8 crReg, uint8 crBit);
void PPCRecompilerImlGen_generateNewInstruction_fpr_r_memory(ppcImlGenContext_t* ppcImlGenContext, IMLReg registerDestination, IMLReg registerMemory, sint32 immS32, uint32 mode, bool switchEndian, IMLReg registerGQR = IMLREG_INVALID)
@ -1007,9 +1010,12 @@ bool PPCRecompilerImlGen_FRES(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
// load registers
IMLReg fprRegisterB = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frB);
IMLReg fprRegisterD = PPCRecompilerImlGen_loadOverwriteFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frD);
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_BOTTOM_FRES_TO_BOTTOM_AND_TOP, fprRegisterD, fprRegisterB);
ppcImlGenContext->emitInst().make_call_imm((uintptr_t)fres_espresso, fprRegisterB, IMLREG_INVALID, IMLREG_INVALID, fprRegisterD);
// adjust accuracy
PPRecompilerImmGen_optionalRoundBottomFPRToSinglePrecision(ppcImlGenContext, fprRegisterD);
// copy result to top
if( ppcImlGenContext->PSE )
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM_AND_TOP, fprRegisterD, fprRegisterD);
return true;
}
@ -1026,9 +1032,7 @@ bool PPCRecompilerImlGen_FRSP(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
}
PPCRecompilerImlGen_generateNewInstruction_fpr_r(ppcImlGenContext, NULL,PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_BOTTOM, fprRegisterD);
if( ppcImlGenContext->PSE )
{
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM_AND_TOP, fprRegisterD, fprRegisterD);
}
return true;
}
@ -1075,7 +1079,7 @@ bool PPCRecompilerImlGen_FRSQRTE(ppcImlGenContext_t* ppcImlGenContext, uint32 op
// hCPU->fpr[frD].fpr = 1.0 / sqrt(hCPU->fpr[frB].fpr);
IMLReg fprRegisterB = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frB);
IMLReg fprRegisterD = PPCRecompilerImlGen_loadOverwriteFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frD);
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_BOTTOM_RECIPROCAL_SQRT, fprRegisterD, fprRegisterB);
ppcImlGenContext->emitInst().make_call_imm((uintptr_t)frsqrte_espresso, fprRegisterB, IMLREG_INVALID, IMLREG_INVALID, fprRegisterD);
// adjust accuracy
PPRecompilerImmGen_optionalRoundBottomFPRToSinglePrecision(ppcImlGenContext, fprRegisterD);
return true;