mirror of https://github.com/cemu-project/Cemu.git
PPCRec: Fixes and optimizations + rework FRES/FRSQRTE
This commit is contained in:
parent
002a03df3d
commit
608757dbeb
|
@ -32,7 +32,7 @@ espresso_frsqrte_entry_t frsqrteLookupTable[32] =
|
|||
{0x20c1000, 0x35e},{0x1f12000, 0x332},{0x1d79000, 0x30a},{0x1bf4000, 0x2e6},
|
||||
};
|
||||
|
||||
double frsqrte_espresso(double input)
|
||||
ATTR_MS_ABI double frsqrte_espresso(double input)
|
||||
{
|
||||
unsigned long long x = *(unsigned long long*)&input;
|
||||
|
||||
|
@ -111,7 +111,7 @@ espresso_fres_entry_t fresLookupTable[32] =
|
|||
{0x88400, 0x11a}, {0x65000, 0x11a}, {0x41c00, 0x108}, {0x20c00, 0x106}
|
||||
};
|
||||
|
||||
double fres_espresso(double input)
|
||||
ATTR_MS_ABI double fres_espresso(double input)
|
||||
{
|
||||
// based on testing we know that fres uses only the first 15 bits of the mantissa
|
||||
// seee eeee eeee mmmm mmmm mmmm mmmx xxxx .... (s = sign, e = exponent, m = mantissa, x = not used)
|
||||
|
|
|
@ -191,8 +191,8 @@ inline double roundTo25BitAccuracy(double d)
|
|||
return *(double*)&v;
|
||||
}
|
||||
|
||||
double fres_espresso(double input);
|
||||
double frsqrte_espresso(double input);
|
||||
ATTR_MS_ABI double fres_espresso(double input);
|
||||
ATTR_MS_ABI double frsqrte_espresso(double input);
|
||||
|
||||
void fcmpu_espresso(PPCInterpreter_t* hCPU, int crfD, double a, double b);
|
||||
|
||||
|
|
|
@ -601,8 +601,10 @@ void PPCRecompilerX64Gen_imlInstruction_atomic_cmp_store(PPCRecFunction_t* PPCRe
|
|||
void PPCRecompilerX64Gen_imlInstruction_call_imm(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction)
|
||||
{
|
||||
// the register allocator takes care of spilling volatile registers and moving parameters to the right registers, so we don't need to do any special handling here
|
||||
x64GenContext->emitter->SUB_qi8(X86_REG_RSP, 0x28); // reserve enough space for any parameters while keeping stack alignment of 16 intact
|
||||
x64GenContext->emitter->MOV_qi64(X86_REG_RAX, imlInstruction->op_call_imm.callAddress);
|
||||
x64GenContext->emitter->CALL_q(X86_REG_RAX);
|
||||
x64GenContext->emitter->ADD_qi8(X86_REG_RSP, 0x28);
|
||||
}
|
||||
|
||||
bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction)
|
||||
|
|
|
@ -780,18 +780,6 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r(PPCRecFunction_t* PPCRecFunction
|
|||
// move to FPR register
|
||||
x64Gen_movq_xmmReg_reg64(x64GenContext, regR, REG_RESV_TEMP);
|
||||
}
|
||||
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_BOTTOM_FRES_TO_BOTTOM_AND_TOP )
|
||||
{
|
||||
// move register to XMM15
|
||||
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regA);
|
||||
|
||||
// call assembly routine to calculate accurate FRES result in XMM15
|
||||
x64Gen_mov_reg64_imm64(x64GenContext, REG_RESV_TEMP, (uint64)recompiler_fres);
|
||||
x64Gen_call_reg64(x64GenContext, REG_RESV_TEMP);
|
||||
|
||||
// copy result to bottom and top half of result register
|
||||
x64Gen_movddup_xmmReg_xmmReg(x64GenContext, regR, REG_RESV_FPR_TEMP);
|
||||
}
|
||||
else if (imlInstruction->operation == PPCREC_IML_OP_FPR_BOTTOM_RECIPROCAL_SQRT)
|
||||
{
|
||||
// move register to XMM15
|
||||
|
|
|
@ -363,7 +363,6 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
|
|||
operation == PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM_AND_TOP ||
|
||||
operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_AND_TOP_SWAPPED ||
|
||||
operation == PPCREC_IML_OP_ASSIGN ||
|
||||
operation == PPCREC_IML_OP_FPR_BOTTOM_FRES_TO_BOTTOM_AND_TOP ||
|
||||
operation == PPCREC_IML_OP_FPR_NEGATE_PAIR ||
|
||||
operation == PPCREC_IML_OP_FPR_ABS_PAIR ||
|
||||
operation == PPCREC_IML_OP_FPR_FRES_PAIR ||
|
||||
|
|
|
@ -143,7 +143,6 @@ enum
|
|||
PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM, // leave top of destination untouched
|
||||
PPCREC_IML_OP_FPR_COPY_BOTTOM_AND_TOP_SWAPPED,
|
||||
PPCREC_IML_OP_FPR_EXPAND_BOTTOM32_TO_BOTTOM64_AND_TOP64, // expand bottom f32 to f64 in bottom and top half
|
||||
PPCREC_IML_OP_FPR_BOTTOM_FRES_TO_BOTTOM_AND_TOP, // calculate reciprocal with Espresso accuracy of source bottom half and write result to destination bottom and top half
|
||||
PPCREC_IML_OP_FPR_FCMPO_BOTTOM, // deprecated
|
||||
PPCREC_IML_OP_FPR_FCMPU_BOTTOM, // deprecated
|
||||
PPCREC_IML_OP_FPR_FCMPU_TOP, // deprecated
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -17,9 +17,19 @@ public:
|
|||
m_regBitmask &= ~((uint64)1 << index);
|
||||
}
|
||||
|
||||
void SetAllAvailable()
|
||||
{
|
||||
m_regBitmask = ~0ull;
|
||||
}
|
||||
|
||||
bool HasAllAvailable() const
|
||||
{
|
||||
return m_regBitmask == ~0ull;
|
||||
}
|
||||
|
||||
bool IsAvailable(uint32 index) const
|
||||
{
|
||||
return (m_regBitmask & (1 << index)) != 0;
|
||||
return (m_regBitmask & ((uint64)1 << index)) != 0;
|
||||
}
|
||||
|
||||
IMLPhysRegisterSet& operator&=(const IMLPhysRegisterSet& other)
|
||||
|
|
|
@ -67,38 +67,30 @@ boost::container::small_vector<raLivenessRange*, 128> raLivenessRange::GetAllSub
|
|||
return subranges;
|
||||
}
|
||||
|
||||
void raLivenessRange::GetAllowedRegistersExRecursive(raLivenessRange* range, uint32 iterationIndex, IMLPhysRegisterSet& allowedRegs)
|
||||
{
|
||||
range->lastIterationIndex = iterationIndex;
|
||||
for (auto& it : range->list_fixedRegRequirements)
|
||||
allowedRegs &= it.allowedReg;
|
||||
// check successors
|
||||
if (range->subrangeBranchTaken && range->subrangeBranchTaken->lastIterationIndex != iterationIndex)
|
||||
GetAllowedRegistersExRecursive(range->subrangeBranchTaken, iterationIndex, allowedRegs);
|
||||
if (range->subrangeBranchNotTaken && range->subrangeBranchNotTaken->lastIterationIndex != iterationIndex)
|
||||
GetAllowedRegistersExRecursive(range->subrangeBranchNotTaken, iterationIndex, allowedRegs);
|
||||
// check predecessors
|
||||
for (auto& prev : range->previousRanges)
|
||||
{
|
||||
if (prev->lastIterationIndex != iterationIndex)
|
||||
GetAllowedRegistersExRecursive(prev, iterationIndex, allowedRegs);
|
||||
}
|
||||
};
|
||||
|
||||
bool raLivenessRange::GetAllowedRegistersEx(IMLPhysRegisterSet& allowedRegisters)
|
||||
{
|
||||
if(interval2.ExtendsPreviousSegment() || interval2.ExtendsIntoNextSegment())
|
||||
{
|
||||
auto clusterRanges = GetAllSubrangesInCluster();
|
||||
bool hasAnyRequirement = false;
|
||||
for(auto& subrange : clusterRanges)
|
||||
{
|
||||
if(subrange->list_fixedRegRequirements.empty())
|
||||
continue;
|
||||
allowedRegisters = subrange->list_fixedRegRequirements.front().allowedReg;
|
||||
hasAnyRequirement = true;
|
||||
break;
|
||||
}
|
||||
if(!hasAnyRequirement)
|
||||
return false;
|
||||
for(auto& subrange : clusterRanges)
|
||||
{
|
||||
for(auto& fixedRegLoc : subrange->list_fixedRegRequirements)
|
||||
allowedRegisters &= fixedRegLoc.allowedReg;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// local check only, slightly faster
|
||||
if(list_fixedRegRequirements.empty())
|
||||
return false;
|
||||
allowedRegisters = list_fixedRegRequirements.front().allowedReg;
|
||||
for(auto& fixedRegLoc : list_fixedRegRequirements)
|
||||
allowedRegisters &= fixedRegLoc.allowedReg;
|
||||
}
|
||||
return true;
|
||||
uint32 iterationIndex = PPCRecRA_getNextIterationIndex();
|
||||
allowedRegisters.SetAllAvailable();
|
||||
GetAllowedRegistersExRecursive(this, iterationIndex, allowedRegisters);
|
||||
return !allowedRegisters.HasAllAvailable();
|
||||
}
|
||||
|
||||
IMLPhysRegisterSet raLivenessRange::GetAllowedRegisters(IMLPhysRegisterSet regPool)
|
||||
|
@ -424,6 +416,14 @@ void PPCRecRA_debugValidateSubrange(raLivenessRange* range)
|
|||
cemu_assert_debug(range->list_locations.front().index >= range->interval2.start.GetInstructionIndexEx());
|
||||
cemu_assert_debug(range->list_locations.back().index <= range->interval2.end.GetInstructionIndexEx());
|
||||
}
|
||||
// validate fixed reg requirements
|
||||
if (!range->list_fixedRegRequirements.empty())
|
||||
{
|
||||
cemu_assert_debug(range->list_fixedRegRequirements.front().pos >= range->interval2.start);
|
||||
cemu_assert_debug(range->list_fixedRegRequirements.back().pos <= range->interval2.end);
|
||||
for(sint32 i = 0; i < (sint32)range->list_fixedRegRequirements.size()-1; i++)
|
||||
cemu_assert_debug(range->list_fixedRegRequirements[i].pos < range->list_fixedRegRequirements[i+1].pos);
|
||||
}
|
||||
|
||||
}
|
||||
#else
|
||||
|
@ -563,7 +563,7 @@ raLivenessRange* PPCRecRA_splitLocalSubrange2(ppcImlGenContext_t* ppcImlGenConte
|
|||
for (sint32 i = 0; i < subrange->list_fixedRegRequirements.size(); i++)
|
||||
{
|
||||
raFixedRegRequirement* fixedReg = subrange->list_fixedRegRequirements.data() + i;
|
||||
if (tailInterval.ContainsInstructionIndex(fixedReg->pos.GetInstructionIndex()))
|
||||
if (tailInterval.ContainsEdge(fixedReg->pos))
|
||||
{
|
||||
tailSubrange->list_fixedRegRequirements.push_back(*fixedReg);
|
||||
}
|
||||
|
@ -572,7 +572,7 @@ raLivenessRange* PPCRecRA_splitLocalSubrange2(ppcImlGenContext_t* ppcImlGenConte
|
|||
for (sint32 i = 0; i < subrange->list_fixedRegRequirements.size(); i++)
|
||||
{
|
||||
raFixedRegRequirement* fixedReg = subrange->list_fixedRegRequirements.data() + i;
|
||||
if (!headInterval.ContainsInstructionIndex(fixedReg->pos.GetInstructionIndex()))
|
||||
if (!headInterval.ContainsEdge(fixedReg->pos))
|
||||
{
|
||||
subrange->list_fixedRegRequirements.resize(i);
|
||||
break;
|
||||
|
|
|
@ -335,6 +335,9 @@ struct raLivenessRange
|
|||
void SetPhysicalRegister(sint32 physicalRegister);
|
||||
void SetPhysicalRegisterForCluster(sint32 physicalRegister);
|
||||
void UnsetPhysicalRegister() { physicalRegister = -1; }
|
||||
|
||||
private:
|
||||
void GetAllowedRegistersExRecursive(raLivenessRange* range, uint32 iterationIndex, IMLPhysRegisterSet& allowedRegs);
|
||||
};
|
||||
|
||||
raLivenessRange* PPCRecRA_createSubrange2(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment, IMLRegID virtualRegister, IMLName name, raInstructionEdge startPosition, raInstructionEdge endPosition);
|
||||
|
|
|
@ -181,9 +181,6 @@ PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PP
|
|||
}
|
||||
}
|
||||
|
||||
// if(range.startAddress < 0x0202fa3C || range.startAddress > 0x0202FA7C)
|
||||
// return nullptr; // DEBUG
|
||||
|
||||
PPCRecFunction_t* ppcRecFunc = new PPCRecFunction_t();
|
||||
ppcRecFunc->ppcAddress = range.startAddress;
|
||||
ppcRecFunc->ppcSize = range.length;
|
||||
|
@ -340,15 +337,6 @@ bool PPCRecompiler_ApplyIMLPasses(ppcImlGenContext_t& ppcImlGenContext)
|
|||
//PPCRecompiler_reorderConditionModifyInstructions(&ppcImlGenContext);
|
||||
//PPCRecompiler_removeRedundantCRUpdates(&ppcImlGenContext);
|
||||
|
||||
|
||||
// if(ppcImlGenContext.debug_entryPPCAddress >= 0x0240B7F8 && ppcImlGenContext.debug_entryPPCAddress < 0x0240C0AC)
|
||||
// {
|
||||
// IMLDebug_Dump(&ppcImlGenContext);
|
||||
// __debugbreak();
|
||||
// }
|
||||
// else if(ppcImlGenContext.debug_entryPPCAddress >= 0x0240B7F8)
|
||||
// return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -1513,7 +1513,7 @@ bool PPCRecompilerImlGen_DCBZ(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
|
|||
ppcImlGenContext->emitInst().make_r_r_r(PPCREC_IML_OP_ADD, regMemResEA, regA, regB);
|
||||
else
|
||||
ppcImlGenContext->emitInst().make_r_r(PPCREC_IML_OP_ASSIGN, regMemResEA, regB);
|
||||
ppcImlGenContext->emitInst().make_r_s32(PPCREC_IML_OP_AND, regMemResEA, ~31);
|
||||
ppcImlGenContext->emitInst().make_r_r_s32(PPCREC_IML_OP_AND, regMemResEA, regMemResEA, ~31);
|
||||
// zero out the cacheline
|
||||
for(sint32 i = 0; i < 32; i += 4)
|
||||
ppcImlGenContext->emitInst().make_memory_r(regZero, regMemResEA, i, 32, false);
|
||||
|
|
|
@ -4,6 +4,9 @@
|
|||
#include "PPCRecompilerIml.h"
|
||||
#include "Cafe/GameProfile/GameProfile.h"
|
||||
|
||||
ATTR_MS_ABI double frsqrte_espresso(double input);
|
||||
ATTR_MS_ABI double fres_espresso(double input);
|
||||
|
||||
IMLReg _GetRegCR(ppcImlGenContext_t* ppcImlGenContext, uint8 crReg, uint8 crBit);
|
||||
|
||||
void PPCRecompilerImlGen_generateNewInstruction_fpr_r_memory(ppcImlGenContext_t* ppcImlGenContext, IMLReg registerDestination, IMLReg registerMemory, sint32 immS32, uint32 mode, bool switchEndian, IMLReg registerGQR = IMLREG_INVALID)
|
||||
|
@ -1007,9 +1010,12 @@ bool PPCRecompilerImlGen_FRES(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
|
|||
// load registers
|
||||
IMLReg fprRegisterB = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frB);
|
||||
IMLReg fprRegisterD = PPCRecompilerImlGen_loadOverwriteFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frD);
|
||||
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_BOTTOM_FRES_TO_BOTTOM_AND_TOP, fprRegisterD, fprRegisterB);
|
||||
ppcImlGenContext->emitInst().make_call_imm((uintptr_t)fres_espresso, fprRegisterB, IMLREG_INVALID, IMLREG_INVALID, fprRegisterD);
|
||||
// adjust accuracy
|
||||
PPRecompilerImmGen_optionalRoundBottomFPRToSinglePrecision(ppcImlGenContext, fprRegisterD);
|
||||
// copy result to top
|
||||
if( ppcImlGenContext->PSE )
|
||||
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM_AND_TOP, fprRegisterD, fprRegisterD);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -1026,9 +1032,7 @@ bool PPCRecompilerImlGen_FRSP(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
|
|||
}
|
||||
PPCRecompilerImlGen_generateNewInstruction_fpr_r(ppcImlGenContext, NULL,PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_BOTTOM, fprRegisterD);
|
||||
if( ppcImlGenContext->PSE )
|
||||
{
|
||||
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM_AND_TOP, fprRegisterD, fprRegisterD);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -1075,7 +1079,7 @@ bool PPCRecompilerImlGen_FRSQRTE(ppcImlGenContext_t* ppcImlGenContext, uint32 op
|
|||
// hCPU->fpr[frD].fpr = 1.0 / sqrt(hCPU->fpr[frB].fpr);
|
||||
IMLReg fprRegisterB = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frB);
|
||||
IMLReg fprRegisterD = PPCRecompilerImlGen_loadOverwriteFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frD);
|
||||
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_BOTTOM_RECIPROCAL_SQRT, fprRegisterD, fprRegisterB);
|
||||
ppcImlGenContext->emitInst().make_call_imm((uintptr_t)frsqrte_espresso, fprRegisterB, IMLREG_INVALID, IMLREG_INVALID, fprRegisterD);
|
||||
// adjust accuracy
|
||||
PPRecompilerImmGen_optionalRoundBottomFPRToSinglePrecision(ppcImlGenContext, fprRegisterD);
|
||||
return true;
|
||||
|
|
Loading…
Reference in New Issue