mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-02-01 02:21:26 +02:00
[ARM] Implement psq_l for 2x float loads. Couldn't find a game using quantized loads. Huge speed boost to Ikaruga and THP movies with this one.
This commit is contained in:
parent
614a7c2081
commit
31b69c53f7
@ -215,6 +215,7 @@ if(_M_ARM)
|
||||
Src/PowerPC/JitArm32/JitArm_LoadStore.cpp
|
||||
Src/PowerPC/JitArm32/JitArm_FloatingPoint.cpp
|
||||
Src/PowerPC/JitArm32/JitArm_Paired.cpp
|
||||
Src/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp
|
||||
Src/PowerPC/JitArm32/JitArm_SystemRegisters.cpp
|
||||
Src/PowerPC/JitArm32/JitArm_LoadStoreFloating.cpp)
|
||||
endif()
|
||||
|
@ -218,6 +218,9 @@ public:
|
||||
void ps_neg(UGeckoInstruction _inst);
|
||||
void ps_abs(UGeckoInstruction _inst);
|
||||
void ps_nabs(UGeckoInstruction _inst);
|
||||
|
||||
// LoadStore paired
|
||||
void psq_l(UGeckoInstruction _inst);
|
||||
};
|
||||
|
||||
#endif // _JIT64_H
|
||||
|
@ -0,0 +1,59 @@
|
||||
// Copyright 2013 Dolphin Emulator Project
|
||||
// Licensed under GPLv2
|
||||
// Refer to the license.txt file included.
|
||||
#include "Common.h"
|
||||
#include "Thunk.h"
|
||||
|
||||
#include "../../Core.h"
|
||||
#include "../PowerPC.h"
|
||||
#include "../../CoreTiming.h"
|
||||
#include "../PPCTables.h"
|
||||
#include "ArmEmitter.h"
|
||||
|
||||
#include "Jit.h"
|
||||
#include "JitRegCache.h"
|
||||
#include "JitAsm.h"
|
||||
|
||||
// Emits ARM code for the PowerPC psq_l instruction (paired-single quantized load).
//
// Only the paired form (W == 0) without memory checks is compiled here; every
// other case is handed to Default(inst). The emitted code
//   1. loads the GQR selected by inst.I and extracts its load-type field,
//   2. builds the effective address (SIMM_12 displacement, plus rA when the
//      rA field is non-zero) in R10,
//   3. fetches the loader for that type from asm_routines.pairedLoadQuantized
//      and calls it; the loader hands the two values back in S0 and S1,
//   4. writes S0/S1 into the destination paired register through VCVT.
void JitArm::psq_l(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITLoadStorePairedOff)

	const bool update = inst.OPCD == 57;
	const s32 offset = inst.SIMM_12;

	// Registers used by the emitted code:
	//   R10 - effective address, consumed by the loader
	//   R11 - load type read from the GQR
	//   R12 - reserved for the scale once quantized loaders are implemented
	//   R14 - address of the loader being called

	if (js.memcheck)
	{
		Default(inst);
		return;
	}

	if (inst.W)
	{
		// Single-value loads are not compiled yet. When they are enabled, the
		// matching loaders sit 8 entries further into pairedLoadQuantized.
		Default(inst);
		return;
	}

	LDR(R11, R9, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
	// GQR layout: store type in bits 0-2, store scale in bits 8-13,
	// load type in bits 16-18, load scale in bits 24-29. A load has to use
	// the load fields in the upper half of the register.
	//UBFX(R12, R11, 24, 6); // Scale
	UBFX(R11, R11, 16, 3); // Type

	MOVI2R(R10, (u32)offset);
	if (inst.RA)
		ADD(R10, R10, gpr.R(inst.RA));
	if (update)
		MOV(gpr.R(inst.RA), R10);

	// pairedLoadQuantized is a table of code pointers, so the type has to be
	// scaled by the pointer size (4 bytes, hence LSL #2) before it is added
	// to the table base.
	MOVI2R(R14, (u32)asm_routines.pairedLoadQuantized);
	ADD(R14, R14, Operand2(R11, ST_LSL, 2));
	LDR(R14, R14);

	// Values returned in S0, S1
	BL(R14); // Jump to the quantizer Load

	ARMReg vD0 = fpr.R0(inst.RS, false);
	ARMReg vD1 = fpr.R1(inst.RS, false);
	VCVT(vD0, S0, 0);
	VCVT(vD1, S1, 0);
}
|
@ -39,9 +39,6 @@ void JitArm::mtspr(UGeckoInstruction inst)
|
||||
case SPR_LR:
|
||||
case SPR_CTR:
|
||||
case SPR_XER:
|
||||
// These are safe to do the easy way, see the bottom of this function.
|
||||
break;
|
||||
|
||||
case SPR_GQR0:
|
||||
case SPR_GQR0 + 1:
|
||||
case SPR_GQR0 + 2:
|
||||
@ -50,19 +47,9 @@ void JitArm::mtspr(UGeckoInstruction inst)
|
||||
case SPR_GQR0 + 5:
|
||||
case SPR_GQR0 + 6:
|
||||
case SPR_GQR0 + 7:
|
||||
// Prevent recompiler from compiling in old quantizer values.
|
||||
// If the value changed, destroy all blocks using this quantizer
|
||||
// This will create a little bit of block churn, but hopefully not too bad.
|
||||
{
|
||||
/*
|
||||
MOV(32, R(EAX), M(&PowerPC::ppcState.spr[iIndex])); // Load old value
|
||||
CMP(32, R(EAX), gpr.R(inst.RD));
|
||||
FixupBranch skip_destroy = J_CC(CC_E, false);
|
||||
int gqr = iIndex - SPR_GQR0;
|
||||
ABI_CallFunctionC(ProtectFunction(&Jit64::DestroyBlocksWithFlag, 1), (u32)BLOCK_USE_GQR0 << gqr);
|
||||
SetJumpTarget(skip_destroy);*/
|
||||
}
|
||||
// TODO - break block if quantizers are written to.
|
||||
// These are safe to do the easy way, see the bottom of this function.
|
||||
break;
|
||||
|
||||
default:
|
||||
Default(inst);
|
||||
return;
|
||||
|
@ -107,7 +107,7 @@ static GekkoOPTemplate primarytable[] =
|
||||
{54, &JitArm::Default}, //"stfd", OPTYPE_STOREFP, FL_IN_A}},
|
||||
{55, &JitArm::Default}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
|
||||
|
||||
{56, &JitArm::Default}, //"psq_l", OPTYPE_PS, FL_IN_A}},
|
||||
{56, &JitArm::psq_l}, //"psq_l", OPTYPE_PS, FL_IN_A}},
|
||||
{57, &JitArm::Default}, //"psq_lu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
|
||||
{60, &JitArm::Default}, //"psq_st", OPTYPE_PS, FL_IN_A}},
|
||||
{61, &JitArm::Default}, //"psq_stu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
|
||||
|
@ -137,40 +137,73 @@ void JitArmAsmRoutineManager::Generate()
|
||||
ADD(_SP, _SP, 4);
|
||||
|
||||
POP(9, R4, R5, R6, R7, R8, R9, R10, R11, _PC); // Returns
|
||||
|
||||
GenerateCommon();
|
||||
|
||||
FlushIcache();
|
||||
}
|
||||
|
||||
// Emits the common assembly routines and fills the pairedLoadQuantized
// dispatch table with 16 code pointers.
//
// Table layout, as assigned at the end of this function:
//   entries 0-7  : two-value loaders ("...Two")
//   entries 8-15 : one-value loaders ("...One")
//   within each half: +0 float, +1..+3 loadPairedIllegal,
//   +4 U8, +5 U16, +6 S8, +7 S16.
// Only loadPairedFloatTwo has a real body; every other loader is a single
// BKPT with its own immediate.
//
// NOTE(review): in this view the declaration of loadPairedIllegal sits inside
// the block comment opened by the first statement, yet the table assignments
// below reference it. This looks like removed and added diff lines rendered
// together; confirm against the actual file.
void JitArmAsmRoutineManager::GenerateCommon()
|
||||
{
|
||||
/* fifoDirectWrite8 = AlignCode4();
|
||||
GenFifoWrite(8);
|
||||
fifoDirectWrite16 = AlignCode4();
|
||||
GenFifoWrite(16);
|
||||
fifoDirectWrite32 = AlignCode4();
|
||||
GenFifoWrite(32);
|
||||
fifoDirectWriteFloat = AlignCode4();
|
||||
GenFifoFloatWrite();
|
||||
fifoDirectWriteXmm64 = AlignCode4();
|
||||
GenFifoXmm64Write();
|
||||
const u8* loadPairedIllegal = GetCodePtr();
|
||||
BKPT(0x10);
|
||||
|
||||
GenQuantizedLoads();
|
||||
GenQuantizedStores();
|
||||
GenQuantizedSingleStores();
|
||||
*/
|
||||
//CMPSD(R(XMM0), M(&zero),
|
||||
// TODO
|
||||
// Two-float loader: takes the address in R10 and leaves the two loaded
// 32-bit words in S0 and S1.
const u8* loadPairedFloatTwo = GetCodePtr();
|
||||
// Save R12 and the return address; the matching POP at the end returns.
PUSH(2, R12, _LR);
|
||||
// R12, R14 is scratch
|
||||
// R10 is the address
|
||||
// Mask the address with MEMVIEW32_MASK, then add Memory::base to it.
MOVI2R(R14, Memory::MEMVIEW32_MASK);
|
||||
AND(R10, R10, R14);
|
||||
MOVI2R(R14, (u32)Memory::base);
|
||||
ADD(R10, R10, R14);
|
||||
|
||||
// First word: load, REV (byte-reverse), move into S0.
LDR(R12, R10);
|
||||
REV(R12, R12);
|
||||
VMOV(S0, R12);
|
||||
|
||||
// Second word at offset 4: same sequence, result in S1.
LDR(R12, R10, 4);
|
||||
REV(R12, R12);
|
||||
VMOV(S1, R12);
|
||||
|
||||
// Restore R12 and return by popping the saved LR into PC.
POP(2, R12, _PC);
|
||||
// Placeholder loaders: each one is only a BKPT.
const u8* loadPairedFloatOne = GetCodePtr();
|
||||
BKPT(0x12);
|
||||
const u8* loadPairedU8Two = GetCodePtr();
|
||||
BKPT(0x13);
|
||||
const u8* loadPairedU8One = GetCodePtr();
|
||||
BKPT(0x14);
|
||||
const u8* loadPairedS8Two = GetCodePtr();
|
||||
BKPT(0x15);
|
||||
const u8* loadPairedS8One = GetCodePtr();
|
||||
BKPT(0x16);
|
||||
const u8* loadPairedU16Two = GetCodePtr();
|
||||
BKPT(0x17);
|
||||
const u8* loadPairedU16One = GetCodePtr();
|
||||
BKPT(0x18);
|
||||
const u8* loadPairedS16Two = GetCodePtr();
|
||||
BKPT(0x19);
|
||||
const u8* loadPairedS16One = GetCodePtr();
|
||||
BKPT(0x20);
|
||||
|
||||
// The dispatch table is placed in the code buffer itself: take a 16-aligned
// code pointer and reserve room for 16 pointers there.
pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
|
||||
ReserveCodeSpace(16 * sizeof(u8*));
|
||||
|
||||
// Entries 0-7: two-value loaders.
pairedLoadQuantized[0] = loadPairedFloatTwo;
|
||||
pairedLoadQuantized[1] = loadPairedIllegal;
|
||||
pairedLoadQuantized[2] = loadPairedIllegal;
|
||||
pairedLoadQuantized[3] = loadPairedIllegal;
|
||||
pairedLoadQuantized[4] = loadPairedU8Two;
|
||||
pairedLoadQuantized[5] = loadPairedU16Two;
|
||||
pairedLoadQuantized[6] = loadPairedS8Two;
|
||||
pairedLoadQuantized[7] = loadPairedS16Two;
|
||||
|
||||
// Entries 8-15: one-value loaders, in the same type order.
pairedLoadQuantized[8] = loadPairedFloatOne;
|
||||
pairedLoadQuantized[9] = loadPairedIllegal;
|
||||
pairedLoadQuantized[10] = loadPairedIllegal;
|
||||
pairedLoadQuantized[11] = loadPairedIllegal;
|
||||
pairedLoadQuantized[12] = loadPairedU8One;
|
||||
pairedLoadQuantized[13] = loadPairedU16One;
|
||||
pairedLoadQuantized[14] = loadPairedS8One;
|
||||
pairedLoadQuantized[15] = loadPairedS16One;
|
||||
|
||||
// Fast write routines - special case the most common hardware write
|
||||
// TODO: use this.
|
||||
// Even in x86, the param values will be in the right registers.
|
||||
/*
|
||||
const u8 *fastMemWrite8 = AlignCode16();
|
||||
CMP(32, R(ABI_PARAM2), Imm32(0xCC008000));
|
||||
FixupBranch skip_fast_write = J_CC(CC_NE, false);
|
||||
MOV(32, EAX, M(&m_gatherPipeCount));
|
||||
MOV(8, MDisp(EAX, (u32)&m_gatherPipe), ABI_PARAM1);
|
||||
ADD(32, 1, M(&m_gatherPipeCount));
|
||||
RET();
|
||||
SetJumpTarget(skip_fast_write);
|
||||
CALL((void *)&Memory::Write_U8);*/
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user