[ARM] Implement psq_l for 2x float loads. Couldn't find a game using quantized loads. Huge speed boost to Ikaruga and THP movies with this one.

This commit is contained in:
Ryan Houdek 2013-09-07 17:44:00 +00:00
parent 614a7c2081
commit 31b69c53f7
6 changed files with 129 additions and 46 deletions

View File

@ -215,6 +215,7 @@ if(_M_ARM)
Src/PowerPC/JitArm32/JitArm_LoadStore.cpp
Src/PowerPC/JitArm32/JitArm_FloatingPoint.cpp
Src/PowerPC/JitArm32/JitArm_Paired.cpp
Src/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp
Src/PowerPC/JitArm32/JitArm_SystemRegisters.cpp
Src/PowerPC/JitArm32/JitArm_LoadStoreFloating.cpp)
endif()

View File

@ -218,6 +218,9 @@ public:
void ps_neg(UGeckoInstruction _inst);
void ps_abs(UGeckoInstruction _inst);
void ps_nabs(UGeckoInstruction _inst);
// LoadStore paired
void psq_l(UGeckoInstruction _inst);
};
#endif // _JIT64_H

View File

@ -0,0 +1,59 @@
// Copyright 2013 Dolphin Emulator Project
// Licensed under GPLv2
// Refer to the license.txt file included.
#include "Common.h"
#include "Thunk.h"
#include "../../Core.h"
#include "../PowerPC.h"
#include "../../CoreTiming.h"
#include "../PPCTables.h"
#include "ArmEmitter.h"
#include "Jit.h"
#include "JitRegCache.h"
#include "JitAsm.h"
// Emits ARM code for the paired-single quantized load (psq_l / psq_lu).
//
// Only the two-value form (W == 0) is compiled here. Instructions with W set,
// and any instruction compiled while js.memcheck is set, are handed to
// Default(inst) instead.
//
// The emitted code reads the load type of the selected GQR at run time, calls
// the matching routine from asm_routines.pairedLoadQuantized with the
// effective address in R10, and converts the two values the routine leaves in
// S0/S1 into the destination register pair.
//
// Registers used by the emitted code:
//   R10 - effective address handed to the load routine
//   R11 - GQR value, then the table index / byte offset for the load type
//   R14 - address of the selected load routine
//   (R12 is reserved for the scale once quantized types are implemented.)
void JitArm::psq_l(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITLoadStorePairedOff)

	bool update = inst.OPCD == 57;
	s32 offset = inst.SIMM_12;

	if (js.memcheck) { Default(inst); return; }

	if (inst.W) {
		// Enable when supporting single loads
		Default(inst);
		return;
	}

	LDR(R11, R9, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
	// UBFX takes the field position counted from the least significant bit.
	// The PowerPC manuals number GQR bits from the most significant bit
	// (load type = bits 13..15, load scale = bits 2..7), which corresponds to
	// bits 16..18 and 24..29 here.
	//UBFX(R12, R11, 24, 6); // Scale
	UBFX(R11, R11, 16, 3); // Type

	// Effective address: sign-extended 12-bit displacement, plus rA if rA != 0.
	MOVI2R(R10, (u32)offset);
	if (inst.RA)
		ADD(R10, R10, gpr.R(inst.RA));

	// Update form writes the effective address back to rA.
	if (update)
		MOV(gpr.R(inst.RA), R10);

	// Entries 8..15 of the table are the single-value variants. This is
	// currently unreachable because of the early-out above; it is kept so the
	// index is already right once single loads are enabled.
	if (inst.W)
		ADD(R11, R11, 8);

	// The table holds 32-bit code pointers, so turn the entry index into a
	// byte offset (index * 4) before adding it to the table base.
	ADD(R11, R11, R11);
	ADD(R11, R11, R11);

	MOVI2R(R14, (u32)asm_routines.pairedLoadQuantized);
	ADD(R14, R14, R11);
	LDR(R14, R14);

	// Values returned in S0, S1
	BL(R14); // Jump to the quantizer Load

	ARMReg vD0 = fpr.R0(inst.RS, false);
	ARMReg vD1 = fpr.R1(inst.RS, false);
	VCVT(vD0, S0, 0);
	VCVT(vD1, S1, 0);
}

View File

@ -39,9 +39,6 @@ void JitArm::mtspr(UGeckoInstruction inst)
case SPR_LR:
case SPR_CTR:
case SPR_XER:
// These are safe to do the easy way, see the bottom of this function.
break;
case SPR_GQR0:
case SPR_GQR0 + 1:
case SPR_GQR0 + 2:
@ -50,19 +47,9 @@ void JitArm::mtspr(UGeckoInstruction inst)
case SPR_GQR0 + 5:
case SPR_GQR0 + 6:
case SPR_GQR0 + 7:
// Prevent recompiler from compiling in old quantizer values.
// If the value changed, destroy all blocks using this quantizer
// This will create a little bit of block churn, but hopefully not too bad.
{
/*
MOV(32, R(EAX), M(&PowerPC::ppcState.spr[iIndex])); // Load old value
CMP(32, R(EAX), gpr.R(inst.RD));
FixupBranch skip_destroy = J_CC(CC_E, false);
int gqr = iIndex - SPR_GQR0;
ABI_CallFunctionC(ProtectFunction(&Jit64::DestroyBlocksWithFlag, 1), (u32)BLOCK_USE_GQR0 << gqr);
SetJumpTarget(skip_destroy);*/
}
// TODO - break block if quantizers are written to.
// These are safe to do the easy way, see the bottom of this function.
break;
default:
Default(inst);
return;

View File

@ -107,7 +107,7 @@ static GekkoOPTemplate primarytable[] =
{54, &JitArm::Default}, //"stfd", OPTYPE_STOREFP, FL_IN_A}},
{55, &JitArm::Default}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
{56, &JitArm::Default}, //"psq_l", OPTYPE_PS, FL_IN_A}},
{56, &JitArm::psq_l}, //"psq_l", OPTYPE_PS, FL_IN_A}},
{57, &JitArm::Default}, //"psq_lu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
{60, &JitArm::Default}, //"psq_st", OPTYPE_PS, FL_IN_A}},
{61, &JitArm::Default}, //"psq_stu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},

View File

@ -137,40 +137,73 @@ void JitArmAsmRoutineManager::Generate()
ADD(_SP, _SP, 4);
POP(9, R4, R5, R6, R7, R8, R9, R10, R11, _PC); // Returns
GenerateCommon();
FlushIcache();
}
// Emits the shared paired-load helper routines into the code buffer and
// builds the pairedLoadQuantized dispatch table that points at them.
//
// Only the "two floats" loader has a real implementation; every other entry
// point is a single BKPT placeholder with its own immediate.
//
// NOTE(review): in this rendering the declaration of loadPairedIllegal (and
// its BKPT) appears inside the commented-out region at the top of the body,
// although the table below refers to it. This looks like old and new diff
// lines being interleaved - confirm against the actual file.
void JitArmAsmRoutineManager::GenerateCommon()
{
/* fifoDirectWrite8 = AlignCode4();
GenFifoWrite(8);
fifoDirectWrite16 = AlignCode4();
GenFifoWrite(16);
fifoDirectWrite32 = AlignCode4();
GenFifoWrite(32);
fifoDirectWriteFloat = AlignCode4();
GenFifoFloatWrite();
fifoDirectWriteXmm64 = AlignCode4();
GenFifoXmm64Write();
const u8* loadPairedIllegal = GetCodePtr();
BKPT(0x10);
GenQuantizedLoads();
GenQuantizedStores();
GenQuantizedSingleStores();
*/
//CMPSD(R(XMM0), M(&zero),
// TODO
// --- Two-float loader: expects the address in R10, leaves results in S0/S1.
const u8* loadPairedFloatTwo = GetCodePtr();
// Save R12 and the return address; the matching POP below loads the saved
// return address into PC, which returns to the caller.
PUSH(2, R12, _LR);
// R12, R14 is scratch
// R10 is the address
// Mask the address with MEMVIEW32_MASK and add Memory::base to form the
// pointer that is actually dereferenced.
MOVI2R(R14, Memory::MEMVIEW32_MASK);
AND(R10, R10, R14);
MOVI2R(R14, (u32)Memory::base);
ADD(R10, R10, R14);
// First word: load, byte-reverse with REV, move the raw bits into S0.
LDR(R12, R10);
REV(R12, R12);
VMOV(S0, R12);
// Second word, 4 bytes further on, handled the same way into S1.
LDR(R12, R10, 4);
REV(R12, R12);
VMOV(S1, R12);
POP(2, R12, _PC);
// --- Placeholder entry points: each one is just a breakpoint instruction.
const u8* loadPairedFloatOne = GetCodePtr();
BKPT(0x12);
const u8* loadPairedU8Two = GetCodePtr();
BKPT(0x13);
const u8* loadPairedU8One = GetCodePtr();
BKPT(0x14);
const u8* loadPairedS8Two = GetCodePtr();
BKPT(0x15);
const u8* loadPairedS8One = GetCodePtr();
BKPT(0x16);
const u8* loadPairedU16Two = GetCodePtr();
BKPT(0x17);
const u8* loadPairedU16One = GetCodePtr();
BKPT(0x18);
const u8* loadPairedS16Two = GetCodePtr();
BKPT(0x19);
const u8* loadPairedS16One = GetCodePtr();
BKPT(0x20);
// --- Dispatch table: 16 pointer-sized slots placed in the code buffer at a
// 16-byte aligned position. Slots 0..7 hold the "Two" variants and slots
// 8..15 the "One" variants; within each half the order is float, three
// illegal slots, U8, U16, S8, S16.
pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(16 * sizeof(u8*));
pairedLoadQuantized[0] = loadPairedFloatTwo;
pairedLoadQuantized[1] = loadPairedIllegal;
pairedLoadQuantized[2] = loadPairedIllegal;
pairedLoadQuantized[3] = loadPairedIllegal;
pairedLoadQuantized[4] = loadPairedU8Two;
pairedLoadQuantized[5] = loadPairedU16Two;
pairedLoadQuantized[6] = loadPairedS8Two;
pairedLoadQuantized[7] = loadPairedS16Two;
pairedLoadQuantized[8] = loadPairedFloatOne;
pairedLoadQuantized[9] = loadPairedIllegal;
pairedLoadQuantized[10] = loadPairedIllegal;
pairedLoadQuantized[11] = loadPairedIllegal;
pairedLoadQuantized[12] = loadPairedU8One;
pairedLoadQuantized[13] = loadPairedU16One;
pairedLoadQuantized[14] = loadPairedS8One;
pairedLoadQuantized[15] = loadPairedS16One;
// Fast write routines - special case the most common hardware write
// TODO: use this.
// Even in x86, the param values will be in the right registers.
/*
const u8 *fastMemWrite8 = AlignCode16();
CMP(32, R(ABI_PARAM2), Imm32(0xCC008000));
FixupBranch skip_fast_write = J_CC(CC_NE, false);
MOV(32, EAX, M(&m_gatherPipeCount));
MOV(8, MDisp(EAX, (u32)&m_gatherPipe), ABI_PARAM1);
ADD(32, 1, M(&m_gatherPipeCount));
RET();
SetJumpTarget(skip_fast_write);
CALL((void *)&Memory::Write_U8);*/
}