From 3c49200b22029f82c160950043e5122a63c19640 Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 18 Sep 2014 03:57:13 -0700 Subject: [PATCH 1/3] X64Emitter: add MOVHLPS/MOVLHPS --- Source/Core/Common/x64Emitter.cpp | 5 +++++ Source/Core/Common/x64Emitter.h | 3 +++ 2 files changed, 8 insertions(+) diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp index 75cd418379..3437598101 100644 --- a/Source/Core/Common/x64Emitter.cpp +++ b/Source/Core/Common/x64Emitter.cpp @@ -64,6 +64,8 @@ enum NormalSSEOps sseMOVLPDtoRM = 0x13, sseMOVHPDfromRM= 0x16, sseMOVHPDtoRM = 0x17, + sseMOVHLPS = 0x12, + sseMOVLHPS = 0x16, sseMASKMOVDQU = 0xF7, sseLDDQU = 0xF0, sseSHUF = 0xC6, @@ -1526,6 +1528,9 @@ void XEmitter::MOVHPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVHPDfromRM, void XEmitter::MOVLPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVLPDtoRM, false, regOp, arg);} void XEmitter::MOVHPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVHPDtoRM, false, regOp, arg);} +void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(32, sseMOVHLPS, true, regOp1, R(regOp2));} +void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(32, sseMOVLHPS, true, regOp1, R(regOp2));} + void XEmitter::CVTPS2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5A, true, regOp, arg);} void XEmitter::CVTPD2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5A, true, regOp, arg);} diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h index 8b655c2c42..72042ff45e 100644 --- a/Source/Core/Common/x64Emitter.h +++ b/Source/Core/Common/x64Emitter.h @@ -583,6 +583,9 @@ public: void MOVLPD(OpArg arg, X64Reg regOp); void MOVHPD(OpArg arg, X64Reg regOp); + void MOVHLPS(X64Reg regOp1, X64Reg regOp2); + void MOVLHPS(X64Reg regOp1, X64Reg regOp2); + void MOVD_xmm(X64Reg dest, const OpArg &arg); void MOVQ_xmm(X64Reg dest, OpArg arg); void MOVD_xmm(const OpArg &arg, X64Reg src); From 1b425dedd1264965a6f7145c158a5727279493a8 Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 18 Sep 2014 03:57:24 -0700 Subject: [PATCH 2/3] JIT: add ps_cmp0/ps_cmp1 implementations using current fcmp code --- Source/Core/Core/PowerPC/Jit64/Jit.h | 2 + .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 8 ++-- .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 44 +++++++++++++------ Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 9 ++++ 4 files changed, 45 insertions(+), 18 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 0391d258cc..6faf302422 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -142,6 +142,7 @@ public: typedef u32 (*Operation)(u32 a, u32 b); void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false); void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); + void FloatCompare(UGeckoInstruction inst, bool upper = false); // OPCODES void unknown_instruction(UGeckoInstruction _inst); @@ -199,6 +200,7 @@ public: void ps_maddXX(UGeckoInstruction inst); void ps_sum(UGeckoInstruction inst); void ps_muls(UGeckoInstruction inst); + void ps_cmpXX(UGeckoInstruction inst); void fp_arith(UGeckoInstruction inst); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index 0007915089..4437c67e47 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -110,14 +110,14 @@ static GekkoOPTemplate primarytable[] = static GekkoOPTemplate table4[] = { //SUBOP10 - {0, &Jit64::FallBackToInterpreter}, //"ps_cmpu0", OPTYPE_PS, FL_SET_CRn}}, - {32, &Jit64::FallBackToInterpreter}, //"ps_cmpo0", OPTYPE_PS, FL_SET_CRn}}, + {0, &Jit64::ps_cmpXX}, //"ps_cmpu0", OPTYPE_PS, FL_SET_CRn}}, + {32, &Jit64::ps_cmpXX}, //"ps_cmpo0", OPTYPE_PS, FL_SET_CRn}}, {40, &Jit64::ps_sign}, //"ps_neg", OPTYPE_PS, FL_RC_BIT}}, {136, &Jit64::ps_sign}, //"ps_nabs", OPTYPE_PS, FL_RC_BIT}}, {264, &Jit64::ps_sign}, //"ps_abs", OPTYPE_PS, FL_RC_BIT}}, - {64, &Jit64::FallBackToInterpreter}, //"ps_cmpu1", OPTYPE_PS, FL_RC_BIT}}, + {64, &Jit64::ps_cmpXX}, //"ps_cmpu1", OPTYPE_PS, FL_RC_BIT}}, {72, &Jit64::ps_mr}, //"ps_mr", OPTYPE_PS, FL_RC_BIT}}, - {96, &Jit64::FallBackToInterpreter}, //"ps_cmpo1", OPTYPE_PS, FL_RC_BIT}}, + {96, &Jit64::ps_cmpXX}, //"ps_cmpo1", OPTYPE_PS, FL_RC_BIT}}, {528, &Jit64::ps_mergeXX}, //"ps_merge00", OPTYPE_PS, FL_RC_BIT}}, {560, &Jit64::ps_mergeXX}, //"ps_merge01", OPTYPE_PS, FL_RC_BIT}}, {592, &Jit64::ps_mergeXX}, //"ps_merge10", OPTYPE_PS, FL_RC_BIT}}, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index a20dd17e89..25bc67a578 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -267,25 +267,32 @@ void Jit64::fmrx(UGeckoInstruction inst) fpr.UnlockAll(); } -void Jit64::fcmpx(UGeckoInstruction inst) +void Jit64::FloatCompare(UGeckoInstruction inst, bool upper) { - INSTRUCTION_START - JITDISABLE(bJITFloatingPointOff); - FALLBACK_IF(jo.fpAccurateFcmp); - - //bool ordered = inst.SUBOP10 == 32; - int a = inst.FA; - int b = inst.FB; - int crf = inst.CRFD; bool fprf = SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableFPRF && js.op->wantsFPRF; + //bool ordered = !!(inst.SUBOP10 & 32); + int a = inst.FA; + int b = inst.FB; + int crf = inst.CRFD; - fpr.Lock(a,b); - fpr.BindToRegister(b, true); + fpr.Lock(a, b); + fpr.BindToRegister(b, true, false); if (fprf) AND(32, PPCSTATE(fpscr), Imm32(~FPRF_MASK)); - // Are we masking sNaN invalid floating point exceptions? If not this could crash if we don't handle the exception? - UCOMISD(fpr.R(b).GetSimpleReg(), fpr.R(a)); + + if (upper) + { + fpr.BindToRegister(a, true, false); + MOVHLPS(XMM0, fpr.RX(a)); + MOVHLPS(XMM1, fpr.RX(b)); + UCOMISD(XMM1, R(XMM0)); + } + else + { + // Are we masking sNaN invalid floating point exceptions? If not this could crash if we don't handle the exception? + UCOMISD(fpr.RX(b), fpr.R(a)); + } FixupBranch pNaN, pLesser, pGreater; FixupBranch continue1, continue2, continue3; @@ -293,7 +300,7 @@ void Jit64::fcmpx(UGeckoInstruction inst) if (a != b) { // if B > A, goto Lesser's jump target - pLesser = J_CC(CC_A); + pLesser = J_CC(CC_A); } // if (B != B) or (A != A), goto NaN's jump target @@ -344,6 +351,15 @@ void Jit64::fcmpx(UGeckoInstruction inst) fpr.UnlockAll(); } +void Jit64::fcmpx(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITFloatingPointOff); + FALLBACK_IF(jo.fpAccurateFcmp); + + FloatCompare(inst); +} + void Jit64::fctiwx(UGeckoInstruction inst) { INSTRUCTION_START diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index f08fb6b863..f8e4adfec4 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -351,3 +351,12 @@ void Jit64::ps_maddXX(UGeckoInstruction inst) MOVAPD(fpr.RX(d), R(XMM0)); fpr.UnlockAll(); } + +void Jit64::ps_cmpXX(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITFloatingPointOff); + FALLBACK_IF(jo.fpAccurateFcmp); + + FloatCompare(inst, !!(inst.SUBOP10 & 64)); +} From af8772700aa95b31adf280c165cc827b6d59640a Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 18 Sep 2014 12:30:05 -0700 Subject: [PATCH 3/3] JIT: add ps_res and ps_rsqrte using scalar implementations --- Source/Core/Core/PowerPC/Jit64/Jit.h | 2 + .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 4 +- Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 53 +++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 6faf302422..704a8cf943 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -198,6 +198,8 @@ public: void ps_arith(UGeckoInstruction inst); //aggregate void ps_mergeXX(UGeckoInstruction inst); void ps_maddXX(UGeckoInstruction inst); + void ps_res(UGeckoInstruction inst); + void ps_rsqrte(UGeckoInstruction inst); void ps_sum(UGeckoInstruction inst); void ps_muls(UGeckoInstruction inst); void ps_cmpXX(UGeckoInstruction inst); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index 4437c67e47..3aede22215 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -138,9 +138,9 @@ static GekkoOPTemplate table4_2[] = {20, &Jit64::ps_arith}, //"ps_sub", OPTYPE_PS, 0}}, {21, &Jit64::ps_arith}, //"ps_add", OPTYPE_PS, 0}}, {23, &Jit64::ps_sel}, //"ps_sel", OPTYPE_PS, 0}}, - {24, &Jit64::FallBackToInterpreter}, //"ps_res", OPTYPE_PS, 0}}, + {24, &Jit64::ps_res}, //"ps_res", OPTYPE_PS, 0}}, {25, &Jit64::ps_arith}, //"ps_mul", OPTYPE_PS, 0}}, - {26, &Jit64::FallBackToInterpreter}, //"ps_rsqrte", OPTYPE_PS, 0, 1}}, + {26, &Jit64::ps_rsqrte}, //"ps_rsqrte", OPTYPE_PS, 0, 1}}, {28, &Jit64::ps_maddXX}, //"ps_msub", OPTYPE_PS, 0}}, {29, &Jit64::ps_maddXX}, //"ps_madd", OPTYPE_PS, 0}}, {30, &Jit64::ps_maddXX}, //"ps_nmsub", OPTYPE_PS, 0}}, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index f8e4adfec4..b246968424 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -282,6 +282,59 @@ void Jit64::ps_mergeXX(UGeckoInstruction inst) fpr.UnlockAll(); } +void Jit64::ps_rsqrte(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITFloatingPointOff); + FALLBACK_IF(inst.Rc); + int b = inst.FB; + int d = inst.FD; + + gpr.FlushLockX(RSCRATCH_EXTRA); + fpr.Lock(b, d); + fpr.BindToRegister(b, true, false); + fpr.BindToRegister(d, false); + + MOVSD(XMM0, fpr.R(b)); + CALL((void *)asm_routines.frsqrte); + MOVSD(fpr.R(d), XMM0); + + MOVHLPS(XMM0, fpr.RX(b)); + CALL((void *)asm_routines.frsqrte); + MOVLHPS(fpr.RX(d), XMM0); + + ForceSinglePrecisionP(fpr.RX(d)); + SetFPRFIfNeeded(inst, fpr.RX(d)); + fpr.UnlockAll(); + gpr.UnlockAllX(); +} + +void Jit64::ps_res(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITFloatingPointOff); + FALLBACK_IF(inst.Rc); + int b = inst.FB; + int d = inst.FD; + + gpr.FlushLockX(RSCRATCH_EXTRA); + fpr.Lock(b, d); + fpr.BindToRegister(b, true, false); + fpr.BindToRegister(d, false); + + MOVSD(XMM0, fpr.R(b)); + CALL((void *)asm_routines.fres); + MOVSD(fpr.R(d), XMM0); + + MOVHLPS(XMM0, fpr.RX(b)); + CALL((void *)asm_routines.fres); + MOVLHPS(fpr.RX(d), XMM0); + + ForceSinglePrecisionP(fpr.RX(d)); + SetFPRFIfNeeded(inst, fpr.RX(d)); + fpr.UnlockAll(); + gpr.UnlockAllX(); +} //TODO: add optimized cases void Jit64::ps_maddXX(UGeckoInstruction inst)