From 4f18f6078fdb097a927319ae7b12a0dacf4b42f5 Mon Sep 17 00:00:00 2001
From: Fiora
Date: Mon, 18 Aug 2014 19:58:00 -0700
Subject: [PATCH] JIT64: Fix fmul rounding issues

Thanks to magumagu's softfp experiments, we know a lot more about the Wii's
strange floating-point unit than we used to. In particular, when doing a
single-precision floating-point multiply (fmulsx), it rounds the right-hand
side's mantissa so as to lose the low 28 bits (of the 53-bit mantissa).

Emulating this behavior in Dolphin fixes a bunch of issues with games that
require extremely precise emulation of floating-point hardware, especially
game replays. Fortunately, we can do this at very little CPU cost: just ~5
extra instructions per multiply, instead of the vast overhead of a
pure-software float implementation.

This doesn't make floating-point behavior perfect. I still suspect fmadd
rounding isn't quite right, since the Wii uses fused instructions and Dolphin
doesn't, and NaN/infinity/exception handling is probably off in various
ways... but it's definitely much better than before.

This appears to fix replays in Mario Kart Wii, Mario Kart Double Dash, and
Super Smash Brothers Brawl. I wouldn't be surprised if it fixes a bunch of
other things too.

The changes to instructions other than fmulsx may not be strictly necessary,
but I included them for completeness: it feels wrong to fix some instructions
and not others, and games we didn't test might rely on them.
---
 Source/Core/Core/PowerPC/Jit64/Jit.h          |  4 +-
 .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp  | 35 ++++++++++---
 Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 51 +++++++++++++++----
 .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp  | 19 +++++++
 Source/Core/Core/PowerPC/JitCommon/Jit_Util.h |  1 +
 5 files changed, 90 insertions(+), 20 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index 95fbcd1f7b..97aa522d22 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -117,10 +117,10 @@ public:
   // is set or not.
   Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
 
-  void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
+  void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS = false);
   typedef u32 (*Operation)(u32 a, u32 b);
   void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
-  void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
+  void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS = false);
 
   // OPCODES
   void unknown_instruction(UGeckoInstruction _inst);
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index 7761c636d2..6be6680708 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -14,10 +14,28 @@ static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x800000
 static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
 static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
 
-void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg))
+void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS)
 {
   fpr.Lock(d, a, b);
-  if (d == a)
+  if (roundRHS)
+  {
+    if (d == a)
+    {
+      fpr.BindToRegister(d, true);
+      MOVSD(XMM0, fpr.R(b));
+      Force25BitPrecision(XMM0, XMM1);
+      (this->*op)(fpr.RX(d), R(XMM0));
+    }
+    else
+    {
+      fpr.BindToRegister(d, d == b);
+      if (d != b)
+        MOVSD(fpr.RX(d), fpr.R(b));
+      Force25BitPrecision(fpr.RX(d), XMM0);
+      (this->*op)(fpr.RX(d), fpr.R(a));
+    }
+  }
+  else if (d == a)
   {
     fpr.BindToRegister(d, true);
     if (!single)
@@ -88,7 +106,7 @@ void Jit64::fp_arith(UGeckoInstruction inst)
   case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD); break; //div
   case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD); break; //sub
   case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD); break; //add
-  case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD); break; //mul
+  case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, single); break; //mul
   default:
     _assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
   }
@@ -111,24 +129,25 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
   int d = inst.FD;
 
   fpr.Lock(a, b, c, d);
-  MOVSD(XMM0, fpr.R(a));
+  MOVSD(XMM0, fpr.R(c));
+  Force25BitPrecision(XMM0, XMM1);
   switch (inst.SUBOP5)
   {
   case 28: //msub
-    MULSD(XMM0, fpr.R(c));
+    MULSD(XMM0, fpr.R(a));
     SUBSD(XMM0, fpr.R(b));
     break;
   case 29: //madd
-    MULSD(XMM0, fpr.R(c));
+    MULSD(XMM0, fpr.R(a));
     ADDSD(XMM0, fpr.R(b));
     break;
   case 30: //nmsub
-    MULSD(XMM0, fpr.R(c));
+    MULSD(XMM0, fpr.R(a));
     SUBSD(XMM0, fpr.R(b));
     PXOR(XMM0, M((void*)&psSignBits2));
     break;
   case 31: //nmadd
-    MULSD(XMM0, fpr.R(c));
+    MULSD(XMM0, fpr.R(a));
     ADDSD(XMM0, fpr.R(b));
     PXOR(XMM0, M((void*)&psSignBits2));
     break;
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
index 6934f56d42..b2fc5e4150 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
@@ -113,11 +113,29 @@ add a,b,a
 */
 
 //There's still a little bit more optimization that can be squeezed out of this
-void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg))
+void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), bool roundRHS)
 {
   fpr.Lock(d, a, b);
-  if (d == a)
+  if (roundRHS)
+  {
+    if (d == a)
+    {
+      fpr.BindToRegister(d, true);
+      MOVAPD(XMM0, fpr.R(b));
+      Force25BitPrecision(XMM0, XMM1);
+      (this->*op)(fpr.RX(d), R(XMM0));
+    }
+    else
+    {
+      fpr.BindToRegister(d, d == b);
+      if (d != b)
+        MOVAPD(fpr.RX(d), fpr.R(b));
+      Force25BitPrecision(fpr.RX(d), XMM0);
+      (this->*op)(fpr.RX(d), fpr.R(a));
+    }
+  }
+  else if (d == a)
   {
     fpr.BindToRegister(d, true);
     (this->*op)(fpr.RX(d), fpr.R(b));
@@ -166,7 +184,7 @@ void Jit64::ps_arith(UGeckoInstruction inst)
     tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD);
     break;
   case 25: // mul
-    tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD);
+    tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD, true);
     break;
   default:
     _assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
@@ -230,15 +248,17 @@ void Jit64::ps_muls(UGeckoInstruction inst)
   case 12:
     // Single multiply scalar high
     // TODO - faster version for when regs are different
-    MOVAPD(XMM0, fpr.R(a));
     MOVDDUP(XMM1, fpr.R(c));
+    Force25BitPrecision(XMM1, XMM0);
+    MOVAPD(XMM0, fpr.R(a));
     MULPD(XMM0, R(XMM1));
     MOVAPD(fpr.R(d), XMM0);
     break;
   case 13:
     // TODO - faster version for when regs are different
-    MOVAPD(XMM0, fpr.R(a));
     MOVAPD(XMM1, fpr.R(c));
+    Force25BitPrecision(XMM1, XMM0);
+    MOVAPD(XMM0, fpr.R(a));
     SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
     MULPD(XMM0, R(XMM1));
     MOVAPD(fpr.R(d), XMM0);
@@ -300,35 +320,46 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
   int d = inst.FD;
   fpr.Lock(a,b,c,d);
-  MOVAPD(XMM0, fpr.R(a));
   switch (inst.SUBOP5)
   {
   case 14: //madds0
     MOVDDUP(XMM1, fpr.R(c));
+    Force25BitPrecision(XMM1, XMM0);
+    MOVAPD(XMM0, fpr.R(a));
     MULPD(XMM0, R(XMM1));
     ADDPD(XMM0, fpr.R(b));
     break;
   case 15: //madds1
     MOVAPD(XMM1, fpr.R(c));
     SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
+    Force25BitPrecision(XMM1, XMM0);
+    MOVAPD(XMM0, fpr.R(a));
     MULPD(XMM0, R(XMM1));
     ADDPD(XMM0, fpr.R(b));
     break;
   case 28: //msub
-    MULPD(XMM0, fpr.R(c));
+    MOVAPD(XMM0, fpr.R(c));
+    Force25BitPrecision(XMM0, XMM1);
+    MULPD(XMM0, fpr.R(a));
     SUBPD(XMM0, fpr.R(b));
     break;
   case 29: //madd
-    MULPD(XMM0, fpr.R(c));
+    MOVAPD(XMM0, fpr.R(c));
+    Force25BitPrecision(XMM0, XMM1);
+    MULPD(XMM0, fpr.R(a));
     ADDPD(XMM0, fpr.R(b));
     break;
   case 30: //nmsub
-    MULPD(XMM0, fpr.R(c));
+    MOVAPD(XMM0, fpr.R(c));
+    Force25BitPrecision(XMM0, XMM1);
+    MULPD(XMM0, fpr.R(a));
     SUBPD(XMM0, fpr.R(b));
     PXOR(XMM0, M((void*)&psSignBits));
     break;
   case 31: //nmadd
-    MULPD(XMM0, fpr.R(c));
+    MOVAPD(XMM0, fpr.R(c));
+    Force25BitPrecision(XMM0, XMM1);
+    MULPD(XMM0, fpr.R(a));
     ADDPD(XMM0, fpr.R(b));
     PXOR(XMM0, M((void*)&psSignBits));
     break;
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index 2734e02715..e5fae2058c 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -519,6 +519,25 @@ void EmuCodeBlock::ForceSinglePrecisionP(X64Reg xmm)
   }
 }
 
+static const u64 GC_ALIGNED16(psMantissaTruncate[2]) = {0xFFFFFFFFF8000000ULL, 0xFFFFFFFFF8000000ULL};
+static const u64 GC_ALIGNED16(psRoundBit[2]) = {0x8000000, 0x8000000};
+
+// Emulate the odd truncation/rounding that the PowerPC does on the RHS operand before
+// a single precision multiply. To be precise, it drops the low 28 bits of the mantissa,
+// rounding to nearest as it does.
+// It needs a temp, so let the caller pass that in.
+void EmuCodeBlock::Force25BitPrecision(X64Reg xmm, X64Reg tmp)
+{
+  if (jit->jo.accurateSinglePrecision)
+  {
+    // mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1);
+    MOVAPD(tmp, R(xmm));
+    PAND(xmm, M((void*)&psMantissaTruncate));
+    PAND(tmp, M((void*)&psRoundBit));
+    PADDQ(xmm, R(tmp));
+  }
+}
+
 static u32 GC_ALIGNED16(temp32);
 static u64 GC_ALIGNED16(temp64);
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
index af68493571..22bd922d30 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@@ -56,6 +56,7 @@ public:
 
   void ForceSinglePrecisionS(Gen::X64Reg xmm);
   void ForceSinglePrecisionP(Gen::X64Reg xmm);
+  void Force25BitPrecision(Gen::X64Reg xmm, Gen::X64Reg tmp);
 
   // EAX might get trashed
   void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false);
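
Note (not part of the patch): the rounding rule that Force25BitPrecision applies
can also be written out in plain C++, which is handy for checking the JIT against
an interpreter or a softfp reference. The sketch below is illustrative only; the
helper name Force25BitPrecisionSoft is made up here, but the arithmetic follows
the same formula quoted in the JIT comment, applied to the raw bits of the double
so that a round-up can carry out of the mantissa into the exponent (which is the
correctly rounded result for normal values).

    #include <cstdint>
    #include <cstring>

    // Illustrative software equivalent of the JIT's Force25BitPrecision:
    // round a double's 53-bit mantissa to 25 bits by clearing the low 28 bits
    // and adding the dropped guard bit (bit 27) shifted up by one.
    static double Force25BitPrecisionSoft(double value)
    {
        uint64_t bits;
        std::memcpy(&bits, &value, sizeof(bits));
        bits = (bits & ~0xFFFFFFFULL) + ((bits & (1ULL << 27)) << 1);
        std::memcpy(&value, &bits, sizeof(value));
        return value;
    }

Multiplying with the right-hand operand rounded this way should reproduce the
fmulsx behavior described in the commit message, NaN/infinity handling aside.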