From 4f18f6078fdb097a927319ae7b12a0dacf4b42f5 Mon Sep 17 00:00:00 2001
From: Fiora
Date: Mon, 18 Aug 2014 19:58:00 -0700
Subject: [PATCH] JIT64: Fix fmul rounding issues

Thanks to magumagu's softfp experiments, we know a lot more about the Wii's
strange floating-point unit than we used to. In particular, when doing a
single-precision floating-point multiply (fmulsx), it rounds the right-hand
side's mantissa so as to lose the low 28 bits (of the 53-bit mantissa).

Emulating this behavior in Dolphin fixes a bunch of issues with games that
require extremely precise emulation of floating-point hardware, especially
game replays. Fortunately, we can do this at very little CPU cost: just ~5
extra instructions per multiply, instead of the vast overhead of a
pure-software float implementation.

This doesn't make floating-point behavior perfect. I still suspect fmadd
rounding isn't quite right, since the Wii uses fused instructions and Dolphin
doesn't, and NaN/infinity/exception handling is probably off in various
ways... but it's definitely much better than before.

This appears to fix replays in Mario Kart Wii, Mario Kart Double Dash, and
Super Smash Brothers Brawl. I wouldn't be surprised if it fixes a bunch of
other things too.

The changes to instructions other than fmulsx may not be strictly necessary,
but I included them for completeness: it feels wrong to fix some instructions
and not others, and games we didn't test might rely on them.
---
 Source/Core/Core/PowerPC/Jit64/Jit.h          |  4 +-
 .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp  | 35 ++++++++++---
 Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 51 +++++++++++++++----
 .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp  | 19 +++++++
 Source/Core/Core/PowerPC/JitCommon/Jit_Util.h |  1 +
 5 files changed, 90 insertions(+), 20 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index 95fbcd1f7b..97aa522d22 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -117,10 +117,10 @@ public:
   // is set or not.
   Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
 
-  void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
+  void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS = false);
   typedef u32 (*Operation)(u32 a, u32 b);
   void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
-  void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
+  void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS = false);
 
   // OPCODES
   void unknown_instruction(UGeckoInstruction _inst);
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index 7761c636d2..6be6680708 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -14,10 +14,28 @@ static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x800000
 static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
 static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
 
-void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg))
+void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS)
 {
   fpr.Lock(d, a, b);
-  if (d == a)
+  if (roundRHS)
+  {
+    if (d == a)
+    {
+      fpr.BindToRegister(d, true);
+      MOVSD(XMM0, fpr.R(b));
+      Force25BitPrecision(XMM0, XMM1);
+      (this->*op)(fpr.RX(d), R(XMM0));
+    }
+    else
+    {
+      fpr.BindToRegister(d, d == b);
+      if (d != b)
+        MOVSD(fpr.RX(d), fpr.R(b));
+      Force25BitPrecision(fpr.RX(d), XMM0);
+      (this->*op)(fpr.RX(d), fpr.R(a));
+    }
+  }
+  else if (d == a)
   {
     fpr.BindToRegister(d, true);
     if (!single)
@@ -88,7 +106,7 @@ void Jit64::fp_arith(UGeckoInstruction inst)
   case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD); break; //div
   case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD); break; //sub
   case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD); break; //add
-  case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD); break; //mul
+  case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, single); break; //mul
   default:
     _assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
   }
@@ -111,24 +129,25 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
   int d = inst.FD;
 
   fpr.Lock(a, b, c, d);
-  MOVSD(XMM0, fpr.R(a));
+  MOVSD(XMM0, fpr.R(c));
+  Force25BitPrecision(XMM0, XMM1);
   switch (inst.SUBOP5)
   {
   case 28: //msub
-    MULSD(XMM0, fpr.R(c));
+    MULSD(XMM0, fpr.R(a));
     SUBSD(XMM0, fpr.R(b));
     break;
   case 29: //madd
-    MULSD(XMM0, fpr.R(c));
+    MULSD(XMM0, fpr.R(a));
     ADDSD(XMM0, fpr.R(b));
     break;
   case 30: //nmsub
-    MULSD(XMM0, fpr.R(c));
+    MULSD(XMM0, fpr.R(a));
     SUBSD(XMM0, fpr.R(b));
     PXOR(XMM0, M((void*)&psSignBits2));
     break;
   case 31: //nmadd
-    MULSD(XMM0, fpr.R(c));
+    MULSD(XMM0, fpr.R(a));
     ADDSD(XMM0, fpr.R(b));
     PXOR(XMM0, M((void*)&psSignBits2));
     break;
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
index 6934f56d42..b2fc5e4150 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
@@ -113,11 +113,29 @@ add a,b,a
 */
 
 //There's still a little bit more optimization that can be squeezed out of this
-void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg))
+void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), bool roundRHS)
 {
   fpr.Lock(d, a, b);
-  if (d == a)
+  if (roundRHS)
+  {
+    if (d == a)
+    {
+      fpr.BindToRegister(d, true);
+      MOVAPD(XMM0, fpr.R(b));
+      Force25BitPrecision(XMM0, XMM1);
+      (this->*op)(fpr.RX(d), R(XMM0));
+    }
+    else
+    {
+      fpr.BindToRegister(d, d == b);
+      if (d != b)
+        MOVAPD(fpr.RX(d), fpr.R(b));
+      Force25BitPrecision(fpr.RX(d), XMM0);
+      (this->*op)(fpr.RX(d), fpr.R(a));
+    }
+  }
+  else if (d == a)
   {
     fpr.BindToRegister(d, true);
     (this->*op)(fpr.RX(d), fpr.R(b));
@@ -166,7 +184,7 @@ void Jit64::ps_arith(UGeckoInstruction inst)
     tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD);
     break;
   case 25: // mul
-    tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD);
+    tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD, true);
     break;
   default:
     _assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
@@ -230,15 +248,17 @@ void Jit64::ps_muls(UGeckoInstruction inst)
   case 12:
     // Single multiply scalar high
     // TODO - faster version for when regs are different
-    MOVAPD(XMM0, fpr.R(a));
     MOVDDUP(XMM1, fpr.R(c));
+    Force25BitPrecision(XMM1, XMM0);
+    MOVAPD(XMM0, fpr.R(a));
     MULPD(XMM0, R(XMM1));
     MOVAPD(fpr.R(d), XMM0);
     break;
   case 13:
     // TODO - faster version for when regs are different
-    MOVAPD(XMM0, fpr.R(a));
     MOVAPD(XMM1, fpr.R(c));
+    Force25BitPrecision(XMM1, XMM0);
+    MOVAPD(XMM0, fpr.R(a));
     SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
     MULPD(XMM0, R(XMM1));
     MOVAPD(fpr.R(d), XMM0);
@@ -300,35 +320,46 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
   int d = inst.FD;
   fpr.Lock(a,b,c,d);
-  MOVAPD(XMM0, fpr.R(a));
   switch (inst.SUBOP5)
   {
   case 14: //madds0
     MOVDDUP(XMM1, fpr.R(c));
+    Force25BitPrecision(XMM1, XMM0);
+    MOVAPD(XMM0, fpr.R(a));
     MULPD(XMM0, R(XMM1));
     ADDPD(XMM0, fpr.R(b));
     break;
   case 15: //madds1
     MOVAPD(XMM1, fpr.R(c));
     SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
+    Force25BitPrecision(XMM1, XMM0);
+    MOVAPD(XMM0, fpr.R(a));
     MULPD(XMM0, R(XMM1));
     ADDPD(XMM0, fpr.R(b));
     break;
   case 28: //msub
-    MULPD(XMM0, fpr.R(c));
+    MOVAPD(XMM0, fpr.R(c));
+    Force25BitPrecision(XMM0, XMM1);
+    MULPD(XMM0, fpr.R(a));
     SUBPD(XMM0, fpr.R(b));
     break;
   case 29: //madd
-    MULPD(XMM0, fpr.R(c));
+    MOVAPD(XMM0, fpr.R(c));
+    Force25BitPrecision(XMM0, XMM1);
+    MULPD(XMM0, fpr.R(a));
     ADDPD(XMM0, fpr.R(b));
     break;
   case 30: //nmsub
-    MULPD(XMM0, fpr.R(c));
+    MOVAPD(XMM0, fpr.R(c));
+    Force25BitPrecision(XMM0, XMM1);
+    MULPD(XMM0, fpr.R(a));
     SUBPD(XMM0, fpr.R(b));
     PXOR(XMM0, M((void*)&psSignBits));
     break;
   case 31: //nmadd
-    MULPD(XMM0, fpr.R(c));
+    MOVAPD(XMM0, fpr.R(c));
+    Force25BitPrecision(XMM0, XMM1);
+    MULPD(XMM0, fpr.R(a));
     ADDPD(XMM0, fpr.R(b));
     PXOR(XMM0, M((void*)&psSignBits));
     break;
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index 2734e02715..e5fae2058c 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -519,6 +519,25 @@ void EmuCodeBlock::ForceSinglePrecisionP(X64Reg xmm)
   }
 }
 
+static const u64 GC_ALIGNED16(psMantissaTruncate[2]) = {0xFFFFFFFFF8000000ULL, 0xFFFFFFFFF8000000ULL};
+static const u64 GC_ALIGNED16(psRoundBit[2]) = {0x8000000, 0x8000000};
+
+// Emulate the odd truncation/rounding that the PowerPC does on the RHS operand before
+// a single precision multiply. To be precise, it drops the low 28 bits of the mantissa,
+// rounding to nearest as it does.
+// It needs a temp, so let the caller pass that in.
+void EmuCodeBlock::Force25BitPrecision(X64Reg xmm, X64Reg tmp)
+{
+  if (jit->jo.accurateSinglePrecision)
+  {
+    // mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1);
+    MOVAPD(tmp, R(xmm));
+    PAND(xmm, M((void*)&psMantissaTruncate));
+    PAND(tmp, M((void*)&psRoundBit));
+    PADDQ(xmm, R(tmp));
+  }
+}
+
 static u32 GC_ALIGNED16(temp32);
 static u64 GC_ALIGNED16(temp64);
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
index af68493571..22bd922d30 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@@ -56,6 +56,7 @@ public:
 
   void ForceSinglePrecisionS(Gen::X64Reg xmm);
   void ForceSinglePrecisionP(Gen::X64Reg xmm);
+  void Force25BitPrecision(Gen::X64Reg xmm, Gen::X64Reg tmp);
 
   // EAX might get trashed
   void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false);
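
Note (not part of the patch): the rounding rule that Force25BitPrecision applies
can also be written out in plain C++, which is handy for checking the JIT against
an interpreter or a softfp reference. The sketch below is illustrative only; the
helper name Force25BitPrecisionSoft is made up here, but the arithmetic follows
the same formula quoted in the JIT comment, applied to the raw bits of the double
so that a round-up can carry out of the mantissa into the exponent (which is the
correctly rounded result for normal values).

    #include <cstdint>
    #include <cstring>

    // Illustrative software equivalent of the JIT's Force25BitPrecision:
    // round a double's 53-bit mantissa to 25 bits by clearing the low 28 bits
    // and adding the dropped guard bit (bit 27) shifted up by one.
    static double Force25BitPrecisionSoft(double value)
    {
        uint64_t bits;
        std::memcpy(&bits, &value, sizeof(bits));
        bits = (bits & ~0xFFFFFFFULL) + ((bits & (1ULL << 27)) << 1);
        std::memcpy(&value, &bits, sizeof(value));
        return value;
    }

Multiplying with the right-hand operand rounded this way should reproduce the
fmulsx behavior described in the commit message, NaN/infinity handling aside.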