From 11be2314fe8aa9cdf04322b2ac710e9f89e0b8b5 Mon Sep 17 00:00:00 2001
From: JosJuice
Date: Sat, 15 May 2021 15:41:28 +0200
Subject: [PATCH] JitArm64: Fix fmul rounding issues

This is a port of 4f18f60 to JitArm64.
---
 Source/Core/Common/Arm64Emitter.cpp           | 39 ++++++++++-
 Source/Core/Common/Arm64Emitter.h             |  8 +++
 Source/Core/Core/PowerPC/JitArm64/Jit.h       |  2 +
 .../JitArm64/JitArm64_FloatingPoint.cpp       | 61 ++++++++++++++++-
 .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 65 ++++++++++++++-----
 5 files changed, 155 insertions(+), 20 deletions(-)

diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp
index 2aa3c99198..5bc642551d 100644
--- a/Source/Core/Common/Arm64Emitter.cpp
+++ b/Source/Core/Common/Arm64Emitter.cpp
@@ -2294,6 +2294,15 @@ void ARM64FloatEmitter::EmitScalar2Source(bool M, bool S, u32 type, u32 opcode,
           (opcode << 12) | (1 << 11) | (DecodeReg(Rn) << 5) | DecodeReg(Rd));
 }
 
+void ARM64FloatEmitter::EmitScalarThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
+                                            ARM64Reg Rm)
+{
+  ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s only supports double and single registers!", __func__);
+
+  Write32((1 << 30) | (U << 29) | (0b11110001 << 21) | (size << 22) | (DecodeReg(Rm) << 16) |
+          (opcode << 11) | (1 << 10) | (DecodeReg(Rn) << 5) | DecodeReg(Rd));
+}
+
 void ARM64FloatEmitter::EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
                                       ARM64Reg Rm)
 {
@@ -3103,6 +3112,11 @@ void ARM64FloatEmitter::FSQRT(ARM64Reg Rd, ARM64Reg Rn)
 }
 // Scalar - 2 Source
+void ARM64FloatEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  ASSERT_MSG(DYNA_REC, IsDouble(Rd), "%s only supports double registers!", __func__);
+  EmitScalarThreeSame(0, 3, 0b10000, Rd, Rn, Rm);
+}
 void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
 {
   EmitScalar2Source(0, 0, IsDouble(Rd), 2, Rd, Rn, Rm);
 }
@@ -3174,10 +3188,18 @@ void ARM64FloatEmitter::FMOV(ARM64Reg Rd, uint8_t imm8)
 }
 // Vector
+void ARM64FloatEmitter::ADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EmitThreeSame(0, IntLog2(size) - 3, 0b10000, Rd, Rn, Rm);
+}
 void ARM64FloatEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
 {
   EmitThreeSame(0, 0, 3, Rd, Rn, Rm);
 }
+void ARM64FloatEmitter::BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EmitThreeSame(0, 1, 3, Rd, Rn, Rm);
+}
 void ARM64FloatEmitter::BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
 {
   EmitThreeSame(1, 1, 3, Rd, Rn, Rm);
 }
@@ -3285,6 +3307,10 @@ void ARM64FloatEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
 {
   EmitThreeSame(0, 2, 3, Rd, Rn, Rm);
 }
+void ARM64FloatEmitter::ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EmitThreeSame(0, 3, 3, Rd, Rn, Rm);
+}
 void ARM64FloatEmitter::REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn)
 {
   Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 1, Rd, Rn);
 }
@@ -3864,11 +3890,10 @@ void ARM64FloatEmitter::MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift)
   EncodeModImm(Q, op, cmode, 0, Rd, abcdefgh);
 }
 
-void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift)
+void ARM64FloatEmitter::ORR_BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift, u8 op)
 {
   bool Q = IsQuad(Rd);
   u8 cmode = 1;
-  u8 op = 1;
   if (size == 16)
   {
     ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!",
@@ -3904,6 +3929,16 @@
   EncodeModImm(Q, op, cmode, 0, Rd, imm);
 }
 
+void ARM64FloatEmitter::ORR(u8 size, ARM64Reg Rd, u8 imm, u8 shift)
+{
+  ORR_BIC(size, Rd, imm, shift, 0);
+}
+
+void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift)
+{
+  ORR_BIC(size, Rd, imm, shift, 1);
+}
+
 void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp)
 {
   bool bundled_loadstore = false;
diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h
index 58caec8d08..7f878f4c20 100644
--- a/Source/Core/Common/Arm64Emitter.h
+++ b/Source/Core/Common/Arm64Emitter.h
@@ -998,6 +998,7 @@ public:
   void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false);  // Also generalized move between GPR/FP
 
   // Scalar - 2 Source
+  void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
@@ -1018,7 +1019,9 @@ public:
   void FMOV(ARM64Reg Rd, uint8_t imm8);
 
   // Vector
+  void ADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+  void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
   void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
@@ -1041,6 +1044,7 @@ public:
   void FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void NOT(ARM64Reg Rd, ARM64Reg Rn);
   void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+  void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void MOV(ARM64Reg Rd, ARM64Reg Rn) { ORR(Rd, Rn, Rn); }
   void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn);
   void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn);
@@ -1126,6 +1130,7 @@ public:
 
   // Modified Immediate
   void MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift = 0);
+  void ORR(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0);
   void BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0);
 
   void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = ARM64Reg::INVALID_REG,
@@ -1143,6 +1148,7 @@ private:
   void EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
   void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
                          ARM64Reg Rm);
+  void EmitScalarThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
   void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
@@ -1175,6 +1181,8 @@ private:
   void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
   void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh);
 
+  void ORR_BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift, u8 op);
+
   void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
   void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
   void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index 7d7bba6404..d00c2d58e0 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -264,6 +264,8 @@ protected:
                 bool Rc = false);
   void SetFPRFIfNeeded(bool single, Arm64Gen::ARM64Reg reg);
 
+  void Force25BitPrecision(Arm64Gen::ARM64Reg output, Arm64Gen::ARM64Reg input,
+                           Arm64Gen::ARM64Reg temp);
 
   // std::map m_fault_to_handler;
 
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
index 57ad09c1c8..3c5253bf7b 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -31,6 +31,29 @@ void JitArm64::SetFPRFIfNeeded(bool single, ARM64Reg reg)
   gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
 }
 
+// Emulate the odd truncation/rounding that the PowerPC does on the RHS operand before
+// a single precision multiply. To be precise, it drops the low 28 bits of the mantissa,
+// rounding to nearest as it does.
+void JitArm64::Force25BitPrecision(ARM64Reg output, ARM64Reg input, ARM64Reg temp)
+{
+  ASSERT(output != input && output != temp && input != temp);
+
+  // temp = 0x0000'0000'0800'0000ULL
+  // output = 0xFFFF'FFFF'F800'0000ULL
+  m_float_emit.MOVI(32, temp, 0x08, 24);
+  m_float_emit.MOVI(64, output, 0xFFFF'FFFF'0000'0000ULL);
+  m_float_emit.BIC(temp, temp, output);
+  m_float_emit.ORR(32, output, 0xF8, 24);
+
+  // output = (input & 0xFFFF'FFFF'F800'0000) + (input & 0x0000'0000'0800'0000)
+  m_float_emit.AND(temp, input, temp);
+  m_float_emit.AND(output, input, output);
+  if (IsQuad(input))
+    m_float_emit.ADD(64, output, output, temp);
+  else
+    m_float_emit.ADD(output, output, temp);
+}
+
 void JitArm64::fp_arith(UGeckoInstruction inst)
 {
   INSTRUCTION_START
@@ -43,8 +66,11 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
   bool single = inst.OPCD == 59;
   bool packed = inst.OPCD == 4;
 
-  bool use_c = op5 >= 25;  // fmul and all kind of fmaddXX
-  bool use_b = op5 != 25;  // fmul uses no B
+  const bool use_c = op5 >= 25;  // fmul and all kinds of fmaddXX
+  const bool use_b = op5 != 25;  // fmul uses no B
+
+  const bool outputs_are_singles = single || packed;
+  const bool round_c = use_c && outputs_are_singles && !js.op->fprIsSingle[inst.FC];
 
   const auto inputs_are_singles_func = [&] {
     return fpr.IsSingle(a, !packed) && (!use_b || fpr.IsSingle(b, !packed)) &&
@@ -54,6 +80,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
 
   ARM64Reg VA{}, VB{}, VC{}, VD{};
 
+  ARM64Reg V0Q = ARM64Reg::INVALID_REG;
+
   if (packed)
   {
     const RegType type = inputs_are_singles ? RegType::Single : RegType::Register;
@@ -67,6 +95,19 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
     VC = reg_encoder(fpr.R(c, type));
     VD = reg_encoder(fpr.RW(d, type));
 
+    if (round_c)
+    {
+      ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
+
+      V0Q = fpr.GetReg();
+      const ARM64Reg V1Q = fpr.GetReg();
+
+      Force25BitPrecision(reg_encoder(V0Q), VC, reg_encoder(V1Q));
+      VC = reg_encoder(V0Q);
+
+      fpr.Unlock(V1Q);
+    }
+
     switch (op5)
     {
     case 18:
@@ -102,6 +143,19 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
     VC = reg_encoder(fpr.R(c, type));
     VD = reg_encoder(fpr.RW(d, type_out));
 
+    if (round_c)
+    {
+      ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
+
+      V0Q = fpr.GetReg();
+      const ARM64Reg V1Q = fpr.GetReg();
+
+      Force25BitPrecision(reg_encoder(V0Q), VC, reg_encoder(V1Q));
+      VC = reg_encoder(V0Q);
+
+      fpr.Unlock(V1Q);
+    }
+
     switch (op5)
     {
     case 18:
@@ -134,7 +188,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
     }
   }
 
-  const bool outputs_are_singles = single || packed;
+  if (V0Q != ARM64Reg::INVALID_REG)
+    fpr.Unlock(V0Q);
 
   if (outputs_are_singles)
   {
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
index 068a4ed1bb..4f63a7d3aa 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -84,16 +84,35 @@ void JitArm64::ps_mulsX(UGeckoInstruction inst)
   const bool upper = inst.SUBOP5 == 13;
 
   const bool singles = fpr.IsSingle(a) && fpr.IsSingle(c);
+  const bool round_c = !js.op->fprIsSingle[inst.FC];
   const RegType type = singles ? RegType::Single : RegType::Register;
   const u8 size = singles ? 32 : 64;
   const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
 
   const ARM64Reg VA = fpr.R(a, type);
-  const ARM64Reg VC = fpr.R(c, type);
+  ARM64Reg VC = fpr.R(c, type);
   const ARM64Reg VD = fpr.RW(d, type);
 
+  ARM64Reg V0Q = ARM64Reg::INVALID_REG;
+
+  if (round_c)
+  {
+    ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");
+
+    V0Q = fpr.GetReg();
+    const ARM64Reg V1Q = fpr.GetReg();
+
+    Force25BitPrecision(reg_encoder(V0Q), reg_encoder(VC), reg_encoder(V1Q));
+    VC = reg_encoder(V0Q);
+
+    fpr.Unlock(V1Q);
+  }
+
   m_float_emit.FMUL(size, reg_encoder(VD), reg_encoder(VA), reg_encoder(VC), upper ? 1 : 0);
 
+  if (V0Q != ARM64Reg::INVALID_REG)
+    fpr.Unlock(V0Q);
+
   ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(c)),
              "Register allocation turned singles into doubles in the middle of ps_mulsX");
 
@@ -115,31 +134,45 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
   const u32 op5 = inst.SUBOP5;
 
   const bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c);
+  const bool round_c = !js.op->fprIsSingle[inst.FC];
   const RegType type = singles ? RegType::Single : RegType::Register;
   const u8 size = singles ? 32 : 64;
   const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
 
   const ARM64Reg VA = reg_encoder(fpr.R(a, type));
   const ARM64Reg VB = reg_encoder(fpr.R(b, type));
-  const ARM64Reg VC = reg_encoder(fpr.R(c, type));
+  ARM64Reg VC = reg_encoder(fpr.R(c, type));
   const ARM64Reg VD = reg_encoder(fpr.RW(d, type));
+
   ARM64Reg V0Q = ARM64Reg::INVALID_REG;
   ARM64Reg V0 = ARM64Reg::INVALID_REG;
-  if (d != b && (d == a || d == c))
+  ARM64Reg V1Q = ARM64Reg::INVALID_REG;
+
+  if (round_c || (d != b && (d == a || d == c)))
   {
     V0Q = fpr.GetReg();
    V0 = reg_encoder(V0Q);
   }
 
+  if (round_c)
+  {
+    ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");
+
+    V1Q = fpr.GetReg();
+
+    Force25BitPrecision(reg_encoder(V1Q), VC, V0);
+    VC = reg_encoder(V1Q);
+  }
+
   switch (op5)
   {
   case 14:  // ps_madds0
     // d = a * c.ps0 + b
-    if (d == b)
+    if (VD == VB)
     {
       m_float_emit.FMLA(size, VD, VA, VC, 0);
     }
-    else if (d != a && d != c)
+    else if (VD != VA && VD != VC)
     {
       m_float_emit.MOV(VD, VB);
       m_float_emit.FMLA(size, VD, VA, VC, 0);
@@ -153,11 +186,11 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     break;
   case 15:  // ps_madds1
     // d = a * c.ps1 + b
-    if (d == b)
+    if (VD == VB)
     {
       m_float_emit.FMLA(size, VD, VA, VC, 1);
     }
-    else if (d != a && d != c)
+    else if (VD != VA && VD != VC)
     {
       m_float_emit.MOV(VD, VB);
       m_float_emit.FMLA(size, VD, VA, VC, 1);
@@ -171,14 +204,14 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     break;
   case 28:  // ps_msub
     // d = a * c - b
-    if (d == b)
+    if (VD == VB)
     {
       // d = -(-a * c + b)
       // rounding is incorrect if the rounding mode is +/- infinity
       m_float_emit.FMLS(size, VD, VA, VC);
       m_float_emit.FNEG(size, VD, VD);
     }
-    else if (d != a && d != c)
+    else if (VD != VA && VD != VC)
     {
       m_float_emit.FNEG(size, VD, VB);
       m_float_emit.FMLA(size, VD, VA, VC);
@@ -192,11 +225,11 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     break;
   case 29:  // ps_madd
     // d = a * c + b
-    if (d == b)
+    if (VD == VB)
     {
       m_float_emit.FMLA(size, VD, VA, VC);
     }
-    else if (d != a && d != c)
+    else if (VD != VA && VD != VC)
     {
       m_float_emit.MOV(VD, VB);
       m_float_emit.FMLA(size, VD, VA, VC);
@@ -215,11 +248,11 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     // Note: PowerPC rounds before the final negation.
     // We don't handle this at the moment because it's
    // only relevant when rounding to +/- infinity.
-    if (d == b)
+    if (VD == VB)
     {
       m_float_emit.FMLS(size, VD, VA, VC);
     }
-    else if (d != a && d != c)
+    else if (VD != VA && VD != VC)
     {
       m_float_emit.MOV(VD, VB);
       m_float_emit.FMLS(size, VD, VA, VC);
@@ -233,12 +266,12 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     break;
   case 31:  // ps_nmadd
     // d = -(a * c + b)
-    if (d == b)
+    if (VD == VB)
     {
       m_float_emit.FMLA(size, VD, VA, VC);
       m_float_emit.FNEG(size, VD, VD);
     }
-    else if (d != a && d != c)
+    else if (VD != VA && VD != VC)
     {
       // d = -a * c - b
       // See rounding note at ps_nmsub.
@@ -259,6 +292,8 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
 
   if (V0Q != ARM64Reg::INVALID_REG)
     fpr.Unlock(V0Q);
+  if (V1Q != ARM64Reg::INVALID_REG)
+    fpr.Unlock(V1Q);
 
   ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
              "Register allocation turned singles into doubles in the middle of ps_maddXX");
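
Reviewer note (not part of the patch): the instruction sequence Force25BitPrecision emits is the vector form of the scalar rounding below. This is a minimal standalone C++ sketch for reference only; the function name is hypothetical, but the two masks are exactly the constants the JIT materializes with MOVI/BIC/ORR above.

#include <cstdint>
#include <cstring>

// Round a double's mantissa to 25 significant bits (the implicit leading 1
// plus 24 stored bits), as the PowerPC does to frC before a single-precision
// multiply. Clearing the low 27 bits and then adding bit 27 back in rounds
// the value to a multiple of 2^28: when bit 27 (the highest dropped bit) is
// set, the add carries into bit 28 and rounds up; otherwise it truncates.
// A carry can propagate out of the mantissa into the exponent, which is the
// correct behavior when rounding up an all-ones mantissa, and also why the
// quad path must use a full 64-bit element ADD rather than narrower lanes.
double Force25BitScalar(double d)
{
  uint64_t bits;
  std::memcpy(&bits, &d, sizeof(bits));
  bits = (bits & 0xFFFF'FFFF'F800'0000ULL) + (bits & 0x0000'0000'0800'0000ULL);
  std::memcpy(&d, &bits, sizeof(d));
  return d;
}

The JIT builds the constants with MOVI/BIC/ORR because not every 64-bit value is encodable as a NEON modified immediate; it then applies them with AND/AND/ADD across the whole register, so both halves of a paired single (stored as doubles) are rounded in one pass.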
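
A second note, on the new emitter helper: the word EmitScalarThreeSame packs can be cross-checked against the scalar three-same encoding in the Arm ARM. The check below is a self-contained sketch; the register numbers and the expected word are hand-derived for this one case, so treat them as assumptions to verify rather than a reference value.

#include <cassert>
#include <cstdint>

// Mirror of the patch's EmitScalarThreeSame bit packing, with Rd/Rn/Rm as
// plain register numbers instead of ARM64Reg values.
uint32_t ScalarThreeSame(bool U, uint32_t size, uint32_t opcode, uint32_t Rd,
                         uint32_t Rn, uint32_t Rm)
{
  return (1u << 30) | (static_cast<uint32_t>(U) << 29) | (0b11110001u << 21) |
         (size << 22) | (Rm << 16) | (opcode << 11) | (1u << 10) | (Rn << 5) | Rd;
}

int main()
{
  // ADD D0, D1, D2: U=0, size=0b11 (64-bit), opcode=0b10000.
  assert(ScalarThreeSame(false, 3, 0b10000, 0, 1, 2) == 0x5EE28420);
  return 0;
}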