From 11be2314fe8aa9cdf04322b2ac710e9f89e0b8b5 Mon Sep 17 00:00:00 2001
From: JosJuice
Date: Sat, 15 May 2021 15:41:28 +0200
Subject: [PATCH] JitArm64: Fix fmul rounding issues

This is a port of 4f18f60 to JitArm64.
---
 Source/Core/Common/Arm64Emitter.cpp           | 39 ++++++++++-
 Source/Core/Common/Arm64Emitter.h             |  8 +++
 Source/Core/Core/PowerPC/JitArm64/Jit.h       |  2 +
 .../JitArm64/JitArm64_FloatingPoint.cpp       | 61 ++++++++++++++++-
 .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 65 ++++++++++++++-----
 5 files changed, 155 insertions(+), 20 deletions(-)

diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp
index 2aa3c99198..5bc642551d 100644
--- a/Source/Core/Common/Arm64Emitter.cpp
+++ b/Source/Core/Common/Arm64Emitter.cpp
@@ -2294,6 +2294,15 @@ void ARM64FloatEmitter::EmitScalar2Source(bool M, bool S, u32 type, u32 opcode,
           (opcode << 12) | (1 << 11) | (DecodeReg(Rn) << 5) | DecodeReg(Rd));
 }
 
+void ARM64FloatEmitter::EmitScalarThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
+                                            ARM64Reg Rm)
+{
+  ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s only supports double and single registers!", __func__);
+
+  Write32((1 << 30) | (U << 29) | (0b11110001 << 21) | (size << 22) | (DecodeReg(Rm) << 16) |
+          (opcode << 11) | (1 << 10) | (DecodeReg(Rn) << 5) | DecodeReg(Rd));
+}
+
 void ARM64FloatEmitter::EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
                                       ARM64Reg Rm)
 {
@@ -3103,6 +3112,11 @@ void ARM64FloatEmitter::FSQRT(ARM64Reg Rd, ARM64Reg Rn)
 }
 // Scalar - 2 Source
+void ARM64FloatEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  ASSERT_MSG(DYNA_REC, IsDouble(Rd), "%s only supports double registers!", __func__);
+  EmitScalarThreeSame(0, 3, 0b10000, Rd, Rn, Rm);
+}
 void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
 {
   EmitScalar2Source(0, 0, IsDouble(Rd), 2, Rd, Rn, Rm);
 }
@@ -3174,10 +3188,18 @@ void ARM64FloatEmitter::FMOV(ARM64Reg Rd, uint8_t imm8)
 }
 // Vector
+void ARM64FloatEmitter::ADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EmitThreeSame(0, IntLog2(size) - 3, 0b10000, Rd, Rn, Rm);
+}
 void ARM64FloatEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
 {
   EmitThreeSame(0, 0, 3, Rd, Rn, Rm);
 }
+void ARM64FloatEmitter::BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EmitThreeSame(0, 1, 3, Rd, Rn, Rm);
+}
 void ARM64FloatEmitter::BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
 {
   EmitThreeSame(1, 1, 3, Rd, Rn, Rm);
 }
@@ -3285,6 +3307,10 @@ void ARM64FloatEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
 {
   EmitThreeSame(0, 2, 3, Rd, Rn, Rm);
 }
+void ARM64FloatEmitter::ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EmitThreeSame(0, 3, 3, Rd, Rn, Rm);
+}
 void ARM64FloatEmitter::REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn)
 {
   Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 1, Rd, Rn);
 }
@@ -3864,11 +3890,10 @@ void ARM64FloatEmitter::MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift)
   EncodeModImm(Q, op, cmode, 0, Rd, abcdefgh);
 }
 
-void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift)
+void ARM64FloatEmitter::ORR_BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift, u8 op)
 {
   bool Q = IsQuad(Rd);
   u8 cmode = 1;
-  u8 op = 1;
   if (size == 16)
   {
     ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!",
@@ -3904,6 +3929,16 @@
   EncodeModImm(Q, op, cmode, 0, Rd, imm);
 }
 
+void ARM64FloatEmitter::ORR(u8 size, ARM64Reg Rd, u8 imm, u8 shift)
+{
+  ORR_BIC(size, Rd, imm, shift, 0);
+}
+
+void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift)
+{
+  ORR_BIC(size, Rd, imm, shift, 1);
+}
+
 void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp)
 {
   bool bundled_loadstore = false;
diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h
index 58caec8d08..7f878f4c20 100644
--- a/Source/Core/Common/Arm64Emitter.h
+++ b/Source/Core/Common/Arm64Emitter.h
@@ -998,6 +998,7 @@ public:
   void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false);  // Also generalized move between GPR/FP
 
   // Scalar - 2 Source
+  void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
@@ -1018,7 +1019,9 @@ public:
   void FMOV(ARM64Reg Rd, uint8_t imm8);
 
   // Vector
+  void ADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+  void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
   void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
@@ -1041,6 +1044,7 @@ public:
   void FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void NOT(ARM64Reg Rd, ARM64Reg Rn);
   void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+  void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void MOV(ARM64Reg Rd, ARM64Reg Rn) { ORR(Rd, Rn, Rn); }
   void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn);
   void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn);
@@ -1126,6 +1130,7 @@ public:
 
   // Modified Immediate
   void MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift = 0);
+  void ORR(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0);
   void BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0);
 
   void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = ARM64Reg::INVALID_REG,
@@ -1143,6 +1148,7 @@ private:
   void EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
   void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
                          ARM64Reg Rm);
+  void EmitScalarThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
   void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
@@ -1175,6 +1181,8 @@ private:
   void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
   void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh);
 
+  void ORR_BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift, u8 op);
+
   void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
   void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
   void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index 7d7bba6404..d00c2d58e0 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -264,6 +264,8 @@ protected:
                 bool Rc = false);
   void SetFPRFIfNeeded(bool single, Arm64Gen::ARM64Reg reg);
 
+  void Force25BitPrecision(Arm64Gen::ARM64Reg output, Arm64Gen::ARM64Reg input,
+                           Arm64Gen::ARM64Reg temp);
 
   // std::map m_fault_to_handler;
 
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
index 57ad09c1c8..3c5253bf7b 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -31,6 +31,29 @@ void JitArm64::SetFPRFIfNeeded(bool single, ARM64Reg reg)
   gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
 }
 
+// Emulate the odd truncation/rounding that the PowerPC does on the RHS operand before
+// a single precision multiply. To be precise, it drops the low 28 bits of the mantissa,
+// rounding to nearest as it does.
+void JitArm64::Force25BitPrecision(ARM64Reg output, ARM64Reg input, ARM64Reg temp)
+{
+  ASSERT(output != input && output != temp && input != temp);
+
+  // temp = 0x0000'0000'0800'0000ULL
+  // output = 0xFFFF'FFFF'F800'0000ULL
+  m_float_emit.MOVI(32, temp, 0x08, 24);
+  m_float_emit.MOVI(64, output, 0xFFFF'FFFF'0000'0000ULL);
+  m_float_emit.BIC(temp, temp, output);
+  m_float_emit.ORR(32, output, 0xF8, 24);
+
+  // output = (input & 0xFFFF'FFFF'F800'0000) + (input & 0x0000'0000'0800'0000)
+  m_float_emit.AND(temp, input, temp);
+  m_float_emit.AND(output, input, output);
+  if (IsQuad(input))
+    m_float_emit.ADD(64, output, output, temp);
+  else
+    m_float_emit.ADD(output, output, temp);
+}
+
 void JitArm64::fp_arith(UGeckoInstruction inst)
 {
   INSTRUCTION_START
@@ -43,8 +66,11 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
   bool single = inst.OPCD == 59;
   bool packed = inst.OPCD == 4;
 
-  bool use_c = op5 >= 25;  // fmul and all kind of fmaddXX
-  bool use_b = op5 != 25;  // fmul uses no B
+  const bool use_c = op5 >= 25;  // fmul and all kinds of fmaddXX
+  const bool use_b = op5 != 25;  // fmul uses no B
+
+  const bool outputs_are_singles = single || packed;
+  const bool round_c = use_c && outputs_are_singles && !js.op->fprIsSingle[inst.FC];
 
   const auto inputs_are_singles_func = [&] {
     return fpr.IsSingle(a, !packed) && (!use_b || fpr.IsSingle(b, !packed)) &&
@@ -54,6 +80,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
 
   ARM64Reg VA{}, VB{}, VC{}, VD{};
 
+  ARM64Reg V0Q = ARM64Reg::INVALID_REG;
+
   if (packed)
   {
     const RegType type = inputs_are_singles ? RegType::Single : RegType::Register;
@@ -67,6 +95,19 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
     VC = reg_encoder(fpr.R(c, type));
     VD = reg_encoder(fpr.RW(d, type));
 
+    if (round_c)
+    {
+      ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
+
+      V0Q = fpr.GetReg();
+      const ARM64Reg V1Q = fpr.GetReg();
+
+      Force25BitPrecision(reg_encoder(V0Q), VC, reg_encoder(V1Q));
+      VC = reg_encoder(V0Q);
+
+      fpr.Unlock(V1Q);
+    }
+
     switch (op5)
     {
     case 18:
@@ -102,6 +143,19 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
     VC = reg_encoder(fpr.R(c, type));
     VD = reg_encoder(fpr.RW(d, type_out));
 
+    if (round_c)
+    {
+      ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
+
+      V0Q = fpr.GetReg();
+      const ARM64Reg V1Q = fpr.GetReg();
+
+      Force25BitPrecision(reg_encoder(V0Q), VC, reg_encoder(V1Q));
+      VC = reg_encoder(V0Q);
+
+      fpr.Unlock(V1Q);
+    }
+
     switch (op5)
     {
     case 18:
@@ -134,7 +188,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
     }
   }
 
-  const bool outputs_are_singles = single || packed;
+  if (V0Q != ARM64Reg::INVALID_REG)
+    fpr.Unlock(V0Q);
 
   if (outputs_are_singles)
   {
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
index 068a4ed1bb..4f63a7d3aa 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -84,16 +84,35 @@ void JitArm64::ps_mulsX(UGeckoInstruction inst)
   const bool upper = inst.SUBOP5 == 13;
 
   const bool singles = fpr.IsSingle(a) && fpr.IsSingle(c);
+  const bool round_c = !js.op->fprIsSingle[inst.FC];
   const RegType type = singles ? RegType::Single : RegType::Register;
   const u8 size = singles ? 32 : 64;
   const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
 
   const ARM64Reg VA = fpr.R(a, type);
-  const ARM64Reg VC = fpr.R(c, type);
+  ARM64Reg VC = fpr.R(c, type);
   const ARM64Reg VD = fpr.RW(d, type);
 
+  ARM64Reg V0Q = ARM64Reg::INVALID_REG;
+
+  if (round_c)
+  {
+    ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");
+
+    V0Q = fpr.GetReg();
+    const ARM64Reg V1Q = fpr.GetReg();
+
+    Force25BitPrecision(reg_encoder(V0Q), reg_encoder(VC), reg_encoder(V1Q));
+    VC = reg_encoder(V0Q);
+
+    fpr.Unlock(V1Q);
+  }
+
   m_float_emit.FMUL(size, reg_encoder(VD), reg_encoder(VA), reg_encoder(VC), upper ? 1 : 0);
 
+  if (V0Q != ARM64Reg::INVALID_REG)
+    fpr.Unlock(V0Q);
+
   ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(c)),
              "Register allocation turned singles into doubles in the middle of ps_mulsX");
 
@@ -115,31 +134,45 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
   const u32 op5 = inst.SUBOP5;
 
   const bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c);
+  const bool round_c = !js.op->fprIsSingle[inst.FC];
   const RegType type = singles ? RegType::Single : RegType::Register;
   const u8 size = singles ? 32 : 64;
   const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
 
   const ARM64Reg VA = reg_encoder(fpr.R(a, type));
   const ARM64Reg VB = reg_encoder(fpr.R(b, type));
-  const ARM64Reg VC = reg_encoder(fpr.R(c, type));
+  ARM64Reg VC = reg_encoder(fpr.R(c, type));
   const ARM64Reg VD = reg_encoder(fpr.RW(d, type));
+
   ARM64Reg V0Q = ARM64Reg::INVALID_REG;
   ARM64Reg V0 = ARM64Reg::INVALID_REG;
-  if (d != b && (d == a || d == c))
+  ARM64Reg V1Q = ARM64Reg::INVALID_REG;
+
+  if (round_c || (d != b && (d == a || d == c)))
   {
     V0Q = fpr.GetReg();
    V0 = reg_encoder(V0Q);
   }
 
+  if (round_c)
+  {
+    ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");
+
+    V1Q = fpr.GetReg();
+
+    Force25BitPrecision(reg_encoder(V1Q), VC, V0);
+    VC = reg_encoder(V1Q);
+  }
+
   switch (op5)
   {
   case 14:  // ps_madds0
     // d = a * c.ps0 + b
-    if (d == b)
+    if (VD == VB)
     {
       m_float_emit.FMLA(size, VD, VA, VC, 0);
     }
-    else if (d != a && d != c)
+    else if (VD != VA && VD != VC)
     {
       m_float_emit.MOV(VD, VB);
       m_float_emit.FMLA(size, VD, VA, VC, 0);
@@ -153,11 +186,11 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     break;
   case 15:  // ps_madds1
     // d = a * c.ps1 + b
-    if (d == b)
+    if (VD == VB)
     {
       m_float_emit.FMLA(size, VD, VA, VC, 1);
     }
-    else if (d != a && d != c)
+    else if (VD != VA && VD != VC)
     {
       m_float_emit.MOV(VD, VB);
       m_float_emit.FMLA(size, VD, VA, VC, 1);
@@ -171,14 +204,14 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     break;
   case 28:  // ps_msub
     // d = a * c - b
-    if (d == b)
+    if (VD == VB)
     {
       // d = -(-a * c + b)
       // rounding is incorrect if the rounding mode is +/- infinity
       m_float_emit.FMLS(size, VD, VA, VC);
       m_float_emit.FNEG(size, VD, VD);
     }
-    else if (d != a && d != c)
+    else if (VD != VA && VD != VC)
     {
       m_float_emit.FNEG(size, VD, VB);
       m_float_emit.FMLA(size, VD, VA, VC);
@@ -192,11 +225,11 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     break;
   case 29:  // ps_madd
     // d = a * c + b
-    if (d == b)
+    if (VD == VB)
     {
       m_float_emit.FMLA(size, VD, VA, VC);
     }
-    else if (d != a && d != c)
+    else if (VD != VA && VD != VC)
     {
       m_float_emit.MOV(VD, VB);
       m_float_emit.FMLA(size, VD, VA, VC);
@@ -215,11 +248,11 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     // Note: PowerPC rounds before the final negation.
     // We don't handle this at the moment because it's
    // only relevant when rounding to +/- infinity.
-    if (d == b)
+    if (VD == VB)
     {
       m_float_emit.FMLS(size, VD, VA, VC);
     }
-    else if (d != a && d != c)
+    else if (VD != VA && VD != VC)
     {
       m_float_emit.MOV(VD, VB);
       m_float_emit.FMLS(size, VD, VA, VC);
@@ -233,12 +266,12 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     break;
   case 31:  // ps_nmadd
     // d = -(a * c + b)
-    if (d == b)
+    if (VD == VB)
     {
       m_float_emit.FMLA(size, VD, VA, VC);
       m_float_emit.FNEG(size, VD, VD);
     }
-    else if (d != a && d != c)
+    else if (VD != VA && VD != VC)
     {
       // d = -a * c - b
       // See rounding note at ps_nmsub.
@@ -259,6 +292,8 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
 
   if (V0Q != ARM64Reg::INVALID_REG)
     fpr.Unlock(V0Q);
+  if (V1Q != ARM64Reg::INVALID_REG)
+    fpr.Unlock(V1Q);
 
   ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
              "Register allocation turned singles into doubles in the middle of ps_maddXX");
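
Reviewer note (not part of the patch): the instruction sequence Force25BitPrecision emits is the vector form of the scalar rounding below. This is a minimal standalone C++ sketch for reference only; the function name is hypothetical, but the two masks are exactly the constants the JIT materializes with MOVI/BIC/ORR above.

#include <cstdint>
#include <cstring>

// Round a double's mantissa to 25 significant bits (the implicit leading 1
// plus 24 stored bits), as the PowerPC does to frC before a single-precision
// multiply. Clearing the low 27 bits and then adding bit 27 back in rounds
// the value to a multiple of 2^28: when bit 27 (the highest dropped bit) is
// set, the add carries into bit 28 and rounds up; otherwise it truncates.
// A carry can propagate out of the mantissa into the exponent, which is the
// correct behavior when rounding up an all-ones mantissa, and also why the
// quad path must use a full 64-bit element ADD rather than narrower lanes.
double Force25BitScalar(double d)
{
  uint64_t bits;
  std::memcpy(&bits, &d, sizeof(bits));
  bits = (bits & 0xFFFF'FFFF'F800'0000ULL) + (bits & 0x0000'0000'0800'0000ULL);
  std::memcpy(&d, &bits, sizeof(d));
  return d;
}

The JIT builds the constants with MOVI/BIC/ORR because not every 64-bit value is encodable as a NEON modified immediate; it then applies them with AND/AND/ADD across the whole register, so both halves of a paired single (stored as doubles) are rounded in one pass.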
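
A second note, on the new emitter helper: the word EmitScalarThreeSame packs can be cross-checked against the scalar three-same encoding in the Arm ARM. The check below is a self-contained sketch; the register numbers and the expected word are hand-derived for this one case, so treat them as assumptions to verify rather than a reference value.

#include <cassert>
#include <cstdint>

// Mirror of the patch's EmitScalarThreeSame bit packing, with Rd/Rn/Rm as
// plain register numbers instead of ARM64Reg values.
uint32_t ScalarThreeSame(bool U, uint32_t size, uint32_t opcode, uint32_t Rd,
                         uint32_t Rn, uint32_t Rm)
{
  return (1u << 30) | (static_cast<uint32_t>(U) << 29) | (0b11110001u << 21) |
         (size << 22) | (Rm << 16) | (opcode << 11) | (1u << 10) | (Rn << 5) | Rd;
}

int main()
{
  // ADD D0, D1, D2: U=0, size=0b11 (64-bit), opcode=0b10000.
  assert(ScalarThreeSame(false, 3, 0b10000, 0, 1, 2) == 0x5EE28420);
  return 0;
}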