From 246adf0d6dbefc329f1347000be4b9ea3aa3447e Mon Sep 17 00:00:00 2001 From: Sintendo Date: Wed, 24 Mar 2021 22:40:35 +0100 Subject: [PATCH 1/3] Jit64: divwx - Eliminate MOV for division by 2 When destination and input registers match, a redundant MOV instruction can be eliminated. Before: 8B C7 mov eax,edi 8B F8 mov edi,eax C1 EF 1F shr edi,1Fh 03 F8 add edi,eax D1 FF sar edi,1 After: 8B C7 mov eax,edi C1 EF 1F shr edi,1Fh 03 F8 add edi,eax D1 FF sar edi,1 --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index d2294ba313..b77914ef99 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1464,12 +1464,21 @@ void Jit64::divwx(UGeckoInstruction inst) else if (divisor == 2 || divisor == -2) { X64Reg tmp = RSCRATCH; - if (Ra.IsSimpleReg() && Ra.GetSimpleReg() != Rd) - tmp = Ra.GetSimpleReg(); - else + if (!Ra.IsSimpleReg()) + { MOV(32, R(tmp), Ra); + MOV(32, Rd, R(tmp)); + } + else if (d == a) + { + MOV(32, R(tmp), Ra); + } + else + { + MOV(32, Rd, Ra); + tmp = Ra.GetSimpleReg(); + } - MOV(32, Rd, R(tmp)); SHR(32, Rd, Imm8(31)); ADD(32, Rd, R(tmp)); SAR(32, Rd, Imm8(1)); From abc4c8f6017e516818eb301718bf3773f827a35e Mon Sep 17 00:00:00 2001 From: Sintendo Date: Sat, 24 Apr 2021 19:28:18 +0200 Subject: [PATCH 2/3] Jit64: divwx - Eliminate MOV for division by power of 2 Division by a power of two can be slightly improved when the destination and dividend registers are the same. Before: 8B C6 mov eax,esi 85 C0 test eax,eax 8D 70 03 lea esi,[rax+3] 0F 49 F0 cmovns esi,eax C1 FE 02 sar esi,2 After: 85 F6 test esi,esi 8D 46 03 lea eax,[rsi+3] 0F 48 F0 cmovs esi,eax C1 FE 02 sar esi,2 --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 40 +++++++++++++++---- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index b77914ef99..b31abb37e0 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1493,15 +1493,39 @@ void Jit64::divwx(UGeckoInstruction inst) { u32 abs_val = std::abs(divisor); - X64Reg tmp = RSCRATCH; - if (Ra.IsSimpleReg() && Ra.GetSimpleReg() != Rd) - tmp = Ra.GetSimpleReg(); - else - MOV(32, R(tmp), Ra); + X64Reg dividend, sum, src; + CCFlags cond = CC_NS; - TEST(32, R(tmp), R(tmp)); - LEA(32, Rd, MDisp(tmp, abs_val - 1)); - CMOVcc(32, Rd, R(tmp), CC_NS); + if (!Ra.IsSimpleReg()) + { + dividend = RSCRATCH; + sum = Rd; + src = RSCRATCH; + + // Load dividend from memory + MOV(32, R(dividend), Ra); + } + else if (d == a) + { + // Rd holds the dividend, while RSCRATCH holds the sum + // This is opposite of the other cases + dividend = Rd; + sum = RSCRATCH; + src = RSCRATCH; + // Negate condition to compensate the swapped values + cond = CC_S; + } + else + { + // Use dividend from register directly + dividend = Ra.GetSimpleReg(); + sum = Rd; + src = dividend; + } + + TEST(32, R(dividend), R(dividend)); + LEA(32, sum, MDisp(dividend, abs_val - 1)); + CMOVcc(32, Rd, R(src), cond); SAR(32, Rd, Imm8(IntLog2(abs_val))); if (divisor < 0) From 47e16133e5cfb6361b265f1717736b22f896e445 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Sat, 24 Apr 2021 21:31:39 +0200 Subject: [PATCH 3/3] Jit64: divwx - Eliminate XOR for constant dividend We normally check for division by zero to know if we should set the destination register to zero with a XOR. However, when the divisor and destination registers are the same the explicit zeroing can be omitted. In addition, some of the surrounding branching can be simplified as well. Before: 45 85 FF test r15d,r15d 75 05 jne normal_path 45 33 FF xor r15d,r15d EB 0C jmp done normal_path: B8 5A 00 00 00 mov eax,5Ah 99 cdq 41 F7 FF idiv eax,r15d 44 8B F8 mov r15d,eax done: After: 45 85 FF test r15d,r15d 74 0C je done B8 5A 00 00 00 mov eax,5Ah 99 cdq 41 F7 FF idiv eax,r15d 44 8B F8 mov r15d,eax done: --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 61 +++++++++++-------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index b31abb37e0..54ee902737 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1375,38 +1375,49 @@ void Jit64::divwx(UGeckoInstruction inst) // Check for divisor == 0 TEST(32, Rb, Rb); - FixupBranch normal_path; + FixupBranch done; - if (dividend == 0x80000000) + if (d == b && (dividend & 0x80000000) == 0 && !inst.OE) { - // Divisor is 0, proceed to overflow case - const FixupBranch overflow = J_CC(CC_Z); - // Otherwise, check for divisor == -1 - CMP(32, Rb, Imm32(0xFFFFFFFF)); - normal_path = J_CC(CC_NE); - - SetJumpTarget(overflow); + // Divisor is 0, skip to the end + // No need to explicitly set destination to 0 due to overlapping registers + done = J_CC(CC_Z); + // Otherwise, proceed to normal path } else { - // Divisor is not 0, take normal path - normal_path = J_CC(CC_NZ); - // Otherwise, proceed to overflow case + FixupBranch normal_path; + if (dividend == 0x80000000) + { + // Divisor is 0, proceed to overflow case + const FixupBranch overflow = J_CC(CC_Z); + // Otherwise, check for divisor == -1 + CMP(32, Rb, Imm32(0xFFFFFFFF)); + normal_path = J_CC(CC_NE); + + SetJumpTarget(overflow); + } + else + { + // Divisor is not 0, take normal path + normal_path = J_CC(CC_NZ); + // Otherwise, proceed to overflow case + } + + // Set Rd to all ones or all zeroes + if (dividend & 0x80000000) + MOV(32, Rd, Imm32(0xFFFFFFFF)); + else if (d != b) + XOR(32, Rd, Rd); + + if (inst.OE) + GenerateConstantOverflow(true); + + done = J(); + + SetJumpTarget(normal_path); } - // Set Rd to all ones or all zeroes - if (dividend & 0x80000000) - MOV(32, Rd, Imm32(0xFFFFFFFF)); - else - XOR(32, Rd, Rd); - - if (inst.OE) - GenerateConstantOverflow(true); - - const FixupBranch done = J(); - - SetJumpTarget(normal_path); - MOV(32, eax, Imm32(dividend)); CDQ(); IDIV(32, Rb);