From 05b72c5d31f8c3b04744e62626594847ab1d68c2 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sun, 7 Jun 2015 16:57:57 -0500 Subject: [PATCH] [AArch64] Upstream PPSSPP's emitter changes. Requires a minor change to in the JIT to make sure everything still works. --- Source/Core/Common/Arm64Emitter.cpp | 1301 ++++++++++++++--- Source/Core/Common/Arm64Emitter.h | 198 ++- .../JitArm64/JitArm64_LoadStorePaired.cpp | 2 +- 3 files changed, 1275 insertions(+), 226 deletions(-) diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index d374491a09..8a93a7a0bd 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -1,15 +1,273 @@ -// Copyright 2014 Dolphin Emulator Project +// Copyright 2015 Dolphin Emulator Project // Licensed under GPLv2+ // Refer to the license.txt file included. -#include +#include +#include #include "Common/Arm64Emitter.h" +#include "Common/CommonTypes.h" #include "Common/MathUtil.h" namespace Arm64Gen { +const int kWRegSizeInBits = 32; +const int kXRegSizeInBits = 64; + +// The below few functions are taken from V8. +static int CountLeadingZeros(uint64_t value, int width) +{ + // TODO(jbramley): Optimize this for ARM64 hosts. + int count = 0; + uint64_t bit_test = 1ULL << (width - 1); + while ((count < width) && ((bit_test & value) == 0)) + { + count++; + bit_test >>= 1; + } + return count; +} + +static uint64_t LargestPowerOf2Divisor(uint64_t value) +{ + return value & -(int64_t)value; +} + +static bool IsPowerOfTwo(uint64_t x) +{ + return (x != 0) && ((x & (x - 1)) == 0); +} + +#define V8_UINT64_C(x) ((uint64_t)(x)) + +bool IsImmArithmetic(uint64_t input, u32 *val, bool *shift) +{ + if (input < 4096) + { + *val = input; + *shift = false; + return true; + } + else if ((input & 0xFFF000) == input) + { + *val = input >> 12; + *shift = true; + return true; + } + return false; +} + +bool IsImmLogical(uint64_t value, unsigned int width, unsigned int *n, unsigned int *imm_s, unsigned int *imm_r) +{ + //DCHECK((n != NULL) && (imm_s != NULL) && (imm_r != NULL)); + // DCHECK((width == kWRegSizeInBits) || (width == kXRegSizeInBits)); + + bool negate = false; + + // Logical immediates are encoded using parameters n, imm_s and imm_r using + // the following table: + // + // N imms immr size S R + // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr) + // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr) + // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr) + // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr) + // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr) + // 0 11110s xxxxxr 2 UInt(s) UInt(r) + // (s bits must not be all set) + // + // A pattern is constructed of size bits, where the least significant S+1 bits + // are set. The pattern is rotated right by R, and repeated across a 32 or + // 64-bit value, depending on destination register width. + // + // Put another way: the basic format of a logical immediate is a single + // contiguous stretch of 1 bits, repeated across the whole word at intervals + // given by a power of 2. To identify them quickly, we first locate the + // lowest stretch of 1 bits, then the next 1 bit above that; that combination + // is different for every logical immediate, so it gives us all the + // information we need to identify the only logical immediate that our input + // could be, and then we simply check if that's the value we actually have. + // + // (The rotation parameter does give the possibility of the stretch of 1 bits + // going 'round the end' of the word. To deal with that, we observe that in + // any situation where that happens the bitwise NOT of the value is also a + // valid logical immediate. So we simply invert the input whenever its low bit + // is set, and then we know that the rotated case can't arise.) + + if (value & 1) + { + // If the low bit is 1, negate the value, and set a flag to remember that we + // did (so that we can adjust the return values appropriately). + negate = true; + value = ~value; + } + + if (width == kWRegSizeInBits) + { + // To handle 32-bit logical immediates, the very easiest thing is to repeat + // the input value twice to make a 64-bit word. The correct encoding of that + // as a logical immediate will also be the correct encoding of the 32-bit + // value. + + // The most-significant 32 bits may not be zero (ie. negate is true) so + // shift the value left before duplicating it. + value <<= kWRegSizeInBits; + value |= value >> kWRegSizeInBits; + } + + // The basic analysis idea: imagine our input word looks like this. + // + // 0011111000111110001111100011111000111110001111100011111000111110 + // c b a + // |<--d-->| + // + // We find the lowest set bit (as an actual power-of-2 value, not its index) + // and call it a. Then we add a to our original number, which wipes out the + // bottommost stretch of set bits and replaces it with a 1 carried into the + // next zero bit. Then we look for the new lowest set bit, which is in + // position b, and subtract it, so now our number is just like the original + // but with the lowest stretch of set bits completely gone. Now we find the + // lowest set bit again, which is position c in the diagram above. Then we'll + // measure the distance d between bit positions a and c (using CLZ), and that + // tells us that the only valid logical immediate that could possibly be equal + // to this number is the one in which a stretch of bits running from a to just + // below b is replicated every d bits. + uint64_t a = LargestPowerOf2Divisor(value); + uint64_t value_plus_a = value + a; + uint64_t b = LargestPowerOf2Divisor(value_plus_a); + uint64_t value_plus_a_minus_b = value_plus_a - b; + uint64_t c = LargestPowerOf2Divisor(value_plus_a_minus_b); + + int d, clz_a, out_n; + uint64_t mask; + + if (c != 0) + { + // The general case, in which there is more than one stretch of set bits. + // Compute the repeat distance d, and set up a bitmask covering the basic + // unit of repetition (i.e. a word with the bottom d bits set). Also, in all + // of these cases the N bit of the output will be zero. + clz_a = CountLeadingZeros(a, kXRegSizeInBits); + int clz_c = CountLeadingZeros(c, kXRegSizeInBits); + d = clz_a - clz_c; + mask = ((V8_UINT64_C(1) << d) - 1); + out_n = 0; + } + else + { + // Handle degenerate cases. + // + // If any of those 'find lowest set bit' operations didn't find a set bit at + // all, then the word will have been zero thereafter, so in particular the + // last lowest_set_bit operation will have returned zero. So we can test for + // all the special case conditions in one go by seeing if c is zero. + if (a == 0) + { + // The input was zero (or all 1 bits, which will come to here too after we + // inverted it at the start of the function), for which we just return + // false. + return false; + } + else + { + // Otherwise, if c was zero but a was not, then there's just one stretch + // of set bits in our word, meaning that we have the trivial case of + // d == 64 and only one 'repetition'. Set up all the same variables as in + // the general case above, and set the N bit in the output. + clz_a = CountLeadingZeros(a, kXRegSizeInBits); + d = 64; + mask = ~V8_UINT64_C(0); + out_n = 1; + } + } + + // If the repeat period d is not a power of two, it can't be encoded. + if (!IsPowerOfTwo(d)) + return false; + + // If the bit stretch (b - a) does not fit within the mask derived from the + // repeat period, then fail. + if (((b - a) & ~mask) != 0) + return false; + + // The only possible option is b - a repeated every d bits. Now we're going to + // actually construct the valid logical immediate derived from that + // specification, and see if it equals our original input. + // + // To repeat a value every d bits, we multiply it by a number of the form + // (1 + 2^d + 2^(2d) + ...), i.e. 0x0001000100010001 or similar. These can + // be derived using a table lookup on CLZ(d). + static const std::array multipliers = + { + 0x0000000000000001UL, + 0x0000000100000001UL, + 0x0001000100010001UL, + 0x0101010101010101UL, + 0x1111111111111111UL, + 0x5555555555555555UL, + }; + + int multiplier_idx = CountLeadingZeros(d, kXRegSizeInBits) - 57; + + // Ensure that the index to the multipliers array is within bounds. + _dbg_assert_(DYNA_REC, (multiplier_idx >= 0) && + (static_cast(multiplier_idx) < multipliers.size())); + + uint64_t multiplier = multipliers[multiplier_idx]; + uint64_t candidate = (b - a) * multiplier; + + // The candidate pattern doesn't match our input value, so fail. + if (value != candidate) + return false; + + // We have a match! This is a valid logical immediate, so now we have to + // construct the bits and pieces of the instruction encoding that generates + // it. + + // Count the set bits in our basic stretch. The special case of clz(0) == -1 + // makes the answer come out right for stretches that reach the very top of + // the word (e.g. numbers like 0xffffc00000000000). + int clz_b = (b == 0) ? -1 : CountLeadingZeros(b, kXRegSizeInBits); + int s = clz_a - clz_b; + + // Decide how many bits to rotate right by, to put the low bit of that basic + // stretch in position a. + int r; + if (negate) + { + // If we inverted the input right at the start of this function, here's + // where we compensate: the number of set bits becomes the number of clear + // bits, and the rotation count is based on position b rather than position + // a (since b is the location of the 'lowest' 1 bit after inversion). + s = d - s; + r = (clz_b + 1) & (d - 1); + } + else + { + r = (clz_a + 1) & (d - 1); + } + + // Now we're done, except for having to encode the S output in such a way that + // it gives both the number of set bits and the length of the repeated + // segment. The s field is encoded like this: + // + // imms size S + // ssssss 64 UInt(ssssss) + // 0sssss 32 UInt(sssss) + // 10ssss 16 UInt(ssss) + // 110sss 8 UInt(sss) + // 1110ss 4 UInt(ss) + // 11110s 2 UInt(s) + // + // So we 'or' (-d << 1) with our computed s to form imms. + *n = out_n; + *imm_s = ((-d << 1) | (s - 1)) & 0x3f; + *imm_r = r; + + return true; +} + void ARM64XEmitter::SetCodePtr(u8* ptr) { m_code = ptr; @@ -69,8 +327,6 @@ void ARM64XEmitter::FlushIcacheSection(u8* start, u8* end) #endif } - - // Exception generation static const u32 ExcEnc[][3] = { {0, 0, 1}, // SVC @@ -194,7 +450,7 @@ void ARM64XEmitter::EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr distance >>= 2; - _assert_msg_(DYNA_REC, distance >= -0xFFFFF && distance < 0xFFFFF, "%s: Received too large distance: %lx", __FUNCTION__, distance); + _assert_msg_(DYNA_REC, distance >= -0x40000 && distance <= 0x3FFFF, "%s: Received too large distance: %lx", __FUNCTION__, distance); Rt = DecodeReg(Rt); Write32((b64Bit << 31) | (0x34 << 24) | (op << 24) | \ @@ -225,7 +481,7 @@ void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 op, const void* ptr) distance >>= 2; - _assert_msg_(DYNA_REC, distance >= -0x3FFFFFF && distance < 0x3FFFFFF, "%s: Received too large distance: %lx", __FUNCTION__, distance); + _assert_msg_(DYNA_REC, distance >= -0x2000000LL && distance <= 0x1FFFFFFLL, "%s: Received too large distance: %lx", __FUNCTION__, distance); Write32((op << 31) | (0x5 << 26) | (distance & 0x3FFFFFF)); } @@ -256,7 +512,7 @@ void ARM64XEmitter::EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, A Rn = DecodeReg(Rn); Rm = DecodeReg(Rm); Write32((b64Bit << 31) | (flags << 29) | (ArithEnc[instenc] << 21) | \ - (Option.GetType() == ArithOption::TYPE_EXTENDEDREG ? 1 << 21 : 0) | (Rm << 16) | Option.GetData() | (Rn << 5) | Rd); + (Option.GetType() == ArithOption::TYPE_EXTENDEDREG ? (1 << 21) : 0) | (Rm << 16) | Option.GetData() | (Rn << 5) | Rd); } void ARM64XEmitter::EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) @@ -349,7 +605,7 @@ void ARM64XEmitter::EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM Rd = DecodeReg(Rd); Rm = DecodeReg(Rm); Rn = DecodeReg(Rn); - Write32((b64Bit << 31) | (LogicalEnc[instenc][0] << 29) | (0x50 << 21) | (LogicalEnc[instenc][1] << 21) | \ + Write32((b64Bit << 31) | (LogicalEnc[instenc][0] << 29) | (0x5 << 25) | (LogicalEnc[instenc][1] << 21) | \ Shift.GetData() | (Rm << 16) | (Rn << 5) | Rd); } @@ -483,7 +739,7 @@ void ARM64XEmitter::EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, (imm << 10) | (Rn << 5) | Rd); } -void ARM64XEmitter::EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +void ARM64XEmitter::EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, int n) { // Sometimes Rd is fixed to SP, but can still be 32bit or 64bit. // Use Rn to determine bitness here. @@ -492,7 +748,7 @@ void ARM64XEmitter::EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 i Rd = DecodeReg(Rd); Rn = DecodeReg(Rn); - Write32((b64Bit << 31) | (op << 29) | (0x24 << 23) | (b64Bit << 22) | \ + Write32((b64Bit << 31) | (op << 29) | (0x24 << 23) | (n << 22) | \ (immr << 16) | (imms << 10) | (Rn << 5) | Rd); } @@ -534,12 +790,11 @@ void ARM64XEmitter::EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64R Write32((op << 30) | (0b101 << 27) | (type_encode << 23) | (load << 22) | \ ((imm & 0x7F) << 15) | (Rt2 << 10) | (Rn << 5) | Rt); } - void ARM64XEmitter::EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm) { Rd = DecodeReg(Rd); - Write32((op << 31) | ((imm & 0x3) << 29) | (0b10000 << 24) | \ + Write32((op << 31) | ((imm & 0x3) << 29) | (0x10 << 24) | \ ((imm & 0x1FFFFC) << 3) | Rd); } @@ -552,6 +807,36 @@ void ARM64XEmitter::EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64 Write32((size << 30) | (0b111 << 27) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt); } +static inline bool IsInRangeImm19(s64 distance) +{ + return (distance >= -0x40000 && distance <= 0x3FFFF); +} + +static inline bool IsInRangeImm14(s64 distance) +{ + return (distance >= -0x2000 && distance <= 0x1FFF); +} + +static inline bool IsInRangeImm26(s64 distance) +{ + return (distance >= -0x2000000 && distance <= 0x1FFFFFF); +} + +static inline u32 MaskImm19(s64 distance) +{ + return distance & 0x7FFFF; +} + +static inline u32 MaskImm14(s64 distance) +{ + return distance & 0x3FFF; +} + +static inline u32 MaskImm26(s64 distance) +{ + return distance & 0x3FFFFFF; +} + // FixupBranch branching void ARM64XEmitter::SetJumpTarget(FixupBranch const& branch) { @@ -566,32 +851,32 @@ void ARM64XEmitter::SetJumpTarget(FixupBranch const& branch) Not = true; case 0: // CBZ { - _assert_msg_(DYNA_REC, distance >= -0xFFFFF && distance < 0xFFFFF, "%s(%d): Received too large distance: %lx", __FUNCTION__, branch.type, distance); + _assert_msg_(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %lx", __FUNCTION__, branch.type, distance); bool b64Bit = Is64Bit(branch.reg); ARM64Reg reg = DecodeReg(branch.reg); - inst = (b64Bit << 31) | (0x1A << 25) | (Not << 24) | ((distance << 5) & 0xFFFFE0) | reg; + inst = (b64Bit << 31) | (0x1A << 25) | (Not << 24) | (MaskImm19(distance) << 5) | reg; } break; case 2: // B (conditional) - _assert_msg_(DYNA_REC, distance >= -0xFFFFF && distance < 0xFFFFF, "%s(%d): Received too large distance: %lx", __FUNCTION__, branch.type, distance); - inst = (0x2A << 25) | (distance << 5) | branch.cond; + _assert_msg_(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %lx", __FUNCTION__, branch.type, distance); + inst = (0x2A << 25) | (MaskImm19(distance) << 5) | branch.cond; break; case 4: // TBNZ Not = true; case 3: // TBZ { - _assert_msg_(DYNA_REC, distance >= -0x3FFF && distance < 0x3FFF, "%s(%d): Received too large distance: %lx", __FUNCTION__, branch.type, distance); + _assert_msg_(DYNA_REC, IsInRangeImm14(distance), "%s(%d): Received too large distance: %lx", __FUNCTION__, branch.type, distance); ARM64Reg reg = DecodeReg(branch.reg); - inst = ((branch.bit & 0x20) << 26) | (0x1B << 25) | (Not << 24) | ((branch.bit & 0x1F) << 19) | (distance << 5) | reg; + inst = ((branch.bit & 0x20) << 26) | (0x1B << 25) | (Not << 24) | ((branch.bit & 0x1F) << 19) | (MaskImm14(distance) << 5) | reg; } break; case 5: // B (uncoditional) - _assert_msg_(DYNA_REC, distance >= -0x3FFFFFF && distance < 0x3FFFFFF, "%s(%d): Received too large distance: %lx", __FUNCTION__, branch.type, distance); - inst = (0x5 << 26) | distance; + _assert_msg_(DYNA_REC, IsInRangeImm26(distance), "%s(%d): Received too large distance: %lx", __FUNCTION__, branch.type, distance); + inst = (0x5 << 26) | MaskImm26(distance); break; case 6: // BL (unconditional) - _assert_msg_(DYNA_REC, distance >= -0x3FFFFFF && distance < 0x3FFFFFF, "%s(%d): Received too large distance: %lx", __FUNCTION__, branch.type, distance); - inst = (0x25 << 26) | distance; + _assert_msg_(DYNA_REC, IsInRangeImm26(distance), "%s(%d): Received too large distance: %lx", __FUNCTION__, branch.type, distance); + inst = (0x25 << 26) | MaskImm26(distance); break; } *(u32*)branch.ptr = inst; @@ -674,12 +959,12 @@ void ARM64XEmitter::CBNZ(ARM64Reg Rt, const void* ptr) // Conditional Branch void ARM64XEmitter::B(CCFlags cond, const void* ptr) { - s64 distance = (s64)ptr - (s64(m_code) + 8); + s64 distance = (s64)ptr - (s64)m_code; + distance >>= 2; - _assert_msg_(DYNA_REC, distance >= -0xFFFFF && distance < 0xFFFFF, "%s: Received too large distance: %lx", __FUNCTION__, distance); - - Write32((0x54 << 24) | (distance << 5) | cond); + _assert_msg_(DYNA_REC, IsInRangeImm19(distance), "%s: Received too large distance: %p->%p %ld %lx", __FUNCTION__, m_code, ptr, distance, distance); + Write32((0x54 << 24) | (MaskImm19(distance) << 5) | cond); } // Test and Branch @@ -702,6 +987,22 @@ void ARM64XEmitter::BL(const void* ptr) EncodeUnconditionalBranchInst(1, ptr); } +void ARM64XEmitter::QuickCallFunction(ARM64Reg scratchreg, const void *func) +{ + s64 distance = (s64)func - (s64)m_code; + distance >>= 2; // Can only branch to opcode-aligned (4) addresses + if (!IsInRangeImm26(distance)) + { + // WARN_LOG(DYNA_REC, "Distance too far in function call (%p to %p)! Using scratch.", m_code, func); + MOVI2R(scratchreg, (uintptr_t)func); + BLR(scratchreg); + } + else + { + BL(func); + } +} + // Unconditional Branch (register) void ARM64XEmitter::BR(ARM64Reg Rn) { @@ -771,18 +1072,51 @@ void ARM64XEmitter::_MSR(PStateField field, u8 imm) u32 op1 = 0, op2 = 0; switch (field) { - case FIELD_SPSel: - op1 = 0; op2 = 5; + case FIELD_SPSel: op1 = 0; op2 = 5; break; + case FIELD_DAIFSet: op1 = 3; op2 = 6; break; + case FIELD_DAIFClr: op1 = 3; op2 = 7; break; + default: + _assert_msg_(DYNA_REC, false, "Invalid PStateField to do a imm move to"); + break; + } + EncodeSystemInst(0, op1, 4, imm, op2, WSP); +} + +static void GetSystemReg(PStateField field, int &o0, int &op1, int &CRn, int &CRm, int &op2) +{ + switch (field) +{ + case FIELD_NZCV: + o0 = 3; op1 = 3; CRn = 4; CRm = 2; op2 = 0; break; - case FIELD_DAIFSet: - op1 = 3; op2 = 6; + case FIELD_FPCR: + o0 = 3; op1 = 3; CRn = 4; CRm = 4; op2 = 0; break; - case FIELD_DAIFClr: - op1 = 3; op2 = 7; + case FIELD_FPSR: + o0 = 3; op1 = 3; CRn = 4; CRm = 4; op2 = 1; + break; + default: + _assert_msg_(DYNA_REC, false, "Invalid PStateField to do a register move from/to"); break; } - EncodeSystemInst(0, op1, 3, imm, op2, WSP); } + +void ARM64XEmitter::_MSR(PStateField field, ARM64Reg Rt) +{ + int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0; + _assert_msg_(DYNA_REC, Is64Bit(Rt), "MSR: Rt must be 64-bit"); + GetSystemReg(field, o0, op1, CRn, CRm, op2); + EncodeSystemInst(o0, op1, CRn, CRm, op2, DecodeReg(Rt)); +} + +void ARM64XEmitter::MRS(ARM64Reg Rt, PStateField field) +{ + int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0; + _assert_msg_(DYNA_REC, Is64Bit(Rt), "MRS: Rt must be 64-bit"); + GetSystemReg(field, o0, op1, CRn, CRm, op2); + EncodeSystemInst(o0 | 4, op1, CRn, CRm, op2, DecodeReg(Rt)); +} + void ARM64XEmitter::HINT(SystemHint op) { EncodeSystemInst(0, 3, 2, 0, op, WSP); @@ -807,7 +1141,7 @@ void ARM64XEmitter::ISB(BarrierType type) // Add/Subtract (extended register) void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - ADD(Rd, Rn, Rm, ArithOption(Rd)); + ADD(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) @@ -817,7 +1151,7 @@ void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Optio void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EncodeArithmeticInst(0, true, Rd, Rn, Rm, ArithOption(Rd)); + EncodeArithmeticInst(0, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) @@ -827,7 +1161,7 @@ void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Opti void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - SUB(Rd, Rn, Rm, ArithOption(Rd)); + SUB(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) @@ -837,7 +1171,7 @@ void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Optio void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EncodeArithmeticInst(1, false, Rd, Rn, Rm, ArithOption(Rd)); + EncodeArithmeticInst(1, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) @@ -847,22 +1181,22 @@ void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Opti void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm) { - CMN(Rn, Rm, ArithOption(Rn)); + CMN(Rn, Rm, ArithOption(Rn, ST_LSL, 0)); } void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) { - EncodeArithmeticInst(0, true, SP, Rn, Rm, Option); + EncodeArithmeticInst(0, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option); } void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm) { - CMP(Rn, Rm, ArithOption(Rn)); + CMP(Rn, Rm, ArithOption(Rn, ST_LSL, 0)); } void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) { - EncodeArithmeticInst(1, true, SP, Rn, Rm, Option); + EncodeArithmeticInst(1, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option); } // Add/Subtract (with carry) @@ -1084,35 +1418,60 @@ void ARM64XEmitter::BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shif { EncodeLogicalInst(7, Rd, Rn, Rm, Shift); } + +void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift) +{ + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, Shift); +} + void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm) { - ORR(Rd, Is64Bit(Rd) ? SP : WSP, Rm, ArithOption(Rm, ST_LSL, 0)); + if (IsGPR(Rd) && IsGPR(Rm)) + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0)); + else + _assert_msg_(DYNA_REC, false, "Non-GPRs not supported in MOV"); } void ARM64XEmitter::MVN(ARM64Reg Rd, ARM64Reg Rm) { - ORN(Rd, Is64Bit(Rd) ? SP : WSP, Rm, ArithOption(Rm, ST_LSL, 0)); + ORN(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0)); +} +void ARM64XEmitter::LSL(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, shift)); +} +void ARM64XEmitter::LSR(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSR, shift)); +} +void ARM64XEmitter::ASR(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_ASR, shift)); +} +void ARM64XEmitter::ROR(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_ROR, shift)); } // Logical (immediate) -void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) { - EncodeLogicalImmInst(0, Rd, Rn, immr, imms); + EncodeLogicalImmInst(0, Rd, Rn, immr, imms, invert); } -void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) { - EncodeLogicalImmInst(3, Rd, Rn, immr, imms); + EncodeLogicalImmInst(3, Rd, Rn, immr, imms, invert); } -void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) { - EncodeLogicalImmInst(2, Rd, Rn, immr, imms); + EncodeLogicalImmInst(2, Rd, Rn, immr, imms, invert); } -void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) { - EncodeLogicalImmInst(1, Rd, Rn, immr, imms); + EncodeLogicalImmInst(1, Rd, Rn, immr, imms, invert); } -void ARM64XEmitter::TST(ARM64Reg Rn, u32 immr, u32 imms) +void ARM64XEmitter::TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert) { - EncodeLogicalImmInst(3, SP, Rn, immr, imms); + EncodeLogicalImmInst(3, Is64Bit(Rn) ? ZR : WZR, Rn, immr, imms, invert); } // Add/subtract (immediate) @@ -1164,6 +1523,15 @@ void ARM64XEmitter::UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) { EncodeBitfieldMOVInst(2, Rd, Rn, immr, imms); } +void ARM64XEmitter::EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift) +{ + bool sf = Is64Bit(Rd); + bool N = sf; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Write32((sf << 31) | (0x27 << 23) | (N << 22) | (Rm << 16) | (shift << 10) | (Rm << 5) | Rd); +} void ARM64XEmitter::SXTB(ARM64Reg Rd, ARM64Reg Rn) { SBFM(Rd, Rn, 0, 7); @@ -1175,7 +1543,6 @@ void ARM64XEmitter::SXTH(ARM64Reg Rd, ARM64Reg Rn) void ARM64XEmitter::SXTW(ARM64Reg Rd, ARM64Reg Rn) { _assert_msg_(DYNA_REC, Is64Bit(Rd), "%s requires 64bit register as destination", __FUNCTION__); - SBFM(Rd, Rn, 0, 31); } void ARM64XEmitter::UXTB(ARM64Reg Rd, ARM64Reg Rn) @@ -1485,46 +1852,56 @@ void ARM64XEmitter::ADRP(ARM64Reg Rd, s32 imm) EncodeAddressInst(1, Rd, imm >> 12); } -// Wrapper around MOVZ+MOVK +// Wrapper around MOVZ+MOVK (and later MOVN) void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize) { - unsigned parts = Is64Bit(Rd) ? 4 : 2; + unsigned int parts = Is64Bit(Rd) ? 4 : 2; BitSet32 upload_part(0); - bool need_movz = false; + + // Always start with a movz! Kills the dependency on the register. + bool use_movz = true; if (!imm) { - // Zero immediate, just clear the register - EOR(Rd, Rd, Rd, ArithOption(Rd, ST_LSL, 0)); + // Zero immediate, just clear the register. EOR is pointless when we have MOVZ, which looks clearer in disasm too. + MOVZ(Rd, 0, SHIFT_0); return; } if ((Is64Bit(Rd) && imm == std::numeric_limits::max()) || (!Is64Bit(Rd) && imm == std::numeric_limits::max())) { - // Max unsigned value + // Max unsigned value (or if signed, -1) // Set to ~ZR ARM64Reg ZR = Is64Bit(Rd) ? SP : WSP; ORN(Rd, ZR, ZR, ArithOption(ZR, ST_LSL, 0)); return; } + // TODO: Make some more systemic use of MOVN, but this will take care of most cases. + // Small negative integer. Use MOVN + if (!Is64Bit(Rd) && (imm | 0xFFFF0000) == imm) +{ + MOVN(Rd, ~imm, SHIFT_0); + return; + } + + + // XXX: Use MOVN when possible. // XXX: Optimize more // XXX: Support rotating immediates to save instructions if (optimize) { - for (unsigned i = 0; i < parts; ++i) + for (unsigned int i = 0; i < parts; ++i) { if ((imm >> (i * 16)) & 0xFFFF) upload_part[i] = 1; - else - need_movz = true; } } u64 aligned_pc = (u64)GetCodePtr() & ~0xFFF; s64 aligned_offset = (s64)imm - (s64)aligned_pc; - if (upload_part.Count() > 1 && std::abs(aligned_offset) < 0xFFFFFFFF) + if (upload_part.Count() > 1 && std::abs(aligned_offset) < 0xFFFFFFFFLL) { // Immediate we are loading is within 4GB of our aligned range // Most likely a address that we can load in one or two instructions @@ -1554,10 +1931,10 @@ void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize) for (unsigned i = 0; i < parts; ++i) { - if (need_movz && upload_part[i]) + if (use_movz && upload_part[i]) { MOVZ(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i); - need_movz = false; + use_movz = false; } else { @@ -1584,7 +1961,7 @@ void ARM64XEmitter::ABI_PushRegisters(BitSet32 registers) { if (first) { - STR(INDEX_PRE, (ARM64Reg)(X0 + it), SP, -stack_size); + STR(INDEX_PRE, (ARM64Reg)(X0 + it), SP, -(s32)stack_size); first = false; current_offset += 16; } @@ -1692,7 +2069,7 @@ void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, if (type == INDEX_UNSIGNED) { - _assert_msg_(DYNA_REC, !(imm & ((size - 1) >> 3)), "%s(INDEX_UNSIGNED) immediate offset must be aligned to size!", __FUNCTION__); + _assert_msg_(DYNA_REC, !(imm & ((size - 1) >> 3)), "%s(INDEX_UNSIGNED) immediate offset must be aligned to size! (%d) (%p)", __FUNCTION__, imm, m_emit->GetCodePtr()); _assert_msg_(DYNA_REC, imm >= 0, "%s(INDEX_UNSIGNED) immediate offset must be positive!", __FUNCTION__); if (size == 16) imm >>= 1; @@ -1714,11 +2091,11 @@ void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, encoded_imm |= 3; } - Write32((encoded_size << 30) | (0b1111 << 26) | (type == INDEX_UNSIGNED ? (1 << 24) : 0) | \ + Write32((encoded_size << 30) | (0xF << 26) | (type == INDEX_UNSIGNED ? (1 << 24) : 0) | \ (size == 128 ? (1 << 23) : 0) | (opc << 22) | (encoded_imm << 10) | (Rn << 5) | Rt); } -void ARM64FloatEmitter::Emit2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +void ARM64FloatEmitter::EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { _assert_msg_(DYNA_REC, !IsQuad(Rd), "%s only supports double and single registers!", __FUNCTION__); Rd = DecodeReg(Rd); @@ -1750,14 +2127,13 @@ void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd (1 << 10) | (Rn << 5) | Rd); } -void ARM64FloatEmitter::Emit2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) +void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) { _assert_msg_(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __FUNCTION__); - bool quad = IsQuad(Rd); Rd = DecodeReg(Rd); Rn = DecodeReg(Rn); - Write32((quad << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | \ + Write32((Q << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | \ (opcode << 12) | (1 << 11) | (Rn << 5) | Rd); } @@ -1780,7 +2156,7 @@ void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, Rn = DecodeReg(Rn); Rm = DecodeReg(Rm); - Write32((quad << 30) | (0b11011 << 23) | (L << 22) | (R << 21) | (Rm << 16) | \ + Write32((quad << 30) | (0x1B << 23) | (L << 22) | (R << 21) | (Rm << 16) | \ (opcode << 13) | (S << 12) | (size << 10) | (Rn << 5) | Rt); } @@ -1790,7 +2166,7 @@ void ARM64FloatEmitter::Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64R Rd = DecodeReg(Rd); Rn = DecodeReg(Rn); - Write32((M << 31) | (S << 29) | (0b11110001 << 21) | (type << 22) | (opcode << 15) | \ + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | \ (1 << 14) | (Rn << 5) | Rd); } @@ -1800,10 +2176,70 @@ void ARM64FloatEmitter::EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 Rd = DecodeReg(Rd); Rn = DecodeReg(Rn); - Write32((sf << 31) | (S << 29) | (0b11110001 << 21) | (type << 22) | (rmode << 19) | \ + Write32((sf << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (rmode << 19) | \ (opcode << 16) | (Rn << 5) | Rd); } +void ARM64FloatEmitter::EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, bool sign) +{ + _dbg_assert_msg_(DYNA_REC, IsScalar(Rn), "fcvts: Rn must be floating point"); + if (IsGPR(Rd)) + { + // Use the encoding that transfers the result to a GPR. + bool sf = Is64Bit(Rd); + int type = IsDouble(Rn) ? 1 : 0; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int opcode = (sign ? 1 : 0); + int rmode = 0; + switch (round) + { + case ROUND_A: rmode = 0; opcode |= 4; break; + case ROUND_P: rmode = 1; break; + case ROUND_M: rmode = 2; break; + case ROUND_Z: rmode = 3; break; + case ROUND_N: rmode = 0; break; + } + EmitConversion2(sf, 0, true, type, rmode, opcode, 0, Rd, Rn); + } + else + { + // Use the encoding (vector, single) that keeps the result in the fp register. + int sz = IsDouble(Rn); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int opcode = 0; + switch (round) + { + case ROUND_A: opcode = 0x1C; break; + case ROUND_N: opcode = 0x1A; break; + case ROUND_M: opcode = 0x1B; break; + case ROUND_P: opcode = 0x1A; sz |= 2; break; + case ROUND_Z: opcode = 0x1B; sz |= 2; break; + } + Write32((0x5E << 24) | (sign << 29) | (sz << 22) | (1 << 21) | (opcode << 12) | (2 << 10) | (Rn << 5) | Rd); + } +} + +void ARM64FloatEmitter::FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round) +{ + EmitConvertScalarToInt(Rd, Rn, round, false); +} + +void ARM64FloatEmitter::FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round) +{ + EmitConvertScalarToInt(Rd, Rn, round, true); +} + +void ARM64FloatEmitter::EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, u32 opcode, int scale, ARM64Reg Rd, ARM64Reg Rn) +{ + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((sf << 31) | (S << 29) | (0xF0 << 21) | (direction << 21) | (type << 22) | (rmode << 19) | \ + (opcode << 16) | (scale << 10) | (Rn << 5) | Rd); +} + void ARM64FloatEmitter::EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm) { _assert_msg_(DYNA_REC, !IsQuad(Rn), "%s doesn't support vector!", __FUNCTION__); @@ -1812,7 +2248,7 @@ void ARM64FloatEmitter::EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Re Rn = DecodeReg(Rn); Rm = DecodeReg(Rm); - Write32((M << 31) | (S << 29) | (0b11110001 << 21) | (is_double << 22) | (Rm << 16) | \ + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | \ (op << 14) | (1 << 13) | (Rn << 5) | opcode2); } @@ -1825,8 +2261,8 @@ void ARM64FloatEmitter::EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd Rn = DecodeReg(Rn); Rm = DecodeReg(Rm); - Write32((M << 31) | (S << 29) | (0b11110001 << 21) | (is_double << 22) | (Rm << 16) | \ - (cond << 12) | (0b11 << 10) | (Rn << 5) | Rd); + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | \ + (cond << 12) | (3 << 10) | (Rn << 5) | Rd); } void ARM64FloatEmitter::EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) @@ -1847,11 +2283,11 @@ void ARM64FloatEmitter::EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, Rn = DecodeReg(Rn); Rm = DecodeReg(Rm); - Write32((quad << 30) | (0b111 << 25) | (encoded_size << 22) | (Rm << 16) | (op << 12) | \ + Write32((quad << 30) | (7 << 25) | (encoded_size << 22) | (Rm << 16) | (op << 12) | \ (1 << 11) | (Rn << 5) | Rd); } -void ARM64FloatEmitter::EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm) +void ARM64FloatEmitter::EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8) { _assert_msg_(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __FUNCTION__); @@ -1859,22 +2295,29 @@ void ARM64FloatEmitter::EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64R Rd = DecodeReg(Rd); - Write32((M << 31) | (S << 29) | (0b11110001 << 21) | (is_double << 22) | (type << 22) | \ - (imm << 13) | (1 << 12) | (imm5 << 5) | Rd); + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (type << 22) | \ + (imm8 << 13) | (1 << 12) | (imm5 << 5) | Rd); } -void ARM64FloatEmitter::EmitShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) +void ARM64FloatEmitter::EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) { - bool quad = IsQuad(Rd); - _assert_msg_(DYNA_REC, immh, "%s bad encoding! Can't have zero immh", __FUNCTION__); Rd = DecodeReg(Rd); Rn = DecodeReg(Rn); - Write32((quad << 30) | (U << 29) | (0b1111 << 24) | (immh << 19) | (immb << 16) | \ + Write32((Q << 30) | (U << 29) | (0xF << 24) | (immh << 19) | (immb << 16) | \ (opcode << 11) | (1 << 10) | (Rn << 5) | Rd); } + +void ARM64FloatEmitter::EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) +{ + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((2 << 30) | (U << 29) | (0x3E << 23) | (immh << 19) | (immb << 16) | (opcode << 11) | (1 << 10) | (Rn << 5) | Rd); +} + void ARM64FloatEmitter::EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn) { bool quad = IsQuad(Rt); @@ -1922,7 +2365,7 @@ void ARM64FloatEmitter::EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, Rd = DecodeReg(Rd); Rn = DecodeReg(Rn); - Write32((M << 31) | (S << 29) | (0b11110001 << 21) | (type << 22) | \ + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | \ (opcode << 15) | (1 << 14) | (Rn << 5) | Rd); } @@ -1934,7 +2377,7 @@ void ARM64FloatEmitter::EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, Rn = DecodeReg(Rn); Rm = DecodeReg(Rm); - Write32((quad << 30) | (U << 29) | (0b01111 << 24) | (size << 22) | (L << 21) | \ + Write32((quad << 30) | (U << 29) | (0xF << 24) | (size << 22) | (L << 21) | \ (Rm << 16) | (opcode << 12) | (H << 11) | (Rn << 5) | Rd); } @@ -1944,7 +2387,7 @@ void ARM64FloatEmitter::EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM Rt = DecodeReg(Rt); Rn = DecodeReg(Rn); - Write32((size << 30) | (0b1111 << 26) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt); + Write32((size << 30) | (0xF << 26) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt); } void ARM64FloatEmitter::EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) @@ -2084,7 +2527,7 @@ void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) if (size == 8) { - S = index & 4; + S = (index & 4) != 0; opcode = 0; encoded_size = index & 3; if (index & 8) @@ -2095,7 +2538,7 @@ void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) } else if (size == 16) { - S = index & 2; + S = (index & 2) != 0; opcode = 2; encoded_size = (index & 1) << 1; if (index & 4) @@ -2106,7 +2549,7 @@ void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) } else if (size == 32) { - S = index & 1; + S = (index & 1) != 0; opcode = 4; encoded_size = 0; if (index & 2) @@ -2137,7 +2580,7 @@ void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Re if (size == 8) { - S = index & 4; + S = (index & 4) != 0; opcode = 0; encoded_size = index & 3; if (index & 8) @@ -2148,7 +2591,7 @@ void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Re } else if (size == 16) { - S = index & 2; + S = (index & 2) != 0; opcode = 2; encoded_size = (index & 1) << 1; if (index & 4) @@ -2159,7 +2602,7 @@ void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Re } else if (size == 32) { - S = index & 1; + S = (index & 1) != 0; opcode = 4; encoded_size = 0; if (index & 2) @@ -2183,7 +2626,7 @@ void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Re void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn) { - EmitLoadStoreSingleStructure(1, 0, 0b110, 0, size >> 4, Rt, Rn); + EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn); } void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) @@ -2195,7 +2638,7 @@ void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) if (size == 8) { - S = index & 4; + S = (index & 4) != 0; opcode = 0; encoded_size = index & 3; if (index & 8) @@ -2206,7 +2649,7 @@ void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) } else if (size == 16) { - S = index & 2; + S = (index & 2) != 0; opcode = 2; encoded_size = (index & 1) << 1; if (index & 4) @@ -2217,7 +2660,7 @@ void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) } else if (size == 32) { - S = index & 1; + S = (index & 1) != 0; opcode = 4; encoded_size = 0; if (index & 2) @@ -2248,7 +2691,7 @@ void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Re if (size == 8) { - S = index & 4; + S = (index & 4) != 0; opcode = 0; encoded_size = index & 3; if (index & 8) @@ -2259,7 +2702,7 @@ void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Re } else if (size == 16) { - S = index & 2; + S = (index & 2) != 0; opcode = 2; encoded_size = (index & 1) << 1; if (index & 4) @@ -2270,7 +2713,7 @@ void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Re } else if (size == 32) { - S = index & 1; + S = (index & 1) != 0; opcode = 4; encoded_size = 0; if (index & 2) @@ -2354,6 +2797,39 @@ void ARM64FloatEmitter::ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM6 EmitLoadStoreMultipleStructurePost(size, 0, opcode, Rt, Rn, Rm); } +// Scalar - 1 Source +void ARM64FloatEmitter::FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top) +{ + if (IsScalar(Rd) && IsScalar(Rn)) + { + EmitScalar1Source(0, 0, IsDouble(Rd), 0, Rd, Rn); + } + else + { + _assert_msg_(DYNA_REC, !IsQuad(Rd) && !IsQuad(Rn), "FMOV can't move to/from quads"); + int rmode = 0; + int opcode = 6; + int sf = 0; + if (IsSingle(Rd) && !Is64Bit(Rn) && !top) + { + // GPR to scalar single + opcode |= 1; + } + else if (!Is64Bit(Rd) && IsSingle(Rn) && !top) + { + // Scalar single to GPR - defaults are correct + } + else + { + // TODO + _assert_msg_(DYNA_REC, 0, "FMOV: Unhandled case"); + } + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((sf << 31) | (0x1e2 << 20) | (rmode << 19) | (opcode << 16) | (Rn << 5) | Rd); + } +} + // Loadstore paired void ARM64FloatEmitter::LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) { @@ -2364,44 +2840,101 @@ void ARM64FloatEmitter::STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, EncodeLoadStorePair(size, false, type, Rt, Rt2, Rn, imm); } -// Scalar - 1 Source void ARM64FloatEmitter::FABS(ARM64Reg Rd, ARM64Reg Rn) { EmitScalar1Source(0, 0, IsDouble(Rd), 1, Rd, Rn); } void ARM64FloatEmitter::FNEG(ARM64Reg Rd, ARM64Reg Rn) { - EmitScalar1Source(0, 0, IsDouble(Rd), 0b000010, Rd, Rn); + EmitScalar1Source(0, 0, IsDouble(Rd), 2, Rd, Rn); } +void ARM64FloatEmitter::FSQRT(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar1Source(0, 0, IsDouble(Rd), 3, Rd, Rn); +} + // Scalar - 2 Source void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - Emit2Source(0, 0, IsDouble(Rd), 0b0010, Rd, Rn, Rm); + EmitScalar2Source(0, 0, IsDouble(Rd), 2, Rd, Rn, Rm); } void ARM64FloatEmitter::FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - Emit2Source(0, 0, IsDouble(Rd), 0, Rd, Rn, Rm); + EmitScalar2Source(0, 0, IsDouble(Rd), 0, Rd, Rn, Rm); } void ARM64FloatEmitter::FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - Emit2Source(0, 0, IsDouble(Rd), 0b0011, Rd, Rn, Rm); + EmitScalar2Source(0, 0, IsDouble(Rd), 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 1, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 4, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 5, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 6, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 7, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 8, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 0); +} +void ARM64FloatEmitter::FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 1); +} +void ARM64FloatEmitter::FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 2); +} +void ARM64FloatEmitter::FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 3); +} + +void ARM64FloatEmitter::EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra, int opcode) +{ + int type = isDouble ? 1 : 0; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Ra = DecodeReg(Ra); + int o1 = opcode >> 1; + int o0 = opcode & 1; + m_emit->Write32((0x1F << 24) | (type << 22) | (o1 << 21) | (Rm << 16) | (o0 << 15) | (Ra << 10) | (Rn << 5) | Rd); } // Scalar floating point immediate -void ARM64FloatEmitter::FMOV(ARM64Reg Rd, u32 imm) +void ARM64FloatEmitter::FMOV(ARM64Reg Rd, uint8_t imm8) { - EmitScalarImm(0, 0, 0, 0, Rd, imm); + EmitScalarImm(0, 0, 0, 0, Rd, imm8); } // Vector void ARM64FloatEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EmitThreeSame(0, 0, 0b00011, Rd, Rn, Rm); + EmitThreeSame(0, 0, 3, Rd, Rn, Rm); } void ARM64FloatEmitter::BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EmitThreeSame(1, 1, 0b00011, Rd, Rn, Rm); + EmitThreeSame(1, 1, 3, Rd, Rn, Rm); } void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) { @@ -2432,79 +2965,102 @@ void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) } void ARM64FloatEmitter::FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(0, 2 | (size >> 6), 0b01111, Rd, Rn); + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xF, Rd, Rn); } void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EmitThreeSame(0, size >> 6, 0b11010, Rd, Rn, Rm); + EmitThreeSame(0, size >> 6, 0x1A, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0x19, Rd, Rn, Rm); } void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(0, size >> 6, 0b10111, Rd, Rn); + Emit2RegMisc(false, 0, size >> 6, 0x17, Rd, Rn); +} +void ARM64FloatEmitter::FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 0, size >> 6, 0x17, Rd, Rn); } void ARM64FloatEmitter::FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(0, dest_size >> 5, 0b10110, Rd, Rn); + Emit2RegMisc(IsQuad(Rd), 0, dest_size >> 5, 0x16, Rd, Rn); } void ARM64FloatEmitter::FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(0, 2 | (size >> 6), 0b11011, Rd, Rn); + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1B, Rd, Rn); } void ARM64FloatEmitter::FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(1, 2 | (size >> 6), 0b11011, Rd, Rn); + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1B, Rd, Rn); } void ARM64FloatEmitter::FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EmitThreeSame(1, size >> 6, 0b11111, Rd, Rn, Rm); + EmitThreeSame(1, size >> 6, 0x1F, Rd, Rn, Rm); } void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EmitThreeSame(1, size >> 6, 0b11011, Rd, Rn, Rm); + EmitThreeSame(1, size >> 6, 0x1B, Rd, Rn, Rm); } void ARM64FloatEmitter::FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(1, 2 | (size >> 6), 0b01111, Rd, Rn); + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xF, Rd, Rn); } void ARM64FloatEmitter::FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(1, 2 | (size >> 6), 0b11101, Rd, Rn); + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1D, Rd, Rn); } void ARM64FloatEmitter::FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EmitThreeSame(0, 2 | (size >> 6), 0b11010, Rd, Rn, Rm); + EmitThreeSame(0, 2 | (size >> 6), 0x1A, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2 | (size >> 6), 0x19, Rd, Rn, Rm); } void ARM64FloatEmitter::NOT(ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(1, 0, 0b00101, Rd, Rn); + Emit2RegMisc(IsQuad(Rd), 1, 0, 5, Rd, Rn); } void ARM64FloatEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EmitThreeSame(0, 2, 0b00011, Rd, Rn, Rm); + EmitThreeSame(0, 2, 3, Rd, Rn, Rm); } void ARM64FloatEmitter::REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(0, size >> 4, 1, Rd, Rn); + Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 1, Rd, Rn); } void ARM64FloatEmitter::REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(1, size >> 4, 0, Rd, Rn); + Emit2RegMisc(IsQuad(Rd), 1, size >> 4, 0, Rd, Rn); } void ARM64FloatEmitter::REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(0, size >> 4, 0, Rd, Rn); + Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 0, Rd, Rn); } void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(0, size >> 6, 0b11101, Rd, Rn); + Emit2RegMisc(IsQuad(Rd), 0, size >> 6, 0x1D, Rd, Rn); } void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(1, size >> 6, 0b11101, Rd, Rn); + Emit2RegMisc(IsQuad(Rd), 1, size >> 6, 0x1D, Rd, Rn); } +void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + int imm = size * 2 - scale; + EmitShiftImm(IsQuad(Rd), 0, imm >> 3, imm & 7, 0x1C, Rd, Rn); +} +void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + int imm = size * 2 - scale; + EmitShiftImm(IsQuad(Rd), 1, imm >> 3, imm & 7, 0x1C, Rd, Rn); +} + void ARM64FloatEmitter::XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(0, dest_size >> 4, 0b10010, Rd, Rn); + Emit2RegMisc(IsQuad(Rd), 0, dest_size >> 4, 0x12, Rd, Rn); } // Move @@ -2521,7 +3077,7 @@ void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn) else if (size == 64) imm5 = 8; - EmitCopy(IsQuad(Rd), 0, imm5, 0b0001, Rd, Rn); + EmitCopy(IsQuad(Rd), 0, imm5, 1, Rd, Rn); } void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn) @@ -2549,7 +3105,7 @@ void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn) imm5 |= index << 4; } - EmitCopy(1, 0, imm5, 0b0011, Rd, Rn); + EmitCopy(1, 0, imm5, 3, Rd, Rn); } void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2) { @@ -2611,14 +3167,13 @@ void ARM64FloatEmitter::UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) imm5 |= index << 4; } - EmitCopy(b64Bit, 0, imm5, 0b0111, Rd, Rn); + EmitCopy(b64Bit, 0, imm5, 7, Rd, Rn); } void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) { bool b64Bit = Is64Bit(Rd); _assert_msg_(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __FUNCTION__); _assert_msg_(DYNA_REC, size != 64, "%s doesn't support 64bit destination. Use UMOV!", __FUNCTION__); - _assert_msg_(DYNA_REC, !b64Bit && size != 32, "%s doesn't support 32bit move to 32bit register. Use UMOV!", __FUNCTION__); u32 imm5 = 0; if (size == 8) @@ -2637,7 +3192,7 @@ void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) imm5 |= index << 3; } - EmitCopy(b64Bit, 0, imm5, 0b0101, Rd, Rn); + EmitCopy(b64Bit, 0, imm5, 5, Rd, Rn); } // One source @@ -2660,44 +3215,70 @@ void ARM64FloatEmitter::FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn) else if (size_from == 64) src_encoding = 1; - Emit1Source(0, 0, src_encoding, 0b100 | dst_encoding, Rd, Rn); -} - -// Conversion between float and integer -void ARM64FloatEmitter::FMOV(u8 size, bool top, ARM64Reg Rd, ARM64Reg Rn) -{ - bool sf = size == 64 ? true : false; - u32 type = 0; - u32 rmode = top ? 1 : 0; - if (size == 64) - { - if (top) - type = 2; - else - type = 1; - } - - EmitConversion(sf, 0, type, rmode, IsVector(Rd) ? 0b111 : 0b110, Rd, Rn); + Emit1Source(0, 0, src_encoding, 4 | dst_encoding, Rd, Rn); } void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn) { - bool sf = Is64Bit(Rn); - u32 type = 0; - if (IsDouble(Rd)) - type = 1; - - EmitConversion(sf, 0, type, 0, 0b010, Rd, Rn); + if (IsScalar(Rn)) + { + // Source is in FP register (like destination!). We must use a vector encoding. + bool sign = false; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int sz = IsDouble(Rn); + Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd); + } + else + { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + EmitConversion(sf, 0, type, 0, 2, Rd, Rn); + } } void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn) +{ + if (IsScalar(Rn)) + { + // Source is in FP register (like destination!). We must use a vector encoding. + bool sign = true; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int sz = IsDouble(Rn); + Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd); + } + else + { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion(sf, 0, type, 0, 3, Rd, Rn); + } +} + +void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale) { bool sf = Is64Bit(Rn); u32 type = 0; if (IsDouble(Rd)) type = 1; - EmitConversion(sf, 0, type, 0, 0b011, Rd, Rn); + EmitConversion2(sf, 0, false, type, 0, 2, 64 - scale, Rd, Rn); +} + +void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion2(sf, 0, false, type, 0, 3, 64 - scale, Rd, Rn); } void ARM64FloatEmitter::FCMP(ARM64Reg Rn, ARM64Reg Rm) @@ -2706,47 +3287,47 @@ void ARM64FloatEmitter::FCMP(ARM64Reg Rn, ARM64Reg Rm) } void ARM64FloatEmitter::FCMP(ARM64Reg Rn) { - EmitCompare(0, 0, 0, 0b01000, Rn, (ARM64Reg)0); + EmitCompare(0, 0, 0, 8, Rn, (ARM64Reg)0); } void ARM64FloatEmitter::FCMPE(ARM64Reg Rn, ARM64Reg Rm) { - EmitCompare(0, 0, 0, 0b10000, Rn, Rm); + EmitCompare(0, 0, 0, 0x10, Rn, Rm); } void ARM64FloatEmitter::FCMPE(ARM64Reg Rn) { - EmitCompare(0, 0, 0, 0b11000, Rn, (ARM64Reg)0); + EmitCompare(0, 0, 0, 0x18, Rn, (ARM64Reg)0); } void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EmitThreeSame(0, size >> 6, 0b11100, Rd, Rn, Rm); + EmitThreeSame(0, size >> 6, 0x1C, Rd, Rn, Rm); } void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(0, 2 | (size >> 6), 0b01101, Rd, Rn); + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1D, Rd, Rn); } void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EmitThreeSame(1, size >> 6, 0b11100, Rd, Rn, Rm); + EmitThreeSame(1, size >> 6, 0x1C, Rd, Rn, Rm); } void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(1, 2 | (size >> 6), 0b01100, Rd, Rn); + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1C, Rd, Rn); } void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - EmitThreeSame(1, 2 | (size >> 6), 0b11100, Rd, Rn, Rm); + EmitThreeSame(1, 2 | (size >> 6), 0x1C, Rd, Rn, Rm); } void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(0, 2 | (size >> 6), 0b01100, Rd, Rn); + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x0C, Rd, Rn); } void ARM64FloatEmitter::FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(1, 2 | (size >> 6), 0b01101, Rd, Rn); + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xD, Rd, Rn); } void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(0, 2 | (size >> 6), 0b01110, Rd, Rn); + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn); } void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) @@ -2783,26 +3364,46 @@ void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) // Shift by immediate void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { - _assert_msg_(DYNA_REC, shift < src_size, "%s shift amount must less than the element size!", __FUNCTION__); - u32 immh = 0; - u32 immb = shift & 0xFFF; - - if (src_size == 8) - { - immh = 1; - } - else if (src_size == 16) - { - immh = 2 | ((shift >> 3) & 1); - } - else if (src_size == 32) - { - immh = 4 | ((shift >> 3) & 3);; - } - EmitShiftImm(0, immh, immb, 0b10100, Rd, Rn); + SSHLL(src_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SSHLL(src_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SHRN(dest_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SHRN(dest_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + USHLL(src_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + USHLL(src_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + SXTL(src_size, Rd, Rn, false); +} +void ARM64FloatEmitter::SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + SXTL(src_size, Rd, Rn, true); +} +void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + UXTL(src_size, Rd, Rn, false); +} +void ARM64FloatEmitter::UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + UXTL(src_size, Rd, Rn, true); } -void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) { _assert_msg_(DYNA_REC, shift < src_size, "%s shift amount must less than the element size!", __FUNCTION__); u32 immh = 0; @@ -2820,10 +3421,31 @@ void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { immh = 4 | ((shift >> 3) & 3);; } - EmitShiftImm(1, immh, immb, 0b10100, Rd, Rn); + EmitShiftImm(upper, 0, immh, immb, 0b10100, Rd, Rn); } -void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) +{ + _assert_msg_(DYNA_REC, shift < src_size, "%s shift amount must less than the element size!", __FUNCTION__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (src_size == 8) + { + immh = 1; + } + else if (src_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (src_size == 32) + { + immh = 4 | ((shift >> 3) & 3);; + } + EmitShiftImm(upper, 1, immh, immb, 0b10100, Rd, Rn); +} + +void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) { _assert_msg_(DYNA_REC, shift < dest_size, "%s shift amount must less than the element size!", __FUNCTION__); u32 immh = 0; @@ -2841,17 +3463,17 @@ void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { immh = 4 | ((shift >> 3) & 3);; } - EmitShiftImm(1, immh, immb, 0b10000, Rd, Rn); + EmitShiftImm(upper, 1, immh, immb, 0b10000, Rd, Rn); } -void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper) { - SSHLL(src_size, Rd, Rn, 0); + SSHLL(src_size, Rd, Rn, 0, upper); } -void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper) { - USHLL(src_size, Rd, Rn, 0); + USHLL(src_size, Rd, Rn, 0, upper); } // vector x indexed element @@ -2861,7 +3483,6 @@ void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 bool L = false; bool H = false; - if (size == 32) { L = index & 1; @@ -2872,7 +3493,26 @@ void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 H = index == 1; } - EmitVectorxElement(0, 2 | (size >> 6), L, 0b1001, H, Rd, Rn, Rm); + EmitVectorxElement(0, 2 | (size >> 6), L, 0x9, H, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index) +{ + _assert_msg_(DYNA_REC, size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __FUNCTION__); + + bool L = false; + bool H = false; + if (size == 32) + { + L = index & 1; + H = (index >> 1) & 1; + } + else if (size == 64) + { + H = index == 1; + } + + EmitVectorxElement(0, 2 | (size >> 6), L, 1, H, Rd, Rn, Rm); } void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp) @@ -3032,5 +3672,268 @@ void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp) } } + +void ARM64XEmitter::ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (!Is64Bit(Rn)) + imm &= 0xFFFFFFFF; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + AND(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + _assert_msg_(DYNA_REC, scratch != INVALID_REG, "ANDSI2R - failed to construct logical immediate value from %08x, need scratch", (u32)imm); + MOVI2R(scratch, imm); + AND(Rd, Rn, scratch); + } } +void ARM64XEmitter::ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + ORR(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + _assert_msg_(DYNA_REC, scratch != INVALID_REG, "ORRI2R - failed to construct logical immediate value from %08x, need scratch", (u32)imm); + MOVI2R(scratch, imm); + ORR(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + EOR(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + _assert_msg_(DYNA_REC, scratch != INVALID_REG, "EORI2R - failed to construct logical immediate value from %08x, need scratch", (u32)imm); + MOVI2R(scratch, imm); + EOR(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + ANDS(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + _assert_msg_(DYNA_REC, scratch != INVALID_REG, "ANDSI2R - failed to construct logical immediate value from %08x, need scratch", (u32)imm); + MOVI2R(scratch, imm); + ANDS(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + { + ADD(Rd, Rn, val, shift); + } + else + { + _assert_msg_(DYNA_REC, scratch != INVALID_REG, "ADDI2R - failed to construct arithmetic immediate value from %08x, need scratch", (u32)imm); + MOVI2R(scratch, imm); + ADD(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + { + SUB(Rd, Rn, val, shift); + } + else + { + _assert_msg_(DYNA_REC, scratch != INVALID_REG, "SUBI2R - failed to construct arithmetic immediate value from %08x, need scratch", (u32)imm); + MOVI2R(scratch, imm); + SUB(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + { + CMP(Rn, val, shift); + } + else + { + _assert_msg_(DYNA_REC, scratch != INVALID_REG, "CMPI2R - failed to construct arithmetic immediate value from %08x, need scratch", (u32)imm); + MOVI2R(scratch, imm); + CMP(Rn, scratch); + } +} + +bool ARM64XEmitter::TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + ADD(Rd, Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + SUB(Rd, Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TryCMPI2R(ARM64Reg Rn, u32 imm) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + CMP(Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + AND(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} +bool ARM64XEmitter::TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + ORR(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} +bool ARM64XEmitter::TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + EOR(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} + +float FPImm8ToFloat(uint8_t bits) +{ + int sign = bits >> 7; + uint32_t f = (sign << 31); + int bit6 = (bits >> 6) & 1; + uint32_t exp = ((!bit6) << 7) | (0x7C * bit6) | ((bits >> 4) & 3); + uint32_t mantissa = (bits & 0xF) << 19; + f |= exp << 23; + f |= mantissa; + float fl; + memcpy(&fl, &f, sizeof(float)); + return fl; +} + +bool FPImm8FromFloat(float value, uint8_t *immOut) +{ + uint32_t f; + memcpy(&f, &value, sizeof(float)); + uint32_t mantissa4 = (f & 0x7FFFFF) >> 19; + uint32_t exponent = (f >> 23) & 0xFF; + uint32_t sign = f >> 31; + if ((exponent >> 7) == ((exponent >> 6) & 1)) + return false; + uint8_t imm8 = (sign << 7) | ((!(exponent >> 7)) << 6) | ((exponent & 3) << 4) | mantissa4; + float newFloat = FPImm8ToFloat(imm8); + if (newFloat == value) + *immOut = imm8; + else + return false; + return true; +} + +void ARM64FloatEmitter::MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch, bool negate) +{ + _assert_msg_(DYNA_REC, !IsDouble(Rd), "MOVI2F does not yet support double precision"); + uint8_t imm8; + if (value == 0.0) + { + FMOV(Rd, IsDouble(Rd) ? ZR : WZR); + if (negate) + FNEG(Rd, Rd); + // TODO: There are some other values we could generate with the float-imm instruction, like 1.0... + } + else if (FPImm8FromFloat(value, &imm8)) + { + FMOV(Rd, imm8); + } + else + { + _assert_msg_(DYNA_REC, scratch != INVALID_REG, "Failed to find a way to generate FP immediate %f without scratch", value); + u32 ival; + if (negate) + value = -value; + + memcpy(&ival, &value, sizeof(ival)); + m_emit->MOVI2R(scratch, ival); + FMOV(Rd, scratch); + } +} + +// TODO: Quite a few values could be generated easily using the MOVI instruction and friends. +void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch) +{ + // TODO: Make it work with more element sizes + // TODO: Optimize - there are shorter solution for many values + ARM64Reg s = (ARM64Reg)(S0 + DecodeReg(Rd)); + MOVI2F(s, value, scratch); + DUP(32, Rd, Rd, 0); +} + +void ARM64XEmitter::SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + { + SUBS(Rd, Rn, val, shift); + } + else + { + _assert_msg_(DYNA_REC, scratch != INVALID_REG, "ANDSI2R - failed to construct immediate value from %08x, need scratch", (u32)imm); + MOVI2R(scratch, imm); + SUBS(Rd, Rn, scratch); + } +} + +} // namespace diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index 7281970889..c6be23a966 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -1,4 +1,4 @@ -// Copyright 2014 Dolphin Emulator Project +// Copyright 2015 Dolphin Emulator Project // Licensed under GPLv2+ // Refer to the license.txt file included. @@ -74,19 +74,34 @@ enum ARM64Reg PLTL2KEEP, PLTL2STRM, PLTL3KEEP, PLTL3STRM, + WZR = WSP, + ZR = SP, + INVALID_REG = 0xFFFFFFFF }; -inline bool Is64Bit(ARM64Reg reg) { return reg & 0x20; } +inline bool Is64Bit(ARM64Reg reg) { return (reg & 0x20) != 0; } inline bool IsSingle(ARM64Reg reg) { return (reg & 0xC0) == 0x40; } inline bool IsDouble(ARM64Reg reg) { return (reg & 0xC0) == 0x80; } +inline bool IsScalar(ARM64Reg reg) { return IsSingle(reg) || IsDouble(reg); } inline bool IsQuad(ARM64Reg reg) { return (reg & 0xC0) == 0xC0; } inline bool IsVector(ARM64Reg reg) { return (reg & 0xC0) != 0; } +inline bool IsGPR(ARM64Reg reg) { return (int)reg < 0x40; } + inline ARM64Reg DecodeReg(ARM64Reg reg) { return (ARM64Reg)(reg & 0x1F); } inline ARM64Reg EncodeRegTo64(ARM64Reg reg) { return (ARM64Reg)(reg | 0x20); } +inline ARM64Reg EncodeRegToSingle(ARM64Reg reg) { return (ARM64Reg)(DecodeReg(reg) + S0); } inline ARM64Reg EncodeRegToDouble(ARM64Reg reg) { return (ARM64Reg)((reg & ~0xC0) | 0x80); } inline ARM64Reg EncodeRegToQuad(ARM64Reg reg) { return (ARM64Reg)(reg | 0xC0); } +// For AND/TST/ORR/EOR etc +bool IsImmLogical(uint64_t value, unsigned int width, unsigned int *n, unsigned int *imm_s, unsigned int *imm_r); +// For ADD/SUB +bool IsImmArithmetic(uint64_t input, u32 *val, bool *shift); + +float FPImm8ToFloat(uint8_t bits); +bool FPImm8FromFloat(float value, uint8_t *immOut); + enum OpType { TYPE_IMM = 0, @@ -109,8 +124,7 @@ enum IndexType INDEX_UNSIGNED, INDEX_POST, INDEX_PRE, - // Only for VFP loadstore paired - INDEX_SIGNED, + INDEX_SIGNED, // used in LDP/STP }; enum ShiftAmount @@ -121,12 +135,12 @@ enum ShiftAmount SHIFT_48 = 3, }; -enum ExtendType -{ - EXTEND_UXTW = 2, - EXTEND_LSL = 3, // Default for zero shift amount - EXTEND_SXTW = 6, - EXTEND_SXTX = 7, +enum RoundingMode { + ROUND_A, // round to nearest, ties to away + ROUND_M, // round towards -inf + ROUND_N, // round to nearest, ties to even + ROUND_P, // round towards +inf + ROUND_Z, // round towards zero }; struct FixupBranch @@ -157,6 +171,9 @@ enum PStateField FIELD_SPSel = 0, FIELD_DAIFSet, FIELD_DAIFClr, + FIELD_NZCV, // The only system registers accessible from EL0 (user space) + FIELD_FPCR = 0x340, + FIELD_FPSR = 0x341, }; enum SystemHint @@ -252,6 +269,7 @@ public: m_width = WIDTH_32BIT; m_extend = EXTEND_UXTW; } + m_shifttype = ST_LSL; } ArithOption(ARM64Reg Rd, ShiftType shift_type, u32 shift) { @@ -333,7 +351,7 @@ private: void EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); void EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); void EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, ARM64Reg Rd); - void EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, int n); void EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm); void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); @@ -398,7 +416,7 @@ public: // Unconditional Branch (register) void BR(ARM64Reg Rn); void BLR(ARM64Reg Rn); - void RET(ARM64Reg Rn); + void RET(ARM64Reg Rn = X30); void ERET(); void DRPS(); @@ -414,6 +432,10 @@ public: // System void _MSR(PStateField field, u8 imm); + + void _MSR(PStateField field, ARM64Reg Rt); + void MRS(ARM64Reg Rt, PStateField field); + void HINT(SystemHint op); void CLREX(); void DSB(BarrierType type); @@ -454,6 +476,17 @@ public: void CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); void CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + // Aliases + void CSET(ARM64Reg Rd, CCFlags cond) + { + ARM64Reg zr = Is64Bit(Rd) ? ZR : WZR; + CSINC(Rd, zr, zr, (CCFlags)((u32)cond ^ 1)); + } + void NEG(ARM64Reg Rd, ARM64Reg Rs) + { + SUB(Rd, Is64Bit(Rd) ? ZR : WZR, Rs); + } + // Data-Processing 1 source void RBIT(ARM64Reg Rd, ARM64Reg Rn); void REV16(ARM64Reg Rd, ARM64Reg Rn); @@ -500,15 +533,34 @@ public: void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + + // Wrap the above for saner syntax + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { AND(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BIC(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORN(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EOR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EON(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ANDS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BICS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + + // Convenience wrappers around ORR. These match the official convenience syntax. + void MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift); void MOV(ARM64Reg Rd, ARM64Reg Rm); void MVN(ARM64Reg Rd, ARM64Reg Rm); + // TODO: These are "slow" as they use arith+shift, should be replaced with UBFM/EXTR variants. + void LSR(ARM64Reg Rd, ARM64Reg Rm, int shift); + void LSL(ARM64Reg Rd, ARM64Reg Rm, int shift); + void ASR(ARM64Reg Rd, ARM64Reg Rm, int shift); + void ROR(ARM64Reg Rd, ARM64Reg Rm, int shift); + // Logical (immediate) - void AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); - void ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); - void EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); - void ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); - void TST(ARM64Reg Rn, u32 immr, u32 imms); + void AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); // Add/subtract (immediate) void ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); @@ -526,12 +578,22 @@ public: void BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); void SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); void UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + + // Extract register (ROR with two inputs, if same then faster on A67) + void EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift); + + // Aliases void SXTB(ARM64Reg Rd, ARM64Reg Rn); void SXTH(ARM64Reg Rd, ARM64Reg Rn); void SXTW(ARM64Reg Rd, ARM64Reg Rn); void UXTB(ARM64Reg Rd, ARM64Reg Rn); void UXTH(ARM64Reg Rd, ARM64Reg Rn); + void UBFX(ARM64Reg Rd, ARM64Reg Rn, int lsb, int width) + { + UBFM(Rd, Rn, lsb, lsb + width <= (Is64Bit(Rn) ? 64 : 32)); + } + // Load Register (Literal) void LDR(ARM64Reg Rt, u32 imm); void LDRSW(ARM64Reg Rt, u32 imm); @@ -610,6 +672,32 @@ public: // Wrapper around MOVZ+MOVK void MOVI2R(ARM64Reg Rd, u64 imm, bool optimize = true); + template + void MOVP2R(ARM64Reg Rd, P *ptr) + { + _assert_msg_(DYNA_REC, Is64Bit(Rd), "Can't store pointers in 32-bit registers"); + MOVI2R(Rd, (uintptr_t)ptr); + } + + // Wrapper around AND x, y, imm etc. If you are sure the imm will work, no need to pass a scratch register. + void ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void TSTI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG) { ANDSI2R(Is64Bit(Rn) ? ZR : WZR, Rn, imm, scratch); } + void ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + + void ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + + bool TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TryCMPI2R(ARM64Reg Rn, u32 imm); + + bool TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); // ABI related void ABI_PushRegisters(BitSet32 registers); @@ -633,10 +721,17 @@ public: ARM64Reg ABI_SetupLambda(const std::function* f) { auto trampoline = &ARM64XEmitter::CallLambdaTrampoline; - MOVI2R(X30, (u64)trampoline); - MOVI2R(X0, (u64)const_cast((const void*)f)); + MOVI2R(X30, (uintptr_t)trampoline); + MOVI2R(X0, (uintptr_t)const_cast((const void*)f)); return X30; } + + // Plain function call + void QuickCallFunction(ARM64Reg scratchreg, const void *func); + template void QuickCallFunction(ARM64Reg scratchreg, T func) + { + QuickCallFunction(scratchreg, (const void *)func); + } }; class ARM64FloatEmitter @@ -671,14 +766,28 @@ public: // Scalar - 1 Source void FABS(ARM64Reg Rd, ARM64Reg Rn); void FNEG(ARM64Reg Rd, ARM64Reg Rn); + void FSQRT(ARM64Reg Rd, ARM64Reg Rn); + void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false); // Also generalized move between GPR/FP // Scalar - 2 Source void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Scalar - 3 Source. Note - the accumulator is last on ARM! + void FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); // Scalar floating point immediate - void FMOV(ARM64Reg Rd, u32 imm); + void FMOV(ARM64Reg Rd, uint8_t imm8); // Vector void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); @@ -686,7 +795,10 @@ public: void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn); @@ -697,11 +809,17 @@ public: void FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void NOT(ARM64Reg Rd, ARM64Reg Rn); void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MOV(ARM64Reg Rd, ARM64Reg Rn) + { + ORR(Rd, Rn, Rn); + } void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn); void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn); void REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn); void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn); void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale); + void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale); void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); // Move @@ -714,11 +832,20 @@ public: // One source void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn); - // Conversion between float and integer - void FMOV(u8 size, bool top, ARM64Reg Rd, ARM64Reg Rn); + // Scalar convert float to int, in a lot of variants. + // Note that the scalar version of this operation has two encodings, one that goes to an integer register + // and one that outputs to a scalar fp register. + void FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round); + void FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round); + + // Scalar convert int to float. No rounding mode specifier necessary. void SCVTF(ARM64Reg Rd, ARM64Reg Rn); void UCVTF(ARM64Reg Rd, ARM64Reg Rn); + // Scalar fixed point to float. scale is the number of fractional bits. + void SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); + void UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); + // Float comparison void FCMP(ARM64Reg Rn, ARM64Reg Rm); void FCMP(ARM64Reg Rn); @@ -746,13 +873,22 @@ public: // Shift by immediate void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); // vector x indexed element void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index); + void FMLA(u8 esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index); + + void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false); + void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG); // ABI related void ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG); @@ -764,25 +900,35 @@ private: // Emitting functions void EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); - void Emit2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn); - void Emit2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn); void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm); void Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); void EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, u32 opcode, int scale, ARM64Reg Rd, ARM64Reg Rn); void EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm); void EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); - void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm); - void EmitShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8); + void EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn); void EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm); void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, bool sign); + void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra, int opcode); void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + + void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper); + void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper); }; class ARM64CodeBlock : public CodeBlock diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index 872e37d71e..703241fa64 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -68,7 +68,7 @@ void JitArm64::psq_l(UGeckoInstruction inst) fpr.BindToRegister(inst.RS, false); ARM64Reg VS = fpr.R(inst.RS); - m_float_emit.FCVTL(64, EncodeRegToDouble(VS), D0); + m_float_emit.FCVTL(64, VS, D0); if (inst.W) { m_float_emit.FMOV(D0, 0x70); // 1.0 as a Double