From bfbbddd76fd3137c5018c127e0e891b2ba50fdfc Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Thu, 20 Nov 2014 13:56:50 +0000 Subject: [PATCH] Rewrites ARMv7 fastmem entirely. This is a fairly lengthy change that can't easily be separated into multiple commits due to the nature of fastmem being a bit of an intertwined mess. This makes my life easier when maintaining fastmem on ARMv7 because I now don't have to do any terrible instruction counting and NOP padding. Really makes my brain stop hurting when working with it. This enables fastmem for a whole bunch of new instructions, which basically means that all instructions now have fastmem working for them. This also rewrites the floating point loadstores again because the last implementation was pretty crap when it comes to performance, even if it was the cleanest implementation from my point of view. This initially started with me rewriting the fastmem routines to work just like the previous/current implementation of the floating point loadstores. That was when I noticed that the performance tanked and decided to rewrite all of it. This also happens to implement gatherpipe optimizations alongside constant address optimization. Overall this commit brings a fairly large speed boost when using fastmem. --- Source/Core/Core/PowerPC/JitArm32/Jit.cpp | 1 + Source/Core/Core/PowerPC/JitArm32/Jit.h | 41 +- .../PowerPC/JitArm32/JitArm_BackPatch.cpp | 662 +++++++++++++++--- .../PowerPC/JitArm32/JitArm_LoadStore.cpp | 481 ++++++------- .../JitArm32/JitArm_LoadStoreFloating.cpp | 400 ++++++----- 5 files changed, 1065 insertions(+), 520 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm32/Jit.cpp b/Source/Core/Core/PowerPC/JitArm32/Jit.cpp index 7a95e38f5a..59ddf184e9 100644 --- a/Source/Core/Core/PowerPC/JitArm32/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/Jit.cpp @@ -40,6 +40,7 @@ void JitArm::Init() code_block.m_gpa = &js.gpa; code_block.m_fpa = &js.fpa; analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE); + InitBackpatch(); } void JitArm::ClearCache() diff --git a/Source/Core/Core/PowerPC/JitArm32/Jit.h b/Source/Core/Core/PowerPC/JitArm32/Jit.h index 4d9493a463..1f8684bcc9 100644 --- a/Source/Core/Core/PowerPC/JitArm32/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm32/Jit.h @@ -48,6 +48,26 @@ private: ArmFPRCache fpr; PPCAnalyst::CodeBuffer code_buffer; + struct BackPatchInfo + { + enum + { + FLAG_STORE = (1 << 0), + FLAG_LOAD = (1 << 1), + FLAG_SIZE_8 = (1 << 2), + FLAG_SIZE_16 = (1 << 3), + FLAG_SIZE_32 = (1 << 4), + FLAG_SIZE_F32 = (1 << 5), + FLAG_SIZE_F64 = (1 << 6), + FLAG_REVERSE = (1 << 7), + }; + + u32 m_fastmem_size; + u32 m_fastmem_trouble_inst_offset; + u32 m_slowmem_size; + }; + // The key is the flags + std::map<u32, BackPatchInfo> m_backpatch_info; void DoDownCount(); @@ -57,11 +77,19 @@ private: ArmGen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set); - bool BackPatch(SContext* ctx); - void BeginTimeProfile(JitBlock* b); void EndTimeProfile(JitBlock* b); + bool BackPatch(SContext* ctx); + bool DisasmLoadStore(const u8* ptr, u32* flags, ArmGen::ARMReg* rD, ArmGen::ARMReg* V1); + // Initializes the information that backpatching needs + // This is required so we know the backpatch routine sizes and trouble offsets + void InitBackpatch(); + + // Returns the trouble instruction offset + // Zero if it isn't a fastmem routine + u32 EmitBackpatchRoutine(ARMXEmitter* emit, u32 flags, bool fastmem, bool do_padding, ArmGen::ARMReg RS, ArmGen::ARMReg V1 = ArmGen::ARMReg::INVALID_REG); + public: JitArm() : code_buffer(32000)
{} ~JitArm() {} @@ -118,13 +146,8 @@ public: void GetCarryAndClear(ArmGen::ARMReg reg); void FinalizeCarry(ArmGen::ARMReg reg); - // TODO: This shouldn't be here - void UnsafeStoreFromReg(ArmGen::ARMReg dest, ArmGen::ARMReg value, int accessSize, s32 offset); - void SafeStoreFromReg(bool fastmem, s32 dest, u32 value, s32 offsetReg, int accessSize, s32 offset); - - void UnsafeLoadToReg(ArmGen::ARMReg dest, ArmGen::ARMReg addr, int accessSize, s32 offsetReg, s32 offset); - void SafeLoadToReg(bool fastmem, u32 dest, s32 addr, s32 offsetReg, int accessSize, s32 offset, bool signExtend, bool reverse); - + void SafeStoreFromReg(s32 dest, u32 value, s32 offsetReg, int accessSize, s32 offset); + void SafeLoadToReg(ArmGen::ARMReg dest, s32 addr, s32 offsetReg, int accessSize, s32 offset, bool signExtend, bool reverse, bool update); // OPCODES void unknown_instruction(UGeckoInstruction _inst); diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp index 43349f1eca..c703e5a88d 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp @@ -16,47 +16,65 @@ using namespace ArmGen; // 1) It's really necessary. We don't know anything about the context. // 2) It doesn't really hurt. Only instructions that access I/O will get these, and there won't be // that many of them in a typical program/game. -static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Store, bool *new_system) +bool JitArm::DisasmLoadStore(const u8* ptr, u32* flags, ARMReg* rD, ARMReg* V1) { + u32 inst = *(u32*)ptr; + u32 prev_inst = *(u32*)(ptr - 4); + u32 next_inst = *(u32*)(ptr + 4); u8 op = (inst >> 20) & 0xFF; - rD = (ARMReg)((inst >> 12) & 0xF); + *rD = (ARMReg)((inst >> 12) & 0xF); switch (op) { case 0x58: // STR { - Store = true; - accessSize = 32; + *flags |= + BackPatchInfo::FLAG_STORE | + BackPatchInfo::FLAG_SIZE_32; + *rD = (ARMReg)(prev_inst & 0xF); } break; case 0x59: // LDR { - Store = false; - accessSize = 32; + *flags |= + BackPatchInfo::FLAG_LOAD | + BackPatchInfo::FLAG_SIZE_32; + // REV + if ((next_inst & 0x0FFF0FF0) != 0x06BF0F30) + *flags |= BackPatchInfo::FLAG_REVERSE; } break; case 0x1D: // LDRH { - Store = false; - accessSize = 16; + *flags |= + BackPatchInfo::FLAG_LOAD | + BackPatchInfo::FLAG_SIZE_16; + // REV16 + if((next_inst & 0x0FFF0FF0) != 0x06BF0FB0) + *flags |= BackPatchInfo::FLAG_REVERSE; } break; case 0x45 + 0x18: // LDRB { - Store = false; - accessSize = 8; + *flags |= + BackPatchInfo::FLAG_LOAD | + BackPatchInfo::FLAG_SIZE_8; } break; case 0x5C: // STRB { - Store = true; - accessSize = 8; + *flags |= + BackPatchInfo::FLAG_STORE | + BackPatchInfo::FLAG_SIZE_8; + *rD = (ARMReg)((inst >> 12) & 0xF); } break; case 0x1C: // STRH { - Store = true; - accessSize = 16; + *flags |= + BackPatchInfo::FLAG_STORE | + BackPatchInfo::FLAG_SIZE_16; + *rD = (ARMReg)(prev_inst & 0xF); } break; default: @@ -66,10 +84,92 @@ static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Sto switch (op2) { case 0xD: // VLDR/VSTR - *new_system = true; + { + bool load = (inst >> 20) & 1; + bool single = !((inst >> 8) & 1); + + if (load) + *flags |= BackPatchInfo::FLAG_LOAD; + else + *flags |= BackPatchInfo::FLAG_STORE; + + if (single) + *flags |= BackPatchInfo::FLAG_SIZE_F32; + else + *flags |= BackPatchInfo::FLAG_SIZE_F64; + if (single) + { + if (!load) + { + u32 vcvt = *(u32*)(ptr - 8); + u32 src_register = vcvt & 0xF; + src_register |= (vcvt >> 1) & 
0x10; + *rD = (ARMReg)(src_register + D0); + } + } + } break; case 0x4: // VST1/VLD1 - *new_system = true; + { + u32 size = (inst >> 6) & 0x3; + bool load = (inst >> 21) & 1; + if (load) + *flags |= BackPatchInfo::FLAG_LOAD; + else + *flags |= BackPatchInfo::FLAG_STORE; + + + if (size == 2) // 32bit + { + if (load) + { + // For 32bit loads we are loading to a temporary + // So we need to read PC+8,PC+12 to get the two destination registers + u32 vcvt_1 = *(u32*)(ptr + 8); + u32 vcvt_2 = *(u32*)(ptr + 12); + + u32 dest_register_1 = (vcvt_1 >> 12) & 0xF; + dest_register_1 |= (vcvt_1 >> 18) & 0x10; + + u32 dest_register_2 = (vcvt_2 >> 12) & 0xF; + dest_register_2 |= (vcvt_2 >> 18) & 0x10; + + // Make sure to encode the destination register to something our emitter understands + *rD = (ARMReg)(dest_register_1 + D0); + *V1 = (ARMReg)(dest_register_2 + D0); + } + else + { + // For 32bit stores we are storing from a temporary + // So we need to check the VCVT at PC-8 for the source register + u32 vcvt = *(u32*)(ptr - 8); + u32 src_register = vcvt & 0xF; + src_register |= (vcvt >> 1) & 0x10; + *rD = (ARMReg)(src_register + D0); + } + *flags |= BackPatchInfo::FLAG_SIZE_F32; + } + else if (size == 3) // 64bit + { + if (load) + { + // For 64bit loads we load directly in to the VFP register + u32 dest_register = (inst >> 12) & 0xF; + dest_register |= (inst >> 18) & 0x10; + // Make sure to encode the destination register to something our emitter understands + *rD = (ARMReg)(dest_register + D0); + } + else + { + // For 64bit stores we are storing from a temporary + // Check the previous VREV64 instruction for the real register + u32 src_register = prev_inst & 0xF; + src_register |= (prev_inst >> 1) & 0x10; + *rD = (ARMReg)(src_register + D0); + } + *flags |= BackPatchInfo::FLAG_SIZE_F64; + } + } break; default: printf("Op is 0x%02x\n", op); @@ -95,94 +195,484 @@ bool JitArm::BackPatch(SContext* ctx) // We need to get the destination register before we start u8* codePtr = (u8*)ctx->CTX_PC; u32 Value = *(u32*)codePtr; - ARMReg rD; - u8 accessSize; - bool Store; - bool new_system = false; + ARMReg rD = INVALID_REG; + ARMReg V1 = INVALID_REG; + u32 flags = 0; - if (!DisamLoadStore(Value, rD, accessSize, Store, &new_system)) + if (!DisasmLoadStore(codePtr, &flags, &rD, &V1)) { printf("Invalid backpatch at location 0x%08lx(0x%08x)\n", ctx->CTX_PC, Value); exit(0); } - if (new_system) - { - // The new system is a lot easier to backpatch than the old crap. - // Instead of backpatching over code and making sure we NOP pad and other crap - // We emit both the slow and fast path and branch over the slow path each time - // We search backwards until we find the second branch instruction - // Then proceed to replace it with a NOP and set that to the new PC. - // This ensures that we run the slow path and then branch over the fast path. 
+ BackPatchInfo& info = m_backpatch_info[flags]; + ARMXEmitter emitter(codePtr - info.m_fastmem_trouble_inst_offset * 4); + u32 new_pc = (u32)emitter.GetCodePtr(); + EmitBackpatchRoutine(&emitter, flags, false, true, rD, V1); + emitter.FlushIcache(); + ctx->CTX_PC = new_pc; + return true; +} - // Run backwards until we find the branch we want to NOP - for (int branches = 2; branches > 0; ctx->CTX_PC -= 4) - if ((*(u32*)ctx->CTX_PC & 0x0F000000) == 0x0A000000) // B - --branches; +u32 JitArm::EmitBackpatchRoutine(ARMXEmitter* emit, u32 flags, bool fastmem, bool do_padding, ARMReg RS, ARMReg V1) +{ + ARMReg addr = R12; + ARMReg temp = R11; + u32 trouble_offset = 0; + const u8* code_base = emit->GetCodePtr(); - ctx->CTX_PC += 4; - ARMXEmitter emitter((u8*)ctx->CTX_PC); - emitter.NOP(1); - emitter.FlushIcache(); - return true; - } - else + if (fastmem) { - if (Store) + ARMReg temp2 = R10; + Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK) + emit->BIC(temp, addr, mask); // 1 + emit->MOVI2R(temp2, (u32)Memory::base); // 2-3 + emit->ADD(temp, temp, temp2); // 4 + + if (flags & BackPatchInfo::FLAG_STORE && + flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64)) { - const u32 ARMREGOFFSET = 4 * 5; - ARMXEmitter emitter(codePtr - ARMREGOFFSET); - switch (accessSize) + NEONXEmitter nemit(emit); + if (flags & BackPatchInfo::FLAG_SIZE_F32) { - case 8: // 8bit - emitter.MOVI2R(R14, (u32)&Memory::Write_U8, false); // 1-2 - return 0; - break; - case 16: // 16bit - emitter.MOVI2R(R14, (u32)&Memory::Write_U16, false); // 1-2 - return 0; - break; - case 32: // 32bit - emitter.MOVI2R(R14, (u32)&Memory::Write_U32, false); // 1-2 - break; + emit->VCVT(S0, RS, 0); + nemit.VREV32(I_8, D0, D0); + trouble_offset = (emit->GetCodePtr() - code_base) / 4; + emit->VSTR(S0, temp, 0); } - emitter.PUSH(4, R0, R1, R2, R3); // 3 - emitter.MOV(R0, rD); // Value - 4 - emitter.MOV(R1, R10); // Addr- 5 - emitter.BL(R14); // 6 - emitter.POP(4, R0, R1, R2, R3); // 7 - u32 newPC = ctx->CTX_PC - (ARMREGOFFSET + 4 * 4); - ctx->CTX_PC = newPC; - emitter.FlushIcache(); - return true; + else + { + nemit.VREV64(I_8, D0, RS); + trouble_offset = (emit->GetCodePtr() - code_base) / 4; + nemit.VST1(I_64, D0, temp); + } + } + else if (flags & BackPatchInfo::FLAG_LOAD && + flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64)) + { + NEONXEmitter nemit(emit); + + trouble_offset = (emit->GetCodePtr() - code_base) / 4; + if (flags & BackPatchInfo::FLAG_SIZE_F32) + { + nemit.VLD1(F_32, D0, temp); + nemit.VREV32(I_8, D0, D0); // Byte swap to result + emit->VCVT(RS, S0, 0); + emit->VCVT(V1, S0, 0); + } + else + { + nemit.VLD1(I_64, RS, temp); + nemit.VREV64(I_8, RS, RS); // Byte swap to result + } + } + else if (flags & BackPatchInfo::FLAG_STORE) + { + if (flags & BackPatchInfo::FLAG_SIZE_32) + emit->REV(temp2, RS); + else if (flags & BackPatchInfo::FLAG_SIZE_16) + emit->REV16(temp2, RS); + + trouble_offset = (emit->GetCodePtr() - code_base) / 4; + + if (flags & BackPatchInfo::FLAG_SIZE_32) + emit->STR(temp2, temp); + else if (flags & BackPatchInfo::FLAG_SIZE_16) + emit->STRH(temp2, temp); + else + emit->STRB(RS, temp); } else { - const u32 ARMREGOFFSET = 4 * 4; - ARMXEmitter emitter(codePtr - ARMREGOFFSET); - switch (accessSize) + trouble_offset = (emit->GetCodePtr() - code_base) / 4; + + if (flags & BackPatchInfo::FLAG_SIZE_32) + emit->LDR(RS, temp); // 5 + else if (flags & BackPatchInfo::FLAG_SIZE_16) + emit->LDRH(RS, temp); + else if (flags & BackPatchInfo::FLAG_SIZE_8) + emit->LDRB(RS, temp); + + + if (!(flags & 
BackPatchInfo::FLAG_REVERSE)) { - case 8: // 8bit - emitter.MOVI2R(R14, (u32)&Memory::Read_U8, false); // 2 - break; - case 16: // 16bit - emitter.MOVI2R(R14, (u32)&Memory::Read_U16, false); // 2 - break; - case 32: // 32bit - emitter.MOVI2R(R14, (u32)&Memory::Read_U32, false); // 2 - break; + if (flags & BackPatchInfo::FLAG_SIZE_32) + emit->REV(RS, RS); // 6 + else if (flags & BackPatchInfo::FLAG_SIZE_16) + emit->REV16(RS, RS); } - emitter.PUSH(4, R0, R1, R2, R3); // 3 - emitter.MOV(R0, R10); // 4 - emitter.BL(R14); // 5 - emitter.MOV(R14, R0); // 6 - emitter.POP(4, R0, R1, R2, R3); // 7 - emitter.MOV(rD, R14); // 8 - ctx->CTX_PC -= ARMREGOFFSET + (4 * 4); - emitter.FlushIcache(); - return true; } } - return 0; + else + { + if (flags & BackPatchInfo::FLAG_STORE && + flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64)) + { + emit->PUSH(4, R0, R1, R2, R3); + if (flags & BackPatchInfo::FLAG_SIZE_F32) + { + emit->MOV(R1, addr); + emit->VCVT(S0, RS, 0); + emit->VMOV(R0, S0); + emit->MOVI2R(temp, (u32)&Memory::Write_U32); + emit->BL(temp); + } + else + { + emit->MOVI2R(temp, (u32)&Memory::Write_F64); +#if !defined(__ARM_PCS_VFP) // SoftFP returns in R0 and R1 + emit->VMOV(R0, RS); + emit->MOV(R2, addr); +#else + emit->VMOV(D0, RS); + emit->MOV(R0, addr); +#endif + emit->BL(temp); + } + emit->POP(4, R0, R1, R2, R3); + } + else if (flags & BackPatchInfo::FLAG_LOAD && + flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64)) + { + emit->PUSH(4, R0, R1, R2, R3); + emit->MOV(R0, addr); + if (flags & BackPatchInfo::FLAG_SIZE_F32) + { + emit->MOVI2R(temp, (u32)&Memory::Read_U32); + emit->BL(temp); + emit->VMOV(S0, R0); + emit->VCVT(RS, S0, 0); + emit->VCVT(V1, S0, 0); + } + else + { + emit->MOVI2R(temp, (u32)&Memory::Read_F64); + emit->BL(temp); + +#if !defined(__ARM_PCS_VFP) // SoftFP returns in R0 and R1 + emit->VMOV(RS, R0); +#else + emit->VMOV(RS, D0); +#endif + } + emit->POP(4, R0, R1, R2, R3); + } + else if (flags & BackPatchInfo::FLAG_STORE) + { + emit->PUSH(4, R0, R1, R2, R3); + emit->MOV(R0, RS); + emit->MOV(R1, addr); + + if (flags & BackPatchInfo::FLAG_SIZE_32) + emit->MOVI2R(temp, (u32)&Memory::Write_U32); + else if (flags & BackPatchInfo::FLAG_SIZE_16) + emit->MOVI2R(temp, (u32)&Memory::Write_U16); + else + emit->MOVI2R(temp, (u32)&Memory::Write_U8); + + emit->BL(temp); + emit->POP(4, R0, R1, R2, R3); + } + else + { + emit->PUSH(4, R0, R1, R2, R3); + emit->MOV(R0, addr); + + if (flags & BackPatchInfo::FLAG_SIZE_32) + emit->MOVI2R(temp, (u32)&Memory::Read_U32); + else if (flags & BackPatchInfo::FLAG_SIZE_16) + emit->MOVI2R(temp, (u32)&Memory::Read_U16); + else if (flags & BackPatchInfo::FLAG_SIZE_8) + emit->MOVI2R(temp, (u32)&Memory::Read_U8); + + emit->BL(temp); + emit->MOV(temp, R0); + emit->POP(4, R0, R1, R2, R3); + + if (!(flags & BackPatchInfo::FLAG_REVERSE)) + { + emit->MOV(RS, temp); + } + else + { + if (flags & BackPatchInfo::FLAG_SIZE_32) + emit->REV(RS, temp); // 6 + else if (flags & BackPatchInfo::FLAG_SIZE_16) + emit->REV16(RS, temp); + } + } + } + + if (do_padding) + { + BackPatchInfo& info = m_backpatch_info[flags]; + u32 num_insts_max = std::max(info.m_fastmem_size, info.m_slowmem_size); + + u32 code_size = emit->GetCodePtr() - code_base; + code_size /= 4; + + emit->NOP(num_insts_max - code_size); + } + + return trouble_offset; } +void JitArm::InitBackpatch() +{ + u32 flags = 0; + BackPatchInfo info; + u8* code_base = GetWritableCodePtr(); + u8* code_end; + + // Writes + { + // 8bit + { + flags = + BackPatchInfo::FLAG_STORE | + 
BackPatchInfo::FLAG_SIZE_8; + EmitBackpatchRoutine(this, flags, false, false, R0); + code_end = GetWritableCodePtr(); + info.m_slowmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + info.m_fastmem_trouble_inst_offset = + EmitBackpatchRoutine(this, flags, true, false, R0); + code_end = GetWritableCodePtr(); + info.m_fastmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + m_backpatch_info[flags] = info; + } + // 16bit + { + flags = + BackPatchInfo::FLAG_STORE | + BackPatchInfo::FLAG_SIZE_16; + EmitBackpatchRoutine(this, flags, false, false, R0); + code_end = GetWritableCodePtr(); + info.m_slowmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + info.m_fastmem_trouble_inst_offset = + EmitBackpatchRoutine(this, flags, true, false, R0); + code_end = GetWritableCodePtr(); + info.m_fastmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + m_backpatch_info[flags] = info; + } + // 32bit + { + flags = + BackPatchInfo::FLAG_STORE | + BackPatchInfo::FLAG_SIZE_32; + EmitBackpatchRoutine(this, flags, false, false, R0); + code_end = GetWritableCodePtr(); + info.m_slowmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + info.m_fastmem_trouble_inst_offset = + EmitBackpatchRoutine(this, flags, true, false, R0); + code_end = GetWritableCodePtr(); + info.m_fastmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + m_backpatch_info[flags] = info; + } + // 32bit float + { + flags = + BackPatchInfo::FLAG_STORE | + BackPatchInfo::FLAG_SIZE_F32; + EmitBackpatchRoutine(this, flags, false, false, D0); + code_end = GetWritableCodePtr(); + info.m_slowmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + info.m_fastmem_trouble_inst_offset = + EmitBackpatchRoutine(this, flags, true, false, D0); + code_end = GetWritableCodePtr(); + info.m_fastmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + m_backpatch_info[flags] = info; + } + // 64bit float + { + flags = + BackPatchInfo::FLAG_STORE | + BackPatchInfo::FLAG_SIZE_F64; + EmitBackpatchRoutine(this, flags, false, false, D0); + code_end = GetWritableCodePtr(); + info.m_slowmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + info.m_fastmem_trouble_inst_offset = + EmitBackpatchRoutine(this, flags, true, false, D0); + code_end = GetWritableCodePtr(); + info.m_fastmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + m_backpatch_info[flags] = info; + } + + } + + // Loads + { + // 8bit + { + flags = + BackPatchInfo::FLAG_LOAD | + BackPatchInfo::FLAG_SIZE_8; + EmitBackpatchRoutine(this, flags, false, false, R0); + code_end = GetWritableCodePtr(); + info.m_slowmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + info.m_fastmem_trouble_inst_offset = + EmitBackpatchRoutine(this, flags, true, false, R0); + code_end = GetWritableCodePtr(); + info.m_fastmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + m_backpatch_info[flags] = info; + } + // 16bit + { + flags = + BackPatchInfo::FLAG_LOAD | + BackPatchInfo::FLAG_SIZE_16; + EmitBackpatchRoutine(this, flags, false, false, R0); + code_end = GetWritableCodePtr(); + info.m_slowmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + info.m_fastmem_trouble_inst_offset = + EmitBackpatchRoutine(this, flags, true, false, R0); + code_end = GetWritableCodePtr(); + info.m_fastmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + m_backpatch_info[flags] = info; + } + // 32bit + { + flags = + BackPatchInfo::FLAG_LOAD 
| + BackPatchInfo::FLAG_SIZE_32; + EmitBackpatchRoutine(this, flags, false, false, R0); + code_end = GetWritableCodePtr(); + info.m_slowmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + info.m_fastmem_trouble_inst_offset = + EmitBackpatchRoutine(this, flags, true, false, R0); + code_end = GetWritableCodePtr(); + info.m_fastmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + m_backpatch_info[flags] = info; + } + + // 16bit - reverse + { + flags = + BackPatchInfo::FLAG_LOAD | + BackPatchInfo::FLAG_SIZE_16 | + BackPatchInfo::FLAG_REVERSE; + EmitBackpatchRoutine(this, flags, false, false, R0); + code_end = GetWritableCodePtr(); + info.m_slowmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + info.m_fastmem_trouble_inst_offset = + EmitBackpatchRoutine(this, flags, true, false, R0); + code_end = GetWritableCodePtr(); + info.m_fastmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + m_backpatch_info[flags] = info; + } + // 32bit - reverse + { + flags = + BackPatchInfo::FLAG_LOAD | + BackPatchInfo::FLAG_SIZE_32 | + BackPatchInfo::FLAG_REVERSE; + EmitBackpatchRoutine(this, flags, false, false, R0); + code_end = GetWritableCodePtr(); + info.m_slowmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + info.m_fastmem_trouble_inst_offset = + EmitBackpatchRoutine(this, flags, true, false, R0); + code_end = GetWritableCodePtr(); + info.m_fastmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + m_backpatch_info[flags] = info; + } + // 32bit float + { + flags = + BackPatchInfo::FLAG_LOAD | + BackPatchInfo::FLAG_SIZE_F32; + EmitBackpatchRoutine(this, flags, false, false, D0, D1); + code_end = GetWritableCodePtr(); + info.m_slowmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + info.m_fastmem_trouble_inst_offset = + EmitBackpatchRoutine(this, flags, true, false, D0, D1); + code_end = GetWritableCodePtr(); + info.m_fastmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + m_backpatch_info[flags] = info; + } + // 64bit float + { + flags = + BackPatchInfo::FLAG_LOAD | + BackPatchInfo::FLAG_SIZE_F64; + EmitBackpatchRoutine(this, flags, false, false, D0); + code_end = GetWritableCodePtr(); + info.m_slowmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + info.m_fastmem_trouble_inst_offset = + EmitBackpatchRoutine(this, flags, true, false, D0); + code_end = GetWritableCodePtr(); + info.m_fastmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + m_backpatch_info[flags] = info; + } + } +} diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStore.cpp index c1f13f288f..1f50c7fb49 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStore.cpp @@ -18,114 +18,149 @@ using namespace ArmGen; -void JitArm::UnsafeStoreFromReg(ARMReg dest, ARMReg value, int accessSize, s32 offset) +void JitArm::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, int accessSize, s32 offset) { - // All this gets replaced on backpatch - Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK) - BIC(dest, dest, mask); // 1 - MOVI2R(R14, (u32)Memory::base, false); // 2-3 - ADD(dest, dest, R14); // 4 - switch (accessSize) - { - case 32: - REV(value, value); // 5 - break; - case 16: - REV16(value, value); - break; - case 8: - NOP(1); - break; - } - switch (accessSize) - { - case 32: - STR(value, dest); // 6 - break; - case 16: - STRH(value, dest); - break; - 
case 8: - STRB(value, dest); - break; - } - NOP(1); // 7 -} + // We want to make sure to not get LR as a temp register + ARMReg rA = R12; -void JitArm::SafeStoreFromReg(bool fastmem, s32 dest, u32 value, s32 regOffset, int accessSize, s32 offset) -{ - if (SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem && fastmem) - { - ARMReg RA; - ARMReg RB; - ARMReg RS = gpr.R(value); + u32 imm_addr = 0; + bool is_immediate = false; - if (dest != -1) - RA = gpr.R(dest); - - if (regOffset != -1) - { - RB = gpr.R(regOffset); - MOV(R10, RB); - NOP(1); - } - else - { - MOVI2R(R10, (u32)offset, false); - } - - if (dest != -1) - ADD(R10, R10, RA); - else - NOP(1); - - MOV(R12, RS); - UnsafeStoreFromReg(R10, R12, accessSize, 0); - return; - } - ARMReg rA = gpr.GetReg(); - ARMReg rB = gpr.GetReg(); - ARMReg rC = gpr.GetReg(); - ARMReg RA = INVALID_REG; - ARMReg RB = INVALID_REG; - if (dest != -1) - RA = gpr.R(dest); - if (regOffset != -1) - RB = gpr.R(regOffset); - ARMReg RS = gpr.R(value); - switch (accessSize) - { - case 32: - MOVI2R(rA, (u32)&Memory::Write_U32); - break; - case 16: - MOVI2R(rA, (u32)&Memory::Write_U16); - break; - case 8: - MOVI2R(rA, (u32)&Memory::Write_U8); - break; - } - MOV(rB, RS); if (regOffset == -1) { - MOVI2R(rC, offset); if (dest != -1) - ADD(rC, rC, RA); + { + if (gpr.IsImm(dest)) + { + is_immediate = true; + imm_addr = gpr.GetImm(dest) + offset; + } + else + { + Operand2 off; + if (TryMakeOperand2(offset, off)) + { + ADD(rA, gpr.R(dest), off); + } + else + { + MOVI2R(rA, offset); + ADD(rA, rA, gpr.R(dest)); + } + } + } + else + { + is_immediate = true; + imm_addr = offset; + } } else { if (dest != -1) - ADD(rC, RA, RB); + { + if (gpr.IsImm(dest) && gpr.IsImm(regOffset)) + { + is_immediate = true; + imm_addr = gpr.GetImm(dest) + gpr.GetImm(regOffset); + } + else if (gpr.IsImm(dest) && !gpr.IsImm(regOffset)) + { + Operand2 off; + if (TryMakeOperand2(gpr.GetImm(dest), off)) + { + ADD(rA, gpr.R(regOffset), off); + } + else + { + MOVI2R(rA, gpr.GetImm(dest)); + ADD(rA, rA, gpr.R(regOffset)); + } + } + else if (!gpr.IsImm(dest) && gpr.IsImm(regOffset)) + { + Operand2 off; + if (TryMakeOperand2(gpr.GetImm(regOffset), off)) + { + ADD(rA, gpr.R(dest), off); + } + else + { + MOVI2R(rA, gpr.GetImm(regOffset)); + ADD(rA, rA, gpr.R(dest)); + } + } + else + { + ADD(rA, gpr.R(dest), gpr.R(regOffset)); + } + } else - MOV(rC, RB); + { + if (gpr.IsImm(regOffset)) + { + is_immediate = true; + imm_addr = gpr.GetImm(regOffset); + } + else + { + MOV(rA, gpr.R(regOffset)); + } + } + } + ARMReg RS = gpr.R(value); + + u32 flags = BackPatchInfo::FLAG_STORE; + if (accessSize == 32) + flags |= BackPatchInfo::FLAG_SIZE_32; + else if (accessSize == 16) + flags |= BackPatchInfo::FLAG_SIZE_16; + else + flags |= BackPatchInfo::FLAG_SIZE_8; + + if (is_immediate) + { + if ((imm_addr & 0xFFFFF000) == 0xCC008000 && jit->jo.optimizeGatherPipe) + { + MOVI2R(R14, (u32)&GPFifo::m_gatherPipeCount); + MOVI2R(R10, (u32)GPFifo::m_gatherPipe); + LDR(R11, R14); + if (accessSize == 32) + { + REV(RS, RS); + STR(RS, R10, R11); + REV(RS, RS); + } + else if (accessSize == 16) + { + REV16(RS, RS); + STRH(RS, R10, R11); + REV16(RS, RS); + } + else + { + STRB(RS, R10, R11); + } + ADD(R11, R11, accessSize >> 3); + STR(R11, R14); + jit->js.fifoBytesThisBlock += accessSize >> 3; + } + else if (Memory::IsRAMAddress(imm_addr)) + { + MOVI2R(rA, imm_addr); + EmitBackpatchRoutine(this, flags, SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem, false, RS); + } + else + { + MOVI2R(rA, imm_addr); + 
EmitBackpatchRoutine(this, flags, false, false, RS); + } + } + else + { + EmitBackpatchRoutine(this, flags, SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem, true, RS); } - PUSH(4, R0, R1, R2, R3); - MOV(R0, rB); - MOV(R1, rC); - BL(rA); - POP(4, R0, R1, R2, R3); - gpr.Unlock(rA, rB, rC); } void JitArm::stX(UGeckoInstruction inst) @@ -138,7 +173,6 @@ void JitArm::stX(UGeckoInstruction inst) u32 accessSize = 0; s32 regOffset = -1; bool update = false; - bool fastmem = false; switch (inst.OPCD) { case 45: // sthu @@ -152,7 +186,6 @@ void JitArm::stX(UGeckoInstruction inst) case 183: // stwux update = true; case 151: // stwx - fastmem = true; accessSize = 32; regOffset = b; break; @@ -173,7 +206,6 @@ void JitArm::stX(UGeckoInstruction inst) case 37: // stwu update = true; case 36: // stw - fastmem = true; accessSize = 32; break; case 39: // stbu @@ -182,7 +214,9 @@ void JitArm::stX(UGeckoInstruction inst) accessSize = 8; break; } - SafeStoreFromReg(fastmem, update ? a : (a ? a : -1), s, regOffset, accessSize, offset); + + SafeStoreFromReg(update ? a : (a ? a : -1), s, regOffset, accessSize, offset); + if (update) { ARMReg rA = gpr.GetReg(); @@ -193,143 +227,135 @@ void JitArm::stX(UGeckoInstruction inst) // Check for DSI exception prior to writing back address LDR(rA, R9, PPCSTATE_OFF(Exceptions)); TST(rA, EXCEPTION_DSI); - FixupBranch DoNotWrite = B_CC(CC_NEQ); - if (a) + SetCC(CC_EQ); + if (regOffset == -1) { - if (regOffset == -1) - { - MOVI2R(rA, offset); - ADD(RA, RA, rA); - } - else - { - ADD(RA, RA, RB); - } + MOVI2R(rA, offset); + ADD(RA, RA, rA); } else { - if (regOffset == -1) - MOVI2R(RA, (u32)offset); - else - MOV(RA, RB); + ADD(RA, RA, RB); } - SetJumpTarget(DoNotWrite); + SetCC(); gpr.Unlock(rA); } } -void JitArm::UnsafeLoadToReg(ARMReg dest, ARMReg addr, int accessSize, s32 offsetReg, s32 offset) +void JitArm::SafeLoadToReg(ARMReg dest, s32 addr, s32 offsetReg, int accessSize, s32 offset, bool signExtend, bool reverse, bool update) { - ARMReg rA = gpr.GetReg(); + // We want to make sure to not get LR as a temp register + ARMReg rA = R12; + + u32 imm_addr = 0; + bool is_immediate = false; + if (offsetReg == -1) { - MOVI2R(rA, offset, false); // -3 - ADD(addr, addr, rA); // - 1 - } - else - { - NOP(2); // -3, -2 - // offsetReg is preloaded here - ADD(addr, addr, gpr.R(offsetReg)); // -1 - } - - // All this gets replaced on backpatch - Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK) - BIC(addr, addr, mask); // 1 - MOVI2R(rA, (u32)Memory::base, false); // 2-3 - ADD(addr, addr, rA); // 4 - switch (accessSize) - { - case 32: - LDR(dest, addr); // 5 - break; - case 16: - LDRH(dest, addr); - break; - case 8: - LDRB(dest, addr); - break; - } - switch (accessSize) - { - case 32: - REV(dest, dest); // 6 - break; - case 16: - REV16(dest, dest); - break; - case 8: - NOP(1); - break; - - } - NOP(2); // 7-8 - gpr.Unlock(rA); -} - -void JitArm::SafeLoadToReg(bool fastmem, u32 dest, s32 addr, s32 offsetReg, int accessSize, s32 offset, bool signExtend, bool reverse) -{ - ARMReg RD = gpr.R(dest); - - if (SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem && fastmem) - { - // Preload for fastmem - if (offsetReg != -1) - gpr.R(offsetReg); - if (addr != -1) - MOV(R10, gpr.R(addr)); + { + if (gpr.IsImm(addr)) + { + is_immediate = true; + imm_addr = gpr.GetImm(addr) + offset; + } + else + { + Operand2 off; + if (TryMakeOperand2(offset, off)) + { + ADD(rA, gpr.R(addr), off); + } + else + { + MOVI2R(rA, offset); + ADD(rA, rA, gpr.R(addr)); + } + } + } else - MOV(R10, 0); - - 
UnsafeLoadToReg(RD, R10, accessSize, offsetReg, offset); - return; - } - ARMReg rA = gpr.GetReg(); - ARMReg rB = gpr.GetReg(); - - if (offsetReg == -1) - { - MOVI2R(rA, offset); - if (addr != -1) - ADD(rA, rA, gpr.R(addr)); + { + is_immediate = true; + imm_addr = offset; + } } else { if (addr != -1) - ADD(rA, gpr.R(addr), gpr.R(offsetReg)); + { + if (gpr.IsImm(addr) && gpr.IsImm(offsetReg)) + { + is_immediate = true; + imm_addr = gpr.GetImm(addr) + gpr.GetImm(offsetReg); + } + else if (gpr.IsImm(addr) && !gpr.IsImm(offsetReg)) + { + Operand2 off; + if (TryMakeOperand2(gpr.GetImm(addr), off)) + { + ADD(rA, gpr.R(offsetReg), off); + } + else + { + MOVI2R(rA, gpr.GetImm(addr)); + ADD(rA, rA, gpr.R(offsetReg)); + } + } + else if (!gpr.IsImm(addr) && gpr.IsImm(offsetReg)) + { + Operand2 off; + if (TryMakeOperand2(gpr.GetImm(offsetReg), off)) + { + ADD(rA, gpr.R(addr), off); + } + else + { + MOVI2R(rA, gpr.GetImm(offsetReg)); + ADD(rA, rA, gpr.R(addr)); + } + } + else + { + ADD(rA, gpr.R(addr), gpr.R(offsetReg)); + } + } else - MOV(rA, gpr.R(offsetReg)); + { + if (gpr.IsImm(offsetReg)) + { + is_immediate = true; + imm_addr = gpr.GetImm(offsetReg); + } + else + { + MOV(rA, gpr.R(offsetReg)); + } + } } - switch (accessSize) - { - case 8: - MOVI2R(rB, (u32)&Memory::Read_U8); - break; - case 16: - MOVI2R(rB, (u32)&Memory::Read_U16); - break; - case 32: - MOVI2R(rB, (u32)&Memory::Read_U32); - break; - } - PUSH(4, R0, R1, R2, R3); - MOV(R0, rA); - BL(rB); - MOV(rA, R0); - POP(4, R0, R1, R2, R3); - MOV(RD, rA); - if (signExtend) // Only on 16 loads - SXTH(RD, RD); + if (is_immediate) + MOVI2R(rA, imm_addr); + + u32 flags = BackPatchInfo::FLAG_LOAD; + if (accessSize == 32) + flags |= BackPatchInfo::FLAG_SIZE_32; + else if (accessSize == 16) + flags |= BackPatchInfo::FLAG_SIZE_16; + else + flags |= BackPatchInfo::FLAG_SIZE_8; + if (reverse) - { - if (accessSize == 32) - REV(RD, RD); - else if (accessSize == 16) - REV16(RD, RD); - } - gpr.Unlock(rA, rB); + flags |= BackPatchInfo::FLAG_REVERSE; + + EmitBackpatchRoutine(this, flags, + SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem, + !(is_immediate && Memory::IsRAMAddress(imm_addr)), dest); + + if (signExtend) // Only on 16 loads + SXTH(dest, dest); + + if (update) + MOV(gpr.R(addr), rA); } void JitArm::lXX(UGeckoInstruction inst) @@ -344,7 +370,6 @@ void JitArm::lXX(UGeckoInstruction inst) bool update = false; bool signExtend = false; bool reverse = false; - bool fastmem = false; switch (inst.OPCD) { @@ -354,21 +379,18 @@ void JitArm::lXX(UGeckoInstruction inst) case 55: // lwzux update = true; case 23: // lwzx - fastmem = true; accessSize = 32; offsetReg = b; break; case 119: //lbzux update = true; case 87: // lbzx - fastmem = true; accessSize = 8; offsetReg = b; break; case 311: // lhzux update = true; case 279: // lhzx - fastmem = true; accessSize = 16; offsetReg = b; break; @@ -392,19 +414,16 @@ void JitArm::lXX(UGeckoInstruction inst) case 33: // lwzu update = true; case 32: // lwz - fastmem = true; accessSize = 32; break; case 35: // lbzu update = true; case 34: // lbz - fastmem = true; accessSize = 8; break; case 41: // lhzu update = true; case 40: // lhz - fastmem = true; accessSize = 16; break; case 43: // lhau @@ -417,27 +436,13 @@ void JitArm::lXX(UGeckoInstruction inst) // Check for exception before loading ARMReg rA = gpr.GetReg(false); + ARMReg RD = gpr.R(d); LDR(rA, R9, PPCSTATE_OFF(Exceptions)); TST(rA, EXCEPTION_DSI); FixupBranch DoNotLoad = B_CC(CC_NEQ); - SafeLoadToReg(fastmem, d, update ? a : (a ? 
a : -1), offsetReg, accessSize, offset, signExtend, reverse); - - if (update) - { - ARMReg RA = gpr.R(a); - if (offsetReg == -1) - { - rA = gpr.GetReg(false); - MOVI2R(rA, offset); - ADD(RA, RA, rA); - } - else - { - ADD(RA, RA, gpr.R(offsetReg)); - } - } + SafeLoadToReg(RD, update ? a : (a ? a : -1), offsetReg, accessSize, offset, signExtend, reverse, update); SetJumpTarget(DoNotLoad); @@ -449,8 +454,6 @@ void JitArm::lXX(UGeckoInstruction inst) (SConfig::GetInstance().m_LocalCoreStartupParameter.bWii && Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x2C000000)) && Memory::ReadUnchecked_U32(js.compilerPC + 8) == 0x4182fff8) { - ARMReg RD = gpr.R(d); - // if it's still 0, we can wait until the next event TST(RD, RD); FixupBranch noIdle = B_CC(CC_NEQ); diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStoreFloating.cpp index ea5fecf5ce..e0c95152c1 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStoreFloating.cpp @@ -24,16 +24,13 @@ void JitArm::lfXX(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(bJITLoadStoreFloatingOff); - ARMReg rA = gpr.GetReg(); - ARMReg rB = gpr.GetReg(); ARMReg RA; u32 a = inst.RA, b = inst.RB; s32 offset = inst.SIMM_16; - bool single = false; + u32 flags = BackPatchInfo::FLAG_LOAD; bool update = false; - bool zeroA = false; s32 offsetReg = -1; switch (inst.OPCD) @@ -42,157 +39,152 @@ void JitArm::lfXX(UGeckoInstruction inst) switch (inst.SUBOP10) { case 567: // lfsux - single = true; + flags |= BackPatchInfo::FLAG_SIZE_F32; update = true; offsetReg = b; break; case 535: // lfsx - single = true; - zeroA = true; + flags |= BackPatchInfo::FLAG_SIZE_F32; offsetReg = b; break; case 631: // lfdux + flags |= BackPatchInfo::FLAG_SIZE_F64; update = true; offsetReg = b; break; case 599: // lfdx - zeroA = true; + flags |= BackPatchInfo::FLAG_SIZE_F64; offsetReg = b; break; } break; case 49: // lfsu + flags |= BackPatchInfo::FLAG_SIZE_F32; update = true; - single = true; break; case 48: // lfs - single = true; - zeroA = true; + flags |= BackPatchInfo::FLAG_SIZE_F32; break; case 51: // lfdu + flags |= BackPatchInfo::FLAG_SIZE_F64; update = true; break; case 50: // lfd - zeroA = true; + flags |= BackPatchInfo::FLAG_SIZE_F64; break; } - ARMReg v0 = fpr.R0(inst.FD, false), v1; - if (single) + ARMReg v0 = fpr.R0(inst.FD, false), v1 = INVALID_REG; + if (flags & BackPatchInfo::FLAG_SIZE_F32) v1 = fpr.R1(inst.FD, false); + ARMReg rA = R11; + ARMReg addr = R12; + + u32 imm_addr = 0; + bool is_immediate = false; if (update) { - RA = gpr.R(a); - // Update path /always/ uses RA - if (offsetReg == -1) // uses SIMM_16 + // Always uses RA + if (gpr.IsImm(a) && offsetReg == -1) { - MOVI2R(rB, offset); - ADD(rB, rB, RA); + is_immediate = true; + imm_addr = offset + gpr.GetImm(a); + } + else if (gpr.IsImm(a) && offsetReg != -1 && gpr.IsImm(offsetReg)) + { + is_immediate = true; + imm_addr = gpr.GetImm(a) + gpr.GetImm(offsetReg); } else - { - ADD(rB, gpr.R(offsetReg), RA); - } - } - else - { - if (zeroA) { if (offsetReg == -1) { - if (a) + Operand2 off; + if (TryMakeOperand2(offset, off)) { - RA = gpr.R(a); - MOVI2R(rB, offset); - ADD(rB, rB, RA); + ADD(addr, gpr.R(a), off); } else { - MOVI2R(rB, (u32)offset); + MOVI2R(addr, offset); + ADD(addr, addr, gpr.R(a)); } } else { - ARMReg RB = gpr.R(offsetReg); - if (a) - { - RA = gpr.R(a); - ADD(rB, RB, RA); - } - else - { - MOV(rB, RB); - } + ADD(addr, gpr.R(offsetReg), gpr.R(a)); } } } + else + 
{ + if (offsetReg == -1) + { + if (a && gpr.IsImm(a)) + { + is_immediate = true; + imm_addr = gpr.GetImm(a) + offset; + } + else if (a) + { + Operand2 off; + if (TryMakeOperand2(offset, off)) + { + ADD(addr, gpr.R(a), off); + } + else + { + MOVI2R(addr, offset); + ADD(addr, addr, gpr.R(a)); + } + } + else + { + is_immediate = true; + imm_addr = offset; + } + } + else + { + if (a && gpr.IsImm(a) && gpr.IsImm(offsetReg)) + { + is_immediate = true; + imm_addr = gpr.GetImm(a) + gpr.GetImm(offsetReg); + } + else if (!a && gpr.IsImm(offsetReg)) + { + is_immediate = true; + imm_addr = gpr.GetImm(offsetReg); + } + else if (a) + { + ADD(addr, gpr.R(a), gpr.R(offsetReg)); + } + else + { + MOV(addr, gpr.R(offsetReg)); + } + } + } + + if (update) + RA = gpr.R(a); + + if (is_immediate) + MOVI2R(addr, imm_addr); + LDR(rA, R9, PPCSTATE_OFF(Exceptions)); CMP(rA, EXCEPTION_DSI); FixupBranch DoNotLoad = B_CC(CC_EQ); if (update) - MOV(RA, rB); + MOV(RA, addr); - // This branch gets changed to a NOP when the fastpath fails - FixupBranch fast_path; - if (SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem) - fast_path = B(); + EmitBackpatchRoutine(this, flags, + SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem, + !(is_immediate && Memory::IsRAMAddress(imm_addr)), v0, v1); - { - PUSH(4, R0, R1, R2, R3); - MOV(R0, rB); - if (single) - { - MOVI2R(rA, (u32)&Memory::Read_U32); - BL(rA); - VMOV(S0, R0); - VCVT(v0, S0, 0); - VCVT(v1, S0, 0); - } - else - { - MOVI2R(rA, (u32)&Memory::Read_F64); - BL(rA); - -#if !defined(__ARM_PCS_VFP) // SoftFP returns in R0 and R1 - VMOV(v0, R0); -#else - VMOV(v0, D0); -#endif - } - POP(4, R0, R1, R2, R3); - } - if (SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem) - { - FixupBranch slow_out = B(); - SetJumpTarget(fast_path); - { - Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK) - ARMReg rC = gpr.GetReg(); - BIC(rC, rB, mask); - MOVI2R(rA, (u32)Memory::base); - ADD(rC, rC, rA); - - NEONXEmitter nemit(this); - if (single) - { - nemit.VLD1(F_32, D0, rC); - nemit.VREV32(I_8, D0, D0); // Byte swap to result - VCVT(v0, S0, 0); - VCVT(v1, S0, 0); - } - else - { - nemit.VLD1(I_64, v0, rC); - nemit.VREV64(I_8, v0, v0); // Byte swap to result - } - gpr.Unlock(rC); - } - SetJumpTarget(slow_out); - } - - gpr.Unlock(rA, rB); SetJumpTarget(DoNotLoad); } @@ -201,16 +193,13 @@ void JitArm::stfXX(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(bJITLoadStoreFloatingOff); - ARMReg rA = gpr.GetReg(); - ARMReg rB = gpr.GetReg(); ARMReg RA; u32 a = inst.RA, b = inst.RB; s32 offset = inst.SIMM_16; - bool single = false; + u32 flags = BackPatchInfo::FLAG_STORE; bool update = false; - bool zeroA = false; s32 offsetReg = -1; switch (inst.OPCD) @@ -219,157 +208,196 @@ void JitArm::stfXX(UGeckoInstruction inst) switch (inst.SUBOP10) { case 663: // stfsx - single = true; - zeroA = true; + flags |= BackPatchInfo::FLAG_SIZE_F32; offsetReg = b; break; case 695: // stfsux - single = true; + flags |= BackPatchInfo::FLAG_SIZE_F32; offsetReg = b; break; case 727: // stfdx - zeroA = true; + flags |= BackPatchInfo::FLAG_SIZE_F64; offsetReg = b; break; case 759: // stfdux + flags |= BackPatchInfo::FLAG_SIZE_F64; update = true; offsetReg = b; break; } break; case 53: // stfsu + flags |= BackPatchInfo::FLAG_SIZE_F32; update = true; - single = true; break; case 52: // stfs - single = true; - zeroA = true; + flags |= BackPatchInfo::FLAG_SIZE_F32; break; case 55: // stfdu + flags |= BackPatchInfo::FLAG_SIZE_F64; update = true; break; case 54: // stfd - zeroA = true; + flags |= 
BackPatchInfo::FLAG_SIZE_F64; break; } ARMReg v0 = fpr.R0(inst.FS); + ARMReg rA = R11; + ARMReg addr = R12; + + u32 imm_addr = 0; + bool is_immediate = false; if (update) { - RA = gpr.R(a); - // Update path /always/ uses RA - if (offsetReg == -1) // uses SIMM_16 + // Always uses RA + if (gpr.IsImm(a) && offsetReg == -1) { - MOVI2R(rB, offset); - ADD(rB, rB, RA); + is_immediate = true; + imm_addr = offset + gpr.GetImm(a); + } + else if (gpr.IsImm(a) && offsetReg != -1 && gpr.IsImm(offsetReg)) + { + is_immediate = true; + imm_addr = gpr.GetImm(a) + gpr.GetImm(offsetReg); } else { - ADD(rB, gpr.R(offsetReg), RA); + if (offsetReg == -1) + { + Operand2 off; + if (TryMakeOperand2(offset, off)) + { + ADD(addr, gpr.R(a), off); + } + else + { + MOVI2R(addr, offset); + ADD(addr, addr, gpr.R(a)); + } + } + else + { + ADD(addr, gpr.R(offsetReg), gpr.R(a)); + } } } else { - if (zeroA) + if (offsetReg == -1) { - if (offsetReg == -1) + if (a && gpr.IsImm(a)) { - if (a) + is_immediate = true; + imm_addr = gpr.GetImm(a) + offset; + } + else if (a) + { + Operand2 off; + if (TryMakeOperand2(offset, off)) { - RA = gpr.R(a); - MOVI2R(rB, offset); - ADD(rB, rB, RA); + ADD(addr, gpr.R(a), off); } else { - MOVI2R(rB, (u32)offset); + MOVI2R(addr, offset); + ADD(addr, addr, gpr.R(a)); } } else { - ARMReg RB = gpr.R(offsetReg); - if (a) - { - RA = gpr.R(a); - ADD(rB, RB, RA); - } - else - { - MOV(rB, RB); - } + is_immediate = true; + imm_addr = offset; + } + } + else + { + if (a && gpr.IsImm(a) && gpr.IsImm(offsetReg)) + { + is_immediate = true; + imm_addr = gpr.GetImm(a) + gpr.GetImm(offsetReg); + } + else if (!a && gpr.IsImm(offsetReg)) + { + is_immediate = true; + imm_addr = gpr.GetImm(offsetReg); + } + else if (a) + { + ADD(addr, gpr.R(a), gpr.R(offsetReg)); + } + else + { + MOV(addr, gpr.R(offsetReg)); } } } + if (is_immediate) + MOVI2R(addr, imm_addr); + if (update) { + RA = gpr.R(a); LDR(rA, R9, PPCSTATE_OFF(Exceptions)); CMP(rA, EXCEPTION_DSI); SetCC(CC_NEQ); - MOV(RA, rB); + MOV(RA, addr); SetCC(); } - // This branch gets changed to a NOP when the fastpath fails - FixupBranch fast_path; - if (SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem) - fast_path = B(); - + if (is_immediate) { - PUSH(4, R0, R1, R2, R3); - if (single) + if ((imm_addr & 0xFFFFF000) == 0xCC008000 && jit->jo.optimizeGatherPipe) { - MOV(R1, rB); - VCVT(S0, v0, 0); - VMOV(R0, S0); - MOVI2R(rA, (u32)&Memory::Write_U32); - BL(rA); - } - else - { - MOVI2R(rA, (u32)&Memory::Write_F64); -#if !defined(__ARM_PCS_VFP) // SoftFP returns in R0 and R1 - VMOV(R0, v0); - MOV(R2, rB); -#else - VMOV(D0, v0); - MOV(R0, rB); -#endif - BL(rA); - } - POP(4, R0, R1, R2, R3); - } - - if (SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem) - { - FixupBranch slow_out = B(); - SetJumpTarget(fast_path); - { - Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK) - ARMReg rC = gpr.GetReg(); - BIC(rC, rB, mask); - MOVI2R(rA, (u32)Memory::base); - ADD(rC, rC, rA); + int accessSize; + if (flags & BackPatchInfo::FLAG_SIZE_F64) + accessSize = 64; + else + accessSize = 32; + MOVI2R(R14, (u32)&GPFifo::m_gatherPipeCount); + MOVI2R(R10, (u32)GPFifo::m_gatherPipe); + LDR(R11, R14); + ADD(R10, R10, R11); NEONXEmitter nemit(this); - if (single) + if (accessSize == 64) + { + PUSH(2, R0, R1); + nemit.VREV64(I_8, D0, v0); + VMOV(R0, D0); + STR(R0, R10, 0); + STR(R1, R10, 4); + POP(2, R0, R1); + } + else if (accessSize == 32) { VCVT(S0, v0, 0); nemit.VREV32(I_8, D0, D0); - VSTR(S0, rC, 0); + VMOV(addr, S0); + STR(addr, R10); } - else - { - nemit.VREV64(I_8, D0, 
v0); - VSTR(D0, rC, 0); - } - gpr.Unlock(rC); - } - SetJumpTarget(slow_out); - } + ADD(R11, R11, accessSize >> 3); + STR(R11, R14); + jit->js.fifoBytesThisBlock += accessSize >> 3; - gpr.Unlock(rA, rB); + } + else if (Memory::IsRAMAddress(imm_addr)) + { + MOVI2R(addr, imm_addr); + EmitBackpatchRoutine(this, flags, SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem, false, v0); + } + else + { + MOVI2R(addr, imm_addr); + EmitBackpatchRoutine(this, flags, false, false, v0); + } + } + else + { + EmitBackpatchRoutine(this, flags, SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem, true, v0); + } }