/* This file is part of the dynarmic project.
 * Copyright (c) 2016 MerryMage
 * This software may be used and distributed according to the terms of the GNU
 * General Public License version 2 or any later version.
 */

#include <limits>
#include <tuple>
#include <vector>

#include "backend_x64/emit_x64.h"
#include "common/x64/abi.h"
#include "common/x64/emitter.h"
#include "frontend/arm_types.h"

// TODO: More optimal use of immediates.
// TODO: Have ARM flags in host flags and not have them use up GPR registers unless necessary.
// TODO: Actually implement that proper instruction selector you've always wanted to sweetheart.

using namespace Gen;

namespace Dynarmic {
namespace BackendX64 {

static OpArg MJitStateReg(Arm::Reg reg) {
    return MDisp(R15, offsetof(JitState, Reg) + sizeof(u32) * static_cast<size_t>(reg));
}

static OpArg MJitStateExtReg(Arm::ExtReg reg) {
    if (reg >= Arm::ExtReg::S0 && reg <= Arm::ExtReg::S31) {
        size_t index = static_cast<size_t>(reg) - static_cast<size_t>(Arm::ExtReg::S0);
        return MDisp(R15, int(offsetof(JitState, ExtReg) + sizeof(u32) * index));
    }
    if (reg >= Arm::ExtReg::D0 && reg <= Arm::ExtReg::D31) {
        size_t index = static_cast<size_t>(reg) - static_cast<size_t>(Arm::ExtReg::D0);
        return MDisp(R15, int(offsetof(JitState, ExtReg) + sizeof(u64) * index));
    }
    ASSERT_MSG(false, "Should never happen.");
}

static OpArg MJitStateCpsr() {
    return MDisp(R15, offsetof(JitState, Cpsr));
}

static IR::Inst* FindUseWithOpcode(IR::Inst* inst, IR::Opcode opcode) {
    switch (opcode) {
    case IR::Opcode::GetCarryFromOp:
        return inst->carry_inst;
    case IR::Opcode::GetOverflowFromOp:
        return inst->overflow_inst;
    default:
        break;
    }
    ASSERT_MSG(false, "unreachable");
    return nullptr;
}

static void EraseInstruction(IR::Block& block, IR::Inst* inst) {
    block.instructions.erase(block.instructions.iterator_to(*inst));
}

EmitX64::BlockDescriptor EmitX64::Emit(const Arm::LocationDescriptor descriptor, Dynarmic::IR::Block& block) {
    inhibit_emission.clear();
    reg_alloc.Reset();

    code->INT3();
    const CodePtr code_ptr = code->GetCodePtr();
    basic_blocks[descriptor].code_ptr = code_ptr;
    unique_hash_to_code_ptr[descriptor.UniqueHash()] = code_ptr;

    EmitCondPrelude(block);

    for (auto iter = block.begin(); iter != block.end(); ++iter) {
        IR::Inst* inst = &*iter;

        // Call the relevant Emit* member function.
        switch (inst->GetOpcode()) {
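        // opcodes.inc is an X-macro list of every IR opcode; expanding OPCODE below turns
        // each entry into a case that dispatches to the matching Emit* member function.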
#define OPCODE(name, type, ...)                   \
        case IR::Opcode::name:                    \
            EmitX64::Emit##name(block, inst);     \
            break;
#include "frontend/ir/opcodes.inc"
#undef OPCODE
        default:
            ASSERT_MSG(false, "Invalid opcode %zu", static_cast<size_t>(inst->GetOpcode()));
            break;
        }

        reg_alloc.EndOfAllocScope();
    }

    EmitAddCycles(block.cycle_count);
    EmitTerminal(block.terminal, block.location);

    reg_alloc.AssertNoMoreUses();

    Patch(descriptor, code_ptr);
    basic_blocks[descriptor].size = code->GetCodePtr() - code_ptr;
    return basic_blocks[descriptor];
}

void EmitX64::EmitBreakpoint(IR::Block&, IR::Inst*) {
    code->INT3();
}

void EmitX64::EmitIdentity(IR::Block& block, IR::Inst* inst) {
    if (!inst->GetArg(0).IsImmediate()) {
        reg_alloc.RegisterAddDef(inst, inst->GetArg(0));
    }
}

void EmitX64::EmitGetRegister(IR::Block&, IR::Inst* inst) {
    Arm::Reg reg = inst->GetArg(0).GetRegRef();
    X64Reg result = reg_alloc.DefRegister(inst, any_gpr);
    code->MOV(32, R(result), MJitStateReg(reg));
}

void EmitX64::EmitGetExtendedRegister32(IR::Block& block, IR::Inst* inst) {
    Arm::ExtReg reg = inst->GetArg(0).GetExtRegRef();
    ASSERT(reg >= Arm::ExtReg::S0 && reg <= Arm::ExtReg::S31);

    X64Reg result = reg_alloc.DefRegister(inst, any_xmm);
    code->MOVSS(result, MJitStateExtReg(reg));
}

void EmitX64::EmitGetExtendedRegister64(IR::Block&, IR::Inst* inst) {
    Arm::ExtReg reg = inst->GetArg(0).GetExtRegRef();
    ASSERT(reg >= Arm::ExtReg::D0 && reg <= Arm::ExtReg::D31);

    X64Reg result = reg_alloc.DefRegister(inst, any_xmm);
    code->MOVSD(result, MJitStateExtReg(reg));
}

void EmitX64::EmitSetRegister(IR::Block&, IR::Inst* inst) {
    Arm::Reg reg = inst->GetArg(0).GetRegRef();
    IR::Value arg = inst->GetArg(1);
    if (arg.IsImmediate()) {
        code->MOV(32, MJitStateReg(reg), Imm32(arg.GetU32()));
    } else {
        X64Reg to_store = reg_alloc.UseRegister(arg.GetInst(), any_gpr);
        code->MOV(32, MJitStateReg(reg), R(to_store));
    }
}

void EmitX64::EmitSetExtendedRegister32(IR::Block&, IR::Inst* inst) {
    Arm::ExtReg reg = inst->GetArg(0).GetExtRegRef();
    ASSERT(reg >= Arm::ExtReg::S0 && reg <= Arm::ExtReg::S31);
    X64Reg source = reg_alloc.UseRegister(inst->GetArg(1), any_xmm);
    code->MOVSS(MJitStateExtReg(reg), source);
}

void EmitX64::EmitSetExtendedRegister64(IR::Block&, IR::Inst* inst) {
    Arm::ExtReg reg = inst->GetArg(0).GetExtRegRef();
    ASSERT(reg >= Arm::ExtReg::D0 && reg <= Arm::ExtReg::D31);
    X64Reg source = reg_alloc.UseRegister(inst->GetArg(1), any_xmm);
    code->MOVSD(MJitStateExtReg(reg), source);
}

void EmitX64::EmitGetNFlag(IR::Block&, IR::Inst* inst) {
    X64Reg result = reg_alloc.DefRegister(inst, any_gpr);
    code->MOV(32, R(result), MJitStateCpsr());
    code->SHR(32, R(result), Imm8(31));
}

void EmitX64::EmitSetNFlag(IR::Block&, IR::Inst* inst) {
    constexpr size_t flag_bit = 31;
    constexpr u32 flag_mask = 1u << flag_bit;
    IR::Value arg = inst->GetArg(0);
    if (arg.IsImmediate()) {
        if (arg.GetU1()) {
            code->OR(32, MJitStateCpsr(), Imm32(flag_mask));
        } else {
            code->AND(32, MJitStateCpsr(), Imm32(~flag_mask));
        }
    } else {
        X64Reg to_store = reg_alloc.UseScratchRegister(arg.GetInst(), any_gpr);
        code->SHL(32, R(to_store), Imm8(flag_bit));
        code->AND(32, MJitStateCpsr(), Imm32(~flag_mask));
        code->OR(32, MJitStateCpsr(), R(to_store));
    }
}

void EmitX64::EmitGetZFlag(IR::Block&, IR::Inst* inst) {
    X64Reg result = reg_alloc.DefRegister(inst, any_gpr);
    code->MOV(32, R(result), MJitStateCpsr());
    code->SHR(32, R(result), Imm8(30));
    code->AND(32, R(result), Imm32(1));
}

void EmitX64::EmitSetZFlag(IR::Block&, IR::Inst* inst) {
    constexpr size_t flag_bit = 30;
    constexpr u32 flag_mask = 1u << flag_bit;
    IR::Value arg = inst->GetArg(0);
    if (arg.IsImmediate()) {
        if
(arg.GetU1()) { code->OR(32, MJitStateCpsr(), Imm32(flag_mask)); } else { code->AND(32, MJitStateCpsr(), Imm32(~flag_mask)); } } else { X64Reg to_store = reg_alloc.UseScratchRegister(arg.GetInst(), any_gpr); code->SHL(32, R(to_store), Imm8(flag_bit)); code->AND(32, MJitStateCpsr(), Imm32(~flag_mask)); code->OR(32, MJitStateCpsr(), R(to_store)); } } void EmitX64::EmitGetCpsr(IR::Block&, IR::Inst* inst) { X64Reg result = reg_alloc.DefRegister(inst, any_gpr); code->MOV(32, R(result), MJitStateCpsr()); } void EmitX64::EmitSetCpsr(IR::Block&, IR::Inst* inst) { X64Reg arg = reg_alloc.UseRegister(inst->GetArg(0), any_gpr); code->MOV(32, MJitStateCpsr(), R(arg)); } void EmitX64::EmitGetCFlag(IR::Block&, IR::Inst* inst) { X64Reg result = reg_alloc.DefRegister(inst, any_gpr); code->MOV(32, R(result), MJitStateCpsr()); code->SHR(32, R(result), Imm8(29)); code->AND(32, R(result), Imm32(1)); } void EmitX64::EmitSetCFlag(IR::Block&, IR::Inst* inst) { constexpr size_t flag_bit = 29; constexpr u32 flag_mask = 1u << flag_bit; IR::Value arg = inst->GetArg(0); if (arg.IsImmediate()) { if (arg.GetU1()) { code->OR(32, MJitStateCpsr(), Imm32(flag_mask)); } else { code->AND(32, MJitStateCpsr(), Imm32(~flag_mask)); } } else { X64Reg to_store = reg_alloc.UseScratchRegister(arg.GetInst(), any_gpr); code->SHL(32, R(to_store), Imm8(flag_bit)); code->AND(32, MJitStateCpsr(), Imm32(~flag_mask)); code->OR(32, MJitStateCpsr(), R(to_store)); } } void EmitX64::EmitGetVFlag(IR::Block&, IR::Inst* inst) { X64Reg result = reg_alloc.DefRegister(inst, any_gpr); code->MOV(32, R(result), MJitStateCpsr()); code->SHR(32, R(result), Imm8(28)); code->AND(32, R(result), Imm32(1)); } void EmitX64::EmitSetVFlag(IR::Block&, IR::Inst* inst) { constexpr size_t flag_bit = 28; constexpr u32 flag_mask = 1u << flag_bit; IR::Value arg = inst->GetArg(0); if (arg.IsImmediate()) { if (arg.GetU1()) { code->OR(32, MJitStateCpsr(), Imm32(flag_mask)); } else { code->AND(32, MJitStateCpsr(), Imm32(~flag_mask)); } } else { X64Reg to_store = reg_alloc.UseScratchRegister(arg.GetInst(), any_gpr); code->SHL(32, R(to_store), Imm8(flag_bit)); code->AND(32, MJitStateCpsr(), Imm32(~flag_mask)); code->OR(32, MJitStateCpsr(), R(to_store)); } } void EmitX64::EmitOrQFlag(IR::Block&, IR::Inst* inst) { constexpr size_t flag_bit = 27; constexpr u32 flag_mask = 1u << flag_bit; IR::Value arg = inst->GetArg(0); if (arg.IsImmediate()) { if (arg.GetU1()) code->OR(32, MJitStateCpsr(), Imm32(flag_mask)); } else { X64Reg to_store = reg_alloc.UseScratchRegister(arg.GetInst(), any_gpr); code->SHL(32, R(to_store), Imm8(flag_bit)); code->OR(32, MJitStateCpsr(), R(to_store)); } } void EmitX64::EmitBXWritePC(IR::Block&, IR::Inst* inst) { const u32 T_bit = 1 << 5; auto arg = inst->GetArg(0); // Pseudocode: // if (new_pc & 1) { // new_pc &= 0xFFFFFFFE; // cpsr.T = true; // } else { // new_pc &= 0xFFFFFFFC; // cpsr.T = false; // } if (arg.IsImmediate()) { u32 new_pc = arg.GetU32(); if (Common::Bit<0>(new_pc)) { new_pc &= 0xFFFFFFFE; code->MOV(32, MJitStateReg(Arm::Reg::PC), Imm32(new_pc)); code->OR(32, MJitStateCpsr(), Imm32(T_bit)); } else { new_pc &= 0xFFFFFFFC; code->MOV(32, MJitStateReg(Arm::Reg::PC), Imm32(new_pc)); code->AND(32, MJitStateCpsr(), Imm32(~T_bit)); } } else { X64Reg new_pc = reg_alloc.UseScratchRegister(arg.GetInst(), any_gpr); X64Reg tmp1 = reg_alloc.ScratchRegister(any_gpr); X64Reg tmp2 = reg_alloc.ScratchRegister(any_gpr); code->MOV(32, R(tmp1), MJitStateCpsr()); code->MOV(32, R(tmp2), R(tmp1)); code->AND(32, R(tmp2), Imm32(~T_bit)); // CPSR.T = 0 code->OR(32, 
R(tmp1), Imm32(T_bit)); // CPSR.T = 1
        code->TEST(8, R(new_pc), Imm8(1));
        code->CMOVcc(32, tmp1, R(tmp2), CC_E); // CPSR.T = pc & 1
        code->MOV(32, MJitStateCpsr(), R(tmp1));
        code->LEA(32, tmp2, MComplex(new_pc, new_pc, 1, 0));
        code->OR(32, R(tmp2), Imm32(0xFFFFFFFC)); // tmp2 = pc & 1 ? 0xFFFFFFFE : 0xFFFFFFFC
        code->AND(32, R(new_pc), R(tmp2));
        code->MOV(32, MJitStateReg(Arm::Reg::PC), R(new_pc));
    }
}

void EmitX64::EmitCallSupervisor(IR::Block&, IR::Inst* inst) {
    auto imm32 = inst->GetArg(0);
    reg_alloc.HostCall(nullptr, imm32);

    code->SwitchMxcsrOnExit();
    code->ABI_CallFunction(reinterpret_cast<void*>(cb.CallSVC));
    code->SwitchMxcsrOnEntry();
}

void EmitX64::EmitPushRSB(IR::Block&, IR::Inst* inst) {
    ASSERT(inst->GetArg(0).IsImmediate());
    u64 imm64 = inst->GetArg(0).GetU64();

    X64Reg code_ptr_reg = reg_alloc.ScratchRegister({HostLoc::RCX});
    X64Reg loc_desc_reg = reg_alloc.ScratchRegister(any_gpr);
    X64Reg index_reg = reg_alloc.ScratchRegister(any_gpr);
    u64 code_ptr = unique_hash_to_code_ptr.find(imm64) != unique_hash_to_code_ptr.end()
                 ? u64(unique_hash_to_code_ptr[imm64])
                 : u64(code->GetReturnFromRunCodeAddress());

    code->MOV(32, R(index_reg), MDisp(R15, offsetof(JitState, rsb_ptr)));
    code->ADD(32, R(index_reg), Imm8(1));
    code->AND(32, R(index_reg), Imm32(JitState::RSBSize - 1));

    code->MOV(64, R(loc_desc_reg), Imm64(imm64));
    CodePtr patch_location = code->GetCodePtr();
    patch_unique_hash_locations[imm64].emplace_back(patch_location);
    code->MOV(64, R(code_ptr_reg), Imm64(code_ptr)); // This line has to match up with EmitX64::Patch.
    ASSERT((code->GetCodePtr() - patch_location) == 10);

    std::vector<FixupBranch> fixups;
    fixups.reserve(JitState::RSBSize);
    for (size_t i = 0; i < JitState::RSBSize; ++i) {
        code->CMP(64, R(loc_desc_reg), MDisp(R15, int(offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64))));
        fixups.push_back(code->J_CC(CC_E));
    }

    code->MOV(32, MDisp(R15, offsetof(JitState, rsb_ptr)), R(index_reg));
    code->MOV(64, MComplex(R15, index_reg, SCALE_8, offsetof(JitState, rsb_location_descriptors)), R(loc_desc_reg));
    code->MOV(64, MComplex(R15, index_reg, SCALE_8, offsetof(JitState, rsb_codeptrs)), R(code_ptr_reg));

    for (auto f : fixups) {
        code->SetJumpTarget(f);
    }
}

void EmitX64::EmitGetCarryFromOp(IR::Block&, IR::Inst*) {
    ASSERT_MSG(0, "should never happen");
}

void EmitX64::EmitGetOverflowFromOp(IR::Block&, IR::Inst*) {
    ASSERT_MSG(0, "should never happen");
}

void EmitX64::EmitPack2x32To1x64(IR::Block&, IR::Inst* inst) {
    OpArg lo;
    X64Reg result;
    if (inst->GetArg(0).IsImmediate()) {
        // TODO: Optimize
        result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr);
        lo = Gen::R(result);
    } else {
        std::tie(lo, result) = reg_alloc.UseDefOpArg(inst->GetArg(0), inst, any_gpr);
    }
    X64Reg hi = reg_alloc.UseScratchRegister(inst->GetArg(1), any_gpr);

    code->SHL(64, R(hi), Imm8(32));
    code->MOVZX(64, 32, result, lo);
    code->OR(64, R(result), R(hi));
}

void EmitX64::EmitLeastSignificantWord(IR::Block&, IR::Inst* inst) {
    reg_alloc.RegisterAddDef(inst, inst->GetArg(0));
}

void EmitX64::EmitMostSignificantWord(IR::Block& block, IR::Inst* inst) {
    auto carry_inst = FindUseWithOpcode(inst, IR::Opcode::GetCarryFromOp);

    auto result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr);
    code->SHR(64, R(result), Imm8(32));

    if (carry_inst) {
        EraseInstruction(block, carry_inst);
        reg_alloc.DecrementRemainingUses(inst);
        X64Reg carry = reg_alloc.DefRegister(carry_inst, any_gpr);
        code->SETcc(CC_C, R(carry));
    }
}

void EmitX64::EmitLeastSignificantHalf(IR::Block&, IR::Inst* inst) {
    reg_alloc.RegisterAddDef(inst, inst->GetArg(0));
}
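// The LeastSignificant{Word,Half,Byte} narrowing ops emit no code: the low bits are already in
// place, so the register allocator simply records the result as sharing the argument's location.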
void EmitX64::EmitLeastSignificantByte(IR::Block&, IR::Inst* inst) { reg_alloc.RegisterAddDef(inst, inst->GetArg(0)); } void EmitX64::EmitMostSignificantBit(IR::Block&, IR::Inst* inst) { X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); // TODO: Flag optimization code->SHR(32, R(result), Imm8(31)); } void EmitX64::EmitIsZero(IR::Block&, IR::Inst* inst) { X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); // TODO: Flag optimization code->TEST(32, R(result), R(result)); code->SETcc(CCFlags::CC_E, R(result)); code->MOVZX(32, 8, result, R(result)); } void EmitX64::EmitIsZero64(IR::Block&, IR::Inst* inst) { X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); // TODO: Flag optimization code->TEST(64, R(result), R(result)); code->SETcc(CCFlags::CC_E, R(result)); code->MOVZX(32, 8, result, R(result)); } void EmitX64::EmitLogicalShiftLeft(IR::Block& block, IR::Inst* inst) { auto carry_inst = FindUseWithOpcode(inst, IR::Opcode::GetCarryFromOp); // TODO: Consider using BMI2 instructions like SHLX when arm-in-host flags is implemented. if (!carry_inst) { if (!inst->GetArg(2).IsImmediate()) { // TODO: Remove redundant argument. reg_alloc.DecrementRemainingUses(inst->GetArg(2).GetInst()); } auto shift_arg = inst->GetArg(1); if (shift_arg.IsImmediate()) { X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); u8 shift = shift_arg.GetU8(); if (shift <= 31) { code->SHL(32, R(result), Imm8(shift)); } else { code->XOR(32, R(result), R(result)); } } else { X64Reg shift = reg_alloc.UseRegister(shift_arg.GetInst(), {HostLoc::RCX}); X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); X64Reg zero = reg_alloc.ScratchRegister(any_gpr); // The 32-bit x64 SHL instruction masks the shift count by 0x1F before performing the shift. // ARM differs from the behaviour: It does not mask the count, so shifts above 31 result in zeros. code->SHL(32, R(result), R(shift)); code->XOR(32, R(zero), R(zero)); code->CMP(8, R(shift), Imm8(32)); code->CMOVcc(32, result, R(zero), CC_NB); } } else { EraseInstruction(block, carry_inst); reg_alloc.DecrementRemainingUses(inst); auto shift_arg = inst->GetArg(1); if (shift_arg.IsImmediate()) { u8 shift = shift_arg.GetU8(); X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); X64Reg carry = reg_alloc.UseDefRegister(inst->GetArg(2), carry_inst, any_gpr); if (shift == 0) { // There is nothing more to do. } else if (shift < 32) { code->BT(32, R(carry), Imm8(0)); code->SHL(32, R(result), Imm8(shift)); code->SETcc(CC_C, R(carry)); } else if (shift > 32) { code->XOR(32, R(result), R(result)); code->XOR(32, R(carry), R(carry)); } else { code->MOV(32, R(carry), R(result)); code->XOR(32, R(result), R(result)); code->AND(32, R(carry), Imm32(1)); } } else { X64Reg shift = reg_alloc.UseRegister(shift_arg.GetInst(), {HostLoc::RCX}); X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); X64Reg carry = reg_alloc.UseDefRegister(inst->GetArg(2), carry_inst, any_gpr); // TODO: Optimize this. 
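            // ARM LSL by register uses the bottom byte of Rs as the shift amount:
            //   Rs & 0xFF == 0:  result and carry unchanged
            //   Rs & 0xFF < 32:  result = value << shift, carry = last bit shifted out
            //   Rs & 0xFF == 32: result = 0, carry = bit 0 of value
            //   Rs & 0xFF > 32:  result = 0, carry = 0
            // The compare/branch chain below reproduces exactly these cases.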
code->CMP(8, R(shift), Imm8(32)); auto Rs_gt32 = code->J_CC(CC_A); auto Rs_eq32 = code->J_CC(CC_E); // if (Rs & 0xFF < 32) { code->BT(32, R(carry), Imm8(0)); // Set the carry flag for correct behaviour in the case when Rs & 0xFF == 0 code->SHL(32, R(result), R(shift)); code->SETcc(CC_C, R(carry)); auto jmp_to_end_1 = code->J(); // } else if (Rs & 0xFF > 32) { code->SetJumpTarget(Rs_gt32); code->XOR(32, R(result), R(result)); code->XOR(32, R(carry), R(carry)); auto jmp_to_end_2 = code->J(); // } else if (Rs & 0xFF == 32) { code->SetJumpTarget(Rs_eq32); code->MOV(32, R(carry), R(result)); code->AND(32, R(carry), Imm8(1)); code->XOR(32, R(result), R(result)); // } code->SetJumpTarget(jmp_to_end_1); code->SetJumpTarget(jmp_to_end_2); } } } void EmitX64::EmitLogicalShiftRight(IR::Block& block, IR::Inst* inst) { auto carry_inst = FindUseWithOpcode(inst, IR::Opcode::GetCarryFromOp); if (!carry_inst) { if (!inst->GetArg(2).IsImmediate()) { // TODO: Remove redundant argument. reg_alloc.DecrementRemainingUses(inst->GetArg(2).GetInst()); } auto shift_arg = inst->GetArg(1); if (shift_arg.IsImmediate()) { X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); u8 shift = shift_arg.GetU8(); if (shift <= 31) { code->SHR(32, R(result), Imm8(shift)); } else { code->XOR(32, R(result), R(result)); } } else { X64Reg shift = reg_alloc.UseRegister(shift_arg.GetInst(), {HostLoc::RCX}); X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); X64Reg zero = reg_alloc.ScratchRegister(any_gpr); // The 32-bit x64 SHR instruction masks the shift count by 0x1F before performing the shift. // ARM differs from the behaviour: It does not mask the count, so shifts above 31 result in zeros. code->SHR(32, R(result), R(shift)); code->XOR(32, R(zero), R(zero)); code->CMP(8, R(shift), Imm8(32)); code->CMOVcc(32, result, R(zero), CC_NB); } } else { EraseInstruction(block, carry_inst); reg_alloc.DecrementRemainingUses(inst); auto shift_arg = inst->GetArg(1); if (shift_arg.IsImmediate()) { u8 shift = shift_arg.GetU8(); X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); X64Reg carry = reg_alloc.UseDefRegister(inst->GetArg(2), carry_inst, any_gpr); if (shift == 0) { // There is nothing more to do. } else if (shift < 32) { code->SHR(32, R(result), Imm8(shift)); code->SETcc(CC_C, R(carry)); } else if (shift == 32) { code->BT(32, R(result), Imm8(31)); code->SETcc(CC_C, R(carry)); code->MOV(32, R(result), Imm32(0)); } else { code->XOR(32, R(result), R(result)); code->XOR(32, R(carry), R(carry)); } } else { X64Reg shift = reg_alloc.UseRegister(shift_arg.GetInst(), {HostLoc::RCX}); X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); X64Reg carry = reg_alloc.UseDefRegister(inst->GetArg(2), carry_inst, any_gpr); // TODO: Optimize this. 
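            // ARM LSR by register (bottom byte of Rs):
            //   Rs & 0xFF == 0:  result and carry unchanged
            //   Rs & 0xFF < 32:  result = value >> shift, carry = last bit shifted out
            //   Rs & 0xFF == 32: result = 0, carry = bit 31 of value
            //   Rs & 0xFF > 32:  result = 0, carry = 0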
code->CMP(8, R(shift), Imm8(32)); auto Rs_gt32 = code->J_CC(CC_A); auto Rs_eq32 = code->J_CC(CC_E); // if (Rs & 0xFF == 0) goto end; code->TEST(8, R(shift), R(shift)); auto Rs_zero = code->J_CC(CC_Z); // if (Rs & 0xFF < 32) { code->SHR(32, R(result), R(shift)); code->SETcc(CC_C, R(carry)); auto jmp_to_end_1 = code->J(); // } else if (Rs & 0xFF > 32) { code->SetJumpTarget(Rs_gt32); code->XOR(32, R(result), R(result)); code->XOR(32, R(carry), R(carry)); auto jmp_to_end_2 = code->J(); // } else if (Rs & 0xFF == 32) { code->SetJumpTarget(Rs_eq32); code->BT(32, R(result), Imm8(31)); code->SETcc(CC_C, R(carry)); code->MOV(32, R(result), Imm32(0)); // } code->SetJumpTarget(jmp_to_end_1); code->SetJumpTarget(jmp_to_end_2); code->SetJumpTarget(Rs_zero); } } } void EmitX64::EmitLogicalShiftRight64(IR::Block& block, IR::Inst* inst) { X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); auto shift_arg = inst->GetArg(1); ASSERT_MSG(shift_arg.IsImmediate(), "variable 64 bit shifts are not implemented"); u8 shift = shift_arg.GetU8(); ASSERT_MSG(shift < 64, "shift width clamping is not implemented"); code->SHR(64, R(result), Imm8(shift)); } void EmitX64::EmitArithmeticShiftRight(IR::Block& block, IR::Inst* inst) { auto carry_inst = FindUseWithOpcode(inst, IR::Opcode::GetCarryFromOp); if (!carry_inst) { if (!inst->GetArg(2).IsImmediate()) { // TODO: Remove redundant argument. reg_alloc.DecrementRemainingUses(inst->GetArg(2).GetInst()); } auto shift_arg = inst->GetArg(1); if (shift_arg.IsImmediate()) { u8 shift = shift_arg.GetU8(); X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); code->SAR(32, R(result), Imm8(shift < 31 ? shift : 31)); } else { X64Reg shift = reg_alloc.UseScratchRegister(shift_arg.GetInst(), {HostLoc::RCX}); X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); X64Reg const31 = reg_alloc.ScratchRegister(any_gpr); // The 32-bit x64 SAR instruction masks the shift count by 0x1F before performing the shift. // ARM differs from the behaviour: It does not mask the count. // We note that all shift values above 31 have the same behaviour as 31 does, so we saturate `shift` to 31. code->MOV(32, R(const31), Imm32(31)); code->MOVZX(32, 8, shift, R(shift)); code->CMP(32, R(shift), Imm32(31)); code->CMOVcc(32, shift, R(const31), CC_G); code->SAR(32, R(result), R(shift)); } } else { EraseInstruction(block, carry_inst); reg_alloc.DecrementRemainingUses(inst); auto shift_arg = inst->GetArg(1); if (shift_arg.IsImmediate()) { u8 shift = shift_arg.GetU8(); X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); X64Reg carry = reg_alloc.UseDefRegister(inst->GetArg(2), carry_inst, any_gpr); if (shift == 0) { // There is nothing more to do. } else if (shift <= 31) { code->SAR(32, R(result), Imm8(shift)); code->SETcc(CC_C, R(carry)); } else { code->SAR(32, R(result), Imm8(31)); code->BT(32, R(result), Imm8(31)); code->SETcc(CC_C, R(carry)); } } else { X64Reg shift = reg_alloc.UseRegister(shift_arg.GetInst(), {HostLoc::RCX}); X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); X64Reg carry = reg_alloc.UseDefRegister(inst->GetArg(2), carry_inst, any_gpr); // TODO: Optimize this. 
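            // ARM ASR by register (bottom byte of Rs):
            //   Rs & 0xFF == 0:  result and carry unchanged
            //   Rs & 0xFF < 32:  result = value >> shift (arithmetic), carry = last bit shifted out
            //   Rs & 0xFF >= 32: result = value >> 31 (all copies of the sign bit), carry = bit 31 of value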
code->CMP(8, R(shift), Imm8(31)); auto Rs_gt31 = code->J_CC(CC_A); // if (Rs & 0xFF == 0) goto end; code->TEST(8, R(shift), R(shift)); auto Rs_zero = code->J_CC(CC_Z); // if (Rs & 0xFF <= 31) { code->SAR(32, R(result), R(shift)); code->SETcc(CC_C, R(carry)); auto jmp_to_end = code->J(); // } else if (Rs & 0xFF > 31) { code->SetJumpTarget(Rs_gt31); code->SAR(32, R(result), Imm8(31)); // Verified. code->BT(32, R(result), Imm8(31)); code->SETcc(CC_C, R(carry)); // } code->SetJumpTarget(jmp_to_end); code->SetJumpTarget(Rs_zero); } } } void EmitX64::EmitRotateRight(IR::Block& block, IR::Inst* inst) { auto carry_inst = FindUseWithOpcode(inst, IR::Opcode::GetCarryFromOp); if (!carry_inst) { if (!inst->GetArg(2).IsImmediate()) { // TODO: Remove redundant argument. reg_alloc.DecrementRemainingUses(inst->GetArg(2).GetInst()); } auto shift_arg = inst->GetArg(1); if (shift_arg.IsImmediate()) { u8 shift = shift_arg.GetU8(); X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); code->ROR(32, R(result), Imm8(shift & 0x1F)); } else { X64Reg shift = reg_alloc.UseRegister(shift_arg.GetInst(), {HostLoc::RCX}); X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); // x64 ROR instruction does (shift & 0x1F) for us. code->ROR(32, R(result), R(shift)); } } else { EraseInstruction(block, carry_inst); reg_alloc.DecrementRemainingUses(inst); auto shift_arg = inst->GetArg(1); if (shift_arg.IsImmediate()) { u8 shift = shift_arg.GetU8(); X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); X64Reg carry = reg_alloc.UseDefRegister(inst->GetArg(2), carry_inst, any_gpr); if (shift == 0) { // There is nothing more to do. } else if ((shift & 0x1F) == 0) { code->BT(32, R(result), Imm8(31)); code->SETcc(CC_C, R(carry)); } else { code->ROR(32, R(result), Imm8(shift)); code->SETcc(CC_C, R(carry)); } } else { X64Reg shift = reg_alloc.UseScratchRegister(shift_arg.GetInst(), {HostLoc::RCX}); X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); X64Reg carry = reg_alloc.UseDefRegister(inst->GetArg(2), carry_inst, any_gpr); // TODO: Optimize // if (Rs & 0xFF == 0) goto end; code->TEST(8, R(shift), R(shift)); auto Rs_zero = code->J_CC(CC_Z); code->AND(32, R(shift), Imm8(0x1F)); auto zero_1F = code->J_CC(CC_Z); // if (Rs & 0x1F != 0) { code->ROR(32, R(result), R(shift)); code->SETcc(CC_C, R(carry)); auto jmp_to_end = code->J(); // } else { code->SetJumpTarget(zero_1F); code->BT(32, R(result), Imm8(31)); code->SETcc(CC_C, R(carry)); // } code->SetJumpTarget(jmp_to_end); code->SetJumpTarget(Rs_zero); } } } void EmitX64::EmitRotateRightExtended(IR::Block& block, IR::Inst* inst) { auto carry_inst = FindUseWithOpcode(inst, IR::Opcode::GetCarryFromOp); X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); X64Reg carry = carry_inst ? reg_alloc.UseDefRegister(inst->GetArg(1), carry_inst, any_gpr) : reg_alloc.UseRegister(inst->GetArg(1), any_gpr); code->BT(32, R(carry), Imm8(0)); code->RCR(32, R(result), Imm8(1)); if (carry_inst) { EraseInstruction(block, carry_inst); reg_alloc.DecrementRemainingUses(inst); code->SETcc(CC_C, R(carry)); } } static X64Reg DoCarry(RegAlloc& reg_alloc, const IR::Value& carry_in, IR::Inst* carry_out) { if (carry_in.IsImmediate()) { return carry_out ? reg_alloc.DefRegister(carry_out, any_gpr) : INVALID_REG; } else { IR::Inst* in = carry_in.GetInst(); return carry_out ? 
reg_alloc.UseDefRegister(in, carry_out, any_gpr) : reg_alloc.UseRegister(in, any_gpr); } } void EmitX64::EmitAddWithCarry(IR::Block& block, IR::Inst* inst) { auto carry_inst = FindUseWithOpcode(inst, IR::Opcode::GetCarryFromOp); auto overflow_inst = FindUseWithOpcode(inst, IR::Opcode::GetOverflowFromOp); IR::Value a = inst->GetArg(0); IR::Value b = inst->GetArg(1); IR::Value carry_in = inst->GetArg(2); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_gpr); X64Reg carry = DoCarry(reg_alloc, carry_in, carry_inst); X64Reg overflow = overflow_inst ? reg_alloc.DefRegister(overflow_inst, any_gpr) : INVALID_REG; // TODO: Consider using LEA. OpArg op_arg = reg_alloc.UseOpArg(b, any_gpr); if (carry_in.IsImmediate()) { if (carry_in.GetU1()) { code->STC(); code->ADC(32, R(result), op_arg); } else { code->ADD(32, R(result), op_arg); } } else { code->BT(32, R(carry), Imm8(0)); code->ADC(32, R(result), op_arg); } if (carry_inst) { EraseInstruction(block, carry_inst); reg_alloc.DecrementRemainingUses(inst); code->SETcc(Gen::CC_C, R(carry)); } if (overflow_inst) { EraseInstruction(block, overflow_inst); reg_alloc.DecrementRemainingUses(inst); code->SETcc(Gen::CC_O, R(overflow)); } } void EmitX64::EmitAdd64(IR::Block& block, IR::Inst* inst) { IR::Value a = inst->GetArg(0); IR::Value b = inst->GetArg(1); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_gpr); OpArg op_arg = reg_alloc.UseOpArg(b, any_gpr); code->ADD(64, R(result), op_arg); } void EmitX64::EmitSubWithCarry(IR::Block& block, IR::Inst* inst) { auto carry_inst = FindUseWithOpcode(inst, IR::Opcode::GetCarryFromOp); auto overflow_inst = FindUseWithOpcode(inst, IR::Opcode::GetOverflowFromOp); IR::Value a = inst->GetArg(0); IR::Value b = inst->GetArg(1); IR::Value carry_in = inst->GetArg(2); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_gpr); X64Reg carry = DoCarry(reg_alloc, carry_in, carry_inst); X64Reg overflow = overflow_inst ? reg_alloc.DefRegister(overflow_inst, any_gpr) : INVALID_REG; // TODO: Consider using LEA. // TODO: Optimize CMP case. // Note that x64 CF is inverse of what the ARM carry flag is here. 
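    // ARM computes a - b - !carry, i.e. the borrow is the inverse of the ARM carry, while x64 SBB
    // subtracts CF directly as the borrow. Hence: carry_in == 1 -> plain SUB; carry_in == 0 -> STC
    // then SBB (borrow forced to 1); unknown carry_in -> BT to load it into CF, CMC to invert, SBB.
    // The ARM carry result is likewise the inverse of the final CF (SETcc CC_NC below).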
OpArg op_arg = reg_alloc.UseOpArg(b, any_gpr); if (carry_in.IsImmediate()) { if (carry_in.GetU1()) { code->SUB(32, R(result), op_arg); } else { code->STC(); code->SBB(32, R(result), op_arg); } } else { code->BT(32, R(carry), Imm8(0)); code->CMC(); code->SBB(32, R(result), op_arg); } if (carry_inst) { EraseInstruction(block, carry_inst); reg_alloc.DecrementRemainingUses(inst); code->SETcc(Gen::CC_NC, R(carry)); } if (overflow_inst) { EraseInstruction(block, overflow_inst); reg_alloc.DecrementRemainingUses(inst); code->SETcc(Gen::CC_O, R(overflow)); } } void EmitX64::EmitSub64(IR::Block& block, IR::Inst* inst) { IR::Value a = inst->GetArg(0); IR::Value b = inst->GetArg(1); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_gpr); OpArg op_arg = reg_alloc.UseOpArg(b, any_gpr); code->SUB(64, R(result), op_arg); } void EmitX64::EmitMul(IR::Block&, IR::Inst* inst) { IR::Value a = inst->GetArg(0); IR::Value b = inst->GetArg(1); if (a.IsImmediate()) std::swap(a, b); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_gpr); if (b.IsImmediate()) { code->IMUL(32, result, R(result), Imm32(b.GetU32())); } else { OpArg op_arg = reg_alloc.UseOpArg(b, any_gpr); code->IMUL(32, result, op_arg); } } void EmitX64::EmitMul64(IR::Block&, IR::Inst* inst) { IR::Value a = inst->GetArg(0); IR::Value b = inst->GetArg(1); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_gpr); OpArg op_arg = reg_alloc.UseOpArg(b, any_gpr); code->IMUL(64, result, op_arg); } void EmitX64::EmitAnd(IR::Block&, IR::Inst* inst) { IR::Value a = inst->GetArg(0); IR::Value b = inst->GetArg(1); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_gpr); OpArg op_arg = reg_alloc.UseOpArg(b, any_gpr); code->AND(32, R(result), op_arg); } void EmitX64::EmitEor(IR::Block&, IR::Inst* inst) { IR::Value a = inst->GetArg(0); IR::Value b = inst->GetArg(1); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_gpr); OpArg op_arg = reg_alloc.UseOpArg(b, any_gpr); code->XOR(32, R(result), op_arg); } void EmitX64::EmitOr(IR::Block&, IR::Inst* inst) { IR::Value a = inst->GetArg(0); IR::Value b = inst->GetArg(1); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_gpr); OpArg op_arg = reg_alloc.UseOpArg(b, any_gpr); code->OR(32, R(result), op_arg); } void EmitX64::EmitNot(IR::Block&, IR::Inst* inst) { IR::Value a = inst->GetArg(0); if (a.IsImmediate()) { X64Reg result = reg_alloc.DefRegister(inst, any_gpr); code->MOV(32, R(result), Imm32(~a.GetU32())); } else { X64Reg result = reg_alloc.UseDefRegister(a.GetInst(), inst, any_gpr); code->NOT(32, R(result)); } } void EmitX64::EmitSignExtendWordToLong(IR::Block&, IR::Inst* inst) { OpArg source; X64Reg result; if (inst->GetArg(0).IsImmediate()) { // TODO: Optimize result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); source = Gen::R(result); } else { std::tie(source, result) = reg_alloc.UseDefOpArg(inst->GetArg(0), inst, any_gpr); } code->MOVSX(64, 32, result, source); } void EmitX64::EmitSignExtendHalfToWord(IR::Block&, IR::Inst* inst) { OpArg source; X64Reg result; if (inst->GetArg(0).IsImmediate()) { // TODO: Optimize result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); source = Gen::R(result); } else { std::tie(source, result) = reg_alloc.UseDefOpArg(inst->GetArg(0), inst, any_gpr); } code->MOVSX(32, 16, result, source); } void EmitX64::EmitSignExtendByteToWord(IR::Block&, IR::Inst* inst) { OpArg source; X64Reg result; if (inst->GetArg(0).IsImmediate()) { // TODO: Optimize result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); source = Gen::R(result); } else { 
std::tie(source, result) = reg_alloc.UseDefOpArg(inst->GetArg(0), inst, any_gpr); } code->MOVSX(32, 8, result, source); } void EmitX64::EmitZeroExtendWordToLong(IR::Block&, IR::Inst* inst) { OpArg source; X64Reg result; if (inst->GetArg(0).IsImmediate()) { // TODO: Optimize result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); source = Gen::R(result); } else { std::tie(source, result) = reg_alloc.UseDefOpArg(inst->GetArg(0), inst, any_gpr); } code->MOVZX(64, 32, result, source); } void EmitX64::EmitZeroExtendHalfToWord(IR::Block&, IR::Inst* inst) { OpArg source; X64Reg result; if (inst->GetArg(0).IsImmediate()) { // TODO: Optimize result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); source = Gen::R(result); } else { std::tie(source, result) = reg_alloc.UseDefOpArg(inst->GetArg(0), inst, any_gpr); } code->MOVZX(32, 16, result, source); } void EmitX64::EmitZeroExtendByteToWord(IR::Block&, IR::Inst* inst) { OpArg source; X64Reg result; if (inst->GetArg(0).IsImmediate()) { // TODO: Optimize result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); source = Gen::R(result); } else { std::tie(source, result) = reg_alloc.UseDefOpArg(inst->GetArg(0), inst, any_gpr); } code->MOVZX(32, 8, result, source); } void EmitX64::EmitByteReverseWord(IR::Block&, IR::Inst* inst) { X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); code->BSWAP(32, result); } void EmitX64::EmitByteReverseHalf(IR::Block&, IR::Inst* inst) { X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); code->ROL(16, R(result), Imm8(8)); } void EmitX64::EmitByteReverseDual(IR::Block&, IR::Inst* inst) { X64Reg result = reg_alloc.UseDefRegister(inst->GetArg(0), inst, any_gpr); code->BSWAP(64, result); } static void EmitPackedOperation(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, void (XEmitter::*fn)(X64Reg, const OpArg&)) { IR::Value a = inst->GetArg(0); IR::Value b = inst->GetArg(1); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_gpr); X64Reg arg = reg_alloc.UseRegister(b, any_gpr); X64Reg xmm_scratch_a = reg_alloc.ScratchRegister(any_xmm); X64Reg xmm_scratch_b = reg_alloc.ScratchRegister(any_xmm); code->MOVD_xmm(xmm_scratch_a, R(result)); code->MOVD_xmm(xmm_scratch_b, R(arg)); (code->*fn)(xmm_scratch_a, R(xmm_scratch_b)); code->MOVD_xmm(R(result), xmm_scratch_a); } void EmitX64::EmitPackedSaturatedAddU8(IR::Block& block, IR::Inst* inst) { EmitPackedOperation(code, reg_alloc, inst, &XEmitter::PADDUSB); } void EmitX64::EmitPackedSaturatedAddS8(IR::Block& block, IR::Inst* inst) { EmitPackedOperation(code, reg_alloc, inst, &XEmitter::PADDSB); } void EmitX64::EmitPackedSaturatedSubU8(IR::Block& block, IR::Inst* inst) { EmitPackedOperation(code, reg_alloc, inst, &XEmitter::PSUBUSB); } void EmitX64::EmitPackedSaturatedSubS8(IR::Block& block, IR::Inst* inst) { EmitPackedOperation(code, reg_alloc, inst, &XEmitter::PSUBSB); } void EmitX64::EmitPackedSaturatedAddU16(IR::Block& block, IR::Inst* inst) { EmitPackedOperation(code, reg_alloc, inst, &XEmitter::PADDUSW); } void EmitX64::EmitPackedSaturatedAddS16(IR::Block& block, IR::Inst* inst) { EmitPackedOperation(code, reg_alloc, inst, &XEmitter::PADDSW); } void EmitX64::EmitPackedSaturatedSubU16(IR::Block& block, IR::Inst* inst) { EmitPackedOperation(code, reg_alloc, inst, &XEmitter::PSUBUSW); } void EmitX64::EmitPackedSaturatedSubS16(IR::Block& block, IR::Inst* inst) { EmitPackedOperation(code, reg_alloc, inst, &XEmitter::PSUBSW); } static void DenormalsAreZero32(BlockOfCode* code, X64Reg xmm_value, X64Reg 
gpr_scratch) { // We need to report back whether we've found a denormal on input. // SSE doesn't do this for us when SSE's DAZ is enabled. code->MOVD_xmm(R(gpr_scratch), xmm_value); code->AND(32, R(gpr_scratch), Imm32(0x7FFFFFFF)); code->SUB(32, R(gpr_scratch), Imm32(1)); code->CMP(32, R(gpr_scratch), Imm32(0x007FFFFE)); auto fixup = code->J_CC(CC_A); code->PXOR(xmm_value, R(xmm_value)); code->MOV(32, MDisp(R15, offsetof(JitState, FPSCR_IDC)), Imm32(1 << 7)); code->SetJumpTarget(fixup); } static void DenormalsAreZero64(BlockOfCode* code, X64Reg xmm_value, X64Reg gpr_scratch) { code->MOVQ_xmm(R(gpr_scratch), xmm_value); code->AND(64, R(gpr_scratch), code->MFloatNonSignMask64()); code->SUB(64, R(gpr_scratch), Imm32(1)); code->CMP(64, R(gpr_scratch), code->MFloatPenultimatePositiveDenormal64()); auto fixup = code->J_CC(CC_A); code->PXOR(xmm_value, R(xmm_value)); code->MOV(32, MDisp(R15, offsetof(JitState, FPSCR_IDC)), Imm32(1 << 7)); code->SetJumpTarget(fixup); } static void FlushToZero32(BlockOfCode* code, X64Reg xmm_value, X64Reg gpr_scratch) { code->MOVD_xmm(R(gpr_scratch), xmm_value); code->AND(32, R(gpr_scratch), Imm32(0x7FFFFFFF)); code->SUB(32, R(gpr_scratch), Imm32(1)); code->CMP(32, R(gpr_scratch), Imm32(0x007FFFFE)); auto fixup = code->J_CC(CC_A); code->PXOR(xmm_value, R(xmm_value)); code->MOV(32, MDisp(R15, offsetof(JitState, FPSCR_UFC)), Imm32(1 << 3)); code->SetJumpTarget(fixup); } static void FlushToZero64(BlockOfCode* code, X64Reg xmm_value, X64Reg gpr_scratch) { code->MOVQ_xmm(R(gpr_scratch), xmm_value); code->AND(64, R(gpr_scratch), code->MFloatNonSignMask64()); code->SUB(64, R(gpr_scratch), Imm32(1)); code->CMP(64, R(gpr_scratch), code->MFloatPenultimatePositiveDenormal64()); auto fixup = code->J_CC(CC_A); code->PXOR(xmm_value, R(xmm_value)); code->MOV(32, MDisp(R15, offsetof(JitState, FPSCR_UFC)), Imm32(1 << 3)); code->SetJumpTarget(fixup); } static void DefaultNaN32(BlockOfCode* code, X64Reg xmm_value) { code->UCOMISS(xmm_value, R(xmm_value)); auto fixup = code->J_CC(CC_NP); code->MOVAPS(xmm_value, code->MFloatNaN32()); code->SetJumpTarget(fixup); } static void DefaultNaN64(BlockOfCode* code, X64Reg xmm_value) { code->UCOMISD(xmm_value, R(xmm_value)); auto fixup = code->J_CC(CC_NP); code->MOVAPS(xmm_value, code->MFloatNaN64()); code->SetJumpTarget(fixup); } static void ZeroIfNaN64(BlockOfCode* code, X64Reg xmm_value) { code->UCOMISD(xmm_value, R(xmm_value)); auto fixup = code->J_CC(CC_NP); code->MOVAPS(xmm_value, code->MFloatPositiveZero64()); code->SetJumpTarget(fixup); } static void FPThreeOp32(BlockOfCode* code, RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst, void (XEmitter::*fn)(X64Reg, const OpArg&)) { IR::Value a = inst->GetArg(0); IR::Value b = inst->GetArg(1); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_xmm); X64Reg operand = reg_alloc.UseRegister(b, any_xmm); X64Reg gpr_scratch = reg_alloc.ScratchRegister(any_gpr); if (block.location.FPSCR().FTZ()) { DenormalsAreZero32(code, result, gpr_scratch); DenormalsAreZero32(code, operand, gpr_scratch); } (code->*fn)(result, R(operand)); if (block.location.FPSCR().FTZ()) { FlushToZero32(code, result, gpr_scratch); } if (block.location.FPSCR().DN()) { DefaultNaN32(code, result); } } static void FPThreeOp64(BlockOfCode* code, RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst, void (XEmitter::*fn)(X64Reg, const OpArg&)) { IR::Value a = inst->GetArg(0); IR::Value b = inst->GetArg(1); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_xmm); X64Reg operand = reg_alloc.UseRegister(b, any_xmm); X64Reg 
gpr_scratch = reg_alloc.ScratchRegister(any_gpr); if (block.location.FPSCR().FTZ()) { DenormalsAreZero64(code, result, gpr_scratch); DenormalsAreZero64(code, operand, gpr_scratch); } (code->*fn)(result, R(operand)); if (block.location.FPSCR().FTZ()) { FlushToZero64(code, result, gpr_scratch); } if (block.location.FPSCR().DN()) { DefaultNaN64(code, result); } } static void FPTwoOp32(BlockOfCode* code, RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst, void (XEmitter::*fn)(X64Reg, const OpArg&)) { IR::Value a = inst->GetArg(0); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_xmm); X64Reg gpr_scratch = reg_alloc.ScratchRegister(any_gpr); if (block.location.FPSCR().FTZ()) { DenormalsAreZero32(code, result, gpr_scratch); } (code->*fn)(result, R(result)); if (block.location.FPSCR().FTZ()) { FlushToZero32(code, result, gpr_scratch); } if (block.location.FPSCR().DN()) { DefaultNaN32(code, result); } } static void FPTwoOp64(BlockOfCode* code, RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst, void (XEmitter::*fn)(X64Reg, const OpArg&)) { IR::Value a = inst->GetArg(0); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_xmm); X64Reg gpr_scratch = reg_alloc.ScratchRegister(any_gpr); if (block.location.FPSCR().FTZ()) { DenormalsAreZero64(code, result, gpr_scratch); } (code->*fn)(result, R(result)); if (block.location.FPSCR().FTZ()) { FlushToZero64(code, result, gpr_scratch); } if (block.location.FPSCR().DN()) { DefaultNaN64(code, result); } } void EmitX64::EmitTransferFromFP32(IR::Block& block, IR::Inst* inst) { X64Reg result = reg_alloc.DefRegister(inst, any_gpr); X64Reg source = reg_alloc.UseRegister(inst->GetArg(0), any_xmm); // TODO: Eliminate this. code->MOVD_xmm(R(result), source); } void EmitX64::EmitTransferFromFP64(IR::Block& block, IR::Inst* inst) { X64Reg result = reg_alloc.DefRegister(inst, any_gpr); X64Reg source = reg_alloc.UseRegister(inst->GetArg(0), any_xmm); // TODO: Eliminate this. code->MOVQ_xmm(R(result), source); } void EmitX64::EmitTransferToFP32(IR::Block& block, IR::Inst* inst) { X64Reg result = reg_alloc.DefRegister(inst, any_xmm); X64Reg source = reg_alloc.UseRegister(inst->GetArg(0), any_gpr); // TODO: Eliminate this. code->MOVD_xmm(result, R(source)); } void EmitX64::EmitTransferToFP64(IR::Block& block, IR::Inst* inst) { X64Reg result = reg_alloc.DefRegister(inst, any_xmm); X64Reg source = reg_alloc.UseRegister(inst->GetArg(0), any_gpr); // TODO: Eliminate this. 
code->MOVQ_xmm(result, R(source)); } void EmitX64::EmitFPAbs32(IR::Block&, IR::Inst* inst) { IR::Value a = inst->GetArg(0); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_xmm); code->PAND(result, code->MFloatNonSignMask32()); } void EmitX64::EmitFPAbs64(IR::Block&, IR::Inst* inst) { IR::Value a = inst->GetArg(0); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_xmm); code->PAND(result, code->MFloatNonSignMask64()); } void EmitX64::EmitFPNeg32(IR::Block&, IR::Inst* inst) { IR::Value a = inst->GetArg(0); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_xmm); code->PXOR(result, code->MFloatNegativeZero32()); } void EmitX64::EmitFPNeg64(IR::Block&, IR::Inst* inst) { IR::Value a = inst->GetArg(0); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_xmm); code->PXOR(result, code->MFloatNegativeZero64()); } void EmitX64::EmitFPAdd32(IR::Block& block, IR::Inst* inst) { FPThreeOp32(code, reg_alloc, block, inst, &XEmitter::ADDSS); } void EmitX64::EmitFPAdd64(IR::Block& block, IR::Inst* inst) { FPThreeOp64(code, reg_alloc, block, inst, &XEmitter::ADDSD); } void EmitX64::EmitFPDiv32(IR::Block& block, IR::Inst* inst) { FPThreeOp32(code, reg_alloc, block, inst, &XEmitter::DIVSS); } void EmitX64::EmitFPDiv64(IR::Block& block, IR::Inst* inst) { FPThreeOp64(code, reg_alloc, block, inst, &XEmitter::DIVSD); } void EmitX64::EmitFPMul32(IR::Block& block, IR::Inst* inst) { FPThreeOp32(code, reg_alloc, block, inst, &XEmitter::MULSS); } void EmitX64::EmitFPMul64(IR::Block& block, IR::Inst* inst) { FPThreeOp64(code, reg_alloc, block, inst, &XEmitter::MULSD); } void EmitX64::EmitFPSqrt32(IR::Block& block, IR::Inst* inst) { FPTwoOp32(code, reg_alloc, block, inst, &XEmitter::SQRTSS); } void EmitX64::EmitFPSqrt64(IR::Block& block, IR::Inst* inst) { FPTwoOp64(code, reg_alloc, block, inst, &XEmitter::SQRTSD); } void EmitX64::EmitFPSub32(IR::Block& block, IR::Inst* inst) { FPThreeOp32(code, reg_alloc, block, inst, &XEmitter::SUBSS); } void EmitX64::EmitFPSub64(IR::Block& block, IR::Inst* inst) { FPThreeOp64(code, reg_alloc, block, inst, &XEmitter::SUBSD); } void EmitX64::EmitFPSingleToDouble(IR::Block& block, IR::Inst* inst) { IR::Value a = inst->GetArg(0); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_xmm); X64Reg gpr_scratch = reg_alloc.ScratchRegister(any_gpr); if (block.location.FPSCR().FTZ()) { DenormalsAreZero32(code, result, gpr_scratch); } code->CVTSS2SD(result, R(result)); if (block.location.FPSCR().FTZ()) { FlushToZero64(code, result, gpr_scratch); } if (block.location.FPSCR().DN()) { DefaultNaN64(code, result); } } void EmitX64::EmitFPDoubleToSingle(IR::Block& block, IR::Inst* inst) { IR::Value a = inst->GetArg(0); X64Reg result = reg_alloc.UseDefRegister(a, inst, any_xmm); X64Reg gpr_scratch = reg_alloc.ScratchRegister(any_gpr); if (block.location.FPSCR().FTZ()) { DenormalsAreZero64(code, result, gpr_scratch); } code->CVTSD2SS(result, R(result)); if (block.location.FPSCR().FTZ()) { FlushToZero32(code, result, gpr_scratch); } if (block.location.FPSCR().DN()) { DefaultNaN32(code, result); } } void EmitX64::EmitFPSingleToS32(IR::Block& block, IR::Inst* inst) { IR::Value a = inst->GetArg(0); bool round_towards_zero = inst->GetArg(1).GetU1(); X64Reg from = reg_alloc.UseScratchRegister(a, any_xmm); X64Reg to = reg_alloc.DefRegister(inst, any_xmm); X64Reg gpr_scratch = reg_alloc.ScratchRegister(any_gpr); // ARM saturates on conversion; this differs from x64 which returns a sentinel value. // Conversion to double is lossless, and allows for clamping. 
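    // Strategy: widen to double, convert once so the host FP exception flags are raised for the
    // original value, then clamp the double to the signed 32-bit range (NaN becomes zero) and
    // convert again to obtain the saturated result ARM expects.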
if (block.location.FPSCR().FTZ()) { DenormalsAreZero32(code, from, gpr_scratch); } code->CVTSS2SD(from, R(from)); // First time is to set flags if (round_towards_zero) { code->CVTTSD2SI(gpr_scratch, R(from)); // 32 bit gpr } else { code->CVTSD2SI(gpr_scratch, R(from)); // 32 bit gpr } // Clamp to output range ZeroIfNaN64(code, from); code->MINSD(from, code->MFloatMaxS32()); code->MAXSD(from, code->MFloatMinS32()); // Second time is for real if (round_towards_zero) { code->CVTTSD2SI(gpr_scratch, R(from)); // 32 bit gpr } else { code->CVTSD2SI(gpr_scratch, R(from)); // 32 bit gpr } code->MOVD_xmm(to, R(gpr_scratch)); } void EmitX64::EmitFPSingleToU32(IR::Block& block, IR::Inst* inst) { IR::Value a = inst->GetArg(0); bool round_towards_zero = inst->GetArg(1).GetU1(); X64Reg from = reg_alloc.UseScratchRegister(a, any_xmm); X64Reg to = reg_alloc.DefRegister(inst, any_xmm); X64Reg gpr_scratch = reg_alloc.ScratchRegister(any_gpr); // ARM saturates on conversion; this differs from x64 which returns a sentinel value. // Conversion to double is lossless, and allows for accurate clamping. // // Since SSE2 doesn't provide an unsigned conversion, we shift the range as appropriate. // // FIXME: Inexact exception not correctly signalled with the below code if (block.location.FPSCR().RMode() != Arm::FPSCR::RoundingMode::TowardsZero && !round_towards_zero) { if (block.location.FPSCR().FTZ()) { DenormalsAreZero32(code, from, gpr_scratch); } code->CVTSS2SD(from, R(from)); ZeroIfNaN64(code, from); // Bring into SSE range code->ADDSD(from, code->MFloatMinS32()); // First time is to set flags code->CVTSD2SI(gpr_scratch, R(from)); // 32 bit gpr // Clamp to output range code->MINSD(from, code->MFloatMaxS32()); code->MAXSD(from, code->MFloatMinS32()); // Actually convert code->CVTSD2SI(gpr_scratch, R(from)); // 32 bit gpr // Bring back into original range code->ADD(32, R(gpr_scratch), Imm32(2147483648u)); code->MOVQ_xmm(to, R(gpr_scratch)); } else { X64Reg xmm_mask = reg_alloc.ScratchRegister(any_xmm); X64Reg gpr_mask = reg_alloc.ScratchRegister(any_gpr); if (block.location.FPSCR().FTZ()) { DenormalsAreZero32(code, from, gpr_scratch); } code->CVTSS2SD(from, R(from)); ZeroIfNaN64(code, from); // Generate masks if out-of-signed-range code->MOVAPS(xmm_mask, code->MFloatMaxS32()); code->CMPLTSD(xmm_mask, R(from)); code->MOVQ_xmm(R(gpr_mask), xmm_mask); code->PAND(xmm_mask, code->MFloatMinS32()); code->AND(32, R(gpr_mask), Imm32(2147483648u)); // Bring into range if necessary code->ADDSD(from, R(xmm_mask)); // First time is to set flags code->CVTTSD2SI(gpr_scratch, R(from)); // 32 bit gpr // Clamp to output range code->MINSD(from, code->MFloatMaxS32()); code->MAXSD(from, code->MFloatMinU32()); // Actually convert code->CVTTSD2SI(gpr_scratch, R(from)); // 32 bit gpr // Bring back into original range if necessary code->ADD(32, R(gpr_scratch), R(gpr_mask)); code->MOVQ_xmm(to, R(gpr_scratch)); } } void EmitX64::EmitFPDoubleToS32(IR::Block& block, IR::Inst* inst) { IR::Value a = inst->GetArg(0); bool round_towards_zero = inst->GetArg(1).GetU1(); X64Reg from = reg_alloc.UseScratchRegister(a, any_xmm); X64Reg to = reg_alloc.DefRegister(inst, any_xmm); X64Reg gpr_scratch = reg_alloc.ScratchRegister(any_gpr); // ARM saturates on conversion; this differs from x64 which returns a sentinel value. 
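    // Same convert/clamp/convert-again saturation scheme as EmitFPSingleToS32 above, minus the
    // initial single-to-double widening.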
if (block.location.FPSCR().FTZ()) { DenormalsAreZero64(code, from, gpr_scratch); } // First time is to set flags if (round_towards_zero) { code->CVTTSD2SI(gpr_scratch, R(from)); // 32 bit gpr } else { code->CVTSD2SI(gpr_scratch, R(from)); // 32 bit gpr } // Clamp to output range ZeroIfNaN64(code, from); code->MINSD(from, code->MFloatMaxS32()); code->MAXSD(from, code->MFloatMinS32()); // Second time is for real if (round_towards_zero) { code->CVTTSD2SI(gpr_scratch, R(from)); // 32 bit gpr } else { code->CVTSD2SI(gpr_scratch, R(from)); // 32 bit gpr } code->MOVD_xmm(to, R(gpr_scratch)); } void EmitX64::EmitFPDoubleToU32(IR::Block& block, IR::Inst* inst) { IR::Value a = inst->GetArg(0); bool round_towards_zero = inst->GetArg(1).GetU1(); X64Reg from = reg_alloc.UseScratchRegister(a, any_xmm); X64Reg to = reg_alloc.DefRegister(inst, any_xmm); X64Reg gpr_scratch = reg_alloc.ScratchRegister(any_gpr); X64Reg xmm_scratch = reg_alloc.ScratchRegister(any_xmm); // ARM saturates on conversion; this differs from x64 which returns a sentinel value. // TODO: Use VCVTPD2UDQ when AVX512VL is available. // FIXME: Inexact exception not correctly signalled with the below code if (block.location.FPSCR().RMode() != Arm::FPSCR::RoundingMode::TowardsZero && !round_towards_zero) { if (block.location.FPSCR().FTZ()) { DenormalsAreZero64(code, from, gpr_scratch); } ZeroIfNaN64(code, from); // Bring into SSE range code->ADDSD(from, code->MFloatMinS32()); // First time is to set flags code->CVTSD2SI(gpr_scratch, R(from)); // 32 bit gpr // Clamp to output range code->MINSD(from, code->MFloatMaxS32()); code->MAXSD(from, code->MFloatMinS32()); // Actually convert code->CVTSD2SI(gpr_scratch, R(from)); // 32 bit gpr // Bring back into original range code->ADD(32, R(gpr_scratch), Imm32(2147483648u)); code->MOVQ_xmm(to, R(gpr_scratch)); } else { X64Reg xmm_mask = reg_alloc.ScratchRegister(any_xmm); X64Reg gpr_mask = reg_alloc.ScratchRegister(any_gpr); if (block.location.FPSCR().FTZ()) { DenormalsAreZero64(code, from, gpr_scratch); } ZeroIfNaN64(code, from); // Generate masks if out-of-signed-range code->MOVAPS(xmm_mask, code->MFloatMaxS32()); code->CMPLTSD(xmm_mask, R(from)); code->MOVQ_xmm(R(gpr_mask), xmm_mask); code->PAND(xmm_mask, code->MFloatMinS32()); code->AND(32, R(gpr_mask), Imm32(2147483648u)); // Bring into range if necessary code->ADDSD(from, R(xmm_mask)); // First time is to set flags code->CVTTSD2SI(gpr_scratch, R(from)); // 32 bit gpr // Clamp to output range code->MINSD(from, code->MFloatMaxS32()); code->MAXSD(from, code->MFloatMinU32()); // Actually convert code->CVTTSD2SI(gpr_scratch, R(from)); // 32 bit gpr // Bring back into original range if necessary code->ADD(32, R(gpr_scratch), R(gpr_mask)); code->MOVQ_xmm(to, R(gpr_scratch)); } } void EmitX64::EmitFPS32ToSingle(IR::Block& block, IR::Inst* inst) { IR::Value a = inst->GetArg(0); bool round_to_nearest = inst->GetArg(1).GetU1(); ASSERT_MSG(!round_to_nearest, "round_to_nearest unimplemented"); X64Reg from = reg_alloc.UseRegister(a, any_xmm); X64Reg to = reg_alloc.DefRegister(inst, any_xmm); X64Reg gpr_scratch = reg_alloc.ScratchRegister(any_gpr); code->MOVD_xmm(R(gpr_scratch), from); code->CVTSI2SS(32, to, R(gpr_scratch)); } void EmitX64::EmitFPU32ToSingle(IR::Block& block, IR::Inst* inst) { IR::Value a = inst->GetArg(0); bool round_to_nearest = inst->GetArg(1).GetU1(); ASSERT_MSG(!round_to_nearest, "round_to_nearest unimplemented"); X64Reg from = reg_alloc.UseRegister(a, any_xmm); X64Reg to = reg_alloc.DefRegister(inst, any_xmm); X64Reg gpr_scratch = 
reg_alloc.ScratchRegister(any_gpr);
    code->MOVD_xmm(R(gpr_scratch), from);
    code->CVTSI2SS(64, to, R(gpr_scratch));
}

void EmitX64::EmitFPS32ToDouble(IR::Block& block, IR::Inst* inst) {
    IR::Value a = inst->GetArg(0);
    bool round_to_nearest = inst->GetArg(1).GetU1();
    ASSERT_MSG(!round_to_nearest, "round_to_nearest unimplemented");

    X64Reg from = reg_alloc.UseRegister(a, any_xmm);
    X64Reg to = reg_alloc.DefRegister(inst, any_xmm);
    X64Reg gpr_scratch = reg_alloc.ScratchRegister(any_gpr);

    code->MOVD_xmm(R(gpr_scratch), from);
    code->CVTSI2SD(32, to, R(gpr_scratch));
}

void EmitX64::EmitFPU32ToDouble(IR::Block& block, IR::Inst* inst) {
    IR::Value a = inst->GetArg(0);
    bool round_to_nearest = inst->GetArg(1).GetU1();
    ASSERT_MSG(!round_to_nearest, "round_to_nearest unimplemented");

    X64Reg from = reg_alloc.UseRegister(a, any_xmm);
    X64Reg to = reg_alloc.DefRegister(inst, any_xmm);
    X64Reg gpr_scratch = reg_alloc.ScratchRegister(any_gpr);

    code->MOVD_xmm(R(gpr_scratch), from);
    code->CVTSI2SD(64, to, R(gpr_scratch));
}

void EmitX64::EmitClearExclusive(IR::Block&, IR::Inst*) {
    code->MOV(8, MDisp(R15, offsetof(JitState, exclusive_state)), Imm8(0));
}

void EmitX64::EmitSetExclusive(IR::Block&, IR::Inst* inst) {
    ASSERT(inst->GetArg(1).IsImmediate());
    X64Reg address = reg_alloc.UseRegister(inst->GetArg(0), any_gpr);

    code->MOV(8, MDisp(R15, offsetof(JitState, exclusive_state)), Imm8(1));
    code->MOV(32, MDisp(R15, offsetof(JitState, exclusive_address)), R(address));
}

void EmitX64::EmitReadMemory8(IR::Block&, IR::Inst* inst) {
    reg_alloc.HostCall(inst, inst->GetArg(0));

    code->ABI_CallFunction(reinterpret_cast<void*>(cb.MemoryRead8));
}

void EmitX64::EmitReadMemory16(IR::Block&, IR::Inst* inst) {
    reg_alloc.HostCall(inst, inst->GetArg(0));

    code->ABI_CallFunction(reinterpret_cast<void*>(cb.MemoryRead16));
}

void EmitX64::EmitReadMemory32(IR::Block&, IR::Inst* inst) {
    reg_alloc.HostCall(inst, inst->GetArg(0));

    code->ABI_CallFunction(reinterpret_cast<void*>(cb.MemoryRead32));
}

void EmitX64::EmitReadMemory64(IR::Block&, IR::Inst* inst) {
    reg_alloc.HostCall(inst, inst->GetArg(0));

    code->ABI_CallFunction(reinterpret_cast<void*>(cb.MemoryRead64));
}

void EmitX64::EmitWriteMemory8(IR::Block&, IR::Inst* inst) {
    reg_alloc.HostCall(nullptr, inst->GetArg(0), inst->GetArg(1));

    code->ABI_CallFunction(reinterpret_cast<void*>(cb.MemoryWrite8));
}

void EmitX64::EmitWriteMemory16(IR::Block&, IR::Inst* inst) {
    reg_alloc.HostCall(nullptr, inst->GetArg(0), inst->GetArg(1));

    code->ABI_CallFunction(reinterpret_cast<void*>(cb.MemoryWrite16));
}

void EmitX64::EmitWriteMemory32(IR::Block&, IR::Inst* inst) {
    reg_alloc.HostCall(nullptr, inst->GetArg(0), inst->GetArg(1));

    code->ABI_CallFunction(reinterpret_cast<void*>(cb.MemoryWrite32));
}

void EmitX64::EmitWriteMemory64(IR::Block&, IR::Inst* inst) {
    reg_alloc.HostCall(nullptr, inst->GetArg(0), inst->GetArg(1));

    code->ABI_CallFunction(reinterpret_cast<void*>(cb.MemoryWrite64));
}

static void ExclusiveWrite(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, void* fn) {
    reg_alloc.HostCall(nullptr, inst->GetArg(0), inst->GetArg(1));
    X64Reg passed = reg_alloc.DefRegister(inst, any_gpr);
    X64Reg tmp = ABI_RETURN; // Use one of the unused HostCall registers.
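    // STREX-style status: 'passed' starts as 1 (store failed) and is cleared to 0 only if the
    // exclusive monitor is armed and the address matches the reservation granule, in which case
    // the monitor is cleared and the write callback is actually invoked.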
    code->MOV(32, R(passed), Imm32(1));
    code->CMP(8, MDisp(R15, offsetof(JitState, exclusive_state)), Imm8(0));
    auto fail1_fixup = code->J_CC(CC_E);
    code->MOV(32, R(tmp), R(ABI_PARAM1));
    code->XOR(32, R(tmp), MDisp(R15, offsetof(JitState, exclusive_address)));
    code->TEST(32, R(tmp), Imm32(JitState::RESERVATION_GRANULE_MASK));
    auto fail2_fixup = code->J_CC(CC_NE);
    code->MOV(8, MDisp(R15, offsetof(JitState, exclusive_state)), Imm8(0));
    code->ABI_CallFunction(fn);
    code->XOR(32, R(passed), R(passed));
    code->SetJumpTarget(fail1_fixup);
    code->SetJumpTarget(fail2_fixup);
}

void EmitX64::EmitExclusiveWriteMemory8(IR::Block&, IR::Inst* inst) {
    ExclusiveWrite(code, reg_alloc, inst, reinterpret_cast<void*>(cb.MemoryWrite8));
}

void EmitX64::EmitExclusiveWriteMemory16(IR::Block&, IR::Inst* inst) {
    ExclusiveWrite(code, reg_alloc, inst, reinterpret_cast<void*>(cb.MemoryWrite16));
}

void EmitX64::EmitExclusiveWriteMemory32(IR::Block&, IR::Inst* inst) {
    ExclusiveWrite(code, reg_alloc, inst, reinterpret_cast<void*>(cb.MemoryWrite32));
}

void EmitX64::EmitExclusiveWriteMemory64(IR::Block&, IR::Inst* inst) {
    reg_alloc.HostCall(nullptr, inst->GetArg(0), inst->GetArg(1));
    X64Reg passed = reg_alloc.DefRegister(inst, any_gpr);
    X64Reg value_hi = reg_alloc.UseScratchRegister(inst->GetArg(2), any_gpr);
    X64Reg value = ABI_PARAM2;
    X64Reg tmp = ABI_RETURN; // Use one of the unused HostCall registers.

    code->MOV(32, R(passed), Imm32(1));
    code->CMP(8, MDisp(R15, offsetof(JitState, exclusive_state)), Imm8(0));
    auto fail1_fixup = code->J_CC(CC_E);
    code->MOV(32, R(tmp), R(ABI_PARAM1));
    code->XOR(32, R(tmp), MDisp(R15, offsetof(JitState, exclusive_address)));
    code->TEST(32, R(tmp), Imm32(JitState::RESERVATION_GRANULE_MASK));
    auto fail2_fixup = code->J_CC(CC_NE);
    code->MOV(8, MDisp(R15, offsetof(JitState, exclusive_state)), Imm8(0));
    code->MOVZX(64, 32, value, R(value));
    code->SHL(64, R(value_hi), Imm8(32));
    code->OR(64, R(value), R(value_hi));
    code->ABI_CallFunction(reinterpret_cast<void*>(cb.MemoryWrite64));
    code->XOR(32, R(passed), R(passed));
    code->SetJumpTarget(fail1_fixup);
    code->SetJumpTarget(fail2_fixup);
}

void EmitX64::EmitAddCycles(size_t cycles) {
    ASSERT(cycles < std::numeric_limits<u32>::max());
    code->SUB(64, MDisp(R15, offsetof(JitState, cycles_remaining)), Imm32(static_cast<u32>(cycles)));
}

static CCFlags EmitCond(BlockOfCode* code, Arm::Cond cond) {
    // TODO: This code is a quick copy-paste-and-quickly-modify job from a previous JIT. Clean this up.
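    // Each lambda below extracts one ARM flag (N/Z/C/V) from the emulated CPSR into the given GPR;
    // the switch then combines them and returns the host condition code that holds when the ARM
    // condition passes, so the caller can branch on it directly.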
auto NFlag = [code](X64Reg reg){ code->MOV(32, R(reg), MJitStateCpsr()); code->SHR(32, R(reg), Imm8(31)); code->AND(32, R(reg), Imm32(1)); }; auto ZFlag = [code](X64Reg reg){ code->MOV(32, R(reg), MJitStateCpsr()); code->SHR(32, R(reg), Imm8(30)); code->AND(32, R(reg), Imm32(1)); }; auto CFlag = [code](X64Reg reg){ code->MOV(32, R(reg), MJitStateCpsr()); code->SHR(32, R(reg), Imm8(29)); code->AND(32, R(reg), Imm32(1)); }; auto VFlag = [code](X64Reg reg){ code->MOV(32, R(reg), MJitStateCpsr()); code->SHR(32, R(reg), Imm8(28)); code->AND(32, R(reg), Imm32(1)); }; CCFlags cc; switch (cond) { case Arm::Cond::EQ: //z ZFlag(RAX); code->CMP(8, R(RAX), Imm8(0)); cc = CC_NE; break; case Arm::Cond::NE: //!z ZFlag(RAX); code->CMP(8, R(RAX), Imm8(0)); cc = CC_E; break; case Arm::Cond::CS: //c CFlag(RBX); code->CMP(8, R(RBX), Imm8(0)); cc = CC_NE; break; case Arm::Cond::CC: //!c CFlag(RBX); code->CMP(8, R(RBX), Imm8(0)); cc = CC_E; break; case Arm::Cond::MI: //n NFlag(RCX); code->CMP(8, R(RCX), Imm8(0)); cc = CC_NE; break; case Arm::Cond::PL: //!n NFlag(RCX); code->CMP(8, R(RCX), Imm8(0)); cc = CC_E; break; case Arm::Cond::VS: //v VFlag(RDX); code->CMP(8, R(RDX), Imm8(0)); cc = CC_NE; break; case Arm::Cond::VC: //!v VFlag(RDX); code->CMP(8, R(RDX), Imm8(0)); cc = CC_E; break; case Arm::Cond::HI: { //c & !z const X64Reg tmp = RSI; ZFlag(RAX); code->MOVZX(64, 8, tmp, R(RAX)); CFlag(RBX); code->CMP(8, R(RBX), R(tmp)); cc = CC_A; break; } case Arm::Cond::LS: { //!c | z const X64Reg tmp = RSI; ZFlag(RAX); code->MOVZX(64, 8, tmp, R(RAX)); CFlag(RBX); code->CMP(8, R(RBX), R(tmp)); cc = CC_BE; break; } case Arm::Cond::GE: { // n == v const X64Reg tmp = RSI; VFlag(RDX); code->MOVZX(64, 8, tmp, R(RDX)); NFlag(RCX); code->CMP(8, R(RCX), R(tmp)); cc = CC_E; break; } case Arm::Cond::LT: { // n != v const X64Reg tmp = RSI; VFlag(RDX); code->MOVZX(64, 8, tmp, R(RDX)); NFlag(RCX); code->CMP(8, R(RCX), R(tmp)); cc = CC_NE; break; } case Arm::Cond::GT: { // !z & (n == v) const X64Reg tmp = RSI; NFlag(RCX); code->MOVZX(64, 8, tmp, R(RCX)); VFlag(RDX); code->XOR(8, R(tmp), R(RDX)); ZFlag(RAX); code->OR(8, R(tmp), R(RAX)); code->TEST(8, R(tmp), R(tmp)); cc = CC_Z; break; } case Arm::Cond::LE: { // z | (n != v) X64Reg tmp = RSI; NFlag(RCX); code->MOVZX(64, 8, tmp, R(RCX)); VFlag(RDX); code->XOR(8, R(tmp), R(RDX)); ZFlag(RAX); code->OR(8, R(tmp), R(RAX)); code->TEST(8, R(tmp), R(tmp)); cc = CC_NZ; break; } default: ASSERT_MSG(0, "Unknown cond %zu", static_cast(cond)); break; } return cc; } void EmitX64::EmitCondPrelude(const IR::Block& block) { if (block.cond == Arm::Cond::AL) { ASSERT(!block.cond_failed.is_initialized()); return; } ASSERT(block.cond_failed.is_initialized()); CCFlags cc = EmitCond(code, block.cond); // TODO: Improve, maybe. 
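    // Conditionally skip the block body: if the condition passes we jump over the exit below;
    // otherwise we charge the cond_failed cycle count and link straight to the cond_failed location.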
    auto fixup = code->J_CC(cc, true);
    EmitAddCycles(block.cond_failed_cycle_count);
    EmitTerminalLinkBlock(IR::Term::LinkBlock{block.cond_failed.get()}, block.location);
    code->SetJumpTarget(fixup);
}

void EmitX64::EmitTerminal(IR::Terminal terminal, Arm::LocationDescriptor initial_location) {
    switch (terminal.which()) {
    case 1:
        EmitTerminalInterpret(boost::get<IR::Term::Interpret>(terminal), initial_location);
        return;
    case 2:
        EmitTerminalReturnToDispatch(boost::get<IR::Term::ReturnToDispatch>(terminal), initial_location);
        return;
    case 3:
        EmitTerminalLinkBlock(boost::get<IR::Term::LinkBlock>(terminal), initial_location);
        return;
    case 4:
        EmitTerminalLinkBlockFast(boost::get<IR::Term::LinkBlockFast>(terminal), initial_location);
        return;
    case 5:
        EmitTerminalPopRSBHint(boost::get<IR::Term::PopRSBHint>(terminal), initial_location);
        return;
    case 6:
        EmitTerminalIf(boost::get<IR::Term::If>(terminal), initial_location);
        return;
    case 7:
        EmitTerminalCheckHalt(boost::get<IR::Term::CheckHalt>(terminal), initial_location);
        return;
    default:
        ASSERT_MSG(0, "Invalid Terminal. Bad programmer.");
        return;
    }
}

void EmitX64::EmitTerminalInterpret(IR::Term::Interpret terminal, Arm::LocationDescriptor initial_location) {
    ASSERT_MSG(terminal.next.TFlag() == initial_location.TFlag(), "Unimplemented");
    ASSERT_MSG(terminal.next.EFlag() == initial_location.EFlag(), "Unimplemented");

    code->MOV(64, R(ABI_PARAM1), Imm64(terminal.next.PC()));
    code->MOV(64, R(ABI_PARAM2), Imm64(reinterpret_cast<u64>(jit_interface)));
    code->MOV(32, MJitStateReg(Arm::Reg::PC), R(ABI_PARAM1));
    code->SwitchMxcsrOnExit();
    code->ABI_CallFunction(reinterpret_cast<const void*>(cb.InterpreterFallback));
    code->ReturnFromRunCode(false); // TODO: Check cycles
}

void EmitX64::EmitTerminalReturnToDispatch(IR::Term::ReturnToDispatch, Arm::LocationDescriptor initial_location) {
    code->ReturnFromRunCode();
}

void EmitX64::EmitTerminalLinkBlock(IR::Term::LinkBlock terminal, Arm::LocationDescriptor initial_location) {
    if (terminal.next.TFlag() != initial_location.TFlag()) {
        if (terminal.next.TFlag()) {
            code->OR(32, MJitStateCpsr(), Imm32(1 << 5));
        } else {
            code->AND(32, MJitStateCpsr(), Imm32(~(1 << 5)));
        }
    }
    if (terminal.next.EFlag() != initial_location.EFlag()) {
        if (terminal.next.EFlag()) {
            code->OR(32, MJitStateCpsr(), Imm32(1 << 9));
        } else {
            code->AND(32, MJitStateCpsr(), Imm32(~(1 << 9)));
        }
    }

    code->CMP(64, MDisp(R15, offsetof(JitState, cycles_remaining)), Imm32(0));

    patch_jg_locations[terminal.next].emplace_back(code->GetWritableCodePtr());
    if (auto next_bb = GetBasicBlock(terminal.next)) {
        code->J_CC(CC_G, next_bb->code_ptr, true);
    } else {
        code->NOP(6); // Leave enough space for a jg instruction.
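        // The pointer recorded in patch_jg_locations above refers to these 6 bytes.
        // Once the successor block has been compiled, Patch() rewrites them in place
        // with a jg rel32 to its entry point (Patch() asserts the emitted jg occupies
        // exactly 6 bytes, matching this NOP padding).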
    }

    code->MOV(32, MJitStateReg(Arm::Reg::PC), Imm32(terminal.next.PC()));
    code->ReturnFromRunCode(); // TODO: Check cycles, Properly do a link
}

void EmitX64::EmitTerminalLinkBlockFast(IR::Term::LinkBlockFast terminal, Arm::LocationDescriptor initial_location) {
    if (terminal.next.TFlag() != initial_location.TFlag()) {
        if (terminal.next.TFlag()) {
            code->OR(32, MJitStateCpsr(), Imm32(1 << 5));
        } else {
            code->AND(32, MJitStateCpsr(), Imm32(~(1 << 5)));
        }
    }
    if (terminal.next.EFlag() != initial_location.EFlag()) {
        if (terminal.next.EFlag()) {
            code->OR(32, MJitStateCpsr(), Imm32(1 << 9));
        } else {
            code->AND(32, MJitStateCpsr(), Imm32(~(1 << 9)));
        }
    }

    patch_jmp_locations[terminal.next].emplace_back(code->GetWritableCodePtr());
    if (auto next_bb = GetBasicBlock(terminal.next)) {
        code->JMP(next_bb->code_ptr, true);
    } else {
        code->MOV(32, MJitStateReg(Arm::Reg::PC), Imm32(terminal.next.PC()));
        code->JMP(code->GetReturnFromRunCodeAddress(), true);
    }
}

void EmitX64::EmitTerminalPopRSBHint(IR::Term::PopRSBHint, Arm::LocationDescriptor initial_location) {
    // This calculation has to match up with IREmitter::PushRSB
    code->MOV(32, R(RBX), MJitStateCpsr());
    code->MOVZX(64, 32, RCX, MJitStateReg(Arm::Reg::PC));
    code->AND(32, R(RBX), Imm32((1 << 5) | (1 << 9)));
    code->SHR(32, R(RBX), Imm8(2));
    code->OR(32, R(RBX), MDisp(R15, offsetof(JitState, guest_FPSCR_mode)));
    code->SHL(64, R(RBX), Imm8(32));
    code->OR(64, R(RBX), R(RCX));

    code->MOV(64, R(RAX), Imm64(u64(code->GetReturnFromRunCodeAddress())));
    for (size_t i = 0; i < JitState::RSBSize; ++i) {
        code->CMP(64, R(RBX), MDisp(R15, int(offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64))));
        code->CMOVcc(64, RAX, MDisp(R15, int(offsetof(JitState, rsb_codeptrs) + i * sizeof(u64))), CC_E);
    }

    code->JMPptr(R(RAX));
}

void EmitX64::EmitTerminalIf(IR::Term::If terminal, Arm::LocationDescriptor initial_location) {
    CCFlags cc = EmitCond(code, terminal.if_);
    auto fixup = code->J_CC(cc, true);
    EmitTerminal(terminal.else_, initial_location);
    code->SetJumpTarget(fixup);
    EmitTerminal(terminal.then_, initial_location);
}

void EmitX64::EmitTerminalCheckHalt(IR::Term::CheckHalt terminal, Arm::LocationDescriptor initial_location) {
    code->CMP(8, MDisp(R15, offsetof(JitState, halt_requested)), Imm8(0));
    code->J_CC(CC_NE, code->GetReturnFromRunCodeAddress(), true);
    EmitTerminal(terminal.else_, initial_location);
}

void EmitX64::Patch(Arm::LocationDescriptor desc, CodePtr bb) {
    u8* const save_code_ptr = code->GetWritableCodePtr();

    for (CodePtr location : patch_jg_locations[desc]) {
        code->SetCodePtr(const_cast<u8*>(location));
        code->J_CC(CC_G, bb, true);
        ASSERT(code->GetCodePtr() - location == 6);
    }

    for (CodePtr location : patch_jmp_locations[desc]) {
        code->SetCodePtr(const_cast<u8*>(location));
        code->JMP(bb, true);
        ASSERT(code->GetCodePtr() - location == 5);
    }

    for (CodePtr location : patch_unique_hash_locations[desc.UniqueHash()]) {
        code->SetCodePtr(const_cast<u8*>(location));
        code->MOV(64, R(RCX), Imm64(u64(bb)));
        ASSERT((code->GetCodePtr() - location) == 10);
    }

    code->SetCodePtr(save_code_ptr);
}

void EmitX64::ClearCache() {
    unique_hash_to_code_ptr.clear();
    patch_unique_hash_locations.clear();
    basic_blocks.clear();
    patch_jg_locations.clear();
    patch_jmp_locations.clear();
}

} // namespace BackendX64
} // namespace Dynarmic