From c326f9b02f63d3635ad360953f4b2b85b2073d78 Mon Sep 17 00:00:00 2001 From: merry Date: Thu, 30 Mar 2023 15:54:45 +0100 Subject: [PATCH] backend/arm64: Implement RSB --- .../backend/arm64/a32_address_space.cpp | 20 ++++++++++++- .../backend/arm64/a64_address_space.cpp | 20 ++++++++++++- src/dynarmic/backend/arm64/address_space.cpp | 13 ++++---- src/dynarmic/backend/arm64/emit_arm64.cpp | 24 ++++++++++++--- src/dynarmic/backend/arm64/emit_arm64.h | 2 +- src/dynarmic/backend/arm64/emit_arm64_a32.cpp | 25 ++++++++++++++-- src/dynarmic/backend/arm64/emit_arm64_a64.cpp | 30 +++++++++++++++++-- src/dynarmic/backend/arm64/stack_layout.h | 16 ++++++++-- 8 files changed, 128 insertions(+), 22 deletions(-) diff --git a/src/dynarmic/backend/arm64/a32_address_space.cpp b/src/dynarmic/backend/arm64/a32_address_space.cpp index 2a90d745..864c4588 100644 --- a/src/dynarmic/backend/arm64/a32_address_space.cpp +++ b/src/dynarmic/backend/arm64/a32_address_space.cpp @@ -220,7 +220,7 @@ void A32AddressSpace::EmitPrelude() { prelude_info.add_ticks = EmitCallTrampoline<&A32::UserCallbacks::AddTicks>(code, conf.callbacks); prelude_info.get_ticks_remaining = EmitCallTrampoline<&A32::UserCallbacks::GetTicksRemaining>(code, conf.callbacks); - oaknut::Label return_from_run_code; + oaknut::Label return_from_run_code, l_return_to_dispatcher; prelude_info.run_code = code.ptr(); { @@ -236,6 +236,13 @@ void A32AddressSpace::EmitPrelude() { code.MOV(Xfastmem, mcl::bit_cast(conf.fastmem_pointer)); } + if (conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) { + code.LDR(Xscratch0, l_return_to_dispatcher); + for (size_t i = 0; i < RSBCount; i++) { + code.STR(Xscratch0, SP, offsetof(StackLayout, rsb) + offsetof(RSBEntry, code_ptr) + i * sizeof(RSBEntry)); + } + } + if (conf.enable_cycle_counting) { code.BL(prelude_info.get_ticks_remaining); code.MOV(Xticks, X0); @@ -268,6 +275,13 @@ void A32AddressSpace::EmitPrelude() { code.MOV(Xfastmem, mcl::bit_cast(conf.fastmem_pointer)); } + if (conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) { + code.LDR(Xscratch0, l_return_to_dispatcher); + for (size_t i = 0; i < RSBCount; i++) { + code.STR(Xscratch0, SP, offsetof(StackLayout, rsb) + offsetof(RSBEntry, code_ptr) + i * sizeof(RSBEntry)); + } + } + if (conf.enable_cycle_counting) { code.MOV(Xticks, 1); code.STR(Xticks, SP, offsetof(StackLayout, cycles_to_run)); @@ -342,6 +356,10 @@ void A32AddressSpace::EmitPrelude() { code.RET(); } + code.align(8); + code.l(l_return_to_dispatcher); + code.dx(mcl::bit_cast(prelude_info.return_to_dispatcher)); + prelude_info.end_of_prelude = code.ptr(); mem.invalidate_all(); diff --git a/src/dynarmic/backend/arm64/a64_address_space.cpp b/src/dynarmic/backend/arm64/a64_address_space.cpp index 9c171b52..1e5632ab 100644 --- a/src/dynarmic/backend/arm64/a64_address_space.cpp +++ b/src/dynarmic/backend/arm64/a64_address_space.cpp @@ -398,7 +398,7 @@ void A64AddressSpace::EmitPrelude() { prelude_info.add_ticks = EmitCallTrampoline<&A64::UserCallbacks::AddTicks>(code, conf.callbacks); prelude_info.get_ticks_remaining = EmitCallTrampoline<&A64::UserCallbacks::GetTicksRemaining>(code, conf.callbacks); - oaknut::Label return_from_run_code; + oaknut::Label return_from_run_code, l_return_to_dispatcher; prelude_info.run_code = code.ptr(); { @@ -414,6 +414,13 @@ void A64AddressSpace::EmitPrelude() { code.MOV(Xfastmem, mcl::bit_cast(conf.fastmem_pointer)); } + if (conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) { + code.LDR(Xscratch0, l_return_to_dispatcher); + for (size_t i = 0; i < RSBCount; i++) { + code.STR(Xscratch0, SP, offsetof(StackLayout, rsb) + offsetof(RSBEntry, code_ptr) + i * sizeof(RSBEntry)); + } + } + if (conf.enable_cycle_counting) { code.BL(prelude_info.get_ticks_remaining); code.MOV(Xticks, X0); @@ -445,6 +452,13 @@ void A64AddressSpace::EmitPrelude() { code.MOV(Xfastmem, mcl::bit_cast(conf.fastmem_pointer)); } + if (conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) { + code.LDR(Xscratch0, l_return_to_dispatcher); + for (size_t i = 0; i < RSBCount; i++) { + code.STR(Xscratch0, SP, offsetof(StackLayout, rsb) + offsetof(RSBEntry, code_ptr) + i * sizeof(RSBEntry)); + } + } + if (conf.enable_cycle_counting) { code.MOV(Xticks, 1); code.STR(Xticks, SP, offsetof(StackLayout, cycles_to_run)); @@ -518,6 +532,10 @@ void A64AddressSpace::EmitPrelude() { code.RET(); } + code.align(8); + code.l(l_return_to_dispatcher); + code.dx(mcl::bit_cast(prelude_info.return_to_dispatcher)); + prelude_info.end_of_prelude = code.ptr(); mem.invalidate_all(); diff --git a/src/dynarmic/backend/arm64/address_space.cpp b/src/dynarmic/backend/arm64/address_space.cpp index 70acb229..fdb4efe1 100644 --- a/src/dynarmic/backend/arm64/address_space.cpp +++ b/src/dynarmic/backend/arm64/address_space.cpp @@ -120,7 +120,7 @@ EmittedBlockInfo AddressSpace::Emit(IR::Block block) { return block_info; } -static void LinkBlockLinks(const CodePtr entry_point, const CodePtr target_ptr, const std::vector& block_relocations_list) { +static void LinkBlockLinks(const CodePtr entry_point, const CodePtr target_ptr, const std::vector& block_relocations_list, void* return_to_dispatcher) { using namespace oaknut; using namespace oaknut::util; @@ -135,12 +135,11 @@ static void LinkBlockLinks(const CodePtr entry_point, const CodePtr target_ptr, c.NOP(); } break; - case BlockRelocationType::MoveToScratch0: + case BlockRelocationType::MoveToScratch1: if (target_ptr) { - c.ADRL(Xscratch0, (void*)target_ptr); + c.ADRL(Xscratch1, (void*)target_ptr); } else { - c.NOP(); - c.NOP(); + c.ADRL(Xscratch1, return_to_dispatcher); } break; default: @@ -284,7 +283,7 @@ void AddressSpace::Link(EmittedBlockInfo& block_info) { for (auto [target_descriptor, list] : block_info.block_relocations) { block_references[target_descriptor].emplace(block_info.entry_point); - LinkBlockLinks(block_info.entry_point, Get(target_descriptor), list); + LinkBlockLinks(block_info.entry_point, Get(target_descriptor), list, prelude_info.return_to_dispatcher); } } @@ -294,7 +293,7 @@ void AddressSpace::RelinkForDescriptor(IR::LocationDescriptor target_descriptor, const EmittedBlockInfo& block_info = block_iter->second; if (auto relocation_iter = block_info.block_relocations.find(target_descriptor); relocation_iter != block_info.block_relocations.end()) { - LinkBlockLinks(block_info.entry_point, target_ptr, relocation_iter->second); + LinkBlockLinks(block_info.entry_point, target_ptr, relocation_iter->second, prelude_info.return_to_dispatcher); } mem.invalidate(reinterpret_cast(block_info.entry_point), block_info.size); diff --git a/src/dynarmic/backend/arm64/emit_arm64.cpp b/src/dynarmic/backend/arm64/emit_arm64.cpp index 117c957a..cc55b26a 100644 --- a/src/dynarmic/backend/arm64/emit_arm64.cpp +++ b/src/dynarmic/backend/arm64/emit_arm64.cpp @@ -45,8 +45,24 @@ void EmitIR(oaknut::CodeGenerator& code, EmitConte } template<> -void EmitIR(oaknut::CodeGenerator&, EmitContext&, IR::Inst*) { - // TODO +void EmitIR(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + if (!ctx.conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) { + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[0].IsImmediate()); + const IR::LocationDescriptor target{args[0].GetImmediateU64()}; + + code.LDR(Wscratch2, SP, offsetof(StackLayout, rsb_ptr)); + code.ADD(Wscratch2, Wscratch2, sizeof(RSBEntry)); + code.AND(Wscratch2, Wscratch2, RSBIndexMask); + code.STR(Wscratch2, SP, offsetof(StackLayout, rsb_ptr)); + code.ADD(Xscratch2, SP, Xscratch2); + + code.MOV(Xscratch0, target.Value()); + EmitBlockLinkRelocation(code, ctx, target, BlockRelocationType::MoveToScratch1); + code.STP(Xscratch0, Xscratch1, Xscratch2, offsetof(StackLayout, rsb)); } template<> @@ -262,8 +278,8 @@ void EmitBlockLinkRelocation(oaknut::CodeGenerator& code, EmitContext& ctx, cons case BlockRelocationType::Branch: code.NOP(); break; - case BlockRelocationType::MoveToScratch0: - code.NOP(); + case BlockRelocationType::MoveToScratch1: + code.BRK(0); code.NOP(); break; default: diff --git a/src/dynarmic/backend/arm64/emit_arm64.h b/src/dynarmic/backend/arm64/emit_arm64.h index eac19d24..4b7020d6 100644 --- a/src/dynarmic/backend/arm64/emit_arm64.h +++ b/src/dynarmic/backend/arm64/emit_arm64.h @@ -93,7 +93,7 @@ struct Relocation { enum class BlockRelocationType { Branch, - MoveToScratch0, + MoveToScratch1, }; struct BlockRelocation { diff --git a/src/dynarmic/backend/arm64/emit_arm64_a32.cpp b/src/dynarmic/backend/arm64/emit_arm64_a32.cpp index cee91dc9..909dde73 100644 --- a/src/dynarmic/backend/arm64/emit_arm64_a32.cpp +++ b/src/dynarmic/backend/arm64/emit_arm64_a32.cpp @@ -93,10 +93,29 @@ void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Li EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); } -void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::PopRSBHint, IR::LocationDescriptor, bool) { - EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); +void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::PopRSBHint, IR::LocationDescriptor, bool is_single_step) { + if (ctx.conf.HasOptimization(OptimizationFlag::ReturnStackBuffer) && !is_single_step) { + oaknut::Label fail; - // TODO: Implement PopRSBHint optimization + code.LDR(Wscratch2, SP, offsetof(StackLayout, rsb_ptr)); + code.AND(Wscratch2, Wscratch2, RSBIndexMask); + code.ADD(X2, SP, Xscratch2); + code.SUB(Wscratch2, Wscratch2, sizeof(RSBEntry)); + code.STR(Wscratch2, SP, offsetof(StackLayout, rsb_ptr)); + + code.LDP(Xscratch0, Xscratch1, X2, offsetof(StackLayout, rsb)); + + static_assert(offsetof(A32JitState, regs) + 16 * sizeof(u32) == offsetof(A32JitState, upper_location_descriptor)); + code.LDUR(X0, Xstate, offsetof(A32JitState, regs) + 15 * sizeof(u32)); + + code.CMP(X0, Xscratch0); + code.B(NE, fail); + code.BR(Xscratch1); + + code.l(fail); + } + + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); } void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::FastDispatchHint, IR::LocationDescriptor, bool) { diff --git a/src/dynarmic/backend/arm64/emit_arm64_a64.cpp b/src/dynarmic/backend/arm64/emit_arm64_a64.cpp index 85973b79..67fd91f2 100644 --- a/src/dynarmic/backend/arm64/emit_arm64_a64.cpp +++ b/src/dynarmic/backend/arm64/emit_arm64_a64.cpp @@ -70,10 +70,34 @@ void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Li EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); } -void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::PopRSBHint, IR::LocationDescriptor, bool) { - EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); +void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::PopRSBHint, IR::LocationDescriptor, bool is_single_step) { + if (ctx.conf.HasOptimization(OptimizationFlag::ReturnStackBuffer) && !is_single_step) { + oaknut::Label fail; - // TODO: Implement PopRSBHint optimization + code.MOV(Wscratch0, A64::LocationDescriptor::fpcr_mask); + code.LDR(W0, Xstate, offsetof(A64JitState, fpcr)); + code.LDR(X1, Xstate, offsetof(A64JitState, pc)); + code.AND(W0, W0, Wscratch0); + code.AND(X1, X1, A64::LocationDescriptor::pc_mask); + code.LSL(X0, X0, A64::LocationDescriptor::fpcr_shift); + code.ORR(X0, X0, X1); + + code.LDR(Wscratch2, SP, offsetof(StackLayout, rsb_ptr)); + code.AND(Wscratch2, Wscratch2, RSBIndexMask); + code.ADD(X2, SP, Xscratch2); + code.SUB(Wscratch2, Wscratch2, sizeof(RSBEntry)); + code.STR(Wscratch2, SP, offsetof(StackLayout, rsb_ptr)); + + code.LDP(Xscratch0, Xscratch1, X2, offsetof(StackLayout, rsb)); + + code.CMP(X0, Xscratch0); + code.B(NE, fail); + code.BR(Xscratch1); + + code.l(fail); + } + + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); } void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::FastDispatchHint, IR::LocationDescriptor, bool) { diff --git a/src/dynarmic/backend/arm64/stack_layout.h b/src/dynarmic/backend/arm64/stack_layout.h index 63218b18..cf7f3259 100644 --- a/src/dynarmic/backend/arm64/stack_layout.h +++ b/src/dynarmic/backend/arm64/stack_layout.h @@ -11,16 +11,28 @@ namespace Dynarmic::Backend::Arm64 { -constexpr size_t SpillCount = 64; - #ifdef _MSC_VER # pragma warning(push) # pragma warning(disable : 4324) // Structure was padded due to alignment specifier #endif +constexpr size_t SpillCount = 64; + +struct alignas(16) RSBEntry { + u64 target; + u64 code_ptr; +}; + +constexpr size_t RSBCount = 8; +constexpr u64 RSBIndexMask = (RSBCount - 1) * sizeof(RSBEntry); + struct alignas(16) StackLayout { + std::array rsb; + std::array, SpillCount> spill; + u32 rsb_ptr; + s64 cycles_to_run; u32 save_host_fpcr;