backend/arm64: Implement cycle counting

This commit is contained in:
Merry 2022-11-06 01:10:29 +00:00
parent b5ad066372
commit e476fad5a2
7 changed files with 138 additions and 81 deletions

View file

@ -152,55 +152,6 @@ void A32AddressSpace::EmitPrelude() {
mem.unprotect(); mem.unprotect();
prelude_info.run_code = code.ptr<PreludeInfo::RunCodeFuncType>();
ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout));
code.MOV(Xstate, X1);
code.MOV(Xhalt, X2);
code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor));
code.AND(Wscratch0, Wscratch0, 0xffff0000);
code.MRS(Xscratch1, oaknut::SystemReg::FPCR);
code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr));
code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
code.BR(X0);
prelude_info.step_code = code.ptr<PreludeInfo::RunCodeFuncType>();
ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout));
code.MOV(Xstate, X1);
code.MOV(Xhalt, X2);
code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor));
code.AND(Wscratch0, Wscratch0, 0xffff0000);
code.MRS(Xscratch1, oaknut::SystemReg::FPCR);
code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr));
code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
oaknut::Label step_hr_loop;
code.l(step_hr_loop);
code.LDAXR(Wscratch0, Xhalt);
code.ORR(Wscratch0, Wscratch0, static_cast<u32>(HaltReason::Step));
code.STLXR(Wscratch1, Wscratch0, Xhalt);
code.CBNZ(Wscratch1, step_hr_loop);
code.BR(X0);
prelude_info.return_from_run_code = code.ptr<void*>();
code.LDR(Wscratch0, SP, offsetof(StackLayout, save_host_fpcr));
code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
oaknut::Label exit_hr_loop;
code.l(exit_hr_loop);
code.LDAXR(W0, Xhalt);
code.STLXR(Wscratch0, WZR, Xhalt);
code.CBNZ(Wscratch0, exit_hr_loop);
ABI_PopRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout));
code.RET();
prelude_info.read_memory_8 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead8>(code, conf.callbacks); prelude_info.read_memory_8 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead8>(code, conf.callbacks);
prelude_info.read_memory_16 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead16>(code, conf.callbacks); prelude_info.read_memory_16 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead16>(code, conf.callbacks);
prelude_info.read_memory_32 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead32>(code, conf.callbacks); prelude_info.read_memory_32 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead32>(code, conf.callbacks);
@ -223,6 +174,112 @@ void A32AddressSpace::EmitPrelude() {
prelude_info.add_ticks = EmitCallTrampoline<&A32::UserCallbacks::AddTicks>(code, conf.callbacks); prelude_info.add_ticks = EmitCallTrampoline<&A32::UserCallbacks::AddTicks>(code, conf.callbacks);
prelude_info.get_ticks_remaining = EmitCallTrampoline<&A32::UserCallbacks::GetTicksRemaining>(code, conf.callbacks); prelude_info.get_ticks_remaining = EmitCallTrampoline<&A32::UserCallbacks::GetTicksRemaining>(code, conf.callbacks);
oaknut::Label return_from_run_code;
prelude_info.run_code = code.ptr<PreludeInfo::RunCodeFuncType>();
{
ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout));
code.MOV(X19, X0);
code.MOV(Xstate, X1);
code.MOV(Xhalt, X2);
if (conf.enable_cycle_counting) {
code.BL(prelude_info.get_ticks_remaining);
code.MOV(Xticks, X0);
code.STR(Xticks, SP, offsetof(StackLayout, cycles_to_run));
}
code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor));
code.AND(Wscratch0, Wscratch0, 0xffff0000);
code.MRS(Xscratch1, oaknut::SystemReg::FPCR);
code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr));
code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
code.BR(X19);
}
prelude_info.step_code = code.ptr<PreludeInfo::RunCodeFuncType>();
{
ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout));
code.MOV(X19, X0);
code.MOV(Xstate, X1);
code.MOV(Xhalt, X2);
if (conf.enable_cycle_counting) {
code.MOV(Xticks, 1);
code.STR(Xticks, SP, offsetof(StackLayout, cycles_to_run));
}
code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor));
code.AND(Wscratch0, Wscratch0, 0xffff0000);
code.MRS(Xscratch1, oaknut::SystemReg::FPCR);
code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr));
code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
oaknut::Label step_hr_loop;
code.l(step_hr_loop);
code.LDAXR(Wscratch0, Xhalt);
code.ORR(Wscratch0, Wscratch0, static_cast<u32>(HaltReason::Step));
code.STLXR(Wscratch1, Wscratch0, Xhalt);
code.CBNZ(Wscratch1, step_hr_loop);
code.BR(X19);
}
prelude_info.return_to_dispatcher = code.ptr<void*>();
{
oaknut::Label l_this, l_addr;
code.LDAR(Wscratch0, Xhalt);
code.CBNZ(Wscratch0, return_from_run_code);
if (conf.enable_cycle_counting) {
code.CMP(Xticks, 0);
code.B(LE, return_from_run_code);
}
code.LDR(X0, l_this);
code.MOV(X1, Xstate);
code.LDR(Xscratch0, l_addr);
code.BLR(Xscratch0);
code.BR(X0);
const auto fn = [](A32AddressSpace& self, A32JitState& context) -> CodePtr {
return self.GetOrEmit(context.GetLocationDescriptor());
};
code.align(8);
code.l(l_this);
code.dx(mcl::bit_cast<u64>(this));
code.l(l_addr);
code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn)));
}
prelude_info.return_from_run_code = code.ptr<void*>();
{
code.l(return_from_run_code);
if (conf.enable_cycle_counting) {
code.LDR(X1, SP, offsetof(StackLayout, cycles_to_run));
code.SUB(X1, X1, Xticks);
code.BL(prelude_info.add_ticks);
}
code.LDR(Wscratch0, SP, offsetof(StackLayout, save_host_fpcr));
code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
oaknut::Label exit_hr_loop;
code.l(exit_hr_loop);
code.LDAXR(W0, Xhalt);
code.STLXR(Wscratch0, WZR, Xhalt);
code.CBNZ(Wscratch0, exit_hr_loop);
ABI_PopRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout));
code.RET();
}
prelude_info.end_of_prelude = code.ptr<u32*>(); prelude_info.end_of_prelude = code.ptr<u32*>();
mem.invalidate_all(); mem.invalidate_all();
@ -267,6 +324,9 @@ void A32AddressSpace::Link(EmittedBlockInfo& block_info) {
CodeGenerator c{reinterpret_cast<u32*>(block_info.entry_point + ptr_offset)}; CodeGenerator c{reinterpret_cast<u32*>(block_info.entry_point + ptr_offset)};
switch (target) { switch (target) {
case LinkTarget::ReturnToDispatcher:
c.B(prelude_info.return_to_dispatcher);
break;
case LinkTarget::ReturnFromRunCode: case LinkTarget::ReturnFromRunCode:
c.B(prelude_info.return_from_run_code); c.B(prelude_info.return_from_run_code);
break; break;

View file

@ -55,6 +55,7 @@ private:
using RunCodeFuncType = HaltReason (*)(CodePtr entry_point, A32JitState* context, volatile u32* halt_reason); using RunCodeFuncType = HaltReason (*)(CodePtr entry_point, A32JitState* context, volatile u32* halt_reason);
RunCodeFuncType run_code; RunCodeFuncType run_code;
RunCodeFuncType step_code; RunCodeFuncType step_code;
void* return_to_dispatcher;
void* return_from_run_code; void* return_from_run_code;
void* read_memory_8; void* read_memory_8;

View file

@ -14,6 +14,7 @@ namespace Dynarmic::Backend::Arm64 {
// Host registers with a fixed role in emitted code.
// NOTE(review): this listing looks like a side-by-side diff -- a line that repeats its text shows
// the left (old) and right (new) columns; a line with a single copy exists on one side only.
// Xstate (X28): the prelude entry points copy X1 into it; used as the base for A32JitState loads/stores.
constexpr oaknut::XReg Xstate{28}; constexpr oaknut::XReg Xstate{28};
// Xhalt (X27): the prelude entry points copy X2 into it; addressed with LDAR/LDAXR/STLXR for the halt word.
constexpr oaknut::XReg Xhalt{27}; constexpr oaknut::XReg Xhalt{27};
// Xticks (X26): present on one side only. Holds the tick counter: initialised in the prelude,
// decremented by EmitAddCycles, and compared against 0 in the return_to_dispatcher stub.
constexpr oaknut::XReg Xticks{26};
// X16/X17 in their 64-bit (X) and 32-bit (W) views, used as short-lived temporaries.
constexpr oaknut::XReg Xscratch0{16}, Xscratch1{17}; constexpr oaknut::XReg Xscratch0{16}, Xscratch1{17};
constexpr oaknut::WReg Wscratch0{16}, Wscratch1{17}; constexpr oaknut::WReg Wscratch0{16}, Wscratch1{17};
@ -40,7 +41,7 @@ constexpr auto Rscratch1() {
} }
} }
// Ordered list of general-purpose register indices.
// NOTE(review): presumably the register allocator's preference order -- confirm against its users.
// The left-hand list contains 26; the right-hand list omits it, matching X26 being named Xticks above.
constexpr std::initializer_list<int> GPR_ORDER{19, 20, 21, 22, 23, 24, 25, 26, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8}; constexpr std::initializer_list<int> GPR_ORDER{19, 20, 21, 22, 23, 24, 25, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8};
// Ordered list of floating-point/vector register indices (8..31); identical on both sides.
constexpr std::initializer_list<int> FPR_ORDER{8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; constexpr std::initializer_list<int> FPR_ORDER{8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
// NOTE(review): presumably a bitmask with one bit per register -- confirm against the ABI push/pop helpers.
using RegisterList = u64; using RegisterList = u64;

View file

@ -135,15 +135,17 @@ void EmitIR<IR::Opcode::NZCVFromPackedFlags>(oaknut::CodeGenerator&, EmitContext
ctx.reg_alloc.DefineAsExisting(inst, args[0]); ctx.reg_alloc.DefineAsExisting(inst, args[0]);
} }
// Emits the instructions that charge `cycles_to_add` against the remaining tick budget.
// NOTE(review): side-by-side diff text -- where a line holds two statements, the first is the
// left (old) column and the second is the right (new) column.
//   code          - code generator the instructions are appended to.
//   ctx           - emit context; unnamed on the left, read for ctx.conf.enable_cycle_counting on the right.
//   cycles_to_add - amount subtracted from the counter.
// Left column:  load StackLayout::cycles_remaining into Xscratch0, SUBS, store it back.
// Right column: emit nothing when cycle counting is disabled; otherwise SUB directly on Xticks,
//               with no stack traffic.
static void EmitAddCycles(oaknut::CodeGenerator& code, EmitContext&, size_t cycles_to_add) { static void EmitAddCycles(oaknut::CodeGenerator& code, EmitContext& ctx, size_t cycles_to_add) {
code.LDR(Xscratch0, SP, offsetof(StackLayout, cycles_remaining)); if (!ctx.conf.enable_cycle_counting) {
return;
}
// Use the immediate form when oaknut accepts the value as an add/sub immediate ...
if (oaknut::AddSubImm::is_valid(cycles_to_add)) { if (oaknut::AddSubImm::is_valid(cycles_to_add)) {
code.SUBS(Xscratch0, Xscratch0, cycles_to_add); code.SUB(Xticks, Xticks, cycles_to_add);
} else { } else {
// ... otherwise materialise the value in Xscratch1 and use the register form.
code.MOV(Xscratch1, cycles_to_add); code.MOV(Xscratch1, cycles_to_add);
code.SUBS(Xscratch0, Xscratch0, Xscratch1); code.SUB(Xticks, Xticks, Xscratch1);
} }
// Write-back of the stack-resident counter; present in the left column only.
code.STR(Xscratch0, SP, offsetof(StackLayout, cycles_remaining));
} }
EmittedBlockInfo EmitArm64(oaknut::CodeGenerator& code, IR::Block block, const EmitConfig& conf) { EmittedBlockInfo EmitArm64(oaknut::CodeGenerator& code, IR::Block block, const EmitConfig& conf) {
@ -161,9 +163,7 @@ EmittedBlockInfo EmitArm64(oaknut::CodeGenerator& code, IR::Block block, const E
ASSERT(ctx.block.HasConditionFailedLocation()); ASSERT(ctx.block.HasConditionFailedLocation());
oaknut::Label pass = EmitA32Cond(code, ctx, ctx.block.GetCondition()); oaknut::Label pass = EmitA32Cond(code, ctx, ctx.block.GetCondition());
if (conf.enable_cycle_counting) { EmitAddCycles(code, ctx, ctx.block.ConditionFailedCycleCount());
EmitAddCycles(code, ctx, ctx.block.ConditionFailedCycleCount());
}
EmitA32ConditionFailedTerminal(code, ctx); EmitA32ConditionFailedTerminal(code, ctx);
code.l(pass); code.l(pass);
} }
@ -201,10 +201,7 @@ EmittedBlockInfo EmitArm64(oaknut::CodeGenerator& code, IR::Block block, const E
reg_alloc.AssertNoMoreUses(); reg_alloc.AssertNoMoreUses();
if (ctx.conf.enable_cycle_counting) { EmitAddCycles(code, ctx, block.CycleCount());
EmitAddCycles(code, ctx, block.CycleCount());
}
EmitA32Terminal(code, ctx); EmitA32Terminal(code, ctx);
ebi.size = code.ptr<CodePtr>() - ebi.entry_point; ebi.size = code.ptr<CodePtr>() - ebi.entry_point;

View file

@ -39,6 +39,7 @@ namespace Dynarmic::Backend::Arm64 {
using CodePtr = std::byte*; using CodePtr = std::byte*;
enum class LinkTarget { enum class LinkTarget {
ReturnToDispatcher,
ReturnFromRunCode, ReturnFromRunCode,
ReadMemory8, ReadMemory8,
ReadMemory16, ReadMemory16,

View file

@ -38,7 +38,7 @@ void EmitA32Terminal(oaknut::CodeGenerator&, EmitContext&, IR::Term::Interpret,
} }
// Terminal emitter for IR::Term::ReturnToDispatch: emits a single relocation and nothing else.
// The terminal value, location descriptor and bool parameters are unnamed and unused.
// Left column links to LinkTarget::ReturnFromRunCode; right column links to
// LinkTarget::ReturnToDispatcher, which A32AddressSpace::Link patches into a branch to
// prelude_info.return_to_dispatcher.
void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::ReturnToDispatch, IR::LocationDescriptor, bool) { void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::ReturnToDispatch, IR::LocationDescriptor, bool) {
EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode); EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
} }
void EmitSetUpperLocationDescriptor(oaknut::CodeGenerator& code, EmitContext& ctx, IR::LocationDescriptor new_location, IR::LocationDescriptor old_location) { void EmitSetUpperLocationDescriptor(oaknut::CodeGenerator& code, EmitContext& ctx, IR::LocationDescriptor new_location, IR::LocationDescriptor old_location) {
@ -63,7 +63,7 @@ void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Li
code.MOV(Wscratch0, A32::LocationDescriptor{terminal.next}.PC()); code.MOV(Wscratch0, A32::LocationDescriptor{terminal.next}.PC());
code.STR(Wscratch0, Xstate, offsetof(A32JitState, regs) + sizeof(u32) * 15); code.STR(Wscratch0, Xstate, offsetof(A32JitState, regs) + sizeof(u32) * 15);
EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode); EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
// TODO: Implement LinkBlock optimization // TODO: Implement LinkBlock optimization
} }
@ -73,19 +73,19 @@ void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Li
code.MOV(Wscratch0, A32::LocationDescriptor{terminal.next}.PC()); code.MOV(Wscratch0, A32::LocationDescriptor{terminal.next}.PC());
code.STR(Wscratch0, Xstate, offsetof(A32JitState, regs) + sizeof(u32) * 15); code.STR(Wscratch0, Xstate, offsetof(A32JitState, regs) + sizeof(u32) * 15);
EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode); EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
// TODO: Implement LinkBlockFast optimization // TODO: Implement LinkBlockFast optimization
} }
// Terminal emitter for IR::Term::PopRSBHint. The hint itself is not implemented (see the TODO):
// the only code emitted is one relocation. The terminal value, location descriptor and bool
// parameters are unnamed and unused.
// Left column links to LinkTarget::ReturnFromRunCode; right column to LinkTarget::ReturnToDispatcher.
void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::PopRSBHint, IR::LocationDescriptor, bool) { void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::PopRSBHint, IR::LocationDescriptor, bool) {
EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode); EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
// TODO: Implement PopRSBHint optimization // TODO: Implement PopRSBHint optimization
} }
// Terminal emitter for IR::Term::FastDispatchHint. The hint itself is not implemented (see the
// TODO): the only code emitted is one relocation. The terminal value, location descriptor and
// bool parameters are unnamed and unused.
// Left column links to LinkTarget::ReturnFromRunCode; right column to LinkTarget::ReturnToDispatcher.
void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::FastDispatchHint, IR::LocationDescriptor, bool) { void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::FastDispatchHint, IR::LocationDescriptor, bool) {
EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode); EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
// TODO: Implement FastDispatchHint optimization // TODO: Implement FastDispatchHint optimization
} }
@ -112,7 +112,7 @@ void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Ch
code.CBNZ(Wscratch0, fail); code.CBNZ(Wscratch0, fail);
EmitA32Terminal(code, ctx, terminal.else_, initial_location, is_single_step); EmitA32Terminal(code, ctx, terminal.else_, initial_location, is_single_step);
code.l(fail); code.l(fail);
EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode); EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
} }
void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step) { void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
@ -508,11 +508,9 @@ void EmitIR<IR::Opcode::A32CallSupervisor>(oaknut::CodeGenerator& code, EmitCont
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.PrepareForCall(nullptr); ctx.reg_alloc.PrepareForCall(nullptr);
static_assert(offsetof(StackLayout, cycles_remaining) + sizeof(u64) == offsetof(StackLayout, cycles_to_run));
if (ctx.conf.enable_cycle_counting) { if (ctx.conf.enable_cycle_counting) {
code.LDP(Xscratch0, Xscratch1, SP, offsetof(StackLayout, cycles_remaining)); code.LDR(Xscratch0, SP, offsetof(StackLayout, cycles_to_run));
code.SUB(Xscratch0, Xscratch1, Xscratch0); code.SUB(Xscratch0, Xscratch0, Xticks);
EmitRelocation(code, ctx, LinkTarget::AddTicks); EmitRelocation(code, ctx, LinkTarget::AddTicks);
} }
@ -521,7 +519,8 @@ void EmitIR<IR::Opcode::A32CallSupervisor>(oaknut::CodeGenerator& code, EmitCont
if (ctx.conf.enable_cycle_counting) { if (ctx.conf.enable_cycle_counting) {
EmitRelocation(code, ctx, LinkTarget::GetTicksRemaining); EmitRelocation(code, ctx, LinkTarget::GetTicksRemaining);
code.STP(X0, X0, SP, offsetof(StackLayout, cycles_remaining)); code.STR(X0, SP, offsetof(StackLayout, cycles_to_run));
code.MOV(Xticks, X0);
} }
} }
@ -530,11 +529,9 @@ void EmitIR<IR::Opcode::A32ExceptionRaised>(oaknut::CodeGenerator& code, EmitCon
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.PrepareForCall(nullptr); ctx.reg_alloc.PrepareForCall(nullptr);
static_assert(offsetof(StackLayout, cycles_remaining) + sizeof(u64) == offsetof(StackLayout, cycles_to_run));
if (ctx.conf.enable_cycle_counting) { if (ctx.conf.enable_cycle_counting) {
code.LDP(Xscratch0, Xscratch1, SP, offsetof(StackLayout, cycles_remaining)); code.LDR(Xscratch0, SP, offsetof(StackLayout, cycles_to_run));
code.SUB(Xscratch0, Xscratch1, Xscratch0); code.SUB(Xscratch0, Xscratch0, Xticks);
EmitRelocation(code, ctx, LinkTarget::AddTicks); EmitRelocation(code, ctx, LinkTarget::AddTicks);
} }
@ -544,7 +541,8 @@ void EmitIR<IR::Opcode::A32ExceptionRaised>(oaknut::CodeGenerator& code, EmitCon
if (ctx.conf.enable_cycle_counting) { if (ctx.conf.enable_cycle_counting) {
EmitRelocation(code, ctx, LinkTarget::GetTicksRemaining); EmitRelocation(code, ctx, LinkTarget::GetTicksRemaining);
code.STP(X0, X0, SP, offsetof(StackLayout, cycles_remaining)); code.STR(X0, SP, offsetof(StackLayout, cycles_to_run));
code.MOV(Xticks, X0);
} }
} }

View file

@ -19,11 +19,10 @@ constexpr size_t SpillCount = 64;
#endif #endif
struct alignas(16) StackLayout { struct alignas(16) StackLayout {
s64 cycles_remaining;
s64 cycles_to_run;
std::array<std::array<u64, 2>, SpillCount> spill; std::array<std::array<u64, 2>, SpillCount> spill;
s64 cycles_to_run;
u32 save_host_fpcr; u32 save_host_fpcr;
bool check_bit; bool check_bit;