backend/arm64: Implement cycle counting

This commit is contained in:
Merry 2022-11-06 01:10:29 +00:00
parent b5ad066372
commit e476fad5a2
7 changed files with 138 additions and 81 deletions

View file

@ -152,55 +152,6 @@ void A32AddressSpace::EmitPrelude() {
mem.unprotect();
prelude_info.run_code = code.ptr<PreludeInfo::RunCodeFuncType>();
ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout));
code.MOV(Xstate, X1);
code.MOV(Xhalt, X2);
code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor));
code.AND(Wscratch0, Wscratch0, 0xffff0000);
code.MRS(Xscratch1, oaknut::SystemReg::FPCR);
code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr));
code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
code.BR(X0);
prelude_info.step_code = code.ptr<PreludeInfo::RunCodeFuncType>();
ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout));
code.MOV(Xstate, X1);
code.MOV(Xhalt, X2);
code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor));
code.AND(Wscratch0, Wscratch0, 0xffff0000);
code.MRS(Xscratch1, oaknut::SystemReg::FPCR);
code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr));
code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
oaknut::Label step_hr_loop;
code.l(step_hr_loop);
code.LDAXR(Wscratch0, Xhalt);
code.ORR(Wscratch0, Wscratch0, static_cast<u32>(HaltReason::Step));
code.STLXR(Wscratch1, Wscratch0, Xhalt);
code.CBNZ(Wscratch1, step_hr_loop);
code.BR(X0);
prelude_info.return_from_run_code = code.ptr<void*>();
code.LDR(Wscratch0, SP, offsetof(StackLayout, save_host_fpcr));
code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
oaknut::Label exit_hr_loop;
code.l(exit_hr_loop);
code.LDAXR(W0, Xhalt);
code.STLXR(Wscratch0, WZR, Xhalt);
code.CBNZ(Wscratch0, exit_hr_loop);
ABI_PopRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout));
code.RET();
prelude_info.read_memory_8 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead8>(code, conf.callbacks);
prelude_info.read_memory_16 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead16>(code, conf.callbacks);
prelude_info.read_memory_32 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead32>(code, conf.callbacks);
@ -223,6 +174,112 @@ void A32AddressSpace::EmitPrelude() {
prelude_info.add_ticks = EmitCallTrampoline<&A32::UserCallbacks::AddTicks>(code, conf.callbacks);
prelude_info.get_ticks_remaining = EmitCallTrampoline<&A32::UserCallbacks::GetTicksRemaining>(code, conf.callbacks);
oaknut::Label return_from_run_code;
// run_code thunk: the host-callable entry point used to start executing emitted code.
// Its C++ type is RunCodeFuncType, i.e. (entry_point, context, halt_reason); under the
// AArch64 procedure call standard those arrive in X0, X1 and X2 respectively.
prelude_info.run_code = code.ptr<PreludeInfo::RunCodeFuncType>();
{
// Save the callee-saved register set plus register 30. sizeof(StackLayout) is passed along;
// presumably this reserves a StackLayout frame at SP, since the code below addresses
// StackLayout fields relative to SP -- confirm against ABI_PushRegisters.
ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout));
// Keep the entry point in X19: when cycle counting is enabled, X0 is reused below for the
// result of the get_ticks_remaining call.
code.MOV(X19, X0);
// Pin the context pointer and the halt_reason pointer in their fixed registers.
code.MOV(Xstate, X1);
code.MOV(Xhalt, X2);
if (conf.enable_cycle_counting) {
// Ask the GetTicksRemaining trampoline for the tick budget, keep it live in Xticks and
// record the same value as cycles_to_run so the consumed amount can be computed on exit.
code.BL(prelude_info.get_ticks_remaining);
code.MOV(Xticks, X0);
code.STR(Xticks, SP, offsetof(StackLayout, cycles_to_run));
}
// Build the FPCR value to run with: only the upper 16 bits of upper_location_descriptor are kept.
code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor));
code.AND(Wscratch0, Wscratch0, 0xffff0000);
// Save the current (host) FPCR into the stack frame, then install the value built above.
code.MRS(Xscratch1, oaknut::SystemReg::FPCR);
code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr));
code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
// Jump (no link) to the entry point that was passed in X0.
code.BR(X19);
}
// step_code thunk: same signature and setup as run_code, but it uses a fixed tick budget of 1
// and sets HaltReason::Step in the halt word before entering the emitted code.
prelude_info.step_code = code.ptr<PreludeInfo::RunCodeFuncType>();
{
// Save the callee-saved register set plus register 30; sizeof(StackLayout) is passed along
// (presumably to reserve the StackLayout frame addressed via SP below -- confirm).
ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout));
// X0 = entry point, X1 = context, X2 = halt_reason pointer (see RunCodeFuncType).
code.MOV(X19, X0);
code.MOV(Xstate, X1);
code.MOV(Xhalt, X2);
if (conf.enable_cycle_counting) {
// Unlike run_code, the callback is not queried here: the budget is the constant 1,
// stored both in Xticks and in cycles_to_run.
code.MOV(Xticks, 1);
code.STR(Xticks, SP, offsetof(StackLayout, cycles_to_run));
}
// Build the FPCR value to run with: only the upper 16 bits of upper_location_descriptor are kept.
code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor));
code.AND(Wscratch0, Wscratch0, 0xffff0000);
// Save the current (host) FPCR into the stack frame, then install the value built above.
code.MRS(Xscratch1, oaknut::SystemReg::FPCR);
code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr));
code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
// Atomically OR HaltReason::Step into the word at Xhalt: exclusive load, modify,
// exclusive store, and retry while the store reports failure (non-zero status in Wscratch1).
oaknut::Label step_hr_loop;
code.l(step_hr_loop);
code.LDAXR(Wscratch0, Xhalt);
code.ORR(Wscratch0, Wscratch0, static_cast<u32>(HaltReason::Step));
code.STLXR(Wscratch1, Wscratch0, Xhalt);
code.CBNZ(Wscratch1, step_hr_loop);
// Jump (no link) to the entry point that was passed in X0.
code.BR(X19);
}
// return_to_dispatcher thunk: target of LinkTarget::ReturnToDispatcher branches.
// It either leaves via return_from_run_code (halt requested or tick budget used up) or looks up
// the code for the current location descriptor and jumps straight to it.
prelude_info.return_to_dispatcher = code.ptr<void*>();
{
// Labels for the two 64-bit literals emitted at the end of this block.
oaknut::Label l_this, l_addr;
// Any non-zero halt word means execution must stop.
code.LDAR(Wscratch0, Xhalt);
code.CBNZ(Wscratch0, return_from_run_code);
if (conf.enable_cycle_counting) {
// Signed comparison: stop once the remaining tick count is zero or negative.
code.CMP(Xticks, 0);
code.B(LE, return_from_run_code);
}
// Call fn(*this, *Xstate): X0 = this (from the literal), X1 = context pointer,
// function address loaded from the second literal.
code.LDR(X0, l_this);
code.MOV(X1, Xstate);
code.LDR(Xscratch0, l_addr);
code.BLR(Xscratch0);
// fn returns a CodePtr in X0; continue execution there.
code.BR(X0);
// Host-side helper invoked by the BLR above (this lambda is ordinary C++, not emitted code).
const auto fn = [](A32AddressSpace& self, A32JitState& context) -> CodePtr {
return self.GetOrEmit(context.GetLocationDescriptor());
};
// Literal data, 8-byte aligned. It sits after the unconditional BR, so control never
// falls through into it.
code.align(8);
code.l(l_this);
code.dx(mcl::bit_cast<u64>(this));
code.l(l_addr);
code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn)));
}
// return_from_run_code thunk: common exit path back to the host caller of run_code/step_code.
// Target of LinkTarget::ReturnFromRunCode and of the early-outs in return_to_dispatcher.
prelude_info.return_from_run_code = code.ptr<void*>();
{
// Bind the label that return_to_dispatcher branches to.
code.l(return_from_run_code);
if (conf.enable_cycle_counting) {
// X1 = cycles_to_run - Xticks, i.e. the budget recorded at entry minus what is still
// remaining, then call the AddTicks trampoline.
// NOTE(review): X1 is presumably the tick-count argument expected by the trampoline;
// EmitCallTrampoline is outside this view -- confirm.
code.LDR(X1, SP, offsetof(StackLayout, cycles_to_run));
code.SUB(X1, X1, Xticks);
code.BL(prelude_info.add_ticks);
}
// Restore the FPCR value that was saved in the stack frame on entry.
code.LDR(Wscratch0, SP, offsetof(StackLayout, save_host_fpcr));
code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
// Atomically read the halt word into W0 and replace it with zero; retry while the exclusive
// store reports failure. W0 is left holding the value read, which is what RET hands back
// (RunCodeFuncType returns HaltReason).
oaknut::Label exit_hr_loop;
code.l(exit_hr_loop);
code.LDAXR(W0, Xhalt);
code.STLXR(Wscratch0, WZR, Xhalt);
code.CBNZ(Wscratch0, exit_hr_loop);
// Mirror of the ABI_PushRegisters call in run_code/step_code, then return to the host.
ABI_PopRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout));
code.RET();
}
prelude_info.end_of_prelude = code.ptr<u32*>();
mem.invalidate_all();
@ -267,6 +324,9 @@ void A32AddressSpace::Link(EmittedBlockInfo& block_info) {
CodeGenerator c{reinterpret_cast<u32*>(block_info.entry_point + ptr_offset)};
switch (target) {
case LinkTarget::ReturnToDispatcher:
c.B(prelude_info.return_to_dispatcher);
break;
case LinkTarget::ReturnFromRunCode:
c.B(prelude_info.return_from_run_code);
break;

View file

@ -55,6 +55,7 @@ private:
using RunCodeFuncType = HaltReason (*)(CodePtr entry_point, A32JitState* context, volatile u32* halt_reason);
RunCodeFuncType run_code;
RunCodeFuncType step_code;
void* return_to_dispatcher;
void* return_from_run_code;
void* read_memory_8;

View file

@ -14,6 +14,7 @@ namespace Dynarmic::Backend::Arm64 {
// Registers with a fixed role in emitted code.
// X28: loaded from the `context` argument (A32JitState*) by the run_code/step_code prelude.
constexpr oaknut::XReg Xstate{28};
// X27: loaded from the `halt_reason` pointer argument by the prelude.
constexpr oaknut::XReg Xhalt{27};
// X26: running count of remaining ticks when cycle counting is enabled.
constexpr oaknut::XReg Xticks{26};
// X16/X17 and their 32-bit views, used as scratch registers by the emitters.
constexpr oaknut::XReg Xscratch0{16}, Xscratch1{17};
constexpr oaknut::WReg Wscratch0{16}, Wscratch1{17};
@ -40,7 +41,7 @@ constexpr auto Rscratch1() {
}
}
constexpr std::initializer_list<int> GPR_ORDER{19, 20, 21, 22, 23, 24, 25, 26, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8};
constexpr std::initializer_list<int> GPR_ORDER{19, 20, 21, 22, 23, 24, 25, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8};
constexpr std::initializer_list<int> FPR_ORDER{8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
using RegisterList = u64;

View file

@ -135,15 +135,17 @@ void EmitIR<IR::Opcode::NZCVFromPackedFlags>(oaknut::CodeGenerator&, EmitContext
ctx.reg_alloc.DefineAsExisting(inst, args[0]);
}
static void EmitAddCycles(oaknut::CodeGenerator& code, EmitContext&, size_t cycles_to_add) {
code.LDR(Xscratch0, SP, offsetof(StackLayout, cycles_remaining));
static void EmitAddCycles(oaknut::CodeGenerator& code, EmitContext& ctx, size_t cycles_to_add) {
if (!ctx.conf.enable_cycle_counting) {
return;
}
if (oaknut::AddSubImm::is_valid(cycles_to_add)) {
code.SUBS(Xscratch0, Xscratch0, cycles_to_add);
code.SUB(Xticks, Xticks, cycles_to_add);
} else {
code.MOV(Xscratch1, cycles_to_add);
code.SUBS(Xscratch0, Xscratch0, Xscratch1);
code.SUB(Xticks, Xticks, Xscratch1);
}
code.STR(Xscratch0, SP, offsetof(StackLayout, cycles_remaining));
}
EmittedBlockInfo EmitArm64(oaknut::CodeGenerator& code, IR::Block block, const EmitConfig& conf) {
@ -161,9 +163,7 @@ EmittedBlockInfo EmitArm64(oaknut::CodeGenerator& code, IR::Block block, const E
ASSERT(ctx.block.HasConditionFailedLocation());
oaknut::Label pass = EmitA32Cond(code, ctx, ctx.block.GetCondition());
if (conf.enable_cycle_counting) {
EmitAddCycles(code, ctx, ctx.block.ConditionFailedCycleCount());
}
EmitAddCycles(code, ctx, ctx.block.ConditionFailedCycleCount());
EmitA32ConditionFailedTerminal(code, ctx);
code.l(pass);
}
@ -201,10 +201,7 @@ EmittedBlockInfo EmitArm64(oaknut::CodeGenerator& code, IR::Block block, const E
reg_alloc.AssertNoMoreUses();
if (ctx.conf.enable_cycle_counting) {
EmitAddCycles(code, ctx, block.CycleCount());
}
EmitAddCycles(code, ctx, block.CycleCount());
EmitA32Terminal(code, ctx);
ebi.size = code.ptr<CodePtr>() - ebi.entry_point;

View file

@ -39,6 +39,7 @@ namespace Dynarmic::Backend::Arm64 {
using CodePtr = std::byte*;
enum class LinkTarget {
ReturnToDispatcher,
ReturnFromRunCode,
ReadMemory8,
ReadMemory16,

View file

@ -38,7 +38,7 @@ void EmitA32Terminal(oaknut::CodeGenerator&, EmitContext&, IR::Term::Interpret,
}
void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::ReturnToDispatch, IR::LocationDescriptor, bool) {
EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode);
EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
}
void EmitSetUpperLocationDescriptor(oaknut::CodeGenerator& code, EmitContext& ctx, IR::LocationDescriptor new_location, IR::LocationDescriptor old_location) {
@ -63,7 +63,7 @@ void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Li
code.MOV(Wscratch0, A32::LocationDescriptor{terminal.next}.PC());
code.STR(Wscratch0, Xstate, offsetof(A32JitState, regs) + sizeof(u32) * 15);
EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode);
EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
// TODO: Implement LinkBlock optimization
}
@ -73,19 +73,19 @@ void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Li
code.MOV(Wscratch0, A32::LocationDescriptor{terminal.next}.PC());
code.STR(Wscratch0, Xstate, offsetof(A32JitState, regs) + sizeof(u32) * 15);
EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode);
EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
// TODO: Implement LinkBlockFast optimization
}
void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::PopRSBHint, IR::LocationDescriptor, bool) {
EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode);
EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
// TODO: Implement PopRSBHint optimization
}
void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::FastDispatchHint, IR::LocationDescriptor, bool) {
EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode);
EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
// TODO: Implement FastDispatchHint optimization
}
@ -112,7 +112,7 @@ void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Ch
code.CBNZ(Wscratch0, fail);
EmitA32Terminal(code, ctx, terminal.else_, initial_location, is_single_step);
code.l(fail);
EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode);
EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
}
void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
@ -508,11 +508,9 @@ void EmitIR<IR::Opcode::A32CallSupervisor>(oaknut::CodeGenerator& code, EmitCont
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.PrepareForCall(nullptr);
static_assert(offsetof(StackLayout, cycles_remaining) + sizeof(u64) == offsetof(StackLayout, cycles_to_run));
if (ctx.conf.enable_cycle_counting) {
code.LDP(Xscratch0, Xscratch1, SP, offsetof(StackLayout, cycles_remaining));
code.SUB(Xscratch0, Xscratch1, Xscratch0);
code.LDR(Xscratch0, SP, offsetof(StackLayout, cycles_to_run));
code.SUB(Xscratch0, Xscratch0, Xticks);
EmitRelocation(code, ctx, LinkTarget::AddTicks);
}
@ -521,7 +519,8 @@ void EmitIR<IR::Opcode::A32CallSupervisor>(oaknut::CodeGenerator& code, EmitCont
if (ctx.conf.enable_cycle_counting) {
EmitRelocation(code, ctx, LinkTarget::GetTicksRemaining);
code.STP(X0, X0, SP, offsetof(StackLayout, cycles_remaining));
code.STR(X0, SP, offsetof(StackLayout, cycles_to_run));
code.MOV(Xticks, X0);
}
}
@ -530,11 +529,9 @@ void EmitIR<IR::Opcode::A32ExceptionRaised>(oaknut::CodeGenerator& code, EmitCon
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.PrepareForCall(nullptr);
static_assert(offsetof(StackLayout, cycles_remaining) + sizeof(u64) == offsetof(StackLayout, cycles_to_run));
if (ctx.conf.enable_cycle_counting) {
code.LDP(Xscratch0, Xscratch1, SP, offsetof(StackLayout, cycles_remaining));
code.SUB(Xscratch0, Xscratch1, Xscratch0);
code.LDR(Xscratch0, SP, offsetof(StackLayout, cycles_to_run));
code.SUB(Xscratch0, Xscratch0, Xticks);
EmitRelocation(code, ctx, LinkTarget::AddTicks);
}
@ -544,7 +541,8 @@ void EmitIR<IR::Opcode::A32ExceptionRaised>(oaknut::CodeGenerator& code, EmitCon
if (ctx.conf.enable_cycle_counting) {
EmitRelocation(code, ctx, LinkTarget::GetTicksRemaining);
code.STP(X0, X0, SP, offsetof(StackLayout, cycles_remaining));
code.STR(X0, SP, offsetof(StackLayout, cycles_to_run));
code.MOV(Xticks, X0);
}
}

View file

@ -19,11 +19,10 @@ constexpr size_t SpillCount = 64;
#endif
struct alignas(16) StackLayout {
s64 cycles_remaining;
s64 cycles_to_run;
std::array<std::array<u64, 2>, SpillCount> spill;
s64 cycles_to_run;
u32 save_host_fpcr;
bool check_bit;