A64: Implement FastDispatchHint
This commit is contained in:
parent
f96c43d422
commit
9b65100660
5 changed files with 92 additions and 23 deletions
|
@ -163,6 +163,9 @@ struct UserConfig {
|
|||
/// definite behaviour for some unpredictable instructions.
|
||||
bool define_unpredictable_behaviour = false;
|
||||
|
||||
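/// This enables the fast dispatcher: FastDispatchHint terminals look up their target block in a lookup table instead of returning to the dispatcher.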
|
||||
bool enable_fast_dispatch = true;
|
||||
|
||||
// The below options relate to accuracy of floating-point emulation.
|
||||
|
||||
/// Determines how accurate NaN handling is.
|
||||
|
|
|
@ -64,11 +64,12 @@ bool A64EmitContext::AccurateNaN() const {
|
|||
}
|
||||
|
||||
A64EmitX64::A64EmitX64(BlockOfCode& code, A64::UserConfig conf, A64::Jit* jit_interface)
|
||||
: EmitX64(code), conf(conf), jit_interface{jit_interface}
|
||||
{
|
||||
: EmitX64(code), conf(conf), jit_interface{jit_interface} {
|
||||
GenMemory128Accessors();
|
||||
GenFastmemFallbacks();
|
||||
GenTerminalHandlers();
|
||||
code.PreludeComplete();
|
||||
ClearFastDispatchTable();
|
||||
}
|
||||
|
||||
// Out-of-class defaulted destructor.
A64EmitX64::~A64EmitX64() = default;
|
||||
|
@ -134,10 +135,16 @@ A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) {
|
|||
/// Clears the base emitter's cache, the block range bookkeeping, and the
/// fast dispatch table.
void A64EmitX64::ClearCache() {
    EmitX64::ClearCache();
    block_ranges.ClearCache();
    // Fast dispatch entries store raw code pointers that the fast dispatch
    // handler jumps to directly, so they are reset together with the cache.
    ClearFastDispatchTable();
}
|
||||
|
||||
/// Invalidates every basic block that block_ranges reports for the given
/// address ranges.
///
/// @param ranges Set of u64 address intervals to invalidate.
void A64EmitX64::InvalidateCacheRanges(const boost::icl::interval_set<u64>& ranges) {
    InvalidateBasicBlocks(block_ranges.InvalidateRanges(ranges));
    // The whole table is reset rather than only the affected entries.
    ClearFastDispatchTable();
}
|
||||
|
||||
/// Resets every fast dispatch entry to {all-ones descriptor, null code pointer}.
void A64EmitX64::ClearFastDispatchTable() {
    // NOTE(review): the all-ones descriptor is presumably a value that no real
    // location descriptor can take, so a cleared entry never hits - confirm
    // against A64::LocationDescriptor::UniqueHash.
    fast_dispatch_table.fill({0xFFFFFFFFFFFFFFFFull, nullptr});
}
|
||||
|
||||
void A64EmitX64::GenMemory128Accessors() {
|
||||
|
@ -290,6 +297,62 @@ void A64EmitX64::GenFastmemFallbacks() {
|
|||
}
|
||||
}
|
||||
|
||||
void A64EmitX64::GenTerminalHandlers() {
|
||||
// PC ends up in rbp, location_descriptor ends up in rbx
|
||||
const auto calculate_location_descriptor = [this] {
|
||||
// This calculation has to match up with A64::LocationDescriptor::UniqueHash
|
||||
// TODO: Optimization is available here based on known state of FPSCR_mode and CPSR_et.
|
||||
code.mov(rbp, qword[r15 + offsetof(A64JitState, pc)]);
|
||||
code.mov(rcx, A64::LocationDescriptor::PC_MASK);
|
||||
code.and_(rcx, rbp);
|
||||
code.mov(ebx, dword[r15 + offsetof(A64JitState, fpcr)]);
|
||||
code.and_(ebx, A64::LocationDescriptor::FPCR_MASK);
|
||||
code.shl(ebx, 37);
|
||||
code.or_(rbx, rcx);
|
||||
};
|
||||
|
||||
Xbyak::Label fast_dispatch_cache_miss, rsb_cache_miss;
|
||||
|
||||
code.align();
|
||||
terminal_handler_pop_rsb_hint = code.getCurr<const void*>();
|
||||
calculate_location_descriptor();
|
||||
code.mov(eax, dword[r15 + offsetof(A64JitState, rsb_ptr)]);
|
||||
code.sub(eax, 1);
|
||||
code.and_(eax, u32(A64JitState::RSBPtrMask));
|
||||
code.mov(dword[r15 + offsetof(A64JitState, rsb_ptr)], eax);
|
||||
code.cmp(rbx, qword[r15 + offsetof(A64JitState, rsb_location_descriptors) + rax * sizeof(u64)]);
|
||||
if (conf.enable_fast_dispatch) {
|
||||
code.jne(rsb_cache_miss);
|
||||
} else {
|
||||
code.jne(code.GetReturnFromRunCodeAddress());
|
||||
}
|
||||
code.mov(rax, qword[r15 + offsetof(A64JitState, rsb_codeptrs) + rax * sizeof(u64)]);
|
||||
code.jmp(rax);
|
||||
PerfMapRegister(terminal_handler_pop_rsb_hint, code.getCurr(), "a64_terminal_handler_pop_rsb_hint");
|
||||
|
||||
if (conf.enable_fast_dispatch) {
|
||||
code.align();
|
||||
terminal_handler_fast_dispatch_hint = code.getCurr<const void*>();
|
||||
calculate_location_descriptor();
|
||||
code.L(rsb_cache_miss);
|
||||
code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data()));
|
||||
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE42)) {
|
||||
code.crc32(rbp, r12d);
|
||||
}
|
||||
code.and_(ebp, fast_dispatch_table_mask);
|
||||
code.lea(rbp, ptr[r12 + rbp]);
|
||||
code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]);
|
||||
code.jne(fast_dispatch_cache_miss);
|
||||
code.jmp(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)]);
|
||||
code.L(fast_dispatch_cache_miss);
|
||||
code.mov(qword[rbp + offsetof(FastDispatchEntry, location_descriptor)], rbx);
|
||||
code.LookupBlock();
|
||||
code.mov(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)], rax);
|
||||
code.jmp(rax);
|
||||
PerfMapRegister(terminal_handler_fast_dispatch_hint, code.getCurr(), "a64_terminal_handler_fast_dispatch_hint");
|
||||
}
|
||||
}
|
||||
|
||||
void A64EmitX64::EmitA64SetCheckBit(A64EmitContext& ctx, IR::Inst* inst) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8();
|
||||
|
@ -1051,27 +1114,15 @@ void A64EmitX64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::Location
|
|||
}
|
||||
|
||||
/// Emits the PopRSBHint terminal: a jump to the shared RSB handler whose
/// address is stored in terminal_handler_pop_rsb_hint, so the lookup sequence
/// is not duplicated into every block.
void A64EmitX64::EmitTerminalImpl(IR::Term::PopRSBHint, IR::LocationDescriptor) {
    code.jmp(terminal_handler_pop_rsb_hint);
}
|
||||
|
||||
/// Emits the FastDispatchHint terminal.
///
/// With conf.enable_fast_dispatch set this jumps to the shared fast dispatch
/// handler; otherwise terminal_handler_fast_dispatch_hint was never generated
/// and the block returns from run code instead.
void A64EmitX64::EmitTerminalImpl(IR::Term::FastDispatchHint, IR::LocationDescriptor) {
    if (conf.enable_fast_dispatch) {
        code.jmp(terminal_handler_fast_dispatch_hint);
    } else {
        code.ReturnFromRunCode();
    }
}
|
||||
|
||||
void A64EmitX64::EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location) {
|
||||
|
|
|
@ -53,6 +53,16 @@ protected:
|
|||
A64::Jit* jit_interface;
|
||||
BlockRangeInformation<u64> block_ranges;
|
||||
|
||||
struct FastDispatchEntry {
|
||||
u64 location_descriptor;
|
||||
const void* code_ptr;
|
||||
};
|
||||
static_assert(sizeof(FastDispatchEntry) == 0x10);
|
||||
static constexpr u64 fast_dispatch_table_mask = 0xFFFFF0;
|
||||
static constexpr size_t fast_dispatch_table_size = 0x100000;
|
||||
std::array<FastDispatchEntry, fast_dispatch_table_size> fast_dispatch_table;
|
||||
void ClearFastDispatchTable();
|
||||
|
||||
void (*memory_read_128)();
|
||||
void (*memory_write_128)();
|
||||
void GenMemory128Accessors();
|
||||
|
@ -61,6 +71,10 @@ protected:
|
|||
std::map<std::tuple<size_t, int, int>, void(*)()> write_fallbacks;
|
||||
void GenFastmemFallbacks();
|
||||
|
||||
/// Entry points of the shared terminal handlers, set by GenTerminalHandlers().
/// Both start out null; the fast dispatch one stays null when fast dispatch is
/// disabled.
const void* terminal_handler_pop_rsb_hint = nullptr;
const void* terminal_handler_fast_dispatch_hint = nullptr;
/// Emits the shared terminal handlers and records their entry points.
void GenTerminalHandlers();
|
||||
|
||||
void EmitDirectPageTableMemoryRead(A64EmitContext& ctx, IR::Inst* inst, size_t bitsize);
|
||||
void EmitDirectPageTableMemoryWrite(A64EmitContext& ctx, IR::Inst* inst, size_t bitsize);
|
||||
void EmitExclusiveWrite(A64EmitContext& ctx, IR::Inst* inst, size_t bitsize);
|
||||
|
|
|
@ -44,7 +44,7 @@ bool TranslatorVisitor::BLR(Reg Rn) {
|
|||
ir.PushRSB(ir.current_location->AdvancePC(4));
|
||||
|
||||
ir.SetPC(target);
|
||||
ir.SetTerm(IR::Term::ReturnToDispatch{});
|
||||
ir.SetTerm(IR::Term::FastDispatchHint{});
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -52,7 +52,7 @@ bool TranslatorVisitor::BR(Reg Rn) {
|
|||
auto target = X(64, Rn);
|
||||
|
||||
ir.SetPC(target);
|
||||
ir.SetTerm(IR::Term::ReturnToDispatch{});
|
||||
ir.SetTerm(IR::Term::FastDispatchHint{});
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
@ -150,6 +150,7 @@ static u32 GenFloatInst(u64 pc, bool is_last_inst) {
|
|||
|
||||
static Dynarmic::A64::UserConfig GetUserConfig(A64TestEnv& jit_env) {
|
||||
Dynarmic::A64::UserConfig jit_user_config{&jit_env};
|
||||
jit_user_config.enable_fast_dispatch = false;
|
||||
// The below corresponds to the settings for qemu's aarch64_max_initfn
|
||||
jit_user_config.dczid_el0 = 7;
|
||||
jit_user_config.ctr_el0 = 0x80038003;
|
||||
|
|
Loading…
Reference in a new issue