Fix 128-bit ops
This commit is contained in:
parent
ef2851d595
commit
57871c5159
5 changed files with 161 additions and 17 deletions
|
@ -96,6 +96,123 @@ static void* EmitExclusiveWriteCallTrampoline(oaknut::CodeGenerator& code, const
|
|||
return target;
|
||||
}
|
||||
|
||||
/* =========================== 128-bit versions =========================== */
|
||||
|
||||
static void* EmitRead128CallTrampoline(oaknut::CodeGenerator& code, A64::UserCallbacks* this_) {
|
||||
using namespace oaknut::util;
|
||||
|
||||
const auto info = Devirtualize<&A64::UserCallbacks::MemoryRead128>(this_);
|
||||
|
||||
oaknut::Label l_addr, l_this;
|
||||
|
||||
void* target = code.ptr<void*>();
|
||||
ABI_PushRegisters(code, (1ull << 29) | (1ull << 30), sizeof(Vector));
|
||||
code.LDR(X0, l_this);
|
||||
code.LDR(Xscratch0, l_addr);
|
||||
code.BLR(Xscratch0);
|
||||
code.STP(X0, X1, SP);
|
||||
code.LDR(Q0, SP);
|
||||
ABI_PopRegisters(code, (1ull << 29) | (1ull << 30), sizeof(Vector));
|
||||
code.RET();
|
||||
|
||||
code.align(8);
|
||||
code.l(l_this);
|
||||
code.dx(info.this_ptr);
|
||||
code.l(l_addr);
|
||||
code.dx(info.fn_ptr);
|
||||
|
||||
return target;
|
||||
}
|
||||
|
||||
static void* EmitExclusiveRead128CallTrampoline(oaknut::CodeGenerator& code, const A64::UserConfig& conf) {
|
||||
using namespace oaknut::util;
|
||||
|
||||
oaknut::Label l_addr, l_this;
|
||||
|
||||
auto fn = [](const A64::UserConfig& conf, A64::VAddr vaddr) -> Vector {
|
||||
return conf.global_monitor->ReadAndMark<Vector>(conf.processor_id, vaddr, [&]() -> Vector {
|
||||
return conf.callbacks->MemoryRead128(vaddr);
|
||||
});
|
||||
};
|
||||
|
||||
void* target = code.ptr<void*>();
|
||||
ABI_PushRegisters(code, (1ull << 29) | (1ull << 30), sizeof(Vector));
|
||||
code.LDR(X0, l_this);
|
||||
code.LDR(Xscratch0, l_addr);
|
||||
code.BLR(Xscratch0);
|
||||
code.STP(X0, X1, SP);
|
||||
code.LDR(Q0, SP);
|
||||
ABI_PopRegisters(code, (1ull << 29) | (1ull << 30), sizeof(Vector));
|
||||
code.RET();
|
||||
|
||||
code.align(8);
|
||||
code.l(l_this);
|
||||
code.dx(mcl::bit_cast<u64>(&conf));
|
||||
code.l(l_addr);
|
||||
code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn)));
|
||||
|
||||
return target;
|
||||
}
|
||||
|
||||
static void* EmitWrite128CallTrampoline(oaknut::CodeGenerator& code, A64::UserCallbacks* this_) {
|
||||
using namespace oaknut::util;
|
||||
|
||||
const auto info = Devirtualize<&A64::UserCallbacks::MemoryWrite128>(this_);
|
||||
|
||||
oaknut::Label l_addr, l_this;
|
||||
|
||||
void* target = code.ptr<void*>();
|
||||
ABI_PushRegisters(code, 0, sizeof(Vector));
|
||||
code.STR(Q0, SP);
|
||||
code.LDP(X2, X3, SP);
|
||||
ABI_PopRegisters(code, 0, sizeof(Vector));
|
||||
|
||||
code.LDR(X0, l_this);
|
||||
code.LDR(Xscratch0, l_addr);
|
||||
code.BR(Xscratch0);
|
||||
|
||||
code.align(8);
|
||||
code.l(l_this);
|
||||
code.dx(info.this_ptr);
|
||||
code.l(l_addr);
|
||||
code.dx(info.fn_ptr);
|
||||
|
||||
return target;
|
||||
}
|
||||
|
||||
static void* EmitExclusiveWrite128CallTrampoline(oaknut::CodeGenerator& code, const A64::UserConfig& conf) {
|
||||
using namespace oaknut::util;
|
||||
|
||||
oaknut::Label l_addr, l_this;
|
||||
|
||||
auto fn = [](const A64::UserConfig& conf, A64::VAddr vaddr, Vector value) -> u32 {
|
||||
return conf.global_monitor->DoExclusiveOperation<Vector>(conf.processor_id, vaddr,
|
||||
[&](Vector expected) -> bool {
|
||||
return conf.callbacks->MemoryWriteExclusive128(vaddr, value, expected);
|
||||
})
|
||||
? 0
|
||||
: 1;
|
||||
};
|
||||
|
||||
void* target = code.ptr<void*>();
|
||||
ABI_PushRegisters(code, 0, sizeof(Vector));
|
||||
code.STR(Q0, SP);
|
||||
code.LDP(X2, X3, SP);
|
||||
ABI_PopRegisters(code, 0, sizeof(Vector));
|
||||
|
||||
code.LDR(X0, l_this);
|
||||
code.LDR(Xscratch0, l_addr);
|
||||
code.BR(Xscratch0);
|
||||
|
||||
code.align(8);
|
||||
code.l(l_this);
|
||||
code.dx(mcl::bit_cast<u64>(&conf));
|
||||
code.l(l_addr);
|
||||
code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn)));
|
||||
|
||||
return target;
|
||||
}
|
||||
|
||||
A64AddressSpace::A64AddressSpace(const A64::UserConfig& conf)
|
||||
: conf(conf)
|
||||
, mem(conf.code_cache_size)
|
||||
|
@ -161,22 +278,22 @@ void A64AddressSpace::EmitPrelude() {
|
|||
prelude_info.read_memory_16 = EmitCallTrampoline<&A64::UserCallbacks::MemoryRead16>(code, conf.callbacks);
|
||||
prelude_info.read_memory_32 = EmitCallTrampoline<&A64::UserCallbacks::MemoryRead32>(code, conf.callbacks);
|
||||
prelude_info.read_memory_64 = EmitCallTrampoline<&A64::UserCallbacks::MemoryRead64>(code, conf.callbacks);
|
||||
prelude_info.read_memory_128 = EmitCallTrampoline<&A64::UserCallbacks::MemoryRead128>(code, conf.callbacks);
|
||||
prelude_info.read_memory_128 = EmitRead128CallTrampoline(code, conf.callbacks);
|
||||
prelude_info.exclusive_read_memory_8 = EmitExclusiveReadCallTrampoline<&A64::UserCallbacks::MemoryRead8, u8>(code, conf);
|
||||
prelude_info.exclusive_read_memory_16 = EmitExclusiveReadCallTrampoline<&A64::UserCallbacks::MemoryRead16, u16>(code, conf);
|
||||
prelude_info.exclusive_read_memory_32 = EmitExclusiveReadCallTrampoline<&A64::UserCallbacks::MemoryRead32, u32>(code, conf);
|
||||
prelude_info.exclusive_read_memory_64 = EmitExclusiveReadCallTrampoline<&A64::UserCallbacks::MemoryRead64, u64>(code, conf);
|
||||
prelude_info.exclusive_read_memory_128 = EmitExclusiveReadCallTrampoline<&A64::UserCallbacks::MemoryRead128, Vector>(code, conf);
|
||||
prelude_info.exclusive_read_memory_128 = EmitExclusiveRead128CallTrampoline(code, conf);
|
||||
prelude_info.write_memory_8 = EmitCallTrampoline<&A64::UserCallbacks::MemoryWrite8>(code, conf.callbacks);
|
||||
prelude_info.write_memory_16 = EmitCallTrampoline<&A64::UserCallbacks::MemoryWrite16>(code, conf.callbacks);
|
||||
prelude_info.write_memory_32 = EmitCallTrampoline<&A64::UserCallbacks::MemoryWrite32>(code, conf.callbacks);
|
||||
prelude_info.write_memory_64 = EmitCallTrampoline<&A64::UserCallbacks::MemoryWrite64>(code, conf.callbacks);
|
||||
prelude_info.write_memory_128 = EmitCallTrampoline<&A64::UserCallbacks::MemoryWrite128>(code, conf.callbacks);
|
||||
prelude_info.write_memory_128 = EmitWrite128CallTrampoline(code, conf.callbacks);
|
||||
prelude_info.exclusive_write_memory_8 = EmitExclusiveWriteCallTrampoline<&A64::UserCallbacks::MemoryWriteExclusive8, u8>(code, conf);
|
||||
prelude_info.exclusive_write_memory_16 = EmitExclusiveWriteCallTrampoline<&A64::UserCallbacks::MemoryWriteExclusive16, u16>(code, conf);
|
||||
prelude_info.exclusive_write_memory_32 = EmitExclusiveWriteCallTrampoline<&A64::UserCallbacks::MemoryWriteExclusive32, u32>(code, conf);
|
||||
prelude_info.exclusive_write_memory_64 = EmitExclusiveWriteCallTrampoline<&A64::UserCallbacks::MemoryWriteExclusive64, u64>(code, conf);
|
||||
prelude_info.exclusive_write_memory_128 = EmitExclusiveWriteCallTrampoline<&A64::UserCallbacks::MemoryWriteExclusive128, Vector>(code, conf);
|
||||
prelude_info.exclusive_write_memory_128 = EmitExclusiveWrite128CallTrampoline(code, conf);
|
||||
prelude_info.call_svc = EmitCallTrampoline<&A64::UserCallbacks::CallSVC>(code, conf.callbacks);
|
||||
prelude_info.exception_raised = EmitCallTrampoline<&A64::UserCallbacks::ExceptionRaised>(code, conf.callbacks);
|
||||
prelude_info.isb_raised = EmitCallTrampoline<&A64::UserCallbacks::InstructionSynchronizationBarrierRaised>(code, conf.callbacks);
|
||||
|
|
|
@ -55,13 +55,15 @@ static FrameInfo CalculateFrameInfo(RegisterList rl, size_t frame_size) {
|
|||
};
|
||||
}
|
||||
|
||||
#define DO_IT(TYPE, REG_TYPE, PAIR_OP, SINGLE_OP, OFFSET) \
|
||||
for (size_t i = 0; i < frame_info.TYPE##s.size() - 1; i += 2) { \
|
||||
code.PAIR_OP(oaknut::REG_TYPE{frame_info.TYPE##s[i]}, oaknut::REG_TYPE{frame_info.TYPE##s[i + 1]}, SP, (OFFSET) + i * TYPE##_size); \
|
||||
} \
|
||||
if (frame_info.TYPE##s.size() % 2 == 1) { \
|
||||
const size_t i = frame_info.TYPE##s.size() - 1; \
|
||||
code.SINGLE_OP(oaknut::REG_TYPE{frame_info.TYPE##s[i]}, SP, (OFFSET) + i * TYPE##_size); \
|
||||
#define DO_IT(TYPE, REG_TYPE, PAIR_OP, SINGLE_OP, OFFSET) \
|
||||
if (frame_info.TYPE##s.size() > 0) { \
|
||||
for (size_t i = 0; i < frame_info.TYPE##s.size() - 1; i += 2) { \
|
||||
code.PAIR_OP(oaknut::REG_TYPE{frame_info.TYPE##s[i]}, oaknut::REG_TYPE{frame_info.TYPE##s[i + 1]}, SP, (OFFSET) + i * TYPE##_size); \
|
||||
} \
|
||||
if (frame_info.TYPE##s.size() % 2 == 1) { \
|
||||
const size_t i = frame_info.TYPE##s.size() - 1; \
|
||||
code.SINGLE_OP(oaknut::REG_TYPE{frame_info.TYPE##s[i]}, SP, (OFFSET) + i * TYPE##_size); \
|
||||
} \
|
||||
}
|
||||
|
||||
void ABI_PushRegisters(oaknut::CodeGenerator& code, RegisterList rl, size_t frame_size) {
|
||||
|
|
|
@ -465,7 +465,7 @@ void EmitIR<IR::Opcode::A64SetTPIDR>(oaknut::CodeGenerator& code, EmitContext& c
|
|||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
auto Xvalue = ctx.reg_alloc.ReadX(args[0]);
|
||||
RegAlloc::Realize(Xvalue);
|
||||
code.MOV(Xscratch0, mcl::bit_cast<u64>(ctx.conf.tpidrro_el0));
|
||||
code.MOV(Xscratch0, mcl::bit_cast<u64>(ctx.conf.tpidr_el0));
|
||||
code.STR(Xvalue, Xscratch0);
|
||||
}
|
||||
|
||||
|
|
|
@ -1353,9 +1353,13 @@ void EmitIR<IR::Opcode::ZeroExtendWordToLong>(oaknut::CodeGenerator&, EmitContex
|
|||
}
|
||||
|
||||
template<>
|
||||
void EmitIR<IR::Opcode::ZeroExtendLongToQuad>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) {
|
||||
void EmitIR<IR::Opcode::ZeroExtendLongToQuad>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
ctx.reg_alloc.DefineAsExisting(inst, args[0]);
|
||||
auto Xvalue = ctx.reg_alloc.ReadX(args[0]);
|
||||
auto Qresult = ctx.reg_alloc.WriteQ(inst);
|
||||
RegAlloc::Realize(Xvalue, Qresult);
|
||||
|
||||
code.FMOV(Qresult->toD(), Xvalue);
|
||||
}
|
||||
|
||||
template<>
|
||||
|
|
|
@ -157,15 +157,36 @@ void RegAlloc::PrepareForCall(IR::Inst* result, std::optional<Argument::copyable
|
|||
}
|
||||
|
||||
const std::array<std::optional<Argument::copyable_reference>, 4> args{arg0, arg1, arg2, arg3};
|
||||
|
||||
// AAPCS64 Next General-purpose Register Number
|
||||
int ngrn = 0;
|
||||
// AAPCS64 Next SIMD and Floating-point Register Number
|
||||
int nsrn = 0;
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (args[i]) {
|
||||
ASSERT(gprs[i].IsCompletelyEmpty());
|
||||
LoadCopyInto(args[i]->get().value, oaknut::XReg{i});
|
||||
if (args[i]->get().GetType() == IR::Type::U128) {
|
||||
ASSERT(fprs[nsrn].IsCompletelyEmpty());
|
||||
LoadCopyInto(args[i]->get().value, oaknut::QReg{nsrn});
|
||||
nsrn++;
|
||||
} else {
|
||||
ASSERT(gprs[ngrn].IsCompletelyEmpty());
|
||||
LoadCopyInto(args[i]->get().value, oaknut::XReg{ngrn});
|
||||
ngrn++;
|
||||
}
|
||||
} else {
|
||||
// Gaps are assumed to be in general-purpose registers
|
||||
// TODO: should there be a separate list passed for FPRs instead?
|
||||
ngrn++;
|
||||
}
|
||||
}
|
||||
|
||||
if (result) {
|
||||
DefineAsRegister(result, X0);
|
||||
if (result->GetType() == IR::Type::U128) {
|
||||
DefineAsRegister(result, Q0);
|
||||
} else {
|
||||
DefineAsRegister(result, X0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue