IR: Implement FPVector{Min,Max}Numeric
This commit is contained in:
parent
c8cd37898b
commit
7cb9254e9f
5 changed files with 173 additions and 0 deletions
|
@ -416,6 +416,16 @@ void EmitIR<IR::Opcode::FPVectorMax64>(oaknut::CodeGenerator& code, EmitContext&
|
||||||
EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMAX(Vresult, Va, Vb); });
|
EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMAX(Vresult, Va, Vb); });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<>
|
||||||
|
void EmitIR<IR::Opcode::FPVectorMaxNumeric32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMAXNM(Vresult, Va, Vb); });
|
||||||
|
}
|
||||||
|
|
||||||
|
template<>
|
||||||
|
void EmitIR<IR::Opcode::FPVectorMaxNumeric64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMAXNM(Vresult, Va, Vb); });
|
||||||
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
void EmitIR<IR::Opcode::FPVectorMin32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
|
void EmitIR<IR::Opcode::FPVectorMin32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMIN(Vresult, Va, Vb); });
|
EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMIN(Vresult, Va, Vb); });
|
||||||
|
@ -426,6 +436,16 @@ void EmitIR<IR::Opcode::FPVectorMin64>(oaknut::CodeGenerator& code, EmitContext&
|
||||||
EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMIN(Vresult, Va, Vb); });
|
EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMIN(Vresult, Va, Vb); });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<>
|
||||||
|
void EmitIR<IR::Opcode::FPVectorMinNumeric32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMINNM(Vresult, Va, Vb); });
|
||||||
|
}
|
||||||
|
|
||||||
|
template<>
|
||||||
|
void EmitIR<IR::Opcode::FPVectorMinNumeric64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMINNM(Vresult, Va, Vb); });
|
||||||
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
void EmitIR<IR::Opcode::FPVectorMul32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
|
void EmitIR<IR::Opcode::FPVectorMul32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMUL(Vresult, Va, Vb); });
|
EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMUL(Vresult, Va, Vb); });
|
||||||
|
|
|
@ -1060,6 +1060,117 @@ static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
|
||||||
CheckInputNaN::Yes);
|
CheckInputNaN::Yes);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<size_t fsize, bool is_max>
|
||||||
|
static void EmitFPVectorMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
const bool fpcr_controlled = inst->GetArg(2).GetU1();
|
||||||
|
|
||||||
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
|
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
|
||||||
|
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||||
|
const Xbyak::Xmm intermediate_result = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
|
const Xbyak::Xmm tmp1 = xmm0;
|
||||||
|
const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
|
// NaN requirements:
|
||||||
|
// op1 op2 result
|
||||||
|
// SNaN anything op1
|
||||||
|
// !SNaN SNaN op2
|
||||||
|
// QNaN !NaN op2
|
||||||
|
// !NaN QNaN op1
|
||||||
|
// QNaN QNaN op1
|
||||||
|
|
||||||
|
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
|
||||||
|
using FPT = mcl::unsigned_integer_of_size<fsize>;
|
||||||
|
|
||||||
|
// result = xmm_a == SNaN || xmm_b == QNaN
|
||||||
|
{
|
||||||
|
// evaluate xmm_b == QNaN
|
||||||
|
code.xorps(tmp1, tmp1);
|
||||||
|
FCODE(cmpunordp)(tmp1, xmm_b);
|
||||||
|
code.movaps(tmp2, xmm_b);
|
||||||
|
ICODE(psll)(tmp2, static_cast<int>(fsize - FP::FPInfo<FPT>::explicit_mantissa_width));
|
||||||
|
{
|
||||||
|
code.psrad(tmp2, 31);
|
||||||
|
if constexpr (fsize == 64) {
|
||||||
|
code.pshufd(tmp2, tmp2, 0b11110101);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
code.andps(tmp1, tmp2);
|
||||||
|
|
||||||
|
code.movaps(result, tmp1);
|
||||||
|
|
||||||
|
// evaluate xmm_a == SNaN
|
||||||
|
code.xorps(tmp1, tmp1);
|
||||||
|
FCODE(cmpunordp)(tmp1, xmm_a);
|
||||||
|
code.movaps(tmp2, xmm_a);
|
||||||
|
ICODE(psll)(tmp2, static_cast<int>(fsize - FP::FPInfo<FPT>::explicit_mantissa_width));
|
||||||
|
{
|
||||||
|
code.psrad(tmp2, 31);
|
||||||
|
if constexpr (fsize == 64) {
|
||||||
|
code.pshufd(tmp2, tmp2, 0b11110101);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
code.andnps(tmp2, tmp1);
|
||||||
|
|
||||||
|
code.orps(result, tmp2);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Denormalization quiets SNaNs, therefore should happen after SNaN detection!
|
||||||
|
DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {xmm_a, xmm_b}, tmp1);
|
||||||
|
|
||||||
|
// intermediate result = max/min(xmm_a, xmm_b)
|
||||||
|
{
|
||||||
|
const Xbyak::Xmm eq_mask = tmp1;
|
||||||
|
const Xbyak::Xmm eq = tmp2;
|
||||||
|
|
||||||
|
code.movaps(eq_mask, xmm_a);
|
||||||
|
FCODE(cmpneqp)(eq_mask, xmm_b);
|
||||||
|
|
||||||
|
code.movaps(eq, xmm_a);
|
||||||
|
code.movaps(intermediate_result, xmm_a);
|
||||||
|
if constexpr (is_max) {
|
||||||
|
code.andps(eq, xmm_b);
|
||||||
|
FCODE(maxp)(intermediate_result, xmm_b);
|
||||||
|
} else {
|
||||||
|
code.orps(eq, xmm_b);
|
||||||
|
FCODE(minp)(intermediate_result, xmm_b);
|
||||||
|
}
|
||||||
|
|
||||||
|
code.andps(intermediate_result, eq_mask);
|
||||||
|
code.andnps(eq_mask, eq);
|
||||||
|
code.orps(intermediate_result, eq_mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
code.andps(xmm_a, result);
|
||||||
|
code.andnps(result, intermediate_result);
|
||||||
|
code.orps(result, xmm_a);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ctx.FPCR(fpcr_controlled).DN()) {
|
||||||
|
const Xbyak::Xmm ord_mask = tmp1;
|
||||||
|
|
||||||
|
code.xorps(ord_mask, ord_mask);
|
||||||
|
FCODE(cmpordp)(ord_mask, result);
|
||||||
|
|
||||||
|
code.andps(result, ord_mask);
|
||||||
|
code.andnps(ord_mask, GetNaNVector<fsize>(code));
|
||||||
|
code.orps(result, ord_mask);
|
||||||
|
} else {
|
||||||
|
const Xbyak::Xmm nan_mask = tmp1;
|
||||||
|
|
||||||
|
code.xorps(nan_mask, nan_mask);
|
||||||
|
FCODE(cmpunordp)(nan_mask, result);
|
||||||
|
code.andps(nan_mask, GetVectorOf<fsize, FP::FPInfo<FPT>::mantissa_msb>(code));
|
||||||
|
code.orps(result, nan_mask);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
}
|
||||||
|
|
||||||
// FPVectorMax32: forwards to EmitFPVectorMinMax with 32-bit elements, max variant.
void EmitX64::EmitFPVectorMax32(EmitContext& ctx, IR::Inst* inst) {
    constexpr size_t element_bits = 32;
    constexpr bool want_max = true;
    EmitFPVectorMinMax<element_bits, want_max>(code, ctx, inst);
}
|
||||||
|
@ -1068,6 +1179,14 @@ void EmitX64::EmitFPVectorMax64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
EmitFPVectorMinMax<64, true>(code, ctx, inst);
|
EmitFPVectorMinMax<64, true>(code, ctx, inst);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FPVectorMaxNumeric32: forwards to EmitFPVectorMinMaxNumeric with 32-bit elements, max variant.
void EmitX64::EmitFPVectorMaxNumeric32(EmitContext& ctx, IR::Inst* inst) {
    constexpr size_t element_bits = 32;
    constexpr bool want_max = true;
    EmitFPVectorMinMaxNumeric<element_bits, want_max>(code, ctx, inst);
}
|
||||||
|
|
||||||
|
// FPVectorMaxNumeric64: forwards to EmitFPVectorMinMaxNumeric with 64-bit elements, max variant.
void EmitX64::EmitFPVectorMaxNumeric64(EmitContext& ctx, IR::Inst* inst) {
    constexpr size_t element_bits = 64;
    constexpr bool want_max = true;
    EmitFPVectorMinMaxNumeric<element_bits, want_max>(code, ctx, inst);
}
|
||||||
|
|
||||||
// FPVectorMin32: forwards to EmitFPVectorMinMax with 32-bit elements, min variant.
void EmitX64::EmitFPVectorMin32(EmitContext& ctx, IR::Inst* inst) {
    constexpr size_t element_bits = 32;
    constexpr bool want_max = false;
    EmitFPVectorMinMax<element_bits, want_max>(code, ctx, inst);
}
|
||||||
|
@ -1076,6 +1195,14 @@ void EmitX64::EmitFPVectorMin64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
EmitFPVectorMinMax<64, false>(code, ctx, inst);
|
EmitFPVectorMinMax<64, false>(code, ctx, inst);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FPVectorMinNumeric32: forwards to EmitFPVectorMinMaxNumeric with 32-bit elements, min variant.
void EmitX64::EmitFPVectorMinNumeric32(EmitContext& ctx, IR::Inst* inst) {
    constexpr size_t element_bits = 32;
    constexpr bool want_max = false;
    EmitFPVectorMinMaxNumeric<element_bits, want_max>(code, ctx, inst);
}
|
||||||
|
|
||||||
|
// FPVectorMinNumeric64: forwards to EmitFPVectorMinMaxNumeric with 64-bit elements, min variant.
void EmitX64::EmitFPVectorMinNumeric64(EmitContext& ctx, IR::Inst* inst) {
    constexpr size_t element_bits = 64;
    constexpr bool want_max = false;
    EmitFPVectorMinMaxNumeric<element_bits, want_max>(code, ctx, inst);
}
|
||||||
|
|
||||||
// FPVectorMul32: runs the generic three-operand vector helper with mulps on 32-bit elements.
void EmitX64::EmitFPVectorMul32(EmitContext& ctx, IR::Inst* inst) {
    constexpr size_t element_bits = 32;
    EmitThreeOpVectorOperation<element_bits, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::mulps);
}
|
||||||
|
|
|
@ -2634,6 +2634,16 @@ U128 IREmitter::FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpc
|
||||||
UNREACHABLE();
|
UNREACHABLE();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Creates the FPVectorMaxNumeric IR instruction matching `esize` (32 or 64).
// `a` and `b` are the operands; `fpcr_controlled` is passed on as a U1 immediate.
// Any other esize reaches UNREACHABLE().
U128 IREmitter::FPVectorMaxNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
    if (esize == 32) {
        return Inst<U128>(Opcode::FPVectorMaxNumeric32, a, b, Imm1(fpcr_controlled));
    }
    if (esize == 64) {
        return Inst<U128>(Opcode::FPVectorMaxNumeric64, a, b, Imm1(fpcr_controlled));
    }
    UNREACHABLE();
}
|
||||||
|
|
||||||
U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
|
U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
|
||||||
switch (esize) {
|
switch (esize) {
|
||||||
case 32:
|
case 32:
|
||||||
|
@ -2644,6 +2654,16 @@ U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpc
|
||||||
UNREACHABLE();
|
UNREACHABLE();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Creates the FPVectorMinNumeric IR instruction matching `esize` (32 or 64).
// `a` and `b` are the operands; `fpcr_controlled` is passed on as a U1 immediate.
// Any other esize reaches UNREACHABLE().
U128 IREmitter::FPVectorMinNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
    if (esize == 32) {
        return Inst<U128>(Opcode::FPVectorMinNumeric32, a, b, Imm1(fpcr_controlled));
    }
    if (esize == 64) {
        return Inst<U128>(Opcode::FPVectorMinNumeric64, a, b, Imm1(fpcr_controlled));
    }
    UNREACHABLE();
}
|
||||||
|
|
||||||
U128 IREmitter::FPVectorMul(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
|
U128 IREmitter::FPVectorMul(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
|
||||||
switch (esize) {
|
switch (esize) {
|
||||||
case 32:
|
case 32:
|
||||||
|
|
|
@ -372,7 +372,9 @@ public:
|
||||||
U128 FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
U128 FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||||
U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||||
U128 FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
U128 FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||||
|
// Emits Opcode::FPVectorMaxNumeric32 or FPVectorMaxNumeric64 according to esize (32 or 64).
U128 FPVectorMaxNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||||
U128 FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
U128 FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||||
|
// Emits Opcode::FPVectorMinNumeric32 or FPVectorMinNumeric64 according to esize (32 or 64).
U128 FPVectorMinNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||||
U128 FPVectorMul(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
U128 FPVectorMul(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||||
U128 FPVectorMulAdd(size_t esize, const U128& addend, const U128& op1, const U128& op2, bool fpcr_controlled = true);
|
U128 FPVectorMulAdd(size_t esize, const U128& addend, const U128& op1, const U128& op2, bool fpcr_controlled = true);
|
||||||
U128 FPVectorMulX(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
U128 FPVectorMulX(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||||
|
|
|
@ -668,8 +668,12 @@ OPCODE(FPVectorGreaterEqual32, U128, U128
|
||||||
OPCODE(FPVectorGreaterEqual64, U128, U128, U128, U1 )
|
OPCODE(FPVectorGreaterEqual64, U128, U128, U128, U1 )
|
||||||
OPCODE(FPVectorMax32, U128, U128, U128, U1 )
|
OPCODE(FPVectorMax32, U128, U128, U128, U1 )
|
||||||
OPCODE(FPVectorMax64, U128, U128, U128, U1 )
|
OPCODE(FPVectorMax64, U128, U128, U128, U1 )
|
||||||
|
OPCODE(FPVectorMaxNumeric32, U128, U128, U128, U1 )
|
||||||
|
OPCODE(FPVectorMaxNumeric64, U128, U128, U128, U1 )
|
||||||
OPCODE(FPVectorMin32, U128, U128, U128, U1 )
|
OPCODE(FPVectorMin32, U128, U128, U128, U1 )
|
||||||
OPCODE(FPVectorMin64, U128, U128, U128, U1 )
|
OPCODE(FPVectorMin64, U128, U128, U128, U1 )
|
||||||
|
OPCODE(FPVectorMinNumeric32, U128, U128, U128, U1 )
|
||||||
|
OPCODE(FPVectorMinNumeric64, U128, U128, U128, U1 )
|
||||||
OPCODE(FPVectorMul32, U128, U128, U128, U1 )
|
OPCODE(FPVectorMul32, U128, U128, U128, U1 )
|
||||||
OPCODE(FPVectorMul64, U128, U128, U128, U1 )
|
OPCODE(FPVectorMul64, U128, U128, U128, U1 )
|
||||||
OPCODE(FPVectorMulAdd16, U128, U128, U128, U128, U1 )
|
OPCODE(FPVectorMulAdd16, U128, U128, U128, U128, U1 )
|
||||||
|
|
Loading…
Reference in a new issue