diff --git a/src/backend_x64/emit_x64_floating_point.cpp b/src/backend_x64/emit_x64_floating_point.cpp
index 3b651f80..4f2a8bc0 100644
--- a/src/backend_x64/emit_x64_floating_point.cpp
+++ b/src/backend_x64/emit_x64_floating_point.cpp
@@ -92,7 +92,7 @@ void DenormalsAreZero(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Reg64 gpr_
     // SSE doesn't do this for us when SSE's DAZ is enabled.
 
     code.ja(end);
-    code.pxor(xmm_value, xmm_value);
+    code.andps(xmm_value, code.MConst(xword, fsize == 32 ? f32_negative_zero : f64_negative_zero));
     code.mov(dword[r15 + code.GetJitStateInfo().offsetof_FPSCR_IDC], u32(1 << 7));
     code.L(end);
 }
@@ -267,13 +267,13 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, [[maybe_unus
     Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
     Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr();
 
-    if constexpr(!std::is_same_v<PreprocessFunction, std::nullptr_t>) {
-        preprocess(result, operand, gpr_scratch, end);
-    }
     if (ctx.FPSCR_FTZ()) {
         DenormalsAreZero<fsize>(code, result, gpr_scratch);
         DenormalsAreZero<fsize>(code, operand, gpr_scratch);
     }
+    if constexpr(!std::is_same_v<PreprocessFunction, std::nullptr_t>) {
+        preprocess(result, operand, gpr_scratch, end);
+    }
     if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
         PreProcessNaNs(code, result, operand, end);
     }
diff --git a/src/backend_x64/emit_x64_vector_floating_point.cpp b/src/backend_x64/emit_x64_vector_floating_point.cpp
index 452b2337..bbb181b6 100644
--- a/src/backend_x64/emit_x64_vector_floating_point.cpp
+++ b/src/backend_x64/emit_x64_vector_floating_point.cpp
@@ -131,6 +131,15 @@ Xbyak::Address GetNaNVector(BlockOfCode& code) {
     }
 }
 
+template<size_t fsize>
+Xbyak::Address GetNegativeZeroVector(BlockOfCode& code) {
+    if constexpr (fsize == 32) {
+        return code.MConst(xword, 0x8000'0000'8000'0000, 0x8000'0000'8000'0000);
+    } else {
+        return code.MConst(xword, 0x8000'0000'0000'0000, 0x8000'0000'0000'0000);
+    }
+}
+
 template<size_t fsize>
 void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) {
     if (ctx.FPSCR_DN()) {
@@ -146,6 +155,20 @@ void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) {
     }
 }
 
+template<size_t fsize>
+void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list<Xbyak::Xmm> to_daz, Xbyak::Xmm tmp) {
+    if (ctx.FPSCR_FTZ()) {
+        if (ctx.FPSCR_RMode() != FP::RoundingMode::TowardsMinusInfinity) {
+            code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
+        } else {
+            code.xorps(tmp, tmp);
+        }
+        for (const Xbyak::Xmm& xmm : to_daz) {
+            FCODE(addp)(xmm, tmp);
+        }
+    }
+}
+
 template<typename T>
 struct DefaultIndexer {
     std::tuple<T> operator()(size_t i, const VectorArray<T>& a) {
@@ -565,12 +588,14 @@ static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
     if (ctx.FPSCR_DN()) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
         const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
-        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+        const Xbyak::Xmm xmm_b = ctx.FPSCR_FTZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
 
         const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
 
+        DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
+
         if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
             FCODE(vcmpeqp)(mask, result, xmm_b);
             FCODE(vcmpunordp)(nan_mask, result, xmm_b);
@@ -602,10 +627,17 @@ static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
         return;
     }
 
-    EmitThreeOpVectorOperation(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){
+    EmitThreeOpVectorOperation(code, ctx, inst, [&](const Xbyak::Xmm& result, Xbyak::Xmm xmm_b){
         const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm();
 
+        if (ctx.FPSCR_FTZ()) {
+            const Xbyak::Xmm prev_xmm_b = xmm_b;
+            xmm_b = ctx.reg_alloc.ScratchXmm();
+            code.movaps(xmm_b, prev_xmm_b);
+            DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
+        }
+
         // What we are doing here is handling the case when the inputs are differently signed zeros.
         // x86-64 treats differently signed zeros as equal while ARM does not.
         // Thus if we AND together things that x86-64 thinks are equal we'll get the positive zero.
@@ -643,12 +675,14 @@ static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
     if (ctx.FPSCR_DN()) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
         const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
-        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+        const Xbyak::Xmm xmm_b = ctx.FPSCR_FTZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
 
         const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
 
+        DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
+
         if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
             FCODE(vcmpeqp)(mask, result, xmm_b);
             FCODE(vcmpunordp)(nan_mask, result, xmm_b);
@@ -680,10 +714,17 @@ static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
         return;
     }
 
-    EmitThreeOpVectorOperation(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){
+    EmitThreeOpVectorOperation(code, ctx, inst, [&](const Xbyak::Xmm& result, Xbyak::Xmm xmm_b){
        const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
        const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm();
 
+        if (ctx.FPSCR_FTZ()) {
+            const Xbyak::Xmm prev_xmm_b = xmm_b;
+            xmm_b = ctx.reg_alloc.ScratchXmm();
+            code.movaps(xmm_b, prev_xmm_b);
+            DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
+        }
+
         // What we are doing here is handling the case when the inputs are differently signed zeros.
         // x86-64 treats differently signed zeros as equal while ARM does not.
         // Thus if we OR together things that x86-64 thinks are equal we'll get the negative zero.
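
Note on the new vector DenormalsAreZero helper: with the host MXCSR DAZ/FTZ bits set (which dynarmic does when guest FPSCR.FTZ is active), adding a signed zero to every lane flushes denormal inputs to a zero of the same sign while leaving normals, infinities and NaNs untouched. The addend is -0.0 under round-to-nearest, and +0.0 when the guest rounds towards minus infinity, because in that mode (+0) + (-0) rounds to -0 and would flip the sign of a flushed positive denormal. The following standalone host-side sketch is not part of the patch; it only assumes the standard SSE intrinsics headers and illustrates the identity:

#include <cstdio>
#include <pmmintrin.h>   // _MM_SET_DENORMALS_ZERO_MODE / _MM_DENORMALS_ZERO_ON
#include <xmmintrin.h>   // _MM_SET_FLUSH_ZERO_MODE / _MM_FLUSH_ZERO_ON

int main() {
    // Put the host FPU in the state the JIT uses when FPSCR.FTZ is set.
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);

    volatile float pos_denormal = 1e-40f;   // subnormal in binary32
    volatile float neg_denormal = -1e-40f;
    volatile float normal = 1.5f;

    // With DAZ active, adding -0.0f flushes denormal inputs to a zero of the
    // same sign (round-to-nearest) and leaves every other value unchanged.
    std::printf("%g %g %g\n",
                pos_denormal + -0.0f,    // prints 0
                neg_denormal + -0.0f,    // prints -0
                normal + -0.0f);         // prints 1.5
}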
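Note on the AND/OR fix-up kept in the min/max paths: x86's maxps/minps treat +0 and -0 as equal and simply return the second operand, while ARM's FMAX must return +0 and FMIN must return -0 for that pair. The emitted code therefore ANDs (for max) or ORs (for min) the operands wherever they compare equal. A plain C++ sketch of the max case, written purely as an illustration (the function name is made up and NaN handling is omitted):

#include <cstdint>
#include <cstring>
#include <cstdio>

float arm_like_fmax(float a, float b) {
    if (a == b) {                            // covers the +0 / -0 case (and equal normals)
        std::uint32_t ua, ub;
        std::memcpy(&ua, &a, sizeof(ua));
        std::memcpy(&ub, &b, sizeof(ub));
        const std::uint32_t anded = ua & ub; // +0 wins over -0; the min path would OR instead
        float r;
        std::memcpy(&r, &anded, sizeof(r));
        return r;
    }
    return a > b ? a : b;                    // NaN handling omitted for brevity
}

int main() {
    std::printf("%g\n", arm_like_fmax(-0.0f, +0.0f));  // prints 0 (positive sign bit)
}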