From 0f412247ed1d39e3d7ae4cdf110a8e422cae77f8 Mon Sep 17 00:00:00 2001
From: MerryMage <MerryMage@users.noreply.github.com>
Date: Sun, 7 Aug 2016 12:19:07 +0100
Subject: [PATCH] VFP: Implement VSQRT

---
 src/backend_x64/emit_x64.cpp                  | 64 ++++++++++++++++---
 src/frontend/decoder/vfp2.h                   |  2 +-
 .../disassembler/disassembler_arm.cpp         |  4 ++
 src/frontend/ir/ir_emitter.cpp                |  7 ++
 src/frontend/ir/ir_emitter.h                  |  2 +
 src/frontend/ir/opcodes.inc                   |  2 +
 .../translate/translate_arm/translate_arm.h   |  1 +
 src/frontend/translate/translate_arm/vfp2.cpp | 17 +++++
 8 files changed, 88 insertions(+), 11 deletions(-)

diff --git a/src/backend_x64/emit_x64.cpp b/src/backend_x64/emit_x64.cpp
index d6edd91b..4a95edb5 100644
--- a/src/backend_x64/emit_x64.cpp
+++ b/src/backend_x64/emit_x64.cpp
@@ -1090,7 +1090,7 @@ static void DefaultNaN64(XEmitter* code, Routines* routines, X64Reg xmm_value) {
     code->SetJumpTarget(fixup);
 }
 
-static void FPOp32(XEmitter* code, Routines* routines, RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst, void (XEmitter::*fn)(X64Reg, const OpArg&)) {
+static void FPThreeOp32(XEmitter* code, Routines* routines, RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst, void (XEmitter::*fn)(X64Reg, const OpArg&)) {
     IR::Value a = inst->GetArg(0);
     IR::Value b = inst->GetArg(1);
 
@@ -1111,7 +1111,7 @@ static void FPOp32(XEmitter* code, Routines* routines, RegAlloc& reg_alloc, IR::
     }
 }
 
-static void FPOp64(XEmitter* code, Routines* routines, RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst, void (XEmitter::*fn)(X64Reg, const OpArg&)) {
+static void FPThreeOp64(XEmitter* code, Routines* routines, RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst, void (XEmitter::*fn)(X64Reg, const OpArg&)) {
     IR::Value a = inst->GetArg(0);
     IR::Value b = inst->GetArg(1);
 
@@ -1132,6 +1132,42 @@ static void FPOp64(XEmitter* code, Routines* routines, RegAlloc& reg_alloc, IR::
     }
 }
 
+static void FPTwoOp32(XEmitter* code, Routines* routines, RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst, void (XEmitter::*fn)(X64Reg, const OpArg&)) { // one-source FP helper: fn(result, result) wrapped in FPSCR-dependent fixups
+    IR::Value a = inst->GetArg(0); // the only argument of inst that is read here
+
+    X64Reg result = reg_alloc.UseDefRegister(a, inst, any_xmm); // presumably holds `a` on entry and becomes inst's value -- confirm against RegAlloc
+    X64Reg gpr_scratch = reg_alloc.ScratchRegister(any_gpr); // only handed to the two FTZ fixups below
+
+    if (block.location.FPSCR_FTZ()) { // input-side fixup, run only when the block location reports FTZ
+        DenormalsAreZero32(code, result, gpr_scratch);
+    }
+    (code->*fn)(result, R(result)); // same register is passed as destination and as source operand
+    if (block.location.FPSCR_FTZ()) { // output-side fixup under the same FTZ condition
+        FlushToZero32(code, result, gpr_scratch);
+    }
+    if (block.location.FPSCR_DN()) { // when DN is reported, the result also goes through DefaultNaN32
+        DefaultNaN32(code, routines, result);
+    }
+}
+
+static void FPTwoOp64(XEmitter* code, Routines* routines, RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst, void (XEmitter::*fn)(X64Reg, const OpArg&)) { // one-source FP helper using the *64 fixups: fn(result, result) wrapped in FPSCR-dependent fixups
+    IR::Value a = inst->GetArg(0); // the only argument of inst that is read here
+
+    X64Reg result = reg_alloc.UseDefRegister(a, inst, any_xmm); // presumably holds `a` on entry and becomes inst's value -- confirm against RegAlloc
+    X64Reg gpr_scratch = reg_alloc.ScratchRegister(any_gpr); // only handed to the two FTZ fixups below
+
+    if (block.location.FPSCR_FTZ()) { // input-side fixup, run only when the block location reports FTZ
+        DenormalsAreZero64(code, routines, result, gpr_scratch); // unlike the 32-bit helper, this fixup also takes `routines`
+    }
+    (code->*fn)(result, R(result)); // same register is passed as destination and as source operand
+    if (block.location.FPSCR_FTZ()) { // output-side fixup under the same FTZ condition
+        FlushToZero64(code, routines, result, gpr_scratch);
+    }
+    if (block.location.FPSCR_DN()) { // when DN is reported, the result also goes through DefaultNaN64
+        DefaultNaN64(code, routines, result);
+    }
+}
+
 void EmitX64::EmitFPAbs32(IR::Block&, IR::Inst* inst) {
     IR::Value a = inst->GetArg(0);
 
@@ -1165,35 +1201,43 @@ void EmitX64::EmitFPNeg64(IR::Block&, IR::Inst* inst) {
 }
 
 void EmitX64::EmitFPAdd32(IR::Block& block, IR::Inst* inst) {
-    FPOp32(code, routines, reg_alloc, block, inst, &XEmitter::ADDSS);
+    FPThreeOp32(code, routines, reg_alloc, block, inst, &XEmitter::ADDSS);
 }
 
 void EmitX64::EmitFPAdd64(IR::Block& block, IR::Inst* inst) {
-    FPOp64(code, routines, reg_alloc, block, inst, &XEmitter::ADDSD);
+    FPThreeOp64(code, routines, reg_alloc, block, inst, &XEmitter::ADDSD);
 }
 
 void EmitX64::EmitFPDiv32(IR::Block& block, IR::Inst* inst) {
-    FPOp32(code, routines, reg_alloc, block, inst, &XEmitter::DIVSS);
+    FPThreeOp32(code, routines, reg_alloc, block, inst, &XEmitter::DIVSS);
 }
 
 void EmitX64::EmitFPDiv64(IR::Block& block, IR::Inst* inst) {
-    FPOp64(code, routines, reg_alloc, block, inst, &XEmitter::DIVSD);
+    FPThreeOp64(code, routines, reg_alloc, block, inst, &XEmitter::DIVSD);
 }
 
 void EmitX64::EmitFPMul32(IR::Block& block, IR::Inst* inst) {
-    FPOp32(code, routines, reg_alloc, block, inst, &XEmitter::MULSS);
+    FPThreeOp32(code, routines, reg_alloc, block, inst, &XEmitter::MULSS);
 }
 
 void EmitX64::EmitFPMul64(IR::Block& block, IR::Inst* inst) {
-    FPOp64(code, routines, reg_alloc, block, inst, &XEmitter::MULSD);
+    FPThreeOp64(code, routines, reg_alloc, block, inst, &XEmitter::MULSD);
+}
+
+void EmitX64::EmitFPSqrt32(IR::Block& block, IR::Inst* inst) {
+    FPTwoOp32(code, routines, reg_alloc, block, inst, &XEmitter::SQRTSS); // one-source helper with SQRTSS as the operation to apply
+}
+
+void EmitX64::EmitFPSqrt64(IR::Block& block, IR::Inst* inst) {
+    FPTwoOp64(code, routines, reg_alloc, block, inst, &XEmitter::SQRTSD); // one-source helper with SQRTSD as the operation to apply
 }
 
 void EmitX64::EmitFPSub32(IR::Block& block, IR::Inst* inst) {
-    FPOp32(code, routines, reg_alloc, block, inst, &XEmitter::SUBSS);
+    FPThreeOp32(code, routines, reg_alloc, block, inst, &XEmitter::SUBSS);
 }
 
 void EmitX64::EmitFPSub64(IR::Block& block, IR::Inst* inst) {
-    FPOp64(code, routines, reg_alloc, block, inst, &XEmitter::SUBSD);
+    FPThreeOp64(code, routines, reg_alloc, block, inst, &XEmitter::SUBSD);
 }
 
 void EmitX64::EmitReadMemory8(IR::Block&, IR::Inst* inst) {
diff --git a/src/frontend/decoder/vfp2.h b/src/frontend/decoder/vfp2.h
index 0feb013c..523a235e 100644
--- a/src/frontend/decoder/vfp2.h
+++ b/src/frontend/decoder/vfp2.h
@@ -79,7 +79,7 @@ boost::optional<const VFP2Matcher<V>&> DecodeVFP2(u32 instruction) {
     // VMOV_reg
     INST(&V::vfp2_VABS,       "VABS",                "cccc11101D110000dddd101z11M0mmmm"),
     INST(&V::vfp2_VNEG,       "VNEG",                "cccc11101D110001dddd101z01M0mmmm"),
-    // VSQRT
+    INST(&V::vfp2_VSQRT,      "VSQRT",               "cccc11101D110001dddd101z11M0mmmm"),
     // VCMP
     // VCMPE
     // VCVT
diff --git a/src/frontend/disassembler/disassembler_arm.cpp b/src/frontend/disassembler/disassembler_arm.cpp
index 66bf19ea..fdd135cd 100644
--- a/src/frontend/disassembler/disassembler_arm.cpp
+++ b/src/frontend/disassembler/disassembler_arm.cpp
@@ -603,6 +603,10 @@ public:
     std::string vfp2_VNEG(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
         return Common::StringFromFormat("vneg%s.%s %s, %s", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D).c_str(), FPRegStr(sz, Vm, M).c_str());
     }
+
+    std::string vfp2_VSQRT(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) { // renders VSQRT as text: mnemonic + condition, type suffix, then the Vd/D register followed by the Vm/M register
+        return Common::StringFromFormat("vsqrt%s.%s %s, %s", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D).c_str(), FPRegStr(sz, Vm, M).c_str()); // sz selects the "f64" suffix over "f32" and is also forwarded to FPRegStr
+    }
 };
 
 std::string DisassembleArm(u32 instruction) {
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 40c6e890..cdc5ca97 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -320,6 +320,13 @@ IR::Value IREmitter::FPNeg64(const IR::Value& a) {
     return Inst(IR::Opcode::FPNeg64, {a});
 }
 
+IR::Value IREmitter::FPSqrt32(const IR::Value& a) {
+    return Inst(IR::Opcode::FPSqrt32, {a}); // single-argument instruction; the value returned by Inst() is passed straight through
+}
+
+IR::Value IREmitter::FPSqrt64(const IR::Value& a) {
+    return Inst(IR::Opcode::FPSqrt64, {a}); // single-argument instruction; the value returned by Inst() is passed straight through
+}
 
 IR::Value IREmitter::FPSub32(const IR::Value& a, const IR::Value& b, bool fpscr_controlled) {
     ASSERT(fpscr_controlled);
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 4960690f..f64ad122 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -102,6 +102,8 @@ public:
     IR::Value FPMul64(const IR::Value& a, const IR::Value& b, bool fpscr_controlled);
     IR::Value FPNeg32(const IR::Value& a);
     IR::Value FPNeg64(const IR::Value& a);
+    IR::Value FPSqrt32(const IR::Value& a);
+    IR::Value FPSqrt64(const IR::Value& a);
     IR::Value FPSub32(const IR::Value& a, const IR::Value& b, bool fpscr_controlled);
     IR::Value FPSub64(const IR::Value& a, const IR::Value& b, bool fpscr_controlled);
 
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 007082eb..891683c8 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -70,6 +70,8 @@ OPCODE(FPMul32,                 T::F32,         T::F32,         T::F32
 OPCODE(FPMul64,                 T::F64,         T::F64,         T::F64                          )
 OPCODE(FPNeg32,                 T::F32,         T::F32                                          )
 OPCODE(FPNeg64,                 T::F64,         T::F64                                          )
+OPCODE(FPSqrt32,                T::F32,         T::F32                                          )
+OPCODE(FPSqrt64,                T::F64,         T::F64                                          )
 OPCODE(FPSub32,                 T::F32,         T::F32,         T::F32                          )
 OPCODE(FPSub64,                 T::F64,         T::F64,         T::F64                          )
 
diff --git a/src/frontend/translate/translate_arm/translate_arm.h b/src/frontend/translate/translate_arm/translate_arm.h
index 5555e7c5..a1bb3dc6 100644
--- a/src/frontend/translate/translate_arm/translate_arm.h
+++ b/src/frontend/translate/translate_arm/translate_arm.h
@@ -332,6 +332,7 @@ struct ArmTranslatorVisitor final {
     // Floating-point misc instructions
     bool vfp2_VABS(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm);
     bool vfp2_VNEG(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm);
+    bool vfp2_VSQRT(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm);
 };
 
 } // namespace Arm
diff --git a/src/frontend/translate/translate_arm/vfp2.cpp b/src/frontend/translate/translate_arm/vfp2.cpp
index 52877abe..4e43fadb 100644
--- a/src/frontend/translate/translate_arm/vfp2.cpp
+++ b/src/frontend/translate/translate_arm/vfp2.cpp
@@ -226,5 +226,22 @@ bool ArmTranslatorVisitor::vfp2_VNEG(Cond cond, bool D, size_t Vd, bool sz, bool
     return true;
 }
 
+bool ArmTranslatorVisitor::vfp2_VSQRT(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) { // translates VSQRT: extended register m -> FPSqrt32/64 -> extended register d
+    if (ir.current_location.FPSCR_Len() != 1 || ir.current_location.FPSCR_Stride() != 1) // only Len == 1 and Stride == 1 is translated below
+        return InterpretThisInstruction(); // TODO: Vectorised floating point instructions
+
+    ExtReg d = ToExtReg(sz, Vd, D); // destination, built from the sz/Vd/D fields
+    ExtReg m = ToExtReg(sz, Vm, M); // source, built from the sz/Vm/M fields
+    // VSQRT.{F32,F64} <{S,D}d>, <{S,D}m>
+    if (ConditionPassed(cond)) { // the read/compute/write below is skipped when this returns false
+        auto a = ir.GetExtendedRegister(m);
+        auto result = sz // sz picks the 64-bit IR op, otherwise the 32-bit one
+                      ? ir.FPSqrt64(a)
+                      : ir.FPSqrt32(a);
+        ir.SetExtendedRegister(d, result);
+    }
+    return true; // returned whether or not the condition passed
+}
+
 } // namespace Arm
 } // namespace Dynarmic