1
0
Fork 0
forked from suyu/suyu

Shaders: Fix multiplications between 0.0 and inf

The PICA200 semantics for multiplication are so that when multiplying
inf by exactly 0.0, the result is 0.0, instead of NaN, as defined by
IEEE. This is relied upon by games.

Fixes #1024 (missing OoT interface items)
This commit is contained in:
Yuri Kunde Schlesner 2015-08-24 01:48:15 -03:00
parent 082b74fa24
commit 630a850d4d
3 changed files with 60 additions and 42 deletions

View file

@ -1021,12 +1021,20 @@ struct float24 {
return ret; return ret;
} }
static float24 Zero() {
return FromFloat32(0.f);
}
// Not recommended for anything but logging // Not recommended for anything but logging
float ToFloat32() const { float ToFloat32() const {
return value; return value;
} }
float24 operator * (const float24& flt) const { float24 operator * (const float24& flt) const {
if ((this->value == 0.f && flt.value == flt.value) ||
(flt.value == 0.f && this->value == this->value))
// PICA gives 0 instead of NaN when multiplying by inf
return Zero();
return float24::FromFloat32(ToFloat32() * flt.ToFloat32()); return float24::FromFloat32(ToFloat32() * flt.ToFloat32());
} }
@ -1043,7 +1051,11 @@ struct float24 {
} }
float24& operator *= (const float24& flt) { float24& operator *= (const float24& flt) {
value *= flt.ToFloat32(); if ((this->value == 0.f && flt.value == flt.value) ||
(flt.value == 0.f && this->value == this->value))
// PICA gives 0 instead of NaN when multiplying by inf
*this = Zero();
else value *= flt.ToFloat32();
return *this; return *this;
} }

View file

@ -246,6 +246,19 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
} }
} }
void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) {
MOVAPS(scratch, R(src1));
CMPPS(scratch, R(src2), CMP_ORD);
MULPS(src1, R(src2));
MOVAPS(src2, R(src1));
CMPPS(src2, R(src2), CMP_UNORD);
XORPS(scratch, R(src2));
ANDPS(src1, R(scratch));
}
void JitCompiler::Compile_EvaluateCondition(Instruction instr) { void JitCompiler::Compile_EvaluateCondition(Instruction instr) {
// Note: NXOR is used below to check for equality // Note: NXOR is used below to check for equality
switch (instr.flow_control.op) { switch (instr.flow_control.op) {
@ -309,21 +322,17 @@ void JitCompiler::Compile_DP3(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
if (Common::GetCPUCaps().sse4_1) { Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
DPPS(SRC1, R(SRC2), 0x7f);
} else {
MULPS(SRC1, R(SRC2));
MOVAPS(SRC2, R(SRC1)); MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));
MOVAPS(SRC3, R(SRC1)); MOVAPS(SRC3, R(SRC1));
SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2)); SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
ADDPS(SRC1, R(SRC2)); ADDPS(SRC1, R(SRC2));
ADDPS(SRC1, R(SRC3)); ADDPS(SRC1, R(SRC3));
}
Compile_DestEnable(instr, SRC1); Compile_DestEnable(instr, SRC1);
} }
@ -332,19 +341,15 @@ void JitCompiler::Compile_DP4(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
if (Common::GetCPUCaps().sse4_1) { Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
DPPS(SRC1, R(SRC2), 0xff);
} else {
MULPS(SRC1, R(SRC2));
MOVAPS(SRC2, R(SRC1)); MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
ADDPS(SRC1, R(SRC2)); ADDPS(SRC1, R(SRC2));
MOVAPS(SRC2, R(SRC1)); MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
ADDPS(SRC1, R(SRC2)); ADDPS(SRC1, R(SRC2));
}
Compile_DestEnable(instr, SRC1); Compile_DestEnable(instr, SRC1);
} }
@ -361,24 +366,23 @@ void JitCompiler::Compile_DPH(Instruction instr) {
if (Common::GetCPUCaps().sse4_1) { if (Common::GetCPUCaps().sse4_1) {
// Set 4th component to 1.0 // Set 4th component to 1.0
BLENDPS(SRC1, R(ONE), 0x8); // 0b1000 BLENDPS(SRC1, R(ONE), 0x8); // 0b1000
DPPS(SRC1, R(SRC2), 0xff);
} else { } else {
// Reverse to set the 4th component to 1.0 // Reverse to set the 4th component to 1.0
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
MOVSS(SRC1, R(ONE)); MOVSS(SRC1, R(ONE));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
MULPS(SRC1, R(SRC2));
MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
ADDPS(SRC1, R(SRC2));
MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
ADDPS(SRC1, R(SRC2));
} }
Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
ADDPS(SRC1, R(SRC2));
MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
ADDPS(SRC1, R(SRC2));
Compile_DestEnable(instr, SRC1); Compile_DestEnable(instr, SRC1);
} }
@ -417,7 +421,7 @@ void JitCompiler::Compile_LG2(Instruction instr) {
void JitCompiler::Compile_MUL(Instruction instr) { void JitCompiler::Compile_MUL(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
MULPS(SRC1, R(SRC2)); Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
Compile_DestEnable(instr, SRC1); Compile_DestEnable(instr, SRC1);
} }
@ -635,12 +639,8 @@ void JitCompiler::Compile_MAD(Instruction instr) {
Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
} }
if (Common::GetCPUCaps().fma) { Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
VFMADD213PS(SRC1, SRC2, R(SRC3)); ADDPS(SRC1, R(SRC3));
} else {
MULPS(SRC1, R(SRC2));
ADDPS(SRC1, R(SRC3));
}
Compile_DestEnable(instr, SRC1); Compile_DestEnable(instr, SRC1);
} }

View file

@ -68,6 +68,12 @@ private:
void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest); void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest);
void Compile_DestEnable(Instruction instr, Gen::X64Reg dest); void Compile_DestEnable(Instruction instr, Gen::X64Reg dest);
/**
* Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying
* zero by inf. Clobbers `src2` and `scratch`.
*/
void Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch);
void Compile_EvaluateCondition(Instruction instr); void Compile_EvaluateCondition(Instruction instr);
void Compile_UniformCondition(Instruction instr); void Compile_UniformCondition(Instruction instr);