fp: Change FPUnpacked to a normalized representation
Having a known position for the highest set bit makes writing algorithms easier.
This commit is contained in:
parent
680395a803
commit
7a673a8a43
10 changed files with 71 additions and 56 deletions
|
@ -150,7 +150,7 @@ inline size_t BitCount(Integral value) {
|
|||
}
|
||||
|
||||
template <typename T>
|
||||
inline int HighestSetBit(T value) {
|
||||
constexpr int HighestSetBit(T value) {
|
||||
auto x = static_cast<std::make_unsigned_t<T>>(value);
|
||||
int result = -1;
|
||||
while (x != 0) {
|
||||
|
|
|
@ -11,26 +11,15 @@
|
|||
|
||||
namespace Dynarmic::FP {
|
||||
|
||||
constexpr size_t normalized_point_position = 62;
|
||||
constexpr size_t product_point_position = normalized_point_position * 2;
|
||||
|
||||
static FPUnpacked NormalizeUnpacked(FPUnpacked op) {
|
||||
constexpr int desired_highest = static_cast<int>(normalized_point_position);
|
||||
|
||||
const int highest_bit = Common::HighestSetBit(op.mantissa);
|
||||
DEBUG_ASSERT(highest_bit < desired_highest);
|
||||
|
||||
const int offset = desired_highest - highest_bit;
|
||||
op.mantissa <<= offset;
|
||||
op.exponent -= offset;
|
||||
return op;
|
||||
/// Collapses a 128-bit intermediate mantissa into an FPUnpacked with a 64-bit mantissa.
///
/// Only the upper half of the mantissa is kept; the exponent is corrected for the
/// difference between the product's point position and the normalized point position.
static FPUnpacked ReduceMantissa(bool sign, int exponent, const u128& mantissa) {
    constexpr int point_position_correction = normalized_point_position - (product_point_position - 64);

    // Round-to-odd: any nonzero bit in the discarded lower half is folded into the
    // lowest kept bit, so that roundings performed later remain accurate.
    const u64 sticky_bit = static_cast<u64>(mantissa.lower != 0);
    const int corrected_exponent = exponent + point_position_correction;

    return FPUnpacked{sign, corrected_exponent, mantissa.upper | sticky_bit};
}
|
||||
|
||||
FPUnpacked FusedMulAdd(FPUnpacked addend, FPUnpacked op1, FPUnpacked op2) {
|
||||
addend = NormalizeUnpacked(addend);
|
||||
op1 = NormalizeUnpacked(op1);
|
||||
op2 = NormalizeUnpacked(op2);
|
||||
|
||||
const bool product_sign = op1.sign != op2.sign;
|
||||
const auto [product_exponent, product_value] = [op1, op2]{
|
||||
int exponent = op1.exponent + op2.exponent;
|
||||
|
@ -47,10 +36,10 @@ FPUnpacked FusedMulAdd(FPUnpacked addend, FPUnpacked op1, FPUnpacked op2) {
|
|||
}
|
||||
|
||||
if (addend.mantissa == 0) {
|
||||
return FPUnpacked{product_sign, product_exponent + 64, product_value.upper | u64(product_value.lower != 0)};
|
||||
return ReduceMantissa(product_sign, product_exponent, product_value);
|
||||
}
|
||||
|
||||
const int exp_diff = product_exponent - (addend.exponent - normalized_point_position);
|
||||
const int exp_diff = product_exponent - addend.exponent;
|
||||
|
||||
if (product_sign == addend.sign) {
|
||||
// Addition
|
||||
|
@ -63,7 +52,7 @@ FPUnpacked FusedMulAdd(FPUnpacked addend, FPUnpacked op1, FPUnpacked op2) {
|
|||
|
||||
// addend < product
|
||||
const u128 result = product_value + StickyLogicalShiftRight(addend.mantissa, exp_diff - normalized_point_position);
|
||||
return FPUnpacked{product_sign, product_exponent + 64, result.upper | u64(result.lower != 0)};
|
||||
return ReduceMantissa(product_sign, product_exponent, result);
|
||||
}
|
||||
|
||||
// Subtraction
|
||||
|
@ -80,7 +69,7 @@ FPUnpacked FusedMulAdd(FPUnpacked addend, FPUnpacked op1, FPUnpacked op2) {
|
|||
result = product_value - addend_long;
|
||||
} else if (exp_diff <= 0) {
|
||||
result_sign = !product_sign;
|
||||
result_exponent = addend.exponent - normalized_point_position;
|
||||
result_exponent = addend.exponent;
|
||||
result = addend_long - StickyLogicalShiftRight(product_value, -exp_diff);
|
||||
} else {
|
||||
result_sign = product_sign;
|
||||
|
@ -95,7 +84,7 @@ FPUnpacked FusedMulAdd(FPUnpacked addend, FPUnpacked op1, FPUnpacked op2) {
|
|||
const int required_shift = normalized_point_position - Common::HighestSetBit(result.upper);
|
||||
result = result << required_shift;
|
||||
result_exponent -= required_shift;
|
||||
return FPUnpacked{result_sign, result_exponent + 64, result.upper | u64(result.lower != 0)};
|
||||
return ReduceMantissa(result_sign, result_exponent, result);
|
||||
}
|
||||
|
||||
} // namespace Dynarmic::FP
|
||||
|
|
|
@ -10,6 +10,7 @@ namespace Dynarmic::FP {
|
|||
|
||||
struct FPUnpacked;
|
||||
|
||||
/// This function assumes all arguments have been normalized.
|
||||
FPUnpacked FusedMulAdd(FPUnpacked addend, FPUnpacked op1, FPUnpacked op2);
|
||||
|
||||
} // namespace Dynarmic::FP
|
||||
|
|
|
@ -79,11 +79,10 @@ FPT FPRSqrtEstimate(FPT op, FPCR fpcr, FPSR& fpsr) {
|
|||
return FPInfo<FPT>::Zero(false);
|
||||
}
|
||||
|
||||
const int highest_bit = Common::HighestSetBit(value.mantissa);
|
||||
const int result_exponent = (-(value.exponent + highest_bit + 1)) >> 1;
|
||||
const bool was_exponent_odd = (value.exponent + highest_bit) % 2 == 0;
|
||||
const int result_exponent = (-(value.exponent + 1)) >> 1;
|
||||
const bool was_exponent_odd = (value.exponent) % 2 == 0;
|
||||
|
||||
const u64 scaled = Safe::LogicalShiftRight(value.mantissa, highest_bit - (was_exponent_odd ? 7 : 8));
|
||||
const u64 scaled = Safe::LogicalShiftRight(value.mantissa, normalized_point_position - (was_exponent_odd ? 7 : 8));
|
||||
const u64 estimate = RecipSqrtEstimate(scaled);
|
||||
|
||||
const FPT bits_exponent = static_cast<FPT>(result_exponent + FPInfo<FPT>::exponent_bias);
|
||||
|
|
|
@ -41,7 +41,7 @@ FPT FPRSqrtStepFused(FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) {
|
|||
}
|
||||
|
||||
// result_value = (3.0 + (value1 * value2)) / 2.0
|
||||
FPUnpacked result_value = FusedMulAdd({false, 0, 3}, value1, value2);
|
||||
FPUnpacked result_value = FusedMulAdd(ToNormalized(false, 0, 3), value1, value2);
|
||||
result_value.exponent--;
|
||||
|
||||
if (result_value.mantissa == 0) {
|
||||
|
|
|
@ -38,14 +38,17 @@ u64 FPRoundInt(FPT op, FPCR fpcr, RoundingMode rounding, bool exact, FPSR& fpsr)
|
|||
return FPInfo<FPT>::Zero(sign);
|
||||
}
|
||||
|
||||
if (value.exponent >= 0) {
|
||||
// Reshift decimal point back to bit zero.
|
||||
const int exponent = value.exponent - normalized_point_position;
|
||||
|
||||
if (exponent >= 0) {
|
||||
// Guaranteed to be an integer
|
||||
return op;
|
||||
}
|
||||
|
||||
u64 int_result = sign ? Safe::Negate<u64>(value.mantissa) : static_cast<u64>(value.mantissa);
|
||||
const ResidualError error = ResidualErrorOnRightShift(int_result, -value.exponent);
|
||||
int_result = Safe::ArithmeticShiftLeft(int_result, value.exponent);
|
||||
const ResidualError error = ResidualErrorOnRightShift(int_result, -exponent);
|
||||
int_result = Safe::ArithmeticShiftLeft(int_result, exponent);
|
||||
|
||||
bool round_up = false;
|
||||
switch (rounding) {
|
||||
|
@ -77,7 +80,7 @@ u64 FPRoundInt(FPT op, FPCR fpcr, RoundingMode rounding, bool exact, FPSR& fpsr)
|
|||
|
||||
const FPT result = int_result == 0
|
||||
? FPInfo<FPT>::Zero(sign)
|
||||
: FPRound<FPT>(FPUnpacked{new_sign, 0, abs_int_result}, fpcr, RoundingMode::TowardsZero, fpsr);
|
||||
: FPRound<FPT>(FPUnpacked{new_sign, normalized_point_position, abs_int_result}, fpcr, RoundingMode::TowardsZero, fpsr);
|
||||
|
||||
if (error != ResidualError::Zero && exact) {
|
||||
FPProcessException(FPExc::Inexact, fpcr, fpsr);
|
||||
|
|
|
@ -40,12 +40,12 @@ u64 FPToFixed(size_t ibits, FPT op, size_t fbits, bool unsigned_, FPCR fpcr, Rou
|
|||
return 0;
|
||||
}
|
||||
|
||||
// value *= 2.0^fbits
|
||||
value.exponent += static_cast<int>(fbits);
|
||||
// value *= 2.0^fbits and reshift the decimal point back to bit zero.
|
||||
int exponent = value.exponent + static_cast<int>(fbits) - normalized_point_position;
|
||||
|
||||
u64 int_result = sign ? Safe::Negate<u64>(value.mantissa) : static_cast<u64>(value.mantissa);
|
||||
const ResidualError error = ResidualErrorOnRightShift(int_result, -value.exponent);
|
||||
int_result = Safe::ArithmeticShiftLeft(int_result, value.exponent);
|
||||
const ResidualError error = ResidualErrorOnRightShift(int_result, -exponent);
|
||||
int_result = Safe::ArithmeticShiftLeft(int_result, exponent);
|
||||
|
||||
bool round_up = false;
|
||||
switch (rounding) {
|
||||
|
@ -74,7 +74,7 @@ u64 FPToFixed(size_t ibits, FPT op, size_t fbits, bool unsigned_, FPCR fpcr, Rou
|
|||
|
||||
// Detect Overflow
|
||||
const int min_exponent_for_overflow = static_cast<int>(ibits) - static_cast<int>(Common::HighestSetBit(value.mantissa + (round_up ? 1 : 0))) - (unsigned_ ? 0 : 1);
|
||||
if (value.exponent >= min_exponent_for_overflow) {
|
||||
if (exponent >= min_exponent_for_overflow) {
|
||||
// Positive overflow
|
||||
if (unsigned_ || !sign) {
|
||||
FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
|
||||
|
@ -83,7 +83,7 @@ u64 FPToFixed(size_t ibits, FPT op, size_t fbits, bool unsigned_, FPCR fpcr, Rou
|
|||
|
||||
// Negative overflow
|
||||
const u64 min_value = Safe::Negate<u64>(static_cast<u64>(1) << (ibits - 1));
|
||||
if (!(value.exponent == min_exponent_for_overflow && int_result == min_value)) {
|
||||
if (!(exponent == min_exponent_for_overflow && int_result == min_value)) {
|
||||
FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
|
||||
return static_cast<u64>(1) << (ibits - 1);
|
||||
}
|
||||
|
|
|
@ -35,20 +35,20 @@ std::tuple<FPType, bool, FPUnpacked> FPUnpack(FPT op, FPCR fpcr, FPSR& fpsr) {
|
|||
return {FPType::Zero, sign, {sign, 0, 0}};
|
||||
}
|
||||
|
||||
return {FPType::Nonzero, sign, {sign, denormal_exponent, frac_raw}};
|
||||
return {FPType::Nonzero, sign, ToNormalized(sign, denormal_exponent, frac_raw)};
|
||||
}
|
||||
|
||||
if (exp_raw == Common::Ones<FPT>(FPInfo<FPT>::exponent_width)) {
|
||||
if (frac_raw == 0) {
|
||||
return {FPType::Infinity, sign, {sign, 1000000, 1}};
|
||||
return {FPType::Infinity, sign, ToNormalized(sign, 1000000, 1)};
|
||||
}
|
||||
|
||||
const bool is_quiet = Common::Bit<mantissa_high_bit>(frac_raw);
|
||||
return {is_quiet ? FPType::QNaN : FPType::SNaN, sign, {sign, 0, 0}};
|
||||
}
|
||||
|
||||
const int exp = static_cast<int>(exp_raw) - FPInfo<FPT>::exponent_bias - FPInfo<FPT>::explicit_mantissa_width;
|
||||
const u64 frac = frac_raw | FPInfo<FPT>::implicit_leading_bit;
|
||||
const int exp = static_cast<int>(exp_raw) - FPInfo<FPT>::exponent_bias;
|
||||
const u64 frac = static_cast<u64>(frac_raw | FPInfo<FPT>::implicit_leading_bit) << (normalized_point_position - FPInfo<FPT>::explicit_mantissa_width);
|
||||
return {FPType::Nonzero, sign, {sign, exp, frac}};
|
||||
}
|
||||
|
||||
|
@ -61,7 +61,7 @@ std::tuple<bool, int, u64, ResidualError> Normalize(FPUnpacked op, int extra_rig
|
|||
const int shift_amount = highest_set_bit - static_cast<int>(F) + extra_right_shift;
|
||||
const u64 mantissa = Safe::LogicalShiftRight(op.mantissa, shift_amount);
|
||||
const ResidualError error = ResidualErrorOnRightShift(op.mantissa, shift_amount);
|
||||
const int exponent = op.exponent + highest_set_bit;
|
||||
const int exponent = op.exponent + highest_set_bit - normalized_point_position;
|
||||
return std::make_tuple(op.sign, exponent, mantissa, error);
|
||||
}
|
||||
|
||||
|
|
|
@ -24,7 +24,10 @@ enum class FPType {
|
|||
SNaN,
|
||||
};
|
||||
|
||||
/// value = (sign ? -1 : +1) * mantissa * 2^exponent
|
||||
constexpr size_t normalized_point_position = 62;
|
||||
|
||||
/// value = (sign ? -1 : +1) * mantissa/(2^62) * 2^exponent
|
||||
/// Bit 62 of mantissa (the 63rd bit, counting from one) is always set (unless value is zero)
|
||||
struct FPUnpacked {
|
||||
bool sign;
|
||||
int exponent;
|
||||
|
@ -35,6 +38,19 @@ inline bool operator==(const FPUnpacked& a, const FPUnpacked& b) {
|
|||
return std::tie(a.sign, a.exponent, a.mantissa) == std::tie(b.sign, b.exponent, b.mantissa);
|
||||
}
|
||||
|
||||
/// return value = (sign ? -1 : +1) * value * 2^exponent
|
||||
constexpr FPUnpacked ToNormalized(bool sign, int exponent, u64 value) {
|
||||
if (value == 0) {
|
||||
return {sign, 0, 0};
|
||||
}
|
||||
|
||||
const int highest_bit = Common::HighestSetBit(value);
|
||||
const int offset = static_cast<int>(normalized_point_position) - highest_bit;
|
||||
value <<= offset;
|
||||
exponent -= offset - normalized_point_position;
|
||||
return {sign, exponent, value};
|
||||
}
|
||||
|
||||
template<typename FPT>
|
||||
std::tuple<FPType, bool, FPUnpacked> FPUnpack(FPT op, FPCR fpcr, FPSR& fpsr);
|
||||
|
||||
|
|
|
@ -20,15 +20,15 @@ using namespace Dynarmic::FP;
|
|||
|
||||
TEST_CASE("FPUnpack Tests", "[fp]") {
|
||||
const static std::vector<std::tuple<u32, std::tuple<FPType, bool, FPUnpacked>, u32>> test_cases {
|
||||
{0x00000000, {FPType::Zero, false, {false, 0, 0}}, 0},
|
||||
{0x7F800000, {FPType::Infinity, false, {false, 1000000, 1}}, 0},
|
||||
{0xFF800000, {FPType::Infinity, true, {true, 1000000, 1}}, 0},
|
||||
{0x7F800001, {FPType::SNaN, false, {false, 0, 0}}, 0},
|
||||
{0xFF800001, {FPType::SNaN, true, {true, 0, 0}}, 0},
|
||||
{0x7FC00001, {FPType::QNaN, false, {false, 0, 0}}, 0},
|
||||
{0xFFC00001, {FPType::QNaN, true, {true, 0, 0}}, 0},
|
||||
{0x00000001, {FPType::Nonzero, false, {false, -149, 1}}, 0}, // Smallest single precision denormal is 2^-149.
|
||||
{0x3F7FFFFF, {FPType::Nonzero, false, {false, -24, 0xFFFFFF}}, 0}, // 1.0 - epsilon
|
||||
{0x00000000, {FPType::Zero, false, ToNormalized(false, 0, 0)}, 0},
|
||||
{0x7F800000, {FPType::Infinity, false, ToNormalized(false, 1000000, 1)}, 0},
|
||||
{0xFF800000, {FPType::Infinity, true, ToNormalized(true, 1000000, 1)}, 0},
|
||||
{0x7F800001, {FPType::SNaN, false, ToNormalized(false, 0, 0)}, 0},
|
||||
{0xFF800001, {FPType::SNaN, true, ToNormalized(true, 0, 0)}, 0},
|
||||
{0x7FC00001, {FPType::QNaN, false, ToNormalized(false, 0, 0)}, 0},
|
||||
{0xFFC00001, {FPType::QNaN, true, ToNormalized(true, 0, 0)}, 0},
|
||||
{0x00000001, {FPType::Nonzero, false, ToNormalized(false, -149, 1)}, 0}, // Smallest single precision denormal is 2^-149.
|
||||
{0x3F7FFFFF, {FPType::Nonzero, false, ToNormalized(false, -24, 0xFFFFFF)}, 0}, // 1.0 - epsilon
|
||||
};
|
||||
|
||||
const FPCR fpcr;
|
||||
|
@ -37,6 +37,13 @@ TEST_CASE("FPUnpack Tests", "[fp]") {
|
|||
const auto output = FPUnpack<u32>(input, fpcr, fpsr);
|
||||
|
||||
INFO("Input: " << std::hex << input);
|
||||
INFO("Output Sign: " << std::get<2>(output).sign);
|
||||
INFO("Output Exponent: " << std::get<2>(output).exponent);
|
||||
INFO("Output Mantissa: " << std::hex << std::get<2>(output).mantissa);
|
||||
INFO("Expected Sign: " << std::get<2>(expected_output).sign);
|
||||
INFO("Expected Exponent: " << std::get<2>(expected_output).exponent);
|
||||
INFO("Expected Mantissa: " << std::hex << std::get<2>(expected_output).mantissa);
|
||||
|
||||
REQUIRE(output == expected_output);
|
||||
REQUIRE(fpsr.Value() == expected_fpsr);
|
||||
}
|
||||
|
@ -44,11 +51,11 @@ TEST_CASE("FPUnpack Tests", "[fp]") {
|
|||
|
||||
TEST_CASE("FPRound Tests", "[fp]") {
|
||||
const static std::vector<std::tuple<u32, std::tuple<FPType, bool, FPUnpacked>, u32>> test_cases {
|
||||
{0x7F800000, {FPType::Infinity, false, {false, 1000000, 1}}, 0x14},
|
||||
{0xFF800000, {FPType::Infinity, true, {true, 1000000, 1}}, 0x14},
|
||||
{0x00000001, {FPType::Nonzero, false, {false, -149, 1}}, 0}, // Smallest single precision denormal is 2^-149.
|
||||
{0x3F7FFFFF, {FPType::Nonzero, false, {false, -24, 0xFFFFFF}}, 0}, // 1.0 - epsilon
|
||||
{0x3F800000, {FPType::Nonzero, false, {false, -28, 0xFFFFFFF}}, 0x10}, // rounds to 1.0
|
||||
{0x7F800000, {FPType::Infinity, false, ToNormalized(false, 1000000, 1)}, 0x14},
|
||||
{0xFF800000, {FPType::Infinity, true, ToNormalized(true, 1000000, 1)}, 0x14},
|
||||
{0x00000001, {FPType::Nonzero, false, ToNormalized(false, -149, 1)}, 0}, // Smallest single precision denormal is 2^-149.
|
||||
{0x3F7FFFFF, {FPType::Nonzero, false, ToNormalized(false, -24, 0xFFFFFF)}, 0}, // 1.0 - epsilon
|
||||
{0x3F800000, {FPType::Nonzero, false, ToNormalized(false, -28, 0xFFFFFFF)}, 0x10}, // rounds to 1.0
|
||||
};
|
||||
|
||||
const FPCR fpcr;
|
||||
|
|
Loading…
Reference in a new issue