tint: Implement f16 value binary representation
This CL adds methods that return the binary16 bit pattern for a constructed Number<detail::NumberKindF16>. This is required for generating SPIR-V operands. Bug: tint:1473, tint:1502 Change-Id: Ia3680cdb5a0e64d31bfe2f48432cda3850c1f5a7 Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/95240 Reviewed-by: Dan Sinclair <dsinclair@chromium.org> Commit-Queue: Zhaoming Jiang <zhaoming.jiang@intel.com>
This commit is contained in:
parent
590040cfb4
commit
2c7440a13f
|
@ -204,4 +204,108 @@ f16::type f16::Quantize(f16::type value) {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint16_t f16::BitsRepresentation() const {
|
||||||
|
constexpr uint16_t f16_nan = 0x7e00u;
|
||||||
|
constexpr uint16_t f16_pos_inf = 0x7c00u;
|
||||||
|
constexpr uint16_t f16_neg_inf = 0xfc00u;
|
||||||
|
|
||||||
|
// Assert we use binary32 (i.e. float) as underlying type, which has 4 bytes.
|
||||||
|
static_assert(std::is_same<f16::type, float>());
|
||||||
|
|
||||||
|
// The stored value in f16 object must be already quantized, so it should be either NaN, +/-
|
||||||
|
// Inf, or exactly representable by normal or subnormal f16.
|
||||||
|
|
||||||
|
if (std::isnan(value)) {
|
||||||
|
return f16_nan;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (std::isinf(value)) {
|
||||||
|
return value > 0 ? f16_pos_inf : f16_neg_inf;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now quantized_value must be a finite f16 exactly-representable value.
|
||||||
|
// The following table shows exponent cases for all finite f16 exactly-representable value.
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// | Value category | Unbiased exp | F16 biased exp | F32 biased exp |
|
||||||
|
// |------------------|----------------|------------------|------------------|
|
||||||
|
// | +/- zero | \ | 0 | 0 |
|
||||||
|
// | Subnormal f16 | [-24, -15] | 0 | [103, 112] |
|
||||||
|
// | Normal f16 | [-14, 15] | [1, 30] | [113, 142] |
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
constexpr uint32_t max_f32_biased_exp_for_f16_normal_number = 142;
|
||||||
|
constexpr uint32_t min_f32_biased_exp_for_f16_normal_number = 113;
|
||||||
|
constexpr uint32_t max_f32_biased_exp_for_f16_subnormal_number = 112;
|
||||||
|
constexpr uint32_t min_f32_biased_exp_for_f16_subnormal_number = 103;
|
||||||
|
|
||||||
|
constexpr uint32_t f32_sign_mask = 0x80000000u;
|
||||||
|
constexpr uint32_t f32_exp_mask = 0x7f800000u;
|
||||||
|
constexpr uint32_t f32_mantissa_mask = 0x007fffffu;
|
||||||
|
constexpr uint32_t f32_mantissa_bis_number = 23;
|
||||||
|
constexpr uint32_t f32_exp_bias = 127;
|
||||||
|
|
||||||
|
constexpr uint16_t f16_sign_mask = 0x8000u;
|
||||||
|
constexpr uint16_t f16_exp_mask = 0x7c00u;
|
||||||
|
constexpr uint16_t f16_mantissa_mask = 0x03ffu;
|
||||||
|
constexpr uint32_t f16_mantissa_bis_number = 10;
|
||||||
|
constexpr uint32_t f16_exp_bias = 15;
|
||||||
|
|
||||||
|
uint32_t f32_bit_pattern;
|
||||||
|
memcpy(&f32_bit_pattern, &value, 4);
|
||||||
|
uint32_t f32_biased_exponent = (f32_bit_pattern & f32_exp_mask) >> f32_mantissa_bis_number;
|
||||||
|
uint32_t f32_mantissa = f32_bit_pattern & f32_mantissa_mask;
|
||||||
|
|
||||||
|
uint16_t f16_sign_part = static_cast<uint16_t>((f32_bit_pattern & f32_sign_mask) >> 16);
|
||||||
|
TINT_ASSERT(Semantic, (f16_sign_part & ~f16_sign_mask) == 0);
|
||||||
|
|
||||||
|
if ((f32_bit_pattern & ~f32_sign_mask) == 0) {
|
||||||
|
// +/- zero
|
||||||
|
return f16_sign_part;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((min_f32_biased_exp_for_f16_normal_number <= f32_biased_exponent) &&
|
||||||
|
(f32_biased_exponent <= max_f32_biased_exp_for_f16_normal_number)) {
|
||||||
|
// Normal f16
|
||||||
|
uint32_t f16_biased_exponent = f32_biased_exponent - f32_exp_bias + f16_exp_bias;
|
||||||
|
uint16_t f16_exp_part =
|
||||||
|
static_cast<uint16_t>(f16_biased_exponent << f16_mantissa_bis_number);
|
||||||
|
uint16_t f16_mantissa_part = static_cast<uint16_t>(
|
||||||
|
f32_mantissa >> (f32_mantissa_bis_number - f16_mantissa_bis_number));
|
||||||
|
|
||||||
|
TINT_ASSERT(Semantic, (f16_exp_part & ~f16_exp_mask) == 0);
|
||||||
|
TINT_ASSERT(Semantic, (f16_mantissa_part & ~f16_mantissa_mask) == 0);
|
||||||
|
|
||||||
|
return f16_sign_part | f16_exp_part | f16_mantissa_part;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((min_f32_biased_exp_for_f16_subnormal_number <= f32_biased_exponent) &&
|
||||||
|
(f32_biased_exponent <= max_f32_biased_exp_for_f16_subnormal_number)) {
|
||||||
|
// Subnormal f16
|
||||||
|
// The resulting exp bits are always 0, and the mantissa bits should be handled specially.
|
||||||
|
uint16_t f16_exp_part = 0;
|
||||||
|
// The resulting subnormal f16 will have only 1 valid mantissa bit if the unbiased exponent
|
||||||
|
// of value is of the minimum, i.e. -24; and have all 10 mantissa bits valid if the unbiased
|
||||||
|
// exponent of value is of the maximum, i.e. -15.
|
||||||
|
uint32_t f16_valid_mantissa_bits =
|
||||||
|
f32_biased_exponent - min_f32_biased_exp_for_f16_subnormal_number + 1;
|
||||||
|
// The resulting f16 mantissa part comes from right-shifting the f32 mantissa bits with
|
||||||
|
// leading 1 added.
|
||||||
|
uint16_t f16_mantissa_part =
|
||||||
|
static_cast<uint16_t>((f32_mantissa | (f32_mantissa_mask + 1)) >>
|
||||||
|
(f32_mantissa_bis_number + 1 - f16_valid_mantissa_bits));
|
||||||
|
|
||||||
|
TINT_ASSERT(Semantic, (1 <= f16_valid_mantissa_bits) &&
|
||||||
|
(f16_valid_mantissa_bits <= f16_mantissa_bis_number));
|
||||||
|
TINT_ASSERT(Semantic, (f16_mantissa_part & ~((1u << f16_valid_mantissa_bits) - 1)) == 0);
|
||||||
|
TINT_ASSERT(Semantic, (f16_mantissa_part != 0));
|
||||||
|
|
||||||
|
return f16_sign_part | f16_exp_part | f16_mantissa_part;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Neither zero, subnormal f16 or normal f16, shall never hit.
|
||||||
|
tint::diag::List diag;
|
||||||
|
TINT_UNREACHABLE(Semantic, diag);
|
||||||
|
return f16_nan;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace tint
|
} // namespace tint
|
||||||
|
|
|
@ -186,6 +186,13 @@ struct Number<detail::NumberKindF16> {
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Get the binary16 bit pattern in type uint16_t of this value.
|
||||||
|
/// @returns the binary16 bit pattern, in type uint16_t, of the stored quantized f16 value. If
|
||||||
|
/// the value is NaN, the returned value will be 0x7e00u. If the value is positive infinity, the
|
||||||
|
/// returned value will be 0x7c00u. If the value is negative infinity, the returned value
|
||||||
|
/// will be 0xfc00u.
|
||||||
|
uint16_t BitsRepresentation() const;
|
||||||
|
|
||||||
/// @param value the input float32 value
|
/// @param value the input float32 value
|
||||||
/// @returns the float32 value quantized to the smaller float16 value, through truncation of the
|
/// @returns the float32 value quantized to the smaller float16 value, through truncation of the
|
||||||
/// mantissa bits (no rounding). If the float32 value is too large (positive or negative) to be
|
/// mantissa bits (no rounding). If the float32 value is too large (positive or negative) to be
|
||||||
|
|
|
@ -217,83 +217,164 @@ TEST(NumberTest, CheckedConvertSubnormals) {
|
||||||
EXPECT_EQ(CheckedConvert<f16>(AFloat(-kHighestF16Subnormal)), f16(-kHighestF16Subnormal));
|
EXPECT_EQ(CheckedConvert<f16>(AFloat(-kHighestF16Subnormal)), f16(-kHighestF16Subnormal));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(NumberTest, QuantizeF16) {
|
// Test cases for f16 subnormal quantization and BitsRepresentation.
|
||||||
constexpr float nan = std::numeric_limits<float>::quiet_NaN();
|
// The ULP is based on float rather than double or f16, since F16::Quantize and
|
||||||
constexpr float inf = std::numeric_limits<float>::infinity();
|
// F16::BitsRepresentation take float as input.
|
||||||
|
constexpr float lowestPositiveNormalF16 = 0x1p-14;
|
||||||
|
constexpr float lowestPositiveNormalF16PlusULP = 0x1.000002p-14;
|
||||||
|
constexpr float lowestPositiveNormalF16MinusULP = 0x1.fffffep-15;
|
||||||
|
constexpr float highestPositiveSubnormalF16 = 0x0.ffcp-14;
|
||||||
|
constexpr float highestPositiveSubnormalF16PlusULP = 0x1.ff8002p-15;
|
||||||
|
constexpr float highestPositiveSubnormalF16MinusULP = 0x1.ff7ffep-15;
|
||||||
|
constexpr float lowestPositiveSubnormalF16 = 0x1.p-24;
|
||||||
|
constexpr float lowestPositiveSubnormalF16PlusULP = 0x1.000002p-24;
|
||||||
|
constexpr float lowestPositiveSubnormalF16MinusULP = 0x1.fffffep-25;
|
||||||
|
|
||||||
EXPECT_EQ(f16(0.0), 0.0f);
|
constexpr uint16_t lowestPositiveNormalF16Bits = 0x0400u;
|
||||||
EXPECT_EQ(f16(1.0), 1.0f);
|
constexpr uint16_t highestPositiveSubnormalF16Bits = 0x03ffu;
|
||||||
EXPECT_EQ(f16(0.00006106496), 0.000061035156f);
|
constexpr uint16_t lowestPositiveSubnormalF16Bits = 0x0001u;
|
||||||
EXPECT_EQ(f16(1.0004883), 1.0f);
|
|
||||||
EXPECT_EQ(f16(-8196), -8192.f);
|
|
||||||
EXPECT_EQ(f16(65504.003), inf);
|
|
||||||
EXPECT_EQ(f16(-65504.003), -inf);
|
|
||||||
EXPECT_EQ(f16(inf), inf);
|
|
||||||
EXPECT_EQ(f16(-inf), -inf);
|
|
||||||
EXPECT_TRUE(std::isnan(f16(nan)));
|
|
||||||
|
|
||||||
// Test for subnormal quantization.
|
constexpr float highestNegativeNormalF16 = -lowestPositiveNormalF16;
|
||||||
// The ULP is based on float rather than double or f16, since F16::Quantize take float as input.
|
constexpr float highestNegativeNormalF16PlusULP = -lowestPositiveNormalF16MinusULP;
|
||||||
constexpr float lowestPositiveNormalF16 = 0x1p-14;
|
constexpr float highestNegativeNormalF16MinusULP = -lowestPositiveNormalF16PlusULP;
|
||||||
constexpr float lowestPositiveNormalF16PlusULP = 0x1.000002p-14;
|
constexpr float lowestNegativeSubnormalF16 = -highestPositiveSubnormalF16;
|
||||||
constexpr float lowestPositiveNormalF16MinusULP = 0x1.fffffep-15;
|
constexpr float lowestNegativeSubnormalF16PlusULP = -highestPositiveSubnormalF16MinusULP;
|
||||||
constexpr float highestPositiveSubnormalF16 = 0x0.ffcp-14;
|
constexpr float lowestNegativeSubnormalF16MinusULP = -highestPositiveSubnormalF16PlusULP;
|
||||||
constexpr float highestPositiveSubnormalF16PlusULP = 0x1.ff8002p-15;
|
constexpr float highestNegativeSubnormalF16 = -lowestPositiveSubnormalF16;
|
||||||
constexpr float highestPositiveSubnormalF16MinusULP = 0x1.ff7ffep-15;
|
constexpr float highestNegativeSubnormalF16PlusULP = -lowestPositiveSubnormalF16MinusULP;
|
||||||
constexpr float lowestPositiveSubnormalF16 = 0x1.p-24;
|
constexpr float highestNegativeSubnormalF16MinusULP = -lowestPositiveSubnormalF16PlusULP;
|
||||||
constexpr float lowestPositiveSubnormalF16PlusULP = 0x1.000002p-24;
|
|
||||||
constexpr float lowestPositiveSubnormalF16MinusULP = 0x1.fffffep-25;
|
|
||||||
|
|
||||||
constexpr float highestNegativeNormalF16 = -lowestPositiveNormalF16;
|
constexpr uint16_t highestNegativeNormalF16Bits = 0x8400u;
|
||||||
constexpr float highestNegativeNormalF16PlusULP = -lowestPositiveNormalF16MinusULP;
|
constexpr uint16_t lowestNegativeSubnormalF16Bits = 0x83ffu;
|
||||||
constexpr float highestNegativeNormalF16MinusULP = -lowestPositiveNormalF16PlusULP;
|
constexpr uint16_t highestNegativeSubnormalF16Bits = 0x8001u;
|
||||||
constexpr float lowestNegativeSubnormalF16 = -highestPositiveSubnormalF16;
|
|
||||||
constexpr float lowestNegativeSubnormalF16PlusULP = -highestPositiveSubnormalF16MinusULP;
|
|
||||||
constexpr float lowestNegativeSubnormalF16MinusULP = -highestPositiveSubnormalF16PlusULP;
|
|
||||||
constexpr float highestNegativeSubnormalF16 = -lowestPositiveSubnormalF16;
|
|
||||||
constexpr float highestNegativeSubnormalF16PlusULP = -lowestPositiveSubnormalF16MinusULP;
|
|
||||||
constexpr float highestNegativeSubnormalF16MinusULP = -lowestPositiveSubnormalF16PlusULP;
|
|
||||||
|
|
||||||
// Value larger than or equal to lowest positive normal f16 will be quantized to normal f16.
|
constexpr float f32_nan = std::numeric_limits<float>::quiet_NaN();
|
||||||
EXPECT_EQ(f16(lowestPositiveNormalF16PlusULP), lowestPositiveNormalF16);
|
constexpr float f32_inf = std::numeric_limits<float>::infinity();
|
||||||
EXPECT_EQ(f16(lowestPositiveNormalF16), lowestPositiveNormalF16);
|
|
||||||
// Positive value smaller than lowest positive normal f16 but not smaller than lowest positive
|
|
||||||
// subnormal f16 will be quantized to subnormal f16 or zero.
|
|
||||||
EXPECT_EQ(f16(lowestPositiveNormalF16MinusULP), highestPositiveSubnormalF16);
|
|
||||||
EXPECT_EQ(f16(highestPositiveSubnormalF16PlusULP), highestPositiveSubnormalF16);
|
|
||||||
EXPECT_EQ(f16(highestPositiveSubnormalF16), highestPositiveSubnormalF16);
|
|
||||||
EXPECT_EQ(f16(highestPositiveSubnormalF16MinusULP), 0x0.ff8p-14);
|
|
||||||
EXPECT_EQ(f16(lowestPositiveSubnormalF16PlusULP), lowestPositiveSubnormalF16);
|
|
||||||
EXPECT_EQ(f16(lowestPositiveSubnormalF16), lowestPositiveSubnormalF16);
|
|
||||||
// Positive value smaller than lowest positive subnormal f16 will be quantized to zero.
|
|
||||||
EXPECT_EQ(f16(lowestPositiveSubnormalF16MinusULP), 0.0);
|
|
||||||
// Test the mantissa discarding, the least significant mantissa bit is 0x1p-24 = 0x0.004p-14.
|
|
||||||
EXPECT_EQ(f16(0x0.064p-14), 0x0.064p-14);
|
|
||||||
EXPECT_EQ(f16(0x0.067fecp-14), 0x0.064p-14);
|
|
||||||
EXPECT_EQ(f16(0x0.063ffep-14), 0x0.060p-14);
|
|
||||||
EXPECT_EQ(f16(0x0.008p-14), 0x0.008p-14);
|
|
||||||
EXPECT_EQ(f16(0x0.00bffep-14), 0x0.008p-14);
|
|
||||||
EXPECT_EQ(f16(0x0.007ffep-14), 0x0.004p-14);
|
|
||||||
|
|
||||||
// Vice versa for negative cases.
|
struct F16TestCase {
|
||||||
EXPECT_EQ(f16(highestNegativeNormalF16MinusULP), highestNegativeNormalF16);
|
float input_value;
|
||||||
EXPECT_EQ(f16(highestNegativeNormalF16), highestNegativeNormalF16);
|
float quantized_value;
|
||||||
EXPECT_EQ(f16(highestNegativeNormalF16PlusULP), lowestNegativeSubnormalF16);
|
uint16_t f16_bit_pattern;
|
||||||
EXPECT_EQ(f16(lowestNegativeSubnormalF16MinusULP), lowestNegativeSubnormalF16);
|
};
|
||||||
EXPECT_EQ(f16(lowestNegativeSubnormalF16), lowestNegativeSubnormalF16);
|
|
||||||
EXPECT_EQ(f16(lowestNegativeSubnormalF16PlusULP), -0x0.ff8p-14);
|
using NumberF16Test = testing::TestWithParam<F16TestCase>;
|
||||||
EXPECT_EQ(f16(highestNegativeSubnormalF16MinusULP), highestNegativeSubnormalF16);
|
|
||||||
EXPECT_EQ(f16(highestNegativeSubnormalF16), highestNegativeSubnormalF16);
|
TEST_P(NumberF16Test, QuantizeF16) {
|
||||||
EXPECT_EQ(f16(highestNegativeSubnormalF16PlusULP), -0.0);
|
float input_value = GetParam().input_value;
|
||||||
// Test the mantissa discarding.
|
float quantized_value = GetParam().quantized_value;
|
||||||
EXPECT_EQ(f16(-0x0.064p-14), -0x0.064p-14);
|
|
||||||
EXPECT_EQ(f16(-0x0.067fecp-14), -0x0.064p-14);
|
std::stringstream ss;
|
||||||
EXPECT_EQ(f16(-0x0.063ffep-14), -0x0.060p-14);
|
ss << "input value = " << input_value << ", expected quantized value = " << quantized_value;
|
||||||
EXPECT_EQ(f16(-0x0.008p-14), -0x0.008p-14);
|
SCOPED_TRACE(ss.str());
|
||||||
EXPECT_EQ(f16(-0x0.00bffep-14), -0x0.008p-14);
|
|
||||||
EXPECT_EQ(f16(-0x0.007ffep-14), -0x0.004p-14);
|
if (std::isnan(quantized_value)) {
|
||||||
|
EXPECT_TRUE(std::isnan(f16(input_value)));
|
||||||
|
} else {
|
||||||
|
EXPECT_EQ(f16(input_value), quantized_value);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checks that constructing an f16 from each case's input yields the expected binary16 bit pattern.
TEST_P(NumberF16Test, BitsRepresentation) {
    const F16TestCase& test_case = GetParam();
    const float input_value = test_case.input_value;
    const uint16_t representation = test_case.f16_bit_pattern;

    // Describe the case being checked so that a failure identifies it; the expected bits are
    // streamed in hexadecimal with a base prefix.
    std::stringstream ss;
    ss << "input value = " << input_value;
    ss << ", expected binary16 bits representation = ";
    ss << std::hex << std::showbase << representation;
    SCOPED_TRACE(ss.str());

    EXPECT_EQ(f16(input_value).BitsRepresentation(), representation);
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_SUITE_P(
|
||||||
|
NumberF16Test,
|
||||||
|
NumberF16Test,
|
||||||
|
testing::ValuesIn(std::vector<F16TestCase>{
|
||||||
|
// NaN, Inf
|
||||||
|
{f32_inf, f32_inf, 0x7c00u},
|
||||||
|
{-f32_inf, -f32_inf, 0xfc00u},
|
||||||
|
{f32_nan, f32_nan, 0x7e00u},
|
||||||
|
{-f32_nan, -f32_nan, 0x7e00u},
|
||||||
|
// +/- zero
|
||||||
|
{+0.0f, 0.0f, 0x0000u},
|
||||||
|
{-0.0f, -0.0f, 0x8000u},
|
||||||
|
// Value in normal f16 range
|
||||||
|
{1.0f, 1.0f, 0x3c00u},
|
||||||
|
{-1.0f, -1.0f, 0xbc00u},
|
||||||
|
// 0.00006106496 quantized to 0.000061035156 = 0x1p-14
|
||||||
|
{0.00006106496f, 0.000061035156f, 0x0400u},
|
||||||
|
{-0.00006106496f, -0.000061035156f, 0x8400u},
|
||||||
|
// 1.0004883 quantized to 1.0 = 0x1p0
|
||||||
|
{1.0004883f, 1.0f, 0x3c00u},
|
||||||
|
{-1.0004883f, -1.0f, 0xbc00u},
|
||||||
|
// 8196.0 quantized to 8192.0 = 0x1p13
|
||||||
|
{8196.0f, 8192.f, 0x7000u},
|
||||||
|
{-8196.0f, -8192.f, 0xf000u},
|
||||||
|
// Value in subnormal f16 range
|
||||||
|
{0x0.034p-14f, 0x0.034p-14f, 0x000du},
|
||||||
|
{-0x0.034p-14f, -0x0.034p-14f, 0x800du},
|
||||||
|
{0x0.068p-14f, 0x0.068p-14f, 0x001au},
|
||||||
|
{-0x0.068p-14f, -0x0.068p-14f, 0x801au},
|
||||||
|
// 0x0.06b7p-14 quantized to 0x0.068p-14
|
||||||
|
{0x0.06b7p-14f, 0x0.068p-14f, 0x001au},
|
||||||
|
{-0x0.06b7p-14f, -0x0.068p-14, 0x801au},
|
||||||
|
// Value out of f16 range
|
||||||
|
{65504.003f, f32_inf, 0x7c00u},
|
||||||
|
{-65504.003f, -f32_inf, 0xfc00u},
|
||||||
|
{0x1.234p56f, f32_inf, 0x7c00u},
|
||||||
|
{-0x4.321p65f, -f32_inf, 0xfc00u},
|
||||||
|
|
||||||
|
// Test for subnormal quantization.
|
||||||
|
// Value larger than or equal to lowest positive normal f16 will be quantized to normal f16.
|
||||||
|
{lowestPositiveNormalF16PlusULP, lowestPositiveNormalF16, lowestPositiveNormalF16Bits},
|
||||||
|
{lowestPositiveNormalF16, lowestPositiveNormalF16, lowestPositiveNormalF16Bits},
|
||||||
|
// Positive value smaller than lowest positive normal f16 but not smaller than lowest
|
||||||
|
// positive
|
||||||
|
// subnormal f16 will be quantized to subnormal f16 or zero.
|
||||||
|
{lowestPositiveNormalF16MinusULP, highestPositiveSubnormalF16,
|
||||||
|
highestPositiveSubnormalF16Bits},
|
||||||
|
{highestPositiveSubnormalF16PlusULP, highestPositiveSubnormalF16,
|
||||||
|
highestPositiveSubnormalF16Bits},
|
||||||
|
{highestPositiveSubnormalF16, highestPositiveSubnormalF16, highestPositiveSubnormalF16Bits},
|
||||||
|
{highestPositiveSubnormalF16MinusULP, 0x0.ff8p-14, 0x03feu},
|
||||||
|
{lowestPositiveSubnormalF16PlusULP, lowestPositiveSubnormalF16,
|
||||||
|
lowestPositiveSubnormalF16Bits},
|
||||||
|
{lowestPositiveSubnormalF16, lowestPositiveSubnormalF16, lowestPositiveSubnormalF16Bits},
|
||||||
|
// Positive value smaller than lowest positive subnormal f16 will be quantized to zero.
|
||||||
|
{lowestPositiveSubnormalF16MinusULP, 0.0, 0x0000u},
|
||||||
|
// Test the mantissa discarding, the least significant mantissa bit is 0x1p-24 =
|
||||||
|
// 0x0.004p-14.
|
||||||
|
{0x0.064p-14f, 0x0.064p-14, 0x0019u},
|
||||||
|
{0x0.067fecp-14f, 0x0.064p-14, 0x0019u},
|
||||||
|
{0x0.063ffep-14f, 0x0.060p-14, 0x0018u},
|
||||||
|
{0x0.008p-14f, 0x0.008p-14, 0x0002u},
|
||||||
|
{0x0.00bffep-14f, 0x0.008p-14, 0x0002u},
|
||||||
|
{0x0.007ffep-14f, 0x0.004p-14, 0x0001u},
|
||||||
|
|
||||||
|
// Vice versa for negative cases.
|
||||||
|
{highestNegativeNormalF16MinusULP, highestNegativeNormalF16, highestNegativeNormalF16Bits},
|
||||||
|
{highestNegativeNormalF16, highestNegativeNormalF16, highestNegativeNormalF16Bits},
|
||||||
|
{highestNegativeNormalF16PlusULP, lowestNegativeSubnormalF16,
|
||||||
|
lowestNegativeSubnormalF16Bits},
|
||||||
|
{lowestNegativeSubnormalF16MinusULP, lowestNegativeSubnormalF16,
|
||||||
|
lowestNegativeSubnormalF16Bits},
|
||||||
|
{lowestNegativeSubnormalF16, lowestNegativeSubnormalF16, lowestNegativeSubnormalF16Bits},
|
||||||
|
{lowestNegativeSubnormalF16PlusULP, -0x0.ff8p-14, 0x83feu},
|
||||||
|
{highestNegativeSubnormalF16MinusULP, highestNegativeSubnormalF16,
|
||||||
|
highestNegativeSubnormalF16Bits},
|
||||||
|
{highestNegativeSubnormalF16, highestNegativeSubnormalF16, highestNegativeSubnormalF16Bits},
|
||||||
|
{highestNegativeSubnormalF16PlusULP, -0.0, 0x8000u},
|
||||||
|
// Test the mantissa discarding.
|
||||||
|
{-0x0.064p-14f, -0x0.064p-14, 0x8019u},
|
||||||
|
{-0x0.067fecp-14f, -0x0.064p-14, 0x8019u},
|
||||||
|
{-0x0.063ffep-14f, -0x0.060p-14, 0x8018u},
|
||||||
|
{-0x0.008p-14f, -0x0.008p-14, 0x8002u},
|
||||||
|
{-0x0.00bffep-14f, -0x0.008p-14, 0x8002u},
|
||||||
|
{-0x0.007ffep-14f, -0x0.004p-14, 0x8001u},
|
||||||
|
/////////////////////////////////////
|
||||||
|
}));
|
||||||
|
|
||||||
using BinaryCheckedCase = std::tuple<std::optional<AInt>, AInt, AInt>;
|
using BinaryCheckedCase = std::tuple<std::optional<AInt>, AInt, AInt>;
|
||||||
|
|
||||||
#undef OVERFLOW // corecrt_math.h :(
|
#undef OVERFLOW // corecrt_math.h :(
|
||||||
|
|
Loading…
Reference in New Issue