tint: Implement f16 value binary representation

This CL adds a method that returns the binary16 bit pattern for a
constructed Number<detail::NumberKindF16>. This is required for
generating SPIR-V operands.

Bug: tint:1473, tint:1502
Change-Id: Ia3680cdb5a0e64d31bfe2f48432cda3850c1f5a7
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/95240
Reviewed-by: Dan Sinclair <dsinclair@chromium.org>
Commit-Queue: Zhaoming Jiang <zhaoming.jiang@intel.com>
This commit is contained in:
Zhaoming Jiang 2022-07-07 03:29:11 +00:00 committed by Dawn LUCI CQ
parent 590040cfb4
commit 2c7440a13f
3 changed files with 262 additions and 70 deletions

View File

@ -204,4 +204,108 @@ f16::type f16::Quantize(f16::type value) {
return value;
}
uint16_t f16::BitsRepresentation() const {
    // The bit-level decoding below only makes sense when the stored value is a binary32 float.
    static_assert(std::is_same<f16::type, float>());

    // Fixed binary16 encodings returned for the non-finite cases. The NaN pattern is also the
    // value returned by the (unreachable) fallback at the end of the function.
    constexpr uint16_t nan_pattern = 0x7e00u;
    constexpr uint16_t pos_inf_pattern = 0x7c00u;
    constexpr uint16_t neg_inf_pattern = 0xfc00u;

    // The value held by an f16 is expected to be quantized already, i.e. it is NaN, +/- Inf, or a
    // float that is exactly representable as a normal or subnormal f16.
    if (std::isnan(value)) {
        // Every NaN maps to the same pattern; sign and payload are not carried over.
        return nan_pattern;
    }
    if (std::isinf(value)) {
        if (value > 0) {
            return pos_inf_pattern;
        }
        return neg_inf_pattern;
    }

    // Field layout of binary32.
    constexpr uint32_t f32_sign_mask = 0x80000000u;
    constexpr uint32_t f32_exp_mask = 0x7f800000u;
    constexpr uint32_t f32_mantissa_mask = 0x007fffffu;
    constexpr uint32_t f32_mantissa_bis_number = 23;
    constexpr uint32_t f32_exp_bias = 127;

    // Field layout of binary16.
    constexpr uint16_t f16_sign_mask = 0x8000u;
    constexpr uint16_t f16_exp_mask = 0x7c00u;
    constexpr uint16_t f16_mantissa_mask = 0x03ffu;
    constexpr uint32_t f16_mantissa_bis_number = 10;
    constexpr uint32_t f16_exp_bias = 15;

    // Biased binary32 exponents that a finite, non-zero, f16-representable value can have:
    //   unbiased [-24, -15] -> f32 biased [103, 112] : subnormal f16 (f16 exponent field is 0)
    //   unbiased [-14,  15] -> f32 biased [113, 142] : normal f16 (f16 exponent field in [1, 30])
    constexpr uint32_t subnormal_exp_first = 103;
    constexpr uint32_t subnormal_exp_last = 112;
    constexpr uint32_t normal_exp_first = 113;
    constexpr uint32_t normal_exp_last = 142;

    // Reinterpret the float as its raw 32-bit pattern and pull the fields apart.
    uint32_t raw_bits;
    memcpy(&raw_bits, &value, 4);
    const uint32_t biased_exp = (raw_bits & f32_exp_mask) >> f32_mantissa_bis_number;
    const uint32_t fraction = raw_bits & f32_mantissa_mask;

    // The sign bit moves from bit 31 to bit 15.
    uint16_t f16_sign_part = static_cast<uint16_t>((raw_bits & f32_sign_mask) >> 16);
    TINT_ASSERT(Semantic, (f16_sign_part & ~f16_sign_mask) == 0);

    // +/- zero: everything except the sign bit is clear.
    if ((raw_bits & ~f32_sign_mask) == 0) {
        return f16_sign_part;
    }

    // Normal f16: re-bias the exponent and keep the top 10 bits of the binary32 mantissa.
    if ((biased_exp >= normal_exp_first) && (biased_exp <= normal_exp_last)) {
        const uint32_t rebiased_exp = biased_exp - f32_exp_bias + f16_exp_bias;
        const uint32_t dropped_bits = f32_mantissa_bis_number - f16_mantissa_bis_number;
        uint16_t f16_exp_part = static_cast<uint16_t>(rebiased_exp << f16_mantissa_bis_number);
        uint16_t f16_mantissa_part = static_cast<uint16_t>(fraction >> dropped_bits);
        TINT_ASSERT(Semantic, (f16_exp_part & ~f16_exp_mask) == 0);
        TINT_ASSERT(Semantic, (f16_mantissa_part & ~f16_mantissa_mask) == 0);
        return f16_sign_part | f16_exp_part | f16_mantissa_part;
    }

    // Subnormal f16: the exponent field is zero and the mantissa holds the binary32 significand
    // (with its implicit leading 1 made explicit) shifted right into place.
    if ((biased_exp >= subnormal_exp_first) && (biased_exp <= subnormal_exp_last)) {
        uint16_t f16_exp_part = 0;
        // Number of meaningful mantissa bits in the result: 1 for the smallest exponent (-24),
        // up to 10 for the largest subnormal exponent (-15).
        uint32_t f16_valid_mantissa_bits = biased_exp - subnormal_exp_first + 1;
        // f32_mantissa_mask + 1 is bit 23, i.e. the implicit leading 1 of a normal binary32.
        const uint32_t significand = fraction | (f32_mantissa_mask + 1);
        const uint32_t shift = f32_mantissa_bis_number + 1 - f16_valid_mantissa_bits;
        uint16_t f16_mantissa_part = static_cast<uint16_t>(significand >> shift);
        TINT_ASSERT(Semantic, (1 <= f16_valid_mantissa_bits) &&
                                  (f16_valid_mantissa_bits <= f16_mantissa_bis_number));
        TINT_ASSERT(Semantic, (f16_mantissa_part & ~((1u << f16_valid_mantissa_bits) - 1)) == 0);
        TINT_ASSERT(Semantic, (f16_mantissa_part != 0));
        return f16_sign_part | f16_exp_part | f16_mantissa_part;
    }

    // Not zero, not a normal f16 and not a subnormal f16: a quantized value should never get here.
    tint::diag::List diag;
    TINT_UNREACHABLE(Semantic, diag);
    return nan_pattern;
}
} // namespace tint

View File

@ -186,6 +186,13 @@ struct Number<detail::NumberKindF16> {
return *this;
}
/// Get the binary16 bit pattern, as a uint16_t, of this value.
/// @returns the binary16 bit pattern, in type uint16_t, of the stored quantized f16 value. If
/// the value is NaN, the returned value will be 0x7e00u regardless of the sign of the NaN. If
/// the value is positive infinity, the returned value will be 0x7c00u. If the value is negative
/// infinity, the returned value will be 0xfc00u.
uint16_t BitsRepresentation() const;
/// @param value the input float32 value
/// @returns the float32 value quantized to the smaller float16 value, through truncation of the
/// mantissa bits (no rounding). If the float32 value is too large (positive or negative) to be

View File

@ -217,83 +217,164 @@ TEST(NumberTest, CheckedConvertSubnormals) {
EXPECT_EQ(CheckedConvert<f16>(AFloat(-kHighestF16Subnormal)), f16(-kHighestF16Subnormal));
}
TEST(NumberTest, QuantizeF16) {
constexpr float nan = std::numeric_limits<float>::quiet_NaN();
constexpr float inf = std::numeric_limits<float>::infinity();
// Boundary values used by the f16 subnormal quantization and BitsRepresentation test cases.
// "ULP" here is one unit in the last place of a float (binary32), not of a double or an f16,
// because the test inputs are floats that are handed to f16 for quantization.
// Smallest positive normal f16 (2^-14) and its immediate float neighbours.
constexpr float lowestPositiveNormalF16 = 0x1p-14;
constexpr float lowestPositiveNormalF16PlusULP = 0x1.000002p-14;
constexpr float lowestPositiveNormalF16MinusULP = 0x1.fffffep-15;
// Largest positive subnormal f16 (1023 * 2^-24) and its immediate float neighbours.
constexpr float highestPositiveSubnormalF16 = 0x0.ffcp-14;
constexpr float highestPositiveSubnormalF16PlusULP = 0x1.ff8002p-15;
constexpr float highestPositiveSubnormalF16MinusULP = 0x1.ff7ffep-15;
// Smallest positive subnormal f16 (2^-24) and its immediate float neighbours.
constexpr float lowestPositiveSubnormalF16 = 0x1.p-24;
constexpr float lowestPositiveSubnormalF16PlusULP = 0x1.000002p-24;
constexpr float lowestPositiveSubnormalF16MinusULP = 0x1.fffffep-25;
EXPECT_EQ(f16(0.0), 0.0f);
EXPECT_EQ(f16(1.0), 1.0f);
EXPECT_EQ(f16(0.00006106496), 0.000061035156f);
EXPECT_EQ(f16(1.0004883), 1.0f);
EXPECT_EQ(f16(-8196), -8192.f);
EXPECT_EQ(f16(65504.003), inf);
EXPECT_EQ(f16(-65504.003), -inf);
EXPECT_EQ(f16(inf), inf);
EXPECT_EQ(f16(-inf), -inf);
EXPECT_TRUE(std::isnan(f16(nan)));
// Expected binary16 bit patterns of the positive boundary values: exponent field 1 with a zero
// mantissa, exponent field 0 with an all-ones mantissa, and exponent field 0 with mantissa 1.
constexpr uint16_t lowestPositiveNormalF16Bits = 0x0400u;
constexpr uint16_t highestPositiveSubnormalF16Bits = 0x03ffu;
constexpr uint16_t lowestPositiveSubnormalF16Bits = 0x0001u;
// Test for subnormal quantization.
// The ULP is based on float rather than double or f16, since F16::Quantize takes a float as input.
constexpr float lowestPositiveNormalF16 = 0x1p-14;
constexpr float lowestPositiveNormalF16PlusULP = 0x1.000002p-14;
constexpr float lowestPositiveNormalF16MinusULP = 0x1.fffffep-15;
constexpr float highestPositiveSubnormalF16 = 0x0.ffcp-14;
constexpr float highestPositiveSubnormalF16PlusULP = 0x1.ff8002p-15;
constexpr float highestPositiveSubnormalF16MinusULP = 0x1.ff7ffep-15;
constexpr float lowestPositiveSubnormalF16 = 0x1.p-24;
constexpr float lowestPositiveSubnormalF16PlusULP = 0x1.000002p-24;
constexpr float lowestPositiveSubnormalF16MinusULP = 0x1.fffffep-25;
// Negative counterparts of the positive boundary values. Negation reverses the ordering, so the
// "PlusULP" neighbour of a negative value is the negated "MinusULP" neighbour of the positive
// value, and vice versa.
constexpr float highestNegativeNormalF16 = -lowestPositiveNormalF16;
constexpr float highestNegativeNormalF16PlusULP = -lowestPositiveNormalF16MinusULP;
constexpr float highestNegativeNormalF16MinusULP = -lowestPositiveNormalF16PlusULP;
constexpr float lowestNegativeSubnormalF16 = -highestPositiveSubnormalF16;
constexpr float lowestNegativeSubnormalF16PlusULP = -highestPositiveSubnormalF16MinusULP;
constexpr float lowestNegativeSubnormalF16MinusULP = -highestPositiveSubnormalF16PlusULP;
constexpr float highestNegativeSubnormalF16 = -lowestPositiveSubnormalF16;
constexpr float highestNegativeSubnormalF16PlusULP = -lowestPositiveSubnormalF16MinusULP;
constexpr float highestNegativeSubnormalF16MinusULP = -lowestPositiveSubnormalF16PlusULP;
constexpr float highestNegativeNormalF16 = -lowestPositiveNormalF16;
constexpr float highestNegativeNormalF16PlusULP = -lowestPositiveNormalF16MinusULP;
constexpr float highestNegativeNormalF16MinusULP = -lowestPositiveNormalF16PlusULP;
constexpr float lowestNegativeSubnormalF16 = -highestPositiveSubnormalF16;
constexpr float lowestNegativeSubnormalF16PlusULP = -highestPositiveSubnormalF16MinusULP;
constexpr float lowestNegativeSubnormalF16MinusULP = -highestPositiveSubnormalF16PlusULP;
constexpr float highestNegativeSubnormalF16 = -lowestPositiveSubnormalF16;
constexpr float highestNegativeSubnormalF16PlusULP = -lowestPositiveSubnormalF16MinusULP;
constexpr float highestNegativeSubnormalF16MinusULP = -lowestPositiveSubnormalF16PlusULP;
// Expected binary16 bit patterns of the negative boundary values: the positive patterns with the
// sign bit (0x8000) set.
constexpr uint16_t highestNegativeNormalF16Bits = 0x8400u;
constexpr uint16_t lowestNegativeSubnormalF16Bits = 0x83ffu;
constexpr uint16_t highestNegativeSubnormalF16Bits = 0x8001u;
// Value larger than or equal to lowest positive normal f16 will be quantized to normal f16.
EXPECT_EQ(f16(lowestPositiveNormalF16PlusULP), lowestPositiveNormalF16);
EXPECT_EQ(f16(lowestPositiveNormalF16), lowestPositiveNormalF16);
// Positive value smaller than lowest positive normal f16 but not smaller than lowest positive
// subnormal f16 will be quantized to subnormal f16 or zero.
EXPECT_EQ(f16(lowestPositiveNormalF16MinusULP), highestPositiveSubnormalF16);
EXPECT_EQ(f16(highestPositiveSubnormalF16PlusULP), highestPositiveSubnormalF16);
EXPECT_EQ(f16(highestPositiveSubnormalF16), highestPositiveSubnormalF16);
EXPECT_EQ(f16(highestPositiveSubnormalF16MinusULP), 0x0.ff8p-14);
EXPECT_EQ(f16(lowestPositiveSubnormalF16PlusULP), lowestPositiveSubnormalF16);
EXPECT_EQ(f16(lowestPositiveSubnormalF16), lowestPositiveSubnormalF16);
// Positive value smaller than lowest positive subnormal f16 will be quantized to zero.
EXPECT_EQ(f16(lowestPositiveSubnormalF16MinusULP), 0.0);
// Test the mantissa discarding, the least significant mantissa bit is 0x1p-24 = 0x0.004p-14.
EXPECT_EQ(f16(0x0.064p-14), 0x0.064p-14);
EXPECT_EQ(f16(0x0.067fecp-14), 0x0.064p-14);
EXPECT_EQ(f16(0x0.063ffep-14), 0x0.060p-14);
EXPECT_EQ(f16(0x0.008p-14), 0x0.008p-14);
EXPECT_EQ(f16(0x0.00bffep-14), 0x0.008p-14);
EXPECT_EQ(f16(0x0.007ffep-14), 0x0.004p-14);
// Float quiet NaN and positive infinity, used as inputs and expected values in the f16 test cases.
constexpr float f32_nan = std::numeric_limits<float>::quiet_NaN();
constexpr float f32_inf = std::numeric_limits<float>::infinity();
// Vice versa for negative cases.
EXPECT_EQ(f16(highestNegativeNormalF16MinusULP), highestNegativeNormalF16);
EXPECT_EQ(f16(highestNegativeNormalF16), highestNegativeNormalF16);
EXPECT_EQ(f16(highestNegativeNormalF16PlusULP), lowestNegativeSubnormalF16);
EXPECT_EQ(f16(lowestNegativeSubnormalF16MinusULP), lowestNegativeSubnormalF16);
EXPECT_EQ(f16(lowestNegativeSubnormalF16), lowestNegativeSubnormalF16);
EXPECT_EQ(f16(lowestNegativeSubnormalF16PlusULP), -0x0.ff8p-14);
EXPECT_EQ(f16(highestNegativeSubnormalF16MinusULP), highestNegativeSubnormalF16);
EXPECT_EQ(f16(highestNegativeSubnormalF16), highestNegativeSubnormalF16);
EXPECT_EQ(f16(highestNegativeSubnormalF16PlusULP), -0.0);
// Test the mantissa discarding.
EXPECT_EQ(f16(-0x0.064p-14), -0x0.064p-14);
EXPECT_EQ(f16(-0x0.067fecp-14), -0x0.064p-14);
EXPECT_EQ(f16(-0x0.063ffep-14), -0x0.060p-14);
EXPECT_EQ(f16(-0x0.008p-14), -0x0.008p-14);
EXPECT_EQ(f16(-0x0.00bffep-14), -0x0.008p-14);
EXPECT_EQ(f16(-0x0.007ffep-14), -0x0.004p-14);
// One parameterized f16 test case. Instances are brace-initialized in field order:
// {input_value, quantized_value, f16_bit_pattern}.
struct F16TestCase {
// The float passed to the f16 constructor.
float input_value;
// The float the constructed f16 is expected to compare equal to (or NaN).
float quantized_value;
// The value f16(input_value).BitsRepresentation() is expected to return.
uint16_t f16_bit_pattern;
};
// Value-parameterized fixture whose parameter is an F16TestCase.
using NumberF16Test = testing::TestWithParam<F16TestCase>;
// Checks that an f16 constructed from the case's input compares equal to the expected quantized
// value, or is NaN when the expectation is NaN.
TEST_P(NumberF16Test, QuantizeF16) {
    const F16TestCase& param = GetParam();
    const float input_value = param.input_value;
    const float quantized_value = param.quantized_value;

    // Attach the case's values to any failure reported below.
    std::stringstream trace;
    trace << "input value = " << input_value;
    trace << ", expected quantized value = " << quantized_value;
    SCOPED_TRACE(trace.str());

    if (!std::isnan(quantized_value)) {
        EXPECT_EQ(f16(input_value), quantized_value);
        return;
    }
    // NaN never compares equal to anything, so it gets a dedicated check.
    EXPECT_TRUE(std::isnan(f16(input_value)));
}
// Checks that BitsRepresentation() of an f16 constructed from the case's input returns the
// expected binary16 bit pattern.
TEST_P(NumberF16Test, BitsRepresentation) {
    const F16TestCase& param = GetParam();
    const float input_value = param.input_value;
    const uint16_t representation = param.f16_bit_pattern;

    // Attach the case's values to any failure reported below; the pattern is printed in hex.
    std::stringstream trace;
    trace << "input value = " << input_value;
    trace << ", expected binary16 bits representation = ";
    trace << std::hex << std::showbase << representation;
    SCOPED_TRACE(trace.str());

    EXPECT_EQ(f16(input_value).BitsRepresentation(), representation);
}
// Parameter table shared by the NumberF16Test QuantizeF16 and BitsRepresentation tests.
// Each entry is {input float, expected quantized float, expected binary16 bit pattern}.
INSTANTIATE_TEST_SUITE_P(
NumberF16Test,
NumberF16Test,
testing::ValuesIn(std::vector<F16TestCase>{
// NaN, Inf. Both +NaN and -NaN are expected to produce the same pattern, 0x7e00.
{f32_inf, f32_inf, 0x7c00u},
{-f32_inf, -f32_inf, 0xfc00u},
{f32_nan, f32_nan, 0x7e00u},
{-f32_nan, -f32_nan, 0x7e00u},
// +/- zero
{+0.0f, 0.0f, 0x0000u},
{-0.0f, -0.0f, 0x8000u},
// Value in normal f16 range
{1.0f, 1.0f, 0x3c00u},
{-1.0f, -1.0f, 0xbc00u},
// 0.00006106496 quantized to 0.000061035156 = 0x1p-14
{0.00006106496f, 0.000061035156f, 0x0400u},
{-0.00006106496f, -0.000061035156f, 0x8400u},
// 1.0004883 quantized to 1.0 = 0x1p0
{1.0004883f, 1.0f, 0x3c00u},
{-1.0004883f, -1.0f, 0xbc00u},
// 8196.0 quantized to 8192.0 = 0x1p13
{8196.0f, 8192.f, 0x7000u},
{-8196.0f, -8192.f, 0xf000u},
// Value in subnormal f16 range
{0x0.034p-14f, 0x0.034p-14f, 0x000du},
{-0x0.034p-14f, -0x0.034p-14f, 0x800du},
{0x0.068p-14f, 0x0.068p-14f, 0x001au},
{-0x0.068p-14f, -0x0.068p-14f, 0x801au},
// 0x0.06b7p-14 quantized to 0x0.068p-14
{0x0.06b7p-14f, 0x0.068p-14f, 0x001au},
{-0x0.06b7p-14f, -0x0.068p-14, 0x801au},
// Value out of f16 range
{65504.003f, f32_inf, 0x7c00u},
{-65504.003f, -f32_inf, 0xfc00u},
{0x1.234p56f, f32_inf, 0x7c00u},
{-0x4.321p65f, -f32_inf, 0xfc00u},
// Test for subnormal quantization.
// Value larger than or equal to lowest positive normal f16 will be quantized to normal f16.
{lowestPositiveNormalF16PlusULP, lowestPositiveNormalF16, lowestPositiveNormalF16Bits},
{lowestPositiveNormalF16, lowestPositiveNormalF16, lowestPositiveNormalF16Bits},
// Positive value smaller than lowest positive normal f16 but not smaller than lowest positive
// subnormal f16 will be quantized to subnormal f16 or zero.
{lowestPositiveNormalF16MinusULP, highestPositiveSubnormalF16,
highestPositiveSubnormalF16Bits},
{highestPositiveSubnormalF16PlusULP, highestPositiveSubnormalF16,
highestPositiveSubnormalF16Bits},
{highestPositiveSubnormalF16, highestPositiveSubnormalF16, highestPositiveSubnormalF16Bits},
{highestPositiveSubnormalF16MinusULP, 0x0.ff8p-14, 0x03feu},
{lowestPositiveSubnormalF16PlusULP, lowestPositiveSubnormalF16,
lowestPositiveSubnormalF16Bits},
{lowestPositiveSubnormalF16, lowestPositiveSubnormalF16, lowestPositiveSubnormalF16Bits},
// Positive value smaller than lowest positive subnormal f16 will be quantized to zero.
{lowestPositiveSubnormalF16MinusULP, 0.0, 0x0000u},
// Test the mantissa discarding, the least significant mantissa bit is
// 0x1p-24 = 0x0.004p-14.
{0x0.064p-14f, 0x0.064p-14, 0x0019u},
{0x0.067fecp-14f, 0x0.064p-14, 0x0019u},
{0x0.063ffep-14f, 0x0.060p-14, 0x0018u},
{0x0.008p-14f, 0x0.008p-14, 0x0002u},
{0x0.00bffep-14f, 0x0.008p-14, 0x0002u},
{0x0.007ffep-14f, 0x0.004p-14, 0x0001u},
// Vice versa for negative cases.
{highestNegativeNormalF16MinusULP, highestNegativeNormalF16, highestNegativeNormalF16Bits},
{highestNegativeNormalF16, highestNegativeNormalF16, highestNegativeNormalF16Bits},
{highestNegativeNormalF16PlusULP, lowestNegativeSubnormalF16,
lowestNegativeSubnormalF16Bits},
{lowestNegativeSubnormalF16MinusULP, lowestNegativeSubnormalF16,
lowestNegativeSubnormalF16Bits},
{lowestNegativeSubnormalF16, lowestNegativeSubnormalF16, lowestNegativeSubnormalF16Bits},
{lowestNegativeSubnormalF16PlusULP, -0x0.ff8p-14, 0x83feu},
{highestNegativeSubnormalF16MinusULP, highestNegativeSubnormalF16,
highestNegativeSubnormalF16Bits},
{highestNegativeSubnormalF16, highestNegativeSubnormalF16, highestNegativeSubnormalF16Bits},
// Negative value closer to zero than the highest negative subnormal f16 is quantized to -0.
{highestNegativeSubnormalF16PlusULP, -0.0, 0x8000u},
// Test the mantissa discarding.
{-0x0.064p-14f, -0x0.064p-14, 0x8019u},
{-0x0.067fecp-14f, -0x0.064p-14, 0x8019u},
{-0x0.063ffep-14f, -0x0.060p-14, 0x8018u},
{-0x0.008p-14f, -0x0.008p-14, 0x8002u},
{-0x0.00bffep-14f, -0x0.008p-14, 0x8002u},
{-0x0.007ffep-14f, -0x0.004p-14, 0x8001u},
/////////////////////////////////////
}));
using BinaryCheckedCase = std::tuple<std::optional<AInt>, AInt, AInt>;
#undef OVERFLOW // corecrt_math.h :(