tint: Implement f16 value binary representation

This CL adds methods that return the binary16 bit pattern for a constructed Number<detail::NumberKindF16>. This is required for generating SPIR-V operands. Bug: tint:1473, tint:1502 Change-Id: Ia3680cdb5a0e64d31bfe2f48432cda3850c1f5a7 Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/95240 Reviewed-by: Dan Sinclair <dsinclair@chromium.org> Commit-Queue: Zhaoming Jiang <zhaoming.jiang@intel.com>
2025-05-16 12:21:35 +00:00 · 2022-07-07 03:29:11 +00:00 · 2022-07-07 03:29:11 +00:00 · 2c7440a13f
commit 2c7440a13f
parent 590040cfb4
3 changed files with 262 additions and 70 deletions
--- a/src/tint/number.cc
+++ b/src/tint/number.cc
@ -204,4 +204,108 @@ f16::type f16::Quantize(f16::type value) {
    return value;
 }
 uint16_t f16::BitsRepresentation() const {
    // The bit manipulation below reads the stored value as a binary32 (float) bit pattern.
    static_assert(std::is_same<f16::type, float>());
    // Fixed binary16 patterns returned for the non-finite values.
    constexpr uint16_t kF16NaN = 0x7e00u;
    constexpr uint16_t kF16PositiveInf = 0x7c00u;
    constexpr uint16_t kF16NegativeInf = 0xfc00u;
    // The value held by an f16 object is expected to be quantized already, i.e. it is NaN, +/- Inf,
    // or a value that a normal or subnormal f16 represents exactly. Non-finite values are handled
    // first; every NaN yields the same pattern, whatever its sign bit.
    if (std::isnan(value)) {
        return kF16NaN;
    }
    if (std::isinf(value)) {
        return (value > 0) ? kF16PositiveInf : kF16NegativeInf;
    }
    // `value` is finite from here on. The exponent of every finite value that f16 represents
    // exactly falls into one of these rows:
    // ---------------------------------------------------------------------------
    // |  Value category  |  Unbiased exp  |  F16 biased exp  |  F32 biased exp  |
    // |------------------|----------------|------------------|------------------|
    // |     +/- zero     |        \       |         0        |         0        |
    // |  Subnormal f16   |   [-24, -15]   |         0        |    [103, 112]    |
    // |    Normal f16    |   [-14, 15]    |      [1, 30]     |    [113, 142]    |
    // ---------------------------------------------------------------------------
    constexpr uint32_t kMinF32ExpOfNormalF16 = 113;
    constexpr uint32_t kMaxF32ExpOfNormalF16 = 142;
    constexpr uint32_t kMinF32ExpOfSubnormalF16 = 103;
    constexpr uint32_t kMaxF32ExpOfSubnormalF16 = 112;
    // Field layout of binary32.
    constexpr uint32_t kF32SignMask = 0x80000000u;
    constexpr uint32_t kF32ExpMask = 0x7f800000u;
    constexpr uint32_t kF32MantissaMask = 0x007fffffu;
    constexpr uint32_t kF32MantissaBits = 23;
    constexpr uint32_t kF32ExpBias = 127;
    // Field layout of binary16.
    constexpr uint16_t kF16SignMask = 0x8000u;
    constexpr uint16_t kF16ExpMask = 0x7c00u;
    constexpr uint16_t kF16MantissaMask = 0x03ffu;
    constexpr uint32_t kF16MantissaBits = 10;
    constexpr uint32_t kF16ExpBias = 15;
    // Copy the float's bytes into an integer so that its fields can be masked out.
    uint32_t f32_bits = 0;
    memcpy(&f32_bits, &value, sizeof(f32_bits));
    const uint32_t f32_exp = (f32_bits & kF32ExpMask) >> kF32MantissaBits;
    const uint32_t f32_mantissa = f32_bits & kF32MantissaMask;
    // Shifting right by 16 moves the f32 sign bit (bit 31) onto the f16 sign bit (bit 15).
    const uint16_t sign_field = static_cast<uint16_t>((f32_bits & kF32SignMask) >> 16);
    TINT_ASSERT(Semantic, (sign_field & ~kF16SignMask) == 0);
    // +/- zero: no bit other than the sign is set, so the sign alone is the whole pattern.
    const bool is_zero = (f32_bits & ~kF32SignMask) == 0;
    if (is_zero) {
        return sign_field;
    }
    // Normal f16: re-bias the exponent and keep the top 10 of the 23 f32 mantissa bits.
    const bool is_normal_f16 =
        (kMinF32ExpOfNormalF16 <= f32_exp) && (f32_exp <= kMaxF32ExpOfNormalF16);
    if (is_normal_f16) {
        const uint32_t f16_exp = f32_exp - kF32ExpBias + kF16ExpBias;
        const uint16_t exp_field = static_cast<uint16_t>(f16_exp << kF16MantissaBits);
        const uint16_t mantissa_field =
            static_cast<uint16_t>(f32_mantissa >> (kF32MantissaBits - kF16MantissaBits));
        TINT_ASSERT(Semantic, (exp_field & ~kF16ExpMask) == 0);
        TINT_ASSERT(Semantic, (mantissa_field & ~kF16MantissaMask) == 0);
        return static_cast<uint16_t>(sign_field | exp_field | mantissa_field);
    }
    // Subnormal f16: the exponent field is all zeros and the mantissa needs special handling.
    const bool is_subnormal_f16 =
        (kMinF32ExpOfSubnormalF16 <= f32_exp) && (f32_exp <= kMaxF32ExpOfSubnormalF16);
    if (is_subnormal_f16) {
        // Only 1 mantissa bit is significant when the unbiased exponent is at its minimum (-24),
        // and all 10 are significant when it is at its maximum (-15).
        const uint32_t significant_bits = f32_exp - kMinF32ExpOfSubnormalF16 + 1;
        TINT_ASSERT(Semantic,
                    (1 <= significant_bits) && (significant_bits <= kF16MantissaBits));
        // Put the implicit leading 1 of the f32 value in front of its mantissa, then shift right
        // so that exactly `significant_bits` bits remain.
        const uint32_t f32_significand = f32_mantissa | (kF32MantissaMask + 1);
        const uint16_t mantissa_field = static_cast<uint16_t>(
            f32_significand >> (kF32MantissaBits + 1 - significant_bits));
        TINT_ASSERT(Semantic, (mantissa_field & ~((1u << significant_bits) - 1)) == 0);
        TINT_ASSERT(Semantic, (mantissa_field != 0));
        return static_cast<uint16_t>(sign_field | mantissa_field);
    }
    // The value is not a zero, a subnormal f16 or a normal f16; this should never be reached.
    tint::diag::List diag;
    TINT_UNREACHABLE(Semantic, diag);
    return kF16NaN;
 }
 }  // namespace tint
--- a/src/tint/number.h
+++ b/src/tint/number.h
@ -186,6 +186,13 @@ struct Number<detail::NumberKindF16> {
        return *this;
    }
    /// Get the binary16 bit pattern, as a uint16_t, of this value.
    /// @returns the binary16 bit pattern of the stored quantized f16 value. If the value is NaN,
    /// the returned value will be 0x7e00u, whatever the sign of the NaN. If the value is positive
    /// infinity, the returned value will be 0x7c00u. If the value is negative infinity, the
    /// returned value will be 0xfc00u.
    uint16_t BitsRepresentation() const;
    /// @param value the input float32 value
    /// @returns the float32 value quantized to the smaller float16 value, through truncation of the
    /// mantissa bits (no rounding). If the float32 value is too large (positive or negative) to be
--- a/src/tint/number_test.cc
+++ b/src/tint/number_test.cc
@ -217,83 +217,164 @@ TEST(NumberTest, CheckedConvertSubnormals) {
    EXPECT_EQ(CheckedConvert<f16>(AFloat(-kHighestF16Subnormal)), f16(-kHighestF16Subnormal));
 }
-TEST(NumberTest, QuantizeF16) {
+// Test cases for f16 subnormal quantization and BitsRepresentation.
-    constexpr float nan = std::numeric_limits<float>::quiet_NaN();
+// The ULP is based on float rather than double or f16, since F16::Quantize and
-    constexpr float inf = std::numeric_limits<float>::infinity();
+// F16::BitsRepresentation take float as input.
 constexpr float lowestPositiveNormalF16 = 0x1p-14;
 constexpr float lowestPositiveNormalF16PlusULP = 0x1.000002p-14;
 constexpr float lowestPositiveNormalF16MinusULP = 0x1.fffffep-15;
 constexpr float highestPositiveSubnormalF16 = 0x0.ffcp-14;
 constexpr float highestPositiveSubnormalF16PlusULP = 0x1.ff8002p-15;
 constexpr float highestPositiveSubnormalF16MinusULP = 0x1.ff7ffep-15;
 constexpr float lowestPositiveSubnormalF16 = 0x1.p-24;
 constexpr float lowestPositiveSubnormalF16PlusULP = 0x1.000002p-24;
 constexpr float lowestPositiveSubnormalF16MinusULP = 0x1.fffffep-25;
-    EXPECT_EQ(f16(0.0), 0.0f);
+constexpr uint16_t lowestPositiveNormalF16Bits = 0x0400u;
-    EXPECT_EQ(f16(1.0), 1.0f);
+constexpr uint16_t highestPositiveSubnormalF16Bits = 0x03ffu;
-    EXPECT_EQ(f16(0.00006106496), 0.000061035156f);
+constexpr uint16_t lowestPositiveSubnormalF16Bits = 0x0001u;
    EXPECT_EQ(f16(1.0004883), 1.0f);
    EXPECT_EQ(f16(-8196), -8192.f);
    EXPECT_EQ(f16(65504.003), inf);
    EXPECT_EQ(f16(-65504.003), -inf);
    EXPECT_EQ(f16(inf), inf);
    EXPECT_EQ(f16(-inf), -inf);
    EXPECT_TRUE(std::isnan(f16(nan)));
-    // Test for subnormal quantization.
+constexpr float highestNegativeNormalF16 = -lowestPositiveNormalF16;
-    // The ULP is based on float rather than double or f16, since F16::Quantize take float as input.
+constexpr float highestNegativeNormalF16PlusULP = -lowestPositiveNormalF16MinusULP;
-    constexpr float lowestPositiveNormalF16 = 0x1p-14;
+constexpr float highestNegativeNormalF16MinusULP = -lowestPositiveNormalF16PlusULP;
-    constexpr float lowestPositiveNormalF16PlusULP = 0x1.000002p-14;
+constexpr float lowestNegativeSubnormalF16 = -highestPositiveSubnormalF16;
-    constexpr float lowestPositiveNormalF16MinusULP = 0x1.fffffep-15;
+constexpr float lowestNegativeSubnormalF16PlusULP = -highestPositiveSubnormalF16MinusULP;
-    constexpr float highestPositiveSubnormalF16 = 0x0.ffcp-14;
+constexpr float lowestNegativeSubnormalF16MinusULP = -highestPositiveSubnormalF16PlusULP;
-    constexpr float highestPositiveSubnormalF16PlusULP = 0x1.ff8002p-15;
+constexpr float highestNegativeSubnormalF16 = -lowestPositiveSubnormalF16;
-    constexpr float highestPositiveSubnormalF16MinusULP = 0x1.ff7ffep-15;
+constexpr float highestNegativeSubnormalF16PlusULP = -lowestPositiveSubnormalF16MinusULP;
-    constexpr float lowestPositiveSubnormalF16 = 0x1.p-24;
+constexpr float highestNegativeSubnormalF16MinusULP = -lowestPositiveSubnormalF16PlusULP;
    constexpr float lowestPositiveSubnormalF16PlusULP = 0x1.000002p-24;
    constexpr float lowestPositiveSubnormalF16MinusULP = 0x1.fffffep-25;
-    constexpr float highestNegativeNormalF16 = -lowestPositiveNormalF16;
+constexpr uint16_t highestNegativeNormalF16Bits = 0x8400u;
-    constexpr float highestNegativeNormalF16PlusULP = -lowestPositiveNormalF16MinusULP;
+constexpr uint16_t lowestNegativeSubnormalF16Bits = 0x83ffu;
-    constexpr float highestNegativeNormalF16MinusULP = -lowestPositiveNormalF16PlusULP;
+constexpr uint16_t highestNegativeSubnormalF16Bits = 0x8001u;
    constexpr float lowestNegativeSubnormalF16 = -highestPositiveSubnormalF16;
    constexpr float lowestNegativeSubnormalF16PlusULP = -highestPositiveSubnormalF16MinusULP;
    constexpr float lowestNegativeSubnormalF16MinusULP = -highestPositiveSubnormalF16PlusULP;
    constexpr float highestNegativeSubnormalF16 = -lowestPositiveSubnormalF16;
    constexpr float highestNegativeSubnormalF16PlusULP = -lowestPositiveSubnormalF16MinusULP;
    constexpr float highestNegativeSubnormalF16MinusULP = -lowestPositiveSubnormalF16PlusULP;
-    // Value larger than or equal to lowest positive normal f16 will be quantized to normal f16.
+constexpr float f32_nan = std::numeric_limits<float>::quiet_NaN();
-    EXPECT_EQ(f16(lowestPositiveNormalF16PlusULP), lowestPositiveNormalF16);
+constexpr float f32_inf = std::numeric_limits<float>::infinity();
    EXPECT_EQ(f16(lowestPositiveNormalF16), lowestPositiveNormalF16);
    // Positive value smaller than lowest positive normal f16 but not smaller than lowest positive
    // subnormal f16 will be quantized to subnormal f16 or zero.
    EXPECT_EQ(f16(lowestPositiveNormalF16MinusULP), highestPositiveSubnormalF16);
    EXPECT_EQ(f16(highestPositiveSubnormalF16PlusULP), highestPositiveSubnormalF16);
    EXPECT_EQ(f16(highestPositiveSubnormalF16), highestPositiveSubnormalF16);
    EXPECT_EQ(f16(highestPositiveSubnormalF16MinusULP), 0x0.ff8p-14);
    EXPECT_EQ(f16(lowestPositiveSubnormalF16PlusULP), lowestPositiveSubnormalF16);
    EXPECT_EQ(f16(lowestPositiveSubnormalF16), lowestPositiveSubnormalF16);
    // Positive value smaller than lowest positive subnormal f16 will be quantized to zero.
    EXPECT_EQ(f16(lowestPositiveSubnormalF16MinusULP), 0.0);
    // Test the mantissa discarding, the least significant mantissa bit is 0x1p-24 = 0x0.004p-14.
    EXPECT_EQ(f16(0x0.064p-14), 0x0.064p-14);
    EXPECT_EQ(f16(0x0.067fecp-14), 0x0.064p-14);
    EXPECT_EQ(f16(0x0.063ffep-14), 0x0.060p-14);
    EXPECT_EQ(f16(0x0.008p-14), 0x0.008p-14);
    EXPECT_EQ(f16(0x0.00bffep-14), 0x0.008p-14);
    EXPECT_EQ(f16(0x0.007ffep-14), 0x0.004p-14);
-    // Vice versa for negative cases.
+struct F16TestCase {
-    EXPECT_EQ(f16(highestNegativeNormalF16MinusULP), highestNegativeNormalF16);
+    float input_value;
-    EXPECT_EQ(f16(highestNegativeNormalF16), highestNegativeNormalF16);
+    float quantized_value;
-    EXPECT_EQ(f16(highestNegativeNormalF16PlusULP), lowestNegativeSubnormalF16);
+    uint16_t f16_bit_pattern;
-    EXPECT_EQ(f16(lowestNegativeSubnormalF16MinusULP), lowestNegativeSubnormalF16);
+};
-    EXPECT_EQ(f16(lowestNegativeSubnormalF16), lowestNegativeSubnormalF16);
+
-    EXPECT_EQ(f16(lowestNegativeSubnormalF16PlusULP), -0x0.ff8p-14);
+using NumberF16Test = testing::TestWithParam<F16TestCase>;
-    EXPECT_EQ(f16(highestNegativeSubnormalF16MinusULP), highestNegativeSubnormalF16);
+
-    EXPECT_EQ(f16(highestNegativeSubnormalF16), highestNegativeSubnormalF16);
+TEST_P(NumberF16Test, QuantizeF16) {
-    EXPECT_EQ(f16(highestNegativeSubnormalF16PlusULP), -0.0);
+    float input_value = GetParam().input_value;
-    // Test the mantissa discarding.
+    float quantized_value = GetParam().quantized_value;
-    EXPECT_EQ(f16(-0x0.064p-14), -0x0.064p-14);
+
-    EXPECT_EQ(f16(-0x0.067fecp-14), -0x0.064p-14);
+    std::stringstream ss;
-    EXPECT_EQ(f16(-0x0.063ffep-14), -0x0.060p-14);
+    ss << "input value = " << input_value << ", expected quantized value = " << quantized_value;
-    EXPECT_EQ(f16(-0x0.008p-14), -0x0.008p-14);
+    SCOPED_TRACE(ss.str());
-    EXPECT_EQ(f16(-0x0.00bffep-14), -0x0.008p-14);
+
-    EXPECT_EQ(f16(-0x0.007ffep-14), -0x0.004p-14);
+    if (std::isnan(quantized_value)) {
        EXPECT_TRUE(std::isnan(f16(input_value)));
    } else {
        EXPECT_EQ(f16(input_value), quantized_value);
    }
 }
 TEST_P(NumberF16Test, BitsRepresentation) {
    // Each parameter carries a float input and the binary16 bit pattern expected for it.
    const auto& test_case = GetParam();
    const float input = test_case.input_value;
    const uint16_t expected_bits = test_case.f16_bit_pattern;
    // Attach the input and the expected pattern (printed in hex) to any failure reported below.
    std::stringstream trace;
    trace << "input value = " << input;
    trace << ", expected binary16 bits representation = " << std::hex << std::showbase
          << expected_bits;
    SCOPED_TRACE(trace.str());
    // Build an f16 from the input and compare the bit pattern it reports with the expected one.
    const uint16_t actual_bits = f16(input).BitsRepresentation();
    EXPECT_EQ(actual_bits, expected_bits);
 }
 // Parameter table for the NumberF16Test cases. Each entry is an F16TestCase written as
 // {input_value, quantized_value, f16_bit_pattern}: the float used to construct the f16, the float
 // value the f16 is expected to compare equal to, and the expected result of BitsRepresentation().
 INSTANTIATE_TEST_SUITE_P(
    NumberF16Test,
    NumberF16Test,
    testing::ValuesIn(std::vector<F16TestCase>{
        // NaN, Inf
        {f32_inf, f32_inf, 0x7c00u},
        {-f32_inf, -f32_inf, 0xfc00u},
        {f32_nan, f32_nan, 0x7e00u},
        // A negated NaN is expected to give the same bit pattern as the NaN above.
        {-f32_nan, -f32_nan, 0x7e00u},
        // +/- zero
        {+0.0f, 0.0f, 0x0000u},
        {-0.0f, -0.0f, 0x8000u},
        // Value in normal f16 range
        {1.0f, 1.0f, 0x3c00u},
        {-1.0f, -1.0f, 0xbc00u},
        //   0.00006106496 quantized to 0.000061035156 = 0x1p-14
        {0.00006106496f, 0.000061035156f, 0x0400u},
        {-0.00006106496f, -0.000061035156f, 0x8400u},
        //   1.0004883 quantized to 1.0 = 0x1p0
        {1.0004883f, 1.0f, 0x3c00u},
        {-1.0004883f, -1.0f, 0xbc00u},
        //   8196.0 quantized to 8192.0 = 0x1p13
        {8196.0f, 8192.f, 0x7000u},
        {-8196.0f, -8192.f, 0xf000u},
        // Value in subnormal f16 range
        {0x0.034p-14f, 0x0.034p-14f, 0x000du},
        {-0x0.034p-14f, -0x0.034p-14f, 0x800du},
        {0x0.068p-14f, 0x0.068p-14f, 0x001au},
        {-0x0.068p-14f, -0x0.068p-14f, 0x801au},
        //   0x0.06b7p-14 quantized to 0x0.068p-14
        {0x0.06b7p-14f, 0x0.068p-14f, 0x001au},
        {-0x0.06b7p-14f, -0x0.068p-14, 0x801au},
        // Value out of f16 range
        {65504.003f, f32_inf, 0x7c00u},
        {-65504.003f, -f32_inf, 0xfc00u},
        {0x1.234p56f, f32_inf, 0x7c00u},
        {-0x4.321p65f, -f32_inf, 0xfc00u},
        // Test for subnormal quantization.
        // Value larger than or equal to lowest positive normal f16 will be quantized to normal f16.
        {lowestPositiveNormalF16PlusULP, lowestPositiveNormalF16, lowestPositiveNormalF16Bits},
        {lowestPositiveNormalF16, lowestPositiveNormalF16, lowestPositiveNormalF16Bits},
        // Positive value smaller than lowest positive normal f16 but not smaller than lowest
        // positive subnormal f16 will be quantized to subnormal f16 or zero.
        {lowestPositiveNormalF16MinusULP, highestPositiveSubnormalF16,
         highestPositiveSubnormalF16Bits},
        {highestPositiveSubnormalF16PlusULP, highestPositiveSubnormalF16,
         highestPositiveSubnormalF16Bits},
        {highestPositiveSubnormalF16, highestPositiveSubnormalF16, highestPositiveSubnormalF16Bits},
        {highestPositiveSubnormalF16MinusULP, 0x0.ff8p-14, 0x03feu},
        {lowestPositiveSubnormalF16PlusULP, lowestPositiveSubnormalF16,
         lowestPositiveSubnormalF16Bits},
        {lowestPositiveSubnormalF16, lowestPositiveSubnormalF16, lowestPositiveSubnormalF16Bits},
        // Positive value smaller than lowest positive subnormal f16 will be quantized to zero.
        {lowestPositiveSubnormalF16MinusULP, 0.0, 0x0000u},
        // Test the mantissa discarding, the least significant mantissa bit is 0x1p-24 =
        // 0x0.004p-14.
        {0x0.064p-14f, 0x0.064p-14, 0x0019u},
        {0x0.067fecp-14f, 0x0.064p-14, 0x0019u},
        {0x0.063ffep-14f, 0x0.060p-14, 0x0018u},
        {0x0.008p-14f, 0x0.008p-14, 0x0002u},
        {0x0.00bffep-14f, 0x0.008p-14, 0x0002u},
        {0x0.007ffep-14f, 0x0.004p-14, 0x0001u},
        // Vice versa for negative cases.
        {highestNegativeNormalF16MinusULP, highestNegativeNormalF16, highestNegativeNormalF16Bits},
        {highestNegativeNormalF16, highestNegativeNormalF16, highestNegativeNormalF16Bits},
        {highestNegativeNormalF16PlusULP, lowestNegativeSubnormalF16,
         lowestNegativeSubnormalF16Bits},
        {lowestNegativeSubnormalF16MinusULP, lowestNegativeSubnormalF16,
         lowestNegativeSubnormalF16Bits},
        {lowestNegativeSubnormalF16, lowestNegativeSubnormalF16, lowestNegativeSubnormalF16Bits},
        {lowestNegativeSubnormalF16PlusULP, -0x0.ff8p-14, 0x83feu},
        {highestNegativeSubnormalF16MinusULP, highestNegativeSubnormalF16,
         highestNegativeSubnormalF16Bits},
        {highestNegativeSubnormalF16, highestNegativeSubnormalF16, highestNegativeSubnormalF16Bits},
        {highestNegativeSubnormalF16PlusULP, -0.0, 0x8000u},
        // Test the mantissa discarding.
        {-0x0.064p-14f, -0x0.064p-14, 0x8019u},
        {-0x0.067fecp-14f, -0x0.064p-14, 0x8019u},
        {-0x0.063ffep-14f, -0x0.060p-14, 0x8018u},
        {-0x0.008p-14f, -0x0.008p-14, 0x8002u},
        {-0x0.00bffep-14f, -0x0.008p-14, 0x8002u},
        {-0x0.007ffep-14f, -0x0.004p-14, 0x8001u},
        /////////////////////////////////////
    }));
 using BinaryCheckedCase = std::tuple<std::optional<AInt>, AInt, AInt>;
 #undef OVERFLOW  // corecrt_math.h :(