tint: Implement f16 value binary representation

This CL adds methods that return the binary16 bit pattern for a constructed Number<detail::NumberKindF16>. This is required for generating SPIR-V operands. Bug: tint:1473, tint:1502 Change-Id: Ia3680cdb5a0e64d31bfe2f48432cda3850c1f5a7 Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/95240 Reviewed-by: Dan Sinclair <dsinclair@chromium.org> Commit-Queue: Zhaoming Jiang <zhaoming.jiang@intel.com>
2025-12-10 05:57:51 +00:00 · 2022-07-07 03:29:11 +00:00
parent 590040cfb4
commit 2c7440a13f
3 changed files with 262 additions and 70 deletions
--- a/src/tint/number.cc
+++ b/src/tint/number.cc
@@ -204,4 +204,108 @@ f16::type f16::Quantize(f16::type value) {
    return value;
 }

+uint16_t f16::BitsRepresentation() const {  // Encodes the stored float `value` as binary16 bits.
+    constexpr uint16_t f16_nan = 0x7e00u;      // exponent bits all ones, top mantissa bit set
+    constexpr uint16_t f16_pos_inf = 0x7c00u;  // exponent bits all ones, mantissa zero
+    constexpr uint16_t f16_neg_inf = 0xfc00u;  // f16_pos_inf with the sign bit set
+
+    // Assert we use binary32 (i.e. float) as underlying type, which has 4 bytes.
+    static_assert(std::is_same<f16::type, float>());
+
+    // The stored value in f16 object must be already quantized, so it should be either NaN, +/-
+    // Inf, or exactly representable by normal or subnormal f16.
+
+    if (std::isnan(value)) {  // every NaN yields the same pattern; sign and payload are dropped
+        return f16_nan;
+    }
+
+    if (std::isinf(value)) {
+        return value > 0 ? f16_pos_inf : f16_neg_inf;
+    }
+
+    // Now `value` must be a finite value that is exactly representable as f16.
+    // The following table shows exponent cases for all finite f16 exactly-representable values.
+    // ---------------------------------------------------------------------------
+    // |  Value category  |  Unbiased exp  |  F16 biased exp  |  F32 biased exp  |
+    // |------------------|----------------|------------------|------------------|
+    // |     +/- zero     |        \       |         0        |         0        |
+    // |  Subnormal f16   |   [-24, -15]   |         0        |    [103, 112]    |
+    // |    Normal f16    |   [-14, 15]    |      [1, 30]     |    [113, 142]    |
+    // ---------------------------------------------------------------------------
+
+    constexpr uint32_t max_f32_biased_exp_for_f16_normal_number = 142;
+    constexpr uint32_t min_f32_biased_exp_for_f16_normal_number = 113;
+    constexpr uint32_t max_f32_biased_exp_for_f16_subnormal_number = 112;
+    constexpr uint32_t min_f32_biased_exp_for_f16_subnormal_number = 103;
+
+    constexpr uint32_t f32_sign_mask = 0x80000000u;
+    constexpr uint32_t f32_exp_mask = 0x7f800000u;
+    constexpr uint32_t f32_mantissa_mask = 0x007fffffu;
+    constexpr uint32_t f32_mantissa_bis_number = 23;  // number of mantissa bits in binary32
+    constexpr uint32_t f32_exp_bias = 127;
+
+    constexpr uint16_t f16_sign_mask = 0x8000u;
+    constexpr uint16_t f16_exp_mask = 0x7c00u;
+    constexpr uint16_t f16_mantissa_mask = 0x03ffu;
+    constexpr uint32_t f16_mantissa_bis_number = 10;  // number of mantissa bits in binary16
+    constexpr uint32_t f16_exp_bias = 15;
+
+    uint32_t f32_bit_pattern;
+    memcpy(&f32_bit_pattern, &value, 4);  // copy the 4 bytes of the float into an integer
+    uint32_t f32_biased_exponent = (f32_bit_pattern & f32_exp_mask) >> f32_mantissa_bis_number;
+    uint32_t f32_mantissa = f32_bit_pattern & f32_mantissa_mask;
+
+    uint16_t f16_sign_part = static_cast<uint16_t>((f32_bit_pattern & f32_sign_mask) >> 16);  // bit 31 -> bit 15
+    TINT_ASSERT(Semantic, (f16_sign_part & ~f16_sign_mask) == 0);
+
+    if ((f32_bit_pattern & ~f32_sign_mask) == 0) {  // exponent and mantissa bits are all zero
+        // +/- zero
+        return f16_sign_part;
+    }
+
+    if ((min_f32_biased_exp_for_f16_normal_number <= f32_biased_exponent) &&
+        (f32_biased_exponent <= max_f32_biased_exp_for_f16_normal_number)) {
+        // Normal f16
+        uint32_t f16_biased_exponent = f32_biased_exponent - f32_exp_bias + f16_exp_bias;  // re-bias
+        uint16_t f16_exp_part =
+            static_cast<uint16_t>(f16_biased_exponent << f16_mantissa_bis_number);
+        uint16_t f16_mantissa_part = static_cast<uint16_t>(  // keep the top 10 of 23 mantissa bits
+            f32_mantissa >> (f32_mantissa_bis_number - f16_mantissa_bis_number));
+
+        TINT_ASSERT(Semantic, (f16_exp_part & ~f16_exp_mask) == 0);
+        TINT_ASSERT(Semantic, (f16_mantissa_part & ~f16_mantissa_mask) == 0);
+
+        return f16_sign_part | f16_exp_part | f16_mantissa_part;
+    }
+
+    if ((min_f32_biased_exp_for_f16_subnormal_number <= f32_biased_exponent) &&
+        (f32_biased_exponent <= max_f32_biased_exp_for_f16_subnormal_number)) {
+        // Subnormal f16
+        // The resulting exp bits are always 0, and the mantissa bits should be handled specially.
+        uint16_t f16_exp_part = 0;
+        // The resulting subnormal f16 will have only 1 valid mantissa bit if the unbiased exponent
+        // of value is of the minimum, i.e. -24; and have all 10 mantissa bits valid if the unbiased
+        // exponent of value is of the maximum, i.e. -15.
+        uint32_t f16_valid_mantissa_bits =
+            f32_biased_exponent - min_f32_biased_exp_for_f16_subnormal_number + 1;
+        // The resulting f16 mantissa part comes from right-shifting the f32 mantissa bits with
+        // leading 1 added.
+        uint16_t f16_mantissa_part =
+            static_cast<uint16_t>((f32_mantissa | (f32_mantissa_mask + 1)) >>  // mask + 1 == 1 << 23
+                                  (f32_mantissa_bis_number + 1 - f16_valid_mantissa_bits));
+
+        TINT_ASSERT(Semantic, (1 <= f16_valid_mantissa_bits) &&
+                                  (f16_valid_mantissa_bits <= f16_mantissa_bis_number));
+        TINT_ASSERT(Semantic, (f16_mantissa_part & ~((1u << f16_valid_mantissa_bits) - 1)) == 0);
+        TINT_ASSERT(Semantic, (f16_mantissa_part != 0));
+
+        return f16_sign_part | f16_exp_part | f16_mantissa_part;
+    }
+
+    // Not zero, subnormal f16, or normal f16; this point should never be reached.
+    tint::diag::List diag;
+    TINT_UNREACHABLE(Semantic, diag);
+    return f16_nan;  // returned only if execution continues past TINT_UNREACHABLE
+}
+
 }  // namespace tint
--- a/src/tint/number.h
+++ b/src/tint/number.h
@@ -186,6 +186,13 @@ struct Number<detail::NumberKindF16> {
        return *this;
    }

+    /// Get the binary16 bit pattern in type uint16_t of this value.
+    /// @returns the binary16 bit pattern, in type uint16_t, of the stored quantized f16 value. If
+    /// the value is NaN, the returned value will be 0x7e00u. If the value is positive infinity, the
+    /// returned value will be 0x7c00u. If the value is negative infinity, the returned value
+    /// will be 0xfc00u.
+    uint16_t BitsRepresentation() const;
+
    /// @param value the input float32 value
    /// @returns the float32 value quantized to the smaller float16 value, through truncation of the
    /// mantissa bits (no rounding). If the float32 value is too large (positive or negative) to be
--- a/src/tint/number_test.cc
+++ b/src/tint/number_test.cc
@@ -217,83 +217,164 @@ TEST(NumberTest, CheckedConvertSubnormals) {
    EXPECT_EQ(CheckedConvert<f16>(AFloat(-kHighestF16Subnormal)), f16(-kHighestF16Subnormal));
 }

-TEST(NumberTest, QuantizeF16) {
-    constexpr float nan = std::numeric_limits<float>::quiet_NaN();
-    constexpr float inf = std::numeric_limits<float>::infinity();
+// Test cases for f16 subnormal quantization and BitsRepresentation.
+// The ULP is based on float rather than double or f16, since the f16 values under test are
+// constructed from float inputs.
+constexpr float lowestPositiveNormalF16 = 0x1p-14;
+constexpr float lowestPositiveNormalF16PlusULP = 0x1.000002p-14;
+constexpr float lowestPositiveNormalF16MinusULP = 0x1.fffffep-15;
+constexpr float highestPositiveSubnormalF16 = 0x0.ffcp-14;  // 1023 * 2^-24
+constexpr float highestPositiveSubnormalF16PlusULP = 0x1.ff8002p-15;
+constexpr float highestPositiveSubnormalF16MinusULP = 0x1.ff7ffep-15;
+constexpr float lowestPositiveSubnormalF16 = 0x1.p-24;
+constexpr float lowestPositiveSubnormalF16PlusULP = 0x1.000002p-24;
+constexpr float lowestPositiveSubnormalF16MinusULP = 0x1.fffffep-25;

-    EXPECT_EQ(f16(0.0), 0.0f);
-    EXPECT_EQ(f16(1.0), 1.0f);
-    EXPECT_EQ(f16(0.00006106496), 0.000061035156f);
-    EXPECT_EQ(f16(1.0004883), 1.0f);
-    EXPECT_EQ(f16(-8196), -8192.f);
-    EXPECT_EQ(f16(65504.003), inf);
-    EXPECT_EQ(f16(-65504.003), -inf);
-    EXPECT_EQ(f16(inf), inf);
-    EXPECT_EQ(f16(-inf), -inf);
-    EXPECT_TRUE(std::isnan(f16(nan)));
+constexpr uint16_t lowestPositiveNormalF16Bits = 0x0400u;      // exponent field 1, mantissa 0
+constexpr uint16_t highestPositiveSubnormalF16Bits = 0x03ffu;  // exponent field 0, mantissa all ones
+constexpr uint16_t lowestPositiveSubnormalF16Bits = 0x0001u;   // exponent field 0, mantissa 1

-    // Test for subnormal quantization.
-    // The ULP is based on float rather than double or f16, since F16::Quantize take float as input.
-    constexpr float lowestPositiveNormalF16 = 0x1p-14;
-    constexpr float lowestPositiveNormalF16PlusULP = 0x1.000002p-14;
-    constexpr float lowestPositiveNormalF16MinusULP = 0x1.fffffep-15;
-    constexpr float highestPositiveSubnormalF16 = 0x0.ffcp-14;
-    constexpr float highestPositiveSubnormalF16PlusULP = 0x1.ff8002p-15;
-    constexpr float highestPositiveSubnormalF16MinusULP = 0x1.ff7ffep-15;
-    constexpr float lowestPositiveSubnormalF16 = 0x1.p-24;
-    constexpr float lowestPositiveSubnormalF16PlusULP = 0x1.000002p-24;
-    constexpr float lowestPositiveSubnormalF16MinusULP = 0x1.fffffep-25;
+constexpr float highestNegativeNormalF16 = -lowestPositiveNormalF16;  // negated positive constants
+constexpr float highestNegativeNormalF16PlusULP = -lowestPositiveNormalF16MinusULP;  // Plus/Minus swap under negation
+constexpr float highestNegativeNormalF16MinusULP = -lowestPositiveNormalF16PlusULP;
+constexpr float lowestNegativeSubnormalF16 = -highestPositiveSubnormalF16;
+constexpr float lowestNegativeSubnormalF16PlusULP = -highestPositiveSubnormalF16MinusULP;
+constexpr float lowestNegativeSubnormalF16MinusULP = -highestPositiveSubnormalF16PlusULP;
+constexpr float highestNegativeSubnormalF16 = -lowestPositiveSubnormalF16;
+constexpr float highestNegativeSubnormalF16PlusULP = -lowestPositiveSubnormalF16MinusULP;
+constexpr float highestNegativeSubnormalF16MinusULP = -lowestPositiveSubnormalF16PlusULP;

-    constexpr float highestNegativeNormalF16 = -lowestPositiveNormalF16;
-    constexpr float highestNegativeNormalF16PlusULP = -lowestPositiveNormalF16MinusULP;
-    constexpr float highestNegativeNormalF16MinusULP = -lowestPositiveNormalF16PlusULP;
-    constexpr float lowestNegativeSubnormalF16 = -highestPositiveSubnormalF16;
-    constexpr float lowestNegativeSubnormalF16PlusULP = -highestPositiveSubnormalF16MinusULP;
-    constexpr float lowestNegativeSubnormalF16MinusULP = -highestPositiveSubnormalF16PlusULP;
-    constexpr float highestNegativeSubnormalF16 = -lowestPositiveSubnormalF16;
-    constexpr float highestNegativeSubnormalF16PlusULP = -lowestPositiveSubnormalF16MinusULP;
-    constexpr float highestNegativeSubnormalF16MinusULP = -lowestPositiveSubnormalF16PlusULP;
+constexpr uint16_t highestNegativeNormalF16Bits = 0x8400u;     // 0x0400u with the sign bit set
+constexpr uint16_t lowestNegativeSubnormalF16Bits = 0x83ffu;   // 0x03ffu with the sign bit set
+constexpr uint16_t highestNegativeSubnormalF16Bits = 0x8001u;  // 0x0001u with the sign bit set

-    // Value larger than or equal to lowest positive normal f16 will be quantized to normal f16.
-    EXPECT_EQ(f16(lowestPositiveNormalF16PlusULP), lowestPositiveNormalF16);
-    EXPECT_EQ(f16(lowestPositiveNormalF16), lowestPositiveNormalF16);
-    // Positive value smaller than lowest positive normal f16 but not smaller than lowest positive
-    // subnormal f16 will be quantized to subnormal f16 or zero.
-    EXPECT_EQ(f16(lowestPositiveNormalF16MinusULP), highestPositiveSubnormalF16);
-    EXPECT_EQ(f16(highestPositiveSubnormalF16PlusULP), highestPositiveSubnormalF16);
-    EXPECT_EQ(f16(highestPositiveSubnormalF16), highestPositiveSubnormalF16);
-    EXPECT_EQ(f16(highestPositiveSubnormalF16MinusULP), 0x0.ff8p-14);
-    EXPECT_EQ(f16(lowestPositiveSubnormalF16PlusULP), lowestPositiveSubnormalF16);
-    EXPECT_EQ(f16(lowestPositiveSubnormalF16), lowestPositiveSubnormalF16);
-    // Positive value smaller than lowest positive subnormal f16 will be quantized to zero.
-    EXPECT_EQ(f16(lowestPositiveSubnormalF16MinusULP), 0.0);
-    // Test the mantissa discarding, the least significant mantissa bit is 0x1p-24 = 0x0.004p-14.
-    EXPECT_EQ(f16(0x0.064p-14), 0x0.064p-14);
-    EXPECT_EQ(f16(0x0.067fecp-14), 0x0.064p-14);
-    EXPECT_EQ(f16(0x0.063ffep-14), 0x0.060p-14);
-    EXPECT_EQ(f16(0x0.008p-14), 0x0.008p-14);
-    EXPECT_EQ(f16(0x0.00bffep-14), 0x0.008p-14);
-    EXPECT_EQ(f16(0x0.007ffep-14), 0x0.004p-14);
+constexpr float f32_nan = std::numeric_limits<float>::quiet_NaN();  // float NaN used in the case table
+constexpr float f32_inf = std::numeric_limits<float>::infinity();   // float +Inf; negated for -Inf cases

-    // Vice versa for negative cases.
-    EXPECT_EQ(f16(highestNegativeNormalF16MinusULP), highestNegativeNormalF16);
-    EXPECT_EQ(f16(highestNegativeNormalF16), highestNegativeNormalF16);
-    EXPECT_EQ(f16(highestNegativeNormalF16PlusULP), lowestNegativeSubnormalF16);
-    EXPECT_EQ(f16(lowestNegativeSubnormalF16MinusULP), lowestNegativeSubnormalF16);
-    EXPECT_EQ(f16(lowestNegativeSubnormalF16), lowestNegativeSubnormalF16);
-    EXPECT_EQ(f16(lowestNegativeSubnormalF16PlusULP), -0x0.ff8p-14);
-    EXPECT_EQ(f16(highestNegativeSubnormalF16MinusULP), highestNegativeSubnormalF16);
-    EXPECT_EQ(f16(highestNegativeSubnormalF16), highestNegativeSubnormalF16);
-    EXPECT_EQ(f16(highestNegativeSubnormalF16PlusULP), -0.0);
-    // Test the mantissa discarding.
-    EXPECT_EQ(f16(-0x0.064p-14), -0x0.064p-14);
-    EXPECT_EQ(f16(-0x0.067fecp-14), -0x0.064p-14);
-    EXPECT_EQ(f16(-0x0.063ffep-14), -0x0.060p-14);
-    EXPECT_EQ(f16(-0x0.008p-14), -0x0.008p-14);
-    EXPECT_EQ(f16(-0x0.00bffep-14), -0x0.008p-14);
-    EXPECT_EQ(f16(-0x0.007ffep-14), -0x0.004p-14);
+struct F16TestCase {  // One row of the table shared by the QuantizeF16 and BitsRepresentation tests.
+    float input_value;         // float passed to the f16 constructor
+    float quantized_value;     // value the constructed f16 is expected to compare equal to
+    uint16_t f16_bit_pattern;  // expected result of BitsRepresentation() on the constructed f16
+};
+
+using NumberF16Test = testing::TestWithParam<F16TestCase>;  // fixture parameterized by F16TestCase
+
+TEST_P(NumberF16Test, QuantizeF16) {  // f16(input_value) must equal the expected quantized value
+    float input_value = GetParam().input_value;
+    float quantized_value = GetParam().quantized_value;
+
+    std::stringstream ss;
+    ss << "input value = " << input_value << ", expected quantized value = " << quantized_value;
+    SCOPED_TRACE(ss.str());  // attach the case description to any failure below
+
+    if (std::isnan(quantized_value)) {  // NaN never compares equal, so check with isnan instead
+        EXPECT_TRUE(std::isnan(f16(input_value)));
+    } else {
+        EXPECT_EQ(f16(input_value), quantized_value);
+    }
 }

+TEST_P(NumberF16Test, BitsRepresentation) {  // f16(input_value) must encode to the expected bits
+    float input_value = GetParam().input_value;
+    uint16_t representation = GetParam().f16_bit_pattern;
+
+    std::stringstream ss;
+    ss << "input value = " << input_value
+       << ", expected binary16 bits representation = " << std::hex << std::showbase
+       << representation;
+    SCOPED_TRACE(ss.str());  // attach the case description to any failure below
+
+    EXPECT_EQ(f16(input_value).BitsRepresentation(), representation);
+}
+
+INSTANTIATE_TEST_SUITE_P(  // rows are {input float, expected quantized float, expected binary16 bits}
+    NumberF16Test,
+    NumberF16Test,
+    testing::ValuesIn(std::vector<F16TestCase>{
+        // NaN, Inf
+        {f32_inf, f32_inf, 0x7c00u},
+        {-f32_inf, -f32_inf, 0xfc00u},
+        {f32_nan, f32_nan, 0x7e00u},
+        {-f32_nan, -f32_nan, 0x7e00u},  // same pattern as +NaN: the sign of a NaN is not encoded
+        // +/- zero
+        {+0.0f, 0.0f, 0x0000u},
+        {-0.0f, -0.0f, 0x8000u},
+        // Value in normal f16 range
+        {1.0f, 1.0f, 0x3c00u},
+        {-1.0f, -1.0f, 0xbc00u},
+        //   0.00006106496 quantized to 0.000061035156 = 0x1p-14
+        {0.00006106496f, 0.000061035156f, 0x0400u},
+        {-0.00006106496f, -0.000061035156f, 0x8400u},
+        //   1.0004883 quantized to 1.0 = 0x1p0
+        {1.0004883f, 1.0f, 0x3c00u},
+        {-1.0004883f, -1.0f, 0xbc00u},
+        //   8196.0 quantized to 8192.0 = 0x1p13
+        {8196.0f, 8192.f, 0x7000u},
+        {-8196.0f, -8192.f, 0xf000u},
+        // Value in subnormal f16 range
+        {0x0.034p-14f, 0x0.034p-14f, 0x000du},
+        {-0x0.034p-14f, -0x0.034p-14f, 0x800du},
+        {0x0.068p-14f, 0x0.068p-14f, 0x001au},
+        {-0x0.068p-14f, -0x0.068p-14f, 0x801au},
+        //   0x0.06b7p-14 quantized to 0x0.068p-14
+        {0x0.06b7p-14f, 0x0.068p-14f, 0x001au},
+        {-0x0.06b7p-14f, -0x0.068p-14, 0x801au},  // NOTE(review): expected value lacks the `f` suffix
+        // Value out of f16 range
+        {65504.003f, f32_inf, 0x7c00u},
+        {-65504.003f, -f32_inf, 0xfc00u},
+        {0x1.234p56f, f32_inf, 0x7c00u},
+        {-0x4.321p65f, -f32_inf, 0xfc00u},
+
+        // Test for subnormal quantization.
+        // Value larger than or equal to lowest positive normal f16 will be quantized to normal f16.
+        {lowestPositiveNormalF16PlusULP, lowestPositiveNormalF16, lowestPositiveNormalF16Bits},
+        {lowestPositiveNormalF16, lowestPositiveNormalF16, lowestPositiveNormalF16Bits},
+        // Positive values that are smaller than the lowest positive normal f16, but not smaller
+        // than the lowest positive subnormal f16, will be quantized to subnormal f16 or zero
+        // (every case listed in this group expects a subnormal f16 result).
+        {lowestPositiveNormalF16MinusULP, highestPositiveSubnormalF16,
+         highestPositiveSubnormalF16Bits},
+        {highestPositiveSubnormalF16PlusULP, highestPositiveSubnormalF16,
+         highestPositiveSubnormalF16Bits},
+        {highestPositiveSubnormalF16, highestPositiveSubnormalF16, highestPositiveSubnormalF16Bits},
+        {highestPositiveSubnormalF16MinusULP, 0x0.ff8p-14, 0x03feu},
+        {lowestPositiveSubnormalF16PlusULP, lowestPositiveSubnormalF16,
+         lowestPositiveSubnormalF16Bits},
+        {lowestPositiveSubnormalF16, lowestPositiveSubnormalF16, lowestPositiveSubnormalF16Bits},
+        // Positive value smaller than lowest positive subnormal f16 will be quantized to zero.
+        {lowestPositiveSubnormalF16MinusULP, 0.0, 0x0000u},
+        // Test the mantissa discarding, the least significant mantissa bit is 0x1p-24 =
+        // 0x0.004p-14.
+        {0x0.064p-14f, 0x0.064p-14, 0x0019u},
+        {0x0.067fecp-14f, 0x0.064p-14, 0x0019u},
+        {0x0.063ffep-14f, 0x0.060p-14, 0x0018u},
+        {0x0.008p-14f, 0x0.008p-14, 0x0002u},
+        {0x0.00bffep-14f, 0x0.008p-14, 0x0002u},
+        {0x0.007ffep-14f, 0x0.004p-14, 0x0001u},
+
+        // Vice versa for negative cases.
+        {highestNegativeNormalF16MinusULP, highestNegativeNormalF16, highestNegativeNormalF16Bits},
+        {highestNegativeNormalF16, highestNegativeNormalF16, highestNegativeNormalF16Bits},
+        {highestNegativeNormalF16PlusULP, lowestNegativeSubnormalF16,
+         lowestNegativeSubnormalF16Bits},
+        {lowestNegativeSubnormalF16MinusULP, lowestNegativeSubnormalF16,
+         lowestNegativeSubnormalF16Bits},
+        {lowestNegativeSubnormalF16, lowestNegativeSubnormalF16, lowestNegativeSubnormalF16Bits},
+        {lowestNegativeSubnormalF16PlusULP, -0x0.ff8p-14, 0x83feu},
+        {highestNegativeSubnormalF16MinusULP, highestNegativeSubnormalF16,
+         highestNegativeSubnormalF16Bits},
+        {highestNegativeSubnormalF16, highestNegativeSubnormalF16, highestNegativeSubnormalF16Bits},
+        {highestNegativeSubnormalF16PlusULP, -0.0, 0x8000u},
+        // Test the mantissa discarding.
+        {-0x0.064p-14f, -0x0.064p-14, 0x8019u},
+        {-0x0.067fecp-14f, -0x0.064p-14, 0x8019u},
+        {-0x0.063ffep-14f, -0x0.060p-14, 0x8018u},
+        {-0x0.008p-14f, -0x0.008p-14, 0x8002u},
+        {-0x0.00bffep-14f, -0x0.008p-14, 0x8002u},
+        {-0x0.007ffep-14f, -0x0.004p-14, 0x8001u},
+        /////////////////////////////////////
+    }));
+
 using BinaryCheckedCase = std::tuple<std::optional<AInt>, AInt, AInt>;

 #undef OVERFLOW  // corecrt_math.h :(