diff --git a/src/tint/number.cc b/src/tint/number.cc index 17b005b8e2..6371442f48 100644 --- a/src/tint/number.cc +++ b/src/tint/number.cc @@ -204,4 +204,108 @@ f16::type f16::Quantize(f16::type value) { return value; } +uint16_t f16::BitsRepresentation() const { + constexpr uint16_t f16_nan = 0x7e00u; + constexpr uint16_t f16_pos_inf = 0x7c00u; + constexpr uint16_t f16_neg_inf = 0xfc00u; + + // Assert we use binary32 (i.e. float) as the underlying type, which has 4 bytes. + static_assert(std::is_same()); + + // The stored value in f16 object must be already quantized, so it should be either NaN, +/- + // Inf, or exactly representable by normal or subnormal f16. + + if (std::isnan(value)) { + return f16_nan; + } + + if (std::isinf(value)) { + return value > 0 ? f16_pos_inf : f16_neg_inf; + } + + // Now value must be a finite, exactly-representable f16 value. + // The following table shows exponent cases for all finite f16 exactly-representable values. + // --------------------------------------------------------------------------- + // | Value category | Unbiased exp | F16 biased exp | F32 biased exp | + // |------------------|----------------|------------------|------------------| + // | +/- zero | \ | 0 | 0 | + // | Subnormal f16 | [-24, -15] | 0 | [103, 112] | + // | Normal f16 | [-14, 15] | [1, 30] | [113, 142] | + // --------------------------------------------------------------------------- + + constexpr uint32_t max_f32_biased_exp_for_f16_normal_number = 142; + constexpr uint32_t min_f32_biased_exp_for_f16_normal_number = 113; + constexpr uint32_t max_f32_biased_exp_for_f16_subnormal_number = 112; + constexpr uint32_t min_f32_biased_exp_for_f16_subnormal_number = 103; + + constexpr uint32_t f32_sign_mask = 0x80000000u; + constexpr uint32_t f32_exp_mask = 0x7f800000u; + constexpr uint32_t f32_mantissa_mask = 0x007fffffu; + constexpr uint32_t f32_mantissa_bis_number = 23; + constexpr uint32_t f32_exp_bias = 127; + + constexpr uint16_t f16_sign_mask = 
0x8000u; + constexpr uint16_t f16_exp_mask = 0x7c00u; + constexpr uint16_t f16_mantissa_mask = 0x03ffu; + constexpr uint32_t f16_mantissa_bis_number = 10; + constexpr uint32_t f16_exp_bias = 15; + + uint32_t f32_bit_pattern; + memcpy(&f32_bit_pattern, &value, 4); + uint32_t f32_biased_exponent = (f32_bit_pattern & f32_exp_mask) >> f32_mantissa_bis_number; + uint32_t f32_mantissa = f32_bit_pattern & f32_mantissa_mask; + + uint16_t f16_sign_part = static_cast((f32_bit_pattern & f32_sign_mask) >> 16); + TINT_ASSERT(Semantic, (f16_sign_part & ~f16_sign_mask) == 0); + + if ((f32_bit_pattern & ~f32_sign_mask) == 0) { + // +/- zero + return f16_sign_part; + } + + if ((min_f32_biased_exp_for_f16_normal_number <= f32_biased_exponent) && + (f32_biased_exponent <= max_f32_biased_exp_for_f16_normal_number)) { + // Normal f16 + uint32_t f16_biased_exponent = f32_biased_exponent - f32_exp_bias + f16_exp_bias; + uint16_t f16_exp_part = + static_cast(f16_biased_exponent << f16_mantissa_bis_number); + uint16_t f16_mantissa_part = static_cast( + f32_mantissa >> (f32_mantissa_bis_number - f16_mantissa_bis_number)); + + TINT_ASSERT(Semantic, (f16_exp_part & ~f16_exp_mask) == 0); + TINT_ASSERT(Semantic, (f16_mantissa_part & ~f16_mantissa_mask) == 0); + + return f16_sign_part | f16_exp_part | f16_mantissa_part; + } + + if ((min_f32_biased_exp_for_f16_subnormal_number <= f32_biased_exponent) && + (f32_biased_exponent <= max_f32_biased_exp_for_f16_subnormal_number)) { + // Subnormal f16 + // The resulting exp bits are always 0, and the mantissa bits should be handled specially. + uint16_t f16_exp_part = 0; + // The resulting subnormal f16 will have only 1 valid mantissa bit if the unbiased exponent + // of value is the minimum, i.e. -24, and have all 10 mantissa bits valid if the unbiased + // exponent of value is the maximum, i.e. -15. 
+ uint32_t f16_valid_mantissa_bits = + f32_biased_exponent - min_f32_biased_exp_for_f16_subnormal_number + 1; + // The resulting f16 mantissa part comes from right-shifting the f32 mantissa bits with + // the implicit leading 1 added. + uint16_t f16_mantissa_part = + static_cast((f32_mantissa | (f32_mantissa_mask + 1)) >> + (f32_mantissa_bis_number + 1 - f16_valid_mantissa_bits)); + + TINT_ASSERT(Semantic, (1 <= f16_valid_mantissa_bits) && + (f16_valid_mantissa_bits <= f16_mantissa_bis_number)); + TINT_ASSERT(Semantic, (f16_mantissa_part & ~((1u << f16_valid_mantissa_bits) - 1)) == 0); + TINT_ASSERT(Semantic, (f16_mantissa_part != 0)); + + return f16_sign_part | f16_exp_part | f16_mantissa_part; + } + + // Neither zero, subnormal f16 nor normal f16; this should never be reached. + tint::diag::List diag; + TINT_UNREACHABLE(Semantic, diag); + return f16_nan; +} + } // namespace tint diff --git a/src/tint/number.h b/src/tint/number.h index e1300193c3..32cca596ee 100644 --- a/src/tint/number.h +++ b/src/tint/number.h @@ -186,6 +186,13 @@ struct Number { return *this; } + /// Get the binary16 bit pattern in type uint16_t of this value. + /// @returns the binary16 bit pattern, in type uint16_t, of the stored quantized f16 value. If + /// the value is NaN, the returned value will be 0x7e00u. If the value is positive infinity, the + /// returned value will be 0x7c00u. If the value is negative infinity, the returned value + /// will be 0xfc00u. + uint16_t BitsRepresentation() const; + /// @param value the input float32 value /// @returns the float32 value quantized to the smaller float16 value, through truncation of the /// mantissa bits (no rounding). 
If the float32 value is too large (positive or negative) to be diff --git a/src/tint/number_test.cc b/src/tint/number_test.cc index 81acc0489c..eeb31edacd 100644 --- a/src/tint/number_test.cc +++ b/src/tint/number_test.cc @@ -217,83 +217,164 @@ TEST(NumberTest, CheckedConvertSubnormals) { EXPECT_EQ(CheckedConvert(AFloat(-kHighestF16Subnormal)), f16(-kHighestF16Subnormal)); } -TEST(NumberTest, QuantizeF16) { - constexpr float nan = std::numeric_limits::quiet_NaN(); - constexpr float inf = std::numeric_limits::infinity(); +// Test cases for f16 subnormal quantization and BitsRepresentation. +// The ULP is based on float rather than double or f16, since f16::Quantize takes a float and +// f16::BitsRepresentation reads the stored float value. +constexpr float lowestPositiveNormalF16 = 0x1p-14; +constexpr float lowestPositiveNormalF16PlusULP = 0x1.000002p-14; +constexpr float lowestPositiveNormalF16MinusULP = 0x1.fffffep-15; +constexpr float highestPositiveSubnormalF16 = 0x0.ffcp-14; +constexpr float highestPositiveSubnormalF16PlusULP = 0x1.ff8002p-15; +constexpr float highestPositiveSubnormalF16MinusULP = 0x1.ff7ffep-15; +constexpr float lowestPositiveSubnormalF16 = 0x1.p-24; +constexpr float lowestPositiveSubnormalF16PlusULP = 0x1.000002p-24; +constexpr float lowestPositiveSubnormalF16MinusULP = 0x1.fffffep-25; - EXPECT_EQ(f16(0.0), 0.0f); - EXPECT_EQ(f16(1.0), 1.0f); - EXPECT_EQ(f16(0.00006106496), 0.000061035156f); - EXPECT_EQ(f16(1.0004883), 1.0f); - EXPECT_EQ(f16(-8196), -8192.f); - EXPECT_EQ(f16(65504.003), inf); - EXPECT_EQ(f16(-65504.003), -inf); - EXPECT_EQ(f16(inf), inf); - EXPECT_EQ(f16(-inf), -inf); - EXPECT_TRUE(std::isnan(f16(nan))); +constexpr uint16_t lowestPositiveNormalF16Bits = 0x0400u; +constexpr uint16_t highestPositiveSubnormalF16Bits = 0x03ffu; +constexpr uint16_t lowestPositiveSubnormalF16Bits = 0x0001u; - // Test for subnormal quantization. - // The ULP is based on float rather than double or f16, since F16::Quantize take float as input. 
- constexpr float lowestPositiveNormalF16 = 0x1p-14; - constexpr float lowestPositiveNormalF16PlusULP = 0x1.000002p-14; - constexpr float lowestPositiveNormalF16MinusULP = 0x1.fffffep-15; - constexpr float highestPositiveSubnormalF16 = 0x0.ffcp-14; - constexpr float highestPositiveSubnormalF16PlusULP = 0x1.ff8002p-15; - constexpr float highestPositiveSubnormalF16MinusULP = 0x1.ff7ffep-15; - constexpr float lowestPositiveSubnormalF16 = 0x1.p-24; - constexpr float lowestPositiveSubnormalF16PlusULP = 0x1.000002p-24; - constexpr float lowestPositiveSubnormalF16MinusULP = 0x1.fffffep-25; +constexpr float highestNegativeNormalF16 = -lowestPositiveNormalF16; +constexpr float highestNegativeNormalF16PlusULP = -lowestPositiveNormalF16MinusULP; +constexpr float highestNegativeNormalF16MinusULP = -lowestPositiveNormalF16PlusULP; +constexpr float lowestNegativeSubnormalF16 = -highestPositiveSubnormalF16; +constexpr float lowestNegativeSubnormalF16PlusULP = -highestPositiveSubnormalF16MinusULP; +constexpr float lowestNegativeSubnormalF16MinusULP = -highestPositiveSubnormalF16PlusULP; +constexpr float highestNegativeSubnormalF16 = -lowestPositiveSubnormalF16; +constexpr float highestNegativeSubnormalF16PlusULP = -lowestPositiveSubnormalF16MinusULP; +constexpr float highestNegativeSubnormalF16MinusULP = -lowestPositiveSubnormalF16PlusULP; - constexpr float highestNegativeNormalF16 = -lowestPositiveNormalF16; - constexpr float highestNegativeNormalF16PlusULP = -lowestPositiveNormalF16MinusULP; - constexpr float highestNegativeNormalF16MinusULP = -lowestPositiveNormalF16PlusULP; - constexpr float lowestNegativeSubnormalF16 = -highestPositiveSubnormalF16; - constexpr float lowestNegativeSubnormalF16PlusULP = -highestPositiveSubnormalF16MinusULP; - constexpr float lowestNegativeSubnormalF16MinusULP = -highestPositiveSubnormalF16PlusULP; - constexpr float highestNegativeSubnormalF16 = -lowestPositiveSubnormalF16; - constexpr float highestNegativeSubnormalF16PlusULP = 
-lowestPositiveSubnormalF16MinusULP; - constexpr float highestNegativeSubnormalF16MinusULP = -lowestPositiveSubnormalF16PlusULP; +constexpr uint16_t highestNegativeNormalF16Bits = 0x8400u; +constexpr uint16_t lowestNegativeSubnormalF16Bits = 0x83ffu; +constexpr uint16_t highestNegativeSubnormalF16Bits = 0x8001u; - // Value larger than or equal to lowest positive normal f16 will be quantized to normal f16. - EXPECT_EQ(f16(lowestPositiveNormalF16PlusULP), lowestPositiveNormalF16); - EXPECT_EQ(f16(lowestPositiveNormalF16), lowestPositiveNormalF16); - // Positive value smaller than lowest positive normal f16 but not smaller than lowest positive - // subnormal f16 will be quantized to subnormal f16 or zero. - EXPECT_EQ(f16(lowestPositiveNormalF16MinusULP), highestPositiveSubnormalF16); - EXPECT_EQ(f16(highestPositiveSubnormalF16PlusULP), highestPositiveSubnormalF16); - EXPECT_EQ(f16(highestPositiveSubnormalF16), highestPositiveSubnormalF16); - EXPECT_EQ(f16(highestPositiveSubnormalF16MinusULP), 0x0.ff8p-14); - EXPECT_EQ(f16(lowestPositiveSubnormalF16PlusULP), lowestPositiveSubnormalF16); - EXPECT_EQ(f16(lowestPositiveSubnormalF16), lowestPositiveSubnormalF16); - // Positive value smaller than lowest positive subnormal f16 will be quantized to zero. - EXPECT_EQ(f16(lowestPositiveSubnormalF16MinusULP), 0.0); - // Test the mantissa discarding, the least significant mantissa bit is 0x1p-24 = 0x0.004p-14. - EXPECT_EQ(f16(0x0.064p-14), 0x0.064p-14); - EXPECT_EQ(f16(0x0.067fecp-14), 0x0.064p-14); - EXPECT_EQ(f16(0x0.063ffep-14), 0x0.060p-14); - EXPECT_EQ(f16(0x0.008p-14), 0x0.008p-14); - EXPECT_EQ(f16(0x0.00bffep-14), 0x0.008p-14); - EXPECT_EQ(f16(0x0.007ffep-14), 0x0.004p-14); +constexpr float f32_nan = std::numeric_limits::quiet_NaN(); +constexpr float f32_inf = std::numeric_limits::infinity(); - // Vice versa for negative cases. 
- EXPECT_EQ(f16(highestNegativeNormalF16MinusULP), highestNegativeNormalF16); - EXPECT_EQ(f16(highestNegativeNormalF16), highestNegativeNormalF16); - EXPECT_EQ(f16(highestNegativeNormalF16PlusULP), lowestNegativeSubnormalF16); - EXPECT_EQ(f16(lowestNegativeSubnormalF16MinusULP), lowestNegativeSubnormalF16); - EXPECT_EQ(f16(lowestNegativeSubnormalF16), lowestNegativeSubnormalF16); - EXPECT_EQ(f16(lowestNegativeSubnormalF16PlusULP), -0x0.ff8p-14); - EXPECT_EQ(f16(highestNegativeSubnormalF16MinusULP), highestNegativeSubnormalF16); - EXPECT_EQ(f16(highestNegativeSubnormalF16), highestNegativeSubnormalF16); - EXPECT_EQ(f16(highestNegativeSubnormalF16PlusULP), -0.0); - // Test the mantissa discarding. - EXPECT_EQ(f16(-0x0.064p-14), -0x0.064p-14); - EXPECT_EQ(f16(-0x0.067fecp-14), -0x0.064p-14); - EXPECT_EQ(f16(-0x0.063ffep-14), -0x0.060p-14); - EXPECT_EQ(f16(-0x0.008p-14), -0x0.008p-14); - EXPECT_EQ(f16(-0x0.00bffep-14), -0x0.008p-14); - EXPECT_EQ(f16(-0x0.007ffep-14), -0x0.004p-14); +struct F16TestCase { + float input_value; + float quantized_value; + uint16_t f16_bit_pattern; +}; + +using NumberF16Test = testing::TestWithParam; + +TEST_P(NumberF16Test, QuantizeF16) { + float input_value = GetParam().input_value; + float quantized_value = GetParam().quantized_value; + + std::stringstream ss; + ss << "input value = " << input_value << ", expected quantized value = " << quantized_value; + SCOPED_TRACE(ss.str()); + + if (std::isnan(quantized_value)) { + EXPECT_TRUE(std::isnan(f16(input_value))); + } else { + EXPECT_EQ(f16(input_value), quantized_value); + } } +TEST_P(NumberF16Test, BitsRepresentation) { + float input_value = GetParam().input_value; + uint16_t representation = GetParam().f16_bit_pattern; + + std::stringstream ss; + ss << "input value = " << input_value + << ", expected binary16 bits representation = " << std::hex << std::showbase + << representation; + SCOPED_TRACE(ss.str()); + + EXPECT_EQ(f16(input_value).BitsRepresentation(), representation); +} + 
+INSTANTIATE_TEST_SUITE_P( + NumberF16Test, + NumberF16Test, + testing::ValuesIn(std::vector{ + // NaN, Inf + {f32_inf, f32_inf, 0x7c00u}, + {-f32_inf, -f32_inf, 0xfc00u}, + {f32_nan, f32_nan, 0x7e00u}, + {-f32_nan, -f32_nan, 0x7e00u}, + // +/- zero + {+0.0f, 0.0f, 0x0000u}, + {-0.0f, -0.0f, 0x8000u}, + // Value in normal f16 range + {1.0f, 1.0f, 0x3c00u}, + {-1.0f, -1.0f, 0xbc00u}, + // 0.00006106496 quantized to 0.000061035156 = 0x1p-14 + {0.00006106496f, 0.000061035156f, 0x0400u}, + {-0.00006106496f, -0.000061035156f, 0x8400u}, + // 1.0004883 quantized to 1.0 = 0x1p0 + {1.0004883f, 1.0f, 0x3c00u}, + {-1.0004883f, -1.0f, 0xbc00u}, + // 8196.0 quantized to 8192.0 = 0x1p13 + {8196.0f, 8192.f, 0x7000u}, + {-8196.0f, -8192.f, 0xf000u}, + // Value in subnormal f16 range + {0x0.034p-14f, 0x0.034p-14f, 0x000du}, + {-0x0.034p-14f, -0x0.034p-14f, 0x800du}, + {0x0.068p-14f, 0x0.068p-14f, 0x001au}, + {-0x0.068p-14f, -0x0.068p-14f, 0x801au}, + // 0x0.06b7p-14 quantized to 0x0.068p-14 + {0x0.06b7p-14f, 0x0.068p-14f, 0x001au}, + {-0x0.06b7p-14f, -0x0.068p-14, 0x801au}, + // Value out of f16 range + {65504.003f, f32_inf, 0x7c00u}, + {-65504.003f, -f32_inf, 0xfc00u}, + {0x1.234p56f, f32_inf, 0x7c00u}, + {-0x4.321p65f, -f32_inf, 0xfc00u}, + + // Test for subnormal quantization. + // Value larger than or equal to lowest positive normal f16 will be quantized to normal f16. + {lowestPositiveNormalF16PlusULP, lowestPositiveNormalF16, lowestPositiveNormalF16Bits}, + {lowestPositiveNormalF16, lowestPositiveNormalF16, lowestPositiveNormalF16Bits}, + // A positive value smaller than the lowest positive normal f16 but not smaller than the + // lowest positive subnormal f16 will be quantized to a subnormal f16, truncating the + // mantissa bits that do not fit. 
+ {lowestPositiveNormalF16MinusULP, highestPositiveSubnormalF16, + highestPositiveSubnormalF16Bits}, + {highestPositiveSubnormalF16PlusULP, highestPositiveSubnormalF16, + highestPositiveSubnormalF16Bits}, + {highestPositiveSubnormalF16, highestPositiveSubnormalF16, highestPositiveSubnormalF16Bits}, + {highestPositiveSubnormalF16MinusULP, 0x0.ff8p-14, 0x03feu}, + {lowestPositiveSubnormalF16PlusULP, lowestPositiveSubnormalF16, + lowestPositiveSubnormalF16Bits}, + {lowestPositiveSubnormalF16, lowestPositiveSubnormalF16, lowestPositiveSubnormalF16Bits}, + // Positive value smaller than lowest positive subnormal f16 will be quantized to zero. + {lowestPositiveSubnormalF16MinusULP, 0.0, 0x0000u}, + // Test the mantissa discarding; the least significant mantissa bit is 0x1p-24 = + // 0x0.004p-14. + {0x0.064p-14f, 0x0.064p-14, 0x0019u}, + {0x0.067fecp-14f, 0x0.064p-14, 0x0019u}, + {0x0.063ffep-14f, 0x0.060p-14, 0x0018u}, + {0x0.008p-14f, 0x0.008p-14, 0x0002u}, + {0x0.00bffep-14f, 0x0.008p-14, 0x0002u}, + {0x0.007ffep-14f, 0x0.004p-14, 0x0001u}, + + // Vice versa for negative cases. + {highestNegativeNormalF16MinusULP, highestNegativeNormalF16, highestNegativeNormalF16Bits}, + {highestNegativeNormalF16, highestNegativeNormalF16, highestNegativeNormalF16Bits}, + {highestNegativeNormalF16PlusULP, lowestNegativeSubnormalF16, + lowestNegativeSubnormalF16Bits}, + {lowestNegativeSubnormalF16MinusULP, lowestNegativeSubnormalF16, + lowestNegativeSubnormalF16Bits}, + {lowestNegativeSubnormalF16, lowestNegativeSubnormalF16, lowestNegativeSubnormalF16Bits}, + {lowestNegativeSubnormalF16PlusULP, -0x0.ff8p-14, 0x83feu}, + {highestNegativeSubnormalF16MinusULP, highestNegativeSubnormalF16, + highestNegativeSubnormalF16Bits}, + {highestNegativeSubnormalF16, highestNegativeSubnormalF16, highestNegativeSubnormalF16Bits}, + {highestNegativeSubnormalF16PlusULP, -0.0, 0x8000u}, + // Test the mantissa discarding. 
+ {-0x0.064p-14f, -0x0.064p-14, 0x8019u}, + {-0x0.067fecp-14f, -0x0.064p-14, 0x8019u}, + {-0x0.063ffep-14f, -0x0.060p-14, 0x8018u}, + {-0x0.008p-14f, -0x0.008p-14, 0x8002u}, + {-0x0.00bffep-14f, -0x0.008p-14, 0x8002u}, + {-0x0.007ffep-14f, -0x0.004p-14, 0x8001u}, + ///////////////////////////////////// + })); + using BinaryCheckedCase = std::tuple, AInt, AInt>; #undef OVERFLOW // corecrt_math.h :(