transform/VertexPulling: Implement remaining work

Implement missing formats. Implement vector width conversions. Implement unaligned loads. Bug: dawn:805 Change-Id: I89724b3027c637c99999c8ecdbf0d8ca4f571afc Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/56062 Commit-Queue: Ben Clayton <bclayton@google.com> Kokoro: Kokoro <noreply+kokoro@google.com> Auto-Submit: Ben Clayton <bclayton@google.com> Reviewed-by: Corentin Wallez <cwallez@chromium.org>
2025-08-17 17:31:47 +00:00 · 2021-06-29 12:37:45 +00:00 · 2021-06-29 12:37:45 +00:00 · d960328f07
commit d960328f07
parent f2ec7f38e5
7 changed files with 1293 additions and 230 deletions
--- a/fuzzers/tint_common_fuzzer.cc
+++ b/fuzzers/tint_common_fuzzer.cc
@ -51,7 +51,7 @@ transform::VertexAttributeDescriptor ExtractVertexAttributeDescriptor(
  transform::VertexAttributeDescriptor desc;
  desc.format = r->enum_class<transform::VertexFormat>(
      static_cast<uint8_t>(transform::VertexFormat::kLastEntry) + 1);
-  desc.offset = r->read<uint64_t>();
+  desc.offset = r->read<uint32_t>();
  desc.shader_location = r->read<uint32_t>();
  return desc;
 }
@ -59,7 +59,7 @@ transform::VertexAttributeDescriptor ExtractVertexAttributeDescriptor(
 transform::VertexBufferLayoutDescriptor ExtractVertexBufferLayoutDescriptor(
    Reader* r) {
  transform::VertexBufferLayoutDescriptor desc;
-  desc.array_stride = r->read<uint64_t>();
+  desc.array_stride = r->read<uint32_t>();
  desc.step_mode = r->enum_class<transform::InputStepMode>(
      static_cast<uint8_t>(transform::InputStepMode::kLastEntry) + 1);
  desc.attributes = r->vector(ExtractVertexAttributeDescriptor);
--- a/src/program_builder.h
+++ b/src/program_builder.h
@ -25,6 +25,7 @@
 #include "src/ast/assignment_statement.h"
 #include "src/ast/atomic.h"
 #include "src/ast/binary_expression.h"
+#include "src/ast/bitcast_expression.h"
 #include "src/ast/bool.h"
 #include "src/ast/bool_literal.h"
 #include "src/ast/call_expression.h"
@ -1101,6 +1102,36 @@ class ProgramBuilder {
        source, type, ExprList(std::forward<ARGS>(args)...));
  }

+  /// @param expr the expression to bitcast
+  /// @return an `ast::BitcastExpression` that casts `expr` to the type `T`,
+  /// resolved with `ty.Of<T>()` and forwarded to `Bitcast(type, expr)`
+  template <typename T, typename EXPR>
+  ast::BitcastExpression* Bitcast(EXPR&& expr) {
+    return Bitcast(ty.Of<T>(), std::forward<EXPR>(expr));
+  }
+
+  /// @param type the type to cast to
+  /// @param expr the expression to bitcast, converted with `Expr()`
+  /// @return an `ast::BitcastExpression` that casts the converted `expr` to
+  /// `type`
+  template <typename EXPR>
+  ast::BitcastExpression* Bitcast(ast::Type* type, EXPR&& expr) {
+    return create<ast::BitcastExpression>(type, Expr(std::forward<EXPR>(expr)));
+  }
+
+  /// @param source the source information
+  /// @param type the type to cast to
+  /// @param expr the expression to bitcast, converted with `Expr()`
+  /// @return an `ast::BitcastExpression` with `source` that casts the converted
+  /// `expr` to `type`
+  template <typename EXPR>
+  ast::BitcastExpression* Bitcast(const Source& source,
+                                  ast::Type* type,
+                                  EXPR&& expr) {
+    return create<ast::BitcastExpression>(source, type,
+                                          Expr(std::forward<EXPR>(expr)));
+  }
+
  /// @param args the arguments for the vector constructor
  /// @param type the vector type
  /// @param size the vector size
@ -1503,6 +1534,16 @@ class ProgramBuilder {
                                         Expr(std::forward<RHS>(rhs)));
  }

+  /// @param lhs the left hand argument to the or operation, passed to Expr()
+  /// @param rhs the right hand argument to the or operation, passed to Expr()
+  /// @returns an `ast::BinaryExpression` bitwise or-ing `lhs` and `rhs`
+  template <typename LHS, typename RHS>
+  ast::BinaryExpression* Or(LHS&& lhs, RHS&& rhs) {
+    return create<ast::BinaryExpression>(ast::BinaryOp::kOr,
+                                         Expr(std::forward<LHS>(lhs)),
+                                         Expr(std::forward<RHS>(rhs)));
+  }
+
  /// @param lhs the left hand argument to the subtraction operation
  /// @param rhs the right hand argument to the subtraction operation
  /// @returns a `ast::BinaryExpression` subtracting `rhs` from `lhs`
@ -1544,6 +1585,26 @@ class ProgramBuilder {
                                         Expr(std::forward<RHS>(rhs)));
  }

+  /// @param lhs the value to shift right, converted with `Expr()`
+  /// @param rhs the number of bits to shift by, converted with `Expr()`
+  /// @returns an `ast::BinaryExpression` bit shifting right `lhs` by `rhs`
+  template <typename LHS, typename RHS>
+  ast::BinaryExpression* Shr(LHS&& lhs, RHS&& rhs) {
+    return create<ast::BinaryExpression>(ast::BinaryOp::kShiftRight,
+                                         Expr(std::forward<LHS>(lhs)),
+                                         Expr(std::forward<RHS>(rhs)));
+  }
+
+  /// @param lhs the value to shift left, converted with `Expr()`
+  /// @param rhs the number of bits to shift by, converted with `Expr()`
+  /// @returns an `ast::BinaryExpression` bit shifting left `lhs` by `rhs`
+  template <typename LHS, typename RHS>
+  ast::BinaryExpression* Shl(LHS&& lhs, RHS&& rhs) {
+    return create<ast::BinaryExpression>(ast::BinaryOp::kShiftLeft,
+                                         Expr(std::forward<LHS>(lhs)),
+                                         Expr(std::forward<RHS>(rhs)));
+  }
+
  /// @param source the source information
  /// @param arr the array argument for the array accessor expression
  /// @param idx the index argument for the array accessor expression
--- a/src/transform/vertex_pulling.cc
+++ b/src/transform/vertex_pulling.cc
@ -14,6 +14,7 @@

 #include "src/transform/vertex_pulling.h"

+#include <algorithm>
 #include <utility>

 #include "src/ast/assignment_statement.h"
@ -23,6 +24,7 @@
 #include "src/program_builder.h"
 #include "src/sem/variable.h"
 #include "src/utils/get_or_create.h"
+#include "src/utils/math.h"

 TINT_INSTANTIATE_TYPEINFO(tint::transform::VertexPulling);
 TINT_INSTANTIATE_TYPEINFO(tint::transform::VertexPulling::Config);
@ -32,6 +34,173 @@ namespace transform {

 namespace {

+/// The base (component) type of a vertex format or of a shader variable type.
+/// The full type is either this type (a scalar) or a vector of this type.
+enum class BaseType {
+  kInvalid,  // returned by DataTypeOf() for inputs it does not recognize
+  kU32,      // matches sem::U32
+  kI32,      // matches sem::I32
+  kF32,      // matches sem::F32
+};
+
+/// Writes the lower-case name of the BaseType (e.g. "u32") to the std::ostream.
+/// @param out the std::ostream to write to
+/// @param format the BaseType to write
+/// @returns out so calls can be chained
+std::ostream& operator<<(std::ostream& out, BaseType format) {
+  switch (format) {
+    case BaseType::kInvalid:
+      return out << "invalid";
+    case BaseType::kU32:
+      return out << "u32";
+    case BaseType::kI32:
+      return out << "i32";
+    case BaseType::kF32:
+      return out << "f32";
+  }
+  return out << "<unknown>";  // value is not one of the enumerators above
+}
+
+/// Writes the VertexFormat's lower-case name (e.g. "uint8x2") to the ostream.
+/// @param out the std::ostream to write to
+/// @param format the VertexFormat to write
+/// @returns out so calls can be chained
+std::ostream& operator<<(std::ostream& out, VertexFormat format) {
+  switch (format) {
+    case VertexFormat::kUint8x2:
+      return out << "uint8x2";
+    case VertexFormat::kUint8x4:
+      return out << "uint8x4";
+    case VertexFormat::kSint8x2:
+      return out << "sint8x2";
+    case VertexFormat::kSint8x4:
+      return out << "sint8x4";
+    case VertexFormat::kUnorm8x2:
+      return out << "unorm8x2";
+    case VertexFormat::kUnorm8x4:
+      return out << "unorm8x4";
+    case VertexFormat::kSnorm8x2:
+      return out << "snorm8x2";
+    case VertexFormat::kSnorm8x4:
+      return out << "snorm8x4";
+    case VertexFormat::kUint16x2:
+      return out << "uint16x2";
+    case VertexFormat::kUint16x4:
+      return out << "uint16x4";
+    case VertexFormat::kSint16x2:
+      return out << "sint16x2";
+    case VertexFormat::kSint16x4:
+      return out << "sint16x4";
+    case VertexFormat::kUnorm16x2:
+      return out << "unorm16x2";
+    case VertexFormat::kUnorm16x4:
+      return out << "unorm16x4";
+    case VertexFormat::kSnorm16x2:
+      return out << "snorm16x2";
+    case VertexFormat::kSnorm16x4:
+      return out << "snorm16x4";
+    case VertexFormat::kFloat16x2:
+      return out << "float16x2";
+    case VertexFormat::kFloat16x4:
+      return out << "float16x4";
+    case VertexFormat::kFloat32:
+      return out << "float32";
+    case VertexFormat::kFloat32x2:
+      return out << "float32x2";
+    case VertexFormat::kFloat32x3:
+      return out << "float32x3";
+    case VertexFormat::kFloat32x4:
+      return out << "float32x4";
+    case VertexFormat::kUint32:
+      return out << "uint32";
+    case VertexFormat::kUint32x2:
+      return out << "uint32x2";
+    case VertexFormat::kUint32x3:
+      return out << "uint32x3";
+    case VertexFormat::kUint32x4:
+      return out << "uint32x4";
+    case VertexFormat::kSint32:
+      return out << "sint32";
+    case VertexFormat::kSint32x2:
+      return out << "sint32x2";
+    case VertexFormat::kSint32x3:
+      return out << "sint32x3";
+    case VertexFormat::kSint32x4:
+      return out << "sint32x4";
+  }
+  return out << "<unknown>";  // value is not one of the cases above
+}
+
+/// The base type and width of a vertex format or of a shader variable type.
+struct DataType {
+  BaseType base_type;  // component type; kInvalid if unrecognized
+  uint32_t width;  // 0 if invalid, 1 for scalar, 2+ for a vector
+};
+
+DataType DataTypeOf(sem::Type* ty) {  // DataType of a semantic type
+  if (ty->Is<sem::I32>()) {
+    return {BaseType::kI32, 1};
+  }
+  if (ty->Is<sem::U32>()) {
+    return {BaseType::kU32, 1};
+  }
+  if (ty->Is<sem::F32>()) {
+    return {BaseType::kF32, 1};
+  }
+  if (auto* vec = ty->As<sem::Vector>()) {  // element base type, vector size
+    return {DataTypeOf(vec->type()).base_type, vec->size()};
+  }
+  return {BaseType::kInvalid, 0};  // any other type is not handled
+}
+
+DataType DataTypeOf(VertexFormat format) {  // DataType of a vertex format
+  switch (format) {
+    case VertexFormat::kUint32:
+      return {BaseType::kU32, 1};
+    case VertexFormat::kUint8x2:
+    case VertexFormat::kUint16x2:
+    case VertexFormat::kUint32x2:
+      return {BaseType::kU32, 2};  // 8, 16 and 32-bit uints all map to kU32
+    case VertexFormat::kUint32x3:
+      return {BaseType::kU32, 3};
+    case VertexFormat::kUint8x4:
+    case VertexFormat::kUint16x4:
+    case VertexFormat::kUint32x4:
+      return {BaseType::kU32, 4};
+    case VertexFormat::kSint32:
+      return {BaseType::kI32, 1};
+    case VertexFormat::kSint8x2:
+    case VertexFormat::kSint16x2:
+    case VertexFormat::kSint32x2:
+      return {BaseType::kI32, 2};  // 8, 16 and 32-bit sints all map to kI32
+    case VertexFormat::kSint32x3:
+      return {BaseType::kI32, 3};
+    case VertexFormat::kSint8x4:
+    case VertexFormat::kSint16x4:
+    case VertexFormat::kSint32x4:
+      return {BaseType::kI32, 4};
+    case VertexFormat::kFloat32:
+      return {BaseType::kF32, 1};
+    case VertexFormat::kUnorm8x2:
+    case VertexFormat::kSnorm8x2:
+    case VertexFormat::kUnorm16x2:
+    case VertexFormat::kSnorm16x2:
+    case VertexFormat::kFloat16x2:
+    case VertexFormat::kFloat32x2:
+      return {BaseType::kF32, 2};  // unorm, snorm and float16 map to kF32
+    case VertexFormat::kFloat32x3:
+      return {BaseType::kF32, 3};
+    case VertexFormat::kUnorm8x4:
+    case VertexFormat::kSnorm8x4:
+    case VertexFormat::kUnorm16x4:
+    case VertexFormat::kSnorm16x4:
+    case VertexFormat::kFloat16x4:
+    case VertexFormat::kFloat32x4:
+      return {BaseType::kF32, 4};
+  }
+  return {BaseType::kInvalid, 0};  // format is not one of the cases above
+}
+
 struct State {
  State(CloneContext& context, const VertexPulling::Config& c)
      : ctx(context), cfg(c) {}
@ -47,10 +216,14 @@ struct State {
    ast::Variable* to;
  };

+  struct LocationInfo {  // stored per shader location in `location_info`
+    std::function<ast::Expression*()> expr;  // builds the assignment target
+    sem::Type* type;  // type of the target variable, passed to DataTypeOf()
+  };
+
  CloneContext& ctx;
  VertexPulling::Config const cfg;
-  std::unordered_map<uint32_t, std::function<ast::Expression*()>>
-      location_to_expr;
+  std::unordered_map<uint32_t, LocationInfo> location_info;
  std::function<ast::Expression*()> vertex_index_expr = nullptr;
  std::function<ast::Expression*()> instance_index_expr = nullptr;
  Symbol pulling_position_name;
@ -69,15 +242,6 @@ struct State {
    });
  }

-  /// Lazily generates the pulling position symbol
-  Symbol GetPullingPositionName() {
-    if (!pulling_position_name.IsValid()) {
-      static const char kPullingPosVarName[] = "tint_pulling_pos";
-      pulling_position_name = ctx.dst->Symbols().New(kPullingPosVarName);
-    }
-    return pulling_position_name;
-  }
-
  /// Lazily generates the structure buffer symbol
  Symbol GetStructBufferName() {
    if (!struct_buffer_name.IsValid()) {
@ -89,9 +253,6 @@ struct State {

  /// Adds storage buffer decorated variables for the vertex buffers
  void AddVertexStorageBuffers() {
-    // TODO(idanr): Make this readonly
-    // https://github.com/gpuweb/gpuweb/issues/935
-
    // Creating the struct type
    static const char kStructName[] = "TintVertexData";
    auto* struct_type = ctx.dst->Structure(
@ -122,151 +283,434 @@ struct State {

    ast::StatementList stmts;

-    // Declare the pulling position variable in the shader
-    stmts.emplace_back(ctx.dst->Decl(
-        ctx.dst->Var(GetPullingPositionName(), ctx.dst->ty.u32())));
+    for (uint32_t buffer_idx = 0; buffer_idx < cfg.vertex_state.size();
+         ++buffer_idx) {
+      const VertexBufferLayoutDescriptor& buffer_layout =
+          cfg.vertex_state[buffer_idx];

-    for (uint32_t i = 0; i < cfg.vertex_state.size(); ++i) {
-      const VertexBufferLayoutDescriptor& buffer_layout = cfg.vertex_state[i];
+      if ((buffer_layout.array_stride & 3) != 0) {
+        ctx.dst->Diagnostics().add_error(
+            diag::System::Transform,
+            "WebGPU requires that vertex stride must be a multiple of 4 bytes, "
+            "but VertexPulling array stride for buffer " +
+                std::to_string(buffer_idx) + " was " +
+                std::to_string(buffer_layout.array_stride) + " bytes");
+        return nullptr;
+      }
+
+      auto* index_expr = buffer_layout.step_mode == InputStepMode::kVertex
+                             ? vertex_index_expr()
+                             : instance_index_expr();
+
+      // buffer_array_base is the base array offset for all the vertex
+      // attributes. These are units of uint (4 bytes).
+      auto buffer_array_base = ctx.dst->Symbols().New(
+          "buffer_array_base_" + std::to_string(buffer_idx));
+
+      auto* attribute_offset = index_expr;
+      if (buffer_layout.array_stride != 4) {
+        attribute_offset =
+            ctx.dst->Mul(index_expr, buffer_layout.array_stride / 4u);
+      }
+
+      // let buffer_array_base_n = <attribute_offset>
+      stmts.emplace_back(ctx.dst->Decl(
+          ctx.dst->Const(buffer_array_base, nullptr, attribute_offset)));

      for (const VertexAttributeDescriptor& attribute_desc :
           buffer_layout.attributes) {
-        auto it = location_to_expr.find(attribute_desc.shader_location);
-        if (it == location_to_expr.end()) {
+        auto it = location_info.find(attribute_desc.shader_location);
+        if (it == location_info.end()) {
          continue;
        }
-        auto* ident = it->second();
+        auto& var = it->second;

-        auto* index_expr = buffer_layout.step_mode == InputStepMode::kVertex
-                               ? vertex_index_expr()
-                               : instance_index_expr();
+        // Data type of the target WGSL variable
+        auto var_dt = DataTypeOf(var.type);
+        // Data type of the vertex stream attribute
+        auto fmt_dt = DataTypeOf(attribute_desc.format);

-        // An expression for the start of the read in the buffer in bytes
-        auto* pos_value = ctx.dst->Add(
-            ctx.dst->Mul(index_expr,
-                         static_cast<uint32_t>(buffer_layout.array_stride)),
-            static_cast<uint32_t>(attribute_desc.offset));
+        // Base types must match between the vertex stream and the WGSL variable
+        if (var_dt.base_type != fmt_dt.base_type) {
+          std::stringstream err;
+          err << "VertexAttributeDescriptor for location "
+              << std::to_string(attribute_desc.shader_location)
+              << " has format " << attribute_desc.format
+              << " but shader expects "
+              << var.type->FriendlyName(ctx.src->Symbols());
+          ctx.dst->Diagnostics().add_error(diag::System::Transform, err.str());
+          return nullptr;
+        }

-        // Update position of the read
-        auto* set_pos_expr =
-            ctx.dst->Assign(ctx.dst->Expr(GetPullingPositionName()), pos_value);
-        stmts.emplace_back(set_pos_expr);
+        // Load the attribute value
+        auto* fetch = Fetch(buffer_array_base, attribute_desc.offset,
+                            buffer_idx, attribute_desc.format);

-        stmts.emplace_back(
-            ctx.dst->Assign(ident, AccessByFormat(i, attribute_desc.format)));
+        // The attribute value may not be of the desired vector width. If it is
+        // not, we'll need to either reduce the width with a swizzle, or append
+        // 0's and / or a 1.
+        auto* value = fetch;
+        if (var_dt.width < fmt_dt.width) {
+          // WGSL variable vector width is smaller than the loaded vector width
+          switch (var_dt.width) {
+            case 1:
+              value = ctx.dst->MemberAccessor(fetch, "x");
+              break;
+            case 2:
+              value = ctx.dst->MemberAccessor(fetch, "xy");
+              break;
+            case 3:
+              value = ctx.dst->MemberAccessor(fetch, "xyz");
+              break;
+            default:
+              TINT_UNREACHABLE(Transform, ctx.dst->Diagnostics())
+                  << var_dt.width;
+              return nullptr;
+          }
+        } else if (var_dt.width > fmt_dt.width) {
+          // WGSL variable vector width is wider than the loaded vector width
+          ast::Type* ty = nullptr;
+          ast::ExpressionList values{fetch};
+          switch (var_dt.base_type) {
+            case BaseType::kI32:
+              ty = ctx.dst->ty.i32();
+              for (uint32_t i = fmt_dt.width; i < var_dt.width; i++) {
+                values.emplace_back(ctx.dst->Expr((i == 3) ? 1 : 0));
+              }
+              break;
+            case BaseType::kU32:
+              ty = ctx.dst->ty.u32();
+              for (uint32_t i = fmt_dt.width; i < var_dt.width; i++) {
+                values.emplace_back(ctx.dst->Expr((i == 3) ? 1u : 0u));
+              }
+              break;
+            case BaseType::kF32:
+              ty = ctx.dst->ty.f32();
+              for (uint32_t i = fmt_dt.width; i < var_dt.width; i++) {
+                values.emplace_back(ctx.dst->Expr((i == 3) ? 1.f : 0.f));
+              }
+              break;
+            default:
+              TINT_UNREACHABLE(Transform, ctx.dst->Diagnostics())
+                  << var_dt.base_type;
+              return nullptr;
+          }
+          value = ctx.dst->Construct(ctx.dst->ty.vec(ty, var_dt.width), values);
+        }
+
+        // Assign the value to the WGSL variable
+        stmts.emplace_back(ctx.dst->Assign(var.expr(), value));
      }
    }

+    if (stmts.empty()) {
+      return nullptr;
+    }
+
    return ctx.dst->create<ast::BlockStatement>(stmts);
  }

  /// Generates an expression reading from a buffer a specific format.
-  /// This reads the value wherever `kPullingPosVarName` points to at the time
-  /// of the read.
+  /// @param array_base the symbol of the variable holding the base array offset
+  /// of the vertex array (each index is 4-bytes).
+  /// @param offset the byte offset of the data from `array_base`
  /// @param buffer the index of the vertex buffer
  /// @param format the format to read
-  ast::Expression* AccessByFormat(uint32_t buffer, VertexFormat format) {
-    // TODO(idanr): this doesn't account for the format of the attribute in the
-    // shader. ex: vec<u32> in shader, and attribute claims VertexFormat::Float4
-    // right now, we would try to assign a vec4<f32> to this attribute, but we
-    // really need to assign a vec4<u32> by casting.
-    // We could split this function to first do memory accesses and unpacking
-    // into int/uint/float1-4/etc, then convert that variable to a var<in> with
-    // the conversion defined in the WebGPU spec.
+  ast::Expression* Fetch(Symbol array_base,
+                         uint32_t offset,
+                         uint32_t buffer,
+                         VertexFormat format) {
+    using u32 = ProgramBuilder::u32;
+    using i32 = ProgramBuilder::i32;
+    using f32 = ProgramBuilder::f32;
+
+    // Returns a u32 loaded `offset` bytes from the array base.
+    auto load_u32 = [&] {
+      return LoadPrimitive(array_base, offset, buffer, VertexFormat::kU32);
+    };
+
+    // Returns an i32 loaded `offset` bytes from the array base.
+    auto load_i32 = [&] { return ctx.dst->Bitcast<i32>(load_u32()); };
+
+    // Returns a u32 loaded `offset + 4` bytes from the array base.
+    auto load_next_u32 = [&] {
+      return LoadPrimitive(array_base, offset + 4, buffer, VertexFormat::kU32);
+    };
+
+    // Returns an i32 loaded `offset + 4` bytes from the array base.
+    auto load_next_i32 = [&] { return ctx.dst->Bitcast<i32>(load_next_u32()); };
+
+    // Returns a u16 loaded from offset, packed in the high 16 bits of a u32.
+    // The low 16 bits are 0.
+    // `offset` may have any byte alignment; when (offset & 3) == 3 the u16
+    // straddles two u32 words, so the following word is loaded as well.
+    auto load_u16_h = [&] {
+      auto low_u32_offset = offset & ~3u;
+      auto* low_u32 =
+          LoadPrimitive(array_base, low_u32_offset, buffer, VertexFormat::kU32);
+      switch (offset & 3) {
+        case 0:
+          return ctx.dst->Shl(low_u32, 16u);
+        case 1:
+          return ctx.dst->And(ctx.dst->Shl(low_u32, 8u), 0xffff0000u);
+        case 2:
+          return ctx.dst->And(low_u32, 0xffff0000u);
+        default: {  // 3:
+          auto* high_u32 = LoadPrimitive(array_base, low_u32_offset + 4, buffer,
+                                         VertexFormat::kU32);
+          auto* shr = ctx.dst->Shr(low_u32, 8u);
+          auto* shl = ctx.dst->Shl(high_u32, 24u);
+          return ctx.dst->And(ctx.dst->Or(shl, shr), 0xffff0000u);
+        }
+      }
+    };
+
+    // Returns a u16 loaded from offset, packed in the low 16 bits of a u32.
+    // The high 16 bits are 0.
+    auto load_u16_l = [&] {
+      auto low_u32_offset = offset & ~3u;
+      auto* low_u32 =
+          LoadPrimitive(array_base, low_u32_offset, buffer, VertexFormat::kU32);
+      switch (offset & 3) {
+        case 0:
+          return ctx.dst->And(low_u32, 0xffffu);
+        case 1:
+          return ctx.dst->And(ctx.dst->Shr(low_u32, 8u), 0xffffu);
+        case 2:
+          return ctx.dst->Shr(low_u32, 16u);
+        default: {  // 3:
+          auto* high_u32 = LoadPrimitive(array_base, low_u32_offset + 4, buffer,
+                                         VertexFormat::kU32);
+          auto* shr = ctx.dst->Shr(low_u32, 24u);
+          auto* shl = ctx.dst->Shl(high_u32, 8u);
+          return ctx.dst->And(ctx.dst->Or(shl, shr), 0xffffu);
+        }
+      }
+    };
+
+    // Returns an i16 loaded from offset, packed in the high 16 bits of an i32.
+    // The low 16 bits are 0.
+    auto load_i16_h = [&] { return ctx.dst->Bitcast<i32>(load_u16_h()); };
+
+    // Assumptions are made that alignment must be at least as large as the size
+    // of a single component.
+    switch (format) {
+      // Basic primitives
+      case VertexFormat::kUint32:
+      case VertexFormat::kSint32:
+      case VertexFormat::kFloat32:
+        return LoadPrimitive(array_base, offset, buffer, format);
+
+        // Vectors of basic primitives
+      case VertexFormat::kUint32x2:
+        return LoadVec(array_base, offset, buffer, 4, ctx.dst->ty.u32(),
+                       VertexFormat::kU32, 2);
+      case VertexFormat::kUint32x3:
+        return LoadVec(array_base, offset, buffer, 4, ctx.dst->ty.u32(),
+                       VertexFormat::kU32, 3);
+      case VertexFormat::kUint32x4:
+        return LoadVec(array_base, offset, buffer, 4, ctx.dst->ty.u32(),
+                       VertexFormat::kU32, 4);
+      case VertexFormat::kSint32x2:
+        return LoadVec(array_base, offset, buffer, 4, ctx.dst->ty.i32(),
+                       VertexFormat::kI32, 2);
+      case VertexFormat::kSint32x3:
+        return LoadVec(array_base, offset, buffer, 4, ctx.dst->ty.i32(),
+                       VertexFormat::kI32, 3);
+      case VertexFormat::kSint32x4:
+        return LoadVec(array_base, offset, buffer, 4, ctx.dst->ty.i32(),
+                       VertexFormat::kI32, 4);
+      case VertexFormat::kFloat32x2:
+        return LoadVec(array_base, offset, buffer, 4, ctx.dst->ty.f32(),
+                       VertexFormat::kF32, 2);
+      case VertexFormat::kFloat32x3:
+        return LoadVec(array_base, offset, buffer, 4, ctx.dst->ty.f32(),
+                       VertexFormat::kF32, 3);
+      case VertexFormat::kFloat32x4:
+        return LoadVec(array_base, offset, buffer, 4, ctx.dst->ty.f32(),
+                       VertexFormat::kF32, 4);
+
+      case VertexFormat::kUint8x2: {
+        // yyxx0000, yyxx0000
+        auto* u16s = ctx.dst->vec2<u32>(load_u16_h());
+        // xx000000, yyxx0000
+        auto* shl = ctx.dst->Shl(u16s, ctx.dst->vec2<u32>(8u, 0u));
+        // 000000xx, 000000yy
+        return ctx.dst->Shr(shl, ctx.dst->vec2<u32>(24u));
+      }
+      case VertexFormat::kUint8x4: {
+        // wwzzyyxx, wwzzyyxx, wwzzyyxx, wwzzyyxx
+        auto* u32s = ctx.dst->vec4<u32>(load_u32());
+        // xx000000, yyxx0000, zzyyxx00, wwzzyyxx
+        auto* shl = ctx.dst->Shl(u32s, ctx.dst->vec4<u32>(24u, 16u, 8u, 0u));
+        // 000000xx, 000000yy, 000000zz, 000000ww
+        return ctx.dst->Shr(shl, ctx.dst->vec4<u32>(24u));
+      }
+      case VertexFormat::kUint16x2: {
+        // yyyyxxxx, yyyyxxxx
+        auto* u32s = ctx.dst->vec2<u32>(load_u32());
+        // xxxx0000, yyyyxxxx
+        auto* shl = ctx.dst->Shl(u32s, ctx.dst->vec2<u32>(16u, 0u));
+        // 0000xxxx, 0000yyyy
+        return ctx.dst->Shr(shl, ctx.dst->vec2<u32>(16u));
+      }
+      case VertexFormat::kUint16x4: {
+        // yyyyxxxx, wwwwzzzz
+        auto* u32s = ctx.dst->vec2<u32>(load_u32(), load_next_u32());
+        // yyyyxxxx, yyyyxxxx, wwwwzzzz, wwwwzzzz
+        auto* xxyy = ctx.dst->MemberAccessor(u32s, "xxyy");
+        // xxxx0000, yyyyxxxx, zzzz0000, wwwwzzzz
+        auto* shl = ctx.dst->Shl(xxyy, ctx.dst->vec4<u32>(16u, 0u, 16u, 0u));
+        // 0000xxxx, 0000yyyy, 0000zzzz, 0000wwww
+        return ctx.dst->Shr(shl, ctx.dst->vec4<u32>(16u));
+      }
+      case VertexFormat::kSint8x2: {
+        // yyxx0000, yyxx0000
+        auto* i16s = ctx.dst->vec2<i32>(load_i16_h());
+        // xx000000, yyxx0000
+        auto* shl = ctx.dst->Shl(i16s, ctx.dst->vec2<u32>(8u, 0u));
+        // ssssssxx, ssssssyy
+        return ctx.dst->Shr(shl, ctx.dst->vec2<u32>(24u));
+      }
+      case VertexFormat::kSint8x4: {
+        // wwzzyyxx, wwzzyyxx, wwzzyyxx, wwzzyyxx
+        auto* i32s = ctx.dst->vec4<i32>(load_i32());
+        // xx000000, yyxx0000, zzyyxx00, wwzzyyxx
+        auto* shl = ctx.dst->Shl(i32s, ctx.dst->vec4<u32>(24u, 16u, 8u, 0u));
+        // ssssssxx, ssssssyy, sssssszz, ssssssww
+        return ctx.dst->Shr(shl, ctx.dst->vec4<u32>(24u));
+      }
+      case VertexFormat::kSint16x2: {
+        // yyyyxxxx, yyyyxxxx
+        auto* i32s = ctx.dst->vec2<i32>(load_i32());
+        // xxxx0000, yyyyxxxx
+        auto* shl = ctx.dst->Shl(i32s, ctx.dst->vec2<u32>(16u, 0u));
+        // ssssxxxx, ssssyyyy
+        return ctx.dst->Shr(shl, ctx.dst->vec2<u32>(16u));
+      }
+      case VertexFormat::kSint16x4: {
+        // yyyyxxxx, wwwwzzzz
+        auto* i32s = ctx.dst->vec2<i32>(load_i32(), load_next_i32());
+        // yyyyxxxx, yyyyxxxx, wwwwzzzz, wwwwzzzz
+        auto* xxyy = ctx.dst->MemberAccessor(i32s, "xxyy");
+        // xxxx0000, yyyyxxxx, zzzz0000, wwwwzzzz
+        auto* shl = ctx.dst->Shl(xxyy, ctx.dst->vec4<u32>(16u, 0u, 16u, 0u));
+        // ssssxxxx, ssssyyyy, sssszzzz, sssswwww
+        return ctx.dst->Shr(shl, ctx.dst->vec4<u32>(16u));
+      }
+      case VertexFormat::kUnorm8x2:
+        return ctx.dst->MemberAccessor(
+            ctx.dst->Call("unpack4x8unorm", load_u16_l()), "xy");
+      case VertexFormat::kSnorm8x2:
+        return ctx.dst->MemberAccessor(
+            ctx.dst->Call("unpack4x8snorm", load_u16_l()), "xy");
+      case VertexFormat::kUnorm8x4:
+        return ctx.dst->Call("unpack4x8unorm", load_u32());
+      case VertexFormat::kSnorm8x4:
+        return ctx.dst->Call("unpack4x8snorm", load_u32());
+      case VertexFormat::kUnorm16x2:
+        return ctx.dst->Call("unpack2x16unorm", load_u32());
+      case VertexFormat::kSnorm16x2:
+        return ctx.dst->Call("unpack2x16snorm", load_u32());
+      case VertexFormat::kFloat16x2:
+        return ctx.dst->Call("unpack2x16float", load_u32());
+      case VertexFormat::kUnorm16x4:
+        return ctx.dst->vec4<f32>(
+            ctx.dst->Call("unpack2x16unorm", load_u32()),
+            ctx.dst->Call("unpack2x16unorm", load_next_u32()));
+      case VertexFormat::kSnorm16x4:
+        return ctx.dst->vec4<f32>(
+            ctx.dst->Call("unpack2x16snorm", load_u32()),
+            ctx.dst->Call("unpack2x16snorm", load_next_u32()));
+      case VertexFormat::kFloat16x4:
+        return ctx.dst->vec4<f32>(
+            ctx.dst->Call("unpack2x16float", load_u32()),
+            ctx.dst->Call("unpack2x16float", load_next_u32()));
+    }
+
+    TINT_UNREACHABLE(Transform, ctx.dst->Diagnostics())
+        << "format " << static_cast<int>(format);
+    return nullptr;
+  }
+
+  /// Generates an expression reading an aligned basic type (u32, i32, f32) from
+  /// a vertex buffer.
+  /// @param array_base the symbol of the variable holding the base array offset
+  /// of the vertex array (each index is 4-bytes).
+  /// @param offset the byte offset of the data from `array_base`
+  /// @param buffer the index of the vertex buffer
+  /// @param format VertexFormat::kU32, VertexFormat::kI32 or VertexFormat::kF32
+  ast::Expression* LoadPrimitive(Symbol array_base,
+                                 uint32_t offset,
+                                 uint32_t buffer,
+                                 VertexFormat format) {
+    ast::Expression* u32 = nullptr;
+    if ((offset & 3) == 0) {
+      // Aligned load.
+
+      ast ::Expression* index = nullptr;
+      if (offset > 0) {
+        index = ctx.dst->Add(array_base, offset / 4);
+      } else {
+        index = ctx.dst->Expr(array_base);
+      }
+      u32 = ctx.dst->IndexAccessor(
+          ctx.dst->MemberAccessor(GetVertexBufferName(buffer),
+                                  GetStructBufferName()),
+          index);
+
+    } else {
+      // Unaligned load
+      uint32_t offset_aligned = offset & ~3u;
+      auto* low =
+          LoadPrimitive(array_base, offset_aligned, buffer, VertexFormat::kU32);
+      auto* high = LoadPrimitive(array_base, offset_aligned + 4u, buffer,
+                                 VertexFormat::kU32);
+
+      uint32_t shift = 8u * (offset & 3u);
+
+      auto* low_shr = ctx.dst->Shr(low, shift);
+      auto* high_shl = ctx.dst->Shl(high, 32u - shift);
+      u32 = ctx.dst->Or(low_shr, high_shl);
+    }
+
    switch (format) {
      case VertexFormat::kU32:
-        return AccessU32(buffer, ctx.dst->Expr(GetPullingPositionName()));
+        return u32;
      case VertexFormat::kI32:
-        return AccessI32(buffer, ctx.dst->Expr(GetPullingPositionName()));
+        return ctx.dst->Bitcast(ctx.dst->ty.i32(), u32);
      case VertexFormat::kF32:
-        return AccessF32(buffer, ctx.dst->Expr(GetPullingPositionName()));
-      case VertexFormat::kVec2F32:
-        return AccessVec(buffer, 4, ctx.dst->ty.f32(), VertexFormat::kF32, 2);
-      case VertexFormat::kVec3F32:
-        return AccessVec(buffer, 4, ctx.dst->ty.f32(), VertexFormat::kF32, 3);
-      case VertexFormat::kVec4F32:
-        return AccessVec(buffer, 4, ctx.dst->ty.f32(), VertexFormat::kF32, 4);
+        return ctx.dst->Bitcast(ctx.dst->ty.f32(), u32);
      default:
-        return nullptr;
-    }
-  }
-
-  /// Generates an expression reading a uint32 from a vertex buffer
-  /// @param buffer the index of the vertex buffer
-  /// @param pos an expression for the position of the access, in bytes
-  ast::Expression* AccessU32(uint32_t buffer, ast::Expression* pos) {
-    // Here we divide by 4, since the buffer is uint32 not uint8. The input
-    // buffer has byte offsets for each attribute, and we will convert it to u32
-    // indexes by dividing. Then, that element is going to be read, and if
-    // needed, unpacked into an appropriate variable. All reads should end up
-    // here as a base case.
-    return ctx.dst->create<ast::ArrayAccessorExpression>(
-        ctx.dst->MemberAccessor(GetVertexBufferName(buffer),
-                                GetStructBufferName()),
-        ctx.dst->Div(pos, 4u));
-  }
-
-  /// Generates an expression reading an int32 from a vertex buffer
-  /// @param buffer the index of the vertex buffer
-  /// @param pos an expression for the position of the access, in bytes
-  ast::Expression* AccessI32(uint32_t buffer, ast::Expression* pos) {
-    // as<T> reinterprets bits
-    return ctx.dst->create<ast::BitcastExpression>(ctx.dst->ty.i32(),
-                                                   AccessU32(buffer, pos));
-  }
-
-  /// Generates an expression reading a float from a vertex buffer
-  /// @param buffer the index of the vertex buffer
-  /// @param pos an expression for the position of the access, in bytes
-  ast::Expression* AccessF32(uint32_t buffer, ast::Expression* pos) {
-    // as<T> reinterprets bits
-    return ctx.dst->create<ast::BitcastExpression>(ctx.dst->ty.f32(),
-                                                   AccessU32(buffer, pos));
-  }
-
-  /// Generates an expression reading a basic type (u32, i32, f32) from a
-  /// vertex buffer
-  /// @param buffer the index of the vertex buffer
-  /// @param pos an expression for the position of the access, in bytes
-  /// @param format the underlying vertex format
-  ast::Expression* AccessPrimitive(uint32_t buffer,
-                                   ast::Expression* pos,
-                                   VertexFormat format) {
-    // This function uses a position expression to read, rather than using the
-    // position variable. This allows us to read from offset positions relative
-    // to |kPullingPosVarName|. We can't call AccessByFormat because it reads
-    // only from the position variable.
-    switch (format) {
-      case VertexFormat::kU32:
-        return AccessU32(buffer, pos);
-      case VertexFormat::kI32:
-        return AccessI32(buffer, pos);
-      case VertexFormat::kF32:
-        return AccessF32(buffer, pos);
-      default:
-        return nullptr;
+        break;
    }
+    TINT_UNREACHABLE(Transform, ctx.dst->Diagnostics())
+        << "invalid format for LoadPrimitive" << static_cast<int>(format);
+    return nullptr;
  }

  /// Generates an expression reading a vec2/3/4 from a vertex buffer.
-  /// This reads the value wherever `kPullingPosVarName` points to at the time
-  /// of the read.
+  /// @param array_base the symbol of the variable holding the base array offset
+  /// of the vertex array (each index is 4-bytes).
+  /// @param offset the byte offset of the data from `array_base`
  /// @param buffer the index of the vertex buffer
  /// @param element_stride stride between elements, in bytes
  /// @param base_type underlying AST type
  /// @param base_format underlying vertex format
  /// @param count how many elements the vector has
-  ast::Expression* AccessVec(uint32_t buffer,
-                             uint32_t element_stride,
-                             ast::Type* base_type,
-                             VertexFormat base_format,
-                             uint32_t count) {
+  ast::Expression* LoadVec(Symbol array_base,
+                           uint32_t offset,
+                           uint32_t buffer,
+                           uint32_t element_stride,
+                           ast::Type* base_type,
+                           VertexFormat base_format,
+                           uint32_t count) {
    ast::ExpressionList expr_list;
    for (uint32_t i = 0; i < count; ++i) {
      // Offset read position by element_stride for each component
-      auto* cur_pos =
-          ctx.dst->Add(GetPullingPositionName(), element_stride * i);
-      expr_list.push_back(AccessPrimitive(buffer, cur_pos, base_format));
+      uint32_t primitive_offset = offset + element_stride * i;
+      expr_list.push_back(
+          LoadPrimitive(array_base, primitive_offset, buffer, base_format));
    }

    return ctx.dst->create<ast::TypeConstructorExpression>(
@ -285,12 +729,12 @@ struct State {
      auto func_var_sym = ctx.Clone(param->symbol());
      auto* func_var_type = ctx.Clone(param->type());
      auto* func_var = ctx.dst->Var(func_var_sym, func_var_type);
-      ctx.InsertBefore(func->body()->statements(), *func->body()->begin(),
-                       ctx.dst->Decl(func_var));
+      ctx.InsertFront(func->body()->statements(), ctx.dst->Decl(func_var));
      // Capture mapping from location to the new variable.
-      location_to_expr[location->value()] = [this, func_var]() {
-        return ctx.dst->Expr(func_var);
-      };
+      LocationInfo info;
+      info.expr = [this, func_var]() { return ctx.dst->Expr(func_var); };
+      info.type = ctx.src->Sem().Get(param)->Type();
+      location_info[location->value()] = info;
    } else if (auto* builtin = ast::GetDecoration<ast::BuiltinDecoration>(
                   param->decorations())) {
      // Check for existing vertex_index and instance_index builtins.
@ -336,7 +780,10 @@ struct State {
      if (auto* location = ast::GetDecoration<ast::LocationDecoration>(
              member->decorations())) {
        // Capture mapping from location to struct member.
-        location_to_expr[location->value()] = member_expr;
+        LocationInfo info;
+        info.expr = member_expr;
+        info.type = ctx.src->Sem().Get(member)->Type();
+        location_info[location->value()] = info;
        has_locations = true;
      } else if (auto* builtin = ast::GetDecoration<ast::BuiltinDecoration>(
                     member->decorations())) {
@ -361,8 +808,7 @@ struct State {

    // Create a function-scope variable to replace the parameter.
    auto* func_var = ctx.dst->Var(param_sym, ctx.Clone(param->type()));
-    ctx.InsertBefore(func->body()->statements(), *func->body()->begin(),
-                     ctx.dst->Decl(func_var));
+    ctx.InsertFront(func->body()->statements(), ctx.dst->Decl(func_var));

    if (!members_to_clone.empty()) {
      // Create a new struct without the location attributes.
@ -384,8 +830,8 @@ struct State {
      // Copy values from the new parameter to the function-scope variable.
      for (auto* member : members_to_clone) {
        auto member_name = ctx.Clone(member->symbol());
-        ctx.InsertBefore(
-            func->body()->statements(), *func->body()->begin(),
+        ctx.InsertFront(
+            func->body()->statements(),
            ctx.dst->Assign(ctx.dst->MemberAccessor(func_var, member_name),
                            ctx.dst->MemberAccessor(new_param, member_name)));
      }
@ -436,8 +882,9 @@ struct State {
    }

    // Generate vertex pulling preamble.
-    ctx.InsertBefore(func->body()->statements(), *func->body()->begin(),
-                     CreateVertexPullingPreamble());
+    if (auto* block = CreateVertexPullingPreamble()) {
+      ctx.InsertFront(func->body()->statements(), block);
+    }

    // Rewrite the function header with the new parameters.
    auto func_sym = ctx.Clone(func->symbol());
@ -495,7 +942,7 @@ VertexPulling::Config& VertexPulling::Config::operator=(const Config&) =
 VertexBufferLayoutDescriptor::VertexBufferLayoutDescriptor() = default;

 VertexBufferLayoutDescriptor::VertexBufferLayoutDescriptor(
-    uint64_t in_array_stride,
+    uint32_t in_array_stride,
    InputStepMode in_step_mode,
    std::vector<VertexAttributeDescriptor> in_attributes)
    : array_stride(in_array_stride),
--- a/src/transform/vertex_pulling.h
+++ b/src/transform/vertex_pulling.h
@ -27,36 +27,68 @@ namespace transform {

 /// Describes the format of data in a vertex buffer
 enum class VertexFormat {
-  kVec2U8,
-  kVec4U8,
-  kVec2I8,
-  kVec4I8,
-  kVec2U8Norm,
-  kVec4U8Norm,
-  kVec2I8Norm,
-  kVec4I8Norm,
-  kVec2U16,
-  kVec4U16,
-  kVec2I16,
-  kVec4I16,
-  kVec2U16Norm,
-  kVec4U16Norm,
-  kVec2I16Norm,
-  kVec4I16Norm,
-  kVec2F16,
-  kVec4F16,
-  kF32,
-  kVec2F32,
-  kVec3F32,
-  kVec4F32,
-  kU32,
-  kVec2U32,
-  kVec3U32,
-  kVec4U32,
-  kI32,
-  kVec2I32,
-  kVec3I32,
-  kVec4I32,
+  kUint8x2,    // uint8x2
+  kUint8x4,    // uint8x4
+  kSint8x2,    // sint8x2
+  kSint8x4,    // sint8x4
+  kUnorm8x2,   // unorm8x2
+  kUnorm8x4,   // unorm8x4
+  kSnorm8x2,   // snorm8x2
+  kSnorm8x4,   // snorm8x4
+  kUint16x2,   // uint16x2
+  kUint16x4,   // uint16x4
+  kSint16x2,   // sint16x2
+  kSint16x4,   // sint16x4
+  kUnorm16x2,  // unorm16x2
+  kUnorm16x4,  // unorm16x4
+  kSnorm16x2,  // snorm16x2
+  kSnorm16x4,  // snorm16x4
+  kFloat16x2,  // float16x2
+  kFloat16x4,  // float16x4
+  kFloat32,    // float32
+  kFloat32x2,  // float32x2
+  kFloat32x3,  // float32x3
+  kFloat32x4,  // float32x4
+  kUint32,     // uint32
+  kUint32x2,   // uint32x2
+  kUint32x3,   // uint32x3
+  kUint32x4,   // uint32x4
+  kSint32,     // sint32
+  kSint32x2,   // sint32x2
+  kSint32x3,   // sint32x3
+  kSint32x4,   // sint32x4
+
+  // Deprecated names
+  kVec2U8 = kUint8x2,
+  kVec4U8 = kUint8x4,
+  kVec2I8 = kSint8x2,
+  kVec4I8 = kSint8x4,
+  kVec2U8Norm = kUnorm8x2,
+  kVec4U8Norm = kUnorm8x4,
+  kVec2I8Norm = kSnorm8x2,
+  kVec4I8Norm = kSnorm8x4,
+  kVec2U16 = kUint16x2,
+  kVec4U16 = kUint16x4,
+  kVec2I16 = kSint16x2,
+  kVec4I16 = kSint16x4,
+  kVec2U16Norm = kUnorm16x2,
+  kVec4U16Norm = kUnorm16x4,
+  kVec2I16Norm = kSnorm16x2,
+  kVec4I16Norm = kSnorm16x4,
+  kVec2F16 = kFloat16x2,
+  kVec4F16 = kFloat16x4,
+  kF32 = kFloat32,
+  kVec2F32 = kFloat32x2,
+  kVec3F32 = kFloat32x3,
+  kVec4F32 = kFloat32x4,
+  kU32 = kUint32,
+  kVec2U32 = kUint32x2,
+  kVec3U32 = kUint32x3,
+  kVec4U32 = kUint32x4,
+  kI32 = kSint32,
+  kVec2I32 = kSint32x2,
+  kVec3I32 = kSint32x3,
+  kVec4I32 = kSint32x4,
  kLastEntry = kVec4I32
 };

@ -69,7 +101,7 @@ struct VertexAttributeDescriptor {
  /// The format of the attribute
  VertexFormat format;
  /// The byte offset of the attribute in the buffer
-  uint64_t offset;
+  uint32_t offset;
  /// The shader location used for the attribute
  uint32_t shader_location;
 };
@ -83,7 +115,7 @@ struct VertexBufferLayoutDescriptor {
  /// @param in_step_mode the step mode of the in buffer
  /// @param in_attributes the in attributes
  VertexBufferLayoutDescriptor(
-      uint64_t in_array_stride,
+      uint32_t in_array_stride,
      InputStepMode in_step_mode,
      std::vector<VertexAttributeDescriptor> in_attributes);
  /// Copy constructor
@ -99,7 +131,7 @@ struct VertexBufferLayoutDescriptor {
  ~VertexBufferLayoutDescriptor();

  /// The array stride used in the in buffer
-  uint64_t array_stride = 0u;
+  uint32_t array_stride = 0u;
  /// The input step mode used
  InputStepMode step_mode = InputStepMode::kVertex;
  /// The vertex attributes
--- a/src/transform/vertex_pulling_test.cc
+++ b/src/transform/vertex_pulling_test.cc
@ -74,6 +74,30 @@ fn main() {}
  EXPECT_EQ(expect, str(got));
 }

+TEST_F(VertexPullingTest, Error_BadStride) {
+  auto* src = R"(
+[[stage(vertex)]]
+fn main([[location(0)]] var_a : f32) -> [[builtin(position)]] vec4<f32> {
+  return vec4<f32>(var_a, 0.0, 0.0, 1.0);
+}
+)";
+
+  auto* expect =
+      "error: WebGPU requires that vertex stride must be a multiple of 4 "
+      "bytes, but VertexPulling array stride for buffer 0 was 15 bytes";
+
+  VertexPulling::Config cfg;
+  cfg.vertex_state = {
+      {{15, InputStepMode::kVertex, {{VertexFormat::kFloat32, 0, 0}}}}};
+  cfg.entry_point_name = "main";
+
+  DataMap data;
+  data.Add<VertexPulling::Config>(cfg);
+  auto got = Run<VertexPulling>(src, data);
+
+  EXPECT_EQ(expect, str(got));
+}
+
 TEST_F(VertexPullingTest, BasicModule) {
  auto* src = R"(
 [[stage(vertex)]]
@ -90,9 +114,6 @@ struct TintVertexData {

 [[stage(vertex)]]
 fn main() -> [[builtin(position)]] vec4<f32> {
-  {
-    var tint_pulling_pos : u32;
-  }
  return vec4<f32>();
 }
 )";
@ -127,9 +148,8 @@ struct TintVertexData {
 fn main([[builtin(vertex_index)]] tint_pulling_vertex_index : u32) -> [[builtin(position)]] vec4<f32> {
  var var_a : f32;
  {
-    var tint_pulling_pos : u32;
-    tint_pulling_pos = ((tint_pulling_vertex_index * 4u) + 0u);
-    var_a = bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(tint_pulling_pos / 4u)]);
+    let buffer_array_base_0 = tint_pulling_vertex_index;
+    var_a = bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[buffer_array_base_0]);
  }
  return vec4<f32>(var_a, 0.0, 0.0, 1.0);
 }
@ -137,7 +157,7 @@ fn main([[builtin(vertex_index)]] tint_pulling_vertex_index : u32) -> [[builtin(

  VertexPulling::Config cfg;
  cfg.vertex_state = {
-      {{4, InputStepMode::kVertex, {{VertexFormat::kF32, 0, 0}}}}};
+      {{4, InputStepMode::kVertex, {{VertexFormat::kFloat32, 0, 0}}}}};
  cfg.entry_point_name = "main";

  DataMap data;
@ -167,9 +187,8 @@ struct TintVertexData {
 fn main([[builtin(instance_index)]] tint_pulling_instance_index : u32) -> [[builtin(position)]] vec4<f32> {
  var var_a : f32;
  {
-    var tint_pulling_pos : u32;
-    tint_pulling_pos = ((tint_pulling_instance_index * 4u) + 0u);
-    var_a = bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(tint_pulling_pos / 4u)]);
+    let buffer_array_base_0 = tint_pulling_instance_index;
+    var_a = bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[buffer_array_base_0]);
  }
  return vec4<f32>(var_a, 0.0, 0.0, 1.0);
 }
@ -177,7 +196,7 @@ fn main([[builtin(instance_index)]] tint_pulling_instance_index : u32) -> [[buil

  VertexPulling::Config cfg;
  cfg.vertex_state = {
-      {{4, InputStepMode::kInstance, {{VertexFormat::kF32, 0, 0}}}}};
+      {{4, InputStepMode::kInstance, {{VertexFormat::kFloat32, 0, 0}}}}};
  cfg.entry_point_name = "main";

  DataMap data;
@ -207,9 +226,8 @@ struct TintVertexData {
 fn main([[builtin(vertex_index)]] tint_pulling_vertex_index : u32) -> [[builtin(position)]] vec4<f32> {
  var var_a : f32;
  {
-    var tint_pulling_pos : u32;
-    tint_pulling_pos = ((tint_pulling_vertex_index * 4u) + 0u);
-    var_a = bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(tint_pulling_pos / 4u)]);
+    let buffer_array_base_0 = tint_pulling_vertex_index;
+    var_a = bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[buffer_array_base_0]);
  }
  return vec4<f32>(var_a, 0.0, 0.0, 1.0);
 }
@ -217,7 +235,7 @@ fn main([[builtin(vertex_index)]] tint_pulling_vertex_index : u32) -> [[builtin(

  VertexPulling::Config cfg;
  cfg.vertex_state = {
-      {{4, InputStepMode::kVertex, {{VertexFormat::kF32, 0, 0}}}}};
+      {{4, InputStepMode::kVertex, {{VertexFormat::kFloat32, 0, 0}}}}};
  cfg.pulling_group = 5;
  cfg.entry_point_name = "main";

@ -257,9 +275,8 @@ struct Inputs {
 fn main([[builtin(vertex_index)]] tint_pulling_vertex_index : u32) -> [[builtin(position)]] vec4<f32> {
  var inputs : Inputs;
  {
-    var tint_pulling_pos : u32;
-    tint_pulling_pos = ((tint_pulling_vertex_index * 4u) + 0u);
-    inputs.var_a = bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(tint_pulling_pos / 4u)]);
+    let buffer_array_base_0 = tint_pulling_vertex_index;
+    inputs.var_a = bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[buffer_array_base_0]);
  }
  return vec4<f32>(inputs.var_a, 0.0, 0.0, 1.0);
 }
@ -267,7 +284,7 @@ fn main([[builtin(vertex_index)]] tint_pulling_vertex_index : u32) -> [[builtin(

  VertexPulling::Config cfg;
  cfg.vertex_state = {
-      {{4, InputStepMode::kVertex, {{VertexFormat::kF32, 0, 0}}}}};
+      {{4, InputStepMode::kVertex, {{VertexFormat::kFloat32, 0, 0}}}}};
  cfg.entry_point_name = "main";

  DataMap data;
@ -305,11 +322,10 @@ fn main([[builtin(vertex_index)]] custom_vertex_index : u32, [[builtin(instance_
  var var_a : f32;
  var var_b : f32;
  {
-    var tint_pulling_pos : u32;
-    tint_pulling_pos = ((custom_vertex_index * 4u) + 0u);
-    var_a = bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(tint_pulling_pos / 4u)]);
-    tint_pulling_pos = ((custom_instance_index * 4u) + 0u);
-    var_b = bitcast<f32>(tint_pulling_vertex_buffer_1.tint_vertex_data[(tint_pulling_pos / 4u)]);
+    let buffer_array_base_0 = custom_vertex_index;
+    var_a = bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[buffer_array_base_0]);
+    let buffer_array_base_1 = custom_instance_index;
+    var_b = bitcast<f32>(tint_pulling_vertex_buffer_1.tint_vertex_data[buffer_array_base_1]);
  }
  return vec4<f32>(var_a, var_b, 0.0, 1.0);
 }
@ -320,12 +336,12 @@ fn main([[builtin(vertex_index)]] custom_vertex_index : u32, [[builtin(instance_
      {
          4,
          InputStepMode::kVertex,
-          {{VertexFormat::kF32, 0, 0}},
+          {{VertexFormat::kFloat32, 0, 0}},
      },
      {
          4,
          InputStepMode::kInstance,
-          {{VertexFormat::kF32, 0, 1}},
+          {{VertexFormat::kFloat32, 0, 1}},
      },
  }};
  cfg.entry_point_name = "main";
@ -386,11 +402,10 @@ fn main(tint_symbol_1 : tint_symbol) -> [[builtin(position)]] vec4<f32> {
  inputs.custom_vertex_index = tint_symbol_1.custom_vertex_index;
  inputs.custom_instance_index = tint_symbol_1.custom_instance_index;
  {
-    var tint_pulling_pos : u32;
-    tint_pulling_pos = ((inputs.custom_vertex_index * 4u) + 0u);
-    inputs.var_a = bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(tint_pulling_pos / 4u)]);
-    tint_pulling_pos = ((inputs.custom_instance_index * 4u) + 0u);
-    inputs.var_b = bitcast<f32>(tint_pulling_vertex_buffer_1.tint_vertex_data[(tint_pulling_pos / 4u)]);
+    let buffer_array_base_0 = inputs.custom_vertex_index;
+    inputs.var_a = bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[buffer_array_base_0]);
+    let buffer_array_base_1 = inputs.custom_instance_index;
+    inputs.var_b = bitcast<f32>(tint_pulling_vertex_buffer_1.tint_vertex_data[buffer_array_base_1]);
  }
  return vec4<f32>(inputs.var_a, inputs.var_b, 0.0, 1.0);
 }
@ -401,12 +416,12 @@ fn main(tint_symbol_1 : tint_symbol) -> [[builtin(position)]] vec4<f32> {
      {
          4,
          InputStepMode::kVertex,
-          {{VertexFormat::kF32, 0, 0}},
+          {{VertexFormat::kFloat32, 0, 0}},
      },
      {
          4,
          InputStepMode::kInstance,
-          {{VertexFormat::kF32, 0, 1}},
+          {{VertexFormat::kFloat32, 0, 1}},
      },
  }};
  cfg.entry_point_name = "main";
@ -464,11 +479,10 @@ struct Indices {
 fn main(indices : Indices) -> [[builtin(position)]] vec4<f32> {
  var inputs : Inputs;
  {
-    var tint_pulling_pos : u32;
-    tint_pulling_pos = ((indices.custom_vertex_index * 4u) + 0u);
-    inputs.var_a = bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(tint_pulling_pos / 4u)]);
-    tint_pulling_pos = ((indices.custom_instance_index * 4u) + 0u);
-    inputs.var_b = bitcast<f32>(tint_pulling_vertex_buffer_1.tint_vertex_data[(tint_pulling_pos / 4u)]);
+    let buffer_array_base_0 = indices.custom_vertex_index;
+    inputs.var_a = bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[buffer_array_base_0]);
+    let buffer_array_base_1 = indices.custom_instance_index;
+    inputs.var_b = bitcast<f32>(tint_pulling_vertex_buffer_1.tint_vertex_data[buffer_array_base_1]);
  }
  return vec4<f32>(inputs.var_a, inputs.var_b, 0.0, 1.0);
 }
@ -479,12 +493,12 @@ fn main(indices : Indices) -> [[builtin(position)]] vec4<f32> {
      {
          4,
          InputStepMode::kVertex,
-          {{VertexFormat::kF32, 0, 0}},
+          {{VertexFormat::kFloat32, 0, 0}},
      },
      {
          4,
          InputStepMode::kInstance,
-          {{VertexFormat::kF32, 0, 1}},
+          {{VertexFormat::kFloat32, 0, 1}},
      },
  }};
  cfg.entry_point_name = "main";
@ -518,11 +532,9 @@ fn main([[builtin(vertex_index)]] tint_pulling_vertex_index : u32) -> [[builtin(
  var var_a : f32;
  var var_b : vec4<f32>;
  {
-    var tint_pulling_pos : u32;
-    tint_pulling_pos = ((tint_pulling_vertex_index * 16u) + 0u);
-    var_a = bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(tint_pulling_pos / 4u)]);
-    tint_pulling_pos = ((tint_pulling_vertex_index * 16u) + 0u);
-    var_b = vec4<f32>(bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[((tint_pulling_pos + 0u) / 4u)]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[((tint_pulling_pos + 4u) / 4u)]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[((tint_pulling_pos + 8u) / 4u)]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[((tint_pulling_pos + 12u) / 4u)]));
+    let buffer_array_base_0 = (tint_pulling_vertex_index * 4u);
+    var_a = bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[buffer_array_base_0]);
+    var_b = vec4<f32>(bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[buffer_array_base_0]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 1u)]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 2u)]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 3u)]));
  }
  return vec4<f32>();
 }
@ -532,7 +544,7 @@ fn main([[builtin(vertex_index)]] tint_pulling_vertex_index : u32) -> [[builtin(
  cfg.vertex_state = {
      {{16,
        InputStepMode::kVertex,
-        {{VertexFormat::kF32, 0, 0}, {VertexFormat::kVec4F32, 0, 1}}}}};
+        {{VertexFormat::kFloat32, 0, 0}, {VertexFormat::kVec4F32, 0, 1}}}}};
  cfg.entry_point_name = "main";

  DataMap data;
@ -571,13 +583,12 @@ fn main([[builtin(vertex_index)]] tint_pulling_vertex_index : u32) -> [[builtin(
  var var_b : vec3<f32>;
  var var_c : vec4<f32>;
  {
-    var tint_pulling_pos : u32;
-    tint_pulling_pos = ((tint_pulling_vertex_index * 8u) + 0u);
-    var_a = vec2<f32>(bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[((tint_pulling_pos + 0u) / 4u)]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[((tint_pulling_pos + 4u) / 4u)]));
-    tint_pulling_pos = ((tint_pulling_vertex_index * 12u) + 0u);
-    var_b = vec3<f32>(bitcast<f32>(tint_pulling_vertex_buffer_1.tint_vertex_data[((tint_pulling_pos + 0u) / 4u)]), bitcast<f32>(tint_pulling_vertex_buffer_1.tint_vertex_data[((tint_pulling_pos + 4u) / 4u)]), bitcast<f32>(tint_pulling_vertex_buffer_1.tint_vertex_data[((tint_pulling_pos + 8u) / 4u)]));
-    tint_pulling_pos = ((tint_pulling_vertex_index * 16u) + 0u);
-    var_c = vec4<f32>(bitcast<f32>(tint_pulling_vertex_buffer_2.tint_vertex_data[((tint_pulling_pos + 0u) / 4u)]), bitcast<f32>(tint_pulling_vertex_buffer_2.tint_vertex_data[((tint_pulling_pos + 4u) / 4u)]), bitcast<f32>(tint_pulling_vertex_buffer_2.tint_vertex_data[((tint_pulling_pos + 8u) / 4u)]), bitcast<f32>(tint_pulling_vertex_buffer_2.tint_vertex_data[((tint_pulling_pos + 12u) / 4u)]));
+    let buffer_array_base_0 = (tint_pulling_vertex_index * 2u);
+    var_a = vec2<f32>(bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[buffer_array_base_0]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 1u)]));
+    let buffer_array_base_1 = (tint_pulling_vertex_index * 3u);
+    var_b = vec3<f32>(bitcast<f32>(tint_pulling_vertex_buffer_1.tint_vertex_data[buffer_array_base_1]), bitcast<f32>(tint_pulling_vertex_buffer_1.tint_vertex_data[(buffer_array_base_1 + 1u)]), bitcast<f32>(tint_pulling_vertex_buffer_1.tint_vertex_data[(buffer_array_base_1 + 2u)]));
+    let buffer_array_base_2 = (tint_pulling_vertex_index * 4u);
+    var_c = vec4<f32>(bitcast<f32>(tint_pulling_vertex_buffer_2.tint_vertex_data[buffer_array_base_2]), bitcast<f32>(tint_pulling_vertex_buffer_2.tint_vertex_data[(buffer_array_base_2 + 1u)]), bitcast<f32>(tint_pulling_vertex_buffer_2.tint_vertex_data[(buffer_array_base_2 + 2u)]), bitcast<f32>(tint_pulling_vertex_buffer_2.tint_vertex_data[(buffer_array_base_2 + 3u)]));
  }
  return vec4<f32>();
 }
@ -624,11 +635,9 @@ fn main([[builtin(vertex_index)]] tint_pulling_vertex_index_1 : u32) -> [[builti
  var var_a : f32;
  var var_b : vec4<f32>;
  {
-    var tint_pulling_pos_1 : u32;
-    tint_pulling_pos_1 = ((tint_pulling_vertex_index_1 * 16u) + 0u);
-    var_a = bitcast<f32>(tint_pulling_vertex_buffer_0_1.tint_vertex_data_1[(tint_pulling_pos_1 / 4u)]);
-    tint_pulling_pos_1 = ((tint_pulling_vertex_index_1 * 16u) + 0u);
-    var_b = vec4<f32>(bitcast<f32>(tint_pulling_vertex_buffer_0_1.tint_vertex_data_1[((tint_pulling_pos_1 + 0u) / 4u)]), bitcast<f32>(tint_pulling_vertex_buffer_0_1.tint_vertex_data_1[((tint_pulling_pos_1 + 4u) / 4u)]), bitcast<f32>(tint_pulling_vertex_buffer_0_1.tint_vertex_data_1[((tint_pulling_pos_1 + 8u) / 4u)]), bitcast<f32>(tint_pulling_vertex_buffer_0_1.tint_vertex_data_1[((tint_pulling_pos_1 + 12u) / 4u)]));
+    let buffer_array_base_0 = (tint_pulling_vertex_index_1 * 4u);
+    var_a = bitcast<f32>(tint_pulling_vertex_buffer_0_1.tint_vertex_data_1[buffer_array_base_0]);
+    var_b = vec4<f32>(bitcast<f32>(tint_pulling_vertex_buffer_0_1.tint_vertex_data_1[buffer_array_base_0]), bitcast<f32>(tint_pulling_vertex_buffer_0_1.tint_vertex_data_1[(buffer_array_base_0 + 1u)]), bitcast<f32>(tint_pulling_vertex_buffer_0_1.tint_vertex_data_1[(buffer_array_base_0 + 2u)]), bitcast<f32>(tint_pulling_vertex_buffer_0_1.tint_vertex_data_1[(buffer_array_base_0 + 3u)]));
  }
  var tint_pulling_vertex_index : i32;
  var tint_pulling_vertex_buffer_0 : i32;
@ -642,7 +651,7 @@ fn main([[builtin(vertex_index)]] tint_pulling_vertex_index_1 : u32) -> [[builti
  cfg.vertex_state = {
      {{16,
        InputStepMode::kVertex,
-        {{VertexFormat::kF32, 0, 0}, {VertexFormat::kVec4F32, 0, 1}}}}};
+        {{VertexFormat::kFloat32, 0, 0}, {VertexFormat::kVec4F32, 0, 1}}}}};
  cfg.entry_point_name = "main";

  DataMap data;
@ -652,6 +661,487 @@ fn main([[builtin(vertex_index)]] tint_pulling_vertex_index_1 : u32) -> [[builti
  EXPECT_EQ(expect, str(got));
 }

+TEST_F(VertexPullingTest, FormatsAligned) {  // offset 64 = u32 index 16
+  auto* src = R"(
+[[stage(vertex)]]
+fn main(
+    [[location(0)]] uint8x2 : vec2<u32>,
+    [[location(1)]] uint8x4 : vec4<u32>,
+    [[location(2)]] sint8x2 : vec2<i32>,
+    [[location(3)]] sint8x4 : vec4<i32>,
+    [[location(4)]] unorm8x2 : vec2<f32>,
+    [[location(5)]] unorm8x4 : vec4<f32>,
+    [[location(6)]] snorm8x2 : vec2<f32>,
+    [[location(7)]] snorm8x4 : vec4<f32>,
+    [[location(8)]] uint16x2 : vec2<u32>,
+    [[location(9)]] uint16x4 : vec4<u32>,
+    [[location(10)]] sint16x2 : vec2<i32>,
+    [[location(11)]] sint16x4 : vec4<i32>,
+    [[location(12)]] unorm16x2 : vec2<f32>,
+    [[location(13)]] unorm16x4 : vec4<f32>,
+    [[location(14)]] snorm16x2 : vec2<f32>,
+    [[location(15)]] snorm16x4 : vec4<f32>,
+    [[location(16)]] float16x2 : vec2<f32>,
+    [[location(17)]] float16x4 : vec4<f32>,
+    [[location(18)]] float32 : f32,
+    [[location(19)]] float32x2 : vec2<f32>,
+    [[location(20)]] float32x3 : vec3<f32>,
+    [[location(21)]] float32x4 : vec4<f32>,
+    [[location(22)]] uint32 : u32,
+    [[location(23)]] uint32x2 : vec2<u32>,
+    [[location(24)]] uint32x3 : vec3<u32>,
+    [[location(25)]] uint32x4 : vec4<u32>,
+    [[location(26)]] sint32 : i32,
+    [[location(27)]] sint32x2 : vec2<i32>,
+    [[location(28)]] sint32x3 : vec3<i32>,
+    [[location(29)]] sint32x4 : vec4<i32>
+  ) -> [[builtin(position)]] vec4<f32> {
+  return vec4<f32>(0.0, 0.0, 0.0, 1.0);
+}
+)";  // input: one attribute per vertex format, locations 0-29
+
+  auto* expect = R"(
+[[block]]
+struct TintVertexData {
+  tint_vertex_data : [[stride(4)]] array<u32>;
+};
+
+[[binding(0), group(4)]] var<storage, read> tint_pulling_vertex_buffer_0 : TintVertexData;
+
+[[stage(vertex)]]
+fn main([[builtin(vertex_index)]] tint_pulling_vertex_index : u32) -> [[builtin(position)]] vec4<f32> {
+  var uint8x2 : vec2<u32>;
+  var uint8x4 : vec4<u32>;
+  var sint8x2 : vec2<i32>;
+  var sint8x4 : vec4<i32>;
+  var unorm8x2 : vec2<f32>;
+  var unorm8x4 : vec4<f32>;
+  var snorm8x2 : vec2<f32>;
+  var snorm8x4 : vec4<f32>;
+  var uint16x2 : vec2<u32>;
+  var uint16x4 : vec4<u32>;
+  var sint16x2 : vec2<i32>;
+  var sint16x4 : vec4<i32>;
+  var unorm16x2 : vec2<f32>;
+  var unorm16x4 : vec4<f32>;
+  var snorm16x2 : vec2<f32>;
+  var snorm16x4 : vec4<f32>;
+  var float16x2 : vec2<f32>;
+  var float16x4 : vec4<f32>;
+  var float32 : f32;
+  var float32x2 : vec2<f32>;
+  var float32x3 : vec3<f32>;
+  var float32x4 : vec4<f32>;
+  var uint32 : u32;
+  var uint32x2 : vec2<u32>;
+  var uint32x3 : vec3<u32>;
+  var uint32x4 : vec4<u32>;
+  var sint32 : i32;
+  var sint32x2 : vec2<i32>;
+  var sint32x3 : vec3<i32>;
+  var sint32x4 : vec4<i32>;
+  {
+    let buffer_array_base_0 = (tint_pulling_vertex_index * 64u);
+    uint8x2 = ((vec2<u32>((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 16u)) << vec2<u32>(8u, 0u)) >> vec2<u32>(24u));
+    uint8x4 = ((vec4<u32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]) << vec4<u32>(24u, 16u, 8u, 0u)) >> vec4<u32>(24u));
+    sint8x2 = ((vec2<i32>(bitcast<i32>((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 16u))) << vec2<u32>(8u, 0u)) >> vec2<u32>(24u));
+    sint8x4 = ((vec4<i32>(bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)])) << vec4<u32>(24u, 16u, 8u, 0u)) >> vec4<u32>(24u));
+    unorm8x2 = unpack4x8unorm((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] & 65535u)).xy;
+    unorm8x4 = unpack4x8unorm(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]);
+    snorm8x2 = unpack4x8snorm((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] & 65535u)).xy;
+    snorm8x4 = unpack4x8snorm(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]);
+    uint16x2 = ((vec2<u32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]) << vec2<u32>(16u, 0u)) >> vec2<u32>(16u));
+    uint16x4 = ((vec2<u32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)], tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)]).xxyy << vec4<u32>(16u, 0u, 16u, 0u)) >> vec4<u32>(16u));
+    sint16x2 = ((vec2<i32>(bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)])) << vec2<u32>(16u, 0u)) >> vec2<u32>(16u));
+    sint16x4 = ((vec2<i32>(bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)])).xxyy << vec4<u32>(16u, 0u, 16u, 0u)) >> vec4<u32>(16u));
+    unorm16x2 = unpack2x16unorm(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]);
+    unorm16x4 = vec4<f32>(unpack2x16unorm(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), unpack2x16unorm(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)]));
+    snorm16x2 = unpack2x16snorm(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]);
+    snorm16x4 = vec4<f32>(unpack2x16snorm(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), unpack2x16snorm(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)]));
+    float16x2 = unpack2x16float(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]);
+    float16x4 = vec4<f32>(unpack2x16float(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), unpack2x16float(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)]));
+    float32 = bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]);
+    float32x2 = vec2<f32>(bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)]));
+    float32x3 = vec3<f32>(bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)]));
+    float32x4 = vec4<f32>(bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 19u)]));
+    uint32 = tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)];
+    uint32x2 = vec2<u32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)], tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)]);
+    uint32x3 = vec3<u32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)], tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)], tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)]);
+    uint32x4 = vec4<u32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)], tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)], tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)], tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 19u)]);
+    sint32 = bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]);
+    sint32x2 = vec2<i32>(bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)]));
+    sint32x3 = vec3<i32>(bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)]), bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)]));
+    sint32x4 = vec4<i32>(bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)]), bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)]), bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 19u)]));
+  }
+  return vec4<f32>(0.0, 0.0, 0.0, 1.0);
+}
+)";  // golden output: every load starts at buffer_array_base_0 + 16u
+
+  VertexPulling::Config cfg;
+  cfg.vertex_state = {{{256,  // stride: 256 bytes -> vertex_index * 64u
+                        InputStepMode::kVertex,
+                        {  // entries: {format, byte offset, location}
+                            {VertexFormat::kUint8x2, 64, 0},
+                            {VertexFormat::kUint8x4, 64, 1},
+                            {VertexFormat::kSint8x2, 64, 2},
+                            {VertexFormat::kSint8x4, 64, 3},
+                            {VertexFormat::kUnorm8x2, 64, 4},
+                            {VertexFormat::kUnorm8x4, 64, 5},
+                            {VertexFormat::kSnorm8x2, 64, 6},
+                            {VertexFormat::kSnorm8x4, 64, 7},
+                            {VertexFormat::kUint16x2, 64, 8},
+                            {VertexFormat::kUint16x4, 64, 9},
+                            {VertexFormat::kSint16x2, 64, 10},
+                            {VertexFormat::kSint16x4, 64, 11},
+                            {VertexFormat::kUnorm16x2, 64, 12},
+                            {VertexFormat::kUnorm16x4, 64, 13},
+                            {VertexFormat::kSnorm16x2, 64, 14},
+                            {VertexFormat::kSnorm16x4, 64, 15},
+                            {VertexFormat::kFloat16x2, 64, 16},
+                            {VertexFormat::kFloat16x4, 64, 17},
+                            {VertexFormat::kFloat32, 64, 18},
+                            {VertexFormat::kFloat32x2, 64, 19},
+                            {VertexFormat::kFloat32x3, 64, 20},
+                            {VertexFormat::kFloat32x4, 64, 21},
+                            {VertexFormat::kUint32, 64, 22},
+                            {VertexFormat::kUint32x2, 64, 23},
+                            {VertexFormat::kUint32x3, 64, 24},
+                            {VertexFormat::kUint32x4, 64, 25},
+                            {VertexFormat::kSint32, 64, 26},
+                            {VertexFormat::kSint32x2, 64, 27},
+                            {VertexFormat::kSint32x3, 64, 28},
+                            {VertexFormat::kSint32x4, 64, 29},
+                        }}}};
+  cfg.entry_point_name = "main";
+
+  DataMap data;
+  data.Add<VertexPulling::Config>(cfg);
+  auto got = Run<VertexPulling>(src, data);
+
+  EXPECT_EQ(expect, str(got));  // exact text match against the golden output
+}
+
+TEST_F(VertexPullingTest, FormatsStrideUnaligned) {  // offset 63 = 4*15 + 3
+  auto* src = R"(
+[[stage(vertex)]]
+fn main(
+    [[location(0)]] uint8x2 : vec2<u32>,
+    [[location(1)]] uint8x4 : vec4<u32>,
+    [[location(2)]] sint8x2 : vec2<i32>,
+    [[location(3)]] sint8x4 : vec4<i32>,
+    [[location(4)]] unorm8x2 : vec2<f32>,
+    [[location(5)]] unorm8x4 : vec4<f32>,
+    [[location(6)]] snorm8x2 : vec2<f32>,
+    [[location(7)]] snorm8x4 : vec4<f32>,
+    [[location(8)]] uint16x2 : vec2<u32>,
+    [[location(9)]] uint16x4 : vec4<u32>,
+    [[location(10)]] sint16x2 : vec2<i32>,
+    [[location(11)]] sint16x4 : vec4<i32>,
+    [[location(12)]] unorm16x2 : vec2<f32>,
+    [[location(13)]] unorm16x4 : vec4<f32>,
+    [[location(14)]] snorm16x2 : vec2<f32>,
+    [[location(15)]] snorm16x4 : vec4<f32>,
+    [[location(16)]] float16x2 : vec2<f32>,
+    [[location(17)]] float16x4 : vec4<f32>,
+    [[location(18)]] float32 : f32,
+    [[location(19)]] float32x2 : vec2<f32>,
+    [[location(20)]] float32x3 : vec3<f32>,
+    [[location(21)]] float32x4 : vec4<f32>,
+    [[location(22)]] uint32 : u32,
+    [[location(23)]] uint32x2 : vec2<u32>,
+    [[location(24)]] uint32x3 : vec3<u32>,
+    [[location(25)]] uint32x4 : vec4<u32>,
+    [[location(26)]] sint32 : i32,
+    [[location(27)]] sint32x2 : vec2<i32>,
+    [[location(28)]] sint32x3 : vec3<i32>,
+    [[location(29)]] sint32x4 : vec4<i32>
+  ) -> [[builtin(position)]] vec4<f32> {
+  return vec4<f32>(0.0, 0.0, 0.0, 1.0);
+}
+)";  // input: one attribute per vertex format, locations 0-29
+
+  auto* expect =
+      R"(
+[[block]]
+struct TintVertexData {
+  tint_vertex_data : [[stride(4)]] array<u32>;
+};
+
+[[binding(0), group(4)]] var<storage, read> tint_pulling_vertex_buffer_0 : TintVertexData;
+
+[[stage(vertex)]]
+fn main([[builtin(vertex_index)]] tint_pulling_vertex_index : u32) -> [[builtin(position)]] vec4<f32> {
+  var uint8x2 : vec2<u32>;
+  var uint8x4 : vec4<u32>;
+  var sint8x2 : vec2<i32>;
+  var sint8x4 : vec4<i32>;
+  var unorm8x2 : vec2<f32>;
+  var unorm8x4 : vec4<f32>;
+  var snorm8x2 : vec2<f32>;
+  var snorm8x4 : vec4<f32>;
+  var uint16x2 : vec2<u32>;
+  var uint16x4 : vec4<u32>;
+  var sint16x2 : vec2<i32>;
+  var sint16x4 : vec4<i32>;
+  var unorm16x2 : vec2<f32>;
+  var unorm16x4 : vec4<f32>;
+  var snorm16x2 : vec2<f32>;
+  var snorm16x4 : vec4<f32>;
+  var float16x2 : vec2<f32>;
+  var float16x4 : vec4<f32>;
+  var float32 : f32;
+  var float32x2 : vec2<f32>;
+  var float32x3 : vec3<f32>;
+  var float32x4 : vec4<f32>;
+  var uint32 : u32;
+  var uint32x2 : vec2<u32>;
+  var uint32x3 : vec3<u32>;
+  var uint32x4 : vec4<u32>;
+  var sint32 : i32;
+  var sint32x2 : vec2<i32>;
+  var sint32x3 : vec3<i32>;
+  var sint32x4 : vec4<i32>;
+  {
+    let buffer_array_base_0 = (tint_pulling_vertex_index * 64u);
+    uint8x2 = ((vec2<u32>((((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 8u)) & 4294901760u)) << vec2<u32>(8u, 0u)) >> vec2<u32>(24u));
+    uint8x4 = ((vec4<u32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u))) << vec4<u32>(24u, 16u, 8u, 0u)) >> vec4<u32>(24u));
+    sint8x2 = ((vec2<i32>(bitcast<i32>((((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 8u)) & 4294901760u))) << vec2<u32>(8u, 0u)) >> vec2<u32>(24u));
+    sint8x4 = ((vec4<i32>(bitcast<i32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u)))) << vec4<u32>(24u, 16u, 8u, 0u)) >> vec4<u32>(24u));
+    unorm8x2 = unpack4x8unorm((((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u)) & 65535u)).xy;
+    unorm8x4 = unpack4x8unorm(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u)));
+    snorm8x2 = unpack4x8snorm((((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u)) & 65535u)).xy;
+    snorm8x4 = unpack4x8snorm(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u)));
+    uint16x2 = ((vec2<u32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u))) << vec2<u32>(16u, 0u)) >> vec2<u32>(16u));
+    uint16x4 = ((vec2<u32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u)), ((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] << 8u))).xxyy << vec4<u32>(16u, 0u, 16u, 0u)) >> vec4<u32>(16u));
+    sint16x2 = ((vec2<i32>(bitcast<i32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u)))) << vec2<u32>(16u, 0u)) >> vec2<u32>(16u));
+    sint16x4 = ((vec2<i32>(bitcast<i32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u))), bitcast<i32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] << 8u)))).xxyy << vec4<u32>(16u, 0u, 16u, 0u)) >> vec4<u32>(16u));
+    unorm16x2 = unpack2x16unorm(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u)));
+    unorm16x4 = vec4<f32>(unpack2x16unorm(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u))), unpack2x16unorm(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] << 8u))));
+    snorm16x2 = unpack2x16snorm(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u)));
+    snorm16x4 = vec4<f32>(unpack2x16snorm(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u))), unpack2x16snorm(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] << 8u))));
+    float16x2 = unpack2x16float(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u)));
+    float16x4 = vec4<f32>(unpack2x16float(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u))), unpack2x16float(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] << 8u))));
+    float32 = bitcast<f32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u)));
+    float32x2 = vec2<f32>(bitcast<f32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u))), bitcast<f32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] << 8u))));
+    float32x3 = vec3<f32>(bitcast<f32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u))), bitcast<f32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] << 8u))), bitcast<f32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)] << 8u))));
+    float32x4 = vec4<f32>(bitcast<f32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u))), bitcast<f32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] << 8u))), bitcast<f32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)] << 8u))), bitcast<f32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 19u)] << 8u))));
+    uint32 = ((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u));
+    uint32x2 = vec2<u32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u)), ((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] << 8u)));
+    uint32x3 = vec3<u32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u)), ((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] << 8u)), ((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)] << 8u)));
+    uint32x4 = vec4<u32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u)), ((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] << 8u)), ((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)] << 8u)), ((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 19u)] << 8u)));
+    sint32 = bitcast<i32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u)));
+    sint32x2 = vec2<i32>(bitcast<i32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u))), bitcast<i32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] << 8u))));
+    sint32x3 = vec3<i32>(bitcast<i32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u))), bitcast<i32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] << 8u))), bitcast<i32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)] << 8u))));
+    sint32x4 = vec4<i32>(bitcast<i32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 15u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 8u))), bitcast<i32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] << 8u))), bitcast<i32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)] << 8u))), bitcast<i32>(((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)] >> 24u) | (tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 19u)] << 8u))));
+  }
+  return vec4<f32>(0.0, 0.0, 0.0, 1.0);
+}
+)";  // golden output: values are stitched from u32 words 15, 16 and up
+
+  VertexPulling::Config cfg;
+  cfg.vertex_state = {{{256,  // stride is still 256; only offsets misalign
+                        InputStepMode::kVertex,
+                        {  // entries: {format, byte offset, location}
+                            {VertexFormat::kUint8x2, 63, 0},
+                            {VertexFormat::kUint8x4, 63, 1},
+                            {VertexFormat::kSint8x2, 63, 2},
+                            {VertexFormat::kSint8x4, 63, 3},
+                            {VertexFormat::kUnorm8x2, 63, 4},
+                            {VertexFormat::kUnorm8x4, 63, 5},
+                            {VertexFormat::kSnorm8x2, 63, 6},
+                            {VertexFormat::kSnorm8x4, 63, 7},
+                            {VertexFormat::kUint16x2, 63, 8},
+                            {VertexFormat::kUint16x4, 63, 9},
+                            {VertexFormat::kSint16x2, 63, 10},
+                            {VertexFormat::kSint16x4, 63, 11},
+                            {VertexFormat::kUnorm16x2, 63, 12},
+                            {VertexFormat::kUnorm16x4, 63, 13},
+                            {VertexFormat::kSnorm16x2, 63, 14},
+                            {VertexFormat::kSnorm16x4, 63, 15},
+                            {VertexFormat::kFloat16x2, 63, 16},
+                            {VertexFormat::kFloat16x4, 63, 17},
+                            {VertexFormat::kFloat32, 63, 18},
+                            {VertexFormat::kFloat32x2, 63, 19},
+                            {VertexFormat::kFloat32x3, 63, 20},
+                            {VertexFormat::kFloat32x4, 63, 21},
+                            {VertexFormat::kUint32, 63, 22},
+                            {VertexFormat::kUint32x2, 63, 23},
+                            {VertexFormat::kUint32x3, 63, 24},
+                            {VertexFormat::kUint32x4, 63, 25},
+                            {VertexFormat::kSint32, 63, 26},
+                            {VertexFormat::kSint32x2, 63, 27},
+                            {VertexFormat::kSint32x3, 63, 28},
+                            {VertexFormat::kSint32x4, 63, 29},
+                        }}}};
+  cfg.entry_point_name = "main";
+
+  DataMap data;
+  data.Add<VertexPulling::Config>(cfg);
+  auto got = Run<VertexPulling>(src, data);
+
+  EXPECT_EQ(expect, str(got));  // exact text match against the golden output
+}
+
+TEST_F(VertexPullingTest, FormatsWithVectorsResized) {
+  auto* src = R"(
+[[stage(vertex)]]
+fn main(
+    [[location(0)]] uint8x2 : vec3<u32>,
+    [[location(1)]] uint8x4 : vec2<u32>,
+    [[location(2)]] sint8x2 : i32,
+    [[location(3)]] sint8x4 : vec2<i32>,
+    [[location(4)]] unorm8x2 : vec4<f32>,
+    [[location(5)]] unorm8x4 : f32,
+    [[location(6)]] snorm8x2 : vec3<f32>,
+    [[location(7)]] snorm8x4 : f32,
+    [[location(8)]] uint16x2 : vec3<u32>,
+    [[location(9)]] uint16x4 : vec2<u32>,
+    [[location(10)]] sint16x2 : vec4<i32>,
+    [[location(11)]] sint16x4 : i32,
+    [[location(12)]] unorm16x2 : vec3<f32>,
+    [[location(13)]] unorm16x4 : f32,
+    [[location(14)]] snorm16x2 : vec4<f32>,
+    [[location(15)]] snorm16x4 : vec3<f32>,
+    [[location(16)]] float16x2 : vec4<f32>,
+    [[location(17)]] float16x4 : f32,
+    [[location(18)]] float32 : vec4<f32>,
+    [[location(19)]] float32x2 : vec4<f32>,
+    [[location(20)]] float32x3 : vec2<f32>,
+    [[location(21)]] float32x4 : vec3<f32>,
+    [[location(22)]] uint32 : vec3<u32>,
+    [[location(23)]] uint32x2 : vec4<u32>,
+    [[location(24)]] uint32x3 : vec4<u32>,
+    [[location(25)]] uint32x4 : vec2<u32>,
+    [[location(26)]] sint32 : vec4<i32>,
+    [[location(27)]] sint32x2 : vec3<i32>,
+    [[location(28)]] sint32x3 : i32,
+    [[location(29)]] sint32x4 : vec2<i32>
+  ) -> [[builtin(position)]] vec4<f32> {
+  return vec4<f32>(0.0, 0.0, 0.0, 1.0);
+}
+)";
+
+  auto* expect = R"(
+[[block]]
+struct TintVertexData {
+  tint_vertex_data : [[stride(4)]] array<u32>;
+};
+
+[[binding(0), group(4)]] var<storage, read> tint_pulling_vertex_buffer_0 : TintVertexData;
+
+[[stage(vertex)]]
+fn main([[builtin(vertex_index)]] tint_pulling_vertex_index : u32) -> [[builtin(position)]] vec4<f32> {
+  var uint8x2 : vec3<u32>;
+  var uint8x4 : vec2<u32>;
+  var sint8x2 : i32;
+  var sint8x4 : vec2<i32>;
+  var unorm8x2 : vec4<f32>;
+  var unorm8x4 : f32;
+  var snorm8x2 : vec3<f32>;
+  var snorm8x4 : f32;
+  var uint16x2 : vec3<u32>;
+  var uint16x4 : vec2<u32>;
+  var sint16x2 : vec4<i32>;
+  var sint16x4 : i32;
+  var unorm16x2 : vec3<f32>;
+  var unorm16x4 : f32;
+  var snorm16x2 : vec4<f32>;
+  var snorm16x4 : vec3<f32>;
+  var float16x2 : vec4<f32>;
+  var float16x4 : f32;
+  var float32 : vec4<f32>;
+  var float32x2 : vec4<f32>;
+  var float32x3 : vec2<f32>;
+  var float32x4 : vec3<f32>;
+  var uint32 : vec3<u32>;
+  var uint32x2 : vec4<u32>;
+  var uint32x3 : vec4<u32>;
+  var uint32x4 : vec2<u32>;
+  var sint32 : vec4<i32>;
+  var sint32x2 : vec3<i32>;
+  var sint32x3 : i32;
+  var sint32x4 : vec2<i32>;
+  {
+    let buffer_array_base_0 = (tint_pulling_vertex_index * 64u);
+    uint8x2 = vec3<u32>(((vec2<u32>((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 16u)) << vec2<u32>(8u, 0u)) >> vec2<u32>(24u)), 0u);
+    uint8x4 = (((vec4<u32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]) << vec4<u32>(24u, 16u, 8u, 0u)) >> vec4<u32>(24u))).xy;
+    sint8x2 = (((vec2<i32>(bitcast<i32>((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] << 16u))) << vec2<u32>(8u, 0u)) >> vec2<u32>(24u))).x;
+    sint8x4 = (((vec4<i32>(bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)])) << vec4<u32>(24u, 16u, 8u, 0u)) >> vec4<u32>(24u))).xy;
+    unorm8x2 = vec4<f32>(unpack4x8unorm((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] & 65535u)).xy, 0.0, 1.0);
+    unorm8x4 = unpack4x8unorm(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]).x;
+    snorm8x2 = vec3<f32>(unpack4x8snorm((tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)] & 65535u)).xy, 0.0);
+    snorm8x4 = unpack4x8snorm(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]).x;
+    uint16x2 = vec3<u32>(((vec2<u32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]) << vec2<u32>(16u, 0u)) >> vec2<u32>(16u)), 0u);
+    uint16x4 = (((vec2<u32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)], tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)]).xxyy << vec4<u32>(16u, 0u, 16u, 0u)) >> vec4<u32>(16u))).xy;
+    sint16x2 = vec4<i32>(((vec2<i32>(bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)])) << vec2<u32>(16u, 0u)) >> vec2<u32>(16u)), 0, 1);
+    sint16x4 = (((vec2<i32>(bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)])).xxyy << vec4<u32>(16u, 0u, 16u, 0u)) >> vec4<u32>(16u))).x;
+    unorm16x2 = vec3<f32>(unpack2x16unorm(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), 0.0);
+    unorm16x4 = vec4<f32>(unpack2x16unorm(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), unpack2x16unorm(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)])).x;
+    snorm16x2 = vec4<f32>(unpack2x16snorm(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), 0.0, 1.0);
+    snorm16x4 = vec4<f32>(unpack2x16snorm(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), unpack2x16snorm(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)])).xyz;
+    float16x2 = vec4<f32>(unpack2x16float(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), 0.0, 1.0);
+    float16x4 = vec4<f32>(unpack2x16float(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), unpack2x16float(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)])).x;
+    float32 = vec4<f32>(bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), 0.0, 0.0, 1.0);
+    float32x2 = vec4<f32>(vec2<f32>(bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)])), 0.0, 1.0);
+    float32x3 = vec3<f32>(bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)])).xy;
+    float32x4 = vec4<f32>(bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)]), bitcast<f32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 19u)])).xyz;
+    uint32 = vec3<u32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)], 0u, 0u);
+    uint32x2 = vec4<u32>(vec2<u32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)], tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)]), 0u, 1u);
+    uint32x3 = vec4<u32>(vec3<u32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)], tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)], tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)]), 1u);
+    uint32x4 = vec4<u32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)], tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)], tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)], tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 19u)]).xy;
+    sint32 = vec4<i32>(bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), 0, 0, 1);
+    sint32x2 = vec3<i32>(vec2<i32>(bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)])), 0);
+    sint32x3 = vec3<i32>(bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)]), bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)])).x;
+    sint32x4 = vec4<i32>(bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 16u)]), bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 17u)]), bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 18u)]), bitcast<i32>(tint_pulling_vertex_buffer_0.tint_vertex_data[(buffer_array_base_0 + 19u)])).xy;
+  }
+  return vec4<f32>(0.0, 0.0, 0.0, 1.0);
+}
+)";
+
+  VertexPulling::Config cfg;
+  cfg.vertex_state = {{{256,
+                        InputStepMode::kVertex,
+                        {
+                            {VertexFormat::kUint8x2, 64, 0},
+                            {VertexFormat::kUint8x4, 64, 1},
+                            {VertexFormat::kSint8x2, 64, 2},
+                            {VertexFormat::kSint8x4, 64, 3},
+                            {VertexFormat::kUnorm8x2, 64, 4},
+                            {VertexFormat::kUnorm8x4, 64, 5},
+                            {VertexFormat::kSnorm8x2, 64, 6},
+                            {VertexFormat::kSnorm8x4, 64, 7},
+                            {VertexFormat::kUint16x2, 64, 8},
+                            {VertexFormat::kUint16x4, 64, 9},
+                            {VertexFormat::kSint16x2, 64, 10},
+                            {VertexFormat::kSint16x4, 64, 11},
+                            {VertexFormat::kUnorm16x2, 64, 12},
+                            {VertexFormat::kUnorm16x4, 64, 13},
+                            {VertexFormat::kSnorm16x2, 64, 14},
+                            {VertexFormat::kSnorm16x4, 64, 15},
+                            {VertexFormat::kFloat16x2, 64, 16},
+                            {VertexFormat::kFloat16x4, 64, 17},
+                            {VertexFormat::kFloat32, 64, 18},
+                            {VertexFormat::kFloat32x2, 64, 19},
+                            {VertexFormat::kFloat32x3, 64, 20},
+                            {VertexFormat::kFloat32x4, 64, 21},
+                            {VertexFormat::kUint32, 64, 22},
+                            {VertexFormat::kUint32x2, 64, 23},
+                            {VertexFormat::kUint32x3, 64, 24},
+                            {VertexFormat::kUint32x4, 64, 25},
+                            {VertexFormat::kSint32, 64, 26},
+                            {VertexFormat::kSint32x2, 64, 27},
+                            {VertexFormat::kSint32x3, 64, 28},
+                            {VertexFormat::kSint32x4, 64, 29},
+                        }}}};
+  cfg.entry_point_name = "main";
+
+  DataMap data;
+  data.Add<VertexPulling::Config>(cfg);
+  auto got = Run<VertexPulling>(src, data);
+
+  EXPECT_EQ(expect, str(got));
+}
+
 }  // namespace
 }  // namespace transform
 }  // namespace tint
--- a/src/utils/math.h
+++ b/src/utils/math.h
@ -17,6 +17,7 @@

 #include <sstream>
 #include <string>
+#include <type_traits>

 namespace tint {
 namespace utils {
@ -38,6 +39,18 @@ inline bool IsPowerOfTwo(T value) {
  return (value & (value - 1)) == 0;
 }

+/// @param value the input value; `T` must be an unsigned type
+/// @returns the largest power of two dividing `value` (1 if `value` is 0)
+template <typename T>
+inline std::enable_if_t<std::is_unsigned<T>::value, T> MaxAlignOf(T value) {
+  T pot = 1;  // power-of-two result, doubled once per trailing zero bit
+  while (value && ((value & 1u) == 0)) {  // `value &&` ends the loop for 0
+    pot <<= 1;
+    value >>= 1;
+  }
+  return pot;
+}
+
 }  // namespace utils
 }  // namespace tint

--- a/src/utils/math_test.cc
+++ b/src/utils/math_test.cc
@ -58,6 +58,26 @@ TEST(MathTests, IsPowerOfTwo) {
  EXPECT_EQ(IsPowerOfTwo(9), false);
 }

+TEST(MathTests, MaxAlignOf) {
+  EXPECT_EQ(MaxAlignOf(0u), 1u);  // zero yields 1
+  EXPECT_EQ(MaxAlignOf(1u), 1u);  // odd values yield 1
+  EXPECT_EQ(MaxAlignOf(2u), 2u);  // exact powers of two yield themselves
+  EXPECT_EQ(MaxAlignOf(3u), 1u);
+  EXPECT_EQ(MaxAlignOf(4u), 4u);
+  EXPECT_EQ(MaxAlignOf(5u), 1u);
+  EXPECT_EQ(MaxAlignOf(6u), 2u);  // 6 = 2 * 3
+  EXPECT_EQ(MaxAlignOf(7u), 1u);
+  EXPECT_EQ(MaxAlignOf(8u), 8u);
+  EXPECT_EQ(MaxAlignOf(9u), 1u);
+  EXPECT_EQ(MaxAlignOf(10u), 2u);  // 10 = 2 * 5
+  EXPECT_EQ(MaxAlignOf(11u), 1u);
+  EXPECT_EQ(MaxAlignOf(12u), 4u);  // 12 = 4 * 3
+  EXPECT_EQ(MaxAlignOf(13u), 1u);
+  EXPECT_EQ(MaxAlignOf(14u), 2u);  // 14 = 2 * 7
+  EXPECT_EQ(MaxAlignOf(15u), 1u);
+  EXPECT_EQ(MaxAlignOf(16u), 16u);
+}
+
 }  // namespace
 }  // namespace utils
 }  // namespace tint