
Commit 1a08176

[wip] mx: expose a fast path for casting to fp4x2
Summary: not ready for review yet
Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:
ghstack-source-id: deefa24
ghstack-comment-id: 3210931181
Pull-Request: #2832
1 parent 248899b commit 1a08176

2 files changed: +76 −0 lines changed

test/prototype/mx_formats/test_kernels.py

Lines changed: 26 additions & 0 deletions
@@ -561,3 +561,29 @@ def test_cuda_mx_dim1_invalid_block_size():
             scale_dim_x=1,
             scale_dim_y=invalid_block_size,
         )
+
+
+def _fp32_to_fp4_reference(
+    data_hp: torch.Tensor,
+) -> torch.Tensor:
+    data_lp = f32_to_f4_unpacked(data_hp.float())
+    data_lp = pack_uint4(data_lp)
+    return data_lp
+
+
+@pytest.mark.skipif(
+    not is_sm_at_least_100(),
+    reason="requires CUDA capability 10.0 or greater",
+)
+def test_fp32_cast_to_fp4x2():
+    from torchao.prototype.mx_formats.kernels import triton_fp32_cast_to_fp4x2
+
+    M, K = 16, 16
+    x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda")
+    # make x's range be the representable range of fp4
+    x = x * 6.0
+
+    data_ref = _fp32_to_fp4_reference(x)
+    data = triton_fp32_cast_to_fp4x2(x)
+    torch.testing.assert_close(data_ref, data, atol=0, rtol=0)
+    assert data.shape == (M, K // 2)
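For context, the reference path above relies on torchao's f32_to_f4_unpacked and pack_uint4 helpers. The standalone sketch below only illustrates what "cast to fp4 e2m1, then pack two values per byte" means; it is not the library's implementation, and the rounding behavior and nibble order chosen here are assumptions for illustration.

import torch

# The 8 non-negative fp4 e2m1 magnitudes; the sign bit gives 16 codes total.
_F4_E2M1_VALUES = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])


def fp4_e2m1_pack_sketch(x: torch.Tensor) -> torch.Tensor:
    """Illustrative only: round each value to the nearest e2m1 magnitude,
    encode the sign in bit 3, and pack two 4-bit codes per uint8.
    Nearest-value rounding and first-element-in-high-nibble order are assumptions."""
    x = x.float()
    sign = (x < 0).to(torch.uint8)
    # nearest-value rounding against the e2m1 magnitude table
    dist = (x.abs().unsqueeze(-1) - _F4_E2M1_VALUES.to(x.device)).abs()
    code = dist.argmin(dim=-1).to(torch.uint8) | (sign << 3)
    # pack adjacent pairs along the last dim: two fp4 codes per output byte
    code = code.reshape(*x.shape[:-1], x.shape[-1] // 2, 2)
    return (code[..., 0] << 4) | code[..., 1]


# Example: 16 values in the representable range become 8 packed bytes.
packed = fp4_e2m1_pack_sketch(torch.linspace(-6, 6, 16))
assert packed.dtype == torch.uint8 and packed.shape == (8,)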

torchao/prototype/mx_formats/kernels.py

Lines changed: 50 additions & 0 deletions
@@ -1454,6 +1454,56 @@ def _(scale_tensor):
         padded_cols = n_col_blocks * 4
 
         return scale_tensor.new_empty((padded_rows, padded_cols))
+
+    @triton.jit
+    def fp32_cast_to_fp4x2_triton_kernel(
+        x_ptr,
+        q_ptr,
+        stride_xm,
+        stride_xn,
+        M,
+        N,
+    ):
+        pid_m = tl.program_id(1)
+        pid_n = tl.program_id(0)
+        offs_m = pid_m * 128 + tl.arange(0, 128)[:, None]
+        offs_n = pid_n * 64 + tl.arange(0, 64)[None, :]
+        mask = None
+        other = None
+        x = tl.load(
+            x_ptr + offs_m * stride_xm + offs_n * stride_xn, mask=mask, other=other
+        )  # [128, 64]
+        x_blocks = x.to(tl.float32).reshape(128, 4, 16)  # [128, 4, 16]
+        # Convert to FP4
+        x_fp4x2 = convert_fp32_to_fp4_packed(x_blocks.reshape(128, 32, 2).split())
+        offs_m = pid_m * 128 + tl.arange(0, 128)[:, None]
+        offs_n = pid_n * 32 + tl.arange(0, 32)[None, :]
+        mask = (offs_m < M) & (offs_n < N // 2)
+        tl.store(q_ptr + offs_m * (N // 2) + offs_n, x_fp4x2, mask=mask)
+
+    def triton_fp32_cast_to_fp4x2(x: torch.Tensor) -> torch.Tensor:
+        """
+        Input: a float32 tensor with shape (M, N)
+        Output: a uint8 tensor with shape (M, N // 2), with the values being the result
+          of casting each original value to fp4_e2m1, and then packing fp4x2
+
+        TODO(future PR): optimize performance
+        TODO(future PR): better checks for shapes, etc
+        TODO(future PR): integrate into training/inference
+        TODO(future PR): integrate with compile, ideally allowing fusion
+        """
+        M, N = x.shape
+        xq = x.new_empty(M, N // 2, dtype=torch.uint8)
+        grid = (triton.cdiv(N, 64), triton.cdiv(M, 128))
+        fp32_cast_to_fp4x2_triton_kernel[grid](
+            x,
+            xq,
+            x.stride(0),
+            x.stride(1),
+            M,
+            N,
+        )
+        return xq.view(torch.uint8)
 else:
 
     def triton_to_mxfp8_dim1(
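A minimal usage sketch of the exposed fast path, mirroring the unit test above. The shape below is an assumption for illustration; each Triton program handles a 128x64 input tile and writes a 128x32 tile of packed bytes, and the load is currently unmasked (mask=None), which the TODO about better shape checks presumably covers. A CUDA device with capability 10.0 or greater is assumed, matching the test's skipif gate.

import torch
from torchao.prototype.mx_formats.kernels import triton_fp32_cast_to_fp4x2

# example shape; N must be even so that pairs of fp4 values pack into bytes
M, N = 128, 256
x = torch.randn(M, N, dtype=torch.float32, device="cuda") * 6.0  # +/-6.0 is the fp4 e2m1 max magnitude

packed = triton_fp32_cast_to_fp4x2(x)
assert packed.dtype == torch.uint8
assert packed.shape == (M, N // 2)  # two fp4_e2m1 values per output byte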
