
Commit 9982041

Use int64 indexing for pids as well (#1195)
Parent: 1f2593c

File tree: 3 files changed, +78 −8 lines


helion/_compiler/program_id.py

Lines changed: 21 additions & 7 deletions
@@ -12,6 +12,20 @@
 from .device_function import DeviceFunction
 from .host_function import HostFunction
 
+
+def typed_program_id(dim: int = 0) -> str:
+    """Generate tl.program_id() with int64 casting when needed.
+
+    Only casts to int64 when index_dtype is int64, to avoid overhead
+    for the common int32 case.
+    """
+    env = CompileEnvironment.current()
+    dtype = env.triton_index_type()
+    if dtype != "tl.int32":
+        return f"tl.program_id({dim}).to({dtype})"
+    return f"tl.program_id({dim})"
+
+
 if TYPE_CHECKING:
     import sympy
 
@@ -108,7 +122,7 @@ def _setup_persistent_kernel_and_wrap_body(
     @property
     def virtual_program_id(self) -> str:
         """Get the virtual program ID expression for this strategy."""
-        return "tl.program_id(0)"
+        return typed_program_id(0)
 
     def _is_persistent(self) -> bool:
         """Check if this is a persistent strategy. Default False."""
@@ -157,7 +171,7 @@ def codegen_pid_init(self) -> list[ast.stmt]:
         pid_type = current_device_fn.config.get("pid_type", "flat")
         if isinstance(pid_type, str) and pid_type.startswith("persistent"):
             return []
-        return [statement_from_string(f"{self.shared_pid_var} = tl.program_id(0)")]
+        return [statement_from_string(f"{self.shared_pid_var} = {typed_program_id(0)}")]
 
     def _get_cdiv_blocks(
         self, state: CodegenState, exclude_last: bool = False
@@ -228,7 +242,7 @@ class XYZProgramIDs(ProgramIDs):
     def codegen(self, state: CodegenState) -> None:
         for i, pid in enumerate(self.pid_info):
             state.codegen.statements_stack[-1].insert(
-                i, statement_from_string(f"{pid.pid_var} = tl.program_id({i})")
+                i, statement_from_string(f"{pid.pid_var} = {typed_program_id(i)}")
             )
 
     def codegen_grid(self) -> ast.AST:
@@ -242,7 +256,7 @@ class FlatProgramIDs(ProgramIDs):
     """Only use the x grid and compute other dimensions"""
 
     def codegen(self, state: CodegenState) -> None:
-        pid_var = self.shared_pid_var or "tl.program_id(0)"
+        pid_var = self.shared_pid_var or typed_program_id(0)
         statements = self._decompose_pid_to_statements(pid_var, state)
         state.codegen.statements_stack[-1][:] = [
             *statements,
@@ -420,7 +434,7 @@ def __init__(self, is_blocked: bool = False) -> None:
             }
         else:
             self.range_kwargs: dict[str, str] = {
-                "begin": "tl.program_id(0)",
+                "begin": typed_program_id(0),
                 "end": self.total_pids_var,
                 "step": NUM_SM_VAR,
             }
@@ -471,7 +485,7 @@ def setup_persistent_kernel(
             ),
             (
                 self.start_pid_var,
-                f"tl.program_id(0) * {self.block_size_var}",
+                f"{typed_program_id(0)} * {self.block_size_var}",
             ),
             (
                 self.end_pid_var,
@@ -521,7 +535,7 @@ def _generate_pid_statements(self, state: CodegenState) -> list[ast.stmt]:
         if not self.virtual_pid_var:
            # Generate regular PID decomposition
            return self._decompose_pid_to_statements(
-                self.shared_pid_var or "tl.program_id(0)", state
+                self.shared_pid_var or typed_program_id(0), state
            )
 
        # Generate persistent PID decomposition
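For reference, here is a minimal standalone sketch of what the new helper emits under each index dtype. Unlike the real typed_program_id(), which reads the dtype from CompileEnvironment.current().triton_index_type(), this sketch takes the dtype as a parameter so it can run outside the compiler (typed_program_id_demo is a name invented here):

def typed_program_id_demo(dim: int = 0, dtype: str = "tl.int32") -> str:
    # Widen the program id only when the kernel indexes with a 64-bit type;
    # the common int32 case keeps the bare tl.program_id() call.
    if dtype != "tl.int32":
        return f"tl.program_id({dim}).to({dtype})"
    return f"tl.program_id({dim})"

assert typed_program_id_demo(0, "tl.int64") == "tl.program_id(0).to(tl.int64)"
assert typed_program_id_demo(0, "tl.int32") == "tl.program_id(0)"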

helion/_compiler/tile_strategy.py

Lines changed: 3 additions & 1 deletion
@@ -498,13 +498,15 @@ def _codegen_common(
         return block_size_var, offsets_var, total_numel, statements
 
     def codegen_grid(self, state: CodegenState) -> DeviceGridState:
+        from .program_id import typed_program_id
+
         block_size_var, offsets_var, total_numel, statements = self._codegen_common(
             state
         )
         env = CompileEnvironment.current()
         dtype = env.triton_index_type()
         state.add_statement(
-            f"{offsets_var} = tl.program_id(0) * ({block_size_var}) + tl.arange(0, {block_size_var}).to({dtype})"
+            f"{offsets_var} = {typed_program_id(0)} * ({block_size_var}) + tl.arange(0, {block_size_var}).to({dtype})"
        )
        state.codegen.statements_stack[-1].extend(statements)
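Concretely, for a kernel compiled with index_dtype=torch.int64, the offsets statement emitted by codegen_grid changes roughly as sketched below. This is an illustrative excerpt of generated Triton code, not output copied from Helion; offsets_0 and _BLOCK_SIZE_0 are made-up names:

# before: tl.program_id(0) is int32, so the pid * block_size product is computed
# in int32 and can wrap before the int64 tl.arange(...) promotes the sum
offsets_0 = tl.program_id(0) * (_BLOCK_SIZE_0) + tl.arange(0, _BLOCK_SIZE_0).to(tl.int64)

# after: the program id is widened first, so the multiply happens in int64
offsets_0 = tl.program_id(0).to(tl.int64) * (_BLOCK_SIZE_0) + tl.arange(0, _BLOCK_SIZE_0).to(tl.int64)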

test/test_indexing.py

Lines changed: 54 additions & 0 deletions
@@ -582,6 +582,60 @@ def passthrough_int64(x: torch.Tensor) -> torch.Tensor:
             passthrough_int64.specialization_key((large,)),
         )
 
+    @skipIfRefEager("Test checks generated code")
+    def test_program_id_cast_to_int64(self):
+        """Test that tl.program_id() is cast to int64 when index_dtype is int64."""
+
+        @helion.kernel(index_dtype=torch.int64)
+        def add_kernel_int64(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+            out = torch.empty_like(x)
+            for tile in hl.tile(x.size(0)):
+                out[tile] = x[tile] + y[tile]
+            return out
+
+        @helion.kernel(index_dtype=torch.int32)
+        def add_kernel_int32(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+            out = torch.empty_like(x)
+            for tile in hl.tile(x.size(0)):
+                out[tile] = x[tile] + y[tile]
+            return out
+
+        x = torch.randn(1024, device=DEVICE)
+        y = torch.randn(1024, device=DEVICE)
+
+        # Test int64 case: program_id should be cast to int64
+        code_int64, result_int64 = code_and_output(add_kernel_int64, (x, y))
+        self.assertIn("tl.program_id(0).to(tl.int64)", code_int64)
+
+        # Test int32 case: program_id should NOT be cast
+        code_int32, result_int32 = code_and_output(add_kernel_int32, (x, y))
+        self.assertNotIn(".to(tl.int64)", code_int32)
+        self.assertIn("tl.program_id(0)", code_int32)
+
+        # Both should produce correct results
+        expected = x + y
+        torch.testing.assert_close(result_int64, expected)
+        torch.testing.assert_close(result_int32, expected)
+
+    @skipIfRefEager("Test checks for no IMA")
+    @skipIfRocm("Test takes too long on ROCm")
+    @skipIfCpu("Test requires GPU")
+    @skipIfLowVRAM("Test requires large memory")
+    def test_large_tensor(self):
+        @helion.kernel(autotune_effort="none")
+        def f(x: torch.Tensor) -> torch.Tensor:
+            out = x.new_empty(x.shape)
+            for (b,) in hl.grid([x.shape[0]]):
+                for (x_tile,) in hl.tile([x.shape[1]]):
+                    out[b, x_tile] = x[b, x_tile]
+            return out
+
+        B = 2**15
+        D = 2**17
+        inp = torch.randn(B, D, device=DEVICE, dtype=torch.float16)
+        out = f(inp)
+        assert (out == inp).all()
+
     def test_assign_int(self):
         @helion.kernel
         def fn(x: torch.Tensor) -> torch.Tensor:
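The sizes in test_large_tensor are chosen so that pid-derived row offsets exceed the int32 range, which is the case the int64 cast is meant to cover. A quick back-of-the-envelope check (plain Python, not part of the test suite):

B, D = 2**15, 2**17                  # 2**32 elements in total
last_row_offset = (B - 1) * D        # 4_294_836_224 elements into the tensor
int32_max = 2**31 - 1                # 2_147_483_647
assert last_row_offset > int32_max   # a 32-bit pid * stride offset would wrap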

0 commit comments
