
Commit 9d0b8bd

Fix unbacked symints in generated code (#1179)

1 parent 64061aa commit 9d0b8bd

File tree: 4 files changed, +61 −4 lines changed

helion/_compiler/compile_environment.py
test/test_misc.expected
test/test_specialize.expected
test/test_specialize.py

helion/_compiler/compile_environment.py

Lines changed: 6 additions & 2 deletions

@@ -383,8 +383,12 @@ def size_hint(self, n: int | torch.SymInt) -> int:
         if isinstance(n, torch.SymInt):
             expr = n._sympy_()
             if _has_unbacked(expr):
-                # If the size is a symbolic expression with unbacked symbols, then the shape environment
-                # hint will be wrong since we assign a default value to unbacked symbols. Return a default hint.
+                # For unbacked symbols, try to use the hint we stored in var_to_val
+                # when creating the symint (see create_unbacked_symint).
+                # This preserves the original value passed to the kernel.
+                if expr in self.shape_env.var_to_val:
+                    return int(self.shape_env.var_to_val[expr])
+                # Fall back to default hint if not found
                 return 8192

         # pyrefly: ignore [no-matching-overload]
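
As a reading aid, here is a minimal, self-contained sketch of the lookup pattern the patched size_hint relies on. It uses PyTorch's ShapeEnv (torch.fx.experimental.symbolic_shapes); the size_hint function below is a simplified stand-in for Helion's method, and the direct write into var_to_val stands in for what the comment says Helion's create_unbacked_symint does. This is an illustration under those assumptions, not Helion source.

# Sketch only: emulate recording a hint for an unbacked SymInt and then
# resolving it, with 8192 as the fallback when no hint was stored.
import sympy
from torch.fx.experimental.symbolic_shapes import ShapeEnv

shape_env = ShapeEnv()
sym = shape_env.create_unbacked_symint()  # unbacked: has no hint by default
expr = sym._sympy_()

# Stand-in for what Helion's create_unbacked_symint is described as doing:
# remember the concrete value originally passed to the kernel (say, 1).
shape_env.var_to_val[expr] = sympy.Integer(1)

def size_hint(expr) -> int:
    # Prefer the stored hint; otherwise fall back to the default.
    if expr in shape_env.var_to_val:
        return int(shape_env.var_to_val[expr])
    return 8192

print(size_hint(expr))  # -> 1 (previously this would have been 8192)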

test/test_misc.expected

Lines changed: 2 additions & 2 deletions

@@ -491,7 +491,7 @@ def call():
     # src[test_misc.py:N]: ) -> tuple[torch.Tensor, torch.Tensor]:
     # src[test_misc.py:N-N]: ...
     t = rand_strided(size=(16, 1), stride=(1, 1), dtype=torch.float32, device=DEVICE)
-    i = 8192
+    i = 1
     s = 'foo'
     b = False
     f = 1.1
@@ -546,7 +546,7 @@ def call():
     # src[test_misc.py:N]: ) -> tuple[torch.Tensor, torch.Tensor]:
     # src[test_misc.py:N-N]: ...
     t = rand_strided(size=(16, 1), stride=(1, 1), dtype=torch.float32, device=DEVICE)
-    i = 8192
+    i = 1
     s = 'foo'
     b = False
     f = 1.1
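
These expected-output changes follow directly from the compile_environment.py fix above: the generated repro script now recreates the unbacked int with the value originally passed to the kernel (i = 1) instead of the 8192 default hint.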

test/test_specialize.expected

Lines changed: 32 additions & 0 deletions

@@ -335,6 +335,38 @@ def fn(x: torch.Tensor, *, _launcher=_default_launcher):
     # src[test_specialize.py:N]: return out
     return out

+--- assertExpectedJournal(TestSpecialize.test_specialize_tuple_element)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_foo(x, out, _BLOCK_SIZE_0: tl.constexpr):
+    # src[test_specialize.py:N]: for x_tile in hl.tile([x.shape[0]]):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    # src[test_specialize.py:N]: out[x_tile] = x[x_tile] + (1 << (32 - val))
+    load = tl.load(x + indices_0 * 1, None)
+    v_0 = tl.full([], 65536, tl.int32)
+    v_1 = load + v_0
+    tl.store(out + indices_0 * 1, v_1, None)
+
+def foo(x: torch.Tensor, bitshift: tuple[int, int], *, _launcher=_default_launcher):
+    # src[test_specialize.py:N]: out = x.new_empty(x.shape)
+    out = x.new_empty(x.shape)
+    # src[test_specialize.py:N]: for x_tile in hl.tile([x.shape[0]]):
+    _BLOCK_SIZE_0 = 32
+    # src[test_specialize.py:N]: for x_tile in hl.tile([x.shape[0]]):
+    # src[test_specialize.py:N]: # compute_val equivalent: 1 << (32 - val)
+    # src[test_specialize.py:N]: out[x_tile] = x[x_tile] + (1 << (32 - val))
+    _launcher(_helion_foo, (triton.cdiv(64, _BLOCK_SIZE_0),), x, out, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
+    # src[test_specialize.py:N]: return out
+    return out
+
 --- assertExpectedJournal(TestSpecialize.test_sqrt_does_not_specialize)
 from __future__ import annotations

test/test_specialize.py

Lines changed: 21 additions & 0 deletions

@@ -305,6 +305,27 @@ def fn(
         )
         self.assertExpectedJournal(code)

+    def test_specialize_tuple_element(self):
+        """Test that hl.specialize works correctly with tuple elements."""
+
+        @helion.kernel(config=helion.Config(block_sizes=[32]))
+        def foo(x: torch.Tensor, bitshift: tuple[int, int]) -> torch.Tensor:
+            out = x.new_empty(x.shape)
+            val = hl.specialize(bitshift[0])
+            for x_tile in hl.tile([x.shape[0]]):
+                # compute_val equivalent: 1 << (32 - val)
+                out[x_tile] = x[x_tile] + (1 << (32 - val))
+            return out
+
+        x = torch.ones(64, dtype=torch.int32, device=DEVICE)
+        code, result = code_and_output(foo, (x, (16, 16)))
+        # 1 << (32 - 16) = 1 << 16 = 65536
+        expected = x + 65536
+        torch.testing.assert_close(result, expected)
+        # Verify that 65536 appears in the generated code as a constant
+        self.assertIn("65536", code)
+        self.assertExpectedJournal(code)
+

 if __name__ == "__main__":
     unittest.main()
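
A hypothetical usage note on the new test (the second call below is an assumption for illustration, not part of the commit): since hl.specialize bakes the tuple element's value into the generated Triton code as a constant, calling foo with a different bitshift should presumably produce a fresh specialization with a different embedded constant.

x = torch.ones(64, dtype=torch.int32, device=DEVICE)
out = foo(x, (16, 16))  # specialized kernel embeds 1 << (32 - 16) == 65536
out2 = foo(x, (8, 8))   # presumably re-specializes, embedding 1 << (32 - 8) == 16777216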
