@@ -84,9 +84,45 @@ def _(state: CodegenState) -> ast.AST:
     )


+@_decorators.ref(store)
+def _(
+    tensor: torch.Tensor,
+    index: list[object],
+    value: torch.Tensor | torch.SymInt | float,
+    extra_mask: torch.Tensor | None = None,
+) -> None:
+    # Convert index list to tuple for tensor indexing
+    index_tuple = tuple(index)
+
+    # Apply extra mask if provided
+    if extra_mask is not None:
+        # Only store where the mask is True
+        if isinstance(value, torch.Tensor):
+            tensor[index_tuple] = torch.where(extra_mask, value, tensor[index_tuple])  # pyright: ignore[reportArgumentType]
+        else:
+            # For scalar values, we need to create a tensor of the right shape
+            current = tensor[index_tuple]  # pyright: ignore[reportArgumentType]
+            # Cast value to a proper numeric type for full_like
+            if isinstance(value, torch.SymInt):
+                numeric_value = int(value)
+            else:
+                numeric_value = value
+            tensor[index_tuple] = torch.where(  # pyright: ignore[reportArgumentType]
+                extra_mask, torch.full_like(current, numeric_value), current
+            )
+    else:
+        # Handle SymInt case for assignment
+        if isinstance(value, torch.SymInt):
+            tensor[index_tuple] = int(value)  # pyright: ignore[reportArgumentType]
+        else:
+            tensor[index_tuple] = value  # pyright: ignore[reportArgumentType]
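+
+
+# A minimal usage sketch for the masked-store ref above (illustrative only;
+# `_ref_store` is a stand-in name for the anonymous function registered above):
+#
+#     t = torch.zeros(4)
+#     _ref_store(t, [slice(0, 2)], 1.0, extra_mask=torch.tensor([True, False]))
+#     # t is now tensor([1., 0., 0., 0.]): the masked-off element is untouched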
+
+
 @_decorators.api(tiles_as_sizes=True, allow_host_tensor=True)
 def load(
-    tensor: torch.Tensor, index: list[object], extra_mask: torch.Tensor | None = None
+    tensor: torch.Tensor,
+    index: list[object],
+    extra_mask: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """Load a value from a tensor using a list of indices.

@@ -129,6 +165,83 @@ def _(node: torch.fx.Node) -> int:
     return 0  # loads are always masked to 0


+@_decorators.ref(load)
+def _(
+    tensor: torch.Tensor,
+    index: list[object],
+    extra_mask: torch.Tensor | None = None,
+) -> torch.Tensor:
+    from .ref_tile import RefTile
+
+    if extra_mask is None:
+        return tensor[tuple(index)]  # pyright: ignore[reportArgumentType]
+
+    # Create zero result matching mask shape
+    result = torch.zeros(extra_mask.shape, dtype=tensor.dtype, device=tensor.device)
+
+    # Process indices: convert RefTiles and clamp tensor indices
+    orig_indices, safe_indices, is_tensor_mask = [], [], []
+    for i, idx in enumerate(index):
+        if isinstance(idx, RefTile):
+            idx = idx.index  # Convert RefTile to tensor
+
+        if isinstance(idx, torch.Tensor):
+            dim_size = tensor.shape[i] if i < len(tensor.shape) else tensor.numel()
+            orig_indices.append(idx)
+            safe_indices.append(torch.clamp(idx, 0, dim_size - 1))
+            is_tensor_mask.append(True)
+        else:
+            orig_indices.append(idx)
+            safe_indices.append(idx)
+            is_tensor_mask.append(False)
+
+    # Apply broadcasting if we have multiple tensor indices
+    tensor_positions = [i for i, is_tensor in enumerate(is_tensor_mask) if is_tensor]
+
+    if len(tensor_positions) > 1:
+        # Add unsqueeze operations for broadcasting
+        broadcast_indices = []
+        for i, (idx, is_tensor) in enumerate(
+            zip(safe_indices, is_tensor_mask, strict=False)
+        ):
+            if is_tensor:
+                new_idx = idx
+                # Add dimension for each other tensor index
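+                # e.g. row indices of shape (M,) and column indices of shape
+                # (N,) become (M, 1) and (1, N), which broadcast to (M, N)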
+                for j, other_pos in enumerate(tensor_positions):
+                    if other_pos != i:
+                        new_idx = new_idx.unsqueeze(j if other_pos < i else -1)
+                broadcast_indices.append(new_idx)
+            else:
+                broadcast_indices.append(idx)
+        values = tensor[tuple(broadcast_indices)]
+    else:
+        values = tensor[tuple(safe_indices)]
+
+    # Build validity mask
+    valid_mask = extra_mask.clone()
+    for i, (orig_idx, is_tensor) in enumerate(
+        zip(orig_indices, is_tensor_mask, strict=False)
+    ):
+        if is_tensor:
+            dim_size = tensor.shape[i] if i < len(tensor.shape) else tensor.numel()
+            in_bounds = (orig_idx >= 0) & (orig_idx < dim_size)
+            # Broadcast to match mask shape by adding dimensions
+            # Count how many tensor indices come before and after this one
+            n_before = sum(1 for j in range(i) if is_tensor_mask[j])
+            n_after = sum(
+                1 for j in range(i + 1, len(is_tensor_mask)) if is_tensor_mask[j]
+            )
+
+            # Add dimensions: n_after dimensions at the end, n_before at the beginning
+            for _ in range(n_after):
+                in_bounds = in_bounds.unsqueeze(-1)
+            for _ in range(n_before):
+                in_bounds = in_bounds.unsqueeze(0)
+            valid_mask = valid_mask & in_bounds
+
+    return torch.where(valid_mask, values, result)
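+
+
+# A usage sketch for the masked-load ref above (illustrative only; `_ref_load`
+# is a stand-in name for the anonymous function registered above):
+#
+#     t = torch.arange(4.0)
+#     idx = torch.tensor([2, 99])  # 99 is out of bounds
+#     _ref_load(t, [idx], extra_mask=torch.tensor([True, True]))
+#     # -> tensor([2., 0.]): the out-of-bounds lane is masked to zero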
+
+
 @has_side_effect
 @_decorators.api(allow_host_tensor=True)
 def atomic_add(
@@ -210,6 +323,59 @@ def _(
     return None


+@_decorators.ref(atomic_add)
+def _(
+    target: torch.Tensor,
+    index: list[object],
+    value: torch.Tensor | float,
+    sem: str = "relaxed",
+) -> None:
+    """Reference implementation of atomic_add for interpret mode."""
+    from .. import exc
+    from .ref_tile import RefTile
+
+    # Validate sem parameter
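+    # (sem is checked for parity with the compiled path, but the ordinary
+    # eager `+=` below carries no special memory-ordering semantics)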
338+ if sem not in ["relaxed" , "acquire" , "release" , "acq_rel" ]:
339+ raise exc .InternalError (
340+ ValueError (
341+ f"Invalid memory semantic '{ sem } '. Valid options are: relaxed, acquire, release, acq_rel"
342+ )
343+ )
344+
345+ # Convert indices to proper format
346+ processed_index = []
347+ for idx in index :
348+ if isinstance (idx , RefTile ):
349+ processed_index .append (idx ._slice )
350+ elif isinstance (idx , torch .Tensor ) and idx .numel () == 1 :
351+ processed_index .append (int (idx .item ()))
352+ else :
353+ processed_index .append (idx )
354+
355+ # Find tensor indices that need element-wise processing
356+ tensor_indices = [
357+ (i , idx )
358+ for i , idx in enumerate (processed_index )
359+ if isinstance (idx , torch .Tensor ) and idx .numel () > 1
360+ ]
361+
362+ if tensor_indices :
363+ # Element-wise processing for tensor indices
364+ i , tensor_idx = tensor_indices [0 ] # Handle first tensor index
365+ for j , elem in enumerate (tensor_idx ):
366+ new_index = processed_index .copy ()
367+ new_index [i ] = int (elem .item ())
368+ val = (
369+ value [j ]
370+ if isinstance (value , torch .Tensor ) and value .numel () > 1
371+ else value
372+ )
373+ target [tuple (new_index )] += val
374+ else :
375+ # Direct atomic add
376+ target [tuple (processed_index )] += value
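+
+
+# A usage sketch for the atomic_add ref above (illustrative only;
+# `_ref_atomic_add` is a stand-in name for the anonymous function registered
+# above). Duplicate indices each contribute their own add, unlike a plain
+# `t[idx] += v`, which drops duplicate contributions:
+#
+#     t = torch.zeros(3)
+#     _ref_atomic_add(t, [torch.tensor([0, 0, 2])], torch.tensor([1.0, 1.0, 5.0]))
+#     # t -> tensor([2., 0., 5.])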
+
+
 @_decorators.codegen(atomic_add)
 def _(state: CodegenState) -> ast.AST:
     target = state.proxy_arg(0)