Skip to content

Commit 2a2d2b2

Browse files
authored
Support hl.barrier for mega-kernels (#1151)
1 parent 6ba8ef9 commit 2a2d2b2

22 files changed

+1354
-67
lines changed

examples/split_k_barrier.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
"""
2+
Helion Split-K Matmul with Barrier Example
3+
==========================================
4+
This example demonstrates a two-stage split-K matrix multiplication using
5+
hl.barrier() for grid-wide synchronization. The barrier approach ensures
6+
deterministic results as opposed to atomic_add approaches.
7+
8+
Stage 1: Split K dimension into chunks and compute partial products
9+
Barrier: Grid-wide synchronization to ensure all partials are written
10+
Stage 2: Reduce partials across the split dimension
11+
"""
12+
13+
from __future__ import annotations
14+
15+
import torch
16+
17+
import helion
18+
from helion._testing import DEVICE
19+
from helion._testing import run_example
20+
from helion.autotuner import PowerOfTwoFragment
21+
import helion.language as hl
22+
23+
24+
@helion.kernel(static_shapes=True, dot_precision="ieee")
def split_k_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """
    Two-stage split-K matmul using hl.barrier(). The barrier approach
    gives deterministic results as opposed to the atomic_add approach.

    Stage 1:
        - Split K into `split_k` contiguous chunks.
        - Each chunk computes a partial [tile_m, tile_n] product into its own slice of `tmp`.

    Barrier:
        - Grid-wide barrier to ensure all partials are written before reduction.

    Stage 2:
        - Reduce partials across the split dimension and write `out`.

    Shapes:
        a:   [M, K]
        b:   [K, N]
        tmp: [M, N, split_k]
        out: [M, N]

    Notes:
        - Static shapes keep codegen simpler.
        - `split_k` is fixed for clarity; autotuning could choose it instead.
    """
    m, k = a.shape
    _, n = b.shape
    # Tunable number of K-chunks; the fragment constrains it to powers of two in [16, 512].
    split_k = hl.register_tunable("split_k", PowerOfTwoFragment(16, 512, 64))
    # Each chunk covers ceil(k / split_k) elements, rounded up to a power of two for codegen.
    block_k = helion.next_power_of_2(helion.cdiv(k, split_k))
    # Scratch buffer holding one partial product per K-chunk.
    tmp = torch.zeros((m, n, split_k), device=a.device, dtype=a.dtype)
    out = torch.empty((m, n), device=a.device, dtype=a.dtype)

    # Stage 1: each (tile_m, tile_n, tile_k_outer) program computes a partial product
    # over its K-chunk and stores it in the chunk's slice of `tmp`.
    for tile_m, tile_n, tile_k_outer in hl.tile(
        [m, n, k], block_size=[None, None, block_k]
    ):
        acc = hl.zeros([tile_m, tile_n], device=a.device, dtype=a.dtype)
        for tile_k_inner in hl.tile(tile_k_outer.begin, tile_k_outer.end):
            acc = torch.addmm(acc, a[tile_m, tile_k_inner], b[tile_k_inner, tile_n])
        # this could be a hl.atomic_add to avoid the barrier, but that would be non-deterministic
        tmp[tile_m, tile_n, tile_k_outer.id] = acc

    # Grid-wide synchronization: all partials must be visible before the reduction reads them.
    hl.barrier()

    # Stage 2: sum the split_k partials for each output tile.
    for tile_m, tile_n in hl.tile([m, n]):
        out[tile_m, tile_n] = torch.sum(tmp[tile_m, tile_n, :], dim=-1)

    return out
72+
73+
74+
def check(m: int, k: int, n: int) -> None:
    """Validate split_k_matmul against torch.matmul on random [m, k] @ [k, n] inputs."""
    lhs = torch.randn(m, k, device=DEVICE)
    # The transposed view gives the right operand shape [k, n] with a
    # non-contiguous layout, exercising strided access in the kernel.
    rhs = torch.randn(n, k, device=DEVICE).T
    # Loose tolerance: long K reductions accumulate floating-point error.
    run_example(split_k_matmul, torch.matmul, args=(lhs, rhs), atol=5e-1)
84+
85+
86+
def main() -> None:
    """Seed the RNG for reproducibility, then exercise one problem size."""
    torch.manual_seed(0)
    m, k, n = 16, 4096, 16
    check(m, k, n)
89+
90+
91+
# Run the example when executed as a script.
if __name__ == "__main__":
    main()

helion/_compiler/compile_environment.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727

2828
from .. import exc
2929
from ..language.constexpr import ConstExpr
30-
from .loop_dependency_checker import LoopDependencyChecker
3130
from .source_location import SourceLocation
3231
from .source_location import current_location
3332
from .variable_origin import BlockSizeOrigin
@@ -118,19 +117,19 @@ def __init__(
118117
self.block_sizes: list[BlockSizeInfo] = []
119118
self.debug_shape_renames: dict[sympy.Expr, sympy.Expr] = {}
120119
self.config_spec = ConfigSpec()
121-
if settings.autotune_force_persistent:
122-
for pid_type in ("flat", "xyz"):
123-
self.config_spec.disallow_pid_type(pid_type)
124120
self.kernel_tensor_sizes: dict[tuple[sympy.Expr, ...], int] = (
125121
collections.Counter()
126122
)
127123
self.specialized_vars: set[sympy.Symbol] = set()
128124
self.specialized_strides: set[tuple[str, int]] = set()
129-
self.loop_dependency_checker = LoopDependencyChecker()
130125
self._symint_cache: dict[object, torch.SymInt] = {}
131126
self.device_load_count = (
132127
0 # Track number of loads in all device code for eviction policy tuning
133128
)
129+
if settings.autotune_force_persistent:
130+
for pid_type in ("flat", "xyz"):
131+
self.config_spec.disallow_pid_type(pid_type)
132+
self.has_barrier: bool = False
134133

135134
def specialize_expr(self, expr: sympy.Expr) -> sympy.Expr:
136135
"""Substitute any specialized vars with their concrete values."""
@@ -561,7 +560,6 @@ def __enter__(self) -> Self:
561560
assert getattr(tls, "env", None) is None, "CompileEnvironment already active"
562561
self.fake_mode.__enter__()
563562
tls.env = self
564-
self.loop_dependency_checker = LoopDependencyChecker()
565563
return self
566564

567565
def __exit__(

helion/_compiler/device_function.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,12 @@ def __init__(self, val: int) -> None:
198198

199199

200200
class DeviceFunction:
201-
def __init__(self, name: str, config: Config, codegen: GenerateAST) -> None:
201+
def __init__(
202+
self,
203+
name: str,
204+
config: Config,
205+
codegen: GenerateAST,
206+
) -> None:
202207
super().__init__()
203208
self.name = name
204209
self.config = config
@@ -673,6 +678,11 @@ def codegen_function_call(self) -> ast.AST:
673678
[
674679
f"num_warps={num_warps}",
675680
f"num_stages={self.config.num_stages}",
681+
*(
682+
["launch_cooperative_grid=True"]
683+
if CompileEnvironment.current().has_barrier
684+
else []
685+
),
676686
]
677687
+ [
678688
f"{x.removeprefix('_triton_config_')}={self.config[x]}"

helion/_compiler/device_ir.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
from .inductor_lowering import CodegenState
4949
from .inductor_lowering import codegen_call_with_graph
5050
from .inductor_lowering import prepare_graph_lowerings
51+
from .loop_dependency_checker import LoopDependencyChecker
5152
from .matmul_utils import tensor_matmul_replacement
5253
from .matmul_utils import torch_matmul_replacement
5354
from .node_masking import remove_unnecessary_masking
@@ -223,6 +224,8 @@ def codegen(self, state: CodegenState) -> list[object]:
223224

224225

225226
class RootGraphInfo(GraphInfo):
227+
phase_index: int = 0
228+
226229
@property
227230
def name(self) -> str:
228231
return f"root_graph_{self.graph_id}"
@@ -410,12 +413,22 @@ class RolledReductionInfo(NamedTuple):
410413
can_be_rolled_by_caller: bool
411414

412415

416+
@dataclasses.dataclass
417+
class KernelPhase:
418+
roots: list[int] # store root indices
419+
root_nodes: list[ast.For]
420+
loop_dependency_checker: LoopDependencyChecker = dataclasses.field(
421+
default_factory=LoopDependencyChecker
422+
)
423+
424+
413425
class DeviceIR:
414426
def __init__(self) -> None:
415427
super().__init__()
416428
self.graphs: list[GraphInfo] = []
417429
self.root_ids: list[int] = []
418430
self.rolled_reductions: list[RolledReductionInfo] = []
431+
self.phases: list[KernelPhase] = []
419432
self.grid_block_ids: list[list[int]] = []
420433

421434
def get_root(self, config: Config, graph_id: int) -> torch.fx.Graph:
@@ -469,6 +482,11 @@ def add_reduction_loop_graph(
469482
def add_root_graph(self, graph: torch.fx.Graph) -> None:
470483
self.root_ids.append(self.add_graph(graph, graph_info_cls=RootGraphInfo))
471484

485+
def phase_for_root(self, root_id: int) -> int:
486+
graph_info = self.graphs[self.root_ids[root_id]]
487+
assert isinstance(graph_info, RootGraphInfo)
488+
return graph_info.phase_index
489+
472490
def build_rolled_reductions(self) -> None:
473491
env = CompileEnvironment.current()
474492
rdims = [bs for bs in env.block_sizes if bs.reduction]
@@ -1354,6 +1372,10 @@ class WalkHostAST(NodeVisitor):
13541372
def __init__(self, device_ir: DeviceIR) -> None:
13551373
super().__init__()
13561374
self.device_ir = device_ir
1375+
self.root_index = 0
1376+
self.current_phase_roots: list[int] = []
1377+
self.phases: list[KernelPhase] = []
1378+
self.root_nodes: list[ast.For] = []
13571379

13581380
def visit_For(self, node: ast.For) -> None:
13591381
assert isinstance(node, ExtendedAST)
@@ -1372,9 +1394,44 @@ def visit_For(self, node: ast.For) -> None:
13721394
# pyrefly: ignore [missing-attribute]
13731395
block_ids = [inner.block_id]
13741396
self.device_ir.grid_block_ids.append(block_ids)
1397+
# store root index (position) not graph id
1398+
self.root_nodes.append(node)
1399+
self.current_phase_roots.append(len(self.device_ir.root_ids) - 1)
1400+
self.root_index += 1
13751401
else:
13761402
self.generic_visit(node)
13771403

1404+
def visit_Expr(self, node: ast.Expr) -> None:
1405+
# Record barrier placement between top-level loops.
1406+
from .type_propagation import BarrierResultType
1407+
1408+
assert isinstance(node, ExtendedAST)
1409+
assert isinstance(node.value, ExtendedAST)
1410+
is_barrier = isinstance(node.value._type_info, BarrierResultType)
1411+
1412+
if is_barrier:
1413+
if self.root_index == 0 or not self.current_phase_roots:
1414+
raise exc.BarrierOnlyAllowedAtTopLevel
1415+
self.phases.append(
1416+
KernelPhase(
1417+
roots=self.current_phase_roots,
1418+
root_nodes=[self.root_nodes[r] for r in self.current_phase_roots],
1419+
)
1420+
)
1421+
self.current_phase_roots = []
1422+
return
1423+
self.generic_visit(node)
1424+
1425+
def flush_phases(self) -> None:
1426+
if self.current_phase_roots:
1427+
self.phases.append(
1428+
KernelPhase(
1429+
roots=self.current_phase_roots,
1430+
root_nodes=[self.root_nodes[r] for r in self.current_phase_roots],
1431+
)
1432+
)
1433+
self.current_phase_roots = []
1434+
13781435

13791436
def _count_device_loads_and_stores(device_ir: DeviceIR) -> tuple[int, int, int]:
13801437
"""Count the number of load and store operations in device code for autotuning.
@@ -1466,6 +1523,18 @@ def lower_to_device_ir(func: HostFunction) -> DeviceIR:
14661523
visitor = WalkHostAST(device_ir)
14671524
for stmt in func.body:
14681525
visitor.visit(stmt)
1526+
visitor.flush_phases()
1527+
device_ir.phases = visitor.phases
1528+
# Run dependency checks once, per phase, so codegen does not redo it per-config.
1529+
for phase in device_ir.phases:
1530+
checker = phase.loop_dependency_checker
1531+
for loop_node in phase.root_nodes:
1532+
checker.register_loop(loop_node)
1533+
for phase_idx, phase in enumerate(device_ir.phases):
1534+
for ridx in phase.roots:
1535+
graph_info = device_ir.graphs[device_ir.root_ids[ridx]]
1536+
assert isinstance(graph_info, RootGraphInfo)
1537+
graph_info.phase_index = phase_idx
14691538
# If there are no top-level device loops, we cannot generate a valid kernel.
14701539
# Raise a friendly error instead of emitting an empty Triton function body.
14711540
if len(device_ir.root_ids) == 0:

helion/_compiler/generate_ast.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
from .. import exc
1212
from ..language._decorators import is_api_func
13+
from ..runtime.config import Config
1314
from .ast_extension import ExtendedAST
1415
from .ast_extension import LoopType
1516
from .ast_extension import NodeVisitor
@@ -24,6 +25,7 @@
2425
from .helper_function import CodegenInterface
2526
from .inductor_lowering import CodegenState
2627
from .inductor_lowering import codegen_call_with_graph
28+
from .loop_dependency_checker import LoopDependencyChecker
2729
from .program_id import ForEachProgramID
2830
from .tile_strategy import DeviceLoopState
2931
from .variable_origin import ArgumentOrigin
@@ -35,6 +37,7 @@
3537

3638
from ..runtime import Config
3739
from .host_function import HostFunction
40+
from .loop_dependency_checker import LoopDependencyChecker
3841
from .tile_strategy import DeviceLoopOrGridState
3942
from .type_propagation import TensorType
4043

@@ -56,7 +59,11 @@ def __init__(self, func: HostFunction, config: Config) -> None:
5659
self.next_else_block: list[ast.AST] | None = None
5760

5861
# Now create device function and initialize CodegenInterface
59-
self.device_function = DeviceFunction(f"_helion_{func.name}", config, self)
62+
self.device_function = DeviceFunction(
63+
f"_helion_{func.name}",
64+
config,
65+
self,
66+
)
6067
CodegenInterface.__init__(self, self.device_function)
6168

6269
def offset_var(self, block_idx: int) -> str:
@@ -70,6 +77,10 @@ def mask_var(self, block_idx: int) -> str | None:
7077
return loops[-1].strategy.mask_var(block_idx)
7178
return None
7279

80+
def _phase_checker(self, root_id: int) -> LoopDependencyChecker:
81+
phase_idx = self.host_function.device_ir.phase_for_root(root_id)
82+
return self.host_function.device_ir.phases[phase_idx].loop_dependency_checker
83+
7384
def add_statement(self, stmt: ast.AST | str | None) -> None:
7485
if stmt is None:
7586
return
@@ -227,17 +238,20 @@ def visit_For(self, node: ast.For) -> ast.AST | None:
227238
if node._loop_type == LoopType.GRID:
228239
assert not node.orelse
229240

241+
assert node._root_id is not None
242+
# Loop dependency checks were already run during lowering; phase checker kept for symmetry/debug.
243+
self._phase_checker(node._root_id)
244+
230245
if len(self.host_function.device_ir.root_ids) == 1:
231246
body = self.device_function.body
232247
else:
233248
assert len(self.host_function.device_ir.root_ids) > 1
234-
assert node._root_id is not None
235249
# Multiple top level for loops
236250

237251
if node._root_id == 0:
238252
self.device_function.set_pid(
239253
ForEachProgramID(
240-
self.device_function.new_var("pid_shared", dce=False)
254+
self.device_function.new_var("pid_shared", dce=False),
241255
)
242256
)
243257
self.device_function.body.extend(
@@ -310,6 +324,11 @@ def visit_For(self, node: ast.For) -> ast.AST | None:
310324
# This ensures block size and rdim vars are defined in the correct order
311325
self.device_function.flush_deferred_rdim_defs(self)
312326

327+
if isinstance(self.device_function.pid, ForEachProgramID):
328+
self.device_function.pid.case_phases.append(
329+
self.host_function.device_ir.phase_for_root(node._root_id)
330+
)
331+
313332
# If we are in a multi top level loop, for all loops except for the last one
314333
# emit ifthenelse blocks
315334
if node._root_id < len(self.host_function.device_ir.root_ids) - 1:
@@ -477,6 +496,9 @@ def generate_ast(
477496
func: HostFunction, config: Config, emit_repro_caller: bool
478497
) -> ast.AST:
479498
with func:
499+
if len(func.device_ir.phases) > 1:
500+
if not str(config.pid_type).startswith("persistent"):
501+
raise exc.BarrierRequiresPersistent(config.pid_type)
480502
codegen = GenerateAST(func, config)
481503
with codegen.device_function:
482504
for stmt in func.body:

0 commit comments

Comments
 (0)