pytorch
diff --git a/helion/_compiler/device_function.py b/helion/_compiler/device_function.py
Lines changed: 7 additions & 0 deletions
diff --git a/helion/_compiler/program_id.py b/helion/_compiler/program_id.py
Lines changed: 12 additions & 4 deletions
diff --git a/helion/autotuner/config_spec.py b/helion/autotuner/config_spec.py
Lines changed: 104 additions & 11 deletions
diff --git a/helion/runtime/config.py b/helion/runtime/config.py
Lines changed: 25 additions & 0 deletions
@@ -693,6 +693,13 @@ def codegen_function_call(self) -> ast.AST:
         for key in ("waves_per_eu", "matrix_instr_nonkdim"):
             if key in self.config:
                 args.append(f"{key}={self.config[key]}")
+        # Only pass maxnreg if it's set to a non-None value and not on AMD
+        if (
+            "maxnreg" in self.config
+            and self.config["maxnreg"] is not None
+            and torch.version.hip is None
+        ):
+            args.append(f"maxnreg={self.config['maxnreg']}")
         pid = self.pid
         assert pid is not None
         # TODO(jansel): we should run CSE this statement
 
@@ -583,6 +583,14 @@ def __init__(self, is_blocked: bool = False) -> None:
         device_function = DeviceFunction.current()
         self.virtual_pid_var: str = device_function.new_var("virtual_pid")
         self.total_pids_var: str = device_function.new_var("total_pids")
+        # Get num_sm_multiplier from config for multi-occupancy support
+        # pyrefly: ignore [bad-assignment]
+        self.num_sm_multiplier: int = device_function.config.get("num_sm_multiplier", 1)
+        # Compute grid size expression based on multiplier
+        if self.num_sm_multiplier == 1:
+            self.grid_size_expr: str = NUM_SM_VAR
+        else:
+            self.grid_size_expr = f"({NUM_SM_VAR} * {self.num_sm_multiplier})"
         # Generate variables and range expression based on strategy type
         if self.is_blocked:
             self.block_size_var: str = device_function.new_var("block_size")
@@ -596,7 +604,7 @@ def __init__(self, is_blocked: bool = False) -> None:
             self.range_kwargs: dict[str, str] = {
                 "begin": typed_program_id(0),
                 "end": self.total_pids_var,
-                "step": NUM_SM_VAR,
+                "step": self.grid_size_expr,
             }
         if device_function.constexpr_arg(NUM_SM_VAR):
             reserved_sms = CompileEnvironment.current().settings.persistent_reserved_sms
@@ -619,8 +627,8 @@ def get_device_str(self) -> str:
         return f"torch.{device!r}"
 
     def codegen_grid(self) -> ast.AST:
-        # Use num_sms for persistent kernels
-        return expr_from_string(f"({NUM_SM_VAR},)")
+        """Build the launch-grid expression: a 1-tuple holding ``self.grid_size_expr``."""
+        # Use num_sms * multiplier for persistent kernels (multi-occupancy)
+        return expr_from_string(f"({self.grid_size_expr},)")
 
     def setup_persistent_kernel(
         self, device_function: DeviceFunction, total_pids_expr: str | None = None
@@ -641,7 +649,7 @@ def setup_persistent_kernel(
                 assignments = [
                     (
                         self.block_size_var,
-                        f"tl.cdiv({self.total_pids_var}, {NUM_SM_VAR})",
+                        f"tl.cdiv({self.total_pids_var}, {self.grid_size_expr})",
                     ),
                     (
                         self.start_pid_var,
 
@@ -6,6 +6,7 @@
 from typing import TYPE_CHECKING
 from typing import cast
 
+import torch
 from torch._inductor.runtime.runtime_utils import next_power_of_2
 
 from .._compat import supports_amd_cdna_tunables
@@ -52,12 +53,21 @@
         "num_warps",
         "num_stages",
         "pid_type",
+        "num_sm_multiplier",
+        "maxnreg",
         "indexing",
         "load_eviction_policies",
         *AMD_CDNA_TUNABLES,
     ]
 )
 VALID_PID_TYPES = ("flat", "xyz", "persistent_blocked", "persistent_interleaved")
+# Bounds and default for the "num_sm_multiplier" config key; normalize() accepts
+# only powers of two within [MIN_NUM_SM_MULTIPLIER, MAX_NUM_SM_MULTIPLIER].
+MIN_NUM_SM_MULTIPLIER = 1
+MAX_NUM_SM_MULTIPLIER = 128
+DEFAULT_NUM_SM_MULTIPLIER = 1
+# maxnreg values: None means no limit, otherwise limit to this many registers per thread
+# Lower values allow higher occupancy but may hurt performance for register-heavy kernels
+VALID_MAXNREG = (None, 32, 64, 128, 256)
+DEFAULT_MAXNREG = None
 VALID_EVICTION_POLICIES = ("", "first", "last")
 VALID_WAVES_PER_EU = (1, 2, 3, 4)
 VALID_MATRIX_INSTR_NONKDIM = (0, 16, 32)
@@ -158,10 +168,18 @@ def disallow_pid_type(self, pid_type: PidTypeLiteral) -> None:
         )
         assert self.allowed_pid_types
 
-    def normalize(self, config: helion.Config | dict[str, object]) -> None:
-        """Normalize the config to match the block_sizes and validate the config."""
+    def normalize(
+        self, config: helion.Config | dict[str, object], *, _fix_invalid: bool = False
+    ) -> None:
+        """Normalize the config to match the block_sizes and validate the config.
+
+        Args:
+            config: The config to normalize (modified in place).
+            _fix_invalid: If True, silently fix invalid combinations instead of raising
+                errors. Used internally during autotuning config generation.
+        """
         if isinstance(config, helion.Config):
-            self.normalize(config.config)
+            self.normalize(config.config, _fix_invalid=_fix_invalid)
             return
 
         for name in (
@@ -250,19 +268,84 @@ def normalize(self, config: helion.Config | dict[str, object]) -> None:
             elif key in config:
                 raise InvalidConfig(f"{key} is not supported on this target hardware")
 
-        # TODO(jansel): include num_ctas and max_nreg
+        if "pid_type" in config:
+            if config["pid_type"] not in VALID_PID_TYPES:
+                raise InvalidConfig(
+                    f"Invalid value for 'pid_type': {config['pid_type']!r} must be one of {list(VALID_PID_TYPES)!r}"
+                )
+        else:
+            config["pid_type"] = VALID_PID_TYPES[0]
+
+        # Validate num_sm_multiplier is a power of two in range
+        if "num_sm_multiplier" in config:
+            val = config["num_sm_multiplier"]
+            if (
+                not isinstance(val, int)
+                or val < MIN_NUM_SM_MULTIPLIER
+                or val > MAX_NUM_SM_MULTIPLIER
+                or (val & (val - 1)) != 0  # not a power of two
+            ):
+                raise InvalidConfig(
+                    f"Invalid value for 'num_sm_multiplier': {val!r} must be a power of two between {MIN_NUM_SM_MULTIPLIER} and {MAX_NUM_SM_MULTIPLIER}"
+                )
+        else:
+            config["num_sm_multiplier"] = DEFAULT_NUM_SM_MULTIPLIER
 
-        for name, values in (("pid_type", VALID_PID_TYPES),):
-            if name in config:
-                if config[name] not in values:
+        # Only validate maxnreg on non-AMD devices (not supported on AMD)
+        if torch.version.hip is None:
+            if "maxnreg" in config:
+                if config["maxnreg"] not in VALID_MAXNREG:
                     raise InvalidConfig(
-                        f"Invalid value for {name!r}: {config[name]!r} must be one of {[*values]!r}"
+                        f"Invalid value for 'maxnreg': {config['maxnreg']!r} must be one of {list(VALID_MAXNREG)!r}"
                     )
             else:
-                config[name] = values[0]
+                config["maxnreg"] = VALID_MAXNREG[0]
+        else:
+            # Remove maxnreg on AMD if present
+            config.pop("maxnreg", None)
 
-        # Set default values for grid indices when pid_type is not persistent
+        # Handle num_sm_multiplier and maxnreg for non-persistent pid_types
+        # These options only make sense for persistent kernels
         pid_type = config["pid_type"]
+        if pid_type in ("flat", "xyz"):
+            # Handle num_sm_multiplier
+            num_sm_multiplier = config.get(
+                "num_sm_multiplier", DEFAULT_NUM_SM_MULTIPLIER
+            )
+            if num_sm_multiplier != DEFAULT_NUM_SM_MULTIPLIER:
+                if _fix_invalid:
+                    # Silently fix during autotuning config generation
+                    config.pop("num_sm_multiplier", None)
+                else:
+                    # Raise error for user-specified invalid combinations
+                    raise InvalidConfig(
+                        f"num_sm_multiplier={num_sm_multiplier} can only be used with persistent "
+                        f"pid_type ('persistent_blocked' or 'persistent_interleaved'), "
+                        f"got pid_type={pid_type!r}"
+                    )
+            else:
+                # Remove default value from config
+                config.pop("num_sm_multiplier", None)
+
+            # Handle maxnreg - only makes sense for persistent kernels (and only on non-AMD)
+            if torch.version.hip is None:
+                maxnreg = config.get("maxnreg", DEFAULT_MAXNREG)
+                if maxnreg != DEFAULT_MAXNREG:
+                    if _fix_invalid:
+                        # Silently fix during autotuning config generation
+                        config.pop("maxnreg", None)
+                    else:
+                        # Raise error for user-specified invalid combinations
+                        raise InvalidConfig(
+                            f"maxnreg={maxnreg} can only be used with persistent "
+                            f"pid_type ('persistent_blocked' or 'persistent_interleaved'), "
+                            f"got pid_type={pid_type!r}"
+                        )
+                else:
+                    # Remove default value from config
+                    config.pop("maxnreg", None)
+
+        # Set default values for grid indices when pid_type is not persistent
         if pid_type in ("flat", "xyz") and self.grid_block_ids:
             for name, mapping in (
                 ("range_unroll_factors", self.range_unroll_factors),
@@ -322,8 +405,18 @@ def flat_config(self, fn: Callable[[ConfigSpecFragment], object]) -> helion.Conf
             "num_stages": fn(IntegerFragment(1, 8, DEFAULT_NUM_STAGES)),
             "indexing": fn(self.indexing),
             "pid_type": fn(EnumFragment(self.allowed_pid_types)),
+            "num_sm_multiplier": fn(
+                PowerOfTwoFragment(
+                    MIN_NUM_SM_MULTIPLIER,
+                    MAX_NUM_SM_MULTIPLIER,
+                    DEFAULT_NUM_SM_MULTIPLIER,
+                )
+            ),
             "load_eviction_policies": fn(self.load_eviction_policies),
         }
+        # Only include maxnreg on non-AMD devices (not supported on AMD)
+        if torch.version.hip is None:
+            config["maxnreg"] = fn(EnumFragment(VALID_MAXNREG))
         # Add tunable parameters
         config.update(
             {key: fn(fragment) for key, fragment in self.user_defined_tunables.items()}
@@ -345,7 +438,7 @@ def flat_config(self, fn: Callable[[ConfigSpecFragment], object]) -> helion.Conf
         ):
             if not config.get(name):
                 config.pop(name, None)
-        self.normalize(config)
+        self.normalize(config, _fix_invalid=True)
         # pyrefly: ignore [bad-argument-type]
         return helion.Config(**config)
 
 
@@ -12,6 +12,8 @@
 IndexingLiteral = Literal["pointer", "tensor_descriptor", "block_ptr"]
 PidTypeLiteral = Literal["flat", "xyz", "persistent_blocked", "persistent_interleaved"]
 EvictionPolicyLiteral = Literal["", "first", "last"]
+# Must list every value ConfigSpec.normalize accepts for "num_sm_multiplier":
+# powers of two from MIN_NUM_SM_MULTIPLIER (1) to MAX_NUM_SM_MULTIPLIER (128).
+NumSmMultiplierLiteral = Literal[1, 2, 4, 8, 16, 32, 64, 128]
+# None means no register limit; the integers mirror VALID_MAXNREG in config_spec.py.
+MaxnregLiteral = Literal[32, 64, 128, 256] | None
 
 
 class Config(Mapping[str, object]):
@@ -36,6 +38,8 @@ def __init__(
         num_warps: int | None = None,
         num_stages: int | None = None,
         pid_type: PidTypeLiteral | None = None,
+        num_sm_multiplier: NumSmMultiplierLiteral | None = None,
+        maxnreg: MaxnregLiteral | None = None,
         indexing: IndexingLiteral | list[IndexingLiteral] | None = None,
         # For user-defined properties
         **kwargs: object,
@@ -58,6 +62,11 @@ def __init__(
             num_warps: Number of warps per block.
             num_stages: Number of stages for software pipelining.
             pid_type: Program ID type strategy ("flat", "xyz", "persistent_blocked", "persistent_interleaved").
+            num_sm_multiplier: Multiplier for the number of SMs in persistent kernels (a power of two from 1 to 128).
+                Controls multi-occupancy by launching N * num_sms thread blocks instead of just num_sms.
+            maxnreg: Maximum number of registers per thread (None, 32, 64, 128, 256).
+                Lower values allow higher occupancy but may hurt performance. Used with persistent kernels
+                to ensure multi-occupancy can be achieved.
             indexing: Indexing strategy for load and store operations. Can be:
                 - A single strategy string (all loads/stores use this strategy):
                   indexing="block_ptr"  # backward compatible
@@ -85,6 +94,8 @@ def __init__(
             "num_stages": num_stages,
             "indexing": indexing,
             "pid_type": pid_type,
+            "num_sm_multiplier": num_sm_multiplier,
+            "maxnreg": maxnreg,
         }
         for key, value in core_props.items():
             if value is not None:
@@ -182,6 +193,20 @@ def l2_groupings(self) -> list[int]:
     def pid_type(self) -> PidTypeLiteral:
         return cast("PidTypeLiteral", self.config.get("pid_type", "flat"))
 
+    @property
+    def num_sm_multiplier(self) -> int:
+        """Return the stored ``num_sm_multiplier``, or the default when the key is absent."""
+        # Function-scope import; presumably avoids an import cycle with the
+        # autotuner package -- TODO confirm.
+        from ..autotuner.config_spec import DEFAULT_NUM_SM_MULTIPLIER
+
+        multiplier = self.config.get("num_sm_multiplier", DEFAULT_NUM_SM_MULTIPLIER)
+        return cast("int", multiplier)
+
+    @property
+    def maxnreg(self) -> int | None:
+        """Return the stored ``maxnreg`` value, or ``DEFAULT_MAXNREG`` when the key is absent."""
+        # Function-scope import; presumably avoids an import cycle with the
+        # autotuner package -- TODO confirm.
+        from ..autotuner.config_spec import DEFAULT_MAXNREG
+
+        register_cap = self.config.get("maxnreg", DEFAULT_MAXNREG)
+        return cast("int | None", register_cap)
+
     @property
     def range_unroll_factors(self) -> list[int]:
         return cast("list[int]", self.config.get("range_unroll_factors", []))