Commit 5db4cdf

[Autotuner] Auto-checkpoint feature and ability to resume from checkpoint

1 parent 9cbfb30 commit 5db4cdf

File tree: 12 files changed, +1422 −130 lines changed

docs/api/settings.md

Lines changed: 8 additions & 0 deletions

@@ -197,6 +197,13 @@ def my_kernel(x: torch.Tensor) -> torch.Tensor:
 
 Users can still override individual ``autotune_*`` settings; explicit values win over the preset. Controlled by ``HELION_AUTOTUNE_EFFORT``.
 
+.. autoattribute:: Settings.autotune_checkpoint_id
+
+    Checkpoint ID for resuming autotuning from a previous checkpoint. When set, the autotuner attempts to load
+    state from a checkpoint file matching this ID, allowing long-running autotuning sessions to be interrupted
+    and resumed. The checkpoint ID contains a hash prefix that identifies the kernel, hardware, and input shapes.
+    If the hash doesn't match, the checkpoint is ignored and autotuning starts fresh with a warning message.
+    Controlled by ``HELION_AUTOTUNE_CHECKPOINT_ID``.
 
 ```
 

@@ -295,6 +302,7 @@ Built-in values for ``HELION_AUTOTUNER`` include ``"PatternSearch"``, ``"Differe
 | ``HELION_AUTOTUNE_PROGRESS_BAR`` | ``autotune_progress_bar`` | Enable or disable the progress bar UI during autotuning. |
 | ``HELION_AUTOTUNE_IGNORE_ERRORS`` | ``autotune_ignore_errors`` | Continue autotuning even when recoverable runtime errors occur. |
 | ``HELION_AUTOTUNE_CONFIG_OVERRIDES`` | ``autotune_config_overrides`` | Supply JSON forcing particular autotuner config key/value pairs. |
+| ``HELION_AUTOTUNE_CHECKPOINT_ID`` | ``autotune_checkpoint_id`` | Checkpoint ID for resuming autotuning from a previous checkpoint. |
 | ``HELION_CACHE_DIR`` | ``LocalAutotuneCache`` | Override the on-disk directory used for cached autotuning artifacts. |
 | ``HELION_SKIP_CACHE`` | ``LocalAutotuneCache`` | When set to ``1``, ignore cached autotuning entries and rerun searches. |
 | ``HELION_ASSERT_CACHE_HIT`` | ``AutotuneCacheBase`` | When set to ``1``, require a cache hit; raises ``CacheAssertionError`` on cache miss with detailed diagnostics. |
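Reviewer note: for readers wiring the new setting up in code rather than the environment, a minimal usage sketch follows. The commit's log message says the ID can also be passed as ``autotune_checkpoint_id="..."`` in the kernel settings; the kernel body and the placeholder ID below are illustrative assumptions, not part of this diff.

```python
import torch
import helion
import helion.language as hl

# Placeholder ID; in practice, copy the ID logged by a previous autotuning run.
@helion.kernel(autotune_checkpoint_id="a1b2c3d4-1706123456-e5f6g7h8")
def add_one(x: torch.Tensor) -> torch.Tensor:
    out = torch.empty_like(x)
    for tile in hl.tile(out.size()):
        out[tile] = x[tile] + 1
    return out
```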

docs/deployment_autotuning.md

Lines changed: 25 additions & 0 deletions

@@ -104,6 +104,31 @@ tuning time versus coverage, or try different search algorithms.
 need more reproducibility; see {doc}`api/settings`. Note this only
 affects which configs are tried, not the timing results.
 
+### Checkpointing Long-Running Autotuning
+
+For very long autotuning sessions, you can save and resume state using
+checkpoints. This is useful when tuning might be interrupted (e.g., preemptible
+instances) or when you want to continue tuning from a previous unfinished run.
+
+The simplest approach is to use the `HELION_AUTOTUNE_CHECKPOINT_ID` environment
+variable. When autotuning runs, it periodically saves checkpoints and logs the
+checkpoint ID. To resume, set this environment variable to the checkpoint ID
+from a previous run.
+
+```bash
+# First run - autotuning will log checkpoint IDs as it progresses:
+# "Checkpoint saved: .../autotuner_checkpoints/a1b2c3d4-1706123456-e5f6g7h8.checkpoint"
+# "To resume from this checkpoint, set HELION_AUTOTUNE_CHECKPOINT_ID=a1b2c3d4-1706123456-e5f6g7h8 ..."
+python run_kernel.py
+
+# If interrupted, resume from the last checkpoint:
+HELION_AUTOTUNE_CHECKPOINT_ID=a1b2c3d4-1706123456-e5f6g7h8 python run_kernel.py
+```
+
+The checkpoint ID contains a hash prefix that identifies the kernel, hardware,
+and input shapes. If the hash doesn't match, the checkpoint is ignored and autotuning
+starts fresh with a warning message.
+
 ## Deploy a Single Config
 
 If one configuration wins for every production call, bake it into the decorator:

helion/_testing.py

Lines changed: 22 additions & 0 deletions

@@ -10,13 +10,15 @@
 import operator
 import os
 from pathlib import Path
+import random
 import re
 import sys
 from typing import TYPE_CHECKING
 from typing import Callable
 from typing import Generator
 import unittest
 
+import numpy as np
 from packaging import version
 import pytest
 import torch

@@ -39,6 +41,26 @@
 from .runtime.kernel import Kernel
 
 
+def seed_rng(seed: int) -> None:
+    random.seed(seed)
+    np.random.seed(seed)  # noqa: NPY002
+    torch.manual_seed(seed)
+
+
+@contextlib.contextmanager
+def fork_rng() -> Generator[None, None, None]:
+    """Context manager that forks all RNGs and restores original state on exit."""
+    python_state = random.getstate()
+    numpy_state = np.random.get_state()  # noqa: NPY002
+
+    with torch.random.fork_rng():
+        try:
+            yield
+        finally:
+            random.setstate(python_state)
+            np.random.set_state(numpy_state)  # noqa: NPY002
+
+
 def _strip_launcher_args(value: str) -> str:
     strip_pairs = []
     if supports_amd_cdna_tunables():
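Reviewer note: these helpers make RNG-dependent tests hermetic across Python, NumPy, and PyTorch. A hypothetical usage sketch (the test itself is not part of this commit):

```python
import torch

from helion._testing import fork_rng, seed_rng

def test_rng_isolation() -> None:
    # Reseeding inside fork_rng is invisible outside the block: all three RNG
    # states are restored on exit, so identical seeds give identical draws.
    with fork_rng():
        seed_rng(0)
        a = torch.rand(3)
    with fork_rng():
        seed_rng(0)
        b = torch.rand(3)
    assert torch.equal(a, b)
```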

helion/autotuner/base_search.py

Lines changed: 199 additions & 0 deletions

@@ -156,6 +156,17 @@ def cleanup(self) -> None:
         self._precompile_args_path = None
         self._precompile_result_counter = count()
 
+    def _get_checkpoint_dir(self) -> Path:
+        """Get checkpoint directory for autotuner checkpoints."""
+        from torch._inductor.runtime.cache_dir_utils import cache_dir
+
+        if (user_path := os.environ.get("HELION_CACHE_DIR", None)) is not None:
+            base = Path(user_path)
+        else:
+            base = Path(cache_dir()) / "helion"
+
+        return base / "autotuner_checkpoints"
+
     def _clone_args(self, args: Sequence[object]) -> Sequence[object]:
         def _clone_leaf(leaf: object) -> object:
             if isinstance(leaf, torch.Tensor):
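Reviewer note: with `HELION_CACHE_DIR` unset, inductor's `cache_dir()` typically resolves to `/tmp/torchinductor_<user>` (overridable via `TORCHINDUCTOR_CACHE_DIR`), so checkpoints land under `.../helion/autotuner_checkpoints`. A standalone sketch of the same resolution logic, reusing the diff's own import; the function name is illustrative:

```python
import os
from pathlib import Path

from torch._inductor.runtime.cache_dir_utils import cache_dir

def resolve_checkpoint_dir() -> Path:
    # Mirrors _get_checkpoint_dir above: HELION_CACHE_DIR wins; otherwise fall
    # back to the shared inductor cache directory plus a "helion" namespace.
    user_path = os.environ.get("HELION_CACHE_DIR")
    base = Path(user_path) if user_path is not None else Path(cache_dir()) / "helion"
    return base / "autotuner_checkpoints"

print(resolve_checkpoint_dir())  # e.g. /tmp/torchinductor_alice/helion/autotuner_checkpoints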
@@ -685,6 +696,43 @@ def autotune(self, *, skip_cache: bool = False) -> Config:
             torch.save(self.args, args_path)
             self._precompile_args_path = args_path
         exit_stack.callback(self.cleanup)
+
+        checkpoint_loaded = False
+        if self.settings.autotune_checkpoint_id is not None:
+            from .local_cache import LocalAutotuneCache
+
+            checkpoint_id = self.settings.autotune_checkpoint_id
+            current_hash = LocalAutotuneCache(self)._generate_key().stable_hash()
+
+            # Checkpoint ID format: {8-char-hash}-{timestamp}-{8-char-uuid}
+            # Extract hash prefix and check compatibility
+            hash_prefix = checkpoint_id.split("-")[0]
+            if hash_prefix != current_hash[:8]:
+                self.log(
+                    f"Warning: Checkpoint '{checkpoint_id}' is for a different kernel "
+                    f"(hash mismatch). Ignoring checkpoint and starting fresh autotuning run.",
+                    level=logging.WARNING,
+                )
+            else:
+                # Hash matches, load checkpoint
+                checkpoint_dir = self._get_checkpoint_dir()
+                checkpoint_file = checkpoint_dir / f"{checkpoint_id}.checkpoint"
+                if not checkpoint_file.exists():
+                    raise FileNotFoundError(
+                        f"Checkpoint file not found: {checkpoint_file}"
+                    )
+                self.log(f"Resuming from checkpoint: {checkpoint_file}")
+                with open(checkpoint_file, "rb") as f:
+                    state = pickle.load(f)
+                self.load_state_dict(state)
+                self.log(
+                    f"Resumed at generation {self._current_generation} with "
+                    f"{len(self.population)} configs"  # type: ignore[attr-defined]
+                )
+                checkpoint_loaded = True
+
+        if not checkpoint_loaded:
+            self._init_search()
         best = self._autotune()
         end = time.perf_counter()
         kernel_decorator = self.kernel.format_kernel_decorator(best, self.settings)
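Reviewer note: the ID format checked above is easy to validate by hand. A hypothetical helper (not part of this commit) that splits an ID into its three fields; only the hash prefix participates in the compatibility check:

```python
def parse_checkpoint_id(checkpoint_id: str) -> tuple[str, int, str]:
    # Format: {8-char-hash}-{unix-timestamp}-{8-char-uuid}
    hash_prefix, timestamp, short_uuid = checkpoint_id.split("-")
    return hash_prefix, int(timestamp), short_uuid

assert parse_checkpoint_id("a1b2c3d4-1706123456-e5f6g7h8") == (
    "a1b2c3d4",
    1706123456,
    "e5f6g7h8",
)
```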
@@ -701,6 +749,16 @@ def autotune(self, *, skip_cache: bool = False) -> Config:
             print(triton_code, file=sys.stderr)
         return best
 
+    def _init_search(self) -> None:
+        """
+        Initialize the search state for a fresh autotuning run.
+
+        This method is called when starting autotuning without a checkpoint.
+        Subclasses should override this to set up initial population and state.
+        After this method, _current_generation should reflect the last completed generation.
+        """
+        raise NotImplementedError
+
     def _autotune(self) -> Config:
         """
         Abstract method to perform the actual autotuning.
@@ -712,6 +770,102 @@ def _autotune(self) -> Config:
         """
         raise NotImplementedError
 
+    def save_checkpoint(self) -> Path:
+        """
+        Save current autotuner state to checkpoint file.
+
+        Each call generates a new checkpoint ID for the saved checkpoint.
+
+        Returns:
+            Path to saved checkpoint file
+        """
+        from .local_cache import LocalAutotuneCache
+
+        # Checkpoint ID format: {8-char-hash}-{timestamp}-{8-char-uuid}
+        stable_hash = LocalAutotuneCache(self)._generate_key().stable_hash()[:8]
+        timestamp = int(time.time())
+        short_uuid = uuid.uuid4().hex[:8]
+        new_checkpoint_id = f"{stable_hash}-{timestamp}-{short_uuid}"
+        filename = f"{new_checkpoint_id}.checkpoint"
+
+        checkpoint_dir = self._get_checkpoint_dir()
+        checkpoint_dir.mkdir(parents=True, exist_ok=True)
+        checkpoint_path = checkpoint_dir / filename
+
+        state = self.state_dict()
+
+        # Atomic write using temp file
+        tmp = checkpoint_dir / f"tmp.{uuid.uuid4()!s}"
+        with open(tmp, "wb") as f:
+            pickle.dump(state, f)
+        os.rename(str(tmp), str(checkpoint_path))
+
+        self.log(f"Checkpoint saved: {checkpoint_path}")
+        self.log(
+            f"To resume from this checkpoint, set HELION_AUTOTUNE_CHECKPOINT_ID={new_checkpoint_id} "
+            f'or `autotune_checkpoint_id="{new_checkpoint_id}"` in the kernel settings'
+        )
+        return checkpoint_path
+
+    def state_dict(self) -> dict[str, Any]:
+        """
+        Return autotuner state as a dictionary.
+
+        Subclasses should call super().state_dict() first, then update with their own fields.
+        """
+        import numpy as np
+
+        from .local_cache import LocalAutotuneCache
+
+        rng_state: dict[str, Any] = {
+            "random": random.getstate(),
+            "torch": torch.random.get_rng_state(),
+            "numpy": np.random.get_state(),  # noqa: NPY002
+        }
+        if torch.cuda.is_available():
+            rng_state["torch_cuda"] = torch.cuda.get_rng_state()
+
+        return {
+            "algorithm": self.__class__.__name__,
+            "cache_key_stable_hash": LocalAutotuneCache(self)
+            ._generate_key()
+            .stable_hash(),
+            "counters": dict(self.counters),
+            "rng_state": rng_state,
+            "best_perf_so_far": self.best_perf_so_far,
+            "current_generation": self._current_generation,
+        }
+
+    def load_state_dict(self, state: dict[str, Any]) -> None:
+        """
+        Restore autotuner state from a dictionary.
+
+        Subclasses should call super().load_state_dict(state) first,
+        then restore their own fields.
+        """
+        from .local_cache import LocalAutotuneCache
+
+        current_hash = LocalAutotuneCache(self)._generate_key().stable_hash()
+        if state.get("cache_key_stable_hash") != current_hash:
+            raise exc.CheckpointError(
+                "State dict is incompatible: kernel, hardware, or input shapes may have changed"
+            )
+
+        import numpy as np
+
+        # Restore RNG state
+        rng_state = state["rng_state"]
+        random.setstate(rng_state["random"])
+        torch.random.set_rng_state(rng_state["torch"])
+        np.random.set_state(rng_state["numpy"])  # noqa: NPY002
+        if "torch_cuda" in rng_state and torch.cuda.is_available():
+            torch.cuda.set_rng_state(rng_state["torch_cuda"])
+
+        # Restore autotuner state
+        self.counters = collections.Counter(state["counters"])
+        self.best_perf_so_far = state["best_perf_so_far"]
+        self._current_generation = state["current_generation"]
+
 
 @dataclasses.dataclass
 class PopulationMember:
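Reviewer note: `save_checkpoint` uses the write-to-temp-then-rename pattern. A self-contained sketch of that pattern with illustrative names; on POSIX, a rename within one filesystem is atomic, so a crash or concurrent reader never observes a half-written checkpoint:

```python
import os
import pickle
import uuid
from pathlib import Path

def atomic_pickle(obj: object, path: Path) -> None:
    # Write to a temp file in the destination directory, then rename into place.
    tmp = path.parent / f"tmp.{uuid.uuid4()!s}"
    with open(tmp, "wb") as f:
        pickle.dump(obj, f)
    os.rename(tmp, path)
```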
@@ -817,6 +971,8 @@ def best(self) -> PopulationMember:
 
     def set_generation(self, generation: int) -> None:
         self._current_generation = generation
+        if generation > 0:
+            self.save_checkpoint()
 
     def benchmark_flat(self, flat_values: FlatConfig) -> PopulationMember:
         """
@@ -970,6 +1126,49 @@ def statistics(self) -> str:
         """
         return population_statistics(self.population)
 
+    def state_dict(self) -> dict[str, Any]:
+        state = super().state_dict()
+        # Serialize population (excluding fn which will be recompiled on load)
+        population_state = []
+        for member in self.population:
+            population_state.append(
+                {
+                    "perfs": member.perfs,
+                    "flat_values": member.flat_values,
+                    "config": member.config,
+                    "status": member.status,
+                    "compile_time": member.compile_time,
+                }
+            )
+        state["population"] = population_state
+        return state
+
+    def load_state_dict(self, state: dict[str, Any]) -> None:
+        super().load_state_dict(state)
+
+        # Restore population
+        self.population = []
+        for member_state in state["population"]:
+            member = PopulationMember(
+                fn=_unset_fn,
+                perfs=member_state["perfs"],
+                flat_values=member_state["flat_values"],
+                config=member_state["config"],
+                status=member_state["status"],
+                compile_time=member_state.get("compile_time"),
+            )
+            self.population.append(member)
+
+        # Recompile kernel functions for all population members
+        for member in self.population:
+            if member.fn is _unset_fn and member.status == "ok":
+                try:
+                    member.fn = self.kernel.compile_config(
+                        member.config, allow_print=False
+                    )
+                except Exception:
+                    member.fn = _unset_fn
+
 
 def population_statistics(population: list[PopulationMember]) -> str:
     """
