Add PatternSearch autotuning algorithm (#696)

jansel · web-flow · commit 66cbf6a7345b · 2025-09-27T20:39:44.000-07:00
diff --git a/helion/autotuner/__init__.py b/helion/autotuner/__init__.py
@@ -11,4 +11,12 @@
 from .finite_search import FiniteSearch as FiniteSearch
 from .local_cache import LocalAutotuneCache as LocalAutotuneCache
 from .local_cache import StrictLocalAutotuneCache as StrictLocalAutotuneCache
+from .pattern_search import PatternSearch as PatternSearch
 from .random_search import RandomSearch as RandomSearch
+
+search_algorithms = {  # registry: each key is the search class's own name, value is the class
+    "DifferentialEvolutionSearch": DifferentialEvolutionSearch,
+    "FiniteSearch": FiniteSearch,
+    "PatternSearch": PatternSearch,
+    "RandomSearch": RandomSearch,
+}
diff --git a/helion/autotuner/config_fragment.py b/helion/autotuner/config_fragment.py
@@ -3,7 +3,9 @@
 import dataclasses
 import enum
 import random
+from typing import Iterable
 from typing import TypeGuard
+from typing import cast
 
 from ..exc import InvalidConfig
 
@@ -36,6 +38,10 @@ def random(self) -> object:
         """Return the default value for this fragment."""
         raise NotImplementedError
 
+    def pattern_neighbors(self, current: object) -> list[object]:
+        """Return the values adjacent to `current` for PatternSearch; this base version always raises."""
+        raise NotImplementedError
+
     def differential_mutation(self, a: object, b: object, c: object) -> object:
         """Create a new value by combining a, b, and c with something like: `a + (b - c)`"""
         if b == c:
@@ -62,6 +68,24 @@ def default(self) -> list[int]:
     def random(self) -> list[int]:
         return random.sample(range(self.length), self.length)
 
+    def pattern_neighbors(self, current: object) -> list[object]:  # every variant of `current` with one pair of positions swapped
+        sequence = list(cast("Iterable[int]", current))  # private copy; `current` itself is never mutated
+        if len(sequence) != self.length:
+            raise ValueError(
+                f"Expected permutation of length {self.length}, got {len(sequence)}"
+            )
+        if {*sequence} != {*range(self.length)}:  # with the length check above: each of 0..length-1 exactly once
+            raise ValueError(
+                f"Expected permutation of range({self.length}), got {sequence!r}"
+            )
+        neighbors: list[object] = []
+        for i in range(self.length):
+            for j in range(i + 1, self.length):  # j > i, so each unordered pair (i, j) is swapped once
+                swapped = [*sequence]  # fresh list per neighbor
+                swapped[i], swapped[j] = swapped[j], swapped[i]
+                neighbors.append(swapped)
+        return neighbors  # length * (length - 1) // 2 lists; empty when length < 2
+
 
 @dataclasses.dataclass
 class BaseIntegerFragment(ConfigSpecFragment):
@@ -85,13 +109,37 @@ def clamp(self, val: int) -> int:
     def get_minimum(self) -> int:
         return self.low
 
+    def pattern_neighbors(self, current: object) -> list[object]:  # current - 1 and current + 1, minus steps past a bound
+        if type(current) is not int:  # exact type check: bool is a subclass of int and is not allowed
+            raise TypeError(f"Expected int, got {type(current).__name__}")
+        neighbors: list[object] = []
+        lower = current - 1
+        upper = current + 1
+        if lower >= self.low:  # the step down is only checked against the lower bound
+            neighbors.append(lower)
+        if upper <= self.high:  # the step up is only checked against the upper bound
+            neighbors.append(upper)
+        return neighbors  # ordered [lower, upper]; either entry may be missing
+
 
 class PowerOfTwoFragment(BaseIntegerFragment):
     def random(self) -> int:
         assert_integer_power_of_two(self.low)
         assert_integer_power_of_two(self.high)
         return 2 ** random.randrange(self.low.bit_length() - 1, self.high.bit_length())
 
+    def pattern_neighbors(self, current: object) -> list[object]:  # half and double of `current`, minus steps past a bound
+        if type(current) is not int or current <= 0:  # non-positive ints also raise TypeError; power-of-two-ness is not verified here
+            raise TypeError(f"Expected positive power-of-two int, got {current!r}")
+        neighbors: list[object] = []
+        lower = current // 2  # floor division, so current == 1 gives 0
+        if lower >= self.low:  # the halved value is only checked against the lower bound
+            neighbors.append(lower)
+        upper = current * 2
+        if upper <= self.high:  # the doubled value is only checked against the upper bound
+            neighbors.append(upper)
+        return neighbors  # ordered [lower, upper]; either entry may be missing
+
     def differential_mutation(self, a: object, b: object, c: object) -> int:
         ai = assert_integer_power_of_two(a)
         assert isinstance(b, int)
@@ -132,6 +180,11 @@ def default(self) -> object:
     def random(self) -> object:
         return random.choice(self.choices)
 
+    def pattern_neighbors(self, current: object) -> list[object]:  # every choice other than `current` is a neighbor
+        if current not in self.choices:  # reject values outside the declared choices
+            raise ValueError(f"{current!r} not a valid choice")
+        return [choice for choice in self.choices if choice != current]  # keeps the order of self.choices
+
     def differential_mutation(self, a: object, b: object, c: object) -> object:
         if b == c:
             return a
@@ -148,6 +201,11 @@ def default(self) -> bool:
     def random(self) -> bool:
         return random.choice((False, True))
 
+    def pattern_neighbors(self, current: object) -> list[object]:  # the single neighbor of a bool is its negation
+        if type(current) is not bool:  # exact type check: ints such as 0 and 1 are rejected
+            raise TypeError(f"Expected bool, got {type(current).__name__}")
+        return [not current]
+
     def differential_mutation(self, a: object, b: object, c: object) -> bool:
         assert isinstance(a, bool)
         if b is c:
diff --git a/helion/autotuner/pattern_search.py b/helion/autotuner/pattern_search.py
@@ -0,0 +1,163 @@
+from __future__ import annotations
+
+import math
+from typing import TYPE_CHECKING
+
+from .. import exc
+from .base_search import FlatConfig
+from .base_search import PopulationBasedSearch
+from .base_search import PopulationMember
+from .base_search import performance
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+    from collections.abc import Sequence
+
+    from ..runtime.config import Config
+    from ..runtime.kernel import BoundKernel
+
+
+class PatternSearch(PopulationBasedSearch):
+    """Search that explores small perturbations (one parameter, or two block sizes) around the current best."""
+
+    def __init__(
+        self,
+        kernel: BoundKernel,
+        args: Sequence[object],
+        *,
+        initial_population: int = 200,
+        copies: int = 5,
+        max_generations: int = 100,
+    ) -> None:
+        """
+        Create a PatternSearch autotuner.
+
+        Args:
+            kernel: The kernel to be autotuned.
+            args: The arguments to be passed to the kernel.
+            initial_population: The number of random configurations requested for the initial population (duplicates are dropped).
+            copies: How many of the best initial configs to start a pattern search from.
+            max_generations: Upper bound on the number of pattern-search generations to run.
+        """
+        super().__init__(kernel, args)
+        self.initial_population = initial_population
+        self.copies = copies
+        self.max_generations = max_generations
+
+    def _autotune(self) -> Config:  # returns the config of self.best; raises exc.NoConfigFound if no starting point is usable
+        self.log(
+            f"Starting PatternSearch with initial_population={self.initial_population}, copies={self.copies}"
+        )
+        visited = set()  # configs already turned into members; shared by all search copies below
+        self.population = []
+        for flat_config in self.config_gen.random_population_flat(
+            self.initial_population
+        ):
+            member = self.make_unbenchmarked(flat_config)
+            if member.config not in visited:  # drop duplicate random samples
+                visited.add(member.config)
+                self.population.append(member)
+        self.parallel_benchmark_population(self.population)
+        # again with higher accuracy
+        self.rebenchmark_population(self.population)
+        self.population.sort(key=performance)  # ascending by the same key that min() uses to pick `best` below
+        starting_points = []
+        for member in self.population[: self.copies]:  # only the first `copies` members after sorting
+            if math.isfinite(member.perf):  # filter failed compiles
+                starting_points.append(member)
+        self.log(
+            f"Initial random population of {len(self.population)}, {len(starting_points)} starting points:",
+            self.statistics,
+        )
+        if not starting_points:
+            raise exc.NoConfigFound
+
+        search_copies = [self._pattern_search_from(m, visited) for m in starting_points]  # generators; nothing runs until next()
+        for generation in range(1, self.max_generations + 1):
+            prior_best = self.best  # `best` is inherited from the base class (not shown here)
+            new_population = {id(prior_best): prior_best}  # keyed by id() so a member is stored once; prior best is always kept
+            num_neighbors = 0
+            num_active = 0
+            for search_copy in search_copies:
+                added = next(search_copy, ())  # () once this copy's generator has finished
+                if added:
+                    assert len(added) > 1  # a yielded batch is [current, *new neighbors]
+                    num_active += 1
+                    num_neighbors += len(added) - 1  # do not count the copy's current member
+                    for member in added:
+                        new_population[id(member)] = member
+            if num_active == 0:  # every copy has stopped, so there is nothing left to benchmark
+                break
+            self.population = [*new_population.values()]
+            # compile any unbenchmarked members in parallel
+            self.parallel_benchmark_population(
+                [m for m in self.population if len(m.perfs) == 0]
+            )
+            # higher-accuracy rebenchmark
+            self.rebenchmark_population(self.population)
+            self.log(
+                f"Generation {generation}, {num_neighbors} neighbors, {num_active} active:",
+                self.statistics,
+            )
+        return self.best.config
+
+    def _pattern_search_from(
+        self, current: PopulationMember, visited: set[Config]
+    ) -> Iterator[list[PopulationMember]]:
+        """
+        Run a single copy of pattern search from the given starting point; `visited` is updated in place.
+
+        We use a generator and yield [current, *unvisited neighbors] at each generation so that the
+        caller can benchmark the batches of several copies together before resuming each copy.
+        """
+        for _ in range(self.max_generations):  # hard cap on the number of batches this copy yields
+            candidates = [current]  # current stays in the batch so it competes with its neighbors
+            for flat_config in self._generate_neighbors(current.flat_values):
+                new_member = self.make_unbenchmarked(flat_config)
+                if new_member.config not in visited:  # skip configs that were already proposed
+                    visited.add(new_member.config)
+                    candidates.append(new_member)
+            if len(candidates) <= 1:
+                return  # no new candidates, stop searching
+            yield candidates  # yield new population to benchmark in parallel
+            best = min(candidates, key=performance)  # runs on the next next() call, after the caller benchmarked the batch
+            if best is current:
+                return  # no improvement, stop searching
+            current = best
+
+    def _generate_neighbors(self, base: FlatConfig) -> list[FlatConfig]:
+        """
+        Return every config that differs from `base` in one parameter, or in two block-size parameters.
+        """
+        candidates_by_index = [
+            spec.pattern_neighbors(base[index])
+            for index, spec in enumerate(self.config_gen.flat_spec)
+        ]
+        assert len(candidates_by_index) == len(base)  # one neighbor list per flat position
+        neighbors: list[FlatConfig] = []
+
+        # Add all single-parameter changes
+        for index, candidates in enumerate(candidates_by_index):
+            for candidate_value in candidates:
+                new_flat = [*base]  # copy so `base` is left untouched
+                new_flat[index] = candidate_value
+                neighbors.append(new_flat)
+
+        # Block sizes are important enough to try pairs of changes at a time
+        block_indices = self.config_gen.block_size_indices  # used as positions into candidates_by_index
+        for i_pos, first in enumerate(block_indices):
+            first_candidates = candidates_by_index[first]
+            if not first_candidates:
+                continue
+            for second in block_indices[i_pos + 1 :]:  # later entries only, so each pair is visited once
+                second_candidates = candidates_by_index[second]
+                if not second_candidates:
+                    continue
+                for first_value in first_candidates:  # cross product of the two neighbor lists
+                    for second_value in second_candidates:
+                        new_flat = [*base]
+                        new_flat[first] = first_value
+                        new_flat[second] = second_value
+                        neighbors.append(new_flat)
+
+        return neighbors  # single-parameter changes first, then block-size pairs
diff --git a/helion/runtime/settings.py b/helion/runtime/settings.py
@@ -63,10 +63,17 @@ def __exit__(self, *args: object) -> None:
 def default_autotuner_fn(
     bound_kernel: BoundKernel, args: Sequence[object], **kwargs: object
 ) -> BaseAutotuner:
-    from ..autotuner import DifferentialEvolutionSearch
     from ..autotuner import LocalAutotuneCache
-
-    return LocalAutotuneCache(DifferentialEvolutionSearch(bound_kernel, args, **kwargs))  # pyright: ignore[reportArgumentType]
+    from ..autotuner import search_algorithms
+
+    autotuner_name = os.environ.get("HELION_AUTOTUNER", "PatternSearch")  # PatternSearch when unset; NOTE(review): needs `os` imported at module top, not visible in this hunk
+    autotuner_cls = search_algorithms.get(autotuner_name)  # None when the name is not a registry key
+    if autotuner_cls is None:
+        raise ValueError(
+            f"Unknown HELION_AUTOTUNER value: {autotuner_name}, valid options are: "
+            f"{', '.join(search_algorithms.keys())}"
+        )
+    return LocalAutotuneCache(autotuner_cls(bound_kernel, args, **kwargs))  # pyright: ignore[reportArgumentType]
 
 
 def _get_autotune_random_seed() -> int:
diff --git a/test/test_autotuner.py b/test/test_autotuner.py
@@ -6,7 +6,9 @@
 from pathlib import Path
 import random
 import tempfile
+from types import SimpleNamespace
 import unittest
+from unittest import skip
 from unittest.mock import patch
 
 import pytest
@@ -20,6 +22,11 @@
 from helion._testing import import_path
 from helion._testing import skipIfRocm
 from helion.autotuner import DifferentialEvolutionSearch
+from helion.autotuner import PatternSearch
+from helion.autotuner.config_fragment import BooleanFragment
+from helion.autotuner.config_fragment import EnumFragment
+from helion.autotuner.config_fragment import IntegerFragment
+from helion.autotuner.config_fragment import PowerOfTwoFragment
 from helion.autotuner.config_generation import ConfigGeneration
 from helion.autotuner.finite_search import FiniteSearch
 from helion.autotuner.random_search import RandomSearch
@@ -174,6 +181,68 @@ def test_differential_evolution_search(self):
         fn = bound_kernel.compile_config(best)
         torch.testing.assert_close(fn(*args), args[0] @ args[1], rtol=1e-2, atol=1e-1)
 
+    @skip("too slow")  # unconditionally skipped; the body below does not run
+    def test_pattern_search(self):  # end-to-end: autotune an add kernel, then check the compiled result
+        args = (
+            torch.randn([64, 64], device=DEVICE),
+            torch.randn([64, 64], device=DEVICE),
+        )
+        bound_kernel = basic_kernels.add.bind(args)
+        random.seed(123)  # fixed seed for Python's `random` module
+        best = PatternSearch(
+            bound_kernel, args, initial_population=10, max_generations=2, copies=1
+        ).autotune()
+        fn = bound_kernel.compile_config(best)
+        torch.testing.assert_close(fn(*args), sum(args), rtol=1e-2, atol=1e-1)  # sum(args) is the elementwise sum of both inputs
+
+    def test_pattern_search_neighbor_values(self):  # one representative pattern_neighbors() case per fragment type
+        self.assertEqual(
+            PowerOfTwoFragment(1, 128, 32).pattern_neighbors(32),
+            [16, 64],
+        )
+        self.assertEqual(
+            sorted(IntegerFragment(1, 5, 3).pattern_neighbors(3)),
+            [2, 4],
+        )
+        self.assertEqual(BooleanFragment().pattern_neighbors(True), [False])
+        self.assertEqual(
+            sorted(EnumFragment(("a", "b", "c")).pattern_neighbors("b")),
+            ["a", "c"],
+        )
+
+    def test_pattern_search_block_size_pair_neighbors(self):  # checks the two-block-size neighbors from _generate_neighbors
+        search = PatternSearch.__new__(PatternSearch)  # bypass __init__, which needs a real kernel and args
+        search._visited = set()  # NOTE(review): PatternSearch in this commit never reads `_visited`; looks like a leftover
+        search.config_gen = SimpleNamespace(
+            flat_spec=[
+                PowerOfTwoFragment(16, 128, 32),
+                PowerOfTwoFragment(16, 128, 64),
+                EnumFragment(("a", "b")),
+            ],
+            block_size_indices=[0, 1],
+        )
+
+        base = [32, 64, "a"]
+        neighbors = search._generate_neighbors(base)
+
+        def diff_count(flat):  # number of positions where `flat` differs from `base`
+            return sum(
+                1
+                for current, original in zip(flat, base, strict=False)
+                if current != original
+            )
+
+        pair_neighbors = [
+            flat for flat in neighbors if diff_count(flat) == 2 and flat[2] == "a"
+        ]
+        expected = [
+            [16, 32, "a"],
+            [16, 128, "a"],
+            [64, 32, "a"],
+            [64, 128, "a"],
+        ]
+        self.assertEqual(sorted(pair_neighbors), sorted(expected))  # order-insensitive comparison
+
     def test_accuracy_check_filters_bad_config_wrong_output(self) -> None:
         bad_config = helion.Config(block_sizes=[1], num_warps=8)
         good_config = helion.Config(block_sizes=[1], num_warps=4)