permit up to 99% assume() failures #4623

ajdavis · ajdavis · commit 76d3207e4b0d · 2026-01-18T17:19:32.000-05:00
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -8,6 +8,7 @@ their individual contributions.
 
 .. NOTE - this list is in alphabetical order by first name (or handle).
 
+* `A. Jesse Jiryu Davis <https://github.com/ajdavis>`_
 * `Aaron Meurer <https://github.com/asmeurer>`_
 * `Adam Johnson <https://github.com/adamchainz>`_
 * `Adam Matan <https://github.com/adamatan/adamatan>_`
diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst
@@ -0,0 +1,8 @@
+RELEASE_TYPE: patch
+
+This patch makes Hypothesis more tolerant of slow-to-satisfy ``assume()`` calls.
+Previously, Hypothesis would give up after ``max(max_examples * 10, 1000)``
+attempts; now it uses a statistical test to stop only when it is 99% confident
+that <1% of examples would pass (:issue:`4623`).
+
+Thanks to @ajdavis for this improvement!
diff --git a/hypothesis-python/src/hypothesis/internal/conjecture/engine.py b/hypothesis-python/src/hypothesis/internal/conjecture/engine.py
@@ -156,11 +156,26 @@ def timing_report(self) -> str:
         return "\n".join(out)
 
 
+# Statistical thresholds for assumption satisfaction rate.
+# We want to stop when we're 99% confident the true valid rate is below 1%.
+#
+# With k valid examples, we need n invalid examples such that:
+#     P(seeing <=k valid in n+k trials | true rate = 1%) <= 1%
+#
+# For k=0: (0.99)^n <= 0.01  →  n >= ln(0.01)/ln(0.99) ~= 459
+# Each additional valid example adds ~153 to the threshold (solving the
+# cumulative binomial for subsequent k values). Overruns count as invalid, so
+# stop when invalid_examples + overrun_examples >
+#     INVALID_THRESHOLD_BASE + INVALID_PER_VALID * valid_examples
+INVALID_THRESHOLD_BASE = 459
+INVALID_PER_VALID = 153
+
+
 class ExitReason(Enum):
     max_examples = "settings.max_examples={s.max_examples}"
     max_iterations = (
         "settings.max_examples={s.max_examples}, "
-        "but < 10% of examples satisfied assumptions"
+        "but < 1% of examples satisfied assumptions"
     )
     max_shrinks = f"shrunk example {MAX_SHRINKS} times"
     finished = "nothing left to do"
@@ -713,12 +728,11 @@ def test_function(self, data: ConjectureData) -> None:
             #  while in the other case below we just want to move on to shrinking.)
             if self.valid_examples >= self.settings.max_examples:
                 self.exit_with(ExitReason.max_examples)
-            if self.call_count >= max(
-                self.settings.max_examples * 10,
-                # We have a high-ish default max iterations, so that tests
-                # don't become flaky when max_examples is too low.
-                1000,
-            ):
+            # Stop when we're 99% confident the true valid rate is below 1%.
+            invalid_threshold = (
+                INVALID_THRESHOLD_BASE + INVALID_PER_VALID * self.valid_examples
+            )
+            if (self.invalid_examples + self.overrun_examples) > invalid_threshold:
                 self.exit_with(ExitReason.max_iterations)
 
         if self.__tree_is_exhausted():
@@ -1077,8 +1091,12 @@ def should_generate_more(self) -> bool:
         # but with the important distinction that this clause will move on to
         # the shrinking phase having found one or more bugs, while the other
         # will exit having found zero bugs.
-        if self.valid_examples >= self.settings.max_examples or self.call_count >= max(
-            self.settings.max_examples * 10, 1000
+        invalid_threshold = (
+            INVALID_THRESHOLD_BASE + INVALID_PER_VALID * self.valid_examples
+        )
+        if (
+            self.valid_examples >= self.settings.max_examples
+            or (self.invalid_examples + self.overrun_examples) > invalid_threshold
         ):  # pragma: no cover
             return False
 
diff --git a/hypothesis-python/tests/conjecture/test_engine.py b/hypothesis-python/tests/conjecture/test_engine.py
@@ -36,6 +36,7 @@
 from hypothesis.internal.conjecture.data import ConjectureData, Overrun, Status
 from hypothesis.internal.conjecture.datatree import compute_max_children
 from hypothesis.internal.conjecture.engine import (
+    INVALID_THRESHOLD_BASE,
     MIN_TEST_CALLS,
     ConjectureRunner,
     ExitReason,
@@ -1215,11 +1216,11 @@ def test(data):
 
 
 def test_shrink_after_max_iterations():
-    """If we find a bug, keep looking for more, and then hit the test call
-    limit, we should still proceed to shrinking.
+    """If we find a bug, keep looking for more, and then hit the invalid
+    examples limit, we should still proceed to shrinking.
     """
     max_examples = 10
-    max_iterations = max_examples * 10
+    max_iterations = INVALID_THRESHOLD_BASE
     fail_at = max_iterations - 5
 
     invalid = set()
diff --git a/hypothesis-python/tests/cover/test_testdecorators.py b/hypothesis-python/tests/cover/test_testdecorators.py
@@ -25,6 +25,7 @@
     strategies as st,
 )
 from hypothesis.errors import Unsatisfiable
+from hypothesis.internal.conjecture.engine import INVALID_THRESHOLD_BASE
 from hypothesis.strategies import (
     binary,
     booleans,
@@ -507,8 +508,8 @@ def f(v):
     with pytest.raises(
         Unsatisfiable,
         match=(
-            r"Unable to satisfy assumptions of f\. 1000 of 1000 examples "
-            r"failed a \.filter\(\) or assume\(\)"
+            rf"Unable to satisfy assumptions of f\. {INVALID_THRESHOLD_BASE+1} of "
+            rf"{INVALID_THRESHOLD_BASE+1} examples failed a \.filter\(\) or assume\(\)"
         ),
     ):
         f()
@@ -532,8 +533,8 @@ def f(v):
         pass
 
     match = (
-        r"1000 of 1000 examples were too large to finish generating; try "
-        r"reducing the typical size of your inputs\?"
+        rf"{INVALID_THRESHOLD_BASE+1} of {INVALID_THRESHOLD_BASE+1} examples were too large to"
+        rf" finish generating; try reducing the typical size of your inputs\?"
     )
     with (
         pytest.raises(Unsatisfiable, match=match),
diff --git a/hypothesis-python/tests/nocover/test_conjecture_engine.py b/hypothesis-python/tests/nocover/test_conjecture_engine.py
@@ -28,7 +28,7 @@ def test_lot_of_dead_nodes():
     @run_to_nodes
     def nodes(data):
         for i in range(4):
-            if data.draw_integer(0, 2**8 - 1) != i:
+            if data.draw_integer(0, 2**7 - 1) != i:
                 data.mark_invalid()
         data.mark_interesting(interesting_origin())
 
diff --git a/hypothesis-python/tests/pytest/test_statistics.py b/hypothesis-python/tests/pytest/test_statistics.py
@@ -53,21 +53,21 @@ def test_prints_statistics_given_option(testdir):
     out = get_output(testdir, TESTSUITE, PRINT_STATISTICS_OPTION)
     assert "Hypothesis Statistics" in out
     assert "max_examples=100" in out
-    assert "< 10% of examples satisfied assumptions" in out
+    assert "< 1% of examples satisfied assumptions" in out
 
 
 def test_prints_statistics_given_option_under_xdist(testdir):
     out = get_output(testdir, TESTSUITE, PRINT_STATISTICS_OPTION, "-n", "2")
     assert "Hypothesis Statistics" in out
     assert "max_examples=100" in out
-    assert "< 10% of examples satisfied assumptions" in out
+    assert "< 1% of examples satisfied assumptions" in out
 
 
 def test_prints_statistics_given_option_with_junitxml(testdir):
     out = get_output(testdir, TESTSUITE, PRINT_STATISTICS_OPTION, "--junit-xml=out.xml")
     assert "Hypothesis Statistics" in out
     assert "max_examples=100" in out
-    assert "< 10% of examples satisfied assumptions" in out
+    assert "< 1% of examples satisfied assumptions" in out
 
 
 @skipif_threading
@@ -80,7 +80,7 @@ def test_prints_statistics_given_option_under_xdist_with_junitxml(testdir):
     )
     assert "Hypothesis Statistics" in out
     assert "max_examples=100" in out
-    assert "< 10% of examples satisfied assumptions" in out
+    assert "< 1% of examples satisfied assumptions" in out
 
 
 UNITTEST_TESTSUITE = """