permit up to 99% assume() failures #4643

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

Zac-HD merged 5 commits into HypothesisWorks:master from ajdavis:issue-4623-filter-condition

Jan 28, 2026

+101 −21

AUTHORS.rst

-Original file line number
+Diff line change
@@ Expand Up / @@ -8,6 +8,7 @@ their individual contributions. @@
     .. NOTE - this list is in alphabetical order by first name (or handle).
+    * `A. Jesse Jiryu Davis <https://github.com/ajdavis>`_
     * `Aaron Meurer <https://github.com/asmeurer>`_
     * `Adam Johnson <https://github.com/adamchainz>`_
     * `Adam Matan <https://github.com/adamatan/adamatan>`_
@@ Expand Down @@

hypothesis-python/RELEASE.rst

-Original file line number
+Diff line change
@@ -0,0 +1,8 @@
+    RELEASE_TYPE: patch
+    This patch makes Hypothesis more tolerant of slow-to-satisfy ``assume()`` calls.
+    Previously, Hypothesis would give up after ``max(max_examples * 10, 1000)`` attempts; now it
+    uses a statistical test to stop only when 99% confident that <1% of examples
+    would pass (:issue:`4623`).
+    Thanks to @ajdavis for this improvement!

hypothesis-python/src/hypothesis/internal/conjecture/engine.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -156,11 +156,30 @@ def timing_report(self) -> str: @@
             return "\n".join(out)
+    # Stop when 99% confident the true valid rate is below 1%.
+    # For k valid examples, we need n invalid such that:
+    #     P(seeing <= k valid in n+k trials | rate=1%) <= 1%
+    # k=0: (0.99)^n <= 0.01 -> n >= ln(0.01)/ln(0.99)
+    # Each additional valid example adds ~ln(0.01)/ln(0.99)/3 to threshold.
+    def _calculate_thresholds(
+        confidence: float = 0.99, min_valid_rate: float = 0.01
+    ) -> tuple[int, int]:
+        log_confidence = math.log(1 - confidence)
+        log_invalid_rate = math.log(1 - min_valid_rate)
+        base = math.ceil(log_confidence / log_invalid_rate)
+        # Approximate increase per valid example (from binomial CDF)
+        per_valid = math.ceil(base / 3)
+        return base, per_valid
+    INVALID_THRESHOLD_BASE, INVALID_PER_VALID = _calculate_thresholds()
     class ExitReason(Enum):
         max_examples = "settings.max_examples={s.max_examples}"
         max_iterations = (
             "settings.max_examples={s.max_examples}, "
-            "but < 10% of examples satisfied assumptions"
+            "but < 1% of examples satisfied assumptions"
         )
         max_shrinks = f"shrunk example {MAX_SHRINKS} times"
         finished = "nothing left to do"
@@ Expand Down Expand Up / @@ -713,12 +732,11 @@ def test_function(self, data: ConjectureData) -> None: @@
                 #  while in the other case below we just want to move on to shrinking.)
                 if self.valid_examples >= self.settings.max_examples:
                     self.exit_with(ExitReason.max_examples)
-                if self.call_count >= max(
-                    self.settings.max_examples * 10,
-                    # We have a high-ish default max iterations, so that tests
-                    # don't become flaky when max_examples is too low.
-                    1000,
-                ):
+                # Stop when we're 99% confident the true valid rate is below 1%.
+                invalid_threshold = (
+                    INVALID_THRESHOLD_BASE + INVALID_PER_VALID * self.valid_examples
+                )
+                if (self.invalid_examples + self.overrun_examples) > invalid_threshold:
                     self.exit_with(ExitReason.max_iterations)
             if self.__tree_is_exhausted():
@@ Expand Down Expand Up / @@ -1077,8 +1095,12 @@ def should_generate_more(self) -> bool: @@
             # but with the important distinction that this clause will move on to
             # the shrinking phase having found one or more bugs, while the other
             # will exit having found zero bugs.
-            if self.valid_examples >= self.settings.max_examples or self.call_count >= max(
-                self.settings.max_examples * 10, 1000
+            invalid_threshold = (
+                INVALID_THRESHOLD_BASE + INVALID_PER_VALID * self.valid_examples
+            )
+            if (
+                self.valid_examples >= self.settings.max_examples
+                or (self.invalid_examples + self.overrun_examples) > invalid_threshold
             ):  # pragma: no cover
                 return False
@@ Expand Down @@

hypothesis-python/tests/conjecture/test_engine.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -36,6 +36,8 @@
  
    from hypothesis.internal.conjecture.data import ConjectureData, Overrun, Status

    from hypothesis.internal.conjecture.datatree import compute_max_children

    from hypothesis.internal.conjecture.engine import (

        INVALID_PER_VALID,

        INVALID_THRESHOLD_BASE,

        MIN_TEST_CALLS,

        ConjectureRunner,

        ExitReason,

    @@ -883,6 +885,52 @@ def f(data):
  
        assert runner.exit_reason == ExitReason.max_iterations

    def test_max_iterations_with_all_invalid():

        # With assume(False) on every example, we stop after INVALID_THRESHOLD_BASE + 1

        # invalid attempts (the check is > not >=).

        def f(data):

            data.draw_integer(0, 2**64 - 1)

            data.mark_invalid()

        runner = ConjectureRunner(

            f,

            settings=settings(

                max_examples=10_000, database=None, suppress_health_check=list(HealthCheck)

            ),

        )

        runner.run()

        assert runner.call_count == INVALID_THRESHOLD_BASE + 1

        assert runner.exit_reason == ExitReason.max_iterations

    @pytest.mark.parametrize("n_valid", [1, 2, 5])

    def test_max_iterations_with_some_valid(n_valid):

        valid_count = 0

        def f(data):

            nonlocal valid_count

            data.draw_integer(0, 2**64 - 1)

            if valid_count < n_valid:

                valid_count += 1

            else:

                data.mark_invalid()

        runner = ConjectureRunner(

            f,

            settings=settings(

                max_examples=10_000, database=None, suppress_health_check=list(HealthCheck)

            ),

        )

        runner.run()

        assert (

            runner.call_count

            == n_valid + INVALID_THRESHOLD_BASE + n_valid * INVALID_PER_VALID + 1

        )

        assert runner.exit_reason == ExitReason.max_iterations

    def test_exit_because_shrink_phase_timeout(monkeypatch):

        val = 0

    @@ -1215,11 +1263,11 @@ def test(data):
  
    def test_shrink_after_max_iterations():

        """If we find a bug, keep looking for more, and then hit the test call

        limit, we should still proceed to shrinking.

        """If we find a bug, keep looking for more, and then hit the invalid

        examples limit, we should still proceed to shrinking.

        """

        max_examples = 10

        max_iterations = max_examples * 10

        max_iterations = INVALID_THRESHOLD_BASE

        fail_at = max_iterations - 5

        invalid = set()

hypothesis-python/tests/cover/test_testdecorators.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -25,6 +25,7 @@
  
        strategies as st,

    )

    from hypothesis.errors import Unsatisfiable

    from hypothesis.internal.conjecture.engine import INVALID_THRESHOLD_BASE

    from hypothesis.strategies import (

        binary,

        booleans,

    @@ -507,8 +508,8 @@ def f(v):
  
        with pytest.raises(

            Unsatisfiable,

            match=(

                r"Unable to satisfy assumptions of f\. 1000 of 1000 examples "

                r"failed a \.filter\(\) or assume\(\)"

                rf"Unable to satisfy assumptions of f\. {INVALID_THRESHOLD_BASE+1} of "

                rf"{INVALID_THRESHOLD_BASE+1} examples failed a \.filter\(\) or assume\(\)"

            ),

        ):

            f()

    @@ -532,8 +533,8 @@ def f(v):
  
            pass

        match = (

            r"1000 of 1000 examples were too large to finish generating; try "

            r"reducing the typical size of your inputs\?"

            rf"{INVALID_THRESHOLD_BASE+1} of {INVALID_THRESHOLD_BASE+1} examples were too large to"

            rf" finish generating; try reducing the typical size of your inputs\?"

        )

        with (

            pytest.raises(Unsatisfiable, match=match),

hypothesis-python/tests/nocover/test_conjecture_engine.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -28,7 +28,7 @@ def test_lot_of_dead_nodes(): @@
         @run_to_nodes
         def nodes(data):
             for i in range(4):
-                if data.draw_integer(0, 2**8 - 1) != i:
+                if data.draw_integer(0, 2**7 - 1) != i:
                     data.mark_invalid()
             data.mark_interesting(interesting_origin())
@@ Expand Down @@

hypothesis-python/tests/pytest/test_statistics.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -53,21 +53,21 @@ def test_prints_statistics_given_option(testdir):
  
        out = get_output(testdir, TESTSUITE, PRINT_STATISTICS_OPTION)

        assert "Hypothesis Statistics" in out

        assert "max_examples=100" in out

        assert "< 10% of examples satisfied assumptions" in out

        assert "< 1% of examples satisfied assumptions" in out

    def test_prints_statistics_given_option_under_xdist(testdir):

        out = get_output(testdir, TESTSUITE, PRINT_STATISTICS_OPTION, "-n", "2")

        assert "Hypothesis Statistics" in out

        assert "max_examples=100" in out

        assert "< 10% of examples satisfied assumptions" in out

        assert "< 1% of examples satisfied assumptions" in out

    def test_prints_statistics_given_option_with_junitxml(testdir):

        out = get_output(testdir, TESTSUITE, PRINT_STATISTICS_OPTION, "--junit-xml=out.xml")

        assert "Hypothesis Statistics" in out

        assert "max_examples=100" in out

        assert "< 10% of examples satisfied assumptions" in out

        assert "< 1% of examples satisfied assumptions" in out

    @skipif_threading

    @@ -80,7 +80,7 @@ def test_prints_statistics_given_option_under_xdist_with_junitxml(testdir):
  
        )

        assert "Hypothesis Statistics" in out

        assert "max_examples=100" in out

        assert "< 10% of examples satisfied assumptions" in out

        assert "< 1% of examples satisfied assumptions" in out

    UNITTEST_TESTSUITE = """

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

permit up to 99% assume() failures #4643

Diff view

Diff view

There are no files selected for viewing

permit up to 99% assume() failures #4643

permit up to 99% assume() failures #4643

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing