diff --git a/AUTHORS.rst b/AUTHORS.rst index bd06aff14d..5c4a93908f 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -8,6 +8,7 @@ their individual contributions. .. NOTE - this list is in alphabetical order by first name (or handle). +* `A. Jesse Jiryu Davis `_ * `Aaron Meurer `_ * `Adam Johnson `_ * `Adam Matan _` diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst new file mode 100644 index 0000000000..d3364654c5 --- /dev/null +++ b/hypothesis-python/RELEASE.rst @@ -0,0 +1,8 @@ +RELEASE_TYPE: patch + +This patch makes Hypothesis more tolerant of slow-to-satisfy ``assume()`` calls. +Previously, Hypothesis would give up after ``max(max_examples * 10, 1000)`` attempts; now it +uses a statistical test to stop only when 99% confident that <1% of examples +would pass (:issue:`4623`). + +Thanks to @ajdavis for this improvement! diff --git a/hypothesis-python/src/hypothesis/internal/conjecture/engine.py b/hypothesis-python/src/hypothesis/internal/conjecture/engine.py index 531797c7ab..ad3874e110 100644 --- a/hypothesis-python/src/hypothesis/internal/conjecture/engine.py +++ b/hypothesis-python/src/hypothesis/internal/conjecture/engine.py @@ -156,11 +156,30 @@ def timing_report(self) -> str: return "\n".join(out) +# Stop when 99% confident the true valid rate is below 1%. +# For k valid examples, we need n invalid such that: +# P(seeing <= k valid in n+k trials | rate=1%) <= 1% +# k=0: (0.99)^n <= 0.01 -> n >= ln(0.01)/ln(0.99) +# Each additional valid example adds ~ln(0.01)/ln(0.99)/3 to threshold. 
+def _calculate_thresholds( + confidence: float = 0.99, min_valid_rate: float = 0.01 +) -> tuple[int, int]: + log_confidence = math.log(1 - confidence) + log_invalid_rate = math.log(1 - min_valid_rate) + base = math.ceil(log_confidence / log_invalid_rate) + # Approximate increase per valid example (from binomial CDF) + per_valid = math.ceil(base / 3) + return base, per_valid + + +INVALID_THRESHOLD_BASE, INVALID_PER_VALID = _calculate_thresholds() + + class ExitReason(Enum): max_examples = "settings.max_examples={s.max_examples}" max_iterations = ( "settings.max_examples={s.max_examples}, " - "but < 10% of examples satisfied assumptions" + "but < 1% of examples satisfied assumptions" ) max_shrinks = f"shrunk example {MAX_SHRINKS} times" finished = "nothing left to do" @@ -713,12 +732,11 @@ def test_function(self, data: ConjectureData) -> None: # while in the other case below we just want to move on to shrinking.) if self.valid_examples >= self.settings.max_examples: self.exit_with(ExitReason.max_examples) - if self.call_count >= max( - self.settings.max_examples * 10, - # We have a high-ish default max iterations, so that tests - # don't become flaky when max_examples is too low. - 1000, - ): + # Stop when we're 99% confident the true valid rate is below 1%. + invalid_threshold = ( + INVALID_THRESHOLD_BASE + INVALID_PER_VALID * self.valid_examples + ) + if (self.invalid_examples + self.overrun_examples) > invalid_threshold: self.exit_with(ExitReason.max_iterations) if self.__tree_is_exhausted(): @@ -1077,8 +1095,12 @@ def should_generate_more(self) -> bool: # but with the important distinction that this clause will move on to # the shrinking phase having found one or more bugs, while the other # will exit having found zero bugs. 
- if self.valid_examples >= self.settings.max_examples or self.call_count >= max( - self.settings.max_examples * 10, 1000 + invalid_threshold = ( + INVALID_THRESHOLD_BASE + INVALID_PER_VALID * self.valid_examples + ) + if ( + self.valid_examples >= self.settings.max_examples + or (self.invalid_examples + self.overrun_examples) > invalid_threshold ): # pragma: no cover return False diff --git a/hypothesis-python/tests/conjecture/test_engine.py b/hypothesis-python/tests/conjecture/test_engine.py index c251b2d96d..44d5d9e11e 100644 --- a/hypothesis-python/tests/conjecture/test_engine.py +++ b/hypothesis-python/tests/conjecture/test_engine.py @@ -36,6 +36,8 @@ from hypothesis.internal.conjecture.data import ConjectureData, Overrun, Status from hypothesis.internal.conjecture.datatree import compute_max_children from hypothesis.internal.conjecture.engine import ( + INVALID_PER_VALID, + INVALID_THRESHOLD_BASE, MIN_TEST_CALLS, ConjectureRunner, ExitReason, @@ -883,6 +885,52 @@ def f(data): assert runner.exit_reason == ExitReason.max_iterations +def test_max_iterations_with_all_invalid(): + # With assume(False) on every example, we stop after INVALID_THRESHOLD_BASE + 1 + # invalid attempts (the check is > not >=). 
+ def f(data): + data.draw_integer(0, 2**64 - 1) + data.mark_invalid() + + runner = ConjectureRunner( + f, + settings=settings( + max_examples=10_000, database=None, suppress_health_check=list(HealthCheck) + ), + ) + runner.run() + + assert runner.call_count == INVALID_THRESHOLD_BASE + 1 + assert runner.exit_reason == ExitReason.max_iterations + + +@pytest.mark.parametrize("n_valid", [1, 2, 5]) +def test_max_iterations_with_some_valid(n_valid): + valid_count = 0 + + def f(data): + nonlocal valid_count + data.draw_integer(0, 2**64 - 1) + if valid_count < n_valid: + valid_count += 1 + else: + data.mark_invalid() + + runner = ConjectureRunner( + f, + settings=settings( + max_examples=10_000, database=None, suppress_health_check=list(HealthCheck) + ), + ) + runner.run() + + assert ( + runner.call_count + == n_valid + INVALID_THRESHOLD_BASE + n_valid * INVALID_PER_VALID + 1 + ) + assert runner.exit_reason == ExitReason.max_iterations + + def test_exit_because_shrink_phase_timeout(monkeypatch): val = 0 @@ -1215,11 +1263,11 @@ def test(data): def test_shrink_after_max_iterations(): - """If we find a bug, keep looking for more, and then hit the test call - limit, we should still proceed to shrinking. + """If we find a bug, keep looking for more, and then hit the invalid + examples limit, we should still proceed to shrinking. 
""" max_examples = 10 - max_iterations = max_examples * 10 + max_iterations = INVALID_THRESHOLD_BASE fail_at = max_iterations - 5 invalid = set() diff --git a/hypothesis-python/tests/cover/test_testdecorators.py b/hypothesis-python/tests/cover/test_testdecorators.py index 33a5509c22..ad49d90701 100644 --- a/hypothesis-python/tests/cover/test_testdecorators.py +++ b/hypothesis-python/tests/cover/test_testdecorators.py @@ -25,6 +25,7 @@ strategies as st, ) from hypothesis.errors import Unsatisfiable +from hypothesis.internal.conjecture.engine import INVALID_THRESHOLD_BASE from hypothesis.strategies import ( binary, booleans, @@ -507,8 +508,8 @@ def f(v): with pytest.raises( Unsatisfiable, match=( - r"Unable to satisfy assumptions of f\. 1000 of 1000 examples " - r"failed a \.filter\(\) or assume\(\)" + rf"Unable to satisfy assumptions of f\. {INVALID_THRESHOLD_BASE+1} of " + rf"{INVALID_THRESHOLD_BASE+1} examples failed a \.filter\(\) or assume\(\)" ), ): f() @@ -532,8 +533,8 @@ def f(v): pass match = ( - r"1000 of 1000 examples were too large to finish generating; try " - r"reducing the typical size of your inputs\?" + rf"{INVALID_THRESHOLD_BASE+1} of {INVALID_THRESHOLD_BASE+1} examples were too large to" + rf" finish generating; try reducing the typical size of your inputs\?" 
) with ( pytest.raises(Unsatisfiable, match=match), diff --git a/hypothesis-python/tests/nocover/test_conjecture_engine.py b/hypothesis-python/tests/nocover/test_conjecture_engine.py index 123d568b00..3795793917 100644 --- a/hypothesis-python/tests/nocover/test_conjecture_engine.py +++ b/hypothesis-python/tests/nocover/test_conjecture_engine.py @@ -28,7 +28,7 @@ def test_lot_of_dead_nodes(): @run_to_nodes def nodes(data): for i in range(4): - if data.draw_integer(0, 2**8 - 1) != i: + if data.draw_integer(0, 2**7 - 1) != i: data.mark_invalid() data.mark_interesting(interesting_origin()) diff --git a/hypothesis-python/tests/pytest/test_statistics.py b/hypothesis-python/tests/pytest/test_statistics.py index 3c67bc949c..be8fe7cec4 100644 --- a/hypothesis-python/tests/pytest/test_statistics.py +++ b/hypothesis-python/tests/pytest/test_statistics.py @@ -53,21 +53,21 @@ def test_prints_statistics_given_option(testdir): out = get_output(testdir, TESTSUITE, PRINT_STATISTICS_OPTION) assert "Hypothesis Statistics" in out assert "max_examples=100" in out - assert "< 10% of examples satisfied assumptions" in out + assert "< 1% of examples satisfied assumptions" in out def test_prints_statistics_given_option_under_xdist(testdir): out = get_output(testdir, TESTSUITE, PRINT_STATISTICS_OPTION, "-n", "2") assert "Hypothesis Statistics" in out assert "max_examples=100" in out - assert "< 10% of examples satisfied assumptions" in out + assert "< 1% of examples satisfied assumptions" in out def test_prints_statistics_given_option_with_junitxml(testdir): out = get_output(testdir, TESTSUITE, PRINT_STATISTICS_OPTION, "--junit-xml=out.xml") assert "Hypothesis Statistics" in out assert "max_examples=100" in out - assert "< 10% of examples satisfied assumptions" in out + assert "< 1% of examples satisfied assumptions" in out @skipif_threading @@ -80,7 +80,7 @@ def test_prints_statistics_given_option_under_xdist_with_junitxml(testdir): ) assert "Hypothesis Statistics" in out assert 
"max_examples=100" in out - assert "< 10% of examples satisfied assumptions" in out + assert "< 1% of examples satisfied assumptions" in out UNITTEST_TESTSUITE = """