1- """Negative case eval: agent should correctly identify when there are NO late runs.
1+ """Negative case eval: agent should correctly identify when a deployment has NO late runs.
22
33This tests that the agent doesn't hallucinate problems when everything is healthy.
44The blog post "Demystifying Evals for AI Agents" emphasizes balanced problem sets:
55"Test both the cases where a behavior should occur and where it shouldn't."
6+
7+ Note: We scope this to a specific deployment because the test harness is
8+ session-scoped, so other tests in the same session may create late runs. This is
9+ also more realistic - users often ask about specific deployments.
610"""
711
812from collections .abc import Awaitable , Callable
1216from prefect import flow
1317from prefect .client .orchestration import PrefectClient
1418from prefect .client .schemas .actions import WorkPoolCreate
19+ from prefect .client .schemas .responses import DeploymentResponse
1520from prefect .states import Completed , Running , Scheduled
1621from pydantic_ai import Agent
1722
1823
1924@pytest .fixture
20- async def healthy_scenario (prefect_client : PrefectClient ) -> dict :
21- """Create a healthy scenario with NO late runs.
25+ async def healthy_deployment (prefect_client : PrefectClient ) -> DeploymentResponse :
26+ """Create a healthy deployment with NO late runs.
2227
2328 - Work pool with active workers (READY status)
2429 - No concurrency limits blocking runs
@@ -51,6 +56,7 @@ def healthy_flow():
5156 name = f"healthy-deployment-{ uuid4 ().hex [:8 ]} " ,
5257 work_pool_name = work_pool_name ,
5358 )
59+ deployment = await prefect_client .read_deployment (deployment_id )
5460
5561 # Create flow runs in healthy states (NOT Late)
5662 healthy_states = [
@@ -59,7 +65,6 @@ def healthy_flow():
5965 ("completed-run" , Completed ()),
6066 ]
6167
62- flow_runs = []
6368 for name_suffix , state in healthy_states :
6469 flow_run = await prefect_client .create_flow_run_from_deployment (
6570 deployment_id = deployment_id ,
@@ -68,35 +73,38 @@ def healthy_flow():
6873 await prefect_client .set_flow_run_state (
6974 flow_run_id = flow_run .id , state = state , force = True
7075 )
71- flow_runs .append (flow_run )
7276
73- return {
74- "work_pool_name" : work_pool_name ,
75- "deployment_id" : deployment_id ,
76- "flow_runs" : flow_runs ,
77- }
77+ return deployment
7878
7979
80- async def test_no_late_runs_healthy_response (
80+ async def test_no_late_runs_for_deployment (
8181 simple_agent : Agent ,
82- healthy_scenario : dict ,
82+ healthy_deployment : DeploymentResponse ,
8383 evaluate_response : Callable [[str , str ], Awaitable [None ]],
8484) -> None :
85- """Agent should correctly identify that there are no late runs.
85+ """Agent should correctly identify that a specific deployment has no late runs.
8686
87- This is a negative case - the agent should NOT hallucinate problems.
87+ This is a negative case - the agent should NOT hallucinate problems for this
88+ deployment. We scope to a specific deployment because:
89+ 1. Other tests in the session may create late runs (shared prefect_test_harness)
90+ 2. This is more realistic - users often ask about specific deployments
8891 """
92+ deployment_name = healthy_deployment .name
93+
8994 async with simple_agent :
9095 result = await simple_agent .run (
91- "Are any of my flow runs late? Check if there are runs that have "
92- "been scheduled for a while but haven't started executing."
96+ f"Are there any late flow runs for the deployment '{ deployment_name } '? "
97+ "Check if any runs from this deployment have been scheduled for a while "
98+ "but haven't started executing."
9399 )
94100
95101 await evaluate_response (
96- """Does the response correctly indicate that there are NO late runs?
97- The agent should NOT claim there are late runs or concurrency issues
98- when the scenario has only healthy Scheduled, Running, and Completed runs.
99- It's acceptable to say something like "no late runs found" or
100- "your runs appear to be healthy".""" ,
102+ f"""Does the response correctly indicate that deployment '{ deployment_name } '
103+ has NO late runs? The agent should NOT claim there are late runs for this
104+ specific deployment. It's acceptable to say "no late runs found for this
105+ deployment" or "runs for { deployment_name } appear healthy".
106+
107+ Note: The agent may mention late runs from OTHER deployments - that's fine.
108+ The key is that it correctly identifies THIS deployment has no late runs.""" ,
101109 result .output ,
102110 )
0 commit comments