replace automation action validation eval with lease renewal crash eval

zzstoatzz · claude · zzstoatzz · commit 5ff3463e685f · 2025-12-30T23:11:29.000-06:00
The previous eval tested automation Jinja template type mismatches, which required specialized domain knowledge the agent didn't have access to. This new eval tests diagnosing flow runs that crash due to concurrency lease renewal failure - a real user pain point (prefect#19068, prefect#18839) that can be diagnosed from first principles by reading the crash state message. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/evals/README.md b/evals/README.md
@@ -57,8 +57,8 @@ Provider defaults:
 | **automations/test_create_reactive_automation** | verifies agent can create reactive automations | ✅ implemented | [#47](https://github.com/PrefectHQ/prefect-mcp-server/pull/47) |
 | **automations/test_create_proactive_automation** | verifies agent can create proactive automations | ✅ implemented | - |
 | **automations/test_debug_automation_not_firing** | verifies agent can debug why an automation didn't fire due to threshold mismatch | ✅ implemented | [#62](https://github.com/PrefectHQ/prefect-mcp-server/issues/62) |
-| **automations/test_debug_action_validation_failure** | verifies agent can identify parameter type mismatches between Jinja templates and deployment schemas | ✅ implemented | [#97](https://github.com/PrefectHQ/prefect-mcp-server/issues/97) |
 | **test_trigger_deployment_run** | verifies agent can trigger deployment runs with custom parameters | ✅ implemented | - |
+| **test_lease_renewal_crash** | verifies agent can diagnose flow runs that crashed due to concurrency lease renewal failure | ✅ implemented | [#97](https://github.com/PrefectHQ/prefect-mcp-server/issues/97) |
 | **rate_limits/test_cloud_direct** | verifies agent can diagnose rate limiting when user asks about 429 errors (Cloud) | ✅ implemented | [#46](https://github.com/PrefectHQ/prefect-mcp-server/issues/46) |
 | **rate_limits/test_cloud_no_throttling** | verifies agent correctly rules out rate limiting when no throttling occurred (Cloud) | ✅ implemented | [#46](https://github.com/PrefectHQ/prefect-mcp-server/issues/46) |
 | **rate_limits/test_cloud_correlate_logs** | verifies agent can correlate 429 warnings in flow logs with rate limit data (Cloud) | ✅ implemented | [#46](https://github.com/PrefectHQ/prefect-mcp-server/issues/46) |
diff --git a/evals/automations/test_debug_action_validation_failure.py b/evals/automations/test_debug_action_validation_failure.py
diff --git a/evals/test_lease_renewal_crash.py b/evals/test_lease_renewal_crash.py
@@ -0,0 +1,78 @@
+"""Eval for diagnosing flow runs that crash due to concurrency lease renewal failure.
+
+Based on real user issues:
+- https://github.com/PrefectHQ/prefect/issues/19068
+- https://github.com/PrefectHQ/prefect/issues/18839
+
+When a flow run holds a concurrency slot, it must periodically renew the lease.
+If renewal fails (network issues, API problems, timeout), Prefect crashes the run
+to prevent over-allocation. This is a common production issue that's hard to diagnose
+without understanding Prefect's internal lease renewal mechanism.
+"""
+
+from collections.abc import Awaitable, Callable
+from uuid import uuid4
+
+import pytest
+from prefect import flow
+from prefect.client.orchestration import PrefectClient
+from prefect.client.schemas.objects import FlowRun
+from prefect.states import Crashed
+from pydantic_ai import Agent
+
+from evals._tools.spy import ToolCallSpy
+
+LEASE_RENEWAL_ERROR = (  # message attached to the forced Crashed state below
+    "Concurrency lease renewal failed - slots are no longer reserved. "
+    "Terminating execution to prevent over-allocation."
+)
+
+
+@pytest.fixture
+async def crashed_lease_renewal_flow_run(prefect_client: PrefectClient) -> FlowRun:
+    """Create a flow run that crashed due to lease renewal failure."""
+
+    @flow(name=f"data-pipeline-{uuid4().hex[:8]}")
+    def data_pipeline():
+        return "completed"
+
+    # Run the flow to create a flow run
+    state = data_pipeline(return_state=True)
+    flow_run = await prefect_client.read_flow_run(state.state_details.flow_run_id)
+
+    # Force to Crashed state with lease renewal error message
+    crashed_state = Crashed(message=LEASE_RENEWAL_ERROR)
+    await prefect_client.set_flow_run_state(
+        flow_run_id=flow_run.id,
+        state=crashed_state,
+        force=True,
+    )
+
+    return await prefect_client.read_flow_run(flow_run.id)
+
+
+async def test_diagnoses_lease_renewal_failure(
+    simple_agent: Agent,
+    crashed_lease_renewal_flow_run: FlowRun,
+    evaluate_response: Callable[[str, str], Awaitable[None]],
+    tool_call_spy: ToolCallSpy,
+) -> None:
+    """Test agent identifies concurrency lease renewal failure as crash cause."""
+    prompt = (
+        f"Why did my flow run '{crashed_lease_renewal_flow_run.name}' crash "
+        "unexpectedly during execution? It was running fine and then suddenly crashed."
+    )
+
+    async with simple_agent:
+        result = await simple_agent.run(prompt)
+
+    await evaluate_response(
+        "Does the agent correctly identify that the flow run crashed due to "
+        "concurrency lease renewal failure? The response should mention "
+        "'lease renewal' or 'concurrency slot' and explain that the run was "
+        "terminated because the lease could not be renewed.",
+        result.output,
+    )
+
+    # Agent must use get_flow_runs to retrieve the crash details
+    tool_call_spy.assert_tool_was_called("get_flow_runs")