Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 50 additions & 9 deletions src/operator/utils/node_validation_test/connection_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,15 @@ class URLTestConfig(pydantic.BaseModel):
method: str = pydantic.Field(default='GET', description='HTTP method to use')
timeout: int = pydantic.Field(
default=30, description='Timeout in seconds for the connection test')
expected_status_code: int = pydantic.Field(
default=200, description='Expected HTTP status code')
expected_status_code: Optional[int] = pydantic.Field(
default=None, description='Expected HTTP status code (None means any non-5xx is success)')
condition_name: Optional[str] = pydantic.Field(
default='ServiceConnectionTestFailure', description='Custom condition name for this URL')
retriable_status_codes: List[int] = pydantic.Field(
default=[429, 503], description='Status codes that should trigger retry')
failure_status_codes: List[int] = pydantic.Field(
default=[500, 501, 502, 504, 505, 506, 507, 508, 510, 511],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems kind of odd that by default this is full of 500 status codes, but down below we have

 if status_code in url_config.failure_status_codes or status_code >= 500:

This might as well default to an empty list if we always check all 500 status codes.

description='Status codes indicating actual service failure (5xx except retriable)')


class ConnectionTestConfig(test_base.NodeTestConfig):
Expand Down Expand Up @@ -119,6 +124,13 @@ def _connection_test(self, url_config: URLTestConfig) -> test_base.NodeCondition

Returns:
NodeCondition on success, None on failure (to trigger retry/backoff).

Status code handling:
- If expected_status_code is set, only that code is considered success
- If expected_status_code is None (default):
- Retriable codes (429, 503) trigger retry
- Failure codes (5xx except retriable) indicate service is down
- All other codes (2xx, 3xx, 4xx) indicate service is reachable
"""
try:
logging.info('Testing URL: %s', url_config.url)
Expand All @@ -128,21 +140,50 @@ def _connection_test(self, url_config: URLTestConfig) -> test_base.NodeCondition
timeout=url_config.timeout,
)

if response.status_code != url_config.expected_status_code:
logging.error(
'Unexpected status code from %s: %s != %s',
status_code = response.status_code

# If expected_status_code is explicitly set, use strict matching
if url_config.expected_status_code is not None:
if status_code != url_config.expected_status_code:
logging.error(
'Unexpected status code from %s: %s != %s',
url_config.url,
status_code,
url_config.expected_status_code,
)
return None
else:
# Check if status code is retriable (e.g., 429 rate limiting, 503 unavailable)
if status_code in url_config.retriable_status_codes:
logging.warning(
'Retriable status code from %s: %s, will retry',
url_config.url,
status_code,
)
return None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How is the behavior different between this and failure_status_codes since they both return None. How do we differentiate between the two to retry with this one but not with the one below?


# Check if status code indicates actual service failure (5xx errors)
if status_code in url_config.failure_status_codes or status_code >= 500:
logging.error(
'Service failure status code from %s: %s',
url_config.url,
status_code,
)
return None

# Any other status code (2xx, 3xx, 4xx) means service is reachable
logging.info(
'Service reachable at %s with status code %s',
url_config.url,
response.status_code,
url_config.expected_status_code,
status_code,
)
return None

logging.info('URL test passed: %s (%s)', url_config.url, url_config.condition_name)
return test_base.NodeCondition(
type=url_config.condition_name or self.config.condition_name,
status='False',
reason='ServiceConnectionSuccess',
message=f'Connection test passed: {url_config.url}',
message=f'Connection test passed: {url_config.url} (status: {status_code})',
)
except requests.RequestException as e:
logging.error('Connection test failed for %s: %s', url_config.url, str(e))
Expand Down
Loading