Commit cb5e457
Arch fixed dates (#1001)
* feat: add fixed date tracking to arch provider

  Because all.json does not seem to contain fixed dates, leave the fixed dates in
  "first-observed" mode: the first time vunnel sees a given fixed version, the fix
  is noted and remembered.

  Signed-off-by: Will Murphy <[email protected]>

* fix: make http wrapper well-behaved

  Previously, the http wrapper had some unfriendly behaviors, such as retrying 404s
  and 429s and not respecting the host asking it to back off. Change http_wrapper so
  that it respects the Retry-After header in 429 and 503 responses, and so that it
  respects these rate limits even when called from multiple threads against the same
  host. This was done in such a way that clients of http_wrapper don't need to
  change; all the complexity is hidden inside http_wrapper (requests that are rate
  limited will simply block).

  Signed-off-by: Will Murphy <[email protected]>

* lint fix

  Signed-off-by: Will Murphy <[email protected]>

* simplify http_wrapper

  Signed-off-by: Will Murphy <[email protected]>

* relax regex on arch advisory ID

  Typically, these are ASA-YYYYMM-NN, but permit some flexibility in case this
  format changes.

  Signed-off-by: Will Murphy <[email protected]>

* fix: properly handle 0 or negative Retry-After

  Some APIs (e.g. NVD) might send a Retry-After date that is in the past, or a
  string that can be parsed as the int 0, both of which resulted in instantaneous
  retries. Instead, make a Retry-After header that would result in zero or negative
  wait fall back to a 30-second delay.

  Signed-off-by: Will Murphy <[email protected]>

---------

Signed-off-by: Will Murphy <[email protected]>
1 parent 3b9c6a8 commit cb5e457
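As context for the last fix in the message above: the http_wrapper changes themselves are not in the diff shown on this page, so here is a minimal sketch of the described Retry-After behavior. All names here (seconds_to_wait, FLOOR_SECONDS) are illustrative, not vunnel's actual http_wrapper code; it only shows honoring both header forms and clamping zero/negative waits to 30 seconds.

# Illustrative sketch only -- not vunnel's actual http_wrapper implementation.
import datetime
from email.utils import parsedate_to_datetime

FLOOR_SECONDS = 30  # hypothetical constant; the commit only says "30 seconds"


def seconds_to_wait(retry_after: str) -> float:
    """Return how long to sleep for a Retry-After header value.

    Retry-After may be an integer number of seconds or an HTTP-date.
    A past date or "0" would otherwise mean an instantaneous retry.
    """
    try:
        seconds = float(int(retry_after))  # e.g. "Retry-After: 120"
    except ValueError:
        # e.g. "Retry-After: Wed, 21 Oct 2015 07:28:00 GMT"
        when = parsedate_to_datetime(retry_after)
        now = datetime.datetime.now(datetime.timezone.utc)
        seconds = (when - now).total_seconds()
    # Clamp zero or negative waits to a sane floor instead of retrying at once.
    return seconds if seconds > 0 else FLOOR_SECONDS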

File tree

13 files changed (+1226, -119 lines)

src/vunnel/providers/arch/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ def tags(cls) -> list[str]:
 
     def update(self, last_updated: datetime.datetime | None) -> tuple[list[str], int]:
         self.logger.info("Starting Arch Linux provider update")
-        with timer(self.name(), self.logger), self.results_writer() as writer:
+        with timer(self.name(), self.logger), self.results_writer() as writer, self.parser:
             count = 0
             for identifier, payload in self.parser.parse():
                 writer.write(
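The only change here adds self.parser to the with statement. As the parser.py diff below shows, the parser now implements __enter__/__exit__ by delegating to its fixdate.Finder, so the finder is opened for the duration of the update and closed even if parsing raises. A self-contained sketch of that pattern, using toy stand-in classes rather than the real ones:

class Finder:  # toy stand-in for fixdate.Finder
    def __enter__(self):
        print("finder opened")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        print("finder closed")  # runs even if the with-body raised


class Parser:  # toy stand-in for the arch Parser
    def __init__(self):
        self.fixdater = Finder()

    def __enter__(self):
        self.fixdater.__enter__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.fixdater.__exit__(exc_type, exc_val, exc_tb)


with Parser():
    print("update runs here")  # "finder closed" prints afterwards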

src/vunnel/providers/arch/parser.py

Lines changed: 184 additions & 25 deletions
@@ -1,16 +1,23 @@
 from __future__ import annotations
 
+import concurrent.futures
+import datetime
 import logging
 import os
+import re
+import time
 from typing import TYPE_CHECKING, Any
 
 import orjson
+import requests
 
+from vunnel.tool import fixdate
 from vunnel.utils import http_wrapper
-from vunnel.utils.vulnerability import FixedIn, Vulnerability
+from vunnel.utils.vulnerability import FixAvailability, FixedIn, Vulnerability
 
 if TYPE_CHECKING:
     from collections.abc import Generator
+    from types import TracebackType
 
     from vunnel import workspace
 
@@ -25,17 +32,43 @@
 class Parser:
     _input_file_ = "all.json"
 
-    def __init__(self, ws: workspace.Workspace, url: str, timeout: int, logger: logging.Logger | None = None):
+    def __init__(
+        self,
+        ws: workspace.Workspace,
+        url: str,
+        timeout: int,
+        fixdater: fixdate.Finder | None = None,
+        logger: logging.Logger | None = None,
+    ):
         self.workspace = ws
         self.url = url
         self.timeout = timeout
+        if not fixdater:
+            fixdater = fixdate.default_finder(ws)
+        self.fixdater = fixdater
         if logger is None:
             logger = logging.getLogger(self.__class__.__name__)
         self.logger = logger
         self.input_file_path = os.path.join(self.workspace.input_path, self._input_file_)
+        # Cache for ASA advisory dates, populated by _prefetch_asa_dates.
+        # Thread-safe: single dict operations (get/set) are atomic in CPython due to the GIL.
+        self._asa_date_cache: dict[str, str | None] = {}
+
+    def __enter__(self) -> Parser:
+        self.fixdater.__enter__()
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        self.fixdater.__exit__(exc_type, exc_val, exc_tb)
 
     def _download(self) -> None:
         """Download the all.json data from Arch Linux security tracker and save to disk."""
+        self.fixdater.download()
         self.logger.info(f"Downloading Arch Linux vulnerability data from {self.url}")
 
         response = http_wrapper.get(
@@ -60,6 +93,147 @@ def _load(self) -> list[dict[str, Any]]:
         with open(self.input_file_path, "rb") as f:
             return orjson.loads(f.read())
 
+    def _fetch_and_cache_asa_date(self, asa_id: str) -> None:
+        """Fetch publication date for an ASA advisory and store in cache.
+
+        This method is called during prefetch only. It fetches the date from the
+        Arch Linux security tracker and caches it for later lookup.
+        """
+        # Validate ASA ID format before constructing URL (defensive check)
+        if not re.match(r"^ASA-\d{1,9}-\d{1,10}$", asa_id):
+            self.logger.warning(f"Unexpected ASA ID format: {asa_id}")
+            self._asa_date_cache[asa_id] = None
+            return
+
+        url = f"https://security.archlinux.org/{asa_id}/raw"
+        try:
+            response = http_wrapper.get(
+                url,
+                logger=self.logger,
+                retries=3,
+                backoff_in_seconds=3,
+                timeout=self.timeout,
+                user_agent="vunnel/1.0 (archlinux-provider; +https://github.com/anchore/vunnel)",
+            )
+            # Parse date from plain text format: "Date : 2021-06-22"
+            text = response.text
+            match = re.search(r"^Date\s*:\s*(\d{4}-\d{2}-\d{2})", text, re.MULTILINE)
+            self._asa_date_cache[asa_id] = match.group(1) if match else None
+        except requests.RequestException:
+            self.logger.debug(f"Failed to fetch ASA {asa_id}, will use first-observed fallback")
+            self._asa_date_cache[asa_id] = None
+
+    def _get_cached_asa_date(self, asa_id: str) -> str | None:
+        """Get ASA date from cache. Returns None if not found or fetch failed."""
+        return self._asa_date_cache.get(asa_id)
+
+    def _get_best_asa_date(self, advisories: list[str]) -> str | None:
+        """Get earliest ASA date from list of advisories (cache lookup only)."""
+        if not advisories:
+            return None
+        dates = [d for asa in advisories if (d := self._get_cached_asa_date(asa))]
+        return min(dates) if dates else None
+
+    def _prefetch_asa_dates(self, data: list[dict[str, Any]], max_workers: int = 2, batch_size: int = 10) -> None:
+        """Prefetch all ASA dates with rate limiting to respect server limits.
+
+        This must be called before processing records. After this method completes,
+        all ASA dates are available via _get_cached_asa_date().
+        """
+        # Collect all unique ASA IDs from fixed vulnerabilities
+        asa_ids: set[str] = set()
+        for record in data:
+            if record.get("fixed") and record.get("status") != "Not affected":
+                for asa_id in record.get("advisories", []):
+                    asa_ids.add(asa_id)
+
+        if not asa_ids:
+            return
+
+        asa_id_list = sorted(asa_ids)  # Sort for deterministic ordering
+        self.logger.info(f"Prefetching {len(asa_id_list)} ASA advisory dates")
+
+        # Process in batches with delays to avoid overwhelming the server
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+            for i in range(0, len(asa_id_list), batch_size):
+                batch = asa_id_list[i : i + batch_size]
+                futures = [executor.submit(self._fetch_and_cache_asa_date, asa_id) for asa_id in batch]
+                concurrent.futures.wait(futures)
+
+                # Rate limit: pause between batches (skip delay after last batch)
+                if i + batch_size < len(asa_id_list):
+                    time.sleep(1.0)
+
+    def _build_fixed_in_entries(  # noqa: PLR0913
+        self,
+        packages: list[str],
+        fixed_version: str,
+        issues: list[str],
+        group_id: str,
+        ecosystem: str,
+        advisories: list[str],
+    ) -> list[FixedIn]:
+        """Build FixedIn entries for each affected package."""
+        fixed_in_list: list[FixedIn] = []
+
+        # Get ASA date once for all packages
+        asa_date = self._get_best_asa_date(advisories) if fixed_version else None
+
+        for package_name in packages:
+            if not package_name:
+                continue
+
+            available: FixAvailability | None = None
+            if fixed_version:
+                # Use CVE ID if available, otherwise use AVG ID
+                vuln_id = issues[0] if issues else group_id
+
+                # Build candidates with ASA date if available
+                candidates: list[fixdate.Result] | None = None
+                if asa_date:
+                    candidates = [
+                        fixdate.Result(
+                            date=datetime.date.fromisoformat(asa_date),
+                            kind="advisory",
+                            accurate=True,
+                        ),
+                    ]
+
+                result = self.fixdater.best(
+                    vuln_id=vuln_id,
+                    cpe_or_package=package_name,
+                    fix_version=fixed_version,
+                    ecosystem=ecosystem,
+                    candidates=candidates,
+                )
+                if result and result.date:
+                    available = FixAvailability(
+                        Date=result.date.isoformat(),
+                        Kind=result.kind,
+                    )
+
+            fixed_in_list.append(
+                FixedIn(
+                    Name=package_name,
+                    NamespaceName=ecosystem,
+                    VersionFormat="pacman",
+                    Version=fixed_version if fixed_version else "None",
+                    Module=None,
+                    VendorAdvisory=None,
+                    Available=available,
+                ),
+            )
+        return fixed_in_list
+
+    def _build_metadata(self, issues: list[str], advisories: list[str]) -> dict[str, Any]:
+        """Build metadata dict with CVEs and advisories."""
+        metadata: dict[str, Any] = {}
+        if issues:
+            metadata["CVE"] = [{"Name": cve, "Link": f"https://nvd.nist.gov/vuln/detail/{cve}"} for cve in issues]
+        if advisories:
+            metadata["Advisories"] = advisories
+        return metadata
+
     def parse(self) -> Generator[tuple[str, dict[str, Any]]]:
         """Parse the Arch Linux security data and yield normalized vulnerability records."""
         self._download()
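The _prefetch_asa_dates method above bounds load on security.archlinux.org in two ways: at most max_workers requests are in flight at once, and there is a one-second pause between batches (skipped after the last). A standalone sketch of just that scheduling pattern, with a sleep standing in for the HTTP fetch and made-up advisory IDs:

import concurrent.futures
import time

def fetch(asa_id: str) -> None:
    time.sleep(0.1)  # stand-in for the real HTTP request

items = [f"ASA-DEMO-{n}" for n in range(25)]  # hypothetical IDs for illustration
batch_size = 10
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    for i in range(0, len(items), batch_size):
        batch = items[i : i + batch_size]
        # Submit one batch and wait for all of it to finish...
        concurrent.futures.wait([executor.submit(fetch, x) for x in batch])
        # ...then pause, except after the final batch (25 items -> 2 pauses).
        if i + batch_size < len(items):
            time.sleep(1.0)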
@@ -71,6 +245,9 @@ def parse(self) -> Generator[tuple[str, dict[str, Any]]]:
 
         self.logger.info(f"Processing {len(data)} vulnerability records")
 
+        # Prefetch all ASA dates concurrently before processing
+        self._prefetch_asa_dates(data)
+
         for record in data:
             try:
                 # The AVG ID is in the "name" field (e.g., "AVG-2843")
@@ -97,33 +274,15 @@ def parse(self) -> Generator[tuple[str, dict[str, Any]]]:
                 # Normalize severity
                 mapped_severity = SEVERITY_MAPPING.get(severity, "Unknown")
 
-                # Build FixedIn entries for each affected package
-                # Use "None" as version for unfixed vulnerabilities
-                fixed_in_list: list[FixedIn] = []
-                for package_name in packages:
-                    if package_name:
-                        fixed_in_list.append(
-                            FixedIn(
-                                Name=package_name,
-                                NamespaceName="arch:rolling",
-                                VersionFormat="pacman",
-                                Version=fixed_version if fixed_version else "None",
-                                Module=None,
-                                VendorAdvisory=None,
-                            ),
-                        )
-
-                # Build metadata with CVEs and advisories
-                metadata: dict[str, Any] = {}
-                if issues:
-                    metadata["CVE"] = [{"Name": cve, "Link": f"https://nvd.nist.gov/vuln/detail/{cve}"} for cve in issues]
-                if advisories:
-                    metadata["Advisories"] = advisories
+                # Build FixedIn entries and metadata
+                ecosystem = "arch:rolling"
+                fixed_in_list = self._build_fixed_in_entries(packages, fixed_version, issues, group_id, ecosystem, advisories)
+                metadata = self._build_metadata(issues, advisories)
 
                 # Build the Vulnerability object
                 vuln = Vulnerability(
                     Name=group_id,
-                    NamespaceName="arch:rolling",
+                    NamespaceName=ecosystem,
                     Description=vuln_type or f"Arch vulnerability {group_id}",
                     Severity=mapped_severity,
                     Link=f"https://security.archlinux.org/{group_id}",
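One subtlety in _get_best_asa_date (see the parser.py diff above): dates stay as strings and min() picks the earliest. That is sound because ISO-8601 dates (YYYY-MM-DD) sort lexicographically in chronological order, and the walrus clause filters out advisories whose fetch failed (cached as None). A quick check with made-up advisory IDs:

# Hypothetical cache contents; "YYYY-MM-DD" strings sort chronologically.
cache = {"ASA-202106-1": "2021-06-22", "ASA-202107-3": None, "ASA-202105-9": "2021-05-02"}
advisories = ["ASA-202106-1", "ASA-202107-3", "ASA-202105-9"]
dates = [d for asa in advisories if (d := cache.get(asa))]  # drops the None entry
assert min(dates) == "2021-05-02"  # earliest advisory date wins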
