@@ -1,16 +1,23 @@
 from __future__ import annotations

+import concurrent.futures
+import datetime
 import logging
 import os
+import re
+import time
 from typing import TYPE_CHECKING, Any

 import orjson
+import requests

+from vunnel.tool import fixdate
 from vunnel.utils import http_wrapper
-from vunnel.utils.vulnerability import FixedIn, Vulnerability
+from vunnel.utils.vulnerability import FixAvailability, FixedIn, Vulnerability

 if TYPE_CHECKING:
     from collections.abc import Generator
+    from types import TracebackType

     from vunnel import workspace

@@ -25,17 +32,43 @@
 class Parser:
     _input_file_ = "all.json"

-    def __init__(self, ws: workspace.Workspace, url: str, timeout: int, logger: logging.Logger | None = None):
+    def __init__(
+        self,
+        ws: workspace.Workspace,
+        url: str,
+        timeout: int,
+        fixdater: fixdate.Finder | None = None,
+        logger: logging.Logger | None = None,
+    ):
         self.workspace = ws
         self.url = url
         self.timeout = timeout
+        if not fixdater:
+            fixdater = fixdate.default_finder(ws)
+        self.fixdater = fixdater
         if logger is None:
             logger = logging.getLogger(self.__class__.__name__)
         self.logger = logger
         self.input_file_path = os.path.join(self.workspace.input_path, self._input_file_)
+        # Cache for ASA advisory dates, populated by _prefetch_asa_dates.
+        # Thread-safe: single dict operations (get/set) are atomic in CPython due to the GIL.
+        self._asa_date_cache: dict[str, str | None] = {}
+
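+    # Context-manager protocol: entering and exiting the Parser delegates to the
+    # fixdate finder, so `with Parser(...) as p:` manages the finder's lifecycle too.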
+    def __enter__(self) -> Parser:
+        self.fixdater.__enter__()
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        self.fixdater.__exit__(exc_type, exc_val, exc_tb)

     def _download(self) -> None:
         """Download the all.json data from Arch Linux security tracker and save to disk."""
+        self.fixdater.download()
         self.logger.info(f"Downloading Arch Linux vulnerability data from {self.url}")

         response = http_wrapper.get(
@@ -60,6 +93,147 @@ def _load(self) -> list[dict[str, Any]]:
         with open(self.input_file_path, "rb") as f:
             return orjson.loads(f.read())

+    def _fetch_and_cache_asa_date(self, asa_id: str) -> None:
+        """Fetch publication date for an ASA advisory and store in cache.
+
+        This method is called during prefetch only. It fetches the date from the
+        Arch Linux security tracker and caches it for later lookup.
+        """
+        # Validate ASA ID format before constructing URL (defensive check)
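+        # (real IDs look like "ASA-202106-34")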
+        if not re.match(r"^ASA-\d{1,9}-\d{1,10}$", asa_id):
+            self.logger.warning(f"Unexpected ASA ID format: {asa_id}")
+            self._asa_date_cache[asa_id] = None
+            return
+
+        url = f"https://security.archlinux.org/{asa_id}/raw"
+        try:
+            response = http_wrapper.get(
+                url,
+                logger=self.logger,
+                retries=3,
+                backoff_in_seconds=3,
+                timeout=self.timeout,
+                user_agent="vunnel/1.0 (archlinux-provider; +https://github.com/anchore/vunnel)",
+            )
+            # Parse date from plain text format: "Date : 2021-06-22"
+            text = response.text
+            match = re.search(r"^Date\s*:\s*(\d{4}-\d{2}-\d{2})", text, re.MULTILINE)
+            self._asa_date_cache[asa_id] = match.group(1) if match else None
+        except requests.RequestException:
+            self.logger.debug(f"Failed to fetch ASA {asa_id}, will use first-observed fallback")
+            self._asa_date_cache[asa_id] = None
+
+    def _get_cached_asa_date(self, asa_id: str) -> str | None:
+        """Get ASA date from cache. Returns None if not found or fetch failed."""
+        return self._asa_date_cache.get(asa_id)
+
+    def _get_best_asa_date(self, advisories: list[str]) -> str | None:
+        """Get earliest ASA date from list of advisories (cache lookup only)."""
+        if not advisories:
+            return None
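+        # Failed or missing lookups yield None, which is falsy and drops out of the
+        # comprehension; ISO-8601 date strings sort lexicographically, so min()
+        # picks the earliest date.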
+        dates = [d for asa in advisories if (d := self._get_cached_asa_date(asa))]
+        return min(dates) if dates else None
+
+    def _prefetch_asa_dates(self, data: list[dict[str, Any]], max_workers: int = 2, batch_size: int = 10) -> None:
+        """Prefetch all ASA dates with rate limiting to respect server limits.
+
+        This must be called before processing records. After this method completes,
+        all ASA dates are available via _get_cached_asa_date().
+        """
+        # Collect all unique ASA IDs from fixed vulnerabilities
+        asa_ids: set[str] = set()
+        for record in data:
+            if record.get("fixed") and record.get("status") != "Not affected":
+                for asa_id in record.get("advisories", []):
+                    asa_ids.add(asa_id)
+
+        if not asa_ids:
+            return
+
+        asa_id_list = sorted(asa_ids)  # Sort for deterministic ordering
+        self.logger.info(f"Prefetching {len(asa_id_list)} ASA advisory dates")
+
+        # Process in batches with delays to avoid overwhelming the server
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+            for i in range(0, len(asa_id_list), batch_size):
+                batch = asa_id_list[i : i + batch_size]
+                futures = [executor.submit(self._fetch_and_cache_asa_date, asa_id) for asa_id in batch]
+                concurrent.futures.wait(futures)
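+                # Each worker writes its result (or None on failure) straight into
+                # the cache, so there is nothing to collect from the futures here.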
+
+                # Rate limit: pause between batches (skip delay after last batch)
+                if i + batch_size < len(asa_id_list):
+                    time.sleep(1.0)
+
+    def _build_fixed_in_entries(  # noqa: PLR0913
+        self,
+        packages: list[str],
+        fixed_version: str,
+        issues: list[str],
+        group_id: str,
+        ecosystem: str,
+        advisories: list[str],
+    ) -> list[FixedIn]:
+        """Build FixedIn entries for each affected package."""
+        fixed_in_list: list[FixedIn] = []
+
+        # Get ASA date once for all packages
+        asa_date = self._get_best_asa_date(advisories) if fixed_version else None
+
+        for package_name in packages:
+            if not package_name:
+                continue
+
+            available: FixAvailability | None = None
+            if fixed_version:
+                # Use CVE ID if available, otherwise use AVG ID
+                vuln_id = issues[0] if issues else group_id
+
+                # Build candidates with ASA date if available
+                candidates: list[fixdate.Result] | None = None
+                if asa_date:
+                    candidates = [
+                        fixdate.Result(
+                            date=datetime.date.fromisoformat(asa_date),
+                            kind="advisory",
+                            accurate=True,
+                        ),
+                    ]
+
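+                # Ask the fixdate finder for the best-known date, offering the ASA
+                # advisory date (when we have one) as an accurate candidate.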
+                result = self.fixdater.best(
+                    vuln_id=vuln_id,
+                    cpe_or_package=package_name,
+                    fix_version=fixed_version,
+                    ecosystem=ecosystem,
+                    candidates=candidates,
+                )
+                if result and result.date:
+                    available = FixAvailability(
+                        Date=result.date.isoformat(),
+                        Kind=result.kind,
+                    )
+
+            fixed_in_list.append(
+                FixedIn(
+                    Name=package_name,
+                    NamespaceName=ecosystem,
+                    VersionFormat="pacman",
+                    Version=fixed_version if fixed_version else "None",
+                    Module=None,
+                    VendorAdvisory=None,
+                    Available=available,
+                ),
+            )
+        return fixed_in_list
+
+    def _build_metadata(self, issues: list[str], advisories: list[str]) -> dict[str, Any]:
+        """Build metadata dict with CVEs and advisories."""
+        metadata: dict[str, Any] = {}
+        if issues:
+            metadata["CVE"] = [{"Name": cve, "Link": f"https://nvd.nist.gov/vuln/detail/{cve}"} for cve in issues]
+        if advisories:
+            metadata["Advisories"] = advisories
+        return metadata
+
     def parse(self) -> Generator[tuple[str, dict[str, Any]]]:
         """Parse the Arch Linux security data and yield normalized vulnerability records."""
         self._download()
@@ -71,6 +245,9 @@ def parse(self) -> Generator[tuple[str, dict[str, Any]]]:

         self.logger.info(f"Processing {len(data)} vulnerability records")

+        # Prefetch all ASA dates concurrently before processing
+        self._prefetch_asa_dates(data)
+
         for record in data:
             try:
                 # The AVG ID is in the "name" field (e.g., "AVG-2843")
@@ -97,33 +274,15 @@ def parse(self) -> Generator[tuple[str, dict[str, Any]]]:
                 # Normalize severity
                 mapped_severity = SEVERITY_MAPPING.get(severity, "Unknown")

-                # Build FixedIn entries for each affected package
-                # Use "None" as version for unfixed vulnerabilities
-                fixed_in_list: list[FixedIn] = []
-                for package_name in packages:
-                    if package_name:
-                        fixed_in_list.append(
-                            FixedIn(
-                                Name=package_name,
-                                NamespaceName="arch:rolling",
-                                VersionFormat="pacman",
-                                Version=fixed_version if fixed_version else "None",
-                                Module=None,
-                                VendorAdvisory=None,
-                            ),
-                        )
-
-                # Build metadata with CVEs and advisories
-                metadata: dict[str, Any] = {}
-                if issues:
-                    metadata["CVE"] = [{"Name": cve, "Link": f"https://nvd.nist.gov/vuln/detail/{cve}"} for cve in issues]
-                if advisories:
-                    metadata["Advisories"] = advisories
+                # Build FixedIn entries and metadata
+                ecosystem = "arch:rolling"
+                fixed_in_list = self._build_fixed_in_entries(packages, fixed_version, issues, group_id, ecosystem, advisories)
+                metadata = self._build_metadata(issues, advisories)

                 # Build the Vulnerability object
                 vuln = Vulnerability(
                     Name=group_id,
-                    NamespaceName="arch:rolling",
+                    NamespaceName=ecosystem,
                     Description=vuln_type or f"Arch vulnerability {group_id}",
                     Severity=mapped_severity,
                     Link=f"https://security.archlinux.org/{group_id}",