import asyncio
import aiohttp
import pandas as pd
import whois
from bs4 import BeautifulSoup
from typing import Dict, Any, Tuple
from datetime import datetime
from urllib.parse import urlparse
import logging
from concurrent.futures import ThreadPoolExecutor

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

class WebsiteEnricher:
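    """Enrich a list of project URLs with page metadata and WHOIS data.

    GitHub URLs are only flagged and skipped; other sites are fetched
    concurrently in chunks of ``max_concurrent``. Use as an async context
    manager so the shared aiohttp session is opened and closed cleanly.
    """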
    def __init__(self, max_concurrent: int = 10):
        self.max_concurrent = max_concurrent
        self.session = None
        self.total_processed = 0
        self.successful_fetches = 0
        self.failed_fetches = 0

    async def __aenter__(self):
        timeout = aiohttp.ClientTimeout(total=30)
        # Send the module-level USER_AGENT with every request.
        self.session = aiohttp.ClientSession(
            timeout=timeout, headers={'User-Agent': USER_AGENT}
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()

    def is_github_url(self, url: str) -> bool:
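        """Return True if the URL's host is github.com or www.github.com."""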
        parsed = urlparse(url)
        return parsed.netloc.lower() in ['github.com', 'www.github.com']

    async def get_page_data(self, url: str) -> Tuple[Dict[str, Any], str]:
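        """Fetch a URL and return (metadata dict, raw HTML).

        The HTML string is empty for non-200 responses and request errors.
        """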
        try:
            logger.info(f"Fetching URL: {url}")
            async with self.session.get(url) as response:
                data = {
                    'status_code': response.status,
                    'content_type': response.headers.get('content-type'),
                    'server': response.headers.get('server')
                }

                if response.status == 200:
                    self.successful_fetches += 1
                    html = await response.text()
                    soup = BeautifulSoup(html, 'html.parser')
                    logger.info(f"Successfully fetched {url} - Status: {response.status}")

                    data.update({
                        'title': soup.title.string.strip() if soup.title and soup.title.string else None,
                        'description': (soup.find('meta', {'name': 'description'}) or {}).get('content'),
                        'og_title': (soup.find('meta', {'property': 'og:title'}) or {}).get('content'),
                    })
                    return data, html
                else:
                    self.failed_fetches += 1
                    logger.warning(f"Failed to fetch {url} - Status: {response.status}")
                    return data, ""
        except Exception as e:
            self.failed_fetches += 1
            logger.error(f"Error fetching {url}: {str(e)}")
            return {'status_code': None, 'error': str(e)}, ""

    def get_domain_info(self, url: str) -> Dict[str, Any]:
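        """Blocking WHOIS lookup: creation date and registrar, or {} on failure."""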
        try:
            domain = urlparse(url).netloc
            logger.info(f"Getting WHOIS info for domain: {domain}")
            w = whois.whois(domain)
            return {
                'creation_date': w.creation_date[0] if isinstance(w.creation_date, list) else w.creation_date,
                'registrar': w.registrar
            }
        except Exception as e:
            logger.error(f"WHOIS error for {url}: {str(e)}")
            return {}

    async def process_url(self, url: str) -> Dict[str, Any]:
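        """Enrich one URL: skip GitHub repos, otherwise combine page metadata, raw HTML and WHOIS info."""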
        self.total_processed += 1
        if self.is_github_url(url):
            logger.info(f"Skipping GitHub URL: {url}")
            return {'url': url, 'type': 'github', 'skipped': True}

        data = {'url': url, 'type': 'website', 'skipped': False}

        metadata, html = await self.get_page_data(url)
        data.update(metadata)
        data['html_content'] = html

        # Run the blocking WHOIS lookup in a thread so it does not stall the event loop.
        with ThreadPoolExecutor() as executor:
            domain_info = await asyncio.get_running_loop().run_in_executor(
                executor, self.get_domain_info, url
            )
            data.update(domain_info)

        return data

    async def process_urls(self, csv_path: str) -> pd.DataFrame:
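        """Read a ``url`` column from the CSV, enrich the URLs in chunks of
        ``max_concurrent``, write the results to a timestamped CSV, and return
        the enriched DataFrame."""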
        df = pd.read_csv(csv_path)
        urls = df['url'].tolist()

        logger.info(f"Starting processing of {len(urls)} URLs...")

        results = []
        for i in range(0, len(urls), self.max_concurrent):
            chunk = urls[i:i + self.max_concurrent]
            chunk_results = await asyncio.gather(
                *[self.process_url(url) for url in chunk]
            )
            results.extend(chunk_results)

            processed = min(i + self.max_concurrent, len(urls))
            non_github = sum(1 for r in results if not r.get('skipped', False))
            logger.info(f"Progress: {processed}/{len(urls)} URLs ({non_github} websites)")
            logger.info(f"Success rate: {self.successful_fetches}/{self.total_processed} ({self.successful_fetches / self.total_processed * 100:.1f}%)")

        enriched_df = pd.DataFrame(results)
        output_path = f'enriched_websites_{datetime.now():%Y%m%d_%H%M%S}.csv'
        enriched_df.to_csv(output_path, index=False)
        logger.info(f"Results saved to {output_path}")

        return enriched_df

async def main():
    async with WebsiteEnricher() as enricher:
        df = await enricher.process_urls('data/projects.csv')
        logger.info("\nFinal Summary:")
        logger.info(f"Total URLs processed: {len(df)}")
        logger.info(f"GitHub repositories skipped: {df['skipped'].sum()}")
        logger.info(f"Websites processed: {(~df['skipped']).sum()}")
        logger.info(f"Successful fetches: {enricher.successful_fetches}")
        logger.info(f"Failed fetches: {enricher.failed_fetches}")
        logger.info(f"Success rate: {enricher.successful_fetches / enricher.total_processed * 100:.1f}%")

if __name__ == "__main__":
    asyncio.run(main())