
Commit a90cc06

Merge pull request #1 from nwspk/yh-dataexplore
Yh dataexplore (pls don't kill me if i did sth stupid)
2 parents 68f1ef1 + f60efa6 commit a90cc06

File tree

3 files changed (+465, -0 lines)


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
.DS_Store

data/scrape_links_v2.py

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
import asyncio
import aiohttp
import pandas as pd
import whois
from bs4 import BeautifulSoup
from typing import Dict, Any, Tuple
from datetime import datetime
from urllib.parse import urlparse
import logging
from concurrent.futures import ThreadPoolExecutor

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"


# Fetches each URL, extracts basic page metadata and WHOIS details,
# and writes the enriched rows back out as a CSV.
class WebsiteEnricher:
    def __init__(self, max_concurrent: int = 10):
        self.max_concurrent = max_concurrent
        self.session = None
        self.total_processed = 0
        self.successful_fetches = 0
        self.failed_fetches = 0

    async def __aenter__(self):
        timeout = aiohttp.ClientTimeout(total=30)
        self.session = aiohttp.ClientSession(timeout=timeout)
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()

    def is_github_url(self, url: str) -> bool:
        parsed = urlparse(url)
        return parsed.netloc.lower() in ['github.com', 'www.github.com']

    async def get_page_data(self, url: str) -> Tuple[Dict[str, Any], str]:
        # Returns (metadata dict, raw HTML); the HTML string is empty on failure.
        try:
            logger.info(f"Fetching URL: {url}")
            async with self.session.get(url) as response:
                data = {
                    'status_code': response.status,
                    'content_type': response.headers.get('content-type'),
                    'server': response.headers.get('server')
                }

                if response.status == 200:
                    self.successful_fetches += 1
                    html = await response.text()
                    soup = BeautifulSoup(html, 'html.parser')
                    logger.info(f"Successfully fetched {url} - Status: {response.status}")

                    data.update({
                        'title': soup.title.string.strip() if soup.title else None,
                        'description': (soup.find('meta', {'name': 'description'}) or {}).get('content'),
                        'og_title': (soup.find('meta', {'property': 'og:title'}) or {}).get('content'),
                    })
                    return data, html
                else:
                    self.failed_fetches += 1
                    logger.warning(f"Failed to fetch {url} - Status: {response.status}")
                    return data, ""
        except Exception as e:
            self.failed_fetches += 1
            logger.error(f"Error fetching {url}: {str(e)}")
            return {'status_code': None, 'error': str(e)}, ""

    def get_domain_info(self, url: str) -> Dict[str, Any]:
        # Blocking WHOIS lookup; run via a thread pool from process_url.
        try:
            domain = urlparse(url).netloc
            logger.info(f"Getting WHOIS info for domain: {domain}")
            w = whois.whois(domain)
            return {
                'creation_date': w.creation_date[0] if isinstance(w.creation_date, list) else w.creation_date,
                'registrar': w.registrar
            }
        except Exception as e:
            logger.error(f"WHOIS error for {url}: {str(e)}")
            return {}

    async def process_url(self, url: str) -> Dict[str, Any]:
        self.total_processed += 1
        # GitHub repository links are skipped; only plain websites are enriched.
        if self.is_github_url(url):
            logger.info(f"Skipping GitHub URL: {url}")
            return {'url': url, 'type': 'github', 'skipped': True}

        data = {'url': url, 'type': 'website', 'skipped': False}

        metadata, html = await self.get_page_data(url)
        data.update(metadata)
        data['html_content'] = html

        # Push the blocking WHOIS call off the event loop.
        with ThreadPoolExecutor() as executor:
            domain_info = await asyncio.get_event_loop().run_in_executor(
                executor, self.get_domain_info, url
            )
        data.update(domain_info)

        return data

    async def process_urls(self, csv_path: str) -> pd.DataFrame:
        df = pd.read_csv(csv_path)
        urls = df['url'].tolist()

        logger.info(f"Starting processing of {len(urls)} URLs...")

        results = []
        # Process the URLs in chunks so at most max_concurrent requests run at once.
        for i in range(0, len(urls), self.max_concurrent):
            chunk = urls[i:i + self.max_concurrent]
            chunk_results = await asyncio.gather(
                *[self.process_url(url) for url in chunk]
            )
            results.extend(chunk_results)

            processed = min(i + self.max_concurrent, len(urls))
            non_github = sum(1 for r in results if not r.get('skipped', False))
            logger.info(f"Progress: {processed}/{len(urls)} URLs ({non_github} websites)")
            logger.info(f"Success rate: {self.successful_fetches}/{self.total_processed} ({self.successful_fetches/self.total_processed*100:.1f}%)")

        enriched_df = pd.DataFrame(results)
        output_path = f'enriched_websites_{datetime.now():%Y%m%d_%H%M%S}.csv'
        enriched_df.to_csv(output_path, index=False)
        logger.info(f"Results saved to {output_path}")

        return enriched_df


async def main():
    async with WebsiteEnricher() as enricher:
        df = await enricher.process_urls('data/projects.csv')
        logger.info("\nFinal Summary:")
        logger.info(f"Total URLs processed: {len(df)}")
        logger.info(f"GitHub repositories skipped: {df['skipped'].sum()}")
        logger.info(f"Websites processed: {(~df['skipped']).sum()}")
        logger.info(f"Successful fetches: {enricher.successful_fetches}")
        logger.info(f"Failed fetches: {enricher.failed_fetches}")
        logger.info(f"Success rate: {enricher.successful_fetches/enricher.total_processed*100:.1f}%")


if __name__ == "__main__":
    asyncio.run(main())
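
For context, a minimal sketch of driving the enricher from another script rather than via main(). It assumes the repository root is on the import path and that data/projects.csv has a url column, since that is the column process_urls reads; the max_concurrent value and the printed columns are illustrative.

import asyncio

from data.scrape_links_v2 import WebsiteEnricher  # assumed import path; adjust if data/ is not importable

async def run():
    # Lower concurrency than the default of 10 to be gentler on the target sites.
    async with WebsiteEnricher(max_concurrent=5) as enricher:
        enriched = await enricher.process_urls('data/projects.csv')
        # 'url', 'type' and 'skipped' are present in every result row.
        print(enriched[['url', 'type', 'skipped']].head())

asyncio.run(run())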
