Skip to content

Commit 731291b

Browse files
authored
v3.2.2 - Improve Threats Watcher with Bluesky integration and updated sources
- Added Bluesky as a new threat intelligence source for monitoring cybersecurity threats and discussions
- Adapted Threats Watcher logic to support Bluesky-specific data formats
- Introduced a custom User-Agent to improve source fetching reliability
- Cleaned and normalized existing sources to improve relevance and consistency
- Updated sources.csv with new Bluesky RSS feeds and refined confidence classification
- Improved overall data quality and robustness of the Threats Watcher module
1 parent f2395e0 commit 731291b

File tree

4 files changed

+182
-123
lines changed

4 files changed

+182
-123
lines changed

Watcher/Watcher/threats_watcher/core.py

Lines changed: 28 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,15 @@
3535
r"\bHFG\d+\b",
3636
]
3737

38+
HEADERS = {
39+
'User-Agent': (
40+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
41+
"AppleWebKit/537.36 (KHTML, like Gecko) "
42+
"Chrome/120.0.0.0 Safari/537.36"
43+
),
44+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
45+
}
46+
3847
def extract_entities_and_threats(title: str) -> dict:
3948
"""Extract and clean entities and threats from title using NER model."""
4049
ner_pipe = get_ner_pipeline()
@@ -114,7 +123,7 @@ def start_scheduler():
114123
"""
115124
scheduler = BackgroundScheduler(timezone=str(tzlocal.get_localzone()))
116125

117-
scheduler.add_job(main_watch, 'cron', day_of_week='mon-sun', minute='*/5', id='main_watch_job',
126+
scheduler.add_job(main_watch, 'cron', day_of_week='mon-sun', minute='*/30', id='main_watch_job',
118127
max_instances=10, replace_existing=True)
119128

120129
scheduler.add_job(cleanup, 'cron', day_of_week='mon-sun', hour=8, minute=0, id='day_clean', replace_existing=True)
@@ -207,7 +216,7 @@ def load_feeds():
207216

208217
def fetch_last_posts(nb_max_post):
209218
"""
210-
Fetch the nb last posts for each feed.
219+
Fetch the nb last posts for each feed (non-Bluesky) .
211220
212221
:param nb_max_post: The deepness of the search on each feed.
213222
"""
@@ -216,6 +225,7 @@ def fetch_last_posts(nb_max_post):
216225
posts = dict()
217226
tmp_posts = dict()
218227
posts_published = dict()
228+
219229
for url in rss_urls:
220230
try:
221231
feed_content = requests.get(url, timeout=10)
@@ -225,27 +235,27 @@ def fetch_last_posts(nb_max_post):
225235
logger.warning(f"Feed: {url} => Error: Status code: {feed_content.status_code}")
226236
except requests.exceptions.RequestException as e:
227237
logger.error(str(e))
238+
228239
for feed in feeds:
229240
count = 1
230-
for post in feed.entries:
241+
for entry in feed.entries:
231242
if count <= nb_max_post:
232243
count += 1
233-
if 'published_parsed' in post:
234-
if post.published_parsed is not None:
235-
dt = datetime.fromtimestamp(calendar.timegm(post.published_parsed))
236-
else:
244+
dt = "no-date"
245+
parsed = entry.get('published_parsed') or entry.get('updated_parsed')
246+
if parsed:
247+
try:
248+
dt = datetime.fromtimestamp(calendar.timegm(parsed))
249+
except Exception:
237250
dt = "no-date"
238-
else:
239-
dt = "no-date"
240-
if 'link' in post:
241-
if 'title' in post:
242-
tmp_posts[str(post.title)] = post.link
243-
posts_published[str(post.link)] = dt
244-
251+
link = entry.get('link') or entry.get('guid') or entry.get('id') or None
252+
title_raw = entry.get('title') or entry.get('summary') or entry.get('description') or (entry.get('guid') if isinstance(entry.get('guid'), str) else None) or link or ""
253+
title_clean = re.sub(r'<[^>]+>', '', title_raw).replace(u'\xa0', u' ').strip()
254+
if link and title_clean:
255+
tmp_posts[title_clean] = link
256+
posts_published[link] = dt
245257
for title, url in tmp_posts.items():
246-
string = title.replace(u'\xa0', u' ')
247-
posts[string] = url
248-
# print("title lower : " + string.lower() + " url: " + url)
258+
posts[title] = url
249259

250260

251261
def tokenize_count_urls():
@@ -559,4 +569,4 @@ def send_threats_watcher_notifications(content):
559569
else:
560570
return
561571

562-
send_app_specific_notifications('threats_watcher', context_data, subscribers)
572+
send_app_specific_notifications('threats_watcher', context_data, subscribers)

Watcher/Watcher/threats_watcher/datas/banned_words.csv

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -73,3 +73,33 @@ Google
7373
Cisco
7474
Linux
7575
Tools
76+
America
77+
Ukraine
78+
French
79+
France
80+
Research
81+
Black
82+
Korean
83+
Russia
84+
China
85+
Email
86+
Products
87+
Centre
88+
Agent
89+
Finland
90+
Hacktivist
91+
Office
92+
Linked
93+
Hackers
94+
North Korea
95+
Secure
96+
Information
97+
Iranian
98+
Japan
99+
Australia
100+
University
101+
Technologies
102+
Italian
103+
International
104+
Denmark
105+
European

0 commit comments

Comments
 (0)