3535 r"\bHFG\d+\b" ,
3636]
3737
# Browser-like HTTP request headers used for outbound fetches, so that
# feed/article servers do not reject requests from a bare Python client.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    # Standard browser Accept header: prefer HTML/XHTML, fall back to XML, then anything.
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
3847def extract_entities_and_threats (title : str ) -> dict :
3948 """Extract and clean entities and threats from title using NER model."""
4049 ner_pipe = get_ner_pipeline ()
@@ -114,7 +123,7 @@ def start_scheduler():
114123 """
115124 scheduler = BackgroundScheduler (timezone = str (tzlocal .get_localzone ()))
116125
117- scheduler .add_job (main_watch , 'cron' , day_of_week = 'mon-sun' , minute = '*/5 ' , id = 'main_watch_job' ,
126+    scheduler.add_job(main_watch, 'cron', day_of_week='mon-sun', minute='*/30', id='main_watch_job',
118127 max_instances = 10 , replace_existing = True )
119128
120129 scheduler .add_job (cleanup , 'cron' , day_of_week = 'mon-sun' , hour = 8 , minute = 0 , id = 'day_clean' , replace_existing = True )
@@ -207,7 +216,7 @@ def load_feeds():
207216
208217def fetch_last_posts (nb_max_post ):
209218 """
210- Fetch the nb last posts for each feed.
219+    Fetch the last nb_max_post posts from each feed (non-Bluesky).
211220
212221 :param nb_max_post: The deepness of the search on each feed.
213222 """
@@ -216,6 +225,7 @@ def fetch_last_posts(nb_max_post):
216225 posts = dict ()
217226 tmp_posts = dict ()
218227 posts_published = dict ()
228+
219229 for url in rss_urls :
220230 try :
221231 feed_content = requests .get (url , timeout = 10 )
@@ -225,27 +235,27 @@ def fetch_last_posts(nb_max_post):
225235 logger .warning (f"Feed: { url } => Error: Status code: { feed_content .status_code } " )
226236 except requests .exceptions .RequestException as e :
227237 logger .error (str (e ))
238+
228239 for feed in feeds :
229240 count = 1
230- for post in feed .entries :
241+ for entry in feed .entries :
231242 if count <= nb_max_post :
232243 count += 1
233- if 'published_parsed' in post :
234- if post .published_parsed is not None :
235- dt = datetime .fromtimestamp (calendar .timegm (post .published_parsed ))
236- else :
244+ dt = "no-date"
245+ parsed = entry .get ('published_parsed' ) or entry .get ('updated_parsed' )
246+ if parsed :
247+ try :
248+ dt = datetime .fromtimestamp (calendar .timegm (parsed ))
249+ except Exception :
237250 dt = "no-date"
238- else :
239- dt = "no-date"
240- if 'link' in post :
241- if 'title' in post :
242- tmp_posts [str (post .title )] = post .link
243- posts_published [str (post .link )] = dt
244-
251+ link = entry .get ('link' ) or entry .get ('guid' ) or entry .get ('id' ) or None
252+ title_raw = entry .get ('title' ) or entry .get ('summary' ) or entry .get ('description' ) or (entry .get ('guid' ) if isinstance (entry .get ('guid' ), str ) else None ) or link or ""
253+                title_clean = re.sub(r'<[^>]+>', '', title_raw).replace(u'\xa0', u' ').strip()
254+ if link and title_clean :
255+ tmp_posts [title_clean ] = link
256+ posts_published [link ] = dt
245257 for title , url in tmp_posts .items ():
246- string = title .replace (u'\xa0 ' , u' ' )
247- posts [string ] = url
248- # print("title lower : " + string.lower() + " url: " + url)
258+ posts [title ] = url
249259
250260
251261def tokenize_count_urls ():
@@ -559,4 +569,4 @@ def send_threats_watcher_notifications(content):
559569 else :
560570 return
561571
562- send_app_specific_notifications ('threats_watcher' , context_data , subscribers )
572+ send_app_specific_notifications ('threats_watcher' , context_data , subscribers )
0 commit comments