broadfield-dev committed
Commit 4a45db6 · verified · 1 parent: 6d6a251

Update rss_processor.py

Files changed (1)
  1. rss_processor.py +49 -33
rss_processor.py CHANGED
@@ -8,8 +8,9 @@ from huggingface_hub import HfApi, login
 import shutil
 import rss_feeds
 from datetime import datetime
-import dateutil.parser  # For flexible date parsing
-import hashlib  # For generating unique hashes
+import dateutil.parser
+import hashlib
+import re  # For cleaning HTML and whitespace
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
@@ -38,6 +39,16 @@ vector_db = Chroma(
     collection_name=COLLECTION_NAME
 )
 
+def clean_text(text):
+    """Clean text by removing HTML tags and extra whitespace."""
+    if not text or not isinstance(text, str):
+        return ""
+    # Remove HTML tags
+    text = re.sub(r'<.*?>', '', text)
+    # Normalize whitespace (remove extra spaces, newlines, tabs)
+    text = ' '.join(text.split())
+    return text.strip().lower()
+
 def fetch_rss_feeds():
     articles = []
     seen_keys = set()
@@ -52,13 +63,18 @@ def fetch_rss_feeds():
         for entry in feed.entries:
             if article_count >= MAX_ARTICLES_PER_FEED:
                 break
-            title = entry.get("title", "No Title").strip().lower()
-            link = entry.get("link", "").strip().lower()
-            description = entry.get("summary", entry.get("description", "No Description")).strip()
+            title = entry.get("title", "No Title")
+            link = entry.get("link", "")
+            description = entry.get("summary", entry.get("description", ""))
+
+            # Clean and normalize all text fields
+            title = clean_text(title)
+            link = clean_text(link)
+            description = clean_text(description)
 
             # Try multiple date fields and parse flexibly
             published = "Unknown Date"
-            for date_field in ["published", "updated", "created"]:
+            for date_field in ["published", "updated", "created", "pubDate"]:  # Added "pubDate" for broader compatibility
                 if date_field in entry:
                     try:
                         parsed_date = dateutil.parser.parse(entry[date_field])
@@ -68,23 +84,23 @@ def fetch_rss_feeds():
                         logger.debug(f"Failed to parse {date_field} '{entry[date_field]}': {e}")
                         continue
 
-            # Use a robust key for deduplication, including a hash of the description
-            description_hash = hashlib.md5(description.encode('utf-8')).hexdigest()
+            # Use a robust key for deduplication, including cleaned fields
+            description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()  # Switched to SHA256 for better uniqueness
             key = f"{title}|{link}|{published}|{description_hash}"
             if key not in seen_keys:
                 seen_keys.add(key)
                 # Try multiple image sources
                 image = "svg"  # Default fallback
                 for img_source in [
-                    lambda e: e.get("media_content", [{}])[0].get("url"),
-                    lambda e: e.get("media_thumbnail", [{}])[0].get("url"),
-                    lambda e: e.get("enclosure", {}).get("url"),
-                    lambda e: next((lnk.get("href") for lnk in e.get("links", []) if lnk.get("type", "").startswith("image")), None),
+                    lambda e: clean_text(e.get("media_content", [{}])[0].get("url")) if e.get("media_content") else "",
+                    lambda e: clean_text(e.get("media_thumbnail", [{}])[0].get("url")) if e.get("media_thumbnail") else "",
+                    lambda e: clean_text(e.get("enclosure", {}).get("url")) if e.get("enclosure") else "",
+                    lambda e: clean_text(next((lnk.get("href") for lnk in e.get("links", []) if lnk.get("type", "").startswith("image")), "")),
                 ]:
                     try:
                         img = img_source(entry)
-                        if img and isinstance(img, str) and img.strip():
-                            image = img.strip()
+                        if img and img.strip():
+                            image = img
                             break
                     except (IndexError, AttributeError, TypeError):
                         continue
@@ -106,33 +122,33 @@ def fetch_rss_feeds():
     return articles
 
 def categorize_feed(url):
-    if "nature" in url or "science.org" in url or "arxiv.org" in url or "plos.org" in url or "annualreviews.org" in url or "journals.uchicago.edu" in url or "jneurosci.org" in url or "cell.com" in url or "nejm.org" in url or "lancet.com" in url:
+    if "nature" in url.lower() or "science.org" in url.lower() or "arxiv.org" in url.lower() or "plos.org" in url.lower() or "annualreviews.org" in url.lower() or "journals.uchicago.edu" in url.lower() or "jneurosci.org" in url.lower() or "cell.com" in url.lower() or "nejm.org" in url.lower() or "lancet.com" in url.lower():
         return "Academic Papers"
-    elif "reuters.com/business" in url or "bloomberg.com" in url or "ft.com" in url or "marketwatch.com" in url or "cnbc.com" in url or "foxbusiness.com" in url or "wsj.com" in url or "bworldonline.com" in url or "economist.com" in url or "forbes.com" in url:
+    elif "reuters.com/business" in url.lower() or "bloomberg.com" in url.lower() or "ft.com" in url.lower() or "marketwatch.com" in url.lower() or "cnbc.com" in url.lower() or "foxbusiness.com" in url.lower() or "wsj.com" in url.lower() or "bworldonline.com" in url.lower() or "economist.com" in url.lower() or "forbes.com" in url.lower():
         return "Business"
-    elif "investing.com" in url or "cnbc.com/market" in url or "marketwatch.com/market" in url or "fool.co.uk" in url or "zacks.com" in url or "seekingalpha.com" in url or "barrons.com" in url or "yahoofinance.com" in url:
+    elif "investing.com" in url.lower() or "cnbc.com/market" in url.lower() or "marketwatch.com/market" in url.lower() or "fool.co.uk" in url.lower() or "zacks.com" in url.lower() or "seekingalpha.com" in url.lower() or "barrons.com" in url.lower() or "yahoofinance.com" in url.lower():
         return "Stocks & Markets"
-    elif "whitehouse.gov" in url or "state.gov" in url or "commerce.gov" in url or "transportation.gov" in url or "ed.gov" in url or "dol.gov" in url or "justice.gov" in url or "federalreserve.gov" in url or "occ.gov" in url or "sec.gov" in url or "bls.gov" in url or "usda.gov" in url or "gao.gov" in url or "cbo.gov" in url or "fema.gov" in url or "defense.gov" in url or "hhs.gov" in url or "energy.gov" in url or "interior.gov" in url:
+    elif "whitehouse.gov" in url.lower() or "state.gov" in url.lower() or "commerce.gov" in url.lower() or "transportation.gov" in url.lower() or "ed.gov" in url.lower() or "dol.gov" in url.lower() or "justice.gov" in url.lower() or "federalreserve.gov" in url.lower() or "occ.gov" in url.lower() or "sec.gov" in url.lower() or "bls.gov" in url.lower() or "usda.gov" in url.lower() or "gao.gov" in url.lower() or "cbo.gov" in url.lower() or "fema.gov" in url.lower() or "defense.gov" in url.lower() or "hhs.gov" in url.lower() or "energy.gov" in url.lower() or "interior.gov" in url.lower():
         return "Federal Government"
-    elif "weather.gov" in url or "metoffice.gov.uk" in url or "accuweather.com" in url or "weatherunderground.com" in url or "noaa.gov" in url or "wunderground.com" in url or "climate.gov" in url or "ecmwf.int" in url or "bom.gov.au" in url:
+    elif "weather.gov" in url.lower() or "metoffice.gov.uk" in url.lower() or "accuweather.com" in url.lower() or "weatherunderground.com" in url.lower() or "noaa.gov" in url.lower() or "wunderground.com" in url.lower() or "climate.gov" in url.lower() or "ecmwf.int" in url.lower() or "bom.gov.au" in url.lower():
         return "Weather"
-    elif "data.worldbank.org" in url or "imf.org" in url or "un.org" in url or "oecd.org" in url or "statista.com" in url or "kff.org" in url or "who.int" in url or "cdc.gov" in url or "bea.gov" in url or "census.gov" in url or "fdic.gov" in url:
+    elif "data.worldbank.org" in url.lower() or "imf.org" in url.lower() or "un.org" in url.lower() or "oecd.org" in url.lower() or "statista.com" in url.lower() or "kff.org" in url.lower() or "who.int" in url.lower() or "cdc.gov" in url.lower() or "bea.gov" in url.lower() or "census.gov" in url.lower() or "fdic.gov" in url.lower():
         return "Data & Statistics"
-    elif "nasa" in url or "spaceweatherlive" in url or "space" in url or "universetoday" in url or "skyandtelescope" in url or "esa" in url:
+    elif "nasa" in url.lower() or "spaceweatherlive" in url.lower() or "space" in url.lower() or "universetoday" in url.lower() or "skyandtelescope" in url.lower() or "esa" in url.lower():
        return "Space"
-    elif "sciencedaily" in url or "quantamagazine" in url or "smithsonianmag" in url or "popsci" in url or "discovermagazine" in url or "scientificamerican" in url or "newscientist" in url or "livescience" in url or "atlasobscura" in url:
+    elif "sciencedaily" in url.lower() or "quantamagazine" in url.lower() or "smithsonianmag" in url.lower() or "popsci" in url.lower() or "discovermagazine" in url.lower() or "scientificamerican" in url.lower() or "newscientist" in url.lower() or "livescience" in url.lower() or "atlasobscura" in url.lower():
        return "Science"
-    elif "wired" in url or "techcrunch" in url or "arstechnica" in url or "gizmodo" in url or "theverge" in url:
+    elif "wired" in url.lower() or "techcrunch" in url.lower() or "arstechnica" in url.lower() or "gizmodo" in url.lower() or "theverge" in url.lower():
        return "Tech"
-    elif "horoscope" in url or "astrostyle" in url:
+    elif "horoscope" in url.lower() or "astrostyle" in url.lower():
        return "Astrology"
-    elif "cnn_allpolitics" in url or "bbci.co.uk/news/politics" in url or "reuters.com/arc/outboundfeeds/newsletter-politics" in url or "politico.com/rss/politics" in url or "thehill" in url:
+    elif "cnn_allpolitics" in url.lower() or "bbci.co.uk/news/politics" in url.lower() or "reuters.com/arc/outboundfeeds/newsletter-politics" in url.lower() or "politico.com/rss/politics" in url.lower() or "thehill" in url.lower():
        return "Politics"
-    elif "weather" in url or "swpc.noaa.gov" in url or "foxweather" in url:
+    elif "weather" in url.lower() or "swpc.noaa.gov" in url.lower() or "foxweather" in url.lower():
        return "Earth Weather"
-    elif "vogue" in url:
+    elif "vogue" in url.lower():
        return "Lifestyle"
-    elif "phys.org" in url or "aps.org" in url or "physicsworld" in url:
+    elif "phys.org" in url.lower() or "aps.org" in url.lower() or "physicsworld" in url.lower():
        return "Physics"
     return "Uncategorized"
 
@@ -141,12 +157,12 @@ def process_and_store_articles(articles):
     existing_ids = set(vector_db.get()["ids"])  # Get existing document IDs to avoid duplicates
     for article in articles:
        try:
-            # Create a unique ID based on normalized fields, including description hash
-            title = article["title"].lower().strip()
-            link = article["link"].lower().strip()
-            description = article["description"].strip()
+            # Clean and normalize all fields
+            title = clean_text(article["title"])
+            link = clean_text(article["link"])
+            description = clean_text(article["description"])
             published = article["published"]
-            description_hash = hashlib.md5(description.encode('utf-8')).hexdigest()
+            description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
             doc_id = f"{title}|{link}|{published}|{description_hash}"
             if doc_id in existing_ids:
                 logger.debug(f"Skipping duplicate in DB: {doc_id}")
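For reference, a minimal sketch of how the clean_text helper and the SHA-256 deduplication key introduced in this commit behave on a feedparser-style entry. The sample entry and its field values below are hypothetical illustrations, not part of the commit:

import hashlib
import re

def clean_text(text):
    # Same logic as the helper added in this commit: strip HTML tags,
    # collapse whitespace, and lowercase so dedup keys are stable.
    if not text or not isinstance(text, str):
        return ""
    text = re.sub(r'<.*?>', '', text)
    return ' '.join(text.split()).strip().lower()

entry = {  # hypothetical feed entry
    "title": " <b>Solar Flare</b> Watch ",
    "link": "https://example.com/solar-flare",
    "summary": "<p>Minor storm\nexpected</p>",
}
title = clean_text(entry["title"])          # -> "solar flare watch"
link = clean_text(entry["link"])            # -> "https://example.com/solar-flare"
description = clean_text(entry["summary"])  # -> "minor storm expected"
description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
key = f"{title}|{link}|Unknown Date|{description_hash}"  # dedup key as built in fetch_rss_feeds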