broadfield-dev committed
Commit a058939 · verified · 1 parent: 97a599a

Update rss_processor.py

Files changed (1):
  1. rss_processor.py +39 -38
rss_processor.py CHANGED
@@ -1,3 +1,4 @@
+# rss_processor.py
 import os
 import feedparser
 from langchain.vectorstores import Chroma
@@ -10,18 +11,17 @@ import rss_feeds
 from datetime import datetime
 import dateutil.parser
 import hashlib
-import re # For cleaning HTML and whitespace
+import re
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 # Constants
-MAX_ARTICLES_PER_FEED = 10
+MAX_ARTICLES_PER_FEED = 1000
 LOCAL_DB_DIR = "chroma_db"
 RSS_FEEDS = rss_feeds.RSS_FEEDS
 COLLECTION_NAME = "news_articles"
-
 HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
 REPO_ID = "broadfield-dev/news-rag-db"
 
@@ -43,9 +43,7 @@ def clean_text(text):
     """Clean text by removing HTML tags and extra whitespace."""
     if not text or not isinstance(text, str):
         return ""
-    # Remove HTML tags
     text = re.sub(r'<.*?>', '', text)
-    # Normalize whitespace (remove extra spaces, newlines, tabs)
     text = ' '.join(text.split())
     return text.strip().lower()
 
@@ -67,14 +65,12 @@ def fetch_rss_feeds():
                 link = entry.get("link", "")
                 description = entry.get("summary", entry.get("description", ""))
 
-                # Clean and normalize all text fields
                 title = clean_text(title)
                 link = clean_text(link)
                 description = clean_text(description)
 
-                # Try multiple date fields and parse flexibly
                 published = "Unknown Date"
-                for date_field in ["published", "updated", "created", "pubDate"]: # Added "pubDate" for broader compatibility
+                for date_field in ["published", "updated", "created", "pubDate"]:
                     if date_field in entry:
                         try:
                             parsed_date = dateutil.parser.parse(entry[date_field])
@@ -84,13 +80,11 @@ def fetch_rss_feeds():
                             logger.debug(f"Failed to parse {date_field} '{entry[date_field]}': {e}")
                             continue
 
-                # Use a robust key for deduplication, including cleaned fields
-                description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest() # Switched to SHA256 for better uniqueness
+                description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
                 key = f"{title}|{link}|{published}|{description_hash}"
                 if key not in seen_keys:
                     seen_keys.add(key)
-                    # Try multiple image sources
-                    image = "svg" # Default fallback
+                    image = "svg"
                     for img_source in [
                         lambda e: clean_text(e.get("media_content", [{}])[0].get("url")) if e.get("media_content") else "",
                         lambda e: clean_text(e.get("media_thumbnail", [{}])[0].get("url")) if e.get("media_thumbnail") else "",
@@ -114,50 +108,58 @@ def fetch_rss_feeds():
                         "image": image,
                     })
                     article_count += 1
-                else:
-                    logger.debug(f"Duplicate article skipped in feed {feed_url}: {key}")
         except Exception as e:
             logger.error(f"Error fetching {feed_url}: {e}")
     logger.info(f"Total articles fetched: {len(articles)}")
     return articles
 
 def categorize_feed(url):
-    if "nature" in url.lower() or "science.org" in url.lower() or "arxiv.org" in url.lower() or "plos.org" in url.lower() or "annualreviews.org" in url.lower() or "journals.uchicago.edu" in url.lower() or "jneurosci.org" in url.lower() or "cell.com" in url.lower() or "nejm.org" in url.lower() or "lancet.com" in url.lower():
+    """Categorize an RSS feed based on its URL."""
+    if not url or not isinstance(url, str):
+        logger.warning(f"Invalid URL provided for categorization: {url}")
+        return "Uncategorized"
+
+    url = url.lower().strip() # Normalize the URL
+
+    logger.debug(f"Categorizing URL: {url}") # Add debugging for visibility
+
+    if any(keyword in url for keyword in ["nature", "science.org", "arxiv.org", "plos.org", "annualreviews.org", "journals.uchicago.edu", "jneurosci.org", "cell.com", "nejm.org", "lancet.com"]):
         return "Academic Papers"
-    elif "reuters.com/business" in url.lower() or "bloomberg.com" in url.lower() or "ft.com" in url.lower() or "marketwatch.com" in url.lower() or "cnbc.com" in url.lower() or "foxbusiness.com" in url.lower() or "wsj.com" in url.lower() or "bworldonline.com" in url.lower() or "economist.com" in url.lower() or "forbes.com" in url.lower():
+    elif any(keyword in url for keyword in ["reuters.com/business", "bloomberg.com", "ft.com", "marketwatch.com", "cnbc.com", "foxbusiness.com", "wsj.com", "bworldonline.com", "economist.com", "forbes.com"]):
        return "Business"
-    elif "investing.com" in url.lower() or "cnbc.com/market" in url.lower() or "marketwatch.com/market" in url.lower() or "fool.co.uk" in url.lower() or "zacks.com" in url.lower() or "seekingalpha.com" in url.lower() or "barrons.com" in url.lower() or "yahoofinance.com" in url.lower():
+    elif any(keyword in url for keyword in ["investing.com", "cnbc.com/market", "marketwatch.com/market", "fool.co.uk", "zacks.com", "seekingalpha.com", "barrons.com", "yahoofinance.com"]):
        return "Stocks & Markets"
-    elif "whitehouse.gov" in url.lower() or "state.gov" in url.lower() or "commerce.gov" in url.lower() or "transportation.gov" in url.lower() or "ed.gov" in url.lower() or "dol.gov" in url.lower() or "justice.gov" in url.lower() or "federalreserve.gov" in url.lower() or "occ.gov" in url.lower() or "sec.gov" in url.lower() or "bls.gov" in url.lower() or "usda.gov" in url.lower() or "gao.gov" in url.lower() or "cbo.gov" in url.lower() or "fema.gov" in url.lower() or "defense.gov" in url.lower() or "hhs.gov" in url.lower() or "energy.gov" in url.lower() or "interior.gov" in url.lower():
+    elif any(keyword in url for keyword in ["whitehouse.gov", "state.gov", "commerce.gov", "transportation.gov", "ed.gov", "dol.gov", "justice.gov", "federalreserve.gov", "occ.gov", "sec.gov", "bls.gov", "usda.gov", "gao.gov", "cbo.gov", "fema.gov", "defense.gov", "hhs.gov", "energy.gov", "interior.gov"]):
        return "Federal Government"
-    elif "weather.gov" in url.lower() or "metoffice.gov.uk" in url.lower() or "accuweather.com" in url.lower() or "weatherunderground.com" in url.lower() or "noaa.gov" in url.lower() or "wunderground.com" in url.lower() or "climate.gov" in url.lower() or "ecmwf.int" in url.lower() or "bom.gov.au" in url.lower():
+    elif any(keyword in url for keyword in ["weather.gov", "metoffice.gov.uk", "accuweather.com", "weatherunderground.com", "noaa.gov", "wunderground.com", "climate.gov", "ecmwf.int", "bom.gov.au"]):
        return "Weather"
-    elif "data.worldbank.org" in url.lower() or "imf.org" in url.lower() or "un.org" in url.lower() or "oecd.org" in url.lower() or "statista.com" in url.lower() or "kff.org" in url.lower() or "who.int" in url.lower() or "cdc.gov" in url.lower() or "bea.gov" in url.lower() or "census.gov" in url.lower() or "fdic.gov" in url.lower():
+    elif any(keyword in url for keyword in ["data.worldbank.org", "imf.org", "un.org", "oecd.org", "statista.com", "kff.org", "who.int", "cdc.gov", "bea.gov", "census.gov", "fdic.gov"]):
        return "Data & Statistics"
-    elif "nasa" in url.lower() or "spaceweatherlive" in url.lower() or "space" in url.lower() or "universetoday" in url.lower() or "skyandtelescope" in url.lower() or "esa" in url.lower():
+    elif any(keyword in url for keyword in ["nasa", "spaceweatherlive", "space", "universetoday", "skyandtelescope", "esa"]):
        return "Space"
-    elif "sciencedaily" in url.lower() or "quantamagazine" in url.lower() or "smithsonianmag" in url.lower() or "popsci" in url.lower() or "discovermagazine" in url.lower() or "scientificamerican" in url.lower() or "newscientist" in url.lower() or "livescience" in url.lower() or "atlasobscura" in url.lower():
+    elif any(keyword in url for keyword in ["sciencedaily", "quantamagazine", "smithsonianmag", "popsci", "discovermagazine", "scientificamerican", "newscientist", "livescience", "atlasobscura"]):
        return "Science"
-    elif "wired" in url.lower() or "techcrunch" in url.lower() or "arstechnica" in url.lower() or "gizmodo" in url.lower() or "theverge" in url.lower():
+    elif any(keyword in url for keyword in ["wired", "techcrunch", "arstechnica", "gizmodo", "theverge"]):
        return "Tech"
-    elif "horoscope" in url.lower() or "astrostyle" in url.lower():
+    elif any(keyword in url for keyword in ["horoscope", "astrostyle"]):
        return "Astrology"
-    elif "cnn_allpolitics" in url.lower() or "bbci.co.uk/news/politics" in url.lower() or "reuters.com/arc/outboundfeeds/newsletter-politics" in url.lower() or "politico.com/rss/politics" in url.lower() or "thehill" in url.lower():
+    elif any(keyword in url for keyword in ["cnn_allpolitics", "bbci.co.uk/news/politics", "reuters.com/arc/outboundfeeds/newsletter-politics", "politico.com/rss/politics", "thehill"]):
        return "Politics"
-    elif "weather" in url.lower() or "swpc.noaa.gov" in url.lower() or "foxweather" in url.lower():
+    elif any(keyword in url for keyword in ["weather", "swpc.noaa.gov", "foxweather"]):
        return "Earth Weather"
-    elif "vogue" in url.lower():
+    elif "vogue" in url:
        return "Lifestyle"
-    elif "phys.org" in url.lower() or "aps.org" in url.lower() or "physicsworld" in url.lower():
+    elif any(keyword in url for keyword in ["phys.org", "aps.org", "physicsworld"]):
        return "Physics"
-    return "Uncategorized"
+    else:
+        logger.warning(f"No matching category found for URL: {url}")
+        return "Uncategorized"
 
 def process_and_store_articles(articles):
     documents = []
-    existing_ids = set(vector_db.get()["ids"]) # Get existing document IDs to avoid duplicates
+    existing_ids = set(vector_db.get()["ids"]) # Load existing IDs once
     for article in articles:
         try:
-            # Clean and normalize all fields
             title = clean_text(article["title"])
             link = clean_text(article["link"])
             description = clean_text(article["description"])
@@ -177,29 +179,28 @@ def process_and_store_articles(articles):
             }
             doc = Document(page_content=description, metadata=metadata, id=doc_id)
             documents.append(doc)
+            existing_ids.add(doc_id) # Update in-memory set to avoid duplicates within this batch
         except Exception as e:
             logger.error(f"Error processing article {article['title']}: {e}")
 
     if documents:
         try:
             vector_db.add_documents(documents)
-            vector_db.persist() # Explicitly persist changes
-            logger.info(f"Added {len(documents)} new articles to DB")
+            vector_db.persist()
+            logger.info(f"Added {len(documents)} new articles to DB. Total documents: {len(vector_db.get()['ids'])}")
         except Exception as e:
             logger.error(f"Error storing articles: {e}")
 
 def download_from_hf_hub():
-    # Only download if the local DB doesn’t exist (initial setup)
     if not os.path.exists(LOCAL_DB_DIR):
         try:
             hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
             logger.info(f"Downloading Chroma DB from {REPO_ID}...")
-            hf_api.download_repo(repo_id=REPO_ID, repo_type="dataset", local_dir=LOCAL_DB_DIR, token=HF_API_TOKEN)
+            hf_api.hf_hub_download(repo_id=REPO_ID, filename="chroma_db", local_dir=LOCAL_DB_DIR, repo_type="dataset", token=HF_API_TOKEN)
         except Exception as e:
             logger.error(f"Error downloading from Hugging Face Hub: {e}")
-            raise
     else:
-        logger.info("Local Chroma DB already exists, skipping download.")
+        logger.info("Local Chroma DB exists, loading existing data.")
 
 def upload_to_hf_hub():
     if os.path.exists(LOCAL_DB_DIR):
@@ -219,9 +220,9 @@ def upload_to_hf_hub():
         logger.info(f"Database uploaded to: {REPO_ID}")
     except Exception as e:
         logger.error(f"Error uploading to Hugging Face Hub: {e}")
-        raise
 
 if __name__ == "__main__":
+    download_from_hf_hub() # Ensure DB is initialized
     articles = fetch_rss_feeds()
     process_and_store_articles(articles)
     upload_to_hf_hub()
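
Note: the hunks above use `vector_db` and `hf_api` without showing how they are created; that setup lives in an unchanged part of rss_processor.py. Below is a minimal sketch of what such initialization typically looks like with the same LangChain Chroma wrapper the file imports, assuming a HuggingFace sentence-transformer embedding model (the model name here is illustrative, not taken from this commit).

# Sketch only: the real initialization is in the unchanged portion of rss_processor.py.
import logging

from huggingface_hub import HfApi
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings  # embedding choice is an assumption

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

LOCAL_DB_DIR = "chroma_db"
COLLECTION_NAME = "news_articles"

# Hypothetical embedding model; the actual model used by the repo is not shown in this diff.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Persistent Chroma collection that process_and_store_articles() writes into.
vector_db = Chroma(
    persist_directory=LOCAL_DB_DIR,
    embedding_function=embedding_model,
    collection_name=COLLECTION_NAME,
)

# Hub client used by download_from_hf_hub() / upload_to_hf_hub().
hf_api = HfApi()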
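For reference, the deduplication key in fetch_rss_feeds() hashes the cleaned description with SHA-256 and combines it with title, link, and date, so two entries that share a headline and link but differ in body text are kept as separate articles. A standalone sketch of the same logic (the dedup_key helper is illustrative, not part of the module):

import hashlib
import re

def clean_text(text):
    """Same cleaning as in rss_processor.py: strip tags, collapse whitespace, lowercase."""
    if not text or not isinstance(text, str):
        return ""
    text = re.sub(r'<.*?>', '', text)
    text = ' '.join(text.split())
    return text.strip().lower()

def dedup_key(title, link, published, description):
    """Build the key used in fetch_rss_feeds() to skip duplicate feed entries."""
    description_hash = hashlib.sha256(clean_text(description).encode('utf-8')).hexdigest()
    return f"{clean_text(title)}|{clean_text(link)}|{published}|{description_hash}"

a = dedup_key("Title", "https://example.com/a", "2024-01-01 00:00:00", "<p>First summary</p>")
b = dedup_key("Title", "https://example.com/a", "2024-01-01 00:00:00", "<p>Updated summary</p>")
assert a != b  # different descriptions hash differently, so both entries are kept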
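The refactored categorize_feed() tests the normalized URL against keyword lists with any(), which matches the same substrings as the old chained "or" conditions. A minimal reproduction keeping only the first branch, with example feed URLs that are not from the repo:

def categorize_feed(url):
    """Reduced copy of the committed function, keeping only one branch for illustration."""
    if not url or not isinstance(url, str):
        return "Uncategorized"
    url = url.lower().strip()
    if any(keyword in url for keyword in ["nature", "science.org", "arxiv.org"]):
        return "Academic Papers"
    return "Uncategorized"

print(categorize_feed("https://www.nature.com/nature.rss"))  # Academic Papers
print(categorize_feed("https://example.com/feed.xml"))       # Uncategorized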