broadfield-dev committed
Commit a69bc3b · verified · 1 Parent(s): 47649d8

Update rss_processor.py

Files changed (1)
  1. rss_processor.py +17 -13
rss_processor.py CHANGED
@@ -7,16 +7,18 @@ import logging
 from huggingface_hub import HfApi, login
 import shutil
 import rss_feeds
+from datetime import datetime
+import dateutil.parser # For flexible date parsing

 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 # Constants
-MAX_ARTICLES_PER_FEED = 10
+MAX_ARTICLES_PER_FEED = 5
 LOCAL_DB_DIR = "chroma_db"
 RSS_FEEDS = rss_feeds.RSS_FEEDS
-COLLECTION_NAME = "news_articles" # Explicitly name the collection
+COLLECTION_NAME = "news_articles"

 HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
 REPO_ID = "broadfield-dev/news-rag-db"
@@ -35,9 +37,6 @@ vector_db = Chroma(
     collection_name=COLLECTION_NAME
 )

-from datetime import datetime
-import dateutil.parser # Add this dependency: pip install python-dateutil
-
 def fetch_rss_feeds():
     articles = []
     seen_keys = set()
@@ -52,8 +51,8 @@ def fetch_rss_feeds():
             for entry in feed.entries:
                 if article_count >= MAX_ARTICLES_PER_FEED:
                     break
-                title = entry.get("title", "No Title").strip()
-                link = entry.get("link", "").strip()
+                title = entry.get("title", "No Title").strip().lower() # Normalize case and whitespace
+                link = entry.get("link", "").strip().lower()
                 description = entry.get("summary", entry.get("description", "No Description")).strip()

                 # Try multiple date fields and parse flexibly
@@ -68,6 +67,7 @@ def fetch_rss_feeds():
                             logger.debug(f"Failed to parse {date_field} '{entry[date_field]}': {e}")
                             continue

+                # Use a robust key for deduplication
                 key = f"{title}|{link}|{published}"
                 if key not in seen_keys:
                     seen_keys.add(key)
@@ -81,8 +81,8 @@ def fetch_rss_feeds():
                     ]:
                         try:
                             img = img_source(entry)
-                            if img:
-                                image = img
+                            if img and isinstance(img, str) and img.strip():
+                                image = img.strip()
                                 break
                         except (IndexError, AttributeError, TypeError):
                             continue
@@ -96,12 +96,15 @@ def fetch_rss_feeds():
                         "image": image,
                     })
                     article_count += 1
+                else:
+                    logger.debug(f"Duplicate article skipped in feed {feed_url}: {key}")
             except Exception as e:
                 logger.error(f"Error fetching {feed_url}: {e}")
     logger.info(f"Total articles fetched: {len(articles)}")
     return articles

 def categorize_feed(url):
+    # (Unchanged, keeping your existing categorization logic)
     if "nature" in url or "science.org" in url or "arxiv.org" in url or "plos.org" in url or "annualreviews.org" in url or "journals.uchicago.edu" in url or "jneurosci.org" in url or "cell.com" in url or "nejm.org" in url or "lancet.com" in url:
         return "Academic Papers"
     elif "reuters.com/business" in url or "bloomberg.com" in url or "ft.com" in url or "marketwatch.com" in url or "cnbc.com" in url or "foxbusiness.com" in url or "wsj.com" in url or "bworldonline.com" in url or "economist.com" in url or "forbes.com" in url:
@@ -131,16 +134,17 @@ def categorize_feed(url):
     elif "phys.org" in url or "aps.org" in url or "physicsworld" in url:
         return "Physics"
     return "Uncategorized"
-
+
 def process_and_store_articles(articles):
     documents = []
     existing_ids = set(vector_db.get()["ids"]) # Get existing document IDs to avoid duplicates
     for article in articles:
         try:
-            # Create a unique ID for deduplication
-            doc_id = f"{article['title']}|{article['link']}|{article['published']}"
+            # Create a unique ID based on normalized fields
+            doc_id = f"{article['title'].lower()}|{article['link'].lower()}|{article['published']}"
             if doc_id in existing_ids:
-                continue # Skip if already in DB
+                logger.debug(f"Skipping duplicate in DB: {doc_id}")
+                continue
             metadata = {
                 "title": article["title"],
                 "link": article["link"],