broadfield-dev committed
Commit a13e6db · verified · 1 Parent(s): 00d6940

Update rss_processor.py

Files changed (1)
  rss_processor.py +13 -10
rss_processor.py CHANGED
@@ -9,7 +9,7 @@ import shutil
 import rss_feeds
 from datetime import datetime
 import dateutil.parser # For flexible date parsing
-import hashlib
+import hashlib # For generating unique hashes

 # Setup logging
 logging.basicConfig(level=logging.INFO)
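
hashlib is used here because Python's built-in hash() is randomized per interpreter process for str and bytes (hash randomization), so it cannot serve as a stable deduplication key across runs, whereas an MD5 hexdigest of the same input is always identical. A small illustrative sketch of the difference, not part of the module:

import hashlib

text = "Full article body."

# Stable across processes and machines: same hexdigest every run.
print(hashlib.md5(text.encode("utf-8")).hexdigest())

# Not stable: hash() of str/bytes changes between interpreter runs unless PYTHONHASHSEED is fixed.
print(hash(text.encode("utf-8")))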
@@ -52,7 +52,7 @@ def fetch_rss_feeds():
 for entry in feed.entries:
 if article_count >= MAX_ARTICLES_PER_FEED:
 break
-title = entry.get("title", "No Title").strip().lower() # Normalize case and whitespace
+title = entry.get("title", "No Title").strip().lower()
 link = entry.get("link", "").strip().lower()
 description = entry.get("summary", entry.get("description", "No Description")).strip()

@@ -68,10 +68,9 @@ def fetch_rss_feeds():
 logger.debug(f"Failed to parse {date_field} '{entry[date_field]}': {e}")
 continue

-# Use a robust key for deduplication
-#key = f"{title}|{link}|{published}"
-key = f"{title.lower()}|{link.lower()}|{published}|{hash(description.encode('utf-8'))}"
-
+# Use a robust key for deduplication, including a hash of the description
+description_hash = hashlib.md5(description.encode('utf-8')).hexdigest()
+key = f"{title}|{link}|{published}|{description_hash}"
 if key not in seen_keys:
 seen_keys.add(key)
 # Try multiple image sources
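
The new deduplication key combines the normalized title and link, the parsed publication date, and an MD5 digest of the description, so two entries only collapse when all four match; unlike the previous hash() call, the MD5 hexdigest is deterministic across runs. A minimal, self-contained sketch of the same idea (the make_key helper and the sample entries are hypothetical, not part of the module):

import hashlib

def make_key(title: str, link: str, published: str, description: str) -> str:
    # Dedup key from normalized fields plus an MD5 digest of the description.
    title = title.strip().lower()
    link = link.strip().lower()
    digest = hashlib.md5(description.strip().encode("utf-8")).hexdigest()
    return f"{title}|{link}|{published}|{digest}"

# Hypothetical sample entries: same title/link/date, differing body text.
entries = [
    {"title": "AI News ", "link": "https://example.com/a", "published": "2024-01-01", "summary": "Short teaser."},
    {"title": "ai news",  "link": "https://example.com/a", "published": "2024-01-01", "summary": "Short teaser."},
    {"title": "AI News",  "link": "https://example.com/a", "published": "2024-01-01", "summary": "Full article body."},
]

seen_keys, unique = set(), []
for e in entries:
    key = make_key(e["title"], e["link"], e["published"], e["summary"])
    if key not in seen_keys:
        seen_keys.add(key)
        unique.append(e)

print(len(unique))  # 2 -- the first two collapse; the third survives because its description differs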
@@ -107,7 +106,6 @@ def fetch_rss_feeds():
 return articles

 def categorize_feed(url):
-# (Unchanged, keeping your existing categorization logic)
 if "nature" in url or "science.org" in url or "arxiv.org" in url or "plos.org" in url or "annualreviews.org" in url or "journals.uchicago.edu" in url or "jneurosci.org" in url or "cell.com" in url or "nejm.org" in url or "lancet.com" in url:
 return "Academic Papers"
 elif "reuters.com/business" in url or "bloomberg.com" in url or "ft.com" in url or "marketwatch.com" in url or "cnbc.com" in url or "foxbusiness.com" in url or "wsj.com" in url or "bworldonline.com" in url or "economist.com" in url or "forbes.com" in url:
@@ -143,8 +141,13 @@ def process_and_store_articles(articles):
 existing_ids = set(vector_db.get()["ids"]) # Get existing document IDs to avoid duplicates
 for article in articles:
 try:
-# Create a unique ID based on normalized fields
-doc_id = f"{article['title'].lower()}|{article['link'].lower()}|{article['published']}"
+# Create a unique ID based on normalized fields, including description hash
+title = article["title"].lower().strip()
+link = article["link"].lower().strip()
+description = article["description"].strip()
+published = article["published"]
+description_hash = hashlib.md5(description.encode('utf-8')).hexdigest()
+doc_id = f"{title}|{link}|{published}|{description_hash}"
 if doc_id in existing_ids:
 logger.debug(f"Skipping duplicate in DB: {doc_id}")
 continue
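
The document ID now mirrors the feed-level dedup key (normalized title and link, published date, MD5 of the description), so an article that was already stored hashes to the same ID on a later run and is skipped against the IDs fetched from the vector store. A sketch of that skip check, with the make_doc_id helper and the sample data being illustrative only, not names from the module:

import hashlib

def make_doc_id(title: str, link: str, published: str, description: str) -> str:
    # Deterministic ID: identical articles always map to the same string.
    description_hash = hashlib.md5(description.strip().encode("utf-8")).hexdigest()
    return f"{title.lower().strip()}|{link.lower().strip()}|{published}|{description_hash}"

# Pretend this came from vector_db.get()["ids"] on a previous run.
existing_ids = {"ai news|https://example.com/a|2024-01-01|" + hashlib.md5(b"Body").hexdigest()}

article = {"title": "AI News", "link": "https://example.com/a",
           "published": "2024-01-01", "description": "Body"}
doc_id = make_doc_id(article["title"], article["link"], article["published"], article["description"])

if doc_id in existing_ids:
    print("Skipping duplicate in DB:", doc_id)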
@@ -156,7 +159,7 @@ def process_and_store_articles(articles):
 "category": article["category"],
 "image": article["image"],
 }
-doc = Document(page_content=article["description"], metadata=metadata, id=doc_id)
+doc = Document(page_content=description, metadata=metadata, id=doc_id)
 documents.append(doc)
 except Exception as e:
 logger.error(f"Error processing article {article['title']}: {e}")
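
The Document is now built from the already-stripped description and carries the deterministic doc_id, which the vector store can reuse as the record ID. A hedged usage sketch, assuming a recent langchain_core where Document accepts an id field; the import path and the commented add_documents call reflect the typical LangChain API rather than lines from this module:

from langchain_core.documents import Document  # import path assumed; the module may import Document differently

metadata = {"title": "AI News", "link": "https://example.com/a", "published": "2024-01-01",
            "category": "Academic Papers", "image": ""}
doc_id = "ai news|https://example.com/a|2024-01-01|<md5-of-description>"

doc = Document(page_content="Full article body.", metadata=metadata, id=doc_id)

# Typical vector-store usage: passing the same IDs on re-ingest upserts instead of duplicating.
# vector_db.add_documents(documents=[doc], ids=[doc.id])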
 