broadfield-dev committed on
Commit
5d0b159
·
verified ·
1 Parent(s): e06bdde

Update rss_processor.py

Browse files
Files changed (1) hide show
  1. rss_processor.py +4 -4
rss_processor.py CHANGED
@@ -71,7 +71,7 @@ vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_
71
 
72
  def fetch_rss_feeds():
73
  articles = []
74
- seen_articles = set() # Track unique articles by title, link, and description hash
75
  for feed_url in RSS_FEEDS:
76
  try:
77
  logger.info(f"Fetching feed: {feed_url}")
@@ -84,8 +84,8 @@ def fetch_rss_feeds():
84
  title = entry.get("title", "No Title")
85
  link = entry.get("link", "")
86
  description = entry.get("summary", entry.get("description", "No Description"))
87
- # Use MD5 hash of description for uniqueness
88
- desc_hash = hashlib.md5(description.encode()).hexdigest()[:10]
89
  article_key = f"{title}|{link}|{desc_hash}"
90
  if article_key not in seen_articles:
91
  seen_articles.add(article_key)
@@ -123,7 +123,7 @@ def process_and_store_articles(articles):
123
  seen_docs = set() # Additional de-duplication at DB level
124
  for article in articles:
125
  try:
126
- desc_hash = hashlib.md5(article["description"].encode()).hexdigest()[:10]
127
  key = f"{article['title']}|{article['link']}|{desc_hash}"
128
  if key not in seen_docs:
129
  seen_docs.add(key)
 
71
 
72
  def fetch_rss_feeds():
73
  articles = []
74
+ seen_articles = set() # Track unique articles by title, link, and full description hash
75
  for feed_url in RSS_FEEDS:
76
  try:
77
  logger.info(f"Fetching feed: {feed_url}")
 
84
  title = entry.get("title", "No Title")
85
  link = entry.get("link", "")
86
  description = entry.get("summary", entry.get("description", "No Description"))
87
+ # Use full MD5 hash of description for stricter uniqueness
88
+ desc_hash = hashlib.md5(description.encode()).hexdigest()
89
  article_key = f"{title}|{link}|{desc_hash}"
90
  if article_key not in seen_articles:
91
  seen_articles.add(article_key)
 
123
  seen_docs = set() # Additional de-duplication at DB level
124
  for article in articles:
125
  try:
126
+ desc_hash = hashlib.md5(article["description"].encode()).hexdigest()
127
  key = f"{article['title']}|{article['link']}|{desc_hash}"
128
  if key not in seen_docs:
129
  seen_docs.add(key)