Spaces:
Running
Running
Update rss_processor.py
Browse files
- rss_processor.py +4 -4
rss_processor.py
CHANGED
@@ -71,7 +71,7 @@ vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_
|
|
71 |
|
72 |
def fetch_rss_feeds():
|
73 |
articles = []
|
74 |
-
seen_articles = set() # Track unique articles by title, link, and description hash
|
75 |
for feed_url in RSS_FEEDS:
|
76 |
try:
|
77 |
logger.info(f"Fetching feed: {feed_url}")
|
@@ -84,8 +84,8 @@ def fetch_rss_feeds():
|
|
84 |
title = entry.get("title", "No Title")
|
85 |
link = entry.get("link", "")
|
86 |
description = entry.get("summary", entry.get("description", "No Description"))
|
87 |
-
# Use MD5 hash of description for uniqueness
|
88 |
-
desc_hash = hashlib.md5(description.encode()).hexdigest()
|
89 |
article_key = f"{title}|{link}|{desc_hash}"
|
90 |
if article_key not in seen_articles:
|
91 |
seen_articles.add(article_key)
|
@@ -123,7 +123,7 @@ def process_and_store_articles(articles):
|
|
123 |
seen_docs = set() # Additional de-duplication at DB level
|
124 |
for article in articles:
|
125 |
try:
|
126 |
-
desc_hash = hashlib.md5(article["description"].encode()).hexdigest()
|
127 |
key = f"{article['title']}|{article['link']}|{desc_hash}"
|
128 |
if key not in seen_docs:
|
129 |
seen_docs.add(key)
|
|
|
71 |
|
72 |
def fetch_rss_feeds():
|
73 |
articles = []
|
74 |
+
seen_articles = set() # Track unique articles by title, link, and full description hash
|
75 |
for feed_url in RSS_FEEDS:
|
76 |
try:
|
77 |
logger.info(f"Fetching feed: {feed_url}")
|
|
|
84 |
title = entry.get("title", "No Title")
|
85 |
link = entry.get("link", "")
|
86 |
description = entry.get("summary", entry.get("description", "No Description"))
|
87 |
+
# Use full MD5 hash of description for stricter uniqueness
|
88 |
+
desc_hash = hashlib.md5(description.encode()).hexdigest()
|
89 |
article_key = f"{title}|{link}|{desc_hash}"
|
90 |
if article_key not in seen_articles:
|
91 |
seen_articles.add(article_key)
|
|
|
123 |
seen_docs = set() # Additional de-duplication at DB level
|
124 |
for article in articles:
|
125 |
try:
|
126 |
+
desc_hash = hashlib.md5(article["description"].encode()).hexdigest()
|
127 |
key = f"{article['title']}|{article['link']}|{desc_hash}"
|
128 |
if key not in seen_docs:
|
129 |
seen_docs.add(key)
|