Update rss_processor.py
rss_processor.py CHANGED (+7 -6)
@@ -69,7 +69,7 @@ vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_
 
 def fetch_rss_feeds():
     articles = []
-    seen_articles = set()  # Track unique articles by title and link
+    seen_articles = set()  # Track unique articles by title, link, and description
     for feed_url in RSS_FEEDS:
         try:
             logger.info(f"Fetching feed: {feed_url}")
@@ -78,11 +78,12 @@ def fetch_rss_feeds():
                 logger.warning(f"Failed to parse {feed_url}: {feed.bozo_exception}")
                 continue
             unique_count = 0
-            for entry in feed.entries[:
+            for entry in feed.entries[:5]:
                 title = entry.get("title", "No Title")
                 link = entry.get("link", "")
-
-
+                description = entry.get("summary", entry.get("description", "No Description"))
+                # Create a unique key for deduplication (title, link, and description for stricter uniqueness)
+                article_key = f"{title}|{link}|{description[:50]}"  # Use first 50 chars of description to avoid overly long keys
                 if article_key not in seen_articles:
                     seen_articles.add(article_key)
                     unique_count += 1
@@ -90,7 +91,7 @@ def fetch_rss_feeds():
                     articles.append({
                         "title": title,
                         "link": link,
-                        "description":
+                        "description": description,
                         "published": entry.get("published", "Unknown Date"),
                         "category": categorize_feed(feed_url),
                         "image": image if image else "",
@@ -119,7 +120,7 @@ def process_and_store_articles(articles):
     seen_docs = set()  # Additional de-duplication at DB level
     for article in articles:
         try:
-            key = f"{article['title']}|{article['link']}"
+            key = f"{article['title']}|{article['link']}|{article['description'][:50]}"
             if key not in seen_docs:
                 seen_docs.add(key)
                 metadata = {
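The fetch-side deduplication this commit introduces can be exercised on its own. Below is a minimal, self-contained sketch of the changed loop, assuming feedparser as the parser (implied by the feed.bozo_exception check above); the feed URL is a placeholder, and categorize_feed, image handling, and logging from the real script are left out.

import feedparser

# Placeholder feed list; the real script defines RSS_FEEDS in its own config.
RSS_FEEDS = ["https://example.com/feed.xml"]

def fetch_rss_feeds():
    articles = []
    seen_articles = set()  # Track unique articles by title, link, and description
    for feed_url in RSS_FEEDS:
        feed = feedparser.parse(feed_url)
        if feed.bozo:
            continue  # Skip feeds that failed to parse
        for entry in feed.entries[:5]:  # Same per-feed cap as the commit
            title = entry.get("title", "No Title")
            link = entry.get("link", "")
            description = entry.get("summary", entry.get("description", "No Description"))
            # Composite key: the first 50 chars of the description separate
            # same-titled, same-linked entries that have different bodies.
            article_key = f"{title}|{link}|{description[:50]}"
            if article_key in seen_articles:
                continue
            seen_articles.add(article_key)
            articles.append({"title": title, "link": link, "description": description})
    return articles

if __name__ == "__main__":
    print(f"{len(fetch_rss_feeds())} unique articles")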
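One judgment call here is description[:50]: truncation keeps keys short, but two entries that agree on title, link, and the first 50 characters of otherwise different descriptions would still collide. A hedged alternative, not something this commit does, is to hash the full composite key, which yields a fixed-length key with no truncation:

import hashlib

def article_id(title: str, link: str, description: str) -> str:
    # Fixed-length digest of the full key; nothing is lost to truncation.
    raw = f"{title}|{link}|{description}".encode("utf-8")
    return hashlib.sha256(raw).hexdigest()

seen_articles = set()
key = article_id("No Title", "https://example.com/post", "A long description ...")
if key not in seen_articles:
    seen_articles.add(key)

A digest like this also doubles as a stable document ID for the vector store, which is relevant to the second half of the diff.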
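On the storage side, the seen_docs set only prevents duplicates within a single run; articles persisted to Chroma by an earlier run would still be re-inserted. A sketch of a cross-run variant, assuming the langchain Chroma wrapper instantiated as vector_db at the top of the script plus the hypothetical article_id() helper above: add_texts accepts a list of caller-supplied ids, and recent langchain versions upsert on a matching ID, so re-ingesting the same article updates it in place rather than duplicating it (older versions may skip or warn instead, so verify this against the installed version).

# Sketch only: the metadata fields are illustrative, not the script's actual schema.
def process_and_store_articles(articles):
    for article in articles:
        doc_id = article_id(article["title"], article["link"], article["description"])
        metadata = {"title": article["title"], "link": article["link"]}
        # Deterministic ID: the same article maps to the same DB record on every run.
        vector_db.add_texts(
            texts=[article["description"]],
            metadatas=[metadata],
            ids=[doc_id],
        )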