Spaces:

broadfield-dev
/

grok_test

Runtime error

App Files Community

broadfield-dev committed on Feb 20

Commit

8179b58

verified ·

1 Parent(s): c4a29ef

Update rss_processor.py

Browse files

Files changed (1) hide show

rss_processor.py +7 -6

rss_processor.py CHANGED Viewed

@@ -69,7 +69,7 @@ vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_
 def fetch_rss_feeds():
     articles = []
-    seen_articles = set()  # Track unique articles by title and link
     for feed_url in RSS_FEEDS:
         try:
             logger.info(f"Fetching feed: {feed_url}")
@@ -78,11 +78,12 @@ def fetch_rss_feeds():
                 logger.warning(f"Failed to parse {feed_url}: {feed.bozo_exception}")
                 continue
             unique_count = 0
-            for entry in feed.entries[:100]:
                 title = entry.get("title", "No Title")
                 link = entry.get("link", "")
-                # Create a unique key for deduplication (title and link)
-                article_key = f"{title}|{link}"
                 if article_key not in seen_articles:
                     seen_articles.add(article_key)
                     unique_count += 1
@@ -90,7 +91,7 @@ def fetch_rss_feeds():
                     articles.append({
                         "title": title,
                         "link": link,
-                        "description": entry.get("summary", entry.get("description", "No Description")),
                         "published": entry.get("published", "Unknown Date"),
                         "category": categorize_feed(feed_url),
                         "image": image if image else "",
@@ -119,7 +120,7 @@ def process_and_store_articles(articles):
     seen_docs = set()  # Additional de-duplication at DB level
     for article in articles:
         try:
-            key = f"{article['title']}|{article['link']}"
             if key not in seen_docs:
                 seen_docs.add(key)
                 metadata = {

 def fetch_rss_feeds():
     articles = []
+    seen_articles = set()  # Track unique articles by title, link, and description
     for feed_url in RSS_FEEDS:
         try:
             logger.info(f"Fetching feed: {feed_url}")
                 logger.warning(f"Failed to parse {feed_url}: {feed.bozo_exception}")
                 continue
             unique_count = 0
+            for entry in feed.entries[:5]:
                 title = entry.get("title", "No Title")
                 link = entry.get("link", "")
+                description = entry.get("summary", entry.get("description", "No Description"))
+                # Create a unique key for deduplication (title, link, and description for stricter uniqueness)
+                article_key = f"{title}|{link}|{description[:50]}"  # Use first 50 chars of description to avoid overly long keys
                 if article_key not in seen_articles:
                     seen_articles.add(article_key)
                     unique_count += 1
                     articles.append({
                         "title": title,
                         "link": link,
+                        "description": description,
                         "published": entry.get("published", "Unknown Date"),
                         "category": categorize_feed(feed_url),
                         "image": image if image else "",
     seen_docs = set()  # Additional de-duplication at DB level
     for article in articles:
         try:
+            key = f"{article['title']}|{article['link']}|{article['description'][:50]}"
             if key not in seen_docs:
                 seen_docs.add(key)
                 metadata = {