broadfield-dev committed
Commit a69bc3b · verified · 1 Parent(s): 47649d8

Update rss_processor.py

Files changed (1)
  1. rss_processor.py +17 -13
rss_processor.py CHANGED
@@ -7,16 +7,18 @@ import logging
 from huggingface_hub import HfApi, login
 import shutil
 import rss_feeds
+from datetime import datetime
+import dateutil.parser # For flexible date parsing

 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 # Constants
-MAX_ARTICLES_PER_FEED = 10
+MAX_ARTICLES_PER_FEED = 5
 LOCAL_DB_DIR = "chroma_db"
 RSS_FEEDS = rss_feeds.RSS_FEEDS
-COLLECTION_NAME = "news_articles" # Explicitly name the collection
+COLLECTION_NAME = "news_articles"

 HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
 REPO_ID = "broadfield-dev/news-rag-db"
@@ -35,9 +37,6 @@ vector_db = Chroma(
     collection_name=COLLECTION_NAME
 )

-from datetime import datetime
-import dateutil.parser # Add this dependency: pip install python-dateutil
-
 def fetch_rss_feeds():
     articles = []
     seen_keys = set()
@@ -52,8 +51,8 @@ def fetch_rss_feeds():
             for entry in feed.entries:
                 if article_count >= MAX_ARTICLES_PER_FEED:
                     break
-                title = entry.get("title", "No Title").strip()
-                link = entry.get("link", "").strip()
+                title = entry.get("title", "No Title").strip().lower() # Normalize case and whitespace
+                link = entry.get("link", "").strip().lower()
                 description = entry.get("summary", entry.get("description", "No Description")).strip()

                 # Try multiple date fields and parse flexibly
@@ -68,6 +67,7 @@ def fetch_rss_feeds():
                             logger.debug(f"Failed to parse {date_field} '{entry[date_field]}': {e}")
                             continue

+                # Use a robust key for deduplication
                 key = f"{title}|{link}|{published}"
                 if key not in seen_keys:
                     seen_keys.add(key)
@@ -81,8 +81,8 @@ def fetch_rss_feeds():
                     ]:
                         try:
                             img = img_source(entry)
-                            if img:
-                                image = img
+                            if img and isinstance(img, str) and img.strip():
+                                image = img.strip()
                                 break
                         except (IndexError, AttributeError, TypeError):
                             continue
@@ -96,12 +96,15 @@ def fetch_rss_feeds():
                         "image": image,
                     })
                     article_count += 1
+                else:
+                    logger.debug(f"Duplicate article skipped in feed {feed_url}: {key}")
             except Exception as e:
                 logger.error(f"Error fetching {feed_url}: {e}")
     logger.info(f"Total articles fetched: {len(articles)}")
     return articles

 def categorize_feed(url):
+    # (Unchanged, keeping your existing categorization logic)
     if "nature" in url or "science.org" in url or "arxiv.org" in url or "plos.org" in url or "annualreviews.org" in url or "journals.uchicago.edu" in url or "jneurosci.org" in url or "cell.com" in url or "nejm.org" in url or "lancet.com" in url:
         return "Academic Papers"
     elif "reuters.com/business" in url or "bloomberg.com" in url or "ft.com" in url or "marketwatch.com" in url or "cnbc.com" in url or "foxbusiness.com" in url or "wsj.com" in url or "bworldonline.com" in url or "economist.com" in url or "forbes.com" in url:
@@ -131,16 +134,17 @@ def categorize_feed(url):
     elif "phys.org" in url or "aps.org" in url or "physicsworld" in url:
         return "Physics"
     return "Uncategorized"
-
+
 def process_and_store_articles(articles):
     documents = []
     existing_ids = set(vector_db.get()["ids"]) # Get existing document IDs to avoid duplicates
     for article in articles:
         try:
-            # Create a unique ID for deduplication
-            doc_id = f"{article['title']}|{article['link']}|{article['published']}"
+            # Create a unique ID based on normalized fields
+            doc_id = f"{article['title'].lower()}|{article['link'].lower()}|{article['published']}"
             if doc_id in existing_ids:
-                continue # Skip if already in DB
+                logger.debug(f"Skipping duplicate in DB: {doc_id}")
+                continue
             metadata = {
                 "title": article["title"],
                 "link": article["link"],