Spaces:

broadfield-dev
/

RSS_News

Runtime error

App Files Files Community

broadfield-dev commited on Feb 22

Commit

bc16436

verified ·

1 Parent(s): 4868a00

Update rss_processor.py

Browse files

Files changed (1) hide show

rss_processor.py +32 -45

rss_processor.py CHANGED Viewed

@@ -13,9 +13,10 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # Constants
-MAX_ARTICLES_PER_FEED = 5  # Set to 5 for testing, increase later as needed
 LOCAL_DB_DIR = "chroma_db"
 RSS_FEEDS = rss_feeds.RSS_FEEDS
 HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
 REPO_ID = "broadfield-dev/news-rag-db"
@@ -24,9 +25,15 @@ REPO_ID = "broadfield-dev/news-rag-db"
 login(token=HF_API_TOKEN)
 hf_api = HfApi()
-# Initialize embedding model and vector DB
 embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
 def fetch_rss_feeds():
     articles = []
@@ -66,40 +73,18 @@ def fetch_rss_feeds():
     return articles
 def categorize_feed(url):
-    if "nature" in url or "science.org" in url or "arxiv.org" in url or "plos.org" in url or "annualreviews.org" in url or "journals.uchicago.edu" in url or "jneurosci.org" in url or "cell.com" in url or "nejm.org" in url or "lancet.com" in url:
-        return "Academic Papers"
-    elif "reuters.com/business" in url or "bloomberg.com" in url or "ft.com" in url or "marketwatch.com" in url or "cnbc.com" in url or "foxbusiness.com" in url or "wsj.com" in url or "bworldonline.com" in url or "economist.com" in url or "forbes.com" in url:
-        return "Business"
-    elif "investing.com" in url or "cnbc.com/market" in url or "marketwatch.com/market" in url or "fool.co.uk" in url or "zacks.com" in url or "seekingalpha.com" in url or "barrons.com" in url or "yahoofinance.com" in url:
-        return "Stocks & Markets"
-    elif "whitehouse.gov" in url or "state.gov" in url or "commerce.gov" in url or "transportation.gov" in url or "ed.gov" in url or "dol.gov" in url or "justice.gov" in url or "federalreserve.gov" in url or "occ.gov" in url or "sec.gov" in url or "bls.gov" in url or "usda.gov" in url or "gao.gov" in url or "cbo.gov" in url or "fema.gov" in url or "defense.gov" in url or "hhs.gov" in url or "energy.gov" in url or "interior.gov" in url:
-        return "Federal Government"
-    elif "weather.gov" in url or "metoffice.gov.uk" in url or "accuweather.com" in url or "weatherunderground.com" in url or "noaa.gov" in url or "wunderground.com" in url or "climate.gov" in url or "ecmwf.int" in url or "bom.gov.au" in url:
-        return "Weather"
-    elif "data.worldbank.org" in url or "imf.org" in url or "un.org" in url or "oecd.org" in url or "statista.com" in url or "kff.org" in url or "who.int" in url or "cdc.gov" in url or "bea.gov" in url or "census.gov" in url or "fdic.gov" in url:
-        return "Data & Statistics"
-    elif "nasa" in url or "spaceweatherlive" in url or "space" in url or "universetoday" in url or "skyandtelescope" in url or "esa" in url:
-        return "Space"
-    elif "sciencedaily" in url or "quantamagazine" in url or "smithsonianmag" in url or "popsci" in url or "discovermagazine" in url or "scientificamerican" in url or "newscientist" in url or "livescience" in url or "atlasobscura" in url:
-        return "Science"
-    elif "wired" in url or "techcrunch" in url or "arstechnica" in url or "gizmodo" in url or "theverge" in url:
-        return "Tech"
-    elif "horoscope" in url or "astrostyle" in url:
-        return "Astrology"
-    elif "cnn_allpolitics" in url or "bbci.co.uk/news/politics" in url or "reuters.com/arc/outboundfeeds/newsletter-politics" in url or "politico.com/rss/politics" in url or "thehill" in url:
-        return "Politics"
-    elif "weather" in url or "swpc.noaa.gov" in url or "foxweather" in url:
-        return "Earth Weather"
-    elif "vogue" in url:
-        return "Lifestyle"
-    elif "phys.org" in url or "aps.org" in url or "physicsworld" in url:
-        return "Physics"
-    return "Uncategorized"
 def process_and_store_articles(articles):
     documents = []
     for article in articles:
         try:
             metadata = {
                 "title": article["title"],
                 "link": article["link"],
@@ -108,7 +93,7 @@ def process_and_store_articles(articles):
                 "category": article["category"],
                 "image": article["image"],
             }
-            doc = Document(page_content=article["description"], metadata=metadata)
             documents.append(doc)
         except Exception as e:
             logger.error(f"Error processing article {article['title']}: {e}")
@@ -116,26 +101,28 @@ def process_and_store_articles(articles):
     if documents:
         try:
             vector_db.add_documents(documents)
-            logger.info(f"Stored {len(documents)} articles in DB")
         except Exception as e:
             logger.error(f"Error storing articles: {e}")
 def download_from_hf_hub():
-    if os.path.exists(LOCAL_DB_DIR):
-        shutil.rmtree(LOCAL_DB_DIR)
-    try:
-        hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
-        logger.info(f"Downloading Chroma DB from {REPO_ID}...")
-        hf_api.download_repo(repo_id=REPO_ID, repo_type="dataset", local_dir=LOCAL_DB_DIR, token=HF_API_TOKEN)
-    except Exception as e:
-        logger.error(f"Error downloading from Hugging Face Hub: {e}")
-        raise
 def upload_to_hf_hub():
     if os.path.exists(LOCAL_DB_DIR):
         try:
-            hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
-            logger.info(f"Uploading Chroma DB to {REPO_ID}...")
             for root, _, files in os.walk(LOCAL_DB_DIR):
                 for file in files:
                     local_path = os.path.join(root, file)

 logger = logging.getLogger(__name__)
 # Constants
+MAX_ARTICLES_PER_FEED = 5
 LOCAL_DB_DIR = "chroma_db"
 RSS_FEEDS = rss_feeds.RSS_FEEDS
+COLLECTION_NAME = "news_articles"  # Explicitly name the collection
 HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
 REPO_ID = "broadfield-dev/news-rag-db"
 login(token=HF_API_TOKEN)
 hf_api = HfApi()
+# Initialize embedding model (global, reusable)
 embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+# Initialize vector DB with a specific collection name
+vector_db = Chroma(
+    persist_directory=LOCAL_DB_DIR,
+    embedding_function=embedding_model,
+    collection_name=COLLECTION_NAME
+)
 def fetch_rss_feeds():
     articles = []
     return articles
 def categorize_feed(url):
+    # (Unchanged, keeping your existing categorization logic)
+    # ...
 def process_and_store_articles(articles):
     documents = []
+    existing_ids = set(vector_db.get()["ids"])  # Get existing document IDs to avoid duplicates
     for article in articles:
         try:
+            # Create a unique ID for deduplication
+            doc_id = f"{article['title']}|{article['link']}|{article['published']}"
+            if doc_id in existing_ids:
+                continue  # Skip if already in DB
             metadata = {
                 "title": article["title"],
                 "link": article["link"],
                 "category": article["category"],
                 "image": article["image"],
             }
+            doc = Document(page_content=article["description"], metadata=metadata, id=doc_id)
             documents.append(doc)
         except Exception as e:
             logger.error(f"Error processing article {article['title']}: {e}")
     if documents:
         try:
             vector_db.add_documents(documents)
+            vector_db.persist()  # Explicitly persist changes
+            logger.info(f"Added {len(documents)} new articles to DB")
         except Exception as e:
             logger.error(f"Error storing articles: {e}")
 def download_from_hf_hub():
+    # Only download if the local DB doesn’t exist (initial setup)
+    if not os.path.exists(LOCAL_DB_DIR):
+        try:
+            hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
+            logger.info(f"Downloading Chroma DB from {REPO_ID}...")
+            hf_api.download_repo(repo_id=REPO_ID, repo_type="dataset", local_dir=LOCAL_DB_DIR, token=HF_API_TOKEN)
+        except Exception as e:
+            logger.error(f"Error downloading from Hugging Face Hub: {e}")
+            raise
+    else:
+        logger.info("Local Chroma DB already exists, skipping download.")
 def upload_to_hf_hub():
     if os.path.exists(LOCAL_DB_DIR):
         try:
+            logger.info(f"Uploading updated Chroma DB to {REPO_ID}...")
             for root, _, files in os.walk(LOCAL_DB_DIR):
                 for file in files:
                     local_path = os.path.join(root, file)