broadfield-dev committed
Commit 715921b · verified · 1 Parent(s): 5d47c6a

Update rss_processor.py

Files changed (1):
  1. rss_processor.py +39 -128
rss_processor.py CHANGED
@@ -1,181 +1,92 @@
 import os
 import feedparser
-import sys
-from huggingface_hub import HfApi, login
 from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.docstore.document import Document
-import shutil
 import logging
-import hashlib
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Hugging Face setup
-HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
-REPO_ID = "broadfield-dev/news-rag-db"
 LOCAL_DB_DIR = "chroma_db"
-
-# Explicitly login to Hugging Face Hub
-#login(token=HF_API_TOKEN)
-hf_api = HfApi()
-
-# RSS feeds
 RSS_FEEDS = [
     "https://www.sciencedaily.com/rss/top/science.xml",
-    "https://www.horoscope.com/us/horoscopes/general/rss/horoscope-rss.aspx",
-    "http://rss.cnn.com/rss/cnn_allpolitics.rss",
-    "https://phys.org/rss-feed/physics-news/",
-    "https://www.spaceweatherlive.com/en/news/rss",
-    "https://weather.com/feeds/rss",
     "https://www.wired.com/feed/rss",
-    "https://www.nasa.gov/rss/dyn/breaking_news.rss",
-    "https://www.nationalgeographic.com/feed/",
-    "https://www.nature.com/nature.rss",
-    "https://www.scientificamerican.com/rss/",
-    "https://www.newscientist.com/feed/home/",
-    "https://www.livescience.com/feeds/all",
-    "https://astrostyle.com/feed/",
-    "https://www.vogue.com/feed/rss",
-    "https://feeds.bbci.co.uk/news/politics/rss.xml",
-    "https://www.reuters.com/arc/outboundfeeds/newsletter-politics/?outputType=xml",
-    "https://www.politico.com/rss/politics.xml",
-    "https://thehill.com/feed/",
-    "https://www.aps.org/publications/apsnews/updates/rss.cfm",
-    "https://www.quantamagazine.org/feed/",
-    "https://www.sciencedaily.com/rss/matter_energy/physics.xml",
-    "https://physicsworld.com/feed/",
-    "https://www.swpc.noaa.gov/rss.xml",
-    "https://feeds.bbci.co.uk/weather/feeds/rss/5day/world/",
-    "https://www.weather.gov/rss",
-    "https://www.foxweather.com/rss",
-    "https://techcrunch.com/feed/",
-    "https://arstechnica.com/feed/",
-    "https://gizmodo.com/rss",
-    "https://www.theverge.com/rss/index.xml",
-    "https://www.space.com/feeds/all",
-    "https://www.universetoday.com/feed/",
-    "https://skyandtelescope.org/feed/",
-    "https://www.esa.int/rss",
-    "https://www.smithsonianmag.com/rss/",
-    "https://www.popsci.com/rss.xml",
-    "https://www.discovermagazine.com/rss",
-    "https://www.atlasobscura.com/feeds/latest"
 ]
 
-# Embedding model and vector DB
 embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
 
 def fetch_rss_feeds():
     articles = []
-    seen_articles = set() # Track unique articles by title, link, and full description hash
     for feed_url in RSS_FEEDS:
         try:
-            logger.info(f"Fetching feed: {feed_url}")
             feed = feedparser.parse(feed_url)
             if feed.bozo:
-                logger.warning(f"Failed to parse {feed_url}: {feed.bozo_exception}")
                 continue
-            unique_count = 0
-            for entry in feed.entries[:5]:
                 title = entry.get("title", "No Title")
                 link = entry.get("link", "")
                 description = entry.get("summary", entry.get("description", "No Description"))
-                # Use full MD5 hash of description for stricter uniqueness
-                desc_hash = hashlib.md5(description.encode()).hexdigest()
-                article_key = f"{title}|{link}|{desc_hash}"
-                if article_key not in seen_articles:
-                    seen_articles.add(article_key)
-                    unique_count += 1
-                    image = entry.get("media_content", [{}])[0].get("url") or entry.get("media_thumbnail", [{}])[0].get("url") or ""
                     articles.append({
                         "title": title,
                         "link": link,
                         "description": description,
                         "published": entry.get("published", "Unknown Date"),
                         "category": categorize_feed(feed_url),
-                        "image": image if image else "",
                     })
-            logger.info(f"Processed {unique_count} unique entries from {feed_url}")
         except Exception as e:
             logger.error(f"Error fetching {feed_url}: {e}")
     return articles
 
 def categorize_feed(url):
-    if "sciencedaily" in url or "phys.org" in url:
-        return "Science & Physics"
-    elif "horoscope" in url:
-        return "Astrology"
-    elif "politics" in url:
-        return "Politics"
-    elif "spaceweather" in url or "nasa" in url:
-        return "Solar & Space"
-    elif "weather" in url:
-        return "Earth Weather"
-    else:
-        return "Cool Stuff"
 
 def process_and_store_articles(articles):
     documents = []
-    seen_docs = set() # Additional de-duplication at DB level
     for article in articles:
         try:
-            desc_hash = hashlib.md5(article["description"].encode()).hexdigest()
-            key = f"{article['title']}|{article['link']}|{desc_hash}"
-            if key not in seen_docs:
-                seen_docs.add(key)
-                metadata = {
-                    "title": article["title"] or "No Title",
-                    "link": article["link"] or "",
-                    "original_description": article["description"] or "No Description",
-                    "published": article["published"] or "Unknown Date",
-                    "category": article["category"] or "Uncategorized",
-                    "image": article["image"] or "",
-                }
-                doc = Document(
-                    page_content=article["description"] or "No Description",
-                    metadata=metadata
-                )
-                documents.append(doc)
         except Exception as e:
             logger.error(f"Error processing article {article['title']}: {e}")
-    try:
-        vector_db.add_documents(documents)
-        # Removed manual persist() as Chroma auto-persists since 0.4.x
-        logger.info("Vector DB updated (auto-persisted)")
-    except Exception as e:
-        logger.error(f"Error adding documents to vector DB: {e}")
-    upload_to_hf_hub()
-
-def upload_to_hf_hub():
-    if os.path.exists(LOCAL_DB_DIR):
         try:
-            hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
-            logger.info(f"Repository {REPO_ID} created or exists.")
         except Exception as e:
-            logger.error(f"Error creating repo: {e}")
-            return
-        for root, _, files in os.walk(LOCAL_DB_DIR):
-            for file in files:
-                local_path = os.path.join(root, file)
-                remote_path = os.path.relpath(local_path, LOCAL_DB_DIR)
-                try:
-                    hf_api.upload_file(
-                        path_or_fileobj=local_path,
-                        path_in_repo=remote_path,
-                        repo_id=REPO_ID,
-                        repo_type="dataset",
-                        token=HF_API_TOKEN
-                    )
-                    logger.info(f"Uploaded {file} to {REPO_ID}")
-                except Exception as e:
-                    logger.error(f"Error uploading file {file}: {e}")
-        logger.info(f"Database uploaded to: {REPO_ID}")
 
 if __name__ == "__main__":
-    if len(sys.argv) > 1 and sys.argv[1] == "load_feeds":
-        articles = fetch_rss_feeds()
-        process_and_store_articles(articles)
 
 import os
 import feedparser
 from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.docstore.document import Document
 import logging
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Constants
 LOCAL_DB_DIR = "chroma_db"
 RSS_FEEDS = [
+    "https://www.nasa.gov/rss/dyn/breaking_news.rss",
     "https://www.sciencedaily.com/rss/top/science.xml",
     "https://www.wired.com/feed/rss",
+    # Add more feeds as needed; starting with reliable ones
 ]
 
+# Initialize embedding model and vector DB
 embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
 
 def fetch_rss_feeds():
     articles = []
+    seen_keys = set()
     for feed_url in RSS_FEEDS:
         try:
+            logger.info(f"Fetching {feed_url}")
             feed = feedparser.parse(feed_url)
             if feed.bozo:
+                logger.warning(f"Parse error for {feed_url}: {feed.bozo_exception}")
                 continue
+            for entry in feed.entries:
                 title = entry.get("title", "No Title")
                 link = entry.get("link", "")
                 description = entry.get("summary", entry.get("description", "No Description"))
+                key = f"{title}|{link}"
+                if key not in seen_keys:
+                    seen_keys.add(key)
+                    image = (entry.get("media_content", [{}])[0].get("url") or
+                             entry.get("media_thumbnail", [{}])[0].get("url") or "svg")
                     articles.append({
                         "title": title,
                         "link": link,
                         "description": description,
                         "published": entry.get("published", "Unknown Date"),
                         "category": categorize_feed(feed_url),
+                        "image": image,
                     })
         except Exception as e:
             logger.error(f"Error fetching {feed_url}: {e}")
+    logger.info(f"Total articles fetched: {len(articles)}")
     return articles
 
 def categorize_feed(url):
+    if "sciencedaily" in url:
+        return "Science"
+    elif "nasa" in url:
+        return "Space"
+    elif "wired" in url:
+        return "Tech"
+    return "Uncategorized"
 
 def process_and_store_articles(articles):
     documents = []
     for article in articles:
         try:
+            metadata = {
+                "title": article["title"],
+                "link": article["link"],
+                "original_description": article["description"],
+                "published": article["published"],
+                "category": article["category"],
+                "image": article["image"],
+            }
+            doc = Document(page_content=article["description"], metadata=metadata)
+            documents.append(doc)
         except Exception as e:
             logger.error(f"Error processing article {article['title']}: {e}")
+
+    if documents:
         try:
+            vector_db.add_documents(documents)
+            logger.info(f"Stored {len(documents)} articles in DB")
         except Exception as e:
+            logger.error(f"Error storing articles: {e}")
 
 if __name__ == "__main__":
+    articles = fetch_rss_feeds()
+    process_and_store_articles(articles)
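
For context, a minimal sketch of how the Chroma store written by this script could be queried back, assuming the same LangChain APIs used above and that rss_processor.py has been run at least once; the query string and the k value below are illustrative placeholders, not part of this commit:

from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Reopen the persisted directory with the same embedding model the script used
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = Chroma(persist_directory="chroma_db", embedding_function=embedding_model)

# Illustrative query: retrieve the 3 stored article descriptions most similar to the text
results = vector_db.similarity_search("latest NASA mission news", k=3)
for doc in results:
    print(doc.metadata.get("title"), "-", doc.metadata.get("link"))

Retrieval only works if the same all-MiniLM-L6-v2 embeddings are used for both storage and querying, which is why the model name here matches the one in the script.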