broadfield-dev committed on
Commit
f827315
·
verified ·
1 Parent(s): 833eb54

Update rss_processor.py

Browse files
Files changed (1) hide show
  1. rss_processor.py +29 -11
rss_processor.py CHANGED
@@ -1,16 +1,24 @@
1
  import os
2
  import feedparser
3
- from huggingface_hub import HfApi, InferenceClient
4
  from langchain.vectorstores import Chroma
5
  from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.docstore.document import Document
7
  import shutil
 
 
 
 
 
8
 
9
  # Hugging Face setup
10
  HF_API_TOKEN = os.getenv("HF_API_TOKEN", "DEMO_HF_API_TOKEN")
11
  HF_MODEL = "Qwen/Qwen-72B-Instruct"
12
- REPO_ID = "broadfield-dev/news-rag-db"
13
  LOCAL_DB_DIR = "chroma_db"
 
 
 
14
  client = InferenceClient(model=HF_MODEL, token=HF_API_TOKEN)
15
 
16
  # RSS feeds
@@ -28,6 +36,8 @@ RSS_FEEDS = [
28
  "https://www.scientificamerican.com/rss/",
29
  "https://www.newscientist.com/feed/home/",
30
  "https://www.livescience.com/feeds/all",
 
 
31
  "https://astrostyle.com/feed/",
32
  "https://www.vogue.com/feed/rss",
33
  "https://feeds.bbci.co.uk/news/politics/rss.xml",
@@ -39,6 +49,10 @@ RSS_FEEDS = [
39
  "https://www.sciencedaily.com/rss/matter_energy/physics.xml",
40
  "https://physicsworld.com/feed/",
41
  "https://www.swpc.noaa.gov/rss.xml",
 
 
 
 
42
  "https://feeds.bbci.co.uk/weather/feeds/rss/5day/world/",
43
  "https://www.weather.gov/rss",
44
  "https://www.foxweather.com/rss",
@@ -64,17 +78,17 @@ hf_api = HfApi()
64
  def fetch_rss_feeds():
65
  articles = []
66
  for feed_url in RSS_FEEDS:
67
- print('processing ', feed_url)
68
  feed = feedparser.parse(feed_url)
69
- for entry in feed.entries[:5]: # Limit to 5 per feed
 
70
  articles.append({
71
  "title": entry.get("title", "No Title"),
72
  "link": entry.get("link", ""),
73
  "description": entry.get("summary", entry.get("description", "No Description")),
74
  "published": entry.get("published", "Unknown Date"),
75
  "category": categorize_feed(feed_url),
 
76
  })
77
- print(entry)
78
  return articles
79
 
80
  def categorize_feed(url):
@@ -97,7 +111,7 @@ def summarize_article(text):
97
  response = client.text_generation(prompt, max_new_tokens=100, temperature=0.7)
98
  return response.strip()
99
  except Exception as e:
100
- print(f"Error summarizing article: {e}")
101
  return "Summary unavailable"
102
 
103
  def categorize_article(text):
@@ -106,7 +120,7 @@ def categorize_article(text):
106
  response = client.text_generation(prompt, max_new_tokens=10, temperature=0.7)
107
  return response.strip()
108
  except Exception as e:
109
- print(f"Error categorizing article: {e}")
110
  return "Neutral"
111
 
112
  def process_and_store_articles(articles):
@@ -123,6 +137,7 @@ def process_and_store_articles(articles):
123
  "published": article["published"],
124
  "category": article["category"],
125
  "sentiment": sentiment,
 
126
  }
127
  )
128
  documents.append(doc)
@@ -133,9 +148,11 @@ def process_and_store_articles(articles):
133
  def upload_to_hf_hub():
134
  if os.path.exists(LOCAL_DB_DIR):
135
  try:
136
- hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True)
 
137
  except Exception as e:
138
- print(f"Error creating repo: {e}")
 
139
  for root, _, files in os.walk(LOCAL_DB_DIR):
140
  for file in files:
141
  local_path = os.path.join(root, file)
@@ -148,6 +165,7 @@ def upload_to_hf_hub():
148
  repo_type="dataset",
149
  token=HF_API_TOKEN
150
  )
 
151
  except Exception as e:
152
- print(f"Error uploading file {file}: {e}")
153
- print(f"Database uploaded to: {REPO_ID}")
 
1
  import os
2
  import feedparser
3
+ from huggingface_hub import HfApi, InferenceClient, login
4
  from langchain.vectorstores import Chroma
5
  from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.docstore.document import Document
7
  import shutil
8
+ import logging
9
+
10
+ # Setup logging
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
 
14
  # Hugging Face setup
15
  HF_API_TOKEN = os.getenv("HF_API_TOKEN", "DEMO_HF_API_TOKEN")
16
  HF_MODEL = "Qwen/Qwen-72B-Instruct"
17
+ REPO_ID = "broadfield-dev/news-rag-db" # Ensure this is your repo
18
  LOCAL_DB_DIR = "chroma_db"
19
+
20
+ # Explicitly login to Hugging Face Hub
21
+ login(token=HF_API_TOKEN)
22
  client = InferenceClient(model=HF_MODEL, token=HF_API_TOKEN)
23
 
24
  # RSS feeds
 
36
  "https://www.scientificamerican.com/rss/",
37
  "https://www.newscientist.com/feed/home/",
38
  "https://www.livescience.com/feeds/all",
39
+ "https://www.hindustantimes.com/feed/horoscope/rss",
40
+ "https://www.washingtonpost.com/wp-srv/style/horoscopes/rss.xml",
41
  "https://astrostyle.com/feed/",
42
  "https://www.vogue.com/feed/rss",
43
  "https://feeds.bbci.co.uk/news/politics/rss.xml",
 
49
  "https://www.sciencedaily.com/rss/matter_energy/physics.xml",
50
  "https://physicsworld.com/feed/",
51
  "https://www.swpc.noaa.gov/rss.xml",
52
+ "https://www.nasa.gov/rss/dyn/solar_system.rss",
53
+ "https://weather.com/science/space/rss",
54
+ "https://www.space.com/feeds/space-weather",
55
+ "https://www.accuweather.com/en/rss",
56
  "https://feeds.bbci.co.uk/weather/feeds/rss/5day/world/",
57
  "https://www.weather.gov/rss",
58
  "https://www.foxweather.com/rss",
 
78
def fetch_rss_feeds():
    """Fetch recent entries from every feed in RSS_FEEDS.

    Returns:
        list[dict]: one record per article with keys ``title``, ``link``,
        ``description``, ``published``, ``category`` and ``image`` (a media
        URL, or ``None`` when the entry carries no usable image).
    """
    articles = []
    for feed_url in RSS_FEEDS:
        feed = feedparser.parse(feed_url)
        for entry in feed.entries[:5]:  # limit to 5 articles per feed
            # Prefer media_content, fall back to media_thumbnail.
            # NOTE: `or [{}]` also guards the case where the key is present
            # but maps to an *empty* list -- the previous
            # `entry.get("media_content", [{}])[0]` raised IndexError there.
            image = None
            for media_key in ("media_content", "media_thumbnail"):
                media = entry.get(media_key) or [{}]
                image = media[0].get("url")
                if image:
                    break
            image = image or None  # normalize falsy values ("" etc.) to None
            articles.append({
                "title": entry.get("title", "No Title"),
                "link": entry.get("link", ""),
                "description": entry.get("summary", entry.get("description", "No Description")),
                "published": entry.get("published", "Unknown Date"),
                "category": categorize_feed(feed_url),
                "image": image,
            })
    return articles
93
 
94
  def categorize_feed(url):
 
111
  response = client.text_generation(prompt, max_new_tokens=100, temperature=0.7)
112
  return response.strip()
113
  except Exception as e:
114
+ logger.error(f"Error summarizing article: {e}")
115
  return "Summary unavailable"
116
 
117
  def categorize_article(text):
 
120
  response = client.text_generation(prompt, max_new_tokens=10, temperature=0.7)
121
  return response.strip()
122
  except Exception as e:
123
+ logger.error(f"Error categorizing article: {e}")
124
  return "Neutral"
125
 
126
  def process_and_store_articles(articles):
 
137
  "published": article["published"],
138
  "category": article["category"],
139
  "sentiment": sentiment,
140
+ "image": article["image"] if article["image"] else "https://via.placeholder.com/150",
141
  }
142
  )
143
  documents.append(doc)
 
148
  def upload_to_hf_hub():
149
  if os.path.exists(LOCAL_DB_DIR):
150
  try:
151
+ hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
152
+ logger.info(f"Repository {REPO_ID} created or exists.")
153
  except Exception as e:
154
+ logger.error(f"Error creating repo: {e}")
155
+ return
156
  for root, _, files in os.walk(LOCAL_DB_DIR):
157
  for file in files:
158
  local_path = os.path.join(root, file)
 
165
  repo_type="dataset",
166
  token=HF_API_TOKEN
167
  )
168
+ logger.info(f"Uploaded {file} to {REPO_ID}")
169
  except Exception as e:
170
+ logger.error(f"Error uploading file {file}: {e}")
171
+ logger.info(f"Database uploaded to: {REPO_ID}")