Spaces:
Sleeping
Sleeping
File size: 6,648 Bytes
f63fa31 f827315 e727948 f827315 f63fa31 715921b f63fa31 d8b8e62 f63fa31 d8b8e62 f63fa31 e727948 f63fa31 715921b f63fa31 715921b f63fa31 de78f0e 715921b de78f0e 715921b de78f0e 715921b 8091043 8179b58 715921b 8091043 8179b58 8091043 715921b 8091043 de78f0e 715921b f63fa31 715921b f63fa31 86fe81e f63fa31 de78f0e 715921b de78f0e 715921b f63fa31 715921b f63fa31 715921b fdfda12 e727948 fdfda12 715921b e727948 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
import logging
import os
import shutil

import feedparser
from huggingface_hub import HfApi, login
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
# Setup logging: configure the root logger at INFO and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Constants
# Directory handed to Chroma as persist_directory and mirrored to/from the Hub.
LOCAL_DB_DIR = "chroma_db"
# Feed URLs iterated, in this order, by fetch_rss_feeds().
RSS_FEEDS = [
"https://www.sciencedaily.com/rss/top/science.xml",
"https://www.horoscope.com/us/horoscopes/general/rss/horoscope-rss.aspx",
"http://rss.cnn.com/rss/cnn_allpolitics.rss",
"https://phys.org/rss-feed/physics-news/",
"https://www.spaceweatherlive.com/en/news/rss",
"https://weather.com/feeds/rss",
"https://www.wired.com/feed/rss",
"https://www.nasa.gov/rss/dyn/breaking_news.rss",
"https://www.nationalgeographic.com/feed/",
"https://www.nature.com/nature.rss",
"https://www.scientificamerican.com/rss/",
"https://www.newscientist.com/feed/home/",
"https://www.livescience.com/feeds/all",
"https://astrostyle.com/feed/",
"https://www.vogue.com/feed/rss",
"https://feeds.bbci.co.uk/news/politics/rss.xml",
"https://www.reuters.com/arc/outboundfeeds/newsletter-politics/?outputType=xml",
"https://www.politico.com/rss/politics.xml",
"https://thehill.com/feed/",
"https://www.aps.org/publications/apsnews/updates/rss.cfm",
"https://www.quantamagazine.org/feed/",
"https://www.sciencedaily.com/rss/matter_energy/physics.xml",
"https://physicsworld.com/feed/",
"https://www.swpc.noaa.gov/rss.xml",
"https://feeds.bbci.co.uk/weather/feeds/rss/5day/world/",
"https://www.weather.gov/rss",
"https://www.foxweather.com/rss",
"https://techcrunch.com/feed/",
"https://arstechnica.com/feed/",
"https://gizmodo.com/rss",
"https://www.theverge.com/rss/index.xml",
"https://www.space.com/feeds/all",
"https://www.universetoday.com/feed/",
"https://skyandtelescope.org/feed/",
"https://www.esa.int/rss",
"https://www.smithsonianmag.com/rss/",
"https://www.popsci.com/rss.xml",
"https://www.discovermagazine.com/rss",
"https://www.atlasobscura.com/feeds/latest"
]
# Token is read from the DEMO_HF_API_TOKEN environment variable; when it is
# unset the literal placeholder string below is used instead.
# NOTE(review): the placeholder is then passed to login() as-is — presumably
# that call fails; confirm and consider failing fast with a clearer message.
HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
# Hub repository id used (with repo_type="dataset") by the upload/download helpers.
REPO_ID = "broadfield-dev/news-rag-db"
# Initialize Hugging Face API
# These calls run at import time, so importing this module performs the login.
login(token=HF_API_TOKEN)
hf_api = HfApi()
# Initialize embedding model and vector DB
# Both objects are created at import time; the Chroma store uses LOCAL_DB_DIR
# as its persist directory and the embedding model as its embedding function.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
def _first_media_url(entry):
    """Return the URL of the entry's first media item, or None if there is none.

    Checks ``media_content`` first, then ``media_thumbnail``. A missing key,
    an empty list, or a first item without a ``url`` all fall through to the
    next field instead of raising.
    """
    for field in ("media_content", "media_thumbnail"):
        media = entry.get(field)
        if media:
            url = media[0].get("url")
            if url:
                return url
    return None


def fetch_rss_feeds():
    """Fetch every feed in RSS_FEEDS and return de-duplicated article dicts.

    Each article dict has the keys ``title``, ``link``, ``description``,
    ``published``, ``category`` and ``image``. Articles are de-duplicated
    across all feeds on the ``title|link`` pair; the first occurrence wins.

    A failure while handling one feed is logged and does not stop the
    remaining feeds from being processed.

    Returns:
        list[dict]: the collected articles, in feed order.
    """
    articles = []
    seen_keys = set()
    for feed_url in RSS_FEEDS:
        try:
            logger.info(f"Fetching {feed_url}")
            feed = feedparser.parse(feed_url)
            if feed.bozo:
                logger.warning(f"Parse error for {feed_url}: {feed.bozo_exception}")
                # feedparser sets bozo for recoverable problems too and still
                # returns the entries it could parse, so only give up on the
                # feed when nothing usable came back.
                if not feed.entries:
                    continue
            # The category depends only on the feed URL; compute it once.
            category = categorize_feed(feed_url)
            for entry in feed.entries:
                title = entry.get("title", "No Title")
                link = entry.get("link", "")
                description = entry.get("summary", entry.get("description", "No Description"))
                key = f"{title}|{link}"
                if key in seen_keys:
                    continue
                seen_keys.add(key)
                articles.append({
                    "title": title,
                    "link": link,
                    "description": description,
                    "published": entry.get("published", "Unknown Date"),
                    "category": category,
                    # "svg" is the placeholder used when no image is available.
                    "image": _first_media_url(entry) or "svg",
                })
        except Exception as e:
            logger.error(f"Error fetching {feed_url}: {e}")
    logger.info(f"Total articles fetched: {len(articles)}")
    return articles
def categorize_feed(url):
    """Return a category label for a feed URL based on a substring match.

    The markers are tested in order and the first one found in ``url``
    decides the label; a URL matching none of them is "Uncategorized".
    """
    # Order matters: an earlier marker wins when several occur in the URL.
    marker_labels = (
        ("sciencedaily", "Science"),
        ("nasa", "Space"),
        ("wired", "Tech"),
    )
    return next(
        (label for marker, label in marker_labels if marker in url),
        "Uncategorized",
    )
def process_and_store_articles(articles):
    """Convert article dicts to Documents and add them to the vector store.

    Each article's ``description`` becomes the document's page content; the
    remaining fields (plus a copy of the description under
    ``original_description``) are stored as metadata.

    An article that cannot be converted is logged and skipped. A failure
    while adding to the vector store is logged and not re-raised.

    Args:
        articles: iterable of dicts with the keys ``title``, ``link``,
            ``description``, ``published``, ``category`` and ``image``.
    """
    documents = []
    for article in articles:
        try:
            metadata = {
                "title": article["title"],
                "link": article["link"],
                "original_description": article["description"],
                "published": article["published"],
                "category": article["category"],
                "image": article["image"],
            }
            documents.append(Document(page_content=article["description"], metadata=metadata))
        except Exception as e:
            # Look the title up defensively: a missing "title" key (or a
            # non-dict article) may be the very reason we are in this
            # handler, and indexing again would raise out of it.
            if isinstance(article, dict):
                title = article.get("title", "<untitled>")
            else:
                title = "<malformed article>"
            logger.error(f"Error processing article {title}: {e}")
    if not documents:
        return
    try:
        vector_db.add_documents(documents)
        logger.info(f"Stored {len(documents)} articles in DB")
    except Exception as e:
        logger.error(f"Error storing articles: {e}")
def download_from_hf_hub():
    """Replace the local Chroma directory with the contents of the Hub repo.

    Any existing LOCAL_DB_DIR is deleted first, then the dataset repo REPO_ID
    is created if it does not exist and downloaded into LOCAL_DB_DIR.

    Raises:
        Exception: any error from the Hub calls is logged and re-raised.
    """
    # NOTE(review): the local copy is removed before the download starts, so
    # a failed download leaves no local database behind.
    if os.path.exists(LOCAL_DB_DIR):
        shutil.rmtree(LOCAL_DB_DIR)
    try:
        hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
        logger.info(f"Downloading Chroma DB from {REPO_ID}...")
        # HfApi has no download_repo(); snapshot_download fetches the whole
        # repository into local_dir.
        hf_api.snapshot_download(
            repo_id=REPO_ID,
            repo_type="dataset",
            local_dir=LOCAL_DB_DIR,
            token=HF_API_TOKEN,
        )
    except Exception as e:
        logger.error(f"Error downloading from Hugging Face Hub: {e}")
        raise
def upload_to_hf_hub():
    """Upload the local Chroma directory to the Hub dataset repo.

    Does nothing (beyond logging a warning) when LOCAL_DB_DIR does not exist.
    Otherwise the dataset repo REPO_ID is created if needed and the whole
    directory is uploaded, keeping paths relative to LOCAL_DB_DIR.

    Raises:
        Exception: any error from the Hub calls is logged and re-raised.
    """
    if not os.path.exists(LOCAL_DB_DIR):
        logger.warning(f"Local DB directory {LOCAL_DB_DIR} not found; nothing to upload")
        return
    try:
        hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
        logger.info(f"Uploading Chroma DB to {REPO_ID}...")
        # One upload_folder call replaces the per-file loop: it sends the
        # directory as a single commit and builds repo paths itself, so they
        # do not depend on the local OS path separator.
        hf_api.upload_folder(
            folder_path=LOCAL_DB_DIR,
            repo_id=REPO_ID,
            repo_type="dataset",
            token=HF_API_TOKEN,
        )
        logger.info(f"Database uploaded to: {REPO_ID}")
    except Exception as e:
        logger.error(f"Error uploading to Hugging Face Hub: {e}")
        raise
if __name__ == "__main__":
    # Script entry point: fetch the feeds, store the articles in the local
    # vector DB, then publish the DB directory to the Hub.
    articles = fetch_rss_feeds()
    process_and_store_articles(articles)
    upload_to_hf_hub()