import os
import feedparser
from huggingface_hub import HfApi, login
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
import shutil
import logging
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Hugging Face setup
HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
REPO_ID = "broadfield-dev/news-rag-db"
LOCAL_DB_DIR = "chroma_db"
# Explicitly login to Hugging Face Hub
login(token=HF_API_TOKEN)
hf_api = HfApi()
# RSS feeds
RSS_FEEDS = [
"https://www.sciencedaily.com/rss/top/science.xml",
"https://www.horoscope.com/us/horoscopes/general/rss/horoscope-rss.aspx",
"http://rss.cnn.com/rss/cnn_allpolitics.rss",
"https://phys.org/rss-feed/physics-news/",
"https://www.spaceweatherlive.com/en/news/rss",
"https://weather.com/feeds/rss",
"https://www.wired.com/feed/rss",
"https://www.nasa.gov/rss/dyn/breaking_news.rss",
"https://www.nationalgeographic.com/feed/",
"https://www.nature.com/nature.rss",
"https://www.scientificamerican.com/rss/",
"https://www.newscientist.com/feed/home/",
"https://www.livescience.com/feeds/all",
"https://astrostyle.com/feed/",
"https://www.vogue.com/feed/rss",
"https://feeds.bbci.co.uk/news/politics/rss.xml",
"https://www.reuters.com/arc/outboundfeeds/newsletter-politics/?outputType=xml",
"https://www.politico.com/rss/politics.xml",
"https://thehill.com/feed/",
"https://www.aps.org/publications/apsnews/updates/rss.cfm",
"https://www.quantamagazine.org/feed/",
"https://www.sciencedaily.com/rss/matter_energy/physics.xml",
"https://physicsworld.com/feed/",
"https://www.swpc.noaa.gov/rss.xml",
"https://feeds.bbci.co.uk/weather/feeds/rss/5day/world/",
"https://www.weather.gov/rss",
"https://www.foxweather.com/rss",
"https://techcrunch.com/feed/",
"https://arstechnica.com/feed/",
"https://gizmodo.com/rss",
"https://www.theverge.com/rss/index.xml",
"https://www.space.com/feeds/all",
"https://www.universetoday.com/feed/",
"https://skyandtelescope.org/feed/",
"https://www.esa.int/rss",
"https://www.smithsonianmag.com/rss/",
"https://www.popsci.com/rss.xml",
"https://www.discovermagazine.com/rss",
"https://www.atlasobscura.com/feeds/latest"
]
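# Optional restore step (a sketch, not part of the original flow): if the dataset repo
# above already holds a Chroma snapshot, it could be pulled down before the store is
# opened below, so runs append to the existing index instead of rebuilding it from
# scratch. Assumes read access to REPO_ID; uses huggingface_hub.snapshot_download.
# from huggingface_hub import snapshot_download
# if not os.path.exists(LOCAL_DB_DIR):
#     snapshot_download(repo_id=REPO_ID, repo_type="dataset",
#                       local_dir=LOCAL_DB_DIR, token=HF_API_TOKEN)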
# Embedding model and vector DB
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
def fetch_rss_feeds():
    """Fetch up to five entries per feed and return a de-duplicated list of article dicts."""
    articles = []
    seen_articles = set()  # Track unique articles by title, link, and description
    for feed_url in RSS_FEEDS:
        try:
            logger.info(f"Fetching feed: {feed_url}")
            feed = feedparser.parse(feed_url)
            if feed.bozo:
                logger.warning(f"Failed to parse {feed_url}: {feed.bozo_exception}")
                continue
            unique_count = 0
            for entry in feed.entries[:5]:
                title = entry.get("title", "No Title")
                link = entry.get("link", "")
                description = entry.get("summary", entry.get("description", "No Description"))
                # Deduplication key: title, link, and the first 50 chars of the description
                # (strict enough for uniqueness without building overly long keys)
                article_key = f"{title}|{link}|{description[:50]}"
                if article_key not in seen_articles:
                    seen_articles.add(article_key)
                    unique_count += 1
                    # Guard against missing or empty media lists before reading an image URL
                    media_content = entry.get("media_content") or [{}]
                    media_thumbnail = entry.get("media_thumbnail") or [{}]
                    image = media_content[0].get("url") or media_thumbnail[0].get("url") or ""
                    articles.append({
                        "title": title,
                        "link": link,
                        "description": description,
                        "published": entry.get("published", "Unknown Date"),
                        "category": categorize_feed(feed_url),
                        "image": image,
                    })
            logger.info(f"Processed {unique_count} unique entries from {feed_url}")
        except Exception as e:
            logger.error(f"Error fetching {feed_url}: {e}")
    return articles
def categorize_feed(url):
    if "sciencedaily" in url or "phys.org" in url:
        return "Science & Physics"
    elif "horoscope" in url:
        return "Astrology"
    elif "politics" in url:
        return "Politics"
    elif "spaceweather" in url or "nasa" in url:
        return "Solar & Space"
    elif "weather" in url:
        return "Earth Weather"
    else:
        return "Cool Stuff"
def process_and_store_articles(articles):
    """Wrap articles as Documents, add them to the Chroma store, persist, and upload."""
    documents = []
    seen_docs = set()  # Additional de-duplication at the DB level
    for article in articles:
        try:
            key = f"{article['title']}|{article['link']}|{article['description'][:50]}"
            if key not in seen_docs:
                seen_docs.add(key)
                metadata = {
                    "title": article["title"] or "No Title",
                    "link": article["link"] or "",
                    "original_description": article["description"] or "No Description",
                    "published": article["published"] or "Unknown Date",
                    "category": article["category"] or "Uncategorized",
                    "image": article["image"] or "",
                }
                doc = Document(
                    page_content=article["description"] or "No Description",
                    metadata=metadata
                )
                documents.append(doc)
        except Exception as e:
            logger.error(f"Error processing article {article['title']}: {e}")
    try:
        if documents:  # skip the add when nothing is new; an empty add can raise in Chroma
            vector_db.add_documents(documents)
        vector_db.persist()
        logger.info("Vector DB persisted")
    except Exception as e:
        logger.error(f"Error adding documents to vector DB: {e}")
    upload_to_hf_hub()
def upload_to_hf_hub():
    """Mirror the local Chroma directory into the Hugging Face dataset repo."""
    if os.path.exists(LOCAL_DB_DIR):
        try:
            hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
            logger.info(f"Repository {REPO_ID} created or exists.")
        except Exception as e:
            logger.error(f"Error creating repo: {e}")
            return
        for root, _, files in os.walk(LOCAL_DB_DIR):
            for file in files:
                local_path = os.path.join(root, file)
                remote_path = os.path.relpath(local_path, LOCAL_DB_DIR)
                try:
                    hf_api.upload_file(
                        path_or_fileobj=local_path,
                        path_in_repo=remote_path,
                        repo_id=REPO_ID,
                        repo_type="dataset",
                        token=HF_API_TOKEN
                    )
                    logger.info(f"Uploaded {file} to {REPO_ID}")
                except Exception as e:
                    logger.error(f"Error uploading file {file}: {e}")
        logger.info(f"Database uploaded to: {REPO_ID}")