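"""News RAG pipeline: fetch articles from a set of RSS feeds, summarize and
sentiment-tag each one with a Hugging Face inference model, store the results
in a local Chroma vector database, and mirror that database to a Hugging Face
dataset repo.
"""
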
import os

import feedparser
from huggingface_hub import HfApi, InferenceClient
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document

# Hugging Face setup
HF_API_TOKEN = os.getenv("HF_API_TOKEN", "YOUR_HF_API_TOKEN")
HF_MODEL = "Qwen/Qwen2.5-72B-Instruct"
REPO_ID = "your-username/news-rag-db"
LOCAL_DB_DIR = "chroma_db"
client = InferenceClient(model=HF_MODEL, token=HF_API_TOKEN)
# RSS feeds
RSS_FEEDS = [
    "https://www.sciencedaily.com/rss/top/science.xml",
    "https://www.horoscope.com/us/horoscopes/general/rss/horoscope-rss.aspx",
    "http://rss.cnn.com/rss/cnn_allpolitics.rss",
    "https://phys.org/rss-feed/physics-news/",
    "https://www.spaceweatherlive.com/en/news/rss",
    "https://weather.com/feeds/rss",
    "https://www.wired.com/feed/rss",
    "https://www.nasa.gov/rss/dyn/breaking_news.rss",
    "https://www.nationalgeographic.com/feed/",
    "https://www.nature.com/nature.rss",
    "https://www.scientificamerican.com/rss/",
    "https://www.newscientist.com/feed/home/",
    "https://www.livescience.com/feeds/all",
    "https://www.hindustantimes.com/feed/horoscope/rss",
    "https://www.washingtonpost.com/wp-srv/style/horoscopes/rss.xml",
    "https://astrostyle.com/feed/",
    "https://www.vogue.com/feed/rss",
    "https://feeds.bbci.co.uk/news/politics/rss.xml",
    "https://www.reuters.com/arc/outboundfeeds/newsletter-politics/?outputType=xml",
    "https://www.politico.com/rss/politics.xml",
    "https://thehill.com/feed/",
    "https://www.aps.org/publications/apsnews/updates/rss.cfm",
    "https://www.quantamagazine.org/feed/",
    "https://www.sciencedaily.com/rss/matter_energy/physics.xml",
    "https://physicsworld.com/feed/",
    "https://www.swpc.noaa.gov/rss.xml",
    "https://www.nasa.gov/rss/dyn/solar_system.rss",
    "https://weather.com/science/space/rss",
    "https://www.space.com/feeds/space-weather",
    "https://www.accuweather.com/en/rss",
    "https://feeds.bbci.co.uk/weather/feeds/rss/5day/world/",
    "https://www.weather.gov/rss",
    "https://www.foxweather.com/rss",
    "https://techcrunch.com/feed/",
    "https://arstechnica.com/feed/",
    "https://gizmodo.com/rss",
    "https://www.theverge.com/rss/index.xml",
    "https://www.space.com/feeds/all",
    "https://www.universetoday.com/feed/",
    "https://skyandtelescope.org/feed/",
    "https://www.esa.int/rss",
    "https://www.smithsonianmag.com/rss/",
    "https://www.popsci.com/rss.xml",
    "https://www.discovermagazine.com/rss",
    "https://www.atlasobscura.com/feeds/latest",
]
# Embedding model and vector DB
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
hf_api = HfApi()

def fetch_rss_feeds():
    """Collect the newest entries from every feed in RSS_FEEDS."""
    articles = []
    for feed_url in RSS_FEEDS:
        feed = feedparser.parse(feed_url)
        for entry in feed.entries[:5]:  # Limit to 5 per feed
            articles.append({
                "title": entry.get("title", "No Title"),
                "link": entry.get("link", ""),
                "description": entry.get("summary", entry.get("description", "No Description")),
                "published": entry.get("published", "Unknown Date"),
                "category": categorize_feed(feed_url),
            })
    return articles

def categorize_feed(url):
    """Assign a coarse topic category based on the feed URL."""
    if "sciencedaily" in url or "phys.org" in url:
        return "Science & Physics"
    elif "horoscope" in url:
        return "Astrology"
    elif "politics" in url:
        return "Politics"
    elif "spaceweather" in url or "nasa" in url:
        return "Solar & Space"
    elif "weather" in url:
        return "Earth Weather"
    else:
        return "Cool Stuff"

def summarize_article(text):
    """Generate a concise summary of the article text via the inference model."""
    prompt = f"Summarize the following text concisely:\n\n{text}"
    try:
        response = client.text_generation(prompt, max_new_tokens=100, temperature=0.7)
        return response.strip()
    except Exception as e:
        print(f"Error summarizing article: {e}")
        return "Summary unavailable"

def analyze_sentiment(text):
    """Classify the sentiment of the article text as positive, negative, or neutral."""
    prompt = f"Classify the sentiment as positive, negative, or neutral:\n\n{text}"
    try:
        response = client.text_generation(prompt, max_new_tokens=10, temperature=0.7)
        return response.strip()
    except Exception as e:
        print(f"Error classifying sentiment: {e}")
        return "Neutral"

def process_and_store_articles(articles):
    """Summarize and sentiment-tag each article, then persist everything to Chroma."""
    documents = []
    for article in articles:
        summary = summarize_article(article["description"])
        sentiment = analyze_sentiment(article["description"])
        doc = Document(
            page_content=summary,
            metadata={
                "title": article["title"],
                "link": article["link"],
                "original_description": article["description"],
                "published": article["published"],
                "category": article["category"],
                "sentiment": sentiment,
            },
        )
        documents.append(doc)
    vector_db.add_documents(documents)
    vector_db.persist()
    upload_to_hf_hub()
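
# Hypothetical retrieval helper, not part of the original pipeline: a minimal
# sketch of how the stored summaries could be queried back out of the vector_db
# instance defined above, using Chroma's standard similarity search.
def search_articles(query, k=5):
    """Print the k stored article summaries most similar to the query."""
    for doc in vector_db.similarity_search(query, k=k):
        print(f"[{doc.metadata.get('category')}] {doc.metadata.get('title')}")
        print(f"  {doc.page_content}")
        print(f"  {doc.metadata.get('link')}")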

def upload_to_hf_hub():
    """Mirror the local Chroma directory to the Hugging Face dataset repo."""
    if os.path.exists(LOCAL_DB_DIR):
        try:
            hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
        except Exception as e:
            print(f"Error creating repo: {e}")
        for root, _, files in os.walk(LOCAL_DB_DIR):
            for file in files:
                local_path = os.path.join(root, file)
                remote_path = os.path.relpath(local_path, LOCAL_DB_DIR)
                try:
                    hf_api.upload_file(
                        path_or_fileobj=local_path,
                        path_in_repo=remote_path,
                        repo_id=REPO_ID,
                        repo_type="dataset",
                        token=HF_API_TOKEN,
                    )
                except Exception as e:
                    print(f"Error uploading file {file}: {e}")
        print(f"Database uploaded to: {REPO_ID}")
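
# Minimal entry point (an assumption; the snippet defines the pipeline but never
# invokes it): fetch, summarize, store, and upload in one pass.
if __name__ == "__main__":
    fetched = fetch_rss_feeds()
    print(f"Fetched {len(fetched)} articles")
    process_and_store_articles(fetched)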