import os
import feedparser
from huggingface_hub import HfApi, InferenceClient, login
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
import shutil
import logging
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Hugging Face setup
HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "DEMO_HF_API_TOKEN")
HF_MODEL = "Qwen/Qwen-72B-Instruct"
REPO_ID = "broadfield-dev/news-rag-db" # Ensure this is your repo
LOCAL_DB_DIR = "chroma_db"
# Explicitly login to Hugging Face Hub
login(token=HF_API_TOKEN)
client = InferenceClient(model=HF_MODEL, token=HF_API_TOKEN)
# RSS feeds
RSS_FEEDS = [
    "https://www.sciencedaily.com/rss/top/science.xml",
    "https://www.horoscope.com/us/horoscopes/general/rss/horoscope-rss.aspx",
    "http://rss.cnn.com/rss/cnn_allpolitics.rss",
    "https://phys.org/rss-feed/physics-news/",
    "https://www.spaceweatherlive.com/en/news/rss",
    "https://weather.com/feeds/rss",
    "https://www.wired.com/feed/rss",
    "https://www.nasa.gov/rss/dyn/breaking_news.rss",
    "https://www.nationalgeographic.com/feed/",
    "https://www.nature.com/nature.rss",
    "https://www.scientificamerican.com/rss/",
    "https://www.newscientist.com/feed/home/",
    "https://www.livescience.com/feeds/all",
    "https://www.hindustantimes.com/feed/horoscope/rss",
    "https://www.washingtonpost.com/wp-srv/style/horoscopes/rss.xml",
    "https://astrostyle.com/feed/",
    "https://www.vogue.com/feed/rss",
    "https://feeds.bbci.co.uk/news/politics/rss.xml",
    "https://www.reuters.com/arc/outboundfeeds/newsletter-politics/?outputType=xml",
    "https://www.politico.com/rss/politics.xml",
    "https://thehill.com/feed/",
    "https://www.aps.org/publications/apsnews/updates/rss.cfm",
    "https://www.quantamagazine.org/feed/",
    "https://www.sciencedaily.com/rss/matter_energy/physics.xml",
    "https://physicsworld.com/feed/",
    "https://www.swpc.noaa.gov/rss.xml",
    "https://www.nasa.gov/rss/dyn/solar_system.rss",
    "https://weather.com/science/space/rss",
    "https://www.space.com/feeds/space-weather",
    "https://www.accuweather.com/en/rss",
    "https://feeds.bbci.co.uk/weather/feeds/rss/5day/world/",
    "https://www.weather.gov/rss",
    "https://www.foxweather.com/rss",
    "https://techcrunch.com/feed/",
    "https://arstechnica.com/feed/",
    "https://gizmodo.com/rss",
    "https://www.theverge.com/rss/index.xml",
    "https://www.space.com/feeds/all",
    "https://www.universetoday.com/feed/",
    "https://skyandtelescope.org/feed/",
    "https://www.esa.int/rss",
    "https://www.smithsonianmag.com/rss/",
    "https://www.popsci.com/rss.xml",
    "https://www.discovermagazine.com/rss",
    "https://www.atlasobscura.com/feeds/latest",
]
# Embedding model and vector DB
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
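# Retrieval sketch (illustrative, not part of the ingest flow): once articles
# are stored, the Chroma DB can be queried directly. The query string below is
# a hypothetical example; `k` caps the number of matches returned.
#
#   hits = vector_db.similarity_search("solar flare forecast this week", k=3)
#   for hit in hits:
#       print(hit.metadata["title"], "->", hit.metadata["link"])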
hf_api = HfApi()
def fetch_rss_feeds():
    """Pull the five most recent entries from every feed in RSS_FEEDS."""
    articles = []
    for feed_url in RSS_FEEDS:
        feed = feedparser.parse(feed_url)
        for entry in feed.entries[:5]:
            # media_content / media_thumbnail may be missing or empty lists,
            # so guard the [0] access before reading the URL.
            image = None
            for key in ("media_content", "media_thumbnail"):
                media = entry.get(key) or []
                if media and media[0].get("url"):
                    image = media[0]["url"]
                    break
            articles.append({
                "title": entry.get("title", "No Title"),
                "link": entry.get("link", ""),
                "description": entry.get("summary", entry.get("description", "No Description")),
                "published": entry.get("published", "Unknown Date"),
                "category": categorize_feed(feed_url),
                "image": image,
            })
    return articles
def categorize_feed(url):
    if "sciencedaily" in url or "phys.org" in url:
        return "Science & Physics"
    elif "horoscope" in url:
        return "Astrology"
    elif "politics" in url:
        return "Politics"
    elif "spaceweather" in url or "nasa" in url:
        return "Solar & Space"
    elif "weather" in url:
        return "Earth Weather"
    else:
        return "Cool Stuff"
def summarize_article(text):
    prompt = f"Summarize the following text concisely:\n\n{text}"
    try:
        response = client.text_generation(prompt, max_new_tokens=100, temperature=0.7)
        return response.strip()
    except Exception as e:
        logger.error(f"Error summarizing article: {e}")
        return "Summary unavailable"
def categorize_article(text):
    # Classifies sentiment (positive/negative/neutral); the result is stored
    # under the "sentiment" metadata key below.
    prompt = f"Classify the sentiment as positive, negative, or neutral:\n\n{text}"
    try:
        response = client.text_generation(prompt, max_new_tokens=10, temperature=0.7)
        return response.strip()
    except Exception as e:
        logger.error(f"Error categorizing article: {e}")
        return "Neutral"
def process_and_store_articles(articles):
    documents = []
    for article in articles:
        summary = summarize_article(article["description"])
        sentiment = categorize_article(article["description"])
        doc = Document(
            page_content=summary,
            metadata={
                "title": article["title"],
                "link": article["link"],
                "original_description": article["description"],
                "published": article["published"],
                "category": article["category"],
                "sentiment": sentiment,
                "image": article["image"] if article["image"] else "https://via.placeholder.com/150",
            },
        )
        documents.append(doc)
    vector_db.add_documents(documents)
    vector_db.persist()
    upload_to_hf_hub()
def upload_to_hf_hub():
    if os.path.exists(LOCAL_DB_DIR):
        try:
            hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
            logger.info(f"Repository {REPO_ID} created or exists.")
        except Exception as e:
            logger.error(f"Error creating repo: {e}")
            return
        for root, _, files in os.walk(LOCAL_DB_DIR):
            for file in files:
                local_path = os.path.join(root, file)
                remote_path = os.path.relpath(local_path, LOCAL_DB_DIR)
                try:
                    hf_api.upload_file(
                        path_or_fileobj=local_path,
                        path_in_repo=remote_path,
                        repo_id=REPO_ID,
                        repo_type="dataset",
                        token=HF_API_TOKEN,
                    )
                    logger.info(f"Uploaded {file} to {REPO_ID}")
                except Exception as e:
                    logger.error(f"Error uploading file {file}: {e}")
        logger.info(f"Database uploaded to: {REPO_ID}")