Spaces:
Running
Running
Update rss_processor.py
Browse files
- rss_processor.py +8 -5
rss_processor.py
CHANGED
@@ -17,11 +17,9 @@ HF_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
|
|
17 |
REPO_ID = "broadfield-dev/news-rag-db"
|
18 |
LOCAL_DB_DIR = "chroma_db"
|
19 |
|
20 |
-
# Explicitly login to Hugging Face Hub
|
21 |
login(token=HF_API_TOKEN)
|
22 |
client = InferenceClient(model=HF_MODEL, token=HF_API_TOKEN)
|
23 |
|
24 |
-
# RSS feeds
|
25 |
RSS_FEEDS = [
|
26 |
"https://www.sciencedaily.com/rss/top/science.xml",
|
27 |
"https://www.horoscope.com/us/horoscopes/general/rss/horoscope-rss.aspx",
|
@@ -36,6 +34,8 @@ RSS_FEEDS = [
|
|
36 |
"https://www.scientificamerican.com/rss/",
|
37 |
"https://www.newscientist.com/feed/home/",
|
38 |
"https://www.livescience.com/feeds/all",
|
|
|
|
|
39 |
"https://astrostyle.com/feed/",
|
40 |
"https://www.vogue.com/feed/rss",
|
41 |
"https://feeds.bbci.co.uk/news/politics/rss.xml",
|
@@ -47,6 +47,10 @@ RSS_FEEDS = [
|
|
47 |
"https://www.sciencedaily.com/rss/matter_energy/physics.xml",
|
48 |
"https://physicsworld.com/feed/",
|
49 |
"https://www.swpc.noaa.gov/rss.xml",
|
|
|
|
|
|
|
|
|
50 |
"https://feeds.bbci.co.uk/weather/feeds/rss/5day/world/",
|
51 |
"https://www.weather.gov/rss",
|
52 |
"https://www.foxweather.com/rss",
|
@@ -64,7 +68,6 @@ RSS_FEEDS = [
|
|
64 |
"https://www.atlasobscura.com/feeds/latest"
|
65 |
]
|
66 |
|
67 |
-
# Embedding model and vector DB
|
68 |
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
69 |
vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
|
70 |
hf_api = HfApi()
|
@@ -125,11 +128,11 @@ def categorize_article(text):
|
|
125 |
logger.error(f"Error categorizing article: {e}")
|
126 |
return "Neutral"
|
127 |
|
128 |
-
def process_and_store_articles(articles):
|
129 |
documents = []
|
130 |
for article in articles:
|
131 |
try:
|
132 |
-
summary = summarize_article(article["description"])
|
133 |
sentiment = categorize_article(article["description"])
|
134 |
doc = Document(
|
135 |
page_content=summary,
|
|
|
17 |
REPO_ID = "broadfield-dev/news-rag-db"
|
18 |
LOCAL_DB_DIR = "chroma_db"
|
19 |
|
|
|
20 |
login(token=HF_API_TOKEN)
|
21 |
client = InferenceClient(model=HF_MODEL, token=HF_API_TOKEN)
|
22 |
|
|
|
23 |
RSS_FEEDS = [
|
24 |
"https://www.sciencedaily.com/rss/top/science.xml",
|
25 |
"https://www.horoscope.com/us/horoscopes/general/rss/horoscope-rss.aspx",
|
|
|
34 |
"https://www.scientificamerican.com/rss/",
|
35 |
"https://www.newscientist.com/feed/home/",
|
36 |
"https://www.livescience.com/feeds/all",
|
37 |
+
"https://www.hindustantimes.com/feed/horoscope/rss",
|
38 |
+
"https://www.washingtonpost.com/wp-srv/style/horoscopes/rss.xml",
|
39 |
"https://astrostyle.com/feed/",
|
40 |
"https://www.vogue.com/feed/rss",
|
41 |
"https://feeds.bbci.co.uk/news/politics/rss.xml",
|
|
|
47 |
"https://www.sciencedaily.com/rss/matter_energy/physics.xml",
|
48 |
"https://physicsworld.com/feed/",
|
49 |
"https://www.swpc.noaa.gov/rss.xml",
|
50 |
+
"https://www.nasa.gov/rss/dyn/solar_system.rss",
|
51 |
+
"https://weather.com/science/space/rss",
|
52 |
+
"https://www.space.com/feeds/space-weather",
|
53 |
+
"https://www.accuweather.com/en/rss",
|
54 |
"https://feeds.bbci.co.uk/weather/feeds/rss/5day/world/",
|
55 |
"https://www.weather.gov/rss",
|
56 |
"https://www.foxweather.com/rss",
|
|
|
68 |
"https://www.atlasobscura.com/feeds/latest"
|
69 |
]
|
70 |
|
|
|
71 |
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
72 |
vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
|
73 |
hf_api = HfApi()
|
|
|
128 |
logger.error(f"Error categorizing article: {e}")
|
129 |
return "Neutral"
|
130 |
|
131 |
+
def process_and_store_articles(articles, summarize=False):
|
132 |
documents = []
|
133 |
for article in articles:
|
134 |
try:
|
135 |
+
summary = summarize_article(article["description"]) if summarize else article["description"]
|
136 |
sentiment = categorize_article(article["description"])
|
137 |
doc = Document(
|
138 |
page_content=summary,
|