broadfield-dev committed on
Commit f63fa31 · verified · 1 parent: b9891ea

Create rss_processor.py

Files changed (1)
rss_processor.py +157 -0
rss_processor.py ADDED
@@ -0,0 +1,157 @@
+ import os
+ import feedparser
+ from huggingface_hub import HfApi, InferenceClient
+ from langchain.vectorstores import Chroma
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.docstore.document import Document
+ import shutil
+
+ # Hugging Face setup
+ HF_API_TOKEN = os.getenv("HF_API_TOKEN", "YOUR_HF_API_TOKEN")
+ HF_MODEL = "Qwen/Qwen-72B-Instruct"
+ REPO_ID = "your-username/news-rag-db"
+ LOCAL_DB_DIR = "chroma_db"
+ client = InferenceClient(model=HF_MODEL, token=HF_API_TOKEN)
+
+ # RSS feeds
+ RSS_FEEDS = [
+     "https://www.sciencedaily.com/rss/top/science.xml",
+     "https://www.horoscope.com/us/horoscopes/general/rss/horoscope-rss.aspx",
+     "http://rss.cnn.com/rss/cnn_allpolitics.rss",
+     "https://phys.org/rss-feed/physics-news/",
+     "https://www.spaceweatherlive.com/en/news/rss",
+     "https://weather.com/feeds/rss",
+     "https://www.wired.com/feed/rss",
+     "https://www.nasa.gov/rss/dyn/breaking_news.rss",
+     "https://www.nationalgeographic.com/feed/",
+     "https://www.nature.com/nature.rss",
+     "https://www.scientificamerican.com/rss/",
+     "https://www.newscientist.com/feed/home/",
+     "https://www.livescience.com/feeds/all",
+     "https://www.hindustantimes.com/feed/horoscope/rss",
+     "https://www.washingtonpost.com/wp-srv/style/horoscopes/rss.xml",
+     "https://astrostyle.com/feed/",
+     "https://www.vogue.com/feed/rss",
+     "https://feeds.bbci.co.uk/news/politics/rss.xml",
+     "https://www.reuters.com/arc/outboundfeeds/newsletter-politics/?outputType=xml",
+     "https://www.politico.com/rss/politics.xml",
+     "https://thehill.com/feed/",
+     "https://www.aps.org/publications/apsnews/updates/rss.cfm",
+     "https://www.quantamagazine.org/feed/",
+     "https://www.sciencedaily.com/rss/matter_energy/physics.xml",
+     "https://physicsworld.com/feed/",
+     "https://www.swpc.noaa.gov/rss.xml",
+     "https://www.nasa.gov/rss/dyn/solar_system.rss",
+     "https://weather.com/science/space/rss",
+     "https://www.space.com/feeds/space-weather",
+     "https://www.accuweather.com/en/rss",
+     "https://feeds.bbci.co.uk/weather/feeds/rss/5day/world/",
+     "https://www.weather.gov/rss",
+     "https://www.foxweather.com/rss",
+     "https://techcrunch.com/feed/",
+     "https://arstechnica.com/feed/",
+     "https://gizmodo.com/rss",
+     "https://www.theverge.com/rss/index.xml",
+     "https://www.space.com/feeds/all",
+     "https://www.universetoday.com/feed/",
+     "https://skyandtelescope.org/feed/",
+     "https://www.esa.int/rss",
+     "https://www.smithsonianmag.com/rss/",
+     "https://www.popsci.com/rss.xml",
+     "https://www.discovermagazine.com/rss",
+     "https://www.atlasobscura.com/feeds/latest"
+ ]
+
+ # Embedding model and vector DB
+ embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+ vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
+ hf_api = HfApi()
+
+ def fetch_rss_feeds():
+     articles = []
+     for feed_url in RSS_FEEDS:
+         feed = feedparser.parse(feed_url)
+         for entry in feed.entries[:5]:  # Limit to 5 per feed
+             articles.append({
+                 "title": entry.get("title", "No Title"),
+                 "link": entry.get("link", ""),
+                 "description": entry.get("summary", entry.get("description", "No Description")),
+                 "published": entry.get("published", "Unknown Date"),
+                 "category": categorize_feed(feed_url),
+             })
+     return articles
+
+ def categorize_feed(url):
+     if "sciencedaily" in url or "phys.org" in url:
+         return "Science & Physics"
+     elif "horoscope" in url:
+         return "Astrology"
+     elif "politics" in url:
+         return "Politics"
+     elif "spaceweather" in url or "nasa" in url:
+         return "Solar & Space"
+     elif "weather" in url:
+         return "Earth Weather"
+     else:
+         return "Cool Stuff"
+
+ def summarize_article(text):
+     prompt = f"Summarize the following text concisely:\n\n{text}"
+     try:
+         response = client.text_generation(prompt, max_new_tokens=100, temperature=0.7)
+         return response.strip()
+     except Exception as e:
+         print(f"Error summarizing article: {e}")
+         return "Summary unavailable"
+
+ def categorize_article(text):
+     # Note: despite the name, this returns a sentiment label (positive/negative/neutral).
+     prompt = f"Classify the sentiment as positive, negative, or neutral:\n\n{text}"
+     try:
+         response = client.text_generation(prompt, max_new_tokens=10, temperature=0.7)
+         return response.strip()
+     except Exception as e:
+         print(f"Error categorizing article: {e}")
+         return "Neutral"
+
+ def process_and_store_articles(articles):
+     documents = []
+     for article in articles:
+         summary = summarize_article(article["description"])
+         sentiment = categorize_article(article["description"])
+         doc = Document(
+             page_content=summary,
+             metadata={
+                 "title": article["title"],
+                 "link": article["link"],
+                 "original_description": article["description"],
+                 "published": article["published"],
+                 "category": article["category"],
+                 "sentiment": sentiment,
+             }
+         )
+         documents.append(doc)
+     vector_db.add_documents(documents)
+     vector_db.persist()
+     upload_to_hf_hub()
+
+ def upload_to_hf_hub():
+     if os.path.exists(LOCAL_DB_DIR):
+         try:
+             hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True)
+         except Exception as e:
+             print(f"Error creating repo: {e}")
+         for root, _, files in os.walk(LOCAL_DB_DIR):
+             for file in files:
+                 local_path = os.path.join(root, file)
+                 remote_path = os.path.relpath(local_path, LOCAL_DB_DIR)
+                 try:
+                     hf_api.upload_file(
+                         path_or_fileobj=local_path,
+                         path_in_repo=remote_path,
+                         repo_id=REPO_ID,
+                         repo_type="dataset",
+                         token=HF_API_TOKEN
+                     )
+                 except Exception as e:
+                     print(f"Error uploading file {file}: {e}")
+         print(f"Database uploaded to: {REPO_ID}")