broadfield-dev committed
Commit 715921b · verified · 1 Parent(s): 5d47c6a

Update rss_processor.py

Files changed (1):
  1. rss_processor.py +39 -128
rss_processor.py CHANGED
@@ -1,181 +1,92 @@
 import os
 import feedparser
-import sys
-from huggingface_hub import HfApi, login
 from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.docstore.document import Document
-import shutil
 import logging
-import hashlib
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Hugging Face setup
-HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
-REPO_ID = "broadfield-dev/news-rag-db"
 LOCAL_DB_DIR = "chroma_db"
-
-# Explicitly login to Hugging Face Hub
-#login(token=HF_API_TOKEN)
-hf_api = HfApi()
-
-# RSS feeds
 RSS_FEEDS = [
     "https://www.sciencedaily.com/rss/top/science.xml",
-    "https://www.horoscope.com/us/horoscopes/general/rss/horoscope-rss.aspx",
-    "http://rss.cnn.com/rss/cnn_allpolitics.rss",
-    "https://phys.org/rss-feed/physics-news/",
-    "https://www.spaceweatherlive.com/en/news/rss",
-    "https://weather.com/feeds/rss",
     "https://www.wired.com/feed/rss",
-    "https://www.nasa.gov/rss/dyn/breaking_news.rss",
-    "https://www.nationalgeographic.com/feed/",
-    "https://www.nature.com/nature.rss",
-    "https://www.scientificamerican.com/rss/",
-    "https://www.newscientist.com/feed/home/",
-    "https://www.livescience.com/feeds/all",
-    "https://astrostyle.com/feed/",
-    "https://www.vogue.com/feed/rss",
-    "https://feeds.bbci.co.uk/news/politics/rss.xml",
-    "https://www.reuters.com/arc/outboundfeeds/newsletter-politics/?outputType=xml",
-    "https://www.politico.com/rss/politics.xml",
-    "https://thehill.com/feed/",
-    "https://www.aps.org/publications/apsnews/updates/rss.cfm",
-    "https://www.quantamagazine.org/feed/",
-    "https://www.sciencedaily.com/rss/matter_energy/physics.xml",
-    "https://physicsworld.com/feed/",
-    "https://www.swpc.noaa.gov/rss.xml",
-    "https://feeds.bbci.co.uk/weather/feeds/rss/5day/world/",
-    "https://www.weather.gov/rss",
-    "https://www.foxweather.com/rss",
-    "https://techcrunch.com/feed/",
-    "https://arstechnica.com/feed/",
-    "https://gizmodo.com/rss",
-    "https://www.theverge.com/rss/index.xml",
-    "https://www.space.com/feeds/all",
-    "https://www.universetoday.com/feed/",
-    "https://skyandtelescope.org/feed/",
-    "https://www.esa.int/rss",
-    "https://www.smithsonianmag.com/rss/",
-    "https://www.popsci.com/rss.xml",
-    "https://www.discovermagazine.com/rss",
-    "https://www.atlasobscura.com/feeds/latest"
 ]
 
-# Embedding model and vector DB
 embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
 
 def fetch_rss_feeds():
     articles = []
-    seen_articles = set() # Track unique articles by title, link, and full description hash
     for feed_url in RSS_FEEDS:
         try:
-            logger.info(f"Fetching feed: {feed_url}")
             feed = feedparser.parse(feed_url)
             if feed.bozo:
-                logger.warning(f"Failed to parse {feed_url}: {feed.bozo_exception}")
                 continue
-            unique_count = 0
-            for entry in feed.entries[:5]:
                 title = entry.get("title", "No Title")
                 link = entry.get("link", "")
                 description = entry.get("summary", entry.get("description", "No Description"))
-                # Use full MD5 hash of description for stricter uniqueness
-                desc_hash = hashlib.md5(description.encode()).hexdigest()
-                article_key = f"{title}|{link}|{desc_hash}"
-                if article_key not in seen_articles:
-                    seen_articles.add(article_key)
-                    unique_count += 1
-                    image = entry.get("media_content", [{}])[0].get("url") or entry.get("media_thumbnail", [{}])[0].get("url") or ""
                     articles.append({
                         "title": title,
                         "link": link,
                         "description": description,
                         "published": entry.get("published", "Unknown Date"),
                         "category": categorize_feed(feed_url),
-                        "image": image if image else "",
                     })
-            logger.info(f"Processed {unique_count} unique entries from {feed_url}")
         except Exception as e:
             logger.error(f"Error fetching {feed_url}: {e}")
     return articles
 
 def categorize_feed(url):
-    if "sciencedaily" in url or "phys.org" in url:
-        return "Science & Physics"
-    elif "horoscope" in url:
-        return "Astrology"
-    elif "politics" in url:
-        return "Politics"
-    elif "spaceweather" in url or "nasa" in url:
-        return "Solar & Space"
-    elif "weather" in url:
-        return "Earth Weather"
-    else:
-        return "Cool Stuff"
 
 def process_and_store_articles(articles):
     documents = []
-    seen_docs = set() # Additional de-duplication at DB level
     for article in articles:
         try:
-            desc_hash = hashlib.md5(article["description"].encode()).hexdigest()
-            key = f"{article['title']}|{article['link']}|{desc_hash}"
-            if key not in seen_docs:
-                seen_docs.add(key)
-                metadata = {
-                    "title": article["title"] or "No Title",
-                    "link": article["link"] or "",
-                    "original_description": article["description"] or "No Description",
-                    "published": article["published"] or "Unknown Date",
-                    "category": article["category"] or "Uncategorized",
-                    "image": article["image"] or "",
-                }
-                doc = Document(
-                    page_content=article["description"] or "No Description",
-                    metadata=metadata
-                )
-                documents.append(doc)
         except Exception as e:
             logger.error(f"Error processing article {article['title']}: {e}")
-    try:
-        vector_db.add_documents(documents)
-        # Removed manual persist() as Chroma auto-persists since 0.4.x
-        logger.info("Vector DB updated (auto-persisted)")
-    except Exception as e:
-        logger.error(f"Error adding documents to vector DB: {e}")
-    upload_to_hf_hub()
-
-def upload_to_hf_hub():
-    if os.path.exists(LOCAL_DB_DIR):
         try:
-            hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
-            logger.info(f"Repository {REPO_ID} created or exists.")
         except Exception as e:
-            logger.error(f"Error creating repo: {e}")
-            return
-        for root, _, files in os.walk(LOCAL_DB_DIR):
-            for file in files:
-                local_path = os.path.join(root, file)
-                remote_path = os.path.relpath(local_path, LOCAL_DB_DIR)
-                try:
-                    hf_api.upload_file(
-                        path_or_fileobj=local_path,
-                        path_in_repo=remote_path,
-                        repo_id=REPO_ID,
-                        repo_type="dataset",
-                        token=HF_API_TOKEN
-                    )
-                    logger.info(f"Uploaded {file} to {REPO_ID}")
-                except Exception as e:
-                    logger.error(f"Error uploading file {file}: {e}")
-        logger.info(f"Database uploaded to: {REPO_ID}")
 
 if __name__ == "__main__":
-    if len(sys.argv) > 1 and sys.argv[1] == "load_feeds":
-        articles = fetch_rss_feeds()
-        process_and_store_articles(articles)
 
 import os
 import feedparser
 from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.docstore.document import Document
 import logging
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Constants
 LOCAL_DB_DIR = "chroma_db"
 RSS_FEEDS = [
+    "https://www.nasa.gov/rss/dyn/breaking_news.rss",
     "https://www.sciencedaily.com/rss/top/science.xml",
     "https://www.wired.com/feed/rss",
+    # Add more feeds as needed; starting with reliable ones
 ]
 
+# Initialize embedding model and vector DB
 embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
 
 def fetch_rss_feeds():
     articles = []
+    seen_keys = set()
     for feed_url in RSS_FEEDS:
         try:
+            logger.info(f"Fetching {feed_url}")
             feed = feedparser.parse(feed_url)
             if feed.bozo:
+                logger.warning(f"Parse error for {feed_url}: {feed.bozo_exception}")
                 continue
+            for entry in feed.entries:
                 title = entry.get("title", "No Title")
                 link = entry.get("link", "")
                 description = entry.get("summary", entry.get("description", "No Description"))
+                key = f"{title}|{link}"
+                if key not in seen_keys:
+                    seen_keys.add(key)
+                    image = (entry.get("media_content", [{}])[0].get("url") or
+                             entry.get("media_thumbnail", [{}])[0].get("url") or "svg")
                     articles.append({
                         "title": title,
                         "link": link,
                         "description": description,
                         "published": entry.get("published", "Unknown Date"),
                         "category": categorize_feed(feed_url),
+                        "image": image,
                     })
         except Exception as e:
             logger.error(f"Error fetching {feed_url}: {e}")
+    logger.info(f"Total articles fetched: {len(articles)}")
     return articles
 
 def categorize_feed(url):
+    if "sciencedaily" in url:
+        return "Science"
+    elif "nasa" in url:
+        return "Space"
+    elif "wired" in url:
+        return "Tech"
+    return "Uncategorized"
 
 def process_and_store_articles(articles):
     documents = []
     for article in articles:
         try:
+            metadata = {
+                "title": article["title"],
+                "link": article["link"],
+                "original_description": article["description"],
+                "published": article["published"],
+                "category": article["category"],
+                "image": article["image"],
+            }
+            doc = Document(page_content=article["description"], metadata=metadata)
+            documents.append(doc)
         except Exception as e:
             logger.error(f"Error processing article {article['title']}: {e}")
+
+    if documents:
         try:
+            vector_db.add_documents(documents)
+            logger.info(f"Stored {len(documents)} articles in DB")
         except Exception as e:
+            logger.error(f"Error storing articles: {e}")
 
 if __name__ == "__main__":
+    articles = fetch_rss_feeds()
+    process_and_store_articles(articles)
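
For context, a minimal sketch of how the Chroma store written by this script could be queried back, assuming the same LangChain APIs used above and that rss_processor.py has been run at least once; the query string and the k value below are illustrative placeholders, not part of this commit:

from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Reopen the persisted directory with the same embedding model the script used
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = Chroma(persist_directory="chroma_db", embedding_function=embedding_model)

# Illustrative query: retrieve the 3 stored article descriptions most similar to the text
results = vector_db.similarity_search("latest NASA mission news", k=3)
for doc in results:
    print(doc.metadata.get("title"), "-", doc.metadata.get("link"))

Retrieval only works if the same all-MiniLM-L6-v2 embeddings are used for both storage and querying, which is why the model name here matches the one in the script.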