broadfield-dev committed on
Commit
f827315
·
verified ·
1 Parent(s): 833eb54

Update rss_processor.py

Browse files
Files changed (1) hide show
  1. rss_processor.py +29 -11
rss_processor.py CHANGED
@@ -1,16 +1,24 @@
1
  import os
2
  import feedparser
3
- from huggingface_hub import HfApi, InferenceClient
4
  from langchain.vectorstores import Chroma
5
  from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.docstore.document import Document
7
  import shutil
 
 
 
 
 
8
 
9
  # Hugging Face setup
10
  HF_API_TOKEN = os.getenv("HF_API_TOKEN", "DEMO_HF_API_TOKEN")
11
  HF_MODEL = "Qwen/Qwen-72B-Instruct"
12
- REPO_ID = "broadfield-dev/news-rag-db"
13
  LOCAL_DB_DIR = "chroma_db"
 
 
 
14
  client = InferenceClient(model=HF_MODEL, token=HF_API_TOKEN)
15
 
16
  # RSS feeds
@@ -28,6 +36,8 @@ RSS_FEEDS = [
28
  "https://www.scientificamerican.com/rss/",
29
  "https://www.newscientist.com/feed/home/",
30
  "https://www.livescience.com/feeds/all",
 
 
31
  "https://astrostyle.com/feed/",
32
  "https://www.vogue.com/feed/rss",
33
  "https://feeds.bbci.co.uk/news/politics/rss.xml",
@@ -39,6 +49,10 @@ RSS_FEEDS = [
39
  "https://www.sciencedaily.com/rss/matter_energy/physics.xml",
40
  "https://physicsworld.com/feed/",
41
  "https://www.swpc.noaa.gov/rss.xml",
 
 
 
 
42
  "https://feeds.bbci.co.uk/weather/feeds/rss/5day/world/",
43
  "https://www.weather.gov/rss",
44
  "https://www.foxweather.com/rss",
@@ -64,17 +78,17 @@ hf_api = HfApi()
64
  def fetch_rss_feeds():
65
  articles = []
66
  for feed_url in RSS_FEEDS:
67
- print('processing ', feed_url)
68
  feed = feedparser.parse(feed_url)
69
- for entry in feed.entries[:5]: # Limit to 5 per feed
 
70
  articles.append({
71
  "title": entry.get("title", "No Title"),
72
  "link": entry.get("link", ""),
73
  "description": entry.get("summary", entry.get("description", "No Description")),
74
  "published": entry.get("published", "Unknown Date"),
75
  "category": categorize_feed(feed_url),
 
76
  })
77
- print(entry)
78
  return articles
79
 
80
  def categorize_feed(url):
@@ -97,7 +111,7 @@ def summarize_article(text):
97
  response = client.text_generation(prompt, max_new_tokens=100, temperature=0.7)
98
  return response.strip()
99
  except Exception as e:
100
- print(f"Error summarizing article: {e}")
101
  return "Summary unavailable"
102
 
103
  def categorize_article(text):
@@ -106,7 +120,7 @@ def categorize_article(text):
106
  response = client.text_generation(prompt, max_new_tokens=10, temperature=0.7)
107
  return response.strip()
108
  except Exception as e:
109
- print(f"Error categorizing article: {e}")
110
  return "Neutral"
111
 
112
  def process_and_store_articles(articles):
@@ -123,6 +137,7 @@ def process_and_store_articles(articles):
123
  "published": article["published"],
124
  "category": article["category"],
125
  "sentiment": sentiment,
 
126
  }
127
  )
128
  documents.append(doc)
@@ -133,9 +148,11 @@ def process_and_store_articles(articles):
133
  def upload_to_hf_hub():
134
  if os.path.exists(LOCAL_DB_DIR):
135
  try:
136
- hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True)
 
137
  except Exception as e:
138
- print(f"Error creating repo: {e}")
 
139
  for root, _, files in os.walk(LOCAL_DB_DIR):
140
  for file in files:
141
  local_path = os.path.join(root, file)
@@ -148,6 +165,7 @@ def upload_to_hf_hub():
148
  repo_type="dataset",
149
  token=HF_API_TOKEN
150
  )
 
151
  except Exception as e:
152
- print(f"Error uploading file {file}: {e}")
153
- print(f"Database uploaded to: {REPO_ID}")
 
1
  import os
2
  import feedparser
3
+ from huggingface_hub import HfApi, InferenceClient, login
4
  from langchain.vectorstores import Chroma
5
  from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.docstore.document import Document
7
  import shutil
8
+ import logging
9
+
10
+ # Setup logging
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
 
14
  # Hugging Face setup
15
  HF_API_TOKEN = os.getenv("HF_API_TOKEN", "DEMO_HF_API_TOKEN")
16
  HF_MODEL = "Qwen/Qwen-72B-Instruct"
17
+ REPO_ID = "broadfield-dev/news-rag-db" # Ensure this is your repo
18
  LOCAL_DB_DIR = "chroma_db"
19
+
20
+ # Explicitly login to Hugging Face Hub
21
+ login(token=HF_API_TOKEN)
22
  client = InferenceClient(model=HF_MODEL, token=HF_API_TOKEN)
23
 
24
  # RSS feeds
 
36
  "https://www.scientificamerican.com/rss/",
37
  "https://www.newscientist.com/feed/home/",
38
  "https://www.livescience.com/feeds/all",
39
+ "https://www.hindustantimes.com/feed/horoscope/rss",
40
+ "https://www.washingtonpost.com/wp-srv/style/horoscopes/rss.xml",
41
  "https://astrostyle.com/feed/",
42
  "https://www.vogue.com/feed/rss",
43
  "https://feeds.bbci.co.uk/news/politics/rss.xml",
 
49
  "https://www.sciencedaily.com/rss/matter_energy/physics.xml",
50
  "https://physicsworld.com/feed/",
51
  "https://www.swpc.noaa.gov/rss.xml",
52
+ "https://www.nasa.gov/rss/dyn/solar_system.rss",
53
+ "https://weather.com/science/space/rss",
54
+ "https://www.space.com/feeds/space-weather",
55
+ "https://www.accuweather.com/en/rss",
56
  "https://feeds.bbci.co.uk/weather/feeds/rss/5day/world/",
57
  "https://www.weather.gov/rss",
58
  "https://www.foxweather.com/rss",
 
78
def fetch_rss_feeds():
    """Fetch recent entries from every feed in RSS_FEEDS.

    Returns:
        list[dict]: one record per article with keys ``title``, ``link``,
        ``description``, ``published``, ``category`` and ``image`` (a media
        URL, or ``None`` when the entry carries no usable image).
    """
    articles = []
    for feed_url in RSS_FEEDS:
        feed = feedparser.parse(feed_url)
        for entry in feed.entries[:5]:  # limit to 5 articles per feed
            # Prefer media_content, fall back to media_thumbnail.
            # NOTE: `or [{}]` also guards the case where the key is present
            # but maps to an *empty* list -- the previous
            # `entry.get("media_content", [{}])[0]` raised IndexError there.
            image = None
            for media_key in ("media_content", "media_thumbnail"):
                media = entry.get(media_key) or [{}]
                image = media[0].get("url")
                if image:
                    break
            image = image or None  # normalize falsy values ("" etc.) to None
            articles.append({
                "title": entry.get("title", "No Title"),
                "link": entry.get("link", ""),
                "description": entry.get("summary", entry.get("description", "No Description")),
                "published": entry.get("published", "Unknown Date"),
                "category": categorize_feed(feed_url),
                "image": image,
            })
    return articles
93
 
94
  def categorize_feed(url):
 
111
  response = client.text_generation(prompt, max_new_tokens=100, temperature=0.7)
112
  return response.strip()
113
  except Exception as e:
114
+ logger.error(f"Error summarizing article: {e}")
115
  return "Summary unavailable"
116
 
117
  def categorize_article(text):
 
120
  response = client.text_generation(prompt, max_new_tokens=10, temperature=0.7)
121
  return response.strip()
122
  except Exception as e:
123
+ logger.error(f"Error categorizing article: {e}")
124
  return "Neutral"
125
 
126
  def process_and_store_articles(articles):
 
137
  "published": article["published"],
138
  "category": article["category"],
139
  "sentiment": sentiment,
140
+ "image": article["image"] if article["image"] else "https://via.placeholder.com/150",
141
  }
142
  )
143
  documents.append(doc)
 
148
  def upload_to_hf_hub():
149
  if os.path.exists(LOCAL_DB_DIR):
150
  try:
151
+ hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
152
+ logger.info(f"Repository {REPO_ID} created or exists.")
153
  except Exception as e:
154
+ logger.error(f"Error creating repo: {e}")
155
+ return
156
  for root, _, files in os.walk(LOCAL_DB_DIR):
157
  for file in files:
158
  local_path = os.path.join(root, file)
 
165
  repo_type="dataset",
166
  token=HF_API_TOKEN
167
  )
168
+ logger.info(f"Uploaded {file} to {REPO_ID}")
169
  except Exception as e:
170
+ logger.error(f"Error uploading file {file}: {e}")
171
+ logger.info(f"Database uploaded to: {REPO_ID}")