broadfield-dev committed on
Commit
de78f0e
·
verified ·
1 Parent(s): cb518f2

Update rss_processor.py

Browse files
Files changed (1) hide show
  1. rss_processor.py +41 -29
rss_processor.py CHANGED
@@ -12,13 +12,13 @@ logging.basicConfig(level=logging.INFO)
12
  logger = logging.getLogger(__name__)
13
 
14
  # Hugging Face setup
15
- HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "DEMO_HF_API_TOKEN")
16
  HF_MODEL = "Qwen/Qwen-72B-Instruct"
17
- REPO_ID = "broadfield-dev/news-rag-db" # Ensure this is your repo
18
  LOCAL_DB_DIR = "chroma_db"
19
 
20
  # Explicitly login to Hugging Face Hub
21
- #login(token=HF_API_TOKEN)
22
  client = InferenceClient(model=HF_MODEL, token=HF_API_TOKEN)
23
 
24
  # RSS feeds
@@ -78,17 +78,25 @@ hf_api = HfApi()
78
  def fetch_rss_feeds():
79
  articles = []
80
  for feed_url in RSS_FEEDS:
81
- feed = feedparser.parse(feed_url)
82
- for entry in feed.entries[:5]:
83
- image = entry.get("media_content", [{}])[0].get("url") or entry.get("media_thumbnail", [{}])[0].get("url") or None
84
- articles.append({
85
- "title": entry.get("title", "No Title"),
86
- "link": entry.get("link", ""),
87
- "description": entry.get("summary", entry.get("description", "No Description")),
88
- "published": entry.get("published", "Unknown Date"),
89
- "category": categorize_feed(feed_url),
90
- "image": image,
91
- })
 
 
 
 
 
 
 
 
92
  return articles
93
 
94
  def categorize_feed(url):
@@ -126,23 +134,27 @@ def categorize_article(text):
126
  def process_and_store_articles(articles):
127
  documents = []
128
  for article in articles:
129
- summary = summarize_article(article["description"])
130
- sentiment = categorize_article(article["description"])
131
- doc = Document(
132
- page_content=summary,
133
- metadata={
134
- "title": article["title"],
135
- "link": article["link"],
136
- "original_description": article["description"],
137
- "published": article["published"],
138
- "category": article["category"],
139
- "sentiment": sentiment,
140
- "image": article["image"] if article["image"] else "https://via.placeholder.com/150",
141
- }
142
- )
143
- documents.append(doc)
 
 
 
144
  vector_db.add_documents(documents)
145
  vector_db.persist()
 
146
  upload_to_hf_hub()
147
 
148
  def upload_to_hf_hub():
 
12
logger = logging.getLogger(__name__)

# Hugging Face setup.
# NOTE(review): the env-var fallback is a placeholder string, not a real
# token — API calls will fail unless DEMO_HF_API_TOKEN is set.
HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
HF_MODEL = "Qwen/Qwen-72B-Instruct"
REPO_ID = "broadfield-dev/news-rag-db"
LOCAL_DB_DIR = "chroma_db"

# Explicitly login to Hugging Face Hub, but only when a real token is
# configured: calling login() with the placeholder raises at import time
# and makes the whole module unimportable.
if HF_API_TOKEN and HF_API_TOKEN != "YOUR_HF_API_TOKEN":
    login(token=HF_API_TOKEN)
else:
    logger.warning("DEMO_HF_API_TOKEN not set; skipping Hugging Face Hub login")
client = InferenceClient(model=HF_MODEL, token=HF_API_TOKEN)
23
 
24
  # RSS feeds
 
78
def fetch_rss_feeds():
    """Fetch up to 5 recent entries from each feed in RSS_FEEDS.

    Returns:
        list[dict]: one dict per article with keys ``title``, ``link``,
        ``description``, ``published``, ``category`` and ``image``
        (image URL string or None).

    Feeds that fail to download or parse are logged and skipped; a
    failure in one feed never aborts the others.
    """
    articles = []
    for feed_url in RSS_FEEDS:
        try:
            logger.info(f"Fetching feed: {feed_url}")
            feed = feedparser.parse(feed_url)
            if feed.bozo:
                logger.warning(f"Failed to parse {feed_url}: {feed.bozo_exception}")
                continue
            # Hoist the slice: it was previously computed twice (loop + log).
            entries = feed.entries[:5]
            for entry in entries:
                # media_content / media_thumbnail can be present but EMPTY
                # lists; the old `entry.get(key, [{}])[0]` only defends the
                # missing-key case and raised IndexError on an empty list,
                # aborting the rest of the feed via the broad except below.
                media = entry.get("media_content") or [{}]
                thumb = entry.get("media_thumbnail") or [{}]
                image = media[0].get("url") or thumb[0].get("url") or None
                articles.append({
                    "title": entry.get("title", "No Title"),
                    "link": entry.get("link", ""),
                    "description": entry.get("summary", entry.get("description", "No Description")),
                    "published": entry.get("published", "Unknown Date"),
                    "category": categorize_feed(feed_url),
                    "image": image,
                })
            logger.info(f"Processed {len(entries)} entries from {feed_url}")
        except Exception as e:
            # Best-effort per feed: log and move on to the next URL.
            logger.error(f"Error fetching {feed_url}: {e}")
    return articles
101
 
102
  def categorize_feed(url):
 
134
def process_and_store_articles(articles):
    """Summarize, classify, and store articles in the Chroma vector DB.

    Args:
        articles: list of article dicts as produced by fetch_rss_feeds().

    Side effects: adds documents to ``vector_db``, persists the local DB,
    and uploads it to the Hugging Face Hub. Articles that fail processing
    are logged and skipped rather than aborting the batch.
    """
    documents = []
    for article in articles:
        try:
            summary = summarize_article(article["description"])
            sentiment = categorize_article(article["description"])
            doc = Document(
                page_content=summary,
                metadata={
                    "title": article["title"],
                    "link": article["link"],
                    "original_description": article["description"],
                    "published": article["published"],
                    "category": article["category"],
                    "sentiment": sentiment,
                    # Chroma metadata values must not be None; restore the
                    # placeholder fallback that the previous revision used.
                    "image": article["image"] or "https://via.placeholder.com/150",
                }
            )
            documents.append(doc)
        except Exception as e:
            logger.error(f"Error processing article {article['title']}: {e}")
    # Chroma rejects an empty batch, so only add when something succeeded.
    if documents:
        vector_db.add_documents(documents)
    vector_db.persist()
    logger.info("Vector DB persisted")
    upload_to_hf_hub()
159
 
160
  def upload_to_hf_hub():