broadfield-dev committed on
Commit
efdc13f
·
verified ·
1 Parent(s): ef5f71a

Update rss_processor.py

Browse files
Files changed (1) hide show
  1. rss_processor.py +20 -15
rss_processor.py CHANGED
@@ -16,7 +16,7 @@ HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
16
  REPO_ID = "broadfield-dev/news-rag-db"
17
  LOCAL_DB_DIR = "chroma_db"
18
 
19
- # Explicitly login to Hugging Face Hub (no InferenceClient needed anymore)
20
  login(token=HF_API_TOKEN)
21
  hf_api = HfApi()
22
 
@@ -77,14 +77,14 @@ def fetch_rss_feeds():
77
  logger.warning(f"Failed to parse {feed_url}: {feed.bozo_exception}")
78
  continue
79
  for entry in feed.entries[:5]:
80
- image = entry.get("media_content", [{}])[0].get("url") or entry.get("media_thumbnail", [{}])[0].get("url") or None
81
  articles.append({
82
  "title": entry.get("title", "No Title"),
83
  "link": entry.get("link", ""),
84
  "description": entry.get("summary", entry.get("description", "No Description")),
85
  "published": entry.get("published", "Unknown Date"),
86
  "category": categorize_feed(feed_url),
87
- "image": image,
88
  })
89
  logger.info(f"Processed {len(feed.entries[:5])} entries from {feed_url}")
90
  except Exception as e:
@@ -109,23 +109,28 @@ def process_and_store_articles(articles):
109
  documents = []
110
  for article in articles:
111
  try:
 
 
 
 
 
 
 
 
 
112
  doc = Document(
113
- page_content=article["description"],
114
- metadata={
115
- "title": article["title"],
116
- "link": article["link"],
117
- "original_description": article["description"],
118
- "published": article["published"],
119
- "category": article["category"],
120
- "image": article["image"],
121
- }
122
  )
123
  documents.append(doc)
124
  except Exception as e:
125
  logger.error(f"Error processing article {article['title']}: {e}")
126
- vector_db.add_documents(documents)
127
- vector_db.persist()
128
- logger.info("Vector DB persisted")
 
 
 
129
  upload_to_hf_hub()
130
 
131
  def upload_to_hf_hub():
 
16
  REPO_ID = "broadfield-dev/news-rag-db"
17
  LOCAL_DB_DIR = "chroma_db"
18
 
19
+ # Explicitly login to Hugging Face Hub
20
  login(token=HF_API_TOKEN)
21
  hf_api = HfApi()
22
 
 
77
  logger.warning(f"Failed to parse {feed_url}: {feed.bozo_exception}")
78
  continue
79
  for entry in feed.entries[:5]:
80
+ image = entry.get("media_content", [{}])[0].get("url") or entry.get("media_thumbnail", [{}])[0].get("url") or ""
81
  articles.append({
82
  "title": entry.get("title", "No Title"),
83
  "link": entry.get("link", ""),
84
  "description": entry.get("summary", entry.get("description", "No Description")),
85
  "published": entry.get("published", "Unknown Date"),
86
  "category": categorize_feed(feed_url),
87
+ "image": image if image else "",
88
  })
89
  logger.info(f"Processed {len(feed.entries[:5])} entries from {feed_url}")
90
  except Exception as e:
 
109
  documents = []
110
  for article in articles:
111
  try:
112
+ # Ensure no None values in metadata
113
+ metadata = {
114
+ "title": article["title"] or "No Title",
115
+ "link": article["link"] or "",
116
+ "original_description": article["description"] or "No Description",
117
+ "published": article["published"] or "Unknown Date",
118
+ "category": article["category"] or "Uncategorized",
119
+ "image": article["image"] or "",
120
+ }
121
  doc = Document(
122
+ page_content=article["description"] or "No Description",
123
+ metadata=metadata
 
 
 
 
 
 
 
124
  )
125
  documents.append(doc)
126
  except Exception as e:
127
  logger.error(f"Error processing article {article['title']}: {e}")
128
+ try:
129
+ vector_db.add_documents(documents)
130
+ vector_db.persist()
131
+ logger.info("Vector DB persisted")
132
+ except Exception as e:
133
+ logger.error(f"Error adding documents to vector DB: {e}")
134
  upload_to_hf_hub()
135
 
136
  def upload_to_hf_hub():