broadfield-dev committed
Commit 15033cb · verified · 1 Parent(s): 2aa963e

Update rss_processor.py

Files changed (1)
  1. rss_processor.py +33 -4
rss_processor.py CHANGED
@@ -35,6 +35,9 @@ vector_db = Chroma(
     collection_name=COLLECTION_NAME
 )
 
+from datetime import datetime
+import dateutil.parser  # Add this dependency: pip install python-dateutil
+
 def fetch_rss_feeds():
     articles = []
     seen_keys = set()
@@ -51,13 +54,39 @@ def fetch_rss_feeds():
                 break
             title = entry.get("title", "No Title").strip()
             link = entry.get("link", "").strip()
-            description = entry.get("summary", entry.get("description", "No Description"))
-            published = entry.get("published", "Unknown Date").strip()
+            description = entry.get("summary", entry.get("description", "No Description")).strip()
+
+            # Try multiple date fields and parse flexibly
+            published = "Unknown Date"
+            for date_field in ["published", "updated", "created"]:
+                if date_field in entry:
+                    try:
+                        parsed_date = dateutil.parser.parse(entry[date_field])
+                        published = parsed_date.strftime("%Y-%m-%d %H:%M:%S")
+                        break
+                    except (ValueError, TypeError) as e:
+                        logger.debug(f"Failed to parse {date_field} '{entry[date_field]}': {e}")
+                        continue
+
             key = f"{title}|{link}|{published}"
             if key not in seen_keys:
                 seen_keys.add(key)
-                image = (entry.get("media_content", [{}])[0].get("url") or
-                         entry.get("media_thumbnail", [{}])[0].get("url") or "svg")
+                # Try multiple image sources
+                image = "svg"  # Default fallback
+                for img_source in [
+                    lambda e: e.get("media_content", [{}])[0].get("url"),
+                    lambda e: e.get("media_thumbnail", [{}])[0].get("url"),
+                    lambda e: e.get("enclosure", {}).get("url"),
+                    lambda e: next((lnk.get("href") for lnk in e.get("links", []) if lnk.get("type", "").startswith("image")), None),
+                ]:
+                    try:
+                        img = img_source(entry)
+                        if img:
+                            image = img
+                            break
+                    except (IndexError, AttributeError, TypeError):
+                        continue
+
                 articles.append({
                     "title": title,
                     "link": link,