Spaces:
Sleeping
Sleeping
Update rss_processor.py
Browse files- rss_processor.py +33 -4
rss_processor.py
CHANGED
@@ -35,6 +35,9 @@ vector_db = Chroma(
|
|
35 |
collection_name=COLLECTION_NAME
|
36 |
)
|
37 |
|
|
|
|
|
|
|
38 |
def fetch_rss_feeds():
|
39 |
articles = []
|
40 |
seen_keys = set()
|
@@ -51,13 +54,39 @@ def fetch_rss_feeds():
|
|
51 |
break
|
52 |
title = entry.get("title", "No Title").strip()
|
53 |
link = entry.get("link", "").strip()
|
54 |
-
description = entry.get("summary", entry.get("description", "No Description"))
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
key = f"{title}|{link}|{published}"
|
57 |
if key not in seen_keys:
|
58 |
seen_keys.add(key)
|
59 |
-
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
articles.append({
|
62 |
"title": title,
|
63 |
"link": link,
|
|
|
35 |
collection_name=COLLECTION_NAME
|
36 |
)
|
37 |
|
38 |
+
from datetime import datetime
|
39 |
+
import dateutil.parser # Add this dependency: pip install python-dateutil
|
40 |
+
|
41 |
def fetch_rss_feeds():
|
42 |
articles = []
|
43 |
seen_keys = set()
|
|
|
54 |
break
|
55 |
title = entry.get("title", "No Title").strip()
|
56 |
link = entry.get("link", "").strip()
|
57 |
+
description = entry.get("summary", entry.get("description", "No Description")).strip()
|
58 |
+
|
59 |
+
# Try multiple date fields and parse flexibly
|
60 |
+
published = "Unknown Date"
|
61 |
+
for date_field in ["published", "updated", "created"]:
|
62 |
+
if date_field in entry:
|
63 |
+
try:
|
64 |
+
parsed_date = dateutil.parser.parse(entry[date_field])
|
65 |
+
published = parsed_date.strftime("%Y-%m-%d %H:%M:%S")
|
66 |
+
break
|
67 |
+
except (ValueError, TypeError) as e:
|
68 |
+
logger.debug(f"Failed to parse {date_field} '{entry[date_field]}': {e}")
|
69 |
+
continue
|
70 |
+
|
71 |
key = f"{title}|{link}|{published}"
|
72 |
if key not in seen_keys:
|
73 |
seen_keys.add(key)
|
74 |
+
# Try multiple image sources
|
75 |
+
image = "svg" # Default fallback
|
76 |
+
for img_source in [
|
77 |
+
lambda e: e.get("media_content", [{}])[0].get("url"),
|
78 |
+
lambda e: e.get("media_thumbnail", [{}])[0].get("url"),
|
79 |
+
lambda e: e.get("enclosure", {}).get("url"),
|
80 |
+
lambda e: next((lnk.get("href") for lnk in e.get("links", []) if lnk.get("type", "").startswith("image")), None),
|
81 |
+
]:
|
82 |
+
try:
|
83 |
+
img = img_source(entry)
|
84 |
+
if img:
|
85 |
+
image = img
|
86 |
+
break
|
87 |
+
except (IndexError, AttributeError, TypeError):
|
88 |
+
continue
|
89 |
+
|
90 |
articles.append({
|
91 |
"title": title,
|
92 |
"link": link,
|