Spaces:
Sleeping
Sleeping
Update newsletter_api.py
Browse files- newsletter_api.py +100 -42
newsletter_api.py
CHANGED
@@ -37,51 +37,109 @@ async def extract_titles_from_rss(feed_urls: list[str]) -> list[str]:
|
|
37 |
except Exception as e:
|
38 |
return {"Error": str(e)}
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
@app.post("/extract_news")
|
41 |
-
def extract_news_from_rss(feed_urls: list[str],
|
42 |
-
"""Extracts news articles from RSS feeds relevant to
|
43 |
try:
|
44 |
-
|
45 |
|
46 |
-
|
|
|
|
|
|
|
47 |
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
except Exception as e:
|
87 |
return {"Error": str(e)}
|
|
|
37 |
except Exception as e:
|
38 |
return {"Error": str(e)}
|
39 |
|
40 |
+
# @app.post("/extract_news")
|
41 |
+
# def extract_news_from_rss(feed_urls: list[str], topic: str, threshold: float = 0.7):
|
42 |
+
# """Extracts news articles from RSS feeds relevant to a single topic using embeddings."""
|
43 |
+
# try:
|
44 |
+
# topic_articles = []
|
45 |
+
|
46 |
+
# topic_embedding = model.encode(topic, convert_to_tensor=True)
|
47 |
+
|
48 |
+
# for url in feed_urls:
|
49 |
+
# feed = feedparser.parse(url)
|
50 |
+
# for entry in feed.entries:
|
51 |
+
# title = entry.get('title', '')
|
52 |
+
# link = entry.get('link', '')
|
53 |
+
# summary = entry.get('summary', '') or entry.get('description', '')
|
54 |
+
|
55 |
+
# raw_content = entry.get('content')
|
56 |
+
# if isinstance(raw_content, list) and raw_content:
|
57 |
+
# content = raw_content[0].get('value', '')
|
58 |
+
# elif isinstance(raw_content, str):
|
59 |
+
# content = raw_content
|
60 |
+
# else:
|
61 |
+
# content = ''
|
62 |
+
|
63 |
+
# article_text = title + " " + summary
|
64 |
+
# article_embedding = model.encode(article_text, convert_to_tensor=True)
|
65 |
+
|
66 |
+
# score = util.cos_sim(article_embedding, topic_embedding).item()
|
67 |
+
|
68 |
+
# # Replace double quotes inside title and summary with single quotes
|
69 |
+
# title = title.replace('"', "'")
|
70 |
+
# summary = summary.replace('"', "'")
|
71 |
+
# content = content.replace('"', "'")
|
72 |
+
|
73 |
+
# if score >= threshold:
|
74 |
+
# topic_articles.append({
|
75 |
+
# "topic": topic,
|
76 |
+
# "title": title,
|
77 |
+
# "link": link,
|
78 |
+
# "summary": summary,
|
79 |
+
# "similarity": score
|
80 |
+
# })
|
81 |
+
|
82 |
+
# # Sort articles by similarity score
|
83 |
+
# topic_articles.sort(key=lambda x: x["similarity"], reverse=True)
|
84 |
+
|
85 |
+
# return topic_articles
|
86 |
+
# except Exception as e:
|
87 |
+
# return {"Error": str(e)}
|
88 |
+
|
89 |
@app.post("/extract_news")
def extract_news_from_rss(feed_urls: list[str], topics: list[str], threshold: float = 0.7):
    """Extracts news articles from RSS feeds relevant to multiple topics using embeddings.

    Every entry of every feed is embedded once (title + summary) and then
    scored against each topic embedding with ``util.cos_sim``. Entries whose
    score is at least ``threshold`` are reported under that topic.

    Args:
        feed_urls: RSS/Atom feed URLs handed to ``feedparser.parse``.
        topics: Topic strings to match articles against.
        threshold: Minimum similarity score for an article to be included
            under a topic.

    Returns:
        A list with one ``{"topic": ..., "news_articles": [...]}`` dict per
        topic, in the order the topics were given. Each article is a dict with
        ``title``, ``link``, ``summary`` and ``similarity`` keys, sorted by
        descending similarity. If anything fails, ``{"Error": <message>}`` is
        returned instead of raising.
    """
    try:
        # Nothing to match against: skip the (potentially slow) feed downloads.
        if not topics:
            return []

        # Phase 1: fetch and embed each article exactly once. Neither the feed
        # download nor the article embedding depends on the topic, so doing
        # this inside the topic loop would repeat the work once per topic.
        articles = []
        for url in feed_urls:
            feed = feedparser.parse(url)
            for entry in feed.entries:
                title = entry.get('title', '')
                link = entry.get('link', '')
                summary = entry.get('summary', '') or entry.get('description', '')

                # The embedding is computed from the raw text, i.e. before the
                # quote substitution applied to the returned fields below.
                article_embedding = model.encode(title + " " + summary, convert_to_tensor=True)

                articles.append((
                    article_embedding,
                    {
                        # Replace double quotes with single quotes in the
                        # returned text (presumably to keep downstream
                        # consumers of the payload happy - TODO confirm).
                        "title": title.replace('"', "'"),
                        "link": link,
                        "summary": summary.replace('"', "'"),
                    },
                ))

        # Phase 2: score the cached article embeddings against each topic.
        all_results = []
        for topic in topics:
            topic_embedding = model.encode(topic, convert_to_tensor=True)

            topic_articles = []
            for article_embedding, fields in articles:
                score = util.cos_sim(article_embedding, topic_embedding).item()
                if score >= threshold:
                    topic_articles.append({**fields, "similarity": score})

            # Most similar first; the sort is stable, so ties keep feed order.
            topic_articles.sort(key=lambda x: x["similarity"], reverse=True)

            all_results.append({
                "topic": topic,
                "news_articles": topic_articles,
            })

        return all_results
    except Exception as e:
        # Errors are reported in the response body rather than raised.
        return {"Error": str(e)}
|