Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -7,17 +7,58 @@ from bs4 import BeautifulSoup
|
|
7 |
import hashlib
|
8 |
import threading
|
9 |
import logging
|
10 |
-
|
11 |
-
# Add this to your imports
|
12 |
from transformers import MarianMTModel, MarianTokenizer
|
13 |
|
14 |
# Set up logging
|
15 |
logging.basicConfig(level=logging.INFO)
|
16 |
logger = logging.getLogger(__name__)
|
17 |
|
18 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-ar-en"
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
class Translator:
|
22 |
def __init__(self):
|
23 |
self.model = None
|
@@ -43,15 +84,87 @@ class Translator:
|
|
43 |
logger.error(f"Translation error: {str(e)}")
|
44 |
return text
|
45 |
|
46 |
-
# Initialize translator
|
47 |
translator = Translator()
|
48 |
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
def is_arabic_source(source_name):
    """Return True when *source_name* contains one of the Arabic-feed markers.

    The comparison is case-insensitive and substring-based, so a name such as
    "france24 arabic" matches through the generic 'arabic' marker.

    Args:
        source_name: Display name of a news source. ``None`` or an empty
            string is treated as "not Arabic" instead of raising.

    Returns:
        bool: True if any marker occurs in the lower-cased name.
    """
    if not source_name:
        return False
    # Lower-case once instead of once per marker.
    lowered = source_name.lower()
    return any(
        arabic_indicator in lowered
        for arabic_indicator in ('arabic', 'alarabiya', 'aljazeera', 'alwatanvoice')
    )
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
def summarize_text(text, model_name, source):
|
56 |
try:
|
57 |
# Translate if it's an Arabic source
|
@@ -93,7 +206,6 @@ def summarize_articles(articles, model_name):
|
|
93 |
def get_summary(tech_sources, business_sources, science_sources, world_sources,
|
94 |
sports_sources, health_sources, selected_model):
|
95 |
try:
|
96 |
-
# Check if any sources are selected
|
97 |
if not any([tech_sources, business_sources, science_sources,
|
98 |
world_sources, sports_sources, health_sources]):
|
99 |
return "Please select at least one news source."
|
@@ -118,21 +230,18 @@ with demo:
|
|
118 |
|
119 |
with gr.Row():
|
120 |
with gr.Column():
|
121 |
-
# Technology sources
|
122 |
tech_sources = gr.CheckboxGroup(
|
123 |
choices=list(NEWS_SOURCES["Technology"].keys()),
|
124 |
label="Technology Sources",
|
125 |
value=[]
|
126 |
)
|
127 |
|
128 |
-
# Business sources
|
129 |
business_sources = gr.CheckboxGroup(
|
130 |
choices=list(NEWS_SOURCES["Business"].keys()),
|
131 |
label="Business Sources",
|
132 |
value=[]
|
133 |
)
|
134 |
|
135 |
-
# Science sources
|
136 |
science_sources = gr.CheckboxGroup(
|
137 |
choices=list(NEWS_SOURCES["Science"].keys()),
|
138 |
label="Science Sources",
|
@@ -140,21 +249,18 @@ with demo:
|
|
140 |
)
|
141 |
|
142 |
with gr.Column():
|
143 |
-
# World News sources
|
144 |
world_sources = gr.CheckboxGroup(
|
145 |
choices=list(NEWS_SOURCES["World News"].keys()),
|
146 |
label="World News Sources",
|
147 |
value=[]
|
148 |
)
|
149 |
|
150 |
-
# Sports sources
|
151 |
sports_sources = gr.CheckboxGroup(
|
152 |
choices=list(NEWS_SOURCES["Sports"].keys()),
|
153 |
label="Sports Sources",
|
154 |
value=[]
|
155 |
)
|
156 |
|
157 |
-
# Health sources
|
158 |
health_sources = gr.CheckboxGroup(
|
159 |
choices=list(NEWS_SOURCES["Health"].keys()),
|
160 |
label="Health Sources",
|
@@ -171,7 +277,6 @@ with demo:
|
|
171 |
summarize_button = gr.Button("Get News Summary")
|
172 |
summary_output = gr.Textbox(label="News Summary", lines=20)
|
173 |
|
174 |
-
# Connect the components to the summary function
|
175 |
summarize_button.click(
|
176 |
get_summary,
|
177 |
inputs=[
|
|
|
7 |
import hashlib
|
8 |
import threading
|
9 |
import logging
|
|
|
|
|
10 |
from transformers import MarianMTModel, MarianTokenizer
|
11 |
|
12 |
# Set up logging
|
13 |
logging.basicConfig(level=logging.INFO)
|
14 |
logger = logging.getLogger(__name__)
|
15 |
|
16 |
+
# Global settings
# Display label -> model identifier for the summarizer choices.
SUMMARIZER_MODELS = {
    "Default (facebook/bart-large-cnn)": "facebook/bart-large-cnn",
    "Free Model (distilbart-cnn-6-6)": "sshleifer/distilbart-cnn-6-6",
}
CACHE_SIZE = 500
RSS_FETCH_INTERVAL = timedelta(hours=8)
ARTICLE_LIMIT = 5
TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-ar-en"

# Categories and news sources
CATEGORIES = ["Technology", "Business", "Science", "World News", "Sports", "Health"]

# category -> {source display name -> RSS feed URL}.
# Source names containing "arabic" are recognised by is_arabic_source().
NEWS_SOURCES = {
    "Technology": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best",
        "alarabiya arabic": "https://www.alarabiya.net/feed/rss2/ar/technology.xml",
    },
    "Business": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best",
        "alwatanvoice arabic": "https://feeds.alwatanvoice.com/ar/business.xml",
    },
    "Science": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml",
    },
    "World News": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
        "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
        "CNN": "http://rss.cnn.com/rss/edition_world.rss",
        "reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best",
        "france24 arabic": "https://www.france24.com/ar/rss",
        # Key was misspelled "aljazera"; corrected to match the feed's domain.
        "aljazeera arabic": "https://www.aljazeera.net/aljazeerarss/a7c186be-1baa-4bd4-9d80-a84db769f779/73d0e1b4-532f-45ef-b135-bfdff8b8cab9",
    },
    "Sports": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best",
        "france24 arabic": "https://www.france24.com/ar/%D8%B1%D9%8A%D8%A7%D8%B6%D8%A9/rss",
    },
    "Health": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
        "politico": "http://rss.politico.com/healthcare.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best",
    },
}
|
61 |
+
|
62 |
class Translator:
|
63 |
def __init__(self):
|
64 |
self.model = None
|
|
|
84 |
logger.error(f"Translation error: {str(e)}")
|
85 |
return text
|
86 |
|
87 |
+
# Initialize translator and cache
|
88 |
translator = Translator()
|
89 |
|
90 |
+
class NewsCache:
    """Bounded key/value cache guarded by a lock.

    Entries are evicted in insertion order (oldest first) once the number of
    stored keys reaches ``size``. Eviction relies on ``dict`` preserving
    insertion order.
    """

    def __init__(self, size):
        """Create an empty cache holding at most *size* entries."""
        # key -> cached value, in insertion order.
        self.cache = {}
        # Maximum number of entries; a value <= 0 disables storage.
        self.size = size
        # Serialises access to self.cache from concurrent callers.
        self.lock = threading.Lock()

    def get(self, key):
        """Return the value stored under *key*, or None when absent."""
        with self.lock:
            return self.cache.get(key)

    def set(self, key, value):
        """Store *value* under *key*, evicting the oldest entries if full.

        Overwriting an existing key never evicts another entry, and a
        non-positive ``size`` makes this a no-op instead of raising.
        """
        with self.lock:
            if key in self.cache:
                # Replacing a value does not grow the cache, so nothing needs
                # to be evicted; the key keeps its original position.
                self.cache[key] = value
                return
            if self.size <= 0:
                # No capacity at all: there is nothing to evict and no room
                # to store, so drop the value.
                return
            while len(self.cache) >= self.size:
                oldest_key = next(iter(self.cache))
                del self.cache[oldest_key]
            self.cache[key] = value
|
106 |
+
|
107 |
+
cache = NewsCache(CACHE_SIZE)
|
108 |
|
109 |
def is_arabic_source(source_name):
    """Return True when *source_name* contains one of the Arabic-feed markers.

    The comparison is case-insensitive and substring-based, so a name such as
    "france24 arabic" matches through the generic 'arabic' marker.

    Args:
        source_name: Display name of a news source. ``None`` or an empty
            string is treated as "not Arabic" instead of raising.

    Returns:
        bool: True if any marker occurs in the lower-cased name.
    """
    if not source_name:
        return False
    # Lower-case once instead of once per marker.
    lowered = source_name.lower()
    return any(
        arabic_indicator in lowered
        for arabic_indicator in ('arabic', 'alarabiya', 'aljazeera', 'alwatanvoice')
    )
|
111 |
|
112 |
+
def _entry_to_article(entry, category, source, cutoff_time):
    """Build an article dict from one parsed feed entry.

    Returns None when the entry was published at or before *cutoff_time*.
    Raises AttributeError or TypeError when the entry has no usable
    ``published_parsed``, ``title``, ``description`` or ``link``; the caller
    logs and skips such entries.
    """
    # NOTE(review): treats published_parsed as a UTC time tuple, which is what
    # feedparser documents — confirm if another parser is ever swapped in.
    published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
    if published <= cutoff_time:
        return None
    return {
        "title": entry.title,
        # Feed descriptions may contain HTML; keep only the text.
        "description": BeautifulSoup(entry.description, "html.parser").get_text(),
        "link": entry.link,
        "category": category,
        "source": source,
        "published": published,
    }


def fetch_rss_news(tech_sources, business_sources, science_sources, world_sources, sports_sources, health_sources):
    """Fetch recent articles from the selected RSS sources.

    Each argument is the list of source names selected for one category (or a
    falsy value when nothing is selected). Names are looked up in
    NEWS_SOURCES; entries published within RSS_FETCH_INTERVAL of now are kept.

    Returns:
        list[dict]: At most ARTICLE_LIMIT article dicts, newest first, each
        with the keys "title", "description", "link", "category", "source"
        and "published".

    Feed-level and entry-level failures are logged and skipped, so one bad
    feed never aborts the whole fetch.
    """
    articles = []
    cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL

    category_sources = {
        "Technology": tech_sources or [],
        "Business": business_sources or [],
        "Science": science_sources or [],
        "World News": world_sources or [],
        "Sports": sports_sources or [],
        "Health": health_sources or [],
    }

    logger.info(f"Selected sources: {category_sources}")

    for category, sources in category_sources.items():
        if not sources:
            continue

        logger.info(f"Processing category: {category} with sources: {sources}")
        known_feeds = NEWS_SOURCES[category]

        for source in sources:
            url = known_feeds.get(source)
            if url is None:
                # Previously ignored silently; make the mismatch visible.
                logger.warning(f"Unknown source {source!r} for category {category}")
                continue

            try:
                logger.info(f"Fetching from URL: {url}")
                feed = feedparser.parse(url)

                # Only treat HTTP error codes as failures. feedparser reports
                # 301/302 for feeds that were redirected but still parsed, so
                # requiring exactly 200 would discard valid feeds.
                status = getattr(feed, 'status', None)
                if status is not None and status >= 400:
                    logger.warning(f"Failed to fetch feed from {url}. Status: {feed.status}")
                    continue

                for entry in feed.entries:
                    try:
                        article = _entry_to_article(entry, category, source, cutoff_time)
                    except (AttributeError, TypeError) as e:
                        logger.error(f"Error processing entry: {str(e)}")
                        continue
                    if article is not None:
                        articles.append(article)

            except Exception as e:
                # Boundary handler: a broken feed must not stop the others.
                logger.error(f"Error fetching feed from {url}: {str(e)}")
                continue

    logger.info(f"Total articles fetched: {len(articles)}")
    articles = sorted(articles, key=lambda x: x["published"], reverse=True)[:ARTICLE_LIMIT]
    return articles
|
167 |
+
|
168 |
def summarize_text(text, model_name, source):
|
169 |
try:
|
170 |
# Translate if it's an Arabic source
|
|
|
206 |
def get_summary(tech_sources, business_sources, science_sources, world_sources,
|
207 |
sports_sources, health_sources, selected_model):
|
208 |
try:
|
|
|
209 |
if not any([tech_sources, business_sources, science_sources,
|
210 |
world_sources, sports_sources, health_sources]):
|
211 |
return "Please select at least one news source."
|
|
|
230 |
|
231 |
with gr.Row():
|
232 |
with gr.Column():
|
|
|
233 |
tech_sources = gr.CheckboxGroup(
|
234 |
choices=list(NEWS_SOURCES["Technology"].keys()),
|
235 |
label="Technology Sources",
|
236 |
value=[]
|
237 |
)
|
238 |
|
|
|
239 |
business_sources = gr.CheckboxGroup(
|
240 |
choices=list(NEWS_SOURCES["Business"].keys()),
|
241 |
label="Business Sources",
|
242 |
value=[]
|
243 |
)
|
244 |
|
|
|
245 |
science_sources = gr.CheckboxGroup(
|
246 |
choices=list(NEWS_SOURCES["Science"].keys()),
|
247 |
label="Science Sources",
|
|
|
249 |
)
|
250 |
|
251 |
with gr.Column():
|
|
|
252 |
world_sources = gr.CheckboxGroup(
|
253 |
choices=list(NEWS_SOURCES["World News"].keys()),
|
254 |
label="World News Sources",
|
255 |
value=[]
|
256 |
)
|
257 |
|
|
|
258 |
sports_sources = gr.CheckboxGroup(
|
259 |
choices=list(NEWS_SOURCES["Sports"].keys()),
|
260 |
label="Sports Sources",
|
261 |
value=[]
|
262 |
)
|
263 |
|
|
|
264 |
health_sources = gr.CheckboxGroup(
|
265 |
choices=list(NEWS_SOURCES["Health"].keys()),
|
266 |
label="Health Sources",
|
|
|
277 |
summarize_button = gr.Button("Get News Summary")
|
278 |
summary_output = gr.Textbox(label="News Summary", lines=20)
|
279 |
|
|
|
280 |
summarize_button.click(
|
281 |
get_summary,
|
282 |
inputs=[
|