loayshabet committed on
Commit
372c5e7
·
verified ·
1 Parent(s): 2cc6057

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -14
app.py CHANGED
@@ -7,17 +7,58 @@ from bs4 import BeautifulSoup
7
  import hashlib
8
  import threading
9
  import logging
10
-
11
- # Add this to your imports
12
  from transformers import MarianMTModel, MarianTokenizer
13
 
14
  # Set up logging
15
  logging.basicConfig(level=logging.INFO)
16
  logger = logging.getLogger(__name__)
17
 
18
- # Add translation model configuration
 
 
 
 
 
 
 
19
  TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-ar-en"
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  class Translator:
22
  def __init__(self):
23
  self.model = None
@@ -43,15 +84,87 @@ class Translator:
43
  logger.error(f"Translation error: {str(e)}")
44
  return text
45
 
46
- # Initialize translator
47
  translator = Translator()
48
 
49
- # Rest of your existing configurations...
50
- [Your existing SUMMARIZER_MODELS, CACHE_SIZE, RSS_FETCH_INTERVAL, ARTICLE_LIMIT, CATEGORIES, and NEWS_SOURCES definitions]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  def is_arabic_source(source_name):
53
  return any(arabic_indicator in source_name.lower() for arabic_indicator in ['arabic', 'alarabiya', 'aljazeera', 'alwatanvoice'])
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  def summarize_text(text, model_name, source):
56
  try:
57
  # Translate if it's an Arabic source
@@ -93,7 +206,6 @@ def summarize_articles(articles, model_name):
93
  def get_summary(tech_sources, business_sources, science_sources, world_sources,
94
  sports_sources, health_sources, selected_model):
95
  try:
96
- # Check if any sources are selected
97
  if not any([tech_sources, business_sources, science_sources,
98
  world_sources, sports_sources, health_sources]):
99
  return "Please select at least one news source."
@@ -118,21 +230,18 @@ with demo:
118
 
119
  with gr.Row():
120
  with gr.Column():
121
- # Technology sources
122
  tech_sources = gr.CheckboxGroup(
123
  choices=list(NEWS_SOURCES["Technology"].keys()),
124
  label="Technology Sources",
125
  value=[]
126
  )
127
 
128
- # Business sources
129
  business_sources = gr.CheckboxGroup(
130
  choices=list(NEWS_SOURCES["Business"].keys()),
131
  label="Business Sources",
132
  value=[]
133
  )
134
 
135
- # Science sources
136
  science_sources = gr.CheckboxGroup(
137
  choices=list(NEWS_SOURCES["Science"].keys()),
138
  label="Science Sources",
@@ -140,21 +249,18 @@ with demo:
140
  )
141
 
142
  with gr.Column():
143
- # World News sources
144
  world_sources = gr.CheckboxGroup(
145
  choices=list(NEWS_SOURCES["World News"].keys()),
146
  label="World News Sources",
147
  value=[]
148
  )
149
 
150
- # Sports sources
151
  sports_sources = gr.CheckboxGroup(
152
  choices=list(NEWS_SOURCES["Sports"].keys()),
153
  label="Sports Sources",
154
  value=[]
155
  )
156
 
157
- # Health sources
158
  health_sources = gr.CheckboxGroup(
159
  choices=list(NEWS_SOURCES["Health"].keys()),
160
  label="Health Sources",
@@ -171,7 +277,6 @@ with demo:
171
  summarize_button = gr.Button("Get News Summary")
172
  summary_output = gr.Textbox(label="News Summary", lines=20)
173
 
174
- # Connect the components to the summary function
175
  summarize_button.click(
176
  get_summary,
177
  inputs=[
 
7
  import hashlib
8
  import threading
9
  import logging
 
 
10
  from transformers import MarianMTModel, MarianTokenizer
11
 
12
  # Set up logging
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
15
 
16
# Global settings

# Display label -> Hugging Face model id for the summarizer choices.
SUMMARIZER_MODELS = {
    "Default (facebook/bart-large-cnn)": "facebook/bart-large-cnn",
    "Free Model (distilbart-cnn-6-6)": "sshleifer/distilbart-cnn-6-6",
}
# Maximum number of entries held by the module-level NewsCache.
CACHE_SIZE = 500
# Articles published longer ago than this are ignored when fetching feeds.
RSS_FETCH_INTERVAL = timedelta(hours=8)
# Maximum number of articles returned by a single fetch.
ARTICLE_LIMIT = 5
# Arabic -> English translation model id (presumably loaded by Translator).
TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-ar-en"

# Categories and news sources
CATEGORIES = ["Technology", "Business", "Science", "World News", "Sports", "Health"]

# category -> {source label -> RSS feed URL}. The source labels are shown in
# the UI checkbox groups and are also matched by is_arabic_source(), so
# Arabic-language feeds keep the word "arabic" in their label.
NEWS_SOURCES = {
    "Technology": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best",
        "alarabiya arabic": "https://www.alarabiya.net/feed/rss2/ar/technology.xml",
    },
    "Business": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best",
        "alwatanvoice arabic": "https://feeds.alwatanvoice.com/ar/business.xml",
    },
    "Science": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml",
    },
    "World News": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
        "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
        "CNN": "http://rss.cnn.com/rss/edition_world.rss",
        "reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best",
        "france24 arabic": "https://www.france24.com/ar/rss",
        # Label spelling fixed ("aljazera" -> "aljazeera") to match the feed's domain.
        "aljazeera arabic": "https://www.aljazeera.net/aljazeerarss/a7c186be-1baa-4bd4-9d80-a84db769f779/73d0e1b4-532f-45ef-b135-bfdff8b8cab9",
    },
    "Sports": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best",
        "france24 arabic": "https://www.france24.com/ar/%D8%B1%D9%8A%D8%A7%D8%B6%D8%A9/rss",
    },
    "Health": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
        "politico": "http://rss.politico.com/healthcare.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best",
    },
}
61
+
62
  class Translator:
63
  def __init__(self):
64
  self.model = None
 
84
  logger.error(f"Translation error: {str(e)}")
85
  return text
86
 
87
+ # Initialize translator and cache
88
  translator = Translator()
89
 
90
class NewsCache:
    """Small lock-guarded key/value cache with a fixed capacity.

    Entries are evicted oldest-write-first, relying on the insertion order
    of the underlying dict. Every access to the dict happens while holding
    ``self.lock``.
    """

    def __init__(self, size):
        """Create an empty cache that holds at most ``size`` entries.

        A ``size`` of zero or less disables storage: ``set`` becomes a no-op.
        """
        self.cache = {}
        self.size = size
        self.lock = threading.Lock()

    def get(self, key):
        """Return the value stored under ``key``, or ``None`` if absent."""
        with self.lock:
            return self.cache.get(key)

    def set(self, key, value):
        """Store ``value`` under ``key``, evicting the oldest entries if full."""
        with self.lock:
            if self.size <= 0:
                # Nothing can be stored; previously this path raised
                # StopIteration when trying to evict from an empty dict.
                return
            # Drop any existing entry first so that (a) overwriting a key
            # never evicts an unrelated entry, and (b) the refreshed key
            # moves to the newest position in insertion order.
            self.cache.pop(key, None)
            while len(self.cache) >= self.size:
                del self.cache[next(iter(self.cache))]
            self.cache[key] = value
106
+
107
+ cache = NewsCache(CACHE_SIZE)
108
 
109
  def is_arabic_source(source_name):
110
  return any(arabic_indicator in source_name.lower() for arabic_indicator in ['arabic', 'alarabiya', 'aljazeera', 'alwatanvoice'])
111
 
112
def _entry_to_article(entry, category, source):
    """Build an article dict from one feed entry, with its date interpreted as UTC.

    Raises AttributeError/TypeError when the entry lacks a usable date,
    title, description or link; the caller treats that as "skip this entry".
    """
    # Some feeds only provide an updated date; fall back to it rather than
    # discarding the entry.
    parsed_time = getattr(entry, "published_parsed", None) or getattr(entry, "updated_parsed", None)
    published = datetime(*parsed_time[:6], tzinfo=pytz.UTC)
    return {
        "title": entry.title,
        "description": BeautifulSoup(entry.description, "html.parser").get_text(),
        "link": entry.link,
        "category": category,
        "source": source,
        "published": published,
    }


def fetch_rss_news(tech_sources, business_sources, science_sources, world_sources, sports_sources, health_sources):
    """Fetch recent articles from the selected RSS sources.

    Each argument is the collection of source labels selected for one
    category (a falsy value means nothing was selected). Labels that are not
    present in NEWS_SOURCES for that category are ignored.

    Returns:
        A list of at most ARTICLE_LIMIT article dicts (keys: title,
        description, link, category, source, published), newest first,
        restricted to entries newer than ``now - RSS_FETCH_INTERVAL``.

    Failures on an individual feed or entry are logged and skipped so that
    one bad feed does not prevent the others from being returned.
    """
    articles = []
    cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL

    category_sources = {
        "Technology": tech_sources or [],
        "Business": business_sources or [],
        "Science": science_sources or [],
        "World News": world_sources or [],
        "Sports": sports_sources or [],
        "Health": health_sources or [],
    }

    logger.info(f"Selected sources: {category_sources}")

    for category, sources in category_sources.items():
        if not sources:
            continue

        logger.info(f"Processing category: {category} with sources: {sources}")
        known_feeds = NEWS_SOURCES[category]

        for source in sources:
            url = known_feeds.get(source)
            if url is None:
                continue

            try:
                logger.info(f"Fetching from URL: {url}")
                feed = feedparser.parse(url)

                # feedparser reports 301/302 in `status` when it followed a
                # redirect and still parsed the feed, so only client/server
                # error codes are treated as a failed fetch.
                status = getattr(feed, "status", None)
                if status is not None and status >= 400:
                    logger.warning(f"Failed to fetch feed from {url}. Status: {feed.status}")
                    continue

                for entry in feed.entries:
                    try:
                        article = _entry_to_article(entry, category, source)
                    except (AttributeError, TypeError) as e:
                        logger.error(f"Error processing entry: {str(e)}")
                        continue
                    if article["published"] > cutoff_time:
                        articles.append(article)

            except Exception as e:
                # Best-effort boundary: a broken feed must not abort the run.
                logger.error(f"Error fetching feed from {url}: {str(e)}")
                continue

    logger.info(f"Total articles fetched: {len(articles)}")
    articles = sorted(articles, key=lambda x: x["published"], reverse=True)[:ARTICLE_LIMIT]
    return articles
167
+
168
  def summarize_text(text, model_name, source):
169
  try:
170
  # Translate if it's an Arabic source
 
206
  def get_summary(tech_sources, business_sources, science_sources, world_sources,
207
  sports_sources, health_sources, selected_model):
208
  try:
 
209
  if not any([tech_sources, business_sources, science_sources,
210
  world_sources, sports_sources, health_sources]):
211
  return "Please select at least one news source."
 
230
 
231
  with gr.Row():
232
  with gr.Column():
 
233
  tech_sources = gr.CheckboxGroup(
234
  choices=list(NEWS_SOURCES["Technology"].keys()),
235
  label="Technology Sources",
236
  value=[]
237
  )
238
 
 
239
  business_sources = gr.CheckboxGroup(
240
  choices=list(NEWS_SOURCES["Business"].keys()),
241
  label="Business Sources",
242
  value=[]
243
  )
244
 
 
245
  science_sources = gr.CheckboxGroup(
246
  choices=list(NEWS_SOURCES["Science"].keys()),
247
  label="Science Sources",
 
249
  )
250
 
251
  with gr.Column():
 
252
  world_sources = gr.CheckboxGroup(
253
  choices=list(NEWS_SOURCES["World News"].keys()),
254
  label="World News Sources",
255
  value=[]
256
  )
257
 
 
258
  sports_sources = gr.CheckboxGroup(
259
  choices=list(NEWS_SOURCES["Sports"].keys()),
260
  label="Sports Sources",
261
  value=[]
262
  )
263
 
 
264
  health_sources = gr.CheckboxGroup(
265
  choices=list(NEWS_SOURCES["Health"].keys()),
266
  label="Health Sources",
 
277
  summarize_button = gr.Button("Get News Summary")
278
  summary_output = gr.Textbox(label="News Summary", lines=20)
279
 
 
280
  summarize_button.click(
281
  get_summary,
282
  inputs=[