Spaces:

loayshabet
/

news-sumarry

Sleeping

App Files Files Community

loayshabet committed on Dec 25, 2024

Commit

372c5e7

verified ·

1 Parent(s): 2cc6057

Update app.py

Browse files

Files changed (1) hide show

app.py +119 -14

app.py CHANGED Viewed

@@ -7,17 +7,58 @@ from bs4 import BeautifulSoup
 import hashlib
 import threading
 import logging
-# Add this to your imports
 from transformers import MarianMTModel, MarianTokenizer
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Add translation model configuration
 TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-ar-en"
 class Translator:
     def __init__(self):
         self.model = None
@@ -43,15 +84,87 @@ class Translator:
             logger.error(f"Translation error: {str(e)}")
             return text
-# Initialize translator
 translator = Translator()
-# Rest of your existing configurations...
-[Your existing SUMMARIZER_MODELS, CACHE_SIZE, RSS_FETCH_INTERVAL, ARTICLE_LIMIT, CATEGORIES, and NEWS_SOURCES definitions]
 def is_arabic_source(source_name):
     """Return True when source_name contains one of the Arabic-feed markers (case-insensitive)."""
     # Lowercase once, then look for any marker as a substring.
     lowered = source_name.lower()
     for marker in ('arabic', 'alarabiya', 'aljazeera', 'alwatanvoice'):
         if marker in lowered:
             return True
     return False
 def summarize_text(text, model_name, source):
     try:
         # Translate if it's an Arabic source
@@ -93,7 +206,6 @@ def summarize_articles(articles, model_name):
 def get_summary(tech_sources, business_sources, science_sources, world_sources,
                 sports_sources, health_sources, selected_model):
     try:
-        # Check if any sources are selected
         if not any([tech_sources, business_sources, science_sources,
                    world_sources, sports_sources, health_sources]):
             return "Please select at least one news source."
@@ -118,21 +230,18 @@ with demo:
     with gr.Row():
         with gr.Column():
-            # Technology sources
             tech_sources = gr.CheckboxGroup(
                 choices=list(NEWS_SOURCES["Technology"].keys()),
                 label="Technology Sources",
                 value=[]
             )
-            # Business sources
             business_sources = gr.CheckboxGroup(
                 choices=list(NEWS_SOURCES["Business"].keys()),
                 label="Business Sources",
                 value=[]
             )
-            # Science sources
             science_sources = gr.CheckboxGroup(
                 choices=list(NEWS_SOURCES["Science"].keys()),
                 label="Science Sources",
@@ -140,21 +249,18 @@ with demo:
             )
         with gr.Column():
-            # World News sources
             world_sources = gr.CheckboxGroup(
                 choices=list(NEWS_SOURCES["World News"].keys()),
                 label="World News Sources",
                 value=[]
             )
-            # Sports sources
             sports_sources = gr.CheckboxGroup(
                 choices=list(NEWS_SOURCES["Sports"].keys()),
                 label="Sports Sources",
                 value=[]
             )
-            # Health sources
             health_sources = gr.CheckboxGroup(
                 choices=list(NEWS_SOURCES["Health"].keys()),
                 label="Health Sources",
@@ -171,7 +277,6 @@ with demo:
     summarize_button = gr.Button("Get News Summary")
     summary_output = gr.Textbox(label="News Summary", lines=20)
-    # Connect the components to the summary function
     summarize_button.click(
         get_summary,
         inputs=[

 import hashlib
 import threading
 import logging
 from transformers import MarianMTModel, MarianTokenizer
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Global settings
+# Display label -> summarizer model identifier.
+SUMMARIZER_MODELS = {
+    "Default (facebook/bart-large-cnn)": "facebook/bart-large-cnn",
+    "Free Model (distilbart-cnn-6-6)": "sshleifer/distilbart-cnn-6-6"
+}
+# Capacity handed to the module-level NewsCache instance.
+CACHE_SIZE = 500
+# fetch_rss_news keeps only entries published within this window before "now".
+RSS_FETCH_INTERVAL = timedelta(hours=8)
+# Upper bound on the number of articles fetch_rss_news returns.
+ARTICLE_LIMIT = 5
+# Arabic-to-English translation model id; presumably loaded by Translator — confirm.
 TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-ar-en"
+# Categories and news sources
+CATEGORIES = ["Technology", "Business", "Science", "World News", "Sports", "Health"]
+# Category -> {source label -> RSS feed URL}. The labels are also the checkbox
+# choices in the UI and the "source" value stored on each fetched article.
+# NOTE(review): the label "aljazera arabic" does not contain "aljazeera"; it is
+# still recognised by is_arabic_source only because it contains "arabic".
+NEWS_SOURCES = {
+    "Technology": {
+        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
+        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best",
+        "alarabiya arabic": "https://www.alarabiya.net/feed/rss2/ar/technology.xml",
+    },
+    "Business": {
+        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
+        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best",
+        "alwatanvoice arabic": "https://feeds.alwatanvoice.com/ar/business.xml",
+    },
+    "Science": {
+        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml"
+    },
+    "World News": {
+        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
+        "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
+        "CNN": "http://rss.cnn.com/rss/edition_world.rss",
+        "reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best",
+        "france24 arabic": "https://www.france24.com/ar/rss",
+        "aljazera arabic": "https://www.aljazeera.net/aljazeerarss/a7c186be-1baa-4bd4-9d80-a84db769f779/73d0e1b4-532f-45ef-b135-bfdff8b8cab9",
+    },
+    "Sports": {
+        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
+        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best",
+        "france24 arabic": "https://www.france24.com/ar/%D8%B1%D9%8A%D8%A7%D8%B6%D8%A9/rss",
+    },
+    "Health": {
+        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
+        "politico": "http://rss.politico.com/healthcare.xml",
+        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best"
+    },
+}
 class Translator:
     def __init__(self):
         self.model = None
             logger.error(f"Translation error: {str(e)}")
             return text
+# Initialize translator and cache
 translator = Translator()
+class NewsCache:
+    """Lock-guarded key/value cache bounded to `size` entries, evicting the oldest-inserted first."""
+    def __init__(self, size):
+        # size: maximum number of entries kept; a value <= 0 means nothing is stored.
+        self.cache = {}
+        self.size = size
+        self.lock = threading.Lock()
+    def get(self, key):
+        """Return the value stored under key, or None when the key is absent."""
+        with self.lock:
+            return self.cache.get(key)
+    def set(self, key, value):
+        """Store value under key, evicting the oldest entry only when a new key would exceed size."""
+        with self.lock:
+            if key in self.cache:
+                # Overwriting does not grow the cache, so no other entry has to go.
+                self.cache[key] = value
+                return
+            if self.size <= 0:
+                # Zero capacity: there is nothing to evict and no room to store.
+                return
+            if len(self.cache) >= self.size:
+                # dicts iterate in insertion order, so the first key is the oldest.
+                oldest_key = next(iter(self.cache))
+                del self.cache[oldest_key]
+            self.cache[key] = value
+cache = NewsCache(CACHE_SIZE)
 def is_arabic_source(source_name):
     """Return True when source_name contains one of the Arabic-feed markers (case-insensitive)."""
     # Lowercase once, then look for any marker as a substring.
     lowered = source_name.lower()
     for marker in ('arabic', 'alarabiya', 'aljazeera', 'alwatanvoice'):
         if marker in lowered:
             return True
     return False
+def fetch_rss_news(tech_sources, business_sources, science_sources, world_sources, sports_sources, health_sources):
+    """Collect recent articles from the selected RSS sources, newest first.
+
+    Each argument is the list of source labels chosen for one category
+    (keys of NEWS_SOURCES[category]); None or an empty list skips that category.
+
+    Returns at most ARTICLE_LIMIT dicts with the keys "title", "description",
+    "link", "category", "source" and "published", restricted to entries
+    published within RSS_FETCH_INTERVAL of the current UTC time. Feeds that
+    fail to load and entries missing a required field are logged and skipped.
+    """
+    cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
+    category_sources = {
+        "Technology": tech_sources or [],
+        "Business": business_sources or [],
+        "Science": science_sources or [],
+        "World News": world_sources or [],
+        "Sports": sports_sources or [],
+        "Health": health_sources or []
+    }
+    logger.info(f"Selected sources: {category_sources}")
+    articles = []
+    for category, sources in category_sources.items():
+        if not sources:
+            continue
+        logger.info(f"Processing category: {category} with sources: {sources}")
+        for source in sources:
+            url = NEWS_SOURCES[category].get(source)
+            if url is None:
+                # Label is not configured for this category; nothing to fetch.
+                continue
+            try:
+                logger.info(f"Fetching from URL: {url}")
+                feed = feedparser.parse(url)
+                # feedparser follows redirects and reports 301/302 as the status of a
+                # feed it still parsed, so only 4xx/5xx responses count as failures.
+                status = getattr(feed, 'status', None)
+                if status is not None and status >= 400:
+                    logger.warning(f"Failed to fetch feed from {url}. Status: {feed.status}")
+                    continue
+                for entry in feed.entries:
+                    try:
+                        published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
+                        if published > cutoff_time:
+                            articles.append({
+                                "title": entry.title,
+                                "description": BeautifulSoup(entry.description, "html.parser").get_text(),
+                                "link": entry.link,
+                                "category": category,
+                                "source": source,
+                                "published": published
+                            })
+                    except (AttributeError, TypeError) as e:
+                        # Raised when the entry lacks a field or its date could not be parsed.
+                        logger.error(f"Error processing entry: {str(e)}")
+                        continue
+            except Exception as e:
+                # Best-effort: one broken feed must not abort the remaining sources.
+                logger.error(f"Error fetching feed from {url}: {str(e)}")
+                continue
+    logger.info(f"Total articles fetched: {len(articles)}")
+    articles.sort(key=lambda x: x["published"], reverse=True)
+    return articles[:ARTICLE_LIMIT]
 def summarize_text(text, model_name, source):
     try:
         # Translate if it's an Arabic source
 def get_summary(tech_sources, business_sources, science_sources, world_sources,
                 sports_sources, health_sources, selected_model):
     try:
         if not any([tech_sources, business_sources, science_sources,
                    world_sources, sports_sources, health_sources]):
             return "Please select at least one news source."
     with gr.Row():
         with gr.Column():
             tech_sources = gr.CheckboxGroup(
                 choices=list(NEWS_SOURCES["Technology"].keys()),
                 label="Technology Sources",
                 value=[]
             )
             business_sources = gr.CheckboxGroup(
                 choices=list(NEWS_SOURCES["Business"].keys()),
                 label="Business Sources",
                 value=[]
             )
             science_sources = gr.CheckboxGroup(
                 choices=list(NEWS_SOURCES["Science"].keys()),
                 label="Science Sources",
             )
         with gr.Column():
             world_sources = gr.CheckboxGroup(
                 choices=list(NEWS_SOURCES["World News"].keys()),
                 label="World News Sources",
                 value=[]
             )
             sports_sources = gr.CheckboxGroup(
                 choices=list(NEWS_SOURCES["Sports"].keys()),
                 label="Sports Sources",
                 value=[]
             )
             health_sources = gr.CheckboxGroup(
                 choices=list(NEWS_SOURCES["Health"].keys()),
                 label="Health Sources",
     summarize_button = gr.Button("Get News Summary")
     summary_output = gr.Textbox(label="News Summary", lines=20)
     summarize_button.click(
         get_summary,
         inputs=[