sohail-shaikh-s07 committed
Commit e31262c · verified · 1 Parent(s): 5b9028d

Update app.py

Files changed (1)
  1. app.py +78 -31
app.py CHANGED
@@ -6,6 +6,7 @@ import nltk
 import torch
 from urllib.parse import urlparse
 import time
+import re
 
 # Download required NLTK data
 try:
@@ -18,7 +19,7 @@ try:
 device = "cuda" if torch.cuda.is_available() else "cpu"
 summarizer = pipeline(
     "summarization",
-    model="facebook/bart-base-cnn",  # Using smaller base model instead of large
+    model="facebook/bart-base-cnn",
     device=device,
     model_kwargs={"cache_dir": "model_cache"}
 )
@@ -33,44 +34,88 @@ def is_valid_url(url):
     except:
         return False
 
+def clean_text(text):
+    # Remove extra whitespace
+    text = re.sub(r'\s+', ' ', text)
+    # Remove special characters
+    text = re.sub(r'[^\w\s.,!?-]', '', text)
+    return text.strip()
+
 def extract_article_text(url):
-    """Extract article text using BeautifulSoup with timeout"""
+    """Extract article text with support for various news sites"""
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'DNT': '1',
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
     }
 
     try:
         # Add a shorter timeout
-        response = requests.get(url, headers=headers, timeout=5)
+        response = requests.get(url, headers=headers, timeout=10)
         response.raise_for_status()
 
         soup = BeautifulSoup(response.text, 'html.parser')
 
         # Remove unwanted elements
-        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
+        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'meta', 'link']):
             tag.decompose()
 
-        # Find the main content - optimized search
-        main_content = (
-            soup.find('article') or
-            soup.find(attrs={"class": lambda x: x and any(c in x for c in ['article', 'post-content', 'entry-content', 'content'])})
-        )
+        # Try multiple methods to find the main content
+        article_text = ""
+
+        # Method 1: Look for article tag
+        article = soup.find('article')
+
+        # Method 2: Look for specific class names common in news sites
+        if not article:
+            article = soup.find(class_=lambda x: x and any(c in str(x).lower() for c in [
+                'article', 'story', 'content', 'body', 'text', 'main', 'entry'
+            ]))
 
-        if main_content:
-            # Only get paragraphs from main content
-            paragraphs = main_content.find_all('p', recursive=False)
+        # Method 3: Look for specific div patterns
+        if not article:
+            article = soup.find('div', {'id': re.compile('article|content|story|main', re.I)})
+
+        # Method 4: The Hindu specific
+        if 'thehindu.com' in url:
+            article = soup.find('div', {'id': 'content-body'}) or soup.find(class_='article')
+
+        if article:
+            # Get text from paragraphs
+            paragraphs = article.find_all(['p', 'div'], class_=lambda x: x and not any(c in str(x).lower() for c in [
+                'caption', 'footer', 'social', 'meta', 'share', 'related', 'ad', 'copyright'
+            ]))
         else:
-            # Limit number of paragraphs if no main content found
-            paragraphs = soup.find_all('p', limit=20)
+            # Fallback: get all paragraphs
+            paragraphs = soup.find_all('p', recursive=True)
+
+        # Extract and clean text
+        texts = []
+        for p in paragraphs:
+            text = p.get_text().strip()
+            if len(text) > 40 and not any(x in text.lower() for x in ['advertisement', 'subscribe', 'subscription']):
+                texts.append(clean_text(text))
 
-        # Extract text from paragraphs with minimum length requirement
-        article_text = ' '.join([p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 40])
+        article_text = ' '.join(texts)
 
-        # Limit total text length
-        return article_text[:5000]
+        # If still no text, try getting all text from body
+        if not article_text:
+            body = soup.find('body')
+            if body:
+                article_text = clean_text(body.get_text())
+
+        # Limit total text length but ensure it's not too short
+        if len(article_text) < 100:
+            raise Exception("Could not find enough article content")
+
+        return article_text[:8000]  # Limit to 8000 characters
 
     except Exception as e:
-        raise Exception(f"Error fetching article: {str(e)}")
+        print(f"Error in extract_article_text: {str(e)}")
+        raise Exception(f"Error extracting article: {str(e)}")
 
 def extract_and_summarize(url, progress=gr.Progress()):
     if not url or not url.strip():
@@ -91,22 +136,19 @@ def extract_and_summarize(url, progress=gr.Progress()):
 
         progress(0.4, desc="Processing text...")
         # Split text into smaller chunks
-        max_chunk_length = 512  # Reduced chunk size
+        max_chunk_length = 512
         chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
-        
-        # Limit number of chunks
         chunks = chunks[:3]  # Process at most 3 chunks
 
         progress(0.6, desc="Generating summary...")
-        # Summarize each chunk with shorter output
         summaries = []
        for chunk in chunks:
-            if len(chunk.strip()) > 50:  # Reduced minimum length requirement
+            if len(chunk.strip()) > 50:
                 try:
                     summary = summarizer(
                         chunk,
-                        max_length=100,  # Reduced max length
-                        min_length=20,  # Reduced min length
+                        max_length=100,
+                        min_length=20,
                         do_sample=False
                     )
                     summaries.append(summary[0]['summary_text'])
@@ -117,11 +159,9 @@ def extract_and_summarize(url, progress=gr.Progress()):
         if not summaries:
             return "Could not generate summary. Please try a different article."
 
-        # Combine summaries
         final_summary = " ".join(summaries)
-        
-        # Add processing time information
         processing_time = round(time.time() - start_time, 2)
+
         return f"Summary (processed in {processing_time}s):\n\n{final_summary}"
 
     except Exception as e:
@@ -140,10 +180,17 @@ demo = gr.Interface(
     description="""
     This app quickly summarizes news articles using AI.
     Simply paste a URL and get a concise summary in seconds!
+
+    Supported news sites include:
+    - BBC News
+    - Reuters
+    - The Hindu
+    - And many more!
     """,
     examples=[
         ["https://www.bbc.com/news/world-us-canada-67841980"],
-        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
+        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"],
+        ["https://www.thehindu.com/news/cities/mumbai/mumbai-boat-accident-body-of-missing-boy-found-off-mumbai-coast-toll-rises-to-15/article69012138.ece"]
     ],
     theme=gr.themes.Soft()
 )
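
The new clean_text() helper is easy to sanity-check on its own. Below is a minimal sketch that copies the two regex passes from this commit and applies them to a made-up string; the sample text, and running the helper standalone instead of importing app.py, are illustrative only:

import re

def clean_text(text):
    # Same normalization as the committed helper: collapse runs of whitespace,
    # then drop characters outside word characters and basic punctuation
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    return text.strip()

sample = "Breaking:\n\n  Markets  rallied* today, officials said (again)!"
print(clean_text(sample))
# Breaking Markets rallied today, officials said again!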
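
Once the Space is running, the same endpoint the UI uses can also be called programmatically with gradio_client. A sketch, assuming a hypothetical Space id (the real id is not part of this commit) and the default /predict endpoint that gr.Interface exposes:

from gradio_client import Client

# "sohail-shaikh-s07/news-summarizer" is a placeholder Space id for illustration
client = Client("sohail-shaikh-s07/news-summarizer")

summary = client.predict(
    "https://www.bbc.com/news/world-us-canada-67841980",  # example URL from the demo
    api_name="/predict",
)
print(summary)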