Update app.py
app.py
CHANGED
@@ -5,25 +5,17 @@ from transformers import pipeline
 import nltk
 import torch
 from urllib.parse import urlparse
-import time
-import re
-import json

 # Download required NLTK data
 try:
-    nltk.download('punkt'
+    nltk.download('punkt')
 except Exception as e:
     print(f"Error downloading NLTK data: {e}")

-# Initialize the summarization pipeline
+# Initialize the summarization pipeline
 try:
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    summarizer = pipeline(
-        "summarization",
-        model="facebook/bart-base-cnn",
-        device=device,
-        model_kwargs={"cache_dir": "model_cache"}
-    )
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
 except Exception as e:
     print(f"Error loading model: {e}")
     summarizer = None
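
The initialization block above collapses the multi-line pipeline() call, drops the model_kwargs cache directory, and swaps the checkpoint from facebook/bart-base-cnn to facebook/bart-large-cnn. A minimal standalone sketch of the new call, not part of the commit, assuming transformers and torch are installed and that the installed transformers version accepts a device string:

# Standalone sketch (not from app.py); the sample text is a made-up stand-in.
import torch
from transformers import pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

sample = "The quick brown fox jumps over the lazy dog. " * 20  # stand-in article text
print(summarizer(sample, max_length=60, min_length=15, do_sample=False)[0]["summary_text"])
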
@@ -35,163 +27,43 @@ def is_valid_url(url):
     except:
         return False

-def clean_text(text):
-    # Remove extra whitespace and special characters
-    text = re.sub(r'\s+', ' ', text)
-    text = re.sub(r'[^\w\s.,!?-]', '', text)
-    # Remove common unwanted phrases
-    text = re.sub(r'advertisement|subscribe now|subscription required|please sign in', '', text, flags=re.IGNORECASE)
-    return text.strip()
-
-def get_hindu_article(url):
-    """Special handler for The Hindu website"""
-    try:
-        # First request to get cookies and tokens
-        session = requests.Session()
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5',
-            'Referer': 'https://www.thehindu.com/',
-            'DNT': '1',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1',
-            'Sec-Fetch-Dest': 'document',
-            'Sec-Fetch-Mode': 'navigate',
-            'Sec-Fetch-Site': 'same-origin',
-            'Sec-Fetch-User': '?1',
-            'Cache-Control': 'max-age=0'
-        }
-
-        # Get the article ID from the URL
-        article_id = re.search(r'article(\d+)', url)
-        if article_id:
-            article_id = article_id.group(1)
-            api_url = f"https://www.thehindu.com/api/article/{article_id}/"
-            response = session.get(api_url, headers=headers)
-            if response.status_code == 200:
-                try:
-                    data = response.json()
-                    if 'body' in data:
-                        # Parse the HTML content from the API response
-                        soup = BeautifulSoup(data['body'], 'html.parser')
-                        text = ' '.join(p.get_text().strip() for p in soup.find_all('p'))
-                        if text:
-                            return text
-                except:
-                    pass
-
-        # Fallback to regular page scraping
-        response = session.get(url, headers=headers)
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        # Try multiple selectors specific to The Hindu
-        selectors = [
-            'div.article-text',
-            'div#content-body',
-            'div.article',
-            'div[itemprop="articleBody"]',
-            'div.paywall'
-        ]
-
-        article_text = ""
-        for selector in selectors:
-            content = soup.select_one(selector)
-            if content:
-                paragraphs = content.find_all(['p', 'div'], class_=lambda x: x and not any(c in str(x).lower() for c in [
-                    'caption', 'footer', 'social', 'meta', 'share', 'related', 'ad', 'copyright'
-                ]))
-                texts = [p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 40]
-                if texts:
-                    article_text = ' '.join(texts)
-                    break
-
-        if article_text:
-            return article_text
-
-        # Last resort: try to find any substantial paragraphs
-        all_paragraphs = soup.find_all('p')
-        texts = [p.get_text().strip() for p in all_paragraphs if len(p.get_text().strip()) > 40]
-        return ' '.join(texts) if texts else None
-
-    except Exception as e:
-        print(f"Error in get_hindu_article: {str(e)}")
-        return None
-
 def extract_article_text(url):
-    """Extract article text
+    """Extract article text using BeautifulSoup instead of newspaper3k"""
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'DNT': '1',
-        'Connection': 'keep-alive',
-        'Upgrade-Insecure-Requests': '1',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
     }

     try:
-        # Special handling for The Hindu
-        if 'thehindu.com' in url:
-            article_text = get_hindu_article(url)
-            if article_text:
-                return clean_text(article_text)[:8000]
-
-        # Regular handling for other sites
         response = requests.get(url, headers=headers, timeout=10)
         response.raise_for_status()

         soup = BeautifulSoup(response.text, 'html.parser')

         # Remove unwanted elements
-        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside'
+        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
             tag.decompose()

-        #
+        # Find the main content
         article_text = ""

-        #
-
-
-        # Method 2: Look for specific class names
-        if not article:
-            article = soup.find(class_=lambda x: x and any(c in str(x).lower() for c in [
-                'article', 'story', 'content', 'body', 'text', 'main', 'entry'
-            ]))
-
-        # Method 3: Look for specific div patterns
-        if not article:
-            article = soup.find('div', {'id': re.compile('article|content|story|main', re.I)})
+        # Look for common article containers
+        main_content = soup.find('article') or soup.find(class_=['article', 'post-content', 'entry-content', 'content'])

-        if
-            paragraphs =
-                'caption', 'footer', 'social', 'meta', 'share', 'related', 'ad', 'copyright'
-            ]))
+        if main_content:
+            paragraphs = main_content.find_all('p')
         else:
-            paragraphs
+            # Fallback to all paragraphs if no article container found
+            paragraphs = soup.find_all('p')

-
-        for p in paragraphs
-            text = p.get_text().strip()
-            if len(text) > 40 and not any(x in text.lower() for x in ['advertisement', 'subscribe', 'subscription']):
-                texts.append(clean_text(text))
+        # Extract text from paragraphs
+        article_text = ' '.join([p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 50])

-        article_text
-
-        if not article_text:
-            body = soup.find('body')
-            if body:
-                article_text = clean_text(body.get_text())
-
-        if len(article_text) < 100:
-            raise Exception("Could not find enough article content")
-
-        return article_text[:8000]
+        return article_text

     except Exception as e:
-
-        raise Exception(f"Error extracting article: {str(e)}")
+        raise Exception(f"Error fetching article: {str(e)}")

-def extract_and_summarize(url
+def extract_and_summarize(url):
     if not url or not url.strip():
         return "Please enter a valid URL"

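
The rewritten extract_article_text drops the clean_text helper and the site-specific path for The Hindu in favor of a generic strategy: strip script/style/nav/header/footer/aside tags, prefer an article element or a common content class, fall back to all p tags, and keep only paragraphs longer than 50 characters. A small offline sketch of that selection logic, not part of the commit; the HTML snippet is invented for illustration and assumes only beautifulsoup4 is installed:

# Sketch of the new selection logic against a static, made-up HTML string.
from bs4 import BeautifulSoup

html = """
<html><body>
  <nav>Menu</nav>
  <article>
    <p>This is the first substantial paragraph of the made-up article, long enough to pass the filter.</p>
    <p>Short.</p>
  </article>
</body></html>
"""
soup = BeautifulSoup(html, "html.parser")
for tag in soup(["script", "style", "nav", "header", "footer", "aside"]):
    tag.decompose()

main_content = soup.find("article") or soup.find(class_=["article", "post-content", "entry-content", "content"])
paragraphs = main_content.find_all("p") if main_content else soup.find_all("p")
text = " ".join(p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 50)
print(text)  # only the long paragraph survives the > 50 character filter
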
@@ -199,30 +71,22 @@ def extract_and_summarize(url, progress=gr.Progress()):
         return "Please enter a valid URL starting with http:// or https://"

     try:
-
-
-        progress(0.2, desc="Fetching article...")
+        # Extract article text
         text = extract_article_text(url)

         if not text:
             return "Could not extract text from the article. Please make sure it's a valid news article."
-
-
-        max_chunk_length =
+
+        # Split text into chunks if it's too long
+        max_chunk_length = 1024
         chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
-        chunks = chunks[:3]

-
+        # Summarize each chunk
         summaries = []
         for chunk in chunks:
-            if len(chunk.strip()) >
+            if len(chunk.strip()) > 100:  # Only summarize substantial chunks
                 try:
-                    summary = summarizer(
-                        chunk,
-                        max_length=100,
-                        min_length=20,
-                        do_sample=False
-                    )
+                    summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
                     summaries.append(summary[0]['summary_text'])
                 except Exception as e:
                     print(f"Error summarizing chunk: {e}")
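
The old three-chunk cap (chunks = chunks[:3]) is gone, so a long article now produces as many 1024-character slices as needed, and only slices longer than 100 characters are sent to the summarizer. A quick sketch of the slicing, not part of the commit; the text is a stand-in:

# Sketch of the fixed-width character chunking used before summarization (values mirror the new code).
text = "word " * 600  # stand-in for ~3000 characters of extracted article text

max_chunk_length = 1024
chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
print([len(c) for c in chunks])  # [1024, 1024, 952]; slices can cut words and sentences mid-way
substantial = [c for c in chunks if len(c.strip()) > 100]  # same filter as the summarization loop
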
@@ -230,11 +94,11 @@ def extract_and_summarize(url, progress=gr.Progress()):

         if not summaries:
             return "Could not generate summary. Please try a different article."
-
+
+        # Combine all summaries
         final_summary = " ".join(summaries)
-        processing_time = round(time.time() - start_time, 2)

-        return
+        return final_summary

     except Exception as e:
         return f"Error processing article: {str(e)}"
@@ -245,24 +109,17 @@ demo = gr.Interface(
     inputs=gr.Textbox(
         label="Enter News Article URL",
         placeholder="https://...",
-        info="Enter a news article URL to get a
+        info="Enter a news article URL to get a summary"
     ),
     outputs=gr.Textbox(label="Summary", lines=5),
-    title="📰
+    title="📰 News Article Summarizer",
     description="""
-    This app
-    Simply paste a URL and get a
-
-    Supported news sites include:
-    - BBC News
-    - Reuters
-    - The Hindu
-    - And many more!
+    This app creates concise summaries of news articles using AI.
+    Simply paste a URL of a news article and get a summary!
     """,
     examples=[
         ["https://www.bbc.com/news/world-us-canada-67841980"],
-        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
-        ["https://www.thehindu.com/news/cities/mumbai/mumbai-boat-accident-body-of-missing-boy-found-off-mumbai-coast-toll-rises-to-15/article69012138.ece"]
+        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
     ],
     theme=gr.themes.Soft()
 )
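
The interface keeps the BBC and Reuters examples, drops The Hindu example, and trims the description to two lines. To run the app locally, the file presumably ends with the standard Gradio launch call, which is not visible in this diff; a sketch under that assumption:

# Assumed tail of app.py (not shown in this diff): launch the Gradio interface.
if __name__ == "__main__":
    demo.launch()
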