sohail-shaikh-s07 committed
Commit 5b9028d · verified · 1 Parent(s): dd0f744

Update app.py

Files changed (1)
  app.py +55 -31
app.py CHANGED
@@ -5,17 +5,23 @@ from transformers import pipeline
 import nltk
 import torch
 from urllib.parse import urlparse
+import time
 
 # Download required NLTK data
 try:
-    nltk.download('punkt')
+    nltk.download('punkt', quiet=True)
 except Exception as e:
     print(f"Error downloading NLTK data: {e}")
 
-# Initialize the summarization pipeline
+# Initialize the summarization pipeline with a smaller, faster model
 try:
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
+    summarizer = pipeline(
+        "summarization",
+        model="facebook/bart-base-cnn",  # Using smaller base model instead of large
+        device=device,
+        model_kwargs={"cache_dir": "model_cache"}
+    )
 except Exception as e:
     print(f"Error loading model: {e}")
     summarizer = None
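
Note (sketch, not part of this commit): "facebook/bart-base-cnn" may not resolve to an official checkpoint on the Hugging Face Hub, in which case the except branch silently leaves summarizer as None. A minimal fallback chain, assuming the previously used facebook/bart-large-cnn remains available; load_summarizer is a hypothetical helper:

from transformers import pipeline
import torch

def load_summarizer(candidates=("facebook/bart-base-cnn", "facebook/bart-large-cnn")):
    # Try each candidate model ID in order; return the first pipeline that loads.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    for model_id in candidates:
        try:
            return pipeline("summarization", model=model_id, device=device)
        except Exception as e:
            print(f"Could not load {model_id}: {e}")
    return None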
@@ -28,13 +34,14 @@ def is_valid_url(url):
         return False
 
 def extract_article_text(url):
-    """Extract article text using BeautifulSoup instead of newspaper3k"""
+    """Extract article text using BeautifulSoup with timeout"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
 
    try:
-        response = requests.get(url, headers=headers, timeout=10)
+        # Add a shorter timeout
+        response = requests.get(url, headers=headers, timeout=5)
        response.raise_for_status()
 
        soup = BeautifulSoup(response.text, 'html.parser')
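
Note (sketch, not part of this commit): cutting the timeout from 10s to 5s turns transient slowness into a hard failure more often. A session with bounded retries is one way to compensate; make_session is a hypothetical helper:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(total_retries=2, backoff=0.5):
    # Retry GETs a couple of times on common gateway/server errors.
    session = requests.Session()
    retry = Retry(total=total_retries, backoff_factor=backoff,
                  status_forcelist=(500, 502, 503, 504))
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

# Usage: response = make_session().get(url, headers=headers, timeout=5)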
@@ -43,27 +50,29 @@ def extract_article_text(url):
        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            tag.decompose()
 
-        # Find the main content
-        article_text = ""
-
-        # Look for common article containers
-        main_content = soup.find('article') or soup.find(class_=['article', 'post-content', 'entry-content', 'content'])
+        # Find the main content - optimized search
+        main_content = (
+            soup.find('article') or
+            soup.find(attrs={"class": lambda x: x and any(c in x for c in ['article', 'post-content', 'entry-content', 'content'])})
+        )
 
        if main_content:
-            paragraphs = main_content.find_all('p')
+            # Only get paragraphs from main content
+            paragraphs = main_content.find_all('p', recursive=False)
        else:
-            # Fallback to all paragraphs if no article container found
-            paragraphs = soup.find_all('p')
+            # Limit number of paragraphs if no main content found
+            paragraphs = soup.find_all('p', limit=20)
 
-        # Extract text from paragraphs
-        article_text = ' '.join([p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 50])
+        # Extract text from paragraphs with minimum length requirement
+        article_text = ' '.join([p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 40])
 
-        return article_text
+        # Limit total text length
+        return article_text[:5000]
 
    except Exception as e:
        raise Exception(f"Error fetching article: {str(e)}")
 
-def extract_and_summarize(url):
+def extract_and_summarize(url, progress=gr.Progress()):
    if not url or not url.strip():
        return "Please enter a valid URL"

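Note on the hunk above: find_all('p', recursive=False) only matches <p> tags that are direct children of the container, so paragraphs wrapped in nested <div>s are skipped and article_text can come back empty. A small self-contained demo of the behavior:

from bs4 import BeautifulSoup

html = "<article><p>top-level</p><div><p>nested</p></div></article>"
soup = BeautifulSoup(html, "html.parser")
article = soup.find("article")

print([p.get_text() for p in article.find_all("p", recursive=False)])  # ['top-level']
print([p.get_text() for p in article.find_all("p")])                   # ['top-level', 'nested']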
@@ -71,22 +80,35 @@ def extract_and_summarize(url):
        return "Please enter a valid URL starting with http:// or https://"
 
    try:
-        # Extract article text
+        start_time = time.time()
+
+        # Extract article text with progress updates
+        progress(0.2, desc="Fetching article...")
        text = extract_article_text(url)
 
        if not text:
            return "Could not extract text from the article. Please make sure it's a valid news article."
-
-        # Split text into chunks if it's too long
-        max_chunk_length = 1024
+
+        progress(0.4, desc="Processing text...")
+        # Split text into smaller chunks
+        max_chunk_length = 512  # Reduced chunk size
        chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
 
-        # Summarize each chunk
+        # Limit number of chunks
+        chunks = chunks[:3]  # Process at most 3 chunks
+
+        progress(0.6, desc="Generating summary...")
+        # Summarize each chunk with shorter output
        summaries = []
        for chunk in chunks:
-            if len(chunk.strip()) > 100:  # Only summarize substantial chunks
+            if len(chunk.strip()) > 50:  # Reduced minimum length requirement
                try:
-                    summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
+                    summary = summarizer(
+                        chunk,
+                        max_length=100,  # Reduced max length
+                        min_length=20,   # Reduced min length
+                        do_sample=False
+                    )
                    summaries.append(summary[0]['summary_text'])
                except Exception as e:
                    print(f"Error summarizing chunk: {e}")
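
Note (sketch, not part of this commit): slicing text every 512 characters can split words and sentences mid-chunk. Since the app already downloads NLTK's punkt models, a sentence-aware splitter is a natural alternative; sentence_chunks is a hypothetical helper:

import nltk

def sentence_chunks(text, max_chars=512):
    # Greedily pack whole sentences into chunks of at most max_chars
    # (a single sentence longer than max_chars becomes its own chunk).
    chunks, current = [], ""
    for sentence in nltk.sent_tokenize(text):
        if current and len(current) + len(sentence) + 1 > max_chars:
            chunks.append(current)
            current = ""
        current = f"{current} {sentence}".strip()
    if current:
        chunks.append(current)
    return chunks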
@@ -94,11 +116,13 @@ def extract_and_summarize(url):
 
        if not summaries:
            return "Could not generate summary. Please try a different article."
-
-        # Combine all summaries
+
+        # Combine summaries
        final_summary = " ".join(summaries)
 
-        return final_summary
+        # Add processing time information
+        processing_time = round(time.time() - start_time, 2)
+        return f"Summary (processed in {processing_time}s):\n\n{final_summary}"
 
    except Exception as e:
        return f"Error processing article: {str(e)}"
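
Minor aside (not part of this commit): time.perf_counter() is the idiomatic clock for elapsed-time measurement, since time.time() can jump if the system clock is adjusted mid-run:

import time

start = time.perf_counter()
# ... fetch and summarize ...
elapsed = round(time.perf_counter() - start, 2)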
@@ -109,13 +133,13 @@ demo = gr.Interface(
    inputs=gr.Textbox(
        label="Enter News Article URL",
        placeholder="https://...",
-        info="Enter a news article URL to get a summary"
+        info="Enter a news article URL to get a quick summary"
    ),
    outputs=gr.Textbox(label="Summary", lines=5),
-    title="📰 News Article Summarizer",
+    title="📰 Fast News Article Summarizer",
    description="""
-    This app creates concise summaries of news articles using AI.
-    Simply paste a URL of a news article and get a summary!
+    This app quickly summarizes news articles using AI.
+    Simply paste a URL and get a concise summary in seconds!
    """,
    examples=[
        ["https://www.bbc.com/news/world-us-canada-67841980"],
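
Note (assumption, since the launch call sits outside this diff): gr.Progress() updates only render when Gradio's queue is enabled, so the new progress bar needs a launch along these lines:

if __name__ == "__main__":
    demo.queue().launch()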
 