Spaces:

sohail-shaikh-s07
/

News-Article-Summarization

Sleeping

App Files Files

sohail-shaikh-s07 committed 24 days ago

Commit

ca0d432

verified ·

1 Parent(s): 771797e

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -11

app.py CHANGED Viewed

@@ -1,8 +1,10 @@
 import gradio as gr
-from newspaper import Article
 from transformers import pipeline
 import nltk
 import torch
 # Download required NLTK data
 try:
@@ -18,20 +20,62 @@ except Exception as e:
     print(f"Error loading model: {e}")
     summarizer = None
 def extract_and_summarize(url):
     if not url or not url.strip():
         return "Please enter a valid URL"
     try:
-        # Download and parse article
-        article = Article(url)
-        article.download()
-        article.parse()
-        # Get the text content
-        text = article.text
         if not text:
-            return "Could not extract text from the article"
         # Split text into chunks if it's too long
         max_chunk_length = 1024
@@ -62,10 +106,17 @@ def extract_and_summarize(url):
 # Create Gradio interface
 demo = gr.Interface(
     fn=extract_and_summarize,
-    inputs=gr.Textbox(label="Enter News Article URL", placeholder="https://..."),
-    outputs=gr.Textbox(label="Summary"),
     title="📰 News Article Summarizer",
-    description="Enter a news article URL to get a concise summary. The summary will focus on the main points of the article.",
     examples=[
         ["https://www.bbc.com/news/world-us-canada-67841980"],
         ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]

 import gradio as gr
+import requests
+from bs4 import BeautifulSoup
 from transformers import pipeline
 import nltk
 import torch
+from urllib.parse import urlparse
 # Download required NLTK data
 try:
     print(f"Error loading model: {e}")
     summarizer = None
def is_valid_url(url):
    """Return True if *url* is an absolute ``http://`` or ``https://`` URL.

    Args:
        url: Candidate URL. Anything that is not a ``str`` is rejected.

    Returns:
        ``True`` when the URL parses, its scheme is ``http`` or ``https``
        and it has a non-empty network location; ``False`` otherwise.
    """
    if not isinstance(url, str):
        return False
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises ValueError on some malformed inputs, e.g. an
        # unbalanced IPv6 bracket in the host part.
        return False
    # urlparse lower-cases the scheme, so "HTTP://..." is accepted too.
    # Only http/https are allowed: that is what the caller's error message
    # promises to the user.
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)
def extract_article_text(url):
    """Fetch *url* and return its paragraph text as a single string.

    The page is downloaded with a browser-like User-Agent and parsed with
    BeautifulSoup's ``html.parser`` (used instead of newspaper3k).
    ``script``, ``style``, ``nav``, ``header``, ``footer`` and ``aside``
    elements are removed first. ``<p>`` elements are then taken from the
    first ``<article>`` element or, failing that, from the first element
    whose class is one of ``article``, ``post-content``, ``entry-content``
    or ``content``. If no such container exists, every ``<p>`` in the
    document is used.

    Args:
        url: Address of the page to download.

    Returns:
        The stripped text of every kept paragraph, joined by single spaces.
        Paragraphs whose stripped text is 50 characters or shorter are
        dropped. Returns an empty string when no paragraph qualifies.

    Raises:
        Exception: On any failure while downloading or parsing. The message
            is ``"Error fetching article: <original message>"`` and the
            original exception is chained as ``__cause__``.
    """
    # NOTE(review): the URL comes straight from user input and is fetched
    # server-side; consider restricting private/internal hosts if this app
    # is deployed somewhere that matters.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove elements that never contain article body text.
        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            tag.decompose()

        # Look for common article containers; fall back to the whole
        # document when none is found.
        main_content = soup.find('article') or soup.find(
            class_=['article', 'post-content', 'entry-content', 'content']
        )
        container = main_content if main_content else soup
        paragraphs = container.find_all('p')

        # Strip each paragraph once, then keep only the longer ones
        # (presumably to skip captions, bylines and similar short snippets).
        stripped = (p.get_text().strip() for p in paragraphs)
        return ' '.join(text for text in stripped if len(text) > 50)
    except Exception as e:
        # Keep the generic Exception type and message callers rely on, but
        # chain the cause so the real traceback is not lost.
        raise Exception(f"Error fetching article: {str(e)}") from e
 def extract_and_summarize(url):
     if not url or not url.strip():
         return "Please enter a valid URL"
+    if not is_valid_url(url):
+        return "Please enter a valid URL starting with http:// or https://"
     try:
+        # Extract article text
+        text = extract_article_text(url)
         if not text:
+            return "Could not extract text from the article. Please make sure it's a valid news article."
         # Split text into chunks if it's too long
         max_chunk_length = 1024
 # Create Gradio interface
 demo = gr.Interface(
     fn=extract_and_summarize,
+    inputs=gr.Textbox(
+        label="Enter News Article URL",
+        placeholder="https://...",
+        info="Enter a news article URL to get a summary"
+    ),
+    outputs=gr.Textbox(label="Summary", lines=5),
     title="📰 News Article Summarizer",
+    description="""
+    This app creates concise summaries of news articles using AI.
+    Simply paste a URL of a news article and get a summary!
+    """,
     examples=[
         ["https://www.bbc.com/news/world-us-canada-67841980"],
         ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]