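"""News article summarizer.

Fetches an article from a URL with requests and BeautifulSoup, summarizes it
with a Hugging Face summarization pipeline (facebook/bart-large-cnn), and
serves the result through a Gradio interface.
"""
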
import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import nltk
import torch
from urllib.parse import urlparse


# Download NLTK tokenizer data (nltk is imported above but not used directly in this script)
try:
    nltk.download('punkt')
except Exception as e:
    print(f"Error downloading NLTK data: {e}")


try:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # BART-Large-CNN is used here; swap in any other Hugging Face summarization model if preferred
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
    # summarizer = pipeline("summarization", model="openai-community/gpt2", device=device)
except Exception as e:
    print(f"Error loading model: {e}")
    summarizer = None

def is_valid_url(url):
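    """Return True if the URL has both a scheme and a network location."""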
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

def extract_article_text(url):
    """Extract article text using BeautifulSoup instead of newspaper3k"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Remove boilerplate elements so navigation and script text do not pollute the summary
        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            tag.decompose()

        # Look for a dedicated article container first
        main_content = soup.find('article') or soup.find(class_=['article', 'post-content', 'entry-content', 'content'])
        
        if main_content:
            paragraphs = main_content.find_all('p')
        else:
            # Fallback to all paragraphs if no article container found
            paragraphs = soup.find_all('p')
        
        # Keep only substantive paragraphs (over 50 characters) and join their text
        article_text = ' '.join([p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 50])
        
        return article_text
    
    except Exception as e:
        raise RuntimeError(f"Error fetching article: {e}") from e

def extract_and_summarize(url):
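    """Fetch the article at `url`, chunk it, and return a combined summary."""
    # Fail fast if the summarization model could not be loaded at startup
    if summarizer is None:
        return "Summarization model is not available. Please check the server logs."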
    if not url or not url.strip():
        return "Please enter a valid URL"
    
    if not is_valid_url(url):
        return "Please enter a valid URL starting with http:// or https://"
    
    try:
        # Extract article text
        text = extract_article_text(url)
        
        if not text:
            return "Could not extract text from the article. Please make sure it's a valid news article."
            
        # Split long text into chunks of roughly 1024 characters (not tokens) so each piece fits the model's input limit
        max_chunk_length = 1024
        chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
        
        # Summarize each chunk individually
        summaries = []
        for chunk in chunks:
            if len(chunk.strip()) > 100:  # Only summarize substantial chunks
                try:
                    summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
                    summaries.append(summary[0]['summary_text'])
                except Exception as e:
                    print(f"Error summarizing chunk: {e}")
                    continue
        
        if not summaries:
            return "Could not generate summary. Please try a different article."
            
        # Combine the chunk summaries into one final summary
        final_summary = " ".join(summaries)
        
        return final_summary
        
    except Exception as e:
        return f"Error processing article: {str(e)}"

# Create Gradio interface
demo = gr.Interface(
    fn=extract_and_summarize,
    inputs=gr.Textbox(
        label="Enter News Article URL",
        placeholder="https://...",
        info="Enter a news article URL to get a summary"
    ),
    outputs=gr.Textbox(label="Summary", lines=5),
    title="📰 News Article Summarizer",
    description="""This app creates concise summaries of news articles using AI.
Simply paste a URL of a news article and get a summary!

⏳ Processing Time: The summarization process typically takes 30-60 seconds, depending on article length.

📊 Status Indicator: Look for "Processing" in the output box - this indicates the model is actively generating your summary.

✨ Quality Assurance: Please wait for the process to complete for the best results.""",
    examples=[
        ["https://www.bbc.com/sport/football/articles/cvgxmzy86e4o"],
        ["https://globalsouthworld.com/article/biden-approves-571-million-in-defense-support-for-taiwan"]
    ],
    theme=gr.themes.Soft()
)

if __name__ == "__main__":
    demo.launch()
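    # Alternatively, demo.launch(share=True) provides a temporary public link (assumes a standard Gradio setup)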