sohail-shaikh-s07's picture
Update app.py
5b9028d verified
raw
history blame
5.35 kB
import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import nltk
import torch
from urllib.parse import urlparse
import time
def _ensure_nltk_data():
    """Best-effort download of the NLTK 'punkt' tokenizer data.

    A failure is reported to stdout but must never prevent the app
    from starting.
    """
    try:
        nltk.download('punkt', quiet=True)
    except Exception as err:
        print(f"Error downloading NLTK data: {err}")


_ensure_nltk_data()
# Initialize the summarization pipeline with a distilled, faster model.
# BUG FIX: the previous model id "facebook/bart-base-cnn" does not exist on
# the Hugging Face Hub (only facebook/bart-large-cnn and its distillations
# do), so loading always raised and left `summarizer` as None.  The distilled
# CNN/DailyMail checkpoint below matches the original "smaller/faster" intent.
try:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    summarizer = pipeline(
        "summarization",
        model="sshleifer/distilbart-cnn-12-6",  # distilled BART, CNN/DailyMail
        device=device,
        model_kwargs={"cache_dir": "model_cache"},  # reuse downloads across restarts
    )
except Exception as e:
    # Keep the module importable even when the model cannot load;
    # extract_and_summarize is responsible for handling summarizer is None.
    print(f"Error loading model: {e}")
    summarizer = None
def is_valid_url(url):
    """Return True if *url* parses with both a scheme and a network location.

    Only syntactic validity is checked; no network request is made.

    Args:
        url: candidate URL string.

    Returns:
        bool: True for e.g. "https://example.com", False otherwise.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises ValueError on malformed input (e.g. bad IPv6
        # brackets); the former bare `except:` also swallowed things like
        # KeyboardInterrupt, which we must not do.
        return False
    return bool(parsed.scheme) and bool(parsed.netloc)
def extract_article_text(url):
    """Fetch *url* and return up to 5000 characters of article body text.

    Strips script/style/navigation chrome, prefers an <article> element or a
    common content-container class, then joins the text of paragraphs longer
    than 40 characters.

    Args:
        url: article URL (assumed already validated by is_valid_url).

    Returns:
        str: extracted text, truncated to 5000 characters (may be empty).

    Raises:
        Exception: wrapping any network or parsing failure, so the caller's
            broad handler can surface a single user-facing message.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Short timeout so a slow site cannot hang the UI.
        response = requests.get(url, headers=headers, timeout=5)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove non-content elements before extracting text.
        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            tag.decompose()

        # Prefer the semantic <article> tag, then common content class names.
        main_content = (
            soup.find('article') or
            soup.find(attrs={"class": lambda x: x and any(
                c in x for c in ['article', 'post-content', 'entry-content', 'content'])})
        )

        if main_content:
            # BUG FIX: recursive=False only matched <p> tags that were
            # *direct* children of the container; real articles nest their
            # paragraphs inside <div>s, so most pages yielded no text at all.
            # Search the whole subtree instead.
            paragraphs = main_content.find_all('p')
        else:
            # No recognizable container: sample a bounded number of <p> tags.
            paragraphs = soup.find_all('p', limit=20)

        # Keep only substantive paragraphs (>40 chars after stripping).
        texts = (p.get_text().strip() for p in paragraphs)
        article_text = ' '.join(t for t in texts if len(t) > 40)

        # Cap total length so downstream chunking stays bounded.
        return article_text[:5000]
    except Exception as e:
        raise Exception(f"Error fetching article: {str(e)}")
def extract_and_summarize(url, progress=gr.Progress()):
    """Gradio handler: fetch a news article URL and return an AI summary.

    Always returns a user-facing string (summary or error message) and never
    raises, so the UI textbox always receives displayable text.

    Args:
        url: article URL typed by the user.
        progress: Gradio progress tracker (injected by the framework).

    Returns:
        str: the summary prefixed with timing info, or an error message.
    """
    if not url or not url.strip():
        return "Please enter a valid URL"

    if not is_valid_url(url):
        return "Please enter a valid URL starting with http:// or https://"

    # BUG FIX: the model-init block deliberately sets summarizer to None on
    # load failure, but this function never checked for that; calling None
    # raised a TypeError that the broad except below turned into a cryptic
    # "Error processing article" message.
    if summarizer is None:
        return "Error processing article: summarization model failed to load"

    try:
        start_time = time.time()

        progress(0.2, desc="Fetching article...")
        text = extract_article_text(url)

        if not text:
            return "Could not extract text from the article. Please make sure it's a valid news article."

        progress(0.4, desc="Processing text...")
        # Split into fixed-size chunks the model can handle; cap at 3 chunks
        # to bound latency.
        max_chunk_length = 512
        chunks = [text[i:i + max_chunk_length]
                  for i in range(0, len(text), max_chunk_length)][:3]

        progress(0.6, desc="Generating summary...")
        summaries = []
        for chunk in chunks:
            # Skip fragments too short to produce a meaningful summary.
            if len(chunk.strip()) > 50:
                try:
                    summary = summarizer(
                        chunk,
                        max_length=100,
                        min_length=20,
                        do_sample=False
                    )
                    summaries.append(summary[0]['summary_text'])
                except Exception as e:
                    # Best effort: a failed chunk is skipped, not fatal.
                    print(f"Error summarizing chunk: {e}")
                    continue

        if not summaries:
            return "Could not generate summary. Please try a different article."

        final_summary = " ".join(summaries)
        processing_time = round(time.time() - start_time, 2)
        return f"Summary (processed in {processing_time}s):\n\n{final_summary}"

    except Exception as e:
        return f"Error processing article: {str(e)}"
# Create the Gradio UI: a single URL textbox in, a summary textbox out.
demo = gr.Interface(
    fn=extract_and_summarize,
    inputs=gr.Textbox(
        label="Enter News Article URL",
        placeholder="https://...",
        info="Enter a news article URL to get a quick summary"
    ),
    outputs=gr.Textbox(label="Summary", lines=5),
    title="📰 Fast News Article Summarizer",
    description="""
    This app quickly summarizes news articles using AI.
    Simply paste a URL and get a concise summary in seconds!
    """,
    # Example URLs displayed beneath the input box; clicking one fills the
    # textbox. NOTE(review): these are live external links — they may rot.
    examples=[
        ["https://www.bbc.com/news/world-us-canada-67841980"],
        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
    ],
    theme=gr.themes.Soft()
)

# Launch the web server only when run as a script, not when imported
# (e.g. by the Hugging Face Spaces runtime, which calls launch itself).
if __name__ == "__main__":
    demo.launch()