sohail-shaikh-s07's picture
Update app.py
5b9028d verified
raw
history blame
5.35 kB
import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import nltk
import torch
from urllib.parse import urlparse
import time
def _ensure_nltk_data():
    """Best-effort download of the NLTK 'punkt' tokenizer data.

    A failure is reported to stdout but must never prevent the app
    from starting.
    """
    try:
        nltk.download('punkt', quiet=True)
    except Exception as err:
        print(f"Error downloading NLTK data: {err}")


_ensure_nltk_data()
# Initialize the summarization pipeline with a distilled, faster model.
# BUG FIX: the previous model id "facebook/bart-base-cnn" does not exist on
# the Hugging Face Hub (only facebook/bart-large-cnn and its distillations
# do), so loading always raised and left `summarizer` as None.  The distilled
# CNN/DailyMail checkpoint below matches the original "smaller/faster" intent.
try:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    summarizer = pipeline(
        "summarization",
        model="sshleifer/distilbart-cnn-12-6",  # distilled BART, CNN/DailyMail
        device=device,
        model_kwargs={"cache_dir": "model_cache"},  # reuse downloads across restarts
    )
except Exception as e:
    # Keep the module importable even when the model cannot load;
    # extract_and_summarize is responsible for handling summarizer is None.
    print(f"Error loading model: {e}")
    summarizer = None
def is_valid_url(url):
    """Return True if *url* parses with both a scheme and a network location.

    Only syntactic validity is checked; no network request is made.

    Args:
        url: candidate URL string.

    Returns:
        bool: True for e.g. "https://example.com", False otherwise.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises ValueError on malformed input (e.g. bad IPv6
        # brackets); the former bare `except:` also swallowed things like
        # KeyboardInterrupt, which we must not do.
        return False
    return bool(parsed.scheme) and bool(parsed.netloc)
def extract_article_text(url):
    """Fetch *url* and return up to 5000 characters of article body text.

    Strips script/style/navigation chrome, prefers an <article> element or a
    common content-container class, then joins the text of paragraphs longer
    than 40 characters.

    Args:
        url: article URL (assumed already validated by is_valid_url).

    Returns:
        str: extracted text, truncated to 5000 characters (may be empty).

    Raises:
        Exception: wrapping any network or parsing failure, so the caller's
            broad handler can surface a single user-facing message.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Short timeout so a slow site cannot hang the UI.
        response = requests.get(url, headers=headers, timeout=5)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove non-content elements before extracting text.
        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            tag.decompose()

        # Prefer the semantic <article> tag, then common content class names.
        main_content = (
            soup.find('article') or
            soup.find(attrs={"class": lambda x: x and any(
                c in x for c in ['article', 'post-content', 'entry-content', 'content'])})
        )

        if main_content:
            # BUG FIX: recursive=False only matched <p> tags that were
            # *direct* children of the container; real articles nest their
            # paragraphs inside <div>s, so most pages yielded no text at all.
            # Search the whole subtree instead.
            paragraphs = main_content.find_all('p')
        else:
            # No recognizable container: sample a bounded number of <p> tags.
            paragraphs = soup.find_all('p', limit=20)

        # Keep only substantive paragraphs (>40 chars after stripping).
        texts = (p.get_text().strip() for p in paragraphs)
        article_text = ' '.join(t for t in texts if len(t) > 40)

        # Cap total length so downstream chunking stays bounded.
        return article_text[:5000]
    except Exception as e:
        raise Exception(f"Error fetching article: {str(e)}")
def extract_and_summarize(url, progress=gr.Progress()):
    """Gradio handler: fetch a news article URL and return an AI summary.

    Always returns a user-facing string (summary or error message) and never
    raises, so the UI textbox always receives displayable text.

    Args:
        url: article URL typed by the user.
        progress: Gradio progress tracker (injected by the framework).

    Returns:
        str: the summary prefixed with timing info, or an error message.
    """
    if not url or not url.strip():
        return "Please enter a valid URL"

    if not is_valid_url(url):
        return "Please enter a valid URL starting with http:// or https://"

    # BUG FIX: the model-init block deliberately sets summarizer to None on
    # load failure, but this function never checked for that; calling None
    # raised a TypeError that the broad except below turned into a cryptic
    # "Error processing article" message.
    if summarizer is None:
        return "Error processing article: summarization model failed to load"

    try:
        start_time = time.time()

        progress(0.2, desc="Fetching article...")
        text = extract_article_text(url)

        if not text:
            return "Could not extract text from the article. Please make sure it's a valid news article."

        progress(0.4, desc="Processing text...")
        # Split into fixed-size chunks the model can handle; cap at 3 chunks
        # to bound latency.
        max_chunk_length = 512
        chunks = [text[i:i + max_chunk_length]
                  for i in range(0, len(text), max_chunk_length)][:3]

        progress(0.6, desc="Generating summary...")
        summaries = []
        for chunk in chunks:
            # Skip fragments too short to produce a meaningful summary.
            if len(chunk.strip()) > 50:
                try:
                    summary = summarizer(
                        chunk,
                        max_length=100,
                        min_length=20,
                        do_sample=False
                    )
                    summaries.append(summary[0]['summary_text'])
                except Exception as e:
                    # Best effort: a failed chunk is skipped, not fatal.
                    print(f"Error summarizing chunk: {e}")
                    continue

        if not summaries:
            return "Could not generate summary. Please try a different article."

        final_summary = " ".join(summaries)
        processing_time = round(time.time() - start_time, 2)
        return f"Summary (processed in {processing_time}s):\n\n{final_summary}"

    except Exception as e:
        return f"Error processing article: {str(e)}"
# Create the Gradio UI: a single URL textbox in, a summary textbox out.
demo = gr.Interface(
    fn=extract_and_summarize,
    inputs=gr.Textbox(
        label="Enter News Article URL",
        placeholder="https://...",
        info="Enter a news article URL to get a quick summary"
    ),
    outputs=gr.Textbox(label="Summary", lines=5),
    title="📰 Fast News Article Summarizer",
    description="""
    This app quickly summarizes news articles using AI.
    Simply paste a URL and get a concise summary in seconds!
    """,
    # Example URLs displayed beneath the input box; clicking one fills the
    # textbox. NOTE(review): these are live external links — they may rot.
    examples=[
        ["https://www.bbc.com/news/world-us-canada-67841980"],
        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
    ],
    theme=gr.themes.Soft()
)

# Launch the web server only when run as a script, not when imported
# (e.g. by the Hugging Face Spaces runtime, which calls launch itself).
if __name__ == "__main__":
    demo.launch()