# --- Web-page residue captured together with the file (not program code) ---
# sohail-shaikh-s07's picture
# Update app.py
# 746134a verified
# raw
# history blame
# 4.85 kB
import textwrap
from urllib.parse import urlparse

import gradio as gr
import nltk
import requests
import torch
from bs4 import BeautifulSoup
from transformers import pipeline
# Fetch the NLTK "punkt" data at start-up. A failure is reported on stdout and
# otherwise ignored so the app can still come up.
try:
    nltk.download('punkt')
except Exception as download_error:
    print(f"Error downloading NLTK data: {download_error}")

# Build the summarization pipeline once, at import time, on the GPU when one is
# available. BART-Large-CNN is used here; a different checkpoint can be chosen
# through the `model` argument. If construction fails, `summarizer` is left as
# None and the error is printed.
try:
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    summarizer = pipeline(
        "summarization",
        model="facebook/bart-large-cnn",
        device=device,
    )
except Exception as load_error:
    print(f"Error loading model: {load_error}")
    summarizer = None
def is_valid_url(url):
    """Return True when *url* is an absolute http(s) URL that names a host.

    Args:
        url: Candidate URL. Anything that is not a ``str`` is rejected.

    Returns:
        bool: True only if the parsed scheme is ``http`` or ``https`` and the
        network location is non-empty.
    """
    if not isinstance(url, str):
        return False
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises ValueError for malformed input such as an
        # unbalanced IPv6 bracket ("http://[bad").
        return False
    # Restrict to http/https so the check agrees with the user-facing message
    # "Please enter a valid URL starting with http:// or https://".
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)
def extract_article_text(url):
    """Extract article text using BeautifulSoup instead of newspaper3k.

    Downloads *url*, removes script/style/navigation-type elements, and joins
    the text of every ``<p>`` whose stripped text is longer than 50 characters.
    Paragraphs are taken from the first ``<article>`` element, or else from the
    first element with one of the known content classes, or else from the
    whole page.

    Args:
        url: Address of the page to download.

    Returns:
        str: The qualifying paragraph texts joined by single spaces; an empty
        string when no paragraph qualifies.

    Raises:
        Exception: If downloading or parsing fails. The message is prefixed
            with "Error fetching article: " and the original error is chained
            as the cause.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # When the Content-Type header carries no charset, requests assumes
        # ISO-8859-1 for text responses, which garbles UTF-8 pages. Use the
        # encoding detected from the body in that case.
        content_type = response.headers.get('Content-Type', '')
        if 'charset' not in content_type.lower():
            response.encoding = response.apparent_encoding

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove elements that are not article prose so they cannot distract
        # the model and skew the summary.
        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            tag.decompose()

        main_content = soup.find('article') or soup.find(
            class_=['article', 'post-content', 'entry-content', 'content']
        )
        if main_content:
            paragraphs = main_content.find_all('p')
        else:
            # Fall back to all paragraphs if no article container was found.
            paragraphs = soup.find_all('p')

        # Keep only substantial paragraphs; short ones are dropped by the
        # 50-character threshold.
        paragraph_texts = (p.get_text().strip() for p in paragraphs)
        return ' '.join(text for text in paragraph_texts if len(text) > 50)
    except Exception as exc:
        raise Exception(f"Error fetching article: {exc}") from exc
def extract_and_summarize(url):
    """Fetch the article at *url* and return its summary or an error message.

    The article text is split into chunks of at most 1024 characters on
    whitespace boundaries, each chunk longer than 100 characters is summarized
    separately, and the partial summaries are joined with spaces.

    Args:
        url: Article URL as entered by the user; surrounding whitespace is
            ignored.

    Returns:
        str: The combined summary, or a human-readable message describing why
        no summary could be produced. This function does not raise; failures
        are reported through the returned string.
    """
    if not url or not url.strip():
        return "Please enter a valid URL"
    # Use the same trimmed value for validation and for the request.
    url = url.strip()
    if not is_valid_url(url):
        return "Please enter a valid URL starting with http:// or https://"
    # The module-level model load stores None on failure; report that clearly
    # instead of failing on every chunk and blaming the article.
    if summarizer is None:
        return "The summarization model could not be loaded. Please try again later."

    try:
        # Extract article text
        text = extract_article_text(url)
        if not text:
            return "Could not extract text from the article. Please make sure it's a valid news article."

        # Split the text into chunks of at most 1024 characters. Breaking only
        # at whitespace keeps words intact at chunk boundaries; a single token
        # longer than the limit is still hard-split.
        max_chunk_length = 1024
        chunks = textwrap.wrap(
            text,
            width=max_chunk_length,
            break_long_words=True,
            break_on_hyphens=False,
        )

        # Summarize each chunk individually.
        summaries = []
        for chunk in chunks:
            if len(chunk) <= 100:  # Only summarize substantial chunks
                continue
            try:
                result = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
                summaries.append(result[0]['summary_text'])
            except Exception as e:
                # Best effort: a failing chunk is logged and skipped.
                print(f"Error summarizing chunk: {e}")

        if not summaries:
            return "Could not generate summary. Please try a different article."

        # Combine the per-chunk summaries into one complete summary.
        return " ".join(summaries)
    except Exception as e:
        return f"Error processing article: {str(e)}"
# Create the Gradio interface: one URL textbox in, one summary textbox out.
# The emoji in the title and description are written as real Unicode
# characters (they had been mis-decoded into multi-character garbage).
demo = gr.Interface(
    fn=extract_and_summarize,
    inputs=gr.Textbox(
        label="Enter News Article URL",
        placeholder="https://...",
        info="Enter a news article URL to get a summary",
    ),
    outputs=gr.Textbox(label="Summary", lines=5),
    title="📰 News Article Summarizer",
    description="""This app creates concise summaries of news articles using AI.
Simply paste a URL of a news article and get a summary!
⏳ Processing Time: The summarization process typically takes 30-60 seconds, depending on article length.
📊 Status Indicator: Look for "Processing" in the output box - this indicates the model is actively generating your summary.
✨ Quality Assurance: Please wait for the process to complete for the best results.""",
    examples=[
        ["https://www.bbc.com/sport/football/articles/cvgxmzy86e4o"],
        ["https://globalsouthworld.com/article/biden-approves-571-million-in-defense-support-for-taiwan"],
    ],
    theme=gr.themes.Soft(),
)

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()