|
import gradio as gr |
|
import requests |
|
from bs4 import BeautifulSoup |
|
from transformers import pipeline |
|
import nltk |
|
import torch |
|
from urllib.parse import urlparse |
|
import time |
|
|
|
|
|
# Fetch the NLTK "punkt" tokenizer data once at import time. This is
# best-effort: a failure (for example, no network access) is reported on
# stdout and start-up continues.
# NOTE(review): nothing in the visible code calls an NLTK tokenizer — confirm
# this download is still needed.
try:
    nltk.download("punkt", quiet=True)
except Exception as download_error:
    print(f"Error downloading NLTK data: {download_error}")
|
|
|
|
|
# Summarization checkpoints to try, in order; the first one that loads is used.
# NOTE(review): "facebook/bart-base-cnn" could not be confirmed as a published
# checkpoint — verify it on the Hugging Face Hub. It is still tried first so
# that behavior is unchanged wherever it does load. The second entry is the
# checkpoint the Transformers summarization pipeline uses by default.
_SUMMARIZATION_MODELS = (
    "facebook/bart-base-cnn",
    "sshleifer/distilbart-cnn-12-6",
)

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# `summarizer` stays None when no checkpoint can be loaded; start-up is never
# aborted by a model-loading failure.
summarizer = None
for _model_name in _SUMMARIZATION_MODELS:
    try:
        summarizer = pipeline(
            "summarization",
            model=_model_name,
            device=device,
            # Keep downloaded weights in a local directory next to the app.
            model_kwargs={"cache_dir": "model_cache"},
        )
        break
    except Exception as e:
        print(f"Error loading model {_model_name}: {e}")
|
|
|
def is_valid_url(url):
    """Return True when *url* is an absolute http(s) URL with a host part.

    Only the ``http`` and ``https`` schemes are accepted, because those are
    the only ones the rest of the app can fetch and the only ones the
    user-facing error message mentions.

    Args:
        url: The candidate URL. Anything that is not a ``str`` is rejected.

    Returns:
        ``True`` if the URL parses, uses http or https and has a network
        location; ``False`` otherwise. Never raises.
    """
    if not isinstance(url, str):
        return False

    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises ValueError for malformed input such as an
        # unterminated IPv6 literal ("http://[::1").
        return False

    # urlparse lower-cases the scheme, so a plain membership test is enough.
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)
|
|
|
def _join_paragraphs(paragraphs):
    """Join the stripped text of *paragraphs*, skipping those of 40 characters or fewer."""
    texts = (p.get_text().strip() for p in paragraphs)
    return ' '.join(text for text in texts if len(text) > 40)


def extract_article_text(url):
    """Download a web page and return the text of its main article body.

    The page is fetched with a browser-like User-Agent and a 5 second
    timeout. Script, style, nav, header, footer and aside elements are
    removed. Paragraph text is then taken from the first of these sources
    that yields any:

    1. ``<p>`` tags that are direct children of the main container: the
       first ``<article>`` element or, failing that, the first element whose
       ``class`` contains "article", "post-content", "entry-content" or
       "content";
    2. ``<p>`` tags nested anywhere inside that container;
    3. the first 20 ``<p>`` tags of the whole document.

    Args:
        url: Address of the page to fetch.

    Returns:
        The paragraph texts joined by single spaces and truncated to 5000
        characters, or an empty string when no usable paragraph is found.

    Raises:
        RuntimeError: If the request or the parsing fails. The message starts
            with "Error fetching article: " and the original exception is
            attached as ``__cause__``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=5)
        response.raise_for_status()

        # Parse the raw bytes instead of response.text: when the server sends
        # no charset, requests decodes text/* as ISO-8859-1, which garbles
        # UTF-8 pages. Only a charset the server actually declared is passed
        # on; otherwise BeautifulSoup detects the encoding from the document.
        content_type = response.headers.get('content-type', '').lower()
        declared_encoding = response.encoding if 'charset' in content_type else None
        soup = BeautifulSoup(response.content, 'html.parser', from_encoding=declared_encoding)

        # Drop page chrome so it cannot leak into the extracted text.
        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            tag.decompose()

        main_content = (
            soup.find('article') or
            soup.find(attrs={"class": lambda x: x and any(c in x for c in ['article', 'post-content', 'entry-content', 'content'])})
        )

        article_text = ''
        if main_content:
            # Prefer top-level paragraphs; many sites wrap them in extra
            # <div>s, so fall back to a nested search when that finds nothing.
            article_text = (
                _join_paragraphs(main_content.find_all('p', recursive=False))
                or _join_paragraphs(main_content.find_all('p'))
            )
        if not article_text:
            article_text = _join_paragraphs(soup.find_all('p', limit=20))

        # Cap the amount of text handed to the summarizer.
        return article_text[:5000]

    except Exception as e:
        raise RuntimeError(f"Error fetching article: {e}") from e
|
|
|
def extract_and_summarize(url, progress=gr.Progress()):
    """Fetch the article at *url* and return a short summary as display text.

    Every failure is reported through the returned string; the function does
    not raise for bad input, network errors or model errors.

    Args:
        url: Article address typed by the user. Surrounding whitespace is
            ignored.
        progress: Gradio progress tracker, called with a completion fraction
            and a description at each stage.

    Returns:
        On success, "Summary (processed in <seconds>s):" followed by the
        joined chunk summaries. Otherwise a human-readable error message.
    """
    import textwrap

    if not url or not url.strip():
        return "Please enter a valid URL"

    # Validate and fetch the same, whitespace-free string.
    url = url.strip()
    if not is_valid_url(url):
        return "Please enter a valid URL starting with http:// or https://"

    # The model may have failed to load at start-up. Say so up front instead
    # of fetching the page and then blaming the article.
    if summarizer is None:
        return "Summarization model is not available. Please try again later."

    try:
        start_time = time.perf_counter()

        progress(0.2, desc="Fetching article...")
        text = extract_article_text(url)

        if not text:
            return "Could not extract text from the article. Please make sure it's a valid news article."

        progress(0.4, desc="Processing text...")

        # Split into pieces of at most 512 characters, breaking at whitespace
        # so that words are not cut in half, and keep only the first three
        # pieces to bound the processing time.
        max_chunk_length = 512
        chunks = textwrap.wrap(text, width=max_chunk_length)[:3]

        progress(0.6, desc="Generating summary...")

        summaries = []
        for chunk in chunks:
            # Very short fragments are not worth summarizing.
            if len(chunk.strip()) <= 50:
                continue
            try:
                summary = summarizer(
                    chunk,
                    max_length=100,
                    min_length=20,
                    do_sample=False
                )
                summaries.append(summary[0]['summary_text'])
            except Exception as e:
                # Best effort: one bad chunk must not discard the others.
                print(f"Error summarizing chunk: {e}")

        if not summaries:
            return "Could not generate summary. Please try a different article."

        final_summary = " ".join(summaries)

        processing_time = round(time.perf_counter() - start_time, 2)
        return f"Summary (processed in {processing_time}s):\n\n{final_summary}"

    except Exception as e:
        return f"Error processing article: {str(e)}"
|
|
|
|
|
# Single-line text field where the user pastes the article address.
_url_input = gr.Textbox(
    label="Enter News Article URL",
    placeholder="https://...",
    info="Enter a news article URL to get a quick summary",
)

# Five-line text area that shows the summary or an error message.
_summary_output = gr.Textbox(label="Summary", lines=5)

# Gradio UI: one URL in, one block of summary text out.
demo = gr.Interface(
    fn=extract_and_summarize,
    inputs=_url_input,
    outputs=_summary_output,
    title="📰 Fast News Article Summarizer",
    description="""
    This app quickly summarizes news articles using AI.
    Simply paste a URL and get a concise summary in seconds!
    """,
    examples=[
        ["https://www.bbc.com/news/world-us-canada-67841980"],
        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"],
    ],
    theme=gr.themes.Soft(),
)


# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|
|