"""Gradio app: fetch a news-article URL, extract its text, and summarize it with a small BART model."""

import time
from urllib.parse import urlparse

import gradio as gr
import nltk
import requests
import torch
from bs4 import BeautifulSoup
from transformers import pipeline

# Download required NLTK data.
# NOTE(review): nltk is never used below — the download looks vestigial; confirm before removing.
try:
    nltk.download('punkt', quiet=True)
except Exception as e:
    print(f"Error downloading NLTK data: {e}")

# Initialize the summarization pipeline with a smaller, faster model.
# BUGFIX: "facebook/bart-base-cnn" does not exist on the HuggingFace hub, so the
# pipeline always failed to load and summarizer was always None. Use the real
# distilled CNN/DailyMail checkpoint, which is the small/fast option intended here.
try:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    summarizer = pipeline(
        "summarization",
        model="sshleifer/distilbart-cnn-12-6",
        device=device,
        model_kwargs={"cache_dir": "model_cache"},
    )
except Exception as e:
    print(f"Error loading model: {e}")
    summarizer = None


def is_valid_url(url):
    """Return True if *url* parses with both a scheme and a network location."""
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        # urlparse raises ValueError on malformed input (e.g. a bad port);
        # narrowed from a bare except so real bugs are not swallowed.
        return False


def extract_article_text(url):
    """Fetch *url* and return up to 5000 chars of article paragraph text.

    Strips scripts/nav/boilerplate, prefers an <article> or content-classed
    container, and keeps only paragraphs longer than 40 characters.

    Raises:
        Exception: wrapping any network or parsing error, chained to the cause.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Short timeout keeps the UI responsive on slow hosts.
        response = requests.get(url, headers=headers, timeout=5)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove elements that never contain article prose.
        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            tag.decompose()

        # Find the main content container - optimized search.
        main_content = (
            soup.find('article') or
            soup.find(attrs={"class": lambda x: x and any(
                c in x for c in ['article', 'post-content', 'entry-content', 'content'])})
        )

        if main_content:
            # BUGFIX: recursive=False only matched <p> tags that were DIRECT
            # children of the container, returning nothing on the common layout
            # where paragraphs sit inside nested <div>s. Search the whole
            # subtree, capped to bound work on huge pages.
            paragraphs = main_content.find_all('p', limit=40)
        else:
            # Limit number of paragraphs if no main content found.
            paragraphs = soup.find_all('p', limit=20)

        # Keep only substantial paragraphs (> 40 chars) to skip captions/bylines.
        article_text = ' '.join(
            p.get_text().strip()
            for p in paragraphs
            if len(p.get_text().strip()) > 40
        )

        # Limit total text length so downstream chunking stays fast.
        return article_text[:5000]

    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise Exception(f"Error fetching article: {str(e)}") from e


def extract_and_summarize(url, progress=gr.Progress()):
    """Validate *url*, extract its article text, and return a chunked AI summary.

    Returns a user-facing string in every case (summary or error message);
    never raises, so Gradio always has something to display.
    """
    if not url or not url.strip():
        return "Please enter a valid URL"

    if not is_valid_url(url):
        return "Please enter a valid URL starting with http:// or https://"

    # BUGFIX: if model loading failed at startup, summarizer is None and every
    # chunk call would crash; fail fast with a clear message instead.
    if summarizer is None:
        return "Summarization model is unavailable. Please try again later."

    try:
        start_time = time.time()

        # Extract article text with progress updates.
        progress(0.2, desc="Fetching article...")
        text = extract_article_text(url)

        if not text:
            return "Could not extract text from the article. Please make sure it's a valid news article."

        progress(0.4, desc="Processing text...")

        # Split text into smaller chunks (character-based, model-friendly size).
        max_chunk_length = 512
        chunks = [text[i:i + max_chunk_length]
                  for i in range(0, len(text), max_chunk_length)]

        # Process at most 3 chunks to bound latency.
        chunks = chunks[:3]

        progress(0.6, desc="Generating summary...")

        # Summarize each chunk with a short output; skip chunks that fail.
        summaries = []
        for chunk in chunks:
            if len(chunk.strip()) > 50:  # skip near-empty trailing chunks
                try:
                    summary = summarizer(
                        chunk,
                        max_length=100,
                        min_length=20,
                        do_sample=False,  # deterministic output
                    )
                    summaries.append(summary[0]['summary_text'])
                except Exception as e:
                    print(f"Error summarizing chunk: {e}")
                    continue

        if not summaries:
            return "Could not generate summary. Please try a different article."

        # Combine per-chunk summaries into one result.
        final_summary = " ".join(summaries)

        # Add processing time information.
        processing_time = round(time.time() - start_time, 2)

        return f"Summary (processed in {processing_time}s):\n\n{final_summary}"

    except Exception as e:
        return f"Error processing article: {str(e)}"


# Create Gradio interface.
demo = gr.Interface(
    fn=extract_and_summarize,
    inputs=gr.Textbox(
        label="Enter News Article URL",
        placeholder="https://...",
        info="Enter a news article URL to get a quick summary"
    ),
    outputs=gr.Textbox(label="Summary", lines=5),
    title="📰 Fast News Article Summarizer",
    description="""
    This app quickly summarizes news articles using AI.
    Simply paste a URL and get a concise summary in seconds!
    """,
    examples=[
        ["https://www.bbc.com/news/world-us-canada-67841980"],
        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
    ],
    theme=gr.themes.Soft()
)

if __name__ == "__main__":
    demo.launch()