"""Gradio app: fetch a news-article URL, extract its text, and summarize it with a small BART model."""

import time
from urllib.parse import urlparse

import gradio as gr
import nltk
import requests
import torch
from bs4 import BeautifulSoup
from transformers import pipeline

# Download required NLTK data.
# NOTE(review): nltk is never used below — the download looks vestigial; confirm before removing.
try:
    nltk.download('punkt', quiet=True)
except Exception as e:
    print(f"Error downloading NLTK data: {e}")

# Initialize the summarization pipeline with a smaller, faster model.
# BUGFIX: "facebook/bart-base-cnn" does not exist on the HuggingFace hub, so the
# pipeline always failed to load and summarizer was always None. Use the real
# distilled CNN/DailyMail checkpoint, which is the small/fast option intended here.
try:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    summarizer = pipeline(
        "summarization",
        model="sshleifer/distilbart-cnn-12-6",
        device=device,
        model_kwargs={"cache_dir": "model_cache"},
    )
except Exception as e:
    print(f"Error loading model: {e}")
    summarizer = None


def is_valid_url(url):
    """Return True if *url* parses with both a scheme and a network location."""
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        # urlparse raises ValueError on malformed input (e.g. a bad port);
        # narrowed from a bare except so real bugs are not swallowed.
        return False


def extract_article_text(url):
    """Fetch *url* and return up to 5000 chars of article paragraph text.

    Strips scripts/nav/boilerplate, prefers an <article> or content-classed
    container, and keeps only paragraphs longer than 40 characters.

    Raises:
        Exception: wrapping any network or parsing error, chained to the cause.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Short timeout keeps the UI responsive on slow hosts.
        response = requests.get(url, headers=headers, timeout=5)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove elements that never contain article prose.
        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            tag.decompose()

        # Find the main content container - optimized search.
        main_content = (
            soup.find('article') or
            soup.find(attrs={"class": lambda x: x and any(
                c in x for c in ['article', 'post-content', 'entry-content', 'content'])})
        )

        if main_content:
            # BUGFIX: recursive=False only matched <p> tags that were DIRECT
            # children of the container, returning nothing on the common layout
            # where paragraphs sit inside nested <div>s. Search the whole
            # subtree, capped to bound work on huge pages.
            paragraphs = main_content.find_all('p', limit=40)
        else:
            # Limit number of paragraphs if no main content found.
            paragraphs = soup.find_all('p', limit=20)

        # Keep only substantial paragraphs (> 40 chars) to skip captions/bylines.
        article_text = ' '.join(
            p.get_text().strip()
            for p in paragraphs
            if len(p.get_text().strip()) > 40
        )

        # Limit total text length so downstream chunking stays fast.
        return article_text[:5000]

    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise Exception(f"Error fetching article: {str(e)}") from e


def extract_and_summarize(url, progress=gr.Progress()):
    """Validate *url*, extract its article text, and return a chunked AI summary.

    Returns a user-facing string in every case (summary or error message);
    never raises, so Gradio always has something to display.
    """
    if not url or not url.strip():
        return "Please enter a valid URL"

    if not is_valid_url(url):
        return "Please enter a valid URL starting with http:// or https://"

    # BUGFIX: if model loading failed at startup, summarizer is None and every
    # chunk call would crash; fail fast with a clear message instead.
    if summarizer is None:
        return "Summarization model is unavailable. Please try again later."

    try:
        start_time = time.time()

        # Extract article text with progress updates.
        progress(0.2, desc="Fetching article...")
        text = extract_article_text(url)

        if not text:
            return "Could not extract text from the article. Please make sure it's a valid news article."

        progress(0.4, desc="Processing text...")

        # Split text into smaller chunks (character-based, model-friendly size).
        max_chunk_length = 512
        chunks = [text[i:i + max_chunk_length]
                  for i in range(0, len(text), max_chunk_length)]

        # Process at most 3 chunks to bound latency.
        chunks = chunks[:3]

        progress(0.6, desc="Generating summary...")

        # Summarize each chunk with a short output; skip chunks that fail.
        summaries = []
        for chunk in chunks:
            if len(chunk.strip()) > 50:  # skip near-empty trailing chunks
                try:
                    summary = summarizer(
                        chunk,
                        max_length=100,
                        min_length=20,
                        do_sample=False,  # deterministic output
                    )
                    summaries.append(summary[0]['summary_text'])
                except Exception as e:
                    print(f"Error summarizing chunk: {e}")
                    continue

        if not summaries:
            return "Could not generate summary. Please try a different article."

        # Combine per-chunk summaries into one result.
        final_summary = " ".join(summaries)

        # Add processing time information.
        processing_time = round(time.time() - start_time, 2)

        return f"Summary (processed in {processing_time}s):\n\n{final_summary}"

    except Exception as e:
        return f"Error processing article: {str(e)}"


# Create Gradio interface.
demo = gr.Interface(
    fn=extract_and_summarize,
    inputs=gr.Textbox(
        label="Enter News Article URL",
        placeholder="https://...",
        info="Enter a news article URL to get a quick summary"
    ),
    outputs=gr.Textbox(label="Summary", lines=5),
    title="📰 Fast News Article Summarizer",
    description="""
    This app quickly summarizes news articles using AI.
    Simply paste a URL and get a concise summary in seconds!
    """,
    examples=[
        ["https://www.bbc.com/news/world-us-canada-67841980"],
        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
    ],
    theme=gr.themes.Soft()
)

if __name__ == "__main__":
    demo.launch()