import re

import nltk
import pdfplumber
import streamlit as st
from gtts import gTTS
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Download the VADER lexicon for sentiment analysis (no-op if already present)
nltk.download('vader_lexicon', quiet=True)


# Load the Pegasus summarizer once and cache it across Streamlit reruns,
# so the model is not re-downloaded and re-initialized on every interaction
@st.cache_resource
def load_summarizer():
    tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")
    model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-xsum")
    return tokenizer, model


tokenizer, model = load_summarizer()
sia = SentimentIntensityAnalyzer()


# Helper functions
def extract_text_from_pdf(file):
    """Extract text from every page, skipping pages with no extractable text."""
    with pdfplumber.open(file) as pdf:
        pages = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(pages)


def clean_text(text):
    # Replace "Page N" artifacts with a newline so surrounding words don't merge
    text = re.sub(r'\s*Page \d+\s*', '\n', text)
    return text.strip()


def split_text_into_paragraphs(text):
    """Split on blank lines and drop empty chunks."""
    return [p.strip() for p in text.split('\n\n') if p.strip()]


def summarize_text_pegasus(text, max_length=512):
    inputs = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=max_length,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def summarize_large_document(text, max_length=512):
    # Summarize paragraph by paragraph; cap at the first 10 paragraphs to bound runtime
    paragraphs = split_text_into_paragraphs(text)
    summaries = [summarize_text_pegasus(p, max_length=max_length) for p in paragraphs[:10]]
    return " ".join(summaries)


def text_to_speech(text, lang="en"):
    tts = gTTS(text=text, lang=lang)
    tts.save("summary.mp3")
    return "summary.mp3"


def extract_keywords(text, top_n=10):
    """Return the top_n most frequent non-stopword terms."""
    vectorizer = CountVectorizer(stop_words="english")
    word_counts = vectorizer.fit_transform([text])
    keywords = sorted(
        zip(vectorizer.get_feature_names_out(), word_counts.toarray()[0]),
        key=lambda x: x[1],
        reverse=True,
    )[:top_n]
    return [word for word, count in keywords]


def analyze_sentiment(text):
    """Return VADER polarity scores (neg/neu/pos/compound)."""
    return sia.polarity_scores(text)


# Streamlit App Interface
st.title("Enhanced PDF to Audiobook App")
st.markdown("### Turn documents into interactive audiobooks with advanced features.")

# Persist results across reruns: Streamlit re-executes the whole script on every
# widget interaction, so the summary and audio path must live in session_state
# (otherwise clicking "Convert Summary to Audiobook" raises a NameError because
# `summary` only exists during the run in which "Summarize Document" was clicked)
if 'audio_path' not in st.session_state:
    st.session_state['audio_path'] = None
if 'summary' not in st.session_state:
    st.session_state['summary'] = None

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file:
    with st.spinner("Extracting and cleaning PDF content..."):
        raw_text = extract_text_from_pdf(uploaded_file)
        cleaned_text = clean_text(raw_text)

    st.text_area("Extracted Text", cleaned_text[:5000], height=300,
                 help="Displaying first 5000 characters.")

    if st.button("Summarize Document"):
        with st.spinner("Summarizing document..."):
            st.session_state['summary'] = summarize_large_document(cleaned_text, max_length=512)

    if st.session_state['summary']:
        st.text_area("Summary", st.session_state['summary'], height=300)

        if st.button("Convert Summary to Audiobook"):
            with st.spinner("Generating audio..."):
                st.session_state['audio_path'] = text_to_speech(st.session_state['summary'])

    if st.session_state['audio_path']:
        st.audio(st.session_state['audio_path'], format="audio/mp3")
        # Read the bytes up front so no file handle is left open by the widget
        with open(st.session_state['audio_path'], "rb") as f:
            st.download_button("Download Audio", data=f.read(), file_name="summary_audio.mp3")

    st.markdown("### Document Insights")
    if st.checkbox("Extract Keywords"):
        with st.spinner("Extracting keywords..."):
            keywords = extract_keywords(cleaned_text)
        st.write("Keywords:", ", ".join(keywords))

    if st.checkbox("Analyze Sentiment"):
        with st.spinner("Analyzing sentiment..."):
            sentiment = analyze_sentiment(cleaned_text)
        st.write("Sentiment Analysis:", sentiment)
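# -----------------------------------------------------------------------------
# Usage note (a sketch, assuming this script is saved as app.py; the filename
# is an assumption, not part of the original): install the dependencies and
# launch the app with Streamlit:
#
#     pip install streamlit pdfplumber transformers torch gTTS scikit-learn nltk
#     streamlit run app.py
#
# The first launch downloads the google/pegasus-xsum weights from the Hugging
# Face Hub, so expect a delay before the UI becomes responsive; subsequent
# reruns reuse the cached model via st.cache_resource.
# -----------------------------------------------------------------------------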