import re

import nltk
import pdfplumber
import streamlit as st
from gtts import gTTS
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Download the VADER lexicon for sentiment analysis (no-op if already present)
nltk.download('vader_lexicon', quiet=True)


# Load the Pegasus summarizer once and cache it across Streamlit reruns,
# so the model is not re-downloaded and re-initialized on every interaction
@st.cache_resource
def load_summarizer():
    tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")
    model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-xsum")
    return tokenizer, model


tokenizer, model = load_summarizer()
sia = SentimentIntensityAnalyzer()


# Helper functions
def extract_text_from_pdf(file):
    """Extract text from every page, skipping pages with no extractable text."""
    with pdfplumber.open(file) as pdf:
        pages = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(pages)


def clean_text(text):
    # Replace "Page N" artifacts with a newline so surrounding words don't merge
    text = re.sub(r'\s*Page \d+\s*', '\n', text)
    return text.strip()


def split_text_into_paragraphs(text):
    """Split on blank lines and drop empty chunks."""
    return [p.strip() for p in text.split('\n\n') if p.strip()]


def summarize_text_pegasus(text, max_length=512):
    inputs = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=max_length,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def summarize_large_document(text, max_length=512):
    # Summarize paragraph by paragraph; cap at the first 10 paragraphs to bound runtime
    paragraphs = split_text_into_paragraphs(text)
    summaries = [summarize_text_pegasus(p, max_length=max_length) for p in paragraphs[:10]]
    return " ".join(summaries)


def text_to_speech(text, lang="en"):
    tts = gTTS(text=text, lang=lang)
    tts.save("summary.mp3")
    return "summary.mp3"


def extract_keywords(text, top_n=10):
    """Return the top_n most frequent non-stopword terms."""
    vectorizer = CountVectorizer(stop_words="english")
    word_counts = vectorizer.fit_transform([text])
    keywords = sorted(
        zip(vectorizer.get_feature_names_out(), word_counts.toarray()[0]),
        key=lambda x: x[1],
        reverse=True,
    )[:top_n]
    return [word for word, count in keywords]


def analyze_sentiment(text):
    """Return VADER polarity scores (neg/neu/pos/compound)."""
    return sia.polarity_scores(text)


# Streamlit App Interface
st.title("Enhanced PDF to Audiobook App")
st.markdown("### Turn documents into interactive audiobooks with advanced features.")

# Persist results across reruns: Streamlit re-executes the whole script on every
# widget interaction, so the summary and audio path must live in session_state
# (otherwise clicking "Convert Summary to Audiobook" raises a NameError because
# `summary` only exists during the run in which "Summarize Document" was clicked)
if 'audio_path' not in st.session_state:
    st.session_state['audio_path'] = None
if 'summary' not in st.session_state:
    st.session_state['summary'] = None

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file:
    with st.spinner("Extracting and cleaning PDF content..."):
        raw_text = extract_text_from_pdf(uploaded_file)
        cleaned_text = clean_text(raw_text)

    st.text_area("Extracted Text", cleaned_text[:5000], height=300,
                 help="Displaying first 5000 characters.")

    if st.button("Summarize Document"):
        with st.spinner("Summarizing document..."):
            st.session_state['summary'] = summarize_large_document(cleaned_text, max_length=512)

    if st.session_state['summary']:
        st.text_area("Summary", st.session_state['summary'], height=300)

        if st.button("Convert Summary to Audiobook"):
            with st.spinner("Generating audio..."):
                st.session_state['audio_path'] = text_to_speech(st.session_state['summary'])

    if st.session_state['audio_path']:
        st.audio(st.session_state['audio_path'], format="audio/mp3")
        # Read the bytes up front so no file handle is left open by the widget
        with open(st.session_state['audio_path'], "rb") as f:
            st.download_button("Download Audio", data=f.read(), file_name="summary_audio.mp3")

    st.markdown("### Document Insights")
    if st.checkbox("Extract Keywords"):
        with st.spinner("Extracting keywords..."):
            keywords = extract_keywords(cleaned_text)
        st.write("Keywords:", ", ".join(keywords))

    if st.checkbox("Analyze Sentiment"):
        with st.spinner("Analyzing sentiment..."):
            sentiment = analyze_sentiment(cleaned_text)
        st.write("Sentiment Analysis:", sentiment)
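# -----------------------------------------------------------------------------
# Usage note (a sketch, assuming this script is saved as app.py; the filename
# is an assumption, not part of the original): install the dependencies and
# launch the app with Streamlit:
#
#     pip install streamlit pdfplumber transformers torch gTTS scikit-learn nltk
#     streamlit run app.py
#
# The first launch downloads the google/pegasus-xsum weights from the Hugging
# Face Hub, so expect a delay before the UI becomes responsive; subsequent
# reruns reuse the cached model via st.cache_resource.
# -----------------------------------------------------------------------------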