Spaces:

Arslan17121
/

AudiobookCoV2

Running

App Files Community

Arslan17121 committed on Jan 2

Commit

e3578a9

verified ·

1 Parent(s): ba78b60

Update app.py

Browse files

Files changed (1) hide show

app.py +4 -50

app.py CHANGED Viewed

@@ -1,12 +1,11 @@
 import streamlit as st
 import pdfplumber
 import re
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, DPRQuestionEncoder, DPRContextEncoder
 from gtts import gTTS
 from sklearn.feature_extraction.text import CountVectorizer
 import nltk
 from nltk.sentiment import SentimentIntensityAnalyzer
-import faiss
 import numpy as np
 # Download necessary NLTK data
@@ -15,13 +14,8 @@ nltk.download('vader_lexicon')
 # Initialize necessary components
 tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")
 model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-xsum")
-qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
 sia = SentimentIntensityAnalyzer()
-# Initialize RAG components
-question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
-context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
 # Helper functions
 def extract_text_from_pdf(file):
     with pdfplumber.open(file) as pdf:
@@ -48,36 +42,10 @@ def summarize_large_document(text, max_length=800):
     summaries = [summarize_text_pegasus(paragraph, max_length=max_length) for paragraph in paragraphs]
     return " ".join(summaries)
-def embed_text(text, encoder, tokenizer):
-    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="longest")
-    embeddings = encoder(**inputs).pooler_output
-    return embeddings.detach().numpy()
-def build_index(paragraphs):
-    index = faiss.IndexFlatL2(768)
-    embeddings = []
-    for paragraph in paragraphs:
-        embeddings.append(embed_text(paragraph, context_encoder, tokenizer))
-    embeddings = np.vstack(embeddings)
-    index.add(embeddings)
-    return index, paragraphs
-def retrieve_relevant_paragraphs(question, index, paragraphs, top_k=5):
-    question_embedding = embed_text(question, question_encoder, tokenizer)
-    distances, indices = index.search(question_embedding, top_k)
-    return [paragraphs[i] for i in indices[0]]
-def answer_question_with_rag(question, context, top_k=5):
-    paragraphs = split_text_into_paragraphs(context)
-    index, paragraphs = build_index(paragraphs)
-    relevant_paragraphs = retrieve_relevant_paragraphs(question, index, paragraphs, top_k)
-    answers = [qa_pipeline({'question': question, 'context': paragraph})['answer'] for paragraph in relevant_paragraphs]
-    return " ".join(answers)
 def text_to_speech(text, lang="en"):
     tts = gTTS(text=text, lang=lang)
-    tts.save("discussion_points.mp3")
-    return "discussion_points.mp3"
 def extract_keywords(text, top_n=10):
     vectorizer = CountVectorizer(stop_words="english")
@@ -119,20 +87,6 @@ if uploaded_file:
             st.audio(st.session_state['audio_path'], format="audio/mp3")
             st.download_button("Download Audio", data=open(st.session_state['audio_path'], "rb"), file_name="summary_audio.mp3")
-    st.markdown("### Ask Questions About the Document")
-    question = st.text_input("Your Question:")
-    if question:
-        with st.spinner("Answering your question..."):
-            answer = answer_question_with_rag(question, cleaned_text)
-        st.write(f"**Answer:** {answer}")
-        if st.button("Convert Answer to Audio"):
-            with st.spinner("Generating answer audio..."):
-                answer_audio_path = text_to_speech(answer)
-                st.session_state['audio_path'] = answer_audio_path
-        if st.session_state['audio_path']:
-            st.audio(st.session_state['audio_path'], format="audio/mp3")
-            st.download_button("Download Answer Audio", data=open(st.session_state['audio_path'], "rb"), file_name="answer_audio.mp3")
     st.markdown("### Document Insights")
     if st.checkbox("Extract Keywords"):
         with st.spinner("Extracting keywords..."):
@@ -142,4 +96,4 @@ if uploaded_file:
     if st.checkbox("Analyze Sentiment"):
         with st.spinner("Analyzing sentiment..."):
             sentiment = analyze_sentiment(cleaned_text)
-        st.write("Sentiment Analysis:", sentiment)

 import streamlit as st
 import pdfplumber
 import re
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from gtts import gTTS
 from sklearn.feature_extraction.text import CountVectorizer
 import nltk
 from nltk.sentiment import SentimentIntensityAnalyzer
 import numpy as np
 # Download necessary NLTK data
 # Initialize necessary components
 tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")
 model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-xsum")
 sia = SentimentIntensityAnalyzer()
 # Helper functions
 def extract_text_from_pdf(file):
     with pdfplumber.open(file) as pdf:
     summaries = [summarize_text_pegasus(paragraph, max_length=max_length) for paragraph in paragraphs]
     return " ".join(summaries)
 def text_to_speech(text, lang="en"):
     tts = gTTS(text=text, lang=lang)
+    tts.save("summary.mp3")
+    return "summary.mp3"
 def extract_keywords(text, top_n=10):
     vectorizer = CountVectorizer(stop_words="english")
             st.audio(st.session_state['audio_path'], format="audio/mp3")
             st.download_button("Download Audio", data=open(st.session_state['audio_path'], "rb"), file_name="summary_audio.mp3")
     st.markdown("### Document Insights")
     if st.checkbox("Extract Keywords"):
         with st.spinner("Extracting keywords..."):
     if st.checkbox("Analyze Sentiment"):
         with st.spinner("Analyzing sentiment..."):
             sentiment = analyze_sentiment(cleaned_text)
+        st.write("Sentiment Analysis:", sentiment)