Arslan17121 committed on
Commit e3578a9 · verified · 1 Parent(s): ba78b60

Update app.py

Files changed (1)
  1. app.py +4 -50
app.py CHANGED
@@ -1,12 +1,11 @@
 import streamlit as st
 import pdfplumber
 import re
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, DPRQuestionEncoder, DPRContextEncoder
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from gtts import gTTS
 from sklearn.feature_extraction.text import CountVectorizer
 import nltk
 from nltk.sentiment import SentimentIntensityAnalyzer
-import faiss
 import numpy as np
 
 # Download necessary NLTK data
@@ -15,13 +14,8 @@ nltk.download('vader_lexicon')
 # Initialize necessary components
 tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")
 model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-xsum")
-qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
 sia = SentimentIntensityAnalyzer()
 
-# Initialize RAG components
-question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
-context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
-
 # Helper functions
 def extract_text_from_pdf(file):
     with pdfplumber.open(file) as pdf:
@@ -48,36 +42,10 @@ def summarize_large_document(text, max_length=800):
     summaries = [summarize_text_pegasus(paragraph, max_length=max_length) for paragraph in paragraphs]
     return " ".join(summaries)
 
-def embed_text(text, encoder, tokenizer):
-    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="longest")
-    embeddings = encoder(**inputs).pooler_output
-    return embeddings.detach().numpy()
-
-def build_index(paragraphs):
-    index = faiss.IndexFlatL2(768)
-    embeddings = []
-    for paragraph in paragraphs:
-        embeddings.append(embed_text(paragraph, context_encoder, tokenizer))
-    embeddings = np.vstack(embeddings)
-    index.add(embeddings)
-    return index, paragraphs
-
-def retrieve_relevant_paragraphs(question, index, paragraphs, top_k=5):
-    question_embedding = embed_text(question, question_encoder, tokenizer)
-    distances, indices = index.search(question_embedding, top_k)
-    return [paragraphs[i] for i in indices[0]]
-
-def answer_question_with_rag(question, context, top_k=5):
-    paragraphs = split_text_into_paragraphs(context)
-    index, paragraphs = build_index(paragraphs)
-    relevant_paragraphs = retrieve_relevant_paragraphs(question, index, paragraphs, top_k)
-    answers = [qa_pipeline({'question': question, 'context': paragraph})['answer'] for paragraph in relevant_paragraphs]
-    return " ".join(answers)
-
 def text_to_speech(text, lang="en"):
     tts = gTTS(text=text, lang=lang)
-    tts.save("discussion_points.mp3")
-    return "discussion_points.mp3"
+    tts.save("summary.mp3")
+    return "summary.mp3"
 
 def extract_keywords(text, top_n=10):
     vectorizer = CountVectorizer(stop_words="english")
@@ -119,20 +87,6 @@ if uploaded_file:
         st.audio(st.session_state['audio_path'], format="audio/mp3")
         st.download_button("Download Audio", data=open(st.session_state['audio_path'], "rb"), file_name="summary_audio.mp3")
 
-    st.markdown("### Ask Questions About the Document")
-    question = st.text_input("Your Question:")
-    if question:
-        with st.spinner("Answering your question..."):
-            answer = answer_question_with_rag(question, cleaned_text)
-        st.write(f"**Answer:** {answer}")
-        if st.button("Convert Answer to Audio"):
-            with st.spinner("Generating answer audio..."):
-                answer_audio_path = text_to_speech(answer)
-                st.session_state['audio_path'] = answer_audio_path
-        if st.session_state['audio_path']:
-            st.audio(st.session_state['audio_path'], format="audio/mp3")
-            st.download_button("Download Answer Audio", data=open(st.session_state['audio_path'], "rb"), file_name="answer_audio.mp3")
-
     st.markdown("### Document Insights")
     if st.checkbox("Extract Keywords"):
         with st.spinner("Extracting keywords..."):
@@ -142,4 +96,4 @@ if uploaded_file:
     if st.checkbox("Analyze Sentiment"):
         with st.spinner("Analyzing sentiment..."):
             sentiment = analyze_sentiment(cleaned_text)
-        st.write("Sentiment Analysis:", sentiment)
+        st.write("Sentiment Analysis:", sentiment)
 
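The flow that remains after this commit is extract → summarize → speak. A minimal sketch of that pipeline, assuming app.py's helpers can be imported as a module and that a sample.pdf exists (both the import path and the file name are illustrative assumptions, not part of the commit):

# Hedged sketch: drives the helpers this commit keeps, outside the Streamlit UI.
# Assumes app.py is importable; "sample.pdf" is a hypothetical input file.
from app import extract_text_from_pdf, summarize_large_document, text_to_speech

text = extract_text_from_pdf("sample.pdf")            # pdfplumber accepts a file path
summary = summarize_large_document(text, max_length=800)
audio_path = text_to_speech(summary)                  # now saves and returns "summary.mp3"
print(f"Audio written to {audio_path}")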