Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,33 +4,36 @@ import streamlit as st
|
|
4 |
from gtts import gTTS
|
5 |
import os
|
6 |
from sklearn.metrics.pairwise import cosine_similarity
|
7 |
-
|
8 |
-
|
|
|
9 |
text = ""
|
10 |
with pdfplumber.open(pdf_path) as pdf:
|
11 |
-
for page in pdf.pages:
|
12 |
text += page.extract_text() + "\n"
|
13 |
return text
|
14 |
|
15 |
-
# Load your PDF file (upload it in
|
16 |
-
pdf_path = "/content/Accounting.pdf" #
|
17 |
-
pdf_text = extract_text_from_pdf(pdf_path)
|
18 |
|
19 |
-
#
|
20 |
model = SentenceTransformer('all-MiniLM-L6-v2')
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
# Function to respond to user query
|
25 |
def respond_to_query(query):
|
26 |
query_embedding = model.encode(query, convert_to_tensor=True)
|
27 |
-
|
28 |
-
# Find the closest sentence based on cosine similarity
|
29 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
30 |
similarities = cosine_similarity(query_embedding.reshape(1, -1), pdf_embeddings)
|
31 |
best_match_index = similarities.argmax()
|
32 |
-
|
33 |
-
response = pdf_sentences[best_match_index]
|
34 |
return response
|
35 |
|
36 |
# Streamlit app
|
@@ -42,14 +45,14 @@ submit_button = st.button("Ask")
|
|
42 |
if submit_button:
|
43 |
if query:
|
44 |
response = respond_to_query(query)
|
45 |
-
|
46 |
# Text-to-Speech
|
47 |
tts = gTTS(response)
|
48 |
tts.save("response.mp3")
|
49 |
-
|
50 |
-
# Playing audio
|
51 |
-
os.system("mpg321 response.mp3")
|
52 |
-
|
53 |
st.write(response)
|
54 |
else:
|
55 |
-
st.write("Please enter a question.")
|
|
|
4 |
from gtts import gTTS
|
5 |
import os
|
6 |
from sklearn.metrics.pairwise import cosine_similarity
|
7 |
+
|
8 |
+
# Function to extract text from a limited number of pages in a PDF
|
9 |
+
def extract_text_from_pdf(pdf_path, start_page=0, end_page=10):
    """Extract text from a limited range of pages in a PDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.
        start_page: Slice start into ``pdf.pages`` (0-based, inclusive).
        end_page: Slice end into ``pdf.pages`` (exclusive).

    Returns:
        The text of each selected page followed by a newline, concatenated
        in page order. An empty string if the slice selects no pages.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # NOTE(review): depending on the pdfplumber version, extract_text()
        # may return None for a page with no extractable text (e.g. a
        # scanned image); coerce to "" so concatenation cannot raise.
        return "".join(
            (page.extract_text() or "") + "\n"
            for page in pdf.pages[start_page:end_page]
        )
|
15 |
|
16 |
+
# Load your PDF file (you might want to upload it separately in Spaces)
|
17 |
+
pdf_path = "/content/Accounting.pdf"  # Update this with the actual file path in Spaces

# Initialize the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Process up to the first 300 pages in batches of 10 pages. Batches that lie
# past the end of the document select no pages and contribute nothing.
all_sentences = []
for i in range(0, 300, 10):  # Adjust the range/step as needed
    pdf_text = extract_text_from_pdf(pdf_path, start_page=i, end_page=i+10)
    # Drop blank fragments (empty batches, trailing separators) so they are
    # never embedded and can never be picked as the best-matching answer.
    all_sentences.extend(
        sentence.strip() for sentence in pdf_text.split('. ') if sentence.strip()
    )

# Create embeddings from extracted sentences
pdf_embeddings = model.encode(all_sentences, convert_to_tensor=True)
|
30 |
|
31 |
# Function to respond to user query
|
32 |
def respond_to_query(query):
    """Return the extracted PDF sentence that best matches ``query``.

    The query is encoded with the module-level ``model``, compared against
    ``pdf_embeddings`` by cosine similarity, and the entry of
    ``all_sentences`` at the highest-scoring position is returned.
    """
    encoded_query = model.encode(query, convert_to_tensor=True)
    # cosine_similarity expects 2-D input, hence the single-row reshape.
    scores = cosine_similarity(encoded_query.reshape(1, -1), pdf_embeddings)
    return all_sentences[scores.argmax()]
|
38 |
|
39 |
# Streamlit app
|
|
|
45 |
# Handle a click on the "Ask" button: answer the question, synthesize the
# answer as speech, and show both the audio player and the text.
if submit_button:
    if query:
        response = respond_to_query(query)

        # Text-to-Speech
        tts = gTTS(response)
        tts.save("response.mp3")

        # Serve the audio through Streamlit so it plays in the visitor's
        # browser; shelling out to a command-line player would only play
        # on the machine hosting the app.
        st.audio("response.mp3", format="audio/mp3")

        st.write(response)
    else:
        st.write("Please enter a question.")
|