Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -5,31 +5,40 @@ from gtts import gTTS
|
|
5 |
import os
|
6 |
from sklearn.metrics.pairwise import cosine_similarity
|
7 |
|
8 |
-
#
|
9 |
@st.cache_resource
|
10 |
-
def load_pdf_and_extract_text(pdf_path):
|
11 |
all_sentences = []
|
12 |
with pdfplumber.open(pdf_path) as pdf:
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
14 |
text = page.extract_text()
|
15 |
if text:
|
16 |
all_sentences.extend(text.split('. '))
|
|
|
17 |
return all_sentences
|
18 |
|
19 |
-
# Load
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
@st.cache_resource
def create_embeddings(sentences):
    """Encode ``sentences`` with the 'all-MiniLM-L6-v2' sentence transformer.

    The encoder is instantiated inside this function, and the function is
    wrapped in ``st.cache_resource``.

    Returns:
        The output of ``encode`` called with ``convert_to_tensor=True``.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = encoder.encode(sentences, convert_to_tensor=True)
    return embeddings
|
24 |
|
25 |
-
# Load your PDF file and create embeddings
|
26 |
-
pdf_path = "Accounting.pdf" # Ensure this is uploaded to your space
|
27 |
-
all_sentences = load_pdf_and_extract_text(pdf_path)
|
28 |
pdf_embeddings = create_embeddings(all_sentences)
|
29 |
|
30 |
# Function to respond to user query
|
31 |
def respond_to_query(query):
|
32 |
-
query_embedding =
|
33 |
similarities = cosine_similarity(query_embedding.reshape(1, -1), pdf_embeddings)
|
34 |
best_match_index = similarities.argmax()
|
35 |
response = all_sentences[best_match_index]
|
@@ -44,11 +53,11 @@ submit_button = st.button("Ask")
|
|
44 |
if submit_button:
|
45 |
if query:
|
46 |
response = respond_to_query(query)
|
47 |
-
|
48 |
# Text-to-Speech
|
49 |
tts = gTTS(response)
|
50 |
tts.save("response.mp3")
|
51 |
-
|
52 |
# (Optional) Playing audio might not work in Spaces, consider alternatives
|
53 |
st.write(response)
|
54 |
else:
|
|
|
5 |
import os
|
6 |
from sklearn.metrics.pairwise import cosine_similarity
|
7 |
|
8 |
+
# Function to extract text from a limited number of pages in a PDF
|
9 |
@st.cache_resource
def load_pdf_and_extract_text(pdf_path, max_pages=20):
    """Extract sentence-like fragments from the leading pages of a PDF.

    Args:
        pdf_path: Path of the PDF file, handed to ``pdfplumber.open``.
        max_pages: Upper bound on how many leading pages are read.

    Returns:
        A list of non-empty, whitespace-stripped text fragments produced by
        splitting each page's extracted text on ``'. '``.
    """
    all_sentences = []
    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        # Only the first ``max_pages`` pages are read, so report and measure
        # progress against the capped count; otherwise a short document never
        # reaches 100% and a long one advertises pages that are never read.
        pages_to_process = max(0, min(total_pages, max_pages))
        st.write(f"Total pages to process: {pages_to_process}")

        # Create the bar once and update it in place. Calling st.progress()
        # inside the loop adds a brand-new bar to the page on every iteration.
        progress_bar = st.progress(0.0) if pages_to_process else None

        for i, page in enumerate(pdf.pages[:pages_to_process]):
            st.write(f"Processing page {i + 1}...")
            text = page.extract_text()
            if text:
                # Skip empty / whitespace-only pieces left behind by the
                # split so they are never embedded or returned as an answer.
                all_sentences.extend(
                    fragment.strip()
                    for fragment in text.split('. ')
                    if fragment.strip()
                )
            # Advance even when a page yields no text.
            progress_bar.progress((i + 1) / pages_to_process)
    return all_sentences
|
24 |
|
25 |
+
# Load your PDF file
pdf_path = "Accounting.pdf"  # Ensure this is uploaded to your space
all_sentences = load_pdf_and_extract_text(pdf_path)


@st.cache_resource
def _load_model():
    """Build the sentence-embedding model through Streamlit's resource cache."""
    return SentenceTransformer('all-MiniLM-L6-v2')


# Initialize the model
# Streamlit re-executes this script on every user interaction; going through
# the cached loader avoids re-instantiating the model on each rerun.
model = _load_model()
|
31 |
+
|
32 |
+
# Create embeddings from extracted sentences
|
33 |
@st.cache_resource
def create_embeddings(sentences):
    """Encode ``sentences`` with the module-level ``model``.

    The function is wrapped in ``st.cache_resource``.

    Returns:
        The output of ``model.encode`` called with ``convert_to_tensor=True``.
    """
    embeddings = model.encode(sentences, convert_to_tensor=True)
    return embeddings
|
36 |
|
|
|
|
|
|
|
37 |
# Embeddings for every extracted sentence, built at module level;
# respond_to_query() compares each query embedding against these.
pdf_embeddings = create_embeddings(all_sentences)
|
38 |
|
39 |
# Function to respond to user query
|
40 |
def respond_to_query(query):
|
41 |
+
query_embedding = model.encode(query, convert_to_tensor=True)
|
42 |
similarities = cosine_similarity(query_embedding.reshape(1, -1), pdf_embeddings)
|
43 |
best_match_index = similarities.argmax()
|
44 |
response = all_sentences[best_match_index]
|
|
|
53 |
if submit_button:
|
54 |
if query:
|
55 |
response = respond_to_query(query)
|
56 |
+
|
57 |
# Text-to-Speech
|
58 |
tts = gTTS(response)
|
59 |
tts.save("response.mp3")
|
60 |
+
|
61 |
# (Optional) Playing audio might not work in Spaces, consider alternatives
|
62 |
st.write(response)
|
63 |
else:
|