Spaces:

Khd-B
/

Study_Assistant

Sleeping

App Files Community

Khd-B committed on Oct 28, 2024

Commit

00da70a

verified ·

1 Parent(s): 41e2a16

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -27

app.py CHANGED Viewed

@@ -5,32 +5,31 @@ from gtts import gTTS
 import os
 from sklearn.metrics.pairwise import cosine_similarity
-# Function to extract text from a limited number of pages in a PDF
-def extract_text_from_pdf(pdf_path, start_page=0, end_page=10):
-    text = ""
     with pdfplumber.open(pdf_path) as pdf:
-        for page in pdf.pages[start_page:end_page]:
-            text += page.extract_text() + "\n"
-    return text
-# Load your PDF file (you might want to upload it separately in Spaces)
-pdf_path = "Accounting.pdf"  # Update this with the actual file path in Spaces
-# Initialize the model
-model = SentenceTransformer('all-MiniLM-L6-v2')
-# Example: Process the first 100 pages in batches
-all_sentences = []
-for i in range(0, 300, 10):  # Adjust the step as needed
-    pdf_text = extract_text_from_pdf(pdf_path, start_page=i, end_page=i+10)
-    all_sentences.extend(pdf_text.split('. '))
-# Create embeddings from extracted sentences
-pdf_embeddings = model.encode(all_sentences, convert_to_tensor=True)
 # Function to respond to user query
 def respond_to_query(query):
-    query_embedding = model.encode(query, convert_to_tensor=True)
     similarities = cosine_similarity(query_embedding.reshape(1, -1), pdf_embeddings)
     best_match_index = similarities.argmax()
     response = all_sentences[best_match_index]
@@ -45,14 +44,12 @@ submit_button = st.button("Ask")
 if submit_button:
     if query:
         response = respond_to_query(query)
         # Text-to-Speech
         tts = gTTS(response)
         tts.save("response.mp3")
-        # Playing audio (this might not work in Spaces, consider alternatives)
-        os.system("mpg321 response.mp3")
         st.write(response)
     else:
         st.write("Please enter a question.")

 import os
 from sklearn.metrics.pairwise import cosine_similarity
+# Load the PDF and extract text once
+@st.cache_resource
+def load_pdf_and_extract_text(pdf_path):
+    all_sentences = []
     with pdfplumber.open(pdf_path) as pdf:
+        for page in pdf.pages:
+            text = page.extract_text()
+            if text:
+                all_sentences.extend(text.split('. '))
+    return all_sentences
+# Load embeddings for the extracted text
+@st.cache_resource
+def create_embeddings(sentences):
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    return model.encode(sentences, convert_to_tensor=True)
+# Load your PDF file and create embeddings
+pdf_path = "Accounting.pdf"  # Ensure this is uploaded to your space
+all_sentences = load_pdf_and_extract_text(pdf_path)
+pdf_embeddings = create_embeddings(all_sentences)
 # Function to respond to user query
 def respond_to_query(query):
+    query_embedding = SentenceTransformer('all-MiniLM-L6-v2').encode(query, convert_to_tensor=True)
     similarities = cosine_similarity(query_embedding.reshape(1, -1), pdf_embeddings)
     best_match_index = similarities.argmax()
     response = all_sentences[best_match_index]
 if submit_button:
     if query:
         response = respond_to_query(query)
         # Text-to-Speech
         tts = gTTS(response)
         tts.save("response.mp3")
+        # (Optional) Playing audio might not work in Spaces, consider alternatives
         st.write(response)
     else:
         st.write("Please enter a question.")