Spaces:

Khd-B
/

Study_Assistant

Sleeping

App Files Files Community

Khd-B committed 5 days ago

Commit

7e83395

•

1 Parent(s): 2cffe20

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -20

app.py CHANGED Viewed

@@ -4,33 +4,36 @@ import streamlit as st
 from gtts import gTTS
 import os
 from sklearn.metrics.pairwise import cosine_similarity
-# Function to extract text from a PDF
-def extract_text_from_pdf(pdf_path):
     text = ""
     with pdfplumber.open(pdf_path) as pdf:
-        for page in pdf.pages:
             text += page.extract_text() + "\n"
     return text
-# Load your PDF file (upload it in Colab)
-pdf_path = "/content/Accounting.pdf"  # Change this to your uploaded PDF file path
-pdf_text = extract_text_from_pdf(pdf_path)
-# Create embeddings from the PDF text
 model = SentenceTransformer('all-MiniLM-L6-v2')
-pdf_sentences = pdf_text.split('. ')  # Split text into sentences for embedding
-pdf_embeddings = model.encode(pdf_sentences, convert_to_tensor=True)
 # Function to respond to user query
 def respond_to_query(query):
     query_embedding = model.encode(query, convert_to_tensor=True)
-    # Find the closest sentence based on cosine similarity
-    from sklearn.metrics.pairwise import cosine_similarity
     similarities = cosine_similarity(query_embedding.reshape(1, -1), pdf_embeddings)
     best_match_index = similarities.argmax()
-    response = pdf_sentences[best_match_index]
     return response
 # Streamlit app
@@ -42,14 +45,14 @@ submit_button = st.button("Ask")
 if submit_button:
     if query:
         response = respond_to_query(query)
         # Text-to-Speech
         tts = gTTS(response)
         tts.save("response.mp3")
-        # Playing audio
-        os.system("mpg321 response.mp3")  # Ensure mpg321 is installed in the Colab environment
         st.write(response)
     else:
-        st.write("Please enter a question.")

 from gtts import gTTS
 import os
 from sklearn.metrics.pairwise import cosine_similarity
+# Function to extract text from a limited number of pages in a PDF
+def extract_text_from_pdf(pdf_path, start_page=0, end_page=10):
     text = ""
     with pdfplumber.open(pdf_path) as pdf:
+        for page in pdf.pages[start_page:end_page]:
             text += page.extract_text() + "\n"
     return text
+# Load your PDF file (you might want to upload it separately in Spaces)
+pdf_path = "/content/Accounting.pdf"  # Update this with the actual file path in Spaces
+# Initialize the model
 model = SentenceTransformer('all-MiniLM-L6-v2')
+# Example: Process the first 300 pages in batches of 10
+all_sentences = []
+for i in range(0, 300, 10):  # Adjust the step as needed
+    pdf_text = extract_text_from_pdf(pdf_path, start_page=i, end_page=i+10)
+    all_sentences.extend(pdf_text.split('. '))
+# Create embeddings from extracted sentences
+pdf_embeddings = model.encode(all_sentences, convert_to_tensor=True)
 # Function to respond to user query
 def respond_to_query(query):
     query_embedding = model.encode(query, convert_to_tensor=True)
     similarities = cosine_similarity(query_embedding.reshape(1, -1), pdf_embeddings)
     best_match_index = similarities.argmax()
+    response = all_sentences[best_match_index]
     return response
 # Streamlit app
 if submit_button:
     if query:
         response = respond_to_query(query)
         # Text-to-Speech
         tts = gTTS(response)
         tts.save("response.mp3")
+        # Playing audio (mpg321 plays on the server host, not in the visitor's browser, so it will not be heard in Spaces; consider st.audio("response.mp3") instead)
+        os.system("mpg321 response.mp3")
         st.write(response)
     else:
+        st.write("Please enter a question.")