Study_Assistant / app.py
Khd-B's picture
Update app.py
c3ca56d verified
raw
history blame
1.98 kB
import pdfplumber
from sentence_transformers import SentenceTransformer
import streamlit as st
from gtts import gTTS
import os
from sklearn.metrics.pairwise import cosine_similarity
# Function to extract text from a limited number of pages in a PDF
def extract_text_from_pdf(pdf_path, start_page=0, end_page=10):
    """Extract the text of a slice of pages from a PDF.

    Args:
        pdf_path: Value handed unchanged to ``pdfplumber.open``.
        start_page: Zero-based index of the first page to read.
        end_page: Zero-based index one past the last page to read.

    Returns:
        The text of every selected page, each followed by a newline.
        A page that yields no text contributes just the newline. The
        result is an empty string when the slice selects no pages.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may hand back None instead of a string for a page
        # without extractable text (e.g. a scanned image); coalesce to ""
        # so one such page does not abort the whole extraction.
        page_texts = [
            page.extract_text() or ""
            for page in pdf.pages[start_page:end_page]
        ]
    # Join once instead of growing a string with += inside the loop.
    return "".join(page_text + "\n" for page_text in page_texts)
# Location of the PDF to index (you might want to upload it separately in Spaces).
# NOTE(review): this value is passed unchanged to pdfplumber.open(), which
# presumably expects a local path or an open file object rather than a URL --
# confirm, and point this at the file inside the Space if so.
pdf_path = "https://huggingface.co/spaces/Khd-B/Study_Assistant/raw/main/Accounting.pdf"  # Update this with the actual file path in Spaces

# Initialize the sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Collect sentences from pages 0-299, ten pages per call
all_sentences = []
for i in range(0, 300, 10):  # Adjust the step as needed
    pdf_text = extract_text_from_pdf(pdf_path, start_page=i, end_page=i+10)
    # ''.split('. ') yields [''], so batches past the end of the document
    # would otherwise add blank "sentences" that can be picked as the best
    # match and then cannot be spoken or displayed meaningfully.
    all_sentences.extend(
        fragment.strip()
        for fragment in pdf_text.split('. ')
        if fragment.strip()
    )

# Fail early with a clear message instead of indexing an empty corpus.
if not all_sentences:
    raise ValueError("No text could be extracted from the PDF.")

# Create embeddings from extracted sentences
pdf_embeddings = model.encode(all_sentences, convert_to_tensor=True)
# Function to respond to user query
def respond_to_query(query):
    """Return the indexed sentence that best matches *query*.

    The query is encoded with the module-level ``model`` and compared by
    cosine similarity against ``pdf_embeddings``; the entry of
    ``all_sentences`` at the highest-scoring position is returned.
    """
    encoded_query = model.encode(query, convert_to_tensor=True)
    # cosine_similarity expects 2-D inputs, hence the single-row reshape.
    scores = cosine_similarity(encoded_query.reshape(1, -1), pdf_embeddings)
    return all_sentences[scores.argmax()]
# Streamlit app
st.title("Study Assistant")
query = st.text_input("Type your question:")
submit_button = st.button("Ask")

if submit_button:
    # Treat a whitespace-only question the same as an empty one.
    if query.strip():
        response = respond_to_query(query)
        # Show the answer first so it is visible without waiting for audio.
        st.write(response)
        # Text-to-Speech (skipped when there is nothing to speak)
        if response.strip():
            tts = gTTS(response)
            tts.save("response.mp3")
            # Serve the clip through Streamlit's audio player so it plays in
            # the user's browser; launching a command-line player would run
            # on the host machine and block this script until it exits.
            st.audio("response.mp3", format="audio/mp3")
    else:
        st.write("Please enter a question.")