# Study Assistant — Streamlit app hosted on Hugging Face Spaces.
# (The "Spaces: Sleeping" lines here were UI status text accidentally
# pasted into the source; they are not Python code.)
import os
import tempfile
import urllib.request

import pdfplumber
import streamlit as st
from gtts import gTTS
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Function to extract text from a limited number of pages in a PDF
def extract_text_from_pdf(pdf_path, start_page=0, end_page=10):
    """Extract plain text from pages [start_page, end_page) of a PDF.

    Args:
        pdf_path: Path or file-like object accepted by ``pdfplumber.open``.
        start_page: First page index to read (0-based, inclusive).
        end_page: Page index to stop at (exclusive). Slicing silently
            clamps to the document length, so out-of-range batches are safe.

    Returns:
        The concatenated page text, with a newline after each page.
        Pages with no extractable text contribute nothing.
    """
    parts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages[start_page:end_page]:
            # extract_text() returns None for pages with no text layer
            # (e.g. scanned images); the original `text += None + "\n"`
            # raised TypeError on such pages.
            page_text = page.extract_text()
            if page_text:
                parts.append(page_text + "\n")
    return "".join(parts)
# Source PDF. pdfplumber cannot open an HTTP(S) URL directly, so a remote
# file is downloaded once to a temporary local file and that path is reused
# for every page batch below (the original passed the URL straight to
# pdfplumber.open, which fails with "No such file or directory").
pdf_path = "https://huggingface.co/spaces/Khd-B/Study_Assistant/raw/main/Accounting.pdf"  # Update this with the actual file path in Spaces

if pdf_path.startswith(("http://", "https://")):
    with urllib.request.urlopen(pdf_path) as resp, tempfile.NamedTemporaryFile(
        suffix=".pdf", delete=False
    ) as tmp:
        tmp.write(resp.read())
    pdf_path = tmp.name

# Initialize the sentence-embedding model (small, fast, 384-dim vectors).
model = SentenceTransformer('all-MiniLM-L6-v2')

# Process the first 300 pages in batches of 10, splitting each batch's text
# into rough sentences on ". " (a heuristic, not a real sentence tokenizer).
all_sentences = []
for i in range(0, 300, 10):  # Adjust the range/step as needed
    pdf_text = extract_text_from_pdf(pdf_path, start_page=i, end_page=i + 10)
    # Skip empty/whitespace-only fragments so they never become embeddings.
    all_sentences.extend(s for s in pdf_text.split('. ') if s.strip())

# Pre-compute embeddings for every extracted sentence.
pdf_embeddings = model.encode(all_sentences, convert_to_tensor=True)
# Answer a user query with the closest-matching sentence from the PDF.
def respond_to_query(query):
    """Return the corpus sentence whose embedding is most similar to *query*.

    Uses the module-level ``model``, ``pdf_embeddings`` and
    ``all_sentences`` built at startup.
    """
    # Embed the query with the same model used for the sentence corpus.
    embedded_query = model.encode(query, convert_to_tensor=True)
    # One query vector vs. every corpus vector -> a (1, n_sentences) matrix.
    scores = cosine_similarity(embedded_query.reshape(1, -1), pdf_embeddings)
    # Highest cosine similarity wins.
    return all_sentences[scores.argmax()]
# --- Streamlit UI ---
st.title("Study Assistant")
query = st.text_input("Type your question:")
submit_button = st.button("Ask")

if submit_button:
    if query:
        response = respond_to_query(query)
        # Text-to-Speech: synthesize the answer to an MP3 file.
        tts = gTTS(response)
        tts.save("response.mp3")
        # st.audio streams the file to the *browser*. The original
        # os.system("mpg321 response.mp3") tried to play audio on the
        # server, which does not work on hosted Spaces.
        st.audio("response.mp3")
        st.write(response)
    else:
        st.write("Please enter a question.")