"""Streamlit "Study Assistant": answer questions from a PDF by sentence similarity.

The app extracts text from the first pages of a PDF, splits it into
sentence-like fragments, embeds them with a SentenceTransformer model and,
for each user question, returns (and reads aloud) the fragment whose
embedding is most similar to the question's embedding.
"""

import io
import os
import urllib.request

import pdfplumber
import streamlit as st
from gtts import gTTS, gTTSError
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Name of the sentence-embedding model to load.
MODEL_NAME = 'all-MiniLM-L6-v2'

# Upper bound on the number of pages that are indexed. Slicing past the end
# of the document is harmless, so shorter PDFs are simply indexed in full.
MAX_PAGES = 300

# Seconds to wait for the PDF download before giving up.
DOWNLOAD_TIMEOUT = 60


def _open_pdf_source(pdf_path):
    """Return an object that ``pdfplumber.open`` can read for *pdf_path*.

    ``pdfplumber.open`` treats a string as a local file path, so an
    ``http(s)://`` URL is downloaded into an in-memory buffer first. Any
    other value (local path or file-like object) is passed through as-is.
    """
    if isinstance(pdf_path, str) and pdf_path.startswith(("http://", "https://")):
        with urllib.request.urlopen(pdf_path, timeout=DOWNLOAD_TIMEOUT) as resp:
            return io.BytesIO(resp.read())
    return pdf_path


def extract_text_from_pdf(pdf_path, start_page=0, end_page=10):
    """Extract text from pages ``[start_page, end_page)`` of a PDF.

    Args:
        pdf_path: Local path, file-like object, or http(s) URL of the PDF.
        start_page: Zero-based index of the first page to read.
        end_page: Zero-based index one past the last page to read.

    Returns:
        The text of the selected pages, each page followed by a newline.
        Pages without extractable text contribute only the newline.
    """
    with pdfplumber.open(_open_pdf_source(pdf_path)) as pdf:
        # extract_text() returns None for pages with no text layer
        # (e.g. scanned images); treat those as empty instead of crashing.
        return "".join(
            (page.extract_text() or "") + "\n"
            for page in pdf.pages[start_page:end_page]
        )


def _split_sentences(text):
    """Split *text* on ``'. '`` and drop empty/whitespace-only fragments."""
    return [part.strip() for part in text.split('. ') if part.strip()]


@st.cache_resource(show_spinner="Loading embedding model...")
def _load_model():
    """Load the embedding model once per server process.

    Streamlit re-executes the script on every interaction; caching keeps the
    model from being re-loaded each time.
    """
    return SentenceTransformer(MODEL_NAME)


@st.cache_resource(show_spinner="Indexing PDF...")
def _build_index(source, max_pages):
    """Extract, split and embed the first *max_pages* pages of *source*.

    Returns:
        ``(sentences, embeddings)`` where ``embeddings`` is the array
        returned by ``model.encode`` for ``sentences``, or ``None`` when no
        text could be extracted.
    """
    sentences = _split_sentences(
        extract_text_from_pdf(source, start_page=0, end_page=max_pages)
    )
    if not sentences:
        return sentences, None
    # Default (NumPy) output is used on purpose: scikit-learn's
    # cosine_similarity cannot consume torch tensors that live on a GPU.
    return sentences, _load_model().encode(sentences)


# Streamlit app
st.title("Study Assistant")

# Load your PDF file (you might want to upload it separately in Spaces).
# NOTE(review): a "/raw/" URL on the Hugging Face Hub may serve a Git LFS
# pointer instead of the binary for LFS-tracked files; if indexing fails with
# a PDF syntax error, check whether "/resolve/" is the right endpoint.
pdf_path = "https://huggingface.co/spaces/Khd-B/Study_Assistant/raw/main/Accounting.pdf"  # Update this with the actual file path in Spaces

# Initialize the model and the sentence index (both cached across reruns).
model = _load_model()
try:
    all_sentences, pdf_embeddings = _build_index(pdf_path, MAX_PAGES)
except OSError as exc:  # urllib.error.URLError and missing files are OSErrors
    st.error(f"Could not load the PDF: {exc}")
    st.stop()


def respond_to_query(query):
    """Return the indexed sentence most similar to *query*.

    Args:
        query: The user's question as plain text.

    Returns:
        The best-matching sentence, or an empty string when the index
        contains no sentences.
    """
    if not all_sentences:
        return ""
    query_embedding = model.encode(query)
    similarities = cosine_similarity(query_embedding.reshape(1, -1), pdf_embeddings)
    best_match_index = int(similarities.argmax())
    return all_sentences[best_match_index]


query = st.text_input("Type your question:")
submit_button = st.button("Ask")

if submit_button:
    if query and query.strip():
        response = respond_to_query(query)
        if response:
            # Show the answer first so a text-to-speech failure cannot hide it.
            st.write(response)

            # Text-to-Speech: synthesise into memory and let the browser play
            # it. Playing through a server-side command would not reach the
            # user, and a shared file on disk could be overwritten by
            # concurrent sessions.
            try:
                audio_buffer = io.BytesIO()
                gTTS(response).write_to_fp(audio_buffer)
            except gTTSError as exc:
                st.warning(f"Audio is unavailable: {exc}")
            else:
                st.audio(audio_buffer.getvalue(), format="audio/mp3")
        else:
            st.write("No text could be extracted from the PDF.")
    else:
        st.write("Please enter a question.")