File size: 1,925 Bytes
c7a62b9
 
2a5fb30
 
 
c7a62b9
7e83395
 
 
c7a62b9
 
7e83395
c7a62b9
 
2a5fb30
7e83395
c08fa5d
2a5fb30
7e83395
c7a62b9
7e83395
 
 
 
 
 
 
 
 
2a5fb30
c7a62b9
 
 
 
 
7e83395
c7a62b9
2a5fb30
c7a62b9
 
2a5fb30
c7a62b9
 
2a5fb30
c7a62b9
2a5fb30
c7a62b9
7e83395
c7a62b9
 
2a5fb30
7e83395
 
 
 
c7a62b9
 
7e83395
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import pdfplumber
from sentence_transformers import SentenceTransformer
import streamlit as st
from gtts import gTTS
import os
from sklearn.metrics.pairwise import cosine_similarity

# Function to extract text from a limited number of pages in a PDF
def extract_text_from_pdf(pdf_path, start_page=0, end_page=10):
    """Return the concatenated text of a slice of pages from a PDF.

    Args:
        pdf_path: Location of the PDF, passed straight to ``pdfplumber.open``.
        start_page: Zero-based index of the first page to read.
        end_page: Zero-based index one past the last page to read. The pages
            are selected with ``pdf.pages[start_page:end_page]``.

    Returns:
        A single string holding the text of every selected page, each page
        followed by a newline. A page that yields no text contributes only
        its newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # ``extract_text()`` can yield None for a page without a text layer
        # (e.g. a scanned image); fall back to "" so the page is skipped
        # instead of raising TypeError on ``None + "\n"``.
        return "".join(
            f"{page.extract_text() or ''}\n"
            for page in pdf.pages[start_page:end_page]
        )

# Load your PDF file (you might want to upload it separately in Spaces)
pdf_path = "main/Accounting.pdf"  # Update this with the actual file path in Spaces

# Initialize the model
# NOTE(review): Streamlit re-runs this script on each interaction, so the
# model load and the embedding pass below are presumably repeated every time.
# Consider wrapping them in Streamlit's caching once the target Streamlit
# version is confirmed.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Upper bound on the number of pages to index (pages 0 .. MAX_PAGES - 1).
MAX_PAGES = 300

# Read the whole page range in one call: the PDF is opened once rather than
# once per 10-page batch, and sentences that straddle a batch boundary are no
# longer cut in two.
pdf_text = extract_text_from_pdf(pdf_path, start_page=0, end_page=MAX_PAGES)

# Split on ". " as a cheap sentence boundary, collapse the line breaks left
# over from the PDF layout, and drop empty fragments so that a blank string
# can never be embedded or returned as the best match.
all_sentences = [
    sentence
    for sentence in (" ".join(chunk.split()) for chunk in pdf_text.split('. '))
    if sentence
]

# Create embeddings from extracted sentences
pdf_embeddings = model.encode(all_sentences, convert_to_tensor=True)

# Function to respond to user query
def respond_to_query(query):
    """Return the indexed sentence most similar to ``query``.

    The query is embedded with the module-level ``model`` and compared by
    cosine similarity against ``pdf_embeddings``; the sentence in
    ``all_sentences`` at the highest-scoring position is returned.

    Args:
        query: The user's question as text.

    Returns:
        The best-matching entry of ``all_sentences``.
    """
    query_embedding = model.encode(query, convert_to_tensor=True)

    # scikit-learn converts its inputs to NumPy arrays, which fails for
    # tensors living on a GPU. ``.cpu()`` is a no-op for tensors already on
    # the CPU, so this is safe on either device.
    query_vector = query_embedding.cpu().reshape(1, -1)
    corpus_vectors = pdf_embeddings.cpu()

    similarities = cosine_similarity(query_vector, corpus_vectors)
    # One query row means the flat argmax is the index of the best sentence.
    best_match_index = int(similarities.argmax())
    return all_sentences[best_match_index]

# Streamlit app: a single text box plus an "Ask" button. On submit, the best
# matching sentence is shown and also offered as synthesized speech.
st.title("Study Assistant")

query = st.text_input("Type your question:")
submit_button = st.button("Ask")

if submit_button:
    # Treat a whitespace-only box the same as an empty one.
    if query.strip():
        response = respond_to_query(query)

        # Show the text first so the answer is visible even if speech
        # synthesis below is slow or fails.
        st.write(response)

        # Text-to-Speech. gTTS refuses empty text, so only synthesize when
        # there is something to say.
        if response.strip():
            tts = gTTS(response)
            tts.save("response.mp3")

            # Hand the file to the browser's audio player. Shelling out to a
            # command-line player would play the sound on the machine running
            # the app, not for the person using it.
            st.audio("response.mp3", format="audio/mpeg")
    else:
        st.write("Please enter a question.")