Khd-B committed
Commit c7a62b9 · verified · 1 parent: 1e83790

Update app.py

Files changed (1)
  1. app.py +41 -60
app.py CHANGED
@@ -1,76 +1,57 @@
+import pdfplumber
+from sentence_transformers import SentenceTransformer
 import streamlit as st
-import PyPDF2
-from transformers import AutoTokenizer, AutoModel
-import torch
-import numpy as np
-import faiss
 from gtts import gTTS
 import os
+from sklearn.metrics.pairwise import cosine_similarity
+# Function to extract text from a PDF
+def extract_text_from_pdf(pdf_path):
+    text = ""
+    with pdfplumber.open(pdf_path) as pdf:
+        for page in pdf.pages:
+            text += page.extract_text() + "\n"
+    return text

-# Initialize the model and tokenizer
-model_name = "sentence-transformers/all-MiniLM-L6-v2"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModel.from_pretrained(model_name)
-
-# Function to get embeddings
-def get_embedding(text):
-    inputs = tokenizer(text, return_tensors='pt')
-    with torch.no_grad():
-        embeddings = model(**inputs).last_hidden_state.mean(dim=1).numpy()
-    return embeddings
-
-# Initialize FAISS index
-embeddings_dimension = 384 # for MiniLM
-index = faiss.IndexFlatL2(embeddings_dimension)
-
-# Title of the app
-st.title("Study Assistant for Grade 9")
-
-# File uploader widget
-uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])
+# Load your PDF file (upload it in Colab)
+pdf_path = "/content/Accounting.pdf" # Change this to your uploaded PDF file path
+pdf_text = extract_text_from_pdf(pdf_path)

-if uploaded_file is not None:
-    # Read the uploaded PDF file
-    pdf_reader = PyPDF2.PdfReader(uploaded_file)
-    text = ""
+# Create embeddings from the PDF text
+model = SentenceTransformer('all-MiniLM-L6-v2')
+pdf_sentences = pdf_text.split('. ') # Split text into sentences for embedding
+pdf_embeddings = model.encode(pdf_sentences, convert_to_tensor=True)

-    # Extract text from each page
-    for page in pdf_reader.pages:
-        text += page.extract_text() if page.extract_text() else ""
+# Function to respond to user query
+def respond_to_query(query):
+    query_embedding = model.encode(query, convert_to_tensor=True)

-    st.subheader("Extracted Text:")
-    st.write(text)
+    # Find the closest sentence based on cosine similarity
+    from sklearn.metrics.pairwise import cosine_similarity
+    similarities = cosine_similarity(query_embedding.reshape(1, -1), pdf_embeddings)
+    best_match_index = similarities.argmax()

-    # Generate embedding for the extracted text
-    embeddings = get_embedding(text)
-    index.add(embeddings) # Add embedding to the FAISS index
+    response = pdf_sentences[best_match_index]
+    return response

-    st.success("Text extracted and embeddings generated!")
+# Streamlit app
+st.title("Study Assistant")

-# Subject selection and query input
-subject = st.selectbox("Select Subject", ["Accounting"])
-query = st.text_input("Type your query")
+query = st.text_input("Type your question:")
+submit_button = st.button("Ask")

-if st.button("Submit"):
+if submit_button:
     if query:
-        # Get embedding for the query
-        query_embedding = get_embedding(query)
-
-        # Search for the nearest neighbors in the FAISS index
-        D, I = index.search(query_embedding, k=5) # Retrieve top 5 matches
-
-        st.subheader("Top Matches:")
-        for idx in I[0]:
-            if idx < len(embeddings): # Ensure index is valid
-                st.write(f"Match Index: {idx}, Distance: {D[0][idx]}") # Display match details
+        response = respond_to_query(query)

-        # Convert response to speech
-        response_text = f"You asked about '{query}' in {subject}. Here are your top matches."
-        tts = gTTS(text=response_text, lang='en')
+        # Text-to-Speech
+        tts = gTTS(response)
         tts.save("response.mp3")

-        # Display audio controls (Streamlit doesn't support direct playback)
-        st.audio("response.mp3")
-
-        st.success("Response generated!")
+        # Playing audio
+        os.system("mpg321 response.mp3") # Ensure mpg321 is installed in the Colab environment

+        st.write(response)
+    else:
+        st.write("Please enter a question.")
+# Run the Streamlit app and expose it
+!streamlit run app.py & npx localtunnel --port 8501
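
In the new extraction loop, `page.extract_text() + "\n"` raises a TypeError whenever pdfplumber returns None for a page (scanned or image-only pages), a case the removed PyPDF2 loop guarded against, and the hard-coded `/content/Accounting.pdf` path drops the old `st.file_uploader` flow even though `pdfplumber.open` also accepts file-like objects. A minimal sketch that keeps the uploader and skips empty pages (an illustration, not part of the commit):

import pdfplumber
import streamlit as st

# Sketch only: keep the original upload flow and skip pages with no extractable text.
uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])
if uploaded_file is not None:
    text = ""
    with pdfplumber.open(uploaded_file) as pdf:  # pdfplumber accepts file-like objects as well as paths
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"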
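
`respond_to_query` encodes with `convert_to_tensor=True` and then hands PyTorch tensors to scikit-learn's `cosine_similarity`, which expects NumPy-style arrays; it also re-imports `cosine_similarity` inside the function even though it is already imported at the top. One way to stay in PyTorch end to end is `sentence_transformers.util.cos_sim`; the sketch below passes the corpus in explicitly, which is an adaptation rather than the committed signature:

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

def respond_to_query(query, pdf_sentences, pdf_embeddings):
    # pdf_embeddings is assumed to come from model.encode(pdf_sentences, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)
    similarities = util.cos_sim(query_embedding, pdf_embeddings)  # tensor of shape (1, len(pdf_sentences))
    best_match_index = int(similarities.argmax())
    return pdf_sentences[best_match_index]

Equivalently, encoding with `convert_to_tensor=False` yields NumPy arrays that the existing scikit-learn call accepts (the query embedding still needs the `.reshape(1, -1)`).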
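
The last added line, `!streamlit run app.py & npx localtunnel --port 8501`, is IPython/Colab cell syntax, not valid Python, so running app.py directly fails with a SyntaxError; it belongs in a separate notebook cell or shell rather than in the file. Likewise, `os.system("mpg321 response.mp3")` plays the audio on the machine running the script, not in the visitor's browser, whereas the removed version's `st.audio` handles in-browser playback. A small sketch of that approach, with a hypothetical helper name:

import streamlit as st
from gtts import gTTS

def speak_in_browser(response_text, path="response.mp3"):
    # Hypothetical helper: synthesize speech with gTTS and let Streamlit render an HTML5 player,
    # so playback happens client-side instead of via mpg321 on the server.
    tts = gTTS(response_text)
    tts.save(path)
    with open(path, "rb") as audio_file:
        st.audio(audio_file.read(), format="audio/mp3")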