Khd-B committed on
Commit
7e83395
1 Parent(s): 2cffe20

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -20
app.py CHANGED
@@ -4,33 +4,36 @@ import streamlit as st
4
  from gtts import gTTS
5
  import os
6
  from sklearn.metrics.pairwise import cosine_similarity
7
- # Function to extract text from a PDF
8
- def extract_text_from_pdf(pdf_path):
 
9
  text = ""
10
  with pdfplumber.open(pdf_path) as pdf:
11
- for page in pdf.pages:
12
  text += page.extract_text() + "\n"
13
  return text
14
 
15
- # Load your PDF file (upload it in Colab)
16
- pdf_path = "/content/Accounting.pdf" # Change this to your uploaded PDF file path
17
- pdf_text = extract_text_from_pdf(pdf_path)
18
 
19
- # Create embeddings from the PDF text
20
  model = SentenceTransformer('all-MiniLM-L6-v2')
21
- pdf_sentences = pdf_text.split('. ') # Split text into sentences for embedding
22
- pdf_embeddings = model.encode(pdf_sentences, convert_to_tensor=True)
 
 
 
 
 
 
 
23
 
24
  # Function to respond to user query
25
  def respond_to_query(query):
26
  query_embedding = model.encode(query, convert_to_tensor=True)
27
-
28
- # Find the closest sentence based on cosine similarity
29
- from sklearn.metrics.pairwise import cosine_similarity
30
  similarities = cosine_similarity(query_embedding.reshape(1, -1), pdf_embeddings)
31
  best_match_index = similarities.argmax()
32
-
33
- response = pdf_sentences[best_match_index]
34
  return response
35
 
36
  # Streamlit app
@@ -42,14 +45,14 @@ submit_button = st.button("Ask")
42
  if submit_button:
43
  if query:
44
  response = respond_to_query(query)
45
-
46
  # Text-to-Speech
47
  tts = gTTS(response)
48
  tts.save("response.mp3")
49
-
50
- # Playing audio
51
- os.system("mpg321 response.mp3") # Ensure mpg321 is installed in the Colab environment
52
-
53
  st.write(response)
54
  else:
55
- st.write("Please enter a question.")
 
4
  from gtts import gTTS
5
  import os
6
  from sklearn.metrics.pairwise import cosine_similarity
7
+
8
+ # Function to extract text from a limited number of pages in a PDF
9
+ def extract_text_from_pdf(pdf_path, start_page=0, end_page=10):
10
  text = ""
11
  with pdfplumber.open(pdf_path) as pdf:
12
+ for page in pdf.pages[start_page:end_page]:
13
  text += page.extract_text() + "\n"
14
  return text
15
 
16
+ # Load your PDF file (you might want to upload it separately in Spaces)
17
+ pdf_path = "/content/Accounting.pdf" # Update this with the actual file path in Spaces
 
18
 
19
+ # Initialize the model
20
  model = SentenceTransformer('all-MiniLM-L6-v2')
21
+
22
+ # Example: Process the first 300 pages in batches of 10
23
+ all_sentences = []
24
+ for i in range(0, 300, 10): # Adjust the step as needed
25
+ pdf_text = extract_text_from_pdf(pdf_path, start_page=i, end_page=i+10)
26
+ all_sentences.extend(pdf_text.split('. '))
27
+
28
+ # Create embeddings from extracted sentences
29
+ pdf_embeddings = model.encode(all_sentences, convert_to_tensor=True)
30
 
31
  # Function to respond to user query
32
  def respond_to_query(query):
33
  query_embedding = model.encode(query, convert_to_tensor=True)
 
 
 
34
  similarities = cosine_similarity(query_embedding.reshape(1, -1), pdf_embeddings)
35
  best_match_index = similarities.argmax()
36
+ response = all_sentences[best_match_index]
 
37
  return response
38
 
39
  # Streamlit app
 
45
  if submit_button:
46
  if query:
47
  response = respond_to_query(query)
48
+
49
  # Text-to-Speech
50
  tts = gTTS(response)
51
  tts.save("response.mp3")
52
+
53
+ # Playing audio (this might not work in Spaces, consider alternatives)
54
+ os.system("mpg321 response.mp3")
55
+
56
  st.write(response)
57
  else:
58
+ st.write("Please enter a question.")