Khd-B committed on
Commit
00da70a
1 Parent(s): 41e2a16

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -27
app.py CHANGED
@@ -5,32 +5,31 @@ from gtts import gTTS
5
  import os
6
  from sklearn.metrics.pairwise import cosine_similarity
7
 
8
- # Function to extract text from a limited number of pages in a PDF
9
- def extract_text_from_pdf(pdf_path, start_page=0, end_page=10):
10
- text = ""
 
11
  with pdfplumber.open(pdf_path) as pdf:
12
- for page in pdf.pages[start_page:end_page]:
13
- text += page.extract_text() + "\n"
14
- return text
15
-
16
- # Load your PDF file (you might want to upload it separately in Spaces)
17
- pdf_path = "Accounting.pdf" # Update this with the actual file path in Spaces
18
-
19
- # Initialize the model
20
- model = SentenceTransformer('all-MiniLM-L6-v2')
21
-
22
- # Example: Process the first 300 pages in batches of 10
23
- all_sentences = []
24
- for i in range(0, 300, 10): # Adjust the step as needed
25
- pdf_text = extract_text_from_pdf(pdf_path, start_page=i, end_page=i+10)
26
- all_sentences.extend(pdf_text.split('. '))
27
-
28
- # Create embeddings from extracted sentences
29
- pdf_embeddings = model.encode(all_sentences, convert_to_tensor=True)
30
 
31
  # Function to respond to user query
32
  def respond_to_query(query):
33
- query_embedding = model.encode(query, convert_to_tensor=True)
34
  similarities = cosine_similarity(query_embedding.reshape(1, -1), pdf_embeddings)
35
  best_match_index = similarities.argmax()
36
  response = all_sentences[best_match_index]
@@ -45,14 +44,12 @@ submit_button = st.button("Ask")
45
  if submit_button:
46
  if query:
47
  response = respond_to_query(query)
48
-
49
  # Text-to-Speech
50
  tts = gTTS(response)
51
  tts.save("response.mp3")
52
-
53
- # Playing audio (this might not work in Spaces, consider alternatives)
54
- os.system("mpg321 response.mp3")
55
-
56
  st.write(response)
57
  else:
58
  st.write("Please enter a question.")
 
5
  import os
6
  from sklearn.metrics.pairwise import cosine_similarity
7
 
8
+ # Load the PDF and extract text once
9
+ @st.cache_resource
10
+ def load_pdf_and_extract_text(pdf_path):
11
+ all_sentences = []
12
  with pdfplumber.open(pdf_path) as pdf:
13
+ for page in pdf.pages:
14
+ text = page.extract_text()
15
+ if text:
16
+ all_sentences.extend(text.split('. '))
17
+ return all_sentences
18
+
19
+ # Create embeddings for the extracted text
20
+ @st.cache_resource
21
+ def create_embeddings(sentences):
22
+ model = SentenceTransformer('all-MiniLM-L6-v2')
23
+ return model.encode(sentences, convert_to_tensor=True)
24
+
25
+ # Load your PDF file and create embeddings
26
+ pdf_path = "Accounting.pdf" # Ensure this is uploaded to your space
27
+ all_sentences = load_pdf_and_extract_text(pdf_path)
28
+ pdf_embeddings = create_embeddings(all_sentences)
 
 
29
 
30
  # Function to respond to user query
31
  def respond_to_query(query):
32
+ query_embedding = SentenceTransformer('all-MiniLM-L6-v2').encode(query, convert_to_tensor=True)
33
  similarities = cosine_similarity(query_embedding.reshape(1, -1), pdf_embeddings)
34
  best_match_index = similarities.argmax()
35
  response = all_sentences[best_match_index]
 
44
  if submit_button:
45
  if query:
46
  response = respond_to_query(query)
47
+
48
  # Text-to-Speech
49
  tts = gTTS(response)
50
  tts.save("response.mp3")
51
+
52
+ # (Optional) Playing audio might not work in Spaces, consider alternatives
 
 
53
  st.write(response)
54
  else:
55
  st.write("Please enter a question.")