Khd-B committed on
Commit
51c0f70
1 Parent(s): 00da70a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -11
app.py CHANGED
@@ -5,31 +5,40 @@ from gtts import gTTS
5
  import os
6
  from sklearn.metrics.pairwise import cosine_similarity
7
 
8
- # Load the PDF and extract text once
9
  @st.cache_resource
10
- def load_pdf_and_extract_text(pdf_path):
11
  all_sentences = []
12
  with pdfplumber.open(pdf_path) as pdf:
13
- for page in pdf.pages:
 
 
 
 
 
14
  text = page.extract_text()
15
  if text:
16
  all_sentences.extend(text.split('. '))
 
17
  return all_sentences
18
 
19
- # Load embeddings for the extracted text
 
 
 
 
 
 
 
20
  @st.cache_resource
21
  def create_embeddings(sentences):
22
- model = SentenceTransformer('all-MiniLM-L6-v2')
23
  return model.encode(sentences, convert_to_tensor=True)
24
 
25
- # Load your PDF file and create embeddings
26
- pdf_path = "Accounting.pdf" # Ensure this is uploaded to your space
27
- all_sentences = load_pdf_and_extract_text(pdf_path)
28
  pdf_embeddings = create_embeddings(all_sentences)
29
 
30
  # Function to respond to user query
31
  def respond_to_query(query):
32
- query_embedding = SentenceTransformer('all-MiniLM-L6-v2').encode(query, convert_to_tensor=True)
33
  similarities = cosine_similarity(query_embedding.reshape(1, -1), pdf_embeddings)
34
  best_match_index = similarities.argmax()
35
  response = all_sentences[best_match_index]
@@ -44,11 +53,11 @@ submit_button = st.button("Ask")
44
  if submit_button:
45
  if query:
46
  response = respond_to_query(query)
47
-
48
  # Text-to-Speech
49
  tts = gTTS(response)
50
  tts.save("response.mp3")
51
-
52
  # (Optional) Playing audio might not work in Spaces, consider alternatives
53
  st.write(response)
54
  else:
 
5
  import os
6
  from sklearn.metrics.pairwise import cosine_similarity
7
 
8
# Function to extract text from a limited number of pages in a PDF
@st.cache_resource
def load_pdf_and_extract_text(pdf_path, max_pages=20):
    """Extract sentence-like fragments from the leading pages of a PDF.

    Args:
        pdf_path: Path of the PDF file; passed straight to
            ``pdfplumber.open``.
        max_pages: Upper bound on how many pages are read, counted from
            the start of the document. ``None`` reads every page.

    Returns:
        A list of strings produced by splitting each page's extracted
        text on ``'. '``. Pages without extractable text contribute
        nothing, and whitespace-only fragments are dropped.
    """
    all_sentences = []
    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        # The number of pages actually read is capped by both the document
        # length and max_pages; it drives the status text and the progress
        # fraction so the bar can reach 100% on short documents.
        if max_pages is None:
            pages_to_process = total_pages
        else:
            pages_to_process = max(0, min(total_pages, max_pages))
        st.write(f"Total pages to process: {pages_to_process}")

        # Create the bar once and update it in place; calling st.progress()
        # inside the loop would add a new bar to the page on every iteration.
        progress_bar = st.progress(0.0)
        for i, page in zip(range(pages_to_process), pdf.pages):
            st.write(f"Processing page {i + 1}...")
            text = page.extract_text()
            if text:
                # Skip blank fragments so they are never embedded or
                # returned as an "answer".
                all_sentences.extend(
                    fragment
                    for fragment in text.split('. ')
                    if fragment.strip()
                )
            progress_bar.progress((i + 1) / pages_to_process)  # Update progress
    return all_sentences
24
 
25
# Load your PDF file
pdf_path = "Accounting.pdf"  # Ensure this is uploaded to your space
all_sentences = load_pdf_and_extract_text(pdf_path)


@st.cache_resource
def _load_model():
    """Build the sentence-embedding model once per server process.

    Streamlit re-executes the script on every interaction; caching the
    model as a resource avoids re-instantiating it on each rerun.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')


# Initialize the model
model = _load_model()
31
+
32
# Create embeddings from extracted sentences
@st.cache_resource
def create_embeddings(sentences):
    """Encode ``sentences`` with the module-level ``model``.

    Args:
        sentences: The texts to embed.

    Returns:
        Whatever ``model.encode`` yields when called with
        ``convert_to_tensor=True``.
    """
    sentence_vectors = model.encode(sentences, convert_to_tensor=True)
    return sentence_vectors


pdf_embeddings = create_embeddings(all_sentences)
38
 
39
  # Function to respond to user query
40
  def respond_to_query(query):
41
+ query_embedding = model.encode(query, convert_to_tensor=True)
42
  similarities = cosine_similarity(query_embedding.reshape(1, -1), pdf_embeddings)
43
  best_match_index = similarities.argmax()
44
  response = all_sentences[best_match_index]
 
53
  if submit_button:
54
  if query:
55
  response = respond_to_query(query)
56
+
57
  # Text-to-Speech
58
  tts = gTTS(response)
59
  tts.save("response.mp3")
60
+
61
  # (Optional) Playing audio might not work in Spaces, consider alternatives
62
  st.write(response)
63
  else: