eagle0504 committed on
Commit
1d33079
·
verified ·
1 Parent(s): db0606a

Update helper/utils.py

Browse files
Files changed (1) hide show
  1. helper/utils.py +6 -5
helper/utils.py CHANGED
@@ -58,7 +58,7 @@ import PyPDF2
58
 
59
 
60
  def read_and_textify(
61
- files: List[str], chunk_size: int = 50 # Default chunk size set to 50
62
  ) -> Tuple[List[str], List[str]]:
63
  """
64
  Reads PDF files and extracts text from each page, breaking the text into specified segments.
@@ -89,9 +89,9 @@ def read_and_textify(
89
  text = pageObj.extract_text() # Extract text from the page
90
  if text:
91
  # Split text into chunks of approximately 'chunk_size' words
92
- words = text.split()
93
  for j in range(0, len(words), chunk_size):
94
- chunk = " ".join(words[j : j + chunk_size])
95
  text_list.append(chunk)
96
  # Create a source identifier for each chunk and add it to the list
97
  sources_list.append(f"{file.name}_page_{i}_chunk_{j // chunk_size}")
@@ -237,7 +237,7 @@ def query_search(
237
  scores = [
238
  [
239
  sentences[i], # The sentence itself
240
- query_database[i], # Embedding of the sentence
241
  sources[i], # Source of the sentence
242
  quantized_influence(
243
  prompt_embed_[0], query_database[i], k=levels, use_dagger=False
@@ -250,7 +250,8 @@ def query_search(
250
  refs = pd.DataFrame(scores)
251
  # Rename columns for clarity
252
  refs = refs.rename(
253
- columns={0: "sentences", 1: "query_embeddings", 2: "page no", 3: "qim"}
 
254
  )
255
  # Sort the DataFrame based on the 'qim' score in descending order
256
  refs = refs.sort_values(by="qim", ascending=False)
 
58
 
59
 
60
  def read_and_textify(
61
+ files: List[str], chunk_size: int = 2 # Default chunk size set to 2 sentences
62
  ) -> Tuple[List[str], List[str]]:
63
  """
64
  Reads PDF files and extracts text from each page, breaking the text into specified segments.
 
89
  text = pageObj.extract_text() # Extract text from the page
90
  if text:
91
  # Split text into chunks of 'chunk_size' sentences (split on '. ')
92
+ words = text.split('. ')
93
  for j in range(0, len(words), chunk_size):
94
+ chunk = ". ".join(words[j : j + chunk_size]) + '.'
95
  text_list.append(chunk)
96
  # Create a source identifier for each chunk and add it to the list
97
  sources_list.append(f"{file.name}_page_{i}_chunk_{j // chunk_size}")
 
237
  scores = [
238
  [
239
  sentences[i], # The sentence itself
240
+ # query_database[i], # Embedding of the sentence
241
  sources[i], # Source of the sentence
242
  quantized_influence(
243
  prompt_embed_[0], query_database[i], k=levels, use_dagger=False
 
250
  refs = pd.DataFrame(scores)
251
  # Rename columns for clarity
252
  refs = refs.rename(
253
+ # columns={0: "sentences", 1: "query_embeddings", 2: "page no", 3: "qim"}
254
+ columns={0: "sentences", 1: "page no", 2: "qim"}
255
  )
256
  # Sort the DataFrame based on the 'qim' score in descending order
257
  refs = refs.sort_values(by="qim", ascending=False)