Update helper/utils.py
helper/utils.py (+6 -5)
@@ -58,7 +58,7 @@ import PyPDF2
 
 
 def read_and_textify(
-    files: List[str], chunk_size: int =
+    files: List[str], chunk_size: int = 2  # Default chunk size set to 50
 ) -> Tuple[List[str], List[str]]:
     """
     Reads PDF files and extracts text from each page, breaking the text into specified segments.
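The new default makes chunk_size a count of sentences (see the next hunk) rather than words; note that the inline comment still says 50 while the value is now 2. A minimal usage sketch, assuming file objects that expose .name as the loop body requires (the file name and call site below are illustrative, not part of the commit):

    # Hypothetical call site; read_and_textify iterates file objects with .name
    with open("docs.pdf", "rb") as f:
        texts, sources = read_and_textify([f], chunk_size=2)
    # texts   -> list of ~2-sentence chunks per page
    # sources -> identifiers like "docs.pdf_page_0_chunk_0"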
@@ -89,9 +89,9 @@ def read_and_textify(
             text = pageObj.extract_text()  # Extract text from the page
             if text:
                 # Split text into chunks of approximately 'chunk_size' words
-                words = text.split()
+                words = text.split('. ')
                 for j in range(0, len(words), chunk_size):
-                    chunk = " ".join(words[j : j + chunk_size])
+                    chunk = ". ".join(words[j : j + chunk_size]) + '.'
                     text_list.append(chunk)
                     # Create a source identifier for each chunk and add it to the list
                     sources_list.append(f"{file.name}_page_{i}_chunk_{j // chunk_size}")
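With this hunk, splitting moves from whitespace-delimited words to a naive '. ' sentence split, so chunk_size now counts sentences per chunk. A self-contained sketch of the same logic (the helper name is made up; note the naive split misses '?', '!', and abbreviations, and re-appending '.' doubles the final period):

    from typing import List

    def chunk_sentences(text: str, chunk_size: int = 2) -> List[str]:
        # Mirrors the committed logic: naive sentence split, then regroup.
        sentences = text.split('. ')
        return ['. '.join(sentences[j:j + chunk_size]) + '.'
                for j in range(0, len(sentences), chunk_size)]

    print(chunk_sentences("One. Two. Three. Four. Five."))
    # ['One. Two.', 'Three. Four.', 'Five..']  <- 'Five..' shows the doubled period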
@@ -237,7 +237,7 @@ def query_search(
     scores = [
         [
             sentences[i],  # The sentence itself
-            query_database[i],  # Embedding of the sentence
+            # query_database[i],  # Embedding of the sentence
             sources[i],  # Source of the sentence
             quantized_influence(
                 prompt_embed_[0], query_database[i], k=levels, use_dagger=False
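Commenting out the embedding entry shrinks each score row from four elements to three: [sentence, source, qim]. A standalone sketch of the new row shape, with quantized_influence stubbed by cosine similarity purely for illustration (the real scoring function is defined elsewhere in helper/utils.py):

    import numpy as np

    def quantized_influence_stub(a: np.ndarray, b: np.ndarray) -> float:
        # Illustrative stand-in for quantized_influence(), not the real metric.
        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

    sentences = ["Alpha beta.", "Gamma delta."]
    sources = ["doc.pdf_page_0_chunk_0", "doc.pdf_page_0_chunk_1"]
    prompt_vec = np.random.rand(8)
    query_database = np.random.rand(2, 8)

    scores = [
        [sentences[i], sources[i], quantized_influence_stub(prompt_vec, query_database[i])]
        for i in range(len(sentences))
    ]
    # Each row is now [sentence, source, score]; the raw embedding column is gone.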
@@ -250,7 +250,8 @@ def query_search(
     refs = pd.DataFrame(scores)
     # Rename columns for clarity
     refs = refs.rename(
-        columns={0: "sentences", 1: "query_embeddings", 2: "page no", 3: "qim"}
+        # columns={0: "sentences", 1: "query_embeddings", 2: "page no", 3: "qim"}
+        columns={0: "sentences", 1: "page no", 2: "qim"}
     )
     # Sort the DataFrame based on the 'qim' score in descending order
     refs = refs.sort_values(by="qim", ascending=False)
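The rename has to track the slimmer rows: with three columns, the old mapping would label column 2 "page no" and never create a "qim" column (pandas silently ignores rename keys that don't exist), so the later sort_values(by="qim") would raise a KeyError. A minimal check of the corrected mapping, with illustrative row values:

    import pandas as pd

    rows = [["some sentence", "file.pdf_page_0_chunk_0", 0.42],
            ["another one", "file.pdf_page_0_chunk_1", 0.17]]
    refs = pd.DataFrame(rows)
    refs = refs.rename(columns={0: "sentences", 1: "page no", 2: "qim"})
    refs = refs.sort_values(by="qim", ascending=False)
    print(refs.columns.tolist())  # ['sentences', 'page no', 'qim']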