seanpedrickcase committed
Commit • d3b1ac5
1 Parent(s): 6768a6d
Now works correctly with npz. Minor formatting improvements

Files changed:
- app.py (+39 -77)
- search_funcs/ingest.py (+0 -6)
app.py
CHANGED
@@ -19,6 +19,8 @@ import pandas as pd
 import numpy as np
 import os
 import time
+import math
+from itertools import islice
 from chromadb.config import Settings
 
 from transformers import AutoModel
@@ -124,28 +126,14 @@ def prepare_input_data(in_file, text_column, clean="No", progress=gr.Progress())
 
     #df = pd.read_parquet(file_in.name)
     df_list = list(df[text_column].astype(str).str.lower())
-    #df_list = df
 
-
-
-
-
-
-
-
-    def batch(iterable, batch_size):
-        iterator = iter(iterable)
-        for first in iterator:
-            yield [first] + list(islice(iterator, batch_size - 1))
-
-    #def batch(my_list, batch_size):
-        # Splitting the list into batches
-        # for i in range(0, len(my_list), batch_size):
-        #     batch = my_list[i:i + batch_size]
-
-        # Process each batch
-        # Replace this with your processing logic
-        #print("Processing batch:", batch)
+    # def get_total_batches(my_list, batch_size):
+    #     return math.ceil(len(my_list) / batch_size)
+
+    # def batch(iterable, batch_size):
+    #     iterator = iter(iterable)
+    #     for first in iterator:
+    #         yield [first] + list(islice(iterator, batch_size - 1))
 
     batch_size = 256
 
@@ -157,29 +145,7 @@ def prepare_input_data(in_file, text_column, clean="No", progress=gr.Progress())
         # Save to file if you have cleaned the data
         out_file_name = save_prepared_data(in_file, df_list_clean, df, text_column)
 
-        #corpus = [word_tokenize(doc.lower()) for doc in df_list_clean]
-        #corpus = [word_tokenize(doc.lower()) for doc in progress.tqdm(df_list_clean, desc = "Tokenising text", unit = "rows")]
-
-        #total_batches = get_total_batches(df_list_clean, batch_size)
-        #data_batched = batch(df_list_clean, batch_size)
 
-        #print(data_batched)
-
-        #print(df_list_clean[0])
-
-        # Using encode_batch
-        #encodings = tokenizer.encode_batch(texts)
-
-        # Extracting tokens
-        #tokens_list = [encoding.tokens for encoding in encodings]
-
-        #corpus = [tokenizer(doc.lower()) for doc in progress.tqdm(df_list_clean, desc = "Tokenising text", unit = "rows")]
-        #corpus = [tokenizer.encode(doc_batch) for doc_batch in progress.tqdm(data_batched, desc = "Tokenising text", unit = "batches out of " + str(total_batches))] # for jina
-        # print(df_list_clean)
-        # corpus = tokenizer.batch_encode_plus(df_list_clean).tokens
-
-        #corpus = [[token.text for token in nlp(text)] for text in df_list_clean]
-
         # Tokenize texts in batches
         if not tokenised_df.empty:
             corpus = tokenised_df.iloc[:,0].tolist()
@@ -189,24 +155,11 @@ def prepare_input_data(in_file, text_column, clean="No", progress=gr.Progress())
             corpus = []
             for doc in tokenizer.pipe(progress.tqdm(df_list_clean, desc = "Tokenising text", unit = "rows"), batch_size=batch_size):
                 corpus.append([token.text for token in doc])
-
-                # corpus.append([token.text for token in doc])
-
-
+
     else:
-
-        #data_batched = batch(df_list, batch_size)
-
-        #print(data_batched)
-
-        #corpus = [word_tokenize(doc.lower()) for doc in df_list]
-        #corpus = [word_tokenize(doc.lower()) for doc in progress.tqdm(df_list, desc = "Tokenising text", unit = "rows")]
-        #corpus = [tokenizer.encode(doc_batch) for doc_batch in progress.tqdm(data_batched, desc = "Tokenising text", unit = "batches out of " + str(total_batches))] # for jina
-        #corpus = tokenizer.batch_encode_plus(df_list).tokens # for jina
-
+
         print(df_list[0])
-
-
+
         # Tokenize texts in batches
         if not tokenised_df.empty:
             corpus = tokenised_df.iloc[:,0].tolist()
@@ -216,10 +169,8 @@ def prepare_input_data(in_file, text_column, clean="No", progress=gr.Progress())
 
             corpus = []
             for doc in tokenizer.pipe(progress.tqdm(df_list, desc = "Tokenising text", unit = "rows"), batch_size=batch_size):
-            #for doc in nlp.pipe(progress.tqdm(df_list, desc = "Tokenising text", unit = "batches out of " + str(total_batches)), #batch_size=batch_size): # You can adjust batch_size based on your requirement
                corpus.append([token.text for token in doc])
 
-            #corpus = tokenizer(df_list)
         out_file_name = None
 
         print(corpus[0])
@@ -235,9 +186,10 @@ def prepare_input_data(in_file, text_column, clean="No", progress=gr.Progress())
     else:
         message = "Data loaded. Warning: dataset may be too short to get consistent search results."
 
-
+    tokenised_data_file_name = "keyword_search_tokenised_data.parquet"
+    pd.DataFrame(data={"Corpus":corpus}).to_parquet(tokenised_data_file_name)
 
-    return corpus, message, df, out_file_name
+    return corpus, message, df, out_file_name, tokenised_data_file_name
 
 def get_file_path_end(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
@@ -551,12 +503,14 @@ def docs_to_np_array(docs_out, in_file, embeddings = embeddings, progress=gr.Pro
     ## Load in pre-embedded file if exists
     file_list = [string.name for string in in_file]
 
-    print(file_list)
+    #print(file_list)
 
    embeddings_file_names = [string for string in file_list if "embedding" in string]
 
+    out_message = "Document processing complete. Ready to search."
+
    if embeddings_file_names:
-        embeddings_out = np.load(embeddings_file_names[0])
+        embeddings_out = np.load(embeddings_file_names[0])['arr_0']
        print("embeddings loaded: ", embeddings_out)
 
    if not embeddings_file_names:
@@ -568,16 +522,24 @@ def docs_to_np_array(docs_out, in_file, embeddings = embeddings, progress=gr.Pro
        embeddings_out = embeddings.encode(sentences=page_contents, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina embeddings
        #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
        #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
+
+        print(embeddings_out)
+        embeddings_out_round = np.round(embeddings_out, 4)
 
        toc = time.perf_counter()
        time_out = f"The embedding took {toc - tic:0.1f} seconds"
 
-
+        semantic_search_file_name = 'semantic_search_embeddings.npz'
+        semantic_search_rounded_file_name = 'semantic_search_embeddings_rounded.npz'
 
-
+        np.savez_compressed(semantic_search_file_name, embeddings_out)
+        np.savez_compressed(semantic_search_rounded_file_name, embeddings_out_round)
+
+        return out_message, embeddings_out, semantic_search_file_name, semantic_search_rounded_file_name
+
    print(out_message)
 
-    return out_message, embeddings_out
+    return out_message, embeddings_out, None, None
 
 def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column):
 
@@ -787,7 +749,7 @@ depends on factors such as the type of documents or queries. Information taken f
            current_source = gr.Textbox(label="Current data source(s)", value="None")
 
            with gr.Accordion(label = "Load in data", open=True):
-                in_bm25_file = gr.File(label="Upload
+                in_bm25_file = gr.File(label="Upload data for keyword search", file_count= 'multiple', file_types = ['.parquet', '.csv'])
                with gr.Row():
                    in_bm25_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
                    load_bm25_data_button = gr.Button(value="Load data")
@@ -815,9 +777,9 @@ depends on factors such as the type of documents or queries. Information taken f
 
            with gr.Row():
                in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
-                load_semantic_data_button = gr.Button(value="Load
+                load_semantic_data_button = gr.Button(value="Load data", variant="secondary")
 
-
+            semantic_load_progress = gr.Textbox(label="Load progress")
 
            semantic_query = gr.Textbox(label="Enter semantic search query here")
            semantic_submit = gr.Button(value="Start semantic search", variant="secondary", scale = 1)
@@ -865,25 +827,25 @@ depends on factors such as the type of documents or queries. Information taken f
    in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file, in_join_column], outputs=[in_join_column])
 
    # Load in BM25 data
-    load_bm25_data_button.click(fn=prepare_input_data, inputs=[in_bm25_file, in_bm25_column, in_clean_data], outputs=[corpus_state, load_finished_message, data_state, output_file]).\
+    load_bm25_data_button.click(fn=prepare_input_data, inputs=[in_bm25_file, in_bm25_column, in_clean_data], outputs=[corpus_state, load_finished_message, data_state, output_file, output_file]).\
        then(fn=prepare_bm25, inputs=[corpus_state, in_k1, in_b, in_alpha], outputs=[load_finished_message]).\
        then(fn=put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column])
 
    # BM25 search functions on click or enter
-    keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file, mod_query], api_name="
+    keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file, mod_query], api_name="keyword")
    keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file, mod_query])
 
    ### SEMANTIC SEARCH ###
    # Load in a csv/excel file for semantic search
    in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, in_clean_data, search_df_join_column])
-    load_semantic_data_button.click(ing.parse_csv_or_excel, inputs=[in_semantic_file, in_semantic_column], outputs=[ingest_text, current_source_semantic,
-        then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_column], outputs=[ingest_docs,
-        then(docs_to_np_array, inputs=[ingest_docs, in_semantic_file], outputs=[
+    load_semantic_data_button.click(ing.parse_csv_or_excel, inputs=[in_semantic_file, in_semantic_column], outputs=[ingest_text, current_source_semantic, semantic_load_progress]).\
+        then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_column], outputs=[ingest_docs, semantic_load_progress]).\
+        then(docs_to_np_array, inputs=[ingest_docs, in_semantic_file], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file, semantic_output_file])
 
    # Semantic search query
    semantic_submit.click(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, vec_score_cut_off, vec_weight, in_join_file, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
 
-    semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, vec_score_cut_off, vec_weight, in_join_file, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file]
+    semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, vec_score_cut_off, vec_weight, in_join_file, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
 
    # Dummy functions just to get dropdowns to work correctly with Gradio 3.50
    in_bm25_column.change(dummy_function, in_bm25_column, None)
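A note on the helpers that the first prepare_input_data hunk leaves commented out: they pair math.ceil with an islice-based generator that yields fixed-size chunks from any iterable (which is why import math and from itertools import islice are still added above). A minimal runnable sketch of that pattern, mirroring the commented code rather than anything the app now executes:

import math
from itertools import islice

def get_total_batches(my_list, batch_size):
    # Number of batches needed to cover the list, rounding up
    return math.ceil(len(my_list) / batch_size)

def batch(iterable, batch_size):
    # Lazily yield lists of up to batch_size items from any iterable
    iterator = iter(iterable)
    for first in iterator:
        yield [first] + list(islice(iterator, batch_size - 1))

print(get_total_batches(list(range(10)), 4))  # 3
print(list(batch(range(10), 4)))              # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]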
search_funcs/ingest.py
CHANGED
@@ -36,9 +36,6 @@ class Document(BaseModel):
    """
    type: Literal["Document"] = "Document"
 
-
-# -
-
 split_strat = ["\n\n", "\n", ". ", "! ", "? "]
 chunk_size = 500
 chunk_overlap = 0
@@ -221,7 +218,6 @@ def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document
 
    return doc_sections#, page_docs
 
-
 def write_out_metadata_as_string(metadata_in):
    # If metadata_in is a single dictionary, wrap it in a list
    if isinstance(metadata_in, dict):
@@ -301,8 +297,6 @@ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Docu
 
    return doc_sections, message
 
-
-
 def clean_line_breaks(text):
    # Replace \n and \r\n with a space
    return text.replace('\n', ' ').replace('\r', ' ').replace('\r\n', ' ')
|