import os
import time
from datetime import datetime
from typing import Type

import gradio as gr
import numpy as np
import pandas as pd
from torch import cuda, backends, tensor, mm
from sentence_transformers import SentenceTransformer

from search_funcs.helper_functions import get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder

today_rev = datetime.now().strftime("%Y%m%d")

print("Is CUDA enabled? ", cuda.is_available())
print("Is cuDNN enabled? ", backends.cudnn.enabled)
if cuda.is_available():
    torch_device = "cuda"
    os.system("nvidia-smi")
else:
    torch_device = "cpu"

print("Device used is: ", torch_device)

PandasDataFrame = Type[pd.DataFrame]

embeddings_name = "BAAI/bge-small-en-v1.5" |
|
|
|
|
|
local_embeddings_locations = [ |
|
"model/bge/", |
|
"/model/bge/", |
|
"/home/user/app/model/bge/" |
|
] |
|
|
|
|
|
for location in local_embeddings_locations: |
|
try: |
|
embeddings_model = SentenceTransformer(location) |
|
print(f"Found local model installation at: {location}") |
|
break |
|
except Exception as e: |
|
print(f"Failed to load model from {location}: {e}") |
|
continue |
|
else: |
|
|
|
embeddings_model = SentenceTransformer(embeddings_name) |
|
print("Could not find local model installation. Downloading from Huggingface") |
|
|
|
def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, output_file_state, clean, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings_model = embeddings_model, progress=gr.Progress(track_tqdm=True)):
    '''
    Takes a list of Langchain documents, embeds them with the BGE model, and returns the embeddings as a Numpy array, optionally saving them to disk.
    '''
    ensure_output_folder_exists(output_folder)

    if not in_file:
        out_message = "No input file found. Please load in at least one file."
        print(out_message)
        return out_message, None, None, output_file_state

    progress(0.6, desc = "Loading/creating embeddings")

    print(f"> Total split documents: {len(docs_out)}")

    page_contents = [doc.page_content for doc in docs_out]

    file_list = [string.name for string in in_file]

    embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
    data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]
    data_file_name = data_file_names[0]
    data_file_name_no_ext = get_file_path_end(data_file_name)

    out_message = "Document processing complete. Ready to search."

    if embeddings_state.size == 0:
        tic = time.perf_counter()
        print("Starting to embed documents.")

        embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = 32, normalize_embeddings=True)

        toc = time.perf_counter()
        time_out = f"The embedding took {toc - tic:0.1f} seconds"
        print(time_out)

        if return_intermediate_files == "Yes":
            if clean == "Yes":
                data_file_name_no_ext = data_file_name_no_ext + "_cleaned"

            progress(0.9, desc = "Saving embeddings to file")
            if embeddings_super_compress == "No":
                semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embeddings.npz'
                np.savez_compressed(semantic_search_file_name, embeddings_out)
            else:
                # Lossy compression: round to 3 decimal places and scale by 100 so the
                # saved array compresses better. Divide by 100 on reload to recover.
                semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embedding_compress.npz'
                embeddings_out_round = np.round(embeddings_out, 3)
                embeddings_out_round *= 100
                np.savez_compressed(semantic_search_file_name, embeddings_out_round)

            output_file_state.append(semantic_search_file_name)

            return out_message, embeddings_out, output_file_state, output_file_state

        return out_message, embeddings_out, output_file_state, output_file_state
    else:
        # Embeddings were already supplied in the state object, so reuse them
        embeddings_out = embeddings_state

        print(out_message)

        return out_message, embeddings_out, output_file_state, output_file_state

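# A minimal sketch of how embeddings saved by the function above could be reloaded.
# Assumptions: "arr_0" is numpy's default key for unnamed arrays passed to
# np.savez_compressed, and dividing by 100 inverts the scaling applied in the
# super-compress branch. This helper is illustrative and not called elsewhere.
def load_bge_embeddings_sketch(npz_file_name: str, super_compressed: bool = False) -> np.ndarray:
    loaded = np.load(npz_file_name)
    embeddings = loaded["arr_0"]
    if super_compressed:
        # Undo the round-and-scale step used when the file was written
        embeddings = embeddings.astype(np.float32) / 100
    return embeddings
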
def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column, progress = gr.Progress(track_tqdm=True)):

    def create_docs_keep_from_df(df):
        dict_out = {'ids' : [df['ids']],
                    'documents': [df['documents']],
                    'metadatas': [df['metadatas']],
                    'distances': [round(df['distances'].astype(float), 4)],
                    'embeddings': None
                    }
        return dict_out

    docs_scores = df_docs["distances"]

    # Keep only documents that score above the similarity cut-off
    score_more_limit = df_docs.loc[docs_scores > vec_score_cut_off, :]

    if score_more_limit.empty:
        return pd.DataFrame()

    # Keep only documents that are at least 100 characters long
    docs_len = score_more_limit["documents"].str.len() >= 100
    length_more_limit = score_more_limit.loc[docs_len, :].copy()

    if length_more_limit.empty:
        return pd.DataFrame()

    length_more_limit['ids'] = length_more_limit['ids'].astype(int)

    # Expand the metadata dictionaries into one column per key
    df_metadata_expanded = length_more_limit['metadatas'].apply(pd.Series)

    results_df_out = pd.concat([length_more_limit.drop('metadatas', axis=1), df_metadata_expanded], axis=1)

    results_df_out = results_df_out.rename(columns={"documents":"search_text"})

    results_df_out = results_df_out.drop(["page_section", "row", "source", "id"], axis=1, errors="ignore")
    results_df_out['distances'] = round(results_df_out['distances'].astype(float), 3)

    if in_join_file is not None and not in_join_file.empty:
        progress(0.5, desc = "Joining on additional data file")
        join_df = in_join_file

        # Strip trailing ".0" so numeric join keys read in as floats still match
        join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace(r"\.0$", "", regex=True)

        join_df = join_df.drop_duplicates(in_join_column)

        results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace(r"\.0$", "", regex=True)

        results_df_out = results_df_out.merge(join_df, left_on=search_df_join_column, right_on=in_join_column, how="left", suffixes=('','_y'))

    return results_df_out

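# A small made-up illustration of the metadata expansion step used above:
# a column of dicts becomes one column per key when passed through apply(pd.Series).
#
#   demo = pd.DataFrame({"metadatas": [{"page": 1, "source": "a.csv"},
#                                      {"page": 2, "source": "b.csv"}]})
#   demo["metadatas"].apply(pd.Series)
#   #    page source
#   # 0     1  a.csv
#   # 1     2  b.csv
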
def bge_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
                         vec_score_cut_off:float, vec_weight:float, in_join_file, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)):

    progress(0, desc = "Conducting semantic search")

    ensure_output_folder_exists(output_folder)

    print("Searching")

    embeddings = embeddings.to(device)

    query = embeddings.encode(query_str, normalize_embeddings=True)

    # Both the query and the stored document embeddings are L2-normalised, so the
    # dot product below is equal to the cosine similarity
    cosine_similarities = query @ vectorstore.T

    cosine_similarities = cosine_similarities.flatten()

    cosine_similarities_series = pd.Series(cosine_similarities)

    page_contents = [doc.page_content for doc in docs]
    page_meta = [doc.metadata for doc in docs]
    ids_range = range(0, len(page_contents))
    ids = [str(element) for element in ids_range]

    # Keep the k_val highest-scoring documents
    df_docs = pd.DataFrame(data={"ids": ids,
                                 "documents": page_contents,
                                 "metadatas": page_meta,
                                 "distances": cosine_similarities_series}).sort_values("distances", ascending=False).iloc[0:k_val,:]

    results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)

    print("Search complete")

    if results_df_out.empty:
        return 'No result found!', None

    query_str_file = query_str.replace(" ", "_")

    results_df_name = output_folder + "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"

    print("Saving search output to file")
    progress(0.7, desc = "Saving search output to file")

    results_df_out_wb = create_highlighted_excel_wb(results_df_out, query_str, "search_text")
    results_df_out_wb.save(results_df_name)

    results_first_text = results_df_out.iloc[0, 1]

    print("Returning results")

    return results_first_text, results_df_name

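# A quick numeric check (illustrative only) that the dot product of L2-normalised
# vectors used in bge_simple_retrieval matches cosine similarity:
#
#   a = np.array([3.0, 4.0]); b = np.array([4.0, 3.0])
#   a_n, b_n = a / np.linalg.norm(a), b / np.linalg.norm(b)
#   a_n @ b_n  # 0.96, i.e. (3*4 + 4*3) / (5 * 5)
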
def docs_to_jina_embed_np_array_deprecated(docs_out, in_file, embeddings_state, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)):
    '''
    Takes a list of Langchain documents, embeds them with the deprecated Jina model, and saves the embeddings to a Numpy array.
    '''
    if not in_file:
        out_message = "No input file found. Please load in at least one file."
        print(out_message)
        return out_message, None, None

    progress(0.6, desc = "Loading/creating embeddings")

    print(f"> Total split documents: {len(docs_out)}")

    page_contents = [doc.page_content for doc in docs_out]

    file_list = [string.name for string in in_file]

    embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
    data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]
    data_file_name = data_file_names[0]
    data_file_name_no_ext = get_file_path_end(data_file_name)

    out_message = "Document processing complete. Ready to search."

    if embeddings_state.size == 0:
        tic = time.perf_counter()
        print("Starting to embed documents.")

        # max_length is passed through to the deprecated Jina model's encode method
        embeddings_out = embeddings.encode(sentences=page_contents, max_length=1024, show_progress_bar = True, batch_size = 32)

        toc = time.perf_counter()
        time_out = f"The embedding took {toc - tic:0.1f} seconds"
        print(time_out)

        if return_intermediate_files == "Yes":
            progress(0.9, desc = "Saving embeddings to file")
            if embeddings_super_compress == "No":
                semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
                np.savez_compressed(semantic_search_file_name, embeddings_out)
            else:
                semantic_search_file_name = data_file_name_no_ext + '_' + 'embedding_compress.npz'
                embeddings_out_round = np.round(embeddings_out, 3)
                embeddings_out_round *= 100
                np.savez_compressed(semantic_search_file_name, embeddings_out_round)

            return out_message, embeddings_out, semantic_search_file_name

        return out_message, embeddings_out, None
    else:
        # Embeddings were already supplied in the state object, so reuse them
        embeddings_out = embeddings_state

        print(out_message)

        return out_message, embeddings_out, None

def jina_simple_retrieval_deprecated(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
                                     vec_score_cut_off:float, vec_weight:float, in_join_file, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)):

    progress(0, desc = "Conducting semantic search")

    print("Searching")

    vectorstore_tensor = tensor(vectorstore).to(device)

    embeddings = embeddings.to(device)

    query = embeddings.encode(query_str)
    query_tensor = tensor(query).to(device)

    if query_tensor.dim() == 1:
        query_tensor = query_tensor.unsqueeze(0)  # Reshape to 2D with one row

    # Normalise both sides, then take the dot product to get cosine similarities
    query_norm = query_tensor / query_tensor.norm(dim=1, keepdim=True)
    vectorstore_norm = vectorstore_tensor / vectorstore_tensor.norm(dim=1, keepdim=True)

    cosine_similarities = mm(query_norm, vectorstore_norm.T)

    cosine_similarities = cosine_similarities.flatten()

    cosine_similarities = cosine_similarities.cpu().numpy()

    cosine_similarities_series = pd.Series(cosine_similarities)

    page_contents = [doc.page_content for doc in docs]
    page_meta = [doc.metadata for doc in docs]
    ids_range = range(0, len(page_contents))
    ids = [str(element) for element in ids_range]

    df_docs = pd.DataFrame(data={"ids": ids,
                                 "documents": page_contents,
                                 "metadatas": page_meta,
                                 "distances": cosine_similarities_series}).sort_values("distances", ascending=False).iloc[0:k_val,:]

    results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)

    print("Search complete")

    if results_df_out.empty:
        return 'No result found!', None

    query_str_file = query_str.replace(" ", "_")

    results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"

    print("Saving search output to file")
    progress(0.7, desc = "Saving search output to file")

    results_df_out.to_excel(results_df_name, index=None)
    results_first_text = results_df_out.iloc[0, 1]

    print("Returning results")

    return results_first_text, results_df_name

def docs_to_chroma_save_deprecated(docs_out, embeddings = embeddings_model, progress=gr.Progress()):
    '''
    Takes a Langchain document class and saves it into a Chroma sqlite file. Not currently used.
    '''
    # Chroma is an optional dependency for this deprecated path, so import it lazily
    import chromadb
    from chromadb.config import Settings

    print(f"> Total split documents: {len(docs_out)}")

    page_contents = [doc.page_content for doc in docs_out]
    page_meta = [doc.metadata for doc in docs_out]
    ids_range = range(0, len(page_contents))
    ids = [str(element) for element in ids_range]

    tic = time.perf_counter()

    embeddings_list = embeddings.encode(sentences=page_contents, max_length=256, show_progress_bar = True, batch_size = 32).tolist()

    toc = time.perf_counter()
    time_out = f"The embedding took {toc - tic:0.1f} seconds"
    print(time_out)

    chroma_tic = time.perf_counter()

    client = chromadb.PersistentClient(path="./last_year", settings=Settings(
        anonymized_telemetry=False))

    try:
        print("Deleting existing collection.")
        client.delete_collection(name="my_collection")
        print("Creating new collection.")
        collection = client.create_collection(name="my_collection")
    except Exception:
        print("Creating new collection.")
        collection = client.create_collection(name="my_collection")

    def create_batch_ranges(in_list, batch_size=40000):
        total_rows = len(in_list)
        ranges = []

        for start in range(0, total_rows, batch_size):
            end = min(start + batch_size, total_rows)
            ranges.append(range(start, end))

        return ranges

    batch_ranges = create_batch_ranges(embeddings_list)
    print(batch_ranges)

    for row_range in progress.tqdm(batch_ranges, desc = "Creating vector database", unit = "batches of 40,000 rows"):

        # Slice with range.start and range.stop so the final row of each batch is included
        collection.add(
            documents = page_contents[row_range.start:row_range.stop],
            embeddings = embeddings_list[row_range.start:row_range.stop],
            metadatas = page_meta[row_range.start:row_range.stop],
            ids = ids[row_range.start:row_range.stop])

    chroma_toc = time.perf_counter()

    chroma_time_out = f"Loading to Chroma db took {chroma_toc - chroma_tic:0.1f} seconds"
    print(chroma_time_out)

    out_message = "Document processing complete"

    return out_message, collection

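# An illustrative check of the batching helper above (made-up sizes): for 100,000
# rows and the default batch_size of 40,000, create_batch_ranges would return
# [range(0, 40000), range(40000, 80000), range(80000, 100000)], and each slice
# [r.start:r.stop] covers every row exactly once.
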
def chroma_retrieval_deprecated(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
                                vec_score_cut_off:float, vec_weight:float, in_join_file = None, in_join_column = None, search_df_join_column = None, embeddings = embeddings_model):

    query = embeddings.encode(query_str).tolist()

    docs = vectorstore.query(
        query_embeddings=query,
        n_results=k_val
    )

    df_docs = pd.DataFrame(data={'ids': docs['ids'][0],
                                 'documents': docs['documents'][0],
                                 'metadatas': docs['metadatas'][0],
                                 'distances': docs['distances'][0]
                                 })

    results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)

    results_df_name = output_folder + "semantic_search_result.csv"
    results_df_out.to_csv(results_df_name, index=None)
    results_first_text = results_df_out[orig_df_col].iloc[0]

    return results_first_text, results_df_name