import time
import pandas as pd
from typing import Type
import gradio as gr
import numpy as np
from datetime import datetime

from search_funcs.helper_functions import get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder

PandasDataFrame = Type[pd.DataFrame]

today_rev = datetime.now().strftime("%Y%m%d")
def load_embedding_model(embeddings_name: str = "BAAI/bge-small-en-v1.5", embedding_loc: str = "bge/"):
    from torch import cuda, backends
    from sentence_transformers import SentenceTransformer

    # Check whether torch can see a CUDA device and whether cuDNN is enabled
    print("Is CUDA enabled? ", cuda.is_available())
    print("Is cuDNN enabled? ", backends.cudnn.enabled)
    if cuda.is_available():
        torch_device = "cuda"
        #os.system("nvidia-smi")
    else:
        torch_device = "cpu"

    print("Device used is: ", torch_device)

    # Define a list of possible local locations to search for the model
    local_embeddings_locations = [
        "model/" + embedding_loc,                 # Potential local location
        "/model/" + embedding_loc,                # Potential location in Docker container
        "/home/user/app/model/" + embedding_loc   # This is inside a Docker container
    ]

    # Attempt to load the model from each local location
    for location in local_embeddings_locations:
        try:
            embeddings_model = SentenceTransformer(location)
            print(f"Found local model installation at: {location}")
            break  # Exit the loop if the model is found
        except Exception as e:
            print(f"Failed to load model from {location}: {e}")
            continue
    else:
        # If the loop completes without finding the model in any local location,
        # download it from the Hugging Face Hub instead
        embeddings_model = SentenceTransformer(embeddings_name)
        print("Could not find a local model installation. Downloading from Hugging Face.")

    # Move the loaded sentence transformer model to CPU/GPU
    embeddings_model = embeddings_model.to(torch_device)

    return embeddings_model, torch_device
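
# Usage sketch (illustrative only; in the app these values come from the Gradio UI).
# The model name and folder shown are simply the function's own defaults, not new
# requirements.
#
#   embeddings_model, torch_device = load_embedding_model(
#       embeddings_name="BAAI/bge-small-en-v1.5", embedding_loc="bge/"
#   )
#   print(torch_device)  # "cuda" if a GPU was detected, otherwise "cpu"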
def docs_to_bge_embed_np_array(
    docs_out: list,
    in_file: list,
    output_file_state: list,
    clean: str,
    embeddings_state: np.ndarray,
    embeddings_model_name: str,
    embeddings_model_loc: str,
    return_intermediate_files: str = "No",
    embeddings_compress: str = "No",
    progress: gr.Progress = gr.Progress(track_tqdm=True)
) -> tuple:
    """
    Process documents to create BGE embeddings and save them as a numpy array.

    Parameters:
    - docs_out (list): List of documents to be embedded.
    - in_file (list): List of input files.
    - output_file_state (list): List of output file paths produced so far.
    - clean (str): Indicates if the data should be cleaned.
    - embeddings_state (np.ndarray): Current state of embeddings.
    - embeddings_model_name (str): The Hugging Face repo name of the embeddings model.
    - embeddings_model_loc (str): Embeddings model save location.
    - return_intermediate_files (str, optional): Whether to return intermediate files. Default is "No".
    - embeddings_compress (str, optional): Whether to compress the embeddings to int8 precision. Default is "No".
    - progress (gr.Progress, optional): Progress tracker for the function. Default is gr.Progress(track_tqdm=True).

    Returns:
    - tuple: The output message, the embeddings array, the output file state (twice, for the two Gradio outputs), and the loaded embeddings model.
    """
    embeddings_model, torch_device = load_embedding_model(embeddings_model_name, embeddings_model_loc)

    ensure_output_folder_exists(output_folder)

    if not in_file:
        out_message = "No input file found. Please load in at least one file."
        print(out_message)
        return out_message, None, None, output_file_state, None

    progress(0.6, desc = "Loading/creating embeddings")

    print(f"> Total split documents: {len(docs_out)}")

    page_contents = [doc.page_content for doc in docs_out]

    ## Load in a pre-embedded file if it exists
    file_list = [string.name for string in in_file]

    embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
    data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
    data_file_name = data_file_names[0]
    data_file_name_no_ext = get_file_path_end(data_file_name)

    out_message = "Document processing complete. Ready to search."

    if embeddings_state.size == 0:
        tic = time.perf_counter()
        print("Starting to embed documents.")

        # Encode embeddings. In normal mode use float32; in 'super compress' mode use int8
        batch_size = 32

        if "bge" in embeddings_model_name:
            print("Embedding with BGE model")
        else:
            print("Embedding with MiniLM-L6-v2 model")

        if embeddings_compress == "No":
            print("Embedding with full fp32 precision")
            embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar=True, batch_size=batch_size)
        else:
            print("Embedding with int8 precision")
            embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar=True, batch_size=batch_size, precision="int8")

        toc = time.perf_counter()
        time_out = f"The embedding took {toc - tic:0.1f} seconds"
        print(time_out)

        # If you want to save your files for next time
        if return_intermediate_files == "Yes":
            if clean == "Yes":
                data_file_name_no_ext = data_file_name_no_ext + "_cleaned"

            progress(0.9, desc = "Saving embeddings to file")

            if embeddings_compress == "No":
                semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embeddings.npz'
            else:
                semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embedding_compress.npz'

            np.savez_compressed(semantic_search_file_name, embeddings_out)

            output_file_state.append(semantic_search_file_name)

            return out_message, embeddings_out, output_file_state, output_file_state, embeddings_model

        return out_message, embeddings_out, output_file_state, output_file_state, embeddings_model
    else:
        # Just return the existing embeddings if they already exist
        embeddings_out = embeddings_state

        print(out_message)

        return out_message, embeddings_out, output_file_state, output_file_state, embeddings_model
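
# Usage sketch (illustrative only). `docs_out` is assumed to be a list of document
# objects exposing `.page_content`, as produced elsewhere in the app, and `in_file`
# the list of uploaded Gradio file objects; neither is defined here.
#
#   message, embeddings, files, files_again, model = docs_to_bge_embed_np_array(
#       docs_out, in_file, output_file_state=[], clean="No",
#       embeddings_state=np.array([]),
#       embeddings_model_name="BAAI/bge-small-en-v1.5",
#       embeddings_model_loc="bge/",
#       return_intermediate_files="Yes", embeddings_compress="No"
#   )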
def process_data_from_scores_df(
    df_docs: pd.DataFrame,
    in_join_file: pd.DataFrame,
    vec_score_cut_off: float,
    in_join_column: str,
    search_df_join_column: str,
    progress: gr.Progress = gr.Progress(track_tqdm=True)
) -> pd.DataFrame:
    """
    Process the data from the scores DataFrame by filtering based on score cutoff and document length,
    and optionally joining with an additional file.

    Parameters
    ----------
    df_docs : pd.DataFrame
        DataFrame containing document scores and metadata.
    in_join_file : pd.DataFrame
        DataFrame to join with the results based on specified columns.
    vec_score_cut_off : float
        Cutoff value for the vector similarity score.
    in_join_column : str
        Column name in the join file to join on.
    search_df_join_column : str
        Column name in the search DataFrame to join on.
    progress : gr.Progress, optional
        Progress tracker for the function (default is gr.Progress(track_tqdm=True)).

    Returns
    -------
    pd.DataFrame
        Processed DataFrame with filtered and joined data.
    """
    docs_scores = df_docs["distances"] #.astype(float)

    # Only keep sources that are sufficiently relevant (i.e. cosine similarity score above the threshold)
    score_more_limit = df_docs.loc[docs_scores > vec_score_cut_off, :]

    if score_more_limit.empty:
        return pd.DataFrame()

    # Only keep sources that are at least 100 characters long
    docs_len = score_more_limit["documents"].str.len() >= 100
    length_more_limit = score_more_limit.loc[docs_len, :].copy()

    if length_more_limit.empty:
        return pd.DataFrame()

    length_more_limit['ids'] = length_more_limit['ids'].astype(int)

    # Explode the 'metadatas' dictionary into separate columns
    df_metadata_expanded = length_more_limit['metadatas'].apply(pd.Series)

    # Concatenate the original DataFrame with the expanded metadata DataFrame
    results_df_out = pd.concat([length_more_limit.drop('metadatas', axis=1), df_metadata_expanded], axis=1)

    results_df_out = results_df_out.rename(columns={"documents":"search_text"})
    results_df_out = results_df_out.drop(["page_section", "row", "source", "id"], axis=1, errors="ignore")
    results_df_out['distances'] = results_df_out['distances'].astype(float).round(3)

    # Join on additional files
    if not in_join_file.empty:
        progress(0.5, desc = "Joining on additional data file")
        join_df = in_join_file

        # Strip any trailing ".0" left over from float-to-string conversion so the join keys match
        join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace(r"\.0$", "", regex=True)

        # Duplicates dropped so as not to expand out the dataframe
        join_df = join_df.drop_duplicates(in_join_column)

        results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace(r"\.0$", "", regex=True)

        results_df_out = results_df_out.merge(join_df, left_on=search_df_join_column, right_on=in_join_column, how="left", suffixes=('','_y'))#.drop(in_join_column, axis=1)

    return results_df_out
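
# Usage sketch (illustrative only). `df_docs` is assumed to carry the columns built
# in bge_semantic_search below ("ids", "documents", "metadatas", "distances"); an
# empty join DataFrame skips the join step entirely.
#
#   results = process_data_from_scores_df(
#       df_docs, in_join_file=pd.DataFrame(), vec_score_cut_off=0.3,
#       in_join_column=None, search_df_join_column=None
#   )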
def bge_semantic_search(
    query_str: str,
    embeddings: np.ndarray,
    documents: list,
    k_val: int,
    vec_score_cut_off: float,
    embeddings_model,
    embeddings_model_name: str,
    embeddings_compress: str,
    in_join_file: pd.DataFrame,
    in_join_column: str = None,
    search_df_join_column: str = None,
    progress: gr.Progress = gr.Progress(track_tqdm=True)
) -> tuple:
    """
    Perform a semantic search using the BGE model.

    Parameters:
    - query_str (str): The query string to search for.
    - embeddings (np.ndarray): The embeddings to search within.
    - documents (list): The list of documents to search.
    - k_val (int): The number of top results to return.
    - vec_score_cut_off (float): The score cutoff for filtering results.
    - embeddings_model (SentenceTransformer): The embeddings model to use.
    - embeddings_model_name (str): The Hugging Face repo name of the embeddings model.
    - embeddings_compress (str): Whether the embeddings have been compressed to int8 precision.
    - in_join_file (pd.DataFrame): The DataFrame to join with the search results.
    - in_join_column (str, optional): The column name in the join DataFrame to join on. Default is None.
    - search_df_join_column (str, optional): The column name in the search DataFrame to join on. Default is None.
    - progress (gr.Progress, optional): Progress tracker for the function. Default is gr.Progress(track_tqdm=True).

    Returns:
    - tuple: The text of the top search result and a list of output file paths, or an error message and None if nothing was found.
    """
    progress(0, desc = "Conducting semantic search")

    output_files = []

    ensure_output_folder_exists(output_folder)

    print("Searching")

    from sentence_transformers import quantize_embeddings

    # Encode the query using the sentence transformer
    if "bge" in embeddings_model_name:
        print("Comparing similarity using BGE model")
    else:
        print("Comparing similarity using MiniLM-L6-v2 model")

    if embeddings_compress == "Yes":
        query_fp32 = embeddings_model.encode(query_str)

        # Using a query as int8 doesn't actually seem to work
        # query_int8 = quantize_embeddings(
        #     query_fp32, precision="int8", calibration_embeddings=embeddings
        # )
    else:
        query_fp32 = embeddings_model.encode(query_str)

    #print("query:", query_fp32)
    #print("embeddings:", embeddings)

    # Normalise the query and document embeddings so that the dot product below is the cosine similarity
    query = query_fp32.astype('float32')
    query_norm = np.linalg.norm(query)
    normalized_query = query / query_norm

    embeddings = embeddings.astype('float32')
    embeddings_norm = np.linalg.norm(embeddings, axis=1, keepdims=True)  # Keep dims to allow broadcasting
    normalized_embeddings = embeddings / embeddings_norm

    #print("normalized_query:", normalized_query)
    #print("normalized_embeddings:", normalized_embeddings)

    cosine_similarities = (normalized_query @ normalized_embeddings.T)

    #print("Initial cosine similarities:", cosine_similarities)

    # Create a Pandas Series
    cosine_similarities_series = pd.Series(cosine_similarities)

    # Pull out relevant info from documents
    page_contents = [doc.page_content for doc in documents]
    page_meta = [doc.metadata for doc in documents]
    ids_range = range(0, len(page_contents))
    ids = [str(element) for element in ids_range]

    df_documents = pd.DataFrame(data={"ids": ids,
                                      "documents": page_contents,
                                      "metadatas": page_meta,
                                      "distances": cosine_similarities_series}).sort_values("distances", ascending=False).iloc[0:k_val, :]

    results_df_out = process_data_from_scores_df(df_documents, in_join_file, vec_score_cut_off, in_join_column, search_df_join_column)

    print("Search complete")

    # If nothing was found, return an error message
    if results_df_out.empty:
        return 'No result found!', None

    query_str_file = query_str.replace(" ", "_")

    results_df_name = output_folder + "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"

    print("Saving search output to file")
    progress(0.7, desc = "Saving search output to file")

    # Highlight the found text and save the results to an Excel file
    results_df_out_wb = create_highlighted_excel_wb(results_df_out, query_str, "search_text")
    results_df_out_wb.save(results_df_name)
    #results_df_out.to_excel(results_df_name, index= None)

    results_first_text = results_df_out.iloc[0, 1]

    output_files.append(results_df_name)

    #csv_output_file = output_folder + "semantic_search_result_" + today_rev + "_" + query_str_file + ".csv"
    #results_df_out.to_csv(csv_output_file, index=None)
    #output_files.append(csv_output_file)

    print("Returning results")

    return results_first_text, output_files
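
# Usage sketch (illustrative only). Assumes `docs_out`, `embeddings` and
# `embeddings_model` were produced by docs_to_bge_embed_np_array above, and that
# no join file is used (the query string is just an example).
#
#   first_result_text, output_files = bge_semantic_search(
#       query_str="data protection policy",
#       embeddings=embeddings, documents=docs_out, k_val=5,
#       vec_score_cut_off=0.3, embeddings_model=embeddings_model,
#       embeddings_model_name="BAAI/bge-small-en-v1.5",
#       embeddings_compress="No", in_join_file=pd.DataFrame()
#   )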