import time
import pandas as pd
from typing import Type
import gradio as gr
import numpy as np
from datetime import datetime
from search_funcs.helper_functions import get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder
PandasDataFrame = Type[pd.DataFrame]
today_rev = datetime.now().strftime("%Y%m%d")

def load_embedding_model(embeddings_name: str = "BAAI/bge-small-en-v1.5", embedding_loc: str = "bge/"):
    """Load a SentenceTransformer embeddings model, preferring a local copy and falling back to downloading from Huggingface, then move it to the available device."""
    from torch import cuda, backends
    from sentence_transformers import SentenceTransformer

    # Check for torch cuda
    print("Is CUDA enabled? ", cuda.is_available())
    print("Is a CUDA device available on this computer?", backends.cudnn.enabled)

    if cuda.is_available():
        torch_device = "cuda"
        #os.system("nvidia-smi")
    else:
        torch_device = "cpu"

    print("Device used is: ", torch_device)

    # Define a list of possible local locations to search for the model
    local_embeddings_locations = [
        "model/" + embedding_loc, # Potential local location
        "/model/" + embedding_loc, # Potential location in Docker container
        "/home/user/app/model/" + embedding_loc # This is inside a Docker container
    ]

    # Attempt to load the model from each local location
    for location in local_embeddings_locations:
        try:
            embeddings_model = SentenceTransformer(location)
            print(f"Found local model installation at: {location}")
            break # Exit the loop if the model is found
        except Exception as e:
            print(f"Failed to load model from {location}: {e}")
            continue
    else:
        # If the loop completes without finding the model in any local location
        print("Could not find local model installation. Downloading from Huggingface")
        embeddings_model = SentenceTransformer(embeddings_name)

    # Move the sentence transformer model to CPU/GPU
    embeddings_model = embeddings_model.to(torch_device)

    return embeddings_model, torch_device
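# Example usage (a minimal, illustrative sketch; the model name and local folder shown
# are simply the defaults above, not a recommendation):
# embeddings_model, torch_device = load_embedding_model(
#     embeddings_name="BAAI/bge-small-en-v1.5", embedding_loc="bge/"
# )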

def docs_to_bge_embed_np_array(
    docs_out: list,
    in_file: list,
    output_file_state: str,
    clean: str,
    embeddings_state: np.ndarray,
    embeddings_model_name: str,
    embeddings_model_loc: str,
    return_intermediate_files: str = "No",
    embeddings_compress: str = "No",
    progress: gr.Progress = gr.Progress(track_tqdm=True)
) -> tuple:
    """
    Process documents to create BGE embeddings and save them as a numpy array.

    Parameters:
    - docs_out (list): List of documents to be embedded.
    - in_file (list): List of input files.
    - output_file_state (str): State of the output file.
    - clean (str): Indicates if the data should be cleaned.
    - embeddings_state (np.ndarray): Current state of embeddings.
    - embeddings_model_name (str): The Huggingface repo name of the embeddings model.
    - embeddings_model_loc (str): Embeddings model save location.
    - return_intermediate_files (str, optional): Whether to return intermediate files. Default is "No".
    - embeddings_compress (str, optional): Whether to compress the embeddings to int8 precision. Default is "No".
    - progress (gr.Progress, optional): Progress tracker for the function. Default is gr.Progress(track_tqdm=True).

    Returns:
    - tuple: The output message, the embeddings, the output file state (twice, once per Gradio output component), and the loaded embeddings model.
    """
    embeddings_model, torch_device = load_embedding_model(embeddings_model_name, embeddings_model_loc)

    ensure_output_folder_exists(output_folder)

    if not in_file:
        out_message = "No input file found. Please load in at least one file."
        print(out_message)
        return out_message, None, output_file_state, output_file_state, None

    progress(0.6, desc = "Loading/creating embeddings")

    print(f"> Total split documents: {len(docs_out)}")

    page_contents = [doc.page_content for doc in docs_out]

    ## Load in pre-embedded file if exists
    file_list = [string.name for string in in_file]

    embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
    data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
    data_file_name = data_file_names[0]
    data_file_name_no_ext = get_file_path_end(data_file_name)

    out_message = "Document processing complete. Ready to search."

    if embeddings_state.size == 0:
        tic = time.perf_counter()
        print("Starting to embed documents.")

        # Encode embeddings. If in normal mode, float32, if in 'super compress' mode, int8
        batch_size = 32

        if "bge" in embeddings_model_name:
            print("Embedding with BGE model")
        else:
            print("Embedding with MiniLM-L6-v2 model")

        if embeddings_compress == "No":
            print("Embedding with full fp32 precision")
            embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size)
        else:
            print("Embedding with int8 precision")
            embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, precision="int8")

        toc = time.perf_counter()
        time_out = f"The embedding took {toc - tic:0.1f} seconds"
        print(time_out)

        # If you want to save your files for next time
        if return_intermediate_files == "Yes":
            if clean == "Yes":
                data_file_name_no_ext = data_file_name_no_ext + "_cleaned"

            progress(0.9, desc = "Saving embeddings to file")

            if embeddings_compress == "No":
                semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embeddings.npz'
            else:
                semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embedding_compress.npz'

            np.savez_compressed(semantic_search_file_name, embeddings_out)

            output_file_state.append(semantic_search_file_name)

        return out_message, embeddings_out, output_file_state, output_file_state, embeddings_model
    else:
        # Just return existing embeddings if they already exist
        embeddings_out = embeddings_state
        print(out_message)
        return out_message, embeddings_out, output_file_state, output_file_state, embeddings_model
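# Illustrative sketch (not part of the app flow): embeddings saved above with
# np.savez_compressed can be reloaded in a later session with numpy's default key, e.g.
#   loaded_embeddings = np.load("output/example_bge_embeddings.npz")["arr_0"]
# The file name here is hypothetical; real names follow the pattern built above.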

def process_data_from_scores_df(
    df_docs: pd.DataFrame,
    in_join_file: pd.DataFrame,
    vec_score_cut_off: float,
    in_join_column: str,
    search_df_join_column: str,
    progress: gr.Progress = gr.Progress(track_tqdm=True)
) -> pd.DataFrame:
    """
    Process the data from the scores DataFrame by filtering based on score cutoff and document length,
    and optionally joining with an additional file.

    Parameters
    ----------
    df_docs : pd.DataFrame
        DataFrame containing document scores and metadata.
    in_join_file : pd.DataFrame
        DataFrame to join with the results based on specified columns.
    vec_score_cut_off : float
        Cutoff value for the vector similarity score.
    in_join_column : str
        Column name in the join file to join on.
    search_df_join_column : str
        Column name in the search DataFrame to join on.
    progress : gr.Progress, optional
        Progress tracker for the function (default is gr.Progress(track_tqdm=True)).

    Returns
    -------
    pd.DataFrame
        Processed DataFrame with filtered and joined data.
    """
    docs_scores = df_docs["distances"] #.astype(float)

    # Only keep sources that are sufficiently relevant (i.e. similarity score above the cutoff)
    score_more_limit = df_docs.loc[docs_scores > vec_score_cut_off, :]

    if score_more_limit.empty:
        return pd.DataFrame()

    # Only keep sources that are at least 100 characters long
    docs_len = score_more_limit["documents"].str.len() >= 100
    length_more_limit = score_more_limit.loc[docs_len, :].copy()

    if length_more_limit.empty:
        return pd.DataFrame()

    length_more_limit['ids'] = length_more_limit['ids'].astype(int)

    # Explode the 'metadatas' dictionary into separate columns
    df_metadata_expanded = length_more_limit['metadatas'].apply(pd.Series)

    # Concatenate the original DataFrame with the expanded metadata DataFrame
    results_df_out = pd.concat([length_more_limit.drop('metadatas', axis=1), df_metadata_expanded], axis=1)

    results_df_out = results_df_out.rename(columns={"documents":"search_text"})
    results_df_out = results_df_out.drop(["page_section", "row", "source", "id"], axis=1, errors="ignore")
    results_df_out['distances'] = results_df_out['distances'].astype(float).round(3)

    # Join on additional files
    if not in_join_file.empty:
        progress(0.5, desc = "Joining on additional data file")
        join_df = in_join_file
        join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace(r"\.0$", "", regex=True)

        # Duplicates dropped so as not to expand out dataframe
        join_df = join_df.drop_duplicates(in_join_column)

        results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace(r"\.0$", "", regex=True)

        results_df_out = results_df_out.merge(join_df, left_on=search_df_join_column, right_on=in_join_column, how="left", suffixes=('','_y'))#.drop(in_join_column, axis=1)

    return results_df_out
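# Illustrative example (hypothetical column names): keep results with a cosine similarity
# above 0.3 and join extra reference data on a shared "reference" column:
# results = process_data_from_scores_df(df_documents, join_df, 0.3, "reference", "reference")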

def bge_semantic_search(
    query_str: str,
    embeddings: np.ndarray,
    documents: list,
    k_val: int,
    vec_score_cut_off: float,
    embeddings_model,
    embeddings_model_name: str,
    embeddings_compress: str,
    in_join_file: pd.DataFrame,
    in_join_column: str = None,
    search_df_join_column: str = None,
    progress: gr.Progress = gr.Progress(track_tqdm=True)
) -> tuple:
    """
    Perform a semantic search using the BGE model.

    Parameters:
    - query_str (str): The query string to search for.
    - embeddings (np.ndarray): The embeddings to search within.
    - documents (list): The list of documents to search.
    - k_val (int): The number of top results to return.
    - vec_score_cut_off (float): The score cutoff for filtering results.
    - embeddings_model (SentenceTransformer): The embeddings model to use.
    - embeddings_model_name (str): The Huggingface repo name of the embeddings model.
    - embeddings_compress (str): Whether the embeddings have been compressed to int8 precision.
    - in_join_file (pd.DataFrame): The DataFrame to join with the search results.
    - in_join_column (str, optional): The column name in the join DataFrame to join on. Default is None.
    - search_df_join_column (str, optional): The column name in the search DataFrame to join on. Default is None.
    - progress (gr.Progress, optional): Progress tracker for the function. Default is gr.Progress(track_tqdm=True).

    Returns:
    - tuple: The text of the top search result and a list of output file paths, or an error message and None if nothing was found.
    """
    progress(0, desc = "Conducting semantic search")

    output_files = []

    ensure_output_folder_exists(output_folder)

    print("Searching")

    from sentence_transformers import quantize_embeddings

    # Encode the query using the sentence transformer and convert to a PyTorch tensor
    if "bge" in embeddings_model_name:
        print("Comparing similarity using BGE model")
    else:
        print("Comparing similarity using MiniLM-L6-v2 model")

    if embeddings_compress == "Yes":
        query_fp32 = embeddings_model.encode(query_str)

        # Using a query as int8 doesn't actually seem to work
        # query_int8 = quantize_embeddings(
        #     query_fp32, precision="int8", calibration_embeddings=embeddings
        # )
    else:
        query_fp32 = embeddings_model.encode(query_str)

    #print("query:", query_fp32)
    #print("embeddings:", embeddings)

    # Normalise embeddings so that the dot product below equals the cosine similarity
    query = query_fp32.astype('float32')
    query_norm = np.linalg.norm(query)
    normalized_query = query / query_norm

    embeddings = embeddings.astype('float32')
    embeddings_norm = np.linalg.norm(embeddings, axis=1, keepdims=True) # Keep dims to allow broadcasting
    normalized_embeddings = embeddings / embeddings_norm

    #print("normalized_query:", normalized_query)
    #print("normalized_embeddings:", normalized_embeddings)

    cosine_similarities = (normalized_query @ normalized_embeddings.T)

    #print("Initial cosine similarities:", cosine_similarities)

    # Create a Pandas Series
    cosine_similarities_series = pd.Series(cosine_similarities)

    # Pull out relevant info from documents
    page_contents = [doc.page_content for doc in documents]
    page_meta = [doc.metadata for doc in documents]
    ids_range = range(0, len(page_contents))
    ids = [str(element) for element in ids_range]

    df_documents = pd.DataFrame(data={"ids": ids,
                                      "documents": page_contents,
                                      "metadatas": page_meta,
                                      "distances": cosine_similarities_series}).sort_values("distances", ascending=False).iloc[0:k_val, :]

    results_df_out = process_data_from_scores_df(df_documents, in_join_file, vec_score_cut_off, in_join_column, search_df_join_column)

    print("Search complete")

    # If nothing found, return error message
    if results_df_out.empty:
        return 'No result found!', None

    query_str_file = query_str.replace(" ", "_")

    results_df_name = output_folder + "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"

    print("Saving search output to file")
    progress(0.7, desc = "Saving search output to file")

    # Highlight found text and save to file
    results_df_out_wb = create_highlighted_excel_wb(results_df_out, query_str, "search_text")
    results_df_out_wb.save(results_df_name)
    #results_df_out.to_excel(results_df_name, index= None)

    results_first_text = results_df_out.iloc[0, 1]

    output_files.append(results_df_name)

    #csv_output_file = output_folder + "semantic_search_result_" + today_rev + "_" + query_str_file + ".csv"
    #results_df_out.to_csv(csv_output_file, index=None)
    #output_files.append(csv_output_file)

    print("Returning results")

    return results_first_text, output_files
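# Example usage (an illustrative sketch only; variable names are assumptions, and the
# documents must expose .page_content / .metadata as used above):
# top_text, files = bge_semantic_search(
#     query_str="data protection policy",
#     embeddings=embeddings_out,
#     documents=docs_out,
#     k_val=5,
#     vec_score_cut_off=0.3,
#     embeddings_model=embeddings_model,
#     embeddings_model_name="BAAI/bge-small-en-v1.5",
#     embeddings_compress="No",
#     in_join_file=pd.DataFrame(),
#     in_join_column=None,
#     search_df_join_column=None,
# )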