"""Functions for loading a sentence-transformer embeddings model, creating BGE document
embeddings, and running a cosine-similarity semantic search over them."""

import time
import pandas as pd
from typing import Type
import gradio as gr
import numpy as np
from datetime import datetime

from search_funcs.helper_functions import get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder

PandasDataFrame = Type[pd.DataFrame]

today_rev = datetime.now().strftime("%Y%m%d")

def load_embedding_model(embeddings_name: str = "BAAI/bge-small-en-v1.5", embedding_loc: str = "bge/"):
    """Load a SentenceTransformer embeddings model, preferring a local copy where one exists."""

    from torch import cuda, backends
    from sentence_transformers import SentenceTransformer

    # Check whether torch can see a CUDA device, and whether the cuDNN backend is enabled
    print("Is CUDA available? ", cuda.is_available())
    print("Is cuDNN enabled? ", backends.cudnn.enabled)
    if cuda.is_available():
        torch_device = "cuda"
        #os.system("nvidia-smi")
    else:
        torch_device = "cpu"

    print("Device used is: ", torch_device)

    # Define a list of possible local locations to search for the model
    local_embeddings_locations = [
        "model/" + embedding_loc, # Potential local location
        "/model/" + embedding_loc, # Potential location in Docker container
        "/home/user/app/model/" + embedding_loc # This is inside a Docker container
    ]

    # Attempt to load the model from each local location
    for location in local_embeddings_locations:
        try:
            embeddings_model = SentenceTransformer(location)
            print(f"Found local model installation at: {location}")
            break  # Exit the loop if the model is found
        except Exception as e:
            print(f"Failed to load model from {location}: {e}")
            continue
    else:
        # If the loop completes without finding the model in any local location
        embeddings_model = SentenceTransformer(embeddings_name)
        print("Could not find local model installation. Downloading from Huggingface")

    # Load the sentence transformer model and move it to CPU/GPU
    embeddings_model = embeddings_model.to(torch_device)

    return embeddings_model, torch_device
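
# Illustrative usage sketch (not part of the app's pipeline; the model name and folder are just
# the defaults of the function above):
# embeddings_model, torch_device = load_embedding_model("BAAI/bge-small-en-v1.5", "bge/")
# example_vectors = embeddings_model.encode(["some example text"], batch_size=32)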

def docs_to_bge_embed_np_array(
    docs_out: list, 
    in_file: list, 
    output_file_state: str, 
    clean: str,
    embeddings_state: np.ndarray,
    embeddings_model_name:str,
    embeddings_model_loc:str,
    return_intermediate_files: str = "No", 
    embeddings_compress: str = "No",    
    progress: gr.Progress = gr.Progress(track_tqdm=True)
) -> tuple:
    """
    Process documents to create BGE embeddings and save them as a numpy array.

    Parameters:
    - docs_out (list): List of documents to be embedded.
    - in_file (list): List of input files.
    - output_file_state (str): State of the output file.
    - clean (str): Indicates if the data should be cleaned.
    - embeddings_state (np.ndarray): Current state of embeddings.
    - embeddings_model_name (str): The Huggingface repo name of the embeddings model.
    - embeddings_model_loc (str): Embeddings model save location.
    - return_intermediate_files (str, optional): Whether to return intermediate files. Default is "No".
    - embeddings_compress (str, optional): Whether to compress the embeddings to int8 precision. Default is "No".    
    - progress (gr.Progress, optional): Progress tracker for the function. Default is gr.Progress(track_tqdm=True).

    Returns:
    - tuple: A tuple containing the output message, the embeddings array, the output file state (returned twice), and the loaded embeddings model.
    """

    embeddings_model, torch_device = load_embedding_model(embeddings_model_name, embeddings_model_loc)

    ensure_output_folder_exists(output_folder)

    if not in_file:
        out_message = "No input file found. Please load in at least one file."
        print(out_message)
        # Return the same number of values as the successful paths below so downstream outputs stay aligned
        return out_message, None, output_file_state, output_file_state, None

    progress(0.6, desc = "Loading/creating embeddings")

    print(f"> Total split documents: {len(docs_out)}")

    page_contents = [doc.page_content for doc in docs_out]

    ## Load in pre-embedded file if one exists
    file_list = [string.name for string in in_file]

    embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
    data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
    data_file_name = data_file_names[0]
    data_file_name_no_ext = get_file_path_end(data_file_name)

    out_message = "Document processing complete. Ready to search."

    if embeddings_state.size == 0:
        tic = time.perf_counter()
        print("Starting to embed documents.")

        # Encode embeddings: float32 by default, int8 if compression is selected
        batch_size = 32

        if "bge" in embeddings_model_name:
            print("Embedding with BGE model")
        else:
            print("Embedding with MiniLM-L6-v2 model")

        if embeddings_compress == "No":
            print("Embedding with full fp32 precision")
            embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size)
        else:
            print("Embedding with int8 precision")
            embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, precision="int8")

        toc = time.perf_counter()
        time_out = f"The embedding took {toc - tic:0.1f} seconds"
        print(time_out)

        # If you want to save your files for next time
        if return_intermediate_files == "Yes":
            if clean == "Yes":
                data_file_name_no_ext = data_file_name_no_ext + "_cleaned"

            progress(0.9, desc = "Saving embeddings to file")
            if embeddings_compress == "No":
                semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embeddings.npz'
            else:
                semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embedding_compress.npz'
                
            np.savez_compressed(semantic_search_file_name, embeddings_out)
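            # Note: np.savez_compressed saves the array under the default key 'arr_0' inside the .npz file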

            output_file_state.append(semantic_search_file_name)

        return out_message, embeddings_out, output_file_state, output_file_state, embeddings_model
    else:
        # Just return the existing embeddings if they already exist
        embeddings_out = embeddings_state
    
    print(out_message)

    return out_message, embeddings_out, output_file_state, output_file_state, embeddings_model
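
# Illustrative call sketch (argument values are assumptions; docs_out and in_file would normally
# come from the file-loading and document-splitting steps elsewhere in the app):
# out_message, embeddings_out, files, files_state, model = docs_to_bge_embed_np_array(
#     docs_out, in_file, [], clean="No", embeddings_state=np.array([]),
#     embeddings_model_name="BAAI/bge-small-en-v1.5", embeddings_model_loc="bge/",
#     return_intermediate_files="Yes", embeddings_compress="No")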

def process_data_from_scores_df(
    df_docs: pd.DataFrame, 
    in_join_file: pd.DataFrame, 
    vec_score_cut_off: float, 
    in_join_column: str, 
    search_df_join_column: str, 
    progress: gr.Progress = gr.Progress(track_tqdm=True)
) -> pd.DataFrame:
    """
    Process the data from the scores DataFrame by filtering based on score cutoff and document length,
    and optionally joining with an additional file.

    Parameters
    ----------
    df_docs : pd.DataFrame
        DataFrame containing document scores and metadata.
    in_join_file : pd.DataFrame
        DataFrame to join with the results based on specified columns.
    vec_score_cut_off : float
        Cutoff value for the vector similarity score.
    in_join_column : str
        Column name in the join file to join on.
    search_df_join_column : str
        Column name in the search DataFrame to join on.
    progress : gr.Progress, optional
        Progress tracker for the function (default is gr.Progress(track_tqdm=True)).

    Returns
    -------
    pd.DataFrame
        Processed DataFrame with filtered and joined data.
    """
           
    docs_scores = df_docs["distances"] #.astype(float)

    # Only keep sources that are sufficiently relevant (i.e. similarity score above the cutoff)
    score_more_limit = df_docs.loc[docs_scores > vec_score_cut_off, :]

    if score_more_limit.empty:
        return pd.DataFrame()

    # Only keep sources that are at least 100 characters long
    docs_len = score_more_limit["documents"].str.len() >= 100

    # .copy() avoids a pandas SettingWithCopyWarning when the 'ids' column is overwritten below
    length_more_limit = score_more_limit.loc[docs_len, :].copy()

    if length_more_limit.empty:
        return pd.DataFrame()

    length_more_limit['ids'] = length_more_limit['ids'].astype(int)


    # Explode the 'metadatas' dictionary into separate columns
    df_metadata_expanded = length_more_limit['metadatas'].apply(pd.Series)

    # Concatenate the original DataFrame with the expanded metadata DataFrame
    results_df_out = pd.concat([length_more_limit.drop('metadatas', axis=1), df_metadata_expanded], axis=1)

    results_df_out = results_df_out.rename(columns={"documents":"search_text"})

    results_df_out = results_df_out.drop(["page_section", "row", "source", "id"], axis=1, errors="ignore")
    results_df_out['distances'] = round(results_df_out['distances'].astype(float), 3)
    

    # Join on additional files
    if not in_join_file.empty:
        progress(0.5, desc = "Joining on additional data file")
        join_df = in_join_file

        join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace(r"\.0$", "", regex=True)

        # Duplicates dropped so as not to expand out dataframe
        join_df = join_df.drop_duplicates(in_join_column)

        results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace(r"\.0$", "", regex=True)

        results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left", suffixes=('','_y'))#.drop(in_join_column, axis=1)

    return results_df_out
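
# Illustrative call sketch (the cutoff value and empty join dataframe are assumptions; df_documents
# follows the column layout built in bge_semantic_search below):
# results = process_data_from_scores_df(df_documents, in_join_file=pd.DataFrame(),
#                                        vec_score_cut_off=0.2, in_join_column=None,
#                                        search_df_join_column=None)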

def bge_semantic_search(
    query_str: str, 
    embeddings: np.ndarray, 
    documents: list, 
    k_val: int, 
    vec_score_cut_off: float,
    embeddings_model,
    embeddings_model_name: str,
    embeddings_compress:str,
    in_join_file: pd.DataFrame, 
    in_join_column: str = None, 
    search_df_join_column: str = None,
    progress: gr.Progress = gr.Progress(track_tqdm=True)
) -> tuple:
    """
    Perform a semantic search using the BGE model.

    Parameters:
    - query_str (str): The query string to search for.
    - embeddings (np.ndarray): The embeddings to search within.
    - documents (list): The list of documents to search.
    - k_val (int): The number of top results to return.
    - vec_score_cut_off (float): The score cutoff for filtering results.
    - embeddings_model (SentenceTransformer, optional): The embeddings model to use.
    - embeddings_model_name (str): The Huggingface repo name of the embeddings model.
    - embeddings_compress (str): Whether the embeddings have been compressed to int8 precision
    - in_join_file (pd.DataFrame): The DataFrame to join with the search results.
    - in_join_column (str, optional): The column name in the join DataFrame to join on. Default is None.
    - search_df_join_column (str, optional): The column name in the search DataFrame to join on. Default is None.  
    - progress (gr.Progress, optional): Progress tracker for the function. Default is gr.Progress(track_tqdm=True).

    Returns:
    - tuple: The text of the top search result (or an error message if nothing is found) and a list of output file paths.
    """

    progress(0, desc = "Conducting semantic search")

    output_files = []

    ensure_output_folder_exists(output_folder)

    print("Searching")

    from sentence_transformers import quantize_embeddings

    # Report which embeddings model is being used for the similarity comparison
    if "bge" in embeddings_model_name:
        print("Comparing similarity using BGE model")
    else:
        print("Comparing similarity using MiniLM-L6-v2 model")

    # Encode the query with the sentence transformer. The query is kept at full fp32 precision:
    # using an int8-quantised query doesn't actually seem to work, even when the document
    # embeddings have been compressed, so the quantize_embeddings call is left commented out
    query_fp32 = embeddings_model.encode(query_str)

    # query_int8 = quantize_embeddings(
    #     query_fp32, precision="int8", calibration_embeddings=embeddings
    # )
  
    #print("query:", query_fp32)
    #print("embeddings:", embeddings)

    # Normalise embeddings
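    # Dividing the query and each document embedding by its L2 norm means the dot product
    # below is exactly the cosine similarity between the query and each document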

    query = query_fp32.astype('float32')

    query_norm = np.linalg.norm(query)
    normalized_query = query / query_norm

    embeddings = embeddings.astype('float32')

    embeddings_norm = np.linalg.norm(embeddings, axis=1, keepdims=True)  # Keep dims to allow broadcasting
    normalized_embeddings = embeddings / embeddings_norm

    #print("normalized_query:", normalized_query)
    #print("normalized_embeddings:", normalized_embeddings)

    cosine_similarities = (normalized_query @ normalized_embeddings.T)

    #print("Initial cosine similarities:", cosine_similarities)

    # Create a Pandas Series
    cosine_similarities_series = pd.Series(cosine_similarities)

    # Pull out relevant info from documents
    page_contents = [doc.page_content for doc in documents]
    page_meta = [doc.metadata for doc in documents]
    ids_range = range(0, len(page_contents))
    ids = [str(element) for element in ids_range]
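
    # Note: despite the column name, the "distances" column below actually holds cosine similarities
    # (higher = more similar), so sorting descending keeps the best k_val matches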

    df_documents = pd.DataFrame(data={"ids": ids,
                                "documents": page_contents,
                                    "metadatas":page_meta,
                                    "distances":cosine_similarities_series}).sort_values("distances", ascending=False).iloc[0:k_val,:]

    
    results_df_out = process_data_from_scores_df(df_documents, in_join_file, vec_score_cut_off, in_join_column, search_df_join_column)

    print("Search complete")

    # If nothing found, return error message
    if results_df_out.empty:
        return 'No result found!', None
    
    query_str_file = query_str.replace(" ", "_")

    results_df_name = output_folder + "semantic_search_result_" + today_rev + "_" +  query_str_file + ".xlsx"

    print("Saving search output to file")
    progress(0.7, desc = "Saving search output to file")

    # Highlight found text and save to file
    results_df_out_wb = create_highlighted_excel_wb(results_df_out, query_str, "search_text")
    results_df_out_wb.save(results_df_name)

    #results_df_out.to_excel(results_df_name, index= None)
    results_first_text = results_df_out.iloc[0, 1]
    output_files.append(results_df_name)

    #csv_output_file = output_folder + "semantic_search_result_" + today_rev + "_" +  query_str_file + ".csv" 
    #results_df_out.to_csv(csv_output_file, index=None)
    #output_files.append(csv_output_file)

    print("Returning results")

    return results_first_text, output_files
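
# Illustrative end-to-end sketch (all argument values are assumptions; embeddings_out and docs_out
# would come from docs_to_bge_embed_np_array and the document-splitting step respectively):
# results_text, output_files = bge_semantic_search(
#     "example query", embeddings_out, docs_out, k_val=5, vec_score_cut_off=0.2,
#     embeddings_model=embeddings_model, embeddings_model_name="BAAI/bge-small-en-v1.5",
#     embeddings_compress="No", in_join_file=pd.DataFrame())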