import os
import time
from datetime import datetime
from typing import Type

import accelerate # Needed for device_map="auto" when loading the embeddings model below
import gradio as gr
import numpy as np
import pandas as pd
from torch import cuda, backends, tensor, mm
from transformers import AutoModel

from search_funcs.helper_functions import read_file

today_rev = datetime.now().strftime("%Y%m%d")

# Check for torch cuda
print("Is CUDA enabled? ", cuda.is_available())
print("Is a CUDA device available on this computer?", backends.cudnn.enabled)

if cuda.is_available():
    torch_device = "cuda"
    os.system("nvidia-smi")
else:
    torch_device = "cpu"

print("Device used is: ", torch_device)

#from search_funcs.helper_functions import get_file_path_end

PandasDataFrame = Type[pd.DataFrame]

# Load embeddings
# Pinning a Jina revision for security purposes: https://www.baseten.co/blog/pinning-ml-model-revisions-for-compatibility-and-security/
# Save the Jina model locally as described here: https://huggingface.co/jinaai/jina-embeddings-v2-base-en/discussions/29
embeddings_name = "jinaai/jina-embeddings-v2-small-en"
local_embeddings_location = "model/jina/"
revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"

# Prefer a locally saved copy of the model; otherwise download the pinned revision from the Hugging Face Hub
try:
    embeddings_model = AutoModel.from_pretrained(local_embeddings_location, revision=revision_choice, trust_remote_code=True, local_files_only=True, device_map="auto")
except Exception:
    embeddings_model = AutoModel.from_pretrained(embeddings_name, revision=revision_choice, trust_remote_code=True, device_map="auto")
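
# One way (untested sketch, not called anywhere in this app) to populate the local copy used above:
# after the first successful download, save the model into local_embeddings_location so the
# local_files_only branch succeeds on later runs.
#
# embeddings_model.save_pretrained(local_embeddings_location)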

# Chroma support is currently deprecated
# Import Chroma and instantiate a client. The default Chroma client is ephemeral, meaning it will not save to disk.
#import chromadb
#from chromadb.config import Settings
#from typing_extensions import Protocol
#from chromadb import Documents, EmbeddingFunction, Embeddings

# Remove the Chroma database file if it exists, as it can cause issues
#chromadb_file = "chroma.sqlite3"

#if os.path.isfile(chromadb_file):
#    os.remove(chromadb_file)

def get_file_path_end(file_path):
    # First, get the basename of the file (e.g. "example.txt" from "/path/to/example.txt")
    basename = os.path.basename(file_path)

    # Then, split the basename and its extension and return only the basename without the extension
    filename_without_extension, _ = os.path.splitext(basename)

    #print(filename_without_extension)

    return filename_without_extension
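
# For example (illustrative only), get_file_path_end("/path/to/example.txt") returns "example",
# which is used below to name the intermediate embedding and results files.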

def load_embeddings(embeddings_name=embeddings_name):
    '''
    Load the embeddings model and create a global variable based on it.
    '''
    embeddings_func = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True, device_map="auto")

    global embeddings

    embeddings = embeddings_func

    return embeddings

def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files="No", embeddings_super_compress="No", embeddings=embeddings_model, progress=gr.Progress()):
    '''
    Takes a list of Langchain documents and either embeds them with the Jina embeddings model or loads
    pre-computed embeddings from an uploaded file. Returns the embeddings as a numpy array, optionally
    saving them to disk as an intermediate file.
    '''
    print(f"> Total split documents: {len(docs_out)}")

    #print(docs_out)

    page_contents = [doc.page_content for doc in docs_out]

    ## Load in pre-embedded file if it exists
    file_list = [string.name for string in in_file]

    #print(file_list)

    embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
    data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
    data_file_name = data_file_names[0]
    data_file_name_no_ext = get_file_path_end(data_file_name)

    out_message = "Document processing complete. Ready to search."
    if embeddings_file_names:
        print("Loading embeddings from file.")
        embeddings_out = np.load(embeddings_file_names[0])['arr_0']

        # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
        if "compress" in embeddings_file_names[0]:
            embeddings_out /= 100

        # print("embeddings loaded: ", embeddings_out)
    if not embeddings_file_names:
        tic = time.perf_counter()
        print("Starting to embed documents.")

        #embeddings_list = []
        #for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
        #    embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())

        embeddings_out = embeddings.encode(sentences=page_contents, max_length=1024, show_progress_bar=True, batch_size=32) # For Jina embeddings
        #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
        #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm

        toc = time.perf_counter()
        time_out = f"The embedding took {toc - tic:0.1f} seconds"
        print(time_out)
    # If you want to save your files for next time
    if return_intermediate_files == "Yes":
        if embeddings_super_compress == "No":
            semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
            np.savez_compressed(semantic_search_file_name, embeddings_out)
        else:
            semantic_search_file_name = data_file_name_no_ext + '_' + 'embedding_compress.npz'
            embeddings_out_round = np.round(embeddings_out, 3)
            embeddings_out_round *= 100 # Rounding not currently used
            np.savez_compressed(semantic_search_file_name, embeddings_out_round)

        print(out_message)
        return out_message, embeddings_out, semantic_search_file_name

    print(out_message)

    return out_message, embeddings_out, None
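
# Worked example of the 'super compress' round trip above (illustrative arithmetic only): a raw
# embedding value of 0.12345 is rounded to 0.123 and scaled to 12.3 before saving; on load it is
# divided by 100, giving back 0.123, i.e. the values are recovered to roughly 3 decimal places.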

def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column):

    def create_docs_keep_from_df(df):
        dict_out = {'ids': [df['ids']],
                    'documents': [df['documents']],
                    'metadatas': [df['metadatas']],
                    'distances': [round(df['distances'].astype(float), 4)],
                    'embeddings': None
                    }
        return dict_out
    # Prepare the DataFrame by transposing
    #df_docs = df#.apply(lambda x: x.explode()).reset_index(drop=True)

    # Keep only documents with a certain score
    #print(df_docs)

    docs_scores = df_docs["distances"] #.astype(float)

    # Only keep sources that are sufficiently relevant (i.e. similarity score above the threshold)
    score_more_limit = df_docs.loc[docs_scores > vec_score_cut_off, :]
    #docs_keep = create_docs_keep_from_df(score_more_limit) #list(compress(docs, score_more_limit))

    #print(docs_keep)

    if score_more_limit.empty:
        return pd.DataFrame()

    # Only keep sources that are at least 100 characters long
    docs_len = score_more_limit["documents"].str.len() >= 100

    #print(docs_len)

    length_more_limit = score_more_limit.loc[docs_len, :] #pd.Series(docs_len) >= 100
    #docs_keep = create_docs_keep_from_df(length_more_limit) #list(compress(docs_keep, length_more_limit))

    #print(length_more_limit)

    if length_more_limit.empty:
        return pd.DataFrame()
    length_more_limit['ids'] = length_more_limit['ids'].astype(int)

    #length_more_limit.to_csv("length_more_limit.csv", index = None)

    # Explode the 'metadatas' dictionary into separate columns
    df_metadata_expanded = length_more_limit['metadatas'].apply(pd.Series)

    #print(length_more_limit)
    #print(df_metadata_expanded)

    # Concatenate the original DataFrame with the expanded metadata DataFrame
    results_df_out = pd.concat([length_more_limit.drop('metadatas', axis=1), df_metadata_expanded], axis=1)

    results_df_out = results_df_out.rename(columns={"documents": orig_df_col})
    results_df_out = results_df_out.drop(["page_section", "row", "source", "id"], axis=1, errors="ignore")
    results_df_out['distances'] = round(results_df_out['distances'].astype(float), 3)

    # Join back to original df
    # results_df_out = orig_df.merge(length_more_limit[['ids', 'distances']], left_index = True, right_on = "ids", how="inner").sort_values("distances")

    # Join on additional files
    if in_join_file:
        join_filename = in_join_file.name

        # Import data
        join_df = read_file(join_filename)
        join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace(r"\.0$", "", regex=True)

        # Duplicates dropped so as not to expand out the dataframe
        join_df = join_df.drop_duplicates(in_join_column)

        results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace(r"\.0$", "", regex=True)

        results_df_out = results_df_out.merge(join_df, left_on=search_df_join_column, right_on=in_join_column, how="left").drop(in_join_column, axis=1)

    return results_df_out
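
# Illustrative sketch (not part of the app) of the 'metadatas' expansion used above: applying
# pd.Series to a column of dicts turns each key into its own column, e.g.
#   pd.Series([{"row": 3, "source": "a.csv"}, {"row": 7, "source": "b.csv"}]).apply(pd.Series)
# gives a DataFrame with 'row' and 'source' columns; these are concatenated back onto the results,
# and columns such as 'row' and 'source' are then dropped from the final output.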

def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
                          vec_score_cut_off:float, vec_weight:float, in_join_file = None, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings_model, progress=gr.Progress()): # ,vectorstore, embeddings

    # print("vectorstore loaded: ", vectorstore)

    # Convert the vectorstore to a PyTorch tensor and transfer it to the chosen device
    vectorstore_tensor = tensor(vectorstore).to(device)

    # Move the embeddings model to the same device
    embeddings = embeddings.to(device)

    # Encode the query using the embeddings model and convert to a PyTorch tensor
    query = embeddings.encode(query_str)
    query_tensor = tensor(query).to(device)

    if query_tensor.dim() == 1:
        query_tensor = query_tensor.unsqueeze(0)  # Reshape to 2D with one row

    # Normalize the query tensor and vectorstore tensor
    query_norm = query_tensor / query_tensor.norm(dim=1, keepdim=True)
    vectorstore_norm = vectorstore_tensor / vectorstore_tensor.norm(dim=1, keepdim=True)

    # Calculate cosine similarities (batch processing)
    cosine_similarities = mm(query_norm, vectorstore_norm.T)

    # Flatten the tensor to a 1D array
    cosine_similarities = cosine_similarities.flatten()

    # Convert to a NumPy array if it's still a PyTorch tensor
    cosine_similarities = cosine_similarities.cpu().numpy()

    # Create a Pandas Series
    cosine_similarities_series = pd.Series(cosine_similarities)

    # Pull out relevant info from docs
    page_contents = [doc.page_content for doc in docs]
    page_meta = [doc.metadata for doc in docs]
    ids_range = range(0, len(page_contents))
    ids = [str(element) for element in ids_range]

    df_docs = pd.DataFrame(data={"ids": ids,
                                 "documents": page_contents,
                                 "metadatas": page_meta,
                                 "distances": cosine_similarities_series}).sort_values("distances", ascending=False).iloc[0:k_val, :]

    results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)

    # If nothing found, return error message
    if results_df_out.empty:
        return 'No result found!', None

    query_str_file = query_str.replace(" ", "_")

    results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
    results_df_out.to_excel(results_df_name, index=None)
    results_first_text = results_df_out.iloc[0, 1]

    return results_first_text, results_df_name
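
# Small numeric check (illustrative only, not used by the app) of the normalize-then-matrix-multiply
# step above: with a query vector [1, 0] and document vectors [1, 0] and [1, 1], the L2-normalized
# dot products are 1.0 and 1/sqrt(2) ≈ 0.707, i.e. the cosine similarities.
#
#   q = tensor([[1.0, 0.0]])
#   v = tensor([[1.0, 0.0], [1.0, 1.0]])
#   sims = mm(q / q.norm(dim=1, keepdim=True), (v / v.norm(dim=1, keepdim=True)).T)
#   # -> tensor([[1.0000, 0.7071]])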

# Deprecated Chroma functions - kept just in case needed in future.
# Note: to run these, the chromadb imports commented out near the top of this file need to be restored.

def docs_to_chroma_save_deprecated(docs_out, embeddings = embeddings_model, progress=gr.Progress()):
    '''
    Takes a Langchain document class and saves it into a Chroma sqlite file. Not currently used.
    '''
    print(f"> Total split documents: {len(docs_out)}")

    #print(docs_out)

    page_contents = [doc.page_content for doc in docs_out]
    page_meta = [doc.metadata for doc in docs_out]
    ids_range = range(0, len(page_contents))
    ids = [str(element) for element in ids_range]

    tic = time.perf_counter()

    #embeddings_list = []
    #for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
    #    embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())

    embeddings_list = embeddings.encode(sentences=page_contents, max_length=256, show_progress_bar=True, batch_size=32).tolist() # For Jina embeddings
    #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
    #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm

    toc = time.perf_counter()
    time_out = f"The embedding took {toc - tic:0.1f} seconds"
    #pd.Series(embeddings_list).to_csv("embeddings_out.csv")

    # Jina tiny
    # This takes about 300 seconds for 240,000 records = 800 / second, 1024 max length
    # For 50k records:
    # 61 seconds at 1024 max length
    # 55 seconds at 512 max length
    # 43 seconds at 256 max length
    # 31 seconds at 128 max length
    # The embedding took 1372.5 seconds at 256 max length for 655,020 case notes

    # BGE small
    # 96 seconds for 50k records at 512 length

    # all-MiniLM-L6-v2
    # 42.5 seconds at (256?) max length

    # paraphrase-MiniLM-L3-v2
    # 22 seconds for 128 max length

    print(time_out)
    chroma_tic = time.perf_counter()

    # Create a new Chroma collection to store the documents and metadata. We don't need to specify an embedding function, and the default will be used.
    client = chromadb.PersistentClient(path="./last_year", settings=Settings(
        anonymized_telemetry=False))

    try:
        print("Deleting existing collection.")
        #collection = client.get_collection(name="my_collection")
        client.delete_collection(name="my_collection")

        print("Creating new collection.")
        collection = client.create_collection(name="my_collection")
    except Exception:
        print("Creating new collection.")
        collection = client.create_collection(name="my_collection")
    # Max batch size is about 40,000, so add that amount in a loop
    def create_batch_ranges(in_list, batch_size=40000):
        total_rows = len(in_list)
        ranges = []

        for start in range(0, total_rows, batch_size):
            end = min(start + batch_size, total_rows)
            ranges.append(range(start, end))

        return ranges
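
    # For example (illustrative only), with 100,000 embeddings create_batch_ranges returns
    # [range(0, 40000), range(40000, 80000), range(80000, 100000)], and each range below is
    # turned into a slice over the documents, embeddings, metadata and ids lists.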
    batch_ranges = create_batch_ranges(embeddings_list)
    print(batch_ranges)

    for row_range in progress.tqdm(batch_ranges, desc = "Creating vector database", unit = "batches of 40,000 rows"):

        # Slice up to and including the last index in the range so the final row of each batch is not dropped
        collection.add(
            documents = page_contents[row_range[0]:row_range[-1] + 1],
            embeddings = embeddings_list[row_range[0]:row_range[-1] + 1],
            metadatas = page_meta[row_range[0]:row_range[-1] + 1],
            ids = ids[row_range[0]:row_range[-1] + 1])
        #print("Here")

    # print(collection.count())

    #chatf.vectorstore = vectorstore_func

    chroma_toc = time.perf_counter()

    chroma_time_out = f"Loading to Chroma db took {chroma_toc - chroma_tic:0.1f} seconds"
    print(chroma_time_out)

    out_message = "Document processing complete"

    return out_message, collection

def chroma_retrieval_deprecated(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
                                vec_score_cut_off:float, vec_weight:float, in_join_file = None, in_join_column = None, search_df_join_column = None, embeddings = embeddings_model): # ,vectorstore, embeddings

    query = embeddings.encode(query_str).tolist()

    docs = vectorstore.query(
        query_embeddings=query,
        n_results=k_val # No practical limit on number of responses returned
        #where={"metadata_field": "is_equal_to_this"},
        #where_document={"$contains":"search_string"}
    )

    df_docs = pd.DataFrame(data={'ids': docs['ids'][0],
                                 'documents': docs['documents'][0],
                                 'metadatas': docs['metadatas'][0],
                                 'distances': docs['distances'][0]#,
                                 #'embeddings': docs['embeddings']
                                 })

    results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)

    results_df_name = "semantic_search_result.csv"
    results_df_out.to_csv(results_df_name, index=None)
    results_first_text = results_df_out[orig_df_col].iloc[0]

    return results_first_text, results_df_name
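
# ---------------------------------------------------------------------------
# Minimal usage sketch of the Jina search path (illustrative only, not part of
# the Gradio app). It assumes a tiny stand-in ExampleDoc class in place of the
# LangChain documents produced elsewhere in the app, skips the file-upload and
# intermediate-save steps, and assumes openpyxl is installed for the .xlsx output.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from dataclasses import dataclass, field

    @dataclass
    class ExampleDoc:  # hypothetical stand-in for a LangChain Document
        page_content: str
        metadata: dict = field(default_factory=dict)

    example_docs = [
        ExampleDoc("Semantic search retrieves passages by meaning rather than exact keyword matches, "
                   "which helps when users phrase the same idea in many different ways."),
        ExampleDoc("Keyword search looks for literal token overlap between the query and the documents, "
                   "so synonyms and paraphrases are easily missed without query expansion."),
    ]

    # Embed the example passages with the module-level Jina model
    example_embeddings = embeddings_model.encode(
        sentences=[doc.page_content for doc in example_docs], max_length=1024, batch_size=32)

    # Retrieve the best-matching passage for a query
    first_result_text, results_file = jina_simple_retrieval(
        "finding text by meaning", example_embeddings, example_docs,
        orig_df_col="documents", k_val=2, out_passages=1,
        vec_score_cut_off=0.1, vec_weight=1)

    print(first_result_text)
    print("Results saved to:", results_file)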