import spacy

spacy.prefer_gpu()

from spacy.cli.download import download
from spacy.matcher import Matcher
import numpy as np
import gradio as gr
import pandas as pd
from typing import List, Type
from datetime import datetime

from search_funcs.helper_functions import create_highlighted_excel_wb, output_folder

PandasDataFrame = Type[pd.DataFrame]

today_rev = datetime.now().strftime("%Y%m%d")

# Load the spaCy model, downloading it first if it is not already installed
try:
    import en_core_web_sm
    nlp = en_core_web_sm.load()
    print("Successfully imported spaCy model")
except ImportError:
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
    print("Downloaded and loaded spaCy model")
def spacy_fuzzy_search(string_query: str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column: str, in_join_file: PandasDataFrame, search_df_join_column: str, in_join_column: str, no_spelling_mistakes: int = 1, progress=gr.Progress(track_tqdm=True)):
    '''Conduct a fuzzy match on a list of tokenised data, returning the text of the top result and the path to an Excel file of all matching rows.'''

    # Rejoin the tokenised rows into plain text strings for matching
    df_list = list(map(" ".join, tokenised_data))

    if len(df_list) > 10000:
        out_message = "Your data has more than 10,000 rows and will take more than three minutes to do a fuzzy search. Please try keyword or semantic search for data of this size."
        return out_message, None

    # Tokenise the query with spaCy
    query = nlp(string_query)
    tokenised_query = [token.text for token in query]
    print(tokenised_query)

    # Build the fuzzy operator name, e.g. "FUZZY1" allows one spelling mistake (edit) per token
    spelling_mistakes_fuzzy_pattern = "FUZZY" + str(no_spelling_mistakes)
    # Build Matcher patterns: an exact lemma match and a fuzzy text match on the query tokens
    if len(tokenised_query) > 1:
        pattern_lemma = [{"LEMMA": {"IN": tokenised_query}}]
        pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": tokenised_query}}}]
    else:
        pattern_lemma = [{"LEMMA": tokenised_query[0]}]
        pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: tokenised_query[0]}}]

    # Register both patterns under the same match ID
    matcher = Matcher(nlp.vocab)
    matcher.add(string_query, [pattern_fuzz])
    matcher.add(string_query, [pattern_lemma])

    # Run the rows through the spaCy pipeline in batches
    batch_size = 256
    docs = nlp.pipe(df_list, batch_size=batch_size)
    # Count Matcher hits for each row
    all_matches = []

    for doc in progress.tqdm(docs, desc="Searching text", unit="rows"):
        matches = matcher(doc)
        match_count = len(matches)
        all_matches.append(match_count)

    print("Search complete")

    # Normalise match counts by the character length of each row to get a score
    lengths = []
    for element in df_list:
        lengths.append(len(element))

    match_scores = (np.array(all_matches) / np.array(lengths)).tolist()
    # Assemble the results and attach them back to the original data
    results_df = pd.DataFrame(data={"index": list(range(len(df_list))),
                                    "search_text": df_list,
                                    "search_score_abs": match_scores})
    results_df['search_score_abs'] = abs(round(results_df['search_score_abs'] * 100, 2))
    results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data, left_on="index", right_index=True, how="left")

    # Keep only rows with at least one match
    results_df_out = results_df_out.loc[results_df["search_score_abs"] > 0, :]

    # Join on an additional reference file if one was provided
    if not in_join_file.empty:
        progress(0.5, desc="Joining on additional data file")
        join_df = in_join_file
        join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace(r"\.0$", "", regex=True)
        results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace(r"\.0$", "", regex=True)

        # Deduplicate the join column to avoid a one-to-many merge
        join_df = join_df.drop_duplicates(in_join_column)

        results_df_out = results_df_out.merge(join_df, left_on=search_df_join_column, right_on=in_join_column, how="left", suffixes=('', '_y'))
    results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)

    # Save the results to a highlighted Excel workbook
    query_str_file = "_".join(tokenised_query)
    results_df_name = output_folder + "fuzzy_keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"

    print("Saving search file output")
    progress(0.7, desc="Saving search output to file")

    results_df_out_wb = create_highlighted_excel_wb(results_df_out, string_query, "search_text")
    results_df_out_wb.save(results_df_name)

    # Return the text of the top result and the path to the output file
    results_first_text = results_df_out[text_column].iloc[0]

    print("Returning results")

    return results_first_text, results_df_name
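

# A minimal usage sketch, assuming a small in-memory dataset. The column name
# ("description"), the example rows and the query string are made up for illustration;
# in the app this function is normally driven from the Gradio interface instead.
# _NoOpProgress is a hypothetical stand-in for gr.Progress so the sketch can run
# outside a Gradio event; output_folder must exist for the Excel file to be written.
if __name__ == "__main__":

    class _NoOpProgress:
        '''Minimal stand-in providing the two methods the search function uses.'''
        def __call__(self, *args, **kwargs):
            pass
        def tqdm(self, iterable, **kwargs):
            return iterable

    example_data = pd.DataFrame({"description": ["Annual account summary",
                                                 "Quarterly acount sumary report",
                                                 "Staff travel expenses"]})
    example_tokens = [row.split(" ") for row in example_data["description"]]

    top_result, results_file = spacy_fuzzy_search(
        string_query="account summary",
        tokenised_data=example_tokens,
        original_data=example_data,
        text_column="description",
        in_join_file=pd.DataFrame(),  # no additional join file
        search_df_join_column="",
        in_join_column="",
        no_spelling_mistakes=1,
        progress=_NoOpProgress(),
    )

    print(top_result)
    print(results_file)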