import spacy
spacy.prefer_gpu()
from spacy.cli.download import download
from spacy.matcher import Matcher
import numpy as np
import gradio as gr
import pandas as pd
from typing import List, Type
from datetime import datetime
from search_funcs.helper_functions import create_highlighted_excel_wb, output_folder

PandasDataFrame = Type[pd.DataFrame]

today_rev = datetime.now().strftime("%Y%m%d")
# Load the spaCy model, downloading it first if it is not already installed
try:
    import en_core_web_sm
    nlp = en_core_web_sm.load()
    print("Successfully imported spaCy model")
except ImportError:
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
    print("Successfully downloaded and loaded spaCy model")
def spacy_fuzzy_search(string_query: str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column: str, in_join_file: PandasDataFrame, search_df_join_column: str, in_join_column: str, no_spelling_mistakes: int = 1, progress=gr.Progress(track_tqdm=True)):
    '''Conduct a fuzzy match of a query against a list of tokenised documents. Returns the text of the top result and the path to a highlighted Excel file of all results.'''

    # Convert tokenised data back into a list of strings
    df_list = list(map(" ".join, tokenised_data))

    if len(df_list) > 10000:
        out_message = "Your data has more than 10,000 rows and will take more than three minutes to fuzzy search. Please try keyword or semantic search for data of this size."
        return out_message, None

    # Tokenise the query so that multi-word queries can be matched token by token
    query = nlp(string_query)
    tokenised_query = [token.text for token in query]
    print(tokenised_query)
    spelling_mistakes_fuzzy_pattern = "FUZZY" + str(no_spelling_mistakes)

    # Build one pattern that matches on lemma and one that matches fuzzily on text
    if len(tokenised_query) > 1:
        pattern_lemma = [{"LEMMA": {"IN": tokenised_query}}]
        pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": tokenised_query}}}]
    else:
        pattern_lemma = [{"LEMMA": tokenised_query[0]}]
        pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: tokenised_query[0]}}]
    matcher = Matcher(nlp.vocab)

    # Add both patterns under the same key, so a token matching either pattern counts as a hit
    matcher.add(string_query, [pattern_fuzz, pattern_lemma])

    # Process documents in batches for speed; larger batches use more memory
    batch_size = 256
    docs = nlp.pipe(df_list, batch_size=batch_size)
    all_matches = []

    # Get number of matches per doc
    for doc in progress.tqdm(docs, desc="Searching text", unit="rows"):
        matches = matcher(doc)
        match_count = len(matches)
        all_matches.append(match_count)

    print("Search complete")
    # Get document lengths (in characters); treat empty rows as length 1 to avoid dividing by zero
    lengths = [max(len(element), 1) for element in df_list]

    # Score is the number of matches divided by the length of the document
    match_scores = (np.array(all_matches) / np.array(lengths)).tolist()
    # Prepare results and export
    results_df = pd.DataFrame(data={"index": list(range(len(df_list))),
                                    "search_text": df_list,
                                    "search_score_abs": match_scores})
    results_df['search_score_abs'] = (results_df['search_score_abs'] * 100).round(2).abs()
    results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data, left_on="index", right_index=True, how="left")

    # Keep only results with at least one match
    results_df_out = results_df_out.loc[results_df_out["search_score_abs"] > 0, :]
    # Join on additional files
    if not in_join_file.empty:
        progress(0.5, desc="Joining on additional data file")
        join_df = in_join_file.copy()  # Copy so the caller's dataframe is not modified

        # Normalise both join keys to strings, stripping any trailing ".0" left over from float conversion
        join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace(r"\.0$", "", regex=True)
        results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace(r"\.0$", "", regex=True)

        # Duplicates dropped so as not to expand out the dataframe
        join_df = join_df.drop_duplicates(in_join_column)

        results_df_out = results_df_out.merge(join_df, left_on=search_df_join_column, right_on=in_join_column, how="left", suffixes=('', '_y'))
    # Reorder results by score
    results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)

    if results_df_out.empty:
        return "No matches found for query '" + string_query + "'.", None

    # Out file
    query_str_file = "_".join(tokenised_query)
    results_df_name = output_folder + "fuzzy_keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"

    print("Saving search file output")
    progress(0.7, desc="Saving search output to file")

    # Highlight found text and save to file
    results_df_out_wb = create_highlighted_excel_wb(results_df_out, string_query, "search_text")
    results_df_out_wb.save(results_df_name)

    results_first_text = results_df_out[text_column].iloc[0]

    print("Returning results")

    return results_first_text, results_df_name
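
# A minimal usage sketch, not part of the app's call path: the dataframe, the
# column names and the _NoProgress stand-in below are hypothetical, and running
# it assumes the search_funcs package is importable and output_folder exists.
class _NoProgress:
    '''Stand-in for gr.Progress so the sketch can run outside a Gradio event.'''
    def __call__(self, *args, **kwargs):
        pass

    def tqdm(self, iterable, *args, **kwargs):
        return iterable

if __name__ == "__main__":
    example_data = pd.DataFrame({"text": ["The cat sat on the mat", "A dog barked at the postman"]})
    example_tokens = [text.split(" ") for text in example_data["text"]]

    top_result, results_file = spacy_fuzzy_search(
        string_query="cat",
        tokenised_data=example_tokens,
        original_data=example_data,
        text_column="text",
        in_join_file=pd.DataFrame(),  # Empty dataframe: the optional join step is skipped
        search_df_join_column="",
        in_join_column="",
        no_spelling_mistakes=1,
        progress=_NoProgress(),
    )
    print(top_result)    # Highest-scoring row of the "text" column
    print(results_file)  # Path to the highlighted .xlsx results file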