import os

import numpy as np
from leven import levenshtein
from nltk import word_tokenize
from nltk.translate.meteor_score import single_meteor_score
from sklearn.cluster import DBSCAN, dbscan


def delete_if_exists(filepath):
    """Remove the file at `filepath` if it exists."""
    if os.path.exists(filepath):
        os.remove(filepath)


def pairwise_meteor(candidate, reference):
    """METEOR score of a candidate string against a single reference string.

    Requires the NLTK "wordnet" corpus (and "punkt" for tokenization).
    TODO: this is not thread safe, no idea how to make it so.
    """
    return single_meteor_score(word_tokenize(reference), word_tokenize(candidate))


def count_stats(candidate_dict, reference_dict):
    """Compare the number of predicted vs. reference parts per example.

    Returns the fraction of examples whose part counts match exactly and the
    mean absolute difference in part counts.
    """
    count_match = [0 for _ in candidate_dict]
    count_diff = [0 for _ in candidate_dict]

    for i, k in enumerate(candidate_dict.keys()):
        pred_parts = candidate_dict[k]
        tgt_parts = reference_dict[k]

        if len(pred_parts) == len(tgt_parts):
            count_match[i] = 1
        count_diff[i] = abs(len(pred_parts) - len(tgt_parts))

    return {
        "count_match_score": np.mean(count_match),
        "count_diff_score": np.mean(count_diff),
    }


def f1_metric(candidate_dict, reference_dict, pairwise_metric):
    """Soft precision/recall/F1 over lists of text parts.

    Each predicted part is credited with its best-matching reference part
    (precision side) and each reference part with its best-matching predicted
    part (recall side), averaged per example and then across examples.
    """
    all_best_p = [0 for _ in candidate_dict]
    all_best_t = [0 for _ in candidate_dict]
    p_unnorm = []

    for i, k in enumerate(candidate_dict.keys()):
        pred_parts = candidate_dict[k]
        tgt_parts = reference_dict[k]

        best_p_score = [0 for _ in pred_parts]
        best_t_score = [0 for _ in tgt_parts]

        for p_idx in range(len(pred_parts)):
            for t_idx in range(len(tgt_parts)):
                metric_score = pairwise_metric(pred_parts[p_idx], tgt_parts[t_idx])
                best_p_score[p_idx] = max(best_p_score[p_idx], metric_score)
                best_t_score[t_idx] = max(best_t_score[t_idx], metric_score)

        # An empty part list scores 1.0 by convention (nothing to mismatch).
        all_best_p[i] = np.mean(best_p_score) if len(best_p_score) > 0 else 1.0
        all_best_t[i] = np.mean(best_t_score) if len(best_t_score) > 0 else 1.0
        p_unnorm.extend(best_p_score)

    p_score = np.mean(all_best_p)
    r_score = np.mean(all_best_t)

    return {
        "p": p_score,
        "r": r_score,
        "avg": (p_score + r_score) / 2,
        "f1": 2 * p_score * r_score / (p_score + r_score + 1e-8),
        "p_unnorm": np.mean(p_unnorm),
    }


def edit_distance_dbscan(data):
    """Cluster strings with DBSCAN using Levenshtein distance.

    Works around scikit-learn's numeric-feature requirement by clustering
    indices into `data` and computing string distances on the fly. Inspired by
    https://scikit-learn.org/stable/faq.html#how-do-i-deal-with-string-data-or-trees-graphs
    """

    def lev_metric(x, y):
        i, j = int(x[0]), int(y[0])
        return levenshtein(data[i], data[j])

    X = np.arange(len(data)).reshape(-1, 1)
    return dbscan(X, metric=lev_metric, eps=20, min_samples=2, algorithm="brute")


def compute_all_pairwise_edit_distances(data):
    """Dense matrix of Levenshtein distances between all pairs in `data`."""
    X = np.empty((len(data), len(data)))
    for i in range(len(data)):
        for j in range(len(data)):
            X[i][j] = levenshtein(data[i], data[j])
    return X


def compute_all_pairwise_scores(src_data, tgt_data, metric):
    """Dense matrix of `metric(src, tgt)` for every source/target pair."""
    X = np.empty((len(src_data), len(tgt_data)))
    for i in range(len(src_data)):
        for j in range(len(tgt_data)):
            X[i][j] = metric(src_data[i], tgt_data[j])
    return X


def compute_all_pairwise_meteor_scores(data):
    """Symmetrised pairwise METEOR: since METEOR is asymmetric, average the
    score over both argument orders."""
    X = np.empty((len(data), len(data)))
    for i in range(len(data)):
        for j in range(len(data)):
            X[i][j] = (pairwise_meteor(data[i], data[j]) + pairwise_meteor(data[j], data[i])) / 2
    return X


def edit_distance_custom(data, X, eps=0.5, min_samples=3):
    """Cluster with DBSCAN on a precomputed distance matrix `X`; returns labels.

    `data` is unused here but kept for interface symmetry with
    `edit_distance_dbscan`.
    """
    clustering = DBSCAN(metric="precomputed", eps=eps, min_samples=min_samples).fit(X)
    return clustering.labels_
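

# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the original module). The
# sample data below is hypothetical: candidate_dict / reference_dict map an
# example id to a list of predicted / gold text parts. The METEOR path
# additionally assumes the NLTK "wordnet" corpus and "punkt" tokenizer models
# have been downloaded (nltk.download("wordnet"), nltk.download("punkt")).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    candidate_dict = {
        "ex1": ["the cat sat on the mat", "a dog barked"],
        "ex2": ["it rained all day"],
    }
    reference_dict = {
        "ex1": ["the cat is sitting on the mat"],
        "ex2": ["it was raining the whole day", "the sky was grey"],
    }

    # Count agreement between predicted and gold part lists.
    print(count_stats(candidate_dict, reference_dict))

    # Soft P/R/F1 with METEOR as the pairwise similarity.
    print(f1_metric(candidate_dict, reference_dict, pairwise_meteor))

    # Cluster near-duplicate strings by Levenshtein distance: first via the
    # index-trick wrapper, then via an explicit precomputed distance matrix.
    strings = ["kitten", "sitting", "kitten!", "the quick brown fox"]
    _core_samples, labels = edit_distance_dbscan(strings)
    print(labels)

    distances = compute_all_pairwise_edit_distances(strings)
    print(edit_distance_custom(strings, distances, eps=3, min_samples=2))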