File size: 940 Bytes
3648e12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import numpy as np
from typing import Callable

from sklearn.cluster import AgglomerativeClustering


def condense_labels(labels: np.ndarray, embedding_func: Callable, threshold: float = 0.5) -> dict:
    """Map every label to a representative label of its embedding cluster.

    The labels are embedded with ``embedding_func``, grouped with
    ``AgglomerativeClustering`` (``n_clusters=None`` and
    ``distance_threshold=threshold``), and each group is collapsed onto the
    member whose embedding lies closest (Euclidean norm) to the group's
    centroid.

    NOTE(review): no metric or linkage is passed to the estimator, so its
    defaults apply. Whether that amounts to grouping "cosine-similar" labels
    depends on those defaults and on how ``embedding_func`` normalises its
    output -- confirm against the caller.

    Args:
        labels: Sequence of labels; indexed by integer position and used as
            dictionary keys, so the items must be hashable.
        embedding_func: Callable receiving ``labels`` and returning one
            embedding vector per label (anything ``np.array`` can turn into
            a 2-D array).
        threshold: Passed as ``distance_threshold`` to the clustering
            estimator.

    Returns:
        A dict ``{label: representative_label}`` with an entry for every
        distinct label. Empty input gives ``{}``; a single label maps to
        itself.
    """
    # ``len`` rather than truthiness: a multi-element ndarray has no
    # unambiguous truth value.
    n_labels = len(labels)
    if n_labels == 0:
        return {}
    if n_labels == 1:
        # Nothing to merge, and the clustering estimator needs at least two
        # samples, so short-circuit before embedding/clustering.
        return {labels[0]: labels[0]}

    embeddings = np.array(embedding_func(labels))

    clustering = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=threshold,
    ).fit(embeddings)

    mapping = {}
    for cluster_id in range(clustering.n_clusters_):
        members = np.where(clustering.labels_ == cluster_id)[0]
        member_embeddings = embeddings[members]

        # Per-dimension mean. Without ``axis=0`` the mean collapses to a
        # single scalar, which is not a point in embedding space.
        centroid = member_embeddings.mean(axis=0)

        distances = np.linalg.norm(member_embeddings - centroid, axis=1)
        representative = labels[members[np.argmin(distances)]]

        for member in members:
            mapping[labels[member]] = representative

    return mapping