Spaces:
Configuration error
Configuration error
from modules.module_ann import Ann | |
from memory_profiler import profile | |
from sklearn.neighbors import NearestNeighbors | |
from sklearn.decomposition import PCA | |
from gensim.models import KeyedVectors | |
from typing import List, Any | |
import os | |
import pandas as pd | |
import numpy as np | |
from numpy import dot | |
from gensim import matutils | |
class Embedding:
    """Word-embedding table with 2-D PCA coordinates and neighbour search.

    On construction the word2vec-format file at ``path`` is loaded, every
    vector is projected to two PCA components, words are lower-cased and
    de-duplicated, and the nearest-neighbour backend selected by
    ``nn_method`` is initialised.

    Attributes:
        ds: DataFrame with columns ``'word'``, ``'embedding'`` and ``'pca'``.
        ann: ``Ann`` instance (project module), or ``None`` until the
            ``'ann'`` backend is initialised.
        neigh: fitted ``sklearn.neighbors.NearestNeighbors``, or ``None``
            until the ``'sklearn'`` backend is initialised.
    """

    def __init__(
        self,
        path: str,
        limit: int = None,
        randomizedPCA: bool = False,
        max_neighbors: int = 20,
        nn_method: str = 'sklearn'
    ) -> None:
        """Store the configuration and immediately load the embeddings.

        Args:
            path: Embedding file; treated as binary when it ends in ``.bin``.
            limit: Forwarded as ``limit`` to the gensim loader.
            randomizedPCA: Use the ``'randomized'`` SVD solver for the PCA.
            max_neighbors: Upper bound accepted by ``getNearestNeighbors``.
            nn_method: Backend built at load time: ``'sklearn'`` or ``'ann'``.
        """
        # Embedding vars
        self.path = path
        self.limit = limit
        self.randomizedPCA = randomizedPCA
        self.max_neighbors = max_neighbors
        self.availables_nn_methods = ['sklearn', 'ann']
        self.nn_method = nn_method

        # Full embedding dataset
        self.ds = None

        # Nearest-neighbour estimators (created on demand)
        self.ann = None    # Annoy-based backend
        self.neigh = None  # sklearn backend

        # Load embedding and pca dataset
        self.__load()

    def __load(self) -> None:
        """Build ``self.ds`` and initialise the configured neighbour backend."""
        assert(self.nn_method in self.availables_nn_methods), f"Error: The value of the parameter 'nn method' can only be {self.availables_nn_methods}!"

        print(f"Preparing {os.path.basename(self.path)} embeddings...")

        # --- Prepare dataset ---
        self.ds = self.__preparate(self.path, self.limit, self.randomizedPCA)

        # --- Estimate nearest neighbours ---
        if self.nn_method == 'sklearn':
            self.__init_sklearn_method(
                max_neighbors=self.max_neighbors,
                vectors=self.ds['embedding'].to_list()
            )
        elif self.nn_method == 'ann':
            self.__init_ann_method(
                words=self.ds['word'].to_list(),
                vectors=self.ds['embedding'].to_list(),
                coord=self.ds['pca'].to_list()
            )

    def __preparate(
        self,
        path: str,
        limit: int,
        randomizedPCA: bool
    ) -> pd.DataFrame:
        """Load the vectors and return the lower-cased, de-duplicated table.

        Returns:
            DataFrame with columns ``'word'``, ``'embedding'`` and ``'pca'``.
            Rows keep the index labels they had before de-duplication.

        Raises:
            Exception: if gensim cannot load ``path``; the original error is
                chained as ``__cause__``.
        """
        if randomizedPCA:
            pca = PCA(
                n_components=2,
                copy=False,
                whiten=False,
                svd_solver='randomized',
                iterated_power='auto'
            )
        else:
            pca = PCA(n_components=2)

        try:
            model = KeyedVectors.load_word2vec_format(
                fname=path,
                binary=path.endswith('.bin'),
                limit=limit,
                unicode_errors='ignore'
            )
        except Exception as err:
            # `except Exception` lets KeyboardInterrupt/SystemExit propagate,
            # and chaining keeps the real cause visible in the traceback.
            raise Exception(f"Can't load {path}. If it's a .bin extended file, only gensims c binary format are valid") from err

        # Cased vocab
        cased_words = model.index_to_key
        cased_emb = model.get_normed_vectors()
        cased_pca = pca.fit_transform(cased_emb)

        df_cased = pd.DataFrame(
            zip(cased_words, cased_emb, cased_pca),
            columns=['word', 'embedding', 'pca']
        )

        # Lower-case, then keep the first row for every resulting word.
        df_cased['word'] = df_cased['word'].apply(str.lower)
        return df_cased.drop_duplicates(subset='word')

    def __init_ann_method(
        self,
        words: List[str],
        vectors: List[float],
        coord: List[float],
        n_trees: int = 20,
        metric: str = 'dot'
    ) -> None:
        """Create ``self.ann`` and call its ``init`` with the given settings."""
        print("Initializing Annoy method to search for nearby neighbors...")
        self.ann = Ann(
            words=words,
            vectors=vectors,
            coord=coord,
        )
        self.ann.init(
            n_trees=n_trees,
            metric=metric,
            n_jobs=-1
        )

    def __init_sklearn_method(
        self,
        max_neighbors: int,
        vectors: List[float]
    ) -> None:
        """Fit ``self.neigh`` (sklearn ``NearestNeighbors``) on ``vectors``."""
        print("Initializing sklearn method to search for nearby neighbors...")
        self.neigh = NearestNeighbors(n_neighbors=max_neighbors)
        self.neigh.fit(X=vectors)

    def _word_positions(self) -> dict:
        """Return a cached ``word -> row position`` mapping for ``self.ds``.

        The mapping is rebuilt whenever ``self.ds`` is rebound to a different
        DataFrame object; in-place edits of the same frame are not detected.
        """
        cached = getattr(self, '_word_positions_cache', None)
        if cached is None or cached[0] is not self.ds:
            positions = {}
            for pos, known_word in enumerate(self.ds['word'].to_list()):
                # First occurrence wins, mirroring list.index().
                positions.setdefault(known_word, pos)
            cached = (self.ds, positions)
            self._word_positions_cache = cached
        return cached[1]

    def __getValue(
        self,
        word: str,
        feature: str
    ) -> Any:
        """Return column ``feature`` for ``word``, or ``None`` if unknown.

        A message is printed when the word is not in the vocabulary.
        """
        position = self._word_positions().get(word)
        if position is None:
            print(f"The word '{word}' does not exist")
            return None
        # Positional access: the frame's index labels are not contiguous
        # after de-duplication.
        return self.ds[feature].iloc[position]

    def getEmbedding(
        self,
        word: str
    ) -> np.ndarray:
        """Return the stored embedding of ``word`` (``None`` if unknown)."""
        return self.__getValue(word, 'embedding')

    def getPCA(
        self,
        word: str
    ) -> np.ndarray:
        """Return the stored PCA coordinates of ``word`` (``None`` if unknown)."""
        return self.__getValue(word, 'pca')

    def getNearestNeighbors(
        self,
        word: str,
        n_neighbors: int = 10,
        nn_method: str = 'sklearn'
    ) -> List[str]:
        """Return up to ``n_neighbors`` words closest to ``word``.

        The requested backend is initialised lazily if it was not built at
        load time. An unknown word yields an empty list (and a printed
        message).

        Raises:
            AssertionError: if ``n_neighbors`` exceeds ``max_neighbors`` or
                ``nn_method`` is not one of the available methods.
        """
        assert(n_neighbors <= self.max_neighbors), f"Error: The value of the parameter 'n_neighbors:{n_neighbors}' must less than or equal to {self.max_neighbors}!."
        assert(nn_method in self.availables_nn_methods), f"Error: The value of the parameter 'nn method' can only be {self.availables_nn_methods}!"

        neighbors_list = []

        if word not in self:
            print(f"The word '{word}' does not exist")
            return neighbors_list

        if nn_method == 'ann':
            if self.ann is None:
                self.__init_ann_method(
                    words=self.ds['word'].to_list(),
                    vectors=self.ds['embedding'].to_list(),
                    coord=self.ds['pca'].to_list()
                )
            neighbors_list = self.ann.get(word, n_neighbors)

        elif nn_method == 'sklearn':
            if self.neigh is None:
                self.__init_sklearn_method(
                    max_neighbors=self.max_neighbors,
                    vectors=self.ds['embedding'].to_list()
                )

            word_emb = self.getEmbedding(word).reshape(1, -1)
            # One extra result because the query word is its own neighbour;
            # clamped so a tiny vocabulary does not make kneighbors raise.
            k = min(n_neighbors + 1, len(self.ds))
            _, nn_ids = self.neigh.kneighbors(word_emb, k)

            # Materialise the word column once, not once per neighbour.
            vocabulary = self.ds['word'].to_list()
            candidates = [vocabulary[idx] for idx in nn_ids[0]]
            # Drop the query word by value: with tied distances it is not
            # guaranteed to be the first result.
            neighbors_list = [w for w in candidates if w != word][:n_neighbors]

        return neighbors_list

    def cosineSimilarities(
        self,
        vector_1,
        vectors_all
    ):
        """Return the cosine similarity of ``vector_1`` to each row of ``vectors_all``.

        No guard against zero-norm vectors is applied.
        """
        norm = np.linalg.norm(vector_1)
        all_norms = np.linalg.norm(vectors_all, axis=1)
        dot_products = dot(vectors_all, vector_1)
        similarities = dot_products / (norm * all_norms)
        return similarities

    def getCosineSimilarities(
        self,
        w1,
        w2
    ):
        """Return the dot product of the unit-scaled embeddings of ``w1`` and ``w2``.

        If either word is unknown, ``getEmbedding`` returns ``None`` and that
        value is passed to ``matutils.unitvec`` unchanged.
        """
        return dot(
            matutils.unitvec(self.getEmbedding(w1)),
            matutils.unitvec(self.getEmbedding(w2))
        )

    def __contains__(
        self,
        word: str
    ) -> bool:
        """Return ``True`` when ``word`` is present in the ``'word'`` column."""
        return word in self._word_positions()