Spaces:

vialibre
/

edia_we_es

Configuration error

App Files Files Community

edia_we_es / modules /model_embbeding.py

LMartinezEXEX

Added config for centralization.

421e27d over 2 years ago

raw

history blame contribute delete

7.08 kB

	from modules.module_ann import Ann
	from memory_profiler import profile
	from sklearn.neighbors import NearestNeighbors
	from sklearn.decomposition import PCA
	from gensim.models import KeyedVectors
	from typing import List, Any
	import os
	import pandas as pd

	import numpy as np
	from numpy import dot
	from gensim import matutils


	class Embedding:
	    """Word-embedding table with 2-D PCA coordinates and neighbor search.

	    The vectors are loaded from a word2vec-format file, L2-normalised,
	    projected to two PCA components and stored in ``self.ds``, a
	    ``DataFrame`` with the columns ``word`` (lower-cased, de-duplicated),
	    ``embedding`` and ``pca``.  Nearest neighbors are resolved either with
	    scikit-learn's ``NearestNeighbors`` (``'sklearn'``) or with the
	    project's ``Ann`` wrapper (``'ann'``).

	    Supports ``word in embedding`` through ``__contains__``.
	    """

	    def __init__(
	        self,
	        path: str,
	        limit: int = None,
	        randomizedPCA: bool = False,
	        max_neighbors: int = 20,
	        nn_method: str = 'sklearn'
	    ) -> None:
	        """Store the configuration and load the embedding right away.

	        Args:
	            path: Location of the word2vec-format file.  A name ending in
	                ``.bin`` is read as binary, anything else as text.
	            limit: Forwarded to gensim's loader as ``limit``; ``None``
	                applies no limit.
	            randomizedPCA: Use the randomized SVD solver for the PCA.
	            max_neighbors: Upper bound accepted by ``getNearestNeighbors``
	                and default ``n_neighbors`` of the sklearn index.
	            nn_method: Index built at load time, ``'sklearn'`` or ``'ann'``.

	        Raises:
	            AssertionError: If ``nn_method`` is not a supported method.
	            Exception: If the embedding file cannot be loaded.
	        """
	        # Embedding vars
	        self.path = path
	        self.limit = limit
	        self.randomizedPCA = randomizedPCA
	        self.max_neighbors = max_neighbors

	        self.availables_nn_methods = ['sklearn', 'ann']
	        self.nn_method = nn_method

	        # Full embedding dataset
	        self.ds = None

	        # (dataset, {word: row position}) pair, built lazily on first lookup
	        self._positions_cache = None

	        # Nearest-neighbor estimators, created on demand
	        self.ann = None    # Approximate, through the Ann wrapper
	        self.neigh = None  # Exact, through sklearn

	        # Load embedding and pca dataset
	        self.__load()

	    def __load(
	        self,
	    ) -> None:
	        """Build ``self.ds`` and the neighbor index chosen by ``nn_method``."""
	        # NOTE: validation relies on ``assert`` and is skipped under ``python -O``.
	        assert(self.nn_method in self.availables_nn_methods), f"Error: The value of the parameter 'nn method' can only be {self.availables_nn_methods}!"

	        print(f"Preparing {os.path.basename(self.path)} embeddings...")

	        # --- Prepare dataset ---
	        self.ds = self.__preparate(
	            self.path, self.limit, self.randomizedPCA
	        )

	        # --- Estimate Nearest Neighbors ---
	        if self.nn_method == 'sklearn':
	            # Method A: through the sklearn estimator
	            self.__init_sklearn_method(
	                max_neighbors=self.max_neighbors,
	                vectors=self.ds['embedding'].to_list()
	            )

	        elif self.nn_method == 'ann':
	            # Method B: through the Ann wrapper
	            self.__init_ann_method(
	                words=self.ds['word'].to_list(),
	                vectors=self.ds['embedding'].to_list(),
	                coord=self.ds['pca'].to_list()
	            )

	    def __preparate(
	        self,
	        path: str,
	        limit: int,
	        randomizedPCA: bool
	    ) -> pd.DataFrame:
	        """Load the vectors and return the ``word``/``embedding``/``pca`` table.

	        Words are lower-cased after loading and only the first row of each
	        lower-cased word is kept.  The returned frame keeps the index
	        labels of the rows that survived, so they may be non-contiguous.

	        Raises:
	            Exception: If gensim cannot load ``path``; the original error is
	                chained as ``__cause__``.
	        """
	        if randomizedPCA:
	            pca = PCA(
	                n_components=2,
	                copy=False,
	                whiten=False,
	                svd_solver='randomized',
	                iterated_power='auto'
	            )

	        else:
	            pca = PCA(
	                n_components=2
	            )

	        try:
	            model = KeyedVectors.load_word2vec_format(
	                fname=path,
	                binary=path.endswith('.bin'),
	                limit=limit,
	                unicode_errors='ignore'
	            )
	        except Exception as err:
	            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
	            # SystemExit propagate; chaining keeps the real cause visible.
	            raise Exception(f"Can't load {path}. If it's a .bin extended file, only gensims c binary format are valid") from err

	        # Cased vocabulary
	        cased_words = model.index_to_key
	        cased_emb = model.get_normed_vectors()
	        cased_pca = pca.fit_transform(cased_emb)

	        df_cased = pd.DataFrame(
	            zip(
	                cased_words,
	                cased_emb,
	                cased_pca
	            ),
	            columns=['word', 'embedding', 'pca']
	        )

	        df_cased['word'] = df_cased.word.apply(lambda w: w.lower())
	        df_uncased = df_cased.drop_duplicates(subset='word')
	        return df_uncased

	    def __init_ann_method(
	        self,
	        words: List[str],
	        vectors: List[float],
	        coord: List[float],
	        n_trees: int = 20,
	        metric: str = 'dot'
	    ) -> None:
	        """Create ``self.ann`` from the given data and initialise it.

	        ``n_trees`` and ``metric`` are forwarded to ``Ann.init`` together
	        with ``n_jobs=-1``.
	        """
	        print("Initializing Annoy method to search for nearby neighbors...")
	        self.ann = Ann(
	            words=words,
	            vectors=vectors,
	            coord=coord,
	        )

	        self.ann.init(
	            n_trees=n_trees,
	            metric=metric,
	            n_jobs=-1
	        )

	    def __init_sklearn_method(
	        self,
	        max_neighbors: int,
	        vectors: List[float]
	    ) -> None:
	        """Create ``self.neigh`` and fit it on ``vectors``."""
	        print("Initializing sklearn method to search for nearby neighbors...")
	        self.neigh = NearestNeighbors(
	            n_neighbors=max_neighbors
	        )
	        self.neigh.fit(
	            X=vectors
	        )

	    def __word_positions(self):
	        """Return the ``{word: row position}`` map of the current dataset.

	        The map is built once and reused; it is rebuilt when ``self.ds`` is
	        rebound to another object.  In-place edits of the ``word`` column
	        are not detected.  For repeated words the first row wins.
	        """
	        cached = getattr(self, '_positions_cache', None)
	        if cached is None or cached[0] is not self.ds:
	            positions = {}
	            for pos, known_word in enumerate(self.ds['word'].to_list()):
	                positions.setdefault(known_word, pos)
	            cached = (self.ds, positions)
	            self._positions_cache = cached
	        return cached[1]

	    def __getValue(
	        self,
	        word: str,
	        feature: str
	    ) -> Any:
	        """Return column ``feature`` for ``word``, or ``None`` if unknown.

	        A message is printed when the word is not in the dataset.
	        """
	        pos = self.__word_positions().get(word)
	        if pos is None:
	            print(f"The word '{word}' does not exist")
	            return None

	        # Positional access: the frame's index labels may be non-contiguous.
	        return self.ds[feature].iat[pos]

	    def getEmbedding(
	        self,
	        word: str
	    ) -> np.ndarray:
	        """Return the stored embedding of ``word`` (``None`` if unknown)."""
	        return self.__getValue(word, 'embedding')

	    def getPCA(
	        self,
	        word: str
	    ) -> np.ndarray:
	        """Return the stored PCA coordinates of ``word`` (``None`` if unknown)."""
	        return self.__getValue(word, 'pca')

	    def getNearestNeighbors(
	        self,
	        word: str,
	        n_neighbors: int = 10,
	        nn_method: str = 'sklearn'
	    ) -> List[str]:
	        """Return up to ``n_neighbors`` words closest to ``word``.

	        The index for ``nn_method`` is built on first use if it was not
	        created at load time.  An unknown word prints a message and yields
	        an empty list.

	        Raises:
	            AssertionError: If ``n_neighbors`` exceeds ``max_neighbors`` or
	                ``nn_method`` is not supported (checks are skipped under
	                ``python -O``).
	        """
	        assert(n_neighbors <= self.max_neighbors), f"Error: The value of the parameter 'n_neighbors:{n_neighbors}' must less than or equal to {self.max_neighbors}!."

	        assert(nn_method in self.availables_nn_methods), f"Error: The value of the parameter 'nn method' can only be {self.availables_nn_methods}!"

	        neighbors_list = []

	        if word not in self:
	            print(f"The word '{word}' does not exist")
	            return neighbors_list

	        if nn_method == 'ann':
	            if self.ann is None:
	                self.__init_ann_method(
	                    words=self.ds['word'].to_list(),
	                    vectors=self.ds['embedding'].to_list(),
	                    coord=self.ds['pca'].to_list()
	                )
	            neighbors_list = self.ann.get(word, n_neighbors)

	        elif nn_method == 'sklearn':
	            if self.neigh is None:
	                self.__init_sklearn_method(
	                    max_neighbors=self.max_neighbors,
	                    vectors=self.ds['embedding'].to_list()
	                )

	            word_emb = self.getEmbedding(word).reshape(1, -1)
	            # One extra neighbor is requested because the first hit is
	            # dropped below (presumably the query word itself).
	            _, nn_ids = self.neigh.kneighbors(word_emb, n_neighbors + 1)
	            # Materialise the word column once, not once per neighbor.
	            all_words = self.ds['word'].to_list()
	            neighbors_list = [all_words[idx] for idx in nn_ids[0]][1:]

	        return neighbors_list

	    def cosineSimilarities(
	        self,
	        vector_1,
	        vectors_all
	    ):
	        """Return the cosine similarity of ``vector_1`` to each row of ``vectors_all``.

	        No guard is applied for zero-norm vectors.
	        """
	        norm = np.linalg.norm(vector_1)
	        all_norms = np.linalg.norm(vectors_all, axis=1)
	        dot_products = dot(vectors_all, vector_1)
	        similarities = dot_products / (norm * all_norms)
	        return similarities

	    def getCosineSimilarities(
	        self,
	        w1,
	        w2
	    ):
	        """Return the dot product of the unit-scaled embeddings of two words.

	        NOTE(review): an unknown word makes ``getEmbedding`` return ``None``,
	        which is passed straight to ``matutils.unitvec`` -- confirm callers
	        check membership first.
	        """
	        return dot(
	            matutils.unitvec(self.getEmbedding(w1)),
	            matutils.unitvec(self.getEmbedding(w2))
	        )

	    def __contains__(
	        self,
	        word: str
	    ) -> bool:
	        """Return ``True`` when ``word`` is present in the dataset."""
	        return word in self.__word_positions()