edia_we_es / data /data_loader.py
LMartinezEXEX's picture
Using data_loader to load vector files instead of json.
785b2ef
import pandas as pd
from sklearn.decomposition import PCA
from gensim.models import KeyedVectors
def load_embeddings(path, binary=False, randomPCA=False, limit=None):
    """Load word2vec-format embeddings and project them to 2-D with PCA.

    Args:
        path: Location of the vector file, forwarded to
            ``KeyedVectors.load_word2vec_format``.
        binary: Forwarded as ``binary=`` to the loader (``True`` for the
            binary word2vec format, ``False`` for the text format).
        randomPCA: When true, fit the PCA with the ``'randomized'`` SVD
            solver; otherwise use scikit-learn's default PCA settings.
        limit: Forwarded as ``limit=`` to the loader; ``None`` applies no
            limit.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct lower-cased word
        and three columns:

        * ``'word'``: the lower-cased vocabulary entry,
        * ``'embedding'``: that word's row of ``get_normed_vectors()``,
        * ``'pca'``: the 2-component PCA projection of that row.

        When several vocabulary entries collapse to the same lower-cased
        word, only the first one (in vocabulary order) is kept. The index
        is the original vocabulary position, so it may have gaps.
    """
    if randomPCA:
        pca = PCA(
            n_components=2,
            # copy=True is deliberate: with copy=False scikit-learn centres
            # the input array in place, which would silently replace the
            # normalized vectors stored in the 'embedding' column below with
            # mean-centred ones (and make the two PCA modes disagree).
            copy=True,
            whiten=False,
            svd_solver='randomized',
            iterated_power='auto',
        )
    else:
        pca = PCA(n_components=2)

    print("--------> PATH:", path)
    model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)

    # Cased vocabulary: words exactly as they appear in the vector file.
    cased_words = model.index_to_key
    cased_emb = model.get_normed_vectors()
    cased_pca = pca.fit_transform(cased_emb)

    # zip() walks the two matrices row by row, so each cell of 'embedding'
    # and 'pca' holds the 1-D vector belonging to the word on that row.
    df_cased = pd.DataFrame(
        zip(cased_words, cased_emb, cased_pca),
        columns=['word', 'embedding', 'pca'],
    )

    # Uncased vocabulary: fold case, then keep the first row per word.
    df_cased['word'] = df_cased['word'].str.lower()
    df_uncased = df_cased.drop_duplicates(subset='word')
    return df_uncased
#load_embeddings('data/fasttext-sbwc.100k.vec', limit=1000)