Spaces:
Configuration error
Configuration error
import pandas as pd | |
from sklearn.decomposition import PCA | |
from gensim.models import KeyedVectors | |
def load_embeddings(path, binary=False, randomPCA=False, limit=None):
    """Load word2vec-format embeddings and attach a 2-component PCA projection.

    Args:
        path: Location of the embeddings file, passed to
            ``KeyedVectors.load_word2vec_format``.
        binary: Forwarded to the loader's ``binary`` argument.
        randomPCA: If true, fit the PCA with the randomized SVD solver;
            otherwise use scikit-learn's default PCA settings.
        limit: Forwarded to the loader's ``limit`` argument.

    Returns:
        A DataFrame with the columns ``word`` (lower-cased vocabulary entry),
        ``embedding`` (the entry's normed vector) and ``pca`` (its
        2-component projection). When several vocabulary entries collapse to
        the same lower-cased word, only the first one is kept.
    """
    if randomPCA:
        # copy=True on purpose: with copy=False scikit-learn centers the
        # array passed to fit_transform in place, which would silently turn
        # the normed vectors stored in the 'embedding' column into
        # mean-centered ones and make this branch disagree with the default
        # branch below. The price is one extra copy of the matrix during fit.
        pca = PCA(
            n_components=2,
            copy=True,
            whiten=False,
            svd_solver='randomized',
            iterated_power='auto',
        )
    else:
        pca = PCA(n_components=2)

    print("--------> PATH:", path)
    model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)

    # Cased vocabulary: words exactly as they appear in the model.
    cased_words = model.index_to_key
    cased_emb = model.get_normed_vectors()
    cased_pca = pca.fit_transform(cased_emb)

    df_cased = pd.DataFrame(
        zip(cased_words, cased_emb, cased_pca),
        columns=['word', 'embedding', 'pca'],
    )

    # Fold case, then keep the first row for each lower-cased word.
    df_cased['word'] = df_cased['word'].str.lower()
    df_uncased = df_cased.drop_duplicates(subset='word')
    return df_uncased
#load_embeddings('data/fasttext-sbwc.100k.vec', limit=1000) |