Spaces:
Configuration error
Configuration error
File size: 1,082 Bytes
a779273 785b2ef a779273 785b2ef a779273 785b2ef a779273 785b2ef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
import pandas as pd
from sklearn.decomposition import PCA
from gensim.models import KeyedVectors
def load_embeddings(path, binary=False, randomPCA=False, limit=None):
    """Load word2vec-format embeddings and attach a 2-D PCA projection.

    The vectors are read with gensim's ``KeyedVectors.load_word2vec_format``,
    taken in their normalised form (``get_normed_vectors``), projected to two
    components with scikit-learn's ``PCA``, and returned as a table with one
    row per lower-cased word.

    Args:
        path: Location of the embeddings file; forwarded to gensim as-is.
        binary: Forwarded to ``load_word2vec_format`` (binary vs. text file).
        randomPCA: When true, PCA is built with ``svd_solver='randomized'``;
            otherwise scikit-learn's default ``PCA(n_components=2)`` is used.
        limit: Forwarded to ``load_word2vec_format``.

    Returns:
        A ``pandas.DataFrame`` with columns ``word`` (lower-cased key),
        ``embedding`` (the normalised vector) and ``pca`` (its 2-component
        projection). When several keys collapse to the same lower-cased
        word, only the first one in file order is kept.
    """
    if randomPCA:
        pca = PCA(
            n_components=2,
            # copy must stay True: with copy=False scikit-learn centres the
            # input array in place, which would silently turn the vectors
            # stored in the 'embedding' column into mean-centred ones.
            copy=True,
            whiten=False,
            svd_solver='randomized',
            iterated_power='auto',
        )
    else:
        pca = PCA(n_components=2)

    print("--------> PATH:", path)
    model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)

    # Cased vocab: keys exactly as they appear in the file, with their vectors.
    cased_words = model.index_to_key
    cased_emb = model.get_normed_vectors()
    cased_pca = pca.fit_transform(cased_emb)

    df_cased = pd.DataFrame(
        zip(cased_words, cased_emb, cased_pca),
        columns=['word', 'embedding', 'pca'],
    )

    # Uncased vocab: lower-case every key, then keep the first row per word.
    df_cased['word'] = df_cased['word'].str.lower()
    df_uncased = df_cased.drop_duplicates(subset='word')
    return df_uncased
# Example usage: load_embeddings('data/fasttext-sbwc.100k.vec', limit=1000)