File size: 1,082 Bytes
a779273
 
 
 
 
 
 
 
 
 
 
 
 
 
 
785b2ef
a779273
 
 
785b2ef
 
a779273
 
 
 
 
 
 
 
 
 
 
 
 
785b2ef
a779273
785b2ef
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import pandas as pd
from sklearn.decomposition import PCA
from gensim.models import KeyedVectors

def load_embeddings(path, binary=False, randomPCA=False, limit=None):
    """Load word2vec-format embeddings and add a 2-D PCA projection.

    Parameters
    ----------
    path : str
        Path to an embeddings file in word2vec text/binary format.
    binary : bool
        Whether the file at *path* is in binary word2vec format.
    randomPCA : bool
        If True, use the faster approximate randomized SVD solver;
        otherwise let scikit-learn pick the solver ('auto').
    limit : int or None
        Maximum number of vectors to load from the file (None = all).

    Returns
    -------
    pandas.DataFrame
        Columns: 'word' (lower-cased), 'embedding' (unit-normed vector),
        'pca' (2-D projection). Duplicate lower-cased words keep their
        first occurrence only.
    """
    # The two original branches differed only in the SVD solver
    # (whiten=False and iterated_power='auto' are sklearn defaults).
    # NOTE: copy=False was dropped — with it, fit_transform could mutate
    # the array returned by get_normed_vectors() in place, corrupting the
    # model's cached normed vectors.
    solver = 'randomized' if randomPCA else 'auto'
    pca = PCA(n_components=2, svd_solver=solver)

    model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)

    # Cased vocabulary: fit PCA on the full set of unit-normed vectors.
    cased_words = model.index_to_key
    cased_emb = model.get_normed_vectors()
    cased_pca = pca.fit_transform(cased_emb)

    df_cased = pd.DataFrame(
        zip(cased_words, cased_emb, cased_pca),
        columns=['word', 'embedding', 'pca'],
    )

    # Lower-case, then deduplicate. word2vec files are frequency-sorted,
    # so drop_duplicates (keep='first') retains the most frequent casing
    # of each word.
    df_cased['word'] = df_cased['word'].str.lower()
    df_uncased = df_cased.drop_duplicates(subset='word')
    return df_uncased

#load_embeddings('data/fasttext-sbwc.100k.vec', limit=1000)