File size: 1,082 Bytes
a779273
 
 
 
 
 
 
 
 
 
 
 
 
 
 
785b2ef
a779273
 
 
785b2ef
 
a779273
 
 
 
 
 
 
 
 
 
 
 
 
785b2ef
a779273
785b2ef
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import pandas as pd
from sklearn.decomposition import PCA
from gensim.models import KeyedVectors

def load_embeddings(path, binary=False, randomPCA=False, limit=None):
    """Load word2vec-format embeddings and add a 2-D PCA projection.

    Parameters
    ----------
    path : str
        Path to an embeddings file in word2vec text/binary format.
    binary : bool
        Whether the file at *path* is in binary word2vec format.
    randomPCA : bool
        If True, use the faster approximate randomized SVD solver;
        otherwise let scikit-learn pick the solver ('auto').
    limit : int or None
        Maximum number of vectors to load from the file (None = all).

    Returns
    -------
    pandas.DataFrame
        Columns: 'word' (lower-cased), 'embedding' (unit-normed vector),
        'pca' (2-D projection). Duplicate lower-cased words keep their
        first occurrence only.
    """
    # The two original branches differed only in the SVD solver
    # (whiten=False and iterated_power='auto' are sklearn defaults).
    # NOTE: copy=False was dropped — with it, fit_transform could mutate
    # the array returned by get_normed_vectors() in place, corrupting the
    # model's cached normed vectors.
    solver = 'randomized' if randomPCA else 'auto'
    pca = PCA(n_components=2, svd_solver=solver)

    model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)

    # Cased vocabulary: fit PCA on the full set of unit-normed vectors.
    cased_words = model.index_to_key
    cased_emb = model.get_normed_vectors()
    cased_pca = pca.fit_transform(cased_emb)

    df_cased = pd.DataFrame(
        zip(cased_words, cased_emb, cased_pca),
        columns=['word', 'embedding', 'pca'],
    )

    # Lower-case, then deduplicate. word2vec files are frequency-sorted,
    # so drop_duplicates (keep='first') retains the most frequent casing
    # of each word.
    df_cased['word'] = df_cased['word'].str.lower()
    df_uncased = df_cased.drop_duplicates(subset='word')
    return df_uncased

#load_embeddings('data/fasttext-sbwc.100k.vec', limit=1000)