import numpy as np


def read_vectors(path, topn=0):
    """Read word vectors from a text-format embedding file whose first line
    holds the vocabulary size and dimension; topn=0 reads the whole file,
    topn=N reads only the first N vectors (e.g. the top 10000)."""
    lines_num = 0
    vectors = []
    iw = []  # index-to-word list, parallel to `vectors`
    with open(path, encoding='utf-8', errors='ignore') as f:
        first_line = True
        for line in f:
            if first_line:
                # Header line: "<vocab_size> <dim>"
                first_line = False
                dim = int(line.rstrip().split()[1])  # embedding dimension (unused below)
                continue
            lines_num += 1
            tokens = line.rstrip().split(' ')
            vectors.append([float(x) for x in tokens[1:]])
            iw.append(tokens[0])
            if topn != 0 and lines_num >= topn:
                break
    return np.array(vectors), np.array(iw)
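

# Illustrative sketch, not part of the original script: once the embedding
# matrix and word list are in memory, a cosine-similarity lookup is a common
# way to sanity-check the vectors. The function name, the normalization, and
# the `word2id`/`id2word` arguments are assumptions made for illustration.
def most_similar(word, embedding_matrix, word2id, id2word, topn=5):
    # Normalize rows so a plain dot product equals cosine similarity.
    norms = np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
    unit = embedding_matrix / np.clip(norms, 1e-8, None)
    scores = unit @ unit[word2id[word]]
    best = np.argsort(-scores)[1:topn + 1]  # index 0 is the query word itself
    return [(id2word[i], float(scores[i])) for i in best]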
def main():
    vectors_path = "/root/autodl-tmp/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5"
    # First run: parse the raw vector file and cache it as .npy arrays.
    # embedding_matrix, word_list = read_vectors(vectors_path)
    # np.save("ZHglove.wordlist.npy", word_list)
    # np.save("ZHglove.300d.mat.npy", embedding_matrix)
    embedding_matrix = np.load("ZHglove.300d.mat.npy")
    word_list = np.load("ZHglove.wordlist.npy")
    print(embedding_matrix.shape)
    print(word_list.shape)
    # The pre-trained file contains duplicate words: keep the first
    # occurrence of each word and collect the row indices of later
    # occurrences so they can be deleted from both arrays.
    word2id = {}
    if embedding_matrix is not None:
        duplicate_ids = []
        for i, word in enumerate(word_list):
            if word in word2id:
                duplicate_ids.append(i)
            else:
                word2id[word] = len(word2id)
        embedding_matrix = np.delete(embedding_matrix, duplicate_ids, 0)
        print(embedding_matrix.shape)
        word_list = np.delete(word_list, duplicate_ids, 0)
        np.save("ZHglove.wordlist.npy", word_list)
        np.save("ZHglove.300d.mat.npy", embedding_matrix)
        print(word_list.shape)


if __name__ == "__main__":
    main()
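

# Usage sketch under the assumption that main() has already written the
# deduplicated .npy files; `demo` is a hypothetical helper for illustration,
# not something defined elsewhere in this repo.
def demo():
    embedding_matrix = np.load("ZHglove.300d.mat.npy")
    word_list = np.load("ZHglove.wordlist.npy")
    # After deduplication, a word's row index equals its word2id entry.
    word2id = {w: i for i, w in enumerate(word_list)}
    print(most_similar(word_list[0], embedding_matrix, word2id, word_list))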