# word_embedding / test4emb.py
# (session/commit residue from the original paste, kept for provenance:)
#   root@autodl-container-32ce119752-f4e7b2aa
#   word_list fix and process script upload
#   7660c6f
import numpy as np
import argparse
import random
path = "/root/autodl-tmp/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5"
def read_vectors(path, topn=0): # read top n word vectors, i.e. top is 10000
lines_num = 0
vectors = []
iw = []
with open(path, encoding='utf-8', errors='ignore') as f:
first_line = True
for line in f:
if first_line:
first_line = False
dim = int(line.rstrip().split()[1])
continue
lines_num += 1
tokens = line.rstrip().split(' ')
vectors.append([float(x) for x in tokens[1:]])
iw.append(tokens[0])
if topn != 0 and lines_num >= topn:
break
return np.array(vectors), np.array(iw)
def main():
vectors_path = "/root/autodl-tmp/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5"
# embedding_matrix, word_list = read_vectors(vectors_path)
# np.save("ZHglove.wordlist.npy", word_list)
# np.save("ZHglove.300d.mat.npy", embedding_matrix)
embedding_matrix = np.load("ZHglove.300d.mat.npy")
word_list = np.load("ZHglove.wordlist.npy")
print(embedding_matrix.shape)
print(word_list.shape)
word2id = {}
if embedding_matrix is not None:
words = []
words_id = []
for i, word in enumerate(word_list):
if word in word2id:
words.append(word)
words_id.append(i)
# assert word not in word2id, "Duplicate words in pre-trained embeddings"
word2id[word] = len(word2id)
embedding_matrix = np.delete(embedding_matrix, words_id, 0)
print(embedding_matrix.shape)
word_list = np.delete(word_list, words_id, 0)
np.save("ZHglove.wordlist.npy", word_list)
np.save("ZHglove.300d.mat.npy", embedding_matrix)
print(word_list.shape)
if __name__ == "__main__":
main()