import numpy as np


def read_vectors(path, topn=0):
    """Read word vectors from a text-format embedding file whose first line
    holds the vocabulary size and dimension; topn=0 reads the whole file,
    topn=N reads only the first N vectors (e.g. the top 10000)."""
    lines_num = 0
    vectors = []
    iw = []  # index-to-word list, parallel to `vectors`
    with open(path, encoding='utf-8', errors='ignore') as f:
        first_line = True
        for line in f:
            if first_line:
                # Header line: "<vocab_size> <dim>"
                first_line = False
                dim = int(line.rstrip().split()[1])  # embedding dimension (unused below)
                continue
            lines_num += 1
            tokens = line.rstrip().split(' ')
            vectors.append([float(x) for x in tokens[1:]])
            iw.append(tokens[0])
            if topn != 0 and lines_num >= topn:
                break
    return np.array(vectors), np.array(iw)
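

# Illustrative sketch, not part of the original script: once the embedding
# matrix and word list are in memory, a cosine-similarity lookup is a common
# way to sanity-check the vectors. The function name, the normalization, and
# the `word2id`/`id2word` arguments are assumptions made for illustration.
def most_similar(word, embedding_matrix, word2id, id2word, topn=5):
    # Normalize rows so a plain dot product equals cosine similarity.
    norms = np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
    unit = embedding_matrix / np.clip(norms, 1e-8, None)
    scores = unit @ unit[word2id[word]]
    best = np.argsort(-scores)[1:topn + 1]  # index 0 is the query word itself
    return [(id2word[i], float(scores[i])) for i in best]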
def main():
    vectors_path = "/root/autodl-tmp/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5"
    # First run: parse the raw vector file and cache it as .npy arrays.
    # embedding_matrix, word_list = read_vectors(vectors_path)
    # np.save("ZHglove.wordlist.npy", word_list)
    # np.save("ZHglove.300d.mat.npy", embedding_matrix)
    embedding_matrix = np.load("ZHglove.300d.mat.npy")
    word_list = np.load("ZHglove.wordlist.npy")
    print(embedding_matrix.shape)
    print(word_list.shape)
    # The pre-trained file contains duplicate words: keep the first
    # occurrence of each word and collect the row indices of later
    # occurrences so they can be deleted from both arrays.
    word2id = {}
    if embedding_matrix is not None:
        duplicate_ids = []
        for i, word in enumerate(word_list):
            if word in word2id:
                duplicate_ids.append(i)
            else:
                word2id[word] = len(word2id)
        embedding_matrix = np.delete(embedding_matrix, duplicate_ids, 0)
        print(embedding_matrix.shape)
        word_list = np.delete(word_list, duplicate_ids, 0)
        np.save("ZHglove.wordlist.npy", word_list)
        np.save("ZHglove.300d.mat.npy", embedding_matrix)
        print(word_list.shape)


if __name__ == "__main__":
    main()
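

# Usage sketch under the assumption that main() has already written the
# deduplicated .npy files; `demo` is a hypothetical helper for illustration,
# not something defined elsewhere in this repo.
def demo():
    embedding_matrix = np.load("ZHglove.300d.mat.npy")
    word_list = np.load("ZHglove.wordlist.npy")
    # After deduplication, a word's row index equals its word2id entry.
    word2id = {w: i for i, w in enumerate(word_list)}
    print(most_similar(word_list[0], embedding_matrix, word2id, word_list))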