|
import numpy as np |
|
import argparse |
|
import random |
|
|
|
# Machine-specific absolute path; the file name suggests a 300-dimensional
# word-vector file (presumably input for read_vectors -- TODO confirm).
# NOTE(review): not referenced by any code visible in this file.
path = "/root/autodl-tmp/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5"
|
|
|
|
|
def read_vectors(path, topn=0):
    """Load word vectors from a text embedding file.

    The first line is treated as a header whose second whitespace-separated
    field is the vector dimension. Every following non-blank line is parsed
    as ``word v1 v2 ... vN`` with single-space separators.

    Args:
        path: Path of the embedding file. It is decoded as UTF-8 and
            undecodable bytes are dropped (``errors='ignore'``).
        topn: Maximum number of vectors to read; ``0`` reads all of them.

    Returns:
        A ``(vectors, words)`` tuple of NumPy arrays in which ``vectors[i]``
        holds the values parsed for ``words[i]``. Both are empty when the
        file has no data rows.

    Raises:
        ValueError: If the header dimension or a vector component is not
            numeric, or if a row's length differs from the header dimension.
        IndexError: If the header line has fewer than two fields.
    """
    vectors = []
    words = []
    with open(path, encoding='utf-8', errors='ignore') as f:
        header = next(f, None)
        if header is not None:
            dim = int(header.rstrip().split()[1])
            for line_no, line in enumerate(f, start=2):
                record = line.rstrip()
                if not record:
                    # A blank line would otherwise yield an empty word with a
                    # zero-length vector, producing ragged rows and consuming
                    # one of the `topn` slots.
                    continue
                # Split on the ASCII space only, so that words consisting of
                # other whitespace characters are kept as tokens.
                tokens = record.split(' ')
                values = [float(x) for x in tokens[1:]]
                if len(values) != dim:
                    raise ValueError(
                        f"{path}: line {line_no} has {len(values)} values, "
                        f"expected {dim}"
                    )
                vectors.append(values)
                words.append(tokens[0])
                if topn != 0 and len(words) >= topn:
                    break
    return np.array(vectors), np.array(words)
|
|
|
|
|
def main(matrix_path="ZHglove.300d.mat.npy",
         wordlist_path="ZHglove.wordlist.npy"):
    """Remove duplicate words from a saved word list and embedding matrix.

    Both arrays are loaded with ``np.load``. For every word that has already
    appeared earlier in the word list, the entry at that index is deleted
    from the word list and from axis 0 of the embedding matrix, so the first
    occurrence of each word is kept. The results are then written back over
    the input files. Array shapes are printed before and after.

    Args:
        matrix_path: ``.npy`` file holding the embedding matrix; it is
            overwritten in place.
        wordlist_path: ``.npy`` file holding the word array; it is
            overwritten in place.

    Notes:
        * The same indices are deleted from both arrays, which assumes that
          row ``i`` of the matrix belongs to word ``i`` -- TODO confirm for
          the files used.
        * ``np.save`` appends ``.npy`` to names lacking that suffix, so pass
          paths that already end in ``.npy`` to overwrite the inputs.
    """
    embedding_matrix = np.load(matrix_path)
    word_list = np.load(wordlist_path)

    print(embedding_matrix.shape)
    print(word_list.shape)

    # Collect the index of every repeated word; first occurrences are kept.
    seen = set()
    duplicate_rows = []
    for row, word in enumerate(word_list):
        if word in seen:
            duplicate_rows.append(row)
        else:
            seen.add(word)

    embedding_matrix = np.delete(embedding_matrix, duplicate_rows, 0)
    print(embedding_matrix.shape)
    word_list = np.delete(word_list, duplicate_rows, 0)

    np.save(wordlist_path, word_list)
    np.save(matrix_path, embedding_matrix)

    print(word_list.shape)
|
|
|
# Call main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|