root@autodl-container-32ce119752-f4e7b2aa committed on
Commit 7660c6f · 1 Parent(s): 2ad8b66

word_list fix and process script upload

Files changed (2)
  1. ZHglove.wordlist.npy +2 -2
  2. test4emb.py +68 -0
ZHglove.wordlist.npy CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1ff372ba013253eb16f8199f756a8d31afaa336beb7c89b57cbba58115f15b64
- size 159219712
+ oid sha256:a81a627b55d86d800194fed251fa598477aa0e901af31da8bb89a6a552476c21
+ size 249297624
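
The new pointer records a different oid and a larger payload (249,297,624 bytes versus 159,219,712), i.e. the wordlist array itself was regenerated, not just re-uploaded. As an optional local check (not part of the commit), the fetched file can be hashed and sized to confirm it matches the pointer, assuming it has already been materialized with `git lfs pull`:

    # Minimal sketch: recompute the blob's SHA-256 and size and compare to the new pointer.
    import hashlib
    import os

    path = "ZHglove.wordlist.npy"
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)

    print(h.hexdigest())          # expect a81a627b55d86d800194fed251fa598477aa0e901af31da8bb89a6a552476c21
    print(os.path.getsize(path))  # expect 249297624
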
test4emb.py ADDED
@@ -0,0 +1,68 @@
+ import numpy as np
+ import argparse
+ import random
+ 
+ path = "/root/autodl-tmp/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5"
+ 
+ 
+ def read_vectors(path, topn=0):  # read the top n word vectors, e.g. topn=10000; topn=0 reads all of them
+     lines_num = 0
+     vectors = []
+     iw = []
+     with open(path, encoding='utf-8', errors='ignore') as f:
+         first_line = True
+         for line in f:
+             if first_line:  # header line: "<vocab_size> <dim>"
+                 first_line = False
+                 dim = int(line.rstrip().split()[1])
+                 continue
+             lines_num += 1
+             tokens = line.rstrip().split(' ')
+             vectors.append([float(x) for x in tokens[1:]])
+             iw.append(tokens[0])
+             if topn != 0 and lines_num >= topn:
+                 break
+     return np.array(vectors), np.array(iw)
+ 
+ 
+ def main():
+     vectors_path = "/root/autodl-tmp/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5"
+     # embedding_matrix, word_list = read_vectors(vectors_path)
+ 
+     # np.save("ZHglove.wordlist.npy", word_list)
+     # np.save("ZHglove.300d.mat.npy", embedding_matrix)
+ 
+     embedding_matrix = np.load("ZHglove.300d.mat.npy")
+     word_list = np.load("ZHglove.wordlist.npy")
+ 
+     print(embedding_matrix.shape)
+     print(word_list.shape)
+ 
+     # collect the indices of duplicate words so they can be dropped from both arrays
+     word2id = {}
+     if embedding_matrix is not None:
+         words = []
+         words_id = []
+         for i, word in enumerate(word_list):
+             if word in word2id:
+                 words.append(word)
+                 words_id.append(i)
+             # assert word not in word2id, "Duplicate words in pre-trained embeddings"
+             word2id[word] = len(word2id)
+ 
+         embedding_matrix = np.delete(embedding_matrix, words_id, 0)
+         print(embedding_matrix.shape)
+         word_list = np.delete(word_list, words_id, 0)
+ 
+         np.save("ZHglove.wordlist.npy", word_list)
+         np.save("ZHglove.300d.mat.npy", embedding_matrix)
+ 
+     print(word_list.shape)
+ 
+ 
+ if __name__ == "__main__":
+     main()
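
Since the script deduplicates word_list and embedding_matrix in lockstep and overwrites both .npy files, a quick follow-up check is that the two arrays still line up and no duplicate words remain. A minimal sketch, assuming test4emb.py has been run in this directory and the vectors are 300-dimensional (per the dim300 source file):

    # Sanity check on the regenerated arrays: matching row counts, no duplicate words.
    import numpy as np

    word_list = np.load("ZHglove.wordlist.npy")
    embedding_matrix = np.load("ZHglove.300d.mat.npy")

    assert embedding_matrix.shape == (word_list.shape[0], 300)
    assert len(set(word_list.tolist())) == len(word_list)  # no duplicates left
    print("rows:", word_list.shape[0], "dim:", embedding_matrix.shape[1])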