root@autodl-container-32ce119752-f4e7b2aa committed on
Commit 7660c6f · 1 Parent(s): 2ad8b66

word_list fix and process script upload

Files changed (2)
  1. ZHglove.wordlist.npy +2 -2
  2. test4emb.py +68 -0
ZHglove.wordlist.npy CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1ff372ba013253eb16f8199f756a8d31afaa336beb7c89b57cbba58115f15b64
- size 159219712
+ oid sha256:a81a627b55d86d800194fed251fa598477aa0e901af31da8bb89a6a552476c21
+ size 249297624
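
The new pointer records a different oid and a larger payload (249,297,624 bytes versus 159,219,712), i.e. the wordlist array itself was regenerated, not just re-uploaded. As an optional local check (not part of the commit), the fetched file can be hashed and sized to confirm it matches the pointer, assuming it has already been materialized with `git lfs pull`:

    # Minimal sketch: recompute the blob's SHA-256 and size and compare to the new pointer.
    import hashlib
    import os

    path = "ZHglove.wordlist.npy"
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)

    print(h.hexdigest())          # expect a81a627b55d86d800194fed251fa598477aa0e901af31da8bb89a6a552476c21
    print(os.path.getsize(path))  # expect 249297624
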
test4emb.py ADDED
@@ -0,0 +1,68 @@
+ import numpy as np
+ import argparse
+ import random
+ 
+ path = "/root/autodl-tmp/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5"
+ 
+ 
+ def read_vectors(path, topn=0):  # read the top n word vectors, e.g. topn=10000; topn=0 reads all of them
+     lines_num = 0
+     vectors = []
+     iw = []
+     with open(path, encoding='utf-8', errors='ignore') as f:
+         first_line = True
+         for line in f:
+             if first_line:  # header line: "<vocab_size> <dim>"
+                 first_line = False
+                 dim = int(line.rstrip().split()[1])
+                 continue
+             lines_num += 1
+             tokens = line.rstrip().split(' ')
+             vectors.append([float(x) for x in tokens[1:]])
+             iw.append(tokens[0])
+             if topn != 0 and lines_num >= topn:
+                 break
+     return np.array(vectors), np.array(iw)
+ 
+ 
+ def main():
+     vectors_path = "/root/autodl-tmp/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5"
+     # embedding_matrix, word_list = read_vectors(vectors_path)
+ 
+     # np.save("ZHglove.wordlist.npy", word_list)
+     # np.save("ZHglove.300d.mat.npy", embedding_matrix)
+ 
+     embedding_matrix = np.load("ZHglove.300d.mat.npy")
+     word_list = np.load("ZHglove.wordlist.npy")
+ 
+     print(embedding_matrix.shape)
+     print(word_list.shape)
+ 
+     # collect the indices of duplicate words so they can be dropped from both arrays
+     word2id = {}
+     if embedding_matrix is not None:
+         words = []
+         words_id = []
+         for i, word in enumerate(word_list):
+             if word in word2id:
+                 words.append(word)
+                 words_id.append(i)
+             # assert word not in word2id, "Duplicate words in pre-trained embeddings"
+             word2id[word] = len(word2id)
+ 
+         embedding_matrix = np.delete(embedding_matrix, words_id, 0)
+         print(embedding_matrix.shape)
+         word_list = np.delete(word_list, words_id, 0)
+ 
+         np.save("ZHglove.wordlist.npy", word_list)
+         np.save("ZHglove.300d.mat.npy", embedding_matrix)
+ 
+     print(word_list.shape)
+ 
+ 
+ if __name__ == "__main__":
+     main()
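
Since the script deduplicates word_list and embedding_matrix in lockstep and overwrites both .npy files, a quick follow-up check is that the two arrays still line up and no duplicate words remain. A minimal sketch, assuming test4emb.py has been run in this directory and the vectors are 300-dimensional (per the dim300 source file):

    # Sanity check on the regenerated arrays: matching row counts, no duplicate words.
    import numpy as np

    word_list = np.load("ZHglove.wordlist.npy")
    embedding_matrix = np.load("ZHglove.300d.mat.npy")

    assert embedding_matrix.shape == (word_list.shape[0], 300)
    assert len(set(word_list.tolist())) == len(word_list)  # no duplicates left
    print("rows:", word_list.shape[0], "dim:", embedding_matrix.shape[1])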