# word_embedding / test4emb.py
# (session/commit residue from the original paste, kept for provenance:)
#   root@autodl-container-32ce119752-f4e7b2aa
#   word_list fix and process script upload
#   7660c6f
import numpy as np
import argparse
import random
path = "/root/autodl-tmp/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5"
def read_vectors(path, topn=0): # read top n word vectors, i.e. top is 10000
lines_num = 0
vectors = []
iw = []
with open(path, encoding='utf-8', errors='ignore') as f:
first_line = True
for line in f:
if first_line:
first_line = False
dim = int(line.rstrip().split()[1])
continue
lines_num += 1
tokens = line.rstrip().split(' ')
vectors.append([float(x) for x in tokens[1:]])
iw.append(tokens[0])
if topn != 0 and lines_num >= topn:
break
return np.array(vectors), np.array(iw)
def main():
vectors_path = "/root/autodl-tmp/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5"
# embedding_matrix, word_list = read_vectors(vectors_path)
# np.save("ZHglove.wordlist.npy", word_list)
# np.save("ZHglove.300d.mat.npy", embedding_matrix)
embedding_matrix = np.load("ZHglove.300d.mat.npy")
word_list = np.load("ZHglove.wordlist.npy")
print(embedding_matrix.shape)
print(word_list.shape)
word2id = {}
if embedding_matrix is not None:
words = []
words_id = []
for i, word in enumerate(word_list):
if word in word2id:
words.append(word)
words_id.append(i)
# assert word not in word2id, "Duplicate words in pre-trained embeddings"
word2id[word] = len(word2id)
embedding_matrix = np.delete(embedding_matrix, words_id, 0)
print(embedding_matrix.shape)
word_list = np.delete(word_list, words_id, 0)
np.save("ZHglove.wordlist.npy", word_list)
np.save("ZHglove.300d.mat.npy", embedding_matrix)
print(word_list.shape)
if __name__ == "__main__":
main()