Spaces:

ginigen
/

Sign-language

Building

Sign-language / src /synonyms_preprocess.py

Upload synonyms_preprocess.py

fe1339d verified 12 months ago

1.41 kB

	import spacy
	import pickle
	from nltk.corpus import wordnet


	def load_spacy_values(filepath_model_spacy='model_spacy_synonyms', filepath_docs_spacy = 'dict_spacy_object.pkl'):
	    """Load the spaCy pipeline and rebuild the pickled documents.

	    The pickle file is read as a mapping of key -> serialized document
	    bytes; each entry is restored with ``Doc.from_bytes`` against the
	    vocabulary of the pipeline loaded from ``filepath_model_spacy``.

	    Args:
	        filepath_model_spacy: Path or name handed to ``spacy.load``.
	        filepath_docs_spacy: Path of the pickle file holding the
	            serialized documents.

	    Returns:
	        A ``(nlp, dict_docs_spacy)`` tuple: the loaded pipeline and a dict
	        mapping each pickled key to its restored ``Doc``.
	    """
	    pipeline = spacy.load(filepath_model_spacy)

	    # NOTE: pickle.load can execute arbitrary code; only point this at
	    # files that come from a trusted source.
	    with open(filepath_docs_spacy, 'rb') as handle:
	        serialized_docs = pickle.load(handle)

	    restored_docs = {}
	    for key, payload in serialized_docs.items():
	        # A fresh Doc bound to the pipeline's vocab is filled from the bytes.
	        restored_docs[key] = spacy.tokens.Doc(pipeline.vocab).from_bytes(payload)

	    return pipeline, restored_docs

	def find_antonyms(word):
	    """Return the WordNet antonyms of ``word``.

	    Every synset returned by ``wordnet.synsets(word)`` is scanned, and the
	    names of all antonyms of each of its lemmas are collected.

	    Args:
	        word: The word to look up in WordNet.

	    Returns:
	        A set of antonym lemma names; empty when WordNet lists none.
	    """
	    antonyms = set()
	    for synset in wordnet.synsets(word):
	        for lemma in synset.lemmas():
	            # Keep every antonym of the lemma, not just the first one, so
	            # none of them is silently dropped. An empty antonym list
	            # simply adds nothing.
	            antonyms.update(antonym.name() for antonym in lemma.antonyms())
	    return antonyms

	def find_synonyms(word, model, dict_embedding, dict_2000_tokens):
	    """Return the candidate token most similar to ``word``, excluding antonyms.

	    Candidates are taken from ``dict_2000_tokens``. Any candidate that
	    ``find_antonyms(word)`` reports as an antonym is skipped, as is any
	    candidate that has no entry in ``dict_embedding``.

	    Args:
	        word: The word to find a replacement for.
	        model: Callable applied to ``word``; its result is passed to each
	            candidate's ``similarity`` method (presumably a spaCy
	            pipeline -- confirm against the caller).
	        dict_embedding: Mapping from token to an object exposing
	            ``similarity(other)``.
	        dict_2000_tokens: Iterable of candidate tokens.

	    Returns:
	        The candidate with the highest similarity score. On ties, the
	        candidate that appears first in ``dict_2000_tokens`` wins.

	    Raises:
	        IndexError: If no candidate remains after filtering.
	    """
	    antonyms = find_antonyms(word)
	    word_embedding = model(word)

	    scored_candidates = []
	    for token in dict_2000_tokens:
	        if token in antonyms:
	            continue
	        token_embedding = dict_embedding.get(token)
	        if token_embedding is None:
	            # No embedding stored for this candidate: it cannot be scored.
	            continue
	        scored_candidates.append((token, token_embedding.similarity(word_embedding)))

	    if not scored_candidates:
	        # IndexError is what the unguarded lookup raised here before, so
	        # callers catching it keep working; the message is now explicit.
	        raise IndexError(f"no candidate token available for {word!r}")

	    # max() returns the first maximal item, so ties resolve to the earliest
	    # candidate without sorting the whole list.
	    return max(scored_candidates, key=lambda item: item[1])[0]