# NOTE(review): the following lines are extraction/upload residue (viewer header,
# file size, hash table, column ruler) — not Python. Commented out so the module
# remains importable; safe to delete entirely.
# Spaces: / Building / Building / File size: 1,502 Bytes
# fe1339d c55ac1a fe1339d c55ac1a fe1339d c55ac1a fe1339d c55ac1a fe1339d c55ac1a
# 1 2 3 4 5 6 7 8 9 10 ... 47
import spacy
import pickle
from nltk.corpus import wordnet
def load_spacy_values(filepath_model_spacy='model_spacy_synonyms', filepath_docs_spacy = 'dict_spacy_object.pkl'):
    """Load the spaCy pipeline and a dictionary of serialized Doc objects.

    Parameters
    ----------
    filepath_model_spacy : str
        Path of the spaCy model/pipeline to load.
    filepath_docs_spacy : str
        Path of a pickle file holding ``{key: doc_bytes}`` entries, where
        each value is a ``Doc`` serialized with ``Doc.to_bytes()``.

    Returns
    -------
    tuple
        ``(nlp, dict_docs_spacy)`` — the loaded pipeline and a dict mapping
        each key to its deserialized ``spacy.tokens.Doc``.
    """
    nlp = spacy.load(filepath_model_spacy)

    # NOTE(review): pickle.load on an external file is unsafe for untrusted
    # input — confirm this file is always produced by our own pipeline.
    with open(filepath_docs_spacy, 'rb') as file:
        serialized_docs = pickle.load(file)

    # Rehydrate each byte string into a Doc bound to this pipeline's vocab.
    dict_docs_spacy = {}
    for key, doc_bytes in serialized_docs.items():
        dict_docs_spacy[key] = spacy.tokens.Doc(nlp.vocab).from_bytes(doc_bytes)

    return nlp, dict_docs_spacy
def find_antonyms(word):
    """Return the set of antonym lemma names of *word* found in WordNet.

    For each lemma of each synset of *word*, only the first listed antonym
    is kept (WordNet orders antonyms by salience).

    Parameters
    ----------
    word : str
        Surface form to look up in WordNet.

    Returns
    -------
    set[str]
        Antonym lemma names; empty when WordNet has no antonyms for *word*.
    """
    antonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            # Hoisted: lemma.antonyms() builds a fresh list on every call,
            # so call it once instead of once to test and once to index.
            lemma_antonyms = lemma.antonyms()
            if lemma_antonyms:
                antonyms.add(lemma_antonyms[0].name())
    return antonyms
def find_synonyms(word, model, dict_embedding, dict_2000_tokens):
    """
    Map *word* onto the restricted vocabulary ``dict_2000_tokens``.

    If *word* is already in the vocabulary it is returned unchanged.
    Otherwise the vocabulary token with the highest embedding similarity
    to *word* is returned, after excluding the word's antonyms (so a word
    is never replaced by its opposite).

    Parameters
    ----------
    word : str
        Word to map into the vocabulary.
    model : callable
        Embedding model; ``model(word)`` must return an object supporting
        ``.similarity()`` (e.g. a spaCy ``Doc``).
    dict_embedding : dict
        Maps each vocabulary token to its precomputed embedding object.
    dict_2000_tokens : collection of str
        The restricted target vocabulary.

    Returns
    -------
    str
        *word* itself, or the most similar non-antonym vocabulary token.
    """
    # Fast path: the word is already part of the target vocabulary.
    if word in dict_2000_tokens:
        return word

    # Never substitute a word with one of its antonyms.
    antonyms = find_antonyms(word)
    candidates = [token for token in dict_2000_tokens if token not in antonyms]

    word_embedding = model(word)
    # max() is O(n); the previous full sort just to take the top item was
    # O(n log n). Ties resolve to the first maximal candidate either way.
    return max(
        candidates,
        key=lambda token: dict_embedding.get(token).similarity(word_embedding),
    )