import pickle

import spacy
from nltk.corpus import wordnet  # requires the WordNet data: nltk.download('wordnet')


def load_spacy_values(filepath_model_spacy='model_spacy_synonyms',
                      filepath_docs_spacy='dict_spacy_object.pkl'):
    """
    Load the spaCy model and the pickled dict of serialized Doc objects,
    then deserialize each Doc against the model's vocab.
    """
    nlp = spacy.load(filepath_model_spacy)

    with open(filepath_docs_spacy, 'rb') as file:
        dict_docs_spacy_bytes = pickle.load(file)

    dict_docs_spacy = {
        key: spacy.tokens.Doc(nlp.vocab).from_bytes(doc_bytes)
        for key, doc_bytes in dict_docs_spacy_bytes.items()
    }

    return nlp, dict_docs_spacy

def find_antonyms(word):
    """
    Collect WordNet antonyms of `word` across all of its synsets and lemmas.
    """
    antonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())
    return antonyms

def find_synonyms(word, model, dict_embedding, dict_2000_tokens):
    """
    Map `word` onto the restricted vocabulary `dict_2000_tokens`: return the
    word itself if it is already in the vocabulary, otherwise the most
    similar vocabulary token, excluding antonyms of `word`.
    """
    if word in dict_2000_tokens:
        return word

    # Never substitute a word with one of its antonyms.
    antonyms = find_antonyms(word)
    candidates = [token for token in dict_2000_tokens if token not in antonyms]

    word_embedding = model(word)

    similarities = [
        (token, dict_embedding[token].similarity(word_embedding))
        for token in candidates
    ]

    # Return the highest-similarity candidate.
    return max(similarities, key=lambda item: item[1])[0]
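

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module. It assumes the
    # default artifacts ('model_spacy_synonyms' and 'dict_spacy_object.pkl')
    # exist on disk, and it uses the keys of the deserialized Doc dict as a
    # stand-in for the real restricted vocabulary.
    nlp, dict_docs_spacy = load_spacy_values()
    vocabulary = list(dict_docs_spacy.keys())
    print(find_synonyms('happy', nlp, dict_docs_spacy, vocabulary))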