Figea committed on
Commit fe1339d · verified · 1 Parent(s): b825309

Upload synonyms_preprocess.py

Files changed (1): src/synonyms_preprocess.py (+43, -0)
src/synonyms_preprocess.py ADDED
@@ -0,0 +1,43 @@
import spacy
import pickle
from nltk.corpus import wordnet


def load_spacy_values(filepath_model_spacy='model_spacy_synonyms', filepath_docs_spacy='dict_spacy_object.pkl'):
    """Load the spaCy model and the pre-computed spaCy Doc objects serialized in a pickle file."""
    nlp = spacy.load(filepath_model_spacy)

    with open(filepath_docs_spacy, 'rb') as file:
        dict_docs_spacy_bytes = pickle.load(file)

    # Rebuild each Doc from its serialized bytes using the loaded model's vocab
    dict_docs_spacy = {
        key: spacy.tokens.Doc(nlp.vocab).from_bytes(doc_bytes)
        for key, doc_bytes in dict_docs_spacy_bytes.items()
    }

    return nlp, dict_docs_spacy


def find_antonyms(word):
    """Collect antonyms of `word` from every WordNet synset lemma that defines one."""
    antonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())
    return antonyms


def find_synonyms(word, model, dict_embedding, dict_2000_tokens):  # cluster_to_words, dbscan_model
    """
    Find the most similar word to `word` in the same cluster, excluding its antonyms.
    """
    antonyms = find_antonyms(word)
    dict_2000_tokens_less_antonyms = [token for token in dict_2000_tokens if token not in antonyms]

    word_embedding = model(word)

    # Score each remaining candidate by spaCy similarity to the query word
    similarities = []
    for token in dict_2000_tokens_less_antonyms:
        similarities.append((token, dict_embedding.get(token).similarity(word_embedding)))

    # Return the candidate with the highest similarity score
    most_similar_token = sorted(similarities, key=lambda item: -item[1])[0][0]

    return most_similar_token
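
A minimal usage sketch of these helpers, assuming the spaCy model directory (`model_spacy_synonyms`) and the pickled Doc dictionary (`dict_spacy_object.pkl`) referenced by the default arguments are available locally, that the WordNet corpus has been downloaded, and that the module is importable from `src/`; the candidate vocabulary passed as `dict_2000_tokens` is taken here to be the keys of the loaded Doc dictionary:

    # Usage sketch (hypothetical file locations; not part of the commit)
    import nltk
    nltk.download('wordnet')                   # WordNet data is required by find_antonyms

    from synonyms_preprocess import load_spacy_values, find_antonyms, find_synonyms

    nlp, dict_docs_spacy = load_spacy_values()     # uses the default paths above
    candidate_tokens = list(dict_docs_spacy.keys())

    print(find_antonyms('happy'))                                          # e.g. {'unhappy'}
    print(find_synonyms('happy', nlp, dict_docs_spacy, candidate_tokens))  # closest non-antonym token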