import spacy
import pickle
from nltk.corpus import wordnet


def load_spacy_values(model="en_core_web_md", filepath_docs_spacy='dict_spacy_object.pkl'):
    '''
    Loads a spaCy language model and a dictionary of spaCy Doc objects from a pickle file.

    Parameters
    ----------
    model : str
        The name or local path of the spaCy model to be loaded for processing text.
        For example, "en_core_web_sm" or a custom model path.
    filepath_docs_spacy : str
        The path to the pickle file containing a dictionary where the keys are tokens
        (strings) and the values are the corresponding serialized spaCy Doc objects.

    Returns
    -------
    nlp : spacy.language.Language
        The loaded spaCy language model.
    dict_docs_spacy : dict
        A dictionary where the keys are tokens (strings) and the values are spaCy Doc
        objects reconstructed from the serialized bytes stored in the pickle file.
    '''
    # ---- Load the spaCy NLP model
    #
    nlp = spacy.load(model)

    # ---- Load the pickle file and reconstruct the dictionary with tokens as keys and spaCy Doc objects as values
    #
    with open(filepath_docs_spacy, 'rb') as file:
        dict_docs_spacy_bytes = pickle.load(file)

    dict_docs_spacy = {key: spacy.tokens.Doc(nlp.vocab).from_bytes(doc_bytes) for key, doc_bytes in dict_docs_spacy_bytes.items()}

    return nlp, dict_docs_spacy
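

# ---- Example usage of load_spacy_values (a minimal sketch; it assumes the
# ---- "en_core_web_md" model is installed and that 'dict_spacy_object.pkl'
# ---- sits next to this script, as in the default arguments above):
#
#     nlp, dict_docs_spacy = load_spacy_values()
#     print(type(nlp))             # the loaded spaCy pipeline (spacy.lang.en.English)
#     print(len(dict_docs_spacy))  # number of pre-computed Doc objects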


def find_antonyms(word):
    '''
    Generates the set of all antonyms of a given word.

    Parameters
    ----------
    word : str
        The word for which we want to find the antonyms.

    Returns
    -------
    antonyms : set of str
        A set of all the antonyms detected using NLTK and WordNet.
    '''
    antonyms = set()

    # ---- Load all the synsets (sets of synonyms) of the word recorded in WordNet
    #
    syn_set = wordnet.synsets(word)

    # ---- Loop over each synset
    #
    for syn in syn_set:

        # ---- Loop over each synonym (lemma)
        #
        for lemma in syn.lemmas():

            # ---- Add the antonyms of the synonym to the antonyms set
            #
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())

    return antonyms
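

# ---- Example usage of find_antonyms (a minimal sketch; it assumes the WordNet
# ---- corpus has been downloaded once with nltk.download('wordnet')):
#
#     print(find_antonyms('happy'))   # typically includes 'unhappy'
#     print(find_antonyms('zzzzzz'))  # words unknown to WordNet yield an empty set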


def find_synonyms(word, model, dict_embedding, list_2000_tokens):
    '''
    Finds the most similar token to a given word.

    Parameters
    ----------
    word : str
        The word for which we want to find the most similar token.
    model : spacy.language.Language
        The spaCy language model to use for synonym detection.
    dict_embedding : dict
        A dictionary where the keys are tokens (str) and the values are spaCy Doc objects.
    list_2000_tokens : list of str
        A list of 2000 tokens against which the word will be checked.

    Returns
    -------
    most_similar_token : str
        The most similar token to the given word.
    '''
    # ---- Skip synonym detection if the word is already in list_2000_tokens
    #
    if word in list_2000_tokens:
        return word
    else:
        # ---- Remove the antonyms of the given word from list_2000_tokens (a word and its antonym can have similar embedding representations)
        #
        antonyms = find_antonyms(word)
        list_2000_tokens_less_antonyms = [token for token in list_2000_tokens if token not in antonyms]

        # ---- Generate a list of tuples (token, similarity between the embedding of the given word and the embedding of each token of list_2000_tokens)
        #
        word_embedding = model(word)
        similarities = []
        for token in list_2000_tokens_less_antonyms:
            similarities.append((token, dict_embedding.get(token).similarity(word_embedding)))

        # ---- Extract the most similar token of the list
        #
        most_similar_token = sorted(similarities, key=lambda item: -item[1])[0][0]

        return most_similar_token
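

# ---- End-to-end sketch tying the three functions together (assumptions: the
# ---- pickle file expected by load_spacy_values exists, and the 2000-token
# ---- vocabulary is approximated here by the keys of the loaded dictionary;
# ---- adapt to the real token list of the application).
#
if __name__ == '__main__':

    # ---- Load the model and the pre-computed Doc objects
    #
    nlp, dict_docs_spacy = load_spacy_values()

    # ---- Hypothetical vocabulary: the tokens whose embeddings were pre-computed
    #
    list_2000_tokens = list(dict_docs_spacy.keys())

    # ---- Map an out-of-vocabulary word to its closest in-vocabulary token
    #
    print(find_synonyms('car', nlp, dict_docs_spacy, list_2000_tokens))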