import spacy
import pickle
from nltk.corpus import wordnet


def load_spacy_values(model="en_core_web_md", filepath_docs_spacy='dict_spacy_object.pkl'):
    '''
    Loads a spaCy language model and a dictionary of spaCy Doc objects from a pickle file.

    Parameters
    ----------
    model : str
        The name or local path of the spaCy model to be loaded for processing text.
        For example, "en_core_web_sm" or a custom model path.
    filepath_docs_spacy : str
        The path to the pickle file containing a dictionary where the keys are tokens
        (strings) and the values are the corresponding serialized spaCy Doc objects.

    Returns
    -------
    nlp : spacy.language.Language
        The loaded spaCy language model.
    dict_docs_spacy : dict
        A dictionary where the keys are tokens (strings) and the values are spaCy Doc
        objects reconstructed from the serialized bytes stored in the pickle file.
    '''
    # ---- Load the spaCy NLP model
    #
    nlp = spacy.load(model)

    # ---- Load the pickle file and reconstruct the dictionary with tokens as keys and spaCy Doc objects as values
    #
    with open(filepath_docs_spacy, 'rb') as file:
        dict_docs_spacy_bytes = pickle.load(file)

    dict_docs_spacy = {key: spacy.tokens.Doc(nlp.vocab).from_bytes(doc_bytes) for key, doc_bytes in dict_docs_spacy_bytes.items()}

    return nlp, dict_docs_spacy
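

# ---- Example usage of load_spacy_values (a minimal sketch; it assumes the
# ---- "en_core_web_md" model is installed and that 'dict_spacy_object.pkl'
# ---- sits next to this script, as in the default arguments above):
#
#     nlp, dict_docs_spacy = load_spacy_values()
#     print(type(nlp))             # the loaded spaCy pipeline (spacy.lang.en.English)
#     print(len(dict_docs_spacy))  # number of pre-computed Doc objects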


def find_antonyms(word):
    '''
    Generates the set of all antonyms of a given word.

    Parameters
    ----------
    word : str
        The word for which we want to find the antonyms.

    Returns
    -------
    antonyms : set of str
        A set of all the antonyms detected using NLTK and WordNet.
    '''
    antonyms = set()

    # ---- Load all the synsets (sets of synonyms) of the word recorded in WordNet
    #
    syn_set = wordnet.synsets(word)

    # ---- Loop over each synset
    #
    for syn in syn_set:

        # ---- Loop over each synonym (lemma)
        #
        for lemma in syn.lemmas():

            # ---- Add the antonyms of the synonym to the antonyms set
            #
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())

    return antonyms
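

# ---- Example usage of find_antonyms (a minimal sketch; it assumes the WordNet
# ---- corpus has been downloaded once with nltk.download('wordnet')):
#
#     print(find_antonyms('happy'))   # typically includes 'unhappy'
#     print(find_antonyms('zzzzzz'))  # words unknown to WordNet yield an empty set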


def find_synonyms(word, model, dict_embedding, list_2000_tokens):
    '''
    Finds the most similar token to a given word.

    Parameters
    ----------
    word : str
        The word for which we want to find the most similar token.
    model : spacy.language.Language
        The spaCy language model to use for synonym detection.
    dict_embedding : dict
        A dictionary where the keys are tokens (str) and the values are spaCy Doc objects.
    list_2000_tokens : list of str
        A list of 2000 tokens against which the word will be checked.

    Returns
    -------
    most_similar_token : str
        The most similar token to the given word.
    '''
    # ---- Skip synonym detection if the word is already in list_2000_tokens
    #
    if word in list_2000_tokens:
        return word
    else:
        # ---- Remove the antonyms of the given word from list_2000_tokens (a word and its antonym can have similar embedding representations)
        #
        antonyms = find_antonyms(word)
        list_2000_tokens_less_antonyms = [token for token in list_2000_tokens if token not in antonyms]

        # ---- Generate a list of tuples (token, similarity between the embedding of the given word and the embedding of each token of list_2000_tokens)
        #
        word_embedding = model(word)
        similarities = []
        for token in list_2000_tokens_less_antonyms:
            similarities.append((token, dict_embedding.get(token).similarity(word_embedding)))

        # ---- Extract the most similar token of the list
        #
        most_similar_token = sorted(similarities, key=lambda item: -item[1])[0][0]

        return most_similar_token
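

# ---- End-to-end sketch tying the three functions together (assumptions: the
# ---- pickle file expected by load_spacy_values exists, and the 2000-token
# ---- vocabulary is approximated here by the keys of the loaded dictionary;
# ---- adapt to the real token list of the application).
#
if __name__ == '__main__':

    # ---- Load the model and the pre-computed Doc objects
    #
    nlp, dict_docs_spacy = load_spacy_values()

    # ---- Hypothetical vocabulary: the tokens whose embeddings were pre-computed
    #
    list_2000_tokens = list(dict_docs_spacy.keys())

    # ---- Map an out-of-vocabulary word to its closest in-vocabulary token
    #
    print(find_synonyms('car', nlp, dict_docs_spacy, list_2000_tokens))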