Spaces:

vanessbut
/

tldr_keywords

Build error

File size: 4,821 Bytes

1b80991

import re

import numpy as np
import torch
from scipy.special import softmax
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

def preprocess(strings):
    """
    Flatten every string onto one line and collapse repeated spaces.

    Each newline character becomes a space, after which every run of
    spaces is squeezed into a single space. The list is modified in
    place and the very same list object is returned.

    strings - list of strings.
    """

    for position, raw in enumerate(strings):
        single_line = raw.replace('\n', ' ')
        strings[position] = re.sub(' +', ' ', single_line)

    return strings


def get_candidates(text, nlp, min_df=0.0, ngram_range=(1, 3), max_words=None):
    """
    Get keyword candidates: n-grams of the text that are nouns or noun phrases.

    text - input text.
    nlp  - language analysis tool (see spacy); nlp(text) must be iterable
           over tokens exposing .text, .lemma_ and .pos_, and must expose
           .noun_chunks.
    min_df      - minimal frequency of a term (passed to CountVectorizer).
    ngram_range - range of n-gram sizes allowed in a keyword.
    max_words   - upper bound on the number of n-grams taken from the
                  vectorizer (passed as max_features), hence also an upper
                  bound on the number of returned candidates.

    Returns a list of strings, in the vectorizer's feature order.
    """

    # A blank text has no vocabulary to extract; there is nothing to offer.
    if not text.strip():
        return []

    # Build the raw set of n-grams.
    vectorizer = CountVectorizer(ngram_range=ngram_range,
                                 stop_words="english",
                                 min_df=min_df,
                                 max_features=max_words).fit([text])

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old releases that lack it.
    if hasattr(vectorizer, "get_feature_names_out"):
        candidates = vectorizer.get_feature_names_out().tolist()
    else:
        candidates = list(vectorizer.get_feature_names())

    parsed = nlp(text)

    # Phrases that contain nouns.
    noun_phrases = {chunk.text.strip().lower() for chunk in parsed.noun_chunks}

    # Single nouns, both as written and as lemmas. They are lower-cased
    # because the n-grams above are lower-cased by the vectorizer, so a
    # capitalised noun would otherwise never match.
    nouns = set()
    for token in parsed:
        if token.pos_ == "NOUN":
            nouns.add(token.text.lower())
            nouns.add(token.lemma_.lower())

    allowed = nouns | noun_phrases

    # Keep only the n-grams recognised as a noun or a noun phrase.
    return [candidate for candidate in candidates if candidate in allowed]


def get_embedding(texts, model, tokenizer, chunk_size=128):
    """
    Convert a collection of texts into embeddings.

    The texts are processed in consecutive chunks of at most chunk_size
    items. Each chunk is tokenized with padding and truncation, passed to
    the model, and the "pooler_output" entry of the result is collected.

    texts      - sequence of strings (must support len() and slicing).
    model      - callable that accepts the tokenizer output as keyword
                 arguments and returns a mapping with a "pooler_output"
                 tensor.
    tokenizer  - callable that turns a list of strings into a mapping of
                 PyTorch tensors (it is called with return_tensors="pt").
    chunk_size - maximal number of texts per model call.

    Returns a numpy array with the chunk outputs stacked vertically, i.e.
    one row per text.

    Raises ValueError if texts is empty or chunk_size is not positive.
    """

    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    if len(texts) == 0:
        raise ValueError("texts must contain at least one string")

    embeddings = []

    # Inference only: skip building the autograd graph for every chunk.
    with torch.no_grad():
        for start in range(0, len(texts), chunk_size):
            chunk = texts[start:start + chunk_size]

            chunk_tokens = tokenizer(chunk, padding=True, truncation=True, return_tensors="pt")
            chunk_embeddings = model(**chunk_tokens)["pooler_output"]
            # .cpu() makes the numpy conversion work for non-CPU tensors too.
            embeddings.append(chunk_embeddings.detach().cpu().numpy())

    return np.vstack(embeddings)


def score_candidates(text, candidates, model, tokenizer):
    """
    Rank keyword candidates by their similarity to the whole text.

    text       - the text the candidates were taken from.
    candidates - list of candidate strings.
    model      - embedding model, forwarded to get_embedding.
    tokenizer  - tokenizer, forwarded to get_embedding.

    Returns a 1-D numpy array with one score per candidate: the softmax of
    the standardized cosine similarities between the text embedding and
    each candidate embedding. An empty array is returned for no candidates
    and [1.0] for a single candidate.
    """

    if len(candidates) == 0:
        return np.array([])
    if len(candidates) == 1:
        return np.array([1.0])

    # Embedding of the whole text.
    text_embedding = get_embedding([text], model, tokenizer)

    # Embeddings of the candidates.
    candidate_embeddings = get_embedding(candidates, model, tokenizer)

    # Softmax over the standardized cosine similarities.
    similarities = cosine_similarity(text_embedding, candidate_embeddings)
    spread = np.std(similarities)
    if spread == 0:
        # All candidates are equally similar: dividing by a zero deviation
        # would produce NaN scores, so fall back to a uniform distribution.
        standardized = np.zeros_like(similarities)
    else:
        standardized = (similarities - np.mean(similarities)) / spread

    return softmax(standardized)[0]


def get_keywords(text, nlp, model, tokenizer, top=0.95, max_words=None):
    """
    Extract the best scored keywords of a text.

    text      - input text.
    nlp       - language analysis tool, forwarded to get_candidates.
    model     - embedding model, forwarded to score_candidates.
    tokenizer - tokenizer, forwarded to score_candidates.
    top       - accumulated score after which no more keywords are taken.
    max_words - maximal number of keywords in the result (no limit if None).

    Returns a list of (candidate, score) pairs ordered from the highest
    score to the lowest.
    """
    candidates = get_candidates(text, nlp)
    score = score_candidates(text, candidates, model, tokenizer)

    # Candidates paired with their scores, best first.
    ranked = []
    for position in score.argsort()[::-1]:
        ranked.append((candidates[position], score[position]))

    if max_words is None:
        limit = len(ranked)
    else:
        limit = min(len(ranked), max_words)

    # Take pairs until the limit is reached or the accumulated score
    # has already exceeded the threshold.
    selected = []
    accumulated = 0.0
    position = 0
    while position < limit and not accumulated > top:
        pair = ranked[position]
        selected.append(pair)
        accumulated += pair[1]
        position += 1

    return selected