import re

from gensim import corpora, models, similarities
from nltk import word_tokenize


'''

Sparse extractive techniques

'''

def tfidf(docs, query=None, n_tokens=None, n_documents=None):
    """Rank docs by TF-IDF similarity to query and return the top-ranked ones,
    cut off either by a token budget (n_tokens) or a document count (n_documents)."""
    texts = [filter_paragraph(text).split() for text in docs]
    dictionary = corpora.Dictionary(texts)
    feature_cnt = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf_model = models.TfidfModel(corpus)
    # With no explicit query, score each document against the whole collection.
    if query is None:
        query = " ".join(docs)
    # Normalize the query the same way as the documents so tokens match the dictionary.
    kw_vector = dictionary.doc2bow(filter_paragraph(query).split())
    index = similarities.SparseMatrixSimilarity(tfidf_model[corpus],
                                                num_features=feature_cnt)
    scores = index[tfidf_model[kw_vector]]
    # Document indices, most similar to the query first.
    to_out_ind = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    to_out = []
    if n_tokens is not None:
        # Greedily keep the best-scoring documents until the token budget is spent.
        n = 0
        for ind in to_out_ind:
            n += len(word_tokenize(docs[ind]))
            if n > n_tokens:
                break
            to_out.append(docs[ind])
    elif n_documents is not None:
        to_out = [docs[ind] for ind in to_out_ind[:n_documents]]
    return to_out


def filter_paragraph(p):
    # Put a space between each word and the punctuation that follows it,
    # e.g. "he is a boy." => "he is a boy ."
    p = re.sub(r"([?.!,¿()])", r" \1 ", p)
    # Collapse runs of spaces into a single space.
    p = re.sub(r" +", " ", p)
    # Replace everything except ASCII letters, digits, and accented
    # Portuguese letters with a space, then lowercase.
    p = re.sub(r"[^a-zA-ZçÇéêíáâãõôóúûÉÊÍÁÂÃÕÔÓÚÛ0-9]+", " ", p).lower()
    return p
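

# A minimal usage sketch (not part of the original module): the documents and
# query below are made-up examples. The n_tokens branch relies on NLTK's
# "punkt" tokenizer data, which may need a one-time nltk.download('punkt').
if __name__ == "__main__":
    docs = [
        "The cat sat on the mat.",
        "Dogs are loyal animals.",
        "The mat was new and clean.",
    ]
    print(filter_paragraph(docs[0]))                           # normalized, lowercased text
    print(tfidf(docs, query="cat on a mat", n_documents=1))    # single best match
    print(tfidf(docs, query="cat on a mat", n_tokens=12))      # best matches within a 12-token budget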