Spaces:
Runtime error
Runtime error
from codecs import open | |
from nltk import word_tokenize | |
from gensim import corpora, models, similarities | |
import re | |
''' | |
Sparse extractive techniques | |
''' | |
def tfidf(docs, query=None, n_tokens=None, n_documents=None):
    """Select the documents most similar to ``query`` under a TF-IDF model.

    Each document is normalised with ``filter_paragraph`` and split on
    whitespace; a gensim dictionary, bag-of-words corpus and ``TfidfModel``
    are built from those tokens.  The query is normalised the same way, and
    the documents are ranked by their ``SparseMatrixSimilarity`` score against
    it, highest first.

    Args:
        docs: Sequence of document strings (it is indexed and iterated more
            than once, so a one-shot iterator will not work).
        query: Query string.  When ``None``, all documents joined with a
            single space are used as the query.
        n_tokens: If given, documents are taken in rank order while the
            running total of ``word_tokenize`` tokens stays within this
            budget; selection stops at the first document that would exceed
            it.  Takes precedence over ``n_documents``.
        n_documents: If given (and ``n_tokens`` is ``None``), the number of
            top-ranked documents to return.

    Returns:
        A list of the selected original (un-normalised) document strings in
        rank order.  The list is empty when ``docs`` is empty or when neither
        ``n_tokens`` nor ``n_documents`` is supplied.
    """
    if not docs:
        return []

    # ``split()`` without an argument discards the empty strings that
    # ``split(' ')`` yields for the leading/trailing space left by
    # ``filter_paragraph``; those would otherwise become a bogus shared token.
    texts = [filter_paragraph(text).split() for text in docs]
    dictionary = corpora.Dictionary(texts)
    feature_cnt = len(dictionary.token2id)

    if feature_cnt == 0:
        # No document contains a usable token, so there is nothing to score;
        # fall back to the input order.
        ranking = list(range(len(docs)))
    else:
        corpus = [dictionary.doc2bow(text) for text in texts]
        tfidf_model = models.TfidfModel(corpus)

        if query is None:
            query = " ".join(docs)
        # The query must go through the same normalisation as the documents,
        # otherwise capitalised or punctuated words never hit the dictionary.
        query_bow = dictionary.doc2bow(filter_paragraph(query).split())

        index = similarities.SparseMatrixSimilarity(
            tfidf_model[corpus], num_features=feature_cnt
        )
        scores = index[tfidf_model[query_bow]]
        ranking = sorted(
            range(len(scores)), key=lambda i: scores[i], reverse=True
        )

    if n_tokens is not None:
        selected = []
        used = 0
        for ind in ranking:
            used += len(word_tokenize(docs[ind]))
            if used > n_tokens:
                break
            selected.append(docs[ind])
        return selected

    if n_documents is not None:
        return [docs[ind] for ind in ranking[:n_documents]]

    # Neither limit supplied: the original contract is an empty selection.
    return []
# Matches every run of characters that is not an ASCII letter, a digit or one
# of the listed Portuguese accented letters (including "à"/"À").
_NON_WORD_RUN = re.compile(r"[^a-zA-ZçÇàÀéêíáâãõôóúûÉÊÍÁÂÃÕÔÓÚÛ0-9]+")


def filter_paragraph(p):
    """Normalise a paragraph to lowercase words separated by single spaces.

    Every run of characters other than ASCII letters, digits and the
    Portuguese accented letters in the pattern (punctuation, quotes,
    whitespace, symbols) is replaced by one space, and the result is
    lowercased.  A run at the very start or end of the text also becomes a
    single space; the result is not stripped.

    Args:
        p: The text to normalise.

    Returns:
        The normalised string.

    >>> filter_paragraph("He is a boy.")
    'he is a boy '
    """
    # A single substitution is enough: padding punctuation with spaces and
    # collapsing repeated spaces beforehand cannot change the outcome, because
    # this pass turns any run of punctuation and spaces into one space anyway.
    return _NON_WORD_RUN.sub(" ", p).lower()