# plsum_autowiki / tfidf.py — initial commit (3ad0459, seidel)
from codecs import open
from nltk import word_tokenize
from gensim import corpora, models, similarities
import re
'''
Sparse extractive summarization techniques based on TF-IDF similarity.
'''
def tfidf(docs, query=None, n_tokens=None, n_documents=None):
    """Rank ``docs`` by TF-IDF cosine similarity to ``query`` and return the top ones.

    Parameters:
        docs: list of raw text documents to rank.
        query: optional query string; if None, the concatenation of all
            ``docs`` is used, so documents are ranked against the whole
            collection.
        n_tokens: if given, greedily take top-ranked documents until the
            cumulative nltk token count would exceed this budget (the
            document that overflows the budget is excluded).
        n_documents: if ``n_tokens`` is None and this is given, take exactly
            the top ``n_documents`` documents.

    Returns:
        A list of selected documents, best match first. Returns an empty
        list when neither ``n_tokens`` nor ``n_documents`` is provided.
    """
    # NOTE(review): the original .replace(' ', ' ') looks like a no-op; it may
    # have been a non-breaking-space normalization garbled in encoding — kept
    # as-is pending confirmation.
    texts = [filter_paragraph(text).replace(' ', ' ').split(' ') for text in docs]
    dictionary = corpora.Dictionary(texts)
    feature_cnt = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # Renamed from `tfidf`, which shadowed this function's own name.
    tfidf_model = models.TfidfModel(corpus)
    if query is None:
        # No explicit query: use the whole collection as the query text.
        query = " ".join(docs)
    kw_vector = dictionary.doc2bow(query.replace(' ', ' ').split(' '))
    index = similarities.SparseMatrixSimilarity(tfidf_model[corpus], num_features=feature_cnt)
    scores = index[tfidf_model[kw_vector]]
    # Document indices sorted by similarity score, best first.
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    selected = []
    if n_tokens is not None:
        token_count = 0
        for ind in ranked:
            token_count += len(word_tokenize(docs[ind]))
            if token_count > n_tokens:
                break
            selected.append(docs[ind])
    elif n_documents is not None:
        selected = [docs[ind] for ind in ranked[:n_documents]]
    return selected
def filter_paragraph(p):
    """Normalize a paragraph for whitespace tokenization.

    Pads the listed punctuation with spaces (e.g. "he is a boy." ->
    "he is a boy . "), squeezes runs of quotes/spaces, then replaces every
    character outside [ASCII letters, Portuguese accented letters, digits]
    with a single space and lowercases the result. Note this final pass also
    strips the punctuation padded in the first step, leaving spaces behind.
    """
    # Put a space on each side of these punctuation marks.
    spaced = re.sub(r"([?.!,¿()])", r" \1 ", p)
    # Collapse any run of double quotes and/or spaces into one space.
    squeezed = re.sub(r'[" "]+', " ", spaced)
    # Keep only letters (incl. pt-BR accents) and digits; everything else
    # becomes a single space. Lowercase last.
    return re.sub(r"[^a-zA-ZçÇéêíáâãõôóúûÉÊÍÁÂÃÕÔÓÚÛ0-9]+", " ", squeezed).lower()