initial commit

- app.py +79 -0
- requirements.txt +3 -0
- tfidf.py +52 -0
app.py
ADDED
@@ -0,0 +1,79 @@
import streamlit as st
from transformers import T5TokenizerFast, T5ForConditionalGeneration
from tfidf import tfidf, filter_paragraph


def remove_doc(i):
    # Remove the i-th reference document, guarding against stale button callbacks.
    if 'docs' in st.session_state:
        if len(st.session_state['docs']) > i:
            st.session_state['docs'].pop(i)


def split_sentences(paragraph):
    # filter_paragraph() pads punctuation with spaces, so sentences end in ' . '
    sentences = paragraph.split(' . ')
    return sentences


st.markdown('## Use o PLSUM para criar leads do Wikipedia automaticamente')
st.markdown('''
Crie resumos no estilo do wikipedia a partir de múltiplos documentos.
Cole textos de referência no formulário abaixo e depois clique em "Gerar resumo".
''')

if 'tokenizer' not in st.session_state:
    with st.sidebar:
        st.info('Carregando o tokenizador')
    st.session_state['tokenizer'] = T5TokenizerFast.from_pretrained("seidel/plsum-base-ptt5")

if 'model' not in st.session_state:
    with st.sidebar:
        st.info('Carregando o modelo')
    st.session_state['model'] = T5ForConditionalGeneration.from_pretrained("seidel/plsum-base-ptt5", use_cache=True)

if 'docs' not in st.session_state:
    st.session_state['docs'] = []

with st.form("my_form", clear_on_submit=True):
    new_doc = st.text_area('Cole um documento de referência aqui')
    # Every form must have a submit button.
    submitted = st.form_submit_button("Adicionar texto")

    if submitted:
        if new_doc:
            st.session_state['docs'].append(filter_paragraph(new_doc))
            st.info('Documento adicionado')
        else:
            st.error('Adicione algum texto')

for i, doc in enumerate(st.session_state['docs']):
    with st.sidebar:
        col1, col2 = st.columns([8, 1])
        with col1:
            with st.expander('Documento {}'.format(i + 1)):
                st.caption(doc)
        with col2:
            st.button('X', key='remove_{}'.format(i), on_click=remove_doc, args=(i,))

query = st.text_input('Título do resumo')
create_summary = st.button('Criar resumo')

if create_summary:
    if query:
        if len(st.session_state['docs']) > 0:
            with st.sidebar:
                st.info('Criando resumo')
            sentences = []
            for doc in st.session_state['docs']:
                sentences = sentences + split_sentences(doc)
            # Keep only the 7 sentences most relevant to the input as a whole.
            filtered_sentences = tfidf(sentences, n_documents=7)
            # Join the TF-IDF-selected sentences into the model input.
            input_text = 'summarize: {} </s> {}'.format(query.lower(), '</s>'.join(filtered_sentences))
            x = st.session_state['tokenizer'](input_text, padding="max_length", max_length=512, return_tensors="pt", truncation=True)
            y = st.session_state['model'].generate(**x)
            summary = st.session_state['tokenizer'].batch_decode(y, skip_special_tokens=True)[0]
            st.markdown('#### {}'.format(query))
            st.markdown('{}'.format(summary))
        else:
            with st.sidebar:
                st.error('Adicione documentos de referência')
    else:
        with st.sidebar:
            st.error('Adicione título para o resumo')
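Note that `generate(**x)` above runs with the library's default generation settings, which historically cap output at around 20 tokens unless the model config overrides them, so leads can come out truncated. A minimal standalone sketch of the same step with explicit settings; the input text, `max_length=256`, and `num_beams=4` are illustrative assumptions, not values from this commit:

import torch
from transformers import T5TokenizerFast, T5ForConditionalGeneration

tokenizer = T5TokenizerFast.from_pretrained("seidel/plsum-base-ptt5")
model = T5ForConditionalGeneration.from_pretrained("seidel/plsum-base-ptt5")

# Same input format app.py builds: "summarize: <title> </s> <sentence></s><sentence>..."
input_text = 'summarize: brasília </s> brasília é a capital federal do brasil</s>a cidade foi inaugurada em 1960'
x = tokenizer(input_text, padding="max_length", max_length=512,
              return_tensors="pt", truncation=True)
with torch.no_grad():
    y = model.generate(**x, max_length=256, num_beams=4, early_stopping=True)
print(tokenizer.batch_decode(y, skip_special_tokens=True)[0])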
requirements.txt
ADDED
@@ -0,0 +1,3 @@
transformers
# torch provides the "pt" tensors and the model backend used in app.py
torch
nltk
gensim
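One caveat: `nltk.word_tokenize` (used in the `n_tokens` branch of tfidf.py) depends on NLTK's Punkt tokenizer data, which pip does not install; without it the call raises `LookupError`. A minimal sketch of fetching it once at startup:

import nltk

# Download the Punkt models on first run; word_tokenize() needs them.
nltk.download('punkt')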
tfidf.py
ADDED
@@ -0,0 +1,52 @@
from nltk import word_tokenize
from gensim import corpora, models, similarities
import re


'''
Sparse extractive techniques
'''


def tfidf(docs, query=None, n_tokens=None, n_documents=None):
    # Bag-of-words over the normalized documents.
    texts = [filter_paragraph(text).replace('  ', ' ').split(' ') for text in docs]
    dictionary = corpora.Dictionary(texts)
    feature_cnt = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf_model = models.TfidfModel(corpus)
    # Without an explicit query, rank each document against all of them concatenated.
    if query is None:
        query = " ".join(docs)
    kw_vector = dictionary.doc2bow(query.replace('  ', ' ').split(' '))
    index = similarities.SparseMatrixSimilarity(tfidf_model[corpus], num_features=feature_cnt)
    scores = index[tfidf_model[kw_vector]]
    # Document indices sorted by similarity to the query, best first.
    to_out_ind = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    to_out = []
    if n_tokens is not None:
        # Take top-ranked documents until the token budget is exceeded.
        n = 0
        for ind in to_out_ind:
            n = n + len(word_tokenize(docs[ind]))
            if n > n_tokens:
                break
            to_out.append(docs[ind])
    elif n_documents is not None:
        for ind in to_out_ind[:n_documents]:
            to_out.append(docs[ind])
    return to_out


def filter_paragraph(p):
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    p = re.sub(r"([?.!,¿()])", r" \1 ", p)
    p = re.sub(r'[" "]+', " ", p)
    # replace everything with a space except a-z, A-Z, digits, ".", "?", "!", ","
    # and the accented letters of Portuguese
    p = re.sub(r"[^a-zA-ZçÇéêíáâãõôóúûÉÊÍÁÂÃÕÔÓÚÛ0-9?.!,]+", " ", p).lower()
    return p
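For a quick sanity check of the two helpers, a small sketch; the sample sentences and the query are made up for illustration:

from tfidf import tfidf, filter_paragraph

# Punctuation gets padded with spaces, accents are kept, text is lowercased,
# e.g. 'o brasil é o maior país da américa do sul . '
print(filter_paragraph('O Brasil é o maior país da América do Sul.'))

docs = [
    'O Brasil é o maior país da América do Sul.',
    'A capital do Brasil é Brasília, inaugurada em 1960.',
    'Futebol é o esporte mais popular do país.',
]
# Keep the two sentences ranked most similar to the query by TF-IDF.
print(tfidf(docs, query='capital do brasil', n_documents=2))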