# plsum_autowiki / app.py
# Author: seidel — initial commit (3ad0459)
from venv import create
import streamlit as st
from transformers import T5TokenizerFast, T5ForConditionalGeneration
from tfidf import tfidf, filter_paragraph
def remove_doc(i):
    """Remove the i-th reference document from session state.

    Silently does nothing when 'docs' is absent or the index is out of range
    (the button callback may fire after the list has already shrunk).
    """
    docs = st.session_state.get('docs')
    if docs is not None and i < len(docs):
        docs.pop(i)
def split_sentences(paragraph):
    """Split *paragraph* into sentences, using ' . ' as the delimiter."""
    return paragraph.split(' . ')
# Page header and one-time loading of the tokenizer/model into session state.
st.markdown('## Use o PLSUM para criar leads do Wikipedia automaticamente')
# Fix: "a baixo" is a typo for "abaixo" ("below") in the user-facing text.
st.markdown('''
Crie resumos no estilo do wikipedia a partir de multiplos documentos.
Cole textos de referência no formulário abaixo e depois clique em "Gerar resumo".
''')
# Heavy objects are cached in st.session_state so they load only once per
# session, not on every Streamlit rerun.
if 'tokenizer' not in st.session_state:
    with st.sidebar:
        st.info('Carregando o tokenizador')
        st.session_state['tokenizer'] = T5TokenizerFast.from_pretrained("seidel/plsum-base-ptt5")
if 'model' not in st.session_state:
    with st.sidebar:
        st.info('Carregando o modelo')
        st.session_state['model'] = T5ForConditionalGeneration.from_pretrained("seidel/plsum-base-ptt5", use_cache=True)
# Reference documents pasted by the user accumulate here.
st.session_state.setdefault('docs', [])
# Form for pasting a new reference document; cleared after each submission.
with st.form("my_form", clear_on_submit=True):
    new_doc = st.text_area('Cole um documento de referência aqui')
    # Every Streamlit form requires a submit button.
    if st.form_submit_button("Adicionar texto"):
        if new_doc:
            # Store the cleaned-up paragraph, not the raw paste.
            st.session_state['docs'].append(filter_paragraph(new_doc))
            st.info('Documento adicionado')
        else:
            st.error('Adicione algum texto')
# Sidebar: one collapsible preview per stored document, plus a remove button.
for idx, document in enumerate(st.session_state['docs']):
    with st.sidebar:
        preview_col, remove_col = st.columns([8, 1])
        with preview_col:
            with st.expander('Documento {}'.format(idx + 1)):
                st.caption(document)
        with remove_col:
            # on_click callback runs before the rerun, so the list shrinks
            # before the loop above is re-evaluated.
            st.button('X', key='remove_{}'.format(idx), on_click=remove_doc, args=(idx,))
query = st.text_input('Título do resumo')
create_summary = st.button('Criar resumo')
if create_summary:
    if query:
        if st.session_state['docs']:
            with st.sidebar:
                st.info('Criando resumo')
            # Gather every sentence from all reference documents.
            sentences = []
            for doc in st.session_state['docs']:
                sentences.extend(split_sentences(doc))
            # Keep only the most relevant sentences for the query context.
            filtered_sentences = tfidf(sentences, n_documents=7)
            # BUG FIX: the original joined the unfiltered `sentences`, making
            # the tfidf selection dead code — the 512-token truncation then
            # dropped content arbitrarily instead of keeping the best sentences.
            input_text = 'summarize: {} </s> {}'.format(query.lower(), '</s>'.join(filtered_sentences))
            x = st.session_state['tokenizer'](input_text, padding="max_length", max_length=512, return_tensors="pt", truncation=True)
            y = st.session_state['model'].generate(**x)
            # batch_decode returns a list; a single input yields one summary.
            summary = st.session_state['tokenizer'].batch_decode(y, skip_special_tokens=True)[0]
            st.markdown('#### {}'.format(query))
            st.markdown('{}'.format(summary))
        else:
            with st.sidebar:
                st.error('Adicione documentos de referência')
    else:
        with st.sidebar:
            st.error('Adicione título para o resumo')