File size: 2,873 Bytes
3ad0459
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from venv import create
import streamlit as st
from transformers import T5TokenizerFast, T5ForConditionalGeneration
from tfidf import tfidf, filter_paragraph


def remove_doc(i):
  """Remove the reference document at index ``i`` from the session, if present.

  Used as an ``on_click`` callback for the per-document 'X' buttons; silently
  does nothing when the docs list is missing or ``i`` is out of range.
  """
  if ('docs' in st.session_state) and (i < len(st.session_state['docs'])):
    st.session_state['docs'].pop(i)

def split_sentences(paragraph):
  """Break a filtered paragraph into sentences on the ``' . '`` delimiter.

  Assumes the text was normalized upstream (see ``filter_paragraph``) so that
  sentence boundaries appear as a space-padded period — TODO confirm.
  """
  return paragraph.split(' . ')

# Page title and usage instructions (Portuguese UI text).
# Fixes orthography in user-facing strings: "a baixo" -> "abaixo",
# "multiplos" -> "múltiplos".
st.markdown('## Use o PLSUM para criar leads do Wikipedia automaticamente')
st.markdown('''
      Crie resumos no estilo do wikipedia a partir de múltiplos documentos. 
      Cole textos de referência no formulário abaixo e depois clique em "Gerar resumo".
  ''')

# Lazily load the tokenizer and model once per session, announcing progress
# in the sidebar. Streamlit re-runs this script on every interaction, so the
# `not in session_state` guards prevent repeated (slow) downloads.
if 'tokenizer' not in st.session_state:
  with st.sidebar:
    st.info('Carregando o tokenizador')
  st.session_state['tokenizer'] = T5TokenizerFast.from_pretrained("seidel/plsum-base-ptt5")

if 'model' not in st.session_state:
  with st.sidebar:
    st.info('Carregando o modelo')
  st.session_state['model'] = T5ForConditionalGeneration.from_pretrained("seidel/plsum-base-ptt5", use_cache=True)

# Accumulated reference documents for the summary.
if 'docs' not in st.session_state:
  st.session_state['docs'] = []

# Form for pasting one reference document at a time; cleared after submit.
with st.form("my_form", clear_on_submit=True):
  new_doc = st.text_area('Cole um documento de referência aqui')
  # Every form must have a submit button.
  submitted = st.form_submit_button("Adicionar texto")

  if submitted:
    # Truthiness covers both None and the empty string — idiomatic
    # replacement for the original `new_doc != None and new_doc != ''`.
    if new_doc:
      st.session_state['docs'].append(filter_paragraph(new_doc))
      st.info('Documento adicionado')
    else:
      st.error('Adicione algum texto')

# List every collected document in the sidebar, each inside an expander,
# with an 'X' button that removes it via the remove_doc callback.
for idx, doc_text in enumerate(st.session_state['docs']):
  with st.sidebar:
    text_col, remove_col = st.columns([8, 1])
    with text_col:
      with st.expander(f'Documento {idx + 1}'):
        st.caption(doc_text)
    with remove_col:
      st.button('X', key=f'remove_{idx}', on_click=remove_doc, args=(idx,))

query = st.text_input('Título do resumo')
create_summary = st.button('Criar resumo')

if create_summary:
  # Truthiness rejects both None and an empty title (idiomatic form of the
  # original `query != None and query != ''`).
  if query:
    if st.session_state['docs']:
      with st.sidebar:
        st.info('Criando resumo')
      # Flatten all reference documents into one sentence list.
      sentences = []
      for doc in st.session_state['docs']:
        sentences.extend(split_sentences(doc))
      # Keep only the top-ranked sentences so the input fits the model window.
      filtered_sentences = tfidf(sentences, n_documents=7)
      # BUG FIX: the original joined the unfiltered `sentences`, silently
      # discarding the tf-idf selection computed on the line above.
      input_text = 'summarize: {} </s> {}'.format(query.lower(), '</s>'.join(filtered_sentences))
      x = st.session_state['tokenizer'](input_text, padding="max_length", max_length=512, return_tensors="pt", truncation=True)
      y = st.session_state['model'].generate(**x)
      summary = st.session_state['tokenizer'].batch_decode(y, skip_special_tokens=True)[0]
      st.markdown('#### {}'.format(query))
      st.markdown('{}'.format(summary))
    else:
      with st.sidebar:
        st.error('Adicione documentos de referência')
  else:
    with st.sidebar:
      st.error('Adicione título para o resumo')