seidel committed on
Commit 3ad0459 · 1 Parent(s): 25439f3

initial commit

Files changed (3)
  1. app.py +79 -0
  2. requirements.txt +3 -0
  3. tfidf.py +52 -0
app.py ADDED
@@ -0,0 +1,79 @@
+import streamlit as st
+from transformers import T5TokenizerFast, T5ForConditionalGeneration
+from tfidf import tfidf, filter_paragraph
+
+
+def remove_doc(i):
+    # Remove the i-th reference document from the session state.
+    if 'docs' in st.session_state:
+        if len(st.session_state['docs']) > i:
+            st.session_state['docs'].pop(i)
+
+
+def split_sentences(paragraph):
+    # Documents are stored already filtered, so sentences are separated by ' . '.
+    sentences = paragraph.split(' . ')
+    return sentences
+
+
+st.markdown('## Use o PLSUM para criar leads do Wikipedia automaticamente')
+st.markdown('''
+Crie resumos no estilo do Wikipedia a partir de múltiplos documentos.
+Cole textos de referência no formulário abaixo e depois clique em "Gerar resumo".
+''')
+
+# Load the tokenizer and the model once and keep them in the session state.
+if 'tokenizer' not in st.session_state:
+    with st.sidebar:
+        st.info('Carregando o tokenizador')
+    st.session_state['tokenizer'] = T5TokenizerFast.from_pretrained("seidel/plsum-base-ptt5")
+
+if 'model' not in st.session_state:
+    with st.sidebar:
+        st.info('Carregando o modelo')
+    st.session_state['model'] = T5ForConditionalGeneration.from_pretrained("seidel/plsum-base-ptt5", use_cache=True)
+
+if 'docs' not in st.session_state:
+    st.session_state['docs'] = []
+
+with st.form("my_form", clear_on_submit=True):
+    new_doc = st.text_area('Cole um documento de referência aqui')
+    # Every form must have a submit button.
+    submitted = st.form_submit_button("Adicionar texto")
+
+if submitted:
+    if new_doc is not None and new_doc != '':
+        st.session_state['docs'].append(filter_paragraph(new_doc))
+        st.info('Documento adicionado')
+    else:
+        st.error('Adicione algum texto')
+
+# Show the stored reference documents in the sidebar, each with a remove button.
+for i, doc in enumerate(st.session_state['docs']):
+    with st.sidebar:
+        col1, col2 = st.columns([8, 1])
+        with col1:
+            with st.expander('Documento {}'.format(i + 1)):
+                st.caption(doc)
+        with col2:
+            st.button('X', key='remove_{}'.format(i), on_click=remove_doc, args=(i,))
+
+query = st.text_input('Título do resumo')
+create_summary = st.button('Criar resumo')
+
+if create_summary:
+    if query is not None and query != '':
+        if len(st.session_state['docs']) > 0:
+            with st.sidebar:
+                st.info('Criando resumo')
+            # Extractive step: collect sentences from all documents and keep the
+            # most relevant ones according to TF-IDF.
+            sentences = []
+            for doc in st.session_state['docs']:
+                sentences = sentences + split_sentences(doc)
+            filtered_sentences = tfidf(sentences, n_documents=7)
+            # Abstractive step: condition the T5 model on the query and the selected sentences.
+            input_text = 'summarize: {} </s> {}'.format(query.lower(), '</s>'.join(filtered_sentences))
+            x = st.session_state['tokenizer'](input_text, padding="max_length", max_length=512, return_tensors="pt", truncation=True)
+            y = st.session_state['model'].generate(**x)
+            summary = st.session_state['tokenizer'].batch_decode(y, skip_special_tokens=True)[0]
+            st.markdown('#### {}'.format(query))
+            st.markdown('{}'.format(summary))
+        else:
+            with st.sidebar:
+                st.error('Adicione documentos de referência')
+    else:
+        with st.sidebar:
+            st.error('Adicione título para o resumo')
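
For reference, the same summarization pipeline can be exercised outside of Streamlit. The snippet below is a minimal, untested sketch assuming the seidel/plsum-base-ptt5 checkpoint and the tfidf/filter_paragraph helpers from tfidf.py in this commit; the generation settings (num_beams, max_length) and the example inputs are illustrative, not taken from the app.

```python
# Minimal sketch of the PLSUM pipeline without Streamlit (see assumptions above).
from transformers import T5TokenizerFast, T5ForConditionalGeneration
from tfidf import tfidf, filter_paragraph

tokenizer = T5TokenizerFast.from_pretrained("seidel/plsum-base-ptt5")
model = T5ForConditionalGeneration.from_pretrained("seidel/plsum-base-ptt5")

# Reference documents and summary title (made-up examples).
docs = [filter_paragraph(d) for d in ["Primeiro texto de referência ...",
                                      "Segundo texto de referência ..."]]
query = "título do resumo"

# Extractive step: split into sentences and keep the 7 highest-scoring ones.
sentences = []
for doc in docs:
    sentences += doc.split(' . ')
filtered_sentences = tfidf(sentences, n_documents=7)

# Abstractive step: condition the T5 model on the query plus the selected sentences.
input_text = 'summarize: {} </s> {}'.format(query.lower(), '</s>'.join(filtered_sentences))
inputs = tokenizer(input_text, padding="max_length", max_length=512,
                   truncation=True, return_tensors="pt")
outputs = model.generate(**inputs, num_beams=4, max_length=256)  # illustrative settings
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```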
requirements.txt ADDED
@@ -0,0 +1,3 @@
+transformers
+nltk
+gensim
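
Note that app.py also imports streamlit, and the transformers T5 classes need torch at runtime (loading the fast tokenizer from a SentencePiece checkpoint typically needs sentencepiece as well); those packages are not listed here and presumably have to be installed separately.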
tfidf.py ADDED
@@ -0,0 +1,52 @@
+from nltk import word_tokenize
+from gensim import corpora, models, similarities
+import re
+
+
+'''
+Sparse extractive techniques
+'''
+
+
+def tfidf(docs, query=None, n_tokens=None, n_documents=None):
+    # Build a TF-IDF index over the candidate documents (or sentences).
+    texts = [filter_paragraph(text).split(' ') for text in docs]
+    dictionary = corpora.Dictionary(texts)
+    feature_cnt = len(dictionary.token2id)
+    corpus = [dictionary.doc2bow(text) for text in texts]
+    tfidf_model = models.TfidfModel(corpus)
+    # Without an explicit query, score each document against the whole collection.
+    if query is None:
+        query = " ".join(docs)
+    kw_vector = dictionary.doc2bow(query.split(' '))
+    index = similarities.SparseMatrixSimilarity(tfidf_model[corpus], num_features=feature_cnt)
+    scores = index[tfidf_model[kw_vector]]
+    # Document indices sorted by decreasing similarity to the query.
+    to_out_ind = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
+    to_out = []
+    if n_tokens is not None:
+        # Keep the best-scoring documents until the token budget is exceeded.
+        n = 0
+        for ind in to_out_ind:
+            n = n + len(word_tokenize(docs[ind]))
+            if n > n_tokens:
+                break
+            to_out.append(docs[ind])
+    elif n_documents is not None:
+        # Keep only the n_documents best-scoring documents.
+        for ind in to_out_ind[:n_documents]:
+            to_out.append(docs[ind])
+    return to_out
+
+
+def filter_paragraph(p):
+    # Create a space between a word and the punctuation following it,
+    # e.g. "he is a boy." => "he is a boy ."
+    p = re.sub(r"([?.!,¿()])", r" \1 ", p)
+    p = re.sub(r'[" "]+', " ", p)
+    # Replace everything except letters (including Portuguese accented letters)
+    # and digits with a space, then lowercase.
+    p = re.sub(r"[^a-zA-ZçÇéêíáâãõôóúûÉÊÍÁÂÃÕÔÓÚÛ0-9]+", " ", p).lower()
+    return p
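
To see the extractive step in isolation, here is a small, untested usage sketch of the helpers above; the example sentences and queries are made up, and the n_tokens branch assumes nltk's 'punkt' tokenizer data has been downloaded.

```python
# Hypothetical usage of tfidf()/filter_paragraph() (example data is made up).
from tfidf import tfidf, filter_paragraph

docs = [
    filter_paragraph("O PLSUM gera resumos no estilo da Wikipédia."),
    filter_paragraph("A etapa extrativa usa TF-IDF para ranquear sentenças."),
    filter_paragraph("Futebol é o esporte mais popular do Brasil."),
]

# Rank the documents against a query and keep the 2 most similar ones.
print(tfidf(docs, query="resumos wikipédia", n_documents=2))

# Alternatively, cap the result by an approximate token budget
# (word_tokenize needs: python -c "import nltk; nltk.download('punkt')").
print(tfidf(docs, query="resumos wikipédia", n_tokens=20))
```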