seidel committed on
Commit 3ad0459 · 1 Parent(s): 25439f3

initial commit

Files changed (3)
  1. app.py +79 -0
  2. requirements.txt +3 -0
  3. tfidf.py +52 -0
app.py ADDED
@@ -0,0 +1,79 @@
+import streamlit as st
+from transformers import T5TokenizerFast, T5ForConditionalGeneration
+from tfidf import tfidf, filter_paragraph
+
+
+def remove_doc(i):
+    # Remove the i-th reference document from the session state.
+    if 'docs' in st.session_state:
+        if len(st.session_state['docs']) > i:
+            st.session_state['docs'].pop(i)
+
+
+def split_sentences(paragraph):
+    # Documents are stored already filtered, so sentences are separated by ' . '.
+    sentences = paragraph.split(' . ')
+    return sentences
+
+
+st.markdown('## Use o PLSUM para criar leads do Wikipedia automaticamente')
+st.markdown('''
+Crie resumos no estilo do Wikipedia a partir de múltiplos documentos.
+Cole textos de referência no formulário abaixo e depois clique em "Gerar resumo".
+''')
+
+# Load the tokenizer and the model once and keep them in the session state.
+if 'tokenizer' not in st.session_state:
+    with st.sidebar:
+        st.info('Carregando o tokenizador')
+    st.session_state['tokenizer'] = T5TokenizerFast.from_pretrained("seidel/plsum-base-ptt5")
+
+if 'model' not in st.session_state:
+    with st.sidebar:
+        st.info('Carregando o modelo')
+    st.session_state['model'] = T5ForConditionalGeneration.from_pretrained("seidel/plsum-base-ptt5", use_cache=True)
+
+if 'docs' not in st.session_state:
+    st.session_state['docs'] = []
+
+with st.form("my_form", clear_on_submit=True):
+    new_doc = st.text_area('Cole um documento de referência aqui')
+    # Every form must have a submit button.
+    submitted = st.form_submit_button("Adicionar texto")
+
+if submitted:
+    if new_doc is not None and new_doc != '':
+        st.session_state['docs'].append(filter_paragraph(new_doc))
+        st.info('Documento adicionado')
+    else:
+        st.error('Adicione algum texto')
+
+# Show the stored reference documents in the sidebar, each with a remove button.
+for i, doc in enumerate(st.session_state['docs']):
+    with st.sidebar:
+        col1, col2 = st.columns([8, 1])
+        with col1:
+            with st.expander('Documento {}'.format(i + 1)):
+                st.caption(doc)
+        with col2:
+            st.button('X', key='remove_{}'.format(i), on_click=remove_doc, args=(i,))
+
+query = st.text_input('Título do resumo')
+create_summary = st.button('Criar resumo')
+
+if create_summary:
+    if query is not None and query != '':
+        if len(st.session_state['docs']) > 0:
+            with st.sidebar:
+                st.info('Criando resumo')
+            # Extractive step: collect sentences from all documents and keep the
+            # most relevant ones according to TF-IDF.
+            sentences = []
+            for doc in st.session_state['docs']:
+                sentences = sentences + split_sentences(doc)
+            filtered_sentences = tfidf(sentences, n_documents=7)
+            # Abstractive step: condition the T5 model on the query and the selected sentences.
+            input_text = 'summarize: {} </s> {}'.format(query.lower(), '</s>'.join(filtered_sentences))
+            x = st.session_state['tokenizer'](input_text, padding="max_length", max_length=512, return_tensors="pt", truncation=True)
+            y = st.session_state['model'].generate(**x)
+            summary = st.session_state['tokenizer'].batch_decode(y, skip_special_tokens=True)[0]
+            st.markdown('#### {}'.format(query))
+            st.markdown('{}'.format(summary))
+        else:
+            with st.sidebar:
+                st.error('Adicione documentos de referência')
+    else:
+        with st.sidebar:
+            st.error('Adicione título para o resumo')
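
For reference, the same summarization pipeline can be exercised outside of Streamlit. The snippet below is a minimal, untested sketch assuming the seidel/plsum-base-ptt5 checkpoint and the tfidf/filter_paragraph helpers from tfidf.py in this commit; the generation settings (num_beams, max_length) and the example inputs are illustrative, not taken from the app.

```python
# Minimal sketch of the PLSUM pipeline without Streamlit (see assumptions above).
from transformers import T5TokenizerFast, T5ForConditionalGeneration
from tfidf import tfidf, filter_paragraph

tokenizer = T5TokenizerFast.from_pretrained("seidel/plsum-base-ptt5")
model = T5ForConditionalGeneration.from_pretrained("seidel/plsum-base-ptt5")

# Reference documents and summary title (made-up examples).
docs = [filter_paragraph(d) for d in ["Primeiro texto de referência ...",
                                      "Segundo texto de referência ..."]]
query = "título do resumo"

# Extractive step: split into sentences and keep the 7 highest-scoring ones.
sentences = []
for doc in docs:
    sentences += doc.split(' . ')
filtered_sentences = tfidf(sentences, n_documents=7)

# Abstractive step: condition the T5 model on the query plus the selected sentences.
input_text = 'summarize: {} </s> {}'.format(query.lower(), '</s>'.join(filtered_sentences))
inputs = tokenizer(input_text, padding="max_length", max_length=512,
                   truncation=True, return_tensors="pt")
outputs = model.generate(**inputs, num_beams=4, max_length=256)  # illustrative settings
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```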
requirements.txt ADDED
@@ -0,0 +1,3 @@
+transformers
+nltk
+gensim
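
Note that app.py also imports streamlit, and the transformers T5 classes need torch at runtime (loading the fast tokenizer from a SentencePiece checkpoint typically needs sentencepiece as well); those packages are not listed here and presumably have to be installed separately.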
tfidf.py ADDED
@@ -0,0 +1,52 @@
+from nltk import word_tokenize
+from gensim import corpora, models, similarities
+import re
+
+
+'''
+Sparse extractive techniques
+'''
+
+
+def tfidf(docs, query=None, n_tokens=None, n_documents=None):
+    # Build a TF-IDF index over the candidate documents (or sentences).
+    texts = [filter_paragraph(text).split(' ') for text in docs]
+    dictionary = corpora.Dictionary(texts)
+    feature_cnt = len(dictionary.token2id)
+    corpus = [dictionary.doc2bow(text) for text in texts]
+    tfidf_model = models.TfidfModel(corpus)
+    # Without an explicit query, score each document against the whole collection.
+    if query is None:
+        query = " ".join(docs)
+    kw_vector = dictionary.doc2bow(query.split(' '))
+    index = similarities.SparseMatrixSimilarity(tfidf_model[corpus], num_features=feature_cnt)
+    scores = index[tfidf_model[kw_vector]]
+    # Document indices sorted by decreasing similarity to the query.
+    to_out_ind = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
+    to_out = []
+    if n_tokens is not None:
+        # Keep the best-scoring documents until the token budget is exceeded.
+        n = 0
+        for ind in to_out_ind:
+            n = n + len(word_tokenize(docs[ind]))
+            if n > n_tokens:
+                break
+            to_out.append(docs[ind])
+    elif n_documents is not None:
+        # Keep only the n_documents best-scoring documents.
+        for ind in to_out_ind[:n_documents]:
+            to_out.append(docs[ind])
+    return to_out
+
+
+def filter_paragraph(p):
+    # Create a space between a word and the punctuation following it,
+    # e.g. "he is a boy." => "he is a boy ."
+    p = re.sub(r"([?.!,¿()])", r" \1 ", p)
+    p = re.sub(r'[" "]+', " ", p)
+    # Replace everything except letters (including Portuguese accented letters)
+    # and digits with a space, then lowercase.
+    p = re.sub(r"[^a-zA-ZçÇéêíáâãõôóúûÉÊÍÁÂÃÕÔÓÚÛ0-9]+", " ", p).lower()
+    return p
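
To see the extractive step in isolation, here is a small, untested usage sketch of the helpers above; the example sentences and queries are made up, and the n_tokens branch assumes nltk's 'punkt' tokenizer data has been downloaded.

```python
# Hypothetical usage of tfidf()/filter_paragraph() (example data is made up).
from tfidf import tfidf, filter_paragraph

docs = [
    filter_paragraph("O PLSUM gera resumos no estilo da Wikipédia."),
    filter_paragraph("A etapa extrativa usa TF-IDF para ranquear sentenças."),
    filter_paragraph("Futebol é o esporte mais popular do Brasil."),
]

# Rank the documents against a query and keep the 2 most similar ones.
print(tfidf(docs, query="resumos wikipédia", n_documents=2))

# Alternatively, cap the result by an approximate token budget
# (word_tokenize needs: python -c "import nltk; nltk.download('punkt')").
print(tfidf(docs, query="resumos wikipédia", n_tokens=20))
```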