File size: 3,008 Bytes
ff6c896
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6dece3e
a9573f3
 
 
6dece3e
a9573f3
 
 
ff6c896
 
 
 
 
a9573f3
ff6c896
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8d61b9
ff6c896
0d95fbf
 
 
 
 
ff6c896
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import torch
import re
import PyPDF2
import utils
import streamlit as st
from transformers import BertTokenizerFast, EncoderDecoderModel
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Hugging Face Hub identifier of the Spanish summarization checkpoint.
model_id = 'mrm8488/bert2bert_shared-spanish-finetuned-summarization'


@st.cache_resource
def _load_summarizer(checkpoint, target_device):
    """Load the tokenizer and the encoder-decoder model for *checkpoint*.

    Streamlit re-executes this script on every widget interaction. Caching
    the loaded objects keeps the checkpoint from being re-read on each rerun.
    The cached objects are shared by every session of the app.

    Args:
        checkpoint: Model identifier passed to ``from_pretrained``.
        target_device: Device string the model is moved to.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
    loaded_model = EncoderDecoderModel.from_pretrained(checkpoint).to(target_device)
    return loaded_tokenizer, loaded_model


tokenizer, modelo = _load_summarizer(model_id, device)

def generate_summary(text):
    """Return a summary of *text* produced by the module-level model.

    The text is tokenized as a batch of one, padded or truncated to 512
    tokens, and passed to ``modelo.generate``. The first generated sequence
    is decoded with special tokens stripped.

    Args:
        text: The text to summarize.

    Returns:
        The decoded summary string.
    """
    encoded = tokenizer(
        [text],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    # The attention mask tells the model which positions hold real tokens
    # rather than padding.
    generated = modelo.generate(
        encoded.input_ids.to(device),
        attention_mask=encoded.attention_mask.to(device),
    )
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def _metadata_text(metadata, field):
    """Return ``metadata.<field>``, or ``''`` when the metadata or field is missing."""
    value = getattr(metadata, field, None)
    return '' if value is None else value


def summarize_pdf(pdf_file):
    """Summarize an uploaded PDF page by page and store the result.

    Each page's text is extracted, cleaned with the ``utils`` helpers and
    joined into one string. Pages whose cleaned text is shorter than 50
    characters are kept verbatim; longer pages are replaced by the output of
    ``generate_summary``. The title and author from the PDF metadata (empty
    strings when absent) are put first, and the joined text is written to
    ``st.session_state["summary"]``.

    Args:
        pdf_file: The uploaded PDF, or ``None`` when nothing was uploaded,
            in which case the function does nothing.

    Returns:
        None. The result is stored in ``st.session_state["summary"]``.
    """
    if pdf_file is None:
        return

    with st.spinner('Generando resumen, espera un poco...'):
        reader = PyPDF2.PdfReader(pdf_file)
        # reader.metadata is None for PDFs without a document-information
        # dictionary, so it must not be dereferenced unconditionally.
        metadata = reader.metadata
        title = _metadata_text(metadata, 'title')
        author = _metadata_text(metadata, 'author')

        sections = []
        for page in reader.pages:
            cleaned = utils.drop_non_relevant_text(
                utils.preprocess_text(page.extract_text())
            )
            page_text = ' '.join(cleaned)
            if len(page_text) < 50:
                # Too short to be worth summarizing; keep it as-is.
                sections.append(page_text + '\n')
            else:
                sections.append(generate_summary(page_text) + '  \n')

        results = [title + '  \n', author + '  \n'] + sections
        st.session_state["summary"] = ' '.join(results)

## Graphical interface
def output(pdf_file):
    """Store the uploaded PDF's metadata title in ``st.session_state["summary"]``.

    Args:
        pdf_file: The uploaded PDF, or ``None`` when nothing was uploaded,
            in which case the function does nothing.

    Returns:
        None. An empty string is stored when the PDF has no metadata or no
        title.
    """
    if pdf_file is None:
        return

    reader = PyPDF2.PdfReader(pdf_file)
    # reader.metadata is None for PDFs without a document-information dictionary.
    metadata = reader.metadata
    title = metadata.title if metadata is not None else None
    st.session_state["summary"] = title if title is not None else ''

# Make sure the summary slot exists before any widget or callback reads it.
if 'summary' not in st.session_state:
    st.session_state.summary = ''
#output = summarize_pdf(pdf_file)
#reader = PyPDF2.PdfReader(pdf_file)
#        title = reader.metadata.title
#        output = title
#        st.write(output)
# Page layout: the sidebar holds the uploader and the trigger button; the
# main area shows the summary stored in session state.
st.caption('Demo para la generación de resúmenes en español')
with st.sidebar:
    with st.container(border=True):
        st.title('PDF-Summarizer para español')
    st.caption('Este demo está basado en el modelo:  \n mrm8488/bert2bert_shared-spanish-finetuned-summarization  \n creado por Manuel Romero/@mrm8488 con el soporte de Narrativa.')
    pdf_file = st.file_uploader('Carga tu archivo PDF', type="pdf")
    # No spinner here: a spinner around the button would only wrap the
    # widget's creation. The on_click callback runs outside that block and
    # shows its own spinner while the summary is generated.
    corre_button = st.button(
        'Genera resumen',
        on_click=summarize_pdf,
        args=(pdf_file,),
        help='Presiona para generar resumen',
    )

container = st.container(height=300)
container.write('Resumen:')
container.write(st.session_state["summary"])