JCRios committed on
Commit
ff6c896
verified
1 Parent(s): e7e4aac

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -0
app.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import re
3
+ import PyPDF2
4
+ import utils
5
+ import streamlit as st
6
+ from transformers import BertTokenizerFast, EncoderDecoderModel
7
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_id = 'mrm8488/bert2bert_shared-spanish-finetuned-summarization'


@st.cache_resource
def _load_summarizer(checkpoint, target_device):
    """Load the tokenizer and encoder-decoder model for ``checkpoint``.

    Streamlit re-executes this script on every user interaction; caching
    the loaded objects avoids reloading the checkpoint on each rerun.

    Args:
        checkpoint: Model identifier passed to ``from_pretrained``.
        target_device: Device string the model is moved to.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
    loaded_model = EncoderDecoderModel.from_pretrained(checkpoint).to(target_device)
    return loaded_tokenizer, loaded_model


tokenizer, modelo = _load_summarizer(model_id, device)
11
+
12
def generate_summary(text):
    """Return a summary of ``text`` produced by the module-level model.

    The text is tokenized as a one-item batch, padded or truncated to
    512 tokens, passed to ``modelo.generate``, and the first generated
    sequence is decoded with special tokens removed.
    """
    encoded = tokenizer(
        [text],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    # The attention mask marks which positions hold real tokens rather
    # than padding.
    generated = modelo.generate(
        encoded.input_ids.to(device),
        attention_mask=encoded.attention_mask.to(device),
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)
18
+
19
def summarize_pdf(pdf_file):
    """Summarize an uploaded PDF page by page and store the result.

    Each page's text is extracted, cleaned with the ``utils`` helpers and
    summarized with ``generate_summary``; pages whose cleaned text is
    shorter than 50 characters are kept verbatim instead. The document
    title and author are prepended when the PDF provides them, and the
    joined text is written to ``st.session_state["summary"]``.

    Args:
        pdf_file: The uploaded PDF passed in by the caller, or ``None``
            when nothing has been uploaded (the function then does
            nothing).
    """
    if pdf_file is None:
        return

    with st.spinner('Generando resumen, espera un poco...'):
        reader = PyPDF2.PdfReader(pdf_file)

        # PyPDF2 yields None for absent metadata and for absent fields;
        # concatenating those with a string would raise, so skip them.
        metadata = reader.metadata
        header_fields = (
            (metadata.title, metadata.author) if metadata is not None else ()
        )
        results = [f'{field} \n' for field in header_fields if field is not None]

        for page in reader.pages:
            cleaned = utils.drop_non_relevant_text(
                utils.preprocess_text(page.extract_text())
            )
            page_text = ' '.join(cleaned)
            if len(page_text) < 50:
                # Too short to be worth summarizing; keep it as-is.
                results.append(page_text + '\n')
            else:
                results.append(generate_summary(page_text) + ' \n')

        st.session_state["summary"] = ' '.join(results)
32
+
33
+ ## Graphical interface
34
def output(pdf_file):
    """Store the uploaded PDF's title in ``st.session_state["summary"]``.

    Does nothing when ``pdf_file`` is ``None``. When the PDF carries no
    metadata or no title, an empty string is stored instead of ``None``.

    Args:
        pdf_file: The uploaded PDF, or ``None``.
    """
    if pdf_file is None:
        return

    # PyPDF2 yields None for absent metadata, so guard before reading it.
    metadata = PyPDF2.PdfReader(pdf_file).metadata
    title = metadata.title if metadata is not None else None
    st.session_state["summary"] = title if title is not None else ''
39
+
40
# Make sure the key exists before the results container reads it below.
if 'summary' not in st.session_state:
    st.session_state['summary'] = ''

st.caption('Demo para la generación de resumenes en español')

# NOTE(review): the nesting below was reconstructed from a source whose
# indentation was lost — confirm which widgets belong inside the bordered
# sidebar container.
with st.sidebar:
    with st.container(border=True):
        st.title('PDF-Summarizer para español')
        st.caption('Este demo está basado en el modelo: \n mrm8488/bert2bert_shared-spanish-finetuned-summarization \n Creado por Manuel Romero/@mrm8488 con el soporte de Narrativa')
        pdf_file = st.file_uploader('Carga tu archivo PDF', type="pdf")
        # The summary is produced in the click callback, which receives
        # the currently uploaded file.
        corre_button = st.button(
            'Genera resumen',
            on_click=summarize_pdf,
            args=(pdf_file,),
            help='Presiona para generar resumen',
        )

# Results area: shows whatever is currently stored under "summary".
container = st.container(height=300)
container.write('Resumen:')
container.write(st.session_state["summary"])
62
+