Spaces:
Sleeping
Sleeping
File size: 3,008 Bytes
ff6c896 6dece3e a9573f3 6dece3e a9573f3 ff6c896 a9573f3 ff6c896 d8d61b9 ff6c896 0d95fbf ff6c896 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
import torch
import re
import PyPDF2
import utils
import streamlit as st
from transformers import BertTokenizerFast, EncoderDecoderModel
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Hugging Face checkpoint used for Spanish summarization.
model_id = 'mrm8488/bert2bert_shared-spanish-finetuned-summarization'


@st.cache_resource
def _load_summarizer(checkpoint, target_device):
    """Load the tokenizer and encoder-decoder model, cached across reruns.

    Streamlit re-executes the whole script on every widget interaction.
    Without caching, the model would be re-instantiated on each button click
    or file upload; ``st.cache_resource`` keeps a single shared instance.

    Args:
        checkpoint: Model id or path passed to ``from_pretrained``.
        target_device: Device string the model is moved to.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
    loaded_model = EncoderDecoderModel.from_pretrained(checkpoint).to(target_device)
    return loaded_tokenizer, loaded_model


tokenizer, modelo = _load_summarizer(model_id, device)
def generate_summary(text):
    """Return a generated summary of ``text``.

    The text is tokenized as a one-item batch, padded/truncated to 512
    tokens, passed to the module-level model's ``generate`` method, and the
    first generated sequence is decoded with special tokens skipped.
    """
    encoded = tokenizer(
        [text],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    ids = encoded.input_ids.to(device)
    # The attention mask marks which positions hold real tokens rather than
    # padding.
    mask = encoded.attention_mask.to(device)
    generated = modelo.generate(ids, attention_mask=mask)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
def summarize_pdf(pdf_file):
    """Summarize a PDF page by page and store the text in session state.

    Used as the ``on_click`` callback of the "Genera resumen" button. The
    result is written to ``st.session_state["summary"]`` as the title, the
    author and one entry per page, joined with spaces.

    Each page's extracted text is passed through ``utils.preprocess_text``
    and ``utils.drop_non_relevant_text`` and the result is joined with
    spaces (presumably a sequence of strings; verify against ``utils``).
    Pages whose cleaned text is shorter than 50 characters are kept as-is;
    longer pages are replaced by ``generate_summary`` output.

    Args:
        pdf_file: File-like object accepted by ``PyPDF2.PdfReader``, or
            ``None`` when nothing has been uploaded (the call is a no-op).

    Returns:
        None. The summary is delivered through ``st.session_state``.
    """
    if pdf_file is None:
        return

    with st.spinner('Generando resumen, espera un poco...'):
        reader = PyPDF2.PdfReader(pdf_file)

        # ``reader.metadata`` is None for PDFs that carry no document
        # information dictionary, and individual fields can be None too;
        # treat every missing value as an empty string instead of crashing.
        metadata = reader.metadata
        title = getattr(metadata, 'title', None) or ''
        author = getattr(metadata, 'author', None) or ''

        page_summaries = []
        for page in reader.pages:
            cleaned = utils.drop_non_relevant_text(
                utils.preprocess_text(page.extract_text())
            )
            page_text = ' '.join(cleaned)
            if len(page_text) < 50:
                # Too short to be worth summarizing; keep the text verbatim.
                page_summaries.append(page_text + '\n')
            else:
                page_summaries.append(generate_summary(page_text) + ' \n')

        results = [title + ' \n', author + ' \n'] + page_summaries
        st.session_state["summary"] = ' '.join(results)
## Graphical interface
def output(pdf_file):
    """Store the PDF's metadata title in ``st.session_state["summary"]``.

    Args:
        pdf_file: File-like object accepted by ``PyPDF2.PdfReader``, or
            ``None`` when nothing has been uploaded (the call is a no-op).

    Returns:
        None. The title is delivered through ``st.session_state``.
    """
    if pdf_file is None:
        return

    reader = PyPDF2.PdfReader(pdf_file)
    # ``reader.metadata`` is None when the PDF has no document information
    # dictionary, and the title itself may be None. Store an empty string in
    # both cases, matching summarize_pdf and the session-state default.
    metadata = reader.metadata
    st.session_state["summary"] = getattr(metadata, 'title', None) or ''
# Make sure the key read at the bottom of the page exists before the first
# summary has been generated.
if 'summary' not in st.session_state:
    st.session_state['summary'] = ''

st.caption('Demo para la generación de resumenes en español')

# NOTE(review): the nesting of the sidebar widgets below is a best-effort
# reconstruction -- confirm which widgets belong inside the bordered container
# and which directly inside the sidebar.
with st.sidebar:
    with st.container(border=True):
        st.title('PDF-Summarizer para español')
        st.caption('Este demo está basado en el modelo: \n mrm8488/bert2bert_shared-spanish-finetuned-summarization \n creado por Manuel Romero/@mrm8488 con el soporte de Narrativa.')
    pdf_file = st.file_uploader('Carga tu archivo PDF', type="pdf")
    # NOTE(review): on_click callbacks run before the script body is re-run,
    # so this spinner presumably only wraps the creation of the button; the
    # spinner shown during summarization is the one inside summarize_pdf.
    # Confirm and consider removing this wrapper.
    with st.spinner('Estamos generando tu resumen, espera un poco...'):
        corre_button = st.button(
            'Genera resumen',
            on_click=summarize_pdf,
            args=(pdf_file,),
            help='Presiona para generar resumen',
        )

# Fixed-height area showing the most recently generated summary.
container = st.container(height=300)
container.write('Resumen:')
container.write(st.session_state["summary"])
|