# Spaces: Sleeping
import os | |
import fitz | |
import gradio as gr | |
import re | |
from summarizer import Summarizer | |
def preprocess(text):
    """Collapse every run of whitespace in *text* into a single space.

    Newlines, tabs and repeated spaces all count as whitespace, so the
    result is a single line. Leading and trailing whitespace is reduced to
    one space each, not stripped.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions. '\s' already
    # matches '\n', so no separate newline replacement is needed.
    return re.sub(r'\s+', ' ', text)
def pdf_to_text(path, start_page=1, end_page=None):
    """Extract the text of a PDF page range as one normalised string.

    Args:
        path: Location of the PDF; handed to ``fitz.open``.
        start_page: First page to read, 1-based. Values below 1 are
            treated as 1.
        end_page: Last page to read, 1-based and inclusive. ``None`` or a
            value past the end of the document means "up to the last page".

    Returns:
        The text of each selected page, cleaned with ``preprocess`` and
        joined by single spaces. An empty string if the range is empty.
    """
    doc = fitz.open(path)
    try:
        total_pages = doc.page_count
        # Clamp the requested range to the pages that actually exist so an
        # over-long range reads to the end instead of raising, and a
        # start below 1 does not turn into a negative page index.
        last = total_pages if end_page is None else min(end_page, total_pages)
        first = max(start_page, 1)
        return ' '.join(
            preprocess(doc.load_page(i).get_text("text"))
            for i in range(first - 1, last)
        )
    finally:
        # Release the document even when a page fails to load or extract.
        doc.close()
def generate_summary(text, model='bert-base-uncased', ratio=0.2):
    """Return a summary of *text* produced by ``Summarizer``.

    Args:
        text: The text to summarise.
        model: Name of the pretrained model handed to ``Summarizer``.
        ratio: Forwarded as the summarizer's ``ratio`` argument
            (presumably the fraction of sentences to keep; see the
            bert-extractive-summarizer documentation).

    Returns:
        Whatever the summarizer returns for *text*.
    """
    # Pass the requested model through. A bare ``Summarizer()`` falls back
    # to the library's own default and would silently ignore ``model``.
    summarizer = Summarizer(model=model)
    return summarizer(text, ratio=ratio)
def pdf_summary(file, secret):
    """Gradio callback: check the shared secret, then summarise a PDF.

    Args:
        file: The uploaded PDF as delivered by Gradio. Either an object
            exposing the temp-file path as ``.name`` or a plain path
            string, or ``None`` when nothing was uploaded.
        secret: The secret typed by the user. It must match the ``Secret``
            environment variable.

    Returns:
        The summary text, or a bracketed error message when the secret is
        wrong, no file was uploaded, or the PDF contains no text.
    """
    import hmac  # local import: only this function needs it

    expected = os.environ.get('Secret')
    # Reject outright when no secret is configured or none was supplied;
    # otherwise "None == None" would let an unauthenticated call through.
    # compare_digest keeps the comparison time independent of the content.
    if (
        expected is None
        or not isinstance(secret, str)
        or not hmac.compare_digest(
            secret.encode('utf-8'), expected.encode('utf-8')
        )
    ):
        return '[Error]: Please provide the correct secret'
    if file is None:
        return '[ERROR]: Please upload a PDF file.'

    # Open the upload where Gradio stored it. Renaming it based on a
    # guessed filename suffix corrupts the path whenever that suffix is
    # absent, and the extractor does not need a particular name.
    pdf_path = file if isinstance(file, str) else file.name
    text = pdf_to_text(pdf_path)
    if not text.strip():
        return '[ERROR]: The content of PDF is empty.'
    return generate_summary(text)
# --- Gradio UI wiring -------------------------------------------------------
# Page heading and blurb shown above the form.
title = 'PDF Summarizer'
description = "A platform for generating summary for a PDF using BERT model"

# Form components, created in the same order as they appear on the page:
# the PDF upload, the secret field, then the summary output box.
pdf_input = gr.File(label='PDF', file_types=['.pdf'])
secret_input = gr.Textbox(label='Secret')
summary_output = gr.Textbox(label='Summary')

# Build the interface around pdf_summary and start serving immediately.
with gr.Interface(
    fn=pdf_summary,
    inputs=[pdf_input, secret_input],
    outputs=summary_output,
    title=title,
    description=description,
) as iface:
    iface.launch()