# Spaces: Sleeping
# File size: 1,653 Bytes
# Hashes shown in the file listing: 3a5bd74 204a311 3a5bd74
# (Line-number gutter 1-71 from the page export omitted.)
import os
import fitz
import gradio as gr
import re
from summarizer import Summarizer
def preprocess(text):
    """Collapse every run of whitespace in *text* into a single space.

    Newlines, tabs and repeated spaces are each replaced by one ASCII space.
    Leading and trailing whitespace is collapsed to one space, not stripped.

    Args:
        text: The raw string to normalise.

    Returns:
        The string with each whitespace run replaced by a single space.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    # '\s+' already matches '\n', so no separate newline replacement is needed.
    return re.sub(r'\s+', ' ', text)
def pdf_to_text(path, start_page=1, end_page=None):
    """Extract the plain text of a page range from a PDF file.

    Args:
        path: Location of the PDF; passed straight to ``fitz.open``.
        start_page: 1-based number of the first page to read. Values below 1
            are treated as 1.
        end_page: 1-based number of the last page to read (inclusive).
            ``None``, or a value past the end of the document, means
            "through the last page".

    Returns:
        The whitespace-normalised text of every selected page, joined with
        single spaces. An empty string when the range selects no pages.
    """
    doc = fitz.open(path)
    try:
        total_pages = doc.page_count
        # Clamp the requested range to the document so an out-of-range
        # request neither fails part-way through nor produces a negative
        # page index.
        first_index = max(start_page, 1) - 1
        last_page = total_pages if end_page is None else min(end_page, total_pages)
        return ' '.join(
            preprocess(doc.load_page(index).get_text("text"))
            for index in range(first_index, last_page)
        )
    finally:
        # Release the document even when a page fails to load or extract.
        doc.close()
def generate_summary(text, model='bert-base-uncased', ratio=0.2):
    """Summarise *text* with the ``Summarizer`` class imported by this file.

    Args:
        text: The document text to summarise.
        model: Name of the pretrained model handed to ``Summarizer``.
        ratio: Forwarded to the summarizer call as its ``ratio`` argument.

    Returns:
        Whatever the summarizer returns for *text*.
    """
    # The original rebound ``model`` to a default-constructed Summarizer, so
    # the caller's choice of model was silently ignored. Pass it through and
    # keep the instance under its own name.
    summarizer = Summarizer(model=model)
    return summarizer(text, ratio=ratio)
def pdf_summary(file, secret):
    """Validate the request, then summarise the uploaded PDF.

    Args:
        file: The uploaded PDF. Either an object exposing the file path as
            ``.name`` or the path itself; ``None`` when nothing was uploaded.
        secret: The access secret typed by the user. It must match the
            ``Secret`` environment variable.

    Returns:
        The summary text on success, otherwise a bracketed error message
        describing what went wrong (wrong secret, missing file, empty PDF).
    """
    import hmac  # Local import keeps this block self-contained.

    expected = os.environ.get('Secret')
    supplied = '' if secret is None else str(secret)
    # Fail closed when no secret is configured, and compare in constant time
    # so the response time does not reveal how much of the secret matched.
    # Encoding to bytes lets compare_digest accept non-ASCII input.
    if expected is None or not hmac.compare_digest(
        supplied.encode('utf-8'), expected.encode('utf-8')
    ):
        return '[Error]: Please provide the correct secret'
    if file is None:
        return '[ERROR]: Please upload a PDF file.'

    # Read the upload where it already is. The previous code renamed it by
    # slicing eight characters out of the path, which depends on a specific
    # temp-file naming scheme and lets two uploads collide on one path.
    pdf_path = getattr(file, 'name', file)
    text = pdf_to_text(pdf_path)
    if not text.strip():
        return '[ERROR]: The content of PDF is empty.'
    return generate_summary(text)
# Static text passed to the interface as its title and description.
title = 'PDF Summarizer'
description = "A platform for generating summary for a PDF using BERT model"

# Components are created in the same order as before: upload, secret, output.
_pdf_upload = gr.File(label='PDF', file_types=['.pdf'])
_secret_box = gr.Textbox(label='Secret')
_summary_box = gr.Textbox(label='Summary')

# Wire the two inputs and one output to pdf_summary, then launch the
# interface from inside its own context, as the original script did.
with gr.Interface(
    fn=pdf_summary,
    inputs=[_pdf_upload, _secret_box],
    outputs=_summary_box,
    title=title,
    description=description,
) as iface:
    iface.launch()
|