# pdf-reader / app.py
# (File-viewer page header captured along with the source: "Omnibus's picture",
# "Update app.py", 352bdf6, "raw", "history blame", 4.42 kB.)
from urllib.parse import quote

import easyocr
import gradio as gr
import pypdfium2 as pdfium
import requests
from pypdf import PdfReader
# Maps the human-readable language names offered in the "PDF Language"
# dropdown to the language codes handed to ``easyocr.Reader``.
# Insertion order is the order in which the dropdown lists the choices.
ocr_id = {
    "Afrikaans": "af",
    "Albanian": "sq",
    "Arabic": "ar",
    "Azerbaijani": "az",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Bosnian": "bs",
    "Chinese (simplified)": "ch_sim",
    "Chinese (traditional)": "ch_tra",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Estonian": "et",
    "French": "fr",
    "German": "de",
    "Irish": "ga",
    "Hindi": "hi",
    "Hungarian": "hu",
    "Indonesian": "id",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Kannada": "kn",
    "Korean": "ko",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Nepali": "ne",
    "Norwegian": "no",
    "Occitan": "oc",
    "Polish": "pl",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
    "Serbian (cyrillic)": "rs_cyrillic",
    "Serbian (latin)": "rs_latin",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Spanish": "es",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Thai": "th",
    "Tagalog": "tl",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Welsh": "cy",
    "Zulu": "zu",
}
def pdf_pil(file_path, page_num, scale=1):
    """Render one page of a PDF to a PNG file and return the PNG's file name.

    Args:
        file_path: Path of the PDF document to open.
        page_num: 1-based page number; it is passed through ``int()`` and
            shifted to the 0-based index pdfium expects.
        scale: Render scale factor forwarded to ``page.render``. The default
            of 1 corresponds to 72 dpi.

    Returns:
        The file name ``image_<page_num>.png`` that was written (relative to
        the current working directory).
    """
    # Open the document the caller asked for rather than a fixed file name.
    pdf = pdfium.PdfDocument(file_path)
    try:
        print ("\n PDF read !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! \n")
        page = pdf.get_page(int(page_num) - 1)
        try:
            print ("\n Page read !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! \n")
            bitmap = page.render(
                scale=scale,  # 1 -> 72dpi resolution
                rotation=0,   # no additional rotation
            )
            print ("\n Page rendered !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! \n")
            pil_image = bitmap.to_pil()
            print ("\n Page to PIL !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! \n")
            out_path = f"image_{page_num}.png"
            pil_image.save(out_path)
            print ("\n Page saved !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! \n")
        finally:
            # Release the page handle before its parent document.
            page.close()
    finally:
        pdf.close()
    return out_path
def ocrpdf(file_path, pdf_lang, page_num):
    """OCR a single PDF page and return everything recognised as one string.

    Args:
        file_path: Path of the PDF, forwarded to ``pdf_pil``.
        pdf_lang: Language display name; must be a key of ``ocr_id``.
        page_num: 1-based page number, forwarded to ``pdf_pil``.

    Returns:
        The element at index 1 of every ``readtext`` result, concatenated in
        result order with no separator (presumably the recognised text of each
        detection -- confirm against the easyocr documentation).

    Raises:
        KeyError: If ``pdf_lang`` is not a key of ``ocr_id``.
    """
    image_path = pdf_pil(file_path, page_num)
    print("DONE 1 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    languages = [f"{ocr_id[pdf_lang]}"]
    print("DONE 2 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    ocr_reader = easyocr.Reader(languages)
    print("DONE 3 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    detections = ocr_reader.readtext(image_path)
    print("DONE 4 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    return "".join(f"{detection[1]}" for detection in detections)
def scrape(instring):
    """Build the embedded Google Docs viewer markup for a PDF URL.

    Args:
        instring: URL of the PDF as entered by the user.

    Returns:
        The result of ``gr.HTML.update`` carrying a ``<div>`` with the viewer
        ``<iframe>``.
    """
    # The PDF URL is a *value* inside the viewer's own query string and sits
    # inside a double-quoted HTML attribute. Percent-encoding every reserved
    # character keeps "&", "#", "?" or quotes in the user's URL from
    # truncating the parameter or breaking out of the attribute.
    encoded_url = quote((instring or "").strip(), safe="")
    html_src=(f'''
<div style="text-align:center">
<h4>PDF Viewer</h4>
<iframe src="https://docs.google.com/viewer?url={encoded_url}&embedded=true" frameborder="0" height="1200px" width="100%"></iframe>
</div>''')
    return gr.HTML.update(html_src)
def scrape00(instring, page_num, pdf_lang):
    """Download a PDF, extract one page's text and summarise it.

    The page's embedded text is tried first. If it is empty, or summarising
    it fails, the page is OCR'd with ``ocrpdf`` and the OCR text is
    summarised instead.

    Args:
        instring: URL of the PDF to download.
        page_num: 1-based page number to read.
        pdf_lang: Language display name (a key of ``ocr_id``) used for the
            OCR fallback.

    Returns:
        A ``(text, summary)`` tuple. ``text`` is the extracted or OCR'd page
        text. ``summary`` is the summariser output, or a string starting with
        ``"Error"`` when the download, page lookup or summarisation failed.
    """
    # NOTE(review): every request writes to this same fixed path, so
    # overlapping requests can overwrite each other's download.
    pdf_path = "data.pdf"

    try:
        page_index = int(page_num) - 1
    except (TypeError, ValueError):
        return "", "Error: page number must be an integer"

    # A timeout keeps an unresponsive host from blocking the worker forever.
    try:
        response = requests.get(instring, timeout=60)
    except requests.RequestException as err:
        print(err)
        return "", f"Error: could not download PDF ({err})"
    if response.status_code != 200:
        # Stop here instead of falling through to whatever file an earlier
        # request left at pdf_path.
        print(response.status_code)
        return "", f"Error: could not download PDF (HTTP {response.status_code})"
    with open(pdf_path, "wb") as f:
        f.write(response.content)

    reader = PdfReader(pdf_path)
    number_of_pages = len(reader.pages)
    # Reject 0 and negatives explicitly: a negative index would otherwise
    # silently select a page counted from the end.
    if not 0 <= page_index < number_of_pages:
        return "", f"Error: page {page_index + 1} is out of range (1-{number_of_pages})"
    text = reader.pages[page_index].extract_text()
    print (text)

    # Load the summariser outside the fallback logic so that the OCR branch
    # never runs with ``summarizer`` undefined.
    try:
        summarizer = gr.Interface.load("huggingface/facebook/bart-large-cnn")
    except Exception as err:
        print(err)
        return text, "Error"

    if text and text.strip():
        try:
            return text, summarizer(text)
        except Exception as err:
            # Fall through to the OCR attempt below.
            print(err)

    try:
        text = ocrpdf(pdf_path, pdf_lang, page_num)
        sum_out = summarizer(text)
    except Exception as err:
        print(err)
        sum_out = "Error"
    return text, sum_out
# UI layout and event wiring.
# NOTE(review): the nesting below (which widgets share a row) is reconstructed
# from statement order -- confirm against the intended layout.
with gr.Blocks() as app:
    gr.Markdown("<h1>PDF Viewer")

    # Row 1: PDF URL and page selector.
    with gr.Row():
        inp = gr.Textbox(label="PDF URL", scale=3)
        pg_num = gr.Number(label="Page Number", value=1, precision=0, scale=1)

    # Row 2: actions and OCR language.
    with gr.Row():
        go_btn = gr.Button("Load PDF")
        sum_btn = gr.Button("Summarize")
        target_lang = gr.Dropdown(
            label="PDF Language",
            choices=list(ocr_id.keys()),
            value="English",
        )

    # Embedded viewer, filled in by ``scrape``.
    outp = gr.HTML()

    # Row 3: page text and its summary, filled in by ``scrape00``.
    with gr.Row():
        text_out = gr.Textbox()
        sum_out = gr.Textbox()

    go_btn.click(fn=scrape, inputs=inp, outputs=outp)
    sum_btn.click(
        fn=scrape00,
        inputs=[inp, pg_num, target_lang],
        outputs=[text_out, sum_out],
    )

app.queue(concurrency_count=10).launch()