# Gradio app for a Hugging Face Space (status: Running). File size: 3,826 bytes.
import gradio as gr
import requests
from pypdf import PdfReader
import pypdfium2 as pdfium
import easyocr
# Human-readable language name -> language code handed to easyocr.Reader.
# Insertion order is preserved because the keys populate the language dropdown.
ocr_id = {
    "Afrikaans": "af",
    "Albanian": "sq",
    "Arabic": "ar",
    "Azerbaijani": "az",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Bosnian": "bs",
    "Chinese (simplified)": "ch_sim",
    "Chinese (traditional)": "ch_tra",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Estonian": "et",
    "French": "fr",
    "German": "de",
    "Irish": "ga",
    "Hindi": "hi",
    "Hungarian": "hu",
    "Indonesian": "id",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Kannada": "kn",
    "Korean": "ko",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Nepali": "ne",
    "Norwegian": "no",
    "Occitan": "oc",
    "Polish": "pl",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
    "Serbian (cyrillic)": "rs_cyrillic",
    "Serbian (latin)": "rs_latin",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Spanish": "es",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Thai": "th",
    "Tagalog": "tl",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Welsh": "cy",
    "Zulu": "zu",
}
def pdf_pil(file_path, page_num):
    """Render one page of a PDF to a PNG file and return the PNG's path.

    Args:
        file_path: Path of the PDF document to open.
        page_num: 1-based page number; any value ``int()`` accepts.

    Returns:
        The name of the image that was written, ``image_<page_num>.png``
        (relative to the current working directory).
    """
    # Open the document the caller asked for; this used to be hard-coded to
    # "data.pdf", which silently ignored the ``file_path`` argument.
    pdf = pdfium.PdfDocument(file_path)
    # The caller counts pages from 1; get_page() is given the index minus one.
    page = pdf.get_page(int(page_num) - 1)
    bitmap = page.render(
        scale=1,  # 72dpi resolution
        rotation=0,  # no additional rotation
    )
    image_path = f"image_{page_num}.png"
    bitmap.to_pil().save(image_path)
    return image_path
def ocrpdf(file_path, pdf_lang, page_num):
    """Run OCR on one page of a PDF and return the recognized text.

    The page is first rendered to an image with ``pdf_pil``; the image is then
    read with an EasyOCR reader built for the language selected in ``ocr_id``.

    Args:
        file_path: Path of the PDF, forwarded to ``pdf_pil``.
        pdf_lang: A key of ``ocr_id`` (a ``KeyError`` is raised otherwise).
        page_num: Page number, forwarded to ``pdf_pil``.

    Returns:
        A single string in which element 1 of every OCR result is preceded
        by ``" \n"``; an empty string when nothing was detected.
    """
    page_image = pdf_pil(file_path, page_num)
    ocr_reader = easyocr.Reader([f"{ocr_id[pdf_lang]}"])
    detections = ocr_reader.readtext(page_image, width_ths=1)
    # Each result contributes " \n" followed by its element 1
    # (presumably the recognized text — confirm against EasyOCR's output format).
    return "".join(f" \n{detection[1]}" for detection in detections)
def scrape(instring):
    """Build the embedded Google Docs viewer for a PDF URL.

    Args:
        instring: URL of the PDF, as typed by the user.

    Returns:
        A ``gr.HTML`` update whose value is an ``<iframe>`` pointing the
        Google Docs viewer at the given URL.
    """
    from urllib.parse import quote

    # Percent-encode the whole URL before embedding it as a query parameter.
    # Without this, a PDF URL containing "&", "#" or "?" is cut short by the
    # viewer, and a quote character could break out of the src attribute.
    # With safe="" the result contains no HTML-special characters.
    encoded_url = quote((instring or "").strip(), safe="")
    html_src = (
        "\n"
        '<div style="text-align:center">\n'
        "<h4>PDF Viewer</h4>\n"
        f'<iframe src="https://docs.google.com/viewer?url={encoded_url}&embedded=true"'
        ' frameborder="0" height="1200px" width="100%"></iframe>\n'
        "</div>"
    )
    return gr.HTML.update(html_src)
def scrape00(instring, page_num, pdf_lang):
    """Download a PDF, extract the text of one page and summarize it.

    The PDF is saved as ``data.pdf`` in the working directory. The page text
    is taken from the PDF's text layer; when that is blank or cannot be
    summarized, the page is OCR'd with ``ocrpdf`` and the OCR text is
    summarized instead.

    Args:
        instring: URL of the PDF to download.
        page_num: 1-based page number.
        pdf_lang: Key of ``ocr_id`` selecting the OCR language for the fallback.

    Returns:
        A ``(text, summary)`` tuple. ``summary`` is a string starting with
        ``"Error"`` when the download, the page lookup or the summarization
        failed.
    """
    # NOTE(review): every request writes the same file, so concurrent users
    # can overwrite each other's download — consider a per-request temp file.
    pdf_path = "data.pdf"

    try:
        page_index = int(page_num) - 1
    except (TypeError, ValueError):
        return "", "Error: page number must be an integer"

    # Stop on a failed download instead of parsing whatever data.pdf was left
    # behind by an earlier request.
    try:
        response = requests.get(instring, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"PDF download failed: {err}")
        return "", f"Error: could not download PDF ({err})"

    with open(pdf_path, "wb") as f:
        f.write(response.content)

    reader = PdfReader(pdf_path)
    number_of_pages = len(reader.pages)
    # Reject out-of-range pages explicitly; page 0 used to index from the end.
    if not 0 <= page_index < number_of_pages:
        return "", f"Error: page {page_num} is out of range (1-{number_of_pages})"
    text = reader.pages[page_index].extract_text() or ""

    # Bind the name up front so the OCR fallback can tell "model failed to
    # load" apart from "summarizing this particular text failed".
    summarizer = None
    try:
        summarizer = gr.Interface.load("huggingface/facebook/bart-large-cnn")
        if text.strip():
            return text, summarizer(text)
    except Exception as err:  # boundary of a UI callback: report, then fall back
        print(f"Summarizing the extracted text failed: {err}")

    # Fallback: blank text layer or failed summarization -> OCR the page.
    try:
        text = ocrpdf(pdf_path, pdf_lang, page_num)
        sum_out = summarizer(text) if summarizer is not None else "Error"
    except Exception as err:
        print(f"OCR fallback failed: {err}")
        sum_out = "Error"
    return text, sum_out
# UI definition: a URL box and page selector, buttons to load the viewer and
# to summarize the selected page, and output areas for the viewer, the page
# text and the summary.
with gr.Blocks() as app:
    gr.Markdown('''<h1>PDF Viewer</h1>''')
    with gr.Row():
        inp = gr.Textbox(label="PDF URL", scale=3)
        pg_num = gr.Number(label="Page Number", value=1, precision=0, scale=1)
    # NOTE(review): the dropdown is assumed to share the button row — confirm
    # against the intended layout.
    with gr.Row():
        go_btn = gr.Button("Load PDF")
        sum_btn = gr.Button("Summarize")
        target_lang = gr.Dropdown(
            label="PDF Language",
            choices=list(ocr_id.keys()),
            value="English",
        )
    outp = gr.HTML()
    with gr.Row():
        text_out = gr.Textbox()
        sum_out = gr.Textbox()
    # "Load PDF" only embeds the viewer; "Summarize" downloads and processes
    # the selected page.
    go_btn.click(scrape, inp, outp)
    sum_btn.click(scrape00, [inp, pg_num, target_lang], [text_out, sum_out])
app.queue(concurrency_count=10).launch()