Spaces:
Running
Running
File size: 4,420 Bytes
3919e25 2e2a7b2 4ffb3fe 9774c1c 5135496 de8ef09 9774c1c d171ec8 de8ef09 8d346b2 9774c1c de8ef09 8d346b2 de8ef09 9774c1c de8ef09 9774c1c de8ef09 47a5a07 9774c1c 47a5a07 9774c1c 47a5a07 9774c1c 47a5a07 b4be601 9774c1c 352bdf6 d171ec8 a923971 3919e25 2d1281f a923971 d599b56 d02b2ab a923971 9774c1c 2e2a7b2 a923971 e86a2c5 2e2a7b2 b7f89cc 2e2a7b2 1c5b68c fb04ca9 352bdf6 1e634f8 8f70505 502b110 d599b56 d846da3 9c7f619 9774c1c 563ca5d 1e634f8 ba73e05 9774c1c 3919e25 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
import gradio as gr
import requests
from pypdf import PdfReader
import pypdfium2 as pdfium
import easyocr
# Maps the language names shown in the "PDF Language" dropdown to the
# language codes handed to easyocr.Reader. Insertion order is the order
# in which the dropdown lists its choices.
ocr_id = {
    "Afrikaans": "af",
    "Albanian": "sq",
    "Arabic": "ar",
    "Azerbaijani": "az",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Bosnian": "bs",
    "Chinese (simplified)": "ch_sim",
    "Chinese (traditional)": "ch_tra",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Estonian": "et",
    "French": "fr",
    "German": "de",
    "Irish": "ga",
    "Hindi": "hi",
    "Hungarian": "hu",
    "Indonesian": "id",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Kannada": "kn",
    "Korean": "ko",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Nepali": "ne",
    "Norwegian": "no",
    "Occitan": "oc",
    "Polish": "pl",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
    "Serbian (cyrillic)": "rs_cyrillic",
    "Serbian (latin)": "rs_latin",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Spanish": "es",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Thai": "th",
    "Tagalog": "tl",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Welsh": "cy",
    "Zulu": "zu",
}
def pdf_pil(file_path, page_num):
    """Render one page of a PDF to a PNG file and return the PNG's name.

    Args:
        file_path: Path of the PDF document to open.
        page_num: 1-based page number; it is converted with ``int()``.

    Returns:
        The file name ``image_{page_num}.png`` that the rendered page was
        saved under (relative to the current working directory).
    """
    # Open the document the caller asked for; the path used to be
    # hard-coded to "data.pdf", which silently ignored ``file_path``.
    pdf = pdfium.PdfDocument(file_path)
    try:
        print ("\n PDF read !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! \n")
        # The UI counts pages from 1, pdfium from 0.
        page = pdf.get_page(int(page_num) - 1)
        try:
            print ("\n Page read !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! \n")
            bitmap = page.render(
                scale=1,     # 72dpi resolution
                rotation=0,  # no additional rotation
            )
            print ("\n Page rendered !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! \n")
            pil_image = bitmap.to_pil()
            print ("\n Page to PIL !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! \n")
            image_name = f"image_{page_num}.png"
            pil_image.save(image_name)
            print ("\n Page saved !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! \n")
        finally:
            # Release the page before its parent document.
            page.close()
    finally:
        pdf.close()
    return image_name
def ocrpdf(file_path, pdf_lang, page_num):
    """Run OCR on one page of a PDF and return the recognised text.

    The page is rendered to a PNG with ``pdf_pil`` and that image is read
    by an ``easyocr.Reader`` created for the selected language.

    Args:
        file_path: Path of the PDF, forwarded to ``pdf_pil``.
        pdf_lang: Language name; must be a key of ``ocr_id``.
        page_num: 1-based page number, forwarded to ``pdf_pil``.

    Returns:
        The text of every detection, in the order ``readtext`` returned
        them, joined with single spaces.

    Raises:
        KeyError: If ``pdf_lang`` is not a key of ``ocr_id``.
    """
    image_path = pdf_pil(file_path, page_num)
    print("DONE 1 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    lang = [ocr_id[pdf_lang]]
    print("DONE 2 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    reader = easyocr.Reader(lang)
    print("DONE 3 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    bounds = reader.readtext(image_path)
    print("DONE 4 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    # Element 1 of each detection is used as its text. Join with a space
    # so that neighbouring fragments do not run together into one word
    # (they used to be concatenated with no separator at all).
    return " ".join(str(bound[1]) for bound in bounds)
def scrape(instring):
    """Build the embedded Google Docs viewer for a PDF URL.

    Args:
        instring: URL of the PDF as typed by the user.

    Returns:
        A ``gr.HTML.update`` carrying an ``<iframe>`` that points the
        Google Docs viewer at the given URL.
    """
    from urllib.parse import quote

    # The URL is user input placed inside both a query string and an HTML
    # attribute. Percent-encoding every reserved character keeps "&", "#"
    # and quotes in it from truncating the ``url`` parameter or breaking
    # out of the ``src`` attribute.
    encoded_url = quote(instring or "", safe="")
    html_src = f'''
<div style="text-align:center">
<h4>PDF Viewer</h4>
<iframe src="https://docs.google.com/viewer?url={encoded_url}&embedded=true" frameborder="0" height="1200px" width="100%"></iframe>
</div>'''
    return gr.HTML.update(html_src)
def scrape00(instring, page_num, pdf_lang):
    """Download a PDF, extract the text of one page and summarize it.

    The PDF is saved as ``data.pdf`` in the working directory. The page's
    embedded text is summarized first; if that summarization raises, the
    page is run through OCR (``ocrpdf``) and the OCR text is summarized
    instead.

    Args:
        instring: URL of the PDF to download.
        page_num: 1-based page number to process.
        pdf_lang: Language name (a key of ``ocr_id``) used by the OCR
            fallback.

    Returns:
        A ``(text, summary)`` tuple. When the download fails or the page
        number is invalid, ``text`` describes the problem and ``summary``
        is ``"Error"``. ``summary`` is also ``"Error"`` when no summary
        could be produced.
    """
    # A timeout keeps an unresponsive host from tying up a worker forever.
    try:
        response = requests.get(instring, timeout=30)
    except requests.RequestException as err:
        print(err)
        return f"Download failed: {err}", "Error"
    if response.status_code != 200:
        print(response.status_code)
        # Stop here: carrying on would read a stale or missing data.pdf.
        return f"Download failed: HTTP {response.status_code}", "Error"
    with open("data.pdf", "wb") as f:
        f.write(response.content)

    reader = PdfReader("data.pdf")
    number_of_pages = len(reader.pages)
    try:
        page_index = int(page_num) - 1
    except (TypeError, ValueError):
        return f"Invalid page number: {page_num!r}", "Error"
    # Reject 0 and negatives explicitly; a negative index would otherwise
    # silently select a page counted from the end of the document.
    if not 0 <= page_index < number_of_pages:
        return (
            f"Page {page_num} is out of range (document has "
            f"{number_of_pages} pages)",
            "Error",
        )
    text = reader.pages[page_index].extract_text()
    print (text)

    # Load the summarizer outside the fallback logic so that the OCR path
    # below can never hit an unbound ``summarizer`` name.
    try:
        summarizer = gr.Interface.load("huggingface/facebook/bart-large-cnn")
    except Exception as err:
        print(f"Could not load summarizer: {err}")
        return text, "Error"

    try:
        sum_out = summarizer(text)
    except Exception as err:
        print(f"Summarizing extracted text failed, trying OCR: {err}")
        try:
            text = ocrpdf("data.pdf", pdf_lang, page_num)
            sum_out = summarizer(text)
        except Exception as ocr_err:
            print(f"OCR fallback failed: {ocr_err}")
            sum_out = "Error"
    return text, sum_out
# Gradio UI: a URL box and page selector, buttons to preview or summarize
# the PDF, an HTML pane for the embedded viewer and two text outputs.
with gr.Blocks() as app:
    gr.Markdown('''<h1>PDF Viewer</h1>''')
    with gr.Row():
        inp = gr.Textbox(label="PDF URL", scale=3)
        pg_num = gr.Number(label="Page Number", value=1, precision=0, scale=1)
    with gr.Row():
        go_btn = gr.Button("Load PDF")
        sum_btn = gr.Button("Summarize")
        target_lang = gr.Dropdown(
            label="PDF Language",
            choices=list(ocr_id.keys()),
            value="English",
        )
    outp = gr.HTML()
    with gr.Row():
        text_out = gr.Textbox()
        sum_out = gr.Textbox()
    # "Load PDF" only embeds the viewer; "Summarize" downloads the file,
    # extracts the chosen page and fills both text boxes.
    go_btn.click(scrape, inp, outp)
    sum_btn.click(scrape00, [inp, pg_num, target_lang], [text_out, sum_out])
app.queue(concurrency_count=10).launch()