Spaces:
Running
Running
File size: 3,755 Bytes
3919e25 2e2a7b2 4ffb3fe 9774c1c a923971 3919e25 2d1281f a923971 d599b56 d02b2ab a923971 9774c1c 2e2a7b2 a923971 e86a2c5 2e2a7b2 b7f89cc 2e2a7b2 1c5b68c fb04ca9 9774c1c fb04ca9 1e634f8 8f70505 502b110 d599b56 d846da3 9c7f619 9774c1c 563ca5d 1e634f8 ba73e05 9774c1c 3919e25 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import gradio as gr
import requests
from pypdf import PdfReader
import pypdfium2 as pdfium
import easyocr
# Display name -> language code.  The keys populate the "PDF Language"
# dropdown and the selected value is handed to ``easyocr.Reader``.
ocr_id = {
    "Afrikaans": "af",
    "Albanian": "sq",
    "Arabic": "ar",
    "Azerbaijani": "az",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Bosnian": "bs",
    "Chinese (simplified)": "ch_sim",
    "Chinese (traditional)": "ch_tra",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Estonian": "et",
    "French": "fr",
    "German": "de",
    "Irish": "ga",
    "Hindi": "hi",
    "Hungarian": "hu",
    "Indonesian": "id",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Kannada": "kn",
    "Korean": "ko",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Nepali": "ne",
    "Norwegian": "no",
    "Occitan": "oc",
    "Polish": "pl",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
    "Serbian (cyrillic)": "rs_cyrillic",
    "Serbian (latin)": "rs_latin",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Spanish": "es",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Thai": "th",
    "Tagalog": "tl",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Welsh": "cy",
    "Zulu": "zu",
}
def pdf_pil(file_path, page_num):
    """Render a single page of a PDF and return the rendered image.

    Args:
        file_path: Location of the PDF; it is formatted into a string before
            being handed to ``pdfium.PdfDocument``.
        page_num: Page selector passed straight to ``PdfDocument.get_page``
            (presumably a zero-based index, per pypdfium2 -- confirm).

    Returns:
        The object produced by ``page.render_topil`` for the requested page.
    """
    document = pdfium.PdfDocument(f"{file_path}")
    selected_page = document.get_page(page_num)

    # Rendering settings: native scale, no rotation or cropping, opaque white
    # background, annotations drawn, full colour, no optimisation mode.
    render_options = {
        "scale": 1,
        "rotation": 0,
        "crop": (0, 0, 0, 0),
        "colour": (255, 255, 255, 255),
        "annotations": True,
        "greyscale": False,
        "optimise_mode": pdfium.OptimiseMode.NONE,
    }
    return selected_page.render_topil(**render_options)
def ocrpdf(file_path, pdf_lang, page_num):
    """Run EasyOCR on one rendered PDF page.

    Args:
        file_path: Path of the PDF; forwarded to ``pdf_pil``.
        pdf_lang: Language display name; must be a key of ``ocr_id``.
        page_num: Page selector, forwarded unchanged to ``pdf_pil``.

    Returns:
        A list with the text of every region EasyOCR detected, in the order
        reported by ``readtext``.  Each entry is also printed, as before.

    Raises:
        KeyError: If ``pdf_lang`` is not a key of ``ocr_id``.
    """
    import numpy as np  # local import: the module header does not import numpy

    page_image = pdf_pil(file_path, page_num)

    # ``readtext`` documents file path / URL, bytes and numpy arrays as valid
    # inputs; a PIL image is not one of them, so hand it an array instead.
    # Three-channel arrays are processed with OpenCV (BGR channel order),
    # hence the channel reversal of the RGB pixels.
    rgb_pixels = np.asarray(page_image.convert("RGB"))
    bgr_pixels = np.ascontiguousarray(rgb_pixels[:, :, ::-1])

    reader = easyocr.Reader([ocr_id[pdf_lang]])
    detections = reader.readtext(bgr_pixels)

    # Each detection is indexed as before: element 1 holds the text.
    recognised = [detection[1] for detection in detections]
    for line in recognised:
        print(line)
    return recognised
def scrape(instring):
    """Build the embedded Google Docs viewer markup for a PDF URL.

    Args:
        instring: URL of the PDF, as typed by the user.

    Returns:
        The ``gr.HTML.update`` payload carrying the viewer ``<iframe>``.
    """
    from urllib.parse import quote

    # The URL is user input that ends up inside a query string inside an HTML
    # attribute.  Percent-encoding every reserved character keeps "&" / "#"
    # in the PDF URL from being read as viewer parameters and keeps quotes or
    # angle brackets from breaking out of the attribute.
    pdf_url = quote((instring or "").strip(), safe="")
    html_src = f'''
<div style="text-align:center">
<h4>PDF Viewer</h4>
<iframe src="https://docs.google.com/viewer?url={pdf_url}&embedded=true" frameborder="0" height="1200px" width="100%"></iframe>
</div>'''
    return gr.HTML.update(html_src)
def scrape00(instring, page_num, pdf_lang):
    """Download a PDF, extract the text of one page and summarise it.

    Args:
        instring: URL of the PDF to download.
        page_num: 1-based number of the page to read.
        pdf_lang: Key of ``ocr_id`` selecting the language for the OCR pass
            that runs when summarisation fails.

    Returns:
        A ``(text, sum_out)`` tuple: the extracted page text and its summary.
        If the download fails or the page number is invalid, ``text`` is an
        explanatory message and ``sum_out`` is ``"Error"``.  If only the
        summariser fails, ``text`` is the extracted text and ``sum_out`` is
        ``"Error"``.
    """
    import os
    import tempfile

    try:
        # A timeout keeps a stalled server from blocking a queue worker.
        response = requests.get(instring, timeout=60)
    except requests.RequestException as err:
        print(err)
        return f"Could not download PDF: {err}", "Error"
    if response.status_code != 200:
        print(response.status_code)
        # Stop here: there is no freshly downloaded document to read.
        return f"Could not download PDF (HTTP status {response.status_code})", "Error"

    # The app queue runs several handlers at once, so every call works on its
    # own temporary copy instead of a shared "data.pdf" in the working dir.
    with tempfile.TemporaryDirectory() as work_dir:
        pdf_path = os.path.join(work_dir, "data.pdf")
        with open(pdf_path, "wb") as f:
            f.write(response.content)

        reader = PdfReader(pdf_path)
        number_of_pages = len(reader.pages)
        try:
            page_index = int(page_num) - 1
        except (TypeError, ValueError):
            return f"Invalid page number: {page_num!r}", "Error"
        # Reject 0 / negatives explicitly; they would otherwise index from the
        # end of the page list and silently return the wrong page.
        if not 0 <= page_index < number_of_pages:
            return (
                f"Page {page_num} is out of range (document has {number_of_pages} pages)",
                "Error",
            )

        text = reader.pages[page_index].extract_text()
        print(text)
        try:
            summarizer = gr.Interface.load("huggingface/facebook/bart-large-cnn")
            sum_out = summarizer(text)
        except Exception as err:
            print(err)
            sum_out = "Error"
            # Summarisation failed: run the OCR pass on the same page.
            # ``ocrpdf`` hands the page selector to pypdfium2's ``get_page``,
            # which is zero-based, so pass the index rather than the 1-based
            # page number.
            try:
                ocrpdf(pdf_path, pdf_lang, page_index)
            except Exception as ocr_err:
                # OCR is best-effort; never let it crash the handler.
                print(ocr_err)
    return text, sum_out
# UI definition: URL + page number inputs, action buttons with the OCR
# language selector, the embedded viewer, and the text / summary outputs.
with gr.Blocks() as app:
    gr.Markdown('''<h1>PDF Viewer''')
    with gr.Row():
        inp=gr.Textbox(label="PDF URL",scale=3)
        pg_num=gr.Number(label="Page Number",value=1,precision=0,scale=1)
    with gr.Row():
        go_btn = gr.Button("Load PDF")
        sum_btn = gr.Button("Summarize")
        target_lang = gr.Dropdown(label="PDF Language", choices=list(ocr_id.keys()),value="English")
    outp = gr.HTML()
    with gr.Row():
        text_out = gr.Textbox()
        sum_out = gr.Textbox()
    # "Load PDF" shows the document in the viewer; "Summarize" extracts and
    # summarises the selected page.
    go_btn.click(scrape,inp,outp)
    sum_btn.click(scrape00,[inp,pg_num,target_lang],[text_out,sum_out])
# Allow up to 10 queued events to be processed concurrently, then serve.
app.queue(concurrency_count=10).launch()