Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -63,12 +63,12 @@ ocr_id = {
|
|
63 |
"Zulu": "zu",
|
64 |
}
|
65 |
|
66 |
-
def pdf_pil(file_path,page_num):
|
67 |
|
68 |
pdf = pdfium.PdfDocument("data.pdf")
|
69 |
page = pdf.get_page(int(page_num)-1)
|
70 |
bitmap = page.render(
|
71 |
-
scale =
|
72 |
rotation = 0, # no additional rotation
|
73 |
# ... further rendering options
|
74 |
)
|
@@ -77,8 +77,8 @@ def pdf_pil(file_path,page_num):
|
|
77 |
|
78 |
return (f"image_{page_num}.png")
|
79 |
|
80 |
-
def ocrpdf(file_path,pdf_lang,page_num,sent_wid,contrast_det):
|
81 |
-
img1 = pdf_pil(file_path,page_num)
|
82 |
lang=[f"{ocr_id[pdf_lang]}"]
|
83 |
reader = easyocr.Reader(lang)
|
84 |
bounds = reader.readtext(img1,width_ths=sent_wid,contrast_ths=contrast_det)
|
@@ -97,7 +97,9 @@ def scrape(instring):
|
|
97 |
</div>''')
|
98 |
return gr.HTML.update(f'''{html_src}''')
|
99 |
|
100 |
-
def scrape00(instring, page_num,pdf_lang,sent_wid,contrast_det):
|
|
|
|
|
101 |
response = requests.get(instring, stream=True)
|
102 |
|
103 |
if response.status_code == 200:
|
@@ -114,16 +116,19 @@ def scrape00(instring, page_num,pdf_lang,sent_wid,contrast_det):
|
|
114 |
page = reader.pages[int(page_num)-1]
|
115 |
text = page.extract_text()
|
116 |
print (text)
|
|
|
117 |
try:
|
118 |
-
summarizer = gr.Interface.load("huggingface/facebook/bart-large-cnn")
|
119 |
sum_out = summarizer(text)
|
120 |
except Exception:
|
121 |
try:
|
122 |
-
|
|
|
123 |
sum_out = summarizer(text)
|
124 |
except Exception:
|
125 |
sum_out = "Error"
|
126 |
-
|
|
|
|
|
127 |
|
128 |
with gr.Blocks() as app:
|
129 |
gr.Markdown('''<h1>PDF Viewer''')
|
@@ -139,6 +144,7 @@ with gr.Blocks() as app:
|
|
139 |
with gr.Box():
|
140 |
sent_wid=gr.Slider(0.1, 3, step=0.1,value=1,label="Horizontal Word Space")
|
141 |
contrast_det=gr.Slider(0.1, 1, step=0.1,value=0.1,label="Contrast Threshold")
|
|
|
142 |
with gr.Column():
|
143 |
target_lang = gr.Dropdown(label="PDF Language", choices=list(ocr_id.keys()),value="English")
|
144 |
sum_btn = gr.Button("Summarize")
|
@@ -146,5 +152,5 @@ with gr.Blocks() as app:
|
|
146 |
text_out = gr.Textbox()
|
147 |
sum_out = gr.Textbox()
|
148 |
go_btn.click(scrape,inp,outp)
|
149 |
-
sum_btn.click(scrape00,[inp,pg_num,target_lang,sent_wid,contrast_det],[text_out,sum_out])
|
150 |
app.queue(concurrency_count=10).launch()
|
|
|
63 |
"Zulu": "zu",
|
64 |
}
|
65 |
|
66 |
+
def pdf_pil(file_path,page_num,up_scale):
|
67 |
|
68 |
pdf = pdfium.PdfDocument("data.pdf")
|
69 |
page = pdf.get_page(int(page_num)-1)
|
70 |
bitmap = page.render(
|
71 |
+
scale = int(up_scale), # 72dpi resolution
|
72 |
rotation = 0, # no additional rotation
|
73 |
# ... further rendering options
|
74 |
)
|
|
|
77 |
|
78 |
return (f"image_{page_num}.png")
|
79 |
|
80 |
+
def ocrpdf(file_path,pdf_lang,page_num,sent_wid,contrast_det,up_scale):
|
81 |
+
img1 = pdf_pil(file_path,page_num,up_scale)
|
82 |
lang=[f"{ocr_id[pdf_lang]}"]
|
83 |
reader = easyocr.Reader(lang)
|
84 |
bounds = reader.readtext(img1,width_ths=sent_wid,contrast_ths=contrast_det)
|
|
|
97 |
</div>''')
|
98 |
return gr.HTML.update(f'''{html_src}''')
|
99 |
|
100 |
+
def scrape00(instring, page_num,pdf_lang,sent_wid,contrast_det,up_scale):
|
101 |
+
yield (None,None,gr.Markdown.update("""<h3> Trying Native Text Detection"""))
|
102 |
+
|
103 |
response = requests.get(instring, stream=True)
|
104 |
|
105 |
if response.status_code == 200:
|
|
|
116 |
page = reader.pages[int(page_num)-1]
|
117 |
text = page.extract_text()
|
118 |
print (text)
|
119 |
+
summarizer = gr.Interface.load("huggingface/facebook/bart-large-cnn")
|
120 |
try:
|
|
|
121 |
sum_out = summarizer(text)
|
122 |
except Exception:
|
123 |
try:
|
124 |
+
yield (None,None,gr.Markdown.update("""<h3> Trying OCR Text Detection"""))
|
125 |
+
text = ocrpdf("data.pdf",pdf_lang,page_num,sent_wid,contrast_det,up_scale)
|
126 |
sum_out = summarizer(text)
|
127 |
except Exception:
|
128 |
sum_out = "Error"
|
129 |
+
yield (None,None,gr.Markdown.update("""<h3> Error"""))
|
130 |
+
|
131 |
+
return text, sum_out,gr.Markdown.update("""<h3> Complete"""))
|
132 |
|
133 |
with gr.Blocks() as app:
|
134 |
gr.Markdown('''<h1>PDF Viewer''')
|
|
|
144 |
with gr.Box():
|
145 |
sent_wid=gr.Slider(0.1, 3, step=0.1,value=1,label="Horizontal Word Space")
|
146 |
contrast_det=gr.Slider(0.1, 1, step=0.1,value=0.1,label="Contrast Threshold")
|
147 |
+
up_scale=gr.Slider(0.1, 5, step=0.1,value=1,label="PDF to Image Scale")
|
148 |
with gr.Column():
|
149 |
target_lang = gr.Dropdown(label="PDF Language", choices=list(ocr_id.keys()),value="English")
|
150 |
sum_btn = gr.Button("Summarize")
|
|
|
152 |
text_out = gr.Textbox()
|
153 |
sum_out = gr.Textbox()
|
154 |
go_btn.click(scrape,inp,outp)
|
155 |
+
sum_btn.click(scrape00,[inp,pg_num,target_lang,sent_wid,contrast_det,up_scale],[text_out,sum_out])
|
156 |
app.queue(concurrency_count=10).launch()
|