Omnibus committed on
Commit
456846b
·
1 Parent(s): bdb82f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -9
app.py CHANGED
@@ -63,12 +63,12 @@ ocr_id = {
63
  "Zulu": "zu",
64
  }
65
 
66
- def pdf_pil(file_path,page_num):
67
 
68
  pdf = pdfium.PdfDocument("data.pdf")
69
  page = pdf.get_page(int(page_num)-1)
70
  bitmap = page.render(
71
- scale = 3, # 72dpi resolution
72
  rotation = 0, # no additional rotation
73
  # ... further rendering options
74
  )
@@ -77,8 +77,8 @@ def pdf_pil(file_path,page_num):
77
 
78
  return (f"image_{page_num}.png")
79
 
80
- def ocrpdf(file_path,pdf_lang,page_num,sent_wid,contrast_det):
81
- img1 = pdf_pil(file_path,page_num)
82
  lang=[f"{ocr_id[pdf_lang]}"]
83
  reader = easyocr.Reader(lang)
84
  bounds = reader.readtext(img1,width_ths=sent_wid,contrast_ths=contrast_det)
@@ -97,7 +97,9 @@ def scrape(instring):
97
  </div>''')
98
  return gr.HTML.update(f'''{html_src}''')
99
 
100
- def scrape00(instring, page_num,pdf_lang,sent_wid,contrast_det):
 
 
101
  response = requests.get(instring, stream=True)
102
 
103
  if response.status_code == 200:
@@ -114,16 +116,19 @@ def scrape00(instring, page_num,pdf_lang,sent_wid,contrast_det):
114
  page = reader.pages[int(page_num)-1]
115
  text = page.extract_text()
116
  print (text)
 
117
  try:
118
- summarizer = gr.Interface.load("huggingface/facebook/bart-large-cnn")
119
  sum_out = summarizer(text)
120
  except Exception:
121
  try:
122
- text = ocrpdf("data.pdf",pdf_lang,page_num,sent_wid,contrast_det)
 
123
  sum_out = summarizer(text)
124
  except Exception:
125
  sum_out = "Error"
126
- return text, sum_out
 
 
127
 
128
  with gr.Blocks() as app:
129
  gr.Markdown('''<h1>PDF Viewer''')
@@ -139,6 +144,7 @@ with gr.Blocks() as app:
139
  with gr.Box():
140
  sent_wid=gr.Slider(0.1, 3, step=0.1,value=1,label="Horizontal Word Space")
141
  contrast_det=gr.Slider(0.1, 1, step=0.1,value=0.1,label="Contrast Threshold")
 
142
  with gr.Column():
143
  target_lang = gr.Dropdown(label="PDF Language", choices=list(ocr_id.keys()),value="English")
144
  sum_btn = gr.Button("Summarize")
@@ -146,5 +152,5 @@ with gr.Blocks() as app:
146
  text_out = gr.Textbox()
147
  sum_out = gr.Textbox()
148
  go_btn.click(scrape,inp,outp)
149
- sum_btn.click(scrape00,[inp,pg_num,target_lang,sent_wid,contrast_det],[text_out,sum_out])
150
  app.queue(concurrency_count=10).launch()
 
63
  "Zulu": "zu",
64
  }
65
 
66
+ def pdf_pil(file_path,page_num,up_scale):
67
 
68
  pdf = pdfium.PdfDocument("data.pdf")
69
  page = pdf.get_page(int(page_num)-1)
70
  bitmap = page.render(
71
+ scale = int(up_scale), # 72dpi resolution
72
  rotation = 0, # no additional rotation
73
  # ... further rendering options
74
  )
 
77
 
78
  return (f"image_{page_num}.png")
79
 
80
+ def ocrpdf(file_path,pdf_lang,page_num,sent_wid,contrast_det,up_scale):
81
+ img1 = pdf_pil(file_path,page_num,up_scale)
82
  lang=[f"{ocr_id[pdf_lang]}"]
83
  reader = easyocr.Reader(lang)
84
  bounds = reader.readtext(img1,width_ths=sent_wid,contrast_ths=contrast_det)
 
97
  </div>''')
98
  return gr.HTML.update(f'''{html_src}''')
99
 
100
+ def scrape00(instring, page_num,pdf_lang,sent_wid,contrast_det,up_scale):
101
+ yield (None,None,gr.Markdown.update("""<h3> Trying Native Text Detection"""))
102
+
103
  response = requests.get(instring, stream=True)
104
 
105
  if response.status_code == 200:
 
116
  page = reader.pages[int(page_num)-1]
117
  text = page.extract_text()
118
  print (text)
119
+ summarizer = gr.Interface.load("huggingface/facebook/bart-large-cnn")
120
  try:
 
121
  sum_out = summarizer(text)
122
  except Exception:
123
  try:
124
+ yield (None,None,gr.Markdown.update("""<h3> Trying OCR Text Detection"""))
125
+ text = ocrpdf("data.pdf",pdf_lang,page_num,sent_wid,contrast_det,up_scale)
126
  sum_out = summarizer(text)
127
  except Exception:
128
  sum_out = "Error"
129
+ yield (None,None,gr.Markdown.update("""<h3> Error"""))
130
+
131
+ return text, sum_out,gr.Markdown.update("""<h3> Complete"""))
132
 
133
  with gr.Blocks() as app:
134
  gr.Markdown('''<h1>PDF Viewer''')
 
144
  with gr.Box():
145
  sent_wid=gr.Slider(0.1, 3, step=0.1,value=1,label="Horizontal Word Space")
146
  contrast_det=gr.Slider(0.1, 1, step=0.1,value=0.1,label="Contrast Threshold")
147
+ up_scale=gr.Slider(0.1, 5, step=0.1,value=1,label="PDF to Image Scale")
148
  with gr.Column():
149
  target_lang = gr.Dropdown(label="PDF Language", choices=list(ocr_id.keys()),value="English")
150
  sum_btn = gr.Button("Summarize")
 
152
  text_out = gr.Textbox()
153
  sum_out = gr.Textbox()
154
  go_btn.click(scrape,inp,outp)
155
+ sum_btn.click(scrape00,[inp,pg_num,target_lang,sent_wid,contrast_det,up_scale],[text_out,sum_out])
156
  app.queue(concurrency_count=10).launch()