myhloli committed on
Commit
a1d901d
·
verified ·
1 Parent(s): 2e3e846

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -6
app.py CHANGED
@@ -41,13 +41,16 @@ def read_fn(path):
41
 
42
 
43
  # @spaces.GPU
44
- def parse_pdf(doc_path, output_dir, end_page_id):
45
  os.makedirs(output_dir, exist_ok=True)
46
 
47
  try:
48
  file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
49
  pdf_data = read_fn(doc_path)
50
- parse_method = "auto"
 
 
 
51
  local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
52
  do_parse(
53
  output_dir,
@@ -108,9 +111,9 @@ def replace_image_with_base64(markdown_text, image_dir_path):
108
  return re.sub(pattern, replace, markdown_text)
109
 
110
 
111
- def to_markdown(file_path, end_pages):
112
  # θŽ·ε–θ―†εˆ«ηš„mdζ–‡δ»Άδ»₯εŠεŽ‹ηΌ©εŒ…ζ–‡δ»Άθ·―εΎ„
113
- local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1)
114
  archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
115
  zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
116
  if zip_archive_success == 0:
@@ -177,6 +180,7 @@ if __name__ == "__main__":
177
  pdf_show = gr.Markdown()
178
  max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
179
  with gr.Row() as bu_flow:
 
180
  change_bu = gr.Button("Convert")
181
  clear_bu = gr.ClearButton([pdf_show], value="Clear")
182
  # pdf_show = gr.HTML(label="PDF preview")
@@ -191,7 +195,7 @@ if __name__ == "__main__":
191
  with gr.Tab("Markdown text"):
192
  md_text = gr.TextArea(lines=45, show_copy_button=True)
193
  # file.upload(fn=show_pdf, inputs=file, outputs=pdf_show)
194
- change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages], outputs=[md, md_text, output_file, pdf_show])
195
- clear_bu.add([md, pdf_show, md_text, output_file])
196
 
197
  demo.launch()
 
41
 
42
 
43
  # @spaces.GPU
44
+ def parse_pdf(doc_path, output_dir, end_page_id, ocr):
45
  os.makedirs(output_dir, exist_ok=True)
46
 
47
  try:
48
  file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
49
  pdf_data = read_fn(doc_path)
50
+ if ocr:
51
+ parse_method = "ocr"
52
+ else:
53
+ parse_method = "auto"
54
  local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
55
  do_parse(
56
  output_dir,
 
111
  return re.sub(pattern, replace, markdown_text)
112
 
113
 
114
+ def to_markdown(file_path, end_pages, ocr):
115
  # θŽ·ε–θ―†εˆ«ηš„mdζ–‡δ»Άδ»₯εŠεŽ‹ηΌ©εŒ…ζ–‡δ»Άθ·―εΎ„
116
+ local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, ocr)
117
  archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
118
  zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
119
  if zip_archive_success == 0:
 
180
  pdf_show = gr.Markdown()
181
  max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
182
  with gr.Row() as bu_flow:
183
+ is_ocr = gr.Checkbox(label="Force enable OCR")
184
  change_bu = gr.Button("Convert")
185
  clear_bu = gr.ClearButton([pdf_show], value="Clear")
186
  # pdf_show = gr.HTML(label="PDF preview")
 
195
  with gr.Tab("Markdown text"):
196
  md_text = gr.TextArea(lines=45, show_copy_button=True)
197
  # file.upload(fn=show_pdf, inputs=file, outputs=pdf_show)
198
+ change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr], outputs=[md, md_text, output_file, pdf_show])
199
+ clear_bu.add([md, pdf_show, md_text, output_file, is_ocr])
200
 
201
  demo.launch()