Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -41,13 +41,16 @@ def read_fn(path):
|
|
41 |
|
42 |
|
43 |
# @spaces.GPU
|
44 |
-
def parse_pdf(doc_path, output_dir, end_page_id):
|
45 |
os.makedirs(output_dir, exist_ok=True)
|
46 |
|
47 |
try:
|
48 |
file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
|
49 |
pdf_data = read_fn(doc_path)
|
50 |
-
|
|
|
|
|
|
|
51 |
local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
|
52 |
do_parse(
|
53 |
output_dir,
|
@@ -108,9 +111,9 @@ def replace_image_with_base64(markdown_text, image_dir_path):
|
|
108 |
return re.sub(pattern, replace, markdown_text)
|
109 |
|
110 |
|
111 |
-
def to_markdown(file_path, end_pages):
|
112 |
# Get the paths of the recognized Markdown file and the zip archive
|
113 |
-
local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1)
|
114 |
archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
|
115 |
zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
|
116 |
if zip_archive_success == 0:
|
@@ -177,6 +180,7 @@ if __name__ == "__main__":
|
|
177 |
pdf_show = gr.Markdown()
|
178 |
max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
|
179 |
with gr.Row() as bu_flow:
|
|
|
180 |
change_bu = gr.Button("Convert")
|
181 |
clear_bu = gr.ClearButton([pdf_show], value="Clear")
|
182 |
# pdf_show = gr.HTML(label="PDF preview")
|
@@ -191,7 +195,7 @@ if __name__ == "__main__":
|
|
191 |
with gr.Tab("Markdown text"):
|
192 |
md_text = gr.TextArea(lines=45, show_copy_button=True)
|
193 |
# file.upload(fn=show_pdf, inputs=file, outputs=pdf_show)
|
194 |
-
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages], outputs=[md, md_text, output_file, pdf_show])
|
195 |
-
clear_bu.add([md, pdf_show, md_text, output_file])
|
196 |
|
197 |
demo.launch()
|
|
|
41 |
|
42 |
|
43 |
# @spaces.GPU
|
44 |
+
def parse_pdf(doc_path, output_dir, end_page_id, ocr):
|
45 |
os.makedirs(output_dir, exist_ok=True)
|
46 |
|
47 |
try:
|
48 |
file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
|
49 |
pdf_data = read_fn(doc_path)
|
50 |
+
if ocr:
|
51 |
+
parse_method = "ocr"
|
52 |
+
else:
|
53 |
+
parse_method = "auto"
|
54 |
local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
|
55 |
do_parse(
|
56 |
output_dir,
|
|
|
111 |
return re.sub(pattern, replace, markdown_text)
|
112 |
|
113 |
|
114 |
+
def to_markdown(file_path, end_pages, ocr):
|
115 |
# Get the paths of the recognized Markdown file and the zip archive
|
116 |
+
local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, ocr)
|
117 |
archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
|
118 |
zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
|
119 |
if zip_archive_success == 0:
|
|
|
180 |
pdf_show = gr.Markdown()
|
181 |
max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
|
182 |
with gr.Row() as bu_flow:
|
183 |
+
is_ocr = gr.Checkbox(label="Force enable OCR")
|
184 |
change_bu = gr.Button("Convert")
|
185 |
clear_bu = gr.ClearButton([pdf_show], value="Clear")
|
186 |
# pdf_show = gr.HTML(label="PDF preview")
|
|
|
195 |
with gr.Tab("Markdown text"):
|
196 |
md_text = gr.TextArea(lines=45, show_copy_button=True)
|
197 |
# file.upload(fn=show_pdf, inputs=file, outputs=pdf_show)
|
198 |
+
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr], outputs=[md, md_text, output_file, pdf_show])
|
199 |
+
clear_bu.add([md, pdf_show, md_text, output_file, is_ocr])
|
200 |
|
201 |
demo.launch()
|