Update app.py
Browse files
app.py
CHANGED
@@ -35,6 +35,132 @@ from magic_pdf.data.data_reader_writer import FileBasedDataReader
|
|
35 |
from magic_pdf.libs.hash_utils import compute_sha256
|
36 |
from magic_pdf.tools.common import do_parse, prepare_env
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
def read_fn(path):
|
39 |
disk_rw = FileBasedDataReader(os.path.dirname(path))
|
40 |
return disk_rw.read(os.path.basename(path))
|
@@ -113,6 +239,18 @@ def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table
|
|
113 |
new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
|
114 |
return md_content, txt_content, archive_zip_path, new_pdf_path
|
115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
|
117 |
{"left": '$', "right": '$', "display": False}]
|
118 |
|
@@ -152,55 +290,143 @@ other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
|
|
152 |
all_lang = ['', 'auto']
|
153 |
all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
|
154 |
|
155 |
-
def to_pdf(file_path):
|
156 |
-
with pymupdf.open(file_path) as f:
|
157 |
-
if f.is_pdf:
|
158 |
-
return file_path
|
159 |
-
else:
|
160 |
-
pdf_bytes = f.convert_to_pdf()
|
161 |
-
unique_filename = f"{uuid.uuid4()}.pdf"
|
162 |
-
tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
|
163 |
-
with open(tmp_file_path, 'wb') as tmp_pdf_file:
|
164 |
-
tmp_pdf_file.write(pdf_bytes)
|
165 |
-
return tmp_file_path
|
166 |
-
|
167 |
if __name__ == "__main__":
|
168 |
-
with gr.Blocks(title="OCR FLEX") as demo:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
with gr.Row():
|
|
|
170 |
with gr.Column(variant='panel', scale=5):
|
171 |
-
file = gr.File(
|
172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
with gr.Row():
|
174 |
-
layout_mode = gr.Dropdown(
|
175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
with gr.Row():
|
177 |
-
formula_enable = gr.Checkbox(
|
178 |
-
|
179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
with gr.Row():
|
181 |
-
change_bu = gr.Button(
|
182 |
-
|
183 |
-
|
184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
example_root = os.path.join(os.path.dirname(__file__), "examples")
|
186 |
gr.Examples(
|
187 |
examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
|
188 |
_.endswith("pdf")],
|
189 |
-
inputs=file
|
|
|
190 |
)
|
191 |
|
|
|
192 |
with gr.Column(variant='panel', scale=5):
|
193 |
-
output_file = gr.File(
|
194 |
-
|
|
|
|
|
|
|
|
|
|
|
195 |
with gr.Tab("๋งํฌ๋ค์ด ๋ ๋๋ง"):
|
196 |
-
md = gr.Markdown(
|
197 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
198 |
with gr.Tab("๋งํฌ๋ค์ด ํ
์คํธ"):
|
199 |
-
md_text = gr.TextArea(
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
204 |
clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])
|
205 |
|
206 |
-
|
|
|
|
35 |
from magic_pdf.libs.hash_utils import compute_sha256
|
36 |
from magic_pdf.tools.common import do_parse, prepare_env
|
37 |
|
38 |
+
def create_css():
    """Return the custom stylesheet handed to ``gr.Blocks(css=...)``.

    The sheet styles the page container, the title banner, panels, the
    upload drop zone, primary/secondary buttons, sliders, checkboxes, tabs
    and the Markdown output area. Several selectors (``.title-area``,
    ``.file-upload``, ``.primary-button``, ``.secondary-button``,
    ``.markdown-output``) match class names that the UI attaches through
    ``elem_classes``.

    Returns:
        str: The CSS source. Comments and whitespace inside it carry no
        meaning for the browser; only selectors and declarations matter.
    """
    return """
    /* 전체 스타일 */
    .gradio-container {
        background: linear-gradient(135deg, #EFF6FF 0%, #F5F3FF 100%);
        max-width: 1200px !important;
        margin: 0 auto !important;
        padding: 2rem !important;
    }

    /* 제목 스타일 */
    .title-area {
        text-align: center;
        margin-bottom: 2rem;
        padding: 1rem;
        background: white;
        border-radius: 1rem;
        box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
    }

    .title-area h1 {
        background: linear-gradient(90deg, #2563EB 0%, #7C3AED 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        font-size: 2.5rem;
        font-weight: bold;
        margin-bottom: 0.5rem;
    }

    .title-area p {
        color: #6B7280;
        font-size: 1.1rem;
    }

    /* 컴포넌트 스타일링 */
    .gr-box, .gr-panel {
        border: 2px solid #E0E7FF !important;
        border-radius: 12px !important;
        box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1) !important;
        background: white !important;
    }

    /* 파일 업로드 영역 */
    .file-upload {
        border: 2px dashed #93C5FD !important;
        border-radius: 8px !important;
        padding: 2rem !important;
        background: #F0F9FF !important;
        transition: all 0.3s ease;
    }

    .file-upload:hover {
        background: #E0F2FE !important;
        border-color: #60A5FA !important;
    }

    /* 버튼 스타일링 */
    .gr-button.primary-button {
        background: linear-gradient(90deg, #2563EB 0%, #7C3AED 100%) !important;
        color: white !important;
        border: none !important;
        border-radius: 8px !important;
        padding: 0.75rem 1.5rem !important;
        font-weight: bold !important;
        transition: opacity 0.2s !important;
    }

    .gr-button.primary-button:hover {
        opacity: 0.9 !important;
    }

    .gr-button.secondary-button {
        background: white !important;
        color: #4B5563 !important;
        border: 1px solid #D1D5DB !important;
        border-radius: 8px !important;
        padding: 0.75rem 1.5rem !important;
    }

    .gr-button.secondary-button:hover {
        background: #F9FAFB !important;
    }

    /* 슬라이더 스타일링 */
    .gr-slider {
        background: #E0E7FF !important;
    }

    .gr-slider .gr-slider-handle {
        background: #4F46E5 !important;
    }

    /* 체크박스 스타일링 */
    .gr-checkbox {
        border-color: #6366F1 !important;
    }

    .gr-checkbox:checked {
        background-color: #4F46E5 !important;
    }

    /* 탭 스타일링 */
    .gr-tabs {
        border-bottom: 2px solid #E0E7FF !important;
    }

    .gr-tab-button {
        color: #6B7280 !important;
        padding: 0.75rem 1rem !important;
        font-weight: 500 !important;
    }

    .gr-tab-button.selected {
        color: #4F46E5 !important;
        border-bottom: 2px solid #4F46E5 !important;
    }

    /* 마크다운 출력 영역 */
    .markdown-output {
        background: white !important;
        border-radius: 8px !important;
        padding: 1rem !important;
        box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.05) !important;
    }
    """
|
163 |
+
|
164 |
def read_fn(path):
    """Read the file at *path* through a ``FileBasedDataReader``.

    The path is split into its directory, which seeds the reader, and its
    base name, which is the key passed to the reader's ``read`` call.
    Whatever ``read`` returns is returned unchanged.
    """
    parent_dir, file_name = os.path.split(path)
    return FileBasedDataReader(parent_dir).read(file_name)
|
|
|
239 |
new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
|
240 |
return md_content, txt_content, archive_zip_path, new_pdf_path
|
241 |
|
242 |
+
def to_pdf(file_path):
    """Return a path to a PDF version of *file_path*.

    A file that PyMuPDF reports as a PDF is returned as-is. Any other
    document is converted with ``convert_to_pdf()`` and the result is written
    next to the original under a random ``<uuid4>.pdf`` name.

    Args:
        file_path: Path of the file to preview, or ``None`` when there is no
            file.

    Returns:
        ``None`` when *file_path* is ``None``; the unchanged *file_path* for
        an existing PDF; otherwise the path of the newly written PDF.
    """
    # This function is wired to the upload widget's change event.
    # NOTE(review): that event presumably also fires with None when the
    # upload is cleared — confirm against the Gradio version in use. Either
    # way, never hand None to pymupdf.open().
    if file_path is None:
        return None

    with pymupdf.open(file_path) as doc:
        if doc.is_pdf:
            return file_path
        pdf_bytes = doc.convert_to_pdf()

    # The source document is closed at this point; only the bytes are needed.
    converted_path = os.path.join(os.path.dirname(file_path), f"{uuid.uuid4()}.pdf")
    with open(converted_path, 'wb') as converted_file:
        converted_file.write(pdf_bytes)
    return converted_path
|
253 |
+
|
254 |
# Delimiter pairs handed to gr.Markdown(latex_delimiters=...):
# "$$ ... $$" is flagged display=True, "$ ... $" is flagged display=False.
latex_delimiters = [
    dict(left="$$", right="$$", display=True),
    dict(left="$", right="$", display=False),
]
|
256 |
|
|
|
290 |
# Choices offered by the language dropdown: the empty string and 'auto'
# first, then every per-script language list in a fixed order.
all_lang = [
    '',
    'auto',
    *other_lang,
    *latin_lang,
    *arabic_lang,
    *cyrillic_lang,
    *devanagari_lang,
]
|
292 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
293 |
if __name__ == "__main__":
|
294 |
+
with gr.Blocks(title="OCR FLEX", css=create_css()) as demo:
|
295 |
+
# ํ์ดํ ์์ญ
|
296 |
+
with gr.Row(elem_classes="title-area"):
|
297 |
+
gr.HTML("""
|
298 |
+
<h1>OCR FLEX</h1>
|
299 |
+
<p>PDF์ ์ด๋ฏธ์ง์์ ํ
์คํธ๋ฅผ ๋น ๋ฅด๊ณ ์ ํํ๊ฒ ์ถ์ถํ์ธ์</p>
|
300 |
+
""")
|
301 |
+
|
302 |
with gr.Row():
|
303 |
+
# ์ผ์ชฝ ํจ๋
|
304 |
with gr.Column(variant='panel', scale=5):
|
305 |
+
file = gr.File(
|
306 |
+
label="PDF ๋๋ ์ด๋ฏธ์ง ํ์ผ์ ์
๋ก๋ํ์ธ์",
|
307 |
+
file_types=[".pdf", ".png", ".jpeg", ".jpg"],
|
308 |
+
elem_classes="file-upload"
|
309 |
+
)
|
310 |
+
|
311 |
+
max_pages = gr.Slider(
|
312 |
+
1, 20, 10,
|
313 |
+
step=1,
|
314 |
+
label='์ต๋ ๋ณํ ํ์ด์ง ์',
|
315 |
+
elem_classes="custom-slider"
|
316 |
+
)
|
317 |
+
|
318 |
with gr.Row():
|
319 |
+
layout_mode = gr.Dropdown(
|
320 |
+
["layoutlmv3", "doclayout_yolo"],
|
321 |
+
label="๋ ์ด์์ ๋ชจ๋ธ",
|
322 |
+
value="doclayout_yolo",
|
323 |
+
elem_classes="custom-dropdown"
|
324 |
+
)
|
325 |
+
language = gr.Dropdown(
|
326 |
+
all_lang,
|
327 |
+
label="์ธ์ด",
|
328 |
+
value='auto',
|
329 |
+
elem_classes="custom-dropdown"
|
330 |
+
)
|
331 |
+
|
332 |
with gr.Row():
|
333 |
+
formula_enable = gr.Checkbox(
|
334 |
+
label="์์ ์ธ์ ํ์ฑํ",
|
335 |
+
value=True,
|
336 |
+
elem_classes="custom-checkbox"
|
337 |
+
)
|
338 |
+
is_ocr = gr.Checkbox(
|
339 |
+
label="OCR ๊ฐ์ ํ์ฑํ",
|
340 |
+
value=False,
|
341 |
+
elem_classes="custom-checkbox"
|
342 |
+
)
|
343 |
+
table_enable = gr.Checkbox(
|
344 |
+
label="ํ ์ธ์ ํ์ฑํ(ํ
์คํธ)",
|
345 |
+
value=True,
|
346 |
+
elem_classes="custom-checkbox"
|
347 |
+
)
|
348 |
+
|
349 |
with gr.Row():
|
350 |
+
change_bu = gr.Button(
|
351 |
+
"๋ณํ",
|
352 |
+
elem_classes="primary-button"
|
353 |
+
)
|
354 |
+
clear_bu = gr.ClearButton(
|
355 |
+
value="์ด๊ธฐํ",
|
356 |
+
elem_classes="secondary-button"
|
357 |
+
)
|
358 |
+
|
359 |
+
pdf_show = PDF(
|
360 |
+
label='PDF ๋ฏธ๋ฆฌ๋ณด๊ธฐ',
|
361 |
+
interactive=False,
|
362 |
+
visible=True,
|
363 |
+
height=800,
|
364 |
+
elem_classes="pdf-preview"
|
365 |
+
)
|
366 |
+
|
367 |
+
with gr.Accordion("์์ :", open=False):
|
368 |
example_root = os.path.join(os.path.dirname(__file__), "examples")
|
369 |
gr.Examples(
|
370 |
examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
|
371 |
_.endswith("pdf")],
|
372 |
+
inputs=file,
|
373 |
+
elem_classes="examples-section"
|
374 |
)
|
375 |
|
376 |
+
# ์ค๋ฅธ์ชฝ ํจ๋
|
377 |
with gr.Column(variant='panel', scale=5):
|
378 |
+
output_file = gr.File(
|
379 |
+
label="๋ณํ ๊ฒฐ๊ณผ",
|
380 |
+
interactive=False,
|
381 |
+
elem_classes="output-file"
|
382 |
+
)
|
383 |
+
|
384 |
+
with gr.Tabs() as tabs:
|
385 |
with gr.Tab("๋งํฌ๋ค์ด ๋ ๋๋ง"):
|
386 |
+
md = gr.Markdown(
|
387 |
+
label="๋งํฌ๋ค์ด ๋ ๋๋ง",
|
388 |
+
height=1100,
|
389 |
+
show_copy_button=True,
|
390 |
+
latex_delimiters=latex_delimiters,
|
391 |
+
line_breaks=True,
|
392 |
+
elem_classes="markdown-output"
|
393 |
+
)
|
394 |
+
|
395 |
with gr.Tab("๋งํฌ๋ค์ด ํ
์คํธ"):
|
396 |
+
md_text = gr.TextArea(
|
397 |
+
lines=45,
|
398 |
+
show_copy_button=True,
|
399 |
+
elem_classes="markdown-text"
|
400 |
+
)
|
401 |
+
|
402 |
+
# ์ด๋ฒคํธ ํธ๋ค๋ฌ
|
403 |
+
file.change(
|
404 |
+
fn=to_pdf,
|
405 |
+
inputs=file,
|
406 |
+
outputs=pdf_show
|
407 |
+
)
|
408 |
+
|
409 |
+
change_bu.click(
|
410 |
+
fn=to_markdown,
|
411 |
+
inputs=[
|
412 |
+
file,
|
413 |
+
max_pages,
|
414 |
+
is_ocr,
|
415 |
+
layout_mode,
|
416 |
+
formula_enable,
|
417 |
+
table_enable,
|
418 |
+
language
|
419 |
+
],
|
420 |
+
outputs=[
|
421 |
+
md,
|
422 |
+
md_text,
|
423 |
+
output_file,
|
424 |
+
pdf_show
|
425 |
+
],
|
426 |
+
api_name=False
|
427 |
+
)
|
428 |
+
|
429 |
clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])
|
430 |
|
431 |
+
# ์ฑ ์คํ
|
432 |
+
demo.launch(ssr_mode=True)
|