openfree committed on
Commit
ed647a9
·
verified ·
1 Parent(s): 0a1b0d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +261 -35
app.py CHANGED
@@ -35,6 +35,132 @@ from magic_pdf.data.data_reader_writer import FileBasedDataReader
35
  from magic_pdf.libs.hash_utils import compute_sha256
36
  from magic_pdf.tools.common import do_parse, prepare_env
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  def read_fn(path):
39
  disk_rw = FileBasedDataReader(os.path.dirname(path))
40
  return disk_rw.read(os.path.basename(path))
@@ -113,6 +239,18 @@ def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table
113
  new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
114
  return md_content, txt_content, archive_zip_path, new_pdf_path
115
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
117
  {"left": '$', "right": '$', "display": False}]
118
 
@@ -152,55 +290,143 @@ other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
152
  all_lang = ['', 'auto']
153
  all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
154
 
155
- def to_pdf(file_path):
156
- with pymupdf.open(file_path) as f:
157
- if f.is_pdf:
158
- return file_path
159
- else:
160
- pdf_bytes = f.convert_to_pdf()
161
- unique_filename = f"{uuid.uuid4()}.pdf"
162
- tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
163
- with open(tmp_file_path, 'wb') as tmp_pdf_file:
164
- tmp_pdf_file.write(pdf_bytes)
165
- return tmp_file_path
166
-
167
  if __name__ == "__main__":
168
- with gr.Blocks(title="OCR FLEX") as demo:
 
 
 
 
 
 
 
169
  with gr.Row():
 
170
  with gr.Column(variant='panel', scale=5):
171
- file = gr.File(label="PDF 또는 이미지 파일을 업로드하세요", file_types=[".pdf", ".png", ".jpeg", ".jpg"])
172
- max_pages = gr.Slider(1, 20, 10, step=1, label='최대 변환 페이지 수')
 
 
 
 
 
 
 
 
 
 
 
173
  with gr.Row():
174
- layout_mode = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], label="레이아웃 모델", value="doclayout_yolo")
175
- language = gr.Dropdown(all_lang, label="언어", value='auto')
 
 
 
 
 
 
 
 
 
 
 
176
  with gr.Row():
177
- formula_enable = gr.Checkbox(label="수식 인식 활성화", value=True)
178
- is_ocr = gr.Checkbox(label="OCR 강제 활성화", value=False)
179
- table_enable = gr.Checkbox(label="표 인식 활성화(테스트)", value=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  with gr.Row():
181
- change_bu = gr.Button("변환")
182
- clear_bu = gr.ClearButton(value="초기화")
183
- pdf_show = PDF(label='PDF 미리보기', interactive=False, visible=True, height=800)
184
- with gr.Accordion("예제:"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  example_root = os.path.join(os.path.dirname(__file__), "examples")
186
  gr.Examples(
187
  examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
188
  _.endswith("pdf")],
189
- inputs=file
 
190
  )
191
 
 
192
  with gr.Column(variant='panel', scale=5):
193
- output_file = gr.File(label="변환 결과", interactive=False)
194
- with gr.Tabs():
 
 
 
 
 
195
  with gr.Tab("마크다운 렌더링"):
196
- md = gr.Markdown(label="마크다운 렌더링", height=1100, show_copy_button=True,
197
- latex_delimiters=latex_delimiters, line_breaks=True)
 
 
 
 
 
 
 
198
  with gr.Tab("마크다운 텍스트"):
199
- md_text = gr.TextArea(lines=45, show_copy_button=True)
200
-
201
- file.change(fn=to_pdf, inputs=file, outputs=pdf_show)
202
- change_bu.click(fn=to_markdown, inputs=[file, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
203
- outputs=[md, md_text, output_file, pdf_show], api_name=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])
205
 
206
- demo.launch(ssr_mode=True)
 
 
35
  from magic_pdf.libs.hash_utils import compute_sha256
36
  from magic_pdf.tools.common import do_parse, prepare_env
37
 
38
+ def create_css():
39
+ return """
40
+ /* 전체 스타일 */
41
+ .gradio-container {
42
+ background: linear-gradient(135deg, #EFF6FF 0%, #F5F3FF 100%);
43
+ max-width: 1200px !important;
44
+ margin: 0 auto !important;
45
+ padding: 2rem !important;
46
+ }
47
+
48
+ /* 제목 스타일 */
49
+ .title-area {
50
+ text-align: center;
51
+ margin-bottom: 2rem;
52
+ padding: 1rem;
53
+ background: white;
54
+ border-radius: 1rem;
55
+ box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
56
+ }
57
+
58
+ .title-area h1 {
59
+ background: linear-gradient(90deg, #2563EB 0%, #7C3AED 100%);
60
+ -webkit-background-clip: text;
61
+ -webkit-text-fill-color: transparent;
62
+ font-size: 2.5rem;
63
+ font-weight: bold;
64
+ margin-bottom: 0.5rem;
65
+ }
66
+
67
+ .title-area p {
68
+ color: #6B7280;
69
+ font-size: 1.1rem;
70
+ }
71
+
72
+ /* 컴포넌트 스타일링 */
73
+ .gr-box, .gr-panel {
74
+ border: 2px solid #E0E7FF !important;
75
+ border-radius: 12px !important;
76
+ box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1) !important;
77
+ background: white !important;
78
+ }
79
+
80
+ /* 파일 업로드 영역 */
81
+ .file-upload {
82
+ border: 2px dashed #93C5FD !important;
83
+ border-radius: 8px !important;
84
+ padding: 2rem !important;
85
+ background: #F0F9FF !important;
86
+ transition: all 0.3s ease;
87
+ }
88
+
89
+ .file-upload:hover {
90
+ background: #E0F2FE !important;
91
+ border-color: #60A5FA !important;
92
+ }
93
+
94
+ /* 버튼 스타일링 */
95
+ .gr-button.primary-button {
96
+ background: linear-gradient(90deg, #2563EB 0%, #7C3AED 100%) !important;
97
+ color: white !important;
98
+ border: none !important;
99
+ border-radius: 8px !important;
100
+ padding: 0.75rem 1.5rem !important;
101
+ font-weight: bold !important;
102
+ transition: opacity 0.2s !important;
103
+ }
104
+
105
+ .gr-button.primary-button:hover {
106
+ opacity: 0.9 !important;
107
+ }
108
+
109
+ .gr-button.secondary-button {
110
+ background: white !important;
111
+ color: #4B5563 !important;
112
+ border: 1px solid #D1D5DB !important;
113
+ border-radius: 8px !important;
114
+ padding: 0.75rem 1.5rem !important;
115
+ }
116
+
117
+ .gr-button.secondary-button:hover {
118
+ background: #F9FAFB !important;
119
+ }
120
+
121
+ /* 슬라이더 스타일링 */
122
+ .gr-slider {
123
+ background: #E0E7FF !important;
124
+ }
125
+
126
+ .gr-slider .gr-slider-handle {
127
+ background: #4F46E5 !important;
128
+ }
129
+
130
+ /* 체크박스 스타일링 */
131
+ .gr-checkbox {
132
+ border-color: #6366F1 !important;
133
+ }
134
+
135
+ .gr-checkbox:checked {
136
+ background-color: #4F46E5 !important;
137
+ }
138
+
139
+ /* 탭 스타일링 */
140
+ .gr-tabs {
141
+ border-bottom: 2px solid #E0E7FF !important;
142
+ }
143
+
144
+ .gr-tab-button {
145
+ color: #6B7280 !important;
146
+ padding: 0.75rem 1rem !important;
147
+ font-weight: 500 !important;
148
+ }
149
+
150
+ .gr-tab-button.selected {
151
+ color: #4F46E5 !important;
152
+ border-bottom: 2px solid #4F46E5 !important;
153
+ }
154
+
155
+ /* 마크다운 출력 영역 */
156
+ .markdown-output {
157
+ background: white !important;
158
+ border-radius: 8px !important;
159
+ padding: 1rem !important;
160
+ box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.05) !important;
161
+ }
162
+ """
163
+
164
  def read_fn(path):
165
  disk_rw = FileBasedDataReader(os.path.dirname(path))
166
  return disk_rw.read(os.path.basename(path))
 
239
  new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
240
  return md_content, txt_content, archive_zip_path, new_pdf_path
241
 
242
+ def to_pdf(file_path):
243
+ with pymupdf.open(file_path) as f:
244
+ if f.is_pdf:
245
+ return file_path
246
+ else:
247
+ pdf_bytes = f.convert_to_pdf()
248
+ unique_filename = f"{uuid.uuid4()}.pdf"
249
+ tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
250
+ with open(tmp_file_path, 'wb') as tmp_pdf_file:
251
+ tmp_pdf_file.write(pdf_bytes)
252
+ return tmp_file_path
253
+
254
  latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
255
  {"left": '$', "right": '$', "display": False}]
256
 
 
290
  all_lang = ['', 'auto']
291
  all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
292
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  if __name__ == "__main__":
294
+ with gr.Blocks(title="OCR FLEX", css=create_css()) as demo:
295
+ # 타이틀 영역
296
+ with gr.Row(elem_classes="title-area"):
297
+ gr.HTML("""
298
+ <h1>OCR FLEX</h1>
299
+ <p>PDF์™€ ์ด๋ฏธ์ง€์—์„œ ํ…์ŠคํŠธ๋ฅผ ๋น ๋ฅด๊ณ  ์ •ํ™•ํ•˜๊ฒŒ ์ถ”์ถœํ•˜์„ธ์š”</p>
300
+ """)
301
+
302
  with gr.Row():
303
+ # 왼쪽 패널
304
  with gr.Column(variant='panel', scale=5):
305
+ file = gr.File(
306
+ label="PDF 또는 이미지 파일을 업로드하세요",
307
+ file_types=[".pdf", ".png", ".jpeg", ".jpg"],
308
+ elem_classes="file-upload"
309
+ )
310
+
311
+ max_pages = gr.Slider(
312
+ 1, 20, 10,
313
+ step=1,
314
+ label='최대 변환 페이지 수',
315
+ elem_classes="custom-slider"
316
+ )
317
+
318
  with gr.Row():
319
+ layout_mode = gr.Dropdown(
320
+ ["layoutlmv3", "doclayout_yolo"],
321
+ label="레이아웃 모델",
322
+ value="doclayout_yolo",
323
+ elem_classes="custom-dropdown"
324
+ )
325
+ language = gr.Dropdown(
326
+ all_lang,
327
+ label="언어",
328
+ value='auto',
329
+ elem_classes="custom-dropdown"
330
+ )
331
+
332
  with gr.Row():
333
+ formula_enable = gr.Checkbox(
334
+ label="수식 인식 활성화",
335
+ value=True,
336
+ elem_classes="custom-checkbox"
337
+ )
338
+ is_ocr = gr.Checkbox(
339
+ label="OCR 강제 활성화",
340
+ value=False,
341
+ elem_classes="custom-checkbox"
342
+ )
343
+ table_enable = gr.Checkbox(
344
+ label="표 인식 활성화(테스트)",
345
+ value=True,
346
+ elem_classes="custom-checkbox"
347
+ )
348
+
349
  with gr.Row():
350
+ change_bu = gr.Button(
351
+ "변환",
352
+ elem_classes="primary-button"
353
+ )
354
+ clear_bu = gr.ClearButton(
355
+ value="초기화",
356
+ elem_classes="secondary-button"
357
+ )
358
+
359
+ pdf_show = PDF(
360
+ label='PDF 미리보기',
361
+ interactive=False,
362
+ visible=True,
363
+ height=800,
364
+ elem_classes="pdf-preview"
365
+ )
366
+
367
+ with gr.Accordion("예제:", open=False):
368
  example_root = os.path.join(os.path.dirname(__file__), "examples")
369
  gr.Examples(
370
  examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
371
  _.endswith("pdf")],
372
+ inputs=file,
373
+ elem_classes="examples-section"
374
  )
375
 
376
+ # 오른쪽 패널
377
  with gr.Column(variant='panel', scale=5):
378
+ output_file = gr.File(
379
+ label="변환 결과",
380
+ interactive=False,
381
+ elem_classes="output-file"
382
+ )
383
+
384
+ with gr.Tabs() as tabs:
385
  with gr.Tab("마크다운 렌더링"):
386
+ md = gr.Markdown(
387
+ label="마크다운 렌더링",
388
+ height=1100,
389
+ show_copy_button=True,
390
+ latex_delimiters=latex_delimiters,
391
+ line_breaks=True,
392
+ elem_classes="markdown-output"
393
+ )
394
+
395
  with gr.Tab("마크다운 텍스트"):
396
+ md_text = gr.TextArea(
397
+ lines=45,
398
+ show_copy_button=True,
399
+ elem_classes="markdown-text"
400
+ )
401
+
402
+ # 이벤트 핸들러
403
+ file.change(
404
+ fn=to_pdf,
405
+ inputs=file,
406
+ outputs=pdf_show
407
+ )
408
+
409
+ change_bu.click(
410
+ fn=to_markdown,
411
+ inputs=[
412
+ file,
413
+ max_pages,
414
+ is_ocr,
415
+ layout_mode,
416
+ formula_enable,
417
+ table_enable,
418
+ language
419
+ ],
420
+ outputs=[
421
+ md,
422
+ md_text,
423
+ output_file,
424
+ pdf_show
425
+ ],
426
+ api_name=False
427
+ )
428
+
429
  clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])
430
 
431
+ # 앱 실행
432
+ demo.launch(ssr_mode=True)