myhloli committed on
Commit
0a681f9
·
verified ·
1 Parent(s): 8a270b0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -5
app.py CHANGED
@@ -6,6 +6,8 @@ import time
6
  import zipfile
7
  from pathlib import Path
8
  import re
 
 
9
 
10
  # os.system('pip install -U magic-pdf==0.8.1')
11
  os.system('pip install git+https://github.com/opendatalab/MinerU.git@dev')
@@ -174,12 +176,32 @@ all_lang = [""]
174
  all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
175
 
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  if __name__ == "__main__":
178
  with gr.Blocks() as demo:
179
  gr.HTML(header)
180
  with gr.Row():
181
  with gr.Column(variant='panel', scale=5):
182
- pdf_show = gr.Markdown()
183
  max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
184
  with gr.Row():
185
  layout_mode = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], label="Layout model", value="layoutlmv3")
@@ -190,14 +212,14 @@ if __name__ == "__main__":
190
  table_enable = gr.Checkbox(label="Enable table recognition(test)", value=False)
191
  with gr.Row():
192
  change_bu = gr.Button("Convert")
193
- clear_bu = gr.ClearButton([pdf_show], value="Clear")
194
- pdf_show = PDF(label="Please upload pdf", interactive=True, height=800)
195
  with gr.Accordion("Examples:"):
196
  example_root = os.path.join(os.path.dirname(__file__), "examples")
197
  gr.Examples(
198
  examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
199
  _.endswith("pdf")],
200
- inputs=pdf_show,
201
  )
202
 
203
  with gr.Column(variant='panel', scale=5):
@@ -208,8 +230,9 @@ if __name__ == "__main__":
208
  latex_delimiters=latex_delimiters, line_breaks=True)
209
  with gr.Tab("Markdown text"):
210
  md_text = gr.TextArea(lines=45, show_copy_button=True)
 
211
  change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
212
  outputs=[md, md_text, output_file, pdf_show])
213
- clear_bu.add([md, pdf_show, md_text, output_file, is_ocr])
214
 
215
  demo.launch()
 
6
  import zipfile
7
  from pathlib import Path
8
  import re
9
+ import uuid
10
+ import pymupdf
11
 
12
  # os.system('pip install -U magic-pdf==0.8.1')
13
  os.system('pip install git+https://github.com/opendatalab/MinerU.git@dev')
 
176
  all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
177
 
178
 
179
def to_pdf(file_path):
    """Return a path to a PDF version of ``file_path``.

    The file is opened with ``pymupdf``. When the opened document reports
    itself as a PDF, the input path is returned unchanged. Otherwise the
    document is converted to PDF bytes, written to a new ``<uuid4>.pdf``
    file in the same directory as the input, and that new path is returned.
    """
    with pymupdf.open(file_path) as document:
        # Already a PDF: hand the original path straight back.
        if document.is_pdf:
            return file_path

        converted = document.convert_to_pdf()

        # Store the converted bytes beside the source file, using a freshly
        # generated UUID as the file name.
        target_dir = os.path.dirname(file_path)
        target_path = os.path.join(target_dir, f"{uuid.uuid4()}.pdf")

        with open(target_path, 'wb') as out:
            out.write(converted)

        return target_path
197
+
198
+
199
  if __name__ == "__main__":
200
  with gr.Blocks() as demo:
201
  gr.HTML(header)
202
  with gr.Row():
203
  with gr.Column(variant='panel', scale=5):
204
+ file = gr.File(label="Please upload a PDF or image", file_types=[".pdf", ".png", ".jpeg", "jpg"])
205
  max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
206
  with gr.Row():
207
  layout_mode = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], label="Layout model", value="layoutlmv3")
 
212
  table_enable = gr.Checkbox(label="Enable table recognition(test)", value=False)
213
  with gr.Row():
214
  change_bu = gr.Button("Convert")
215
+ clear_bu = gr.ClearButton(value="Clear")
216
+ pdf_show = PDF(label="PDF preview", interactive=True, height=800)
217
  with gr.Accordion("Examples:"):
218
  example_root = os.path.join(os.path.dirname(__file__), "examples")
219
  gr.Examples(
220
  examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
221
  _.endswith("pdf")],
222
+ inputs=pdf_show
223
  )
224
 
225
  with gr.Column(variant='panel', scale=5):
 
230
  latex_delimiters=latex_delimiters, line_breaks=True)
231
  with gr.Tab("Markdown text"):
232
  md_text = gr.TextArea(lines=45, show_copy_button=True)
233
+ file.upload(fn=to_pdf, inputs=file, outputs=pdf_show)
234
  change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
235
  outputs=[md, md_text, output_file, pdf_show])
236
+ clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr, table_enable, language])
237
 
238
  demo.launch()