myhloli committed on
Commit
970470b
·
verified ·
1 Parent(s): a3c8298

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -98
app.py CHANGED
@@ -1,28 +1,22 @@
1
  # Copyright (c) Opendatalab. All rights reserved.
2
 
3
  import base64
4
- import json
5
  import os
 
 
6
  import time
7
  import zipfile
8
  from pathlib import Path
9
- import re
10
- import uuid
11
- import pymupdf
12
 
13
- # os.system('pip install -U magic-pdf==0.10.5')
14
  os.system('pip uninstall -y magic-pdf')
 
15
  os.system('pip install git+https://github.com/opendatalab/MinerU.git@dev')
16
- # os.system('pip install git+https://github.com/myhloli/Magic-PDF.git@dev')
17
 
18
- os.system('wget https://github.com/opendatalab/MinerU/raw/dev/scripts/download_models_hf.py -O download_models_hf.py')
19
- os.system('python download_models_hf.py')
20
 
21
- with open('/home/user/magic-pdf.json', 'r') as file:
22
  config = json.load(file)
23
 
24
- config['device-mode'] = "cuda"
25
-
26
  delimiters = {
27
  'display': {'left': '\\[', 'right': '\\]'},
28
  'inline': {'left': '\\(', 'right': '\\)'}
@@ -34,49 +28,39 @@ if os.getenv('apikey'):
34
  config['llm-aided-config']['title_aided']['api_key'] = os.getenv('apikey')
35
  config['llm-aided-config']['title_aided']['enable'] = True
36
 
37
- with open('/home/user/magic-pdf.json', 'w') as file:
38
  json.dump(config, file, indent=4)
39
 
40
- # os.system('cp -r paddleocr /home/user/.paddleocr')
41
  from gradio_pdf import PDF
42
-
43
  import gradio as gr
44
  from loguru import logger
45
 
46
- from magic_pdf.data.data_reader_writer import FileBasedDataReader
47
- from magic_pdf.libs.hash_utils import compute_sha256
48
- from magic_pdf.tools.common import do_parse, prepare_env
49
 
 
50
 
51
def read_fn(path):
    """Read the file at *path* through a ``FileBasedDataReader``.

    The directory part of *path* is used as the reader's root and the file
    name is then read relative to that root.

    Args:
        path: Path of the file to read.

    Returns:
        Whatever ``FileBasedDataReader.read`` returns for the file name.
    """
    parent_dir, base_name = os.path.split(path)
    reader = FileBasedDataReader(parent_dir)
    return reader.read(base_name)
54
 
55
-
56
- def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_enable, table_enable, language):
57
  os.makedirs(output_dir, exist_ok=True)
58
 
59
  try:
60
- file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
61
  pdf_data = read_fn(doc_path)
62
  if is_ocr:
63
- parse_method = "ocr"
64
  else:
65
- parse_method = "auto"
66
  local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
67
  do_parse(
68
- output_dir,
69
- file_name,
70
- pdf_data,
71
- [],
72
- parse_method,
73
- False,
74
  end_page_id=end_page_id,
75
- layout_model=layout_mode,
76
- formula_enable=formula_enable,
77
- table_enable=table_enable,
78
- lang=language,
79
- f_dump_orig_pdf=False,
80
  )
81
  return local_md_dir, file_name
82
  except Exception as e:
@@ -128,25 +112,24 @@ def replace_image_with_base64(markdown_text, image_dir_path):
128
  return re.sub(pattern, replace, markdown_text)
129
 
130
 
131
def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language):
    """Convert an uploaded document and collect everything the UI displays.

    Args:
        file_path: Path of the uploaded file; it is first passed through
            ``to_pdf``.
        end_pages: Requested number of pages; values above 20 are capped at 20.
        is_ocr: Forwarded to ``parse_pdf``.
        layout_mode: Forwarded to ``parse_pdf``.
        formula_enable: Forwarded to ``parse_pdf``.
        table_enable: Forwarded to ``parse_pdf``.
        language: Forwarded to ``parse_pdf``.

    Returns:
        A 4-tuple ``(md_content, txt_content, archive_zip_path, new_pdf_path)``:
        the markdown after ``replace_image_with_base64``, the markdown exactly
        as read from disk, the path of the zip archive, and the path of the
        ``*_layout.pdf`` file inside the markdown directory.
    """
    output_root = './output'
    pdf_path = to_pdf(file_path)
    page_limit = min(end_pages, 20)

    # Run the parser; it hands back the markdown directory and the base name
    # used for the generated files. ``page_limit - 1`` is the last page id.
    local_md_dir, file_name = parse_pdf(
        pdf_path, output_root, page_limit - 1, is_ocr,
        layout_mode, formula_enable, table_enable, language,
    )

    # The archive name is derived from a hash of the markdown directory path.
    archive_zip_path = os.path.join(output_root, compute_sha256(local_md_dir) + ".zip")
    if compress_directory_to_zip(local_md_dir, archive_zip_path) == 0:
        logger.info("压缩成功")
    else:
        # A failed archive is only logged; the conversion result is still returned.
        logger.error("压缩失败")

    md_path = os.path.join(local_md_dir, f"{file_name}.md")
    with open(md_path, 'r', encoding='utf-8') as md_file:
        txt_content = md_file.read()
    md_content = replace_image_with_base64(txt_content, local_md_dir)

    # Path of the converted PDF that is shown in the preview.
    new_pdf_path = os.path.join(local_md_dir, f"{file_name}_layout.pdf")

    return md_content, txt_content, archive_zip_path, new_pdf_path
152
 
@@ -155,24 +138,6 @@ latex_delimiters = [{"left": "\\[", "right": "\\]", "display": True},
155
  {"left": "\\(", "right": "\\)", "display": False}]
156
 
157
 
158
def init_model():
    """Load both model variants up front through ``ModelSingleton``.

    ``get_model`` is called with ``(False, False)`` and then ``(True, False)``;
    presumably these are the text and OCR variants, as the log labels
    suggest — confirm against ``ModelSingleton``.

    Returns:
        ``0`` when both calls succeed, ``-1`` when any exception is raised
        (the exception is logged, not re-raised).
    """
    # Imported lazily, outside the try block: an import failure propagates.
    from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton

    try:
        manager = ModelSingleton()
        for first_flag, label in ((False, "txt_model"), (True, "ocr_model")):
            manager.get_model(first_flag, False)
            logger.info(f"{label} init final")
    except Exception as exc:
        logger.exception(exc)
        return -1
    return 0


# Load the models once at import time and record the outcome.
model_init = init_model()
logger.info(f"model_init: {model_init}")
174
-
175
-
176
  with open("header.html", "r") as file:
177
  header = file.read()
178
 
@@ -201,67 +166,72 @@ all_lang = []
201
  all_lang.extend([*other_lang, *add_lang])
202
 
203
 
 
 
 
 
 
 
204
def to_pdf(file_path):
    """Return a path to a PDF version of *file_path*.

    The file is opened with ``pymupdf``. If it is already a PDF the original
    path is returned unchanged; otherwise the document is converted and the
    bytes are written next to the source under a random ``<uuid4>.pdf`` name.

    Args:
        file_path: Path of the uploaded document or image.

    Returns:
        ``file_path`` itself for PDFs, else the path of the newly written PDF.
    """
    with pymupdf.open(file_path) as document:
        if document.is_pdf:
            # Nothing to convert.
            return file_path

        converted = document.convert_to_pdf()

        # A random name avoids clashing with other files in the same directory.
        target_name = f"{uuid.uuid4()}.pdf"
        tmp_file_path = os.path.join(os.path.dirname(file_path), target_name)

        with open(tmp_file_path, 'wb') as pdf_out:
            pdf_out.write(converted)

        return tmp_file_path
 
 
222
 
 
 
223
 
224
if __name__ == "__main__":
    # NOTE(review): the container nesting below was rebuilt from a flattened
    # diff view — confirm the hierarchy against the repository copy.
    with gr.Blocks() as demo:
        gr.HTML(header)
        with gr.Row():
            # Left panel: upload, options, action buttons, preview, examples.
            with gr.Column(variant='panel', scale=5):
                file = gr.File(
                    label="Please upload a PDF or image",
                    file_types=[".pdf", ".png", ".jpeg", ".jpg"],
                )
                max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
                with gr.Row():
                    layout_choices = [
                        # "layoutlmv3",
                        "doclayout_yolo",
                    ]
                    layout_mode = gr.Dropdown(layout_choices, label="Layout model", value="doclayout_yolo")
                    language = gr.Dropdown(all_lang, label="Language", value='ch')
                with gr.Row():
                    formula_enable = gr.Checkbox(label="Enable formula recognition", value=True)
                    is_ocr = gr.Checkbox(label="Force enable OCR", value=False)
                    table_enable = gr.Checkbox(label="Enable table recognition(test)", value=True)
                with gr.Row():
                    change_bu = gr.Button("Convert")
                    clear_bu = gr.ClearButton(value="Clear")
                pdf_show = PDF(label='PDF preview', interactive=False, visible=True, height=800)
                with gr.Accordion("Examples:"):
                    example_root = os.path.join(os.path.dirname(__file__), "examples")
                    example_pdfs = [
                        os.path.join(example_root, entry)
                        for entry in os.listdir(example_root)
                        if entry.endswith("pdf")
                    ]
                    gr.Examples(examples=example_pdfs, inputs=file)

            # Right panel: downloadable archive plus rendered and raw markdown.
            with gr.Column(variant='panel', scale=5):
                output_file = gr.File(label="convert result", interactive=False)
                with gr.Tabs():
                    with gr.Tab("Markdown rendering"):
                        md = gr.Markdown(
                            label="Markdown rendering",
                            height=1100,
                            show_copy_button=True,
                            latex_delimiters=latex_delimiters,
                            line_breaks=True,
                        )
                    with gr.Tab("Markdown text"):
                        md_text = gr.TextArea(lines=45, show_copy_button=True)

        # Selecting a file refreshes the preview; "Convert" runs the pipeline.
        file.change(fn=to_pdf, inputs=file, outputs=pdf_show)
        convert_inputs = [file, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language]
        convert_outputs = [md, md_text, output_file, pdf_show]
        change_bu.click(fn=to_markdown, inputs=convert_inputs, outputs=convert_outputs, api_name=False)
        clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])

    demo.launch(ssr_mode=True)
 
1
  # Copyright (c) Opendatalab. All rights reserved.
2
 
3
  import base64
 
4
  import os
5
+ import json
6
+ import re
7
  import time
8
  import zipfile
9
  from pathlib import Path
 
 
 
10
 
 
11
  os.system('pip uninstall -y magic-pdf')
12
+ os.system('pip uninstall -y mineru')
13
  os.system('pip install git+https://github.com/opendatalab/MinerU.git@dev')
 
14
 
15
+ os.system('mineru-models-download -s huggingface -m pipeline')
 
16
 
17
+ with open('/home/user/mineru.json', 'r') as file:
18
  config = json.load(file)
19
 
 
 
20
  delimiters = {
21
  'display': {'left': '\\[', 'right': '\\]'},
22
  'inline': {'left': '\\(', 'right': '\\)'}
 
28
  config['llm-aided-config']['title_aided']['api_key'] = os.getenv('apikey')
29
  config['llm-aided-config']['title_aided']['enable'] = True
30
 
31
+ with open('/home/user/mineru.json', 'w') as file:
32
  json.dump(config, file, indent=4)
33
 
 
34
  from gradio_pdf import PDF
 
35
  import gradio as gr
36
  from loguru import logger
37
 
38
+ from mineru.cli.common import prepare_env, do_parse, read_fn
39
+ from mineru.utils.hash_utils import str_sha256
 
40
 
41
+ os.environ['MINERU_MODEL_SOURCE'] = 'local'
42
 
 
 
 
43
 
44
+ def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, table_enable, language):
 
45
  os.makedirs(output_dir, exist_ok=True)
46
 
47
  try:
48
+ file_name = f'{str(Path(doc_path).stem)}_{time.strftime("%y%m%d_%H%M%S")}'
49
  pdf_data = read_fn(doc_path)
50
  if is_ocr:
51
+ parse_method = 'ocr'
52
  else:
53
+ parse_method = 'auto'
54
  local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
55
  do_parse(
56
+ output_dir=output_dir,
57
+ pdf_file_names=[file_name],
58
+ pdf_bytes_list=[pdf_data],
59
+ p_lang_list=[language],
60
+ parse_method=parse_method,
 
61
  end_page_id=end_page_id,
62
+ p_formula_enable=formula_enable,
63
+ p_table_enable=table_enable,
 
 
 
64
  )
65
  return local_md_dir, file_name
66
  except Exception as e:
 
112
  return re.sub(pattern, replace, markdown_text)
113
 
114
 
115
def to_markdown(file_path, end_pages, is_ocr, formula_enable, table_enable, language):
    """Convert an uploaded document and collect everything the UI displays.

    Args:
        file_path: Path of the uploaded file; it is first passed through
            ``to_pdf``.
        end_pages: Requested number of pages; values above 20 are capped at 20.
        is_ocr: Forwarded to ``parse_pdf``.
        formula_enable: Forwarded to ``parse_pdf``.
        table_enable: Forwarded to ``parse_pdf``.
        language: Forwarded to ``parse_pdf``.

    Returns:
        A 4-tuple ``(md_content, txt_content, archive_zip_path, new_pdf_path)``:
        the markdown after ``replace_image_with_base64``, the markdown exactly
        as read from disk, the path of the zip archive, and the path of the
        ``*_layout.pdf`` file inside the markdown directory.
    """
    output_root = './output'
    pdf_path = to_pdf(file_path)
    page_limit = min(end_pages, 20)

    # Run the parser; it hands back the markdown directory and the base name
    # used for the generated files. ``page_limit - 1`` is the last page id.
    local_md_dir, file_name = parse_pdf(
        pdf_path, output_root, page_limit - 1, is_ocr, formula_enable, table_enable, language,
    )

    # The archive name is derived from a hash of the markdown directory path.
    archive_zip_path = os.path.join(output_root, str_sha256(local_md_dir) + '.zip')
    if compress_directory_to_zip(local_md_dir, archive_zip_path) == 0:
        logger.info('压缩成功')
    else:
        # A failed archive is only logged; the conversion result is still returned.
        logger.error('压缩失败')

    md_path = os.path.join(local_md_dir, f'{file_name}.md')
    with open(md_path, 'r', encoding='utf-8') as md_file:
        txt_content = md_file.read()
    md_content = replace_image_with_base64(txt_content, local_md_dir)

    # Path of the converted PDF that is shown in the preview.
    new_pdf_path = os.path.join(local_md_dir, f'{file_name}_layout.pdf')

    return md_content, txt_content, archive_zip_path, new_pdf_path
135
 
 
138
  {"left": "\\(", "right": "\\)", "display": False}]
139
 
140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  with open("header.html", "r") as file:
142
  header = file.read()
143
 
 
166
  all_lang.extend([*other_lang, *add_lang])
167
 
168
 
169
def safe_stem(file_path):
    """Return the stem of *file_path* with unsafe characters replaced by ``_``.

    Word characters (``\\w``, which is Unicode-aware for ``str`` patterns, so
    non-ASCII letters and digits are kept) and dots are preserved; every other
    character in the stem becomes an underscore.

    Args:
        file_path: Path or file name whose stem should be sanitised.

    Returns:
        The sanitised stem, without directory or final suffix.
    """
    unsafe_chars = re.compile(r'[^\w.]')
    return unsafe_chars.sub('_', Path(file_path).stem)
173
+
174
+
175
def to_pdf(file_path):
    """Write a PDF copy of *file_path* next to it and return the new path.

    The bytes come from ``read_fn`` and are written into the source file's
    directory as ``<safe_stem>.pdf``.

    Args:
        file_path: Path of the uploaded file, or ``None`` when nothing is
            selected.

    Returns:
        ``None`` when *file_path* is ``None``, otherwise the path of the
        written PDF.
    """
    if file_path is None:
        return None

    content = read_fn(file_path)

    # The output name is derived from the sanitised stem, so it is
    # deterministic per input file rather than random.
    pdf_name = f'{safe_stem(file_path)}.pdf'
    source_dir = os.path.dirname(file_path)
    tmp_file_path = os.path.join(source_dir, pdf_name)

    with open(tmp_file_path, 'wb') as pdf_out:
        pdf_out.write(content)

    return tmp_file_path
193
+
194
 
195
if __name__ == '__main__':
    # NOTE(review): the container nesting below was rebuilt from a flattened
    # diff view — confirm the hierarchy against the repository copy.
    with gr.Blocks() as demo:
        gr.HTML(header)
        with gr.Row():
            # Left panel: upload, options, action buttons, preview, examples.
            with gr.Column(variant='panel', scale=5):
                with gr.Row():
                    file = gr.File(
                        label='Please upload a PDF or image',
                        file_types=['.pdf', '.png', '.jpeg', '.jpg'],
                    )
                with gr.Row(equal_height=True):
                    with gr.Column(scale=4):
                        max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
                    with gr.Column(scale=1):
                        language = gr.Dropdown(all_lang, label='Language', value='ch')
                with gr.Row():
                    is_ocr = gr.Checkbox(label='Force enable OCR', value=False)
                    formula_enable = gr.Checkbox(label='Enable formula recognition', value=True)
                    table_enable = gr.Checkbox(label='Enable table recognition(test)', value=True)
                with gr.Row():
                    change_bu = gr.Button('Convert')
                    clear_bu = gr.ClearButton(value='Clear')
                pdf_show = PDF(label='PDF preview', interactive=False, visible=True, height=800)
                with gr.Accordion('Examples:'):
                    example_root = os.path.join(os.path.dirname(__file__), 'examples')
                    example_pdfs = [
                        os.path.join(example_root, entry)
                        for entry in os.listdir(example_root)
                        if entry.endswith('pdf')
                    ]
                    gr.Examples(examples=example_pdfs, inputs=file)

            # Right panel: downloadable archive plus rendered and raw markdown.
            with gr.Column(variant='panel', scale=5):
                output_file = gr.File(label='convert result', interactive=False)
                with gr.Tabs():
                    with gr.Tab('Markdown rendering'):
                        md = gr.Markdown(
                            label='Markdown rendering',
                            height=1100,
                            show_copy_button=True,
                            latex_delimiters=latex_delimiters,
                            line_breaks=True,
                        )
                    with gr.Tab('Markdown text'):
                        md_text = gr.TextArea(lines=45, show_copy_button=True)

        # Selecting a file refreshes the preview; "Convert" runs the pipeline.
        file.change(fn=to_pdf, inputs=file, outputs=pdf_show)
        convert_inputs = [file, max_pages, is_ocr, formula_enable, table_enable, language]
        convert_outputs = [md, md_text, output_file, pdf_show]
        change_bu.click(fn=to_markdown, inputs=convert_inputs, outputs=convert_outputs)
        clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])

    demo.launch(ssr_mode=True)