taprosoft committed on
Commit
8d8e906
·
unverified ·
1 Parent(s): 1784c14

fix: update mineru setup

Browse files
Files changed (2) hide show
  1. requirements.txt +4 -2
  2. utils.py +61 -7
requirements.txt CHANGED
@@ -5,10 +5,10 @@ rapid-table>=1.0.3,<2.0.0
5
  rapidocr-paddle
6
  rapidocr-onnxruntime
7
  gradio-pdf>=0.0.21
8
- git+https://github.com/opendatalab/MinerU.git@dev
9
  git+https://github.com/VikParuchuri/marker
10
  docling
11
- PyMuPDF>=1.24.9,<1.24.14
12
  pymupdf4llm
13
  unstructured[pdf]
14
  ultralytics>=8.3.48
@@ -22,3 +22,5 @@ unimernet==0.2.3
22
  transformers<5.0.0,>=4.45.2
23
  pypdf
24
  opencv-contrib-python
 
 
 
5
  rapidocr-paddle
6
  rapidocr-onnxruntime
7
  gradio-pdf>=0.0.21
8
+ git+https://github.com/opendatalab/MinerU.git@release-1.3.4
9
  git+https://github.com/VikParuchuri/marker
10
  docling
11
+ PyMuPDF
12
  pymupdf4llm
13
  unstructured[pdf]
14
  ultralytics>=8.3.48
 
22
  transformers<5.0.0,>=4.45.2
23
  pypdf
24
  opencv-contrib-python
25
+ numpy==1.26.4
26
+ pdfminer.six
utils.py CHANGED
@@ -62,12 +62,71 @@ def fix_problematic_imports():
62
  sys.modules[
63
  "magic_pdf.model.sub_modules.mfr.unimernet.Unimernet"
64
  ] = fake_unimernet_module
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
 
67
  def prepare_env_mineru():
68
  import json
69
  import os
70
-
71
  import nltk
72
 
73
  # download nltk data
@@ -82,12 +141,7 @@ def prepare_env_mineru():
82
  return
83
 
84
  # download models
85
- os.system(
86
- "wget https://raw.githubusercontent.com/opendatalab/MinerU/"
87
- "refs/heads/release-1.3.12/scripts/download_models_hf.py"
88
- " -O download_models_hf.py"
89
- )
90
- os.system("python3 download_models_hf.py")
91
 
92
  with open(config_path, "r") as file:
93
  data = json.load(file)
 
62
  sys.modules[
63
  "magic_pdf.model.sub_modules.mfr.unimernet.Unimernet"
64
  ] = fake_unimernet_module
65
+
66
+
67
def setup_mineru_config():
    """Download the MinerU model weights and write ``~/magic-pdf.json``.

    Steps performed:

    1. Fetch the required sub-folders of ``opendatalab/PDF-Extract-Kit-1.0``
       and the ``hantian/layoutreader`` weights via ``snapshot_download``.
    2. Load the existing ``~/magic-pdf.json`` when it is readable and its
       ``config_version`` is at least 1.2.0; otherwise download the template
       config from the MinerU repository.
    3. Point ``models-dir`` and ``layoutreader-model-dir`` at the downloaded
       snapshots and write the config back to disk.

    Raises:
        requests.HTTPError: if the template config cannot be downloaded.
    """
    import json
    import os
    import re

    import requests
    from huggingface_hub import snapshot_download

    min_config_version = (1, 2, 0)
    request_timeout = 30  # seconds; avoids hanging forever on a stalled connection

    def version_tuple(version):
        """Turn a dotted version string into a tuple of ints for comparison."""
        parts = []
        for piece in str(version).split('.'):
            leading_digits = re.match(r'\d+', piece)
            parts.append(int(leading_digits.group()) if leading_digits else 0)
        return tuple(parts)

    def download_json(url):
        """Fetch ``url`` and return the decoded JSON payload."""
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
        return response.json()

    def load_local_json(local_filename):
        """Return the parsed local config, or None if missing or unusable."""
        try:
            with open(local_filename, encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Missing, unreadable, or corrupt file: fall back to the template.
            return None
        return data if isinstance(data, dict) else None

    def download_and_modify_json(url, local_filename, modifications):
        """Refresh the config when needed, apply overrides, and save it."""
        data = load_local_json(local_filename)
        # Compare versions numerically: as plain strings, '1.10.0' sorts
        # before '1.2.0' and would trigger a needless re-download.
        if data is None or (
            version_tuple(data.get('config_version', '0.0.0'))
            < min_config_version
        ):
            data = download_json(url)

        data.update(modifications)

        with open(local_filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

    mineru_patterns = [
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_hf_small_2503/*",
        "models/OCR/paddleocr_torch/*",
    ]
    model_dir = snapshot_download(
        'opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns
    )

    layoutreader_pattern = [
        "*.json",
        "*.safetensors",
    ]
    layoutreader_model_dir = snapshot_download(
        'hantian/layoutreader', allow_patterns=layoutreader_pattern
    )

    # The snapshot root contains a top-level ``models`` folder (see the
    # patterns above); MinerU is pointed at that folder.
    model_dir = os.path.join(model_dir, 'models')
    print(f'model_dir is: {model_dir}')
    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

    # NOTE(review): the template is taken from the release-1.3.12 branch;
    # confirm it matches the MinerU version actually installed.
    json_url = 'https://raw.githubusercontent.com/opendatalab/MinerU/refs/heads/release-1.3.12/magic-pdf.template.json'
    config_file_name = 'magic-pdf.json'
    home_dir = os.path.expanduser('~')
    config_file = os.path.join(home_dir, config_file_name)

    json_mods = {
        'models-dir': model_dir,
        'layoutreader-model-dir': layoutreader_model_dir,
    }

    download_and_modify_json(json_url, config_file, json_mods)
    print(f'The configuration file has been configured successfully, the path is: {config_file}')
125
 
126
 
127
  def prepare_env_mineru():
128
  import json
129
  import os
 
130
  import nltk
131
 
132
  # download nltk data
 
141
  return
142
 
143
  # download models
144
+ setup_mineru_config()
 
 
 
 
 
145
 
146
  with open(config_path, "r") as file:
147
  data = json.load(file)