dmitrynovikov2121 committed on
Commit
38b4e43
·
verified ·
1 Parent(s): 6331b74

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +170 -2
app.py CHANGED
@@ -47,8 +47,176 @@ def read_fn(path):
47
  disk_rw = FileBasedDataReader(os.path.dirname(path))
48
  return disk_rw.read(os.path.basename(path))
49
 
50
- # Your existing functions here (parse_pdf, compress_directory_to_zip, image_to_base64, etc.)
51
- # ... (keep all the utility functions from your original code)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  @app.post("/process_document")
54
  async def process_document(
 
47
  disk_rw = FileBasedDataReader(os.path.dirname(path))
48
  return disk_rw.read(os.path.basename(path))
49
 
50
def read_fn(path):
    """Read a file through a ``FileBasedDataReader``.

    The reader is constructed with the directory part of ``path`` and is then
    asked to read the file by its base name.

    :param path: Path of the file to read.
    :return: Whatever ``FileBasedDataReader.read`` returns for that file.
    """
    parent_dir, base_name = os.path.split(path)
    return FileBasedDataReader(parent_dir).read(base_name)
53
+
54
+
55
def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_enable, table_enable, language):
    """Run ``do_parse`` on a document and report where its output was written.

    :param doc_path: Path of the document to parse.
    :param output_dir: Directory for the results; created if it is missing.
    :param end_page_id: Forwarded to ``do_parse`` as ``end_page_id``.
    :param is_ocr: When truthy the parse method is ``"ocr"``, otherwise ``"auto"``.
    :param layout_mode: Forwarded to ``do_parse`` as ``layout_model``.
    :param formula_enable: Forwarded to ``do_parse`` as ``formula_enable``.
    :param table_enable: Forwarded to ``do_parse`` as ``table_enable``.
    :param language: Forwarded to ``do_parse`` as ``lang``.
    :return: ``(local_md_dir, file_name)`` on success. If anything inside the
        ``try`` block raises, the exception is logged and ``None`` is returned,
        so callers must check for ``None`` before unpacking.
    """
    os.makedirs(output_dir, exist_ok=True)

    try:
        # The timestamp suffix keeps the outputs of repeated runs on the same
        # document name apart.
        run_name = f"{Path(doc_path).stem}_{time.time()}"
        raw_bytes = read_fn(doc_path)
        method = "ocr" if is_ocr else "auto"
        # Only the markdown directory is used here; the image directory is ignored.
        _image_dir, md_dir = prepare_env(output_dir, run_name, method)
        # NOTE(review): the positional [] and False are presumably the model
        # list and a debug flag -- confirm against do_parse's signature.
        do_parse(
            output_dir,
            run_name,
            raw_bytes,
            [],
            method,
            False,
            end_page_id=end_page_id,
            layout_model=layout_mode,
            formula_enable=formula_enable,
            table_enable=table_enable,
            lang=language,
            f_dump_orig_pdf=False,
        )
    except Exception as e:
        logger.exception(e)
        return None
    return md_dir, run_name
83
+
84
+
85
def compress_directory_to_zip(directory_path, output_zip_path):
    """Compress every file under a directory into a ZIP archive.

    Entries are stored with paths relative to ``directory_path``. If the
    archive itself is located inside ``directory_path`` it is skipped, so the
    half-written ZIP is never added to itself.

    :param directory_path: Path of the directory to compress.
    :param output_zip_path: Path of the ZIP file to write.
    :return: ``0`` on success; ``-1`` if ``directory_path`` is not an existing
        directory or if any error occurs while writing (the error is logged).
    """
    # os.walk silently yields nothing for a missing directory, which would
    # otherwise produce an empty archive reported as a success.
    if not os.path.isdir(directory_path):
        logger.error(f"Directory to compress does not exist: {directory_path}")
        return -1

    try:
        archive_real_path = os.path.realpath(output_zip_path)
        with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            # Walk all files in the directory and its sub-directories.
            for root, _dirs, files in os.walk(directory_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    # Do not add the output archive to itself.
                    if os.path.realpath(file_path) == archive_real_path:
                        continue
                    # Store the entry relative to the compressed directory.
                    arcname = os.path.relpath(file_path, directory_path)
                    zipf.write(file_path, arcname)
        return 0
    except Exception as e:
        logger.exception(e)
        return -1
108
+
109
+
110
def image_to_base64(image_path):
    """Return the content of the file at ``image_path`` as a Base64 string.

    :param image_path: Path of the file to encode.
    :return: The Base64 encoding of the file's bytes, decoded as UTF-8 text.
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


def replace_image_with_base64(markdown_text, image_dir_path):
    """Inline the local images referenced by a Markdown text as data URIs.

    Every ``![alt](target)`` tag whose target is an existing file under
    ``image_dir_path`` is rewritten to ``![target](data:<mime>;base64,...)``.
    Tags that point to ``http://``, ``https://`` or ``data:`` targets, or to a
    file that does not exist, are left unchanged instead of raising.

    :param markdown_text: Markdown source to process.
    :param image_dir_path: Directory the image targets are relative to.
    :return: The Markdown text with local images embedded.
    """
    # Local import so this block does not rely on a file-level import.
    import mimetypes

    # Matches Markdown image tags and captures the link target.
    pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'

    def replace(match):
        relative_path = match.group(1)
        # Remote or already-inlined images cannot be read from disk.
        if relative_path.startswith(("http://", "https://", "data:")):
            return match.group(0)
        full_path = os.path.join(image_dir_path, relative_path)
        if not os.path.isfile(full_path):
            return match.group(0)
        # Derive the MIME type from the extension; fall back to the previously
        # hard-coded JPEG type when it is unknown or not an image type.
        mime_type = mimetypes.guess_type(full_path)[0]
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"
        base64_image = image_to_base64(full_path)
        return f"![{relative_path}](data:{mime_type};base64,{base64_image})"

    return re.sub(pattern, replace, markdown_text)
128
+
129
+
130
def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language):
    """Convert a document to Markdown and package the parse output.

    The input is first passed through ``to_pdf``, then parsed with
    ``parse_pdf`` into ``./output``. The parse output directory is zipped and
    the generated Markdown is read back, once as-is and once with its images
    inlined as Base64 data URIs.

    :param file_path: Path of the input document.
    :param end_pages: Number of pages to process; capped at 20.
    :param is_ocr: Forwarded to ``parse_pdf``.
    :param layout_mode: Forwarded to ``parse_pdf``.
    :param formula_enable: Forwarded to ``parse_pdf``.
    :param table_enable: Forwarded to ``parse_pdf``.
    :param language: Forwarded to ``parse_pdf``.
    :return: ``(md_content, txt_content, archive_zip_path, new_pdf_path)``:
        Markdown with inlined images, the raw Markdown text, the path of the
        ZIP archive and the path built for the ``*_layout.pdf`` file.
    :raises RuntimeError: If ``parse_pdf`` fails (it logs the error and returns ``None``).
    """
    file_path = to_pdf(file_path)
    end_pages = min(end_pages, 20)
    # Obtain the directory holding the generated Markdown and the run's file name.
    # end_pages - 1 is passed as end_page_id -- presumably a zero-based index
    # of the last page; confirm against do_parse.
    parse_result = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
                             layout_mode, formula_enable, table_enable, language)
    if parse_result is None:
        # parse_pdf swallows its exception; fail with a clear message instead
        # of an opaque tuple-unpacking TypeError.
        raise RuntimeError(f"Failed to parse document: {file_path}")
    local_md_dir, file_name = parse_result

    archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
    zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
    if zip_archive_success == 0:
        logger.info("压缩成功")
    else:
        logger.error("压缩失败")

    md_path = os.path.join(local_md_dir, file_name + ".md")
    with open(md_path, 'r', encoding='utf-8') as f:
        txt_content = f.read()
    md_content = replace_image_with_base64(txt_content, local_md_dir)
    # Path of the layout PDF inside the output directory (built, not checked for existence).
    new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")

    return md_content, txt_content, archive_zip_path, new_pdf_path
151
+
152
+
153
# LaTeX delimiter pairs: "$$ ... $$" is marked display=True, "$ ... $" display=False.
latex_delimiters = [
    {"left": "$$", "right": "$$", "display": True},
    {"left": "$", "right": "$", "display": False},
]
155
+
156
+
157
def init_model():
    """Request both model variants from ``ModelSingleton`` up front.

    ``get_model`` is called once with ``(False, False)`` and once with
    ``(True, False)``; the returned models are not kept here.
    NOTE(review): judging by the log messages the first flag selects the OCR
    model -- confirm against ``ModelSingleton.get_model``.

    :return: ``0`` if both calls succeed, ``-1`` if either raises (the
        exception is logged). An import failure is not caught and propagates.
    """
    from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
    try:
        model_manager = ModelSingleton()
        model_manager.get_model(False, False)
        logger.info("txt_model init final")
        model_manager.get_model(True, False)
        logger.info("ocr_model init final")
        return 0
    except Exception as e:
        logger.exception(e)
        return -1
169
+
170
+
171
# Initialise the models once at import time; 0 means success, -1 failure.
model_init = init_model()
logger.info(f"model_init: {model_init}")


# Read the page header with an explicit encoding so the result does not
# depend on the platform's default locale encoding.
with open("header.html", "r", encoding="utf-8") as file:
    header = file.read()


# Language codes grouped by script.
# NOTE(review): presumably the codes accepted by the OCR backend -- confirm.
latin_lang = [
    'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
    'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
    'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
    'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german'
]
arabic_lang = ['ar', 'fa', 'ug', 'ur']
cyrillic_lang = [
    'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
    'dar', 'inh', 'che', 'lbe', 'lez', 'tab'
]
devanagari_lang = [
    'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
    'sa', 'bgc'
]
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']

# Every selectable language: the empty choice and 'auto' first, then all groups.
all_lang = [
    '', 'auto',
    *other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang,
]
198
+
199
+
200
def to_pdf(file_path):
    """Return the path of a PDF version of ``file_path``.

    If the document opened by ``pymupdf`` is already a PDF, the input path is
    returned unchanged. Otherwise the document is converted with
    ``convert_to_pdf`` and the bytes are written to a new ``<uuid4>.pdf`` file
    next to the input.

    :param file_path: Path of the input document.
    :return: ``file_path`` itself for a PDF, else the path of the new PDF file.
    """
    with pymupdf.open(file_path) as doc:
        if doc.is_pdf:
            return file_path

        converted = doc.convert_to_pdf()

        # A random UUID gives the converted file a unique name.
        pdf_name = f"{uuid.uuid4()}.pdf"
        # The converted file is placed in the same directory as the input.
        tmp_file_path = os.path.join(os.path.dirname(file_path), pdf_name)

        # Write the converted bytes to disk.
        with open(tmp_file_path, 'wb') as tmp_pdf_file:
            tmp_pdf_file.write(converted)

        return tmp_file_path
218
+
219
+
220
 
221
  @app.post("/process_document")
222
  async def process_document(