dmitrynovikov2121 commited on
Commit
76d97a2
·
verified ·
1 Parent(s): ed98a83

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -274
app.py CHANGED
@@ -9,83 +9,21 @@ from pathlib import Path
9
  import re
10
  import uuid
11
  import pymupdf
12
- import telebot
13
- from threading import Thread
14
- import requests
15
  from io import BytesIO
 
 
 
16
 
17
- # Add your Telegram bot token as an environment variable in Hugging Face space
18
- BOT_TOKEN = os.getenv('BOT_TOKEN')
19
- bot = telebot.TeleBot(BOT_TOKEN)
20
 
21
- def download_file_from_telegram(file_id):
22
- file_info = bot.get_file(file_id)
23
- file_path = file_info.file_path
24
- file_url = f'https://api.telegram.org/file/bot{BOT_TOKEN}/{file_path}'
25
- response = requests.get(file_url)
26
- return BytesIO(response.content)
27
-
28
- @bot.message_handler(content_types=['document'])
29
- def handle_docs(message):
30
- try:
31
- # Get file from telegram
32
- file_id = message.document.file_id
33
- file_name = message.document.file_name
34
-
35
- if not file_name.lower().endswith(('.pdf', '.png', '.jpg', '.jpeg')):
36
- bot.reply_to(message, "Please send only PDF or image files.")
37
- return
38
-
39
- # Download file
40
- file_data = download_file_from_telegram(file_id)
41
-
42
- # Save temporarily
43
- temp_path = f"/tmp/{file_name}"
44
- with open(temp_path, 'wb') as f:
45
- f.write(file_data.getvalue())
46
-
47
- # Process file using your existing function
48
- md_content, txt_content, archive_zip_path, new_pdf_path = to_markdown(
49
- temp_path,
50
- end_pages=10, # default value
51
- is_ocr=False, # default value
52
- layout_mode="doclayout_yolo", # default value
53
- formula_enable=True, # default value
54
- table_enable=True, # default value
55
- language='auto' # default value
56
- )
57
-
58
- # Send back results
59
- with open(archive_zip_path, 'rb') as zip_file:
60
- bot.send_document(message.chat.id, zip_file)
61
-
62
- # Send markdown content in chunks if it's too long
63
- max_length = 4096
64
- for i in range(0, len(md_content), max_length):
65
- chunk = md_content[i:i + max_length]
66
- bot.send_message(message.chat.id, chunk)
67
-
68
- # Cleanup
69
- os.remove(temp_path)
70
-
71
- except Exception as e:
72
- bot.reply_to(message, f"Error processing document: {str(e)}")
73
-
74
- @bot.message_handler(commands=['start', 'help'])
75
- def send_welcome(message):
76
- bot.reply_to(message, "Hello! Send me a PDF or image file and I'll process it for you.")
77
-
78
- def run_bot():
79
- bot.infinity_polling()
80
-
81
- # os.system('pip install -U magic-pdf==0.10.5')
82
  os.system('pip uninstall -y magic-pdf')
83
  os.system('pip install git+https://github.com/opendatalab/MinerU.git@dev')
84
- # os.system('pip install git+https://github.com/myhloli/Magic-PDF.git@dev')
85
-
86
  os.system('wget https://github.com/opendatalab/MinerU/raw/dev/scripts/download_models_hf.py -O download_models_hf.py')
87
  os.system('python download_models_hf.py')
88
 
 
89
  with open('/home/user/magic-pdf.json', 'r') as file:
90
  data = json.load(file)
91
 
@@ -98,227 +36,70 @@ with open('/home/user/magic-pdf.json', 'w') as file:
98
  json.dump(data, file, indent=4)
99
 
100
  os.system('cp -r paddleocr /home/user/.paddleocr')
101
- from gradio_pdf import PDF
102
-
103
- import gradio as gr
104
- from loguru import logger
105
 
 
106
  from magic_pdf.data.data_reader_writer import FileBasedDataReader
107
  from magic_pdf.libs.hash_utils import compute_sha256
108
  from magic_pdf.tools.common import do_parse, prepare_env
109
-
110
 
111
  def read_fn(path):
112
  disk_rw = FileBasedDataReader(os.path.dirname(path))
113
  return disk_rw.read(os.path.basename(path))
114
 
115
-
116
- def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_enable, table_enable, language):
117
- os.makedirs(output_dir, exist_ok=True)
118
-
 
 
 
 
 
 
 
 
 
119
  try:
120
- file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
121
- pdf_data = read_fn(doc_path)
122
- if is_ocr:
123
- parse_method = "ocr"
124
- else:
125
- parse_method = "auto"
126
- local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
127
- do_parse(
128
- output_dir,
129
- file_name,
130
- pdf_data,
131
- [],
132
- parse_method,
133
- False,
134
- end_page_id=end_page_id,
135
- layout_model=layout_mode,
136
  formula_enable=formula_enable,
137
  table_enable=table_enable,
138
- lang=language,
139
- f_dump_orig_pdf=False,
140
  )
141
- return local_md_dir, file_name
142
- except Exception as e:
143
- logger.exception(e)
144
-
145
-
146
- def compress_directory_to_zip(directory_path, output_zip_path):
147
- """
148
- 压缩指定目录到一个 ZIP 文件。
149
-
150
- :param directory_path: 要压缩的目录路径
151
- :param output_zip_path: 输出的 ZIP 文件路径
152
- """
153
- try:
154
- with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
155
-
156
- # 遍历目录中的所有文件和子目录
157
- for root, dirs, files in os.walk(directory_path):
158
- for file in files:
159
- # 构建完整的文件路径
160
- file_path = os.path.join(root, file)
161
- # 计算相对路径
162
- arcname = os.path.relpath(file_path, directory_path)
163
- # 添加文件到 ZIP 文件
164
- zipf.write(file_path, arcname)
165
- return 0
166
- except Exception as e:
167
- logger.exception(e)
168
- return -1
169
-
170
-
171
- def image_to_base64(image_path):
172
- with open(image_path, "rb") as image_file:
173
- return base64.b64encode(image_file.read()).decode('utf-8')
174
-
175
-
176
- def replace_image_with_base64(markdown_text, image_dir_path):
177
- # 匹配Markdown中的图片标签
178
- pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'
179
-
180
- # 替换图片链接
181
- def replace(match):
182
- relative_path = match.group(1)
183
- full_path = os.path.join(image_dir_path, relative_path)
184
- base64_image = image_to_base64(full_path)
185
- return f"![{relative_path}](data:image/jpeg;base64,{base64_image})"
186
-
187
- # 应用替换
188
- return re.sub(pattern, replace, markdown_text)
189
-
190
-
191
- def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language):
192
- file_path = to_pdf(file_path)
193
- if end_pages > 20:
194
- end_pages = 20
195
- # 获取识别的md文件以及压缩包文件路径
196
- local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
197
- layout_mode, formula_enable, table_enable, language)
198
- archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
199
- zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
200
- if zip_archive_success == 0:
201
- logger.info("压缩成功")
202
- else:
203
- logger.error("压缩失败")
204
- md_path = os.path.join(local_md_dir, file_name + ".md")
205
- with open(md_path, 'r', encoding='utf-8') as f:
206
- txt_content = f.read()
207
- md_content = replace_image_with_base64(txt_content, local_md_dir)
208
- # 返回转换后的PDF路径
209
- new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
210
-
211
- return md_content, txt_content, archive_zip_path, new_pdf_path
212
-
213
-
214
- latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
215
- {"left": '$', "right": '$', "display": False}]
216
-
217
-
218
- def init_model():
219
- from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
220
- try:
221
- model_manager = ModelSingleton()
222
- txt_model = model_manager.get_model(False, False)
223
- logger.info(f"txt_model init final")
224
- ocr_model = model_manager.get_model(True, False)
225
- logger.info(f"ocr_model init final")
226
- return 0
227
  except Exception as e:
228
- logger.exception(e)
229
- return -1
230
-
 
231
 
 
232
  model_init = init_model()
233
  logger.info(f"model_init: {model_init}")
234
 
235
-
236
- with open("header.html", "r") as file:
237
- header = file.read()
238
-
239
-
240
- latin_lang = [
241
- 'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
242
- 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
243
- 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
244
- 'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german'
245
- ]
246
- arabic_lang = ['ar', 'fa', 'ug', 'ur']
247
- cyrillic_lang = [
248
- 'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
249
- 'dar', 'inh', 'che', 'lbe', 'lez', 'tab'
250
- ]
251
- devanagari_lang = [
252
- 'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
253
- 'sa', 'bgc'
254
- ]
255
- other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
256
-
257
- all_lang = ['', 'auto']
258
- all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
259
-
260
-
261
- def to_pdf(file_path):
262
- with pymupdf.open(file_path) as f:
263
- if f.is_pdf:
264
- return file_path
265
- else:
266
- pdf_bytes = f.convert_to_pdf()
267
- # 将pdfbytes 写入到uuid.pdf中
268
- # 生成唯一的文件名
269
- unique_filename = f"{uuid.uuid4()}.pdf"
270
-
271
- # 构建完整的文件路径
272
- tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
273
-
274
- # 将字节数据写入文件
275
- with open(tmp_file_path, 'wb') as tmp_pdf_file:
276
- tmp_pdf_file.write(pdf_bytes)
277
-
278
- return tmp_file_path
279
-
280
-
281
  if __name__ == "__main__":
282
- # Start bot in a separate thread
283
- bot_thread = Thread(target=run_bot)
284
- bot_thread.start()
285
-
286
- with gr.Blocks() as demo:
287
- gr.HTML(header)
288
- with gr.Row():
289
- with gr.Column(variant='panel', scale=5):
290
- file = gr.File(label="Please upload a PDF or image", file_types=[".pdf", ".png", ".jpeg", ".jpg"])
291
- max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
292
- with gr.Row():
293
- layout_mode = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], label="Layout model", value="doclayout_yolo")
294
- language = gr.Dropdown(all_lang, label="Language", value='auto')
295
- with gr.Row():
296
- formula_enable = gr.Checkbox(label="Enable formula recognition", value=True)
297
- is_ocr = gr.Checkbox(label="Force enable OCR", value=False)
298
- table_enable = gr.Checkbox(label="Enable table recognition(test)", value=True)
299
- with gr.Row():
300
- change_bu = gr.Button("Convert")
301
- clear_bu = gr.ClearButton(value="Clear")
302
- pdf_show = PDF(label='PDF preview', interactive=False, visible=True, height=800)
303
- with gr.Accordion("Examples:"):
304
- example_root = os.path.join(os.path.dirname(__file__), "examples")
305
- gr.Examples(
306
- examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
307
- _.endswith("pdf")],
308
- inputs=file
309
- )
310
-
311
- with gr.Column(variant='panel', scale=5):
312
- output_file = gr.File(label="convert result", interactive=False)
313
- with gr.Tabs():
314
- with gr.Tab("Markdown rendering"):
315
- md = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True,
316
- latex_delimiters=latex_delimiters, line_breaks=True)
317
- with gr.Tab("Markdown text"):
318
- md_text = gr.TextArea(lines=45, show_copy_button=True)
319
- file.change(fn=to_pdf, inputs=file, outputs=pdf_show)
320
- change_bu.click(fn=to_markdown, inputs=[file, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
321
- outputs=[md, md_text, output_file, pdf_show], api_name=False)
322
- clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])
323
-
324
- demo.launch(ssr_mode=False)
 
9
  import re
10
  import uuid
11
  import pymupdf
 
 
 
12
  from io import BytesIO
13
+ from fastapi import FastAPI, File, UploadFile
14
+ from fastapi.responses import JSONResponse
15
+ import uvicorn
16
 
17
+ # Initialize FastAPI app
18
+ app = FastAPI()
 
19
 
20
+ # Setup and installation commands
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  os.system('pip uninstall -y magic-pdf')
22
  os.system('pip install git+https://github.com/opendatalab/MinerU.git@dev')
 
 
23
  os.system('wget https://github.com/opendatalab/MinerU/raw/dev/scripts/download_models_hf.py -O download_models_hf.py')
24
  os.system('python download_models_hf.py')
25
 
26
+ # Configure magic-pdf settings
27
  with open('/home/user/magic-pdf.json', 'r') as file:
28
  data = json.load(file)
29
 
 
36
  json.dump(data, file, indent=4)
37
 
38
  os.system('cp -r paddleocr /home/user/.paddleocr')
 
 
 
 
39
 
40
+ # Import required modules
41
  from magic_pdf.data.data_reader_writer import FileBasedDataReader
42
  from magic_pdf.libs.hash_utils import compute_sha256
43
  from magic_pdf.tools.common import do_parse, prepare_env
44
+ from loguru import logger
45
 
46
  def read_fn(path):
47
  disk_rw = FileBasedDataReader(os.path.dirname(path))
48
  return disk_rw.read(os.path.basename(path))
49
 
50
+ # Your existing functions here (parse_pdf, compress_directory_to_zip, image_to_base64, etc.)
51
+ # ... (keep all the utility functions from your original code)
52
+
53
+ @app.post("/process_document")
54
+ async def process_document(
55
+ file: UploadFile = File(...),
56
+ end_pages: int = 10,
57
+ is_ocr: bool = False,
58
+ layout_mode: str = "doclayout_yolo",
59
+ formula_enable: bool = True,
60
+ table_enable: bool = True,
61
+ language: str = "auto"
62
+ ):
63
  try:
64
+ # Save uploaded file temporarily
65
+ temp_path = f"/tmp/{file.filename}"
66
+ with open(temp_path, "wb") as buffer:
67
+ content = await file.read()
68
+ buffer.write(content)
69
+
70
+ # Process file
71
+ md_content, txt_content, archive_zip_path, new_pdf_path = to_markdown(
72
+ temp_path,
73
+ end_pages=end_pages,
74
+ is_ocr=is_ocr,
75
+ layout_mode=layout_mode,
 
 
 
 
76
  formula_enable=formula_enable,
77
  table_enable=table_enable,
78
+ language=language
 
79
  )
80
+
81
+ # Read the zip file as base64
82
+ with open(archive_zip_path, "rb") as zip_file:
83
+ zip_content = base64.b64encode(zip_file.read()).decode()
84
+
85
+ # Clean up
86
+ os.remove(temp_path)
87
+
88
+ return JSONResponse({
89
+ "markdown_content": md_content,
90
+ "text_content": txt_content,
91
+ "zip_file_base64": zip_content
92
+ })
93
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  except Exception as e:
95
+ return JSONResponse(
96
+ status_code=500,
97
+ content={"error": str(e)}
98
+ )
99
 
100
+ # Initialize models
101
  model_init = init_model()
102
  logger.info(f"model_init: {model_init}")
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  if __name__ == "__main__":
105
+ uvicorn.run(app, host="0.0.0.0", port=7860)