marcosremar2 committed on
Commit
53a34c2
·
1 Parent(s): ab599b4

Fix: Use PymuDocDataset in API endpoint

Browse files
Files changed (1) hide show
  1. app.py +69 -27
app.py CHANGED
@@ -1,26 +1,30 @@
1
  from fastapi import FastAPI, UploadFile, File, HTTPException
2
  from fastapi.responses import JSONResponse
3
  from fastapi.middleware.cors import CORSMiddleware
4
- import magic_pdf
5
  import tempfile
6
  import os
7
  import json
8
  import traceback
9
- import uvicorn
10
  from datetime import datetime
11
  from typing import Dict, List, Any, Optional
12
 
 
 
 
 
 
 
13
  # Application metadata
14
  app_description = """
15
  # MinerU PDF Processor API
16
 
17
  This API provides PDF processing capabilities using MinerU's magic-pdf library.
18
- It extracts text content and tables from PDF documents.
19
 
20
  ## Features:
21
  - PDF text extraction
22
- - Table detection and extraction
23
- - JSON response for easy integration
24
  """
25
 
26
  app = FastAPI(
@@ -41,6 +45,11 @@ app.add_middleware(
41
  allow_headers=["*"], # Allow all headers
42
  )
43
 
 
 
 
 
 
44
  # Health check endpoint
45
  @app.get("/health", tags=["Health"])
46
  async def health_check() -> Dict[str, Any]:
@@ -57,13 +66,13 @@ async def health_check() -> Dict[str, Any]:
57
  @app.post("/extract", tags=["PDF Processing"])
58
  async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
59
  """
60
- Extract text and tables from a PDF file.
61
 
62
  Parameters:
63
  file: The PDF file to process
64
 
65
  Returns:
66
- A JSON object containing the extracted content with pages, text blocks, and tables
67
  """
68
  if not file.filename or not file.filename.lower().endswith('.pdf'):
69
  raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")
@@ -76,35 +85,66 @@ async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
76
  with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
77
  temp_pdf.write(content)
78
  temp_pdf_path = temp_pdf.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- # Process the PDF using magic_pdf.PDF class
81
- result = magic_pdf.PDF(temp_pdf_path).parse()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
- # Convert result to dictionary
84
- output = {
 
 
 
 
 
 
85
  "filename": file.filename,
86
- "pages": []
 
 
 
87
  }
88
-
89
- for page in result.pages:
90
- page_data = {
91
- "page_num": page.page_num,
92
- "text": "\n".join([block.text for block in page.text_blocks]),
93
- "tables": []
94
- }
95
-
96
- for table in page.tables:
97
- page_data["tables"].append(table.to_markdown())
98
-
99
- output["pages"].append(page_data)
100
-
101
- return {"result": output}
102
 
103
  except Exception as e:
104
  error_detail = str(e)
105
  error_trace = traceback.format_exc()
106
 
107
- # Log the error (would be better with a proper logger)
108
  print(f"Error processing PDF: {error_detail}")
109
  print(error_trace)
110
 
@@ -126,4 +166,6 @@ async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
126
  pass
127
 
128
  if __name__ == "__main__":
 
 
129
  uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
 
1
  from fastapi import FastAPI, UploadFile, File, HTTPException
2
  from fastapi.responses import JSONResponse
3
  from fastapi.middleware.cors import CORSMiddleware
 
4
  import tempfile
5
  import os
6
  import json
7
  import traceback
 
8
  from datetime import datetime
9
  from typing import Dict, List, Any, Optional
10
 
11
+ # Import necessary components from magic_pdf based on convert_pdf.py
12
+ from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
13
+ from magic_pdf.data.dataset import PymuDocDataset
14
+ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
15
+ from magic_pdf.config.enums import SupportedPdfParseMethod
16
+
17
  # Application metadata
18
  app_description = """
19
  # MinerU PDF Processor API
20
 
21
  This API provides PDF processing capabilities using MinerU's magic-pdf library.
22
+ It extracts text content and generates markdown from PDF documents.
23
 
24
  ## Features:
25
  - PDF text extraction
26
+ - Markdown conversion
27
+ - Layout analysis (via output files)
28
  """
29
 
30
  app = FastAPI(
 
45
  allow_headers=["*"], # Allow all headers
46
  )
47
 
48
+ # Define output directories (relative to the app's working directory in the container)
49
+ local_image_dir, local_md_dir = "output/images", "output"
50
+ os.makedirs(local_image_dir, exist_ok=True)
51
+ os.makedirs(local_md_dir, exist_ok=True)
52
+
53
  # Health check endpoint
54
  @app.get("/health", tags=["Health"])
55
  async def health_check() -> Dict[str, Any]:
 
66
  @app.post("/extract", tags=["PDF Processing"])
67
  async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
68
  """
69
+ Process a PDF file using PymuDocDataset and return the extracted markdown content.
70
 
71
  Parameters:
72
  file: The PDF file to process
73
 
74
  Returns:
75
+ A JSON object containing the extracted markdown and status.
76
  """
77
  if not file.filename or not file.filename.lower().endswith('.pdf'):
78
  raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")
 
85
  with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
86
  temp_pdf.write(content)
87
  temp_pdf_path = temp_pdf.name
88
+
89
+ # Clear previous output files (optional, depending on desired behavior)
90
+ # You might want to handle output naming differently in a multi-user API context
91
+ # For simplicity, we'll clear the output dir here like in convert_pdf.py
92
+ for item in os.listdir(local_image_dir):
93
+ os.remove(os.path.join(local_image_dir, item))
94
+ for item in os.listdir(local_md_dir):
95
+ if os.path.isfile(os.path.join(local_md_dir, item)):
96
+ os.remove(os.path.join(local_md_dir, item))
97
+
98
+ # Get filename and prepare output paths for magic-pdf
99
+ pdf_file_name = os.path.basename(temp_pdf_path)
100
+ name_without_suff = os.path.splitext(pdf_file_name)[0]
101
+ image_dir_rel_path = str(os.path.basename(local_image_dir)) # Relative path for markdown image links
102
 
103
+ # Setup writers
104
+ image_writer = FileBasedDataWriter(local_image_dir)
105
+ md_writer = FileBasedDataWriter(local_md_dir)
106
+
107
+ # Use PymuDocDataset for processing
108
+ ds = PymuDocDataset(content) # Pass pdf bytes directly
109
+
110
+ # Inference and pipeline based on PDF type
111
+ if ds.classify() == SupportedPdfParseMethod.OCR:
112
+ infer_result = ds.apply(doc_analyze, ocr=True)
113
+ pipe_result = infer_result.pipe_ocr_mode(image_writer)
114
+ else:
115
+ infer_result = ds.apply(doc_analyze, ocr=False)
116
+ pipe_result = infer_result.pipe_txt_mode(image_writer)
117
+
118
+ # Optional: Generate intermediate output files (comment out if not needed for API)
119
+ infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
120
+ pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
121
+ pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
122
+ pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir_rel_path)
123
+ pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
124
+
125
+ # Get markdown content
126
+ md_content = pipe_result.get_markdown(image_dir_rel_path)
127
 
128
+ # Dump markdown to file (optional for API, but useful for debugging/access)
129
+ md_file_path = f"{name_without_suff}.md"
130
+ pipe_result.dump_md(md_writer, md_file_path, image_dir_rel_path)
131
+ print(f"Markdown saved to: {os.path.join(local_md_dir, md_file_path)}")
132
+
133
+
134
+ # Return the markdown content in the response
135
+ return {
136
  "filename": file.filename,
137
+ "status": "success",
138
+ "markdown_content": md_content
139
+ # You could potentially add links to the generated files here if needed
140
+ # "output_files": { ... }
141
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
  except Exception as e:
144
  error_detail = str(e)
145
  error_trace = traceback.format_exc()
146
 
147
+ # Log the error
148
  print(f"Error processing PDF: {error_detail}")
149
  print(error_trace)
150
 
 
166
  pass
167
 
168
  if __name__ == "__main__":
169
+ # Keep uvicorn import here for local running
170
+ import uvicorn
171
  uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)