marcosremar2 committed on
Commit
3c3eb16
·
1 Parent(s): ed4cfc9

Update PDF to Markdown converter API with NVIDIA L4 support

Browse files
Files changed (3) hide show
  1. Dockerfile +5 -2
  2. app/main.py +54 -13
  3. pdf_converter/convert_pdf_to_md.py +35 -5
Dockerfile CHANGED
@@ -71,10 +71,13 @@ ENV HF_HOME=/home/user/.cache/huggingface
71
  ENV TORCH_HOME=/home/user/.cache/torch
72
  # Add environment variable for marker font path (alternative fix)
73
  ENV MARKER_FONT_PATH=/home/user/.cache/marker_fonts
 
 
 
74
 
75
  # Expose the port
76
  EXPOSE 7860
77
 
78
  # Command to run the application with Gunicorn and Uvicorn workers
79
- # Increased workers to 16 for L40S. Adjust based on monitoring.
80
- CMD ["gunicorn", "-w", "16", "-k", "uvicorn.workers.UvicornWorker", "app.main:app", "--bind", "0.0.0.0:7860"]
 
71
  ENV TORCH_HOME=/home/user/.cache/torch
72
  # Add environment variable for marker font path (alternative fix)
73
  ENV MARKER_FONT_PATH=/home/user/.cache/marker_fonts
74
+ # Add PyTorch memory optimization environment variables
75
+ ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:128
76
+ ENV CUDA_VISIBLE_DEVICES=0
77
 
78
  # Expose the port
79
  EXPOSE 7860
80
 
81
  # Command to run the application with Gunicorn and Uvicorn workers
82
+ # Reduced workers to 4 (from 16) to avoid OOM errors
83
+ CMD ["gunicorn", "-w", "4", "-k", "uvicorn.workers.UvicornWorker", "app.main:app", "--bind", "0.0.0.0:7860"]
app/main.py CHANGED
@@ -11,8 +11,16 @@ from typing import Dict, Any
11
  import shutil
12
  import torch
13
  import asyncio
 
14
  from contextlib import asynccontextmanager
15
 
 
 
 
 
 
 
 
16
  # Add the parent directory to sys.path to import convert_pdf_to_md
17
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
18
  # Import the initialization function as well
@@ -33,21 +41,41 @@ images_dir = os.path.join(output_dir, "images")
33
  # Create output directory if it doesn't exist
34
  os.makedirs(output_dir, exist_ok=True)
35
  os.makedirs(images_dir, exist_ok=True)
36
- print(f"Using output directory: {output_dir}") # Add log for debugging
37
  # --- End Configuration ---
38
 
 
 
 
39
  # --- Lifespan management for model loading ---
40
  @asynccontextmanager
41
  async def lifespan(app: FastAPI):
 
42
  # Load the ML model during startup
43
- print("Application startup: Initializing marker converter...")
44
  loop = asyncio.get_event_loop()
 
45
  # Run in executor to avoid blocking the event loop
46
- await loop.run_in_executor(None, initialize_converter)
47
- print("Marker converter initialization process finished.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  yield
 
49
  # Clean up resources if needed during shutdown
50
- print("Application shutdown.")
51
 
52
  # Application metadata
53
  app_description = """
@@ -93,14 +121,17 @@ async def health_check() -> Dict[str, Any]:
93
  "cuda_available": torch.cuda.is_available(),
94
  "device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
95
  "device_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A",
96
- "current_device": torch.cuda.current_device() if torch.cuda.is_available() else -1
 
 
97
  }
98
 
99
  return {
100
- "status": "healthy",
101
  "timestamp": datetime.now().isoformat(),
102
  "service": "pdf-to-markdown-converter",
103
  "gpu": gpu_info,
 
104
  "output_directory_used": output_dir # Add info for debugging
105
  }
106
 
@@ -115,6 +146,16 @@ async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
115
  Returns:
116
  A JSON object containing the conversion result
117
  """
 
 
 
 
 
 
 
 
 
 
118
  if not file.filename or not file.filename.lower().endswith('.pdf'):
119
  raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")
120
 
@@ -127,13 +168,13 @@ async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
127
  with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False, dir="/tmp") as temp_pdf:
128
  temp_pdf.write(content)
129
  temp_pdf_path = temp_pdf.name
130
- print(f"Temporary PDF saved to: {temp_pdf_path}")
131
 
132
  # Get the base name of the file for the output
133
  filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
134
  # Use the configured output_dir for saving the markdown file
135
  output_md_file = os.path.join(output_dir, f"{filename_without_ext}.md")
136
- print(f"Output markdown path: {output_md_file}")
137
 
138
  # Process the PDF using the pre-loaded converter
139
  md_content = convert_pdf(temp_pdf_path, output_md_file)
@@ -153,8 +194,8 @@ async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
153
  except Exception as e:
154
  error_detail = str(e)
155
  error_trace = traceback.format_exc()
156
- print(f"Error processing PDF '{file.filename if file else 'N/A'}': {error_detail}")
157
- print(error_trace)
158
  return JSONResponse(
159
  status_code=500,
160
  content={
@@ -169,9 +210,9 @@ async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
169
  if temp_pdf_path and os.path.exists(temp_pdf_path):
170
  try:
171
  os.unlink(temp_pdf_path)
172
- print(f"Temporary file {temp_pdf_path} deleted.")
173
  except Exception as unlink_err:
174
- print(f"Error deleting temporary file {temp_pdf_path}: {unlink_err}")
175
 
176
  # Remove the old __main__ block if it exists, as CMD in Dockerfile handles startup
177
  # if __name__ == "__main__":
 
11
  import shutil
12
  import torch
13
  import asyncio
14
+ import logging
15
  from contextlib import asynccontextmanager
16
 
17
+ # Configure logging
18
+ logging.basicConfig(
19
+ level=logging.INFO,
20
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
21
+ )
22
+ logger = logging.getLogger("pdf_converter_api")
23
+
24
  # Add the parent directory to sys.path to import convert_pdf_to_md
25
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
26
  # Import the initialization function as well
 
41
  # Create output directory if it doesn't exist
42
  os.makedirs(output_dir, exist_ok=True)
43
  os.makedirs(images_dir, exist_ok=True)
44
+ logger.info(f"Using output directory: {output_dir}")
45
  # --- End Configuration ---
46
 
47
+ # Track initialization status
48
+ initialization_successful = False
49
+
50
  # --- Lifespan management for model loading ---
51
  @asynccontextmanager
52
  async def lifespan(app: FastAPI):
53
+ global initialization_successful
54
  # Load the ML model during startup
55
+ logger.info("Application startup: Initializing marker converter...")
56
  loop = asyncio.get_event_loop()
57
+
58
  # Run in executor to avoid blocking the event loop
59
+ try:
60
+ # Add timeout to prevent indefinite hanging
61
+ await asyncio.wait_for(
62
+ loop.run_in_executor(None, initialize_converter),
63
+ timeout=300 # 5 minute timeout for initialization
64
+ )
65
+ initialization_successful = True
66
+ logger.info("Marker converter initialization process finished successfully.")
67
+ except asyncio.TimeoutError:
68
+ logger.error("Marker converter initialization timed out after 5 minutes.")
69
+ initialization_successful = False
70
+ except Exception as e:
71
+ logger.error(f"Marker converter initialization failed: {e}")
72
+ logger.error(traceback.format_exc())
73
+ initialization_successful = False
74
+
75
  yield
76
+
77
  # Clean up resources if needed during shutdown
78
+ logger.info("Application shutdown.")
79
 
80
  # Application metadata
81
  app_description = """
 
121
  "cuda_available": torch.cuda.is_available(),
122
  "device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
123
  "device_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A",
124
+ "current_device": torch.cuda.current_device() if torch.cuda.is_available() else -1,
125
+ "memory_allocated": f"{torch.cuda.memory_allocated()/1024**2:.2f} MB" if torch.cuda.is_available() else "N/A",
126
+ "memory_reserved": f"{torch.cuda.memory_reserved()/1024**2:.2f} MB" if torch.cuda.is_available() else "N/A",
127
  }
128
 
129
  return {
130
+ "status": "healthy" if initialization_successful else "degraded",
131
  "timestamp": datetime.now().isoformat(),
132
  "service": "pdf-to-markdown-converter",
133
  "gpu": gpu_info,
134
+ "model_initialized": initialization_successful,
135
  "output_directory_used": output_dir # Add info for debugging
136
  }
137
 
 
146
  Returns:
147
  A JSON object containing the conversion result
148
  """
149
+ # Check if models initialized successfully
150
+ if not initialization_successful:
151
+ return JSONResponse(
152
+ status_code=503, # Service Unavailable
153
+ content={
154
+ "error": "Service not ready",
155
+ "detail": "The model initialization failed during startup. The service cannot process requests at this time."
156
+ }
157
+ )
158
+
159
  if not file.filename or not file.filename.lower().endswith('.pdf'):
160
  raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")
161
 
 
168
  with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False, dir="/tmp") as temp_pdf:
169
  temp_pdf.write(content)
170
  temp_pdf_path = temp_pdf.name
171
+ logger.info(f"Temporary PDF saved to: {temp_pdf_path}")
172
 
173
  # Get the base name of the file for the output
174
  filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
175
  # Use the configured output_dir for saving the markdown file
176
  output_md_file = os.path.join(output_dir, f"{filename_without_ext}.md")
177
+ logger.info(f"Output markdown path: {output_md_file}")
178
 
179
  # Process the PDF using the pre-loaded converter
180
  md_content = convert_pdf(temp_pdf_path, output_md_file)
 
194
  except Exception as e:
195
  error_detail = str(e)
196
  error_trace = traceback.format_exc()
197
+ logger.error(f"Error processing PDF '{file.filename if file else 'N/A'}': {error_detail}")
198
+ logger.error(error_trace)
199
  return JSONResponse(
200
  status_code=500,
201
  content={
 
210
  if temp_pdf_path and os.path.exists(temp_pdf_path):
211
  try:
212
  os.unlink(temp_pdf_path)
213
+ logger.info(f"Temporary file {temp_pdf_path} deleted.")
214
  except Exception as unlink_err:
215
+ logger.error(f"Error deleting temporary file {temp_pdf_path}: {unlink_err}")
216
 
217
  # Remove the old __main__ block if it exists, as CMD in Dockerfile handles startup
218
  # if __name__ == "__main__":
pdf_converter/convert_pdf_to_md.py CHANGED
@@ -1,6 +1,8 @@
1
  import marker
2
  import os
3
  import sys
 
 
4
  from marker.config.parser import ConfigParser
5
  from marker.models import create_model_dict
6
 
@@ -13,6 +15,12 @@ def initialize_converter():
13
  if _converter is None:
14
  print("Initializing marker models...")
15
  try:
 
 
 
 
 
 
16
  # Set custom font path from environment variable if available
17
  font_path = os.environ.get('MARKER_FONT_PATH')
18
  if font_path:
@@ -29,15 +37,15 @@ def initialize_converter():
29
  print(f"Error setting custom font path: {e}", file=sys.stderr)
30
 
31
  # Create configuration, explicitly setting output format and batch multiplier
32
- # Increased batch_multiplier for potentially faster processing on L40S
33
  config_parser = ConfigParser({
34
  'output_format': 'markdown',
35
- 'batch_multiplier': 4 # Increased from default 2
 
 
36
  })
37
 
38
- # Load models
39
- # Potential optimization: Check if device mapping/multi-GPU is possible
40
- models = create_model_dict() # Add device mapping here if applicable
41
 
42
  # Get converter class and create converter
43
  converter_cls = config_parser.get_converter_cls()
@@ -48,10 +56,21 @@ def initialize_converter():
48
  renderer=config_parser.get_renderer(),
49
  llm_service=config_parser.get_llm_service()
50
  )
 
 
 
 
 
 
 
51
  print("Marker models initialized successfully with batch_multiplier=4.")
52
  except Exception as e:
53
  print(f"Failed to initialize marker models: {e}", file=sys.stderr)
54
  _converter = None # Ensure it's None if init fails
 
 
 
 
55
  raise
56
  else:
57
  print("Marker models already initialized.")
@@ -78,6 +97,10 @@ def convert_pdf(pdf_input_path, output_md_path=None):
78
  print(f"Starting conversion of '{pdf_input_path}' using pre-loaded models...")
79
 
80
  try:
 
 
 
 
81
  # Convert the PDF to markdown using the pre-loaded converter
82
  result = _converter(pdf_input_path)
83
 
@@ -94,9 +117,16 @@ def convert_pdf(pdf_input_path, output_md_path=None):
94
  f.write(markdown_text)
95
  print(f"Successfully saved markdown to '{output_md_path}'")
96
 
 
 
 
 
97
  return markdown_text
98
 
99
  except Exception as e:
100
  print(f"An error occurred during conversion: {e}", file=sys.stderr)
101
  print(f"Error details: {str(type(e))}", file=sys.stderr)
 
 
 
102
  raise
 
1
  import marker
2
  import os
3
  import sys
4
+ import gc
5
+ import torch
6
  from marker.config.parser import ConfigParser
7
  from marker.models import create_model_dict
8
 
 
15
  if _converter is None:
16
  print("Initializing marker models...")
17
  try:
18
+ # Clear any existing CUDA cache before loading models
19
+ if torch.cuda.is_available():
20
+ torch.cuda.empty_cache()
21
+ gc.collect()
22
+ print(f"CUDA memory before initialization: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated, {torch.cuda.memory_reserved()/1024**2:.2f} MB reserved")
23
+
24
  # Set custom font path from environment variable if available
25
  font_path = os.environ.get('MARKER_FONT_PATH')
26
  if font_path:
 
37
  print(f"Error setting custom font path: {e}", file=sys.stderr)
38
 
39
  # Create configuration, explicitly setting output format and batch multiplier
 
40
  config_parser = ConfigParser({
41
  'output_format': 'markdown',
42
+ 'batch_multiplier': 4, # Increased from default 2
43
+ # Add any device-specific configuration here
44
+ 'device': 'cuda' if torch.cuda.is_available() else 'cpu'
45
  })
46
 
47
+ # Load models with explicit device mapping
48
+ models = create_model_dict()
 
49
 
50
  # Get converter class and create converter
51
  converter_cls = config_parser.get_converter_cls()
 
56
  renderer=config_parser.get_renderer(),
57
  llm_service=config_parser.get_llm_service()
58
  )
59
+
60
+ # Force another garbage collection after model load
61
+ if torch.cuda.is_available():
62
+ torch.cuda.empty_cache()
63
+ gc.collect()
64
+ print(f"CUDA memory after initialization: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated, {torch.cuda.memory_reserved()/1024**2:.2f} MB reserved")
65
+
66
  print("Marker models initialized successfully with batch_multiplier=4.")
67
  except Exception as e:
68
  print(f"Failed to initialize marker models: {e}", file=sys.stderr)
69
  _converter = None # Ensure it's None if init fails
70
+ # Attempt to clean up GPU memory in case of initialization failure
71
+ if torch.cuda.is_available():
72
+ torch.cuda.empty_cache()
73
+ gc.collect()
74
  raise
75
  else:
76
  print("Marker models already initialized.")
 
97
  print(f"Starting conversion of '{pdf_input_path}' using pre-loaded models...")
98
 
99
  try:
100
+ # Free up any temporary memory before conversion
101
+ if torch.cuda.is_available():
102
+ torch.cuda.empty_cache()
103
+
104
  # Convert the PDF to markdown using the pre-loaded converter
105
  result = _converter(pdf_input_path)
106
 
 
117
  f.write(markdown_text)
118
  print(f"Successfully saved markdown to '{output_md_path}'")
119
 
120
+ # Clean up temporary GPU memory after conversion
121
+ if torch.cuda.is_available():
122
+ torch.cuda.empty_cache()
123
+
124
  return markdown_text
125
 
126
  except Exception as e:
127
  print(f"An error occurred during conversion: {e}", file=sys.stderr)
128
  print(f"Error details: {str(type(e))}", file=sys.stderr)
129
+ # Try to clean up GPU memory on error
130
+ if torch.cuda.is_available():
131
+ torch.cuda.empty_cache()
132
  raise