Spaces:
Sleeping
Sleeping
Commit
·
3c3eb16
1
Parent(s):
ed4cfc9
Update PDF to Markdown converter API with NVIDIA L4 support
Browse files
- Dockerfile +5 -2
- app/main.py +54 -13
- pdf_converter/convert_pdf_to_md.py +35 -5
Dockerfile
CHANGED
@@ -71,10 +71,13 @@ ENV HF_HOME=/home/user/.cache/huggingface
|
|
71 |
ENV TORCH_HOME=/home/user/.cache/torch
|
72 |
# Add environment variable for marker font path (alternative fix)
|
73 |
ENV MARKER_FONT_PATH=/home/user/.cache/marker_fonts
|
|
|
|
|
|
|
74 |
|
75 |
# Expose the port
|
76 |
EXPOSE 7860
|
77 |
|
78 |
# Command to run the application with Gunicorn and Uvicorn workers
|
79 |
-
#
|
80 |
-
CMD ["gunicorn", "-w", "
|
|
|
71 |
ENV TORCH_HOME=/home/user/.cache/torch
|
72 |
# Add environment variable for marker font path (alternative fix)
|
73 |
ENV MARKER_FONT_PATH=/home/user/.cache/marker_fonts
|
74 |
+
# Add PyTorch memory optimization environment variables
|
75 |
+
ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:128
|
76 |
+
ENV CUDA_VISIBLE_DEVICES=0
|
77 |
|
78 |
# Expose the port
|
79 |
EXPOSE 7860
|
80 |
|
81 |
# Command to run the application with Gunicorn and Uvicorn workers
|
82 |
+
# Reduced workers to 4 (from 16) to avoid OOM errors
|
83 |
+
CMD ["gunicorn", "-w", "4", "-k", "uvicorn.workers.UvicornWorker", "app.main:app", "--bind", "0.0.0.0:7860"]
|
app/main.py
CHANGED
@@ -11,8 +11,16 @@ from typing import Dict, Any
|
|
11 |
import shutil
|
12 |
import torch
|
13 |
import asyncio
|
|
|
14 |
from contextlib import asynccontextmanager
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
# Add the parent directory to sys.path to import convert_pdf_to_md
|
17 |
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
18 |
# Import the initialization function as well
|
@@ -33,21 +41,41 @@ images_dir = os.path.join(output_dir, "images")
|
|
33 |
# Create output directory if it doesn't exist
|
34 |
os.makedirs(output_dir, exist_ok=True)
|
35 |
os.makedirs(images_dir, exist_ok=True)
|
36 |
-
|
37 |
# --- End Configuration ---
|
38 |
|
|
|
|
|
|
|
39 |
# --- Lifespan management for model loading ---
|
40 |
@asynccontextmanager
|
41 |
async def lifespan(app: FastAPI):
|
|
|
42 |
# Load the ML model during startup
|
43 |
-
|
44 |
loop = asyncio.get_event_loop()
|
|
|
45 |
# Run in executor to avoid blocking the event loop
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
yield
|
|
|
49 |
# Clean up resources if needed during shutdown
|
50 |
-
|
51 |
|
52 |
# Application metadata
|
53 |
app_description = """
|
@@ -93,14 +121,17 @@ async def health_check() -> Dict[str, Any]:
|
|
93 |
"cuda_available": torch.cuda.is_available(),
|
94 |
"device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
|
95 |
"device_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A",
|
96 |
-
"current_device": torch.cuda.current_device() if torch.cuda.is_available() else -1
|
|
|
|
|
97 |
}
|
98 |
|
99 |
return {
|
100 |
-
"status": "healthy",
|
101 |
"timestamp": datetime.now().isoformat(),
|
102 |
"service": "pdf-to-markdown-converter",
|
103 |
"gpu": gpu_info,
|
|
|
104 |
"output_directory_used": output_dir # Add info for debugging
|
105 |
}
|
106 |
|
@@ -115,6 +146,16 @@ async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
|
|
115 |
Returns:
|
116 |
A JSON object containing the conversion result
|
117 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
if not file.filename or not file.filename.lower().endswith('.pdf'):
|
119 |
raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")
|
120 |
|
@@ -127,13 +168,13 @@ async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
|
|
127 |
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False, dir="/tmp") as temp_pdf:
|
128 |
temp_pdf.write(content)
|
129 |
temp_pdf_path = temp_pdf.name
|
130 |
-
|
131 |
|
132 |
# Get the base name of the file for the output
|
133 |
filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
|
134 |
# Use the configured output_dir for saving the markdown file
|
135 |
output_md_file = os.path.join(output_dir, f"{filename_without_ext}.md")
|
136 |
-
|
137 |
|
138 |
# Process the PDF using the pre-loaded converter
|
139 |
md_content = convert_pdf(temp_pdf_path, output_md_file)
|
@@ -153,8 +194,8 @@ async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
|
|
153 |
except Exception as e:
|
154 |
error_detail = str(e)
|
155 |
error_trace = traceback.format_exc()
|
156 |
-
|
157 |
-
|
158 |
return JSONResponse(
|
159 |
status_code=500,
|
160 |
content={
|
@@ -169,9 +210,9 @@ async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
|
|
169 |
if temp_pdf_path and os.path.exists(temp_pdf_path):
|
170 |
try:
|
171 |
os.unlink(temp_pdf_path)
|
172 |
-
|
173 |
except Exception as unlink_err:
|
174 |
-
|
175 |
|
176 |
# Remove the old __main__ block if it exists, as CMD in Dockerfile handles startup
|
177 |
# if __name__ == "__main__":
|
|
|
11 |
import shutil
|
12 |
import torch
|
13 |
import asyncio
|
14 |
+
import logging
|
15 |
from contextlib import asynccontextmanager
|
16 |
|
17 |
+
# Configure logging
|
18 |
+
logging.basicConfig(
|
19 |
+
level=logging.INFO,
|
20 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
21 |
+
)
|
22 |
+
logger = logging.getLogger("pdf_converter_api")
|
23 |
+
|
24 |
# Add the parent directory to sys.path to import convert_pdf_to_md
|
25 |
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
26 |
# Import the initialization function as well
|
|
|
41 |
# Create output directory if it doesn't exist
|
42 |
os.makedirs(output_dir, exist_ok=True)
|
43 |
os.makedirs(images_dir, exist_ok=True)
|
44 |
+
logger.info(f"Using output directory: {output_dir}")
|
45 |
# --- End Configuration ---
|
46 |
|
47 |
+
# Track initialization status
|
48 |
+
initialization_successful = False
|
49 |
+
|
50 |
# --- Lifespan management for model loading ---
|
51 |
@asynccontextmanager
|
52 |
async def lifespan(app: FastAPI):
|
53 |
+
global initialization_successful
|
54 |
# Load the ML model during startup
|
55 |
+
logger.info("Application startup: Initializing marker converter...")
|
56 |
loop = asyncio.get_event_loop()
|
57 |
+
|
58 |
# Run in executor to avoid blocking the event loop
|
59 |
+
try:
|
60 |
+
# Add timeout to prevent indefinite hanging
|
61 |
+
await asyncio.wait_for(
|
62 |
+
loop.run_in_executor(None, initialize_converter),
|
63 |
+
timeout=300 # 5 minute timeout for initialization
|
64 |
+
)
|
65 |
+
initialization_successful = True
|
66 |
+
logger.info("Marker converter initialization process finished successfully.")
|
67 |
+
except asyncio.TimeoutError:
|
68 |
+
logger.error("Marker converter initialization timed out after 5 minutes.")
|
69 |
+
initialization_successful = False
|
70 |
+
except Exception as e:
|
71 |
+
logger.error(f"Marker converter initialization failed: {e}")
|
72 |
+
logger.error(traceback.format_exc())
|
73 |
+
initialization_successful = False
|
74 |
+
|
75 |
yield
|
76 |
+
|
77 |
# Clean up resources if needed during shutdown
|
78 |
+
logger.info("Application shutdown.")
|
79 |
|
80 |
# Application metadata
|
81 |
app_description = """
|
|
|
121 |
"cuda_available": torch.cuda.is_available(),
|
122 |
"device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
|
123 |
"device_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A",
|
124 |
+
"current_device": torch.cuda.current_device() if torch.cuda.is_available() else -1,
|
125 |
+
"memory_allocated": f"{torch.cuda.memory_allocated()/1024**2:.2f} MB" if torch.cuda.is_available() else "N/A",
|
126 |
+
"memory_reserved": f"{torch.cuda.memory_reserved()/1024**2:.2f} MB" if torch.cuda.is_available() else "N/A",
|
127 |
}
|
128 |
|
129 |
return {
|
130 |
+
"status": "healthy" if initialization_successful else "degraded",
|
131 |
"timestamp": datetime.now().isoformat(),
|
132 |
"service": "pdf-to-markdown-converter",
|
133 |
"gpu": gpu_info,
|
134 |
+
"model_initialized": initialization_successful,
|
135 |
"output_directory_used": output_dir # Add info for debugging
|
136 |
}
|
137 |
|
|
|
146 |
Returns:
|
147 |
A JSON object containing the conversion result
|
148 |
"""
|
149 |
+
# Check if models initialized successfully
|
150 |
+
if not initialization_successful:
|
151 |
+
return JSONResponse(
|
152 |
+
status_code=503, # Service Unavailable
|
153 |
+
content={
|
154 |
+
"error": "Service not ready",
|
155 |
+
"detail": "The model initialization failed during startup. The service cannot process requests at this time."
|
156 |
+
}
|
157 |
+
)
|
158 |
+
|
159 |
if not file.filename or not file.filename.lower().endswith('.pdf'):
|
160 |
raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")
|
161 |
|
|
|
168 |
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False, dir="/tmp") as temp_pdf:
|
169 |
temp_pdf.write(content)
|
170 |
temp_pdf_path = temp_pdf.name
|
171 |
+
logger.info(f"Temporary PDF saved to: {temp_pdf_path}")
|
172 |
|
173 |
# Get the base name of the file for the output
|
174 |
filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
|
175 |
# Use the configured output_dir for saving the markdown file
|
176 |
output_md_file = os.path.join(output_dir, f"{filename_without_ext}.md")
|
177 |
+
logger.info(f"Output markdown path: {output_md_file}")
|
178 |
|
179 |
# Process the PDF using the pre-loaded converter
|
180 |
md_content = convert_pdf(temp_pdf_path, output_md_file)
|
|
|
194 |
except Exception as e:
|
195 |
error_detail = str(e)
|
196 |
error_trace = traceback.format_exc()
|
197 |
+
logger.error(f"Error processing PDF '{file.filename if file else 'N/A'}': {error_detail}")
|
198 |
+
logger.error(error_trace)
|
199 |
return JSONResponse(
|
200 |
status_code=500,
|
201 |
content={
|
|
|
210 |
if temp_pdf_path and os.path.exists(temp_pdf_path):
|
211 |
try:
|
212 |
os.unlink(temp_pdf_path)
|
213 |
+
logger.info(f"Temporary file {temp_pdf_path} deleted.")
|
214 |
except Exception as unlink_err:
|
215 |
+
logger.error(f"Error deleting temporary file {temp_pdf_path}: {unlink_err}")
|
216 |
|
217 |
# Remove the old __main__ block if it exists, as CMD in Dockerfile handles startup
|
218 |
# if __name__ == "__main__":
|
pdf_converter/convert_pdf_to_md.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1 |
import marker
|
2 |
import os
|
3 |
import sys
|
|
|
|
|
4 |
from marker.config.parser import ConfigParser
|
5 |
from marker.models import create_model_dict
|
6 |
|
@@ -13,6 +15,12 @@ def initialize_converter():
|
|
13 |
if _converter is None:
|
14 |
print("Initializing marker models...")
|
15 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
# Set custom font path from environment variable if available
|
17 |
font_path = os.environ.get('MARKER_FONT_PATH')
|
18 |
if font_path:
|
@@ -29,15 +37,15 @@ def initialize_converter():
|
|
29 |
print(f"Error setting custom font path: {e}", file=sys.stderr)
|
30 |
|
31 |
# Create configuration, explicitly setting output format and batch multiplier
|
32 |
-
# Increased batch_multiplier for potentially faster processing on L40S
|
33 |
config_parser = ConfigParser({
|
34 |
'output_format': 'markdown',
|
35 |
-
'batch_multiplier': 4
|
|
|
|
|
36 |
})
|
37 |
|
38 |
-
# Load models
|
39 |
-
|
40 |
-
models = create_model_dict() # Add device mapping here if applicable
|
41 |
|
42 |
# Get converter class and create converter
|
43 |
converter_cls = config_parser.get_converter_cls()
|
@@ -48,10 +56,21 @@ def initialize_converter():
|
|
48 |
renderer=config_parser.get_renderer(),
|
49 |
llm_service=config_parser.get_llm_service()
|
50 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
print("Marker models initialized successfully with batch_multiplier=4.")
|
52 |
except Exception as e:
|
53 |
print(f"Failed to initialize marker models: {e}", file=sys.stderr)
|
54 |
_converter = None # Ensure it's None if init fails
|
|
|
|
|
|
|
|
|
55 |
raise
|
56 |
else:
|
57 |
print("Marker models already initialized.")
|
@@ -78,6 +97,10 @@ def convert_pdf(pdf_input_path, output_md_path=None):
|
|
78 |
print(f"Starting conversion of '{pdf_input_path}' using pre-loaded models...")
|
79 |
|
80 |
try:
|
|
|
|
|
|
|
|
|
81 |
# Convert the PDF to markdown using the pre-loaded converter
|
82 |
result = _converter(pdf_input_path)
|
83 |
|
@@ -94,9 +117,16 @@ def convert_pdf(pdf_input_path, output_md_path=None):
|
|
94 |
f.write(markdown_text)
|
95 |
print(f"Successfully saved markdown to '{output_md_path}'")
|
96 |
|
|
|
|
|
|
|
|
|
97 |
return markdown_text
|
98 |
|
99 |
except Exception as e:
|
100 |
print(f"An error occurred during conversion: {e}", file=sys.stderr)
|
101 |
print(f"Error details: {str(type(e))}", file=sys.stderr)
|
|
|
|
|
|
|
102 |
raise
|
|
|
1 |
import marker
|
2 |
import os
|
3 |
import sys
|
4 |
+
import gc
|
5 |
+
import torch
|
6 |
from marker.config.parser import ConfigParser
|
7 |
from marker.models import create_model_dict
|
8 |
|
|
|
15 |
if _converter is None:
|
16 |
print("Initializing marker models...")
|
17 |
try:
|
18 |
+
# Clear any existing CUDA cache before loading models
|
19 |
+
if torch.cuda.is_available():
|
20 |
+
torch.cuda.empty_cache()
|
21 |
+
gc.collect()
|
22 |
+
print(f"CUDA memory before initialization: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated, {torch.cuda.memory_reserved()/1024**2:.2f} MB reserved")
|
23 |
+
|
24 |
# Set custom font path from environment variable if available
|
25 |
font_path = os.environ.get('MARKER_FONT_PATH')
|
26 |
if font_path:
|
|
|
37 |
print(f"Error setting custom font path: {e}", file=sys.stderr)
|
38 |
|
39 |
# Create configuration, explicitly setting output format and batch multiplier
|
|
|
40 |
config_parser = ConfigParser({
|
41 |
'output_format': 'markdown',
|
42 |
+
'batch_multiplier': 4, # Increased from default 2
|
43 |
+
# Add any device-specific configuration here
|
44 |
+
'device': 'cuda' if torch.cuda.is_available() else 'cpu'
|
45 |
})
|
46 |
|
47 |
+
# Load models with explicit device mapping
|
48 |
+
models = create_model_dict()
|
|
|
49 |
|
50 |
# Get converter class and create converter
|
51 |
converter_cls = config_parser.get_converter_cls()
|
|
|
56 |
renderer=config_parser.get_renderer(),
|
57 |
llm_service=config_parser.get_llm_service()
|
58 |
)
|
59 |
+
|
60 |
+
# Force another garbage collection after model load
|
61 |
+
if torch.cuda.is_available():
|
62 |
+
torch.cuda.empty_cache()
|
63 |
+
gc.collect()
|
64 |
+
print(f"CUDA memory after initialization: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated, {torch.cuda.memory_reserved()/1024**2:.2f} MB reserved")
|
65 |
+
|
66 |
print("Marker models initialized successfully with batch_multiplier=4.")
|
67 |
except Exception as e:
|
68 |
print(f"Failed to initialize marker models: {e}", file=sys.stderr)
|
69 |
_converter = None # Ensure it's None if init fails
|
70 |
+
# Attempt to clean up GPU memory in case of initialization failure
|
71 |
+
if torch.cuda.is_available():
|
72 |
+
torch.cuda.empty_cache()
|
73 |
+
gc.collect()
|
74 |
raise
|
75 |
else:
|
76 |
print("Marker models already initialized.")
|
|
|
97 |
print(f"Starting conversion of '{pdf_input_path}' using pre-loaded models...")
|
98 |
|
99 |
try:
|
100 |
+
# Free up any temporary memory before conversion
|
101 |
+
if torch.cuda.is_available():
|
102 |
+
torch.cuda.empty_cache()
|
103 |
+
|
104 |
# Convert the PDF to markdown using the pre-loaded converter
|
105 |
result = _converter(pdf_input_path)
|
106 |
|
|
|
117 |
f.write(markdown_text)
|
118 |
print(f"Successfully saved markdown to '{output_md_path}'")
|
119 |
|
120 |
+
# Clean up temporary GPU memory after conversion
|
121 |
+
if torch.cuda.is_available():
|
122 |
+
torch.cuda.empty_cache()
|
123 |
+
|
124 |
return markdown_text
|
125 |
|
126 |
except Exception as e:
|
127 |
print(f"An error occurred during conversion: {e}", file=sys.stderr)
|
128 |
print(f"Error details: {str(type(e))}", file=sys.stderr)
|
129 |
+
# Try to clean up GPU memory on error
|
130 |
+
if torch.cuda.is_available():
|
131 |
+
torch.cuda.empty_cache()
|
132 |
raise
|