Spaces:

marcosremar2
/

docker_mineru

Sleeping

App Files Files Community

marcosremar2 committed on May 3

Commit

3c3eb16

1 Parent(s): ed4cfc9

Update PDF to Markdown converter API with NVIDIA L4 support

Browse files

Files changed (3) hide show

Dockerfile +5 -2
app/main.py +54 -13
pdf_converter/convert_pdf_to_md.py +35 -5

Dockerfile CHANGED Viewed

@@ -71,10 +71,13 @@ ENV HF_HOME=/home/user/.cache/huggingface
 ENV TORCH_HOME=/home/user/.cache/torch
 # Add environment variable for marker font path (alternative fix)
 ENV MARKER_FONT_PATH=/home/user/.cache/marker_fonts
 # Expose the port
 EXPOSE 7860
 # Command to run the application with Gunicorn and Uvicorn workers
-# Increased workers to 16 for L40S. Adjust based on monitoring.
-CMD ["gunicorn", "-w", "16", "-k", "uvicorn.workers.UvicornWorker", "app.main:app", "--bind", "0.0.0.0:7860"]

 ENV TORCH_HOME=/home/user/.cache/torch
 # Add environment variable for marker font path (alternative fix)
 ENV MARKER_FONT_PATH=/home/user/.cache/marker_fonts
+# Add PyTorch memory optimization environment variables
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:128
+ENV CUDA_VISIBLE_DEVICES=0
 # Expose the port
 EXPOSE 7860
 # Command to run the application with Gunicorn and Uvicorn workers
+# Reduced workers to 4 (from 16) to avoid OOM errors
+CMD ["gunicorn", "-w", "4", "-k", "uvicorn.workers.UvicornWorker", "app.main:app", "--bind", "0.0.0.0:7860"]

app/main.py CHANGED Viewed

@@ -11,8 +11,16 @@ from typing import Dict, Any
 import shutil
 import torch
 import asyncio
 from contextlib import asynccontextmanager
 # Add the parent directory to sys.path to import convert_pdf_to_md
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 # Import the initialization function as well
@@ -33,21 +41,41 @@ images_dir = os.path.join(output_dir, "images")
 # Create output directory if it doesn't exist
 os.makedirs(output_dir, exist_ok=True)
 os.makedirs(images_dir, exist_ok=True)
-print(f"Using output directory: {output_dir}") # Add log for debugging
 # --- End Configuration ---
 # --- Lifespan management for model loading ---
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     # Load the ML model during startup
-    print("Application startup: Initializing marker converter...")
     loop = asyncio.get_event_loop()
     # Run in executor to avoid blocking the event loop
-    await loop.run_in_executor(None, initialize_converter)
-    print("Marker converter initialization process finished.")
     yield
     # Clean up resources if needed during shutdown
-    print("Application shutdown.")
 # Application metadata
 app_description = """
@@ -93,14 +121,17 @@ async def health_check() -> Dict[str, Any]:
         "cuda_available": torch.cuda.is_available(),
         "device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
         "device_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A",
-        "current_device": torch.cuda.current_device() if torch.cuda.is_available() else -1
     }
     return {
-        "status": "healthy",
         "timestamp": datetime.now().isoformat(),
         "service": "pdf-to-markdown-converter",
         "gpu": gpu_info,
         "output_directory_used": output_dir # Add info for debugging
     }
@@ -115,6 +146,16 @@ async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
     Returns:
         A JSON object containing the conversion result
     """
     if not file.filename or not file.filename.lower().endswith('.pdf'):
         raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")
@@ -127,13 +168,13 @@ async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
         with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False, dir="/tmp") as temp_pdf:
             temp_pdf.write(content)
             temp_pdf_path = temp_pdf.name
-        print(f"Temporary PDF saved to: {temp_pdf_path}")
         # Get the base name of the file for the output
         filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
         # Use the configured output_dir for saving the markdown file
         output_md_file = os.path.join(output_dir, f"{filename_without_ext}.md")
-        print(f"Output markdown path: {output_md_file}")
         # Process the PDF using the pre-loaded converter
         md_content = convert_pdf(temp_pdf_path, output_md_file)
@@ -153,8 +194,8 @@ async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
     except Exception as e:
         error_detail = str(e)
         error_trace = traceback.format_exc()
-        print(f"Error processing PDF '{file.filename if file else 'N/A'}': {error_detail}")
-        print(error_trace)
         return JSONResponse(
             status_code=500,
             content={
@@ -169,9 +210,9 @@ async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
         if temp_pdf_path and os.path.exists(temp_pdf_path):
             try:
                 os.unlink(temp_pdf_path)
-                print(f"Temporary file {temp_pdf_path} deleted.")
             except Exception as unlink_err:
-                print(f"Error deleting temporary file {temp_pdf_path}: {unlink_err}")
 # Remove the old __main__ block if it exists, as CMD in Dockerfile handles startup
 # if __name__ == "__main__":

 import shutil
 import torch
 import asyncio
+import logging
 from contextlib import asynccontextmanager
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger("pdf_converter_api")
 # Add the parent directory to sys.path to import convert_pdf_to_md
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 # Import the initialization function as well
 # Create output directory if it doesn't exist
 os.makedirs(output_dir, exist_ok=True)
 os.makedirs(images_dir, exist_ok=True)
+logger.info(f"Using output directory: {output_dir}")
 # --- End Configuration ---
+# Track initialization status
+initialization_successful = False
 # --- Lifespan management for model loading ---
 @asynccontextmanager
 async def lifespan(app: FastAPI):
+    global initialization_successful
     # Load the ML model during startup
+    logger.info("Application startup: Initializing marker converter...")
     loop = asyncio.get_event_loop()
     # Run in executor to avoid blocking the event loop
+    try:
+        # Add timeout so startup stops waiting after 5 minutes (the executor thread itself is not interrupted)
+        await asyncio.wait_for(
+            loop.run_in_executor(None, initialize_converter),
+            timeout=300  # 5 minute timeout for initialization
+        )
+        initialization_successful = True
+        logger.info("Marker converter initialization process finished successfully.")
+    except asyncio.TimeoutError:
+        logger.error("Marker converter initialization timed out after 5 minutes.")
+        initialization_successful = False
+    except Exception as e:
+        logger.error(f"Marker converter initialization failed: {e}")
+        logger.error(traceback.format_exc())
+        initialization_successful = False
     yield
     # Clean up resources if needed during shutdown
+    logger.info("Application shutdown.")
 # Application metadata
 app_description = """
         "cuda_available": torch.cuda.is_available(),
         "device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
         "device_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A",
+        "current_device": torch.cuda.current_device() if torch.cuda.is_available() else -1,
+        "memory_allocated": f"{torch.cuda.memory_allocated()/1024**2:.2f} MB" if torch.cuda.is_available() else "N/A",
+        "memory_reserved": f"{torch.cuda.memory_reserved()/1024**2:.2f} MB" if torch.cuda.is_available() else "N/A",
     }
     return {
+        "status": "healthy" if initialization_successful else "degraded",
         "timestamp": datetime.now().isoformat(),
         "service": "pdf-to-markdown-converter",
         "gpu": gpu_info,
+        "model_initialized": initialization_successful,
         "output_directory_used": output_dir # Add info for debugging
     }
     Returns:
         A JSON object containing the conversion result
     """
+    # Check if models initialized successfully
+    if not initialization_successful:
+        return JSONResponse(
+            status_code=503,  # Service Unavailable
+            content={
+                "error": "Service not ready",
+                "detail": "The model initialization failed during startup. The service cannot process requests at this time."
+            }
+        )
     if not file.filename or not file.filename.lower().endswith('.pdf'):
         raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")
         with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False, dir="/tmp") as temp_pdf:
             temp_pdf.write(content)
             temp_pdf_path = temp_pdf.name
+        logger.info(f"Temporary PDF saved to: {temp_pdf_path}")
         # Get the base name of the file for the output
         filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
         # Use the configured output_dir for saving the markdown file
         output_md_file = os.path.join(output_dir, f"{filename_without_ext}.md")
+        logger.info(f"Output markdown path: {output_md_file}")
         # Process the PDF using the pre-loaded converter
         md_content = convert_pdf(temp_pdf_path, output_md_file)
     except Exception as e:
         error_detail = str(e)
         error_trace = traceback.format_exc()
+        logger.error(f"Error processing PDF '{file.filename if file else 'N/A'}': {error_detail}")
+        logger.error(error_trace)
         return JSONResponse(
             status_code=500,
             content={
         if temp_pdf_path and os.path.exists(temp_pdf_path):
             try:
                 os.unlink(temp_pdf_path)
+                logger.info(f"Temporary file {temp_pdf_path} deleted.")
             except Exception as unlink_err:
+                logger.error(f"Error deleting temporary file {temp_pdf_path}: {unlink_err}")
 # Remove the old __main__ block if it exists, as CMD in Dockerfile handles startup
 # if __name__ == "__main__":

pdf_converter/convert_pdf_to_md.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import marker
 import os
 import sys
 from marker.config.parser import ConfigParser
 from marker.models import create_model_dict
@@ -13,6 +15,12 @@ def initialize_converter():
     if _converter is None:
         print("Initializing marker models...")
         try:
             # Set custom font path from environment variable if available
             font_path = os.environ.get('MARKER_FONT_PATH')
             if font_path:
@@ -29,15 +37,15 @@ def initialize_converter():
                     print(f"Error setting custom font path: {e}", file=sys.stderr)
             # Create configuration, explicitly setting output format and batch multiplier
-            # Increased batch_multiplier for potentially faster processing on L40S
             config_parser = ConfigParser({
                 'output_format': 'markdown',
-                'batch_multiplier': 4 # Increased from default 2
             })
-            # Load models
-            # Potential optimization: Check if device mapping/multi-GPU is possible
-            models = create_model_dict() # Add device mapping here if applicable
             # Get converter class and create converter
             converter_cls = config_parser.get_converter_cls()
@@ -48,10 +56,21 @@ def initialize_converter():
                 renderer=config_parser.get_renderer(),
                 llm_service=config_parser.get_llm_service()
             )
             print("Marker models initialized successfully with batch_multiplier=4.")
         except Exception as e:
             print(f"Failed to initialize marker models: {e}", file=sys.stderr)
             _converter = None # Ensure it's None if init fails
             raise
     else:
         print("Marker models already initialized.")
@@ -78,6 +97,10 @@ def convert_pdf(pdf_input_path, output_md_path=None):
     print(f"Starting conversion of '{pdf_input_path}' using pre-loaded models...")
     try:
         # Convert the PDF to markdown using the pre-loaded converter
         result = _converter(pdf_input_path)
@@ -94,9 +117,16 @@ def convert_pdf(pdf_input_path, output_md_path=None):
                 f.write(markdown_text)
             print(f"Successfully saved markdown to '{output_md_path}'")
         return markdown_text
     except Exception as e:
         print(f"An error occurred during conversion: {e}", file=sys.stderr)
         print(f"Error details: {str(type(e))}", file=sys.stderr)
         raise

 import marker
 import os
 import sys
+import gc
+import torch
 from marker.config.parser import ConfigParser
 from marker.models import create_model_dict
     if _converter is None:
         print("Initializing marker models...")
         try:
+            # Clear any existing CUDA cache before loading models
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                gc.collect()
+                print(f"CUDA memory before initialization: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated, {torch.cuda.memory_reserved()/1024**2:.2f} MB reserved")
             # Set custom font path from environment variable if available
             font_path = os.environ.get('MARKER_FONT_PATH')
             if font_path:
                     print(f"Error setting custom font path: {e}", file=sys.stderr)
             # Create configuration, explicitly setting output format and batch multiplier
             config_parser = ConfigParser({
                 'output_format': 'markdown',
+                'batch_multiplier': 4,  # Increased from default 2
+                # Add any device-specific configuration here
+                'device': 'cuda' if torch.cuda.is_available() else 'cpu'
             })
+            # Load models (create_model_dict() is called without arguments; no explicit device is passed here)
+            models = create_model_dict()
             # Get converter class and create converter
             converter_cls = config_parser.get_converter_cls()
                 renderer=config_parser.get_renderer(),
                 llm_service=config_parser.get_llm_service()
             )
+            # Force another garbage collection after model load
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                gc.collect()
+                print(f"CUDA memory after initialization: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated, {torch.cuda.memory_reserved()/1024**2:.2f} MB reserved")
             print("Marker models initialized successfully with batch_multiplier=4.")
         except Exception as e:
             print(f"Failed to initialize marker models: {e}", file=sys.stderr)
             _converter = None # Ensure it's None if init fails
+            # Attempt to clean up GPU memory in case of initialization failure
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                gc.collect()
             raise
     else:
         print("Marker models already initialized.")
     print(f"Starting conversion of '{pdf_input_path}' using pre-loaded models...")
     try:
+        # Free up any temporary memory before conversion
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
         # Convert the PDF to markdown using the pre-loaded converter
         result = _converter(pdf_input_path)
                 f.write(markdown_text)
             print(f"Successfully saved markdown to '{output_md_path}'")
+        # Clean up temporary GPU memory after conversion
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
         return markdown_text
     except Exception as e:
         print(f"An error occurred during conversion: {e}", file=sys.stderr)
         print(f"Error details: {str(type(e))}", file=sys.stderr)
+        # Try to clean up GPU memory on error
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
         raise