marcosremar2 committed on
Commit
41ee299
·
1 Parent(s): a49c5dc

Update PDF to Markdown converter API with NVIDIA L4 support

Browse files
Files changed (2) hide show
  1. Dockerfile +6 -0
  2. pdf_converter/convert_pdf_to_md.py +15 -0
Dockerfile CHANGED
@@ -59,12 +59,18 @@ COPY --chown=user:user . .
59
  RUN mkdir -p /app/docker_mineru/output/images && \
60
  chown -R user:user /app/docker_mineru/output
61
 
 
 
 
 
62
  # Set the user
63
  USER user
64
 
65
  # Environment variables for caching (optional, might help with model downloads)
66
  ENV HF_HOME=/home/user/.cache/huggingface
67
  ENV TORCH_HOME=/home/user/.cache/torch
 
 
68
 
69
  # Expose the port
70
  EXPOSE 7860
 
59
  RUN mkdir -p /app/docker_mineru/output/images && \
60
  chown -R user:user /app/docker_mineru/output
61
 
62
+ # Create marker static directory and set proper permissions (fix for font download error)
63
+ RUN mkdir -p /usr/local/lib/python3.10/dist-packages/static && \
64
+ chmod -R 777 /usr/local/lib/python3.10/dist-packages/static
65
+
66
  # Set the user
67
  USER user
68
 
69
  # Environment variables for caching (optional, might help with model downloads)
70
  ENV HF_HOME=/home/user/.cache/huggingface
71
  ENV TORCH_HOME=/home/user/.cache/torch
72
+ # Add environment variable for marker font path (alternative fix)
73
+ ENV MARKER_FONT_PATH=/home/user/.cache/marker_fonts
74
 
75
  # Expose the port
76
  EXPOSE 7860
pdf_converter/convert_pdf_to_md.py CHANGED
@@ -13,6 +13,21 @@ def initialize_converter():
13
  if _converter is None:
14
  print("Initializing marker models...")
15
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  # Create configuration, explicitly setting output format
17
  # Potential optimization: Check if batch_multiplier or similar exists
18
  config_parser = ConfigParser({'output_format': 'markdown'}) # Add batch_multiplier here if applicable
 
13
  if _converter is None:
14
  print("Initializing marker models...")
15
  try:
16
+ # Set custom font path from environment variable if available
17
+ font_path = os.environ.get('MARKER_FONT_PATH')
18
+ if font_path:
19
+ try:
20
+ # Import marker settings and override font path
21
+ from marker import settings
22
+ os.makedirs(font_path, exist_ok=True)
23
+ custom_font_path = os.path.join(font_path, 'NotoSans-Regular.ttf')
24
+ settings.FONT_PATH = custom_font_path
25
+ print(f"Using custom font path: {custom_font_path}")
26
+ except ImportError:
27
+ print("Could not import marker settings, using default font path")
28
+ except Exception as e:
29
+ print(f"Error setting custom font path: {e}", file=sys.stderr)
30
+
31
  # Create configuration, explicitly setting output format
32
  # Potential optimization: Check if batch_multiplier or similar exists
33
  config_parser = ConfigParser({'output_format': 'markdown'}) # Add batch_multiplier here if applicable