Spaces:
Sleeping
Sleeping
File size: 5,577 Bytes
3d9ca9a 3c3eb16 a3cafa2 3d9ca9a a49c5dc 3c3eb16 41ee299 ed4cfc9 3c3eb16 ed4cfc9 a49c5dc 3c3eb16 a49c5dc 3c3eb16 ed4cfc9 a49c5dc 3c3eb16 a49c5dc 3d9ca9a a49c5dc 3d9ca9a a49c5dc 3d9ca9a a49c5dc 3d9ca9a 3c3eb16 a49c5dc 2751dee a49c5dc 3d9ca9a a49c5dc 3d9ca9a 3c3eb16 3d9ca9a a3cafa2 3c3eb16 3d9ca9a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
# Third-party: marker (PDF -> Markdown conversion) and torch (GPU memory management).
import marker
import os
import sys
import gc
import torch
from marker.config.parser import ConfigParser
from marker.models import create_model_dict
# Global variable to hold the pre-loaded converter
# (populated once by initialize_converter(); read by convert_pdf()).
_converter = None
def initialize_converter():
    """Initializes the marker converter models and stores it globally."""
    global _converter
    # Guard clause: a second call is a no-op.
    if _converter is not None:
        print("Marker models already initialized.")
        return
    print("Initializing marker models...")
    try:
        # Clear any existing CUDA cache before loading models.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
            print(f"CUDA memory before initialization: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated, {torch.cuda.memory_reserved()/1024**2:.2f} MB reserved")
        # Optional custom font path taken from the environment.
        font_dir = os.environ.get('MARKER_FONT_PATH')
        if font_dir:
            try:
                # Import marker settings and point them at the custom font.
                from marker import settings
                os.makedirs(font_dir, exist_ok=True)
                custom_font_path = os.path.join(font_dir, 'NotoSans-Regular.ttf')
                settings.FONT_PATH = custom_font_path
                print(f"Using custom font path: {custom_font_path}")
            except ImportError:
                print("Could not import marker settings, using default font path")
            except Exception as e:
                print(f"Error setting custom font path: {e}", file=sys.stderr)
        # Build the configuration: markdown output, larger batch multiplier,
        # and an explicit device selection.
        cfg_parser = ConfigParser({
            'output_format': 'markdown',
            'batch_multiplier': 4,  # Increased from default 2
            'device': 'cuda' if torch.cuda.is_available() else 'cpu'
        })
        # Load the model artifacts, then assemble the converter from the
        # parser-provided pieces (processors, renderer, optional LLM service).
        artifacts = create_model_dict()
        ConverterClass = cfg_parser.get_converter_cls()
        _converter = ConverterClass(
            config=cfg_parser.generate_config_dict(),
            artifact_dict=artifacts,
            processor_list=cfg_parser.get_processors(),
            renderer=cfg_parser.get_renderer(),
            llm_service=cfg_parser.get_llm_service()
        )
        # Force another garbage collection pass after the model load.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
            print(f"CUDA memory after initialization: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated, {torch.cuda.memory_reserved()/1024**2:.2f} MB reserved")
        print("Marker models initialized successfully with batch_multiplier=4.")
    except Exception as e:
        print(f"Failed to initialize marker models: {e}", file=sys.stderr)
        _converter = None  # Ensure it's None if init fails
        # Best-effort GPU cleanup when initialization blows up.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
        raise
def convert_pdf(pdf_input_path, output_md_path=None):
    """
    Convert a PDF file to Markdown using the pre-loaded marker converter.

    Args:
        pdf_input_path (str): Path to the input PDF file.
        output_md_path (str, optional): Path where to save the output
            Markdown file. If None, markdown is only returned.

    Returns:
        str: The markdown text.

    Raises:
        FileNotFoundError: If pdf_input_path does not exist.
        RuntimeError: If initialize_converter() was never called (or failed).
    """
    # Check if the input PDF exists.
    if not os.path.exists(pdf_input_path):
        raise FileNotFoundError(f"Input PDF file not found at '{pdf_input_path}'")
    # Check if the converter is initialized.
    if _converter is None:
        raise RuntimeError("Marker converter has not been initialized. Call initialize_converter() during application startup.")
    print(f"Starting conversion of '{pdf_input_path}' using pre-loaded models...")
    try:
        # Free up any temporary GPU memory before conversion.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        # Convert the PDF to markdown using the pre-loaded converter.
        result = _converter(pdf_input_path)
        # NOTE(review): assumes the renderer output exposes a `.markdown`
        # attribute (markdown output format was configured at init) —
        # confirm against the installed marker version.
        markdown_text = result.markdown
        # If an output path was given, persist the markdown.
        if output_md_path:
            output_dir = os.path.dirname(output_md_path)
            if output_dir:
                # exist_ok=True already handles a pre-existing directory;
                # the previous exists()-then-makedirs check was racy (TOCTOU).
                os.makedirs(output_dir, exist_ok=True)
            with open(output_md_path, "w", encoding="utf-8") as f:
                f.write(markdown_text)
            print(f"Successfully saved markdown to '{output_md_path}'")
        # Clean up temporary GPU memory after conversion.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return markdown_text
    except Exception as e:
        print(f"An error occurred during conversion: {e}", file=sys.stderr)
        print(f"Error details: {str(type(e))}", file=sys.stderr)
        # Try to clean up GPU memory on error before re-raising.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        raise