File size: 5,577 Bytes
3d9ca9a
 
 
3c3eb16
 
a3cafa2
 
3d9ca9a
a49c5dc
 
 
 
 
 
 
 
 
3c3eb16
 
 
 
 
 
41ee299
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed4cfc9
 
 
3c3eb16
 
 
ed4cfc9
a49c5dc
3c3eb16
 
a49c5dc
 
 
 
 
 
 
 
 
 
3c3eb16
 
 
 
 
 
 
ed4cfc9
a49c5dc
 
 
3c3eb16
 
 
 
a49c5dc
 
 
 
3d9ca9a
 
a49c5dc
 
3d9ca9a
 
 
a49c5dc
3d9ca9a
 
 
 
 
 
 
a49c5dc
 
 
 
 
3d9ca9a
 
3c3eb16
 
 
 
a49c5dc
 
 
2751dee
 
a49c5dc
3d9ca9a
 
 
 
 
a49c5dc
3d9ca9a
 
 
 
3c3eb16
 
 
 
3d9ca9a
 
 
 
a3cafa2
3c3eb16
 
 
3d9ca9a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import marker
import os
import sys
import gc
import torch
from marker.config.parser import ConfigParser
from marker.models import create_model_dict

# Module-level cache for the marker converter. Populated exactly once by
# initialize_converter() at application startup and reused by convert_pdf();
# stays None (or is reset to None) if initialization fails.
_converter = None

def initialize_converter():
    """Load the marker converter once and cache it in the module-level
    ``_converter`` global.

    Safe to call repeatedly: if the converter is already loaded this is a
    no-op. On any failure the global is reset to ``None``, GPU memory is
    released on a best-effort basis, and the original exception is re-raised.
    """
    global _converter
    if _converter is not None:
        print("Marker models already initialized.")
        return

    print("Initializing marker models...")
    try:
        # Drop any stale CUDA allocations before the (large) model load.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
            print(f"CUDA memory before initialization: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated, {torch.cuda.memory_reserved()/1024**2:.2f} MB reserved")

        # Honor a deployment-provided font directory via MARKER_FONT_PATH.
        font_dir = os.environ.get('MARKER_FONT_PATH')
        if font_dir:
            try:
                # Point marker's settings at a font file inside that directory.
                from marker import settings
                os.makedirs(font_dir, exist_ok=True)
                font_file = os.path.join(font_dir, 'NotoSans-Regular.ttf')
                settings.FONT_PATH = font_file
                print(f"Using custom font path: {font_file}")
            except ImportError:
                print("Could not import marker settings, using default font path")
            except Exception as e:
                print(f"Error setting custom font path: {e}", file=sys.stderr)

        # Build the conversion configuration up front.
        parser = ConfigParser({
            'output_format': 'markdown',
            'batch_multiplier': 4,  # raised from the default of 2
            # device selection mirrors what the model load will use
            'device': 'cuda' if torch.cuda.is_available() else 'cpu'
        })

        # Load the model artifacts, then assemble the converter around them.
        artifacts = create_model_dict()
        converter_class = parser.get_converter_cls()
        _converter = converter_class(
            config=parser.generate_config_dict(),
            artifact_dict=artifacts,
            processor_list=parser.get_processors(),
            renderer=parser.get_renderer(),
            llm_service=parser.get_llm_service()
        )

        # Trim whatever temporary allocations the load left on the GPU.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
            print(f"CUDA memory after initialization: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated, {torch.cuda.memory_reserved()/1024**2:.2f} MB reserved")

        print("Marker models initialized successfully with batch_multiplier=4.")
    except Exception as e:
        print(f"Failed to initialize marker models: {e}", file=sys.stderr)
        _converter = None  # keep the global unset so a later retry is possible
        # Best-effort GPU cleanup before propagating the failure.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
        raise

def convert_pdf(pdf_input_path, output_md_path=None):
    """
    Convert a PDF file to Markdown using the pre-loaded marker converter.

    Args:
        pdf_input_path (str): Path to the input PDF file.
        output_md_path (str, optional): Path where to save the output
            Markdown file. If None, the markdown is only returned.

    Returns:
        str: The markdown text.

    Raises:
        FileNotFoundError: If pdf_input_path does not exist.
        RuntimeError: If initialize_converter() has not been called (or failed).
    """
    # Validate the input path before touching the converter or the GPU.
    if not os.path.exists(pdf_input_path):
        raise FileNotFoundError(f"Input PDF file not found at '{pdf_input_path}'")

    if _converter is None:
        raise RuntimeError("Marker converter has not been initialized. Call initialize_converter() during application startup.")

    print(f"Starting conversion of '{pdf_input_path}' using pre-loaded models...")

    try:
        # Free up any temporary memory before conversion
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Convert the PDF to markdown using the pre-loaded converter.
        result = _converter(pdf_input_path)

        # The markdown content is exposed directly on the result object.
        markdown_text = result.markdown

        # If an output path was provided, persist the markdown to disk.
        if output_md_path:
            output_dir = os.path.dirname(output_md_path)
            if output_dir:
                # exist_ok=True makes a separate existence check redundant.
                os.makedirs(output_dir, exist_ok=True)

            with open(output_md_path, "w", encoding="utf-8") as f:
                f.write(markdown_text)
            print(f"Successfully saved markdown to '{output_md_path}'")

        return markdown_text

    except Exception as e:
        print(f"An error occurred during conversion: {e}", file=sys.stderr)
        # type(e).__name__ yields e.g. 'ValueError' instead of the noisier
        # "<class 'ValueError'>" produced by str(type(e)).
        print(f"Error details: {type(e).__name__}", file=sys.stderr)
        raise
    finally:
        # Release temporary GPU memory on both success and failure paths
        # (previously duplicated in the success body and the except clause,
        # and skipped entirely if the file write raised).
        if torch.cuda.is_available():
            torch.cuda.empty_cache()