File size: 1,961 Bytes
e2d728a
1ba64f5
0884aec
 
 
 
 
8e024f6
1ba64f5
ba47b56
 
b562a6c
e2d728a
ba47b56
0884aec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba47b56
 
 
 
 
cfdee1a
8604d96
 
ba47b56
 
 
8604d96
266215f
 
 
e2d728a
4a9e7b8
e2d728a
a07d796
8604d96
 
e2d728a
a07d796
8604d96
a07d796
 
 
 
 
8604d96
4a9e7b8
a07d796
e2d728a
e665966
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import gradio as gr
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    PdfPipelineOptions,
    AcceleratorOptions
)
import spaces
from docling.datamodel.base_models import InputFormat
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

# --- Docling backend --------------------------------------------------------
# Run Docling on the CPU with eight threads.
accelerator_options = AcceleratorOptions(
    device=AcceleratorDevice.CPU,
    num_threads=8,
)

# PDF pipeline with OCR and table-structure recognition switched on.
pipeline_options = PdfPipelineOptions(
    accelerator_options=accelerator_options,
    do_ocr=True,
    do_table_structure=True,
)
# Nested option: lives on the table-structure sub-options object, so it is
# set after construction.
pipeline_options.table_structure_options.do_cell_matching = True

# Only PDF input gets the customised pipeline options.
_docling_pdf_option = PdfFormatOption(pipeline_options=pipeline_options)
docling_converter = DocumentConverter(
    format_options={InputFormat.PDF: _docling_pdf_option},
)

# --- Marker backend ---------------------------------------------------------
# The model dictionary is built once at import time and handed to the
# converter as its artifact dictionary.
_marker_artifacts = create_model_dict()
marker_converter = PdfConverter(artifact_dict=_marker_artifacts)

def convert_document(file, method):
    """Convert an uploaded document to text with the selected backend.

    Args:
        file: The value delivered by the ``gr.File`` input. Either an object
            exposing the on-disk path as ``.name`` or a plain path string is
            accepted; ``None`` means nothing was uploaded.
        method: Backend name, ``"Docling"`` or ``"Marker"``.

    Returns:
        For ``"Docling"``, the document exported as Markdown. For
        ``"Marker"``, the text produced by ``text_from_rendered``. The string
        ``'unknown method'`` for any other backend name, and
        ``'no file uploaded'`` when a valid backend is chosen but ``file`` is
        ``None``.
    """
    # Validate the backend first so an unrecognised method keeps reporting
    # 'unknown method' regardless of the file argument.
    if method not in ("Docling", "Marker"):
        return 'unknown method'

    # Without this guard, clicking "Convert" before uploading would fail with
    # an AttributeError on ``None.name``.
    if file is None:
        return 'no file uploaded'

    # Fall back to the value itself when it has no ``.name`` attribute
    # (e.g. a plain path string).
    path = getattr(file, "name", file)

    if method == "Docling":
        result = docling_converter.convert(path)
        return result.document.export_to_markdown()

    rendered = marker_converter(path)
    # text_from_rendered yields a 3-tuple; only the text is needed here.
    text, _, _ = text_from_rendered(rendered)
    return text

# Gradio front end: one file upload, a backend selector, and a text box that
# receives whatever convert_document returns.
with gr.Blocks() as app:
    # Page heading and short usage hint.
    gr.Markdown("# Document Converter")
    gr.Markdown("Upload a document, choose the backend, and get the converted text with metadata.")

    # Inputs and output, in the order they are created in the layout.
    file_input = gr.File(label="Upload Document")
    method_input = gr.Radio(
        ["Docling", "Marker"],
        label="Choose Conversion Backend",
    )
    output_text = gr.Textbox(label="Converted Document")

    # Clicking the button runs the conversion with the current inputs.
    convert_button = gr.Button("Convert")
    convert_button.click(
        fn=convert_document,
        inputs=[file_input, method_input],
        outputs=[output_text],
    )

# Start the app with debug mode and error display enabled.
app.launch(show_error=True, debug=True)