# Spaces: Sleeping
import gradio as gr | |
from docling.document_converter import DocumentConverter, PdfFormatOption | |
from docling.datamodel.pipeline_options import ( | |
AcceleratorDevice, | |
PdfPipelineOptions, | |
AcceleratorOptions | |
) | |
import spaces | |
from docling.datamodel.base_models import InputFormat | |
from marker.converters.pdf import PdfConverter | |
from marker.models import create_model_dict | |
from marker.output import text_from_rendered | |
# Docling backend: a PDF pipeline pinned to the CPU with 8 threads, with OCR,
# table-structure extraction and table cell matching switched on.
accelerator_options = AcceleratorOptions(
    device=AcceleratorDevice.CPU,
    num_threads=8,
)

pipeline_options = PdfPipelineOptions()
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.do_ocr = True
pipeline_options.accelerator_options = accelerator_options

# Only the PDF input format gets custom options; the converter is built once
# at import time and reused by every conversion request.
_pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
docling_converter = DocumentConverter(
    format_options={InputFormat.PDF: _pdf_format_option},
)
# Marker backend: build the artifact dictionary once at import time and hand
# it to a single converter instance that every request reuses.
_marker_artifacts = create_model_dict()
marker_converter = PdfConverter(artifact_dict=_marker_artifacts)
def convert_document(file, method):
    """Convert an uploaded document to text with the selected backend.

    Args:
        file: The uploaded file as delivered by the ``gr.File`` input. It may
            be ``None`` when nothing was uploaded, an object exposing the
            on-disk path as ``.name``, or a plain path string.
        method: Backend name, either ``"Docling"`` or ``"Marker"``.

    Returns:
        The converted text: Markdown exported by Docling, or the text part of
        Marker's rendered output. Returns ``'unknown method'`` for any other
        backend name and ``'no file uploaded'`` when ``file`` is ``None``.
    """
    # Validate the backend first so an unknown method is reported without
    # touching ``file`` at all, exactly as before.
    if method not in ("Docling", "Marker"):
        return 'unknown method'

    # Clicking "Convert" with an empty upload box passes None; report that
    # instead of failing with AttributeError on ``file.name``.
    if file is None:
        return 'no file uploaded'

    # Accept both a file wrapper carrying ``.name`` and a bare path string.
    path = getattr(file, "name", file)

    if method == "Docling":
        result = docling_converter.convert(path)
        return result.document.export_to_markdown()

    # method == "Marker": only the text part of the rendered output is
    # returned; the other two values are not used.
    rendered = marker_converter(path)
    text, _, _ = text_from_rendered(rendered)
    return text
# Gradio front end: a file upload and a backend selector feed
# convert_document, whose return value is shown in a text box.
with gr.Blocks() as app:
    gr.Markdown(value="# Document Converter")
    gr.Markdown(
        value="Upload a document, choose the backend, and get the converted text with metadata."
    )

    file_input = gr.File(label="Upload Document")
    method_input = gr.Radio(
        choices=["Docling", "Marker"],
        label="Choose Conversion Backend",
    )
    output_text = gr.Textbox(label="Converted Document")

    convert_button = gr.Button(value="Convert")
    convert_button.click(
        fn=convert_document,
        inputs=[file_input, method_input],
        outputs=[output_text],
    )

# Start the server with debug mode on and errors shown in the UI.
app.launch(show_error=True, debug=True)