|
import gradio as gr |
|
from docling.document_converter import DocumentConverter |
|
from llama_index.readers.docling import DoclingReader |
|
import json |
|
import tempfile |
|
import os |
|
|
|
def convert_document_docling(file, output_format):
    """Convert an uploaded document through the LlamaIndex ``DoclingReader``.

    NOTE: this module defines ``convert_document_docling`` a second time
    further down (with an extra ``progress`` parameter); that later definition
    replaces this one once the module has been executed.

    Args:
        file: The upload to convert. Either an object whose ``name`` attribute
            holds the path on disk, or the path string itself. ``None`` is
            reported as a failed conversion.
        output_format: ``"JSON"`` selects JSON export; any other value selects
            Markdown.

    Returns:
        A 5-tuple ``(converted_text, metadata, output_path, download_update,
        status_message)``. On failure ``converted_text`` is ``""``,
        ``metadata`` holds ``"Error"``/``"Status"`` keys, ``output_path`` is
        ``None`` and ``download_update`` hides the download widget.
    """
    # Status glyphs were mis-decoded emoji that split the string literal over
    # two lines; they are restored here as proper Unicode.
    success_status = "✅ Document converted successfully!"
    failure_status = "❌ Error during conversion"

    if file is None:
        # Report the missing upload directly instead of letting ``file.name``
        # raise an opaque AttributeError on None.
        return (
            "",
            {"Error": "No file was uploaded.", "Status": "Failed"},
            None,
            gr.update(visible=False),
            failure_status,
        )

    # Accept both file-like uploads (``.name``) and plain path strings.
    source_path = getattr(file, "name", file)
    wants_json = output_format == "JSON"

    try:
        export_type = (
            DoclingReader.ExportType.JSON
            if wants_json
            else DoclingReader.ExportType.MARKDOWN
        )
        reader = DoclingReader(export_type=export_type)
        docs = reader.load_data(file_path=source_path)
        converted_text = docs[0].text

        # Write the result next to other temp files, named after the upload.
        stem = os.path.splitext(os.path.basename(source_path))[0]
        output_path = os.path.join(
            tempfile.gettempdir(),
            stem + (".json" if wants_json else ".md"),
        )
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(converted_text)

        metadata = {
            "Filename": source_path,
            "File Size": f"{os.path.getsize(source_path) / 1024:.2f} KB",
            "Output Format": output_format,
            "Conversion Status": "Success",
            "Method": "llama-index-readers-docling",
        }
        return (
            converted_text,
            metadata,
            output_path,
            gr.update(visible=True),
            success_status,
        )
    except Exception as e:
        # UI boundary: any failure is shown to the user instead of crashing
        # the request handler.
        error_metadata = {"Error": str(e), "Status": "Failed"}
        return "", error_metadata, None, gr.update(visible=False), failure_status
|
|
|
def convert_document_original(file, output_format):
    """Convert an uploaded document with Docling's ``DocumentConverter``.

    NOTE: this module defines ``convert_document_original`` a second time
    further down (with an extra ``progress`` parameter); that later definition
    replaces this one once the module has been executed.

    Args:
        file: The upload to convert. Either an object whose ``name`` attribute
            holds the path on disk, or the path string itself. ``None`` is
            reported as a failed conversion.
        output_format: ``"Markdown"`` selects Markdown export; any other value
            selects JSON.

    Returns:
        A 5-tuple ``(converted_text, metadata, output_path, download_update,
        status_message)``. On failure ``converted_text`` is ``""``,
        ``metadata`` holds ``"Error"``/``"Status"`` keys, ``output_path`` is
        ``None`` and ``download_update`` hides the download widget.
    """
    # Status glyphs were mis-decoded emoji that split the string literal over
    # two lines; they are restored here as proper Unicode.
    success_status = "✅ Document converted successfully!"
    failure_status = "❌ Error during conversion"

    if file is None:
        # Report the missing upload directly instead of letting ``file.name``
        # raise an opaque AttributeError on None.
        return (
            "",
            {"Error": "No file was uploaded.", "Status": "Failed"},
            None,
            gr.update(visible=False),
            failure_status,
        )

    # Accept both file-like uploads (``.name``) and plain path strings.
    source_path = getattr(file, "name", file)

    try:
        result = DocumentConverter().convert(source_path)

        if output_format == "Markdown":
            converted_text = result.document.export_to_markdown()
            file_extension = ".md"
        else:
            # ``f.write`` below needs text, so serialise the exported dict
            # explicitly.
            # NOTE(review): the previous ``export_to_json()`` call does not
            # appear in the documented DoclingDocument API - confirm against
            # the installed docling version.
            converted_text = json.dumps(
                result.document.export_to_dict(),
                ensure_ascii=False,
                indent=2,
            )
            file_extension = ".json"

        stem = os.path.splitext(os.path.basename(source_path))[0]
        output_path = os.path.join(tempfile.gettempdir(), stem + file_extension)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(converted_text)

        metadata = {
            "Filename": source_path,
            "File Size": f"{os.path.getsize(source_path) / 1024:.2f} KB",
            "Output Format": output_format,
            "Conversion Status": "Success",
            "Method": "docling",
        }
        return (
            converted_text,
            metadata,
            output_path,
            gr.update(visible=True),
            success_status,
        )
    except Exception as e:
        # UI boundary: any failure is shown to the user instead of crashing
        # the request handler.
        error_metadata = {"Error": str(e), "Status": "Failed"}
        return "", error_metadata, None, gr.update(visible=False), failure_status
|
|
|
|
|
# Dark-theme stylesheet passed to ``gr.Blocks(css=custom_css)`` further down.
# - ``:root`` declares the colour and radius variables reused by the rules.
# - ``.gr-button``, ``.gr-input``, ``.gr-textbox`` and ``.gr-padded`` are the
#   class names attached to components through ``elem_classes`` in this file.
# - ``.gr-header`` and ``.gr-subtitle`` style the HTML banner built with
#   ``gr.HTML``.
# - ``.container`` and ``.gr-form`` are not assigned to any element in this
#   file; presumably they target markup generated by Gradio - TODO confirm.
custom_css = """
:root {
    --primary-color: #2563eb;
    --secondary-color: #1e40af;
    --background-color: #1e1e1e;
    --card-background: #262626;
    --text-color: #ffffff;
    --border-radius: 10px;
}

body {
    background-color: var(--background-color);
    color: var(--text-color);
}

.container {
    max-width: 1200px;
    margin: 0 auto;
    padding: 2rem;
}

.gr-button {
    background: var(--primary-color) !important;
    border: none !important;
    color: white !important;
    padding: 10px 20px !important;
    border-radius: var(--border-radius) !important;
    transition: all 0.3s ease !important;
}

.gr-button:hover {
    background: var(--secondary-color) !important;
    transform: translateY(-2px);
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}

.gr-form {
    background-color: var(--card-background);
    padding: 2rem;
    border-radius: var(--border-radius);
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}

.gr-input, .gr-textbox {
    background-color: #333333 !important;
    border: 1px solid #404040 !important;
    color: var(--text-color) !important;
    border-radius: var(--border-radius) !important;
}

.gr-padded {
    padding: 1rem;
}

.gr-header {
    margin-bottom: 2rem;
    text-align: center;
}

.gr-subtitle {
    color: #9ca3af;
    font-size: 1.1rem;
    margin-bottom: 1.5rem;
}
"""
|
|
|
# Build the UI: an input column (upload, format, method, convert button,
# status), an output column (text, metadata, download) and a progress slider,
# then wire the convert button to the selected conversion function.
with gr.Blocks(css=custom_css) as app:
    # NOTE(review): the heading and button emoji were mis-decoded in the
    # source; the glyphs used below are a best-guess restoration - confirm.
    gr.HTML(
        """
        <div class="gr-header">
            <h1 style='font-size: 2.5rem; color: #2563eb; margin-bottom: 1rem;'>📄 Docling Document Converter</h1>
            <p class="gr-subtitle">Transform your documents into Markdown or JSON format with ease</p>
        </div>
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### Input Settings")
                file_input = gr.File(
                    label="Upload Your Document",
                    file_types=[".doc", ".docx", ".pdf", ".txt"],
                    elem_classes="gr-input",
                )
                format_input = gr.Radio(
                    choices=["Markdown", "JSON"],
                    label="Output Format",
                    value="Markdown",
                    elem_classes="gr-input",
                )
                method_input = gr.Radio(
                    choices=["docling", "llama-index-readers-docling"],
                    label="Conversion Method",
                    value="docling",
                    elem_classes="gr-input",
                )
                convert_button = gr.Button(
                    "🔄 Convert Document",
                    variant="primary",
                    elem_classes=["gr-button"],
                )
                # Hidden until the first conversion finishes; the click
                # handler below reveals it together with its message.
                status_message = gr.Textbox(
                    label="Status",
                    interactive=False,
                    visible=False,
                    elem_classes="gr-padded",
                )

        with gr.Column(scale=2):
            with gr.Group():
                gr.Markdown("### Conversion Output")
                output_text = gr.Textbox(
                    label="Converted Content",
                    placeholder="The converted text will appear here...",
                    lines=15,
                    elem_classes="gr-textbox",
                )
                output_metadata = gr.JSON(
                    label="Document Metadata",
                    elem_classes="gr-input",
                )
                download_button = gr.File(
                    label="Download Converted File",
                    visible=False,
                    elem_classes="gr-padded",
                )

    with gr.Row():
        progress = gr.Slider(
            minimum=0,
            maximum=100,
            value=0,
            label="Progress",
            interactive=False,
        )

    def _run_conversion(file, output_format, method):
        """Run the converter chosen in the UI and reveal the status box."""
        convert = (
            convert_document_docling
            if method == "llama-index-readers-docling"
            else convert_document_original
        )
        # The converters invoke their third argument as ``progress(100)``.
        # Hand them a callable instead of the Slider component, which cannot
        # be called; the slider itself is driven by the last returned value.
        text, metadata, path, download_update, status, percent = convert(
            file, output_format, lambda *_args: None
        )
        # ``status_message`` is created hidden, so a bare string would update
        # a box nobody can see; make it visible along with the text.
        return (
            text,
            metadata,
            path,
            download_update,
            gr.update(value=status, visible=True),
            percent,
        )

    convert_button.click(
        fn=_run_conversion,
        inputs=[file_input, format_input, method_input],
        # ``download_button`` is listed twice on purpose: the converters
        # return its file path and its visibility as two separate values.
        outputs=[
            output_text,
            output_metadata,
            download_button,
            download_button,
            status_message,
            progress,
        ],
    )
|
|
|
def update_progress(current, total):
    """Return how much of ``total`` is done, as a percentage.

    Args:
        current: Amount of work finished so far.
        total: Amount of work overall.

    Returns:
        ``current / total * 100``. When ``total`` is zero there is nothing to
        measure against, so ``0.0`` is returned instead of raising
        ``ZeroDivisionError``.
    """
    if total == 0:
        return 0.0
    return (current / total) * 100
|
|
|
def convert_document_docling(file, output_format, progress=None):
    """Convert an uploaded document through the LlamaIndex ``DoclingReader``.

    Args:
        file: The upload to convert. Either an object whose ``name`` attribute
            holds the path on disk, or the path string itself. ``None`` is
            reported as a failed conversion.
        output_format: ``"JSON"`` selects JSON export; any other value selects
            Markdown.
        progress: Optional callback invoked as ``progress(100)`` once the
            output file has been written. Non-callable values are ignored.

    Returns:
        A 6-tuple ``(converted_text, metadata, output_path, download_update,
        status_message, progress_value)``. ``progress_value`` is ``100`` on
        success and ``0`` on failure; on failure ``converted_text`` is ``""``,
        ``metadata`` holds ``"Error"``/``"Status"`` keys, ``output_path`` is
        ``None`` and ``download_update`` hides the download widget.
    """
    # Status glyphs were mis-decoded emoji that split the string literal over
    # two lines; they are restored here as proper Unicode.
    success_status = "✅ Document converted successfully!"
    failure_status = "❌ Error during conversion"

    if file is None:
        # Report the missing upload directly instead of letting ``file.name``
        # raise an opaque AttributeError on None.
        return (
            "",
            {"Error": "No file was uploaded.", "Status": "Failed"},
            None,
            gr.update(visible=False),
            failure_status,
            0,
        )

    # Accept both file-like uploads (``.name``) and plain path strings.
    source_path = getattr(file, "name", file)
    wants_json = output_format == "JSON"

    try:
        export_type = (
            DoclingReader.ExportType.JSON
            if wants_json
            else DoclingReader.ExportType.MARKDOWN
        )
        reader = DoclingReader(export_type=export_type)
        docs = reader.load_data(file_path=source_path)
        converted_text = docs[0].text

        # Write the result next to other temp files, named after the upload.
        stem = os.path.splitext(os.path.basename(source_path))[0]
        output_path = os.path.join(
            tempfile.gettempdir(),
            stem + (".json" if wants_json else ".md"),
        )
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(converted_text)

        metadata = {
            "Filename": source_path,
            "File Size": f"{os.path.getsize(source_path) / 1024:.2f} KB",
            "Output Format": output_format,
            "Conversion Status": "Success",
            "Method": "llama-index-readers-docling",
        }

        # The click handler in this file passes the ``gr.Slider`` component
        # itself. Calling a non-callable would raise TypeError here and turn a
        # finished conversion into a reported failure, so only real callbacks
        # are invoked.
        if callable(progress):
            progress(100)

        return (
            converted_text,
            metadata,
            output_path,
            gr.update(visible=True),
            success_status,
            100,
        )
    except Exception as e:
        # UI boundary: any failure is shown to the user instead of crashing
        # the request handler.
        error_metadata = {"Error": str(e), "Status": "Failed"}
        return (
            "",
            error_metadata,
            None,
            gr.update(visible=False),
            failure_status,
            0,
        )
|
|
|
def convert_document_original(file, output_format, progress=None):
    """Convert an uploaded document with Docling's ``DocumentConverter``.

    Args:
        file: The upload to convert. Either an object whose ``name`` attribute
            holds the path on disk, or the path string itself. ``None`` is
            reported as a failed conversion.
        output_format: ``"Markdown"`` selects Markdown export; any other value
            selects JSON.
        progress: Optional callback invoked as ``progress(100)`` once the
            output file has been written. Non-callable values are ignored.

    Returns:
        A 6-tuple ``(converted_text, metadata, output_path, download_update,
        status_message, progress_value)``. ``progress_value`` is ``100`` on
        success and ``0`` on failure; on failure ``converted_text`` is ``""``,
        ``metadata`` holds ``"Error"``/``"Status"`` keys, ``output_path`` is
        ``None`` and ``download_update`` hides the download widget.
    """
    # Status glyphs were mis-decoded emoji that split the string literal over
    # two lines; they are restored here as proper Unicode.
    success_status = "✅ Document converted successfully!"
    failure_status = "❌ Error during conversion"

    if file is None:
        # Report the missing upload directly instead of letting ``file.name``
        # raise an opaque AttributeError on None.
        return (
            "",
            {"Error": "No file was uploaded.", "Status": "Failed"},
            None,
            gr.update(visible=False),
            failure_status,
            0,
        )

    # Accept both file-like uploads (``.name``) and plain path strings.
    source_path = getattr(file, "name", file)

    try:
        result = DocumentConverter().convert(source_path)

        if output_format == "Markdown":
            converted_text = result.document.export_to_markdown()
            file_extension = ".md"
        else:
            # ``f.write`` below needs text, so serialise the exported dict
            # explicitly.
            # NOTE(review): the previous ``export_to_json()`` call does not
            # appear in the documented DoclingDocument API - confirm against
            # the installed docling version.
            converted_text = json.dumps(
                result.document.export_to_dict(),
                ensure_ascii=False,
                indent=2,
            )
            file_extension = ".json"

        stem = os.path.splitext(os.path.basename(source_path))[0]
        output_path = os.path.join(tempfile.gettempdir(), stem + file_extension)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(converted_text)

        metadata = {
            "Filename": source_path,
            "File Size": f"{os.path.getsize(source_path) / 1024:.2f} KB",
            "Output Format": output_format,
            "Conversion Status": "Success",
            "Method": "docling",
        }

        # The click handler in this file passes the ``gr.Slider`` component
        # itself. Calling a non-callable would raise TypeError here and turn a
        # finished conversion into a reported failure, so only real callbacks
        # are invoked.
        if callable(progress):
            progress(100)

        return (
            converted_text,
            metadata,
            output_path,
            gr.update(visible=True),
            success_status,
            100,
        )
    except Exception as e:
        # UI boundary: any failure is shown to the user instead of crashing
        # the request handler.
        error_metadata = {"Error": str(e), "Status": "Failed"}
        return (
            "",
            error_metadata,
            None,
            gr.update(visible=False),
            failure_status,
            0,
        )
|
|
|
# Start the server only when this file is executed as a script, so that
# importing the module (for tests or reuse of the converters) has no
# side effect of launching a web app.
if __name__ == "__main__":
    # NOTE(review): share=True requests a publicly reachable Gradio link and
    # debug=True enables debug mode - confirm both are intended outside
    # local development.
    app.launch(debug=True, share=True)
|
|
|
|