RicardoDataScience36's picture
Update app.py
819cf7f verified
import gradio as gr
from docling.document_converter import DocumentConverter
from llama_index.readers.docling import DoclingReader
import json
import tempfile
import os
def convert_document_docling(file, output_format):
    """Convert an uploaded document with the llama-index Docling reader.

    Note: a three-argument function with this same name is defined later in
    this file and replaces this one once the module has finished loading.

    Args:
        file: The upload to convert. Either an object whose ``name`` attribute
            holds the path on disk, or the path itself as a string. ``None``
            (nothing uploaded) is reported as a failed conversion.
        output_format: ``"JSON"`` for JSON output; any other value produces
            Markdown.

    Returns:
        A 5-tuple ``(text, metadata, output_path, download_update, status)``.
        On success ``text`` is the converted content, ``metadata`` describes
        the conversion, ``output_path`` is the file the content was written
        to and ``download_update`` makes the download component visible. On
        failure the tuple is ``("", {"Error": ..., "Status": "Failed"}, None,
        <hide update>, <error status>)``; exceptions are never propagated.
    """
    try:
        if file is None:
            raise ValueError("No document was uploaded.")
        # Accept both file-like uploads (path in `.name`) and plain path strings.
        source_path = getattr(file, "name", file)
        wants_json = output_format == "JSON"

        export_type = (
            DoclingReader.ExportType.JSON
            if wants_json
            else DoclingReader.ExportType.MARKDOWN
        )
        reader = DoclingReader(export_type=export_type)
        docs = reader.load_data(file_path=source_path)
        if not docs:
            raise ValueError("The reader produced no content for this document.")
        converted_text = docs[0].text

        # A fresh directory per call keeps simultaneous conversions of files
        # that share a base name from overwriting each other's output.
        output_dir = tempfile.mkdtemp(prefix="docling_")
        stem = os.path.splitext(os.path.basename(source_path))[0]
        output_path = os.path.join(output_dir, stem + (".json" if wants_json else ".md"))
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(converted_text)

        metadata = {
            "Filename": source_path,
            "File Size": f"{os.path.getsize(source_path) / 1024:.2f} KB",
            "Output Format": output_format,
            "Conversion Status": "Success",
            "Method": "llama-index-readers-docling",
        }
        return (
            converted_text,
            metadata,
            output_path,
            gr.update(visible=True),
            "✅ Document converted successfully!",
        )
    except Exception as e:
        # UI boundary: surface the failure in the metadata panel instead of raising.
        error_metadata = {"Error": str(e), "Status": "Failed"}
        return "", error_metadata, None, gr.update(visible=False), "❌ Error during conversion"
def convert_document_original(file, output_format):
    """Convert an uploaded document with docling's ``DocumentConverter``.

    Note: a three-argument function with this same name is defined later in
    this file and replaces this one once the module has finished loading.

    Args:
        file: The upload to convert. Either an object whose ``name`` attribute
            holds the path on disk, or the path itself as a string. ``None``
            (nothing uploaded) is reported as a failed conversion.
        output_format: ``"Markdown"`` for Markdown output; any other value
            produces JSON.

    Returns:
        A 5-tuple ``(text, metadata, output_path, download_update, status)``.
        On success ``text`` is the converted content, ``metadata`` describes
        the conversion, ``output_path`` is the file the content was written
        to and ``download_update`` makes the download component visible. On
        failure the tuple is ``("", {"Error": ..., "Status": "Failed"}, None,
        <hide update>, <error status>)``; exceptions are never propagated.
    """
    try:
        if file is None:
            raise ValueError("No document was uploaded.")
        # Accept both file-like uploads (path in `.name`) and plain path strings.
        source_path = getattr(file, "name", file)

        converter = DocumentConverter()
        result = converter.convert(source_path)

        if output_format == "Markdown":
            converted_text = result.document.export_to_markdown()
            file_extension = ".md"
        else:
            # Serialise the document's dict export ourselves so the result is
            # always a string that can be shown in the textbox and written out.
            converted_text = json.dumps(
                result.document.export_to_dict(), indent=2, ensure_ascii=False
            )
            file_extension = ".json"

        # A fresh directory per call keeps simultaneous conversions of files
        # that share a base name from overwriting each other's output.
        output_dir = tempfile.mkdtemp(prefix="docling_")
        stem = os.path.splitext(os.path.basename(source_path))[0]
        output_path = os.path.join(output_dir, stem + file_extension)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(converted_text)

        metadata = {
            "Filename": source_path,
            "File Size": f"{os.path.getsize(source_path) / 1024:.2f} KB",
            "Output Format": output_format,
            "Conversion Status": "Success",
            "Method": "docling",
        }
        return (
            converted_text,
            metadata,
            output_path,
            gr.update(visible=True),
            "✅ Document converted successfully!",
        )
    except Exception as e:
        # UI boundary: surface the failure in the metadata panel instead of raising.
        error_metadata = {"Error": str(e), "Status": "Failed"}
        return "", error_metadata, None, gr.update(visible=False), "❌ Error during conversion"
# Stylesheet handed to gr.Blocks(css=...) when the UI is built. It declares a
# dark colour palette as CSS variables on :root and styles a set of .gr-*
# class names; the UI attaches several of them (gr-input, gr-button,
# gr-padded, gr-textbox) through elem_classes and uses gr-header/gr-subtitle
# in its HTML banner.
custom_css = """
:root {
--primary-color: #2563eb;
--secondary-color: #1e40af;
--background-color: #1e1e1e;
--card-background: #262626;
--text-color: #ffffff;
--border-radius: 10px;
}
body {
background-color: var(--background-color);
color: var(--text-color);
}
.container {
max-width: 1200px;
margin: 0 auto;
padding: 2rem;
}
.gr-button {
background: var(--primary-color) !important;
border: none !important;
color: white !important;
padding: 10px 20px !important;
border-radius: var(--border-radius) !important;
transition: all 0.3s ease !important;
}
.gr-button:hover {
background: var(--secondary-color) !important;
transform: translateY(-2px);
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.gr-form {
background-color: var(--card-background);
padding: 2rem;
border-radius: var(--border-radius);
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.gr-input, .gr-textbox {
background-color: #333333 !important;
border: 1px solid #404040 !important;
color: var(--text-color) !important;
border-radius: var(--border-radius) !important;
}
.gr-padded {
padding: 1rem;
}
.gr-header {
margin-bottom: 2rem;
text-align: center;
}
.gr-subtitle {
color: #9ca3af;
font-size: 1.1rem;
margin-bottom: 1.5rem;
}
"""
# Build the converter UI: inputs on the left, conversion results on the right,
# a progress slider underneath, and the click handler that ties them together.
# NOTE(review): the original indentation was lost, so the nesting of the
# components below is a reconstruction - confirm it matches the intended layout.
with gr.Blocks(css=custom_css) as app:
    gr.HTML(
        """
        <div class="gr-header">
        <h1 style='font-size: 2.5rem; color: #2563eb; margin-bottom: 1rem;'>📄 Docling Document Converter</h1>
        <p class="gr-subtitle">Transform your documents into Markdown or JSON format with ease</p>
        </div>
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### Input Settings")
                file_input = gr.File(
                    label="Upload Your Document",
                    file_types=[".doc", ".docx", ".pdf", ".txt"],
                    elem_classes="gr-input",
                )
                format_input = gr.Radio(
                    choices=["Markdown", "JSON"],
                    label="Output Format",
                    value="Markdown",
                    elem_classes="gr-input",
                )
                method_input = gr.Radio(
                    choices=["docling", "llama-index-readers-docling"],
                    label="Conversion Method",
                    value="docling",
                    elem_classes="gr-input",
                )
            convert_button = gr.Button(
                "🔄 Convert Document",
                variant="primary",
                elem_classes=["gr-button"],
            )
            # Starts hidden; the click handler reveals it together with the
            # first status text.
            status_message = gr.Textbox(
                label="Status",
                interactive=False,
                visible=False,
                elem_classes="gr-padded",
            )
        with gr.Column(scale=2):
            with gr.Group():
                gr.Markdown("### Conversion Output")
                output_text = gr.Textbox(
                    label="Converted Content",
                    placeholder="The converted text will appear here...",
                    lines=15,
                    elem_classes="gr-textbox",
                )
                output_metadata = gr.JSON(
                    label="Document Metadata",
                    elem_classes="gr-input",
                )
            download_button = gr.File(
                label="Download Converted File",
                visible=False,
                elem_classes="gr-padded",
            )
    with gr.Row():
        progress = gr.Slider(minimum=0, maximum=100, value=0, label="Progress", interactive=False)

    def _ignore_progress(_percent):
        """Stand-in progress callback; the slider is driven by the returned percentage."""

    def _run_conversion(file, output_format, method):
        """Dispatch to the selected converter and adapt its result to the UI.

        The converters return ``(text, metadata, path, download_update,
        status, percent)``. The file path and the download visibility are
        merged into one update so ``download_button`` is listed once in
        ``outputs``, and the status text is wrapped in an update that also
        un-hides the status box.
        """
        if method == "llama-index-readers-docling":
            convert = convert_document_docling
        else:
            convert = convert_document_original
        # The converters call their third argument, so hand them a real
        # callable rather than the Slider component.
        text, metadata, path, _download_update, status, percent = convert(
            file, output_format, _ignore_progress
        )
        return (
            text,
            metadata,
            gr.update(value=path, visible=path is not None),
            gr.update(value=status, visible=True),
            percent,
        )

    convert_button.click(
        fn=_run_conversion,
        inputs=[file_input, format_input, method_input],
        outputs=[output_text, output_metadata, download_button, status_message, progress],
    )
def update_progress(current, total):
    """Return ``current`` as a percentage of ``total``.

    Args:
        current: Amount of work completed so far.
        total: Total amount of work.

    Returns:
        ``current / total * 100`` as a float, or ``0.0`` when ``total`` is
        zero (nothing to do yet) instead of raising ``ZeroDivisionError``.
    """
    if not total:
        return 0.0
    return (current / total) * 100
def convert_document_docling(file, output_format, progress=None):
    """Convert an uploaded document with the llama-index Docling reader.

    Args:
        file: The upload to convert. Either an object whose ``name`` attribute
            holds the path on disk, or the path itself as a string. ``None``
            (nothing uploaded) is reported as a failed conversion.
        output_format: ``"JSON"`` for JSON output; any other value produces
            Markdown.
        progress: Optional callback invoked as ``progress(100)`` once the
            conversion has finished. Anything that is not callable is ignored.

    Returns:
        A 6-tuple ``(text, metadata, output_path, download_update, status,
        percent)``. On success ``text`` is the converted content,
        ``output_path`` is the file it was written to, ``download_update``
        makes the download component visible and ``percent`` is ``100``. On
        failure the tuple is ``("", {"Error": ..., "Status": "Failed"}, None,
        <hide update>, <error status>, 0)``; exceptions are never propagated.
    """
    try:
        if file is None:
            raise ValueError("No document was uploaded.")
        # Accept both file-like uploads (path in `.name`) and plain path strings.
        source_path = getattr(file, "name", file)
        wants_json = output_format == "JSON"

        export_type = (
            DoclingReader.ExportType.JSON
            if wants_json
            else DoclingReader.ExportType.MARKDOWN
        )
        reader = DoclingReader(export_type=export_type)
        docs = reader.load_data(file_path=source_path)
        if not docs:
            raise ValueError("The reader produced no content for this document.")
        converted_text = docs[0].text

        # A fresh directory per call keeps simultaneous conversions of files
        # that share a base name from overwriting each other's output.
        output_dir = tempfile.mkdtemp(prefix="docling_")
        stem = os.path.splitext(os.path.basename(source_path))[0]
        output_path = os.path.join(output_dir, stem + (".json" if wants_json else ".md"))
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(converted_text)

        metadata = {
            "Filename": source_path,
            "File Size": f"{os.path.getsize(source_path) / 1024:.2f} KB",
            "Output Format": output_format,
            "Conversion Status": "Success",
            "Method": "llama-index-readers-docling",
        }
        # `progress` may be a UI component rather than a callback; calling a
        # non-callable would raise here and turn a finished conversion into a
        # reported failure.
        if callable(progress):
            progress(100)
        return (
            converted_text,
            metadata,
            output_path,
            gr.update(visible=True),
            "✅ Document converted successfully!",
            100,
        )
    except Exception as e:
        # UI boundary: surface the failure in the metadata panel instead of raising.
        error_metadata = {"Error": str(e), "Status": "Failed"}
        return "", error_metadata, None, gr.update(visible=False), "❌ Error during conversion", 0
def convert_document_original(file, output_format, progress=None):
    """Convert an uploaded document with docling's ``DocumentConverter``.

    Args:
        file: The upload to convert. Either an object whose ``name`` attribute
            holds the path on disk, or the path itself as a string. ``None``
            (nothing uploaded) is reported as a failed conversion.
        output_format: ``"Markdown"`` for Markdown output; any other value
            produces JSON.
        progress: Optional callback invoked as ``progress(100)`` once the
            conversion has finished. Anything that is not callable is ignored.

    Returns:
        A 6-tuple ``(text, metadata, output_path, download_update, status,
        percent)``. On success ``text`` is the converted content,
        ``output_path`` is the file it was written to, ``download_update``
        makes the download component visible and ``percent`` is ``100``. On
        failure the tuple is ``("", {"Error": ..., "Status": "Failed"}, None,
        <hide update>, <error status>, 0)``; exceptions are never propagated.
    """
    try:
        if file is None:
            raise ValueError("No document was uploaded.")
        # Accept both file-like uploads (path in `.name`) and plain path strings.
        source_path = getattr(file, "name", file)

        converter = DocumentConverter()
        result = converter.convert(source_path)

        if output_format == "Markdown":
            converted_text = result.document.export_to_markdown()
            file_extension = ".md"
        else:
            # Serialise the document's dict export ourselves so the result is
            # always a string that can be shown in the textbox and written out.
            converted_text = json.dumps(
                result.document.export_to_dict(), indent=2, ensure_ascii=False
            )
            file_extension = ".json"

        # A fresh directory per call keeps simultaneous conversions of files
        # that share a base name from overwriting each other's output.
        output_dir = tempfile.mkdtemp(prefix="docling_")
        stem = os.path.splitext(os.path.basename(source_path))[0]
        output_path = os.path.join(output_dir, stem + file_extension)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(converted_text)

        metadata = {
            "Filename": source_path,
            "File Size": f"{os.path.getsize(source_path) / 1024:.2f} KB",
            "Output Format": output_format,
            "Conversion Status": "Success",
            "Method": "docling",
        }
        # `progress` may be a UI component rather than a callback; calling a
        # non-callable would raise here and turn a finished conversion into a
        # reported failure.
        if callable(progress):
            progress(100)
        return (
            converted_text,
            metadata,
            output_path,
            gr.update(visible=True),
            "✅ Document converted successfully!",
            100,
        )
    except Exception as e:
        # UI boundary: surface the failure in the metadata panel instead of raising.
        error_metadata = {"Error": str(e), "Status": "Failed"}
        return "", error_metadata, None, gr.update(visible=False), "❌ Error during conversion", 0
# Start the app; both the `debug` and `share` options are switched on.
app.launch(debug=True, share=True)