# NOTE: removed file-viewer metadata artifacts (byte-size banner, git-blame hashes, line-number gutter)
import logging
import json
import yaml
import gradio as gr
import gradio.themes as themes
from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, EasyOcrOptions
from docling.utils.export import generate_multimodal_pages
from docling.utils.utils import create_hash
import pandas as pd
import time
import datetime
# Set up logging
logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)
# OCR Configuration
# Force OCR over entire pages so fully scanned documents are recognized,
# not only the regions docling detects as bitmap text.
ocr_options = EasyOcrOptions(force_full_page_ocr=True)
# Module-level pipeline config shared by convert_document() below.
pipeline_options = PdfPipelineOptions(do_table_structure=True)
pipeline_options.do_ocr = True # Enable OCR for images and text
# Match recognized text cells back to detected table cells for better structure.
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # More accurate table model
pipeline_options.ocr_options = ocr_options
pipeline_options.ocr_options.lang = ["id", "en"] # OCR languages (EasyOCR codes: Indonesian, English)
# Function to handle document conversion and exports
def export_tables_and_figures(conv_res, output_dir):
    """Export tables, pictures, and multimodal page data from a converted document.

    Args:
        conv_res: docling ``ConversionResult`` whose ``document`` holds the
            tables, pictures, and pages to export.
        output_dir: ``pathlib.Path`` directory the artifacts are written into.

    Returns:
        list[str]: Absolute paths of every file written (CSV/HTML per table,
        PNG per picture, and one Parquet file of multimodal page records).
    """
    start_time = time.time()
    output_files = []
    output_files.extend(_export_tables(conv_res, output_dir))
    output_files.extend(_export_pictures(conv_res, output_dir))
    output_files.append(_export_multimodal(conv_res, output_dir))
    elapsed = time.time() - start_time
    _log.info("Tables, figures, and multimodal pages exported in %.2f seconds.", elapsed)
    return [str(file.resolve()) for file in output_files]


def _export_tables(conv_res, output_dir):
    """Write each document table as CSV and HTML; return the created paths."""
    files = []
    stem = conv_res.input.file.stem
    for table_ix, table in enumerate(conv_res.document.tables):
        table_df = table.export_to_dataframe()
        table_csv_filename = output_dir / f"{stem}-table-{table_ix + 1}.csv"
        table_html_filename = output_dir / f"{stem}-table-{table_ix + 1}.html"
        _log.info("Saving CSV table to %s", table_csv_filename)
        table_df.to_csv(table_csv_filename)
        _log.info("Saving HTML table to %s", table_html_filename)
        # utf-8 keeps this consistent with the markdown/json/yaml exports and
        # avoids platform-default encodings mangling non-ASCII table text.
        with table_html_filename.open("w", encoding="utf-8") as fp:
            fp.write(table.export_to_html())
        files.append(table_csv_filename)
        files.append(table_html_filename)
    return files


def _export_pictures(conv_res, output_dir):
    """Save each picture that actually has image data as a PNG; return paths."""
    files = []
    stem = conv_res.input.file.stem
    for picture_ix, picture in enumerate(conv_res.document.pictures):
        # picture.image can be None (e.g. vector-only figures); skip those.
        if picture.image:
            picture_image_filename = output_dir / f"{stem}-picture-{picture_ix + 1}.png"
            _log.info("Saving Picture to %s", picture_image_filename)
            picture.image.save(picture_image_filename)
            files.append(picture_image_filename)
        else:
            _log.warning("Skipping picture %s due to missing image.", picture_ix + 1)
    return files


def _export_multimodal(conv_res, output_dir):
    """Collect per-page multimodal records and write them to one Parquet file."""
    rows = []
    for content_text, content_md, content_dt, page_cells, page_segments, page in generate_multimodal_pages(conv_res):
        try:
            dpi = page._default_image_scale * 72
            # page.image may be None; fall back to empty image metadata.
            image_width = image_height = 0
            image_bytes = None
            if page.image:
                image_width = page.image.width
                image_height = page.image.height
                image_bytes = page.image.tobytes()
            rows.append({
                "document": conv_res.input.file.name,
                "hash": conv_res.input.document_hash,
                # page_no is 1-based here; the hash key uses the 0-based index.
                "page_hash": create_hash(conv_res.input.document_hash + ":" + str(page.page_no - 1)),
                "image": {
                    "width": image_width,
                    "height": image_height,
                    "bytes": image_bytes,
                },
                "cells": page_cells,
                "contents": content_text,
                "contents_md": content_md,
                "contents_dt": content_dt,
                "segments": page_segments,
                "extra": {
                    "page_num": page.page_no + 1,
                    "width_in_points": page.size.width,
                    "height_in_points": page.size.height,
                    "dpi": dpi,
                },
            })
        except Exception as e:
            # Best-effort per page: a single bad page must not abort the export.
            _log.warning("Failed to process page %s: %s", page.page_no + 1, e)
    df = pd.json_normalize(rows)
    now = datetime.datetime.now()
    output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
    df.to_parquet(output_filename)
    return output_filename
# Main conversion function
def convert_document(input_file):
    """Convert an uploaded document and write all export artifacts to ``scratch/``.

    Args:
        input_file: Either a plain filesystem path string (what
            ``gr.File(type="filepath")`` supplies) or a file-like object that
            exposes the path via ``.name`` (older gradio file wrappers).

    Returns:
        list[str]: Absolute paths of all generated files (markdown, JSON,
        YAML, plus the table/picture/multimodal exports).
    """
    # Create a temporary output directory
    output_dir = Path("scratch")
    output_dir.mkdir(parents=True, exist_ok=True)
    # Create DocumentConverter instance
    doc_converter = DocumentConverter(
        allowed_formats=[InputFormat.PDF, InputFormat.IMAGE, InputFormat.DOCX, InputFormat.HTML],
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend)}
    )
    # gr.File(type="filepath") passes a plain str, which has no ``.name``
    # attribute; accept both str paths and file objects with ``.name``.
    input_path = Path(getattr(input_file, "name", input_file))
    conv_results = doc_converter.convert_all([input_path])
    # Export to markdown, json, yaml with UTF-8 encoding
    output_files = []
    for res in conv_results:
        # Each input document gets its own subdirectory named after its stem.
        out_path = output_dir / res.input.file.stem
        out_path.mkdir(parents=True, exist_ok=True)
        # Export Markdown and JSON with utf-8 encoding
        with (out_path / f"{res.input.file.stem}.md").open("w", encoding="utf-8") as fp:
            fp.write(res.document.export_to_markdown())
        with (out_path / f"{res.input.file.stem}.json").open("w", encoding="utf-8") as fp:
            fp.write(json.dumps(res.document.export_to_dict(), ensure_ascii=False))
        with (out_path / f"{res.input.file.stem}.yaml").open("w", encoding="utf-8") as fp:
            fp.write(yaml.safe_dump(res.document.export_to_dict(), allow_unicode=True))
        # Append to output files
        output_files.append(str((out_path / f"{res.input.file.stem}.md").resolve()))
        output_files.append(str((out_path / f"{res.input.file.stem}.json").resolve()))
        output_files.append(str((out_path / f"{res.input.file.stem}.yaml").resolve()))
        # Export tables, figures, and multimodal content
        output_files.extend(export_tables_and_figures(res, out_path))
    return output_files
# Gradio callback wired into the Interface below.
def gradio_interface(input_file):
    """Convert the uploaded *input_file* and return the export file paths."""
    return convert_document(input_file)
# Create the Gradio interface with a theme
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(file_count="single", type="filepath"),
    outputs=gr.File(file_count="multiple"),
    title="Document Conversion with OCR",
    description="Upload your document or image, and get the converted output with OCR and other exports.",
    # NOTE(review): allow_flagging is deprecated in gradio 4.x in favor of
    # flagging_mode — confirm against the installed gradio version.
    allow_flagging="never",
    theme=themes.Base(primary_hue="teal", secondary_hue="teal", neutral_hue="slate"),  # Set the theme here
)

if __name__ == "__main__":
    # Removed the stray trailing "|" extraction artifact that made this line
    # a SyntaxError in the file as received.
    iface.launch()