File size: 7,312 Bytes
411f800
 
 
 
cec331b
411f800
 
 
 
 
 
 
 
 
 
c84abd3
411f800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79a92e5
411f800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79a92e5
 
411f800
 
 
 
 
 
 
 
 
79a92e5
411f800
 
 
 
 
 
79a92e5
411f800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79a92e5
 
411f800
 
 
 
79a92e5
411f800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79a92e5
411f800
 
 
 
 
 
 
79a92e5
411f800
 
 
 
 
 
 
 
 
79a92e5
411f800
 
 
 
cec331b
411f800
 
 
 
 
 
79a92e5
 
411f800
 
 
cec331b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import logging
import json
import yaml
import gradio as gr
import gradio.themes as themes
from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, EasyOcrOptions
from docling.utils.export import generate_multimodal_pages
from docling.utils.utils import create_hash
import pandas as pd
import time
import datetime

# Set up logging: one module-level logger, INFO and above to the root handler.
logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)

# OCR Configuration — module-level pipeline options shared by every conversion
# started from this app (convert_document passes them to DocumentConverter).
ocr_options = EasyOcrOptions(force_full_page_ocr=True)  # OCR the whole page, not just detected bitmap regions
pipeline_options = PdfPipelineOptions(do_table_structure=True)
pipeline_options.do_ocr = True  # Enable OCR for images and text
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE  # More accurate table model
pipeline_options.ocr_options = ocr_options
pipeline_options.ocr_options.lang = ["id", "en"]  # OCR languages: Indonesian + English

# Function to handle per-document exports (tables, pictures, multimodal pages)
def export_tables_and_figures(conv_res, output_dir):
    """Export tables, pictures, and multimodal page data from a converted document.

    Args:
        conv_res: docling conversion result; must expose ``document``,
            ``input.file`` and ``input.document_hash``.
        output_dir: ``pathlib.Path`` directory that receives all exported files.

    Returns:
        list[str]: Absolute paths (as strings) of every file written.
    """
    start_time = time.time()

    output_files = []
    stem = conv_res.input.file.stem  # hoisted: used for every output filename

    # Export every detected table as both CSV and HTML.
    for table_ix, table in enumerate(conv_res.document.tables):
        table_df = table.export_to_dataframe()
        table_csv_filename = output_dir / f"{stem}-table-{table_ix + 1}.csv"
        table_html_filename = output_dir / f"{stem}-table-{table_ix + 1}.html"

        _log.info(f"Saving CSV table to {table_csv_filename}")
        table_df.to_csv(table_csv_filename)

        _log.info(f"Saving HTML table to {table_html_filename}")
        # Explicit UTF-8: OCR languages include Indonesian, so table text may be
        # non-ASCII; the platform default codec (e.g. cp1252) could fail on it.
        # This also makes the HTML export consistent with the md/json/yaml exports.
        with table_html_filename.open("w", encoding="utf-8") as fp:
            fp.write(table.export_to_html())

        output_files.append(table_csv_filename)
        output_files.append(table_html_filename)

    # Export embedded pictures; some picture items carry no rendered image.
    for picture_ix, picture in enumerate(conv_res.document.pictures):
        if picture.image:  # picture.image may be None
            picture_image_filename = output_dir / f"{stem}-picture-{picture_ix + 1}.png"
            _log.info(f"Saving Picture to {picture_image_filename}")
            picture.image.save(picture_image_filename)
            output_files.append(picture_image_filename)
        else:
            _log.warning(f"Skipping picture {picture_ix + 1} due to missing image.")

    # Collect one row per page for the multimodal Parquet export.
    rows = []
    for content_text, content_md, content_dt, page_cells, page_segments, page in generate_multimodal_pages(conv_res):
        try:
            dpi = page._default_image_scale * 72
            # page.image may be None (e.g. page image generation disabled);
            # fall back to zero-size / empty bytes in that case.
            image_width = image_height = 0
            image_bytes = None
            if page.image:
                image_width = page.image.width
                image_height = page.image.height
                image_bytes = page.image.tobytes()

            rows.append({
                "document": conv_res.input.file.name,
                "hash": conv_res.input.document_hash,
                "page_hash": create_hash(conv_res.input.document_hash + ":" + str(page.page_no - 1)),
                "image": {
                    "width": image_width,
                    "height": image_height,
                    "bytes": image_bytes,
                },
                "cells": page_cells,
                "contents": content_text,
                "contents_md": content_md,
                "contents_dt": content_dt,
                "segments": page_segments,
                "extra": {
                    "page_num": page.page_no + 1,
                    "width_in_points": page.size.width,
                    "height_in_points": page.size.height,
                    "dpi": dpi,
                },
            })
        except Exception as e:
            # Best-effort: a single bad page must not abort the whole export.
            _log.warning(f"Failed to process page {page.page_no + 1}: {e}")

    # Write one Parquet file covering all pages of this document.
    df = pd.json_normalize(rows)
    now = datetime.datetime.now()
    output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
    df.to_parquet(output_filename)
    output_files.append(output_filename)

    elapsed = time.time() - start_time
    _log.info(f"Tables, figures, and multimodal pages exported in {elapsed:.2f} seconds.")

    return [str(file.resolve()) for file in output_files]

# Main conversion function
def convert_document(input_file):
    """Convert an uploaded document and export it in multiple formats.

    Exports Markdown, JSON and YAML plus tables, pictures and a multimodal
    Parquet file, all under ``scratch/<document-stem>/``.

    Args:
        input_file: Either a filesystem path (``str``/``Path``, as passed by
            ``gr.File(type="filepath")``) or a file-like object exposing a
            ``.name`` attribute (legacy Gradio tempfile wrapper).

    Returns:
        list[str]: Absolute paths of every exported file.
    """
    # All exports land under ./scratch/.
    output_dir = Path("scratch")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Restrict accepted formats; route PDFs through pypdfium2 with the
    # module-level OCR/table pipeline options.
    doc_converter = DocumentConverter(
        allowed_formats=[InputFormat.PDF, InputFormat.IMAGE, InputFormat.DOCX, InputFormat.HTML],
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend)}
    )

    # gr.File(type="filepath") hands the callback a plain str, which has no
    # ``.name`` attribute — the old ``input_file.name`` access raised
    # AttributeError on current Gradio. Accept both the str path and the
    # legacy file-like object.
    if isinstance(input_file, (str, Path)):
        input_path = Path(input_file)
    else:
        input_path = Path(input_file.name)
    conv_results = doc_converter.convert_all([input_path])

    # Export to markdown, json, yaml with explicit UTF-8 so non-ASCII OCR text
    # (OCR languages include Indonesian) round-trips on any platform.
    output_files = []
    for res in conv_results:
        stem = res.input.file.stem
        out_path = output_dir / stem
        out_path.mkdir(parents=True, exist_ok=True)

        doc_dict = res.document.export_to_dict()  # hoisted: reused for JSON and YAML

        with (out_path / f"{stem}.md").open("w", encoding="utf-8") as fp:
            fp.write(res.document.export_to_markdown())
        with (out_path / f"{stem}.json").open("w", encoding="utf-8") as fp:
            fp.write(json.dumps(doc_dict, ensure_ascii=False))
        with (out_path / f"{stem}.yaml").open("w", encoding="utf-8") as fp:
            fp.write(yaml.safe_dump(doc_dict, allow_unicode=True))

        output_files.append(str((out_path / f"{stem}.md").resolve()))
        output_files.append(str((out_path / f"{stem}.json").resolve()))
        output_files.append(str((out_path / f"{stem}.yaml").resolve()))

        # Tables, pictures and the multimodal Parquet go next to the text exports.
        output_files.extend(export_tables_and_figures(res, out_path))

    return output_files

# Thin Gradio callback wrapping the conversion pipeline
def gradio_interface(input_file):
    """Convert the uploaded file and hand the exported file paths back to Gradio."""
    return convert_document(input_file)

# Create the Gradio interface with a theme.
# Single-file upload in, multiple exported files out.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(file_count="single", type="filepath"),
    outputs=gr.File(file_count="multiple"),
    title="Document Conversion with OCR",
    description="Upload your document or image, and get the converted output with OCR and other exports.",
    # NOTE(review): `allow_flagging` is deprecated in newer Gradio releases in
    # favor of `flagging_mode` — confirm against the pinned Gradio version.
    allow_flagging="never",
    theme=themes.Base(primary_hue="teal", secondary_hue="teal", neutral_hue="slate"),  # Set the theme here
)

# Launch the web UI only when run as a script (not on import).
if __name__ == "__main__":
    iface.launch()