File size: 1,260 Bytes
c8a32e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import base64
import io
import os

import gradio as gr
from marker.convert import convert_single_pdf
from marker.models import load_all_models
from marker.settings import Settings


# Load the marker models once at import time; the conversion handler below
# passes this same object to every convert_single_pdf call.
model_list = load_all_models()

def parse_pdf_and_return_markdown(pdf_file: str, extract_images: bool):
    """Convert a PDF to markdown and optionally return its images as base64.

    Args:
        pdf_file: Path of the PDF to convert. The Gradio ``File`` input wired
            to this function uses ``type="filepath"``, so a path string (not
            raw bytes) is what arrives here.
        extract_images: When true, each image produced by the converter is
            PNG-encoded and included in the result; when false, the image
            mapping is returned empty.

    Returns:
        A ``(full_text, out_meta, image_data)`` tuple: the converted text,
        the metadata returned by ``convert_single_pdf``, and a dict mapping
        ``"image_1"``, ``"image_2"``, ... to base64-encoded PNG strings.
    """
    import io  # local import keeps this function self-contained

    full_text, images, out_meta = convert_single_pdf(pdf_file, model_list)
    image_data = {}
    if extract_images:
        # Only the image objects are needed; the converter's own filenames
        # are replaced by sequential "image_N" keys.
        for index, image in enumerate(images.values(), start=1):
            # Encode in memory rather than through a fixed-name file in the
            # working directory: concurrent requests would overwrite or
            # delete each other's "image_N.png", and a failure mid-way would
            # leave the temporary file behind.
            with io.BytesIO() as buffer:
                image.save(buffer, "PNG")
                encoded = base64.b64encode(buffer.getvalue())
            image_data[f"image_{index}"] = encoded.decode("utf-8")

    return full_text, out_meta, image_data
    

with gr.Blocks() as server:
    # Short instruction shown above the conversion form.
    gr.Markdown("Upload a PDF file to convert to markdown.")
    # The form: a file upload plus a checkbox feed the conversion handler,
    # whose three return values fill the three output components in order.
    gr.Interface(
        fn=parse_pdf_and_return_markdown,
        inputs=[
            gr.File(label="Upload PDF", type="filepath"),
            gr.Checkbox(label="Extract Images"),
        ],
        outputs=[
            gr.Textbox(label="Markdown"),
            gr.JSON(label="Metadata"),
            gr.JSON(label="Images"),
        ],
    )


if __name__ == "__main__":
    # Launch the Gradio app only when this file is executed as a script,
    # not when it is imported as a module.
    server.launch()