File size: 2,594 Bytes
c8a32e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from marker.images.save import get_image_filename
from marker.pdf.images import render_bbox_image
from marker.schema.bbox import rescale_bbox
from marker.schema.block import find_insert_block, Span, Line
from marker.settings import settings


def find_image_blocks(page):
    image_blocks = []
    image_regions = [l.bbox for l in page.layout.bboxes if l.label in ["Figure", "Picture"]]
    image_regions = [rescale_bbox(page.layout.image_bbox, page.bbox, b) for b in image_regions]

    insert_points = {}
    for region_idx, region in enumerate(image_regions):
        for block_idx, block in enumerate(page.blocks):
            for line_idx, line in enumerate(block.lines):
                if line.intersection_pct(region) > settings.BBOX_INTERSECTION_THRESH:
                    line.spans = [] # We will remove this line from the block

                    if region_idx not in insert_points:
                        insert_points[region_idx] = (block_idx, line_idx)

    # Account for images with no detected lines
    for region_idx, region in enumerate(image_regions):
        if region_idx in insert_points:
            continue

        insert_points[region_idx] = (find_insert_block(page.blocks, region), 0)

    for region_idx, image_region in enumerate(image_regions):
        image_insert = insert_points[region_idx]
        image_blocks.append([image_insert[0], image_insert[1], image_region])

    return image_blocks


def extract_page_images(page_obj, page):
    page.images = []
    image_blocks = find_image_blocks(page)

    for image_idx, (block_idx, line_idx, bbox) in enumerate(image_blocks):
        block = page.blocks[block_idx]
        image = render_bbox_image(page_obj, page, bbox)
        image_filename = get_image_filename(page, image_idx)
        image_markdown = f"\n\n![{image_filename}]({image_filename})\n\n"
        image_span = Span(
            bbox=bbox,
            text=image_markdown,
            font="Image",
            rotation=0,
            font_weight=0,
            font_size=0,
            image=True,
            span_id=f"image_{image_idx}"
        )

        # Sometimes, the block has zero lines
        if len(block.lines) > line_idx:
            block.lines[line_idx].spans.append(image_span)
        else:
            line = Line(
                bbox=bbox,
                spans=[image_span]
            )
            block.lines.append(line)
        page.images.append(image)


def extract_images(doc, pages):
    for page_idx, page in enumerate(pages):
        page_obj = doc[page_idx]
        extract_page_images(page_obj, page)