|
from typing import Optional |
|
|
|
import filetype |
|
|
|
from marker.settings import settings |
|
|
|
|
|
def find_filetype(fpath):
    """Classify the file at ``fpath`` into a coarse filetype label.

    Detection is delegated to ``filetype.guess``. A detected MIME type that
    contains ``"pdf"`` maps to ``"pdf"``; any other MIME type is looked up in
    ``settings.SUPPORTED_FILETYPES``. When detection fails, or the MIME type
    is not in that mapping, a message is printed and ``"other"`` is returned.

    Args:
        fpath: Value passed unchanged to ``filetype.guess``.

    Returns:
        ``"pdf"``, the ``settings.SUPPORTED_FILETYPES`` entry for the detected
        MIME type, or ``"other"``.
    """
    guess = filetype.guess(fpath)
    if guess is None:
        print(f"Could not determine filetype for {fpath}")
        return "other"

    mime = guess.mime

    # The PDF check is a substring match and takes priority over the
    # settings lookup.
    if "pdf" in mime:
        return "pdf"

    supported = settings.SUPPORTED_FILETYPES
    if mime in supported:
        return supported[mime]

    print(f"Found nonstandard filetype {mime}")
    return "other"
|
|
|
|
|
def font_flags_decomposer(flags: Optional[int]) -> str:
    """Turn a font-flags bitmask into an underscore-joined list of labels.

    Each recognised bit that is set contributes its label; labels appear in
    ascending bit order. Bits without a label (for example bit 4) are ignored.

    Note that the separator is also ``"_"``, which occurs inside some labels
    (``"fixed_pitch"``, ``"non_symbolic"``, ...), so the result is meant to be
    used as an opaque descriptor rather than split back into labels.

    Args:
        flags: Integer bitmask, or ``None``.

    Returns:
        The joined labels, or ``""`` when ``flags`` is ``None`` or no
        recognised bit is set.
    """
    if flags is None:
        return ""

    # (bit position, label) pairs, listed in output order.
    # NOTE(review): positions look like PDF font-descriptor flag bits --
    # confirm against whatever produces `flags`.
    bit_labels = (
        (0, "fixed_pitch"),
        (1, "serif"),
        (2, "symbolic"),
        (3, "script"),
        (5, "non_symbolic"),
        (6, "italic"),
        (16, "all_cap"),
        (17, "small_cap"),
        (18, "bold"),
        (19, "use_extern_attr"),
    )

    return "_".join(label for bit, label in bit_labels if flags & (1 << bit))
|
|
|
|
|
def _block_bbox(block):
    """Return a block's bbox from its ``bbox`` attribute or its ``"bbox"`` key."""
    return block.bbox if hasattr(block, "bbox") else block["bbox"]


def sort_block_group(blocks, tolerance=1.25):
    """Order blocks by row (``bbox[1]``) and then by ``bbox[0]`` within a row.

    Blocks whose ``bbox[1]`` rounds to the same multiple of ``tolerance`` are
    treated as one row. Rows are emitted in ascending key order, and blocks
    inside a row in ascending ``bbox[0]`` order (ties keep input order).

    Args:
        blocks: Iterable of blocks. Each block exposes its bounding box either
            as a ``bbox`` attribute or under a ``"bbox"`` key; only indices 0
            and 1 are read (presumably x0 and y0 -- confirm with callers).
        tolerance: Row height used to bucket ``bbox[1]`` values. ``0`` groups
            only blocks with exactly equal ``bbox[1]``.

    Returns:
        A new list containing the same block objects in sorted order.
    """
    rows = {}
    for block in blocks:
        top = _block_bbox(block)[1]
        # tolerance == 0 would divide by zero; fall back to exact grouping.
        row_key = round(top / tolerance) * tolerance if tolerance else top
        rows.setdefault(row_key, []).append(block)

    ordered = []
    for row_key in sorted(rows):
        ordered.extend(sorted(rows[row_key], key=lambda b: _block_bbox(b)[0]))
    return ordered
|
|