Spaces:

Atticux
/

pdf2u

Sleeping

App Files Files Community

Atticux committed on Feb 22

Commit

752094d

verified ·

1 Parent(s): e0ebfbb

Upload 108 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Dockerfile +50 -0
pyproject.toml +262 -0
src/pdf2u/__init__.py +1 -0
src/pdf2u/__pycache__/__init__.cpython-311.pyc +0 -0
src/pdf2u/__pycache__/__init__.cpython-312.pyc +0 -0
src/pdf2u/__pycache__/const.cpython-311.pyc +0 -0
src/pdf2u/__pycache__/const.cpython-312.pyc +0 -0
src/pdf2u/__pycache__/converter.cpython-311.pyc +0 -0
src/pdf2u/__pycache__/converter.cpython-312.pyc +0 -0
src/pdf2u/__pycache__/high_level.cpython-311.pyc +0 -0
src/pdf2u/__pycache__/high_level.cpython-312.pyc +0 -0
src/pdf2u/__pycache__/io.cpython-312.pyc +0 -0
src/pdf2u/__pycache__/main.cpython-311.pyc +0 -0
src/pdf2u/__pycache__/main.cpython-312.pyc +0 -0
src/pdf2u/__pycache__/pdfinterp.cpython-311.pyc +0 -0
src/pdf2u/__pycache__/pdfinterp.cpython-312.pyc +0 -0
src/pdf2u/__pycache__/progress_monitor.cpython-311.pyc +0 -0
src/pdf2u/__pycache__/progress_monitor.cpython-312.pyc +0 -0
src/pdf2u/__pycache__/translation_config.cpython-311.pyc +0 -0
src/pdf2u/__pycache__/translation_config.cpython-312.pyc +0 -0
src/pdf2u/asynchronize/__init__.py +51 -0
src/pdf2u/asynchronize/__pycache__/__init__.cpython-311.pyc +0 -0
src/pdf2u/asynchronize/__pycache__/__init__.cpython-312.pyc +0 -0
src/pdf2u/const.py +14 -0
src/pdf2u/converter.py +493 -0
src/pdf2u/document_il/__init__.py +45 -0
src/pdf2u/document_il/__pycache__/__init__.cpython-311.pyc +0 -0
src/pdf2u/document_il/__pycache__/__init__.cpython-312.pyc +0 -0
src/pdf2u/document_il/__pycache__/il_version_1.cpython-311.pyc +0 -0
src/pdf2u/document_il/__pycache__/il_version_1.cpython-312.pyc +0 -0
src/pdf2u/document_il/__pycache__/xml_converter.cpython-311.pyc +0 -0
src/pdf2u/document_il/__pycache__/xml_converter.cpython-312.pyc +0 -0
src/pdf2u/document_il/backend/__init__.py +0 -0
src/pdf2u/document_il/backend/__pycache__/__init__.cpython-311.pyc +0 -0
src/pdf2u/document_il/backend/__pycache__/__init__.cpython-312.pyc +0 -0
src/pdf2u/document_il/backend/__pycache__/pdf_creater.cpython-311.pyc +0 -0
src/pdf2u/document_il/backend/__pycache__/pdf_creater.cpython-312.pyc +0 -0
src/pdf2u/document_il/backend/pdf_creater.py +405 -0
src/pdf2u/document_il/frontend/__init__.py +0 -0
src/pdf2u/document_il/frontend/__pycache__/__init__.cpython-311.pyc +0 -0
src/pdf2u/document_il/frontend/__pycache__/__init__.cpython-312.pyc +0 -0
src/pdf2u/document_il/frontend/__pycache__/il_creater.cpython-311.pyc +0 -0
src/pdf2u/document_il/frontend/__pycache__/il_creater.cpython-312.pyc +0 -0
src/pdf2u/document_il/frontend/il_creater.py +328 -0
src/pdf2u/document_il/il_version_1.py +396 -0
src/pdf2u/document_il/il_version_1.rnc +141 -0
src/pdf2u/document_il/il_version_1.rng +390 -0
src/pdf2u/document_il/il_version_1.xsd +235 -0
src/pdf2u/document_il/midend/__init__.py +0 -0
src/pdf2u/document_il/midend/__pycache__/__init__.cpython-311.pyc +0 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,50 @@

+# Image for the pdf2u Streamlit GUI, built with uv.
+# Ref: https://github.com/fastapi/full-stack-fastapi-template/blob/master/backend/Dockerfile
+FROM python:3.12-slim-bookworm
+# Print logs immediately
+# Ref: https://docs.python.org/3/using/cmdline.html#envvar-PYTHONUNBUFFERED
+ENV PYTHONUNBUFFERED=1
+# Install system dependencies including OpenGL libraries.
+# --no-install-recommends keeps optional packages out of the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    && rm -rf /var/lib/apt/lists/*
+# Change the working directory to the `app` directory
+WORKDIR /app
+# Install uv
+# Ref: https://docs.astral.sh/uv/guides/integration/docker/#installing-uv
+COPY --from=ghcr.io/astral-sh/uv:0.5.18 /uv /uvx /bin/
+# Place executables in the environment at the front of the path
+# Ref: https://docs.astral.sh/uv/guides/integration/docker/#using-the-environment
+ENV PATH="/app/.venv/bin:$PATH"
+# Compile bytecode to speed up the startup time
+# Ref: https://docs.astral.sh/uv/guides/integration/docker/#compiling-bytecode
+ENV UV_COMPILE_BYTECODE=1
+# uv Cache
+# Ref: https://docs.astral.sh/uv/guides/integration/docker/#caching
+ENV UV_LINK_MODE=copy
+# Install dependencies (including the extras the final sync installs) in their own
+# layer, so a source-only change does not reinstall streamlit and friends.
+# Ref: https://docs.astral.sh/uv/guides/integration/docker/#intermediate-layers
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=uv.lock,target=uv.lock \
+    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+    uv sync --frozen --no-install-project --all-extras
+# Copy the project into the image
+COPY . .
+# Sync the project itself; --frozen installs exactly what uv.lock pins instead of
+# re-resolving inside the build.
+# Ref: https://docs.astral.sh/uv/guides/integration/docker/#intermediate-layers
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv sync --frozen --all-extras
+EXPOSE 8501
+# Set the default command
+CMD ["streamlit", "run", "src/pdf2u/gui.py"]

pyproject.toml ADDED Viewed

	@@ -0,0 +1,262 @@

+[build-system]
+requires      = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.version]
+path = "src/pdf2u/__init__.py"
+# FROM: https://hatch.pypa.io/latest/version/
+[tool.hatch.build.targets.wheel]
+packages = ["src/pdf2u"]
+# FROM: https://hatch.pypa.io/latest/build/
+[project]
+name = "pdf2u"
+version = "0.0.4"
+description = "Yet Another Document Translator"
+classifiers = [
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3 :: Only",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+] # FROM: https://pypi.org/classifiers/
+readme = "README.md"
+requires-python = ">=3.10,<3.13"
+license = { file = "LICENSE" }
+authors = [{ name = "A.J.Zeller", email = "[email protected]" }]
+maintainers = [{ name = "A.J.Zeller", email = "[email protected]" }]
+# dynamic = ["version"] # https://hatch.pypa.io/latest/config/metadata/#version
+dependencies = [
+    "bitstring>=4.3.0",
+    "configargparse>=1.7",
+    "httpx[socks]>=0.27.0",
+    "huggingface-hub>=0.27.0",
+    "numpy>=2.0.2",
+    "onnx>=1.17.0",
+    "onnxruntime>=1.16.1",
+    "openai>=1.59.3",
+    "opencv-python>=4.10.0.84",
+    "orjson>=3.10.14",
+    "pdfminer-six>=20240706",
+    "peewee>=3.17.8",
+    "rich>=13.9.4",
+    "toml>=0.10.2",
+    "tqdm>=4.67.1",
+    "xsdata[cli,lxml,soap]>=24.12",
+    "msgpack>=1.1.0",
+    "typer>=0.15.1",
+    "pymupdf==1.24.5",
+]
+[project.urls]
+Homepage = "https://github.com/atticuszeller/pdf2u"
+Issues   = "https://github.com/atticuszeller/pdf2u/issues"
+[project.scripts] # build-backend config needed
+pdf2u = "pdf2u.main:app"
+# FROM: https://packaging.python.org/en/latest/guides/writing-pyproject-toml/
+[project.optional-dependencies]
+gui = ["pypdf2>=3.0.1", "streamlit>=1.42.2", "streamlit-pdf-viewer>=0.0.21"]
+# optional deps for package installation
+[dependency-groups]
+dev = [
+    "ruff>=0.6.3",
+    "mypy>=1.11.2",
+    "pre-commit>=3.8.0",
+    "pytest>=8.3.2",
+    "pytest-sugar>=1.0.0",
+    "coverage>=7.6.1",
+    "git-cliff>=2.6.1",
+    "bump-my-version>=0.28.0",
+    "typos>=1.26.8",
+    "fonttools>=4.56.0",
+]
+## Test
+[tool.mypy]
+strict  = true
+exclude = ["venv", ".venv"]
+[tool.pytest.ini_options]
+# Set additional command line options for pytest
+# Ref: https://docs.pytest.org/en/stable/reference/reference.html#command-line-flags
+addopts        = "-rXs --strict-config --strict-markers --tb=long"
+xfail_strict   = true                                              # Treat tests that are marked as xfail but pass as test failures
+filterwarnings = ["error"]                                         # Treat all warnings as errors
+pythonpath     = "src/pdf2u/"
+[tool.coverage.run]
+branch = true
+[tool.coverage.report]
+skip_covered = true
+show_missing = true
+precision = 2
+exclude_lines = [
+    'def __repr__',
+    'pragma: no cover',
+    'raise NotImplementedError',
+    'if TYPE_CHECKING:',
+    'if typing.TYPE_CHECKING:',
+    '@overload',
+    '@typing.overload',
+    '\(Protocol\):$',
+    'typing.assert_never',
+    'assert_never',
+    'if __name__ == .__main__.:',
+]
+## Linter and formatter
+[tool.ruff]
+# cover and extend the default config in https://docs.astral.sh/ruff/configuration/
+extend-exclude = [""]
+target-version = "py310"
+[tool.ruff.lint]
+select = [
+    "E",      # pycodestyle errors
+    "W",      # pycodestyle warnings
+    "F",      # pyflakes
+    "I",      # isort
+    "B",      # flake8-bugbear
+    "C4",     # flake8-comprehensions
+    "UP",     # pyupgrade
+    "ARG001", # unused arguments in functions
+]
+isort = { combine-as-imports = true, split-on-trailing-comma = false }
+# Avoid trying to fix flake8-bugbear (`B`) violations.
+unfixable = ["B"]
+[tool.ruff.format]
+docstring-code-format     = true
+skip-magic-trailing-comma = true
+# Reference
+# 1. https://github.com/Kludex/python-template/blob/main/template/%7B%7B%20project_slug%20%7D%7D/pyproject.toml.jinja
+# 2. https://github.com/fastapi/full-stack-fastapi-template/blob/master/backend/pyproject.toml
+# 3. https://github.com/pydantic/logfire
+# 4. https://coverage.readthedocs.io/en/latest/index.html
+## VCS
+[tool.git-cliff.remote.github]
+owner = "atticuszeller"
+repo  = "python-uv-package"
+[tool.git-cliff.changelog]
+# template for the changelog header
+header = """
+# Changelog\n
+All notable changes to this project will be documented in this file.\n
+"""
+# template for the changelog body
+# https://keats.github.io/tera/docs/#introduction
+body = """
+{% if version %}\
+    ## {{ version | trim_start_matches(pat="v") }} - {{ timestamp | date(format="%Y-%m-%d") }}
+{% else %}\
+    ## unreleased
+{% endif %}\
+{% for group, commits in commits | group_by(attribute="group") %}
+    ### {{ group | striptags | trim | upper_first }}
+    {% for commit in commits| unique(attribute="message") %}
+        - {% if commit.scope %}*({{ commit.scope }})* {% endif %}\
+            {% if commit.breaking %}[**breaking**] {% endif %}\
+            {{ commit.message | upper_first }}\
+            {% if commit.remote.pr_number %} in #{{ commit.remote.pr_number }}{%- endif %}\
+    {% endfor %}
+{% endfor %}\n
+"""
+# template for the changelog footer
+footer = """
+<!-- generated by git-cliff -->
+"""
+# remove the leading and trailing whitespace
+trim = true
+# postprocessors
+# postprocessors = [
+#   { pattern = '<REPO>', replace = "https://github.com/atticuszeller/python-uv" }, # replace repository URL
+# ]
+# render body even when there are no releases to process
+render_always = true
+# output file path
+output = "CHANGELOG.md"
+[tool.git-cliff.git]
+# parse the commits based on https://www.conventionalcommits.org
+conventional_commits = true
+# filter out the commits that are not conventional
+filter_unconventional = true
+# process each line of a commit as an individual commit
+split_commits = false
+# regex for preprocessing the commit messages
+commit_preprocessors = [
+    # If the spelling is incorrect, it will be automatically fixed.
+    { pattern = '.*', replace_command = 'typos --write-changes -' },
+]
+# regex for parsing and grouping commits
+commit_parsers = [
+    { message = "^feat", group = "<!-- 0 -->🚀 Features" },
+    { message = "^fix", group = "<!-- 1 -->🐛 Bug Fixes" },
+    { message = "^doc", group = "<!-- 3 -->📚 Documentation" },
+    { message = "^perf", group = "<!-- 4 -->⚡ Performance" },
+    { message = "^refactor", group = "<!-- 2 -->🚜 Refactor" },
+    { message = "^style", group = "<!-- 5 -->🎨 Styling" },
+    { message = "^test", group = "<!-- 6 -->🧪 Testing" },
+    { message = "^chore\\(release\\)", skip = true },
+    { message = "^chore\\(deps.*\\)", skip = true },
+    { message = "^chore\\(pr\\)", skip = true },
+    { message = "^chore\\(pull\\)", skip = true },
+    { message = "^chore|^ci", group = "<!-- 7 -->⚙️ Miscellaneous Tasks" },
+    { body = ".*security", group = "<!-- 8 -->🛡️ Security" },
+    { message = "^revert", group = "<!-- 9 -->◀️ Revert" },
+]
+# filter out the commits that are not matched by commit parsers
+filter_commits = false
+# sort the tags topologically
+topo_order = false
+# sort the commits inside sections by oldest/newest order
+sort_commits = "oldest"
+[tool.bumpversion]
+current_version        = "0.0.4"
+parse                  = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)"
+serialize              = ["{major}.{minor}.{patch}"]
+search                 = "{current_version}"
+replace                = "{new_version}"
+regex                  = false
+ignore_missing_version = false
+ignore_missing_files   = false
+tag                    = true
+sign_tags              = false
+tag_name               = "v{new_version}"
+tag_message            = "chore(release): {current_version} → {new_version}"
+allow_dirty            = true                                                  # git-cliff first then bump patch
+commit                 = true
+message                = "chore(release): {current_version} → {new_version}"
+commit_args            = ""
+setup_hooks            = []
+pre_commit_hooks       = []
+post_commit_hooks      = []
+[[tool.bumpversion.files]]
+filename = "src/pdf2u/__init__.py"
+[[tool.bumpversion.files]]
+filename = "pyproject.toml"
+search   = "version = \"{current_version}\""
+replace  = "version = \"{new_version}\""
+[[tool.bumpversion.files]]
+filename = "CHANGELOG.md"
+search   = "unreleased"
+replace  = "{new_version} - {now:%Y-%m-%d}"
+# https://callowayproject.github.io/bump-my-version/reference/search-and-replace-config/

src/pdf2u/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ __version__ = "0.0.4"

src/pdf2u/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (181 Bytes). View file

src/pdf2u/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (175 Bytes). View file

src/pdf2u/__pycache__/const.cpython-311.pyc ADDED Viewed

Binary file (519 Bytes). View file

src/pdf2u/__pycache__/const.cpython-312.pyc ADDED Viewed

Binary file (855 Bytes). View file

src/pdf2u/__pycache__/converter.cpython-311.pyc ADDED Viewed

Binary file (13.8 kB). View file

src/pdf2u/__pycache__/converter.cpython-312.pyc ADDED Viewed

Binary file (12.7 kB). View file

src/pdf2u/__pycache__/high_level.cpython-311.pyc ADDED Viewed

Binary file (21.2 kB). View file

src/pdf2u/__pycache__/high_level.cpython-312.pyc ADDED Viewed

Binary file (18.6 kB). View file

src/pdf2u/__pycache__/io.cpython-312.pyc ADDED Viewed

Binary file (583 Bytes). View file

src/pdf2u/__pycache__/main.cpython-311.pyc ADDED Viewed

Binary file (13.3 kB). View file

src/pdf2u/__pycache__/main.cpython-312.pyc ADDED Viewed

Binary file (13.4 kB). View file

src/pdf2u/__pycache__/pdfinterp.cpython-311.pyc ADDED Viewed

Binary file (23.7 kB). View file

src/pdf2u/__pycache__/pdfinterp.cpython-312.pyc ADDED Viewed

Binary file (21.5 kB). View file

src/pdf2u/__pycache__/progress_monitor.cpython-311.pyc ADDED Viewed

Binary file (9.5 kB). View file

src/pdf2u/__pycache__/progress_monitor.cpython-312.pyc ADDED Viewed

Binary file (8.69 kB). View file

src/pdf2u/__pycache__/translation_config.cpython-311.pyc ADDED Viewed

Binary file (8.22 kB). View file

src/pdf2u/__pycache__/translation_config.cpython-312.pyc ADDED Viewed

Binary file (7.45 kB). View file

src/pdf2u/asynchronize/__init__.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import asyncio
+import time
+class Args:
+    def __init__(self, args, kwargs):
+        self.args = args
+        self.kwargs = kwargs
+class AsyncCallback:
+    def __init__(self):
+        self.queue = asyncio.Queue()
+        self.finished = False
+        self.loop = asyncio.get_event_loop()
+    def step_callback(self, *args, **kwargs):
+        # Whenever a step is called, add to the queue but don't set finished to True, so __anext__ will continue
+        args = Args(args, kwargs)
+        # We have to use the threadsafe call so that it wakes up the event loop, in case it's sleeping:
+        # https://stackoverflow.com/a/49912853/2148718
+        self.loop.call_soon_threadsafe(self.queue.put_nowait, args)
+        # Add a small delay to release the GIL, ensuring the event loop has time to process messages
+        time.sleep(0.01)
+    def finished_callback(self, *args, **kwargs):
+        # Whenever a finished is called, add to the queue as with step, but also set finished to True, so __anext__
+        # will terminate after processing the remaining items
+        if self.finished:
+            return
+        self.step_callback(*args, **kwargs)
+        self.finished = True
+    def __await__(self):
+        # Since this implements __anext__, this can return itself
+        return self.queue.get().__await__()
+    def __aiter__(self):
+        # Since this implements __anext__, this can return itself
+        return self
+    async def __anext__(self):
+        # Keep waiting for the queue if a) we haven't finished, or b) if the queue is still full. This lets us finish
+        # processing the remaining items even after we've finished
+        if self.finished and self.queue.empty():
+            raise StopAsyncIteration
+        result = await self.queue.get()
+        return result

src/pdf2u/asynchronize/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (2.69 kB). View file

src/pdf2u/asynchronize/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (2.5 kB). View file

src/pdf2u/const.py ADDED Viewed

	@@ -0,0 +1,14 @@

+from enum import StrEnum
+from pathlib import Path
+CACHE_FOLDER = Path.home() / ".cache" / "pdf2u"
+def get_cache_file_path(filename: str) -> Path:
+    return CACHE_FOLDER / filename
+class TranslationService(StrEnum):
+    OPENAI: str = "openai"
+    GOOGLE: str = "google"
+    BING: str = "bing"

src/pdf2u/converter.py ADDED Viewed

	@@ -0,0 +1,493 @@

+import base64
+import logging
+import re
+import unicodedata
+import numpy as np
+from pdfminer.converter import PDFConverter
+from pdfminer.layout import LTChar, LTComponent, LTFigure, LTLine, LTPage, LTText
+from pdfminer.pdfcolor import PDFColorSpace
+from pdfminer.pdffont import PDFCIDFont, PDFFont, PDFUnicodeNotDefined
+from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
+from pdfminer.utils import Matrix, apply_matrix_pt, bbox2str, matrix2str, mult_matrix
+from pymupdf import Font
+from pdf2u.document_il.frontend.il_creater import ILCreater
+log = logging.getLogger(__name__)
+class PDFConverterEx(PDFConverter):
+    def __init__(
+        self, rsrcmgr: PDFResourceManager, il_creater: ILCreater | None = None
+    ) -> None:
+        PDFConverter.__init__(self, rsrcmgr, None, "utf-8", 1, None)
+        self.il_creater = il_creater
+    def begin_page(self, page, ctm) -> None:
+        # 重载替换 cropbox
+        (x0, y0, x1, y1) = page.cropbox
+        (x0, y0) = apply_matrix_pt(ctm, (x0, y0))
+        (x1, y1) = apply_matrix_pt(ctm, (x1, y1))
+        mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1))
+        self.il_creater.on_page_media_box(
+            mediabox[0], mediabox[1], mediabox[2], mediabox[3]
+        )
+        self.il_creater.on_page_number(page.pageno)
+        self.cur_item = LTPage(page.pageno, mediabox)
+    def end_page(self, _page) -> None:
+        # 重载返回指令流
+        return self.receive_layout(self.cur_item)
+    def begin_figure(self, name, bbox, matrix) -> None:
+        # 重载设置 pageid
+        self._stack.append(self.cur_item)
+        self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
+        self.cur_item.pageid = self._stack[-1].pageid
+    def end_figure(self, _: str) -> None:
+        # 重载返回指令流
+        fig = self.cur_item
+        if not isinstance(self.cur_item, LTFigure):
+            raise ValueError(f"Unexpected item type: {type(self.cur_item)}")
+        self.cur_item = self._stack.pop()
+        self.cur_item.add(fig)
+        return self.receive_layout(fig)
+    def render_char(
+        self,
+        matrix,
+        font,
+        fontsize: float,
+        scaling: float,
+        rise: float,
+        cid: int,
+        ncs,
+        graphicstate: PDFGraphicState,
+    ) -> float:
+        # 重载设置 cid 和 font
+        try:
+            text = font.to_unichr(cid)
+            if not isinstance(text, str):
+                raise TypeError(f"Expected string, got {type(text)}")
+        except PDFUnicodeNotDefined:
+            text = self.handle_undefined_char(font, cid)
+        textwidth = font.char_width(cid)
+        textdisp = font.char_disp(cid)
+        font_name = font.fontname
+        if isinstance(font_name, bytes):
+            try:
+                font_name = font_name.decode("utf-8")
+            except UnicodeDecodeError:
+                font_name = "BASE64:" + base64.b64encode(font_name).decode("utf-8")
+        font_id = self.il_creater.current_page_font_name_id_map[font_name]
+        item = AWLTChar(
+            matrix,
+            font,
+            fontsize,
+            scaling,
+            rise,
+            text,
+            textwidth,
+            textdisp,
+            ncs,
+            graphicstate,
+            self.il_creater.xobj_id,
+            font_id,
+        )
+        self.cur_item.add(item)
+        item.cid = cid  # hack 插入原字符编码
+        item.font = font  # hack 插入原字符字体
+        return item.adv
+class AWLTChar(LTChar):
+    """Actual letter in the text as a Unicode string."""
+    def __init__(
+        self,
+        matrix: Matrix,
+        font: PDFFont,
+        fontsize: float,
+        scaling: float,
+        rise: float,
+        text: str,
+        textwidth: float,
+        textdisp: float | tuple[float | None, float],
+        ncs: PDFColorSpace,
+        graphicstate: PDFGraphicState,
+        xobj_id: int,
+        font_id: str,
+    ) -> None:
+        LTText.__init__(self)
+        self._text = text
+        self.matrix = matrix
+        self.fontname = font.fontname
+        self.ncs = ncs
+        self.graphicstate = graphicstate
+        self.xobj_id = xobj_id
+        self.adv = textwidth * fontsize * scaling
+        self.aw_font_id = font_id
+        # compute the boundary rectangle.
+        if font.is_vertical():
+            # vertical
+            assert isinstance(textdisp, tuple)
+            (vx, vy) = textdisp
+            if vx is None:
+                vx = fontsize * 0.5
+            else:
+                vx = vx * fontsize * 0.001
+            vy = (1000 - vy) * fontsize * 0.001
+            bbox_lower_left = (-vx, vy + rise + self.adv)
+            bbox_upper_right = (-vx + fontsize, vy + rise)
+        else:
+            # horizontal
+            descent = font.get_descent() * fontsize
+            bbox_lower_left = (0, descent + rise)
+            bbox_upper_right = (self.adv, descent + rise + fontsize)
+        (a, b, c, d, e, f) = self.matrix
+        self.upright = a * d * scaling > 0 and b * c <= 0
+        (x0, y0) = apply_matrix_pt(self.matrix, bbox_lower_left)
+        (x1, y1) = apply_matrix_pt(self.matrix, bbox_upper_right)
+        if x1 < x0:
+            (x0, x1) = (x1, x0)
+        if y1 < y0:
+            (y0, y1) = (y1, y0)
+        LTComponent.__init__(self, (x0, y0, x1, y1))
+        if font.is_vertical() or matrix[0] == 0:
+            self.size = self.width
+        else:
+            self.size = self.height
+        return
+    def __repr__(self) -> str:
+        return f"<{self.__class__.__name__} {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)} font={self.fontname!r} adv={self.adv} text={self.get_text()!r}>"
+    def get_text(self) -> str:
+        return self._text
+class Paragraph:
+    def __init__(self, y, x, x0, x1, size, brk):
+        self.y: float = y  # 初始纵坐标
+        self.x: float = x  # 初始横坐标
+        self.x0: float = x0  # 左边界
+        self.x1: float = x1  # 右边界
+        self.size: float = size  # 字体大小
+        self.brk: bool = brk  # 换行标记
+# fmt: off
+class TranslateConverter(PDFConverterEx):
+    def __init__(
+        self,
+        rsrcmgr,
+        vfont: str | None = None,
+        vchar: str | None = None,
+        thread: int = 0,
+        layout: dict | None = None,
+        lang_in: str = "",  # 保留参数但添加未使用标记
+        _lang_out: str = "",  # 改为未使用参数
+        _service: str = "",  # 改为未使用参数
+        resfont: str = "",
+        noto: Font | None = None,
+        envs: dict | None = None,
+        _prompt: list | None = None,  # 改为未使用参数
+        il_creater: ILCreater | None = None,
+    ):
+        layout = layout or {}
+        super().__init__(rsrcmgr, il_creater)
+        self.vfont = vfont
+        self.vchar = vchar
+        self.thread = thread
+        self.layout = layout
+        self.resfont = resfont
+        self.noto = noto
+    def receive_layout(self, ltpage: LTPage):
+        # 段落
+        sstk: list[str] = []            # 段落文字栈
+        pstk: list[Paragraph] = []      # 段落属性栈
+        vbkt: int = 0                   # 段落公式括号计数
+        # 公式组
+        vstk: list[LTChar] = []         # 公式符号组
+        vlstk: list[LTLine] = []        # 公式线条组
+        vfix: float = 0                 # 公式纵向偏移
+        # 公式组栈
+        var: list[list[LTChar]] = []    # 公式符号组栈
+        varl: list[list[LTLine]] = []   # 公式线条组栈
+        varf: list[float] = []          # 公式纵向偏移栈
+        vlen: list[float] = []          # 公式宽度栈
+        # 全局
+        lstk: list[LTLine] = []         # 全局线条栈
+        xt: LTChar = None               # 上一个字符
+        xt_cls: int = -1                # 上一个字符所属段落，保证无论第一个字符属于哪个类别都可以触发新段落
+        vmax: float = ltpage.width / 4  # 行内公式最大宽度
+        ops: str = ""                   # 渲染结果
+        def vflag(font: str, char: str):    # match formula (and sub/superscript) fonts; returns True/False
+            if isinstance(font, bytes):     # may not be decodable, so convert with str() directly
+                font = str(font)
+            font = font.split("+")[-1]      # keep only the part of the font name after the last "+"
+            if re.match(r"\(cid:", char):
+                return True
+            # Decision based on font-name rules: a user-supplied pattern (self.vfont)
+            # replaces the built-in one when set.
+            if self.vfont:
+                if re.match(self.vfont, font):
+                    return True
+            else:
+                if re.match(                                            # latex fonts
+                    r"(CM[^R]|(MS|XY|MT|BL|RM|EU|LA|RS)[A-Z]|LINE|LCIRCLE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)",
+                    font,
+                ):
+                    return True
+            # Decision based on character-set rules: a user-supplied pattern (self.vchar)
+            # replaces the built-in check when set.
+            if self.vchar:
+                if re.match(self.vchar, char):
+                    return True
+            else:
+                if (
+                    char
+                    and char != " "                                     # not a space
+                    and (
+                        unicodedata.category(char[0])
+                        in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"]   # modifier letters/marks, math symbols, separators
+                        or ord(char[0]) in range(0x370, 0x400)          # Greek letters
+                    )
+                ):
+                    return True
+            return False
+        ############################################################
+        # A. 原文档解析
+        for child in ltpage:
+            if isinstance(child, LTChar):
+                self.il_creater.on_lt_char(child)
+                continue
+                cur_v = False
+                layout = self.layout[ltpage.pageid]
+                # ltpage.height 可能是 fig 里面的高度，这里统一用 layout.shape
+                h, w = layout.shape
+                # 读取当前字符在 layout 中的类别
+                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
+                cls = layout[cy, cx]
+                # 锚定文档中 bullet 的位置
+                if child.get_text() == "•":
+                    cls = 0
+                # 判定当前字符是否属于公式
+                if (                                                                                        # 判定当前字符是否属于公式
+                    cls == 0                                                                                # 1. 类别为保留区域
+                    or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1].size * 0.79)  # 2. 角标字体，有 0.76 的角标和 0.799 的大写，这里用 0.79 取中，同时考虑首字母放大的情况
+                    or vflag(child.fontname, child.get_text())                                              # 3. 公式字体
+                    or (child.matrix[0] == 0 and child.matrix[3] == 0)                                      # 4. 垂直字体
+                ):
+                    cur_v = True
+                # 判定括号组是否属于公式
+                if not cur_v:
+                    if vstk and child.get_text() == "(":
+                        cur_v = True
+                        vbkt += 1
+                    if vbkt and child.get_text() == ")":
+                        cur_v = True
+                        vbkt -= 1
+                if (                                                        # 判定当前公式是否结束
+                    not cur_v                                               # 1. 当前字符不属于公式
+                    or cls != xt_cls                                        # 2. 当前字符与前一个字符不属于同一段落
+                    # or (abs(child.x0 - xt.x0) > vmax and cls != 0)        # 3. 段落内换行，可能是一长串斜体的段落，也可能是段内分式换行，这里设个阈值进行区分
+                    # 禁止纯公式（代码）段落换行，直到文字开始再重开文字段落，保证只存在两种情况
+                    # A. 纯公式（代码）段落（锚定绝对位置）sstk[-1]=="" -> sstk[-1]=="{v*}"
+                    # B. 文字开头段落（排版相对位置）sstk[-1]!=""
+                    or (sstk[-1] != "" and abs(child.x0 - xt.x0) > vmax)    # 因为 cls==xt_cls==0 一定有 sstk[-1]==""，所以这里不需要再判定 cls!=0
+                ):
+                    if vstk:
+                        if (                                                # 根据公式右侧的文字修正公式的纵向偏移
+                            not cur_v                                       # 1. 当前字符不属于公式
+                            and cls == xt_cls                               # 2. 当前字符与前一个字符属于同一段落
+                            and child.x0 > max([vch.x0 for vch in vstk])    # 3. 当前字符在公式右侧
+                        ):
+                            vfix = vstk[0].y0 - child.y0
+                        if sstk[-1] == "":
+                            xt_cls = -1 # 禁止纯公式段落（sstk[-1]=="{v*}"）的后续连接，但是要考虑新字符和后续字符的连接，所以这里修改的是上个字符的类别
+                        sstk[-1] += f"{{v{len(var)}}}"
+                        var.append(vstk)
+                        varl.append(vlstk)
+                        varf.append(vfix)
+                        vstk = []
+                        vlstk = []
+                        vfix = 0
+                # 当前字符不属于公式或当前字符是公式的第一个字符
+                if not vstk:
+                    if cls == xt_cls:               # 当前字符与前一个字符属于同一段落
+                        if child.x0 > xt.x1 + 1:    # 添加行内空格
+                            sstk[-1] += " "
+                        elif child.x1 < xt.x0:      # 添加换行空格并标记原文段落存在换行
+                            sstk[-1] += " "
+                            pstk[-1].brk = True
+                    else:                           # 根据当前字符构建一个新的段落
+                        sstk.append("")
+                        pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, False))
+                if not cur_v:                                               # 文字入栈
+                    if (                                                    # 根据当前字符修正段落属性
+                        child.size > pstk[-1].size / 0.79                   # 1. 当前字符显著比段落字体大
+                        or len(sstk[-1].strip()) == 1                       # 2. 当前字符为段落第二个文字（考虑首字母放大的情况）
+                    ) and child.get_text() != " ":                          # 3. 当前字符不是空格
+                        pstk[-1].y -= child.size - pstk[-1].size            # 修正段落初始纵坐标，假设两个不同大小字符的上边界对齐
+                        pstk[-1].size = child.size
+                    sstk[-1] += child.get_text()
+                else:                                                       # 公式入栈
+                    if (                                                    # 根据公式左侧的文字修正公式的纵向偏移
+                        not vstk                                            # 1. 当前字符是公式的第一个字符
+                        and cls == xt_cls                                   # 2. 当前字符与前一个字符属于同一段落
+                        and child.x0 > xt.x0                                # 3. 前一个字符在公式左侧
+                    ):
+                        vfix = child.y0 - xt.y0
+                    vstk.append(child)
+                # 更新段落边界，因为段落内换行之后可能是公式开头，所以要在外边处理
+                pstk[-1].x0 = min(pstk[-1].x0, child.x0)
+                pstk[-1].x1 = max(pstk[-1].x1, child.x1)
+                # 更新上一个字符
+                xt = child
+                xt_cls = cls
+            elif isinstance(child, LTFigure):
+                # 图表
+                self.il_creater.on_pdf_figure(child)
+                pass
+            elif isinstance(child, LTLine):     # 线条
+                continue
+                layout = self.layout[ltpage.pageid]
+                # ltpage.height 可能是 fig 里面的高度，这里统一用 layout.shape
+                h, w = layout.shape
+                # 读取当前线条在 layout 中的类别
+                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
+                cls = layout[cy, cx]
+                if vstk and cls == xt_cls:      # 公式线条
+                    vlstk.append(child)
+                else:                           # 全局线条
+                    lstk.append(child)
+            else:
+                pass
+        return
+        # 处理结尾
+        if vstk:    # 公式出栈
+            sstk[-1] += f"{{v{len(var)}}}"
+            var.append(vstk)
+            varl.append(vlstk)
+            varf.append(vfix)
+        log.debug("\n==========[VSTACK]==========\n")
+        for var_id, v in enumerate(var):  # 计算公式宽度
+            l = max([vch.x1 for vch in v]) - v[0].x0
+            log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[var_id])} > v{var_id} = {"".join([ch.get_text() for ch in v])}')
+            vlen.append(l)
+        ############################################################
+        # B. 段落翻译
+        log.debug("\n==========[SSTACK]==========\n")
+        news = sstk.copy()
+        ############################################################
+        # C. 新文档排版
+        def raw_string(fcur: str, cstk: str):  # Encode a string as hexadecimal digits
+            # Every character of cstk becomes one fixed-width hex code; the width depends on the font id.
+            if fcur == 'noto':
+                # For 'noto' the value returned by self.noto.has_glyph() is what gets encoded (4 hex digits).
+                return "".join([f"{self.noto.has_glyph(ord(c)):04x}" for c in cstk])
+            elif isinstance(self.fontmap[fcur], PDFCIDFont):  # Decide the code length from the font type
+                return "".join([f"{ord(c):04x}" for c in cstk])
+            else:
+                return "".join([f"{ord(c):02x}" for c in cstk])
+        _x, _y = 0, 0
+        for para_id, new in enumerate(news):
+            x: float = pstk[para_id].x           # 段落初始横坐标
+            y: float = pstk[para_id].y           # 段落初始纵坐标
+            x0: float = pstk[para_id].x0         # 段落左边界
+            x1: float = pstk[para_id].x1         # 段落右边界
+            size: float = pstk[para_id].size     # 段落字体大小
+            brk: bool = pstk[para_id].brk        # 段落换行标记
+            cstk: str = ""                  # 当前文字栈
+            fcur: str = None                # 当前字体 ID
+            tx = x
+            fcur_ = fcur
+            ptr = 0
+            log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[para_id]} | {new}")
+            while ptr < len(new):
+                vy_regex = re.match(
+                    r"\{\s*v([\d\s]+)\}", new[ptr:], re.IGNORECASE,
+                )  # 匹配 {vn} 公式标记
+                mod = 0  # 文字修饰符
+                if vy_regex:  # 加载公式
+                    ptr += len(vy_regex.group(0))
+                    try:
+                        vid = int(vy_regex.group(1).replace(" ", ""))
+                        adv = vlen[vid]
+                    except Exception as e:
+                        log.debug("Skipping formula placeholder due to: %s", e)
+                        continue  # 翻译器可能会自动补个越界的公式标记
+                    if var[vid][-1].get_text() and unicodedata.category(var[vid][-1].get_text()[0]) in ["Lm", "Mn", "Sk"]:  # 文字修饰符
+                        mod = var[vid][-1].width
+                else:  # 加载文字
+                    ch = new[ptr]
+                    fcur_ = None
+                    try:
+                        if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
+                            fcur_ = "tiro"  # 默认拉丁字体
+                    except Exception:
+                        pass
+                    if fcur_ is None:
+                        fcur_ = self.resfont  # 默认非拉丁字体
+                    if fcur_ == 'noto':
+                        adv = self.noto.char_lengths(ch, size)[0]
+                    else:
+                        adv = self.fontmap[fcur_].char_width(ord(ch)) * size
+                    ptr += 1
+                if (                                # 输出文字缓冲区
+                    fcur_ != fcur                   # 1. 字体更新
+                    or vy_regex                     # 2. 插入公式
+                    or x + adv > x1 + 0.1 * size    # 3. 到达右边界（可能一整行都被符号化，这里需要考虑浮点误差）
+                ):
+                    if cstk:
+                        ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
+                        cstk = ""
+                if brk and x + adv > x1 + 0.1 * size:  # 到达右边界且原文段落存在换行
+                    x = x0
+                    lang_space = {"zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
+                    # y -= size * lang_space.get(self.translator.lang_out.lower(), 1.1)  # 小语种大多适配 1.1
+                    y -= size * 1.4
+                if vy_regex:  # 插入公式
+                    fix = 0
+                    if fcur is not None:  # 段落内公式修正纵向偏移
+                        fix = varf[vid]
+                    for vch in var[vid]:  # 排版公式字符
+                        vc = chr(vch.cid)
+                        ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm <{raw_string(self.fontid[vch.font], vc)}> TJ "
+                        if log.isEnabledFor(logging.DEBUG):
+                            lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
+                            _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
+                    for l in varl[vid]:  # 排版公式线条
+                        if l.linewidth < 5:  # hack 有的文档会用粗线条当图片背景
+                            ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
+                else:  # 插入文字缓冲区
+                    if not cstk:  # 单行开头
+                        tx = x
+                        if x == x0 and ch == " ":  # 消除段落换行空格
+                            adv = 0
+                        else:
+                            cstk += ch
+                    else:
+                        cstk += ch
+                adv -= mod # 文字修饰符
+                fcur = fcur_
+                x += adv
+                if log.isEnabledFor(logging.DEBUG):
+                    lstk.append(LTLine(0.1, (_x, _y), (x, y)))
+                    _x, _y = x, y
+            # 处理结尾
+            if cstk:
+                ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm <{raw_string(fcur, cstk)}> TJ "
+        for l in lstk:  # 排版全局线条
+            if l.linewidth < 5:  # hack 有的文档会用粗线条当图片背景
+                ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
+        ops = f"BT {ops}ET "
+        return ops

src/pdf2u/document_il/__init__.py ADDED Viewed

	@@ -0,0 +1,45 @@

+from pdf2u.document_il.il_version_1 import (
+    BaseOperations,
+    Box,
+    Cropbox,
+    Document,
+    GraphicState,
+    Mediabox,
+    Page,
+    PageLayout,
+    PdfCharacter,
+    PdfFigure,
+    PdfFont,
+    PdfFormula,
+    PdfLine,
+    PdfParagraph,
+    PdfParagraphComposition,
+    PdfRectangle,
+    PdfSameStyleCharacters,
+    PdfSameStyleUnicodeCharacters,
+    PdfStyle,
+    PdfXobject,
+)
+# Public names of this package: exactly the names imported from il_version_1 above.
+__all__ = [
+    "BaseOperations",
+    "Box",
+    "Cropbox",
+    "Document",
+    "GraphicState",
+    "Mediabox",
+    "Page",
+    "PageLayout",
+    "PdfCharacter",
+    "PdfFigure",
+    "PdfFont",
+    "PdfFormula",
+    "PdfLine",
+    "PdfParagraph",
+    "PdfParagraphComposition",
+    "PdfRectangle",
+    "PdfSameStyleCharacters",
+    "PdfSameStyleUnicodeCharacters",
+    "PdfStyle",
+    "PdfXobject",
+]

src/pdf2u/document_il/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (912 Bytes). View file

src/pdf2u/document_il/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (706 Bytes). View file

src/pdf2u/document_il/__pycache__/il_version_1.cpython-311.pyc ADDED Viewed

Binary file (22 kB). View file

src/pdf2u/document_il/__pycache__/il_version_1.cpython-312.pyc ADDED Viewed

Binary file (17.1 kB). View file

src/pdf2u/document_il/__pycache__/xml_converter.cpython-311.pyc ADDED Viewed

Binary file (4.42 kB). View file

src/pdf2u/document_il/__pycache__/xml_converter.cpython-312.pyc ADDED Viewed

Binary file (3.81 kB). View file

src/pdf2u/document_il/backend/__init__.py ADDED Viewed

File without changes

src/pdf2u/document_il/backend/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (181 Bytes). View file

src/pdf2u/document_il/backend/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (169 Bytes). View file

src/pdf2u/document_il/backend/__pycache__/pdf_creater.cpython-311.pyc ADDED Viewed

Binary file (19.8 kB). View file

src/pdf2u/document_il/backend/__pycache__/pdf_creater.cpython-312.pyc ADDED Viewed

Binary file (18.5 kB). View file

src/pdf2u/document_il/backend/pdf_creater.py ADDED Viewed

	@@ -0,0 +1,405 @@

+import logging
+import re
+from pathlib import Path
+import pymupdf
+from bitstring import BitStream
+from pdf2u.document_il import il_version_1
+from pdf2u.document_il.utils.fontmap import FontMapper
+from pdf2u.translation_config import TranslateResult, TranslationConfig
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+# Stage names passed to progress_monitor.stage_start() by PDFCreater.write().
+SUBSET_FONT_STAGE_NAME = "Subset font"
+SAVE_PDF_STAGE_NAME = "Save PDF"
+class PDFCreater:
+    stage_name = "Generate drawing instructions"
+    def __init__(
+        self,
+        original_pdf_path: str,
+        document: il_version_1.Document,
+        translation_config: TranslationConfig,
+    ):
+        self.original_pdf_path = original_pdf_path
+        self.docs = document
+        self.font_path = translation_config.font
+        self.font_mapper = FontMapper(translation_config)
+        self.translation_config = translation_config
+    def render_graphic_state(
+        self, draw_op: BitStream, graphic_state: il_version_1.GraphicState
+    ):
+        if graphic_state is None:
+            return
+        # if graphic_state.stroking_color_space_name:
+        #     draw_op.append(
+        #         f"/{graphic_state.stroking_color_space_name} CS \n".encode()
+        #     )
+        # if graphic_state.non_stroking_color_space_name:
+        #     draw_op.append(
+        #         f"/{graphic_state.non_stroking_color_space_name}"
+        #         f" cs \n".encode()
+        #     )
+        # if graphic_state.ncolor is not None:
+        #     if len(graphic_state.ncolor) == 1:
+        #         draw_op.append(f"{graphic_state.ncolor[0]} g \n".encode())
+        #     elif len(graphic_state.ncolor) == 3:
+        #         draw_op.append(
+        #             f"{' '.join((str(x) for x in graphic_state.ncolor))} sc \n".encode()
+        #         )
+        # if graphic_state.scolor is not None:
+        #     if len(graphic_state.scolor) == 1:
+        #         draw_op.append(f"{graphic_state.scolor[0]} G \n".encode())
+        #     elif len(graphic_state.scolor) == 3:
+        #         draw_op.append(
+        #             f"{' '.join((str(x) for x in graphic_state.scolor))} SC \n".encode()
+        #         )
+        if graphic_state.passthrough_per_char_instruction:
+            draw_op.append(
+                f"{graphic_state.passthrough_per_char_instruction} \n".encode()
+            )
+    def render_paragraph_to_char(
+        self, paragraph: il_version_1.PdfParagraph
+    ) -> list[il_version_1.PdfCharacter]:
+        chars = []
+        for composition in paragraph.pdf_paragraph_composition:
+            if not isinstance(composition.pdf_character, il_version_1.PdfCharacter):
+                logger.error(
+                    f"Unknown composition type. "
+                    f"This type only appears in the IL "
+                    f"after the translation is completed."
+                    f"During pdf rendering, this type is not supported."
+                    f"Composition: {composition}. "
+                    f"Paragraph: {paragraph}. "
+                )
+                continue
+            chars.append(composition.pdf_character)
+        if not chars and paragraph.unicode:
+            logger.error(
+                f"Unable to export paragraphs that have "
+                f"not yet been formatted: {paragraph}"
+            )
+            return chars
+        return chars
+    def get_available_font_list(self, pdf, page):
+        page_xref_id = pdf[page.page_number].xref
+        return self.get_xobj_available_fonts(page_xref_id, pdf)
+    def get_xobj_available_fonts(self, page_xref_id, pdf):
+        resources_type, r_id = pdf.xref_get_key(page_xref_id, "Resources")
+        if resources_type == "xref":
+            resource_xref_id = re.search("(\\d+) 0 R", r_id).group(1)
+            r_id = pdf.xref_object(int(resource_xref_id))
+            resources_type = "dict"
+        if resources_type == "dict":
+            xref_id = re.search("/Font (\\d+) 0 R", r_id)
+            if xref_id is not None:
+                xref_id = xref_id.group(1)
+                font_dict = pdf.xref_object(int(xref_id))
+            else:
+                search = re.search("/Font *<<(.+?)>>", r_id.replace("\n", " "))
+                if search is None:
+                    # Have resources but no fonts
+                    return set()
+                font_dict = search.group(1)
+        else:
+            r_id = int(r_id.split(" ")[0])
+            _, font_dict = pdf.xref_get_key(r_id, "Font")
+        fonts = re.findall("/([^ ]+?) ", font_dict)
+        return set(fonts)
+    def _debug_render_rectangle(
+        self, draw_op: BitStream, rectangle: il_version_1.PdfRectangle
+    ):
+        """Draw a debug rectangle in PDF for visualization purposes.
+        Args:
+            draw_op: BitStream to append PDF drawing operations
+            rectangle: Rectangle object containing position information
+        """
+        x1 = rectangle.box.x
+        y1 = rectangle.box.y
+        x2 = rectangle.box.x2
+        y2 = rectangle.box.y2
+        # Save graphics state
+        draw_op.append(b"q ")
+        # Set green color for debug visibility
+        draw_op.append(
+            rectangle.graphic_state.passthrough_per_char_instruction.encode()
+        )  # Green stroke
+        draw_op.append(b" 1 w ")  # Line width
+        # Draw four lines manually
+        # Bottom line
+        draw_op.append(f"{x1} {y1} m {x2} {y1} l S ".encode())
+        # Right line
+        draw_op.append(f"{x2} {y1} m {x2} {y2} l S ".encode())
+        # Top line
+        draw_op.append(f"{x2} {y2} m {x1} {y2} l S ".encode())
+        # Left line
+        draw_op.append(f"{x1} {y2} m {x1} {y1} l S ".encode())
+        # Restore graphics state
+        draw_op.append(b"Q\n")
+    def write_debug_info(
+        self, pdf: pymupdf.Document, translation_config: TranslationConfig
+    ):
+        """Overlay debug characters and rectangles onto the pages of ``pdf``.
+
+        For every page in ``self.docs`` the existing content stream is wrapped
+        in ``q ... Q`` and followed by drawing instructions for each character
+        and rectangle flagged with ``debug_info``; the result is written back
+        to the same stream. Fonts are subset once all pages are processed.
+
+        Args:
+            pdf: Open PyMuPDF document, modified in place.
+            translation_config: Used for cancellation checks.
+        """
+        self.font_mapper.add_font(pdf, self.docs)
+        for page in self.docs.page:
+            _, r_id = pdf.xref_get_key(pdf[page.page_number].xref, "Contents")
+            # NOTE(review): only the first "N 0 R" reference found in /Contents is
+            # used; if /Contents lists several streams the others appear to be
+            # left untouched, and a value without a reference raises here — confirm.
+            resource_xref_id = re.search("(\\d+) 0 R", r_id).group(1)
+            base_op = pdf.xref_stream(int(resource_xref_id))
+            translation_config.raise_if_cancelled()
+            # NOTE(review): the three xobj_* dicts below are never read in this method.
+            xobj_available_fonts = {}
+            xobj_draw_ops = {}
+            xobj_encoding_length_map = {}
+            available_font_list = self.get_available_font_list(pdf, page)
+            page_encoding_length_map = {
+                f.font_id: f.encoding_length for f in page.pdf_font
+            }
+            page_op = BitStream()
+            # q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}
+            page_op.append(b"q ")
+            if base_op is not None:
+                page_op.append(base_op)
+            page_op.append(b" Q ")
+            page_op.append(
+                f"q Q 1 0 0 1 {page.cropbox.box.x} {page.cropbox.box.y} cm \n".encode()
+            )
+            # Collect all characters
+            chars = []
+            # First add the page-level characters
+            if page.pdf_character:
+                chars.extend(page.pdf_character)
+            # Then add the characters inside paragraphs
+            for paragraph in page.pdf_paragraph:
+                chars.extend(self.render_paragraph_to_char(paragraph))
+            # Render all characters
+            for char in chars:
+                # Only characters explicitly flagged as debug output are drawn
+                if not getattr(char, "debug_info", False):
+                    continue
+                if char.char_unicode == "\n":
+                    continue
+                if char.pdf_character_id is None:
+                    # dummy char
+                    continue
+                char_size = char.pdf_style.font_size
+                font_id = char.pdf_style.font_id
+                # Skip characters whose font is not among the page's font resources
+                if font_id not in available_font_list:
+                    continue
+                draw_op = page_op
+                encoding_length_map = page_encoding_length_map
+                draw_op.append(b"q ")
+                self.render_graphic_state(draw_op, char.pdf_style.graphic_state)
+                if char.vertical:
+                    # Vertical characters use the text matrix 0 1 -1 0, anchored at (x2, y)
+                    draw_op.append(
+                        f"BT /{font_id} {char_size:f} Tf 0 1 -1 0 {char.box.x2:f} {char.box.y:f} Tm ".encode()
+                    )
+                else:
+                    draw_op.append(
+                        f"BT /{font_id} {char_size:f} Tf 1 0 0 1 {char.box.x:f} {char.box.y:f} Tm ".encode()
+                    )
+                # NOTE(review): raises KeyError if font_id is missing from page.pdf_font
+                encoding_length = encoding_length_map[font_id]
+                # pdf32000-2008 page14:
+                # As hexadecimal data enclosed in angle brackets < >
+                # see 7.3.4.3, "Hexadecimal Strings."
+                draw_op.append(
+                    f"<{char.pdf_character_id:0{encoding_length * 2}x}>".upper().encode()
+                )
+                draw_op.append(b" Tj ET Q \n")
+            for rect in page.pdf_rectangle:
+                if not rect.debug_info:
+                    continue
+                self._debug_render_rectangle(page_op, rect)
+            draw_op = page_op
+            # Since this is a draw instruction container,
+            # no additional information is needed
+            pdf.update_stream(int(resource_xref_id), draw_op.tobytes())
+        translation_config.raise_if_cancelled()
+        pdf.subset_fonts(fallback=False)
+    def write(self, translation_config: TranslationConfig) -> TranslateResult:
+        """Render ``self.docs`` into PDF files and return their paths.
+
+        Runs three progress stages: generating drawing instructions for every
+        page, subsetting fonts, and saving. Unless ``no_mono`` is set, a
+        translated-only ("mono") PDF is saved; unless ``no_dual`` is set, a
+        "dual" PDF is saved in which original and translated pages are
+        interleaved.
+
+        Args:
+            translation_config: Supplies the input file name, output paths,
+                flags (debug, skip_clean, no_mono, no_dual,
+                dual_translate_first) and cancellation checks.
+
+        Returns:
+            TranslateResult built from the mono output path and the dual
+            output path (the latter is None when ``no_dual`` is set).
+        """
+        basename = Path(translation_config.input_file).stem
+        debug_suffix = ".debug" if translation_config.debug else ""
+        mono_out_path = translation_config.get_output_file_path(
+            f"{basename}{debug_suffix}.{translation_config.lang_out}.mono.pdf"
+        )
+        # NOTE(review): neither this document nor `dual` below is closed in this method.
+        pdf = pymupdf.open(self.original_pdf_path)
+        self.font_mapper.add_font(pdf, self.docs)
+        with self.translation_config.progress_monitor.stage_start(
+            self.stage_name, len(self.docs.page)
+        ) as pbar:
+            for page in self.docs.page:
+                translation_config.raise_if_cancelled()
+                # Per-XObject state, keyed by xobj_id
+                xobj_available_fonts = {}
+                xobj_draw_ops = {}
+                xobj_encoding_length_map = {}
+                available_font_list = self.get_available_font_list(pdf, page)
+                for xobj in page.pdf_xobject:
+                    # An XObject may use the page's fonts plus its own
+                    xobj_available_fonts[xobj.xobj_id] = available_font_list.copy()
+                    try:
+                        xobj_available_fonts[xobj.xobj_id].update(
+                            self.get_xobj_available_fonts(xobj.xref_id, pdf)
+                        )
+                    except Exception:
+                        # Best effort: fall back to the page-level fonts only
+                        pass
+                    xobj_encoding_length_map[xobj.xobj_id] = {
+                        f.font_id: f.encoding_length for f in xobj.pdf_font
+                    }
+                    # Each XObject stream starts from its recorded base operations
+                    xobj_op = BitStream()
+                    xobj_op.append(xobj.base_operations.value.encode())
+                    xobj_draw_ops[xobj.xobj_id] = xobj_op
+                page_encoding_length_map = {
+                    f.font_id: f.encoding_length for f in page.pdf_font
+                }
+                page_op = BitStream()
+                # q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}
+                page_op.append(b"q ")
+                page_op.append(page.base_operations.value.encode())
+                page_op.append(b" Q ")
+                page_op.append(
+                    f"q Q 1 0 0 1 {page.cropbox.box.x} {page.cropbox.box.y} cm \n".encode()
+                )
+                # Collect all characters
+                chars = []
+                # First add the page-level characters
+                if page.pdf_character:
+                    chars.extend(page.pdf_character)
+                # Then add the characters inside paragraphs
+                for paragraph in page.pdf_paragraph:
+                    chars.extend(self.render_paragraph_to_char(paragraph))
+                # Render all characters
+                for char in chars:
+                    if char.char_unicode == "\n":
+                        continue
+                    if char.pdf_character_id is None:
+                        # dummy char
+                        continue
+                    char_size = char.pdf_style.font_size
+                    font_id = char.pdf_style.font_id
+                    # Route the character to its XObject stream when it belongs to
+                    # one, otherwise to the page stream; characters whose font is
+                    # not available in the target are skipped.
+                    if char.xobj_id in xobj_available_fonts:
+                        if font_id not in xobj_available_fonts[char.xobj_id]:
+                            continue
+                        draw_op = xobj_draw_ops[char.xobj_id]
+                        encoding_length_map = xobj_encoding_length_map[char.xobj_id]
+                    else:
+                        if font_id not in available_font_list:
+                            continue
+                        draw_op = page_op
+                        encoding_length_map = page_encoding_length_map
+                    draw_op.append(b"q ")
+                    self.render_graphic_state(draw_op, char.pdf_style.graphic_state)
+                    if char.vertical:
+                        # Vertical characters use the text matrix 0 1 -1 0, anchored at (x2, y)
+                        draw_op.append(
+                            f"BT /{font_id} {char_size:f} Tf 0 1 -1 0 {char.box.x2:f} {char.box.y:f} Tm ".encode()
+                        )
+                    else:
+                        draw_op.append(
+                            f"BT /{font_id} {char_size:f} Tf 1 0 0 1 {char.box.x:f} {char.box.y:f} Tm ".encode()
+                        )
+                    # NOTE(review): raises KeyError if font_id is missing from the
+                    # pdf_font list of the page / XObject
+                    encoding_length = encoding_length_map[font_id]
+                    # pdf32000-2008 page14:
+                    # As hexadecimal data enclosed in angle brackets < >
+                    # see 7.3.4.3, "Hexadecimal Strings."
+                    draw_op.append(
+                        f"<{char.pdf_character_id:0{encoding_length * 2}x}>".upper().encode()
+                    )
+                    draw_op.append(b" Tj ET Q \n")
+                # Write the rebuilt XObject streams back in place
+                for xobj in page.pdf_xobject:
+                    draw_op = xobj_draw_ops[xobj.xobj_id]
+                    pdf.update_stream(xobj.xref_id, draw_op.tobytes())
+                    # pdf.update_stream(xobj.xref_id, b'')
+                # Unlike write_debug_info(), every rectangle is drawn here
+                for rect in page.pdf_rectangle:
+                    self._debug_render_rectangle(page_op, rect)
+                draw_op = page_op
+                # The page stream goes into a freshly allocated object
+                op_container = pdf.get_new_xref()
+                # Since this is a draw instruction container,
+                # no additional information is needed
+                pdf.update_object(op_container, "<<>>")
+                pdf.update_stream(op_container, draw_op.tobytes())
+                pdf[page.page_number].set_contents(op_container)
+                pbar.advance()
+        translation_config.raise_if_cancelled()
+        with self.translation_config.progress_monitor.stage_start(
+            SUBSET_FONT_STAGE_NAME, 1
+        ) as pbar:
+            if not translation_config.skip_clean:
+                pdf.subset_fonts(fallback=False)
+            pbar.advance()
+        with self.translation_config.progress_monitor.stage_start(
+            SAVE_PDF_STAGE_NAME, 2
+        ) as pbar:
+            if not translation_config.no_mono:
+                if translation_config.debug:
+                    # Extra uncompressed copy for inspection
+                    translation_config.raise_if_cancelled()
+                    pdf.save(
+                        f"{mono_out_path}.decompressed.pdf", expand=True, pretty=True
+                    )
+                translation_config.raise_if_cancelled()
+                pdf.save(
+                    mono_out_path,
+                    garbage=3,
+                    deflate=True,
+                    clean=not translation_config.skip_clean,
+                    deflate_fonts=True,
+                    linear=True,
+                )
+            pbar.advance()
+            dual_out_path = None
+            if not translation_config.no_dual:
+                dual_out_path = translation_config.get_output_file_path(
+                    f"{basename}{debug_suffix}.{translation_config.lang_out}.dual.pdf"
+                )
+                translation_config.raise_if_cancelled()
+                dual = pymupdf.open(self.original_pdf_path)
+                if translation_config.debug:
+                    translation_config.raise_if_cancelled()
+                    try:
+                        self.write_debug_info(dual, translation_config)
+                    except Exception:
+                        logger.warning(
+                            "Failed to write debug info to dual PDF", exc_info=True
+                        )
+                # Append the translated pages after the originals, then move each
+                # translated page next to its original: before it when
+                # dual_translate_first is set, after it otherwise.
+                dual.insert_file(pdf)
+                page_count = pdf.page_count
+                for page_id in range(page_count):
+                    if translation_config.dual_translate_first:
+                        dual.move_page(page_count + page_id, page_id * 2)
+                    else:
+                        dual.move_page(page_count + page_id, page_id * 2 + 1)
+                dual.save(
+                    dual_out_path,
+                    garbage=3,
+                    deflate=True,
+                    clean=not translation_config.skip_clean,
+                    deflate_fonts=True,
+                    linear=True,
+                )
+                if translation_config.debug:
+                    translation_config.raise_if_cancelled()
+                    dual.save(
+                        f"{dual_out_path}.decompressed.pdf", expand=True, pretty=True
+                    )
+            pbar.advance()
+        return TranslateResult(mono_out_path, dual_out_path)

src/pdf2u/document_il/frontend/__init__.py ADDED Viewed

File without changes

src/pdf2u/document_il/frontend/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (182 Bytes). View file

src/pdf2u/document_il/frontend/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (170 Bytes). View file

src/pdf2u/document_il/frontend/__pycache__/il_creater.cpython-311.pyc ADDED Viewed

Binary file (19 kB). View file

src/pdf2u/document_il/frontend/__pycache__/il_creater.cpython-312.pyc ADDED Viewed

Binary file (18 kB). View file

src/pdf2u/document_il/frontend/il_creater.py ADDED Viewed

	@@ -0,0 +1,328 @@

+import base64
+import logging
+import re
+import pdfminer.pdfinterp
+import pymupdf
+from pdfminer.layout import LTChar, LTFigure
+from pdfminer.pdffont import PDFCIDFont, PDFFont
+from pdfminer.psparser import PSLiteral
+from pdf2u.document_il import il_version_1
+from pdf2u.translation_config import TranslationConfig
+logger = logging.getLogger(__name__)
+class ILCreater:
+    stage_name = "Parse PDF and Create Intermediate Representation"
+    def __init__(self, translation_config: TranslationConfig):
+        self.progress = None
+        self.current_page: il_version_1.Page = None
+        self.mupdf: pymupdf.Document = None
+        self.model = translation_config.doc_layout_model
+        self.docs = il_version_1.Document(page=[])
+        self.stroking_color_space_name = None
+        self.non_stroking_color_space_name = None
+        self.passthrough_per_char_instruction: list[tuple[str, str]] = []
+        self.translation_config = translation_config
+        self.passthrough_per_char_instruction_stack: list[list[tuple[str, str]]] = []
+        self.xobj_id = 0
+        self.xobj_inc = 0
+        self.xobj_map: dict[int, il_version_1.PdfXobject] = {}
+        self.xobj_stack = []
+    def on_finish(self):
+        """Close the progress context by calling its ``__exit__`` with no exception."""
+        # NOTE(review): self.progress is None after __init__; presumably it is assigned elsewhere before this is called — confirm
+        self.progress.__exit__(None, None, None)
+    def is_passthrough_per_char_operation(self, operator: str):
+        return re.match("^(sc|scn|g|rg|k|cs|gs|ri)$", operator, re.IGNORECASE)
+    def on_passthrough_per_char(self, operator: str, args: list[str]):
+        if not self.is_passthrough_per_char_operation(operator):
+            logger.error("Unknown passthrough_per_char operation: %s", operator)
+            return
+        # logger.debug("xobj_id: %d, on_passthrough_per_char: %s ( %s )", self.xobj_id, operator, args)
+        args = [self.parse_arg(arg) for arg in args]
+        for _i, value in enumerate(self.passthrough_per_char_instruction.copy()):
+            op, arg = value
+            if op == operator:
+                self.passthrough_per_char_instruction.remove(value)
+                break
+        self.passthrough_per_char_instruction.append((operator, " ".join(args)))
+        pass
+    def remove_latest_passthrough_per_char_instruction(self):
+        if self.passthrough_per_char_instruction:
+            self.passthrough_per_char_instruction.pop()
+    def parse_arg(self, arg: str):
+        if isinstance(arg, PSLiteral):
+            return f"/{arg.name}"
+        if not isinstance(arg, str):
+            return str(arg)
+        return arg
+    def pop_passthrough_per_char_instruction(self):
+        if self.passthrough_per_char_instruction_stack:
+            self.passthrough_per_char_instruction = (
+                self.passthrough_per_char_instruction_stack.pop()
+            )
+        else:
+            self.passthrough_per_char_instruction = []
+            logging.error(
+                "pop_passthrough_per_char_instruction error on page: %s",
+                self.current_page.page_number,
+            )
+    def push_passthrough_per_char_instruction(self):
+        self.passthrough_per_char_instruction_stack.append(
+            self.passthrough_per_char_instruction.copy()
+        )
+    # pdf32000 page 171
+    def on_stroking_color_space(self, color_space_name):
+        """Remember the name of the current stroking colour space."""
+        self.stroking_color_space_name = color_space_name
+    def on_non_stroking_color_space(self, color_space_name):
+        """Remember the name of the current non-stroking colour space."""
+        self.non_stroking_color_space_name = color_space_name
+    def on_new_stream(self):
+        self.stroking_color_space_name = None
+        self.non_stroking_color_space_name = None
+        self.passthrough_per_char_instruction = []
+    def push_xobj(self):
+        self.xobj_stack.append(
+            (self.current_page_font_name_id_map.copy(), self.xobj_id)
+        )
+        self.current_page_font_name_id_map = {}
+    def pop_xobj(self):
+        self.current_page_font_name_id_map, self.xobj_id = self.xobj_stack.pop()
+    def on_xobj_begin(self, bbox, xref_id):
+        self.push_passthrough_per_char_instruction()
+        self.push_xobj()
+        self.xobj_inc += 1
+        self.xobj_id = self.xobj_inc
+        xobject = il_version_1.PdfXobject(
+            box=il_version_1.Box(
+                x=float(bbox[0]), y=float(bbox[1]), x2=float(bbox[2]), y2=float(bbox[3])
+            ),
+            xobj_id=self.xobj_id,
+            xref_id=xref_id,
+        )
+        self.current_page.pdf_xobject.append(xobject)
+        self.xobj_map[self.xobj_id] = xobject
+        return self.xobj_id
+    def on_xobj_end(self, xobj_id, base_op):
+        self.pop_passthrough_per_char_instruction()
+        self.pop_xobj()
+        xobj = self.xobj_map[xobj_id]
+        xobj.base_operations = il_version_1.BaseOperations(value=base_op)
+        self.xobj_inc += 1
+    def on_page_start(self):
+        self.current_page = il_version_1.Page(
+            pdf_font=[],
+            pdf_character=[],
+            page_layout=[],
+            # currently don't support UserUnit page parameter
+            # pdf32000 page 79
+            unit="point",
+        )
+        self.current_page_font_name_id_map = {}
+        self.passthrough_per_char_instruction_stack = []
+        self.xobj_stack = []
+        self.non_stroking_color_space_name = None
+        self.stroking_color_space_name = None
+        self.docs.page.append(self.current_page)
+    def on_page_end(self):
+        self.progress.advance(1)
+    def on_page_crop_box(
+        self, x0: float | int, y0: float | int, x1: float | int, y1: float | int
+    ):
+        box = il_version_1.Box(x=float(x0), y=float(y0), x2=float(x1), y2=float(y1))
+        self.current_page.cropbox = il_version_1.Cropbox(box=box)
+    def on_page_media_box(
+        self, x0: float | int, y0: float | int, x1: float | int, y1: float | int
+    ):
+        box = il_version_1.Box(x=float(x0), y=float(y0), x2=float(x1), y2=float(y1))
+        self.current_page.mediabox = il_version_1.Mediabox(box=box)
+    def on_page_number(self, page_number: int):
+        assert isinstance(page_number, int)
+        assert page_number >= 0
+        self.current_page.page_number = page_number
+    def on_page_base_operation(self, operation: str):
+        self.current_page.base_operations = il_version_1.BaseOperations(value=operation)
+    def on_page_resource_font(self, font: PDFFont, xref_id: int, font_id: str):
+        """Register a font resource on the active xobject or the current page.
+
+        Builds a ``PdfFont`` record (name, ids, encoding length, style flags,
+        ascent/descent) and appends it to the xobject stored under
+        ``self.xobj_id`` when that id is present in ``self.xobj_map``, otherwise
+        to the current page. The font name is also mapped to ``font_id`` in
+        ``self.current_page_font_name_id_map``.
+
+        Args:
+            font: font object for the resource; ``fontname``, ``ascent`` and
+                ``descent`` are read from it.
+            xref_id: xref number used to query ``self.mupdf`` for font data.
+            font_id: identifier stored on the record; ``on_lt_char`` matches it
+                against ``char.aw_font_id``.
+        """
+        font_name = font.fontname
+        if isinstance(font_name, bytes):
+            try:
+                font_name = font_name.decode("utf-8")
+            except UnicodeDecodeError:
+                # Names that are not valid UTF-8 are kept as a tagged base64 string.
+                font_name = "BASE64:" + base64.b64encode(font_name).decode("utf-8")
+        # Defaults to 1; only CID fonts go through the detection below.
+        encoding_length = 1
+        if isinstance(font, PDFCIDFont):
+            try:
+                # pdf 32000:2008 page 273
+                # Table 118 - Predefined CJK CMap names
+                _, encoding = self.mupdf.xref_get_key(xref_id, "Encoding")
+                if encoding == "/Identity-H" or encoding == "/Identity-V":
+                    encoding_length = 2
+                else:
+                    # Otherwise derive the length from the ToUnicode stream: half
+                    # the number of digits captured from a codespace range bound.
+                    _, to_unicode_id = self.mupdf.xref_get_key(xref_id, "ToUnicode")
+                    to_unicode_bytes = self.mupdf.xref_stream(
+                        int(to_unicode_id.split(" ")[0])
+                    )
+                    # NOTE(review): `\d` matches decimal digits only, so a bound
+                    # containing A-F is not captured; when nothing matches,
+                    # `.group` is called on None and the except branch below
+                    # takes over. Confirm this is intended.
+                    code_range = re.search(
+                        b"begincodespacerange\n?.*<(\\d+?)>.*", to_unicode_bytes
+                    ).group(1)
+                    encoding_length = len(code_range) // 2
+            except Exception:
+                # Any failure above falls back to the largest CID in the font's
+                # unicode map: above 255 means two bytes.
+                # NOTE(review): this fallback itself raises if `unicode_map` has
+                # no `cid2unichr` or the mapping is empty (max() of an empty
+                # sequence) — confirm callers tolerate that.
+                if max(font.unicode_map.cid2unichr.keys()) > 255:
+                    encoding_length = 2
+                else:
+                    encoding_length = 1
+        try:
+            # Element 3 of extract_font's result is passed as the font buffer.
+            mupdf_font = pymupdf.Font(fontbuffer=self.mupdf.extract_font(xref_id)[3])
+            bold = mupdf_font.is_bold
+            italic = mupdf_font.is_italic
+            monospaced = mupdf_font.is_monospaced
+            serif = mupdf_font.is_serif
+        except Exception:
+            # Style flags are best-effort; None marks them as unknown.
+            bold = None
+            italic = None
+            monospaced = None
+            serif = None
+        il_font_metadata = il_version_1.PdfFont(
+            name=font_name,
+            xref_id=xref_id,
+            font_id=font_id,
+            encoding_length=encoding_length,
+            bold=bold,
+            italic=italic,
+            monospace=monospaced,
+            serif=serif,
+            ascent=font.ascent,
+            descent=font.descent,
+        )
+        self.current_page_font_name_id_map[font_name] = font_id
+        if self.xobj_id in self.xobj_map:
+            self.xobj_map[self.xobj_id].pdf_font.append(il_font_metadata)
+        else:
+            self.current_page.pdf_font.append(il_font_metadata)
+    def create_graphic_state(self, gs: pdfminer.pdfinterp.PDFGraphicState):
+        graphic_state = il_version_1.GraphicState()
+        for k, v in gs.__dict__.items():
+            if v is None:
+                continue
+            if k in ["scolor", "ncolor"]:
+                if isinstance(v, tuple):
+                    v = list(v)
+                else:
+                    v = [v]
+                setattr(graphic_state, k, v)
+                continue
+            if k == "linewidth":
+                graphic_state.linewidth = float(v)
+                continue
+            continue
+            raise NotImplementedError
+        graphic_state.stroking_color_space_name = self.stroking_color_space_name
+        graphic_state.non_stroking_color_space_name = self.non_stroking_color_space_name
+        graphic_state.passthrough_per_char_instruction = " ".join(
+            f"{arg} {op}" for op, arg in gs.passthrough_instruction
+        )
+        return graphic_state
+    def on_lt_char(self, char: LTChar):
+        gs = self.create_graphic_state(char.graphicstate)
+        # Get font from current page or xobject
+        font = None
+        for pdf_font in self.xobj_map.get(self.xobj_id, self.current_page).pdf_font:
+            if pdf_font.font_id == char.aw_font_id:
+                font = pdf_font
+                break
+        # Get descent from font
+        descent = 0
+        if font and hasattr(font, "descent"):
+            descent = font.descent * char.size / 1000
+        char_id = char.cid
+        char_unicode = char.get_text()
+        if "(cid:" not in char_unicode and len(char_unicode) > 1:
+            return
+        advance = char.adv
+        if char.matrix[0] == 0 and char.matrix[3] == 0:
+            vertical = True
+            bbox = il_version_1.Box(
+                x=char.bbox[0] - descent,
+                y=char.bbox[1],
+                x2=char.bbox[2] - descent,
+                y2=char.bbox[3],
+            )
+        else:
+            vertical = False
+            # Add descent to y coordinates
+            bbox = il_version_1.Box(
+                x=char.bbox[0],
+                y=char.bbox[1] + descent,
+                x2=char.bbox[2],
+                y2=char.bbox[3] + descent,
+            )
+        pdf_style = il_version_1.PdfStyle(
+            font_id=char.aw_font_id, font_size=char.size, graphic_state=gs
+        )
+        pdf_char = il_version_1.PdfCharacter(
+            box=bbox,
+            pdf_character_id=char_id,
+            advance=advance,
+            char_unicode=char_unicode,
+            vertical=vertical,
+            pdf_style=pdf_style,
+            xobj_id=char.xobj_id,
+        )
+        self.current_page.pdf_character.append(pdf_char)
+    def create_il(self):
+        pages = [
+            page
+            for page in self.docs.page
+            if self.translation_config.should_translate_page(page.page_number + 1)
+        ]
+        self.docs.page = pages
+        return self.docs
+    def on_total_pages(self, total_pages: int):
+        assert isinstance(total_pages, int)
+        assert total_pages > 0
+        self.docs.total_pages = total_pages
+        total = 0
+        for page in range(total_pages):
+            if self.translation_config.should_translate_page(page + 1) is False:
+                continue
+            total += 1
+        self.progress = self.translation_config.progress_monitor.stage_start(
+            self.stage_name, total
+        )
+    def on_pdf_figure(self, figure: LTFigure):
+        box = il_version_1.Box(
+            figure.bbox[0], figure.bbox[1], figure.bbox[2], figure.bbox[3]
+        )
+        self.current_page.pdf_figure.append(il_version_1.PdfFigure(box=box))

src/pdf2u/document_il/il_version_1.py ADDED Viewed

	@@ -0,0 +1,396 @@

+from dataclasses import dataclass, field
+@dataclass
+class BaseOperations:
+    """Binding for the ``baseOperations`` element, which carries a single string."""
+    class Meta:
+        name = "baseOperations"
+    # The string content; defaults to the empty string.
+    value: str = field(default="", metadata={"required": True})
+@dataclass
+class Box:
+    """Binding for the ``box`` element: four required float attributes.
+
+    Field order (x, y, x2, y2) matters because callers also construct ``Box``
+    positionally.
+    """
+    class Meta:
+        name = "box"
+    x: float | None = field(
+        default=None, metadata={"type": "Attribute", "required": True}
+    )
+    y: float | None = field(
+        default=None, metadata={"type": "Attribute", "required": True}
+    )
+    x2: float | None = field(
+        default=None, metadata={"type": "Attribute", "required": True}
+    )
+    y2: float | None = field(
+        default=None, metadata={"type": "Attribute", "required": True}
+    )
+@dataclass
+class GraphicState:
+    """Binding for the ``graphicState`` element; every attribute is optional."""
+    class Meta:
+        name = "graphicState"
+    linewidth: float | None = field(default=None, metadata={"type": "Attribute"})
+    # List-valued attributes are declared with `tokens: True` and need at
+    # least one item when present.
+    dash: list[float] = field(
+        default_factory=list,
+        metadata={"type": "Attribute", "min_length": 1, "tokens": True},
+    )
+    flatness: float | None = field(default=None, metadata={"type": "Attribute"})
+    intent: str | None = field(default=None, metadata={"type": "Attribute"})
+    linecap: int | None = field(default=None, metadata={"type": "Attribute"})
+    linejoin: int | None = field(default=None, metadata={"type": "Attribute"})
+    miterlimit: float | None = field(default=None, metadata={"type": "Attribute"})
+    # Non-stroking color components.
+    ncolor: list[float] = field(
+        default_factory=list,
+        metadata={"type": "Attribute", "min_length": 1, "tokens": True},
+    )
+    # Stroking color components.
+    scolor: list[float] = field(
+        default_factory=list,
+        metadata={"type": "Attribute", "min_length": 1, "tokens": True},
+    )
+    stroking_color_space_name: str | None = field(
+        default=None, metadata={"name": "strokingColorSpaceName", "type": "Attribute"}
+    )
+    non_stroking_color_space_name: str | None = field(
+        default=None,
+        metadata={"name": "nonStrokingColorSpaceName", "type": "Attribute"},
+    )
+    # Space-joined "<arg> <op>" instructions, as built by the IL creator.
+    passthrough_per_char_instruction: str | None = field(
+        default=None,
+        metadata={"name": "passthroughPerCharInstruction", "type": "Attribute"},
+    )
+@dataclass
+class PdfFont:
+    """Binding for the ``pdfFont`` element describing one font resource.
+
+    ``name``, ``fontId``, ``xrefId`` and ``encodingLength`` are required; the
+    style flags and ascent/descent are optional and default to ``None``.
+    """
+    class Meta:
+        name = "pdfFont"
+    name: str | None = field(
+        default=None, metadata={"type": "Attribute", "required": True}
+    )
+    font_id: str | None = field(
+        default=None, metadata={"name": "fontId", "type": "Attribute", "required": True}
+    )
+    xref_id: int | None = field(
+        default=None, metadata={"name": "xrefId", "type": "Attribute", "required": True}
+    )
+    encoding_length: int | None = field(
+        default=None,
+        metadata={"name": "encodingLength", "type": "Attribute", "required": True},
+    )
+    bold: bool | None = field(default=None, metadata={"type": "Attribute"})
+    italic: bool | None = field(default=None, metadata={"type": "Attribute"})
+    monospace: bool | None = field(default=None, metadata={"type": "Attribute"})
+    serif: bool | None = field(default=None, metadata={"type": "Attribute"})
+    ascent: float | None = field(default=None, metadata={"type": "Attribute"})
+    descent: float | None = field(default=None, metadata={"type": "Attribute"})
+@dataclass
+class Cropbox:
+    """Binding for the ``cropbox`` element, a wrapper around one required ``Box``."""
+    class Meta:
+        name = "cropbox"
+    box: Box | None = field(
+        default=None, metadata={"type": "Element", "required": True}
+    )
+@dataclass
+class Mediabox:
+    """Binding for the ``mediabox`` element, a wrapper around one required ``Box``."""
+    class Meta:
+        name = "mediabox"
+    box: Box | None = field(
+        default=None, metadata={"type": "Element", "required": True}
+    )
+@dataclass
+class PageLayout:
+    """Binding for the ``pageLayout`` element: a box with id, conf and class_name.
+
+    All four members are marked required.
+    """
+    class Meta:
+        name = "pageLayout"
+    box: Box | None = field(
+        default=None, metadata={"type": "Element", "required": True}
+    )
+    id: int | None = field(
+        default=None, metadata={"type": "Attribute", "required": True}
+    )
+    # NOTE(review): presumably a detection confidence score — confirm against the producer.
+    conf: float | None = field(
+        default=None, metadata={"type": "Attribute", "required": True}
+    )
+    class_name: str | None = field(
+        default=None, metadata={"type": "Attribute", "required": True}
+    )
+@dataclass
+class PdfFigure:
+    """Binding for the ``pdfFigure`` element, which holds only a required ``Box``."""
+    class Meta:
+        name = "pdfFigure"
+    box: Box | None = field(
+        default=None, metadata={"type": "Element", "required": True}
+    )
+@dataclass
+class PdfRectangle:
+    """Binding for the ``pdfRectangle`` element: a box plus its graphic state."""
+    class Meta:
+        name = "pdfRectangle"
+    box: Box | None = field(
+        default=None, metadata={"type": "Element", "required": True}
+    )
+    graphic_state: GraphicState | None = field(
+        default=None,
+        metadata={"name": "graphicState", "type": "Element", "required": True},
+    )
+    # Optional flag; serialized as the `debug_info` attribute.
+    debug_info: bool | None = field(default=None, metadata={"type": "Attribute"})
+@dataclass
+class PdfStyle:
+    """Binding for the ``pdfStyle`` element: font id, font size and graphic state.
+
+    Unlike ``PdfFont.font_id`` (serialized as ``fontId``), the attributes here
+    carry no ``name`` override, so they keep the snake_case names ``font_id``
+    and ``font_size``.
+    """
+    class Meta:
+        name = "pdfStyle"
+    graphic_state: GraphicState | None = field(
+        default=None,
+        metadata={"name": "graphicState", "type": "Element", "required": True},
+    )
+    font_id: str | None = field(
+        default=None, metadata={"type": "Attribute", "required": True}
+    )
+    font_size: float | None = field(
+        default=None, metadata={"type": "Attribute", "required": True}
+    )
+@dataclass
+class PdfXobject:
+    """Binding for the ``pdfXobject`` element.
+
+    Holds the xobject's box, the fonts registered while it was active, its
+    base operations, and its ``xobjId`` / ``xrefId`` attributes.
+    """
+    class Meta:
+        name = "pdfXobject"
+    box: Box | None = field(
+        default=None, metadata={"type": "Element", "required": True}
+    )
+    pdf_font: list[PdfFont] = field(
+        default_factory=list, metadata={"name": "pdfFont", "type": "Element"}
+    )
+    base_operations: BaseOperations | None = field(
+        default=None,
+        metadata={"name": "baseOperations", "type": "Element", "required": True},
+    )
+    xobj_id: int | None = field(
+        default=None, metadata={"name": "xobjId", "type": "Attribute", "required": True}
+    )
+    xref_id: int | None = field(
+        default=None, metadata={"name": "xrefId", "type": "Attribute", "required": True}
+    )
+@dataclass
+class PdfCharacter:
+    """Binding for the ``pdfCharacter`` element: one character with style and box.
+
+    ``pdfStyle``, ``box`` and ``char_unicode`` are required; the remaining
+    attributes are optional.
+    """
+    class Meta:
+        name = "pdfCharacter"
+    pdf_style: PdfStyle | None = field(
+        default=None, metadata={"name": "pdfStyle", "type": "Element", "required": True}
+    )
+    box: Box | None = field(
+        default=None, metadata={"type": "Element", "required": True}
+    )
+    vertical: bool | None = field(default=None, metadata={"type": "Attribute"})
+    scale: float | None = field(default=None, metadata={"type": "Attribute"})
+    pdf_character_id: int | None = field(
+        default=None, metadata={"name": "pdfCharacterId", "type": "Attribute"}
+    )
+    char_unicode: str | None = field(
+        default=None, metadata={"type": "Attribute", "required": True}
+    )
+    advance: float | None = field(default=None, metadata={"type": "Attribute"})
+    # NOTE(review): presumably the id of the xobject the character belongs to
+    # (same numbering as PdfXobject.xobj_id) — confirm against the producer.
+    xobj_id: int | None = field(
+        default=None, metadata={"name": "xobjId", "type": "Attribute"}
+    )
+    debug_info: bool | None = field(default=None, metadata={"type": "Attribute"})
+@dataclass
+class PdfSameStyleUnicodeCharacters:
+    """Binding for ``pdfSameStyleUnicodeCharacters``: a unicode string with an optional style."""
+    class Meta:
+        name = "pdfSameStyleUnicodeCharacters"
+    # Optional here, unlike in PdfSameStyleCharacters where the style is required.
+    pdf_style: PdfStyle | None = field(
+        default=None, metadata={"name": "pdfStyle", "type": "Element"}
+    )
+    unicode: str | None = field(
+        default=None, metadata={"type": "Attribute", "required": True}
+    )
+    debug_info: bool | None = field(default=None, metadata={"type": "Attribute"})
+@dataclass
+class PdfFormula:
+    """Binding for the ``pdfFormula`` element: a box, one or more characters and two offsets."""
+    class Meta:
+        name = "pdfFormula"
+    box: Box | None = field(
+        default=None, metadata={"type": "Element", "required": True}
+    )
+    # At least one character is expected (`min_occurs: 1`).
+    pdf_character: list[PdfCharacter] = field(
+        default_factory=list,
+        metadata={"name": "pdfCharacter", "type": "Element", "min_occurs": 1},
+    )
+    x_offset: float | None = field(
+        default=None, metadata={"type": "Attribute", "required": True}
+    )
+    y_offset: float | None = field(
+        default=None, metadata={"type": "Attribute", "required": True}
+    )
+@dataclass
+class PdfLine:
+    """Binding for the ``pdfLine`` element: a box and one or more characters."""
+    class Meta:
+        name = "pdfLine"
+    box: Box | None = field(
+        default=None, metadata={"type": "Element", "required": True}
+    )
+    # At least one character is expected (`min_occurs: 1`).
+    pdf_character: list[PdfCharacter] = field(
+        default_factory=list,
+        metadata={"name": "pdfCharacter", "type": "Element", "min_occurs": 1},
+    )
+@dataclass
+class PdfSameStyleCharacters:
+    """Binding for ``pdfSameStyleCharacters``: a box, a required style and one or more characters."""
+    class Meta:
+        name = "pdfSameStyleCharacters"
+    box: Box | None = field(
+        default=None, metadata={"type": "Element", "required": True}
+    )
+    pdf_style: PdfStyle | None = field(
+        default=None, metadata={"name": "pdfStyle", "type": "Element", "required": True}
+    )
+    # At least one character is expected (`min_occurs: 1`).
+    pdf_character: list[PdfCharacter] = field(
+        default_factory=list,
+        metadata={"name": "pdfCharacter", "type": "Element", "min_occurs": 1},
+    )
+@dataclass
+class PdfParagraphComposition:
+    """Binding for ``pdfParagraphComposition``, one piece of a paragraph.
+
+    The accompanying schema declares the content as a choice between the five
+    child elements, so exactly one of these fields is expected to be set; the
+    dataclass itself does not enforce that.
+    """
+    class Meta:
+        name = "pdfParagraphComposition"
+    pdf_line: PdfLine | None = field(
+        default=None, metadata={"name": "pdfLine", "type": "Element"}
+    )
+    pdf_formula: PdfFormula | None = field(
+        default=None, metadata={"name": "pdfFormula", "type": "Element"}
+    )
+    pdf_same_style_characters: PdfSameStyleCharacters | None = field(
+        default=None, metadata={"name": "pdfSameStyleCharacters", "type": "Element"}
+    )
+    pdf_character: PdfCharacter | None = field(
+        default=None, metadata={"name": "pdfCharacter", "type": "Element"}
+    )
+    pdf_same_style_unicode_characters: PdfSameStyleUnicodeCharacters | None = field(
+        default=None,
+        metadata={"name": "pdfSameStyleUnicodeCharacters", "type": "Element"},
+    )
+@dataclass
+class PdfParagraph:
+    """Binding for the ``pdfParagraph`` element.
+
+    Requires a box, a style and the ``unicode`` attribute; holds zero or more
+    ``PdfParagraphComposition`` children plus optional layout attributes.
+    """
+    class Meta:
+        name = "pdfParagraph"
+    box: Box | None = field(
+        default=None, metadata={"type": "Element", "required": True}
+    )
+    pdf_style: PdfStyle | None = field(
+        default=None, metadata={"name": "pdfStyle", "type": "Element", "required": True}
+    )
+    pdf_paragraph_composition: list[PdfParagraphComposition] = field(
+        default_factory=list,
+        metadata={"name": "pdfParagraphComposition", "type": "Element"},
+    )
+    xobj_id: int | None = field(
+        default=None, metadata={"name": "xobjId", "type": "Attribute"}
+    )
+    unicode: str | None = field(
+        default=None, metadata={"type": "Attribute", "required": True}
+    )
+    scale: float | None = field(default=None, metadata={"type": "Attribute"})
+    vertical: bool | None = field(default=None, metadata={"type": "Attribute"})
+    # Serialized with a capitalized attribute name, "FirstLineIndent".
+    first_line_indent: bool | None = field(
+        default=None, metadata={"name": "FirstLineIndent", "type": "Attribute"}
+    )
+    debug_id: str | None = field(default=None, metadata={"type": "Attribute"})
+@dataclass
+class Page:
+    """Binding for the ``page`` element: boxes, child element lists and page attributes.
+
+    ``mediabox``, ``cropbox``, ``baseOperations``, ``pageNumber`` and ``Unit``
+    are marked required; every list defaults to empty.
+    """
+    class Meta:
+        name = "page"
+    mediabox: Mediabox | None = field(
+        default=None, metadata={"type": "Element", "required": True}
+    )
+    cropbox: Cropbox | None = field(
+        default=None, metadata={"type": "Element", "required": True}
+    )
+    pdf_xobject: list[PdfXobject] = field(
+        default_factory=list, metadata={"name": "pdfXobject", "type": "Element"}
+    )
+    page_layout: list[PageLayout] = field(
+        default_factory=list, metadata={"name": "pageLayout", "type": "Element"}
+    )
+    pdf_rectangle: list[PdfRectangle] = field(
+        default_factory=list, metadata={"name": "pdfRectangle", "type": "Element"}
+    )
+    pdf_font: list[PdfFont] = field(
+        default_factory=list, metadata={"name": "pdfFont", "type": "Element"}
+    )
+    pdf_paragraph: list[PdfParagraph] = field(
+        default_factory=list, metadata={"name": "pdfParagraph", "type": "Element"}
+    )
+    pdf_figure: list[PdfFigure] = field(
+        default_factory=list, metadata={"name": "pdfFigure", "type": "Element"}
+    )
+    pdf_character: list[PdfCharacter] = field(
+        default_factory=list, metadata={"name": "pdfCharacter", "type": "Element"}
+    )
+    base_operations: BaseOperations | None = field(
+        default=None,
+        metadata={"name": "baseOperations", "type": "Element", "required": True},
+    )
+    page_number: int | None = field(
+        default=None,
+        metadata={"name": "pageNumber", "type": "Attribute", "required": True},
+    )
+    # Serialized with a capitalized attribute name, "Unit".
+    unit: str | None = field(
+        default=None, metadata={"name": "Unit", "type": "Attribute", "required": True}
+    )
+@dataclass
+class Document:
+    """Binding for the root ``document`` element: the pages plus ``totalPages``."""
+    class Meta:
+        name = "document"
+    # At least one page is expected (`min_occurs: 1`).
+    page: list[Page] = field(
+        default_factory=list, metadata={"type": "Element", "min_occurs": 1}
+    )
+    total_pages: int | None = field(
+        default=None,
+        metadata={"name": "totalPages", "type": "Attribute", "required": True},
+    )

src/pdf2u/document_il/il_version_1.rnc ADDED Viewed

	@@ -0,0 +1,141 @@

+start = Document
+Document =
+  element document {
+    Page+,
+    attribute totalPages { xsd:int }
+  }
+Page =
+  element page {
+    element mediabox { Box },
+    element cropbox { Box },
+    PDFXobject*,
+    PageLayout*,
+    PDFRectangle*,
+    PDFFont*,
+    PDFParagraph*,
+    PDFFigure*,
+    PDFCharacter*,
+    attribute pageNumber { xsd:int },
+    attribute Unit { xsd:string },
+    element baseOperations { xsd:string }
+  }
+Box =
+  element box {
+    # from (x,y) to (x2,y2)
+    attribute x { xsd:float },
+    attribute y { xsd:float },
+    attribute x2 { xsd:float },
+    attribute y2 { xsd:float }
+  }
+PDFXrefId = xsd:int
+PDFFont =
+  element pdfFont {
+    attribute name { xsd:string },
+    attribute fontId { xsd:string },
+    attribute xrefId { PDFXrefId },
+    attribute encodingLength { xsd:int },
+    attribute bold { xsd:boolean }?,
+    attribute italic { xsd:boolean }?,
+    attribute monospace { xsd:boolean }?,
+    attribute serif { xsd:boolean }?,
+    attribute ascent { xsd:float }?,
+    attribute descent { xsd:float }?
+  }
+PDFXobject =
+  element pdfXobject {
+    attribute xobjId { xsd:int },
+    attribute xrefId { PDFXrefId },
+    Box,
+    PDFFont*,
+    element baseOperations { xsd:string }
+  }
+PDFCharacter =
+  element pdfCharacter {
+    attribute vertical { xsd:boolean }?,
+    attribute scale { xsd:float }?,
+    attribute pdfCharacterId { xsd:int }?,
+    attribute char_unicode { xsd:string },
+    attribute advance { xsd:float }?,
+    # id of the xobject this character belongs to (see pdfXobject/@xobjId), not a nesting depth
+    attribute xobjId { xsd:int }?,
+    attribute debug_info { xsd:boolean }?,
+    PDFStyle,
+    Box
+  }
+PageLayout =
+  element pageLayout {
+    attribute id { xsd:int },
+    attribute conf { xsd:float },
+    attribute class_name { xsd:string },
+    Box
+  }
+GraphicState =
+  element graphicState {
+    attribute linewidth { xsd:float }?,
+    attribute dash {
+      list { xsd:float+ }
+    }?,
+    attribute flatness { xsd:float }?,
+    attribute intent { xsd:string }?,
+    attribute linecap { xsd:int }?,
+    attribute linejoin { xsd:int }?,
+    attribute miterlimit { xsd:float }?,
+    attribute ncolor {
+      list { xsd:float+ }
+    }?,
+    attribute scolor {
+      list { xsd:float+ }
+    }?,
+    attribute strokingColorSpaceName { xsd:string }?,
+    attribute nonStrokingColorSpaceName { xsd:string }?,
+    attribute passthroughPerCharInstruction { xsd:string }?
+  }
+PDFStyle =
+  element pdfStyle {
+    attribute font_id { xsd:string },
+    attribute font_size { xsd:float },
+    GraphicState
+  }
+PDFParagraph =
+  element pdfParagraph {
+    attribute xobjId { xsd:int }?,
+    attribute unicode { xsd:string },
+    attribute scale { xsd:float }?,
+    attribute vertical { xsd:boolean }?,
+    attribute FirstLineIndent { xsd:boolean }?,
+    attribute debug_id { xsd:string }?,
+    Box,
+    PDFStyle,
+    PDFParagraphComposition*
+  }
+PDFParagraphComposition =
+  element pdfParagraphComposition {
+    PDFLine
+    | PDFFormula
+    | PDFSameStyleCharacters
+    | PDFCharacter
+    | PDFSameStyleUnicodeCharacters
+  }
+PDFLine = element pdfLine { Box, PDFCharacter+ }
+PDFSameStyleCharacters =
+  element pdfSameStyleCharacters { Box, PDFStyle, PDFCharacter+ }
+PDFSameStyleUnicodeCharacters =
+  element pdfSameStyleUnicodeCharacters {
+    PDFStyle?,
+    attribute unicode { xsd:string },
+    attribute debug_info { xsd:boolean }?
+  }
+PDFFormula =
+  element pdfFormula {
+    Box,
+    PDFCharacter+,
+    attribute x_offset { xsd:float },
+    attribute y_offset { xsd:float }
+  }
+PDFFigure = element pdfFigure { Box }
+PDFRectangle =
+  element pdfRectangle {
+    Box,
+    GraphicState,
+    attribute debug_info { xsd:boolean }?
+  }

src/pdf2u/document_il/il_version_1.rng ADDED Viewed

	@@ -0,0 +1,390 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<grammar xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
+  <start>
+    <ref name="Document"/>
+  </start>
+  <define name="Document">
+    <element name="document">
+      <oneOrMore>
+        <ref name="Page"/>
+      </oneOrMore>
+      <attribute name="totalPages">
+        <data type="int"/>
+      </attribute>
+    </element>
+  </define>
+  <define name="Page">
+    <element name="page">
+      <element name="mediabox">
+        <ref name="Box"/>
+      </element>
+      <element name="cropbox">
+        <ref name="Box"/>
+      </element>
+      <zeroOrMore>
+        <ref name="PDFXobject"/>
+      </zeroOrMore>
+      <zeroOrMore>
+        <ref name="PageLayout"/>
+      </zeroOrMore>
+      <zeroOrMore>
+        <ref name="PDFRectangle"/>
+      </zeroOrMore>
+      <zeroOrMore>
+        <ref name="PDFFont"/>
+      </zeroOrMore>
+      <zeroOrMore>
+        <ref name="PDFParagraph"/>
+      </zeroOrMore>
+      <zeroOrMore>
+        <ref name="PDFFigure"/>
+      </zeroOrMore>
+      <zeroOrMore>
+        <ref name="PDFCharacter"/>
+      </zeroOrMore>
+      <attribute name="pageNumber">
+        <data type="int"/>
+      </attribute>
+      <attribute name="Unit">
+        <data type="string"/>
+      </attribute>
+      <element name="baseOperations">
+        <data type="string"/>
+      </element>
+    </element>
+  </define>
+  <define name="Box">
+    <element name="box">
+      <!-- from (x,y) to (x2,y2) -->
+      <attribute name="x">
+        <data type="float"/>
+      </attribute>
+      <attribute name="y">
+        <data type="float"/>
+      </attribute>
+      <attribute name="x2">
+        <data type="float"/>
+      </attribute>
+      <attribute name="y2">
+        <data type="float"/>
+      </attribute>
+    </element>
+  </define>
+  <define name="PDFXrefId">
+    <data type="int"/>
+  </define>
+  <define name="PDFFont">
+    <element name="pdfFont">
+      <attribute name="name">
+        <data type="string"/>
+      </attribute>
+      <attribute name="fontId">
+        <data type="string"/>
+      </attribute>
+      <attribute name="xrefId">
+        <ref name="PDFXrefId"/>
+      </attribute>
+      <attribute name="encodingLength">
+        <data type="int"/>
+      </attribute>
+      <optional>
+        <attribute name="bold">
+          <data type="boolean"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="italic">
+          <data type="boolean"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="monospace">
+          <data type="boolean"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="serif">
+          <data type="boolean"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="ascent">
+          <data type="float"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="descent">
+          <data type="float"/>
+        </attribute>
+      </optional>
+    </element>
+  </define>
+  <define name="PDFXobject">
+    <element name="pdfXobject">
+      <attribute name="xobjId">
+        <data type="int"/>
+      </attribute>
+      <attribute name="xrefId">
+        <ref name="PDFXrefId"/>
+      </attribute>
+      <ref name="Box"/>
+      <zeroOrMore>
+        <ref name="PDFFont"/>
+      </zeroOrMore>
+      <element name="baseOperations">
+        <data type="string"/>
+      </element>
+    </element>
+  </define>
+  <define name="PDFCharacter">
+    <element name="pdfCharacter">
+      <optional>
+        <attribute name="vertical">
+          <data type="boolean"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="scale">
+          <data type="float"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="pdfCharacterId">
+          <data type="int"/>
+        </attribute>
+      </optional>
+      <attribute name="char_unicode">
+        <data type="string"/>
+      </attribute>
+      <optional>
+        <attribute name="advance">
+          <data type="float"/>
+        </attribute>
+      </optional>
+      <optional>
+        <!-- id of the xobject this character belongs to (see pdfXobject/@xobjId), not a nesting depth -->
+        <attribute name="xobjId">
+          <data type="int"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="debug_info">
+          <data type="boolean"/>
+        </attribute>
+      </optional>
+      <ref name="PDFStyle"/>
+      <ref name="Box"/>
+    </element>
+  </define>
+  <define name="PageLayout">
+    <element name="pageLayout">
+      <attribute name="id">
+        <data type="int"/>
+      </attribute>
+      <attribute name="conf">
+        <data type="float"/>
+      </attribute>
+      <attribute name="class_name">
+        <data type="string"/>
+      </attribute>
+      <ref name="Box"/>
+    </element>
+  </define>
+  <define name="GraphicState"> <!-- Pattern for the graphicState element: twelve attributes, each wrapped in optional, and no child elements. -->
+    <element name="graphicState">
+      <optional>
+        <attribute name="linewidth">
+          <data type="float"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="dash">
+          <list> <!-- value is a whitespace-separated list of one or more floats -->
+            <oneOrMore>
+              <data type="float"/>
+            </oneOrMore>
+          </list>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="flatness">
+          <data type="float"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="intent">
+          <data type="string"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="linecap">
+          <data type="int"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="linejoin">
+          <data type="int"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="miterlimit">
+          <data type="float"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="ncolor"> <!-- NOTE(review): presumably the non-stroking color components (compare nonStrokingColorSpaceName); confirm with the producer -->
+          <list>
+            <oneOrMore>
+              <data type="float"/>
+            </oneOrMore>
+          </list>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="scolor"> <!-- NOTE(review): presumably the stroking color components (compare strokingColorSpaceName); confirm with the producer -->
+          <list>
+            <oneOrMore>
+              <data type="float"/>
+            </oneOrMore>
+          </list>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="strokingColorSpaceName">
+          <data type="string"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="nonStrokingColorSpaceName">
+          <data type="string"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="passthroughPerCharInstruction">
+          <data type="string"/>
+        </attribute>
+      </optional>
+    </element>
+  </define>
+  <define name="PDFStyle"> <!-- Pattern for the pdfStyle element: required font_id (string) and font_size (float) attributes plus one GraphicState child. -->
+    <element name="pdfStyle">
+      <attribute name="font_id">
+        <data type="string"/>
+      </attribute>
+      <attribute name="font_size">
+        <data type="float"/>
+      </attribute>
+      <ref name="GraphicState"/>
+    </element>
+  </define>
+  <define name="PDFParagraph"> <!-- Pattern for the pdfParagraph element: only the unicode attribute is required; children are Box, PDFStyle, then any number of PDFParagraphComposition. -->
+    <element name="pdfParagraph">
+      <optional>
+        <attribute name="xobjId">
+          <data type="int"/>
+        </attribute>
+      </optional>
+      <attribute name="unicode"> <!-- the only attribute not wrapped in optional -->
+        <data type="string"/>
+      </attribute>
+      <optional>
+        <attribute name="scale">
+          <data type="float"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="vertical">
+          <data type="boolean"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="FirstLineIndent"> <!-- typed as a boolean flag, not a length -->
+          <data type="boolean"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="debug_id">
+          <data type="string"/>
+        </attribute>
+      </optional>
+      <ref name="Box"/> <!-- the Box pattern is defined outside this excerpt -->
+      <ref name="PDFStyle"/>
+      <zeroOrMore>
+        <ref name="PDFParagraphComposition"/>
+      </zeroOrMore>
+    </element>
+  </define>
+  <define name="PDFParagraphComposition"> <!-- Pattern for the pdfParagraphComposition wrapper element: it holds exactly one of the five alternatives listed in the choice. -->
+    <element name="pdfParagraphComposition">
+      <choice>
+        <ref name="PDFLine"/>
+        <ref name="PDFFormula"/>
+        <ref name="PDFSameStyleCharacters"/>
+        <ref name="PDFCharacter"/>
+        <ref name="PDFSameStyleUnicodeCharacters"/>
+      </choice>
+    </element>
+  </define>
+  <define name="PDFLine"> <!-- Pattern for the pdfLine element: a Box followed by one or more PDFCharacter; no attributes. -->
+    <element name="pdfLine">
+      <ref name="Box"/>
+      <oneOrMore>
+        <ref name="PDFCharacter"/>
+      </oneOrMore>
+    </element>
+  </define>
+  <define name="PDFSameStyleCharacters"> <!-- Pattern for the pdfSameStyleCharacters element: a Box, one PDFStyle, then one or more PDFCharacter; no attributes. -->
+    <element name="pdfSameStyleCharacters">
+      <ref name="Box"/>
+      <ref name="PDFStyle"/>
+      <oneOrMore>
+        <ref name="PDFCharacter"/>
+      </oneOrMore>
+    </element>
+  </define>
+  <define name="PDFSameStyleUnicodeCharacters"> <!-- Pattern for the pdfSameStyleUnicodeCharacters element: optional PDFStyle child, required unicode string, optional debug_info boolean; no Box. -->
+    <element name="pdfSameStyleUnicodeCharacters">
+      <optional>
+        <ref name="PDFStyle"/>
+      </optional>
+      <attribute name="unicode">
+        <data type="string"/>
+      </attribute>
+      <optional>
+        <attribute name="debug_info">
+          <data type="boolean"/>
+        </attribute>
+      </optional>
+    </element>
+  </define>
+  <define name="PDFFormula"> <!-- Pattern for the pdfFormula element: a Box, one or more PDFCharacter, and required float attributes x_offset and y_offset. -->
+    <element name="pdfFormula">
+      <ref name="Box"/>
+      <oneOrMore>
+        <ref name="PDFCharacter"/>
+      </oneOrMore>
+      <attribute name="x_offset">
+        <data type="float"/>
+      </attribute>
+      <attribute name="y_offset">
+        <data type="float"/>
+      </attribute>
+    </element>
+  </define>
+  <define name="PDFFigure"> <!-- Pattern for the pdfFigure element: its only content is a Box. -->
+    <element name="pdfFigure">
+      <ref name="Box"/>
+    </element>
+  </define>
+  <define name="PDFRectangle"> <!-- Pattern for the pdfRectangle element: a Box, one GraphicState, and an optional debug_info boolean attribute. -->
+    <element name="pdfRectangle">
+      <ref name="Box"/>
+      <ref name="GraphicState"/>
+      <optional>
+        <attribute name="debug_info">
+          <data type="boolean"/>
+        </attribute>
+      </optional>
+    </element>
+  </define>
+</grammar>

src/pdf2u/document_il/il_version_1.xsd ADDED Viewed

	@@ -0,0 +1,235 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified">
+  <xs:element name="document"> <!-- document element: one or more page children and a required totalPages attribute (xs:int). -->
+    <xs:complexType>
+      <xs:sequence>
+        <xs:element maxOccurs="unbounded" ref="page"/>
+      </xs:sequence>
+      <xs:attribute name="totalPages" use="required" type="xs:int"/>
+    </xs:complexType>
+  </xs:element>
+  <xs:element name="page"> <!-- One page: the xs:sequence below fixes the order of the child elements; pageNumber and Unit are required attributes. -->
+    <xs:complexType>
+      <xs:sequence>
+        <xs:element ref="mediabox"/> <!-- mediabox, cropbox and the trailing baseOperations each occur exactly once -->
+        <xs:element ref="cropbox"/>
+        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfXobject"/> <!-- this and the six groups after it may each be empty or repeat -->
+        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pageLayout"/>
+        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfRectangle"/>
+        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFont"/>
+        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfParagraph"/>
+        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFigure"/>
+        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfCharacter"/>
+        <xs:element ref="baseOperations"/>
+      </xs:sequence>
+      <xs:attribute name="pageNumber" use="required" type="xs:int"/>
+      <xs:attribute name="Unit" use="required" type="xs:string"/>
+    </xs:complexType>
+  </xs:element>
+  <xs:element name="mediabox"> <!-- mediabox element: wraps exactly one box child; no attributes. -->
+    <xs:complexType>
+      <xs:sequence>
+        <xs:element ref="box"/>
+      </xs:sequence>
+    </xs:complexType>
+  </xs:element>
+  <xs:element name="cropbox"> <!-- cropbox element: wraps exactly one box child; no attributes. -->
+    <xs:complexType>
+      <xs:sequence>
+        <xs:element ref="box"/>
+      </xs:sequence>
+    </xs:complexType>
+  </xs:element>
+  <xs:element name="baseOperations" type="xs:string"/> <!-- baseOperations element: plain xs:string text content; referenced by page and pdfXobject. -->
+  <xs:element name="box"> <!-- box element: four required xs:float attributes x, y, x2, y2 and no content. NOTE(review): presumably two opposite corners of a rectangle; confirm with the producer. -->
+    <xs:complexType>
+      <xs:attribute name="x" use="required" type="xs:float"/>
+      <xs:attribute name="y" use="required" type="xs:float"/>
+      <xs:attribute name="x2" use="required" type="xs:float"/>
+      <xs:attribute name="y2" use="required" type="xs:float"/>
+    </xs:complexType>
+  </xs:element>
+  <xs:simpleType name="PDFXrefId"> <!-- Named alias of xs:int with no added facets; used for the xrefId attributes of pdfFont and pdfXobject. -->
+    <xs:restriction base="xs:int"/>
+  </xs:simpleType>
+  <xs:element name="pdfFont"> <!-- pdfFont element: attributes only. The first four are required; the rest carry no use attribute and are therefore optional. -->
+    <xs:complexType>
+      <xs:attribute name="name" use="required" type="xs:string"/>
+      <xs:attribute name="fontId" use="required" type="xs:string"/>
+      <xs:attribute name="xrefId" use="required" type="PDFXrefId"/>
+      <xs:attribute name="encodingLength" use="required" type="xs:int"/>
+      <xs:attribute name="bold" type="xs:boolean"/>
+      <xs:attribute name="italic" type="xs:boolean"/>
+      <xs:attribute name="monospace" type="xs:boolean"/>
+      <xs:attribute name="serif" type="xs:boolean"/>
+      <xs:attribute name="ascent" type="xs:float"/>
+      <xs:attribute name="descent" type="xs:float"/>
+    </xs:complexType>
+  </xs:element>
+  <xs:element name="pdfXobject"> <!-- pdfXobject element: one box, any number of pdfFont, then one baseOperations, in that order; xobjId and xrefId are required. -->
+    <xs:complexType>
+      <xs:sequence>
+        <xs:element ref="box"/>
+        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFont"/>
+        <xs:element ref="baseOperations"/>
+      </xs:sequence>
+      <xs:attribute name="xobjId" use="required" type="xs:int"/>
+      <xs:attribute name="xrefId" use="required" type="PDFXrefId"/>
+    </xs:complexType>
+  </xs:element>
+  <xs:element name="pdfCharacter"> <!-- pdfCharacter element: one pdfStyle followed by one box; char_unicode is the only required attribute. -->
+    <xs:complexType>
+      <xs:sequence>
+        <xs:element ref="pdfStyle"/> <!-- pdfStyle precedes box here, whereas pdfParagraph and pdfSameStyleCharacters put box first -->
+        <xs:element ref="box"/>
+      </xs:sequence>
+      <xs:attribute name="vertical" type="xs:boolean"/>
+      <xs:attribute name="scale" type="xs:float"/>
+      <xs:attribute name="pdfCharacterId" type="xs:int"/>
+      <xs:attribute name="char_unicode" use="required" type="xs:string"/>
+      <xs:attribute name="advance" type="xs:float"/>
+      <xs:attribute name="xobjId" type="xs:int"/>
+      <xs:attribute name="debug_info" type="xs:boolean"/>
+    </xs:complexType>
+  </xs:element>
+  <xs:element name="pageLayout"> <!-- pageLayout element: one box child plus required attributes id (xs:int), conf (xs:float) and class_name (xs:string). -->
+    <xs:complexType>
+      <xs:sequence>
+        <xs:element ref="box"/>
+      </xs:sequence>
+      <xs:attribute name="id" use="required" type="xs:int"/>
+      <xs:attribute name="conf" use="required" type="xs:float"/>
+      <xs:attribute name="class_name" use="required" type="xs:string"/>
+    </xs:complexType>
+  </xs:element>
+  <xs:element name="graphicState"> <!-- graphicState element: attributes only, none marked use="required", so every one is optional. -->
+    <xs:complexType>
+      <xs:attribute name="linewidth" type="xs:float"/>
+      <xs:attribute name="dash"> <!-- anonymous type: a list of xs:float restricted to at least one item -->
+        <xs:simpleType>
+          <xs:restriction>
+            <xs:simpleType>
+              <xs:list itemType="xs:float"/>
+            </xs:simpleType>
+            <xs:minLength value="1"/> <!-- for a list type, minLength counts list items -->
+          </xs:restriction>
+        </xs:simpleType>
+      </xs:attribute>
+      <xs:attribute name="flatness" type="xs:float"/>
+      <xs:attribute name="intent" type="xs:string"/>
+      <xs:attribute name="linecap" type="xs:int"/>
+      <xs:attribute name="linejoin" type="xs:int"/>
+      <xs:attribute name="miterlimit" type="xs:float"/>
+      <xs:attribute name="ncolor"> <!-- same non-empty float list type as dash -->
+        <xs:simpleType>
+          <xs:restriction>
+            <xs:simpleType>
+              <xs:list itemType="xs:float"/>
+            </xs:simpleType>
+            <xs:minLength value="1"/>
+          </xs:restriction>
+        </xs:simpleType>
+      </xs:attribute>
+      <xs:attribute name="scolor"> <!-- same non-empty float list type as dash -->
+        <xs:simpleType>
+          <xs:restriction>
+            <xs:simpleType>
+              <xs:list itemType="xs:float"/>
+            </xs:simpleType>
+            <xs:minLength value="1"/>
+          </xs:restriction>
+        </xs:simpleType>
+      </xs:attribute>
+      <xs:attribute name="strokingColorSpaceName" type="xs:string"/>
+      <xs:attribute name="nonStrokingColorSpaceName" type="xs:string"/>
+      <xs:attribute name="passthroughPerCharInstruction" type="xs:string"/>
+    </xs:complexType>
+  </xs:element>
+  <xs:element name="pdfStyle"> <!-- pdfStyle element: exactly one graphicState child plus required attributes font_id (xs:string) and font_size (xs:float). -->
+    <xs:complexType>
+      <xs:sequence>
+        <xs:element ref="graphicState"/>
+      </xs:sequence>
+      <xs:attribute name="font_id" use="required" type="xs:string"/>
+      <xs:attribute name="font_size" use="required" type="xs:float"/>
+    </xs:complexType>
+  </xs:element>
+  <xs:element name="pdfParagraph"> <!-- pdfParagraph element: box, pdfStyle, then any number of pdfParagraphComposition, in that order; unicode is the only required attribute. -->
+    <xs:complexType>
+      <xs:sequence>
+        <xs:element ref="box"/>
+        <xs:element ref="pdfStyle"/>
+        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfParagraphComposition"/>
+      </xs:sequence>
+      <xs:attribute name="xobjId" type="xs:int"/>
+      <xs:attribute name="unicode" use="required" type="xs:string"/>
+      <xs:attribute name="scale" type="xs:float"/>
+      <xs:attribute name="vertical" type="xs:boolean"/>
+      <xs:attribute name="FirstLineIndent" type="xs:boolean"/> <!-- typed as a boolean flag, not a length -->
+      <xs:attribute name="debug_id" type="xs:string"/>
+    </xs:complexType>
+  </xs:element>
+  <xs:element name="pdfParagraphComposition"> <!-- Wrapper element: xs:choice admits exactly one of the five child elements below; no attributes. -->
+    <xs:complexType>
+      <xs:choice>
+        <xs:element ref="pdfLine"/>
+        <xs:element ref="pdfFormula"/>
+        <xs:element ref="pdfSameStyleCharacters"/>
+        <xs:element ref="pdfCharacter"/>
+        <xs:element ref="pdfSameStyleUnicodeCharacters"/>
+      </xs:choice>
+    </xs:complexType>
+  </xs:element>
+  <xs:element name="pdfLine"> <!-- pdfLine element: one box followed by one or more pdfCharacter; no attributes. -->
+    <xs:complexType>
+      <xs:sequence>
+        <xs:element ref="box"/>
+        <xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
+      </xs:sequence>
+    </xs:complexType>
+  </xs:element>
+  <xs:element name="pdfSameStyleCharacters"> <!-- pdfSameStyleCharacters element: one box, one pdfStyle, then one or more pdfCharacter; no attributes. -->
+    <xs:complexType>
+      <xs:sequence>
+        <xs:element ref="box"/>
+        <xs:element ref="pdfStyle"/>
+        <xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
+      </xs:sequence>
+    </xs:complexType>
+  </xs:element>
+  <xs:element name="pdfSameStyleUnicodeCharacters"> <!-- pdfSameStyleUnicodeCharacters element: optional pdfStyle child, required unicode attribute, optional debug_info; no box. -->
+    <xs:complexType>
+      <xs:sequence>
+        <xs:element minOccurs="0" ref="pdfStyle"/>
+      </xs:sequence>
+      <xs:attribute name="unicode" use="required" type="xs:string"/>
+      <xs:attribute name="debug_info" type="xs:boolean"/>
+    </xs:complexType>
+  </xs:element>
+  <xs:element name="pdfFormula"> <!-- pdfFormula element: one box, one or more pdfCharacter, and required xs:float attributes x_offset and y_offset. -->
+    <xs:complexType>
+      <xs:sequence>
+        <xs:element ref="box"/>
+        <xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
+      </xs:sequence>
+      <xs:attribute name="x_offset" use="required" type="xs:float"/>
+      <xs:attribute name="y_offset" use="required" type="xs:float"/>
+    </xs:complexType>
+  </xs:element>
+  <xs:element name="pdfFigure"> <!-- pdfFigure element: its only content is one box child; no attributes. -->
+    <xs:complexType>
+      <xs:sequence>
+        <xs:element ref="box"/>
+      </xs:sequence>
+    </xs:complexType>
+  </xs:element>
+  <xs:element name="pdfRectangle"> <!-- pdfRectangle element: one box then one graphicState, plus an optional debug_info boolean attribute. -->
+    <xs:complexType>
+      <xs:sequence>
+        <xs:element ref="box"/>
+        <xs:element ref="graphicState"/>
+      </xs:sequence>
+      <xs:attribute name="debug_info" type="xs:boolean"/>
+    </xs:complexType>
+  </xs:element>
+</xs:schema>

src/pdf2u/document_il/midend/__init__.py ADDED Viewed

File without changes

src/pdf2u/document_il/midend/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (180 Bytes). View file