Atticux commited on
Commit
752094d
·
verified ·
1 Parent(s): e0ebfbb

Upload 108 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Dockerfile +50 -0
  2. pyproject.toml +262 -0
  3. src/pdf2u/__init__.py +1 -0
  4. src/pdf2u/__pycache__/__init__.cpython-311.pyc +0 -0
  5. src/pdf2u/__pycache__/__init__.cpython-312.pyc +0 -0
  6. src/pdf2u/__pycache__/const.cpython-311.pyc +0 -0
  7. src/pdf2u/__pycache__/const.cpython-312.pyc +0 -0
  8. src/pdf2u/__pycache__/converter.cpython-311.pyc +0 -0
  9. src/pdf2u/__pycache__/converter.cpython-312.pyc +0 -0
  10. src/pdf2u/__pycache__/high_level.cpython-311.pyc +0 -0
  11. src/pdf2u/__pycache__/high_level.cpython-312.pyc +0 -0
  12. src/pdf2u/__pycache__/io.cpython-312.pyc +0 -0
  13. src/pdf2u/__pycache__/main.cpython-311.pyc +0 -0
  14. src/pdf2u/__pycache__/main.cpython-312.pyc +0 -0
  15. src/pdf2u/__pycache__/pdfinterp.cpython-311.pyc +0 -0
  16. src/pdf2u/__pycache__/pdfinterp.cpython-312.pyc +0 -0
  17. src/pdf2u/__pycache__/progress_monitor.cpython-311.pyc +0 -0
  18. src/pdf2u/__pycache__/progress_monitor.cpython-312.pyc +0 -0
  19. src/pdf2u/__pycache__/translation_config.cpython-311.pyc +0 -0
  20. src/pdf2u/__pycache__/translation_config.cpython-312.pyc +0 -0
  21. src/pdf2u/asynchronize/__init__.py +51 -0
  22. src/pdf2u/asynchronize/__pycache__/__init__.cpython-311.pyc +0 -0
  23. src/pdf2u/asynchronize/__pycache__/__init__.cpython-312.pyc +0 -0
  24. src/pdf2u/const.py +14 -0
  25. src/pdf2u/converter.py +493 -0
  26. src/pdf2u/document_il/__init__.py +45 -0
  27. src/pdf2u/document_il/__pycache__/__init__.cpython-311.pyc +0 -0
  28. src/pdf2u/document_il/__pycache__/__init__.cpython-312.pyc +0 -0
  29. src/pdf2u/document_il/__pycache__/il_version_1.cpython-311.pyc +0 -0
  30. src/pdf2u/document_il/__pycache__/il_version_1.cpython-312.pyc +0 -0
  31. src/pdf2u/document_il/__pycache__/xml_converter.cpython-311.pyc +0 -0
  32. src/pdf2u/document_il/__pycache__/xml_converter.cpython-312.pyc +0 -0
  33. src/pdf2u/document_il/backend/__init__.py +0 -0
  34. src/pdf2u/document_il/backend/__pycache__/__init__.cpython-311.pyc +0 -0
  35. src/pdf2u/document_il/backend/__pycache__/__init__.cpython-312.pyc +0 -0
  36. src/pdf2u/document_il/backend/__pycache__/pdf_creater.cpython-311.pyc +0 -0
  37. src/pdf2u/document_il/backend/__pycache__/pdf_creater.cpython-312.pyc +0 -0
  38. src/pdf2u/document_il/backend/pdf_creater.py +405 -0
  39. src/pdf2u/document_il/frontend/__init__.py +0 -0
  40. src/pdf2u/document_il/frontend/__pycache__/__init__.cpython-311.pyc +0 -0
  41. src/pdf2u/document_il/frontend/__pycache__/__init__.cpython-312.pyc +0 -0
  42. src/pdf2u/document_il/frontend/__pycache__/il_creater.cpython-311.pyc +0 -0
  43. src/pdf2u/document_il/frontend/__pycache__/il_creater.cpython-312.pyc +0 -0
  44. src/pdf2u/document_il/frontend/il_creater.py +328 -0
  45. src/pdf2u/document_il/il_version_1.py +396 -0
  46. src/pdf2u/document_il/il_version_1.rnc +141 -0
  47. src/pdf2u/document_il/il_version_1.rng +390 -0
  48. src/pdf2u/document_il/il_version_1.xsd +235 -0
  49. src/pdf2u/document_il/midend/__init__.py +0 -0
  50. src/pdf2u/document_il/midend/__pycache__/__init__.cpython-311.pyc +0 -0
Dockerfile ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ref: https://github.com/fastapi/full-stack-fastapi-template/blob/master/backend/Dockerfile
2
+ FROM python:3.12-slim-bookworm
3
+
4
+ # Print logs immediately
5
+ # Ref: https://docs.python.org/3/using/cmdline.html#envvar-PYTHONUNBUFFERED
6
+ ENV PYTHONUNBUFFERED=1
7
+
8
+ # Install system dependencies including OpenGL libraries
9
+ RUN apt-get update && apt-get install -y \
10
+ libgl1-mesa-glx \
11
+ libglib2.0-0 \
12
+ && rm -rf /var/lib/apt/lists/*
13
+
14
+ # Change the working directory to the `app` directory
15
+ WORKDIR /app
16
+
17
+ # Install uv
18
+ # Ref: https://docs.astral.sh/uv/guides/integration/docker/#installing-uv
19
+ COPY --from=ghcr.io/astral-sh/uv:0.5.18 /uv /uvx /bin/
20
+
21
+ # Place executables in the environment at the front of the path
22
+ # Ref: https://docs.astral.sh/uv/guides/integration/docker/#using-the-environment
23
+ ENV PATH="/app/.venv/bin:$PATH"
24
+
25
+ # Compile bytecode to speed up the startup time
26
+ # Ref: https://docs.astral.sh/uv/guides/integration/docker/#compiling-bytecode
27
+ ENV UV_COMPILE_BYTECODE=1
28
+
29
+ # uv Cache
30
+ # Ref: https://docs.astral.sh/uv/guides/integration/docker/#caching
31
+ ENV UV_LINK_MODE=copy
32
+
33
+ # Install dependencies
34
+ # Ref: https://docs.astral.sh/uv/guides/integration/docker/#intermediate-layers
35
+ RUN --mount=type=cache,target=/root/.cache/uv \
36
+ --mount=type=bind,source=uv.lock,target=uv.lock \
37
+ --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
38
+ uv sync --frozen --no-install-project
39
+
40
+ # Copy the project into the image
41
+ COPY . .
42
+
43
+ # Sync the project
44
+ # Ref: https://docs.astral.sh/uv/guides/integration/docker/#intermediate-layers
45
+ RUN --mount=type=cache,target=/root/.cache/uv \
46
+ uv sync --all-extras
47
+
48
+ EXPOSE 8501
49
+ # Set the default command
50
+ CMD ["streamlit", "run", "src/pdf2u/gui.py"]
pyproject.toml ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [tool.hatch.version]
6
+ path = "src/pdf2u/__init__.py"
7
+ # FROM: https://hatch.pypa.io/latest/version/
8
+
9
+ [tool.hatch.build.targets.wheel]
10
+ packages = ["src/pdf2u"]
11
+ # FROM: https://hatch.pypa.io/latest/build/
12
+
13
+ [project]
14
+ name = "pdf2u"
15
+ version = "0.0.4"
16
+ description = "Yet Another Document Translator"
17
+ classifiers = [
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python",
20
+ "Programming Language :: Python :: 3 :: Only",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: 3.12",
25
+ "Programming Language :: Python :: 3.13",
26
+ ] # FROM: https://pypi.org/classifiers/
27
+ readme = "README.md"
28
+ requires-python = ">=3.10,<3.13"
29
+ license = { file = "LICENSE" }
30
+ authors = [{ name = "A.J.Zeller", email = "[email protected]" }]
31
+ maintainers = [{ name = "A.J.Zeller", email = "[email protected]" }]
32
+ # dynamic = ["version"] # https://hatch.pypa.io/latest/config/metadata/#version
33
+ dependencies = [
34
+ "bitstring>=4.3.0",
35
+ "configargparse>=1.7",
36
+ "httpx[socks]>=0.27.0",
37
+ "huggingface-hub>=0.27.0",
38
+ "numpy>=2.0.2",
39
+ "onnx>=1.17.0",
40
+ "onnxruntime>=1.16.1",
41
+ "openai>=1.59.3",
42
+ "opencv-python>=4.10.0.84",
43
+ "orjson>=3.10.14",
44
+ "pdfminer-six>=20240706",
45
+ "peewee>=3.17.8",
46
+ "rich>=13.9.4",
47
+ "toml>=0.10.2",
48
+ "tqdm>=4.67.1",
49
+ "xsdata[cli,lxml,soap]>=24.12",
50
+ "msgpack>=1.1.0",
51
+ "typer>=0.15.1",
52
+ "pymupdf==1.24.5",
53
+ ]
54
+
55
+ [project.urls]
56
+ Homepage = "https://github.com/atticuszeller/pdf2u"
57
+ Issues = "https://github.com/atticuszeller/pdf2u/issues"
58
+
59
+ [project.scripts] # build-backend config needed
60
+ pdf2u = "pdf2u.main:app"
61
+ # FROM: https://packaging.python.org/en/latest/guides/writing-pyproject-toml/
62
+
63
+ [project.optional-dependencies]
64
+ gui = ["pypdf2>=3.0.1", "streamlit>=1.42.2", "streamlit-pdf-viewer>=0.0.21"]
65
+ # optional deps for package installation
66
+
67
+ [dependency-groups]
68
+ dev = [
69
+ "ruff>=0.6.3",
70
+ "mypy>=1.11.2",
71
+ "pre-commit>=3.8.0",
72
+ "pytest>=8.3.2",
73
+ "pytest-sugar>=1.0.0",
74
+ "coverage>=7.6.1",
75
+ "git-cliff>=2.6.1",
76
+ "bump-my-version>=0.28.0",
77
+ "typos>=1.26.8",
78
+ "fonttools>=4.56.0",
79
+ ]
80
+
81
+ ## Test
82
+ [tool.mypy]
83
+ strict = true
84
+ exclude = ["venv", ".venv"]
85
+
86
+ [tool.pytest.ini_options]
87
+ # Set additional command line options for pytest
88
+ # Ref: https://docs.pytest.org/en/stable/reference/reference.html#command-line-flags
89
+ addopts = "-rXs --strict-config --strict-markers --tb=long"
90
+ xfail_strict = true # Treat tests that are marked as xfail but pass as test failures
91
+ filterwarnings = ["error"] # Treat all warnings as errors
92
+ pythonpath = "src/pdf2u/"
93
+
94
+ [tool.coverage.run]
95
+ branch = true
96
+
97
+ [tool.coverage.report]
98
+ skip_covered = true
99
+ show_missing = true
100
+ precision = 2
101
+ exclude_lines = [
102
+ 'def __repr__',
103
+ 'pragma: no cover',
104
+ 'raise NotImplementedError',
105
+ 'if TYPE_CHECKING:',
106
+ 'if typing.TYPE_CHECKING:',
107
+ '@overload',
108
+ '@typing.overload',
109
+ '\(Protocol\):$',
110
+ 'typing.assert_never',
111
+ 'assert_never',
112
+ 'if __name__ == .__main__.:',
113
+ ]
114
+
115
+ ## Linter and formatter
116
+ [tool.ruff]
117
+ # cover and extend the default config in https://docs.astral.sh/ruff/configuration/
118
+ extend-exclude = [""]
119
+ target-version = "py310"
120
+
121
+ [tool.ruff.lint]
122
+ select = [
123
+ "E", # pycodestyle errors
124
+ "W", # pycodestyle warnings
125
+ "F", # pyflakes
126
+ "I", # isort
127
+ "B", # flake8-bugbear
128
+ "C4", # flake8-comprehensions
129
+ "UP", # pyupgrade
130
+ "ARG001", # unused arguments in functions
131
+ ]
132
+
133
+ isort = { combine-as-imports = true, split-on-trailing-comma = false }
134
+
135
+ # Avoid trying to fix flake8-bugbear (`B`) violations.
136
+ unfixable = ["B"]
137
+
138
+ [tool.ruff.format]
139
+ docstring-code-format = true
140
+ skip-magic-trailing-comma = true
141
+
142
+ # Reference
143
+ # 1. https://github.com/Kludex/python-template/blob/main/template/%7B%7B%20project_slug%20%7D%7D/pyproject.toml.jinja
144
+ # 2. https://github.com/fastapi/full-stack-fastapi-template/blob/master/backend/pyproject.toml
145
+ # 3. https://github.com/pydantic/logfire
146
+ # 4. https://coverage.readthedocs.io/en/latest/index.html
147
+
148
+ ## VCS
149
+ [tool.git-cliff.remote.github]
150
+ owner = "atticuszeller"
151
+ repo = "python-uv-package"
152
+
153
+ [tool.git-cliff.changelog]
154
+ # template for the changelog header
155
+ header = """
156
+ # Changelog\n
157
+ All notable changes to this project will be documented in this file.\n
158
+ """
159
+ # template for the changelog body
160
+ # https://keats.github.io/tera/docs/#introduction
161
+ body = """
162
+ {% if version %}\
163
+ ## {{ version | trim_start_matches(pat="v") }} - {{ timestamp | date(format="%Y-%m-%d") }}
164
+ {% else %}\
165
+ ## unreleased
166
+ {% endif %}\
167
+ {% for group, commits in commits | group_by(attribute="group") %}
168
+ ### {{ group | striptags | trim | upper_first }}
169
+ {% for commit in commits| unique(attribute="message") %}
170
+ - {% if commit.scope %}*({{ commit.scope }})* {% endif %}\
171
+ {% if commit.breaking %}[**breaking**] {% endif %}\
172
+ {{ commit.message | upper_first }}\
173
+ {% if commit.remote.pr_number %} in #{{ commit.remote.pr_number }}{%- endif %}\
174
+ {% endfor %}
175
+ {% endfor %}\n
176
+ """
177
+ # template for the changelog footer
178
+ footer = """
179
+ <!-- generated by git-cliff -->
180
+ """
181
+ # remove leading and trailing whitespace from the rendered templates
182
+ trim = true
183
+ # postprocessors
184
+ # postprocessors = [
185
+ # { pattern = '<REPO>', replace = "https://github.com/atticuszeller/python-uv" }, # replace repository URL
186
+ # ]
187
+ # render body even when there are no releases to process
188
+ render_always = true
189
+ # output file path
190
+ output = "CHANGELOG.md"
191
+
192
+ [tool.git-cliff.git]
193
+ # parse the commits based on https://www.conventionalcommits.org
194
+ conventional_commits = true
195
+ # filter out the commits that are not conventional
196
+ filter_unconventional = true
197
+ # process each line of a commit as an individual commit
198
+ split_commits = false
199
+ # regex for preprocessing the commit messages
200
+ commit_preprocessors = [
201
+ # If the spelling is incorrect, it will be automatically fixed.
202
+ { pattern = '.*', replace_command = 'typos --write-changes -' },
203
+ ]
204
+ # regex for parsing and grouping commits
205
+ commit_parsers = [
206
+ { message = "^feat", group = "<!-- 0 -->🚀 Features" },
207
+ { message = "^fix", group = "<!-- 1 -->🐛 Bug Fixes" },
208
+ { message = "^doc", group = "<!-- 3 -->📚 Documentation" },
209
+ { message = "^perf", group = "<!-- 4 -->⚡ Performance" },
210
+ { message = "^refactor", group = "<!-- 2 -->🚜 Refactor" },
211
+ { message = "^style", group = "<!-- 5 -->🎨 Styling" },
212
+ { message = "^test", group = "<!-- 6 -->🧪 Testing" },
213
+ { message = "^chore\\(release\\)", skip = true },
214
+ { message = "^chore\\(deps.*\\)", skip = true },
215
+ { message = "^chore\\(pr\\)", skip = true },
216
+ { message = "^chore\\(pull\\)", skip = true },
217
+ { message = "^chore|^ci", group = "<!-- 7 -->⚙️ Miscellaneous Tasks" },
218
+ { body = ".*security", group = "<!-- 8 -->🛡️ Security" },
219
+ { message = "^revert", group = "<!-- 9 -->◀️ Revert" },
220
+ ]
221
+ # filter out the commits that are not matched by commit parsers
222
+ filter_commits = false
223
+ # sort the tags topologically
224
+ topo_order = false
225
+ # sort the commits inside sections by oldest/newest order
226
+ sort_commits = "oldest"
227
+
228
+ [tool.bumpversion]
229
+ current_version = "0.0.4"
230
+ parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)"
231
+ serialize = ["{major}.{minor}.{patch}"]
232
+ search = "{current_version}"
233
+ replace = "{new_version}"
234
+ regex = false
235
+ ignore_missing_version = false
236
+ ignore_missing_files = false
237
+ tag = true
238
+ sign_tags = false
239
+ tag_name = "v{new_version}"
240
+ tag_message = "chore(release): {current_version} → {new_version}"
241
+ allow_dirty = true # git-cliff first then bump patch
242
+ commit = true
243
+ message = "chore(release): {current_version} → {new_version}"
244
+ commit_args = ""
245
+ setup_hooks = []
246
+ pre_commit_hooks = []
247
+ post_commit_hooks = []
248
+
249
+ [[tool.bumpversion.files]]
250
+ filename = "src/pdf2u/__init__.py"
251
+
252
+ [[tool.bumpversion.files]]
253
+ filename = "pyproject.toml"
254
+ search = "version = \"{current_version}\""
255
+ replace = "version = \"{new_version}\""
256
+
257
+ [[tool.bumpversion.files]]
258
+ filename = "CHANGELOG.md"
259
+ search = "unreleased"
260
+ replace = "{new_version} - {now:%Y-%m-%d}"
261
+
262
+ # https://callowayproject.github.io/bump-my-version/reference/search-and-replace-config/
src/pdf2u/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ __version__ = "0.0.4"
src/pdf2u/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (181 Bytes). View file
 
src/pdf2u/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (175 Bytes). View file
 
src/pdf2u/__pycache__/const.cpython-311.pyc ADDED
Binary file (519 Bytes). View file
 
src/pdf2u/__pycache__/const.cpython-312.pyc ADDED
Binary file (855 Bytes). View file
 
src/pdf2u/__pycache__/converter.cpython-311.pyc ADDED
Binary file (13.8 kB). View file
 
src/pdf2u/__pycache__/converter.cpython-312.pyc ADDED
Binary file (12.7 kB). View file
 
src/pdf2u/__pycache__/high_level.cpython-311.pyc ADDED
Binary file (21.2 kB). View file
 
src/pdf2u/__pycache__/high_level.cpython-312.pyc ADDED
Binary file (18.6 kB). View file
 
src/pdf2u/__pycache__/io.cpython-312.pyc ADDED
Binary file (583 Bytes). View file
 
src/pdf2u/__pycache__/main.cpython-311.pyc ADDED
Binary file (13.3 kB). View file
 
src/pdf2u/__pycache__/main.cpython-312.pyc ADDED
Binary file (13.4 kB). View file
 
src/pdf2u/__pycache__/pdfinterp.cpython-311.pyc ADDED
Binary file (23.7 kB). View file
 
src/pdf2u/__pycache__/pdfinterp.cpython-312.pyc ADDED
Binary file (21.5 kB). View file
 
src/pdf2u/__pycache__/progress_monitor.cpython-311.pyc ADDED
Binary file (9.5 kB). View file
 
src/pdf2u/__pycache__/progress_monitor.cpython-312.pyc ADDED
Binary file (8.69 kB). View file
 
src/pdf2u/__pycache__/translation_config.cpython-311.pyc ADDED
Binary file (8.22 kB). View file
 
src/pdf2u/__pycache__/translation_config.cpython-312.pyc ADDED
Binary file (7.45 kB). View file
 
src/pdf2u/asynchronize/__init__.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import time
3
+
4
+
5
class Args:
    """Positional and keyword arguments captured from one callback invocation."""

    def __init__(self, args, kwargs):
        self.args = args
        self.kwargs = kwargs


class AsyncCallback:
    """Expose callbacks fired from a worker thread as an async iterator.

    ``step_callback`` and ``finished_callback`` are meant to be handed to
    blocking code running outside the event loop.  Each call is wrapped in an
    ``Args`` object and pushed onto ``queue``; the coroutine side consumes
    them with ``async for item in callback`` (or ``await callback`` for a
    single item).

    Attributes:
        queue: ``asyncio.Queue`` holding the pending ``Args`` items.
        finished: ``True`` once the final item has been placed on the queue.
        loop: event loop on which queue operations are scheduled.
    """

    def __init__(self):
        self.queue = asyncio.Queue()
        self.finished = False
        # Set synchronously by finished_callback so that a second call is
        # ignored even before the loop has processed the first one.
        self._finish_requested = False
        # Prefer the running loop: get_event_loop() is deprecated when no
        # loop is current.  Fall back to it to keep construction outside a
        # coroutine working as before.
        try:
            self.loop = asyncio.get_running_loop()
        except RuntimeError:
            self.loop = asyncio.get_event_loop()

    def step_callback(self, *args, **kwargs):
        """Queue one intermediate item without ending the iteration."""
        args = Args(args, kwargs)

        # We have to use the threadsafe call so that it wakes up the event
        # loop, in case it's sleeping:
        # https://stackoverflow.com/a/49912853/2148718
        self.loop.call_soon_threadsafe(self.queue.put_nowait, args)

        # Add a small delay to release the GIL, ensuring the event loop has
        # time to process messages
        time.sleep(0.01)

    def finished_callback(self, *args, **kwargs):
        """Queue the final item and mark the stream as finished.

        Calls after the first one are ignored.  The ``finished`` flag is
        flipped on the event-loop thread in the same callback that enqueues
        the final item, so ``__anext__`` can never observe "final item
        already consumed, but not finished yet" and block forever on an
        empty queue.
        """
        if self.finished or self._finish_requested:
            return
        self._finish_requested = True
        self.loop.call_soon_threadsafe(self._put_final, Args(args, kwargs))

        # Same GIL-release delay as step_callback.
        time.sleep(0.01)

    def _put_final(self, item):
        """Enqueue the last item and set ``finished``; runs on the loop thread."""
        self.queue.put_nowait(item)
        self.finished = True

    def __await__(self):
        # Awaiting the object itself resolves to the next queued item.
        return self.queue.get().__await__()

    def __aiter__(self):
        # Since this implements __anext__, this can return itself
        return self

    async def __anext__(self):
        # Stop only when finished AND drained, so items that were queued
        # before the finish signal are still delivered.
        if self.finished and self.queue.empty():
            raise StopAsyncIteration

        result = await self.queue.get()
        return result
src/pdf2u/asynchronize/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.69 kB). View file
 
src/pdf2u/asynchronize/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (2.5 kB). View file
 
src/pdf2u/const.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import StrEnum
2
+ from pathlib import Path
3
+
4
+ CACHE_FOLDER = Path.home() / ".cache" / "pdf2u"
5
+
6
+
7
def get_cache_file_path(filename: str) -> Path:
    """Return the path of *filename* inside ``CACHE_FOLDER``.

    Only the path is built; nothing is created on disk.
    """
    return CACHE_FOLDER.joinpath(filename)
9
+
10
+
11
+ class TranslationService(StrEnum):
12
+ OPENAI: str = "openai"
13
+ GOOGLE: str = "google"
14
+ BING: str = "bing"
src/pdf2u/converter.py ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import logging
3
+ import re
4
+ import unicodedata
5
+
6
+ import numpy as np
7
+ from pdfminer.converter import PDFConverter
8
+ from pdfminer.layout import LTChar, LTComponent, LTFigure, LTLine, LTPage, LTText
9
+ from pdfminer.pdfcolor import PDFColorSpace
10
+ from pdfminer.pdffont import PDFCIDFont, PDFFont, PDFUnicodeNotDefined
11
+ from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
12
+ from pdfminer.utils import Matrix, apply_matrix_pt, bbox2str, matrix2str, mult_matrix
13
+ from pymupdf import Font
14
+
15
+ from pdf2u.document_il.frontend.il_creater import ILCreater
16
+
17
+ log = logging.getLogger(__name__)
18
+
19
+
20
class PDFConverterEx(PDFConverter):
    """pdfminer converter that reports page and glyph data to an ``ILCreater``.

    The page/figure hooks are overridden so that finished layout objects are
    handed to ``receive_layout`` and every rendered glyph becomes an
    ``AWLTChar`` carrying its xobject id and font id.

    NOTE(review): ``il_creater`` is annotated as optional, but ``begin_page``
    and ``render_char`` dereference it unconditionally.
    """

    def __init__(
        self, rsrcmgr: PDFResourceManager, il_creater: ILCreater | None = None
    ) -> None:
        super().__init__(rsrcmgr, None, "utf-8", 1, None)
        self.il_creater = il_creater

    def begin_page(self, page, ctm) -> None:
        """Start a page whose media box is derived from the crop box.

        The two crop-box corners are transformed by ``ctm`` and the resulting
        extent is re-anchored at the origin.
        """
        # Override: use the cropbox in place of the mediabox.
        left, bottom, right, top = page.cropbox
        left, bottom = apply_matrix_pt(ctm, (left, bottom))
        right, top = apply_matrix_pt(ctm, (right, top))
        mediabox = (0, 0, abs(left - right), abs(bottom - top))
        self.il_creater.on_page_media_box(*mediabox)
        self.il_creater.on_page_number(page.pageno)
        self.cur_item = LTPage(page.pageno, mediabox)

    def end_page(self, _page) -> None:
        """Finish the page and pass it to ``receive_layout``."""
        # Override: return whatever receive_layout produces (instruction stream).
        return self.receive_layout(self.cur_item)

    def begin_figure(self, name, bbox, matrix) -> None:
        """Push the current container and open a new ``LTFigure``."""
        # Override: the figure inherits the pageid of its parent container.
        parent = self.cur_item
        self._stack.append(parent)
        figure = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
        self.cur_item = figure
        figure.pageid = self._stack[-1].pageid

    def end_figure(self, _: str) -> None:
        """Close the current figure, attach it to its parent and lay it out.

        Raises:
            ValueError: if the current item is not an ``LTFigure``.
        """
        # Override: return whatever receive_layout produces (instruction stream).
        figure = self.cur_item
        if not isinstance(figure, LTFigure):
            raise ValueError(f"Unexpected item type: {type(figure)}")
        self.cur_item = self._stack.pop()
        self.cur_item.add(figure)
        return self.receive_layout(figure)

    def render_char(
        self,
        matrix,
        font,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Create an ``AWLTChar`` for ``cid`` and return its advance width.

        Raises:
            TypeError: if the font maps ``cid`` to something other than ``str``.
            KeyError: if the font name is missing from the current page's
                font-name-to-id map.
        """
        # Override: additionally record the cid and the font on the item.
        try:
            text = font.to_unichr(cid)
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        else:
            if not isinstance(text, str):
                raise TypeError(f"Expected string, got {type(text)}")
        textwidth = font.char_width(cid)
        textdisp = font.char_disp(cid)

        # Byte font names are decoded as UTF-8; undecodable ones are kept
        # losslessly as a "BASE64:"-prefixed string.
        font_name = font.fontname
        if isinstance(font_name, bytes):
            raw_name = font_name
            try:
                font_name = raw_name.decode("utf-8")
            except UnicodeDecodeError:
                font_name = "BASE64:" + base64.b64encode(raw_name).decode("utf-8")
        font_id = self.il_creater.current_page_font_name_id_map[font_name]

        item = AWLTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            textwidth,
            textdisp,
            ncs,
            graphicstate,
            self.il_creater.xobj_id,
            font_id,
        )
        self.cur_item.add(item)
        item.cid = cid  # hack: attach the original character code
        item.font = font  # hack: attach the original font object
        return item.adv
105
+
106
+
107
class AWLTChar(LTChar):
    """Single glyph as a Unicode string, tagged with xobject and font ids.

    Besides the attributes set below (matrix, font name, colour space,
    graphic state, advance, bounding box, size), the instance stores
    ``xobj_id`` and ``aw_font_id`` as passed to the constructor.
    """

    def __init__(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        text: str,
        textwidth: float,
        textdisp: float | tuple[float | None, float],
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
        xobj_id: int,
        font_id: str,
    ) -> None:
        LTText.__init__(self)
        self._text = text
        self.matrix = matrix
        self.fontname = font.fontname
        self.ncs = ncs
        self.graphicstate = graphicstate
        self.xobj_id = xobj_id
        self.adv = textwidth * fontsize * scaling
        self.aw_font_id = font_id

        # Bounding rectangle in text space, before applying the matrix.
        if font.is_vertical():
            # Vertical writing: textdisp is a (vx, vy) displacement pair.
            assert isinstance(textdisp, tuple)
            vx, vy = textdisp
            vx = fontsize * 0.5 if vx is None else vx * fontsize * 0.001
            vy = (1000 - vy) * fontsize * 0.001
            corner_a = (-vx, vy + rise + self.adv)
            corner_b = (-vx + fontsize, vy + rise)
        else:
            # Horizontal writing: the box starts at the descent line.
            baseline_low = font.get_descent() * fontsize + rise
            corner_a = (0, baseline_low)
            corner_b = (self.adv, baseline_low + fontsize)

        a, b, c, d, _e, _f = self.matrix
        self.upright = a * d * scaling > 0 and b * c <= 0

        # Map both corners to device space and normalise their order.
        x0, y0 = apply_matrix_pt(self.matrix, corner_a)
        x1, y1 = apply_matrix_pt(self.matrix, corner_b)
        if x0 > x1:
            x0, x1 = x1, x0
        if y0 > y1:
            y0, y1 = y1, y0
        LTComponent.__init__(self, (x0, y0, x1, y1))

        # Vertical fonts and matrices with a zero first entry take the box
        # width as the glyph size; everything else uses the height.
        if font.is_vertical() or matrix[0] == 0:
            self.size = self.width
        else:
            self.size = self.height

    def __repr__(self) -> str:
        return (
            f"<{self.__class__.__name__} {bbox2str(self.bbox)} "
            f"matrix={matrix2str(self.matrix)} font={self.fontname!r} "
            f"adv={self.adv} text={self.get_text()!r}>"
        )

    def get_text(self) -> str:
        """Return the glyph's text."""
        return self._text
171
+
172
+
173
class Paragraph:
    """Mutable layout properties tracked for one paragraph."""

    def __init__(
        self, y: float, x: float, x0: float, x1: float, size: float, brk: bool
    ):
        # Starting position of the paragraph.
        self.y = y  # initial vertical coordinate
        self.x = x  # initial horizontal coordinate
        # Horizontal extent.
        self.x0 = x0  # left boundary
        self.x1 = x1  # right boundary
        self.size = size  # font size
        self.brk = brk  # line-break flag
181
+
182
+
183
+ # fmt: off
184
+ class TranslateConverter(PDFConverterEx):
185
+ def __init__(
186
+ self,
187
+ rsrcmgr,
188
+ vfont: str | None = None,
189
+ vchar: str | None = None,
190
+ thread: int = 0,
191
+ layout: dict | None = None,
192
+ lang_in: str = "", # 保留参数但添加未使用标记
193
+ _lang_out: str = "", # 改为未使用参数
194
+ _service: str = "", # 改为未使用参数
195
+ resfont: str = "",
196
+ noto: Font | None = None,
197
+ envs: dict | None = None,
198
+ _prompt: list | None = None, # 改为未使用参数
199
+ il_creater: ILCreater | None = None,
200
+ ):
201
+ layout = layout or {}
202
+ super().__init__(rsrcmgr, il_creater)
203
+ self.vfont = vfont
204
+ self.vchar = vchar
205
+ self.thread = thread
206
+ self.layout = layout
207
+ self.resfont = resfont
208
+ self.noto = noto
209
+
210
+ def receive_layout(self, ltpage: LTPage):
211
+ # 段落
212
+ sstk: list[str] = [] # 段落文字栈
213
+ pstk: list[Paragraph] = [] # 段落属性栈
214
+ vbkt: int = 0 # 段落公式括号计数
215
+ # 公式组
216
+ vstk: list[LTChar] = [] # 公式符号组
217
+ vlstk: list[LTLine] = [] # 公式线条组
218
+ vfix: float = 0 # 公式纵向偏移
219
+ # 公式组栈
220
+ var: list[list[LTChar]] = [] # 公式符号组栈
221
+ varl: list[list[LTLine]] = [] # 公式线条组栈
222
+ varf: list[float] = [] # 公式纵向偏移栈
223
+ vlen: list[float] = [] # 公式宽度栈
224
+ # 全局
225
+ lstk: list[LTLine] = [] # 全局线条栈
226
+ xt: LTChar = None # 上一个字符
227
+ xt_cls: int = -1 # 上一个字符所属段落,保证无论第一个字符属于哪个类别都可以触发新段落
228
+ vmax: float = ltpage.width / 4 # 行内公式最大宽度
229
+ ops: str = "" # 渲染结果
230
+
231
+ def vflag(font: str, char: str): # 匹配公式(和角标)字体
232
+ if isinstance(font, bytes): # 不一定能 decode,直接转 str
233
+ font = str(font)
234
+ font = font.split("+")[-1] # 字体名截断
235
+ if re.match(r"\(cid:", char):
236
+ return True
237
+ # 基于字体名规则的判定
238
+ if self.vfont:
239
+ if re.match(self.vfont, font):
240
+ return True
241
+ else:
242
+ if re.match( # latex 字体
243
+ r"(CM[^R]|(MS|XY|MT|BL|RM|EU|LA|RS)[A-Z]|LINE|LCIRCLE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)",
244
+ font,
245
+ ):
246
+ return True
247
+ # 基于字符集规则的判定
248
+ if self.vchar:
249
+ if re.match(self.vchar, char):
250
+ return True
251
+ else:
252
+ if (
253
+ char
254
+ and char != " " # 非空格
255
+ and (
256
+ unicodedata.category(char[0])
257
+ in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"] # 文字修饰符、数学符号、分隔符号
258
+ or ord(char[0]) in range(0x370, 0x400) # 希腊字母
259
+ )
260
+ ):
261
+ return True
262
+ return False
263
+
264
+ ############################################################
265
+ # A. 原文档解析
266
+ for child in ltpage:
267
+ if isinstance(child, LTChar):
268
+ self.il_creater.on_lt_char(child)
269
+ continue
270
+ cur_v = False
271
+ layout = self.layout[ltpage.pageid]
272
+ # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
273
+ h, w = layout.shape
274
+ # 读取当前字符在 layout 中的类别
275
+ cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
276
+ cls = layout[cy, cx]
277
+ # 锚定文档中 bullet 的位置
278
+ if child.get_text() == "•":
279
+ cls = 0
280
+ # 判定当前字符是否属于公式
281
+ if ( # 判定当前字符是否属于公式
282
+ cls == 0 # 1. 类别为保留区域
283
+ or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1].size * 0.79) # 2. 角标字体,有 0.76 的角标和 0.799 的大写,这里用 0.79 取中,同时考虑首字母放大的情况
284
+ or vflag(child.fontname, child.get_text()) # 3. 公式字体
285
+ or (child.matrix[0] == 0 and child.matrix[3] == 0) # 4. 垂直字体
286
+ ):
287
+ cur_v = True
288
+ # 判定括号组是否属于公式
289
+ if not cur_v:
290
+ if vstk and child.get_text() == "(":
291
+ cur_v = True
292
+ vbkt += 1
293
+ if vbkt and child.get_text() == ")":
294
+ cur_v = True
295
+ vbkt -= 1
296
+ if ( # 判定当前公式是否结束
297
+ not cur_v # 1. 当前字符不属于公式
298
+ or cls != xt_cls # 2. 当前字符与前一个字符不属于同一段落
299
+ # or (abs(child.x0 - xt.x0) > vmax and cls != 0) # 3. 段落内换行,可能是一长串斜体的段落,也可能是段内分式换行,这里设个阈值进行区分
300
+ # 禁止纯公式(代码)段落换行,直到文字开始再重开文字段落,保证只存在两种情况
301
+ # A. 纯公式(代码)段落(锚定绝对位置)sstk[-1]=="" -> sstk[-1]=="{v*}"
302
+ # B. 文字开头段落(排版相对位置)sstk[-1]!=""
303
+ or (sstk[-1] != "" and abs(child.x0 - xt.x0) > vmax) # 因为 cls==xt_cls==0 一定有 sstk[-1]=="",所以这里不需要再判定 cls!=0
304
+ ):
305
+ if vstk:
306
+ if ( # 根据公式右侧的文字修正公式的纵向偏移
307
+ not cur_v # 1. 当前字符不属于公式
308
+ and cls == xt_cls # 2. 当前字符与前一个字符属于同一段落
309
+ and child.x0 > max([vch.x0 for vch in vstk]) # 3. 当前字符在公式右侧
310
+ ):
311
+ vfix = vstk[0].y0 - child.y0
312
+ if sstk[-1] == "":
313
+ xt_cls = -1 # 禁止纯公式段落(sstk[-1]=="{v*}")的后续连接,但是要考虑新字符和后续字符的连接,所以这里修改的是上个字符的类别
314
+ sstk[-1] += f"{{v{len(var)}}}"
315
+ var.append(vstk)
316
+ varl.append(vlstk)
317
+ varf.append(vfix)
318
+ vstk = []
319
+ vlstk = []
320
+ vfix = 0
321
+ # 当前字符不属于公式或当前字符是公式的第一个字符
322
+ if not vstk:
323
+ if cls == xt_cls: # 当前字符与前一个字符属于同一段落
324
+ if child.x0 > xt.x1 + 1: # 添加行内空格
325
+ sstk[-1] += " "
326
+ elif child.x1 < xt.x0: # 添加换行空格并标记原文段落存在换行
327
+ sstk[-1] += " "
328
+ pstk[-1].brk = True
329
+ else: # 根据当前字符构建一个新的段落
330
+ sstk.append("")
331
+ pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, False))
332
+ if not cur_v: # 文字入栈
333
+ if ( # 根据当前字符修正段落属性
334
+ child.size > pstk[-1].size / 0.79 # 1. 当前字符显著比段落字体大
335
+ or len(sstk[-1].strip()) == 1 # 2. 当前字符为段落第二个文字(考虑首字母放大的情况)
336
+ ) and child.get_text() != " ": # 3. 当前字符不是空格
337
+ pstk[-1].y -= child.size - pstk[-1].size # 修正段落初始纵坐标,假设两个不同大小字符的上边界对齐
338
+ pstk[-1].size = child.size
339
+ sstk[-1] += child.get_text()
340
+ else: # 公式入栈
341
+ if ( # 根据公式左侧的文字修正公式的纵向偏移
342
+ not vstk # 1. 当前字符是公式的第一个字符
343
+ and cls == xt_cls # 2. 当前字符与前一个字符属于同一段落
344
+ and child.x0 > xt.x0 # 3. 前一个字符在公式左侧
345
+ ):
346
+ vfix = child.y0 - xt.y0
347
+ vstk.append(child)
348
+ # 更新段落边界,因为段落内换行之后可能是公式开头,所以要在外边处理
349
+ pstk[-1].x0 = min(pstk[-1].x0, child.x0)
350
+ pstk[-1].x1 = max(pstk[-1].x1, child.x1)
351
+ # 更新上一个字符
352
+ xt = child
353
+ xt_cls = cls
354
+ elif isinstance(child, LTFigure):
355
+ # 图表
356
+ self.il_creater.on_pdf_figure(child)
357
+ pass
358
+ elif isinstance(child, LTLine): # 线条
359
+ continue
360
+ layout = self.layout[ltpage.pageid]
361
+ # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
362
+ h, w = layout.shape
363
+ # 读取当前线条在 layout 中的类别
364
+ cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
365
+ cls = layout[cy, cx]
366
+ if vstk and cls == xt_cls: # 公式线条
367
+ vlstk.append(child)
368
+ else: # 全局线条
369
+ lstk.append(child)
370
+ else:
371
+ pass
372
+ return
373
+ # 处理结尾
374
+ if vstk: # 公式出栈
375
+ sstk[-1] += f"{{v{len(var)}}}"
376
+ var.append(vstk)
377
+ varl.append(vlstk)
378
+ varf.append(vfix)
379
+ log.debug("\n==========[VSTACK]==========\n")
380
+ for var_id, v in enumerate(var): # 计算公式宽度
381
+ l = max([vch.x1 for vch in v]) - v[0].x0
382
+ log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[var_id])} > v{var_id} = {"".join([ch.get_text() for ch in v])}')
383
+ vlen.append(l)
384
+
385
+ ############################################################
386
+ # B. 段落翻译
387
+ log.debug("\n==========[SSTACK]==========\n")
388
+
389
+ news = sstk.copy()
390
+
391
+ ############################################################
392
+ # C. 新文档排版
393
+ def raw_string(fcur: str, cstk: str): # 编码字符串
394
+ if fcur == 'noto':
395
+ return "".join([f"{self.noto.has_glyph(ord(c)):04x}" for c in cstk])
396
+ elif isinstance(self.fontmap[fcur], PDFCIDFont): # 判断编码长度
397
+ return "".join([f"{ord(c):04x}" for c in cstk])
398
+ else:
399
+ return "".join([f"{ord(c):02x}" for c in cstk])
400
+
401
+ _x, _y = 0, 0
402
+ for para_id, new in enumerate(news):
403
+ x: float = pstk[para_id].x # 段落初始横坐标
404
+ y: float = pstk[para_id].y # 段落初始纵坐标
405
+ x0: float = pstk[para_id].x0 # 段落左边界
406
+ x1: float = pstk[para_id].x1 # 段落右边界
407
+ size: float = pstk[para_id].size # 段落字体大小
408
+ brk: bool = pstk[para_id].brk # 段落换行标记
409
+ cstk: str = "" # 当前文字栈
410
+ fcur: str = None # 当前字体 ID
411
+ tx = x
412
+ fcur_ = fcur
413
+ ptr = 0
414
+ log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[para_id]} | {new}")
415
+ while ptr < len(new):
416
+ vy_regex = re.match(
417
+ r"\{\s*v([\d\s]+)\}", new[ptr:], re.IGNORECASE,
418
+ ) # 匹配 {vn} 公式标记
419
+ mod = 0 # 文字修饰符
420
+ if vy_regex: # 加载公式
421
+ ptr += len(vy_regex.group(0))
422
+ try:
423
+ vid = int(vy_regex.group(1).replace(" ", ""))
424
+ adv = vlen[vid]
425
+ except Exception as e:
426
+ log.debug("Skipping formula placeholder due to: %s", e)
427
+ continue # 翻译器可能会自动补个越界的公式标记
428
+ if var[vid][-1].get_text() and unicodedata.category(var[vid][-1].get_text()[0]) in ["Lm", "Mn", "Sk"]: # 文字修饰符
429
+ mod = var[vid][-1].width
430
+ else: # 加载文字
431
+ ch = new[ptr]
432
+ fcur_ = None
433
+ try:
434
+ if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
435
+ fcur_ = "tiro" # 默认拉丁字体
436
+ except Exception:
437
+ pass
438
+ if fcur_ is None:
439
+ fcur_ = self.resfont # 默认非拉丁字体
440
+ if fcur_ == 'noto':
441
+ adv = self.noto.char_lengths(ch, size)[0]
442
+ else:
443
+ adv = self.fontmap[fcur_].char_width(ord(ch)) * size
444
+ ptr += 1
445
+ if ( # 输出文字缓冲区
446
+ fcur_ != fcur # 1. 字体更新
447
+ or vy_regex # 2. 插入公式
448
+ or x + adv > x1 + 0.1 * size # 3. 到达右边界(可能一整行都被符号化,这里需要考虑浮点误差)
449
+ ):
450
+ if cstk:
451
+ ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
452
+ cstk = ""
453
+ if brk and x + adv > x1 + 0.1 * size: # 到达右边界且原文段落存在换行
454
+ x = x0
455
+ lang_space = {"zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
456
+ # y -= size * lang_space.get(self.translator.lang_out.lower(), 1.1) # 小语种大多适配 1.1
457
+ y -= size * 1.4
458
+ if vy_regex: # 插入公式
459
+ fix = 0
460
+ if fcur is not None: # 段落内公式修正纵向偏移
461
+ fix = varf[vid]
462
+ for vch in var[vid]: # 排版公式字符
463
+ vc = chr(vch.cid)
464
+ ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm <{raw_string(self.fontid[vch.font], vc)}> TJ "
465
+ if log.isEnabledFor(logging.DEBUG):
466
+ lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
467
+ _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
468
+ for l in varl[vid]: # 排版公式线条
469
+ if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景
470
+ ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
471
+ else: # 插入文字缓冲区
472
+ if not cstk: # 单行开头
473
+ tx = x
474
+ if x == x0 and ch == " ": # 消除段落换行空格
475
+ adv = 0
476
+ else:
477
+ cstk += ch
478
+ else:
479
+ cstk += ch
480
+ adv -= mod # 文字修饰符
481
+ fcur = fcur_
482
+ x += adv
483
+ if log.isEnabledFor(logging.DEBUG):
484
+ lstk.append(LTLine(0.1, (_x, _y), (x, y)))
485
+ _x, _y = x, y
486
+ # 处理结尾
487
+ if cstk:
488
+ ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm <{raw_string(fcur, cstk)}> TJ "
489
+ for l in lstk: # 排版全局线条
490
+ if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景
491
+ ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
492
+ ops = f"BT {ops}ET "
493
+ return ops
src/pdf2u/document_il/__init__.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pdf2u.document_il.il_version_1 import (
2
+ BaseOperations,
3
+ Box,
4
+ Cropbox,
5
+ Document,
6
+ GraphicState,
7
+ Mediabox,
8
+ Page,
9
+ PageLayout,
10
+ PdfCharacter,
11
+ PdfFigure,
12
+ PdfFont,
13
+ PdfFormula,
14
+ PdfLine,
15
+ PdfParagraph,
16
+ PdfParagraphComposition,
17
+ PdfRectangle,
18
+ PdfSameStyleCharacters,
19
+ PdfSameStyleUnicodeCharacters,
20
+ PdfStyle,
21
+ PdfXobject,
22
+ )
23
+
24
+ __all__ = [
25
+ "BaseOperations",
26
+ "Box",
27
+ "Cropbox",
28
+ "Document",
29
+ "GraphicState",
30
+ "Mediabox",
31
+ "Page",
32
+ "PageLayout",
33
+ "PdfCharacter",
34
+ "PdfFigure",
35
+ "PdfFont",
36
+ "PdfFormula",
37
+ "PdfLine",
38
+ "PdfParagraph",
39
+ "PdfParagraphComposition",
40
+ "PdfRectangle",
41
+ "PdfSameStyleCharacters",
42
+ "PdfSameStyleUnicodeCharacters",
43
+ "PdfStyle",
44
+ "PdfXobject",
45
+ ]
src/pdf2u/document_il/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (912 Bytes). View file
 
src/pdf2u/document_il/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (706 Bytes). View file
 
src/pdf2u/document_il/__pycache__/il_version_1.cpython-311.pyc ADDED
Binary file (22 kB). View file
 
src/pdf2u/document_il/__pycache__/il_version_1.cpython-312.pyc ADDED
Binary file (17.1 kB). View file
 
src/pdf2u/document_il/__pycache__/xml_converter.cpython-311.pyc ADDED
Binary file (4.42 kB). View file
 
src/pdf2u/document_il/__pycache__/xml_converter.cpython-312.pyc ADDED
Binary file (3.81 kB). View file
 
src/pdf2u/document_il/backend/__init__.py ADDED
File without changes
src/pdf2u/document_il/backend/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (181 Bytes). View file
 
src/pdf2u/document_il/backend/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (169 Bytes). View file
 
src/pdf2u/document_il/backend/__pycache__/pdf_creater.cpython-311.pyc ADDED
Binary file (19.8 kB). View file
 
src/pdf2u/document_il/backend/__pycache__/pdf_creater.cpython-312.pyc ADDED
Binary file (18.5 kB). View file
 
src/pdf2u/document_il/backend/pdf_creater.py ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import re
3
+ from pathlib import Path
4
+
5
+ import pymupdf
6
+ from bitstring import BitStream
7
+
8
+ from pdf2u.document_il import il_version_1
9
+ from pdf2u.document_il.utils.fontmap import FontMapper
10
+ from pdf2u.translation_config import TranslateResult, TranslationConfig
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ SUBSET_FONT_STAGE_NAME = "Subset font"
15
+ SAVE_PDF_STAGE_NAME = "Save PDF"
16
+
17
+
18
class PDFCreater:
    """Write an intermediate-language (IL) document back into PDF files.

    The creater re-opens the original PDF, appends one text-drawing
    instruction per IL character to the page (or form XObject) content
    streams, and saves a translated-only ("mono") file and/or a "dual" file
    that interleaves original and translated pages.
    """

    stage_name = "Generate drawing instructions"

    def __init__(
        self,
        original_pdf_path: str,
        document: il_version_1.Document,
        translation_config: TranslationConfig,
    ):
        """Store the inputs and build the font mapper.

        Args:
            original_pdf_path: Path of the PDF that is re-opened for writing.
            document: IL document whose pages are rendered.
            translation_config: Configuration; its ``font`` attribute is kept
                as ``font_path`` and it is passed to ``FontMapper``.
        """
        self.original_pdf_path = original_pdf_path
        self.docs = document
        self.font_path = translation_config.font
        self.font_mapper = FontMapper(translation_config)
        self.translation_config = translation_config

    def render_graphic_state(
        self, draw_op: BitStream, graphic_state: il_version_1.GraphicState
    ):
        """Append the pass-through operators of ``graphic_state`` to ``draw_op``.

        Nothing is appended when ``graphic_state`` is ``None`` or carries no
        pass-through instruction. Only the recorded pass-through instruction
        string is replayed; no separate colour-space or colour operators are
        generated here.
        """
        if graphic_state is None:
            return
        if graphic_state.passthrough_per_char_instruction:
            draw_op.append(
                f"{graphic_state.passthrough_per_char_instruction} \n".encode()
            )

    def render_paragraph_to_char(
        self, paragraph: il_version_1.PdfParagraph
    ) -> list[il_version_1.PdfCharacter]:
        """Return the characters of ``paragraph`` in composition order.

        Compositions whose ``pdf_character`` is not a ``PdfCharacter`` are
        logged and skipped. A paragraph that has ``unicode`` text but yields
        no characters is reported as not yet formatted.
        """
        chars = []
        for composition in paragraph.pdf_paragraph_composition:
            if not isinstance(composition.pdf_character, il_version_1.PdfCharacter):
                logger.error(
                    "Unknown composition type. "
                    "This type only appears in the IL "
                    "after the translation is completed. "
                    "During pdf rendering, this type is not supported. "
                    "Composition: %s. Paragraph: %s.",
                    composition,
                    paragraph,
                )
                continue
            chars.append(composition.pdf_character)
        if not chars and paragraph.unicode:
            logger.error(
                "Unable to export paragraphs that have not yet been formatted: %s",
                paragraph,
            )
        return chars

    def get_available_font_list(self, pdf, page):
        """Return the set of font resource names declared for ``page``."""
        page_xref_id = pdf[page.page_number].xref
        return self.get_xobj_available_fonts(page_xref_id, pdf)

    def get_xobj_available_fonts(self, page_xref_id, pdf):
        """Return the font resource names declared by the object ``page_xref_id``.

        The ``/Resources`` entry may be an indirect reference or an inline
        dictionary, and its ``/Font`` entry may again be either form.

        Args:
            page_xref_id: xref number of a page or form XObject.
            pdf: Document providing ``xref_get_key`` and ``xref_object``.

        Returns:
            A set of font names (without the leading ``/``); empty when the
            object has no resources or no fonts.
        """
        resources_type, r_id = pdf.xref_get_key(page_xref_id, "Resources")
        if resources_type == "null":
            # No /Resources entry on this object: there is nothing to parse.
            return set()
        if resources_type == "xref":
            resource_xref_id = re.search(r"(\d+) 0 R", r_id).group(1)
            r_id = pdf.xref_object(int(resource_xref_id))
            resources_type = "dict"
        if resources_type == "dict":
            font_ref = re.search(r"/Font (\d+) 0 R", r_id)
            if font_ref is not None:
                font_dict = pdf.xref_object(int(font_ref.group(1)))
            else:
                inline = re.search(r"/Font *<<(.+?)>>", r_id.replace("\n", " "))
                if inline is None:
                    # Have resources but no fonts
                    return set()
                font_dict = inline.group(1)
        else:
            r_id = int(r_id.split(" ")[0])
            _, font_dict = pdf.xref_get_key(r_id, "Font")
        return set(re.findall(r"/([^ ]+?) ", font_dict))

    def _debug_render_rectangle(
        self, draw_op: BitStream, rectangle: il_version_1.PdfRectangle
    ):
        """Draw the outline of ``rectangle`` as four stroked lines.

        Args:
            draw_op: BitStream to append PDF drawing operations to.
            rectangle: Rectangle whose ``box`` gives the corners and whose
                graphic state's pass-through instruction is emitted before
                the lines are stroked.
        """
        x1 = rectangle.box.x
        y1 = rectangle.box.y
        x2 = rectangle.box.x2
        y2 = rectangle.box.y2
        # Save graphics state
        draw_op.append(b"q ")
        # Replay the rectangle's own pass-through instruction, then set the
        # line width.
        draw_op.append(
            rectangle.graphic_state.passthrough_per_char_instruction.encode()
        )
        draw_op.append(b" 1 w ")
        # Bottom, right, top and left edges, each stroked separately.
        draw_op.append(f"{x1} {y1} m {x2} {y1} l S ".encode())
        draw_op.append(f"{x2} {y1} m {x2} {y2} l S ".encode())
        draw_op.append(f"{x2} {y2} m {x1} {y2} l S ".encode())
        draw_op.append(f"{x1} {y2} m {x1} {y1} l S ".encode())
        # Restore graphics state
        draw_op.append(b"Q\n")

    def _collect_chars(self, page):
        """Return page-level characters followed by all paragraph characters."""
        chars = []
        if page.pdf_character:
            chars.extend(page.pdf_character)
        for paragraph in page.pdf_paragraph:
            chars.extend(self.render_paragraph_to_char(paragraph))
        return chars

    def _render_char(self, draw_op, char, encoding_length_map):
        """Append the operators that draw one character to ``draw_op``.

        ``encoding_length_map`` maps font id to the number of bytes per
        character code; a missing font id raises ``KeyError``.
        """
        char_size = char.pdf_style.font_size
        font_id = char.pdf_style.font_id
        draw_op.append(b"q ")
        self.render_graphic_state(draw_op, char.pdf_style.graphic_state)
        if char.vertical:
            draw_op.append(
                f"BT /{font_id} {char_size:f} Tf 0 1 -1 0 {char.box.x2:f} {char.box.y:f} Tm ".encode()
            )
        else:
            draw_op.append(
                f"BT /{font_id} {char_size:f} Tf 1 0 0 1 {char.box.x:f} {char.box.y:f} Tm ".encode()
            )
        encoding_length = encoding_length_map[font_id]
        # pdf32000-2008 page14:
        # As hexadecimal data enclosed in angle brackets < >
        # see 7.3.4.3, "Hexadecimal Strings."
        draw_op.append(
            f"<{char.pdf_character_id:0{encoding_length * 2}x}>".upper().encode()
        )
        draw_op.append(b" Tj ET Q \n")

    def _new_page_op(self, page, base_op):
        """Start a page stream: ``q {base_op} Q q Q 1 0 0 1 {x0} {y0} cm``.

        ``base_op`` is skipped when it is ``None``.
        """
        page_op = BitStream()
        page_op.append(b"q ")
        if base_op is not None:
            page_op.append(base_op)
        page_op.append(b" Q ")
        page_op.append(
            f"q Q 1 0 0 1 {page.cropbox.box.x} {page.cropbox.box.y} cm \n".encode()
        )
        return page_op

    def write_debug_info(
        self, pdf: pymupdf.Document, translation_config: TranslationConfig
    ):
        """Overlay debug-marked characters and rectangles onto ``pdf``.

        For every IL page the existing content stream is wrapped and the
        characters/rectangles whose ``debug_info`` flag is set are appended;
        the stream is then updated in place and fonts are subset.
        """
        self.font_mapper.add_font(pdf, self.docs)

        for page in self.docs.page:
            _, r_id = pdf.xref_get_key(pdf[page.page_number].xref, "Contents")
            resource_xref_id = re.search(r"(\d+) 0 R", r_id).group(1)
            base_op = pdf.xref_stream(int(resource_xref_id))
            translation_config.raise_if_cancelled()
            available_font_list = self.get_available_font_list(pdf, page)
            page_encoding_length_map = {
                f.font_id: f.encoding_length for f in page.pdf_font
            }
            page_op = self._new_page_op(page, base_op)

            for char in self._collect_chars(page):
                if not getattr(char, "debug_info", False):
                    continue
                if char.char_unicode == "\n":
                    continue
                if char.pdf_character_id is None:
                    # dummy char
                    continue
                if char.pdf_style.font_id not in available_font_list:
                    continue
                self._render_char(page_op, char, page_encoding_length_map)
            for rect in page.pdf_rectangle:
                if not rect.debug_info:
                    continue
                self._debug_render_rectangle(page_op, rect)
            pdf.update_stream(int(resource_xref_id), page_op.tobytes())
        translation_config.raise_if_cancelled()
        pdf.subset_fonts(fallback=False)

    def _render_page(self, pdf, page):
        """Generate and install the content streams for one IL page."""
        xobj_available_fonts = {}
        xobj_draw_ops = {}
        xobj_encoding_length_map = {}
        available_font_list = self.get_available_font_list(pdf, page)

        for xobj in page.pdf_xobject:
            # An XObject may use the page's fonts plus its own resources.
            xobj_available_fonts[xobj.xobj_id] = available_font_list.copy()
            try:
                xobj_available_fonts[xobj.xobj_id].update(
                    self.get_xobj_available_fonts(xobj.xref_id, pdf)
                )
            except Exception:
                # Best effort: fall back to the page-level fonts only.
                logger.debug(
                    "Failed to read fonts of xobject %s", xobj.xref_id, exc_info=True
                )
            xobj_encoding_length_map[xobj.xobj_id] = {
                f.font_id: f.encoding_length for f in xobj.pdf_font
            }
            xobj_op = BitStream()
            xobj_op.append(xobj.base_operations.value.encode())
            xobj_draw_ops[xobj.xobj_id] = xobj_op

        page_encoding_length_map = {
            f.font_id: f.encoding_length for f in page.pdf_font
        }
        page_op = self._new_page_op(page, page.base_operations.value.encode())

        for char in self._collect_chars(page):
            if char.char_unicode == "\n":
                continue
            if char.pdf_character_id is None:
                # dummy char
                continue
            font_id = char.pdf_style.font_id
            if char.xobj_id in xobj_available_fonts:
                if font_id not in xobj_available_fonts[char.xobj_id]:
                    continue
                draw_op = xobj_draw_ops[char.xobj_id]
                encoding_length_map = xobj_encoding_length_map[char.xobj_id]
            else:
                if font_id not in available_font_list:
                    continue
                draw_op = page_op
                encoding_length_map = page_encoding_length_map
            self._render_char(draw_op, char, encoding_length_map)

        for xobj in page.pdf_xobject:
            pdf.update_stream(xobj.xref_id, xobj_draw_ops[xobj.xobj_id].tobytes())
        for rect in page.pdf_rectangle:
            self._debug_render_rectangle(page_op, rect)
        op_container = pdf.get_new_xref()
        # Since this is a draw instruction container,
        # no additional information is needed
        pdf.update_object(op_container, "<<>>")
        pdf.update_stream(op_container, page_op.tobytes())
        pdf[page.page_number].set_contents(op_container)

    def _save_dual(self, pdf, dual_out_path, translation_config):
        """Save a PDF that interleaves original pages with pages of ``pdf``."""
        # The context manager closes the second document once it is saved.
        with pymupdf.open(self.original_pdf_path) as dual:
            if translation_config.debug:
                translation_config.raise_if_cancelled()
                try:
                    self.write_debug_info(dual, translation_config)
                except Exception:
                    logger.warning(
                        "Failed to write debug info to dual PDF", exc_info=True
                    )
            dual.insert_file(pdf)
            page_count = pdf.page_count
            for page_id in range(page_count):
                # Translated pages were appended after the originals; move
                # each one next to its original page.
                if translation_config.dual_translate_first:
                    dual.move_page(page_count + page_id, page_id * 2)
                else:
                    dual.move_page(page_count + page_id, page_id * 2 + 1)
            dual.save(
                dual_out_path,
                garbage=3,
                deflate=True,
                clean=not translation_config.skip_clean,
                deflate_fonts=True,
                linear=True,
            )
            if translation_config.debug:
                translation_config.raise_if_cancelled()
                dual.save(
                    f"{dual_out_path}.decompressed.pdf", expand=True, pretty=True
                )

    def write(self, translation_config: TranslationConfig) -> TranslateResult:
        """Render every IL page and save the mono and/or dual output PDFs.

        Args:
            translation_config: Supplies input/output paths, language, the
                ``debug``/``skip_clean``/``no_mono``/``no_dual`` switches and
                cancellation checks. Progress is reported through the monitor
                of the configuration given at construction time.

        Returns:
            ``TranslateResult(mono_out_path, dual_out_path)``; the dual path
            is ``None`` when ``no_dual`` is set.
        """
        basename = Path(translation_config.input_file).stem
        debug_suffix = ".debug" if translation_config.debug else ""
        mono_out_path = translation_config.get_output_file_path(
            f"{basename}{debug_suffix}.{translation_config.lang_out}.mono.pdf"
        )
        progress_monitor = self.translation_config.progress_monitor
        # Open the document in a context manager so it is closed on every
        # exit path, including cancellation.
        with pymupdf.open(self.original_pdf_path) as pdf:
            self.font_mapper.add_font(pdf, self.docs)
            with progress_monitor.stage_start(
                self.stage_name, len(self.docs.page)
            ) as pbar:
                for page in self.docs.page:
                    translation_config.raise_if_cancelled()
                    self._render_page(pdf, page)
                    pbar.advance()
            translation_config.raise_if_cancelled()
            with progress_monitor.stage_start(SUBSET_FONT_STAGE_NAME, 1) as pbar:
                if not translation_config.skip_clean:
                    pdf.subset_fonts(fallback=False)
                pbar.advance()
            with progress_monitor.stage_start(SAVE_PDF_STAGE_NAME, 2) as pbar:
                if not translation_config.no_mono:
                    if translation_config.debug:
                        translation_config.raise_if_cancelled()
                        pdf.save(
                            f"{mono_out_path}.decompressed.pdf",
                            expand=True,
                            pretty=True,
                        )
                    translation_config.raise_if_cancelled()
                    pdf.save(
                        mono_out_path,
                        garbage=3,
                        deflate=True,
                        clean=not translation_config.skip_clean,
                        deflate_fonts=True,
                        linear=True,
                    )
                pbar.advance()
                dual_out_path = None
                if not translation_config.no_dual:
                    dual_out_path = translation_config.get_output_file_path(
                        f"{basename}{debug_suffix}.{translation_config.lang_out}.dual.pdf"
                    )
                    translation_config.raise_if_cancelled()
                    self._save_dual(pdf, dual_out_path, translation_config)
                pbar.advance()
        return TranslateResult(mono_out_path, dual_out_path)
src/pdf2u/document_il/frontend/__init__.py ADDED
File without changes
src/pdf2u/document_il/frontend/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (182 Bytes). View file
 
src/pdf2u/document_il/frontend/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (170 Bytes). View file
 
src/pdf2u/document_il/frontend/__pycache__/il_creater.cpython-311.pyc ADDED
Binary file (19 kB). View file
 
src/pdf2u/document_il/frontend/__pycache__/il_creater.cpython-312.pyc ADDED
Binary file (18 kB). View file
 
src/pdf2u/document_il/frontend/il_creater.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import logging
3
+ import re
4
+
5
+ import pdfminer.pdfinterp
6
+ import pymupdf
7
+ from pdfminer.layout import LTChar, LTFigure
8
+ from pdfminer.pdffont import PDFCIDFont, PDFFont
9
+ from pdfminer.psparser import PSLiteral
10
+
11
+ from pdf2u.document_il import il_version_1
12
+ from pdf2u.translation_config import TranslationConfig
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class ILCreater:
    """Collect PDF parsing events into an intermediate-language document.

    The PDF interpreter calls the ``on_*`` hooks while walking pages, fonts,
    form XObjects and characters; ``create_il`` returns the resulting
    ``il_version_1.Document`` restricted to the pages selected for
    translation.
    """

    stage_name = "Parse PDF and Create Intermediate Representation"

    def __init__(self, translation_config: TranslationConfig):
        """Initialise empty parsing state for ``translation_config``."""
        self.progress = None
        self.current_page: il_version_1.Page = None
        self.mupdf: pymupdf.Document = None
        self.model = translation_config.doc_layout_model
        self.docs = il_version_1.Document(page=[])
        self.stroking_color_space_name = None
        self.non_stroking_color_space_name = None
        self.passthrough_per_char_instruction: list[tuple[str, str]] = []
        self.translation_config = translation_config
        self.passthrough_per_char_instruction_stack: list[list[tuple[str, str]]] = []
        self.xobj_id = 0
        self.xobj_inc = 0
        self.xobj_map: dict[int, il_version_1.PdfXobject] = {}
        self.xobj_stack = []
        # Defined up front so XObject/font hooks cannot hit a missing
        # attribute before the first on_page_start call.
        self.current_page_font_name_id_map: dict[str, str] = {}

    def on_finish(self):
        """Close the progress stage opened by ``on_total_pages``, if any."""
        if self.progress is not None:
            self.progress.__exit__(None, None, None)

    def is_passthrough_per_char_operation(self, operator: str):
        """Return a match object when ``operator`` is a pass-through operator.

        Matching is case-insensitive over sc, scn, g, rg, k, cs, gs and ri.
        """
        return re.match("^(sc|scn|g|rg|k|cs|gs|ri)$", operator, re.IGNORECASE)

    def on_passthrough_per_char(self, operator: str, args: list[str]):
        """Record ``operator`` with ``args``, replacing an earlier entry for it.

        The operator comparison is case-sensitive, so ``rg`` and ``RG`` are
        tracked as separate entries. The new entry is appended at the end.
        """
        if not self.is_passthrough_per_char_operation(operator):
            logger.error("Unknown passthrough_per_char operation: %s", operator)
            return
        args = [self.parse_arg(arg) for arg in args]
        for index, (op, _arg) in enumerate(self.passthrough_per_char_instruction):
            if op == operator:
                del self.passthrough_per_char_instruction[index]
                break
        self.passthrough_per_char_instruction.append((operator, " ".join(args)))

    def remove_latest_passthrough_per_char_instruction(self):
        """Drop the most recently recorded pass-through instruction, if any."""
        if self.passthrough_per_char_instruction:
            self.passthrough_per_char_instruction.pop()

    def parse_arg(self, arg: str):
        """Convert an operand to its textual form in a content stream.

        Strings are returned unchanged, ``PSLiteral`` becomes ``/name`` and
        anything else is converted with ``str``.
        """
        if isinstance(arg, str):
            return arg
        if isinstance(arg, PSLiteral):
            return f"/{arg.name}"
        return str(arg)

    def pop_passthrough_per_char_instruction(self):
        """Restore the instruction list saved by the matching push.

        An unbalanced pop resets the list to empty and logs an error.
        """
        if self.passthrough_per_char_instruction_stack:
            self.passthrough_per_char_instruction = (
                self.passthrough_per_char_instruction_stack.pop()
            )
        else:
            self.passthrough_per_char_instruction = []
            logger.error(
                "pop_passthrough_per_char_instruction error on page: %s",
                self.current_page.page_number,
            )

    def push_passthrough_per_char_instruction(self):
        """Save a copy of the current instruction list on the stack."""
        self.passthrough_per_char_instruction_stack.append(
            self.passthrough_per_char_instruction.copy()
        )

    # pdf32000 page 171
    def on_stroking_color_space(self, color_space_name):
        """Remember the current stroking colour space name."""
        self.stroking_color_space_name = color_space_name

    def on_non_stroking_color_space(self, color_space_name):
        """Remember the current non-stroking colour space name."""
        self.non_stroking_color_space_name = color_space_name

    def on_new_stream(self):
        """Reset colour spaces and pass-through instructions for a new stream."""
        self.stroking_color_space_name = None
        self.non_stroking_color_space_name = None
        self.passthrough_per_char_instruction = []

    def push_xobj(self):
        """Save the font-name map and XObject id, then start a fresh map."""
        self.xobj_stack.append(
            (self.current_page_font_name_id_map.copy(), self.xobj_id)
        )
        self.current_page_font_name_id_map = {}

    def pop_xobj(self):
        """Restore the font-name map and XObject id saved by ``push_xobj``."""
        self.current_page_font_name_id_map, self.xobj_id = self.xobj_stack.pop()

    def on_xobj_begin(self, bbox, xref_id):
        """Register a form XObject on the current page and return its new id."""
        self.push_passthrough_per_char_instruction()
        self.push_xobj()
        self.xobj_inc += 1
        self.xobj_id = self.xobj_inc
        xobject = il_version_1.PdfXobject(
            box=il_version_1.Box(
                x=float(bbox[0]), y=float(bbox[1]), x2=float(bbox[2]), y2=float(bbox[3])
            ),
            xobj_id=self.xobj_id,
            xref_id=xref_id,
        )
        self.current_page.pdf_xobject.append(xobject)
        self.xobj_map[self.xobj_id] = xobject
        return self.xobj_id

    def on_xobj_end(self, xobj_id, base_op):
        """Leave XObject ``xobj_id`` and store its base operations."""
        self.pop_passthrough_per_char_instruction()
        self.pop_xobj()
        xobj = self.xobj_map[xobj_id]
        xobj.base_operations = il_version_1.BaseOperations(value=base_op)
        self.xobj_inc += 1

    def on_page_start(self):
        """Create a new IL page, reset per-page state and append the page."""
        self.current_page = il_version_1.Page(
            pdf_font=[],
            pdf_character=[],
            page_layout=[],
            # currently don't support UserUnit page parameter
            # pdf32000 page 79
            unit="point",
        )
        self.current_page_font_name_id_map = {}
        self.passthrough_per_char_instruction_stack = []
        self.xobj_stack = []
        self.non_stroking_color_space_name = None
        self.stroking_color_space_name = None
        self.docs.page.append(self.current_page)

    def on_page_end(self):
        """Advance the progress stage by one page."""
        self.progress.advance(1)

    def on_page_crop_box(
        self, x0: float | int, y0: float | int, x1: float | int, y1: float | int
    ):
        """Store the crop box of the current page as floats."""
        box = il_version_1.Box(x=float(x0), y=float(y0), x2=float(x1), y2=float(y1))
        self.current_page.cropbox = il_version_1.Cropbox(box=box)

    def on_page_media_box(
        self, x0: float | int, y0: float | int, x1: float | int, y1: float | int
    ):
        """Store the media box of the current page as floats."""
        box = il_version_1.Box(x=float(x0), y=float(y0), x2=float(x1), y2=float(y1))
        self.current_page.mediabox = il_version_1.Mediabox(box=box)

    def on_page_number(self, page_number: int):
        """Store the (non-negative integer) number of the current page."""
        assert isinstance(page_number, int)
        assert page_number >= 0
        self.current_page.page_number = page_number

    def on_page_base_operation(self, operation: str):
        """Store the base content-stream operations of the current page."""
        self.current_page.base_operations = il_version_1.BaseOperations(value=operation)

    def on_page_resource_font(self, font: PDFFont, xref_id: int, font_id: str):
        """Record metadata for a font resource of the page or current XObject.

        Args:
            font: pdfminer font object.
            xref_id: xref number of the font in ``self.mupdf``.
            font_id: Resource name under which the font is referenced.
        """
        font_name = font.fontname
        if isinstance(font_name, bytes):
            try:
                font_name = font_name.decode("utf-8")
            except UnicodeDecodeError:
                font_name = "BASE64:" + base64.b64encode(font_name).decode("utf-8")
        encoding_length = 1
        if isinstance(font, PDFCIDFont):
            try:
                # pdf 32000:2008 page 273
                # Table 118 - Predefined CJK CMap names
                _, encoding = self.mupdf.xref_get_key(xref_id, "Encoding")
                if encoding == "/Identity-H" or encoding == "/Identity-V":
                    encoding_length = 2
                else:
                    _, to_unicode_id = self.mupdf.xref_get_key(xref_id, "ToUnicode")
                    to_unicode_bytes = self.mupdf.xref_stream(
                        int(to_unicode_id.split(" ")[0])
                    )
                    # Codespace bounds are hexadecimal strings, so letters
                    # A-F must be accepted as well as digits.
                    code_range = re.search(
                        rb"begincodespacerange\s*<([0-9A-Fa-f]+)>", to_unicode_bytes
                    ).group(1)
                    encoding_length = len(code_range) // 2
            except Exception:
                # Fall back to the largest CID known to pdfminer; tolerate a
                # missing or empty map instead of raising from the handler.
                unicode_map = getattr(font, "unicode_map", None)
                cid_map = getattr(unicode_map, "cid2unichr", None) or {}
                if cid_map and max(cid_map.keys()) > 255:
                    encoding_length = 2
                else:
                    encoding_length = 1
        try:
            mupdf_font = pymupdf.Font(fontbuffer=self.mupdf.extract_font(xref_id)[3])
            bold = mupdf_font.is_bold
            italic = mupdf_font.is_italic
            monospaced = mupdf_font.is_monospaced
            serif = mupdf_font.is_serif
        except Exception:
            # Style flags are optional metadata; leave them unset on failure.
            bold = None
            italic = None
            monospaced = None
            serif = None
        il_font_metadata = il_version_1.PdfFont(
            name=font_name,
            xref_id=xref_id,
            font_id=font_id,
            encoding_length=encoding_length,
            bold=bold,
            italic=italic,
            monospace=monospaced,
            serif=serif,
            ascent=font.ascent,
            descent=font.descent,
        )
        self.current_page_font_name_id_map[font_name] = font_id
        if self.xobj_id in self.xobj_map:
            self.xobj_map[self.xobj_id].pdf_font.append(il_font_metadata)
        else:
            self.current_page.pdf_font.append(il_font_metadata)

    def create_graphic_state(self, gs: pdfminer.pdfinterp.PDFGraphicState):
        """Build an IL ``GraphicState`` from a pdfminer graphic state.

        Only ``scolor``, ``ncolor`` (stored as lists) and ``linewidth``
        (stored as float) are copied; other attributes are ignored. The
        pass-through string is built from ``gs.passthrough_instruction`` as
        space-joined ``"{arg} {op}"`` pairs.
        """
        graphic_state = il_version_1.GraphicState()
        for k, v in gs.__dict__.items():
            if v is None:
                continue
            if k in ["scolor", "ncolor"]:
                setattr(graphic_state, k, list(v) if isinstance(v, tuple) else [v])
            elif k == "linewidth":
                graphic_state.linewidth = float(v)

        graphic_state.stroking_color_space_name = self.stroking_color_space_name
        graphic_state.non_stroking_color_space_name = self.non_stroking_color_space_name

        graphic_state.passthrough_per_char_instruction = " ".join(
            f"{arg} {op}" for op, arg in gs.passthrough_instruction
        )

        return graphic_state

    def on_lt_char(self, char: LTChar):
        """Convert a pdfminer character into an IL ``PdfCharacter``.

        Characters whose text is longer than one code point and is not a
        ``(cid:N)`` placeholder are dropped. The bounding box is shifted by
        the font descent: along x for vertical text, along y otherwise.
        """
        gs = self.create_graphic_state(char.graphicstate)
        # Get font from current page or xobject
        font = None
        for pdf_font in self.xobj_map.get(self.xobj_id, self.current_page).pdf_font:
            if pdf_font.font_id == char.aw_font_id:
                font = pdf_font
                break

        # Get descent from font; treat a missing value as zero.
        descent = 0
        font_descent = getattr(font, "descent", None) if font else None
        if font_descent is not None:
            descent = font_descent * char.size / 1000

        char_id = char.cid
        char_unicode = char.get_text()
        if "(cid:" not in char_unicode and len(char_unicode) > 1:
            return
        advance = char.adv
        if char.matrix[0] == 0 and char.matrix[3] == 0:
            vertical = True
            bbox = il_version_1.Box(
                x=char.bbox[0] - descent,
                y=char.bbox[1],
                x2=char.bbox[2] - descent,
                y2=char.bbox[3],
            )
        else:
            vertical = False
            # Add descent to y coordinates
            bbox = il_version_1.Box(
                x=char.bbox[0],
                y=char.bbox[1] + descent,
                x2=char.bbox[2],
                y2=char.bbox[3] + descent,
            )
        pdf_style = il_version_1.PdfStyle(
            font_id=char.aw_font_id, font_size=char.size, graphic_state=gs
        )
        pdf_char = il_version_1.PdfCharacter(
            box=bbox,
            pdf_character_id=char_id,
            advance=advance,
            char_unicode=char_unicode,
            vertical=vertical,
            pdf_style=pdf_style,
            xobj_id=char.xobj_id,
        )
        self.current_page.pdf_character.append(pdf_char)

    def create_il(self):
        """Return the document, keeping only pages selected for translation.

        Page numbers are passed to ``should_translate_page`` as 1-based.
        """
        self.docs.page = [
            page
            for page in self.docs.page
            if self.translation_config.should_translate_page(page.page_number + 1)
        ]
        return self.docs

    def on_total_pages(self, total_pages: int):
        """Store the page count and open the progress stage.

        The stage total counts pages for which ``should_translate_page``
        does not return ``False``.
        """
        assert isinstance(total_pages, int)
        assert total_pages > 0
        self.docs.total_pages = total_pages
        total = sum(
            1
            for page in range(total_pages)
            if self.translation_config.should_translate_page(page + 1) is not False
        )
        self.progress = self.translation_config.progress_monitor.stage_start(
            self.stage_name, total
        )

    def on_pdf_figure(self, figure: LTFigure):
        """Append the bounding box of ``figure`` to the current page."""
        box = il_version_1.Box(
            figure.bbox[0], figure.bbox[1], figure.bbox[2], figure.bbox[3]
        )
        self.current_page.pdf_figure.append(il_version_1.PdfFigure(box=box))
src/pdf2u/document_il/il_version_1.py ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field
2
+
3
+
4
@dataclass
class BaseOperations:
    """Binding for the ``baseOperations`` element: one string payload.

    The payload lives in ``value`` and defaults to the empty string.
    """

    class Meta:
        # Element name associated with this class.
        name = "baseOperations"

    # Content of the element; the field metadata flags it as required.
    value: str = field(
        default="",
        metadata={"required": True},
    )
10
+
11
+
12
+ @dataclass
13
+ class Box:
14
+ class Meta:
15
+ name = "box"
16
+
17
+ x: float | None = field(
18
+ default=None, metadata={"type": "Attribute", "required": True}
19
+ )
20
+ y: float | None = field(
21
+ default=None, metadata={"type": "Attribute", "required": True}
22
+ )
23
+ x2: float | None = field(
24
+ default=None, metadata={"type": "Attribute", "required": True}
25
+ )
26
+ y2: float | None = field(
27
+ default=None, metadata={"type": "Attribute", "required": True}
28
+ )
29
+
30
+
31
+ @dataclass
32
+ class GraphicState:
33
+ class Meta:
34
+ name = "graphicState"
35
+
36
+ linewidth: float | None = field(default=None, metadata={"type": "Attribute"})
37
+ dash: list[float] = field(
38
+ default_factory=list,
39
+ metadata={"type": "Attribute", "min_length": 1, "tokens": True},
40
+ )
41
+ flatness: float | None = field(default=None, metadata={"type": "Attribute"})
42
+ intent: str | None = field(default=None, metadata={"type": "Attribute"})
43
+ linecap: int | None = field(default=None, metadata={"type": "Attribute"})
44
+ linejoin: int | None = field(default=None, metadata={"type": "Attribute"})
45
+ miterlimit: float | None = field(default=None, metadata={"type": "Attribute"})
46
+ ncolor: list[float] = field(
47
+ default_factory=list,
48
+ metadata={"type": "Attribute", "min_length": 1, "tokens": True},
49
+ )
50
+ scolor: list[float] = field(
51
+ default_factory=list,
52
+ metadata={"type": "Attribute", "min_length": 1, "tokens": True},
53
+ )
54
+ stroking_color_space_name: str | None = field(
55
+ default=None, metadata={"name": "strokingColorSpaceName", "type": "Attribute"}
56
+ )
57
+ non_stroking_color_space_name: str | None = field(
58
+ default=None,
59
+ metadata={"name": "nonStrokingColorSpaceName", "type": "Attribute"},
60
+ )
61
+ passthrough_per_char_instruction: str | None = field(
62
+ default=None,
63
+ metadata={"name": "passthroughPerCharInstruction", "type": "Attribute"},
64
+ )
65
+
66
+
67
+ @dataclass
68
+ class PdfFont:
69
+ class Meta:
70
+ name = "pdfFont"
71
+
72
+ name: str | None = field(
73
+ default=None, metadata={"type": "Attribute", "required": True}
74
+ )
75
+ font_id: str | None = field(
76
+ default=None, metadata={"name": "fontId", "type": "Attribute", "required": True}
77
+ )
78
+ xref_id: int | None = field(
79
+ default=None, metadata={"name": "xrefId", "type": "Attribute", "required": True}
80
+ )
81
+ encoding_length: int | None = field(
82
+ default=None,
83
+ metadata={"name": "encodingLength", "type": "Attribute", "required": True},
84
+ )
85
+ bold: bool | None = field(default=None, metadata={"type": "Attribute"})
86
+ italic: bool | None = field(default=None, metadata={"type": "Attribute"})
87
+ monospace: bool | None = field(default=None, metadata={"type": "Attribute"})
88
+ serif: bool | None = field(default=None, metadata={"type": "Attribute"})
89
+ ascent: float | None = field(default=None, metadata={"type": "Attribute"})
90
+ descent: float | None = field(default=None, metadata={"type": "Attribute"})
91
+
92
+
93
@dataclass
class Cropbox:
    """Binding for the ``cropbox`` element, which wraps a single ``Box``."""

    class Meta:
        # Element name associated with this class.
        name = "cropbox"

    # Child ``box`` element; flagged as required, defaults to None.
    box: Box | None = field(
        default=None,
        metadata={
            "type": "Element",
            "required": True,
        },
    )
101
+
102
+
103
@dataclass
class Mediabox:
    """Binding for the ``mediabox`` element, which wraps a single ``Box``."""

    class Meta:
        # Element name associated with this class.
        name = "mediabox"

    # Child ``box`` element; flagged as required, defaults to None.
    box: Box | None = field(
        default=None,
        metadata={
            "type": "Element",
            "required": True,
        },
    )
111
+
112
+
113
@dataclass
class PageLayout:
    """Binding for the ``pageLayout`` element.

    Holds a ``Box`` child plus three required attributes: ``id``, ``conf``
    and ``class_name``. All fields default to ``None``.
    """

    class Meta:
        # Element name associated with this class.
        name = "pageLayout"

    box: Box | None = field(
        default=None,
        metadata={
            "type": "Element",
            "required": True,
        },
    )
    id: int | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    # NOTE(review): presumably a detection confidence score -- confirm.
    conf: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    class_name: str | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
130
+
131
+
132
@dataclass
class PdfFigure:
    """Binding for the ``pdfFigure`` element, which wraps a single ``Box``."""

    class Meta:
        # Element name associated with this class.
        name = "pdfFigure"

    # Child ``box`` element; flagged as required, defaults to None.
    box: Box | None = field(
        default=None,
        metadata={
            "type": "Element",
            "required": True,
        },
    )
140
+
141
+
142
@dataclass
class PdfRectangle:
    """Binding for the ``pdfRectangle`` element.

    Carries a required ``Box`` child, a required ``graphicState`` child and
    an optional boolean ``debug_info`` attribute. All fields default to
    ``None``.
    """

    class Meta:
        # Element name associated with this class.
        name = "pdfRectangle"

    box: Box | None = field(
        default=None,
        metadata={
            "type": "Element",
            "required": True,
        },
    )
    graphic_state: GraphicState | None = field(
        default=None,
        metadata={
            "name": "graphicState",
            "type": "Element",
            "required": True,
        },
    )
    debug_info: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
155
+
156
+
157
@dataclass
class PdfStyle:
    """Binding for the ``pdfStyle`` element.

    Combines a required ``graphicState`` child with the required
    ``font_id`` and ``font_size`` attributes. All fields default to ``None``.
    """

    class Meta:
        # Element name associated with this class.
        name = "pdfStyle"

    graphic_state: GraphicState | None = field(
        default=None,
        metadata={
            "name": "graphicState",
            "type": "Element",
            "required": True,
        },
    )
    # No "name" override here: these attributes keep their snake_case names.
    font_id: str | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    font_size: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
172
+
173
+
174
@dataclass
class PdfXobject:
    """Binding for the ``pdfXobject`` element.

    Holds a required ``Box``, any number of ``pdfFont`` children, a required
    ``baseOperations`` child and the required ``xobjId`` / ``xrefId``
    attributes. ``pdf_font`` defaults to a fresh empty list; every other
    field defaults to ``None``.
    """

    class Meta:
        # Element name associated with this class.
        name = "pdfXobject"

    box: Box | None = field(
        default=None,
        metadata={
            "type": "Element",
            "required": True,
        },
    )
    pdf_font: list[PdfFont] = field(
        default_factory=list,
        metadata={
            "name": "pdfFont",
            "type": "Element",
        },
    )
    base_operations: BaseOperations | None = field(
        default=None,
        metadata={
            "name": "baseOperations",
            "type": "Element",
            "required": True,
        },
    )
    xobj_id: int | None = field(
        default=None,
        metadata={
            "name": "xobjId",
            "type": "Attribute",
            "required": True,
        },
    )
    xref_id: int | None = field(
        default=None,
        metadata={
            "name": "xrefId",
            "type": "Attribute",
            "required": True,
        },
    )
195
+
196
+
197
@dataclass
class PdfCharacter:
    """Binding for the ``pdfCharacter`` element: one positioned glyph.

    ``pdf_style``, ``box`` and ``char_unicode`` are flagged as required; the
    remaining attributes are optional. All fields default to ``None``.
    """

    class Meta:
        # Element name associated with this class.
        name = "pdfCharacter"

    pdf_style: PdfStyle | None = field(
        default=None,
        metadata={
            "name": "pdfStyle",
            "type": "Element",
            "required": True,
        },
    )
    box: Box | None = field(
        default=None,
        metadata={
            "type": "Element",
            "required": True,
        },
    )
    vertical: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    scale: float | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    pdf_character_id: int | None = field(
        default=None,
        metadata={
            "name": "pdfCharacterId",
            "type": "Attribute",
        },
    )
    char_unicode: str | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    advance: float | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    # The accompanying .rnc schema annotates this attribute as
    # "xobject nesting depth".
    xobj_id: int | None = field(
        default=None,
        metadata={
            "name": "xobjId",
            "type": "Attribute",
        },
    )
    debug_info: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
221
+
222
+
223
@dataclass
class PdfSameStyleUnicodeCharacters:
    """Binding for the ``pdfSameStyleUnicodeCharacters`` element.

    A run of text given as a plain ``unicode`` string (required attribute)
    with an optional ``pdfStyle`` child and an optional ``debug_info`` flag.
    All fields default to ``None``.
    """

    class Meta:
        # Element name associated with this class.
        name = "pdfSameStyleUnicodeCharacters"

    # Unlike the other style-bearing classes, the style is not marked required.
    pdf_style: PdfStyle | None = field(
        default=None,
        metadata={
            "name": "pdfStyle",
            "type": "Element",
        },
    )
    unicode: str | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    debug_info: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
235
+
236
+
237
@dataclass
class PdfFormula:
    """Binding for the ``pdfFormula`` element.

    Holds a required ``Box``, a list of ``pdfCharacter`` children (metadata
    asks for at least one) and the required ``x_offset`` / ``y_offset``
    attributes. The list defaults to empty; other fields default to ``None``.
    """

    class Meta:
        # Element name associated with this class.
        name = "pdfFormula"

    box: Box | None = field(
        default=None,
        metadata={
            "type": "Element",
            "required": True,
        },
    )
    pdf_character: list[PdfCharacter] = field(
        default_factory=list,
        metadata={
            "name": "pdfCharacter",
            "type": "Element",
            "min_occurs": 1,
        },
    )
    x_offset: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    y_offset: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
255
+
256
+
257
@dataclass
class PdfLine:
    """Binding for the ``pdfLine`` element.

    A required ``Box`` followed by a list of ``pdfCharacter`` children
    (metadata asks for at least one). The list defaults to empty and ``box``
    defaults to ``None``.
    """

    class Meta:
        # Element name associated with this class.
        name = "pdfLine"

    box: Box | None = field(
        default=None,
        metadata={
            "type": "Element",
            "required": True,
        },
    )
    pdf_character: list[PdfCharacter] = field(
        default_factory=list,
        metadata={
            "name": "pdfCharacter",
            "type": "Element",
            "min_occurs": 1,
        },
    )
269
+
270
+
271
@dataclass
class PdfSameStyleCharacters:
    """Binding for the ``pdfSameStyleCharacters`` element.

    Groups ``pdfCharacter`` children (metadata asks for at least one) under
    one required ``Box`` and one required ``pdfStyle``. The list defaults to
    empty; the other fields default to ``None``.
    """

    class Meta:
        # Element name associated with this class.
        name = "pdfSameStyleCharacters"

    box: Box | None = field(
        default=None,
        metadata={
            "type": "Element",
            "required": True,
        },
    )
    pdf_style: PdfStyle | None = field(
        default=None,
        metadata={
            "name": "pdfStyle",
            "type": "Element",
            "required": True,
        },
    )
    pdf_character: list[PdfCharacter] = field(
        default_factory=list,
        metadata={
            "name": "pdfCharacter",
            "type": "Element",
            "min_occurs": 1,
        },
    )
286
+
287
+
288
@dataclass
class PdfParagraphComposition:
    """Binding for the ``pdfParagraphComposition`` element.

    The accompanying .rnc schema defines the content as a choice between a
    line, a formula, a same-style character group, a single character, or a
    same-style unicode run. Here each alternative is an optional field that
    defaults to ``None``; the class itself does not enforce that exactly one
    is set.
    """

    class Meta:
        # Element name associated with this class.
        name = "pdfParagraphComposition"

    pdf_line: PdfLine | None = field(
        default=None,
        metadata={
            "name": "pdfLine",
            "type": "Element",
        },
    )
    pdf_formula: PdfFormula | None = field(
        default=None,
        metadata={
            "name": "pdfFormula",
            "type": "Element",
        },
    )
    pdf_same_style_characters: PdfSameStyleCharacters | None = field(
        default=None,
        metadata={
            "name": "pdfSameStyleCharacters",
            "type": "Element",
        },
    )
    pdf_character: PdfCharacter | None = field(
        default=None,
        metadata={
            "name": "pdfCharacter",
            "type": "Element",
        },
    )
    pdf_same_style_unicode_characters: PdfSameStyleUnicodeCharacters | None = field(
        default=None,
        metadata={
            "name": "pdfSameStyleUnicodeCharacters",
            "type": "Element",
        },
    )
309
+
310
+
311
@dataclass
class PdfParagraph:
    """Binding for the ``pdfParagraph`` element.

    ``box``, ``pdf_style`` and ``unicode`` are flagged as required; the
    composition list and the remaining attributes are optional. The list
    defaults to empty and every other field defaults to ``None``.
    """

    class Meta:
        # Element name associated with this class.
        name = "pdfParagraph"

    box: Box | None = field(
        default=None,
        metadata={
            "type": "Element",
            "required": True,
        },
    )
    pdf_style: PdfStyle | None = field(
        default=None,
        metadata={
            "name": "pdfStyle",
            "type": "Element",
            "required": True,
        },
    )
    pdf_paragraph_composition: list[PdfParagraphComposition] = field(
        default_factory=list,
        metadata={
            "name": "pdfParagraphComposition",
            "type": "Element",
        },
    )
    xobj_id: int | None = field(
        default=None,
        metadata={
            "name": "xobjId",
            "type": "Attribute",
        },
    )
    unicode: str | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    scale: float | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    vertical: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    # Serialized name is PascalCase ("FirstLineIndent"), unlike its siblings.
    first_line_indent: bool | None = field(
        default=None,
        metadata={
            "name": "FirstLineIndent",
            "type": "Attribute",
        },
    )
    debug_id: str | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
338
+
339
+
340
@dataclass
class Page:
    """Binding for the ``page`` element: everything extracted from one page.

    ``mediabox``, ``cropbox``, ``base_operations``, ``page_number`` and
    ``unit`` are flagged as required. The repeated children (xobjects,
    layouts, rectangles, fonts, paragraphs, figures, characters) are lists
    that default to empty; every other field defaults to ``None``.
    """

    class Meta:
        # Element name associated with this class.
        name = "page"

    mediabox: Mediabox | None = field(
        default=None,
        metadata={
            "type": "Element",
            "required": True,
        },
    )
    cropbox: Cropbox | None = field(
        default=None,
        metadata={
            "type": "Element",
            "required": True,
        },
    )
    # Repeated child elements, declared in the same order as the schema.
    pdf_xobject: list[PdfXobject] = field(
        default_factory=list,
        metadata={
            "name": "pdfXobject",
            "type": "Element",
        },
    )
    page_layout: list[PageLayout] = field(
        default_factory=list,
        metadata={
            "name": "pageLayout",
            "type": "Element",
        },
    )
    pdf_rectangle: list[PdfRectangle] = field(
        default_factory=list,
        metadata={
            "name": "pdfRectangle",
            "type": "Element",
        },
    )
    pdf_font: list[PdfFont] = field(
        default_factory=list,
        metadata={
            "name": "pdfFont",
            "type": "Element",
        },
    )
    pdf_paragraph: list[PdfParagraph] = field(
        default_factory=list,
        metadata={
            "name": "pdfParagraph",
            "type": "Element",
        },
    )
    pdf_figure: list[PdfFigure] = field(
        default_factory=list,
        metadata={
            "name": "pdfFigure",
            "type": "Element",
        },
    )
    pdf_character: list[PdfCharacter] = field(
        default_factory=list,
        metadata={
            "name": "pdfCharacter",
            "type": "Element",
        },
    )
    base_operations: BaseOperations | None = field(
        default=None,
        metadata={
            "name": "baseOperations",
            "type": "Element",
            "required": True,
        },
    )
    page_number: int | None = field(
        default=None,
        metadata={
            "name": "pageNumber",
            "type": "Attribute",
            "required": True,
        },
    )
    # Serialized name is capitalised ("Unit").
    unit: str | None = field(
        default=None,
        metadata={
            "name": "Unit",
            "type": "Attribute",
            "required": True,
        },
    )
383
+
384
+
385
@dataclass
class Document:
    """Binding for the root ``document`` element.

    Holds the list of ``page`` children (metadata asks for at least one) and
    the required ``totalPages`` attribute. ``page`` defaults to an empty list
    and ``total_pages`` to ``None``.
    """

    class Meta:
        # Element name associated with this class.
        name = "document"

    page: list[Page] = field(
        default_factory=list,
        metadata={
            "type": "Element",
            "min_occurs": 1,
        },
    )
    total_pages: int | None = field(
        default=None,
        metadata={
            "name": "totalPages",
            "type": "Attribute",
            "required": True,
        },
    )
src/pdf2u/document_il/il_version_1.rnc ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# RELAX NG compact grammar for the document intermediate language (version 1).
# The root element is <document>.
start = Document

# Root: one or more pages plus the page count attribute.
Document =
  element document {
    Page+,
    attribute totalPages { xsd:int }
  }

# One page: its two boxes, the repeated extracted objects, and the
# page-level base operations string.
Page =
  element page {
    element mediabox { Box },
    element cropbox { Box },
    PDFXobject*,
    PageLayout*,
    PDFRectangle*,
    PDFFont*,
    PDFParagraph*,
    PDFFigure*,
    PDFCharacter*,
    attribute pageNumber { xsd:int },
    attribute Unit { xsd:string },
    element baseOperations { xsd:string }
  }

# Rectangle shared by most other patterns.
Box =
  element box {
    # from (x,y) to (x2,y2)
    attribute x { xsd:float },
    attribute y { xsd:float },
    attribute x2 { xsd:float },
    attribute y2 { xsd:float }
  }

# Datatype used by every xrefId attribute.
PDFXrefId = xsd:int

# Font description; the style flags and metrics are optional.
PDFFont =
  element pdfFont {
    attribute name { xsd:string },
    attribute fontId { xsd:string },
    attribute xrefId { PDFXrefId },
    attribute encodingLength { xsd:int },
    attribute bold { xsd:boolean }?,
    attribute italic { xsd:boolean }?,
    attribute monospace { xsd:boolean }?,
    attribute serif { xsd:boolean }?,
    attribute ascent { xsd:float }?,
    attribute descent { xsd:float }?
  }

# XObject with its own box, fonts and base operations string.
PDFXobject =
  element pdfXobject {
    attribute xobjId { xsd:int },
    attribute xrefId { PDFXrefId },
    Box,
    PDFFont*,
    element baseOperations { xsd:string }
  }

# Single character with its style and box; only char_unicode is a
# mandatory attribute.
PDFCharacter =
  element pdfCharacter {
    attribute vertical { xsd:boolean }?,
    attribute scale { xsd:float }?,
    attribute pdfCharacterId { xsd:int }?,
    attribute char_unicode { xsd:string },
    attribute advance { xsd:float }?,
    # xobject nesting depth
    attribute xobjId { xsd:int }?,
    attribute debug_info { xsd:boolean }?,
    PDFStyle,
    Box
  }

# Layout region: id, conf and class_name attributes plus a box.
PageLayout =
  element pageLayout {
    attribute id { xsd:int },
    attribute conf { xsd:float },
    attribute class_name { xsd:string },
    Box
  }

# Graphic state; every attribute is optional, and dash / ncolor / scolor
# are whitespace-separated lists of one or more floats.
GraphicState =
  element graphicState {
    attribute linewidth { xsd:float }?,
    attribute dash {
      list { xsd:float+ }
    }?,
    attribute flatness { xsd:float }?,
    attribute intent { xsd:string }?,
    attribute linecap { xsd:int }?,
    attribute linejoin { xsd:int }?,
    attribute miterlimit { xsd:float }?,
    attribute ncolor {
      list { xsd:float+ }
    }?,
    attribute scolor {
      list { xsd:float+ }
    }?,
    attribute strokingColorSpaceName { xsd:string }?,
    attribute nonStrokingColorSpaceName { xsd:string }?,
    attribute passthroughPerCharInstruction { xsd:string }?
  }

# Text style: font reference, font size and a graphic state.
PDFStyle =
  element pdfStyle {
    attribute font_id { xsd:string },
    attribute font_size { xsd:float },
    GraphicState
  }

# Paragraph: box, style and zero or more composition items; the unicode
# attribute is mandatory.
PDFParagraph =
  element pdfParagraph {
    attribute xobjId { xsd:int }?,
    attribute unicode { xsd:string },
    attribute scale { xsd:float }?,
    attribute vertical { xsd:boolean }?,
    attribute FirstLineIndent { xsd:boolean }?,
    attribute debug_id { xsd:string }?,
    Box,
    PDFStyle,
    PDFParagraphComposition*
  }

# Each composition item holds exactly one of the alternatives below.
PDFParagraphComposition =
  element pdfParagraphComposition {
    PDFLine
    | PDFFormula
    | PDFSameStyleCharacters
    | PDFCharacter
    | PDFSameStyleUnicodeCharacters
  }

# Line: a box followed by one or more characters.
PDFLine = element pdfLine { Box, PDFCharacter+ }

# Characters sharing one style, with a common box.
PDFSameStyleCharacters =
  element pdfSameStyleCharacters { Box, PDFStyle, PDFCharacter+ }

# Text run given as a plain string; the style child is optional.
PDFSameStyleUnicodeCharacters =
  element pdfSameStyleUnicodeCharacters {
    PDFStyle?,
    attribute unicode { xsd:string },
    attribute debug_info { xsd:boolean }?
  }

# Formula: box, one or more characters, and mandatory x/y offsets.
PDFFormula =
  element pdfFormula {
    Box,
    PDFCharacter+,
    attribute x_offset { xsd:float },
    attribute y_offset { xsd:float }
  }

# Figure: only a box.
PDFFigure = element pdfFigure { Box }

# Rectangle: box, graphic state and an optional debug flag.
PDFRectangle =
  element pdfRectangle {
    Box,
    GraphicState,
    attribute debug_info { xsd:boolean }?
  }
src/pdf2u/document_il/il_version_1.rng ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <grammar xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
3
+ <start>
4
+ <ref name="Document"/>
5
+ </start>
6
+ <define name="Document">
7
+ <element name="document">
8
+ <oneOrMore>
9
+ <ref name="Page"/>
10
+ </oneOrMore>
11
+ <attribute name="totalPages">
12
+ <data type="int"/>
13
+ </attribute>
14
+ </element>
15
+ </define>
16
+ <define name="Page">
17
+ <element name="page">
18
+ <element name="mediabox">
19
+ <ref name="Box"/>
20
+ </element>
21
+ <element name="cropbox">
22
+ <ref name="Box"/>
23
+ </element>
24
+ <zeroOrMore>
25
+ <ref name="PDFXobject"/>
26
+ </zeroOrMore>
27
+ <zeroOrMore>
28
+ <ref name="PageLayout"/>
29
+ </zeroOrMore>
30
+ <zeroOrMore>
31
+ <ref name="PDFRectangle"/>
32
+ </zeroOrMore>
33
+ <zeroOrMore>
34
+ <ref name="PDFFont"/>
35
+ </zeroOrMore>
36
+ <zeroOrMore>
37
+ <ref name="PDFParagraph"/>
38
+ </zeroOrMore>
39
+ <zeroOrMore>
40
+ <ref name="PDFFigure"/>
41
+ </zeroOrMore>
42
+ <zeroOrMore>
43
+ <ref name="PDFCharacter"/>
44
+ </zeroOrMore>
45
+ <attribute name="pageNumber">
46
+ <data type="int"/>
47
+ </attribute>
48
+ <attribute name="Unit">
49
+ <data type="string"/>
50
+ </attribute>
51
+ <element name="baseOperations">
52
+ <data type="string"/>
53
+ </element>
54
+ </element>
55
+ </define>
56
+ <define name="Box">
57
+ <element name="box">
58
+ <!-- from (x,y) to (x2,y2) -->
59
+ <attribute name="x">
60
+ <data type="float"/>
61
+ </attribute>
62
+ <attribute name="y">
63
+ <data type="float"/>
64
+ </attribute>
65
+ <attribute name="x2">
66
+ <data type="float"/>
67
+ </attribute>
68
+ <attribute name="y2">
69
+ <data type="float"/>
70
+ </attribute>
71
+ </element>
72
+ </define>
73
+ <define name="PDFXrefId">
74
+ <data type="int"/>
75
+ </define>
76
+ <define name="PDFFont">
77
+ <element name="pdfFont">
78
+ <attribute name="name">
79
+ <data type="string"/>
80
+ </attribute>
81
+ <attribute name="fontId">
82
+ <data type="string"/>
83
+ </attribute>
84
+ <attribute name="xrefId">
85
+ <ref name="PDFXrefId"/>
86
+ </attribute>
87
+ <attribute name="encodingLength">
88
+ <data type="int"/>
89
+ </attribute>
90
+ <optional>
91
+ <attribute name="bold">
92
+ <data type="boolean"/>
93
+ </attribute>
94
+ </optional>
95
+ <optional>
96
+ <attribute name="italic">
97
+ <data type="boolean"/>
98
+ </attribute>
99
+ </optional>
100
+ <optional>
101
+ <attribute name="monospace">
102
+ <data type="boolean"/>
103
+ </attribute>
104
+ </optional>
105
+ <optional>
106
+ <attribute name="serif">
107
+ <data type="boolean"/>
108
+ </attribute>
109
+ </optional>
110
+ <optional>
111
+ <attribute name="ascent">
112
+ <data type="float"/>
113
+ </attribute>
114
+ </optional>
115
+ <optional>
116
+ <attribute name="descent">
117
+ <data type="float"/>
118
+ </attribute>
119
+ </optional>
120
+ </element>
121
+ </define>
122
+ <define name="PDFXobject">
123
+ <element name="pdfXobject">
124
+ <attribute name="xobjId">
125
+ <data type="int"/>
126
+ </attribute>
127
+ <attribute name="xrefId">
128
+ <ref name="PDFXrefId"/>
129
+ </attribute>
130
+ <ref name="Box"/>
131
+ <zeroOrMore>
132
+ <ref name="PDFFont"/>
133
+ </zeroOrMore>
134
+ <element name="baseOperations">
135
+ <data type="string"/>
136
+ </element>
137
+ </element>
138
+ </define>
139
+ <define name="PDFCharacter">
140
+ <element name="pdfCharacter">
141
+ <optional>
142
+ <attribute name="vertical">
143
+ <data type="boolean"/>
144
+ </attribute>
145
+ </optional>
146
+ <optional>
147
+ <attribute name="scale">
148
+ <data type="float"/>
149
+ </attribute>
150
+ </optional>
151
+ <optional>
152
+ <attribute name="pdfCharacterId">
153
+ <data type="int"/>
154
+ </attribute>
155
+ </optional>
156
+ <attribute name="char_unicode">
157
+ <data type="string"/>
158
+ </attribute>
159
+ <optional>
160
+ <attribute name="advance">
161
+ <data type="float"/>
162
+ </attribute>
163
+ </optional>
164
+ <optional>
165
+ <!-- xobject nesting depth -->
166
+ <attribute name="xobjId">
167
+ <data type="int"/>
168
+ </attribute>
169
+ </optional>
170
+ <optional>
171
+ <attribute name="debug_info">
172
+ <data type="boolean"/>
173
+ </attribute>
174
+ </optional>
175
+ <ref name="PDFStyle"/>
176
+ <ref name="Box"/>
177
+ </element>
178
+ </define>
179
+ <define name="PageLayout">
180
+ <element name="pageLayout">
181
+ <attribute name="id">
182
+ <data type="int"/>
183
+ </attribute>
184
+ <attribute name="conf">
185
+ <data type="float"/>
186
+ </attribute>
187
+ <attribute name="class_name">
188
+ <data type="string"/>
189
+ </attribute>
190
+ <ref name="Box"/>
191
+ </element>
192
+ </define>
193
+ <define name="GraphicState">
194
+ <element name="graphicState">
195
+ <optional>
196
+ <attribute name="linewidth">
197
+ <data type="float"/>
198
+ </attribute>
199
+ </optional>
200
+ <optional>
201
+ <attribute name="dash">
202
+ <list>
203
+ <oneOrMore>
204
+ <data type="float"/>
205
+ </oneOrMore>
206
+ </list>
207
+ </attribute>
208
+ </optional>
209
+ <optional>
210
+ <attribute name="flatness">
211
+ <data type="float"/>
212
+ </attribute>
213
+ </optional>
214
+ <optional>
215
+ <attribute name="intent">
216
+ <data type="string"/>
217
+ </attribute>
218
+ </optional>
219
+ <optional>
220
+ <attribute name="linecap">
221
+ <data type="int"/>
222
+ </attribute>
223
+ </optional>
224
+ <optional>
225
+ <attribute name="linejoin">
226
+ <data type="int"/>
227
+ </attribute>
228
+ </optional>
229
+ <optional>
230
+ <attribute name="miterlimit">
231
+ <data type="float"/>
232
+ </attribute>
233
+ </optional>
234
+ <optional>
235
+ <attribute name="ncolor">
236
+ <list>
237
+ <oneOrMore>
238
+ <data type="float"/>
239
+ </oneOrMore>
240
+ </list>
241
+ </attribute>
242
+ </optional>
243
+ <optional>
244
+ <attribute name="scolor">
245
+ <list>
246
+ <oneOrMore>
247
+ <data type="float"/>
248
+ </oneOrMore>
249
+ </list>
250
+ </attribute>
251
+ </optional>
252
+ <optional>
253
+ <attribute name="strokingColorSpaceName">
254
+ <data type="string"/>
255
+ </attribute>
256
+ </optional>
257
+ <optional>
258
+ <attribute name="nonStrokingColorSpaceName">
259
+ <data type="string"/>
260
+ </attribute>
261
+ </optional>
262
+ <optional>
263
+ <attribute name="passthroughPerCharInstruction">
264
+ <data type="string"/>
265
+ </attribute>
266
+ </optional>
267
+ </element>
268
+ </define>
269
+ <define name="PDFStyle">
270
+ <element name="pdfStyle">
271
+ <attribute name="font_id">
272
+ <data type="string"/>
273
+ </attribute>
274
+ <attribute name="font_size">
275
+ <data type="float"/>
276
+ </attribute>
277
+ <ref name="GraphicState"/>
278
+ </element>
279
+ </define>
280
+ <define name="PDFParagraph">
281
+ <element name="pdfParagraph">
282
+ <optional>
283
+ <attribute name="xobjId">
284
+ <data type="int"/>
285
+ </attribute>
286
+ </optional>
287
+ <attribute name="unicode">
288
+ <data type="string"/>
289
+ </attribute>
290
+ <optional>
291
+ <attribute name="scale">
292
+ <data type="float"/>
293
+ </attribute>
294
+ </optional>
295
+ <optional>
296
+ <attribute name="vertical">
297
+ <data type="boolean"/>
298
+ </attribute>
299
+ </optional>
300
+ <optional>
301
+ <attribute name="FirstLineIndent">
302
+ <data type="boolean"/>
303
+ </attribute>
304
+ </optional>
305
+ <optional>
306
+ <attribute name="debug_id">
307
+ <data type="string"/>
308
+ </attribute>
309
+ </optional>
310
+ <ref name="Box"/>
311
+ <ref name="PDFStyle"/>
312
+ <zeroOrMore>
313
+ <ref name="PDFParagraphComposition"/>
314
+ </zeroOrMore>
315
+ </element>
316
+ </define>
317
+ <define name="PDFParagraphComposition">
318
+ <element name="pdfParagraphComposition">
319
+ <choice>
320
+ <ref name="PDFLine"/>
321
+ <ref name="PDFFormula"/>
322
+ <ref name="PDFSameStyleCharacters"/>
323
+ <ref name="PDFCharacter"/>
324
+ <ref name="PDFSameStyleUnicodeCharacters"/>
325
+ </choice>
326
+ </element>
327
+ </define>
328
+ <define name="PDFLine">
329
+ <element name="pdfLine">
330
+ <ref name="Box"/>
331
+ <oneOrMore>
332
+ <ref name="PDFCharacter"/>
333
+ </oneOrMore>
334
+ </element>
335
+ </define>
336
+ <define name="PDFSameStyleCharacters">
337
+ <element name="pdfSameStyleCharacters">
338
+ <ref name="Box"/>
339
+ <ref name="PDFStyle"/>
340
+ <oneOrMore>
341
+ <ref name="PDFCharacter"/>
342
+ </oneOrMore>
343
+ </element>
344
+ </define>
345
+ <define name="PDFSameStyleUnicodeCharacters">
346
+ <element name="pdfSameStyleUnicodeCharacters">
347
+ <optional>
348
+ <ref name="PDFStyle"/>
349
+ </optional>
350
+ <attribute name="unicode">
351
+ <data type="string"/>
352
+ </attribute>
353
+ <optional>
354
+ <attribute name="debug_info">
355
+ <data type="boolean"/>
356
+ </attribute>
357
+ </optional>
358
+ </element>
359
+ </define>
360
+ <define name="PDFFormula">
361
+ <element name="pdfFormula">
362
+ <ref name="Box"/>
363
+ <oneOrMore>
364
+ <ref name="PDFCharacter"/>
365
+ </oneOrMore>
366
+ <attribute name="x_offset">
367
+ <data type="float"/>
368
+ </attribute>
369
+ <attribute name="y_offset">
370
+ <data type="float"/>
371
+ </attribute>
372
+ </element>
373
+ </define>
374
+ <define name="PDFFigure">
375
+ <element name="pdfFigure">
376
+ <ref name="Box"/>
377
+ </element>
378
+ </define>
379
+ <define name="PDFRectangle">
380
+ <element name="pdfRectangle">
381
+ <ref name="Box"/>
382
+ <ref name="GraphicState"/>
383
+ <optional>
384
+ <attribute name="debug_info">
385
+ <data type="boolean"/>
386
+ </attribute>
387
+ </optional>
388
+ </element>
389
+ </define>
390
+ </grammar>
src/pdf2u/document_il/il_version_1.xsd ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified">
3
<!-- document: contains one or more page elements (minOccurs defaults to 1, so an empty document is invalid) and a required integer attribute "totalPages". -->
+ <xs:element name="document">
4
+ <xs:complexType>
5
+ <xs:sequence>
6
+ <xs:element maxOccurs="unbounded" ref="page"/>
7
+ </xs:sequence>
8
+ <xs:attribute name="totalPages" use="required" type="xs:int"/>
9
+ </xs:complexType>
10
+ </xs:element>
11
<!-- page: a strict xs:sequence, so children must appear in exactly this order: one mediabox, one cropbox, then any number (including zero) of pdfXobject, pageLayout, pdfRectangle, pdfFont, pdfParagraph, pdfFigure and pdfCharacter elements grouped by kind, and finally one mandatory baseOperations. Attributes "pageNumber" (int) and "Unit" (string, capitalised as written) are both required. -->
+ <xs:element name="page">
12
+ <xs:complexType>
13
+ <xs:sequence>
14
+ <xs:element ref="mediabox"/>
15
+ <xs:element ref="cropbox"/>
16
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfXobject"/>
17
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="pageLayout"/>
18
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfRectangle"/>
19
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFont"/>
20
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfParagraph"/>
21
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFigure"/>
22
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfCharacter"/>
23
+ <xs:element ref="baseOperations"/>
24
+ </xs:sequence>
25
+ <xs:attribute name="pageNumber" use="required" type="xs:int"/>
26
+ <xs:attribute name="Unit" use="required" type="xs:string"/>
27
+ </xs:complexType>
28
+ </xs:element>
29
<!-- mediabox: a wrapper element whose only content is exactly one box element; it has no attributes. -->
+ <xs:element name="mediabox">
30
+ <xs:complexType>
31
+ <xs:sequence>
32
+ <xs:element ref="box"/>
33
+ </xs:sequence>
34
+ </xs:complexType>
35
+ </xs:element>
36
<!-- cropbox: a wrapper element whose only content is exactly one box element; structurally identical to mediabox. -->
+ <xs:element name="cropbox">
37
+ <xs:complexType>
38
+ <xs:sequence>
39
+ <xs:element ref="box"/>
40
+ </xs:sequence>
41
+ </xs:complexType>
42
+ </xs:element>
43
<!-- baseOperations: a simple element whose content is a plain xs:string; referenced by both page and pdfXobject as their last child. -->
+ <xs:element name="baseOperations" type="xs:string"/>
44
<!-- box: an empty element described entirely by four required float attributes: "x", "y", "x2" and "y2". -->
+ <xs:element name="box">
45
+ <xs:complexType>
46
+ <xs:attribute name="x" use="required" type="xs:float"/>
47
+ <xs:attribute name="y" use="required" type="xs:float"/>
48
+ <xs:attribute name="x2" use="required" type="xs:float"/>
49
+ <xs:attribute name="y2" use="required" type="xs:float"/>
50
+ </xs:complexType>
51
+ </xs:element>
52
<!-- PDFXrefId: a named alias for xs:int; the restriction adds no facets, so any xs:int value is accepted. Used as the type of the "xrefId" attributes on pdfFont and pdfXobject. -->
+ <xs:simpleType name="PDFXrefId">
53
+ <xs:restriction base="xs:int"/>
54
+ </xs:simpleType>
55
<!-- pdfFont: an empty element. Required attributes: "name" and "fontId" (strings), "xrefId" (PDFXrefId) and "encodingLength" (int). Optional attributes: the booleans "bold", "italic", "monospace" and "serif", and the floats "ascent" and "descent". -->
+ <xs:element name="pdfFont">
56
+ <xs:complexType>
57
+ <xs:attribute name="name" use="required" type="xs:string"/>
58
+ <xs:attribute name="fontId" use="required" type="xs:string"/>
59
+ <xs:attribute name="xrefId" use="required" type="PDFXrefId"/>
60
+ <xs:attribute name="encodingLength" use="required" type="xs:int"/>
61
+ <xs:attribute name="bold" type="xs:boolean"/>
62
+ <xs:attribute name="italic" type="xs:boolean"/>
63
+ <xs:attribute name="monospace" type="xs:boolean"/>
64
+ <xs:attribute name="serif" type="xs:boolean"/>
65
+ <xs:attribute name="ascent" type="xs:float"/>
66
+ <xs:attribute name="descent" type="xs:float"/>
67
+ </xs:complexType>
68
+ </xs:element>
69
<!-- pdfXobject: children in fixed order are one box, zero or more pdfFont elements, then one baseOperations. Both attributes are required: "xobjId" (int) and "xrefId" (PDFXrefId). -->
+ <xs:element name="pdfXobject">
70
+ <xs:complexType>
71
+ <xs:sequence>
72
+ <xs:element ref="box"/>
73
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFont"/>
74
+ <xs:element ref="baseOperations"/>
75
+ </xs:sequence>
76
+ <xs:attribute name="xobjId" use="required" type="xs:int"/>
77
+ <xs:attribute name="xrefId" use="required" type="PDFXrefId"/>
78
+ </xs:complexType>
79
+ </xs:element>
80
<!-- pdfCharacter: children are one pdfStyle followed by one box; note the style comes first here, whereas pdfParagraph and pdfSameStyleCharacters put box first. Only "char_unicode" (string) is required; "vertical", "scale", "pdfCharacterId", "advance", "xobjId" and "debug_info" are optional. -->
+ <xs:element name="pdfCharacter">
81
+ <xs:complexType>
82
+ <xs:sequence>
83
+ <xs:element ref="pdfStyle"/>
84
+ <xs:element ref="box"/>
85
+ </xs:sequence>
86
+ <xs:attribute name="vertical" type="xs:boolean"/>
87
+ <xs:attribute name="scale" type="xs:float"/>
88
+ <xs:attribute name="pdfCharacterId" type="xs:int"/>
89
+ <xs:attribute name="char_unicode" use="required" type="xs:string"/>
90
+ <xs:attribute name="advance" type="xs:float"/>
91
+ <xs:attribute name="xobjId" type="xs:int"/>
92
+ <xs:attribute name="debug_info" type="xs:boolean"/>
93
+ </xs:complexType>
94
+ </xs:element>
95
<!-- pageLayout: contains exactly one box and three required attributes: "id" (int), "conf" (float) and "class_name" (string). -->
+ <xs:element name="pageLayout">
96
+ <xs:complexType>
97
+ <xs:sequence>
98
+ <xs:element ref="box"/>
99
+ </xs:sequence>
100
+ <xs:attribute name="id" use="required" type="xs:int"/>
101
+ <xs:attribute name="conf" use="required" type="xs:float"/>
102
+ <xs:attribute name="class_name" use="required" type="xs:string"/>
103
+ </xs:complexType>
104
+ </xs:element>
105
<!-- graphicState: an empty element in which every attribute is optional. "dash", "ncolor" and "scolor" share one anonymous type: an xs:list of xs:float restricted by minLength 1, so when present each must hold at least one float. The remaining attributes are plain floats ("linewidth", "flatness", "miterlimit"), ints ("linecap", "linejoin") or strings ("intent", "strokingColorSpaceName", "nonStrokingColorSpaceName", "passthroughPerCharInstruction"). -->
+ <xs:element name="graphicState">
106
+ <xs:complexType>
107
+ <xs:attribute name="linewidth" type="xs:float"/>
108
+ <xs:attribute name="dash">
109
+ <xs:simpleType>
110
+ <xs:restriction>
111
+ <xs:simpleType>
112
+ <xs:list itemType="xs:float"/>
113
+ </xs:simpleType>
114
+ <xs:minLength value="1"/>
115
+ </xs:restriction>
116
+ </xs:simpleType>
117
+ </xs:attribute>
118
+ <xs:attribute name="flatness" type="xs:float"/>
119
+ <xs:attribute name="intent" type="xs:string"/>
120
+ <xs:attribute name="linecap" type="xs:int"/>
121
+ <xs:attribute name="linejoin" type="xs:int"/>
122
+ <xs:attribute name="miterlimit" type="xs:float"/>
123
+ <xs:attribute name="ncolor">
124
+ <xs:simpleType>
125
+ <xs:restriction>
126
+ <xs:simpleType>
127
+ <xs:list itemType="xs:float"/>
128
+ </xs:simpleType>
129
+ <xs:minLength value="1"/>
130
+ </xs:restriction>
131
+ </xs:simpleType>
132
+ </xs:attribute>
133
+ <xs:attribute name="scolor">
134
+ <xs:simpleType>
135
+ <xs:restriction>
136
+ <xs:simpleType>
137
+ <xs:list itemType="xs:float"/>
138
+ </xs:simpleType>
139
+ <xs:minLength value="1"/>
140
+ </xs:restriction>
141
+ </xs:simpleType>
142
+ </xs:attribute>
143
+ <xs:attribute name="strokingColorSpaceName" type="xs:string"/>
144
+ <xs:attribute name="nonStrokingColorSpaceName" type="xs:string"/>
145
+ <xs:attribute name="passthroughPerCharInstruction" type="xs:string"/>
146
+ </xs:complexType>
147
+ </xs:element>
148
<!-- pdfStyle: contains exactly one graphicState child and two required attributes: "font_id" (string) and "font_size" (float). -->
+ <xs:element name="pdfStyle">
149
+ <xs:complexType>
150
+ <xs:sequence>
151
+ <xs:element ref="graphicState"/>
152
+ </xs:sequence>
153
+ <xs:attribute name="font_id" use="required" type="xs:string"/>
154
+ <xs:attribute name="font_size" use="required" type="xs:float"/>
155
+ </xs:complexType>
156
+ </xs:element>
157
<!-- pdfParagraph: children in fixed order are one box, one pdfStyle, then zero or more pdfParagraphComposition elements. Only "unicode" (string) is required; "xobjId", "scale", "vertical", "FirstLineIndent" (boolean, capitalised as written) and "debug_id" are optional. -->
+ <xs:element name="pdfParagraph">
158
+ <xs:complexType>
159
+ <xs:sequence>
160
+ <xs:element ref="box"/>
161
+ <xs:element ref="pdfStyle"/>
162
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfParagraphComposition"/>
163
+ </xs:sequence>
164
+ <xs:attribute name="xobjId" type="xs:int"/>
165
+ <xs:attribute name="unicode" use="required" type="xs:string"/>
166
+ <xs:attribute name="scale" type="xs:float"/>
167
+ <xs:attribute name="vertical" type="xs:boolean"/>
168
+ <xs:attribute name="FirstLineIndent" type="xs:boolean"/>
169
+ <xs:attribute name="debug_id" type="xs:string"/>
170
+ </xs:complexType>
171
+ </xs:element>
172
<!-- pdfParagraphComposition: a wrapper holding exactly one child (xs:choice with default occurrence 1), which is one of pdfLine, pdfFormula, pdfSameStyleCharacters, pdfCharacter or pdfSameStyleUnicodeCharacters. -->
+ <xs:element name="pdfParagraphComposition">
173
+ <xs:complexType>
174
+ <xs:choice>
175
+ <xs:element ref="pdfLine"/>
176
+ <xs:element ref="pdfFormula"/>
177
+ <xs:element ref="pdfSameStyleCharacters"/>
178
+ <xs:element ref="pdfCharacter"/>
179
+ <xs:element ref="pdfSameStyleUnicodeCharacters"/>
180
+ </xs:choice>
181
+ </xs:complexType>
182
+ </xs:element>
183
<!-- pdfLine: one box followed by one or more pdfCharacter elements (minOccurs defaults to 1); no attributes. -->
+ <xs:element name="pdfLine">
184
+ <xs:complexType>
185
+ <xs:sequence>
186
+ <xs:element ref="box"/>
187
+ <xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
188
+ </xs:sequence>
189
+ </xs:complexType>
190
+ </xs:element>
191
<!-- pdfSameStyleCharacters: one box, then one pdfStyle, then one or more pdfCharacter elements, in that order; no attributes. -->
+ <xs:element name="pdfSameStyleCharacters">
192
+ <xs:complexType>
193
+ <xs:sequence>
194
+ <xs:element ref="box"/>
195
+ <xs:element ref="pdfStyle"/>
196
+ <xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
197
+ </xs:sequence>
198
+ </xs:complexType>
199
+ </xs:element>
200
<!-- pdfSameStyleUnicodeCharacters: carries its text in the required "unicode" string attribute; the single pdfStyle child is optional (minOccurs 0), as is the boolean "debug_info" attribute. -->
+ <xs:element name="pdfSameStyleUnicodeCharacters">
201
+ <xs:complexType>
202
+ <xs:sequence>
203
+ <xs:element minOccurs="0" ref="pdfStyle"/>
204
+ </xs:sequence>
205
+ <xs:attribute name="unicode" use="required" type="xs:string"/>
206
+ <xs:attribute name="debug_info" type="xs:boolean"/>
207
+ </xs:complexType>
208
+ </xs:element>
209
<!-- pdfFormula: one box followed by one or more pdfCharacter elements, plus two required float attributes, "x_offset" and "y_offset". -->
+ <xs:element name="pdfFormula">
210
+ <xs:complexType>
211
+ <xs:sequence>
212
+ <xs:element ref="box"/>
213
+ <xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
214
+ </xs:sequence>
215
+ <xs:attribute name="x_offset" use="required" type="xs:float"/>
216
+ <xs:attribute name="y_offset" use="required" type="xs:float"/>
217
+ </xs:complexType>
218
+ </xs:element>
219
<!-- pdfFigure: contains exactly one box and nothing else; no attributes. -->
+ <xs:element name="pdfFigure">
220
+ <xs:complexType>
221
+ <xs:sequence>
222
+ <xs:element ref="box"/>
223
+ </xs:sequence>
224
+ </xs:complexType>
225
+ </xs:element>
226
<!-- pdfRectangle: one box followed by one graphicState, with an optional boolean "debug_info" attribute. -->
+ <xs:element name="pdfRectangle">
227
+ <xs:complexType>
228
+ <xs:sequence>
229
+ <xs:element ref="box"/>
230
+ <xs:element ref="graphicState"/>
231
+ </xs:sequence>
232
+ <xs:attribute name="debug_info" type="xs:boolean"/>
233
+ </xs:complexType>
234
+ </xs:element>
235
+ </xs:schema>
src/pdf2u/document_il/midend/__init__.py ADDED
File without changes
src/pdf2u/document_il/midend/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (180 Bytes). View file