Upload 108 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- Dockerfile +50 -0
- pyproject.toml +262 -0
- src/pdf2u/__init__.py +1 -0
- src/pdf2u/__pycache__/__init__.cpython-311.pyc +0 -0
- src/pdf2u/__pycache__/__init__.cpython-312.pyc +0 -0
- src/pdf2u/__pycache__/const.cpython-311.pyc +0 -0
- src/pdf2u/__pycache__/const.cpython-312.pyc +0 -0
- src/pdf2u/__pycache__/converter.cpython-311.pyc +0 -0
- src/pdf2u/__pycache__/converter.cpython-312.pyc +0 -0
- src/pdf2u/__pycache__/high_level.cpython-311.pyc +0 -0
- src/pdf2u/__pycache__/high_level.cpython-312.pyc +0 -0
- src/pdf2u/__pycache__/io.cpython-312.pyc +0 -0
- src/pdf2u/__pycache__/main.cpython-311.pyc +0 -0
- src/pdf2u/__pycache__/main.cpython-312.pyc +0 -0
- src/pdf2u/__pycache__/pdfinterp.cpython-311.pyc +0 -0
- src/pdf2u/__pycache__/pdfinterp.cpython-312.pyc +0 -0
- src/pdf2u/__pycache__/progress_monitor.cpython-311.pyc +0 -0
- src/pdf2u/__pycache__/progress_monitor.cpython-312.pyc +0 -0
- src/pdf2u/__pycache__/translation_config.cpython-311.pyc +0 -0
- src/pdf2u/__pycache__/translation_config.cpython-312.pyc +0 -0
- src/pdf2u/asynchronize/__init__.py +51 -0
- src/pdf2u/asynchronize/__pycache__/__init__.cpython-311.pyc +0 -0
- src/pdf2u/asynchronize/__pycache__/__init__.cpython-312.pyc +0 -0
- src/pdf2u/const.py +14 -0
- src/pdf2u/converter.py +493 -0
- src/pdf2u/document_il/__init__.py +45 -0
- src/pdf2u/document_il/__pycache__/__init__.cpython-311.pyc +0 -0
- src/pdf2u/document_il/__pycache__/__init__.cpython-312.pyc +0 -0
- src/pdf2u/document_il/__pycache__/il_version_1.cpython-311.pyc +0 -0
- src/pdf2u/document_il/__pycache__/il_version_1.cpython-312.pyc +0 -0
- src/pdf2u/document_il/__pycache__/xml_converter.cpython-311.pyc +0 -0
- src/pdf2u/document_il/__pycache__/xml_converter.cpython-312.pyc +0 -0
- src/pdf2u/document_il/backend/__init__.py +0 -0
- src/pdf2u/document_il/backend/__pycache__/__init__.cpython-311.pyc +0 -0
- src/pdf2u/document_il/backend/__pycache__/__init__.cpython-312.pyc +0 -0
- src/pdf2u/document_il/backend/__pycache__/pdf_creater.cpython-311.pyc +0 -0
- src/pdf2u/document_il/backend/__pycache__/pdf_creater.cpython-312.pyc +0 -0
- src/pdf2u/document_il/backend/pdf_creater.py +405 -0
- src/pdf2u/document_il/frontend/__init__.py +0 -0
- src/pdf2u/document_il/frontend/__pycache__/__init__.cpython-311.pyc +0 -0
- src/pdf2u/document_il/frontend/__pycache__/__init__.cpython-312.pyc +0 -0
- src/pdf2u/document_il/frontend/__pycache__/il_creater.cpython-311.pyc +0 -0
- src/pdf2u/document_il/frontend/__pycache__/il_creater.cpython-312.pyc +0 -0
- src/pdf2u/document_il/frontend/il_creater.py +328 -0
- src/pdf2u/document_il/il_version_1.py +396 -0
- src/pdf2u/document_il/il_version_1.rnc +141 -0
- src/pdf2u/document_il/il_version_1.rng +390 -0
- src/pdf2u/document_il/il_version_1.xsd +235 -0
- src/pdf2u/document_il/midend/__init__.py +0 -0
- src/pdf2u/document_il/midend/__pycache__/__init__.cpython-311.pyc +0 -0
Dockerfile
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Ref: https://github.com/fastapi/full-stack-fastapi-template/blob/master/backend/Dockerfile
|
2 |
+
FROM python:3.12-slim-bookworm
|
3 |
+
|
4 |
+
# Print logs immediately
|
5 |
+
# Ref: https://docs.python.org/3/using/cmdline.html#envvar-PYTHONUNBUFFERED
|
6 |
+
ENV PYTHONUNBUFFERED=1
|
7 |
+
|
8 |
+
# Install system dependencies including OpenGL libraries
|
9 |
+
RUN apt-get update && apt-get install -y \
|
10 |
+
libgl1-mesa-glx \
|
11 |
+
libglib2.0-0 \
|
12 |
+
&& rm -rf /var/lib/apt/lists/*
|
13 |
+
|
14 |
+
# Change the working directory to the `app` directory
|
15 |
+
WORKDIR /app
|
16 |
+
|
17 |
+
# Install uv
|
18 |
+
# Ref: https://docs.astral.sh/uv/guides/integration/docker/#installing-uv
|
19 |
+
COPY --from=ghcr.io/astral-sh/uv:0.5.18 /uv /uvx /bin/
|
20 |
+
|
21 |
+
# Place executables in the environment at the front of the path
|
22 |
+
# Ref: https://docs.astral.sh/uv/guides/integration/docker/#using-the-environment
|
23 |
+
ENV PATH="/app/.venv/bin:$PATH"
|
24 |
+
|
25 |
+
# Compile bytecode to speed up the startup time
|
26 |
+
# Ref: https://docs.astral.sh/uv/guides/integration/docker/#compiling-bytecode
|
27 |
+
ENV UV_COMPILE_BYTECODE=1
|
28 |
+
|
29 |
+
# uv Cache
|
30 |
+
# Ref: https://docs.astral.sh/uv/guides/integration/docker/#caching
|
31 |
+
ENV UV_LINK_MODE=copy
|
32 |
+
|
33 |
+
# Install dependencies
|
34 |
+
# Ref: https://docs.astral.sh/uv/guides/integration/docker/#intermediate-layers
|
35 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
36 |
+
--mount=type=bind,source=uv.lock,target=uv.lock \
|
37 |
+
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
|
38 |
+
uv sync --frozen --no-install-project
|
39 |
+
|
40 |
+
# Copy the project into the image
|
41 |
+
COPY . .
|
42 |
+
|
43 |
+
# Sync the project
|
44 |
+
# Ref: https://docs.astral.sh/uv/guides/integration/docker/#intermediate-layers
|
45 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
46 |
+
uv sync --all-extras
|
47 |
+
|
48 |
+
EXPOSE 8501
|
49 |
+
# Set the default command
|
50 |
+
CMD ["streamlit", "run", "src/pdf2u/gui.py"]
|
pyproject.toml
ADDED
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[build-system]
|
2 |
+
requires = ["hatchling"]
|
3 |
+
build-backend = "hatchling.build"
|
4 |
+
|
5 |
+
[tool.hatch.version]
|
6 |
+
path = "src/pdf2u/__init__.py"
|
7 |
+
# FROM: https://hatch.pypa.io/latest/version/
|
8 |
+
|
9 |
+
[tool.hatch.build.targets.wheel]
|
10 |
+
packages = ["src/pdf2u"]
|
11 |
+
# FROM: https://hatch.pypa.io/latest/build/
|
12 |
+
|
13 |
+
[project]
|
14 |
+
name = "pdf2u"
|
15 |
+
version = "0.0.4"
|
16 |
+
description = "Yet Another Document Translator"
|
17 |
+
classifiers = [
|
18 |
+
"License :: OSI Approved :: MIT License",
|
19 |
+
"Programming Language :: Python",
|
20 |
+
"Programming Language :: Python :: 3 :: Only",
|
21 |
+
"Programming Language :: Python :: 3.10",
|
22 |
+
"Programming Language :: Python :: 3.11",
|
23 |
+
"Programming Language :: Python :: 3.11",
|
24 |
+
"Programming Language :: Python :: 3.12",
|
25 |
+
"Programming Language :: Python :: 3.13",
|
26 |
+
] # FROM: https://pypi.org/classifiers/
|
27 |
+
readme = "README.md"
|
28 |
+
requires-python = ">=3.10,<3.13"
|
29 |
+
license = { file = "LICENSE" }
|
30 |
+
authors = [{ name = "A.J.Zeller", email = "[email protected]" }]
|
31 |
+
maintainers = [{ name = "A.J.Zeller", email = "[email protected]" }]
|
32 |
+
# dynamic = ["version"] # https://hatch.pypa.io/latest/config/metadata/#version
|
33 |
+
dependencies = [
|
34 |
+
"bitstring>=4.3.0",
|
35 |
+
"configargparse>=1.7",
|
36 |
+
"httpx[socks]>=0.27.0",
|
37 |
+
"huggingface-hub>=0.27.0",
|
38 |
+
"numpy>=2.0.2",
|
39 |
+
"onnx>=1.17.0",
|
40 |
+
"onnxruntime>=1.16.1",
|
41 |
+
"openai>=1.59.3",
|
42 |
+
"opencv-python>=4.10.0.84",
|
43 |
+
"orjson>=3.10.14",
|
44 |
+
"pdfminer-six>=20240706",
|
45 |
+
"peewee>=3.17.8",
|
46 |
+
"rich>=13.9.4",
|
47 |
+
"toml>=0.10.2",
|
48 |
+
"tqdm>=4.67.1",
|
49 |
+
"xsdata[cli,lxml,soap]>=24.12",
|
50 |
+
"msgpack>=1.1.0",
|
51 |
+
"typer>=0.15.1",
|
52 |
+
"pymupdf==1.24.5",
|
53 |
+
]
|
54 |
+
|
55 |
+
[project.urls]
|
56 |
+
Homepage = "https://github.com/atticuszeller/pdf2u"
|
57 |
+
Issues = "https://github.com/atticuszeller/pdf2u/issues"
|
58 |
+
|
59 |
+
[project.scripts] # build-backend config needed
|
60 |
+
pdf2u = "pdf2u.main:app"
|
61 |
+
# FROM: https://packaging.python.org/en/latest/guides/writing-pyproject-toml/
|
62 |
+
|
63 |
+
[project.optional-dependencies]
|
64 |
+
gui = ["pypdf2>=3.0.1", "streamlit>=1.42.2", "streamlit-pdf-viewer>=0.0.21"]
|
65 |
+
# optional deps for package installation
|
66 |
+
|
67 |
+
[dependency-groups]
|
68 |
+
dev = [
|
69 |
+
"ruff>=0.6.3",
|
70 |
+
"mypy>=1.11.2",
|
71 |
+
"pre-commit>=3.8.0",
|
72 |
+
"pytest>=8.3.2",
|
73 |
+
"pytest-sugar>=1.0.0",
|
74 |
+
"coverage>=7.6.1",
|
75 |
+
"git-cliff>=2.6.1",
|
76 |
+
"bump-my-version>=0.28.0",
|
77 |
+
"typos>=1.26.8",
|
78 |
+
"fonttools>=4.56.0",
|
79 |
+
]
|
80 |
+
|
81 |
+
## Test
|
82 |
+
[tool.mypy]
|
83 |
+
strict = true
|
84 |
+
exclude = ["venv", ".venv"]
|
85 |
+
|
86 |
+
[tool.pytest.ini_options]
|
87 |
+
# Set additional command line options for pytest
|
88 |
+
# Ref: https://docs.pytest.org/en/stable/reference/reference.html#command-line-flags
|
89 |
+
addopts = "-rXs --strict-config --strict-markers --tb=long"
|
90 |
+
xfail_strict = true # Treat tests that are marked as xfail but pass as test failures
|
91 |
+
filterwarnings = ["error"] # Treat all warnings as errors
|
92 |
+
pythonpath = "src/pdf2u/"
|
93 |
+
|
94 |
+
[tool.coverage.run]
|
95 |
+
branch = true
|
96 |
+
|
97 |
+
[tool.coverage.report]
|
98 |
+
skip_covered = true
|
99 |
+
show_missing = true
|
100 |
+
precision = 2
|
101 |
+
exclude_lines = [
|
102 |
+
'def __repr__',
|
103 |
+
'pragma: no cover',
|
104 |
+
'raise NotImplementedError',
|
105 |
+
'if TYPE_CHECKING:',
|
106 |
+
'if typing.TYPE_CHECKING:',
|
107 |
+
'@overload',
|
108 |
+
'@typing.overload',
|
109 |
+
'\(Protocol\):$',
|
110 |
+
'typing.assert_never',
|
111 |
+
'assert_never',
|
112 |
+
'if __name__ == .__main__.:',
|
113 |
+
]
|
114 |
+
|
115 |
+
## Linter and formatter
|
116 |
+
[tool.ruff]
|
117 |
+
# cover and extend the default config in https://docs.astral.sh/ruff/configuration/
|
118 |
+
extend-exclude = [""]
|
119 |
+
target-version = "py310"
|
120 |
+
|
121 |
+
[tool.ruff.lint]
|
122 |
+
select = [
|
123 |
+
"E", # pycodestyle errors
|
124 |
+
"W", # pycodestyle warnings
|
125 |
+
"F", # pyflakes
|
126 |
+
"I", # isort
|
127 |
+
"B", # flake8-bugbear
|
128 |
+
"C4", # flake8-comprehensions
|
129 |
+
"UP", # pyupgrade
|
130 |
+
"ARG001", # unused arguments in functions
|
131 |
+
]
|
132 |
+
|
133 |
+
isort = { combine-as-imports = true, split-on-trailing-comma = false }
|
134 |
+
|
135 |
+
# Avoid trying to fix flake8-bugbear (`B`) violations.
|
136 |
+
unfixable = ["B"]
|
137 |
+
|
138 |
+
[tool.ruff.format]
|
139 |
+
docstring-code-format = true
|
140 |
+
skip-magic-trailing-comma = true
|
141 |
+
|
142 |
+
# Reference
|
143 |
+
# 1. https://github.com/Kludex/python-template/blob/main/template/%7B%7B%20project_slug%20%7D%7D/pyproject.toml.jinja
|
144 |
+
# 2. https://github.com/fastapi/full-stack-fastapi-template/blob/master/backend/pyproject.toml
|
145 |
+
# 3. https://github.com/pydantic/logfire
|
146 |
+
# 4. https://coverage.readthedocs.io/en/latest/index.html
|
147 |
+
|
148 |
+
## VCS
|
149 |
+
[tool.git-cliff.remote.github]
|
150 |
+
owner = "atticuszeller"
|
151 |
+
repo = "python-uv-package"
|
152 |
+
|
153 |
+
[tool.git-cliff.changelog]
|
154 |
+
# template for the changelog header
|
155 |
+
header = """
|
156 |
+
# Changelog\n
|
157 |
+
All notable changes to this project will be documented in this file.\n
|
158 |
+
"""
|
159 |
+
# template for the changelog body
|
160 |
+
# https://keats.github.io/tera/docs/#introduction
|
161 |
+
body = """
|
162 |
+
{% if version %}\
|
163 |
+
## {{ version | trim_start_matches(pat="v") }} - {{ timestamp | date(format="%Y-%m-%d") }}
|
164 |
+
{% else %}\
|
165 |
+
## unreleased
|
166 |
+
{% endif %}\
|
167 |
+
{% for group, commits in commits | group_by(attribute="group") %}
|
168 |
+
### {{ group | striptags | trim | upper_first }}
|
169 |
+
{% for commit in commits| unique(attribute="message") %}
|
170 |
+
- {% if commit.scope %}*({{ commit.scope }})* {% endif %}\
|
171 |
+
{% if commit.breaking %}[**breaking**] {% endif %}\
|
172 |
+
{{ commit.message | upper_first }}\
|
173 |
+
{% if commit.remote.pr_number %} in #{{ commit.remote.pr_number }}{%- endif %}\
|
174 |
+
{% endfor %}
|
175 |
+
{% endfor %}\n
|
176 |
+
"""
|
177 |
+
# template for the changelog footer
|
178 |
+
footer = """
|
179 |
+
<!-- generated by git-cliff -->
|
180 |
+
"""
|
181 |
+
# remove leading and trailing whitespace
|
182 |
+
trim = true
|
183 |
+
# postprocessors
|
184 |
+
# postprocessors = [
|
185 |
+
# { pattern = '<REPO>', replace = "https://github.com/atticuszeller/python-uv" }, # replace repository URL
|
186 |
+
# ]
|
187 |
+
# render body even when there are no releases to process
|
188 |
+
render_always = true
|
189 |
+
# output file path
|
190 |
+
output = "CHANGELOG.md"
|
191 |
+
|
192 |
+
[tool.git-cliff.git]
|
193 |
+
# parse the commits based on https://www.conventionalcommits.org
|
194 |
+
conventional_commits = true
|
195 |
+
# filter out the commits that are not conventional
|
196 |
+
filter_unconventional = true
|
197 |
+
# process each line of a commit as an individual commit
|
198 |
+
split_commits = false
|
199 |
+
# regex for preprocessing the commit messages
|
200 |
+
commit_preprocessors = [
|
201 |
+
# If the spelling is incorrect, it will be automatically fixed.
|
202 |
+
{ pattern = '.*', replace_command = 'typos --write-changes -' },
|
203 |
+
]
|
204 |
+
# regex for parsing and grouping commits
|
205 |
+
commit_parsers = [
|
206 |
+
{ message = "^feat", group = "<!-- 0 -->🚀 Features" },
|
207 |
+
{ message = "^fix", group = "<!-- 1 -->🐛 Bug Fixes" },
|
208 |
+
{ message = "^doc", group = "<!-- 3 -->📚 Documentation" },
|
209 |
+
{ message = "^perf", group = "<!-- 4 -->⚡ Performance" },
|
210 |
+
{ message = "^refactor", group = "<!-- 2 -->🚜 Refactor" },
|
211 |
+
{ message = "^style", group = "<!-- 5 -->🎨 Styling" },
|
212 |
+
{ message = "^test", group = "<!-- 6 -->🧪 Testing" },
|
213 |
+
{ message = "^chore\\(release\\)", skip = true },
|
214 |
+
{ message = "^chore\\(deps.*\\)", skip = true },
|
215 |
+
{ message = "^chore\\(pr\\)", skip = true },
|
216 |
+
{ message = "^chore\\(pull\\)", skip = true },
|
217 |
+
{ message = "^chore|^ci", group = "<!-- 7 -->⚙️ Miscellaneous Tasks" },
|
218 |
+
{ body = ".*security", group = "<!-- 8 -->🛡️ Security" },
|
219 |
+
{ message = "^revert", group = "<!-- 9 -->◀️ Revert" },
|
220 |
+
]
|
221 |
+
# filter out the commits that are not matched by commit parsers
|
222 |
+
filter_commits = false
|
223 |
+
# sort the tags topologically
|
224 |
+
topo_order = false
|
225 |
+
# sort the commits inside sections by oldest/newest order
|
226 |
+
sort_commits = "oldest"
|
227 |
+
|
228 |
+
[tool.bumpversion]
|
229 |
+
current_version = "0.0.4"
|
230 |
+
parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)"
|
231 |
+
serialize = ["{major}.{minor}.{patch}"]
|
232 |
+
search = "{current_version}"
|
233 |
+
replace = "{new_version}"
|
234 |
+
regex = false
|
235 |
+
ignore_missing_version = false
|
236 |
+
ignore_missing_files = false
|
237 |
+
tag = true
|
238 |
+
sign_tags = false
|
239 |
+
tag_name = "v{new_version}"
|
240 |
+
tag_message = "chore(release): {current_version} → {new_version}"
|
241 |
+
allow_dirty = true # git-cliff first then bump patch
|
242 |
+
commit = true
|
243 |
+
message = "chore(release): {current_version} → {new_version}"
|
244 |
+
commit_args = ""
|
245 |
+
setup_hooks = []
|
246 |
+
pre_commit_hooks = []
|
247 |
+
post_commit_hooks = []
|
248 |
+
|
249 |
+
[[tool.bumpversion.files]]
|
250 |
+
filename = "src/pdf2u/__init__.py"
|
251 |
+
|
252 |
+
[[tool.bumpversion.files]]
|
253 |
+
filename = "pyproject.toml"
|
254 |
+
search = "version = \"{current_version}\""
|
255 |
+
replace = "version = \"{new_version}\""
|
256 |
+
|
257 |
+
[[tool.bumpversion.files]]
|
258 |
+
filename = "CHANGELOG.md"
|
259 |
+
search = "unreleased"
|
260 |
+
replace = "{new_version} - {now:%Y-%m-%d}"
|
261 |
+
|
262 |
+
# https://callowayproject.github.io/bump-my-version/reference/search-and-replace-config/
|
src/pdf2u/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
__version__ = "0.0.4"
|
src/pdf2u/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (181 Bytes). View file
|
|
src/pdf2u/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (175 Bytes). View file
|
|
src/pdf2u/__pycache__/const.cpython-311.pyc
ADDED
Binary file (519 Bytes). View file
|
|
src/pdf2u/__pycache__/const.cpython-312.pyc
ADDED
Binary file (855 Bytes). View file
|
|
src/pdf2u/__pycache__/converter.cpython-311.pyc
ADDED
Binary file (13.8 kB). View file
|
|
src/pdf2u/__pycache__/converter.cpython-312.pyc
ADDED
Binary file (12.7 kB). View file
|
|
src/pdf2u/__pycache__/high_level.cpython-311.pyc
ADDED
Binary file (21.2 kB). View file
|
|
src/pdf2u/__pycache__/high_level.cpython-312.pyc
ADDED
Binary file (18.6 kB). View file
|
|
src/pdf2u/__pycache__/io.cpython-312.pyc
ADDED
Binary file (583 Bytes). View file
|
|
src/pdf2u/__pycache__/main.cpython-311.pyc
ADDED
Binary file (13.3 kB). View file
|
|
src/pdf2u/__pycache__/main.cpython-312.pyc
ADDED
Binary file (13.4 kB). View file
|
|
src/pdf2u/__pycache__/pdfinterp.cpython-311.pyc
ADDED
Binary file (23.7 kB). View file
|
|
src/pdf2u/__pycache__/pdfinterp.cpython-312.pyc
ADDED
Binary file (21.5 kB). View file
|
|
src/pdf2u/__pycache__/progress_monitor.cpython-311.pyc
ADDED
Binary file (9.5 kB). View file
|
|
src/pdf2u/__pycache__/progress_monitor.cpython-312.pyc
ADDED
Binary file (8.69 kB). View file
|
|
src/pdf2u/__pycache__/translation_config.cpython-311.pyc
ADDED
Binary file (8.22 kB). View file
|
|
src/pdf2u/__pycache__/translation_config.cpython-312.pyc
ADDED
Binary file (7.45 kB). View file
|
|
src/pdf2u/asynchronize/__init__.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
import time
|
3 |
+
|
4 |
+
|
5 |
+
class Args:
    """Bundle the positional and keyword arguments of a single callback call."""

    def __init__(self, args, kwargs):
        # Store both halves of the call unchanged so a consumer can read them back.
        self.args, self.kwargs = args, kwargs
|
9 |
+
|
10 |
+
|
11 |
+
class AsyncCallback:
    """Expose callbacks fired by synchronous code as an awaitable / async iterator.

    ``step_callback`` and ``finished_callback`` wrap their arguments in an
    ``Args`` object and hand it to the event loop with
    ``call_soon_threadsafe``. The items can then be consumed with
    ``async for item in callback`` or with a single ``await callback``.
    """

    def __init__(self):
        self.queue = asyncio.Queue()
        # Flipped to True on the event-loop thread, in the same loop callback
        # that enqueues the final item (see _enqueue_final).
        self.finished = False
        # Set right away by finished_callback so repeated calls are ignored even
        # before the loop has processed the final item.
        self._finish_requested = False
        try:
            # Preferred: bind to the loop that is currently running.
            self.loop = asyncio.get_running_loop()
        except RuntimeError:
            # No running loop: keep the previous behaviour of falling back to
            # the current policy's event loop.
            self.loop = asyncio.get_event_loop()

    def step_callback(self, *args, **kwargs):
        """Queue one intermediate item; iteration continues afterwards."""
        item = Args(args, kwargs)

        # We have to use the threadsafe call so that it wakes up the event loop, in case it's sleeping:
        # https://stackoverflow.com/a/49912853/2148718
        self.loop.call_soon_threadsafe(self.queue.put_nowait, item)

        # Add a small delay to release the GIL, ensuring the event loop has time to process messages
        time.sleep(0.01)

    def finished_callback(self, *args, **kwargs):
        """Queue the final item and mark the stream as finished.

        Only the first call has an effect. The item is enqueued and
        ``finished`` is set inside one event-loop callback, so a consumer can
        never observe the final item without also observing ``finished``.
        Setting the flag separately from the calling thread left a window in
        which ``__anext__`` could block forever, or stop before queued items
        were delivered.
        """
        if self.finished or self._finish_requested:
            return
        self._finish_requested = True
        item = Args(args, kwargs)
        self.loop.call_soon_threadsafe(self._enqueue_final, item)

        # Same pause as step_callback so the event loop gets a chance to run.
        time.sleep(0.01)

    def _enqueue_final(self, item):
        """Run on the event loop: enqueue the last item, then flag completion."""
        self.queue.put_nowait(item)
        self.finished = True

    def __await__(self):
        # Awaiting the object itself waits for (and returns) the next queued item.
        return self.queue.get().__await__()

    def __aiter__(self):
        # Since this implements __anext__, this can return itself
        return self

    async def __anext__(self):
        # Stop only once the stream is finished AND the queue has been drained,
        # so items still queued at finish time are delivered first.
        if self.finished and self.queue.empty():
            raise StopAsyncIteration

        return await self.queue.get()
|
src/pdf2u/asynchronize/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (2.69 kB). View file
|
|
src/pdf2u/asynchronize/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (2.5 kB). View file
|
|
src/pdf2u/const.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from enum import StrEnum
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
CACHE_FOLDER = Path.home() / ".cache" / "pdf2u"
|
5 |
+
|
6 |
+
|
7 |
+
def get_cache_file_path(filename: str) -> Path:
    """Return the path of ``filename`` inside ``CACHE_FOLDER``."""
    return CACHE_FOLDER.joinpath(filename)
|
9 |
+
|
10 |
+
|
11 |
+
class TranslationService(StrEnum):
|
12 |
+
OPENAI: str = "openai"
|
13 |
+
GOOGLE: str = "google"
|
14 |
+
BING: str = "bing"
|
src/pdf2u/converter.py
ADDED
@@ -0,0 +1,493 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
import logging
|
3 |
+
import re
|
4 |
+
import unicodedata
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
from pdfminer.converter import PDFConverter
|
8 |
+
from pdfminer.layout import LTChar, LTComponent, LTFigure, LTLine, LTPage, LTText
|
9 |
+
from pdfminer.pdfcolor import PDFColorSpace
|
10 |
+
from pdfminer.pdffont import PDFCIDFont, PDFFont, PDFUnicodeNotDefined
|
11 |
+
from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
|
12 |
+
from pdfminer.utils import Matrix, apply_matrix_pt, bbox2str, matrix2str, mult_matrix
|
13 |
+
from pymupdf import Font
|
14 |
+
|
15 |
+
from pdf2u.document_il.frontend.il_creater import ILCreater
|
16 |
+
|
17 |
+
log = logging.getLogger(__name__)
|
18 |
+
|
19 |
+
|
20 |
+
class PDFConverterEx(PDFConverter):
    """PDFConverter subclass that reports pages and glyphs to an ``ILCreater``.

    Page geometry and page numbers are forwarded to ``il_creater``, and every
    rendered character is wrapped in an ``AWLTChar`` carrying the current
    xobject id and font id.
    """

    def __init__(
        self, rsrcmgr: PDFResourceManager, il_creater: ILCreater | None = None
    ) -> None:
        PDFConverter.__init__(self, rsrcmgr, None, "utf-8", 1, None)
        self.il_creater = il_creater

    def begin_page(self, page, ctm) -> None:
        # Override: replace the page box with the crop box mapped through ctm,
        # re-based at the origin.
        left, bottom, right, top = page.cropbox
        left, bottom = apply_matrix_pt(ctm, (left, bottom))
        right, top = apply_matrix_pt(ctm, (right, top))
        mediabox = (0, 0, abs(left - right), abs(bottom - top))
        self.il_creater.on_page_media_box(*mediabox)
        self.il_creater.on_page_number(page.pageno)
        self.cur_item = LTPage(page.pageno, mediabox)

    def end_page(self, _page) -> None:
        # Override: hand the finished page to receive_layout and pass its result on.
        finished_page = self.cur_item
        return self.receive_layout(finished_page)

    def begin_figure(self, name, bbox, matrix) -> None:
        # Override: the new figure inherits the pageid of the item it is nested in.
        parent = self.cur_item
        self._stack.append(parent)
        figure = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
        self.cur_item = figure
        figure.pageid = parent.pageid

    def end_figure(self, _: str) -> None:
        # Override: close the figure, attach it to its parent and pass on the
        # result of receive_layout.
        fig = self.cur_item
        if not isinstance(fig, LTFigure):
            raise ValueError(f"Unexpected item type: {type(fig)}")
        parent = self._stack.pop()
        self.cur_item = parent
        parent.add(fig)
        return self.receive_layout(fig)

    def render_char(
        self,
        matrix,
        font,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs,
        graphicstate: PDFGraphicState,
    ) -> float:
        # Override: additionally record the cid and the font on the created item.
        try:
            text = font.to_unichr(cid)
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        else:
            if not isinstance(text, str):
                raise TypeError(f"Expected string, got {type(text)}")
        textwidth = font.char_width(cid)
        textdisp = font.char_disp(cid)

        # Font names may be raw bytes; use a UTF-8 decoding when possible and a
        # base64 form otherwise, then look the name up in the page font map.
        font_name = font.fontname
        if isinstance(font_name, bytes):
            raw_name = font_name
            try:
                font_name = raw_name.decode("utf-8")
            except UnicodeDecodeError:
                font_name = "BASE64:" + base64.b64encode(raw_name).decode("utf-8")
        font_id = self.il_creater.current_page_font_name_id_map[font_name]

        item = AWLTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            textwidth,
            textdisp,
            ncs,
            graphicstate,
            self.il_creater.xobj_id,
            font_id,
        )
        self.cur_item.add(item)
        item.cid = cid  # hack: attach the original character code
        item.font = font  # hack: attach the original font object
        return item.adv
|
105 |
+
|
106 |
+
|
107 |
+
class AWLTChar(LTChar):
    """Actual letter in the text as a Unicode string.

    Besides the glyph text, matrix, font name, colour space and graphic state,
    each instance stores the ``xobj_id`` and ``aw_font_id`` it was created with.
    """

    def __init__(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        text: str,
        textwidth: float,
        textdisp: float | tuple[float | None, float],
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
        xobj_id: int,
        font_id: str,
    ) -> None:
        LTText.__init__(self)
        self._text = text
        self.matrix = matrix
        self.fontname = font.fontname
        self.ncs = ncs
        self.graphicstate = graphicstate
        self.xobj_id = xobj_id
        self.adv = textwidth * fontsize * scaling
        self.aw_font_id = font_id

        # Build two opposite corners of the glyph box in text space.
        if font.is_vertical():
            # Vertical writing: textdisp is a (vx, vy) pair in 1/1000 units.
            assert isinstance(textdisp, tuple)
            vx, vy = textdisp
            vx = fontsize * 0.5 if vx is None else vx * fontsize * 0.001
            vy = (1000 - vy) * fontsize * 0.001
            bbox_lower_left = (-vx, vy + rise + self.adv)
            bbox_upper_right = (-vx + fontsize, vy + rise)
        else:
            # Horizontal writing: the box spans descent .. descent + fontsize.
            descent = font.get_descent() * fontsize
            bbox_lower_left = (0, descent + rise)
            bbox_upper_right = (self.adv, descent + rise + fontsize)

        a, b, c, d, _e, _f = self.matrix
        self.upright = a * d * scaling > 0 and b * c <= 0

        # Map both corners through the matrix and order the coordinates.
        x0, y0 = apply_matrix_pt(self.matrix, bbox_lower_left)
        x1, y1 = apply_matrix_pt(self.matrix, bbox_upper_right)
        if x1 < x0:
            x0, x1 = x1, x0
        if y1 < y0:
            y0, y1 = y1, y0
        LTComponent.__init__(self, (x0, y0, x1, y1))

        # Width is used as the size for vertical fonts or when matrix[0] is 0;
        # height otherwise.
        use_width = font.is_vertical() or matrix[0] == 0
        self.size = self.width if use_width else self.height

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)} font={self.fontname!r} adv={self.adv} text={self.get_text()!r}>"

    def get_text(self) -> str:
        """Return the text stored for this character."""
        return self._text
|
171 |
+
|
172 |
+
|
173 |
+
class Paragraph:
    """Plain record of the layout attributes tracked for one paragraph."""

    def __init__(self, y, x, x0, x1, size, brk):
        # Starting position of the paragraph.
        self.x: float = x  # initial horizontal coordinate
        self.y: float = y  # initial vertical coordinate
        # Horizontal extent.
        self.x0: float = x0  # left boundary
        self.x1: float = x1  # right boundary
        # Typography.
        self.size: float = size  # font size
        self.brk: bool = brk  # line-break flag
|
181 |
+
|
182 |
+
|
183 |
+
# fmt: off
|
184 |
+
class TranslateConverter(PDFConverterEx):
|
185 |
+
def __init__(
    self,
    rsrcmgr,
    vfont: str | None = None,
    vchar: str | None = None,
    thread: int = 0,
    layout: dict | None = None,
    lang_in: str = "",  # kept in the signature, not used here
    _lang_out: str = "",  # unused parameter
    _service: str = "",  # unused parameter
    resfont: str = "",
    noto: Font | None = None,
    envs: dict | None = None,
    _prompt: list | None = None,  # unused parameter
    il_creater: ILCreater | None = None,
):
    """Store the conversion settings and initialise the base converter.

    Only ``vfont``, ``vchar``, ``thread``, ``layout``, ``resfont`` and
    ``noto`` are kept on the instance; ``rsrcmgr`` and ``il_creater`` are
    passed to the parent initialiser. A falsy ``layout`` is replaced by an
    empty dict.
    """
    super().__init__(rsrcmgr, il_creater)
    self.vfont = vfont
    self.vchar = vchar
    self.thread = thread
    self.layout = layout or {}
    self.resfont = resfont
    self.noto = noto
|
209 |
+
|
210 |
+
def receive_layout(self, ltpage: LTPage):
|
211 |
+
# 段落
|
212 |
+
sstk: list[str] = [] # 段落文字栈
|
213 |
+
pstk: list[Paragraph] = [] # 段落属性栈
|
214 |
+
vbkt: int = 0 # 段落公式括号计数
|
215 |
+
# 公式组
|
216 |
+
vstk: list[LTChar] = [] # 公式符号组
|
217 |
+
vlstk: list[LTLine] = [] # 公式线条组
|
218 |
+
vfix: float = 0 # 公式纵向偏移
|
219 |
+
# 公式组栈
|
220 |
+
var: list[list[LTChar]] = [] # 公式符号组栈
|
221 |
+
varl: list[list[LTLine]] = [] # 公式线条组栈
|
222 |
+
varf: list[float] = [] # 公式纵向偏移栈
|
223 |
+
vlen: list[float] = [] # 公式宽度栈
|
224 |
+
# 全局
|
225 |
+
lstk: list[LTLine] = [] # 全局线条栈
|
226 |
+
xt: LTChar = None # 上一个字符
|
227 |
+
xt_cls: int = -1 # 上一个字符所属段落,保证无论第一个字符属于哪个类别都可以触发新段落
|
228 |
+
vmax: float = ltpage.width / 4 # 行内公式最大宽度
|
229 |
+
ops: str = "" # 渲染结果
|
230 |
+
|
231 |
+
def vflag(font: str, char: str): # 匹配公式(和角标)字体
|
232 |
+
if isinstance(font, bytes): # 不一定能 decode,直接转 str
|
233 |
+
font = str(font)
|
234 |
+
font = font.split("+")[-1] # 字体名截断
|
235 |
+
if re.match(r"\(cid:", char):
|
236 |
+
return True
|
237 |
+
# 基于字体名规则的判定
|
238 |
+
if self.vfont:
|
239 |
+
if re.match(self.vfont, font):
|
240 |
+
return True
|
241 |
+
else:
|
242 |
+
if re.match( # latex 字体
|
243 |
+
r"(CM[^R]|(MS|XY|MT|BL|RM|EU|LA|RS)[A-Z]|LINE|LCIRCLE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)",
|
244 |
+
font,
|
245 |
+
):
|
246 |
+
return True
|
247 |
+
# 基于字符集规则的判定
|
248 |
+
if self.vchar:
|
249 |
+
if re.match(self.vchar, char):
|
250 |
+
return True
|
251 |
+
else:
|
252 |
+
if (
|
253 |
+
char
|
254 |
+
and char != " " # 非空格
|
255 |
+
and (
|
256 |
+
unicodedata.category(char[0])
|
257 |
+
in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"] # 文字修饰符、数学符号、分隔符号
|
258 |
+
or ord(char[0]) in range(0x370, 0x400) # 希腊字母
|
259 |
+
)
|
260 |
+
):
|
261 |
+
return True
|
262 |
+
return False
|
263 |
+
|
264 |
+
############################################################
|
265 |
+
# A. 原文档解析
|
266 |
+
for child in ltpage:
|
267 |
+
if isinstance(child, LTChar):
|
268 |
+
self.il_creater.on_lt_char(child)
|
269 |
+
continue
|
270 |
+
cur_v = False
|
271 |
+
layout = self.layout[ltpage.pageid]
|
272 |
+
# ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
|
273 |
+
h, w = layout.shape
|
274 |
+
# 读取当前字符在 layout 中的类别
|
275 |
+
cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
|
276 |
+
cls = layout[cy, cx]
|
277 |
+
# 锚定文档中 bullet 的位置
|
278 |
+
if child.get_text() == "•":
|
279 |
+
cls = 0
|
280 |
+
# 判定当前字符是否属于公式
|
281 |
+
if ( # 判定当前字符是否属于公式
|
282 |
+
cls == 0 # 1. 类别为保留区域
|
283 |
+
or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1].size * 0.79) # 2. 角标字体,有 0.76 的角标和 0.799 的大写,这里用 0.79 取中,同时考虑首字母放大的情况
|
284 |
+
or vflag(child.fontname, child.get_text()) # 3. 公式字体
|
285 |
+
or (child.matrix[0] == 0 and child.matrix[3] == 0) # 4. 垂直字体
|
286 |
+
):
|
287 |
+
cur_v = True
|
288 |
+
# 判定括号组是否属于公式
|
289 |
+
if not cur_v:
|
290 |
+
if vstk and child.get_text() == "(":
|
291 |
+
cur_v = True
|
292 |
+
vbkt += 1
|
293 |
+
if vbkt and child.get_text() == ")":
|
294 |
+
cur_v = True
|
295 |
+
vbkt -= 1
|
296 |
+
if ( # 判定当前公式是否结束
|
297 |
+
not cur_v # 1. 当前字符不属于公式
|
298 |
+
or cls != xt_cls # 2. 当前字符与前一个字符不属于同一段落
|
299 |
+
# or (abs(child.x0 - xt.x0) > vmax and cls != 0) # 3. 段落内换行,可能是一长串斜体的段落,也可能是段内分式换行,这里设个阈值进行区分
|
300 |
+
# 禁止纯公式(代码)段落换行,直到文字开始再重开文字段落,保证只存在两种情况
|
301 |
+
# A. 纯公式(代码)段落(锚定绝对位置)sstk[-1]=="" -> sstk[-1]=="{v*}"
|
302 |
+
# B. 文字开头段落(排版相对位置)sstk[-1]!=""
|
303 |
+
or (sstk[-1] != "" and abs(child.x0 - xt.x0) > vmax) # 因为 cls==xt_cls==0 一定有 sstk[-1]=="",所以这里不需要再判定 cls!=0
|
304 |
+
):
|
305 |
+
if vstk:
|
306 |
+
if ( # 根据公式右侧的文字修正公式的纵向偏移
|
307 |
+
not cur_v # 1. 当前字符不属于公式
|
308 |
+
and cls == xt_cls # 2. 当前字符与前一个字符属于同一段落
|
309 |
+
and child.x0 > max([vch.x0 for vch in vstk]) # 3. 当前字符在公式右侧
|
310 |
+
):
|
311 |
+
vfix = vstk[0].y0 - child.y0
|
312 |
+
if sstk[-1] == "":
|
313 |
+
xt_cls = -1 # 禁止纯公式段落(sstk[-1]=="{v*}")的后续连接,但是要考虑新字符和后续字符的连接,所以这里修改的是上个字符的类别
|
314 |
+
sstk[-1] += f"{{v{len(var)}}}"
|
315 |
+
var.append(vstk)
|
316 |
+
varl.append(vlstk)
|
317 |
+
varf.append(vfix)
|
318 |
+
vstk = []
|
319 |
+
vlstk = []
|
320 |
+
vfix = 0
|
321 |
+
# 当前字符不属于公式或当前字符是公式的第一个字符
|
322 |
+
if not vstk:
|
323 |
+
if cls == xt_cls: # 当前字符与前一个字符属于同一段落
|
324 |
+
if child.x0 > xt.x1 + 1: # 添加行内空格
|
325 |
+
sstk[-1] += " "
|
326 |
+
elif child.x1 < xt.x0: # 添加换行空格并标记原文段落存在换行
|
327 |
+
sstk[-1] += " "
|
328 |
+
pstk[-1].brk = True
|
329 |
+
else: # 根据当前字符构建一个新的段落
|
330 |
+
sstk.append("")
|
331 |
+
pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, False))
|
332 |
+
if not cur_v: # 文字入栈
|
333 |
+
if ( # 根据当前字符修正段落属性
|
334 |
+
child.size > pstk[-1].size / 0.79 # 1. 当前字符显著比段落字体大
|
335 |
+
or len(sstk[-1].strip()) == 1 # 2. 当前字符为段落第二个文字(考虑首字母放大的情况)
|
336 |
+
) and child.get_text() != " ": # 3. 当前字符不是空格
|
337 |
+
pstk[-1].y -= child.size - pstk[-1].size # 修正段落初始纵坐标,假设两个不同大小字符的上边界对齐
|
338 |
+
pstk[-1].size = child.size
|
339 |
+
sstk[-1] += child.get_text()
|
340 |
+
else: # 公式入栈
|
341 |
+
if ( # 根据公式左侧的文字修正公式的纵向偏移
|
342 |
+
not vstk # 1. 当前字符是公式的第一个字符
|
343 |
+
and cls == xt_cls # 2. 当前字符与前一个字符属于同一段落
|
344 |
+
and child.x0 > xt.x0 # 3. 前一个字符在公式左侧
|
345 |
+
):
|
346 |
+
vfix = child.y0 - xt.y0
|
347 |
+
vstk.append(child)
|
348 |
+
# 更新段落边界,因为段落内换行之后可能是公式开头,所以要在外边处理
|
349 |
+
pstk[-1].x0 = min(pstk[-1].x0, child.x0)
|
350 |
+
pstk[-1].x1 = max(pstk[-1].x1, child.x1)
|
351 |
+
# 更新上一个字符
|
352 |
+
xt = child
|
353 |
+
xt_cls = cls
|
354 |
+
elif isinstance(child, LTFigure):
|
355 |
+
# 图表
|
356 |
+
self.il_creater.on_pdf_figure(child)
|
357 |
+
pass
|
358 |
+
elif isinstance(child, LTLine): # 线条
|
359 |
+
continue
|
360 |
+
layout = self.layout[ltpage.pageid]
|
361 |
+
# ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
|
362 |
+
h, w = layout.shape
|
363 |
+
# 读取当前线条在 layout 中的类别
|
364 |
+
cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
|
365 |
+
cls = layout[cy, cx]
|
366 |
+
if vstk and cls == xt_cls: # 公式线条
|
367 |
+
vlstk.append(child)
|
368 |
+
else: # 全局线条
|
369 |
+
lstk.append(child)
|
370 |
+
else:
|
371 |
+
pass
|
372 |
+
return
|
373 |
+
# 处理结尾
|
374 |
+
if vstk: # 公式出栈
|
375 |
+
sstk[-1] += f"{{v{len(var)}}}"
|
376 |
+
var.append(vstk)
|
377 |
+
varl.append(vlstk)
|
378 |
+
varf.append(vfix)
|
379 |
+
log.debug("\n==========[VSTACK]==========\n")
|
380 |
+
for var_id, v in enumerate(var): # 计算公式宽度
|
381 |
+
l = max([vch.x1 for vch in v]) - v[0].x0
|
382 |
+
log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[var_id])} > v{var_id} = {"".join([ch.get_text() for ch in v])}')
|
383 |
+
vlen.append(l)
|
384 |
+
|
385 |
+
############################################################
|
386 |
+
# B. 段落翻译
|
387 |
+
log.debug("\n==========[SSTACK]==========\n")
|
388 |
+
|
389 |
+
news = sstk.copy()
|
390 |
+
|
391 |
+
############################################################
|
392 |
+
# C. 新文档排版
|
393 |
+
def raw_string(fcur: str, cstk: str): # 编码字符串
|
394 |
+
if fcur == 'noto':
|
395 |
+
return "".join([f"{self.noto.has_glyph(ord(c)):04x}" for c in cstk])
|
396 |
+
elif isinstance(self.fontmap[fcur], PDFCIDFont): # 判断编码长度
|
397 |
+
return "".join([f"{ord(c):04x}" for c in cstk])
|
398 |
+
else:
|
399 |
+
return "".join([f"{ord(c):02x}" for c in cstk])
|
400 |
+
|
401 |
+
_x, _y = 0, 0
|
402 |
+
for para_id, new in enumerate(news):
|
403 |
+
x: float = pstk[para_id].x # 段落初始横坐标
|
404 |
+
y: float = pstk[para_id].y # 段落初始纵坐标
|
405 |
+
x0: float = pstk[para_id].x0 # 段落左边界
|
406 |
+
x1: float = pstk[para_id].x1 # 段落右边界
|
407 |
+
size: float = pstk[para_id].size # 段落字体大小
|
408 |
+
brk: bool = pstk[para_id].brk # 段落换行标记
|
409 |
+
cstk: str = "" # 当前文字栈
|
410 |
+
fcur: str = None # 当前字体 ID
|
411 |
+
tx = x
|
412 |
+
fcur_ = fcur
|
413 |
+
ptr = 0
|
414 |
+
log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[para_id]} | {new}")
|
415 |
+
while ptr < len(new):
|
416 |
+
vy_regex = re.match(
|
417 |
+
r"\{\s*v([\d\s]+)\}", new[ptr:], re.IGNORECASE,
|
418 |
+
) # 匹配 {vn} 公式标记
|
419 |
+
mod = 0 # 文字修饰符
|
420 |
+
if vy_regex: # 加载公式
|
421 |
+
ptr += len(vy_regex.group(0))
|
422 |
+
try:
|
423 |
+
vid = int(vy_regex.group(1).replace(" ", ""))
|
424 |
+
adv = vlen[vid]
|
425 |
+
except Exception as e:
|
426 |
+
log.debug("Skipping formula placeholder due to: %s", e)
|
427 |
+
continue # 翻译器可能会自动补个越界的公式标记
|
428 |
+
if var[vid][-1].get_text() and unicodedata.category(var[vid][-1].get_text()[0]) in ["Lm", "Mn", "Sk"]: # 文字修饰符
|
429 |
+
mod = var[vid][-1].width
|
430 |
+
else: # 加载文字
|
431 |
+
ch = new[ptr]
|
432 |
+
fcur_ = None
|
433 |
+
try:
|
434 |
+
if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
|
435 |
+
fcur_ = "tiro" # 默认拉丁字体
|
436 |
+
except Exception:
|
437 |
+
pass
|
438 |
+
if fcur_ is None:
|
439 |
+
fcur_ = self.resfont # 默认非拉丁字体
|
440 |
+
if fcur_ == 'noto':
|
441 |
+
adv = self.noto.char_lengths(ch, size)[0]
|
442 |
+
else:
|
443 |
+
adv = self.fontmap[fcur_].char_width(ord(ch)) * size
|
444 |
+
ptr += 1
|
445 |
+
if ( # 输出文字缓冲区
|
446 |
+
fcur_ != fcur # 1. 字体更新
|
447 |
+
or vy_regex # 2. 插入公式
|
448 |
+
or x + adv > x1 + 0.1 * size # 3. 到达右边界(可能一整行都被符号化,这里需要考虑浮点误差)
|
449 |
+
):
|
450 |
+
if cstk:
|
451 |
+
ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
|
452 |
+
cstk = ""
|
453 |
+
if brk and x + adv > x1 + 0.1 * size: # 到达右边界且原文段落存在换行
|
454 |
+
x = x0
|
455 |
+
lang_space = {"zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
|
456 |
+
# y -= size * lang_space.get(self.translator.lang_out.lower(), 1.1) # 小语种大多适配 1.1
|
457 |
+
y -= size * 1.4
|
458 |
+
if vy_regex: # 插入公式
|
459 |
+
fix = 0
|
460 |
+
if fcur is not None: # 段落内公式修正纵向偏移
|
461 |
+
fix = varf[vid]
|
462 |
+
for vch in var[vid]: # 排版公式字符
|
463 |
+
vc = chr(vch.cid)
|
464 |
+
ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm <{raw_string(self.fontid[vch.font], vc)}> TJ "
|
465 |
+
if log.isEnabledFor(logging.DEBUG):
|
466 |
+
lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
|
467 |
+
_x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
|
468 |
+
for l in varl[vid]: # 排版公式线条
|
469 |
+
if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景
|
470 |
+
ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
|
471 |
+
else: # 插入文字缓冲区
|
472 |
+
if not cstk: # 单行开头
|
473 |
+
tx = x
|
474 |
+
if x == x0 and ch == " ": # 消除段落换行空格
|
475 |
+
adv = 0
|
476 |
+
else:
|
477 |
+
cstk += ch
|
478 |
+
else:
|
479 |
+
cstk += ch
|
480 |
+
adv -= mod # 文字修饰符
|
481 |
+
fcur = fcur_
|
482 |
+
x += adv
|
483 |
+
if log.isEnabledFor(logging.DEBUG):
|
484 |
+
lstk.append(LTLine(0.1, (_x, _y), (x, y)))
|
485 |
+
_x, _y = x, y
|
486 |
+
# 处理结尾
|
487 |
+
if cstk:
|
488 |
+
ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm <{raw_string(fcur, cstk)}> TJ "
|
489 |
+
for l in lstk: # 排版全局线条
|
490 |
+
if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景
|
491 |
+
ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
|
492 |
+
ops = f"BT {ops}ET "
|
493 |
+
return ops
|
src/pdf2u/document_il/__init__.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pdf2u.document_il.il_version_1 import (
|
2 |
+
BaseOperations,
|
3 |
+
Box,
|
4 |
+
Cropbox,
|
5 |
+
Document,
|
6 |
+
GraphicState,
|
7 |
+
Mediabox,
|
8 |
+
Page,
|
9 |
+
PageLayout,
|
10 |
+
PdfCharacter,
|
11 |
+
PdfFigure,
|
12 |
+
PdfFont,
|
13 |
+
PdfFormula,
|
14 |
+
PdfLine,
|
15 |
+
PdfParagraph,
|
16 |
+
PdfParagraphComposition,
|
17 |
+
PdfRectangle,
|
18 |
+
PdfSameStyleCharacters,
|
19 |
+
PdfSameStyleUnicodeCharacters,
|
20 |
+
PdfStyle,
|
21 |
+
PdfXobject,
|
22 |
+
)
|
23 |
+
|
24 |
+
__all__ = [
|
25 |
+
"BaseOperations",
|
26 |
+
"Box",
|
27 |
+
"Cropbox",
|
28 |
+
"Document",
|
29 |
+
"GraphicState",
|
30 |
+
"Mediabox",
|
31 |
+
"Page",
|
32 |
+
"PageLayout",
|
33 |
+
"PdfCharacter",
|
34 |
+
"PdfFigure",
|
35 |
+
"PdfFont",
|
36 |
+
"PdfFormula",
|
37 |
+
"PdfLine",
|
38 |
+
"PdfParagraph",
|
39 |
+
"PdfParagraphComposition",
|
40 |
+
"PdfRectangle",
|
41 |
+
"PdfSameStyleCharacters",
|
42 |
+
"PdfSameStyleUnicodeCharacters",
|
43 |
+
"PdfStyle",
|
44 |
+
"PdfXobject",
|
45 |
+
]
|
src/pdf2u/document_il/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (912 Bytes). View file
|
|
src/pdf2u/document_il/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (706 Bytes). View file
|
|
src/pdf2u/document_il/__pycache__/il_version_1.cpython-311.pyc
ADDED
Binary file (22 kB). View file
|
|
src/pdf2u/document_il/__pycache__/il_version_1.cpython-312.pyc
ADDED
Binary file (17.1 kB). View file
|
|
src/pdf2u/document_il/__pycache__/xml_converter.cpython-311.pyc
ADDED
Binary file (4.42 kB). View file
|
|
src/pdf2u/document_il/__pycache__/xml_converter.cpython-312.pyc
ADDED
Binary file (3.81 kB). View file
|
|
src/pdf2u/document_il/backend/__init__.py
ADDED
File without changes
|
src/pdf2u/document_il/backend/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (181 Bytes). View file
|
|
src/pdf2u/document_il/backend/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (169 Bytes). View file
|
|
src/pdf2u/document_il/backend/__pycache__/pdf_creater.cpython-311.pyc
ADDED
Binary file (19.8 kB). View file
|
|
src/pdf2u/document_il/backend/__pycache__/pdf_creater.cpython-312.pyc
ADDED
Binary file (18.5 kB). View file
|
|
src/pdf2u/document_il/backend/pdf_creater.py
ADDED
@@ -0,0 +1,405 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import re
|
3 |
+
from pathlib import Path
|
4 |
+
|
5 |
+
import pymupdf
|
6 |
+
from bitstring import BitStream
|
7 |
+
|
8 |
+
from pdf2u.document_il import il_version_1
|
9 |
+
from pdf2u.document_il.utils.fontmap import FontMapper
|
10 |
+
from pdf2u.translation_config import TranslateResult, TranslationConfig
|
11 |
+
|
12 |
+
logger = logging.getLogger(__name__)
|
13 |
+
|
14 |
+
SUBSET_FONT_STAGE_NAME = "Subset font"
|
15 |
+
SAVE_PDF_STAGE_NAME = "Save PDF"
|
16 |
+
|
17 |
+
|
18 |
+
class PDFCreater:
    """Render an IL document onto the original PDF and save the result files.

    ``write`` produces the translated ("mono") PDF and, unless disabled, a
    "dual" PDF that interleaves original and translated pages.
    """

    stage_name = "Generate drawing instructions"

    def __init__(
        self,
        original_pdf_path: str,
        document: il_version_1.Document,
        translation_config: TranslationConfig,
    ):
        """Remember the source PDF, the IL document and the configuration.

        Args:
            original_pdf_path: Path of the PDF that is re-opened for output.
            document: IL document whose pages are rendered.
            translation_config: Configuration; also used to build the
                ``FontMapper``.
        """
        self.original_pdf_path = original_pdf_path
        self.docs = document
        self.font_path = translation_config.font
        self.font_mapper = FontMapper(translation_config)
        self.translation_config = translation_config

    def render_graphic_state(
        self, draw_op: BitStream, graphic_state: il_version_1.GraphicState
    ):
        """Append the graphic state's passthrough instruction to ``draw_op``.

        Does nothing when ``graphic_state`` is ``None`` or carries no
        passthrough instruction.
        """
        if graphic_state is None:
            return
        # NOTE: emitting explicit color-space / color operators from the
        # graphic state is currently disabled; only the captured passthrough
        # instruction is replayed.
        if graphic_state.passthrough_per_char_instruction:
            draw_op.append(
                f"{graphic_state.passthrough_per_char_instruction} \n".encode()
            )

    def render_paragraph_to_char(
        self, paragraph: il_version_1.PdfParagraph
    ) -> list[il_version_1.PdfCharacter]:
        """Return the ``PdfCharacter`` objects contained in ``paragraph``.

        Compositions whose ``pdf_character`` is not a ``PdfCharacter`` are
        logged and skipped.  If nothing could be collected although the
        paragraph has ``unicode`` text, an error is logged and the empty list
        is returned.
        """
        chars = []
        for composition in paragraph.pdf_paragraph_composition:
            if not isinstance(composition.pdf_character, il_version_1.PdfCharacter):
                logger.error(
                    "Unknown composition type. "
                    "This type only appears in the IL "
                    "after the translation is completed. "
                    "During pdf rendering, this type is not supported. "
                    "Composition: %s. Paragraph: %s.",
                    composition,
                    paragraph,
                )
                continue
            chars.append(composition.pdf_character)
        if not chars and paragraph.unicode:
            logger.error(
                "Unable to export paragraphs that have not yet been formatted: %s",
                paragraph,
            )
        return chars

    def get_available_font_list(self, pdf, page):
        """Return the font names found in the resources of ``page`` in ``pdf``."""
        page_xref_id = pdf[page.page_number].xref
        return self.get_xobj_available_fonts(page_xref_id, pdf)

    def get_xobj_available_fonts(self, page_xref_id, pdf):
        """Return the set of font names listed under an object's ``/Resources``.

        Args:
            page_xref_id: xref of the page or XObject to inspect.
            pdf: Document offering ``xref_get_key`` and ``xref_object``.

        Returns:
            Set of font resource names (without the leading slash); empty when
            the resources dictionary has no ``/Font`` entry.
        """
        resources_type, r_id = pdf.xref_get_key(page_xref_id, "Resources")
        if resources_type == "xref":
            # Indirect resources: load the referenced object's source text.
            resource_xref_id = re.search(r"(\d+) 0 R", r_id).group(1)
            r_id = pdf.xref_object(int(resource_xref_id))
            resources_type = "dict"
        if resources_type == "dict":
            font_ref = re.search(r"/Font (\d+) 0 R", r_id)
            if font_ref is not None:
                # /Font points at a separate object.
                font_dict = pdf.xref_object(int(font_ref.group(1)))
            else:
                # /Font is an inline dictionary.
                inline = re.search(r"/Font *<<(.+?)>>", r_id.replace("\n", " "))
                if inline is None:
                    # Have resources but no fonts
                    return set()
                font_dict = inline.group(1)
        else:
            r_id = int(r_id.split(" ")[0])
            _, font_dict = pdf.xref_get_key(r_id, "Font")
        return set(re.findall(r"/([^ ]+?) ", font_dict))

    def _debug_render_rectangle(
        self, draw_op: BitStream, rectangle: il_version_1.PdfRectangle
    ):
        """Draw a debug rectangle in PDF for visualization purposes.

        Args:
            draw_op: BitStream to append PDF drawing operations
            rectangle: Rectangle object containing position information
        """
        x1 = rectangle.box.x
        y1 = rectangle.box.y
        x2 = rectangle.box.x2
        y2 = rectangle.box.y2
        # Save graphics state
        draw_op.append(b"q ")

        # Apply the rectangle's own passthrough (color) instruction.
        draw_op.append(
            rectangle.graphic_state.passthrough_per_char_instruction.encode()
        )
        draw_op.append(b" 1 w ")  # Line width

        # Draw four lines manually
        # Bottom line
        draw_op.append(f"{x1} {y1} m {x2} {y1} l S ".encode())
        # Right line
        draw_op.append(f"{x2} {y1} m {x2} {y2} l S ".encode())
        # Top line
        draw_op.append(f"{x2} {y2} m {x1} {y2} l S ".encode())
        # Left line
        draw_op.append(f"{x1} {y2} m {x1} {y1} l S ".encode())

        # Restore graphics state
        draw_op.append(b"Q\n")

    def _collect_page_chars(self, page):
        """Return page-level characters followed by all paragraph characters."""
        chars = []
        if page.pdf_character:
            chars.extend(page.pdf_character)
        for paragraph in page.pdf_paragraph:
            chars.extend(self.render_paragraph_to_char(paragraph))
        return chars

    @staticmethod
    def _new_page_stream(base_op, page) -> BitStream:
        """Start a page stream: ``q {base_op} Q`` followed by the cropbox shift."""
        # q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}
        page_op = BitStream()
        page_op.append(b"q ")
        if base_op is not None:
            page_op.append(base_op)
        page_op.append(b" Q ")
        page_op.append(
            f"q Q 1 0 0 1 {page.cropbox.box.x} {page.cropbox.box.y} cm \n".encode()
        )
        return page_op

    def _render_char(self, draw_op: BitStream, char, encoding_length_map):
        """Append the operators that draw one character to ``draw_op``."""
        font_id = char.pdf_style.font_id
        char_size = char.pdf_style.font_size
        draw_op.append(b"q ")
        self.render_graphic_state(draw_op, char.pdf_style.graphic_state)
        if char.vertical:
            draw_op.append(
                f"BT /{font_id} {char_size:f} Tf 0 1 -1 0 {char.box.x2:f} {char.box.y:f} Tm ".encode()
            )
        else:
            draw_op.append(
                f"BT /{font_id} {char_size:f} Tf 1 0 0 1 {char.box.x:f} {char.box.y:f} Tm ".encode()
            )

        encoding_length = encoding_length_map[font_id]
        # pdf32000-2008 page14:
        # As hexadecimal data enclosed in angle brackets < >
        # see 7.3.4.3, "Hexadecimal Strings."
        draw_op.append(
            f"<{char.pdf_character_id:0{encoding_length * 2}x}>".upper().encode()
        )

        draw_op.append(b" Tj ET Q \n")

    def write_debug_info(
        self, pdf: pymupdf.Document, translation_config: TranslationConfig
    ):
        """Overlay debug characters and rectangles onto each page of ``pdf``.

        Only characters and rectangles flagged with ``debug_info`` are drawn.
        Each page's existing content stream is rewritten in place, and fonts
        are subset once after all pages have been processed.

        Args:
            pdf: Document to modify.
            translation_config: Used for cancellation checks.
        """
        self.font_mapper.add_font(pdf, self.docs)

        for page in self.docs.page:
            _, r_id = pdf.xref_get_key(pdf[page.page_number].xref, "Contents")
            resource_xref_id = re.search(r"(\d+) 0 R", r_id).group(1)
            base_op = pdf.xref_stream(int(resource_xref_id))
            translation_config.raise_if_cancelled()
            available_font_list = self.get_available_font_list(pdf, page)

            page_encoding_length_map = {
                f.font_id: f.encoding_length for f in page.pdf_font
            }
            page_op = self._new_page_stream(base_op, page)

            for char in self._collect_page_chars(page):
                if not getattr(char, "debug_info", False):
                    continue
                if char.char_unicode == "\n":
                    continue
                if char.pdf_character_id is None:
                    # dummy char
                    continue
                if char.pdf_style.font_id not in available_font_list:
                    continue
                self._render_char(page_op, char, page_encoding_length_map)

            for rect in page.pdf_rectangle:
                if not rect.debug_info:
                    continue
                self._debug_render_rectangle(page_op, rect)
            # Since this is a draw instruction container,
            # no additional information is needed
            pdf.update_stream(int(resource_xref_id), page_op.tobytes())
        translation_config.raise_if_cancelled()
        pdf.subset_fonts(fallback=False)

    def _render_page(self, pdf, page):
        """Rebuild the content streams of one page and of its XObjects."""
        available_font_list = self.get_available_font_list(pdf, page)
        xobj_available_fonts = {}
        xobj_draw_ops = {}
        xobj_encoding_length_map = {}

        for xobj in page.pdf_xobject:
            xobj_available_fonts[xobj.xobj_id] = available_font_list.copy()
            try:
                xobj_available_fonts[xobj.xobj_id].update(
                    self.get_xobj_available_fonts(xobj.xref_id, pdf)
                )
            except Exception:
                # Best effort: fall back to the page-level fonts only.
                logger.debug(
                    "Could not read fonts of xobject %s", xobj.xobj_id, exc_info=True
                )
            xobj_encoding_length_map[xobj.xobj_id] = {
                f.font_id: f.encoding_length for f in xobj.pdf_font
            }
            xobj_op = BitStream()
            xobj_op.append(xobj.base_operations.value.encode())
            xobj_draw_ops[xobj.xobj_id] = xobj_op

        page_encoding_length_map = {
            f.font_id: f.encoding_length for f in page.pdf_font
        }
        page_op = self._new_page_stream(page.base_operations.value.encode(), page)

        for char in self._collect_page_chars(page):
            if char.char_unicode == "\n":
                continue
            if char.pdf_character_id is None:
                # dummy char
                continue
            font_id = char.pdf_style.font_id
            if char.xobj_id in xobj_available_fonts:
                # Character belongs to an XObject: draw into that stream.
                if font_id not in xobj_available_fonts[char.xobj_id]:
                    continue
                draw_op = xobj_draw_ops[char.xobj_id]
                encoding_length_map = xobj_encoding_length_map[char.xobj_id]
            else:
                if font_id not in available_font_list:
                    continue
                draw_op = page_op
                encoding_length_map = page_encoding_length_map
            self._render_char(draw_op, char, encoding_length_map)

        for xobj in page.pdf_xobject:
            pdf.update_stream(xobj.xref_id, xobj_draw_ops[xobj.xobj_id].tobytes())
        for rect in page.pdf_rectangle:
            self._debug_render_rectangle(page_op, rect)

        op_container = pdf.get_new_xref()
        # Since this is a draw instruction container,
        # no additional information is needed
        pdf.update_object(op_container, "<<>>")
        pdf.update_stream(op_container, page_op.tobytes())
        pdf[page.page_number].set_contents(op_container)

    def _save_dual(self, pdf, translation_config, dual_out_path):
        """Build and save the PDF that interleaves original and translated pages."""
        with pymupdf.open(self.original_pdf_path) as dual:
            if translation_config.debug:
                translation_config.raise_if_cancelled()
                try:
                    self.write_debug_info(dual, translation_config)
                except Exception:
                    logger.warning(
                        "Failed to write debug info to dual PDF", exc_info=True
                    )
            dual.insert_file(pdf)
            page_count = pdf.page_count
            # Translated pages were appended at the end; move each one next
            # to its original (before or after it, depending on the config).
            offset = 0 if translation_config.dual_translate_first else 1
            for page_id in range(page_count):
                dual.move_page(page_count + page_id, page_id * 2 + offset)
            dual.save(
                dual_out_path,
                garbage=3,
                deflate=True,
                clean=not translation_config.skip_clean,
                deflate_fonts=True,
                linear=True,
            )
            if translation_config.debug:
                translation_config.raise_if_cancelled()
                dual.save(
                    f"{dual_out_path}.decompressed.pdf", expand=True, pretty=True
                )

    def write(self, translation_config: TranslationConfig) -> TranslateResult:
        """Render every IL page and save the mono and dual PDFs.

        Args:
            translation_config: Supplies output paths, feature switches
                (``debug``, ``skip_clean``, ``no_mono``, ``no_dual``,
                ``dual_translate_first``) and cancellation checks.

        Returns:
            ``TranslateResult`` built from the mono output path and the dual
            output path (``None`` when the dual PDF is disabled).
        """
        basename = Path(translation_config.input_file).stem
        debug_suffix = ".debug" if translation_config.debug else ""
        mono_out_path = translation_config.get_output_file_path(
            f"{basename}{debug_suffix}.{translation_config.lang_out}.mono.pdf"
        )
        progress_monitor = self.translation_config.progress_monitor
        # The documents are context-managed so they are closed even on
        # cancellation or failure.
        with pymupdf.open(self.original_pdf_path) as pdf:
            self.font_mapper.add_font(pdf, self.docs)
            with progress_monitor.stage_start(
                self.stage_name, len(self.docs.page)
            ) as pbar:
                for page in self.docs.page:
                    translation_config.raise_if_cancelled()
                    self._render_page(pdf, page)
                    pbar.advance()
            translation_config.raise_if_cancelled()
            with progress_monitor.stage_start(SUBSET_FONT_STAGE_NAME, 1) as pbar:
                if not translation_config.skip_clean:
                    pdf.subset_fonts(fallback=False)
                pbar.advance()
            with progress_monitor.stage_start(SAVE_PDF_STAGE_NAME, 2) as pbar:
                if not translation_config.no_mono:
                    if translation_config.debug:
                        translation_config.raise_if_cancelled()
                        pdf.save(
                            f"{mono_out_path}.decompressed.pdf",
                            expand=True,
                            pretty=True,
                        )
                    translation_config.raise_if_cancelled()
                    pdf.save(
                        mono_out_path,
                        garbage=3,
                        deflate=True,
                        clean=not translation_config.skip_clean,
                        deflate_fonts=True,
                        linear=True,
                    )
                pbar.advance()
                dual_out_path = None
                if not translation_config.no_dual:
                    dual_out_path = translation_config.get_output_file_path(
                        f"{basename}{debug_suffix}.{translation_config.lang_out}.dual.pdf"
                    )
                    translation_config.raise_if_cancelled()
                    self._save_dual(pdf, translation_config, dual_out_path)
                pbar.advance()
        return TranslateResult(mono_out_path, dual_out_path)
|
src/pdf2u/document_il/frontend/__init__.py
ADDED
File without changes
|
src/pdf2u/document_il/frontend/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (182 Bytes). View file
|
|
src/pdf2u/document_il/frontend/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (170 Bytes). View file
|
|
src/pdf2u/document_il/frontend/__pycache__/il_creater.cpython-311.pyc
ADDED
Binary file (19 kB). View file
|
|
src/pdf2u/document_il/frontend/__pycache__/il_creater.cpython-312.pyc
ADDED
Binary file (18 kB). View file
|
|
src/pdf2u/document_il/frontend/il_creater.py
ADDED
@@ -0,0 +1,328 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
import logging
|
3 |
+
import re
|
4 |
+
|
5 |
+
import pdfminer.pdfinterp
|
6 |
+
import pymupdf
|
7 |
+
from pdfminer.layout import LTChar, LTFigure
|
8 |
+
from pdfminer.pdffont import PDFCIDFont, PDFFont
|
9 |
+
from pdfminer.psparser import PSLiteral
|
10 |
+
|
11 |
+
from pdf2u.document_il import il_version_1
|
12 |
+
from pdf2u.translation_config import TranslationConfig
|
13 |
+
|
14 |
+
logger = logging.getLogger(__name__)
|
15 |
+
|
16 |
+
|
17 |
+
class ILCreater:
    """Collect PDF parsing callbacks into an ``il_version_1.Document``.

    The ``on_*`` methods are callbacks (presumably invoked by the project's
    PDF interpreter - confirm against the caller). The creator keeps track of
    the page being built, the active XObject scope and the per-character
    passthrough operators, and appends the resulting IL nodes to ``docs``.

    ``mupdf`` starts as ``None`` and must be assigned by the caller before
    ``on_page_resource_font`` is used, because font inspection reads from it.
    """

    stage_name = "Parse PDF and Create Intermediate Representation"

    # Operators tracked per character. Matching ignores case, so for example
    # both ``rg`` and ``RG`` are accepted.
    _PASSTHROUGH_OPERATOR_RE = re.compile(
        "^(sc|scn|g|rg|k|cs|gs|ri)$", re.IGNORECASE
    )
    # First code of a CMap ``begincodespacerange`` section. Codes are
    # hexadecimal strings, and the section header may be followed by any
    # whitespace (including ``\r\n``).
    _CODESPACE_START_RE = re.compile(
        rb"begincodespacerange\s*<\s*([0-9A-Fa-f]+)\s*>"
    )

    def __init__(self, translation_config: TranslationConfig):
        """Create an empty IL document bound to ``translation_config``."""
        self.progress = None
        self.current_page: il_version_1.Page | None = None
        self.mupdf: pymupdf.Document | None = None
        self.model = translation_config.doc_layout_model
        self.docs = il_version_1.Document(page=[])
        self.stroking_color_space_name = None
        self.non_stroking_color_space_name = None
        self.passthrough_per_char_instruction: list[tuple[str, str]] = []
        self.translation_config = translation_config
        self.passthrough_per_char_instruction_stack: list[list[tuple[str, str]]] = []
        self.xobj_id = 0
        self.xobj_inc = 0
        self.xobj_map: dict[int, il_version_1.PdfXobject] = {}
        self.xobj_stack = []
        # Initialised here so push_xobj()/on_page_resource_font() do not raise
        # AttributeError when they run before the first on_page_start().
        self.current_page_font_name_id_map: dict[str, str] = {}

    def on_finish(self):
        """Close the progress stage opened by ``on_total_pages`` (if any)."""
        if self.progress is not None:
            self.progress.__exit__(None, None, None)

    def is_passthrough_per_char_operation(self, operator: str):
        """Return a match object if ``operator`` is a tracked operator, else ``None``."""
        return self._PASSTHROUGH_OPERATOR_RE.match(operator)

    def on_passthrough_per_char(self, operator: str, args: list[str]):
        """Record the latest arguments for a per-character passthrough operator.

        An earlier entry for the same operator (case-sensitive comparison) is
        dropped, so the list keeps at most one entry per operator, ordered by
        last use. Unknown operators are logged and ignored.
        """
        if not self.is_passthrough_per_char_operation(operator):
            logger.error("Unknown passthrough_per_char operation: %s", operator)
            return
        rendered_args = [self.parse_arg(arg) for arg in args]
        instructions = self.passthrough_per_char_instruction
        for index, (existing_op, _) in enumerate(instructions):
            if existing_op == operator:
                del instructions[index]
                break
        instructions.append((operator, " ".join(rendered_args)))

    def remove_latest_passthrough_per_char_instruction(self):
        """Drop the most recently recorded passthrough instruction, if any."""
        if self.passthrough_per_char_instruction:
            self.passthrough_per_char_instruction.pop()

    def parse_arg(self, arg: object) -> str:
        """Render one operator argument as text.

        ``PSLiteral`` values become ``/name``; strings are returned unchanged;
        anything else goes through ``str()``.
        """
        if isinstance(arg, PSLiteral):
            return f"/{arg.name}"
        return arg if isinstance(arg, str) else str(arg)

    def pop_passthrough_per_char_instruction(self):
        """Restore the passthrough instructions saved by the matching push.

        If the stack is empty (unbalanced push/pop) the current instructions
        are cleared and the problem is logged.
        """
        if self.passthrough_per_char_instruction_stack:
            self.passthrough_per_char_instruction = (
                self.passthrough_per_char_instruction_stack.pop()
            )
            return
        self.passthrough_per_char_instruction = []
        # Use the module logger (not the root logger) and tolerate a missing
        # current page so the error path itself cannot raise.
        logger.error(
            "pop_passthrough_per_char_instruction error on page: %s",
            getattr(self.current_page, "page_number", None),
        )

    def push_passthrough_per_char_instruction(self):
        """Save a copy of the current passthrough instructions on the stack."""
        self.passthrough_per_char_instruction_stack.append(
            self.passthrough_per_char_instruction.copy()
        )

    # pdf32000 page 171
    def on_stroking_color_space(self, color_space_name):
        """Remember the current stroking colour space name."""
        self.stroking_color_space_name = color_space_name

    def on_non_stroking_color_space(self, color_space_name):
        """Remember the current non-stroking colour space name."""
        self.non_stroking_color_space_name = color_space_name

    def on_new_stream(self):
        """Reset colour spaces and passthrough instructions for a new stream."""
        self.stroking_color_space_name = None
        self.non_stroking_color_space_name = None
        self.passthrough_per_char_instruction = []

    def push_xobj(self):
        """Save the font-name map and XObject id, then start a fresh font map."""
        self.xobj_stack.append(
            (self.current_page_font_name_id_map.copy(), self.xobj_id)
        )
        self.current_page_font_name_id_map = {}

    def pop_xobj(self):
        """Restore the font-name map and XObject id saved by ``push_xobj``."""
        self.current_page_font_name_id_map, self.xobj_id = self.xobj_stack.pop()

    def on_xobj_begin(self, bbox, xref_id):
        """Open a new XObject scope and register it on the current page.

        Args:
            bbox: Sequence of four numbers (x, y, x2, y2).
            xref_id: Cross-reference id of the XObject.

        Returns:
            The id allocated for the new XObject.
        """
        self.push_passthrough_per_char_instruction()
        self.push_xobj()
        self.xobj_inc += 1
        self.xobj_id = self.xobj_inc
        xobject = il_version_1.PdfXobject(
            box=il_version_1.Box(
                x=float(bbox[0]),
                y=float(bbox[1]),
                x2=float(bbox[2]),
                y2=float(bbox[3]),
            ),
            xobj_id=self.xobj_id,
            xref_id=xref_id,
        )
        self.current_page.pdf_xobject.append(xobject)
        self.xobj_map[self.xobj_id] = xobject
        return self.xobj_id

    def on_xobj_end(self, xobj_id, base_op):
        """Close an XObject scope and store its base operations."""
        self.pop_passthrough_per_char_instruction()
        self.pop_xobj()
        xobj = self.xobj_map[xobj_id]
        xobj.base_operations = il_version_1.BaseOperations(value=base_op)
        # NOTE(review): the counter is bumped here as well as in
        # on_xobj_begin, so allocated ids skip values; kept as-is.
        self.xobj_inc += 1

    def on_page_start(self):
        """Start a new page and reset the per-page state."""
        self.current_page = il_version_1.Page(
            pdf_font=[],
            pdf_character=[],
            page_layout=[],
            # currently don't support UserUnit page parameter
            # pdf32000 page 79
            unit="point",
        )
        self.current_page_font_name_id_map = {}
        self.passthrough_per_char_instruction_stack = []
        self.xobj_stack = []
        self.non_stroking_color_space_name = None
        self.stroking_color_space_name = None
        self.docs.page.append(self.current_page)

    def on_page_end(self):
        """Advance the progress stage by one page (if a stage is open)."""
        if self.progress is not None:
            self.progress.advance(1)

    def on_page_crop_box(
        self, x0: float | int, y0: float | int, x1: float | int, y1: float | int
    ):
        """Store the crop box of the current page."""
        box = il_version_1.Box(x=float(x0), y=float(y0), x2=float(x1), y2=float(y1))
        self.current_page.cropbox = il_version_1.Cropbox(box=box)

    def on_page_media_box(
        self, x0: float | int, y0: float | int, x1: float | int, y1: float | int
    ):
        """Store the media box of the current page."""
        box = il_version_1.Box(x=float(x0), y=float(y0), x2=float(x1), y2=float(y1))
        self.current_page.mediabox = il_version_1.Mediabox(box=box)

    def on_page_number(self, page_number: int):
        """Store the (non-negative) page number of the current page."""
        assert isinstance(page_number, int)
        assert page_number >= 0
        self.current_page.page_number = page_number

    def on_page_base_operation(self, operation: str):
        """Store the base operations string of the current page."""
        self.current_page.base_operations = il_version_1.BaseOperations(value=operation)

    def _cid_encoding_length(self, font: PDFCIDFont, xref_id: int) -> int:
        """Estimate the number of bytes per character code of a CID font."""
        try:
            # pdf 32000:2008 page 273
            # Table 118 - Predefined CJK CMap names
            _, encoding = self.mupdf.xref_get_key(xref_id, "Encoding")
            if encoding in ("/Identity-H", "/Identity-V"):
                return 2
            _, to_unicode_id = self.mupdf.xref_get_key(xref_id, "ToUnicode")
            to_unicode_bytes = self.mupdf.xref_stream(
                int(to_unicode_id.split(" ")[0])
            )
            start_code = self._CODESPACE_START_RE.search(to_unicode_bytes).group(1)
            # Two hex digits per byte; an odd trailing digit still occupies a
            # byte (PDF hex strings pad a missing final digit with 0).
            return (len(start_code) + 1) // 2
        except Exception:
            # Best effort: any failure above falls back to the CID map.
            logger.debug(
                "Could not read codespace range for font xref %s", xref_id,
                exc_info=True,
            )
        # The fallback must not raise: unicode_map may be None or may not
        # expose cid2unichr, and the mapping may be empty.
        cid_map = getattr(getattr(font, "unicode_map", None), "cid2unichr", None)
        if cid_map and max(cid_map) > 255:
            return 2
        return 1

    def _font_style_flags(self, xref_id: int):
        """Return ``(bold, italic, monospaced, serif)``, or all ``None`` on failure."""
        try:
            mupdf_font = pymupdf.Font(fontbuffer=self.mupdf.extract_font(xref_id)[3])
            return (
                mupdf_font.is_bold,
                mupdf_font.is_italic,
                mupdf_font.is_monospaced,
                mupdf_font.is_serif,
            )
        except Exception:
            # Style flags are optional metadata; missing values are allowed.
            return None, None, None, None

    def on_page_resource_font(self, font: PDFFont, xref_id: int, font_id: str):
        """Register a font resource on the active XObject or the current page.

        Args:
            font: Parsed font object; its name, ascent and descent are copied.
            xref_id: Cross-reference id used to inspect the font via ``mupdf``.
            font_id: Resource id under which characters refer to this font.
        """
        font_name = font.fontname
        if isinstance(font_name, bytes):
            try:
                font_name = font_name.decode("utf-8")
            except UnicodeDecodeError:
                # Keep undecodable names lossless by base64-encoding them.
                font_name = "BASE64:" + base64.b64encode(font_name).decode("utf-8")

        encoding_length = 1
        if isinstance(font, PDFCIDFont):
            encoding_length = self._cid_encoding_length(font, xref_id)

        bold, italic, monospaced, serif = self._font_style_flags(xref_id)
        il_font_metadata = il_version_1.PdfFont(
            name=font_name,
            xref_id=xref_id,
            font_id=font_id,
            encoding_length=encoding_length,
            bold=bold,
            italic=italic,
            monospace=monospaced,
            serif=serif,
            ascent=font.ascent,
            descent=font.descent,
        )
        self.current_page_font_name_id_map[font_name] = font_id
        # Fonts declared inside an XObject belong to that XObject.
        if self.xobj_id in self.xobj_map:
            self.xobj_map[self.xobj_id].pdf_font.append(il_font_metadata)
        else:
            self.current_page.pdf_font.append(il_font_metadata)

    def create_graphic_state(self, gs: pdfminer.pdfinterp.PDFGraphicState):
        """Convert an interpreter graphic state into an IL ``GraphicState``.

        Only ``scolor``, ``ncolor`` and ``linewidth`` are copied from ``gs``;
        ``None`` values are skipped. Colours are always stored as lists. The
        colour space names come from this creator, and the passthrough string
        is built from ``gs.passthrough_instruction`` as ``"<args> <op>"`` pairs.
        """
        graphic_state = il_version_1.GraphicState()
        attributes = vars(gs)
        for color_key in ("scolor", "ncolor"):
            color = attributes.get(color_key)
            if color is None:
                continue
            # Scalars (e.g. a gray level) are wrapped so the field is a list.
            setattr(
                graphic_state,
                color_key,
                list(color) if isinstance(color, tuple) else [color],
            )
        linewidth = attributes.get("linewidth")
        if linewidth is not None:
            graphic_state.linewidth = float(linewidth)

        graphic_state.stroking_color_space_name = self.stroking_color_space_name
        graphic_state.non_stroking_color_space_name = self.non_stroking_color_space_name

        graphic_state.passthrough_per_char_instruction = " ".join(
            f"{arg} {op}" for op, arg in gs.passthrough_instruction
        )

        return graphic_state

    def on_lt_char(self, char: LTChar):
        """Append one character to the current page.

        Characters whose text is longer than one code point are skipped unless
        the text is a ``(cid:N)`` placeholder. The bounding box is shifted by
        the font descent (scaled to the character size): along x for vertical
        text, along y otherwise.
        """
        char_unicode = char.get_text()
        # Skip before doing any other work for this character.
        if "(cid:" not in char_unicode and len(char_unicode) > 1:
            return

        gs = self.create_graphic_state(char.graphicstate)

        # Get font from current page or xobject
        font_owner = self.xobj_map.get(self.xobj_id, self.current_page)
        font = next(
            (
                candidate
                for candidate in font_owner.pdf_font
                if candidate.font_id == char.aw_font_id
            ),
            None,
        )

        # Descent is optional on PdfFont; treat a missing value as 0.
        descent = 0
        if font is not None and getattr(font, "descent", None) is not None:
            descent = font.descent * char.size / 1000

        x0, y0, x1, y1 = char.bbox[0], char.bbox[1], char.bbox[2], char.bbox[3]
        vertical = char.matrix[0] == 0 and char.matrix[3] == 0
        if vertical:
            bbox = il_version_1.Box(x=x0 - descent, y=y0, x2=x1 - descent, y2=y1)
        else:
            # Add descent to y coordinates
            bbox = il_version_1.Box(x=x0, y=y0 + descent, x2=x1, y2=y1 + descent)

        pdf_style = il_version_1.PdfStyle(
            font_id=char.aw_font_id, font_size=char.size, graphic_state=gs
        )
        pdf_char = il_version_1.PdfCharacter(
            box=bbox,
            pdf_character_id=char.cid,
            advance=char.adv,
            char_unicode=char_unicode,
            vertical=vertical,
            pdf_style=pdf_style,
            xobj_id=char.xobj_id,
        )
        self.current_page.pdf_character.append(pdf_char)

    def create_il(self):
        """Return the document, keeping only the pages selected for translation.

        Page numbers are stored 0-based; ``should_translate_page`` is queried
        with the 1-based number.
        """
        self.docs.page = [
            page
            for page in self.docs.page
            if self.translation_config.should_translate_page(page.page_number + 1)
        ]
        return self.docs

    def on_total_pages(self, total_pages: int):
        """Record the page count and open the progress stage.

        The stage total counts every page for which ``should_translate_page``
        does not return exactly ``False``.
        """
        assert isinstance(total_pages, int)
        assert total_pages > 0
        self.docs.total_pages = total_pages
        total = sum(
            1
            for page_index in range(total_pages)
            if self.translation_config.should_translate_page(page_index + 1)
            is not False
        )
        self.progress = self.translation_config.progress_monitor.stage_start(
            self.stage_name, total
        )

    def on_pdf_figure(self, figure: LTFigure):
        """Append the bounding box of ``figure`` to the current page."""
        box = il_version_1.Box(
            x=figure.bbox[0],
            y=figure.bbox[1],
            x2=figure.bbox[2],
            y2=figure.bbox[3],
        )
        self.current_page.pdf_figure.append(il_version_1.PdfFigure(box=box))
|
src/pdf2u/document_il/il_version_1.py
ADDED
@@ -0,0 +1,396 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass, field
|
2 |
+
|
3 |
+
|
4 |
+
@dataclass
class BaseOperations:
    """Text content of a ``baseOperations`` element (defaults to an empty string)."""

    class Meta:
        # Element name, matching ``baseOperations`` in il_version_1.rnc.
        name = "baseOperations"

    value: str = field(default="", metadata={"required": True})
|
10 |
+
|
11 |
+
|
12 |
+
@dataclass
class Box:
    """Rectangle from ``(x, y)`` to ``(x2, y2)``.

    Field order matters: the generated ``__init__`` accepts the four
    coordinates positionally as ``x, y, x2, y2``.
    """

    class Meta:
        # Element name, matching ``box`` in il_version_1.rnc.
        name = "box"

    x: float | None = field(
        default=None, metadata={"type": "Attribute", "required": True}
    )
    y: float | None = field(
        default=None, metadata={"type": "Attribute", "required": True}
    )
    x2: float | None = field(
        default=None, metadata={"type": "Attribute", "required": True}
    )
    y2: float | None = field(
        default=None, metadata={"type": "Attribute", "required": True}
    )
|
29 |
+
|
30 |
+
|
31 |
+
@dataclass
class GraphicState:
    """Graphics state attached to a style or rectangle; every field is optional."""

    class Meta:
        # Element name, matching ``graphicState`` in il_version_1.rnc.
        name = "graphicState"

    linewidth: float | None = field(default=None, metadata={"type": "Attribute"})
    # List-valued attributes are marked ``tokens`` (a float list in the schema).
    dash: list[float] = field(
        default_factory=list,
        metadata={"type": "Attribute", "min_length": 1, "tokens": True},
    )
    flatness: float | None = field(default=None, metadata={"type": "Attribute"})
    intent: str | None = field(default=None, metadata={"type": "Attribute"})
    linecap: int | None = field(default=None, metadata={"type": "Attribute"})
    linejoin: int | None = field(default=None, metadata={"type": "Attribute"})
    miterlimit: float | None = field(default=None, metadata={"type": "Attribute"})
    # ncolor / scolor: colour components as float lists.
    # NOTE(review): presumably non-stroking / stroking colour - confirm.
    ncolor: list[float] = field(
        default_factory=list,
        metadata={"type": "Attribute", "min_length": 1, "tokens": True},
    )
    scolor: list[float] = field(
        default_factory=list,
        metadata={"type": "Attribute", "min_length": 1, "tokens": True},
    )
    stroking_color_space_name: str | None = field(
        default=None, metadata={"name": "strokingColorSpaceName", "type": "Attribute"}
    )
    non_stroking_color_space_name: str | None = field(
        default=None,
        metadata={"name": "nonStrokingColorSpaceName", "type": "Attribute"},
    )
    # Space-separated "<args> <operator>" pairs, as built by the IL creator.
    passthrough_per_char_instruction: str | None = field(
        default=None,
        metadata={"name": "passthroughPerCharInstruction", "type": "Attribute"},
    )
|
65 |
+
|
66 |
+
|
67 |
+
@dataclass
class PdfFont:
    """Metadata of one font resource.

    ``name``, ``font_id``, ``xref_id`` and ``encoding_length`` are marked
    required; the style flags and metrics are optional.
    """

    class Meta:
        # Element name, matching ``pdfFont`` in il_version_1.rnc.
        name = "pdfFont"

    name: str | None = field(
        default=None, metadata={"type": "Attribute", "required": True}
    )
    font_id: str | None = field(
        default=None, metadata={"name": "fontId", "type": "Attribute", "required": True}
    )
    xref_id: int | None = field(
        default=None, metadata={"name": "xrefId", "type": "Attribute", "required": True}
    )
    # NOTE(review): presumably the number of bytes per character code - confirm.
    encoding_length: int | None = field(
        default=None,
        metadata={"name": "encodingLength", "type": "Attribute", "required": True},
    )
    bold: bool | None = field(default=None, metadata={"type": "Attribute"})
    italic: bool | None = field(default=None, metadata={"type": "Attribute"})
    monospace: bool | None = field(default=None, metadata={"type": "Attribute"})
    serif: bool | None = field(default=None, metadata={"type": "Attribute"})
    ascent: float | None = field(default=None, metadata={"type": "Attribute"})
    descent: float | None = field(default=None, metadata={"type": "Attribute"})
|
91 |
+
|
92 |
+
|
93 |
+
@dataclass
class Cropbox:
    """Wrapper element holding the crop box of a page as a single ``Box``."""

    class Meta:
        # Element name, matching ``cropbox`` in il_version_1.rnc.
        name = "cropbox"

    box: Box | None = field(
        default=None, metadata={"type": "Element", "required": True}
    )
|
101 |
+
|
102 |
+
|
103 |
+
@dataclass
class Mediabox:
    """Wrapper element holding the media box of a page as a single ``Box``."""

    class Meta:
        # Element name, matching ``mediabox`` in il_version_1.rnc.
        name = "mediabox"

    box: Box | None = field(
        default=None, metadata={"type": "Element", "required": True}
    )
|
111 |
+
|
112 |
+
|
113 |
+
@dataclass
class PageLayout:
    """One layout region of a page: a box with an id, a confidence and a class name.

    NOTE(review): presumably produced by the layout-detection model - confirm.
    """

    class Meta:
        # Element name, matching ``pageLayout`` in il_version_1.rnc.
        name = "pageLayout"

    box: Box | None = field(
        default=None, metadata={"type": "Element", "required": True}
    )
    id: int | None = field(
        default=None, metadata={"type": "Attribute", "required": True}
    )
    conf: float | None = field(
        default=None, metadata={"type": "Attribute", "required": True}
    )
    class_name: str | None = field(
        default=None, metadata={"type": "Attribute", "required": True}
    )
|
130 |
+
|
131 |
+
|
132 |
+
@dataclass
class PdfFigure:
    """A figure on a page, described only by its bounding box."""

    class Meta:
        # Element name, matching ``pdfFigure`` in il_version_1.rnc.
        name = "pdfFigure"

    box: Box | None = field(
        default=None, metadata={"type": "Element", "required": True}
    )
|
140 |
+
|
141 |
+
|
142 |
+
@dataclass
class PdfRectangle:
    """A rectangle with its graphic state and an optional ``debug_info`` flag."""

    class Meta:
        # Element name, matching ``pdfRectangle`` in il_version_1.rnc.
        name = "pdfRectangle"

    box: Box | None = field(
        default=None, metadata={"type": "Element", "required": True}
    )
    graphic_state: GraphicState | None = field(
        default=None,
        metadata={"name": "graphicState", "type": "Element", "required": True},
    )
    debug_info: bool | None = field(default=None, metadata={"type": "Attribute"})
|
155 |
+
|
156 |
+
|
157 |
+
@dataclass
class PdfStyle:
    """Text style: font id, font size and the graphic state to draw with."""

    class Meta:
        # Element name, matching ``pdfStyle`` in il_version_1.rnc.
        name = "pdfStyle"

    graphic_state: GraphicState | None = field(
        default=None,
        metadata={"name": "graphicState", "type": "Element", "required": True},
    )
    # No explicit "name" in the metadata: the schema uses ``font_id`` /
    # ``font_size`` verbatim for these attributes.
    font_id: str | None = field(
        default=None, metadata={"type": "Attribute", "required": True}
    )
    font_size: float | None = field(
        default=None, metadata={"type": "Attribute", "required": True}
    )
|
172 |
+
|
173 |
+
|
174 |
+
@dataclass
class PdfXobject:
    """An XObject: bounding box, its own font list, base operations and ids."""

    class Meta:
        # Element name, matching ``pdfXobject`` in il_version_1.rnc.
        name = "pdfXobject"

    box: Box | None = field(
        default=None, metadata={"type": "Element", "required": True}
    )
    pdf_font: list[PdfFont] = field(
        default_factory=list, metadata={"name": "pdfFont", "type": "Element"}
    )
    base_operations: BaseOperations | None = field(
        default=None,
        metadata={"name": "baseOperations", "type": "Element", "required": True},
    )
    # xobj_id is the id allocated by the IL creator; xref_id is the PDF
    # cross-reference id.
    xobj_id: int | None = field(
        default=None, metadata={"name": "xobjId", "type": "Attribute", "required": True}
    )
    xref_id: int | None = field(
        default=None, metadata={"name": "xrefId", "type": "Attribute", "required": True}
    )
|
195 |
+
|
196 |
+
|
197 |
+
@dataclass
class PdfCharacter:
    """A single character with its style, bounding box and text.

    ``pdf_style``, ``box`` and ``char_unicode`` are marked required; the
    remaining attributes are optional.
    """

    class Meta:
        # Element name, matching ``pdfCharacter`` in il_version_1.rnc.
        name = "pdfCharacter"

    pdf_style: PdfStyle | None = field(
        default=None, metadata={"name": "pdfStyle", "type": "Element", "required": True}
    )
    box: Box | None = field(
        default=None, metadata={"type": "Element", "required": True}
    )
    vertical: bool | None = field(default=None, metadata={"type": "Attribute"})
    scale: float | None = field(default=None, metadata={"type": "Attribute"})
    pdf_character_id: int | None = field(
        default=None, metadata={"name": "pdfCharacterId", "type": "Attribute"}
    )
    char_unicode: str | None = field(
        default=None, metadata={"type": "Attribute", "required": True}
    )
    advance: float | None = field(default=None, metadata={"type": "Attribute"})
    # Id of the XObject the character belongs to, when set.
    xobj_id: int | None = field(
        default=None, metadata={"name": "xobjId", "type": "Attribute"}
    )
    debug_info: bool | None = field(default=None, metadata={"type": "Attribute"})
|
221 |
+
|
222 |
+
|
223 |
+
@dataclass
class PdfSameStyleUnicodeCharacters:
    """A run of text stored as one ``unicode`` string with an optional shared style."""

    class Meta:
        # Element name, matching ``pdfSameStyleUnicodeCharacters`` in the schema.
        name = "pdfSameStyleUnicodeCharacters"

    # Unlike the other style-carrying nodes, the style here is not required.
    pdf_style: PdfStyle | None = field(
        default=None, metadata={"name": "pdfStyle", "type": "Element"}
    )
    unicode: str | None = field(
        default=None, metadata={"type": "Attribute", "required": True}
    )
    debug_info: bool | None = field(default=None, metadata={"type": "Attribute"})
|
235 |
+
|
236 |
+
|
237 |
+
@dataclass
class PdfFormula:
    """A formula: bounding box, at least one character, and an x/y offset."""

    class Meta:
        # Element name, matching ``pdfFormula`` in il_version_1.rnc.
        name = "pdfFormula"

    box: Box | None = field(
        default=None, metadata={"type": "Element", "required": True}
    )
    # ``min_occurs: 1`` mirrors ``PDFCharacter+`` in the schema.
    pdf_character: list[PdfCharacter] = field(
        default_factory=list,
        metadata={"name": "pdfCharacter", "type": "Element", "min_occurs": 1},
    )
    x_offset: float | None = field(
        default=None, metadata={"type": "Attribute", "required": True}
    )
    y_offset: float | None = field(
        default=None, metadata={"type": "Attribute", "required": True}
    )
|
255 |
+
|
256 |
+
|
257 |
+
@dataclass
class PdfLine:
    """A line of text: bounding box plus at least one character."""

    class Meta:
        # Element name, matching ``pdfLine`` in il_version_1.rnc.
        name = "pdfLine"

    box: Box | None = field(
        default=None, metadata={"type": "Element", "required": True}
    )
    # ``min_occurs: 1`` mirrors ``PDFCharacter+`` in the schema.
    pdf_character: list[PdfCharacter] = field(
        default_factory=list,
        metadata={"name": "pdfCharacter", "type": "Element", "min_occurs": 1},
    )
|
269 |
+
|
270 |
+
|
271 |
+
@dataclass
class PdfSameStyleCharacters:
    """A run of at least one character sharing one style and bounding box."""

    class Meta:
        # Element name, matching ``pdfSameStyleCharacters`` in il_version_1.rnc.
        name = "pdfSameStyleCharacters"

    box: Box | None = field(
        default=None, metadata={"type": "Element", "required": True}
    )
    pdf_style: PdfStyle | None = field(
        default=None, metadata={"name": "pdfStyle", "type": "Element", "required": True}
    )
    # ``min_occurs: 1`` mirrors ``PDFCharacter+`` in the schema.
    pdf_character: list[PdfCharacter] = field(
        default_factory=list,
        metadata={"name": "pdfCharacter", "type": "Element", "min_occurs": 1},
    )
|
286 |
+
|
287 |
+
|
288 |
+
@dataclass
class PdfParagraphComposition:
    """One component of a paragraph.

    The schema defines this element as a choice between a line, a formula,
    same-style characters, a single character, or same-style unicode
    characters, so normally exactly one field is set. The dataclass itself
    does not enforce that.
    """

    class Meta:
        # Element name, matching ``pdfParagraphComposition`` in the schema.
        name = "pdfParagraphComposition"

    pdf_line: PdfLine | None = field(
        default=None, metadata={"name": "pdfLine", "type": "Element"}
    )
    pdf_formula: PdfFormula | None = field(
        default=None, metadata={"name": "pdfFormula", "type": "Element"}
    )
    pdf_same_style_characters: PdfSameStyleCharacters | None = field(
        default=None, metadata={"name": "pdfSameStyleCharacters", "type": "Element"}
    )
    pdf_character: PdfCharacter | None = field(
        default=None, metadata={"name": "pdfCharacter", "type": "Element"}
    )
    pdf_same_style_unicode_characters: PdfSameStyleUnicodeCharacters | None = field(
        default=None,
        metadata={"name": "pdfSameStyleUnicodeCharacters", "type": "Element"},
    )
|
309 |
+
|
310 |
+
|
311 |
+
@dataclass
class PdfParagraph:
    """A paragraph: box, style, its compositions and the paragraph text.

    ``box``, ``pdf_style`` and ``unicode`` are marked required; the other
    attributes are optional.
    """

    class Meta:
        # Element name, matching ``pdfParagraph`` in il_version_1.rnc.
        name = "pdfParagraph"

    box: Box | None = field(
        default=None, metadata={"type": "Element", "required": True}
    )
    pdf_style: PdfStyle | None = field(
        default=None, metadata={"name": "pdfStyle", "type": "Element", "required": True}
    )
    pdf_paragraph_composition: list[PdfParagraphComposition] = field(
        default_factory=list,
        metadata={"name": "pdfParagraphComposition", "type": "Element"},
    )
    xobj_id: int | None = field(
        default=None, metadata={"name": "xobjId", "type": "Attribute"}
    )
    unicode: str | None = field(
        default=None, metadata={"type": "Attribute", "required": True}
    )
    scale: float | None = field(default=None, metadata={"type": "Attribute"})
    vertical: bool | None = field(default=None, metadata={"type": "Attribute"})
    # Serialised name is capitalised ("FirstLineIndent"), unlike its siblings.
    first_line_indent: bool | None = field(
        default=None, metadata={"name": "FirstLineIndent", "type": "Attribute"}
    )
    debug_id: str | None = field(default=None, metadata={"type": "Attribute"})
|
338 |
+
|
339 |
+
|
340 |
+
@dataclass
class Page:
    """One page of the document and everything extracted from it.

    Holds the page boxes, the lists of XObjects, layouts, rectangles, fonts,
    paragraphs, figures and characters, plus the base operations, the page
    number and the unit.
    """

    class Meta:
        # Element name, matching ``page`` in il_version_1.rnc.
        name = "page"

    mediabox: Mediabox | None = field(
        default=None, metadata={"type": "Element", "required": True}
    )
    cropbox: Cropbox | None = field(
        default=None, metadata={"type": "Element", "required": True}
    )
    pdf_xobject: list[PdfXobject] = field(
        default_factory=list, metadata={"name": "pdfXobject", "type": "Element"}
    )
    page_layout: list[PageLayout] = field(
        default_factory=list, metadata={"name": "pageLayout", "type": "Element"}
    )
    pdf_rectangle: list[PdfRectangle] = field(
        default_factory=list, metadata={"name": "pdfRectangle", "type": "Element"}
    )
    pdf_font: list[PdfFont] = field(
        default_factory=list, metadata={"name": "pdfFont", "type": "Element"}
    )
    pdf_paragraph: list[PdfParagraph] = field(
        default_factory=list, metadata={"name": "pdfParagraph", "type": "Element"}
    )
    pdf_figure: list[PdfFigure] = field(
        default_factory=list, metadata={"name": "pdfFigure", "type": "Element"}
    )
    pdf_character: list[PdfCharacter] = field(
        default_factory=list, metadata={"name": "pdfCharacter", "type": "Element"}
    )
    base_operations: BaseOperations | None = field(
        default=None,
        metadata={"name": "baseOperations", "type": "Element", "required": True},
    )
    # The IL creator asserts this is >= 0 and adds 1 before asking whether
    # the page should be translated, i.e. it is stored 0-based.
    page_number: int | None = field(
        default=None,
        metadata={"name": "pageNumber", "type": "Attribute", "required": True},
    )
    # Serialised as "Unit"; the IL creator always sets it to "point".
    unit: str | None = field(
        default=None, metadata={"name": "Unit", "type": "Attribute", "required": True}
    )
|
383 |
+
|
384 |
+
|
385 |
+
@dataclass
class Document:
    """Root of the intermediate representation: the pages plus the total page count."""

    class Meta:
        # Element name, matching ``document`` in il_version_1.rnc.
        name = "document"

    # ``min_occurs: 1`` mirrors ``Page+`` in the schema.
    page: list[Page] = field(
        default_factory=list, metadata={"type": "Element", "min_occurs": 1}
    )
    total_pages: int | None = field(
        default=None,
        metadata={"name": "totalPages", "type": "Attribute", "required": True},
    )
|
src/pdf2u/document_il/il_version_1.rnc
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# RELAX NG compact schema for the intermediate language, version 1.
# Element and attribute names correspond to the Meta.name / metadata "name"
# values of the dataclasses in il_version_1.py.
start = Document
# Root element: one or more pages plus the total page count.
Document =
  element document {
    Page+,
    attribute totalPages { xsd:int }
  }
# A page: its boxes, the extracted content lists, and the base operations.
Page =
  element page {
    element mediabox { Box },
    element cropbox { Box },
    PDFXobject*,
    PageLayout*,
    PDFRectangle*,
    PDFFont*,
    PDFParagraph*,
    PDFFigure*,
    PDFCharacter*,
    attribute pageNumber { xsd:int },
    attribute Unit { xsd:string },
    element baseOperations { xsd:string }
  }
Box =
  element box {
    # from (x,y) to (x2,y2)
    attribute x { xsd:float },
    attribute y { xsd:float },
    attribute x2 { xsd:float },
    attribute y2 { xsd:float }
  }
# Shared datatype for PDF cross-reference ids.
PDFXrefId = xsd:int
# Font metadata; style flags and metrics are optional.
PDFFont =
  element pdfFont {
    attribute name { xsd:string },
    attribute fontId { xsd:string },
    attribute xrefId { PDFXrefId },
    attribute encodingLength { xsd:int },
    attribute bold { xsd:boolean }?,
    attribute italic { xsd:boolean }?,
    attribute monospace { xsd:boolean }?,
    attribute serif { xsd:boolean }?,
    attribute ascent { xsd:float }?,
    attribute descent { xsd:float }?
  }
# An XObject with its own box, fonts and base operations.
PDFXobject =
  element pdfXobject {
    attribute xobjId { xsd:int },
    attribute xrefId { PDFXrefId },
    Box,
    PDFFont*,
    element baseOperations { xsd:string }
  }
# A single character; only char_unicode, the style and the box are required.
PDFCharacter =
  element pdfCharacter {
    attribute vertical { xsd:boolean }?,
    attribute scale { xsd:float }?,
    attribute pdfCharacterId { xsd:int }?,
    attribute char_unicode { xsd:string },
    attribute advance { xsd:float }?,
    # xobject nesting depth
    attribute xobjId { xsd:int }?,
    attribute debug_info { xsd:boolean }?,
    PDFStyle,
    Box
  }
# A layout region: id, confidence, class name and box.
PageLayout =
  element pageLayout {
    attribute id { xsd:int },
    attribute conf { xsd:float },
    attribute class_name { xsd:string },
    Box
  }
# Graphics state; every attribute is optional.
GraphicState =
  element graphicState {
    attribute linewidth { xsd:float }?,
    attribute dash {
      list { xsd:float+ }
    }?,
    attribute flatness { xsd:float }?,
    attribute intent { xsd:string }?,
    attribute linecap { xsd:int }?,
    attribute linejoin { xsd:int }?,
    attribute miterlimit { xsd:float }?,
    attribute ncolor {
      list { xsd:float+ }
    }?,
    attribute scolor {
      list { xsd:float+ }
    }?,
    attribute strokingColorSpaceName { xsd:string }?,
    attribute nonStrokingColorSpaceName { xsd:string }?,
    attribute passthroughPerCharInstruction { xsd:string }?
  }
# Text style: font id, font size and graphic state.
PDFStyle =
  element pdfStyle {
    attribute font_id { xsd:string },
    attribute font_size { xsd:float },
    GraphicState
  }
# A paragraph: text, box, style and zero or more compositions.
PDFParagraph =
  element pdfParagraph {
    attribute xobjId { xsd:int }?,
    attribute unicode { xsd:string },
    attribute scale { xsd:float }?,
    attribute vertical { xsd:boolean }?,
    attribute FirstLineIndent { xsd:boolean }?,
    attribute debug_id { xsd:string }?,
    Box,
    PDFStyle,
    PDFParagraphComposition*
  }
# Exactly one of the alternatives below per composition element.
PDFParagraphComposition =
  element pdfParagraphComposition {
    PDFLine
    | PDFFormula
    | PDFSameStyleCharacters
    | PDFCharacter
    | PDFSameStyleUnicodeCharacters
  }
PDFLine = element pdfLine { Box, PDFCharacter+ }
PDFSameStyleCharacters =
  element pdfSameStyleCharacters { Box, PDFStyle, PDFCharacter+ }
# Text stored as a string; the style is optional here.
PDFSameStyleUnicodeCharacters =
  element pdfSameStyleUnicodeCharacters {
    PDFStyle?,
    attribute unicode { xsd:string },
    attribute debug_info { xsd:boolean }?
  }
PDFFormula =
  element pdfFormula {
    Box,
    PDFCharacter+,
    attribute x_offset { xsd:float },
    attribute y_offset { xsd:float }
  }
PDFFigure = element pdfFigure { Box }
PDFRectangle =
  element pdfRectangle {
    Box,
    GraphicState,
    attribute debug_info { xsd:boolean }?
  }
|
src/pdf2u/document_il/il_version_1.rng
ADDED
@@ -0,0 +1,390 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<grammar xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
|
3 |
+
<start>
|
4 |
+
<ref name="Document"/>
|
5 |
+
</start>
|
6 |
+
<define name="Document">
|
7 |
+
<element name="document">
|
8 |
+
<oneOrMore>
|
9 |
+
<ref name="Page"/>
|
10 |
+
</oneOrMore>
|
11 |
+
<attribute name="totalPages">
|
12 |
+
<data type="int"/>
|
13 |
+
</attribute>
|
14 |
+
</element>
|
15 |
+
</define>
|
16 |
+
<define name="Page">
|
17 |
+
<element name="page">
|
18 |
+
<element name="mediabox">
|
19 |
+
<ref name="Box"/>
|
20 |
+
</element>
|
21 |
+
<element name="cropbox">
|
22 |
+
<ref name="Box"/>
|
23 |
+
</element>
|
24 |
+
<zeroOrMore>
|
25 |
+
<ref name="PDFXobject"/>
|
26 |
+
</zeroOrMore>
|
27 |
+
<zeroOrMore>
|
28 |
+
<ref name="PageLayout"/>
|
29 |
+
</zeroOrMore>
|
30 |
+
<zeroOrMore>
|
31 |
+
<ref name="PDFRectangle"/>
|
32 |
+
</zeroOrMore>
|
33 |
+
<zeroOrMore>
|
34 |
+
<ref name="PDFFont"/>
|
35 |
+
</zeroOrMore>
|
36 |
+
<zeroOrMore>
|
37 |
+
<ref name="PDFParagraph"/>
|
38 |
+
</zeroOrMore>
|
39 |
+
<zeroOrMore>
|
40 |
+
<ref name="PDFFigure"/>
|
41 |
+
</zeroOrMore>
|
42 |
+
<zeroOrMore>
|
43 |
+
<ref name="PDFCharacter"/>
|
44 |
+
</zeroOrMore>
|
45 |
+
<attribute name="pageNumber">
|
46 |
+
<data type="int"/>
|
47 |
+
</attribute>
|
48 |
+
<attribute name="Unit">
|
49 |
+
<data type="string"/>
|
50 |
+
</attribute>
|
51 |
+
<element name="baseOperations">
|
52 |
+
<data type="string"/>
|
53 |
+
</element>
|
54 |
+
</element>
|
55 |
+
</define>
|
56 |
+
<define name="Box">
|
57 |
+
<element name="box">
|
58 |
+
<!-- from (x,y) to (x2,y2) -->
|
59 |
+
<attribute name="x">
|
60 |
+
<data type="float"/>
|
61 |
+
</attribute>
|
62 |
+
<attribute name="y">
|
63 |
+
<data type="float"/>
|
64 |
+
</attribute>
|
65 |
+
<attribute name="x2">
|
66 |
+
<data type="float"/>
|
67 |
+
</attribute>
|
68 |
+
<attribute name="y2">
|
69 |
+
<data type="float"/>
|
70 |
+
</attribute>
|
71 |
+
</element>
|
72 |
+
</define>
|
73 |
+
<define name="PDFXrefId">
|
74 |
+
<data type="int"/>
|
75 |
+
</define>
|
76 |
+
<define name="PDFFont">
|
77 |
+
<element name="pdfFont">
|
78 |
+
<attribute name="name">
|
79 |
+
<data type="string"/>
|
80 |
+
</attribute>
|
81 |
+
<attribute name="fontId">
|
82 |
+
<data type="string"/>
|
83 |
+
</attribute>
|
84 |
+
<attribute name="xrefId">
|
85 |
+
<ref name="PDFXrefId"/>
|
86 |
+
</attribute>
|
87 |
+
<attribute name="encodingLength">
|
88 |
+
<data type="int"/>
|
89 |
+
</attribute>
|
90 |
+
<optional>
|
91 |
+
<attribute name="bold">
|
92 |
+
<data type="boolean"/>
|
93 |
+
</attribute>
|
94 |
+
</optional>
|
95 |
+
<optional>
|
96 |
+
<attribute name="italic">
|
97 |
+
<data type="boolean"/>
|
98 |
+
</attribute>
|
99 |
+
</optional>
|
100 |
+
<optional>
|
101 |
+
<attribute name="monospace">
|
102 |
+
<data type="boolean"/>
|
103 |
+
</attribute>
|
104 |
+
</optional>
|
105 |
+
<optional>
|
106 |
+
<attribute name="serif">
|
107 |
+
<data type="boolean"/>
|
108 |
+
</attribute>
|
109 |
+
</optional>
|
110 |
+
<optional>
|
111 |
+
<attribute name="ascent">
|
112 |
+
<data type="float"/>
|
113 |
+
</attribute>
|
114 |
+
</optional>
|
115 |
+
<optional>
|
116 |
+
<attribute name="descent">
|
117 |
+
<data type="float"/>
|
118 |
+
</attribute>
|
119 |
+
</optional>
|
120 |
+
</element>
|
121 |
+
</define>
|
122 |
+
<define name="PDFXobject">
|
123 |
+
<element name="pdfXobject">
|
124 |
+
<attribute name="xobjId">
|
125 |
+
<data type="int"/>
|
126 |
+
</attribute>
|
127 |
+
<attribute name="xrefId">
|
128 |
+
<ref name="PDFXrefId"/>
|
129 |
+
</attribute>
|
130 |
+
<ref name="Box"/>
|
131 |
+
<zeroOrMore>
|
132 |
+
<ref name="PDFFont"/>
|
133 |
+
</zeroOrMore>
|
134 |
+
<element name="baseOperations">
|
135 |
+
<data type="string"/>
|
136 |
+
</element>
|
137 |
+
</element>
|
138 |
+
</define>
|
139 |
+
<define name="PDFCharacter">
|
140 |
+
<element name="pdfCharacter">
|
141 |
+
<optional>
|
142 |
+
<attribute name="vertical">
|
143 |
+
<data type="boolean"/>
|
144 |
+
</attribute>
|
145 |
+
</optional>
|
146 |
+
<optional>
|
147 |
+
<attribute name="scale">
|
148 |
+
<data type="float"/>
|
149 |
+
</attribute>
|
150 |
+
</optional>
|
151 |
+
<optional>
|
152 |
+
<attribute name="pdfCharacterId">
|
153 |
+
<data type="int"/>
|
154 |
+
</attribute>
|
155 |
+
</optional>
|
156 |
+
<attribute name="char_unicode">
|
157 |
+
<data type="string"/>
|
158 |
+
</attribute>
|
159 |
+
<optional>
|
160 |
+
<attribute name="advance">
|
161 |
+
<data type="float"/>
|
162 |
+
</attribute>
|
163 |
+
</optional>
|
164 |
+
<optional>
|
165 |
+
<!-- xobject nesting depth -->
|
166 |
+
<attribute name="xobjId">
|
167 |
+
<data type="int"/>
|
168 |
+
</attribute>
|
169 |
+
</optional>
|
170 |
+
<optional>
|
171 |
+
<attribute name="debug_info">
|
172 |
+
<data type="boolean"/>
|
173 |
+
</attribute>
|
174 |
+
</optional>
|
175 |
+
<ref name="PDFStyle"/>
|
176 |
+
<ref name="Box"/>
|
177 |
+
</element>
|
178 |
+
</define>
|
179 |
+
<define name="PageLayout">
|
180 |
+
<element name="pageLayout">
|
181 |
+
<attribute name="id">
|
182 |
+
<data type="int"/>
|
183 |
+
</attribute>
|
184 |
+
<attribute name="conf">
|
185 |
+
<data type="float"/>
|
186 |
+
</attribute>
|
187 |
+
<attribute name="class_name">
|
188 |
+
<data type="string"/>
|
189 |
+
</attribute>
|
190 |
+
<ref name="Box"/>
|
191 |
+
</element>
|
192 |
+
</define>
|
193 |
+
<define name="GraphicState">
|
194 |
+
<element name="graphicState">
|
195 |
+
<optional>
|
196 |
+
<attribute name="linewidth">
|
197 |
+
<data type="float"/>
|
198 |
+
</attribute>
|
199 |
+
</optional>
|
200 |
+
<optional>
|
201 |
+
<attribute name="dash">
|
202 |
+
<list>
|
203 |
+
<oneOrMore>
|
204 |
+
<data type="float"/>
|
205 |
+
</oneOrMore>
|
206 |
+
</list>
|
207 |
+
</attribute>
|
208 |
+
</optional>
|
209 |
+
<optional>
|
210 |
+
<attribute name="flatness">
|
211 |
+
<data type="float"/>
|
212 |
+
</attribute>
|
213 |
+
</optional>
|
214 |
+
<optional>
|
215 |
+
<attribute name="intent">
|
216 |
+
<data type="string"/>
|
217 |
+
</attribute>
|
218 |
+
</optional>
|
219 |
+
<optional>
|
220 |
+
<attribute name="linecap">
|
221 |
+
<data type="int"/>
|
222 |
+
</attribute>
|
223 |
+
</optional>
|
224 |
+
<optional>
|
225 |
+
<attribute name="linejoin">
|
226 |
+
<data type="int"/>
|
227 |
+
</attribute>
|
228 |
+
</optional>
|
229 |
+
<optional>
|
230 |
+
<attribute name="miterlimit">
|
231 |
+
<data type="float"/>
|
232 |
+
</attribute>
|
233 |
+
</optional>
|
234 |
+
<optional>
|
235 |
+
<attribute name="ncolor">
|
236 |
+
<list>
|
237 |
+
<oneOrMore>
|
238 |
+
<data type="float"/>
|
239 |
+
</oneOrMore>
|
240 |
+
</list>
|
241 |
+
</attribute>
|
242 |
+
</optional>
|
243 |
+
<optional>
|
244 |
+
<attribute name="scolor">
|
245 |
+
<list>
|
246 |
+
<oneOrMore>
|
247 |
+
<data type="float"/>
|
248 |
+
</oneOrMore>
|
249 |
+
</list>
|
250 |
+
</attribute>
|
251 |
+
</optional>
|
252 |
+
<optional>
|
253 |
+
<attribute name="strokingColorSpaceName">
|
254 |
+
<data type="string"/>
|
255 |
+
</attribute>
|
256 |
+
</optional>
|
257 |
+
<optional>
|
258 |
+
<attribute name="nonStrokingColorSpaceName">
|
259 |
+
<data type="string"/>
|
260 |
+
</attribute>
|
261 |
+
</optional>
|
262 |
+
<optional>
|
263 |
+
<attribute name="passthroughPerCharInstruction">
|
264 |
+
<data type="string"/>
|
265 |
+
</attribute>
|
266 |
+
</optional>
|
267 |
+
</element>
|
268 |
+
</define>
|
269 |
+
<define name="PDFStyle">
|
270 |
+
<element name="pdfStyle">
|
271 |
+
<attribute name="font_id">
|
272 |
+
<data type="string"/>
|
273 |
+
</attribute>
|
274 |
+
<attribute name="font_size">
|
275 |
+
<data type="float"/>
|
276 |
+
</attribute>
|
277 |
+
<ref name="GraphicState"/>
|
278 |
+
</element>
|
279 |
+
</define>
|
280 |
+
<define name="PDFParagraph">
|
281 |
+
<element name="pdfParagraph">
|
282 |
+
<optional>
|
283 |
+
<attribute name="xobjId">
|
284 |
+
<data type="int"/>
|
285 |
+
</attribute>
|
286 |
+
</optional>
|
287 |
+
<attribute name="unicode">
|
288 |
+
<data type="string"/>
|
289 |
+
</attribute>
|
290 |
+
<optional>
|
291 |
+
<attribute name="scale">
|
292 |
+
<data type="float"/>
|
293 |
+
</attribute>
|
294 |
+
</optional>
|
295 |
+
<optional>
|
296 |
+
<attribute name="vertical">
|
297 |
+
<data type="boolean"/>
|
298 |
+
</attribute>
|
299 |
+
</optional>
|
300 |
+
<optional>
|
301 |
+
<attribute name="FirstLineIndent">
|
302 |
+
<data type="boolean"/>
|
303 |
+
</attribute>
|
304 |
+
</optional>
|
305 |
+
<optional>
|
306 |
+
<attribute name="debug_id">
|
307 |
+
<data type="string"/>
|
308 |
+
</attribute>
|
309 |
+
</optional>
|
310 |
+
<ref name="Box"/>
|
311 |
+
<ref name="PDFStyle"/>
|
312 |
+
<zeroOrMore>
|
313 |
+
<ref name="PDFParagraphComposition"/>
|
314 |
+
</zeroOrMore>
|
315 |
+
</element>
|
316 |
+
</define>
|
317 |
+
<define name="PDFParagraphComposition">
|
318 |
+
<element name="pdfParagraphComposition">
|
319 |
+
<choice>
|
320 |
+
<ref name="PDFLine"/>
|
321 |
+
<ref name="PDFFormula"/>
|
322 |
+
<ref name="PDFSameStyleCharacters"/>
|
323 |
+
<ref name="PDFCharacter"/>
|
324 |
+
<ref name="PDFSameStyleUnicodeCharacters"/>
|
325 |
+
</choice>
|
326 |
+
</element>
|
327 |
+
</define>
|
328 |
+
<define name="PDFLine">
|
329 |
+
<element name="pdfLine">
|
330 |
+
<ref name="Box"/>
|
331 |
+
<oneOrMore>
|
332 |
+
<ref name="PDFCharacter"/>
|
333 |
+
</oneOrMore>
|
334 |
+
</element>
|
335 |
+
</define>
|
336 |
+
<define name="PDFSameStyleCharacters">
|
337 |
+
<element name="pdfSameStyleCharacters">
|
338 |
+
<ref name="Box"/>
|
339 |
+
<ref name="PDFStyle"/>
|
340 |
+
<oneOrMore>
|
341 |
+
<ref name="PDFCharacter"/>
|
342 |
+
</oneOrMore>
|
343 |
+
</element>
|
344 |
+
</define>
|
345 |
+
<define name="PDFSameStyleUnicodeCharacters">
|
346 |
+
<element name="pdfSameStyleUnicodeCharacters">
|
347 |
+
<optional>
|
348 |
+
<ref name="PDFStyle"/>
|
349 |
+
</optional>
|
350 |
+
<attribute name="unicode">
|
351 |
+
<data type="string"/>
|
352 |
+
</attribute>
|
353 |
+
<optional>
|
354 |
+
<attribute name="debug_info">
|
355 |
+
<data type="boolean"/>
|
356 |
+
</attribute>
|
357 |
+
</optional>
|
358 |
+
</element>
|
359 |
+
</define>
|
360 |
+
<define name="PDFFormula">
|
361 |
+
<element name="pdfFormula">
|
362 |
+
<ref name="Box"/>
|
363 |
+
<oneOrMore>
|
364 |
+
<ref name="PDFCharacter"/>
|
365 |
+
</oneOrMore>
|
366 |
+
<attribute name="x_offset">
|
367 |
+
<data type="float"/>
|
368 |
+
</attribute>
|
369 |
+
<attribute name="y_offset">
|
370 |
+
<data type="float"/>
|
371 |
+
</attribute>
|
372 |
+
</element>
|
373 |
+
</define>
|
374 |
+
<define name="PDFFigure">
|
375 |
+
<element name="pdfFigure">
|
376 |
+
<ref name="Box"/>
|
377 |
+
</element>
|
378 |
+
</define>
|
379 |
+
<define name="PDFRectangle">
|
380 |
+
<element name="pdfRectangle">
|
381 |
+
<ref name="Box"/>
|
382 |
+
<ref name="GraphicState"/>
|
383 |
+
<optional>
|
384 |
+
<attribute name="debug_info">
|
385 |
+
<data type="boolean"/>
|
386 |
+
</attribute>
|
387 |
+
</optional>
|
388 |
+
</element>
|
389 |
+
</define>
|
390 |
+
</grammar>
|
src/pdf2u/document_il/il_version_1.xsd
ADDED
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified">
|
3 |
+
<xs:element name="document">
|
4 |
+
<xs:complexType>
|
5 |
+
<xs:sequence>
|
6 |
+
<xs:element maxOccurs="unbounded" ref="page"/>
|
7 |
+
</xs:sequence>
|
8 |
+
<xs:attribute name="totalPages" use="required" type="xs:int"/>
|
9 |
+
</xs:complexType>
|
10 |
+
</xs:element>
|
11 |
+
<xs:element name="page">
|
12 |
+
<xs:complexType>
|
13 |
+
<xs:sequence>
|
14 |
+
<xs:element ref="mediabox"/>
|
15 |
+
<xs:element ref="cropbox"/>
|
16 |
+
<xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfXobject"/>
|
17 |
+
<xs:element minOccurs="0" maxOccurs="unbounded" ref="pageLayout"/>
|
18 |
+
<xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfRectangle"/>
|
19 |
+
<xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFont"/>
|
20 |
+
<xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfParagraph"/>
|
21 |
+
<xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFigure"/>
|
22 |
+
<xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfCharacter"/>
|
23 |
+
<xs:element ref="baseOperations"/>
|
24 |
+
</xs:sequence>
|
25 |
+
<xs:attribute name="pageNumber" use="required" type="xs:int"/>
|
26 |
+
<xs:attribute name="Unit" use="required" type="xs:string"/>
|
27 |
+
</xs:complexType>
|
28 |
+
</xs:element>
|
29 |
+
<xs:element name="mediabox">
|
30 |
+
<xs:complexType>
|
31 |
+
<xs:sequence>
|
32 |
+
<xs:element ref="box"/>
|
33 |
+
</xs:sequence>
|
34 |
+
</xs:complexType>
|
35 |
+
</xs:element>
|
36 |
+
<xs:element name="cropbox">
|
37 |
+
<xs:complexType>
|
38 |
+
<xs:sequence>
|
39 |
+
<xs:element ref="box"/>
|
40 |
+
</xs:sequence>
|
41 |
+
</xs:complexType>
|
42 |
+
</xs:element>
|
43 |
+
<xs:element name="baseOperations" type="xs:string"/>
|
44 |
+
<xs:element name="box">
|
45 |
+
<xs:complexType>
|
46 |
+
<xs:attribute name="x" use="required" type="xs:float"/>
|
47 |
+
<xs:attribute name="y" use="required" type="xs:float"/>
|
48 |
+
<xs:attribute name="x2" use="required" type="xs:float"/>
|
49 |
+
<xs:attribute name="y2" use="required" type="xs:float"/>
|
50 |
+
</xs:complexType>
|
51 |
+
</xs:element>
|
52 |
+
<xs:simpleType name="PDFXrefId">
|
53 |
+
<xs:restriction base="xs:int"/>
|
54 |
+
</xs:simpleType>
|
55 |
+
<xs:element name="pdfFont">
|
56 |
+
<xs:complexType>
|
57 |
+
<xs:attribute name="name" use="required" type="xs:string"/>
|
58 |
+
<xs:attribute name="fontId" use="required" type="xs:string"/>
|
59 |
+
<xs:attribute name="xrefId" use="required" type="PDFXrefId"/>
|
60 |
+
<xs:attribute name="encodingLength" use="required" type="xs:int"/>
|
61 |
+
<xs:attribute name="bold" type="xs:boolean"/>
|
62 |
+
<xs:attribute name="italic" type="xs:boolean"/>
|
63 |
+
<xs:attribute name="monospace" type="xs:boolean"/>
|
64 |
+
<xs:attribute name="serif" type="xs:boolean"/>
|
65 |
+
<xs:attribute name="ascent" type="xs:float"/>
|
66 |
+
<xs:attribute name="descent" type="xs:float"/>
|
67 |
+
</xs:complexType>
|
68 |
+
</xs:element>
|
69 |
+
<xs:element name="pdfXobject">
|
70 |
+
<xs:complexType>
|
71 |
+
<xs:sequence>
|
72 |
+
<xs:element ref="box"/>
|
73 |
+
<xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFont"/>
|
74 |
+
<xs:element ref="baseOperations"/>
|
75 |
+
</xs:sequence>
|
76 |
+
<xs:attribute name="xobjId" use="required" type="xs:int"/>
|
77 |
+
<xs:attribute name="xrefId" use="required" type="PDFXrefId"/>
|
78 |
+
</xs:complexType>
|
79 |
+
</xs:element>
|
80 |
+
<xs:element name="pdfCharacter">
|
81 |
+
<xs:complexType>
|
82 |
+
<xs:sequence>
|
83 |
+
<xs:element ref="pdfStyle"/>
|
84 |
+
<xs:element ref="box"/>
|
85 |
+
</xs:sequence>
|
86 |
+
<xs:attribute name="vertical" type="xs:boolean"/>
|
87 |
+
<xs:attribute name="scale" type="xs:float"/>
|
88 |
+
<xs:attribute name="pdfCharacterId" type="xs:int"/>
|
89 |
+
<xs:attribute name="char_unicode" use="required" type="xs:string"/>
|
90 |
+
<xs:attribute name="advance" type="xs:float"/>
|
91 |
+
<xs:attribute name="xobjId" type="xs:int"/>
|
92 |
+
<xs:attribute name="debug_info" type="xs:boolean"/>
|
93 |
+
</xs:complexType>
|
94 |
+
</xs:element>
|
95 |
+
<xs:element name="pageLayout">
|
96 |
+
<xs:complexType>
|
97 |
+
<xs:sequence>
|
98 |
+
<xs:element ref="box"/>
|
99 |
+
</xs:sequence>
|
100 |
+
<xs:attribute name="id" use="required" type="xs:int"/>
|
101 |
+
<xs:attribute name="conf" use="required" type="xs:float"/>
|
102 |
+
<xs:attribute name="class_name" use="required" type="xs:string"/>
|
103 |
+
</xs:complexType>
|
104 |
+
</xs:element>
|
105 |
+
<xs:element name="graphicState">
|
106 |
+
<xs:complexType>
|
107 |
+
<xs:attribute name="linewidth" type="xs:float"/>
|
108 |
+
<xs:attribute name="dash">
|
109 |
+
<xs:simpleType>
|
110 |
+
<xs:restriction>
|
111 |
+
<xs:simpleType>
|
112 |
+
<xs:list itemType="xs:float"/>
|
113 |
+
</xs:simpleType>
|
114 |
+
<xs:minLength value="1"/>
|
115 |
+
</xs:restriction>
|
116 |
+
</xs:simpleType>
|
117 |
+
</xs:attribute>
|
118 |
+
<xs:attribute name="flatness" type="xs:float"/>
|
119 |
+
<xs:attribute name="intent" type="xs:string"/>
|
120 |
+
<xs:attribute name="linecap" type="xs:int"/>
|
121 |
+
<xs:attribute name="linejoin" type="xs:int"/>
|
122 |
+
<xs:attribute name="miterlimit" type="xs:float"/>
|
123 |
+
<xs:attribute name="ncolor">
|
124 |
+
<xs:simpleType>
|
125 |
+
<xs:restriction>
|
126 |
+
<xs:simpleType>
|
127 |
+
<xs:list itemType="xs:float"/>
|
128 |
+
</xs:simpleType>
|
129 |
+
<xs:minLength value="1"/>
|
130 |
+
</xs:restriction>
|
131 |
+
</xs:simpleType>
|
132 |
+
</xs:attribute>
|
133 |
+
<xs:attribute name="scolor">
|
134 |
+
<xs:simpleType>
|
135 |
+
<xs:restriction>
|
136 |
+
<xs:simpleType>
|
137 |
+
<xs:list itemType="xs:float"/>
|
138 |
+
</xs:simpleType>
|
139 |
+
<xs:minLength value="1"/>
|
140 |
+
</xs:restriction>
|
141 |
+
</xs:simpleType>
|
142 |
+
</xs:attribute>
|
143 |
+
<xs:attribute name="strokingColorSpaceName" type="xs:string"/>
|
144 |
+
<xs:attribute name="nonStrokingColorSpaceName" type="xs:string"/>
|
145 |
+
<xs:attribute name="passthroughPerCharInstruction" type="xs:string"/>
|
146 |
+
</xs:complexType>
|
147 |
+
</xs:element>
|
148 |
+
<xs:element name="pdfStyle">
|
149 |
+
<xs:complexType>
|
150 |
+
<xs:sequence>
|
151 |
+
<xs:element ref="graphicState"/>
|
152 |
+
</xs:sequence>
|
153 |
+
<xs:attribute name="font_id" use="required" type="xs:string"/>
|
154 |
+
<xs:attribute name="font_size" use="required" type="xs:float"/>
|
155 |
+
</xs:complexType>
|
156 |
+
</xs:element>
|
157 |
+
<xs:element name="pdfParagraph">
|
158 |
+
<xs:complexType>
|
159 |
+
<xs:sequence>
|
160 |
+
<xs:element ref="box"/>
|
161 |
+
<xs:element ref="pdfStyle"/>
|
162 |
+
<xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfParagraphComposition"/>
|
163 |
+
</xs:sequence>
|
164 |
+
<xs:attribute name="xobjId" type="xs:int"/>
|
165 |
+
<xs:attribute name="unicode" use="required" type="xs:string"/>
|
166 |
+
<xs:attribute name="scale" type="xs:float"/>
|
167 |
+
<xs:attribute name="vertical" type="xs:boolean"/>
|
168 |
+
<xs:attribute name="FirstLineIndent" type="xs:boolean"/>
|
169 |
+
<xs:attribute name="debug_id" type="xs:string"/>
|
170 |
+
</xs:complexType>
|
171 |
+
</xs:element>
|
172 |
+
<xs:element name="pdfParagraphComposition">
|
173 |
+
<xs:complexType>
|
174 |
+
<xs:choice>
|
175 |
+
<xs:element ref="pdfLine"/>
|
176 |
+
<xs:element ref="pdfFormula"/>
|
177 |
+
<xs:element ref="pdfSameStyleCharacters"/>
|
178 |
+
<xs:element ref="pdfCharacter"/>
|
179 |
+
<xs:element ref="pdfSameStyleUnicodeCharacters"/>
|
180 |
+
</xs:choice>
|
181 |
+
</xs:complexType>
|
182 |
+
</xs:element>
|
183 |
+
<xs:element name="pdfLine">
|
184 |
+
<xs:complexType>
|
185 |
+
<xs:sequence>
|
186 |
+
<xs:element ref="box"/>
|
187 |
+
<xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
|
188 |
+
</xs:sequence>
|
189 |
+
</xs:complexType>
|
190 |
+
</xs:element>
|
191 |
+
<xs:element name="pdfSameStyleCharacters">
|
192 |
+
<xs:complexType>
|
193 |
+
<xs:sequence>
|
194 |
+
<xs:element ref="box"/>
|
195 |
+
<xs:element ref="pdfStyle"/>
|
196 |
+
<xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
|
197 |
+
</xs:sequence>
|
198 |
+
</xs:complexType>
|
199 |
+
</xs:element>
|
200 |
+
<xs:element name="pdfSameStyleUnicodeCharacters">
|
201 |
+
<xs:complexType>
|
202 |
+
<xs:sequence>
|
203 |
+
<xs:element minOccurs="0" ref="pdfStyle"/>
|
204 |
+
</xs:sequence>
|
205 |
+
<xs:attribute name="unicode" use="required" type="xs:string"/>
|
206 |
+
<xs:attribute name="debug_info" type="xs:boolean"/>
|
207 |
+
</xs:complexType>
|
208 |
+
</xs:element>
|
209 |
+
<xs:element name="pdfFormula">
|
210 |
+
<xs:complexType>
|
211 |
+
<xs:sequence>
|
212 |
+
<xs:element ref="box"/>
|
213 |
+
<xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
|
214 |
+
</xs:sequence>
|
215 |
+
<xs:attribute name="x_offset" use="required" type="xs:float"/>
|
216 |
+
<xs:attribute name="y_offset" use="required" type="xs:float"/>
|
217 |
+
</xs:complexType>
|
218 |
+
</xs:element>
|
219 |
+
<xs:element name="pdfFigure">
|
220 |
+
<xs:complexType>
|
221 |
+
<xs:sequence>
|
222 |
+
<xs:element ref="box"/>
|
223 |
+
</xs:sequence>
|
224 |
+
</xs:complexType>
|
225 |
+
</xs:element>
|
226 |
+
<xs:element name="pdfRectangle">
|
227 |
+
<xs:complexType>
|
228 |
+
<xs:sequence>
|
229 |
+
<xs:element ref="box"/>
|
230 |
+
<xs:element ref="graphicState"/>
|
231 |
+
</xs:sequence>
|
232 |
+
<xs:attribute name="debug_info" type="xs:boolean"/>
|
233 |
+
</xs:complexType>
|
234 |
+
</xs:element>
|
235 |
+
</xs:schema>
|
src/pdf2u/document_il/midend/__init__.py
ADDED
File without changes
|
src/pdf2u/document_il/midend/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (180 Bytes). View file
|
|