Spaces:
Running
Running
sanbo
commited on
Commit
•
9b0f4a0
1
Parent(s):
a4272c9
update sth. at 2024-11-26 16:15:47
Browse files- Dockerfile +13 -0
- Dockerfile.Demo +22 -0
- LICENSE +661 -0
- README.md +268 -0
- app.json +5 -0
- docs/README_GUI.md +18 -0
- docs/licenses/LICENSE.pdfminer.six +22 -0
- docs/licenses/LICENSE.pyHanko +23 -0
- pdf2zh/__init__.py +2 -0
- pdf2zh/_saslprep.py +101 -0
- pdf2zh/arcfour.py +35 -0
- pdf2zh/ascii85.py +70 -0
- pdf2zh/cache.py +91 -0
- pdf2zh/casting.py +15 -0
- pdf2zh/ccitt.py +614 -0
- pdf2zh/cmapdb.py +471 -0
- pdf2zh/converter.py +1384 -0
- pdf2zh/data_structures.py +52 -0
- pdf2zh/doclayout.py +213 -0
- pdf2zh/encodingdb.py +127 -0
- pdf2zh/fontmetrics.py +0 -0
- pdf2zh/glyphlist.py +0 -0
- pdf2zh/gui.py +425 -0
- pdf2zh/high_level.py +298 -0
- pdf2zh/image.py +297 -0
- pdf2zh/jbig2.py +373 -0
- pdf2zh/latin_enc.py +246 -0
- pdf2zh/layout.py +993 -0
- pdf2zh/lzw.py +105 -0
- pdf2zh/pdf2zh.py +310 -0
- pdf2zh/pdfcolor.py +37 -0
- pdf2zh/pdfdevice.py +316 -0
- pdf2zh/pdfdocument.py +1069 -0
- pdf2zh/pdfexceptions.py +33 -0
- pdf2zh/pdffont.py +1190 -0
- pdf2zh/pdfinterp.py +1113 -0
- pdf2zh/pdfpage.py +196 -0
- pdf2zh/pdfparser.py +166 -0
- pdf2zh/pdftypes.py +397 -0
- pdf2zh/psexceptions.py +18 -0
- pdf2zh/psparser.py +656 -0
- pdf2zh/py.typed +0 -0
- pdf2zh/runlength.py +39 -0
- pdf2zh/settings.py +1 -0
- pdf2zh/translator.py +315 -0
- pdf2zh/utils.py +834 -0
- pyproject.toml +52 -0
- setup.cfg +4 -0
Dockerfile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.12
|
2 |
+
|
3 |
+
WORKDIR /app
|
4 |
+
|
5 |
+
COPY . .
|
6 |
+
|
7 |
+
ENV PYTHONUNBUFFERED=1
|
8 |
+
|
9 |
+
RUN apt-get update && apt-get install -y libgl1
|
10 |
+
|
11 |
+
RUN pip install .
|
12 |
+
|
13 |
+
CMD ["pdf2zh", "-i"]
|
Dockerfile.Demo
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.12
|
2 |
+
|
3 |
+
WORKDIR /app
|
4 |
+
|
5 |
+
COPY . .
|
6 |
+
|
7 |
+
ENV PYTHONUNBUFFERED=1
|
8 |
+
|
9 |
+
RUN apt-get update && apt-get install -y libgl1
|
10 |
+
|
11 |
+
RUN pip install .
|
12 |
+
|
13 |
+
RUN mkdir -p /data
|
14 |
+
RUN chmod 777 /data
|
15 |
+
RUN mkdir -p /app
|
16 |
+
RUN chmod 777 /app
|
17 |
+
RUN mkdir -p /.cache
|
18 |
+
RUN chmod 777 /.cache
|
19 |
+
RUN mkdir -p ./gradio_files
|
20 |
+
RUN chmod 777 ./gradio_files
|
21 |
+
|
22 |
+
CMD ["pdf2zh", "-i"]
|
LICENSE
ADDED
@@ -0,0 +1,661 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
GNU AFFERO GENERAL PUBLIC LICENSE
|
2 |
+
Version 3, 19 November 2007
|
3 |
+
|
4 |
+
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
|
5 |
+
Everyone is permitted to copy and distribute verbatim copies
|
6 |
+
of this license document, but changing it is not allowed.
|
7 |
+
|
8 |
+
Preamble
|
9 |
+
|
10 |
+
The GNU Affero General Public License is a free, copyleft license for
|
11 |
+
software and other kinds of works, specifically designed to ensure
|
12 |
+
cooperation with the community in the case of network server software.
|
13 |
+
|
14 |
+
The licenses for most software and other practical works are designed
|
15 |
+
to take away your freedom to share and change the works. By contrast,
|
16 |
+
our General Public Licenses are intended to guarantee your freedom to
|
17 |
+
share and change all versions of a program--to make sure it remains free
|
18 |
+
software for all its users.
|
19 |
+
|
20 |
+
When we speak of free software, we are referring to freedom, not
|
21 |
+
price. Our General Public Licenses are designed to make sure that you
|
22 |
+
have the freedom to distribute copies of free software (and charge for
|
23 |
+
them if you wish), that you receive source code or can get it if you
|
24 |
+
want it, that you can change the software or use pieces of it in new
|
25 |
+
free programs, and that you know you can do these things.
|
26 |
+
|
27 |
+
Developers that use our General Public Licenses protect your rights
|
28 |
+
with two steps: (1) assert copyright on the software, and (2) offer
|
29 |
+
you this License which gives you legal permission to copy, distribute
|
30 |
+
and/or modify the software.
|
31 |
+
|
32 |
+
A secondary benefit of defending all users' freedom is that
|
33 |
+
improvements made in alternate versions of the program, if they
|
34 |
+
receive widespread use, become available for other developers to
|
35 |
+
incorporate. Many developers of free software are heartened and
|
36 |
+
encouraged by the resulting cooperation. However, in the case of
|
37 |
+
software used on network servers, this result may fail to come about.
|
38 |
+
The GNU General Public License permits making a modified version and
|
39 |
+
letting the public access it on a server without ever releasing its
|
40 |
+
source code to the public.
|
41 |
+
|
42 |
+
The GNU Affero General Public License is designed specifically to
|
43 |
+
ensure that, in such cases, the modified source code becomes available
|
44 |
+
to the community. It requires the operator of a network server to
|
45 |
+
provide the source code of the modified version running there to the
|
46 |
+
users of that server. Therefore, public use of a modified version, on
|
47 |
+
a publicly accessible server, gives the public access to the source
|
48 |
+
code of the modified version.
|
49 |
+
|
50 |
+
An older license, called the Affero General Public License and
|
51 |
+
published by Affero, was designed to accomplish similar goals. This is
|
52 |
+
a different license, not a version of the Affero GPL, but Affero has
|
53 |
+
released a new version of the Affero GPL which permits relicensing under
|
54 |
+
this license.
|
55 |
+
|
56 |
+
The precise terms and conditions for copying, distribution and
|
57 |
+
modification follow.
|
58 |
+
|
59 |
+
TERMS AND CONDITIONS
|
60 |
+
|
61 |
+
0. Definitions.
|
62 |
+
|
63 |
+
"This License" refers to version 3 of the GNU Affero General Public License.
|
64 |
+
|
65 |
+
"Copyright" also means copyright-like laws that apply to other kinds of
|
66 |
+
works, such as semiconductor masks.
|
67 |
+
|
68 |
+
"The Program" refers to any copyrightable work licensed under this
|
69 |
+
License. Each licensee is addressed as "you". "Licensees" and
|
70 |
+
"recipients" may be individuals or organizations.
|
71 |
+
|
72 |
+
To "modify" a work means to copy from or adapt all or part of the work
|
73 |
+
in a fashion requiring copyright permission, other than the making of an
|
74 |
+
exact copy. The resulting work is called a "modified version" of the
|
75 |
+
earlier work or a work "based on" the earlier work.
|
76 |
+
|
77 |
+
A "covered work" means either the unmodified Program or a work based
|
78 |
+
on the Program.
|
79 |
+
|
80 |
+
To "propagate" a work means to do anything with it that, without
|
81 |
+
permission, would make you directly or secondarily liable for
|
82 |
+
infringement under applicable copyright law, except executing it on a
|
83 |
+
computer or modifying a private copy. Propagation includes copying,
|
84 |
+
distribution (with or without modification), making available to the
|
85 |
+
public, and in some countries other activities as well.
|
86 |
+
|
87 |
+
To "convey" a work means any kind of propagation that enables other
|
88 |
+
parties to make or receive copies. Mere interaction with a user through
|
89 |
+
a computer network, with no transfer of a copy, is not conveying.
|
90 |
+
|
91 |
+
An interactive user interface displays "Appropriate Legal Notices"
|
92 |
+
to the extent that it includes a convenient and prominently visible
|
93 |
+
feature that (1) displays an appropriate copyright notice, and (2)
|
94 |
+
tells the user that there is no warranty for the work (except to the
|
95 |
+
extent that warranties are provided), that licensees may convey the
|
96 |
+
work under this License, and how to view a copy of this License. If
|
97 |
+
the interface presents a list of user commands or options, such as a
|
98 |
+
menu, a prominent item in the list meets this criterion.
|
99 |
+
|
100 |
+
1. Source Code.
|
101 |
+
|
102 |
+
The "source code" for a work means the preferred form of the work
|
103 |
+
for making modifications to it. "Object code" means any non-source
|
104 |
+
form of a work.
|
105 |
+
|
106 |
+
A "Standard Interface" means an interface that either is an official
|
107 |
+
standard defined by a recognized standards body, or, in the case of
|
108 |
+
interfaces specified for a particular programming language, one that
|
109 |
+
is widely used among developers working in that language.
|
110 |
+
|
111 |
+
The "System Libraries" of an executable work include anything, other
|
112 |
+
than the work as a whole, that (a) is included in the normal form of
|
113 |
+
packaging a Major Component, but which is not part of that Major
|
114 |
+
Component, and (b) serves only to enable use of the work with that
|
115 |
+
Major Component, or to implement a Standard Interface for which an
|
116 |
+
implementation is available to the public in source code form. A
|
117 |
+
"Major Component", in this context, means a major essential component
|
118 |
+
(kernel, window system, and so on) of the specific operating system
|
119 |
+
(if any) on which the executable work runs, or a compiler used to
|
120 |
+
produce the work, or an object code interpreter used to run it.
|
121 |
+
|
122 |
+
The "Corresponding Source" for a work in object code form means all
|
123 |
+
the source code needed to generate, install, and (for an executable
|
124 |
+
work) run the object code and to modify the work, including scripts to
|
125 |
+
control those activities. However, it does not include the work's
|
126 |
+
System Libraries, or general-purpose tools or generally available free
|
127 |
+
programs which are used unmodified in performing those activities but
|
128 |
+
which are not part of the work. For example, Corresponding Source
|
129 |
+
includes interface definition files associated with source files for
|
130 |
+
the work, and the source code for shared libraries and dynamically
|
131 |
+
linked subprograms that the work is specifically designed to require,
|
132 |
+
such as by intimate data communication or control flow between those
|
133 |
+
subprograms and other parts of the work.
|
134 |
+
|
135 |
+
The Corresponding Source need not include anything that users
|
136 |
+
can regenerate automatically from other parts of the Corresponding
|
137 |
+
Source.
|
138 |
+
|
139 |
+
The Corresponding Source for a work in source code form is that
|
140 |
+
same work.
|
141 |
+
|
142 |
+
2. Basic Permissions.
|
143 |
+
|
144 |
+
All rights granted under this License are granted for the term of
|
145 |
+
copyright on the Program, and are irrevocable provided the stated
|
146 |
+
conditions are met. This License explicitly affirms your unlimited
|
147 |
+
permission to run the unmodified Program. The output from running a
|
148 |
+
covered work is covered by this License only if the output, given its
|
149 |
+
content, constitutes a covered work. This License acknowledges your
|
150 |
+
rights of fair use or other equivalent, as provided by copyright law.
|
151 |
+
|
152 |
+
You may make, run and propagate covered works that you do not
|
153 |
+
convey, without conditions so long as your license otherwise remains
|
154 |
+
in force. You may convey covered works to others for the sole purpose
|
155 |
+
of having them make modifications exclusively for you, or provide you
|
156 |
+
with facilities for running those works, provided that you comply with
|
157 |
+
the terms of this License in conveying all material for which you do
|
158 |
+
not control copyright. Those thus making or running the covered works
|
159 |
+
for you must do so exclusively on your behalf, under your direction
|
160 |
+
and control, on terms that prohibit them from making any copies of
|
161 |
+
your copyrighted material outside their relationship with you.
|
162 |
+
|
163 |
+
Conveying under any other circumstances is permitted solely under
|
164 |
+
the conditions stated below. Sublicensing is not allowed; section 10
|
165 |
+
makes it unnecessary.
|
166 |
+
|
167 |
+
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
168 |
+
|
169 |
+
No covered work shall be deemed part of an effective technological
|
170 |
+
measure under any applicable law fulfilling obligations under article
|
171 |
+
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
172 |
+
similar laws prohibiting or restricting circumvention of such
|
173 |
+
measures.
|
174 |
+
|
175 |
+
When you convey a covered work, you waive any legal power to forbid
|
176 |
+
circumvention of technological measures to the extent such circumvention
|
177 |
+
is effected by exercising rights under this License with respect to
|
178 |
+
the covered work, and you disclaim any intention to limit operation or
|
179 |
+
modification of the work as a means of enforcing, against the work's
|
180 |
+
users, your or third parties' legal rights to forbid circumvention of
|
181 |
+
technological measures.
|
182 |
+
|
183 |
+
4. Conveying Verbatim Copies.
|
184 |
+
|
185 |
+
You may convey verbatim copies of the Program's source code as you
|
186 |
+
receive it, in any medium, provided that you conspicuously and
|
187 |
+
appropriately publish on each copy an appropriate copyright notice;
|
188 |
+
keep intact all notices stating that this License and any
|
189 |
+
non-permissive terms added in accord with section 7 apply to the code;
|
190 |
+
keep intact all notices of the absence of any warranty; and give all
|
191 |
+
recipients a copy of this License along with the Program.
|
192 |
+
|
193 |
+
You may charge any price or no price for each copy that you convey,
|
194 |
+
and you may offer support or warranty protection for a fee.
|
195 |
+
|
196 |
+
5. Conveying Modified Source Versions.
|
197 |
+
|
198 |
+
You may convey a work based on the Program, or the modifications to
|
199 |
+
produce it from the Program, in the form of source code under the
|
200 |
+
terms of section 4, provided that you also meet all of these conditions:
|
201 |
+
|
202 |
+
a) The work must carry prominent notices stating that you modified
|
203 |
+
it, and giving a relevant date.
|
204 |
+
|
205 |
+
b) The work must carry prominent notices stating that it is
|
206 |
+
released under this License and any conditions added under section
|
207 |
+
7. This requirement modifies the requirement in section 4 to
|
208 |
+
"keep intact all notices".
|
209 |
+
|
210 |
+
c) You must license the entire work, as a whole, under this
|
211 |
+
License to anyone who comes into possession of a copy. This
|
212 |
+
License will therefore apply, along with any applicable section 7
|
213 |
+
additional terms, to the whole of the work, and all its parts,
|
214 |
+
regardless of how they are packaged. This License gives no
|
215 |
+
permission to license the work in any other way, but it does not
|
216 |
+
invalidate such permission if you have separately received it.
|
217 |
+
|
218 |
+
d) If the work has interactive user interfaces, each must display
|
219 |
+
Appropriate Legal Notices; however, if the Program has interactive
|
220 |
+
interfaces that do not display Appropriate Legal Notices, your
|
221 |
+
work need not make them do so.
|
222 |
+
|
223 |
+
A compilation of a covered work with other separate and independent
|
224 |
+
works, which are not by their nature extensions of the covered work,
|
225 |
+
and which are not combined with it such as to form a larger program,
|
226 |
+
in or on a volume of a storage or distribution medium, is called an
|
227 |
+
"aggregate" if the compilation and its resulting copyright are not
|
228 |
+
used to limit the access or legal rights of the compilation's users
|
229 |
+
beyond what the individual works permit. Inclusion of a covered work
|
230 |
+
in an aggregate does not cause this License to apply to the other
|
231 |
+
parts of the aggregate.
|
232 |
+
|
233 |
+
6. Conveying Non-Source Forms.
|
234 |
+
|
235 |
+
You may convey a covered work in object code form under the terms
|
236 |
+
of sections 4 and 5, provided that you also convey the
|
237 |
+
machine-readable Corresponding Source under the terms of this License,
|
238 |
+
in one of these ways:
|
239 |
+
|
240 |
+
a) Convey the object code in, or embodied in, a physical product
|
241 |
+
(including a physical distribution medium), accompanied by the
|
242 |
+
Corresponding Source fixed on a durable physical medium
|
243 |
+
customarily used for software interchange.
|
244 |
+
|
245 |
+
b) Convey the object code in, or embodied in, a physical product
|
246 |
+
(including a physical distribution medium), accompanied by a
|
247 |
+
written offer, valid for at least three years and valid for as
|
248 |
+
long as you offer spare parts or customer support for that product
|
249 |
+
model, to give anyone who possesses the object code either (1) a
|
250 |
+
copy of the Corresponding Source for all the software in the
|
251 |
+
product that is covered by this License, on a durable physical
|
252 |
+
medium customarily used for software interchange, for a price no
|
253 |
+
more than your reasonable cost of physically performing this
|
254 |
+
conveying of source, or (2) access to copy the
|
255 |
+
Corresponding Source from a network server at no charge.
|
256 |
+
|
257 |
+
c) Convey individual copies of the object code with a copy of the
|
258 |
+
written offer to provide the Corresponding Source. This
|
259 |
+
alternative is allowed only occasionally and noncommercially, and
|
260 |
+
only if you received the object code with such an offer, in accord
|
261 |
+
with subsection 6b.
|
262 |
+
|
263 |
+
d) Convey the object code by offering access from a designated
|
264 |
+
place (gratis or for a charge), and offer equivalent access to the
|
265 |
+
Corresponding Source in the same way through the same place at no
|
266 |
+
further charge. You need not require recipients to copy the
|
267 |
+
Corresponding Source along with the object code. If the place to
|
268 |
+
copy the object code is a network server, the Corresponding Source
|
269 |
+
may be on a different server (operated by you or a third party)
|
270 |
+
that supports equivalent copying facilities, provided you maintain
|
271 |
+
clear directions next to the object code saying where to find the
|
272 |
+
Corresponding Source. Regardless of what server hosts the
|
273 |
+
Corresponding Source, you remain obligated to ensure that it is
|
274 |
+
available for as long as needed to satisfy these requirements.
|
275 |
+
|
276 |
+
e) Convey the object code using peer-to-peer transmission, provided
|
277 |
+
you inform other peers where the object code and Corresponding
|
278 |
+
Source of the work are being offered to the general public at no
|
279 |
+
charge under subsection 6d.
|
280 |
+
|
281 |
+
A separable portion of the object code, whose source code is excluded
|
282 |
+
from the Corresponding Source as a System Library, need not be
|
283 |
+
included in conveying the object code work.
|
284 |
+
|
285 |
+
A "User Product" is either (1) a "consumer product", which means any
|
286 |
+
tangible personal property which is normally used for personal, family,
|
287 |
+
or household purposes, or (2) anything designed or sold for incorporation
|
288 |
+
into a dwelling. In determining whether a product is a consumer product,
|
289 |
+
doubtful cases shall be resolved in favor of coverage. For a particular
|
290 |
+
product received by a particular user, "normally used" refers to a
|
291 |
+
typical or common use of that class of product, regardless of the status
|
292 |
+
of the particular user or of the way in which the particular user
|
293 |
+
actually uses, or expects or is expected to use, the product. A product
|
294 |
+
is a consumer product regardless of whether the product has substantial
|
295 |
+
commercial, industrial or non-consumer uses, unless such uses represent
|
296 |
+
the only significant mode of use of the product.
|
297 |
+
|
298 |
+
"Installation Information" for a User Product means any methods,
|
299 |
+
procedures, authorization keys, or other information required to install
|
300 |
+
and execute modified versions of a covered work in that User Product from
|
301 |
+
a modified version of its Corresponding Source. The information must
|
302 |
+
suffice to ensure that the continued functioning of the modified object
|
303 |
+
code is in no case prevented or interfered with solely because
|
304 |
+
modification has been made.
|
305 |
+
|
306 |
+
If you convey an object code work under this section in, or with, or
|
307 |
+
specifically for use in, a User Product, and the conveying occurs as
|
308 |
+
part of a transaction in which the right of possession and use of the
|
309 |
+
User Product is transferred to the recipient in perpetuity or for a
|
310 |
+
fixed term (regardless of how the transaction is characterized), the
|
311 |
+
Corresponding Source conveyed under this section must be accompanied
|
312 |
+
by the Installation Information. But this requirement does not apply
|
313 |
+
if neither you nor any third party retains the ability to install
|
314 |
+
modified object code on the User Product (for example, the work has
|
315 |
+
been installed in ROM).
|
316 |
+
|
317 |
+
The requirement to provide Installation Information does not include a
|
318 |
+
requirement to continue to provide support service, warranty, or updates
|
319 |
+
for a work that has been modified or installed by the recipient, or for
|
320 |
+
the User Product in which it has been modified or installed. Access to a
|
321 |
+
network may be denied when the modification itself materially and
|
322 |
+
adversely affects the operation of the network or violates the rules and
|
323 |
+
protocols for communication across the network.
|
324 |
+
|
325 |
+
Corresponding Source conveyed, and Installation Information provided,
|
326 |
+
in accord with this section must be in a format that is publicly
|
327 |
+
documented (and with an implementation available to the public in
|
328 |
+
source code form), and must require no special password or key for
|
329 |
+
unpacking, reading or copying.
|
330 |
+
|
331 |
+
7. Additional Terms.
|
332 |
+
|
333 |
+
"Additional permissions" are terms that supplement the terms of this
|
334 |
+
License by making exceptions from one or more of its conditions.
|
335 |
+
Additional permissions that are applicable to the entire Program shall
|
336 |
+
be treated as though they were included in this License, to the extent
|
337 |
+
that they are valid under applicable law. If additional permissions
|
338 |
+
apply only to part of the Program, that part may be used separately
|
339 |
+
under those permissions, but the entire Program remains governed by
|
340 |
+
this License without regard to the additional permissions.
|
341 |
+
|
342 |
+
When you convey a copy of a covered work, you may at your option
|
343 |
+
remove any additional permissions from that copy, or from any part of
|
344 |
+
it. (Additional permissions may be written to require their own
|
345 |
+
removal in certain cases when you modify the work.) You may place
|
346 |
+
additional permissions on material, added by you to a covered work,
|
347 |
+
for which you have or can give appropriate copyright permission.
|
348 |
+
|
349 |
+
Notwithstanding any other provision of this License, for material you
|
350 |
+
add to a covered work, you may (if authorized by the copyright holders of
|
351 |
+
that material) supplement the terms of this License with terms:
|
352 |
+
|
353 |
+
a) Disclaiming warranty or limiting liability differently from the
|
354 |
+
terms of sections 15 and 16 of this License; or
|
355 |
+
|
356 |
+
b) Requiring preservation of specified reasonable legal notices or
|
357 |
+
author attributions in that material or in the Appropriate Legal
|
358 |
+
Notices displayed by works containing it; or
|
359 |
+
|
360 |
+
c) Prohibiting misrepresentation of the origin of that material, or
|
361 |
+
requiring that modified versions of such material be marked in
|
362 |
+
reasonable ways as different from the original version; or
|
363 |
+
|
364 |
+
d) Limiting the use for publicity purposes of names of licensors or
|
365 |
+
authors of the material; or
|
366 |
+
|
367 |
+
e) Declining to grant rights under trademark law for use of some
|
368 |
+
trade names, trademarks, or service marks; or
|
369 |
+
|
370 |
+
f) Requiring indemnification of licensors and authors of that
|
371 |
+
material by anyone who conveys the material (or modified versions of
|
372 |
+
it) with contractual assumptions of liability to the recipient, for
|
373 |
+
any liability that these contractual assumptions directly impose on
|
374 |
+
those licensors and authors.
|
375 |
+
|
376 |
+
All other non-permissive additional terms are considered "further
|
377 |
+
restrictions" within the meaning of section 10. If the Program as you
|
378 |
+
received it, or any part of it, contains a notice stating that it is
|
379 |
+
governed by this License along with a term that is a further
|
380 |
+
restriction, you may remove that term. If a license document contains
|
381 |
+
a further restriction but permits relicensing or conveying under this
|
382 |
+
License, you may add to a covered work material governed by the terms
|
383 |
+
of that license document, provided that the further restriction does
|
384 |
+
not survive such relicensing or conveying.
|
385 |
+
|
386 |
+
If you add terms to a covered work in accord with this section, you
|
387 |
+
must place, in the relevant source files, a statement of the
|
388 |
+
additional terms that apply to those files, or a notice indicating
|
389 |
+
where to find the applicable terms.
|
390 |
+
|
391 |
+
Additional terms, permissive or non-permissive, may be stated in the
|
392 |
+
form of a separately written license, or stated as exceptions;
|
393 |
+
the above requirements apply either way.
|
394 |
+
|
395 |
+
8. Termination.
|
396 |
+
|
397 |
+
You may not propagate or modify a covered work except as expressly
|
398 |
+
provided under this License. Any attempt otherwise to propagate or
|
399 |
+
modify it is void, and will automatically terminate your rights under
|
400 |
+
this License (including any patent licenses granted under the third
|
401 |
+
paragraph of section 11).
|
402 |
+
|
403 |
+
However, if you cease all violation of this License, then your
|
404 |
+
license from a particular copyright holder is reinstated (a)
|
405 |
+
provisionally, unless and until the copyright holder explicitly and
|
406 |
+
finally terminates your license, and (b) permanently, if the copyright
|
407 |
+
holder fails to notify you of the violation by some reasonable means
|
408 |
+
prior to 60 days after the cessation.
|
409 |
+
|
410 |
+
Moreover, your license from a particular copyright holder is
|
411 |
+
reinstated permanently if the copyright holder notifies you of the
|
412 |
+
violation by some reasonable means, this is the first time you have
|
413 |
+
received notice of violation of this License (for any work) from that
|
414 |
+
copyright holder, and you cure the violation prior to 30 days after
|
415 |
+
your receipt of the notice.
|
416 |
+
|
417 |
+
Termination of your rights under this section does not terminate the
|
418 |
+
licenses of parties who have received copies or rights from you under
|
419 |
+
this License. If your rights have been terminated and not permanently
|
420 |
+
reinstated, you do not qualify to receive new licenses for the same
|
421 |
+
material under section 10.
|
422 |
+
|
423 |
+
9. Acceptance Not Required for Having Copies.
|
424 |
+
|
425 |
+
You are not required to accept this License in order to receive or
|
426 |
+
run a copy of the Program. Ancillary propagation of a covered work
|
427 |
+
occurring solely as a consequence of using peer-to-peer transmission
|
428 |
+
to receive a copy likewise does not require acceptance. However,
|
429 |
+
nothing other than this License grants you permission to propagate or
|
430 |
+
modify any covered work. These actions infringe copyright if you do
|
431 |
+
not accept this License. Therefore, by modifying or propagating a
|
432 |
+
covered work, you indicate your acceptance of this License to do so.
|
433 |
+
|
434 |
+
10. Automatic Licensing of Downstream Recipients.
|
435 |
+
|
436 |
+
Each time you convey a covered work, the recipient automatically
|
437 |
+
receives a license from the original licensors, to run, modify and
|
438 |
+
propagate that work, subject to this License. You are not responsible
|
439 |
+
for enforcing compliance by third parties with this License.
|
440 |
+
|
441 |
+
An "entity transaction" is a transaction transferring control of an
|
442 |
+
organization, or substantially all assets of one, or subdividing an
|
443 |
+
organization, or merging organizations. If propagation of a covered
|
444 |
+
work results from an entity transaction, each party to that
|
445 |
+
transaction who receives a copy of the work also receives whatever
|
446 |
+
licenses to the work the party's predecessor in interest had or could
|
447 |
+
give under the previous paragraph, plus a right to possession of the
|
448 |
+
Corresponding Source of the work from the predecessor in interest, if
|
449 |
+
the predecessor has it or can get it with reasonable efforts.
|
450 |
+
|
451 |
+
You may not impose any further restrictions on the exercise of the
|
452 |
+
rights granted or affirmed under this License. For example, you may
|
453 |
+
not impose a license fee, royalty, or other charge for exercise of
|
454 |
+
rights granted under this License, and you may not initiate litigation
|
455 |
+
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
456 |
+
any patent claim is infringed by making, using, selling, offering for
|
457 |
+
sale, or importing the Program or any portion of it.
|
458 |
+
|
459 |
+
11. Patents.
|
460 |
+
|
461 |
+
A "contributor" is a copyright holder who authorizes use under this
|
462 |
+
License of the Program or a work on which the Program is based. The
|
463 |
+
work thus licensed is called the contributor's "contributor version".
|
464 |
+
|
465 |
+
A contributor's "essential patent claims" are all patent claims
|
466 |
+
owned or controlled by the contributor, whether already acquired or
|
467 |
+
hereafter acquired, that would be infringed by some manner, permitted
|
468 |
+
by this License, of making, using, or selling its contributor version,
|
469 |
+
but do not include claims that would be infringed only as a
|
470 |
+
consequence of further modification of the contributor version. For
|
471 |
+
purposes of this definition, "control" includes the right to grant
|
472 |
+
patent sublicenses in a manner consistent with the requirements of
|
473 |
+
this License.
|
474 |
+
|
475 |
+
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
476 |
+
patent license under the contributor's essential patent claims, to
|
477 |
+
make, use, sell, offer for sale, import and otherwise run, modify and
|
478 |
+
propagate the contents of its contributor version.
|
479 |
+
|
480 |
+
In the following three paragraphs, a "patent license" is any express
|
481 |
+
agreement or commitment, however denominated, not to enforce a patent
|
482 |
+
(such as an express permission to practice a patent or covenant not to
|
483 |
+
sue for patent infringement). To "grant" such a patent license to a
|
484 |
+
party means to make such an agreement or commitment not to enforce a
|
485 |
+
patent against the party.
|
486 |
+
|
487 |
+
If you convey a covered work, knowingly relying on a patent license,
|
488 |
+
and the Corresponding Source of the work is not available for anyone
|
489 |
+
to copy, free of charge and under the terms of this License, through a
|
490 |
+
publicly available network server or other readily accessible means,
|
491 |
+
then you must either (1) cause the Corresponding Source to be so
|
492 |
+
available, or (2) arrange to deprive yourself of the benefit of the
|
493 |
+
patent license for this particular work, or (3) arrange, in a manner
|
494 |
+
consistent with the requirements of this License, to extend the patent
|
495 |
+
license to downstream recipients. "Knowingly relying" means you have
|
496 |
+
actual knowledge that, but for the patent license, your conveying the
|
497 |
+
covered work in a country, or your recipient's use of the covered work
|
498 |
+
in a country, would infringe one or more identifiable patents in that
|
499 |
+
country that you have reason to believe are valid.
|
500 |
+
|
501 |
+
If, pursuant to or in connection with a single transaction or
|
502 |
+
arrangement, you convey, or propagate by procuring conveyance of, a
|
503 |
+
covered work, and grant a patent license to some of the parties
|
504 |
+
receiving the covered work authorizing them to use, propagate, modify
|
505 |
+
or convey a specific copy of the covered work, then the patent license
|
506 |
+
you grant is automatically extended to all recipients of the covered
|
507 |
+
work and works based on it.
|
508 |
+
|
509 |
+
A patent license is "discriminatory" if it does not include within
|
510 |
+
the scope of its coverage, prohibits the exercise of, or is
|
511 |
+
conditioned on the non-exercise of one or more of the rights that are
|
512 |
+
specifically granted under this License. You may not convey a covered
|
513 |
+
work if you are a party to an arrangement with a third party that is
|
514 |
+
in the business of distributing software, under which you make payment
|
515 |
+
to the third party based on the extent of your activity of conveying
|
516 |
+
the work, and under which the third party grants, to any of the
|
517 |
+
parties who would receive the covered work from you, a discriminatory
|
518 |
+
patent license (a) in connection with copies of the covered work
|
519 |
+
conveyed by you (or copies made from those copies), or (b) primarily
|
520 |
+
for and in connection with specific products or compilations that
|
521 |
+
contain the covered work, unless you entered into that arrangement,
|
522 |
+
or that patent license was granted, prior to 28 March 2007.
|
523 |
+
|
524 |
+
Nothing in this License shall be construed as excluding or limiting
|
525 |
+
any implied license or other defenses to infringement that may
|
526 |
+
otherwise be available to you under applicable patent law.
|
527 |
+
|
528 |
+
12. No Surrender of Others' Freedom.
|
529 |
+
|
530 |
+
If conditions are imposed on you (whether by court order, agreement or
|
531 |
+
otherwise) that contradict the conditions of this License, they do not
|
532 |
+
excuse you from the conditions of this License. If you cannot convey a
|
533 |
+
covered work so as to satisfy simultaneously your obligations under this
|
534 |
+
License and any other pertinent obligations, then as a consequence you may
|
535 |
+
not convey it at all. For example, if you agree to terms that obligate you
|
536 |
+
to collect a royalty for further conveying from those to whom you convey
|
537 |
+
the Program, the only way you could satisfy both those terms and this
|
538 |
+
License would be to refrain entirely from conveying the Program.
|
539 |
+
|
540 |
+
13. Remote Network Interaction; Use with the GNU General Public License.
|
541 |
+
|
542 |
+
Notwithstanding any other provision of this License, if you modify the
|
543 |
+
Program, your modified version must prominently offer all users
|
544 |
+
interacting with it remotely through a computer network (if your version
|
545 |
+
supports such interaction) an opportunity to receive the Corresponding
|
546 |
+
Source of your version by providing access to the Corresponding Source
|
547 |
+
from a network server at no charge, through some standard or customary
|
548 |
+
means of facilitating copying of software. This Corresponding Source
|
549 |
+
shall include the Corresponding Source for any work covered by version 3
|
550 |
+
of the GNU General Public License that is incorporated pursuant to the
|
551 |
+
following paragraph.
|
552 |
+
|
553 |
+
Notwithstanding any other provision of this License, you have
|
554 |
+
permission to link or combine any covered work with a work licensed
|
555 |
+
under version 3 of the GNU General Public License into a single
|
556 |
+
combined work, and to convey the resulting work. The terms of this
|
557 |
+
License will continue to apply to the part which is the covered work,
|
558 |
+
but the work with which it is combined will remain governed by version
|
559 |
+
3 of the GNU General Public License.
|
560 |
+
|
561 |
+
14. Revised Versions of this License.
|
562 |
+
|
563 |
+
The Free Software Foundation may publish revised and/or new versions of
|
564 |
+
the GNU Affero General Public License from time to time. Such new versions
|
565 |
+
will be similar in spirit to the present version, but may differ in detail to
|
566 |
+
address new problems or concerns.
|
567 |
+
|
568 |
+
Each version is given a distinguishing version number. If the
|
569 |
+
Program specifies that a certain numbered version of the GNU Affero General
|
570 |
+
Public License "or any later version" applies to it, you have the
|
571 |
+
option of following the terms and conditions either of that numbered
|
572 |
+
version or of any later version published by the Free Software
|
573 |
+
Foundation. If the Program does not specify a version number of the
|
574 |
+
GNU Affero General Public License, you may choose any version ever published
|
575 |
+
by the Free Software Foundation.
|
576 |
+
|
577 |
+
If the Program specifies that a proxy can decide which future
|
578 |
+
versions of the GNU Affero General Public License can be used, that proxy's
|
579 |
+
public statement of acceptance of a version permanently authorizes you
|
580 |
+
to choose that version for the Program.
|
581 |
+
|
582 |
+
Later license versions may give you additional or different
|
583 |
+
permissions. However, no additional obligations are imposed on any
|
584 |
+
author or copyright holder as a result of your choosing to follow a
|
585 |
+
later version.
|
586 |
+
|
587 |
+
15. Disclaimer of Warranty.
|
588 |
+
|
589 |
+
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
590 |
+
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
591 |
+
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
592 |
+
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
593 |
+
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
594 |
+
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
595 |
+
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
596 |
+
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
597 |
+
|
598 |
+
16. Limitation of Liability.
|
599 |
+
|
600 |
+
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
601 |
+
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
602 |
+
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
603 |
+
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
604 |
+
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
605 |
+
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
606 |
+
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
607 |
+
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
608 |
+
SUCH DAMAGES.
|
609 |
+
|
610 |
+
17. Interpretation of Sections 15 and 16.
|
611 |
+
|
612 |
+
If the disclaimer of warranty and limitation of liability provided
|
613 |
+
above cannot be given local legal effect according to their terms,
|
614 |
+
reviewing courts shall apply local law that most closely approximates
|
615 |
+
an absolute waiver of all civil liability in connection with the
|
616 |
+
Program, unless a warranty or assumption of liability accompanies a
|
617 |
+
copy of the Program in return for a fee.
|
618 |
+
|
619 |
+
END OF TERMS AND CONDITIONS
|
620 |
+
|
621 |
+
How to Apply These Terms to Your New Programs
|
622 |
+
|
623 |
+
If you develop a new program, and you want it to be of the greatest
|
624 |
+
possible use to the public, the best way to achieve this is to make it
|
625 |
+
free software which everyone can redistribute and change under these terms.
|
626 |
+
|
627 |
+
To do so, attach the following notices to the program. It is safest
|
628 |
+
to attach them to the start of each source file to most effectively
|
629 |
+
state the exclusion of warranty; and each file should have at least
|
630 |
+
the "copyright" line and a pointer to where the full notice is found.
|
631 |
+
|
632 |
+
<one line to give the program's name and a brief idea of what it does.>
|
633 |
+
Copyright (C) <year> <name of author>
|
634 |
+
|
635 |
+
This program is free software: you can redistribute it and/or modify
|
636 |
+
it under the terms of the GNU Affero General Public License as published
|
637 |
+
by the Free Software Foundation, either version 3 of the License, or
|
638 |
+
(at your option) any later version.
|
639 |
+
|
640 |
+
This program is distributed in the hope that it will be useful,
|
641 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
642 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
643 |
+
GNU Affero General Public License for more details.
|
644 |
+
|
645 |
+
You should have received a copy of the GNU Affero General Public License
|
646 |
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
647 |
+
|
648 |
+
Also add information on how to contact you by electronic and paper mail.
|
649 |
+
|
650 |
+
If your software can interact with users remotely through a computer
|
651 |
+
network, you should also make sure that it provides a way for users to
|
652 |
+
get its source. For example, if your program is a web application, its
|
653 |
+
interface could display a "Source" link that leads users to an archive
|
654 |
+
of the code. There are many ways you could offer source, and different
|
655 |
+
solutions will be better for different programs; see section 13 for the
|
656 |
+
specific requirements.
|
657 |
+
|
658 |
+
You should also get your employer (if you work as a programmer) or school,
|
659 |
+
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
660 |
+
For more information on this, and how to apply and follow the GNU AGPL, see
|
661 |
+
<https://www.gnu.org/licenses/>.
|
README.md
CHANGED
@@ -9,3 +9,271 @@ short_description: pdf翻译
|
|
9 |
---
|
10 |
|
11 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
---
|
10 |
|
11 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
12 |
+
|
13 |
+
# PDFTranslate
|
14 |
+
|
15 |
+
科学 PDF 文档翻译及双语对照工具
|
16 |
+
|
17 |
+
|
18 |
+
- 📊 保留公式、图表、目录和注释 *([预览效果](#preview))*
|
19 |
+
- 🌐 支持 [多种语言](#language) 和 [诸多翻译服务](#services)
|
20 |
+
- 🤖 提供 [命令行工具](#usage),[图形交互界面](#gui),以及 [容器化部署](#docker)
|
21 |
+
|
22 |
+
欢迎在 [GitHub Issues](https://github.com/Byaidu/PDFMathTranslate/issues) 或 [Telegram 用户群](https://t.me/+Z9_SgnxmsmA5NzBl) 中提供反馈。
|
23 |
+
|
24 |
+
<h2 id="updates">近期更新</h2>
|
25 |
+
|
26 |
+
- [Nov. 26 2024] CLI 现在已支持(多个)在线 PDF 文件 *(by [@reycn](https://github.com/reycn))*
|
27 |
+
- [Nov. 24 2024] 为降低依赖大小,提供 [ONNX](https://github.com/onnx/onnx) 支持 *(by [@Wybxc](https://github.com/Wybxc))*
|
28 |
+
- [Nov. 23 2024] 🌟 [免费公共服务](#demo) 上线! *(by [@Byaidu](https://github.com/Byaidu))*
|
29 |
+
- [Nov. 23 2024] 防止网页爬虫的防火墙 *(by [@Byaidu](https://github.com/Byaidu))*
|
30 |
+
- [Nov. 22 2024] 图形用户界面现已支持意大利语,并获得了一些更新 *(by [@Byaidu](https://github.com/Byaidu), [@reycn](https://github.com/reycn))*
|
31 |
+
- [Nov. 22 2024] 现在你可以将自己部署的服务分享给朋友了 *(by [@Zxis233](https://github.com/Zxis233))*
|
32 |
+
- [Nov. 22 2024] 支持腾讯翻译 *(by [@hellofinch](https://github.com/hellofinch))*
|
33 |
+
- [Nov. 21 2024] 图形用户界面现在支持下载双语文档 *(by [@reycn](https://github.com/reycn))*
|
34 |
+
- [Nov. 20 2024] 🌟 提供了 [在线演示](#demo)! *(by [@reycn](https://github.com/reycn))*
|
35 |
+
|
36 |
+
<h2 id="preview">效果预览</h2>
|
37 |
+
|
38 |
+
|
39 |
+
![](https://raw.githubusercontent.com/hhhaiai/Picture/main/img/202411261612975.gif)
|
40 |
+
|
41 |
+
<h2 id="demo">在线演示 🌟</h2>
|
42 |
+
|
43 |
+
### 免费服务 (<https://pdf2zh.com/>)
|
44 |
+
|
45 |
+
你可以立即尝试 [免费公共服务](https://pdf2zh.com/) 而无需安装。
|
46 |
+
|
47 |
+
### Hugging Face 在线演示
|
48 |
+
|
49 |
+
你可以立即尝试 [在 HuggingFace 上的在线演示](https://huggingface.co/spaces/reycn/PDFMathTranslate-Docker) 而无需安装。
|
50 |
+
请注意,演示的计算资源有限,因此请避免滥用。
|
51 |
+
|
52 |
+
<h2 id="install">安装和使用</h2>
|
53 |
+
|
54 |
+
我们提供了三种使用该项目的方法:[命令行工具](#cmd)、[图形交互界面](#gui) 和 [容器化部署](#docker).
|
55 |
+
|
56 |
+
<h3 id="cmd">方法一、命令行工具</h3>
|
57 |
+
|
58 |
+
1. 确保安装了版本大于 3.8 且小于 3.12 的 Python
|
59 |
+
2. 安装此程序:
|
60 |
+
|
61 |
+
```bash
|
62 |
+
pip install pdf2zh
|
63 |
+
```
|
64 |
+
|
65 |
+
3. 开始使用:
|
66 |
+
|
67 |
+
```bash
|
68 |
+
pdf2zh document.pdf
|
69 |
+
```
|
70 |
+
|
71 |
+
<h3 id="gui">方法二、图形交互界面</h3>
|
72 |
+
|
73 |
+
1. 确保安装了版本大于 3.8 且小于 3.12 的 Python
|
74 |
+
2. 安装此程序:
|
75 |
+
|
76 |
+
```bash
|
77 |
+
pip install pdf2zh
|
78 |
+
```
|
79 |
+
|
80 |
+
3. 开始在浏览器中使用:
|
81 |
+
|
82 |
+
```bash
|
83 |
+
pdf2zh -i
|
84 |
+
```
|
85 |
+
|
86 |
+
4. 如果您的浏览器没有自动启动并跳转,请用浏览器打开:
|
87 |
+
|
88 |
+
```bash
|
89 |
+
http://localhost:7860/
|
90 |
+
```
|
91 |
+
|
92 |
+
![](https://raw.githubusercontent.com/hhhaiai/Picture/main/img/202411261614075.gif)
|
93 |
+
|
94 |
+
查看 [documentation for GUI](./docs/README_GUI.md) 获取细节说明.
|
95 |
+
|
96 |
+
<h3 id="docker">方法三、容器化部署</h3>
|
97 |
+
|
98 |
+
1. 拉取 Docker 镜像并运行:
|
99 |
+
|
100 |
+
```bash
|
101 |
+
docker pull byaidu/pdf2zh
|
102 |
+
docker run -d -p 7860:7860 byaidu/pdf2zh
|
103 |
+
```
|
104 |
+
|
105 |
+
2. 通过浏览器打开:
|
106 |
+
|
107 |
+
```
|
108 |
+
http://localhost:7860/
|
109 |
+
```
|
110 |
+
|
111 |
+
用于在云服务上部署容器镜像:
|
112 |
+
|
113 |
+
<a href="https://www.heroku.com/deploy?template=https://github.com/Byaidu/PDFMathTranslate">
|
114 |
+
<img src="https://www.herokucdn.com/deploy/button.svg" alt="Deploy" height="26"></a>
|
115 |
+
|
116 |
+
|
117 |
+
<a href="https://render.com/deploy">
|
118 |
+
<img src="https://render.com/images/deploy-to-render-button.svg" alt="Deploy to Koyeb" height="26"></a>
|
119 |
+
|
120 |
+
<a href="https://zeabur.com/templates/5FQIGX?referralCode=reycn">
|
121 |
+
<img src="https://zeabur.com/button.svg" alt="Deploy on Zeabur" height="26"></a>
|
122 |
+
|
123 |
+
<a href="https://app.koyeb.com/deploy?type=git&builder=buildpack&repository=github.com/Byaidu/PDFMathTranslate&branch=main&name=pdf-math-translate">
|
124 |
+
<img src="https://www.koyeb.com/static/images/deploy/button.svg" alt="Deploy to Koyeb" height="26"></a>
|
125 |
+
|
126 |
+
<h2 id="usage">高级选项</h2>
|
127 |
+
|
128 |
+
在命令行中执行翻译命令,生成译文文档 `example-zh.pdf` 和双语对照文档 `example-dual.pdf`,默认使用 Google 翻译服务
|
129 |
+
|
130 |
+
![](https://raw.githubusercontent.com/hhhaiai/Picture/main/img/202411261614851.png)
|
131 |
+
|
132 |
+
我们在下表中列出了所有高级选项,以供参考:
|
133 |
+
|
134 |
+
| Option | Function | Example |
|
135 |
+
| -------- | ------- |------- |
|
136 |
+
| (文档) | 本地(多个)文件 | `pdf2zh ~/local.pdf` |
|
137 |
+
| | 在线(多个)文件| `pdf2zh http://web.com/online.pdf` |
|
138 |
+
| `-i` | [进入图形界面](#gui) | `pdf2zh -i` |
|
139 |
+
| `-p` | [仅翻译部分文档](#partial) | `pdf2zh example.pdf -p 1` |
|
140 |
+
| `-li` | [源语言](#languages) | `pdf2zh example.pdf -li en` |
|
141 |
+
| `-lo` | [目标语言](#languages) | `pdf2zh example.pdf -lo zh` |
|
142 |
+
| `-s` | [指定翻译服务](#services) | `pdf2zh example.pdf -s deepl` |
|
143 |
+
| `-t` | [多线程](#threads) | `pdf2zh example.pdf -t 1` |
|
144 |
+
| `-f`, `-c` | [例外规则](#exceptions) | `pdf2zh example.pdf -f "(MS.*)"` |
|
145 |
+
|
146 |
+
某些服务需要设置环境变量。关于设置环境变量的详细说明,请参考 [ChatGPT](https://chatgpt.com/share/6734a83d-9d48-800e-8a46-f57ca6e8bcb4)
|
147 |
+
|
148 |
+
<h3 id="partial">全文或部分文档翻译</h3>
|
149 |
+
|
150 |
+
- **全文翻译**
|
151 |
+
|
152 |
+
```bash
|
153 |
+
pdf2zh example.pdf
|
154 |
+
```
|
155 |
+
|
156 |
+
- **部分翻译**
|
157 |
+
|
158 |
+
```bash
|
159 |
+
pdf2zh example.pdf -p 1-3,5
|
160 |
+
```
|
161 |
+
|
162 |
+
<h3 id="language">指定源语言和目标语言</h3>
|
163 |
+
|
164 |
+
参考 [Google Languages Codes](https://developers.google.com/admin-sdk/directory/v1/languages), [DeepL Languages Codes](https://developers.deepl.com/docs/resources/supported-languages)
|
165 |
+
|
166 |
+
```bash
|
167 |
+
pdf2zh example.pdf -li en -lo ja
|
168 |
+
```
|
169 |
+
|
170 |
+
<h3 id="services">使用不同的翻译服务</h3>
|
171 |
+
|
172 |
+
- **DeepL**
|
173 |
+
|
174 |
+
参考 [DeepL](https://support.deepl.com/hc/en-us/articles/360020695820-API-Key-for-DeepL-s-API)
|
175 |
+
|
176 |
+
设置环境变量构建接入点:`{DEEPL_SERVER_URL}/translate`
|
177 |
+
|
178 |
+
- `DEEPL_SERVER_URL`(可选), e.g., `export DEEPL_SERVER_URL=https://api.deepl.com`
|
179 |
+
- `DEEPL_AUTH_KEY`, e.g., `export DEEPL_AUTH_KEY=xxx`
|
180 |
+
|
181 |
+
```bash
|
182 |
+
pdf2zh example.pdf -s deepl
|
183 |
+
```
|
184 |
+
|
185 |
+
- **DeepLX**
|
186 |
+
|
187 |
+
参考 [DeepLX](https://github.com/OwO-Network/DeepLX)
|
188 |
+
|
189 |
+
设置环境变量构建接入点:`{DEEPLX_SERVER_URL}/translate`
|
190 |
+
|
191 |
+
- `DEEPLX_SERVER_URL`(可选), e.g., `export DEEPLX_SERVER_URL=https://api.deepl.com`
|
192 |
+
- `DEEPLX_AUTH_KEY`, e.g., `export DEEPLX_AUTH_KEY=xxx`
|
193 |
+
|
194 |
+
```bash
|
195 |
+
pdf2zh example.pdf -s deepl
|
196 |
+
```
|
197 |
+
|
198 |
+
- **Ollama**
|
199 |
+
|
200 |
+
参考 [Ollama](https://github.com/ollama/ollama)
|
201 |
+
|
202 |
+
设置环境变量构建接入点:`{OLLAMA_HOST}/api/chat`
|
203 |
+
|
204 |
+
- `OLLAMA_HOST`(可选), e.g., `export OLLAMA_HOST=https://localhost:11434`
|
205 |
+
|
206 |
+
```bash
|
207 |
+
pdf2zh example.pdf -s ollama:gemma2
|
208 |
+
```
|
209 |
+
|
210 |
+
- **支持 OpenAI 协议的 LLM(如 OpenAI、SiliconCloud、Zhipu)**
|
211 |
+
|
212 |
+
参考 [SiliconCloud](https://docs.siliconflow.cn/quickstart), [Zhipu](https://open.bigmodel.cn/dev/api/thirdparty-frame/openai-sdk)
|
213 |
+
|
214 |
+
设置环境变量构建接入点:`{OPENAI_BASE_URL}/chat/completions`
|
215 |
+
|
216 |
+
- `OPENAI_BASE_URL`(可选), e.g., `export OPENAI_BASE_URL=https://api.openai.com/v1`
|
217 |
+
- `OPENAI_API_KEY`, e.g., `export OPENAI_API_KEY=xxx`
|
218 |
+
|
219 |
+
```bash
|
220 |
+
pdf2zh example.pdf -s openai:gpt-4o
|
221 |
+
```
|
222 |
+
|
223 |
+
- **Azure**
|
224 |
+
|
225 |
+
参考 [Azure Text Translation](https://docs.azure.cn/en-us/ai-services/translator/text-translation-overview)
|
226 |
+
|
227 |
+
需设置以下环境变量:
|
228 |
+
|
229 |
+
- `AZURE_APIKEY`, e.g., `export AZURE_APIKEY=xxx`
|
230 |
+
- `AZURE_ENDPOINT`, e.g., `export AZURE_ENDPOINT=https://api.translator.azure.cn/`
|
231 |
+
- `AZURE_REGION`, e.g., `export AZURE_REGION=chinaeast2`
|
232 |
+
|
233 |
+
```bash
|
234 |
+
pdf2zh example.pdf -s azure
|
235 |
+
```
|
236 |
+
|
237 |
+
- **腾讯机器翻译**
|
238 |
+
|
239 |
+
参考 [腾讯机器翻译](https://cloud.tencent.com/product/tmt)
|
240 |
+
|
241 |
+
需设置以下环境变量:
|
242 |
+
|
243 |
+
- `TENCENT_SECRET_ID`, e.g., `export TENCENT_SECRET_ID=AKIDxxx`
|
244 |
+
- `TENCENT_SECRET_KEY`, e.g., `export TENCENT_SECRET_KEY=xxx`
|
245 |
+
|
246 |
+
```bash
|
247 |
+
pdf2zh example.pdf -s tmt
|
248 |
+
```
|
249 |
+
|
250 |
+
<h3 id="exceptions">指定例外规则</h3>
|
251 |
+
|
252 |
+
使用正则表达式指定需保留的公式字体与字符
|
253 |
+
|
254 |
+
```bash
|
255 |
+
pdf2zh example.pdf -f "(CM[^RT].*|MS.*|.*Ital)" -c "(\(|\||\)|\+|=|\d|[\u0080-\ufaff])"
|
256 |
+
```
|
257 |
+
|
258 |
+
<h3 id="threads">指定线程数量</h3>
|
259 |
+
|
260 |
+
使用 `-t` 指定翻译时使用的线程数量:
|
261 |
+
|
262 |
+
```bash
|
263 |
+
pdf2zh example.pdf -t 1
|
264 |
+
```
|
265 |
+
|
266 |
+
<h2 id="acknowledgement">致谢</h2>
|
267 |
+
|
268 |
+
- 文档合并:[PyMuPDF](https://github.com/pymupdf/PyMuPDF)
|
269 |
+
|
270 |
+
- 文档解析:[Pdfminer.six](https://github.com/pdfminer/pdfminer.six)
|
271 |
+
|
272 |
+
- 文档提取:[MinerU](https://github.com/opendatalab/MinerU)
|
273 |
+
|
274 |
+
- 多线程翻译:[MathTranslate](https://github.com/SUSYUSTC/MathTranslate)
|
275 |
+
|
276 |
+
- 布局解析:[DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
|
277 |
+
|
278 |
+
- 文档标准:[PDF Explained](https://zxyle.github.io/PDF-Explained/), [PDF Cheat Sheets](https://pdfa.org/resource/pdf-cheat-sheets/)
|
279 |
+
|
app.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"name": "PDFMathTranslate",
|
3 |
+
"description": "PDF scientific paper translation and bilingual comparison.",
|
4 |
+
"repository": "https://github.com/Byaidu/PDFMathTranslate"
|
5 |
+
}
|
docs/README_GUI.md
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Interact with GUI
|
2 |
+
|
3 |
+
This subfolder provides the GUI mode of `pdf2zh`.
|
4 |
+
|
5 |
+
## Usage
|
6 |
+
|
7 |
+
1. Run `pdf2zh -i`
|
8 |
+
|
9 |
+
2. Drop the PDF file into the window and click `Translate`.
|
10 |
+
|
11 |
+
## Preview
|
12 |
+
|
13 |
+
<img src="./images/before.png" width="500"/>
|
14 |
+
<img src="./images/after.png" width="500"/>
|
15 |
+
|
16 |
+
## Maintainance
|
17 |
+
|
18 |
+
GUI maintained by [Rongxin](https://github.com/reycn)
|
docs/licenses/LICENSE.pdfminer.six
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Copyright (c) 2004-2016 Yusuke Shinyama <yusuke at shinyama dot jp>
|
2 |
+
|
3 |
+
Permission is hereby granted, free of charge, to any person
|
4 |
+
obtaining a copy of this software and associated documentation
|
5 |
+
files (the "Software"), to deal in the Software without
|
6 |
+
restriction, including without limitation the rights to use,
|
7 |
+
copy, modify, merge, publish, distribute, sublicense, and/or
|
8 |
+
sell copies of the Software, and to permit persons to whom the
|
9 |
+
Software is furnished to do so, subject to the following
|
10 |
+
conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be
|
13 |
+
included in all copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
|
16 |
+
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
17 |
+
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
|
18 |
+
PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
19 |
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
21 |
+
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
22 |
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
docs/licenses/LICENSE.pyHanko
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
This package contains various elements based on code from the pyHanko project, of which we reproduce the license below.
|
2 |
+
|
3 |
+
MIT License
|
4 |
+
|
5 |
+
Copyright (c) 2020 Matthias Valvekens
|
6 |
+
|
7 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
8 |
+
of this software and associated documentation files (the "Software"), to deal
|
9 |
+
in the Software without restriction, including without limitation the rights
|
10 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
11 |
+
copies of the Software, and to permit persons to whom the Software is
|
12 |
+
furnished to do so, subject to the following conditions:
|
13 |
+
|
14 |
+
The above copyright notice and this permission notice shall be included in all
|
15 |
+
copies or substantial portions of the Software.
|
16 |
+
|
17 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
18 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
19 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
20 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
21 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
22 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
23 |
+
SOFTWARE.
|
pdf2zh/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
__version__ = "1.8.0"
|
2 |
+
__author__ = "Byaidu"
|
pdf2zh/_saslprep.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2016-present MongoDB, Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
#
|
15 |
+
# Some changes copyright 2021-present Matthias Valvekens,
|
16 |
+
# licensed under the license of the pyHanko project (see LICENSE file).
|
17 |
+
|
18 |
+
|
19 |
+
"""An implementation of RFC4013 SASLprep."""
|
20 |
+
|
21 |
+
__all__ = ["saslprep"]
|
22 |
+
|
23 |
+
import stringprep
|
24 |
+
import unicodedata
|
25 |
+
from typing import Callable, Tuple
|
26 |
+
|
27 |
+
from pdf2zh.pdfexceptions import PDFValueError
|
28 |
+
|
29 |
+
# RFC4013 section 2.3 prohibited output.
|
30 |
+
_PROHIBITED: Tuple[Callable[[str], bool], ...] = (
|
31 |
+
# A strict reading of RFC 4013 requires table c12 here, but
|
32 |
+
# characters from it are mapped to SPACE in the Map step. Can
|
33 |
+
# normalization reintroduce them somehow?
|
34 |
+
stringprep.in_table_c12,
|
35 |
+
stringprep.in_table_c21_c22,
|
36 |
+
stringprep.in_table_c3,
|
37 |
+
stringprep.in_table_c4,
|
38 |
+
stringprep.in_table_c5,
|
39 |
+
stringprep.in_table_c6,
|
40 |
+
stringprep.in_table_c7,
|
41 |
+
stringprep.in_table_c8,
|
42 |
+
stringprep.in_table_c9,
|
43 |
+
)
|
44 |
+
|
45 |
+
|
46 |
+
def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
|
47 |
+
"""An implementation of RFC4013 SASLprep.
|
48 |
+
:param data:
|
49 |
+
The string to SASLprep.
|
50 |
+
:param prohibit_unassigned_code_points:
|
51 |
+
RFC 3454 and RFCs for various SASL mechanisms distinguish between
|
52 |
+
`queries` (unassigned code points allowed) and
|
53 |
+
`stored strings` (unassigned code points prohibited). Defaults
|
54 |
+
to ``True`` (unassigned code points are prohibited).
|
55 |
+
:return: The SASLprep'ed version of `data`.
|
56 |
+
"""
|
57 |
+
if prohibit_unassigned_code_points:
|
58 |
+
prohibited = _PROHIBITED + (stringprep.in_table_a1,)
|
59 |
+
else:
|
60 |
+
prohibited = _PROHIBITED
|
61 |
+
|
62 |
+
# RFC3454 section 2, step 1 - Map
|
63 |
+
# RFC4013 section 2.1 mappings
|
64 |
+
# Map Non-ASCII space characters to SPACE (U+0020). Map
|
65 |
+
# commonly mapped to nothing characters to, well, nothing.
|
66 |
+
in_table_c12 = stringprep.in_table_c12
|
67 |
+
in_table_b1 = stringprep.in_table_b1
|
68 |
+
data = "".join(
|
69 |
+
[
|
70 |
+
"\u0020" if in_table_c12(elt) else elt
|
71 |
+
for elt in data
|
72 |
+
if not in_table_b1(elt)
|
73 |
+
],
|
74 |
+
)
|
75 |
+
|
76 |
+
# RFC3454 section 2, step 2 - Normalize
|
77 |
+
# RFC4013 section 2.2 normalization
|
78 |
+
data = unicodedata.ucd_3_2_0.normalize("NFKC", data)
|
79 |
+
|
80 |
+
in_table_d1 = stringprep.in_table_d1
|
81 |
+
if in_table_d1(data[0]):
|
82 |
+
if not in_table_d1(data[-1]):
|
83 |
+
# RFC3454, Section 6, #3. If a string contains any
|
84 |
+
# RandALCat character, the first and last characters
|
85 |
+
# MUST be RandALCat characters.
|
86 |
+
raise PDFValueError("SASLprep: failed bidirectional check")
|
87 |
+
# RFC3454, Section 6, #2. If a string contains any RandALCat
|
88 |
+
# character, it MUST NOT contain any LCat character.
|
89 |
+
prohibited = prohibited + (stringprep.in_table_d2,)
|
90 |
+
else:
|
91 |
+
# RFC3454, Section 6, #3. Following the logic of #3, if
|
92 |
+
# the first character is not a RandALCat, no other character
|
93 |
+
# can be either.
|
94 |
+
prohibited = prohibited + (in_table_d1,)
|
95 |
+
|
96 |
+
# RFC3454 section 2, step 3 and 4 - Prohibit and check bidi
|
97 |
+
for char in data:
|
98 |
+
if any(in_table(char) for in_table in prohibited):
|
99 |
+
raise PDFValueError("SASLprep: failed prohibited character check")
|
100 |
+
|
101 |
+
return data
|
pdf2zh/arcfour.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Python implementation of Arcfour encryption algorithm.
|
2 |
+
See https://en.wikipedia.org/wiki/RC4
|
3 |
+
This code is in the public domain.
|
4 |
+
|
5 |
+
"""
|
6 |
+
|
7 |
+
from typing import Sequence
|
8 |
+
|
9 |
+
|
10 |
+
class Arcfour:
    """Stateful RC4 (Arcfour) stream cipher.

    The key schedule runs at construction time; successive calls to
    :meth:`process` continue the same keystream, so one instance can
    decrypt a stream fed to it in chunks.
    """

    def __init__(self, key: Sequence[int]) -> None:
        # Key-scheduling algorithm (KSA): permute 0..255 driven by the key.
        state = list(range(256))
        j = 0
        klen = len(key)
        for i in range(256):
            j = (j + state[i] + key[i % klen]) % 256
            state[i], state[j] = state[j], state[i]
        self.s = state
        self.i = 0
        self.j = 0

    def process(self, data: bytes) -> bytes:
        """XOR *data* with the next bytes of the keystream and return it."""
        i, j = self.i, self.j
        s = self.s
        out = bytearray()
        for c in data:
            # Pseudo-random generation algorithm (PRGA): one byte per step.
            i = (i + 1) % 256
            j = (j + s[i]) % 256
            s[i], s[j] = s[j], s[i]
            out.append(c ^ s[(s[i] + s[j]) % 256])
        self.i, self.j = i, j
        return bytes(out)

    # RC4 is symmetric: the same operation both encrypts and decrypts.
    encrypt = decrypt = process
|
pdf2zh/ascii85.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Python implementation of ASCII85/ASCIIHex decoder (Adobe version).
|
2 |
+
|
3 |
+
This code is in the public domain.
|
4 |
+
|
5 |
+
"""
|
6 |
+
|
7 |
+
import re
|
8 |
+
import struct
|
9 |
+
|
10 |
+
|
11 |
+
# ascii85decode(data)
|
12 |
+
def ascii85decode(data: bytes) -> bytes:
    """Decode Adobe-flavoured ASCII85 data.

    Five characters from '!'..'u' encode four bytes as base-85 digits
    (256**4 < 85**5).  'z' abbreviates four zero bytes, and '~' (start
    of the Adobe '~>' EOD marker) terminates decoding; a short final
    group of n digits is padded with 'u' (84) and yields n-1 bytes.
    Characters outside these sets are ignored.
    """
    ndigits = 0
    acc = 0
    out = b""
    for byte in data:
        if 0x21 <= byte <= 0x75:  # '!' .. 'u': one base-85 digit
            ndigits += 1
            acc = acc * 85 + (byte - 0x21)
            if ndigits == 5:
                out += struct.pack(">L", acc)
                ndigits = 0
                acc = 0
        elif byte == 0x7A:  # 'z': shorthand for four NUL bytes
            assert ndigits == 0, str(ndigits)
            out += b"\0\0\0\0"
        elif byte == 0x7E:  # '~': end-of-data
            if ndigits:
                # Pad the partial group with the maximum digit, then keep
                # only the bytes that were actually encoded.
                for _ in range(5 - ndigits):
                    acc = acc * 85 + 84
                out += struct.pack(">L", acc)[: ndigits - 1]
            break
    return out
|
42 |
+
|
43 |
+
|
44 |
+
# asciihexdecode(data)
hex_re = re.compile(rb"([a-f\d]{2})", re.IGNORECASE)
trail_re = re.compile(rb"^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$", re.IGNORECASE)


def asciihexdecode(data: bytes) -> bytes:
    """ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1

    Each pair of ASCII hexadecimal digits (0-9 and A-F or a-f) produces
    one byte of binary data; whitespace is ignored and '>' marks EOD.
    If EOD follows an odd number of digits, the filter behaves as if a
    '0' followed the last digit.
    """
    decoded = bytearray(int(pair, 16) for pair in hex_re.findall(data))
    tail = trail_re.search(data)
    if tail:
        # Odd digit count before EOD: pad the final digit with '0'.
        decoded.append(int(tail.group(1) + b"0", 16))
    return bytes(decoded)
|
pdf2zh/cache.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import tempfile
|
2 |
+
import os
|
3 |
+
import time
|
4 |
+
import hashlib
|
5 |
+
import shutil
|
6 |
+
|
7 |
+
cache_dir = os.path.join(tempfile.gettempdir(), "cache")
|
8 |
+
os.makedirs(cache_dir, exist_ok=True)
|
9 |
+
time_filename = "update_time"
|
10 |
+
max_cache = 5
|
11 |
+
|
12 |
+
|
13 |
+
def deterministic_hash(obj):
    """Return a stable 20-hex-char digest of ``str(obj)``.

    Unlike the builtin ``hash()``, SHA-256 is deterministic across runs
    and processes, so the value is safe to use as an on-disk cache key.
    """
    digest = hashlib.sha256(str(obj).encode())
    return digest.hexdigest()[:20]
|
17 |
+
|
18 |
+
|
19 |
+
def get_dirs():
    """Return absolute paths of all subdirectories of the cache root."""
    candidates = (os.path.join(cache_dir, name) for name in os.listdir(cache_dir))
    return [path for path in candidates if os.path.isdir(path)]
|
26 |
+
|
27 |
+
|
28 |
+
def get_time(dir):
    """Return the timestamp recorded in *dir*'s update-time file.

    :param dir: absolute path of one cache entry directory.
    :return: the stored wall-clock time, or ``+inf`` when the file is missing.
    """
    timefile = os.path.join(dir, time_filename)
    try:
        # Context manager fixes the original's leaked file handle
        # (open(...).read() was never closed).
        with open(timefile, encoding="utf-8") as f:
            return float(f.read())
    except FileNotFoundError:
        # NOTE(review): +inf makes this entry sort as the *newest* under
        # the argmin-based eviction in remove_extra(), i.e. it is removed
        # last, although the original comment claimed it would be removed
        # first — confirm the intended policy before changing the value.
        return float("inf")
|
38 |
+
|
39 |
+
|
40 |
+
def write_time(dir):
    """Stamp *dir* with the current wall-clock time (no trailing newline).

    :param dir: absolute path of one cache entry directory.
    """
    timefile = os.path.join(dir, time_filename)
    # 'with' replaces print(..., file=open(...)), which left the handle
    # open until garbage collection; output bytes are unchanged.
    with open(timefile, "w", encoding="utf-8") as f:
        f.write(str(time.time()))
|
44 |
+
|
45 |
+
|
46 |
+
def argmin(iterable):
    """Return the index of the smallest value in *iterable*."""
    best_index, _best_value = min(enumerate(iterable), key=lambda pair: pair[1])
    return best_index
|
48 |
+
|
49 |
+
|
50 |
+
def remove_extra():
    """Evict cache entries until at most ``max_cache`` directories remain.

    First pass: remove stray non-directory entries and delete any entry
    whose timestamp cannot be read at all.  Second pass: repeatedly
    delete the entry with the smallest (oldest) recorded timestamp.
    """
    dirs = get_dirs()
    for dir in dirs:
        if not os.path.isdir(
            dir
        ):  # This line might be redundant now, as get_dirs() ensures only directories are returned
            os.remove(dir)
        try:
            # Probe the timestamp; get_time itself maps a missing file to
            # +inf, so only lower-level failures reach the handler below.
            get_time(dir)
        except BaseException:
            # NOTE(review): BaseException is very broad (catches
            # KeyboardInterrupt/SystemExit too) — confirm whether
            # narrowing to Exception matches the intent.
            shutil.rmtree(dir)
    while True:
        dirs = get_dirs()
        if len(dirs) <= max_cache:
            break
        # Evict the entry whose timestamp is smallest, i.e. the oldest.
        times = [get_time(dir) for dir in dirs]
        arg = argmin(times)
        shutil.rmtree(dirs[arg])
|
68 |
+
|
69 |
+
|
70 |
+
def is_cached(hash_key):
    """Return True if a cache directory already exists for *hash_key*."""
    return os.path.exists(os.path.join(cache_dir, hash_key))
|
73 |
+
|
74 |
+
|
75 |
+
def create_cache(hash_key):
    """Ensure the cache directory for *hash_key* exists and refresh its timestamp."""
    entry = os.path.join(cache_dir, hash_key)
    os.makedirs(entry, exist_ok=True)
    write_time(entry)
|
79 |
+
|
80 |
+
|
81 |
+
def load_paragraph(hash_key, hash_key_paragraph):
    """Return the cached text for one paragraph, or None when absent.

    :param hash_key: cache key of the whole document.
    :param hash_key_paragraph: cache key of the individual paragraph.
    """
    filename = os.path.join(cache_dir, hash_key, hash_key_paragraph)
    if not os.path.exists(filename):
        return None
    # Context manager fixes the original's leaked file handle.
    with open(filename, encoding="utf-8") as f:
        return f.read()
|
87 |
+
|
88 |
+
|
89 |
+
def write_paragraph(hash_key, hash_key_paragraph, paragraph):
    """Persist one paragraph's text under its document cache entry.

    :param hash_key: cache key of the whole document (directory must exist).
    :param hash_key_paragraph: cache key of the individual paragraph.
    :param paragraph: text to store; written via str() with no newline,
        matching the original print(..., end="") behavior.
    """
    filename = os.path.join(cache_dir, hash_key, hash_key_paragraph)
    # 'with' replaces print(..., file=open(...)), which leaked the handle.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(str(paragraph))
|
pdf2zh/casting.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any, Optional
|
2 |
+
|
3 |
+
|
4 |
+
def safe_int(o: Any) -> Optional[int]:
    """Convert *o* to int, returning None instead of raising on failure."""
    try:
        converted = int(o)
    except (TypeError, ValueError):
        return None
    return converted
|
9 |
+
|
10 |
+
|
11 |
+
def safe_float(o: Any) -> Optional[float]:
    """Convert *o* to float, returning None instead of raising on failure."""
    try:
        converted = float(o)
    except (TypeError, ValueError):
        return None
    return converted
|
pdf2zh/ccitt.py
ADDED
@@ -0,0 +1,614 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# CCITT Fax decoder
|
2 |
+
#
|
3 |
+
# Bugs: uncompressed mode untested.
|
4 |
+
#
|
5 |
+
# cf.
|
6 |
+
# ITU-T Recommendation T.4
|
7 |
+
# "Standardization of Group 3 facsimile terminals
|
8 |
+
# for document transmission"
|
9 |
+
# ITU-T Recommendation T.6
|
10 |
+
# "FACSIMILE CODING SCHEMES AND CODING CONTROL FUNCTIONS
|
11 |
+
# FOR GROUP 4 FACSIMILE APPARATUS"
|
12 |
+
|
13 |
+
|
14 |
+
import array
|
15 |
+
from typing import (
|
16 |
+
Any,
|
17 |
+
Callable,
|
18 |
+
Dict,
|
19 |
+
Iterator,
|
20 |
+
List,
|
21 |
+
MutableSequence,
|
22 |
+
Optional,
|
23 |
+
Sequence,
|
24 |
+
Union,
|
25 |
+
cast,
|
26 |
+
)
|
27 |
+
|
28 |
+
from pdf2zh.pdfexceptions import PDFException, PDFValueError
|
29 |
+
|
30 |
+
|
31 |
+
def get_bytes(data: bytes) -> Iterator[int]:
    """Yield each byte of *data* as an int (kept as a named helper for clarity)."""
    for byte in data:
        yield byte
|
33 |
+
|
34 |
+
|
35 |
+
# Workaround https://github.com/python/mypy/issues/731
|
36 |
+
BitParserState = MutableSequence[Any]
|
37 |
+
# A better definition (not supported by mypy) would be:
|
38 |
+
# BitParserState = MutableSequence[Union["BitParserState", int, str, None]]
|
39 |
+
|
40 |
+
|
41 |
+
class BitParser:
    """Bit-level state-machine driver for prefix (Huffman-style) codes.

    Subclasses build binary decision trees with :meth:`add` — nested
    two-element lists indexed by bit value — then feed encoded bytes in.
    Whenever a leaf is reached, ``self._accept`` is invoked with the
    leaf's value and must return the tree from which parsing restarts.
    """

    # Current position in the decision tree: a nested [zero, one] list.
    _state: BitParserState

    # _accept is declared Optional solely as a workaround for
    # https://github.com/python/mypy/issues/708
    _accept: Optional[Callable[[Any], BitParserState]]

    def __init__(self) -> None:
        # Count of bits consumed so far (useful when debugging streams).
        self._pos = 0

    @classmethod
    def add(cls, root: BitParserState, v: Union[int, str], bits: str) -> None:
        # Insert value *v* into tree *root* under the code word *bits*
        # (a string of '0'/'1' characters, MSB first).
        p: BitParserState = root
        b = None
        for i in range(len(bits)):
            if i > 0:
                assert b is not None
                if p[b] is None:
                    # Grow an interior [zero-branch, one-branch] node lazily.
                    p[b] = [None, None]
                p = p[b]
            if bits[i] == "1":
                b = 1
            else:
                b = 0
        assert b is not None
        p[b] = v

    def feedbytes(self, data: bytes) -> None:
        # Feed raw bytes into the parser, most-significant bit first.
        for byte in get_bytes(data):
            for m in (128, 64, 32, 16, 8, 4, 2, 1):
                self._parse_bit(byte & m)

    def _parse_bit(self, x: object) -> None:
        # Walk one edge of the tree: truthy x takes the "1" branch.
        if x:
            v = self._state[1]
        else:
            v = self._state[0]
        self._pos += 1
        if isinstance(v, list):
            # Interior node: keep descending on the next bit.
            self._state = v
        else:
            # Leaf: hand the decoded value to the subclass hook, which
            # returns the tree to restart from.
            assert self._accept is not None
            self._state = self._accept(v)
|
84 |
+
|
85 |
+
|
86 |
+
class CCITTG4Parser(BitParser):
|
87 |
+
MODE = [None, None]
|
88 |
+
BitParser.add(MODE, 0, "1")
|
89 |
+
BitParser.add(MODE, +1, "011")
|
90 |
+
BitParser.add(MODE, -1, "010")
|
91 |
+
BitParser.add(MODE, "h", "001")
|
92 |
+
BitParser.add(MODE, "p", "0001")
|
93 |
+
BitParser.add(MODE, +2, "000011")
|
94 |
+
BitParser.add(MODE, -2, "000010")
|
95 |
+
BitParser.add(MODE, +3, "0000011")
|
96 |
+
BitParser.add(MODE, -3, "0000010")
|
97 |
+
BitParser.add(MODE, "u", "0000001111")
|
98 |
+
BitParser.add(MODE, "x1", "0000001000")
|
99 |
+
BitParser.add(MODE, "x2", "0000001001")
|
100 |
+
BitParser.add(MODE, "x3", "0000001010")
|
101 |
+
BitParser.add(MODE, "x4", "0000001011")
|
102 |
+
BitParser.add(MODE, "x5", "0000001100")
|
103 |
+
BitParser.add(MODE, "x6", "0000001101")
|
104 |
+
BitParser.add(MODE, "x7", "0000001110")
|
105 |
+
BitParser.add(MODE, "e", "000000000001000000000001")
|
106 |
+
|
107 |
+
WHITE = [None, None]
|
108 |
+
BitParser.add(WHITE, 0, "00110101")
|
109 |
+
BitParser.add(WHITE, 1, "000111")
|
110 |
+
BitParser.add(WHITE, 2, "0111")
|
111 |
+
BitParser.add(WHITE, 3, "1000")
|
112 |
+
BitParser.add(WHITE, 4, "1011")
|
113 |
+
BitParser.add(WHITE, 5, "1100")
|
114 |
+
BitParser.add(WHITE, 6, "1110")
|
115 |
+
BitParser.add(WHITE, 7, "1111")
|
116 |
+
BitParser.add(WHITE, 8, "10011")
|
117 |
+
BitParser.add(WHITE, 9, "10100")
|
118 |
+
BitParser.add(WHITE, 10, "00111")
|
119 |
+
BitParser.add(WHITE, 11, "01000")
|
120 |
+
BitParser.add(WHITE, 12, "001000")
|
121 |
+
BitParser.add(WHITE, 13, "000011")
|
122 |
+
BitParser.add(WHITE, 14, "110100")
|
123 |
+
BitParser.add(WHITE, 15, "110101")
|
124 |
+
BitParser.add(WHITE, 16, "101010")
|
125 |
+
BitParser.add(WHITE, 17, "101011")
|
126 |
+
BitParser.add(WHITE, 18, "0100111")
|
127 |
+
BitParser.add(WHITE, 19, "0001100")
|
128 |
+
BitParser.add(WHITE, 20, "0001000")
|
129 |
+
BitParser.add(WHITE, 21, "0010111")
|
130 |
+
BitParser.add(WHITE, 22, "0000011")
|
131 |
+
BitParser.add(WHITE, 23, "0000100")
|
132 |
+
BitParser.add(WHITE, 24, "0101000")
|
133 |
+
BitParser.add(WHITE, 25, "0101011")
|
134 |
+
BitParser.add(WHITE, 26, "0010011")
|
135 |
+
BitParser.add(WHITE, 27, "0100100")
|
136 |
+
BitParser.add(WHITE, 28, "0011000")
|
137 |
+
BitParser.add(WHITE, 29, "00000010")
|
138 |
+
BitParser.add(WHITE, 30, "00000011")
|
139 |
+
BitParser.add(WHITE, 31, "00011010")
|
140 |
+
BitParser.add(WHITE, 32, "00011011")
|
141 |
+
BitParser.add(WHITE, 33, "00010010")
|
142 |
+
BitParser.add(WHITE, 34, "00010011")
|
143 |
+
BitParser.add(WHITE, 35, "00010100")
|
144 |
+
BitParser.add(WHITE, 36, "00010101")
|
145 |
+
BitParser.add(WHITE, 37, "00010110")
|
146 |
+
BitParser.add(WHITE, 38, "00010111")
|
147 |
+
BitParser.add(WHITE, 39, "00101000")
|
148 |
+
BitParser.add(WHITE, 40, "00101001")
|
149 |
+
BitParser.add(WHITE, 41, "00101010")
|
150 |
+
BitParser.add(WHITE, 42, "00101011")
|
151 |
+
BitParser.add(WHITE, 43, "00101100")
|
152 |
+
BitParser.add(WHITE, 44, "00101101")
|
153 |
+
BitParser.add(WHITE, 45, "00000100")
|
154 |
+
BitParser.add(WHITE, 46, "00000101")
|
155 |
+
BitParser.add(WHITE, 47, "00001010")
|
156 |
+
BitParser.add(WHITE, 48, "00001011")
|
157 |
+
BitParser.add(WHITE, 49, "01010010")
|
158 |
+
BitParser.add(WHITE, 50, "01010011")
|
159 |
+
BitParser.add(WHITE, 51, "01010100")
|
160 |
+
BitParser.add(WHITE, 52, "01010101")
|
161 |
+
BitParser.add(WHITE, 53, "00100100")
|
162 |
+
BitParser.add(WHITE, 54, "00100101")
|
163 |
+
BitParser.add(WHITE, 55, "01011000")
|
164 |
+
BitParser.add(WHITE, 56, "01011001")
|
165 |
+
BitParser.add(WHITE, 57, "01011010")
|
166 |
+
BitParser.add(WHITE, 58, "01011011")
|
167 |
+
BitParser.add(WHITE, 59, "01001010")
|
168 |
+
BitParser.add(WHITE, 60, "01001011")
|
169 |
+
BitParser.add(WHITE, 61, "00110010")
|
170 |
+
BitParser.add(WHITE, 62, "00110011")
|
171 |
+
BitParser.add(WHITE, 63, "00110100")
|
172 |
+
BitParser.add(WHITE, 64, "11011")
|
173 |
+
BitParser.add(WHITE, 128, "10010")
|
174 |
+
BitParser.add(WHITE, 192, "010111")
|
175 |
+
BitParser.add(WHITE, 256, "0110111")
|
176 |
+
BitParser.add(WHITE, 320, "00110110")
|
177 |
+
BitParser.add(WHITE, 384, "00110111")
|
178 |
+
BitParser.add(WHITE, 448, "01100100")
|
179 |
+
BitParser.add(WHITE, 512, "01100101")
|
180 |
+
BitParser.add(WHITE, 576, "01101000")
|
181 |
+
BitParser.add(WHITE, 640, "01100111")
|
182 |
+
BitParser.add(WHITE, 704, "011001100")
|
183 |
+
BitParser.add(WHITE, 768, "011001101")
|
184 |
+
BitParser.add(WHITE, 832, "011010010")
|
185 |
+
BitParser.add(WHITE, 896, "011010011")
|
186 |
+
BitParser.add(WHITE, 960, "011010100")
|
187 |
+
BitParser.add(WHITE, 1024, "011010101")
|
188 |
+
BitParser.add(WHITE, 1088, "011010110")
|
189 |
+
BitParser.add(WHITE, 1152, "011010111")
|
190 |
+
BitParser.add(WHITE, 1216, "011011000")
|
191 |
+
BitParser.add(WHITE, 1280, "011011001")
|
192 |
+
BitParser.add(WHITE, 1344, "011011010")
|
193 |
+
BitParser.add(WHITE, 1408, "011011011")
|
194 |
+
BitParser.add(WHITE, 1472, "010011000")
|
195 |
+
BitParser.add(WHITE, 1536, "010011001")
|
196 |
+
BitParser.add(WHITE, 1600, "010011010")
|
197 |
+
BitParser.add(WHITE, 1664, "011000")
|
198 |
+
BitParser.add(WHITE, 1728, "010011011")
|
199 |
+
BitParser.add(WHITE, 1792, "00000001000")
|
200 |
+
BitParser.add(WHITE, 1856, "00000001100")
|
201 |
+
BitParser.add(WHITE, 1920, "00000001101")
|
202 |
+
BitParser.add(WHITE, 1984, "000000010010")
|
203 |
+
BitParser.add(WHITE, 2048, "000000010011")
|
204 |
+
BitParser.add(WHITE, 2112, "000000010100")
|
205 |
+
BitParser.add(WHITE, 2176, "000000010101")
|
206 |
+
BitParser.add(WHITE, 2240, "000000010110")
|
207 |
+
BitParser.add(WHITE, 2304, "000000010111")
|
208 |
+
BitParser.add(WHITE, 2368, "000000011100")
|
209 |
+
BitParser.add(WHITE, 2432, "000000011101")
|
210 |
+
BitParser.add(WHITE, 2496, "000000011110")
|
211 |
+
BitParser.add(WHITE, 2560, "000000011111")
|
212 |
+
|
213 |
+
BLACK = [None, None]
|
214 |
+
BitParser.add(BLACK, 0, "0000110111")
|
215 |
+
BitParser.add(BLACK, 1, "010")
|
216 |
+
BitParser.add(BLACK, 2, "11")
|
217 |
+
BitParser.add(BLACK, 3, "10")
|
218 |
+
BitParser.add(BLACK, 4, "011")
|
219 |
+
BitParser.add(BLACK, 5, "0011")
|
220 |
+
BitParser.add(BLACK, 6, "0010")
|
221 |
+
BitParser.add(BLACK, 7, "00011")
|
222 |
+
BitParser.add(BLACK, 8, "000101")
|
223 |
+
BitParser.add(BLACK, 9, "000100")
|
224 |
+
BitParser.add(BLACK, 10, "0000100")
|
225 |
+
BitParser.add(BLACK, 11, "0000101")
|
226 |
+
BitParser.add(BLACK, 12, "0000111")
|
227 |
+
BitParser.add(BLACK, 13, "00000100")
|
228 |
+
BitParser.add(BLACK, 14, "00000111")
|
229 |
+
BitParser.add(BLACK, 15, "000011000")
|
230 |
+
BitParser.add(BLACK, 16, "0000010111")
|
231 |
+
BitParser.add(BLACK, 17, "0000011000")
|
232 |
+
BitParser.add(BLACK, 18, "0000001000")
|
233 |
+
BitParser.add(BLACK, 19, "00001100111")
|
234 |
+
BitParser.add(BLACK, 20, "00001101000")
|
235 |
+
BitParser.add(BLACK, 21, "00001101100")
|
236 |
+
BitParser.add(BLACK, 22, "00000110111")
|
237 |
+
BitParser.add(BLACK, 23, "00000101000")
|
238 |
+
BitParser.add(BLACK, 24, "00000010111")
|
239 |
+
BitParser.add(BLACK, 25, "00000011000")
|
240 |
+
BitParser.add(BLACK, 26, "000011001010")
|
241 |
+
BitParser.add(BLACK, 27, "000011001011")
|
242 |
+
BitParser.add(BLACK, 28, "000011001100")
|
243 |
+
BitParser.add(BLACK, 29, "000011001101")
|
244 |
+
BitParser.add(BLACK, 30, "000001101000")
|
245 |
+
BitParser.add(BLACK, 31, "000001101001")
|
246 |
+
BitParser.add(BLACK, 32, "000001101010")
|
247 |
+
BitParser.add(BLACK, 33, "000001101011")
|
248 |
+
BitParser.add(BLACK, 34, "000011010010")
|
249 |
+
BitParser.add(BLACK, 35, "000011010011")
|
250 |
+
BitParser.add(BLACK, 36, "000011010100")
|
251 |
+
BitParser.add(BLACK, 37, "000011010101")
|
252 |
+
BitParser.add(BLACK, 38, "000011010110")
|
253 |
+
BitParser.add(BLACK, 39, "000011010111")
|
254 |
+
BitParser.add(BLACK, 40, "000001101100")
|
255 |
+
BitParser.add(BLACK, 41, "000001101101")
|
256 |
+
BitParser.add(BLACK, 42, "000011011010")
|
257 |
+
BitParser.add(BLACK, 43, "000011011011")
|
258 |
+
BitParser.add(BLACK, 44, "000001010100")
|
259 |
+
BitParser.add(BLACK, 45, "000001010101")
|
260 |
+
BitParser.add(BLACK, 46, "000001010110")
|
261 |
+
BitParser.add(BLACK, 47, "000001010111")
|
262 |
+
BitParser.add(BLACK, 48, "000001100100")
|
263 |
+
BitParser.add(BLACK, 49, "000001100101")
|
264 |
+
BitParser.add(BLACK, 50, "000001010010")
|
265 |
+
BitParser.add(BLACK, 51, "000001010011")
|
266 |
+
BitParser.add(BLACK, 52, "000000100100")
|
267 |
+
BitParser.add(BLACK, 53, "000000110111")
|
268 |
+
BitParser.add(BLACK, 54, "000000111000")
|
269 |
+
BitParser.add(BLACK, 55, "000000100111")
|
270 |
+
BitParser.add(BLACK, 56, "000000101000")
|
271 |
+
BitParser.add(BLACK, 57, "000001011000")
|
272 |
+
BitParser.add(BLACK, 58, "000001011001")
|
273 |
+
BitParser.add(BLACK, 59, "000000101011")
|
274 |
+
BitParser.add(BLACK, 60, "000000101100")
|
275 |
+
BitParser.add(BLACK, 61, "000001011010")
|
276 |
+
BitParser.add(BLACK, 62, "000001100110")
|
277 |
+
BitParser.add(BLACK, 63, "000001100111")
|
278 |
+
BitParser.add(BLACK, 64, "0000001111")
|
279 |
+
BitParser.add(BLACK, 128, "000011001000")
|
280 |
+
BitParser.add(BLACK, 192, "000011001001")
|
281 |
+
BitParser.add(BLACK, 256, "000001011011")
|
282 |
+
BitParser.add(BLACK, 320, "000000110011")
|
283 |
+
BitParser.add(BLACK, 384, "000000110100")
|
284 |
+
BitParser.add(BLACK, 448, "000000110101")
|
285 |
+
BitParser.add(BLACK, 512, "0000001101100")
|
286 |
+
BitParser.add(BLACK, 576, "0000001101101")
|
287 |
+
BitParser.add(BLACK, 640, "0000001001010")
|
288 |
+
BitParser.add(BLACK, 704, "0000001001011")
|
289 |
+
BitParser.add(BLACK, 768, "0000001001100")
|
290 |
+
BitParser.add(BLACK, 832, "0000001001101")
|
291 |
+
BitParser.add(BLACK, 896, "0000001110010")
|
292 |
+
BitParser.add(BLACK, 960, "0000001110011")
|
293 |
+
BitParser.add(BLACK, 1024, "0000001110100")
|
294 |
+
BitParser.add(BLACK, 1088, "0000001110101")
|
295 |
+
BitParser.add(BLACK, 1152, "0000001110110")
|
296 |
+
BitParser.add(BLACK, 1216, "0000001110111")
|
297 |
+
BitParser.add(BLACK, 1280, "0000001010010")
|
298 |
+
BitParser.add(BLACK, 1344, "0000001010011")
|
299 |
+
BitParser.add(BLACK, 1408, "0000001010100")
|
300 |
+
BitParser.add(BLACK, 1472, "0000001010101")
|
301 |
+
BitParser.add(BLACK, 1536, "0000001011010")
|
302 |
+
BitParser.add(BLACK, 1600, "0000001011011")
|
303 |
+
BitParser.add(BLACK, 1664, "0000001100100")
|
304 |
+
BitParser.add(BLACK, 1728, "0000001100101")
|
305 |
+
BitParser.add(BLACK, 1792, "00000001000")
|
306 |
+
BitParser.add(BLACK, 1856, "00000001100")
|
307 |
+
BitParser.add(BLACK, 1920, "00000001101")
|
308 |
+
BitParser.add(BLACK, 1984, "000000010010")
|
309 |
+
BitParser.add(BLACK, 2048, "000000010011")
|
310 |
+
BitParser.add(BLACK, 2112, "000000010100")
|
311 |
+
BitParser.add(BLACK, 2176, "000000010101")
|
312 |
+
BitParser.add(BLACK, 2240, "000000010110")
|
313 |
+
BitParser.add(BLACK, 2304, "000000010111")
|
314 |
+
BitParser.add(BLACK, 2368, "000000011100")
|
315 |
+
BitParser.add(BLACK, 2432, "000000011101")
|
316 |
+
BitParser.add(BLACK, 2496, "000000011110")
|
317 |
+
BitParser.add(BLACK, 2560, "000000011111")
|
318 |
+
|
319 |
+
UNCOMPRESSED = [None, None]
|
320 |
+
BitParser.add(UNCOMPRESSED, "1", "1")
|
321 |
+
BitParser.add(UNCOMPRESSED, "01", "01")
|
322 |
+
BitParser.add(UNCOMPRESSED, "001", "001")
|
323 |
+
BitParser.add(UNCOMPRESSED, "0001", "0001")
|
324 |
+
BitParser.add(UNCOMPRESSED, "00001", "00001")
|
325 |
+
BitParser.add(UNCOMPRESSED, "00000", "000001")
|
326 |
+
BitParser.add(UNCOMPRESSED, "T00", "00000011")
|
327 |
+
BitParser.add(UNCOMPRESSED, "T10", "00000010")
|
328 |
+
BitParser.add(UNCOMPRESSED, "T000", "000000011")
|
329 |
+
BitParser.add(UNCOMPRESSED, "T100", "000000010")
|
330 |
+
BitParser.add(UNCOMPRESSED, "T0000", "0000000011")
|
331 |
+
BitParser.add(UNCOMPRESSED, "T1000", "0000000010")
|
332 |
+
BitParser.add(UNCOMPRESSED, "T00000", "00000000011")
|
333 |
+
BitParser.add(UNCOMPRESSED, "T10000", "00000000010")
|
334 |
+
|
335 |
+
class CCITTException(PDFException):
|
336 |
+
pass
|
337 |
+
|
338 |
+
class EOFB(CCITTException):
|
339 |
+
pass
|
340 |
+
|
341 |
+
class InvalidData(CCITTException):
|
342 |
+
pass
|
343 |
+
|
344 |
+
class ByteSkip(CCITTException):
|
345 |
+
pass
|
346 |
+
|
347 |
+
_color: int
|
348 |
+
|
349 |
+
def __init__(self, width: int, bytealign: bool = False) -> None:
|
350 |
+
BitParser.__init__(self)
|
351 |
+
self.width = width
|
352 |
+
self.bytealign = bytealign
|
353 |
+
self.reset()
|
354 |
+
|
355 |
+
def feedbytes(self, data: bytes) -> None:
|
356 |
+
for byte in get_bytes(data):
|
357 |
+
try:
|
358 |
+
for m in (128, 64, 32, 16, 8, 4, 2, 1):
|
359 |
+
self._parse_bit(byte & m)
|
360 |
+
except self.ByteSkip:
|
361 |
+
self._accept = self._parse_mode
|
362 |
+
self._state = self.MODE
|
363 |
+
except self.EOFB:
|
364 |
+
break
|
365 |
+
|
366 |
+
    def _parse_mode(self, mode: object) -> BitParserState:
        """Dispatch on a decoded 2-D mode code; return the next parse tree."""
        if mode == "p":
            # Pass mode: advance past the reference line's changing elements.
            self._do_pass()
            self._flush_line()
            return self.MODE
        elif mode == "h":
            # Horizontal mode: two run lengths follow (current color, then
            # the opposite color), accumulated in _n1/_n2.
            self._n1 = 0
            self._accept = self._parse_horiz1
            if self._color:
                return self.WHITE
            else:
                return self.BLACK
        elif mode == "u":
            # Uncompressed mode (flagged as untested in the module header).
            self._accept = self._parse_uncompressed
            return self.UNCOMPRESSED
        elif mode == "e":
            # End-of-facsimile-block code word.
            raise self.EOFB
        elif isinstance(mode, int):
            # Vertical mode: offset in -3..+3 relative to the reference line.
            self._do_vertical(mode)
            self._flush_line()
            return self.MODE
        else:
            raise self.InvalidData(mode)
|
389 |
+
|
390 |
+
def _parse_horiz1(self, n: Any) -> BitParserState:
|
391 |
+
if n is None:
|
392 |
+
raise self.InvalidData
|
393 |
+
self._n1 += n
|
394 |
+
if n < 64:
|
395 |
+
self._n2 = 0
|
396 |
+
self._color = 1 - self._color
|
397 |
+
self._accept = self._parse_horiz2
|
398 |
+
if self._color:
|
399 |
+
return self.WHITE
|
400 |
+
else:
|
401 |
+
return self.BLACK
|
402 |
+
|
403 |
+
def _parse_horiz2(self, n: Any) -> BitParserState:
|
404 |
+
if n is None:
|
405 |
+
raise self.InvalidData
|
406 |
+
self._n2 += n
|
407 |
+
if n < 64:
|
408 |
+
self._color = 1 - self._color
|
409 |
+
self._accept = self._parse_mode
|
410 |
+
self._do_horizontal(self._n1, self._n2)
|
411 |
+
self._flush_line()
|
412 |
+
return self.MODE
|
413 |
+
elif self._color:
|
414 |
+
return self.WHITE
|
415 |
+
else:
|
416 |
+
return self.BLACK
|
417 |
+
|
418 |
+
def _parse_uncompressed(self, bits: Optional[str]) -> BitParserState:
|
419 |
+
if not bits:
|
420 |
+
raise self.InvalidData
|
421 |
+
if bits.startswith("T"):
|
422 |
+
self._accept = self._parse_mode
|
423 |
+
self._color = int(bits[1])
|
424 |
+
self._do_uncompressed(bits[2:])
|
425 |
+
return self.MODE
|
426 |
+
else:
|
427 |
+
self._do_uncompressed(bits)
|
428 |
+
return self.UNCOMPRESSED
|
429 |
+
|
430 |
+
def _get_bits(self) -> str:
|
431 |
+
return "".join(str(b) for b in self._curline[: self._curpos])
|
432 |
+
|
433 |
+
def _get_refline(self, i: int) -> str:
|
434 |
+
if i < 0:
|
435 |
+
return "[]" + "".join(str(b) for b in self._refline)
|
436 |
+
elif len(self._refline) <= i:
|
437 |
+
return "".join(str(b) for b in self._refline) + "[]"
|
438 |
+
else:
|
439 |
+
return (
|
440 |
+
"".join(str(b) for b in self._refline[:i])
|
441 |
+
+ "["
|
442 |
+
+ str(self._refline[i])
|
443 |
+
+ "]"
|
444 |
+
+ "".join(str(b) for b in self._refline[i + 1 :])
|
445 |
+
)
|
446 |
+
|
447 |
+
def reset(self) -> None:
|
448 |
+
self._y = 0
|
449 |
+
self._curline = array.array("b", [1] * self.width)
|
450 |
+
self._reset_line()
|
451 |
+
self._accept = self._parse_mode
|
452 |
+
self._state = self.MODE
|
453 |
+
|
454 |
+
def output_line(self, y: int, bits: Sequence[int]) -> None:
|
455 |
+
print(y, "".join(str(b) for b in bits))
|
456 |
+
|
457 |
+
def _reset_line(self) -> None:
|
458 |
+
self._refline = self._curline
|
459 |
+
self._curline = array.array("b", [1] * self.width)
|
460 |
+
self._curpos = -1
|
461 |
+
self._color = 1
|
462 |
+
|
463 |
+
    def _flush_line(self) -> None:
        """Emit the current line once it is complete and start the next one."""
        if self.width <= self._curpos:
            self.output_line(self._y, self._curline)
            self._y += 1
            self._reset_line()
            if self.bytealign:
                # EncodedByteAlign: resynchronise on a byte boundary by
                # abandoning the remaining bits of the current byte
                # (caught in feedbytes).
                raise self.ByteSkip
|
470 |
+
|
471 |
+
    def _do_vertical(self, dx: int) -> None:
        """Vertical-mode coding: place the next color change at b1 + dx."""
        x1 = self._curpos + 1
        while 1:
            # Locate b1: the first changing element on the reference line
            # to the right of the current position whose color differs
            # from the current coding color.
            if x1 == 0:
                if self._color == 1 and self._refline[x1] != self._color:
                    break
            elif x1 == len(self._refline) or (
                self._refline[x1 - 1] == self._color
                and self._refline[x1] != self._color
            ):
                break
            x1 += 1
        x1 += dx
        x0 = max(0, self._curpos)
        # Clamp the run end to the valid pixel range of the line.
        x1 = max(0, min(self.width, x1))
        if x1 < x0:
            for x in range(x1, x0):
                self._curline[x] = self._color
        elif x0 < x1:
            for x in range(x0, x1):
                self._curline[x] = self._color
        self._curpos = x1
        # A vertical code always ends the current run: toggle the color.
        self._color = 1 - self._color
|
494 |
+
|
495 |
+
def _do_pass(self) -> None:
|
496 |
+
x1 = self._curpos + 1
|
497 |
+
while 1:
|
498 |
+
if x1 == 0:
|
499 |
+
if self._color == 1 and self._refline[x1] != self._color:
|
500 |
+
break
|
501 |
+
elif x1 == len(self._refline) or (
|
502 |
+
self._refline[x1 - 1] == self._color
|
503 |
+
and self._refline[x1] != self._color
|
504 |
+
):
|
505 |
+
break
|
506 |
+
x1 += 1
|
507 |
+
while 1:
|
508 |
+
if x1 == 0:
|
509 |
+
if self._color == 0 and self._refline[x1] == self._color:
|
510 |
+
break
|
511 |
+
elif x1 == len(self._refline) or (
|
512 |
+
self._refline[x1 - 1] != self._color
|
513 |
+
and self._refline[x1] == self._color
|
514 |
+
):
|
515 |
+
break
|
516 |
+
x1 += 1
|
517 |
+
for x in range(self._curpos, x1):
|
518 |
+
self._curline[x] = self._color
|
519 |
+
self._curpos = x1
|
520 |
+
|
521 |
+
    def _do_horizontal(self, n1: int, n2: int) -> None:
        """Horizontal-mode coding: write n1 pixels of the current color
        followed by n2 pixels of the opposite color."""
        if self._curpos < 0:
            self._curpos = 0
        x = self._curpos
        for _ in range(n1):
            if len(self._curline) <= x:
                break  # never write past the end of the line
            self._curline[x] = self._color
            x += 1
        for _ in range(n2):
            if len(self._curline) <= x:
                break
            self._curline[x] = 1 - self._color
            x += 1
        self._curpos = x
|
536 |
+
|
537 |
+
def _do_uncompressed(self, bits: str) -> None:
|
538 |
+
for c in bits:
|
539 |
+
self._curline[self._curpos] = int(c)
|
540 |
+
self._curpos += 1
|
541 |
+
self._flush_line()
|
542 |
+
|
543 |
+
|
544 |
+
class CCITTFaxDecoder(CCITTG4Parser):
    """CCITT Group 4 decoder that accumulates output as packed bytes."""

    def __init__(
        self,
        width: int,
        bytealign: bool = False,
        reversed: bool = False,
    ) -> None:
        # `reversed` corresponds to the PDF BlackIs1 parameter: when set,
        # decoded bits are inverted before being packed.
        CCITTG4Parser.__init__(self, width, bytealign=bytealign)
        self.reversed = reversed
        self._buf = b""

    def close(self) -> bytes:
        """Return all decoded rows packed MSB-first, 8 pixels per byte."""
        return self._buf

    def output_line(self, y: int, bits: Sequence[int]) -> None:
        # One byte per 8 pixels, rounded up; unused trailing bits stay 0.
        arr = array.array("B", [0] * ((len(bits) + 7) // 8))
        if self.reversed:
            bits = [1 - b for b in bits]
        for i, b in enumerate(bits):
            if b:
                # Set bit i counting from the MSB of its byte.
                arr[i // 8] += (128, 64, 32, 16, 8, 4, 2, 1)[i % 8]
        self._buf += arr.tobytes()
|
566 |
+
|
567 |
+
|
568 |
+
def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes:
    """Decode CCITT-fax-encoded stream data.

    Only pure two-dimensional (Group 4) encoding, i.e. K == -1, is
    supported; any other K raises PDFValueError.
    """
    K = params.get("K")
    if K != -1:
        # Group 3 one-/mixed-dimensional encodings are not implemented.
        raise PDFValueError(K)
    cols = cast(int, params.get("Columns"))
    bytealign = cast(bool, params.get("EncodedByteAlign"))
    reversed = cast(bool, params.get("BlackIs1"))
    parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=reversed)
    parser.feedbytes(data)
    return parser.close()
# test
def main(argv: List[str]) -> None:
    """Ad-hoc test driver for the CCITT G4 decoder.

    With no file arguments, runs this module's unittest suite. Otherwise,
    each argument is expected to be named ``<a>.<b>.<k>.<width>.<height>.<ext>``
    and is decoded with a pygame-backed parser that writes ``out.bmp``.
    """
    if not argv[1:]:
        import unittest

        unittest.main()
        return

    class Parser(CCITTG4Parser):
        def __init__(self, width: int, bytealign: bool = False) -> None:
            import pygame  # type: ignore[import]

            CCITTG4Parser.__init__(self, width, bytealign=bytealign)
            self.img = pygame.Surface((self.width, 1000))

        def output_line(self, y: int, bits: Sequence[int]) -> None:
            # White pixel for 1-bits, black for 0-bits.
            for x, b in enumerate(bits):
                if b:
                    self.img.set_at((x, y), (255, 255, 255))
                else:
                    self.img.set_at((x, y), (0, 0, 0))

        def close(self) -> None:
            import pygame

            pygame.image.save(self.img, "out.bmp")

    for path in argv[1:]:
        # Width is encoded in the file name (4th dot-separated field).
        (_, _, k, w, h, _) = path.split(".")
        parser = Parser(int(w))
        # `with` guarantees the file is closed even if decoding raises;
        # the original leaked the handle on any error after open().
        with open(path, "rb") as fp:
            parser.feedbytes(fp.read())
        parser.close()
pdf2zh/cmapdb.py
ADDED
@@ -0,0 +1,471 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Adobe character mapping (CMap) support.
|
2 |
+
|
3 |
+
CMaps provide the mapping between character codes and Unicode
|
4 |
+
code-points to character ids (CIDs).
|
5 |
+
|
6 |
+
More information is available on:
|
7 |
+
|
8 |
+
https://github.com/adobe-type-tools/cmap-resources
|
9 |
+
|
10 |
+
"""
|
11 |
+
|
12 |
+
import gzip
|
13 |
+
import logging
|
14 |
+
import os
|
15 |
+
import os.path
|
16 |
+
import pickle as pickle
|
17 |
+
import struct
|
18 |
+
import sys
|
19 |
+
from typing import (
|
20 |
+
Any,
|
21 |
+
BinaryIO,
|
22 |
+
Dict,
|
23 |
+
Iterable,
|
24 |
+
Iterator,
|
25 |
+
List,
|
26 |
+
MutableMapping,
|
27 |
+
Optional,
|
28 |
+
Set,
|
29 |
+
TextIO,
|
30 |
+
Tuple,
|
31 |
+
Union,
|
32 |
+
cast,
|
33 |
+
)
|
34 |
+
|
35 |
+
from pdf2zh.encodingdb import name2unicode
|
36 |
+
from pdf2zh.pdfexceptions import PDFException, PDFTypeError
|
37 |
+
from pdf2zh.psexceptions import PSEOF, PSSyntaxError
|
38 |
+
from pdf2zh.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name
|
39 |
+
from pdf2zh.utils import choplist, nunpack
|
40 |
+
|
41 |
+
log = logging.getLogger(__name__)
|
42 |
+
|
43 |
+
|
44 |
+
class CMapError(PDFException):
    """Raised when CMap data cannot be found or parsed."""

    pass
class CMapBase:
    """Common interface for all character-code maps.

    Stores arbitrary CMap attributes (CMapName, WMode, ...) and defines the
    hooks that concrete maps override.
    """

    debug = 0

    def __init__(self, **kwargs: object) -> None:
        self.attrs: MutableMapping[str, object] = dict(kwargs)

    def is_vertical(self) -> bool:
        """True when the writing mode (WMode attribute) is vertical."""
        return self.attrs.get("WMode", 0) != 0

    def set_attr(self, k: str, v: object) -> None:
        self.attrs[k] = v

    def add_code2cid(self, code: str, cid: int) -> None:
        """Hook for subclasses mapping code strings to CIDs; no-op here."""
        pass

    def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
        """Hook for subclasses mapping CIDs to unicode; no-op here."""
        pass

    def use_cmap(self, cmap: "CMapBase") -> None:
        """Hook for merging another cmap into this one; no-op here."""
        pass

    def decode(self, code: bytes) -> Iterable[int]:
        raise NotImplementedError
class CMap(CMapBase):
    """Maps multi-byte character codes to CIDs via a nested byte tree.

    ``code2cid`` maps a byte either to a CID (int, terminal) or to another
    dict (the next level of a multi-byte code).
    """

    def __init__(self, **kwargs: Union[str, int]) -> None:
        CMapBase.__init__(self, **kwargs)
        self.code2cid: Dict[int, object] = {}

    def __repr__(self) -> str:
        return "<CMap: %s>" % self.attrs.get("CMapName")

    def use_cmap(self, cmap: CMapBase) -> None:
        """Deep-merge another CMap's code tree into this one."""
        assert isinstance(cmap, CMap), str(type(cmap))

        def merge(dst: Dict[int, object], src: Dict[int, object]) -> None:
            for key, value in src.items():
                if isinstance(value, dict):
                    branch: Dict[int, object] = {}
                    dst[key] = branch
                    merge(branch, value)
                else:
                    dst[key] = value

        merge(self.code2cid, cmap.code2cid)

    def decode(self, code: bytes) -> Iterator[int]:
        """Walk the code tree over the bytes of ``code``, yielding each CID.

        After a terminal (CID) or an unknown byte, traversal restarts at the
        root of the tree.
        """
        node = self.code2cid
        for byte in code:
            if byte in node:
                entry = node[byte]
                if isinstance(entry, int):
                    yield entry
                    node = self.code2cid
                else:
                    node = cast(Dict[int, object], entry)
            else:
                # Unknown byte: restart from the root.
                node = self.code2cid

    def dump(
        self,
        out: TextIO = sys.stdout,
        code2cid: Optional[Dict[int, object]] = None,
        code: Tuple[int, ...] = (),
    ) -> None:
        """Recursively print every code -> cid mapping to ``out``."""
        if code2cid is None:
            code2cid = self.code2cid
            code = ()
        for key, value in sorted(code2cid.items()):
            prefix = code + (key,)
            if isinstance(value, int):
                out.write("code %r = cid %d\n" % (prefix, value))
            else:
                self.dump(
                    out=out, code2cid=cast(Dict[int, object], value), code=prefix
                )
class IdentityCMap(CMapBase):
    """Identity mapping: each 2-byte big-endian code is its own CID."""

    def decode(self, code: bytes) -> Tuple[int, ...]:
        pairs = len(code) // 2
        if not pairs:
            return ()
        return struct.unpack(">%dH" % pairs, code)
class IdentityCMapByte(IdentityCMap):
    """Identity mapping over single bytes: each byte value is its own CID."""

    def decode(self, code: bytes) -> Tuple[int, ...]:
        count = len(code)
        if not count:
            return ()
        return struct.unpack(">%dB" % count, code)
class UnicodeMap(CMapBase):
    """Maps CIDs to unicode strings (the ToUnicode direction)."""

    def __init__(self, **kwargs: Union[str, int]) -> None:
        CMapBase.__init__(self, **kwargs)
        self.cid2unichr: Dict[int, str] = {}

    def __repr__(self) -> str:
        return "<UnicodeMap: %s>" % self.attrs.get("CMapName")

    def get_unichr(self, cid: int) -> str:
        """Return the unicode string for ``cid``; KeyError if unmapped."""
        return self.cid2unichr[cid]

    def dump(self, out: TextIO = sys.stdout) -> None:
        """Print every cid -> unicode mapping to ``out``."""
        for cid, text in sorted(self.cid2unichr.items()):
            out.write("cid %d = unicode %r\n" % (cid, text))
class IdentityUnicodeMap(UnicodeMap):
    """UnicodeMap where every CID is its own unicode codepoint."""

    def get_unichr(self, cid: int) -> str:
        """Interpret the character id directly as a unicode codepoint."""
        return chr(cid)
class FileCMap(CMap):
    """CMap populated incrementally while parsing a CMap file."""

    def add_code2cid(self, code: str, cid: int) -> None:
        """Insert a code-string -> cid mapping into the nested code tree.

        Each character of ``code`` is one level of the tree; intermediate
        branches are created on demand.
        """
        assert isinstance(code, str) and isinstance(cid, int), str(
            (type(code), type(cid)),
        )
        node = self.code2cid
        # Descend (creating branches as needed) through all but the last byte.
        for ch in code[:-1]:
            node = cast(Dict[int, object], node.setdefault(ord(ch), {}))
        node[ord(code[-1])] = cid
class FileUnicodeMap(UnicodeMap):
    """UnicodeMap populated incrementally while parsing a ToUnicode CMap."""

    def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
        """Record a cid -> unicode mapping from one of three source forms."""
        assert isinstance(cid, int), str(type(cid))
        if isinstance(code, PSLiteral):
            # Adobe glyph name.
            assert isinstance(code.name, str)
            text = name2unicode(code.name)
        elif isinstance(code, bytes):
            # UTF-16BE-encoded string.
            text = code.decode("UTF-16BE", "ignore")
        elif isinstance(code, int):
            # Direct codepoint.
            text = chr(code)
        else:
            raise PDFTypeError(code)

        # Some broken fonts map one cid to both a plain space and a
        # no-break space (U+00A0); keep the plain space in that case.
        if text == "\u00a0" and self.cid2unichr.get(cid) == " ":
            return
        self.cid2unichr[cid] = text
class PyCMap(CMap):
    """CMap whose code tree comes from a pre-compiled (pickled) module.

    ``module`` is the module-like object produced by ``CMapDB._load_data``;
    it must expose ``CODE2CID`` and ``IS_VERTICAL``.
    """

    def __init__(self, name: str, module: Any) -> None:
        super().__init__(CMapName=name)
        self.code2cid = module.CODE2CID
        if module.IS_VERTICAL:
            self.attrs["WMode"] = 1
class PyUnicodeMap(UnicodeMap):
    """UnicodeMap backed by a pre-compiled (pickled) module.

    Selects the horizontal (``CID2UNICHR_H``) or vertical
    (``CID2UNICHR_V``) table depending on ``vertical``.
    """

    def __init__(self, name: str, module: Any, vertical: bool) -> None:
        super().__init__(CMapName=name)
        if vertical:
            self.cid2unichr = module.CID2UNICHR_V
            self.attrs["WMode"] = 1
        else:
            self.cid2unichr = module.CID2UNICHR_H
class CMapDB:
    """Loads and caches the pre-compiled CMap data shipped with pdf2zh."""

    _cmap_cache: Dict[str, PyCMap] = {}
    _umap_cache: Dict[str, List[PyUnicodeMap]] = {}

    class CMapNotFound(CMapError):
        pass

    @classmethod
    def _load_data(cls, name: str) -> Any:
        """Load ``<name>.pickle.gz`` from CMAP_PATH or the bundled directory.

        Returns a module-like class exposing the pickled attributes.
        Raises ``CMapDB.CMapNotFound`` if no such file exists.
        """
        name = name.replace("\0", "")
        filename = "%s.pickle.gz" % name
        cmap_paths = (
            os.environ.get("CMAP_PATH", "/usr/share/pdf2zh/"),
            os.path.join(os.path.dirname(__file__), "cmap"),
        )
        for directory in cmap_paths:
            path = os.path.join(directory, filename)
            if os.path.exists(path):
                # SECURITY: pickle.loads executes arbitrary code from the
                # file; only point CMAP_PATH at trusted, bundled data.
                # `with` replaces the original try/finally close.
                with gzip.open(path) as gzfile:
                    return type(str(name), (), pickle.loads(gzfile.read()))
        raise CMapDB.CMapNotFound(name)

    @classmethod
    def get_cmap(cls, name: str) -> CMapBase:
        """Return the (cached) CMap for ``name``.

        The four identity encodings are built inline without touching disk.
        """
        if name == "Identity-H":
            return IdentityCMap(WMode=0)
        elif name == "Identity-V":
            return IdentityCMap(WMode=1)
        elif name == "OneByteIdentityH":
            return IdentityCMapByte(WMode=0)
        elif name == "OneByteIdentityV":
            return IdentityCMapByte(WMode=1)
        try:
            return cls._cmap_cache[name]
        except KeyError:
            pass
        data = cls._load_data(name)
        cls._cmap_cache[name] = cmap = PyCMap(name, data)
        return cmap

    @classmethod
    def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
        """Return the (cached) ToUnicode map for ``name``.

        Both the horizontal and vertical variants are built and cached on
        first use; ``vertical`` (bool) indexes into the cached pair.
        """
        try:
            return cls._umap_cache[name][vertical]
        except KeyError:
            pass
        data = cls._load_data("to-unicode-%s" % name)
        cls._umap_cache[name] = [PyUnicodeMap(name, data, v) for v in (False, True)]
        return cls._umap_cache[name][vertical]
class CMapParser(PSStackParser[PSKeyword]):
    """PostScript-stack parser for CMap / ToUnicode streams.

    Feeds parsed operands into the wrapped ``cmap`` object as keyword
    operators (begincidrange, beginbfchar, ...) are encountered.
    """

    def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # some ToUnicode maps don't have "begincmap" keyword.
        self._in_cmap = True
        self._warnings: Set[str] = set()

    def run(self) -> None:
        """Parse the whole stream; EOF simply terminates parsing."""
        try:
            self.nextobject()
        except PSEOF:
            pass

    KEYWORD_BEGINCMAP = KWD(b"begincmap")
    KEYWORD_ENDCMAP = KWD(b"endcmap")
    KEYWORD_USECMAP = KWD(b"usecmap")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
    KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
    KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
    KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
    KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
    KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
    KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
    KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
    KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
    KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
    KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
    KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """ToUnicode CMaps

        See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.
        """
        if token is self.KEYWORD_BEGINCMAP:
            self._in_cmap = True
            self.popall()
            return

        elif token is self.KEYWORD_ENDCMAP:
            self._in_cmap = False
            return

        # Everything below is only meaningful inside begincmap/endcmap.
        if not self._in_cmap:
            return

        if token is self.KEYWORD_DEF:
            # /Name value def -> CMap attribute.
            try:
                ((_, k), (_, v)) = self.pop(2)
                self.cmap.set_attr(literal_name(k), v)
            except PSSyntaxError:
                pass
            return

        if token is self.KEYWORD_USECMAP:
            # Merge a referenced cmap; silently skip if it cannot be loaded.
            try:
                ((_, cmapname),) = self.pop(1)
                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
            except PSSyntaxError:
                pass
            except CMapDB.CMapNotFound:
                pass
            return

        # Codespace ranges are not used; just clear the operand stack.
        if token is self.KEYWORD_BEGINCODESPACERANGE:
            self.popall()
            return
        if token is self.KEYWORD_ENDCODESPACERANGE:
            self.popall()
            return

        if token is self.KEYWORD_BEGINCIDRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDRANGE:
            # Operand stack holds (start, end, cid) triples.
            objs = [obj for (__, obj) in self.popall()]
            for start_byte, end_byte, cid in choplist(3, objs):
                if not isinstance(start_byte, bytes):
                    self._warn_once("The start object of begincidrange is not a byte.")
                    continue
                if not isinstance(end_byte, bytes):
                    self._warn_once("The end object of begincidrange is not a byte.")
                    continue
                if not isinstance(cid, int):
                    self._warn_once("The cid object of begincidrange is not a byte.")
                    continue
                if len(start_byte) != len(end_byte):
                    self._warn_once(
                        "The start and end byte of begincidrange have "
                        "different lengths.",
                    )
                    continue
                # Only the last 4 bytes vary within a range; the prefix must
                # be identical on both ends.
                start_prefix = start_byte[:-4]
                end_prefix = end_byte[:-4]
                if start_prefix != end_prefix:
                    self._warn_once(
                        "The prefix of the start and end byte of "
                        "begincidrange are not the same.",
                    )
                    continue
                svar = start_byte[-4:]
                evar = end_byte[-4:]
                start = nunpack(svar)
                end = nunpack(evar)
                vlen = len(svar)
                for i in range(end - start + 1):
                    x = start_prefix + struct.pack(">L", start + i)[-vlen:]
                    self.cmap.add_cid2unichr(cid + i, x)
            return

        if token is self.KEYWORD_BEGINCIDCHAR:
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDCHAR:
            # Operand stack holds (cid, code) pairs.
            objs = [obj for (__, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                if isinstance(code, bytes) and isinstance(cid, int):
                    self.cmap.add_cid2unichr(cid, code)
            return

        if token is self.KEYWORD_BEGINBFRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDBFRANGE:
            # Operand stack holds (start, end, code-or-array) triples.
            objs = [obj for (__, obj) in self.popall()]
            for start_byte, end_byte, code in choplist(3, objs):
                if not isinstance(start_byte, bytes):
                    self._warn_once("The start object is not a byte.")
                    continue
                if not isinstance(end_byte, bytes):
                    self._warn_once("The end object is not a byte.")
                    continue
                if len(start_byte) != len(end_byte):
                    self._warn_once("The start and end byte have different lengths.")
                    continue
                start = nunpack(start_byte)
                end = nunpack(end_byte)
                if isinstance(code, list):
                    # Explicit per-cid destination strings.
                    if len(code) != end - start + 1:
                        self._warn_once(
                            "The difference between the start and end "
                            "offsets does not match the code length.",
                        )
                    for cid, unicode_value in zip(range(start, end + 1), code):
                        self.cmap.add_cid2unichr(cid, unicode_value)
                else:
                    # Single base code; successive cids increment the last
                    # (up to 4) bytes of the destination.
                    assert isinstance(code, bytes)
                    var = code[-4:]
                    base = nunpack(var)
                    prefix = code[:-4]
                    vlen = len(var)
                    for i in range(end - start + 1):
                        x = prefix + struct.pack(">L", base + i)[-vlen:]
                        self.cmap.add_cid2unichr(start + i, x)
            return

        if token is self.KEYWORD_BEGINBFCHAR:
            self.popall()
            return

        if token is self.KEYWORD_ENDBFCHAR:
            # Operand stack holds (cid-bytes, code) pairs.
            objs = [obj for (__, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                if isinstance(cid, bytes) and isinstance(code, bytes):
                    self.cmap.add_cid2unichr(nunpack(cid), code)
            return

        # notdef ranges are ignored.
        if token is self.KEYWORD_BEGINNOTDEFRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDNOTDEFRANGE:
            self.popall()
            return

        # Any other token is an operand; keep it on the stack.
        self.push((pos, token))

    def _warn_once(self, msg: str) -> None:
        """Warn once for each unique message"""
        if msg not in self._warnings:
            self._warnings.add(msg)
            base_msg = (
                "Ignoring (part of) ToUnicode map because the PDF data "
                "does not conform to the format. This could result in "
                "(cid) values in the output. "
            )
            log.warning(base_msg + msg)
pdf2zh/converter.py
ADDED
@@ -0,0 +1,1384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pdf2zh.utils import (
|
2 |
+
AnyIO,
|
3 |
+
Matrix,
|
4 |
+
PathSegment,
|
5 |
+
Point,
|
6 |
+
Rect,
|
7 |
+
apply_matrix_pt,
|
8 |
+
bbox2str,
|
9 |
+
enc,
|
10 |
+
make_compat_str,
|
11 |
+
mult_matrix,
|
12 |
+
matrix_scale,
|
13 |
+
)
|
14 |
+
from pdf2zh.pdftypes import PDFStream
|
15 |
+
from pdf2zh.pdfpage import PDFPage
|
16 |
+
from pdf2zh.pdfinterp import PDFGraphicState, PDFResourceManager
|
17 |
+
from pdf2zh.pdffont import PDFFont, PDFUnicodeNotDefined, PDFCIDFont
|
18 |
+
from pdf2zh.pdfexceptions import PDFValueError
|
19 |
+
from pdf2zh.pdfdevice import PDFTextDevice
|
20 |
+
from pdf2zh.pdfcolor import PDFColorSpace
|
21 |
+
from pdf2zh.layout import (
|
22 |
+
LAParams,
|
23 |
+
LTAnno,
|
24 |
+
LTChar,
|
25 |
+
LTComponent,
|
26 |
+
LTCurve,
|
27 |
+
LTFigure,
|
28 |
+
LTImage,
|
29 |
+
LTItem,
|
30 |
+
LTLayoutContainer,
|
31 |
+
LTLine,
|
32 |
+
LTPage,
|
33 |
+
LTRect,
|
34 |
+
LTText,
|
35 |
+
LTTextBox,
|
36 |
+
LTTextBoxVertical,
|
37 |
+
LTTextGroup,
|
38 |
+
LTTextLine,
|
39 |
+
TextGroupElement,
|
40 |
+
)
|
41 |
+
from pdf2zh.image import ImageWriter
|
42 |
+
from pdf2zh import utils
|
43 |
+
import io
|
44 |
+
import logging
|
45 |
+
import re
|
46 |
+
from typing import (
|
47 |
+
BinaryIO,
|
48 |
+
Dict,
|
49 |
+
Generic,
|
50 |
+
List,
|
51 |
+
Optional,
|
52 |
+
Sequence,
|
53 |
+
TextIO,
|
54 |
+
Tuple,
|
55 |
+
TypeVar,
|
56 |
+
Union,
|
57 |
+
cast,
|
58 |
+
)
|
59 |
+
import concurrent.futures
|
60 |
+
import numpy as np
|
61 |
+
import unicodedata
|
62 |
+
from tenacity import retry, wait_fixed
|
63 |
+
from pdf2zh import cache
|
64 |
+
from pdf2zh.translator import (
|
65 |
+
BaseTranslator,
|
66 |
+
GoogleTranslator,
|
67 |
+
DeepLTranslator,
|
68 |
+
DeepLXTranslator,
|
69 |
+
OllamaTranslator,
|
70 |
+
OpenAITranslator,
|
71 |
+
AzureTranslator,
|
72 |
+
TencentTranslator,
|
73 |
+
)
|
74 |
+
|
75 |
+
|
76 |
+
def remove_control_characters(s):
    """Return ``s`` with all Unicode control characters (category C*) removed."""
    kept = (ch for ch in s if not unicodedata.category(ch).startswith("C"))
    return "".join(kept)
78 |
+
|
79 |
+
|
80 |
+
log = logging.getLogger(__name__)
|
81 |
+
|
82 |
+
|
83 |
+
class PDFLayoutAnalyzer(PDFTextDevice):
|
84 |
+
cur_item: LTLayoutContainer
|
85 |
+
ctm: Matrix
|
86 |
+
|
87 |
+
def __init__(
|
88 |
+
self,
|
89 |
+
rsrcmgr: PDFResourceManager,
|
90 |
+
pageno: int = 1,
|
91 |
+
laparams: Optional[LAParams] = None,
|
92 |
+
) -> None:
|
93 |
+
PDFTextDevice.__init__(self, rsrcmgr)
|
94 |
+
self.pageno = pageno
|
95 |
+
self.laparams = laparams
|
96 |
+
self._stack: List[LTLayoutContainer] = []
|
97 |
+
|
98 |
+
def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
|
99 |
+
# (x0, y0, x1, y1) = page.mediabox
|
100 |
+
(x0, y0, x1, y1) = page.cropbox
|
101 |
+
(x0, y0) = apply_matrix_pt(ctm, (x0, y0))
|
102 |
+
(x1, y1) = apply_matrix_pt(ctm, (x1, y1))
|
103 |
+
mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1))
|
104 |
+
self.cur_item = LTPage(page.pageno, mediabox)
|
105 |
+
|
106 |
+
def end_page(self, page: PDFPage):
|
107 |
+
assert not self._stack, str(len(self._stack))
|
108 |
+
assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
|
109 |
+
# 取消默认排版分析
|
110 |
+
# if self.laparams is not None:
|
111 |
+
# self.cur_item.analyze(self.laparams)
|
112 |
+
self.pageno += 1
|
113 |
+
return self.receive_layout(self.cur_item)
|
114 |
+
|
115 |
+
def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
|
116 |
+
self._stack.append(self.cur_item)
|
117 |
+
self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
|
118 |
+
self.cur_item.pageid = self._stack[-1].pageid
|
119 |
+
|
120 |
+
def end_figure(self, _: str) -> None:
|
121 |
+
fig = self.cur_item
|
122 |
+
assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
|
123 |
+
self.cur_item = self._stack.pop()
|
124 |
+
self.cur_item.add(fig)
|
125 |
+
return self.receive_layout(fig)
|
126 |
+
|
127 |
+
def render_image(self, name: str, stream: PDFStream) -> None:
|
128 |
+
assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
|
129 |
+
item = LTImage(
|
130 |
+
name,
|
131 |
+
stream,
|
132 |
+
(self.cur_item.x0, self.cur_item.y0, self.cur_item.x1, self.cur_item.y1),
|
133 |
+
)
|
134 |
+
self.cur_item.add(item)
|
135 |
+
|
136 |
+
def paint_path(
|
137 |
+
self,
|
138 |
+
gstate: PDFGraphicState,
|
139 |
+
stroke: bool,
|
140 |
+
fill: bool,
|
141 |
+
evenodd: bool,
|
142 |
+
path: Sequence[PathSegment],
|
143 |
+
) -> None:
|
144 |
+
"""Paint paths described in section 4.4 of the PDF reference manual"""
|
145 |
+
shape = "".join(x[0] for x in path)
|
146 |
+
|
147 |
+
if shape[:1] != "m":
|
148 |
+
# Per PDF Reference Section 4.4.1, "path construction operators may
|
149 |
+
# be invoked in any sequence, but the first one invoked must be m
|
150 |
+
# or re to begin a new subpath." Since pdf2zh.six already
|
151 |
+
# converts all `re` (rectangle) operators to their equivelent
|
152 |
+
# `mlllh` representation, paths ingested by `.paint_path(...)` that
|
153 |
+
# do not begin with the `m` operator are invalid.
|
154 |
+
pass
|
155 |
+
|
156 |
+
elif shape.count("m") > 1:
|
157 |
+
# recurse if there are multiple m's in this shape
|
158 |
+
for m in re.finditer(r"m[^m]+", shape):
|
159 |
+
subpath = path[m.start(0) : m.end(0)]
|
160 |
+
self.paint_path(gstate, stroke, fill, evenodd, subpath)
|
161 |
+
|
162 |
+
else:
|
163 |
+
# Although the 'h' command does not not literally provide a
|
164 |
+
# point-position, its position is (by definition) equal to the
|
165 |
+
# subpath's starting point.
|
166 |
+
#
|
167 |
+
# And, per Section 4.4's Table 4.9, all other path commands place
|
168 |
+
# their point-position in their final two arguments. (Any preceding
|
169 |
+
# arguments represent control points on Bézier curves.)
|
170 |
+
raw_pts = [
|
171 |
+
cast(Point, p[-2:] if p[0] != "h" else path[0][-2:]) for p in path
|
172 |
+
]
|
173 |
+
pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]
|
174 |
+
|
175 |
+
operators = [str(operation[0]) for operation in path]
|
176 |
+
transformed_points = [
|
177 |
+
[
|
178 |
+
apply_matrix_pt(self.ctm, (float(operand1), float(operand2)))
|
179 |
+
for operand1, operand2 in zip(operation[1::2], operation[2::2])
|
180 |
+
]
|
181 |
+
for operation in path
|
182 |
+
]
|
183 |
+
transformed_path = [
|
184 |
+
cast(PathSegment, (o, *p))
|
185 |
+
for o, p in zip(operators, transformed_points)
|
186 |
+
]
|
187 |
+
|
188 |
+
if shape in {"mlh", "ml"}:
|
189 |
+
# single line segment
|
190 |
+
#
|
191 |
+
# Note: 'ml', in conditional above, is a frequent anomaly
|
192 |
+
# that we want to support.
|
193 |
+
line = LTLine(
|
194 |
+
gstate.linewidth * matrix_scale(self.ctm),
|
195 |
+
pts[0],
|
196 |
+
pts[1],
|
197 |
+
stroke,
|
198 |
+
fill,
|
199 |
+
evenodd,
|
200 |
+
gstate.scolor,
|
201 |
+
gstate.ncolor,
|
202 |
+
original_path=transformed_path,
|
203 |
+
dashing_style=gstate.dash,
|
204 |
+
)
|
205 |
+
self.cur_item.add(line)
|
206 |
+
|
207 |
+
elif shape in {"mlllh", "mllll"}:
|
208 |
+
(x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts
|
209 |
+
|
210 |
+
is_closed_loop = pts[0] == pts[4]
|
211 |
+
has_square_coordinates = (
|
212 |
+
x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
|
213 |
+
) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
|
214 |
+
if is_closed_loop and has_square_coordinates:
|
215 |
+
rect = LTRect(
|
216 |
+
gstate.linewidth * matrix_scale(self.ctm),
|
217 |
+
(*pts[0], *pts[2]),
|
218 |
+
stroke,
|
219 |
+
fill,
|
220 |
+
evenodd,
|
221 |
+
gstate.scolor,
|
222 |
+
gstate.ncolor,
|
223 |
+
transformed_path,
|
224 |
+
gstate.dash,
|
225 |
+
)
|
226 |
+
self.cur_item.add(rect)
|
227 |
+
else:
|
228 |
+
curve = LTCurve(
|
229 |
+
gstate.linewidth * matrix_scale(self.ctm),
|
230 |
+
pts,
|
231 |
+
stroke,
|
232 |
+
fill,
|
233 |
+
evenodd,
|
234 |
+
gstate.scolor,
|
235 |
+
gstate.ncolor,
|
236 |
+
transformed_path,
|
237 |
+
gstate.dash,
|
238 |
+
)
|
239 |
+
self.cur_item.add(curve)
|
240 |
+
else:
|
241 |
+
curve = LTCurve(
|
242 |
+
gstate.linewidth * matrix_scale(self.ctm),
|
243 |
+
pts,
|
244 |
+
stroke,
|
245 |
+
fill,
|
246 |
+
evenodd,
|
247 |
+
gstate.scolor,
|
248 |
+
gstate.ncolor,
|
249 |
+
transformed_path,
|
250 |
+
gstate.dash,
|
251 |
+
)
|
252 |
+
self.cur_item.add(curve)
|
253 |
+
|
254 |
+
    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Render a single glyph as an LTChar and add it to the current item.

        Decodes *cid* through the font's ToUnicode map (falling back to
        ``handle_undefined_char`` when the CID has no Unicode mapping),
        builds an ``LTChar`` layout object, and returns its advance width
        so the interpreter can move the text cursor.

        :param matrix: current text rendering matrix.
        :param font: font the glyph is drawn with.
        :param fontsize: nominal font size in text space.
        :param scaling: horizontal scaling factor.
        :param rise: text rise (vertical offset) in text space.
        :param cid: character ID of the glyph within *font*.
        :param ncs: current non-stroking color space.
        :param graphicstate: current graphic state (colors, dash, ...).
        :return: the glyph's advance width (``item.adv``).
        """
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            # No Unicode mapping for this CID; substitute a "(cid:n)" marker.
            text = self.handle_undefined_char(font, cid)
        textwidth = font.char_width(cid)
        textdisp = font.char_disp(cid)
        item = LTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            textwidth,
            textdisp,
            ncs,
            graphicstate,
        )
        self.cur_item.add(item)
        item.cid = cid  # HACK: stash the original character code on the layout item
        return item.adv
|
287 |
+
|
288 |
+
    def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
        """Return a placeholder string for a CID with no Unicode mapping.

        The ``(cid:n)`` form is the conventional pdfminer marker for an
        undecodable glyph; downstream code pattern-matches on it.
        """
        # log.debug("undefined: %r, %r", font, cid)
        return "(cid:%d)" % cid
|
291 |
+
|
292 |
+
    def receive_layout(self, ltpage: LTPage) -> None:
        """Hook invoked with the finished page layout; subclasses override."""
        pass
|
294 |
+
|
295 |
+
|
296 |
+
class PDFPageAggregator(PDFLayoutAnalyzer):
    """Layout analyzer that simply keeps the most recent laid-out page.

    After a page has been processed, call :meth:`get_result` to obtain the
    ``LTPage`` layout tree produced for it.
    """

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        super().__init__(rsrcmgr, pageno=pageno, laparams=laparams)
        # Layout of the last page processed; None until a page is received.
        self.result: Optional[LTPage] = None

    def receive_layout(self, ltpage: LTPage) -> None:
        """Store the finished page layout for later retrieval."""
        self.result = ltpage

    def get_result(self) -> LTPage:
        """Return the layout of the most recently processed page.

        Must only be called after at least one page has been processed.
        """
        page = self.result
        assert page is not None
        return page
|
312 |
+
|
313 |
+
|
314 |
+
# Some PDFConverter children support only binary I/O.
# Constrained TypeVar parameterizing PDFConverter over the kind of output
# stream it writes to: text, binary, or either (the project's AnyIO alias).
IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)
|
316 |
+
|
317 |
+
|
318 |
+
class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
    """Base class for converters that serialize laid-out pages to a stream.

    Owns the output stream ``outfp`` and records whether it is binary or
    text (``outfp_binary``) so subclasses know whether to encode text
    before writing.
    """

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: IOType,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
    ) -> None:
        """
        :param rsrcmgr: shared PDF resource manager.
        :param outfp: output stream (text or binary).
        :param codec: encoding used when writing to a binary stream.
        :param pageno: 1-based number of the first page.
        :param laparams: layout-analysis parameters, or None to disable.
        """
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        self.outfp: IOType = outfp
        self.codec = codec
        self.outfp_binary = self._is_binary_stream(self.outfp)

    @staticmethod
    def _is_binary_stream(outfp: AnyIO) -> bool:
        """Test whether a stream is binary.

        A ``mode`` attribute, when present, is authoritative ("b" means
        binary).  Otherwise the io class hierarchy is consulted, and any
        unrecognized stream type is assumed to be binary.
        """
        if hasattr(outfp, "mode"):
            # The stream declares its mode explicitly; trust it.
            return "b" in outfp.mode
        if isinstance(outfp, io.BytesIO):
            return True
        # io.StringIO is a TextIOBase subclass, so one check covers both.
        if isinstance(outfp, io.TextIOBase):
            return False
        # Unknown stream type: assume binary.
        return True
|
346 |
+
|
347 |
+
|
348 |
+
class TextConverter(PDFConverter[AnyIO]):
    """Converter that translates each page's text and re-typesets it.

    ``receive_layout`` walks the laid-out page, separates prose from
    formulas/subscripts, translates the prose paragraphs concurrently
    (with caching), and returns a PDF content-stream operator string
    (``BT ... ET``) that re-typesets the translated text and formulas.
    """

    # Maps the service name prefix to (translator class, whether the part
    # after ":" in the service string is passed as the model argument).
    _TRANSLATOR_REGISTRY = {
        "google": (GoogleTranslator, False),
        "deepl": (DeepLTranslator, False),
        "deeplx": (DeepLXTranslator, False),
        "ollama": (OllamaTranslator, True),
        "openai": (OpenAITranslator, True),
        "azure": (AzureTranslator, False),
        "tencent": (TencentTranslator, False),
    }

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        showpageno: bool = False,
        imagewriter: Optional[ImageWriter] = None,
        vfont: Optional[str] = None,
        vchar: Optional[str] = None,
        thread: int = 0,
        layout: Optional[dict] = None,
        lang_in: str = "",
        lang_out: str = "",
        service: str = "",
    ) -> None:
        """
        :param vfont: optional regex overriding the formula-font heuristic.
        :param vchar: optional regex overriding the formula-char heuristic.
        :param thread: number of worker threads for translation.
        :param layout: per-page layout class maps keyed by page id
            (default: empty dict; was previously a shared mutable default).
        :param service: translation service spec, "name" or "name:model".
        """
        super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.showpageno = showpageno
        self.imagewriter = imagewriter
        self.vfont = vfont
        self.vchar = vchar
        self.thread = thread
        # Avoid the mutable-default-argument trap: each instance gets its own dict.
        self.layout = {} if layout is None else layout
        param = service.split(":", 1)
        try:
            translator_cls, needs_model = self._TRANSLATOR_REGISTRY[param[0]]
        except KeyError:
            raise ValueError("Unsupported translation service") from None
        # For services that take a model name, param[1] is required; a
        # missing ":model" part raises IndexError, as it always did.
        model = param[1] if needs_model else None
        self.translator: BaseTranslator = translator_cls(service, lang_out, lang_in, model)

    def write_text(self, text: str) -> None:
        """Write *text* to the output stream, encoding if it is binary."""
        text = utils.compatible_encode_method(text, self.codec, "ignore")
        if self.outfp_binary:
            cast(BinaryIO, self.outfp).write(text.encode())
        else:
            cast(TextIO, self.outfp).write(text)

    # fmt: off
    def receive_layout(self, ltpage: LTPage):
        """Parse, translate and re-typeset one page; return PDF operators."""
        xt = None                # previous character
        sstk = []                # stack of paragraph text
        vstk = []                # current formula symbol group
        vlstk = []               # current formula line group
        vfix = 0                 # current formula vertical offset
        vbkt = 0                 # bracket nesting count inside an in-paragraph formula
        pstk = []                # stack of paragraph attributes
        lstk = []                # stack of global (non-formula) lines
        var = []                 # stack of formula symbol groups
        varl = []                # stack of formula line groups
        varf = []                # stack of formula vertical offsets
        vlen = []                # stack of formula widths
        xt_cls = -1              # layout class of the previous character's paragraph
        vmax = ltpage.width / 4  # max width of an inline formula
        ops = ""                 # rendered operator string

        def vflag(font, char):  # match formula (and subscript) fonts
            if re.match(r"\(cid:", char):
                return True
            # Font-name based rule.
            if self.vfont:
                if re.match(self.vfont, font):
                    return True
            else:
                if re.match(  # LaTeX fonts
                    r"(CM[^R]|MS|XY|MT|BL|RM|EU|LA|RS|LINE|TeX-|rsfs|txsy|wasy|.*Mono|.*Code|.*Ital|.*Sym)",
                    font,
                ):
                    return True
            # Character-set based rule.
            if self.vchar:
                if re.match(self.vchar, char):
                    return True
            else:
                if (
                    char
                    and char != " "  # not a space
                    and (
                        unicodedata.category(char[0])
                        in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"]  # modifier letters, math symbols, separators
                        or ord(char[0]) in range(0x370, 0x400)  # Greek letters
                    )
                ):
                    return True
            return False

        ############################################################
        # A. Parse the original document
        ptr = 0
        item = list(ltpage)
        while ptr < len(item):
            child = item[ptr]
            if isinstance(child, LTChar):
                cur_v = False
                fontname = child.fontname.split("+")[-1]
                layout = self.layout[ltpage.pageid]
                # ltpage.height may be a figure-internal height; use layout.shape uniformly.
                h, w = layout.shape
                # Look up the layout class of the current character.
                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
                cls = layout[cy, cx]
                if (  # does the current character belong to a formula?
                    cls == 0  # 1. class is "reserved region"
                    or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1][4] * 0.79)  # 2. subscript font: 0.76 subscripts and 0.799 capitals exist, 0.79 splits them while tolerating enlarged initials
                    or vflag(fontname, child.get_text())  # 3. formula font
                    or (child.matrix[0] == 0 and child.matrix[3] == 0)  # 4. vertical font
                ):
                    cur_v = True
                # Does this bracket group belong to a formula?
                if not cur_v:
                    if vstk and child.get_text() == "(":
                        cur_v = True
                        vbkt += 1
                    if vbkt and child.get_text() == ")":
                        cur_v = True
                        vbkt -= 1
                if (  # has the current formula ended?
                    not cur_v  # 1. current char is not part of a formula
                    or cls != xt_cls  # 2. current char belongs to a different paragraph
                    or (abs(child.x0 - xt.x0) > vmax and cls != 0)  # 3. in-paragraph line break; could be a long italic run or a wrapped fraction, so use a threshold to tell them apart
                ):
                    if vstk:
                        if (  # fix the formula's vertical offset using text to its right
                            not cur_v  # 1. current char is not part of a formula
                            and cls == xt_cls  # 2. same paragraph as the previous char
                            and child.x0 > max([vch.x0 for vch in vstk])  # 3. current char sits right of the formula
                        ):
                            vfix = vstk[0].y0 - child.y0
                        sstk[-1] += f"$v{len(var)}$"
                        var.append(vstk)
                        varl.append(vlstk)
                        varf.append(vfix)
                        vstk = []
                        vlstk = []
                        vfix = 0
                    # Current char is not in a formula, or is a formula's first char.
                    if not vstk:
                        if cls == xt_cls:  # same paragraph as the previous char
                            if child.x0 > xt.x1 + 1:  # add an inline space
                                sstk[-1] += " "
                            elif child.x1 < xt.x0:  # add a wrap space and mark that the source paragraph wraps
                                sstk[-1] += " "
                                pstk[-1][6] = True
                        else:  # start a new paragraph at this char
                            sstk.append("")
                            pstk.append([child.y0, child.x0, child.x0, child.x0, child.size, child.font, False])
                if not cur_v:  # push text
                    if (  # update the paragraph attributes from this char
                        child.size > pstk[-1][4] / 0.79  # 1. char is markedly larger than the paragraph font
                        or len(sstk[-1].strip()) == 1  # 2. char is the paragraph's second glyph (enlarged-initial case)
                        or vflag(pstk[-1][5].fontname.split("+")[-1], "")  # 3. paragraph font is a formula font
                        or re.match(  # 4. paragraph font is bold
                            r"(.*Medi|.*Bold)",
                            pstk[-1][5].fontname.split("+")[-1],
                            re.IGNORECASE,
                        )
                    ):
                        pstk[-1][0] -= child.size - pstk[-1][4]  # HACK: this vertical correction is imperfect, but works well enough
                        pstk[-1][4] = child.size
                        pstk[-1][5] = child.font
                    sstk[-1] += child.get_text()
                else:  # push formula
                    if (  # fix the formula's vertical offset using text to its left
                        not vstk  # 1. this is the formula's first char
                        and cls == xt_cls  # 2. same paragraph as the previous char
                        and child.x0 > xt.x0  # 3. previous char sits left of the formula
                    ):
                        vfix = child.y0 - xt.y0
                    vstk.append(child)
                # Update paragraph bounds here: after an in-paragraph line break the next char may start a formula, so handle it outside the branches.
                pstk[-1][2] = min(pstk[-1][2], child.x0)
                pstk[-1][3] = max(pstk[-1][3], child.x1)
                # Remember the previous char.
                xt = child
                xt_cls = cls
            elif isinstance(child, LTFigure):  # figure
                pass
            elif isinstance(child, LTLine):  # line
                layout = self.layout[ltpage.pageid]
                # ltpage.height may be a figure-internal height; use layout.shape uniformly.
                h, w = layout.shape
                # Look up the layout class of this line.
                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
                cls = layout[cy, cx]
                if vstk and cls == xt_cls:  # formula line
                    vlstk.append(child)
                else:  # global line
                    lstk.append(child)
            else:
                pass
            ptr += 1
        # Handle the tail of the page.
        if vstk:  # pop the last formula
            sstk[-1] += f"$v{len(var)}$"
            var.append(vstk)
            varl.append(vlstk)
            varf.append(vfix)
        log.debug("\n==========[VSTACK]==========\n")
        for id, v in enumerate(var):  # compute formula widths
            l = max([vch.x1 for vch in v]) - v[0].x0
            log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}')
            vlen.append(l)

        ############################################################
        # B. Translate the paragraphs
        log.debug("\n==========[SSTACK]==========\n")
        hash_key = cache.deterministic_hash("PDFMathTranslate")
        cache.create_cache(hash_key)

        @retry(wait=wait_fixed(1))
        def worker(s):  # multithreaded translation
            try:
                hash_key_paragraph = cache.deterministic_hash(
                    (s, str(self.translator))
                )
                new = cache.load_paragraph(hash_key, hash_key_paragraph)  # consult the cache
                if new is None:
                    new = self.translator.translate(s)
                    new = remove_control_characters(new)
                    cache.write_paragraph(hash_key, hash_key_paragraph, new)
                return new
            except BaseException as e:
                if log.isEnabledFor(logging.DEBUG):
                    log.exception(e)
                else:
                    log.exception(e, exc_info=False)
                raise e
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=self.thread
        ) as executor:
            news = list(executor.map(worker, sstk))

        ############################################################
        # C. Typeset the new document
        def raw_string(fcur, cstk):  # hex-encode a string for the target font
            if isinstance(self.fontmap[fcur], PDFCIDFont):  # pick the code width
                return "".join(["%04x" % ord(c) for c in cstk])
            else:
                return "".join(["%02x" % ord(c) for c in cstk])

        _x, _y = 0, 0
        for id, new in enumerate(news):
            tx = x = pstk[id][1]  # paragraph initial x
            y = pstk[id][0]       # paragraph top bound
            lt = pstk[id][2]      # paragraph left bound
            rt = pstk[id][3]      # paragraph right bound
            size = pstk[id][4]    # paragraph font size
            font = pstk[id][5]    # paragraph font
            lb = pstk[id][6]      # paragraph wrap flag
            cstk = ""             # current text buffer
            fcur = fcur_ = None   # current font
            ptr = 0
            log.debug(f"< {y} {x} {lt} {rt} {size} {font.fontname} {lb} > {sstk[id]} | {new}")
            while True:
                if ptr == len(new):  # end of paragraph
                    if cstk:
                        ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
                    break
                vy_regex = re.match(
                    r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
                )  # match a $vn$ formula marker; the leading $ sometimes gets dropped
                mod = False  # is the current formula a text modifier?
                if vy_regex:  # load a formula
                    ptr += len(vy_regex.group(0))
                    try:
                        vid = int(vy_regex.group(1).replace(" ", ""))
                        adv = vlen[vid]
                    except Exception:
                        continue  # the translator may invent an out-of-range marker
                    if len(var[vid]) == 1 and unicodedata.category(var[vid][0].get_text()[0]) in ["Lm", "Mn", "Sk"]:  # text modifier
                        mod = True
                else:  # load a text character
                    ch = new[ptr]
                    # if font.char_width(ord(ch)):
                    fcur_ = None
                    # The original font encoding is unreliable, so it is dropped entirely.
                    # try:
                    #     if font.widths.get(ord(ch)) and font.to_unichr(ord(ch))==ch:
                    #         fcur_=self.fontid[font] # original font
                    # except:
                    #     pass
                    try:
                        if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
                            fcur_ = "tiro"  # default Latin font
                    except Exception:
                        pass
                    if fcur_ is None:
                        fcur_ = "china-ss"  # default CJK font
                    # print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
                    adv = self.fontmap[fcur_].char_width(ord(ch)) * size
                    ptr += 1
                if (  # flush the text buffer
                    fcur_ != fcur  # 1. font changed
                    or vy_regex  # 2. inserting a formula
                    or x + adv > rt + 0.1 * size  # 3. reached the right bound (a whole line may be symbols, so allow for float error)
                ):
                    if cstk:
                        ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
                        cstk = ""
                    if lb and x + adv > rt + 0.1 * size:  # reached the right bound and the source paragraph wraps
                        x = lt
                        lang_space = {"zh-CN": 1.4, "zh-TW": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2}  # CJK
                        y -= size * lang_space.get(self.translator.lang_out, 1.1)  # most other languages fit with 1.1
                if vy_regex:  # insert the formula
                    fix = 0
                    if fcur is not None:  # adjust vertical offset for an in-paragraph formula
                        fix = varf[vid]
                    for vch in var[vid]:  # typeset the formula characters
                        vc = chr(vch.cid)
                        ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ "
                        if log.isEnabledFor(logging.DEBUG):
                            lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
                            _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
                    for l in varl[vid]:  # typeset the formula lines
                        if l.linewidth < 5:  # HACK: some documents use thick lines as image backgrounds
                            ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
                else:  # append to the text buffer
                    if not cstk:  # start of a line
                        tx = x
                        if x == lt and ch == " ":  # strip a paragraph-wrap space
                            adv = 0
                        else:
                            cstk += ch
                    else:
                        cstk += ch
                if mod:  # text modifier
                    adv = 0
                fcur = fcur_
                x += adv
                if log.isEnabledFor(logging.DEBUG):
                    lstk.append(LTLine(0.1, (_x, _y), (x, y)))
                    _x, _y = x, y
        for l in lstk:  # typeset the global lines
            if l.linewidth < 5:  # HACK: some documents use thick lines as image backgrounds
                ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
        ops = f"BT {ops}ET "
        return ops

    # Some dummy functions to save memory/CPU when all that is wanted
    # is text. This stops all the image and drawing output from being
    # recorded and taking up RAM.
    def render_image(self, name: str, stream: PDFStream) -> None:
        if self.imagewriter is not None:
            PDFConverter.render_image(self, name, stream)

    # def paint_path(
    #     self,
    #     gstate: PDFGraphicState,
    #     stroke: bool,
    #     fill: bool,
    #     evenodd: bool,
    #     path: Sequence[PathSegment],
    # ) -> None:
    #     pass
|
728 |
+
|
729 |
+
|
730 |
+
class HTMLConverter(PDFConverter[AnyIO]):
    """Converter that renders the page layout as absolutely-positioned HTML.

    Each layout element becomes a ``<span>``/``<div>`` placed with CSS
    absolute positioning; optional colored borders visualize the layout
    analysis when ``debug`` is enabled.
    """

    # Border colors used for layout debugging (enabled via debug=1).
    RECT_COLORS = {
        "figure": "yellow",
        "textline": "magenta",
        "textbox": "cyan",
        "textgroup": "red",
        "curve": "black",
        "page": "gray",
    }

    # Text colors used for layout debugging (enabled via debug=1).
    TEXT_COLORS = {
        "textbox": "blue",
        "char": "black",
    }

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        scale: float = 1,
        fontscale: float = 1.0,
        layoutmode: str = "normal",
        showpageno: bool = True,
        pagemargin: int = 50,
        imagewriter: Optional[ImageWriter] = None,
        debug: int = 0,
        rect_colors: Optional[Dict[str, str]] = None,
        text_colors: Optional[Dict[str, str]] = None,
    ) -> None:
        """
        :param scale: pixel scale applied to all coordinates.
        :param fontscale: extra scale applied to font sizes.
        :param layoutmode: "normal", "exact" (per-char placement) or "loose".
        :param pagemargin: vertical gap (px) inserted between pages.
        :param debug: non-zero enables the debug border/text colors.
        :raises PDFValueError: if the codec does not match the stream type.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        if text_colors is None:
            text_colors = {"char": "black"}
        if rect_colors is None:
            rect_colors = {"curve": "black", "page": "gray"}

        self.scale = scale
        self.fontscale = fontscale
        self.layoutmode = layoutmode
        self.showpageno = showpageno
        self.pagemargin = pagemargin
        self.imagewriter = imagewriter
        self.rect_colors = rect_colors
        self.text_colors = text_colors
        if debug:
            self.rect_colors.update(self.RECT_COLORS)
            self.text_colors.update(self.TEXT_COLORS)
        # Running vertical offset: pages are stacked top to bottom.
        self._yoffset: float = self.pagemargin
        # Currently open <span> font, and the stack saved across nested divs.
        self._font: Optional[Tuple[str, float]] = None
        self._fontstack: List[Optional[Tuple[str, float]]] = []
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoding when the output stream is binary."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Emit the opening <html>/<head>/<body> boilerplate."""
        self.write("<html><head>\n")
        if self.codec:
            s = (
                '<meta http-equiv="Content-Type" content="text/html; '
                'charset=%s">\n' % self.codec
            )
        else:
            s = '<meta http-equiv="Content-Type" content="text/html">\n'
        self.write(s)
        self.write("</head><body>\n")

    def write_footer(self) -> None:
        """Emit the page-link index and close the document."""
        page_links = [f'<a href="#{i}">{i}</a>' for i in range(1, self.pageno)]
        s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % ", ".join(
            page_links,
        )
        self.write(s)
        self.write("</body></html>\n")

    def write_text(self, text: str) -> None:
        """Write HTML-escaped text content."""
        self.write(enc(text))

    def place_rect(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Emit an absolutely-positioned bordered span if *color* is enabled."""
        color2 = self.rect_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; border: %s %dpx solid; '
                'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n'
                % (
                    color2,
                    borderwidth,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    w * self.scale,
                    h * self.scale,
                )
            )
            self.write(s)

    def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
        """Draw a debug border around a layout component's bounding box."""
        self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)

    def place_image(
        self,
        item: LTImage,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Export the image via the image writer and emit an <img> tag."""
        if self.imagewriter is not None:
            name = self.imagewriter.export_image(item)
            s = (
                '<img src="%s" border="%d" style="position:absolute; '
                'left:%dpx; top:%dpx;" width="%d" height="%d" />\n'
                % (
                    enc(name),
                    borderwidth,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    w * self.scale,
                    h * self.scale,
                )
            )
            self.write(s)

    def place_text(
        self,
        color: str,
        text: str,
        x: float,
        y: float,
        size: float,
    ) -> None:
        """Emit an absolutely-positioned text span if *color* is enabled."""
        color2 = self.text_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; color:%s; left:%dpx; '
                'top:%dpx; font-size:%dpx;">'
                % (
                    color2,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    size * self.scale * self.fontscale,
                )
            )
            self.write(s)
            self.write_text(text)
            self.write("</span>\n")

    def begin_div(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
        writing_mode: str = "False",
    ) -> None:
        """Open a positioned <div>; saves the current font on a stack."""
        self._fontstack.append(self._font)
        self._font = None
        s = (
            '<div style="position:absolute; border: %s %dpx solid; '
            "writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; "
            'height:%dpx;">'
            % (
                color,
                borderwidth,
                writing_mode,
                x * self.scale,
                (self._yoffset - y) * self.scale,
                w * self.scale,
                h * self.scale,
            )
        )
        self.write(s)

    def end_div(self, color: str) -> None:
        """Close the current <div>, closing any open font span first."""
        if self._font is not None:
            self.write("</span>")
        self._font = self._fontstack.pop()
        self.write("</div>")

    def put_text(self, text: str, fontname: str, fontsize: float) -> None:
        """Write text, opening a new font span when the font changes."""
        font = (fontname, fontsize)
        if font != self._font:
            if self._font is not None:
                self.write("</span>")
            # Remove subset tag from fontname, see PDF Reference 5.5.3
            fontname_without_subset_tag = fontname.split("+")[-1]
            self.write(
                '<span style="font-family: %s; font-size:%dpx">'
                % (fontname_without_subset_tag, fontsize * self.scale * self.fontscale),
            )
            self._font = font
        self.write_text(text)

    def put_newline(self) -> None:
        """Emit a line break."""
        self.write("<br>")

    def receive_layout(self, ltpage: LTPage) -> None:
        """Recursively render one laid-out page as HTML."""
        def show_group(item: Union[LTTextGroup, TextGroupElement]) -> None:
            # Debug rendering of the text-group tree (borders only).
            if isinstance(item, LTTextGroup):
                self.place_border("textgroup", 1, item)
                for child in item:
                    show_group(child)

        def render(item: LTItem) -> None:
            child: LTItem
            if isinstance(item, LTPage):
                self._yoffset += item.y1
                self.place_border("page", 1, item)
                if self.showpageno:
                    self.write(
                        '<div style="position:absolute; top:%dpx;">'
                        % ((self._yoffset - item.y1) * self.scale),
                    )
                    self.write(
                        f'<a name="{item.pageid}">Page {item.pageid}</a></div>\n',
                    )
                for child in item:
                    render(child)
                if item.groups is not None:
                    for group in item.groups:
                        show_group(group)
            elif isinstance(item, LTCurve):
                self.place_border("curve", 1, item)
            elif isinstance(item, LTFigure):
                self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
                for child in item:
                    render(child)
                self.end_div("figure")
            elif isinstance(item, LTImage):
                self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
            elif self.layoutmode == "exact":
                # Exact mode: place every character individually.
                if isinstance(item, LTTextLine):
                    self.place_border("textline", 1, item)
                    for child in item:
                        render(child)
                elif isinstance(item, LTTextBox):
                    self.place_border("textbox", 1, item)
                    self.place_text(
                        "textbox",
                        str(item.index + 1),
                        item.x0,
                        item.y1,
                        20,
                    )
                    for child in item:
                        render(child)
                elif isinstance(item, LTChar):
                    self.place_border("char", 1, item)
                    self.place_text(
                        "char",
                        item.get_text(),
                        item.x0,
                        item.y1,
                        item.size,
                    )
            elif isinstance(item, LTTextLine):
                for child in item:
                    render(child)
                if self.layoutmode != "loose":
                    self.put_newline()
            elif isinstance(item, LTTextBox):
                self.begin_div(
                    "textbox",
                    1,
                    item.x0,
                    item.y1,
                    item.width,
                    item.height,
                    item.get_writing_mode(),
                )
                for child in item:
                    render(child)
                self.end_div("textbox")
            elif isinstance(item, LTChar):
                fontname = make_compat_str(item.fontname)
                self.put_text(item.get_text(), fontname, item.size)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())

        render(ltpage)
        self._yoffset += self.pagemargin

    def close(self) -> None:
        """Finish the HTML document."""
        self.write_footer()
|
1043 |
+
|
1044 |
+
|
1045 |
+
class XMLConverter(PDFConverter[AnyIO]):
|
1046 |
+
CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]")
|
1047 |
+
|
1048 |
+
def __init__(
|
1049 |
+
self,
|
1050 |
+
rsrcmgr: PDFResourceManager,
|
1051 |
+
outfp: AnyIO,
|
1052 |
+
codec: str = "utf-8",
|
1053 |
+
pageno: int = 1,
|
1054 |
+
laparams: Optional[LAParams] = None,
|
1055 |
+
imagewriter: Optional[ImageWriter] = None,
|
1056 |
+
stripcontrol: bool = False,
|
1057 |
+
) -> None:
|
1058 |
+
PDFConverter.__init__(
|
1059 |
+
self,
|
1060 |
+
rsrcmgr,
|
1061 |
+
outfp,
|
1062 |
+
codec=codec,
|
1063 |
+
pageno=pageno,
|
1064 |
+
laparams=laparams,
|
1065 |
+
)
|
1066 |
+
|
1067 |
+
# write() assumes a codec for binary I/O, or no codec for text I/O.
|
1068 |
+
if self.outfp_binary == (not self.codec):
|
1069 |
+
raise PDFValueError("Codec is required for a binary I/O output")
|
1070 |
+
|
1071 |
+
self.imagewriter = imagewriter
|
1072 |
+
self.stripcontrol = stripcontrol
|
1073 |
+
self.write_header()
|
1074 |
+
|
1075 |
+
def write(self, text: str) -> None:
|
1076 |
+
if self.codec:
|
1077 |
+
cast(BinaryIO, self.outfp).write(text.encode(self.codec))
|
1078 |
+
else:
|
1079 |
+
cast(TextIO, self.outfp).write(text)
|
1080 |
+
|
1081 |
+
def write_header(self) -> None:
|
1082 |
+
if self.codec:
|
1083 |
+
self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
|
1084 |
+
else:
|
1085 |
+
self.write('<?xml version="1.0" ?>\n')
|
1086 |
+
self.write("<pages>\n")
|
1087 |
+
|
1088 |
+
def write_footer(self) -> None:
|
1089 |
+
self.write("</pages>\n")
|
1090 |
+
|
1091 |
+
def write_text(self, text: str) -> None:
|
1092 |
+
if self.stripcontrol:
|
1093 |
+
text = self.CONTROL.sub("", text)
|
1094 |
+
self.write(enc(text))
|
1095 |
+
|
1096 |
+
def receive_layout(self, ltpage: LTPage) -> None:
|
1097 |
+
def show_group(item: LTItem) -> None:
|
1098 |
+
if isinstance(item, LTTextBox):
|
1099 |
+
self.write(
|
1100 |
+
'<textbox id="%d" bbox="%s" />\n'
|
1101 |
+
% (item.index, bbox2str(item.bbox)),
|
1102 |
+
)
|
1103 |
+
elif isinstance(item, LTTextGroup):
|
1104 |
+
self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
|
1105 |
+
for child in item:
|
1106 |
+
show_group(child)
|
1107 |
+
self.write("</textgroup>\n")
|
1108 |
+
|
1109 |
+
def render(item: LTItem) -> None:
|
1110 |
+
child: LTItem
|
1111 |
+
if isinstance(item, LTPage):
|
1112 |
+
s = '<page id="%s" bbox="%s" rotate="%d">\n' % (
|
1113 |
+
item.pageid,
|
1114 |
+
bbox2str(item.bbox),
|
1115 |
+
item.rotate,
|
1116 |
+
)
|
1117 |
+
self.write(s)
|
1118 |
+
for child in item:
|
1119 |
+
render(child)
|
1120 |
+
if item.groups is not None:
|
1121 |
+
self.write("<layout>\n")
|
1122 |
+
for group in item.groups:
|
1123 |
+
show_group(group)
|
1124 |
+
self.write("</layout>\n")
|
1125 |
+
self.write("</page>\n")
|
1126 |
+
elif isinstance(item, LTLine):
|
1127 |
+
s = '<line linewidth="%d" bbox="%s" />\n' % (
|
1128 |
+
item.linewidth,
|
1129 |
+
bbox2str(item.bbox),
|
1130 |
+
)
|
1131 |
+
self.write(s)
|
1132 |
+
elif isinstance(item, LTRect):
|
1133 |
+
s = '<rect linewidth="%d" bbox="%s" />\n' % (
|
1134 |
+
item.linewidth,
|
1135 |
+
bbox2str(item.bbox),
|
1136 |
+
)
|
1137 |
+
self.write(s)
|
1138 |
+
elif isinstance(item, LTCurve):
|
1139 |
+
s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % (
|
1140 |
+
item.linewidth,
|
1141 |
+
bbox2str(item.bbox),
|
1142 |
+
item.get_pts(),
|
1143 |
+
)
|
1144 |
+
self.write(s)
|
1145 |
+
elif isinstance(item, LTFigure):
|
1146 |
+
s = f'<figure name="{item.name}" bbox="{bbox2str(item.bbox)}">\n'
|
1147 |
+
self.write(s)
|
1148 |
+
for child in item:
|
1149 |
+
render(child)
|
1150 |
+
self.write("</figure>\n")
|
1151 |
+
elif isinstance(item, LTTextLine):
|
1152 |
+
self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
|
1153 |
+
for child in item:
|
1154 |
+
render(child)
|
1155 |
+
self.write("</textline>\n")
|
1156 |
+
elif isinstance(item, LTTextBox):
|
1157 |
+
wmode = ""
|
1158 |
+
if isinstance(item, LTTextBoxVertical):
|
1159 |
+
wmode = ' wmode="vertical"'
|
1160 |
+
s = '<textbox id="%d" bbox="%s"%s>\n' % (
|
1161 |
+
item.index,
|
1162 |
+
bbox2str(item.bbox),
|
1163 |
+
wmode,
|
1164 |
+
)
|
1165 |
+
self.write(s)
|
1166 |
+
for child in item:
|
1167 |
+
render(child)
|
1168 |
+
self.write("</textbox>\n")
|
1169 |
+
elif isinstance(item, LTChar):
|
1170 |
+
s = (
|
1171 |
+
'<text font="%s" bbox="%s" colourspace="%s" '
|
1172 |
+
'ncolour="%s" size="%.3f">'
|
1173 |
+
% (
|
1174 |
+
enc(item.fontname),
|
1175 |
+
bbox2str(item.bbox),
|
1176 |
+
item.ncs.name,
|
1177 |
+
item.graphicstate.ncolor,
|
1178 |
+
item.size,
|
1179 |
+
)
|
1180 |
+
)
|
1181 |
+
self.write(s)
|
1182 |
+
self.write_text(item.get_text())
|
1183 |
+
self.write("</text>\n")
|
1184 |
+
elif isinstance(item, LTText):
|
1185 |
+
self.write("<text>%s</text>\n" % item.get_text())
|
1186 |
+
elif isinstance(item, LTImage):
|
1187 |
+
if self.imagewriter is not None:
|
1188 |
+
name = self.imagewriter.export_image(item)
|
1189 |
+
self.write(
|
1190 |
+
'<image src="%s" width="%d" height="%d" />\n'
|
1191 |
+
% (enc(name), item.width, item.height),
|
1192 |
+
)
|
1193 |
+
else:
|
1194 |
+
self.write(
|
1195 |
+
'<image width="%d" height="%d" />\n'
|
1196 |
+
% (item.width, item.height),
|
1197 |
+
)
|
1198 |
+
else:
|
1199 |
+
assert False, str(("Unhandled", item))
|
1200 |
+
|
1201 |
+
render(ltpage)
|
1202 |
+
|
1203 |
+
    def close(self) -> None:
        """Finish the XML document by writing the closing footer tags."""
        self.write_footer()
|
1205 |
+
|
1206 |
+
|
1207 |
+
class HOCRConverter(PDFConverter[AnyIO]):
    """Extract an hOCR representation from explicit text information within a PDF."""

    # Where text is being extracted from a variety of types of PDF within a
    # business process, those PDFs where the text is only present in image
    # form will need to be analysed using an OCR tool which will typically
    # output hOCR. This converter extracts the explicit text information from
    # those PDFs that do have it and uses it to generate a basic hOCR
    # representation that is designed to be used in conjunction with the image
    # of the PDF in the same way as genuine OCR output would be, but without the
    # inevitable OCR errors.

    # The converter does not handle images, diagrams or text colors.

    # In the examples processed by the contributor it was necessary to set
    # LAParams.all_texts to True.

    # Regex matching C0 control characters (minus tab/newline/CR) that are
    # optionally stripped from the emitted text.
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        stripcontrol: bool = False,
    ):
        """Create the converter.

        :param rsrcmgr: shared PDF resource manager.
        :param outfp: binary or text output stream.
        :param codec: output encoding; empty/falsy means text-mode output.
        :param pageno: starting page number.
        :param laparams: layout-analysis parameters.
        :param stripcontrol: when True, remove control characters from output.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )
        self.stripcontrol = stripcontrol
        # True while consecutive LTChar items are being accumulated into a word.
        self.within_chars = False
        self.write_header()

    def bbox_repr(self, bbox: Rect) -> str:
        """Format a PDF bbox as an hOCR ``bbox x0 y0 x1 y1`` string."""
        (in_x0, in_y0, in_x1, in_y1) = bbox
        # PDF y-coordinates are the other way round from hOCR coordinates
        out_x0 = int(in_x0)
        out_y0 = int(self.page_bbox[3] - in_y1)
        out_x1 = int(in_x1)
        out_y1 = int(self.page_bbox[3] - in_y0)
        return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"

    def write(self, text: str) -> None:
        """Write *text*, encoding it when an output codec is configured."""
        if self.codec:
            encoded_text = text.encode(self.codec)
            cast(BinaryIO, self.outfp).write(encoded_text)
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Emit the opening ``<html>``/``<head>`` boilerplate of the hOCR file."""
        if self.codec:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en' charset='%s'>\n" % self.codec,
            )
        else:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en'>\n",
            )
        self.write("<head>\n")
        self.write("<title></title>\n")
        self.write(
            "<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />\n",
        )
        self.write(
            "<meta name='ocr-system' content='pdf2zh.six HOCR Converter' />\n",
        )
        self.write(
            " <meta name='ocr-capabilities'"
            " content='ocr_page ocr_block ocr_line ocrx_word'/>\n",
        )
        self.write("</head>\n")
        self.write("<body>\n")

    def write_footer(self) -> None:
        """Emit the closing tags; includes a commented-out hocrjs debug hook."""
        self.write("<!-- comment in the following line to debug -->\n")
        self.write(
            "<!--script src='https://unpkg.com/hocrjs'></script--></body></html>\n",
        )

    def write_text(self, text: str) -> None:
        """Write text, optionally stripping control characters first."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(text)

    def write_word(self) -> None:
        """Flush the currently accumulated word as an ``ocrx_word`` span.

        Uses the working font/size/bbox gathered from the word's LTChar
        items; no-op when no text has been accumulated.
        """
        if len(self.working_text) > 0:
            bold_and_italic_styles = ""
            # Font style is inferred from the font name, as is common for
            # PDF base-14 style names ("...-BoldItalic" etc.).
            if "Italic" in self.working_font:
                bold_and_italic_styles = "font-style: italic; "
            if "Bold" in self.working_font:
                bold_and_italic_styles += "font-weight: bold; "
            # NOTE: %d truncates the (float) font size to an integer.
            self.write(
                "<span style='font:\"%s\"; font-size:%d; %s' "
                "class='ocrx_word' title='%s; x_font %s; "
                "x_fsize %d'>%s</span>"
                % (
                    (
                        self.working_font,
                        self.working_size,
                        bold_and_italic_styles,
                        self.bbox_repr(self.working_bbox),
                        self.working_font,
                        self.working_size,
                        self.working_text.strip(),
                    )
                ),
            )
        self.within_chars = False

    def receive_layout(self, ltpage: LTPage) -> None:
        """Render one analysed page as nested hOCR page/block/line/word markup."""

        def render(item: LTItem) -> None:
            # An LTAnno (inserted whitespace) ends any word in progress.
            if self.within_chars and isinstance(item, LTAnno):
                self.write_word()
            if isinstance(item, LTPage):
                # Remember the page bbox so bbox_repr can flip y-coordinates.
                self.page_bbox = item.bbox
                self.write(
                    "<div class='ocr_page' id='%s' title='%s'>\n"
                    % (item.pageid, self.bbox_repr(item.bbox)),
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTTextLine):
                self.write(
                    "<span class='ocr_line' title='%s'>" % (self.bbox_repr(item.bbox)),
                )
                for child_line in item:
                    render(child_line)
                self.write("</span>\n")
            elif isinstance(item, LTTextBox):
                self.write(
                    "<div class='ocr_block' id='%d' title='%s'>\n"
                    % (item.index, self.bbox_repr(item.bbox)),
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTChar):
                if not self.within_chars:
                    # First character of a new word: start accumulating.
                    self.within_chars = True
                    self.working_text = item.get_text()
                    self.working_bbox = item.bbox
                    self.working_font = item.fontname
                    self.working_size = item.size
                elif len(item.get_text().strip()) == 0:
                    # Whitespace character ends the word; emit it plus the gap.
                    self.write_word()
                    self.write(item.get_text())
                else:
                    # A change of baseline, font or size also starts a new word.
                    if (
                        self.working_bbox[1] != item.bbox[1]
                        or self.working_font != item.fontname
                        or self.working_size != item.size
                    ):
                        self.write_word()
                        self.working_bbox = item.bbox
                        self.working_font = item.fontname
                        self.working_size = item.size
                    self.working_text += item.get_text()
                    # Extend the word bbox rightwards to cover this character.
                    self.working_bbox = (
                        self.working_bbox[0],
                        self.working_bbox[1],
                        item.bbox[2],
                        self.working_bbox[3],
                    )

        render(ltpage)

    def close(self) -> None:
        """Finish the hOCR document by writing the footer."""
        self.write_footer()
|
pdf2zh/data_structures.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any, Iterable, List, Optional, Tuple
|
2 |
+
|
3 |
+
from pdf2zh import settings
|
4 |
+
from pdf2zh.pdfparser import PDFSyntaxError
|
5 |
+
from pdf2zh.pdftypes import dict_value, int_value, list_value
|
6 |
+
from pdf2zh.utils import choplist
|
7 |
+
|
8 |
+
|
9 |
+
class NumberTree:
    """A PDF number tree.

    See Section 3.8.6 of the PDF Reference.
    """

    def __init__(self, obj: Any):
        """Wrap a number-tree node dictionary (root, intermediate, or leaf)."""
        self._obj = dict_value(obj)
        # Leaf payload: flat [key1, value1, key2, value2, ...] list.
        self.nums: Optional[Iterable[Any]] = None
        # Child node references (root/intermediate nodes only).
        self.kids: Optional[Iterable[Any]] = None
        # [min, max] key range of this subtree (unused by _parse).
        self.limits: Optional[Iterable[Any]] = None

        if "Nums" in self._obj:
            self.nums = list_value(self._obj["Nums"])
        if "Kids" in self._obj:
            self.kids = list_value(self._obj["Kids"])
        if "Limits" in self._obj:
            self.limits = list_value(self._obj["Limits"])

    def _parse(self) -> List[Tuple[int, Any]]:
        """Recursively collect all (integer key, value) pairs of the subtree."""
        items = []
        if self.nums:  # Leaf node
            # choplist(2, ...) yields the flat Nums array as (key, value) pairs.
            for k, v in choplist(2, self.nums):
                items.append((int_value(k), v))

        if self.kids:  # Root or intermediate node
            for child_ref in self.kids:
                items += NumberTree(child_ref)._parse()

        return items

    values: List[Tuple[int, Any]]  # workaround decorators unsupported by mypy

    @property  # type: ignore[no-redef,misc]
    def values(self) -> List[Tuple[int, Any]]:
        """All (key, value) pairs, sorted by key.

        In strict mode an out-of-order tree raises PDFSyntaxError; otherwise
        the pairs are sorted quietly.
        """
        values = self._parse()

        if settings.STRICT:
            # Per the spec the keys must already be ascending across leaves.
            if not all(a[0] <= b[0] for a, b in zip(values, values[1:])):
                raise PDFSyntaxError("Number tree elements are out of order")
        else:
            values.sort(key=lambda t: t[0])

        return values
|
pdf2zh/doclayout.py
ADDED
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import abc
|
2 |
+
import cv2
|
3 |
+
import numpy as np
|
4 |
+
import contextlib
|
5 |
+
from huggingface_hub import hf_hub_download
|
6 |
+
|
7 |
+
|
8 |
+
class DocLayoutModel(abc.ABC):
    """Common interface for DocLayout-YOLO page-layout detection backends."""

    @staticmethod
    def load_torch():
        """Load the PyTorch DocStructBench checkpoint from the HF Hub."""
        return TorchModel.from_pretrained(
            repo_id="juliozhao/DocLayout-YOLO-DocStructBench",
            filename="doclayout_yolo_docstructbench_imgsz1024.pt",
        )

    @staticmethod
    def load_onnx():
        """Load the ONNX export of DocLayout-YOLO from the HF Hub."""
        return OnnxModel.from_pretrained(
            repo_id="wybxc/DocLayout-YOLO-DocStructBench-onnx",
            filename="doclayout_yolo_docstructbench_imgsz1024.onnx",
        )

    @staticmethod
    def load_available():
        """Return the first backend whose optional dependency is installed.

        Tries the torch backend first, then ONNX; raises ImportError when
        neither dependency set is available.
        """
        for loader in (DocLayoutModel.load_torch, DocLayoutModel.load_onnx):
            try:
                return loader()
            except ImportError:
                continue
        raise ImportError(
            "Please install the `torch` or `onnx` feature to use the DocLayout model."
        )

    @property
    @abc.abstractmethod
    def stride(self) -> int:
        """Stride of the model input."""
        pass

    @abc.abstractmethod
    def predict(self, image, imgsz=1024, **kwargs) -> list:
        """
        Predict the layout of a document page.

        Args:
            image: The image of the document page.
            imgsz: Resize the image to this size. Must be a multiple of the stride.
            **kwargs: Additional arguments.
        """
        pass
|
54 |
+
|
55 |
+
|
56 |
+
class TorchModel(DocLayoutModel):
    """Layout model backed by the optional `doclayout_yolo` PyTorch package."""

    def __init__(self, model_path: str):
        try:
            import doclayout_yolo
        except ImportError:
            raise ImportError(
                "Please install the `torch` feature to use the Torch model."
            )

        self.model_path = model_path
        self.model = doclayout_yolo.YOLOv10(model_path)

    @staticmethod
    def from_pretrained(repo_id: str, filename: str):
        """Download the checkpoint from the Hugging Face Hub and wrap it."""
        local_path = hf_hub_download(repo_id=repo_id, filename=filename)
        return TorchModel(local_path)

    @property
    def stride(self):
        # Fixed 32-pixel input stride for this backend.
        return 32

    def predict(self, *args, **kwargs):
        """Delegate prediction directly to the wrapped YOLOv10 model."""
        return self.model.predict(*args, **kwargs)
|
79 |
+
|
80 |
+
|
81 |
+
class YoloResult:
    """Helper class to store detection results from ONNX model."""

    def __init__(self, boxes, names):
        # Wrap each raw detection row, most confident first.
        wrapped = [YoloBox(data=row) for row in boxes]
        wrapped.sort(key=lambda box: box.conf, reverse=True)
        self.boxes = wrapped
        self.names = names
|
88 |
+
|
89 |
+
|
90 |
+
class YoloBox:
    """Helper class to store detection results from ONNX model."""

    def __init__(self, data):
        # Row layout: first four entries are the xyxy box, the last two are
        # confidence and class id.
        self.xyxy, self.conf, self.cls = data[:4], data[-2], data[-1]
|
97 |
+
|
98 |
+
|
99 |
+
class OnnxModel(DocLayoutModel):
    """Layout model backed by an ONNX export, run through onnxruntime.

    Stride and class names are read from the ONNX model's metadata, so no
    separate configuration file is needed.
    """

    def __init__(self, model_path: str):
        import ast

        try:

            import onnx
            import onnxruntime
        except ImportError:
            raise ImportError(
                "Please install the `onnx` feature to use the ONNX model."
            )

        self.model_path = model_path

        model = onnx.load(model_path)
        # The exporter stores stride/names as stringified Python literals
        # in the model's metadata properties.
        metadata = {d.key: d.value for d in model.metadata_props}
        self._stride = ast.literal_eval(metadata["stride"])
        self._names = ast.literal_eval(metadata["names"])

        self.model = onnxruntime.InferenceSession(model.SerializeToString())

    @staticmethod
    def from_pretrained(repo_id: str, filename: str):
        """Download the ONNX file from the Hugging Face Hub and wrap it."""
        pth = hf_hub_download(repo_id=repo_id, filename=filename)
        return OnnxModel(pth)

    @property
    def stride(self):
        """Input stride declared by the exported model's metadata."""
        return self._stride

    def resize_and_pad_image(self, image, new_shape):
        """
        Resize and pad the image to the specified size, ensuring dimensions are multiples of stride.

        Parameters:
        - image: Input image
        - new_shape: Target size (integer or (height, width) tuple)
        - stride: Padding alignment stride, default 32

        Returns:
        - Processed image
        """
        # NOTE(review): assumes an HWC array (e.g. uint8 BGR/RGB) — confirm
        # against callers.
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Calculate scaling ratio
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        # Resize image
        image = cv2.resize(
            image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR
        )

        # Calculate padding size and align to stride multiple
        # (pads only up to the next stride multiple, not to the full target;
        # 114 is the conventional YOLO letterbox gray fill).
        pad_w = (new_w - resized_w) % self.stride
        pad_h = (new_h - resized_h) % self.stride
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        # Add padding
        image = cv2.copyMakeBorder(
            image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """
        Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
        specified in (img1_shape) to the shape of a different image (img0_shape).

        Args:
            img1_shape (tuple): The shape of the image that the bounding boxes are for,
                in the format of (height, width).
            boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
            img0_shape (tuple): the shape of the target image, in the format of (height, width).

        Returns:
            boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
        """

        # Calculate scaling ratio
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Calculate padding size
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        # Remove padding and scale boxes (in-place on the boxes array)
        boxes[..., :4] = (boxes[..., :4] - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def predict(self, image, imgsz=1024, **kwargs):
        """Run layout detection on one page image and return [YoloResult]."""
        # Preprocess input image
        orig_h, orig_w = image.shape[:2]
        pix = self.resize_and_pad_image(image, new_shape=imgsz)
        pix = np.transpose(pix, (2, 0, 1))  # CHW
        pix = np.expand_dims(pix, axis=0)  # BCHW
        pix = pix.astype(np.float32) / 255.0  # Normalize to [0, 1]
        new_h, new_w = pix.shape[2:]

        # Run inference
        preds = self.model.run(None, {"images": pix})[0]

        # Postprocess predictions: keep rows above a 0.25 confidence
        # threshold, then map boxes back to the original image size.
        preds = preds[preds[..., 4] > 0.25]
        preds[..., :4] = self.scale_boxes(
            (new_h, new_w), preds[..., :4], (orig_h, orig_w)
        )
        return [YoloResult(boxes=preds, names=self._names)]
|
pdf2zh/encodingdb.py
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import re
|
3 |
+
from typing import Dict, Iterable, Optional, cast
|
4 |
+
|
5 |
+
from pdf2zh.glyphlist import glyphname2unicode
|
6 |
+
from pdf2zh.latin_enc import ENCODING
|
7 |
+
from pdf2zh.pdfexceptions import PDFKeyError
|
8 |
+
from pdf2zh.psparser import PSLiteral
|
9 |
+
|
10 |
+
# One or more hex digits; used to validate "uniXXXX" / "uXXXX" glyph names.
HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")

# Module-level logger.
log = logging.getLogger(__name__)
|
13 |
+
|
14 |
+
|
15 |
+
def name2unicode(name: str) -> str:
    """Converts Adobe glyph names to Unicode numbers.

    In contrast to the specification, this raises a KeyError instead of return
    an empty string when the key is unknown.
    This way the caller must explicitly define what to do
    when there is not a match.

    Reference:
    https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :returns unicode character if name resembles something,
        otherwise a KeyError
    """
    if not isinstance(name, str):
        raise PDFKeyError(
            'Could not convert unicode name "%s" to character because '
            "it should be of type str but is of type %s" % (name, type(name)),
        )

    # Rule: drop any glyph-variant suffix ("a.sc" -> "a").
    name = name.split(".")[0]
    # Rule: underscore-joined components map independently (ligatures).
    components = name.split("_")

    if len(components) > 1:
        return "".join(map(name2unicode, components))

    elif name in glyphname2unicode:
        return glyphname2unicode[name]

    elif name.startswith("uni"):
        # Fix: slice off the literal "uni" prefix instead of str.strip("uni"),
        # which strips any of the characters u/n/i from BOTH ends and can
        # mangle otherwise-valid names.
        name_without_uni = name[3:]

        # Fix: fullmatch instead of match, so a partially-hex remainder
        # (e.g. "dier...") falls through to PDFKeyError rather than letting
        # int(..., 16) raise a raw ValueError.
        if HEXADECIMAL.fullmatch(name_without_uni) and len(name_without_uni) % 4 == 0:
            unicode_digits = [
                int(name_without_uni[i : i + 4], base=16)
                for i in range(0, len(name_without_uni), 4)
            ]
            for digit in unicode_digits:
                raise_key_error_for_invalid_unicode(digit)
            characters = map(chr, unicode_digits)
            return "".join(characters)

    elif name.startswith("u"):
        # Same prefix-slice fix as above for single "uXXXX[XX]" names.
        name_without_u = name[1:]

        if HEXADECIMAL.fullmatch(name_without_u) and 4 <= len(name_without_u) <= 6:
            unicode_digit = int(name_without_u, base=16)
            raise_key_error_for_invalid_unicode(unicode_digit)
            return chr(unicode_digit)

    raise PDFKeyError(
        'Could not convert unicode name "%s" to character because '
        "it does not match specification" % name,
    )
|
69 |
+
|
70 |
+
|
71 |
+
def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
    """Unicode values should not be in the range D800 through DFFF because
    that is used for surrogate pairs in UTF-16

    :raises KeyError if unicode digit is invalid
    """
    # Surrogate-pair code-point range, expressed inclusively in hex
    # (equivalent to the open interval 55295 < digit < 57344).
    if 0xD800 <= unicode_digit <= 0xDFFF:
        raise PDFKeyError(
            "Unicode digit %d is invalid because "
            "it is in the range D800 through DFFF" % unicode_digit,
        )
|
82 |
+
|
83 |
+
|
84 |
+
class EncodingDB:
    """Built-in single-byte encodings mapping character codes to Unicode.

    The four tables are populated once at class-creation time from the
    latin_enc.ENCODING rows (glyph name plus its code in each encoding).
    """

    std2unicode: Dict[int, str] = {}
    mac2unicode: Dict[int, str] = {}
    win2unicode: Dict[int, str] = {}
    pdf2unicode: Dict[int, str] = {}
    for name, std, mac, win, pdf in ENCODING:
        c = name2unicode(name)
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    # PDF encoding name -> code-to-unicode table.
    encodings = {
        "StandardEncoding": std2unicode,
        "MacRomanEncoding": mac2unicode,
        "WinAnsiEncoding": win2unicode,
        "PDFDocEncoding": pdf2unicode,
    }

    @classmethod
    def get_encoding(
        cls,
        name: str,
        diff: Optional[Iterable[object]] = None,
    ) -> Dict[int, str]:
        """Return the table for *name*, optionally patched by a Differences array.

        :param name: encoding name; unknown names fall back to StandardEncoding.
        :param diff: a PDF /Differences array — integers reset the current
            code, PSLiteral glyph names assign consecutive codes.
        """
        cid2unicode = cls.encodings.get(name, cls.std2unicode)
        if diff:
            # Copy so the shared class-level table is never mutated.
            cid2unicode = cid2unicode.copy()
            cid = 0
            for x in diff:
                if isinstance(x, int):
                    cid = x
                elif isinstance(x, PSLiteral):
                    try:
                        cid2unicode[cid] = name2unicode(cast(str, x.name))
                    except (KeyError, ValueError):
                        # Unknown glyph names are skipped silently.
                        # log.debug(str(e))
                        pass
                    cid += 1
        return cid2unicode
|
pdf2zh/fontmetrics.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pdf2zh/glyphlist.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pdf2zh/gui.py
ADDED
@@ -0,0 +1,425 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import shutil
|
3 |
+
from pathlib import Path
|
4 |
+
from pdf2zh import __version__
|
5 |
+
from pdf2zh.pdf2zh import extract_text
|
6 |
+
|
7 |
+
import gradio as gr
|
8 |
+
import numpy as np
|
9 |
+
import pymupdf
|
10 |
+
import tqdm
|
11 |
+
import requests
|
12 |
+
|
13 |
+
# Map service names to pdf2zh service options
service_map = {
    "Google": "google",
    "DeepL": "deepl",
    "DeepLX": "deeplx",
    "Ollama": "ollama",
    "OpenAI": "openai",
    "Azure": "azure",
}
# GUI language label -> language code passed to the translator backend.
lang_map = {
    "Chinese": "zh",
    "English": "en",
    "French": "fr",
    "German": "de",
    "Japanese": "ja",
    "Korean": "ko",
    "Russian": "ru",
    "Spanish": "es",
    "Italian": "it",
}
# Page-range label -> zero-based page index list (None = all pages).
page_map = {
    "All": None,
    "First": [0],
    "First 5 pages": list(range(0, 5)),
}

# Demo mode (PDF2ZH_DEMO set): restrict the service and page choices and
# require a reCAPTCHA check before translating.
flag_demo = False
if os.environ.get("PDF2ZH_DEMO"):
    flag_demo = True
    service_map = {
        "Google": "google",
    }
    page_map = {
        "First": [0],
        "First 20 pages": list(range(0, 20)),
    }
    # reCAPTCHA site key (client) and secret key (server) for the demo.
    client_key = os.environ.get("PDF2ZH_CLIENT_KEY")
    server_key = os.environ.get("PDF2ZH_SERVER_KEY")
|
51 |
+
|
52 |
+
|
53 |
+
def verify_recaptcha(response):
    """Validate a client-side reCAPTCHA token with Google's siteverify API.

    :param response: token produced by the reCAPTCHA widget in the browser.
    :returns: True when Google confirms the token, otherwise a falsy value
        (False/None, including when the response lacks a "success" field).
    """
    recaptcha_url = "https://www.google.com/recaptcha/api/siteverify"

    # Security fix: do not print the server-side secret key; log only the
    # client token for debugging.
    print("reCAPTCHA", response)

    data = {"secret": server_key, "response": response}
    # Fix: add a timeout so a hung verification request cannot block the
    # GUI worker indefinitely.
    result = requests.post(recaptcha_url, data=data, timeout=10).json()

    print("reCAPTCHA", result.get("success"))

    return result.get("success")
|
64 |
+
|
65 |
+
|
66 |
+
def pdf_preview(file):
    """Render the first page of a PDF as an RGB numpy array for the GUI.

    :param file: path to the PDF file.
    :returns: (height, width, 3) uint8 array of page 1.
    """
    doc = pymupdf.open(file)
    try:
        page = doc[0]
        pix = page.get_pixmap()
        # get_pixmap() defaults to RGB, so samples holds height*width*3 bytes.
        image = np.frombuffer(pix.samples, np.uint8).reshape(pix.height, pix.width, 3)
    finally:
        # Fix: close the document so the file handle is released promptly
        # instead of leaking until garbage collection.
        doc.close()
    return image
|
72 |
+
|
73 |
+
|
74 |
+
def upload_file(file, service, progress=gr.Progress()):
    """Handle file upload, validation, and initial preview."""
    # Reject missing or vanished paths up front.
    if not file or not os.path.exists(file):
        return None, None

    try:
        # Render page 1 so the user immediately sees what was uploaded.
        rendered = pdf_preview(file)
    except Exception as e:
        print(f"Error converting PDF: {e}")
        return None, None

    return file, rendered
|
87 |
+
|
88 |
+
|
89 |
+
def translate(
    file_path,
    service,
    model_id,
    lang,
    page_range,
    recaptcha_response,
    progress=gr.Progress(),
):
    """Translate PDF content using selected service.

    :param file_path: path of the uploaded source PDF.
    :param service: display name of the translation service (service_map key).
    :param model_id: model identifier for LLM-backed services (e.g. Ollama).
    :param lang: display name of the target language (lang_map key).
    :param page_range: display name of the page selection (page_map key).
    :param recaptcha_response: reCAPTCHA token, checked only in demo mode.
    :param progress: Gradio progress tracker.
    :returns: (translated path, preview image, dual-language path) plus three
        gr.update objects that reveal the output widgets.
    :raises gr.Error: on missing input, failed reCAPTCHA, missing output,
        or preview failure.
    """
    if not file_path:
        raise gr.Error("No input")

    if flag_demo and not verify_recaptcha(recaptcha_response):
        raise gr.Error("reCAPTCHA fail")

    progress(0, desc="Starting translation...")

    output = Path("pdf2zh_files")
    output.mkdir(parents=True, exist_ok=True)
    filename = os.path.splitext(os.path.basename(file_path))[0]
    # Fix: derive the work-file names from the upload's stem. `filename`
    # was previously computed but unused and every job wrote to one fixed
    # name, so concurrent sessions clobbered each other's files.
    file_en = output / f"{filename}.pdf"
    file_zh = output / f"{filename}-zh.pdf"
    file_dual = output / f"{filename}-dual.pdf"
    shutil.copyfile(file_path, file_en)

    selected_service = service_map.get(service, "google")
    selected_page = page_map.get(page_range, [0])
    lang_to = lang_map.get(lang, "zh")
    if selected_service == "google":
        # Google Translate expects the regional zh-CN code, not bare "zh".
        lang_to = "zh-CN" if lang_to == "zh" else lang_to

    print(f"Files before translation: {os.listdir(output)}")

    def progress_bar(t: tqdm.tqdm):
        # Forward pdf2zh's tqdm progress into the Gradio progress bar.
        progress(t.n / t.total, desc="Translating...")

    param = {
        "files": [file_en],
        "pages": selected_page,
        "lang_in": "auto",
        "lang_out": lang_to,
        "service": f"{selected_service}:{model_id}",
        "output": output,
        "thread": 4,
        "callback": progress_bar,
    }
    print(param)
    extract_text(**param)
    print(f"Files after translation: {os.listdir(output)}")

    if not file_zh.exists() or not file_dual.exists():
        raise gr.Error("No output")

    try:
        translated_preview = pdf_preview(str(file_zh))
    except Exception:
        raise gr.Error("No preview")

    progress(1.0, desc="Translation complete!")

    return (
        str(file_zh),
        translated_preview,
        str(file_dual),
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(visible=True),
    )
|
158 |
+
|
159 |
+
|
160 |
+
# Global setup
# Custom blue palette used as the Gradio theme's primary hue.
custom_blue = gr.themes.Color(
    c50="#E8F3FF",
    c100="#BEDAFF",
    c200="#94BFFF",
    c300="#6AA1FF",
    c400="#4080FF",
    c500="#165DFF",  # Primary color
    c600="#0E42D2",
    c700="#0A2BA6",
    c800="#061D79",
    c900="#03114D",
    c950="#020B33",
)
|
174 |
+
|
175 |
+
with gr.Blocks(
|
176 |
+
title="PDFMathTranslate - PDF Translation with preserved formats",
|
177 |
+
theme=gr.themes.Default(
|
178 |
+
primary_hue=custom_blue, spacing_size="md", radius_size="lg"
|
179 |
+
),
|
180 |
+
css="""
|
181 |
+
.secondary-text {color: #999 !important;}
|
182 |
+
footer {visibility: hidden}
|
183 |
+
.env-warning {color: #dd5500 !important;}
|
184 |
+
.env-success {color: #559900 !important;}
|
185 |
+
|
186 |
+
/* Add dashed border to input-file class */
|
187 |
+
.input-file {
|
188 |
+
border: 1.2px dashed #165DFF !important;
|
189 |
+
border-radius: 6px !important;
|
190 |
+
# background-color: #ffffff !important;
|
191 |
+
transition: background-color 0.4s ease-out;
|
192 |
+
}
|
193 |
+
|
194 |
+
.input-file:hover {
|
195 |
+
border: 1.2px dashed #165DFF !important;
|
196 |
+
border-radius: 6px !important;
|
197 |
+
color: #165DFF !important;
|
198 |
+
background-color: #E8F3FF !important;
|
199 |
+
transition: background-color 0.2s ease-in;
|
200 |
+
}
|
201 |
+
|
202 |
+
.progress-bar-wrap {
|
203 |
+
border-radius: 8px !important;
|
204 |
+
}
|
205 |
+
.progress-bar {
|
206 |
+
border-radius: 8px !important;
|
207 |
+
}
|
208 |
+
|
209 |
+
# .input-file label {
|
210 |
+
# color: #165DFF !important;
|
211 |
+
# border: 1.2px dashed #165DFF !important;
|
212 |
+
# border-left: none !important;
|
213 |
+
# border-top: none !important;
|
214 |
+
# }
|
215 |
+
# .input-file .wrap {
|
216 |
+
# color: #165DFF !important;
|
217 |
+
# }
|
218 |
+
# .input-file .or {
|
219 |
+
# color: #165DFF !important;
|
220 |
+
# }
|
221 |
+
""",
|
222 |
+
head=(
|
223 |
+
"""
|
224 |
+
<script src="https://www.google.com/recaptcha/api.js?render=explicit" async defer></script>
|
225 |
+
<script type="text/javascript">
|
226 |
+
var onVerify = function(token) {
|
227 |
+
el=document.getElementById('verify').getElementsByTagName('textarea')[0];
|
228 |
+
el.value=token;
|
229 |
+
el.dispatchEvent(new Event('input'));
|
230 |
+
};
|
231 |
+
</script>
|
232 |
+
"""
|
233 |
+
if flag_demo
|
234 |
+
else ""
|
235 |
+
),
|
236 |
+
) as demo:
|
237 |
+
gr.Markdown(
|
238 |
+
"# [PDFMathTranslate @ GitHub](https://github.com/Byaidu/PDFMathTranslate)"
|
239 |
+
)
|
240 |
+
|
241 |
+
with gr.Row():
|
242 |
+
with gr.Column(scale=1):
|
243 |
+
gr.Markdown("## File | < 5 MB" if flag_demo else "## File")
|
244 |
+
file_input = gr.File(
|
245 |
+
label="Document",
|
246 |
+
file_count="single",
|
247 |
+
file_types=[".pdf"],
|
248 |
+
type="filepath",
|
249 |
+
elem_classes=["input-file"],
|
250 |
+
)
|
251 |
+
gr.Markdown("## Option")
|
252 |
+
service = gr.Dropdown(
|
253 |
+
label="Service",
|
254 |
+
info="Which translation service to use. Some require keys",
|
255 |
+
choices=service_map.keys(),
|
256 |
+
value="Google",
|
257 |
+
)
|
258 |
+
lang_to = gr.Dropdown(
|
259 |
+
label="Translate to",
|
260 |
+
info="Which language to translate to (optional)",
|
261 |
+
choices=lang_map.keys(),
|
262 |
+
value="Chinese",
|
263 |
+
)
|
264 |
+
page_range = gr.Radio(
|
265 |
+
choices=page_map.keys(),
|
266 |
+
label="Pages",
|
267 |
+
info="Translate the full document or just few pages (optional)",
|
268 |
+
value=list(page_map.keys())[0],
|
269 |
+
)
|
270 |
+
model_id = gr.Textbox(
|
271 |
+
label="Model ID",
|
272 |
+
info="Please enter the identifier of the model you wish to use (e.g., gemma2). "
|
273 |
+
"This identifier will be used to specify the particular model for translation.",
|
274 |
+
# value="gemma2",
|
275 |
+
visible=False, # hide by default
|
276 |
+
)
|
277 |
+
envs_status = "<span class='env-success'>- Properly configured.</span><br>"
|
278 |
+
|
279 |
+
def details_wrapper(text_markdown):
    """Fold *text_markdown* into a collapsible HTML <details> block.

    Appends the fixed project footer (GitHub link, GUI author, version)
    so every status panel carries the same attribution.
    """
    header = "\n<details>\n<summary>Technical details</summary>\n"
    footer = (
        '\n- GitHub: <a href="https://github.com/Byaidu/PDFMathTranslate">Byaidu/PDFMathTranslate</a><br>'
        '\n- GUI by: <a href="https://github.com/reycn">Rongxin</a><br>'
        f"\n- Version: {__version__}\n</details>"
    )
    return f"{header}{text_markdown}{footer}"
|
289 |
+
|
290 |
+
def env_var_checker(env_var_name: str) -> str:
    """Build an HTML status line describing whether *env_var_name* is set.

    The value is looked up once (the original code queried ``os.environ``
    three times) and truncated after 13 characters so API keys are never
    shown in full.  The result is wrapped by ``details_wrapper``.
    """
    value = os.environ.get(env_var_name)
    # A missing variable and an empty string both count as unconfigured.
    if not value:
        envs_status = (
            f"<span class='env-warning'>- Warning: environmental not found or error ({env_var_name})."
            + "</span><br>- Please make sure that the environment variables are properly configured "
            + "(<a href='https://github.com/Byaidu/PDFMathTranslate'>guide</a>).<br>"
        )
    else:
        value = str(value)
        envs_status = (
            "<span class='env-success'>- Properly configured.</span><br>"
        )
        if len(value) < 13:
            envs_status += f"- Env: <code>{value}</code><br>"
        else:
            # Truncate long secrets (API keys) for display.
            envs_status += f"- Env: <code>{value[:13]}***</code><br>"
    return details_wrapper(envs_status)
|
312 |
+
|
313 |
+
def on_select_service(value, evt: gr.EventData):
    """Refresh the env-status panel and model-id visibility for the chosen service."""
    # Services whose configuration is a single environment variable.
    env_keys = {
        "DeepL": "DEEPL_AUTH_KEY",
        "DeepLX": "DEEPLX_AUTH_KEY",
        "Azure": "AZURE_APIKEY",
        "OpenAI": "OPENAI_API_KEY",
        "Ollama": "OLLAMA_HOST",
    }
    # Services that additionally need a user-supplied model identifier.
    model_defaults = {"OpenAI": "gpt-4o", "Ollama": "gemma2"}

    if value in model_defaults:
        # Reveal the model-id textbox pre-filled with a sensible default.
        model_visibility = gr.update(visible=True, value=model_defaults[value])
    else:
        # Hidden by default for keyless / fixed-model services.
        model_visibility = gr.update(visible=False)

    if value == "Google":
        envs_status = details_wrapper(
            "<span class='env-success'>- Properly configured.</span><br>"
        )
    elif value in env_keys:
        envs_status = env_var_checker(env_keys[value])
    else:
        envs_status = (
            "<span class='env-warning'>- Warning: model not in the list."
            "</span><br>- Please report via "
            "(<a href='https://github.com/Byaidu/PDFMathTranslate'>guide</a>).<br>"
        )
    return envs_status, model_visibility
|
345 |
+
|
346 |
+
output_title = gr.Markdown("## Translated", visible=False)
|
347 |
+
output_file = gr.File(label="Download Translation", visible=False)
|
348 |
+
output_file_dual = gr.File(
|
349 |
+
label="Download Translation (Dual)", visible=False
|
350 |
+
)
|
351 |
+
recaptcha_response = gr.Textbox(
|
352 |
+
label="reCAPTCHA Response", elem_id="verify", visible=False
|
353 |
+
)
|
354 |
+
recaptcha_box = gr.HTML('<div id="recaptcha-box"></div>')
|
355 |
+
translate_btn = gr.Button("Translate", variant="primary")
|
356 |
+
tech_details_tog = gr.Markdown(
|
357 |
+
details_wrapper(envs_status),
|
358 |
+
elem_classes=["secondary-text"],
|
359 |
+
)
|
360 |
+
service.select(on_select_service, service, [tech_details_tog, model_id])
|
361 |
+
|
362 |
+
with gr.Column(scale=2):
|
363 |
+
gr.Markdown("## Preview")
|
364 |
+
preview = gr.Image(label="Document Preview", visible=True)
|
365 |
+
|
366 |
+
# Event handlers
|
367 |
+
file_input.upload(
|
368 |
+
upload_file,
|
369 |
+
inputs=[file_input, service],
|
370 |
+
outputs=[file_input, preview],
|
371 |
+
js=(
|
372 |
+
f"""
|
373 |
+
(a,b)=>{{
|
374 |
+
try{{
|
375 |
+
grecaptcha.render('recaptcha-box',{{
|
376 |
+
'sitekey':'{client_key}',
|
377 |
+
'callback':'onVerify'
|
378 |
+
}});
|
379 |
+
}}catch(error){{}}
|
380 |
+
return [a];
|
381 |
+
}}
|
382 |
+
"""
|
383 |
+
if flag_demo
|
384 |
+
else ""
|
385 |
+
),
|
386 |
+
)
|
387 |
+
|
388 |
+
translate_btn.click(
|
389 |
+
translate,
|
390 |
+
inputs=[file_input, service, model_id, lang_to, page_range, recaptcha_response],
|
391 |
+
outputs=[
|
392 |
+
output_file,
|
393 |
+
preview,
|
394 |
+
output_file_dual,
|
395 |
+
output_file,
|
396 |
+
output_file_dual,
|
397 |
+
output_title,
|
398 |
+
],
|
399 |
+
).then(lambda: None, js="()=>{grecaptcha.reset()}" if flag_demo else "")
|
400 |
+
|
401 |
+
|
402 |
+
def setup_gui(share=False):
    """Launch the Gradio app, falling back through bind addresses on failure.

    Demo mode launches once with an upload-size cap; otherwise try
    0.0.0.0, then 127.0.0.1, and finally let Gradio choose with a
    forced public share link.
    """
    if flag_demo:
        # Demo deployments cap uploads and never run in debug mode.
        demo.launch(server_name="0.0.0.0", max_file_size="5mb", inbrowser=True)
        return
    try:
        demo.launch(server_name="0.0.0.0", debug=True, inbrowser=True, share=share)
        return
    except Exception:
        print(
            "Error launching GUI using 0.0.0.0.\nThis may be caused by global mode of proxy software."
        )
    try:
        demo.launch(
            server_name="127.0.0.1", debug=True, inbrowser=True, share=share
        )
    except Exception:
        print(
            "Error launching GUI using 127.0.0.1.\nThis may be caused by global mode of proxy software."
        )
        # Last resort: let Gradio pick the interface and force a share link.
        demo.launch(debug=True, inbrowser=True, share=True)


# For auto-reloading while developing
if __name__ == "__main__":
    setup_gui()
|
pdf2zh/high_level.py
ADDED
@@ -0,0 +1,298 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Functions that can be used for the most common use-cases for pdf2zh.six"""
|
2 |
+
|
3 |
+
import logging
|
4 |
+
import sys
|
5 |
+
from io import StringIO
|
6 |
+
from typing import Any, BinaryIO, Container, Iterator, Optional, cast
|
7 |
+
import numpy as np
|
8 |
+
import tqdm
|
9 |
+
from pymupdf import Document
|
10 |
+
|
11 |
+
from pdf2zh.converter import (
|
12 |
+
HOCRConverter,
|
13 |
+
HTMLConverter,
|
14 |
+
PDFPageAggregator,
|
15 |
+
TextConverter,
|
16 |
+
XMLConverter,
|
17 |
+
)
|
18 |
+
from pdf2zh.image import ImageWriter
|
19 |
+
from pdf2zh.layout import LAParams, LTPage
|
20 |
+
from pdf2zh.pdfdevice import PDFDevice, TagExtractor
|
21 |
+
from pdf2zh.pdfexceptions import PDFValueError
|
22 |
+
from pdf2zh.pdfinterp import PDFPageInterpreter, PDFResourceManager
|
23 |
+
from pdf2zh.pdfpage import PDFPage
|
24 |
+
from pdf2zh.utils import AnyIO, FileOrName, open_filename, get_device
|
25 |
+
|
26 |
+
|
27 |
+
def extract_text_to_fp(
    inf: BinaryIO,
    outfp: AnyIO,
    output_type: str = "text",
    codec: str = "utf-8",
    laparams: Optional[LAParams] = None,
    maxpages: int = 0,
    pages: Optional[Container[int]] = None,
    password: str = "",
    scale: float = 1.0,
    rotation: int = 0,
    layoutmode: str = "normal",
    output_dir: Optional[str] = None,
    strip_control: bool = False,
    debug: bool = False,
    disable_caching: bool = False,
    page_count: int = 0,
    vfont: str = "",
    vchar: str = "",
    thread: int = 0,
    doc_en: Document = None,
    model=None,
    lang_in: str = "",
    lang_out: str = "",
    service: str = "",
    callback: object = None,
    **kwargs: Any,
) -> dict:
    """Parses text from inf-file and writes to outfp file-like object.

    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
    None!

    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to.
    :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
        Only 'text' works properly.
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdf2zh.layout. Default is None
        but may not layout correctly.
    :param maxpages: How many pages to stop parsing after
    :param pages: zero-indexed page numbers to operate on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param scale: Scale factor
    :param rotation: Rotation factor
    :param layoutmode: Default is 'normal', see
        pdf2zh.converter.HTMLConverter
    :param output_dir: If given, creates an ImageWriter for extracted images.
    :param strip_control: Does what it says on the tin
    :param debug: Output more logging data
    :param disable_caching: Does what it says on the tin
    :param doc_en: pymupdf Document; rendered for layout detection and
        patched in place with new per-page content streams.
    :param model: layout-detection model exposing ``predict(image, ...)``
        (presumably a YOLO-style model — see pdf2zh.doclayout).
    :param callback: optional callable invoked with the tqdm progress bar
        once per page.
    :return: dict of object patches produced by the interpreter, keyed by
        xref, for the caller to apply to the document.
    """
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    rsrcmgr = PDFResourceManager(caching=not disable_caching)
    device: Optional[PDFDevice] = None
    # pageno -> ndarray mask of layout regions; filled below, read by the
    # TextConverter during interpretation.
    layout = {}

    if output_type != "text" and outfp == sys.stdout:
        outfp = sys.stdout.buffer

    if output_type == "text":
        device = TextConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
            vfont=vfont,
            vchar=vchar,
            thread=thread,
            layout=layout,
            lang_in=lang_in,
            lang_out=lang_out,
            service=service,
        )

    elif output_type == "xml":
        device = XMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
            stripcontrol=strip_control,
        )

    elif output_type == "html":
        device = HTMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            scale=scale,
            layoutmode=layoutmode,
            laparams=laparams,
            imagewriter=imagewriter,
        )

    elif output_type == "hocr":
        device = HOCRConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            stripcontrol=strip_control,
        )

    elif output_type == "tag":
        # Binary I/O is required, but we have no good way to test it here.
        device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)

    else:
        msg = f"Output type can be text, html, xml or tag but is {output_type}"
        raise PDFValueError(msg)

    assert device is not None
    obj_patch = {}
    interpreter = PDFPageInterpreter(rsrcmgr, device, obj_patch)
    if pages:
        total_pages = len(pages)
    else:
        total_pages = page_count
    with tqdm.tqdm(
        PDFPage.get_pages(
            inf,
            pages,
            maxpages=maxpages,
            password=password,
            caching=not disable_caching,
        ),
        total=total_pages,
        position=0,
    ) as progress:
        for page in progress:
            if callback:
                callback(progress)
            pix = doc_en[page.pageno].get_pixmap()
            # BUGFIX: np.fromstring was deprecated in NumPy 1.14 and removed
            # in 1.22+; frombuffer is the supported equivalent for raw bytes.
            # The trailing [:, :, ::-1] flips RGB -> BGR for the model.
            image = np.frombuffer(pix.samples, np.uint8).reshape(
                pix.height, pix.width, 3
            )[:, :, ::-1]
            page_layout = model.predict(
                image, imgsz=int(pix.height / 32) * 32, device=get_device()
            )[0]
            # A kd-tree is impractical here; render the layout to a bitmap
            # instead, trading memory for lookup speed.
            box = np.ones((pix.height, pix.width))
            h, w = box.shape
            # Classes whose regions must be protected from translation.
            vcls = ["abandon", "figure", "table", "isolate_formula", "formula_caption"]
            # First pass: stamp every translatable region with its index.
            for i, d in enumerate(page_layout.boxes):
                if not page_layout.names[int(d.cls)] in vcls:
                    x0, y0, x1, y1 = d.xyxy.squeeze()
                    # Expand by 1px and convert image coords (top-left origin)
                    # to PDF coords (bottom-left origin), clamped to the page.
                    x0, y0, x1, y1 = (
                        np.clip(int(x0 - 1), 0, w - 1),
                        np.clip(int(h - y1 - 1), 0, h - 1),
                        np.clip(int(x1 + 1), 0, w - 1),
                        np.clip(int(h - y0 + 1), 0, h - 1),
                    )
                    box[y0:y1, x0:x1] = i + 2
            # Second pass: zero out protected regions so they win overlaps.
            for i, d in enumerate(page_layout.boxes):
                if page_layout.names[int(d.cls)] in vcls:
                    x0, y0, x1, y1 = d.xyxy.squeeze()
                    x0, y0, x1, y1 = (
                        np.clip(int(x0 - 1), 0, w - 1),
                        np.clip(int(h - y1 - 1), 0, h - 1),
                        np.clip(int(x1 + 1), 0, w - 1),
                        np.clip(int(h - y0 + 1), 0, h - 1),
                    )
                    box[y0:y1, x0:x1] = 0
            layout[page.pageno] = box
            # print(page.number,page_layout)
            page.rotate = (page.rotate + rotation) % 360
            # Allocate a fresh xref to hold the rewritten content stream.
            page.page_xref = doc_en.get_new_xref()  # hack: new xref for the page
            doc_en.update_object(page.page_xref, "<<>>")
            doc_en.update_stream(page.page_xref, b"")
            doc_en[page.pageno].set_contents(page.page_xref)
            interpreter.process_page(page)

    device.close()
    return obj_patch
|
216 |
+
|
217 |
+
|
218 |
+
def extract_text(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Optional[Container[int]] = None,
    maxpages: int = 0,
    caching: bool = True,
    codec: str = "utf-8",
    laparams: Optional[LAParams] = None,
) -> str:
    """Parse and return the text contained in a PDF file.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdf2zh.layout. If None, uses
        some default settings that often work well.
    :return: a string containing all of the text extracted.
    """
    if laparams is None:
        laparams = LAParams()

    with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
        stream = cast(BinaryIO, fp)  # opened in binary mode above
        manager = PDFResourceManager(caching=caching)
        converter = TextConverter(
            manager, output_string, codec=codec, laparams=laparams
        )
        interp = PDFPageInterpreter(manager, converter)

        page_iter = PDFPage.get_pages(
            stream,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        for pdf_page in page_iter:
            interp.process_page(pdf_page)

        return output_string.getvalue()
|
259 |
+
|
260 |
+
|
261 |
+
def extract_pages(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Optional[Container[int]] = None,
    maxpages: int = 0,
    caching: bool = True,
    laparams: Optional[LAParams] = None,
) -> Iterator[LTPage]:
    """Extract and yield LTPage objects, one per processed page.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from pdf2zh.layout. If None, uses
        some default settings that often work well.
    :return: LTPage objects
    """
    laparams = LAParams() if laparams is None else laparams

    with open_filename(pdf_file, "rb") as fp:
        stream = cast(BinaryIO, fp)  # opened in binary mode above
        manager = PDFResourceManager(caching=caching)
        aggregator = PDFPageAggregator(manager, laparams=laparams)
        interp = PDFPageInterpreter(manager, aggregator)
        for pdf_page in PDFPage.get_pages(
            stream,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        ):
            interp.process_page(pdf_page)
            yield aggregator.get_result()
|
pdf2zh/image.py
ADDED
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import os.path
|
3 |
+
import struct
|
4 |
+
from io import BytesIO
|
5 |
+
from typing import BinaryIO, Tuple
|
6 |
+
|
7 |
+
try:
|
8 |
+
from typing import Literal
|
9 |
+
except ImportError:
|
10 |
+
# Literal was introduced in Python 3.8
|
11 |
+
from typing_extensions import Literal # type: ignore[assignment]
|
12 |
+
|
13 |
+
from pdf2zh.jbig2 import JBIG2StreamReader, JBIG2StreamWriter
|
14 |
+
from pdf2zh.layout import LTImage
|
15 |
+
from pdf2zh.pdfcolor import (
|
16 |
+
LITERAL_DEVICE_CMYK,
|
17 |
+
LITERAL_DEVICE_GRAY,
|
18 |
+
LITERAL_DEVICE_RGB,
|
19 |
+
LITERAL_INLINE_DEVICE_GRAY,
|
20 |
+
LITERAL_INLINE_DEVICE_RGB,
|
21 |
+
)
|
22 |
+
from pdf2zh.pdfexceptions import PDFValueError
|
23 |
+
from pdf2zh.pdftypes import (
|
24 |
+
LITERALS_DCT_DECODE,
|
25 |
+
LITERALS_FLATE_DECODE,
|
26 |
+
LITERALS_JBIG2_DECODE,
|
27 |
+
LITERALS_JPX_DECODE,
|
28 |
+
)
|
29 |
+
|
30 |
+
PIL_ERROR_MESSAGE = (
|
31 |
+
"Could not import Pillow. This dependency of pdf2zh.six is not "
|
32 |
+
"installed by default. You need it to to save jpg images to a file. Install it "
|
33 |
+
"with `pip install 'pdf2zh.six[image]'`"
|
34 |
+
)
|
35 |
+
|
36 |
+
|
37 |
+
def align32(x: int) -> int:
    """Round *x* up to the nearest multiple of 4 (BMP rows are 32-bit aligned)."""
    return (x + 3) & ~3
|
39 |
+
|
40 |
+
|
41 |
+
class BMPWriter:
    """Incrementally emits an uncompressed, bottom-up BMP file.

    Supports 1-bit (black & white), 8-bit (grayscale) and 24-bit (RGB)
    pixel depths; scanlines are written one at a time via write_line().
    """

    def __init__(self, fp: BinaryIO, bits: int, width: int, height: int) -> None:
        self.fp = fp
        self.bits = bits
        self.width = width
        self.height = height
        # Palette size is fixed by bit depth; true-colour BMPs have none.
        palette_entries = {1: 2, 8: 256, 24: 0}
        if bits not in palette_entries:
            raise PDFValueError(bits)
        ncols = palette_entries[bits]
        # Every scanline is padded out to a 32-bit boundary.
        self.linesize = align32((self.width * self.bits + 7) // 8)
        self.datasize = self.linesize * self.height
        headersize = 14 + 40 + ncols * 4
        # BITMAPINFOHEADER: 40 bytes, little-endian fields.
        info = struct.pack(
            "<IiiHHIIIIII",
            40,
            self.width,
            self.height,
            1,
            self.bits,
            0,
            self.datasize,
            0,
            0,
            ncols,
            0,
        )
        assert len(info) == 40, str(len(info))
        # BITMAPFILEHEADER: magic "BM", total file size, pixel-data offset.
        header = struct.pack(
            "<ccIHHI",
            b"B",
            b"M",
            headersize + self.datasize,
            0,
            0,
            headersize,
        )
        assert len(header) == 14, str(len(header))
        self.fp.write(header)
        self.fp.write(info)
        if ncols == 2:
            # Two-entry black & white palette.
            levels = (0, 255)
        else:
            # Grayscale ramp for 8-bit; empty for 24-bit (ncols == 0).
            levels = range(ncols)
        for level in levels:
            self.fp.write(struct.pack("BBBx", level, level, level))
        self.pos0 = self.fp.tell()  # offset of the first pixel byte
        self.pos1 = self.pos0 + self.datasize  # one past the last pixel byte

    def write_line(self, y: int, data: bytes) -> None:
        """Write scanline *y* (0 = top row) into the bottom-up pixel array."""
        self.fp.seek(self.pos1 - (y + 1) * self.linesize)
        self.fp.write(data)
|
99 |
+
|
100 |
+
|
101 |
+
class ImageWriter:
    """Write image to a file

    Supports various image types: JPEG, JBIG2 and bitmaps
    """

    def __init__(self, outdir: str) -> None:
        self.outdir = outdir
        # Create the target directory on first use so callers don't have to.
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir)

    def export_image(self, image: LTImage) -> str:
        """Save an LTImage to disk and return the file name written.

        Dispatches on the stream's final filter (JPEG / JPEG2000 / JBIG2)
        and, failing that, on bit depth and colorspace.
        """
        (width, height) = image.srcsize

        filters = image.stream.get_filters()

        if filters[-1][0] in LITERALS_DCT_DECODE:
            name = self._save_jpeg(image)

        elif filters[-1][0] in LITERALS_JPX_DECODE:
            name = self._save_jpeg2000(image)

        elif self._is_jbig2_image(image):
            name = self._save_jbig2(image)

        elif image.bits == 1:
            name = self._save_bmp(image, width, height, (width + 7) // 8, image.bits)

        elif image.bits == 8 and (
            LITERAL_DEVICE_RGB in image.colorspace
            or LITERAL_INLINE_DEVICE_RGB in image.colorspace
        ):
            name = self._save_bmp(image, width, height, width * 3, image.bits * 3)

        elif image.bits == 8 and (
            LITERAL_DEVICE_GRAY in image.colorspace
            or LITERAL_INLINE_DEVICE_GRAY in image.colorspace
        ):
            name = self._save_bmp(image, width, height, width, image.bits)

        elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
            name = self._save_bytes(image)

        else:
            name = self._save_raw(image)

        return name

    def _save_jpeg(self, image: LTImage) -> str:
        """Save a JPEG encoded image"""
        data = image.stream.get_data()

        name, path = self._create_unique_image_name(image, ".jpg")
        with open(path, "wb") as fp:
            if LITERAL_DEVICE_CMYK in image.colorspace:
                try:
                    from PIL import Image, ImageChops  # type: ignore[import]
                except ImportError:
                    raise ImportError(PIL_ERROR_MESSAGE)

                # CMYK JPEGs embedded in PDFs are commonly stored inverted;
                # undo that and convert to RGB so standard viewers open it.
                ifp = BytesIO(data)
                i = Image.open(ifp)
                i = ImageChops.invert(i)
                i = i.convert("RGB")
                i.save(fp, "JPEG")
            else:
                # Already valid JPEG data; write through unchanged.
                fp.write(data)

        return name

    def _save_jpeg2000(self, image: LTImage) -> str:
        """Save a JPEG 2000 encoded image"""
        data = image.stream.get_data()

        name, path = self._create_unique_image_name(image, ".jp2")
        with open(path, "wb") as fp:
            try:
                from PIL import Image  # type: ignore[import]
            except ImportError:
                raise ImportError(PIL_ERROR_MESSAGE)

            # if we just write the raw data, most image programs
            # that I have tried cannot open the file. However,
            # open and saving with PIL produces a file that
            # seems to be easily opened by other programs
            ifp = BytesIO(data)
            i = Image.open(ifp)
            i.save(fp, "JPEG2000")
        return name

    def _save_jbig2(self, image: LTImage) -> str:
        """Save a JBIG2 encoded image"""
        name, path = self._create_unique_image_name(image, ".jb2")
        with open(path, "wb") as fp:
            input_stream = BytesIO()

            # Collect any shared JBIG2Globals stream referenced by filters.
            global_streams = []
            filters = image.stream.get_filters()
            for filter_name, params in filters:
                if filter_name in LITERALS_JBIG2_DECODE:
                    global_streams.append(params["JBIG2Globals"].resolve())

            if len(global_streams) > 1:
                msg = (
                    "There should never be more than one JBIG2Globals "
                    "associated with a JBIG2 embedded image"
                )
                raise PDFValueError(msg)
            if len(global_streams) == 1:
                input_stream.write(global_streams[0].get_data().rstrip(b"\n"))
            input_stream.write(image.stream.get_data())
            input_stream.seek(0)
            reader = JBIG2StreamReader(input_stream)
            segments = reader.get_segments()

            # Re-emit the segments as a standalone JBIG2 file.
            writer = JBIG2StreamWriter(fp)
            writer.write_file(segments)
        return name

    def _save_bmp(
        self,
        image: LTImage,
        width: int,
        height: int,
        bytes_per_line: int,
        bits: int,
    ) -> str:
        """Save a BMP encoded image"""
        name, path = self._create_unique_image_name(image, ".bmp")
        with open(path, "wb") as fp:
            bmp = BMPWriter(fp, bits, width, height)
            data = image.stream.get_data()
            i = 0
            for y in range(height):
                bmp.write_line(y, data[i : i + bytes_per_line])
                i += bytes_per_line
        return name

    def _save_bytes(self, image: LTImage) -> str:
        """Save an image without encoding, just bytes"""
        name, path = self._create_unique_image_name(image, ".jpg")
        width, height = image.srcsize
        # Float division; exact for the byte-aligned cases handled below.
        channels = len(image.stream.get_data()) / width / height / (image.bits / 8)
        with open(path, "wb") as fp:
            try:
                from PIL import (
                    Image,  # type: ignore[import]
                    ImageOps,
                )
            except ImportError:
                raise ImportError(PIL_ERROR_MESSAGE)

            mode: Literal["1", "L", "RGB", "CMYK"]
            if image.bits == 1:
                mode = "1"
            elif image.bits == 8 and channels == 1:
                mode = "L"
            elif image.bits == 8 and channels == 3:
                mode = "RGB"
            elif image.bits == 8 and channels == 4:
                mode = "CMYK"
            else:
                # BUGFIX: previously fell through and crashed with
                # UnboundLocalError on `mode`; fail with a clear error.
                raise PDFValueError(
                    "Unsupported bit depth/channel combination: "
                    f"bits={image.bits}, channels={channels}"
                )

            img = Image.frombytes(mode, image.srcsize, image.stream.get_data(), "raw")
            if mode == "L":
                img = ImageOps.invert(img)

            img.save(fp)

        return name

    def _save_raw(self, image: LTImage) -> str:
        """Save an image with unknown encoding"""
        ext = ".%d.%dx%d.img" % (image.bits, image.srcsize[0], image.srcsize[1])
        name, path = self._create_unique_image_name(image, ext)

        with open(path, "wb") as fp:
            fp.write(image.stream.get_data())
        return name

    @staticmethod
    def _is_jbig2_image(image: LTImage) -> bool:
        """Return True when any of the image stream's filters is JBIG2."""
        filters = image.stream.get_filters()
        for filter_name, params in filters:
            if filter_name in LITERALS_JBIG2_DECODE:
                return True
        return False

    # Backward-compatible alias for the original (misspelled) method name.
    _is_jbig2_iamge = _is_jbig2_image

    def _create_unique_image_name(self, image: LTImage, ext: str) -> Tuple[str, str]:
        """Return a (name, path) pair that does not collide with existing files."""
        name = image.name + ext
        path = os.path.join(self.outdir, name)
        img_index = 0
        while os.path.exists(path):
            name = "%s.%d%s" % (image.name, img_index, ext)
            path = os.path.join(self.outdir, name)
            img_index += 1
        return name, path
|
pdf2zh/jbig2.py
ADDED
@@ -0,0 +1,373 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
import os
|
3 |
+
from struct import calcsize, pack, unpack
|
4 |
+
from typing import BinaryIO, Dict, Iterable, List, Optional, Tuple, Union, cast
|
5 |
+
|
6 |
+
from pdf2zh.pdfexceptions import PDFValueError
|
7 |
+
|
8 |
+
# segment structure base
# (struct format, field name) pairs parsed for every segment header;
# all formats are big-endian.
SEG_STRUCT = [
    (">L", "number"),
    (">B", "flags"),
    (">B", "retention_flags"),
    (">B", "page_assoc"),
    (">L", "data_length"),
]

# segment header literals
HEADER_FLAG_DEFERRED = 0b10000000  # high bit of the flags byte
HEADER_FLAG_PAGE_ASSOC_LONG = 0b01000000  # page-association field is 4 bytes

SEG_TYPE_MASK = 0b00111111  # low 6 bits of the flags byte carry the segment type

REF_COUNT_SHORT_MASK = 0b11100000  # short form: referred-to count in top 3 bits
REF_COUNT_LONG_MASK = 0x1FFFFFFF  # long form: count in the low 29 bits
REF_COUNT_LONG = 7  # a short count of 7 signals the long form

DATA_LEN_UNKNOWN = 0xFFFFFFFF  # sentinel: segment data length not known up front

# segment types
SEG_TYPE_IMMEDIATE_GEN_REGION = 38
SEG_TYPE_END_OF_PAGE = 49
SEG_TYPE_END_OF_FILE = 51

# file literals
FILE_HEADER_ID = b"\x97\x4a\x42\x32\x0d\x0a\x1a\x0a"  # JBIG2 file magic number
FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001  # file uses sequential organisation
|
37 |
+
|
38 |
+
|
39 |
+
def bit_set(bit_pos: int, value: int) -> bool:
    """Return True when bit *bit_pos* (LSB = 0) of *value* is set."""
    return (value & (1 << bit_pos)) != 0
|
41 |
+
|
42 |
+
|
43 |
+
def check_flag(flag: int, value: int) -> bool:
    """Return True when any bit of *flag* is present in *value*."""
    return (flag & value) != 0
|
45 |
+
|
46 |
+
|
47 |
+
def masked_value(mask: int, value: int) -> int:
    """Extract the bit field selected by *mask* from *value*.

    The result is shifted down so the field's lowest bit becomes bit 0.
    Raises PDFValueError when *mask* has no set bit in the low 31 bits.
    """
    for shift in range(31):
        if (mask >> shift) & 1:
            return (value & mask) >> shift

    raise PDFValueError("Invalid mask or value")
|
53 |
+
|
54 |
+
|
55 |
+
def mask_value(mask: int, value: int) -> int:
    """Place *value* into the bit field selected by *mask*.

    *value* is truncated to the field width and shifted up into position.
    Raises PDFValueError when *mask* has no set bit in the low 31 bits.
    """
    for shift in range(31):
        if (mask >> shift) & 1:
            return (value & (mask >> shift)) << shift

    raise PDFValueError("Invalid mask or value")
|
61 |
+
|
62 |
+
|
63 |
+
def unpack_int(format: str, buffer: bytes) -> int:
    """Unpack a single big-endian unsigned integer from *buffer*.

    Only the formats actually used by this module are accepted.
    """
    assert format in {">B", ">I", ">L"}
    result, = unpack(format, buffer)
    return cast(int, result)
|
67 |
+
|
68 |
+
|
69 |
+
# Type aliases for the dict-based segment representation used throughout
# this module (segments are plain dicts rather than dedicated classes).
JBIG2SegmentFlags = Dict[str, Union[int, bool]]
JBIG2RetentionFlags = Dict[str, Union[int, List[int], List[bool]]]
JBIG2Segment = Dict[
    str,
    Union[bool, int, bytes, JBIG2SegmentFlags, JBIG2RetentionFlags],
]
|
75 |
+
|
76 |
+
|
77 |
+
class JBIG2StreamReader:
    """Read segments from a JBIG2 byte stream"""

    def __init__(self, stream: BinaryIO) -> None:
        # Binary file-like object positioned at the first segment header.
        self.stream = stream

    def get_segments(self) -> List[JBIG2Segment]:
        """Parse every segment up to end-of-stream and return them.

        For each field listed in SEG_STRUCT the raw value is read and,
        when a ``parse_<name>`` hook exists, post-processed by it (the
        hook may consume additional bytes for variable-length encodings).
        Segments truncated mid-header are dropped.
        """
        segments: List[JBIG2Segment] = []
        while not self.is_eof():
            segment: JBIG2Segment = {}
            for field_format, name in SEG_STRUCT:
                field_len = calcsize(field_format)
                field = self.stream.read(field_len)
                if len(field) < field_len:
                    # Truncated header: mark and abandon this segment.
                    segment["_error"] = True
                    break
                value = unpack_int(field_format, field)
                parser = getattr(self, "parse_%s" % name, None)
                if callable(parser):
                    value = parser(segment, value, field)
                segment[name] = value

            if not segment.get("_error"):
                segments.append(segment)
        return segments

    def is_eof(self) -> bool:
        """Peek one byte; return True at end of stream (position restored)."""
        if self.stream.read(1) == b"":
            return True
        else:
            self.stream.seek(-1, os.SEEK_CUR)
            return False

    def parse_flags(
        self,
        segment: JBIG2Segment,
        flags: int,
        field: bytes,
    ) -> JBIG2SegmentFlags:
        """Split the one-byte segment header flags into named parts."""
        return {
            "deferred": check_flag(HEADER_FLAG_DEFERRED, flags),
            "page_assoc_long": check_flag(HEADER_FLAG_PAGE_ASSOC_LONG, flags),
            "type": masked_value(SEG_TYPE_MASK, flags),
        }

    def parse_retention_flags(
        self,
        segment: JBIG2Segment,
        flags: int,
        field: bytes,
    ) -> JBIG2RetentionFlags:
        """Decode the referred-to segment count, retention bits and the
        referred-to segment numbers (short or long form).
        """
        ref_count = masked_value(REF_COUNT_SHORT_MASK, flags)
        retain_segments = []
        ref_segments = []

        if ref_count < REF_COUNT_LONG:
            # Short form: retention bits share the same byte as the count.
            for bit_pos in range(5):
                retain_segments.append(bit_set(bit_pos, flags))
        else:
            # Long form: a 4-byte dword (low 29 bits = count) followed by
            # ceil((count + 1) / 8) retention bytes.
            field += self.stream.read(3)
            ref_count = unpack_int(">L", field)
            ref_count = masked_value(REF_COUNT_LONG_MASK, ref_count)
            ret_bytes_count = int(math.ceil((ref_count + 1) / 8))
            for ret_byte_index in range(ret_bytes_count):
                ret_byte = unpack_int(">B", self.stream.read(1))
                # NOTE(review): only 7 of the 8 bits per retention byte are
                # examined here; the format packs 8 per byte -- confirm.
                for bit_pos in range(7):
                    retain_segments.append(bit_set(bit_pos, ret_byte))

        # Referred-to segment numbers are sized by this segment's own number.
        # NOTE(review): ">I" and ">L" are both 4 bytes in struct, so the
        # middle branch reads the same width as the last -- confirm whether
        # ">H" (2 bytes) was intended for the 65536 case.
        seg_num = segment["number"]
        assert isinstance(seg_num, int)
        if seg_num <= 256:
            ref_format = ">B"
        elif seg_num <= 65536:
            ref_format = ">I"
        else:
            ref_format = ">L"

        ref_size = calcsize(ref_format)

        for ref_index in range(ref_count):
            ref_data = self.stream.read(ref_size)
            ref = unpack_int(ref_format, ref_data)
            ref_segments.append(ref)

        return {
            "ref_count": ref_count,
            "retain_segments": retain_segments,
            "ref_segments": ref_segments,
        }

    def parse_page_assoc(self, segment: JBIG2Segment, page: int, field: bytes) -> int:
        """Re-read the page association as 4 bytes when the long flag is set."""
        if cast(JBIG2SegmentFlags, segment["flags"])["page_assoc_long"]:
            field += self.stream.read(3)
            page = unpack_int(">L", field)
        return page

    def parse_data_length(
        self,
        segment: JBIG2Segment,
        length: int,
        field: bytes,
    ) -> int:
        """Read the segment payload into ``segment["raw_data"]``.

        Raises NotImplementedError for immediate generic regions with an
        unknown (0xFFFFFFFF) data length.
        """
        if length:
            if (
                cast(JBIG2SegmentFlags, segment["flags"])["type"]
                == SEG_TYPE_IMMEDIATE_GEN_REGION
            ) and (length == DATA_LEN_UNKNOWN):
                raise NotImplementedError(
                    "Working with unknown segment length is not implemented yet",
                )
            else:
                segment["raw_data"] = self.stream.read(length)

        return length
|
191 |
+
|
192 |
+
|
193 |
+
class JBIG2StreamWriter:
    """Write JBIG2 segments to a file in JBIG2 format"""

    # Retention-flags value used for synthesized EOP/EOF segments:
    # no referred-to segments at all.
    EMPTY_RETENTION_FLAGS: JBIG2RetentionFlags = {
        "ref_count": 0,
        "ref_segments": cast(List[int], []),
        "retain_segments": cast(List[bool], []),
    }

    def __init__(self, stream: BinaryIO) -> None:
        # Writable binary file-like object.
        self.stream = stream

    def write_segments(
        self,
        segments: Iterable[JBIG2Segment],
        fix_last_page: bool = True,
    ) -> int:
        """Encode and write *segments*; return the number of bytes written.

        When *fix_last_page* is true and the last page seen was not closed
        by an end-of-page segment, a synthetic end-of-page segment is
        appended (embedded JBIG2 streams in PDFs commonly omit it).
        """
        data_len = 0
        current_page: Optional[int] = None
        seg_num: Optional[int] = None

        for segment in segments:
            data = self.encode_segment(segment)
            self.stream.write(data)
            data_len += len(data)

            seg_num = cast(Optional[int], segment["number"])

            if fix_last_page:
                seg_page = cast(int, segment.get("page_assoc"))

                if (
                    cast(JBIG2SegmentFlags, segment["flags"])["type"]
                    == SEG_TYPE_END_OF_PAGE
                ):
                    current_page = None
                elif seg_page:
                    current_page = seg_page

        if fix_last_page and current_page and (seg_num is not None):
            segment = self.get_eop_segment(seg_num + 1, current_page)
            data = self.encode_segment(segment)
            self.stream.write(data)
            data_len += len(data)

        return data_len

    def write_file(
        self,
        segments: Iterable[JBIG2Segment],
        fix_last_page: bool = True,
    ) -> int:
        """Write a complete JBIG2 file: header, *segments*, EOF segment.

        Returns the total number of bytes written.
        """
        header = FILE_HEADER_ID
        header_flags = FILE_HEAD_FLAG_SEQUENTIAL
        header += pack(">B", header_flags)
        # The embedded JBIG2 files in a PDF always
        # only have one page
        number_of_pages = pack(">L", 1)
        header += number_of_pages
        self.stream.write(header)
        data_len = len(header)

        # Materialize the iterable: it is consumed once by write_segments
        # and inspected again below to find the last segment number.
        # (Previously a one-shot iterator was exhausted after
        # write_segments, leaving seg_num at 0 and producing a wrong EOF
        # segment number.)
        seg_list = list(segments)

        data_len += self.write_segments(seg_list, fix_last_page)

        seg_num = 0
        for segment in seg_list:
            seg_num = cast(int, segment["number"])

        # write_segments may have appended an end-of-page segment numbered
        # seg_num + 1, so the EOF segment is offset by 2 in that case.
        if fix_last_page:
            seg_num_offset = 2
        else:
            seg_num_offset = 1
        eof_segment = self.get_eof_segment(seg_num + seg_num_offset)
        data = self.encode_segment(eof_segment)

        self.stream.write(data)
        data_len += len(data)

        return data_len

    def encode_segment(self, segment: JBIG2Segment) -> bytes:
        """Encode one segment (header and data) to bytes.

        Fields with a matching ``encode_<name>`` method are delegated to
        it; the rest are packed with their SEG_STRUCT format.
        """
        data = b""
        for field_format, name in SEG_STRUCT:
            value = segment.get(name)
            encoder = getattr(self, "encode_%s" % name, None)
            if callable(encoder):
                field = encoder(value, segment)
            else:
                field = pack(field_format, value)
            data += field
        return data

    def encode_flags(self, value: JBIG2SegmentFlags, segment: JBIG2Segment) -> bytes:
        """Encode the one-byte segment header flags field."""
        flags = 0
        if value.get("deferred"):
            flags |= HEADER_FLAG_DEFERRED

        if "page_assoc_long" in value:
            if value["page_assoc_long"]:
                flags |= HEADER_FLAG_PAGE_ASSOC_LONG
        # NOTE(review): this fallback reads a "page" key although segments
        # built by JBIG2StreamReader use "page_assoc" -- confirm intended.
        elif cast(int, segment.get("page", 0)) > 255:
            flags |= HEADER_FLAG_PAGE_ASSOC_LONG

        flags |= mask_value(SEG_TYPE_MASK, value["type"])

        return pack(">B", flags)

    def encode_retention_flags(
        self,
        value: JBIG2RetentionFlags,
        segment: JBIG2Segment,
    ) -> bytes:
        """Encode the referred-to segment count, retention bits and the
        referred-to segment numbers.
        """
        flags = []
        flags_format = ">B"
        ref_count = value["ref_count"]
        assert isinstance(ref_count, int)
        retain_segments = cast(List[bool], value.get("retain_segments", []))

        if ref_count <= 4:
            # Short form: count in the top 3 bits, retention bits below.
            flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count)
            for ref_index, ref_retain in enumerate(retain_segments):
                if ref_retain:
                    flags_byte |= 1 << ref_index
            flags.append(flags_byte)
        else:
            # Long form: marker dword, then ceil((count + 1) / 8) retention
            # bytes.  The low 29 bits of the dword carry the actual count
            # (this is where parse_retention_flags reads it back from;
            # previously the count was omitted, encoding it as 0).
            bytes_count = math.ceil((ref_count + 1) / 8)
            flags_format = ">L" + ("B" * bytes_count)
            flags_dword = (
                mask_value(REF_COUNT_SHORT_MASK, REF_COUNT_LONG) << 24
            ) | mask_value(REF_COUNT_LONG_MASK, ref_count)
            flags.append(flags_dword)

            for byte_index in range(bytes_count):
                ret_byte = 0
                ret_part = retain_segments[byte_index * 8 : byte_index * 8 + 8]
                for bit_pos, ret_seg in enumerate(ret_part):
                    if ret_seg:
                        ret_byte |= 1 << bit_pos

                flags.append(ret_byte)

        ref_segments = cast(List[int], value.get("ref_segments", []))

        # Referred-to segment numbers are sized by this segment's own number.
        # NOTE(review): "I" and "L" are both 4 bytes in struct -- confirm
        # whether "H" was intended for the 65536 case (matches the reader).
        seg_num = cast(int, segment["number"])
        if seg_num <= 256:
            ref_format = "B"
        elif seg_num <= 65536:
            ref_format = "I"
        else:
            ref_format = "L"

        for ref in ref_segments:
            flags_format += ref_format
            flags.append(ref)

        return pack(flags_format, *flags)

    def encode_data_length(self, value: int, segment: JBIG2Segment) -> bytes:
        """Encode the 4-byte data length followed by the raw payload."""
        data = pack(">L", value)
        data += cast(bytes, segment["raw_data"])
        return data

    def get_eop_segment(self, seg_number: int, page_number: int) -> JBIG2Segment:
        """Build a synthetic end-of-page segment for *page_number*."""
        return {
            "data_length": 0,
            "flags": {"deferred": False, "type": SEG_TYPE_END_OF_PAGE},
            "number": seg_number,
            "page_assoc": page_number,
            "raw_data": b"",
            "retention_flags": JBIG2StreamWriter.EMPTY_RETENTION_FLAGS,
        }

    def get_eof_segment(self, seg_number: int) -> JBIG2Segment:
        """Build a synthetic end-of-file segment."""
        return {
            "data_length": 0,
            "flags": {"deferred": False, "type": SEG_TYPE_END_OF_FILE},
            "number": seg_number,
            "page_assoc": 0,
            "raw_data": b"",
            "retention_flags": JBIG2StreamWriter.EMPTY_RETENTION_FLAGS,
        }
|
pdf2zh/latin_enc.py
ADDED
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Standard encoding tables used in PDF.
|
2 |
+
|
3 |
+
This table is extracted from PDF Reference Manual 1.6, pp.925
|
4 |
+
"D.1 Latin Character Set and Encodings"
|
5 |
+
|
6 |
+
"""
|
7 |
+
|
8 |
+
from typing import List, Optional, Tuple
|
9 |
+
|
10 |
+
EncodingRow = Tuple[str, Optional[int], Optional[int], Optional[int], Optional[int]]
|
11 |
+
|
12 |
+
ENCODING: List[EncodingRow] = [
|
13 |
+
# (name, std, mac, win, pdf)
|
14 |
+
("A", 65, 65, 65, 65),
|
15 |
+
("AE", 225, 174, 198, 198),
|
16 |
+
("Aacute", None, 231, 193, 193),
|
17 |
+
("Acircumflex", None, 229, 194, 194),
|
18 |
+
("Adieresis", None, 128, 196, 196),
|
19 |
+
("Agrave", None, 203, 192, 192),
|
20 |
+
("Aring", None, 129, 197, 197),
|
21 |
+
("Atilde", None, 204, 195, 195),
|
22 |
+
("B", 66, 66, 66, 66),
|
23 |
+
("C", 67, 67, 67, 67),
|
24 |
+
("Ccedilla", None, 130, 199, 199),
|
25 |
+
("D", 68, 68, 68, 68),
|
26 |
+
("E", 69, 69, 69, 69),
|
27 |
+
("Eacute", None, 131, 201, 201),
|
28 |
+
("Ecircumflex", None, 230, 202, 202),
|
29 |
+
("Edieresis", None, 232, 203, 203),
|
30 |
+
("Egrave", None, 233, 200, 200),
|
31 |
+
("Eth", None, None, 208, 208),
|
32 |
+
("Euro", None, None, 128, 160),
|
33 |
+
("F", 70, 70, 70, 70),
|
34 |
+
("G", 71, 71, 71, 71),
|
35 |
+
("H", 72, 72, 72, 72),
|
36 |
+
("I", 73, 73, 73, 73),
|
37 |
+
("Iacute", None, 234, 205, 205),
|
38 |
+
("Icircumflex", None, 235, 206, 206),
|
39 |
+
("Idieresis", None, 236, 207, 207),
|
40 |
+
("Igrave", None, 237, 204, 204),
|
41 |
+
("J", 74, 74, 74, 74),
|
42 |
+
("K", 75, 75, 75, 75),
|
43 |
+
("L", 76, 76, 76, 76),
|
44 |
+
("Lslash", 232, None, None, 149),
|
45 |
+
("M", 77, 77, 77, 77),
|
46 |
+
("N", 78, 78, 78, 78),
|
47 |
+
("Ntilde", None, 132, 209, 209),
|
48 |
+
("O", 79, 79, 79, 79),
|
49 |
+
("OE", 234, 206, 140, 150),
|
50 |
+
("Oacute", None, 238, 211, 211),
|
51 |
+
("Ocircumflex", None, 239, 212, 212),
|
52 |
+
("Odieresis", None, 133, 214, 214),
|
53 |
+
("Ograve", None, 241, 210, 210),
|
54 |
+
("Oslash", 233, 175, 216, 216),
|
55 |
+
("Otilde", None, 205, 213, 213),
|
56 |
+
("P", 80, 80, 80, 80),
|
57 |
+
("Q", 81, 81, 81, 81),
|
58 |
+
("R", 82, 82, 82, 82),
|
59 |
+
("S", 83, 83, 83, 83),
|
60 |
+
("Scaron", None, None, 138, 151),
|
61 |
+
("T", 84, 84, 84, 84),
|
62 |
+
("Thorn", None, None, 222, 222),
|
63 |
+
("U", 85, 85, 85, 85),
|
64 |
+
("Uacute", None, 242, 218, 218),
|
65 |
+
("Ucircumflex", None, 243, 219, 219),
|
66 |
+
("Udieresis", None, 134, 220, 220),
|
67 |
+
("Ugrave", None, 244, 217, 217),
|
68 |
+
("V", 86, 86, 86, 86),
|
69 |
+
("W", 87, 87, 87, 87),
|
70 |
+
("X", 88, 88, 88, 88),
|
71 |
+
("Y", 89, 89, 89, 89),
|
72 |
+
("Yacute", None, None, 221, 221),
|
73 |
+
("Ydieresis", None, 217, 159, 152),
|
74 |
+
("Z", 90, 90, 90, 90),
|
75 |
+
("Zcaron", None, None, 142, 153),
|
76 |
+
("a", 97, 97, 97, 97),
|
77 |
+
("aacute", None, 135, 225, 225),
|
78 |
+
("acircumflex", None, 137, 226, 226),
|
79 |
+
("acute", 194, 171, 180, 180),
|
80 |
+
("adieresis", None, 138, 228, 228),
|
81 |
+
("ae", 241, 190, 230, 230),
|
82 |
+
("agrave", None, 136, 224, 224),
|
83 |
+
("ampersand", 38, 38, 38, 38),
|
84 |
+
("aring", None, 140, 229, 229),
|
85 |
+
("asciicircum", 94, 94, 94, 94),
|
86 |
+
("asciitilde", 126, 126, 126, 126),
|
87 |
+
("asterisk", 42, 42, 42, 42),
|
88 |
+
("at", 64, 64, 64, 64),
|
89 |
+
("atilde", None, 139, 227, 227),
|
90 |
+
("b", 98, 98, 98, 98),
|
91 |
+
("backslash", 92, 92, 92, 92),
|
92 |
+
("bar", 124, 124, 124, 124),
|
93 |
+
("braceleft", 123, 123, 123, 123),
|
94 |
+
("braceright", 125, 125, 125, 125),
|
95 |
+
("bracketleft", 91, 91, 91, 91),
|
96 |
+
("bracketright", 93, 93, 93, 93),
|
97 |
+
("breve", 198, 249, None, 24),
|
98 |
+
("brokenbar", None, None, 166, 166),
|
99 |
+
("bullet", 183, 165, 149, 128),
|
100 |
+
("c", 99, 99, 99, 99),
|
101 |
+
("caron", 207, 255, None, 25),
|
102 |
+
("ccedilla", None, 141, 231, 231),
|
103 |
+
("cedilla", 203, 252, 184, 184),
|
104 |
+
("cent", 162, 162, 162, 162),
|
105 |
+
("circumflex", 195, 246, 136, 26),
|
106 |
+
("colon", 58, 58, 58, 58),
|
107 |
+
("comma", 44, 44, 44, 44),
|
108 |
+
("copyright", None, 169, 169, 169),
|
109 |
+
("currency", 168, 219, 164, 164),
|
110 |
+
("d", 100, 100, 100, 100),
|
111 |
+
("dagger", 178, 160, 134, 129),
|
112 |
+
("daggerdbl", 179, 224, 135, 130),
|
113 |
+
("degree", None, 161, 176, 176),
|
114 |
+
("dieresis", 200, 172, 168, 168),
|
115 |
+
("divide", None, 214, 247, 247),
|
116 |
+
("dollar", 36, 36, 36, 36),
|
117 |
+
("dotaccent", 199, 250, None, 27),
|
118 |
+
("dotlessi", 245, 245, None, 154),
|
119 |
+
("e", 101, 101, 101, 101),
|
120 |
+
("eacute", None, 142, 233, 233),
|
121 |
+
("ecircumflex", None, 144, 234, 234),
|
122 |
+
("edieresis", None, 145, 235, 235),
|
123 |
+
("egrave", None, 143, 232, 232),
|
124 |
+
("eight", 56, 56, 56, 56),
|
125 |
+
("ellipsis", 188, 201, 133, 131),
|
126 |
+
("emdash", 208, 209, 151, 132),
|
127 |
+
("endash", 177, 208, 150, 133),
|
128 |
+
("equal", 61, 61, 61, 61),
|
129 |
+
("eth", None, None, 240, 240),
|
130 |
+
("exclam", 33, 33, 33, 33),
|
131 |
+
("exclamdown", 161, 193, 161, 161),
|
132 |
+
("f", 102, 102, 102, 102),
|
133 |
+
("fi", 174, 222, None, 147),
|
134 |
+
("five", 53, 53, 53, 53),
|
135 |
+
("fl", 175, 223, None, 148),
|
136 |
+
("florin", 166, 196, 131, 134),
|
137 |
+
("four", 52, 52, 52, 52),
|
138 |
+
("fraction", 164, 218, None, 135),
|
139 |
+
("g", 103, 103, 103, 103),
|
140 |
+
("germandbls", 251, 167, 223, 223),
|
141 |
+
("grave", 193, 96, 96, 96),
|
142 |
+
("greater", 62, 62, 62, 62),
|
143 |
+
("guillemotleft", 171, 199, 171, 171),
|
144 |
+
("guillemotright", 187, 200, 187, 187),
|
145 |
+
("guilsinglleft", 172, 220, 139, 136),
|
146 |
+
("guilsinglright", 173, 221, 155, 137),
|
147 |
+
("h", 104, 104, 104, 104),
|
148 |
+
("hungarumlaut", 205, 253, None, 28),
|
149 |
+
("hyphen", 45, 45, 45, 45),
|
150 |
+
("i", 105, 105, 105, 105),
|
151 |
+
("iacute", None, 146, 237, 237),
|
152 |
+
("icircumflex", None, 148, 238, 238),
|
153 |
+
("idieresis", None, 149, 239, 239),
|
154 |
+
("igrave", None, 147, 236, 236),
|
155 |
+
("j", 106, 106, 106, 106),
|
156 |
+
("k", 107, 107, 107, 107),
|
157 |
+
("l", 108, 108, 108, 108),
|
158 |
+
("less", 60, 60, 60, 60),
|
159 |
+
("logicalnot", None, 194, 172, 172),
|
160 |
+
("lslash", 248, None, None, 155),
|
161 |
+
("m", 109, 109, 109, 109),
|
162 |
+
("macron", 197, 248, 175, 175),
|
163 |
+
("minus", None, None, None, 138),
|
164 |
+
("mu", None, 181, 181, 181),
|
165 |
+
("multiply", None, None, 215, 215),
|
166 |
+
("n", 110, 110, 110, 110),
|
167 |
+
("nbspace", None, 202, 160, None),
|
168 |
+
("nine", 57, 57, 57, 57),
|
169 |
+
("ntilde", None, 150, 241, 241),
|
170 |
+
("numbersign", 35, 35, 35, 35),
|
171 |
+
("o", 111, 111, 111, 111),
|
172 |
+
("oacute", None, 151, 243, 243),
|
173 |
+
("ocircumflex", None, 153, 244, 244),
|
174 |
+
("odieresis", None, 154, 246, 246),
|
175 |
+
("oe", 250, 207, 156, 156),
|
176 |
+
("ogonek", 206, 254, None, 29),
|
177 |
+
("ograve", None, 152, 242, 242),
|
178 |
+
("one", 49, 49, 49, 49),
|
179 |
+
("onehalf", None, None, 189, 189),
|
180 |
+
("onequarter", None, None, 188, 188),
|
181 |
+
("onesuperior", None, None, 185, 185),
|
182 |
+
("ordfeminine", 227, 187, 170, 170),
|
183 |
+
("ordmasculine", 235, 188, 186, 186),
|
184 |
+
("oslash", 249, 191, 248, 248),
|
185 |
+
("otilde", None, 155, 245, 245),
|
186 |
+
("p", 112, 112, 112, 112),
|
187 |
+
("paragraph", 182, 166, 182, 182),
|
188 |
+
("parenleft", 40, 40, 40, 40),
|
189 |
+
("parenright", 41, 41, 41, 41),
|
190 |
+
("percent", 37, 37, 37, 37),
|
191 |
+
("period", 46, 46, 46, 46),
|
192 |
+
("periodcentered", 180, 225, 183, 183),
|
193 |
+
("perthousand", 189, 228, 137, 139),
|
194 |
+
("plus", 43, 43, 43, 43),
|
195 |
+
("plusminus", None, 177, 177, 177),
|
196 |
+
("q", 113, 113, 113, 113),
|
197 |
+
("question", 63, 63, 63, 63),
|
198 |
+
("questiondown", 191, 192, 191, 191),
|
199 |
+
("quotedbl", 34, 34, 34, 34),
|
200 |
+
("quotedblbase", 185, 227, 132, 140),
|
201 |
+
("quotedblleft", 170, 210, 147, 141),
|
202 |
+
("quotedblright", 186, 211, 148, 142),
|
203 |
+
("quoteleft", 96, 212, 145, 143),
|
204 |
+
("quoteright", 39, 213, 146, 144),
|
205 |
+
("quotesinglbase", 184, 226, 130, 145),
|
206 |
+
("quotesingle", 169, 39, 39, 39),
|
207 |
+
("r", 114, 114, 114, 114),
|
208 |
+
("registered", None, 168, 174, 174),
|
209 |
+
("ring", 202, 251, None, 30),
|
210 |
+
("s", 115, 115, 115, 115),
|
211 |
+
("scaron", None, None, 154, 157),
|
212 |
+
("section", 167, 164, 167, 167),
|
213 |
+
("semicolon", 59, 59, 59, 59),
|
214 |
+
("seven", 55, 55, 55, 55),
|
215 |
+
("six", 54, 54, 54, 54),
|
216 |
+
("slash", 47, 47, 47, 47),
|
217 |
+
("space", 32, 32, 32, 32),
|
218 |
+
("space", None, 202, 160, None),
|
219 |
+
("space", None, 202, 173, None),
|
220 |
+
("sterling", 163, 163, 163, 163),
|
221 |
+
("t", 116, 116, 116, 116),
|
222 |
+
("thorn", None, None, 254, 254),
|
223 |
+
("three", 51, 51, 51, 51),
|
224 |
+
("threequarters", None, None, 190, 190),
|
225 |
+
("threesuperior", None, None, 179, 179),
|
226 |
+
("tilde", 196, 247, 152, 31),
|
227 |
+
("trademark", None, 170, 153, 146),
|
228 |
+
("two", 50, 50, 50, 50),
|
229 |
+
("twosuperior", None, None, 178, 178),
|
230 |
+
("u", 117, 117, 117, 117),
|
231 |
+
("uacute", None, 156, 250, 250),
|
232 |
+
("ucircumflex", None, 158, 251, 251),
|
233 |
+
("udieresis", None, 159, 252, 252),
|
234 |
+
("ugrave", None, 157, 249, 249),
|
235 |
+
("underscore", 95, 95, 95, 95),
|
236 |
+
("v", 118, 118, 118, 118),
|
237 |
+
("w", 119, 119, 119, 119),
|
238 |
+
("x", 120, 120, 120, 120),
|
239 |
+
("y", 121, 121, 121, 121),
|
240 |
+
("yacute", None, None, 253, 253),
|
241 |
+
("ydieresis", None, 216, 255, 255),
|
242 |
+
("yen", 165, 180, 165, 165),
|
243 |
+
("z", 122, 122, 122, 122),
|
244 |
+
("zcaron", None, None, 158, 158),
|
245 |
+
("zero", 48, 48, 48, 48),
|
246 |
+
]
|
pdf2zh/layout.py
ADDED
@@ -0,0 +1,993 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import heapq
|
2 |
+
import logging
|
3 |
+
from typing import (
|
4 |
+
Dict,
|
5 |
+
Generic,
|
6 |
+
Iterable,
|
7 |
+
Iterator,
|
8 |
+
List,
|
9 |
+
Optional,
|
10 |
+
Sequence,
|
11 |
+
Set,
|
12 |
+
Tuple,
|
13 |
+
TypeVar,
|
14 |
+
Union,
|
15 |
+
cast,
|
16 |
+
)
|
17 |
+
|
18 |
+
from pdf2zh.pdfcolor import PDFColorSpace
|
19 |
+
from pdf2zh.pdfexceptions import PDFTypeError, PDFValueError
|
20 |
+
from pdf2zh.pdffont import PDFFont
|
21 |
+
from pdf2zh.pdfinterp import Color, PDFGraphicState
|
22 |
+
from pdf2zh.pdftypes import PDFStream
|
23 |
+
from pdf2zh.utils import (
|
24 |
+
INF,
|
25 |
+
LTComponentT,
|
26 |
+
Matrix,
|
27 |
+
PathSegment,
|
28 |
+
Plane,
|
29 |
+
Point,
|
30 |
+
Rect,
|
31 |
+
apply_matrix_pt,
|
32 |
+
bbox2str,
|
33 |
+
fsplit,
|
34 |
+
get_bound,
|
35 |
+
matrix2str,
|
36 |
+
uniq,
|
37 |
+
)
|
38 |
+
|
39 |
+
logger = logging.getLogger(__name__)
|
40 |
+
|
41 |
+
|
42 |
+
class IndexAssigner:
    """Walk a layout tree and hand out sequential indices to text boxes."""

    def __init__(self, index: int = 0) -> None:
        # The next index that will be assigned.
        self.index = index

    def run(self, obj: "LTItem") -> None:
        """Give *obj* the next index if it is a text box; recurse into
        text groups; ignore every other item type."""
        if isinstance(obj, LTTextBox):
            obj.index = self.index
            self.index += 1
        elif isinstance(obj, LTTextGroup):
            for child in obj:
                self.run(child)
|
53 |
+
|
54 |
+
|
55 |
+
class LAParams:
    """Tuning knobs for layout analysis.

    :param line_overlap: Two characters whose vertical overlap exceeds
        this value (relative to the smaller character height) are treated
        as being on the same line.
    :param char_margin: Characters closer together than this margin
        (relative to character width) are merged into the same line.
    :param word_margin: Characters on one line further apart than this
        margin (relative to character width) are split into separate
        words, and an artificial space is inserted for readability.
    :param line_margin: Lines closer together than this margin (relative
        to line height) are grouped into the same paragraph.
    :param boxes_flow: How much the horizontal vs. vertical position of
        text matters when ordering text boxes; must lie in [-1.0, +1.0]
        (-1.0: only horizontal position matters, +1.0: only vertical
        position matters).  Pass ``None`` to disable advanced layout
        analysis and order text by the bottom-left corner of its box.
    :param detect_vertical: Whether vertical text is considered during
        layout analysis.
    :param all_texts: Whether layout analysis also runs on text inside
        figures.
    """

    def __init__(
        self,
        line_overlap: float = 0.5,
        char_margin: float = 2.0,
        line_margin: float = 0.5,
        word_margin: float = 0.1,
        boxes_flow: Optional[float] = 0.5,
        detect_vertical: bool = False,
        all_texts: bool = False,
    ) -> None:
        self.line_overlap = line_overlap
        self.char_margin = char_margin
        self.line_margin = line_margin
        self.word_margin = word_margin
        self.boxes_flow = boxes_flow
        self.detect_vertical = detect_vertical
        self.all_texts = all_texts

        self._validate()

    def _validate(self) -> None:
        # Only boxes_flow needs validation; None is an accepted sentinel.
        if self.boxes_flow is None:
            return
        message = "LAParam boxes_flow should be None, or a number between -1 and +1"
        if not isinstance(self.boxes_flow, (int, float)):
            raise PDFTypeError(message)
        if not -1 <= self.boxes_flow <= 1:
            raise PDFValueError(message)

    def __repr__(self) -> str:
        return (
            f"<LAParams: char_margin={self.char_margin:.1f}, "
            f"line_margin={self.line_margin:.1f}, "
            f"word_margin={self.word_margin:.1f} "
            f"all_texts={self.all_texts!r}>"
        )
|
121 |
+
|
122 |
+
|
123 |
+
class LTItem:
    """Interface for things that can be analyzed"""

    def analyze(self, laparams: LAParams) -> None:
        """Perform the layout analysis."""
        # Intentionally a no-op in the base interface.
|
128 |
+
|
129 |
+
|
130 |
+
class LTText:
    """Interface for layout objects that carry extractable text."""

    def __repr__(self) -> str:
        return "<%s %r>" % (self.__class__.__name__, self.get_text())

    def get_text(self) -> str:
        """Return the text contained in this object."""
        raise NotImplementedError
|
139 |
+
|
140 |
+
|
141 |
+
class LTComponent(LTItem):
    """Layout object occupying a rectangular bounding box."""

    def __init__(self, bbox: Rect) -> None:
        LTItem.__init__(self)
        self.set_bbox(bbox)

    def __repr__(self) -> str:
        return "<%s %s>" % (self.__class__.__name__, bbox2str(self.bbox))

    # Rich comparisons are deliberately disabled: ordering layout objects
    # directly is almost always a mistake.
    def __lt__(self, _: object) -> bool:
        raise PDFValueError

    def __le__(self, _: object) -> bool:
        raise PDFValueError

    def __gt__(self, _: object) -> bool:
        raise PDFValueError

    def __ge__(self, _: object) -> bool:
        raise PDFValueError

    def set_bbox(self, bbox: Rect) -> None:
        """Store *bbox* and derive the x0/y0/x1/y1, width and height attributes."""
        x0, y0, x1, y1 = bbox
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
        self.width = x1 - x0
        self.height = y1 - y0
        self.bbox = bbox

    def is_empty(self) -> bool:
        """True when the box has no positive area."""
        return self.width <= 0 or self.height <= 0

    def is_hoverlap(self, obj: "LTComponent") -> bool:
        """True when the horizontal extents of the two boxes intersect."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return obj.x0 <= self.x1 and self.x0 <= obj.x1

    def hdistance(self, obj: "LTComponent") -> float:
        """Horizontal gap between the two boxes (0 when they overlap)."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if self.is_hoverlap(obj):
            return 0
        return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))

    def hoverlap(self, obj: "LTComponent") -> float:
        """Width of the horizontal overlap (0 when the boxes are disjoint)."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_hoverlap(obj):
            return 0
        return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))

    def is_voverlap(self, obj: "LTComponent") -> bool:
        """True when the vertical extents of the two boxes intersect."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return obj.y0 <= self.y1 and self.y0 <= obj.y1

    def vdistance(self, obj: "LTComponent") -> float:
        """Vertical gap between the two boxes (0 when they overlap)."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if self.is_voverlap(obj):
            return 0
        return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))

    def voverlap(self, obj: "LTComponent") -> float:
        """Height of the vertical overlap (0 when the boxes are disjoint)."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_voverlap(obj):
            return 0
        return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))
|
212 |
+
|
213 |
+
|
214 |
+
class LTCurve(LTComponent):
    """A generic Bezier curve.

    `original_path` carries the raw pathing information from the PDF
    (e.g. for reconstructing Bezier curves); `dashing_style` carries the
    dashing information, if any.
    """

    def __init__(
        self,
        linewidth: float,
        pts: List[Point],
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Optional[Color] = None,
        non_stroking_color: Optional[Color] = None,
        original_path: Optional[List[PathSegment]] = None,
        dashing_style: Optional[Tuple[object, object]] = None,
    ) -> None:
        # The bounding box is the tight bound of all control points.
        LTComponent.__init__(self, get_bound(pts))
        self.pts = pts
        self.linewidth = linewidth
        self.stroke = stroke
        self.fill = fill
        self.evenodd = evenodd
        self.stroking_color = stroking_color
        self.non_stroking_color = non_stroking_color
        self.original_path = original_path
        self.dashing_style = dashing_style

    def get_pts(self) -> str:
        """Serialize the control points as comma-joined "x,y" pairs."""
        return ",".join(f"{px:.3f},{py:.3f}" for (px, py) in self.pts)
|
248 |
+
|
249 |
+
|
250 |
+
class LTLine(LTCurve):
    """A single straight line.

    Could be used for separating text or figures.
    """

    def __init__(
        self,
        linewidth: float,
        p0: Point,
        p1: Point,
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Optional[Color] = None,
        non_stroking_color: Optional[Color] = None,
        original_path: Optional[List[PathSegment]] = None,
        dashing_style: Optional[Tuple[object, object]] = None,
    ) -> None:
        # A line is simply a two-point curve.
        LTCurve.__init__(
            self,
            linewidth,
            [p0, p1],
            stroke=stroke,
            fill=fill,
            evenodd=evenodd,
            stroking_color=stroking_color,
            non_stroking_color=non_stroking_color,
            original_path=original_path,
            dashing_style=dashing_style,
        )
|
281 |
+
|
282 |
+
|
283 |
+
class LTRect(LTCurve):
    """A rectangle.

    Could be used for framing another pictures or figures.
    """

    def __init__(
        self,
        linewidth: float,
        bbox: Rect,
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Optional[Color] = None,
        non_stroking_color: Optional[Color] = None,
        original_path: Optional[List[PathSegment]] = None,
        dashing_style: Optional[Tuple[object, object]] = None,
    ) -> None:
        (x0, y0, x1, y1) = bbox
        # Represent the rectangle as its four corners, counter-clockwise.
        corners = [(x0, y0), (x1, y0), (x1, y1), (x0, y1)]
        LTCurve.__init__(
            self,
            linewidth,
            corners,
            stroke=stroke,
            fill=fill,
            evenodd=evenodd,
            stroking_color=stroking_color,
            non_stroking_color=non_stroking_color,
            original_path=original_path,
            dashing_style=dashing_style,
        )
|
314 |
+
|
315 |
+
|
316 |
+
class LTImage(LTComponent):
    """An image object.

    Embedded images can be in JPEG, Bitmap or JBIG2.
    """

    def __init__(self, name: str, stream: PDFStream, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        self.name = name
        self.stream = stream
        self.srcsize = (stream.get_any(("W", "Width")), stream.get_any(("H", "Height")))
        self.imagemask = stream.get_any(("IM", "ImageMask"))
        self.bits = stream.get_any(("BPC", "BitsPerComponent"), 1)
        # Normalize the colorspace entry to a list so consumers can iterate.
        colorspace = stream.get_any(("CS", "ColorSpace"))
        if isinstance(colorspace, list):
            self.colorspace = colorspace
        else:
            self.colorspace = [colorspace]

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}({self.name}) {bbox2str(self.bbox)} {self.srcsize!r}>"
|
335 |
+
|
336 |
+
|
337 |
+
class LTAnno(LTItem, LTText):
    """A "virtual" character inserted by the layout analyzer.

    Unlike LTChar, an LTAnno has no actual boundaries: it is synthesized
    from the relationship between two characters (e.g. an implied space).
    """

    def __init__(self, text: str) -> None:
        self._text = text

    def get_text(self) -> str:
        return self._text
|
350 |
+
|
351 |
+
|
352 |
+
class LTChar(LTComponent, LTText):
    """Actual letter in the text as a Unicode string."""

    def __init__(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        text: str,
        textwidth: float,
        textdisp: Union[float, Tuple[Optional[float], float]],
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> None:
        LTText.__init__(self)
        self._text = text
        self.matrix = matrix
        self.font = font
        self.fontname = font.fontname
        self.ncs = ncs
        self.graphicstate = graphicstate
        self.adv = textwidth * fontsize * scaling
        # Compute the glyph's bounding rectangle in (unscaled) text space.
        if font.is_vertical():
            assert isinstance(textdisp, tuple)
            (vx, vy) = textdisp
            # A missing horizontal displacement defaults to half the font size.
            vx = fontsize * 0.5 if vx is None else vx * fontsize * 0.001
            vy = (1000 - vy) * fontsize * 0.001
            corner_a = (-vx, vy + rise + self.adv)
            corner_b = (-vx + fontsize, vy + rise)
        else:
            descent = 0  # descent = font.get_descent() * fontsize
            corner_a = (0, descent + rise)
            corner_b = (self.adv, descent + rise + fontsize)
        (a, b, c, d, e, f) = self.matrix
        # "Upright" text has a positive determinant-like orientation.
        self.upright = a * d * scaling > 0 and b * c <= 0
        (x0, y0) = apply_matrix_pt(self.matrix, corner_a)
        (x1, y1) = apply_matrix_pt(self.matrix, corner_b)
        # Normalize so (x0, y0) is the lower-left corner after transform.
        if x1 < x0:
            (x0, x1) = (x1, x0)
        if y1 < y0:
            (y0, y1) = (y1, y0)
        LTComponent.__init__(self, (x0, y0, x1, y1))
        # The "size" follows the writing direction.
        self.size = self.width if font.is_vertical() else self.height

    def __repr__(self) -> str:
        return "<{} {} matrix={} font={} adv={} text={}>".format(
            self.__class__.__name__,
            bbox2str(self.bbox),
            matrix2str(self.matrix),
            repr(self.fontname),
            self.adv,
            repr(self.get_text()),
        )

    def get_text(self) -> str:
        return self._text
|
419 |
+
|
420 |
+
|
421 |
+
LTItemT = TypeVar("LTItemT", bound=LTItem)
|
422 |
+
|
423 |
+
|
424 |
+
class LTContainer(LTComponent, Generic[LTItemT]):
    """Object that can be extended and analyzed."""

    def __init__(self, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        self._objs: List[LTItemT] = []

    def __iter__(self) -> Iterator[LTItemT]:
        return iter(self._objs)

    def __len__(self) -> int:
        return len(self._objs)

    def add(self, obj: LTItemT) -> None:
        """Append one child object."""
        self._objs.append(obj)

    def extend(self, objs: Iterable[LTItemT]) -> None:
        """Append every object in *objs*, routed through `add`."""
        for item in objs:
            self.add(item)

    def analyze(self, laparams: LAParams) -> None:
        """Run the layout analysis on every child."""
        for child in self._objs:
            child.analyze(laparams)
|
447 |
+
|
448 |
+
|
449 |
+
class LTExpandableContainer(LTContainer[LTItemT]):
    """Container whose bounding box grows to enclose whatever is added."""

    def __init__(self) -> None:
        # Start from an inverted (empty) bbox: the first child defines it.
        LTContainer.__init__(self, (+INF, +INF, -INF, -INF))

    # Incompatible override: we take an LTComponent (with bounding box), but
    # super() LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        LTContainer.add(self, cast(LTItemT, obj))
        expanded = (
            min(self.x0, obj.x0),
            min(self.y0, obj.y0),
            max(self.x1, obj.x1),
            max(self.y1, obj.y1),
        )
        self.set_bbox(expanded)
|
465 |
+
|
466 |
+
|
467 |
+
class LTTextContainer(LTExpandableContainer[LTItemT], LTText):
    """Expandable container that concatenates the text of its children."""

    def __init__(self) -> None:
        LTText.__init__(self)
        LTExpandableContainer.__init__(self)

    def get_text(self) -> str:
        pieces = (
            cast(LTText, child).get_text()
            for child in self
            if isinstance(child, LTText)
        )
        return "".join(pieces)
|
476 |
+
|
477 |
+
|
478 |
+
TextLineElement = Union[LTChar, LTAnno]
|
479 |
+
|
480 |
+
|
481 |
+
class LTTextLine(LTTextContainer[TextLineElement]):
    """Contains a list of LTChar objects that represent a single text line.

    The characters are aligned either horizontally or vertically, depending
    on the text's writing mode.
    """

    def __init__(self, word_margin: float) -> None:
        super().__init__()
        self.word_margin = word_margin

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} {bbox2str(self.bbox)} {self.get_text()!r}>"

    def analyze(self, laparams: LAParams) -> None:
        for child in self._objs:
            child.analyze(laparams)
        # Terminate every analyzed line with a virtual newline character.
        LTContainer.add(self, LTAnno("\n"))

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> List["LTTextLine"]:
        raise NotImplementedError

    def is_empty(self) -> bool:
        return super().is_empty() or self.get_text().isspace()
|
509 |
+
|
510 |
+
|
511 |
+
class LTTextLineHorizontal(LTTextLine):
    """Text line whose characters run left to right."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        self._x1: float = +INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        if isinstance(obj, LTChar) and self.word_margin:
            margin = self.word_margin * max(obj.width, obj.height)
            # A sufficiently large horizontal gap implies a word break.
            if self._x1 < obj.x0 - margin:
                LTContainer.add(self, LTAnno(" "))
        self._x1 = obj.x1
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> List[LTTextLine]:
        """Finds neighboring LTTextLineHorizontals in the plane.

        Returns a list of other LTTextLineHorizontals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned
        objects will be the same height as self, and also either left-,
        right-, or centrally-aligned.
        """
        d = ratio * self.height
        candidates = plane.find((self.x0, self.y0 - d, self.x1, self.y1 + d))
        neighbors = []
        for other in candidates:
            if not isinstance(other, LTTextLineHorizontal):
                continue
            if not self._is_same_height_as(other, tolerance=d):
                continue
            if (
                self._is_left_aligned_with(other, tolerance=d)
                or self._is_right_aligned_with(other, tolerance=d)
                or self._is_centrally_aligned_with(other, tolerance=d)
            ):
                neighbors.append(other)
        return neighbors

    def _is_left_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the left-hand edge of `other` is within `tolerance`."""
        return abs(other.x0 - self.x0) <= tolerance

    def _is_right_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the right-hand edge of `other` is within `tolerance`."""
        return abs(other.x1 - self.x1) <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the horizontal center of `other` is within `tolerance`."""
        return abs((other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance

    def _is_same_height_as(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether `other` has the same height, within `tolerance`."""
        return abs(other.height - self.height) <= tolerance
|
572 |
+
|
573 |
+
|
574 |
+
class LTTextLineVertical(LTTextLine):
    """Text line whose characters run top to bottom."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        self._y0: float = -INF

    # Incompatible override: we take an LTComponent (with bounding box), but
    # LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        if isinstance(obj, LTChar) and self.word_margin:
            margin = self.word_margin * max(obj.width, obj.height)
            # A sufficiently large vertical gap implies a word break.
            if obj.y1 + margin < self._y0:
                LTContainer.add(self, LTAnno(" "))
        self._y0 = obj.y0
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> List[LTTextLine]:
        """Finds neighboring LTTextLineVerticals in the plane.

        Returns a list of other LTTextLineVerticals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned
        objects will be the same width as self, and also either upper-,
        lower-, or centrally-aligned.
        """
        d = ratio * self.width
        objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
        return [
            obj
            for obj in objs
            if (
                isinstance(obj, LTTextLineVertical)
                and self._is_same_width_as(obj, tolerance=d)
                and (
                    self._is_lower_aligned_with(obj, tolerance=d)
                    or self._is_upper_aligned_with(obj, tolerance=d)
                    or self._is_centrally_aligned_with(obj, tolerance=d)
                )
            )
        ]

    def _is_lower_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the lower edge of `other` is within `tolerance`."""
        return abs(other.y0 - self.y0) <= tolerance

    def _is_upper_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the upper edge of `other` is within `tolerance`."""
        return abs(other.y1 - self.y1) <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the vertical center of `other` is within `tolerance`."""
        return abs((other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance

    # Consistency fix: give `tolerance` the same `= 0` default that every
    # other alignment helper (including LTTextLineHorizontal's
    # _is_same_height_as) already has.  Backward-compatible: all existing
    # callers pass tolerance explicitly.
    def _is_same_width_as(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether `other` has the same width, within `tolerance`."""
        return abs(other.width - self.width) <= tolerance
|
635 |
+
|
636 |
+
|
637 |
+
class LTTextBox(LTTextContainer[LTTextLine]):
    """Represents a group of text chunks in a rectangular area.

    Note that this box is created by geometric analysis and does not
    necessarily represents a logical boundary of the text. It contains a
    list of LTTextLine objects.
    """

    def __init__(self) -> None:
        LTTextContainer.__init__(self)
        # Reading-order index; -1 until assigned by the index assigner.
        self.index: int = -1

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}({self.index}) {bbox2str(self.bbox)} {self.get_text()!r}>"

    def get_writing_mode(self) -> str:
        raise NotImplementedError
|
654 |
+
|
655 |
+
|
656 |
+
class LTTextBoxHorizontal(LTTextBox):
    """Text box whose lines are read top to bottom."""

    def analyze(self, laparams: LAParams) -> None:
        super().analyze(laparams)
        # Sort the lines from top to bottom (descending upper edge).
        self._objs.sort(key=lambda line: -line.y1)

    def get_writing_mode(self) -> str:
        return "lr-tb"
|
663 |
+
|
664 |
+
|
665 |
+
class LTTextBoxVertical(LTTextBox):
    """Text box whose lines are read right to left."""

    def analyze(self, laparams: LAParams) -> None:
        super().analyze(laparams)
        # Sort the lines from right to left (descending right edge).
        self._objs.sort(key=lambda line: -line.x1)

    def get_writing_mode(self) -> str:
        return "tb-rl"
|
672 |
+
|
673 |
+
|
674 |
+
TextGroupElement = Union[LTTextBox, "LTTextGroup"]
|
675 |
+
|
676 |
+
|
677 |
+
class LTTextGroup(LTTextContainer[TextGroupElement]):
    """A hierarchical grouping of text boxes and/or nested groups."""

    def __init__(self, objs: Iterable[TextGroupElement]) -> None:
        super().__init__()
        self.extend(objs)
|
681 |
+
|
682 |
+
|
683 |
+
class LTTextGroupLRTB(LTTextGroup):
    """Text group ordered for left-to-right, top-to-bottom reading."""

    def analyze(self, laparams: LAParams) -> None:
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        flow = laparams.boxes_flow
        # Reorder the objects from top-left to bottom-right.
        self._objs.sort(
            key=lambda obj: (1 - flow) * obj.x0 - (1 + flow) * (obj.y0 + obj.y1),
        )
|
693 |
+
|
694 |
+
|
695 |
+
class LTTextGroupTBRL(LTTextGroup):
    """Text group ordered for top-to-bottom, right-to-left reading."""

    def analyze(self, laparams: LAParams) -> None:
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        flow = laparams.boxes_flow
        # Reorder the objects from top-right to bottom-left.
        self._objs.sort(
            key=lambda obj: -(1 + flow) * (obj.x0 + obj.x1) - (1 - flow) * obj.y1,
        )
|
705 |
+
|
706 |
+
|
707 |
+
class LTLayoutContainer(LTContainer[LTComponent]):
    """Container that performs the actual layout analysis: characters are
    grouped into lines, lines into boxes, and boxes into reading-order
    groups."""

    def __init__(self, bbox: Rect) -> None:
        LTContainer.__init__(self, bbox)
        # Top-level reading-order groups; populated by analyze() when
        # laparams.boxes_flow is not None.
        self.groups: Optional[List[LTTextGroup]] = None

    # group_objects: group text object to textlines.
    def group_objects(
        self,
        laparams: LAParams,
        objs: Iterable[LTComponent],
    ) -> Iterator[LTTextLine]:
        """Scan *objs* pairwise and yield text lines.

        Consecutive objects that are horizontally (or, with
        detect_vertical, vertically) aligned are accumulated into the
        current line; any break in alignment flushes the line.
        """
        obj0 = None
        line = None
        for obj1 in objs:
            if obj0 is not None:
                # halign: obj0 and obj1 is horizontally aligned.
                #
                #   +------+ - - -
                #   | obj0 | - - +------+   -
                #   |      |     | obj1 |   | (line_overlap)
                #   +------+ - - |      |   -
                #          - - - +------+
                #
                #          |<--->|
                #        (char_margin)
                halign = (
                    obj0.is_voverlap(obj1)
                    and min(obj0.height, obj1.height) * laparams.line_overlap
                    < obj0.voverlap(obj1)
                    and obj0.hdistance(obj1)
                    < max(obj0.width, obj1.width) * laparams.char_margin
                )

                # valign: obj0 and obj1 is vertically aligned.
                #
                #   +------+
                #   | obj0 |
                #   |      |
                #   +------+ - - -
                #     |    |     | (char_margin)
                #   +------+ - -
                #   | obj1 |
                #   |      |
                #   +------+
                #
                #   |<-->|
                # (line_overlap)
                valign = (
                    laparams.detect_vertical
                    and obj0.is_hoverlap(obj1)
                    and min(obj0.width, obj1.width) * laparams.line_overlap
                    < obj0.hoverlap(obj1)
                    and obj0.vdistance(obj1)
                    < max(obj0.height, obj1.height) * laparams.char_margin
                )

                if (halign and isinstance(line, LTTextLineHorizontal)) or (
                    valign and isinstance(line, LTTextLineVertical)
                ):
                    # obj1 continues the current line.
                    line.add(obj1)
                elif line is not None:
                    # Alignment broke: flush the current line.
                    yield line
                    line = None
                elif valign and not halign:
                    # Start a new vertical line from the pair (obj0, obj1).
                    line = LTTextLineVertical(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                elif halign and not valign:
                    # Start a new horizontal line from the pair (obj0, obj1).
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                else:
                    # No alignment at all: emit obj0 as a one-object line.
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    yield line
                    line = None
            obj0 = obj1
        if line is None:
            line = LTTextLineHorizontal(laparams.word_margin)
            assert obj0 is not None
            line.add(obj0)
        yield line

    def group_textlines(
        self,
        laparams: LAParams,
        lines: Iterable[LTTextLine],
    ) -> Iterator[LTTextBox]:
        """Group neighboring lines to textboxes"""
        plane: Plane[LTTextLine] = Plane(self.bbox)
        plane.extend(lines)
        # boxes maps each line to the box it currently belongs to.
        boxes: Dict[LTTextLine, LTTextBox] = {}
        for line in lines:
            neighbors = line.find_neighbors(plane, laparams.line_margin)
            members = [line]
            for obj1 in neighbors:
                members.append(obj1)
                if obj1 in boxes:
                    # Merge the neighbor's existing box into this one.
                    members.extend(boxes.pop(obj1))
            if isinstance(line, LTTextLineHorizontal):
                box: LTTextBox = LTTextBoxHorizontal()
            else:
                box = LTTextBoxVertical()
            for obj in uniq(members):
                box.add(obj)
                boxes[obj] = box
        # Emit each distinct non-empty box exactly once, in line order.
        done = set()
        for line in lines:
            if line not in boxes:
                continue
            box = boxes[line]
            if box in done:
                continue
            done.add(box)
            if not box.is_empty():
                yield box

    def group_textboxes(
        self,
        laparams: LAParams,
        boxes: Sequence[LTTextBox],
    ) -> List[LTTextGroup]:
        """Group textboxes hierarchically.

        Get pair-wise distances, via dist func defined below, and then merge
        from the closest textbox pair. Once obj1 and obj2 are merged /
        grouped, the resulting group is considered as a new object, and its
        distances to other objects & groups are added to the process queue.

        For performance reason, pair-wise distances and object pair info are
        maintained in a heap of (idx, dist, id(obj1), id(obj2), obj1, obj2)
        tuples. It ensures quick access to the smallest element. Note that
        since comparison operators, e.g., __lt__, are disabled for
        LTComponent, id(obj) has to appear before obj in element tuples.

        :param laparams: LAParams object.
        :param boxes: All textbox objects to be grouped.
        :return: a list that has only one element, the final top level group.
        """
        ElementT = Union[LTTextBox, LTTextGroup]
        plane: Plane[ElementT] = Plane(self.bbox)

        def dist(obj1: LTComponent, obj2: LTComponent) -> float:
            """A distance function between two TextBoxes.

            Consider the bounding rectangle for obj1 and obj2.
            Return its area less the areas of obj1 and obj2,
            shown as 'www' below. This value may be negative.
                    +------+..........+ (x1, y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0, y0) +..........+------+
            """
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            return (
                (x1 - x0) * (y1 - y0)
                - obj1.width * obj1.height
                - obj2.width * obj2.height
            )

        def isany(obj1: ElementT, obj2: ElementT) -> Set[ElementT]:
            """Check if there's any other object between obj1 and obj2."""
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            objs = set(plane.find((x0, y0, x1, y1)))
            return objs.difference((obj1, obj2))

        dists: List[Tuple[bool, float, int, int, ElementT, ElementT]] = []
        for i in range(len(boxes)):
            box1 = boxes[i]
            for j in range(i + 1, len(boxes)):
                box2 = boxes[j]
                dists.append((False, dist(box1, box2), id(box1), id(box2), box1, box2))
        heapq.heapify(dists)

        plane.extend(boxes)
        done = set()
        while len(dists) > 0:
            (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
            # Skip objects that are already merged
            if (id1 not in done) and (id2 not in done):
                # Defer merging when something lies between the pair; the
                # True flag marks the entry as already isany-checked.
                if not skip_isany and isany(obj1, obj2):
                    heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
                    continue
                if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(
                    obj2,
                    (LTTextBoxVertical, LTTextGroupTBRL),
                ):
                    group: LTTextGroup = LTTextGroupTBRL([obj1, obj2])
                else:
                    group = LTTextGroupLRTB([obj1, obj2])
                plane.remove(obj1)
                plane.remove(obj2)
                done.update([id1, id2])

                for other in plane:
                    heapq.heappush(
                        dists,
                        (False, dist(group, other), id(group), id(other), group, other),
                    )
                plane.add(group)
        # By now only groups are in the plane
        return list(cast(LTTextGroup, g) for g in plane)

    def analyze(self, laparams: LAParams) -> None:
        """Run the full layout analysis: chars -> lines -> boxes -> groups."""
        # textobjs is a list of LTChar objects, i.e.
        # it has all the individual characters in the page.
        (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
        for obj in otherobjs:
            obj.analyze(laparams)
        if not textobjs:
            return
        textlines = list(self.group_objects(laparams, textobjs))
        (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
        for obj in empties:
            obj.analyze(laparams)
        textboxes = list(self.group_textlines(laparams, textlines))
        if laparams.boxes_flow is None:
            # No flow-based ordering: sort boxes geometrically instead.
            for textbox in textboxes:
                textbox.analyze(laparams)

            def getkey(box: LTTextBox) -> Tuple[int, float, float]:
                # Vertical boxes first (right-to-left), then horizontal
                # boxes (top-to-bottom, left-to-right).
                if isinstance(box, LTTextBoxVertical):
                    return (0, -box.x1, -box.y0)
                else:
                    return (1, -box.y0, box.x0)

            textboxes.sort(key=getkey)
        else:
            # Flow-based ordering: build the group hierarchy and assign
            # reading-order indices.
            self.groups = self.group_textboxes(laparams, textboxes)
            assigner = IndexAssigner()
            for group in self.groups:
                group.analyze(laparams)
                assigner.run(group)
            textboxes.sort(key=lambda box: box.index)
        self._objs = (
            cast(List[LTComponent], textboxes)
            + otherobjs
            + cast(List[LTComponent], empties)
        )
|
953 |
+
|
954 |
+
|
955 |
+
class LTFigure(LTLayoutContainer):
    """Represents an area used by PDF Form objects.

    PDF Forms can be used to present figures or pictures by embedding yet
    another PDF document within a page. Note that LTFigure objects can
    appear recursively.
    """

    def __init__(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        self.name = name
        self.matrix = matrix
        (x, y, w, h) = bbox
        corners = ((x, y), (x + w, y), (x, y + h), (x + w, y + h))
        # Map every corner through the matrix and take the tight bound.
        transformed = get_bound(apply_matrix_pt(matrix, (p, q)) for (p, q) in corners)
        LTLayoutContainer.__init__(self, transformed)

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}({self.name}) {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)}>"

    def analyze(self, laparams: LAParams) -> None:
        # Text inside figures is only analyzed when explicitly requested.
        if not laparams.all_texts:
            return
        LTLayoutContainer.analyze(self, laparams)
|
978 |
+
|
979 |
+
|
980 |
+
class LTPage(LTLayoutContainer):
    """Represents an entire page.

    Like any other LTLayoutContainer, an LTPage can be iterated to obtain
    child objects like LTTextBox, LTFigure, LTImage, LTRect, LTCurve and
    LTLine.
    """

    def __init__(self, pageid: int, bbox: Rect, rotate: float = 0) -> None:
        LTLayoutContainer.__init__(self, bbox)
        self.pageid = pageid
        self.rotate = rotate

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}({self.pageid!r}) {bbox2str(self.bbox)} rotate={self.rotate!r}>"
|
pdf2zh/lzw.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from io import BytesIO
|
3 |
+
from typing import BinaryIO, Iterator, List, Optional, cast
|
4 |
+
|
5 |
+
from pdf2zh.pdfexceptions import PDFEOFError, PDFException
|
6 |
+
|
7 |
+
logger = logging.getLogger(__name__)
|
8 |
+
|
9 |
+
|
10 |
+
class CorruptDataError(PDFException):
    """Raised when an LZW code falls outside the current table."""
|
12 |
+
|
13 |
+
|
14 |
+
class LZWDecoder:
    """Stateful decoder for LZW-compressed PDF stream data."""

    def __init__(self, fp: BinaryIO) -> None:
        self.fp = fp
        self.buff = 0
        self.bpos = 8  # start "exhausted" so the first read fetches a byte
        self.nbits = 9
        # NB: self.table stores None only in indices 256 and 257
        self.table: List[Optional[bytes]] = []
        self.prevbuf: Optional[bytes] = None

    def readbits(self, bits: int) -> int:
        """Read *bits* bits from the stream, MSB first, as an integer."""
        value = 0
        while True:
            # Number of bits still available in the current buffer byte.
            avail = 8 - self.bpos
            if bits <= avail:
                # |-----8-bits-----|
                # |-bpos-|-bits-|  |
                # |      |----avail----|
                value = (value << bits) | (
                    (self.buff >> (avail - bits)) & ((1 << bits) - 1)
                )
                self.bpos += bits
                return value
            # |-----8-bits-----|
            # |-bpos-|---bits----...
            # |      |----avail----|
            value = (value << avail) | (self.buff & ((1 << avail) - 1))
            bits -= avail
            byte = self.fp.read(1)
            if not byte:
                raise PDFEOFError
            self.buff = ord(byte)
            self.bpos = 0

    def feed(self, code: int) -> bytes:
        """Process one LZW code and return the bytes it expands to."""
        output = b""
        if code == 256:
            # Clear-table marker: rebuild the initial 258-entry table.
            self.table = [bytes((i,)) for i in range(256)]  # 0-255
            self.table.append(None)  # 256
            self.table.append(None)  # 257
            self.prevbuf = b""
            self.nbits = 9
        elif code == 257:
            # End-of-data marker: nothing to emit.
            pass
        elif not self.prevbuf:
            output = self.prevbuf = cast(bytes, self.table[code])  # assume not None
        else:
            if code < len(self.table):
                output = cast(bytes, self.table[code])  # assume not None
                self.table.append(self.prevbuf + output[:1])
            elif code == len(self.table):
                self.table.append(self.prevbuf + self.prevbuf[:1])
                output = cast(bytes, self.table[code])
            else:
                raise CorruptDataError
            # Widen the code size as the table fills toward each power of 2.
            size = len(self.table)
            if size == 511:
                self.nbits = 10
            elif size == 1023:
                self.nbits = 11
            elif size == 2047:
                self.nbits = 12
            self.prevbuf = output
        return output

    def run(self) -> Iterator[bytes]:
        """Yield decoded chunks until the input is exhausted or corrupt."""
        while True:
            try:
                code = self.readbits(self.nbits)
            except EOFError:
                break
            try:
                chunk = self.feed(code)
            except CorruptDataError:
                # just ignore corrupt data and stop yielding there
                break
            yield chunk
|
100 |
+
|
101 |
+
|
def lzwdecode(data: bytes) -> bytes:
    """Decode *data* compressed with the PDF LZWDecode filter."""
    decoder = LZWDecoder(BytesIO(data))
    return b"".join(decoder.run())
pdf2zh/pdf2zh.py
ADDED
@@ -0,0 +1,310 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""A command line tool for extracting text and images from PDF and
|
3 |
+
output it to plain text, html, xml or tags.
|
4 |
+
"""
|
5 |
+
|
6 |
+
from __future__ import annotations
|
7 |
+
|
8 |
+
import argparse
|
9 |
+
import logging
|
10 |
+
import os
|
11 |
+
import sys
|
12 |
+
from pathlib import Path
|
13 |
+
from typing import TYPE_CHECKING, Any, Container, Iterable, List, Optional
|
14 |
+
|
15 |
+
import pymupdf
|
16 |
+
import requests
|
17 |
+
|
18 |
+
from pdf2zh import __version__
|
19 |
+
from pdf2zh.pdfexceptions import PDFValueError
|
20 |
+
|
21 |
+
if TYPE_CHECKING:
|
22 |
+
from pdf2zh.layout import LAParams
|
23 |
+
from pdf2zh.utils import AnyIO
|
24 |
+
|
# Maps output-file extensions to pdfminer output types; used by
# extract_text() to infer output_type from the outfile name.
OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag"))
26 |
+
|
27 |
+
|
def setup_log() -> None:
    """Initialise logging for the CLI.

    Installs the default root handler via ``logging.basicConfig`` and, when
    the optional ``doclayout_yolo`` package is importable, raises its
    internal logger to WARNING so it stays quiet during normal runs.
    """
    logging.basicConfig()

    try:
        import doclayout_yolo
    except ImportError:
        return
    doclayout_yolo.utils.LOGGER.setLevel(logging.WARNING)
37 |
+
|
38 |
+
|
def check_files(files: List[str]) -> List[str]:
    """Return the local paths in *files* that do not exist on disk.

    Remote URLs (``http://`` / ``https://``) are skipped: they are
    downloaded later instead of being read from the filesystem.
    """
    local_paths = [
        path
        for path in files
        if not path.startswith("http://") and not path.startswith("https://")
    ]
    return [path for path in local_paths if not os.path.exists(path)]
48 |
+
|
49 |
+
|
def float_or_disabled(x: str) -> Optional[float]:
    """argparse type: parse *x* as a float, or return None for "disabled".

    Raises ``argparse.ArgumentTypeError`` for anything else that is not a
    valid float literal.
    """
    if x.lower().strip() == "disabled":
        return None
    try:
        value = float(x)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid float value: {x}")
    return value
57 |
+
|
58 |
+
|
def extract_text(
    files: Iterable[str] = [],
    outfile: str = "-",
    laparams: Optional[LAParams] = None,
    output_type: str = "text",
    codec: str = "utf-8",
    strip_control: bool = False,
    maxpages: int = 0,
    pages: Optional[Container[int]] = None,
    password: str = "",
    scale: float = 1.0,
    rotation: int = 0,
    layoutmode: str = "normal",
    output_dir: Optional[str] = None,
    debug: bool = False,
    disable_caching: bool = False,
    vfont: str = "",
    vchar: str = "",
    thread: int = 0,
    lang_in: str = "",
    lang_out: str = "",
    service: str = "",
    callback: object = None,
    output: str = "",
    **kwargs: Any,
) -> AnyIO:
    """Translate every PDF in *files*, writing ``<name>-zh.pdf`` (translated)
    and ``<name>-dual.pdf`` (original/translated pages interleaved) into the
    *output* directory.

    Remote http(s) URLs are first downloaded into ``./pdf2zh_files``.

    NOTE: the translation step is invoked as
    ``extract_text_to_fp(fp, **locals())`` -- it receives ALL locals of this
    function as keyword arguments, so local variable names (``model``,
    ``pages``, ``vfont``, ``vchar``, ``thread``, ``lang_in``, ``lang_out``,
    ``service``, ``callback``, ...) must stay stable.

    Raises:
        PDFValueError: if *files* is empty or a download fails.
    """
    import pdf2zh.high_level
    from pdf2zh.doclayout import DocLayoutModel

    if not files:
        raise PDFValueError("Must provide files to work upon!")

    # Infer output_type from the outfile extension when left at the default.
    if output_type == "text" and outfile != "-":
        for override, alttype in OUTPUT_TYPES:
            if outfile.endswith(override):
                output_type = alttype

    outfp: AnyIO = sys.stdout
    # Layout model; not referenced below, but presumably consumed by
    # extract_text_to_fp via **locals() -- do not rename or remove.
    model = DocLayoutModel.load_available()

    for file in files:
        if file.startswith("http://") or file.startswith("https://"):
            print("Online files detected, downloading...")
            try:
                r = requests.get(file, allow_redirects=True)
                if r.status_code == 200:
                    if not os.path.exists("./pdf2zh_files"):
                        print("Making a temporary dir for downloading PDF files...")
                        # BUGFIX: was os.mkdir(os.path.dirname("./pdf2zh_files")),
                        # i.e. os.mkdir("."), which always raised FileExistsError.
                        os.mkdir("./pdf2zh_files")
                    with open("./pdf2zh_files/tmp_download.pdf", "wb") as f:
                        print(f"Writing the file: {file}...")
                        f.write(r.content)
                    file = "./pdf2zh_files/tmp_download.pdf"
                else:
                    r.raise_for_status()
            except Exception as e:
                raise PDFValueError(
                    f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}"
                )
        filename = os.path.splitext(os.path.basename(file))[0]

        # Register the fonts used for translated text on every page, then
        # reference them from every font resource dictionary in the file.
        doc_en = pymupdf.open(file)
        page_count = doc_en.page_count
        font_list = ["china-ss", "tiro"]
        font_id = {}
        for page in doc_en:
            for font in font_list:
                font_id[font] = page.insert_font(font)
        xreflen = doc_en.xref_length()
        for xref in range(1, xreflen):
            for label in ["Resources/", ""]:  # the resources may live on an XObject
                try:  # xref reads/writes may fail on odd objects
                    font_res = doc_en.xref_get_key(xref, f"{label}Font")
                    if font_res[0] == "dict":
                        for font in font_list:
                            font_exist = doc_en.xref_get_key(
                                xref, f"{label}Font/{font}"
                            )
                            if font_exist[0] == "null":
                                doc_en.xref_set_key(
                                    xref, f"{label}Font/{font}", f"{font_id[font]} 0 R"
                                )
                except Exception:
                    pass
        # BUGFIX: interpolate `filename` into the intermediate/output names;
        # the previous f-strings contained no placeholder and left the
        # `filename` variable computed above unused.
        doc_en.save(Path(output) / f"{filename}-en.pdf")

        with open(Path(output) / f"{filename}-en.pdf", "rb") as fp:
            obj_patch: dict = pdf2zh.high_level.extract_text_to_fp(fp, **locals())

        # Patch the translated content streams back into the document.
        for obj_id, ops_new in obj_patch.items():
            doc_en.update_stream(obj_id, ops_new.encode())

        doc_zh = doc_en
        # Dual-language copy: original pages interleaved with translated ones
        # (orig 0, zh 0, orig 1, zh 1, ...).
        doc_dual = pymupdf.open(Path(output) / f"{filename}-en.pdf")
        doc_dual.insert_file(doc_zh)
        for id in range(page_count):
            doc_dual.move_page(page_count + id, id * 2 + 1)
        doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
        doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
        doc_zh.close()
        doc_dual.close()
        os.remove(Path(output) / f"{filename}-en.pdf")

    return
167 |
+
|
168 |
+
|
def create_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser for the pdf2zh CLI.

    Exposes the positional input files plus parsing/translation options
    (languages, service, output directory, thread count) and the GUI flags.
    """
    parser = argparse.ArgumentParser(description=__doc__, add_help=True)
    parser.add_argument(
        "files",
        type=str,
        default=None,
        nargs="*",
        help="One or more paths to PDF files.",
    )
    parser.add_argument(
        "--version",
        "-v",
        action="version",
        version=f"pdf2zh v{__version__}",
    )
    parser.add_argument(
        "--debug",
        "-d",
        default=False,
        action="store_true",
        help="Use debug logging level.",
    )
    parse_params = parser.add_argument_group(
        "Parser",
        description="Used during PDF parsing",
    )
    parse_params.add_argument(
        "--pages",
        "-p",
        type=str,
        help="The list of page numbers to parse.",
    )
    parse_params.add_argument(
        "--password",
        "-P",
        type=str,
        default="",
        help="The password to use for decrypting PDF file.",
    )
    parse_params.add_argument(
        "--vfont",
        "-f",
        type=str,
        default="",
        # BUGFIX: help text said "math"; the option is a regex that *matches*
        # formula font names.
        help="The regex to match font name of formula.",
    )
    parse_params.add_argument(
        "--vchar",
        "-c",
        type=str,
        default="",
        help="The regex to match character of formula.",
    )
    parse_params.add_argument(
        "--lang-in",
        "-li",
        type=str,
        default="auto",
        help="The code of source language.",
    )
    parse_params.add_argument(
        "--lang-out",
        "-lo",
        type=str,
        default="auto",
        help="The code of target language.",
    )
    parse_params.add_argument(
        "--service",
        "-s",
        type=str,
        default="google",
        help="The service to use for translation.",
    )
    parse_params.add_argument(
        "--output",
        "-o",
        type=str,
        default="",
        help="Output directory for files.",
    )
    parse_params.add_argument(
        "--thread",
        "-t",
        type=int,
        default=4,
        help="The number of threads to execute translation.",
    )
    parse_params.add_argument(
        "--interactive",
        "-i",
        action="store_true",
        help="Interact with GUI.",
    )
    parse_params.add_argument(
        "--share",
        action="store_true",
        help="Enable Gradio Share",
    )

    return parser
270 |
+
|
271 |
+
|
def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
    """Parse CLI arguments and expand the ``--pages`` option.

    A page spec such as ``"1,3-5"`` becomes a zero-based list of page
    indices: ``[0, 2, 3, 4]``.
    """
    parsed_args = create_parser().parse_args(args=args)

    if parsed_args.pages:
        indices: List[int] = []
        for token in parsed_args.pages.split(","):
            if "-" in token:
                first, last = token.split("-")
                indices.extend(range(int(first) - 1, int(last)))
            else:
                indices.append(int(token) - 1)
        parsed_args.pages = indices

    return parsed_args
286 |
+
|
287 |
+
|
def main(args: Optional[List[str]] = None) -> int:
    """CLI entry point.

    Parses *args* (``sys.argv`` when None), verifies that all local input
    files exist, then either launches the Gradio GUI (``--interactive``) or
    runs the translation pipeline.  Returns a process exit status: 0 on
    success, -1 when an input file is missing.
    """
    parsed_args = parse_args(args)

    missing_files = check_files(parsed_args.files)
    if missing_files:
        print("The following files do not exist:", file=sys.stderr)
        for file in missing_files:
            print(f" {file}", file=sys.stderr)
        return -1
    if parsed_args.interactive:
        # GUI mode; imported lazily so CLI-only runs skip the GUI deps.
        from pdf2zh.gui import setup_gui

        setup_gui(parsed_args.share)
        return 0

    setup_log()
    extract_text(**vars(parsed_args))
    return 0
306 |
+
|
307 |
+
|
# Allow running this module directly.
# BUGFIX: a second, unreachable `sys.exit(main())` (dead code after
# SystemExit is raised) has been removed.
if __name__ == "__main__":
    sys.exit(main())
pdf2zh/pdfcolor.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import collections
|
2 |
+
from typing import Dict
|
3 |
+
|
4 |
+
from pdf2zh.psparser import LIT
|
5 |
+
|
# PS literals naming the device colour spaces.
LITERAL_DEVICE_GRAY = LIT("DeviceGray")
LITERAL_DEVICE_RGB = LIT("DeviceRGB")
LITERAL_DEVICE_CMYK = LIT("DeviceCMYK")
# Abbreviations for inline images
LITERAL_INLINE_DEVICE_GRAY = LIT("G")
LITERAL_INLINE_DEVICE_RGB = LIT("RGB")
LITERAL_INLINE_DEVICE_CMYK = LIT("CMYK")
13 |
+
|
14 |
+
|
class PDFColorSpace:
    """A named PDF colour space with a fixed number of colour components."""

    def __init__(self, name: str, ncomponents: int) -> None:
        # e.g. name="DeviceRGB" with ncomponents=3
        self.name = name
        self.ncomponents = ncomponents

    def __repr__(self) -> str:
        return f"<PDFColorSpace: {self.name}, ncomponents={self.ncomponents:d}>"
22 |
+
|
23 |
+
|
# Registry of the colour spaces predefined by the PDF spec, keyed by name.
# Insertion order is significant: DeviceGray comes first so it can serve as
# the default when callers take the first entry.
PREDEFINED_COLORSPACE: Dict[str, PDFColorSpace] = collections.OrderedDict()

for name, n in [
    ("DeviceGray", 1),  # default value first
    ("CalRGB", 3),
    ("CalGray", 1),
    ("Lab", 3),
    ("DeviceRGB", 3),
    ("DeviceCMYK", 4),
    ("Separation", 1),
    ("Indexed", 1),
    ("Pattern", 1),
]:
    PREDEFINED_COLORSPACE[name] = PDFColorSpace(name, n)
pdf2zh/pdfdevice.py
ADDED
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import (
|
2 |
+
TYPE_CHECKING,
|
3 |
+
BinaryIO,
|
4 |
+
Iterable,
|
5 |
+
List,
|
6 |
+
Optional,
|
7 |
+
Sequence,
|
8 |
+
Union,
|
9 |
+
cast,
|
10 |
+
)
|
11 |
+
|
12 |
+
from pdf2zh import utils
|
13 |
+
from pdf2zh.pdfcolor import PDFColorSpace
|
14 |
+
from pdf2zh.pdffont import PDFFont, PDFUnicodeNotDefined
|
15 |
+
from pdf2zh.pdfpage import PDFPage
|
16 |
+
from pdf2zh.pdftypes import PDFStream
|
17 |
+
from pdf2zh.psparser import PSLiteral
|
18 |
+
from pdf2zh.utils import Matrix, PathSegment, Point, Rect
|
19 |
+
|
20 |
+
if TYPE_CHECKING:
|
21 |
+
from pdf2zh.pdfinterp import (
|
22 |
+
PDFGraphicState,
|
23 |
+
PDFResourceManager,
|
24 |
+
PDFStackT,
|
25 |
+
PDFTextState,
|
26 |
+
)
|
27 |
+
|
28 |
+
|
# A text-showing operand sequence: byte strings interleaved with numeric
# position adjustments (as produced by the TJ operator).
PDFTextSeq = Iterable[Union[int, float, bytes]]
30 |
+
|
31 |
+
|
class PDFDevice:
    """Translate the output of PDFPageInterpreter to the output that is needed.

    Base class: every hook below is a no-op, so subclasses only override
    the events they care about.  Usable as a context manager; exiting the
    ``with`` block calls :meth:`close`.
    """

    def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
        self.rsrcmgr = rsrcmgr
        # Current transformation matrix; set per page via set_ctm().
        self.ctm: Optional[Matrix] = None

    def __repr__(self) -> str:
        return "<PDFDevice>"

    def __enter__(self) -> "PDFDevice":
        return self

    def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
        self.close()

    def close(self) -> None:
        """Release any resources held by the device (no-op here)."""
        pass

    def set_ctm(self, ctm: Matrix) -> None:
        """Record the current transformation matrix for the page."""
        self.ctm = ctm

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Hook: a tagged (marked-content) section begins."""
        pass

    def end_tag(self) -> None:
        """Hook: the most recent tagged section ends."""
        pass

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Hook: a self-contained tag occurs (begin and end in one step)."""
        pass

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Hook: interpretation of *page* starts with transform *ctm*."""
        pass

    def end_page(self, page: PDFPage) -> None:
        """Hook: interpretation of *page* finished."""
        pass

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Hook: a figure (form XObject) named *name* starts."""
        pass

    def end_figure(self, name: str) -> None:
        """Hook: the figure named *name* ends."""
        pass

    def paint_path(
        self,
        graphicstate: "PDFGraphicState",
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Hook: a path is painted (stroked and/or filled)."""
        pass

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Hook: an image XObject is drawn."""
        pass

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Hook: a text-showing operand *seq* is rendered."""
        pass
96 |
+
|
97 |
+
|
class PDFTextDevice(PDFDevice):
    """Device that walks text-showing operands and dispatches one call to
    :meth:`render_char` per glyph, advancing the line matrix as it goes.

    The base :meth:`render_char` draws nothing and reports zero advance;
    subclasses override it.
    """

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Render one text operand *seq* and update textstate.linematrix."""
        assert self.ctm is not None
        matrix = utils.mult_matrix(textstate.matrix, self.ctm)
        font = textstate.font
        fontsize = textstate.fontsize
        # Horizontal scaling is stored as a percentage, hence * 0.01.
        scaling = textstate.scaling * 0.01
        charspace = textstate.charspace * scaling
        wordspace = textstate.wordspace * scaling
        rise = textstate.rise
        assert font is not None
        if font.is_multibyte():
            # Word spacing is suppressed for multibyte fonts (cf. the
            # cid == 32 checks in the render loops below).
            wordspace = 0
        # TJ position adjustments are in thousandths of a text-space unit.
        dxscale = 0.001 * fontsize * scaling
        if font.is_vertical():
            textstate.linematrix = self.render_string_vertical(
                seq,
                matrix,
                textstate.linematrix,
                font,
                fontsize,
                scaling,
                charspace,
                wordspace,
                rise,
                dxscale,
                ncs,
                graphicstate,
            )
        else:
            textstate.linematrix = self.render_string_horizontal(
                seq,
                matrix,
                textstate.linematrix,
                font,
                fontsize,
                scaling,
                charspace,
                wordspace,
                rise,
                dxscale,
                ncs,
                graphicstate,
            )

    def render_string_horizontal(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Render *seq* left-to-right from *pos*; return the final position."""
        (x, y) = pos
        needcharspace = False  # no char spacing before the very first glyph
        for obj in seq:
            if isinstance(obj, (int, float)):
                # Numeric entry: shift the pen by the TJ adjustment.
                x -= obj * dxscale
                needcharspace = True
            else:
                for cid in font.decode(obj):
                    if needcharspace:
                        x += charspace
                    x += self.render_char(
                        utils.translate_matrix(matrix, (x, y)),
                        font,
                        fontsize,
                        scaling,
                        rise,
                        cid,
                        ncs,
                        graphicstate,
                    )
                    if cid == 32 and wordspace:
                        # Word spacing applies after a space glyph (cid 32).
                        x += wordspace
                    needcharspace = True
        return (x, y)

    def render_string_vertical(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Render *seq* top-to-bottom from *pos*; return the final position."""
        (x, y) = pos
        needcharspace = False  # no char spacing before the very first glyph
        for obj in seq:
            if isinstance(obj, (int, float)):
                # Numeric entry: shift the pen by the TJ adjustment.
                y -= obj * dxscale
                needcharspace = True
            else:
                for cid in font.decode(obj):
                    if needcharspace:
                        y += charspace
                    y += self.render_char(
                        utils.translate_matrix(matrix, (x, y)),
                        font,
                        fontsize,
                        scaling,
                        rise,
                        cid,
                        ncs,
                        graphicstate,
                    )
                    if cid == 32 and wordspace:
                        # Word spacing applies after a space glyph (cid 32).
                        y += wordspace
                    needcharspace = True
        return (x, y)

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> float:
        """Render a single glyph; return its advance (0 in this base class)."""
        return 0
241 |
+
|
242 |
+
|
class TagExtractor(PDFDevice):
    """Device that dumps the tag structure (and the text inside it) of a PDF
    as XML-like markup to a binary output stream."""

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        outfp: BinaryIO,
        codec: str = "utf-8",
    ) -> None:
        PDFDevice.__init__(self, rsrcmgr)
        self.outfp = outfp  # binary sink for the generated markup
        self.codec = codec  # encoding used by _write()
        self.pageno = 0
        self._stack: List[PSLiteral] = []  # currently-open tags

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Decode *seq* with the current font and write the recovered text."""
        font = textstate.font
        assert font is not None
        text = ""
        for obj in seq:
            if isinstance(obj, str):
                obj = utils.make_compat_bytes(obj)
            if not isinstance(obj, bytes):
                # Numeric position adjustments carry no text.
                continue
            chars = font.decode(obj)
            for cid in chars:
                try:
                    char = font.to_unichr(cid)
                    text += char
                except PDFUnicodeNotDefined:
                    # Skip glyphs without a Unicode mapping.
                    pass
        self._write(utils.enc(text))

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Open a <page> element carrying the page's bbox and rotation."""
        output = '<page id="%s" bbox="%s" rotate="%d">' % (
            self.pageno,
            utils.bbox2str(page.mediabox),
            page.rotate,
        )
        self._write(output)

    def end_page(self, page: PDFPage) -> None:
        """Close the current <page> element and advance the page counter."""
        self._write("</page>\n")
        self.pageno += 1

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Open a tag element, serialising a dict *props* as attributes."""
        s = ""
        if isinstance(props, dict):
            s = "".join(
                [
                    f' {utils.enc(k)}="{utils.make_compat_str(v)}"'
                    for (k, v) in sorted(props.items())
                ],
            )
        out_s = f"<{utils.enc(cast(str, tag.name))}{s}>"
        self._write(out_s)
        self._stack.append(tag)

    def end_tag(self) -> None:
        """Close the most recently opened tag."""
        assert self._stack, str(self.pageno)
        tag = self._stack.pop(-1)
        out_s = "</%s>" % utils.enc(cast(str, tag.name))
        self._write(out_s)

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Emit a tag and immediately discard it (a point, not a region)."""
        self.begin_tag(tag, props)
        self._stack.pop(-1)

    def _write(self, s: str) -> None:
        """Encode *s* with self.codec and write it to the output stream."""
        self.outfp.write(s.encode(self.codec))
pdf2zh/pdfdocument.py
ADDED
@@ -0,0 +1,1069 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import itertools
|
2 |
+
import logging
|
3 |
+
import re
|
4 |
+
import struct
|
5 |
+
from hashlib import md5, sha256, sha384, sha512
|
6 |
+
from typing import (
|
7 |
+
Any,
|
8 |
+
Callable,
|
9 |
+
Dict,
|
10 |
+
Iterable,
|
11 |
+
Iterator,
|
12 |
+
KeysView,
|
13 |
+
List,
|
14 |
+
Optional,
|
15 |
+
Sequence,
|
16 |
+
Tuple,
|
17 |
+
Type,
|
18 |
+
Union,
|
19 |
+
cast,
|
20 |
+
)
|
21 |
+
|
22 |
+
from cryptography.hazmat.backends import default_backend
|
23 |
+
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
|
24 |
+
|
25 |
+
from pdf2zh import settings
|
26 |
+
from pdf2zh.arcfour import Arcfour
|
27 |
+
from pdf2zh.data_structures import NumberTree
|
28 |
+
from pdf2zh.pdfexceptions import (
|
29 |
+
PDFException,
|
30 |
+
PDFKeyError,
|
31 |
+
PDFObjectNotFound,
|
32 |
+
PDFTypeError,
|
33 |
+
)
|
34 |
+
from pdf2zh.pdfparser import PDFParser, PDFStreamParser, PDFSyntaxError
|
35 |
+
from pdf2zh.pdftypes import (
|
36 |
+
DecipherCallable,
|
37 |
+
PDFStream,
|
38 |
+
decipher_all,
|
39 |
+
dict_value,
|
40 |
+
int_value,
|
41 |
+
list_value,
|
42 |
+
str_value,
|
43 |
+
stream_value,
|
44 |
+
uint_value,
|
45 |
+
)
|
46 |
+
from pdf2zh.psexceptions import PSEOF
|
47 |
+
from pdf2zh.psparser import KWD, LIT, literal_name
|
48 |
+
from pdf2zh.utils import (
|
49 |
+
choplist,
|
50 |
+
decode_text,
|
51 |
+
format_int_alpha,
|
52 |
+
format_int_roman,
|
53 |
+
nunpack,
|
54 |
+
)
|
55 |
+
|
56 |
+
log = logging.getLogger(__name__)
|
57 |
+
|
58 |
+
|
class PDFNoValidXRef(PDFSyntaxError):
    """Raised when no usable cross-reference table can be located/parsed."""

    pass
61 |
+
|
62 |
+
|
class PDFNoValidXRefWarning(SyntaxWarning):
    """Legacy warning for a missing xref.

    Not used anymore because warnings.warn is replaced by logger.Logger.warn.
    """
68 |
+
|
69 |
+
|
class PDFNoOutlines(PDFException):
    """Raised when outlines are requested but the document defines none."""

    pass
72 |
+
|
73 |
+
|
class PDFNoPageLabels(PDFException):
    """Raised when page labels are requested but the document defines none."""

    pass
76 |
+
|
77 |
+
|
class PDFDestinationNotFound(PDFException):
    """Raised when a named destination cannot be resolved in the document."""

    pass
80 |
+
|
81 |
+
|
class PDFEncryptionError(PDFException):
    """Base class for errors while handling an encrypted document."""

    pass
84 |
+
|
85 |
+
|
class PDFPasswordIncorrect(PDFEncryptionError):
    """Raised when the supplied password does not decrypt the document."""

    pass
88 |
+
|
89 |
+
|
class PDFEncryptionWarning(UserWarning):
    """Legacy warning for failed decryption.

    Not used anymore because warnings.warn is replaced by logger.Logger.warn.
    """
95 |
+
|
96 |
+
|
class PDFTextExtractionNotAllowedWarning(UserWarning):
    """Legacy warning for a PDF that does not allow extraction.

    Not used anymore because warnings.warn is replaced by logger.Logger.warn.
    """
102 |
+
|
103 |
+
|
class PDFTextExtractionNotAllowed(PDFEncryptionError):
    """Raised when the document's permissions forbid text extraction."""

    pass
106 |
+
|
107 |
+
|
# some predefined literals and keywords.
LITERAL_OBJSTM = LIT("ObjStm")  # object stream (compressed objects)
LITERAL_XREF = LIT("XRef")  # cross-reference stream
LITERAL_CATALOG = LIT("Catalog")  # document catalog type
112 |
+
|
113 |
+
|
class PDFBaseXRef:
    """Abstract interface of a PDF cross-reference (xref) section."""

    def get_trailer(self) -> Dict[str, Any]:
        """Return the trailer dictionary of this xref section."""
        raise NotImplementedError

    def get_objids(self) -> Iterable[int]:
        """Return the object ids this xref knows about (none by default)."""
        return []

    # Must return
    # (strmid, index, genno)
    # or (None, pos, genno)
    def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
        """Locate *objid*; raises PDFKeyError when unknown (base behaviour)."""
        raise PDFKeyError(objid)

    def load(self, parser: PDFParser) -> None:
        """Populate this xref from *parser* (implemented by subclasses)."""
        raise NotImplementedError
129 |
+
|
130 |
+
|
131 |
+
class PDFXRef(PDFBaseXRef):
    """Classic textual cross-reference table (pre PDF-1.5 style)."""

    def __init__(self) -> None:
        # objid -> (stream id or None, byte offset or in-stream index, genno)
        self.offsets: Dict[int, Tuple[Optional[int], int, int]] = {}
        self.trailer: Dict[str, Any] = {}

    def __repr__(self) -> str:
        return "<PDFXRef: offsets=%r>" % (self.offsets.keys())

    def load(self, parser: PDFParser) -> None:
        """Parse xref subsections up to the trailer, then the trailer itself.

        :raises PDFNoValidXRef: on EOF or malformed subsection/entry lines.
        """
        while True:
            try:
                (pos, line) = parser.nextline()
                line = line.strip()
                if not line:
                    continue
            except PSEOF:
                raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
            if line.startswith(b"trailer"):
                # Rewind so load_trailer() sees the "trailer" keyword.
                parser.seek(pos)
                break
            # A subsection header is "<start> <count>".
            f = line.split(b" ")
            if len(f) != 2:
                error_msg = f"Trailer not found: {parser!r}: line={line!r}"
                raise PDFNoValidXRef(error_msg)
            try:
                (start, nobjs) = map(int, f)
            except ValueError:
                error_msg = f"Invalid line: {parser!r}: line={line!r}"
                raise PDFNoValidXRef(error_msg)
            for objid in range(start, start + nobjs):
                try:
                    (_, line) = parser.nextline()
                    line = line.strip()
                except PSEOF:
                    raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
                # Each entry is "<pos> <genno> <n|f>".
                f = line.split(b" ")
                if len(f) != 3:
                    error_msg = f"Invalid XRef format: {parser!r}, line={line!r}"
                    raise PDFNoValidXRef(error_msg)
                (pos_b, genno_b, use_b) = f
                if use_b != b"n":
                    # Skip free ("f") entries; only in-use objects are recorded.
                    continue
                self.offsets[objid] = (None, int(pos_b), int(genno_b))
        # log.debug("xref objects: %r", self.offsets)
        self.load_trailer(parser)

    def load_trailer(self, parser: PDFParser) -> None:
        """Read the trailer dictionary that follows the xref table."""
        try:
            (_, kwd) = parser.nexttoken()
            assert kwd is KWD(b"trailer"), str(kwd)
            _, (_, dic) = parser.nextobject()
        except PSEOF:
            # Truncated file: salvage whatever object is left on the stack.
            x = parser.pop(1)
            if not x:
                raise PDFNoValidXRef("Unexpected EOF - file corrupted")
            (_, dic) = x[0]
        self.trailer.update(dict_value(dic))
        # log.debug("trailer=%r", self.trailer)

    def get_trailer(self) -> Dict[str, Any]:
        return self.trailer

    def get_objids(self) -> KeysView[int]:
        return self.offsets.keys()

    def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
        # Raises KeyError for unknown object ids; callers fall through to
        # the next xref in PDFDocument.getobj().
        return self.offsets[objid]
class PDFXRefFallback(PDFXRef):
    """Brute-force xref built by scanning the whole file for object headers.

    Used when no valid xref table/stream could be located.
    """

    def __repr__(self) -> str:
        return "<PDFXRefFallback: offsets=%r>" % (self.offsets.keys())

    # Matches an indirect-object header "<objid> <genno> obj" at line start.
    PDFOBJ_CUE = re.compile(r"^(\d+)\s+(\d+)\s+obj\b")

    def load(self, parser: PDFParser) -> None:
        """Scan the file from the top, indexing every object header found."""
        parser.seek(0)
        while 1:
            try:
                (pos, line_bytes) = parser.nextline()
            except PSEOF:
                break
            if line_bytes.startswith(b"trailer"):
                parser.seek(pos)
                self.load_trailer(parser)
                # log.debug("trailer: %r", self.trailer)
                break
            line = line_bytes.decode("latin-1")  # default pdf encoding
            m = self.PDFOBJ_CUE.match(line)
            if not m:
                continue
            (objid_s, genno_s) = m.groups()
            objid = int(objid_s)
            genno = int(genno_s)
            self.offsets[objid] = (None, pos, genno)
            # expand ObjStm: objects inside an object stream must also be
            # indexed, as (container objid, index within stream, gen 0).
            parser.seek(pos)
            _, (_, obj) = parser.nextobject()
            if isinstance(obj, PDFStream) and obj.get("Type") is LITERAL_OBJSTM:
                stream = stream_value(obj)
                try:
                    n = stream["N"]
                except KeyError:
                    if settings.STRICT:
                        raise PDFSyntaxError("N is not defined: %r" % stream)
                    n = 0
                parser1 = PDFStreamParser(stream.get_data())
                objs: List[int] = []
                try:
                    while 1:
                        _, (_, obj) = parser1.nextobject()
                        objs.append(cast(int, obj))
                except PSEOF:
                    pass
                # The stream header holds n (objid, offset) pairs.
                n = min(n, len(objs) // 2)
                for index in range(n):
                    objid1 = objs[index * 2]
                    self.offsets[objid1] = (objid, index, 0)
class PDFXRefStream(PDFBaseXRef):
    """Cross-reference stream (PDF-1.5+): xref entries packed in binary.

    Entry layout is controlled by the stream's /W array (three field widths)
    and /Index array ((start, count) subsection pairs). Entries for all
    subsections are stored consecutively in the decoded stream data.
    """

    def __init__(self) -> None:
        # Decoded stream payload holding the packed entries.
        self.data: Optional[bytes] = None
        # Total width of one entry in bytes (fl1 + fl2 + fl3).
        self.entlen: Optional[int] = None
        # Field widths from the /W array.
        self.fl1: Optional[int] = None
        self.fl2: Optional[int] = None
        self.fl3: Optional[int] = None
        # (start objid, object count) subsections from the /Index array.
        self.ranges: List[Tuple[int, int]] = []

    def __repr__(self) -> str:
        return "<PDFXRefStream: ranges=%r>" % (self.ranges)

    def load(self, parser: PDFParser) -> None:
        """Parse the "objid genno obj <stream>" wrapper and decode the stream.

        :raises PDFNoValidXRef: if the object is not an /XRef stream.
        """
        (_, objid) = parser.nexttoken()  # ignored
        (_, genno) = parser.nexttoken()  # ignored
        (_, kwd) = parser.nexttoken()
        _, (_, stream) = parser.nextobject()
        if not isinstance(stream, PDFStream) or stream.get("Type") is not LITERAL_XREF:
            raise PDFNoValidXRef("Invalid PDF stream spec.")
        size = stream["Size"]
        # /Index defaults to a single subsection covering objects 0..Size-1.
        index_array = stream.get("Index", (0, size))
        if len(index_array) % 2 != 0:
            raise PDFSyntaxError("Invalid index number")
        self.ranges.extend(cast(Iterator[Tuple[int, int]], choplist(2, index_array)))
        (self.fl1, self.fl2, self.fl3) = stream["W"]
        assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
        self.data = stream.get_data()
        self.entlen = self.fl1 + self.fl2 + self.fl3
        # The xref stream's own dictionary doubles as the trailer.
        self.trailer = stream.attrs
        # log.debug(
        #     "xref stream: objid=%s, fields=%d,%d,%d",
        #     ", ".join(map(repr, self.ranges)),
        #     self.fl1,
        #     self.fl2,
        #     self.fl3,
        # )

    def get_trailer(self) -> Dict[str, Any]:
        return self.trailer

    def get_objids(self) -> Iterator[int]:
        """Yield the ids of all used objects (entry type 1 or 2).

        Entries for successive /Index subsections are stored consecutively,
        so the byte offset is derived from a global entry counter rather
        than the per-subsection loop index (the latter would re-read the
        first subsection's bytes for every later subsection; get_pos()
        already accumulates the index the same way).
        """
        assert self.entlen is not None
        assert self.data is not None
        entry_index = 0  # position of the entry within the whole stream
        for start, nobjs in self.ranges:
            for i in range(nobjs):
                offset = self.entlen * entry_index
                entry_index += 1
                ent = self.data[offset : offset + self.entlen]
                f1 = nunpack(ent[: self.fl1], 1)
                # Type 1 = uncompressed object, type 2 = in an object stream;
                # type 0 (free) entries are skipped.
                if f1 == 1 or f1 == 2:
                    yield start + i

    def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
        """Locate *objid*'s entry and decode its three fields.

        :raises PDFKeyError: if the id is outside all subsections or free.
        """
        index = 0
        for start, nobjs in self.ranges:
            if start <= objid and objid < start + nobjs:
                index += objid - start
                break
            else:
                index += nobjs
        else:
            raise PDFKeyError(objid)
        assert self.entlen is not None
        assert self.data is not None
        assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
        offset = self.entlen * index
        ent = self.data[offset : offset + self.entlen]
        f1 = nunpack(ent[: self.fl1], 1)
        f2 = nunpack(ent[self.fl1 : self.fl1 + self.fl2])
        f3 = nunpack(ent[self.fl1 + self.fl2 :])
        if f1 == 1:
            # Uncompressed: f2 = byte offset, f3 = generation number.
            return (None, f2, f3)
        elif f1 == 2:
            # Compressed: f2 = container stream objid, f3 = index in stream.
            return (f2, f3, 0)
        else:
            # this is a free object
            raise PDFKeyError(objid)
class PDFStandardSecurityHandler:
    """Standard security handler for RC4-encrypted PDFs (revisions 2 and 3).

    Algorithm numbers in the comments refer to the PDF Reference,
    section 3.5 ("Encryption").
    """

    # 32-byte padding string used to pad/truncate passwords (Algorithm 3.2).
    PASSWORD_PADDING = (
        b"(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08"
        b"..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz"
    )
    supported_revisions: Tuple[int, ...] = (2, 3)

    def __init__(
        self,
        docid: Sequence[bytes],
        param: Dict[str, Any],
        password: str = "",
    ) -> None:
        # docid: the document /ID array; param: the /Encrypt dictionary.
        self.docid = docid
        self.param = param
        self.password = password
        self.init()

    def init(self) -> None:
        """Validate the revision and derive the file key (may raise)."""
        self.init_params()
        if self.r not in self.supported_revisions:
            error_msg = "Unsupported revision: param=%r" % self.param
            raise PDFEncryptionError(error_msg)
        self.init_key()

    def init_params(self) -> None:
        # Fields of the /Encrypt dictionary (PDF Reference, Table 3.19).
        self.v = int_value(self.param.get("V", 0))
        self.r = int_value(self.param["R"])
        self.p = uint_value(self.param["P"], 32)
        self.o = str_value(self.param["O"])
        self.u = str_value(self.param["U"])
        self.length = int_value(self.param.get("Length", 40))

    def init_key(self) -> None:
        """Authenticate the password and store the encryption key.

        :raises PDFPasswordIncorrect: if authentication fails.
        """
        self.key = self.authenticate(self.password)
        if self.key is None:
            raise PDFPasswordIncorrect

    # Permission bits from /P (PDF Reference, Table 3.20).
    def is_printable(self) -> bool:
        return bool(self.p & 4)

    def is_modifiable(self) -> bool:
        return bool(self.p & 8)

    def is_extractable(self) -> bool:
        return bool(self.p & 16)

    def compute_u(self, key: bytes) -> bytes:
        """Compute the /U (user password verification) value for *key*."""
        if self.r == 2:
            # Algorithm 3.4
            return Arcfour(key).encrypt(self.PASSWORD_PADDING)  # 2
        else:
            # Algorithm 3.5
            hash = md5(self.PASSWORD_PADDING)  # 2
            hash.update(self.docid[0])  # 3
            result = Arcfour(key).encrypt(hash.digest())  # 4
            for i in range(1, 20):  # 5
                k = b"".join(bytes((c ^ i,)) for c in iter(key))
                result = Arcfour(k).encrypt(result)
            result += result  # 6
            return result

    def compute_encryption_key(self, password: bytes) -> bytes:
        """Derive the file encryption key from a password (Algorithm 3.2)."""
        # Algorithm 3.2
        password = (password + self.PASSWORD_PADDING)[:32]  # 1
        hash = md5(password)  # 2
        hash.update(self.o)  # 3
        # See https://github.com/pdf2zh/pdf2zh.six/issues/186
        hash.update(struct.pack("<L", self.p))  # 4
        hash.update(self.docid[0])  # 5
        if self.r >= 4:
            if not cast(PDFStandardSecurityHandlerV4, self).encrypt_metadata:
                hash.update(b"\xff\xff\xff\xff")
        result = hash.digest()
        n = 5
        if self.r >= 3:
            n = self.length // 8
            for _ in range(50):
                result = md5(result[:n]).digest()
        return result[:n]

    def authenticate(self, password: str) -> Optional[bytes]:
        """Try the password as user password first, then as owner password.

        Returns the encryption key on success, None on failure.
        """
        password_bytes = password.encode("latin1")
        key = self.authenticate_user_password(password_bytes)
        if key is None:
            key = self.authenticate_owner_password(password_bytes)
        return key

    def authenticate_user_password(self, password: bytes) -> Optional[bytes]:
        key = self.compute_encryption_key(password)
        if self.verify_encryption_key(key):
            return key
        else:
            return None

    def verify_encryption_key(self, key: bytes) -> bool:
        # Algorithm 3.6: only the first 16 bytes of /U are significant for r>=3.
        u = self.compute_u(key)
        if self.r == 2:
            return u == self.u
        return u[:16] == self.u[:16]

    def authenticate_owner_password(self, password: bytes) -> Optional[bytes]:
        """Recover the user password from /O and re-authenticate with it."""
        # Algorithm 3.7
        password = (password + self.PASSWORD_PADDING)[:32]
        hash = md5(password)
        if self.r >= 3:
            for _ in range(50):
                hash = md5(hash.digest())
        n = 5
        if self.r >= 3:
            n = self.length // 8
        key = hash.digest()[:n]
        if self.r == 2:
            user_password = Arcfour(key).decrypt(self.o)
        else:
            user_password = self.o
            for i in range(19, -1, -1):
                k = b"".join(bytes((c ^ i,)) for c in iter(key))
                user_password = Arcfour(k).decrypt(user_password)
        return self.authenticate_user_password(user_password)

    def decrypt(
        self,
        objid: int,
        genno: int,
        data: bytes,
        attrs: Optional[Dict[str, Any]] = None,
    ) -> bytes:
        """Decrypt *data* belonging to object (objid, genno)."""
        return self.decrypt_rc4(objid, genno, data)

    def decrypt_rc4(self, objid: int, genno: int, data: bytes) -> bytes:
        # Per-object key: file key + low 3 bytes of objid + low 2 of genno.
        assert self.key is not None
        key = self.key + struct.pack("<L", objid)[:3] + struct.pack("<L", genno)[:2]
        hash = md5(key)
        key = hash.digest()[: min(len(key), 16)]
        return Arcfour(key).decrypt(data)
class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
    """Security handler for /V 4 documents (crypt filters: RC4 or AES-128)."""

    supported_revisions: Tuple[int, ...] = (4,)

    def init_params(self) -> None:
        """Read the crypt-filter parameters in addition to the base fields.

        :raises PDFEncryptionError: on unsupported/unknown/undefined filters.
        """
        super().init_params()
        self.length = 128
        self.cf = dict_value(self.param.get("CF"))
        self.stmf = literal_name(self.param["StmF"])
        self.strf = literal_name(self.param["StrF"])
        self.encrypt_metadata = bool(self.param.get("EncryptMetadata", True))
        # Only the common case of identical stream/string filters is handled.
        if self.stmf != self.strf:
            error_msg = "Unsupported crypt filter: param=%r" % self.param
            raise PDFEncryptionError(error_msg)
        # Map crypt-filter name -> decryption callable.
        self.cfm = {}
        for k, v in self.cf.items():
            f = self.get_cfm(literal_name(v["CFM"]))
            if f is None:
                error_msg = "Unknown crypt filter method: param=%r" % self.param
                raise PDFEncryptionError(error_msg)
            self.cfm[k] = f
        self.cfm["Identity"] = self.decrypt_identity
        if self.strf not in self.cfm:
            error_msg = "Undefined crypt filter: param=%r" % self.param
            raise PDFEncryptionError(error_msg)

    def get_cfm(self, name: str) -> Optional[Callable[[int, int, bytes], bytes]]:
        """Return the decrypt callable for a /CFM name, or None if unknown."""
        if name == "V2":
            return self.decrypt_rc4
        elif name == "AESV2":
            return self.decrypt_aes128
        else:
            return None

    def decrypt(
        self,
        objid: int,
        genno: int,
        data: bytes,
        attrs: Optional[Dict[str, Any]] = None,
        name: Optional[str] = None,
    ) -> bytes:
        """Decrypt *data* with the named crypt filter (default: /StrF)."""
        if not self.encrypt_metadata and attrs is not None:
            # Metadata streams stay in the clear when EncryptMetadata=false.
            t = attrs.get("Type")
            if t is not None and literal_name(t) == "Metadata":
                return data
        if name is None:
            name = self.strf
        return self.cfm[name](objid, genno, data)

    def decrypt_identity(self, objid: int, genno: int, data: bytes) -> bytes:
        # The /Identity filter passes data through unchanged.
        return data

    def decrypt_aes128(self, objid: int, genno: int, data: bytes) -> bytes:
        """AES-128-CBC decryption; the IV is the first 16 bytes of *data*."""
        assert self.key is not None
        key = (
            self.key
            + struct.pack("<L", objid)[:3]
            + struct.pack("<L", genno)[:2]
            + b"sAlT"
        )
        hash = md5(key)
        key = hash.digest()[: min(len(key), 16)]
        initialization_vector = data[:16]
        ciphertext = data[16:]
        cipher = Cipher(
            algorithms.AES(key),
            modes.CBC(initialization_vector),
            backend=default_backend(),
        )  # type: ignore
        return cipher.decryptor().update(ciphertext)  # type: ignore
class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
    """Security handler for /V 5 documents (AES-256, revisions 5 and 6)."""

    supported_revisions = (5, 6)

    def init_params(self) -> None:
        super().init_params()
        self.length = 256
        self.oe = str_value(self.param["OE"])
        self.ue = str_value(self.param["UE"])
        # /O and /U are 48 bytes: 32-byte hash + 8-byte validation salt
        # + 8-byte key salt.
        self.o_hash = self.o[:32]
        self.o_validation_salt = self.o[32:40]
        self.o_key_salt = self.o[40:]
        self.u_hash = self.u[:32]
        self.u_validation_salt = self.u[32:40]
        self.u_key_salt = self.u[40:]

    def get_cfm(self, name: str) -> Optional[Callable[[int, int, bytes], bytes]]:
        if name == "AESV3":
            return self.decrypt_aes256
        else:
            return None

    def authenticate(self, password: str) -> Optional[bytes]:
        """Check the owner password first, then the user password.

        On success, the file key is recovered by AES-CBC-decrypting
        /OE (owner) or /UE (user) with a zero IV.
        """
        password_b = self._normalize_password(password)
        hash = self._password_hash(password_b, self.o_validation_salt, self.u)
        if hash == self.o_hash:
            hash = self._password_hash(password_b, self.o_key_salt, self.u)
            cipher = Cipher(
                algorithms.AES(hash),
                modes.CBC(b"\0" * 16),
                backend=default_backend(),
            )  # type: ignore
            return cipher.decryptor().update(self.oe)  # type: ignore
        hash = self._password_hash(password_b, self.u_validation_salt)
        if hash == self.u_hash:
            hash = self._password_hash(password_b, self.u_key_salt)
            cipher = Cipher(
                algorithms.AES(hash),
                modes.CBC(b"\0" * 16),
                backend=default_backend(),
            )  # type: ignore
            return cipher.decryptor().update(self.ue)  # type: ignore
        return None

    def _normalize_password(self, password: str) -> bytes:
        """UTF-8 encode (revision 6: SASLprep first) and cap at 127 bytes."""
        if self.r == 6:
            # saslprep expects non-empty strings, apparently
            if not password:
                return b""
            from pdf2zh._saslprep import saslprep

            password = saslprep(password)
        return password.encode("utf-8")[:127]

    def _password_hash(
        self,
        password: bytes,
        salt: bytes,
        vector: Optional[bytes] = None,
    ) -> bytes:
        """Compute password hash depending on revision number"""
        if self.r == 5:
            return self._r5_password(password, salt, vector)
        return self._r6_password(password, salt[0:8], vector)

    def _r5_password(
        self,
        password: bytes,
        salt: bytes,
        vector: Optional[bytes] = None,
    ) -> bytes:
        """Compute the password for revision 5"""
        hash = sha256(password)
        hash.update(salt)
        if vector is not None:
            hash.update(vector)
        return hash.digest()

    def _r6_password(
        self,
        password: bytes,
        salt: bytes,
        vector: Optional[bytes] = None,
    ) -> bytes:
        """Compute the password for revision 6"""
        initial_hash = sha256(password)
        initial_hash.update(salt)
        if vector is not None:
            initial_hash.update(vector)
        k = initial_hash.digest()
        hashes = (sha256, sha384, sha512)
        round_no = last_byte_val = 0
        # At least 64 rounds; keep going until the last byte of e is small
        # enough (ISO 32000-2 hardened hashing loop).
        while round_no < 64 or last_byte_val > round_no - 32:
            k1 = (password + k + (vector or b"")) * 64
            e = self._aes_cbc_encrypt(key=k[:16], iv=k[16:32], data=k1)
            # compute the first 16 bytes of e,
            # interpreted as an unsigned integer mod 3
            next_hash = hashes[self._bytes_mod_3(e[:16])]
            k = next_hash(e).digest()
            last_byte_val = e[len(e) - 1]
            round_no += 1
        return k[:32]

    @staticmethod
    def _bytes_mod_3(input_bytes: bytes) -> int:
        # 256 is 1 mod 3, so we can just sum 'em
        return sum(b % 3 for b in input_bytes) % 3

    def _aes_cbc_encrypt(self, key: bytes, iv: bytes, data: bytes) -> bytes:
        cipher = Cipher(algorithms.AES(key), modes.CBC(iv))
        encryptor = cipher.encryptor()  # type: ignore
        return encryptor.update(data) + encryptor.finalize()  # type: ignore

    def decrypt_aes256(self, objid: int, genno: int, data: bytes) -> bytes:
        """AES-256-CBC decryption; the IV is the first 16 bytes of *data*."""
        initialization_vector = data[:16]
        ciphertext = data[16:]
        assert self.key is not None
        cipher = Cipher(
            algorithms.AES(self.key),
            modes.CBC(initialization_vector),
            backend=default_backend(),
        )  # type: ignore
        return cipher.decryptor().update(ciphertext)  # type: ignore
class PDFDocument:
|
665 |
+
"""PDFDocument object represents a PDF document.
|
666 |
+
|
667 |
+
Since a PDF file can be very big, normally it is not loaded at
|
668 |
+
once. So PDF document has to cooperate with a PDF parser in order to
|
669 |
+
dynamically import the data as processing goes.
|
670 |
+
|
671 |
+
Typical usage:
|
672 |
+
doc = PDFDocument(parser, password)
|
673 |
+
obj = doc.getobj(objid)
|
674 |
+
|
675 |
+
"""
|
676 |
+
|
677 |
+
security_handler_registry: Dict[int, Type[PDFStandardSecurityHandler]] = {
|
678 |
+
1: PDFStandardSecurityHandler,
|
679 |
+
2: PDFStandardSecurityHandler,
|
680 |
+
4: PDFStandardSecurityHandlerV4,
|
681 |
+
5: PDFStandardSecurityHandlerV5,
|
682 |
+
}
|
683 |
+
|
684 |
+
def __init__(
|
685 |
+
self,
|
686 |
+
parser: PDFParser,
|
687 |
+
password: str = "",
|
688 |
+
caching: bool = True,
|
689 |
+
fallback: bool = True,
|
690 |
+
) -> None:
|
691 |
+
"""Set the document to use a given PDFParser object."""
|
692 |
+
self.caching = caching
|
693 |
+
self.xrefs: List[PDFBaseXRef] = []
|
694 |
+
self.info = []
|
695 |
+
self.catalog: Dict[str, Any] = {}
|
696 |
+
self.encryption: Optional[Tuple[Any, Any]] = None
|
697 |
+
self.decipher: Optional[DecipherCallable] = None
|
698 |
+
self._parser = None
|
699 |
+
self._cached_objs: Dict[int, Tuple[object, int]] = {}
|
700 |
+
self._parsed_objs: Dict[int, Tuple[List[object], int]] = {}
|
701 |
+
self._parser = parser
|
702 |
+
self._parser.set_document(self)
|
703 |
+
self.is_printable = self.is_modifiable = self.is_extractable = True
|
704 |
+
# Retrieve the information of each header that was appended
|
705 |
+
# (maybe multiple times) at the end of the document.
|
706 |
+
try:
|
707 |
+
# print('FIND XREF')
|
708 |
+
pos = self.find_xref(parser)
|
709 |
+
self.pos = pos
|
710 |
+
self.read_xref_from(parser, pos, self.xrefs)
|
711 |
+
except PDFNoValidXRef:
|
712 |
+
if fallback:
|
713 |
+
parser.fallback = True
|
714 |
+
newxref = PDFXRefFallback()
|
715 |
+
newxref.load(parser)
|
716 |
+
self.xrefs.append(newxref)
|
717 |
+
# print(f'XREF {self.xrefs}')
|
718 |
+
for xref in self.xrefs:
|
719 |
+
trailer = xref.get_trailer()
|
720 |
+
if not trailer:
|
721 |
+
continue
|
722 |
+
# If there's an encryption info, remember it.
|
723 |
+
if "Encrypt" in trailer:
|
724 |
+
if "ID" in trailer:
|
725 |
+
id_value = list_value(trailer["ID"])
|
726 |
+
else:
|
727 |
+
# Some documents may not have a /ID, use two empty
|
728 |
+
# byte strings instead. Solves
|
729 |
+
# https://github.com/pdf2zh/pdf2zh.six/issues/594
|
730 |
+
id_value = (b"", b"")
|
731 |
+
self.encryption = (id_value, dict_value(trailer["Encrypt"]))
|
732 |
+
self._initialize_password(password)
|
733 |
+
if "Info" in trailer:
|
734 |
+
self.info.append(dict_value(trailer["Info"]))
|
735 |
+
if "Root" in trailer:
|
736 |
+
# Every PDF file must have exactly one /Root dictionary.
|
737 |
+
self.catalog = dict_value(trailer["Root"])
|
738 |
+
break
|
739 |
+
else:
|
740 |
+
raise PDFSyntaxError("No /Root object! - Is this really a PDF?")
|
741 |
+
if self.catalog.get("Type") is not LITERAL_CATALOG:
|
742 |
+
if settings.STRICT:
|
743 |
+
raise PDFSyntaxError("Catalog not found!")
|
744 |
+
|
745 |
+
KEYWORD_OBJ = KWD(b"obj")
|
746 |
+
|
747 |
+
# _initialize_password(password=b'')
|
748 |
+
# Perform the initialization with a given password.
|
749 |
+
def _initialize_password(self, password: str = "") -> None:
|
750 |
+
assert self.encryption is not None
|
751 |
+
(docid, param) = self.encryption
|
752 |
+
if literal_name(param.get("Filter")) != "Standard":
|
753 |
+
raise PDFEncryptionError("Unknown filter: param=%r" % param)
|
754 |
+
v = int_value(param.get("V", 0))
|
755 |
+
factory = self.security_handler_registry.get(v)
|
756 |
+
if factory is None:
|
757 |
+
raise PDFEncryptionError("Unknown algorithm: param=%r" % param)
|
758 |
+
handler = factory(docid, param, password)
|
759 |
+
self.decipher = handler.decrypt
|
760 |
+
self.is_printable = handler.is_printable()
|
761 |
+
self.is_modifiable = handler.is_modifiable()
|
762 |
+
self.is_extractable = handler.is_extractable()
|
763 |
+
assert self._parser is not None
|
764 |
+
self._parser.fallback = False # need to read streams with exact length
|
765 |
+
|
766 |
+
def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> object:
|
767 |
+
if stream.objid in self._parsed_objs:
|
768 |
+
(objs, n) = self._parsed_objs[stream.objid]
|
769 |
+
else:
|
770 |
+
(objs, n) = self._get_objects(stream)
|
771 |
+
if self.caching:
|
772 |
+
assert stream.objid is not None
|
773 |
+
self._parsed_objs[stream.objid] = (objs, n)
|
774 |
+
i = n * 2 + index
|
775 |
+
try:
|
776 |
+
obj = objs[i]
|
777 |
+
except IndexError:
|
778 |
+
raise PDFSyntaxError("index too big: %r" % index)
|
779 |
+
return obj
|
780 |
+
|
781 |
+
def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]:
|
782 |
+
if stream.get("Type") is not LITERAL_OBJSTM:
|
783 |
+
if settings.STRICT:
|
784 |
+
raise PDFSyntaxError("Not a stream object: %r" % stream)
|
785 |
+
try:
|
786 |
+
n = cast(int, stream["N"])
|
787 |
+
except KeyError:
|
788 |
+
if settings.STRICT:
|
789 |
+
raise PDFSyntaxError("N is not defined: %r" % stream)
|
790 |
+
n = 0
|
791 |
+
parser = PDFStreamParser(stream.get_data())
|
792 |
+
parser.set_document(self)
|
793 |
+
objs: List[object] = []
|
794 |
+
try:
|
795 |
+
while 1:
|
796 |
+
_, (_, obj) = parser.nextobject()
|
797 |
+
objs.append(obj)
|
798 |
+
except PSEOF:
|
799 |
+
pass
|
800 |
+
return (objs, n)
|
801 |
+
|
802 |
+
def _getobj_parse(self, pos: int, objid: int) -> object:
|
803 |
+
assert self._parser is not None
|
804 |
+
self._parser.seek(pos)
|
805 |
+
(_, objid1) = self._parser.nexttoken() # objid
|
806 |
+
(_, genno) = self._parser.nexttoken() # genno
|
807 |
+
(_, kwd) = self._parser.nexttoken()
|
808 |
+
# hack around malformed pdf files
|
809 |
+
# copied from https://github.com/jaepil/pdf2zh3k/blob/master/
|
810 |
+
# pdf2zh/pdfparser.py#L399
|
811 |
+
# to solve https://github.com/pdf2zh/pdf2zh.six/issues/56
|
812 |
+
# assert objid1 == objid, str((objid1, objid))
|
813 |
+
if objid1 != objid:
|
814 |
+
x = []
|
815 |
+
while kwd is not self.KEYWORD_OBJ:
|
816 |
+
(_, kwd) = self._parser.nexttoken()
|
817 |
+
x.append(kwd)
|
818 |
+
if len(x) >= 2:
|
819 |
+
objid1 = x[-2]
|
820 |
+
# #### end hack around malformed pdf files
|
821 |
+
if objid1 != objid:
|
822 |
+
raise PDFSyntaxError(f"objid mismatch: {objid1!r}={objid!r}")
|
823 |
+
|
824 |
+
if kwd != KWD(b"obj"):
|
825 |
+
raise PDFSyntaxError("Invalid object spec: offset=%r" % pos)
|
826 |
+
end, (_, obj) = self._parser.nextobject()
|
827 |
+
return end, obj
|
828 |
+
|
829 |
+
# can raise PDFObjectNotFound
|
830 |
+
def getobj(self, objid: int) -> object:
|
831 |
+
"""Get object from PDF
|
832 |
+
|
833 |
+
:raises PDFException if PDFDocument is not initialized
|
834 |
+
:raises PDFObjectNotFound if objid does not exist in PDF
|
835 |
+
"""
|
836 |
+
if not self.xrefs:
|
837 |
+
raise PDFException("PDFDocument is not initialized")
|
838 |
+
# log.debug("getobj: objid=%r", objid)
|
839 |
+
if objid in self._cached_objs:
|
840 |
+
(obj, genno) = self._cached_objs[objid]
|
841 |
+
else:
|
842 |
+
for xref in self.xrefs:
|
843 |
+
try:
|
844 |
+
(strmid, index, genno) = xref.get_pos(objid)
|
845 |
+
except KeyError:
|
846 |
+
continue
|
847 |
+
try:
|
848 |
+
if strmid is not None:
|
849 |
+
stream = stream_value(self.getobj(strmid))
|
850 |
+
obj = self._getobj_objstm(stream, index, objid)
|
851 |
+
else:
|
852 |
+
end, obj = self._getobj_parse(index, objid)
|
853 |
+
if self.decipher:
|
854 |
+
obj = decipher_all(self.decipher, objid, genno, obj)
|
855 |
+
|
856 |
+
if isinstance(obj, PDFStream):
|
857 |
+
obj.set_objid(objid, genno)
|
858 |
+
break
|
859 |
+
except (PSEOF, PDFSyntaxError):
|
860 |
+
continue
|
861 |
+
else:
|
862 |
+
raise PDFObjectNotFound(objid)
|
863 |
+
# log.debug("register: objid=%r: %r", objid, obj)
|
864 |
+
if self.caching:
|
865 |
+
self._cached_objs[objid] = (obj, genno)
|
866 |
+
return obj
|
867 |
+
|
868 |
+
OutlineType = Tuple[Any, Any, Any, Any, Any]
|
869 |
+
|
870 |
+
def get_outlines(self) -> Iterator[OutlineType]:
|
871 |
+
if "Outlines" not in self.catalog:
|
872 |
+
raise PDFNoOutlines
|
873 |
+
|
874 |
+
def search(entry: object, level: int) -> Iterator[PDFDocument.OutlineType]:
|
875 |
+
entry = dict_value(entry)
|
876 |
+
if "Title" in entry:
|
877 |
+
if "A" in entry or "Dest" in entry:
|
878 |
+
title = decode_text(str_value(entry["Title"]))
|
879 |
+
dest = entry.get("Dest")
|
880 |
+
action = entry.get("A")
|
881 |
+
se = entry.get("SE")
|
882 |
+
yield (level, title, dest, action, se)
|
883 |
+
if "First" in entry and "Last" in entry:
|
884 |
+
yield from search(entry["First"], level + 1)
|
885 |
+
if "Next" in entry:
|
886 |
+
yield from search(entry["Next"], level)
|
887 |
+
|
888 |
+
return search(self.catalog["Outlines"], 0)
|
889 |
+
|
890 |
+
def get_page_labels(self) -> Iterator[str]:
|
891 |
+
"""Generate page label strings for the PDF document.
|
892 |
+
|
893 |
+
If the document includes page labels, generates strings, one per page.
|
894 |
+
If not, raises PDFNoPageLabels.
|
895 |
+
|
896 |
+
The resulting iteration is unbounded.
|
897 |
+
"""
|
898 |
+
assert self.catalog is not None
|
899 |
+
|
900 |
+
try:
|
901 |
+
page_labels = PageLabels(self.catalog["PageLabels"])
|
902 |
+
except (PDFTypeError, KeyError):
|
903 |
+
raise PDFNoPageLabels
|
904 |
+
|
905 |
+
return page_labels.labels
|
906 |
+
|
907 |
+
def lookup_name(self, cat: str, key: Union[str, bytes]) -> Any:
|
908 |
+
try:
|
909 |
+
names = dict_value(self.catalog["Names"])
|
910 |
+
except (PDFTypeError, KeyError):
|
911 |
+
raise PDFKeyError((cat, key))
|
912 |
+
# may raise KeyError
|
913 |
+
d0 = dict_value(names[cat])
|
914 |
+
|
915 |
+
def lookup(d: Dict[str, Any]) -> Any:
|
916 |
+
if "Limits" in d:
|
917 |
+
(k1, k2) = list_value(d["Limits"])
|
918 |
+
if key < k1 or k2 < key:
|
919 |
+
return None
|
920 |
+
if "Names" in d:
|
921 |
+
objs = list_value(d["Names"])
|
922 |
+
names = dict(
|
923 |
+
cast(Iterator[Tuple[Union[str, bytes], Any]], choplist(2, objs)),
|
924 |
+
)
|
925 |
+
return names[key]
|
926 |
+
if "Kids" in d:
|
927 |
+
for c in list_value(d["Kids"]):
|
928 |
+
v = lookup(dict_value(c))
|
929 |
+
if v:
|
930 |
+
return v
|
931 |
+
raise PDFKeyError((cat, key))
|
932 |
+
|
933 |
+
return lookup(d0)
|
934 |
+
|
935 |
+
def get_dest(self, name: Union[str, bytes]) -> Any:
|
936 |
+
try:
|
937 |
+
# PDF-1.2 or later
|
938 |
+
obj = self.lookup_name("Dests", name)
|
939 |
+
except KeyError:
|
940 |
+
# PDF-1.1 or prior
|
941 |
+
if "Dests" not in self.catalog:
|
942 |
+
raise PDFDestinationNotFound(name)
|
943 |
+
d0 = dict_value(self.catalog["Dests"])
|
944 |
+
if name not in d0:
|
945 |
+
raise PDFDestinationNotFound(name)
|
946 |
+
obj = d0[name]
|
947 |
+
return obj
|
948 |
+
|
949 |
+
# find_xref
|
950 |
+
def find_xref(self, parser: PDFParser) -> int:
|
951 |
+
"""Internal function used to locate the first XRef."""
|
952 |
+
# search the last xref table by scanning the file backwards.
|
953 |
+
prev = b""
|
954 |
+
for line in parser.revreadlines():
|
955 |
+
line = line.strip()
|
956 |
+
# log.debug("find_xref: %r", line)
|
957 |
+
|
958 |
+
if line == b"startxref":
|
959 |
+
# log.debug("xref found: pos=%r", prev)
|
960 |
+
|
961 |
+
if not prev.isdigit():
|
962 |
+
raise PDFNoValidXRef(f"Invalid xref position: {prev!r}")
|
963 |
+
|
964 |
+
start = int(prev)
|
965 |
+
|
966 |
+
if not start >= 0:
|
967 |
+
raise PDFNoValidXRef(f"Invalid negative xref position: {start}")
|
968 |
+
|
969 |
+
return start
|
970 |
+
|
971 |
+
if line:
|
972 |
+
prev = line
|
973 |
+
|
974 |
+
raise PDFNoValidXRef("Unexpected EOF")
|
975 |
+
|
976 |
+
# read xref table
|
977 |
+
def read_xref_from(
    self,
    parser: PDFParser,
    start: int,
    xrefs: List[PDFBaseXRef],
) -> None:
    """Reads XRefs from the given location.

    Appends the table found at byte offset *start* to *xrefs*, then
    recursively follows ``/XRefStm`` (hybrid-reference files) and
    ``/Prev`` (older tables) entries from its trailer.

    Raises PDFNoValidXRef if the file ends before a token is read.
    """
    parser.seek(start)
    parser.reset()
    try:
        (pos, token) = parser.nexttoken()
    except PSEOF:
        raise PDFNoValidXRef("Unexpected EOF")
    # log.debug("read_xref_from: start=%d, token=%r", start, token)
    if isinstance(token, int):
        # An integer token means the offset points at an indirect
        # object, i.e. a cross-reference stream (XRefStream: PDF-1.5).
        # Rewind so the stream object is parsed from its beginning.
        parser.seek(pos)
        parser.reset()
        xref: PDFBaseXRef = PDFXRefStream()
        xref.load(parser)
    else:
        if token is parser.KEYWORD_XREF:
            # Classic "xref" table: the entries start on the next line.
            parser.nextline()
        xref = PDFXRef()
        xref.load(parser)
    xrefs.append(xref)
    trailer = xref.get_trailer()
    # log.debug("trailer: %r", trailer)
    if "XRefStm" in trailer:
        # Hybrid-reference file: also read the companion xref stream.
        pos = int_value(trailer["XRefStm"])
        self.read_xref_from(parser, pos, xrefs)
    if "Prev" in trailer:
        # find previous xref
        pos = int_value(trailer["Prev"])
        self.read_xref_from(parser, pos, xrefs)
|
1012 |
+
|
1013 |
+
|
1014 |
+
class PageLabels(NumberTree):
    """PageLabels from the document catalog.

    See Section 8.3.1 in the PDF Reference.
    """

    @property
    def labels(self) -> Iterator[str]:
        """Yield the formatted page label for every page, in page order."""
        ranges = self.values

        # The tree must begin with page index 0
        if len(ranges) == 0 or ranges[0][0] != 0:
            if settings.STRICT:
                raise PDFSyntaxError("PageLabels is missing page index 0")
            else:
                # Try to cope, by assuming empty labels for the initial pages
                ranges.insert(0, (0, {}))

        # `next_index` points at the range *after* the current one
        # (renamed from `next`, which shadowed the builtin).
        for next_index, (start, label_dict_unchecked) in enumerate(ranges, 1):
            label_dict = dict_value(label_dict_unchecked)
            style = label_dict.get("S")  # numbering style (/S, a PDF name)
            prefix = decode_text(str_value(label_dict.get("P", b"")))  # /P prefix
            first_value = int_value(label_dict.get("St", 1))  # /St start number

            if next_index == len(ranges):
                # This is the last specified range. It continues until the end
                # of the document.
                values: Iterable[int] = itertools.count(first_value)
            else:
                end, _ = ranges[next_index]
                range_length = end - start
                values = range(first_value, first_value + range_length)

            for value in values:
                label = self._format_page_label(value, style)
                yield prefix + label

    @staticmethod
    def _format_page_label(value: int, style: Any) -> str:
        """Format page label value in a specific style"""
        if style is None:
            label = ""
        elif style is LIT("D"):  # Decimal arabic numerals
            label = str(value)
        elif style is LIT("R"):  # Uppercase roman numerals
            label = format_int_roman(value).upper()
        elif style is LIT("r"):  # Lowercase roman numerals
            label = format_int_roman(value)
        elif style is LIT("A"):  # Uppercase letters A-Z, AA-ZZ...
            label = format_int_alpha(value).upper()
        elif style is LIT("a"):  # Lowercase letters a-z, aa-zz...
            label = format_int_alpha(value)
        else:
            log.warning("Unknown page label style: %r", style)
            label = ""
        return label
|
pdf2zh/pdfexceptions.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pdf2zh.psexceptions import PSException
|
2 |
+
|
3 |
+
|
4 |
+
class PDFException(PSException):
    """Base class for all PDF-level errors raised by this package."""

    pass
|
6 |
+
|
7 |
+
|
8 |
+
class PDFTypeError(PDFException, TypeError):
    """A PDF object had an unexpected type (also catchable as TypeError)."""

    pass
|
10 |
+
|
11 |
+
|
12 |
+
class PDFValueError(PDFException, ValueError):
    """A PDF object had an invalid value (also catchable as ValueError)."""

    pass
|
14 |
+
|
15 |
+
|
16 |
+
class PDFObjectNotFound(PDFException):
    """A referenced indirect object could not be located in the document."""

    pass
|
18 |
+
|
19 |
+
|
20 |
+
class PDFNotImplementedError(PDFException, NotImplementedError):
    """A PDF feature is recognized but not supported by this library."""

    pass
|
22 |
+
|
23 |
+
|
24 |
+
class PDFKeyError(PDFException, KeyError):
    """A required key was missing from a PDF dictionary or name tree."""

    pass
|
26 |
+
|
27 |
+
|
28 |
+
class PDFEOFError(PDFException, EOFError):
    """The PDF data ended unexpectedly (also catchable as EOFError)."""

    pass
|
30 |
+
|
31 |
+
|
32 |
+
class PDFIOError(PDFException, IOError):
    """An I/O failure occurred while reading PDF data (also an IOError)."""

    pass
|
pdf2zh/pdffont.py
ADDED
@@ -0,0 +1,1190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import struct
|
3 |
+
from io import BytesIO
|
4 |
+
from typing import (
|
5 |
+
TYPE_CHECKING,
|
6 |
+
Any,
|
7 |
+
BinaryIO,
|
8 |
+
Dict,
|
9 |
+
Iterable,
|
10 |
+
Iterator,
|
11 |
+
List,
|
12 |
+
Mapping,
|
13 |
+
Optional,
|
14 |
+
Tuple,
|
15 |
+
Union,
|
16 |
+
cast,
|
17 |
+
)
|
18 |
+
|
19 |
+
from pdf2zh import settings
|
20 |
+
from pdf2zh.cmapdb import (
|
21 |
+
CMap,
|
22 |
+
CMapBase,
|
23 |
+
CMapDB,
|
24 |
+
CMapParser,
|
25 |
+
FileUnicodeMap,
|
26 |
+
IdentityUnicodeMap,
|
27 |
+
UnicodeMap,
|
28 |
+
)
|
29 |
+
from pdf2zh.encodingdb import EncodingDB, name2unicode
|
30 |
+
from pdf2zh.fontmetrics import FONT_METRICS
|
31 |
+
from pdf2zh.pdfexceptions import PDFException, PDFKeyError, PDFValueError
|
32 |
+
from pdf2zh.pdftypes import (
|
33 |
+
PDFStream,
|
34 |
+
dict_value,
|
35 |
+
int_value,
|
36 |
+
list_value,
|
37 |
+
num_value,
|
38 |
+
resolve1,
|
39 |
+
resolve_all,
|
40 |
+
stream_value,
|
41 |
+
)
|
42 |
+
from pdf2zh.psexceptions import PSEOF
|
43 |
+
from pdf2zh.psparser import (
|
44 |
+
KWD,
|
45 |
+
LIT,
|
46 |
+
PSKeyword,
|
47 |
+
PSLiteral,
|
48 |
+
PSStackParser,
|
49 |
+
literal_name,
|
50 |
+
)
|
51 |
+
from pdf2zh.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack
|
52 |
+
|
53 |
+
if TYPE_CHECKING:
|
54 |
+
from pdf2zh.pdfinterp import PDFResourceManager
|
55 |
+
|
56 |
+
log = logging.getLogger(__name__)
|
57 |
+
|
58 |
+
|
59 |
+
def get_widths(seq: Iterable[object]) -> Dict[int, float]:
    """Build a mapping of character widths for horizontal writing.

    *seq* mixes numbers and lists, as in a PDF /W array:
    a list supplies explicit widths for consecutive codes starting at
    the last number seen; a run of three numbers (first, last, width)
    assigns one width to a whole code range.
    """
    widths: Dict[int, float] = {}
    pending: List[float] = []
    for item in seq:
        if isinstance(item, list):
            if pending:
                first_code = pending[-1]
                for offset, width in enumerate(item):
                    widths[cast(int, first_code) + offset] = width
            pending = []
        elif isinstance(item, (int, float)):  # == utils.isnumber(item)
            pending.append(item)
            if len(pending) == 3:
                first_code, last_code, width = pending
                for code in range(cast(int, first_code), cast(int, last_code) + 1):
                    widths[code] = width
                pending = []
    return widths
|
78 |
+
|
79 |
+
|
80 |
+
def get_widths2(seq: Iterable[object]) -> Dict[int, Tuple[float, Point]]:
    """Build a mapping of character widths for vertical writing.

    Like :func:`get_widths`, but each entry carries a width plus a
    displacement vector: a list supplies (w, vx, vy) triplets for
    consecutive codes, and a run of five numbers
    (first, last, w, vx, vy) covers a whole code range.
    """
    widths: Dict[int, Tuple[float, Point]] = {}
    pending: List[float] = []
    for item in seq:
        if isinstance(item, list):
            if pending:
                first_code = pending[-1]
                # Chop the list into complete (w, vx, vy) triplets;
                # any trailing partial group is discarded.
                triplets = zip(item[0::3], item[1::3], item[2::3])
                for offset, (w, vx, vy) in enumerate(triplets):
                    widths[cast(int, first_code) + offset] = (w, (vx, vy))
            pending = []
        elif isinstance(item, (int, float)):  # == utils.isnumber(item)
            pending.append(item)
            if len(pending) == 5:
                first_code, last_code, w, vx, vy = pending
                for code in range(cast(int, first_code), cast(int, last_code) + 1):
                    widths[code] = (w, (vx, vy))
                pending = []
    return widths
|
99 |
+
|
100 |
+
|
101 |
+
class FontMetricsDB:
    """Lookup helper over the bundled core-font metrics table."""

    @classmethod
    def get_metrics(cls, fontname: str) -> Tuple[Dict[str, object], Dict[str, int]]:
        # Returns the (descriptor, widths) entry for *fontname*;
        # raises KeyError if the font is not in FONT_METRICS.
        return FONT_METRICS[fontname]
|
105 |
+
|
106 |
+
|
107 |
+
# int here means that we're not extending PSStackParser with additional types.
|
108 |
+
class Type1FontHeaderParser(PSStackParser[int]):
    """PostScript parser for the cleartext header of a Type1 font.

    Only the /Encoding structure is extracted; all other header
    content is ignored.
    """

    # PostScript keywords that occur in a Type1 header; only ``put``
    # is acted upon in do_keyword below.
    KEYWORD_BEGIN = KWD(b"begin")
    KEYWORD_END = KWD(b"end")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_PUT = KWD(b"put")
    KEYWORD_DICT = KWD(b"dict")
    KEYWORD_ARRAY = KWD(b"array")
    KEYWORD_READONLY = KWD(b"readonly")
    KEYWORD_FOR = KWD(b"for")

    def __init__(self, data: BinaryIO) -> None:
        PSStackParser.__init__(self, data)
        # Accumulates character-code -> unicode mappings found so far.
        self._cid2unicode: Dict[int, str] = {}

    def get_encoding(self) -> Dict[int, str]:
        """Parse the font encoding.

        The Type1 font encoding maps character codes to character names. These
        character names could either be standard Adobe glyph names, or
        character names associated with custom CharStrings for this font. A
        CharString is a sequence of operations that describe how the character
        should be drawn. Currently, this function returns '' (empty string)
        for character names that are associated with a CharStrings.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns mapping of character identifiers (cid's) to unicode characters
        """
        while 1:
            try:
                _, (cid, name) = self.nextobject()
            except PSEOF:
                break
            try:
                self._cid2unicode[cid] = name2unicode(cast(str, name))
            except KeyError:
                # Glyph names tied to custom CharStrings have no standard
                # unicode equivalent; skip them.
                pass
        return self._cid2unicode

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Record (code, glyph-name) pairs pushed by Encoding ``put`` ops."""
        if token is self.KEYWORD_PUT:
            ((_, key), (_, value)) = self.pop(2)
            if isinstance(key, int) and isinstance(value, PSLiteral):
                self.add_results((key, literal_name(value)))
|
153 |
+
|
154 |
+
|
155 |
+
# Nibble-to-character table for CFF real-number operands (operand byte 30
# in getdict below). Index 0xC ("e-") starts a negative exponent; index
# 0xD has no assigned meaning here (None); nibble 0xF (not listed)
# terminates the number.
NIBBLES = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "e-", None, "-")

# Mapping of cmap names. Original cmap name is kept if not in the mapping.
# (missing reference for why DLIdent is mapped to Identity)
IDENTITY_ENCODER = {
    "DLIdent-H": "Identity-H",
    "DLIdent-V": "Identity-V",
}
|
163 |
+
|
164 |
+
|
165 |
+
def getdict(data: bytes) -> Dict[int, List[Union[float, int]]]:
    """Decode a CFF DICT blob into a {operator: operand-list} mapping.

    Operand bytes are accumulated until an operator byte (0-21) is
    seen, which claims the operands collected so far.
    """
    result: Dict[int, List[Union[float, int]]] = {}
    fp = BytesIO(data)
    operands: List[Union[float, int]] = []
    while True:
        first = fp.read(1)
        if not first:
            break
        b0 = ord(first)
        if b0 <= 21:
            # Operator byte: bind the pending operands to it.
            result[b0] = operands
            operands = []
            continue
        value: Union[float, int]
        if b0 == 30:
            # Real number packed as 4-bit nibbles, terminated by 0xF.
            digits = ""
            scanning = True
            while scanning:
                packed = ord(fp.read(1))
                for nib in (packed >> 4, packed & 15):
                    if nib == 15:
                        scanning = False
                    else:
                        nibble = NIBBLES[nib]
                        assert nibble is not None
                        digits += nibble
            value = float(digits)
        elif 32 <= b0 <= 246:
            # Single-byte integer.
            value = b0 - 139
        else:
            b1 = ord(fp.read(1))
            if 247 <= b0 <= 250:
                # Two-byte positive integer.
                value = ((b0 - 247) << 8) + b1 + 108
            elif 251 <= b0 <= 254:
                # Two-byte negative integer.
                value = -((b0 - 251) << 8) - b1 - 108
            else:
                b2 = ord(fp.read(1))
                if b1 >= 128:
                    b1 -= 256
                if b0 == 28:
                    # Three-byte signed integer.
                    value = b1 << 8 | b2
                else:
                    # Five-byte integer.
                    value = b1 << 24 | b2 << 16 | struct.unpack(">H", fp.read(2))[0]
        operands.append(value)
    return result
|
209 |
+
|
210 |
+
|
211 |
+
class CFFFont:
|
212 |
+
STANDARD_STRINGS = (
|
213 |
+
".notdef",
|
214 |
+
"space",
|
215 |
+
"exclam",
|
216 |
+
"quotedbl",
|
217 |
+
"numbersign",
|
218 |
+
"dollar",
|
219 |
+
"percent",
|
220 |
+
"ampersand",
|
221 |
+
"quoteright",
|
222 |
+
"parenleft",
|
223 |
+
"parenright",
|
224 |
+
"asterisk",
|
225 |
+
"plus",
|
226 |
+
"comma",
|
227 |
+
"hyphen",
|
228 |
+
"period",
|
229 |
+
"slash",
|
230 |
+
"zero",
|
231 |
+
"one",
|
232 |
+
"two",
|
233 |
+
"three",
|
234 |
+
"four",
|
235 |
+
"five",
|
236 |
+
"six",
|
237 |
+
"seven",
|
238 |
+
"eight",
|
239 |
+
"nine",
|
240 |
+
"colon",
|
241 |
+
"semicolon",
|
242 |
+
"less",
|
243 |
+
"equal",
|
244 |
+
"greater",
|
245 |
+
"question",
|
246 |
+
"at",
|
247 |
+
"A",
|
248 |
+
"B",
|
249 |
+
"C",
|
250 |
+
"D",
|
251 |
+
"E",
|
252 |
+
"F",
|
253 |
+
"G",
|
254 |
+
"H",
|
255 |
+
"I",
|
256 |
+
"J",
|
257 |
+
"K",
|
258 |
+
"L",
|
259 |
+
"M",
|
260 |
+
"N",
|
261 |
+
"O",
|
262 |
+
"P",
|
263 |
+
"Q",
|
264 |
+
"R",
|
265 |
+
"S",
|
266 |
+
"T",
|
267 |
+
"U",
|
268 |
+
"V",
|
269 |
+
"W",
|
270 |
+
"X",
|
271 |
+
"Y",
|
272 |
+
"Z",
|
273 |
+
"bracketleft",
|
274 |
+
"backslash",
|
275 |
+
"bracketright",
|
276 |
+
"asciicircum",
|
277 |
+
"underscore",
|
278 |
+
"quoteleft",
|
279 |
+
"a",
|
280 |
+
"b",
|
281 |
+
"c",
|
282 |
+
"d",
|
283 |
+
"e",
|
284 |
+
"f",
|
285 |
+
"g",
|
286 |
+
"h",
|
287 |
+
"i",
|
288 |
+
"j",
|
289 |
+
"k",
|
290 |
+
"l",
|
291 |
+
"m",
|
292 |
+
"n",
|
293 |
+
"o",
|
294 |
+
"p",
|
295 |
+
"q",
|
296 |
+
"r",
|
297 |
+
"s",
|
298 |
+
"t",
|
299 |
+
"u",
|
300 |
+
"v",
|
301 |
+
"w",
|
302 |
+
"x",
|
303 |
+
"y",
|
304 |
+
"z",
|
305 |
+
"braceleft",
|
306 |
+
"bar",
|
307 |
+
"braceright",
|
308 |
+
"asciitilde",
|
309 |
+
"exclamdown",
|
310 |
+
"cent",
|
311 |
+
"sterling",
|
312 |
+
"fraction",
|
313 |
+
"yen",
|
314 |
+
"florin",
|
315 |
+
"section",
|
316 |
+
"currency",
|
317 |
+
"quotesingle",
|
318 |
+
"quotedblleft",
|
319 |
+
"guillemotleft",
|
320 |
+
"guilsinglleft",
|
321 |
+
"guilsinglright",
|
322 |
+
"fi",
|
323 |
+
"fl",
|
324 |
+
"endash",
|
325 |
+
"dagger",
|
326 |
+
"daggerdbl",
|
327 |
+
"periodcentered",
|
328 |
+
"paragraph",
|
329 |
+
"bullet",
|
330 |
+
"quotesinglbase",
|
331 |
+
"quotedblbase",
|
332 |
+
"quotedblright",
|
333 |
+
"guillemotright",
|
334 |
+
"ellipsis",
|
335 |
+
"perthousand",
|
336 |
+
"questiondown",
|
337 |
+
"grave",
|
338 |
+
"acute",
|
339 |
+
"circumflex",
|
340 |
+
"tilde",
|
341 |
+
"macron",
|
342 |
+
"breve",
|
343 |
+
"dotaccent",
|
344 |
+
"dieresis",
|
345 |
+
"ring",
|
346 |
+
"cedilla",
|
347 |
+
"hungarumlaut",
|
348 |
+
"ogonek",
|
349 |
+
"caron",
|
350 |
+
"emdash",
|
351 |
+
"AE",
|
352 |
+
"ordfeminine",
|
353 |
+
"Lslash",
|
354 |
+
"Oslash",
|
355 |
+
"OE",
|
356 |
+
"ordmasculine",
|
357 |
+
"ae",
|
358 |
+
"dotlessi",
|
359 |
+
"lslash",
|
360 |
+
"oslash",
|
361 |
+
"oe",
|
362 |
+
"germandbls",
|
363 |
+
"onesuperior",
|
364 |
+
"logicalnot",
|
365 |
+
"mu",
|
366 |
+
"trademark",
|
367 |
+
"Eth",
|
368 |
+
"onehalf",
|
369 |
+
"plusminus",
|
370 |
+
"Thorn",
|
371 |
+
"onequarter",
|
372 |
+
"divide",
|
373 |
+
"brokenbar",
|
374 |
+
"degree",
|
375 |
+
"thorn",
|
376 |
+
"threequarters",
|
377 |
+
"twosuperior",
|
378 |
+
"registered",
|
379 |
+
"minus",
|
380 |
+
"eth",
|
381 |
+
"multiply",
|
382 |
+
"threesuperior",
|
383 |
+
"copyright",
|
384 |
+
"Aacute",
|
385 |
+
"Acircumflex",
|
386 |
+
"Adieresis",
|
387 |
+
"Agrave",
|
388 |
+
"Aring",
|
389 |
+
"Atilde",
|
390 |
+
"Ccedilla",
|
391 |
+
"Eacute",
|
392 |
+
"Ecircumflex",
|
393 |
+
"Edieresis",
|
394 |
+
"Egrave",
|
395 |
+
"Iacute",
|
396 |
+
"Icircumflex",
|
397 |
+
"Idieresis",
|
398 |
+
"Igrave",
|
399 |
+
"Ntilde",
|
400 |
+
"Oacute",
|
401 |
+
"Ocircumflex",
|
402 |
+
"Odieresis",
|
403 |
+
"Ograve",
|
404 |
+
"Otilde",
|
405 |
+
"Scaron",
|
406 |
+
"Uacute",
|
407 |
+
"Ucircumflex",
|
408 |
+
"Udieresis",
|
409 |
+
"Ugrave",
|
410 |
+
"Yacute",
|
411 |
+
"Ydieresis",
|
412 |
+
"Zcaron",
|
413 |
+
"aacute",
|
414 |
+
"acircumflex",
|
415 |
+
"adieresis",
|
416 |
+
"agrave",
|
417 |
+
"aring",
|
418 |
+
"atilde",
|
419 |
+
"ccedilla",
|
420 |
+
"eacute",
|
421 |
+
"ecircumflex",
|
422 |
+
"edieresis",
|
423 |
+
"egrave",
|
424 |
+
"iacute",
|
425 |
+
"icircumflex",
|
426 |
+
"idieresis",
|
427 |
+
"igrave",
|
428 |
+
"ntilde",
|
429 |
+
"oacute",
|
430 |
+
"ocircumflex",
|
431 |
+
"odieresis",
|
432 |
+
"ograve",
|
433 |
+
"otilde",
|
434 |
+
"scaron",
|
435 |
+
"uacute",
|
436 |
+
"ucircumflex",
|
437 |
+
"udieresis",
|
438 |
+
"ugrave",
|
439 |
+
"yacute",
|
440 |
+
"ydieresis",
|
441 |
+
"zcaron",
|
442 |
+
"exclamsmall",
|
443 |
+
"Hungarumlautsmall",
|
444 |
+
"dollaroldstyle",
|
445 |
+
"dollarsuperior",
|
446 |
+
"ampersandsmall",
|
447 |
+
"Acutesmall",
|
448 |
+
"parenleftsuperior",
|
449 |
+
"parenrightsuperior",
|
450 |
+
"twodotenleader",
|
451 |
+
"onedotenleader",
|
452 |
+
"zerooldstyle",
|
453 |
+
"oneoldstyle",
|
454 |
+
"twooldstyle",
|
455 |
+
"threeoldstyle",
|
456 |
+
"fouroldstyle",
|
457 |
+
"fiveoldstyle",
|
458 |
+
"sixoldstyle",
|
459 |
+
"sevenoldstyle",
|
460 |
+
"eightoldstyle",
|
461 |
+
"nineoldstyle",
|
462 |
+
"commasuperior",
|
463 |
+
"threequartersemdash",
|
464 |
+
"periodsuperior",
|
465 |
+
"questionsmall",
|
466 |
+
"asuperior",
|
467 |
+
"bsuperior",
|
468 |
+
"centsuperior",
|
469 |
+
"dsuperior",
|
470 |
+
"esuperior",
|
471 |
+
"isuperior",
|
472 |
+
"lsuperior",
|
473 |
+
"msuperior",
|
474 |
+
"nsuperior",
|
475 |
+
"osuperior",
|
476 |
+
"rsuperior",
|
477 |
+
"ssuperior",
|
478 |
+
"tsuperior",
|
479 |
+
"ff",
|
480 |
+
"ffi",
|
481 |
+
"ffl",
|
482 |
+
"parenleftinferior",
|
483 |
+
"parenrightinferior",
|
484 |
+
"Circumflexsmall",
|
485 |
+
"hyphensuperior",
|
486 |
+
"Gravesmall",
|
487 |
+
"Asmall",
|
488 |
+
"Bsmall",
|
489 |
+
"Csmall",
|
490 |
+
"Dsmall",
|
491 |
+
"Esmall",
|
492 |
+
"Fsmall",
|
493 |
+
"Gsmall",
|
494 |
+
"Hsmall",
|
495 |
+
"Ismall",
|
496 |
+
"Jsmall",
|
497 |
+
"Ksmall",
|
498 |
+
"Lsmall",
|
499 |
+
"Msmall",
|
500 |
+
"Nsmall",
|
501 |
+
"Osmall",
|
502 |
+
"Psmall",
|
503 |
+
"Qsmall",
|
504 |
+
"Rsmall",
|
505 |
+
"Ssmall",
|
506 |
+
"Tsmall",
|
507 |
+
"Usmall",
|
508 |
+
"Vsmall",
|
509 |
+
"Wsmall",
|
510 |
+
"Xsmall",
|
511 |
+
"Ysmall",
|
512 |
+
"Zsmall",
|
513 |
+
"colonmonetary",
|
514 |
+
"onefitted",
|
515 |
+
"rupiah",
|
516 |
+
"Tildesmall",
|
517 |
+
"exclamdownsmall",
|
518 |
+
"centoldstyle",
|
519 |
+
"Lslashsmall",
|
520 |
+
"Scaronsmall",
|
521 |
+
"Zcaronsmall",
|
522 |
+
"Dieresissmall",
|
523 |
+
"Brevesmall",
|
524 |
+
"Caronsmall",
|
525 |
+
"Dotaccentsmall",
|
526 |
+
"Macronsmall",
|
527 |
+
"figuredash",
|
528 |
+
"hypheninferior",
|
529 |
+
"Ogoneksmall",
|
530 |
+
"Ringsmall",
|
531 |
+
"Cedillasmall",
|
532 |
+
"questiondownsmall",
|
533 |
+
"oneeighth",
|
534 |
+
"threeeighths",
|
535 |
+
"fiveeighths",
|
536 |
+
"seveneighths",
|
537 |
+
"onethird",
|
538 |
+
"twothirds",
|
539 |
+
"zerosuperior",
|
540 |
+
"foursuperior",
|
541 |
+
"fivesuperior",
|
542 |
+
"sixsuperior",
|
543 |
+
"sevensuperior",
|
544 |
+
"eightsuperior",
|
545 |
+
"ninesuperior",
|
546 |
+
"zeroinferior",
|
547 |
+
"oneinferior",
|
548 |
+
"twoinferior",
|
549 |
+
"threeinferior",
|
550 |
+
"fourinferior",
|
551 |
+
"fiveinferior",
|
552 |
+
"sixinferior",
|
553 |
+
"seveninferior",
|
554 |
+
"eightinferior",
|
555 |
+
"nineinferior",
|
556 |
+
"centinferior",
|
557 |
+
"dollarinferior",
|
558 |
+
"periodinferior",
|
559 |
+
"commainferior",
|
560 |
+
"Agravesmall",
|
561 |
+
"Aacutesmall",
|
562 |
+
"Acircumflexsmall",
|
563 |
+
"Atildesmall",
|
564 |
+
"Adieresissmall",
|
565 |
+
"Aringsmall",
|
566 |
+
"AEsmall",
|
567 |
+
"Ccedillasmall",
|
568 |
+
"Egravesmall",
|
569 |
+
"Eacutesmall",
|
570 |
+
"Ecircumflexsmall",
|
571 |
+
"Edieresissmall",
|
572 |
+
"Igravesmall",
|
573 |
+
"Iacutesmall",
|
574 |
+
"Icircumflexsmall",
|
575 |
+
"Idieresissmall",
|
576 |
+
"Ethsmall",
|
577 |
+
"Ntildesmall",
|
578 |
+
"Ogravesmall",
|
579 |
+
"Oacutesmall",
|
580 |
+
"Ocircumflexsmall",
|
581 |
+
"Otildesmall",
|
582 |
+
"Odieresissmall",
|
583 |
+
"OEsmall",
|
584 |
+
"Oslashsmall",
|
585 |
+
"Ugravesmall",
|
586 |
+
"Uacutesmall",
|
587 |
+
"Ucircumflexsmall",
|
588 |
+
"Udieresissmall",
|
589 |
+
"Yacutesmall",
|
590 |
+
"Thornsmall",
|
591 |
+
"Ydieresissmall",
|
592 |
+
"001.000",
|
593 |
+
"001.001",
|
594 |
+
"001.002",
|
595 |
+
"001.003",
|
596 |
+
"Black",
|
597 |
+
"Bold",
|
598 |
+
"Book",
|
599 |
+
"Light",
|
600 |
+
"Medium",
|
601 |
+
"Regular",
|
602 |
+
"Roman",
|
603 |
+
"Semibold",
|
604 |
+
)
|
605 |
+
|
606 |
+
class INDEX:
    """A CFF INDEX structure: a counted array of variable-size records.

    Layout: 2-byte count, 1-byte offset size, ``count + 1`` offsets,
    then the concatenated record data.
    """

    def __init__(self, fp: BinaryIO) -> None:
        self.fp = fp
        self.offsets: List[int] = []
        (count, offsize) = struct.unpack(">HB", self.fp.read(3))
        for i in range(count + 1):
            self.offsets.append(nunpack(self.fp.read(offsize)))
        # Offsets are 1-based relative to the byte after the offset array.
        self.base = self.fp.tell() - 1
        # Skip past the record data so the stream sits at the next table.
        self.fp.seek(self.base + self.offsets[-1])

    def __repr__(self) -> str:
        return "<INDEX: size=%d>" % len(self)

    def __len__(self) -> int:
        return len(self.offsets) - 1

    def __getitem__(self, i: int) -> bytes:
        # Seek to the i-th record and read exactly its byte span.
        self.fp.seek(self.base + self.offsets[i])
        return self.fp.read(self.offsets[i + 1] - self.offsets[i])

    def __iter__(self) -> Iterator[bytes]:
        return iter(self[i] for i in range(len(self)))
|
628 |
+
|
629 |
+
def __init__(self, name: str, fp: BinaryIO) -> None:
    """Parse a CFF (Type1C) font program from *fp*.

    Reads the header and the four top-level INDEX tables, then
    decodes the Encoding (code <-> gid) and Charset (name <-> gid)
    structures located via the Top DICT.
    """
    self.name = name
    self.fp = fp
    # Header
    (_major, _minor, hdrsize, offsize) = struct.unpack("BBBB", self.fp.read(4))
    self.fp.read(hdrsize - 4)
    # Name INDEX
    self.name_index = self.INDEX(self.fp)
    # Top DICT INDEX
    self.dict_index = self.INDEX(self.fp)
    # String INDEX
    self.string_index = self.INDEX(self.fp)
    # Global Subr INDEX
    self.subr_index = self.INDEX(self.fp)
    # Top DICT DATA
    self.top_dict = getdict(self.dict_index[0])
    # DICT operators 15/16/17 hold the byte offsets of the charset,
    # encoding and CharStrings tables (0 if absent).
    (charset_pos,) = self.top_dict.get(15, [0])
    (encoding_pos,) = self.top_dict.get(16, [0])
    (charstring_pos,) = self.top_dict.get(17, [0])
    # CharStrings
    self.fp.seek(cast(int, charstring_pos))
    self.charstring = self.INDEX(self.fp)
    self.nglyphs = len(self.charstring)
    # Encodings
    self.code2gid = {}
    self.gid2code = {}
    self.fp.seek(cast(int, encoding_pos))
    format = self.fp.read(1)
    if format == b"\x00":
        # Format 0: a flat array of one gid per code.
        (n,) = struct.unpack("B", self.fp.read(1))
        for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))):
            self.code2gid[code] = gid
            self.gid2code[gid] = code
    elif format == b"\x01":
        # Format 1: ranges of (first, nLeft) covering consecutive codes.
        (n,) = struct.unpack("B", self.fp.read(1))
        code = 0
        for i in range(n):
            (first, nleft) = struct.unpack("BB", self.fp.read(2))
            for gid in range(first, first + nleft + 1):
                self.code2gid[code] = gid
                self.gid2code[gid] = code
                code += 1
    else:
        raise PDFValueError("unsupported encoding format: %r" % format)
    # Charsets
    self.name2gid = {}
    self.gid2name = {}
    self.fp.seek(cast(int, charset_pos))
    format = self.fp.read(1)
    if format == b"\x00":
        # Format 0: one SID per glyph (gid 0 /.notdef is implicit).
        n = self.nglyphs - 1
        for gid, sid in enumerate(
            cast(
                Tuple[int, ...], struct.unpack(">" + "H" * n, self.fp.read(2 * n))
            ),
        ):
            gid += 1
            sidname = self.getstr(sid)
            self.name2gid[sidname] = gid
            self.gid2name[gid] = sidname
    elif format == b"\x01":
        # Format 1: (first, nLeft) ranges of consecutive SIDs.
        (n,) = struct.unpack("B", self.fp.read(1))
        sid = 0
        for i in range(n):
            (first, nleft) = struct.unpack("BB", self.fp.read(2))
            for gid in range(first, first + nleft + 1):
                sidname = self.getstr(sid)
                self.name2gid[sidname] = gid
                self.gid2name[gid] = sidname
                sid += 1
    elif format == b"\x02":
        # Format 2
        assert False, str(("Unhandled", format))
    else:
        raise PDFValueError("unsupported charset format: %r" % format)
|
708 |
+
|
709 |
+
def getstr(self, sid: int) -> Union[str, bytes]:
    """Resolve a CFF string ID (SID) to its string.

    SIDs below the standard-strings count index the built-in
    STANDARD_STRINGS table (returned as ``str``); larger SIDs index
    this font's own String INDEX (returned as ``bytes``), which is
    a needless source of type complexity but kept for compatibility.
    """
    n_standard = len(self.STANDARD_STRINGS)
    if sid < n_standard:
        return self.STANDARD_STRINGS[sid]
    return self.string_index[sid - n_standard]
|
715 |
+
|
716 |
+
|
717 |
+
class TrueTypeFont:
    """Minimal TrueType/OpenType reader used to recover a unicode cmap."""

    class CMapNotFound(PDFException):
        # Raised when the font has no usable "cmap" table.
        pass

    def __init__(self, name: str, fp: BinaryIO) -> None:
        self.name = name
        self.fp = fp
        # Table directory: tag -> (byte offset, byte length).
        self.tables: Dict[bytes, Tuple[int, int]] = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = cast(
                Tuple[int, int, int, int],
                struct.unpack(">HHHH", fp.read(8)),
            )
            for _ in range(ntables):
                (name_bytes, tsum, offset, length) = cast(
                    Tuple[bytes, int, int, int],
                    struct.unpack(">4sLLL", fp.read(16)),
                )
                self.tables[name_bytes] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass

    def create_unicode_map(self) -> FileUnicodeMap:
        """Build a glyph-id -> unicode map from the font's "cmap" table.

        Raises TrueTypeFont.CMapNotFound if no Unicode subtable of a
        supported format (0, 2 or 4) yields any mapping.
        """
        if b"cmap" not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, length) = self.tables[b"cmap"]
        fp = self.fp
        fp.seek(base_offset)
        (version, nsubtables) = cast(Tuple[int, int], struct.unpack(">HH", fp.read(4)))
        subtables: List[Tuple[int, int, int]] = []
        for i in range(nsubtables):
            subtables.append(
                cast(Tuple[int, int, int], struct.unpack(">HHL", fp.read(8))),
            )
        char2gid: Dict[int, int] = {}
        # Only supports subtable type 0, 2 and 4.
        for platform_id, encoding_id, st_offset in subtables:
            # Skip non-Unicode cmaps.
            # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
            if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
                continue
            fp.seek(base_offset + st_offset)
            (fmttype, fmtlen, fmtlang) = cast(
                Tuple[int, int, int],
                struct.unpack(">HHH", fp.read(6)),
            )
            if fmttype == 0:
                # Format 0: a flat 256-entry byte table of glyph ids.
                char2gid.update(
                    enumerate(
                        cast(Tuple[int, ...], struct.unpack(">256B", fp.read(256))),
                    ),
                )
            elif fmttype == 2:
                # Format 2: high-byte mapping via sub-header records.
                subheaderkeys = cast(
                    Tuple[int, ...],
                    struct.unpack(">256H", fp.read(512)),
                )
                firstbytes = [0] * 8192
                for i, k in enumerate(subheaderkeys):
                    firstbytes[k // 8] = i
                nhdrs = max(subheaderkeys) // 8 + 1
                hdrs: List[Tuple[int, int, int, int, int]] = []
                for i in range(nhdrs):
                    (firstcode, entcount, delta, offset) = cast(
                        Tuple[int, int, int, int],
                        struct.unpack(">HHhH", fp.read(8)),
                    )
                    hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
                for i, firstcode, entcount, delta, pos in hdrs:
                    if not entcount:
                        continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in range(entcount):
                        gid = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
                        if gid:
                            gid += delta
                        char2gid[first + c] = gid
            elif fmttype == 4:
                # Format 4: segmented coverage with per-segment deltas.
                (segcount, _1, _2, _3) = cast(
                    Tuple[int, int, int, int],
                    struct.unpack(">HHHH", fp.read(8)),
                )
                segcount //= 2
                ecs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                fp.read(2)
                scs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                idds = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dh" % segcount, fp.read(2 * segcount)),
                )
                pos = fp.tell()
                idrs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                for ec, sc, idd, idr in zip(ecs, scs, idds, idrs):
                    if idr:
                        # Non-zero idRangeOffset: glyph ids come from the
                        # glyphIdArray, then have the delta applied.
                        fp.seek(pos + idr)
                        for c in range(sc, ec + 1):
                            b = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
                            char2gid[c] = (b + idd) & 0xFFFF
                    else:
                        for c in range(sc, ec + 1):
                            char2gid[c] = (c + idd) & 0xFFFF
            else:
                assert False, str(("Unhandled", fmttype))
        if not char2gid:
            raise TrueTypeFont.CMapNotFound
        # create unicode map
        unicode_map = FileUnicodeMap()
        for char, gid in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map
|
841 |
+
|
842 |
+
|
843 |
+
class PDFFontError(PDFException):
    """Raised when a PDF font dictionary or font program cannot be
    interpreted (e.g. missing /BaseFont or /FontDescriptor in strict mode).
    """

    pass
|
845 |
+
|
846 |
+
|
847 |
+
class PDFUnicodeNotDefined(PDFFontError):
    """Raised by ``to_unichr`` when a character code has no Unicode mapping
    in either the font's ToUnicode CMap or its encoding table.
    """

    pass
|
849 |
+
|
850 |
+
|
851 |
+
LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")
|
852 |
+
LITERAL_TYPE1C = LIT("Type1C")
|
853 |
+
|
854 |
+
# Font widths are maintained in a dict type that maps from *either* unicode
|
855 |
+
# chars or integer character IDs.
|
856 |
+
FontWidthDict = Union[Dict[int, float], Dict[str, float]]
|
857 |
+
|
858 |
+
|
859 |
+
class PDFFont:
    """Base class for all PDF font objects.

    Holds the metrics shared by every font type — per-character widths,
    bounding box, ascent/descent, flags — as read from the font
    descriptor dictionary.  Glyph-space values are converted to text
    space via ``hscale``/``vscale`` (1/1000 here; Type 3 fonts override
    these from their /FontMatrix).
    """

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        default_width: Optional[float] = None,
    ) -> None:
        self.descriptor = descriptor
        self.widths: FontWidthDict = resolve_all(widths)
        self.fontname = resolve1(descriptor.get("FontName", "unknown"))
        if isinstance(self.fontname, PSLiteral):
            self.fontname = literal_name(self.fontname)
        self.flags = int_value(descriptor.get("Flags", 0))
        self.ascent = num_value(descriptor.get("Ascent", 0))
        self.descent = num_value(descriptor.get("Descent", 0))
        self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))
        # An explicit default_width argument (e.g. a CID font's /DW) takes
        # precedence over the descriptor's /MissingWidth.
        if default_width is None:
            self.default_width = num_value(descriptor.get("MissingWidth", 0))
        else:
            self.default_width = default_width
        self.default_width = resolve1(self.default_width)
        self.leading = num_value(descriptor.get("Leading", 0))
        self.bbox = cast(
            Rect,
            list_value(resolve_all(descriptor.get("FontBBox", (0, 0, 0, 0)))),
        )
        self.hscale = self.vscale = 0.001

        # PDF RM 9.8.1 specifies /Descent should always be a negative number.
        # PScript5.dll seems to produce Descent with a positive number, but
        # text analysis will be wrong if this is taken as correct. So force
        # descent to negative.
        if self.descent > 0:
            self.descent = -self.descent

    def __repr__(self) -> str:
        return "<PDFFont>"

    def is_vertical(self) -> bool:
        # Overridden by CID fonts whose CMap selects vertical writing mode.
        return False

    def is_multibyte(self) -> bool:
        # Simple fonts use one byte per character code.
        return False

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Decode a byte string into character IDs (one CID per byte here)."""
        return bytearray(bytes)  # map(ord, bytes)

    def get_ascent(self) -> float:
        """Ascent above the baseline, in text space units"""
        return self.ascent * self.vscale

    def get_descent(self) -> float:
        """Descent below the baseline, in text space units; always negative"""
        return self.descent * self.vscale

    def get_width(self) -> float:
        """Nominal glyph width in text space, from the bbox when non-empty."""
        w = self.bbox[2] - self.bbox[0]
        if w == 0:
            w = -self.default_width
        return w * self.hscale

    def get_height(self) -> float:
        """Nominal glyph height in text space; falls back to ascent-descent."""
        h = self.bbox[3] - self.bbox[1]
        if h == 0:
            h = self.ascent - self.descent
        return h * self.vscale

    def char_width(self, cid: int) -> float:
        """Width of the glyph for ``cid`` in text space units."""
        # Because character widths may be mapping either IDs or strings,
        # we try to lookup the character ID first, then its str equivalent.
        try:
            return cast(Dict[int, float], self.widths)[cid] * self.hscale
        except KeyError:
            str_widths = cast(Dict[str, float], self.widths)
            try:
                return str_widths[self.to_unichr(cid)] * self.hscale
            except (KeyError, PDFUnicodeNotDefined):
                return self.default_width * self.hscale

    def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return 0

    def string_width(self, s: bytes) -> float:
        """Total advance width of the byte string ``s`` in text space."""
        return sum(self.char_width(cid) for cid in self.decode(s))

    def to_unichr(self, cid: int) -> str:
        # Subclasses must map a CID to its Unicode string.
        raise NotImplementedError
|
947 |
+
|
948 |
+
|
949 |
+
class PDFSimpleFont(PDFFont):
    """Single-byte font (Type 1, Type 3, TrueType) with an 8-bit encoding.

    Resolves the /Encoding entry — either a named built-in encoding or a
    dictionary with /Differences — into a cid->unicode table, and parses
    an optional /ToUnicode CMap for explicit Unicode mapping.
    """

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        spec: Mapping[str, Any],
    ) -> None:
        # Font encoding is specified either by a name of
        # built-in encoding or a dictionary that describes
        # the differences.
        if "Encoding" in spec:
            encoding = resolve1(spec["Encoding"])
        else:
            encoding = LITERAL_STANDARD_ENCODING
        if isinstance(encoding, dict):
            name = literal_name(encoding.get("BaseEncoding", LITERAL_STANDARD_ENCODING))
            diff = list_value(encoding.get("Differences", []))
            self.cid2unicode = EncodingDB.get_encoding(name, diff)
        else:
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
        self.unicode_map: Optional[UnicodeMap] = None
        if "ToUnicode" in spec:
            strm = stream_value(spec["ToUnicode"])
            self.unicode_map = FileUnicodeMap()
            CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
        PDFFont.__init__(self, descriptor, widths)

    def to_unichr(self, cid: int) -> str:
        """Map a character code to Unicode.

        The /ToUnicode CMap takes precedence over the encoding table;
        raises PDFUnicodeNotDefined when neither knows the code.
        """
        if self.unicode_map:
            try:
                return self.unicode_map.get_unichr(cid)
            except KeyError:
                pass
        try:
            return self.cid2unicode[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)
|
986 |
+
|
987 |
+
|
988 |
+
class PDFType1Font(PDFSimpleFont):
    """Type 1 (and MMType1) font loaded from a PDF font dictionary.

    Metrics for the built-in fonts come from FontMetricsDB; otherwise the
    descriptor and /Widths array of the spec are used.  If the encoding is
    missing but the Type 1 font program is embedded (/FontFile), the
    encoding is recovered from the font file's cleartext header.
    """

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"

        widths: FontWidthDict
        try:
            # Built-in fonts carry no embedded metrics; use the AFM database.
            (descriptor, int_widths) = FontMetricsDB.get_metrics(self.basefont)
            widths = cast(Dict[str, float], int_widths)  # implicit int->float
        except KeyError:
            descriptor = dict_value(spec.get("FontDescriptor", {}))
            firstchar = int_value(spec.get("FirstChar", 0))
            # lastchar = int_value(spec.get('LastChar', 255))
            width_list = list_value(spec.get("Widths", [0] * 256))
            widths = {i + firstchar: resolve1(w) for (i, w) in enumerate(width_list)}
        PDFSimpleFont.__init__(self, descriptor, widths, spec)
        if "Encoding" not in spec and "FontFile" in descriptor:
            # try to recover the missing encoding info from the font file.
            self.fontfile = stream_value(descriptor.get("FontFile"))
            length1 = int_value(self.fontfile["Length1"])
            data = self.fontfile.get_data()[:length1]
            parser = Type1FontHeaderParser(BytesIO(data))
            self.cid2unicode = parser.get_encoding()

    def __repr__(self) -> str:
        return "<PDFType1Font: basefont=%r>" % self.basefont
|
1018 |
+
|
1019 |
+
|
1020 |
+
class PDFTrueTypeFont(PDFType1Font):
    """TrueType font; reuses all of PDFType1Font's loading logic and only
    distinguishes itself in its repr.
    """

    def __repr__(self) -> str:
        # Same format as the parent class, with this class's name.
        description = "<PDFTrueTypeFont: basefont=%r>" % self.basefont
        return description
|
1023 |
+
|
1024 |
+
|
1025 |
+
class PDFType3Font(PDFSimpleFont):
    """Type 3 font whose glyphs are drawn by embedded content streams.

    Unlike other font types, glyph space maps to text space through the
    font's /FontMatrix rather than the fixed 1/1000 scale.
    """

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        firstchar = int_value(spec.get("FirstChar", 0))
        # lastchar = int_value(spec.get('LastChar', 0))
        width_list = list_value(spec.get("Widths", [0] * 256))
        widths = {i + firstchar: w for (i, w) in enumerate(width_list)}
        if "FontDescriptor" in spec:
            descriptor = dict_value(spec["FontDescriptor"])
        else:
            # Synthesize a minimal descriptor; /FontBBox is required by spec.
            descriptor = {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
        PDFSimpleFont.__init__(self, descriptor, widths, spec)
        self.matrix = cast(Matrix, tuple(list_value(spec.get("FontMatrix"))))
        # Vertical extent comes from the bbox; scale factors from the matrix.
        (_, self.descent, _, self.ascent) = self.bbox
        (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))

    def __repr__(self) -> str:
        return "<PDFType3Font>"
|
1042 |
+
|
1043 |
+
|
1044 |
+
class PDFCIDFont(PDFFont):
    """Composite (Type 0 descendant) font that addresses glyphs by CID.

    Character codes are decoded to CIDs through the font's CMap, and CIDs
    are mapped back to Unicode through (in order of preference) an explicit
    /ToUnicode CMap, the embedded TrueType cmap table, or a predefined
    system CMap looked up by Registry-Ordering.
    """

    # Either a horizontal default width or a (vx, vy) tuple for vertical mode.
    default_disp: Union[float, Tuple[Optional[float], float]]

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        spec: Mapping[str, Any],
        strict: bool = settings.STRICT,
    ) -> None:
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if strict:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"
        self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
        # Registry/Ordering identify the character collection, e.g. Adobe-Japan1.
        cid_registry = resolve1(self.cidsysteminfo.get("Registry", b"unknown")).decode(
            "latin1",
        )
        cid_ordering = resolve1(self.cidsysteminfo.get("Ordering", b"unknown")).decode(
            "latin1",
        )
        self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
        self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)

        try:
            descriptor = dict_value(spec["FontDescriptor"])
        except KeyError:
            if strict:
                raise PDFFontError("FontDescriptor is missing")
            descriptor = {}
        ttf = None
        if "FontFile2" in descriptor:
            # Embedded TrueType font program; kept for cmap-based Unicode mapping.
            self.fontfile = stream_value(descriptor.get("FontFile2"))
            ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))
        self.unicode_map: Optional[UnicodeMap] = None
        if "ToUnicode" in spec:
            if isinstance(spec["ToUnicode"], PDFStream):
                strm = stream_value(spec["ToUnicode"])
                self.unicode_map = FileUnicodeMap()
                CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
            else:
                # /ToUnicode given as a name: only Identity mappings are handled.
                cmap_name = literal_name(spec["ToUnicode"])
                encoding = literal_name(spec["Encoding"])
                if (
                    "Identity" in cid_ordering
                    or "Identity" in cmap_name
                    or "Identity" in encoding
                ):
                    self.unicode_map = IdentityUnicodeMap()
        elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"):
            # No predefined system CMap exists; try the embedded font's table.
            if ttf:
                try:
                    self.unicode_map = ttf.create_unicode_map()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            try:
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding,
                    self.cmap.is_vertical(),
                )
            except CMapDB.CMapNotFound:
                pass

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            widths2 = get_widths2(list_value(spec.get("W2", [])))
            self.disps = {cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()}
            (vy, w) = resolve1(spec.get("DW2", [880, -1000]))
            self.default_disp = (None, vy)
            widths = {cid: w for (cid, (w, _)) in widths2.items()}
            default_width = w
        else:
            # writing mode: horizontal
            self.disps = {}
            self.default_disp = 0
            widths = get_widths(list_value(spec.get("W", [])))
            default_width = spec.get("DW", 1000)
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)

    def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
        """Get cmap from font specification

        For certain PDFs, Encoding Type isn't mentioned as an attribute of
        Encoding but as an attribute of CMapName, where CMapName is an
        attribute of spec['Encoding'].
        The horizontal/vertical modes are mentioned with different name
        such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.
        """
        cmap_name = self._get_cmap_name(spec, strict)

        try:
            return CMapDB.get_cmap(cmap_name)
        except CMapDB.CMapNotFound as e:
            if strict:
                raise PDFFontError(e)
            return CMap()

    @staticmethod
    def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
        """Get cmap name from font specification"""
        cmap_name = "unknown"  # default value

        try:
            spec_encoding = spec["Encoding"]
            if hasattr(spec_encoding, "name"):
                cmap_name = literal_name(spec["Encoding"])
            else:
                cmap_name = literal_name(spec_encoding["CMapName"])
        except KeyError:
            if strict:
                raise PDFFontError("Encoding is unspecified")

        # Some producers put the name inside an embedded CMap stream.
        if type(cmap_name) is PDFStream:  # type: ignore[comparison-overlap]
            cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
            if "CMapName" in cmap_name_stream:
                cmap_name = cmap_name_stream.get("CMapName").name
            elif strict:
                raise PDFFontError("CMapName unspecified for encoding")

        # Normalize the various identity-encoding aliases.
        return IDENTITY_ENCODER.get(cmap_name, cmap_name)

    def __repr__(self) -> str:
        return f"<PDFCIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>"

    def is_vertical(self) -> bool:
        return self.vertical

    def is_multibyte(self) -> bool:
        return True

    def decode(self, bytes: bytes) -> Iterable[int]:
        # Delegate code->CID decoding to the font's CMap (may be multi-byte).
        return self.cmap.decode(bytes)

    def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return self.disps.get(cid, self.default_disp)

    def to_unichr(self, cid: int) -> str:
        """Map a CID to Unicode via the resolved unicode map, or raise."""
        try:
            if not self.unicode_map:
                raise PDFKeyError(cid)
            return self.unicode_map.get_unichr(cid)
        except KeyError:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
pdf2zh/pdfinterp.py
ADDED
@@ -0,0 +1,1113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import re
|
3 |
+
from io import BytesIO
|
4 |
+
from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
from pdf2zh import settings
|
8 |
+
from pdf2zh.casting import safe_float
|
9 |
+
from pdf2zh.cmapdb import CMap, CMapBase, CMapDB
|
10 |
+
from pdf2zh.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
|
11 |
+
from pdf2zh.pdfdevice import PDFDevice, PDFTextSeq
|
12 |
+
from pdf2zh.pdfexceptions import PDFException, PDFValueError
|
13 |
+
from pdf2zh.pdffont import (
|
14 |
+
PDFCIDFont,
|
15 |
+
PDFFont,
|
16 |
+
PDFFontError,
|
17 |
+
PDFTrueTypeFont,
|
18 |
+
PDFType1Font,
|
19 |
+
PDFType3Font,
|
20 |
+
)
|
21 |
+
from pdf2zh.pdfpage import PDFPage
|
22 |
+
from pdf2zh.pdftypes import (
|
23 |
+
LITERALS_ASCII85_DECODE,
|
24 |
+
PDFObjRef,
|
25 |
+
PDFStream,
|
26 |
+
dict_value,
|
27 |
+
list_value,
|
28 |
+
resolve1,
|
29 |
+
stream_value,
|
30 |
+
)
|
31 |
+
from pdf2zh.psexceptions import PSEOF, PSTypeError
|
32 |
+
from pdf2zh.psparser import (
|
33 |
+
KWD,
|
34 |
+
LIT,
|
35 |
+
PSKeyword,
|
36 |
+
PSLiteral,
|
37 |
+
PSStackParser,
|
38 |
+
PSStackType,
|
39 |
+
keyword_name,
|
40 |
+
literal_name,
|
41 |
+
)
|
42 |
+
from pdf2zh.utils import (
|
43 |
+
MATRIX_IDENTITY,
|
44 |
+
Matrix,
|
45 |
+
PathSegment,
|
46 |
+
Point,
|
47 |
+
Rect,
|
48 |
+
choplist,
|
49 |
+
mult_matrix,
|
50 |
+
apply_matrix_pt,
|
51 |
+
)
|
52 |
+
|
53 |
+
log = logging.getLogger(__name__)
|
54 |
+
|
55 |
+
|
56 |
+
class PDFResourceError(PDFException):
    """Error concerning a page's resource dictionary entries."""

    pass
|
58 |
+
|
59 |
+
|
60 |
+
class PDFInterpreterError(PDFException):
    """Error raised while interpreting page content."""

    pass
|
62 |
+
|
63 |
+
|
64 |
+
LITERAL_PDF = LIT("PDF")
|
65 |
+
LITERAL_TEXT = LIT("Text")
|
66 |
+
LITERAL_FONT = LIT("Font")
|
67 |
+
LITERAL_FORM = LIT("Form")
|
68 |
+
LITERAL_IMAGE = LIT("Image")
|
69 |
+
|
70 |
+
|
71 |
+
class PDFTextState:
    """Mutable text state: the current font, spacing parameters and the
    text/line matrices used while rendering text objects.
    """

    # Declared for type checkers; assigned in reset().
    matrix: Matrix
    linematrix: Point

    def __init__(self) -> None:
        self.font: Optional[PDFFont] = None
        self.fontsize: float = 0
        self.charspace: float = 0
        self.wordspace: float = 0
        self.scaling: float = 100  # horizontal scaling in percent
        self.leading: float = 0
        self.render: int = 0  # text rendering mode
        self.rise: float = 0
        self.reset()
        # self.matrix is set
        # self.linematrix is set

    def __repr__(self) -> str:
        return (
            "<PDFTextState: font=%r, fontsize=%r, charspace=%r, "
            "wordspace=%r, scaling=%r, leading=%r, render=%r, rise=%r, "
            "matrix=%r, linematrix=%r>"
            % (
                self.font,
                self.fontsize,
                self.charspace,
                self.wordspace,
                self.scaling,
                self.leading,
                self.render,
                self.rise,
                self.matrix,
                self.linematrix,
            )
        )

    def copy(self) -> "PDFTextState":
        """Return an independent copy of this state (all fields shallow)."""
        obj = PDFTextState()
        obj.font = self.font
        obj.fontsize = self.fontsize
        obj.charspace = self.charspace
        obj.wordspace = self.wordspace
        obj.scaling = self.scaling
        obj.leading = self.leading
        obj.render = self.render
        obj.rise = self.rise
        obj.matrix = self.matrix
        obj.linematrix = self.linematrix
        return obj

    def reset(self) -> None:
        """Reset the text matrix and line position to their defaults."""
        self.matrix = MATRIX_IDENTITY
        self.linematrix = (0, 0)
|
124 |
+
|
125 |
+
|
126 |
+
Color = Union[
|
127 |
+
float, # Greyscale
|
128 |
+
Tuple[float, float, float], # R, G, B
|
129 |
+
Tuple[float, float, float, float], # C, M, Y, K
|
130 |
+
]
|
131 |
+
|
132 |
+
|
133 |
+
class PDFGraphicState:
    """Mutable graphics state: stroke parameters plus stroking and
    non-stroking colors.
    """

    def __init__(self) -> None:
        self.linewidth: float = 0
        self.linecap: Optional[object] = None
        self.linejoin: Optional[object] = None
        self.miterlimit: Optional[object] = None
        self.dash: Optional[Tuple[object, object]] = None
        self.intent: Optional[object] = None
        self.flatness: Optional[object] = None

        # stroking color
        self.scolor: Optional[Color] = None

        # non stroking color
        self.ncolor: Optional[Color] = None

    def copy(self) -> "PDFGraphicState":
        """Return a shallow duplicate carrying every state field."""
        duplicate = PDFGraphicState()
        for field in (
            "linewidth",
            "linecap",
            "linejoin",
            "miterlimit",
            "dash",
            "intent",
            "flatness",
            "scolor",
            "ncolor",
        ):
            setattr(duplicate, field, getattr(self, field))
        return duplicate

    def __repr__(self) -> str:
        return (
            "<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, "
            " miterlimit=%r, dash=%r, intent=%r, flatness=%r, "
            " stroking color=%r, non stroking color=%r>"
            % (
                self.linewidth,
                self.linecap,
                self.linejoin,
                self.miterlimit,
                self.dash,
                self.intent,
                self.flatness,
                self.scolor,
                self.ncolor,
            )
        )
|
179 |
+
|
180 |
+
|
181 |
+
class PDFResourceManager:
    """Repository of shared resources.

    ResourceManager facilitates reuse of shared resources
    such as fonts and images so that large objects are not
    allocated multiple times.
    """

    def __init__(self, caching: bool = True) -> None:
        # When caching is enabled, fonts are memoized per PDF object id.
        self.caching = caching
        self._cached_fonts: Dict[object, PDFFont] = {}

    def get_procset(self, procs: Sequence[object]) -> None:
        """Inspect a /ProcSet list; currently a no-op for every entry."""
        for proc in procs:
            if proc is LITERAL_PDF or proc is LITERAL_TEXT:
                pass
            else:
                pass

    def get_cmap(self, cmapname: str, strict: bool = False) -> CMapBase:
        """Look up a predefined CMap; fall back to an empty CMap unless strict."""
        try:
            return CMapDB.get_cmap(cmapname)
        except CMapDB.CMapNotFound:
            if strict:
                raise
            return CMap()

    def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
        """Create (or fetch from cache) a PDFFont for a font dictionary.

        Dispatches on /Subtype; Type0 fonts recurse into their first
        descendant font, forwarding /Encoding and /ToUnicode from the
        parent spec.
        """
        if objid and objid in self._cached_fonts:
            font = self._cached_fonts[objid]
        else:
            # log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
            if settings.STRICT:
                if spec["Type"] is not LITERAL_FONT:
                    raise PDFFontError("Type is not /Font")
            # Create a Font object.
            if "Subtype" in spec:
                subtype = literal_name(spec["Subtype"])
            else:
                if settings.STRICT:
                    raise PDFFontError("Font Subtype is not specified.")
                subtype = "Type1"
            if subtype in ("Type1", "MMType1"):
                # Type1 Font
                font = PDFType1Font(self, spec)
            elif subtype == "TrueType":
                # TrueType Font
                font = PDFTrueTypeFont(self, spec)
            elif subtype == "Type3":
                # Type3 Font
                font = PDFType3Font(self, spec)
            elif subtype in ("CIDFontType0", "CIDFontType2"):
                # CID Font
                font = PDFCIDFont(self, spec)
            elif subtype == "Type0":
                # Type0 Font
                dfonts = list_value(spec["DescendantFonts"])
                assert dfonts
                subspec = dict_value(dfonts[0]).copy()
                for k in ("Encoding", "ToUnicode"):
                    if k in spec:
                        subspec[k] = resolve1(spec[k])
                font = self.get_font(None, subspec)
            else:
                if settings.STRICT:
                    raise PDFFontError("Invalid Font spec: %r" % spec)
                font = PDFType1Font(self, spec)  # this is so wrong!
            if objid and self.caching:
                self._cached_fonts[objid] = font
        return font
|
251 |
+
|
252 |
+
|
253 |
+
class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
    """PostScript-style parser over a page's content streams.

    A page's content may be split across several streams; this parser
    presents them as a single continuous token source and additionally
    handles inline images (BI ... ID ... EI), whose raw binary data is
    embedded directly in the stream.
    """

    def __init__(self, streams: Sequence[object]) -> None:
        self.streams = streams
        self.istream = 0  # index of the next stream to open
        # PSStackParser.__init__(fp=None) is safe only because we've overloaded
        # all the methods that would attempt to access self.fp without first
        # calling self.fillfp().
        PSStackParser.__init__(self, None)  # type: ignore[arg-type]

    def fillfp(self) -> None:
        # Open the next content stream once the current one is exhausted.
        if not self.fp:
            if self.istream < len(self.streams):
                strm = stream_value(self.streams[self.istream])
                self.istream += 1
            else:
                raise PSEOF("Unexpected EOF, file truncated?")
            self.fp = BytesIO(strm.get_data())
            # if log.isEnabledFor(logging.DEBUG):
            #     log.debug(f'STREAM DATA {strm.get_data()}')

    def seek(self, pos: int) -> None:
        self.fillfp()
        PSStackParser.seek(self, pos)

    def fillbuf(self) -> None:
        # Refill the parse buffer, transparently crossing stream boundaries.
        if self.charpos < len(self.buf):
            return
        while 1:
            self.fillfp()
            self.bufpos = self.fp.tell()
            self.buf = self.fp.read(self.BUFSIZ)
            if self.buf:
                break
            self.fp = None  # type: ignore[assignment]
        self.charpos = 0

    def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]:
        """Scan raw bytes from ``pos`` until ``target`` followed by whitespace.

        Returns (pos, data) where ``data`` is the inline image body with
        the terminator and a trailing end-of-line stripped.
        """
        self.seek(pos)
        i = 0  # number of terminator bytes matched so far
        data = b""
        while i <= len(target):
            self.fillbuf()
            if i:
                # Byte-by-byte terminator matching.
                ci = self.buf[self.charpos]
                c = bytes((ci,))
                data += c
                self.charpos += 1
                if (
                    len(target) <= i
                    and c.isspace()
                    or i < len(target)
                    and c == (bytes((target[i],)))
                ):
                    i += 1
                else:
                    i = 0
            else:
                # Fast path: jump to the next candidate first byte.
                try:
                    j = self.buf.index(target[0], self.charpos)
                    data += self.buf[self.charpos : j + 1]
                    self.charpos = j + 1
                    i = 1
                except ValueError:
                    data += self.buf[self.charpos :]
                    self.charpos = len(self.buf)
        data = data[: -(len(target) + 1)]  # strip the last part
        data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
        return (pos, data)

    def flush(self) -> None:
        self.add_results(*self.popall())

    # Keywords delimiting inline images.
    KEYWORD_BI = KWD(b"BI")
    KEYWORD_ID = KWD(b"ID")
    KEYWORD_EI = KWD(b"EI")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handle content-stream keywords, materializing inline images."""
        if token is self.KEYWORD_BI:
            # inline image within a content stream
            self.start_type(pos, "inline")
        elif token is self.KEYWORD_ID:
            try:
                (_, objs) = self.end_type("inline")
                if len(objs) % 2 != 0:
                    error_msg = f"Invalid dictionary construct: {objs!r}"
                    raise PSTypeError(error_msg)
                d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)}
                eos = b"EI"
                filter = d.get("F", None)
                if filter is not None:
                    if isinstance(filter, PSLiteral):
                        filter = [filter]
                    # ASCII85 data ends with '~>' rather than 'EI'.
                    if filter[0] in LITERALS_ASCII85_DECODE:
                        eos = b"~>"
                (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
                if eos != b"EI":  # it may be necessary for decoding
                    data += eos
                obj = PDFStream(d, data)
                self.push((pos, obj))
                if eos == b"EI":  # otherwise it is still in the stream
                    self.push((pos, self.KEYWORD_EI))
            except PSTypeError:
                if settings.STRICT:
                    raise
        else:
            self.push((pos, token))
|
359 |
+
|
360 |
+
|
361 |
+
PDFStackT = PSStackType[PDFStream]
|
362 |
+
"""Types that may appear on the PDF argument stack."""
|
363 |
+
|
364 |
+
|
365 |
+
class PDFPageInterpreter:
|
366 |
+
"""Processor for the content of a PDF page
|
367 |
+
|
368 |
+
Reference: PDF Reference, Appendix A, Operator Summary
|
369 |
+
"""
|
370 |
+
|
371 |
+
def __init__(
|
372 |
+
self, rsrcmgr: PDFResourceManager, device: PDFDevice, obj_patch
|
373 |
+
) -> None:
|
374 |
+
self.rsrcmgr = rsrcmgr
|
375 |
+
self.device = device
|
376 |
+
self.obj_patch = obj_patch
|
377 |
+
|
378 |
+
def dup(self) -> "PDFPageInterpreter":
|
379 |
+
return self.__class__(self.rsrcmgr, self.device, self.obj_patch)
|
380 |
+
|
381 |
+
    def init_resources(self, resources: Dict[object, object]) -> None:
        """Prepare the fonts and XObjects listed in the Resource attribute."""
        self.resources = resources
        # fontmap: resource name -> PDFFont; fontid: the reverse mapping,
        # used later when emitting the patched content stream.
        self.fontmap: Dict[object, PDFFont] = {}
        self.fontid: Dict[PDFFont, object] = {}
        self.xobjmap = {}
        # Start from the predefined colorspaces; page resources may add more.
        self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
        if not resources:
            return

        def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
            # A colorspace is either a bare name or an array whose first
            # element is the family name.
            if isinstance(spec, list):
                name = literal_name(spec[0])
            else:
                name = literal_name(spec)
            if name == "ICCBased" and isinstance(spec, list) and len(spec) >= 2:
                # Component count comes from the ICC stream's /N entry.
                return PDFColorSpace(name, stream_value(spec[1])["N"])
            elif name == "DeviceN" and isinstance(spec, list) and len(spec) >= 2:
                # One component per named colorant.
                return PDFColorSpace(name, len(list_value(spec[1])))
            else:
                return PREDEFINED_COLORSPACE.get(name)

        for k, v in dict_value(resources).items():
            # log.debug("Resource: %r: %r", k, v)
            if k == "Font":
                for fontid, spec in dict_value(v).items():
                    objid = None
                    if isinstance(spec, PDFObjRef):
                        # Keep the indirect object id so the font can be
                        # cached/identified by the resource manager.
                        objid = spec.objid
                        spec = dict_value(spec)
                    self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
                    self.fontid[self.fontmap[fontid]] = fontid
            elif k == "ColorSpace":
                for csid, spec in dict_value(v).items():
                    colorspace = get_colorspace(resolve1(spec))
                    if colorspace is not None:
                        self.csmap[csid] = colorspace
            elif k == "ProcSet":
                self.rsrcmgr.get_procset(list_value(v))
            elif k == "XObject":
                for xobjid, xobjstrm in dict_value(v).items():
                    self.xobjmap[xobjid] = xobjstrm
|
423 |
+
|
424 |
+
    def init_state(self, ctm: Matrix) -> None:
        """Initialize the text and graphic states for rendering a page."""
        # gstack: stack for graphical states, pushed/popped by q/Q.
        self.gstack: List[Tuple[Matrix, PDFTextState, PDFGraphicState]] = []
        self.ctm = ctm
        self.device.set_ctm(self.ctm)
        self.textstate = PDFTextState()
        self.graphicstate = PDFGraphicState()
        self.curpath: List[PathSegment] = []
        # argstack: stack for command arguments.
        self.argstack: List[PDFStackT] = []
        # set some global states.
        self.scs: Optional[PDFColorSpace] = None
        self.ncs: Optional[PDFColorSpace] = None
        if self.csmap:
            # Default both stroke and fill colorspaces to the first entry.
            self.scs = self.ncs = next(iter(self.csmap.values()))
|
440 |
+
|
441 |
+
def push(self, obj: PDFStackT) -> None:
|
442 |
+
self.argstack.append(obj)
|
443 |
+
|
444 |
+
def pop(self, n: int) -> List[PDFStackT]:
|
445 |
+
if n == 0:
|
446 |
+
return []
|
447 |
+
x = self.argstack[-n:]
|
448 |
+
self.argstack = self.argstack[:-n]
|
449 |
+
return x
|
450 |
+
|
451 |
+
    def get_current_state(self) -> Tuple[Matrix, PDFTextState, PDFGraphicState]:
        """Snapshot (CTM, copy of text state, copy of graphic state)."""
        return (self.ctm, self.textstate.copy(), self.graphicstate.copy())

    def set_current_state(
        self,
        state: Tuple[Matrix, PDFTextState, PDFGraphicState],
    ) -> None:
        """Restore a snapshot produced by get_current_state()."""
        (self.ctm, self.textstate, self.graphicstate) = state
        self.device.set_ctm(self.ctm)

    def do_q(self) -> None:
        """Save graphics state"""
        self.gstack.append(self.get_current_state())

    def do_Q(self) -> None:
        """Restore graphics state"""
        if self.gstack:
            # Silently ignore an unbalanced Q (empty stack).
            self.set_current_state(self.gstack.pop())

    def do_cm(
        self,
        a1: PDFStackT,
        b1: PDFStackT,
        c1: PDFStackT,
        d1: PDFStackT,
        e1: PDFStackT,
        f1: PDFStackT,
    ) -> None:
        """Concatenate matrix to current transformation matrix"""
        self.ctm = mult_matrix(cast(Matrix, (a1, b1, c1, d1, e1, f1)), self.ctm)
        self.device.set_ctm(self.ctm)
|
482 |
+
|
483 |
+
    # --- Graphics-state parameter operators (simple setters) ---

    def do_w(self, linewidth: PDFStackT) -> None:
        """Set line width"""
        self.graphicstate.linewidth = cast(float, linewidth)

    def do_J(self, linecap: PDFStackT) -> None:
        """Set line cap style"""
        self.graphicstate.linecap = linecap

    def do_j(self, linejoin: PDFStackT) -> None:
        """Set line join style"""
        self.graphicstate.linejoin = linejoin

    def do_M(self, miterlimit: PDFStackT) -> None:
        """Set miter limit"""
        self.graphicstate.miterlimit = miterlimit

    def do_d(self, dash: PDFStackT, phase: PDFStackT) -> None:
        """Set line dash pattern"""
        self.graphicstate.dash = (dash, phase)

    def do_ri(self, intent: PDFStackT) -> None:
        """Set color rendering intent"""
        self.graphicstate.intent = intent

    def do_i(self, flatness: PDFStackT) -> None:
        """Set flatness tolerance"""
        self.graphicstate.flatness = flatness

    def do_gs(self, name: PDFStackT) -> None:
        """Set parameters from graphics state parameter dictionary"""
        # TODO: ExtGState dictionaries are currently ignored.
|
514 |
+
|
515 |
+
    # --- Path-construction operators: segments accumulate in self.curpath
    # until a painting operator (S/f/B/n...) consumes them. ---

    def do_m(self, x: PDFStackT, y: PDFStackT) -> None:
        """Begin new subpath"""
        self.curpath.append(("m", cast(float, x), cast(float, y)))

    def do_l(self, x: PDFStackT, y: PDFStackT) -> None:
        """Append straight line segment to path"""
        self.curpath.append(("l", cast(float, x), cast(float, y)))

    def do_c(
        self,
        x1: PDFStackT,
        y1: PDFStackT,
        x2: PDFStackT,
        y2: PDFStackT,
        x3: PDFStackT,
        y3: PDFStackT,
    ) -> None:
        """Append curved segment to path (three control points)"""
        self.curpath.append(
            (
                "c",
                cast(float, x1),
                cast(float, y1),
                cast(float, x2),
                cast(float, y2),
                cast(float, x3),
                cast(float, y3),
            ),
        )

    def do_v(self, x2: PDFStackT, y2: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
        """Append curved segment to path (initial point replicated)"""
        self.curpath.append(
            ("v", cast(float, x2), cast(float, y2), cast(float, x3), cast(float, y3)),
        )

    def do_y(self, x1: PDFStackT, y1: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
        """Append curved segment to path (final point replicated)"""
        self.curpath.append(
            ("y", cast(float, x1), cast(float, y1), cast(float, x3), cast(float, y3)),
        )

    def do_h(self) -> None:
        """Close subpath"""
        self.curpath.append(("h",))
|
560 |
+
|
561 |
+
def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None:
|
562 |
+
"""Append rectangle to path"""
|
563 |
+
x = cast(float, x)
|
564 |
+
y = cast(float, y)
|
565 |
+
w = cast(float, w)
|
566 |
+
h = cast(float, h)
|
567 |
+
self.curpath.append(("m", x, y))
|
568 |
+
self.curpath.append(("l", x + w, y))
|
569 |
+
self.curpath.append(("l", x + w, y + h))
|
570 |
+
self.curpath.append(("l", x, y + h))
|
571 |
+
self.curpath.append(("h",))
|
572 |
+
|
573 |
+
    def do_S(self) -> None:
        """Stroke path.

        Only one special case is actually painted: a standalone horizontal
        black line (used to detect rules/underlines).  Everything else is
        discarded; the device re-renders visible content later.
        """

        def is_black(color: Color) -> bool:
            # Gray colors are scalars; RGB/CMYK tuples sum to 0 only when
            # every component is 0.
            if isinstance(color, Tuple):
                return sum(color) == 0
            else:
                return color == 0

        if (
            len(self.curpath) == 2
            and self.curpath[0][0] == "m"
            and self.curpath[1][0] == "l"
            and apply_matrix_pt(self.ctm, self.curpath[0][-2:])[1]
            == apply_matrix_pt(self.ctm, self.curpath[1][-2:])[1]
            and is_black(self.graphicstate.scolor)
        ):  # standalone line: horizontal and stroked in black
            # print(apply_matrix_pt(self.ctm,self.curpath[0][-2:]),apply_matrix_pt(self.ctm,self.curpath[1][-2:]),self.graphicstate.scolor)
            self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
            self.curpath = []
            # NOTE(review): despite the "-> None" annotation this returns
            # "n", which execute() echoes into the patched stream so the
            # original stroke is suppressed — confirm this is intentional.
            return "n"
        else:
            self.curpath = []
|
596 |
+
|
597 |
+
    def do_s(self) -> None:
        """Close and stroke path"""
        # Equivalent to "h S" per the PDF operator summary.
        self.do_h()
        self.do_S()
|
601 |
+
|
602 |
+
    # --- Path-painting operators.  Actual painting is disabled (the
    # paint_path calls are commented out); the path is simply dropped. ---

    def do_f(self) -> None:
        """Fill path using nonzero winding number rule"""
        # self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
        self.curpath = []

    def do_F(self) -> None:
        """Fill path using nonzero winding number rule (obsolete)"""

    def do_f_a(self) -> None:
        """Fill path using even-odd rule"""
        # self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
        self.curpath = []

    def do_B(self) -> None:
        """Fill and stroke path using nonzero winding number rule"""
        # self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
        self.curpath = []

    def do_B_a(self) -> None:
        """Fill and stroke path using even-odd rule"""
        # self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
        self.curpath = []

    def do_b(self) -> None:
        """Close, fill, and stroke path using nonzero winding number rule"""
        self.do_h()
        self.do_B()

    def do_b_a(self) -> None:
        """Close, fill, and stroke path using even-odd rule"""
        self.do_h()
        self.do_B_a()

    def do_n(self) -> None:
        """End path without filling or stroking"""
        self.curpath = []

    def do_W(self) -> None:
        """Set clipping path using nonzero winding number rule"""
        # Clipping is not implemented.

    def do_W_a(self) -> None:
        """Set clipping path using even-odd rule"""
        # Clipping is not implemented.
|
644 |
+
|
645 |
+
    def do_CS(self, name: PDFStackT) -> None:
        """Set color space for stroking operations

        Introduced in PDF 1.1
        """
        try:
            self.scs = self.csmap[literal_name(name)]
        except KeyError:
            # Unknown colorspaces are ignored unless STRICT is set.
            if settings.STRICT:
                raise PDFInterpreterError("Undefined ColorSpace: %r" % name)

    def do_cs(self, name: PDFStackT) -> None:
        """Set color space for nonstroking operations"""
        try:
            self.ncs = self.csmap[literal_name(name)]
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError("Undefined ColorSpace: %r" % name)

    # --- Color-setting operators; each also forces the matching device
    # colorspace as required by the PDF spec. ---

    def do_G(self, gray: PDFStackT) -> None:
        """Set gray level for stroking operations"""
        self.graphicstate.scolor = cast(float, gray)
        self.scs = self.csmap["DeviceGray"]

    def do_g(self, gray: PDFStackT) -> None:
        """Set gray level for nonstroking operations"""
        self.graphicstate.ncolor = cast(float, gray)
        self.ncs = self.csmap["DeviceGray"]

    def do_RG(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
        """Set RGB color for stroking operations"""
        self.graphicstate.scolor = (cast(float, r), cast(float, g), cast(float, b))
        self.scs = self.csmap["DeviceRGB"]

    def do_rg(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
        """Set RGB color for nonstroking operations"""
        self.graphicstate.ncolor = (cast(float, r), cast(float, g), cast(float, b))
        self.ncs = self.csmap["DeviceRGB"]

    def do_K(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
        """Set CMYK color for stroking operations"""
        self.graphicstate.scolor = (
            cast(float, c),
            cast(float, m),
            cast(float, y),
            cast(float, k),
        )
        self.scs = self.csmap["DeviceCMYK"]

    def do_k(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
        """Set CMYK color for nonstroking operations"""
        self.graphicstate.ncolor = (
            cast(float, c),
            cast(float, m),
            cast(float, y),
            cast(float, k),
        )
        self.ncs = self.csmap["DeviceCMYK"]
|
703 |
+
|
704 |
+
    def do_SCN(self) -> List[PDFStackT]:
        """Set color for stroking operations.

        The number of operands depends on the current stroking colorspace.
        """
        if self.scs:
            n = self.scs.ncomponents
        else:
            if settings.STRICT:
                raise PDFInterpreterError("No colorspace specified!")
            n = 1
        args = self.pop(n)
        self.graphicstate.scolor = cast(Color, args)
        # Popped operands are returned so execute() can re-emit them
        # verbatim in the patched content stream.
        return args

    def do_scn(self) -> List[PDFStackT]:
        """Set color for nonstroking operations"""
        if self.ncs:
            n = self.ncs.ncomponents
        else:
            if settings.STRICT:
                raise PDFInterpreterError("No colorspace specified!")
            n = 1
        args = self.pop(n)
        self.graphicstate.ncolor = cast(Color, args)
        return args

    def do_SC(self) -> List[PDFStackT]:
        """Set color for stroking operations"""
        return self.do_SCN()

    def do_sc(self) -> List[PDFStackT]:
        """Set color for nonstroking operations"""
        return self.do_scn()
|
735 |
+
|
736 |
+
    def do_sh(self, name: object) -> None:
        """Paint area defined by shading pattern"""
        # Shading patterns are not implemented.

    def do_BT(self) -> None:
        """Begin text object

        Initializing the text matrix, Tm, and the text line matrix, Tlm, to
        the identity matrix. Text objects cannot be nested; a second BT cannot
        appear before an ET.
        """
        self.textstate.reset()

    def do_ET(self) -> None:
        """End a text object"""

    def do_BX(self) -> None:
        """Begin compatibility section"""

    def do_EX(self) -> None:
        """End compatibility section"""

    # --- Marked-content operators are forwarded to the device. ---

    def do_MP(self, tag: PDFStackT) -> None:
        """Define marked-content point"""
        self.device.do_tag(cast(PSLiteral, tag))

    def do_DP(self, tag: PDFStackT, props: PDFStackT) -> None:
        """Define marked-content point with property list"""
        self.device.do_tag(cast(PSLiteral, tag), props)

    def do_BMC(self, tag: PDFStackT) -> None:
        """Begin marked-content sequence"""
        self.device.begin_tag(cast(PSLiteral, tag))

    def do_BDC(self, tag: PDFStackT, props: PDFStackT) -> None:
        """Begin marked-content sequence with property list"""
        self.device.begin_tag(cast(PSLiteral, tag), props)

    def do_EMC(self) -> None:
        """End marked-content sequence"""
        self.device.end_tag()
|
776 |
+
|
777 |
+
    def do_Tc(self, space: PDFStackT) -> None:
        """Set character spacing.

        Character spacing is used by the Tj, TJ, and ' operators.

        :param space: a number expressed in unscaled text space units.
        """
        self.textstate.charspace = cast(float, space)

    def do_Tw(self, space: PDFStackT) -> None:
        """Set the word spacing.

        Word spacing is used by the Tj, TJ, and ' operators.

        :param space: a number expressed in unscaled text space units
        """
        self.textstate.wordspace = cast(float, space)

    def do_Tz(self, scale: PDFStackT) -> None:
        """Set the horizontal scaling.

        :param scale: is a number specifying the percentage of the normal width
        """
        self.textstate.scaling = cast(float, scale)

    def do_TL(self, leading: PDFStackT) -> None:
        """Set the text leading.

        Text leading is used only by the T*, ', and " operators.

        :param leading: a number expressed in unscaled text space units
        """
        # Stored negated: leading moves the cursor downwards in text space.
        self.textstate.leading = -cast(float, leading)
|
810 |
+
|
811 |
+
    def do_Tf(self, fontid: PDFStackT, fontsize: PDFStackT) -> None:
        """Set the text font

        :param fontid: the name of a font resource in the Font subdictionary
            of the current resource dictionary
        :param fontsize: size is a number representing a scale factor.
        """
        try:
            self.textstate.font = self.fontmap[literal_name(fontid)]
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError("Undefined Font id: %r" % fontid)
            # Fall back to an anonymous default font when not strict.
            self.textstate.font = self.rsrcmgr.get_font(None, {})
        self.textstate.fontsize = cast(float, fontsize)

    def do_Tr(self, render: PDFStackT) -> None:
        """Set the text rendering mode"""
        self.textstate.render = cast(int, render)

    def do_Ts(self, rise: PDFStackT) -> None:
        """Set the text rise

        :param rise: a number expressed in unscaled text space units
        """
        self.textstate.rise = cast(float, rise)
|
836 |
+
|
837 |
+
    def do_Td(self, tx: PDFStackT, ty: PDFStackT) -> None:
        """Move to the start of the next line

        Offset from the start of the current line by (tx , ty).
        """
        tx_ = safe_float(tx)
        ty_ = safe_float(ty)
        if tx_ is not None and ty_ is not None:
            # Translate the text line matrix by (tx, ty) in text space.
            (a, b, c, d, e, f) = self.textstate.matrix
            e_new = tx_ * a + ty_ * c + e
            f_new = tx_ * b + ty_ * d + f
            self.textstate.matrix = (a, b, c, d, e_new, f_new)

        elif settings.STRICT:
            raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for Td")

        self.textstate.linematrix = (0, 0)
|
854 |
+
|
855 |
+
def do_TD(self, tx: PDFStackT, ty: PDFStackT) -> None:
|
856 |
+
"""Move to the start of the next line.
|
857 |
+
|
858 |
+
offset from the start of the current line by (tx , ty). As a side effect, this
|
859 |
+
operator sets the leading parameter in the text state.
|
860 |
+
"""
|
861 |
+
tx_ = safe_float(tx)
|
862 |
+
ty_ = safe_float(ty)
|
863 |
+
|
864 |
+
if tx_ is not None and ty_ is not None:
|
865 |
+
(a, b, c, d, e, f) = self.textstate.matrix
|
866 |
+
e_new = tx_ * a + ty_ * c + e
|
867 |
+
f_new = tx_ * b + ty_ * d + f
|
868 |
+
self.textstate.matrix = (a, b, c, d, e_new, f_new)
|
869 |
+
|
870 |
+
elif settings.STRICT:
|
871 |
+
raise PDFValueError("Invalid offset ({tx}, {ty}) for TD")
|
872 |
+
|
873 |
+
if ty_ is not None:
|
874 |
+
self.textstate.leading = ty_
|
875 |
+
|
876 |
+
self.textstate.linematrix = (0, 0)
|
877 |
+
|
878 |
+
    def do_Tm(
        self,
        a: PDFStackT,
        b: PDFStackT,
        c: PDFStackT,
        d: PDFStackT,
        e: PDFStackT,
        f: PDFStackT,
    ) -> None:
        """Set text matrix and text line matrix"""
        self.textstate.matrix = cast(Matrix, (a, b, c, d, e, f))
        self.textstate.linematrix = (0, 0)

    def do_T_a(self) -> None:
        """Move to start of next text line"""
        # Advance by the current leading along the line direction (T*).
        (a, b, c, d, e, f) = self.textstate.matrix
        self.textstate.matrix = (
            a,
            b,
            c,
            d,
            self.textstate.leading * c + e,
            self.textstate.leading * d + f,
        )
        self.textstate.linematrix = (0, 0)
|
903 |
+
|
904 |
+
    def do_TJ(self, seq: PDFStackT) -> None:
        """Show text, allowing individual glyph positioning"""
        if self.textstate.font is None:
            # Without a font nothing can be rendered; raise only when strict.
            if settings.STRICT:
                raise PDFInterpreterError("No font specified!")
            return
        assert self.ncs is not None
        # Text is filled with the nonstroking colorspace/state.
        self.device.render_string(
            self.textstate,
            cast(PDFTextSeq, seq),
            self.ncs,
            self.graphicstate.copy(),
        )
|
917 |
+
|
918 |
+
    def do_Tj(self, s: PDFStackT) -> None:
        """Show text"""
        self.do_TJ([s])

    def do__q(self, s: PDFStackT) -> None:
        """Move to next line and show text

        The ' (single quote) operator.
        """
        self.do_T_a()
        self.do_TJ([s])

    def do__w(self, aw: PDFStackT, ac: PDFStackT, s: PDFStackT) -> None:
        """Set word and character spacing, move to next line, and show text

        The " (double quote) operator.
        """
        self.do_Tw(aw)
        self.do_Tc(ac)
        self.do_TJ([s])

    def do_BI(self) -> None:
        """Begin inline image object"""
        # The BI...ID...EI sequence is assembled by the content parser.

    def do_ID(self) -> None:
        """Begin inline image data"""

    def do_EI(self, obj: PDFStackT) -> None:
        """End inline image object"""
        # Only render streams that declare a width and height.
        if isinstance(obj, PDFStream) and "W" in obj and "H" in obj:
            iobjid = str(id(obj))
            self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
            self.device.render_image(iobjid, obj)
            self.device.end_figure(iobjid)
|
952 |
+
|
953 |
+
def do_Do(self, xobjid_arg: PDFStackT) -> None:
|
954 |
+
"""Invoke named XObject"""
|
955 |
+
xobjid = literal_name(xobjid_arg)
|
956 |
+
try:
|
957 |
+
xobj = stream_value(self.xobjmap[xobjid])
|
958 |
+
except KeyError:
|
959 |
+
if settings.STRICT:
|
960 |
+
raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
|
961 |
+
return
|
962 |
+
# log.debug("Processing xobj: %r", xobj)
|
963 |
+
subtype = xobj.get("Subtype")
|
964 |
+
if subtype is LITERAL_FORM and "BBox" in xobj:
|
965 |
+
interpreter = self.dup()
|
966 |
+
bbox = cast(Rect, list_value(xobj["BBox"]))
|
967 |
+
matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
|
968 |
+
# According to PDF reference 1.7 section 4.9.1, XObjects in
|
969 |
+
# earlier PDFs (prior to v1.2) use the page's Resources entry
|
970 |
+
# instead of having their own Resources entry.
|
971 |
+
xobjres = xobj.get("Resources")
|
972 |
+
if xobjres:
|
973 |
+
resources = dict_value(xobjres)
|
974 |
+
else:
|
975 |
+
resources = self.resources.copy()
|
976 |
+
self.device.begin_figure(xobjid, bbox, matrix)
|
977 |
+
ctm = mult_matrix(matrix, self.ctm)
|
978 |
+
ops_base = interpreter.render_contents(
|
979 |
+
resources,
|
980 |
+
[xobj],
|
981 |
+
ctm=ctm,
|
982 |
+
)
|
983 |
+
try: # 有的时候 form 字体加不上这里会烂掉
|
984 |
+
self.device.fontid = interpreter.fontid
|
985 |
+
self.device.fontmap = interpreter.fontmap
|
986 |
+
ops_new = self.device.end_figure(xobjid)
|
987 |
+
ctm_inv = np.linalg.inv(np.array(ctm[:4]).reshape(2, 2))
|
988 |
+
pos_inv = -np.mat(ctm[4:]) * ctm_inv
|
989 |
+
a, b, c, d = ctm_inv.reshape(4).tolist()
|
990 |
+
e, f = pos_inv.tolist()[0]
|
991 |
+
self.obj_patch[self.xobjmap[xobjid].objid] = (
|
992 |
+
f"q {ops_base}Q {a} {b} {c} {d} {e} {f} cm {ops_new}"
|
993 |
+
)
|
994 |
+
except Exception:
|
995 |
+
pass
|
996 |
+
elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
|
997 |
+
self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
|
998 |
+
self.device.render_image(xobjid, xobj)
|
999 |
+
self.device.end_figure(xobjid)
|
1000 |
+
else:
|
1001 |
+
# unsupported xobject type.
|
1002 |
+
pass
|
1003 |
+
|
1004 |
+
    def process_page(self, page: PDFPage) -> None:
        """Render one page and record its replacement content stream.

        Builds a CTM mapping the (possibly rotated) cropbox to the origin,
        renders through the device, and stores the patched stream in
        ``self.obj_patch`` keyed by the page's xref.
        """
        # log.debug("Processing page: %r", page)
        # print(page.mediabox,page.cropbox)
        # (x0, y0, x1, y1) = page.mediabox
        (x0, y0, x1, y1) = page.cropbox
        if page.rotate == 90:
            ctm = (0, -1, 1, 0, -y0, x1)
        elif page.rotate == 180:
            ctm = (-1, 0, 0, -1, x1, y1)
        elif page.rotate == 270:
            ctm = (0, 1, -1, 0, y1, -x0)
        else:
            ctm = (1, 0, 0, 1, -x0, -y0)
        self.device.begin_page(page, ctm)
        ops_base = self.render_contents(page.resources, page.contents, ctm=ctm)
        self.device.fontid = self.fontid
        self.device.fontmap = self.fontmap
        ops_new = self.device.end_page(page)
        # Rendering above subtracted the cropbox offset to get real
        # coordinates; add the page offset back here with a "cm" on output.
        self.obj_patch[page.page_xref] = (
            f"q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}"  # ops_base may contain images; ops_new text must be drawn on top, so q/Q resets the CTM first
        )
        for obj in page.contents:
            # Original content streams are emptied; the patch replaces them.
            self.obj_patch[obj.objid] = ""
|
1028 |
+
|
1029 |
+
    def render_contents(
        self,
        resources: Dict[object, object],
        streams: Sequence[object],
        ctm: Matrix = MATRIX_IDENTITY,
    ) -> Optional[str]:
        """Render the content streams.

        This method may be called recursively.

        Returns the filtered operator string produced by execute(), or
        None for an empty page.
        """
        # log.debug(
        #     "render_contents: resources=%r, streams=%r, ctm=%r",
        #     resources,
        #     streams,
        #     ctm,
        # )
        self.init_resources(resources)
        self.init_state(ctm)
        return self.execute(list_value(streams))
|
1048 |
+
|
1049 |
+
    def execute(self, streams: Sequence[object]) -> Optional[str]:
        """Run every operator in the content streams.

        Dispatches each keyword to the matching ``do_*`` method and
        accumulates a filtered copy of the stream (text and marked-content
        operators removed) that is later used as the patched content.
        """
        ops = ""
        try:
            parser = PDFContentParser(streams)
        except PSEOF:
            # empty page
            return
        while True:
            try:
                _, (_, obj) = parser.nextobject()
            except PSEOF:
                break
            if isinstance(obj, PSKeyword):
                name = keyword_name(obj)
                # Map operator characters that are not valid in identifiers:
                # "*" -> "_a", '"' -> "_w", "'" -> "_q".
                method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace(
                    "'",
                    "_q",
                )
                if hasattr(self, method):
                    func = getattr(self, method)
                    nargs = func.__code__.co_argcount - 1
                    if nargs:
                        args = self.pop(nargs)
                        # log.debug("exec: %s %r", name, args)
                        if len(args) == nargs:
                            func(*args)
                        if not (
                            name[0] == "T"
                            or name in ['"', "'", "EI", "MP", "DP", "BMC", "BDC"]
                        ):  # skip T-family text operators; EI is skipped too because its argument is an obj (only used when a few documents draw rules); skip marked-content operators
                            p = " ".join(
                                [
                                    (
                                        f"{x:f}"
                                        if isinstance(x, float)
                                        else str(x).replace("'", "")
                                    )
                                    for x in args
                                ]
                            )
                            ops += f"{p} {name} "
                    else:
                        # log.debug("exec: %s", name)
                        targs = func()
                        if targs is None:
                            targs = []
                        if not (name[0] == "T" or name in ["BI", "ID", "EMC"]):
                            p = " ".join(
                                [
                                    (
                                        f"{x:f}"
                                        if isinstance(x, float)
                                        else str(x).replace("'", "")
                                    )
                                    for x in targs
                                ]
                            )
                            ops += f"{p} {name} "
                elif settings.STRICT:
                    error_msg = "Unknown operator: %r" % name
                    raise PDFInterpreterError(error_msg)
            else:
                # Operands are pushed until a keyword consumes them.
                self.push(obj)
        # print('REV DATA',ops)
        return ops
|
pdf2zh/pdfpage.py
ADDED
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import itertools
|
2 |
+
import logging
|
3 |
+
from typing import Any, BinaryIO, Container, Dict, Iterator, List, Optional, Set, Tuple
|
4 |
+
|
5 |
+
from pdf2zh import settings
|
6 |
+
from pdf2zh.pdfdocument import (
|
7 |
+
PDFDocument,
|
8 |
+
PDFNoPageLabels,
|
9 |
+
PDFTextExtractionNotAllowed,
|
10 |
+
)
|
11 |
+
from pdf2zh.pdfexceptions import PDFObjectNotFound, PDFValueError
|
12 |
+
from pdf2zh.pdfparser import PDFParser
|
13 |
+
from pdf2zh.pdftypes import dict_value, int_value, list_value, resolve1
|
14 |
+
from pdf2zh.psparser import LIT
|
15 |
+
from pdf2zh.utils import parse_rect
|
16 |
+
|
17 |
+
log = logging.getLogger(__name__)

# some predefined literals and keywords.
# /Type values distinguishing leaf Page objects from Pages tree nodes.
LITERAL_PAGE = LIT("Page")
LITERAL_PAGES = LIT("Pages")
|
22 |
+
|
23 |
+
|
24 |
+
class PDFPage:
|
25 |
+
"""An object that holds the information about a page.
|
26 |
+
|
27 |
+
A PDFPage object is merely a convenience class that has a set
|
28 |
+
of keys and values, which describe the properties of a page
|
29 |
+
and point to its contents.
|
30 |
+
|
31 |
+
Attributes
|
32 |
+
----------
|
33 |
+
doc: a PDFDocument object.
|
34 |
+
pageid: any Python object that can uniquely identify the page.
|
35 |
+
attrs: a dictionary of page attributes.
|
36 |
+
contents: a list of PDFStream objects that represents the page content.
|
37 |
+
lastmod: the last modified time of the page.
|
38 |
+
resources: a dictionary of resources used by the page.
|
39 |
+
mediabox: the physical size of the page.
|
40 |
+
cropbox: the crop rectangle of the page.
|
41 |
+
rotate: the page rotation (in degree).
|
42 |
+
annots: the page annotations.
|
43 |
+
beads: a chain that represents natural reading order.
|
44 |
+
label: the page's label (typically, the logical page number).
|
45 |
+
|
46 |
+
"""
|
47 |
+
|
48 |
+
    def __init__(
        self,
        doc: PDFDocument,
        pageid: object,
        attrs: object,
        label: Optional[str],
    ) -> None:
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        label: page label string.
        """
        self.doc = doc
        self.pageid = pageid
        # pageno is a placeholder here; callers assign the real index later.
        self.pageno = 0
        self.attrs = dict_value(attrs)
        self.label = label
        self.lastmod = resolve1(self.attrs.get("LastModified"))
        self.resources: Dict[object, object] = resolve1(
            self.attrs.get("Resources", dict()),
        )
        # NOTE(review): a missing /MediaBox raises KeyError here — confirm
        # whether a default page size should be substituted instead.
        mediabox_params: List[Any] = [
            resolve1(mediabox_param) for mediabox_param in self.attrs["MediaBox"]
        ]
        self.mediabox = parse_rect(resolve1(mediabox_params))
        # CropBox defaults to MediaBox; a malformed CropBox is ignored.
        self.cropbox = self.mediabox
        if "CropBox" in self.attrs:
            try:
                self.cropbox = parse_rect(resolve1(self.attrs["CropBox"]))
            except PDFValueError:
                pass

        # Normalize rotation into [0, 360).
        self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
        self.annots = self.attrs.get("Annots")
        self.beads = self.attrs.get("B")
        if "Contents" in self.attrs:
            contents = resolve1(self.attrs["Contents"])
        else:
            contents = []
        # /Contents may be a single stream; normalize to a list.
        if not isinstance(contents, list):
            contents = [contents]
        self.contents: List[object] = contents
|
92 |
+
|
93 |
+
    def __repr__(self) -> str:
        return f"<PDFPage: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"

    # Attributes a Page inherits from ancestor Pages nodes when absent
    # (PDF 32000-1:2008, section 7.7.3.4).
    INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}
|
97 |
+
|
98 |
+
@classmethod
def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
    """Yield a PDFPage for every page object found in *document*.

    Walks the /Pages tree depth-first, propagating inheritable attributes
    (see INHERITABLE_ATTRS) from parents to children. If no /Pages tree is
    present or it yields nothing, falls back to scanning every xref entry
    for objects of /Type /Page.
    """

    def depth_first_search(
        obj: Any,
        parent: Dict[str, Any],
        visited: Optional[Set[Any]] = None,
    ) -> Iterator[Tuple[int, Dict[Any, Dict[Any, Any]]]]:
        if isinstance(obj, int):
            object_id = obj
            object_properties = dict_value(document.getobj(object_id)).copy()
        else:
            # This looks broken. obj.objid means obj could be either
            # PDFObjRef or PDFStream, but neither is valid for dict_value.
            object_id = obj.objid  # type: ignore[attr-defined]
            object_properties = dict_value(obj).copy()

        # Avoid recursion errors by keeping track of visited nodes
        if visited is None:
            visited = set()
        if object_id in visited:
            return
        visited.add(object_id)

        # Copy down inheritable attributes the child does not override.
        for k, v in parent.items():
            if k in cls.INHERITABLE_ATTRS and k not in object_properties:
                object_properties[k] = v

        object_type = object_properties.get("Type")
        if object_type is None and not settings.STRICT:  # See #64
            # Some broken producers write a lowercase "type" key.
            object_type = object_properties.get("type")

        if object_type is LITERAL_PAGES and "Kids" in object_properties:
            # log.debug("Pages: Kids=%r", object_properties["Kids"])
            for child in list_value(object_properties["Kids"]):
                yield from depth_first_search(child, object_properties, visited)

        elif object_type is LITERAL_PAGE:
            # log.debug("Page: %r", object_properties)
            yield (object_id, object_properties)

    try:
        page_labels: Iterator[Optional[str]] = document.get_page_labels()
    except PDFNoPageLabels:
        # No /PageLabels tree: every page gets a None label.
        page_labels = itertools.repeat(None)

    pages = False
    if "Pages" in document.catalog:
        objects = depth_first_search(document.catalog["Pages"], document.catalog)
        for objid, tree in objects:
            yield cls(document, objid, tree, next(page_labels))
            pages = True
    if not pages:
        # fallback when /Pages is missing.
        for xref in document.xrefs:
            for objid in xref.get_objids():
                try:
                    obj = document.getobj(objid)
                    if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
                        yield cls(document, objid, obj, next(page_labels))
                except PDFObjectNotFound:
                    pass
159 |
+
|
160 |
+
@classmethod
def get_pages(
    cls,
    fp: BinaryIO,
    pagenos: Optional[Container[int]] = None,
    maxpages: int = 0,
    password: str = "",
    caching: bool = True,
    check_extractable: bool = False,
) -> Iterator["PDFPage"]:
    """Open *fp* as a PDF document and yield its pages.

    fp: binary file object positioned at the start of the PDF data.
    pagenos: if given, only pages whose 0-based index is in this container
        are yielded (all pages are still enumerated).
    maxpages: stop after this many pages have been enumerated (0 = all).
    password: password used to decrypt the document, if encrypted.
    caching: forwarded to PDFDocument to enable object caching.
    check_extractable: when True, raise PDFTextExtractionNotAllowed if the
        document forbids text extraction; otherwise only log a warning.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser, password=password, caching=caching)
    # Check if the document allows text extraction.
    # If not, warn the user and proceed.
    if not doc.is_extractable:
        if check_extractable:
            error_msg = "Text extraction is not allowed: %r" % fp
            raise PDFTextExtractionNotAllowed(error_msg)
        else:
            warning_msg = (
                "The PDF %r contains a metadata field "
                "indicating that it should not allow "
                "text extraction. Ignoring this field "
                "and proceeding. Use the check_extractable "
                "if you want to raise an error in this case" % fp
            )
            log.warning(warning_msg)
    # Process each page contained in the document.
    for pageno, page in enumerate(cls.create_pages(doc)):
        # Record the sequential index before any filtering is applied.
        page.pageno = pageno
        if pagenos and (pageno not in pagenos):
            continue
        yield page
        if maxpages and maxpages <= pageno + 1:
            break
|
pdf2zh/pdfparser.py
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from io import BytesIO
|
3 |
+
from typing import TYPE_CHECKING, BinaryIO, Optional, Union
|
4 |
+
|
5 |
+
from pdf2zh import settings
|
6 |
+
from pdf2zh.casting import safe_int
|
7 |
+
from pdf2zh.pdfexceptions import PDFException
|
8 |
+
from pdf2zh.pdftypes import PDFObjRef, PDFStream, dict_value, int_value
|
9 |
+
from pdf2zh.psexceptions import PSEOF
|
10 |
+
from pdf2zh.psparser import KWD, PSKeyword, PSStackParser
|
11 |
+
|
12 |
+
if TYPE_CHECKING:
|
13 |
+
from pdf2zh.pdfdocument import PDFDocument
|
14 |
+
|
15 |
+
log = logging.getLogger(__name__)
|
16 |
+
|
17 |
+
|
18 |
+
class PDFSyntaxError(PDFException):
    """Raised when the input violates PDF file-structure syntax rules."""

    pass
|
20 |
+
|
21 |
+
|
22 |
+
# PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None
class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
    """Fetches PDF objects from a file stream.

    Indirect references are resolved through the PDFDocument attached via
    set_document(). The parser is also used to read the XRef tables at the
    end of every PDF file.

    Typical usage:
        parser = PDFParser(fp)
        parser.read_xref()
        parser.read_xref(fallback=True)  # optional
        parser.set_document(doc)
        parser.seek(offset)
        parser.nextobject()

    """

    def __init__(self, fp: BinaryIO) -> None:
        PSStackParser.__init__(self, fp)
        # Document used to resolve indirect references; set by set_document().
        self.doc: Optional[PDFDocument] = None
        # When True, stream lengths are recovered by scanning for "endstream"
        # instead of trusting the /Length entry.
        self.fallback = False

    def set_document(self, doc: "PDFDocument") -> None:
        """Associates the parser with a PDFDocument object."""
        self.doc = doc

    # Pre-interned keywords recognized by do_keyword().
    KEYWORD_R = KWD(b"R")
    KEYWORD_NULL = KWD(b"null")
    KEYWORD_ENDOBJ = KWD(b"endobj")
    KEYWORD_STREAM = KWD(b"stream")
    KEYWORD_XREF = KWD(b"xref")
    KEYWORD_STARTXREF = KWD(b"startxref")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handles PDF-related keywords."""
        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            self.add_results(*self.pop(1))

        elif token is self.KEYWORD_ENDOBJ:
            # "objid genno obj <value> endobj": flush all four stack entries.
            self.add_results(*self.pop(4))

        elif token is self.KEYWORD_NULL:
            # null object
            self.push((pos, None))

        elif token is self.KEYWORD_R:
            # reference to indirect object: "objid genno R"
            if len(self.curstack) >= 2:
                (_, _object_id), _ = self.pop(2)
                object_id = safe_int(_object_id)
                if object_id is not None:
                    obj = PDFObjRef(self.doc, object_id)
                    self.push((pos, obj))

        elif token is self.KEYWORD_STREAM:
            # stream object; its dictionary is the top stack entry.
            ((_, dic),) = self.pop(1)
            dic = dict_value(dic)
            objlen = 0
            if not self.fallback:
                try:
                    objlen = int_value(dic["Length"])
                except KeyError:
                    if settings.STRICT:
                        raise PDFSyntaxError("/Length is undefined: %r" % dic)
            self.seek(pos)
            try:
                (_, line) = self.nextline()  # 'stream'
            except PSEOF:
                if settings.STRICT:
                    raise PDFSyntaxError("Unexpected EOF")
                return
            pos += len(line)
            self.fp.seek(pos)
            # Read /Length raw bytes directly from the underlying file.
            data = bytearray(self.fp.read(objlen))
            self.seek(pos + objlen)
            # Scan forward until "endstream"; in fallback mode the scanned
            # bytes are appended to data, since /Length was not trusted.
            while 1:
                try:
                    (linepos, line) = self.nextline()
                except PSEOF:
                    if settings.STRICT:
                        raise PDFSyntaxError("Unexpected EOF")
                    break
                if b"endstream" in line:
                    i = line.index(b"endstream")
                    objlen += i
                    if self.fallback:
                        data += line[:i]
                    break
                objlen += len(line)
                if self.fallback:
                    data += line
            self.seek(pos + objlen)
            # XXX limit objlen not to exceed object boundary
            # log.debug(
            #     "Stream: pos=%d, objlen=%d, dic=%r, data=%r...",
            #     pos,
            #     objlen,
            #     dic,
            #     data[:10],
            # )
            assert self.doc is not None
            stream = PDFStream(dic, bytes(data), self.doc.decipher)
            self.push((pos, stream))

        else:
            # others
            self.push((pos, token))
|
130 |
+
|
131 |
+
|
132 |
+
class PDFStreamParser(PDFParser):
    """Parses PDF content streams — the per-page rendering instructions.

    A reference to a PDF document is still needed because a content stream
    can contain indirect references to other objects in the same document.
    """

    def __init__(self, data: bytes) -> None:
        PDFParser.__init__(self, BytesIO(data))

    def flush(self) -> None:
        # Content streams have no object framing: emit everything parsed.
        self.add_results(*self.popall())

    KEYWORD_OBJ = KWD(b"obj")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handle the subset of keywords that are valid inside a content stream."""
        if token is self.KEYWORD_R:
            # reference to indirect object
            (_, _object_id), _ = self.pop(2)
            object_id = safe_int(_object_id)
            if object_id is not None:
                obj = PDFObjRef(self.doc, object_id)
                self.push((pos, obj))
            return

        elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
            if settings.STRICT:
                # See PDF Spec 3.4.6: Only the object values are stored in the
                # stream; the obj and endobj keywords are not used.
                raise PDFSyntaxError("Keyword endobj found in stream")
            return

        # others
        self.push((pos, token))
|
pdf2zh/pdftypes.py
ADDED
@@ -0,0 +1,397 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io
|
2 |
+
import logging
|
3 |
+
import zlib
|
4 |
+
from typing import (
|
5 |
+
TYPE_CHECKING,
|
6 |
+
Any,
|
7 |
+
Dict,
|
8 |
+
Iterable,
|
9 |
+
List,
|
10 |
+
Optional,
|
11 |
+
Protocol,
|
12 |
+
Tuple,
|
13 |
+
Union,
|
14 |
+
cast,
|
15 |
+
)
|
16 |
+
from warnings import warn
|
17 |
+
|
18 |
+
from pdf2zh import pdfexceptions, settings
|
19 |
+
from pdf2zh.ascii85 import ascii85decode, asciihexdecode
|
20 |
+
from pdf2zh.ccitt import ccittfaxdecode
|
21 |
+
from pdf2zh.lzw import lzwdecode
|
22 |
+
from pdf2zh.psparser import LIT, PSObject
|
23 |
+
from pdf2zh.runlength import rldecode
|
24 |
+
from pdf2zh.utils import apply_png_predictor
|
25 |
+
|
26 |
+
if TYPE_CHECKING:
|
27 |
+
from pdf2zh.pdfdocument import PDFDocument
|
28 |
+
|
29 |
+
logger = logging.getLogger(__name__)
|
30 |
+
|
31 |
+
LITERAL_CRYPT = LIT("Crypt")

# Stream /Filter names together with their short forms; the abbreviations
# come from PDF 4.8.6. "Inline Images".
LITERALS_FLATE_DECODE = (LIT("FlateDecode"), LIT("Fl"))
LITERALS_LZW_DECODE = (LIT("LZWDecode"), LIT("LZW"))
LITERALS_ASCII85_DECODE = (LIT("ASCII85Decode"), LIT("A85"))
LITERALS_ASCIIHEX_DECODE = (LIT("ASCIIHexDecode"), LIT("AHx"))
LITERALS_RUNLENGTH_DECODE = (LIT("RunLengthDecode"), LIT("RL"))
LITERALS_CCITTFAX_DECODE = (LIT("CCITTFaxDecode"), LIT("CCF"))
LITERALS_DCT_DECODE = (LIT("DCTDecode"), LIT("DCT"))
LITERALS_JBIG2_DECODE = (LIT("JBIG2Decode"),)
LITERALS_JPX_DECODE = (LIT("JPXDecode"),)
|
44 |
+
|
45 |
+
class DecipherCallable(Protocol):
    """Structural type of a decryption callback.

    Implementations receive the object id / generation pair that owns the
    encrypted bytes, the bytes themselves, and optionally the stream's
    attribute dictionary, and return the decrypted bytes.
    """

    def __call__(
        self,
        objid: int,
        genno: int,
        data: bytes,
        attrs: Optional[Dict[str, Any]] = None,
    ) -> bytes:
        raise NotImplementedError
|
56 |
+
|
57 |
+
|
58 |
+
class PDFObject(PSObject):
    """Marker base class for all PDF-specific object types."""

    pass
|
60 |
+
|
61 |
+
|
62 |
+
# Aliases re-exported for backwards compatibility: older code imported these
# exception classes from this module instead of pdf2zh.pdfexceptions.
PDFException = pdfexceptions.PDFException
PDFTypeError = pdfexceptions.PDFTypeError
PDFValueError = pdfexceptions.PDFValueError
PDFObjectNotFound = pdfexceptions.PDFObjectNotFound
PDFNotImplementedError = pdfexceptions.PDFNotImplementedError

# Sentinel used by PDFObjRef to detect whether its deprecated third
# argument was explicitly supplied (None would be a legitimate value).
_DEFAULT = object()
|
70 |
+
|
71 |
+
|
72 |
+
class PDFObjRef(PDFObject):
    """An indirect reference ("R" object) to a numbered object in a document."""

    def __init__(
        self,
        doc: Optional["PDFDocument"],
        objid: int,
        _: Any = _DEFAULT,
    ) -> None:
        """Reference to a PDF object.

        :param doc: The PDF document.
        :param objid: The object number.
        :param _: Unused argument for backwards compatibility.
        """
        if _ is not _DEFAULT:
            warn(
                "The third argument of PDFObjRef is unused and will be removed after "
                "2024",
                DeprecationWarning,
            )

        # Object number 0 is reserved; only rejected in strict mode.
        if objid == 0:
            if settings.STRICT:
                raise PDFValueError("PDF object id cannot be 0.")

        self.doc = doc
        self.objid = objid

    def __repr__(self) -> str:
        return "<PDFObjRef:%d>" % (self.objid)

    def resolve(self, default: object = None) -> Any:
        """Fetch the referenced object from the document.

        Returns *default* if the object cannot be found.
        """
        assert self.doc is not None
        try:
            return self.doc.getobj(self.objid)
        except PDFObjectNotFound:
            return default
|
108 |
+
|
109 |
+
|
110 |
+
def resolve1(x: object, default: object = None) -> Any:
    """Follow a chain of indirect references until a direct object remains.

    Containers (arrays / dictionaries) are returned as-is, so they may still
    hold unresolved indirect references inside.
    """
    current = x
    while isinstance(current, PDFObjRef):
        current = current.resolve(default=default)
    return current
|
119 |
+
|
120 |
+
|
121 |
+
def resolve_all(x: object, default: object = None) -> Any:
    """Recursively resolve *x* and every indirect reference nested inside it.

    Guarantees that no PDFObjRef remains anywhere in the returned
    structure. Dictionaries are updated in place; this can be slow on
    deeply nested objects.
    """
    obj = x
    while isinstance(obj, PDFObjRef):
        obj = obj.resolve(default=default)
    if isinstance(obj, list):
        return [resolve_all(item, default=default) for item in obj]
    if isinstance(obj, dict):
        for key in obj:
            obj[key] = resolve_all(obj[key], default=default)
    return obj
|
135 |
+
|
136 |
+
|
137 |
+
def decipher_all(decipher: "DecipherCallable", objid: int, genno: int, x: object) -> Any:
    """Recursively decipher every byte string contained in *x*.

    Lists are rebuilt, dictionaries are updated in place; empty byte
    strings and non-container scalars pass through untouched.
    """
    if isinstance(x, bytes):
        return decipher(objid, genno, x) if x else x
    if isinstance(x, list):
        return [decipher_all(decipher, objid, genno, item) for item in x]
    if isinstance(x, dict):
        for key in x:
            x[key] = decipher_all(decipher, objid, genno, x[key])
    return x
|
149 |
+
|
150 |
+
|
151 |
+
def int_value(x: object) -> int:
    """Resolve *x* and coerce it to ``int`` (0 when it is not an integer).

    In strict mode a non-integer raises PDFTypeError instead.
    """
    value = resolve1(x)
    if isinstance(value, int):
        return value
    if settings.STRICT:
        raise PDFTypeError("Integer required: %r" % value)
    return 0
|
158 |
+
|
159 |
+
|
160 |
+
def float_value(x: object) -> float:
    """Resolve *x* and coerce it to ``float`` (0.0 when it is not a float).

    Note that ints are NOT accepted here; in strict mode a non-float
    raises PDFTypeError instead.
    """
    value = resolve1(x)
    if isinstance(value, float):
        return value
    if settings.STRICT:
        raise PDFTypeError("Float required: %r" % value)
    return 0.0
|
167 |
+
|
168 |
+
|
169 |
+
def num_value(x: object) -> float:
    """Resolve *x* and coerce it to a number (0 when it is neither int nor float).

    In strict mode a non-number raises PDFTypeError instead.
    """
    value = resolve1(x)
    if isinstance(value, (int, float)):
        return value
    if settings.STRICT:
        raise PDFTypeError("Int or Float required: %r" % value)
    return 0
|
176 |
+
|
177 |
+
|
178 |
+
def uint_value(x: object, n_bits: int) -> int:
    """Resolve *x* and reinterpret it as a two's-complement unsigned integer
    of *n_bits* bits (non-positive values wrap around).
    """
    value = int_value(x)
    return value if value > 0 else value + cast(int, 2**n_bits)
|
185 |
+
|
186 |
+
|
187 |
+
def str_value(x: object) -> bytes:
    """Resolve *x* and coerce it to ``bytes`` (b"" when it is not a byte string).

    In strict mode a non-bytes value raises PDFTypeError instead.
    """
    value = resolve1(x)
    if isinstance(value, bytes):
        return value
    if settings.STRICT:
        raise PDFTypeError("String required: %r" % value)
    return b""
|
194 |
+
|
195 |
+
|
196 |
+
def list_value(x: object) -> Union[List[Any], Tuple[Any, ...]]:
    """Resolve *x* and coerce it to a list/tuple ([] when it is neither).

    In strict mode a non-sequence raises PDFTypeError instead.
    """
    value = resolve1(x)
    if isinstance(value, (list, tuple)):
        return value
    if settings.STRICT:
        raise PDFTypeError("List required: %r" % value)
    return []
|
203 |
+
|
204 |
+
|
205 |
+
def dict_value(x: object) -> Dict[Any, Any]:
    """Resolve *x* and coerce it to a dict ({} when it is not one).

    In strict mode the failure is logged and PDFTypeError is raised.
    """
    value = resolve1(x)
    if isinstance(value, dict):
        return value
    if settings.STRICT:
        logger.error("PDFTypeError : Dict required: %r", value)
        raise PDFTypeError("Dict required: %r" % value)
    return {}
|
213 |
+
|
214 |
+
|
215 |
+
def stream_value(x: object) -> "PDFStream":
    """Resolve *x* and coerce it to a PDFStream (an empty stream when it is not).

    In strict mode a non-stream raises PDFTypeError instead.
    """
    value = resolve1(x)
    if isinstance(value, PDFStream):
        return value
    if settings.STRICT:
        raise PDFTypeError("PDFStream required: %r" % value)
    return PDFStream({}, b"")
|
222 |
+
|
223 |
+
|
224 |
+
def decompress_corrupted(data: bytes) -> bytes:
    """Best-effort zlib decompression of data with a broken CRC checksum.

    Feeds the stream to the decompressor one byte at a time and returns
    whatever was recovered before the first zlib error. A warning is
    logged only when the failure happens before the trailing checksum
    bytes, i.e. when actual payload was lost.
    """
    decompressor = zlib.decompressobj()
    recovered = b""
    consumed = 0
    try:
        for offset in range(len(data)):
            recovered += decompressor.decompress(data[offset : offset + 1])
            consumed += 1
    except zlib.error:
        # An error within the last few bytes is just the corrupt CRC;
        # anything earlier means real data was lost.
        if consumed < len(data) - 3:
            logger.warning("Data-loss while decompressing corrupted data")
    return recovered
|
243 |
+
|
244 |
+
|
245 |
+
class PDFStream(PDFObject):
    """A PDF stream object: an attribute dictionary plus raw byte data.

    The raw data is decrypted/decoded lazily — ``rawdata`` holds the bytes
    as read from the file until decode() replaces them with ``data``.
    """

    def __init__(
        self,
        attrs: Dict[str, Any],
        rawdata: bytes,
        decipher: Optional[DecipherCallable] = None,
    ) -> None:
        assert isinstance(attrs, dict), str(type(attrs))
        self.attrs = attrs
        # Undecoded bytes; set to None once decode() has run.
        self.rawdata: Optional[bytes] = rawdata
        self.decipher = decipher
        # Decoded bytes; populated by decode().
        self.data: Optional[bytes] = None
        # Owning object id/generation; needed for decryption (set_objid()).
        self.objid: Optional[int] = None
        self.genno: Optional[int] = None

    def set_objid(self, objid: int, genno: int) -> None:
        """Record which indirect object owns this stream (used by decipher)."""
        self.objid = objid
        self.genno = genno

    def __repr__(self) -> str:
        # Show raw length before decoding, decoded length afterwards.
        if self.data is None:
            assert self.rawdata is not None
            return "<PDFStream(%r): raw=%d, %r>" % (
                self.objid,
                len(self.rawdata),
                self.attrs,
            )
        else:
            assert self.data is not None
            return "<PDFStream(%r): len=%d, %r>" % (
                self.objid,
                len(self.data),
                self.attrs,
            )

    def __contains__(self, name: object) -> bool:
        return name in self.attrs

    def __getitem__(self, name: str) -> Any:
        return self.attrs[name]

    def get(self, name: str, default: object = None) -> Any:
        return self.attrs.get(name, default)

    def get_any(self, names: Iterable[str], default: object = None) -> Any:
        """Return the first attribute present among *names*, else *default*."""
        for name in names:
            if name in self.attrs:
                return self.attrs[name]
        return default

    def get_filters(self) -> List[Tuple[Any, Any]]:
        """Return (filter, decode-parameters) pairs in application order.

        Single filters/parameter dicts are normalized into lists so the
        result always zips one parameter entry per filter.
        """
        filters = self.get_any(("F", "Filter"))
        params = self.get_any(("DP", "DecodeParms", "FDecodeParms"), {})
        if not filters:
            return []
        if not isinstance(filters, list):
            filters = [filters]
        if not isinstance(params, list):
            # Make sure the parameters list is the same as filters.
            params = [params] * len(filters)
        if settings.STRICT and len(params) != len(filters):
            raise PDFException("Parameters len filter mismatch")

        resolved_filters = [resolve1(f) for f in filters]
        resolved_params = [resolve1(param) for param in params]
        return list(zip(resolved_filters, resolved_params))

    def decode(self) -> None:
        """Decrypt (if needed) and run every /Filter over rawdata.

        On return ``self.data`` holds the decoded bytes and ``self.rawdata``
        is cleared. Must only be called once.
        """
        assert self.data is None and self.rawdata is not None, str(
            (self.data, self.rawdata),
        )
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            assert self.objid is not None
            assert self.genno is not None
            data = self.decipher(self.objid, self.genno, data, self.attrs)
        filters = self.get_filters()
        if not filters:
            self.data = data
            self.rawdata = None
            return
        for f, params in filters:
            if f in LITERALS_FLATE_DECODE:
                # will get errors if the document is encrypted.
                try:
                    data = zlib.decompress(data)

                except zlib.error as e:
                    if settings.STRICT:
                        error_msg = f"Invalid zlib bytes: {e!r}, {data!r}"
                        raise PDFException(error_msg)

                    # Non-strict: salvage what we can from the broken stream.
                    try:
                        data = decompress_corrupted(data)
                    except zlib.error:
                        data = b""

            elif f in LITERALS_LZW_DECODE:
                data = lzwdecode(data)
            elif f in LITERALS_ASCII85_DECODE:
                data = ascii85decode(data)
            elif f in LITERALS_ASCIIHEX_DECODE:
                data = asciihexdecode(data)
            elif f in LITERALS_RUNLENGTH_DECODE:
                data = rldecode(data)
            elif f in LITERALS_CCITTFAX_DECODE:
                data = ccittfaxdecode(data, params)
            elif f in LITERALS_DCT_DECODE:
                # This is probably a JPG stream
                # it does not need to be decoded twice.
                # Just return the stream to the user.
                pass
            elif f in LITERALS_JBIG2_DECODE or f in LITERALS_JPX_DECODE:
                pass
            elif f == LITERAL_CRYPT:
                # not yet..
                raise PDFNotImplementedError("/Crypt filter is unsupported")
            else:
                raise PDFNotImplementedError("Unsupported filter: %r" % f)
            # apply predictors
            if params and "Predictor" in params:
                pred = int_value(params["Predictor"])
                if pred == 1:
                    # no predictor
                    pass
                elif pred >= 10:
                    # PNG predictor
                    colors = int_value(params.get("Colors", 1))
                    columns = int_value(params.get("Columns", 1))
                    raw_bits_per_component = params.get("BitsPerComponent", 8)
                    bitspercomponent = int_value(raw_bits_per_component)
                    data = apply_png_predictor(
                        pred,
                        colors,
                        columns,
                        bitspercomponent,
                        data,
                    )
                else:
                    error_msg = "Unsupported predictor: %r" % pred
                    raise PDFNotImplementedError(error_msg)
        self.data = data
        self.rawdata = None

    def get_data(self) -> bytes:
        """Return the decoded stream data, decoding on first access."""
        if self.data is None:
            self.decode()
        assert self.data is not None
        return self.data

    def get_rawdata(self) -> Optional[bytes]:
        """Return the undecoded bytes, or None once decode() has run."""
        return self.rawdata
|
pdf2zh/psexceptions.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class PSException(Exception):
    """Base class for all PostScript-parsing related errors."""

    pass


class PSEOF(PSException):
    """Raised when the parser runs out of input unexpectedly."""

    pass


class PSSyntaxError(PSException):
    """Raised when a token stream violates PostScript syntax."""

    pass


class PSTypeError(PSException):
    """Raised when an object does not have the expected type."""

    pass


class PSValueError(PSException):
    """Raised when an object does not have the expected value."""

    pass
|
pdf2zh/psparser.py
ADDED
@@ -0,0 +1,656 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
import io
|
3 |
+
import logging
|
4 |
+
import re
|
5 |
+
from typing import (
|
6 |
+
Any,
|
7 |
+
BinaryIO,
|
8 |
+
Dict,
|
9 |
+
Generic,
|
10 |
+
Iterator,
|
11 |
+
List,
|
12 |
+
Optional,
|
13 |
+
Tuple,
|
14 |
+
Type,
|
15 |
+
TypeVar,
|
16 |
+
Union,
|
17 |
+
)
|
18 |
+
|
19 |
+
from pdf2zh import psexceptions, settings
|
20 |
+
from pdf2zh.utils import choplist
|
21 |
+
|
22 |
+
log = logging.getLogger(__name__)
|
23 |
+
|
24 |
+
|
25 |
+
# Aliases re-exported for backwards compatibility: older code imported these
# exception classes from this module instead of pdf2zh.psexceptions.
PSException = psexceptions.PSException
PSEOF = psexceptions.PSEOF
PSSyntaxError = psexceptions.PSSyntaxError
PSTypeError = psexceptions.PSTypeError
PSValueError = psexceptions.PSValueError
|
31 |
+
|
32 |
+
|
33 |
+
class PSObject:
    """Abstract base class for all PostScript- and PDF-related data types."""
|
35 |
+
|
36 |
+
|
37 |
+
class PSLiteral(PSObject):
    """A PostScript literal: a slash-prefixed name such as ``/Name``.

    Literals serve as identifiers — variable names, property names and
    dictionary keys — and are case sensitive.

    Note: Do not create an instance of PSLiteral directly.
    Always use PSLiteralTable.intern().
    """

    # A literal's name may be stored as either text or raw bytes.
    NameType = Union[str, bytes]

    def __init__(self, name: NameType) -> None:
        self.name = name

    def __repr__(self) -> str:
        return "/%r" % (self.name,)
|
57 |
+
|
58 |
+
|
59 |
+
class PSKeyword(PSObject):
    """A PostScript keyword: one of a small set of predefined words.

    Keywords express commands and directives and also mark content
    boundaries in the token stream.

    Note: Do not create an instance of PSKeyword directly.
    Always use PSKeywordTable.intern().
    """

    def __init__(self, name: bytes) -> None:
        self.name = name

    def __repr__(self) -> str:
        return "/%r" % (self.name,)
|
76 |
+
|
77 |
+
|
78 |
+
# Type variable constrained to the two internable symbol types.
_SymbolT = TypeVar("_SymbolT", PSLiteral, PSKeyword)
|
79 |
+
|
80 |
+
|
81 |
+
class PSSymbolTable(Generic[_SymbolT]):
    """Interning table for PSLiteral/PSKeyword objects.

    Every name maps to exactly one cached instance, so interned symbols
    can be compared for identity with the ``is`` operator.
    """

    def __init__(self, klass: Type[_SymbolT]) -> None:
        self.dict: Dict[PSLiteral.NameType, _SymbolT] = {}
        self.klass: Type[_SymbolT] = klass

    def intern(self, name: PSLiteral.NameType) -> _SymbolT:
        try:
            return self.dict[name]
        except KeyError:
            # Type confusion issue: PSKeyword always takes bytes as name
            # PSLiteral uses either str or bytes
            symbol = self.klass(name)  # type: ignore[arg-type]
            self.dict[name] = symbol
            return symbol
|
100 |
+
|
101 |
+
|
102 |
+
# Module-wide interning tables with shorthand constructors LIT/KWD.
PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword)
LIT = PSLiteralTable.intern
KWD = PSKeywordTable.intern
# Pre-interned structural keywords used by the tokenizer/parser.
KEYWORD_PROC_BEGIN = KWD(b"{")
KEYWORD_PROC_END = KWD(b"}")
KEYWORD_ARRAY_BEGIN = KWD(b"[")
KEYWORD_ARRAY_END = KWD(b"]")
KEYWORD_DICT_BEGIN = KWD(b"<<")
KEYWORD_DICT_END = KWD(b">>")
|
112 |
+
|
113 |
+
|
114 |
+
def literal_name(x: Any) -> str:
    """Return the name of a PSLiteral as a str.

    Bytes names are decoded as UTF-8 when possible, otherwise rendered
    via ``str()``. Non-literals raise PSTypeError in strict mode and are
    stringified in lenient mode.
    """
    if not isinstance(x, PSLiteral):
        if settings.STRICT:
            raise PSTypeError(f"Literal required: {x!r}")
        return str(x)
    name = x.name
    if isinstance(name, str):
        return name
    try:
        return name.decode("utf-8")
    except UnicodeDecodeError:
        return str(name)
|
126 |
+
|
127 |
+
|
128 |
+
def keyword_name(x: Any) -> Any:
    """Return the name of a PSKeyword decoded to str.

    Undecodable bytes are dropped (``errors="ignore"``). Non-keywords
    raise PSTypeError in strict mode; in lenient mode the value itself
    is returned unchanged.
    """
    if isinstance(x, PSKeyword):
        return str(x.name, "utf-8", "ignore")
    if settings.STRICT:
        raise PSTypeError("Keyword required: %r" % x)
    return x
|
137 |
+
|
138 |
+
|
139 |
+
# Lexer tables: byte-level regexes that delimit each token class, plus
# the escape map for literal strings (PDF Reference 3.2.3).
EOL = re.compile(rb"[\r\n]")
SPC = re.compile(rb"\s")
NONSPC = re.compile(rb"\S")
HEX = re.compile(rb"[0-9a-fA-F]")
# Characters that terminate a /name literal.
END_LITERAL = re.compile(rb"[#/%\[\]()<>{}\s]")
END_HEX_STRING = re.compile(rb"[^\s0-9a-fA-F]")
# A full hex pair, or a lone trailing digit (padded as a half byte).
HEX_PAIR = re.compile(rb"[0-9a-fA-F]{2}|.")
END_NUMBER = re.compile(rb"[^0-9]")
END_KEYWORD = re.compile(rb"[#/%\[\]()<>{}\s]")
# \134 is octal for the backslash character.
END_STRING = re.compile(rb"[()\134]")
OCT_STRING = re.compile(rb"[0-7]")
# Backslash escapes inside (...) strings, mapped to their byte values.
ESC_STRING = {
    b"b": 8,
    b"t": 9,
    b"n": 10,
    b"f": 12,
    b"r": 13,
    b"(": 40,
    b")": 41,
    b"\\": 92,
}
|
160 |
+
|
161 |
+
|
162 |
+
# Any value that PSBaseParser.nexttoken() can produce.
PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes]
|
163 |
+
|
164 |
+
|
165 |
+
class PSBaseParser:
    """Most basic PostScript parser that performs only tokenization.

    Implemented as a state machine over a buffered binary stream:
    ``self._parse1`` always points at the handler for the current lexer
    state, each handler consumes bytes from the current buffer chunk and
    returns the next character position, and completed tokens are queued
    on ``self._tokens`` as ``(position, value)`` pairs.
    """

    # Size of each chunk read from the underlying stream.
    BUFSIZ = 4096

    def __init__(self, fp: BinaryIO) -> None:
        self.fp = fp
        # seek(0) also initializes all buffer/tokenizer state.
        self.seek(0)

    def __repr__(self) -> str:
        return "<%s: %r, bufpos=%d>" % (self.__class__.__name__, self.fp, self.bufpos)

    def flush(self) -> None:
        # Hook for subclasses; the base tokenizer has nothing to flush.
        pass

    def close(self) -> None:
        self.flush()

    def tell(self) -> int:
        # Absolute stream position of the next unconsumed byte.
        return self.bufpos + self.charpos

    def poll(self, pos: Optional[int] = None, n: int = 80) -> None:
        # Debug helper: peek at the stream without disturbing position.
        pos0 = self.fp.tell()
        if not pos:
            pos = self.bufpos + self.charpos
        self.fp.seek(pos)
        # log.debug("poll(%d): %r", pos, self.fp.read(n))
        self.fp.seek(pos0)

    def seek(self, pos: int) -> None:
        """Seeks the parser to the given position."""
        # log.debug("seek: %r", pos)
        self.fp.seek(pos)
        # reset the status for nextline()
        self.bufpos = pos
        self.buf = b""
        self.charpos = 0
        # reset the status for nexttoken()
        self._parse1 = self._parse_main
        self._curtoken = b""
        self._curtokenpos = 0
        self._tokens: List[Tuple[int, PSBaseParserToken]] = []

    def fillbuf(self) -> None:
        # Refill the buffer only once it is exhausted.
        if self.charpos < len(self.buf):
            return
        # fetch next chunk.
        self.bufpos = self.fp.tell()
        self.buf = self.fp.read(self.BUFSIZ)
        if not self.buf:
            raise PSEOF("Unexpected EOF")
        self.charpos = 0

    def nextline(self) -> Tuple[int, bytes]:
        """Fetches a next line that ends either with \\r or \\n."""
        linebuf = b""
        linepos = self.bufpos + self.charpos
        eol = False
        while 1:
            self.fillbuf()
            if eol:
                c = self.buf[self.charpos : self.charpos + 1]
                # handle b'\r\n': a \r at a chunk boundary may be
                # followed by a \n in the next chunk.
                if c == b"\n":
                    linebuf += c
                    self.charpos += 1
                break
            m = EOL.search(self.buf, self.charpos)
            if m:
                linebuf += self.buf[self.charpos : m.end(0)]
                self.charpos = m.end(0)
                if linebuf[-1:] == b"\r":
                    eol = True
                else:
                    break
            else:
                # No EOL in this chunk: keep accumulating.
                linebuf += self.buf[self.charpos :]
                self.charpos = len(self.buf)
        # log.debug("nextline: %r, %r", linepos, linebuf)

        return (linepos, linebuf)

    def revreadlines(self) -> Iterator[bytes]:
        """Fetches a next line backword.

        This is used to locate the trailers at the end of a file.
        Yields lines (including their EOL byte) from the end of the
        stream towards the beginning.
        """
        self.fp.seek(0, io.SEEK_END)
        pos = self.fp.tell()
        buf = b""
        while pos > 0:
            prevpos = pos
            pos = max(0, pos - self.BUFSIZ)
            self.fp.seek(pos)
            s = self.fp.read(prevpos - pos)
            if not s:
                break
            while 1:
                n = max(s.rfind(b"\r"), s.rfind(b"\n"))
                if n == -1:
                    # No EOL left in this chunk; prepend to carry-over.
                    buf = s + buf
                    break
                yield s[n:] + buf
                s = s[:n]
                buf = b""

    def _parse_main(self, s: bytes, i: int) -> int:
        # Dispatch state: skip whitespace, then pick the lexer state for
        # the first significant character.
        m = NONSPC.search(s, i)
        if not m:
            return len(s)
        j = m.start(0)
        c = s[j : j + 1]
        self._curtokenpos = self.bufpos + j
        if c == b"%":
            self._curtoken = b"%"
            self._parse1 = self._parse_comment
            return j + 1
        elif c == b"/":
            self._curtoken = b""
            self._parse1 = self._parse_literal
            return j + 1
        elif c in b"-+" or c.isdigit():
            self._curtoken = c
            self._parse1 = self._parse_number
            return j + 1
        elif c == b".":
            self._curtoken = c
            self._parse1 = self._parse_float
            return j + 1
        elif c.isalpha():
            self._curtoken = c
            self._parse1 = self._parse_keyword
            return j + 1
        elif c == b"(":
            self._curtoken = b""
            self.paren = 1
            self._parse1 = self._parse_string
            return j + 1
        elif c == b"<":
            # Could be "<<" (dict) or a hex string; decided in _parse_wopen.
            self._curtoken = b""
            self._parse1 = self._parse_wopen
            return j + 1
        elif c == b">":
            self._curtoken = b""
            self._parse1 = self._parse_wclose
            return j + 1
        elif c == b"\x00":
            # NUL bytes are silently skipped.
            return j + 1
        else:
            # Any other single character becomes a one-byte keyword.
            self._add_token(KWD(c))
            return j + 1

    def _add_token(self, obj: PSBaseParserToken) -> None:
        self._tokens.append((self._curtokenpos, obj))

    def _parse_comment(self, s: bytes, i: int) -> int:
        m = EOL.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        self._parse1 = self._parse_main
        # We ignore comments.
        # self._tokens.append(self._curtoken)
        return j

    def _parse_literal(self, s: bytes, i: int) -> int:
        # Accumulate a /name literal until a delimiter is reached.
        m = END_LITERAL.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        c = s[j : j + 1]
        if c == b"#":
            # "#xx" hex escape inside a name.
            self.hex = b""
            self._parse1 = self._parse_literal_hex
            return j + 1
        try:
            name: Union[str, bytes] = str(self._curtoken, "utf-8")
        except Exception:
            # Keep undecodable names as raw bytes.
            name = self._curtoken
        self._add_token(LIT(name))
        self._parse1 = self._parse_main
        return j

    def _parse_literal_hex(self, s: bytes, i: int) -> int:
        # Collect up to two hex digits after '#', then resume the name.
        c = s[i : i + 1]
        if HEX.match(c) and len(self.hex) < 2:
            self.hex += c
            return i + 1
        if self.hex:
            self._curtoken += bytes((int(self.hex, 16),))
        self._parse1 = self._parse_literal
        return i

    def _parse_number(self, s: bytes, i: int) -> int:
        m = END_NUMBER.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        c = s[j : j + 1]
        if c == b".":
            # Decimal point: switch to the float state.
            self._curtoken += c
            self._parse1 = self._parse_float
            return j + 1
        try:
            self._add_token(int(self._curtoken))
        except ValueError:
            # e.g. a bare "-" or "+": drop it silently.
            pass
        self._parse1 = self._parse_main
        return j

    def _parse_float(self, s: bytes, i: int) -> int:
        m = END_NUMBER.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        try:
            self._add_token(float(self._curtoken))
        except ValueError:
            # Malformed numeric text is dropped silently.
            pass
        self._parse1 = self._parse_main
        return j

    def _parse_keyword(self, s: bytes, i: int) -> int:
        m = END_KEYWORD.search(s, i)
        if m:
            j = m.start(0)
            self._curtoken += s[i:j]
        else:
            # Use the rest of the stream if no non-keyword character is found. This
            # can happen if the keyword is the final bytes of the stream
            # (https://github.com/pdf2zh/pdf2zh.six/issues/884).
            j = len(s)
            self._curtoken += s[i:]
        if self._curtoken == b"true":
            token: Union[bool, PSKeyword] = True
        elif self._curtoken == b"false":
            token = False
        else:
            token = KWD(self._curtoken)
        self._add_token(token)
        self._parse1 = self._parse_main
        return j

    def _parse_string(self, s: bytes, i: int) -> int:
        # Inside a (...) literal string; self.paren tracks nesting depth.
        m = END_STRING.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        c = s[j : j + 1]
        if c == b"\\":
            self.oct = b""
            self._parse1 = self._parse_string_1
            return j + 1
        if c == b"(":
            self.paren += 1
            self._curtoken += c
            return j + 1
        if c == b")":
            self.paren -= 1
            if self.paren:
                # WTF, they said balanced parens need no special treatment.
                self._curtoken += c
                return j + 1
            self._add_token(self._curtoken)
            self._parse1 = self._parse_main
        return j + 1

    def _parse_string_1(self, s: bytes, i: int) -> int:
        """Parse literal strings

        PDF Reference 3.2.3
        Handles the character following a backslash: octal escapes,
        named escapes, and escaped line breaks.
        """
        c = s[i : i + 1]
        if OCT_STRING.match(c) and len(self.oct) < 3:
            self.oct += c
            return i + 1

        elif self.oct:
            chrcode = int(self.oct, 8)
            assert chrcode < 256, "Invalid octal %s (%d)" % (repr(self.oct), chrcode)
            self._curtoken += bytes((chrcode,))
            self._parse1 = self._parse_string
            return i

        elif c in ESC_STRING:
            self._curtoken += bytes((ESC_STRING[c],))

        elif c == b"\r" and len(s) > i + 1 and s[i + 1 : i + 2] == b"\n":
            # If current and next character is \r\n skip both because enters
            # after a \ are ignored
            i += 1

        # default action
        self._parse1 = self._parse_string
        return i + 1

    def _parse_wopen(self, s: bytes, i: int) -> int:
        # After a '<': "<<" starts a dict, anything else a hex string.
        c = s[i : i + 1]
        if c == b"<":
            self._add_token(KEYWORD_DICT_BEGIN)
            self._parse1 = self._parse_main
            i += 1
        else:
            self._parse1 = self._parse_hexstring
        return i

    def _parse_wclose(self, s: bytes, i: int) -> int:
        # After a '>': only ">>" (dict end) is meaningful here.
        c = s[i : i + 1]
        if c == b">":
            self._add_token(KEYWORD_DICT_END)
            i += 1
        self._parse1 = self._parse_main
        return i

    def _parse_hexstring(self, s: bytes, i: int) -> int:
        m = END_HEX_STRING.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        # Strip whitespace, then convert each hex pair to one byte.
        token = HEX_PAIR.sub(
            lambda m: bytes((int(m.group(0), 16),)),
            SPC.sub(b"", self._curtoken),
        )
        self._add_token(token)
        self._parse1 = self._parse_main
        return j

    def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
        """Return the next ``(position, token)`` pair; raises PSEOF at end."""
        while not self._tokens:
            self.fillbuf()
            self.charpos = self._parse1(self.buf, self.charpos)
        token = self._tokens.pop(0)
        # log.debug("nexttoken: %r", token)
        return token
|
511 |
+
|
512 |
+
|
513 |
+
# Stack slots may be occupied by any of:
# * the name of a literal
# * the PSBaseParserToken types
# * list (via KEYWORD_ARRAY)
# * dict (via KEYWORD_DICT)
# * subclass-specific extensions (e.g. PDFStream, PDFObjRef) via ExtraT
ExtraT = TypeVar("ExtraT")
PSStackType = Union[str, float, bool, PSLiteral, bytes, List, Dict, ExtraT]
# A parsed object paired with its stream position.
PSStackEntry = Tuple[int, PSStackType[ExtraT]]
|
522 |
+
|
523 |
+
|
524 |
+
class PSStackParser(PSBaseParser, Generic[ExtraT]):
    """Parser that assembles tokens into composite PostScript objects.

    Builds arrays, dictionaries and procedures from the token stream by
    pushing entries onto ``curstack`` and saving/restoring stack frames
    on ``context`` whenever a nested construct opens/closes. Completed
    top-level objects accumulate in ``results``.
    """

    def __init__(self, fp: BinaryIO) -> None:
        PSBaseParser.__init__(self, fp)
        self.reset()

    def reset(self) -> None:
        # Saved (position, type, stack) frames for open nested constructs.
        self.context: List[Tuple[int, Optional[str], List[PSStackEntry[ExtraT]]]] = []
        # Type tag of the innermost open construct: "a"rray, "d"ict, "p"roc.
        self.curtype: Optional[str] = None
        self.curstack: List[PSStackEntry[ExtraT]] = []
        self.results: List[PSStackEntry[ExtraT]] = []

    def seek(self, pos: int) -> None:
        # Seeking invalidates any partially built objects.
        PSBaseParser.seek(self, pos)
        self.reset()

    def push(self, *objs: PSStackEntry[ExtraT]) -> None:
        self.curstack.extend(objs)

    def pop(self, n: int) -> List[PSStackEntry[ExtraT]]:
        # Remove and return the top n entries (oldest first).
        objs = self.curstack[-n:]
        self.curstack[-n:] = []
        return objs

    def popall(self) -> List[PSStackEntry[ExtraT]]:
        objs = self.curstack
        self.curstack = []
        return objs

    def add_results(self, *objs: PSStackEntry[ExtraT]) -> None:
        # try:
        #     log.debug("add_results: %r", objs)
        # except Exception:
        #     log.debug("add_results: (unprintable object)")
        self.results.extend(objs)

    def start_type(self, pos: int, type: str) -> None:
        # Open a nested construct: save the current frame, start fresh.
        self.context.append((pos, self.curtype, self.curstack))
        (self.curtype, self.curstack) = (type, [])
        # log.debug("start_type: pos=%r, type=%r", pos, type)

    def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]:
        # Close the innermost construct; it must match the expected type.
        if self.curtype != type:
            raise PSTypeError(f"Type mismatch: {self.curtype!r} != {type!r}")
        objs = [obj for (_, obj) in self.curstack]
        (pos, self.curtype, self.curstack) = self.context.pop()
        # log.debug("end_type: pos=%r, type=%r, objs=%r", pos, type, objs)
        return (pos, objs)

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        # Hook for subclasses to act on non-structural keywords.
        pass

    def nextobject(self) -> PSStackEntry[ExtraT]:
        """Yields a list of objects.

        Arrays and dictionaries are represented as Python lists and
        dictionaries.

        NOTE: despite the annotation, this returns ``(end, obj)`` where
        ``end`` is the position just past an "endobj" keyword (or None
        if none was seen) and ``obj`` is a (pos, value) entry.

        :return: keywords, literals, strings, numbers, arrays and dictionaries.
        """
        end = None
        while not self.results:
            (pos, token) = self.nexttoken()
            if isinstance(token, (int, float, bool, str, bytes, PSLiteral)):
                # normal token
                self.push((pos, token))
            elif token == KEYWORD_ARRAY_BEGIN:
                # begin array
                self.start_type(pos, "a")
            elif token == KEYWORD_ARRAY_END:
                # end array
                try:
                    self.push(self.end_type("a"))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif token == KEYWORD_DICT_BEGIN:
                # begin dictionary
                self.start_type(pos, "d")
            elif token == KEYWORD_DICT_END:
                # end dictionary
                try:
                    (pos, objs) = self.end_type("d")
                    if len(objs) % 2 != 0:
                        error_msg = "Invalid dictionary construct: %r" % objs
                        raise PSSyntaxError(error_msg)
                    # Pair up key/value entries; null values are dropped.
                    d = {
                        literal_name(k): v
                        for (k, v) in choplist(2, objs)
                        if v is not None
                    }
                    self.push((pos, d))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif token == KEYWORD_PROC_BEGIN:
                # begin proc
                self.start_type(pos, "p")
            elif token == KEYWORD_PROC_END:
                # end proc
                try:
                    self.push(self.end_type("p"))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif isinstance(token, PSKeyword):
                # log.debug(
                #     "do_keyword: pos=%r, token=%r, stack=%r",
                #     pos,
                #     token,
                #     self.curstack,
                # )
                if token.name == b"endobj":
                    # Record the stream position just past "endobj"
                    # (7 == len(b"endobj") + 1).
                    end = pos + 7
                self.do_keyword(pos, token)
            else:
                log.error(
                    "unknown token: pos=%r, token=%r, stack=%r",
                    pos,
                    token,
                    self.curstack,
                )
                self.do_keyword(pos, token)
                raise PSException
            if self.context:
                continue
            else:
                self.flush()
        obj = self.results.pop(0)
        # try:
        #     log.debug("nextobject: %r", obj)
        # except Exception:
        #     log.debug("nextobject: (unprintable object)")
        return end, obj
|
pdf2zh/py.typed
ADDED
File without changes
|
pdf2zh/runlength.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#
|
2 |
+
# RunLength decoder (Adobe version) implementation based on PDF Reference
|
3 |
+
# version 1.4 section 3.3.4.
|
4 |
+
#
|
5 |
+
# * public domain *
|
6 |
+
#
|
7 |
+
|
8 |
+
|
9 |
+
def rldecode(data: bytes) -> bytes:
    """RunLength decoder (Adobe version) implementation based on PDF Reference
    version 1.4 section 3.3.4:
    The RunLengthDecode filter decodes data that has been encoded in a
    simple byte-oriented format based on run length. The encoded data
    is a sequence of runs, where each run consists of a length byte
    followed by 1 to 128 bytes of data. If the length byte is in the
    range 0 to 127, the following length + 1 (1 to 128) bytes are
    copied literally during decompression. If length is in the range
    129 to 255, the following single byte is to be copied 257 - length
    (2 to 128) times during decompression. A length value of 128
    denotes EOD.

    :param data: RunLength-encoded input.
    :return: the decoded byte string (anything after an EOD marker is
        ignored; a literal run truncated by end-of-data yields the
        bytes actually present).
    """
    # bytearray + slice copies: O(n) overall, instead of the quadratic
    # byte-at-a-time ``bytes +=`` of a naive implementation.
    decoded = bytearray()
    i = 0
    end = len(data)
    while i < end:
        length = data[i]
        if length == 128:
            # EOD marker: stop decoding.
            break
        if length < 128:
            # Literal run: copy the next length+1 bytes verbatim.
            decoded += data[i + 1 : i + 2 + length]
            i += 2 + length
        else:
            # Replicated run: repeat the next byte 257-length times.
            decoded += bytes((data[i + 1],)) * (257 - length)
            i += 2
    return bytes(decoded)
|
pdf2zh/settings.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# When True, parsers raise on malformed constructs instead of
# recovering silently; checked throughout the package (e.g. psparser).
STRICT = False
|
pdf2zh/translator.py
ADDED
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import html
|
2 |
+
import logging
|
3 |
+
import os
|
4 |
+
import re
|
5 |
+
from json import dumps, loads
|
6 |
+
|
7 |
+
import deepl
|
8 |
+
import ollama
|
9 |
+
import openai
|
10 |
+
import requests
|
11 |
+
from azure.ai.translation.text import TextTranslationClient
|
12 |
+
from azure.core.credentials import AzureKeyCredential
|
13 |
+
|
14 |
+
import hmac
|
15 |
+
import hashlib
|
16 |
+
import time
|
17 |
+
from datetime import datetime,UTC
|
18 |
+
|
19 |
+
class BaseTranslator:
    """Common interface for all translation backends.

    Subclasses implement :meth:`translate`; this base class only
    records the service name, the language pair, and the model id.
    """

    def __init__(self, service, lang_out, lang_in, model):
        self.service = service
        self.lang_out = lang_out
        self.lang_in = lang_in
        self.model = model

    def translate(self, text) -> str:  # noqa: E704
        """Translate *text*; overridden by concrete backends."""
        ...

    def __str__(self):
        return "{} {} {}".format(self.service, self.lang_out, self.lang_in)
|
30 |
+
|
31 |
+
|
32 |
+
class GoogleTranslator(BaseTranslator):
    """Translator that scrapes the lightweight mobile page of Google
    Translate (http://translate.google.com/m) — no API key required.
    """

    def __init__(self, service, lang_out, lang_in, model):
        # "auto" defaults to an English -> Simplified Chinese pair.
        lang_out = "zh-CN" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        self.session = requests.Session()
        self.base_link = "http://translate.google.com/m"
        # Old IE user agent keeps the endpoint serving the simple HTML page.
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501
        }

    def translate(self, text):
        text = text[:5000]  # google translate max length
        response = self.session.get(
            self.base_link,
            params={"tl": self.lang_out, "sl": self.lang_in, "q": text},
            headers=self.headers,
        )
        # The translation is embedded in a div whose class is "t0" or
        # "result-container", depending on the page variant served.
        re_result = re.findall(
            r'(?s)class="(?:t0|result-container)">(.*?)<', response.text
        )
        if response.status_code == 400:
            # Sentinel string instead of an exception on HTTP 400.
            result = "IRREPARABLE TRANSLATION ERROR"
        elif len(re_result) == 0:
            raise ValueError("Empty translation result")
        else:
            result = html.unescape(re_result[0])
        return result
|
60 |
+
|
61 |
+
class TencentTranslator(BaseTranslator):
    """Translator backed by Tencent Cloud TMT (action "TextTranslate").

    Requires the TENCENT_SECRET_ID and TENCENT_SECRET_KEY environment
    variables; requests are signed with the TC3-HMAC-SHA256 scheme.
    """

    def sign(self, key, msg):
        """One step of the TC3 key-derivation chain: HMAC-SHA256(key, msg)."""
        return hmac.new(key, msg.encode("utf-8"), hashlib.sha256).digest()

    def __init__(self, service, lang_out, lang_in, model):
        # "auto" defaults to an English -> Chinese pair.
        lang_out = "zh" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        try:
            server_url = (
                "tmt.tencentcloudapi.com"
            )
            # Use os.environ[...] (not os.getenv) so a missing credential
            # raises KeyError here and is reported immediately below;
            # os.getenv would silently return None and only fail later,
            # deep inside request signing.
            self.secret_id = os.environ["TENCENT_SECRET_ID"]
            self.secret_key = os.environ["TENCENT_SECRET_KEY"]

        except KeyError as e:
            missing_var = e.args[0]
            raise ValueError(
                f"The environment variable '{missing_var}' is required but not set."
            ) from e

        self.session = requests.Session()
        self.base_link = f"{server_url}"

    def translate(self, text):
        """Translate *text* (truncated to 5000 chars); returns the result
        string or raises ValueError on HTTP/response errors.
        """
        text = text[:5000]
        data = {
            "SourceText": text,
            "Source": self.lang_in,
            "Target": self.lang_out,
            "ProjectId": 0,
        }
        # --- TC3-HMAC-SHA256 request signing -------------------------
        # The signed payload must serialize exactly like the body that
        # requests sends for json=data (default json.dumps separators).
        payloadx = dumps(data)
        hashed_request_payload = hashlib.sha256(payloadx.encode("utf-8")).hexdigest()
        canonical_request = ("POST" + "\n" +
                             "/" + "\n" +
                             "" + "\n" +
                             "content-type:application/json; charset=utf-8\nhost:tmt.tencentcloudapi.com\nx-tc-action:texttranslate\n" + "\n" +
                             "content-type;host;x-tc-action" + "\n" +
                             hashed_request_payload)

        timestamp = int(time.time())
        date = datetime.fromtimestamp(timestamp, UTC).strftime("%Y-%m-%d")
        credential_scope = date + "/tmt/tc3_request"
        hashed_canonical_request = hashlib.sha256(canonical_request.encode("utf-8")).hexdigest()
        algorithm = "TC3-HMAC-SHA256"
        string_to_sign = (algorithm + "\n" +
                          str(timestamp) + "\n" +
                          credential_scope + "\n" +
                          hashed_canonical_request)
        # Key-derivation chain: secret key -> date -> service -> request.
        secret_date = self.sign(("TC3" + self.secret_key).encode("utf-8"), date)
        secret_service = self.sign(secret_date, "tmt")
        secret_signing = self.sign(secret_service, "tc3_request")
        signed_headers = "content-type;host;x-tc-action"
        signature = hmac.new(secret_signing, string_to_sign.encode("utf-8"), hashlib.sha256).hexdigest()
        authorization = (algorithm + " " +
                         "Credential=" + self.secret_id + "/" + credential_scope + ", " +
                         "SignedHeaders=" + signed_headers + ", " +
                         "Signature=" + signature)
        self.headers = {
            "Authorization": authorization,
            "Content-Type": "application/json; charset=utf-8",
            "Host": "tmt.tencentcloudapi.com",
            "X-TC-Action": "TextTranslate",
            "X-TC-Region": "ap-beijing",
            "X-TC-Timestamp": str(timestamp),
            "X-TC-Version": "2018-03-21"
        }

        response = self.session.post(
            "https://" + self.base_link,
            json=data,
            headers=self.headers,
        )
        # 1. Status code test
        if response.status_code == 200:
            result = loads(response.text)
        else:
            raise ValueError("HTTP error: " + str(response.status_code))
        # 2. Result test: a successful call nests the text under
        # Response.TargetText.
        try:
            return result['Response']['TargetText']
        except KeyError:
            raise ValueError("No valid key in Tencent's response")
|
151 |
+
|
152 |
+
class DeepLXTranslator(BaseTranslator):
    """Translator for a DeepLX-compatible HTTP endpoint.

    Reads DEEPLX_AUTH_KEY (optional) and DEEPLX_SERVER_URL (defaults to
    https://api.deeplx.org) from the environment.
    """

    def __init__(self, service, lang_out, lang_in, model):
        # "auto" defaults to an English -> Chinese pair.
        lang_out = "zh" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        try:
            auth_key = os.getenv("DEEPLX_AUTH_KEY")
            server_url = (
                "https://api.deeplx.org"
                if not os.getenv("DEEPLX_SERVER_URL")
                else os.getenv("DEEPLX_SERVER_URL")
            )
        # NOTE(review): os.getenv never raises KeyError, so this handler
        # is dead code; the auth key is effectively optional here.
        except KeyError as e:
            missing_var = e.args[0]
            raise ValueError(
                f"The environment variable '{missing_var}' is required but not set."
            ) from e

        self.session = requests.Session()
        server_url = server_url.rstrip('/')
        # With an auth key the key becomes a path segment of the URL.
        if auth_key:
            self.base_link = f"{server_url}/{auth_key}/translate"
        else:
            self.base_link = f"{server_url}/translate"
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501
        }

    def translate(self, text):
        text = text[:5000]  # google translate max length
        response = self.session.post(
            self.base_link,
            dumps(
                {
                    "target_lang": self.lang_out,
                    "text": text,
                }
            ),
            headers=self.headers,
        )
        # 1. Status code test
        if response.status_code == 200:
            result = loads(response.text)
        else:
            raise ValueError("HTTP error: " + str(response.status_code))
        # 2. Result test: DeepLX returns the translation under "data".
        try:
            result = result["data"]
            return result
        except KeyError:
            result = ""
            raise ValueError("No valid key in DeepLX's response")
        # 3. Result length check
        # NOTE(review): unreachable — both branches above return/raise.
        if len(result) == 0:
            raise ValueError("Empty translation result")
        return result
|
208 |
+
|
209 |
+
|
210 |
+
class DeepLTranslator(BaseTranslator):
    """Translator using the official DeepL API via the ``deepl`` SDK.

    Reads DEEPL_AUTH_KEY and DEEPL_SERVER_URL from the environment.
    """

    def __init__(self, service, lang_out, lang_in, model):
        # DeepL uses upper-case language codes; "auto" defaults to EN -> ZH.
        lang_out = "ZH" if lang_out == "auto" else lang_out
        lang_in = "EN" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        self.session = requests.Session()
        auth_key = os.getenv("DEEPL_AUTH_KEY")
        server_url = os.getenv("DEEPL_SERVER_URL")
        self.client = deepl.Translator(auth_key, server_url=server_url)

    def translate(self, text):
        response = self.client.translate_text(
            text, target_lang=self.lang_out, source_lang=self.lang_in
        )
        return response.text
|
225 |
+
|
226 |
+
|
227 |
+
class OllamaTranslator(BaseTranslator):
    """Translator that prompts a local LLM through the Ollama chat API."""

    def __init__(self, service, lang_out, lang_in, model):
        lang_out = "zh-CN" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        self.options = {"temperature": 0}  # random sampling may break formula markers
        # OLLAMA_HOST
        self.client = ollama.Client()

    def translate(self, text):
        response = self.client.chat(
            model=self.model,
            options=self.options,
            messages=[
                {
                    "role": "system",
                    "content": "You are a professional,authentic machine translation engine.",
                },
                {
                    "role": "user",
                    "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:",  # noqa: E501
                },
            ],
        )
        return response["message"]["content"].strip()
|
252 |
+
|
253 |
+
|
254 |
+
class OpenAITranslator(BaseTranslator):
    """Translator that drives an OpenAI-compatible chat-completion API.

    Credentials and endpoint come from the environment variables the
    openai client reads itself (OPENAI_BASE_URL, OPENAI_API_KEY).
    """

    def __init__(self, service, lang_out, lang_in, model):
        if lang_out == "auto":
            lang_out = "zh-CN"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)
        # Deterministic sampling: random sampling may break formula markers.
        self.options = {"temperature": 0}
        self.client = openai.OpenAI()

    def translate(self, text) -> str:
        prompt = f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:"  # noqa: E501
        messages = [
            {
                "role": "system",
                "content": "You are a professional,authentic machine translation engine.",
            },
            {
                "role": "user",
                "content": prompt,
            },
        ]
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            **self.options,
        )
        return completion.choices[0].message.content.strip()
|
280 |
+
|
281 |
+
|
282 |
+
class AzureTranslator(BaseTranslator):
    """Translator using Azure AI Translator (Text Translation API).

    Requires AZURE_APIKEY, AZURE_ENDPOINT and AZURE_REGION environment
    variables.
    """

    def __init__(self, service, lang_out, lang_in, model):
        # Azure uses "zh-Hans" for Simplified Chinese.
        lang_out = "zh-Hans" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)

        try:
            api_key = os.environ["AZURE_APIKEY"]
            endpoint = os.environ["AZURE_ENDPOINT"]
            region = os.environ["AZURE_REGION"]
        except KeyError as e:
            missing_var = e.args[0]
            raise ValueError(
                f"The environment variable '{missing_var}' is required but not set."
            ) from e

        credential = AzureKeyCredential(api_key)
        self.client = TextTranslationClient(
            endpoint=endpoint, credential=credential, region=region
        )

        # Silence the very verbose per-request HTTP logging of the SDK.
        # https://github.com/Azure/azure-sdk-for-python/issues/9422
        logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy")
        logger.setLevel(logging.WARNING)

    def translate(self, text) -> str:
        response = self.client.translate(
            body=[text],
            from_language=self.lang_in,
            to_language=[self.lang_out],
        )

        # One input document -> first result, first (only) target language.
        translated_text = response[0].translations[0].text
        return translated_text
|
pdf2zh/utils.py
ADDED
@@ -0,0 +1,834 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Miscellaneous Routines."""
|
2 |
+
|
3 |
+
import io
|
4 |
+
import pathlib
|
5 |
+
import string
|
6 |
+
import struct
|
7 |
+
from html import escape
|
8 |
+
from typing import (
|
9 |
+
TYPE_CHECKING,
|
10 |
+
Any,
|
11 |
+
BinaryIO,
|
12 |
+
Callable,
|
13 |
+
Dict,
|
14 |
+
Generic,
|
15 |
+
Iterable,
|
16 |
+
Iterator,
|
17 |
+
List,
|
18 |
+
Optional,
|
19 |
+
Set,
|
20 |
+
TextIO,
|
21 |
+
Tuple,
|
22 |
+
TypeVar,
|
23 |
+
Union,
|
24 |
+
cast,
|
25 |
+
)
|
26 |
+
|
27 |
+
from pdf2zh.pdfexceptions import PDFTypeError, PDFValueError
|
28 |
+
|
29 |
+
if TYPE_CHECKING:
|
30 |
+
from pdf2zh.layout import LTComponent
|
31 |
+
|
32 |
+
import charset_normalizer # For str encoding detection
|
33 |
+
|
34 |
+
# from sys import maxint as INF doesn't work anymore under Python3, but PDF
# still uses 32 bits ints
INF = (1 << 31) - 1  # largest signed 32-bit integer; used as a sentinel coordinate bound


FileOrName = Union[pathlib.PurePath, str, io.IOBase]  # anything open_filename accepts
AnyIO = Union[TextIO, BinaryIO]  # any open text or binary stream
|
41 |
+
|
42 |
+
|
43 |
+
class open_filename:
    """Context manager that opens a filename and closes it on exit.

    A ``str`` or ``pathlib.PurePath`` argument is opened like ``open``; an
    already-open file-like object is passed through unchanged and is NOT
    closed on exit (the caller owns it).
    """

    def __init__(self, filename: FileOrName, *args: Any, **kwargs: Any) -> None:
        if isinstance(filename, pathlib.PurePath):
            filename = str(filename)
        if isinstance(filename, io.IOBase):
            # Already a stream: borrow it, leave closing to the caller.
            self.file_handler = filename
            self.closing = False
        elif isinstance(filename, str):
            self.file_handler: AnyIO = open(filename, *args, **kwargs)
            self.closing = True
        else:
            raise PDFTypeError("Unsupported input type: %s" % type(filename))

    def __enter__(self) -> AnyIO:
        return self.file_handler

    def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
        # Close only handles this context manager opened itself.
        if self.closing:
            self.file_handler.close()
|
67 |
+
|
68 |
+
|
69 |
+
def make_compat_bytes(in_str: str) -> bytes:
    """Encode a text string to ``bytes`` using the default (UTF-8) codec."""
    assert isinstance(in_str, str), str(type(in_str))
    encoded = in_str.encode()
    return encoded
|
73 |
+
|
74 |
+
|
75 |
+
def make_compat_str(o: object) -> str:
    """Convert anything to ``str``.

    ``bytes`` input is decoded with a detected character set; any other
    object (or undecodable bytes) falls back to ``str(o)``.

    Fix: ``charset_normalizer.detect`` may report ``encoding=None`` for
    undetectable input, which previously caused an uncaught TypeError in
    ``bytes.decode``; an unknown codec name would raise LookupError.  Both
    now fall back to ``str(o)``.
    """
    if isinstance(o, bytes):
        enc = charset_normalizer.detect(o)
        encoding = enc["encoding"]
        if encoding is not None:
            try:
                return o.decode(encoding)
            except (UnicodeDecodeError, LookupError):
                pass
    return str(o)
|
85 |
+
|
86 |
+
|
87 |
+
def shorten_str(s: str, size: int) -> str:
    """Abbreviate ``s`` to roughly ``size`` characters.

    Overlong strings are shown as ``"head ... tail"``; when ``size < 7``
    there is no room for the ellipsis and the string is simply truncated.
    """
    if size < 7:
        return s[:size]
    if len(s) <= size:
        return s
    keep = (size - 5) // 2
    return f"{s[:keep]} ... {s[-keep:]}"
|
95 |
+
|
96 |
+
|
97 |
+
def compatible_encode_method(
    bytesorstring: Union[bytes, str],
    encoding: str = "utf-8",
    erraction: str = "ignore",
) -> str:
    """Return the argument as ``str``, decoding ``bytes`` when necessary.

    Mirrors the Py2-era idiom where ``str.encode`` frequently meant
    ``bytes.decode`` under Python 3: strings pass through untouched.
    """
    if isinstance(bytesorstring, bytes):
        return bytesorstring.decode(encoding, erraction)
    assert isinstance(bytesorstring, str), str(type(bytesorstring))
    return bytesorstring
|
110 |
+
|
111 |
+
|
112 |
+
def paeth_predictor(left: int, above: int, upper_left: int) -> int:
|
113 |
+
# From http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
|
114 |
+
# Initial estimate
|
115 |
+
p = left + above - upper_left
|
116 |
+
# Distances to a,b,c
|
117 |
+
pa = abs(p - left)
|
118 |
+
pb = abs(p - above)
|
119 |
+
pc = abs(p - upper_left)
|
120 |
+
|
121 |
+
# Return nearest of a,b,c breaking ties in order a,b,c
|
122 |
+
if pa <= pb and pa <= pc:
|
123 |
+
return left
|
124 |
+
elif pb <= pc:
|
125 |
+
return above
|
126 |
+
else:
|
127 |
+
return upper_left
|
128 |
+
|
129 |
+
|
130 |
+
def apply_png_predictor(
    pred: int,
    colors: int,
    columns: int,
    bitspercomponent: int,
    data: bytes,
) -> bytes:
    """Reverse the effect of the PNG predictor.

    Documentation: http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html

    :param pred: predictor id from the stream parameters (the per-scanline
        filter byte in ``data`` is what actually selects each row's filter).
    :param colors: number of colour components per pixel.
    :param columns: number of pixels per scanline.
    :param bitspercomponent: bits per colour component (only 1 and 8 supported).
    :param data: filtered rows, each ``nbytes`` long and preceded by one filter byte.
    :raises PDFValueError: on an unsupported bit depth or filter type.

    Fix: the prior scanline is now initialised with ``nbytes`` zero entries
    rather than ``columns``.  With more than one byte per pixel the old,
    shorter row silently truncated the Up filter (via ``zip``) and caused
    IndexError in the Average and Paeth filters.
    """
    if bitspercomponent not in [8, 1]:
        msg = "Unsupported `bitspercomponent': %d" % bitspercomponent
        raise PDFValueError(msg)

    nbytes = colors * columns * bitspercomponent // 8
    bpp = colors * bitspercomponent // 8  # number of bytes per complete pixel
    buf = []
    line_above = [0] * nbytes  # prior scanline, full row width
    for scanline_i in range(0, len(data), nbytes + 1):
        filter_type = data[scanline_i]
        line_encoded = data[scanline_i + 1 : scanline_i + 1 + nbytes]
        raw = []

        if filter_type == 0:
            # Filter type 0: None
            raw = list(line_encoded)

        elif filter_type == 1:
            # Filter type 1: Sub
            # Raw(x) = Sub(x) + Raw(x - bpp), computed mod 256, where Raw()
            # refers to the bytes already decoded on this row.
            for j, sub_x in enumerate(line_encoded):
                raw_x_bpp = raw[j - bpp] if j >= bpp else 0
                raw.append((sub_x + raw_x_bpp) & 255)

        elif filter_type == 2:
            # Filter type 2: Up
            # Raw(x) = Up(x) + Prior(x), computed mod 256, where Prior()
            # refers to the decoded bytes of the prior scanline.
            for up_x, prior_x in zip(line_encoded, line_above):
                raw.append((up_x + prior_x) & 255)

        elif filter_type == 3:
            # Filter type 3: Average
            # Raw(x) = Average(x) + floor((Raw(x-bpp) + Prior(x)) / 2),
            # computed mod 256.
            for j, average_x in enumerate(line_encoded):
                raw_x_bpp = raw[j - bpp] if j >= bpp else 0
                prior_x = line_above[j]
                raw.append((average_x + (raw_x_bpp + prior_x) // 2) & 255)

        elif filter_type == 4:
            # Filter type 4: Paeth
            # Raw(x) = Paeth(x)
            #          + PaethPredictor(Raw(x-bpp), Prior(x), Prior(x-bpp)),
            # computed mod 256; the same PaethPredictor() is used by both
            # the encoder and the decoder.
            for j, paeth_x in enumerate(line_encoded):
                if j >= bpp:
                    raw_x_bpp = raw[j - bpp]
                    prior_x_bpp = line_above[j - bpp]
                else:
                    raw_x_bpp = prior_x_bpp = 0
                prior_x = line_above[j]
                paeth = paeth_predictor(raw_x_bpp, prior_x, prior_x_bpp)
                raw.append((paeth_x + paeth) & 255)

        else:
            raise PDFValueError("Unsupported predictor value: %d" % filter_type)

        buf.extend(raw)
        line_above = raw
    return bytes(buf)
|
229 |
+
|
230 |
+
|
231 |
+
# Geometry type aliases used throughout the layout code.
Point = Tuple[float, float]  # (x, y)
Rect = Tuple[float, float, float, float]  # (x0, y0, x1, y1)
Matrix = Tuple[float, float, float, float, float, float]  # PDF CTM (a, b, c, d, e, f)
# One segment of a PDF path; the first element is the operator name.
PathSegment = Union[
    Tuple[str],  # Literal['h']
    Tuple[str, float, float],  # Literal['m', 'l']
    Tuple[str, float, float, float, float],  # Literal['v', 'y']
    Tuple[str, float, float, float, float, float, float],
]  # Literal['c']

# Matrix operations
MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0)
|
243 |
+
|
244 |
+
|
245 |
+
def parse_rect(o: Any) -> Rect:
    """Coerce an arbitrary 4-sequence into a ``Rect`` of floats.

    :raises PDFValueError: if ``o`` is not an iterable of exactly four numbers.

    Fix: non-iterable input previously raised an uncaught TypeError; it is
    now reported as PDFValueError like every other malformed rectangle, and
    the original exception is chained for debugging.
    """
    try:
        (x0, y0, x1, y1) = o
        return float(x0), float(y0), float(x1), float(y1)
    except (ValueError, TypeError) as err:
        raise PDFValueError("Could not parse rectangle") from err
|
251 |
+
|
252 |
+
|
253 |
+
def mult_matrix(m1: Matrix, m0: Matrix) -> Matrix:
    """Returns the multiplication of two matrices.

    Fix: the docstring literal previously sat *after* the unpacking
    statements, making it a no-op string expression instead of the
    function's docstring; it is now in the proper position.
    """
    (a1, b1, c1, d1, e1, f1) = m1
    (a0, b0, c0, d0, e0, f0) = m0
    return (
        a0 * a1 + c0 * b1,
        b0 * a1 + d0 * b1,
        a0 * c1 + c0 * d1,
        b0 * c1 + d0 * d1,
        a0 * e1 + c0 * f1 + e0,
        b0 * e1 + d0 * f1 + f0,
    )
|
265 |
+
|
266 |
+
|
267 |
+
def translate_matrix(m: Matrix, v: Point) -> Matrix:
    """Prepend a translation by the vector ``v`` to the matrix ``m``."""
    (a, b, c, d, e, f) = m
    (x, y) = v
    new_e = x * a + y * c + e
    new_f = x * b + y * d + f
    return a, b, c, d, new_e, new_f
|
272 |
+
|
273 |
+
|
274 |
+
def apply_matrix_pt(m: Matrix, v: Point) -> Point:
    """Applies a matrix to a point.

    Fix: the docstring literal previously sat after the unpacking
    statements, making it a no-op string expression instead of the
    function's docstring; it is now in the proper position.
    """
    (a, b, c, d, e, f) = m
    (x, y) = v
    return a * x + c * y + e, b * x + d * y + f
|
279 |
+
|
280 |
+
|
281 |
+
def apply_matrix_norm(m: Matrix, v: Point) -> Point:
    """Apply only the linear part (no translation) of ``m`` to vector ``v``.

    Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0)).
    """
    (a, b, c, d, _e, _f) = m
    (p, q) = v
    return a * p + c * q, b * p + d * q
|
286 |
+
|
287 |
+
|
288 |
+
def matrix_scale(m: Matrix) -> float:
    """Return the scale factor of ``m`` (Euclidean norm of the (a, c) pair)."""
    a, _, c, _, _, _ = m
    return (a**2 + c**2) ** 0.5
|
291 |
+
|
292 |
+
|
293 |
+
# Utility functions
|
294 |
+
|
295 |
+
|
296 |
+
def isnumber(x: object) -> bool:
    """True if ``x`` is an int or a float (note: bools are ints in Python)."""
    return isinstance(x, (int, float))
|
298 |
+
|
299 |
+
|
300 |
+
_T = TypeVar("_T")  # generic element type for the container helpers below
|
301 |
+
|
302 |
+
|
303 |
+
def uniq(objs: Iterable[_T]) -> Iterator[_T]:
    """Yield the elements of ``objs`` in order, skipping duplicates."""
    seen = set()
    for obj in objs:
        if obj not in seen:
            seen.add(obj)
            yield obj
|
311 |
+
|
312 |
+
|
313 |
+
def fsplit(pred: Callable[[_T], bool], objs: Iterable[_T]) -> Tuple[List[_T], List[_T]]:
    """Partition ``objs`` into ``(matching, non_matching)`` lists by ``pred``."""
    matched: List[_T] = []
    unmatched: List[_T] = []
    for obj in objs:
        (matched if pred(obj) else unmatched).append(obj)
    return matched, unmatched
|
323 |
+
|
324 |
+
|
325 |
+
def drange(v0: float, v1: float, d: int) -> range:
    """Return the discrete range of grid indices covering ``[v0, v1)`` with cell size ``d``."""
    start = int(v0) // d
    stop = int(v1 + d) // d
    return range(start, stop)
|
328 |
+
|
329 |
+
|
330 |
+
def get_bound(pts: Iterable[Point]) -> Rect:
    """Compute the minimal rectangle covering all the points.

    With no points the inverted sentinel rectangle (INF, INF, -INF, -INF)
    is returned unchanged.
    """
    x0, y0, x1, y1 = INF, INF, -INF, -INF
    for x, y in pts:
        x0 = min(x0, x)
        y0 = min(y0, y)
        x1 = max(x1, x)
        y1 = max(y1, y)
    return x0, y0, x1, y1
|
340 |
+
|
341 |
+
|
342 |
+
def pick(
    seq: Iterable[_T],
    func: Callable[[_T], float],
    maxobj: Optional[_T] = None,
) -> Optional[_T]:
    """Return the element of ``seq`` maximizing ``func``.

    When ``seq`` is empty, ``maxobj`` (default ``None``) is returned.
    """
    best_score = None
    for candidate in seq:
        score = func(candidate)
        if best_score is None or best_score < score:
            best_score, maxobj = score, candidate
    return maxobj
|
354 |
+
|
355 |
+
|
356 |
+
def choplist(n: int, seq: Iterable[_T]) -> Iterator[Tuple[_T, ...]]:
    """Yield consecutive ``n``-tuples from ``seq``; a trailing partial group is dropped."""
    group: List[_T] = []
    for item in seq:
        group.append(item)
        if len(group) == n:
            yield tuple(group)
            group = []
|
364 |
+
|
365 |
+
|
366 |
+
def nunpack(s: bytes, default: int = 0) -> int:
    """Unpack a big-endian unsigned integer of 1 to 4 or 8 bytes.

    Empty input yields ``default``; any other length raises PDFTypeError.
    """
    length = len(s)
    if not length:
        return default
    if length in (1, 2, 3, 4, 8):
        return int.from_bytes(s, "big")
    raise PDFTypeError("invalid length: %d" % length)
|
383 |
+
|
384 |
+
|
385 |
+
# PDFDocEncoding is mostly the identity chr(i) mapping (Latin-1 upper half
# included); only the positions listed below differ, per the PDF
# specification's PDFDocEncoding table.  0x0000 marks undefined codes.
_PDFDOC_SPECIALS = {
    0x16: 0x0017,  # undefined code folded onto 0x17, matching the original table
    0x18: 0x02D8, 0x19: 0x02C7, 0x1A: 0x02C6, 0x1B: 0x02D9,
    0x1C: 0x02DD, 0x1D: 0x02DB, 0x1E: 0x02DA, 0x1F: 0x02DC,
    0x7F: 0x0000,
    0x80: 0x2022, 0x81: 0x2020, 0x82: 0x2021, 0x83: 0x2026,
    0x84: 0x2014, 0x85: 0x2013, 0x86: 0x0192, 0x87: 0x2044,
    0x88: 0x2039, 0x89: 0x203A, 0x8A: 0x2212, 0x8B: 0x2030,
    0x8C: 0x201E, 0x8D: 0x201C, 0x8E: 0x201D, 0x8F: 0x2018,
    0x90: 0x2019, 0x91: 0x201A, 0x92: 0x2122, 0x93: 0xFB01,
    0x94: 0xFB02, 0x95: 0x0141, 0x96: 0x0152, 0x97: 0x0160,
    0x98: 0x0178, 0x99: 0x017D, 0x9A: 0x0131, 0x9B: 0x0142,
    0x9C: 0x0153, 0x9D: 0x0161, 0x9E: 0x017E, 0x9F: 0x0000,
    0xA0: 0x20AC,
    0xAD: 0x0000,
}
# 256-character lookup string: PDFDocEncoding[byte] -> Unicode character.
PDFDocEncoding = "".join(chr(_PDFDOC_SPECIALS.get(i, i)) for i in range(256))
|
646 |
+
|
647 |
+
|
648 |
+
def decode_text(s: bytes) -> str:
    """Decode a PDF text string: UTF-16BE when BOM-prefixed, else PDFDocEncoding."""
    utf16_bom = b"\xfe\xff"
    if s.startswith(utf16_bom):
        return s[2:].decode("utf-16be", "ignore")
    return "".join(PDFDocEncoding[byte] for byte in s)
|
654 |
+
|
655 |
+
|
656 |
+
def enc(x: str) -> str:
    """Escape a string for SGML/XML/HTML; ``bytes`` input yields an empty string."""
    if isinstance(x, bytes):
        return ""
    return escape(x)
|
661 |
+
|
662 |
+
|
663 |
+
def bbox2str(bbox: Rect) -> str:
    """Format a bounding box as comma-separated coordinates with 3 decimals."""
    return ",".join(f"{coord:.3f}" for coord in bbox)
|
666 |
+
|
667 |
+
|
668 |
+
def matrix2str(m: Matrix) -> str:
    """Format a transform matrix as ``[a,b,c,d, (e,f)]`` with 2 decimals."""
    return "[{:.2f},{:.2f},{:.2f},{:.2f}, ({:.2f},{:.2f})]".format(*m)
|
671 |
+
|
672 |
+
|
673 |
+
def vecBetweenBoxes(obj1: "LTComponent", obj2: "LTComponent") -> Point:
    """A distance function between two TextBoxes.

    Consider the bounding rectangle for obj1 and obj2.
    Return the vector between the two boxes' boundaries if they don't
    overlap, otherwise the vector between the box centers.

    +------+..........+ (x1, y1)
    | obj1 | :
    +------+www+------+
    : | obj2 |
    (x0, y0) +..........+------+
    """
    x0 = min(obj1.x0, obj2.x0)
    y0 = min(obj1.y0, obj2.y0)
    x1 = max(obj1.x1, obj2.x1)
    y1 = max(obj1.y1, obj2.y1)
    # Gap between the boxes along each axis (negative when overlapping).
    gap_w = (x1 - x0) - obj1.width - obj2.width
    gap_h = (y1 - y0) - obj1.height - obj2.height
    if gap_w < 0 and gap_h < 0:
        # Overlap (or containment): use the euclidean center-to-center vector.
        cx1 = (obj1.x0 + obj1.x1) / 2
        cy1 = (obj1.y0 + obj1.y1) / 2
        cx2 = (obj2.x0 + obj2.x1) / 2
        cy2 = (obj2.y0 + obj2.y1) / 2
        return cx1 - cx2, cy1 - cy2
    return max(0, gap_w), max(0, gap_h)
|
697 |
+
|
698 |
+
|
699 |
+
LTComponentT = TypeVar("LTComponentT", bound="LTComponent")  # layout objects stored in a Plane
|
700 |
+
|
701 |
+
|
702 |
+
class Plane(Generic[LTComponentT]):
    """A set-like data structure for objects placed on a plane.

    Can efficiently find objects in a certain rectangular area.
    It maintains two parallel lists of objects, each of
    which is sorted by its x or y coordinate.
    """

    def __init__(self, bbox: Rect, gridsize: int = 50) -> None:
        # Objects are bucketed into gridsize x gridsize cells (_grid) for
        # fast rectangular queries; _seq preserves insertion order.
        self._seq: List[LTComponentT] = []  # preserve the object order.
        self._objs: Set[LTComponentT] = set()
        self._grid: Dict[Point, List[LTComponentT]] = {}
        self.gridsize = gridsize
        (self.x0, self.y0, self.x1, self.y1) = bbox

    def __repr__(self) -> str:
        return "<Plane objs=%r>" % list(self)

    def __iter__(self) -> Iterator[LTComponentT]:
        # Iterate in insertion order, skipping objects removed since then.
        return (obj for obj in self._seq if obj in self._objs)

    def __len__(self) -> int:
        return len(self._objs)

    def __contains__(self, obj: object) -> bool:
        return obj in self._objs

    def _getrange(self, bbox: Rect) -> Iterator[Point]:
        """Yield the grid-cell keys intersecting ``bbox``, clipped to the plane."""
        (x0, y0, x1, y1) = bbox
        # Entirely outside the plane: no cells to visit.
        if x1 <= self.x0 or self.x1 <= x0 or y1 <= self.y0 or self.y1 <= y0:
            return
        x0 = max(self.x0, x0)
        y0 = max(self.y0, y0)
        x1 = min(self.x1, x1)
        y1 = min(self.y1, y1)
        for grid_y in drange(y0, y1, self.gridsize):
            for grid_x in drange(x0, x1, self.gridsize):
                yield (grid_x, grid_y)

    def extend(self, objs: Iterable[LTComponentT]) -> None:
        """Place every object in ``objs``."""
        for obj in objs:
            self.add(obj)

    def add(self, obj: LTComponentT) -> None:
        """Place an object."""
        # Register the object in every grid cell its bbox touches.
        for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
            if k not in self._grid:
                r: List[LTComponentT] = []
                self._grid[k] = r
            else:
                r = self._grid[k]
            r.append(obj)
        self._seq.append(obj)
        self._objs.add(obj)

    def remove(self, obj: LTComponentT) -> None:
        """Displace an object."""
        for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
            try:
                self._grid[k].remove(obj)
            except (KeyError, ValueError):
                # Tolerate objects missing from a cell (e.g. never added there).
                pass
        self._objs.remove(obj)

    def find(self, bbox: Rect) -> Iterator[LTComponentT]:
        """Finds objects that are in a certain area."""
        (x0, y0, x1, y1) = bbox
        done = set()  # an object can appear in several cells; de-duplicate
        for k in self._getrange(bbox):
            if k not in self._grid:
                continue
            for obj in self._grid[k]:
                if obj in done:
                    continue
                done.add(obj)
                # Skip objects sharing a cell without actually intersecting bbox.
                if obj.x1 <= x0 or x1 <= obj.x0 or obj.y1 <= y0 or y1 <= obj.y0:
                    continue
                yield obj
|
780 |
+
|
781 |
+
|
782 |
+
ROMAN_ONES = ["i", "x", "c", "m"]  # "ones" glyph for each decimal place
ROMAN_FIVES = ["v", "l", "d"]  # "fives" glyph for each decimal place


def format_int_roman(value: int) -> str:
    """Format a number (1-3999) as lowercase Roman numerals."""
    assert 0 < value < 4000
    parts: List[str] = []
    place = 0  # decimal place currently being rendered, least significant first

    while value:
        value, digit = divmod(value, 10)
        if digit == 9:
            # Subtractive form, e.g. ix / xc / cm.
            parts.insert(0, ROMAN_ONES[place])
            parts.insert(1, ROMAN_ONES[place + 1])
        elif digit == 4:
            # Subtractive form, e.g. iv / xl / cd.
            parts.insert(0, ROMAN_ONES[place])
            parts.insert(1, ROMAN_FIVES[place])
        elif digit >= 5:
            parts.insert(0, ROMAN_FIVES[place])
            parts.insert(1, ROMAN_ONES[place] * (digit - 5))
        else:
            parts.insert(0, ROMAN_ONES[place] * digit)
        place += 1

    return "".join(parts)
|
809 |
+
|
810 |
+
|
811 |
+
def format_int_alpha(value: int) -> str:
    """Format a positive number in bijective base-26 lowercase letters (a..z, aa..zz, ...)."""
    assert value > 0
    letters: List[str] = []

    while value:
        # Bijective numeration: shift by one so 26 maps to "z", 27 to "aa".
        value, rem = divmod(value - 1, len(string.ascii_lowercase))
        letters.append(string.ascii_lowercase[rem])

    return "".join(reversed(letters))
|
822 |
+
|
823 |
+
|
824 |
+
def get_device():
    """Return the computation device: "cuda:0" when torch sees a CUDA GPU, else "cpu"."""
    try:
        import torch
    except ImportError:
        # No torch installed: CPU is the only option.
        return "cpu"
    return "cuda:0" if torch.cuda.is_available() else "cpu"
|
pyproject.toml
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[project]
|
2 |
+
name = "pdf2zh"
|
3 |
+
version = "1.8.0"
|
4 |
+
description = "Latex PDF Translator"
|
5 |
+
authors = [{ name = "Byaidu", email = "[email protected]" }]
|
6 |
+
license = "AGPL-3.0"
|
7 |
+
readme = "README.md"
|
8 |
+
requires-python = ">=3.9,<3.13"
|
9 |
+
classifiers = [
|
10 |
+
"Programming Language :: Python :: 3",
|
11 |
+
"Operating System :: OS Independent",
|
12 |
+
]
|
13 |
+
dependencies = [
|
14 |
+
"charset-normalizer",
|
15 |
+
"cryptography",
|
16 |
+
"requests",
|
17 |
+
"pymupdf",
|
18 |
+
"tqdm",
|
19 |
+
"tenacity",
|
20 |
+
"numpy",
|
21 |
+
"ollama",
|
22 |
+
"deepl<1.19.1",
|
23 |
+
"openai",
|
24 |
+
"requests",
|
25 |
+
"azure-ai-translation-text<=1.0.1",
|
26 |
+
"gradio",
|
27 |
+
"huggingface_hub",
|
28 |
+
"onnx",
|
29 |
+
"onnxruntime",
|
30 |
+
"opencv-python-headless",
|
31 |
+
]
|
32 |
+
|
33 |
+
[project.optional-dependencies]
|
34 |
+
torch = [
|
35 |
+
"doclayout-yolo",
|
36 |
+
"torch",
|
37 |
+
]
|
38 |
+
dev = [
|
39 |
+
"black",
|
40 |
+
"flake8",
|
41 |
+
"pre-commit"
|
42 |
+
]
|
43 |
+
|
44 |
+
[project.urls]
|
45 |
+
Homepage = "https://github.com/Byaidu/PDFMathTranslate"
|
46 |
+
|
47 |
+
[build-system]
|
48 |
+
requires = ["hatchling"]
|
49 |
+
build-backend = "hatchling.build"
|
50 |
+
|
51 |
+
[project.scripts]
|
52 |
+
pdf2zh = "pdf2zh.pdf2zh:main"
|
setup.cfg
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[flake8]
|
2 |
+
max-line-length = 120
|
3 |
+
ignore = E203,W503,E261
|
4 |
+
exclude = .git,build,dist,docs
|