sanbo committed on
Commit
9b0f4a0
1 Parent(s): a4272c9

update sth. at 2024-11-26 16:15:47

Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.12
+
+ WORKDIR /app
+
+ COPY . .
+
+ ENV PYTHONUNBUFFERED=1
+
+ RUN apt-get update && apt-get install -y libgl1
+
+ RUN pip install .
+
+ CMD ["pdf2zh", "-i"]
Dockerfile.Demo ADDED
@@ -0,0 +1,22 @@
+ FROM python:3.12
+
+ WORKDIR /app
+
+ COPY . .
+
+ ENV PYTHONUNBUFFERED=1
+
+ RUN apt-get update && apt-get install -y libgl1
+
+ RUN pip install .
+
+ RUN mkdir -p /data
+ RUN chmod 777 /data
+ RUN mkdir -p /app
+ RUN chmod 777 /app
+ RUN mkdir -p /.cache
+ RUN chmod 777 /.cache
+ RUN mkdir -p ./gradio_files
+ RUN chmod 777 ./gradio_files
+
+ CMD ["pdf2zh", "-i"]
LICENSE ADDED
@@ -0,0 +1,661 @@
1
+ GNU AFFERO GENERAL PUBLIC LICENSE
2
+ Version 3, 19 November 2007
3
+
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+ Preamble
9
+
10
+ The GNU Affero General Public License is a free, copyleft license for
11
+ software and other kinds of works, specifically designed to ensure
12
+ cooperation with the community in the case of network server software.
13
+
14
+ The licenses for most software and other practical works are designed
15
+ to take away your freedom to share and change the works. By contrast,
16
+ our General Public Licenses are intended to guarantee your freedom to
17
+ share and change all versions of a program--to make sure it remains free
18
+ software for all its users.
19
+
20
+ When we speak of free software, we are referring to freedom, not
21
+ price. Our General Public Licenses are designed to make sure that you
22
+ have the freedom to distribute copies of free software (and charge for
23
+ them if you wish), that you receive source code or can get it if you
24
+ want it, that you can change the software or use pieces of it in new
25
+ free programs, and that you know you can do these things.
26
+
27
+ Developers that use our General Public Licenses protect your rights
28
+ with two steps: (1) assert copyright on the software, and (2) offer
29
+ you this License which gives you legal permission to copy, distribute
30
+ and/or modify the software.
31
+
32
+ A secondary benefit of defending all users' freedom is that
33
+ improvements made in alternate versions of the program, if they
34
+ receive widespread use, become available for other developers to
35
+ incorporate. Many developers of free software are heartened and
36
+ encouraged by the resulting cooperation. However, in the case of
37
+ software used on network servers, this result may fail to come about.
38
+ The GNU General Public License permits making a modified version and
39
+ letting the public access it on a server without ever releasing its
40
+ source code to the public.
41
+
42
+ The GNU Affero General Public License is designed specifically to
43
+ ensure that, in such cases, the modified source code becomes available
44
+ to the community. It requires the operator of a network server to
45
+ provide the source code of the modified version running there to the
46
+ users of that server. Therefore, public use of a modified version, on
47
+ a publicly accessible server, gives the public access to the source
48
+ code of the modified version.
49
+
50
+ An older license, called the Affero General Public License and
51
+ published by Affero, was designed to accomplish similar goals. This is
52
+ a different license, not a version of the Affero GPL, but Affero has
53
+ released a new version of the Affero GPL which permits relicensing under
54
+ this license.
55
+
56
+ The precise terms and conditions for copying, distribution and
57
+ modification follow.
58
+
59
+ TERMS AND CONDITIONS
60
+
61
+ 0. Definitions.
62
+
63
+ "This License" refers to version 3 of the GNU Affero General Public License.
64
+
65
+ "Copyright" also means copyright-like laws that apply to other kinds of
66
+ works, such as semiconductor masks.
67
+
68
+ "The Program" refers to any copyrightable work licensed under this
69
+ License. Each licensee is addressed as "you". "Licensees" and
70
+ "recipients" may be individuals or organizations.
71
+
72
+ To "modify" a work means to copy from or adapt all or part of the work
73
+ in a fashion requiring copyright permission, other than the making of an
74
+ exact copy. The resulting work is called a "modified version" of the
75
+ earlier work or a work "based on" the earlier work.
76
+
77
+ A "covered work" means either the unmodified Program or a work based
78
+ on the Program.
79
+
80
+ To "propagate" a work means to do anything with it that, without
81
+ permission, would make you directly or secondarily liable for
82
+ infringement under applicable copyright law, except executing it on a
83
+ computer or modifying a private copy. Propagation includes copying,
84
+ distribution (with or without modification), making available to the
85
+ public, and in some countries other activities as well.
86
+
87
+ To "convey" a work means any kind of propagation that enables other
88
+ parties to make or receive copies. Mere interaction with a user through
89
+ a computer network, with no transfer of a copy, is not conveying.
90
+
91
+ An interactive user interface displays "Appropriate Legal Notices"
92
+ to the extent that it includes a convenient and prominently visible
93
+ feature that (1) displays an appropriate copyright notice, and (2)
94
+ tells the user that there is no warranty for the work (except to the
95
+ extent that warranties are provided), that licensees may convey the
96
+ work under this License, and how to view a copy of this License. If
97
+ the interface presents a list of user commands or options, such as a
98
+ menu, a prominent item in the list meets this criterion.
99
+
100
+ 1. Source Code.
101
+
102
+ The "source code" for a work means the preferred form of the work
103
+ for making modifications to it. "Object code" means any non-source
104
+ form of a work.
105
+
106
+ A "Standard Interface" means an interface that either is an official
107
+ standard defined by a recognized standards body, or, in the case of
108
+ interfaces specified for a particular programming language, one that
109
+ is widely used among developers working in that language.
110
+
111
+ The "System Libraries" of an executable work include anything, other
112
+ than the work as a whole, that (a) is included in the normal form of
113
+ packaging a Major Component, but which is not part of that Major
114
+ Component, and (b) serves only to enable use of the work with that
115
+ Major Component, or to implement a Standard Interface for which an
116
+ implementation is available to the public in source code form. A
117
+ "Major Component", in this context, means a major essential component
118
+ (kernel, window system, and so on) of the specific operating system
119
+ (if any) on which the executable work runs, or a compiler used to
120
+ produce the work, or an object code interpreter used to run it.
121
+
122
+ The "Corresponding Source" for a work in object code form means all
123
+ the source code needed to generate, install, and (for an executable
124
+ work) run the object code and to modify the work, including scripts to
125
+ control those activities. However, it does not include the work's
126
+ System Libraries, or general-purpose tools or generally available free
127
+ programs which are used unmodified in performing those activities but
128
+ which are not part of the work. For example, Corresponding Source
129
+ includes interface definition files associated with source files for
130
+ the work, and the source code for shared libraries and dynamically
131
+ linked subprograms that the work is specifically designed to require,
132
+ such as by intimate data communication or control flow between those
133
+ subprograms and other parts of the work.
134
+
135
+ The Corresponding Source need not include anything that users
136
+ can regenerate automatically from other parts of the Corresponding
137
+ Source.
138
+
139
+ The Corresponding Source for a work in source code form is that
140
+ same work.
141
+
142
+ 2. Basic Permissions.
143
+
144
+ All rights granted under this License are granted for the term of
145
+ copyright on the Program, and are irrevocable provided the stated
146
+ conditions are met. This License explicitly affirms your unlimited
147
+ permission to run the unmodified Program. The output from running a
148
+ covered work is covered by this License only if the output, given its
149
+ content, constitutes a covered work. This License acknowledges your
150
+ rights of fair use or other equivalent, as provided by copyright law.
151
+
152
+ You may make, run and propagate covered works that you do not
153
+ convey, without conditions so long as your license otherwise remains
154
+ in force. You may convey covered works to others for the sole purpose
155
+ of having them make modifications exclusively for you, or provide you
156
+ with facilities for running those works, provided that you comply with
157
+ the terms of this License in conveying all material for which you do
158
+ not control copyright. Those thus making or running the covered works
159
+ for you must do so exclusively on your behalf, under your direction
160
+ and control, on terms that prohibit them from making any copies of
161
+ your copyrighted material outside their relationship with you.
162
+
163
+ Conveying under any other circumstances is permitted solely under
164
+ the conditions stated below. Sublicensing is not allowed; section 10
165
+ makes it unnecessary.
166
+
167
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168
+
169
+ No covered work shall be deemed part of an effective technological
170
+ measure under any applicable law fulfilling obligations under article
171
+ 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172
+ similar laws prohibiting or restricting circumvention of such
173
+ measures.
174
+
175
+ When you convey a covered work, you waive any legal power to forbid
176
+ circumvention of technological measures to the extent such circumvention
177
+ is effected by exercising rights under this License with respect to
178
+ the covered work, and you disclaim any intention to limit operation or
179
+ modification of the work as a means of enforcing, against the work's
180
+ users, your or third parties' legal rights to forbid circumvention of
181
+ technological measures.
182
+
183
+ 4. Conveying Verbatim Copies.
184
+
185
+ You may convey verbatim copies of the Program's source code as you
186
+ receive it, in any medium, provided that you conspicuously and
187
+ appropriately publish on each copy an appropriate copyright notice;
188
+ keep intact all notices stating that this License and any
189
+ non-permissive terms added in accord with section 7 apply to the code;
190
+ keep intact all notices of the absence of any warranty; and give all
191
+ recipients a copy of this License along with the Program.
192
+
193
+ You may charge any price or no price for each copy that you convey,
194
+ and you may offer support or warranty protection for a fee.
195
+
196
+ 5. Conveying Modified Source Versions.
197
+
198
+ You may convey a work based on the Program, or the modifications to
199
+ produce it from the Program, in the form of source code under the
200
+ terms of section 4, provided that you also meet all of these conditions:
201
+
202
+ a) The work must carry prominent notices stating that you modified
203
+ it, and giving a relevant date.
204
+
205
+ b) The work must carry prominent notices stating that it is
206
+ released under this License and any conditions added under section
207
+ 7. This requirement modifies the requirement in section 4 to
208
+ "keep intact all notices".
209
+
210
+ c) You must license the entire work, as a whole, under this
211
+ License to anyone who comes into possession of a copy. This
212
+ License will therefore apply, along with any applicable section 7
213
+ additional terms, to the whole of the work, and all its parts,
214
+ regardless of how they are packaged. This License gives no
215
+ permission to license the work in any other way, but it does not
216
+ invalidate such permission if you have separately received it.
217
+
218
+ d) If the work has interactive user interfaces, each must display
219
+ Appropriate Legal Notices; however, if the Program has interactive
220
+ interfaces that do not display Appropriate Legal Notices, your
221
+ work need not make them do so.
222
+
223
+ A compilation of a covered work with other separate and independent
224
+ works, which are not by their nature extensions of the covered work,
225
+ and which are not combined with it such as to form a larger program,
226
+ in or on a volume of a storage or distribution medium, is called an
227
+ "aggregate" if the compilation and its resulting copyright are not
228
+ used to limit the access or legal rights of the compilation's users
229
+ beyond what the individual works permit. Inclusion of a covered work
230
+ in an aggregate does not cause this License to apply to the other
231
+ parts of the aggregate.
232
+
233
+ 6. Conveying Non-Source Forms.
234
+
235
+ You may convey a covered work in object code form under the terms
236
+ of sections 4 and 5, provided that you also convey the
237
+ machine-readable Corresponding Source under the terms of this License,
238
+ in one of these ways:
239
+
240
+ a) Convey the object code in, or embodied in, a physical product
241
+ (including a physical distribution medium), accompanied by the
242
+ Corresponding Source fixed on a durable physical medium
243
+ customarily used for software interchange.
244
+
245
+ b) Convey the object code in, or embodied in, a physical product
246
+ (including a physical distribution medium), accompanied by a
247
+ written offer, valid for at least three years and valid for as
248
+ long as you offer spare parts or customer support for that product
249
+ model, to give anyone who possesses the object code either (1) a
250
+ copy of the Corresponding Source for all the software in the
251
+ product that is covered by this License, on a durable physical
252
+ medium customarily used for software interchange, for a price no
253
+ more than your reasonable cost of physically performing this
254
+ conveying of source, or (2) access to copy the
255
+ Corresponding Source from a network server at no charge.
256
+
257
+ c) Convey individual copies of the object code with a copy of the
258
+ written offer to provide the Corresponding Source. This
259
+ alternative is allowed only occasionally and noncommercially, and
260
+ only if you received the object code with such an offer, in accord
261
+ with subsection 6b.
262
+
263
+ d) Convey the object code by offering access from a designated
264
+ place (gratis or for a charge), and offer equivalent access to the
265
+ Corresponding Source in the same way through the same place at no
266
+ further charge. You need not require recipients to copy the
267
+ Corresponding Source along with the object code. If the place to
268
+ copy the object code is a network server, the Corresponding Source
269
+ may be on a different server (operated by you or a third party)
270
+ that supports equivalent copying facilities, provided you maintain
271
+ clear directions next to the object code saying where to find the
272
+ Corresponding Source. Regardless of what server hosts the
273
+ Corresponding Source, you remain obligated to ensure that it is
274
+ available for as long as needed to satisfy these requirements.
275
+
276
+ e) Convey the object code using peer-to-peer transmission, provided
277
+ you inform other peers where the object code and Corresponding
278
+ Source of the work are being offered to the general public at no
279
+ charge under subsection 6d.
280
+
281
+ A separable portion of the object code, whose source code is excluded
282
+ from the Corresponding Source as a System Library, need not be
283
+ included in conveying the object code work.
284
+
285
+ A "User Product" is either (1) a "consumer product", which means any
286
+ tangible personal property which is normally used for personal, family,
287
+ or household purposes, or (2) anything designed or sold for incorporation
288
+ into a dwelling. In determining whether a product is a consumer product,
289
+ doubtful cases shall be resolved in favor of coverage. For a particular
290
+ product received by a particular user, "normally used" refers to a
291
+ typical or common use of that class of product, regardless of the status
292
+ of the particular user or of the way in which the particular user
293
+ actually uses, or expects or is expected to use, the product. A product
294
+ is a consumer product regardless of whether the product has substantial
295
+ commercial, industrial or non-consumer uses, unless such uses represent
296
+ the only significant mode of use of the product.
297
+
298
+ "Installation Information" for a User Product means any methods,
299
+ procedures, authorization keys, or other information required to install
300
+ and execute modified versions of a covered work in that User Product from
301
+ a modified version of its Corresponding Source. The information must
302
+ suffice to ensure that the continued functioning of the modified object
303
+ code is in no case prevented or interfered with solely because
304
+ modification has been made.
305
+
306
+ If you convey an object code work under this section in, or with, or
307
+ specifically for use in, a User Product, and the conveying occurs as
308
+ part of a transaction in which the right of possession and use of the
309
+ User Product is transferred to the recipient in perpetuity or for a
310
+ fixed term (regardless of how the transaction is characterized), the
311
+ Corresponding Source conveyed under this section must be accompanied
312
+ by the Installation Information. But this requirement does not apply
313
+ if neither you nor any third party retains the ability to install
314
+ modified object code on the User Product (for example, the work has
315
+ been installed in ROM).
316
+
317
+ The requirement to provide Installation Information does not include a
318
+ requirement to continue to provide support service, warranty, or updates
319
+ for a work that has been modified or installed by the recipient, or for
320
+ the User Product in which it has been modified or installed. Access to a
321
+ network may be denied when the modification itself materially and
322
+ adversely affects the operation of the network or violates the rules and
323
+ protocols for communication across the network.
324
+
325
+ Corresponding Source conveyed, and Installation Information provided,
326
+ in accord with this section must be in a format that is publicly
327
+ documented (and with an implementation available to the public in
328
+ source code form), and must require no special password or key for
329
+ unpacking, reading or copying.
330
+
331
+ 7. Additional Terms.
332
+
333
+ "Additional permissions" are terms that supplement the terms of this
334
+ License by making exceptions from one or more of its conditions.
335
+ Additional permissions that are applicable to the entire Program shall
336
+ be treated as though they were included in this License, to the extent
337
+ that they are valid under applicable law. If additional permissions
338
+ apply only to part of the Program, that part may be used separately
339
+ under those permissions, but the entire Program remains governed by
340
+ this License without regard to the additional permissions.
341
+
342
+ When you convey a copy of a covered work, you may at your option
343
+ remove any additional permissions from that copy, or from any part of
344
+ it. (Additional permissions may be written to require their own
345
+ removal in certain cases when you modify the work.) You may place
346
+ additional permissions on material, added by you to a covered work,
347
+ for which you have or can give appropriate copyright permission.
348
+
349
+ Notwithstanding any other provision of this License, for material you
350
+ add to a covered work, you may (if authorized by the copyright holders of
351
+ that material) supplement the terms of this License with terms:
352
+
353
+ a) Disclaiming warranty or limiting liability differently from the
354
+ terms of sections 15 and 16 of this License; or
355
+
356
+ b) Requiring preservation of specified reasonable legal notices or
357
+ author attributions in that material or in the Appropriate Legal
358
+ Notices displayed by works containing it; or
359
+
360
+ c) Prohibiting misrepresentation of the origin of that material, or
361
+ requiring that modified versions of such material be marked in
362
+ reasonable ways as different from the original version; or
363
+
364
+ d) Limiting the use for publicity purposes of names of licensors or
365
+ authors of the material; or
366
+
367
+ e) Declining to grant rights under trademark law for use of some
368
+ trade names, trademarks, or service marks; or
369
+
370
+ f) Requiring indemnification of licensors and authors of that
371
+ material by anyone who conveys the material (or modified versions of
372
+ it) with contractual assumptions of liability to the recipient, for
373
+ any liability that these contractual assumptions directly impose on
374
+ those licensors and authors.
375
+
376
+ All other non-permissive additional terms are considered "further
377
+ restrictions" within the meaning of section 10. If the Program as you
378
+ received it, or any part of it, contains a notice stating that it is
379
+ governed by this License along with a term that is a further
380
+ restriction, you may remove that term. If a license document contains
381
+ a further restriction but permits relicensing or conveying under this
382
+ License, you may add to a covered work material governed by the terms
383
+ of that license document, provided that the further restriction does
384
+ not survive such relicensing or conveying.
385
+
386
+ If you add terms to a covered work in accord with this section, you
387
+ must place, in the relevant source files, a statement of the
388
+ additional terms that apply to those files, or a notice indicating
389
+ where to find the applicable terms.
390
+
391
+ Additional terms, permissive or non-permissive, may be stated in the
392
+ form of a separately written license, or stated as exceptions;
393
+ the above requirements apply either way.
394
+
395
+ 8. Termination.
396
+
397
+ You may not propagate or modify a covered work except as expressly
398
+ provided under this License. Any attempt otherwise to propagate or
399
+ modify it is void, and will automatically terminate your rights under
400
+ this License (including any patent licenses granted under the third
401
+ paragraph of section 11).
402
+
403
+ However, if you cease all violation of this License, then your
404
+ license from a particular copyright holder is reinstated (a)
405
+ provisionally, unless and until the copyright holder explicitly and
406
+ finally terminates your license, and (b) permanently, if the copyright
407
+ holder fails to notify you of the violation by some reasonable means
408
+ prior to 60 days after the cessation.
409
+
410
+ Moreover, your license from a particular copyright holder is
411
+ reinstated permanently if the copyright holder notifies you of the
412
+ violation by some reasonable means, this is the first time you have
413
+ received notice of violation of this License (for any work) from that
414
+ copyright holder, and you cure the violation prior to 30 days after
415
+ your receipt of the notice.
416
+
417
+ Termination of your rights under this section does not terminate the
418
+ licenses of parties who have received copies or rights from you under
419
+ this License. If your rights have been terminated and not permanently
420
+ reinstated, you do not qualify to receive new licenses for the same
421
+ material under section 10.
422
+
423
+ 9. Acceptance Not Required for Having Copies.
424
+
425
+ You are not required to accept this License in order to receive or
426
+ run a copy of the Program. Ancillary propagation of a covered work
427
+ occurring solely as a consequence of using peer-to-peer transmission
428
+ to receive a copy likewise does not require acceptance. However,
429
+ nothing other than this License grants you permission to propagate or
430
+ modify any covered work. These actions infringe copyright if you do
431
+ not accept this License. Therefore, by modifying or propagating a
432
+ covered work, you indicate your acceptance of this License to do so.
433
+
434
+ 10. Automatic Licensing of Downstream Recipients.
435
+
436
+ Each time you convey a covered work, the recipient automatically
437
+ receives a license from the original licensors, to run, modify and
438
+ propagate that work, subject to this License. You are not responsible
439
+ for enforcing compliance by third parties with this License.
440
+
441
+ An "entity transaction" is a transaction transferring control of an
442
+ organization, or substantially all assets of one, or subdividing an
443
+ organization, or merging organizations. If propagation of a covered
444
+ work results from an entity transaction, each party to that
445
+ transaction who receives a copy of the work also receives whatever
446
+ licenses to the work the party's predecessor in interest had or could
447
+ give under the previous paragraph, plus a right to possession of the
448
+ Corresponding Source of the work from the predecessor in interest, if
449
+ the predecessor has it or can get it with reasonable efforts.
450
+
451
+ You may not impose any further restrictions on the exercise of the
452
+ rights granted or affirmed under this License. For example, you may
453
+ not impose a license fee, royalty, or other charge for exercise of
454
+ rights granted under this License, and you may not initiate litigation
455
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
456
+ any patent claim is infringed by making, using, selling, offering for
457
+ sale, or importing the Program or any portion of it.
458
+
459
+ 11. Patents.
460
+
461
+ A "contributor" is a copyright holder who authorizes use under this
462
+ License of the Program or a work on which the Program is based. The
463
+ work thus licensed is called the contributor's "contributor version".
464
+
465
+ A contributor's "essential patent claims" are all patent claims
466
+ owned or controlled by the contributor, whether already acquired or
467
+ hereafter acquired, that would be infringed by some manner, permitted
468
+ by this License, of making, using, or selling its contributor version,
469
+ but do not include claims that would be infringed only as a
470
+ consequence of further modification of the contributor version. For
471
+ purposes of this definition, "control" includes the right to grant
472
+ patent sublicenses in a manner consistent with the requirements of
473
+ this License.
474
+
475
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
476
+ patent license under the contributor's essential patent claims, to
477
+ make, use, sell, offer for sale, import and otherwise run, modify and
478
+ propagate the contents of its contributor version.
479
+
480
+ In the following three paragraphs, a "patent license" is any express
481
+ agreement or commitment, however denominated, not to enforce a patent
482
+ (such as an express permission to practice a patent or covenant not to
483
+ sue for patent infringement). To "grant" such a patent license to a
484
+ party means to make such an agreement or commitment not to enforce a
485
+ patent against the party.
486
+
487
+ If you convey a covered work, knowingly relying on a patent license,
488
+ and the Corresponding Source of the work is not available for anyone
489
+ to copy, free of charge and under the terms of this License, through a
490
+ publicly available network server or other readily accessible means,
491
+ then you must either (1) cause the Corresponding Source to be so
492
+ available, or (2) arrange to deprive yourself of the benefit of the
493
+ patent license for this particular work, or (3) arrange, in a manner
494
+ consistent with the requirements of this License, to extend the patent
495
+ license to downstream recipients. "Knowingly relying" means you have
496
+ actual knowledge that, but for the patent license, your conveying the
497
+ covered work in a country, or your recipient's use of the covered work
498
+ in a country, would infringe one or more identifiable patents in that
499
+ country that you have reason to believe are valid.
500
+
501
+ If, pursuant to or in connection with a single transaction or
502
+ arrangement, you convey, or propagate by procuring conveyance of, a
503
+ covered work, and grant a patent license to some of the parties
504
+ receiving the covered work authorizing them to use, propagate, modify
505
+ or convey a specific copy of the covered work, then the patent license
506
+ you grant is automatically extended to all recipients of the covered
507
+ work and works based on it.
508
+
509
+ A patent license is "discriminatory" if it does not include within
510
+ the scope of its coverage, prohibits the exercise of, or is
511
+ conditioned on the non-exercise of one or more of the rights that are
512
+ specifically granted under this License. You may not convey a covered
513
+ work if you are a party to an arrangement with a third party that is
514
+ in the business of distributing software, under which you make payment
515
+ to the third party based on the extent of your activity of conveying
516
+ the work, and under which the third party grants, to any of the
517
+ parties who would receive the covered work from you, a discriminatory
518
+ patent license (a) in connection with copies of the covered work
519
+ conveyed by you (or copies made from those copies), or (b) primarily
520
+ for and in connection with specific products or compilations that
521
+ contain the covered work, unless you entered into that arrangement,
522
+ or that patent license was granted, prior to 28 March 2007.
523
+
524
+ Nothing in this License shall be construed as excluding or limiting
525
+ any implied license or other defenses to infringement that may
526
+ otherwise be available to you under applicable patent law.
527
+
528
+ 12. No Surrender of Others' Freedom.
529
+
530
+ If conditions are imposed on you (whether by court order, agreement or
531
+ otherwise) that contradict the conditions of this License, they do not
532
+ excuse you from the conditions of this License. If you cannot convey a
533
+ covered work so as to satisfy simultaneously your obligations under this
534
+ License and any other pertinent obligations, then as a consequence you may
535
+ not convey it at all. For example, if you agree to terms that obligate you
536
+ to collect a royalty for further conveying from those to whom you convey
537
+ the Program, the only way you could satisfy both those terms and this
538
+ License would be to refrain entirely from conveying the Program.
539
+
540
+ 13. Remote Network Interaction; Use with the GNU General Public License.
541
+
542
+ Notwithstanding any other provision of this License, if you modify the
543
+ Program, your modified version must prominently offer all users
544
+ interacting with it remotely through a computer network (if your version
545
+ supports such interaction) an opportunity to receive the Corresponding
546
+ Source of your version by providing access to the Corresponding Source
547
+ from a network server at no charge, through some standard or customary
548
+ means of facilitating copying of software. This Corresponding Source
549
+ shall include the Corresponding Source for any work covered by version 3
550
+ of the GNU General Public License that is incorporated pursuant to the
551
+ following paragraph.
552
+
553
+ Notwithstanding any other provision of this License, you have
554
+ permission to link or combine any covered work with a work licensed
555
+ under version 3 of the GNU General Public License into a single
556
+ combined work, and to convey the resulting work. The terms of this
557
+ License will continue to apply to the part which is the covered work,
558
+ but the work with which it is combined will remain governed by version
559
+ 3 of the GNU General Public License.
560
+
561
+ 14. Revised Versions of this License.
562
+
563
+ The Free Software Foundation may publish revised and/or new versions of
564
+ the GNU Affero General Public License from time to time. Such new versions
565
+ will be similar in spirit to the present version, but may differ in detail to
566
+ address new problems or concerns.
567
+
568
+ Each version is given a distinguishing version number. If the
569
+ Program specifies that a certain numbered version of the GNU Affero General
570
+ Public License "or any later version" applies to it, you have the
571
+ option of following the terms and conditions either of that numbered
572
+ version or of any later version published by the Free Software
573
+ Foundation. If the Program does not specify a version number of the
574
+ GNU Affero General Public License, you may choose any version ever published
575
+ by the Free Software Foundation.
576
+
577
+ If the Program specifies that a proxy can decide which future
578
+ versions of the GNU Affero General Public License can be used, that proxy's
579
+ public statement of acceptance of a version permanently authorizes you
580
+ to choose that version for the Program.
581
+
582
+ Later license versions may give you additional or different
583
+ permissions. However, no additional obligations are imposed on any
584
+ author or copyright holder as a result of your choosing to follow a
585
+ later version.
586
+
587
+ 15. Disclaimer of Warranty.
588
+
589
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590
+ APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591
+ HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592
+ OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595
+ IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596
+ ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597
+
598
+ 16. Limitation of Liability.
599
+
600
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602
+ THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603
+ GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604
+ USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605
+ DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606
+ PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607
+ EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608
+ SUCH DAMAGES.
609
+
610
+ 17. Interpretation of Sections 15 and 16.
611
+
612
+ If the disclaimer of warranty and limitation of liability provided
613
+ above cannot be given local legal effect according to their terms,
614
+ reviewing courts shall apply local law that most closely approximates
615
+ an absolute waiver of all civil liability in connection with the
616
+ Program, unless a warranty or assumption of liability accompanies a
617
+ copy of the Program in return for a fee.
618
+
619
+ END OF TERMS AND CONDITIONS
620
+
621
+ How to Apply These Terms to Your New Programs
622
+
623
+ If you develop a new program, and you want it to be of the greatest
624
+ possible use to the public, the best way to achieve this is to make it
625
+ free software which everyone can redistribute and change under these terms.
626
+
627
+ To do so, attach the following notices to the program. It is safest
628
+ to attach them to the start of each source file to most effectively
629
+ state the exclusion of warranty; and each file should have at least
630
+ the "copyright" line and a pointer to where the full notice is found.
631
+
632
+ <one line to give the program's name and a brief idea of what it does.>
633
+ Copyright (C) <year> <name of author>
634
+
635
+ This program is free software: you can redistribute it and/or modify
636
+ it under the terms of the GNU Affero General Public License as published
637
+ by the Free Software Foundation, either version 3 of the License, or
638
+ (at your option) any later version.
639
+
640
+ This program is distributed in the hope that it will be useful,
641
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
642
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
643
+ GNU Affero General Public License for more details.
644
+
645
+ You should have received a copy of the GNU Affero General Public License
646
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
647
+
648
+ Also add information on how to contact you by electronic and paper mail.
649
+
650
+ If your software can interact with users remotely through a computer
651
+ network, you should also make sure that it provides a way for users to
652
+ get its source. For example, if your program is a web application, its
653
+ interface could display a "Source" link that leads users to an archive
654
+ of the code. There are many ways you could offer source, and different
655
+ solutions will be better for different programs; see section 13 for the
656
+ specific requirements.
657
+
658
+ You should also get your employer (if you work as a programmer) or school,
659
+ if any, to sign a "copyright disclaimer" for the program, if necessary.
660
+ For more information on this, and how to apply and follow the GNU AGPL, see
661
+ <https://www.gnu.org/licenses/>.
README.md CHANGED
@@ -9,3 +9,271 @@ short_description: pdf翻译
  ---
 
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # PDFTranslate
+
+ Scientific PDF document translation with bilingual side-by-side output.
+
+ - 📊 Preserves formulas, figures, tables of contents, and annotations *([preview](#preview))*
+ - 🌐 Supports [multiple languages](#language) and [many translation services](#services)
+ - 🤖 Provides a [command-line tool](#usage), a [graphical user interface](#gui), and [containerized deployment](#docker)
+
+ Feedback is welcome in [GitHub Issues](https://github.com/Byaidu/PDFMathTranslate/issues) or the [Telegram user group](https://t.me/+Z9_SgnxmsmA5NzBl).
+
+ <h2 id="updates">Recent Updates</h2>
+
+ - [Nov. 26 2024] The CLI now supports (multiple) online PDF files *(by [@reycn](https://github.com/reycn))*
+ - [Nov. 24 2024] [ONNX](https://github.com/onnx/onnx) support added to reduce the dependency size *(by [@Wybxc](https://github.com/Wybxc))*
+ - [Nov. 23 2024] 🌟 The [free public service](#demo) is online! *(by [@Byaidu](https://github.com/Byaidu))*
+ - [Nov. 23 2024] Added a firewall against web crawlers *(by [@Byaidu](https://github.com/Byaidu))*
+ - [Nov. 22 2024] The GUI now supports Italian and has received several updates *(by [@Byaidu](https://github.com/Byaidu), [@reycn](https://github.com/reycn))*
+ - [Nov. 22 2024] You can now share a deployment you host with friends *(by [@Zxis233](https://github.com/Zxis233))*
+ - [Nov. 22 2024] Added support for Tencent translation *(by [@hellofinch](https://github.com/hellofinch))*
+ - [Nov. 21 2024] The GUI now supports downloading the bilingual document *(by [@reycn](https://github.com/reycn))*
+ - [Nov. 20 2024] 🌟 An [online demo](#demo) is available! *(by [@reycn](https://github.com/reycn))*
+
+ <h2 id="preview">Preview</h2>
+
+ ![](https://raw.githubusercontent.com/hhhaiai/Picture/main/img/202411261612975.gif)
+
+ <h2 id="demo">Online Demo 🌟</h2>
+
+ ### Free service (<https://pdf2zh.com/>)
+
+ You can try the [free public service](https://pdf2zh.com/) right away, with no installation required.
+
+ ### Hugging Face demo
+
+ You can try the [online demo on Hugging Face](https://huggingface.co/spaces/reycn/PDFMathTranslate-Docker) without installing anything.
+ Note that the demo's compute resources are limited, so please do not abuse them.
+
+ <h2 id="install">Installation and Usage</h2>
+
+ We provide three ways to use this project: the [command-line tool](#cmd), the [graphical user interface](#gui), and [containerized deployment](#docker).
+
+ <h3 id="cmd">Method 1: Command-line tool</h3>
+
+ 1. Make sure you have Python installed (version greater than 3.8 and lower than 3.12)
+ 2. Install the program:
+
+ ```bash
+ pip install pdf2zh
+ ```
+
+ 3. Start using it:
+
+ ```bash
+ pdf2zh document.pdf
+ ```
+
+ <h3 id="gui">Method 2: Graphical user interface</h3>
+
+ 1. Make sure you have Python installed (version greater than 3.8 and lower than 3.12)
+ 2. Install the program:
+
+ ```bash
+ pip install pdf2zh
+ ```
+
+ 3. Start using it in your browser:
+
+ ```bash
+ pdf2zh -i
+ ```
+
+ 4. If your browser does not start and open the page automatically, open:
+
+ ```bash
+ http://localhost:7860/
+ ```
+
+ ![](https://raw.githubusercontent.com/hhhaiai/Picture/main/img/202411261614075.gif)
+
+ See the [documentation for GUI](./docs/README_GUI.md) for details.
+
+ <h3 id="docker">Method 3: Containerized deployment</h3>
+
+ 1. Pull the Docker image and run it:
+
+ ```bash
+ docker pull byaidu/pdf2zh
+ docker run -d -p 7860:7860 byaidu/pdf2zh
+ ```
+
+ 2. Open it in your browser:
+
+ ```
+ http://localhost:7860/
+ ```
+
+ To deploy the container image on a cloud service:
+
+ <a href="https://www.heroku.com/deploy?template=https://github.com/Byaidu/PDFMathTranslate">
+ <img src="https://www.herokucdn.com/deploy/button.svg" alt="Deploy" height="26"></a>
+
+ <a href="https://render.com/deploy">
+ <img src="https://render.com/images/deploy-to-render-button.svg" alt="Deploy to Render" height="26"></a>
+
+ <a href="https://zeabur.com/templates/5FQIGX?referralCode=reycn">
+ <img src="https://zeabur.com/button.svg" alt="Deploy on Zeabur" height="26"></a>
+
+ <a href="https://app.koyeb.com/deploy?type=git&builder=buildpack&repository=github.com/Byaidu/PDFMathTranslate&branch=main&name=pdf-math-translate">
+ <img src="https://www.koyeb.com/static/images/deploy/button.svg" alt="Deploy to Koyeb" height="26"></a>
+
+ <h2 id="usage">Advanced Options</h2>
+
+ Run the translation command in the terminal to generate the translated document `example-zh.pdf` and the bilingual document `example-dual.pdf`. Google Translate is used by default.
+
+ ![](https://raw.githubusercontent.com/hhhaiai/Picture/main/img/202411261614851.png)
+
+ All advanced options are listed in the table below for reference:
+
+ | Option | Function | Example |
+ | -------- | ------- |------- |
+ | (files) | Local (multiple) files | `pdf2zh ~/local.pdf` |
+ | | Online (multiple) files | `pdf2zh http://web.com/online.pdf` |
+ | `-i` | [Enter the GUI](#gui) | `pdf2zh -i` |
+ | `-p` | [Translate only part of the document](#partial) | `pdf2zh example.pdf -p 1` |
+ | `-li` | [Source language](#language) | `pdf2zh example.pdf -li en` |
+ | `-lo` | [Target language](#language) | `pdf2zh example.pdf -lo zh` |
+ | `-s` | [Translation service](#services) | `pdf2zh example.pdf -s deepl` |
+ | `-t` | [Multi-threading](#threads) | `pdf2zh example.pdf -t 1` |
+ | `-f`, `-c` | [Exception rules](#exceptions) | `pdf2zh example.pdf -f "(MS.*)"` |
+
+ Some services require environment variables to be set. For details on setting environment variables, see [ChatGPT](https://chatgpt.com/share/6734a83d-9d48-800e-8a46-f57ca6e8bcb4).
+
+ <h3 id="partial">Translating the whole document or part of it</h3>
+
+ - **Whole document**
+
+ ```bash
+ pdf2zh example.pdf
+ ```
+
+ - **Part of the document**
+
+ ```bash
+ pdf2zh example.pdf -p 1-3,5
+ ```
+
+ <h3 id="language">Specifying source and target languages</h3>
+
+ See [Google Languages Codes](https://developers.google.com/admin-sdk/directory/v1/languages) and [DeepL Languages Codes](https://developers.deepl.com/docs/resources/supported-languages)
+
+ ```bash
+ pdf2zh example.pdf -li en -lo ja
+ ```
+
+ <h3 id="services">Using different translation services</h3>
+
+ - **DeepL**
+
+ See [DeepL](https://support.deepl.com/hc/en-us/articles/360020695820-API-Key-for-DeepL-s-API)
+
+ Set the environment variables that build the endpoint `{DEEPL_SERVER_URL}/translate`:
+
+ - `DEEPL_SERVER_URL` (optional), e.g., `export DEEPL_SERVER_URL=https://api.deepl.com`
+ - `DEEPL_AUTH_KEY`, e.g., `export DEEPL_AUTH_KEY=xxx`
+
+ ```bash
+ pdf2zh example.pdf -s deepl
+ ```
+
+ - **DeepLX**
+
+ See [DeepLX](https://github.com/OwO-Network/DeepLX)
+
+ Set the environment variables that build the endpoint `{DEEPLX_SERVER_URL}/translate`:
+
+ - `DEEPLX_SERVER_URL` (optional), e.g., `export DEEPLX_SERVER_URL=https://api.deepl.com`
+ - `DEEPLX_AUTH_KEY`, e.g., `export DEEPLX_AUTH_KEY=xxx`
+
+ ```bash
+ pdf2zh example.pdf -s deeplx
+ ```
+
+ - **Ollama**
+
+ See [Ollama](https://github.com/ollama/ollama)
+
+ Set the environment variable that builds the endpoint `{OLLAMA_HOST}/api/chat`:
+
+ - `OLLAMA_HOST` (optional), e.g., `export OLLAMA_HOST=http://localhost:11434`
+
+ ```bash
+ pdf2zh example.pdf -s ollama:gemma2
+ ```
+
+ - **LLMs compatible with the OpenAI protocol (e.g., OpenAI, SiliconCloud, Zhipu)**
+
+ See [SiliconCloud](https://docs.siliconflow.cn/quickstart), [Zhipu](https://open.bigmodel.cn/dev/api/thirdparty-frame/openai-sdk)
+
+ Set the environment variables that build the endpoint `{OPENAI_BASE_URL}/chat/completions`:
+
+ - `OPENAI_BASE_URL` (optional), e.g., `export OPENAI_BASE_URL=https://api.openai.com/v1`
+ - `OPENAI_API_KEY`, e.g., `export OPENAI_API_KEY=xxx`
+
+ ```bash
+ pdf2zh example.pdf -s openai:gpt-4o
+ ```
+
+ - **Azure**
+
+ See [Azure Text Translation](https://docs.azure.cn/en-us/ai-services/translator/text-translation-overview)
+
+ The following environment variables must be set:
+
+ - `AZURE_APIKEY`, e.g., `export AZURE_APIKEY=xxx`
+ - `AZURE_ENDPOINT`, e.g., `export AZURE_ENDPOINT=https://api.translator.azure.cn/`
+ - `AZURE_REGION`, e.g., `export AZURE_REGION=chinaeast2`
+
+ ```bash
+ pdf2zh example.pdf -s azure
+ ```
+
+ - **Tencent Machine Translation**
+
+ See [Tencent Machine Translation](https://cloud.tencent.com/product/tmt)
+
+ The following environment variables must be set:
+
+ - `TENCENT_SECRET_ID`, e.g., `export TENCENT_SECRET_ID=AKIDxxx`
+ - `TENCENT_SECRET_KEY`, e.g., `export TENCENT_SECRET_KEY=xxx`
+
+ ```bash
+ pdf2zh example.pdf -s tmt
+ ```
+
+ <h3 id="exceptions">Specifying exception rules</h3>
+
+ Use regular expressions to specify the formula fonts and characters that should be preserved:
+
+ ```bash
+ pdf2zh example.pdf -f "(CM[^RT].*|MS.*|.*Ital)" -c "(\(|\||\)|\+|=|\d|[\u0080-\ufaff])"
+ ```
+
+ <h3 id="threads">Specifying the number of threads</h3>
+
+ Use `-t` to specify the number of threads used during translation:
+
+ ```bash
+ pdf2zh example.pdf -t 1
+ ```
+
+ <h2 id="acknowledgement">Acknowledgements</h2>
+
+ - Document merging: [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
+ - Document parsing: [Pdfminer.six](https://github.com/pdfminer/pdfminer.six)
+ - Document extraction: [MinerU](https://github.com/opendatalab/MinerU)
+ - Multi-threaded translation: [MathTranslate](https://github.com/SUSYUSTC/MathTranslate)
+ - Layout parsing: [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
+ - Document standards: [PDF Explained](https://zxyle.github.io/PDF-Explained/), [PDF Cheat Sheets](https://pdfa.org/resource/pdf-cheat-sheets/)
+
app.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "name": "PDFMathTranslate",
+   "description": "PDF scientific paper translation and bilingual comparison.",
+   "repository": "https://github.com/Byaidu/PDFMathTranslate"
+ }
docs/README_GUI.md ADDED
@@ -0,0 +1,18 @@
+ # Interact with GUI
+
+ This subfolder provides the GUI mode of `pdf2zh`.
+
+ ## Usage
+
+ 1. Run `pdf2zh -i`
+
+ 2. Drop the PDF file into the window and click `Translate`.
+
+ ## Preview
+
+ <img src="./images/before.png" width="500"/>
+ <img src="./images/after.png" width="500"/>
+
+ ## Maintenance
+
+ GUI is maintained by [Rongxin](https://github.com/reycn).
docs/licenses/LICENSE.pdfminer.six ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2004-2016 Yusuke Shinyama <yusuke at shinyama dot jp>
2
+
3
+ Permission is hereby granted, free of charge, to any person
4
+ obtaining a copy of this software and associated documentation
5
+ files (the "Software"), to deal in the Software without
6
+ restriction, including without limitation the rights to use,
7
+ copy, modify, merge, publish, distribute, sublicense, and/or
8
+ sell copies of the Software, and to permit persons to whom the
9
+ Software is furnished to do so, subject to the following
10
+ conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
16
+ KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
17
+ WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
18
+ PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
19
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
docs/licenses/LICENSE.pyHanko ADDED
@@ -0,0 +1,23 @@
1
+ This package contains various elements based on code from the pyHanko project, of which we reproduce the license below.
2
+
3
+ MIT License
4
+
5
+ Copyright (c) 2020 Matthias Valvekens
6
+
7
+ Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ of this software and associated documentation files (the "Software"), to deal
9
+ in the Software without restriction, including without limitation the rights
10
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the Software is
12
+ furnished to do so, subject to the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be included in all
15
+ copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ SOFTWARE.
pdf2zh/__init__.py ADDED
@@ -0,0 +1,2 @@
+ __version__ = "1.8.0"
+ __author__ = "Byaidu"
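A trivial sketch of how this package metadata surfaces at runtime (assuming the package as committed above is importable from the working environment):

```python
import pdf2zh

# Metadata defined in pdf2zh/__init__.py above.
print(pdf2zh.__version__)  # "1.8.0"
print(pdf2zh.__author__)   # "Byaidu"
```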
pdf2zh/_saslprep.py ADDED
@@ -0,0 +1,101 @@
+ # Copyright 2016-present MongoDB, Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ # Some changes copyright 2021-present Matthias Valvekens,
+ # licensed under the license of the pyHanko project (see LICENSE file).
+
+
+ """An implementation of RFC4013 SASLprep."""
+
+ __all__ = ["saslprep"]
+
+ import stringprep
+ import unicodedata
+ from typing import Callable, Tuple
+
+ from pdf2zh.pdfexceptions import PDFValueError
+
+ # RFC4013 section 2.3 prohibited output.
+ _PROHIBITED: Tuple[Callable[[str], bool], ...] = (
+     # A strict reading of RFC 4013 requires table c12 here, but
+     # characters from it are mapped to SPACE in the Map step. Can
+     # normalization reintroduce them somehow?
+     stringprep.in_table_c12,
+     stringprep.in_table_c21_c22,
+     stringprep.in_table_c3,
+     stringprep.in_table_c4,
+     stringprep.in_table_c5,
+     stringprep.in_table_c6,
+     stringprep.in_table_c7,
+     stringprep.in_table_c8,
+     stringprep.in_table_c9,
+ )
+
+
+ def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
+     """An implementation of RFC4013 SASLprep.
+     :param data:
+         The string to SASLprep.
+     :param prohibit_unassigned_code_points:
+         RFC 3454 and RFCs for various SASL mechanisms distinguish between
+         `queries` (unassigned code points allowed) and
+         `stored strings` (unassigned code points prohibited). Defaults
+         to ``True`` (unassigned code points are prohibited).
+     :return: The SASLprep'ed version of `data`.
+     """
+     if prohibit_unassigned_code_points:
+         prohibited = _PROHIBITED + (stringprep.in_table_a1,)
+     else:
+         prohibited = _PROHIBITED
+
+     # RFC3454 section 2, step 1 - Map
+     # RFC4013 section 2.1 mappings
+     # Map Non-ASCII space characters to SPACE (U+0020). Map
+     # commonly mapped to nothing characters to, well, nothing.
+     in_table_c12 = stringprep.in_table_c12
+     in_table_b1 = stringprep.in_table_b1
+     data = "".join(
+         [
+             "\u0020" if in_table_c12(elt) else elt
+             for elt in data
+             if not in_table_b1(elt)
+         ],
+     )
+
+     # RFC3454 section 2, step 2 - Normalize
+     # RFC4013 section 2.2 normalization
+     data = unicodedata.ucd_3_2_0.normalize("NFKC", data)
+
+     in_table_d1 = stringprep.in_table_d1
+     if in_table_d1(data[0]):
+         if not in_table_d1(data[-1]):
+             # RFC3454, Section 6, #3. If a string contains any
+             # RandALCat character, the first and last characters
+             # MUST be RandALCat characters.
+             raise PDFValueError("SASLprep: failed bidirectional check")
+         # RFC3454, Section 6, #2. If a string contains any RandALCat
+         # character, it MUST NOT contain any LCat character.
+         prohibited = prohibited + (stringprep.in_table_d2,)
+     else:
+         # RFC3454, Section 6, #3. Following the logic of #3, if
+         # the first character is not a RandALCat, no other character
+         # can be either.
+         prohibited = prohibited + (in_table_d1,)
+
+     # RFC3454 section 2, step 3 and 4 - Prohibit and check bidi
+     for char in data:
+         if any(in_table(char) for in_table in prohibited):
+             raise PDFValueError("SASLprep: failed prohibited character check")
+
+     return data
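A minimal usage sketch for the SASLprep implementation above (the inputs are illustrative; the soft-hyphen case is the standard RFC 4013 example):

```python
from pdf2zh._saslprep import saslprep

# U+00AD (soft hyphen) is "commonly mapped to nothing", so it is dropped.
assert saslprep("I\u00adX") == "IX"

# Plain ASCII strings pass through unchanged; prohibited input raises PDFValueError.
assert saslprep("user") == "user"
```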
pdf2zh/arcfour.py ADDED
@@ -0,0 +1,35 @@
1
+ """Python implementation of Arcfour encryption algorithm.
2
+ See https://en.wikipedia.org/wiki/RC4
3
+ This code is in the public domain.
4
+
5
+ """
6
+
7
+ from typing import Sequence
8
+
9
+
10
+ class Arcfour:
11
+ def __init__(self, key: Sequence[int]) -> None:
12
+ # build a mutable list; a Py3 range cannot be modified in place
13
+ s = [i for i in range(256)]
14
+ j = 0
15
+ klen = len(key)
16
+ for i in range(256):
17
+ j = (j + s[i] + key[i % klen]) % 256
18
+ (s[i], s[j]) = (s[j], s[i])
19
+ self.s = s
20
+ (self.i, self.j) = (0, 0)
21
+
22
+ def process(self, data: bytes) -> bytes:
23
+ (i, j) = (self.i, self.j)
24
+ s = self.s
25
+ r = b""
26
+ for c in iter(data):
27
+ i = (i + 1) % 256
28
+ j = (j + s[i]) % 256
29
+ (s[i], s[j]) = (s[j], s[i])
30
+ k = s[(s[i] + s[j]) % 256]
31
+ r += bytes((c ^ k,))
32
+ (self.i, self.j) = (i, j)
33
+ return r
34
+
35
+ encrypt = decrypt = process
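A small sketch of how the class above is typically driven; the key and plaintext are made-up values, and decryption simply reuses process with a fresh instance because RC4 is symmetric:

    from pdf2zh.arcfour import Arcfour

    key = b"Secret"  # illustrative key
    ciphertext = Arcfour(key).process(b"Attack at dawn")
    plaintext = Arcfour(key).process(ciphertext)  # same keystream, so this round-trips
    assert plaintext == b"Attack at dawn"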
pdf2zh/ascii85.py ADDED
@@ -0,0 +1,70 @@
1
+ """Python implementation of ASCII85/ASCIIHex decoder (Adobe version).
2
+
3
+ This code is in the public domain.
4
+
5
+ """
6
+
7
+ import re
8
+ import struct
9
+
10
+
11
+ # ascii85decode(data)
12
+ def ascii85decode(data: bytes) -> bytes:
13
+ """In ASCII85 encoding, every four bytes are encoded with five ASCII
14
+ letters, using 85 different types of characters (as 256**4 < 85**5).
15
+ When the length of the original bytes is not a multiple of 4, a special
16
+ rule is used for rounding up.
17
+
18
+ Adobe's ASCII85 implementation is slightly different from
19
+ the original in how it handles the final characters.
20
+
21
+ """
22
+ n = b = 0
23
+ out = b""
24
+ for i in iter(data):
25
+ c = bytes((i,))
26
+ if c >= b"!" and c <= b"u":
27
+ n += 1
28
+ b = b * 85 + (ord(c) - 33)
29
+ if n == 5:
30
+ out += struct.pack(">L", b)
31
+ n = b = 0
32
+ elif c == b"z":
33
+ assert n == 0, str(n)
34
+ out += b"\0\0\0\0"
35
+ elif c == b"~":
36
+ if n:
37
+ for _ in range(5 - n):
38
+ b = b * 85 + 84
39
+ out += struct.pack(">L", b)[: n - 1]
40
+ break
41
+ return out
42
+
43
+
44
+ # asciihexdecode(data)
45
+ hex_re = re.compile(rb"([a-f\d]{2})", re.IGNORECASE)
46
+ trail_re = re.compile(rb"^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$", re.IGNORECASE)
47
+
48
+
49
+ def asciihexdecode(data: bytes) -> bytes:
50
+ """ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
51
+ For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
52
+ ASCIIHexDecode filter produces one byte of binary data. All white-space
53
+ characters are ignored. A right angle bracket character (>) indicates
54
+ EOD. Any other characters will cause an error. If the filter encounters
55
+ the EOD marker after reading an odd number of hexadecimal digits, it
56
+ will behave as if a 0 followed the last digit.
57
+ """
58
+
59
+ def decode(x: bytes) -> bytes:
60
+ i = int(x, 16)
61
+ return bytes((i,))
62
+
63
+ out = b""
64
+ for x in hex_re.findall(data):
65
+ out += decode(x)
66
+
67
+ m = trail_re.search(data)
68
+ if m:
69
+ out += decode(m.group(1) + b"0")
70
+ return out
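Two quick checks that follow directly from the decoders above; the inputs are illustrative:

    from pdf2zh.ascii85 import ascii85decode, asciihexdecode

    # "z" is Adobe shorthand for four zero bytes; "~" begins the EOD marker.
    assert ascii85decode(b"z~>") == b"\x00\x00\x00\x00"
    # Hex pairs with whitespace ignored; ">" marks EOD.
    assert asciihexdecode(b"48 65 6C 6C 6F >") == b"Hello"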
pdf2zh/cache.py ADDED
@@ -0,0 +1,91 @@
1
+ import tempfile
2
+ import os
3
+ import time
4
+ import hashlib
5
+ import shutil
6
+
7
+ cache_dir = os.path.join(tempfile.gettempdir(), "cache")
8
+ os.makedirs(cache_dir, exist_ok=True)
9
+ time_filename = "update_time"
10
+ max_cache = 5
11
+
12
+
13
+ def deterministic_hash(obj):
14
+ hash_object = hashlib.sha256()
15
+ hash_object.update(str(obj).encode())
16
+ return hash_object.hexdigest()[0:20]
17
+
18
+
19
+ def get_dirs():
20
+ dirs = [
21
+ os.path.join(cache_dir, dir)
22
+ for dir in os.listdir(cache_dir)
23
+ if os.path.isdir(os.path.join(cache_dir, dir))
24
+ ]
25
+ return dirs
26
+
27
+
28
+ def get_time(dir):
29
+ try:
30
+ timefile = os.path.join(dir, time_filename)
31
+ t = float(open(timefile, encoding="utf-8").read())
32
+ return t
33
+ except FileNotFoundError:
34
+ # handle the error as needed, for now we'll just return a default value
35
+ return float(
36
+ "-inf"
37
+ ) # This ensures that this directory will be the first to be removed if required
38
+
39
+
40
+ def write_time(dir):
41
+ timefile = os.path.join(dir, time_filename)
42
+ t = time.time()
43
+ print(t, file=open(timefile, "w", encoding="utf-8"), end="")
44
+
45
+
46
+ def argmin(iterable):
47
+ return min(enumerate(iterable), key=lambda x: x[1])[0]
48
+
49
+
50
+ def remove_extra():
51
+ dirs = get_dirs()
52
+ for dir in dirs:
53
+ if not os.path.isdir(
54
+ dir
55
+ ): # This line might be redundant now, as get_dirs() ensures only directories are returned
56
+ os.remove(dir)
57
+ try:
58
+ get_time(dir)
59
+ except BaseException:
60
+ shutil.rmtree(dir)
61
+ while True:
62
+ dirs = get_dirs()
63
+ if len(dirs) <= max_cache:
64
+ break
65
+ times = [get_time(dir) for dir in dirs]
66
+ arg = argmin(times)
67
+ shutil.rmtree(dirs[arg])
68
+
69
+
70
+ def is_cached(hash_key):
71
+ dir = os.path.join(cache_dir, hash_key)
72
+ return os.path.exists(dir)
73
+
74
+
75
+ def create_cache(hash_key):
76
+ dir = os.path.join(cache_dir, hash_key)
77
+ os.makedirs(dir, exist_ok=True)
78
+ write_time(dir)
79
+
80
+
81
+ def load_paragraph(hash_key, hash_key_paragraph):
82
+ filename = os.path.join(cache_dir, hash_key, hash_key_paragraph)
83
+ if os.path.exists(filename):
84
+ return open(filename, encoding="utf-8").read()
85
+ else:
86
+ return None
87
+
88
+
89
+ def write_paragraph(hash_key, hash_key_paragraph, paragraph):
90
+ filename = os.path.join(cache_dir, hash_key, hash_key_paragraph)
91
+ print(paragraph, file=open(filename, "w", encoding="utf-8"), end="")
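A sketch of how these helpers fit together; the hashed inputs and the translated text are placeholders, not values the project necessarily uses:

    from pdf2zh import cache

    doc_key = cache.deterministic_hash(("example.pdf", "google", "zh"))  # hypothetical inputs
    if not cache.is_cached(doc_key):
        cache.create_cache(doc_key)

    para_key = cache.deterministic_hash("Some source paragraph")
    if cache.load_paragraph(doc_key, para_key) is None:
        cache.write_paragraph(doc_key, para_key, "translated paragraph goes here")

    cache.remove_extra()  # evict the oldest caches until at most max_cache (5) remain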
pdf2zh/casting.py ADDED
@@ -0,0 +1,15 @@
1
+ from typing import Any, Optional
2
+
3
+
4
+ def safe_int(o: Any) -> Optional[int]:
5
+ try:
6
+ return int(o)
7
+ except (TypeError, ValueError):
8
+ return None
9
+
10
+
11
+ def safe_float(o: Any) -> Optional[float]:
12
+ try:
13
+ return float(o)
14
+ except (TypeError, ValueError):
15
+ return None
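These helpers simply swallow TypeError and ValueError, for example:

    from pdf2zh.casting import safe_int, safe_float

    assert safe_int("42") == 42
    assert safe_int("4.2") is None    # int("4.2") raises ValueError
    assert safe_float(None) is None   # float(None) raises TypeError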
pdf2zh/ccitt.py ADDED
@@ -0,0 +1,614 @@
1
+ # CCITT Fax decoder
2
+ #
3
+ # Bugs: uncompressed mode untested.
4
+ #
5
+ # cf.
6
+ # ITU-T Recommendation T.4
7
+ # "Standardization of Group 3 facsimile terminals
8
+ # for document transmission"
9
+ # ITU-T Recommendation T.6
10
+ # "FACSIMILE CODING SCHEMES AND CODING CONTROL FUNCTIONS
11
+ # FOR GROUP 4 FACSIMILE APPARATUS"
12
+
13
+
14
+ import array
15
+ from typing import (
16
+ Any,
17
+ Callable,
18
+ Dict,
19
+ Iterator,
20
+ List,
21
+ MutableSequence,
22
+ Optional,
23
+ Sequence,
24
+ Union,
25
+ cast,
26
+ )
27
+
28
+ from pdf2zh.pdfexceptions import PDFException, PDFValueError
29
+
30
+
31
+ def get_bytes(data: bytes) -> Iterator[int]:
32
+ yield from data
33
+
34
+
35
+ # Workaround https://github.com/python/mypy/issues/731
36
+ BitParserState = MutableSequence[Any]
37
+ # A better definition (not supported by mypy) would be:
38
+ # BitParserState = MutableSequence[Union["BitParserState", int, str, None]]
39
+
40
+
41
+ class BitParser:
42
+ _state: BitParserState
43
+
44
+ # _accept is declared Optional solely as a workaround for
45
+ # https://github.com/python/mypy/issues/708
46
+ _accept: Optional[Callable[[Any], BitParserState]]
47
+
48
+ def __init__(self) -> None:
49
+ self._pos = 0
50
+
51
+ @classmethod
52
+ def add(cls, root: BitParserState, v: Union[int, str], bits: str) -> None:
53
+ p: BitParserState = root
54
+ b = None
55
+ for i in range(len(bits)):
56
+ if i > 0:
57
+ assert b is not None
58
+ if p[b] is None:
59
+ p[b] = [None, None]
60
+ p = p[b]
61
+ if bits[i] == "1":
62
+ b = 1
63
+ else:
64
+ b = 0
65
+ assert b is not None
66
+ p[b] = v
67
+
68
+ def feedbytes(self, data: bytes) -> None:
69
+ for byte in get_bytes(data):
70
+ for m in (128, 64, 32, 16, 8, 4, 2, 1):
71
+ self._parse_bit(byte & m)
72
+
73
+ def _parse_bit(self, x: object) -> None:
74
+ if x:
75
+ v = self._state[1]
76
+ else:
77
+ v = self._state[0]
78
+ self._pos += 1
79
+ if isinstance(v, list):
80
+ self._state = v
81
+ else:
82
+ assert self._accept is not None
83
+ self._state = self._accept(v)
84
+
85
+
86
+ class CCITTG4Parser(BitParser):
87
+ MODE = [None, None]
88
+ BitParser.add(MODE, 0, "1")
89
+ BitParser.add(MODE, +1, "011")
90
+ BitParser.add(MODE, -1, "010")
91
+ BitParser.add(MODE, "h", "001")
92
+ BitParser.add(MODE, "p", "0001")
93
+ BitParser.add(MODE, +2, "000011")
94
+ BitParser.add(MODE, -2, "000010")
95
+ BitParser.add(MODE, +3, "0000011")
96
+ BitParser.add(MODE, -3, "0000010")
97
+ BitParser.add(MODE, "u", "0000001111")
98
+ BitParser.add(MODE, "x1", "0000001000")
99
+ BitParser.add(MODE, "x2", "0000001001")
100
+ BitParser.add(MODE, "x3", "0000001010")
101
+ BitParser.add(MODE, "x4", "0000001011")
102
+ BitParser.add(MODE, "x5", "0000001100")
103
+ BitParser.add(MODE, "x6", "0000001101")
104
+ BitParser.add(MODE, "x7", "0000001110")
105
+ BitParser.add(MODE, "e", "000000000001000000000001")
106
+
107
+ WHITE = [None, None]
108
+ BitParser.add(WHITE, 0, "00110101")
109
+ BitParser.add(WHITE, 1, "000111")
110
+ BitParser.add(WHITE, 2, "0111")
111
+ BitParser.add(WHITE, 3, "1000")
112
+ BitParser.add(WHITE, 4, "1011")
113
+ BitParser.add(WHITE, 5, "1100")
114
+ BitParser.add(WHITE, 6, "1110")
115
+ BitParser.add(WHITE, 7, "1111")
116
+ BitParser.add(WHITE, 8, "10011")
117
+ BitParser.add(WHITE, 9, "10100")
118
+ BitParser.add(WHITE, 10, "00111")
119
+ BitParser.add(WHITE, 11, "01000")
120
+ BitParser.add(WHITE, 12, "001000")
121
+ BitParser.add(WHITE, 13, "000011")
122
+ BitParser.add(WHITE, 14, "110100")
123
+ BitParser.add(WHITE, 15, "110101")
124
+ BitParser.add(WHITE, 16, "101010")
125
+ BitParser.add(WHITE, 17, "101011")
126
+ BitParser.add(WHITE, 18, "0100111")
127
+ BitParser.add(WHITE, 19, "0001100")
128
+ BitParser.add(WHITE, 20, "0001000")
129
+ BitParser.add(WHITE, 21, "0010111")
130
+ BitParser.add(WHITE, 22, "0000011")
131
+ BitParser.add(WHITE, 23, "0000100")
132
+ BitParser.add(WHITE, 24, "0101000")
133
+ BitParser.add(WHITE, 25, "0101011")
134
+ BitParser.add(WHITE, 26, "0010011")
135
+ BitParser.add(WHITE, 27, "0100100")
136
+ BitParser.add(WHITE, 28, "0011000")
137
+ BitParser.add(WHITE, 29, "00000010")
138
+ BitParser.add(WHITE, 30, "00000011")
139
+ BitParser.add(WHITE, 31, "00011010")
140
+ BitParser.add(WHITE, 32, "00011011")
141
+ BitParser.add(WHITE, 33, "00010010")
142
+ BitParser.add(WHITE, 34, "00010011")
143
+ BitParser.add(WHITE, 35, "00010100")
144
+ BitParser.add(WHITE, 36, "00010101")
145
+ BitParser.add(WHITE, 37, "00010110")
146
+ BitParser.add(WHITE, 38, "00010111")
147
+ BitParser.add(WHITE, 39, "00101000")
148
+ BitParser.add(WHITE, 40, "00101001")
149
+ BitParser.add(WHITE, 41, "00101010")
150
+ BitParser.add(WHITE, 42, "00101011")
151
+ BitParser.add(WHITE, 43, "00101100")
152
+ BitParser.add(WHITE, 44, "00101101")
153
+ BitParser.add(WHITE, 45, "00000100")
154
+ BitParser.add(WHITE, 46, "00000101")
155
+ BitParser.add(WHITE, 47, "00001010")
156
+ BitParser.add(WHITE, 48, "00001011")
157
+ BitParser.add(WHITE, 49, "01010010")
158
+ BitParser.add(WHITE, 50, "01010011")
159
+ BitParser.add(WHITE, 51, "01010100")
160
+ BitParser.add(WHITE, 52, "01010101")
161
+ BitParser.add(WHITE, 53, "00100100")
162
+ BitParser.add(WHITE, 54, "00100101")
163
+ BitParser.add(WHITE, 55, "01011000")
164
+ BitParser.add(WHITE, 56, "01011001")
165
+ BitParser.add(WHITE, 57, "01011010")
166
+ BitParser.add(WHITE, 58, "01011011")
167
+ BitParser.add(WHITE, 59, "01001010")
168
+ BitParser.add(WHITE, 60, "01001011")
169
+ BitParser.add(WHITE, 61, "00110010")
170
+ BitParser.add(WHITE, 62, "00110011")
171
+ BitParser.add(WHITE, 63, "00110100")
172
+ BitParser.add(WHITE, 64, "11011")
173
+ BitParser.add(WHITE, 128, "10010")
174
+ BitParser.add(WHITE, 192, "010111")
175
+ BitParser.add(WHITE, 256, "0110111")
176
+ BitParser.add(WHITE, 320, "00110110")
177
+ BitParser.add(WHITE, 384, "00110111")
178
+ BitParser.add(WHITE, 448, "01100100")
179
+ BitParser.add(WHITE, 512, "01100101")
180
+ BitParser.add(WHITE, 576, "01101000")
181
+ BitParser.add(WHITE, 640, "01100111")
182
+ BitParser.add(WHITE, 704, "011001100")
183
+ BitParser.add(WHITE, 768, "011001101")
184
+ BitParser.add(WHITE, 832, "011010010")
185
+ BitParser.add(WHITE, 896, "011010011")
186
+ BitParser.add(WHITE, 960, "011010100")
187
+ BitParser.add(WHITE, 1024, "011010101")
188
+ BitParser.add(WHITE, 1088, "011010110")
189
+ BitParser.add(WHITE, 1152, "011010111")
190
+ BitParser.add(WHITE, 1216, "011011000")
191
+ BitParser.add(WHITE, 1280, "011011001")
192
+ BitParser.add(WHITE, 1344, "011011010")
193
+ BitParser.add(WHITE, 1408, "011011011")
194
+ BitParser.add(WHITE, 1472, "010011000")
195
+ BitParser.add(WHITE, 1536, "010011001")
196
+ BitParser.add(WHITE, 1600, "010011010")
197
+ BitParser.add(WHITE, 1664, "011000")
198
+ BitParser.add(WHITE, 1728, "010011011")
199
+ BitParser.add(WHITE, 1792, "00000001000")
200
+ BitParser.add(WHITE, 1856, "00000001100")
201
+ BitParser.add(WHITE, 1920, "00000001101")
202
+ BitParser.add(WHITE, 1984, "000000010010")
203
+ BitParser.add(WHITE, 2048, "000000010011")
204
+ BitParser.add(WHITE, 2112, "000000010100")
205
+ BitParser.add(WHITE, 2176, "000000010101")
206
+ BitParser.add(WHITE, 2240, "000000010110")
207
+ BitParser.add(WHITE, 2304, "000000010111")
208
+ BitParser.add(WHITE, 2368, "000000011100")
209
+ BitParser.add(WHITE, 2432, "000000011101")
210
+ BitParser.add(WHITE, 2496, "000000011110")
211
+ BitParser.add(WHITE, 2560, "000000011111")
212
+
213
+ BLACK = [None, None]
214
+ BitParser.add(BLACK, 0, "0000110111")
215
+ BitParser.add(BLACK, 1, "010")
216
+ BitParser.add(BLACK, 2, "11")
217
+ BitParser.add(BLACK, 3, "10")
218
+ BitParser.add(BLACK, 4, "011")
219
+ BitParser.add(BLACK, 5, "0011")
220
+ BitParser.add(BLACK, 6, "0010")
221
+ BitParser.add(BLACK, 7, "00011")
222
+ BitParser.add(BLACK, 8, "000101")
223
+ BitParser.add(BLACK, 9, "000100")
224
+ BitParser.add(BLACK, 10, "0000100")
225
+ BitParser.add(BLACK, 11, "0000101")
226
+ BitParser.add(BLACK, 12, "0000111")
227
+ BitParser.add(BLACK, 13, "00000100")
228
+ BitParser.add(BLACK, 14, "00000111")
229
+ BitParser.add(BLACK, 15, "000011000")
230
+ BitParser.add(BLACK, 16, "0000010111")
231
+ BitParser.add(BLACK, 17, "0000011000")
232
+ BitParser.add(BLACK, 18, "0000001000")
233
+ BitParser.add(BLACK, 19, "00001100111")
234
+ BitParser.add(BLACK, 20, "00001101000")
235
+ BitParser.add(BLACK, 21, "00001101100")
236
+ BitParser.add(BLACK, 22, "00000110111")
237
+ BitParser.add(BLACK, 23, "00000101000")
238
+ BitParser.add(BLACK, 24, "00000010111")
239
+ BitParser.add(BLACK, 25, "00000011000")
240
+ BitParser.add(BLACK, 26, "000011001010")
241
+ BitParser.add(BLACK, 27, "000011001011")
242
+ BitParser.add(BLACK, 28, "000011001100")
243
+ BitParser.add(BLACK, 29, "000011001101")
244
+ BitParser.add(BLACK, 30, "000001101000")
245
+ BitParser.add(BLACK, 31, "000001101001")
246
+ BitParser.add(BLACK, 32, "000001101010")
247
+ BitParser.add(BLACK, 33, "000001101011")
248
+ BitParser.add(BLACK, 34, "000011010010")
249
+ BitParser.add(BLACK, 35, "000011010011")
250
+ BitParser.add(BLACK, 36, "000011010100")
251
+ BitParser.add(BLACK, 37, "000011010101")
252
+ BitParser.add(BLACK, 38, "000011010110")
253
+ BitParser.add(BLACK, 39, "000011010111")
254
+ BitParser.add(BLACK, 40, "000001101100")
255
+ BitParser.add(BLACK, 41, "000001101101")
256
+ BitParser.add(BLACK, 42, "000011011010")
257
+ BitParser.add(BLACK, 43, "000011011011")
258
+ BitParser.add(BLACK, 44, "000001010100")
259
+ BitParser.add(BLACK, 45, "000001010101")
260
+ BitParser.add(BLACK, 46, "000001010110")
261
+ BitParser.add(BLACK, 47, "000001010111")
262
+ BitParser.add(BLACK, 48, "000001100100")
263
+ BitParser.add(BLACK, 49, "000001100101")
264
+ BitParser.add(BLACK, 50, "000001010010")
265
+ BitParser.add(BLACK, 51, "000001010011")
266
+ BitParser.add(BLACK, 52, "000000100100")
267
+ BitParser.add(BLACK, 53, "000000110111")
268
+ BitParser.add(BLACK, 54, "000000111000")
269
+ BitParser.add(BLACK, 55, "000000100111")
270
+ BitParser.add(BLACK, 56, "000000101000")
271
+ BitParser.add(BLACK, 57, "000001011000")
272
+ BitParser.add(BLACK, 58, "000001011001")
273
+ BitParser.add(BLACK, 59, "000000101011")
274
+ BitParser.add(BLACK, 60, "000000101100")
275
+ BitParser.add(BLACK, 61, "000001011010")
276
+ BitParser.add(BLACK, 62, "000001100110")
277
+ BitParser.add(BLACK, 63, "000001100111")
278
+ BitParser.add(BLACK, 64, "0000001111")
279
+ BitParser.add(BLACK, 128, "000011001000")
280
+ BitParser.add(BLACK, 192, "000011001001")
281
+ BitParser.add(BLACK, 256, "000001011011")
282
+ BitParser.add(BLACK, 320, "000000110011")
283
+ BitParser.add(BLACK, 384, "000000110100")
284
+ BitParser.add(BLACK, 448, "000000110101")
285
+ BitParser.add(BLACK, 512, "0000001101100")
286
+ BitParser.add(BLACK, 576, "0000001101101")
287
+ BitParser.add(BLACK, 640, "0000001001010")
288
+ BitParser.add(BLACK, 704, "0000001001011")
289
+ BitParser.add(BLACK, 768, "0000001001100")
290
+ BitParser.add(BLACK, 832, "0000001001101")
291
+ BitParser.add(BLACK, 896, "0000001110010")
292
+ BitParser.add(BLACK, 960, "0000001110011")
293
+ BitParser.add(BLACK, 1024, "0000001110100")
294
+ BitParser.add(BLACK, 1088, "0000001110101")
295
+ BitParser.add(BLACK, 1152, "0000001110110")
296
+ BitParser.add(BLACK, 1216, "0000001110111")
297
+ BitParser.add(BLACK, 1280, "0000001010010")
298
+ BitParser.add(BLACK, 1344, "0000001010011")
299
+ BitParser.add(BLACK, 1408, "0000001010100")
300
+ BitParser.add(BLACK, 1472, "0000001010101")
301
+ BitParser.add(BLACK, 1536, "0000001011010")
302
+ BitParser.add(BLACK, 1600, "0000001011011")
303
+ BitParser.add(BLACK, 1664, "0000001100100")
304
+ BitParser.add(BLACK, 1728, "0000001100101")
305
+ BitParser.add(BLACK, 1792, "00000001000")
306
+ BitParser.add(BLACK, 1856, "00000001100")
307
+ BitParser.add(BLACK, 1920, "00000001101")
308
+ BitParser.add(BLACK, 1984, "000000010010")
309
+ BitParser.add(BLACK, 2048, "000000010011")
310
+ BitParser.add(BLACK, 2112, "000000010100")
311
+ BitParser.add(BLACK, 2176, "000000010101")
312
+ BitParser.add(BLACK, 2240, "000000010110")
313
+ BitParser.add(BLACK, 2304, "000000010111")
314
+ BitParser.add(BLACK, 2368, "000000011100")
315
+ BitParser.add(BLACK, 2432, "000000011101")
316
+ BitParser.add(BLACK, 2496, "000000011110")
317
+ BitParser.add(BLACK, 2560, "000000011111")
318
+
319
+ UNCOMPRESSED = [None, None]
320
+ BitParser.add(UNCOMPRESSED, "1", "1")
321
+ BitParser.add(UNCOMPRESSED, "01", "01")
322
+ BitParser.add(UNCOMPRESSED, "001", "001")
323
+ BitParser.add(UNCOMPRESSED, "0001", "0001")
324
+ BitParser.add(UNCOMPRESSED, "00001", "00001")
325
+ BitParser.add(UNCOMPRESSED, "00000", "000001")
326
+ BitParser.add(UNCOMPRESSED, "T00", "00000011")
327
+ BitParser.add(UNCOMPRESSED, "T10", "00000010")
328
+ BitParser.add(UNCOMPRESSED, "T000", "000000011")
329
+ BitParser.add(UNCOMPRESSED, "T100", "000000010")
330
+ BitParser.add(UNCOMPRESSED, "T0000", "0000000011")
331
+ BitParser.add(UNCOMPRESSED, "T1000", "0000000010")
332
+ BitParser.add(UNCOMPRESSED, "T00000", "00000000011")
333
+ BitParser.add(UNCOMPRESSED, "T10000", "00000000010")
334
+
335
+ class CCITTException(PDFException):
336
+ pass
337
+
338
+ class EOFB(CCITTException):
339
+ pass
340
+
341
+ class InvalidData(CCITTException):
342
+ pass
343
+
344
+ class ByteSkip(CCITTException):
345
+ pass
346
+
347
+ _color: int
348
+
349
+ def __init__(self, width: int, bytealign: bool = False) -> None:
350
+ BitParser.__init__(self)
351
+ self.width = width
352
+ self.bytealign = bytealign
353
+ self.reset()
354
+
355
+ def feedbytes(self, data: bytes) -> None:
356
+ for byte in get_bytes(data):
357
+ try:
358
+ for m in (128, 64, 32, 16, 8, 4, 2, 1):
359
+ self._parse_bit(byte & m)
360
+ except self.ByteSkip:
361
+ self._accept = self._parse_mode
362
+ self._state = self.MODE
363
+ except self.EOFB:
364
+ break
365
+
366
+ def _parse_mode(self, mode: object) -> BitParserState:
367
+ if mode == "p":
368
+ self._do_pass()
369
+ self._flush_line()
370
+ return self.MODE
371
+ elif mode == "h":
372
+ self._n1 = 0
373
+ self._accept = self._parse_horiz1
374
+ if self._color:
375
+ return self.WHITE
376
+ else:
377
+ return self.BLACK
378
+ elif mode == "u":
379
+ self._accept = self._parse_uncompressed
380
+ return self.UNCOMPRESSED
381
+ elif mode == "e":
382
+ raise self.EOFB
383
+ elif isinstance(mode, int):
384
+ self._do_vertical(mode)
385
+ self._flush_line()
386
+ return self.MODE
387
+ else:
388
+ raise self.InvalidData(mode)
389
+
390
+ def _parse_horiz1(self, n: Any) -> BitParserState:
391
+ if n is None:
392
+ raise self.InvalidData
393
+ self._n1 += n
394
+ if n < 64:
395
+ self._n2 = 0
396
+ self._color = 1 - self._color
397
+ self._accept = self._parse_horiz2
398
+ if self._color:
399
+ return self.WHITE
400
+ else:
401
+ return self.BLACK
402
+
403
+ def _parse_horiz2(self, n: Any) -> BitParserState:
404
+ if n is None:
405
+ raise self.InvalidData
406
+ self._n2 += n
407
+ if n < 64:
408
+ self._color = 1 - self._color
409
+ self._accept = self._parse_mode
410
+ self._do_horizontal(self._n1, self._n2)
411
+ self._flush_line()
412
+ return self.MODE
413
+ elif self._color:
414
+ return self.WHITE
415
+ else:
416
+ return self.BLACK
417
+
418
+ def _parse_uncompressed(self, bits: Optional[str]) -> BitParserState:
419
+ if not bits:
420
+ raise self.InvalidData
421
+ if bits.startswith("T"):
422
+ self._accept = self._parse_mode
423
+ self._color = int(bits[1])
424
+ self._do_uncompressed(bits[2:])
425
+ return self.MODE
426
+ else:
427
+ self._do_uncompressed(bits)
428
+ return self.UNCOMPRESSED
429
+
430
+ def _get_bits(self) -> str:
431
+ return "".join(str(b) for b in self._curline[: self._curpos])
432
+
433
+ def _get_refline(self, i: int) -> str:
434
+ if i < 0:
435
+ return "[]" + "".join(str(b) for b in self._refline)
436
+ elif len(self._refline) <= i:
437
+ return "".join(str(b) for b in self._refline) + "[]"
438
+ else:
439
+ return (
440
+ "".join(str(b) for b in self._refline[:i])
441
+ + "["
442
+ + str(self._refline[i])
443
+ + "]"
444
+ + "".join(str(b) for b in self._refline[i + 1 :])
445
+ )
446
+
447
+ def reset(self) -> None:
448
+ self._y = 0
449
+ self._curline = array.array("b", [1] * self.width)
450
+ self._reset_line()
451
+ self._accept = self._parse_mode
452
+ self._state = self.MODE
453
+
454
+ def output_line(self, y: int, bits: Sequence[int]) -> None:
455
+ print(y, "".join(str(b) for b in bits))
456
+
457
+ def _reset_line(self) -> None:
458
+ self._refline = self._curline
459
+ self._curline = array.array("b", [1] * self.width)
460
+ self._curpos = -1
461
+ self._color = 1
462
+
463
+ def _flush_line(self) -> None:
464
+ if self.width <= self._curpos:
465
+ self.output_line(self._y, self._curline)
466
+ self._y += 1
467
+ self._reset_line()
468
+ if self.bytealign:
469
+ raise self.ByteSkip
470
+
471
+ def _do_vertical(self, dx: int) -> None:
472
+ x1 = self._curpos + 1
473
+ while 1:
474
+ if x1 == 0:
475
+ if self._color == 1 and self._refline[x1] != self._color:
476
+ break
477
+ elif x1 == len(self._refline) or (
478
+ self._refline[x1 - 1] == self._color
479
+ and self._refline[x1] != self._color
480
+ ):
481
+ break
482
+ x1 += 1
483
+ x1 += dx
484
+ x0 = max(0, self._curpos)
485
+ x1 = max(0, min(self.width, x1))
486
+ if x1 < x0:
487
+ for x in range(x1, x0):
488
+ self._curline[x] = self._color
489
+ elif x0 < x1:
490
+ for x in range(x0, x1):
491
+ self._curline[x] = self._color
492
+ self._curpos = x1
493
+ self._color = 1 - self._color
494
+
495
+ def _do_pass(self) -> None:
496
+ x1 = self._curpos + 1
497
+ while 1:
498
+ if x1 == 0:
499
+ if self._color == 1 and self._refline[x1] != self._color:
500
+ break
501
+ elif x1 == len(self._refline) or (
502
+ self._refline[x1 - 1] == self._color
503
+ and self._refline[x1] != self._color
504
+ ):
505
+ break
506
+ x1 += 1
507
+ while 1:
508
+ if x1 == 0:
509
+ if self._color == 0 and self._refline[x1] == self._color:
510
+ break
511
+ elif x1 == len(self._refline) or (
512
+ self._refline[x1 - 1] != self._color
513
+ and self._refline[x1] == self._color
514
+ ):
515
+ break
516
+ x1 += 1
517
+ for x in range(self._curpos, x1):
518
+ self._curline[x] = self._color
519
+ self._curpos = x1
520
+
521
+ def _do_horizontal(self, n1: int, n2: int) -> None:
522
+ if self._curpos < 0:
523
+ self._curpos = 0
524
+ x = self._curpos
525
+ for _ in range(n1):
526
+ if len(self._curline) <= x:
527
+ break
528
+ self._curline[x] = self._color
529
+ x += 1
530
+ for _ in range(n2):
531
+ if len(self._curline) <= x:
532
+ break
533
+ self._curline[x] = 1 - self._color
534
+ x += 1
535
+ self._curpos = x
536
+
537
+ def _do_uncompressed(self, bits: str) -> None:
538
+ for c in bits:
539
+ self._curline[self._curpos] = int(c)
540
+ self._curpos += 1
541
+ self._flush_line()
542
+
543
+
544
+ class CCITTFaxDecoder(CCITTG4Parser):
545
+ def __init__(
546
+ self,
547
+ width: int,
548
+ bytealign: bool = False,
549
+ reversed: bool = False,
550
+ ) -> None:
551
+ CCITTG4Parser.__init__(self, width, bytealign=bytealign)
552
+ self.reversed = reversed
553
+ self._buf = b""
554
+
555
+ def close(self) -> bytes:
556
+ return self._buf
557
+
558
+ def output_line(self, y: int, bits: Sequence[int]) -> None:
559
+ arr = array.array("B", [0] * ((len(bits) + 7) // 8))
560
+ if self.reversed:
561
+ bits = [1 - b for b in bits]
562
+ for i, b in enumerate(bits):
563
+ if b:
564
+ arr[i // 8] += (128, 64, 32, 16, 8, 4, 2, 1)[i % 8]
565
+ self._buf += arr.tobytes()
566
+
567
+
568
+ def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes:
569
+ K = params.get("K")
570
+ if K == -1:
571
+ cols = cast(int, params.get("Columns"))
572
+ bytealign = cast(bool, params.get("EncodedByteAlign"))
573
+ reversed = cast(bool, params.get("BlackIs1"))
574
+ parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=reversed)
575
+ else:
576
+ raise PDFValueError(K)
577
+ parser.feedbytes(data)
578
+ return parser.close()
579
+
580
+
581
+ # test
582
+ def main(argv: List[str]) -> None:
583
+ if not argv[1:]:
584
+ import unittest
585
+
586
+ unittest.main()
587
+ return
588
+
589
+ class Parser(CCITTG4Parser):
590
+ def __init__(self, width: int, bytealign: bool = False) -> None:
591
+ import pygame # type: ignore[import]
592
+
593
+ CCITTG4Parser.__init__(self, width, bytealign=bytealign)
594
+ self.img = pygame.Surface((self.width, 1000))
595
+
596
+ def output_line(self, y: int, bits: Sequence[int]) -> None:
597
+ for x, b in enumerate(bits):
598
+ if b:
599
+ self.img.set_at((x, y), (255, 255, 255))
600
+ else:
601
+ self.img.set_at((x, y), (0, 0, 0))
602
+
603
+ def close(self) -> None:
604
+ import pygame
605
+
606
+ pygame.image.save(self.img, "out.bmp")
607
+
608
+ for path in argv[1:]:
609
+ fp = open(path, "rb")
610
+ (_, _, k, w, h, _) = path.split(".")
611
+ parser = Parser(int(w))
612
+ parser.feedbytes(fp.read())
613
+ parser.close()
614
+ fp.close()
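A hedged sketch of calling the decoder above from outside; the raw bytes would come from a PDF stream that declares a CCITTFaxDecode filter, and the parameter values shown are just common Group 4 defaults:

    from typing import Dict
    from pdf2zh.ccitt import ccittfaxdecode

    def decode_ccitt_group4(raw: bytes, columns: int = 1728) -> bytes:
        # Only K == -1 (pure Group 4 / T.6 data) is handled; other K values raise PDFValueError.
        params: Dict[str, object] = {
            "K": -1,
            "Columns": columns,
            "EncodedByteAlign": False,
            "BlackIs1": False,
        }
        return ccittfaxdecode(raw, params)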
pdf2zh/cmapdb.py ADDED
@@ -0,0 +1,471 @@
1
+ """Adobe character mapping (CMap) support.
2
+
3
+ CMaps provide the mapping from character codes to character
4
+ ids (CIDs), and from CIDs to Unicode code points.
5
+
6
+ More information is available on:
7
+
8
+ https://github.com/adobe-type-tools/cmap-resources
9
+
10
+ """
11
+
12
+ import gzip
13
+ import logging
14
+ import os
15
+ import os.path
16
+ import pickle as pickle
17
+ import struct
18
+ import sys
19
+ from typing import (
20
+ Any,
21
+ BinaryIO,
22
+ Dict,
23
+ Iterable,
24
+ Iterator,
25
+ List,
26
+ MutableMapping,
27
+ Optional,
28
+ Set,
29
+ TextIO,
30
+ Tuple,
31
+ Union,
32
+ cast,
33
+ )
34
+
35
+ from pdf2zh.encodingdb import name2unicode
36
+ from pdf2zh.pdfexceptions import PDFException, PDFTypeError
37
+ from pdf2zh.psexceptions import PSEOF, PSSyntaxError
38
+ from pdf2zh.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name
39
+ from pdf2zh.utils import choplist, nunpack
40
+
41
+ log = logging.getLogger(__name__)
42
+
43
+
44
+ class CMapError(PDFException):
45
+ pass
46
+
47
+
48
+ class CMapBase:
49
+ debug = 0
50
+
51
+ def __init__(self, **kwargs: object) -> None:
52
+ self.attrs: MutableMapping[str, object] = kwargs.copy()
53
+
54
+ def is_vertical(self) -> bool:
55
+ return self.attrs.get("WMode", 0) != 0
56
+
57
+ def set_attr(self, k: str, v: object) -> None:
58
+ self.attrs[k] = v
59
+
60
+ def add_code2cid(self, code: str, cid: int) -> None:
61
+ pass
62
+
63
+ def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
64
+ pass
65
+
66
+ def use_cmap(self, cmap: "CMapBase") -> None:
67
+ pass
68
+
69
+ def decode(self, code: bytes) -> Iterable[int]:
70
+ raise NotImplementedError
71
+
72
+
73
+ class CMap(CMapBase):
74
+ def __init__(self, **kwargs: Union[str, int]) -> None:
75
+ CMapBase.__init__(self, **kwargs)
76
+ self.code2cid: Dict[int, object] = {}
77
+
78
+ def __repr__(self) -> str:
79
+ return "<CMap: %s>" % self.attrs.get("CMapName")
80
+
81
+ def use_cmap(self, cmap: CMapBase) -> None:
82
+ assert isinstance(cmap, CMap), str(type(cmap))
83
+
84
+ def copy(dst: Dict[int, object], src: Dict[int, object]) -> None:
85
+ for k, v in src.items():
86
+ if isinstance(v, dict):
87
+ d: Dict[int, object] = {}
88
+ dst[k] = d
89
+ copy(d, v)
90
+ else:
91
+ dst[k] = v
92
+
93
+ copy(self.code2cid, cmap.code2cid)
94
+
95
+ def decode(self, code: bytes) -> Iterator[int]:
96
+ # log.debug("decode: %r, %r", self, code)
97
+ d = self.code2cid
98
+ for i in iter(code):
99
+ if i in d:
100
+ x = d[i]
101
+ if isinstance(x, int):
102
+ yield x
103
+ d = self.code2cid
104
+ else:
105
+ d = cast(Dict[int, object], x)
106
+ else:
107
+ d = self.code2cid
108
+
109
+ def dump(
110
+ self,
111
+ out: TextIO = sys.stdout,
112
+ code2cid: Optional[Dict[int, object]] = None,
113
+ code: Tuple[int, ...] = (),
114
+ ) -> None:
115
+ if code2cid is None:
116
+ code2cid = self.code2cid
117
+ code = ()
118
+ for k, v in sorted(code2cid.items()):
119
+ c = code + (k,)
120
+ if isinstance(v, int):
121
+ out.write("code %r = cid %d\n" % (c, v))
122
+ else:
123
+ self.dump(out=out, code2cid=cast(Dict[int, object], v), code=c)
124
+
125
+
126
+ class IdentityCMap(CMapBase):
127
+ def decode(self, code: bytes) -> Tuple[int, ...]:
128
+ n = len(code) // 2
129
+ if n:
130
+ return struct.unpack(">%dH" % n, code)
131
+ else:
132
+ return ()
133
+
134
+
135
+ class IdentityCMapByte(IdentityCMap):
136
+ def decode(self, code: bytes) -> Tuple[int, ...]:
137
+ n = len(code)
138
+ if n:
139
+ return struct.unpack(">%dB" % n, code)
140
+ else:
141
+ return ()
142
+
143
+
144
+ class UnicodeMap(CMapBase):
145
+ def __init__(self, **kwargs: Union[str, int]) -> None:
146
+ CMapBase.__init__(self, **kwargs)
147
+ self.cid2unichr: Dict[int, str] = {}
148
+
149
+ def __repr__(self) -> str:
150
+ return "<UnicodeMap: %s>" % self.attrs.get("CMapName")
151
+
152
+ def get_unichr(self, cid: int) -> str:
153
+ # log.debug("get_unichr: %r, %r", self, cid)
154
+ return self.cid2unichr[cid]
155
+
156
+ def dump(self, out: TextIO = sys.stdout) -> None:
157
+ for k, v in sorted(self.cid2unichr.items()):
158
+ out.write("cid %d = unicode %r\n" % (k, v))
159
+
160
+
161
+ class IdentityUnicodeMap(UnicodeMap):
162
+ def get_unichr(self, cid: int) -> str:
163
+ """Interpret character id as unicode codepoint"""
164
+ # log.debug("get_unichr: %r, %r", self, cid)
165
+ return chr(cid)
166
+
167
+
168
+ class FileCMap(CMap):
169
+ def add_code2cid(self, code: str, cid: int) -> None:
170
+ assert isinstance(code, str) and isinstance(cid, int), str(
171
+ (type(code), type(cid)),
172
+ )
173
+ d = self.code2cid
174
+ for c in code[:-1]:
175
+ ci = ord(c)
176
+ if ci in d:
177
+ d = cast(Dict[int, object], d[ci])
178
+ else:
179
+ t: Dict[int, object] = {}
180
+ d[ci] = t
181
+ d = t
182
+ ci = ord(code[-1])
183
+ d[ci] = cid
184
+
185
+
186
+ class FileUnicodeMap(UnicodeMap):
187
+ def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
188
+ assert isinstance(cid, int), str(type(cid))
189
+ if isinstance(code, PSLiteral):
190
+ # Interpret as an Adobe glyph name.
191
+ assert isinstance(code.name, str)
192
+ unichr = name2unicode(code.name)
193
+ elif isinstance(code, bytes):
194
+ # Interpret as UTF-16BE.
195
+ unichr = code.decode("UTF-16BE", "ignore")
196
+ elif isinstance(code, int):
197
+ unichr = chr(code)
198
+ else:
199
+ raise PDFTypeError(code)
200
+
201
+ # A0 = non-breaking space, some weird fonts can have a collision on a cid here.
202
+ if unichr == "\u00a0" and self.cid2unichr.get(cid) == " ":
203
+ return
204
+ self.cid2unichr[cid] = unichr
205
+
206
+
207
+ class PyCMap(CMap):
208
+ def __init__(self, name: str, module: Any) -> None:
209
+ super().__init__(CMapName=name)
210
+ self.code2cid = module.CODE2CID
211
+ if module.IS_VERTICAL:
212
+ self.attrs["WMode"] = 1
213
+
214
+
215
+ class PyUnicodeMap(UnicodeMap):
216
+ def __init__(self, name: str, module: Any, vertical: bool) -> None:
217
+ super().__init__(CMapName=name)
218
+ if vertical:
219
+ self.cid2unichr = module.CID2UNICHR_V
220
+ self.attrs["WMode"] = 1
221
+ else:
222
+ self.cid2unichr = module.CID2UNICHR_H
223
+
224
+
225
+ class CMapDB:
226
+ _cmap_cache: Dict[str, PyCMap] = {}
227
+ _umap_cache: Dict[str, List[PyUnicodeMap]] = {}
228
+
229
+ class CMapNotFound(CMapError):
230
+ pass
231
+
232
+ @classmethod
233
+ def _load_data(cls, name: str) -> Any:
234
+ name = name.replace("\0", "")
235
+ filename = "%s.pickle.gz" % name
236
+ # log.debug("loading: %r", name)
237
+ cmap_paths = (
238
+ os.environ.get("CMAP_PATH", "/usr/share/pdf2zh/"),
239
+ os.path.join(os.path.dirname(__file__), "cmap"),
240
+ )
241
+ for directory in cmap_paths:
242
+ path = os.path.join(directory, filename)
243
+ if os.path.exists(path):
244
+ gzfile = gzip.open(path)
245
+ try:
246
+ return type(str(name), (), pickle.loads(gzfile.read()))
247
+ finally:
248
+ gzfile.close()
249
+ raise CMapDB.CMapNotFound(name)
250
+
251
+ @classmethod
252
+ def get_cmap(cls, name: str) -> CMapBase:
253
+ if name == "Identity-H":
254
+ return IdentityCMap(WMode=0)
255
+ elif name == "Identity-V":
256
+ return IdentityCMap(WMode=1)
257
+ elif name == "OneByteIdentityH":
258
+ return IdentityCMapByte(WMode=0)
259
+ elif name == "OneByteIdentityV":
260
+ return IdentityCMapByte(WMode=1)
261
+ try:
262
+ return cls._cmap_cache[name]
263
+ except KeyError:
264
+ pass
265
+ data = cls._load_data(name)
266
+ cls._cmap_cache[name] = cmap = PyCMap(name, data)
267
+ return cmap
268
+
269
+ @classmethod
270
+ def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
271
+ try:
272
+ return cls._umap_cache[name][vertical]
273
+ except KeyError:
274
+ pass
275
+ data = cls._load_data("to-unicode-%s" % name)
276
+ cls._umap_cache[name] = [PyUnicodeMap(name, data, v) for v in (False, True)]
277
+ return cls._umap_cache[name][vertical]
278
+
279
+
280
+ class CMapParser(PSStackParser[PSKeyword]):
281
+ def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
282
+ PSStackParser.__init__(self, fp)
283
+ self.cmap = cmap
284
+ # some ToUnicode maps don't have "begincmap" keyword.
285
+ self._in_cmap = True
286
+ self._warnings: Set[str] = set()
287
+
288
+ def run(self) -> None:
289
+ try:
290
+ self.nextobject()
291
+ except PSEOF:
292
+ pass
293
+
294
+ KEYWORD_BEGINCMAP = KWD(b"begincmap")
295
+ KEYWORD_ENDCMAP = KWD(b"endcmap")
296
+ KEYWORD_USECMAP = KWD(b"usecmap")
297
+ KEYWORD_DEF = KWD(b"def")
298
+ KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
299
+ KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
300
+ KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
301
+ KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
302
+ KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
303
+ KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
304
+ KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
305
+ KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
306
+ KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
307
+ KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
308
+ KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
309
+ KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")
310
+
311
+ def do_keyword(self, pos: int, token: PSKeyword) -> None:
312
+ """ToUnicode CMaps
313
+
314
+ See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.
315
+ """
316
+ if token is self.KEYWORD_BEGINCMAP:
317
+ self._in_cmap = True
318
+ self.popall()
319
+ return
320
+
321
+ elif token is self.KEYWORD_ENDCMAP:
322
+ self._in_cmap = False
323
+ return
324
+
325
+ if not self._in_cmap:
326
+ return
327
+
328
+ if token is self.KEYWORD_DEF:
329
+ try:
330
+ ((_, k), (_, v)) = self.pop(2)
331
+ self.cmap.set_attr(literal_name(k), v)
332
+ except PSSyntaxError:
333
+ pass
334
+ return
335
+
336
+ if token is self.KEYWORD_USECMAP:
337
+ try:
338
+ ((_, cmapname),) = self.pop(1)
339
+ self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
340
+ except PSSyntaxError:
341
+ pass
342
+ except CMapDB.CMapNotFound:
343
+ pass
344
+ return
345
+
346
+ if token is self.KEYWORD_BEGINCODESPACERANGE:
347
+ self.popall()
348
+ return
349
+ if token is self.KEYWORD_ENDCODESPACERANGE:
350
+ self.popall()
351
+ return
352
+
353
+ if token is self.KEYWORD_BEGINCIDRANGE:
354
+ self.popall()
355
+ return
356
+
357
+ if token is self.KEYWORD_ENDCIDRANGE:
358
+ objs = [obj for (__, obj) in self.popall()]
359
+ for start_byte, end_byte, cid in choplist(3, objs):
360
+ if not isinstance(start_byte, bytes):
361
+ self._warn_once("The start object of begincidrange is not a byte.")
362
+ continue
363
+ if not isinstance(end_byte, bytes):
364
+ self._warn_once("The end object of begincidrange is not a byte.")
365
+ continue
366
+ if not isinstance(cid, int):
367
+ self._warn_once("The cid object of begincidrange is not an integer.")
368
+ continue
369
+ if len(start_byte) != len(end_byte):
370
+ self._warn_once(
371
+ "The start and end byte of begincidrange have "
372
+ "different lengths.",
373
+ )
374
+ continue
375
+ start_prefix = start_byte[:-4]
376
+ end_prefix = end_byte[:-4]
377
+ if start_prefix != end_prefix:
378
+ self._warn_once(
379
+ "The prefix of the start and end byte of "
380
+ "begincidrange are not the same.",
381
+ )
382
+ continue
383
+ svar = start_byte[-4:]
384
+ evar = end_byte[-4:]
385
+ start = nunpack(svar)
386
+ end = nunpack(evar)
387
+ vlen = len(svar)
388
+ for i in range(end - start + 1):
389
+ x = start_prefix + struct.pack(">L", start + i)[-vlen:]
390
+ self.cmap.add_cid2unichr(cid + i, x)
391
+ return
392
+
393
+ if token is self.KEYWORD_BEGINCIDCHAR:
394
+ self.popall()
395
+ return
396
+
397
+ if token is self.KEYWORD_ENDCIDCHAR:
398
+ objs = [obj for (__, obj) in self.popall()]
399
+ for cid, code in choplist(2, objs):
400
+ if isinstance(code, bytes) and isinstance(cid, int):
401
+ self.cmap.add_cid2unichr(cid, code)
402
+ return
403
+
404
+ if token is self.KEYWORD_BEGINBFRANGE:
405
+ self.popall()
406
+ return
407
+
408
+ if token is self.KEYWORD_ENDBFRANGE:
409
+ objs = [obj for (__, obj) in self.popall()]
410
+ for start_byte, end_byte, code in choplist(3, objs):
411
+ if not isinstance(start_byte, bytes):
412
+ self._warn_once("The start object is not a byte.")
413
+ continue
414
+ if not isinstance(end_byte, bytes):
415
+ self._warn_once("The end object is not a byte.")
416
+ continue
417
+ if len(start_byte) != len(end_byte):
418
+ self._warn_once("The start and end byte have different lengths.")
419
+ continue
420
+ start = nunpack(start_byte)
421
+ end = nunpack(end_byte)
422
+ if isinstance(code, list):
423
+ if len(code) != end - start + 1:
424
+ self._warn_once(
425
+ "The difference between the start and end "
426
+ "offsets does not match the code length.",
427
+ )
428
+ for cid, unicode_value in zip(range(start, end + 1), code):
429
+ self.cmap.add_cid2unichr(cid, unicode_value)
430
+ else:
431
+ assert isinstance(code, bytes)
432
+ var = code[-4:]
433
+ base = nunpack(var)
434
+ prefix = code[:-4]
435
+ vlen = len(var)
436
+ for i in range(end - start + 1):
437
+ x = prefix + struct.pack(">L", base + i)[-vlen:]
438
+ self.cmap.add_cid2unichr(start + i, x)
439
+ return
440
+
441
+ if token is self.KEYWORD_BEGINBFCHAR:
442
+ self.popall()
443
+ return
444
+
445
+ if token is self.KEYWORD_ENDBFCHAR:
446
+ objs = [obj for (__, obj) in self.popall()]
447
+ for cid, code in choplist(2, objs):
448
+ if isinstance(cid, bytes) and isinstance(code, bytes):
449
+ self.cmap.add_cid2unichr(nunpack(cid), code)
450
+ return
451
+
452
+ if token is self.KEYWORD_BEGINNOTDEFRANGE:
453
+ self.popall()
454
+ return
455
+
456
+ if token is self.KEYWORD_ENDNOTDEFRANGE:
457
+ self.popall()
458
+ return
459
+
460
+ self.push((pos, token))
461
+
462
+ def _warn_once(self, msg: str) -> None:
463
+ """Warn once for each unique message"""
464
+ if msg not in self._warnings:
465
+ self._warnings.add(msg)
466
+ base_msg = (
467
+ "Ignoring (part of) ToUnicode map because the PDF data "
468
+ "does not conform to the format. This could result in "
469
+ "(cid) values in the output. "
470
+ )
471
+ log.warning(base_msg + msg)
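For reference, a minimal sketch of the lookup API above; Identity CMaps are synthesized in code, while named CMaps are unpickled from $CMAP_PATH or the bundled cmap directory:

    from pdf2zh.cmapdb import CMapDB

    cmap = CMapDB.get_cmap("Identity-H")  # built in, two bytes per CID
    assert list(cmap.decode(b"\x00\x41")) == [65]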
pdf2zh/converter.py ADDED
@@ -0,0 +1,1384 @@
1
+ from pdf2zh.utils import (
2
+ AnyIO,
3
+ Matrix,
4
+ PathSegment,
5
+ Point,
6
+ Rect,
7
+ apply_matrix_pt,
8
+ bbox2str,
9
+ enc,
10
+ make_compat_str,
11
+ mult_matrix,
12
+ matrix_scale,
13
+ )
14
+ from pdf2zh.pdftypes import PDFStream
15
+ from pdf2zh.pdfpage import PDFPage
16
+ from pdf2zh.pdfinterp import PDFGraphicState, PDFResourceManager
17
+ from pdf2zh.pdffont import PDFFont, PDFUnicodeNotDefined, PDFCIDFont
18
+ from pdf2zh.pdfexceptions import PDFValueError
19
+ from pdf2zh.pdfdevice import PDFTextDevice
20
+ from pdf2zh.pdfcolor import PDFColorSpace
21
+ from pdf2zh.layout import (
22
+ LAParams,
23
+ LTAnno,
24
+ LTChar,
25
+ LTComponent,
26
+ LTCurve,
27
+ LTFigure,
28
+ LTImage,
29
+ LTItem,
30
+ LTLayoutContainer,
31
+ LTLine,
32
+ LTPage,
33
+ LTRect,
34
+ LTText,
35
+ LTTextBox,
36
+ LTTextBoxVertical,
37
+ LTTextGroup,
38
+ LTTextLine,
39
+ TextGroupElement,
40
+ )
41
+ from pdf2zh.image import ImageWriter
42
+ from pdf2zh import utils
43
+ import io
44
+ import logging
45
+ import re
46
+ from typing import (
47
+ BinaryIO,
48
+ Dict,
49
+ Generic,
50
+ List,
51
+ Optional,
52
+ Sequence,
53
+ TextIO,
54
+ Tuple,
55
+ TypeVar,
56
+ Union,
57
+ cast,
58
+ )
59
+ import concurrent.futures
60
+ import numpy as np
61
+ import unicodedata
62
+ from tenacity import retry, wait_fixed
63
+ from pdf2zh import cache
64
+ from pdf2zh.translator import (
65
+ BaseTranslator,
66
+ GoogleTranslator,
67
+ DeepLTranslator,
68
+ DeepLXTranslator,
69
+ OllamaTranslator,
70
+ OpenAITranslator,
71
+ AzureTranslator,
72
+ TencentTranslator,
73
+ )
74
+
75
+
76
+ def remove_control_characters(s):
77
+ return "".join(ch for ch in s if unicodedata.category(ch)[0] != "C")
78
+
79
+
80
+ log = logging.getLogger(__name__)
81
+
82
+
83
+ class PDFLayoutAnalyzer(PDFTextDevice):
84
+ cur_item: LTLayoutContainer
85
+ ctm: Matrix
86
+
87
+ def __init__(
88
+ self,
89
+ rsrcmgr: PDFResourceManager,
90
+ pageno: int = 1,
91
+ laparams: Optional[LAParams] = None,
92
+ ) -> None:
93
+ PDFTextDevice.__init__(self, rsrcmgr)
94
+ self.pageno = pageno
95
+ self.laparams = laparams
96
+ self._stack: List[LTLayoutContainer] = []
97
+
98
+ def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
99
+ # (x0, y0, x1, y1) = page.mediabox
100
+ (x0, y0, x1, y1) = page.cropbox
101
+ (x0, y0) = apply_matrix_pt(ctm, (x0, y0))
102
+ (x1, y1) = apply_matrix_pt(ctm, (x1, y1))
103
+ mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1))
104
+ self.cur_item = LTPage(page.pageno, mediabox)
105
+
106
+ def end_page(self, page: PDFPage):
107
+ assert not self._stack, str(len(self._stack))
108
+ assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
109
+ # Skip the default layout analysis
110
+ # if self.laparams is not None:
111
+ # self.cur_item.analyze(self.laparams)
112
+ self.pageno += 1
113
+ return self.receive_layout(self.cur_item)
114
+
115
+ def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
116
+ self._stack.append(self.cur_item)
117
+ self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
118
+ self.cur_item.pageid = self._stack[-1].pageid
119
+
120
+ def end_figure(self, _: str) -> None:
121
+ fig = self.cur_item
122
+ assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
123
+ self.cur_item = self._stack.pop()
124
+ self.cur_item.add(fig)
125
+ return self.receive_layout(fig)
126
+
127
+ def render_image(self, name: str, stream: PDFStream) -> None:
128
+ assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
129
+ item = LTImage(
130
+ name,
131
+ stream,
132
+ (self.cur_item.x0, self.cur_item.y0, self.cur_item.x1, self.cur_item.y1),
133
+ )
134
+ self.cur_item.add(item)
135
+
136
+ def paint_path(
137
+ self,
138
+ gstate: PDFGraphicState,
139
+ stroke: bool,
140
+ fill: bool,
141
+ evenodd: bool,
142
+ path: Sequence[PathSegment],
143
+ ) -> None:
144
+ """Paint paths described in section 4.4 of the PDF reference manual"""
145
+ shape = "".join(x[0] for x in path)
146
+
147
+ if shape[:1] != "m":
148
+ # Per PDF Reference Section 4.4.1, "path construction operators may
149
+ # be invoked in any sequence, but the first one invoked must be m
150
+ # or re to begin a new subpath." Since pdf2zh.six already
151
+ # converts all `re` (rectangle) operators to their equivalent
152
+ # `mlllh` representation, paths ingested by `.paint_path(...)` that
153
+ # do not begin with the `m` operator are invalid.
154
+ pass
155
+
156
+ elif shape.count("m") > 1:
157
+ # recurse if there are multiple m's in this shape
158
+ for m in re.finditer(r"m[^m]+", shape):
159
+ subpath = path[m.start(0) : m.end(0)]
160
+ self.paint_path(gstate, stroke, fill, evenodd, subpath)
161
+
162
+ else:
163
+ # Although the 'h' command does not literally provide a
164
+ # point-position, its position is (by definition) equal to the
165
+ # subpath's starting point.
166
+ #
167
+ # And, per Section 4.4's Table 4.9, all other path commands place
168
+ # their point-position in their final two arguments. (Any preceding
169
+ # arguments represent control points on Bézier curves.)
170
+ raw_pts = [
171
+ cast(Point, p[-2:] if p[0] != "h" else path[0][-2:]) for p in path
172
+ ]
173
+ pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]
174
+
175
+ operators = [str(operation[0]) for operation in path]
176
+ transformed_points = [
177
+ [
178
+ apply_matrix_pt(self.ctm, (float(operand1), float(operand2)))
179
+ for operand1, operand2 in zip(operation[1::2], operation[2::2])
180
+ ]
181
+ for operation in path
182
+ ]
183
+ transformed_path = [
184
+ cast(PathSegment, (o, *p))
185
+ for o, p in zip(operators, transformed_points)
186
+ ]
187
+
188
+ if shape in {"mlh", "ml"}:
189
+ # single line segment
190
+ #
191
+ # Note: 'ml', in conditional above, is a frequent anomaly
192
+ # that we want to support.
193
+ line = LTLine(
194
+ gstate.linewidth * matrix_scale(self.ctm),
195
+ pts[0],
196
+ pts[1],
197
+ stroke,
198
+ fill,
199
+ evenodd,
200
+ gstate.scolor,
201
+ gstate.ncolor,
202
+ original_path=transformed_path,
203
+ dashing_style=gstate.dash,
204
+ )
205
+ self.cur_item.add(line)
206
+
207
+ elif shape in {"mlllh", "mllll"}:
208
+ (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts
209
+
210
+ is_closed_loop = pts[0] == pts[4]
211
+ has_square_coordinates = (
212
+ x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
213
+ ) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
214
+ if is_closed_loop and has_square_coordinates:
215
+ rect = LTRect(
216
+ gstate.linewidth * matrix_scale(self.ctm),
217
+ (*pts[0], *pts[2]),
218
+ stroke,
219
+ fill,
220
+ evenodd,
221
+ gstate.scolor,
222
+ gstate.ncolor,
223
+ transformed_path,
224
+ gstate.dash,
225
+ )
226
+ self.cur_item.add(rect)
227
+ else:
228
+ curve = LTCurve(
229
+ gstate.linewidth * matrix_scale(self.ctm),
230
+ pts,
231
+ stroke,
232
+ fill,
233
+ evenodd,
234
+ gstate.scolor,
235
+ gstate.ncolor,
236
+ transformed_path,
237
+ gstate.dash,
238
+ )
239
+ self.cur_item.add(curve)
240
+ else:
241
+ curve = LTCurve(
242
+ gstate.linewidth * matrix_scale(self.ctm),
243
+ pts,
244
+ stroke,
245
+ fill,
246
+ evenodd,
247
+ gstate.scolor,
248
+ gstate.ncolor,
249
+ transformed_path,
250
+ gstate.dash,
251
+ )
252
+ self.cur_item.add(curve)
253
+
254
+ def render_char(
255
+ self,
256
+ matrix: Matrix,
257
+ font: PDFFont,
258
+ fontsize: float,
259
+ scaling: float,
260
+ rise: float,
261
+ cid: int,
262
+ ncs: PDFColorSpace,
263
+ graphicstate: PDFGraphicState,
264
+ ) -> float:
265
+ try:
266
+ text = font.to_unichr(cid)
267
+ assert isinstance(text, str), str(type(text))
268
+ except PDFUnicodeNotDefined:
269
+ text = self.handle_undefined_char(font, cid)
270
+ textwidth = font.char_width(cid)
271
+ textdisp = font.char_disp(cid)
272
+ item = LTChar(
273
+ matrix,
274
+ font,
275
+ fontsize,
276
+ scaling,
277
+ rise,
278
+ text,
279
+ textwidth,
280
+ textdisp,
281
+ ncs,
282
+ graphicstate,
283
+ )
284
+ self.cur_item.add(item)
285
+ item.cid = cid # hack: attach the original character code
286
+ return item.adv
287
+
288
+ def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
289
+ # log.debug("undefined: %r, %r", font, cid)
290
+ return "(cid:%d)" % cid
291
+
292
+ def receive_layout(self, ltpage: LTPage) -> None:
293
+ pass
294
+
295
+
296
+ class PDFPageAggregator(PDFLayoutAnalyzer):
297
+ def __init__(
298
+ self,
299
+ rsrcmgr: PDFResourceManager,
300
+ pageno: int = 1,
301
+ laparams: Optional[LAParams] = None,
302
+ ) -> None:
303
+ PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
304
+ self.result: Optional[LTPage] = None
305
+
306
+ def receive_layout(self, ltpage: LTPage) -> None:
307
+ self.result = ltpage
308
+
309
+ def get_result(self) -> LTPage:
310
+ assert self.result is not None
311
+ return self.result
312
+
313
+
314
+ # Some PDFConverter children support only binary I/O
315
+ IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)
316
+
317
+
318
+ class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
319
+ def __init__(
320
+ self,
321
+ rsrcmgr: PDFResourceManager,
322
+ outfp: IOType,
323
+ codec: str = "utf-8",
324
+ pageno: int = 1,
325
+ laparams: Optional[LAParams] = None,
326
+ ) -> None:
327
+ PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
328
+ self.outfp: IOType = outfp
329
+ self.codec = codec
330
+ self.outfp_binary = self._is_binary_stream(self.outfp)
331
+
332
+ @staticmethod
333
+ def _is_binary_stream(outfp: AnyIO) -> bool:
334
+ """Test if a stream is binary or not"""
335
+ if "b" in getattr(outfp, "mode", ""):
336
+ return True
337
+ elif hasattr(outfp, "mode"):
338
+ # output stream has a mode, but it does not contain 'b'
339
+ return False
340
+ elif isinstance(outfp, io.BytesIO):
341
+ return True
342
+ elif isinstance(outfp, io.StringIO) or isinstance(outfp, io.TextIOBase):
343
+ return False
344
+
345
+ return True
346
+
347
+
348
+ class TextConverter(PDFConverter[AnyIO]):
349
+ def __init__(
350
+ self,
351
+ rsrcmgr: PDFResourceManager,
352
+ outfp: AnyIO,
353
+ codec: str = "utf-8",
354
+ pageno: int = 1,
355
+ laparams: Optional[LAParams] = None,
356
+ showpageno: bool = False,
357
+ imagewriter: Optional[ImageWriter] = None,
358
+ vfont: str = None,
359
+ vchar: str = None,
360
+ thread: int = 0,
361
+ layout={},
362
+ lang_in: str = "",
363
+ lang_out: str = "",
364
+ service: str = "",
365
+ ) -> None:
366
+ super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
367
+ self.showpageno = showpageno
368
+ self.imagewriter = imagewriter
369
+ self.vfont = vfont
370
+ self.vchar = vchar
371
+ self.thread = thread
372
+ self.layout = layout
373
+ param = service.split(":", 1)
374
+ if param[0] == "google":
375
+ self.translator: BaseTranslator = GoogleTranslator(
376
+ service, lang_out, lang_in, None
377
+ )
378
+ elif param[0] == "deepl":
379
+ self.translator: BaseTranslator = DeepLTranslator(
380
+ service, lang_out, lang_in, None
381
+ )
382
+ elif param[0] == "deeplx":
383
+ self.translator: BaseTranslator = DeepLXTranslator(
384
+ service, lang_out, lang_in, None
385
+ )
386
+ elif param[0] == "ollama":
387
+ self.translator: BaseTranslator = OllamaTranslator(
388
+ service, lang_out, lang_in, param[1]
389
+ )
390
+ elif param[0] == "openai":
391
+ self.translator: BaseTranslator = OpenAITranslator(
392
+ service, lang_out, lang_in, param[1]
393
+ )
394
+ elif param[0] == "azure":
395
+ self.translator: BaseTranslator = AzureTranslator(
396
+ service, lang_out, lang_in, None
397
+ )
398
+ elif param[0] == "tencent":
399
+ self.translator: BaseTranslator = TencentTranslator(
400
+ service, lang_out, lang_in, None
401
+ )
402
+ else:
403
+ raise ValueError("Unsupported translation service")
404
+
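The constructor keys the translator off the part of service before the first colon; anything after the colon is treated as a model identifier and is only consumed by the LLM-backed services (ollama, openai). A short sketch of the parsing (the GUI code below assembles this string as f"{selected_service}:{model_id}"):

    service = "openai:gpt-4o"          # "gpt-4o" is illustrative; any model id works
    param = service.split(":", 1)      # -> ["openai", "gpt-4o"]
    # param[0] selects the translator class, param[1] (when present) names the model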
405
+ def write_text(self, text: str) -> None:
406
+ text = utils.compatible_encode_method(text, self.codec, "ignore")
407
+ if self.outfp_binary:
408
+ cast(BinaryIO, self.outfp).write(text.encode())
409
+ else:
410
+ cast(TextIO, self.outfp).write(text)
411
+
412
+ # fmt: off
413
+ def receive_layout(self, ltpage: LTPage):
414
+ xt = None # 上一个字符
415
+ sstk = [] # 段落文字栈
416
+ vstk = [] # 公式符号组
417
+ vlstk = [] # 公式线条组
418
+ vfix = 0 # 公式纵向偏移
419
+ vbkt = 0 # 段落公式括号计数
420
+ pstk = [] # 段落属性栈
421
+ lstk = [] # 全局线条栈
422
+ var = [] # 公式符号组栈
423
+ varl = [] # 公式线条组栈
424
+ varf = [] # 公式纵向偏移栈
425
+ vlen = [] # 公式宽度栈
426
+ xt_cls = -1 # 上一个字符所属段落
427
+ vmax = ltpage.width / 4 # 行内公式最大宽度
428
+ ops = "" # 渲染结果
429
+
430
+ def vflag(font, char): # 匹配公式(和角标)字体
431
+ if re.match(r"\(cid:", char):
432
+ return True
433
+ # 基于字体名规则的判定
434
+ if self.vfont:
435
+ if re.match(self.vfont, font):
436
+ return True
437
+ else:
438
+ if re.match( # latex 字体
439
+ r"(CM[^R]|MS|XY|MT|BL|RM|EU|LA|RS|LINE|TeX-|rsfs|txsy|wasy|.*Mono|.*Code|.*Ital|.*Sym)",
440
+ font,
441
+ ):
442
+ return True
443
+ # 基于字符集规则的判定
444
+ if self.vchar:
445
+ if re.match(self.vchar, char):
446
+ return True
447
+ else:
448
+ if (
449
+ char
450
+ and char != " " # 非空格
451
+ and (
452
+ unicodedata.category(char[0])
453
+ in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"] # 文字修饰符、数学符号、分隔符号
454
+ or ord(char[0]) in range(0x370, 0x400) # 希腊字母
455
+ )
456
+ ):
457
+ return True
458
+ return False
459
+
460
+ ############################################################
461
+ # A. 原文档解析
462
+ ptr = 0
463
+ item = list(ltpage)
464
+ while ptr < len(item):
465
+ child = item[ptr]
466
+ if isinstance(child, LTChar):
467
+ cur_v = False
468
+ fontname = child.fontname.split("+")[-1]
469
+ layout = self.layout[ltpage.pageid]
470
+ # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
471
+ h, w = layout.shape
472
+ # 读取当前字符在 layout 中的类别
473
+ cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
474
+ cls = layout[cy, cx]
475
+ if ( # 判定当前字符是否属于公式
476
+ cls == 0 # 1. 类别为保留区域
477
+ or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1][4] * 0.79) # 2. 角标字体,有 0.76 的角标和 0.799 的大写,这里用 0.79 取中,同时考虑首字母放大的情况
478
+ or vflag(fontname, child.get_text()) # 3. 公式字体
479
+ or (child.matrix[0] == 0 and child.matrix[3] == 0) # 4. 垂直字体
480
+ ):
481
+ cur_v = True
482
+ # 判定括号组是否属于公式
483
+ if not cur_v:
484
+ if vstk and child.get_text() == "(":
485
+ cur_v = True
486
+ vbkt += 1
487
+ if vbkt and child.get_text() == ")":
488
+ cur_v = True
489
+ vbkt -= 1
490
+ if ( # 判定当前公式是否结束
491
+ not cur_v # 1. 当前字符不属于公式
492
+ or cls != xt_cls # 2. 当前字符与前一个字符不属于同一段落
493
+ or (abs(child.x0 - xt.x0) > vmax and cls != 0) # 3. 段落内换行,可能是一长串斜体的段落,也可能是段内分式换行,这里设个阈值进行区分
494
+ ):
495
+ if vstk:
496
+ if ( # 根据公式右侧的文字修正公式的纵向偏移
497
+ not cur_v # 1. 当前字符不属于公式
498
+ and cls == xt_cls # 2. 当前字符与前一个字符属于同一段落
499
+ and child.x0 > max([vch.x0 for vch in vstk]) # 3. 当前字符在公式右侧
500
+ ):
501
+ vfix = vstk[0].y0 - child.y0
502
+ sstk[-1] += f"$v{len(var)}$"
503
+ var.append(vstk)
504
+ varl.append(vlstk)
505
+ varf.append(vfix)
506
+ vstk = []
507
+ vlstk = []
508
+ vfix = 0
509
+ # 当前字符不属于公式或当前字符是公式的第一个字符
510
+ if not vstk:
511
+ if cls == xt_cls: # 当前字符与前一个字符属于同一段落
512
+ if child.x0 > xt.x1 + 1: # 添加行内空格
513
+ sstk[-1] += " "
514
+ elif child.x1 < xt.x0: # 添加换行空格并标记原文段落存在换行
515
+ sstk[-1] += " "
516
+ pstk[-1][6] = True
517
+ else: # 根据当前字符构建一个新的段落
518
+ sstk.append("")
519
+ pstk.append([child.y0, child.x0, child.x0, child.x0, child.size, child.font, False])
520
+ if not cur_v: # 文字入栈
521
+ if ( # 根据当前字符修正段落属性
522
+ child.size > pstk[-1][4] / 0.79 # 1. 当前字符显著比段落字体大
523
+ or len(sstk[-1].strip()) == 1 # 2. 当前字符为段落第二个文字(考虑首字母放大的情况)
524
+ or vflag(pstk[-1][5].fontname.split("+")[-1], "") # 3. 段落字体为公式字体
525
+ or re.match( # 4. 段落字体为粗体
526
+ r"(.*Medi|.*Bold)",
527
+ pstk[-1][5].fontname.split("+")[-1],
528
+ re.IGNORECASE,
529
+ )
530
+ ):
531
+ pstk[-1][0] -= child.size - pstk[-1][4] # hack 这个段落纵向位置的修正有问题,不过先凑合用吧
532
+ pstk[-1][4] = child.size
533
+ pstk[-1][5] = child.font
534
+ sstk[-1] += child.get_text()
535
+ else: # 公式入栈
536
+ if ( # 根据公式左侧的文字修正公式的纵向偏移
537
+ not vstk # 1. 当前字符是公式的第一个字符
538
+ and cls == xt_cls # 2. 当前字符与前一个字符属于同一段落
539
+ and child.x0 > xt.x0 # 3. 前一个字符在公式左侧
540
+ ):
541
+ vfix = child.y0 - xt.y0
542
+ vstk.append(child)
543
+ # 更新段落边界,因为段落内换行之后可能是公式开头,所以要在外边处理
544
+ pstk[-1][2] = min(pstk[-1][2], child.x0)
545
+ pstk[-1][3] = max(pstk[-1][3], child.x1)
546
+ # 更新上一个字符
547
+ xt = child
548
+ xt_cls = cls
549
+ elif isinstance(child, LTFigure): # 图表
550
+ pass
551
+ elif isinstance(child, LTLine): # 线条
552
+ layout = self.layout[ltpage.pageid]
553
+ # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
554
+ h, w = layout.shape
555
+ # 读取当前线条在 layout 中的类别
556
+ cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
557
+ cls = layout[cy, cx]
558
+ if vstk and cls == xt_cls: # 公式线条
559
+ vlstk.append(child)
560
+ else: # 全局线条
561
+ lstk.append(child)
562
+ else:
563
+ pass
564
+ ptr += 1
565
+ # 处理结尾
566
+ if vstk: # 公式出栈
567
+ sstk[-1] += f"$v{len(var)}$"
568
+ var.append(vstk)
569
+ varl.append(vlstk)
570
+ varf.append(vfix)
571
+ log.debug("\n==========[VSTACK]==========\n")
572
+ for id, v in enumerate(var): # 计算公式宽度
573
+ l = max([vch.x1 for vch in v]) - v[0].x0
574
+ log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}')
575
+ vlen.append(l)
576
+
577
+ ############################################################
578
+ # B. 段落翻译
579
+ log.debug("\n==========[SSTACK]==========\n")
580
+ hash_key = cache.deterministic_hash("PDFMathTranslate")
581
+ cache.create_cache(hash_key)
582
+
583
+ @retry(wait=wait_fixed(1))
584
+ def worker(s): # 多线程翻译
585
+ try:
586
+ hash_key_paragraph = cache.deterministic_hash(
587
+ (s, str(self.translator))
588
+ )
589
+ new = cache.load_paragraph(hash_key, hash_key_paragraph) # 查询缓存
590
+ if new is None:
591
+ new = self.translator.translate(s)
592
+ new = remove_control_characters(new)
593
+ cache.write_paragraph(hash_key, hash_key_paragraph, new)
594
+ return new
595
+ except BaseException as e:
596
+ if log.isEnabledFor(logging.DEBUG):
597
+ log.exception(e)
598
+ else:
599
+ log.exception(e, exc_info=False)
600
+ raise e
601
+ with concurrent.futures.ThreadPoolExecutor(
602
+ max_workers=self.thread
603
+ ) as executor:
604
+ news = list(executor.map(worker, sstk))
605
+
606
+ ############################################################
607
+ # C. 新文档排版
608
+ def raw_string(fcur, cstk): # 编码字符串
609
+ if isinstance(self.fontmap[fcur], PDFCIDFont): # 判断编码长度
610
+ return "".join(["%04x" % ord(c) for c in cstk])
611
+ else:
612
+ return "".join(["%02x" % ord(c) for c in cstk])
613
+
614
+ _x, _y = 0, 0
615
+ for id, new in enumerate(news):
616
+ tx = x = pstk[id][1] # 段落初始横坐标
617
+ y = pstk[id][0] # 段落上边界
618
+ lt = pstk[id][2] # 段落左边界
619
+ rt = pstk[id][3] # 段落右边界
620
+ size = pstk[id][4] # 段落字体大小
621
+ font = pstk[id][5] # 段落字体
622
+ lb = pstk[id][6] # 段落属性
623
+ cstk = "" # 当前文字栈
624
+ fcur = fcur_ = None # 当前字体
625
+ ptr = 0
626
+ log.debug(f"< {y} {x} {lt} {rt} {size} {font.fontname} {lb} > {sstk[id]} | {new}")
627
+ while True:
628
+ if ptr == len(new): # 到达段落结尾
629
+ if cstk:
630
+ ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
631
+ break
632
+ vy_regex = re.match(
633
+ r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
634
+ ) # 匹配 $vn$ 公式标记,前面的 $ 有的时候会被丢掉
635
+ mod = False # 当前公式是否为文字修饰符
636
+ if vy_regex: # 加载公式
637
+ ptr += len(vy_regex.group(0))
638
+ try:
639
+ vid = int(vy_regex.group(1).replace(" ", ""))
640
+ adv = vlen[vid]
641
+ except Exception:
642
+ continue # 翻译器可能会自动补个越界的公式标记
643
+ if len(var[vid]) == 1 and unicodedata.category(var[vid][0].get_text()[0]) in ["Lm", "Mn", "Sk"]: # 文字修饰符
644
+ mod = True
645
+ else: # 加载文字
646
+ ch = new[ptr]
647
+ # if font.char_width(ord(ch)):
648
+ fcur_ = None
649
+ # 原字体编码容易出问题,这里直接放弃掉
650
+ # try:
651
+ # if font.widths.get(ord(ch)) and font.to_unichr(ord(ch))==ch:
652
+ # fcur_=self.fontid[font] # 原字体
653
+ # except:
654
+ # pass
655
+ try:
656
+ if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
657
+ fcur_ = "tiro" # 默认英文字体
658
+ except Exception:
659
+ pass
660
+ if fcur_ is None:
661
+ fcur_ = "china-ss" # 默认中文字体
662
+ # print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
663
+ adv = self.fontmap[fcur_].char_width(ord(ch)) * size
664
+ ptr += 1
665
+ if ( # 输出文字缓冲区
666
+ fcur_ != fcur # 1. 字体更新
667
+ or vy_regex # 2. 插入公式
668
+ or x + adv > rt + 0.1 * size # 3. 到达右边界(可能一整行都被符号化,这里需要考虑浮点误差)
669
+ ):
670
+ if cstk:
671
+ ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
672
+ cstk = ""
673
+ if lb and x + adv > rt + 0.1 * size: # 到达右边界且原文段落存在换行
674
+ x = lt
675
+ lang_space = {"zh-CN": 1.4, "zh-TW": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2} # CJK
676
+ y -= size * lang_space.get(self.translator.lang_out, 1.1) # 小语种大多适配 1.1
677
+ if vy_regex: # 插入公式
678
+ fix = 0
679
+ if fcur is not None: # 段落内公式修正纵向偏移
680
+ fix = varf[vid]
681
+ for vch in var[vid]: # 排版公式字符
682
+ vc = chr(vch.cid)
683
+ ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ "
684
+ if log.isEnabledFor(logging.DEBUG):
685
+ lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
686
+ _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
687
+ for l in varl[vid]: # 排版公式线条
688
+ if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景
689
+ ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
690
+ else: # 插入文字缓冲区
691
+ if not cstk: # 单行开头
692
+ tx = x
693
+ if x == lt and ch == " ": # 消除段落换行空格
694
+ adv = 0
695
+ else:
696
+ cstk += ch
697
+ else:
698
+ cstk += ch
699
+ if mod: # 文字修饰符
700
+ adv = 0
701
+ fcur = fcur_
702
+ x += adv
703
+ if log.isEnabledFor(logging.DEBUG):
704
+ lstk.append(LTLine(0.1, (_x, _y), (x, y)))
705
+ _x, _y = x, y
706
+ for l in lstk: # 排版全局线条
707
+ if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景
708
+ ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
709
+ ops = f"BT {ops}ET "
710
+ return ops
711
+
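During parsing, each run of formula characters is collapsed into a $v{n}$ placeholder so the translator only sees plain text; during typesetting the placeholders are matched again and the stored glyphs, lines and vertical offsets are re-emitted. A toy illustration of the round trip (a sketch; the real code keeps LTChar objects in var rather than strings):

    import re

    paragraph = "the energy is $v0$ where $v1$ denotes the mass"   # text sent to the translator
    translated = "能量为 $v0$,其中 $v1$ 表示质量"                   # text coming back
    for m in re.finditer(r"\$?\s*v([\d\s]+)\$", translated):
        vid = int(m.group(1).replace(" ", ""))   # index into var / varl / varf / vlen
        # the layout pass re-inserts var[vid] at the current x position, shifted by varf[vid]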
712
+ # Some dummy functions to save memory/CPU when all that is wanted
713
+ # is text. This stops all the image and drawing output from being
714
+ # recorded and taking up RAM.
715
+ def render_image(self, name: str, stream: PDFStream) -> None:
716
+ if self.imagewriter is not None:
717
+ PDFConverter.render_image(self, name, stream)
718
+
719
+ # def paint_path(
720
+ # self,
721
+ # gstate: PDFGraphicState,
722
+ # stroke: bool,
723
+ # fill: bool,
724
+ # evenodd: bool,
725
+ # path: Sequence[PathSegment],
726
+ # ) -> None:
727
+ # pass
728
+
729
+
730
+ class HTMLConverter(PDFConverter[AnyIO]):
731
+ RECT_COLORS = {
732
+ "figure": "yellow",
733
+ "textline": "magenta",
734
+ "textbox": "cyan",
735
+ "textgroup": "red",
736
+ "curve": "black",
737
+ "page": "gray",
738
+ }
739
+
740
+ TEXT_COLORS = {
741
+ "textbox": "blue",
742
+ "char": "black",
743
+ }
744
+
745
+ def __init__(
746
+ self,
747
+ rsrcmgr: PDFResourceManager,
748
+ outfp: AnyIO,
749
+ codec: str = "utf-8",
750
+ pageno: int = 1,
751
+ laparams: Optional[LAParams] = None,
752
+ scale: float = 1,
753
+ fontscale: float = 1.0,
754
+ layoutmode: str = "normal",
755
+ showpageno: bool = True,
756
+ pagemargin: int = 50,
757
+ imagewriter: Optional[ImageWriter] = None,
758
+ debug: int = 0,
759
+ rect_colors: Optional[Dict[str, str]] = None,
760
+ text_colors: Optional[Dict[str, str]] = None,
761
+ ) -> None:
762
+ PDFConverter.__init__(
763
+ self,
764
+ rsrcmgr,
765
+ outfp,
766
+ codec=codec,
767
+ pageno=pageno,
768
+ laparams=laparams,
769
+ )
770
+
771
+ # write() assumes a codec for binary I/O, or no codec for text I/O.
772
+ if self.outfp_binary and not self.codec:
773
+ raise PDFValueError("Codec is required for a binary I/O output")
774
+ if not self.outfp_binary and self.codec:
775
+ raise PDFValueError("Codec must not be specified for a text I/O output")
776
+
777
+ if text_colors is None:
778
+ text_colors = {"char": "black"}
779
+ if rect_colors is None:
780
+ rect_colors = {"curve": "black", "page": "gray"}
781
+
782
+ self.scale = scale
783
+ self.fontscale = fontscale
784
+ self.layoutmode = layoutmode
785
+ self.showpageno = showpageno
786
+ self.pagemargin = pagemargin
787
+ self.imagewriter = imagewriter
788
+ self.rect_colors = rect_colors
789
+ self.text_colors = text_colors
790
+ if debug:
791
+ self.rect_colors.update(self.RECT_COLORS)
792
+ self.text_colors.update(self.TEXT_COLORS)
793
+ self._yoffset: float = self.pagemargin
794
+ self._font: Optional[Tuple[str, float]] = None
795
+ self._fontstack: List[Optional[Tuple[str, float]]] = []
796
+ self.write_header()
797
+
798
+ def write(self, text: str) -> None:
799
+ if self.codec:
800
+ cast(BinaryIO, self.outfp).write(text.encode(self.codec))
801
+ else:
802
+ cast(TextIO, self.outfp).write(text)
803
+
804
+ def write_header(self) -> None:
805
+ self.write("<html><head>\n")
806
+ if self.codec:
807
+ s = (
808
+ '<meta http-equiv="Content-Type" content="text/html; '
809
+ 'charset=%s">\n' % self.codec
810
+ )
811
+ else:
812
+ s = '<meta http-equiv="Content-Type" content="text/html">\n'
813
+ self.write(s)
814
+ self.write("</head><body>\n")
815
+
816
+ def write_footer(self) -> None:
817
+ page_links = [f'<a href="#{i}">{i}</a>' for i in range(1, self.pageno)]
818
+ s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % ", ".join(
819
+ page_links,
820
+ )
821
+ self.write(s)
822
+ self.write("</body></html>\n")
823
+
824
+ def write_text(self, text: str) -> None:
825
+ self.write(enc(text))
826
+
827
+ def place_rect(
828
+ self,
829
+ color: str,
830
+ borderwidth: int,
831
+ x: float,
832
+ y: float,
833
+ w: float,
834
+ h: float,
835
+ ) -> None:
836
+ color2 = self.rect_colors.get(color)
837
+ if color2 is not None:
838
+ s = (
839
+ '<span style="position:absolute; border: %s %dpx solid; '
840
+ 'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n'
841
+ % (
842
+ color2,
843
+ borderwidth,
844
+ x * self.scale,
845
+ (self._yoffset - y) * self.scale,
846
+ w * self.scale,
847
+ h * self.scale,
848
+ )
849
+ )
850
+ self.write(s)
851
+
852
+ def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
853
+ self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)
854
+
855
+ def place_image(
856
+ self,
857
+ item: LTImage,
858
+ borderwidth: int,
859
+ x: float,
860
+ y: float,
861
+ w: float,
862
+ h: float,
863
+ ) -> None:
864
+ if self.imagewriter is not None:
865
+ name = self.imagewriter.export_image(item)
866
+ s = (
867
+ '<img src="%s" border="%d" style="position:absolute; '
868
+ 'left:%dpx; top:%dpx;" width="%d" height="%d" />\n'
869
+ % (
870
+ enc(name),
871
+ borderwidth,
872
+ x * self.scale,
873
+ (self._yoffset - y) * self.scale,
874
+ w * self.scale,
875
+ h * self.scale,
876
+ )
877
+ )
878
+ self.write(s)
879
+
880
+ def place_text(
881
+ self,
882
+ color: str,
883
+ text: str,
884
+ x: float,
885
+ y: float,
886
+ size: float,
887
+ ) -> None:
888
+ color2 = self.text_colors.get(color)
889
+ if color2 is not None:
890
+ s = (
891
+ '<span style="position:absolute; color:%s; left:%dpx; '
892
+ 'top:%dpx; font-size:%dpx;">'
893
+ % (
894
+ color2,
895
+ x * self.scale,
896
+ (self._yoffset - y) * self.scale,
897
+ size * self.scale * self.fontscale,
898
+ )
899
+ )
900
+ self.write(s)
901
+ self.write_text(text)
902
+ self.write("</span>\n")
903
+
904
+ def begin_div(
905
+ self,
906
+ color: str,
907
+ borderwidth: int,
908
+ x: float,
909
+ y: float,
910
+ w: float,
911
+ h: float,
912
+ writing_mode: str = "False",
913
+ ) -> None:
914
+ self._fontstack.append(self._font)
915
+ self._font = None
916
+ s = (
917
+ '<div style="position:absolute; border: %s %dpx solid; '
918
+ "writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; "
919
+ 'height:%dpx;">'
920
+ % (
921
+ color,
922
+ borderwidth,
923
+ writing_mode,
924
+ x * self.scale,
925
+ (self._yoffset - y) * self.scale,
926
+ w * self.scale,
927
+ h * self.scale,
928
+ )
929
+ )
930
+ self.write(s)
931
+
932
+ def end_div(self, color: str) -> None:
933
+ if self._font is not None:
934
+ self.write("</span>")
935
+ self._font = self._fontstack.pop()
936
+ self.write("</div>")
937
+
938
+ def put_text(self, text: str, fontname: str, fontsize: float) -> None:
939
+ font = (fontname, fontsize)
940
+ if font != self._font:
941
+ if self._font is not None:
942
+ self.write("</span>")
943
+ # Remove subset tag from fontname, see PDF Reference 5.5.3
944
+ fontname_without_subset_tag = fontname.split("+")[-1]
945
+ self.write(
946
+ '<span style="font-family: %s; font-size:%dpx">'
947
+ % (fontname_without_subset_tag, fontsize * self.scale * self.fontscale),
948
+ )
949
+ self._font = font
950
+ self.write_text(text)
951
+
952
+ def put_newline(self) -> None:
953
+ self.write("<br>")
954
+
955
+ def receive_layout(self, ltpage: LTPage) -> None:
956
+ def show_group(item: Union[LTTextGroup, TextGroupElement]) -> None:
957
+ if isinstance(item, LTTextGroup):
958
+ self.place_border("textgroup", 1, item)
959
+ for child in item:
960
+ show_group(child)
961
+
962
+ def render(item: LTItem) -> None:
963
+ child: LTItem
964
+ if isinstance(item, LTPage):
965
+ self._yoffset += item.y1
966
+ self.place_border("page", 1, item)
967
+ if self.showpageno:
968
+ self.write(
969
+ '<div style="position:absolute; top:%dpx;">'
970
+ % ((self._yoffset - item.y1) * self.scale),
971
+ )
972
+ self.write(
973
+ f'<a name="{item.pageid}">Page {item.pageid}</a></div>\n',
974
+ )
975
+ for child in item:
976
+ render(child)
977
+ if item.groups is not None:
978
+ for group in item.groups:
979
+ show_group(group)
980
+ elif isinstance(item, LTCurve):
981
+ self.place_border("curve", 1, item)
982
+ elif isinstance(item, LTFigure):
983
+ self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
984
+ for child in item:
985
+ render(child)
986
+ self.end_div("figure")
987
+ elif isinstance(item, LTImage):
988
+ self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
989
+ elif self.layoutmode == "exact":
990
+ if isinstance(item, LTTextLine):
991
+ self.place_border("textline", 1, item)
992
+ for child in item:
993
+ render(child)
994
+ elif isinstance(item, LTTextBox):
995
+ self.place_border("textbox", 1, item)
996
+ self.place_text(
997
+ "textbox",
998
+ str(item.index + 1),
999
+ item.x0,
1000
+ item.y1,
1001
+ 20,
1002
+ )
1003
+ for child in item:
1004
+ render(child)
1005
+ elif isinstance(item, LTChar):
1006
+ self.place_border("char", 1, item)
1007
+ self.place_text(
1008
+ "char",
1009
+ item.get_text(),
1010
+ item.x0,
1011
+ item.y1,
1012
+ item.size,
1013
+ )
1014
+ elif isinstance(item, LTTextLine):
1015
+ for child in item:
1016
+ render(child)
1017
+ if self.layoutmode != "loose":
1018
+ self.put_newline()
1019
+ elif isinstance(item, LTTextBox):
1020
+ self.begin_div(
1021
+ "textbox",
1022
+ 1,
1023
+ item.x0,
1024
+ item.y1,
1025
+ item.width,
1026
+ item.height,
1027
+ item.get_writing_mode(),
1028
+ )
1029
+ for child in item:
1030
+ render(child)
1031
+ self.end_div("textbox")
1032
+ elif isinstance(item, LTChar):
1033
+ fontname = make_compat_str(item.fontname)
1034
+ self.put_text(item.get_text(), fontname, item.size)
1035
+ elif isinstance(item, LTText):
1036
+ self.write_text(item.get_text())
1037
+
1038
+ render(ltpage)
1039
+ self._yoffset += self.pagemargin
1040
+
1041
+ def close(self) -> None:
1042
+ self.write_footer()
1043
+
1044
+
1045
+ class XMLConverter(PDFConverter[AnyIO]):
1046
+ CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]")
1047
+
1048
+ def __init__(
1049
+ self,
1050
+ rsrcmgr: PDFResourceManager,
1051
+ outfp: AnyIO,
1052
+ codec: str = "utf-8",
1053
+ pageno: int = 1,
1054
+ laparams: Optional[LAParams] = None,
1055
+ imagewriter: Optional[ImageWriter] = None,
1056
+ stripcontrol: bool = False,
1057
+ ) -> None:
1058
+ PDFConverter.__init__(
1059
+ self,
1060
+ rsrcmgr,
1061
+ outfp,
1062
+ codec=codec,
1063
+ pageno=pageno,
1064
+ laparams=laparams,
1065
+ )
1066
+
1067
+ # write() assumes a codec for binary I/O, or no codec for text I/O.
1068
+ if self.outfp_binary == (not self.codec):
1069
+ raise PDFValueError("Codec is required for a binary I/O output")
1070
+
1071
+ self.imagewriter = imagewriter
1072
+ self.stripcontrol = stripcontrol
1073
+ self.write_header()
1074
+
1075
+ def write(self, text: str) -> None:
1076
+ if self.codec:
1077
+ cast(BinaryIO, self.outfp).write(text.encode(self.codec))
1078
+ else:
1079
+ cast(TextIO, self.outfp).write(text)
1080
+
1081
+ def write_header(self) -> None:
1082
+ if self.codec:
1083
+ self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
1084
+ else:
1085
+ self.write('<?xml version="1.0" ?>\n')
1086
+ self.write("<pages>\n")
1087
+
1088
+ def write_footer(self) -> None:
1089
+ self.write("</pages>\n")
1090
+
1091
+ def write_text(self, text: str) -> None:
1092
+ if self.stripcontrol:
1093
+ text = self.CONTROL.sub("", text)
1094
+ self.write(enc(text))
1095
+
1096
+ def receive_layout(self, ltpage: LTPage) -> None:
1097
+ def show_group(item: LTItem) -> None:
1098
+ if isinstance(item, LTTextBox):
1099
+ self.write(
1100
+ '<textbox id="%d" bbox="%s" />\n'
1101
+ % (item.index, bbox2str(item.bbox)),
1102
+ )
1103
+ elif isinstance(item, LTTextGroup):
1104
+ self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
1105
+ for child in item:
1106
+ show_group(child)
1107
+ self.write("</textgroup>\n")
1108
+
1109
+ def render(item: LTItem) -> None:
1110
+ child: LTItem
1111
+ if isinstance(item, LTPage):
1112
+ s = '<page id="%s" bbox="%s" rotate="%d">\n' % (
1113
+ item.pageid,
1114
+ bbox2str(item.bbox),
1115
+ item.rotate,
1116
+ )
1117
+ self.write(s)
1118
+ for child in item:
1119
+ render(child)
1120
+ if item.groups is not None:
1121
+ self.write("<layout>\n")
1122
+ for group in item.groups:
1123
+ show_group(group)
1124
+ self.write("</layout>\n")
1125
+ self.write("</page>\n")
1126
+ elif isinstance(item, LTLine):
1127
+ s = '<line linewidth="%d" bbox="%s" />\n' % (
1128
+ item.linewidth,
1129
+ bbox2str(item.bbox),
1130
+ )
1131
+ self.write(s)
1132
+ elif isinstance(item, LTRect):
1133
+ s = '<rect linewidth="%d" bbox="%s" />\n' % (
1134
+ item.linewidth,
1135
+ bbox2str(item.bbox),
1136
+ )
1137
+ self.write(s)
1138
+ elif isinstance(item, LTCurve):
1139
+ s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % (
1140
+ item.linewidth,
1141
+ bbox2str(item.bbox),
1142
+ item.get_pts(),
1143
+ )
1144
+ self.write(s)
1145
+ elif isinstance(item, LTFigure):
1146
+ s = f'<figure name="{item.name}" bbox="{bbox2str(item.bbox)}">\n'
1147
+ self.write(s)
1148
+ for child in item:
1149
+ render(child)
1150
+ self.write("</figure>\n")
1151
+ elif isinstance(item, LTTextLine):
1152
+ self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
1153
+ for child in item:
1154
+ render(child)
1155
+ self.write("</textline>\n")
1156
+ elif isinstance(item, LTTextBox):
1157
+ wmode = ""
1158
+ if isinstance(item, LTTextBoxVertical):
1159
+ wmode = ' wmode="vertical"'
1160
+ s = '<textbox id="%d" bbox="%s"%s>\n' % (
1161
+ item.index,
1162
+ bbox2str(item.bbox),
1163
+ wmode,
1164
+ )
1165
+ self.write(s)
1166
+ for child in item:
1167
+ render(child)
1168
+ self.write("</textbox>\n")
1169
+ elif isinstance(item, LTChar):
1170
+ s = (
1171
+ '<text font="%s" bbox="%s" colourspace="%s" '
1172
+ 'ncolour="%s" size="%.3f">'
1173
+ % (
1174
+ enc(item.fontname),
1175
+ bbox2str(item.bbox),
1176
+ item.ncs.name,
1177
+ item.graphicstate.ncolor,
1178
+ item.size,
1179
+ )
1180
+ )
1181
+ self.write(s)
1182
+ self.write_text(item.get_text())
1183
+ self.write("</text>\n")
1184
+ elif isinstance(item, LTText):
1185
+ self.write("<text>%s</text>\n" % item.get_text())
1186
+ elif isinstance(item, LTImage):
1187
+ if self.imagewriter is not None:
1188
+ name = self.imagewriter.export_image(item)
1189
+ self.write(
1190
+ '<image src="%s" width="%d" height="%d" />\n'
1191
+ % (enc(name), item.width, item.height),
1192
+ )
1193
+ else:
1194
+ self.write(
1195
+ '<image width="%d" height="%d" />\n'
1196
+ % (item.width, item.height),
1197
+ )
1198
+ else:
1199
+ assert False, str(("Unhandled", item))
1200
+
1201
+ render(ltpage)
1202
+
1203
+ def close(self) -> None:
1204
+ self.write_footer()
1205
+
1206
+
1207
+ class HOCRConverter(PDFConverter[AnyIO]):
1208
+ """Extract an hOCR representation from explicit text information within a PDF."""
1209
+
1210
+ # Where text is being extracted from a variety of types of PDF within a
1211
+ # business process, those PDFs where the text is only present in image
1212
+ # form will need to be analysed using an OCR tool which will typically
1213
+ # output hOCR. This converter extracts the explicit text information from
1214
+ # those PDFs that do have it and uses it to generate a basic hOCR
1215
+ # representation that is designed to be used in conjunction with the image
1216
+ # of the PDF in the same way as genuine OCR output would be, but without the
1217
+ # inevitable OCR errors.
1218
+
1219
+ # The converter does not handle images, diagrams or text colors.
1220
+
1221
+ # In the examples processed by the contributor it was necessary to set
1222
+ # LAParams.all_texts to True.
1223
+
1224
+ CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")
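A minimal sketch of driving this converter as the note above suggests, with LAParams.all_texts enabled (the file paths are hypothetical):

    from pdf2zh.layout import LAParams
    from pdf2zh.pdfinterp import PDFPageInterpreter, PDFResourceManager
    from pdf2zh.pdfpage import PDFPage

    rsrcmgr = PDFResourceManager()
    with open("input.pdf", "rb") as inf, open("output.hocr.html", "wb") as outf:
        device = HOCRConverter(rsrcmgr, outf, codec="utf8", laparams=LAParams(all_texts=True))
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(inf):
            interpreter.process_page(page)
        device.close()   # writes the hOCR footer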
1225
+
1226
+ def __init__(
1227
+ self,
1228
+ rsrcmgr: PDFResourceManager,
1229
+ outfp: AnyIO,
1230
+ codec: str = "utf8",
1231
+ pageno: int = 1,
1232
+ laparams: Optional[LAParams] = None,
1233
+ stripcontrol: bool = False,
1234
+ ):
1235
+ PDFConverter.__init__(
1236
+ self,
1237
+ rsrcmgr,
1238
+ outfp,
1239
+ codec=codec,
1240
+ pageno=pageno,
1241
+ laparams=laparams,
1242
+ )
1243
+ self.stripcontrol = stripcontrol
1244
+ self.within_chars = False
1245
+ self.write_header()
1246
+
1247
+ def bbox_repr(self, bbox: Rect) -> str:
1248
+ (in_x0, in_y0, in_x1, in_y1) = bbox
1249
+ # PDF y-coordinates are the other way round from hOCR coordinates
1250
+ out_x0 = int(in_x0)
1251
+ out_y0 = int(self.page_bbox[3] - in_y1)
1252
+ out_x1 = int(in_x1)
1253
+ out_y1 = int(self.page_bbox[3] - in_y0)
1254
+ return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"
1255
+
1256
+ def write(self, text: str) -> None:
1257
+ if self.codec:
1258
+ encoded_text = text.encode(self.codec)
1259
+ cast(BinaryIO, self.outfp).write(encoded_text)
1260
+ else:
1261
+ cast(TextIO, self.outfp).write(text)
1262
+
1263
+ def write_header(self) -> None:
1264
+ if self.codec:
1265
+ self.write(
1266
+ "<html xmlns='http://www.w3.org/1999/xhtml' "
1267
+ "xml:lang='en' lang='en' charset='%s'>\n" % self.codec,
1268
+ )
1269
+ else:
1270
+ self.write(
1271
+ "<html xmlns='http://www.w3.org/1999/xhtml' "
1272
+ "xml:lang='en' lang='en'>\n",
1273
+ )
1274
+ self.write("<head>\n")
1275
+ self.write("<title></title>\n")
1276
+ self.write(
1277
+ "<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />\n",
1278
+ )
1279
+ self.write(
1280
+ "<meta name='ocr-system' content='pdf2zh.six HOCR Converter' />\n",
1281
+ )
1282
+ self.write(
1283
+ " <meta name='ocr-capabilities'"
1284
+ " content='ocr_page ocr_block ocr_line ocrx_word'/>\n",
1285
+ )
1286
+ self.write("</head>\n")
1287
+ self.write("<body>\n")
1288
+
1289
+ def write_footer(self) -> None:
1290
+ self.write("<!-- comment in the following line to debug -->\n")
1291
+ self.write(
1292
+ "<!--script src='https://unpkg.com/hocrjs'></script--></body></html>\n",
1293
+ )
1294
+
1295
+ def write_text(self, text: str) -> None:
1296
+ if self.stripcontrol:
1297
+ text = self.CONTROL.sub("", text)
1298
+ self.write(text)
1299
+
1300
+ def write_word(self) -> None:
1301
+ if len(self.working_text) > 0:
1302
+ bold_and_italic_styles = ""
1303
+ if "Italic" in self.working_font:
1304
+ bold_and_italic_styles = "font-style: italic; "
1305
+ if "Bold" in self.working_font:
1306
+ bold_and_italic_styles += "font-weight: bold; "
1307
+ self.write(
1308
+ "<span style='font:\"%s\"; font-size:%d; %s' "
1309
+ "class='ocrx_word' title='%s; x_font %s; "
1310
+ "x_fsize %d'>%s</span>"
1311
+ % (
1312
+ (
1313
+ self.working_font,
1314
+ self.working_size,
1315
+ bold_and_italic_styles,
1316
+ self.bbox_repr(self.working_bbox),
1317
+ self.working_font,
1318
+ self.working_size,
1319
+ self.working_text.strip(),
1320
+ )
1321
+ ),
1322
+ )
1323
+ self.within_chars = False
1324
+
1325
+ def receive_layout(self, ltpage: LTPage) -> None:
1326
+ def render(item: LTItem) -> None:
1327
+ if self.within_chars and isinstance(item, LTAnno):
1328
+ self.write_word()
1329
+ if isinstance(item, LTPage):
1330
+ self.page_bbox = item.bbox
1331
+ self.write(
1332
+ "<div class='ocr_page' id='%s' title='%s'>\n"
1333
+ % (item.pageid, self.bbox_repr(item.bbox)),
1334
+ )
1335
+ for child in item:
1336
+ render(child)
1337
+ self.write("</div>\n")
1338
+ elif isinstance(item, LTTextLine):
1339
+ self.write(
1340
+ "<span class='ocr_line' title='%s'>" % (self.bbox_repr(item.bbox)),
1341
+ )
1342
+ for child_line in item:
1343
+ render(child_line)
1344
+ self.write("</span>\n")
1345
+ elif isinstance(item, LTTextBox):
1346
+ self.write(
1347
+ "<div class='ocr_block' id='%d' title='%s'>\n"
1348
+ % (item.index, self.bbox_repr(item.bbox)),
1349
+ )
1350
+ for child in item:
1351
+ render(child)
1352
+ self.write("</div>\n")
1353
+ elif isinstance(item, LTChar):
1354
+ if not self.within_chars:
1355
+ self.within_chars = True
1356
+ self.working_text = item.get_text()
1357
+ self.working_bbox = item.bbox
1358
+ self.working_font = item.fontname
1359
+ self.working_size = item.size
1360
+ elif len(item.get_text().strip()) == 0:
1361
+ self.write_word()
1362
+ self.write(item.get_text())
1363
+ else:
1364
+ if (
1365
+ self.working_bbox[1] != item.bbox[1]
1366
+ or self.working_font != item.fontname
1367
+ or self.working_size != item.size
1368
+ ):
1369
+ self.write_word()
1370
+ self.working_bbox = item.bbox
1371
+ self.working_font = item.fontname
1372
+ self.working_size = item.size
1373
+ self.working_text += item.get_text()
1374
+ self.working_bbox = (
1375
+ self.working_bbox[0],
1376
+ self.working_bbox[1],
1377
+ item.bbox[2],
1378
+ self.working_bbox[3],
1379
+ )
1380
+
1381
+ render(ltpage)
1382
+
1383
+ def close(self) -> None:
1384
+ self.write_footer()
pdf2zh/data_structures.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Iterable, List, Optional, Tuple
2
+
3
+ from pdf2zh import settings
4
+ from pdf2zh.pdfparser import PDFSyntaxError
5
+ from pdf2zh.pdftypes import dict_value, int_value, list_value
6
+ from pdf2zh.utils import choplist
7
+
8
+
9
+ class NumberTree:
10
+ """A PDF number tree.
11
+
12
+ See Section 3.8.6 of the PDF Reference.
13
+ """
14
+
15
+ def __init__(self, obj: Any):
16
+ self._obj = dict_value(obj)
17
+ self.nums: Optional[Iterable[Any]] = None
18
+ self.kids: Optional[Iterable[Any]] = None
19
+ self.limits: Optional[Iterable[Any]] = None
20
+
21
+ if "Nums" in self._obj:
22
+ self.nums = list_value(self._obj["Nums"])
23
+ if "Kids" in self._obj:
24
+ self.kids = list_value(self._obj["Kids"])
25
+ if "Limits" in self._obj:
26
+ self.limits = list_value(self._obj["Limits"])
27
+
28
+ def _parse(self) -> List[Tuple[int, Any]]:
29
+ items = []
30
+ if self.nums: # Leaf node
31
+ for k, v in choplist(2, self.nums):
32
+ items.append((int_value(k), v))
33
+
34
+ if self.kids: # Root or intermediate node
35
+ for child_ref in self.kids:
36
+ items += NumberTree(child_ref)._parse()
37
+
38
+ return items
39
+
40
+ values: List[Tuple[int, Any]] # workaround decorators unsupported by mypy
41
+
42
+ @property # type: ignore[no-redef,misc]
43
+ def values(self) -> List[Tuple[int, Any]]:
44
+ values = self._parse()
45
+
46
+ if settings.STRICT:
47
+ if not all(a[0] <= b[0] for a, b in zip(values, values[1:])):
48
+ raise PDFSyntaxError("Number tree elements are out of order")
49
+ else:
50
+ values.sort(key=lambda t: t[0])
51
+
52
+ return values
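The values property flattens the Nums arrays of the leaf nodes (recursing through Kids) into (key, value) pairs and sorts them unless settings.STRICT insists they were already ordered. A toy example with a plain dict standing in for a resolved PDF dictionary:

    tree = NumberTree({"Nums": [3, "third", 1, "first", 2, "second"]})
    print(tree.values)   # [(1, 'first'), (2, 'second'), (3, 'third')] when settings.STRICT is False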
pdf2zh/doclayout.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ import cv2
3
+ import numpy as np
4
+ import contextlib
5
+ from huggingface_hub import hf_hub_download
6
+
7
+
8
+ class DocLayoutModel(abc.ABC):
9
+ @staticmethod
10
+ def load_torch():
11
+ model = TorchModel.from_pretrained(
12
+ repo_id="juliozhao/DocLayout-YOLO-DocStructBench",
13
+ filename="doclayout_yolo_docstructbench_imgsz1024.pt",
14
+ )
15
+ return model
16
+
17
+ @staticmethod
18
+ def load_onnx():
19
+ model = OnnxModel.from_pretrained(
20
+ repo_id="wybxc/DocLayout-YOLO-DocStructBench-onnx",
21
+ filename="doclayout_yolo_docstructbench_imgsz1024.onnx",
22
+ )
23
+ return model
24
+
25
+ @staticmethod
26
+ def load_available():
27
+ with contextlib.suppress(ImportError):
28
+ return DocLayoutModel.load_torch()
29
+
30
+ with contextlib.suppress(ImportError):
31
+ return DocLayoutModel.load_onnx()
32
+
33
+ raise ImportError(
34
+ "Please install the `torch` or `onnx` feature to use the DocLayout model."
35
+ )
36
+
37
+ @property
38
+ @abc.abstractmethod
39
+ def stride(self) -> int:
40
+ """Stride of the model input."""
41
+ pass
42
+
43
+ @abc.abstractmethod
44
+ def predict(self, image, imgsz=1024, **kwargs) -> list:
45
+ """
46
+ Predict the layout of a document page.
47
+
48
+ Args:
49
+ image: The image of the document page.
50
+ imgsz: Resize the image to this size. Must be a multiple of the stride.
51
+ **kwargs: Additional arguments.
52
+ """
53
+ pass
54
+
55
+
56
+ class TorchModel(DocLayoutModel):
57
+ def __init__(self, model_path: str):
58
+ try:
59
+ import doclayout_yolo
60
+ except ImportError:
61
+ raise ImportError(
62
+ "Please install the `torch` feature to use the Torch model."
63
+ )
64
+
65
+ self.model_path = model_path
66
+ self.model = doclayout_yolo.YOLOv10(model_path)
67
+
68
+ @staticmethod
69
+ def from_pretrained(repo_id: str, filename: str):
70
+ pth = hf_hub_download(repo_id=repo_id, filename=filename)
71
+ return TorchModel(pth)
72
+
73
+ @property
74
+ def stride(self):
75
+ return 32
76
+
77
+ def predict(self, *args, **kwargs):
78
+ return self.model.predict(*args, **kwargs)
79
+
80
+
81
+ class YoloResult:
82
+ """Helper class to store detection results from ONNX model."""
83
+
84
+ def __init__(self, boxes, names):
85
+ self.boxes = [YoloBox(data=d) for d in boxes]
86
+ self.boxes.sort(key=lambda x: x.conf, reverse=True)
87
+ self.names = names
88
+
89
+
90
+ class YoloBox:
91
+ """Helper class to store detection results from ONNX model."""
92
+
93
+ def __init__(self, data):
94
+ self.xyxy = data[:4]
95
+ self.conf = data[-2]
96
+ self.cls = data[-1]
97
+
98
+
99
+ class OnnxModel(DocLayoutModel):
100
+ def __init__(self, model_path: str):
101
+ import ast
102
+
103
+ try:
104
+
105
+ import onnx
106
+ import onnxruntime
107
+ except ImportError:
108
+ raise ImportError(
109
+ "Please install the `onnx` feature to use the ONNX model."
110
+ )
111
+
112
+ self.model_path = model_path
113
+
114
+ model = onnx.load(model_path)
115
+ metadata = {d.key: d.value for d in model.metadata_props}
116
+ self._stride = ast.literal_eval(metadata["stride"])
117
+ self._names = ast.literal_eval(metadata["names"])
118
+
119
+ self.model = onnxruntime.InferenceSession(model.SerializeToString())
120
+
121
+ @staticmethod
122
+ def from_pretrained(repo_id: str, filename: str):
123
+ pth = hf_hub_download(repo_id=repo_id, filename=filename)
124
+ return OnnxModel(pth)
125
+
126
+ @property
127
+ def stride(self):
128
+ return self._stride
129
+
130
+ def resize_and_pad_image(self, image, new_shape):
131
+ """
132
+ Resize and pad the image to the specified size, ensuring dimensions are multiples of stride.
133
+
134
+ Parameters:
135
+ - image: Input image
136
+ - new_shape: Target size (integer or (height, width) tuple)
137
+ Padding is aligned to multiples of the model stride (self.stride).
138
+
139
+ Returns:
140
+ - Processed image
141
+ """
142
+ if isinstance(new_shape, int):
143
+ new_shape = (new_shape, new_shape)
144
+
145
+ h, w = image.shape[:2]
146
+ new_h, new_w = new_shape
147
+
148
+ # Calculate scaling ratio
149
+ r = min(new_h / h, new_w / w)
150
+ resized_h, resized_w = int(round(h * r)), int(round(w * r))
151
+
152
+ # Resize image
153
+ image = cv2.resize(
154
+ image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR
155
+ )
156
+
157
+ # Calculate padding size and align to stride multiple
158
+ pad_w = (new_w - resized_w) % self.stride
159
+ pad_h = (new_h - resized_h) % self.stride
160
+ top, bottom = pad_h // 2, pad_h - pad_h // 2
161
+ left, right = pad_w // 2, pad_w - pad_w // 2
162
+
163
+ # Add padding
164
+ image = cv2.copyMakeBorder(
165
+ image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
166
+ )
167
+
168
+ return image
169
+
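Worked example of the letterboxing arithmetic above: a 792x612 (height x width) page image resized toward new_shape=1024 with stride 32 gives r = min(1024/792, 1024/612) ≈ 1.293, so the resize produces 1024x791; pad_w = (1024 - 791) % 32 = 9 and pad_h = 0, so the padded result is 1024x800. Note that the modulo only pads up to the next multiple of the stride, so the output is a stride-aligned size close to the resized image, not necessarily the full 1024x1024 target.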
170
+ def scale_boxes(self, img1_shape, boxes, img0_shape):
171
+ """
172
+ Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
173
+ specified in (img1_shape) to the shape of a different image (img0_shape).
174
+
175
+ Args:
176
+ img1_shape (tuple): The shape of the image that the bounding boxes are for,
177
+ in the format of (height, width).
178
+ boxes (np.ndarray): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
179
+ img0_shape (tuple): the shape of the target image, in the format of (height, width).
180
+
181
+ Returns:
182
+ boxes (np.ndarray): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
183
+ """
184
+
185
+ # Calculate scaling ratio
186
+ gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])
187
+
188
+ # Calculate padding size
189
+ pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
190
+ pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)
191
+
192
+ # Remove padding and scale boxes
193
+ boxes[..., :4] = (boxes[..., :4] - [pad_x, pad_y, pad_x, pad_y]) / gain
194
+ return boxes
195
+
196
+ def predict(self, image, imgsz=1024, **kwargs):
197
+ # Preprocess input image
198
+ orig_h, orig_w = image.shape[:2]
199
+ pix = self.resize_and_pad_image(image, new_shape=imgsz)
200
+ pix = np.transpose(pix, (2, 0, 1)) # CHW
201
+ pix = np.expand_dims(pix, axis=0) # BCHW
202
+ pix = pix.astype(np.float32) / 255.0 # Normalize to [0, 1]
203
+ new_h, new_w = pix.shape[2:]
204
+
205
+ # Run inference
206
+ preds = self.model.run(None, {"images": pix})[0]
207
+
208
+ # Postprocess predictions
209
+ preds = preds[preds[..., 4] > 0.25]
210
+ preds[..., :4] = self.scale_boxes(
211
+ (new_h, new_w), preds[..., :4], (orig_h, orig_w)
212
+ )
213
+ return [YoloResult(boxes=preds, names=self._names)]
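A minimal sketch of how the layout model is intended to be used (shown against the ONNX result wrappers above; the page image is a stand-in for a rendered PDF page in HWC order, and names is assumed to map class ids to labels as stored in the model metadata):

    import numpy as np
    from pdf2zh.doclayout import DocLayoutModel

    model = DocLayoutModel.load_available()               # torch backend if installed, else onnx
    page_image = np.zeros((792, 612, 3), dtype=np.uint8)  # stand-in for a rendered page
    result = model.predict(page_image, imgsz=1024)[0]
    for box in result.boxes:                              # sorted by confidence, highest first
        x0, y0, x1, y1 = box.xyxy
        print(result.names[int(box.cls)], float(box.conf), (x0, y0, x1, y1))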
pdf2zh/encodingdb.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import re
3
+ from typing import Dict, Iterable, Optional, cast
4
+
5
+ from pdf2zh.glyphlist import glyphname2unicode
6
+ from pdf2zh.latin_enc import ENCODING
7
+ from pdf2zh.pdfexceptions import PDFKeyError
8
+ from pdf2zh.psparser import PSLiteral
9
+
10
+ HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")
11
+
12
+ log = logging.getLogger(__name__)
13
+
14
+
15
+ def name2unicode(name: str) -> str:
16
+ """Converts Adobe glyph names to Unicode numbers.
17
+
18
+ In contrast to the specification, this raises a KeyError instead of returning
19
+ an empty string when the key is unknown.
20
+ This way the caller must explicitly define what to do
21
+ when there is no match.
22
+
23
+ Reference:
24
+ https://github.com/adobe-type-tools/agl-specification#2-the-mapping
25
+
26
+ :returns: the corresponding unicode character if the name can be mapped,
27
+ otherwise raises a KeyError
28
+ """
29
+ if not isinstance(name, str):
30
+ raise PDFKeyError(
31
+ 'Could not convert unicode name "%s" to character because '
32
+ "it should be of type str but is of type %s" % (name, type(name)),
33
+ )
34
+
35
+ name = name.split(".")[0]
36
+ components = name.split("_")
37
+
38
+ if len(components) > 1:
39
+ return "".join(map(name2unicode, components))
40
+
41
+ elif name in glyphname2unicode:
42
+ return glyphname2unicode[name]
43
+
44
+ elif name.startswith("uni"):
45
+ name_without_uni = name.strip("uni")
46
+
47
+ if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0:
48
+ unicode_digits = [
49
+ int(name_without_uni[i : i + 4], base=16)
50
+ for i in range(0, len(name_without_uni), 4)
51
+ ]
52
+ for digit in unicode_digits:
53
+ raise_key_error_for_invalid_unicode(digit)
54
+ characters = map(chr, unicode_digits)
55
+ return "".join(characters)
56
+
57
+ elif name.startswith("u"):
58
+ name_without_u = name.strip("u")
59
+
60
+ if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6:
61
+ unicode_digit = int(name_without_u, base=16)
62
+ raise_key_error_for_invalid_unicode(unicode_digit)
63
+ return chr(unicode_digit)
64
+
65
+ raise PDFKeyError(
66
+ 'Could not convert unicode name "%s" to character because '
67
+ "it does not match specification" % name,
68
+ )
69
+
70
+
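A few concrete mappings the rules above produce (a sketch; the first two go through glyphname2unicode, the rest through the component, uniXXXX and uXXXX branches):

    name2unicode("one")          # '1'   (plain Adobe glyph name)
    name2unicode("one.sc")       # '1'   (the suffix after '.' is dropped first)
    name2unicode("f_i")          # 'fi'  (components joined on '_', each mapped separately)
    name2unicode("uni0041")      # 'A'   (uniXXXX: 4 hex digits per character)
    name2unicode("uni00410042")  # 'AB'  (several 4-digit groups are concatenated)
    name2unicode("u1F600")       # '\U0001f600' (uXXXX..: 4 to 6 hex digits)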
71
+ def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
72
+ """Unicode values should not be in the range D800 through DFFF because
73
+ that is used for surrogate pairs in UTF-16
74
+
75
+ :raises KeyError if unicode digit is invalid
76
+ """
77
+ if 55295 < unicode_digit < 57344:
78
+ raise PDFKeyError(
79
+ "Unicode digit %d is invalid because "
80
+ "it is in the range D800 through DFFF" % unicode_digit,
81
+ )
82
+
83
+
84
+ class EncodingDB:
85
+ std2unicode: Dict[int, str] = {}
86
+ mac2unicode: Dict[int, str] = {}
87
+ win2unicode: Dict[int, str] = {}
88
+ pdf2unicode: Dict[int, str] = {}
89
+ for name, std, mac, win, pdf in ENCODING:
90
+ c = name2unicode(name)
91
+ if std:
92
+ std2unicode[std] = c
93
+ if mac:
94
+ mac2unicode[mac] = c
95
+ if win:
96
+ win2unicode[win] = c
97
+ if pdf:
98
+ pdf2unicode[pdf] = c
99
+
100
+ encodings = {
101
+ "StandardEncoding": std2unicode,
102
+ "MacRomanEncoding": mac2unicode,
103
+ "WinAnsiEncoding": win2unicode,
104
+ "PDFDocEncoding": pdf2unicode,
105
+ }
106
+
107
+ @classmethod
108
+ def get_encoding(
109
+ cls,
110
+ name: str,
111
+ diff: Optional[Iterable[object]] = None,
112
+ ) -> Dict[int, str]:
113
+ cid2unicode = cls.encodings.get(name, cls.std2unicode)
114
+ if diff:
115
+ cid2unicode = cid2unicode.copy()
116
+ cid = 0
117
+ for x in diff:
118
+ if isinstance(x, int):
119
+ cid = x
120
+ elif isinstance(x, PSLiteral):
121
+ try:
122
+ cid2unicode[cid] = name2unicode(cast(str, x.name))
123
+ except (KeyError, ValueError):
124
+ # log.debug(str(e))
125
+ pass
126
+ cid += 1
127
+ return cid2unicode
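The diff argument mirrors a PDF /Differences array: an integer sets the next character code, and each following name literal is assigned to consecutive codes. A sketch (PSLiteral comes from pdf2zh.psparser, imported above; real callers pass literals parsed out of the font's /Encoding dictionary):

    from pdf2zh.psparser import PSLiteral

    enc = EncodingDB.get_encoding(
        "WinAnsiEncoding",
        diff=[65, PSLiteral("alpha"), PSLiteral("beta")],   # /Differences [65 /alpha /beta]
    )
    print(enc[65], enc[66])   # 'α' 'β' -- codes 65 and 66 now map to the Greek letters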
pdf2zh/fontmetrics.py ADDED
The diff for this file is too large to render. See raw diff
 
pdf2zh/glyphlist.py ADDED
The diff for this file is too large to render. See raw diff
 
pdf2zh/gui.py ADDED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ from pathlib import Path
4
+ from pdf2zh import __version__
5
+ from pdf2zh.pdf2zh import extract_text
6
+
7
+ import gradio as gr
8
+ import numpy as np
9
+ import pymupdf
10
+ import tqdm
11
+ import requests
12
+
13
+ # Map service names to pdf2zh service options
14
+ service_map = {
15
+ "Google": "google",
16
+ "DeepL": "deepl",
17
+ "DeepLX": "deeplx",
18
+ "Ollama": "ollama",
19
+ "OpenAI": "openai",
20
+ "Azure": "azure",
21
+ }
22
+ lang_map = {
23
+ "Chinese": "zh",
24
+ "English": "en",
25
+ "French": "fr",
26
+ "German": "de",
27
+ "Japanese": "ja",
28
+ "Korean": "ko",
29
+ "Russian": "ru",
30
+ "Spanish": "es",
31
+ "Italian": "it",
32
+ }
33
+ page_map = {
34
+ "All": None,
35
+ "First": [0],
36
+ "First 5 pages": list(range(0, 5)),
37
+ }
38
+
39
+ flag_demo = False
40
+ if os.environ.get("PDF2ZH_DEMO"):
41
+ flag_demo = True
42
+ service_map = {
43
+ "Google": "google",
44
+ }
45
+ page_map = {
46
+ "First": [0],
47
+ "First 20 pages": list(range(0, 20)),
48
+ }
49
+ client_key = os.environ.get("PDF2ZH_CLIENT_KEY")
50
+ server_key = os.environ.get("PDF2ZH_SERVER_KEY")
51
+
52
+
53
+ def verify_recaptcha(response):
54
+ recaptcha_url = "https://www.google.com/recaptcha/api/siteverify"
55
+
56
+ print("reCAPTCHA", server_key, response)
57
+
58
+ data = {"secret": server_key, "response": response}
59
+ result = requests.post(recaptcha_url, data=data).json()
60
+
61
+ print("reCAPTCHA", result.get("success"))
62
+
63
+ return result.get("success")
64
+
65
+
66
+ def pdf_preview(file):
67
+ doc = pymupdf.open(file)
68
+ page = doc[0]
69
+ pix = page.get_pixmap()
70
+ image = np.frombuffer(pix.samples, np.uint8).reshape(pix.height, pix.width, 3)
71
+ return image
72
+
73
+
74
+ def upload_file(file, service, progress=gr.Progress()):
75
+ """Handle file upload, validation, and initial preview."""
76
+ if not file or not os.path.exists(file):
77
+ return None, None
78
+
79
+ try:
80
+ # Convert first page for preview
81
+ preview_image = pdf_preview(file)
82
+
83
+ return file, preview_image
84
+ except Exception as e:
85
+ print(f"Error converting PDF: {e}")
86
+ return None, None
87
+
88
+
89
+ def translate(
90
+ file_path,
91
+ service,
92
+ model_id,
93
+ lang,
94
+ page_range,
95
+ recaptcha_response,
96
+ progress=gr.Progress(),
97
+ ):
98
+ """Translate PDF content using selected service."""
99
+ if not file_path:
100
+ raise gr.Error("No input")
101
+
102
+ if flag_demo and not verify_recaptcha(recaptcha_response):
103
+ raise gr.Error("reCAPTCHA fail")
104
+
105
+ progress(0, desc="Starting translation...")
106
+
107
+ output = Path("pdf2zh_files")
108
+ output.mkdir(parents=True, exist_ok=True)
109
+ filename = os.path.splitext(os.path.basename(file_path))[0]
110
+ file_en = output / f"{filename}.pdf"
111
+ file_zh = output / f"{filename}-zh.pdf"
112
+ file_dual = output / f"{filename}-dual.pdf"
113
+ shutil.copyfile(file_path, file_en)
114
+
115
+ selected_service = service_map.get(service, "google")
116
+ selected_page = page_map.get(page_range, [0])
117
+ lang_to = lang_map.get(lang, "zh")
118
+ if selected_service == "google":
119
+ lang_to = "zh-CN" if lang_to == "zh" else lang_to
120
+
121
+ print(f"Files before translation: {os.listdir(output)}")
122
+
123
+ def progress_bar(t: tqdm.tqdm):
124
+ progress(t.n / t.total, desc="Translating...")
125
+
126
+ param = {
127
+ "files": [file_en],
128
+ "pages": selected_page,
129
+ "lang_in": "auto",
130
+ "lang_out": lang_to,
131
+ "service": f"{selected_service}:{model_id}",
132
+ "output": output,
133
+ "thread": 4,
134
+ "callback": progress_bar,
135
+ }
136
+ print(param)
137
+ extract_text(**param)
138
+ print(f"Files after translation: {os.listdir(output)}")
139
+
140
+ if not file_zh.exists() or not file_dual.exists():
141
+ raise gr.Error("No output")
142
+
143
+ try:
144
+ translated_preview = pdf_preview(str(file_zh))
145
+ except Exception:
146
+ raise gr.Error("No preview")
147
+
148
+ progress(1.0, desc="Translation complete!")
149
+
150
+ return (
151
+ str(file_zh),
152
+ translated_preview,
153
+ str(file_dual),
154
+ gr.update(visible=True),
155
+ gr.update(visible=True),
156
+ gr.update(visible=True),
157
+ )
158
+
159
+
160
+ # Global setup
161
+ custom_blue = gr.themes.Color(
162
+ c50="#E8F3FF",
163
+ c100="#BEDAFF",
164
+ c200="#94BFFF",
165
+ c300="#6AA1FF",
166
+ c400="#4080FF",
167
+ c500="#165DFF", # Primary color
168
+ c600="#0E42D2",
169
+ c700="#0A2BA6",
170
+ c800="#061D79",
171
+ c900="#03114D",
172
+ c950="#020B33",
173
+ )
174
+
175
+ with gr.Blocks(
176
+ title="PDFMathTranslate - PDF Translation with preserved formats",
177
+ theme=gr.themes.Default(
178
+ primary_hue=custom_blue, spacing_size="md", radius_size="lg"
179
+ ),
180
+ css="""
181
+ .secondary-text {color: #999 !important;}
182
+ footer {visibility: hidden}
183
+ .env-warning {color: #dd5500 !important;}
184
+ .env-success {color: #559900 !important;}
185
+
186
+ /* Add dashed border to input-file class */
187
+ .input-file {
188
+ border: 1.2px dashed #165DFF !important;
189
+ border-radius: 6px !important;
190
+ # background-color: #ffffff !important;
191
+ transition: background-color 0.4s ease-out;
192
+ }
193
+
194
+ .input-file:hover {
195
+ border: 1.2px dashed #165DFF !important;
196
+ border-radius: 6px !important;
197
+ color: #165DFF !important;
198
+ background-color: #E8F3FF !important;
199
+ transition: background-color 0.2s ease-in;
200
+ }
201
+
202
+ .progress-bar-wrap {
203
+ border-radius: 8px !important;
204
+ }
205
+ .progress-bar {
206
+ border-radius: 8px !important;
207
+ }
208
+
209
+ # .input-file label {
210
+ # color: #165DFF !important;
211
+ # border: 1.2px dashed #165DFF !important;
212
+ # border-left: none !important;
213
+ # border-top: none !important;
214
+ # }
215
+ # .input-file .wrap {
216
+ # color: #165DFF !important;
217
+ # }
218
+ # .input-file .or {
219
+ # color: #165DFF !important;
220
+ # }
221
+ """,
222
+ head=(
223
+ """
224
+ <script src="https://www.google.com/recaptcha/api.js?render=explicit" async defer></script>
225
+ <script type="text/javascript">
226
+ var onVerify = function(token) {
227
+ el=document.getElementById('verify').getElementsByTagName('textarea')[0];
228
+ el.value=token;
229
+ el.dispatchEvent(new Event('input'));
230
+ };
231
+ </script>
232
+ """
233
+ if flag_demo
234
+ else ""
235
+ ),
236
+ ) as demo:
237
+ gr.Markdown(
238
+ "# [PDFMathTranslate @ GitHub](https://github.com/Byaidu/PDFMathTranslate)"
239
+ )
240
+
241
+ with gr.Row():
242
+ with gr.Column(scale=1):
243
+ gr.Markdown("## File | < 5 MB" if flag_demo else "## File")
244
+ file_input = gr.File(
245
+ label="Document",
246
+ file_count="single",
247
+ file_types=[".pdf"],
248
+ type="filepath",
249
+ elem_classes=["input-file"],
250
+ )
251
+ gr.Markdown("## Option")
252
+ service = gr.Dropdown(
253
+ label="Service",
254
+ info="Which translation service to use. Some require keys",
255
+ choices=service_map.keys(),
256
+ value="Google",
257
+ )
258
+ lang_to = gr.Dropdown(
259
+ label="Translate to",
260
+ info="Which language to translate to (optional)",
261
+ choices=lang_map.keys(),
262
+ value="Chinese",
263
+ )
264
+ page_range = gr.Radio(
265
+ choices=page_map.keys(),
266
+ label="Pages",
267
+ info="Translate the full document or just few pages (optional)",
268
+ value=list(page_map.keys())[0],
269
+ )
270
+ model_id = gr.Textbox(
271
+ label="Model ID",
272
+ info="Please enter the identifier of the model you wish to use (e.g., gemma2). "
273
+ "This identifier will be used to specify the particular model for translation.",
274
+ # value="gemma2",
275
+ visible=False, # hide by default
276
+ )
277
+ envs_status = "<span class='env-success'>- Properly configured.</span><br>"
278
+
279
+ def details_wrapper(text_markdown):
280
+ text = f"""
281
+ <details>
282
+ <summary>Technical details</summary>
283
+ {text_markdown}
284
+ - GitHub: <a href="https://github.com/Byaidu/PDFMathTranslate">Byaidu/PDFMathTranslate</a><br>
285
+ - GUI by: <a href="https://github.com/reycn">Rongxin</a><br>
286
+ - Version: {__version__}
287
+ </details>"""
288
+ return text
289
+
290
+ def env_var_checker(env_var_name: str) -> str:
291
+ if (
292
+ not os.environ.get(env_var_name)
293
+ or os.environ.get(env_var_name) == ""
294
+ ):
295
+ envs_status = (
296
+ f"<span class='env-warning'>- Warning: environmental not found or error ({env_var_name})."
297
+ + "</span><br>- Please make sure that the environment variables are properly configured "
298
+ + "(<a href='https://github.com/Byaidu/PDFMathTranslate'>guide</a>).<br>"
299
+ )
300
+ else:
301
+ value = str(os.environ.get(env_var_name))
302
+ envs_status = (
303
+ "<span class='env-success'>- Properly configured.</span><br>"
304
+ )
305
+ if len(value) < 13:
306
+ envs_status += (
307
+ f"- Env: <code>{os.environ.get(env_var_name)}</code><br>"
308
+ )
309
+ else:
310
+ envs_status += f"- Env: <code>{value[:13]}***</code><br>"
311
+ return details_wrapper(envs_status)
312
+
313
+ def on_select_service(value, evt: gr.EventData):
314
+ # hide model id by default
315
+ model_visibility = gr.update(visible=False)
316
+ # add a text description
317
+ if value == "Google":
318
+ envs_status = details_wrapper(
319
+ "<span class='env-success'>- Properly configured.</span><br>"
320
+ )
321
+
322
+ elif value == "DeepL":
323
+ envs_status = env_var_checker("DEEPL_AUTH_KEY")
324
+ elif value == "DeepLX":
325
+ envs_status = env_var_checker("DEEPLX_AUTH_KEY")
326
+ elif value == "Azure":
327
+ envs_status = env_var_checker("AZURE_APIKEY")
328
+ elif value == "OpenAI":
329
+ model_visibility = gr.update(
330
+ visible=True, value="gpt-4o"
331
+ ) # show model id when service is selected
332
+ envs_status = env_var_checker("OPENAI_API_KEY")
333
+ elif value == "Ollama":
334
+ model_visibility = gr.update(
335
+ visible=True, value="gemma2"
336
+ ) # show model id when service is selected
337
+ envs_status = env_var_checker("OLLAMA_HOST")
338
+ else:
339
+ envs_status = (
340
+ "<span class='env-warning'>- Warning: model not in the list."
341
+ "</span><br>- Please report via "
342
+ "(<a href='https://github.com/Byaidu/PDFMathTranslate'>guide</a>).<br>"
343
+ )
344
+ return envs_status, model_visibility
345
+
346
+ output_title = gr.Markdown("## Translated", visible=False)
347
+ output_file = gr.File(label="Download Translation", visible=False)
348
+ output_file_dual = gr.File(
349
+ label="Download Translation (Dual)", visible=False
350
+ )
351
+ recaptcha_response = gr.Textbox(
352
+ label="reCAPTCHA Response", elem_id="verify", visible=False
353
+ )
354
+ recaptcha_box = gr.HTML('<div id="recaptcha-box"></div>')
355
+ translate_btn = gr.Button("Translate", variant="primary")
356
+ tech_details_tog = gr.Markdown(
357
+ details_wrapper(envs_status),
358
+ elem_classes=["secondary-text"],
359
+ )
360
+ service.select(on_select_service, service, [tech_details_tog, model_id])
361
+
362
+ with gr.Column(scale=2):
363
+ gr.Markdown("## Preview")
364
+ preview = gr.Image(label="Document Preview", visible=True)
365
+
366
+ # Event handlers
367
+ file_input.upload(
368
+ upload_file,
369
+ inputs=[file_input, service],
370
+ outputs=[file_input, preview],
371
+ js=(
372
+ f"""
373
+ (a,b)=>{{
374
+ try{{
375
+ grecaptcha.render('recaptcha-box',{{
376
+ 'sitekey':'{client_key}',
377
+ 'callback':'onVerify'
378
+ }});
379
+ }}catch(error){{}}
380
+ return [a];
381
+ }}
382
+ """
383
+ if flag_demo
384
+ else ""
385
+ ),
386
+ )
387
+
388
+ translate_btn.click(
389
+ translate,
390
+ inputs=[file_input, service, model_id, lang_to, page_range, recaptcha_response],
391
+ outputs=[
392
+ output_file,
393
+ preview,
394
+ output_file_dual,
395
+ output_file,
396
+ output_file_dual,
397
+ output_title,
398
+ ],
399
+ ).then(lambda: None, js="()=>{grecaptcha.reset()}" if flag_demo else "")
400
+
401
+
402
+ def setup_gui(share=False):
403
+ if flag_demo:
404
+ demo.launch(server_name="0.0.0.0", max_file_size="5mb", inbrowser=True)
405
+ else:
406
+ try:
407
+ demo.launch(server_name="0.0.0.0", debug=True, inbrowser=True, share=share)
408
+ except Exception:
409
+ print(
410
+ "Error launching GUI using 0.0.0.0.\nThis may be caused by global mode of proxy software."
411
+ )
412
+ try:
413
+ demo.launch(
414
+ server_name="127.0.0.1", debug=True, inbrowser=True, share=share
415
+ )
416
+ except Exception:
417
+ print(
418
+ "Error launching GUI using 127.0.0.1.\nThis may be caused by global mode of proxy software."
419
+ )
420
+ demo.launch(debug=True, inbrowser=True, share=True)
421
+
422
+
423
+ # For auto-reloading while developing
424
+ if __name__ == "__main__":
425
+ setup_gui()
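# Illustrative usage sketch (reviewer note, not part of this commit): launching
# the GUI from Python rather than through the `pdf2zh -i` console entry point.
# The module path `pdf2zh.gui` is an assumption based on the repository layout.
from pdf2zh.gui import setup_gui

setup_gui(share=False)  # binds 0.0.0.0 and falls back to 127.0.0.1 if a proxy interferes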
pdf2zh/high_level.py ADDED
@@ -0,0 +1,298 @@
1
+ """Functions that can be used for the most common use-cases for pdf2zh.six"""
2
+
3
+ import logging
4
+ import sys
5
+ from io import StringIO
6
+ from typing import Any, BinaryIO, Container, Iterator, Optional, cast
7
+ import numpy as np
8
+ import tqdm
9
+ from pymupdf import Document
10
+
11
+ from pdf2zh.converter import (
12
+ HOCRConverter,
13
+ HTMLConverter,
14
+ PDFPageAggregator,
15
+ TextConverter,
16
+ XMLConverter,
17
+ )
18
+ from pdf2zh.image import ImageWriter
19
+ from pdf2zh.layout import LAParams, LTPage
20
+ from pdf2zh.pdfdevice import PDFDevice, TagExtractor
21
+ from pdf2zh.pdfexceptions import PDFValueError
22
+ from pdf2zh.pdfinterp import PDFPageInterpreter, PDFResourceManager
23
+ from pdf2zh.pdfpage import PDFPage
24
+ from pdf2zh.utils import AnyIO, FileOrName, open_filename, get_device
25
+
26
+
27
+ def extract_text_to_fp(
28
+ inf: BinaryIO,
29
+ outfp: AnyIO,
30
+ output_type: str = "text",
31
+ codec: str = "utf-8",
32
+ laparams: Optional[LAParams] = None,
33
+ maxpages: int = 0,
34
+ pages: Optional[Container[int]] = None,
35
+ password: str = "",
36
+ scale: float = 1.0,
37
+ rotation: int = 0,
38
+ layoutmode: str = "normal",
39
+ output_dir: Optional[str] = None,
40
+ strip_control: bool = False,
41
+ debug: bool = False,
42
+ disable_caching: bool = False,
43
+ page_count: int = 0,
44
+ vfont: str = "",
45
+ vchar: str = "",
46
+ thread: int = 0,
47
+ doc_en: Document = None,
48
+ model=None,
49
+ lang_in: str = "",
50
+ lang_out: str = "",
51
+ service: str = "",
52
+ callback: object = None,
53
+ **kwargs: Any,
54
+ ) -> dict:
55
+ """Parses text from inf-file and writes to outfp file-like object.
56
+
57
+ Takes loads of optional arguments but the defaults are somewhat sane.
58
+ Beware laparams: Including an empty LAParams is not the same as passing
59
+ None!
60
+
61
+ :param inf: a file-like object to read PDF structure from, such as a
62
+ file handler (using the builtin `open()` function) or a `BytesIO`.
63
+ :param outfp: a file-like object to write the text to.
64
+ :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
65
+ Only 'text' works properly.
66
+ :param codec: Text decoding codec
67
+ :param laparams: An LAParams object from pdf2zh.layout. Default is None
68
+ but may not layout correctly.
69
+ :param maxpages: How many pages to stop parsing after
70
+ :param page_numbers: zero-indexed page numbers to operate on.
71
+ :param password: For encrypted PDFs, the password to decrypt.
72
+ :param scale: Scale factor
73
+ :param rotation: Rotation factor
74
+ :param layoutmode: Default is 'normal', see
75
+ pdf2zh.converter.HTMLConverter
76
+ :param output_dir: If given, creates an ImageWriter for extracted images.
77
+ :param strip_control: Does what it says on the tin
78
+ :param debug: Output more logging data
79
+ :param disable_caching: Does what it says on the tin
80
+ :param other:
81
+ :return: a dictionary of patched PDF objects (`obj_patch`) collected while
83
+ processing the pages; the converted output is written to `outfp`.
83
+ """
84
+ if debug:
85
+ logging.getLogger().setLevel(logging.DEBUG)
86
+
87
+ imagewriter = None
88
+ if output_dir:
89
+ imagewriter = ImageWriter(output_dir)
90
+
91
+ rsrcmgr = PDFResourceManager(caching=not disable_caching)
92
+ device: Optional[PDFDevice] = None
93
+ layout = {}
94
+
95
+ if output_type != "text" and outfp == sys.stdout:
96
+ outfp = sys.stdout.buffer
97
+
98
+ if output_type == "text":
99
+ device = TextConverter(
100
+ rsrcmgr,
101
+ outfp,
102
+ codec=codec,
103
+ laparams=laparams,
104
+ imagewriter=imagewriter,
105
+ vfont=vfont,
106
+ vchar=vchar,
107
+ thread=thread,
108
+ layout=layout,
109
+ lang_in=lang_in,
110
+ lang_out=lang_out,
111
+ service=service,
112
+ )
113
+
114
+ elif output_type == "xml":
115
+ device = XMLConverter(
116
+ rsrcmgr,
117
+ outfp,
118
+ codec=codec,
119
+ laparams=laparams,
120
+ imagewriter=imagewriter,
121
+ stripcontrol=strip_control,
122
+ )
123
+
124
+ elif output_type == "html":
125
+ device = HTMLConverter(
126
+ rsrcmgr,
127
+ outfp,
128
+ codec=codec,
129
+ scale=scale,
130
+ layoutmode=layoutmode,
131
+ laparams=laparams,
132
+ imagewriter=imagewriter,
133
+ )
134
+
135
+ elif output_type == "hocr":
136
+ device = HOCRConverter(
137
+ rsrcmgr,
138
+ outfp,
139
+ codec=codec,
140
+ laparams=laparams,
141
+ stripcontrol=strip_control,
142
+ )
143
+
144
+ elif output_type == "tag":
145
+ # Binary I/O is required, but we have no good way to test it here.
146
+ device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)
147
+
148
+ else:
149
+ msg = f"Output type can be text, html, xml or tag but is {output_type}"
150
+ raise PDFValueError(msg)
151
+
152
+ assert device is not None
153
+ obj_patch = {}
154
+ interpreter = PDFPageInterpreter(rsrcmgr, device, obj_patch)
155
+ if pages:
156
+ total_pages = len(pages)
157
+ else:
158
+ total_pages = page_count
159
+ with tqdm.tqdm(
160
+ PDFPage.get_pages(
161
+ inf,
162
+ pages,
163
+ maxpages=maxpages,
164
+ password=password,
165
+ caching=not disable_caching,
166
+ ),
167
+ total=total_pages,
168
+ position=0,
169
+ ) as progress:
170
+ for page in progress:
171
+ if callback:
172
+ callback(progress)
173
+ pix = doc_en[page.pageno].get_pixmap()
174
+ image = np.frombuffer(pix.samples, np.uint8).reshape(
175
+ pix.height, pix.width, 3
176
+ )[:, :, ::-1]
177
+ page_layout = model.predict(
178
+ image, imgsz=int(pix.height / 32) * 32, device=get_device()
179
+ )[0]
180
+ # Skip building a k-d tree; instead rasterize the layout classes into an image mask, trading memory for speed
181
+ box = np.ones((pix.height, pix.width))
182
+ h, w = box.shape
183
+ vcls = ["abandon", "figure", "table", "isolate_formula", "formula_caption"]
184
+ for i, d in enumerate(page_layout.boxes):
185
+ if not page_layout.names[int(d.cls)] in vcls:
186
+ x0, y0, x1, y1 = d.xyxy.squeeze()
187
+ x0, y0, x1, y1 = (
188
+ np.clip(int(x0 - 1), 0, w - 1),
189
+ np.clip(int(h - y1 - 1), 0, h - 1),
190
+ np.clip(int(x1 + 1), 0, w - 1),
191
+ np.clip(int(h - y0 + 1), 0, h - 1),
192
+ )
193
+ box[y0:y1, x0:x1] = i + 2
194
+ for i, d in enumerate(page_layout.boxes):
195
+ if page_layout.names[int(d.cls)] in vcls:
196
+ x0, y0, x1, y1 = d.xyxy.squeeze()
197
+ x0, y0, x1, y1 = (
198
+ np.clip(int(x0 - 1), 0, w - 1),
199
+ np.clip(int(h - y1 - 1), 0, h - 1),
200
+ np.clip(int(x1 + 1), 0, w - 1),
201
+ np.clip(int(h - y0 + 1), 0, h - 1),
202
+ )
203
+ box[y0:y1, x0:x1] = 0
204
+ layout[page.pageno] = box
205
+ # print(page.number,page_layout)
206
+ page.rotate = (page.rotate + rotation) % 360
207
+ # Allocate a new xref to hold the replacement content stream
208
+ page.page_xref = doc_en.get_new_xref() # hack: new xref inserted for this page
209
+ doc_en.update_object(page.page_xref, "<<>>")
210
+ doc_en.update_stream(page.page_xref, b"")
211
+ doc_en[page.pageno].set_contents(page.page_xref)
212
+ interpreter.process_page(page)
213
+
214
+ device.close()
215
+ return obj_patch
216
+
217
+
218
+ def extract_text(
219
+ pdf_file: FileOrName,
220
+ password: str = "",
221
+ page_numbers: Optional[Container[int]] = None,
222
+ maxpages: int = 0,
223
+ caching: bool = True,
224
+ codec: str = "utf-8",
225
+ laparams: Optional[LAParams] = None,
226
+ ) -> str:
227
+ """Parse and return the text contained in a PDF file.
228
+
229
+ :param pdf_file: Either a file path or a file-like object for the PDF file
230
+ to be worked on.
231
+ :param password: For encrypted PDFs, the password to decrypt.
232
+ :param page_numbers: List of zero-indexed page numbers to extract.
233
+ :param maxpages: The maximum number of pages to parse
234
+ :param caching: If resources should be cached
235
+ :param codec: Text decoding codec
236
+ :param laparams: An LAParams object from pdf2zh.layout. If None, uses
237
+ some default settings that often work well.
238
+ :return: a string containing all of the text extracted.
239
+ """
240
+ if laparams is None:
241
+ laparams = LAParams()
242
+
243
+ with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
244
+ fp = cast(BinaryIO, fp) # we opened in binary mode
245
+ rsrcmgr = PDFResourceManager(caching=caching)
246
+ device = TextConverter(rsrcmgr, output_string, codec=codec, laparams=laparams)
247
+ interpreter = PDFPageInterpreter(rsrcmgr, device)
248
+
249
+ for page in PDFPage.get_pages(
250
+ fp,
251
+ page_numbers,
252
+ maxpages=maxpages,
253
+ password=password,
254
+ caching=caching,
255
+ ):
256
+ interpreter.process_page(page)
257
+
258
+ return output_string.getvalue()
259
+
260
+
261
+ def extract_pages(
262
+ pdf_file: FileOrName,
263
+ password: str = "",
264
+ page_numbers: Optional[Container[int]] = None,
265
+ maxpages: int = 0,
266
+ caching: bool = True,
267
+ laparams: Optional[LAParams] = None,
268
+ ) -> Iterator[LTPage]:
269
+ """Extract and yield LTPage objects
270
+
271
+ :param pdf_file: Either a file path or a file-like object for the PDF file
272
+ to be worked on.
273
+ :param password: For encrypted PDFs, the password to decrypt.
274
+ :param page_numbers: List of zero-indexed page numbers to extract.
275
+ :param maxpages: The maximum number of pages to parse
276
+ :param caching: If resources should be cached
277
+ :param laparams: An LAParams object from pdf2zh.layout. If None, uses
278
+ some default settings that often work well.
279
+ :return: LTPage objects
280
+ """
281
+ if laparams is None:
282
+ laparams = LAParams()
283
+
284
+ with open_filename(pdf_file, "rb") as fp:
285
+ fp = cast(BinaryIO, fp) # we opened in binary mode
286
+ resource_manager = PDFResourceManager(caching=caching)
287
+ device = PDFPageAggregator(resource_manager, laparams=laparams)
288
+ interpreter = PDFPageInterpreter(resource_manager, device)
289
+ for page in PDFPage.get_pages(
290
+ fp,
291
+ page_numbers,
292
+ maxpages=maxpages,
293
+ password=password,
294
+ caching=caching,
295
+ ):
296
+ interpreter.process_page(page)
297
+ layout = device.get_result()
298
+ yield layout
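# Illustrative usage sketch (not part of this commit): extract_text and
# extract_pages keep the upstream pdfminer.six calling convention.
# "example.pdf" is a placeholder path.
from pdf2zh.high_level import extract_pages, extract_text
from pdf2zh.layout import LTTextContainer

text = extract_text("example.pdf", page_numbers=[0])  # plain text of the first page
print(text[:200])

for page_layout in extract_pages("example.pdf", maxpages=1):
    for element in page_layout:
        if isinstance(element, LTTextContainer):
            print(element.get_text().strip())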
pdf2zh/image.py ADDED
@@ -0,0 +1,297 @@
1
+ import os
2
+ import os.path
3
+ import struct
4
+ from io import BytesIO
5
+ from typing import BinaryIO, Tuple
6
+
7
+ try:
8
+ from typing import Literal
9
+ except ImportError:
10
+ # Literal was introduced in Python 3.8
11
+ from typing_extensions import Literal # type: ignore[assignment]
12
+
13
+ from pdf2zh.jbig2 import JBIG2StreamReader, JBIG2StreamWriter
14
+ from pdf2zh.layout import LTImage
15
+ from pdf2zh.pdfcolor import (
16
+ LITERAL_DEVICE_CMYK,
17
+ LITERAL_DEVICE_GRAY,
18
+ LITERAL_DEVICE_RGB,
19
+ LITERAL_INLINE_DEVICE_GRAY,
20
+ LITERAL_INLINE_DEVICE_RGB,
21
+ )
22
+ from pdf2zh.pdfexceptions import PDFValueError
23
+ from pdf2zh.pdftypes import (
24
+ LITERALS_DCT_DECODE,
25
+ LITERALS_FLATE_DECODE,
26
+ LITERALS_JBIG2_DECODE,
27
+ LITERALS_JPX_DECODE,
28
+ )
29
+
30
+ PIL_ERROR_MESSAGE = (
31
+ "Could not import Pillow. This dependency of pdf2zh.six is not "
32
+ "installed by default. You need it to to save jpg images to a file. Install it "
33
+ "with `pip install 'pdf2zh.six[image]'`"
34
+ )
35
+
36
+
37
+ def align32(x: int) -> int:
38
+ return ((x + 3) // 4) * 4
39
+
40
+
41
+ class BMPWriter:
42
+ def __init__(self, fp: BinaryIO, bits: int, width: int, height: int) -> None:
43
+ self.fp = fp
44
+ self.bits = bits
45
+ self.width = width
46
+ self.height = height
47
+ if bits == 1:
48
+ ncols = 2
49
+ elif bits == 8:
50
+ ncols = 256
51
+ elif bits == 24:
52
+ ncols = 0
53
+ else:
54
+ raise PDFValueError(bits)
55
+ self.linesize = align32((self.width * self.bits + 7) // 8)
56
+ self.datasize = self.linesize * self.height
57
+ headersize = 14 + 40 + ncols * 4
58
+ info = struct.pack(
59
+ "<IiiHHIIIIII",
60
+ 40,
61
+ self.width,
62
+ self.height,
63
+ 1,
64
+ self.bits,
65
+ 0,
66
+ self.datasize,
67
+ 0,
68
+ 0,
69
+ ncols,
70
+ 0,
71
+ )
72
+ assert len(info) == 40, str(len(info))
73
+ header = struct.pack(
74
+ "<ccIHHI",
75
+ b"B",
76
+ b"M",
77
+ headersize + self.datasize,
78
+ 0,
79
+ 0,
80
+ headersize,
81
+ )
82
+ assert len(header) == 14, str(len(header))
83
+ self.fp.write(header)
84
+ self.fp.write(info)
85
+ if ncols == 2:
86
+ # B&W color table
87
+ for i in (0, 255):
88
+ self.fp.write(struct.pack("BBBx", i, i, i))
89
+ elif ncols == 256:
90
+ # grayscale color table
91
+ for i in range(256):
92
+ self.fp.write(struct.pack("BBBx", i, i, i))
93
+ self.pos0 = self.fp.tell()
94
+ self.pos1 = self.pos0 + self.datasize
95
+
96
+ def write_line(self, y: int, data: bytes) -> None:
97
+ self.fp.seek(self.pos1 - (y + 1) * self.linesize)
98
+ self.fp.write(data)
99
+
100
+
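# Illustrative sketch (not part of this commit): BMPWriter (defined above) emits
# a bottom-up BMP, so write_line(y, ...) addresses row y counted from the top of
# the image. The 4x2 grayscale values are arbitrary example data.
from io import BytesIO

buf = BytesIO()
bmp = BMPWriter(buf, bits=8, width=4, height=2)  # 8-bit grayscale, 4x2 pixels
bmp.write_line(0, bytes([0, 64, 128, 255]))      # top row
bmp.write_line(1, bytes([255, 128, 64, 0]))      # bottom row
assert len(buf.getvalue()) == 14 + 40 + 256 * 4 + 2 * bmp.linesize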
101
+ class ImageWriter:
102
+ """Write image to a file
103
+
104
+ Supports various image types: JPEG, JBIG2 and bitmaps
105
+ """
106
+
107
+ def __init__(self, outdir: str) -> None:
108
+ self.outdir = outdir
109
+ if not os.path.exists(self.outdir):
110
+ os.makedirs(self.outdir)
111
+
112
+ def export_image(self, image: LTImage) -> str:
113
+ """Save an LTImage to disk"""
114
+ (width, height) = image.srcsize
115
+
116
+ filters = image.stream.get_filters()
117
+
118
+ if filters[-1][0] in LITERALS_DCT_DECODE:
119
+ name = self._save_jpeg(image)
120
+
121
+ elif filters[-1][0] in LITERALS_JPX_DECODE:
122
+ name = self._save_jpeg2000(image)
123
+
124
+ elif self._is_jbig2_image(image):
125
+ name = self._save_jbig2(image)
126
+
127
+ elif image.bits == 1:
128
+ name = self._save_bmp(image, width, height, (width + 7) // 8, image.bits)
129
+
130
+ elif image.bits == 8 and (
131
+ LITERAL_DEVICE_RGB in image.colorspace
132
+ or LITERAL_INLINE_DEVICE_RGB in image.colorspace
133
+ ):
134
+ name = self._save_bmp(image, width, height, width * 3, image.bits * 3)
135
+
136
+ elif image.bits == 8 and (
137
+ LITERAL_DEVICE_GRAY in image.colorspace
138
+ or LITERAL_INLINE_DEVICE_GRAY in image.colorspace
139
+ ):
140
+ name = self._save_bmp(image, width, height, width, image.bits)
141
+
142
+ elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
143
+ name = self._save_bytes(image)
144
+
145
+ else:
146
+ name = self._save_raw(image)
147
+
148
+ return name
149
+
150
+ def _save_jpeg(self, image: LTImage) -> str:
151
+ """Save a JPEG encoded image"""
152
+ data = image.stream.get_data()
153
+
154
+ name, path = self._create_unique_image_name(image, ".jpg")
155
+ with open(path, "wb") as fp:
156
+ if LITERAL_DEVICE_CMYK in image.colorspace:
157
+ try:
158
+ from PIL import Image, ImageChops # type: ignore[import]
159
+ except ImportError:
160
+ raise ImportError(PIL_ERROR_MESSAGE)
161
+
162
+ ifp = BytesIO(data)
163
+ i = Image.open(ifp)
164
+ i = ImageChops.invert(i)
165
+ i = i.convert("RGB")
166
+ i.save(fp, "JPEG")
167
+ else:
168
+ fp.write(data)
169
+
170
+ return name
171
+
172
+ def _save_jpeg2000(self, image: LTImage) -> str:
173
+ """Save a JPEG 2000 encoded image"""
174
+ data = image.stream.get_data()
175
+
176
+ name, path = self._create_unique_image_name(image, ".jp2")
177
+ with open(path, "wb") as fp:
178
+ try:
179
+ from PIL import Image # type: ignore[import]
180
+ except ImportError:
181
+ raise ImportError(PIL_ERROR_MESSAGE)
182
+
183
+ # if we just write the raw data, most image programs
184
+ # that I have tried cannot open the file. However,
185
+ # opening and saving with PIL produces a file that
186
+ # seems to be easily opened by other programs
187
+ ifp = BytesIO(data)
188
+ i = Image.open(ifp)
189
+ i.save(fp, "JPEG2000")
190
+ return name
191
+
192
+ def _save_jbig2(self, image: LTImage) -> str:
193
+ """Save a JBIG2 encoded image"""
194
+ name, path = self._create_unique_image_name(image, ".jb2")
195
+ with open(path, "wb") as fp:
196
+ input_stream = BytesIO()
197
+
198
+ global_streams = []
199
+ filters = image.stream.get_filters()
200
+ for filter_name, params in filters:
201
+ if filter_name in LITERALS_JBIG2_DECODE:
202
+ global_streams.append(params["JBIG2Globals"].resolve())
203
+
204
+ if len(global_streams) > 1:
205
+ msg = (
206
+ "There should never be more than one JBIG2Globals "
207
+ "associated with a JBIG2 embedded image"
208
+ )
209
+ raise PDFValueError(msg)
210
+ if len(global_streams) == 1:
211
+ input_stream.write(global_streams[0].get_data().rstrip(b"\n"))
212
+ input_stream.write(image.stream.get_data())
213
+ input_stream.seek(0)
214
+ reader = JBIG2StreamReader(input_stream)
215
+ segments = reader.get_segments()
216
+
217
+ writer = JBIG2StreamWriter(fp)
218
+ writer.write_file(segments)
219
+ return name
220
+
221
+ def _save_bmp(
222
+ self,
223
+ image: LTImage,
224
+ width: int,
225
+ height: int,
226
+ bytes_per_line: int,
227
+ bits: int,
228
+ ) -> str:
229
+ """Save a BMP encoded image"""
230
+ name, path = self._create_unique_image_name(image, ".bmp")
231
+ with open(path, "wb") as fp:
232
+ bmp = BMPWriter(fp, bits, width, height)
233
+ data = image.stream.get_data()
234
+ i = 0
235
+ for y in range(height):
236
+ bmp.write_line(y, data[i : i + bytes_per_line])
237
+ i += bytes_per_line
238
+ return name
239
+
240
+ def _save_bytes(self, image: LTImage) -> str:
241
+ """Save an image without encoding, just bytes"""
242
+ name, path = self._create_unique_image_name(image, ".jpg")
243
+ width, height = image.srcsize
244
+ channels = len(image.stream.get_data()) / width / height / (image.bits / 8)
245
+ with open(path, "wb") as fp:
246
+ try:
247
+ from PIL import (
248
+ Image, # type: ignore[import]
249
+ ImageOps,
250
+ )
251
+ except ImportError:
252
+ raise ImportError(PIL_ERROR_MESSAGE)
253
+
254
+ mode: Literal["1", "L", "RGB", "CMYK"]
255
+ if image.bits == 1:
256
+ mode = "1"
257
+ elif image.bits == 8 and channels == 1:
258
+ mode = "L"
259
+ elif image.bits == 8 and channels == 3:
260
+ mode = "RGB"
261
+ elif image.bits == 8 and channels == 4:
262
+ mode = "CMYK"
263
+
264
+ img = Image.frombytes(mode, image.srcsize, image.stream.get_data(), "raw")
265
+ if mode == "L":
266
+ img = ImageOps.invert(img)
267
+
268
+ img.save(fp)
269
+
270
+ return name
271
+
272
+ def _save_raw(self, image: LTImage) -> str:
273
+ """Save an image with unknown encoding"""
274
+ ext = ".%d.%dx%d.img" % (image.bits, image.srcsize[0], image.srcsize[1])
275
+ name, path = self._create_unique_image_name(image, ext)
276
+
277
+ with open(path, "wb") as fp:
278
+ fp.write(image.stream.get_data())
279
+ return name
280
+
281
+ @staticmethod
282
+ def _is_jbig2_image(image: LTImage) -> bool:
283
+ filters = image.stream.get_filters()
284
+ for filter_name, params in filters:
285
+ if filter_name in LITERALS_JBIG2_DECODE:
286
+ return True
287
+ return False
288
+
289
+ def _create_unique_image_name(self, image: LTImage, ext: str) -> Tuple[str, str]:
290
+ name = image.name + ext
291
+ path = os.path.join(self.outdir, name)
292
+ img_index = 0
293
+ while os.path.exists(path):
294
+ name = "%s.%d%s" % (image.name, img_index, ext)
295
+ path = os.path.join(self.outdir, name)
296
+ img_index += 1
297
+ return name, path
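# Illustrative usage sketch (not part of this commit): exporting embedded images
# with ImageWriter while walking the layout tree produced by extract_pages.
# "example.pdf" and "out_images" are placeholder names.
from pdf2zh.high_level import extract_pages
from pdf2zh.image import ImageWriter
from pdf2zh.layout import LTContainer, LTImage

writer = ImageWriter("out_images")

def export_images(obj):
    # Recurse into containers (figures, text boxes) and save every LTImage found.
    if isinstance(obj, LTImage):
        print("saved", writer.export_image(obj))
    elif isinstance(obj, LTContainer):
        for child in obj:
            export_images(child)

for page_layout in extract_pages("example.pdf"):
    export_images(page_layout)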
pdf2zh/jbig2.py ADDED
@@ -0,0 +1,373 @@
1
+ import math
2
+ import os
3
+ from struct import calcsize, pack, unpack
4
+ from typing import BinaryIO, Dict, Iterable, List, Optional, Tuple, Union, cast
5
+
6
+ from pdf2zh.pdfexceptions import PDFValueError
7
+
8
+ # segment structure base
9
+ SEG_STRUCT = [
10
+ (">L", "number"),
11
+ (">B", "flags"),
12
+ (">B", "retention_flags"),
13
+ (">B", "page_assoc"),
14
+ (">L", "data_length"),
15
+ ]
16
+
17
+ # segment header literals
18
+ HEADER_FLAG_DEFERRED = 0b10000000
19
+ HEADER_FLAG_PAGE_ASSOC_LONG = 0b01000000
20
+
21
+ SEG_TYPE_MASK = 0b00111111
22
+
23
+ REF_COUNT_SHORT_MASK = 0b11100000
24
+ REF_COUNT_LONG_MASK = 0x1FFFFFFF
25
+ REF_COUNT_LONG = 7
26
+
27
+ DATA_LEN_UNKNOWN = 0xFFFFFFFF
28
+
29
+ # segment types
30
+ SEG_TYPE_IMMEDIATE_GEN_REGION = 38
31
+ SEG_TYPE_END_OF_PAGE = 49
32
+ SEG_TYPE_END_OF_FILE = 51
33
+
34
+ # file literals
35
+ FILE_HEADER_ID = b"\x97\x4a\x42\x32\x0d\x0a\x1a\x0a"
36
+ FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001
37
+
38
+
39
+ def bit_set(bit_pos: int, value: int) -> bool:
40
+ return bool((value >> bit_pos) & 1)
41
+
42
+
43
+ def check_flag(flag: int, value: int) -> bool:
44
+ return bool(flag & value)
45
+
46
+
47
+ def masked_value(mask: int, value: int) -> int:
48
+ for bit_pos in range(31):
49
+ if bit_set(bit_pos, mask):
50
+ return (value & mask) >> bit_pos
51
+
52
+ raise PDFValueError("Invalid mask or value")
53
+
54
+
55
+ def mask_value(mask: int, value: int) -> int:
56
+ for bit_pos in range(31):
57
+ if bit_set(bit_pos, mask):
58
+ return (value & (mask >> bit_pos)) << bit_pos
59
+
60
+ raise PDFValueError("Invalid mask or value")
61
+
62
+
63
+ def unpack_int(format: str, buffer: bytes) -> int:
64
+ assert format in {">B", ">I", ">L"}
65
+ [result] = cast(Tuple[int], unpack(format, buffer))
66
+ return result
67
+
68
+
69
+ JBIG2SegmentFlags = Dict[str, Union[int, bool]]
70
+ JBIG2RetentionFlags = Dict[str, Union[int, List[int], List[bool]]]
71
+ JBIG2Segment = Dict[
72
+ str,
73
+ Union[bool, int, bytes, JBIG2SegmentFlags, JBIG2RetentionFlags],
74
+ ]
75
+
76
+
77
+ class JBIG2StreamReader:
78
+ """Read segments from a JBIG2 byte stream"""
79
+
80
+ def __init__(self, stream: BinaryIO) -> None:
81
+ self.stream = stream
82
+
83
+ def get_segments(self) -> List[JBIG2Segment]:
84
+ segments: List[JBIG2Segment] = []
85
+ while not self.is_eof():
86
+ segment: JBIG2Segment = {}
87
+ for field_format, name in SEG_STRUCT:
88
+ field_len = calcsize(field_format)
89
+ field = self.stream.read(field_len)
90
+ if len(field) < field_len:
91
+ segment["_error"] = True
92
+ break
93
+ value = unpack_int(field_format, field)
94
+ parser = getattr(self, "parse_%s" % name, None)
95
+ if callable(parser):
96
+ value = parser(segment, value, field)
97
+ segment[name] = value
98
+
99
+ if not segment.get("_error"):
100
+ segments.append(segment)
101
+ return segments
102
+
103
+ def is_eof(self) -> bool:
104
+ if self.stream.read(1) == b"":
105
+ return True
106
+ else:
107
+ self.stream.seek(-1, os.SEEK_CUR)
108
+ return False
109
+
110
+ def parse_flags(
111
+ self,
112
+ segment: JBIG2Segment,
113
+ flags: int,
114
+ field: bytes,
115
+ ) -> JBIG2SegmentFlags:
116
+ return {
117
+ "deferred": check_flag(HEADER_FLAG_DEFERRED, flags),
118
+ "page_assoc_long": check_flag(HEADER_FLAG_PAGE_ASSOC_LONG, flags),
119
+ "type": masked_value(SEG_TYPE_MASK, flags),
120
+ }
121
+
122
+ def parse_retention_flags(
123
+ self,
124
+ segment: JBIG2Segment,
125
+ flags: int,
126
+ field: bytes,
127
+ ) -> JBIG2RetentionFlags:
128
+ ref_count = masked_value(REF_COUNT_SHORT_MASK, flags)
129
+ retain_segments = []
130
+ ref_segments = []
131
+
132
+ if ref_count < REF_COUNT_LONG:
133
+ for bit_pos in range(5):
134
+ retain_segments.append(bit_set(bit_pos, flags))
135
+ else:
136
+ field += self.stream.read(3)
137
+ ref_count = unpack_int(">L", field)
138
+ ref_count = masked_value(REF_COUNT_LONG_MASK, ref_count)
139
+ ret_bytes_count = int(math.ceil((ref_count + 1) / 8))
140
+ for ret_byte_index in range(ret_bytes_count):
141
+ ret_byte = unpack_int(">B", self.stream.read(1))
142
+ for bit_pos in range(7):
143
+ retain_segments.append(bit_set(bit_pos, ret_byte))
144
+
145
+ seg_num = segment["number"]
146
+ assert isinstance(seg_num, int)
147
+ if seg_num <= 256:
148
+ ref_format = ">B"
149
+ elif seg_num <= 65536:
150
+ ref_format = ">I"
151
+ else:
152
+ ref_format = ">L"
153
+
154
+ ref_size = calcsize(ref_format)
155
+
156
+ for ref_index in range(ref_count):
157
+ ref_data = self.stream.read(ref_size)
158
+ ref = unpack_int(ref_format, ref_data)
159
+ ref_segments.append(ref)
160
+
161
+ return {
162
+ "ref_count": ref_count,
163
+ "retain_segments": retain_segments,
164
+ "ref_segments": ref_segments,
165
+ }
166
+
167
+ def parse_page_assoc(self, segment: JBIG2Segment, page: int, field: bytes) -> int:
168
+ if cast(JBIG2SegmentFlags, segment["flags"])["page_assoc_long"]:
169
+ field += self.stream.read(3)
170
+ page = unpack_int(">L", field)
171
+ return page
172
+
173
+ def parse_data_length(
174
+ self,
175
+ segment: JBIG2Segment,
176
+ length: int,
177
+ field: bytes,
178
+ ) -> int:
179
+ if length:
180
+ if (
181
+ cast(JBIG2SegmentFlags, segment["flags"])["type"]
182
+ == SEG_TYPE_IMMEDIATE_GEN_REGION
183
+ ) and (length == DATA_LEN_UNKNOWN):
184
+ raise NotImplementedError(
185
+ "Working with unknown segment length is not implemented yet",
186
+ )
187
+ else:
188
+ segment["raw_data"] = self.stream.read(length)
189
+
190
+ return length
191
+
192
+
193
+ class JBIG2StreamWriter:
194
+ """Write JBIG2 segments to a file in JBIG2 format"""
195
+
196
+ EMPTY_RETENTION_FLAGS: JBIG2RetentionFlags = {
197
+ "ref_count": 0,
198
+ "ref_segments": cast(List[int], []),
199
+ "retain_segments": cast(List[bool], []),
200
+ }
201
+
202
+ def __init__(self, stream: BinaryIO) -> None:
203
+ self.stream = stream
204
+
205
+ def write_segments(
206
+ self,
207
+ segments: Iterable[JBIG2Segment],
208
+ fix_last_page: bool = True,
209
+ ) -> int:
210
+ data_len = 0
211
+ current_page: Optional[int] = None
212
+ seg_num: Optional[int] = None
213
+
214
+ for segment in segments:
215
+ data = self.encode_segment(segment)
216
+ self.stream.write(data)
217
+ data_len += len(data)
218
+
219
+ seg_num = cast(Optional[int], segment["number"])
220
+
221
+ if fix_last_page:
222
+ seg_page = cast(int, segment.get("page_assoc"))
223
+
224
+ if (
225
+ cast(JBIG2SegmentFlags, segment["flags"])["type"]
226
+ == SEG_TYPE_END_OF_PAGE
227
+ ):
228
+ current_page = None
229
+ elif seg_page:
230
+ current_page = seg_page
231
+
232
+ if fix_last_page and current_page and (seg_num is not None):
233
+ segment = self.get_eop_segment(seg_num + 1, current_page)
234
+ data = self.encode_segment(segment)
235
+ self.stream.write(data)
236
+ data_len += len(data)
237
+
238
+ return data_len
239
+
240
+ def write_file(
241
+ self,
242
+ segments: Iterable[JBIG2Segment],
243
+ fix_last_page: bool = True,
244
+ ) -> int:
245
+ header = FILE_HEADER_ID
246
+ header_flags = FILE_HEAD_FLAG_SEQUENTIAL
247
+ header += pack(">B", header_flags)
248
+ # The embedded JBIG2 files in a PDF always
249
+ # only have one page
250
+ number_of_pages = pack(">L", 1)
251
+ header += number_of_pages
252
+ self.stream.write(header)
253
+ data_len = len(header)
254
+
255
+ data_len += self.write_segments(segments, fix_last_page)
256
+
257
+ seg_num = 0
258
+ for segment in segments:
259
+ seg_num = cast(int, segment["number"])
260
+
261
+ if fix_last_page:
262
+ seg_num_offset = 2
263
+ else:
264
+ seg_num_offset = 1
265
+ eof_segment = self.get_eof_segment(seg_num + seg_num_offset)
266
+ data = self.encode_segment(eof_segment)
267
+
268
+ self.stream.write(data)
269
+ data_len += len(data)
270
+
271
+ return data_len
272
+
273
+ def encode_segment(self, segment: JBIG2Segment) -> bytes:
274
+ data = b""
275
+ for field_format, name in SEG_STRUCT:
276
+ value = segment.get(name)
277
+ encoder = getattr(self, "encode_%s" % name, None)
278
+ if callable(encoder):
279
+ field = encoder(value, segment)
280
+ else:
281
+ field = pack(field_format, value)
282
+ data += field
283
+ return data
284
+
285
+ def encode_flags(self, value: JBIG2SegmentFlags, segment: JBIG2Segment) -> bytes:
286
+ flags = 0
287
+ if value.get("deferred"):
288
+ flags |= HEADER_FLAG_DEFERRED
289
+
290
+ if "page_assoc_long" in value:
291
+ flags |= HEADER_FLAG_PAGE_ASSOC_LONG if value["page_assoc_long"] else flags
292
+ else:
293
+ flags |= (
294
+ HEADER_FLAG_PAGE_ASSOC_LONG
295
+ if cast(int, segment.get("page", 0)) > 255
296
+ else flags
297
+ )
298
+
299
+ flags |= mask_value(SEG_TYPE_MASK, value["type"])
300
+
301
+ return pack(">B", flags)
302
+
303
+ def encode_retention_flags(
304
+ self,
305
+ value: JBIG2RetentionFlags,
306
+ segment: JBIG2Segment,
307
+ ) -> bytes:
308
+ flags = []
309
+ flags_format = ">B"
310
+ ref_count = value["ref_count"]
311
+ assert isinstance(ref_count, int)
312
+ retain_segments = cast(List[bool], value.get("retain_segments", []))
313
+
314
+ if ref_count <= 4:
315
+ flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count)
316
+ for ref_index, ref_retain in enumerate(retain_segments):
317
+ if ref_retain:
318
+ flags_byte |= 1 << ref_index
319
+ flags.append(flags_byte)
320
+ else:
321
+ bytes_count = math.ceil((ref_count + 1) / 8)
322
+ flags_format = ">L" + ("B" * bytes_count)
323
+ flags_dword = mask_value(REF_COUNT_SHORT_MASK, REF_COUNT_LONG) << 24
324
+ flags.append(flags_dword)
325
+
326
+ for byte_index in range(bytes_count):
327
+ ret_byte = 0
328
+ ret_part = retain_segments[byte_index * 8 : byte_index * 8 + 8]
329
+ for bit_pos, ret_seg in enumerate(ret_part):
330
+ ret_byte |= 1 << bit_pos if ret_seg else ret_byte
331
+
332
+ flags.append(ret_byte)
333
+
334
+ ref_segments = cast(List[int], value.get("ref_segments", []))
335
+
336
+ seg_num = cast(int, segment["number"])
337
+ if seg_num <= 256:
338
+ ref_format = "B"
339
+ elif seg_num <= 65536:
340
+ ref_format = "I"
341
+ else:
342
+ ref_format = "L"
343
+
344
+ for ref in ref_segments:
345
+ flags_format += ref_format
346
+ flags.append(ref)
347
+
348
+ return pack(flags_format, *flags)
349
+
350
+ def encode_data_length(self, value: int, segment: JBIG2Segment) -> bytes:
351
+ data = pack(">L", value)
352
+ data += cast(bytes, segment["raw_data"])
353
+ return data
354
+
355
+ def get_eop_segment(self, seg_number: int, page_number: int) -> JBIG2Segment:
356
+ return {
357
+ "data_length": 0,
358
+ "flags": {"deferred": False, "type": SEG_TYPE_END_OF_PAGE},
359
+ "number": seg_number,
360
+ "page_assoc": page_number,
361
+ "raw_data": b"",
362
+ "retention_flags": JBIG2StreamWriter.EMPTY_RETENTION_FLAGS,
363
+ }
364
+
365
+ def get_eof_segment(self, seg_number: int) -> JBIG2Segment:
366
+ return {
367
+ "data_length": 0,
368
+ "flags": {"deferred": False, "type": SEG_TYPE_END_OF_FILE},
369
+ "number": seg_number,
370
+ "page_assoc": 0,
371
+ "raw_data": b"",
372
+ "retention_flags": JBIG2StreamWriter.EMPTY_RETENTION_FLAGS,
373
+ }
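# Illustrative sketch (not part of this commit): the masked_value / mask_value
# helpers above shift a bit field out of, and back into, the position selected
# by a mask.
from pdf2zh.jbig2 import REF_COUNT_SHORT_MASK, SEG_TYPE_MASK, mask_value, masked_value

flags = 0b10100110
assert masked_value(SEG_TYPE_MASK, flags) == 0b100110      # low six bits: segment type
assert mask_value(REF_COUNT_SHORT_MASK, 7) == 0b11100000   # 7 placed into bits 5-7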
pdf2zh/latin_enc.py ADDED
@@ -0,0 +1,246 @@
1
+ """Standard encoding tables used in PDF.
2
+
3
+ This table is extracted from PDF Reference Manual 1.6, pp.925
4
+ "D.1 Latin Character Set and Encodings"
5
+
6
+ """
7
+
8
+ from typing import List, Optional, Tuple
9
+
10
+ EncodingRow = Tuple[str, Optional[int], Optional[int], Optional[int], Optional[int]]
11
+
12
+ ENCODING: List[EncodingRow] = [
13
+ # (name, std, mac, win, pdf)
14
+ ("A", 65, 65, 65, 65),
15
+ ("AE", 225, 174, 198, 198),
16
+ ("Aacute", None, 231, 193, 193),
17
+ ("Acircumflex", None, 229, 194, 194),
18
+ ("Adieresis", None, 128, 196, 196),
19
+ ("Agrave", None, 203, 192, 192),
20
+ ("Aring", None, 129, 197, 197),
21
+ ("Atilde", None, 204, 195, 195),
22
+ ("B", 66, 66, 66, 66),
23
+ ("C", 67, 67, 67, 67),
24
+ ("Ccedilla", None, 130, 199, 199),
25
+ ("D", 68, 68, 68, 68),
26
+ ("E", 69, 69, 69, 69),
27
+ ("Eacute", None, 131, 201, 201),
28
+ ("Ecircumflex", None, 230, 202, 202),
29
+ ("Edieresis", None, 232, 203, 203),
30
+ ("Egrave", None, 233, 200, 200),
31
+ ("Eth", None, None, 208, 208),
32
+ ("Euro", None, None, 128, 160),
33
+ ("F", 70, 70, 70, 70),
34
+ ("G", 71, 71, 71, 71),
35
+ ("H", 72, 72, 72, 72),
36
+ ("I", 73, 73, 73, 73),
37
+ ("Iacute", None, 234, 205, 205),
38
+ ("Icircumflex", None, 235, 206, 206),
39
+ ("Idieresis", None, 236, 207, 207),
40
+ ("Igrave", None, 237, 204, 204),
41
+ ("J", 74, 74, 74, 74),
42
+ ("K", 75, 75, 75, 75),
43
+ ("L", 76, 76, 76, 76),
44
+ ("Lslash", 232, None, None, 149),
45
+ ("M", 77, 77, 77, 77),
46
+ ("N", 78, 78, 78, 78),
47
+ ("Ntilde", None, 132, 209, 209),
48
+ ("O", 79, 79, 79, 79),
49
+ ("OE", 234, 206, 140, 150),
50
+ ("Oacute", None, 238, 211, 211),
51
+ ("Ocircumflex", None, 239, 212, 212),
52
+ ("Odieresis", None, 133, 214, 214),
53
+ ("Ograve", None, 241, 210, 210),
54
+ ("Oslash", 233, 175, 216, 216),
55
+ ("Otilde", None, 205, 213, 213),
56
+ ("P", 80, 80, 80, 80),
57
+ ("Q", 81, 81, 81, 81),
58
+ ("R", 82, 82, 82, 82),
59
+ ("S", 83, 83, 83, 83),
60
+ ("Scaron", None, None, 138, 151),
61
+ ("T", 84, 84, 84, 84),
62
+ ("Thorn", None, None, 222, 222),
63
+ ("U", 85, 85, 85, 85),
64
+ ("Uacute", None, 242, 218, 218),
65
+ ("Ucircumflex", None, 243, 219, 219),
66
+ ("Udieresis", None, 134, 220, 220),
67
+ ("Ugrave", None, 244, 217, 217),
68
+ ("V", 86, 86, 86, 86),
69
+ ("W", 87, 87, 87, 87),
70
+ ("X", 88, 88, 88, 88),
71
+ ("Y", 89, 89, 89, 89),
72
+ ("Yacute", None, None, 221, 221),
73
+ ("Ydieresis", None, 217, 159, 152),
74
+ ("Z", 90, 90, 90, 90),
75
+ ("Zcaron", None, None, 142, 153),
76
+ ("a", 97, 97, 97, 97),
77
+ ("aacute", None, 135, 225, 225),
78
+ ("acircumflex", None, 137, 226, 226),
79
+ ("acute", 194, 171, 180, 180),
80
+ ("adieresis", None, 138, 228, 228),
81
+ ("ae", 241, 190, 230, 230),
82
+ ("agrave", None, 136, 224, 224),
83
+ ("ampersand", 38, 38, 38, 38),
84
+ ("aring", None, 140, 229, 229),
85
+ ("asciicircum", 94, 94, 94, 94),
86
+ ("asciitilde", 126, 126, 126, 126),
87
+ ("asterisk", 42, 42, 42, 42),
88
+ ("at", 64, 64, 64, 64),
89
+ ("atilde", None, 139, 227, 227),
90
+ ("b", 98, 98, 98, 98),
91
+ ("backslash", 92, 92, 92, 92),
92
+ ("bar", 124, 124, 124, 124),
93
+ ("braceleft", 123, 123, 123, 123),
94
+ ("braceright", 125, 125, 125, 125),
95
+ ("bracketleft", 91, 91, 91, 91),
96
+ ("bracketright", 93, 93, 93, 93),
97
+ ("breve", 198, 249, None, 24),
98
+ ("brokenbar", None, None, 166, 166),
99
+ ("bullet", 183, 165, 149, 128),
100
+ ("c", 99, 99, 99, 99),
101
+ ("caron", 207, 255, None, 25),
102
+ ("ccedilla", None, 141, 231, 231),
103
+ ("cedilla", 203, 252, 184, 184),
104
+ ("cent", 162, 162, 162, 162),
105
+ ("circumflex", 195, 246, 136, 26),
106
+ ("colon", 58, 58, 58, 58),
107
+ ("comma", 44, 44, 44, 44),
108
+ ("copyright", None, 169, 169, 169),
109
+ ("currency", 168, 219, 164, 164),
110
+ ("d", 100, 100, 100, 100),
111
+ ("dagger", 178, 160, 134, 129),
112
+ ("daggerdbl", 179, 224, 135, 130),
113
+ ("degree", None, 161, 176, 176),
114
+ ("dieresis", 200, 172, 168, 168),
115
+ ("divide", None, 214, 247, 247),
116
+ ("dollar", 36, 36, 36, 36),
117
+ ("dotaccent", 199, 250, None, 27),
118
+ ("dotlessi", 245, 245, None, 154),
119
+ ("e", 101, 101, 101, 101),
120
+ ("eacute", None, 142, 233, 233),
121
+ ("ecircumflex", None, 144, 234, 234),
122
+ ("edieresis", None, 145, 235, 235),
123
+ ("egrave", None, 143, 232, 232),
124
+ ("eight", 56, 56, 56, 56),
125
+ ("ellipsis", 188, 201, 133, 131),
126
+ ("emdash", 208, 209, 151, 132),
127
+ ("endash", 177, 208, 150, 133),
128
+ ("equal", 61, 61, 61, 61),
129
+ ("eth", None, None, 240, 240),
130
+ ("exclam", 33, 33, 33, 33),
131
+ ("exclamdown", 161, 193, 161, 161),
132
+ ("f", 102, 102, 102, 102),
133
+ ("fi", 174, 222, None, 147),
134
+ ("five", 53, 53, 53, 53),
135
+ ("fl", 175, 223, None, 148),
136
+ ("florin", 166, 196, 131, 134),
137
+ ("four", 52, 52, 52, 52),
138
+ ("fraction", 164, 218, None, 135),
139
+ ("g", 103, 103, 103, 103),
140
+ ("germandbls", 251, 167, 223, 223),
141
+ ("grave", 193, 96, 96, 96),
142
+ ("greater", 62, 62, 62, 62),
143
+ ("guillemotleft", 171, 199, 171, 171),
144
+ ("guillemotright", 187, 200, 187, 187),
145
+ ("guilsinglleft", 172, 220, 139, 136),
146
+ ("guilsinglright", 173, 221, 155, 137),
147
+ ("h", 104, 104, 104, 104),
148
+ ("hungarumlaut", 205, 253, None, 28),
149
+ ("hyphen", 45, 45, 45, 45),
150
+ ("i", 105, 105, 105, 105),
151
+ ("iacute", None, 146, 237, 237),
152
+ ("icircumflex", None, 148, 238, 238),
153
+ ("idieresis", None, 149, 239, 239),
154
+ ("igrave", None, 147, 236, 236),
155
+ ("j", 106, 106, 106, 106),
156
+ ("k", 107, 107, 107, 107),
157
+ ("l", 108, 108, 108, 108),
158
+ ("less", 60, 60, 60, 60),
159
+ ("logicalnot", None, 194, 172, 172),
160
+ ("lslash", 248, None, None, 155),
161
+ ("m", 109, 109, 109, 109),
162
+ ("macron", 197, 248, 175, 175),
163
+ ("minus", None, None, None, 138),
164
+ ("mu", None, 181, 181, 181),
165
+ ("multiply", None, None, 215, 215),
166
+ ("n", 110, 110, 110, 110),
167
+ ("nbspace", None, 202, 160, None),
168
+ ("nine", 57, 57, 57, 57),
169
+ ("ntilde", None, 150, 241, 241),
170
+ ("numbersign", 35, 35, 35, 35),
171
+ ("o", 111, 111, 111, 111),
172
+ ("oacute", None, 151, 243, 243),
173
+ ("ocircumflex", None, 153, 244, 244),
174
+ ("odieresis", None, 154, 246, 246),
175
+ ("oe", 250, 207, 156, 156),
176
+ ("ogonek", 206, 254, None, 29),
177
+ ("ograve", None, 152, 242, 242),
178
+ ("one", 49, 49, 49, 49),
179
+ ("onehalf", None, None, 189, 189),
180
+ ("onequarter", None, None, 188, 188),
181
+ ("onesuperior", None, None, 185, 185),
182
+ ("ordfeminine", 227, 187, 170, 170),
183
+ ("ordmasculine", 235, 188, 186, 186),
184
+ ("oslash", 249, 191, 248, 248),
185
+ ("otilde", None, 155, 245, 245),
186
+ ("p", 112, 112, 112, 112),
187
+ ("paragraph", 182, 166, 182, 182),
188
+ ("parenleft", 40, 40, 40, 40),
189
+ ("parenright", 41, 41, 41, 41),
190
+ ("percent", 37, 37, 37, 37),
191
+ ("period", 46, 46, 46, 46),
192
+ ("periodcentered", 180, 225, 183, 183),
193
+ ("perthousand", 189, 228, 137, 139),
194
+ ("plus", 43, 43, 43, 43),
195
+ ("plusminus", None, 177, 177, 177),
196
+ ("q", 113, 113, 113, 113),
197
+ ("question", 63, 63, 63, 63),
198
+ ("questiondown", 191, 192, 191, 191),
199
+ ("quotedbl", 34, 34, 34, 34),
200
+ ("quotedblbase", 185, 227, 132, 140),
201
+ ("quotedblleft", 170, 210, 147, 141),
202
+ ("quotedblright", 186, 211, 148, 142),
203
+ ("quoteleft", 96, 212, 145, 143),
204
+ ("quoteright", 39, 213, 146, 144),
205
+ ("quotesinglbase", 184, 226, 130, 145),
206
+ ("quotesingle", 169, 39, 39, 39),
207
+ ("r", 114, 114, 114, 114),
208
+ ("registered", None, 168, 174, 174),
209
+ ("ring", 202, 251, None, 30),
210
+ ("s", 115, 115, 115, 115),
211
+ ("scaron", None, None, 154, 157),
212
+ ("section", 167, 164, 167, 167),
213
+ ("semicolon", 59, 59, 59, 59),
214
+ ("seven", 55, 55, 55, 55),
215
+ ("six", 54, 54, 54, 54),
216
+ ("slash", 47, 47, 47, 47),
217
+ ("space", 32, 32, 32, 32),
218
+ ("space", None, 202, 160, None),
219
+ ("space", None, 202, 173, None),
220
+ ("sterling", 163, 163, 163, 163),
221
+ ("t", 116, 116, 116, 116),
222
+ ("thorn", None, None, 254, 254),
223
+ ("three", 51, 51, 51, 51),
224
+ ("threequarters", None, None, 190, 190),
225
+ ("threesuperior", None, None, 179, 179),
226
+ ("tilde", 196, 247, 152, 31),
227
+ ("trademark", None, 170, 153, 146),
228
+ ("two", 50, 50, 50, 50),
229
+ ("twosuperior", None, None, 178, 178),
230
+ ("u", 117, 117, 117, 117),
231
+ ("uacute", None, 156, 250, 250),
232
+ ("ucircumflex", None, 158, 251, 251),
233
+ ("udieresis", None, 159, 252, 252),
234
+ ("ugrave", None, 157, 249, 249),
235
+ ("underscore", 95, 95, 95, 95),
236
+ ("v", 118, 118, 118, 118),
237
+ ("w", 119, 119, 119, 119),
238
+ ("x", 120, 120, 120, 120),
239
+ ("y", 121, 121, 121, 121),
240
+ ("yacute", None, None, 253, 253),
241
+ ("ydieresis", None, 216, 255, 255),
242
+ ("yen", 165, 180, 165, 165),
243
+ ("z", 122, 122, 122, 122),
244
+ ("zcaron", None, None, 158, 158),
245
+ ("zero", 48, 48, 48, 48),
246
+ ]
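# Illustrative sketch (not part of this commit): building a glyph-name ->
# WinAnsi code lookup from the (name, std, mac, win, pdf) rows above.
from pdf2zh.latin_enc import ENCODING

win_codes = {name: win for name, std, mac, win, pdf in ENCODING if win is not None}
assert win_codes["Euro"] == 128        # WinAnsi places the euro sign at 0x80
assert win_codes["quotesingle"] == 39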
pdf2zh/layout.py ADDED
@@ -0,0 +1,993 @@
1
+ import heapq
2
+ import logging
3
+ from typing import (
4
+ Dict,
5
+ Generic,
6
+ Iterable,
7
+ Iterator,
8
+ List,
9
+ Optional,
10
+ Sequence,
11
+ Set,
12
+ Tuple,
13
+ TypeVar,
14
+ Union,
15
+ cast,
16
+ )
17
+
18
+ from pdf2zh.pdfcolor import PDFColorSpace
19
+ from pdf2zh.pdfexceptions import PDFTypeError, PDFValueError
20
+ from pdf2zh.pdffont import PDFFont
21
+ from pdf2zh.pdfinterp import Color, PDFGraphicState
22
+ from pdf2zh.pdftypes import PDFStream
23
+ from pdf2zh.utils import (
24
+ INF,
25
+ LTComponentT,
26
+ Matrix,
27
+ PathSegment,
28
+ Plane,
29
+ Point,
30
+ Rect,
31
+ apply_matrix_pt,
32
+ bbox2str,
33
+ fsplit,
34
+ get_bound,
35
+ matrix2str,
36
+ uniq,
37
+ )
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+
42
+ class IndexAssigner:
43
+ def __init__(self, index: int = 0) -> None:
44
+ self.index = index
45
+
46
+ def run(self, obj: "LTItem") -> None:
47
+ if isinstance(obj, LTTextBox):
48
+ obj.index = self.index
49
+ self.index += 1
50
+ elif isinstance(obj, LTTextGroup):
51
+ for x in obj:
52
+ self.run(x)
53
+
54
+
55
+ class LAParams:
56
+ """Parameters for layout analysis
57
+
58
+ :param line_overlap: If two characters have more overlap than this they
59
+ are considered to be on the same line. The overlap is specified
60
+ relative to the minimum height of both characters.
61
+ :param char_margin: If two characters are closer together than this
62
+ margin they are considered part of the same line. The margin is
63
+ specified relative to the width of the character.
64
+ :param word_margin: If two characters on the same line are further apart
65
+ than this margin then they are considered to be two separate words, and
66
+ an intermediate space will be added for readability. The margin is
67
+ specified relative to the width of the character.
68
+ :param line_margin: If two lines are close together they are
69
+ considered to be part of the same paragraph. The margin is
70
+ specified relative to the height of a line.
71
+ :param boxes_flow: Specifies how much a horizontal and vertical position
72
+ of a text matters when determining the order of text boxes. The value
73
+ should be within the range of -1.0 (only horizontal position
74
+ matters) to +1.0 (only vertical position matters). You can also pass
75
+ `None` to disable advanced layout analysis, and instead return text
76
+ based on the position of the bottom left corner of the text box.
77
+ :param detect_vertical: If vertical text should be considered during
78
+ layout analysis
79
+ :param all_texts: If layout analysis should be performed on text in
80
+ figures.
81
+ """
82
+
83
+ def __init__(
84
+ self,
85
+ line_overlap: float = 0.5,
86
+ char_margin: float = 2.0,
87
+ line_margin: float = 0.5,
88
+ word_margin: float = 0.1,
89
+ boxes_flow: Optional[float] = 0.5,
90
+ detect_vertical: bool = False,
91
+ all_texts: bool = False,
92
+ ) -> None:
93
+ self.line_overlap = line_overlap
94
+ self.char_margin = char_margin
95
+ self.line_margin = line_margin
96
+ self.word_margin = word_margin
97
+ self.boxes_flow = boxes_flow
98
+ self.detect_vertical = detect_vertical
99
+ self.all_texts = all_texts
100
+
101
+ self._validate()
102
+
103
+ def _validate(self) -> None:
104
+ if self.boxes_flow is not None:
105
+ boxes_flow_err_msg = (
106
+ "LAParam boxes_flow should be None, or a number between -1 and +1"
107
+ )
108
+ if not (
109
+ isinstance(self.boxes_flow, int) or isinstance(self.boxes_flow, float)
110
+ ):
111
+ raise PDFTypeError(boxes_flow_err_msg)
112
+ if not -1 <= self.boxes_flow <= 1:
113
+ raise PDFValueError(boxes_flow_err_msg)
114
+
115
+ def __repr__(self) -> str:
116
+ return (
117
+ "<LAParams: char_margin=%.1f, line_margin=%.1f, "
118
+ "word_margin=%.1f all_texts=%r>"
119
+ % (self.char_margin, self.line_margin, self.word_margin, self.all_texts)
120
+ )
121
+
122
+
123
+ class LTItem:
124
+ """Interface for things that can be analyzed"""
125
+
126
+ def analyze(self, laparams: LAParams) -> None:
127
+ """Perform the layout analysis."""
128
+
129
+
130
+ class LTText:
131
+ """Interface for things that have text"""
132
+
133
+ def __repr__(self) -> str:
134
+ return f"<{self.__class__.__name__} {self.get_text()!r}>"
135
+
136
+ def get_text(self) -> str:
137
+ """Text contained in this object"""
138
+ raise NotImplementedError
139
+
140
+
141
+ class LTComponent(LTItem):
142
+ """Object with a bounding box"""
143
+
144
+ def __init__(self, bbox: Rect) -> None:
145
+ LTItem.__init__(self)
146
+ self.set_bbox(bbox)
147
+
148
+ def __repr__(self) -> str:
149
+ return f"<{self.__class__.__name__} {bbox2str(self.bbox)}>"
150
+
151
+ # Disable comparison.
152
+ def __lt__(self, _: object) -> bool:
153
+ raise PDFValueError
154
+
155
+ def __le__(self, _: object) -> bool:
156
+ raise PDFValueError
157
+
158
+ def __gt__(self, _: object) -> bool:
159
+ raise PDFValueError
160
+
161
+ def __ge__(self, _: object) -> bool:
162
+ raise PDFValueError
163
+
164
+ def set_bbox(self, bbox: Rect) -> None:
165
+ (x0, y0, x1, y1) = bbox
166
+ self.x0 = x0
167
+ self.y0 = y0
168
+ self.x1 = x1
169
+ self.y1 = y1
170
+ self.width = x1 - x0
171
+ self.height = y1 - y0
172
+ self.bbox = bbox
173
+
174
+ def is_empty(self) -> bool:
175
+ return self.width <= 0 or self.height <= 0
176
+
177
+ def is_hoverlap(self, obj: "LTComponent") -> bool:
178
+ assert isinstance(obj, LTComponent), str(type(obj))
179
+ return obj.x0 <= self.x1 and self.x0 <= obj.x1
180
+
181
+ def hdistance(self, obj: "LTComponent") -> float:
182
+ assert isinstance(obj, LTComponent), str(type(obj))
183
+ if self.is_hoverlap(obj):
184
+ return 0
185
+ else:
186
+ return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))
187
+
188
+ def hoverlap(self, obj: "LTComponent") -> float:
189
+ assert isinstance(obj, LTComponent), str(type(obj))
190
+ if self.is_hoverlap(obj):
191
+ return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))
192
+ else:
193
+ return 0
194
+
195
+ def is_voverlap(self, obj: "LTComponent") -> bool:
196
+ assert isinstance(obj, LTComponent), str(type(obj))
197
+ return obj.y0 <= self.y1 and self.y0 <= obj.y1
198
+
199
+ def vdistance(self, obj: "LTComponent") -> float:
200
+ assert isinstance(obj, LTComponent), str(type(obj))
201
+ if self.is_voverlap(obj):
202
+ return 0
203
+ else:
204
+ return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))
205
+
206
+ def voverlap(self, obj: "LTComponent") -> float:
207
+ assert isinstance(obj, LTComponent), str(type(obj))
208
+ if self.is_voverlap(obj):
209
+ return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))
210
+ else:
211
+ return 0
212
+
213
+
214
+ class LTCurve(LTComponent):
215
+ """A generic Bezier curve
216
+
217
+ The parameter `original_path` contains the original
218
+ pathing information from the pdf (e.g. for reconstructing Bezier Curves).
219
+
220
+ `dashing_style` contains the Dashing information if any.
221
+ """
222
+
223
+ def __init__(
224
+ self,
225
+ linewidth: float,
226
+ pts: List[Point],
227
+ stroke: bool = False,
228
+ fill: bool = False,
229
+ evenodd: bool = False,
230
+ stroking_color: Optional[Color] = None,
231
+ non_stroking_color: Optional[Color] = None,
232
+ original_path: Optional[List[PathSegment]] = None,
233
+ dashing_style: Optional[Tuple[object, object]] = None,
234
+ ) -> None:
235
+ LTComponent.__init__(self, get_bound(pts))
236
+ self.pts = pts
237
+ self.linewidth = linewidth
238
+ self.stroke = stroke
239
+ self.fill = fill
240
+ self.evenodd = evenodd
241
+ self.stroking_color = stroking_color
242
+ self.non_stroking_color = non_stroking_color
243
+ self.original_path = original_path
244
+ self.dashing_style = dashing_style
245
+
246
+ def get_pts(self) -> str:
247
+ return ",".join("%.3f,%.3f" % p for p in self.pts)
248
+
249
+
250
+ class LTLine(LTCurve):
251
+ """A single straight line.
252
+
253
+ Could be used for separating text or figures.
254
+ """
255
+
256
+ def __init__(
257
+ self,
258
+ linewidth: float,
259
+ p0: Point,
260
+ p1: Point,
261
+ stroke: bool = False,
262
+ fill: bool = False,
263
+ evenodd: bool = False,
264
+ stroking_color: Optional[Color] = None,
265
+ non_stroking_color: Optional[Color] = None,
266
+ original_path: Optional[List[PathSegment]] = None,
267
+ dashing_style: Optional[Tuple[object, object]] = None,
268
+ ) -> None:
269
+ LTCurve.__init__(
270
+ self,
271
+ linewidth,
272
+ [p0, p1],
273
+ stroke,
274
+ fill,
275
+ evenodd,
276
+ stroking_color,
277
+ non_stroking_color,
278
+ original_path,
279
+ dashing_style,
280
+ )
281
+
282
+
283
+ class LTRect(LTCurve):
284
+ """A rectangle.
285
+
286
+ Could be used for framing other pictures or figures.
287
+ """
288
+
289
+ def __init__(
290
+ self,
291
+ linewidth: float,
292
+ bbox: Rect,
293
+ stroke: bool = False,
294
+ fill: bool = False,
295
+ evenodd: bool = False,
296
+ stroking_color: Optional[Color] = None,
297
+ non_stroking_color: Optional[Color] = None,
298
+ original_path: Optional[List[PathSegment]] = None,
299
+ dashing_style: Optional[Tuple[object, object]] = None,
300
+ ) -> None:
301
+ (x0, y0, x1, y1) = bbox
302
+ LTCurve.__init__(
303
+ self,
304
+ linewidth,
305
+ [(x0, y0), (x1, y0), (x1, y1), (x0, y1)],
306
+ stroke,
307
+ fill,
308
+ evenodd,
309
+ stroking_color,
310
+ non_stroking_color,
311
+ original_path,
312
+ dashing_style,
313
+ )
314
+
315
+
316
+ class LTImage(LTComponent):
317
+ """An image object.
318
+
319
+ Embedded images can be in JPEG, Bitmap or JBIG2.
320
+ """
321
+
322
+ def __init__(self, name: str, stream: PDFStream, bbox: Rect) -> None:
323
+ LTComponent.__init__(self, bbox)
324
+ self.name = name
325
+ self.stream = stream
326
+ self.srcsize = (stream.get_any(("W", "Width")), stream.get_any(("H", "Height")))
327
+ self.imagemask = stream.get_any(("IM", "ImageMask"))
328
+ self.bits = stream.get_any(("BPC", "BitsPerComponent"), 1)
329
+ self.colorspace = stream.get_any(("CS", "ColorSpace"))
330
+ if not isinstance(self.colorspace, list):
331
+ self.colorspace = [self.colorspace]
332
+
333
+ def __repr__(self) -> str:
334
+ return f"<{self.__class__.__name__}({self.name}) {bbox2str(self.bbox)} {self.srcsize!r}>"
335
+
336
+
337
+ class LTAnno(LTItem, LTText):
338
+ """Actual letter in the text as a Unicode string.
339
+
340
+ Note that, while an LTChar object has actual boundaries, LTAnno objects do
341
+ not, as these are "virtual" characters, inserted by a layout analyzer
342
+ according to the relationship between two characters (e.g. a space).
343
+ """
344
+
345
+ def __init__(self, text: str) -> None:
346
+ self._text = text
347
+
348
+ def get_text(self) -> str:
349
+ return self._text
350
+
351
+
352
+ class LTChar(LTComponent, LTText):
353
+ """Actual letter in the text as a Unicode string."""
354
+
355
+ def __init__(
356
+ self,
357
+ matrix: Matrix,
358
+ font: PDFFont,
359
+ fontsize: float,
360
+ scaling: float,
361
+ rise: float,
362
+ text: str,
363
+ textwidth: float,
364
+ textdisp: Union[float, Tuple[Optional[float], float]],
365
+ ncs: PDFColorSpace,
366
+ graphicstate: PDFGraphicState,
367
+ ) -> None:
368
+ LTText.__init__(self)
369
+ self._text = text
370
+ self.matrix = matrix
371
+ self.font = font
372
+ self.fontname = font.fontname
373
+ self.ncs = ncs
374
+ self.graphicstate = graphicstate
375
+ self.adv = textwidth * fontsize * scaling
376
+ # compute the boundary rectangle.
377
+ if font.is_vertical():
378
+ # vertical
379
+ assert isinstance(textdisp, tuple)
380
+ (vx, vy) = textdisp
381
+ if vx is None:
382
+ vx = fontsize * 0.5
383
+ else:
384
+ vx = vx * fontsize * 0.001
385
+ vy = (1000 - vy) * fontsize * 0.001
386
+ bbox_lower_left = (-vx, vy + rise + self.adv)
387
+ bbox_upper_right = (-vx + fontsize, vy + rise)
388
+ else:
389
+ # horizontal
390
+ descent = 0 # descent = font.get_descent() * fontsize
391
+ bbox_lower_left = (0, descent + rise)
392
+ bbox_upper_right = (self.adv, descent + rise + fontsize)
393
+ (a, b, c, d, e, f) = self.matrix
394
+ self.upright = a * d * scaling > 0 and b * c <= 0
395
+ (x0, y0) = apply_matrix_pt(self.matrix, bbox_lower_left)
396
+ (x1, y1) = apply_matrix_pt(self.matrix, bbox_upper_right)
397
+ if x1 < x0:
398
+ (x0, x1) = (x1, x0)
399
+ if y1 < y0:
400
+ (y0, y1) = (y1, y0)
401
+ LTComponent.__init__(self, (x0, y0, x1, y1))
402
+ if font.is_vertical():
403
+ self.size = self.width
404
+ else:
405
+ self.size = self.height
406
+
407
+ def __repr__(self) -> str:
408
+ return "<{} {} matrix={} font={} adv={} text={}>".format(
409
+ self.__class__.__name__,
410
+ bbox2str(self.bbox),
411
+ matrix2str(self.matrix),
412
+ repr(self.fontname),
413
+ self.adv,
414
+ repr(self.get_text()),
415
+ )
416
+
417
+ def get_text(self) -> str:
418
+ return self._text
419
+
420
+
421
+ LTItemT = TypeVar("LTItemT", bound=LTItem)
422
+
423
+
424
+ class LTContainer(LTComponent, Generic[LTItemT]):
425
+ """Object that can be extended and analyzed"""
426
+
427
+ def __init__(self, bbox: Rect) -> None:
428
+ LTComponent.__init__(self, bbox)
429
+ self._objs: List[LTItemT] = []
430
+
431
+ def __iter__(self) -> Iterator[LTItemT]:
432
+ return iter(self._objs)
433
+
434
+ def __len__(self) -> int:
435
+ return len(self._objs)
436
+
437
+ def add(self, obj: LTItemT) -> None:
438
+ self._objs.append(obj)
439
+
440
+ def extend(self, objs: Iterable[LTItemT]) -> None:
441
+ for obj in objs:
442
+ self.add(obj)
443
+
444
+ def analyze(self, laparams: LAParams) -> None:
445
+ for obj in self._objs:
446
+ obj.analyze(laparams)
447
+
448
+
449
+ class LTExpandableContainer(LTContainer[LTItemT]):
450
+ def __init__(self) -> None:
451
+ LTContainer.__init__(self, (+INF, +INF, -INF, -INF))
452
+
453
+ # Incompatible override: we take an LTComponent (with bounding box), but
454
+ # super() LTContainer only considers LTItem (no bounding box).
455
+ def add(self, obj: LTComponent) -> None: # type: ignore[override]
456
+ LTContainer.add(self, cast(LTItemT, obj))
457
+ self.set_bbox(
458
+ (
459
+ min(self.x0, obj.x0),
460
+ min(self.y0, obj.y0),
461
+ max(self.x1, obj.x1),
462
+ max(self.y1, obj.y1),
463
+ ),
464
+ )
465
+
466
+
467
+ class LTTextContainer(LTExpandableContainer[LTItemT], LTText):
468
+ def __init__(self) -> None:
469
+ LTText.__init__(self)
470
+ LTExpandableContainer.__init__(self)
471
+
472
+ def get_text(self) -> str:
473
+ return "".join(
474
+ cast(LTText, obj).get_text() for obj in self if isinstance(obj, LTText)
475
+ )
476
+
477
+
478
+ TextLineElement = Union[LTChar, LTAnno]
479
+
480
+
481
+ class LTTextLine(LTTextContainer[TextLineElement]):
482
+ """Contains a list of LTChar objects that represent a single text line.
483
+
484
+ The characters are aligned either horizontally or vertically, depending on
485
+ the text's writing mode.
486
+ """
487
+
488
+ def __init__(self, word_margin: float) -> None:
489
+ super().__init__()
490
+ self.word_margin = word_margin
491
+
492
+ def __repr__(self) -> str:
493
+ return f"<{self.__class__.__name__} {bbox2str(self.bbox)} {self.get_text()!r}>"
494
+
495
+ def analyze(self, laparams: LAParams) -> None:
496
+ for obj in self._objs:
497
+ obj.analyze(laparams)
498
+ LTContainer.add(self, LTAnno("\n"))
499
+
500
+ def find_neighbors(
501
+ self,
502
+ plane: Plane[LTComponentT],
503
+ ratio: float,
504
+ ) -> List["LTTextLine"]:
505
+ raise NotImplementedError
506
+
507
+ def is_empty(self) -> bool:
508
+ return super().is_empty() or self.get_text().isspace()
509
+
510
+
511
+ class LTTextLineHorizontal(LTTextLine):
512
+ def __init__(self, word_margin: float) -> None:
513
+ LTTextLine.__init__(self, word_margin)
514
+ self._x1: float = +INF
515
+
516
+ # Incompatible override: we take an LTComponent (with bounding box), but
517
+ # LTContainer only considers LTItem (no bounding box).
518
+ def add(self, obj: LTComponent) -> None: # type: ignore[override]
519
+ if isinstance(obj, LTChar) and self.word_margin:
520
+ margin = self.word_margin * max(obj.width, obj.height)
521
+ if self._x1 < obj.x0 - margin:
522
+ LTContainer.add(self, LTAnno(" "))
523
+ self._x1 = obj.x1
524
+ super().add(obj)
525
+
526
+ def find_neighbors(
527
+ self,
528
+ plane: Plane[LTComponentT],
529
+ ratio: float,
530
+ ) -> List[LTTextLine]:
531
+ """Finds neighboring LTTextLineHorizontals in the plane.
532
+
533
+ Returns a list of other LTTextLineHorizontals in the plane which are
534
+ close to self. "Close" can be controlled by ratio. The returned objects
535
+ will be the same height as self, and also either left-, right-, or
536
+ centrally-aligned.
537
+ """
538
+ d = ratio * self.height
539
+ objs = plane.find((self.x0, self.y0 - d, self.x1, self.y1 + d))
540
+ return [
541
+ obj
542
+ for obj in objs
543
+ if (
544
+ isinstance(obj, LTTextLineHorizontal)
545
+ and self._is_same_height_as(obj, tolerance=d)
546
+ and (
547
+ self._is_left_aligned_with(obj, tolerance=d)
548
+ or self._is_right_aligned_with(obj, tolerance=d)
549
+ or self._is_centrally_aligned_with(obj, tolerance=d)
550
+ )
551
+ )
552
+ ]
553
+
554
+ def _is_left_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
555
+ """Whether the left-hand edge of `other` is within `tolerance`."""
556
+ return abs(other.x0 - self.x0) <= tolerance
557
+
558
+ def _is_right_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
559
+ """Whether the right-hand edge of `other` is within `tolerance`."""
560
+ return abs(other.x1 - self.x1) <= tolerance
561
+
562
+ def _is_centrally_aligned_with(
563
+ self,
564
+ other: LTComponent,
565
+ tolerance: float = 0,
566
+ ) -> bool:
567
+ """Whether the horizontal center of `other` is within `tolerance`."""
568
+ return abs((other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance
569
+
570
+ def _is_same_height_as(self, other: LTComponent, tolerance: float = 0) -> bool:
571
+ return abs(other.height - self.height) <= tolerance
572
+
573
+
574
+ class LTTextLineVertical(LTTextLine):
575
+ def __init__(self, word_margin: float) -> None:
576
+ LTTextLine.__init__(self, word_margin)
577
+ self._y0: float = -INF
578
+
579
+ # Incompatible override: we take an LTComponent (with bounding box), but
580
+ # LTContainer only considers LTItem (no bounding box).
581
+ def add(self, obj: LTComponent) -> None: # type: ignore[override]
582
+ if isinstance(obj, LTChar) and self.word_margin:
583
+ margin = self.word_margin * max(obj.width, obj.height)
584
+ if obj.y1 + margin < self._y0:
585
+ LTContainer.add(self, LTAnno(" "))
586
+ self._y0 = obj.y0
587
+ super().add(obj)
588
+
589
+ def find_neighbors(
590
+ self,
591
+ plane: Plane[LTComponentT],
592
+ ratio: float,
593
+ ) -> List[LTTextLine]:
594
+ """Finds neighboring LTTextLineVerticals in the plane.
595
+
596
+ Returns a list of other LTTextLineVerticals in the plane which are
597
+ close to self. "Close" can be controlled by ratio. The returned objects
598
+ will be the same width as self, and also either upper-, lower-, or
599
+ centrally-aligned.
600
+ """
601
+ d = ratio * self.width
602
+ objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
603
+ return [
604
+ obj
605
+ for obj in objs
606
+ if (
607
+ isinstance(obj, LTTextLineVertical)
608
+ and self._is_same_width_as(obj, tolerance=d)
609
+ and (
610
+ self._is_lower_aligned_with(obj, tolerance=d)
611
+ or self._is_upper_aligned_with(obj, tolerance=d)
612
+ or self._is_centrally_aligned_with(obj, tolerance=d)
613
+ )
614
+ )
615
+ ]
616
+
617
+ def _is_lower_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
618
+ """Whether the lower edge of `other` is within `tolerance`."""
619
+ return abs(other.y0 - self.y0) <= tolerance
620
+
621
+ def _is_upper_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
622
+ """Whether the upper edge of `other` is within `tolerance`."""
623
+ return abs(other.y1 - self.y1) <= tolerance
624
+
625
+ def _is_centrally_aligned_with(
626
+ self,
627
+ other: LTComponent,
628
+ tolerance: float = 0,
629
+ ) -> bool:
630
+ """Whether the vertical center of `other` is within `tolerance`."""
631
+ return abs((other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance
632
+
633
+ def _is_same_width_as(self, other: LTComponent, tolerance: float) -> bool:
634
+ return abs(other.width - self.width) <= tolerance
635
+
636
+
637
+ class LTTextBox(LTTextContainer[LTTextLine]):
638
+ """Represents a group of text chunks in a rectangular area.
639
+
640
+ Note that this box is created by geometric analysis and does not
641
+ necessarily represent a logical boundary of the text. It contains a list
642
+ of LTTextLine objects.
643
+ """
644
+
645
+ def __init__(self) -> None:
646
+ LTTextContainer.__init__(self)
647
+ self.index: int = -1
648
+
649
+ def __repr__(self) -> str:
650
+ return f"<{self.__class__.__name__}({self.index}) {bbox2str(self.bbox)} {self.get_text()!r}>"
651
+
652
+ def get_writing_mode(self) -> str:
653
+ raise NotImplementedError
654
+
655
+
656
+ class LTTextBoxHorizontal(LTTextBox):
657
+ def analyze(self, laparams: LAParams) -> None:
658
+ super().analyze(laparams)
659
+ self._objs.sort(key=lambda obj: -obj.y1)
660
+
661
+ def get_writing_mode(self) -> str:
662
+ return "lr-tb"
663
+
664
+
665
+ class LTTextBoxVertical(LTTextBox):
666
+ def analyze(self, laparams: LAParams) -> None:
667
+ super().analyze(laparams)
668
+ self._objs.sort(key=lambda obj: -obj.x1)
669
+
670
+ def get_writing_mode(self) -> str:
671
+ return "tb-rl"
672
+
673
+
674
+ TextGroupElement = Union[LTTextBox, "LTTextGroup"]
675
+
676
+
677
+ class LTTextGroup(LTTextContainer[TextGroupElement]):
678
+ def __init__(self, objs: Iterable[TextGroupElement]) -> None:
679
+ super().__init__()
680
+ self.extend(objs)
681
+
682
+
683
+ class LTTextGroupLRTB(LTTextGroup):
684
+ def analyze(self, laparams: LAParams) -> None:
685
+ super().analyze(laparams)
686
+ assert laparams.boxes_flow is not None
687
+ boxes_flow = laparams.boxes_flow
688
+ # reorder the objects from top-left to bottom-right.
689
+ self._objs.sort(
690
+ key=lambda obj: (1 - boxes_flow) * obj.x0
691
+ - (1 + boxes_flow) * (obj.y0 + obj.y1),
692
+ )
693
+
694
+
695
+ class LTTextGroupTBRL(LTTextGroup):
696
+ def analyze(self, laparams: LAParams) -> None:
697
+ super().analyze(laparams)
698
+ assert laparams.boxes_flow is not None
699
+ boxes_flow = laparams.boxes_flow
700
+ # reorder the objects from top-right to bottom-left.
701
+ self._objs.sort(
702
+ key=lambda obj: -(1 + boxes_flow) * (obj.x0 + obj.x1)
703
+ - (1 - boxes_flow) * obj.y1,
704
+ )
705
+
706
+
707
+ class LTLayoutContainer(LTContainer[LTComponent]):
708
+ def __init__(self, bbox: Rect) -> None:
709
+ LTContainer.__init__(self, bbox)
710
+ self.groups: Optional[List[LTTextGroup]] = None
711
+
712
+ # group_objects: group text objects into text lines.
713
+ def group_objects(
714
+ self,
715
+ laparams: LAParams,
716
+ objs: Iterable[LTComponent],
717
+ ) -> Iterator[LTTextLine]:
718
+ obj0 = None
719
+ line = None
720
+ for obj1 in objs:
721
+ if obj0 is not None:
722
+ # halign: obj0 and obj1 are horizontally aligned.
723
+ #
724
+ # +------+ - - -
725
+ # | obj0 | - - +------+ -
726
+ # | | | obj1 | | (line_overlap)
727
+ # +------+ - - | | -
728
+ # - - - +------+
729
+ #
730
+ # |<--->|
731
+ # (char_margin)
732
+ halign = (
733
+ obj0.is_voverlap(obj1)
734
+ and min(obj0.height, obj1.height) * laparams.line_overlap
735
+ < obj0.voverlap(obj1)
736
+ and obj0.hdistance(obj1)
737
+ < max(obj0.width, obj1.width) * laparams.char_margin
738
+ )
739
+
740
+ # valign: obj0 and obj1 are vertically aligned.
741
+ #
742
+ # +------+
743
+ # | obj0 |
744
+ # | |
745
+ # +------+ - - -
746
+ # | | | (char_margin)
747
+ # +------+ - -
748
+ # | obj1 |
749
+ # | |
750
+ # +------+
751
+ #
752
+ # |<-->|
753
+ # (line_overlap)
754
+ valign = (
755
+ laparams.detect_vertical
756
+ and obj0.is_hoverlap(obj1)
757
+ and min(obj0.width, obj1.width) * laparams.line_overlap
758
+ < obj0.hoverlap(obj1)
759
+ and obj0.vdistance(obj1)
760
+ < max(obj0.height, obj1.height) * laparams.char_margin
761
+ )
762
+
763
+ if (halign and isinstance(line, LTTextLineHorizontal)) or (
764
+ valign and isinstance(line, LTTextLineVertical)
765
+ ):
766
+ line.add(obj1)
767
+ elif line is not None:
768
+ yield line
769
+ line = None
770
+ elif valign and not halign:
771
+ line = LTTextLineVertical(laparams.word_margin)
772
+ line.add(obj0)
773
+ line.add(obj1)
774
+ elif halign and not valign:
775
+ line = LTTextLineHorizontal(laparams.word_margin)
776
+ line.add(obj0)
777
+ line.add(obj1)
778
+ else:
779
+ line = LTTextLineHorizontal(laparams.word_margin)
780
+ line.add(obj0)
781
+ yield line
782
+ line = None
783
+ obj0 = obj1
784
+ if line is None:
785
+ line = LTTextLineHorizontal(laparams.word_margin)
786
+ assert obj0 is not None
787
+ line.add(obj0)
788
+ yield line
789
+
790
+ def group_textlines(
791
+ self,
792
+ laparams: LAParams,
793
+ lines: Iterable[LTTextLine],
794
+ ) -> Iterator[LTTextBox]:
795
+ """Group neighboring lines to textboxes"""
796
+ plane: Plane[LTTextLine] = Plane(self.bbox)
797
+ plane.extend(lines)
798
+ boxes: Dict[LTTextLine, LTTextBox] = {}
799
+ for line in lines:
800
+ neighbors = line.find_neighbors(plane, laparams.line_margin)
801
+ members = [line]
802
+ for obj1 in neighbors:
803
+ members.append(obj1)
804
+ if obj1 in boxes:
805
+ members.extend(boxes.pop(obj1))
806
+ if isinstance(line, LTTextLineHorizontal):
807
+ box: LTTextBox = LTTextBoxHorizontal()
808
+ else:
809
+ box = LTTextBoxVertical()
810
+ for obj in uniq(members):
811
+ box.add(obj)
812
+ boxes[obj] = box
813
+ done = set()
814
+ for line in lines:
815
+ if line not in boxes:
816
+ continue
817
+ box = boxes[line]
818
+ if box in done:
819
+ continue
820
+ done.add(box)
821
+ if not box.is_empty():
822
+ yield box
823
+
824
+ def group_textboxes(
825
+ self,
826
+ laparams: LAParams,
827
+ boxes: Sequence[LTTextBox],
828
+ ) -> List[LTTextGroup]:
829
+ """Group textboxes hierarchically.
830
+
831
+ Get pair-wise distances, via dist func defined below, and then merge
832
+ from the closest textbox pair. Once obj1 and obj2 are merged /
833
+ grouped, the resulting group is considered as a new object, and its
834
+ distances to other objects & groups are added to the process queue.
835
+
836
+ For performance reasons, pair-wise distances and object pair info are
837
+ maintained in a heap of (idx, dist, id(obj1), id(obj2), obj1, obj2)
838
+ tuples. It ensures quick access to the smallest element. Note that
839
+ since comparison operators, e.g., __lt__, are disabled for
840
+ LTComponent, id(obj) has to appear before obj in element tuples.
841
+
842
+ :param laparams: LAParams object.
843
+ :param boxes: All textbox objects to be grouped.
844
+ :return: a list that has only one element, the final top level group.
845
+ """
846
+ ElementT = Union[LTTextBox, LTTextGroup]
847
+ plane: Plane[ElementT] = Plane(self.bbox)
848
+
849
+ def dist(obj1: LTComponent, obj2: LTComponent) -> float:
850
+ """A distance function between two TextBoxes.
851
+
852
+ Consider the bounding rectangle for obj1 and obj2.
853
+ Return its area less the areas of obj1 and obj2,
854
+ shown as 'www' below. This value may be negative.
855
+ +------+..........+ (x1, y1)
856
+ | obj1 |wwwwwwwwww:
857
+ +------+www+------+
858
+ :wwwwwwwwww| obj2 |
859
+ (x0, y0) +..........+------+
860
+ """
861
+ x0 = min(obj1.x0, obj2.x0)
862
+ y0 = min(obj1.y0, obj2.y0)
863
+ x1 = max(obj1.x1, obj2.x1)
864
+ y1 = max(obj1.y1, obj2.y1)
865
+ return (
866
+ (x1 - x0) * (y1 - y0)
867
+ - obj1.width * obj1.height
868
+ - obj2.width * obj2.height
869
+ )
870
+
871
+ def isany(obj1: ElementT, obj2: ElementT) -> Set[ElementT]:
872
+ """Check if there's any other object between obj1 and obj2."""
873
+ x0 = min(obj1.x0, obj2.x0)
874
+ y0 = min(obj1.y0, obj2.y0)
875
+ x1 = max(obj1.x1, obj2.x1)
876
+ y1 = max(obj1.y1, obj2.y1)
877
+ objs = set(plane.find((x0, y0, x1, y1)))
878
+ return objs.difference((obj1, obj2))
879
+
880
+ dists: List[Tuple[bool, float, int, int, ElementT, ElementT]] = []
881
+ for i in range(len(boxes)):
882
+ box1 = boxes[i]
883
+ for j in range(i + 1, len(boxes)):
884
+ box2 = boxes[j]
885
+ dists.append((False, dist(box1, box2), id(box1), id(box2), box1, box2))
886
+ heapq.heapify(dists)
887
+
888
+ plane.extend(boxes)
889
+ done = set()
890
+ while len(dists) > 0:
891
+ (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
892
+ # Skip objects that are already merged
893
+ if (id1 not in done) and (id2 not in done):
894
+ if not skip_isany and isany(obj1, obj2):
895
+ heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
896
+ continue
897
+ if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(
898
+ obj2,
899
+ (LTTextBoxVertical, LTTextGroupTBRL),
900
+ ):
901
+ group: LTTextGroup = LTTextGroupTBRL([obj1, obj2])
902
+ else:
903
+ group = LTTextGroupLRTB([obj1, obj2])
904
+ plane.remove(obj1)
905
+ plane.remove(obj2)
906
+ done.update([id1, id2])
907
+
908
+ for other in plane:
909
+ heapq.heappush(
910
+ dists,
911
+ (False, dist(group, other), id(group), id(other), group, other),
912
+ )
913
+ plane.add(group)
914
+ # By now only groups are in the plane
915
+ return list(cast(LTTextGroup, g) for g in plane)
916
+
917
+ def analyze(self, laparams: LAParams) -> None:
918
+ # textobjs is a list of LTChar objects, i.e.
919
+ # it has all the individual characters in the page.
920
+ (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
921
+ for obj in otherobjs:
922
+ obj.analyze(laparams)
923
+ if not textobjs:
924
+ return
925
+ textlines = list(self.group_objects(laparams, textobjs))
926
+ (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
927
+ for obj in empties:
928
+ obj.analyze(laparams)
929
+ textboxes = list(self.group_textlines(laparams, textlines))
930
+ if laparams.boxes_flow is None:
931
+ for textbox in textboxes:
932
+ textbox.analyze(laparams)
933
+
934
+ def getkey(box: LTTextBox) -> Tuple[int, float, float]:
935
+ if isinstance(box, LTTextBoxVertical):
936
+ return (0, -box.x1, -box.y0)
937
+ else:
938
+ return (1, -box.y0, box.x0)
939
+
940
+ textboxes.sort(key=getkey)
941
+ else:
942
+ self.groups = self.group_textboxes(laparams, textboxes)
943
+ assigner = IndexAssigner()
944
+ for group in self.groups:
945
+ group.analyze(laparams)
946
+ assigner.run(group)
947
+ textboxes.sort(key=lambda box: box.index)
948
+ self._objs = (
949
+ cast(List[LTComponent], textboxes)
950
+ + otherobjs
951
+ + cast(List[LTComponent], empties)
952
+ )
953
+
954
+
955
+ class LTFigure(LTLayoutContainer):
956
+ """Represents an area used by PDF Form objects.
957
+
958
+ PDF Forms can be used to present figures or pictures by embedding yet
959
+ another PDF document within a page. Note that LTFigure objects can appear
960
+ recursively.
961
+ """
962
+
963
+ def __init__(self, name: str, bbox: Rect, matrix: Matrix) -> None:
964
+ self.name = name
965
+ self.matrix = matrix
966
+ (x, y, w, h) = bbox
967
+ bounds = ((x, y), (x + w, y), (x, y + h), (x + w, y + h))
968
+ bbox = get_bound(apply_matrix_pt(matrix, (p, q)) for (p, q) in bounds)
969
+ LTLayoutContainer.__init__(self, bbox)
970
+
971
+ def __repr__(self) -> str:
972
+ return f"<{self.__class__.__name__}({self.name}) {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)}>"
973
+
974
+ def analyze(self, laparams: LAParams) -> None:
975
+ if not laparams.all_texts:
976
+ return
977
+ LTLayoutContainer.analyze(self, laparams)
978
+
979
+
980
+ class LTPage(LTLayoutContainer):
981
+ """Represents an entire page.
982
+
983
+ Like any other LTLayoutContainer, an LTPage can be iterated to obtain child
984
+ objects like LTTextBox, LTFigure, LTImage, LTRect, LTCurve and LTLine.
985
+ """
986
+
987
+ def __init__(self, pageid: int, bbox: Rect, rotate: float = 0) -> None:
988
+ LTLayoutContainer.__init__(self, bbox)
989
+ self.pageid = pageid
990
+ self.rotate = rotate
991
+
992
+ def __repr__(self) -> str:
993
+ return f"<{self.__class__.__name__}({self.pageid!r}) {bbox2str(self.bbox)} rotate={self.rotate!r}>"
pdf2zh/lzw.py ADDED
@@ -0,0 +1,105 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from typing import BinaryIO, Iterator, List, Optional, cast
4
+
5
+ from pdf2zh.pdfexceptions import PDFEOFError, PDFException
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
+ class CorruptDataError(PDFException):
11
+ pass
12
+
13
+
14
+ class LZWDecoder:
15
+ def __init__(self, fp: BinaryIO) -> None:
16
+ self.fp = fp
17
+ self.buff = 0
18
+ self.bpos = 8
19
+ self.nbits = 9
20
+ # NB: self.table stores None only in indices 256 and 257
21
+ self.table: List[Optional[bytes]] = []
22
+ self.prevbuf: Optional[bytes] = None
23
+
24
+ def readbits(self, bits: int) -> int:
25
+ v = 0
26
+ while 1:
27
+ # the number of remaining bits we can get from the current buffer.
28
+ r = 8 - self.bpos
29
+ if bits <= r:
30
+ # |-----8-bits-----|
31
+ # |-bpos-|-bits-| |
32
+ # | |----r----|
33
+ v = (v << bits) | ((self.buff >> (r - bits)) & ((1 << bits) - 1))
34
+ self.bpos += bits
35
+ break
36
+ else:
37
+ # |-----8-bits-----|
38
+ # |-bpos-|---bits----...
39
+ # | |----r----|
40
+ v = (v << r) | (self.buff & ((1 << r) - 1))
41
+ bits -= r
42
+ x = self.fp.read(1)
43
+ if not x:
44
+ raise PDFEOFError
45
+ self.buff = ord(x)
46
+ self.bpos = 0
47
+ return v
48
+
49
+ def feed(self, code: int) -> bytes:
50
+ x = b""
51
+ if code == 256:
52
+ self.table = [bytes((c,)) for c in range(256)] # 0-255
53
+ self.table.append(None) # 256
54
+ self.table.append(None) # 257
55
+ self.prevbuf = b""
56
+ self.nbits = 9
57
+ elif code == 257:
58
+ pass
59
+ elif not self.prevbuf:
60
+ x = self.prevbuf = cast(bytes, self.table[code]) # assume not None
61
+ else:
62
+ if code < len(self.table):
63
+ x = cast(bytes, self.table[code]) # assume not None
64
+ self.table.append(self.prevbuf + x[:1])
65
+ elif code == len(self.table):
66
+ self.table.append(self.prevbuf + self.prevbuf[:1])
67
+ x = cast(bytes, self.table[code])
68
+ else:
69
+ raise CorruptDataError
70
+ table_length = len(self.table)
71
+ if table_length == 511:
72
+ self.nbits = 10
73
+ elif table_length == 1023:
74
+ self.nbits = 11
75
+ elif table_length == 2047:
76
+ self.nbits = 12
77
+ self.prevbuf = x
78
+ return x
79
+
80
+ def run(self) -> Iterator[bytes]:
81
+ while 1:
82
+ try:
83
+ code = self.readbits(self.nbits)
84
+ except EOFError:
85
+ break
86
+ try:
87
+ x = self.feed(code)
88
+ except CorruptDataError:
89
+ # just ignore corrupt data and stop yielding there
90
+ break
91
+ yield x
92
+
93
+ # logger.debug(
94
+ # "nbits=%d, code=%d, output=%r, table=%r",
95
+ # self.nbits,
96
+ # code,
97
+ # x,
98
+ # self.table[258:],
99
+ # )
100
+
101
+
102
+ def lzwdecode(data: bytes) -> bytes:
103
+ fp = BytesIO(data)
104
+ s = LZWDecoder(fp).run()
105
+ return b"".join(s)
pdf2zh/pdf2zh.py ADDED
@@ -0,0 +1,310 @@
1
+ #!/usr/bin/env python3
2
+ """A command line tool for extracting text and images from PDF and
3
+ outputting them to plain text, html, xml or tags.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import argparse
9
+ import logging
10
+ import os
11
+ import sys
12
+ from pathlib import Path
13
+ from typing import TYPE_CHECKING, Any, Container, Iterable, List, Optional
14
+
15
+ import pymupdf
16
+ import requests
17
+
18
+ from pdf2zh import __version__
19
+ from pdf2zh.pdfexceptions import PDFValueError
20
+
21
+ if TYPE_CHECKING:
22
+ from pdf2zh.layout import LAParams
23
+ from pdf2zh.utils import AnyIO
24
+
25
+ OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag"))
26
+
27
+
28
+ def setup_log() -> None:
29
+ logging.basicConfig()
30
+
31
+ try:
32
+ import doclayout_yolo
33
+
34
+ doclayout_yolo.utils.LOGGER.setLevel(logging.WARNING)
35
+ except ImportError:
36
+ pass
37
+
38
+
39
+ def check_files(files: List[str]) -> List[str]:
40
+ files = [
41
+ f for f in files if not f.startswith("http://")
42
+ ] # exclude online files, http
43
+ files = [
44
+ f for f in files if not f.startswith("https://")
45
+ ] # exclude online files, https
46
+ missing_files = [file for file in files if not os.path.exists(file)]
47
+ return missing_files
48
+
49
+
50
+ def float_or_disabled(x: str) -> Optional[float]:
51
+ if x.lower().strip() == "disabled":
52
+ return None
53
+ try:
54
+ return float(x)
55
+ except ValueError:
56
+ raise argparse.ArgumentTypeError(f"invalid float value: {x}")
57
+
58
+
59
+ def extract_text(
60
+ files: Iterable[str] = [],
61
+ outfile: str = "-",
62
+ laparams: Optional[LAParams] = None,
63
+ output_type: str = "text",
64
+ codec: str = "utf-8",
65
+ strip_control: bool = False,
66
+ maxpages: int = 0,
67
+ pages: Optional[Container[int]] = None,
68
+ password: str = "",
69
+ scale: float = 1.0,
70
+ rotation: int = 0,
71
+ layoutmode: str = "normal",
72
+ output_dir: Optional[str] = None,
73
+ debug: bool = False,
74
+ disable_caching: bool = False,
75
+ vfont: str = "",
76
+ vchar: str = "",
77
+ thread: int = 0,
78
+ lang_in: str = "",
79
+ lang_out: str = "",
80
+ service: str = "",
81
+ callback: object = None,
82
+ output: str = "",
83
+ **kwargs: Any,
84
+ ) -> AnyIO:
85
+ import pdf2zh.high_level
86
+ from pdf2zh.doclayout import DocLayoutModel
87
+
88
+ if not files:
89
+ raise PDFValueError("Must provide files to work upon!")
90
+
91
+ if output_type == "text" and outfile != "-":
92
+ for override, alttype in OUTPUT_TYPES:
93
+ if outfile.endswith(override):
94
+ output_type = alttype
95
+
96
+ outfp: AnyIO = sys.stdout
97
+ model = DocLayoutModel.load_available()
98
+
99
+ for file in files:
100
+ if file.startswith("http://") or file.startswith("https://"):
101
+ print("Online files detected, downloading...")
102
+ try:
103
+ r = requests.get(file, allow_redirects=True)
104
+ if r.status_code == 200:
105
+ if not os.path.exists("./pdf2zh_files"):
106
+ print("Making a temporary dir for downloading PDF files...")
107
+ os.mkdir("./pdf2zh_files")
108
+ with open("./pdf2zh_files/tmp_download.pdf", "wb") as f:
109
+ print(f"Writing the file: {file}...")
110
+ f.write(r.content)
111
+ file = "./pdf2zh_files/tmp_download.pdf"
112
+ else:
113
+ r.raise_for_status()
114
+ except Exception as e:
115
+ raise PDFValueError(
116
+ f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}"
117
+ )
118
+ filename = os.path.splitext(os.path.basename(file))[0]
119
+
120
+ doc_en = pymupdf.open(file)
121
+ page_count = doc_en.page_count
122
+ font_list = ["china-ss", "tiro"]
123
+ font_id = {}
124
+ for page in doc_en:
125
+ for font in font_list:
126
+ font_id[font] = page.insert_font(font)
127
+ xreflen = doc_en.xref_length()
128
+ for xref in range(1, xreflen):
129
+ for label in ["Resources/", ""]: # 可能是基于 xobj 的 res
130
+ try:  # reading/writing this xref entry may fail
131
+ font_res = doc_en.xref_get_key(xref, f"{label}Font")
132
+ if font_res[0] == "dict":
133
+ for font in font_list:
134
+ font_exist = doc_en.xref_get_key(
135
+ xref, f"{label}Font/{font}"
136
+ )
137
+ if font_exist[0] == "null":
138
+ doc_en.xref_set_key(
139
+ xref, f"{label}Font/{font}", f"{font_id[font]} 0 R"
140
+ )
141
+ except Exception:
142
+ pass
143
+ doc_en.save(Path(output) / f"{filename}-en.pdf")
144
+
145
+ with open(Path(output) / f"{filename}-en.pdf", "rb") as fp:
146
+ obj_patch: dict = pdf2zh.high_level.extract_text_to_fp(fp, **locals())
147
+
148
+ for obj_id, ops_new in obj_patch.items():
149
+ # ops_old=doc_en.xref_stream(obj_id)
150
+ # print(obj_id)
151
+ # print(ops_old)
152
+ # print(ops_new.encode())
153
+ doc_en.update_stream(obj_id, ops_new.encode())
154
+
155
+ doc_zh = doc_en
156
+ doc_dual = pymupdf.open(Path(output) / f"{filename}-en.pdf")
157
+ doc_dual.insert_file(doc_zh)
158
+ for id in range(page_count):
159
+ doc_dual.move_page(page_count + id, id * 2 + 1)
160
+ doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
161
+ doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
162
+ doc_zh.close()
163
+ doc_dual.close()
164
+ os.remove(Path(output) / f"{filename}-en.pdf")
165
+
166
+ return
167
+
168
+
169
+ def create_parser() -> argparse.ArgumentParser:
170
+ parser = argparse.ArgumentParser(description=__doc__, add_help=True)
171
+ parser.add_argument(
172
+ "files",
173
+ type=str,
174
+ default=None,
175
+ nargs="*",
176
+ help="One or more paths to PDF files.",
177
+ )
178
+ parser.add_argument(
179
+ "--version",
180
+ "-v",
181
+ action="version",
182
+ version=f"pdf2zh v{__version__}",
183
+ )
184
+ parser.add_argument(
185
+ "--debug",
186
+ "-d",
187
+ default=False,
188
+ action="store_true",
189
+ help="Use debug logging level.",
190
+ )
191
+ parse_params = parser.add_argument_group(
192
+ "Parser",
193
+ description="Used during PDF parsing",
194
+ )
195
+ parse_params.add_argument(
196
+ "--pages",
197
+ "-p",
198
+ type=str,
199
+ help="The list of page numbers to parse.",
200
+ )
201
+ parse_params.add_argument(
202
+ "--password",
203
+ "-P",
204
+ type=str,
205
+ default="",
206
+ help="The password to use for decrypting PDF file.",
207
+ )
208
+ parse_params.add_argument(
209
+ "--vfont",
210
+ "-f",
211
+ type=str,
212
+ default="",
213
+ help="The regex to math font name of formula.",
214
+ )
215
+ parse_params.add_argument(
216
+ "--vchar",
217
+ "-c",
218
+ type=str,
219
+ default="",
220
+ help="The regex to math character of formula.",
221
+ )
222
+ parse_params.add_argument(
223
+ "--lang-in",
224
+ "-li",
225
+ type=str,
226
+ default="auto",
227
+ help="The code of source language.",
228
+ )
229
+ parse_params.add_argument(
230
+ "--lang-out",
231
+ "-lo",
232
+ type=str,
233
+ default="auto",
234
+ help="The code of target language.",
235
+ )
236
+ parse_params.add_argument(
237
+ "--service",
238
+ "-s",
239
+ type=str,
240
+ default="google",
241
+ help="The service to use for translation.",
242
+ )
243
+ parse_params.add_argument(
244
+ "--output",
245
+ "-o",
246
+ type=str,
247
+ default="",
248
+ help="Output directory for files.",
249
+ )
250
+ parse_params.add_argument(
251
+ "--thread",
252
+ "-t",
253
+ type=int,
254
+ default=4,
255
+ help="The number of threads to execute translation.",
256
+ )
257
+ parse_params.add_argument(
258
+ "--interactive",
259
+ "-i",
260
+ action="store_true",
261
+ help="Interact with GUI.",
262
+ )
263
+ parse_params.add_argument(
264
+ "--share",
265
+ action="store_true",
266
+ help="Enable Gradio Share",
267
+ )
268
+
269
+ return parser
270
+
271
+
272
+ def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
273
+ parsed_args = create_parser().parse_args(args=args)
274
+
275
+ if parsed_args.pages:
276
+ pages = []
277
+ for p in parsed_args.pages.split(","):
278
+ if "-" in p:
279
+ start, end = p.split("-")
280
+ pages.extend(range(int(start) - 1, int(end)))
281
+ else:
282
+ pages.append(int(p) - 1)
283
+ parsed_args.pages = pages
284
+
285
+ return parsed_args
286
+
287
+
288
+ def main(args: Optional[List[str]] = None) -> int:
289
+ parsed_args = parse_args(args)
290
+
291
+ missing_files = check_files(parsed_args.files)
292
+ if missing_files:
293
+ print("The following files do not exist:", file=sys.stderr)
294
+ for file in missing_files:
295
+ print(f" {file}", file=sys.stderr)
296
+ return -1
297
+ if parsed_args.interactive:
298
+ from pdf2zh.gui import setup_gui
299
+
300
+ setup_gui(parsed_args.share)
301
+ return 0
302
+
303
+ setup_log()
304
+ extract_text(**vars(parsed_args))
305
+ return 0
306
+
307
+
308
+ if __name__ == "__main__":
309
+ sys.exit(main())
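For reference, the same pipeline can also be driven programmatically through `main`, which takes an argv-style list. This is only a sketch: it assumes `example.pdf` exists locally, the `out/` directory already exists, and the layout model and translation backend are available.

from pdf2zh.pdf2zh import main

# Equivalent to: pdf2zh example.pdf -p 1-3 -s google -o out
# Writes example-zh.pdf and example-dual.pdf into out/.
exit_code = main(["example.pdf", "-p", "1-3", "-s", "google", "-o", "out"])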
pdf2zh/pdfcolor.py ADDED
@@ -0,0 +1,37 @@
1
+ import collections
2
+ from typing import Dict
3
+
4
+ from pdf2zh.psparser import LIT
5
+
6
+ LITERAL_DEVICE_GRAY = LIT("DeviceGray")
7
+ LITERAL_DEVICE_RGB = LIT("DeviceRGB")
8
+ LITERAL_DEVICE_CMYK = LIT("DeviceCMYK")
9
+ # Abbreviations for inline images
10
+ LITERAL_INLINE_DEVICE_GRAY = LIT("G")
11
+ LITERAL_INLINE_DEVICE_RGB = LIT("RGB")
12
+ LITERAL_INLINE_DEVICE_CMYK = LIT("CMYK")
13
+
14
+
15
+ class PDFColorSpace:
16
+ def __init__(self, name: str, ncomponents: int) -> None:
17
+ self.name = name
18
+ self.ncomponents = ncomponents
19
+
20
+ def __repr__(self) -> str:
21
+ return "<PDFColorSpace: %s, ncomponents=%d>" % (self.name, self.ncomponents)
22
+
23
+
24
+ PREDEFINED_COLORSPACE: Dict[str, PDFColorSpace] = collections.OrderedDict()
25
+
26
+ for name, n in [
27
+ ("DeviceGray", 1), # default value first
28
+ ("CalRGB", 3),
29
+ ("CalGray", 1),
30
+ ("Lab", 3),
31
+ ("DeviceRGB", 3),
32
+ ("DeviceCMYK", 4),
33
+ ("Separation", 1),
34
+ ("Indexed", 1),
35
+ ("Pattern", 1),
36
+ ]:
37
+ PREDEFINED_COLORSPACE[name] = PDFColorSpace(name, n)
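Looking up an entry in the predefined table gives back the component count used elsewhere when interpreting colour operands, for example:

from pdf2zh.pdfcolor import PREDEFINED_COLORSPACE

cs = PREDEFINED_COLORSPACE["DeviceCMYK"]
print(cs.name, cs.ncomponents)  # DeviceCMYK 4
print(repr(cs))                 # <PDFColorSpace: DeviceCMYK, ncomponents=4>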
pdf2zh/pdfdevice.py ADDED
@@ -0,0 +1,316 @@
1
+ from typing import (
2
+ TYPE_CHECKING,
3
+ BinaryIO,
4
+ Iterable,
5
+ List,
6
+ Optional,
7
+ Sequence,
8
+ Union,
9
+ cast,
10
+ )
11
+
12
+ from pdf2zh import utils
13
+ from pdf2zh.pdfcolor import PDFColorSpace
14
+ from pdf2zh.pdffont import PDFFont, PDFUnicodeNotDefined
15
+ from pdf2zh.pdfpage import PDFPage
16
+ from pdf2zh.pdftypes import PDFStream
17
+ from pdf2zh.psparser import PSLiteral
18
+ from pdf2zh.utils import Matrix, PathSegment, Point, Rect
19
+
20
+ if TYPE_CHECKING:
21
+ from pdf2zh.pdfinterp import (
22
+ PDFGraphicState,
23
+ PDFResourceManager,
24
+ PDFStackT,
25
+ PDFTextState,
26
+ )
27
+
28
+
29
+ PDFTextSeq = Iterable[Union[int, float, bytes]]
30
+
31
+
32
+ class PDFDevice:
33
+ """Translate the output of PDFPageInterpreter to the output that is needed"""
34
+
35
+ def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
36
+ self.rsrcmgr = rsrcmgr
37
+ self.ctm: Optional[Matrix] = None
38
+
39
+ def __repr__(self) -> str:
40
+ return "<PDFDevice>"
41
+
42
+ def __enter__(self) -> "PDFDevice":
43
+ return self
44
+
45
+ def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
46
+ self.close()
47
+
48
+ def close(self) -> None:
49
+ pass
50
+
51
+ def set_ctm(self, ctm: Matrix) -> None:
52
+ self.ctm = ctm
53
+
54
+ def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
55
+ pass
56
+
57
+ def end_tag(self) -> None:
58
+ pass
59
+
60
+ def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
61
+ pass
62
+
63
+ def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
64
+ pass
65
+
66
+ def end_page(self, page: PDFPage) -> None:
67
+ pass
68
+
69
+ def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
70
+ pass
71
+
72
+ def end_figure(self, name: str) -> None:
73
+ pass
74
+
75
+ def paint_path(
76
+ self,
77
+ graphicstate: "PDFGraphicState",
78
+ stroke: bool,
79
+ fill: bool,
80
+ evenodd: bool,
81
+ path: Sequence[PathSegment],
82
+ ) -> None:
83
+ pass
84
+
85
+ def render_image(self, name: str, stream: PDFStream) -> None:
86
+ pass
87
+
88
+ def render_string(
89
+ self,
90
+ textstate: "PDFTextState",
91
+ seq: PDFTextSeq,
92
+ ncs: PDFColorSpace,
93
+ graphicstate: "PDFGraphicState",
94
+ ) -> None:
95
+ pass
96
+
97
+
98
+ class PDFTextDevice(PDFDevice):
99
+ def render_string(
100
+ self,
101
+ textstate: "PDFTextState",
102
+ seq: PDFTextSeq,
103
+ ncs: PDFColorSpace,
104
+ graphicstate: "PDFGraphicState",
105
+ ) -> None:
106
+ assert self.ctm is not None
107
+ matrix = utils.mult_matrix(textstate.matrix, self.ctm)
108
+ font = textstate.font
109
+ fontsize = textstate.fontsize
110
+ scaling = textstate.scaling * 0.01
111
+ charspace = textstate.charspace * scaling
112
+ wordspace = textstate.wordspace * scaling
113
+ rise = textstate.rise
114
+ assert font is not None
115
+ if font.is_multibyte():
116
+ wordspace = 0
117
+ dxscale = 0.001 * fontsize * scaling
118
+ if font.is_vertical():
119
+ textstate.linematrix = self.render_string_vertical(
120
+ seq,
121
+ matrix,
122
+ textstate.linematrix,
123
+ font,
124
+ fontsize,
125
+ scaling,
126
+ charspace,
127
+ wordspace,
128
+ rise,
129
+ dxscale,
130
+ ncs,
131
+ graphicstate,
132
+ )
133
+ else:
134
+ textstate.linematrix = self.render_string_horizontal(
135
+ seq,
136
+ matrix,
137
+ textstate.linematrix,
138
+ font,
139
+ fontsize,
140
+ scaling,
141
+ charspace,
142
+ wordspace,
143
+ rise,
144
+ dxscale,
145
+ ncs,
146
+ graphicstate,
147
+ )
148
+
149
+ def render_string_horizontal(
150
+ self,
151
+ seq: PDFTextSeq,
152
+ matrix: Matrix,
153
+ pos: Point,
154
+ font: PDFFont,
155
+ fontsize: float,
156
+ scaling: float,
157
+ charspace: float,
158
+ wordspace: float,
159
+ rise: float,
160
+ dxscale: float,
161
+ ncs: PDFColorSpace,
162
+ graphicstate: "PDFGraphicState",
163
+ ) -> Point:
164
+ (x, y) = pos
165
+ needcharspace = False
166
+ for obj in seq:
167
+ if isinstance(obj, (int, float)):
168
+ x -= obj * dxscale
169
+ needcharspace = True
170
+ else:
171
+ for cid in font.decode(obj):
172
+ if needcharspace:
173
+ x += charspace
174
+ x += self.render_char(
175
+ utils.translate_matrix(matrix, (x, y)),
176
+ font,
177
+ fontsize,
178
+ scaling,
179
+ rise,
180
+ cid,
181
+ ncs,
182
+ graphicstate,
183
+ )
184
+ if cid == 32 and wordspace:
185
+ x += wordspace
186
+ needcharspace = True
187
+ return (x, y)
188
+
189
+ def render_string_vertical(
190
+ self,
191
+ seq: PDFTextSeq,
192
+ matrix: Matrix,
193
+ pos: Point,
194
+ font: PDFFont,
195
+ fontsize: float,
196
+ scaling: float,
197
+ charspace: float,
198
+ wordspace: float,
199
+ rise: float,
200
+ dxscale: float,
201
+ ncs: PDFColorSpace,
202
+ graphicstate: "PDFGraphicState",
203
+ ) -> Point:
204
+ (x, y) = pos
205
+ needcharspace = False
206
+ for obj in seq:
207
+ if isinstance(obj, (int, float)):
208
+ y -= obj * dxscale
209
+ needcharspace = True
210
+ else:
211
+ for cid in font.decode(obj):
212
+ if needcharspace:
213
+ y += charspace
214
+ y += self.render_char(
215
+ utils.translate_matrix(matrix, (x, y)),
216
+ font,
217
+ fontsize,
218
+ scaling,
219
+ rise,
220
+ cid,
221
+ ncs,
222
+ graphicstate,
223
+ )
224
+ if cid == 32 and wordspace:
225
+ y += wordspace
226
+ needcharspace = True
227
+ return (x, y)
228
+
229
+ def render_char(
230
+ self,
231
+ matrix: Matrix,
232
+ font: PDFFont,
233
+ fontsize: float,
234
+ scaling: float,
235
+ rise: float,
236
+ cid: int,
237
+ ncs: PDFColorSpace,
238
+ graphicstate: "PDFGraphicState",
239
+ ) -> float:
240
+ return 0
241
+
242
+
243
+ class TagExtractor(PDFDevice):
244
+ def __init__(
245
+ self,
246
+ rsrcmgr: "PDFResourceManager",
247
+ outfp: BinaryIO,
248
+ codec: str = "utf-8",
249
+ ) -> None:
250
+ PDFDevice.__init__(self, rsrcmgr)
251
+ self.outfp = outfp
252
+ self.codec = codec
253
+ self.pageno = 0
254
+ self._stack: List[PSLiteral] = []
255
+
256
+ def render_string(
257
+ self,
258
+ textstate: "PDFTextState",
259
+ seq: PDFTextSeq,
260
+ ncs: PDFColorSpace,
261
+ graphicstate: "PDFGraphicState",
262
+ ) -> None:
263
+ font = textstate.font
264
+ assert font is not None
265
+ text = ""
266
+ for obj in seq:
267
+ if isinstance(obj, str):
268
+ obj = utils.make_compat_bytes(obj)
269
+ if not isinstance(obj, bytes):
270
+ continue
271
+ chars = font.decode(obj)
272
+ for cid in chars:
273
+ try:
274
+ char = font.to_unichr(cid)
275
+ text += char
276
+ except PDFUnicodeNotDefined:
277
+ pass
278
+ self._write(utils.enc(text))
279
+
280
+ def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
281
+ output = '<page id="%s" bbox="%s" rotate="%d">' % (
282
+ self.pageno,
283
+ utils.bbox2str(page.mediabox),
284
+ page.rotate,
285
+ )
286
+ self._write(output)
287
+
288
+ def end_page(self, page: PDFPage) -> None:
289
+ self._write("</page>\n")
290
+ self.pageno += 1
291
+
292
+ def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
293
+ s = ""
294
+ if isinstance(props, dict):
295
+ s = "".join(
296
+ [
297
+ f' {utils.enc(k)}="{utils.make_compat_str(v)}"'
298
+ for (k, v) in sorted(props.items())
299
+ ],
300
+ )
301
+ out_s = f"<{utils.enc(cast(str, tag.name))}{s}>"
302
+ self._write(out_s)
303
+ self._stack.append(tag)
304
+
305
+ def end_tag(self) -> None:
306
+ assert self._stack, str(self.pageno)
307
+ tag = self._stack.pop(-1)
308
+ out_s = "</%s>" % utils.enc(cast(str, tag.name))
309
+ self._write(out_s)
310
+
311
+ def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
312
+ self.begin_tag(tag, props)
313
+ self._stack.pop(-1)
314
+
315
+ def _write(self, s: str) -> None:
316
+ self.outfp.write(s.encode(self.codec))
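A custom device only needs to override the callbacks it cares about. The sketch below follows the `TagExtractor` pattern above but merely counts decoded glyphs per page; `CharCounter` is a hypothetical example, and driving it against a real document still requires the interpreter machinery from `pdf2zh.pdfinterp`/`pdf2zh.pdfpage`, which is not shown here.

from pdf2zh.pdfdevice import PDFDevice


class CharCounter(PDFDevice):
    """Hypothetical device: counts decoded glyphs per page."""

    def __init__(self, rsrcmgr):
        PDFDevice.__init__(self, rsrcmgr)
        self.counts = []

    def begin_page(self, page, ctm):
        # one counter per page
        self.counts.append(0)

    def render_string(self, textstate, seq, ncs, graphicstate):
        font = textstate.font
        assert font is not None
        for obj in seq:
            if isinstance(obj, bytes):
                # font.decode() yields CIDs; count them without building a list
                self.counts[-1] += sum(1 for _ in font.decode(obj))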
pdf2zh/pdfdocument.py ADDED
@@ -0,0 +1,1069 @@
1
+ import itertools
2
+ import logging
3
+ import re
4
+ import struct
5
+ from hashlib import md5, sha256, sha384, sha512
6
+ from typing import (
7
+ Any,
8
+ Callable,
9
+ Dict,
10
+ Iterable,
11
+ Iterator,
12
+ KeysView,
13
+ List,
14
+ Optional,
15
+ Sequence,
16
+ Tuple,
17
+ Type,
18
+ Union,
19
+ cast,
20
+ )
21
+
22
+ from cryptography.hazmat.backends import default_backend
23
+ from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
24
+
25
+ from pdf2zh import settings
26
+ from pdf2zh.arcfour import Arcfour
27
+ from pdf2zh.data_structures import NumberTree
28
+ from pdf2zh.pdfexceptions import (
29
+ PDFException,
30
+ PDFKeyError,
31
+ PDFObjectNotFound,
32
+ PDFTypeError,
33
+ )
34
+ from pdf2zh.pdfparser import PDFParser, PDFStreamParser, PDFSyntaxError
35
+ from pdf2zh.pdftypes import (
36
+ DecipherCallable,
37
+ PDFStream,
38
+ decipher_all,
39
+ dict_value,
40
+ int_value,
41
+ list_value,
42
+ str_value,
43
+ stream_value,
44
+ uint_value,
45
+ )
46
+ from pdf2zh.psexceptions import PSEOF
47
+ from pdf2zh.psparser import KWD, LIT, literal_name
48
+ from pdf2zh.utils import (
49
+ choplist,
50
+ decode_text,
51
+ format_int_alpha,
52
+ format_int_roman,
53
+ nunpack,
54
+ )
55
+
56
+ log = logging.getLogger(__name__)
57
+
58
+
59
+ class PDFNoValidXRef(PDFSyntaxError):
60
+ pass
61
+
62
+
63
+ class PDFNoValidXRefWarning(SyntaxWarning):
64
+ """Legacy warning for missing xref.
65
+
66
+ Not used anymore because warnings.warn was replaced by logging.Logger.warning.
67
+ """
68
+
69
+
70
+ class PDFNoOutlines(PDFException):
71
+ pass
72
+
73
+
74
+ class PDFNoPageLabels(PDFException):
75
+ pass
76
+
77
+
78
+ class PDFDestinationNotFound(PDFException):
79
+ pass
80
+
81
+
82
+ class PDFEncryptionError(PDFException):
83
+ pass
84
+
85
+
86
+ class PDFPasswordIncorrect(PDFEncryptionError):
87
+ pass
88
+
89
+
90
+ class PDFEncryptionWarning(UserWarning):
91
+ """Legacy warning for failed decryption.
92
+
93
+ Not used anymore because warnings.warn was replaced by logging.Logger.warning.
94
+ """
95
+
96
+
97
+ class PDFTextExtractionNotAllowedWarning(UserWarning):
98
+ """Legacy warning for PDF that does not allow extraction.
99
+
100
+ Not used anymore because warnings.warn was replaced by logging.Logger.warning.
101
+ """
102
+
103
+
104
+ class PDFTextExtractionNotAllowed(PDFEncryptionError):
105
+ pass
106
+
107
+
108
+ # some predefined literals and keywords.
109
+ LITERAL_OBJSTM = LIT("ObjStm")
110
+ LITERAL_XREF = LIT("XRef")
111
+ LITERAL_CATALOG = LIT("Catalog")
112
+
113
+
114
+ class PDFBaseXRef:
115
+ def get_trailer(self) -> Dict[str, Any]:
116
+ raise NotImplementedError
117
+
118
+ def get_objids(self) -> Iterable[int]:
119
+ return []
120
+
121
+ # Must return
122
+ # (strmid, index, genno)
123
+ # or (None, pos, genno)
124
+ def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
125
+ raise PDFKeyError(objid)
126
+
127
+ def load(self, parser: PDFParser) -> None:
128
+ raise NotImplementedError
129
+
130
+
131
+ class PDFXRef(PDFBaseXRef):
132
+ def __init__(self) -> None:
133
+ self.offsets: Dict[int, Tuple[Optional[int], int, int]] = {}
134
+ self.trailer: Dict[str, Any] = {}
135
+
136
+ def __repr__(self) -> str:
137
+ return "<PDFXRef: offsets=%r>" % (self.offsets.keys())
138
+
139
+ def load(self, parser: PDFParser) -> None:
140
+ while True:
141
+ try:
142
+ (pos, line) = parser.nextline()
143
+ line = line.strip()
144
+ if not line:
145
+ continue
146
+ except PSEOF:
147
+ raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
148
+ if line.startswith(b"trailer"):
149
+ parser.seek(pos)
150
+ break
151
+ f = line.split(b" ")
152
+ if len(f) != 2:
153
+ error_msg = f"Trailer not found: {parser!r}: line={line!r}"
154
+ raise PDFNoValidXRef(error_msg)
155
+ try:
156
+ (start, nobjs) = map(int, f)
157
+ except ValueError:
158
+ error_msg = f"Invalid line: {parser!r}: line={line!r}"
159
+ raise PDFNoValidXRef(error_msg)
160
+ for objid in range(start, start + nobjs):
161
+ try:
162
+ (_, line) = parser.nextline()
163
+ line = line.strip()
164
+ except PSEOF:
165
+ raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
166
+ f = line.split(b" ")
167
+ if len(f) != 3:
168
+ error_msg = f"Invalid XRef format: {parser!r}, line={line!r}"
169
+ raise PDFNoValidXRef(error_msg)
170
+ (pos_b, genno_b, use_b) = f
171
+ if use_b != b"n":
172
+ continue
173
+ self.offsets[objid] = (None, int(pos_b), int(genno_b))
174
+ # log.debug("xref objects: %r", self.offsets)
175
+ self.load_trailer(parser)
176
+
177
+ def load_trailer(self, parser: PDFParser) -> None:
178
+ try:
179
+ (_, kwd) = parser.nexttoken()
180
+ assert kwd is KWD(b"trailer"), str(kwd)
181
+ _, (_, dic) = parser.nextobject()
182
+ except PSEOF:
183
+ x = parser.pop(1)
184
+ if not x:
185
+ raise PDFNoValidXRef("Unexpected EOF - file corrupted")
186
+ (_, dic) = x[0]
187
+ self.trailer.update(dict_value(dic))
188
+ # log.debug("trailer=%r", self.trailer)
189
+
190
+ def get_trailer(self) -> Dict[str, Any]:
191
+ return self.trailer
192
+
193
+ def get_objids(self) -> KeysView[int]:
194
+ return self.offsets.keys()
195
+
196
+ def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
197
+ return self.offsets[objid]
198
+
199
+
200
+ class PDFXRefFallback(PDFXRef):
201
+ def __repr__(self) -> str:
202
+ return "<PDFXRefFallback: offsets=%r>" % (self.offsets.keys())
203
+
204
+ PDFOBJ_CUE = re.compile(r"^(\d+)\s+(\d+)\s+obj\b")
205
+
206
+ def load(self, parser: PDFParser) -> None:
207
+ parser.seek(0)
208
+ while 1:
209
+ try:
210
+ (pos, line_bytes) = parser.nextline()
211
+ except PSEOF:
212
+ break
213
+ if line_bytes.startswith(b"trailer"):
214
+ parser.seek(pos)
215
+ self.load_trailer(parser)
216
+ # log.debug("trailer: %r", self.trailer)
217
+ break
218
+ line = line_bytes.decode("latin-1") # default pdf encoding
219
+ m = self.PDFOBJ_CUE.match(line)
220
+ if not m:
221
+ continue
222
+ (objid_s, genno_s) = m.groups()
223
+ objid = int(objid_s)
224
+ genno = int(genno_s)
225
+ self.offsets[objid] = (None, pos, genno)
226
+ # expand ObjStm.
227
+ parser.seek(pos)
228
+ _, (_, obj) = parser.nextobject()
229
+ if isinstance(obj, PDFStream) and obj.get("Type") is LITERAL_OBJSTM:
230
+ stream = stream_value(obj)
231
+ try:
232
+ n = stream["N"]
233
+ except KeyError:
234
+ if settings.STRICT:
235
+ raise PDFSyntaxError("N is not defined: %r" % stream)
236
+ n = 0
237
+ parser1 = PDFStreamParser(stream.get_data())
238
+ objs: List[int] = []
239
+ try:
240
+ while 1:
241
+ _, (_, obj) = parser1.nextobject()
242
+ objs.append(cast(int, obj))
243
+ except PSEOF:
244
+ pass
245
+ n = min(n, len(objs) // 2)
246
+ for index in range(n):
247
+ objid1 = objs[index * 2]
248
+ self.offsets[objid1] = (objid, index, 0)
249
+
250
+
251
+ class PDFXRefStream(PDFBaseXRef):
252
+ def __init__(self) -> None:
253
+ self.data: Optional[bytes] = None
254
+ self.entlen: Optional[int] = None
255
+ self.fl1: Optional[int] = None
256
+ self.fl2: Optional[int] = None
257
+ self.fl3: Optional[int] = None
258
+ self.ranges: List[Tuple[int, int]] = []
259
+
260
+ def __repr__(self) -> str:
261
+ return "<PDFXRefStream: ranges=%r>" % (self.ranges)
262
+
263
+ def load(self, parser: PDFParser) -> None:
264
+ (_, objid) = parser.nexttoken() # ignored
265
+ (_, genno) = parser.nexttoken() # ignored
266
+ (_, kwd) = parser.nexttoken()
267
+ _, (_, stream) = parser.nextobject()
268
+ if not isinstance(stream, PDFStream) or stream.get("Type") is not LITERAL_XREF:
269
+ raise PDFNoValidXRef("Invalid PDF stream spec.")
270
+ size = stream["Size"]
271
+ index_array = stream.get("Index", (0, size))
272
+ if len(index_array) % 2 != 0:
273
+ raise PDFSyntaxError("Invalid index number")
274
+ self.ranges.extend(cast(Iterator[Tuple[int, int]], choplist(2, index_array)))
275
+ (self.fl1, self.fl2, self.fl3) = stream["W"]
276
+ assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
277
+ self.data = stream.get_data()
278
+ self.entlen = self.fl1 + self.fl2 + self.fl3
279
+ self.trailer = stream.attrs
280
+ # log.debug(
281
+ # "xref stream: objid=%s, fields=%d,%d,%d",
282
+ # ", ".join(map(repr, self.ranges)),
283
+ # self.fl1,
284
+ # self.fl2,
285
+ # self.fl3,
286
+ # )
287
+
288
+ def get_trailer(self) -> Dict[str, Any]:
289
+ return self.trailer
290
+
291
+ def get_objids(self) -> Iterator[int]:
292
+ for start, nobjs in self.ranges:
293
+ for i in range(nobjs):
294
+ assert self.entlen is not None
295
+ assert self.data is not None
296
+ offset = self.entlen * i
297
+ ent = self.data[offset : offset + self.entlen]
298
+ f1 = nunpack(ent[: self.fl1], 1)
299
+ if f1 == 1 or f1 == 2:
300
+ yield start + i
301
+
302
+ def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
303
+ index = 0
304
+ for start, nobjs in self.ranges:
305
+ if start <= objid and objid < start + nobjs:
306
+ index += objid - start
307
+ break
308
+ else:
309
+ index += nobjs
310
+ else:
311
+ raise PDFKeyError(objid)
312
+ assert self.entlen is not None
313
+ assert self.data is not None
314
+ assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
315
+ offset = self.entlen * index
316
+ ent = self.data[offset : offset + self.entlen]
317
+ f1 = nunpack(ent[: self.fl1], 1)
318
+ f2 = nunpack(ent[self.fl1 : self.fl1 + self.fl2])
319
+ f3 = nunpack(ent[self.fl1 + self.fl2 :])
320
+ if f1 == 1:
321
+ return (None, f2, f3)
322
+ elif f1 == 2:
323
+ return (f2, f3, 0)
324
+ else:
325
+ # this is a free object
326
+ raise PDFKeyError(objid)
327
+
328
+
329
+ class PDFStandardSecurityHandler:
330
+ PASSWORD_PADDING = (
331
+ b"(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08"
332
+ b"..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz"
333
+ )
334
+ supported_revisions: Tuple[int, ...] = (2, 3)
335
+
336
+ def __init__(
337
+ self,
338
+ docid: Sequence[bytes],
339
+ param: Dict[str, Any],
340
+ password: str = "",
341
+ ) -> None:
342
+ self.docid = docid
343
+ self.param = param
344
+ self.password = password
345
+ self.init()
346
+
347
+ def init(self) -> None:
348
+ self.init_params()
349
+ if self.r not in self.supported_revisions:
350
+ error_msg = "Unsupported revision: param=%r" % self.param
351
+ raise PDFEncryptionError(error_msg)
352
+ self.init_key()
353
+
354
+ def init_params(self) -> None:
355
+ self.v = int_value(self.param.get("V", 0))
356
+ self.r = int_value(self.param["R"])
357
+ self.p = uint_value(self.param["P"], 32)
358
+ self.o = str_value(self.param["O"])
359
+ self.u = str_value(self.param["U"])
360
+ self.length = int_value(self.param.get("Length", 40))
361
+
362
+ def init_key(self) -> None:
363
+ self.key = self.authenticate(self.password)
364
+ if self.key is None:
365
+ raise PDFPasswordIncorrect
366
+
367
+ def is_printable(self) -> bool:
368
+ return bool(self.p & 4)
369
+
370
+ def is_modifiable(self) -> bool:
371
+ return bool(self.p & 8)
372
+
373
+ def is_extractable(self) -> bool:
374
+ return bool(self.p & 16)
375
+
376
+ def compute_u(self, key: bytes) -> bytes:
377
+ if self.r == 2:
378
+ # Algorithm 3.4
379
+ return Arcfour(key).encrypt(self.PASSWORD_PADDING) # 2
380
+ else:
381
+ # Algorithm 3.5
382
+ hash = md5(self.PASSWORD_PADDING) # 2
383
+ hash.update(self.docid[0]) # 3
384
+ result = Arcfour(key).encrypt(hash.digest()) # 4
385
+ for i in range(1, 20): # 5
386
+ k = b"".join(bytes((c ^ i,)) for c in iter(key))
387
+ result = Arcfour(k).encrypt(result)
388
+ result += result # 6
389
+ return result
390
+
391
+ def compute_encryption_key(self, password: bytes) -> bytes:
392
+ # Algorithm 3.2
393
+ password = (password + self.PASSWORD_PADDING)[:32] # 1
394
+ hash = md5(password) # 2
395
+ hash.update(self.o) # 3
396
+ # See https://github.com/pdf2zh/pdf2zh.six/issues/186
397
+ hash.update(struct.pack("<L", self.p)) # 4
398
+ hash.update(self.docid[0]) # 5
399
+ if self.r >= 4:
400
+ if not cast(PDFStandardSecurityHandlerV4, self).encrypt_metadata:
401
+ hash.update(b"\xff\xff\xff\xff")
402
+ result = hash.digest()
403
+ n = 5
404
+ if self.r >= 3:
405
+ n = self.length // 8
406
+ for _ in range(50):
407
+ result = md5(result[:n]).digest()
408
+ return result[:n]
409
+
410
+ def authenticate(self, password: str) -> Optional[bytes]:
411
+ password_bytes = password.encode("latin1")
412
+ key = self.authenticate_user_password(password_bytes)
413
+ if key is None:
414
+ key = self.authenticate_owner_password(password_bytes)
415
+ return key
416
+
417
+ def authenticate_user_password(self, password: bytes) -> Optional[bytes]:
418
+ key = self.compute_encryption_key(password)
419
+ if self.verify_encryption_key(key):
420
+ return key
421
+ else:
422
+ return None
423
+
424
+ def verify_encryption_key(self, key: bytes) -> bool:
425
+ # Algorithm 3.6
426
+ u = self.compute_u(key)
427
+ if self.r == 2:
428
+ return u == self.u
429
+ return u[:16] == self.u[:16]
430
+
431
+ def authenticate_owner_password(self, password: bytes) -> Optional[bytes]:
432
+ # Algorithm 3.7
433
+ password = (password + self.PASSWORD_PADDING)[:32]
434
+ hash = md5(password)
435
+ if self.r >= 3:
436
+ for _ in range(50):
437
+ hash = md5(hash.digest())
438
+ n = 5
439
+ if self.r >= 3:
440
+ n = self.length // 8
441
+ key = hash.digest()[:n]
442
+ if self.r == 2:
443
+ user_password = Arcfour(key).decrypt(self.o)
444
+ else:
445
+ user_password = self.o
446
+ for i in range(19, -1, -1):
447
+ k = b"".join(bytes((c ^ i,)) for c in iter(key))
448
+ user_password = Arcfour(k).decrypt(user_password)
449
+ return self.authenticate_user_password(user_password)
450
+
451
+ def decrypt(
452
+ self,
453
+ objid: int,
454
+ genno: int,
455
+ data: bytes,
456
+ attrs: Optional[Dict[str, Any]] = None,
457
+ ) -> bytes:
458
+ return self.decrypt_rc4(objid, genno, data)
459
+
460
+ def decrypt_rc4(self, objid: int, genno: int, data: bytes) -> bytes:
461
+ assert self.key is not None
462
+ key = self.key + struct.pack("<L", objid)[:3] + struct.pack("<L", genno)[:2]
463
+ hash = md5(key)
464
+ key = hash.digest()[: min(len(key), 16)]
465
+ return Arcfour(key).decrypt(data)
466
+
467
+
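The per-object key used by decrypt_rc4 above is simply the document key salted with the low-order bytes of the object and generation numbers and then hashed (PDF Algorithm 3.1). A minimal stand-alone sketch of that derivation; the helper name is illustrative and not part of this module:

    import struct
    from hashlib import md5

    def object_key(doc_key: bytes, objid: int, genno: int) -> bytes:
        # Append 3 low-order bytes of objid and 2 of genno, MD5 the result,
        # and keep at most 16 bytes (never more than len(doc_key) + 5).
        salted = doc_key + struct.pack("<L", objid)[:3] + struct.pack("<L", genno)[:2]
        return md5(salted).digest()[: min(len(salted), 16)]

    # A 5-byte (40-bit) document key yields a 10-byte per-object key.
    assert len(object_key(b"\x01" * 5, objid=7, genno=0)) == 10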
468
+ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
469
+ supported_revisions: Tuple[int, ...] = (4,)
470
+
471
+ def init_params(self) -> None:
472
+ super().init_params()
473
+ self.length = 128
474
+ self.cf = dict_value(self.param.get("CF"))
475
+ self.stmf = literal_name(self.param["StmF"])
476
+ self.strf = literal_name(self.param["StrF"])
477
+ self.encrypt_metadata = bool(self.param.get("EncryptMetadata", True))
478
+ if self.stmf != self.strf:
479
+ error_msg = "Unsupported crypt filter: param=%r" % self.param
480
+ raise PDFEncryptionError(error_msg)
481
+ self.cfm = {}
482
+ for k, v in self.cf.items():
483
+ f = self.get_cfm(literal_name(v["CFM"]))
484
+ if f is None:
485
+ error_msg = "Unknown crypt filter method: param=%r" % self.param
486
+ raise PDFEncryptionError(error_msg)
487
+ self.cfm[k] = f
488
+ self.cfm["Identity"] = self.decrypt_identity
489
+ if self.strf not in self.cfm:
490
+ error_msg = "Undefined crypt filter: param=%r" % self.param
491
+ raise PDFEncryptionError(error_msg)
492
+
493
+ def get_cfm(self, name: str) -> Optional[Callable[[int, int, bytes], bytes]]:
494
+ if name == "V2":
495
+ return self.decrypt_rc4
496
+ elif name == "AESV2":
497
+ return self.decrypt_aes128
498
+ else:
499
+ return None
500
+
501
+ def decrypt(
502
+ self,
503
+ objid: int,
504
+ genno: int,
505
+ data: bytes,
506
+ attrs: Optional[Dict[str, Any]] = None,
507
+ name: Optional[str] = None,
508
+ ) -> bytes:
509
+ if not self.encrypt_metadata and attrs is not None:
510
+ t = attrs.get("Type")
511
+ if t is not None and literal_name(t) == "Metadata":
512
+ return data
513
+ if name is None:
514
+ name = self.strf
515
+ return self.cfm[name](objid, genno, data)
516
+
517
+ def decrypt_identity(self, objid: int, genno: int, data: bytes) -> bytes:
518
+ return data
519
+
520
+ def decrypt_aes128(self, objid: int, genno: int, data: bytes) -> bytes:
521
+ assert self.key is not None
522
+ key = (
523
+ self.key
524
+ + struct.pack("<L", objid)[:3]
525
+ + struct.pack("<L", genno)[:2]
526
+ + b"sAlT"
527
+ )
528
+ hash = md5(key)
529
+ key = hash.digest()[: min(len(key), 16)]
530
+ initialization_vector = data[:16]
531
+ ciphertext = data[16:]
532
+ cipher = Cipher(
533
+ algorithms.AES(key),
534
+ modes.CBC(initialization_vector),
535
+ backend=default_backend(),
536
+ ) # type: ignore
537
+ return cipher.decryptor().update(ciphertext) # type: ignore
538
+
539
+
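decrypt_aes128 above derives the per-object key the same way (with the extra b"sAlT" suffix) and then treats the first 16 bytes of the encrypted string as the CBC initialization vector and the remainder as ciphertext. A self-contained round-trip sketch of that IV-prefix convention with the cryptography package, using random data purely for illustration:

    import os
    from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes

    key, iv = os.urandom(16), os.urandom(16)
    plaintext = b"sixteen byte msg"                     # CBC works on 16-byte blocks

    enc = Cipher(algorithms.AES(key), modes.CBC(iv)).encryptor()
    blob = iv + enc.update(plaintext) + enc.finalize()  # store the IV as a prefix

    dec = Cipher(algorithms.AES(key), modes.CBC(blob[:16])).decryptor()
    assert dec.update(blob[16:]) + dec.finalize() == plaintext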
540
+ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
541
+ supported_revisions = (5, 6)
542
+
543
+ def init_params(self) -> None:
544
+ super().init_params()
545
+ self.length = 256
546
+ self.oe = str_value(self.param["OE"])
547
+ self.ue = str_value(self.param["UE"])
548
+ self.o_hash = self.o[:32]
549
+ self.o_validation_salt = self.o[32:40]
550
+ self.o_key_salt = self.o[40:]
551
+ self.u_hash = self.u[:32]
552
+ self.u_validation_salt = self.u[32:40]
553
+ self.u_key_salt = self.u[40:]
554
+
555
+ def get_cfm(self, name: str) -> Optional[Callable[[int, int, bytes], bytes]]:
556
+ if name == "AESV3":
557
+ return self.decrypt_aes256
558
+ else:
559
+ return None
560
+
561
+ def authenticate(self, password: str) -> Optional[bytes]:
562
+ password_b = self._normalize_password(password)
563
+ hash = self._password_hash(password_b, self.o_validation_salt, self.u)
564
+ if hash == self.o_hash:
565
+ hash = self._password_hash(password_b, self.o_key_salt, self.u)
566
+ cipher = Cipher(
567
+ algorithms.AES(hash),
568
+ modes.CBC(b"\0" * 16),
569
+ backend=default_backend(),
570
+ ) # type: ignore
571
+ return cipher.decryptor().update(self.oe) # type: ignore
572
+ hash = self._password_hash(password_b, self.u_validation_salt)
573
+ if hash == self.u_hash:
574
+ hash = self._password_hash(password_b, self.u_key_salt)
575
+ cipher = Cipher(
576
+ algorithms.AES(hash),
577
+ modes.CBC(b"\0" * 16),
578
+ backend=default_backend(),
579
+ ) # type: ignore
580
+ return cipher.decryptor().update(self.ue) # type: ignore
581
+ return None
582
+
583
+ def _normalize_password(self, password: str) -> bytes:
584
+ if self.r == 6:
585
+ # saslprep expects non-empty strings, apparently
586
+ if not password:
587
+ return b""
588
+ from pdf2zh._saslprep import saslprep
589
+
590
+ password = saslprep(password)
591
+ return password.encode("utf-8")[:127]
592
+
593
+ def _password_hash(
594
+ self,
595
+ password: bytes,
596
+ salt: bytes,
597
+ vector: Optional[bytes] = None,
598
+ ) -> bytes:
599
+ """Compute password hash depending on revision number"""
600
+ if self.r == 5:
601
+ return self._r5_password(password, salt, vector)
602
+ return self._r6_password(password, salt[0:8], vector)
603
+
604
+ def _r5_password(
605
+ self,
606
+ password: bytes,
607
+ salt: bytes,
608
+ vector: Optional[bytes] = None,
609
+ ) -> bytes:
610
+ """Compute the password for revision 5"""
611
+ hash = sha256(password)
612
+ hash.update(salt)
613
+ if vector is not None:
614
+ hash.update(vector)
615
+ return hash.digest()
616
+
617
+ def _r6_password(
618
+ self,
619
+ password: bytes,
620
+ salt: bytes,
621
+ vector: Optional[bytes] = None,
622
+ ) -> bytes:
623
+ """Compute the password for revision 6"""
624
+ initial_hash = sha256(password)
625
+ initial_hash.update(salt)
626
+ if vector is not None:
627
+ initial_hash.update(vector)
628
+ k = initial_hash.digest()
629
+ hashes = (sha256, sha384, sha512)
630
+ round_no = last_byte_val = 0
631
+ while round_no < 64 or last_byte_val > round_no - 32:
632
+ k1 = (password + k + (vector or b"")) * 64
633
+ e = self._aes_cbc_encrypt(key=k[:16], iv=k[16:32], data=k1)
634
+ # compute the first 16 bytes of e,
635
+ # interpreted as an unsigned integer mod 3
636
+ next_hash = hashes[self._bytes_mod_3(e[:16])]
637
+ k = next_hash(e).digest()
638
+ last_byte_val = e[len(e) - 1]
639
+ round_no += 1
640
+ return k[:32]
641
+
642
+ @staticmethod
643
+ def _bytes_mod_3(input_bytes: bytes) -> int:
644
+ # 256 is 1 mod 3, so we can just sum 'em
645
+ return sum(b % 3 for b in input_bytes) % 3
646
+
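The shortcut above works because 256 is congruent to 1 mod 3, so each byte of a big-endian integer contributes its own value modulo 3. A quick self-contained check of that property (not part of the module):

    import os

    def bytes_mod_3(data: bytes) -> int:
        return sum(b % 3 for b in data) % 3

    blob = os.urandom(32)
    assert bytes_mod_3(blob) == int.from_bytes(blob, "big") % 3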
647
+ def _aes_cbc_encrypt(self, key: bytes, iv: bytes, data: bytes) -> bytes:
648
+ cipher = Cipher(algorithms.AES(key), modes.CBC(iv))
649
+ encryptor = cipher.encryptor() # type: ignore
650
+ return encryptor.update(data) + encryptor.finalize() # type: ignore
651
+
652
+ def decrypt_aes256(self, objid: int, genno: int, data: bytes) -> bytes:
653
+ initialization_vector = data[:16]
654
+ ciphertext = data[16:]
655
+ assert self.key is not None
656
+ cipher = Cipher(
657
+ algorithms.AES(self.key),
658
+ modes.CBC(initialization_vector),
659
+ backend=default_backend(),
660
+ ) # type: ignore
661
+ return cipher.decryptor().update(ciphertext) # type: ignore
662
+
663
+
664
+ class PDFDocument:
665
+ """PDFDocument object represents a PDF document.
666
+
667
+ Since a PDF file can be very big, it is normally not loaded all at
668
+ once. A PDF document therefore has to cooperate with a PDF parser in
669
+ order to import the data dynamically as processing goes.
670
+
671
+ Typical usage:
672
+ doc = PDFDocument(parser, password)
673
+ obj = doc.getobj(objid)
674
+
675
+ """
676
+
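A slightly fuller version of the typical usage above. The sketch assumes this module is importable as pdf2zh.pdfdocument, that PDFParser lives in pdf2zh.pdfparser as in pdfminer.six, and that a local example.pdf exists; none of these details appear in this diff:

    from pdf2zh.pdfdocument import PDFDocument   # assumed module path
    from pdf2zh.pdfparser import PDFParser       # assumed module path

    with open("example.pdf", "rb") as fp:        # hypothetical input file
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password="")   # raises PDFPasswordIncorrect if wrong
        catalog = doc.catalog                    # the /Root dictionary
        obj = doc.getobj(1)                      # may raise PDFObjectNotFound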
677
+ security_handler_registry: Dict[int, Type[PDFStandardSecurityHandler]] = {
678
+ 1: PDFStandardSecurityHandler,
679
+ 2: PDFStandardSecurityHandler,
680
+ 4: PDFStandardSecurityHandlerV4,
681
+ 5: PDFStandardSecurityHandlerV5,
682
+ }
683
+
684
+ def __init__(
685
+ self,
686
+ parser: PDFParser,
687
+ password: str = "",
688
+ caching: bool = True,
689
+ fallback: bool = True,
690
+ ) -> None:
691
+ """Set the document to use a given PDFParser object."""
692
+ self.caching = caching
693
+ self.xrefs: List[PDFBaseXRef] = []
694
+ self.info = []
695
+ self.catalog: Dict[str, Any] = {}
696
+ self.encryption: Optional[Tuple[Any, Any]] = None
697
+ self.decipher: Optional[DecipherCallable] = None
698
+ self._parser = None
699
+ self._cached_objs: Dict[int, Tuple[object, int]] = {}
700
+ self._parsed_objs: Dict[int, Tuple[List[object], int]] = {}
701
+ self._parser = parser
702
+ self._parser.set_document(self)
703
+ self.is_printable = self.is_modifiable = self.is_extractable = True
704
+ # Retrieve the information of each header that was appended
705
+ # (maybe multiple times) at the end of the document.
706
+ try:
707
+ # print('FIND XREF')
708
+ pos = self.find_xref(parser)
709
+ self.pos = pos
710
+ self.read_xref_from(parser, pos, self.xrefs)
711
+ except PDFNoValidXRef:
712
+ if fallback:
713
+ parser.fallback = True
714
+ newxref = PDFXRefFallback()
715
+ newxref.load(parser)
716
+ self.xrefs.append(newxref)
717
+ # print(f'XREF {self.xrefs}')
718
+ for xref in self.xrefs:
719
+ trailer = xref.get_trailer()
720
+ if not trailer:
721
+ continue
722
+ # If there's an encryption info, remember it.
723
+ if "Encrypt" in trailer:
724
+ if "ID" in trailer:
725
+ id_value = list_value(trailer["ID"])
726
+ else:
727
+ # Some documents may not have a /ID, use two empty
728
+ # byte strings instead. Solves
729
+ # https://github.com/pdf2zh/pdf2zh.six/issues/594
730
+ id_value = (b"", b"")
731
+ self.encryption = (id_value, dict_value(trailer["Encrypt"]))
732
+ self._initialize_password(password)
733
+ if "Info" in trailer:
734
+ self.info.append(dict_value(trailer["Info"]))
735
+ if "Root" in trailer:
736
+ # Every PDF file must have exactly one /Root dictionary.
737
+ self.catalog = dict_value(trailer["Root"])
738
+ break
739
+ else:
740
+ raise PDFSyntaxError("No /Root object! - Is this really a PDF?")
741
+ if self.catalog.get("Type") is not LITERAL_CATALOG:
742
+ if settings.STRICT:
743
+ raise PDFSyntaxError("Catalog not found!")
744
+
745
+ KEYWORD_OBJ = KWD(b"obj")
746
+
747
+ # _initialize_password(password=b'')
748
+ # Perform the initialization with a given password.
749
+ def _initialize_password(self, password: str = "") -> None:
750
+ assert self.encryption is not None
751
+ (docid, param) = self.encryption
752
+ if literal_name(param.get("Filter")) != "Standard":
753
+ raise PDFEncryptionError("Unknown filter: param=%r" % param)
754
+ v = int_value(param.get("V", 0))
755
+ factory = self.security_handler_registry.get(v)
756
+ if factory is None:
757
+ raise PDFEncryptionError("Unknown algorithm: param=%r" % param)
758
+ handler = factory(docid, param, password)
759
+ self.decipher = handler.decrypt
760
+ self.is_printable = handler.is_printable()
761
+ self.is_modifiable = handler.is_modifiable()
762
+ self.is_extractable = handler.is_extractable()
763
+ assert self._parser is not None
764
+ self._parser.fallback = False # need to read streams with exact length
765
+
766
+ def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> object:
767
+ if stream.objid in self._parsed_objs:
768
+ (objs, n) = self._parsed_objs[stream.objid]
769
+ else:
770
+ (objs, n) = self._get_objects(stream)
771
+ if self.caching:
772
+ assert stream.objid is not None
773
+ self._parsed_objs[stream.objid] = (objs, n)
774
+ i = n * 2 + index
775
+ try:
776
+ obj = objs[i]
777
+ except IndexError:
778
+ raise PDFSyntaxError("index too big: %r" % index)
779
+ return obj
780
+
781
+ def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]:
782
+ if stream.get("Type") is not LITERAL_OBJSTM:
783
+ if settings.STRICT:
784
+ raise PDFSyntaxError("Not a stream object: %r" % stream)
785
+ try:
786
+ n = cast(int, stream["N"])
787
+ except KeyError:
788
+ if settings.STRICT:
789
+ raise PDFSyntaxError("N is not defined: %r" % stream)
790
+ n = 0
791
+ parser = PDFStreamParser(stream.get_data())
792
+ parser.set_document(self)
793
+ objs: List[object] = []
794
+ try:
795
+ while 1:
796
+ _, (_, obj) = parser.nextobject()
797
+ objs.append(obj)
798
+ except PSEOF:
799
+ pass
800
+ return (objs, n)
801
+
802
+ def _getobj_parse(self, pos: int, objid: int) -> object:
803
+ assert self._parser is not None
804
+ self._parser.seek(pos)
805
+ (_, objid1) = self._parser.nexttoken() # objid
806
+ (_, genno) = self._parser.nexttoken() # genno
807
+ (_, kwd) = self._parser.nexttoken()
808
+ # hack around malformed pdf files
809
+ # copied from https://github.com/jaepil/pdf2zh3k/blob/master/
810
+ # pdf2zh/pdfparser.py#L399
811
+ # to solve https://github.com/pdf2zh/pdf2zh.six/issues/56
812
+ # assert objid1 == objid, str((objid1, objid))
813
+ if objid1 != objid:
814
+ x = []
815
+ while kwd is not self.KEYWORD_OBJ:
816
+ (_, kwd) = self._parser.nexttoken()
817
+ x.append(kwd)
818
+ if len(x) >= 2:
819
+ objid1 = x[-2]
820
+ # #### end hack around malformed pdf files
821
+ if objid1 != objid:
822
+ raise PDFSyntaxError(f"objid mismatch: {objid1!r}={objid!r}")
823
+
824
+ if kwd != KWD(b"obj"):
825
+ raise PDFSyntaxError("Invalid object spec: offset=%r" % pos)
826
+ end, (_, obj) = self._parser.nextobject()
827
+ return end, obj
828
+
829
+ # can raise PDFObjectNotFound
830
+ def getobj(self, objid: int) -> object:
831
+ """Get object from PDF
832
+
833
+ :raises PDFException if PDFDocument is not initialized
834
+ :raises PDFObjectNotFound if objid does not exist in PDF
835
+ """
836
+ if not self.xrefs:
837
+ raise PDFException("PDFDocument is not initialized")
838
+ # log.debug("getobj: objid=%r", objid)
839
+ if objid in self._cached_objs:
840
+ (obj, genno) = self._cached_objs[objid]
841
+ else:
842
+ for xref in self.xrefs:
843
+ try:
844
+ (strmid, index, genno) = xref.get_pos(objid)
845
+ except KeyError:
846
+ continue
847
+ try:
848
+ if strmid is not None:
849
+ stream = stream_value(self.getobj(strmid))
850
+ obj = self._getobj_objstm(stream, index, objid)
851
+ else:
852
+ end, obj = self._getobj_parse(index, objid)
853
+ if self.decipher:
854
+ obj = decipher_all(self.decipher, objid, genno, obj)
855
+
856
+ if isinstance(obj, PDFStream):
857
+ obj.set_objid(objid, genno)
858
+ break
859
+ except (PSEOF, PDFSyntaxError):
860
+ continue
861
+ else:
862
+ raise PDFObjectNotFound(objid)
863
+ # log.debug("register: objid=%r: %r", objid, obj)
864
+ if self.caching:
865
+ self._cached_objs[objid] = (obj, genno)
866
+ return obj
867
+
868
+ OutlineType = Tuple[Any, Any, Any, Any, Any]
869
+
870
+ def get_outlines(self) -> Iterator[OutlineType]:
871
+ if "Outlines" not in self.catalog:
872
+ raise PDFNoOutlines
873
+
874
+ def search(entry: object, level: int) -> Iterator[PDFDocument.OutlineType]:
875
+ entry = dict_value(entry)
876
+ if "Title" in entry:
877
+ if "A" in entry or "Dest" in entry:
878
+ title = decode_text(str_value(entry["Title"]))
879
+ dest = entry.get("Dest")
880
+ action = entry.get("A")
881
+ se = entry.get("SE")
882
+ yield (level, title, dest, action, se)
883
+ if "First" in entry and "Last" in entry:
884
+ yield from search(entry["First"], level + 1)
885
+ if "Next" in entry:
886
+ yield from search(entry["Next"], level)
887
+
888
+ return search(self.catalog["Outlines"], 0)
889
+
890
+ def get_page_labels(self) -> Iterator[str]:
891
+ """Generate page label strings for the PDF document.
892
+
893
+ If the document includes page labels, generates strings, one per page.
894
+ If not, raises PDFNoPageLabels.
895
+
896
+ The resulting iteration is unbounded.
897
+ """
898
+ assert self.catalog is not None
899
+
900
+ try:
901
+ page_labels = PageLabels(self.catalog["PageLabels"])
902
+ except (PDFTypeError, KeyError):
903
+ raise PDFNoPageLabels
904
+
905
+ return page_labels.labels
906
+
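Because the label iterator never terminates on its own, callers are expected to slice it to the number of pages they actually have. A short sketch reusing the doc object from the earlier usage example (the import path is an assumption):

    from itertools import islice
    from pdf2zh.pdfdocument import PDFNoPageLabels   # assumed module path

    try:
        labels = list(islice(doc.get_page_labels(), 10))  # labels of the first 10 pages
    except PDFNoPageLabels:
        labels = []                                       # document has no /PageLabels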
907
+ def lookup_name(self, cat: str, key: Union[str, bytes]) -> Any:
908
+ try:
909
+ names = dict_value(self.catalog["Names"])
910
+ except (PDFTypeError, KeyError):
911
+ raise PDFKeyError((cat, key))
912
+ # may raise KeyError
913
+ d0 = dict_value(names[cat])
914
+
915
+ def lookup(d: Dict[str, Any]) -> Any:
916
+ if "Limits" in d:
917
+ (k1, k2) = list_value(d["Limits"])
918
+ if key < k1 or k2 < key:
919
+ return None
920
+ if "Names" in d:
921
+ objs = list_value(d["Names"])
922
+ names = dict(
923
+ cast(Iterator[Tuple[Union[str, bytes], Any]], choplist(2, objs)),
924
+ )
925
+ return names[key]
926
+ if "Kids" in d:
927
+ for c in list_value(d["Kids"]):
928
+ v = lookup(dict_value(c))
929
+ if v:
930
+ return v
931
+ raise PDFKeyError((cat, key))
932
+
933
+ return lookup(d0)
934
+
935
+ def get_dest(self, name: Union[str, bytes]) -> Any:
936
+ try:
937
+ # PDF-1.2 or later
938
+ obj = self.lookup_name("Dests", name)
939
+ except KeyError:
940
+ # PDF-1.1 or prior
941
+ if "Dests" not in self.catalog:
942
+ raise PDFDestinationNotFound(name)
943
+ d0 = dict_value(self.catalog["Dests"])
944
+ if name not in d0:
945
+ raise PDFDestinationNotFound(name)
946
+ obj = d0[name]
947
+ return obj
948
+
949
+ # find_xref
950
+ def find_xref(self, parser: PDFParser) -> int:
951
+ """Internal function used to locate the first XRef."""
952
+ # search the last xref table by scanning the file backwards.
953
+ prev = b""
954
+ for line in parser.revreadlines():
955
+ line = line.strip()
956
+ # log.debug("find_xref: %r", line)
957
+
958
+ if line == b"startxref":
959
+ # log.debug("xref found: pos=%r", prev)
960
+
961
+ if not prev.isdigit():
962
+ raise PDFNoValidXRef(f"Invalid xref position: {prev!r}")
963
+
964
+ start = int(prev)
965
+
966
+ if not start >= 0:
967
+ raise PDFNoValidXRef(f"Invalid negative xref position: {start}")
968
+
969
+ return start
970
+
971
+ if line:
972
+ prev = line
973
+
974
+ raise PDFNoValidXRef("Unexpected EOF")
975
+
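find_xref only needs the last two meaningful lines of the file: the startxref keyword followed by the byte offset of the final xref section. A self-contained illustration of that tail format (the offset 116 is invented):

    tail = b"trailer\n<< /Root 1 0 R /Size 8 >>\nstartxref\n116\n%%EOF\n"
    offset = int(tail.rsplit(b"startxref", 1)[1].split()[0])
    assert offset == 116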
976
+ # read xref table
977
+ def read_xref_from(
978
+ self,
979
+ parser: PDFParser,
980
+ start: int,
981
+ xrefs: List[PDFBaseXRef],
982
+ ) -> None:
983
+ """Reads XRefs from the given location."""
984
+ parser.seek(start)
985
+ parser.reset()
986
+ try:
987
+ (pos, token) = parser.nexttoken()
988
+ except PSEOF:
989
+ raise PDFNoValidXRef("Unexpected EOF")
990
+ # log.debug("read_xref_from: start=%d, token=%r", start, token)
991
+ if isinstance(token, int):
992
+ # XRefStream: PDF-1.5
993
+ parser.seek(pos)
994
+ parser.reset()
995
+ xref: PDFBaseXRef = PDFXRefStream()
996
+ xref.load(parser)
997
+ else:
998
+ if token is parser.KEYWORD_XREF:
999
+ parser.nextline()
1000
+ xref = PDFXRef()
1001
+ xref.load(parser)
1002
+ xrefs.append(xref)
1003
+ trailer = xref.get_trailer()
1004
+ # log.debug("trailer: %r", trailer)
1005
+ if "XRefStm" in trailer:
1006
+ pos = int_value(trailer["XRefStm"])
1007
+ self.read_xref_from(parser, pos, xrefs)
1008
+ if "Prev" in trailer:
1009
+ # find previous xref
1010
+ pos = int_value(trailer["Prev"])
1011
+ self.read_xref_from(parser, pos, xrefs)
1012
+
1013
+
1014
+ class PageLabels(NumberTree):
1015
+ """PageLabels from the document catalog.
1016
+
1017
+ See Section 8.3.1 in the PDF Reference.
1018
+ """
1019
+
1020
+ @property
1021
+ def labels(self) -> Iterator[str]:
1022
+ ranges = self.values
1023
+
1024
+ # The tree must begin with page index 0
1025
+ if len(ranges) == 0 or ranges[0][0] != 0:
1026
+ if settings.STRICT:
1027
+ raise PDFSyntaxError("PageLabels is missing page index 0")
1028
+ else:
1029
+ # Try to cope, by assuming empty labels for the initial pages
1030
+ ranges.insert(0, (0, {}))
1031
+
1032
+ for next, (start, label_dict_unchecked) in enumerate(ranges, 1):
1033
+ label_dict = dict_value(label_dict_unchecked)
1034
+ style = label_dict.get("S")
1035
+ prefix = decode_text(str_value(label_dict.get("P", b"")))
1036
+ first_value = int_value(label_dict.get("St", 1))
1037
+
1038
+ if next == len(ranges):
1039
+ # This is the last specified range. It continues until the end
1040
+ # of the document.
1041
+ values: Iterable[int] = itertools.count(first_value)
1042
+ else:
1043
+ end, _ = ranges[next]
1044
+ range_length = end - start
1045
+ values = range(first_value, first_value + range_length)
1046
+
1047
+ for value in values:
1048
+ label = self._format_page_label(value, style)
1049
+ yield prefix + label
1050
+
1051
+ @staticmethod
1052
+ def _format_page_label(value: int, style: Any) -> str:
1053
+ """Format page label value in a specific style"""
1054
+ if style is None:
1055
+ label = ""
1056
+ elif style is LIT("D"): # Decimal arabic numerals
1057
+ label = str(value)
1058
+ elif style is LIT("R"): # Uppercase roman numerals
1059
+ label = format_int_roman(value).upper()
1060
+ elif style is LIT("r"): # Lowercase roman numerals
1061
+ label = format_int_roman(value)
1062
+ elif style is LIT("A"): # Uppercase letters A-Z, AA-ZZ...
1063
+ label = format_int_alpha(value).upper()
1064
+ elif style is LIT("a"): # Lowercase letters a-z, aa-zz...
1065
+ label = format_int_alpha(value)
1066
+ else:
1067
+ log.warning("Unknown page label style: %r", style)
1068
+ label = ""
1069
+ return label
pdf2zh/pdfexceptions.py ADDED
@@ -0,0 +1,33 @@
1
+ from pdf2zh.psexceptions import PSException
2
+
3
+
4
+ class PDFException(PSException):
5
+ pass
6
+
7
+
8
+ class PDFTypeError(PDFException, TypeError):
9
+ pass
10
+
11
+
12
+ class PDFValueError(PDFException, ValueError):
13
+ pass
14
+
15
+
16
+ class PDFObjectNotFound(PDFException):
17
+ pass
18
+
19
+
20
+ class PDFNotImplementedError(PDFException, NotImplementedError):
21
+ pass
22
+
23
+
24
+ class PDFKeyError(PDFException, KeyError):
25
+ pass
26
+
27
+
28
+ class PDFEOFError(PDFException, EOFError):
29
+ pass
30
+
31
+
32
+ class PDFIOError(PDFException, IOError):
33
+ pass
pdf2zh/pdffont.py ADDED
@@ -0,0 +1,1190 @@
1
+ import logging
2
+ import struct
3
+ from io import BytesIO
4
+ from typing import (
5
+ TYPE_CHECKING,
6
+ Any,
7
+ BinaryIO,
8
+ Dict,
9
+ Iterable,
10
+ Iterator,
11
+ List,
12
+ Mapping,
13
+ Optional,
14
+ Tuple,
15
+ Union,
16
+ cast,
17
+ )
18
+
19
+ from pdf2zh import settings
20
+ from pdf2zh.cmapdb import (
21
+ CMap,
22
+ CMapBase,
23
+ CMapDB,
24
+ CMapParser,
25
+ FileUnicodeMap,
26
+ IdentityUnicodeMap,
27
+ UnicodeMap,
28
+ )
29
+ from pdf2zh.encodingdb import EncodingDB, name2unicode
30
+ from pdf2zh.fontmetrics import FONT_METRICS
31
+ from pdf2zh.pdfexceptions import PDFException, PDFKeyError, PDFValueError
32
+ from pdf2zh.pdftypes import (
33
+ PDFStream,
34
+ dict_value,
35
+ int_value,
36
+ list_value,
37
+ num_value,
38
+ resolve1,
39
+ resolve_all,
40
+ stream_value,
41
+ )
42
+ from pdf2zh.psexceptions import PSEOF
43
+ from pdf2zh.psparser import (
44
+ KWD,
45
+ LIT,
46
+ PSKeyword,
47
+ PSLiteral,
48
+ PSStackParser,
49
+ literal_name,
50
+ )
51
+ from pdf2zh.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack
52
+
53
+ if TYPE_CHECKING:
54
+ from pdf2zh.pdfinterp import PDFResourceManager
55
+
56
+ log = logging.getLogger(__name__)
57
+
58
+
59
+ def get_widths(seq: Iterable[object]) -> Dict[int, float]:
60
+ """Build a mapping of character widths for horizontal writing."""
61
+ widths: Dict[int, float] = {}
62
+ r: List[float] = []
63
+ for v in seq:
64
+ if isinstance(v, list):
65
+ if r:
66
+ char1 = r[-1]
67
+ for i, w in enumerate(v):
68
+ widths[cast(int, char1) + i] = w
69
+ r = []
70
+ elif isinstance(v, (int, float)): # == utils.isnumber(v)
71
+ r.append(v)
72
+ if len(r) == 3:
73
+ (char1, char2, w) = r
74
+ for i in range(cast(int, char1), cast(int, char2) + 1):
75
+ widths[i] = w
76
+ r = []
77
+ return widths
78
+
79
+
80
+ def get_widths2(seq: Iterable[object]) -> Dict[int, Tuple[float, Point]]:
81
+ """Build a mapping of character widths for vertical writing."""
82
+ widths: Dict[int, Tuple[float, Point]] = {}
83
+ r: List[float] = []
84
+ for v in seq:
85
+ if isinstance(v, list):
86
+ if r:
87
+ char1 = r[-1]
88
+ for i, (w, vx, vy) in enumerate(choplist(3, v)):
89
+ widths[cast(int, char1) + i] = (w, (vx, vy))
90
+ r = []
91
+ elif isinstance(v, (int, float)): # == utils.isnumber(v)
92
+ r.append(v)
93
+ if len(r) == 5:
94
+ (char1, char2, w, vx, vy) = r
95
+ for i in range(cast(int, char1), cast(int, char2) + 1):
96
+ widths[i] = (w, (vx, vy))
97
+ r = []
98
+ return widths
99
+
100
+
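Both helpers above accept the two /W (and /W2) array conventions side by side: a code followed by a list gives an explicit run of widths, while a first-code, last-code, width triple assigns one width to a whole range. A small example of the horizontal form with invented values, assuming the package installs as pdf2zh:

    from pdf2zh.pdffont import get_widths   # the helper defined above

    w = get_widths([1, [100, 120], 5, 8, 200])
    assert w == {1: 100, 2: 120, 5: 200, 6: 200, 7: 200, 8: 200}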
101
+ class FontMetricsDB:
102
+ @classmethod
103
+ def get_metrics(cls, fontname: str) -> Tuple[Dict[str, object], Dict[str, int]]:
104
+ return FONT_METRICS[fontname]
105
+
106
+
107
+ # int here means that we're not extending PSStackParser with additional types.
108
+ class Type1FontHeaderParser(PSStackParser[int]):
109
+ KEYWORD_BEGIN = KWD(b"begin")
110
+ KEYWORD_END = KWD(b"end")
111
+ KEYWORD_DEF = KWD(b"def")
112
+ KEYWORD_PUT = KWD(b"put")
113
+ KEYWORD_DICT = KWD(b"dict")
114
+ KEYWORD_ARRAY = KWD(b"array")
115
+ KEYWORD_READONLY = KWD(b"readonly")
116
+ KEYWORD_FOR = KWD(b"for")
117
+
118
+ def __init__(self, data: BinaryIO) -> None:
119
+ PSStackParser.__init__(self, data)
120
+ self._cid2unicode: Dict[int, str] = {}
121
+
122
+ def get_encoding(self) -> Dict[int, str]:
123
+ """Parse the font encoding.
124
+
125
+ The Type1 font encoding maps character codes to character names. These
126
+ character names could either be standard Adobe glyph names, or
127
+ character names associated with custom CharStrings for this font. A
128
+ CharString is a sequence of operations that describe how the character
129
+ should be drawn. Currently, this function returns '' (empty string)
130
+ for character names that are associated with a custom CharString.
131
+
132
+ Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format
133
+
134
+ :returns mapping of character identifiers (cid's) to unicode characters
135
+ """
136
+ while 1:
137
+ try:
138
+ _, (cid, name) = self.nextobject()
139
+ except PSEOF:
140
+ break
141
+ try:
142
+ self._cid2unicode[cid] = name2unicode(cast(str, name))
143
+ except KeyError:
144
+ # log.debug(str(e))
145
+ pass
146
+ return self._cid2unicode
147
+
148
+ def do_keyword(self, pos: int, token: PSKeyword) -> None:
149
+ if token is self.KEYWORD_PUT:
150
+ ((_, key), (_, value)) = self.pop(2)
151
+ if isinstance(key, int) and isinstance(value, PSLiteral):
152
+ self.add_results((key, literal_name(value)))
153
+
154
+
155
+ NIBBLES = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "e-", None, "-")
156
+
157
+ # Mapping of cmap names. Original cmap name is kept if not in the mapping.
158
+ # (missing reference for why DLIdent is mapped to Identity)
159
+ IDENTITY_ENCODER = {
160
+ "DLIdent-H": "Identity-H",
161
+ "DLIdent-V": "Identity-V",
162
+ }
163
+
164
+
165
+ def getdict(data: bytes) -> Dict[int, List[Union[float, int]]]:
166
+ d: Dict[int, List[Union[float, int]]] = {}
167
+ fp = BytesIO(data)
168
+ stack: List[Union[float, int]] = []
169
+ while 1:
170
+ c = fp.read(1)
171
+ if not c:
172
+ break
173
+ b0 = ord(c)
174
+ if b0 <= 21:
175
+ d[b0] = stack
176
+ stack = []
177
+ continue
178
+ if b0 == 30:
179
+ s = ""
180
+ loop = True
181
+ while loop:
182
+ b = ord(fp.read(1))
183
+ for n in (b >> 4, b & 15):
184
+ if n == 15:
185
+ loop = False
186
+ else:
187
+ nibble = NIBBLES[n]
188
+ assert nibble is not None
189
+ s += nibble
190
+ value = float(s)
191
+ elif b0 >= 32 and b0 <= 246:
192
+ value = b0 - 139
193
+ else:
194
+ b1 = ord(fp.read(1))
195
+ if b0 >= 247 and b0 <= 250:
196
+ value = ((b0 - 247) << 8) + b1 + 108
197
+ elif b0 >= 251 and b0 <= 254:
198
+ value = -((b0 - 251) << 8) - b1 - 108
199
+ else:
200
+ b2 = ord(fp.read(1))
201
+ if b1 >= 128:
202
+ b1 -= 256
203
+ if b0 == 28:
204
+ value = b1 << 8 | b2
205
+ else:
206
+ value = b1 << 24 | b2 << 16 | struct.unpack(">H", fp.read(2))[0]
207
+ stack.append(value)
208
+ return d
209
+
210
+
211
+ class CFFFont:
212
+ STANDARD_STRINGS = (
213
+ ".notdef",
214
+ "space",
215
+ "exclam",
216
+ "quotedbl",
217
+ "numbersign",
218
+ "dollar",
219
+ "percent",
220
+ "ampersand",
221
+ "quoteright",
222
+ "parenleft",
223
+ "parenright",
224
+ "asterisk",
225
+ "plus",
226
+ "comma",
227
+ "hyphen",
228
+ "period",
229
+ "slash",
230
+ "zero",
231
+ "one",
232
+ "two",
233
+ "three",
234
+ "four",
235
+ "five",
236
+ "six",
237
+ "seven",
238
+ "eight",
239
+ "nine",
240
+ "colon",
241
+ "semicolon",
242
+ "less",
243
+ "equal",
244
+ "greater",
245
+ "question",
246
+ "at",
247
+ "A",
248
+ "B",
249
+ "C",
250
+ "D",
251
+ "E",
252
+ "F",
253
+ "G",
254
+ "H",
255
+ "I",
256
+ "J",
257
+ "K",
258
+ "L",
259
+ "M",
260
+ "N",
261
+ "O",
262
+ "P",
263
+ "Q",
264
+ "R",
265
+ "S",
266
+ "T",
267
+ "U",
268
+ "V",
269
+ "W",
270
+ "X",
271
+ "Y",
272
+ "Z",
273
+ "bracketleft",
274
+ "backslash",
275
+ "bracketright",
276
+ "asciicircum",
277
+ "underscore",
278
+ "quoteleft",
279
+ "a",
280
+ "b",
281
+ "c",
282
+ "d",
283
+ "e",
284
+ "f",
285
+ "g",
286
+ "h",
287
+ "i",
288
+ "j",
289
+ "k",
290
+ "l",
291
+ "m",
292
+ "n",
293
+ "o",
294
+ "p",
295
+ "q",
296
+ "r",
297
+ "s",
298
+ "t",
299
+ "u",
300
+ "v",
301
+ "w",
302
+ "x",
303
+ "y",
304
+ "z",
305
+ "braceleft",
306
+ "bar",
307
+ "braceright",
308
+ "asciitilde",
309
+ "exclamdown",
310
+ "cent",
311
+ "sterling",
312
+ "fraction",
313
+ "yen",
314
+ "florin",
315
+ "section",
316
+ "currency",
317
+ "quotesingle",
318
+ "quotedblleft",
319
+ "guillemotleft",
320
+ "guilsinglleft",
321
+ "guilsinglright",
322
+ "fi",
323
+ "fl",
324
+ "endash",
325
+ "dagger",
326
+ "daggerdbl",
327
+ "periodcentered",
328
+ "paragraph",
329
+ "bullet",
330
+ "quotesinglbase",
331
+ "quotedblbase",
332
+ "quotedblright",
333
+ "guillemotright",
334
+ "ellipsis",
335
+ "perthousand",
336
+ "questiondown",
337
+ "grave",
338
+ "acute",
339
+ "circumflex",
340
+ "tilde",
341
+ "macron",
342
+ "breve",
343
+ "dotaccent",
344
+ "dieresis",
345
+ "ring",
346
+ "cedilla",
347
+ "hungarumlaut",
348
+ "ogonek",
349
+ "caron",
350
+ "emdash",
351
+ "AE",
352
+ "ordfeminine",
353
+ "Lslash",
354
+ "Oslash",
355
+ "OE",
356
+ "ordmasculine",
357
+ "ae",
358
+ "dotlessi",
359
+ "lslash",
360
+ "oslash",
361
+ "oe",
362
+ "germandbls",
363
+ "onesuperior",
364
+ "logicalnot",
365
+ "mu",
366
+ "trademark",
367
+ "Eth",
368
+ "onehalf",
369
+ "plusminus",
370
+ "Thorn",
371
+ "onequarter",
372
+ "divide",
373
+ "brokenbar",
374
+ "degree",
375
+ "thorn",
376
+ "threequarters",
377
+ "twosuperior",
378
+ "registered",
379
+ "minus",
380
+ "eth",
381
+ "multiply",
382
+ "threesuperior",
383
+ "copyright",
384
+ "Aacute",
385
+ "Acircumflex",
386
+ "Adieresis",
387
+ "Agrave",
388
+ "Aring",
389
+ "Atilde",
390
+ "Ccedilla",
391
+ "Eacute",
392
+ "Ecircumflex",
393
+ "Edieresis",
394
+ "Egrave",
395
+ "Iacute",
396
+ "Icircumflex",
397
+ "Idieresis",
398
+ "Igrave",
399
+ "Ntilde",
400
+ "Oacute",
401
+ "Ocircumflex",
402
+ "Odieresis",
403
+ "Ograve",
404
+ "Otilde",
405
+ "Scaron",
406
+ "Uacute",
407
+ "Ucircumflex",
408
+ "Udieresis",
409
+ "Ugrave",
410
+ "Yacute",
411
+ "Ydieresis",
412
+ "Zcaron",
413
+ "aacute",
414
+ "acircumflex",
415
+ "adieresis",
416
+ "agrave",
417
+ "aring",
418
+ "atilde",
419
+ "ccedilla",
420
+ "eacute",
421
+ "ecircumflex",
422
+ "edieresis",
423
+ "egrave",
424
+ "iacute",
425
+ "icircumflex",
426
+ "idieresis",
427
+ "igrave",
428
+ "ntilde",
429
+ "oacute",
430
+ "ocircumflex",
431
+ "odieresis",
432
+ "ograve",
433
+ "otilde",
434
+ "scaron",
435
+ "uacute",
436
+ "ucircumflex",
437
+ "udieresis",
438
+ "ugrave",
439
+ "yacute",
440
+ "ydieresis",
441
+ "zcaron",
442
+ "exclamsmall",
443
+ "Hungarumlautsmall",
444
+ "dollaroldstyle",
445
+ "dollarsuperior",
446
+ "ampersandsmall",
447
+ "Acutesmall",
448
+ "parenleftsuperior",
449
+ "parenrightsuperior",
450
+ "twodotenleader",
451
+ "onedotenleader",
452
+ "zerooldstyle",
453
+ "oneoldstyle",
454
+ "twooldstyle",
455
+ "threeoldstyle",
456
+ "fouroldstyle",
457
+ "fiveoldstyle",
458
+ "sixoldstyle",
459
+ "sevenoldstyle",
460
+ "eightoldstyle",
461
+ "nineoldstyle",
462
+ "commasuperior",
463
+ "threequartersemdash",
464
+ "periodsuperior",
465
+ "questionsmall",
466
+ "asuperior",
467
+ "bsuperior",
468
+ "centsuperior",
469
+ "dsuperior",
470
+ "esuperior",
471
+ "isuperior",
472
+ "lsuperior",
473
+ "msuperior",
474
+ "nsuperior",
475
+ "osuperior",
476
+ "rsuperior",
477
+ "ssuperior",
478
+ "tsuperior",
479
+ "ff",
480
+ "ffi",
481
+ "ffl",
482
+ "parenleftinferior",
483
+ "parenrightinferior",
484
+ "Circumflexsmall",
485
+ "hyphensuperior",
486
+ "Gravesmall",
487
+ "Asmall",
488
+ "Bsmall",
489
+ "Csmall",
490
+ "Dsmall",
491
+ "Esmall",
492
+ "Fsmall",
493
+ "Gsmall",
494
+ "Hsmall",
495
+ "Ismall",
496
+ "Jsmall",
497
+ "Ksmall",
498
+ "Lsmall",
499
+ "Msmall",
500
+ "Nsmall",
501
+ "Osmall",
502
+ "Psmall",
503
+ "Qsmall",
504
+ "Rsmall",
505
+ "Ssmall",
506
+ "Tsmall",
507
+ "Usmall",
508
+ "Vsmall",
509
+ "Wsmall",
510
+ "Xsmall",
511
+ "Ysmall",
512
+ "Zsmall",
513
+ "colonmonetary",
514
+ "onefitted",
515
+ "rupiah",
516
+ "Tildesmall",
517
+ "exclamdownsmall",
518
+ "centoldstyle",
519
+ "Lslashsmall",
520
+ "Scaronsmall",
521
+ "Zcaronsmall",
522
+ "Dieresissmall",
523
+ "Brevesmall",
524
+ "Caronsmall",
525
+ "Dotaccentsmall",
526
+ "Macronsmall",
527
+ "figuredash",
528
+ "hypheninferior",
529
+ "Ogoneksmall",
530
+ "Ringsmall",
531
+ "Cedillasmall",
532
+ "questiondownsmall",
533
+ "oneeighth",
534
+ "threeeighths",
535
+ "fiveeighths",
536
+ "seveneighths",
537
+ "onethird",
538
+ "twothirds",
539
+ "zerosuperior",
540
+ "foursuperior",
541
+ "fivesuperior",
542
+ "sixsuperior",
543
+ "sevensuperior",
544
+ "eightsuperior",
545
+ "ninesuperior",
546
+ "zeroinferior",
547
+ "oneinferior",
548
+ "twoinferior",
549
+ "threeinferior",
550
+ "fourinferior",
551
+ "fiveinferior",
552
+ "sixinferior",
553
+ "seveninferior",
554
+ "eightinferior",
555
+ "nineinferior",
556
+ "centinferior",
557
+ "dollarinferior",
558
+ "periodinferior",
559
+ "commainferior",
560
+ "Agravesmall",
561
+ "Aacutesmall",
562
+ "Acircumflexsmall",
563
+ "Atildesmall",
564
+ "Adieresissmall",
565
+ "Aringsmall",
566
+ "AEsmall",
567
+ "Ccedillasmall",
568
+ "Egravesmall",
569
+ "Eacutesmall",
570
+ "Ecircumflexsmall",
571
+ "Edieresissmall",
572
+ "Igravesmall",
573
+ "Iacutesmall",
574
+ "Icircumflexsmall",
575
+ "Idieresissmall",
576
+ "Ethsmall",
577
+ "Ntildesmall",
578
+ "Ogravesmall",
579
+ "Oacutesmall",
580
+ "Ocircumflexsmall",
581
+ "Otildesmall",
582
+ "Odieresissmall",
583
+ "OEsmall",
584
+ "Oslashsmall",
585
+ "Ugravesmall",
586
+ "Uacutesmall",
587
+ "Ucircumflexsmall",
588
+ "Udieresissmall",
589
+ "Yacutesmall",
590
+ "Thornsmall",
591
+ "Ydieresissmall",
592
+ "001.000",
593
+ "001.001",
594
+ "001.002",
595
+ "001.003",
596
+ "Black",
597
+ "Bold",
598
+ "Book",
599
+ "Light",
600
+ "Medium",
601
+ "Regular",
602
+ "Roman",
603
+ "Semibold",
604
+ )
605
+
606
+ class INDEX:
607
+ def __init__(self, fp: BinaryIO) -> None:
608
+ self.fp = fp
609
+ self.offsets: List[int] = []
610
+ (count, offsize) = struct.unpack(">HB", self.fp.read(3))
611
+ for i in range(count + 1):
612
+ self.offsets.append(nunpack(self.fp.read(offsize)))
613
+ self.base = self.fp.tell() - 1
614
+ self.fp.seek(self.base + self.offsets[-1])
615
+
616
+ def __repr__(self) -> str:
617
+ return "<INDEX: size=%d>" % len(self)
618
+
619
+ def __len__(self) -> int:
620
+ return len(self.offsets) - 1
621
+
622
+ def __getitem__(self, i: int) -> bytes:
623
+ self.fp.seek(self.base + self.offsets[i])
624
+ return self.fp.read(self.offsets[i + 1] - self.offsets[i])
625
+
626
+ def __iter__(self) -> Iterator[bytes]:
627
+ return iter(self[i] for i in range(len(self)))
628
+
629
+ def __init__(self, name: str, fp: BinaryIO) -> None:
630
+ self.name = name
631
+ self.fp = fp
632
+ # Header
633
+ (_major, _minor, hdrsize, offsize) = struct.unpack("BBBB", self.fp.read(4))
634
+ self.fp.read(hdrsize - 4)
635
+ # Name INDEX
636
+ self.name_index = self.INDEX(self.fp)
637
+ # Top DICT INDEX
638
+ self.dict_index = self.INDEX(self.fp)
639
+ # String INDEX
640
+ self.string_index = self.INDEX(self.fp)
641
+ # Global Subr INDEX
642
+ self.subr_index = self.INDEX(self.fp)
643
+ # Top DICT DATA
644
+ self.top_dict = getdict(self.dict_index[0])
645
+ (charset_pos,) = self.top_dict.get(15, [0])
646
+ (encoding_pos,) = self.top_dict.get(16, [0])
647
+ (charstring_pos,) = self.top_dict.get(17, [0])
648
+ # CharStrings
649
+ self.fp.seek(cast(int, charstring_pos))
650
+ self.charstring = self.INDEX(self.fp)
651
+ self.nglyphs = len(self.charstring)
652
+ # Encodings
653
+ self.code2gid = {}
654
+ self.gid2code = {}
655
+ self.fp.seek(cast(int, encoding_pos))
656
+ format = self.fp.read(1)
657
+ if format == b"\x00":
658
+ # Format 0
659
+ (n,) = struct.unpack("B", self.fp.read(1))
660
+ for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))):
661
+ self.code2gid[code] = gid
662
+ self.gid2code[gid] = code
663
+ elif format == b"\x01":
664
+ # Format 1
665
+ (n,) = struct.unpack("B", self.fp.read(1))
666
+ code = 0
667
+ for i in range(n):
668
+ (first, nleft) = struct.unpack("BB", self.fp.read(2))
669
+ for gid in range(first, first + nleft + 1):
670
+ self.code2gid[code] = gid
671
+ self.gid2code[gid] = code
672
+ code += 1
673
+ else:
674
+ raise PDFValueError("unsupported encoding format: %r" % format)
675
+ # Charsets
676
+ self.name2gid = {}
677
+ self.gid2name = {}
678
+ self.fp.seek(cast(int, charset_pos))
679
+ format = self.fp.read(1)
680
+ if format == b"\x00":
681
+ # Format 0
682
+ n = self.nglyphs - 1
683
+ for gid, sid in enumerate(
684
+ cast(
685
+ Tuple[int, ...], struct.unpack(">" + "H" * n, self.fp.read(2 * n))
686
+ ),
687
+ ):
688
+ gid += 1
689
+ sidname = self.getstr(sid)
690
+ self.name2gid[sidname] = gid
691
+ self.gid2name[gid] = sidname
692
+ elif format == b"\x01":
693
+ # Format 1
694
+ (n,) = struct.unpack("B", self.fp.read(1))
695
+ sid = 0
696
+ for i in range(n):
697
+ (first, nleft) = struct.unpack("BB", self.fp.read(2))
698
+ for gid in range(first, first + nleft + 1):
699
+ sidname = self.getstr(sid)
700
+ self.name2gid[sidname] = gid
701
+ self.gid2name[gid] = sidname
702
+ sid += 1
703
+ elif format == b"\x02":
704
+ # Format 2
705
+ assert False, str(("Unhandled", format))
706
+ else:
707
+ raise PDFValueError("unsupported charset format: %r" % format)
708
+
709
+ def getstr(self, sid: int) -> Union[str, bytes]:
710
+ # This returns str for one of the STANDARD_STRINGS but bytes otherwise,
711
+ # and appears to be a needless source of type complexity.
712
+ if sid < len(self.STANDARD_STRINGS):
713
+ return self.STANDARD_STRINGS[sid]
714
+ return self.string_index[sid - len(self.STANDARD_STRINGS)]
715
+
716
+
717
+ class TrueTypeFont:
718
+ class CMapNotFound(PDFException):
719
+ pass
720
+
721
+ def __init__(self, name: str, fp: BinaryIO) -> None:
722
+ self.name = name
723
+ self.fp = fp
724
+ self.tables: Dict[bytes, Tuple[int, int]] = {}
725
+ self.fonttype = fp.read(4)
726
+ try:
727
+ (ntables, _1, _2, _3) = cast(
728
+ Tuple[int, int, int, int],
729
+ struct.unpack(">HHHH", fp.read(8)),
730
+ )
731
+ for _ in range(ntables):
732
+ (name_bytes, tsum, offset, length) = cast(
733
+ Tuple[bytes, int, int, int],
734
+ struct.unpack(">4sLLL", fp.read(16)),
735
+ )
736
+ self.tables[name_bytes] = (offset, length)
737
+ except struct.error:
738
+ # Do not fail if there are not enough bytes to read. Even for
739
+ # corrupted PDFs we would like to get as much information as
740
+ # possible, so continue.
741
+ pass
742
+
743
+ def create_unicode_map(self) -> FileUnicodeMap:
744
+ if b"cmap" not in self.tables:
745
+ raise TrueTypeFont.CMapNotFound
746
+ (base_offset, length) = self.tables[b"cmap"]
747
+ fp = self.fp
748
+ fp.seek(base_offset)
749
+ (version, nsubtables) = cast(Tuple[int, int], struct.unpack(">HH", fp.read(4)))
750
+ subtables: List[Tuple[int, int, int]] = []
751
+ for i in range(nsubtables):
752
+ subtables.append(
753
+ cast(Tuple[int, int, int], struct.unpack(">HHL", fp.read(8))),
754
+ )
755
+ char2gid: Dict[int, int] = {}
756
+ # Only supports subtable type 0, 2 and 4.
757
+ for platform_id, encoding_id, st_offset in subtables:
758
+ # Skip non-Unicode cmaps.
759
+ # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
760
+ if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
761
+ continue
762
+ fp.seek(base_offset + st_offset)
763
+ (fmttype, fmtlen, fmtlang) = cast(
764
+ Tuple[int, int, int],
765
+ struct.unpack(">HHH", fp.read(6)),
766
+ )
767
+ if fmttype == 0:
768
+ char2gid.update(
769
+ enumerate(
770
+ cast(Tuple[int, ...], struct.unpack(">256B", fp.read(256))),
771
+ ),
772
+ )
773
+ elif fmttype == 2:
774
+ subheaderkeys = cast(
775
+ Tuple[int, ...],
776
+ struct.unpack(">256H", fp.read(512)),
777
+ )
778
+ firstbytes = [0] * 8192
779
+ for i, k in enumerate(subheaderkeys):
780
+ firstbytes[k // 8] = i
781
+ nhdrs = max(subheaderkeys) // 8 + 1
782
+ hdrs: List[Tuple[int, int, int, int, int]] = []
783
+ for i in range(nhdrs):
784
+ (firstcode, entcount, delta, offset) = cast(
785
+ Tuple[int, int, int, int],
786
+ struct.unpack(">HHhH", fp.read(8)),
787
+ )
788
+ hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
789
+ for i, firstcode, entcount, delta, pos in hdrs:
790
+ if not entcount:
791
+ continue
792
+ first = firstcode + (firstbytes[i] << 8)
793
+ fp.seek(pos)
794
+ for c in range(entcount):
795
+ gid = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
796
+ if gid:
797
+ gid += delta
798
+ char2gid[first + c] = gid
799
+ elif fmttype == 4:
800
+ (segcount, _1, _2, _3) = cast(
801
+ Tuple[int, int, int, int],
802
+ struct.unpack(">HHHH", fp.read(8)),
803
+ )
804
+ segcount //= 2
805
+ ecs = cast(
806
+ Tuple[int, ...],
807
+ struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
808
+ )
809
+ fp.read(2)
810
+ scs = cast(
811
+ Tuple[int, ...],
812
+ struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
813
+ )
814
+ idds = cast(
815
+ Tuple[int, ...],
816
+ struct.unpack(">%dh" % segcount, fp.read(2 * segcount)),
817
+ )
818
+ pos = fp.tell()
819
+ idrs = cast(
820
+ Tuple[int, ...],
821
+ struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
822
+ )
823
+ for ec, sc, idd, idr in zip(ecs, scs, idds, idrs):
824
+ if idr:
825
+ fp.seek(pos + idr)
826
+ for c in range(sc, ec + 1):
827
+ b = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
828
+ char2gid[c] = (b + idd) & 0xFFFF
829
+ else:
830
+ for c in range(sc, ec + 1):
831
+ char2gid[c] = (c + idd) & 0xFFFF
832
+ else:
833
+ assert False, str(("Unhandled", fmttype))
834
+ if not char2gid:
835
+ raise TrueTypeFont.CMapNotFound
836
+ # create unicode map
837
+ unicode_map = FileUnicodeMap()
838
+ for char, gid in char2gid.items():
839
+ unicode_map.add_cid2unichr(gid, char)
840
+ return unicode_map
841
+
842
+
843
+ class PDFFontError(PDFException):
844
+ pass
845
+
846
+
847
+ class PDFUnicodeNotDefined(PDFFontError):
848
+ pass
849
+
850
+
851
+ LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")
852
+ LITERAL_TYPE1C = LIT("Type1C")
853
+
854
+ # Font widths are maintained in a dict type that maps from *either* unicode
855
+ # chars or integer character IDs.
856
+ FontWidthDict = Union[Dict[int, float], Dict[str, float]]
857
+
858
+
859
+ class PDFFont:
860
+ def __init__(
861
+ self,
862
+ descriptor: Mapping[str, Any],
863
+ widths: FontWidthDict,
864
+ default_width: Optional[float] = None,
865
+ ) -> None:
866
+ self.descriptor = descriptor
867
+ self.widths: FontWidthDict = resolve_all(widths)
868
+ self.fontname = resolve1(descriptor.get("FontName", "unknown"))
869
+ if isinstance(self.fontname, PSLiteral):
870
+ self.fontname = literal_name(self.fontname)
871
+ self.flags = int_value(descriptor.get("Flags", 0))
872
+ self.ascent = num_value(descriptor.get("Ascent", 0))
873
+ self.descent = num_value(descriptor.get("Descent", 0))
874
+ self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))
875
+ if default_width is None:
876
+ self.default_width = num_value(descriptor.get("MissingWidth", 0))
877
+ else:
878
+ self.default_width = default_width
879
+ self.default_width = resolve1(self.default_width)
880
+ self.leading = num_value(descriptor.get("Leading", 0))
881
+ self.bbox = cast(
882
+ Rect,
883
+ list_value(resolve_all(descriptor.get("FontBBox", (0, 0, 0, 0)))),
884
+ )
885
+ self.hscale = self.vscale = 0.001
886
+
887
+ # PDF RM 9.8.1 specifies /Descent should always be a negative number.
888
+ # PScript5.dll seems to produce Descent with a positive number, but
889
+ # text analysis will be wrong if this is taken as correct. So force
890
+ # descent to negative.
891
+ if self.descent > 0:
892
+ self.descent = -self.descent
893
+
894
+ def __repr__(self) -> str:
895
+ return "<PDFFont>"
896
+
897
+ def is_vertical(self) -> bool:
898
+ return False
899
+
900
+ def is_multibyte(self) -> bool:
901
+ return False
902
+
903
+ def decode(self, bytes: bytes) -> Iterable[int]:
904
+ return bytearray(bytes) # map(ord, bytes)
905
+
906
+ def get_ascent(self) -> float:
907
+ """Ascent above the baseline, in text space units"""
908
+ return self.ascent * self.vscale
909
+
910
+ def get_descent(self) -> float:
911
+ """Descent below the baseline, in text space units; always negative"""
912
+ return self.descent * self.vscale
913
+
914
+ def get_width(self) -> float:
915
+ w = self.bbox[2] - self.bbox[0]
916
+ if w == 0:
917
+ w = -self.default_width
918
+ return w * self.hscale
919
+
920
+ def get_height(self) -> float:
921
+ h = self.bbox[3] - self.bbox[1]
922
+ if h == 0:
923
+ h = self.ascent - self.descent
924
+ return h * self.vscale
925
+
926
+ def char_width(self, cid: int) -> float:
927
+ # Because character widths may be mapping either IDs or strings,
928
+ # we try to lookup the character ID first, then its str equivalent.
929
+ try:
930
+ return cast(Dict[int, float], self.widths)[cid] * self.hscale
931
+ except KeyError:
932
+ str_widths = cast(Dict[str, float], self.widths)
933
+ try:
934
+ return str_widths[self.to_unichr(cid)] * self.hscale
935
+ except (KeyError, PDFUnicodeNotDefined):
936
+ return self.default_width * self.hscale
937
+
938
+ def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
939
+ """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
940
+ return 0
941
+
942
+ def string_width(self, s: bytes) -> float:
943
+ return sum(self.char_width(cid) for cid in self.decode(s))
944
+
945
+ def to_unichr(self, cid: int) -> str:
946
+ raise NotImplementedError
947
+
948
+
949
+ class PDFSimpleFont(PDFFont):
950
+ def __init__(
951
+ self,
952
+ descriptor: Mapping[str, Any],
953
+ widths: FontWidthDict,
954
+ spec: Mapping[str, Any],
955
+ ) -> None:
956
+ # Font encoding is specified either by a name of
957
+ # built-in encoding or a dictionary that describes
958
+ # the differences.
959
+ if "Encoding" in spec:
960
+ encoding = resolve1(spec["Encoding"])
961
+ else:
962
+ encoding = LITERAL_STANDARD_ENCODING
963
+ if isinstance(encoding, dict):
964
+ name = literal_name(encoding.get("BaseEncoding", LITERAL_STANDARD_ENCODING))
965
+ diff = list_value(encoding.get("Differences", []))
966
+ self.cid2unicode = EncodingDB.get_encoding(name, diff)
967
+ else:
968
+ self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
969
+ self.unicode_map: Optional[UnicodeMap] = None
970
+ if "ToUnicode" in spec:
971
+ strm = stream_value(spec["ToUnicode"])
972
+ self.unicode_map = FileUnicodeMap()
973
+ CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
974
+ PDFFont.__init__(self, descriptor, widths)
975
+
976
+ def to_unichr(self, cid: int) -> str:
977
+ if self.unicode_map:
978
+ try:
979
+ return self.unicode_map.get_unichr(cid)
980
+ except KeyError:
981
+ pass
982
+ try:
983
+ return self.cid2unicode[cid]
984
+ except KeyError:
985
+ raise PDFUnicodeNotDefined(None, cid)
986
+
987
+
988
+ class PDFType1Font(PDFSimpleFont):
989
+ def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
990
+ try:
991
+ self.basefont = literal_name(spec["BaseFont"])
992
+ except KeyError:
993
+ if settings.STRICT:
994
+ raise PDFFontError("BaseFont is missing")
995
+ self.basefont = "unknown"
996
+
997
+ widths: FontWidthDict
998
+ try:
999
+ (descriptor, int_widths) = FontMetricsDB.get_metrics(self.basefont)
1000
+ widths = cast(Dict[str, float], int_widths) # implicit int->float
1001
+ except KeyError:
1002
+ descriptor = dict_value(spec.get("FontDescriptor", {}))
1003
+ firstchar = int_value(spec.get("FirstChar", 0))
1004
+ # lastchar = int_value(spec.get('LastChar', 255))
1005
+ width_list = list_value(spec.get("Widths", [0] * 256))
1006
+ widths = {i + firstchar: resolve1(w) for (i, w) in enumerate(width_list)}
1007
+ PDFSimpleFont.__init__(self, descriptor, widths, spec)
1008
+ if "Encoding" not in spec and "FontFile" in descriptor:
1009
+ # try to recover the missing encoding info from the font file.
1010
+ self.fontfile = stream_value(descriptor.get("FontFile"))
1011
+ length1 = int_value(self.fontfile["Length1"])
1012
+ data = self.fontfile.get_data()[:length1]
1013
+ parser = Type1FontHeaderParser(BytesIO(data))
1014
+ self.cid2unicode = parser.get_encoding()
1015
+
1016
+ def __repr__(self) -> str:
1017
+ return "<PDFType1Font: basefont=%r>" % self.basefont
1018
+
1019
+
1020
+ class PDFTrueTypeFont(PDFType1Font):
1021
+ def __repr__(self) -> str:
1022
+ return "<PDFTrueTypeFont: basefont=%r>" % self.basefont
1023
+
1024
+
1025
+ class PDFType3Font(PDFSimpleFont):
1026
+ def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
1027
+ firstchar = int_value(spec.get("FirstChar", 0))
1028
+ # lastchar = int_value(spec.get('LastChar', 0))
1029
+ width_list = list_value(spec.get("Widths", [0] * 256))
1030
+ widths = {i + firstchar: w for (i, w) in enumerate(width_list)}
1031
+ if "FontDescriptor" in spec:
1032
+ descriptor = dict_value(spec["FontDescriptor"])
1033
+ else:
1034
+ descriptor = {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
1035
+ PDFSimpleFont.__init__(self, descriptor, widths, spec)
1036
+ self.matrix = cast(Matrix, tuple(list_value(spec.get("FontMatrix"))))
1037
+ (_, self.descent, _, self.ascent) = self.bbox
1038
+ (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
1039
+
1040
+ def __repr__(self) -> str:
1041
+ return "<PDFType3Font>"
1042
+
1043
+
1044
+ class PDFCIDFont(PDFFont):
1045
+ default_disp: Union[float, Tuple[Optional[float], float]]
1046
+
1047
+ def __init__(
1048
+ self,
1049
+ rsrcmgr: "PDFResourceManager",
1050
+ spec: Mapping[str, Any],
1051
+ strict: bool = settings.STRICT,
1052
+ ) -> None:
1053
+ try:
1054
+ self.basefont = literal_name(spec["BaseFont"])
1055
+ except KeyError:
1056
+ if strict:
1057
+ raise PDFFontError("BaseFont is missing")
1058
+ self.basefont = "unknown"
1059
+ self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
1060
+ cid_registry = resolve1(self.cidsysteminfo.get("Registry", b"unknown")).decode(
1061
+ "latin1",
1062
+ )
1063
+ cid_ordering = resolve1(self.cidsysteminfo.get("Ordering", b"unknown")).decode(
1064
+ "latin1",
1065
+ )
1066
+ self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
1067
+ self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)
1068
+
1069
+ try:
1070
+ descriptor = dict_value(spec["FontDescriptor"])
1071
+ except KeyError:
1072
+ if strict:
1073
+ raise PDFFontError("FontDescriptor is missing")
1074
+ descriptor = {}
1075
+ ttf = None
1076
+ if "FontFile2" in descriptor:
1077
+ self.fontfile = stream_value(descriptor.get("FontFile2"))
1078
+ ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))
1079
+ self.unicode_map: Optional[UnicodeMap] = None
1080
+ if "ToUnicode" in spec:
1081
+ if isinstance(spec["ToUnicode"], PDFStream):
1082
+ strm = stream_value(spec["ToUnicode"])
1083
+ self.unicode_map = FileUnicodeMap()
1084
+ CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
1085
+ else:
1086
+ cmap_name = literal_name(spec["ToUnicode"])
1087
+ encoding = literal_name(spec["Encoding"])
1088
+ if (
1089
+ "Identity" in cid_ordering
1090
+ or "Identity" in cmap_name
1091
+ or "Identity" in encoding
1092
+ ):
1093
+ self.unicode_map = IdentityUnicodeMap()
1094
+ elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"):
1095
+ if ttf:
1096
+ try:
1097
+ self.unicode_map = ttf.create_unicode_map()
1098
+ except TrueTypeFont.CMapNotFound:
1099
+ pass
1100
+ else:
1101
+ try:
1102
+ self.unicode_map = CMapDB.get_unicode_map(
1103
+ self.cidcoding,
1104
+ self.cmap.is_vertical(),
1105
+ )
1106
+ except CMapDB.CMapNotFound:
1107
+ pass
1108
+
1109
+ self.vertical = self.cmap.is_vertical()
1110
+ if self.vertical:
1111
+ # writing mode: vertical
1112
+ widths2 = get_widths2(list_value(spec.get("W2", [])))
1113
+ self.disps = {cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()}
1114
+ (vy, w) = resolve1(spec.get("DW2", [880, -1000]))
1115
+ self.default_disp = (None, vy)
1116
+ widths = {cid: w for (cid, (w, _)) in widths2.items()}
1117
+ default_width = w
1118
+ else:
1119
+ # writing mode: horizontal
1120
+ self.disps = {}
1121
+ self.default_disp = 0
1122
+ widths = get_widths(list_value(spec.get("W", [])))
1123
+ default_width = spec.get("DW", 1000)
1124
+ PDFFont.__init__(self, descriptor, widths, default_width=default_width)
1125
+
1126
+ def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
1127
+ """Get cmap from font specification
1128
+
1129
+ For certain PDFs, Encoding Type isn't mentioned as an attribute of
1130
+ Encoding but as an attribute of CMapName, where CMapName is an
1131
+ attribute of spec['Encoding'].
1132
+ The horizontal/vertical writing modes appear under different names,
1133
+ such as 'DLIdent-H/V', 'OneByteIdentityH/V', and 'Identity-H/V'.
1134
+ """
1135
+ cmap_name = self._get_cmap_name(spec, strict)
1136
+
1137
+ try:
1138
+ return CMapDB.get_cmap(cmap_name)
1139
+ except CMapDB.CMapNotFound as e:
1140
+ if strict:
1141
+ raise PDFFontError(e)
1142
+ return CMap()
1143
+
1144
+ @staticmethod
1145
+ def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
1146
+ """Get cmap name from font specification"""
1147
+ cmap_name = "unknown" # default value
1148
+
1149
+ try:
1150
+ spec_encoding = spec["Encoding"]
1151
+ if hasattr(spec_encoding, "name"):
1152
+ cmap_name = literal_name(spec["Encoding"])
1153
+ else:
1154
+ cmap_name = literal_name(spec_encoding["CMapName"])
1155
+ except KeyError:
1156
+ if strict:
1157
+ raise PDFFontError("Encoding is unspecified")
1158
+
1159
+ if type(cmap_name) is PDFStream: # type: ignore[comparison-overlap]
1160
+ cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
1161
+ if "CMapName" in cmap_name_stream:
1162
+ cmap_name = cmap_name_stream.get("CMapName").name
1163
+ elif strict:
1164
+ raise PDFFontError("CMapName unspecified for encoding")
1165
+
1166
+ return IDENTITY_ENCODER.get(cmap_name, cmap_name)
1167
+
1168
+ def __repr__(self) -> str:
1169
+ return f"<PDFCIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>"
1170
+
1171
+ def is_vertical(self) -> bool:
1172
+ return self.vertical
1173
+
1174
+ def is_multibyte(self) -> bool:
1175
+ return True
1176
+
1177
+ def decode(self, bytes: bytes) -> Iterable[int]:
1178
+ return self.cmap.decode(bytes)
1179
+
1180
+ def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
1181
+ """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
1182
+ return self.disps.get(cid, self.default_disp)
1183
+
1184
+ def to_unichr(self, cid: int) -> str:
1185
+ try:
1186
+ if not self.unicode_map:
1187
+ raise PDFKeyError(cid)
1188
+ return self.unicode_map.get_unichr(cid)
1189
+ except KeyError:
1190
+ raise PDFUnicodeNotDefined(self.cidcoding, cid)
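A minimal sketch of how the PDFCIDFont API shown above is consumed (assumptions: `font` is a PDFCIDFont produced elsewhere, e.g. by PDFResourceManager.get_font, the byte string is hypothetical, and PDFUnicodeNotDefined is importable from pdf2zh.pdffont):

    # decode() maps raw PDF string bytes to CIDs through the font's CMap;
    # to_unichr() then maps each CID to a Unicode character when a unicode_map exists.
    raw = b"\x00\x41\x00\x42"  # hypothetical two-byte CIDs
    for cid in font.decode(raw):
        try:
            print(cid, font.to_unichr(cid), font.char_disp(cid))
        except PDFUnicodeNotDefined:
            print(cid, "<no Unicode mapping>")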
pdf2zh/pdfinterp.py ADDED
@@ -0,0 +1,1113 @@
1
+ import logging
2
+ import re
3
+ from io import BytesIO
4
+ from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast
5
+ import numpy as np
6
+
7
+ from pdf2zh import settings
8
+ from pdf2zh.casting import safe_float
9
+ from pdf2zh.cmapdb import CMap, CMapBase, CMapDB
10
+ from pdf2zh.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
11
+ from pdf2zh.pdfdevice import PDFDevice, PDFTextSeq
12
+ from pdf2zh.pdfexceptions import PDFException, PDFValueError
13
+ from pdf2zh.pdffont import (
14
+ PDFCIDFont,
15
+ PDFFont,
16
+ PDFFontError,
17
+ PDFTrueTypeFont,
18
+ PDFType1Font,
19
+ PDFType3Font,
20
+ )
21
+ from pdf2zh.pdfpage import PDFPage
22
+ from pdf2zh.pdftypes import (
23
+ LITERALS_ASCII85_DECODE,
24
+ PDFObjRef,
25
+ PDFStream,
26
+ dict_value,
27
+ list_value,
28
+ resolve1,
29
+ stream_value,
30
+ )
31
+ from pdf2zh.psexceptions import PSEOF, PSTypeError
32
+ from pdf2zh.psparser import (
33
+ KWD,
34
+ LIT,
35
+ PSKeyword,
36
+ PSLiteral,
37
+ PSStackParser,
38
+ PSStackType,
39
+ keyword_name,
40
+ literal_name,
41
+ )
42
+ from pdf2zh.utils import (
43
+ MATRIX_IDENTITY,
44
+ Matrix,
45
+ PathSegment,
46
+ Point,
47
+ Rect,
48
+ choplist,
49
+ mult_matrix,
50
+ apply_matrix_pt,
51
+ )
52
+
53
+ log = logging.getLogger(__name__)
54
+
55
+
56
+ class PDFResourceError(PDFException):
57
+ pass
58
+
59
+
60
+ class PDFInterpreterError(PDFException):
61
+ pass
62
+
63
+
64
+ LITERAL_PDF = LIT("PDF")
65
+ LITERAL_TEXT = LIT("Text")
66
+ LITERAL_FONT = LIT("Font")
67
+ LITERAL_FORM = LIT("Form")
68
+ LITERAL_IMAGE = LIT("Image")
69
+
70
+
71
+ class PDFTextState:
72
+ matrix: Matrix
73
+ linematrix: Point
74
+
75
+ def __init__(self) -> None:
76
+ self.font: Optional[PDFFont] = None
77
+ self.fontsize: float = 0
78
+ self.charspace: float = 0
79
+ self.wordspace: float = 0
80
+ self.scaling: float = 100
81
+ self.leading: float = 0
82
+ self.render: int = 0
83
+ self.rise: float = 0
84
+ self.reset()
85
+ # self.matrix is set
86
+ # self.linematrix is set
87
+
88
+ def __repr__(self) -> str:
89
+ return (
90
+ "<PDFTextState: font=%r, fontsize=%r, charspace=%r, "
91
+ "wordspace=%r, scaling=%r, leading=%r, render=%r, rise=%r, "
92
+ "matrix=%r, linematrix=%r>"
93
+ % (
94
+ self.font,
95
+ self.fontsize,
96
+ self.charspace,
97
+ self.wordspace,
98
+ self.scaling,
99
+ self.leading,
100
+ self.render,
101
+ self.rise,
102
+ self.matrix,
103
+ self.linematrix,
104
+ )
105
+ )
106
+
107
+ def copy(self) -> "PDFTextState":
108
+ obj = PDFTextState()
109
+ obj.font = self.font
110
+ obj.fontsize = self.fontsize
111
+ obj.charspace = self.charspace
112
+ obj.wordspace = self.wordspace
113
+ obj.scaling = self.scaling
114
+ obj.leading = self.leading
115
+ obj.render = self.render
116
+ obj.rise = self.rise
117
+ obj.matrix = self.matrix
118
+ obj.linematrix = self.linematrix
119
+ return obj
120
+
121
+ def reset(self) -> None:
122
+ self.matrix = MATRIX_IDENTITY
123
+ self.linematrix = (0, 0)
124
+
125
+
126
+ Color = Union[
127
+ float, # Greyscale
128
+ Tuple[float, float, float], # R, G, B
129
+ Tuple[float, float, float, float], # C, M, Y, K
130
+ ]
131
+
132
+
133
+ class PDFGraphicState:
134
+ def __init__(self) -> None:
135
+ self.linewidth: float = 0
136
+ self.linecap: Optional[object] = None
137
+ self.linejoin: Optional[object] = None
138
+ self.miterlimit: Optional[object] = None
139
+ self.dash: Optional[Tuple[object, object]] = None
140
+ self.intent: Optional[object] = None
141
+ self.flatness: Optional[object] = None
142
+
143
+ # stroking color
144
+ self.scolor: Optional[Color] = None
145
+
146
+ # non stroking color
147
+ self.ncolor: Optional[Color] = None
148
+
149
+ def copy(self) -> "PDFGraphicState":
150
+ obj = PDFGraphicState()
151
+ obj.linewidth = self.linewidth
152
+ obj.linecap = self.linecap
153
+ obj.linejoin = self.linejoin
154
+ obj.miterlimit = self.miterlimit
155
+ obj.dash = self.dash
156
+ obj.intent = self.intent
157
+ obj.flatness = self.flatness
158
+ obj.scolor = self.scolor
159
+ obj.ncolor = self.ncolor
160
+ return obj
161
+
162
+ def __repr__(self) -> str:
163
+ return (
164
+ "<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, "
165
+ " miterlimit=%r, dash=%r, intent=%r, flatness=%r, "
166
+ " stroking color=%r, non stroking color=%r>"
167
+ % (
168
+ self.linewidth,
169
+ self.linecap,
170
+ self.linejoin,
171
+ self.miterlimit,
172
+ self.dash,
173
+ self.intent,
174
+ self.flatness,
175
+ self.scolor,
176
+ self.ncolor,
177
+ )
178
+ )
179
+
180
+
181
+ class PDFResourceManager:
182
+ """Repository of shared resources.
183
+
184
+ ResourceManager facilitates reuse of shared resources
185
+ such as fonts and images so that large objects are not
186
+ allocated multiple times.
187
+ """
188
+
189
+ def __init__(self, caching: bool = True) -> None:
190
+ self.caching = caching
191
+ self._cached_fonts: Dict[object, PDFFont] = {}
192
+
193
+ def get_procset(self, procs: Sequence[object]) -> None:
194
+ for proc in procs:
195
+ if proc is LITERAL_PDF or proc is LITERAL_TEXT:
196
+ pass
197
+ else:
198
+ pass
199
+
200
+ def get_cmap(self, cmapname: str, strict: bool = False) -> CMapBase:
201
+ try:
202
+ return CMapDB.get_cmap(cmapname)
203
+ except CMapDB.CMapNotFound:
204
+ if strict:
205
+ raise
206
+ return CMap()
207
+
208
+ def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
209
+ if objid and objid in self._cached_fonts:
210
+ font = self._cached_fonts[objid]
211
+ else:
212
+ # log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
213
+ if settings.STRICT:
214
+ if spec["Type"] is not LITERAL_FONT:
215
+ raise PDFFontError("Type is not /Font")
216
+ # Create a Font object.
217
+ if "Subtype" in spec:
218
+ subtype = literal_name(spec["Subtype"])
219
+ else:
220
+ if settings.STRICT:
221
+ raise PDFFontError("Font Subtype is not specified.")
222
+ subtype = "Type1"
223
+ if subtype in ("Type1", "MMType1"):
224
+ # Type1 Font
225
+ font = PDFType1Font(self, spec)
226
+ elif subtype == "TrueType":
227
+ # TrueType Font
228
+ font = PDFTrueTypeFont(self, spec)
229
+ elif subtype == "Type3":
230
+ # Type3 Font
231
+ font = PDFType3Font(self, spec)
232
+ elif subtype in ("CIDFontType0", "CIDFontType2"):
233
+ # CID Font
234
+ font = PDFCIDFont(self, spec)
235
+ elif subtype == "Type0":
236
+ # Type0 Font
237
+ dfonts = list_value(spec["DescendantFonts"])
238
+ assert dfonts
239
+ subspec = dict_value(dfonts[0]).copy()
240
+ for k in ("Encoding", "ToUnicode"):
241
+ if k in spec:
242
+ subspec[k] = resolve1(spec[k])
243
+ font = self.get_font(None, subspec)
244
+ else:
245
+ if settings.STRICT:
246
+ raise PDFFontError("Invalid Font spec: %r" % spec)
247
+ font = PDFType1Font(self, spec) # this is so wrong!
248
+ if objid and self.caching:
249
+ self._cached_fonts[objid] = font
250
+ return font
251
+
252
+
253
+ class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
254
+ def __init__(self, streams: Sequence[object]) -> None:
255
+ self.streams = streams
256
+ self.istream = 0
257
+ # PSStackParser.__init__(fp=None) is safe only because we've overloaded
258
+ # all the methods that would attempt to access self.fp without first
259
+ # calling self.fillfp().
260
+ PSStackParser.__init__(self, None) # type: ignore[arg-type]
261
+
262
+ def fillfp(self) -> None:
263
+ if not self.fp:
264
+ if self.istream < len(self.streams):
265
+ strm = stream_value(self.streams[self.istream])
266
+ self.istream += 1
267
+ else:
268
+ raise PSEOF("Unexpected EOF, file truncated?")
269
+ self.fp = BytesIO(strm.get_data())
270
+ # if log.isEnabledFor(logging.DEBUG):
271
+ # log.debug(f'STREAM DATA {strm.get_data()}')
272
+
273
+ def seek(self, pos: int) -> None:
274
+ self.fillfp()
275
+ PSStackParser.seek(self, pos)
276
+
277
+ def fillbuf(self) -> None:
278
+ if self.charpos < len(self.buf):
279
+ return
280
+ while 1:
281
+ self.fillfp()
282
+ self.bufpos = self.fp.tell()
283
+ self.buf = self.fp.read(self.BUFSIZ)
284
+ if self.buf:
285
+ break
286
+ self.fp = None # type: ignore[assignment]
287
+ self.charpos = 0
288
+
289
+ def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]:
290
+ self.seek(pos)
291
+ i = 0
292
+ data = b""
293
+ while i <= len(target):
294
+ self.fillbuf()
295
+ if i:
296
+ ci = self.buf[self.charpos]
297
+ c = bytes((ci,))
298
+ data += c
299
+ self.charpos += 1
300
+ if (
301
+ len(target) <= i
302
+ and c.isspace()
303
+ or i < len(target)
304
+ and c == (bytes((target[i],)))
305
+ ):
306
+ i += 1
307
+ else:
308
+ i = 0
309
+ else:
310
+ try:
311
+ j = self.buf.index(target[0], self.charpos)
312
+ data += self.buf[self.charpos : j + 1]
313
+ self.charpos = j + 1
314
+ i = 1
315
+ except ValueError:
316
+ data += self.buf[self.charpos :]
317
+ self.charpos = len(self.buf)
318
+ data = data[: -(len(target) + 1)] # strip the last part
319
+ data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
320
+ return (pos, data)
321
+
322
+ def flush(self) -> None:
323
+ self.add_results(*self.popall())
324
+
325
+ KEYWORD_BI = KWD(b"BI")
326
+ KEYWORD_ID = KWD(b"ID")
327
+ KEYWORD_EI = KWD(b"EI")
328
+
329
+ def do_keyword(self, pos: int, token: PSKeyword) -> None:
330
+ if token is self.KEYWORD_BI:
331
+ # inline image within a content stream
332
+ self.start_type(pos, "inline")
333
+ elif token is self.KEYWORD_ID:
334
+ try:
335
+ (_, objs) = self.end_type("inline")
336
+ if len(objs) % 2 != 0:
337
+ error_msg = f"Invalid dictionary construct: {objs!r}"
338
+ raise PSTypeError(error_msg)
339
+ d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)}
340
+ eos = b"EI"
341
+ filter = d.get("F", None)
342
+ if filter is not None:
343
+ if isinstance(filter, PSLiteral):
344
+ filter = [filter]
345
+ if filter[0] in LITERALS_ASCII85_DECODE:
346
+ eos = b"~>"
347
+ (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
348
+ if eos != b"EI": # it may be necessary for decoding
349
+ data += eos
350
+ obj = PDFStream(d, data)
351
+ self.push((pos, obj))
352
+ if eos == b"EI": # otherwise it is still in the stream
353
+ self.push((pos, self.KEYWORD_EI))
354
+ except PSTypeError:
355
+ if settings.STRICT:
356
+ raise
357
+ else:
358
+ self.push((pos, token))
359
+
360
+
361
+ PDFStackT = PSStackType[PDFStream]
362
+ """Types that may appear on the PDF argument stack."""
363
+
364
+
365
+ class PDFPageInterpreter:
366
+ """Processor for the content of a PDF page
367
+
368
+ Reference: PDF Reference, Appendix A, Operator Summary
369
+ """
370
+
371
+ def __init__(
372
+ self, rsrcmgr: PDFResourceManager, device: PDFDevice, obj_patch
373
+ ) -> None:
374
+ self.rsrcmgr = rsrcmgr
375
+ self.device = device
376
+ self.obj_patch = obj_patch
377
+
378
+ def dup(self) -> "PDFPageInterpreter":
379
+ return self.__class__(self.rsrcmgr, self.device, self.obj_patch)
380
+
381
+ def init_resources(self, resources: Dict[object, object]) -> None:
382
+ """Prepare the fonts and XObjects listed in the Resource attribute."""
383
+ self.resources = resources
384
+ self.fontmap: Dict[object, PDFFont] = {}
385
+ self.fontid: Dict[PDFFont, object] = {}
386
+ self.xobjmap = {}
387
+ self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
388
+ if not resources:
389
+ return
390
+
391
+ def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
392
+ if isinstance(spec, list):
393
+ name = literal_name(spec[0])
394
+ else:
395
+ name = literal_name(spec)
396
+ if name == "ICCBased" and isinstance(spec, list) and len(spec) >= 2:
397
+ return PDFColorSpace(name, stream_value(spec[1])["N"])
398
+ elif name == "DeviceN" and isinstance(spec, list) and len(spec) >= 2:
399
+ return PDFColorSpace(name, len(list_value(spec[1])))
400
+ else:
401
+ return PREDEFINED_COLORSPACE.get(name)
402
+
403
+ for k, v in dict_value(resources).items():
404
+ # log.debug("Resource: %r: %r", k, v)
405
+ if k == "Font":
406
+ for fontid, spec in dict_value(v).items():
407
+ objid = None
408
+ if isinstance(spec, PDFObjRef):
409
+ objid = spec.objid
410
+ spec = dict_value(spec)
411
+ self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
412
+ self.fontid[self.fontmap[fontid]] = fontid
413
+ elif k == "ColorSpace":
414
+ for csid, spec in dict_value(v).items():
415
+ colorspace = get_colorspace(resolve1(spec))
416
+ if colorspace is not None:
417
+ self.csmap[csid] = colorspace
418
+ elif k == "ProcSet":
419
+ self.rsrcmgr.get_procset(list_value(v))
420
+ elif k == "XObject":
421
+ for xobjid, xobjstrm in dict_value(v).items():
422
+ self.xobjmap[xobjid] = xobjstrm
423
+
424
+ def init_state(self, ctm: Matrix) -> None:
425
+ """Initialize the text and graphic states for rendering a page."""
426
+ # gstack: stack for graphical states.
427
+ self.gstack: List[Tuple[Matrix, PDFTextState, PDFGraphicState]] = []
428
+ self.ctm = ctm
429
+ self.device.set_ctm(self.ctm)
430
+ self.textstate = PDFTextState()
431
+ self.graphicstate = PDFGraphicState()
432
+ self.curpath: List[PathSegment] = []
433
+ # argstack: stack for command arguments.
434
+ self.argstack: List[PDFStackT] = []
435
+ # set some global states.
436
+ self.scs: Optional[PDFColorSpace] = None
437
+ self.ncs: Optional[PDFColorSpace] = None
438
+ if self.csmap:
439
+ self.scs = self.ncs = next(iter(self.csmap.values()))
440
+
441
+ def push(self, obj: PDFStackT) -> None:
442
+ self.argstack.append(obj)
443
+
444
+ def pop(self, n: int) -> List[PDFStackT]:
445
+ if n == 0:
446
+ return []
447
+ x = self.argstack[-n:]
448
+ self.argstack = self.argstack[:-n]
449
+ return x
450
+
451
+ def get_current_state(self) -> Tuple[Matrix, PDFTextState, PDFGraphicState]:
452
+ return (self.ctm, self.textstate.copy(), self.graphicstate.copy())
453
+
454
+ def set_current_state(
455
+ self,
456
+ state: Tuple[Matrix, PDFTextState, PDFGraphicState],
457
+ ) -> None:
458
+ (self.ctm, self.textstate, self.graphicstate) = state
459
+ self.device.set_ctm(self.ctm)
460
+
461
+ def do_q(self) -> None:
462
+ """Save graphics state"""
463
+ self.gstack.append(self.get_current_state())
464
+
465
+ def do_Q(self) -> None:
466
+ """Restore graphics state"""
467
+ if self.gstack:
468
+ self.set_current_state(self.gstack.pop())
469
+
470
+ def do_cm(
471
+ self,
472
+ a1: PDFStackT,
473
+ b1: PDFStackT,
474
+ c1: PDFStackT,
475
+ d1: PDFStackT,
476
+ e1: PDFStackT,
477
+ f1: PDFStackT,
478
+ ) -> None:
479
+ """Concatenate matrix to current transformation matrix"""
480
+ self.ctm = mult_matrix(cast(Matrix, (a1, b1, c1, d1, e1, f1)), self.ctm)
481
+ self.device.set_ctm(self.ctm)
482
+
483
+ def do_w(self, linewidth: PDFStackT) -> None:
484
+ """Set line width"""
485
+ self.graphicstate.linewidth = cast(float, linewidth)
486
+
487
+ def do_J(self, linecap: PDFStackT) -> None:
488
+ """Set line cap style"""
489
+ self.graphicstate.linecap = linecap
490
+
491
+ def do_j(self, linejoin: PDFStackT) -> None:
492
+ """Set line join style"""
493
+ self.graphicstate.linejoin = linejoin
494
+
495
+ def do_M(self, miterlimit: PDFStackT) -> None:
496
+ """Set miter limit"""
497
+ self.graphicstate.miterlimit = miterlimit
498
+
499
+ def do_d(self, dash: PDFStackT, phase: PDFStackT) -> None:
500
+ """Set line dash pattern"""
501
+ self.graphicstate.dash = (dash, phase)
502
+
503
+ def do_ri(self, intent: PDFStackT) -> None:
504
+ """Set color rendering intent"""
505
+ self.graphicstate.intent = intent
506
+
507
+ def do_i(self, flatness: PDFStackT) -> None:
508
+ """Set flatness tolerance"""
509
+ self.graphicstate.flatness = flatness
510
+
511
+ def do_gs(self, name: PDFStackT) -> None:
512
+ """Set parameters from graphics state parameter dictionary"""
513
+ # TODO
514
+
515
+ def do_m(self, x: PDFStackT, y: PDFStackT) -> None:
516
+ """Begin new subpath"""
517
+ self.curpath.append(("m", cast(float, x), cast(float, y)))
518
+
519
+ def do_l(self, x: PDFStackT, y: PDFStackT) -> None:
520
+ """Append straight line segment to path"""
521
+ self.curpath.append(("l", cast(float, x), cast(float, y)))
522
+
523
+ def do_c(
524
+ self,
525
+ x1: PDFStackT,
526
+ y1: PDFStackT,
527
+ x2: PDFStackT,
528
+ y2: PDFStackT,
529
+ x3: PDFStackT,
530
+ y3: PDFStackT,
531
+ ) -> None:
532
+ """Append curved segment to path (three control points)"""
533
+ self.curpath.append(
534
+ (
535
+ "c",
536
+ cast(float, x1),
537
+ cast(float, y1),
538
+ cast(float, x2),
539
+ cast(float, y2),
540
+ cast(float, x3),
541
+ cast(float, y3),
542
+ ),
543
+ )
544
+
545
+ def do_v(self, x2: PDFStackT, y2: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
546
+ """Append curved segment to path (initial point replicated)"""
547
+ self.curpath.append(
548
+ ("v", cast(float, x2), cast(float, y2), cast(float, x3), cast(float, y3)),
549
+ )
550
+
551
+ def do_y(self, x1: PDFStackT, y1: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
552
+ """Append curved segment to path (final point replicated)"""
553
+ self.curpath.append(
554
+ ("y", cast(float, x1), cast(float, y1), cast(float, x3), cast(float, y3)),
555
+ )
556
+
557
+ def do_h(self) -> None:
558
+ """Close subpath"""
559
+ self.curpath.append(("h",))
560
+
561
+ def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None:
562
+ """Append rectangle to path"""
563
+ x = cast(float, x)
564
+ y = cast(float, y)
565
+ w = cast(float, w)
566
+ h = cast(float, h)
567
+ self.curpath.append(("m", x, y))
568
+ self.curpath.append(("l", x + w, y))
569
+ self.curpath.append(("l", x + w, y + h))
570
+ self.curpath.append(("l", x, y + h))
571
+ self.curpath.append(("h",))
572
+
573
+ def do_S(self) -> None:
574
+ """Stroke path"""
575
+
576
+ def is_black(color: Color) -> bool:
577
+ if isinstance(color, Tuple):
578
+ return sum(color) == 0
579
+ else:
580
+ return color == 0
581
+
582
+ if (
583
+ len(self.curpath) == 2
584
+ and self.curpath[0][0] == "m"
585
+ and self.curpath[1][0] == "l"
586
+ and apply_matrix_pt(self.ctm, self.curpath[0][-2:])[1]
587
+ == apply_matrix_pt(self.ctm, self.curpath[1][-2:])[1]
588
+ and is_black(self.graphicstate.scolor)
589
+ ): # standalone straight line, horizontal, black
590
+ # print(apply_matrix_pt(self.ctm,self.curpath[0][-2:]),apply_matrix_pt(self.ctm,self.curpath[1][-2:]),self.graphicstate.scolor)
591
+ self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
592
+ self.curpath = []
593
+ return "n"
594
+ else:
595
+ self.curpath = []
596
+
597
+ def do_s(self) -> None:
598
+ """Close and stroke path"""
599
+ self.do_h()
600
+ self.do_S()
601
+
602
+ def do_f(self) -> None:
603
+ """Fill path using nonzero winding number rule"""
604
+ # self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
605
+ self.curpath = []
606
+
607
+ def do_F(self) -> None:
608
+ """Fill path using nonzero winding number rule (obsolete)"""
609
+
610
+ def do_f_a(self) -> None:
611
+ """Fill path using even-odd rule"""
612
+ # self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
613
+ self.curpath = []
614
+
615
+ def do_B(self) -> None:
616
+ """Fill and stroke path using nonzero winding number rule"""
617
+ # self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
618
+ self.curpath = []
619
+
620
+ def do_B_a(self) -> None:
621
+ """Fill and stroke path using even-odd rule"""
622
+ # self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
623
+ self.curpath = []
624
+
625
+ def do_b(self) -> None:
626
+ """Close, fill, and stroke path using nonzero winding number rule"""
627
+ self.do_h()
628
+ self.do_B()
629
+
630
+ def do_b_a(self) -> None:
631
+ """Close, fill, and stroke path using even-odd rule"""
632
+ self.do_h()
633
+ self.do_B_a()
634
+
635
+ def do_n(self) -> None:
636
+ """End path without filling or stroking"""
637
+ self.curpath = []
638
+
639
+ def do_W(self) -> None:
640
+ """Set clipping path using nonzero winding number rule"""
641
+
642
+ def do_W_a(self) -> None:
643
+ """Set clipping path using even-odd rule"""
644
+
645
+ def do_CS(self, name: PDFStackT) -> None:
646
+ """Set color space for stroking operations
647
+
648
+ Introduced in PDF 1.1
649
+ """
650
+ try:
651
+ self.scs = self.csmap[literal_name(name)]
652
+ except KeyError:
653
+ if settings.STRICT:
654
+ raise PDFInterpreterError("Undefined ColorSpace: %r" % name)
655
+
656
+ def do_cs(self, name: PDFStackT) -> None:
657
+ """Set color space for nonstroking operations"""
658
+ try:
659
+ self.ncs = self.csmap[literal_name(name)]
660
+ except KeyError:
661
+ if settings.STRICT:
662
+ raise PDFInterpreterError("Undefined ColorSpace: %r" % name)
663
+
664
+ def do_G(self, gray: PDFStackT) -> None:
665
+ """Set gray level for stroking operations"""
666
+ self.graphicstate.scolor = cast(float, gray)
667
+ self.scs = self.csmap["DeviceGray"]
668
+
669
+ def do_g(self, gray: PDFStackT) -> None:
670
+ """Set gray level for nonstroking operations"""
671
+ self.graphicstate.ncolor = cast(float, gray)
672
+ self.ncs = self.csmap["DeviceGray"]
673
+
674
+ def do_RG(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
675
+ """Set RGB color for stroking operations"""
676
+ self.graphicstate.scolor = (cast(float, r), cast(float, g), cast(float, b))
677
+ self.scs = self.csmap["DeviceRGB"]
678
+
679
+ def do_rg(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
680
+ """Set RGB color for nonstroking operations"""
681
+ self.graphicstate.ncolor = (cast(float, r), cast(float, g), cast(float, b))
682
+ self.ncs = self.csmap["DeviceRGB"]
683
+
684
+ def do_K(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
685
+ """Set CMYK color for stroking operations"""
686
+ self.graphicstate.scolor = (
687
+ cast(float, c),
688
+ cast(float, m),
689
+ cast(float, y),
690
+ cast(float, k),
691
+ )
692
+ self.scs = self.csmap["DeviceCMYK"]
693
+
694
+ def do_k(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
695
+ """Set CMYK color for nonstroking operations"""
696
+ self.graphicstate.ncolor = (
697
+ cast(float, c),
698
+ cast(float, m),
699
+ cast(float, y),
700
+ cast(float, k),
701
+ )
702
+ self.ncs = self.csmap["DeviceCMYK"]
703
+
704
+ def do_SCN(self) -> None:
705
+ """Set color for stroking operations."""
706
+ if self.scs:
707
+ n = self.scs.ncomponents
708
+ else:
709
+ if settings.STRICT:
710
+ raise PDFInterpreterError("No colorspace specified!")
711
+ n = 1
712
+ args = self.pop(n)
713
+ self.graphicstate.scolor = cast(Color, args)
714
+ return args
715
+
716
+ def do_scn(self) -> None:
717
+ """Set color for nonstroking operations"""
718
+ if self.ncs:
719
+ n = self.ncs.ncomponents
720
+ else:
721
+ if settings.STRICT:
722
+ raise PDFInterpreterError("No colorspace specified!")
723
+ n = 1
724
+ args = self.pop(n)
725
+ self.graphicstate.ncolor = cast(Color, args)
726
+ return args
727
+
728
+ def do_SC(self) -> None:
729
+ """Set color for stroking operations"""
730
+ return self.do_SCN()
731
+
732
+ def do_sc(self) -> None:
733
+ """Set color for nonstroking operations"""
734
+ return self.do_scn()
735
+
736
+ def do_sh(self, name: object) -> None:
737
+ """Paint area defined by shading pattern"""
738
+
739
+ def do_BT(self) -> None:
740
+ """Begin text object
741
+
742
+ Initializing the text matrix, Tm, and the text line matrix, Tlm, to
743
+ the identity matrix. Text objects cannot be nested; a second BT cannot
744
+ appear before an ET.
745
+ """
746
+ self.textstate.reset()
747
+
748
+ def do_ET(self) -> None:
749
+ """End a text object"""
750
+
751
+ def do_BX(self) -> None:
752
+ """Begin compatibility section"""
753
+
754
+ def do_EX(self) -> None:
755
+ """End compatibility section"""
756
+
757
+ def do_MP(self, tag: PDFStackT) -> None:
758
+ """Define marked-content point"""
759
+ self.device.do_tag(cast(PSLiteral, tag))
760
+
761
+ def do_DP(self, tag: PDFStackT, props: PDFStackT) -> None:
762
+ """Define marked-content point with property list"""
763
+ self.device.do_tag(cast(PSLiteral, tag), props)
764
+
765
+ def do_BMC(self, tag: PDFStackT) -> None:
766
+ """Begin marked-content sequence"""
767
+ self.device.begin_tag(cast(PSLiteral, tag))
768
+
769
+ def do_BDC(self, tag: PDFStackT, props: PDFStackT) -> None:
770
+ """Begin marked-content sequence with property list"""
771
+ self.device.begin_tag(cast(PSLiteral, tag), props)
772
+
773
+ def do_EMC(self) -> None:
774
+ """End marked-content sequence"""
775
+ self.device.end_tag()
776
+
777
+ def do_Tc(self, space: PDFStackT) -> None:
778
+ """Set character spacing.
779
+
780
+ Character spacing is used by the Tj, TJ, and ' operators.
781
+
782
+ :param space: a number expressed in unscaled text space units.
783
+ """
784
+ self.textstate.charspace = cast(float, space)
785
+
786
+ def do_Tw(self, space: PDFStackT) -> None:
787
+ """Set the word spacing.
788
+
789
+ Word spacing is used by the Tj, TJ, and ' operators.
790
+
791
+ :param space: a number expressed in unscaled text space units
792
+ """
793
+ self.textstate.wordspace = cast(float, space)
794
+
795
+ def do_Tz(self, scale: PDFStackT) -> None:
796
+ """Set the horizontal scaling.
797
+
798
+ :param scale: is a number specifying the percentage of the normal width
799
+ """
800
+ self.textstate.scaling = cast(float, scale)
801
+
802
+ def do_TL(self, leading: PDFStackT) -> None:
803
+ """Set the text leading.
804
+
805
+ Text leading is used only by the T*, ', and " operators.
806
+
807
+ :param leading: a number expressed in unscaled text space units
808
+ """
809
+ self.textstate.leading = -cast(float, leading)
810
+
811
+ def do_Tf(self, fontid: PDFStackT, fontsize: PDFStackT) -> None:
812
+ """Set the text font
813
+
814
+ :param fontid: the name of a font resource in the Font subdictionary
815
+ of the current resource dictionary
816
+ :param fontsize: size is a number representing a scale factor.
817
+ """
818
+ try:
819
+ self.textstate.font = self.fontmap[literal_name(fontid)]
820
+ except KeyError:
821
+ if settings.STRICT:
822
+ raise PDFInterpreterError("Undefined Font id: %r" % fontid)
823
+ self.textstate.font = self.rsrcmgr.get_font(None, {})
824
+ self.textstate.fontsize = cast(float, fontsize)
825
+
826
+ def do_Tr(self, render: PDFStackT) -> None:
827
+ """Set the text rendering mode"""
828
+ self.textstate.render = cast(int, render)
829
+
830
+ def do_Ts(self, rise: PDFStackT) -> None:
831
+ """Set the text rise
832
+
833
+ :param rise: a number expressed in unscaled text space units
834
+ """
835
+ self.textstate.rise = cast(float, rise)
836
+
837
+ def do_Td(self, tx: PDFStackT, ty: PDFStackT) -> None:
838
+ """Move to the start of the next line
839
+
840
+ Offset from the start of the current line by (tx , ty).
841
+ """
842
+ tx_ = safe_float(tx)
843
+ ty_ = safe_float(ty)
844
+ if tx_ is not None and ty_ is not None:
845
+ (a, b, c, d, e, f) = self.textstate.matrix
846
+ e_new = tx_ * a + ty_ * c + e
847
+ f_new = tx_ * b + ty_ * d + f
848
+ self.textstate.matrix = (a, b, c, d, e_new, f_new)
849
+
850
+ elif settings.STRICT:
851
+ raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for Td")
852
+
853
+ self.textstate.linematrix = (0, 0)
854
+
855
+ def do_TD(self, tx: PDFStackT, ty: PDFStackT) -> None:
856
+ """Move to the start of the next line.
857
+
858
+ offset from the start of the current line by (tx , ty). As a side effect, this
859
+ operator sets the leading parameter in the text state.
860
+ """
861
+ tx_ = safe_float(tx)
862
+ ty_ = safe_float(ty)
863
+
864
+ if tx_ is not None and ty_ is not None:
865
+ (a, b, c, d, e, f) = self.textstate.matrix
866
+ e_new = tx_ * a + ty_ * c + e
867
+ f_new = tx_ * b + ty_ * d + f
868
+ self.textstate.matrix = (a, b, c, d, e_new, f_new)
869
+
870
+ elif settings.STRICT:
871
+ raise PDFValueError("Invalid offset ({tx}, {ty}) for TD")
872
+
873
+ if ty_ is not None:
874
+ self.textstate.leading = ty_
875
+
876
+ self.textstate.linematrix = (0, 0)
877
+
878
+ def do_Tm(
879
+ self,
880
+ a: PDFStackT,
881
+ b: PDFStackT,
882
+ c: PDFStackT,
883
+ d: PDFStackT,
884
+ e: PDFStackT,
885
+ f: PDFStackT,
886
+ ) -> None:
887
+ """Set text matrix and text line matrix"""
888
+ self.textstate.matrix = cast(Matrix, (a, b, c, d, e, f))
889
+ self.textstate.linematrix = (0, 0)
890
+
891
+ def do_T_a(self) -> None:
892
+ """Move to start of next text line"""
893
+ (a, b, c, d, e, f) = self.textstate.matrix
894
+ self.textstate.matrix = (
895
+ a,
896
+ b,
897
+ c,
898
+ d,
899
+ self.textstate.leading * c + e,
900
+ self.textstate.leading * d + f,
901
+ )
902
+ self.textstate.linematrix = (0, 0)
903
+
904
+ def do_TJ(self, seq: PDFStackT) -> None:
905
+ """Show text, allowing individual glyph positioning"""
906
+ if self.textstate.font is None:
907
+ if settings.STRICT:
908
+ raise PDFInterpreterError("No font specified!")
909
+ return
910
+ assert self.ncs is not None
911
+ self.device.render_string(
912
+ self.textstate,
913
+ cast(PDFTextSeq, seq),
914
+ self.ncs,
915
+ self.graphicstate.copy(),
916
+ )
917
+
918
+ def do_Tj(self, s: PDFStackT) -> None:
919
+ """Show text"""
920
+ self.do_TJ([s])
921
+
922
+ def do__q(self, s: PDFStackT) -> None:
923
+ """Move to next line and show text
924
+
925
+ The ' (single quote) operator.
926
+ """
927
+ self.do_T_a()
928
+ self.do_TJ([s])
929
+
930
+ def do__w(self, aw: PDFStackT, ac: PDFStackT, s: PDFStackT) -> None:
931
+ """Set word and character spacing, move to next line, and show text
932
+
933
+ The " (double quote) operator.
934
+ """
935
+ self.do_Tw(aw)
936
+ self.do_Tc(ac)
937
+ self.do_TJ([s])
938
+
939
+ def do_BI(self) -> None:
940
+ """Begin inline image object"""
941
+
942
+ def do_ID(self) -> None:
943
+ """Begin inline image data"""
944
+
945
+ def do_EI(self, obj: PDFStackT) -> None:
946
+ """End inline image object"""
947
+ if isinstance(obj, PDFStream) and "W" in obj and "H" in obj:
948
+ iobjid = str(id(obj))
949
+ self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
950
+ self.device.render_image(iobjid, obj)
951
+ self.device.end_figure(iobjid)
952
+
953
+ def do_Do(self, xobjid_arg: PDFStackT) -> None:
954
+ """Invoke named XObject"""
955
+ xobjid = literal_name(xobjid_arg)
956
+ try:
957
+ xobj = stream_value(self.xobjmap[xobjid])
958
+ except KeyError:
959
+ if settings.STRICT:
960
+ raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
961
+ return
962
+ # log.debug("Processing xobj: %r", xobj)
963
+ subtype = xobj.get("Subtype")
964
+ if subtype is LITERAL_FORM and "BBox" in xobj:
965
+ interpreter = self.dup()
966
+ bbox = cast(Rect, list_value(xobj["BBox"]))
967
+ matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
968
+ # According to PDF reference 1.7 section 4.9.1, XObjects in
969
+ # earlier PDFs (prior to v1.2) use the page's Resources entry
970
+ # instead of having their own Resources entry.
971
+ xobjres = xobj.get("Resources")
972
+ if xobjres:
973
+ resources = dict_value(xobjres)
974
+ else:
975
+ resources = self.resources.copy()
976
+ self.device.begin_figure(xobjid, bbox, matrix)
977
+ ctm = mult_matrix(matrix, self.ctm)
978
+ ops_base = interpreter.render_contents(
979
+ resources,
980
+ [xobj],
981
+ ctm=ctm,
982
+ )
983
+ try: # sometimes the form font cannot be attached and this part would break
984
+ self.device.fontid = interpreter.fontid
985
+ self.device.fontmap = interpreter.fontmap
986
+ ops_new = self.device.end_figure(xobjid)
987
+ ctm_inv = np.linalg.inv(np.array(ctm[:4]).reshape(2, 2))
988
+ pos_inv = -np.mat(ctm[4:]) * ctm_inv
989
+ a, b, c, d = ctm_inv.reshape(4).tolist()
990
+ e, f = pos_inv.tolist()[0]
991
+ self.obj_patch[self.xobjmap[xobjid].objid] = (
992
+ f"q {ops_base}Q {a} {b} {c} {d} {e} {f} cm {ops_new}"
993
+ )
994
+ except Exception:
995
+ pass
996
+ elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
997
+ self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
998
+ self.device.render_image(xobjid, xobj)
999
+ self.device.end_figure(xobjid)
1000
+ else:
1001
+ # unsupported xobject type.
1002
+ pass
1003
+
1004
+ def process_page(self, page: PDFPage) -> None:
1005
+ # log.debug("Processing page: %r", page)
1006
+ # print(page.mediabox,page.cropbox)
1007
+ # (x0, y0, x1, y1) = page.mediabox
1008
+ (x0, y0, x1, y1) = page.cropbox
1009
+ if page.rotate == 90:
1010
+ ctm = (0, -1, 1, 0, -y0, x1)
1011
+ elif page.rotate == 180:
1012
+ ctm = (-1, 0, 0, -1, x1, y1)
1013
+ elif page.rotate == 270:
1014
+ ctm = (0, 1, -1, 0, y1, -x0)
1015
+ else:
1016
+ ctm = (1, 0, 0, 1, -x0, -y0)
1017
+ self.device.begin_page(page, ctm)
1018
+ ops_base = self.render_contents(page.resources, page.contents, ctm=ctm)
1019
+ self.device.fontid = self.fontid
1020
+ self.device.fontmap = self.fontmap
1021
+ ops_new = self.device.end_page(page)
1022
+ # Rendering above subtracts the page offset (from the cropbox) to get real coordinates; when writing output here, a cm operator adds the page offset back
1023
+ self.obj_patch[page.page_xref] = (
1024
+ f"q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}" # ops_base 里可能有图,需要让 ops_new 里的文字覆盖在上面,使用 q/Q 重置位置矩阵
1025
+ )
1026
+ for obj in page.contents:
1027
+ self.obj_patch[obj.objid] = ""
1028
+
1029
+ def render_contents(
1030
+ self,
1031
+ resources: Dict[object, object],
1032
+ streams: Sequence[object],
1033
+ ctm: Matrix = MATRIX_IDENTITY,
1034
+ ) -> None:
1035
+ """Render the content streams.
1036
+
1037
+ This method may be called recursively.
1038
+ """
1039
+ # log.debug(
1040
+ # "render_contents: resources=%r, streams=%r, ctm=%r",
1041
+ # resources,
1042
+ # streams,
1043
+ # ctm,
1044
+ # )
1045
+ self.init_resources(resources)
1046
+ self.init_state(ctm)
1047
+ return self.execute(list_value(streams))
1048
+
1049
+ def execute(self, streams: Sequence[object]) -> None:
1050
+ ops = ""
1051
+ try:
1052
+ parser = PDFContentParser(streams)
1053
+ except PSEOF:
1054
+ # empty page
1055
+ return
1056
+ while True:
1057
+ try:
1058
+ _, (_, obj) = parser.nextobject()
1059
+ except PSEOF:
1060
+ break
1061
+ if isinstance(obj, PSKeyword):
1062
+ name = keyword_name(obj)
1063
+ method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace(
1064
+ "'",
1065
+ "_q",
1066
+ )
1067
+ if hasattr(self, method):
1068
+ func = getattr(self, method)
1069
+ nargs = func.__code__.co_argcount - 1
1070
+ if nargs:
1071
+ args = self.pop(nargs)
1072
+ # log.debug("exec: %s %r", name, args)
1073
+ if len(args) == nargs:
1074
+ func(*args)
1075
+ if not (
1076
+ name[0] == "T"
1077
+ or name in ['"', "'", "EI", "MP", "DP", "BMC", "BDC"]
1078
+ ): # filter out T-series text operators; also filter EI, whose argument is an obj (only used when drawing horizontal rules in a few documents), and the marked-content operators
1079
+ p = " ".join(
1080
+ [
1081
+ (
1082
+ f"{x:f}"
1083
+ if isinstance(x, float)
1084
+ else str(x).replace("'", "")
1085
+ )
1086
+ for x in args
1087
+ ]
1088
+ )
1089
+ ops += f"{p} {name} "
1090
+ else:
1091
+ # log.debug("exec: %s", name)
1092
+ targs = func()
1093
+ if targs is None:
1094
+ targs = []
1095
+ if not (name[0] == "T" or name in ["BI", "ID", "EMC"]):
1096
+ p = " ".join(
1097
+ [
1098
+ (
1099
+ f"{x:f}"
1100
+ if isinstance(x, float)
1101
+ else str(x).replace("'", "")
1102
+ )
1103
+ for x in targs
1104
+ ]
1105
+ )
1106
+ ops += f"{p} {name} "
1107
+ elif settings.STRICT:
1108
+ error_msg = "Unknown operator: %r" % name
1109
+ raise PDFInterpreterError(error_msg)
1110
+ else:
1111
+ self.push(obj)
1112
+ # print('REV DATA',ops)
1113
+ return ops
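For orientation, a hedged sketch of how the classes in this module are driven; `SomeDevice` is a hypothetical stand-in for a concrete PDFDevice subclass from pdf2zh.pdfdevice, and "example.pdf" is a placeholder path:

    from pdf2zh.pdfinterp import PDFPageInterpreter, PDFResourceManager
    from pdf2zh.pdfpage import PDFPage

    rsrcmgr = PDFResourceManager(caching=True)
    obj_patch = {}                    # collects per-object content-stream patches
    device = SomeDevice(rsrcmgr)      # hypothetical PDFDevice subclass
    with open("example.pdf", "rb") as fp:
        interpreter = PDFPageInterpreter(rsrcmgr, device, obj_patch)
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)  # fills obj_patch for this page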
pdf2zh/pdfpage.py ADDED
@@ -0,0 +1,196 @@
1
+ import itertools
2
+ import logging
3
+ from typing import Any, BinaryIO, Container, Dict, Iterator, List, Optional, Set, Tuple
4
+
5
+ from pdf2zh import settings
6
+ from pdf2zh.pdfdocument import (
7
+ PDFDocument,
8
+ PDFNoPageLabels,
9
+ PDFTextExtractionNotAllowed,
10
+ )
11
+ from pdf2zh.pdfexceptions import PDFObjectNotFound, PDFValueError
12
+ from pdf2zh.pdfparser import PDFParser
13
+ from pdf2zh.pdftypes import dict_value, int_value, list_value, resolve1
14
+ from pdf2zh.psparser import LIT
15
+ from pdf2zh.utils import parse_rect
16
+
17
+ log = logging.getLogger(__name__)
18
+
19
+ # some predefined literals and keywords.
20
+ LITERAL_PAGE = LIT("Page")
21
+ LITERAL_PAGES = LIT("Pages")
22
+
23
+
24
+ class PDFPage:
25
+ """An object that holds the information about a page.
26
+
27
+ A PDFPage object is merely a convenience class that has a set
28
+ of keys and values, which describe the properties of a page
29
+ and point to its contents.
30
+
31
+ Attributes
32
+ ----------
33
+ doc: a PDFDocument object.
34
+ pageid: any Python object that can uniquely identify the page.
35
+ attrs: a dictionary of page attributes.
36
+ contents: a list of PDFStream objects that represents the page content.
37
+ lastmod: the last modified time of the page.
38
+ resources: a dictionary of resources used by the page.
39
+ mediabox: the physical size of the page.
40
+ cropbox: the crop rectangle of the page.
41
+ rotate: the page rotation (in degree).
42
+ annots: the page annotations.
43
+ beads: a chain that represents natural reading order.
44
+ label: the page's label (typically, the logical page number).
45
+
46
+ """
47
+
48
+ def __init__(
49
+ self,
50
+ doc: PDFDocument,
51
+ pageid: object,
52
+ attrs: object,
53
+ label: Optional[str],
54
+ ) -> None:
55
+ """Initialize a page object.
56
+
57
+ doc: a PDFDocument object.
58
+ pageid: any Python object that can uniquely identify the page.
59
+ attrs: a dictionary of page attributes.
60
+ label: page label string.
61
+ """
62
+ self.doc = doc
63
+ self.pageid = pageid
64
+ self.pageno = 0
65
+ self.attrs = dict_value(attrs)
66
+ self.label = label
67
+ self.lastmod = resolve1(self.attrs.get("LastModified"))
68
+ self.resources: Dict[object, object] = resolve1(
69
+ self.attrs.get("Resources", dict()),
70
+ )
71
+ mediabox_params: List[Any] = [
72
+ resolve1(mediabox_param) for mediabox_param in self.attrs["MediaBox"]
73
+ ]
74
+ self.mediabox = parse_rect(resolve1(mediabox_params))
75
+ self.cropbox = self.mediabox
76
+ if "CropBox" in self.attrs:
77
+ try:
78
+ self.cropbox = parse_rect(resolve1(self.attrs["CropBox"]))
79
+ except PDFValueError:
80
+ pass
81
+
82
+ self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
83
+ self.annots = self.attrs.get("Annots")
84
+ self.beads = self.attrs.get("B")
85
+ if "Contents" in self.attrs:
86
+ contents = resolve1(self.attrs["Contents"])
87
+ else:
88
+ contents = []
89
+ if not isinstance(contents, list):
90
+ contents = [contents]
91
+ self.contents: List[object] = contents
92
+
93
+ def __repr__(self) -> str:
94
+ return f"<PDFPage: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"
95
+
96
+ INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}
97
+
98
+ @classmethod
99
+ def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
100
+ def depth_first_search(
101
+ obj: Any,
102
+ parent: Dict[str, Any],
103
+ visited: Optional[Set[Any]] = None,
104
+ ) -> Iterator[Tuple[int, Dict[Any, Dict[Any, Any]]]]:
105
+ if isinstance(obj, int):
106
+ object_id = obj
107
+ object_properties = dict_value(document.getobj(object_id)).copy()
108
+ else:
109
+ # This looks broken. obj.objid means obj could be either
110
+ # PDFObjRef or PDFStream, but neither is valid for dict_value.
111
+ object_id = obj.objid # type: ignore[attr-defined]
112
+ object_properties = dict_value(obj).copy()
113
+
114
+ # Avoid recursion errors by keeping track of visited nodes
115
+ if visited is None:
116
+ visited = set()
117
+ if object_id in visited:
118
+ return
119
+ visited.add(object_id)
120
+
121
+ for k, v in parent.items():
122
+ if k in cls.INHERITABLE_ATTRS and k not in object_properties:
123
+ object_properties[k] = v
124
+
125
+ object_type = object_properties.get("Type")
126
+ if object_type is None and not settings.STRICT: # See #64
127
+ object_type = object_properties.get("type")
128
+
129
+ if object_type is LITERAL_PAGES and "Kids" in object_properties:
130
+ # log.debug("Pages: Kids=%r", object_properties["Kids"])
131
+ for child in list_value(object_properties["Kids"]):
132
+ yield from depth_first_search(child, object_properties, visited)
133
+
134
+ elif object_type is LITERAL_PAGE:
135
+ # log.debug("Page: %r", object_properties)
136
+ yield (object_id, object_properties)
137
+
138
+ try:
139
+ page_labels: Iterator[Optional[str]] = document.get_page_labels()
140
+ except PDFNoPageLabels:
141
+ page_labels = itertools.repeat(None)
142
+
143
+ pages = False
144
+ if "Pages" in document.catalog:
145
+ objects = depth_first_search(document.catalog["Pages"], document.catalog)
146
+ for objid, tree in objects:
147
+ yield cls(document, objid, tree, next(page_labels))
148
+ pages = True
149
+ if not pages:
150
+ # fallback when /Pages is missing.
151
+ for xref in document.xrefs:
152
+ for objid in xref.get_objids():
153
+ try:
154
+ obj = document.getobj(objid)
155
+ if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
156
+ yield cls(document, objid, obj, next(page_labels))
157
+ except PDFObjectNotFound:
158
+ pass
159
+
160
+ @classmethod
161
+ def get_pages(
162
+ cls,
163
+ fp: BinaryIO,
164
+ pagenos: Optional[Container[int]] = None,
165
+ maxpages: int = 0,
166
+ password: str = "",
167
+ caching: bool = True,
168
+ check_extractable: bool = False,
169
+ ) -> Iterator["PDFPage"]:
170
+ # Create a PDF parser object associated with the file object.
171
+ parser = PDFParser(fp)
172
+ # Create a PDF document object that stores the document structure.
173
+ doc = PDFDocument(parser, password=password, caching=caching)
174
+ # Check if the document allows text extraction.
175
+ # If not, warn the user and proceed.
176
+ if not doc.is_extractable:
177
+ if check_extractable:
178
+ error_msg = "Text extraction is not allowed: %r" % fp
179
+ raise PDFTextExtractionNotAllowed(error_msg)
180
+ else:
181
+ warning_msg = (
182
+ "The PDF %r contains a metadata field "
183
+ "indicating that it should not allow "
184
+ "text extraction. Ignoring this field "
185
+ "and proceeding. Use the check_extractable "
186
+ "if you want to raise an error in this case" % fp
187
+ )
188
+ log.warning(warning_msg)
189
+ # Process each page contained in the document.
190
+ for pageno, page in enumerate(cls.create_pages(doc)):
191
+ page.pageno = pageno
192
+ if pagenos and (pageno not in pagenos):
193
+ continue
194
+ yield page
195
+ if maxpages and maxpages <= pageno + 1:
196
+ break
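A small usage sketch for the PDFPage.get_pages() classmethod defined above; the file name is a placeholder and only keyword arguments from its signature are used:

    from pdf2zh.pdfpage import PDFPage

    with open("example.pdf", "rb") as fp:
        for page in PDFPage.get_pages(fp, maxpages=3, check_extractable=False):
            print(page.pageno, page.label, page.mediabox, page.rotate)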
pdf2zh/pdfparser.py ADDED
@@ -0,0 +1,166 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from typing import TYPE_CHECKING, BinaryIO, Optional, Union
4
+
5
+ from pdf2zh import settings
6
+ from pdf2zh.casting import safe_int
7
+ from pdf2zh.pdfexceptions import PDFException
8
+ from pdf2zh.pdftypes import PDFObjRef, PDFStream, dict_value, int_value
9
+ from pdf2zh.psexceptions import PSEOF
10
+ from pdf2zh.psparser import KWD, PSKeyword, PSStackParser
11
+
12
+ if TYPE_CHECKING:
13
+ from pdf2zh.pdfdocument import PDFDocument
14
+
15
+ log = logging.getLogger(__name__)
16
+
17
+
18
+ class PDFSyntaxError(PDFException):
19
+ pass
20
+
21
+
22
+ # PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None
23
+ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
24
+ """PDFParser fetch PDF objects from a file stream.
25
+ It can handle indirect references by referring to
26
+ a PDF document set by set_document method.
27
+ It also reads XRefs at the end of every PDF file.
28
+
29
+ Typical usage:
30
+ parser = PDFParser(fp)
31
+ parser.read_xref()
32
+ parser.read_xref(fallback=True) # optional
33
+ parser.set_document(doc)
34
+ parser.seek(offset)
35
+ parser.nextobject()
36
+
37
+ """
38
+
39
+ def __init__(self, fp: BinaryIO) -> None:
40
+ PSStackParser.__init__(self, fp)
41
+ self.doc: Optional[PDFDocument] = None
42
+ self.fallback = False
43
+
44
+ def set_document(self, doc: "PDFDocument") -> None:
45
+ """Associates the parser with a PDFDocument object."""
46
+ self.doc = doc
47
+
48
+ KEYWORD_R = KWD(b"R")
49
+ KEYWORD_NULL = KWD(b"null")
50
+ KEYWORD_ENDOBJ = KWD(b"endobj")
51
+ KEYWORD_STREAM = KWD(b"stream")
52
+ KEYWORD_XREF = KWD(b"xref")
53
+ KEYWORD_STARTXREF = KWD(b"startxref")
54
+
55
+ def do_keyword(self, pos: int, token: PSKeyword) -> None:
56
+ """Handles PDF-related keywords."""
57
+ if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
58
+ self.add_results(*self.pop(1))
59
+
60
+ elif token is self.KEYWORD_ENDOBJ:
61
+ self.add_results(*self.pop(4))
62
+
63
+ elif token is self.KEYWORD_NULL:
64
+ # null object
65
+ self.push((pos, None))
66
+
67
+ elif token is self.KEYWORD_R:
68
+ # reference to indirect object
69
+ if len(self.curstack) >= 2:
70
+ (_, _object_id), _ = self.pop(2)
71
+ object_id = safe_int(_object_id)
72
+ if object_id is not None:
73
+ obj = PDFObjRef(self.doc, object_id)
74
+ self.push((pos, obj))
75
+
76
+ elif token is self.KEYWORD_STREAM:
77
+ # stream object
78
+ ((_, dic),) = self.pop(1)
79
+ dic = dict_value(dic)
80
+ objlen = 0
81
+ if not self.fallback:
82
+ try:
83
+ objlen = int_value(dic["Length"])
84
+ except KeyError:
85
+ if settings.STRICT:
86
+ raise PDFSyntaxError("/Length is undefined: %r" % dic)
87
+ self.seek(pos)
88
+ try:
89
+ (_, line) = self.nextline() # 'stream'
90
+ except PSEOF:
91
+ if settings.STRICT:
92
+ raise PDFSyntaxError("Unexpected EOF")
93
+ return
94
+ pos += len(line)
95
+ self.fp.seek(pos)
96
+ data = bytearray(self.fp.read(objlen))
97
+ self.seek(pos + objlen)
98
+ while 1:
99
+ try:
100
+ (linepos, line) = self.nextline()
101
+ except PSEOF:
102
+ if settings.STRICT:
103
+ raise PDFSyntaxError("Unexpected EOF")
104
+ break
105
+ if b"endstream" in line:
106
+ i = line.index(b"endstream")
107
+ objlen += i
108
+ if self.fallback:
109
+ data += line[:i]
110
+ break
111
+ objlen += len(line)
112
+ if self.fallback:
113
+ data += line
114
+ self.seek(pos + objlen)
115
+ # XXX limit objlen not to exceed object boundary
116
+ # log.debug(
117
+ # "Stream: pos=%d, objlen=%d, dic=%r, data=%r...",
118
+ # pos,
119
+ # objlen,
120
+ # dic,
121
+ # data[:10],
122
+ # )
123
+ assert self.doc is not None
124
+ stream = PDFStream(dic, bytes(data), self.doc.decipher)
125
+ self.push((pos, stream))
126
+
127
+ else:
128
+ # others
129
+ self.push((pos, token))
130
+
131
+
132
+ class PDFStreamParser(PDFParser):
133
+ """PDFStreamParser is used to parse PDF content streams
134
+ that is contained in each page and has instructions
135
+ for rendering the page. A reference to a PDF document is
136
+ needed because a PDF content stream can also have
137
+ indirect references to other objects in the same document.
138
+ """
139
+
140
+ def __init__(self, data: bytes) -> None:
141
+ PDFParser.__init__(self, BytesIO(data))
142
+
143
+ def flush(self) -> None:
144
+ self.add_results(*self.popall())
145
+
146
+ KEYWORD_OBJ = KWD(b"obj")
147
+
148
+ def do_keyword(self, pos: int, token: PSKeyword) -> None:
149
+ if token is self.KEYWORD_R:
150
+ # reference to indirect object
151
+ (_, _object_id), _ = self.pop(2)
152
+ object_id = safe_int(_object_id)
153
+ if object_id is not None:
154
+ obj = PDFObjRef(self.doc, object_id)
155
+ self.push((pos, obj))
156
+ return
157
+
158
+ elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
159
+ if settings.STRICT:
160
+ # See PDF Spec 3.4.6: Only the object values are stored in the
161
+ # stream; the obj and endobj keywords are not used.
162
+ raise PDFSyntaxError("Keyword endobj found in stream")
163
+ return
164
+
165
+ # others
166
+ self.push((pos, token))
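A hedged sketch mirroring how PDFPage.get_pages() wires up this parser: build a PDFParser over a binary file object, hand it to a PDFDocument, and query the document; "example.pdf" is a placeholder:

    from pdf2zh.pdfdocument import PDFDocument
    from pdf2zh.pdfparser import PDFParser

    with open("example.pdf", "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password="", caching=True)
        print(doc.is_extractable)  # same flag get_pages() checks above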
pdf2zh/pdftypes.py ADDED
@@ -0,0 +1,397 @@
1
+ import io
2
+ import logging
3
+ import zlib
4
+ from typing import (
5
+ TYPE_CHECKING,
6
+ Any,
7
+ Dict,
8
+ Iterable,
9
+ List,
10
+ Optional,
11
+ Protocol,
12
+ Tuple,
13
+ Union,
14
+ cast,
15
+ )
16
+ from warnings import warn
17
+
18
+ from pdf2zh import pdfexceptions, settings
19
+ from pdf2zh.ascii85 import ascii85decode, asciihexdecode
20
+ from pdf2zh.ccitt import ccittfaxdecode
21
+ from pdf2zh.lzw import lzwdecode
22
+ from pdf2zh.psparser import LIT, PSObject
23
+ from pdf2zh.runlength import rldecode
24
+ from pdf2zh.utils import apply_png_predictor
25
+
26
+ if TYPE_CHECKING:
27
+ from pdf2zh.pdfdocument import PDFDocument
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+ LITERAL_CRYPT = LIT("Crypt")
32
+
33
+ # Abbreviation of Filter names in PDF 4.8.6. "Inline Images"
34
+ LITERALS_FLATE_DECODE = (LIT("FlateDecode"), LIT("Fl"))
35
+ LITERALS_LZW_DECODE = (LIT("LZWDecode"), LIT("LZW"))
36
+ LITERALS_ASCII85_DECODE = (LIT("ASCII85Decode"), LIT("A85"))
37
+ LITERALS_ASCIIHEX_DECODE = (LIT("ASCIIHexDecode"), LIT("AHx"))
38
+ LITERALS_RUNLENGTH_DECODE = (LIT("RunLengthDecode"), LIT("RL"))
39
+ LITERALS_CCITTFAX_DECODE = (LIT("CCITTFaxDecode"), LIT("CCF"))
40
+ LITERALS_DCT_DECODE = (LIT("DCTDecode"), LIT("DCT"))
41
+ LITERALS_JBIG2_DECODE = (LIT("JBIG2Decode"),)
42
+ LITERALS_JPX_DECODE = (LIT("JPXDecode"),)
43
+
44
+
45
+ class DecipherCallable(Protocol):
46
+ """Fully typed a decipher callback, with optional parameter."""
47
+
48
+ def __call__(
49
+ self,
50
+ objid: int,
51
+ genno: int,
52
+ data: bytes,
53
+ attrs: Optional[Dict[str, Any]] = None,
54
+ ) -> bytes:
55
+ raise NotImplementedError
56
+
57
+
58
+ class PDFObject(PSObject):
59
+ pass
60
+
61
+
62
+ # Adding aliases for these exceptions for backwards compatibility
63
+ PDFException = pdfexceptions.PDFException
64
+ PDFTypeError = pdfexceptions.PDFTypeError
65
+ PDFValueError = pdfexceptions.PDFValueError
66
+ PDFObjectNotFound = pdfexceptions.PDFObjectNotFound
67
+ PDFNotImplementedError = pdfexceptions.PDFNotImplementedError
68
+
69
+ _DEFAULT = object()
70
+
71
+
72
+ class PDFObjRef(PDFObject):
73
+ def __init__(
74
+ self,
75
+ doc: Optional["PDFDocument"],
76
+ objid: int,
77
+ _: Any = _DEFAULT,
78
+ ) -> None:
79
+ """Reference to a PDF object.
80
+
81
+ :param doc: The PDF document.
82
+ :param objid: The object number.
83
+ :param _: Unused argument for backwards compatibility.
84
+ """
85
+ if _ is not _DEFAULT:
86
+ warn(
87
+ "The third argument of PDFObjRef is unused and will be removed after "
88
+ "2024",
89
+ DeprecationWarning,
90
+ )
91
+
92
+ if objid == 0:
93
+ if settings.STRICT:
94
+ raise PDFValueError("PDF object id cannot be 0.")
95
+
96
+ self.doc = doc
97
+ self.objid = objid
98
+
99
+ def __repr__(self) -> str:
100
+ return "<PDFObjRef:%d>" % (self.objid)
101
+
102
+ def resolve(self, default: object = None) -> Any:
103
+ assert self.doc is not None
104
+ try:
105
+ return self.doc.getobj(self.objid)
106
+ except PDFObjectNotFound:
107
+ return default
108
+
109
+
110
+ def resolve1(x: object, default: object = None) -> Any:
111
+ """Resolves an object.
112
+
113
+ If this is an array or dictionary, it may still contains
114
+ some indirect objects inside.
115
+ """
116
+ while isinstance(x, PDFObjRef):
117
+ x = x.resolve(default=default)
118
+ return x
119
+
120
+
121
+ def resolve_all(x: object, default: object = None) -> Any:
122
+ """Recursively resolves the given object and all the internals.
123
+
124
+ Make sure there is no indirect reference within the nested object.
125
+ This procedure might be slow.
126
+ """
127
+ while isinstance(x, PDFObjRef):
128
+ x = x.resolve(default=default)
129
+ if isinstance(x, list):
130
+ x = [resolve_all(v, default=default) for v in x]
131
+ elif isinstance(x, dict):
132
+ for k, v in x.items():
133
+ x[k] = resolve_all(v, default=default)
134
+ return x
135
+
136
+
137
+ def decipher_all(decipher: DecipherCallable, objid: int, genno: int, x: object) -> Any:
138
+ """Recursively deciphers the given object."""
139
+ if isinstance(x, bytes):
140
+ if len(x) == 0:
141
+ return x
142
+ return decipher(objid, genno, x)
143
+ if isinstance(x, list):
144
+ x = [decipher_all(decipher, objid, genno, v) for v in x]
145
+ elif isinstance(x, dict):
146
+ for k, v in x.items():
147
+ x[k] = decipher_all(decipher, objid, genno, v)
148
+ return x
149
+
150
+
151
+ def int_value(x: object) -> int:
152
+ x = resolve1(x)
153
+ if not isinstance(x, int):
154
+ if settings.STRICT:
155
+ raise PDFTypeError("Integer required: %r" % x)
156
+ return 0
157
+ return x
158
+
159
+
160
+ def float_value(x: object) -> float:
161
+ x = resolve1(x)
162
+ if not isinstance(x, float):
163
+ if settings.STRICT:
164
+ raise PDFTypeError("Float required: %r" % x)
165
+ return 0.0
166
+ return x
167
+
168
+
169
+ def num_value(x: object) -> float:
170
+ x = resolve1(x)
171
+ if not isinstance(x, (int, float)): # == utils.isnumber(x)
172
+ if settings.STRICT:
173
+ raise PDFTypeError("Int or Float required: %r" % x)
174
+ return 0
175
+ return x
176
+
177
+
178
+ def uint_value(x: object, n_bits: int) -> int:
179
+ """Resolve number and interpret it as a two's-complement unsigned number"""
180
+ xi = int_value(x)
181
+ if xi > 0:
182
+ return xi
183
+ else:
184
+ return xi + cast(int, 2**n_bits)
185
+
186
+
187
+ def str_value(x: object) -> bytes:
188
+ x = resolve1(x)
189
+ if not isinstance(x, bytes):
190
+ if settings.STRICT:
191
+ raise PDFTypeError("String required: %r" % x)
192
+ return b""
193
+ return x
194
+
195
+
196
+ def list_value(x: object) -> Union[List[Any], Tuple[Any, ...]]:
197
+ x = resolve1(x)
198
+ if not isinstance(x, (list, tuple)):
199
+ if settings.STRICT:
200
+ raise PDFTypeError("List required: %r" % x)
201
+ return []
202
+ return x
203
+
204
+
205
+ def dict_value(x: object) -> Dict[Any, Any]:
206
+ x = resolve1(x)
207
+ if not isinstance(x, dict):
208
+ if settings.STRICT:
209
+ logger.error("PDFTypeError : Dict required: %r", x)
210
+ raise PDFTypeError("Dict required: %r" % x)
211
+ return {}
212
+ return x
213
+
214
+
215
+ def stream_value(x: object) -> "PDFStream":
216
+ x = resolve1(x)
217
+ if not isinstance(x, PDFStream):
218
+ if settings.STRICT:
219
+ raise PDFTypeError("PDFStream required: %r" % x)
220
+ return PDFStream({}, b"")
221
+ return x
222
+
223
+
224
+ def decompress_corrupted(data: bytes) -> bytes:
225
+ """Called on some data that can't be properly decoded because of CRC checksum
226
+ error. Attempt to decode it skipping the CRC.
227
+ """
228
+ d = zlib.decompressobj()
229
+ f = io.BytesIO(data)
230
+ result_str = b""
231
+ buffer = f.read(1)
232
+ i = 0
233
+ try:
234
+ while buffer:
235
+ result_str += d.decompress(buffer)
236
+ buffer = f.read(1)
237
+ i += 1
238
+ except zlib.error:
239
+ # Let the error propagate if we haven't reached the CRC checksum yet
240
+ if i < len(data) - 3:
241
+ logger.warning("Data-loss while decompressing corrupted data")
242
+ return result_str
243
+
244
+
245
+ class PDFStream(PDFObject):
246
+ def __init__(
247
+ self,
248
+ attrs: Dict[str, Any],
249
+ rawdata: bytes,
250
+ decipher: Optional[DecipherCallable] = None,
251
+ ) -> None:
252
+ assert isinstance(attrs, dict), str(type(attrs))
253
+ self.attrs = attrs
254
+ self.rawdata: Optional[bytes] = rawdata
255
+ self.decipher = decipher
256
+ self.data: Optional[bytes] = None
257
+ self.objid: Optional[int] = None
258
+ self.genno: Optional[int] = None
259
+
260
+ def set_objid(self, objid: int, genno: int) -> None:
261
+ self.objid = objid
262
+ self.genno = genno
263
+
264
+ def __repr__(self) -> str:
265
+ if self.data is None:
266
+ assert self.rawdata is not None
267
+ return "<PDFStream(%r): raw=%d, %r>" % (
268
+ self.objid,
269
+ len(self.rawdata),
270
+ self.attrs,
271
+ )
272
+ else:
273
+ assert self.data is not None
274
+ return "<PDFStream(%r): len=%d, %r>" % (
275
+ self.objid,
276
+ len(self.data),
277
+ self.attrs,
278
+ )
279
+
280
+ def __contains__(self, name: object) -> bool:
281
+ return name in self.attrs
282
+
283
+ def __getitem__(self, name: str) -> Any:
284
+ return self.attrs[name]
285
+
286
+ def get(self, name: str, default: object = None) -> Any:
287
+ return self.attrs.get(name, default)
288
+
289
+ def get_any(self, names: Iterable[str], default: object = None) -> Any:
290
+ for name in names:
291
+ if name in self.attrs:
292
+ return self.attrs[name]
293
+ return default
294
+
295
+ def get_filters(self) -> List[Tuple[Any, Any]]:
296
+ filters = self.get_any(("F", "Filter"))
297
+ params = self.get_any(("DP", "DecodeParms", "FDecodeParms"), {})
298
+ if not filters:
299
+ return []
300
+ if not isinstance(filters, list):
301
+ filters = [filters]
302
+ if not isinstance(params, list):
303
+ # Make sure the parameters list is the same length as the filters list.
304
+ params = [params] * len(filters)
305
+ if settings.STRICT and len(params) != len(filters):
306
+ raise PDFException("Parameters len filter mismatch")
307
+
308
+ resolved_filters = [resolve1(f) for f in filters]
309
+ resolved_params = [resolve1(param) for param in params]
310
+ return list(zip(resolved_filters, resolved_params))
311
+
312
+ def decode(self) -> None:
313
+ assert self.data is None and self.rawdata is not None, str(
314
+ (self.data, self.rawdata),
315
+ )
316
+ data = self.rawdata
317
+ if self.decipher:
318
+ # Handle encryption
319
+ assert self.objid is not None
320
+ assert self.genno is not None
321
+ data = self.decipher(self.objid, self.genno, data, self.attrs)
322
+ filters = self.get_filters()
323
+ if not filters:
324
+ self.data = data
325
+ self.rawdata = None
326
+ return
327
+ for f, params in filters:
328
+ if f in LITERALS_FLATE_DECODE:
329
+ # will get errors if the document is encrypted.
330
+ try:
331
+ data = zlib.decompress(data)
332
+
333
+ except zlib.error as e:
334
+ if settings.STRICT:
335
+ error_msg = f"Invalid zlib bytes: {e!r}, {data!r}"
336
+ raise PDFException(error_msg)
337
+
338
+ try:
339
+ data = decompress_corrupted(data)
340
+ except zlib.error:
341
+ data = b""
342
+
343
+ elif f in LITERALS_LZW_DECODE:
344
+ data = lzwdecode(data)
345
+ elif f in LITERALS_ASCII85_DECODE:
346
+ data = ascii85decode(data)
347
+ elif f in LITERALS_ASCIIHEX_DECODE:
348
+ data = asciihexdecode(data)
349
+ elif f in LITERALS_RUNLENGTH_DECODE:
350
+ data = rldecode(data)
351
+ elif f in LITERALS_CCITTFAX_DECODE:
352
+ data = ccittfaxdecode(data, params)
353
+ elif f in LITERALS_DCT_DECODE:
354
+ # This is probably a JPG stream
355
+ # it does not need to be decoded twice.
356
+ # Just return the stream to the user.
357
+ pass
358
+ elif f in LITERALS_JBIG2_DECODE or f in LITERALS_JPX_DECODE:
359
+ pass
360
+ elif f == LITERAL_CRYPT:
361
+ # not yet..
362
+ raise PDFNotImplementedError("/Crypt filter is unsupported")
363
+ else:
364
+ raise PDFNotImplementedError("Unsupported filter: %r" % f)
365
+ # apply predictors
366
+ if params and "Predictor" in params:
367
+ pred = int_value(params["Predictor"])
368
+ if pred == 1:
369
+ # no predictor
370
+ pass
371
+ elif pred >= 10:
372
+ # PNG predictor
373
+ colors = int_value(params.get("Colors", 1))
374
+ columns = int_value(params.get("Columns", 1))
375
+ raw_bits_per_component = params.get("BitsPerComponent", 8)
376
+ bitspercomponent = int_value(raw_bits_per_component)
377
+ data = apply_png_predictor(
378
+ pred,
379
+ colors,
380
+ columns,
381
+ bitspercomponent,
382
+ data,
383
+ )
384
+ else:
385
+ error_msg = "Unsupported predictor: %r" % pred
386
+ raise PDFNotImplementedError(error_msg)
387
+ self.data = data
388
+ self.rawdata = None
389
+
390
+ def get_data(self) -> bytes:
391
+ if self.data is None:
392
+ self.decode()
393
+ assert self.data is not None
394
+ return self.data
395
+
396
+ def get_rawdata(self) -> Optional[bytes]:
397
+ return self.rawdata
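
The typed accessors above (int_value, float_value, num_value, str_value, list_value, dict_value, stream_value) all follow the same pattern: resolve indirect references with resolve1, then either raise a PDFTypeError in strict mode or fall back to a harmless default. A minimal sketch of the non-strict behaviour, assuming this file is importable as pdf2zh.pdftypes (the module name is inferred, since the file header for this hunk is not shown here):

    # Usage sketch, not part of the commit. With settings.STRICT = False
    # (the default in this package), type mismatches degrade to safe defaults.
    from pdf2zh.pdftypes import dict_value, int_value, list_value

    print(int_value("not an int"))  # -> 0
    print(list_value(None))         # -> []
    print(dict_value(42))           # -> {}
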
pdf2zh/psexceptions.py ADDED
@@ -0,0 +1,18 @@
1
+ class PSException(Exception):
2
+ pass
3
+
4
+
5
+ class PSEOF(PSException):
6
+ pass
7
+
8
+
9
+ class PSSyntaxError(PSException):
10
+ pass
11
+
12
+
13
+ class PSTypeError(PSException):
14
+ pass
15
+
16
+
17
+ class PSValueError(PSException):
18
+ pass
pdf2zh/psparser.py ADDED
@@ -0,0 +1,656 @@
1
+ #!/usr/bin/env python3
2
+ import io
3
+ import logging
4
+ import re
5
+ from typing import (
6
+ Any,
7
+ BinaryIO,
8
+ Dict,
9
+ Generic,
10
+ Iterator,
11
+ List,
12
+ Optional,
13
+ Tuple,
14
+ Type,
15
+ TypeVar,
16
+ Union,
17
+ )
18
+
19
+ from pdf2zh import psexceptions, settings
20
+ from pdf2zh.utils import choplist
21
+
22
+ log = logging.getLogger(__name__)
23
+
24
+
25
+ # Adding aliases for these exceptions for backwards compatibility
26
+ PSException = psexceptions.PSException
27
+ PSEOF = psexceptions.PSEOF
28
+ PSSyntaxError = psexceptions.PSSyntaxError
29
+ PSTypeError = psexceptions.PSTypeError
30
+ PSValueError = psexceptions.PSValueError
31
+
32
+
33
+ class PSObject:
34
+ """Base class for all PS or PDF-related data types."""
35
+
36
+
37
+ class PSLiteral(PSObject):
38
+ """A class that represents a PostScript literal.
39
+
40
+ Postscript literals are used as identifiers, such as
41
+ variable names, property names and dictionary keys.
42
+ Literals are case sensitive and denoted by a preceding
43
+ slash sign (e.g. "/Name")
44
+
45
+ Note: Do not create an instance of PSLiteral directly.
46
+ Always use PSLiteralTable.intern().
47
+ """
48
+
49
+ NameType = Union[str, bytes]
50
+
51
+ def __init__(self, name: NameType) -> None:
52
+ self.name = name
53
+
54
+ def __repr__(self) -> str:
55
+ name = self.name
56
+ return "/%r" % name
57
+
58
+
59
+ class PSKeyword(PSObject):
60
+ """A class that represents a PostScript keyword.
61
+
62
+ PostScript keywords are a small set of predefined words.
63
+ Commands and directives in PostScript are expressed by keywords.
64
+ They are also used to denote the content boundaries.
65
+
66
+ Note: Do not create an instance of PSKeyword directly.
67
+ Always use PSKeywordTable.intern().
68
+ """
69
+
70
+ def __init__(self, name: bytes) -> None:
71
+ self.name = name
72
+
73
+ def __repr__(self) -> str:
74
+ name = self.name
75
+ return "/%r" % name
76
+
77
+
78
+ _SymbolT = TypeVar("_SymbolT", PSLiteral, PSKeyword)
79
+
80
+
81
+ class PSSymbolTable(Generic[_SymbolT]):
82
+ """A utility class for storing PSLiteral/PSKeyword objects.
83
+
84
+ Interned objects can be compared for identity with the "is" operator.
85
+ """
86
+
87
+ def __init__(self, klass: Type[_SymbolT]) -> None:
88
+ self.dict: Dict[PSLiteral.NameType, _SymbolT] = {}
89
+ self.klass: Type[_SymbolT] = klass
90
+
91
+ def intern(self, name: PSLiteral.NameType) -> _SymbolT:
92
+ if name in self.dict:
93
+ lit = self.dict[name]
94
+ else:
95
+ # Type confusion issue: PSKeyword always takes bytes as name
96
+ # PSLiteral uses either str or bytes
97
+ lit = self.klass(name) # type: ignore[arg-type]
98
+ self.dict[name] = lit
99
+ return lit
100
+
101
+
102
+ PSLiteralTable = PSSymbolTable(PSLiteral)
103
+ PSKeywordTable = PSSymbolTable(PSKeyword)
104
+ LIT = PSLiteralTable.intern
105
+ KWD = PSKeywordTable.intern
106
+ KEYWORD_PROC_BEGIN = KWD(b"{")
107
+ KEYWORD_PROC_END = KWD(b"}")
108
+ KEYWORD_ARRAY_BEGIN = KWD(b"[")
109
+ KEYWORD_ARRAY_END = KWD(b"]")
110
+ KEYWORD_DICT_BEGIN = KWD(b"<<")
111
+ KEYWORD_DICT_END = KWD(b">>")
112
+
113
+
114
+ def literal_name(x: Any) -> str:
115
+ if isinstance(x, PSLiteral):
116
+ if isinstance(x.name, str):
117
+ return x.name
118
+ try:
119
+ return str(x.name, "utf-8")
120
+ except UnicodeDecodeError:
121
+ return str(x.name)
122
+ else:
123
+ if settings.STRICT:
124
+ raise PSTypeError(f"Literal required: {x!r}")
125
+ return str(x)
126
+
127
+
128
+ def keyword_name(x: Any) -> Any:
129
+ if not isinstance(x, PSKeyword):
130
+ if settings.STRICT:
131
+ raise PSTypeError("Keyword required: %r" % x)
132
+ else:
133
+ name = x
134
+ else:
135
+ name = str(x.name, "utf-8", "ignore")
136
+ return name
137
+
138
+
139
+ EOL = re.compile(rb"[\r\n]")
140
+ SPC = re.compile(rb"\s")
141
+ NONSPC = re.compile(rb"\S")
142
+ HEX = re.compile(rb"[0-9a-fA-F]")
143
+ END_LITERAL = re.compile(rb"[#/%\[\]()<>{}\s]")
144
+ END_HEX_STRING = re.compile(rb"[^\s0-9a-fA-F]")
145
+ HEX_PAIR = re.compile(rb"[0-9a-fA-F]{2}|.")
146
+ END_NUMBER = re.compile(rb"[^0-9]")
147
+ END_KEYWORD = re.compile(rb"[#/%\[\]()<>{}\s]")
148
+ END_STRING = re.compile(rb"[()\134]")
149
+ OCT_STRING = re.compile(rb"[0-7]")
150
+ ESC_STRING = {
151
+ b"b": 8,
152
+ b"t": 9,
153
+ b"n": 10,
154
+ b"f": 12,
155
+ b"r": 13,
156
+ b"(": 40,
157
+ b")": 41,
158
+ b"\\": 92,
159
+ }
160
+
161
+
162
+ PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes]
163
+
164
+
165
+ class PSBaseParser:
166
+ """Most basic PostScript parser that performs only tokenization."""
167
+
168
+ BUFSIZ = 4096
169
+
170
+ def __init__(self, fp: BinaryIO) -> None:
171
+ self.fp = fp
172
+ self.seek(0)
173
+
174
+ def __repr__(self) -> str:
175
+ return "<%s: %r, bufpos=%d>" % (self.__class__.__name__, self.fp, self.bufpos)
176
+
177
+ def flush(self) -> None:
178
+ pass
179
+
180
+ def close(self) -> None:
181
+ self.flush()
182
+
183
+ def tell(self) -> int:
184
+ return self.bufpos + self.charpos
185
+
186
+ def poll(self, pos: Optional[int] = None, n: int = 80) -> None:
187
+ pos0 = self.fp.tell()
188
+ if not pos:
189
+ pos = self.bufpos + self.charpos
190
+ self.fp.seek(pos)
191
+ # log.debug("poll(%d): %r", pos, self.fp.read(n))
192
+ self.fp.seek(pos0)
193
+
194
+ def seek(self, pos: int) -> None:
195
+ """Seeks the parser to the given position."""
196
+ # log.debug("seek: %r", pos)
197
+ self.fp.seek(pos)
198
+ # reset the status for nextline()
199
+ self.bufpos = pos
200
+ self.buf = b""
201
+ self.charpos = 0
202
+ # reset the status for nexttoken()
203
+ self._parse1 = self._parse_main
204
+ self._curtoken = b""
205
+ self._curtokenpos = 0
206
+ self._tokens: List[Tuple[int, PSBaseParserToken]] = []
207
+
208
+ def fillbuf(self) -> None:
209
+ if self.charpos < len(self.buf):
210
+ return
211
+ # fetch next chunk.
212
+ self.bufpos = self.fp.tell()
213
+ self.buf = self.fp.read(self.BUFSIZ)
214
+ if not self.buf:
215
+ raise PSEOF("Unexpected EOF")
216
+ self.charpos = 0
217
+
218
+ def nextline(self) -> Tuple[int, bytes]:
219
+ """Fetches the next line, which ends with either \\r or \\n."""
220
+ linebuf = b""
221
+ linepos = self.bufpos + self.charpos
222
+ eol = False
223
+ while 1:
224
+ self.fillbuf()
225
+ if eol:
226
+ c = self.buf[self.charpos : self.charpos + 1]
227
+ # handle b'\r\n'
228
+ if c == b"\n":
229
+ linebuf += c
230
+ self.charpos += 1
231
+ break
232
+ m = EOL.search(self.buf, self.charpos)
233
+ if m:
234
+ linebuf += self.buf[self.charpos : m.end(0)]
235
+ self.charpos = m.end(0)
236
+ if linebuf[-1:] == b"\r":
237
+ eol = True
238
+ else:
239
+ break
240
+ else:
241
+ linebuf += self.buf[self.charpos :]
242
+ self.charpos = len(self.buf)
243
+ # log.debug("nextline: %r, %r", linepos, linebuf)
244
+
245
+ return (linepos, linebuf)
246
+
247
+ def revreadlines(self) -> Iterator[bytes]:
248
+ """Fetches the next line backward.
249
+
250
+ This is used to locate the trailers at the end of a file.
251
+ """
252
+ self.fp.seek(0, io.SEEK_END)
253
+ pos = self.fp.tell()
254
+ buf = b""
255
+ while pos > 0:
256
+ prevpos = pos
257
+ pos = max(0, pos - self.BUFSIZ)
258
+ self.fp.seek(pos)
259
+ s = self.fp.read(prevpos - pos)
260
+ if not s:
261
+ break
262
+ while 1:
263
+ n = max(s.rfind(b"\r"), s.rfind(b"\n"))
264
+ if n == -1:
265
+ buf = s + buf
266
+ break
267
+ yield s[n:] + buf
268
+ s = s[:n]
269
+ buf = b""
270
+
271
+ def _parse_main(self, s: bytes, i: int) -> int:
272
+ m = NONSPC.search(s, i)
273
+ if not m:
274
+ return len(s)
275
+ j = m.start(0)
276
+ c = s[j : j + 1]
277
+ self._curtokenpos = self.bufpos + j
278
+ if c == b"%":
279
+ self._curtoken = b"%"
280
+ self._parse1 = self._parse_comment
281
+ return j + 1
282
+ elif c == b"/":
283
+ self._curtoken = b""
284
+ self._parse1 = self._parse_literal
285
+ return j + 1
286
+ elif c in b"-+" or c.isdigit():
287
+ self._curtoken = c
288
+ self._parse1 = self._parse_number
289
+ return j + 1
290
+ elif c == b".":
291
+ self._curtoken = c
292
+ self._parse1 = self._parse_float
293
+ return j + 1
294
+ elif c.isalpha():
295
+ self._curtoken = c
296
+ self._parse1 = self._parse_keyword
297
+ return j + 1
298
+ elif c == b"(":
299
+ self._curtoken = b""
300
+ self.paren = 1
301
+ self._parse1 = self._parse_string
302
+ return j + 1
303
+ elif c == b"<":
304
+ self._curtoken = b""
305
+ self._parse1 = self._parse_wopen
306
+ return j + 1
307
+ elif c == b">":
308
+ self._curtoken = b""
309
+ self._parse1 = self._parse_wclose
310
+ return j + 1
311
+ elif c == b"\x00":
312
+ return j + 1
313
+ else:
314
+ self._add_token(KWD(c))
315
+ return j + 1
316
+
317
+ def _add_token(self, obj: PSBaseParserToken) -> None:
318
+ self._tokens.append((self._curtokenpos, obj))
319
+
320
+ def _parse_comment(self, s: bytes, i: int) -> int:
321
+ m = EOL.search(s, i)
322
+ if not m:
323
+ self._curtoken += s[i:]
324
+ return len(s)
325
+ j = m.start(0)
326
+ self._curtoken += s[i:j]
327
+ self._parse1 = self._parse_main
328
+ # We ignore comments.
329
+ # self._tokens.append(self._curtoken)
330
+ return j
331
+
332
+ def _parse_literal(self, s: bytes, i: int) -> int:
333
+ m = END_LITERAL.search(s, i)
334
+ if not m:
335
+ self._curtoken += s[i:]
336
+ return len(s)
337
+ j = m.start(0)
338
+ self._curtoken += s[i:j]
339
+ c = s[j : j + 1]
340
+ if c == b"#":
341
+ self.hex = b""
342
+ self._parse1 = self._parse_literal_hex
343
+ return j + 1
344
+ try:
345
+ name: Union[str, bytes] = str(self._curtoken, "utf-8")
346
+ except Exception:
347
+ name = self._curtoken
348
+ self._add_token(LIT(name))
349
+ self._parse1 = self._parse_main
350
+ return j
351
+
352
+ def _parse_literal_hex(self, s: bytes, i: int) -> int:
353
+ c = s[i : i + 1]
354
+ if HEX.match(c) and len(self.hex) < 2:
355
+ self.hex += c
356
+ return i + 1
357
+ if self.hex:
358
+ self._curtoken += bytes((int(self.hex, 16),))
359
+ self._parse1 = self._parse_literal
360
+ return i
361
+
362
+ def _parse_number(self, s: bytes, i: int) -> int:
363
+ m = END_NUMBER.search(s, i)
364
+ if not m:
365
+ self._curtoken += s[i:]
366
+ return len(s)
367
+ j = m.start(0)
368
+ self._curtoken += s[i:j]
369
+ c = s[j : j + 1]
370
+ if c == b".":
371
+ self._curtoken += c
372
+ self._parse1 = self._parse_float
373
+ return j + 1
374
+ try:
375
+ self._add_token(int(self._curtoken))
376
+ except ValueError:
377
+ pass
378
+ self._parse1 = self._parse_main
379
+ return j
380
+
381
+ def _parse_float(self, s: bytes, i: int) -> int:
382
+ m = END_NUMBER.search(s, i)
383
+ if not m:
384
+ self._curtoken += s[i:]
385
+ return len(s)
386
+ j = m.start(0)
387
+ self._curtoken += s[i:j]
388
+ try:
389
+ self._add_token(float(self._curtoken))
390
+ except ValueError:
391
+ pass
392
+ self._parse1 = self._parse_main
393
+ return j
394
+
395
+ def _parse_keyword(self, s: bytes, i: int) -> int:
396
+ m = END_KEYWORD.search(s, i)
397
+ if m:
398
+ j = m.start(0)
399
+ self._curtoken += s[i:j]
400
+ else:
401
+ # Use the rest of the stream if no non-keyword character is found. This
402
+ # can happen if the keyword is the final bytes of the stream
403
+ # (https://github.com/pdf2zh/pdf2zh.six/issues/884).
404
+ j = len(s)
405
+ self._curtoken += s[i:]
406
+ if self._curtoken == b"true":
407
+ token: Union[bool, PSKeyword] = True
408
+ elif self._curtoken == b"false":
409
+ token = False
410
+ else:
411
+ token = KWD(self._curtoken)
412
+ self._add_token(token)
413
+ self._parse1 = self._parse_main
414
+ return j
415
+
416
+ def _parse_string(self, s: bytes, i: int) -> int:
417
+ m = END_STRING.search(s, i)
418
+ if not m:
419
+ self._curtoken += s[i:]
420
+ return len(s)
421
+ j = m.start(0)
422
+ self._curtoken += s[i:j]
423
+ c = s[j : j + 1]
424
+ if c == b"\\":
425
+ self.oct = b""
426
+ self._parse1 = self._parse_string_1
427
+ return j + 1
428
+ if c == b"(":
429
+ self.paren += 1
430
+ self._curtoken += c
431
+ return j + 1
432
+ if c == b")":
433
+ self.paren -= 1
434
+ if self.paren:
435
+ # WTF, they said balanced parens need no special treatment.
436
+ self._curtoken += c
437
+ return j + 1
438
+ self._add_token(self._curtoken)
439
+ self._parse1 = self._parse_main
440
+ return j + 1
441
+
442
+ def _parse_string_1(self, s: bytes, i: int) -> int:
443
+ """Parse literal strings
444
+
445
+ PDF Reference 3.2.3
446
+ """
447
+ c = s[i : i + 1]
448
+ if OCT_STRING.match(c) and len(self.oct) < 3:
449
+ self.oct += c
450
+ return i + 1
451
+
452
+ elif self.oct:
453
+ chrcode = int(self.oct, 8)
454
+ assert chrcode < 256, "Invalid octal %s (%d)" % (repr(self.oct), chrcode)
455
+ self._curtoken += bytes((chrcode,))
456
+ self._parse1 = self._parse_string
457
+ return i
458
+
459
+ elif c in ESC_STRING:
460
+ self._curtoken += bytes((ESC_STRING[c],))
461
+
462
+ elif c == b"\r" and len(s) > i + 1 and s[i + 1 : i + 2] == b"\n":
463
+ # If the current and next characters are \r\n, skip both, because
464
+ # line breaks after a backslash are ignored
465
+ i += 1
466
+
467
+ # default action
468
+ self._parse1 = self._parse_string
469
+ return i + 1
470
+
471
+ def _parse_wopen(self, s: bytes, i: int) -> int:
472
+ c = s[i : i + 1]
473
+ if c == b"<":
474
+ self._add_token(KEYWORD_DICT_BEGIN)
475
+ self._parse1 = self._parse_main
476
+ i += 1
477
+ else:
478
+ self._parse1 = self._parse_hexstring
479
+ return i
480
+
481
+ def _parse_wclose(self, s: bytes, i: int) -> int:
482
+ c = s[i : i + 1]
483
+ if c == b">":
484
+ self._add_token(KEYWORD_DICT_END)
485
+ i += 1
486
+ self._parse1 = self._parse_main
487
+ return i
488
+
489
+ def _parse_hexstring(self, s: bytes, i: int) -> int:
490
+ m = END_HEX_STRING.search(s, i)
491
+ if not m:
492
+ self._curtoken += s[i:]
493
+ return len(s)
494
+ j = m.start(0)
495
+ self._curtoken += s[i:j]
496
+ token = HEX_PAIR.sub(
497
+ lambda m: bytes((int(m.group(0), 16),)),
498
+ SPC.sub(b"", self._curtoken),
499
+ )
500
+ self._add_token(token)
501
+ self._parse1 = self._parse_main
502
+ return j
503
+
504
+ def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
505
+ while not self._tokens:
506
+ self.fillbuf()
507
+ self.charpos = self._parse1(self.buf, self.charpos)
508
+ token = self._tokens.pop(0)
509
+ # log.debug("nexttoken: %r", token)
510
+ return token
511
+
512
+
513
+ # Stack slots may be occupied by any of:
514
+ # * the name of a literal
515
+ # * the PSBaseParserToken types
516
+ # * list (via KEYWORD_ARRAY)
517
+ # * dict (via KEYWORD_DICT)
518
+ # * subclass-specific extensions (e.g. PDFStream, PDFObjRef) via ExtraT
519
+ ExtraT = TypeVar("ExtraT")
520
+ PSStackType = Union[str, float, bool, PSLiteral, bytes, List, Dict, ExtraT]
521
+ PSStackEntry = Tuple[int, PSStackType[ExtraT]]
522
+
523
+
524
+ class PSStackParser(PSBaseParser, Generic[ExtraT]):
525
+ def __init__(self, fp: BinaryIO) -> None:
526
+ PSBaseParser.__init__(self, fp)
527
+ self.reset()
528
+
529
+ def reset(self) -> None:
530
+ self.context: List[Tuple[int, Optional[str], List[PSStackEntry[ExtraT]]]] = []
531
+ self.curtype: Optional[str] = None
532
+ self.curstack: List[PSStackEntry[ExtraT]] = []
533
+ self.results: List[PSStackEntry[ExtraT]] = []
534
+
535
+ def seek(self, pos: int) -> None:
536
+ PSBaseParser.seek(self, pos)
537
+ self.reset()
538
+
539
+ def push(self, *objs: PSStackEntry[ExtraT]) -> None:
540
+ self.curstack.extend(objs)
541
+
542
+ def pop(self, n: int) -> List[PSStackEntry[ExtraT]]:
543
+ objs = self.curstack[-n:]
544
+ self.curstack[-n:] = []
545
+ return objs
546
+
547
+ def popall(self) -> List[PSStackEntry[ExtraT]]:
548
+ objs = self.curstack
549
+ self.curstack = []
550
+ return objs
551
+
552
+ def add_results(self, *objs: PSStackEntry[ExtraT]) -> None:
553
+ # try:
554
+ # log.debug("add_results: %r", objs)
555
+ # except Exception:
556
+ # log.debug("add_results: (unprintable object)")
557
+ self.results.extend(objs)
558
+
559
+ def start_type(self, pos: int, type: str) -> None:
560
+ self.context.append((pos, self.curtype, self.curstack))
561
+ (self.curtype, self.curstack) = (type, [])
562
+ # log.debug("start_type: pos=%r, type=%r", pos, type)
563
+
564
+ def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]:
565
+ if self.curtype != type:
566
+ raise PSTypeError(f"Type mismatch: {self.curtype!r} != {type!r}")
567
+ objs = [obj for (_, obj) in self.curstack]
568
+ (pos, self.curtype, self.curstack) = self.context.pop()
569
+ # log.debug("end_type: pos=%r, type=%r, objs=%r", pos, type, objs)
570
+ return (pos, objs)
571
+
572
+ def do_keyword(self, pos: int, token: PSKeyword) -> None:
573
+ pass
574
+
575
+ def nextobject(self) -> PSStackEntry[ExtraT]:
576
+ """Yields a list of objects.
577
+
578
+ Arrays and dictionaries are represented as Python lists and
579
+ dictionaries.
580
+
581
+ :return: keywords, literals, strings, numbers, arrays and dictionaries.
582
+ """
583
+ end = None
584
+ while not self.results:
585
+ (pos, token) = self.nexttoken()
586
+ if isinstance(token, (int, float, bool, str, bytes, PSLiteral)):
587
+ # normal token
588
+ self.push((pos, token))
589
+ elif token == KEYWORD_ARRAY_BEGIN:
590
+ # begin array
591
+ self.start_type(pos, "a")
592
+ elif token == KEYWORD_ARRAY_END:
593
+ # end array
594
+ try:
595
+ self.push(self.end_type("a"))
596
+ except PSTypeError:
597
+ if settings.STRICT:
598
+ raise
599
+ elif token == KEYWORD_DICT_BEGIN:
600
+ # begin dictionary
601
+ self.start_type(pos, "d")
602
+ elif token == KEYWORD_DICT_END:
603
+ # end dictionary
604
+ try:
605
+ (pos, objs) = self.end_type("d")
606
+ if len(objs) % 2 != 0:
607
+ error_msg = "Invalid dictionary construct: %r" % objs
608
+ raise PSSyntaxError(error_msg)
609
+ d = {
610
+ literal_name(k): v
611
+ for (k, v) in choplist(2, objs)
612
+ if v is not None
613
+ }
614
+ self.push((pos, d))
615
+ except PSTypeError:
616
+ if settings.STRICT:
617
+ raise
618
+ elif token == KEYWORD_PROC_BEGIN:
619
+ # begin proc
620
+ self.start_type(pos, "p")
621
+ elif token == KEYWORD_PROC_END:
622
+ # end proc
623
+ try:
624
+ self.push(self.end_type("p"))
625
+ except PSTypeError:
626
+ if settings.STRICT:
627
+ raise
628
+ elif isinstance(token, PSKeyword):
629
+ # log.debug(
630
+ # "do_keyword: pos=%r, token=%r, stack=%r",
631
+ # pos,
632
+ # token,
633
+ # self.curstack,
634
+ # )
635
+ if token.name == b"endobj":
636
+ end = pos + 7
637
+ self.do_keyword(pos, token)
638
+ else:
639
+ log.error(
640
+ "unknown token: pos=%r, token=%r, stack=%r",
641
+ pos,
642
+ token,
643
+ self.curstack,
644
+ )
645
+ self.do_keyword(pos, token)
646
+ raise PSException
647
+ if self.context:
648
+ continue
649
+ else:
650
+ self.flush()
651
+ obj = self.results.pop(0)
652
+ # try:
653
+ # log.debug("nextobject: %r", obj)
654
+ # except Exception:
655
+ # log.debug("nextobject: (unprintable object)")
656
+ return end, obj
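
A minimal usage sketch for the tokenizer defined above (not part of the commit): PSBaseParser reads from any binary file object and yields (position, token) pairs until the underlying buffer is exhausted, at which point fillbuf raises PSEOF.

    import io

    from pdf2zh.psparser import PSEOF, PSBaseParser

    parser = PSBaseParser(io.BytesIO(b"<< /Type /Catalog /Count 3 >>"))
    tokens = []
    try:
        while True:
            pos, token = parser.nexttoken()
            tokens.append(token)
    except PSEOF:
        pass  # end of input
    # tokens: [KEYWORD_DICT_BEGIN, LIT('Type'), LIT('Catalog'), LIT('Count'), 3, KEYWORD_DICT_END]
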
pdf2zh/py.typed ADDED
File without changes
pdf2zh/runlength.py ADDED
@@ -0,0 +1,39 @@
1
+ #
2
+ # RunLength decoder (Adobe version) implementation based on PDF Reference
3
+ # version 1.4 section 3.3.4.
4
+ #
5
+ # * public domain *
6
+ #
7
+
8
+
9
+ def rldecode(data: bytes) -> bytes:
10
+ """RunLength decoder (Adobe version) implementation based on PDF Reference
11
+ version 1.4 section 3.3.4:
12
+ The RunLengthDecode filter decodes data that has been encoded in a
13
+ simple byte-oriented format based on run length. The encoded data
14
+ is a sequence of runs, where each run consists of a length byte
15
+ followed by 1 to 128 bytes of data. If the length byte is in the
16
+ range 0 to 127, the following length + 1 (1 to 128) bytes are
17
+ copied literally during decompression. If length is in the range
18
+ 129 to 255, the following single byte is to be copied 257 - length
19
+ (2 to 128) times during decompression. A length value of 128
20
+ denotes EOD.
21
+ """
22
+ decoded = b""
23
+ i = 0
24
+ while i < len(data):
25
+ length = data[i]
26
+ if length == 128:
27
+ break
28
+
29
+ if length >= 0 and length < 128:
30
+ for j in range(i + 1, (i + 1) + (length + 1)):
31
+ decoded += bytes((data[j],))
32
+ i = (i + 1) + (length + 1)
33
+
34
+ if length > 128:
35
+ run = bytes((data[i + 1],)) * (257 - length)
36
+ decoded += run
37
+ i = (i + 1) + 1
38
+
39
+ return decoded
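
A small worked example of the format described in the docstring (not part of the commit): a length byte of 2 copies the next 3 bytes literally, a length byte of 254 repeats the following byte 257 - 254 = 3 times, and 128 marks end-of-data.

    from pdf2zh.runlength import rldecode

    encoded = b"\x02abc" + b"\xfeZ" + b"\x80"
    assert rldecode(encoded) == b"abcZZZ"
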
pdf2zh/settings.py ADDED
@@ -0,0 +1 @@
1
+ STRICT = False
pdf2zh/translator.py ADDED
@@ -0,0 +1,315 @@
1
+ import html
2
+ import logging
3
+ import os
4
+ import re
5
+ from json import dumps, loads
6
+
7
+ import deepl
8
+ import ollama
9
+ import openai
10
+ import requests
11
+ from azure.ai.translation.text import TextTranslationClient
12
+ from azure.core.credentials import AzureKeyCredential
13
+
14
+ import hmac
15
+ import hashlib
16
+ import time
17
+ from datetime import datetime,UTC
18
+
19
+ class BaseTranslator:
20
+ def __init__(self, service, lang_out, lang_in, model):
21
+ self.service = service
22
+ self.lang_out = lang_out
23
+ self.lang_in = lang_in
24
+ self.model = model
25
+
26
+ def translate(self, text) -> str: ... # noqa: E704
27
+
28
+ def __str__(self):
29
+ return f"{self.service} {self.lang_out} {self.lang_in}"
30
+
31
+
32
+ class GoogleTranslator(BaseTranslator):
33
+ def __init__(self, service, lang_out, lang_in, model):
34
+ lang_out = "zh-CN" if lang_out == "auto" else lang_out
35
+ lang_in = "en" if lang_in == "auto" else lang_in
36
+ super().__init__(service, lang_out, lang_in, model)
37
+ self.session = requests.Session()
38
+ self.base_link = "http://translate.google.com/m"
39
+ self.headers = {
40
+ "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)" # noqa: E501
41
+ }
42
+
43
+ def translate(self, text):
44
+ text = text[:5000] # google translate max length
45
+ response = self.session.get(
46
+ self.base_link,
47
+ params={"tl": self.lang_out, "sl": self.lang_in, "q": text},
48
+ headers=self.headers,
49
+ )
50
+ re_result = re.findall(
51
+ r'(?s)class="(?:t0|result-container)">(.*?)<', response.text
52
+ )
53
+ if response.status_code == 400:
54
+ result = "IRREPARABLE TRANSLATION ERROR"
55
+ elif len(re_result) == 0:
56
+ raise ValueError("Empty translation result")
57
+ else:
58
+ result = html.unescape(re_result[0])
59
+ return result
60
+
61
+ class TencentTranslator(BaseTranslator):
62
+ def sign(self,key, msg):
63
+ return hmac.new(key, msg.encode("utf-8"), hashlib.sha256).digest()
64
+
65
+ def __init__(self, service, lang_out, lang_in, model):
66
+ lang_out = "zh" if lang_out == "auto" else lang_out
67
+ lang_in = "en" if lang_in == "auto" else lang_in
68
+ super().__init__(service, lang_out, lang_in, model)
69
+ try:
70
+ server_url = (
71
+ "tmt.tencentcloudapi.com"
72
+ )
73
+ self.secret_id = os.getenv("TENCENT_SECRET_ID")
74
+ self.secret_key = os.getenv("TENCENT_SECRET_KEY")
75
+
76
+ except KeyError as e:
77
+ missing_var = e.args[0]
78
+ raise ValueError(
79
+ f"The environment variable '{missing_var}' is required but not set."
80
+ ) from e
81
+
82
+ self.session = requests.Session()
83
+ self.base_link = f"{server_url}"
84
+
85
+ def translate(self, text):
86
+ text = text[:5000]
87
+ data={
88
+ "SourceText":text,
89
+ "Source":self.lang_in,
90
+ "Target":self.lang_out,
91
+ "ProjectId":0
92
+ }
93
+ payloadx = dumps(data)
94
+ hashed_request_payload = hashlib.sha256(payloadx.encode("utf-8")).hexdigest()
95
+ canonical_request = ("POST" + "\n" +
96
+ "/" + "\n" +
97
+ "" + "\n" +
98
+ "content-type:application/json; charset=utf-8\nhost:tmt.tencentcloudapi.com\nx-tc-action:texttranslate\n" + "\n" +
99
+ "content-type;host;x-tc-action" + "\n" +
100
+ hashed_request_payload)
101
+
102
+ timestamp = int(time.time())
103
+ date = datetime.fromtimestamp(timestamp, UTC).strftime("%Y-%m-%d")
104
+ credential_scope = date + "/tmt/tc3_request"
105
+ hashed_canonical_request = hashlib.sha256(canonical_request.encode("utf-8")).hexdigest()
106
+ algorithm = "TC3-HMAC-SHA256"
107
+ string_to_sign = (algorithm + "\n" +
108
+ str(timestamp) + "\n" +
109
+ credential_scope + "\n" +
110
+ hashed_canonical_request)
111
+ secret_date = self.sign(("TC3" + self.secret_key).encode("utf-8"), date)
112
+ secret_service = self.sign(secret_date, "tmt")
113
+ secret_signing = self.sign(secret_service, "tc3_request")
114
+ signed_headers = "content-type;host;x-tc-action"
115
+ signature = hmac.new(secret_signing, string_to_sign.encode("utf-8"), hashlib.sha256).hexdigest()
116
+ authorization = (algorithm + " " +
117
+ "Credential=" + self.secret_id + "/" + credential_scope + ", " +
118
+ "SignedHeaders=" + signed_headers + ", " +
119
+ "Signature=" + signature)
120
+ self.headers = {
121
+ "Authorization": authorization,
122
+ "Content-Type": "application/json; charset=utf-8",
123
+ "Host": "tmt.tencentcloudapi.com",
124
+ "X-TC-Action": "TextTranslate",
125
+ "X-TC-Region":"ap-beijing",
126
+ "X-TC-Timestamp": str(timestamp),
127
+ "X-TC-Version": "2018-03-21"
128
+ }
129
+
130
+ response = self.session.post(
131
+ "https://"+self.base_link,
132
+ json=data,
133
+ headers=self.headers,
134
+ )
135
+ # 1. Status code test
136
+ if response.status_code == 200:
137
+ result = loads(response.text)
138
+ else:
139
+ raise ValueError("HTTP error: " + str(response.status_code))
140
+ # 2. Result test
141
+ try:
142
+ result = result['Response']['TargetText']
143
+ return result
144
+ except KeyError:
145
+ result = ""
146
+ raise ValueError("No valid key in Tencent's response")
147
+ # 3. Result length check
148
+ if len(result) == 0:
149
+ raise ValueError("Empty translation result")
150
+ return result
151
+
152
+ class DeepLXTranslator(BaseTranslator):
153
+ def __init__(self, service, lang_out, lang_in, model):
154
+ lang_out = "zh" if lang_out == "auto" else lang_out
155
+ lang_in = "en" if lang_in == "auto" else lang_in
156
+ super().__init__(service, lang_out, lang_in, model)
157
+ try:
158
+ auth_key = os.getenv("DEEPLX_AUTH_KEY")
159
+ server_url = (
160
+ "https://api.deeplx.org"
161
+ if not os.getenv("DEEPLX_SERVER_URL")
162
+ else os.getenv("DEEPLX_SERVER_URL")
163
+ )
164
+ except KeyError as e:
165
+ missing_var = e.args[0]
166
+ raise ValueError(
167
+ f"The environment variable '{missing_var}' is required but not set."
168
+ ) from e
169
+
170
+ self.session = requests.Session()
171
+ server_url=server_url.rstrip('/')
172
+ if auth_key:
173
+ self.base_link = f"{server_url}/{auth_key}/translate"
174
+ else:
175
+ self.base_link = f"{server_url}/translate"
176
+ self.headers = {
177
+ "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)" # noqa: E501
178
+ }
179
+
180
+ def translate(self, text):
181
+ text = text[:5000] # truncate overly long input
182
+ response = self.session.post(
183
+ self.base_link,
184
+ dumps(
185
+ {
186
+ "target_lang": self.lang_out,
187
+ "text": text,
188
+ }
189
+ ),
190
+ headers=self.headers,
191
+ )
192
+ # 1. Status code test
193
+ if response.status_code == 200:
194
+ result = loads(response.text)
195
+ else:
196
+ raise ValueError("HTTP error: " + str(response.status_code))
197
+ # 2. Result test
198
+ try:
199
+ result = result["data"]
200
+ return result
201
+ except KeyError:
202
+ result = ""
203
+ raise ValueError("No valid key in DeepLX's response")
204
+ # 3. Result length check
205
+ if len(result) == 0:
206
+ raise ValueError("Empty translation result")
207
+ return result
208
+
209
+
210
+ class DeepLTranslator(BaseTranslator):
211
+ def __init__(self, service, lang_out, lang_in, model):
212
+ lang_out = "ZH" if lang_out == "auto" else lang_out
213
+ lang_in = "EN" if lang_in == "auto" else lang_in
214
+ super().__init__(service, lang_out, lang_in, model)
215
+ self.session = requests.Session()
216
+ auth_key = os.getenv("DEEPL_AUTH_KEY")
217
+ server_url = os.getenv("DEEPL_SERVER_URL")
218
+ self.client = deepl.Translator(auth_key, server_url=server_url)
219
+
220
+ def translate(self, text):
221
+ response = self.client.translate_text(
222
+ text, target_lang=self.lang_out, source_lang=self.lang_in
223
+ )
224
+ return response.text
225
+
226
+
227
+ class OllamaTranslator(BaseTranslator):
228
+ def __init__(self, service, lang_out, lang_in, model):
229
+ lang_out = "zh-CN" if lang_out == "auto" else lang_out
230
+ lang_in = "en" if lang_in == "auto" else lang_in
231
+ super().__init__(service, lang_out, lang_in, model)
232
+ self.options = {"temperature": 0} # random sampling may break formula markers
233
+ # OLLAMA_HOST
234
+ self.client = ollama.Client()
235
+
236
+ def translate(self, text):
237
+ response = self.client.chat(
238
+ model=self.model,
239
+ options=self.options,
240
+ messages=[
241
+ {
242
+ "role": "system",
243
+ "content": "You are a professional, authentic machine translation engine.",
244
+ },
245
+ {
246
+ "role": "user",
247
+ "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:", # noqa: E501
248
+ },
249
+ ],
250
+ )
251
+ return response["message"]["content"].strip()
252
+
253
+
254
+ class OpenAITranslator(BaseTranslator):
255
+ def __init__(self, service, lang_out, lang_in, model):
256
+ lang_out = "zh-CN" if lang_out == "auto" else lang_out
257
+ lang_in = "en" if lang_in == "auto" else lang_in
258
+ super().__init__(service, lang_out, lang_in, model)
259
+ self.options = {"temperature": 0} # random sampling may break formula markers
260
+ # OPENAI_BASE_URL
261
+ # OPENAI_API_KEY
262
+ self.client = openai.OpenAI()
263
+
264
+ def translate(self, text) -> str:
265
+ response = self.client.chat.completions.create(
266
+ model=self.model,
267
+ **self.options,
268
+ messages=[
269
+ {
270
+ "role": "system",
271
+ "content": "You are a professional, authentic machine translation engine.",
272
+ },
273
+ {
274
+ "role": "user",
275
+ "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:", # noqa: E501
276
+ },
277
+ ],
278
+ )
279
+ return response.choices[0].message.content.strip()
280
+
281
+
282
+ class AzureTranslator(BaseTranslator):
283
+ def __init__(self, service, lang_out, lang_in, model):
284
+ lang_out = "zh-Hans" if lang_out == "auto" else lang_out
285
+ lang_in = "en" if lang_in == "auto" else lang_in
286
+ super().__init__(service, lang_out, lang_in, model)
287
+
288
+ try:
289
+ api_key = os.environ["AZURE_APIKEY"]
290
+ endpoint = os.environ["AZURE_ENDPOINT"]
291
+ region = os.environ["AZURE_REGION"]
292
+ except KeyError as e:
293
+ missing_var = e.args[0]
294
+ raise ValueError(
295
+ f"The environment variable '{missing_var}' is required but not set."
296
+ ) from e
297
+
298
+ credential = AzureKeyCredential(api_key)
299
+ self.client = TextTranslationClient(
300
+ endpoint=endpoint, credential=credential, region=region
301
+ )
302
+
303
+ # https://github.com/Azure/azure-sdk-for-python/issues/9422
304
+ logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy")
305
+ logger.setLevel(logging.WARNING)
306
+
307
+ def translate(self, text) -> str:
308
+ response = self.client.translate(
309
+ body=[text],
310
+ from_language=self.lang_in,
311
+ to_language=[self.lang_out],
312
+ )
313
+
314
+ translated_text = response[0].translations[0].text
315
+ return translated_text
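
All backends above share the small BaseTranslator interface: a constructor taking (service, lang_out, lang_in, model) and a translate(text) method returning a string. A hedged usage sketch (not part of the commit; a real call needs network access and, for the other backends, the corresponding API keys or SDKs):

    from pdf2zh.translator import GoogleTranslator

    t = GoogleTranslator(service="google", lang_out="auto", lang_in="auto", model=None)
    print(t)  # -> "google zh-CN en"  ("auto" is mapped to the defaults above)
    # translated = t.translate("Hello, world!")  # performs an HTTP request
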
pdf2zh/utils.py ADDED
@@ -0,0 +1,834 @@
1
+ """Miscellaneous Routines."""
2
+
3
+ import io
4
+ import pathlib
5
+ import string
6
+ import struct
7
+ from html import escape
8
+ from typing import (
9
+ TYPE_CHECKING,
10
+ Any,
11
+ BinaryIO,
12
+ Callable,
13
+ Dict,
14
+ Generic,
15
+ Iterable,
16
+ Iterator,
17
+ List,
18
+ Optional,
19
+ Set,
20
+ TextIO,
21
+ Tuple,
22
+ TypeVar,
23
+ Union,
24
+ cast,
25
+ )
26
+
27
+ from pdf2zh.pdfexceptions import PDFTypeError, PDFValueError
28
+
29
+ if TYPE_CHECKING:
30
+ from pdf2zh.layout import LTComponent
31
+
32
+ import charset_normalizer # For str encoding detection
33
+
34
+ # from sys import maxint as INF doesn't work anymore under Python3, but PDF
35
+ # still uses 32 bits ints
36
+ INF = (1 << 31) - 1
37
+
38
+
39
+ FileOrName = Union[pathlib.PurePath, str, io.IOBase]
40
+ AnyIO = Union[TextIO, BinaryIO]
41
+
42
+
43
+ class open_filename:
44
+ """Context manager that allows opening a filename
45
+ (str or pathlib.PurePath type is supported) and closes it on exit,
46
+ (just like `open`), but does nothing for file-like objects.
47
+ """
48
+
49
+ def __init__(self, filename: FileOrName, *args: Any, **kwargs: Any) -> None:
50
+ if isinstance(filename, pathlib.PurePath):
51
+ filename = str(filename)
52
+ if isinstance(filename, str):
53
+ self.file_handler: AnyIO = open(filename, *args, **kwargs)
54
+ self.closing = True
55
+ elif isinstance(filename, io.IOBase):
56
+ self.file_handler = cast(AnyIO, filename)
57
+ self.closing = False
58
+ else:
59
+ raise PDFTypeError("Unsupported input type: %s" % type(filename))
60
+
61
+ def __enter__(self) -> AnyIO:
62
+ return self.file_handler
63
+
64
+ def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
65
+ if self.closing:
66
+ self.file_handler.close()
67
+
68
+
69
+ def make_compat_bytes(in_str: str) -> bytes:
70
+ """Converts a string to bytes using the default (UTF-8) encoding."""
71
+ assert isinstance(in_str, str), str(type(in_str))
72
+ return in_str.encode()
73
+
74
+
75
+ def make_compat_str(o: object) -> str:
76
+ """Converts anything to a string; if given bytes, the encoding is guessed."""
77
+ if isinstance(o, bytes):
78
+ enc = charset_normalizer.detect(o)
79
+ try:
80
+ return o.decode(enc["encoding"])
81
+ except UnicodeDecodeError:
82
+ return str(o)
83
+ else:
84
+ return str(o)
85
+
86
+
87
+ def shorten_str(s: str, size: int) -> str:
88
+ if size < 7:
89
+ return s[:size]
90
+ if len(s) > size:
91
+ length = (size - 5) // 2
92
+ return f"{s[:length]} ... {s[-length:]}"
93
+ else:
94
+ return s
95
+
96
+
97
+ def compatible_encode_method(
98
+ bytesorstring: Union[bytes, str],
99
+ encoding: str = "utf-8",
100
+ erraction: str = "ignore",
101
+ ) -> str:
102
+ """When Py2 str.encode is called, it often means bytes.decode in Py3.
103
+
104
+ This does either.
105
+ """
106
+ if isinstance(bytesorstring, str):
107
+ return bytesorstring
108
+ assert isinstance(bytesorstring, bytes), str(type(bytesorstring))
109
+ return bytesorstring.decode(encoding, erraction)
110
+
111
+
112
+ def paeth_predictor(left: int, above: int, upper_left: int) -> int:
113
+ # From http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
114
+ # Initial estimate
115
+ p = left + above - upper_left
116
+ # Distances to a,b,c
117
+ pa = abs(p - left)
118
+ pb = abs(p - above)
119
+ pc = abs(p - upper_left)
120
+
121
+ # Return nearest of a,b,c breaking ties in order a,b,c
122
+ if pa <= pb and pa <= pc:
123
+ return left
124
+ elif pb <= pc:
125
+ return above
126
+ else:
127
+ return upper_left
128
+
129
+
130
+ def apply_png_predictor(
131
+ pred: int,
132
+ colors: int,
133
+ columns: int,
134
+ bitspercomponent: int,
135
+ data: bytes,
136
+ ) -> bytes:
137
+ """Reverse the effect of the PNG predictor
138
+
139
+ Documentation: http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
140
+ """
141
+ if bitspercomponent not in [8, 1]:
142
+ msg = "Unsupported `bitspercomponent': %d" % bitspercomponent
143
+ raise PDFValueError(msg)
144
+
145
+ nbytes = colors * columns * bitspercomponent // 8
146
+ bpp = colors * bitspercomponent // 8 # number of bytes per complete pixel
147
+ buf = []
148
+ line_above = list(b"\x00" * columns)
149
+ for scanline_i in range(0, len(data), nbytes + 1):
150
+ filter_type = data[scanline_i]
151
+ line_encoded = data[scanline_i + 1 : scanline_i + 1 + nbytes]
152
+ raw = []
153
+
154
+ if filter_type == 0:
155
+ # Filter type 0: None
156
+ raw = list(line_encoded)
157
+
158
+ elif filter_type == 1:
159
+ # Filter type 1: Sub
160
+ # To reverse the effect of the Sub() filter after decompression,
161
+ # output the following value:
162
+ # Raw(x) = Sub(x) + Raw(x - bpp)
163
+ # (computed mod 256), where Raw() refers to the bytes already
164
+ # decoded.
165
+ for j, sub_x in enumerate(line_encoded):
166
+ if j - bpp < 0:
167
+ raw_x_bpp = 0
168
+ else:
169
+ raw_x_bpp = int(raw[j - bpp])
170
+ raw_x = (sub_x + raw_x_bpp) & 255
171
+ raw.append(raw_x)
172
+
173
+ elif filter_type == 2:
174
+ # Filter type 2: Up
175
+ # To reverse the effect of the Up() filter after decompression,
176
+ # output the following value:
177
+ # Raw(x) = Up(x) + Prior(x)
178
+ # (computed mod 256), where Prior() refers to the decoded bytes of
179
+ # the prior scanline.
180
+ for up_x, prior_x in zip(line_encoded, line_above):
181
+ raw_x = (up_x + prior_x) & 255
182
+ raw.append(raw_x)
183
+
184
+ elif filter_type == 3:
185
+ # Filter type 3: Average
186
+ # To reverse the effect of the Average() filter after
187
+ # decompression, output the following value:
188
+ # Raw(x) = Average(x) + floor((Raw(x-bpp)+Prior(x))/2)
189
+ # where the result is computed mod 256, but the prediction is
190
+ # calculated in the same way as for encoding. Raw() refers to the
191
+ # bytes already decoded, and Prior() refers to the decoded bytes of
192
+ # the prior scanline.
193
+ for j, average_x in enumerate(line_encoded):
194
+ if j - bpp < 0:
195
+ raw_x_bpp = 0
196
+ else:
197
+ raw_x_bpp = int(raw[j - bpp])
198
+ prior_x = int(line_above[j])
199
+ raw_x = (average_x + (raw_x_bpp + prior_x) // 2) & 255
200
+ raw.append(raw_x)
201
+
202
+ elif filter_type == 4:
203
+ # Filter type 4: Paeth
204
+ # To reverse the effect of the Paeth() filter after decompression,
205
+ # output the following value:
206
+ # Raw(x) = Paeth(x)
207
+ # + PaethPredictor(Raw(x-bpp), Prior(x), Prior(x-bpp))
208
+ # (computed mod 256), where Raw() and Prior() refer to bytes
209
+ # already decoded. Exactly the same PaethPredictor() function is
210
+ # used by both encoder and decoder.
211
+ for j, paeth_x in enumerate(line_encoded):
212
+ if j - bpp < 0:
213
+ raw_x_bpp = 0
214
+ prior_x_bpp = 0
215
+ else:
216
+ raw_x_bpp = int(raw[j - bpp])
217
+ prior_x_bpp = int(line_above[j - bpp])
218
+ prior_x = int(line_above[j])
219
+ paeth = paeth_predictor(raw_x_bpp, prior_x, prior_x_bpp)
220
+ raw_x = (paeth_x + paeth) & 255
221
+ raw.append(raw_x)
222
+
223
+ else:
224
+ raise PDFValueError("Unsupported predictor value: %d" % filter_type)
225
+
226
+ buf.extend(raw)
227
+ line_above = raw
228
+ return bytes(buf)
229
+
230
+
231
+ Point = Tuple[float, float]
232
+ Rect = Tuple[float, float, float, float]
233
+ Matrix = Tuple[float, float, float, float, float, float]
234
+ PathSegment = Union[
235
+ Tuple[str], # Literal['h']
236
+ Tuple[str, float, float], # Literal['m', 'l']
237
+ Tuple[str, float, float, float, float], # Literal['v', 'y']
238
+ Tuple[str, float, float, float, float, float, float],
239
+ ] # Literal['c']
240
+
241
+ # Matrix operations
242
+ MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0)
243
+
244
+
245
+ def parse_rect(o: Any) -> Rect:
246
+ try:
247
+ (x0, y0, x1, y1) = o
248
+ return float(x0), float(y0), float(x1), float(y1)
249
+ except ValueError:
250
+ raise PDFValueError("Could not parse rectangle")
251
+
252
+
253
+ def mult_matrix(m1: Matrix, m0: Matrix) -> Matrix:
254
+ """Returns the multiplication of two matrices."""
255
+ (a1, b1, c1, d1, e1, f1) = m1
256
+ (a0, b0, c0, d0, e0, f0) = m0
257
+ return (
258
+ a0 * a1 + c0 * b1,
259
+ b0 * a1 + d0 * b1,
260
+ a0 * c1 + c0 * d1,
261
+ b0 * c1 + d0 * d1,
262
+ a0 * e1 + c0 * f1 + e0,
263
+ b0 * e1 + d0 * f1 + f0,
264
+ )
265
+
266
+
267
+ def translate_matrix(m: Matrix, v: Point) -> Matrix:
268
+ """Translates a matrix by (x, y)."""
269
+ (a, b, c, d, e, f) = m
270
+ (x, y) = v
271
+ return a, b, c, d, x * a + y * c + e, x * b + y * d + f
272
+
273
+
274
+ def apply_matrix_pt(m: Matrix, v: Point) -> Point:
275
+ """Applies a matrix to a point."""
276
+ (a, b, c, d, e, f) = m
277
+ (x, y) = v
278
+ return a * x + c * y + e, b * x + d * y + f
279
+
280
+
281
+ def apply_matrix_norm(m: Matrix, v: Point) -> Point:
282
+ """Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))"""
283
+ (a, b, c, d, e, f) = m
284
+ (p, q) = v
285
+ return a * p + c * q, b * p + d * q
286
+
287
+
288
+ def matrix_scale(m: Matrix) -> float:
289
+ (a, b, c, d, e, f) = m
290
+ return (a**2 + c**2) ** 0.5
291
+
292
+
293
+ # Utility functions
294
+
295
+
296
+ def isnumber(x: object) -> bool:
297
+ return isinstance(x, (int, float))
298
+
299
+
300
+ _T = TypeVar("_T")
301
+
302
+
303
+ def uniq(objs: Iterable[_T]) -> Iterator[_T]:
304
+ """Eliminates duplicated elements."""
305
+ done = set()
306
+ for obj in objs:
307
+ if obj in done:
308
+ continue
309
+ done.add(obj)
310
+ yield obj
311
+
312
+
313
+ def fsplit(pred: Callable[[_T], bool], objs: Iterable[_T]) -> Tuple[List[_T], List[_T]]:
314
+ """Split a list into two classes according to the predicate."""
315
+ t = []
316
+ f = []
317
+ for obj in objs:
318
+ if pred(obj):
319
+ t.append(obj)
320
+ else:
321
+ f.append(obj)
322
+ return t, f
323
+
324
+
325
+ def drange(v0: float, v1: float, d: int) -> range:
326
+ """Returns a discrete range."""
327
+ return range(int(v0) // d, int(v1 + d) // d)
328
+
329
+
330
+ def get_bound(pts: Iterable[Point]) -> Rect:
331
+ """Compute a minimal rectangle that covers all the points."""
332
+ limit: Rect = (INF, INF, -INF, -INF)
333
+ (x0, y0, x1, y1) = limit
334
+ for x, y in pts:
335
+ x0 = min(x0, x)
336
+ y0 = min(y0, y)
337
+ x1 = max(x1, x)
338
+ y1 = max(y1, y)
339
+ return x0, y0, x1, y1
340
+
341
+
342
+ def pick(
343
+ seq: Iterable[_T],
344
+ func: Callable[[_T], float],
345
+ maxobj: Optional[_T] = None,
346
+ ) -> Optional[_T]:
347
+ """Picks the object obj where func(obj) has the highest value."""
348
+ maxscore = None
349
+ for obj in seq:
350
+ score = func(obj)
351
+ if maxscore is None or maxscore < score:
352
+ (maxscore, maxobj) = (score, obj)
353
+ return maxobj
354
+
355
+
356
+ def choplist(n: int, seq: Iterable[_T]) -> Iterator[Tuple[_T, ...]]:
357
+ """Groups every n elements of the list."""
358
+ r = []
359
+ for x in seq:
360
+ r.append(x)
361
+ if len(r) == n:
362
+ yield tuple(r)
363
+ r = []
364
+
365
+
366
+ def nunpack(s: bytes, default: int = 0) -> int:
367
+ """Unpacks 1 to 4 or 8 byte integers (big endian)."""
368
+ length = len(s)
369
+ if not length:
370
+ return default
371
+ elif length == 1:
372
+ return ord(s)
373
+ elif length == 2:
374
+ return cast(int, struct.unpack(">H", s)[0])
375
+ elif length == 3:
376
+ return cast(int, struct.unpack(">L", b"\x00" + s)[0])
377
+ elif length == 4:
378
+ return cast(int, struct.unpack(">L", s)[0])
379
+ elif length == 8:
380
+ return cast(int, struct.unpack(">Q", s)[0])
381
+ else:
382
+ raise PDFTypeError("invalid length: %d" % length)
383
+
384
+
385
+ PDFDocEncoding = "".join(
386
+ chr(x)
387
+ for x in (
388
+ 0x0000,
389
+ 0x0001,
390
+ 0x0002,
391
+ 0x0003,
392
+ 0x0004,
393
+ 0x0005,
394
+ 0x0006,
395
+ 0x0007,
396
+ 0x0008,
397
+ 0x0009,
398
+ 0x000A,
399
+ 0x000B,
400
+ 0x000C,
401
+ 0x000D,
402
+ 0x000E,
403
+ 0x000F,
404
+ 0x0010,
405
+ 0x0011,
406
+ 0x0012,
407
+ 0x0013,
408
+ 0x0014,
409
+ 0x0015,
410
+ 0x0017,
411
+ 0x0017,
412
+ 0x02D8,
413
+ 0x02C7,
414
+ 0x02C6,
415
+ 0x02D9,
416
+ 0x02DD,
417
+ 0x02DB,
418
+ 0x02DA,
419
+ 0x02DC,
420
+ 0x0020,
421
+ 0x0021,
422
+ 0x0022,
423
+ 0x0023,
424
+ 0x0024,
425
+ 0x0025,
426
+ 0x0026,
427
+ 0x0027,
428
+ 0x0028,
429
+ 0x0029,
430
+ 0x002A,
431
+ 0x002B,
432
+ 0x002C,
433
+ 0x002D,
434
+ 0x002E,
435
+ 0x002F,
436
+ 0x0030,
437
+ 0x0031,
438
+ 0x0032,
439
+ 0x0033,
440
+ 0x0034,
441
+ 0x0035,
442
+ 0x0036,
443
+ 0x0037,
444
+ 0x0038,
445
+ 0x0039,
446
+ 0x003A,
447
+ 0x003B,
448
+ 0x003C,
449
+ 0x003D,
450
+ 0x003E,
451
+ 0x003F,
452
+ 0x0040,
453
+ 0x0041,
454
+ 0x0042,
455
+ 0x0043,
456
+ 0x0044,
457
+ 0x0045,
458
+ 0x0046,
459
+ 0x0047,
460
+ 0x0048,
461
+ 0x0049,
462
+ 0x004A,
463
+ 0x004B,
464
+ 0x004C,
465
+ 0x004D,
466
+ 0x004E,
467
+ 0x004F,
468
+ 0x0050,
469
+ 0x0051,
470
+ 0x0052,
471
+ 0x0053,
472
+ 0x0054,
473
+ 0x0055,
474
+ 0x0056,
475
+ 0x0057,
476
+ 0x0058,
477
+ 0x0059,
478
+ 0x005A,
479
+ 0x005B,
480
+ 0x005C,
481
+ 0x005D,
482
+ 0x005E,
483
+ 0x005F,
484
+ 0x0060,
485
+ 0x0061,
486
+ 0x0062,
487
+ 0x0063,
488
+ 0x0064,
489
+ 0x0065,
490
+ 0x0066,
491
+ 0x0067,
492
+ 0x0068,
493
+ 0x0069,
494
+ 0x006A,
495
+ 0x006B,
496
+ 0x006C,
497
+ 0x006D,
498
+ 0x006E,
499
+ 0x006F,
500
+ 0x0070,
501
+ 0x0071,
502
+ 0x0072,
503
+ 0x0073,
504
+ 0x0074,
505
+ 0x0075,
506
+ 0x0076,
507
+ 0x0077,
508
+ 0x0078,
509
+ 0x0079,
510
+ 0x007A,
511
+ 0x007B,
512
+ 0x007C,
513
+ 0x007D,
514
+ 0x007E,
515
+ 0x0000,
516
+ 0x2022,
517
+ 0x2020,
518
+ 0x2021,
519
+ 0x2026,
520
+ 0x2014,
521
+ 0x2013,
522
+ 0x0192,
523
+ 0x2044,
524
+ 0x2039,
525
+ 0x203A,
526
+ 0x2212,
527
+ 0x2030,
528
+ 0x201E,
529
+ 0x201C,
530
+ 0x201D,
531
+ 0x2018,
532
+ 0x2019,
533
+ 0x201A,
534
+ 0x2122,
535
+ 0xFB01,
536
+ 0xFB02,
537
+ 0x0141,
538
+ 0x0152,
539
+ 0x0160,
540
+ 0x0178,
541
+ 0x017D,
542
+ 0x0131,
543
+ 0x0142,
544
+ 0x0153,
545
+ 0x0161,
546
+ 0x017E,
547
+ 0x0000,
548
+ 0x20AC,
549
+ 0x00A1,
550
+ 0x00A2,
551
+ 0x00A3,
552
+ 0x00A4,
553
+ 0x00A5,
554
+ 0x00A6,
555
+ 0x00A7,
556
+ 0x00A8,
557
+ 0x00A9,
558
+ 0x00AA,
559
+ 0x00AB,
560
+ 0x00AC,
561
+ 0x0000,
562
+ 0x00AE,
563
+ 0x00AF,
564
+ 0x00B0,
565
+ 0x00B1,
566
+ 0x00B2,
567
+ 0x00B3,
568
+ 0x00B4,
569
+ 0x00B5,
570
+ 0x00B6,
571
+ 0x00B7,
572
+ 0x00B8,
573
+ 0x00B9,
574
+ 0x00BA,
575
+ 0x00BB,
576
+ 0x00BC,
577
+ 0x00BD,
578
+ 0x00BE,
579
+ 0x00BF,
580
+ 0x00C0,
581
+ 0x00C1,
582
+ 0x00C2,
583
+ 0x00C3,
584
+ 0x00C4,
585
+ 0x00C5,
586
+ 0x00C6,
587
+ 0x00C7,
588
+ 0x00C8,
589
+ 0x00C9,
590
+ 0x00CA,
591
+ 0x00CB,
592
+ 0x00CC,
593
+ 0x00CD,
594
+ 0x00CE,
595
+ 0x00CF,
596
+ 0x00D0,
597
+ 0x00D1,
598
+ 0x00D2,
599
+ 0x00D3,
600
+ 0x00D4,
601
+ 0x00D5,
602
+ 0x00D6,
603
+ 0x00D7,
604
+ 0x00D8,
605
+ 0x00D9,
606
+ 0x00DA,
607
+ 0x00DB,
608
+ 0x00DC,
609
+ 0x00DD,
610
+ 0x00DE,
611
+ 0x00DF,
612
+ 0x00E0,
613
+ 0x00E1,
614
+ 0x00E2,
615
+ 0x00E3,
616
+ 0x00E4,
617
+ 0x00E5,
618
+ 0x00E6,
619
+ 0x00E7,
620
+ 0x00E8,
621
+ 0x00E9,
622
+ 0x00EA,
623
+ 0x00EB,
624
+ 0x00EC,
625
+ 0x00ED,
626
+ 0x00EE,
627
+ 0x00EF,
628
+ 0x00F0,
629
+ 0x00F1,
630
+ 0x00F2,
631
+ 0x00F3,
632
+ 0x00F4,
633
+ 0x00F5,
634
+ 0x00F6,
635
+ 0x00F7,
636
+ 0x00F8,
637
+ 0x00F9,
638
+ 0x00FA,
639
+ 0x00FB,
640
+ 0x00FC,
641
+ 0x00FD,
642
+ 0x00FE,
643
+ 0x00FF,
644
+ )
645
+ )
646
+
647
+
648
+ def decode_text(s: bytes) -> str:
649
+ """Decodes a PDFDocEncoding string to Unicode."""
650
+ if s.startswith(b"\xfe\xff"):
651
+ return str(s[2:], "utf-16be", "ignore")
652
+ else:
653
+ return "".join(PDFDocEncoding[c] for c in s)
654
+
655
+
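
A short sketch of the two decoding paths above (same assumed import path):

from pdf2zh.utils import decode_text  # assumed module path

# a UTF-16BE byte-order mark switches to UTF-16BE decoding
assert decode_text(b"\xfe\xff\x00H\x00i") == "Hi"
# otherwise every byte is looked up in the PDFDocEncoding table
assert decode_text(b"Hello") == "Hello"
assert decode_text(b"\xa9") == "\u00a9"  # byte 0xA9 maps to the copyright sign
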
656
+ def enc(x: str) -> str:
657
+ """Escapes a string for inclusion in SGML/XML/HTML output."""
658
+ if isinstance(x, bytes):
659
+ return ""
660
+ return escape(x)
661
+
662
+
663
+ def bbox2str(bbox: Rect) -> str:
664
+ (x0, y0, x1, y1) = bbox
665
+ return f"{x0:.3f},{y0:.3f},{x1:.3f},{y1:.3f}"
666
+
667
+
668
+ def matrix2str(m: Matrix) -> str:
669
+ (a, b, c, d, e, f) = m
670
+ return f"[{a:.2f},{b:.2f},{c:.2f},{d:.2f}, ({e:.2f},{f:.2f})]"
671
+
672
+
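
bbox2str and matrix2str produce the fixed-precision coordinate strings used when dumping layout objects; roughly (assumed import path):

from pdf2zh.utils import bbox2str, matrix2str  # assumed module path

print(bbox2str((0, 0, 612.0, 792.0)))         # 0.000,0.000,612.000,792.000
print(matrix2str((1, 0, 0, 1, 10.5, 20.25)))  # [1.00,0.00,0.00,1.00, (10.50,20.25)]
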
673
+ def vecBetweenBoxes(obj1: "LTComponent", obj2: "LTComponent") -> Point:
674
+ """A distance function between two TextBoxes.
675
+
676
+ Consider the bounding rectangle for obj1 and obj2.
677
+ Return the vector between the two boxes' boundaries if they don't overlap;
678
+ otherwise return the vector between the box centers.
679
+
680
+ +------+..........+ (x1, y1)
681
+ | obj1 | :
682
+ +------+www+------+
683
+ : | obj2 |
684
+ (x0, y0) +..........+------+
685
+ """
686
+ (x0, y0) = (min(obj1.x0, obj2.x0), min(obj1.y0, obj2.y0))
687
+ (x1, y1) = (max(obj1.x1, obj2.x1), max(obj1.y1, obj2.y1))
688
+ (ow, oh) = (x1 - x0, y1 - y0)
689
+ (iw, ih) = (ow - obj1.width - obj2.width, oh - obj1.height - obj2.height)
690
+ if iw < 0 and ih < 0:
691
+ # if the boxes overlap, return the vector between their centers
692
+ (xc1, yc1) = ((obj1.x0 + obj1.x1) / 2, (obj1.y0 + obj1.y1) / 2)
693
+ (xc2, yc2) = ((obj2.x0 + obj2.x1) / 2, (obj2.y0 + obj2.y1) / 2)
694
+ return xc1 - xc2, yc1 - yc2
695
+ else:
696
+ return max(0, iw), max(0, ih)
697
+
698
+
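
A quick sketch of the two regimes, using a hypothetical stand-in class that only carries the attributes the function reads (x0, y0, x1, y1, width, height):

from pdf2zh.utils import vecBetweenBoxes  # assumed module path

class Box:  # hypothetical stand-in for LTComponent, for illustration only
    def __init__(self, x0, y0, x1, y1):
        self.x0, self.y0, self.x1, self.y1 = x0, y0, x1, y1
        self.width, self.height = x1 - x0, y1 - y0

print(vecBetweenBoxes(Box(0, 0, 10, 10), Box(20, 0, 30, 10)))  # disjoint, gap vector (10, 0)
print(vecBetweenBoxes(Box(0, 0, 10, 10), Box(2, 2, 8, 8)))     # overlapping, center vector (0.0, 0.0)
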
699
+ LTComponentT = TypeVar("LTComponentT", bound="LTComponent")
700
+
701
+
702
+ class Plane(Generic[LTComponentT]):
703
+ """A set-like data structure for objects placed on a plane.
704
+
705
+ Can efficiently find objects in a certain rectangular area.
706
+ Objects are bucketed into a coarse square grid (cell size gridsize),
707
+ so a range query only scans the buckets that the query box touches.
708
+ """
709
+
710
+ def __init__(self, bbox: Rect, gridsize: int = 50) -> None:
711
+ self._seq: List[LTComponentT] = [] # preserve the object order.
712
+ self._objs: Set[LTComponentT] = set()
713
+ self._grid: Dict[Point, List[LTComponentT]] = {}
714
+ self.gridsize = gridsize
715
+ (self.x0, self.y0, self.x1, self.y1) = bbox
716
+
717
+ def __repr__(self) -> str:
718
+ return "<Plane objs=%r>" % list(self)
719
+
720
+ def __iter__(self) -> Iterator[LTComponentT]:
721
+ return (obj for obj in self._seq if obj in self._objs)
722
+
723
+ def __len__(self) -> int:
724
+ return len(self._objs)
725
+
726
+ def __contains__(self, obj: object) -> bool:
727
+ return obj in self._objs
728
+
729
+ def _getrange(self, bbox: Rect) -> Iterator[Point]:
730
+ (x0, y0, x1, y1) = bbox
731
+ if x1 <= self.x0 or self.x1 <= x0 or y1 <= self.y0 or self.y1 <= y0:
732
+ return
733
+ x0 = max(self.x0, x0)
734
+ y0 = max(self.y0, y0)
735
+ x1 = min(self.x1, x1)
736
+ y1 = min(self.y1, y1)
737
+ for grid_y in drange(y0, y1, self.gridsize):
738
+ for grid_x in drange(x0, x1, self.gridsize):
739
+ yield (grid_x, grid_y)
740
+
741
+ def extend(self, objs: Iterable[LTComponentT]) -> None:
742
+ for obj in objs:
743
+ self.add(obj)
744
+
745
+ def add(self, obj: LTComponentT) -> None:
746
+ """Place an object."""
747
+ for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
748
+ if k not in self._grid:
749
+ r: List[LTComponentT] = []
750
+ self._grid[k] = r
751
+ else:
752
+ r = self._grid[k]
753
+ r.append(obj)
754
+ self._seq.append(obj)
755
+ self._objs.add(obj)
756
+
757
+ def remove(self, obj: LTComponentT) -> None:
758
+ """Displace an object."""
759
+ for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
760
+ try:
761
+ self._grid[k].remove(obj)
762
+ except (KeyError, ValueError):
763
+ pass
764
+ self._objs.remove(obj)
765
+
766
+ def find(self, bbox: Rect) -> Iterator[LTComponentT]:
767
+ """Finds objects that are in a certain area."""
768
+ (x0, y0, x1, y1) = bbox
769
+ done = set()
770
+ for k in self._getrange(bbox):
771
+ if k not in self._grid:
772
+ continue
773
+ for obj in self._grid[k]:
774
+ if obj in done:
775
+ continue
776
+ done.add(obj)
777
+ if obj.x1 <= x0 or x1 <= obj.x0 or obj.y1 <= y0 or y1 <= obj.y0:
778
+ continue
779
+ yield obj
780
+
781
+
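
A rough usage sketch for Plane, reusing the same kind of stand-in box (real callers pass LTComponent instances; import path assumed):

from pdf2zh.utils import Plane  # assumed module path

class Box:  # hypothetical stand-in, as in the sketch above
    def __init__(self, x0, y0, x1, y1):
        self.x0, self.y0, self.x1, self.y1 = x0, y0, x1, y1
        self.width, self.height = x1 - x0, y1 - y0

page = Plane((0, 0, 612, 792), gridsize=50)
page.extend([Box(10, 10, 50, 30), Box(300, 500, 400, 550)])
print(len(list(page.find((0, 0, 100, 100)))))  # 1, only the first box intersects the query
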
782
+ ROMAN_ONES = ["i", "x", "c", "m"]
783
+ ROMAN_FIVES = ["v", "l", "d"]
784
+
785
+
786
+ def format_int_roman(value: int) -> str:
787
+ """Format a number as lowercase Roman numerals."""
788
+ assert 0 < value < 4000
789
+ result: List[str] = []
790
+ index = 0
791
+
792
+ while value != 0:
793
+ value, remainder = divmod(value, 10)
794
+ if remainder == 9:
795
+ result.insert(0, ROMAN_ONES[index])
796
+ result.insert(1, ROMAN_ONES[index + 1])
797
+ elif remainder == 4:
798
+ result.insert(0, ROMAN_ONES[index])
799
+ result.insert(1, ROMAN_FIVES[index])
800
+ else:
801
+ over_five = remainder >= 5
802
+ if over_five:
803
+ result.insert(0, ROMAN_FIVES[index])
804
+ remainder -= 5
805
+ result.insert(1 if over_five else 0, ROMAN_ONES[index] * remainder)
806
+ index += 1
807
+
808
+ return "".join(result)
809
+
810
+
811
+ def format_int_alpha(value: int) -> str:
812
+ """Format a number as lowercase letters a-z, aa-zz, etc."""
813
+ assert value > 0
814
+ result: List[str] = []
815
+
816
+ while value != 0:
817
+ value, remainder = divmod(value - 1, len(string.ascii_lowercase))
818
+ result.append(string.ascii_lowercase[remainder])
819
+
820
+ result.reverse()
821
+ return "".join(result)
822
+
823
+
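
Spot checks for the two page-label formatters (import path assumed):

from pdf2zh.utils import format_int_alpha, format_int_roman  # assumed module path

assert format_int_roman(4) == "iv"
assert format_int_roman(2024) == "mmxxiv"
assert format_int_alpha(1) == "a"
assert format_int_alpha(28) == "ab"
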
824
+ def get_device():
825
+ """Get the device to use for computation."""
826
+ try:
827
+ import torch
828
+
829
+ if torch.cuda.is_available():
830
+ return "cuda:0"
831
+ except ImportError:
832
+ pass
833
+
834
+ return "cpu"
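
get_device keeps the torch import inside the function, so the base install (without the optional torch extra declared in pyproject.toml below) never needs it. A trivial sketch of the expected behaviour (import path assumed):

from pdf2zh.utils import get_device  # assumed module path

print(get_device())  # "cuda:0" when torch is installed and a GPU is visible, otherwise "cpu"
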
pyproject.toml ADDED
@@ -0,0 +1,52 @@
1
+ [project]
2
+ name = "pdf2zh"
3
+ version = "1.8.0"
4
+ description = "Latex PDF Translator"
5
+ authors = [{ name = "Byaidu", email = "[email protected]" }]
6
+ license = "AGPL-3.0"
7
+ readme = "README.md"
8
+ requires-python = ">=3.9,<3.13"
9
+ classifiers = [
10
+ "Programming Language :: Python :: 3",
11
+ "Operating System :: OS Independent",
12
+ ]
13
+ dependencies = [
14
+ "charset-normalizer",
15
+ "cryptography",
16
+ "requests",
17
+ "pymupdf",
18
+ "tqdm",
19
+ "tenacity",
20
+ "numpy",
21
+ "ollama",
22
+ "deepl<1.19.1",
23
+ "openai",
24
+ "requests",
25
+ "azure-ai-translation-text<=1.0.1",
26
+ "gradio",
27
+ "huggingface_hub",
28
+ "onnx",
29
+ "onnxruntime",
30
+ "opencv-python-headless",
31
+ ]
32
+
33
+ [project.optional-dependencies]
34
+ torch = [
35
+ "doclayout-yolo",
36
+ "torch",
37
+ ]
38
+ dev = [
39
+ "black",
40
+ "flake8",
41
+ "pre-commit"
42
+ ]
43
+
44
+ [project.urls]
45
+ Homepage = "https://github.com/Byaidu/PDFMathTranslate"
46
+
47
+ [build-system]
48
+ requires = ["hatchling"]
49
+ build-backend = "hatchling.build"
50
+
51
+ [project.scripts]
52
+ pdf2zh = "pdf2zh.pdf2zh:main"
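
The [project.scripts] table is what turns pdf2zh.pdf2zh:main into the pdf2zh console command once the package is installed. On Python 3.10+ the registered mapping can be inspected like this (illustrative only):

from importlib.metadata import entry_points

for ep in entry_points(group="console_scripts"):
    if ep.name == "pdf2zh":
        print(ep.value)  # pdf2zh.pdf2zh:main
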
setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [flake8]
2
+ max-line-length = 120
3
+ ignore = E203,W503,E261
4
+ exclude = .git,build,dist,docs