File size: 2,829 Bytes
153628e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Copyright (C) 2021-2024, Mindee.

# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

from pathlib import Path
from typing import List, Sequence, Union

import numpy as np

from doctr.file_utils import requires_package
from doctr.utils.common_types import AbstractFile

from .html import read_html
from .image import read_img_as_numpy
from .pdf import read_pdf

__all__ = ["DocumentFile"]


class DocumentFile:
    """Entry points for loading documents (PDF files, web pages, images) as lists of numpy pages"""

    @classmethod
    def from_pdf(cls, file: AbstractFile, **kwargs) -> List[np.ndarray]:
        """Load the pages of a PDF document

        >>> from doctr.io import DocumentFile
        >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")

        Args:
        ----
            file: the path to the PDF file or a binary stream
            **kwargs: forwarded untouched to ``read_pdf`` (additional parameters to
                :meth:`pypdfium2.PdfPage.render`)

        Returns:
        -------
            the list of pages decoded as numpy ndarray of shape H x W x 3
        """
        pages = read_pdf(file, **kwargs)
        return pages

    @classmethod
    def from_url(cls, url: str, **kwargs) -> List[np.ndarray]:
        """Render a web page to PDF, then load it like any other PDF document

        >>> from doctr.io import DocumentFile
        >>> doc = DocumentFile.from_url("https://www.yoursite.com")

        Args:
        ----
            url: the URL of the target web page
            **kwargs: forwarded to :meth:`from_pdf` (additional parameters to
                :meth:`pypdfium2.PdfPage.render`)

        Returns:
        -------
            the list of pages decoded as numpy ndarray of shape H x W x 3
        """
        # The weasyprint check runs before anything else so the caller gets the
        # installation hint instead of a failure from inside the HTML reader.
        missing_dependency_hint = (
            "`.from_url` requires weasyprint installed.\n"
            "Installation instructions: https://doc.courtbouillon.org/weasyprint/stable/first_steps.html#installation"
        )
        requires_package("weasyprint", missing_dependency_hint)

        rendered_pdf = read_html(url)
        return cls.from_pdf(rendered_pdf, **kwargs)

    @classmethod
    def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]:
        """Decode one image, or a collection of images, into numpy pages

        >>> from doctr.io import DocumentFile
        >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])

        Args:
        ----
            files: the path to the image file or a binary stream, or a collection of those
            **kwargs: additional parameters to :meth:`doctr.io.image.read_img_as_numpy`

        Returns:
        -------
            the list of pages decoded as numpy ndarray of shape H x W x 3
        """
        # A lone path / byte string is itself iterable, so it has to be wrapped
        # explicitly; otherwise the loop below would walk over its characters/bytes.
        single_input_types = (str, Path, bytes)
        sources = [files] if isinstance(files, single_input_types) else files

        pages = []
        for source in sources:
            pages.append(read_img_as_numpy(source, **kwargs))
        return pages