File size: 6,445 Bytes
d1ceb73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
"""A preprocessor that extracts all of the outputs from the
notebook file.  The extracted outputs are returned in the 'resources' dictionary.
"""

# Copyright (c) IPython Development Team.
# Distributed under the terms of the Modified BSD License.

import json
import os
import sys
from binascii import a2b_base64
from mimetypes import guess_extension
from textwrap import dedent

from traitlets import Set, Unicode

from .base import Preprocessor


def guess_extension_without_jpe(mimetype):
    """
    Guess the file extension for a mimetype, never answering '.jpe'.

    This is mimetypes.guess_extension with one adjustment: a '.jpe'
    result, which latex does not recognise for jpeg images, is reported
    as '.jpeg' instead. Every other result, including None when no
    extension can be guessed, is passed through unchanged.
    """
    guessed = guess_extension(mimetype)
    return ".jpeg" if guessed == ".jpe" else guessed


def platform_utf_8_encode(data):
    """Encode str data as UTF-8 bytes, using CRLF line endings on Windows.

    Values that are not ``str`` (for example data that is already bytes)
    are returned untouched.
    """
    if not isinstance(data, str):
        return data
    # On win32 every "\n" is expanded to "\r\n" before encoding.
    text = data.replace("\n", "\r\n") if sys.platform == "win32" else data
    return text.encode("utf-8")


class ExtractOutputPreprocessor(Preprocessor):
    """
    Extracts all of the outputs from the notebook file.  The extracted
    outputs are returned in the 'resources' dictionary.
    """

    # Template for generated filenames; it is formatted with the keyword
    # arguments unique_key, cell_index, index and extension.
    output_filename_template = Unicode("{unique_key}_{cell_index}_{index}{extension}").tag(
        config=True
    )

    # Mime types whose data is copied out of the cell outputs.
    extract_output_types = Set({"image/png", "image/jpeg", "image/svg+xml", "application/pdf"}).tag(
        config=True
    )

    def preprocess_cell(self, cell, resources, cell_index):
        """
        Extract the outputs of a single cell into ``resources['outputs']``.

        For every ``display_data`` / ``execute_result`` output of the cell,
        the data of each mime type listed in ``extract_output_types`` is
        stored in ``resources['outputs']`` under a filename, and that
        filename is recorded on the output itself in
        ``metadata['filenames'][mime_type]``.  Any ``text/html`` data of
        such an output is dedented in place.

        Parameters
        ----------
        cell : NotebookNode cell
            Notebook cell being processed
        resources : dictionary
            Additional resources used in the conversion process.  Allows
            preprocessors to pass variables into the Jinja engine.
        cell_index : int
            Index of the cell being processed (see base.py)

        Returns
        -------
        tuple
            The ``(cell, resources)`` pair, both modified in place.

        Raises
        ------
        ValueError
            If a filename is already present in ``resources['outputs']``.
        """

        # Get the unique key from the resource dict if it exists.  If it does not
        # exist, use 'output' as the default.  Also, get files directory if it
        # has been specified
        unique_key = resources.get("unique_key", "output")
        output_files_dir = resources.get("output_files_dir", None)

        # Make sure the outputs key exists and holds a dict.  ``get`` is used
        # so that a plain dict without the key is initialised instead of
        # raising KeyError.
        if not isinstance(resources.get("outputs"), dict):
            resources["outputs"] = {}

        # Loop through all of the outputs in the cell
        for index, out in enumerate(cell.get("outputs", [])):
            if out.output_type not in {"display_data", "execute_result"}:
                continue
            if "text/html" in out.data:
                out["data"]["text/html"] = dedent(out["data"]["text/html"])
            # Get the output in data formats that the template needs extracted
            for mime_type in self.extract_output_types:
                if mime_type not in out.data:
                    continue
                data = out.data[mime_type]

                # Binary files are base64-encoded, SVG is already XML
                if mime_type in {"image/png", "image/jpeg", "application/pdf"}:
                    # data is b64-encoded as text (str, unicode),
                    # we want the original bytes
                    data = a2b_base64(data)
                elif mime_type == "application/json" or not isinstance(data, str):
                    # Data is either JSON-like and was parsed into a Python
                    # object according to the spec, or data is for sure
                    # JSON. In the latter case we want to go extra sure that
                    # we enclose a scalar string value into extra quotes by
                    # serializing it properly.
                    if isinstance(data, bytes):
                        # We need to guess the encoding in this
                        # instance. Some modules that return raw data like
                        # svg can leave the data in byte form instead of str
                        data = data.decode("utf-8")
                    data = platform_utf_8_encode(json.dumps(data))
                else:
                    # All other text_type data will fall into this path
                    data = platform_utf_8_encode(data)

                ext = guess_extension_without_jpe(mime_type)
                if ext is None:
                    # No known extension: fall back to the mime subtype.
                    ext = "." + mime_type.rsplit("/", 1)[-1]

                # NOTE(review): a filename taken from output metadata is used
                # as-is (and joined onto output_files_dir below) without any
                # validation here -- confirm whoever writes the files guards
                # against unwanted paths.
                if out.metadata.get("filename", ""):
                    filename = out.metadata["filename"]
                    if not filename.endswith(ext):
                        filename += ext
                else:
                    filename = self.output_filename_template.format(
                        unique_key=unique_key, cell_index=cell_index, index=index, extension=ext
                    )

                # On the cell, make the figure available via
                #   cell.outputs[i].metadata.filenames['mime/type']
                # where
                #   cell.outputs[i].data['mime/type'] contains the data
                if output_files_dir is not None:
                    filename = os.path.join(output_files_dir, filename)
                out.metadata.setdefault("filenames", {})
                out.metadata["filenames"][mime_type] = filename

                if filename in resources["outputs"]:
                    msg = (
                        "Your outputs have filename metadata associated "
                        "with them. Nbconvert saves these outputs to "
                        "external files using this filename metadata. "
                        "Filenames need to be unique across the notebook, "
                        f"or images will be overwritten. The filename {filename} is "
                        "associated with more than one output. The second "
                        "output associated with this filename is in cell "
                        f"{cell_index}."
                    )
                    raise ValueError(msg)
                # In the resources, make the figure available via
                #   resources['outputs']['filename'] = data
                resources["outputs"][filename] = data

        return cell, resources