File size: 5,463 Bytes
d1ceb73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
"""
NBConvert Preprocessor for sanitizing HTML rendering of notebooks.
"""

import warnings

from bleach import ALLOWED_ATTRIBUTES, ALLOWED_TAGS, clean
from traitlets import Any, Bool, List, Set, Unicode

from .base import Preprocessor

# Feature flags recording which bleach CSS-sanitizing API could be imported.
# At most one of them ends up True; both stay False when neither is available.
_USE_BLEACH_CSS_SANITIZER = False
_USE_BLEACH_STYLES = False


try:
    # bleach[css] >=5.0
    from bleach.css_sanitizer import ALLOWED_CSS_PROPERTIES as ALLOWED_STYLES
    from bleach.css_sanitizer import CSSSanitizer

    _USE_BLEACH_CSS_SANITIZER = True
    _USE_BLEACH_STYLES = False
except ImportError:
    try:
        # bleach <5
        from bleach import ALLOWED_STYLES  # type:ignore[attr-defined, no-redef]

        _USE_BLEACH_CSS_SANITIZER = False
        _USE_BLEACH_STYLES = True
        warnings.warn(
            "Support for bleach <5 will be removed in a future version of nbconvert",
            DeprecationWarning,
            stacklevel=2,
        )

    except ImportError:
        # Neither API is importable.  ALLOWED_STYLES must still be bound,
        # because it is used as a trait default further down in this module;
        # leaving it undefined would turn the warning below into a NameError
        # at import time.  An empty collection allows no CSS properties.
        ALLOWED_STYLES = frozenset()
        warnings.warn(
            "The installed bleach/tinycss2 do not provide CSS sanitization, "
            "please upgrade to bleach >=5",
            UserWarning,
            stacklevel=2,
        )


# Public API: star-imports of this module expose only the preprocessor class.
__all__ = ["SanitizeHTML"]


class SanitizeHTML(Preprocessor):
    """A preprocessor to sanitize html.

    Raw and markdown cell sources, and the HTML-like entries of code cell
    outputs, are passed through ``bleach.clean`` using the configurable
    allow-lists declared below.  Output entries whose key is neither in
    ``safe_output_keys`` nor in ``sanitized_output_types`` are removed.
    """

    # Bleach config.
    attributes = Any(
        config=True,
        default_value=ALLOWED_ATTRIBUTES,
        help="Allowed HTML tag attributes",
    )
    tags = List(
        Unicode(),
        config=True,
        default_value=ALLOWED_TAGS,  # type:ignore[arg-type]
        help="List of HTML tags to allow",
    )
    styles = List(
        Unicode(),
        config=True,
        default_value=ALLOWED_STYLES,  # type:ignore[arg-type]
        help="Allowed CSS styles if <style> tag is allowed",
    )
    strip = Bool(
        config=True,
        default_value=False,
        help="If True, remove unsafe markup entirely instead of escaping",
    )
    strip_comments = Bool(
        config=True,
        default_value=True,
        help="If True, strip comments from escaped HTML",
    )

    # Display data config.
    safe_output_keys = Set(
        config=True,
        default_value={
            "metadata",  # Not a mimetype per-se, but expected and safe.
            "text/plain",
            "text/latex",
            "application/json",
            "image/png",
            "image/jpeg",
        },
        help="Cell output mimetypes to render without modification",
    )
    sanitized_output_types = Set(
        config=True,
        default_value={
            "text/html",
            "text/markdown",
        },
        help="Cell output types to display after escaping with Bleach.",
    )

    def preprocess_cell(self, cell, resources, cell_index):
        """
        Sanitize potentially-dangerous contents of the cell.

        Cell Types:
          raw:
            Sanitize literal HTML
          markdown:
            Sanitize literal HTML
          code:
            Sanitize outputs that could result in code execution

        Parameters
        ----------
        cell
            The notebook cell to sanitize; it is modified in place.
        resources
            Passed through unchanged.
        cell_index
            Unused by this preprocessor.

        Returns
        -------
        tuple
            The ``(cell, resources)`` pair.  Cells of any other type are
            returned untouched rather than as ``None``, so that callers
            unpacking the result keep working.
        """
        if cell.cell_type in ("raw", "markdown"):
            # Sanitize all raw cells anyway.
            # Only ones with the text/html mimetype should be emitted
            # but erring on the side of safety maybe.
            cell.source = self.sanitize_html_tags(cell.source)
        elif cell.cell_type == "code":
            cell.outputs = self.sanitize_code_outputs(cell.outputs)
        # Unrecognised cell types fall through and are passed along as-is.
        return cell, resources

    def sanitize_code_outputs(self, outputs):
        """
        Sanitize code cell outputs.

        For every output that is not a ``stream`` or ``error`` output, each
        key of its ``data`` mapping is handled as follows:

        * keys in ``safe_output_keys`` are kept unmodified;
        * keys in ``sanitized_output_types`` (by default 'text/html' and
          'text/markdown') are run through `sanitize_html_tags`;
        * every other key (e.g. 'text/javascript') is removed.

        The outputs are modified in place and the same list is returned.
        """
        for output in outputs:
            # These are always ascii, so nothing to escape.
            # They are skipped before ``output.data`` is accessed.
            if output["output_type"] in ("stream", "error"):
                continue
            data = output.data
            to_remove = []
            for key in data:
                if key in self.safe_output_keys:
                    continue
                if key in self.sanitized_output_types:
                    self.log.info("Sanitizing %s", key)
                    data[key] = self.sanitize_html_tags(data[key])
                else:
                    # Mark key for removal. (Python doesn't allow deletion of
                    # keys from a dict during iteration)
                    to_remove.append(key)
            for key in to_remove:
                self.log.info("Removing %s", key)
                del data[key]
        return outputs

    def sanitize_html_tags(self, html_str):
        """
        Sanitize a string containing raw HTML tags.

        Calls ``bleach.clean`` with the ``tags``, ``attributes``, ``strip``
        and ``strip_comments`` traits.  The ``styles`` trait is forwarded
        either as a ``CSSSanitizer`` (``css_sanitizer=``) or as ``styles=``,
        depending on which bleach API was importable at module load; when
        neither was, no CSS argument is passed.

        Returns the cleaned string produced by ``bleach.clean``.
        """
        kwargs = {
            "tags": self.tags,
            "attributes": self.attributes,
            "strip": self.strip,
            "strip_comments": self.strip_comments,
        }

        if _USE_BLEACH_CSS_SANITIZER:
            css_sanitizer = CSSSanitizer(allowed_css_properties=self.styles)
            kwargs.update(css_sanitizer=css_sanitizer)
        elif _USE_BLEACH_STYLES:
            kwargs.update(styles=self.styles)

        return clean(html_str, **kwargs)


def _get_default_css_sanitizer():
    """Return a ``CSSSanitizer`` for the default allowed styles, if available.

    When the ``bleach.css_sanitizer`` API was not importable at module load
    (``_USE_BLEACH_CSS_SANITIZER`` is false), ``None`` is returned instead.
    """
    if not _USE_BLEACH_CSS_SANITIZER:
        return None
    return CSSSanitizer(allowed_css_properties=ALLOWED_STYLES)