Upload 527 files
This view is limited to 50 files because it contains too many changes; see the raw diff for the complete change set.
- MLPY/Lib/site-packages/tensorboard/__init__.py +113 -0
- MLPY/Lib/site-packages/tensorboard/__pycache__/__init__.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/__pycache__/assets.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/__pycache__/auth.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/__pycache__/context.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/__pycache__/data_compat.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/__pycache__/dataclass_compat.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/__pycache__/default.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/__pycache__/errors.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/__pycache__/lazy.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/__pycache__/main.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/__pycache__/main_lib.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/__pycache__/manager.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/__pycache__/notebook.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/__pycache__/plugin_util.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/__pycache__/program.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/__pycache__/version.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/__init__.py +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/__pycache__/__init__.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__init__.py +124 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/__init__.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/callbacks.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/encoding.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/linkifier.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/sanitizer.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/utils.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/version.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/bleach/callbacks.py +25 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/bleach/encoding.py +62 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/bleach/linkifier.py +526 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/bleach/sanitizer.py +368 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/bleach/utils.py +23 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/bleach/version.py +6 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__init__.py +35 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/__init__.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/_ihatexml.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/_inputstream.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/_tokenizer.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/_utils.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/constants.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/html5parser.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/serializer.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_ihatexml.py +289 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_inputstream.py +918 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_tokenizer.py +1735 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/__init__.py +5 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/__pycache__/__init__.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/__pycache__/_base.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/__pycache__/py.cpython-39.pyc +0 -0
- MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/_base.py +40 -0
MLPY/Lib/site-packages/tensorboard/__init__.py
ADDED
@@ -0,0 +1,113 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TensorBoard is a webapp for understanding TensorFlow runs and graphs."""


from tensorboard import lazy as _lazy
from tensorboard import version as _version

# TensorBoard public API.
__all__ = [
    "__version__",
    "errors",
    "notebook",
    "program",
    "summary",
]


# Please be careful when changing the structure of this file.
#
# The lazy imports in this file must use `importlib.import_module`, not
# `import tensorboard.foo` or `from tensorboard import foo`, or it will
# be impossible to reload the TensorBoard module without breaking these
# top-level public APIs. This has to do with the gory details of
# Python's module system. Take `tensorboard.notebook` as an example:
#
#   - When the `tensorboard` module (that's us!) is initialized, its
#     `notebook` attribute is initialized to a new LazyModule. The
#     actual `tensorboard.notebook` submodule is not loaded.
#
#   - When the `tensorboard.notebook` submodule is first loaded, Python
#     _reassigns_ the `notebook` attribute on the `tensorboard` module
#     object to point to the underlying `tensorboard.notebook` module
#     object, rather than its former LazyModule value. This occurs
#     whether the module is loaded via the lazy module or directly as an
#     import:
#
#       - import tensorboard; tensorboard.notebook.start(...)  # one way
#       - from tensorboard import notebook  # other way; same effect
#
#   - When the `tensorboard` module is reloaded, its `notebook`
#     attribute is once again bound to a (new) LazyModule, while the
#     `tensorboard.notebook` module object is unaffected and still
#     exists in `sys.modules`. But then...
#
#   - When the new LazyModule is forced, it must resolve to the existing
#     `tensorboard.notebook` module object rather than itself (which
#     just creates a stack overflow). If the LazyModule load function
#     uses `import tensorboard.notebook; return tensorboard.notebook`,
#     then the first statement will do _nothing_ because the
#     `tensorboard.notebook` module is already loaded, and the second
#     statement will return the LazyModule itself. The same goes for the
#     `from tensorboard import notebook` form. We need to ensure that
#     the submodule is loaded and then pull the actual module object out
#     of `sys.modules`... which is exactly what `importlib` handles for
#     us.
#
# See <https://github.com/tensorflow/tensorboard/issues/1989> for
# additional discussion.


@_lazy.lazy_load("tensorboard.errors")
def errors():
    import importlib

    return importlib.import_module("tensorboard.errors")


@_lazy.lazy_load("tensorboard.notebook")
def notebook():
    import importlib

    return importlib.import_module("tensorboard.notebook")


@_lazy.lazy_load("tensorboard.program")
def program():
    import importlib

    return importlib.import_module("tensorboard.program")


@_lazy.lazy_load("tensorboard.summary")
def summary():
    import importlib

    return importlib.import_module("tensorboard.summary")


def load_ipython_extension(ipython):
    """IPython API entry point.

    Only intended to be called by the IPython runtime.

    See:
      https://ipython.readthedocs.io/en/stable/config/extensions/index.html
    """
    notebook._load_ipython_extension(ipython)


__version__ = _version.VERSION
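
A minimal, self-contained sketch (not part of the upload) of the resolution rule the comment block above insists on: pull the real submodule out of sys.modules via importlib instead of re-importing it by name. The helper name is illustrative.

    import importlib
    import sys

    def resolve(name):
        """Return the real module object for `name`, loading it if needed.

        importlib.import_module loads the submodule on first use and returns
        the object registered in sys.modules on later calls, which is the
        behaviour the lazy loaders above rely on.
        """
        module = importlib.import_module(name)
        assert sys.modules[name] is module
        return module
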
MLPY/Lib/site-packages/tensorboard/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (1.41 kB).

MLPY/Lib/site-packages/tensorboard/__pycache__/assets.cpython-39.pyc
ADDED
Binary file (1.01 kB).

MLPY/Lib/site-packages/tensorboard/__pycache__/auth.cpython-39.pyc
ADDED
Binary file (3.45 kB).

MLPY/Lib/site-packages/tensorboard/__pycache__/context.cpython-39.pyc
ADDED
Binary file (4.19 kB).

MLPY/Lib/site-packages/tensorboard/__pycache__/data_compat.cpython-39.pyc
ADDED
Binary file (4.99 kB).

MLPY/Lib/site-packages/tensorboard/__pycache__/dataclass_compat.cpython-39.pyc
ADDED
Binary file (6.47 kB).

MLPY/Lib/site-packages/tensorboard/__pycache__/default.cpython-39.pyc
ADDED
Binary file (4.02 kB).

MLPY/Lib/site-packages/tensorboard/__pycache__/errors.cpython-39.pyc
ADDED
Binary file (4.57 kB).

MLPY/Lib/site-packages/tensorboard/__pycache__/lazy.cpython-39.pyc
ADDED
Binary file (2.84 kB).

MLPY/Lib/site-packages/tensorboard/__pycache__/main.cpython-39.pyc
ADDED
Binary file (1.26 kB).

MLPY/Lib/site-packages/tensorboard/__pycache__/main_lib.cpython-39.pyc
ADDED
Binary file (1.33 kB).

MLPY/Lib/site-packages/tensorboard/__pycache__/manager.cpython-39.pyc
ADDED
Binary file (15 kB).

MLPY/Lib/site-packages/tensorboard/__pycache__/notebook.cpython-39.pyc
ADDED
Binary file (11.7 kB).

MLPY/Lib/site-packages/tensorboard/__pycache__/plugin_util.cpython-39.pyc
ADDED
Binary file (6.67 kB).

MLPY/Lib/site-packages/tensorboard/__pycache__/program.cpython-39.pyc
ADDED
Binary file (26.7 kB).

MLPY/Lib/site-packages/tensorboard/__pycache__/version.cpython-39.pyc
ADDED
Binary file (257 Bytes).

MLPY/Lib/site-packages/tensorboard/_vendor/__init__.py
ADDED
File without changes

MLPY/Lib/site-packages/tensorboard/_vendor/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (156 Bytes).
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__init__.py
ADDED
@@ -0,0 +1,124 @@
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

from tensorboard._vendor.bleach.linkifier import (
    DEFAULT_CALLBACKS,
    Linker,
    LinkifyFilter,
)
from tensorboard._vendor.bleach.sanitizer import (
    ALLOWED_ATTRIBUTES,
    ALLOWED_PROTOCOLS,
    ALLOWED_STYLES,
    ALLOWED_TAGS,
    BleachSanitizerFilter,
    Cleaner,
)
from tensorboard._vendor.bleach.version import __version__, VERSION  # flake8: noqa

__all__ = ['clean', 'linkify']


def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
          strip_comments=True):
    """Clean an HTML fragment of malicious content and return it

    This function is a security-focused function whose sole purpose is to
    remove malicious content from a string such that it can be displayed as
    content in a web page.

    This function is not designed to use to transform content to be used in
    non-web-page contexts.

    Example::

        import bleach

        better_text = bleach.clean(yucky_text)


    .. Note::

       If you're cleaning a lot of text and passing the same argument values or
       you want more configurability, consider using a
       :py:class:`bleach.sanitizer.Cleaner` instance.

    :arg str text: the text to clean

    :arg list tags: allowed list of tags; defaults to
        ``bleach.sanitizer.ALLOWED_TAGS``

    :arg dict attributes: allowed attributes; can be a callable, list or dict;
        defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

    :arg list styles: allowed list of css styles; defaults to
        ``bleach.sanitizer.ALLOWED_STYLES``

    :arg list protocols: allowed list of protocols for links; defaults
        to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

    :arg bool strip: whether or not to strip disallowed elements

    :arg bool strip_comments: whether or not to strip HTML comments

    :returns: cleaned text as unicode

    """
    cleaner = Cleaner(
        tags=tags,
        attributes=attributes,
        styles=styles,
        protocols=protocols,
        strip=strip,
        strip_comments=strip_comments,
    )
    return cleaner.clean(text)


def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False):
    """Convert URL-like strings in an HTML fragment to links

    This function converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify does a best-effort approach and tries to recover from bad
    situations due to crazy text.

    .. Note::

       If you're linking a lot of text and passing the same argument values or
       you want more configurability, consider using a
       :py:class:`bleach.linkifier.Linker` instance.

    .. Note::

       If you have text that you want to clean and then linkify, consider using
       the :py:class:`bleach.linkifier.LinkifyFilter` as a filter in the clean
       pass. That way you're not parsing the HTML twice.

    :arg str text: the text to linkify

    :arg list callbacks: list of callbacks to run when adjusting tag attributes;
        defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

    :arg list skip_tags: list of tags that you don't want to linkify the
        contents of; for example, you could set this to ``['pre']`` to skip
        linkifying contents of ``pre`` tags

    :arg bool parse_email: whether or not to linkify email addresses

    :returns: linkified text as unicode

    """
    linker = Linker(
        callbacks=callbacks,
        skip_tags=skip_tags,
        parse_email=parse_email
    )
    return linker.linkify(text)
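
An illustrative sketch (not part of the upload) of the two helpers defined above, imported through the vendored path used in this tree; the sample strings are made up.

    from tensorboard._vendor import bleach

    # Disallowed tags are escaped by default; strip=True removes them instead.
    safe = bleach.clean('<b>hi</b> <script>alert(1)</script>')

    # Bare domains are wrapped in <a> tags; the default nofollow callback
    # adds rel="nofollow" to links created by linkify.
    linked = bleach.linkify('see example.com for details')
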
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (3.79 kB).

MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/callbacks.cpython-39.pyc
ADDED
Binary file (1.06 kB).

MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/encoding.cpython-39.pyc
ADDED
Binary file (1.6 kB).

MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/linkifier.cpython-39.pyc
ADDED
Binary file (11.2 kB).

MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/sanitizer.cpython-39.pyc
ADDED
Binary file (8.63 kB).

MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/utils.cpython-39.pyc
ADDED
Binary file (1.03 kB).

MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/version.cpython-39.pyc
ADDED
Binary file (401 Bytes).
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/callbacks.py
ADDED
@@ -0,0 +1,25 @@
"""A set of basic callbacks for bleach.linkify."""
from __future__ import unicode_literals


def nofollow(attrs, new=False):
    href_key = (None, u'href')
    if href_key not in attrs or attrs[href_key].startswith(u'mailto:'):
        return attrs

    rel_key = (None, u'rel')
    rel_values = [val for val in attrs.get(rel_key, u'').split(u' ') if val]
    if u'nofollow' not in [rel_val.lower() for rel_val in rel_values]:
        rel_values.append(u'nofollow')
    attrs[rel_key] = u' '.join(rel_values)

    return attrs


def target_blank(attrs, new=False):
    href_key = (None, u'href')
    if attrs[href_key].startswith(u'mailto:'):
        return attrs

    attrs[(None, u'target')] = u'_blank'
    return attrs
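
A short sketch (not part of the upload) of how callbacks like the two above plug into linkify; the force_https callback is hypothetical.

    from tensorboard._vendor import bleach
    from tensorboard._vendor.bleach import callbacks

    def force_https(attrs, new=False):
        # Hypothetical callback: rewrite http:// hrefs to https://.
        href_key = (None, u'href')
        href = attrs.get(href_key, u'')
        if href.startswith(u'http://'):
            attrs[href_key] = u'https://' + href[len(u'http://'):]
        return attrs

    html = bleach.linkify(
        'docs at example.com',
        callbacks=[callbacks.nofollow, callbacks.target_blank, force_https],
    )
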
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/encoding.py
ADDED
@@ -0,0 +1,62 @@
import datetime
from decimal import Decimal
import types
import six


def is_protected_type(obj):
    """Determine if the object instance is of a protected type.

    Objects of protected types are preserved as-is when passed to
    force_unicode(strings_only=True).
    """
    return isinstance(obj, (
        six.integer_types +
        (types.NoneType,
         datetime.datetime, datetime.date, datetime.time,
         float, Decimal))
    )


def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
    """
    Similar to smart_text, except that lazy instances are resolved to
    strings, rather than kept as lazy objects.

    If strings_only is True, don't convert (some) non-string-like objects.
    """
    # Handle the common case first, saves 30-40% when s is an instance of
    # six.text_type. This function gets called often in that setting.
    if isinstance(s, six.text_type):
        return s
    if strings_only and is_protected_type(s):
        return s
    try:
        if not isinstance(s, six.string_types):
            if hasattr(s, '__unicode__'):
                s = s.__unicode__()
            else:
                if six.PY3:
                    if isinstance(s, bytes):
                        s = six.text_type(s, encoding, errors)
                    else:
                        s = six.text_type(s)
                else:
                    s = six.text_type(bytes(s), encoding, errors)
        else:
            # Note: We use .decode() here, instead of six.text_type(s,
            # encoding, errors), so that if s is a SafeBytes, it ends up being
            # a SafeText at the end.
            s = s.decode(encoding, errors)
    except UnicodeDecodeError as e:
        if not isinstance(s, Exception):
            raise UnicodeDecodeError(*e.args)
        else:
            # If we get to here, the caller has passed in an Exception
            # subclass populated with non-ASCII bytestring data without a
            # working unicode method. Try to handle this without raising a
            # further exception by individually forcing the exception args
            # to unicode.
            s = ' '.join([force_unicode(arg, encoding, strings_only,
                         errors) for arg in s])
    return s
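
A tiny usage sketch (not part of the upload) of force_unicode; the byte string is assumed to be UTF-8.

    from tensorboard._vendor.bleach.encoding import force_unicode

    force_unicode(u'café')         # text passes through unchanged
    force_unicode(b'caf\xc3\xa9')  # bytes are decoded with the given encoding (utf-8 by default)
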
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/linkifier.py
ADDED
@@ -0,0 +1,526 @@
from __future__ import unicode_literals
import re

from tensorboard._vendor import html5lib
from tensorboard._vendor.html5lib.filters.base import Filter
from tensorboard._vendor.html5lib.filters.sanitizer import allowed_protocols
from tensorboard._vendor.html5lib.serializer import HTMLSerializer

from tensorboard._vendor.bleach import callbacks as linkify_callbacks
from tensorboard._vendor.bleach.encoding import force_unicode
from tensorboard._vendor.bleach.utils import alphabetize_attributes


#: List of default callbacks
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]


TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()

# Make sure that .com doesn't get matched by .co first
TLDS.reverse()


def build_url_re(tlds=TLDS, protocols=allowed_protocols):
    """Builds the url regex used by linkifier

    If you want a different set of tlds or allowed protocols, pass those in
    and stomp on the existing ``url_re``::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = LinkifyFilter(url_re=my_url_re)

    """
    return re.compile(
        r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 1738,
            # except for # and ~, which happen in practice)
        """.format('|'.join(protocols), '|'.join(tlds)),
        re.IGNORECASE | re.VERBOSE | re.UNICODE)


URL_RE = build_url_re()


PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)


EMAIL_RE = re.compile(
    r"""(?<!//)
    (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+
        (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*  # dot-atom
    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
        |\\[\001-\011\013\014\016-\177])*"  # quoted-string
    )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})  # domain
    """,
    re.IGNORECASE | re.MULTILINE | re.VERBOSE)


class Linker(object):
    """Convert URL-like strings in an HTML fragment to links

    This function converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify does a best-effort approach and tries to recover from bad
    situations due to crazy text.

    """
    def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False,
                 url_re=URL_RE, email_re=EMAIL_RE):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :returns: linkified text as unicode

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
        self.walker = html5lib.getTreeWalker('etree')
        self.serializer = HTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,

            # linkify does not sanitize
            sanitize=False,

            # linkify alphabetizes
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        """
        text = force_unicode(text)

        if not text:
            return u''

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)


class LinkifyFilter(Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links
    * convert urls into links
    * edit existing links by running them through callbacks--the default is to
      add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """
    def __init__(self, source, callbacks=None, skip_tags=None, parse_email=False,
                 url_re=URL_RE, email_re=EMAIL_RE):
        """Creates a LinkifyFilter instance

        :arg TreeWalker source: stream

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        """
        super(LinkifyFilter, self).__init__(source)

        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or []
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. In the case of
        ``None``, we stop going through callbacks and return that and the link
        gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs

    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream"""
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatability with previous versions
        # of Bleach.

        out = []
        for token in token_list:
            token_type = token['type']
            if token_type in ['Characters', 'SpaceCharacters']:
                out.append(token['data'])

        return u''.join(out)

    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': text[end:match.start()]}
                        )

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, u'href'): u'mailto:%s' % match.group(0),
                        u'_text': match.group(0)
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {u'type': u'Characters', u'data': match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop(u'_text', '')
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend([
                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                            {u'type': u'Characters', u'data': force_unicode(_text)},
                            {u'type': u'EndTag', u'name': 'a'}
                        ])
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        """
        prefix = suffix = ''

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith(u'('):
                prefix = prefix + u'('
                fragment = fragment[1:]

                if fragment.endswith(u')'):
                    suffix = u')' + suffix
                    fragment = fragment[:-1]
                continue

            # Now try extraneous things from the end. For example, sometimes we
            # pick up ) at the end of a url, but the url is in a parenthesized
            # phrase like:
            #
            #     "i looked at the site (at http://example.com)"

            if fragment.endswith(u')') and u'(' not in fragment:
                fragment = fragment[:-1]
                suffix = u')' + suffix
                continue

            # Handle commas
            if fragment.endswith(u','):
                fragment = fragment[:-1]
                suffix = u',' + suffix
                continue

            # Handle periods
            if fragment.endswith(u'.'):
                fragment = fragment[:-1]
                suffix = u'.' + suffix
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix

    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        for token in src_iter:
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': text[end:match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ''

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = u'http://%s' % url

                    attrs = {
                        (None, u'href'): href,
                        u'_text': url
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {u'type': u'Characters', u'data': prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append(
                                {u'type': u'Characters', u'data': prefix}
                            )

                        _text = attrs.pop(u'_text', '')
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend([
                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                            {u'type': u'Characters', u'data': force_unicode(_text)},
                            {u'type': u'EndTag', u'name': 'a'},
                        ])

                        if suffix:
                            new_tokens.append(
                                {u'type': u'Characters', u'data': suffix}
                            )

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token['data']:
            attrs = a_token['data']
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs['_text'] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {'type': 'Characters', 'data': text}

        else:
            new_text = attrs.pop('_text', '')
            a_token['data'] = alphabetize_attributes(attrs)

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                for mem in token_buffer[1:]:
                    yield mem

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # it with the new text
                yield a_token
                yield {'type': 'Characters', 'data': force_unicode(new_text)}
                yield token_buffer[-1]

    def __iter__(self):
        in_a = False
        in_skip_tag = None

        token_buffer = []

        for token in super(LinkifyFilter, self).__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token['type'] == 'EndTag' and token['name'] == 'a':
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    for new_token in self.handle_a_tag(token_buffer):
                        yield new_token

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                    continue

                else:
                    token_buffer.append(token)
                    continue

            elif token['type'] in ['StartTag', 'EmptyTag']:
                if token['name'] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag.
                    in_skip_tag = token['name']

                elif token['name'] == 'a':
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag and self.skip_tags:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token['type'] == 'EndTag' and token['name'] == in_skip_tag:
                    in_skip_tag = None

            elif not in_a and not in_skip_tag and token['type'] == 'Characters':
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for token in new_stream:
                    yield token

                # We've already yielded this token, so continue
                continue

            yield token
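
A sketch (not part of the upload) of the customization path the build_url_re docstring describes: build a regex over a restricted, hypothetical TLD and protocol set and hand it to a Linker.

    from tensorboard._vendor.bleach import linkifier

    # Hypothetical restricted sets, for illustration only.
    my_tlds = ['com', 'org']
    my_protocols = ['https']

    my_url_re = linkifier.build_url_re(tlds=my_tlds, protocols=my_protocols)
    linker = linkifier.Linker(url_re=my_url_re, parse_email=True)

    html = linker.linkify('reach us at https://example.org or support@example.com')
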
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/sanitizer.py
ADDED
@@ -0,0 +1,368 @@
from __future__ import unicode_literals
import re
from xml.sax.saxutils import unescape

from tensorboard._vendor import html5lib
from tensorboard._vendor.html5lib.constants import namespaces
from tensorboard._vendor.html5lib.filters import sanitizer
from tensorboard._vendor.html5lib.serializer import HTMLSerializer

from tensorboard._vendor.bleach.encoding import force_unicode
from tensorboard._vendor.bleach.utils import alphabetize_attributes


#: List of allowed tags
ALLOWED_TAGS = [
    'a',
    'abbr',
    'acronym',
    'b',
    'blockquote',
    'code',
    'em',
    'i',
    'li',
    'ol',
    'strong',
    'ul',
]


#: Map of allowed attributes by tag
ALLOWED_ATTRIBUTES = {
    'a': ['href', 'title'],
    'abbr': ['title'],
    'acronym': ['title'],
}


#: List of allowed styles
ALLOWED_STYLES = []


#: List of allowed protocols
ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']


class Cleaner(object):
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    This cleaner is not designed to use to transform content to be used in
    non-web-page contexts.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    """

    def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
                 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
                 strip_comments=True, filters=None):
        """Initializes a Cleaner

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output are secure.

        """
        self.tags = tags
        self.attributes = attributes
        self.styles = styles
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []

        self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
        self.walker = html5lib.getTreeWalker('etree')
        self.serializer = HTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,

            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,

            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
            alphabetical_attributes=False,
        )

    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        """
        if not text:
            return u''

        text = force_unicode(text)

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),

            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,

            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)


def attribute_filter_factory(attributes):
    """Generates attribute filter function for the given attributes value

    The attributes value can take one of several shapes. This returns a filter
    function appropriate to the attributes value. One nice thing about this is
    that there's less if/then shenanigans in the ``allow_token`` method.

    """
    if callable(attributes):
        return attributes

    if isinstance(attributes, dict):
        def _attr_filter(tag, attr, value):
            if tag in attributes:
                attr_val = attributes[tag]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                if attr in attr_val:
                    return True

            if '*' in attributes:
                attr_val = attributes['*']
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                return attr in attr_val

            return False

        return _attr_filter

    if isinstance(attributes, list):
        def _attr_filter(tag, attr, value):
            return attr in attributes

        return _attr_filter

    raise ValueError('attributes needs to be a callable, a list or a dict')


class BleachSanitizerFilter(sanitizer.Filter):
    """html5lib Filter that sanitizes text

    This filter can be used anywhere html5lib filters can be used.

    """
    def __init__(self, source, attributes=ALLOWED_ATTRIBUTES,
                 strip_disallowed_elements=False, strip_html_comments=True,
                 **kwargs):
        """Creates a BleachSanitizerFilter instance

        :arg Treewalker source: stream

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip_disallowed_elements: whether or not to strip disallowed
            elements

        :arg bool strip_html_comments: whether or not to strip HTML comments

        """
        self.attr_filter = attribute_filter_factory(attributes)

        self.strip_disallowed_elements = strip_disallowed_elements
        self.strip_html_comments = strip_html_comments

        return super(BleachSanitizerFilter, self).__init__(source, **kwargs)

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function with two arguments of attribute name and
        value. It should return true of false.

        Also gives the option to strip tags instead of encoding.

        """
        token_type = token['type']
        if token_type in ['StartTag', 'EndTag', 'EmptyTag']:
            if token['name'] in self.allowed_elements:
                return self.allow_token(token)

            elif self.strip_disallowed_elements:
                pass

            else:
                if 'data' in token:
                    # Alphabetize the attributes before calling .disallowed_token()
                    # so that the resulting string is stable
                    token['data'] = alphabetize_attributes(token['data'])
                return self.disallowed_token(token)

        elif token_type == 'Comment':
            if not self.strip_html_comments:
                return token

        else:
            return token

    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if 'data' in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token['data'].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token['name'], name, val):
                    continue

                # Look at attributes that have uri values
                if namespaced_name in self.attr_val_is_uri:
                    val_unescaped = re.sub(
                        "[`\000-\040\177-\240\s]+",
                        '',
                        unescape(val)).lower()

                    # Remove replacement characters from unescaped characters.
                    val_unescaped = val_unescaped.replace("\ufffd", "")

                    # Drop attributes with uri values that have protocols that
                    # aren't allowed
                    if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and
                            (val_unescaped.split(':')[0] not in self.allowed_protocols)):
                        continue

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's a iri
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token['name']) in self.svg_allow_local_href:
                    if namespaced_name in [(None, 'href'), (namespaces['xlink'], 'href')]:
                        if re.search(r'^\s*[^#\s]', val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, u'style'):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token['data'] = alphabetize_attributes(attrs)

        return token

    def sanitize_css(self, style):
        """Sanitizes css in style tags"""
        # disallow urls
        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        parts = style.split(';')
        gauntlet = re.compile(
            r"""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$"""
        )

        for part in parts:
            if not gauntlet.match(part):
                return ''

        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue

            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
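
A sketch (not part of the upload) of Cleaner reuse and of the dict-with-callable attributes form that attribute_filter_factory accepts; the allow-list values are illustrative.

    from tensorboard._vendor.bleach.sanitizer import Cleaner

    def allow_safe_img_src(tag, attr, value):
        # Illustrative per-tag callable filter: keep only https image sources.
        return attr == 'src' and value.startswith('https://')

    cleaner = Cleaner(
        tags=['a', 'img', 'p'],
        attributes={'a': ['href', 'title'], 'img': allow_safe_img_src},
        strip=True,
    )

    for fragment in ['<p onclick="x()">hi</p>', '<img src="http://evil/x.png">']:
        print(cleaner.clean(fragment))
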
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/utils.py
ADDED
@@ -0,0 +1,23 @@
from collections import OrderedDict


def _attr_key(attr):
    """Returns appropriate key for sorting attribute names

    Attribute names are a tuple of ``(namespace, name)`` where namespace can be
    ``None`` or a string. These can't be compared in Python 3, so we convert the
    ``None`` to an empty string.

    """
    key = (attr[0][0] or ''), attr[0][1]
    return key


def alphabetize_attributes(attrs):
    """Takes a dict of attributes (or None) and returns them alphabetized"""
    if not attrs:
        return attrs

    return OrderedDict(
        [(k, v) for k, v in sorted(attrs.items(), key=_attr_key)]
    )
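
A quick illustration of the helper above (the import path mirrors the vendored location in this diff): attribute keys are `(namespace, name)` tuples, exactly as in the sanitizer code, and `None` namespaces sort as empty strings.

from collections import OrderedDict  # the return type used by the module above

from tensorboard._vendor.bleach.utils import alphabetize_attributes

attrs = {(None, 'style'): 'color: red;', (None, 'alt'): 'logo'}
ordered = alphabetize_attributes(attrs)
print(isinstance(ordered, OrderedDict), list(ordered))
# True [(None, 'alt'), (None, 'style')]
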
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/version.py
ADDED
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

VERSION = (2, 0, 0)
__version__ = '.'.join([str(n) for n in VERSION])
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__init__.py
ADDED
@@ -0,0 +1,35 @@
"""
HTML parsing library based on the `WHATWG HTML specification
<https://whatwg.org/html>`_. The parser is designed to be compatible with
existing HTML found in the wild and implements well-defined error recovery that
is largely compatible with modern desktop web browsers.

Example usage::

    import html5lib
    with open("my_document.html", "rb") as f:
        tree = html5lib.parse(f)

For convenience, this module re-exports the following names:

* :func:`~.html5parser.parse`
* :func:`~.html5parser.parseFragment`
* :class:`~.html5parser.HTMLParser`
* :func:`~.treebuilders.getTreeBuilder`
* :func:`~.treewalkers.getTreeWalker`
* :func:`~.serializer.serialize`
"""

from __future__ import absolute_import, division, unicode_literals

from .html5parser import HTMLParser, parse, parseFragment
from .treebuilders import getTreeBuilder
from .treewalkers import getTreeWalker
from .serializer import serialize

__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
           "getTreeWalker", "serialize"]

# this has to be at the top level, see how setup.py parses this
#: Distribution version number.
__version__ = "1.1"
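
A hedged usage sketch of the re-exported names above, going one step beyond the docstring's parse example. The `tensorboard._vendor.html5lib` alias follows this diff's layout; `tree="etree"` is assumed to name the default etree-based tree family used by `parse()`.

import tensorboard._vendor.html5lib as html5lib

# parse() recovers from the unclosed tags instead of raising.
tree = html5lib.parse("<p>Hello <b>world")

# serialize() looks up the matching tree walker for the "etree" family and
# emits HTML text again.
print(html5lib.serialize(tree, tree="etree"))
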
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (1.26 kB).

MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/_ihatexml.cpython-39.pyc
ADDED
Binary file (13.7 kB).

MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/_inputstream.cpython-39.pyc
ADDED
Binary file (21.6 kB).

MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/_tokenizer.cpython-39.pyc
ADDED
Binary file (39.7 kB).

MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/_utils.cpython-39.pyc
ADDED
Binary file (4.76 kB).

MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/constants.cpython-39.pyc
ADDED
Binary file (66.3 kB).

MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/html5parser.cpython-39.pyc
ADDED
Binary file (91 kB).

MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/serializer.cpython-39.pyc
ADDED
Binary file (10.8 kB).

MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_ihatexml.py
ADDED
@@ -0,0 +1,289 @@
from __future__ import absolute_import, division, unicode_literals

import re
import warnings

from .constants import DataLossWarning

baseChar = """
[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
[#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
[#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
[#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
[#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
[#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
[#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
[#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
[#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
[#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
[#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
[#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
[#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
[#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
[#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
[#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
[#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
[#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
[#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
[#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
[#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
[#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
[#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
[#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
[#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
[#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
[#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
[#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
[#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
[#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
#x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
#x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
#x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
[#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
[#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
#x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
[#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
[#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
[#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
[#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
[#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
#x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
[#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
[#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
[#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
[#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""

ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""

combiningCharacter = """
[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
[#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
[#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
[#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
#x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
[#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
[#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
#x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
[#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
[#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
#x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
[#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
[#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
[#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
[#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
[#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
#x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
[#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
#x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
[#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
[#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
#x3099 | #x309A"""

digit = """
[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
[#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
[#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
[#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""

extender = """
#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
#[#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""

letter = " | ".join([baseChar, ideographic])

# Without the
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
                   extender])
nameFirst = " | ".join([letter, "_"])

reChar = re.compile(r"#x([\d|A-F]{4,4})")
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")


def charStringToList(chars):
    charRanges = [item.strip() for item in chars.split(" | ")]
    rv = []
    for item in charRanges:
        foundMatch = False
        for regexp in (reChar, reCharRange):
            match = regexp.match(item)
            if match is not None:
                rv.append([hexToInt(item) for item in match.groups()])
                if len(rv[-1]) == 1:
                    rv[-1] = rv[-1] * 2
                foundMatch = True
                break
        if not foundMatch:
            assert len(item) == 1

            rv.append([ord(item)] * 2)
    rv = normaliseCharList(rv)
    return rv


def normaliseCharList(charList):
    charList = sorted(charList)
    for item in charList:
        assert item[1] >= item[0]
    rv = []
    i = 0
    while i < len(charList):
        j = 1
        rv.append(charList[i])
        while i + j < len(charList) and charList[i + j][0] <= rv[-1][1] + 1:
            rv[-1][1] = charList[i + j][1]
            j += 1
        i += j
    return rv


# We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)


def missingRanges(charList):
    rv = []
    if charList[0] != 0:
        rv.append([0, charList[0][0] - 1])
    for i, item in enumerate(charList[:-1]):
        rv.append([item[1] + 1, charList[i + 1][0] - 1])
    if charList[-1][1] != max_unicode:
        rv.append([charList[-1][1] + 1, max_unicode])
    return rv


def listToRegexpStr(charList):
    rv = []
    for item in charList:
        if item[0] == item[1]:
            rv.append(escapeRegexp(chr(item[0])))
        else:
            rv.append(escapeRegexp(chr(item[0])) + "-" +
                      escapeRegexp(chr(item[1])))
    return "[%s]" % "".join(rv)


def hexToInt(hex_str):
    return int(hex_str, 16)


def escapeRegexp(string):
    specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
                         "[", "]", "|", "(", ")", "-")
    for char in specialCharacters:
        string = string.replace(char, "\\" + char)

    return string

# output from the above
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa

# Simpler things
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")


class InfosetFilter(object):
    replacementRegexp = re.compile(r"U[\dA-F]{5,5}")

    def __init__(self,
                 dropXmlnsLocalName=False,
                 dropXmlnsAttrNs=False,
                 preventDoubleDashComments=False,
                 preventDashAtCommentEnd=False,
                 replaceFormFeedCharacters=True,
                 preventSingleQuotePubid=False):

        self.dropXmlnsLocalName = dropXmlnsLocalName
        self.dropXmlnsAttrNs = dropXmlnsAttrNs

        self.preventDoubleDashComments = preventDoubleDashComments
        self.preventDashAtCommentEnd = preventDashAtCommentEnd

        self.replaceFormFeedCharacters = replaceFormFeedCharacters

        self.preventSingleQuotePubid = preventSingleQuotePubid

        self.replaceCache = {}

    def coerceAttribute(self, name, namespace=None):
        if self.dropXmlnsLocalName and name.startswith("xmlns:"):
            warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
            return None
        elif (self.dropXmlnsAttrNs and
              namespace == "http://www.w3.org/2000/xmlns/"):
            warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
            return None
        else:
            return self.toXmlName(name)

    def coerceElement(self, name):
        return self.toXmlName(name)

    def coerceComment(self, data):
        if self.preventDoubleDashComments:
            while "--" in data:
                warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
                data = data.replace("--", "- -")
            if data.endswith("-"):
                warnings.warn("Comments cannot end in a dash", DataLossWarning)
                data += " "
        return data

    def coerceCharacters(self, data):
        if self.replaceFormFeedCharacters:
            for _ in range(data.count("\x0C")):
                warnings.warn("Text cannot contain U+000C", DataLossWarning)
            data = data.replace("\x0C", " ")
        # Other non-xml characters
        return data

    def coercePubid(self, data):
        dataOutput = data
        for char in nonPubidCharRegexp.findall(data):
            warnings.warn("Coercing non-XML pubid", DataLossWarning)
            replacement = self.getReplacementCharacter(char)
            dataOutput = dataOutput.replace(char, replacement)
        if self.preventSingleQuotePubid and dataOutput.find("'") >= 0:
            warnings.warn("Pubid cannot contain single quote", DataLossWarning)
            dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
        return dataOutput

    def toXmlName(self, name):
        nameFirst = name[0]
        nameRest = name[1:]
        m = nonXmlNameFirstBMPRegexp.match(nameFirst)
        if m:
            warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
            nameFirstOutput = self.getReplacementCharacter(nameFirst)
        else:
            nameFirstOutput = nameFirst

        nameRestOutput = nameRest
        replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
        for char in replaceChars:
            warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
            replacement = self.getReplacementCharacter(char)
            nameRestOutput = nameRestOutput.replace(char, replacement)
        return nameFirstOutput + nameRestOutput

    def getReplacementCharacter(self, char):
        if char in self.replaceCache:
            replacement = self.replaceCache[char]
        else:
            replacement = self.escapeChar(char)
        return replacement

    def fromXmlName(self, name):
        for item in set(self.replacementRegexp.findall(name)):
            name = name.replace(item, self.unescapeChar(item))
        return name

    def escapeChar(self, char):
        replacement = "U%05X" % ord(char)
        self.replaceCache[char] = replacement
        return replacement

    def unescapeChar(self, charcode):
        return chr(int(charcode[1:], 16))
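
A brief sketch of the coercion behaviour defined above (the import path mirrors the vendored location in this diff): characters that are not legal in XML names are replaced with reversible `UXXXXX` escapes, with a DataLossWarning emitted along the way.

import warnings

from tensorboard._vendor.html5lib._ihatexml import InfosetFilter

f = InfosetFilter()
with warnings.catch_warnings():
    warnings.simplefilter("ignore")      # silence the DataLossWarning for this demo
    coerced = f.toXmlName("1bad-name")   # a leading digit cannot start an XML name
print(coerced)                 # U00031bad-name
print(f.fromXmlName(coerced))  # 1bad-name  (the escape round-trips back)
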
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_inputstream.py
ADDED
@@ -0,0 +1,918 @@
1 |
+
from __future__ import absolute_import, division, unicode_literals
|
2 |
+
|
3 |
+
from six import text_type
|
4 |
+
from six.moves import http_client, urllib
|
5 |
+
|
6 |
+
import codecs
|
7 |
+
import re
|
8 |
+
from io import BytesIO, StringIO
|
9 |
+
|
10 |
+
from tensorboard._vendor import webencodings
|
11 |
+
|
12 |
+
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
|
13 |
+
from .constants import _ReparseException
|
14 |
+
from . import _utils
|
15 |
+
|
16 |
+
# Non-unicode versions of constants for use in the pre-parser
|
17 |
+
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
|
18 |
+
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
|
19 |
+
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
|
20 |
+
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
|
21 |
+
|
22 |
+
|
23 |
+
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa
|
24 |
+
|
25 |
+
if _utils.supports_lone_surrogates:
|
26 |
+
# Use one extra step of indirection and create surrogates with
|
27 |
+
# eval. Not using this indirection would introduce an illegal
|
28 |
+
# unicode literal on platforms not supporting such lone
|
29 |
+
# surrogates.
|
30 |
+
assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
|
31 |
+
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
|
32 |
+
eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
|
33 |
+
"]")
|
34 |
+
else:
|
35 |
+
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
|
36 |
+
|
37 |
+
non_bmp_invalid_codepoints = {0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
|
38 |
+
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
|
39 |
+
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
|
40 |
+
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
|
41 |
+
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
|
42 |
+
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
|
43 |
+
0x10FFFE, 0x10FFFF}
|
44 |
+
|
45 |
+
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")
|
46 |
+
|
47 |
+
# Cache for charsUntil()
|
48 |
+
charsUntilRegEx = {}
|
49 |
+
|
50 |
+
|
51 |
+
class BufferedStream(object):
|
52 |
+
"""Buffering for streams that do not have buffering of their own
|
53 |
+
|
54 |
+
The buffer is implemented as a list of chunks on the assumption that
|
55 |
+
joining many strings will be slow since it is O(n**2)
|
56 |
+
"""
|
57 |
+
|
58 |
+
def __init__(self, stream):
|
59 |
+
self.stream = stream
|
60 |
+
self.buffer = []
|
61 |
+
self.position = [-1, 0] # chunk number, offset
|
62 |
+
|
63 |
+
def tell(self):
|
64 |
+
pos = 0
|
65 |
+
for chunk in self.buffer[:self.position[0]]:
|
66 |
+
pos += len(chunk)
|
67 |
+
pos += self.position[1]
|
68 |
+
return pos
|
69 |
+
|
70 |
+
def seek(self, pos):
|
71 |
+
assert pos <= self._bufferedBytes()
|
72 |
+
offset = pos
|
73 |
+
i = 0
|
74 |
+
while len(self.buffer[i]) < offset:
|
75 |
+
offset -= len(self.buffer[i])
|
76 |
+
i += 1
|
77 |
+
self.position = [i, offset]
|
78 |
+
|
79 |
+
def read(self, bytes):
|
80 |
+
if not self.buffer:
|
81 |
+
return self._readStream(bytes)
|
82 |
+
elif (self.position[0] == len(self.buffer) and
|
83 |
+
self.position[1] == len(self.buffer[-1])):
|
84 |
+
return self._readStream(bytes)
|
85 |
+
else:
|
86 |
+
return self._readFromBuffer(bytes)
|
87 |
+
|
88 |
+
def _bufferedBytes(self):
|
89 |
+
return sum([len(item) for item in self.buffer])
|
90 |
+
|
91 |
+
def _readStream(self, bytes):
|
92 |
+
data = self.stream.read(bytes)
|
93 |
+
self.buffer.append(data)
|
94 |
+
self.position[0] += 1
|
95 |
+
self.position[1] = len(data)
|
96 |
+
return data
|
97 |
+
|
98 |
+
def _readFromBuffer(self, bytes):
|
99 |
+
remainingBytes = bytes
|
100 |
+
rv = []
|
101 |
+
bufferIndex = self.position[0]
|
102 |
+
bufferOffset = self.position[1]
|
103 |
+
while bufferIndex < len(self.buffer) and remainingBytes != 0:
|
104 |
+
assert remainingBytes > 0
|
105 |
+
bufferedData = self.buffer[bufferIndex]
|
106 |
+
|
107 |
+
if remainingBytes <= len(bufferedData) - bufferOffset:
|
108 |
+
bytesToRead = remainingBytes
|
109 |
+
self.position = [bufferIndex, bufferOffset + bytesToRead]
|
110 |
+
else:
|
111 |
+
bytesToRead = len(bufferedData) - bufferOffset
|
112 |
+
self.position = [bufferIndex, len(bufferedData)]
|
113 |
+
bufferIndex += 1
|
114 |
+
rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
|
115 |
+
remainingBytes -= bytesToRead
|
116 |
+
|
117 |
+
bufferOffset = 0
|
118 |
+
|
119 |
+
if remainingBytes:
|
120 |
+
rv.append(self._readStream(remainingBytes))
|
121 |
+
|
122 |
+
return b"".join(rv)
|
123 |
+
|
124 |
+
|
125 |
+
def HTMLInputStream(source, **kwargs):
|
126 |
+
# Work around Python bug #20007: read(0) closes the connection.
|
127 |
+
# http://bugs.python.org/issue20007
|
128 |
+
if (isinstance(source, http_client.HTTPResponse) or
|
129 |
+
# Also check for addinfourl wrapping HTTPResponse
|
130 |
+
(isinstance(source, urllib.response.addbase) and
|
131 |
+
isinstance(source.fp, http_client.HTTPResponse))):
|
132 |
+
isUnicode = False
|
133 |
+
elif hasattr(source, "read"):
|
134 |
+
isUnicode = isinstance(source.read(0), text_type)
|
135 |
+
else:
|
136 |
+
isUnicode = isinstance(source, text_type)
|
137 |
+
|
138 |
+
if isUnicode:
|
139 |
+
encodings = [x for x in kwargs if x.endswith("_encoding")]
|
140 |
+
if encodings:
|
141 |
+
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
|
142 |
+
|
143 |
+
return HTMLUnicodeInputStream(source, **kwargs)
|
144 |
+
else:
|
145 |
+
return HTMLBinaryInputStream(source, **kwargs)
|
146 |
+
|
147 |
+
|
148 |
+
class HTMLUnicodeInputStream(object):
|
149 |
+
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
150 |
+
|
151 |
+
This class takes care of character encoding and removing or replacing
|
152 |
+
incorrect byte-sequences and also provides column and line tracking.
|
153 |
+
|
154 |
+
"""
|
155 |
+
|
156 |
+
_defaultChunkSize = 10240
|
157 |
+
|
158 |
+
def __init__(self, source):
|
159 |
+
"""Initialises the HTMLInputStream.
|
160 |
+
|
161 |
+
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
162 |
+
for use by html5lib.
|
163 |
+
|
164 |
+
source can be either a file-object, local filename or a string.
|
165 |
+
|
166 |
+
The optional encoding parameter must be a string that indicates
|
167 |
+
the encoding. If specified, that encoding will be used,
|
168 |
+
regardless of any BOM or later declaration (such as in a meta
|
169 |
+
element)
|
170 |
+
|
171 |
+
"""
|
172 |
+
|
173 |
+
if not _utils.supports_lone_surrogates:
|
174 |
+
# Such platforms will have already checked for such
|
175 |
+
# surrogate errors, so no need to do this checking.
|
176 |
+
self.reportCharacterErrors = None
|
177 |
+
elif len("\U0010FFFF") == 1:
|
178 |
+
self.reportCharacterErrors = self.characterErrorsUCS4
|
179 |
+
else:
|
180 |
+
self.reportCharacterErrors = self.characterErrorsUCS2
|
181 |
+
|
182 |
+
# List of where new lines occur
|
183 |
+
self.newLines = [0]
|
184 |
+
|
185 |
+
self.charEncoding = (lookupEncoding("utf-8"), "certain")
|
186 |
+
self.dataStream = self.openStream(source)
|
187 |
+
|
188 |
+
self.reset()
|
189 |
+
|
190 |
+
def reset(self):
|
191 |
+
self.chunk = ""
|
192 |
+
self.chunkSize = 0
|
193 |
+
self.chunkOffset = 0
|
194 |
+
self.errors = []
|
195 |
+
|
196 |
+
# number of (complete) lines in previous chunks
|
197 |
+
self.prevNumLines = 0
|
198 |
+
# number of columns in the last line of the previous chunk
|
199 |
+
self.prevNumCols = 0
|
200 |
+
|
201 |
+
# Deal with CR LF and surrogates split over chunk boundaries
|
202 |
+
self._bufferedCharacter = None
|
203 |
+
|
204 |
+
def openStream(self, source):
|
205 |
+
"""Produces a file object from source.
|
206 |
+
|
207 |
+
source can be either a file object, local filename or a string.
|
208 |
+
|
209 |
+
"""
|
210 |
+
# Already a file object
|
211 |
+
if hasattr(source, 'read'):
|
212 |
+
stream = source
|
213 |
+
else:
|
214 |
+
stream = StringIO(source)
|
215 |
+
|
216 |
+
return stream
|
217 |
+
|
218 |
+
def _position(self, offset):
|
219 |
+
chunk = self.chunk
|
220 |
+
nLines = chunk.count('\n', 0, offset)
|
221 |
+
positionLine = self.prevNumLines + nLines
|
222 |
+
lastLinePos = chunk.rfind('\n', 0, offset)
|
223 |
+
if lastLinePos == -1:
|
224 |
+
positionColumn = self.prevNumCols + offset
|
225 |
+
else:
|
226 |
+
positionColumn = offset - (lastLinePos + 1)
|
227 |
+
return (positionLine, positionColumn)
|
228 |
+
|
229 |
+
def position(self):
|
230 |
+
"""Returns (line, col) of the current position in the stream."""
|
231 |
+
line, col = self._position(self.chunkOffset)
|
232 |
+
return (line + 1, col)
|
233 |
+
|
234 |
+
def char(self):
|
235 |
+
""" Read one character from the stream or queue if available. Return
|
236 |
+
EOF when EOF is reached.
|
237 |
+
"""
|
238 |
+
# Read a new chunk from the input stream if necessary
|
239 |
+
if self.chunkOffset >= self.chunkSize:
|
240 |
+
if not self.readChunk():
|
241 |
+
return EOF
|
242 |
+
|
243 |
+
chunkOffset = self.chunkOffset
|
244 |
+
char = self.chunk[chunkOffset]
|
245 |
+
self.chunkOffset = chunkOffset + 1
|
246 |
+
|
247 |
+
return char
|
248 |
+
|
249 |
+
def readChunk(self, chunkSize=None):
|
250 |
+
if chunkSize is None:
|
251 |
+
chunkSize = self._defaultChunkSize
|
252 |
+
|
253 |
+
self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
|
254 |
+
|
255 |
+
self.chunk = ""
|
256 |
+
self.chunkSize = 0
|
257 |
+
self.chunkOffset = 0
|
258 |
+
|
259 |
+
data = self.dataStream.read(chunkSize)
|
260 |
+
|
261 |
+
# Deal with CR LF and surrogates broken across chunks
|
262 |
+
if self._bufferedCharacter:
|
263 |
+
data = self._bufferedCharacter + data
|
264 |
+
self._bufferedCharacter = None
|
265 |
+
elif not data:
|
266 |
+
# We have no more data, bye-bye stream
|
267 |
+
return False
|
268 |
+
|
269 |
+
if len(data) > 1:
|
270 |
+
lastv = ord(data[-1])
|
271 |
+
if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
|
272 |
+
self._bufferedCharacter = data[-1]
|
273 |
+
data = data[:-1]
|
274 |
+
|
275 |
+
if self.reportCharacterErrors:
|
276 |
+
self.reportCharacterErrors(data)
|
277 |
+
|
278 |
+
# Replace invalid characters
|
279 |
+
data = data.replace("\r\n", "\n")
|
280 |
+
data = data.replace("\r", "\n")
|
281 |
+
|
282 |
+
self.chunk = data
|
283 |
+
self.chunkSize = len(data)
|
284 |
+
|
285 |
+
return True
|
286 |
+
|
287 |
+
def characterErrorsUCS4(self, data):
|
288 |
+
for _ in range(len(invalid_unicode_re.findall(data))):
|
289 |
+
self.errors.append("invalid-codepoint")
|
290 |
+
|
291 |
+
def characterErrorsUCS2(self, data):
|
292 |
+
# Someone picked the wrong compile option
|
293 |
+
# You lose
|
294 |
+
skip = False
|
295 |
+
for match in invalid_unicode_re.finditer(data):
|
296 |
+
if skip:
|
297 |
+
continue
|
298 |
+
codepoint = ord(match.group())
|
299 |
+
pos = match.start()
|
300 |
+
# Pretty sure there should be endianness issues here
|
301 |
+
if _utils.isSurrogatePair(data[pos:pos + 2]):
|
302 |
+
# We have a surrogate pair!
|
303 |
+
char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
|
304 |
+
if char_val in non_bmp_invalid_codepoints:
|
305 |
+
self.errors.append("invalid-codepoint")
|
306 |
+
skip = True
|
307 |
+
elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
|
308 |
+
pos == len(data) - 1):
|
309 |
+
self.errors.append("invalid-codepoint")
|
310 |
+
else:
|
311 |
+
skip = False
|
312 |
+
self.errors.append("invalid-codepoint")
|
313 |
+
|
314 |
+
def charsUntil(self, characters, opposite=False):
|
315 |
+
""" Returns a string of characters from the stream up to but not
|
316 |
+
including any character in 'characters' or EOF. 'characters' must be
|
317 |
+
a container that supports the 'in' method and iteration over its
|
318 |
+
characters.
|
319 |
+
"""
|
320 |
+
|
321 |
+
# Use a cache of regexps to find the required characters
|
322 |
+
try:
|
323 |
+
chars = charsUntilRegEx[(characters, opposite)]
|
324 |
+
except KeyError:
|
325 |
+
if __debug__:
|
326 |
+
for c in characters:
|
327 |
+
assert(ord(c) < 128)
|
328 |
+
regex = "".join(["\\x%02x" % ord(c) for c in characters])
|
329 |
+
if not opposite:
|
330 |
+
regex = "^%s" % regex
|
331 |
+
chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)
|
332 |
+
|
333 |
+
rv = []
|
334 |
+
|
335 |
+
while True:
|
336 |
+
# Find the longest matching prefix
|
337 |
+
m = chars.match(self.chunk, self.chunkOffset)
|
338 |
+
if m is None:
|
339 |
+
# If nothing matched, and it wasn't because we ran out of chunk,
|
340 |
+
# then stop
|
341 |
+
if self.chunkOffset != self.chunkSize:
|
342 |
+
break
|
343 |
+
else:
|
344 |
+
end = m.end()
|
345 |
+
# If not the whole chunk matched, return everything
|
346 |
+
# up to the part that didn't match
|
347 |
+
if end != self.chunkSize:
|
348 |
+
rv.append(self.chunk[self.chunkOffset:end])
|
349 |
+
self.chunkOffset = end
|
350 |
+
break
|
351 |
+
# If the whole remainder of the chunk matched,
|
352 |
+
# use it all and read the next chunk
|
353 |
+
rv.append(self.chunk[self.chunkOffset:])
|
354 |
+
if not self.readChunk():
|
355 |
+
# Reached EOF
|
356 |
+
break
|
357 |
+
|
358 |
+
r = "".join(rv)
|
359 |
+
return r
|
360 |
+
|
361 |
+
def unget(self, char):
|
362 |
+
# Only one character is allowed to be ungotten at once - it must
|
363 |
+
# be consumed again before any further call to unget
|
364 |
+
if char is not EOF:
|
365 |
+
if self.chunkOffset == 0:
|
366 |
+
# unget is called quite rarely, so it's a good idea to do
|
367 |
+
# more work here if it saves a bit of work in the frequently
|
368 |
+
# called char and charsUntil.
|
369 |
+
# So, just prepend the ungotten character onto the current
|
370 |
+
# chunk:
|
371 |
+
self.chunk = char + self.chunk
|
372 |
+
self.chunkSize += 1
|
373 |
+
else:
|
374 |
+
self.chunkOffset -= 1
|
375 |
+
assert self.chunk[self.chunkOffset] == char
|
376 |
+
|
377 |
+
|
378 |
+
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
|
379 |
+
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
380 |
+
|
381 |
+
This class takes care of character encoding and removing or replacing
|
382 |
+
incorrect byte-sequences and also provides column and line tracking.
|
383 |
+
|
384 |
+
"""
|
385 |
+
|
386 |
+
def __init__(self, source, override_encoding=None, transport_encoding=None,
|
387 |
+
same_origin_parent_encoding=None, likely_encoding=None,
|
388 |
+
default_encoding="windows-1252", useChardet=True):
|
389 |
+
"""Initialises the HTMLInputStream.
|
390 |
+
|
391 |
+
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
392 |
+
for use by html5lib.
|
393 |
+
|
394 |
+
source can be either a file-object, local filename or a string.
|
395 |
+
|
396 |
+
The optional encoding parameter must be a string that indicates
|
397 |
+
the encoding. If specified, that encoding will be used,
|
398 |
+
regardless of any BOM or later declaration (such as in a meta
|
399 |
+
element)
|
400 |
+
|
401 |
+
"""
|
402 |
+
# Raw Stream - for unicode objects this will encode to utf-8 and set
|
403 |
+
# self.charEncoding as appropriate
|
404 |
+
self.rawStream = self.openStream(source)
|
405 |
+
|
406 |
+
HTMLUnicodeInputStream.__init__(self, self.rawStream)
|
407 |
+
|
408 |
+
# Encoding Information
|
409 |
+
# Number of bytes to use when looking for a meta element with
|
410 |
+
# encoding information
|
411 |
+
self.numBytesMeta = 1024
|
412 |
+
# Number of bytes to use when using detecting encoding using chardet
|
413 |
+
self.numBytesChardet = 100
|
414 |
+
# Things from args
|
415 |
+
self.override_encoding = override_encoding
|
416 |
+
self.transport_encoding = transport_encoding
|
417 |
+
self.same_origin_parent_encoding = same_origin_parent_encoding
|
418 |
+
self.likely_encoding = likely_encoding
|
419 |
+
self.default_encoding = default_encoding
|
420 |
+
|
421 |
+
# Determine encoding
|
422 |
+
self.charEncoding = self.determineEncoding(useChardet)
|
423 |
+
assert self.charEncoding[0] is not None
|
424 |
+
|
425 |
+
# Call superclass
|
426 |
+
self.reset()
|
427 |
+
|
428 |
+
def reset(self):
|
429 |
+
self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
|
430 |
+
HTMLUnicodeInputStream.reset(self)
|
431 |
+
|
432 |
+
def openStream(self, source):
|
433 |
+
"""Produces a file object from source.
|
434 |
+
|
435 |
+
source can be either a file object, local filename or a string.
|
436 |
+
|
437 |
+
"""
|
438 |
+
# Already a file object
|
439 |
+
if hasattr(source, 'read'):
|
440 |
+
stream = source
|
441 |
+
else:
|
442 |
+
stream = BytesIO(source)
|
443 |
+
|
444 |
+
try:
|
445 |
+
stream.seek(stream.tell())
|
446 |
+
except Exception:
|
447 |
+
stream = BufferedStream(stream)
|
448 |
+
|
449 |
+
return stream
|
450 |
+
|
451 |
+
def determineEncoding(self, chardet=True):
|
452 |
+
# BOMs take precedence over everything
|
453 |
+
# This will also read past the BOM if present
|
454 |
+
charEncoding = self.detectBOM(), "certain"
|
455 |
+
if charEncoding[0] is not None:
|
456 |
+
return charEncoding
|
457 |
+
|
458 |
+
# If we've been overridden, we've been overridden
|
459 |
+
charEncoding = lookupEncoding(self.override_encoding), "certain"
|
460 |
+
if charEncoding[0] is not None:
|
461 |
+
return charEncoding
|
462 |
+
|
463 |
+
# Now check the transport layer
|
464 |
+
charEncoding = lookupEncoding(self.transport_encoding), "certain"
|
465 |
+
if charEncoding[0] is not None:
|
466 |
+
return charEncoding
|
467 |
+
|
468 |
+
# Look for meta elements with encoding information
|
469 |
+
charEncoding = self.detectEncodingMeta(), "tentative"
|
470 |
+
if charEncoding[0] is not None:
|
471 |
+
return charEncoding
|
472 |
+
|
473 |
+
# Parent document encoding
|
474 |
+
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
|
475 |
+
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
|
476 |
+
return charEncoding
|
477 |
+
|
478 |
+
# "likely" encoding
|
479 |
+
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
|
480 |
+
if charEncoding[0] is not None:
|
481 |
+
return charEncoding
|
482 |
+
|
483 |
+
# Guess with chardet, if available
|
484 |
+
if chardet:
|
485 |
+
try:
|
486 |
+
from chardet.universaldetector import UniversalDetector
|
487 |
+
except ImportError:
|
488 |
+
pass
|
489 |
+
else:
|
490 |
+
buffers = []
|
491 |
+
detector = UniversalDetector()
|
492 |
+
while not detector.done:
|
493 |
+
buffer = self.rawStream.read(self.numBytesChardet)
|
494 |
+
assert isinstance(buffer, bytes)
|
495 |
+
if not buffer:
|
496 |
+
break
|
497 |
+
buffers.append(buffer)
|
498 |
+
detector.feed(buffer)
|
499 |
+
detector.close()
|
500 |
+
encoding = lookupEncoding(detector.result['encoding'])
|
501 |
+
self.rawStream.seek(0)
|
502 |
+
if encoding is not None:
|
503 |
+
return encoding, "tentative"
|
504 |
+
|
505 |
+
# Try the default encoding
|
506 |
+
charEncoding = lookupEncoding(self.default_encoding), "tentative"
|
507 |
+
if charEncoding[0] is not None:
|
508 |
+
return charEncoding
|
509 |
+
|
510 |
+
# Fallback to html5lib's default if even that hasn't worked
|
511 |
+
return lookupEncoding("windows-1252"), "tentative"
|
512 |
+
|
513 |
+
def changeEncoding(self, newEncoding):
|
514 |
+
assert self.charEncoding[1] != "certain"
|
515 |
+
newEncoding = lookupEncoding(newEncoding)
|
516 |
+
if newEncoding is None:
|
517 |
+
return
|
518 |
+
if newEncoding.name in ("utf-16be", "utf-16le"):
|
519 |
+
newEncoding = lookupEncoding("utf-8")
|
520 |
+
assert newEncoding is not None
|
521 |
+
elif newEncoding == self.charEncoding[0]:
|
522 |
+
self.charEncoding = (self.charEncoding[0], "certain")
|
523 |
+
else:
|
524 |
+
self.rawStream.seek(0)
|
525 |
+
self.charEncoding = (newEncoding, "certain")
|
526 |
+
self.reset()
|
527 |
+
raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
|
528 |
+
|
529 |
+
def detectBOM(self):
|
530 |
+
"""Attempts to detect at BOM at the start of the stream. If
|
531 |
+
an encoding can be determined from the BOM return the name of the
|
532 |
+
encoding otherwise return None"""
|
533 |
+
bomDict = {
|
534 |
+
codecs.BOM_UTF8: 'utf-8',
|
535 |
+
codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
|
536 |
+
codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
|
537 |
+
}
|
538 |
+
|
539 |
+
# Go to beginning of file and read in 4 bytes
|
540 |
+
string = self.rawStream.read(4)
|
541 |
+
assert isinstance(string, bytes)
|
542 |
+
|
543 |
+
# Try detecting the BOM using bytes from the string
|
544 |
+
encoding = bomDict.get(string[:3]) # UTF-8
|
545 |
+
seek = 3
|
546 |
+
if not encoding:
|
547 |
+
# Need to detect UTF-32 before UTF-16
|
548 |
+
encoding = bomDict.get(string) # UTF-32
|
549 |
+
seek = 4
|
550 |
+
if not encoding:
|
551 |
+
encoding = bomDict.get(string[:2]) # UTF-16
|
552 |
+
seek = 2
|
553 |
+
|
554 |
+
# Set the read position past the BOM if one was found, otherwise
|
555 |
+
# set it to the start of the stream
|
556 |
+
if encoding:
|
557 |
+
self.rawStream.seek(seek)
|
558 |
+
return lookupEncoding(encoding)
|
559 |
+
else:
|
560 |
+
self.rawStream.seek(0)
|
561 |
+
return None
|
562 |
+
|
563 |
+
def detectEncodingMeta(self):
|
564 |
+
"""Report the encoding declared by the meta element
|
565 |
+
"""
|
566 |
+
buffer = self.rawStream.read(self.numBytesMeta)
|
567 |
+
assert isinstance(buffer, bytes)
|
568 |
+
parser = EncodingParser(buffer)
|
569 |
+
self.rawStream.seek(0)
|
570 |
+
encoding = parser.getEncoding()
|
571 |
+
|
572 |
+
if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
|
573 |
+
encoding = lookupEncoding("utf-8")
|
574 |
+
|
575 |
+
return encoding
|
576 |
+
|
577 |
+
|
578 |
+
class EncodingBytes(bytes):
|
579 |
+
"""String-like object with an associated position and various extra methods
|
580 |
+
If the position is ever greater than the string length then an exception is
|
581 |
+
raised"""
|
582 |
+
def __new__(self, value):
|
583 |
+
assert isinstance(value, bytes)
|
584 |
+
return bytes.__new__(self, value.lower())
|
585 |
+
|
586 |
+
def __init__(self, value):
|
587 |
+
# pylint:disable=unused-argument
|
588 |
+
self._position = -1
|
589 |
+
|
590 |
+
def __iter__(self):
|
591 |
+
return self
|
592 |
+
|
593 |
+
def __next__(self):
|
594 |
+
p = self._position = self._position + 1
|
595 |
+
if p >= len(self):
|
596 |
+
raise StopIteration
|
597 |
+
elif p < 0:
|
598 |
+
raise TypeError
|
599 |
+
return self[p:p + 1]
|
600 |
+
|
601 |
+
def next(self):
|
602 |
+
# Py2 compat
|
603 |
+
return self.__next__()
|
604 |
+
|
605 |
+
def previous(self):
|
606 |
+
p = self._position
|
607 |
+
if p >= len(self):
|
608 |
+
raise StopIteration
|
609 |
+
elif p < 0:
|
610 |
+
raise TypeError
|
611 |
+
self._position = p = p - 1
|
612 |
+
return self[p:p + 1]
|
613 |
+
|
614 |
+
def setPosition(self, position):
|
615 |
+
if self._position >= len(self):
|
616 |
+
raise StopIteration
|
617 |
+
self._position = position
|
618 |
+
|
619 |
+
def getPosition(self):
|
620 |
+
if self._position >= len(self):
|
621 |
+
raise StopIteration
|
622 |
+
if self._position >= 0:
|
623 |
+
return self._position
|
624 |
+
else:
|
625 |
+
return None
|
626 |
+
|
627 |
+
position = property(getPosition, setPosition)
|
628 |
+
|
629 |
+
def getCurrentByte(self):
|
630 |
+
return self[self.position:self.position + 1]
|
631 |
+
|
632 |
+
currentByte = property(getCurrentByte)
|
633 |
+
|
634 |
+
def skip(self, chars=spaceCharactersBytes):
|
635 |
+
"""Skip past a list of characters"""
|
636 |
+
p = self.position # use property for the error-checking
|
637 |
+
while p < len(self):
|
638 |
+
c = self[p:p + 1]
|
639 |
+
if c not in chars:
|
640 |
+
self._position = p
|
641 |
+
return c
|
642 |
+
p += 1
|
643 |
+
self._position = p
|
644 |
+
return None
|
645 |
+
|
646 |
+
def skipUntil(self, chars):
|
647 |
+
p = self.position
|
648 |
+
while p < len(self):
|
649 |
+
c = self[p:p + 1]
|
650 |
+
if c in chars:
|
651 |
+
self._position = p
|
652 |
+
return c
|
653 |
+
p += 1
|
654 |
+
self._position = p
|
655 |
+
return None
|
656 |
+
|
657 |
+
def matchBytes(self, bytes):
|
658 |
+
"""Look for a sequence of bytes at the start of a string. If the bytes
|
659 |
+
are found return True and advance the position to the byte after the
|
660 |
+
match. Otherwise return False and leave the position alone"""
|
661 |
+
rv = self.startswith(bytes, self.position)
|
662 |
+
if rv:
|
663 |
+
self.position += len(bytes)
|
664 |
+
return rv
|
665 |
+
|
666 |
+
def jumpTo(self, bytes):
|
667 |
+
"""Look for the next sequence of bytes matching a given sequence. If
|
668 |
+
a match is found advance the position to the last byte of the match"""
|
669 |
+
try:
|
670 |
+
self._position = self.index(bytes, self.position) + len(bytes) - 1
|
671 |
+
except ValueError:
|
672 |
+
raise StopIteration
|
673 |
+
return True
|
674 |
+
|
675 |
+
|
676 |
+
class EncodingParser(object):
|
677 |
+
"""Mini parser for detecting character encoding from meta elements"""
|
678 |
+
|
679 |
+
def __init__(self, data):
|
680 |
+
"""string - the data to work on for encoding detection"""
|
681 |
+
self.data = EncodingBytes(data)
|
682 |
+
self.encoding = None
|
683 |
+
|
684 |
+
def getEncoding(self):
|
685 |
+
if b"<meta" not in self.data:
|
686 |
+
return None
|
687 |
+
|
688 |
+
methodDispatch = (
|
689 |
+
(b"<!--", self.handleComment),
|
690 |
+
(b"<meta", self.handleMeta),
|
691 |
+
(b"</", self.handlePossibleEndTag),
|
692 |
+
(b"<!", self.handleOther),
|
693 |
+
(b"<?", self.handleOther),
|
694 |
+
(b"<", self.handlePossibleStartTag))
|
695 |
+
for _ in self.data:
|
696 |
+
keepParsing = True
|
697 |
+
try:
|
698 |
+
self.data.jumpTo(b"<")
|
699 |
+
except StopIteration:
|
700 |
+
break
|
701 |
+
for key, method in methodDispatch:
|
702 |
+
                if self.data.matchBytes(key):
                    try:
                        keepParsing = method()
                        break
                    except StopIteration:
                        keepParsing = False
                        break
            if not keepParsing:
                break

        return self.encoding

    def handleComment(self):
        """Skip over comments"""
        return self.data.jumpTo(b"-->")

    def handleMeta(self):
        if self.data.currentByte not in spaceCharactersBytes:
            # if we have <meta not followed by a space so just keep going
            return True
        # We have a valid meta element we want to search for attributes
        hasPragma = False
        pendingEncoding = None
        while True:
            # Try to find the next attribute after the current position
            attr = self.getAttribute()
            if attr is None:
                return True
            else:
                if attr[0] == b"http-equiv":
                    hasPragma = attr[1] == b"content-type"
                    if hasPragma and pendingEncoding is not None:
                        self.encoding = pendingEncoding
                        return False
                elif attr[0] == b"charset":
                    tentativeEncoding = attr[1]
                    codec = lookupEncoding(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        return False
                elif attr[0] == b"content":
                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                    tentativeEncoding = contentParser.parse()
                    if tentativeEncoding is not None:
                        codec = lookupEncoding(tentativeEncoding)
                        if codec is not None:
                            if hasPragma:
                                self.encoding = codec
                                return False
                            else:
                                pendingEncoding = codec

    def handlePossibleStartTag(self):
        return self.handlePossibleTag(False)

    def handlePossibleEndTag(self):
        next(self.data)
        return self.handlePossibleTag(True)

    def handlePossibleTag(self, endTag):
        data = self.data
        if data.currentByte not in asciiLettersBytes:
            # If the next byte is not an ascii letter either ignore this
            # fragment (possible start tag case) or treat it according to
            # handleOther
            if endTag:
                data.previous()
                self.handleOther()
            return True

        c = data.skipUntil(spacesAngleBrackets)
        if c == b"<":
            # return to the first step in the overall "two step" algorithm
            # reprocessing the < byte
            data.previous()
        else:
            # Read all attributes
            attr = self.getAttribute()
            while attr is not None:
                attr = self.getAttribute()
        return True

    def handleOther(self):
        return self.data.jumpTo(b">")

    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None"""
        data = self.data
        # Step 1 (skip chars)
        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
        assert c is None or len(c) == 1
        # Step 2
        if c in (b">", None):
            return None
        # Step 3
        attrName = []
        attrValue = []
        # Step 4 attribute name
        while True:
            if c == b"=" and attrName:
                break
            elif c in spaceCharactersBytes:
                # Step 6!
                c = data.skip()
                break
            elif c in (b"/", b">"):
                return b"".join(attrName), b""
            elif c in asciiUppercaseBytes:
                attrName.append(c.lower())
            elif c is None:
                return None
            else:
                attrName.append(c)
            # Step 5
            c = next(data)
        # Step 7
        if c != b"=":
            data.previous()
            return b"".join(attrName), b""
        # Step 8
        next(data)
        # Step 9
        c = data.skip()
        # Step 10
        if c in (b"'", b'"'):
            # 10.1
            quoteChar = c
            while True:
                # 10.2
                c = next(data)
                # 10.3
                if c == quoteChar:
                    next(data)
                    return b"".join(attrName), b"".join(attrValue)
                # 10.4
                elif c in asciiUppercaseBytes:
                    attrValue.append(c.lower())
                # 10.5
                else:
                    attrValue.append(c)
        elif c == b">":
            return b"".join(attrName), b""
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
        elif c is None:
            return None
        else:
            attrValue.append(c)
        # Step 11
        while True:
            c = next(data)
            if c in spacesAngleBrackets:
                return b"".join(attrName), b"".join(attrValue)
            elif c in asciiUppercaseBytes:
                attrValue.append(c.lower())
            elif c is None:
                return None
            else:
                attrValue.append(c)


class ContentAttrParser(object):
    def __init__(self, data):
        assert isinstance(data, bytes)
        self.data = data

    def parse(self):
        try:
            # Check if the attr name is charset
            # otherwise return
            self.data.jumpTo(b"charset")
            self.data.position += 1
            self.data.skip()
            if not self.data.currentByte == b"=":
                # If there is no = sign keep looking for attrs
                return None
            self.data.position += 1
            self.data.skip()
            # Look for an encoding between matching quote marks
            if self.data.currentByte in (b'"', b"'"):
                quoteMark = self.data.currentByte
                self.data.position += 1
                oldPosition = self.data.position
                if self.data.jumpTo(quoteMark):
                    return self.data[oldPosition:self.data.position]
                else:
                    return None
            else:
                # Unquoted value
                oldPosition = self.data.position
                try:
                    self.data.skipUntil(spaceCharactersBytes)
                    return self.data[oldPosition:self.data.position]
                except StopIteration:
                    # Return the whole remaining value
                    return self.data[oldPosition:]
        except StopIteration:
            return None


def lookupEncoding(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
    if isinstance(encoding, bytes):
        try:
            encoding = encoding.decode("ascii")
        except UnicodeDecodeError:
            return None

    if encoding is not None:
        try:
            return webencodings.lookup(encoding)
        except AttributeError:
            return None
    else:
        return None
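The charset-sniffing helpers above (ContentAttrParser together with lookupEncoding) reduce to: find "charset=" inside a meta content attribute, take the quoted or space-delimited value that follows, and resolve that label through webencodings. Below is a minimal standalone sketch of the same idea using only the standard library; the function name sniff_meta_content is hypothetical and is not part of the vendored module.

import re

def sniff_meta_content(content):
    # Roughly mimic ContentAttrParser.parse() on a <meta content="..."> value.
    # Returns the declared label (e.g. "utf-8") or None. Illustrative sketch,
    # not the vendored implementation.
    m = re.search(r"""charset\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s;]+))""",
                  content, re.IGNORECASE)
    if not m:
        return None
    return next(group for group in m.groups() if group is not None)

print(sniff_meta_content("text/html; charset=UTF-8"))    # UTF-8
print(sniff_meta_content('text/html; charset="utf-8"'))  # utf-8
print(sniff_meta_content("text/html"))                   # None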
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_tokenizer.py
ADDED
@@ -0,0 +1,1735 @@
from __future__ import absolute_import, division, unicode_literals

from six import unichr as chr

from collections import deque, OrderedDict
from sys import version_info

from .constants import spaceCharacters
from .constants import entities
from .constants import asciiLetters, asciiUpper2Lower
from .constants import digits, hexDigits, EOF
from .constants import tokenTypes, tagTokenTypes
from .constants import replacementCharacters

from ._inputstream import HTMLInputStream

from ._trie import Trie

entitiesTrie = Trie(entities)

if version_info >= (3, 7):
    attributeMap = dict
else:
    attributeMap = OrderedDict


class HTMLTokenizer(object):
    """ This class takes care of tokenizing HTML.

    * self.currentToken
      Holds the token that is currently being processed.

    * self.state
      Holds a reference to the method to be invoked... XXX

    * self.stream
      Points to HTMLInputStream object.
    """

    def __init__(self, stream, parser=None, **kwargs):

        self.stream = HTMLInputStream(stream, **kwargs)
        self.parser = parser

        # Setup the initial tokenizer state
        self.escapeFlag = False
        self.lastFourChars = []
        self.state = self.dataState
        self.escape = False

        # The current token being created
        self.currentToken = None
        super(HTMLTokenizer, self).__init__()

    def __iter__(self):
        """ This is where the magic happens.

        We do our usually processing through the states and when we have a token
        to return we yield the token which pauses processing until the next token
        is requested.
        """
        self.tokenQueue = deque([])
        # Start processing. When EOF is reached self.state will return False
        # instead of True and the loop will terminate.
        while self.state():
            while self.stream.errors:
                yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
            while self.tokenQueue:
                yield self.tokenQueue.popleft()

    def consumeNumberEntity(self, isHex):
        """This function returns either U+FFFD or the character based on the
        decimal or hexadecimal representation. It also discards ";" if present.
        If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked.
        """

        allowed = digits
        radix = 10
        if isHex:
            allowed = hexDigits
            radix = 16

        charStack = []

        # Consume all the characters that are in range while making sure we
        # don't hit an EOF.
        c = self.stream.char()
        while c in allowed and c is not EOF:
            charStack.append(c)
            c = self.stream.char()

        # Convert the set of characters consumed to an int.
        charAsInt = int("".join(charStack), radix)

        # Certain characters get replaced with others
        if charAsInt in replacementCharacters:
            char = replacementCharacters[charAsInt]
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        elif ((0xD800 <= charAsInt <= 0xDFFF) or
              (charAsInt > 0x10FFFF)):
            char = "\uFFFD"
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        else:
            # Should speed up this check somehow (e.g. move the set to a constant)
            if ((0x0001 <= charAsInt <= 0x0008) or
                (0x000E <= charAsInt <= 0x001F) or
                (0x007F <= charAsInt <= 0x009F) or
                (0xFDD0 <= charAsInt <= 0xFDEF) or
                charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
                                        0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                        0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
                                        0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
                                        0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
                                        0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
                                        0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                        0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
                                        0xFFFFF, 0x10FFFE, 0x10FFFF])):
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data":
                                        "illegal-codepoint-for-numeric-entity",
                                        "datavars": {"charAsInt": charAsInt}})
            try:
                # Try/except needed as UCS-2 Python builds' unichar only works
                # within the BMP.
                char = chr(charAsInt)
            except ValueError:
                v = charAsInt - 0x10000
                char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))

        # Discard the ; if present. Otherwise, put it back on the queue and
        # invoke parseError on parser.
        if c != ";":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "numeric-entity-without-semicolon"})
            self.stream.unget(c)

        return char

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # Initialise to the default output for when no entity is matched
        output = "&"

        charStack = [self.stream.char()]
        if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
                (allowedChar is not None and allowedChar == charStack[0])):
            self.stream.unget(charStack[0])

        elif charStack[0] == "#":
            # Read the next character to see if it's hex or decimal
            hex = False
            charStack.append(self.stream.char())
            if charStack[-1] in ("x", "X"):
                hex = True
                charStack.append(self.stream.char())

            # charStack[-1] should be the first digit
            if (hex and charStack[-1] in hexDigits) \
                    or (not hex and charStack[-1] in digits):
                # At least one digit found, so consume the whole number
                self.stream.unget(charStack[-1])
                output = self.consumeNumberEntity(hex)
            else:
                # No digits found
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "expected-numeric-entity"})
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)

        else:
            # At this point in the process might have named entity. Entities
            # are stored in the global variable "entities".
            #
            # Consume characters and compare to these to a substring of the
            # entity names in the list until the substring no longer matches.
            while (charStack[-1] is not EOF):
                if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
                    break
                charStack.append(self.stream.char())

            # At this point we have a string that starts with some characters
            # that may match an entity
            # Try to find the longest entity the string will match to take care
            # of &noti for instance.
            try:
                entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
                entityLength = len(entityName)
            except KeyError:
                entityName = None

            if entityName is not None:
                if entityName[-1] != ";":
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "named-entity-without-semicolon"})
                if (entityName[-1] != ";" and fromAttribute and
                    (charStack[entityLength] in asciiLetters or
                     charStack[entityLength] in digits or
                     charStack[entityLength] == "=")):
                    self.stream.unget(charStack.pop())
                    output = "&" + "".join(charStack)
                else:
                    output = entities[entityName]
                    self.stream.unget(charStack.pop())
                    output += "".join(charStack[entityLength:])
            else:
                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                        "expected-named-entity"})
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)

        if fromAttribute:
            self.currentToken["data"][-1][1] += output
        else:
            if output in spaceCharacters:
                tokenType = "SpaceCharacters"
            else:
                tokenType = "Characters"
            self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})

    def processEntityInAttribute(self, allowedChar):
        """This method replaces the need for "entityInAttributeValueState".
        """
        self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)

    def emitCurrentToken(self):
        """This method is a generic handler for emitting the tags. It also sets
        the state to "data" because that's what's needed after a token has been
        emitted.
        """
        token = self.currentToken
        # Add token to the queue to be yielded
        if (token["type"] in tagTokenTypes):
            token["name"] = token["name"].translate(asciiUpper2Lower)
            if token["type"] == tokenTypes["StartTag"]:
                raw = token["data"]
                data = attributeMap(raw)
                if len(raw) > len(data):
                    # we had some duplicated attribute, fix so first wins
                    data.update(raw[::-1])
                token["data"] = data

            if token["type"] == tokenTypes["EndTag"]:
                if token["data"]:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                            "data": "attributes-in-end-tag"})
                if token["selfClosing"]:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                            "data": "self-closing-flag-on-end-tag"})
        self.tokenQueue.append(token)
        self.state = self.dataState

    # Below are the various tokenizer states worked out.
    def dataState(self):
        data = self.stream.char()
        if data == "&":
            self.state = self.entityDataState
        elif data == "<":
            self.state = self.tagOpenState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\u0000"})
        elif data is EOF:
            # Tokenization ends.
            return False
        elif data in spaceCharacters:
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                    data + self.stream.charsUntil(spaceCharacters, True)})
            # No need to update lastFourChars here, since the first space will
            # have already been appended to lastFourChars and will have broken
            # any <!-- or --> sequences
        else:
            chars = self.stream.charsUntil(("&", "<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True

    def entityDataState(self):
        self.consumeEntity()
        self.state = self.dataState
        return True

    def rcdataState(self):
        data = self.stream.char()
        if data == "&":
            self.state = self.characterReferenceInRcdata
        elif data == "<":
            self.state = self.rcdataLessThanSignState
        elif data == EOF:
            # Tokenization ends.
            return False
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data in spaceCharacters:
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                    data + self.stream.charsUntil(spaceCharacters, True)})
            # No need to update lastFourChars here, since the first space will
            # have already been appended to lastFourChars and will have broken
            # any <!-- or --> sequences
        else:
            chars = self.stream.charsUntil(("&", "<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True

    def characterReferenceInRcdata(self):
        self.consumeEntity()
        self.state = self.rcdataState
        return True

    def rawtextState(self):
        data = self.stream.char()
        if data == "<":
            self.state = self.rawtextLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            # Tokenization ends.
            return False
        else:
            chars = self.stream.charsUntil(("<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True

    def scriptDataState(self):
        data = self.stream.char()
        if data == "<":
            self.state = self.scriptDataLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            # Tokenization ends.
            return False
        else:
            chars = self.stream.charsUntil(("<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True

    def plaintextState(self):
        data = self.stream.char()
        if data == EOF:
            # Tokenization ends.
            return False
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + self.stream.charsUntil("\u0000")})
        return True

    def tagOpenState(self):
        data = self.stream.char()
        if data == "!":
            self.state = self.markupDeclarationOpenState
        elif data == "/":
            self.state = self.closeTagOpenState
        elif data in asciiLetters:
            self.currentToken = {"type": tokenTypes["StartTag"],
                                 "name": data, "data": [],
                                 "selfClosing": False,
                                 "selfClosingAcknowledged": False}
            self.state = self.tagNameState
        elif data == ">":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name-but-got-right-bracket"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
            self.state = self.dataState
        elif data == "?":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name-but-got-question-mark"})
            self.stream.unget(data)
            self.state = self.bogusCommentState
        else:
            # XXX
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.dataState
        return True

    def closeTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
                                 "data": [], "selfClosing": False}
            self.state = self.tagNameState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-closing-tag-but-got-right-bracket"})
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-closing-tag-but-got-eof"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.state = self.dataState
        else:
            # XXX data can be _'_...
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-closing-tag-but-got-char",
                                    "datavars": {"data": data}})
            self.stream.unget(data)
            self.state = self.bogusCommentState
        return True

    def tagNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-tag-name"})
            self.state = self.dataState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] += "\uFFFD"
        else:
            self.currentToken["name"] += data
            # (Don't use charsUntil here, because tag names are
            # very short and it's faster to not do anything fancy)
        return True

    def rcdataLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.temporaryBuffer = ""
            self.state = self.rcdataEndTagOpenState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.rcdataState
        return True

    def rcdataEndTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer += data
            self.state = self.rcdataEndTagNameState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.stream.unget(data)
            self.state = self.rcdataState
        return True

    def rcdataEndTagNameState(self):
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.rcdataState
        return True

    def rawtextLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.temporaryBuffer = ""
            self.state = self.rawtextEndTagOpenState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.rawtextState
        return True

    def rawtextEndTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer += data
            self.state = self.rawtextEndTagNameState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.stream.unget(data)
            self.state = self.rawtextState
        return True

    def rawtextEndTagNameState(self):
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.rawtextState
        return True

    def scriptDataLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.temporaryBuffer = ""
            self.state = self.scriptDataEndTagOpenState
        elif data == "!":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
            self.state = self.scriptDataEscapeStartState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEndTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer += data
            self.state = self.scriptDataEndTagNameState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEndTagNameState(self):
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEscapeStartState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapeStartDashState
        else:
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEscapeStartDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapedDashDashState
        else:
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEscapedState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapedDashState
        elif data == "<":
            self.state = self.scriptDataEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            self.state = self.dataState
        else:
            chars = self.stream.charsUntil(("<", "-", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True

    def scriptDataEscapedDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapedDashDashState
        elif data == "<":
            self.state = self.scriptDataEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataEscapedState
        elif data == EOF:
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataEscapedDashDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        elif data == "<":
            self.state = self.scriptDataEscapedLessThanSignState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
            self.state = self.scriptDataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataEscapedState
        elif data == EOF:
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataEscapedLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.temporaryBuffer = ""
            self.state = self.scriptDataEscapedEndTagOpenState
        elif data in asciiLetters:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
            self.temporaryBuffer = data
            self.state = self.scriptDataDoubleEscapeStartState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataEscapedEndTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer = data
            self.state = self.scriptDataEscapedEndTagNameState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataEscapedEndTagNameState(self):
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataDoubleEscapeStartState(self):
        data = self.stream.char()
        if data in (spaceCharacters | frozenset(("/", ">"))):
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            if self.temporaryBuffer.lower() == "script":
                self.state = self.scriptDataDoubleEscapedState
            else:
                self.state = self.scriptDataEscapedState
        elif data in asciiLetters:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.temporaryBuffer += data
        else:
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataDoubleEscapedState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataDoubleEscapedDashState
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        return True

    def scriptDataDoubleEscapedDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataDoubleEscapedDashDashState
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataDoubleEscapedState
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataDoubleEscapedState
        return True

    def scriptDataDoubleEscapedDashDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
            self.state = self.scriptDataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataDoubleEscapedState
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataDoubleEscapedState
        return True

    def scriptDataDoubleEscapedLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
            self.temporaryBuffer = ""
            self.state = self.scriptDataDoubleEscapeEndState
        else:
            self.stream.unget(data)
            self.state = self.scriptDataDoubleEscapedState
        return True

    def scriptDataDoubleEscapeEndState(self):
        data = self.stream.char()
        if data in (spaceCharacters | frozenset(("/", ">"))):
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            if self.temporaryBuffer.lower() == "script":
                self.state = self.scriptDataEscapedState
            else:
                self.state = self.scriptDataDoubleEscapedState
        elif data in asciiLetters:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.temporaryBuffer += data
        else:
            self.stream.unget(data)
            self.state = self.scriptDataDoubleEscapedState
        return True

    def beforeAttributeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data in ("'", '"', "=", "<"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-name-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True

    def attributeNameState(self):
        data = self.stream.char()
        leavingThisState = True
        emitToken = False
        if data == "=":
            self.state = self.beforeAttributeValueState
        elif data in asciiLetters:
            self.currentToken["data"][-1][0] += data +\
                self.stream.charsUntil(asciiLetters, True)
            leavingThisState = False
        elif data == ">":
            # XXX If we emit here the attributes are converted to a dict
            # without being checked and when the code below runs we error
            # because data is a dict not a list
            emitToken = True
        elif data in spaceCharacters:
            self.state = self.afterAttributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][0] += "\uFFFD"
            leavingThisState = False
        elif data in ("'", '"', "<"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"][-1][0] += data
            leavingThisState = False
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-attribute-name"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][0] += data
            leavingThisState = False

        if leavingThisState:
            # Attributes are not dropped at this stage. That happens when the
            # start tag token is emitted so values can still be safely appended
            # to attributes, but we do want to report the parse error in time.
            self.currentToken["data"][-1][0] = (
                self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
            for name, _ in self.currentToken["data"][:-1]:
                if self.currentToken["data"][-1][0] == name:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "duplicate-attribute"})
                    break
            # XXX Fix for above XXX
            if emitToken:
                self.emitCurrentToken()
        return True

    def afterAttributeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == "=":
            self.state = self.beforeAttributeValueState
        elif data == ">":
            self.emitCurrentToken()
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data in ("'", '"', "<"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-after-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-end-of-tag-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True

    def beforeAttributeValueState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == "\"":
            self.state = self.attributeValueDoubleQuotedState
        elif data == "&":
            self.state = self.attributeValueUnQuotedState
            self.stream.unget(data)
        elif data == "'":
            self.state = self.attributeValueSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-value-but-got-right-bracket"})
            self.emitCurrentToken()
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
            self.state = self.attributeValueUnQuotedState
        elif data in ("=", "<", "`"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "equals-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
            self.state = self.attributeValueUnQuotedState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-value-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data
            self.state = self.attributeValueUnQuotedState
        return True

    def attributeValueDoubleQuotedState(self):
        data = self.stream.char()
        if data == "\"":
            self.state = self.afterAttributeValueState
        elif data == "&":
            self.processEntityInAttribute('"')
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-attribute-value-double-quote"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data +\
                self.stream.charsUntil(("\"", "&", "\u0000"))
        return True

    def attributeValueSingleQuotedState(self):
        data = self.stream.char()
        if data == "'":
            self.state = self.afterAttributeValueState
        elif data == "&":
            self.processEntityInAttribute("'")
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-attribute-value-single-quote"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data +\
                self.stream.charsUntil(("'", "&", "\u0000"))
        return True

def attributeValueUnQuotedState(self):
|
1046 |
+
data = self.stream.char()
|
1047 |
+
if data in spaceCharacters:
|
1048 |
+
self.state = self.beforeAttributeNameState
|
1049 |
+
elif data == "&":
|
1050 |
+
self.processEntityInAttribute(">")
|
1051 |
+
elif data == ">":
|
1052 |
+
self.emitCurrentToken()
|
1053 |
+
elif data in ('"', "'", "=", "<", "`"):
|
1054 |
+
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
1055 |
+
"unexpected-character-in-unquoted-attribute-value"})
|
1056 |
+
self.currentToken["data"][-1][1] += data
|
1057 |
+
elif data == "\u0000":
|
1058 |
+
self.tokenQueue.append({"type": tokenTypes["ParseError"],
|
1059 |
+
"data": "invalid-codepoint"})
|
1060 |
+
self.currentToken["data"][-1][1] += "\uFFFD"
|
1061 |
+
elif data is EOF:
|
1062 |
+
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
1063 |
+
"eof-in-attribute-value-no-quotes"})
|
1064 |
+
self.state = self.dataState
|
1065 |
+
else:
|
1066 |
+
self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
|
1067 |
+
frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
|
1068 |
+
return True
|
1069 |
+
|
1070 |
+
    def afterAttributeValueState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-EOF-after-attribute-value"})
            self.stream.unget(data)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-character-after-attribute-value"})
            self.stream.unget(data)
            self.state = self.beforeAttributeNameState
        return True

    def selfClosingStartTagState(self):
        data = self.stream.char()
        if data == ">":
            self.currentToken["selfClosing"] = True
            self.emitCurrentToken()
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data":
                                    "unexpected-EOF-after-solidus-in-tag"})
            self.stream.unget(data)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-character-after-solidus-in-tag"})
            self.stream.unget(data)
            self.state = self.beforeAttributeNameState
        return True

    def bogusCommentState(self):
        # Make a new comment token and give it as value all the characters
        # until the first > or EOF (charsUntil checks for EOF automatically)
        # and emit it.
        data = self.stream.charsUntil(">")
        data = data.replace("\u0000", "\uFFFD")
        self.tokenQueue.append(
            {"type": tokenTypes["Comment"], "data": data})

        # Eat the character directly after the bogus comment which is either a
        # ">" or an EOF.
        self.stream.char()
        self.state = self.dataState
        return True

    def markupDeclarationOpenState(self):
        charStack = [self.stream.char()]
        if charStack[-1] == "-":
            charStack.append(self.stream.char())
            if charStack[-1] == "-":
                self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
                self.state = self.commentStartState
                return True
        elif charStack[-1] in ('d', 'D'):
            matched = True
            for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
                             ('y', 'Y'), ('p', 'P'), ('e', 'E')):
                charStack.append(self.stream.char())
                if charStack[-1] not in expected:
                    matched = False
                    break
            if matched:
                self.currentToken = {"type": tokenTypes["Doctype"],
                                     "name": "",
                                     "publicId": None, "systemId": None,
                                     "correct": True}
                self.state = self.doctypeState
                return True
        elif (charStack[-1] == "[" and
              self.parser is not None and
              self.parser.tree.openElements and
              self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
            matched = True
            for expected in ["C", "D", "A", "T", "A", "["]:
                charStack.append(self.stream.char())
                if charStack[-1] != expected:
                    matched = False
                    break
            if matched:
                self.state = self.cdataSectionState
                return True

        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-dashes-or-doctype"})

        while charStack:
            self.stream.unget(charStack.pop())
        self.state = self.bogusCommentState
        return True

    def commentStartState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.commentStartDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += data
            self.state = self.commentState
        return True

    def commentStartDashState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "-\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "-" + data
            self.state = self.commentState
        return True

    def commentState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += data + \
                self.stream.charsUntil(("-", "\u0000"))
        return True

    def commentEndDashState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "-\uFFFD"
            self.state = self.commentState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-end-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "-" + data
            self.state = self.commentState
        return True

    def commentEndState(self):
        data = self.stream.char()
        if data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "--\uFFFD"
            self.state = self.commentState
        elif data == "!":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-bang-after-double-dash-in-comment"})
            self.state = self.commentEndBangState
        elif data == "-":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-dash-after-double-dash-in-comment"})
            self.currentToken["data"] += data
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-double-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # XXX
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-comment"})
            self.currentToken["data"] += "--" + data
            self.state = self.commentState
        return True

    def commentEndBangState(self):
        data = self.stream.char()
        if data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "-":
            self.currentToken["data"] += "--!"
            self.state = self.commentEndDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "--!\uFFFD"
            self.state = self.commentState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-end-bang-state"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "--!" + data
            self.state = self.commentState
        return True

    def doctypeState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeDoctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-eof"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "need-space-after-doctype"})
            self.stream.unget(data)
            self.state = self.beforeDoctypeNameState
        return True

    def beforeDoctypeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-right-bracket"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] = "\uFFFD"
            self.state = self.doctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-eof"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] = data
            self.state = self.doctypeNameState
        return True

    def doctypeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.state = self.afterDoctypeNameState
        elif data == ">":
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] += "\uFFFD"
            self.state = self.doctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype-name"})
            self.currentToken["correct"] = False
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] += data
        return True

    def afterDoctypeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.currentToken["correct"] = False
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            if data in ("p", "P"):
                matched = True
                for expected in (("u", "U"), ("b", "B"), ("l", "L"),
                                 ("i", "I"), ("c", "C")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypePublicKeywordState
                    return True
            elif data in ("s", "S"):
                matched = True
                for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
                                 ("e", "E"), ("m", "M")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypeSystemKeywordState
                    return True

            # All the characters read before the current 'data' will be
            # [a-zA-Z], so they're garbage in the bogus doctype and can be
            # discarded; only the latest character might be '>' or EOF
            # and needs to be ungetted
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-space-or-right-bracket-in-doctype", "datavars":
                                    {"data": data}})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState

        return True

    def afterDoctypePublicKeywordState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeDoctypePublicIdentifierState
        elif data in ("'", '"'):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.stream.unget(data)
            self.state = self.beforeDoctypePublicIdentifierState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.stream.unget(data)
            self.state = self.beforeDoctypePublicIdentifierState
        return True

    def beforeDoctypePublicIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["publicId"] = ""
            self.state = self.doctypePublicIdentifierDoubleQuotedState
        elif data == "'":
            self.currentToken["publicId"] = ""
            self.state = self.doctypePublicIdentifierSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True

    def doctypePublicIdentifierDoubleQuotedState(self):
        data = self.stream.char()
        if data == "\"":
            self.state = self.afterDoctypePublicIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["publicId"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["publicId"] += data
        return True

    def doctypePublicIdentifierSingleQuotedState(self):
        data = self.stream.char()
        if data == "'":
            self.state = self.afterDoctypePublicIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["publicId"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["publicId"] += data
        return True

    def afterDoctypePublicIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.betweenDoctypePublicAndSystemIdentifiersState
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == '"':
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierDoubleQuotedState
        elif data == "'":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierSingleQuotedState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True

    def betweenDoctypePublicAndSystemIdentifiersState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == '"':
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierDoubleQuotedState
        elif data == "'":
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierSingleQuotedState
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True

    def afterDoctypeSystemKeywordState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeDoctypeSystemIdentifierState
        elif data in ("'", '"'):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.stream.unget(data)
            self.state = self.beforeDoctypeSystemIdentifierState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.stream.unget(data)
            self.state = self.beforeDoctypeSystemIdentifierState
        return True

    def beforeDoctypeSystemIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierDoubleQuotedState
        elif data == "'":
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True

    def doctypeSystemIdentifierDoubleQuotedState(self):
        data = self.stream.char()
        if data == "\"":
            self.state = self.afterDoctypeSystemIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["systemId"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["systemId"] += data
        return True

    def doctypeSystemIdentifierSingleQuotedState(self):
        data = self.stream.char()
        if data == "'":
            self.state = self.afterDoctypeSystemIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["systemId"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["systemId"] += data
        return True

    def afterDoctypeSystemIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.state = self.bogusDoctypeState
        return True

    def bogusDoctypeState(self):
        data = self.stream.char()
        if data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            # XXX EMIT
            self.stream.unget(data)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            pass
        return True

    def cdataSectionState(self):
        data = []
        while True:
            data.append(self.stream.charsUntil("]"))
            data.append(self.stream.charsUntil(">"))
            char = self.stream.char()
            if char == EOF:
                break
            else:
                assert char == ">"
                if data[-1][-2:] == "]]":
                    data[-1] = data[-1][:-2]
                    break
                else:
                    data.append(char)

        data = "".join(data)  # pylint:disable=redefined-variable-type
        # Deal with null here rather than in the parser
        nullCount = data.count("\u0000")
        if nullCount > 0:
            for _ in range(nullCount):
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "invalid-codepoint"})
            data = data.replace("\u0000", "\uFFFD")
        if data:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": data})
        self.state = self.dataState
        return True

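Note on how the state methods above fit together (an editorial sketch, not part of the uploaded file): each handler reads from self.stream, pushes tokens onto self.tokenQueue, reassigns self.state, and returns True while there is more input to process. Assuming a driver loop roughly like the one html5lib's tokenizer uses internally (the run_tokenizer name below is illustrative, not taken from this diff):

    # Hypothetical driver loop: repeatedly invoke the current state handler
    # and drain whatever tokens it queued; stop once a handler reports EOF.
    def run_tokenizer(tokenizer):
        tokenizer.state = tokenizer.dataState
        while tokenizer.state():              # handlers return True until EOF
            while tokenizer.tokenQueue:
                yield tokenizer.tokenQueue.pop(0)
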
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/__init__.py
ADDED
@@ -0,0 +1,5 @@
from __future__ import absolute_import, division, unicode_literals

from .py import Trie

__all__ = ["Trie"]

MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (325 Bytes).

MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/__pycache__/_base.cpython-39.pyc
ADDED
Binary file (1.57 kB).

MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/__pycache__/py.cpython-39.pyc
ADDED
Binary file (2.22 kB).

MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/_base.py
ADDED
@@ -0,0 +1,40 @@
from __future__ import absolute_import, division, unicode_literals

try:
    from collections.abc import Mapping
except ImportError:  # Python 2.7
    from collections import Mapping


class Trie(Mapping):
    """Abstract base class for tries"""

    def keys(self, prefix=None):
        # pylint:disable=arguments-differ
        keys = super(Trie, self).keys()

        if prefix is None:
            return set(keys)

        return {x for x in keys if x.startswith(prefix)}

    def has_keys_with_prefix(self, prefix):
        for key in self.keys():
            if key.startswith(prefix):
                return True

        return False

    def longest_prefix(self, prefix):
        if prefix in self:
            return prefix

        for i in range(1, len(prefix) + 1):
            if prefix[:-i] in self:
                return prefix[:-i]

        raise KeyError(prefix)

    def longest_prefix_item(self, prefix):
        lprefix = self.longest_prefix(prefix)
        return (lprefix, self[lprefix])
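
A concrete subclass only has to supply the Mapping protocol (__getitem__, __iter__, __len__); the prefix helpers above then work unchanged. A simplified, dict-backed illustration follows; it is not the vendored py.Trie (whose implementation lives in _trie/py.py and is not shown in this diff), and it assumes the vendored package is importable as tensorboard._vendor.html5lib:

    from tensorboard._vendor.html5lib._trie._base import Trie as ABCTrie

    class DictTrie(ABCTrie):
        """Toy trie that stores all entries in a plain dict."""

        def __init__(self, data):
            self._data = dict(data)

        def __getitem__(self, key):
            return self._data[key]

        def __iter__(self):
            return iter(self._data)

        def __len__(self):
            return len(self._data)

    t = DictTrie({"amp": "&", "ampere": "A"})
    t.longest_prefix("ampersand")   # -> "amp"
    t.keys("amp")                   # -> {"amp", "ampere"}
    t.has_keys_with_prefix("amq")   # -> False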