Kano001 committed
Commit cf2a15a
1 Parent(s): 6a86ad5

Upload 527 files

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. MLPY/Lib/site-packages/tensorboard/__init__.py +113 -0
  2. MLPY/Lib/site-packages/tensorboard/__pycache__/__init__.cpython-39.pyc +0 -0
  3. MLPY/Lib/site-packages/tensorboard/__pycache__/assets.cpython-39.pyc +0 -0
  4. MLPY/Lib/site-packages/tensorboard/__pycache__/auth.cpython-39.pyc +0 -0
  5. MLPY/Lib/site-packages/tensorboard/__pycache__/context.cpython-39.pyc +0 -0
  6. MLPY/Lib/site-packages/tensorboard/__pycache__/data_compat.cpython-39.pyc +0 -0
  7. MLPY/Lib/site-packages/tensorboard/__pycache__/dataclass_compat.cpython-39.pyc +0 -0
  8. MLPY/Lib/site-packages/tensorboard/__pycache__/default.cpython-39.pyc +0 -0
  9. MLPY/Lib/site-packages/tensorboard/__pycache__/errors.cpython-39.pyc +0 -0
  10. MLPY/Lib/site-packages/tensorboard/__pycache__/lazy.cpython-39.pyc +0 -0
  11. MLPY/Lib/site-packages/tensorboard/__pycache__/main.cpython-39.pyc +0 -0
  12. MLPY/Lib/site-packages/tensorboard/__pycache__/main_lib.cpython-39.pyc +0 -0
  13. MLPY/Lib/site-packages/tensorboard/__pycache__/manager.cpython-39.pyc +0 -0
  14. MLPY/Lib/site-packages/tensorboard/__pycache__/notebook.cpython-39.pyc +0 -0
  15. MLPY/Lib/site-packages/tensorboard/__pycache__/plugin_util.cpython-39.pyc +0 -0
  16. MLPY/Lib/site-packages/tensorboard/__pycache__/program.cpython-39.pyc +0 -0
  17. MLPY/Lib/site-packages/tensorboard/__pycache__/version.cpython-39.pyc +0 -0
  18. MLPY/Lib/site-packages/tensorboard/_vendor/__init__.py +0 -0
  19. MLPY/Lib/site-packages/tensorboard/_vendor/__pycache__/__init__.cpython-39.pyc +0 -0
  20. MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__init__.py +124 -0
  21. MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/__init__.cpython-39.pyc +0 -0
  22. MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/callbacks.cpython-39.pyc +0 -0
  23. MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/encoding.cpython-39.pyc +0 -0
  24. MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/linkifier.cpython-39.pyc +0 -0
  25. MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/sanitizer.cpython-39.pyc +0 -0
  26. MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/utils.cpython-39.pyc +0 -0
  27. MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/version.cpython-39.pyc +0 -0
  28. MLPY/Lib/site-packages/tensorboard/_vendor/bleach/callbacks.py +25 -0
  29. MLPY/Lib/site-packages/tensorboard/_vendor/bleach/encoding.py +62 -0
  30. MLPY/Lib/site-packages/tensorboard/_vendor/bleach/linkifier.py +526 -0
  31. MLPY/Lib/site-packages/tensorboard/_vendor/bleach/sanitizer.py +368 -0
  32. MLPY/Lib/site-packages/tensorboard/_vendor/bleach/utils.py +23 -0
  33. MLPY/Lib/site-packages/tensorboard/_vendor/bleach/version.py +6 -0
  34. MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__init__.py +35 -0
  35. MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/__init__.cpython-39.pyc +0 -0
  36. MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/_ihatexml.cpython-39.pyc +0 -0
  37. MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/_inputstream.cpython-39.pyc +0 -0
  38. MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/_tokenizer.cpython-39.pyc +0 -0
  39. MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/_utils.cpython-39.pyc +0 -0
  40. MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/constants.cpython-39.pyc +0 -0
  41. MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/html5parser.cpython-39.pyc +0 -0
  42. MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/serializer.cpython-39.pyc +0 -0
  43. MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_ihatexml.py +289 -0
  44. MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_inputstream.py +918 -0
  45. MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_tokenizer.py +1735 -0
  46. MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/__init__.py +5 -0
  47. MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/__pycache__/__init__.cpython-39.pyc +0 -0
  48. MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/__pycache__/_base.cpython-39.pyc +0 -0
  49. MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/__pycache__/py.cpython-39.pyc +0 -0
  50. MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/_base.py +40 -0
MLPY/Lib/site-packages/tensorboard/__init__.py ADDED
@@ -0,0 +1,113 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TensorBoard is a webapp for understanding TensorFlow runs and graphs."""


from tensorboard import lazy as _lazy
from tensorboard import version as _version

# TensorBoard public API.
__all__ = [
    "__version__",
    "errors",
    "notebook",
    "program",
    "summary",
]


# Please be careful when changing the structure of this file.
#
# The lazy imports in this file must use `importlib.import_module`, not
# `import tensorboard.foo` or `from tensorboard import foo`, or it will
# be impossible to reload the TensorBoard module without breaking these
# top-level public APIs. This has to do with the gory details of
# Python's module system. Take `tensorboard.notebook` as an example:
#
#   - When the `tensorboard` module (that's us!) is initialized, its
#     `notebook` attribute is initialized to a new LazyModule. The
#     actual `tensorboard.notebook` submodule is not loaded.
#
#   - When the `tensorboard.notebook` submodule is first loaded, Python
#     _reassigns_ the `notebook` attribute on the `tensorboard` module
#     object to point to the underlying `tensorboard.notebook` module
#     object, rather than its former LazyModule value. This occurs
#     whether the module is loaded via the lazy module or directly as an
#     import:
#
#       - import tensorboard; tensorboard.notebook.start(...)  # one way
#       - from tensorboard import notebook  # other way; same effect
#
#   - When the `tensorboard` module is reloaded, its `notebook`
#     attribute is once again bound to a (new) LazyModule, while the
#     `tensorboard.notebook` module object is unaffected and still
#     exists in `sys.modules`. But then...
#
#   - When the new LazyModule is forced, it must resolve to the existing
#     `tensorboard.notebook` module object rather than itself (which
#     just creates a stack overflow). If the LazyModule load function
#     uses `import tensorboard.notebook; return tensorboard.notebook`,
#     then the first statement will do _nothing_ because the
#     `tensorboard.notebook` module is already loaded, and the second
#     statement will return the LazyModule itself. The same goes for the
#     `from tensorboard import notebook` form. We need to ensure that
#     the submodule is loaded and then pull the actual module object out
#     of `sys.modules`... which is exactly what `importlib` handles for
#     us.
#
# See <https://github.com/tensorflow/tensorboard/issues/1989> for
# additional discussion.


@_lazy.lazy_load("tensorboard.errors")
def errors():
    import importlib

    return importlib.import_module("tensorboard.errors")


@_lazy.lazy_load("tensorboard.notebook")
def notebook():
    import importlib

    return importlib.import_module("tensorboard.notebook")


@_lazy.lazy_load("tensorboard.program")
def program():
    import importlib

    return importlib.import_module("tensorboard.program")


@_lazy.lazy_load("tensorboard.summary")
def summary():
    import importlib

    return importlib.import_module("tensorboard.summary")


def load_ipython_extension(ipython):
    """IPython API entry point.

    Only intended to be called by the IPython runtime.

    See:
      https://ipython.readthedocs.io/en/stable/config/extensions/index.html
    """
    notebook._load_ipython_extension(ipython)


__version__ = _version.VERSION
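For context: the practical effect of the `lazy_load` wrappers above is that `import tensorboard` stays cheap, and each heavy submodule is only imported on first attribute access. A minimal usage sketch, not part of the diff (the `runs/` log directory is a hypothetical example):

    import tensorboard

    print(tensorboard.__version__)              # version is resolved eagerly

    # First touch of `program` triggers importlib.import_module("tensorboard.program").
    tb = tensorboard.program.TensorBoard()
    tb.configure(argv=[None, "--logdir", "runs/"])
    url = tb.launch()                           # starts a TensorBoard server in-process
    print("TensorBoard listening on", url)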
MLPY/Lib/site-packages/tensorboard/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (1.41 kB).
 
MLPY/Lib/site-packages/tensorboard/__pycache__/assets.cpython-39.pyc ADDED
Binary file (1.01 kB).
 
MLPY/Lib/site-packages/tensorboard/__pycache__/auth.cpython-39.pyc ADDED
Binary file (3.45 kB).
 
MLPY/Lib/site-packages/tensorboard/__pycache__/context.cpython-39.pyc ADDED
Binary file (4.19 kB).
 
MLPY/Lib/site-packages/tensorboard/__pycache__/data_compat.cpython-39.pyc ADDED
Binary file (4.99 kB).
 
MLPY/Lib/site-packages/tensorboard/__pycache__/dataclass_compat.cpython-39.pyc ADDED
Binary file (6.47 kB).
 
MLPY/Lib/site-packages/tensorboard/__pycache__/default.cpython-39.pyc ADDED
Binary file (4.02 kB).
 
MLPY/Lib/site-packages/tensorboard/__pycache__/errors.cpython-39.pyc ADDED
Binary file (4.57 kB).
 
MLPY/Lib/site-packages/tensorboard/__pycache__/lazy.cpython-39.pyc ADDED
Binary file (2.84 kB).
 
MLPY/Lib/site-packages/tensorboard/__pycache__/main.cpython-39.pyc ADDED
Binary file (1.26 kB).
 
MLPY/Lib/site-packages/tensorboard/__pycache__/main_lib.cpython-39.pyc ADDED
Binary file (1.33 kB).
 
MLPY/Lib/site-packages/tensorboard/__pycache__/manager.cpython-39.pyc ADDED
Binary file (15 kB).
 
MLPY/Lib/site-packages/tensorboard/__pycache__/notebook.cpython-39.pyc ADDED
Binary file (11.7 kB).
 
MLPY/Lib/site-packages/tensorboard/__pycache__/plugin_util.cpython-39.pyc ADDED
Binary file (6.67 kB).
 
MLPY/Lib/site-packages/tensorboard/__pycache__/program.cpython-39.pyc ADDED
Binary file (26.7 kB).
 
MLPY/Lib/site-packages/tensorboard/__pycache__/version.cpython-39.pyc ADDED
Binary file (257 Bytes).
 
MLPY/Lib/site-packages/tensorboard/_vendor/__init__.py ADDED
File without changes
MLPY/Lib/site-packages/tensorboard/_vendor/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (156 Bytes).
 
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__init__.py ADDED
@@ -0,0 +1,124 @@
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

from tensorboard._vendor.bleach.linkifier import (
    DEFAULT_CALLBACKS,
    Linker,
    LinkifyFilter,
)
from tensorboard._vendor.bleach.sanitizer import (
    ALLOWED_ATTRIBUTES,
    ALLOWED_PROTOCOLS,
    ALLOWED_STYLES,
    ALLOWED_TAGS,
    BleachSanitizerFilter,
    Cleaner,
)
from tensorboard._vendor.bleach.version import __version__, VERSION  # flake8: noqa

__all__ = ['clean', 'linkify']


def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
          strip_comments=True):
    """Clean an HTML fragment of malicious content and return it

    This function is a security-focused function whose sole purpose is to
    remove malicious content from a string such that it can be displayed as
    content in a web page.

    This function is not designed to use to transform content to be used in
    non-web-page contexts.

    Example::

        import bleach

        better_text = bleach.clean(yucky_text)


    .. Note::

       If you're cleaning a lot of text and passing the same argument values or
       you want more configurability, consider using a
       :py:class:`bleach.sanitizer.Cleaner` instance.

    :arg str text: the text to clean

    :arg list tags: allowed list of tags; defaults to
        ``bleach.sanitizer.ALLOWED_TAGS``

    :arg dict attributes: allowed attributes; can be a callable, list or dict;
        defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

    :arg list styles: allowed list of css styles; defaults to
        ``bleach.sanitizer.ALLOWED_STYLES``

    :arg list protocols: allowed list of protocols for links; defaults
        to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

    :arg bool strip: whether or not to strip disallowed elements

    :arg bool strip_comments: whether or not to strip HTML comments

    :returns: cleaned text as unicode

    """
    cleaner = Cleaner(
        tags=tags,
        attributes=attributes,
        styles=styles,
        protocols=protocols,
        strip=strip,
        strip_comments=strip_comments,
    )
    return cleaner.clean(text)


def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False):
    """Convert URL-like strings in an HTML fragment to links

    This function converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify does a best-effort approach and tries to recover from bad
    situations due to crazy text.

    .. Note::

       If you're linking a lot of text and passing the same argument values or
       you want more configurability, consider using a
       :py:class:`bleach.linkifier.Linker` instance.

    .. Note::

       If you have text that you want to clean and then linkify, consider using
       the :py:class:`bleach.linkifier.LinkifyFilter` as a filter in the clean
       pass. That way you're not parsing the HTML twice.

    :arg str text: the text to linkify

    :arg list callbacks: list of callbacks to run when adjusting tag attributes;
        defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

    :arg list skip_tags: list of tags that you don't want to linkify the
        contents of; for example, you could set this to ``['pre']`` to skip
        linkifying contents of ``pre`` tags

    :arg bool parse_email: whether or not to linkify email addresses

    :returns: linkified text as unicode

    """
    linker = Linker(
        callbacks=callbacks,
        skip_tags=skip_tags,
        parse_email=parse_email
    )
    return linker.linkify(text)
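For context: `clean` and `linkify` are thin convenience wrappers over `Cleaner` and `Linker`. A short, hedged usage sketch of this vendored copy (input strings and expected outputs are illustrative only):

    from tensorboard._vendor import bleach

    dirty = u'<script>alert(1)</script><b>hi</b> see http://example.com'

    # Disallowed tags are escaped by default (strip=True removes them instead).
    print(bleach.clean(dirty))
    # roughly: '&lt;script&gt;alert(1)&lt;/script&gt;<b>hi</b> see http://example.com'

    # linkify wraps bare URLs in <a>, and the default nofollow callback adds rel="nofollow".
    print(bleach.linkify(u'see http://example.com'))
    # roughly: 'see <a href="http://example.com" rel="nofollow">http://example.com</a>'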
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (3.79 kB). View file
 
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/callbacks.cpython-39.pyc ADDED
Binary file (1.06 kB). View file
 
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/encoding.cpython-39.pyc ADDED
Binary file (1.6 kB). View file
 
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/linkifier.cpython-39.pyc ADDED
Binary file (11.2 kB). View file
 
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/sanitizer.cpython-39.pyc ADDED
Binary file (8.63 kB). View file
 
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/utils.cpython-39.pyc ADDED
Binary file (1.03 kB). View file
 
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/__pycache__/version.cpython-39.pyc ADDED
Binary file (401 Bytes). View file
 
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/callbacks.py ADDED
@@ -0,0 +1,25 @@
"""A set of basic callbacks for bleach.linkify."""
from __future__ import unicode_literals


def nofollow(attrs, new=False):
    href_key = (None, u'href')
    if href_key not in attrs or attrs[href_key].startswith(u'mailto:'):
        return attrs

    rel_key = (None, u'rel')
    rel_values = [val for val in attrs.get(rel_key, u'').split(u' ') if val]
    if u'nofollow' not in [rel_val.lower() for rel_val in rel_values]:
        rel_values.append(u'nofollow')
    attrs[rel_key] = u' '.join(rel_values)

    return attrs


def target_blank(attrs, new=False):
    href_key = (None, u'href')
    if attrs[href_key].startswith(u'mailto:'):
        return attrs

    attrs[(None, u'target')] = u'_blank'
    return attrs
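For context: linkify callbacks receive the attribute dict keyed by `(namespace, name)` tuples (plus the special `_text` key), and returning `None` drops the link entirely. A small hypothetical callback in the same style, not part of the vendored code:

    def skip_internal_links(attrs, new=False):
        # Hypothetical rule: don't turn bare intranet.example URLs into links.
        href = attrs.get((None, u'href'), u'')
        if new and href.startswith(u'http://intranet.example'):
            return None          # None means: leave the text as plain characters
        attrs[(None, u'title')] = u'external link'
        return attrs

    # Would be passed as, e.g.:
    #   bleach.linkify(text, callbacks=[skip_internal_links, nofollow])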
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/encoding.py ADDED
@@ -0,0 +1,62 @@
import datetime
from decimal import Decimal
import types
import six


def is_protected_type(obj):
    """Determine if the object instance is of a protected type.

    Objects of protected types are preserved as-is when passed to
    force_unicode(strings_only=True).
    """
    return isinstance(obj, (
        six.integer_types +
        (types.NoneType,
         datetime.datetime, datetime.date, datetime.time,
         float, Decimal))
    )


def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
    """
    Similar to smart_text, except that lazy instances are resolved to
    strings, rather than kept as lazy objects.

    If strings_only is True, don't convert (some) non-string-like objects.
    """
    # Handle the common case first, saves 30-40% when s is an instance of
    # six.text_type. This function gets called often in that setting.
    if isinstance(s, six.text_type):
        return s
    if strings_only and is_protected_type(s):
        return s
    try:
        if not isinstance(s, six.string_types):
            if hasattr(s, '__unicode__'):
                s = s.__unicode__()
            else:
                if six.PY3:
                    if isinstance(s, bytes):
                        s = six.text_type(s, encoding, errors)
                    else:
                        s = six.text_type(s)
                else:
                    s = six.text_type(bytes(s), encoding, errors)
        else:
            # Note: We use .decode() here, instead of six.text_type(s,
            # encoding, errors), so that if s is a SafeBytes, it ends up being
            # a SafeText at the end.
            s = s.decode(encoding, errors)
    except UnicodeDecodeError as e:
        if not isinstance(s, Exception):
            raise UnicodeDecodeError(*e.args)
        else:
            # If we get to here, the caller has passed in an Exception
            # subclass populated with non-ASCII bytestring data without a
            # working unicode method. Try to handle this without raising a
            # further exception by individually forcing the exception args
            # to unicode.
            s = ' '.join([force_unicode(arg, encoding, strings_only,
                                        errors) for arg in s])
    return s
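For context: `force_unicode` is essentially a Python 2/3 bridge. A quick sketch of the intended behaviour (assuming the default UTF-8 encoding):

    from tensorboard._vendor.bleach.encoding import force_unicode

    force_unicode(u'caf\xe9')             # text passes through unchanged
    force_unicode(b'caf\xc3\xa9')         # bytes are decoded to u'café'
    force_unicode(42, strings_only=True)  # protected types (int, None, dates, ...) are preserved as-is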
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/linkifier.py ADDED
@@ -0,0 +1,526 @@
from __future__ import unicode_literals
import re

from tensorboard._vendor import html5lib
from tensorboard._vendor.html5lib.filters.base import Filter
from tensorboard._vendor.html5lib.filters.sanitizer import allowed_protocols
from tensorboard._vendor.html5lib.serializer import HTMLSerializer

from tensorboard._vendor.bleach import callbacks as linkify_callbacks
from tensorboard._vendor.bleach.encoding import force_unicode
from tensorboard._vendor.bleach.utils import alphabetize_attributes


#: List of default callbacks
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]


TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()

# Make sure that .com doesn't get matched by .co first
TLDS.reverse()


def build_url_re(tlds=TLDS, protocols=allowed_protocols):
    """Builds the url regex used by linkifier

    If you want a different set of tlds or allowed protocols, pass those in
    and stomp on the existing ``url_re``::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = LinkifyFilter(url_re=my_url_re)

    """
    return re.compile(
        r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 1738,
            # except for # and ~, which happen in practice)
        """.format('|'.join(protocols), '|'.join(tlds)),
        re.IGNORECASE | re.VERBOSE | re.UNICODE)


URL_RE = build_url_re()


PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)


EMAIL_RE = re.compile(
    r"""(?<!//)
    (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+
        (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*  # dot-atom
    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
        |\\[\001-\011\013\014\016-\177])*"  # quoted-string
    )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})  # domain
    """,
    re.IGNORECASE | re.MULTILINE | re.VERBOSE)


class Linker(object):
    """Convert URL-like strings in an HTML fragment to links

    This function converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify does a best-effort approach and tries to recover from bad
    situations due to crazy text.

    """
    def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False,
                 url_re=URL_RE, email_re=EMAIL_RE):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :returns: linkified text as unicode

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
        self.walker = html5lib.getTreeWalker('etree')
        self.serializer = HTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,

            # linkify does not sanitize
            sanitize=False,

            # linkify alphabetizes
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        """
        text = force_unicode(text)

        if not text:
            return u''

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)


class LinkifyFilter(Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links
    * convert urls into links
    * edit existing links by running them through callbacks--the default is to
      add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """
    def __init__(self, source, callbacks=None, skip_tags=None, parse_email=False,
                 url_re=URL_RE, email_re=EMAIL_RE):
        """Creates a LinkifyFilter instance

        :arg TreeWalker source: stream

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        """
        super(LinkifyFilter, self).__init__(source)

        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or []
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. In the case of
        ``None``, we stop going through callbacks and return that and the link
        gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs

    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream"""
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatability with previous versions
        # of Bleach.

        out = []
        for token in token_list:
            token_type = token['type']
            if token_type in ['Characters', 'SpaceCharacters']:
                out.append(token['data'])

        return u''.join(out)

    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': text[end:match.start()]}
                        )

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, u'href'): u'mailto:%s' % match.group(0),
                        u'_text': match.group(0)
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {u'type': u'Characters', u'data': match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop(u'_text', '')
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend([
                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                            {u'type': u'Characters', u'data': force_unicode(_text)},
                            {u'type': u'EndTag', u'name': 'a'}
                        ])
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        """
        prefix = suffix = ''

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith(u'('):
                prefix = prefix + u'('
                fragment = fragment[1:]

                if fragment.endswith(u')'):
                    suffix = u')' + suffix
                    fragment = fragment[:-1]
                continue

            # Now try extraneous things from the end. For example, sometimes we
            # pick up ) at the end of a url, but the url is in a parenthesized
            # phrase like:
            #
            #     "i looked at the site (at http://example.com)"

            if fragment.endswith(u')') and u'(' not in fragment:
                fragment = fragment[:-1]
                suffix = u')' + suffix
                continue

            # Handle commas
            if fragment.endswith(u','):
                fragment = fragment[:-1]
                suffix = u',' + suffix
                continue

            # Handle periods
            if fragment.endswith(u'.'):
                fragment = fragment[:-1]
                suffix = u'.' + suffix
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix

    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        for token in src_iter:
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': text[end:match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ''

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = u'http://%s' % url

                    attrs = {
                        (None, u'href'): href,
                        u'_text': url
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {u'type': u'Characters', u'data': prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append(
                                {u'type': u'Characters', u'data': prefix}
                            )

                        _text = attrs.pop(u'_text', '')
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend([
                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                            {u'type': u'Characters', u'data': force_unicode(_text)},
                            {u'type': u'EndTag', u'name': 'a'},
                        ])

                        if suffix:
                            new_tokens.append(
                                {u'type': u'Characters', u'data': suffix}
                            )

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token['data']:
            attrs = a_token['data']
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs['_text'] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {'type': 'Characters', 'data': text}

        else:
            new_text = attrs.pop('_text', '')
            a_token['data'] = alphabetize_attributes(attrs)

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                for mem in token_buffer[1:]:
                    yield mem

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # it with the new text
                yield a_token
                yield {'type': 'Characters', 'data': force_unicode(new_text)}
                yield token_buffer[-1]

    def __iter__(self):
        in_a = False
        in_skip_tag = None

        token_buffer = []

        for token in super(LinkifyFilter, self).__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token['type'] == 'EndTag' and token['name'] == 'a':
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    for new_token in self.handle_a_tag(token_buffer):
                        yield new_token

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                    continue

                else:
                    token_buffer.append(token)
                    continue

            elif token['type'] in ['StartTag', 'EmptyTag']:
                if token['name'] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag.
                    in_skip_tag = token['name']

                elif token['name'] == 'a':
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag and self.skip_tags:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token['type'] == 'EndTag' and token['name'] == in_skip_tag:
                    in_skip_tag = None

            elif not in_a and not in_skip_tag and token['type'] == 'Characters':
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for token in new_stream:
                    yield token

                # We've already yielded this token, so continue
                continue

            yield token
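For context: as the `build_url_re` docstring notes, the URL regex is module-level state, so a custom TLD or protocol set is supplied by building a new regex and handing it to `Linker`. A hedged sketch (the TLD list is illustrative only):

    from tensorboard._vendor.bleach import linkifier

    my_url_re = linkifier.build_url_re(tlds=['com', 'org', 'net'],
                                       protocols=['http', 'https'])
    linker = linkifier.Linker(url_re=my_url_re, parse_email=True)

    print(linker.linkify(u'mail me@example.com or visit example.org'))
    # Both the email address and the bare domain should come back wrapped in <a> tags,
    # with rel="nofollow" added by the default callback.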
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/sanitizer.py ADDED
@@ -0,0 +1,368 @@
from __future__ import unicode_literals
import re
from xml.sax.saxutils import unescape

from tensorboard._vendor import html5lib
from tensorboard._vendor.html5lib.constants import namespaces
from tensorboard._vendor.html5lib.filters import sanitizer
from tensorboard._vendor.html5lib.serializer import HTMLSerializer

from tensorboard._vendor.bleach.encoding import force_unicode
from tensorboard._vendor.bleach.utils import alphabetize_attributes


#: List of allowed tags
ALLOWED_TAGS = [
    'a',
    'abbr',
    'acronym',
    'b',
    'blockquote',
    'code',
    'em',
    'i',
    'li',
    'ol',
    'strong',
    'ul',
]


#: Map of allowed attributes by tag
ALLOWED_ATTRIBUTES = {
    'a': ['href', 'title'],
    'abbr': ['title'],
    'acronym': ['title'],
}


#: List of allowed styles
ALLOWED_STYLES = []


#: List of allowed protocols
ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']


class Cleaner(object):
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    This cleaner is not designed to use to transform content to be used in
    non-web-page contexts.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    """

    def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
                 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
                 strip_comments=True, filters=None):
        """Initializes a Cleaner

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output are secure.

        """
        self.tags = tags
        self.attributes = attributes
        self.styles = styles
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []

        self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
        self.walker = html5lib.getTreeWalker('etree')
        self.serializer = HTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,

            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,

            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
            alphabetical_attributes=False,
        )

    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        """
        if not text:
            return u''

        text = force_unicode(text)

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),

            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,

            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)


def attribute_filter_factory(attributes):
    """Generates attribute filter function for the given attributes value

    The attributes value can take one of several shapes. This returns a filter
    function appropriate to the attributes value. One nice thing about this is
    that there's less if/then shenanigans in the ``allow_token`` method.

    """
    if callable(attributes):
        return attributes

    if isinstance(attributes, dict):
        def _attr_filter(tag, attr, value):
            if tag in attributes:
                attr_val = attributes[tag]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                if attr in attr_val:
                    return True

            if '*' in attributes:
                attr_val = attributes['*']
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                return attr in attr_val

            return False

        return _attr_filter

    if isinstance(attributes, list):
        def _attr_filter(tag, attr, value):
            return attr in attributes

        return _attr_filter

    raise ValueError('attributes needs to be a callable, a list or a dict')


class BleachSanitizerFilter(sanitizer.Filter):
    """html5lib Filter that sanitizes text

    This filter can be used anywhere html5lib filters can be used.

    """
    def __init__(self, source, attributes=ALLOWED_ATTRIBUTES,
                 strip_disallowed_elements=False, strip_html_comments=True,
                 **kwargs):
        """Creates a BleachSanitizerFilter instance

        :arg Treewalker source: stream

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip_disallowed_elements: whether or not to strip disallowed
            elements

        :arg bool strip_html_comments: whether or not to strip HTML comments

        """
        self.attr_filter = attribute_filter_factory(attributes)

        self.strip_disallowed_elements = strip_disallowed_elements
        self.strip_html_comments = strip_html_comments

        return super(BleachSanitizerFilter, self).__init__(source, **kwargs)

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function with two arguments of attribute name and
        value. It should return true of false.

        Also gives the option to strip tags instead of encoding.

        """
        token_type = token['type']
        if token_type in ['StartTag', 'EndTag', 'EmptyTag']:
            if token['name'] in self.allowed_elements:
                return self.allow_token(token)

            elif self.strip_disallowed_elements:
                pass

            else:
                if 'data' in token:
                    # Alphabetize the attributes before calling .disallowed_token()
                    # so that the resulting string is stable
                    token['data'] = alphabetize_attributes(token['data'])
                return self.disallowed_token(token)

        elif token_type == 'Comment':
            if not self.strip_html_comments:
                return token

        else:
            return token

    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if 'data' in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token['data'].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token['name'], name, val):
                    continue

                # Look at attributes that have uri values
                if namespaced_name in self.attr_val_is_uri:
                    val_unescaped = re.sub(
                        "[`\000-\040\177-\240\s]+",
                        '',
                        unescape(val)).lower()

                    # Remove replacement characters from unescaped characters.
                    val_unescaped = val_unescaped.replace("\ufffd", "")

                    # Drop attributes with uri values that have protocols that
                    # aren't allowed
                    if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and
                            (val_unescaped.split(':')[0] not in self.allowed_protocols)):
                        continue

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's a iri
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token['name']) in self.svg_allow_local_href:
                    if namespaced_name in [(None, 'href'), (namespaces['xlink'], 'href')]:
                        if re.search(r'^\s*[^#\s]', val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, u'style'):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token['data'] = alphabetize_attributes(attrs)

        return token

    def sanitize_css(self, style):
        """Sanitizes css in style tags"""
        # disallow urls
        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        parts = style.split(';')
        gauntlet = re.compile(
            r"""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$"""
        )

        for part in parts:
            if not gauntlet.match(part):
                return ''

        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue

            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
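For context: because `attribute_filter_factory` accepts a callable, a dict, or a list, per-tag attribute policies can be expressed directly. A brief sketch of a `Cleaner` configured with a callable filter (the policy shown is made up for illustration):

    from tensorboard._vendor.bleach.sanitizer import Cleaner

    def my_attr_filter(tag, name, value):
        # Hypothetical policy: href only on <a>, title anywhere.
        if tag == 'a' and name == 'href':
            return True
        return name == 'title'

    cleaner = Cleaner(attributes=my_attr_filter, strip=True)
    print(cleaner.clean(u'<a href="https://example.com" onclick="x()">ok</a>'))
    # onclick should be dropped; href survives the filter and the protocol check.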
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/utils.py ADDED
@@ -0,0 +1,23 @@
from collections import OrderedDict


def _attr_key(attr):
    """Returns appropriate key for sorting attribute names

    Attribute names are a tuple of ``(namespace, name)`` where namespace can be
    ``None`` or a string. These can't be compared in Python 3, so we conver the
    ``None`` to an empty string.

    """
    key = (attr[0][0] or ''), attr[0][1]
    return key


def alphabetize_attributes(attrs):
    """Takes a dict of attributes (or None) and returns them alphabetized"""
    if not attrs:
        return attrs

    return OrderedDict(
        [(k, v) for k, v in sorted(attrs.items(), key=_attr_key)]
    )
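For context: the sort key treats a `None` namespace as the empty string, so attribute output order is deterministic. A tiny illustration:

    from tensorboard._vendor.bleach.utils import alphabetize_attributes

    attrs = {(None, u'title'): u't', (None, u'href'): u'h'}
    alphabetize_attributes(attrs)
    # -> OrderedDict([((None, u'href'), u'h'), ((None, u'title'), u't')])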
MLPY/Lib/site-packages/tensorboard/_vendor/bleach/version.py ADDED
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

VERSION = (2, 0, 0)
__version__ = '.'.join([str(n) for n in VERSION])
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__init__.py ADDED
@@ -0,0 +1,35 @@
"""
HTML parsing library based on the `WHATWG HTML specification
<https://whatwg.org/html>`_. The parser is designed to be compatible with
existing HTML found in the wild and implements well-defined error recovery that
is largely compatible with modern desktop web browsers.

Example usage::

    import html5lib
    with open("my_document.html", "rb") as f:
        tree = html5lib.parse(f)

For convenience, this module re-exports the following names:

* :func:`~.html5parser.parse`
* :func:`~.html5parser.parseFragment`
* :class:`~.html5parser.HTMLParser`
* :func:`~.treebuilders.getTreeBuilder`
* :func:`~.treewalkers.getTreeWalker`
* :func:`~.serializer.serialize`
"""

from __future__ import absolute_import, division, unicode_literals

from .html5parser import HTMLParser, parse, parseFragment
from .treebuilders import getTreeBuilder
from .treewalkers import getTreeWalker
from .serializer import serialize

__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
           "getTreeWalker", "serialize"]

# this has to be at the top level, see how setup.py parses this
#: Distribution version number.
__version__ = "1.1"
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (1.26 kB).
 
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/_ihatexml.cpython-39.pyc ADDED
Binary file (13.7 kB).
 
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/_inputstream.cpython-39.pyc ADDED
Binary file (21.6 kB).
 
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/_tokenizer.cpython-39.pyc ADDED
Binary file (39.7 kB).
 
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/_utils.cpython-39.pyc ADDED
Binary file (4.76 kB).
 
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/constants.cpython-39.pyc ADDED
Binary file (66.3 kB).
 
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/html5parser.cpython-39.pyc ADDED
Binary file (91 kB).
 
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/__pycache__/serializer.cpython-39.pyc ADDED
Binary file (10.8 kB).
 
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_ihatexml.py ADDED
@@ -0,0 +1,289 @@
from __future__ import absolute_import, division, unicode_literals

import re
import warnings

from .constants import DataLossWarning

baseChar = """
[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
[#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
[#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
[#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
[#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
[#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
[#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
[#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
[#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
[#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
[#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
[#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
[#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
[#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
[#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
[#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
[#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
[#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
[#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
[#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
[#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
[#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
[#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
[#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
[#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
[#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
[#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
[#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
[#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
[#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
#x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
#x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
#x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
[#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
[#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
#x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
[#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
[#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
[#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
[#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
[#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
#x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
[#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
[#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
[#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
[#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""

ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""

combiningCharacter = """
[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
[#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
[#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
[#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
#x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
[#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
[#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
#x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
[#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
[#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
#x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
[#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
[#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
[#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
[#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
[#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
#x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
[#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
#x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
[#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
[#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
#x3099 | #x309A"""

digit = """
[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
[#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
[#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
[#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""

extender = """
#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
#[#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""

letter = " | ".join([baseChar, ideographic])

# Without the
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
                   extender])
nameFirst = " | ".join([letter, "_"])

reChar = re.compile(r"#x([\d|A-F]{4,4})")
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")


def charStringToList(chars):
    charRanges = [item.strip() for item in chars.split(" | ")]
    rv = []
    for item in charRanges:
        foundMatch = False
        for regexp in (reChar, reCharRange):
            match = regexp.match(item)
            if match is not None:
                rv.append([hexToInt(item) for item in match.groups()])
                if len(rv[-1]) == 1:
                    rv[-1] = rv[-1] * 2
                foundMatch = True
                break
        if not foundMatch:
            assert len(item) == 1

            rv.append([ord(item)] * 2)
    rv = normaliseCharList(rv)
    return rv
122
+
123
+
124
+ def normaliseCharList(charList):
125
+ charList = sorted(charList)
126
+ for item in charList:
127
+ assert item[1] >= item[0]
128
+ rv = []
129
+ i = 0
130
+ while i < len(charList):
131
+ j = 1
132
+ rv.append(charList[i])
133
+ while i + j < len(charList) and charList[i + j][0] <= rv[-1][1] + 1:
134
+ rv[-1][1] = charList[i + j][1]
135
+ j += 1
136
+ i += j
137
+ return rv
138
+
139
+
140
+ # We don't really support characters above the BMP :(
141
+ max_unicode = int("FFFF", 16)
142
+
143
+
144
+ def missingRanges(charList):
145
+ rv = []
146
+ if charList[0] != 0:
147
+ rv.append([0, charList[0][0] - 1])
148
+ for i, item in enumerate(charList[:-1]):
149
+ rv.append([item[1] + 1, charList[i + 1][0] - 1])
150
+ if charList[-1][1] != max_unicode:
151
+ rv.append([charList[-1][1] + 1, max_unicode])
152
+ return rv
153
+
154
+
155
+ def listToRegexpStr(charList):
156
+ rv = []
157
+ for item in charList:
158
+ if item[0] == item[1]:
159
+ rv.append(escapeRegexp(chr(item[0])))
160
+ else:
161
+ rv.append(escapeRegexp(chr(item[0])) + "-" +
162
+ escapeRegexp(chr(item[1])))
163
+ return "[%s]" % "".join(rv)
164
+
165
+
166
+ def hexToInt(hex_str):
167
+ return int(hex_str, 16)
168
+
169
+
170
+ def escapeRegexp(string):
171
+ specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
172
+ "[", "]", "|", "(", ")", "-")
173
+ for char in specialCharacters:
174
+ string = string.replace(char, "\\" + char)
175
+
176
+ return string
177
+
178
+ # output from the above
179
+ nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
180
+
181
+ nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
182
+
183
+ # Simpler things
184
+ nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")
185
+
186
+
187
+ class InfosetFilter(object):
188
+ replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
189
+
190
+ def __init__(self,
191
+ dropXmlnsLocalName=False,
192
+ dropXmlnsAttrNs=False,
193
+ preventDoubleDashComments=False,
194
+ preventDashAtCommentEnd=False,
195
+ replaceFormFeedCharacters=True,
196
+ preventSingleQuotePubid=False):
197
+
198
+ self.dropXmlnsLocalName = dropXmlnsLocalName
199
+ self.dropXmlnsAttrNs = dropXmlnsAttrNs
200
+
201
+ self.preventDoubleDashComments = preventDoubleDashComments
202
+ self.preventDashAtCommentEnd = preventDashAtCommentEnd
203
+
204
+ self.replaceFormFeedCharacters = replaceFormFeedCharacters
205
+
206
+ self.preventSingleQuotePubid = preventSingleQuotePubid
207
+
208
+ self.replaceCache = {}
209
+
210
+ def coerceAttribute(self, name, namespace=None):
211
+ if self.dropXmlnsLocalName and name.startswith("xmlns:"):
212
+ warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
213
+ return None
214
+ elif (self.dropXmlnsAttrNs and
215
+ namespace == "http://www.w3.org/2000/xmlns/"):
216
+ warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
217
+ return None
218
+ else:
219
+ return self.toXmlName(name)
220
+
221
+ def coerceElement(self, name):
222
+ return self.toXmlName(name)
223
+
224
+ def coerceComment(self, data):
225
+ if self.preventDoubleDashComments:
226
+ while "--" in data:
227
+ warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
228
+ data = data.replace("--", "- -")
229
+ if data.endswith("-"):
230
+ warnings.warn("Comments cannot end in a dash", DataLossWarning)
231
+ data += " "
232
+ return data
233
+
234
+ def coerceCharacters(self, data):
235
+ if self.replaceFormFeedCharacters:
236
+ for _ in range(data.count("\x0C")):
237
+ warnings.warn("Text cannot contain U+000C", DataLossWarning)
238
+ data = data.replace("\x0C", " ")
239
+ # Other non-xml characters
240
+ return data
241
+
242
+ def coercePubid(self, data):
243
+ dataOutput = data
244
+ for char in nonPubidCharRegexp.findall(data):
245
+ warnings.warn("Coercing non-XML pubid", DataLossWarning)
246
+ replacement = self.getReplacementCharacter(char)
247
+ dataOutput = dataOutput.replace(char, replacement)
248
+ if self.preventSingleQuotePubid and dataOutput.find("'") >= 0:
249
+ warnings.warn("Pubid cannot contain single quote", DataLossWarning)
250
+ dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
251
+ return dataOutput
252
+
253
+ def toXmlName(self, name):
254
+ nameFirst = name[0]
255
+ nameRest = name[1:]
256
+ m = nonXmlNameFirstBMPRegexp.match(nameFirst)
257
+ if m:
258
+ warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
259
+ nameFirstOutput = self.getReplacementCharacter(nameFirst)
260
+ else:
261
+ nameFirstOutput = nameFirst
262
+
263
+ nameRestOutput = nameRest
264
+ replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
265
+ for char in replaceChars:
266
+ warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
267
+ replacement = self.getReplacementCharacter(char)
268
+ nameRestOutput = nameRestOutput.replace(char, replacement)
269
+ return nameFirstOutput + nameRestOutput
270
+
271
+ def getReplacementCharacter(self, char):
272
+ if char in self.replaceCache:
273
+ replacement = self.replaceCache[char]
274
+ else:
275
+ replacement = self.escapeChar(char)
276
+ return replacement
277
+
278
+ def fromXmlName(self, name):
279
+ for item in set(self.replacementRegexp.findall(name)):
280
+ name = name.replace(item, self.unescapeChar(item))
281
+ return name
282
+
283
+ def escapeChar(self, char):
284
+ replacement = "U%05X" % ord(char)
285
+ self.replaceCache[char] = replacement
286
+ return replacement
287
+
288
+ def unescapeChar(self, charcode):
289
+ return chr(int(charcode[1:], 16))
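Editorial note, not part of the uploaded file: _ihatexml.py builds regular expressions for the XML 1.0 name grammar and uses them in InfosetFilter to coerce HTML-legal but XML-illegal names into reversible escaped forms. A minimal usage sketch follows, assuming the vendored import path tensorboard._vendor.html5lib._ihatexml.

import warnings
from tensorboard._vendor.html5lib._ihatexml import InfosetFilter

filt = InfosetFilter(preventDoubleDashComments=True)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")             # DataLossWarning is expected here
    coerced = filt.coerceElement("foo<bar")     # "<" is not a legal XML name character
    fixed_comment = filt.coerceComment("a--b")  # "--" is rewritten to "- -"
print(coerced)                    # "fooU0003Cbar": the "<" is escaped as a U%05X token
print(filt.fromXmlName(coerced))  # reverses the coercion back to "foo<bar"
print(fixed_comment)              # "a- -b"

Each illegal character is replaced with a "U%05X" token and cached, so fromXmlName can later undo the coercion.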
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_inputstream.py ADDED
@@ -0,0 +1,918 @@
1
+ from __future__ import absolute_import, division, unicode_literals
2
+
3
+ from six import text_type
4
+ from six.moves import http_client, urllib
5
+
6
+ import codecs
7
+ import re
8
+ from io import BytesIO, StringIO
9
+
10
+ from tensorboard._vendor import webencodings
11
+
12
+ from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
13
+ from .constants import _ReparseException
14
+ from . import _utils
15
+
16
+ # Non-unicode versions of constants for use in the pre-parser
17
+ spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
18
+ asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
19
+ asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
20
+ spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
21
+
22
+
23
+ invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa
24
+
25
+ if _utils.supports_lone_surrogates:
26
+ # Use one extra step of indirection and create surrogates with
27
+ # eval. Not using this indirection would introduce an illegal
28
+ # unicode literal on platforms not supporting such lone
29
+ # surrogates.
30
+ assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
31
+ invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
32
+ eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
33
+ "]")
34
+ else:
35
+ invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
36
+
37
+ non_bmp_invalid_codepoints = {0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
38
+ 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
39
+ 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
40
+ 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
41
+ 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
42
+ 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
43
+ 0x10FFFE, 0x10FFFF}
44
+
45
+ ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")
46
+
47
+ # Cache for charsUntil()
48
+ charsUntilRegEx = {}
49
+
50
+
51
+ class BufferedStream(object):
52
+ """Buffering for streams that do not have buffering of their own
53
+
54
+ The buffer is implemented as a list of chunks on the assumption that
55
+ joining many strings will be slow since it is O(n**2)
56
+ """
57
+
58
+ def __init__(self, stream):
59
+ self.stream = stream
60
+ self.buffer = []
61
+ self.position = [-1, 0] # chunk number, offset
62
+
63
+ def tell(self):
64
+ pos = 0
65
+ for chunk in self.buffer[:self.position[0]]:
66
+ pos += len(chunk)
67
+ pos += self.position[1]
68
+ return pos
69
+
70
+ def seek(self, pos):
71
+ assert pos <= self._bufferedBytes()
72
+ offset = pos
73
+ i = 0
74
+ while len(self.buffer[i]) < offset:
75
+ offset -= len(self.buffer[i])
76
+ i += 1
77
+ self.position = [i, offset]
78
+
79
+ def read(self, bytes):
80
+ if not self.buffer:
81
+ return self._readStream(bytes)
82
+ elif (self.position[0] == len(self.buffer) and
83
+ self.position[1] == len(self.buffer[-1])):
84
+ return self._readStream(bytes)
85
+ else:
86
+ return self._readFromBuffer(bytes)
87
+
88
+ def _bufferedBytes(self):
89
+ return sum([len(item) for item in self.buffer])
90
+
91
+ def _readStream(self, bytes):
92
+ data = self.stream.read(bytes)
93
+ self.buffer.append(data)
94
+ self.position[0] += 1
95
+ self.position[1] = len(data)
96
+ return data
97
+
98
+ def _readFromBuffer(self, bytes):
99
+ remainingBytes = bytes
100
+ rv = []
101
+ bufferIndex = self.position[0]
102
+ bufferOffset = self.position[1]
103
+ while bufferIndex < len(self.buffer) and remainingBytes != 0:
104
+ assert remainingBytes > 0
105
+ bufferedData = self.buffer[bufferIndex]
106
+
107
+ if remainingBytes <= len(bufferedData) - bufferOffset:
108
+ bytesToRead = remainingBytes
109
+ self.position = [bufferIndex, bufferOffset + bytesToRead]
110
+ else:
111
+ bytesToRead = len(bufferedData) - bufferOffset
112
+ self.position = [bufferIndex, len(bufferedData)]
113
+ bufferIndex += 1
114
+ rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
115
+ remainingBytes -= bytesToRead
116
+
117
+ bufferOffset = 0
118
+
119
+ if remainingBytes:
120
+ rv.append(self._readStream(remainingBytes))
121
+
122
+ return b"".join(rv)
123
+
124
+
125
+ def HTMLInputStream(source, **kwargs):
126
+ # Work around Python bug #20007: read(0) closes the connection.
127
+ # http://bugs.python.org/issue20007
128
+ if (isinstance(source, http_client.HTTPResponse) or
129
+ # Also check for addinfourl wrapping HTTPResponse
130
+ (isinstance(source, urllib.response.addbase) and
131
+ isinstance(source.fp, http_client.HTTPResponse))):
132
+ isUnicode = False
133
+ elif hasattr(source, "read"):
134
+ isUnicode = isinstance(source.read(0), text_type)
135
+ else:
136
+ isUnicode = isinstance(source, text_type)
137
+
138
+ if isUnicode:
139
+ encodings = [x for x in kwargs if x.endswith("_encoding")]
140
+ if encodings:
141
+ raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
142
+
143
+ return HTMLUnicodeInputStream(source, **kwargs)
144
+ else:
145
+ return HTMLBinaryInputStream(source, **kwargs)
146
+
147
+
148
+ class HTMLUnicodeInputStream(object):
149
+ """Provides a unicode stream of characters to the HTMLTokenizer.
150
+
151
+ This class takes care of character encoding and removing or replacing
152
+ incorrect byte-sequences and also provides column and line tracking.
153
+
154
+ """
155
+
156
+ _defaultChunkSize = 10240
157
+
158
+ def __init__(self, source):
159
+ """Initialises the HTMLInputStream.
160
+
161
+ HTMLInputStream(source, [encoding]) -> Normalized stream from source
162
+ for use by html5lib.
163
+
164
+ source can be either a file-object, local filename or a string.
165
+
166
+ The optional encoding parameter must be a string that indicates
167
+ the encoding. If specified, that encoding will be used,
168
+ regardless of any BOM or later declaration (such as in a meta
169
+ element)
170
+
171
+ """
172
+
173
+ if not _utils.supports_lone_surrogates:
174
+ # Such platforms will have already checked for such
175
+ # surrogate errors, so no need to do this checking.
176
+ self.reportCharacterErrors = None
177
+ elif len("\U0010FFFF") == 1:
178
+ self.reportCharacterErrors = self.characterErrorsUCS4
179
+ else:
180
+ self.reportCharacterErrors = self.characterErrorsUCS2
181
+
182
+ # List of where new lines occur
183
+ self.newLines = [0]
184
+
185
+ self.charEncoding = (lookupEncoding("utf-8"), "certain")
186
+ self.dataStream = self.openStream(source)
187
+
188
+ self.reset()
189
+
190
+ def reset(self):
191
+ self.chunk = ""
192
+ self.chunkSize = 0
193
+ self.chunkOffset = 0
194
+ self.errors = []
195
+
196
+ # number of (complete) lines in previous chunks
197
+ self.prevNumLines = 0
198
+ # number of columns in the last line of the previous chunk
199
+ self.prevNumCols = 0
200
+
201
+ # Deal with CR LF and surrogates split over chunk boundaries
202
+ self._bufferedCharacter = None
203
+
204
+ def openStream(self, source):
205
+ """Produces a file object from source.
206
+
207
+ source can be either a file object, local filename or a string.
208
+
209
+ """
210
+ # Already a file object
211
+ if hasattr(source, 'read'):
212
+ stream = source
213
+ else:
214
+ stream = StringIO(source)
215
+
216
+ return stream
217
+
218
+ def _position(self, offset):
219
+ chunk = self.chunk
220
+ nLines = chunk.count('\n', 0, offset)
221
+ positionLine = self.prevNumLines + nLines
222
+ lastLinePos = chunk.rfind('\n', 0, offset)
223
+ if lastLinePos == -1:
224
+ positionColumn = self.prevNumCols + offset
225
+ else:
226
+ positionColumn = offset - (lastLinePos + 1)
227
+ return (positionLine, positionColumn)
228
+
229
+ def position(self):
230
+ """Returns (line, col) of the current position in the stream."""
231
+ line, col = self._position(self.chunkOffset)
232
+ return (line + 1, col)
233
+
234
+ def char(self):
235
+ """ Read one character from the stream or queue if available. Return
236
+ EOF when EOF is reached.
237
+ """
238
+ # Read a new chunk from the input stream if necessary
239
+ if self.chunkOffset >= self.chunkSize:
240
+ if not self.readChunk():
241
+ return EOF
242
+
243
+ chunkOffset = self.chunkOffset
244
+ char = self.chunk[chunkOffset]
245
+ self.chunkOffset = chunkOffset + 1
246
+
247
+ return char
248
+
249
+ def readChunk(self, chunkSize=None):
250
+ if chunkSize is None:
251
+ chunkSize = self._defaultChunkSize
252
+
253
+ self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
254
+
255
+ self.chunk = ""
256
+ self.chunkSize = 0
257
+ self.chunkOffset = 0
258
+
259
+ data = self.dataStream.read(chunkSize)
260
+
261
+ # Deal with CR LF and surrogates broken across chunks
262
+ if self._bufferedCharacter:
263
+ data = self._bufferedCharacter + data
264
+ self._bufferedCharacter = None
265
+ elif not data:
266
+ # We have no more data, bye-bye stream
267
+ return False
268
+
269
+ if len(data) > 1:
270
+ lastv = ord(data[-1])
271
+ if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
272
+ self._bufferedCharacter = data[-1]
273
+ data = data[:-1]
274
+
275
+ if self.reportCharacterErrors:
276
+ self.reportCharacterErrors(data)
277
+
278
+ # Replace invalid characters
279
+ data = data.replace("\r\n", "\n")
280
+ data = data.replace("\r", "\n")
281
+
282
+ self.chunk = data
283
+ self.chunkSize = len(data)
284
+
285
+ return True
286
+
287
+ def characterErrorsUCS4(self, data):
288
+ for _ in range(len(invalid_unicode_re.findall(data))):
289
+ self.errors.append("invalid-codepoint")
290
+
291
+ def characterErrorsUCS2(self, data):
292
+ # Someone picked the wrong compile option
293
+ # You lose
294
+ skip = False
295
+ for match in invalid_unicode_re.finditer(data):
296
+ if skip:
297
+ continue
298
+ codepoint = ord(match.group())
299
+ pos = match.start()
300
+ # Pretty sure there should be endianness issues here
301
+ if _utils.isSurrogatePair(data[pos:pos + 2]):
302
+ # We have a surrogate pair!
303
+ char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
304
+ if char_val in non_bmp_invalid_codepoints:
305
+ self.errors.append("invalid-codepoint")
306
+ skip = True
307
+ elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
308
+ pos == len(data) - 1):
309
+ self.errors.append("invalid-codepoint")
310
+ else:
311
+ skip = False
312
+ self.errors.append("invalid-codepoint")
313
+
314
+ def charsUntil(self, characters, opposite=False):
315
+ """ Returns a string of characters from the stream up to but not
316
+ including any character in 'characters' or EOF. 'characters' must be
317
+ a container that supports the 'in' method and iteration over its
318
+ characters.
319
+ """
320
+
321
+ # Use a cache of regexps to find the required characters
322
+ try:
323
+ chars = charsUntilRegEx[(characters, opposite)]
324
+ except KeyError:
325
+ if __debug__:
326
+ for c in characters:
327
+ assert(ord(c) < 128)
328
+ regex = "".join(["\\x%02x" % ord(c) for c in characters])
329
+ if not opposite:
330
+ regex = "^%s" % regex
331
+ chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)
332
+
333
+ rv = []
334
+
335
+ while True:
336
+ # Find the longest matching prefix
337
+ m = chars.match(self.chunk, self.chunkOffset)
338
+ if m is None:
339
+ # If nothing matched, and it wasn't because we ran out of chunk,
340
+ # then stop
341
+ if self.chunkOffset != self.chunkSize:
342
+ break
343
+ else:
344
+ end = m.end()
345
+ # If not the whole chunk matched, return everything
346
+ # up to the part that didn't match
347
+ if end != self.chunkSize:
348
+ rv.append(self.chunk[self.chunkOffset:end])
349
+ self.chunkOffset = end
350
+ break
351
+ # If the whole remainder of the chunk matched,
352
+ # use it all and read the next chunk
353
+ rv.append(self.chunk[self.chunkOffset:])
354
+ if not self.readChunk():
355
+ # Reached EOF
356
+ break
357
+
358
+ r = "".join(rv)
359
+ return r
360
+
361
+ def unget(self, char):
362
+ # Only one character is allowed to be ungotten at once - it must
363
+ # be consumed again before any further call to unget
364
+ if char is not EOF:
365
+ if self.chunkOffset == 0:
366
+ # unget is called quite rarely, so it's a good idea to do
367
+ # more work here if it saves a bit of work in the frequently
368
+ # called char and charsUntil.
369
+ # So, just prepend the ungotten character onto the current
370
+ # chunk:
371
+ self.chunk = char + self.chunk
372
+ self.chunkSize += 1
373
+ else:
374
+ self.chunkOffset -= 1
375
+ assert self.chunk[self.chunkOffset] == char
376
+
377
+
378
+ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
379
+ """Provides a unicode stream of characters to the HTMLTokenizer.
380
+
381
+ This class takes care of character encoding and removing or replacing
382
+ incorrect byte-sequences and also provides column and line tracking.
383
+
384
+ """
385
+
386
+ def __init__(self, source, override_encoding=None, transport_encoding=None,
387
+ same_origin_parent_encoding=None, likely_encoding=None,
388
+ default_encoding="windows-1252", useChardet=True):
389
+ """Initialises the HTMLInputStream.
390
+
391
+ HTMLInputStream(source, [encoding]) -> Normalized stream from source
392
+ for use by html5lib.
393
+
394
+ source can be either a file-object, local filename or a string.
395
+
396
+ The optional encoding parameter must be a string that indicates
397
+ the encoding. If specified, that encoding will be used,
398
+ regardless of any BOM or later declaration (such as in a meta
399
+ element)
400
+
401
+ """
402
+ # Raw Stream - for unicode objects this will encode to utf-8 and set
403
+ # self.charEncoding as appropriate
404
+ self.rawStream = self.openStream(source)
405
+
406
+ HTMLUnicodeInputStream.__init__(self, self.rawStream)
407
+
408
+ # Encoding Information
409
+ # Number of bytes to use when looking for a meta element with
410
+ # encoding information
411
+ self.numBytesMeta = 1024
412
+ # Number of bytes to use when using detecting encoding using chardet
413
+ self.numBytesChardet = 100
414
+ # Things from args
415
+ self.override_encoding = override_encoding
416
+ self.transport_encoding = transport_encoding
417
+ self.same_origin_parent_encoding = same_origin_parent_encoding
418
+ self.likely_encoding = likely_encoding
419
+ self.default_encoding = default_encoding
420
+
421
+ # Determine encoding
422
+ self.charEncoding = self.determineEncoding(useChardet)
423
+ assert self.charEncoding[0] is not None
424
+
425
+ # Call superclass
426
+ self.reset()
427
+
428
+ def reset(self):
429
+ self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
430
+ HTMLUnicodeInputStream.reset(self)
431
+
432
+ def openStream(self, source):
433
+ """Produces a file object from source.
434
+
435
+ source can be either a file object, local filename or a string.
436
+
437
+ """
438
+ # Already a file object
439
+ if hasattr(source, 'read'):
440
+ stream = source
441
+ else:
442
+ stream = BytesIO(source)
443
+
444
+ try:
445
+ stream.seek(stream.tell())
446
+ except Exception:
447
+ stream = BufferedStream(stream)
448
+
449
+ return stream
450
+
451
+ def determineEncoding(self, chardet=True):
452
+ # BOMs take precedence over everything
453
+ # This will also read past the BOM if present
454
+ charEncoding = self.detectBOM(), "certain"
455
+ if charEncoding[0] is not None:
456
+ return charEncoding
457
+
458
+ # If we've been overridden, we've been overridden
459
+ charEncoding = lookupEncoding(self.override_encoding), "certain"
460
+ if charEncoding[0] is not None:
461
+ return charEncoding
462
+
463
+ # Now check the transport layer
464
+ charEncoding = lookupEncoding(self.transport_encoding), "certain"
465
+ if charEncoding[0] is not None:
466
+ return charEncoding
467
+
468
+ # Look for meta elements with encoding information
469
+ charEncoding = self.detectEncodingMeta(), "tentative"
470
+ if charEncoding[0] is not None:
471
+ return charEncoding
472
+
473
+ # Parent document encoding
474
+ charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
475
+ if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
476
+ return charEncoding
477
+
478
+ # "likely" encoding
479
+ charEncoding = lookupEncoding(self.likely_encoding), "tentative"
480
+ if charEncoding[0] is not None:
481
+ return charEncoding
482
+
483
+ # Guess with chardet, if available
484
+ if chardet:
485
+ try:
486
+ from chardet.universaldetector import UniversalDetector
487
+ except ImportError:
488
+ pass
489
+ else:
490
+ buffers = []
491
+ detector = UniversalDetector()
492
+ while not detector.done:
493
+ buffer = self.rawStream.read(self.numBytesChardet)
494
+ assert isinstance(buffer, bytes)
495
+ if not buffer:
496
+ break
497
+ buffers.append(buffer)
498
+ detector.feed(buffer)
499
+ detector.close()
500
+ encoding = lookupEncoding(detector.result['encoding'])
501
+ self.rawStream.seek(0)
502
+ if encoding is not None:
503
+ return encoding, "tentative"
504
+
505
+ # Try the default encoding
506
+ charEncoding = lookupEncoding(self.default_encoding), "tentative"
507
+ if charEncoding[0] is not None:
508
+ return charEncoding
509
+
510
+ # Fallback to html5lib's default if even that hasn't worked
511
+ return lookupEncoding("windows-1252"), "tentative"
512
+
513
+ def changeEncoding(self, newEncoding):
514
+ assert self.charEncoding[1] != "certain"
515
+ newEncoding = lookupEncoding(newEncoding)
516
+ if newEncoding is None:
517
+ return
518
+ if newEncoding.name in ("utf-16be", "utf-16le"):
519
+ newEncoding = lookupEncoding("utf-8")
520
+ assert newEncoding is not None
521
+ elif newEncoding == self.charEncoding[0]:
522
+ self.charEncoding = (self.charEncoding[0], "certain")
523
+ else:
524
+ self.rawStream.seek(0)
525
+ self.charEncoding = (newEncoding, "certain")
526
+ self.reset()
527
+ raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
528
+
529
+ def detectBOM(self):
530
+ """Attempts to detect at BOM at the start of the stream. If
531
+ an encoding can be determined from the BOM return the name of the
532
+ encoding otherwise return None"""
533
+ bomDict = {
534
+ codecs.BOM_UTF8: 'utf-8',
535
+ codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
536
+ codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
537
+ }
538
+
539
+ # Go to beginning of file and read in 4 bytes
540
+ string = self.rawStream.read(4)
541
+ assert isinstance(string, bytes)
542
+
543
+ # Try detecting the BOM using bytes from the string
544
+ encoding = bomDict.get(string[:3]) # UTF-8
545
+ seek = 3
546
+ if not encoding:
547
+ # Need to detect UTF-32 before UTF-16
548
+ encoding = bomDict.get(string) # UTF-32
549
+ seek = 4
550
+ if not encoding:
551
+ encoding = bomDict.get(string[:2]) # UTF-16
552
+ seek = 2
553
+
554
+ # Set the read position past the BOM if one was found, otherwise
555
+ # set it to the start of the stream
556
+ if encoding:
557
+ self.rawStream.seek(seek)
558
+ return lookupEncoding(encoding)
559
+ else:
560
+ self.rawStream.seek(0)
561
+ return None
562
+
563
+ def detectEncodingMeta(self):
564
+ """Report the encoding declared by the meta element
565
+ """
566
+ buffer = self.rawStream.read(self.numBytesMeta)
567
+ assert isinstance(buffer, bytes)
568
+ parser = EncodingParser(buffer)
569
+ self.rawStream.seek(0)
570
+ encoding = parser.getEncoding()
571
+
572
+ if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
573
+ encoding = lookupEncoding("utf-8")
574
+
575
+ return encoding
576
+
577
+
578
+ class EncodingBytes(bytes):
579
+ """String-like object with an associated position and various extra methods
580
+ If the position is ever greater than the string length then an exception is
581
+ raised"""
582
+ def __new__(self, value):
583
+ assert isinstance(value, bytes)
584
+ return bytes.__new__(self, value.lower())
585
+
586
+ def __init__(self, value):
587
+ # pylint:disable=unused-argument
588
+ self._position = -1
589
+
590
+ def __iter__(self):
591
+ return self
592
+
593
+ def __next__(self):
594
+ p = self._position = self._position + 1
595
+ if p >= len(self):
596
+ raise StopIteration
597
+ elif p < 0:
598
+ raise TypeError
599
+ return self[p:p + 1]
600
+
601
+ def next(self):
602
+ # Py2 compat
603
+ return self.__next__()
604
+
605
+ def previous(self):
606
+ p = self._position
607
+ if p >= len(self):
608
+ raise StopIteration
609
+ elif p < 0:
610
+ raise TypeError
611
+ self._position = p = p - 1
612
+ return self[p:p + 1]
613
+
614
+ def setPosition(self, position):
615
+ if self._position >= len(self):
616
+ raise StopIteration
617
+ self._position = position
618
+
619
+ def getPosition(self):
620
+ if self._position >= len(self):
621
+ raise StopIteration
622
+ if self._position >= 0:
623
+ return self._position
624
+ else:
625
+ return None
626
+
627
+ position = property(getPosition, setPosition)
628
+
629
+ def getCurrentByte(self):
630
+ return self[self.position:self.position + 1]
631
+
632
+ currentByte = property(getCurrentByte)
633
+
634
+ def skip(self, chars=spaceCharactersBytes):
635
+ """Skip past a list of characters"""
636
+ p = self.position # use property for the error-checking
637
+ while p < len(self):
638
+ c = self[p:p + 1]
639
+ if c not in chars:
640
+ self._position = p
641
+ return c
642
+ p += 1
643
+ self._position = p
644
+ return None
645
+
646
+ def skipUntil(self, chars):
647
+ p = self.position
648
+ while p < len(self):
649
+ c = self[p:p + 1]
650
+ if c in chars:
651
+ self._position = p
652
+ return c
653
+ p += 1
654
+ self._position = p
655
+ return None
656
+
657
+ def matchBytes(self, bytes):
658
+ """Look for a sequence of bytes at the start of a string. If the bytes
659
+ are found return True and advance the position to the byte after the
660
+ match. Otherwise return False and leave the position alone"""
661
+ rv = self.startswith(bytes, self.position)
662
+ if rv:
663
+ self.position += len(bytes)
664
+ return rv
665
+
666
+ def jumpTo(self, bytes):
667
+ """Look for the next sequence of bytes matching a given sequence. If
668
+ a match is found advance the position to the last byte of the match"""
669
+ try:
670
+ self._position = self.index(bytes, self.position) + len(bytes) - 1
671
+ except ValueError:
672
+ raise StopIteration
673
+ return True
674
+
675
+
676
+ class EncodingParser(object):
677
+ """Mini parser for detecting character encoding from meta elements"""
678
+
679
+ def __init__(self, data):
680
+ """string - the data to work on for encoding detection"""
681
+ self.data = EncodingBytes(data)
682
+ self.encoding = None
683
+
684
+ def getEncoding(self):
685
+ if b"<meta" not in self.data:
686
+ return None
687
+
688
+ methodDispatch = (
689
+ (b"<!--", self.handleComment),
690
+ (b"<meta", self.handleMeta),
691
+ (b"</", self.handlePossibleEndTag),
692
+ (b"<!", self.handleOther),
693
+ (b"<?", self.handleOther),
694
+ (b"<", self.handlePossibleStartTag))
695
+ for _ in self.data:
696
+ keepParsing = True
697
+ try:
698
+ self.data.jumpTo(b"<")
699
+ except StopIteration:
700
+ break
701
+ for key, method in methodDispatch:
702
+ if self.data.matchBytes(key):
703
+ try:
704
+ keepParsing = method()
705
+ break
706
+ except StopIteration:
707
+ keepParsing = False
708
+ break
709
+ if not keepParsing:
710
+ break
711
+
712
+ return self.encoding
713
+
714
+ def handleComment(self):
715
+ """Skip over comments"""
716
+ return self.data.jumpTo(b"-->")
717
+
718
+ def handleMeta(self):
719
+ if self.data.currentByte not in spaceCharactersBytes:
720
+ # if we have <meta not followed by a space so just keep going
721
+ return True
722
+ # We have a valid meta element we want to search for attributes
723
+ hasPragma = False
724
+ pendingEncoding = None
725
+ while True:
726
+ # Try to find the next attribute after the current position
727
+ attr = self.getAttribute()
728
+ if attr is None:
729
+ return True
730
+ else:
731
+ if attr[0] == b"http-equiv":
732
+ hasPragma = attr[1] == b"content-type"
733
+ if hasPragma and pendingEncoding is not None:
734
+ self.encoding = pendingEncoding
735
+ return False
736
+ elif attr[0] == b"charset":
737
+ tentativeEncoding = attr[1]
738
+ codec = lookupEncoding(tentativeEncoding)
739
+ if codec is not None:
740
+ self.encoding = codec
741
+ return False
742
+ elif attr[0] == b"content":
743
+ contentParser = ContentAttrParser(EncodingBytes(attr[1]))
744
+ tentativeEncoding = contentParser.parse()
745
+ if tentativeEncoding is not None:
746
+ codec = lookupEncoding(tentativeEncoding)
747
+ if codec is not None:
748
+ if hasPragma:
749
+ self.encoding = codec
750
+ return False
751
+ else:
752
+ pendingEncoding = codec
753
+
754
+ def handlePossibleStartTag(self):
755
+ return self.handlePossibleTag(False)
756
+
757
+ def handlePossibleEndTag(self):
758
+ next(self.data)
759
+ return self.handlePossibleTag(True)
760
+
761
+ def handlePossibleTag(self, endTag):
762
+ data = self.data
763
+ if data.currentByte not in asciiLettersBytes:
764
+ # If the next byte is not an ascii letter either ignore this
765
+ # fragment (possible start tag case) or treat it according to
766
+ # handleOther
767
+ if endTag:
768
+ data.previous()
769
+ self.handleOther()
770
+ return True
771
+
772
+ c = data.skipUntil(spacesAngleBrackets)
773
+ if c == b"<":
774
+ # return to the first step in the overall "two step" algorithm
775
+ # reprocessing the < byte
776
+ data.previous()
777
+ else:
778
+ # Read all attributes
779
+ attr = self.getAttribute()
780
+ while attr is not None:
781
+ attr = self.getAttribute()
782
+ return True
783
+
784
+ def handleOther(self):
785
+ return self.data.jumpTo(b">")
786
+
787
+ def getAttribute(self):
788
+ """Return a name,value pair for the next attribute in the stream,
789
+ if one is found, or None"""
790
+ data = self.data
791
+ # Step 1 (skip chars)
792
+ c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
793
+ assert c is None or len(c) == 1
794
+ # Step 2
795
+ if c in (b">", None):
796
+ return None
797
+ # Step 3
798
+ attrName = []
799
+ attrValue = []
800
+ # Step 4 attribute name
801
+ while True:
802
+ if c == b"=" and attrName:
803
+ break
804
+ elif c in spaceCharactersBytes:
805
+ # Step 6!
806
+ c = data.skip()
807
+ break
808
+ elif c in (b"/", b">"):
809
+ return b"".join(attrName), b""
810
+ elif c in asciiUppercaseBytes:
811
+ attrName.append(c.lower())
812
+ elif c is None:
813
+ return None
814
+ else:
815
+ attrName.append(c)
816
+ # Step 5
817
+ c = next(data)
818
+ # Step 7
819
+ if c != b"=":
820
+ data.previous()
821
+ return b"".join(attrName), b""
822
+ # Step 8
823
+ next(data)
824
+ # Step 9
825
+ c = data.skip()
826
+ # Step 10
827
+ if c in (b"'", b'"'):
828
+ # 10.1
829
+ quoteChar = c
830
+ while True:
831
+ # 10.2
832
+ c = next(data)
833
+ # 10.3
834
+ if c == quoteChar:
835
+ next(data)
836
+ return b"".join(attrName), b"".join(attrValue)
837
+ # 10.4
838
+ elif c in asciiUppercaseBytes:
839
+ attrValue.append(c.lower())
840
+ # 10.5
841
+ else:
842
+ attrValue.append(c)
843
+ elif c == b">":
844
+ return b"".join(attrName), b""
845
+ elif c in asciiUppercaseBytes:
846
+ attrValue.append(c.lower())
847
+ elif c is None:
848
+ return None
849
+ else:
850
+ attrValue.append(c)
851
+ # Step 11
852
+ while True:
853
+ c = next(data)
854
+ if c in spacesAngleBrackets:
855
+ return b"".join(attrName), b"".join(attrValue)
856
+ elif c in asciiUppercaseBytes:
857
+ attrValue.append(c.lower())
858
+ elif c is None:
859
+ return None
860
+ else:
861
+ attrValue.append(c)
862
+
863
+
864
+ class ContentAttrParser(object):
865
+ def __init__(self, data):
866
+ assert isinstance(data, bytes)
867
+ self.data = data
868
+
869
+ def parse(self):
870
+ try:
871
+ # Check if the attr name is charset
872
+ # otherwise return
873
+ self.data.jumpTo(b"charset")
874
+ self.data.position += 1
875
+ self.data.skip()
876
+ if not self.data.currentByte == b"=":
877
+ # If there is no = sign keep looking for attrs
878
+ return None
879
+ self.data.position += 1
880
+ self.data.skip()
881
+ # Look for an encoding between matching quote marks
882
+ if self.data.currentByte in (b'"', b"'"):
883
+ quoteMark = self.data.currentByte
884
+ self.data.position += 1
885
+ oldPosition = self.data.position
886
+ if self.data.jumpTo(quoteMark):
887
+ return self.data[oldPosition:self.data.position]
888
+ else:
889
+ return None
890
+ else:
891
+ # Unquoted value
892
+ oldPosition = self.data.position
893
+ try:
894
+ self.data.skipUntil(spaceCharactersBytes)
895
+ return self.data[oldPosition:self.data.position]
896
+ except StopIteration:
897
+ # Return the whole remaining value
898
+ return self.data[oldPosition:]
899
+ except StopIteration:
900
+ return None
901
+
902
+
903
+ def lookupEncoding(encoding):
904
+ """Return the python codec name corresponding to an encoding or None if the
905
+ string doesn't correspond to a valid encoding."""
906
+ if isinstance(encoding, bytes):
907
+ try:
908
+ encoding = encoding.decode("ascii")
909
+ except UnicodeDecodeError:
910
+ return None
911
+
912
+ if encoding is not None:
913
+ try:
914
+ return webencodings.lookup(encoding)
915
+ except AttributeError:
916
+ return None
917
+ else:
918
+ return None
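Editorial note, not part of the uploaded file: _inputstream.py wraps a byte or text source, sniffs the encoding (BOM, transport/override hints, a <meta> pre-scan, optional chardet, then a windows-1252 fallback) and exposes char()/charsUntil() over the decoded text. A minimal sketch of the byte path, assuming the vendored import path tensorboard._vendor.html5lib._inputstream and that the vendored webencodings package is importable:

from tensorboard._vendor.html5lib._inputstream import HTMLInputStream

raw = b'<meta charset="windows-1252"><p>caf\xe9</p>'
stream = HTMLInputStream(raw, useChardet=False)  # bytes in -> HTMLBinaryInputStream
print(stream.charEncoding)     # roughly (<windows-1252 codec>, 'tentative'), from the <meta> pre-scan
print(stream.charsUntil(">"))  # '<meta charset="windows-1252"' - stops before ">"
print(stream.char())           # '>'

Because the charset came from a meta pre-scan rather than a BOM or an explicit override, its confidence is "tentative", which is what later allows changeEncoding() to raise a reparse exception.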
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_tokenizer.py ADDED
@@ -0,0 +1,1735 @@
1
+ from __future__ import absolute_import, division, unicode_literals
2
+
3
+ from six import unichr as chr
4
+
5
+ from collections import deque, OrderedDict
6
+ from sys import version_info
7
+
8
+ from .constants import spaceCharacters
9
+ from .constants import entities
10
+ from .constants import asciiLetters, asciiUpper2Lower
11
+ from .constants import digits, hexDigits, EOF
12
+ from .constants import tokenTypes, tagTokenTypes
13
+ from .constants import replacementCharacters
14
+
15
+ from ._inputstream import HTMLInputStream
16
+
17
+ from ._trie import Trie
18
+
19
+ entitiesTrie = Trie(entities)
20
+
21
+ if version_info >= (3, 7):
22
+ attributeMap = dict
23
+ else:
24
+ attributeMap = OrderedDict
25
+
26
+
27
+ class HTMLTokenizer(object):
28
+ """ This class takes care of tokenizing HTML.
29
+
30
+ * self.currentToken
31
+ Holds the token that is currently being processed.
32
+
33
+ * self.state
34
+ Holds a reference to the method to be invoked... XXX
35
+
36
+ * self.stream
37
+ Points to HTMLInputStream object.
38
+ """
39
+
40
+ def __init__(self, stream, parser=None, **kwargs):
41
+
42
+ self.stream = HTMLInputStream(stream, **kwargs)
43
+ self.parser = parser
44
+
45
+ # Setup the initial tokenizer state
46
+ self.escapeFlag = False
47
+ self.lastFourChars = []
48
+ self.state = self.dataState
49
+ self.escape = False
50
+
51
+ # The current token being created
52
+ self.currentToken = None
53
+ super(HTMLTokenizer, self).__init__()
54
+
55
+ def __iter__(self):
56
+ """ This is where the magic happens.
57
+
58
+ We do our usually processing through the states and when we have a token
59
+ to return we yield the token which pauses processing until the next token
60
+ is requested.
61
+ """
62
+ self.tokenQueue = deque([])
63
+ # Start processing. When EOF is reached self.state will return False
64
+ # instead of True and the loop will terminate.
65
+ while self.state():
66
+ while self.stream.errors:
67
+ yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
68
+ while self.tokenQueue:
69
+ yield self.tokenQueue.popleft()
70
+
71
+ def consumeNumberEntity(self, isHex):
72
+ """This function returns either U+FFFD or the character based on the
73
+ decimal or hexadecimal representation. It also discards ";" if present.
74
+ If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked.
75
+ """
76
+
77
+ allowed = digits
78
+ radix = 10
79
+ if isHex:
80
+ allowed = hexDigits
81
+ radix = 16
82
+
83
+ charStack = []
84
+
85
+ # Consume all the characters that are in range while making sure we
86
+ # don't hit an EOF.
87
+ c = self.stream.char()
88
+ while c in allowed and c is not EOF:
89
+ charStack.append(c)
90
+ c = self.stream.char()
91
+
92
+ # Convert the set of characters consumed to an int.
93
+ charAsInt = int("".join(charStack), radix)
94
+
95
+ # Certain characters get replaced with others
96
+ if charAsInt in replacementCharacters:
97
+ char = replacementCharacters[charAsInt]
98
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
99
+ "illegal-codepoint-for-numeric-entity",
100
+ "datavars": {"charAsInt": charAsInt}})
101
+ elif ((0xD800 <= charAsInt <= 0xDFFF) or
102
+ (charAsInt > 0x10FFFF)):
103
+ char = "\uFFFD"
104
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
105
+ "illegal-codepoint-for-numeric-entity",
106
+ "datavars": {"charAsInt": charAsInt}})
107
+ else:
108
+ # Should speed up this check somehow (e.g. move the set to a constant)
109
+ if ((0x0001 <= charAsInt <= 0x0008) or
110
+ (0x000E <= charAsInt <= 0x001F) or
111
+ (0x007F <= charAsInt <= 0x009F) or
112
+ (0xFDD0 <= charAsInt <= 0xFDEF) or
113
+ charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
114
+ 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
115
+ 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
116
+ 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
117
+ 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
118
+ 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
119
+ 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
120
+ 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
121
+ 0xFFFFF, 0x10FFFE, 0x10FFFF])):
122
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
123
+ "data":
124
+ "illegal-codepoint-for-numeric-entity",
125
+ "datavars": {"charAsInt": charAsInt}})
126
+ try:
127
+ # Try/except needed as UCS-2 Python builds' unichar only works
128
+ # within the BMP.
129
+ char = chr(charAsInt)
130
+ except ValueError:
131
+ v = charAsInt - 0x10000
132
+ char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))
133
+
134
+ # Discard the ; if present. Otherwise, put it back on the queue and
135
+ # invoke parseError on parser.
136
+ if c != ";":
137
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
138
+ "numeric-entity-without-semicolon"})
139
+ self.stream.unget(c)
140
+
141
+ return char
142
+
143
+ def consumeEntity(self, allowedChar=None, fromAttribute=False):
144
+ # Initialise to the default output for when no entity is matched
145
+ output = "&"
146
+
147
+ charStack = [self.stream.char()]
148
+ if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
149
+ (allowedChar is not None and allowedChar == charStack[0])):
150
+ self.stream.unget(charStack[0])
151
+
152
+ elif charStack[0] == "#":
153
+ # Read the next character to see if it's hex or decimal
154
+ hex = False
155
+ charStack.append(self.stream.char())
156
+ if charStack[-1] in ("x", "X"):
157
+ hex = True
158
+ charStack.append(self.stream.char())
159
+
160
+ # charStack[-1] should be the first digit
161
+ if (hex and charStack[-1] in hexDigits) \
162
+ or (not hex and charStack[-1] in digits):
163
+ # At least one digit found, so consume the whole number
164
+ self.stream.unget(charStack[-1])
165
+ output = self.consumeNumberEntity(hex)
166
+ else:
167
+ # No digits found
168
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
169
+ "data": "expected-numeric-entity"})
170
+ self.stream.unget(charStack.pop())
171
+ output = "&" + "".join(charStack)
172
+
173
+ else:
174
+ # At this point in the process might have named entity. Entities
175
+ # are stored in the global variable "entities".
176
+ #
177
+ # Consume characters and compare to these to a substring of the
178
+ # entity names in the list until the substring no longer matches.
179
+ while (charStack[-1] is not EOF):
180
+ if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
181
+ break
182
+ charStack.append(self.stream.char())
183
+
184
+ # At this point we have a string that starts with some characters
185
+ # that may match an entity
186
+ # Try to find the longest entity the string will match to take care
187
+ # of &noti for instance.
188
+ try:
189
+ entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
190
+ entityLength = len(entityName)
191
+ except KeyError:
192
+ entityName = None
193
+
194
+ if entityName is not None:
195
+ if entityName[-1] != ";":
196
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
197
+ "named-entity-without-semicolon"})
198
+ if (entityName[-1] != ";" and fromAttribute and
199
+ (charStack[entityLength] in asciiLetters or
200
+ charStack[entityLength] in digits or
201
+ charStack[entityLength] == "=")):
202
+ self.stream.unget(charStack.pop())
203
+ output = "&" + "".join(charStack)
204
+ else:
205
+ output = entities[entityName]
206
+ self.stream.unget(charStack.pop())
207
+ output += "".join(charStack[entityLength:])
208
+ else:
209
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
210
+ "expected-named-entity"})
211
+ self.stream.unget(charStack.pop())
212
+ output = "&" + "".join(charStack)
213
+
214
+ if fromAttribute:
215
+ self.currentToken["data"][-1][1] += output
216
+ else:
217
+ if output in spaceCharacters:
218
+ tokenType = "SpaceCharacters"
219
+ else:
220
+ tokenType = "Characters"
221
+ self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
222
+
223
+ def processEntityInAttribute(self, allowedChar):
224
+ """This method replaces the need for "entityInAttributeValueState".
225
+ """
226
+ self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
227
+
228
+ def emitCurrentToken(self):
229
+ """This method is a generic handler for emitting the tags. It also sets
230
+ the state to "data" because that's what's needed after a token has been
231
+ emitted.
232
+ """
233
+ token = self.currentToken
234
+ # Add token to the queue to be yielded
235
+ if (token["type"] in tagTokenTypes):
236
+ token["name"] = token["name"].translate(asciiUpper2Lower)
237
+ if token["type"] == tokenTypes["StartTag"]:
238
+ raw = token["data"]
239
+ data = attributeMap(raw)
240
+ if len(raw) > len(data):
241
+ # we had some duplicated attribute, fix so first wins
242
+ data.update(raw[::-1])
243
+ token["data"] = data
244
+
245
+ if token["type"] == tokenTypes["EndTag"]:
246
+ if token["data"]:
247
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
248
+ "data": "attributes-in-end-tag"})
249
+ if token["selfClosing"]:
250
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
251
+ "data": "self-closing-flag-on-end-tag"})
252
+ self.tokenQueue.append(token)
253
+ self.state = self.dataState
254
+
255
+ # Below are the various tokenizer states worked out.
256
+ def dataState(self):
257
+ data = self.stream.char()
258
+ if data == "&":
259
+ self.state = self.entityDataState
260
+ elif data == "<":
261
+ self.state = self.tagOpenState
262
+ elif data == "\u0000":
263
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
264
+ "data": "invalid-codepoint"})
265
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
266
+ "data": "\u0000"})
267
+ elif data is EOF:
268
+ # Tokenization ends.
269
+ return False
270
+ elif data in spaceCharacters:
271
+ # Directly after emitting a token you switch back to the "data
272
+ # state". At that point spaceCharacters are important so they are
273
+ # emitted separately.
274
+ self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
275
+ data + self.stream.charsUntil(spaceCharacters, True)})
276
+ # No need to update lastFourChars here, since the first space will
277
+ # have already been appended to lastFourChars and will have broken
278
+ # any <!-- or --> sequences
279
+ else:
280
+ chars = self.stream.charsUntil(("&", "<", "\u0000"))
281
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
282
+ data + chars})
283
+ return True
284
+
285
+ def entityDataState(self):
286
+ self.consumeEntity()
287
+ self.state = self.dataState
288
+ return True
289
+
290
+ def rcdataState(self):
291
+ data = self.stream.char()
292
+ if data == "&":
293
+ self.state = self.characterReferenceInRcdata
294
+ elif data == "<":
295
+ self.state = self.rcdataLessThanSignState
296
+ elif data == EOF:
297
+ # Tokenization ends.
298
+ return False
299
+ elif data == "\u0000":
300
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
301
+ "data": "invalid-codepoint"})
302
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
303
+ "data": "\uFFFD"})
304
+ elif data in spaceCharacters:
305
+ # Directly after emitting a token you switch back to the "data
306
+ # state". At that point spaceCharacters are important so they are
307
+ # emitted separately.
308
+ self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
309
+ data + self.stream.charsUntil(spaceCharacters, True)})
310
+ # No need to update lastFourChars here, since the first space will
311
+ # have already been appended to lastFourChars and will have broken
312
+ # any <!-- or --> sequences
313
+ else:
314
+ chars = self.stream.charsUntil(("&", "<", "\u0000"))
315
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
316
+ data + chars})
317
+ return True
318
+
319
+ def characterReferenceInRcdata(self):
320
+ self.consumeEntity()
321
+ self.state = self.rcdataState
322
+ return True
323
+
324
+ def rawtextState(self):
325
+ data = self.stream.char()
326
+ if data == "<":
327
+ self.state = self.rawtextLessThanSignState
328
+ elif data == "\u0000":
329
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
330
+ "data": "invalid-codepoint"})
331
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
332
+ "data": "\uFFFD"})
333
+ elif data == EOF:
334
+ # Tokenization ends.
335
+ return False
336
+ else:
337
+ chars = self.stream.charsUntil(("<", "\u0000"))
338
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
339
+ data + chars})
340
+ return True
341
+
342
+ def scriptDataState(self):
343
+ data = self.stream.char()
344
+ if data == "<":
345
+ self.state = self.scriptDataLessThanSignState
346
+ elif data == "\u0000":
347
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
348
+ "data": "invalid-codepoint"})
349
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
350
+ "data": "\uFFFD"})
351
+ elif data == EOF:
352
+ # Tokenization ends.
353
+ return False
354
+ else:
355
+ chars = self.stream.charsUntil(("<", "\u0000"))
356
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
357
+ data + chars})
358
+ return True
359
+
360
+ def plaintextState(self):
361
+ data = self.stream.char()
362
+ if data == EOF:
363
+ # Tokenization ends.
364
+ return False
365
+ elif data == "\u0000":
366
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
367
+ "data": "invalid-codepoint"})
368
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
369
+ "data": "\uFFFD"})
370
+ else:
371
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
372
+ data + self.stream.charsUntil("\u0000")})
373
+ return True
374
+
375
+ def tagOpenState(self):
376
+ data = self.stream.char()
377
+ if data == "!":
378
+ self.state = self.markupDeclarationOpenState
379
+ elif data == "/":
380
+ self.state = self.closeTagOpenState
381
+ elif data in asciiLetters:
382
+ self.currentToken = {"type": tokenTypes["StartTag"],
383
+ "name": data, "data": [],
384
+ "selfClosing": False,
385
+ "selfClosingAcknowledged": False}
386
+ self.state = self.tagNameState
387
+ elif data == ">":
388
+ # XXX In theory it could be something besides a tag name. But
389
+ # do we really care?
390
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
391
+ "expected-tag-name-but-got-right-bracket"})
392
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
393
+ self.state = self.dataState
394
+ elif data == "?":
395
+ # XXX In theory it could be something besides a tag name. But
396
+ # do we really care?
397
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
398
+ "expected-tag-name-but-got-question-mark"})
399
+ self.stream.unget(data)
400
+ self.state = self.bogusCommentState
401
+ else:
402
+ # XXX
403
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
404
+ "expected-tag-name"})
405
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
406
+ self.stream.unget(data)
407
+ self.state = self.dataState
408
+ return True
409
+
410
+ def closeTagOpenState(self):
411
+ data = self.stream.char()
412
+ if data in asciiLetters:
413
+ self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
414
+ "data": [], "selfClosing": False}
415
+ self.state = self.tagNameState
416
+ elif data == ">":
417
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
418
+ "expected-closing-tag-but-got-right-bracket"})
419
+ self.state = self.dataState
420
+ elif data is EOF:
421
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
422
+ "expected-closing-tag-but-got-eof"})
423
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
424
+ self.state = self.dataState
425
+ else:
426
+ # XXX data can be _'_...
427
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
428
+ "expected-closing-tag-but-got-char",
429
+ "datavars": {"data": data}})
430
+ self.stream.unget(data)
431
+ self.state = self.bogusCommentState
432
+ return True
433
+
434
+ def tagNameState(self):
435
+ data = self.stream.char()
436
+ if data in spaceCharacters:
437
+ self.state = self.beforeAttributeNameState
438
+ elif data == ">":
439
+ self.emitCurrentToken()
440
+ elif data is EOF:
441
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
442
+ "eof-in-tag-name"})
443
+ self.state = self.dataState
444
+ elif data == "/":
445
+ self.state = self.selfClosingStartTagState
446
+ elif data == "\u0000":
447
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
448
+ "data": "invalid-codepoint"})
449
+ self.currentToken["name"] += "\uFFFD"
450
+ else:
451
+ self.currentToken["name"] += data
452
+ # (Don't use charsUntil here, because tag names are
453
+ # very short and it's faster to not do anything fancy)
454
+ return True
455
+
456
+ def rcdataLessThanSignState(self):
457
+ data = self.stream.char()
458
+ if data == "/":
459
+ self.temporaryBuffer = ""
460
+ self.state = self.rcdataEndTagOpenState
461
+ else:
462
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
463
+ self.stream.unget(data)
464
+ self.state = self.rcdataState
465
+ return True
466
+
467
+ def rcdataEndTagOpenState(self):
468
+ data = self.stream.char()
469
+ if data in asciiLetters:
470
+ self.temporaryBuffer += data
471
+ self.state = self.rcdataEndTagNameState
472
+ else:
473
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
474
+ self.stream.unget(data)
475
+ self.state = self.rcdataState
476
+ return True
477
+
478
+ def rcdataEndTagNameState(self):
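+ # "appropriate" mirrors the spec's "appropriate end tag token" check: the buffered name must match the most recent start tag.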
479
+ appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
480
+ data = self.stream.char()
481
+ if data in spaceCharacters and appropriate:
482
+ self.currentToken = {"type": tokenTypes["EndTag"],
483
+ "name": self.temporaryBuffer,
484
+ "data": [], "selfClosing": False}
485
+ self.state = self.beforeAttributeNameState
486
+ elif data == "/" and appropriate:
487
+ self.currentToken = {"type": tokenTypes["EndTag"],
488
+ "name": self.temporaryBuffer,
489
+ "data": [], "selfClosing": False}
490
+ self.state = self.selfClosingStartTagState
491
+ elif data == ">" and appropriate:
492
+ self.currentToken = {"type": tokenTypes["EndTag"],
493
+ "name": self.temporaryBuffer,
494
+ "data": [], "selfClosing": False}
495
+ self.emitCurrentToken()
496
+ self.state = self.dataState
497
+ elif data in asciiLetters:
498
+ self.temporaryBuffer += data
499
+ else:
500
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
501
+ "data": "</" + self.temporaryBuffer})
502
+ self.stream.unget(data)
503
+ self.state = self.rcdataState
504
+ return True
505
+
506
+ def rawtextLessThanSignState(self):
507
+ data = self.stream.char()
508
+ if data == "/":
509
+ self.temporaryBuffer = ""
510
+ self.state = self.rawtextEndTagOpenState
511
+ else:
512
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
513
+ self.stream.unget(data)
514
+ self.state = self.rawtextState
515
+ return True
516
+
517
+ def rawtextEndTagOpenState(self):
518
+ data = self.stream.char()
519
+ if data in asciiLetters:
520
+ self.temporaryBuffer += data
521
+ self.state = self.rawtextEndTagNameState
522
+ else:
523
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
524
+ self.stream.unget(data)
525
+ self.state = self.rawtextState
526
+ return True
527
+
528
+ def rawtextEndTagNameState(self):
529
+ appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
530
+ data = self.stream.char()
531
+ if data in spaceCharacters and appropriate:
532
+ self.currentToken = {"type": tokenTypes["EndTag"],
533
+ "name": self.temporaryBuffer,
534
+ "data": [], "selfClosing": False}
535
+ self.state = self.beforeAttributeNameState
536
+ elif data == "/" and appropriate:
537
+ self.currentToken = {"type": tokenTypes["EndTag"],
538
+ "name": self.temporaryBuffer,
539
+ "data": [], "selfClosing": False}
540
+ self.state = self.selfClosingStartTagState
541
+ elif data == ">" and appropriate:
542
+ self.currentToken = {"type": tokenTypes["EndTag"],
543
+ "name": self.temporaryBuffer,
544
+ "data": [], "selfClosing": False}
545
+ self.emitCurrentToken()
546
+ self.state = self.dataState
547
+ elif data in asciiLetters:
548
+ self.temporaryBuffer += data
549
+ else:
550
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
551
+ "data": "</" + self.temporaryBuffer})
552
+ self.stream.unget(data)
553
+ self.state = self.rawtextState
554
+ return True
555
+
556
+ def scriptDataLessThanSignState(self):
557
+ data = self.stream.char()
558
+ if data == "/":
559
+ self.temporaryBuffer = ""
560
+ self.state = self.scriptDataEndTagOpenState
561
+ elif data == "!":
562
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
563
+ self.state = self.scriptDataEscapeStartState
564
+ else:
565
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
566
+ self.stream.unget(data)
567
+ self.state = self.scriptDataState
568
+ return True
569
+
570
+ def scriptDataEndTagOpenState(self):
571
+ data = self.stream.char()
572
+ if data in asciiLetters:
573
+ self.temporaryBuffer += data
574
+ self.state = self.scriptDataEndTagNameState
575
+ else:
576
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
577
+ self.stream.unget(data)
578
+ self.state = self.scriptDataState
579
+ return True
580
+
581
+ def scriptDataEndTagNameState(self):
582
+ appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
583
+ data = self.stream.char()
584
+ if data in spaceCharacters and appropriate:
585
+ self.currentToken = {"type": tokenTypes["EndTag"],
586
+ "name": self.temporaryBuffer,
587
+ "data": [], "selfClosing": False}
588
+ self.state = self.beforeAttributeNameState
589
+ elif data == "/" and appropriate:
590
+ self.currentToken = {"type": tokenTypes["EndTag"],
591
+ "name": self.temporaryBuffer,
592
+ "data": [], "selfClosing": False}
593
+ self.state = self.selfClosingStartTagState
594
+ elif data == ">" and appropriate:
595
+ self.currentToken = {"type": tokenTypes["EndTag"],
596
+ "name": self.temporaryBuffer,
597
+ "data": [], "selfClosing": False}
598
+ self.emitCurrentToken()
599
+ self.state = self.dataState
600
+ elif data in asciiLetters:
601
+ self.temporaryBuffer += data
602
+ else:
603
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
604
+ "data": "</" + self.temporaryBuffer})
605
+ self.stream.unget(data)
606
+ self.state = self.scriptDataState
607
+ return True
608
+
609
+ def scriptDataEscapeStartState(self):
610
+ data = self.stream.char()
611
+ if data == "-":
612
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
613
+ self.state = self.scriptDataEscapeStartDashState
614
+ else:
615
+ self.stream.unget(data)
616
+ self.state = self.scriptDataState
617
+ return True
618
+
619
+ def scriptDataEscapeStartDashState(self):
620
+ data = self.stream.char()
621
+ if data == "-":
622
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
623
+ self.state = self.scriptDataEscapedDashDashState
624
+ else:
625
+ self.stream.unget(data)
626
+ self.state = self.scriptDataState
627
+ return True
628
+
629
+ def scriptDataEscapedState(self):
630
+ data = self.stream.char()
631
+ if data == "-":
632
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
633
+ self.state = self.scriptDataEscapedDashState
634
+ elif data == "<":
635
+ self.state = self.scriptDataEscapedLessThanSignState
636
+ elif data == "\u0000":
637
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
638
+ "data": "invalid-codepoint"})
639
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
640
+ "data": "\uFFFD"})
641
+ elif data == EOF:
642
+ self.state = self.dataState
643
+ else:
644
+ chars = self.stream.charsUntil(("<", "-", "\u0000"))
645
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
646
+ data + chars})
647
+ return True
648
+
649
+ def scriptDataEscapedDashState(self):
650
+ data = self.stream.char()
651
+ if data == "-":
652
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
653
+ self.state = self.scriptDataEscapedDashDashState
654
+ elif data == "<":
655
+ self.state = self.scriptDataEscapedLessThanSignState
656
+ elif data == "\u0000":
657
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
658
+ "data": "invalid-codepoint"})
659
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
660
+ "data": "\uFFFD"})
661
+ self.state = self.scriptDataEscapedState
662
+ elif data == EOF:
663
+ self.state = self.dataState
664
+ else:
665
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
666
+ self.state = self.scriptDataEscapedState
667
+ return True
668
+
669
+ def scriptDataEscapedDashDashState(self):
670
+ data = self.stream.char()
671
+ if data == "-":
672
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
673
+ elif data == "<":
674
+ self.state = self.scriptDataEscapedLessThanSignState
675
+ elif data == ">":
676
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
677
+ self.state = self.scriptDataState
678
+ elif data == "\u0000":
679
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
680
+ "data": "invalid-codepoint"})
681
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
682
+ "data": "\uFFFD"})
683
+ self.state = self.scriptDataEscapedState
684
+ elif data == EOF:
685
+ self.state = self.dataState
686
+ else:
687
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
688
+ self.state = self.scriptDataEscapedState
689
+ return True
690
+
691
+ def scriptDataEscapedLessThanSignState(self):
692
+ data = self.stream.char()
693
+ if data == "/":
694
+ self.temporaryBuffer = ""
695
+ self.state = self.scriptDataEscapedEndTagOpenState
696
+ elif data in asciiLetters:
697
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
698
+ self.temporaryBuffer = data
699
+ self.state = self.scriptDataDoubleEscapeStartState
700
+ else:
701
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
702
+ self.stream.unget(data)
703
+ self.state = self.scriptDataEscapedState
704
+ return True
705
+
706
+ def scriptDataEscapedEndTagOpenState(self):
707
+ data = self.stream.char()
708
+ if data in asciiLetters:
709
+ self.temporaryBuffer = data
710
+ self.state = self.scriptDataEscapedEndTagNameState
711
+ else:
712
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
713
+ self.stream.unget(data)
714
+ self.state = self.scriptDataEscapedState
715
+ return True
716
+
717
+ def scriptDataEscapedEndTagNameState(self):
718
+ appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
719
+ data = self.stream.char()
720
+ if data in spaceCharacters and appropriate:
721
+ self.currentToken = {"type": tokenTypes["EndTag"],
722
+ "name": self.temporaryBuffer,
723
+ "data": [], "selfClosing": False}
724
+ self.state = self.beforeAttributeNameState
725
+ elif data == "/" and appropriate:
726
+ self.currentToken = {"type": tokenTypes["EndTag"],
727
+ "name": self.temporaryBuffer,
728
+ "data": [], "selfClosing": False}
729
+ self.state = self.selfClosingStartTagState
730
+ elif data == ">" and appropriate:
731
+ self.currentToken = {"type": tokenTypes["EndTag"],
732
+ "name": self.temporaryBuffer,
733
+ "data": [], "selfClosing": False}
734
+ self.emitCurrentToken()
735
+ self.state = self.dataState
736
+ elif data in asciiLetters:
737
+ self.temporaryBuffer += data
738
+ else:
739
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
740
+ "data": "</" + self.temporaryBuffer})
741
+ self.stream.unget(data)
742
+ self.state = self.scriptDataEscapedState
743
+ return True
744
+
745
+ def scriptDataDoubleEscapeStartState(self):
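+ # Entered after "<" plus letters inside escaped script data; if the buffered name is "script" we switch to the double-escaped state.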
746
+ data = self.stream.char()
747
+ if data in (spaceCharacters | frozenset(("/", ">"))):
748
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
749
+ if self.temporaryBuffer.lower() == "script":
750
+ self.state = self.scriptDataDoubleEscapedState
751
+ else:
752
+ self.state = self.scriptDataEscapedState
753
+ elif data in asciiLetters:
754
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
755
+ self.temporaryBuffer += data
756
+ else:
757
+ self.stream.unget(data)
758
+ self.state = self.scriptDataEscapedState
759
+ return True
760
+
761
+ def scriptDataDoubleEscapedState(self):
762
+ data = self.stream.char()
763
+ if data == "-":
764
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
765
+ self.state = self.scriptDataDoubleEscapedDashState
766
+ elif data == "<":
767
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
768
+ self.state = self.scriptDataDoubleEscapedLessThanSignState
769
+ elif data == "\u0000":
770
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
771
+ "data": "invalid-codepoint"})
772
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
773
+ "data": "\uFFFD"})
774
+ elif data == EOF:
775
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
776
+ "eof-in-script-in-script"})
777
+ self.state = self.dataState
778
+ else:
779
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
780
+ return True
781
+
782
+ def scriptDataDoubleEscapedDashState(self):
783
+ data = self.stream.char()
784
+ if data == "-":
785
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
786
+ self.state = self.scriptDataDoubleEscapedDashDashState
787
+ elif data == "<":
788
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
789
+ self.state = self.scriptDataDoubleEscapedLessThanSignState
790
+ elif data == "\u0000":
791
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
792
+ "data": "invalid-codepoint"})
793
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
794
+ "data": "\uFFFD"})
795
+ self.state = self.scriptDataDoubleEscapedState
796
+ elif data == EOF:
797
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
798
+ "eof-in-script-in-script"})
799
+ self.state = self.dataState
800
+ else:
801
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
802
+ self.state = self.scriptDataDoubleEscapedState
803
+ return True
804
+
805
+ def scriptDataDoubleEscapedDashDashState(self):
806
+ data = self.stream.char()
807
+ if data == "-":
808
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
809
+ elif data == "<":
810
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
811
+ self.state = self.scriptDataDoubleEscapedLessThanSignState
812
+ elif data == ">":
813
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
814
+ self.state = self.scriptDataState
815
+ elif data == "\u0000":
816
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
817
+ "data": "invalid-codepoint"})
818
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
819
+ "data": "\uFFFD"})
820
+ self.state = self.scriptDataDoubleEscapedState
821
+ elif data == EOF:
822
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
823
+ "eof-in-script-in-script"})
824
+ self.state = self.dataState
825
+ else:
826
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
827
+ self.state = self.scriptDataDoubleEscapedState
828
+ return True
829
+
830
+ def scriptDataDoubleEscapedLessThanSignState(self):
831
+ data = self.stream.char()
832
+ if data == "/":
833
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
834
+ self.temporaryBuffer = ""
835
+ self.state = self.scriptDataDoubleEscapeEndState
836
+ else:
837
+ self.stream.unget(data)
838
+ self.state = self.scriptDataDoubleEscapedState
839
+ return True
840
+
841
+ def scriptDataDoubleEscapeEndState(self):
842
+ data = self.stream.char()
843
+ if data in (spaceCharacters | frozenset(("/", ">"))):
844
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
845
+ if self.temporaryBuffer.lower() == "script":
846
+ self.state = self.scriptDataEscapedState
847
+ else:
848
+ self.state = self.scriptDataDoubleEscapedState
849
+ elif data in asciiLetters:
850
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
851
+ self.temporaryBuffer += data
852
+ else:
853
+ self.stream.unget(data)
854
+ self.state = self.scriptDataDoubleEscapedState
855
+ return True
856
+
857
+ def beforeAttributeNameState(self):
858
+ data = self.stream.char()
859
+ if data in spaceCharacters:
860
+ self.stream.charsUntil(spaceCharacters, True)
861
+ elif data in asciiLetters:
862
+ self.currentToken["data"].append([data, ""])
863
+ self.state = self.attributeNameState
864
+ elif data == ">":
865
+ self.emitCurrentToken()
866
+ elif data == "/":
867
+ self.state = self.selfClosingStartTagState
868
+ elif data in ("'", '"', "=", "<"):
869
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
870
+ "invalid-character-in-attribute-name"})
871
+ self.currentToken["data"].append([data, ""])
872
+ self.state = self.attributeNameState
873
+ elif data == "\u0000":
874
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
875
+ "data": "invalid-codepoint"})
876
+ self.currentToken["data"].append(["\uFFFD", ""])
877
+ self.state = self.attributeNameState
878
+ elif data is EOF:
879
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
880
+ "expected-attribute-name-but-got-eof"})
881
+ self.state = self.dataState
882
+ else:
883
+ self.currentToken["data"].append([data, ""])
884
+ self.state = self.attributeNameState
885
+ return True
886
+
887
+ def attributeNameState(self):
888
+ data = self.stream.char()
889
+ leavingThisState = True
890
+ emitToken = False
891
+ if data == "=":
892
+ self.state = self.beforeAttributeValueState
893
+ elif data in asciiLetters:
894
+ self.currentToken["data"][-1][0] += data +\
895
+ self.stream.charsUntil(asciiLetters, True)
896
+ leavingThisState = False
897
+ elif data == ">":
898
+ # XXX If we emit here, the attributes are converted to a dict
899
+ # without being checked, and when the code below runs we get an error
900
+ # because data is a dict, not a list
901
+ emitToken = True
902
+ elif data in spaceCharacters:
903
+ self.state = self.afterAttributeNameState
904
+ elif data == "/":
905
+ self.state = self.selfClosingStartTagState
906
+ elif data == "\u0000":
907
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
908
+ "data": "invalid-codepoint"})
909
+ self.currentToken["data"][-1][0] += "\uFFFD"
910
+ leavingThisState = False
911
+ elif data in ("'", '"', "<"):
912
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
913
+ "data":
914
+ "invalid-character-in-attribute-name"})
915
+ self.currentToken["data"][-1][0] += data
916
+ leavingThisState = False
917
+ elif data is EOF:
918
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
919
+ "data": "eof-in-attribute-name"})
920
+ self.state = self.dataState
921
+ else:
922
+ self.currentToken["data"][-1][0] += data
923
+ leavingThisState = False
924
+
925
+ if leavingThisState:
926
+ # Attributes are not dropped at this stage. That happens when the
927
+ # start tag token is emitted so values can still be safely appended
928
+ # to attributes, but we do want to report the parse error in time.
929
+ self.currentToken["data"][-1][0] = (
930
+ self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
931
+ for name, _ in self.currentToken["data"][:-1]:
932
+ if self.currentToken["data"][-1][0] == name:
933
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
934
+ "duplicate-attribute"})
935
+ break
936
+ # XXX Fix for above XXX
937
+ if emitToken:
938
+ self.emitCurrentToken()
939
+ return True
940
+
941
+ def afterAttributeNameState(self):
942
+ data = self.stream.char()
943
+ if data in spaceCharacters:
944
+ self.stream.charsUntil(spaceCharacters, True)
945
+ elif data == "=":
946
+ self.state = self.beforeAttributeValueState
947
+ elif data == ">":
948
+ self.emitCurrentToken()
949
+ elif data in asciiLetters:
950
+ self.currentToken["data"].append([data, ""])
951
+ self.state = self.attributeNameState
952
+ elif data == "/":
953
+ self.state = self.selfClosingStartTagState
954
+ elif data == "\u0000":
955
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
956
+ "data": "invalid-codepoint"})
957
+ self.currentToken["data"].append(["\uFFFD", ""])
958
+ self.state = self.attributeNameState
959
+ elif data in ("'", '"', "<"):
960
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
961
+ "invalid-character-after-attribute-name"})
962
+ self.currentToken["data"].append([data, ""])
963
+ self.state = self.attributeNameState
964
+ elif data is EOF:
965
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
966
+ "expected-end-of-tag-but-got-eof"})
967
+ self.state = self.dataState
968
+ else:
969
+ self.currentToken["data"].append([data, ""])
970
+ self.state = self.attributeNameState
971
+ return True
972
+
973
+ def beforeAttributeValueState(self):
974
+ data = self.stream.char()
975
+ if data in spaceCharacters:
976
+ self.stream.charsUntil(spaceCharacters, True)
977
+ elif data == "\"":
978
+ self.state = self.attributeValueDoubleQuotedState
979
+ elif data == "&":
980
+ self.state = self.attributeValueUnQuotedState
981
+ self.stream.unget(data)
982
+ elif data == "'":
983
+ self.state = self.attributeValueSingleQuotedState
984
+ elif data == ">":
985
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
986
+ "expected-attribute-value-but-got-right-bracket"})
987
+ self.emitCurrentToken()
988
+ elif data == "\u0000":
989
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
990
+ "data": "invalid-codepoint"})
991
+ self.currentToken["data"][-1][1] += "\uFFFD"
992
+ self.state = self.attributeValueUnQuotedState
993
+ elif data in ("=", "<", "`"):
994
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
995
+ "equals-in-unquoted-attribute-value"})
996
+ self.currentToken["data"][-1][1] += data
997
+ self.state = self.attributeValueUnQuotedState
998
+ elif data is EOF:
999
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1000
+ "expected-attribute-value-but-got-eof"})
1001
+ self.state = self.dataState
1002
+ else:
1003
+ self.currentToken["data"][-1][1] += data
1004
+ self.state = self.attributeValueUnQuotedState
1005
+ return True
1006
+
1007
+ def attributeValueDoubleQuotedState(self):
1008
+ data = self.stream.char()
1009
+ if data == "\"":
1010
+ self.state = self.afterAttributeValueState
1011
+ elif data == "&":
1012
+ self.processEntityInAttribute('"')
1013
+ elif data == "\u0000":
1014
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1015
+ "data": "invalid-codepoint"})
1016
+ self.currentToken["data"][-1][1] += "\uFFFD"
1017
+ elif data is EOF:
1018
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1019
+ "eof-in-attribute-value-double-quote"})
1020
+ self.state = self.dataState
1021
+ else:
1022
+ self.currentToken["data"][-1][1] += data +\
1023
+ self.stream.charsUntil(("\"", "&", "\u0000"))
1024
+ return True
1025
+
1026
+ def attributeValueSingleQuotedState(self):
1027
+ data = self.stream.char()
1028
+ if data == "'":
1029
+ self.state = self.afterAttributeValueState
1030
+ elif data == "&":
1031
+ self.processEntityInAttribute("'")
1032
+ elif data == "\u0000":
1033
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1034
+ "data": "invalid-codepoint"})
1035
+ self.currentToken["data"][-1][1] += "\uFFFD"
1036
+ elif data is EOF:
1037
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1038
+ "eof-in-attribute-value-single-quote"})
1039
+ self.state = self.dataState
1040
+ else:
1041
+ self.currentToken["data"][-1][1] += data +\
1042
+ self.stream.charsUntil(("'", "&", "\u0000"))
1043
+ return True
1044
+
1045
+ def attributeValueUnQuotedState(self):
1046
+ data = self.stream.char()
1047
+ if data in spaceCharacters:
1048
+ self.state = self.beforeAttributeNameState
1049
+ elif data == "&":
1050
+ self.processEntityInAttribute(">")
1051
+ elif data == ">":
1052
+ self.emitCurrentToken()
1053
+ elif data in ('"', "'", "=", "<", "`"):
1054
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1055
+ "unexpected-character-in-unquoted-attribute-value"})
1056
+ self.currentToken["data"][-1][1] += data
1057
+ elif data == "\u0000":
1058
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1059
+ "data": "invalid-codepoint"})
1060
+ self.currentToken["data"][-1][1] += "\uFFFD"
1061
+ elif data is EOF:
1062
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1063
+ "eof-in-attribute-value-no-quotes"})
1064
+ self.state = self.dataState
1065
+ else:
1066
+ self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
1067
+ frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
1068
+ return True
1069
+
1070
+ def afterAttributeValueState(self):
1071
+ data = self.stream.char()
1072
+ if data in spaceCharacters:
1073
+ self.state = self.beforeAttributeNameState
1074
+ elif data == ">":
1075
+ self.emitCurrentToken()
1076
+ elif data == "/":
1077
+ self.state = self.selfClosingStartTagState
1078
+ elif data is EOF:
1079
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1080
+ "unexpected-EOF-after-attribute-value"})
1081
+ self.stream.unget(data)
1082
+ self.state = self.dataState
1083
+ else:
1084
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1085
+ "unexpected-character-after-attribute-value"})
1086
+ self.stream.unget(data)
1087
+ self.state = self.beforeAttributeNameState
1088
+ return True
1089
+
1090
+ def selfClosingStartTagState(self):
1091
+ data = self.stream.char()
1092
+ if data == ">":
1093
+ self.currentToken["selfClosing"] = True
1094
+ self.emitCurrentToken()
1095
+ elif data is EOF:
1096
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1097
+ "data":
1098
+ "unexpected-EOF-after-solidus-in-tag"})
1099
+ self.stream.unget(data)
1100
+ self.state = self.dataState
1101
+ else:
1102
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1103
+ "unexpected-character-after-solidus-in-tag"})
1104
+ self.stream.unget(data)
1105
+ self.state = self.beforeAttributeNameState
1106
+ return True
1107
+
1108
+ def bogusCommentState(self):
1109
+ # Make a new comment token and give it as value all the characters
1110
+ # until the first > or EOF (charsUntil checks for EOF automatically)
1111
+ # and emit it.
1112
+ data = self.stream.charsUntil(">")
1113
+ data = data.replace("\u0000", "\uFFFD")
1114
+ self.tokenQueue.append(
1115
+ {"type": tokenTypes["Comment"], "data": data})
1116
+
1117
+ # Eat the character directly after the bogus comment which is either a
1118
+ # ">" or an EOF.
1119
+ self.stream.char()
1120
+ self.state = self.dataState
1121
+ return True
1122
+
1123
+ def markupDeclarationOpenState(self):
1124
+ charStack = [self.stream.char()]
1125
+ if charStack[-1] == "-":
1126
+ charStack.append(self.stream.char())
1127
+ if charStack[-1] == "-":
1128
+ self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
1129
+ self.state = self.commentStartState
1130
+ return True
1131
+ elif charStack[-1] in ('d', 'D'):
1132
+ matched = True
1133
+ for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
1134
+ ('y', 'Y'), ('p', 'P'), ('e', 'E')):
1135
+ charStack.append(self.stream.char())
1136
+ if charStack[-1] not in expected:
1137
+ matched = False
1138
+ break
1139
+ if matched:
1140
+ self.currentToken = {"type": tokenTypes["Doctype"],
1141
+ "name": "",
1142
+ "publicId": None, "systemId": None,
1143
+ "correct": True}
1144
+ self.state = self.doctypeState
1145
+ return True
1146
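+ # "<![CDATA[" is only recognised in foreign content (SVG/MathML); in HTML content it falls through to the bogus-comment handling below.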
+ elif (charStack[-1] == "[" and
1147
+ self.parser is not None and
1148
+ self.parser.tree.openElements and
1149
+ self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
1150
+ matched = True
1151
+ for expected in ["C", "D", "A", "T", "A", "["]:
1152
+ charStack.append(self.stream.char())
1153
+ if charStack[-1] != expected:
1154
+ matched = False
1155
+ break
1156
+ if matched:
1157
+ self.state = self.cdataSectionState
1158
+ return True
1159
+
1160
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1161
+ "expected-dashes-or-doctype"})
1162
+
1163
+ while charStack:
1164
+ self.stream.unget(charStack.pop())
1165
+ self.state = self.bogusCommentState
1166
+ return True
1167
+
1168
+ def commentStartState(self):
1169
+ data = self.stream.char()
1170
+ if data == "-":
1171
+ self.state = self.commentStartDashState
1172
+ elif data == "\u0000":
1173
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1174
+ "data": "invalid-codepoint"})
1175
+ self.currentToken["data"] += "\uFFFD"
1176
+ elif data == ">":
1177
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1178
+ "incorrect-comment"})
1179
+ self.tokenQueue.append(self.currentToken)
1180
+ self.state = self.dataState
1181
+ elif data is EOF:
1182
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1183
+ "eof-in-comment"})
1184
+ self.tokenQueue.append(self.currentToken)
1185
+ self.state = self.dataState
1186
+ else:
1187
+ self.currentToken["data"] += data
1188
+ self.state = self.commentState
1189
+ return True
1190
+
1191
+ def commentStartDashState(self):
1192
+ data = self.stream.char()
1193
+ if data == "-":
1194
+ self.state = self.commentEndState
1195
+ elif data == "\u0000":
1196
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1197
+ "data": "invalid-codepoint"})
1198
+ self.currentToken["data"] += "-\uFFFD"
1199
+ elif data == ">":
1200
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1201
+ "incorrect-comment"})
1202
+ self.tokenQueue.append(self.currentToken)
1203
+ self.state = self.dataState
1204
+ elif data is EOF:
1205
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1206
+ "eof-in-comment"})
1207
+ self.tokenQueue.append(self.currentToken)
1208
+ self.state = self.dataState
1209
+ else:
1210
+ self.currentToken["data"] += "-" + data
1211
+ self.state = self.commentState
1212
+ return True
1213
+
1214
+ def commentState(self):
1215
+ data = self.stream.char()
1216
+ if data == "-":
1217
+ self.state = self.commentEndDashState
1218
+ elif data == "\u0000":
1219
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1220
+ "data": "invalid-codepoint"})
1221
+ self.currentToken["data"] += "\uFFFD"
1222
+ elif data is EOF:
1223
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1224
+ "data": "eof-in-comment"})
1225
+ self.tokenQueue.append(self.currentToken)
1226
+ self.state = self.dataState
1227
+ else:
1228
+ self.currentToken["data"] += data + \
1229
+ self.stream.charsUntil(("-", "\u0000"))
1230
+ return True
1231
+
1232
+ def commentEndDashState(self):
1233
+ data = self.stream.char()
1234
+ if data == "-":
1235
+ self.state = self.commentEndState
1236
+ elif data == "\u0000":
1237
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1238
+ "data": "invalid-codepoint"})
1239
+ self.currentToken["data"] += "-\uFFFD"
1240
+ self.state = self.commentState
1241
+ elif data is EOF:
1242
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1243
+ "eof-in-comment-end-dash"})
1244
+ self.tokenQueue.append(self.currentToken)
1245
+ self.state = self.dataState
1246
+ else:
1247
+ self.currentToken["data"] += "-" + data
1248
+ self.state = self.commentState
1249
+ return True
1250
+
1251
+ def commentEndState(self):
1252
+ data = self.stream.char()
1253
+ if data == ">":
1254
+ self.tokenQueue.append(self.currentToken)
1255
+ self.state = self.dataState
1256
+ elif data == "\u0000":
1257
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1258
+ "data": "invalid-codepoint"})
1259
+ self.currentToken["data"] += "--\uFFFD"
1260
+ self.state = self.commentState
1261
+ elif data == "!":
1262
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1263
+ "unexpected-bang-after-double-dash-in-comment"})
1264
+ self.state = self.commentEndBangState
1265
+ elif data == "-":
1266
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1267
+ "unexpected-dash-after-double-dash-in-comment"})
1268
+ self.currentToken["data"] += data
1269
+ elif data is EOF:
1270
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1271
+ "eof-in-comment-double-dash"})
1272
+ self.tokenQueue.append(self.currentToken)
1273
+ self.state = self.dataState
1274
+ else:
1275
+ # XXX
1276
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1277
+ "unexpected-char-in-comment"})
1278
+ self.currentToken["data"] += "--" + data
1279
+ self.state = self.commentState
1280
+ return True
1281
+
1282
+ def commentEndBangState(self):
1283
+ data = self.stream.char()
1284
+ if data == ">":
1285
+ self.tokenQueue.append(self.currentToken)
1286
+ self.state = self.dataState
1287
+ elif data == "-":
1288
+ self.currentToken["data"] += "--!"
1289
+ self.state = self.commentEndDashState
1290
+ elif data == "\u0000":
1291
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1292
+ "data": "invalid-codepoint"})
1293
+ self.currentToken["data"] += "--!\uFFFD"
1294
+ self.state = self.commentState
1295
+ elif data is EOF:
1296
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1297
+ "eof-in-comment-end-bang-state"})
1298
+ self.tokenQueue.append(self.currentToken)
1299
+ self.state = self.dataState
1300
+ else:
1301
+ self.currentToken["data"] += "--!" + data
1302
+ self.state = self.commentState
1303
+ return True
1304
+
1305
+ def doctypeState(self):
1306
+ data = self.stream.char()
1307
+ if data in spaceCharacters:
1308
+ self.state = self.beforeDoctypeNameState
1309
+ elif data is EOF:
1310
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1311
+ "expected-doctype-name-but-got-eof"})
1312
+ self.currentToken["correct"] = False
1313
+ self.tokenQueue.append(self.currentToken)
1314
+ self.state = self.dataState
1315
+ else:
1316
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1317
+ "need-space-after-doctype"})
1318
+ self.stream.unget(data)
1319
+ self.state = self.beforeDoctypeNameState
1320
+ return True
1321
+
1322
+ def beforeDoctypeNameState(self):
1323
+ data = self.stream.char()
1324
+ if data in spaceCharacters:
1325
+ pass
1326
+ elif data == ">":
1327
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1328
+ "expected-doctype-name-but-got-right-bracket"})
1329
+ self.currentToken["correct"] = False
1330
+ self.tokenQueue.append(self.currentToken)
1331
+ self.state = self.dataState
1332
+ elif data == "\u0000":
1333
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1334
+ "data": "invalid-codepoint"})
1335
+ self.currentToken["name"] = "\uFFFD"
1336
+ self.state = self.doctypeNameState
1337
+ elif data is EOF:
1338
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1339
+ "expected-doctype-name-but-got-eof"})
1340
+ self.currentToken["correct"] = False
1341
+ self.tokenQueue.append(self.currentToken)
1342
+ self.state = self.dataState
1343
+ else:
1344
+ self.currentToken["name"] = data
1345
+ self.state = self.doctypeNameState
1346
+ return True
1347
+
1348
+ def doctypeNameState(self):
1349
+ data = self.stream.char()
1350
+ if data in spaceCharacters:
1351
+ self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1352
+ self.state = self.afterDoctypeNameState
1353
+ elif data == ">":
1354
+ self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1355
+ self.tokenQueue.append(self.currentToken)
1356
+ self.state = self.dataState
1357
+ elif data == "\u0000":
1358
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1359
+ "data": "invalid-codepoint"})
1360
+ self.currentToken["name"] += "\uFFFD"
1361
+ self.state = self.doctypeNameState
1362
+ elif data is EOF:
1363
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1364
+ "eof-in-doctype-name"})
1365
+ self.currentToken["correct"] = False
1366
+ self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
1367
+ self.tokenQueue.append(self.currentToken)
1368
+ self.state = self.dataState
1369
+ else:
1370
+ self.currentToken["name"] += data
1371
+ return True
1372
+
1373
+ def afterDoctypeNameState(self):
1374
+ data = self.stream.char()
1375
+ if data in spaceCharacters:
1376
+ pass
1377
+ elif data == ">":
1378
+ self.tokenQueue.append(self.currentToken)
1379
+ self.state = self.dataState
1380
+ elif data is EOF:
1381
+ self.currentToken["correct"] = False
1382
+ self.stream.unget(data)
1383
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1384
+ "eof-in-doctype"})
1385
+ self.tokenQueue.append(self.currentToken)
1386
+ self.state = self.dataState
1387
+ else:
1388
+ if data in ("p", "P"):
1389
+ matched = True
1390
+ for expected in (("u", "U"), ("b", "B"), ("l", "L"),
1391
+ ("i", "I"), ("c", "C")):
1392
+ data = self.stream.char()
1393
+ if data not in expected:
1394
+ matched = False
1395
+ break
1396
+ if matched:
1397
+ self.state = self.afterDoctypePublicKeywordState
1398
+ return True
1399
+ elif data in ("s", "S"):
1400
+ matched = True
1401
+ for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
1402
+ ("e", "E"), ("m", "M")):
1403
+ data = self.stream.char()
1404
+ if data not in expected:
1405
+ matched = False
1406
+ break
1407
+ if matched:
1408
+ self.state = self.afterDoctypeSystemKeywordState
1409
+ return True
1410
+
1411
+ # All the characters read before the current 'data' will be
1412
+ # [a-zA-Z], so they're garbage in the bogus doctype and can be
1413
+ # discarded; only the latest character might be '>' or EOF
1414
+ # and needs to be put back on the stream (unget)
1415
+ self.stream.unget(data)
1416
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1417
+ "expected-space-or-right-bracket-in-doctype", "datavars":
1418
+ {"data": data}})
1419
+ self.currentToken["correct"] = False
1420
+ self.state = self.bogusDoctypeState
1421
+
1422
+ return True
1423
+
1424
+ def afterDoctypePublicKeywordState(self):
1425
+ data = self.stream.char()
1426
+ if data in spaceCharacters:
1427
+ self.state = self.beforeDoctypePublicIdentifierState
1428
+ elif data in ("'", '"'):
1429
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1430
+ "unexpected-char-in-doctype"})
1431
+ self.stream.unget(data)
1432
+ self.state = self.beforeDoctypePublicIdentifierState
1433
+ elif data is EOF:
1434
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1435
+ "eof-in-doctype"})
1436
+ self.currentToken["correct"] = False
1437
+ self.tokenQueue.append(self.currentToken)
1438
+ self.state = self.dataState
1439
+ else:
1440
+ self.stream.unget(data)
1441
+ self.state = self.beforeDoctypePublicIdentifierState
1442
+ return True
1443
+
1444
+ def beforeDoctypePublicIdentifierState(self):
1445
+ data = self.stream.char()
1446
+ if data in spaceCharacters:
1447
+ pass
1448
+ elif data == "\"":
1449
+ self.currentToken["publicId"] = ""
1450
+ self.state = self.doctypePublicIdentifierDoubleQuotedState
1451
+ elif data == "'":
1452
+ self.currentToken["publicId"] = ""
1453
+ self.state = self.doctypePublicIdentifierSingleQuotedState
1454
+ elif data == ">":
1455
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1456
+ "unexpected-end-of-doctype"})
1457
+ self.currentToken["correct"] = False
1458
+ self.tokenQueue.append(self.currentToken)
1459
+ self.state = self.dataState
1460
+ elif data is EOF:
1461
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1462
+ "eof-in-doctype"})
1463
+ self.currentToken["correct"] = False
1464
+ self.tokenQueue.append(self.currentToken)
1465
+ self.state = self.dataState
1466
+ else:
1467
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1468
+ "unexpected-char-in-doctype"})
1469
+ self.currentToken["correct"] = False
1470
+ self.state = self.bogusDoctypeState
1471
+ return True
1472
+
1473
+ def doctypePublicIdentifierDoubleQuotedState(self):
1474
+ data = self.stream.char()
1475
+ if data == "\"":
1476
+ self.state = self.afterDoctypePublicIdentifierState
1477
+ elif data == "\u0000":
1478
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1479
+ "data": "invalid-codepoint"})
1480
+ self.currentToken["publicId"] += "\uFFFD"
1481
+ elif data == ">":
1482
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1483
+ "unexpected-end-of-doctype"})
1484
+ self.currentToken["correct"] = False
1485
+ self.tokenQueue.append(self.currentToken)
1486
+ self.state = self.dataState
1487
+ elif data is EOF:
1488
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1489
+ "eof-in-doctype"})
1490
+ self.currentToken["correct"] = False
1491
+ self.tokenQueue.append(self.currentToken)
1492
+ self.state = self.dataState
1493
+ else:
1494
+ self.currentToken["publicId"] += data
1495
+ return True
1496
+
1497
+ def doctypePublicIdentifierSingleQuotedState(self):
1498
+ data = self.stream.char()
1499
+ if data == "'":
1500
+ self.state = self.afterDoctypePublicIdentifierState
1501
+ elif data == "\u0000":
1502
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1503
+ "data": "invalid-codepoint"})
1504
+ self.currentToken["publicId"] += "\uFFFD"
1505
+ elif data == ">":
1506
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1507
+ "unexpected-end-of-doctype"})
1508
+ self.currentToken["correct"] = False
1509
+ self.tokenQueue.append(self.currentToken)
1510
+ self.state = self.dataState
1511
+ elif data is EOF:
1512
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1513
+ "eof-in-doctype"})
1514
+ self.currentToken["correct"] = False
1515
+ self.tokenQueue.append(self.currentToken)
1516
+ self.state = self.dataState
1517
+ else:
1518
+ self.currentToken["publicId"] += data
1519
+ return True
1520
+
1521
+ def afterDoctypePublicIdentifierState(self):
1522
+ data = self.stream.char()
1523
+ if data in spaceCharacters:
1524
+ self.state = self.betweenDoctypePublicAndSystemIdentifiersState
1525
+ elif data == ">":
1526
+ self.tokenQueue.append(self.currentToken)
1527
+ self.state = self.dataState
1528
+ elif data == '"':
1529
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1530
+ "unexpected-char-in-doctype"})
1531
+ self.currentToken["systemId"] = ""
1532
+ self.state = self.doctypeSystemIdentifierDoubleQuotedState
1533
+ elif data == "'":
1534
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1535
+ "unexpected-char-in-doctype"})
1536
+ self.currentToken["systemId"] = ""
1537
+ self.state = self.doctypeSystemIdentifierSingleQuotedState
1538
+ elif data is EOF:
1539
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1540
+ "eof-in-doctype"})
1541
+ self.currentToken["correct"] = False
1542
+ self.tokenQueue.append(self.currentToken)
1543
+ self.state = self.dataState
1544
+ else:
1545
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1546
+ "unexpected-char-in-doctype"})
1547
+ self.currentToken["correct"] = False
1548
+ self.state = self.bogusDoctypeState
1549
+ return True
1550
+
1551
+ def betweenDoctypePublicAndSystemIdentifiersState(self):
1552
+ data = self.stream.char()
1553
+ if data in spaceCharacters:
1554
+ pass
1555
+ elif data == ">":
1556
+ self.tokenQueue.append(self.currentToken)
1557
+ self.state = self.dataState
1558
+ elif data == '"':
1559
+ self.currentToken["systemId"] = ""
1560
+ self.state = self.doctypeSystemIdentifierDoubleQuotedState
1561
+ elif data == "'":
1562
+ self.currentToken["systemId"] = ""
1563
+ self.state = self.doctypeSystemIdentifierSingleQuotedState
1564
+ elif data == EOF:
1565
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1566
+ "eof-in-doctype"})
1567
+ self.currentToken["correct"] = False
1568
+ self.tokenQueue.append(self.currentToken)
1569
+ self.state = self.dataState
1570
+ else:
1571
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1572
+ "unexpected-char-in-doctype"})
1573
+ self.currentToken["correct"] = False
1574
+ self.state = self.bogusDoctypeState
1575
+ return True
1576
+
1577
+ def afterDoctypeSystemKeywordState(self):
1578
+ data = self.stream.char()
1579
+ if data in spaceCharacters:
1580
+ self.state = self.beforeDoctypeSystemIdentifierState
1581
+ elif data in ("'", '"'):
1582
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1583
+ "unexpected-char-in-doctype"})
1584
+ self.stream.unget(data)
1585
+ self.state = self.beforeDoctypeSystemIdentifierState
1586
+ elif data is EOF:
1587
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1588
+ "eof-in-doctype"})
1589
+ self.currentToken["correct"] = False
1590
+ self.tokenQueue.append(self.currentToken)
1591
+ self.state = self.dataState
1592
+ else:
1593
+ self.stream.unget(data)
1594
+ self.state = self.beforeDoctypeSystemIdentifierState
1595
+ return True
1596
+
1597
+ def beforeDoctypeSystemIdentifierState(self):
1598
+ data = self.stream.char()
1599
+ if data in spaceCharacters:
1600
+ pass
1601
+ elif data == "\"":
1602
+ self.currentToken["systemId"] = ""
1603
+ self.state = self.doctypeSystemIdentifierDoubleQuotedState
1604
+ elif data == "'":
1605
+ self.currentToken["systemId"] = ""
1606
+ self.state = self.doctypeSystemIdentifierSingleQuotedState
1607
+ elif data == ">":
1608
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1609
+ "unexpected-char-in-doctype"})
1610
+ self.currentToken["correct"] = False
1611
+ self.tokenQueue.append(self.currentToken)
1612
+ self.state = self.dataState
1613
+ elif data is EOF:
1614
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1615
+ "eof-in-doctype"})
1616
+ self.currentToken["correct"] = False
1617
+ self.tokenQueue.append(self.currentToken)
1618
+ self.state = self.dataState
1619
+ else:
1620
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1621
+ "unexpected-char-in-doctype"})
1622
+ self.currentToken["correct"] = False
1623
+ self.state = self.bogusDoctypeState
1624
+ return True
1625
+
1626
+ def doctypeSystemIdentifierDoubleQuotedState(self):
1627
+ data = self.stream.char()
1628
+ if data == "\"":
1629
+ self.state = self.afterDoctypeSystemIdentifierState
1630
+ elif data == "\u0000":
1631
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1632
+ "data": "invalid-codepoint"})
1633
+ self.currentToken["systemId"] += "\uFFFD"
1634
+ elif data == ">":
1635
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1636
+ "unexpected-end-of-doctype"})
1637
+ self.currentToken["correct"] = False
1638
+ self.tokenQueue.append(self.currentToken)
1639
+ self.state = self.dataState
1640
+ elif data is EOF:
1641
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1642
+ "eof-in-doctype"})
1643
+ self.currentToken["correct"] = False
1644
+ self.tokenQueue.append(self.currentToken)
1645
+ self.state = self.dataState
1646
+ else:
1647
+ self.currentToken["systemId"] += data
1648
+ return True
1649
+
1650
+ def doctypeSystemIdentifierSingleQuotedState(self):
1651
+ data = self.stream.char()
1652
+ if data == "'":
1653
+ self.state = self.afterDoctypeSystemIdentifierState
1654
+ elif data == "\u0000":
1655
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1656
+ "data": "invalid-codepoint"})
1657
+ self.currentToken["systemId"] += "\uFFFD"
1658
+ elif data == ">":
1659
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1660
+ "unexpected-end-of-doctype"})
1661
+ self.currentToken["correct"] = False
1662
+ self.tokenQueue.append(self.currentToken)
1663
+ self.state = self.dataState
1664
+ elif data is EOF:
1665
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1666
+ "eof-in-doctype"})
1667
+ self.currentToken["correct"] = False
1668
+ self.tokenQueue.append(self.currentToken)
1669
+ self.state = self.dataState
1670
+ else:
1671
+ self.currentToken["systemId"] += data
1672
+ return True
1673
+
1674
+ def afterDoctypeSystemIdentifierState(self):
1675
+ data = self.stream.char()
1676
+ if data in spaceCharacters:
1677
+ pass
1678
+ elif data == ">":
1679
+ self.tokenQueue.append(self.currentToken)
1680
+ self.state = self.dataState
1681
+ elif data is EOF:
1682
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1683
+ "eof-in-doctype"})
1684
+ self.currentToken["correct"] = False
1685
+ self.tokenQueue.append(self.currentToken)
1686
+ self.state = self.dataState
1687
+ else:
1688
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1689
+ "unexpected-char-in-doctype"})
1690
+ self.state = self.bogusDoctypeState
1691
+ return True
1692
+
1693
+ def bogusDoctypeState(self):
1694
+ data = self.stream.char()
1695
+ if data == ">":
1696
+ self.tokenQueue.append(self.currentToken)
1697
+ self.state = self.dataState
1698
+ elif data is EOF:
1699
+ # XXX EMIT
1700
+ self.stream.unget(data)
1701
+ self.tokenQueue.append(self.currentToken)
1702
+ self.state = self.dataState
1703
+ else:
1704
+ pass
1705
+ return True
1706
+
1707
+ def cdataSectionState(self):
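+ # Consume everything up to the "]]>" terminator (or EOF); the trailing "]]" is stripped from the emitted character data.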
1708
+ data = []
1709
+ while True:
1710
+ data.append(self.stream.charsUntil("]"))
1711
+ data.append(self.stream.charsUntil(">"))
1712
+ char = self.stream.char()
1713
+ if char == EOF:
1714
+ break
1715
+ else:
1716
+ assert char == ">"
1717
+ if data[-1][-2:] == "]]":
1718
+ data[-1] = data[-1][:-2]
1719
+ break
1720
+ else:
1721
+ data.append(char)
1722
+
1723
+ data = "".join(data) # pylint:disable=redefined-variable-type
1724
+ # Deal with null here rather than in the parser
1725
+ nullCount = data.count("\u0000")
1726
+ if nullCount > 0:
1727
+ for _ in range(nullCount):
1728
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
1729
+ "data": "invalid-codepoint"})
1730
+ data = data.replace("\u0000", "\uFFFD")
1731
+ if data:
1732
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
1733
+ "data": data})
1734
+ self.state = self.dataState
1735
+ return True
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from __future__ import absolute_import, division, unicode_literals
2
+
3
+ from .py import Trie
4
+
5
+ __all__ = ["Trie"]
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (325 Bytes).
 
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/__pycache__/_base.cpython-39.pyc ADDED
Binary file (1.57 kB).
 
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/__pycache__/py.cpython-39.pyc ADDED
Binary file (2.22 kB).
 
MLPY/Lib/site-packages/tensorboard/_vendor/html5lib/_trie/_base.py ADDED
@@ -0,0 +1,40 @@
1
+ from __future__ import absolute_import, division, unicode_literals
2
+
3
+ try:
4
+ from collections.abc import Mapping
5
+ except ImportError: # Python 2.7
6
+ from collections import Mapping
7
+
8
+
9
+ class Trie(Mapping):
10
+ """Abstract base class for tries"""
11
+
12
+ def keys(self, prefix=None):
13
+ # pylint:disable=arguments-differ
14
+ keys = super(Trie, self).keys()
15
+
16
+ if prefix is None:
17
+ return set(keys)
18
+
19
+ return {x for x in keys if x.startswith(prefix)}
20
+
21
+ def has_keys_with_prefix(self, prefix):
22
+ for key in self.keys():
23
+ if key.startswith(prefix):
24
+ return True
25
+
26
+ return False
27
+
28
+ def longest_prefix(self, prefix):
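+ # Return the longest key in the trie that is a prefix of the given string, raising KeyError if there is none.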
29
+ if prefix in self:
30
+ return prefix
31
+
32
+ for i in range(1, len(prefix) + 1):
33
+ if prefix[:-i] in self:
34
+ return prefix[:-i]
35
+
36
+ raise KeyError(prefix)
37
+
38
+ def longest_prefix_item(self, prefix):
39
+ lprefix = self.longest_prefix(prefix)
40
+ return (lprefix, self[lprefix])