File size: 14,257 Bytes
d1ceb73 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 |
#
# Copyright 2009 Facebook
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Escaping/unescaping methods for HTML, JSON, URLs, and others.
Also includes a few other miscellaneous string manipulation functions that
have crept in over time.
Many functions in this module have near-equivalents in the standard library
(the differences mainly relate to handling of bytes and unicode strings,
and were more relevant in Python 2). In new code, the standard library
functions are encouraged instead of this module where applicable. See the
docstrings on each function for details.
"""
import html
import json
import re
import urllib.parse
from tornado.util import unicode_type
import typing
from typing import Union, Any, Optional, Dict, List, Callable
def xhtml_escape(value: Union[str, bytes]) -> str:
"""Escapes a string so it is valid within HTML or XML.
Escapes the characters ``<``, ``>``, ``"``, ``'``, and ``&``.
When used in attribute values the escaped strings must be enclosed
in quotes.
Equivalent to `html.escape` except that this function always returns
type `str` while `html.escape` returns `bytes` if its input is `bytes`.
.. versionchanged:: 3.2
Added the single quote to the list of escaped characters.
.. versionchanged:: 6.4
Now simply wraps `html.escape`. This is equivalent to the old behavior
except that single quotes are now escaped as ``'`` instead of
``'`` and performance may be different.
"""
return html.escape(to_unicode(value))
def xhtml_unescape(value: Union[str, bytes]) -> str:
"""Un-escapes an XML-escaped string.
Equivalent to `html.unescape` except that this function always returns
type `str` while `html.unescape` returns `bytes` if its input is `bytes`.
.. versionchanged:: 6.4
Now simply wraps `html.unescape`. This changes behavior for some inputs
as required by the HTML 5 specification
https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
Some invalid inputs such as surrogates now raise an error, and numeric
references to certain ISO-8859-1 characters are now handled correctly.
"""
return html.unescape(to_unicode(value))
# The fact that json_encode wraps json.dumps is an implementation detail.
# Please see https://github.com/tornadoweb/tornado/pull/706
# before sending a pull request that adds **kwargs to this function.
def json_encode(value: Any) -> str:
"""JSON-encodes the given Python object.
Equivalent to `json.dumps` with the additional guarantee that the output
will never contain the character sequence ``</`` which can be problematic
when JSON is embedded in an HTML ``<script>`` tag.
"""
# JSON permits but does not require forward slashes to be escaped.
# This is useful when json data is emitted in a <script> tag
# in HTML, as it prevents </script> tags from prematurely terminating
# the JavaScript. Some json libraries do this escaping by default,
# although python's standard library does not, so we do it here.
# http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped
return json.dumps(value).replace("</", "<\\/")
def json_decode(value: Union[str, bytes]) -> Any:
"""Returns Python objects for the given JSON string.
Supports both `str` and `bytes` inputs. Equvalent to `json.loads`.
"""
return json.loads(value)
def squeeze(value: str) -> str:
"""Replace all sequences of whitespace chars with a single space."""
return re.sub(r"[\x00-\x20]+", " ", value).strip()
def url_escape(value: Union[str, bytes], plus: bool = True) -> str:
"""Returns a URL-encoded version of the given value.
Equivalent to either `urllib.parse.quote_plus` or `urllib.parse.quote` depending on the ``plus``
argument.
If ``plus`` is true (the default), spaces will be represented as ``+`` and slashes will be
represented as ``%2F``. This is appropriate for query strings. If ``plus`` is false, spaces
will be represented as ``%20`` and slashes are left as-is. This is appropriate for the path
component of a URL. Note that the default of ``plus=True`` is effectively the
reverse of Python's urllib module.
.. versionadded:: 3.1
The ``plus`` argument
"""
quote = urllib.parse.quote_plus if plus else urllib.parse.quote
return quote(value)
@typing.overload
def url_unescape(value: Union[str, bytes], encoding: None, plus: bool = True) -> bytes:
pass
@typing.overload
def url_unescape(
value: Union[str, bytes], encoding: str = "utf-8", plus: bool = True
) -> str:
pass
def url_unescape(
value: Union[str, bytes], encoding: Optional[str] = "utf-8", plus: bool = True
) -> Union[str, bytes]:
"""Decodes the given value from a URL.
The argument may be either a byte or unicode string.
If encoding is None, the result will be a byte string and this function is equivalent to
`urllib.parse.unquote_to_bytes` if ``plus=False``. Otherwise, the result is a unicode string in
the specified encoding and this function is equivalent to either `urllib.parse.unquote_plus` or
`urllib.parse.unquote` except that this function also accepts `bytes` as input.
If ``plus`` is true (the default), plus signs will be interpreted as spaces (literal plus signs
must be represented as "%2B"). This is appropriate for query strings and form-encoded values
but not for the path component of a URL. Note that this default is the reverse of Python's
urllib module.
.. versionadded:: 3.1
The ``plus`` argument
"""
if encoding is None:
if plus:
# unquote_to_bytes doesn't have a _plus variant
value = to_basestring(value).replace("+", " ")
return urllib.parse.unquote_to_bytes(value)
else:
unquote = urllib.parse.unquote_plus if plus else urllib.parse.unquote
return unquote(to_basestring(value), encoding=encoding)
def parse_qs_bytes(
qs: Union[str, bytes], keep_blank_values: bool = False, strict_parsing: bool = False
) -> Dict[str, List[bytes]]:
"""Parses a query string like urlparse.parse_qs,
but takes bytes and returns the values as byte strings.
Keys still become type str (interpreted as latin1 in python3!)
because it's too painful to keep them as byte strings in
python3 and in practice they're nearly always ascii anyway.
"""
# This is gross, but python3 doesn't give us another way.
# Latin1 is the universal donor of character encodings.
if isinstance(qs, bytes):
qs = qs.decode("latin1")
result = urllib.parse.parse_qs(
qs, keep_blank_values, strict_parsing, encoding="latin1", errors="strict"
)
encoded = {}
for k, v in result.items():
encoded[k] = [i.encode("latin1") for i in v]
return encoded
_UTF8_TYPES = (bytes, type(None))
@typing.overload
def utf8(value: bytes) -> bytes:
pass
@typing.overload
def utf8(value: str) -> bytes:
pass
@typing.overload
def utf8(value: None) -> None:
pass
def utf8(value: Union[None, str, bytes]) -> Optional[bytes]:
"""Converts a string argument to a byte string.
If the argument is already a byte string or None, it is returned unchanged.
Otherwise it must be a unicode string and is encoded as utf8.
"""
if isinstance(value, _UTF8_TYPES):
return value
if not isinstance(value, unicode_type):
raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
return value.encode("utf-8")
_TO_UNICODE_TYPES = (unicode_type, type(None))
@typing.overload
def to_unicode(value: str) -> str:
pass
@typing.overload
def to_unicode(value: bytes) -> str:
pass
@typing.overload
def to_unicode(value: None) -> None:
pass
def to_unicode(value: Union[None, str, bytes]) -> Optional[str]:
"""Converts a string argument to a unicode string.
If the argument is already a unicode string or None, it is returned
unchanged. Otherwise it must be a byte string and is decoded as utf8.
"""
if isinstance(value, _TO_UNICODE_TYPES):
return value
if not isinstance(value, bytes):
raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
return value.decode("utf-8")
# to_unicode was previously named _unicode not because it was private,
# but to avoid conflicts with the built-in unicode() function/type
_unicode = to_unicode
# When dealing with the standard library across python 2 and 3 it is
# sometimes useful to have a direct conversion to the native string type
native_str = to_unicode
to_basestring = to_unicode
def recursive_unicode(obj: Any) -> Any:
"""Walks a simple data structure, converting byte strings to unicode.
Supports lists, tuples, and dictionaries.
"""
if isinstance(obj, dict):
return dict(
(recursive_unicode(k), recursive_unicode(v)) for (k, v) in obj.items()
)
elif isinstance(obj, list):
return list(recursive_unicode(i) for i in obj)
elif isinstance(obj, tuple):
return tuple(recursive_unicode(i) for i in obj)
elif isinstance(obj, bytes):
return to_unicode(obj)
else:
return obj
# I originally used the regex from
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
# but it gets all exponential on certain patterns (such as too many trailing
# dots), causing the regex matcher to never return.
# This regex should avoid those problems.
# Use to_unicode instead of tornado.util.u - we don't want backslashes getting
# processed as escapes.
_URL_RE = re.compile(
to_unicode(
r"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&|")*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&|")*\)))+)""" # noqa: E501
)
)
def linkify(
text: Union[str, bytes],
shorten: bool = False,
extra_params: Union[str, Callable[[str], str]] = "",
require_protocol: bool = False,
permitted_protocols: List[str] = ["http", "https"],
) -> str:
"""Converts plain text into HTML with links.
For example: ``linkify("Hello http://tornadoweb.org!")`` would return
``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``
Parameters:
* ``shorten``: Long urls will be shortened for display.
* ``extra_params``: Extra text to include in the link tag, or a callable
taking the link as an argument and returning the extra text
e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
or::
def extra_params_cb(url):
if url.startswith("http://example.com"):
return 'class="internal"'
else:
return 'class="external" rel="nofollow"'
linkify(text, extra_params=extra_params_cb)
* ``require_protocol``: Only linkify urls which include a protocol. If
this is False, urls such as www.facebook.com will also be linkified.
* ``permitted_protocols``: List (or set) of protocols which should be
linkified, e.g. ``linkify(text, permitted_protocols=["http", "ftp",
"mailto"])``. It is very unsafe to include protocols such as
``javascript``.
"""
if extra_params and not callable(extra_params):
extra_params = " " + extra_params.strip()
def make_link(m: typing.Match) -> str:
url = m.group(1)
proto = m.group(2)
if require_protocol and not proto:
return url # not protocol, no linkify
if proto and proto not in permitted_protocols:
return url # bad protocol, no linkify
href = m.group(1)
if not proto:
href = "http://" + href # no proto specified, use http
if callable(extra_params):
params = " " + extra_params(href).strip()
else:
params = extra_params
# clip long urls. max_len is just an approximation
max_len = 30
if shorten and len(url) > max_len:
before_clip = url
if proto:
proto_len = len(proto) + 1 + len(m.group(3) or "") # +1 for :
else:
proto_len = 0
parts = url[proto_len:].split("/")
if len(parts) > 1:
# Grab the whole host part plus the first bit of the path
# The path is usually not that interesting once shortened
# (no more slug, etc), so it really just provides a little
# extra indication of shortening.
url = (
url[:proto_len]
+ parts[0]
+ "/"
+ parts[1][:8].split("?")[0].split(".")[0]
)
if len(url) > max_len * 1.5: # still too long
url = url[:max_len]
if url != before_clip:
amp = url.rfind("&")
# avoid splitting html char entities
if amp > max_len - 5:
url = url[:amp]
url += "..."
if len(url) >= len(before_clip):
url = before_clip
else:
# full url is visible on mouse-over (for those who don't
# have a status bar, such as Safari by default)
params += ' title="%s"' % href
return '<a href="%s"%s>%s</a>' % (href, params, url)
# First HTML-escape so that our strings are all safe.
# The regex is modified to avoid character entites other than & so
# that we won't pick up ", etc.
text = _unicode(xhtml_escape(text))
return _URL_RE.sub(make_link, text)
|