Upload 1579 files
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +4 -0
- pproyect/test1/.gitignore +2 -0
- pproyect/test1/Lib/site-packages/__pycache__/_virtualenv.cpython-310.pyc +0 -0
- pproyect/test1/Lib/site-packages/_distutils_hack/__init__.py +220 -0
- pproyect/test1/Lib/site-packages/_distutils_hack/__pycache__/__init__.cpython-310.pyc +0 -0
- pproyect/test1/Lib/site-packages/_distutils_hack/__pycache__/override.cpython-310.pyc +0 -0
- pproyect/test1/Lib/site-packages/_distutils_hack/override.py +1 -0
- pproyect/test1/Lib/site-packages/_virtualenv.pth +3 -0
- pproyect/test1/Lib/site-packages/_virtualenv.py +103 -0
- pproyect/test1/Lib/site-packages/certifi-2024.2.2.dist-info/INSTALLER +1 -0
- pproyect/test1/Lib/site-packages/certifi-2024.2.2.dist-info/LICENSE +20 -0
- pproyect/test1/Lib/site-packages/certifi-2024.2.2.dist-info/METADATA +66 -0
- pproyect/test1/Lib/site-packages/certifi-2024.2.2.dist-info/RECORD +14 -0
- pproyect/test1/Lib/site-packages/certifi-2024.2.2.dist-info/WHEEL +5 -0
- pproyect/test1/Lib/site-packages/certifi-2024.2.2.dist-info/top_level.txt +1 -0
- pproyect/test1/Lib/site-packages/certifi/__init__.py +4 -0
- pproyect/test1/Lib/site-packages/certifi/__main__.py +12 -0
- pproyect/test1/Lib/site-packages/certifi/__pycache__/__init__.cpython-310.pyc +0 -0
- pproyect/test1/Lib/site-packages/certifi/__pycache__/__main__.cpython-310.pyc +0 -0
- pproyect/test1/Lib/site-packages/certifi/__pycache__/core.cpython-310.pyc +0 -0
- pproyect/test1/Lib/site-packages/certifi/cacert.pem +0 -0
- pproyect/test1/Lib/site-packages/certifi/core.py +114 -0
- pproyect/test1/Lib/site-packages/certifi/py.typed +0 -0
- pproyect/test1/Lib/site-packages/charset_normalizer-3.3.2.dist-info/INSTALLER +1 -0
- pproyect/test1/Lib/site-packages/charset_normalizer-3.3.2.dist-info/LICENSE +21 -0
- pproyect/test1/Lib/site-packages/charset_normalizer-3.3.2.dist-info/METADATA +683 -0
- pproyect/test1/Lib/site-packages/charset_normalizer-3.3.2.dist-info/RECORD +35 -0
- pproyect/test1/Lib/site-packages/charset_normalizer-3.3.2.dist-info/WHEEL +5 -0
- pproyect/test1/Lib/site-packages/charset_normalizer-3.3.2.dist-info/entry_points.txt +2 -0
- pproyect/test1/Lib/site-packages/charset_normalizer-3.3.2.dist-info/top_level.txt +1 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/__init__.py +46 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/__main__.py +4 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/__init__.cpython-310.pyc +0 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/__main__.cpython-310.pyc +0 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/api.cpython-310.pyc +0 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/cd.cpython-310.pyc +0 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/constant.cpython-310.pyc +0 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/legacy.cpython-310.pyc +0 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/md.cpython-310.pyc +0 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/models.cpython-310.pyc +0 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/utils.cpython-310.pyc +0 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/version.cpython-310.pyc +0 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/api.py +626 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/cd.py +395 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/cli/__init__.py +6 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/cli/__main__.py +296 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/cli/__pycache__/__init__.cpython-310.pyc +0 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/cli/__pycache__/__main__.cpython-310.pyc +0 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/constant.py +1995 -0
- pproyect/test1/Lib/site-packages/charset_normalizer/legacy.py +54 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
pproyect/test1/Scripts/libcrypto-1_1.dll filter=lfs diff=lfs merge=lfs -text
|
37 |
+
pproyect/test1/Scripts/python310.dll filter=lfs diff=lfs merge=lfs -text
|
38 |
+
pproyect/test1/Scripts/sqlite3.dll filter=lfs diff=lfs merge=lfs -text
|
39 |
+
pproyect/test1/Scripts/unicodedata.pyd filter=lfs diff=lfs merge=lfs -text
|
pproyect/test1/.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
# created by virtualenv automatically
|
2 |
+
*
|
pproyect/test1/Lib/site-packages/__pycache__/_virtualenv.cpython-310.pyc
ADDED
Binary file (2.81 kB). View file
|
|
pproyect/test1/Lib/site-packages/_distutils_hack/__init__.py
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# don't import any costly modules
|
2 |
+
import sys
|
3 |
+
import os
|
4 |
+
|
5 |
+
|
6 |
+
def warn_distutils_present():
    """Warn if ``distutils`` is already imported.

    Returns immediately (and silently) when no module named ``distutils``
    is present in ``sys.modules``; otherwise emits a single warning
    explaining that Setuptools replaces that module.
    """
    if 'distutils' not in sys.modules:
        return

    # Imported here rather than at module level so that importing this
    # module stays cheap.
    import warnings

    warnings.warn(
        "Distutils was imported before Setuptools, but importing Setuptools "
        "also replaces the `distutils` module in `sys.modules`. This may lead "
        "to undesirable behaviors or errors. To avoid these issues, avoid "
        "using distutils directly, ensure that setuptools is installed in the "
        "traditional way (e.g. not an editable install), and/or make sure "
        "that setuptools is always imported before distutils."
    )
|
19 |
+
|
20 |
+
|
21 |
+
def clear_distutils():
    """Remove ``distutils`` and all of its submodules from ``sys.modules``.

    Does nothing when ``distutils`` has not been imported. Otherwise a
    warning is emitted and every entry named ``distutils`` or starting with
    ``distutils.`` is deleted from ``sys.modules``.
    """
    if 'distutils' not in sys.modules:
        return

    # Imported lazily so that importing this module stays cheap.
    import warnings

    warnings.warn("Setuptools is replacing distutils.")
    # Collect the names first: sys.modules must not change size while it is
    # being iterated.
    doomed = [
        name
        for name in sys.modules
        if name == "distutils" or name.startswith("distutils.")
    ]
    for name in doomed:
        del sys.modules[name]
|
34 |
+
|
35 |
+
|
36 |
+
def enabled():
    """
    Allow selection of distutils by environment variable.

    Returns True when ``SETUPTOOLS_USE_DISTUTILS`` is unset or equal to
    ``'local'``, and False for any other value.
    """
    return os.environ.get('SETUPTOOLS_USE_DISTUTILS', 'local') == 'local'
|
42 |
+
|
43 |
+
|
44 |
+
def ensure_local_distutils():
    """Re-import ``distutils`` while the shim finder is installed.

    Any previously imported ``distutils`` modules are cleared first, then
    ``distutils`` is imported inside the ``shim()`` context. Afterwards two
    sanity checks assert that ``distutils.core`` was loaded from a path
    containing ``_distutils`` and that ``setuptools._distutils.log`` was not
    imported under that name.

    :raises AssertionError: if either sanity check fails.
    """
    import importlib

    clear_distutils()

    # With the DistutilsMetaFinder in place,
    # perform an import to cause distutils to be
    # loaded from setuptools._distutils. Ref #2906.
    with shim():
        importlib.import_module('distutils')

    # check that submodules load as expected
    core = importlib.import_module('distutils.core')
    assert '_distutils' in core.__file__, core.__file__
    assert 'setuptools._distutils.log' not in sys.modules
|
59 |
+
|
60 |
+
|
61 |
+
def do_override():
    """
    Ensure that the local copy of distutils is preferred over stdlib.

    See https://github.com/pypa/setuptools/issues/417#issuecomment-392298401
    for more motivation.

    Nothing happens unless ``enabled()`` returns True.
    """
    if not enabled():
        return
    warn_distutils_present()
    ensure_local_distutils()
|
71 |
+
|
72 |
+
|
73 |
+
class _TrivialRe:
    """Minimal stand-in for a compiled pattern, based on substring checks."""

    def __init__(self, *patterns):
        # substrings that must all occur for ``match`` to succeed
        self._patterns = patterns

    def match(self, string):
        """Return True when every stored pattern is a substring of *string*."""
        return all(pat in string for pat in self._patterns)
|
79 |
+
|
80 |
+
|
81 |
+
class DistutilsMetaFinder:
    """Meta path finder that can serve ``distutils`` from ``setuptools._distutils``.

    ``find_spec`` dispatches on the requested module name to a
    ``spec_for_<fullname>`` attribute of the finder when one exists.
    """

    def find_spec(self, fullname, path, target=None):
        """Return the spec produced by ``spec_for_<fullname>``, or None.

        None is returned when the module is not top level (``path`` is not
        None) and its name does not start with ``test.``, or when no
        matching ``spec_for_`` handler exists.
        """
        # optimization: only consider top level modules and those
        # found in the CPython test suite.
        if path is not None and not fullname.startswith('test.'):
            return None

        method = getattr(self, f'spec_for_{fullname}', lambda: None)
        return method()

    def spec_for_distutils(self):
        """Build a spec that loads ``setuptools._distutils`` as ``distutils``.

        Returns None when ``is_cpython()`` is true or when
        ``setuptools._distutils`` cannot be imported.
        """
        if self.is_cpython():
            return None

        import importlib
        import importlib.abc
        import importlib.util

        try:
            mod = importlib.import_module('setuptools._distutils')
        except Exception:
            # There are a couple of cases where setuptools._distutils
            # may not be present:
            # - An older Setuptools without a local distutils is
            #   taking precedence. Ref #2957.
            # - Path manipulation during sitecustomize removes
            #   setuptools from the path but only after the hook
            #   has been loaded. Ref #2980.
            # In either case, fall back to stdlib behavior.
            return None

        class DistutilsLoader(importlib.abc.Loader):
            def create_module(self, spec):
                # hand back the already-imported module under its new name
                mod.__name__ = 'distutils'
                return mod

            def exec_module(self, module):
                # nothing to execute: the module was imported above
                pass

        return importlib.util.spec_from_loader(
            'distutils', DistutilsLoader(), origin=mod.__file__
        )

    @staticmethod
    def is_cpython():
        """
        Suppress supplying distutils for CPython (build and tests).
        Ref #2965 and #3007.

        True when a file named ``pybuilddir.txt`` exists in the current
        working directory.
        """
        return os.path.isfile('pybuilddir.txt')

    def spec_for_pip(self):
        """
        Ensure stdlib distutils when running under pip.
        See pypa/pip#8761 for rationale.

        Always returns None. Unless running on Python 3.12+ or pip is being
        imported from a build script, previously imported distutils modules
        are cleared and ``spec_for_distutils`` is disabled on this instance.
        """
        if sys.version_info >= (3, 12) or self.pip_imported_during_build():
            return
        clear_distutils()
        # shadow the method on the instance so later lookups yield no spec
        self.spec_for_distutils = lambda: None

    @classmethod
    def pip_imported_during_build(cls):
        """
        Detect if pip is being imported in a build script. Ref #2355.
        """
        import traceback

        return any(
            cls.frame_file_is_setup(frame) for frame, line in traceback.walk_stack(None)
        )

    @staticmethod
    def frame_file_is_setup(frame):
        """
        Return True if the indicated frame suggests a setup.py file.
        """
        # some frames may not have __file__ (#2940); also tolerate a
        # ``__file__`` that is present but set to None
        return (frame.f_globals.get('__file__') or '').endswith('setup.py')

    def spec_for_sensitive_tests(self):
        """
        Ensure stdlib distutils when running select tests under CPython.

        python/cpython#91169
        """
        clear_distutils()
        # shadow the method on the instance so later lookups yield no spec
        self.spec_for_distutils = lambda: None

    # module names for which a handler is looked up by ``find_spec``;
    # the list depends on the running Python version
    sensitive_tests = (
        [
            'test.test_distutils',
            'test.test_peg_generator',
            'test.test_importlib',
        ]
        if sys.version_info < (3, 10)
        else [
            'test.test_distutils',
        ]
    )
|
182 |
+
|
183 |
+
|
184 |
+
# Register ``spec_for_sensitive_tests`` under a ``spec_for_<name>`` attribute
# for every sensitive test module, which is the attribute name that
# ``find_spec`` looks up for a given module name.
for name in DistutilsMetaFinder.sensitive_tests:
    setattr(
        DistutilsMetaFinder,
        f'spec_for_{name}',
        DistutilsMetaFinder.spec_for_sensitive_tests,
    )


# Single shared finder instance that the shim helpers insert into and remove
# from ``sys.meta_path``.
DISTUTILS_FINDER = DistutilsMetaFinder()
|
193 |
+
|
194 |
+
|
195 |
+
def add_shim():
    """Insert the shared finder into ``sys.meta_path`` unless already present."""
    if DISTUTILS_FINDER not in sys.meta_path:
        insert_shim()
|
197 |
+
|
198 |
+
|
199 |
+
class shim:
    """Context manager that installs the finder on entry and removes it on exit."""

    def __enter__(self):
        insert_shim()

    def __exit__(self, exc, value, tb):
        # returns None, so an exception raised in the body propagates
        _remove_shim()
|
205 |
+
|
206 |
+
|
207 |
+
def insert_shim():
    """Put the shared finder at the front of ``sys.meta_path``."""
    sys.meta_path.insert(0, DISTUTILS_FINDER)
|
209 |
+
|
210 |
+
|
211 |
+
def _remove_shim():
    """Remove the shared finder from ``sys.meta_path`` if it is there."""
    try:
        sys.meta_path.remove(DISTUTILS_FINDER)
    except ValueError:
        # the finder was not installed; nothing to undo
        pass
|
216 |
+
|
217 |
+
|
218 |
+
if sys.version_info < (3, 12):
    # DistutilsMetaFinder can only be disabled in Python < 3.12 (PEP 632),
    # so the public ``remove_shim`` alias is only defined on those versions.
    remove_shim = _remove_shim
|
pproyect/test1/Lib/site-packages/_distutils_hack/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (7.61 kB). View file
|
|
pproyect/test1/Lib/site-packages/_distutils_hack/__pycache__/override.cpython-310.pyc
ADDED
Binary file (294 Bytes). View file
|
|
pproyect/test1/Lib/site-packages/_distutils_hack/override.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# Import _distutils_hack by name and run its do_override() hook.
__import__('_distutils_hack').do_override()
|
pproyect/test1/Lib/site-packages/_virtualenv.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:69ac3d8f27e679c81b94ab30b3b56e9cd138219b1ba94a1fa3606d5a76a1433d
|
3 |
+
size 18
|
pproyect/test1/Lib/site-packages/_virtualenv.py
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Patches that are applied at runtime to the virtual environment."""
|
2 |
+
|
3 |
+
from __future__ import annotations
|
4 |
+
|
5 |
+
import os
|
6 |
+
import sys
|
7 |
+
|
8 |
+
# Path of this module; patch_dist stores it alongside the overridden "prefix" option.
# (os.path.join with a single argument returns that argument unchanged.)
VIRTUALENV_PATCH_FILE = os.path.join(__file__)
|
9 |
+
|
10 |
+
|
11 |
+
def patch_dist(dist):
    """Wrap ``dist.Distribution.parse_config_files`` to sanitise install options.

    Distutils allows user to configure some arguments via a configuration file:
    https://docs.python.org/3/install/index.html#distutils-configuration-files.

    Some of these arguments though don't make sense in the context of the
    virtual environment files, so they are fixed up after the original
    ``parse_config_files`` has run: a configured ``prefix`` is replaced with
    the absolute ``sys.prefix`` and the ``install_*`` path overrides are dropped.

    ``dist`` is any object exposing a ``Distribution`` class; the class is
    modified in place and nothing is returned.
    """
    # we cannot allow some install config as that would get packages installed outside of the virtual environment
    old_parse_config_files = dist.Distribution.parse_config_files

    def parse_config_files(self, *args, **kwargs):
        result = old_parse_config_files(self, *args, **kwargs)
        install = self.get_option_dict("install")

        if "prefix" in install:  # the prefix governs where to install the libraries
            # stored as a 2-tuple: this patch file's path, then the new value
            # NOTE(review): presumably distutils' (source, value) option format - confirm
            install["prefix"] = VIRTUALENV_PATCH_FILE, os.path.abspath(sys.prefix)
        for base in ("purelib", "platlib", "headers", "scripts", "data"):
            # do not allow global configs to hijack venv paths
            install.pop(f"install_{base}", None)
        return result

    dist.Distribution.parse_config_files = parse_config_files
|
34 |
+
|
35 |
+
|
36 |
+
# Import hook that patches some modules to ignore configuration values that break package installation in case
|
37 |
+
# of virtual environments.
|
38 |
+
_DISTUTILS_PATCH = "distutils.dist", "setuptools.dist"
|
39 |
+
# https://docs.python.org/3/library/importlib.html#setting-up-an-importer
|
40 |
+
|
41 |
+
|
42 |
+
class _Finder:
    """A meta path finder that allows patching the imported distutils modules."""

    # name of the module this finder is currently resolving; None when idle.
    # find_spec checks it so the nested find_spec call below is not intercepted again.
    fullname = None

    # lock[0] is threading.Lock(), but initialized lazily to avoid importing threading very early at startup,
    # because there are gevent-based applications that need to be first to import threading by themselves.
    # See https://github.com/pypa/virtualenv/issues/1895 for details.
    lock = []  # noqa: RUF012

    def find_spec(self, fullname, path, target=None):  # noqa: ARG002
        """Return the real spec for a patched module with its loader hooked, else None.

        Only names listed in ``_DISTUTILS_PATCH`` are handled. For those, the
        spec is looked up with ``importlib.util.find_spec`` and the loader's
        ``exec_module`` (or ``load_module`` when ``exec_module`` is missing)
        is wrapped so that ``patch_dist`` runs once the module is loaded.
        """
        if fullname in _DISTUTILS_PATCH and self.fullname is None:  # noqa: PLR1702
            # initialize lock[0] lazily
            if not self.lock:
                import threading  # noqa: PLC0415

                lock = threading.Lock()
                # there is possibility that two threads T1 and T2 are simultaneously running into find_spec,
                # observing .lock as empty, and further going into hereby initialization. However due to the GIL,
                # list.append() operation is atomic and this way only one of the threads will "win" to put the lock
                # - that every thread will use - into .lock[0].
                # https://docs.python.org/3/faq/library.html#what-kinds-of-global-value-mutation-are-thread-safe
                self.lock.append(lock)

            from functools import partial  # noqa: PLC0415
            from importlib.util import find_spec  # noqa: PLC0415

            with self.lock[0]:
                self.fullname = fullname
                try:
                    spec = find_spec(fullname, path)
                    if spec is not None:
                        # https://www.python.org/dev/peps/pep-0451/#how-loading-will-work
                        is_new_api = hasattr(spec.loader, "exec_module")
                        func_name = "exec_module" if is_new_api else "load_module"
                        old = getattr(spec.loader, func_name)
                        func = self.exec_module if is_new_api else self.load_module
                        if old is not func:
                            try:  # noqa: SIM105
                                setattr(spec.loader, func_name, partial(func, old))
                            except AttributeError:
                                pass  # C-Extension loaders are r/o such as zipimporter with <3.7
                        return spec
                finally:
                    self.fullname = None
        return None

    @staticmethod
    def exec_module(old, module):
        """Run the original ``exec_module`` and then patch the module if it is a target."""
        old(module)
        if module.__name__ in _DISTUTILS_PATCH:
            patch_dist(module)

    @staticmethod
    def load_module(old, name):
        """Run the original ``load_module``, patch the result if it is a target, and return it."""
        module = old(name)
        if module.__name__ in _DISTUTILS_PATCH:
            patch_dist(module)
        return module
|
101 |
+
|
102 |
+
|
103 |
+
# Put the finder first on sys.meta_path so it sees imports before the finders already registered.
sys.meta_path.insert(0, _Finder())
|
pproyect/test1/Lib/site-packages/certifi-2024.2.2.dist-info/INSTALLER
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
pip
|
pproyect/test1/Lib/site-packages/certifi-2024.2.2.dist-info/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
This package contains a modified version of ca-bundle.crt:
|
2 |
+
|
3 |
+
ca-bundle.crt -- Bundle of CA Root Certificates
|
4 |
+
|
5 |
+
This is a bundle of X.509 certificates of public Certificate Authorities
|
6 |
+
(CA). These were automatically extracted from Mozilla's root certificates
|
7 |
+
file (certdata.txt). This file can be found in the mozilla source tree:
|
8 |
+
https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
|
9 |
+
It contains the certificates in PEM format and therefore
|
10 |
+
can be directly used with curl / libcurl / php_curl, or with
|
11 |
+
an Apache+mod_ssl webserver for SSL client authentication.
|
12 |
+
Just configure this file as the SSLCACertificateFile.#
|
13 |
+
|
14 |
+
***** BEGIN LICENSE BLOCK *****
|
15 |
+
This Source Code Form is subject to the terms of the Mozilla Public License,
|
16 |
+
v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
|
17 |
+
one at http://mozilla.org/MPL/2.0/.
|
18 |
+
|
19 |
+
***** END LICENSE BLOCK *****
|
20 |
+
@(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $
|
pproyect/test1/Lib/site-packages/certifi-2024.2.2.dist-info/METADATA
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Metadata-Version: 2.1
|
2 |
+
Name: certifi
|
3 |
+
Version: 2024.2.2
|
4 |
+
Summary: Python package for providing Mozilla's CA Bundle.
|
5 |
+
Home-page: https://github.com/certifi/python-certifi
|
6 |
+
Author: Kenneth Reitz
|
7 |
+
Author-email: [email protected]
|
8 |
+
License: MPL-2.0
|
9 |
+
Project-URL: Source, https://github.com/certifi/python-certifi
|
10 |
+
Classifier: Development Status :: 5 - Production/Stable
|
11 |
+
Classifier: Intended Audience :: Developers
|
12 |
+
Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
|
13 |
+
Classifier: Natural Language :: English
|
14 |
+
Classifier: Programming Language :: Python
|
15 |
+
Classifier: Programming Language :: Python :: 3
|
16 |
+
Classifier: Programming Language :: Python :: 3 :: Only
|
17 |
+
Classifier: Programming Language :: Python :: 3.6
|
18 |
+
Classifier: Programming Language :: Python :: 3.7
|
19 |
+
Classifier: Programming Language :: Python :: 3.8
|
20 |
+
Classifier: Programming Language :: Python :: 3.9
|
21 |
+
Classifier: Programming Language :: Python :: 3.10
|
22 |
+
Classifier: Programming Language :: Python :: 3.11
|
23 |
+
Requires-Python: >=3.6
|
24 |
+
License-File: LICENSE
|
25 |
+
|
26 |
+
Certifi: Python SSL Certificates
|
27 |
+
================================
|
28 |
+
|
29 |
+
Certifi provides Mozilla's carefully curated collection of Root Certificates for
|
30 |
+
validating the trustworthiness of SSL certificates while verifying the identity
|
31 |
+
of TLS hosts. It has been extracted from the `Requests`_ project.
|
32 |
+
|
33 |
+
Installation
|
34 |
+
------------
|
35 |
+
|
36 |
+
``certifi`` is available on PyPI. Simply install it with ``pip``::
|
37 |
+
|
38 |
+
$ pip install certifi
|
39 |
+
|
40 |
+
Usage
|
41 |
+
-----
|
42 |
+
|
43 |
+
To reference the installed certificate authority (CA) bundle, you can use the
|
44 |
+
built-in function::
|
45 |
+
|
46 |
+
>>> import certifi
|
47 |
+
|
48 |
+
>>> certifi.where()
|
49 |
+
'/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
|
50 |
+
|
51 |
+
Or from the command line::
|
52 |
+
|
53 |
+
$ python -m certifi
|
54 |
+
/usr/local/lib/python3.7/site-packages/certifi/cacert.pem
|
55 |
+
|
56 |
+
Enjoy!
|
57 |
+
|
58 |
+
.. _`Requests`: https://requests.readthedocs.io/en/master/
|
59 |
+
|
60 |
+
Addition/Removal of Certificates
|
61 |
+
--------------------------------
|
62 |
+
|
63 |
+
Certifi does not support any addition/removal or other modification of the
|
64 |
+
CA trust store content. This project is intended to provide a reliable and
|
65 |
+
highly portable root of trust to python deployments. Look to upstream projects
|
66 |
+
for methods to use alternate trust.
|
pproyect/test1/Lib/site-packages/certifi-2024.2.2.dist-info/RECORD
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
certifi-2024.2.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
2 |
+
certifi-2024.2.2.dist-info/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
|
3 |
+
certifi-2024.2.2.dist-info/METADATA,sha256=1noreLRChpOgeSj0uJT1mehiBl8ngh33Guc7KdvzYYM,2170
|
4 |
+
certifi-2024.2.2.dist-info/RECORD,,
|
5 |
+
certifi-2024.2.2.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
6 |
+
certifi-2024.2.2.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
|
7 |
+
certifi/__init__.py,sha256=ljtEx-EmmPpTe2SOd5Kzsujm_lUD0fKJVnE9gzce320,94
|
8 |
+
certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
|
9 |
+
certifi/__pycache__/__init__.cpython-310.pyc,,
|
10 |
+
certifi/__pycache__/__main__.cpython-310.pyc,,
|
11 |
+
certifi/__pycache__/core.cpython-310.pyc,,
|
12 |
+
certifi/cacert.pem,sha256=ejR8qP724p-CtuR4U1WmY1wX-nVeCUD2XxWqj8e9f5I,292541
|
13 |
+
certifi/core.py,sha256=qRDDFyXVJwTB_EmoGppaXU_R9qCZvhl-EzxPMuV3nTA,4426
|
14 |
+
certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
pproyect/test1/Lib/site-packages/certifi-2024.2.2.dist-info/WHEEL
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Wheel-Version: 1.0
|
2 |
+
Generator: bdist_wheel (0.42.0)
|
3 |
+
Root-Is-Purelib: true
|
4 |
+
Tag: py3-none-any
|
5 |
+
|
pproyect/test1/Lib/site-packages/certifi-2024.2.2.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
certifi
|
pproyect/test1/Lib/site-packages/certifi/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .core import contents, where
|
2 |
+
|
3 |
+
__all__ = ["contents", "where"]
|
4 |
+
__version__ = "2024.02.02"
|
pproyect/test1/Lib/site-packages/certifi/__main__.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Command-line entry point: print the CA bundle location, or the bundle
# itself when -c/--contents is given.
import argparse

from certifi import contents, where


def main() -> None:
    """Parse the command line and print the bundle path or its contents."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--contents", action="store_true")
    args = parser.parse_args()

    if args.contents:
        print(contents())
    else:
        print(where())


# Guarded so that merely importing this module does not parse sys.argv.
if __name__ == "__main__":
    main()
|
pproyect/test1/Lib/site-packages/certifi/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (348 Bytes). View file
|
|
pproyect/test1/Lib/site-packages/certifi/__pycache__/__main__.cpython-310.pyc
ADDED
Binary file (486 Bytes). View file
|
|
pproyect/test1/Lib/site-packages/certifi/__pycache__/core.cpython-310.pyc
ADDED
Binary file (2.18 kB). View file
|
|
pproyect/test1/Lib/site-packages/certifi/cacert.pem
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pproyect/test1/Lib/site-packages/certifi/core.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
certifi.py
|
3 |
+
~~~~~~~~~~
|
4 |
+
|
5 |
+
This module returns the installation location of cacert.pem or its contents.
|
6 |
+
"""
|
7 |
+
import sys
|
8 |
+
import atexit
|
9 |
+
|
10 |
+
def exit_cacert_ctx() -> None:
    """Leave the context manager held in the module-level ``_CACERT_CTX``.

    Registered with ``atexit`` by ``where()`` after it has entered the
    context manager.
    """
    _CACERT_CTX.__exit__(None, None, None)  # type: ignore[union-attr]
|
12 |
+
|
13 |
+
|
14 |
+
if sys.version_info >= (3, 11):

    from importlib.resources import as_file, files

    # context manager and resolved path are cached at module level by where()
    _CACERT_CTX = None
    _CACERT_PATH = None

    def where() -> str:
        """Return the filesystem path of ``cacert.pem``, resolving it on first use."""
        # This is slightly terrible, but we want to delay extracting the file
        # in cases where we're inside of a zipimport situation until someone
        # actually calls where(), but we don't want to re-extract the file
        # on every call of where(), so we'll do it once then store it in a
        # global variable.
        global _CACERT_CTX
        global _CACERT_PATH
        if _CACERT_PATH is None:
            # This is slightly janky, the importlib.resources API wants you to
            # manage the cleanup of this file, so it doesn't actually return a
            # path, it returns a context manager that will give you the path
            # when you enter it and will do any cleanup when you leave it. In
            # the common case of not needing a temporary file, it will just
            # return the file system location and the __exit__() is a no-op.
            #
            # We also have to hold onto the actual context manager, because
            # it will do the cleanup whenever it gets garbage collected, so
            # we will also store that at the global level as well.
            _CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
            _CACERT_PATH = str(_CACERT_CTX.__enter__())
            atexit.register(exit_cacert_ctx)

        return _CACERT_PATH

    def contents() -> str:
        """Return the text of ``cacert.pem`` decoded as ASCII."""
        return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")

elif sys.version_info >= (3, 7):

    from importlib.resources import path as get_path, read_text

    # context manager and resolved path are cached at module level by where()
    _CACERT_CTX = None
    _CACERT_PATH = None

    def where() -> str:
        """Return the filesystem path of ``cacert.pem``, resolving it on first use."""
        # This is slightly terrible, but we want to delay extracting the
        # file in cases where we're inside of a zipimport situation until
        # someone actually calls where(), but we don't want to re-extract
        # the file on every call of where(), so we'll do it once then store
        # it in a global variable.
        global _CACERT_CTX
        global _CACERT_PATH
        if _CACERT_PATH is None:
            # This is slightly janky, the importlib.resources API wants you
            # to manage the cleanup of this file, so it doesn't actually
            # return a path, it returns a context manager that will give
            # you the path when you enter it and will do any cleanup when
            # you leave it. In the common case of not needing a temporary
            # file, it will just return the file system location and the
            # __exit__() is a no-op.
            #
            # We also have to hold onto the actual context manager, because
            # it will do the cleanup whenever it gets garbage collected, so
            # we will also store that at the global level as well.
            _CACERT_CTX = get_path("certifi", "cacert.pem")
            _CACERT_PATH = str(_CACERT_CTX.__enter__())
            atexit.register(exit_cacert_ctx)

        return _CACERT_PATH

    def contents() -> str:
        """Return the text of ``cacert.pem`` decoded as ASCII."""
        return read_text("certifi", "cacert.pem", encoding="ascii")

else:
    import os
    import types
    from typing import Union

    Package = Union[types.ModuleType, str]
    Resource = Union[str, "os.PathLike"]

    # This fallback will work for Python versions prior to 3.7 that lack the
    # importlib.resources module but relies on the existing `where` function
    # so won't address issues with environments like PyOxidizer that don't set
    # __file__ on modules.
    def read_text(
        package: Package,
        resource: Resource,
        encoding: str = 'utf-8',
        errors: str = 'strict'
    ) -> str:
        """Read the file returned by ``where()``; *package* and *resource* are ignored."""
        # ``errors`` is forwarded so the parameter actually takes effect
        with open(where(), encoding=encoding, errors=errors) as data:
            return data.read()

    # If we don't have importlib.resources, then we will just do the old logic
    # of assuming we're on the filesystem and munge the path directly.
    def where() -> str:
        """Return the path of ``cacert.pem`` next to this module's file."""
        f = os.path.dirname(__file__)

        return os.path.join(f, "cacert.pem")

    def contents() -> str:
        """Return the text of ``cacert.pem`` decoded as ASCII."""
        return read_text("certifi", "cacert.pem", encoding="ascii")
|
pproyect/test1/Lib/site-packages/certifi/py.typed
ADDED
File without changes
|
pproyect/test1/Lib/site-packages/charset_normalizer-3.3.2.dist-info/INSTALLER
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
pip
|
pproyect/test1/Lib/site-packages/charset_normalizer-3.3.2.dist-info/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2019 TAHRI Ahmed R.
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
pproyect/test1/Lib/site-packages/charset_normalizer-3.3.2.dist-info/METADATA
ADDED
@@ -0,0 +1,683 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Metadata-Version: 2.1
|
2 |
+
Name: charset-normalizer
|
3 |
+
Version: 3.3.2
|
4 |
+
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
|
5 |
+
Home-page: https://github.com/Ousret/charset_normalizer
|
6 |
+
Author: Ahmed TAHRI
|
7 |
+
Author-email: [email protected]
|
8 |
+
License: MIT
|
9 |
+
Project-URL: Bug Reports, https://github.com/Ousret/charset_normalizer/issues
|
10 |
+
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/en/latest
|
11 |
+
Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
|
12 |
+
Classifier: Development Status :: 5 - Production/Stable
|
13 |
+
Classifier: License :: OSI Approved :: MIT License
|
14 |
+
Classifier: Intended Audience :: Developers
|
15 |
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
16 |
+
Classifier: Operating System :: OS Independent
|
17 |
+
Classifier: Programming Language :: Python
|
18 |
+
Classifier: Programming Language :: Python :: 3
|
19 |
+
Classifier: Programming Language :: Python :: 3.7
|
20 |
+
Classifier: Programming Language :: Python :: 3.8
|
21 |
+
Classifier: Programming Language :: Python :: 3.9
|
22 |
+
Classifier: Programming Language :: Python :: 3.10
|
23 |
+
Classifier: Programming Language :: Python :: 3.11
|
24 |
+
Classifier: Programming Language :: Python :: 3.12
|
25 |
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
26 |
+
Classifier: Topic :: Text Processing :: Linguistic
|
27 |
+
Classifier: Topic :: Utilities
|
28 |
+
Classifier: Typing :: Typed
|
29 |
+
Requires-Python: >=3.7.0
|
30 |
+
Description-Content-Type: text/markdown
|
31 |
+
License-File: LICENSE
|
32 |
+
Provides-Extra: unicode_backport
|
33 |
+
|
34 |
+
<h1 align="center">Charset Detection, for Everyone 👋</h1>
|
35 |
+
|
36 |
+
<p align="center">
|
37 |
+
<sup>The Real First Universal Charset Detector</sup><br>
|
38 |
+
<a href="https://pypi.org/project/charset-normalizer">
|
39 |
+
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
|
40 |
+
</a>
|
41 |
+
<a href="https://pepy.tech/project/charset-normalizer/">
|
42 |
+
<img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
|
43 |
+
</a>
|
44 |
+
<a href="https://bestpractices.coreinfrastructure.org/projects/7297">
|
45 |
+
<img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
|
46 |
+
</a>
|
47 |
+
</p>
|
48 |
+
<p align="center">
|
49 |
+
<sup><i>Featured Packages</i></sup><br>
|
50 |
+
<a href="https://github.com/jawah/niquests">
|
51 |
+
<img alt="Static Badge" src="https://img.shields.io/badge/Niquests-HTTP_1.1%2C%202%2C_and_3_Client-cyan">
|
52 |
+
</a>
|
53 |
+
<a href="https://github.com/jawah/wassima">
|
54 |
+
<img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Killer-cyan">
|
55 |
+
</a>
|
56 |
+
</p>
|
57 |
+
<p align="center">
|
58 |
+
<sup><i>In other language (unofficial port - by the community)</i></sup><br>
|
59 |
+
<a href="https://github.com/nickspring/charset-normalizer-rs">
|
60 |
+
<img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
|
61 |
+
</a>
|
62 |
+
</p>
|
63 |
+
|
64 |
+
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
|
65 |
+
> I'm trying to resolve the issue by taking a new approach.
|
66 |
+
> All IANA character set names for which the Python core library provides codecs are supported.
|
67 |
+
|
68 |
+
<p align="center">
|
69 |
+
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
|
70 |
+
</p>
|
71 |
+
|
72 |
+
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
|
73 |
+
|
74 |
+
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|
75 |
+
|--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
|
76 |
+
| `Fast` | ❌ | ✅ | ✅ |
|
77 |
+
| `Universal**` | ❌ | ✅ | ❌ |
|
78 |
+
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
|
79 |
+
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
|
80 |
+
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
|
81 |
+
| `Native Python` | ✅ | ✅ | ❌ |
|
82 |
+
| `Detect spoken language` | ❌ | ✅ | N/A |
|
83 |
+
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
|
84 |
+
| `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
|
85 |
+
| `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
|
86 |
+
|
87 |
+
<p align="center">
|
88 |
+
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
|
89 |
+
</p>
|
90 |
+
|
91 |
+
*\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*<br>
|
92 |
+
Did you get here because of the logs? See [https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html](https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html)
|
93 |
+
|
94 |
+
## ⚡ Performance
|
95 |
+
|
96 |
+
This package offers better performance than its counterpart Chardet. Here are some numbers.
|
97 |
+
|
98 |
+
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|
99 |
+
|-----------------------------------------------|:--------:|:------------------:|:------------------:|
|
100 |
+
| [chardet](https://github.com/chardet/chardet) | 86 % | 200 ms | 5 file/sec |
|
101 |
+
| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
|
102 |
+
|
103 |
+
| Package | 99th percentile | 95th percentile | 50th percentile |
|
104 |
+
|-----------------------------------------------|:---------------:|:---------------:|:---------------:|
|
105 |
+
| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
|
106 |
+
| charset-normalizer | 100 ms | 50 ms | 5 ms |
|
107 |
+
|
108 |
+
Chardet's performance on larger files (1MB+) is very poor. Expect a huge difference on large payloads.
|
109 |
+
|
110 |
+
> Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
|
111 |
+
> And yes, these results might change at any time. The dataset can be updated to include more files.
|
112 |
+
> The actual delays depend heavily on your CPU capabilities. The factors should remain the same.
|
113 |
+
> Keep in mind that the stats are generous and that Chardet accuracy vs ours is measured using Chardet's initial capability
|
114 |
+
> (eg. Supported Encoding) Challenge-them if you want.
|
115 |
+
|
116 |
+
## ✨ Installation
|
117 |
+
|
118 |
+
Using pip:
|
119 |
+
|
120 |
+
```sh
|
121 |
+
pip install charset-normalizer -U
|
122 |
+
```
|
123 |
+
|
124 |
+
## 🚀 Basic Usage
|
125 |
+
|
126 |
+
### CLI
|
127 |
+
This package comes with a CLI.
|
128 |
+
|
129 |
+
```
|
130 |
+
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
|
131 |
+
file [file ...]
|
132 |
+
|
133 |
+
The Real First Universal Charset Detector. Discover originating encoding used
|
134 |
+
on text file. Normalize text to unicode.
|
135 |
+
|
136 |
+
positional arguments:
|
137 |
+
files File(s) to be analysed
|
138 |
+
|
139 |
+
optional arguments:
|
140 |
+
-h, --help show this help message and exit
|
141 |
+
-v, --verbose Display complementary information about file if any.
|
142 |
+
Stdout will contain logs about the detection process.
|
143 |
+
-a, --with-alternative
|
144 |
+
Output complementary possibilities if any. Top-level
|
145 |
+
JSON WILL be a list.
|
146 |
+
-n, --normalize Permit to normalize input file. If not set, program
|
147 |
+
does not write anything.
|
148 |
+
-m, --minimal Only output the charset detected to STDOUT. Disabling
|
149 |
+
JSON output.
|
150 |
+
-r, --replace Replace file when trying to normalize it instead of
|
151 |
+
creating a new one.
|
152 |
+
-f, --force Replace file without asking if you are sure, use this
|
153 |
+
flag with caution.
|
154 |
+
-t THRESHOLD, --threshold THRESHOLD
|
155 |
+
Define a custom maximum amount of chaos allowed in
|
156 |
+
decoded content. 0. <= chaos <= 1.
|
157 |
+
--version Show version information and exit.
|
158 |
+
```
|
159 |
+
|
160 |
+
```bash
|
161 |
+
normalizer ./data/sample.1.fr.srt
|
162 |
+
```
|
163 |
+
|
164 |
+
or
|
165 |
+
|
166 |
+
```bash
|
167 |
+
python -m charset_normalizer ./data/sample.1.fr.srt
|
168 |
+
```
|
169 |
+
|
170 |
+
🎉 Since version 1.4.0 the CLI produces an easily usable stdout result in JSON format.
|
171 |
+
|
172 |
+
```json
|
173 |
+
{
|
174 |
+
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
|
175 |
+
"encoding": "cp1252",
|
176 |
+
"encoding_aliases": [
|
177 |
+
"1252",
|
178 |
+
"windows_1252"
|
179 |
+
],
|
180 |
+
"alternative_encodings": [
|
181 |
+
"cp1254",
|
182 |
+
"cp1256",
|
183 |
+
"cp1258",
|
184 |
+
"iso8859_14",
|
185 |
+
"iso8859_15",
|
186 |
+
"iso8859_16",
|
187 |
+
"iso8859_3",
|
188 |
+
"iso8859_9",
|
189 |
+
"latin_1",
|
190 |
+
"mbcs"
|
191 |
+
],
|
192 |
+
"language": "French",
|
193 |
+
"alphabets": [
|
194 |
+
"Basic Latin",
|
195 |
+
"Latin-1 Supplement"
|
196 |
+
],
|
197 |
+
"has_sig_or_bom": false,
|
198 |
+
"chaos": 0.149,
|
199 |
+
"coherence": 97.152,
|
200 |
+
"unicode_path": null,
|
201 |
+
"is_preferred": true
|
202 |
+
}
|
203 |
+
```
|
204 |
+
|
205 |
+
### Python
|
206 |
+
*Just print out normalized text*
|
207 |
+
```python
|
208 |
+
from charset_normalizer import from_path
|
209 |
+
|
210 |
+
results = from_path('./my_subtitle.srt')
|
211 |
+
|
212 |
+
print(str(results.best()))
|
213 |
+
```
|
214 |
+
|
215 |
+
*Upgrade your code without effort*
|
216 |
+
```python
|
217 |
+
from charset_normalizer import detect
|
218 |
+
```
|
219 |
+
|
220 |
+
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
|
221 |
+
|
222 |
+
See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
|
223 |
+
|
224 |
+
## 😇 Why
|
225 |
+
|
226 |
+
When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
|
227 |
+
reliable alternative using a completely different method. Also! I never back down on a good challenge!
|
228 |
+
|
229 |
+
I **don't care** about the **originating charset** encoding, because **two different tables** can
|
230 |
+
produce **two identical rendered strings.**
|
231 |
+
What I want is to get readable text, the best I can.
|
232 |
+
|
233 |
+
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
|
234 |
+
|
235 |
+
Don't confuse the package **ftfy** with charset-normalizer or chardet. ftfy's goal is to repair Unicode strings, whereas charset-normalizer's goal is to convert raw files in an unknown encoding to Unicode.
|
236 |
+
|
237 |
+
## 🍰 How
|
238 |
+
|
239 |
+
- Discard all charset encoding tables that could not fit the binary content.
|
240 |
+
- Measure noise, or the mess once opened (by chunks) with a corresponding charset encoding.
|
241 |
+
- Extract matches with the lowest mess detected.
|
242 |
+
- Additionally, we measure coherence / probe for a language.
|
243 |
+
|
244 |
+
**Wait a minute**, what is noise/mess and coherence according to **YOU ?**
|
245 |
+
|
246 |
+
*Noise :* I opened hundreds of text files, **written by humans**, with the wrong encoding table. **I observed**, then
|
247 |
+
**I established** some ground rules about **what is obvious** when **it seems like** a mess.
|
248 |
+
I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
|
249 |
+
improve or rewrite it.
|
250 |
+
|
251 |
+
*Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
|
252 |
+
that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
|
253 |
+
|
254 |
+
## ⚡ Known limitations
|
255 |
+
|
256 |
+
- Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
|
257 |
+
- Every charset detector heavily depends on sufficient content. In common cases, do not bother running detection on very tiny content.
|
258 |
+
|
259 |
+
## ⚠️ About Python EOLs
|
260 |
+
|
261 |
+
**If you are running:**
|
262 |
+
|
263 |
+
- Python >=2.7,<3.5: Unsupported
|
264 |
+
- Python 3.5: charset-normalizer < 2.1
|
265 |
+
- Python 3.6: charset-normalizer < 3.1
|
266 |
+
- Python 3.7: charset-normalizer < 4.0
|
267 |
+
|
268 |
+
Upgrade your Python interpreter as soon as possible.
|
269 |
+
|
270 |
+
## 👤 Contributing
|
271 |
+
|
272 |
+
Contributions, issues and feature requests are very much welcome.<br />
|
273 |
+
Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
|
274 |
+
|
275 |
+
## 📝 License
|
276 |
+
|
277 |
+
Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
|
278 |
+
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
|
279 |
+
|
280 |
+
Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
|
281 |
+
|
282 |
+
## 💼 For Enterprise
|
283 |
+
|
284 |
+
Professional support for charset-normalizer is available as part of the [Tidelift
|
285 |
+
Subscription][1]. Tidelift gives software development teams a single source for
|
286 |
+
purchasing and maintaining their software, with professional grade assurances
|
287 |
+
from the experts who know it best, while seamlessly integrating with existing
|
288 |
+
tools.
|
289 |
+
|
290 |
+
[1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
|
291 |
+
|
292 |
+
# Changelog
|
293 |
+
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
294 |
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
295 |
+
|
296 |
+
## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
|
297 |
+
|
298 |
+
### Fixed
|
299 |
+
- Unintentional memory usage regression when using large payload that match several encoding (#376)
|
300 |
+
- Regression on some detection case showcased in the documentation (#371)
|
301 |
+
|
302 |
+
### Added
|
303 |
+
- Noise (md) probe that identify malformed arabic representation due to the presence of letters in isolated form (credit to my wife)
|
304 |
+
|
305 |
+
## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
|
306 |
+
|
307 |
+
### Changed
|
308 |
+
- Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
|
309 |
+
- Improved the general detection reliability based on reports from the community
|
310 |
+
|
311 |
+
## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
|
312 |
+
|
313 |
+
### Added
|
314 |
+
- Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
|
315 |
+
- Support for 9 forgotten encoding that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
|
316 |
+
|
317 |
+
### Removed
|
318 |
+
- (internal) Redundant utils.is_ascii function and unused function is_private_use_only
|
319 |
+
- (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
|
320 |
+
|
321 |
+
### Changed
|
322 |
+
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
|
323 |
+
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
|
324 |
+
|
325 |
+
### Fixed
|
326 |
+
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
|
327 |
+
|
328 |
+
## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
|
329 |
+
|
330 |
+
### Changed
|
331 |
+
- Typehint for function `from_path` no longer enforce `PathLike` as its first argument
|
332 |
+
- Minor improvement over the global detection reliability
|
333 |
+
|
334 |
+
### Added
|
335 |
+
- Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
|
336 |
+
- Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
|
337 |
+
- Explicit support for Python 3.12
|
338 |
+
|
339 |
+
### Fixed
|
340 |
+
- Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
|
341 |
+
|
342 |
+
## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
|
343 |
+
|
344 |
+
### Added
|
345 |
+
- Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
|
346 |
+
|
347 |
+
### Removed
|
348 |
+
- Support for Python 3.6 (PR #260)
|
349 |
+
|
350 |
+
### Changed
|
351 |
+
- Optional speedup provided by mypy/c 1.0.1
|
352 |
+
|
353 |
+
## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
|
354 |
+
|
355 |
+
### Fixed
|
356 |
+
- Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
|
357 |
+
|
358 |
+
### Changed
|
359 |
+
- Speedup provided by mypy/c 0.990 on Python >= 3.7
|
360 |
+
|
361 |
+
## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
|
362 |
+
|
363 |
+
### Added
|
364 |
+
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
|
365 |
+
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
366 |
+
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
367 |
+
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
|
368 |
+
|
369 |
+
### Changed
|
370 |
+
- Build with static metadata using 'build' frontend
|
371 |
+
- Make the language detection stricter
|
372 |
+
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
373 |
+
|
374 |
+
### Fixed
|
375 |
+
- CLI with opt --normalize fail when using full path for files
|
376 |
+
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
|
377 |
+
- Sphinx warnings when generating the documentation
|
378 |
+
|
379 |
+
### Removed
|
380 |
+
- Coherence detector no longer return 'Simple English' instead return 'English'
|
381 |
+
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
382 |
+
- Breaking: Method `first()` and `best()` from CharsetMatch
|
383 |
+
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
384 |
+
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
385 |
+
- Breaking: Top-level function `normalize`
|
386 |
+
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
387 |
+
- Support for the backport `unicodedata2`
|
388 |
+
|
389 |
+
## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
|
390 |
+
|
391 |
+
### Added
|
392 |
+
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
|
393 |
+
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
394 |
+
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
395 |
+
|
396 |
+
### Changed
|
397 |
+
- Build with static metadata using 'build' frontend
|
398 |
+
- Make the language detection stricter
|
399 |
+
|
400 |
+
### Fixed
|
401 |
+
- CLI with opt --normalize fail when using full path for files
|
402 |
+
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
|
403 |
+
|
404 |
+
### Removed
|
405 |
+
- Coherence detector no longer return 'Simple English' instead return 'English'
|
406 |
+
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
407 |
+
|
408 |
+
## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
|
409 |
+
|
410 |
+
### Added
|
411 |
+
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
|
412 |
+
|
413 |
+
### Removed
|
414 |
+
- Breaking: Method `first()` and `best()` from CharsetMatch
|
415 |
+
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
416 |
+
|
417 |
+
### Fixed
|
418 |
+
- Sphinx warnings when generating the documentation
|
419 |
+
|
420 |
+
## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
|
421 |
+
|
422 |
+
### Changed
|
423 |
+
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
424 |
+
|
425 |
+
### Removed
|
426 |
+
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
427 |
+
- Breaking: Top-level function `normalize`
|
428 |
+
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
429 |
+
- Support for the backport `unicodedata2`
|
430 |
+
|
431 |
+
## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
|
432 |
+
|
433 |
+
### Deprecated
|
434 |
+
- Function `normalize` scheduled for removal in 3.0
|
435 |
+
|
436 |
+
### Changed
|
437 |
+
- Removed useless call to decode in fn is_unprintable (#206)
|
438 |
+
|
439 |
+
### Fixed
|
440 |
+
- Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
|
441 |
+
|
442 |
+
## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
|
443 |
+
|
444 |
+
### Added
|
445 |
+
- Output the Unicode table version when running the CLI with `--version` (PR #194)
|
446 |
+
|
447 |
+
### Changed
|
448 |
+
- Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
|
449 |
+
- Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
|
450 |
+
|
451 |
+
### Fixed
|
452 |
+
- Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
|
453 |
+
- CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
|
454 |
+
|
455 |
+
### Removed
|
456 |
+
- Support for Python 3.5 (PR #192)
|
457 |
+
|
458 |
+
### Deprecated
|
459 |
+
- Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
|
460 |
+
|
461 |
+
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
|
462 |
+
|
463 |
+
### Fixed
|
464 |
+
- ASCII miss-detection on rare cases (PR #170)
|
465 |
+
|
466 |
+
## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
|
467 |
+
|
468 |
+
### Added
|
469 |
+
- Explicit support for Python 3.11 (PR #164)
|
470 |
+
|
471 |
+
### Changed
|
472 |
+
- The logging behavior have been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
|
473 |
+
|
474 |
+
## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
|
475 |
+
|
476 |
+
### Fixed
|
477 |
+
- Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
|
478 |
+
|
479 |
+
### Changed
|
480 |
+
- Skipping the language-detection (CD) on ASCII (PR #155)
|
481 |
+
|
482 |
+
## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
|
483 |
+
|
484 |
+
### Changed
|
485 |
+
- Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
|
486 |
+
|
487 |
+
### Fixed
|
488 |
+
- Wrong logging level applied when setting kwarg `explain` to True (PR #146)
|
489 |
+
|
490 |
+
## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
|
491 |
+
### Changed
|
492 |
+
- Improvement over Vietnamese detection (PR #126)
|
493 |
+
- MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
|
494 |
+
- Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
|
495 |
+
- call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
|
496 |
+
- Code style as refactored by Sourcery-AI (PR #131)
|
497 |
+
- Minor adjustment on the MD around european words (PR #133)
|
498 |
+
- Remove and replace SRTs from assets / tests (PR #139)
|
499 |
+
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
500 |
+
- Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
|
501 |
+
|
502 |
+
### Fixed
|
503 |
+
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
|
504 |
+
- Avoid using too insignificant chunk (PR #137)
|
505 |
+
|
506 |
+
### Added
|
507 |
+
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
508 |
+
- Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
|
509 |
+
|
510 |
+
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
|
511 |
+
### Added
|
512 |
+
- Add support for Kazakh (Cyrillic) language detection (PR #109)
|
513 |
+
|
514 |
+
### Changed
|
515 |
+
- Further, improve inferring the language from a given single-byte code page (PR #112)
|
516 |
+
- Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
|
517 |
+
- Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
|
518 |
+
- Various detection improvement (MD+CD) (PR #117)
|
519 |
+
|
520 |
+
### Removed
|
521 |
+
- Remove redundant logging entry about detected language(s) (PR #115)
|
522 |
+
|
523 |
+
### Fixed
|
524 |
+
- Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
|
525 |
+
|
526 |
+
## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
|
527 |
+
### Fixed
|
528 |
+
- Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
|
529 |
+
- Fix CLI crash when using --minimal output in certain cases (PR #103)
|
530 |
+
|
531 |
+
### Changed
|
532 |
+
- Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
|
533 |
+
|
534 |
+
## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
|
535 |
+
### Changed
|
536 |
+
- The project now comply with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
|
537 |
+
- The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
|
538 |
+
- The Unicode detection is slightly improved (PR #93)
|
539 |
+
- Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
|
540 |
+
|
541 |
+
### Removed
|
542 |
+
- The project no longer raise warning on tiny content given for detection, will be simply logged as warning instead (PR #92)
|
543 |
+
|
544 |
+
### Fixed
|
545 |
+
- In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
|
546 |
+
- Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
|
547 |
+
- The MANIFEST.in was not exhaustive (PR #78)
|
548 |
+
|
549 |
+
## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
|
550 |
+
### Fixed
|
551 |
+
- The CLI no longer raise an unexpected exception when no encoding has been found (PR #70)
|
552 |
+
- Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
|
553 |
+
- The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
|
554 |
+
- Submatch factoring could be wrong in rare edge cases (PR #72)
|
555 |
+
- Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
|
556 |
+
- Fix line endings from CRLF to LF for certain project files (PR #67)
|
557 |
+
|
558 |
+
### Changed
|
559 |
+
- Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
|
560 |
+
- Allow fallback on specified encoding if any (PR #71)
|
561 |
+
|
562 |
+
## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
|
563 |
+
### Changed
|
564 |
+
- Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
|
565 |
+
- According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
|
566 |
+
|
567 |
+
## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
|
568 |
+
### Fixed
|
569 |
+
- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
|
570 |
+
|
571 |
+
### Changed
|
572 |
+
- Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
|
573 |
+
|
574 |
+
## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
|
575 |
+
### Fixed
|
576 |
+
- Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
|
577 |
+
- Using explain=False permanently disabled the verbose output in the current runtime (PR #47)
|
578 |
+
- One log entry (language target preemptive) was not shown in logs when using explain=True (PR #47)
|
579 |
+
- Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
|
580 |
+
|
581 |
+
### Changed
|
582 |
+
- Public function normalize default args values were not aligned with from_bytes (PR #53)
|
583 |
+
|
584 |
+
### Added
|
585 |
+
- You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
|
586 |
+
|
587 |
+
## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
|
588 |
+
### Changed
|
589 |
+
- 4 to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
|
590 |
+
- Emphasis has been put on UTF-8 detection, which should now be nearly instantaneous.
|
591 |
+
- The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
|
592 |
+
- The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
|
593 |
+
- The program has been rewritten to ease the readability and maintainability. (Using static typing)
|
594 |
+
- utf_7 detection has been reinstated.
|
595 |
+
|
596 |
+
### Removed
|
597 |
+
- This package no longer requires anything when used with Python 3.5 (Dropped cached_property)
|
598 |
+
- Removed support for these languages: Catalan, Esperanto, Kazakh, Basque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
|
599 |
+
- The exception hook on UnicodeDecodeError has been removed.
|
600 |
+
|
601 |
+
### Deprecated
|
602 |
+
- Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
|
603 |
+
|
604 |
+
### Fixed
|
605 |
+
- The CLI output used the relative path of the file(s). Should be absolute.
|
606 |
+
|
607 |
+
## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
|
608 |
+
### Fixed
|
609 |
+
- Logger configuration/usage no longer conflict with others (PR #44)
|
610 |
+
|
611 |
+
## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
|
612 |
+
### Removed
|
613 |
+
- Using standard logging instead of using the package loguru.
|
614 |
+
- Dropping nose test framework in favor of the maintained pytest.
|
615 |
+
- Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
|
616 |
+
- Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
|
617 |
+
- Stop support for UTF-7 that does not contain a SIG.
|
618 |
+
- Dropping PrettyTable, replaced with pure JSON output in CLI.
|
619 |
+
|
620 |
+
### Fixed
|
621 |
+
- BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
|
622 |
+
- Not searching properly for the BOM when trying utf32/16 parent codec.
|
623 |
+
|
624 |
+
### Changed
|
625 |
+
- Improving the package final size by compressing frequencies.json.
|
626 |
+
- Huge improvement on the largest payloads.
|
627 |
+
|
628 |
+
### Added
|
629 |
+
- CLI now produces JSON consumable output.
|
630 |
+
- Return ASCII if given sequences fit. Given reasonable confidence.
|
631 |
+
|
632 |
+
## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
|
633 |
+
|
634 |
+
### Fixed
|
635 |
+
- In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
|
636 |
+
|
637 |
+
## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
|
638 |
+
|
639 |
+
### Fixed
|
640 |
+
- Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
|
641 |
+
|
642 |
+
## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
|
643 |
+
|
644 |
+
### Fixed
|
645 |
+
- The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
|
646 |
+
|
647 |
+
## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
|
648 |
+
|
649 |
+
### Changed
|
650 |
+
- Amend the previous release to allow prettytable 2.0 (PR #35)
|
651 |
+
|
652 |
+
## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
|
653 |
+
|
654 |
+
### Fixed
|
655 |
+
- Fix error while using the package with a python pre-release interpreter (PR #33)
|
656 |
+
|
657 |
+
### Changed
|
658 |
+
- Dependencies refactoring, constraints revised.
|
659 |
+
|
660 |
+
### Added
|
661 |
+
- Add python 3.9 and 3.10 to the supported interpreters
|
662 |
+
|
663 |
+
MIT License
|
664 |
+
|
665 |
+
Copyright (c) 2019 TAHRI Ahmed R.
|
666 |
+
|
667 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
668 |
+
of this software and associated documentation files (the "Software"), to deal
|
669 |
+
in the Software without restriction, including without limitation the rights
|
670 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
671 |
+
copies of the Software, and to permit persons to whom the Software is
|
672 |
+
furnished to do so, subject to the following conditions:
|
673 |
+
|
674 |
+
The above copyright notice and this permission notice shall be included in all
|
675 |
+
copies or substantial portions of the Software.
|
676 |
+
|
677 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
678 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
679 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
680 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
681 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
682 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
683 |
+
SOFTWARE.
|
pproyect/test1/Lib/site-packages/charset_normalizer-3.3.2.dist-info/RECORD
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
../../Scripts/normalizer.exe,sha256=NYL_TSd1oUrn6Lj_mIPac64bg_01rxDFMuU1YJ76nW4,108494
|
2 |
+
charset_normalizer-3.3.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
3 |
+
charset_normalizer-3.3.2.dist-info/LICENSE,sha256=znnj1Var_lZ-hzOvD5W50wcQDp9qls3SD2xIau88ufc,1090
|
4 |
+
charset_normalizer-3.3.2.dist-info/METADATA,sha256=hHDqDpXmQH3f8XSn30NlqB3R3NuhJzXC0zABqFwA6Nk,34233
|
5 |
+
charset_normalizer-3.3.2.dist-info/RECORD,,
|
6 |
+
charset_normalizer-3.3.2.dist-info/WHEEL,sha256=yrvteVAZzxQvtDnzdCRh4dP01sPIxYhLXIXplC7o50E,102
|
7 |
+
charset_normalizer-3.3.2.dist-info/entry_points.txt,sha256=ADSTKrkXZ3hhdOVFi6DcUEHQRS0xfxDIE_pEz4wLIXA,65
|
8 |
+
charset_normalizer-3.3.2.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
|
9 |
+
charset_normalizer/__init__.py,sha256=m1cUEsb9K5v831m9P_lv2JlUEKD7MhxL7fxw3hn75o4,1623
|
10 |
+
charset_normalizer/__main__.py,sha256=nVnMo31hTPN2Yy045GJIvHj3dKDJz4dAQR3cUSdvYyc,77
|
11 |
+
charset_normalizer/__pycache__/__init__.cpython-310.pyc,,
|
12 |
+
charset_normalizer/__pycache__/__main__.cpython-310.pyc,,
|
13 |
+
charset_normalizer/__pycache__/api.cpython-310.pyc,,
|
14 |
+
charset_normalizer/__pycache__/cd.cpython-310.pyc,,
|
15 |
+
charset_normalizer/__pycache__/constant.cpython-310.pyc,,
|
16 |
+
charset_normalizer/__pycache__/legacy.cpython-310.pyc,,
|
17 |
+
charset_normalizer/__pycache__/md.cpython-310.pyc,,
|
18 |
+
charset_normalizer/__pycache__/models.cpython-310.pyc,,
|
19 |
+
charset_normalizer/__pycache__/utils.cpython-310.pyc,,
|
20 |
+
charset_normalizer/__pycache__/version.cpython-310.pyc,,
|
21 |
+
charset_normalizer/api.py,sha256=qFL0frUrcfcYEJmGpqoJ4Af68ToVue3f5SK1gp8UC5Q,21723
|
22 |
+
charset_normalizer/cd.py,sha256=Yfk3sbee0Xqo1-vmQYbOqM51-SajXPLzFVG89nTsZzc,12955
|
23 |
+
charset_normalizer/cli/__init__.py,sha256=COwP8fK2qbuldMem2lL81JieY-PIA2G2GZ5IdAPMPFA,106
|
24 |
+
charset_normalizer/cli/__main__.py,sha256=rs-cBipBzr7d0TAaUa0nG4qrjXhdddeCVB-f6Xt_wS0,10040
|
25 |
+
charset_normalizer/cli/__pycache__/__init__.cpython-310.pyc,,
|
26 |
+
charset_normalizer/cli/__pycache__/__main__.cpython-310.pyc,,
|
27 |
+
charset_normalizer/constant.py,sha256=2tVrXQ9cvC8jt0b8gZzRXvXte1pVbRra0A5dOWDQSao,42476
|
28 |
+
charset_normalizer/legacy.py,sha256=KbJxEpu7g6zE2uXSB3T-3178cgiSQdVJlJmY-gv3EAM,2125
|
29 |
+
charset_normalizer/md.cp310-win_amd64.pyd,sha256=mz1wki3PrrAoEq-pAwpAQzudK1i88Ih4H5q2inTSBmQ,10752
|
30 |
+
charset_normalizer/md.py,sha256=F7S001NdPgkAoma2w598Idx2clW9ljXlRIYKZQKsCQA,20239
|
31 |
+
charset_normalizer/md__mypyc.cp310-win_amd64.pyd,sha256=rZvMDeaBVRbf3pG7Lkd_j7Xwmdf1UR0PVLUPp3tyEFE,120320
|
32 |
+
charset_normalizer/models.py,sha256=AlehuyGDE74jhryjg6TTkYh1MCntfxXFfGhTi0esu-Y,11964
|
33 |
+
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
34 |
+
charset_normalizer/utils.py,sha256=jjvfSXHJD6QPgxcxIx4utsOFx3PxFssWef1IYxA3uKs,12315
|
35 |
+
charset_normalizer/version.py,sha256=q3fF12xGlBuaub5kroTZt7lBPQLO3kFvMnkoEnt-6YA,85
|
pproyect/test1/Lib/site-packages/charset_normalizer-3.3.2.dist-info/WHEEL
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Wheel-Version: 1.0
|
2 |
+
Generator: bdist_wheel (0.41.2)
|
3 |
+
Root-Is-Purelib: false
|
4 |
+
Tag: cp310-cp310-win_amd64
|
5 |
+
|
pproyect/test1/Lib/site-packages/charset_normalizer-3.3.2.dist-info/entry_points.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
[console_scripts]
|
2 |
+
normalizer = charset_normalizer.cli:cli_detect
|
pproyect/test1/Lib/site-packages/charset_normalizer-3.3.2.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
charset_normalizer
|
pproyect/test1/Lib/site-packages/charset_normalizer/__init__.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
"""
Charset-Normalizer
~~~~~~~~~~~~~~
The Real First Universal Charset Detector.
A library that helps you read text from an unknown charset encoding.
Motivated by chardet, this package is trying to resolve the issue by taking a new approach.
All IANA character set names for which the Python core library provides codecs are supported.

Basic usage:
   >>> from charset_normalizer import from_bytes
   >>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
   >>> best_guess = results.best()
   >>> str(best_guess)
   'Bсеки човек има право на образование. Oбразованието!'

Other methods and usages are available - see the full documentation
at <https://github.com/Ousret/charset_normalizer>.
:copyright: (c) 2021 by Ahmed TAHRI
:license: MIT, see LICENSE for more details.
"""
import logging

# Re-export the package's public entry points so that callers can import them
# directly from ``charset_normalizer``.
from .api import from_bytes, from_fp, from_path, is_binary
from .legacy import detect
from .models import CharsetMatch, CharsetMatches
from .utils import set_logging_handler
from .version import VERSION, __version__

# Names exposed by ``from charset_normalizer import *``.
__all__ = (
    "from_fp",
    "from_path",
    "from_bytes",
    "is_binary",
    "detect",
    "CharsetMatch",
    "CharsetMatches",
    "__version__",
    "VERSION",
    "set_logging_handler",
)

# Attach a NullHandler to the top level logger by default
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library

logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
|
pproyect/test1/Lib/site-packages/charset_normalizer/__main__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .cli import cli_detect

# Run the command-line detector only when this module is executed as a script
# (for example through ``python -m charset_normalizer``), never on plain import.
if __name__ == "__main__":
    cli_detect()
|
pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (1.64 kB). View file
|
|
pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/__main__.cpython-310.pyc
ADDED
Binary file (328 Bytes). View file
|
|
pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/api.cpython-310.pyc
ADDED
Binary file (11.5 kB). View file
|
|
pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/cd.cpython-310.pyc
ADDED
Binary file (9.72 kB). View file
|
|
pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/constant.cpython-310.pyc
ADDED
Binary file (30.5 kB). View file
|
|
pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/legacy.cpython-310.pyc
ADDED
Binary file (1.91 kB). View file
|
|
pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/md.cpython-310.pyc
ADDED
Binary file (15.8 kB). View file
|
|
pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/models.cpython-310.pyc
ADDED
Binary file (11.6 kB). View file
|
|
pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (8.99 kB). View file
|
|
pproyect/test1/Lib/site-packages/charset_normalizer/__pycache__/version.cpython-310.pyc
ADDED
Binary file (330 Bytes). View file
|
|
pproyect/test1/Lib/site-packages/charset_normalizer/api.py
ADDED
@@ -0,0 +1,626 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from os import PathLike
|
3 |
+
from typing import BinaryIO, List, Optional, Set, Union
|
4 |
+
|
5 |
+
from .cd import (
|
6 |
+
coherence_ratio,
|
7 |
+
encoding_languages,
|
8 |
+
mb_encoding_languages,
|
9 |
+
merge_coherence_ratios,
|
10 |
+
)
|
11 |
+
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
|
12 |
+
from .md import mess_ratio
|
13 |
+
from .models import CharsetMatch, CharsetMatches
|
14 |
+
from .utils import (
|
15 |
+
any_specified_encoding,
|
16 |
+
cut_sequence_chunks,
|
17 |
+
iana_name,
|
18 |
+
identify_sig_or_bom,
|
19 |
+
is_cp_similar,
|
20 |
+
is_multi_byte_encoding,
|
21 |
+
should_strip_sig_or_bom,
|
22 |
+
)
|
23 |
+
|
24 |
+
# Will most likely be controversial
# logging.addLevelName(TRACE, "TRACE")

# Package-wide logger; every message emitted by this module goes through it.
logger = logging.getLogger("charset_normalizer")

# Stream handler that from_bytes attaches to ``logger`` while ``explain=True``
# and detaches again before returning. It is created once at import time.
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
    logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)
|
31 |
+
|
32 |
+
|
33 |
+
def from_bytes(
    sequences: Union[bytes, bytearray],
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.2,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Given a raw bytes sequence, return the best possible charsets usable to render str objects.
    If there are no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocks of 512 bytes each to assess the mess and coherence
    of a given sequence, and will give up a particular code page after 20% of measured mess.
    Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritizes a
    particular code page but never takes it for granted. Can improve the performance.

    You may want to focus your attention on some code pages and/or not others, use cp_isolation and
    cp_exclusion for that purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not set up any handler other than the NullHandler, if you choose to
    set the 'explain' toggle to True it will alter the logger configuration to add a StreamHandler
    that is suitable for debugging. Custom logging format and handler can be set manually.

    :param sequences: The payload to analyse.
    :param steps: Number of chunks to probe. Forced to 1 when the payload is not longer than
        ``chunk_size * steps``.
    :param chunk_size: Size of each probed chunk. Forced to the payload length in the same
        situation, and reduced to ``int(length / steps)`` when chunks would not fit.
    :param threshold: An encoding is discarded when its mean mess ratio reaches this value.
    :param cp_isolation: When non-empty, only these encodings are tested.
    :param cp_exclusion: Encodings that are never tested.
    :param preemptive_behaviour: When True, the encoding returned by
        ``any_specified_encoding(sequences)`` (if any) is tested first.
    :param explain: When True, attach ``explain_handler`` to the package logger and set its level
        to TRACE for the duration of the call.
    :param language_threshold: Forwarded to ``coherence_ratio`` for each probed chunk.
    :param enable_fallback: When True, "ascii", "utf_8" and the specified encoding are remembered
        even when they fail the mess probing, and one of them is returned if nothing else matched.
    :return: The collected matches. An early return yields a container with a single match.
    :raises TypeError: If ``sequences`` is neither ``bytes`` nor ``bytearray``.
    """

    if not isinstance(sequences, (bytearray, bytes)):
        raise TypeError(
            "Expected object of type bytes or bytearray, got: {0}".format(
                type(sequences)
            )
        )

    if explain:
        # Remember the current level so that every exit path below can restore it.
        previous_logger_level: int = logger.level
        logger.addHandler(explain_handler)
        logger.setLevel(TRACE)

    length: int = len(sequences)

    if length == 0:
        logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
        if explain:
            logger.removeHandler(explain_handler)
            # NOTE(review): this is the only exit path that substitutes logging.WARNING when
            # the saved level is 0; the others restore the saved level as-is. Confirm intent.
            logger.setLevel(previous_logger_level or logging.WARNING)
        return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])

    # Normalise the user supplied code page names; None becomes an empty list.
    if cp_isolation is not None:
        logger.log(
            TRACE,
            "cp_isolation is set. use this flag for debugging purpose. "
            "limited list of encoding allowed : %s.",
            ", ".join(cp_isolation),
        )
        cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
    else:
        cp_isolation = []

    if cp_exclusion is not None:
        logger.log(
            TRACE,
            "cp_exclusion is set. use this flag for debugging purpose. "
            "limited list of encoding excluded : %s.",
            ", ".join(cp_exclusion),
        )
        cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
    else:
        cp_exclusion = []

    # Payload not larger than the requested sampling: probe it as one single chunk.
    if length <= (chunk_size * steps):
        logger.log(
            TRACE,
            "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
            steps,
            chunk_size,
            length,
        )
        steps = 1
        chunk_size = length

    if steps > 1 and length / steps < chunk_size:
        chunk_size = int(length / steps)

    is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
    is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE

    if is_too_small_sequence:
        logger.log(
            TRACE,
            "Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
                length
            ),
        )
    elif is_too_large_sequence:
        logger.log(
            TRACE,
            "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
                length
            ),
        )

    # Encodings tested before the generic IANA_SUPPORTED list, in insertion order:
    # specified encoding, SIG/BOM encoding, "ascii", then "utf_8".
    prioritized_encodings: List[str] = []

    specified_encoding: Optional[str] = (
        any_specified_encoding(sequences) if preemptive_behaviour else None
    )

    if specified_encoding is not None:
        prioritized_encodings.append(specified_encoding)
        logger.log(
            TRACE,
            "Detected declarative mark in sequence. Priority +1 given for %s.",
            specified_encoding,
        )

    tested: Set[str] = set()
    # Encodings that could not decode the payload at all.
    tested_but_hard_failure: List[str] = []
    # Encodings that decoded the payload but were rejected by the mess probing.
    tested_but_soft_failure: List[str] = []

    fallback_ascii: Optional[CharsetMatch] = None
    fallback_u8: Optional[CharsetMatch] = None
    fallback_specified: Optional[CharsetMatch] = None

    results: CharsetMatches = CharsetMatches()

    sig_encoding, sig_payload = identify_sig_or_bom(sequences)

    if sig_encoding is not None:
        prioritized_encodings.append(sig_encoding)
        logger.log(
            TRACE,
            "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
            len(sig_payload),
            sig_encoding,
        )

    prioritized_encodings.append("ascii")

    if "utf_8" not in prioritized_encodings:
        prioritized_encodings.append("utf_8")

    for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
        if cp_isolation and encoding_iana not in cp_isolation:
            continue

        if cp_exclusion and encoding_iana in cp_exclusion:
            continue

        # Prioritized encodings also appear in IANA_SUPPORTED; test each one only once.
        if encoding_iana in tested:
            continue

        tested.add(encoding_iana)

        decoded_payload: Optional[str] = None
        bom_or_sig_available: bool = sig_encoding == encoding_iana
        strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
            encoding_iana
        )

        if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
                encoding_iana,
            )
            continue
        if encoding_iana in {"utf_7"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
                encoding_iana,
            )
            continue

        try:
            is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
        except (ModuleNotFoundError, ImportError):
            logger.log(
                TRACE,
                "Encoding %s does not provide an IncrementalDecoder",
                encoding_iana,
            )
            continue

        # First gate: the payload must decode strictly with this encoding.
        try:
            if is_too_large_sequence and is_multi_byte_decoder is False:
                # Large single-byte payload: only check that a leading slice decodes. The
                # result is discarded, so decoded_payload stays None on this branch.
                str(
                    sequences[: int(50e4)]
                    if strip_sig_or_bom is False
                    else sequences[len(sig_payload) : int(50e4)],
                    encoding=encoding_iana,
                )
            else:
                decoded_payload = str(
                    sequences
                    if strip_sig_or_bom is False
                    else sequences[len(sig_payload) :],
                    encoding=encoding_iana,
                )
        except (UnicodeDecodeError, LookupError) as e:
            # A LookupError (unknown codec) is recorded as a failure but not logged.
            if not isinstance(e, LookupError):
                logger.log(
                    TRACE,
                    "Code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
            tested_but_hard_failure.append(encoding_iana)
            continue

        # Skip code pages deemed similar to one that already failed the mess probing.
        similar_soft_failure_test: bool = False

        for encoding_soft_failed in tested_but_soft_failure:
            if is_cp_similar(encoding_iana, encoding_soft_failed):
                similar_soft_failure_test = True
                break

        if similar_soft_failure_test:
            logger.log(
                TRACE,
                "%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
                encoding_iana,
                encoding_soft_failed,
            )
            continue

        # Offsets handed to cut_sequence_chunks: start after the SIG/BOM when one was found
        # for this encoding, then advance by int(length / steps) up to the payload length.
        r_ = range(
            0 if not bom_or_sig_available else len(sig_payload),
            length,
            int(length / steps),
        )

        # Only used for the trace message below.
        multi_byte_bonus: bool = (
            is_multi_byte_decoder
            and decoded_payload is not None
            and len(decoded_payload) < length
        )

        if multi_byte_bonus:
            logger.log(
                TRACE,
                "Code page %s is a multi byte encoding table and it appear that at least one character "
                "was encoded using n-bytes.",
                encoding_iana,
            )

        # Give up once this many chunks reach the threshold: a quarter of the offsets, at least 2.
        max_chunk_gave_up: int = int(len(r_) / 4)

        max_chunk_gave_up = max(max_chunk_gave_up, 2)
        early_stop_count: int = 0
        lazy_str_hard_failure = False

        md_chunks: List[str] = []
        md_ratios = []

        try:
            for chunk in cut_sequence_chunks(
                sequences,
                encoding_iana,
                r_,
                chunk_size,
                bom_or_sig_available,
                strip_sig_or_bom,
                sig_payload,
                is_multi_byte_decoder,
                decoded_payload,
            ):
                md_chunks.append(chunk)

                md_ratios.append(
                    mess_ratio(
                        chunk,
                        threshold,
                        # Third argument is True only with explain and 1 or 2 isolated code pages.
                        explain is True and 1 <= len(cp_isolation) <= 2,
                    )
                )

                if md_ratios[-1] >= threshold:
                    early_stop_count += 1

                # Stop after too many messy chunks, or after the first chunk when a SIG/BOM
                # is present for this encoding and is not stripped.
                if (early_stop_count >= max_chunk_gave_up) or (
                    bom_or_sig_available and strip_sig_or_bom is False
                ):
                    break
        except (
            UnicodeDecodeError
        ) as e:  # Lazy str loading may have missed something there
            logger.log(
                TRACE,
                "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
                encoding_iana,
                str(e),
            )
            # Force the rejection branch below.
            early_stop_count = max_chunk_gave_up
            lazy_str_hard_failure = True

        # We might want to check the sequence again with the whole content
        # Only if initial MD tests passes
        if (
            not lazy_str_hard_failure
            and is_too_large_sequence
            and not is_multi_byte_decoder
        ):
            try:
                # NOTE(review): the earlier lazy check decodes up to int(50e4) bytes while this
                # one starts at int(50e3); the two ranges overlap. Confirm the constants.
                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
            except UnicodeDecodeError as e:
                logger.log(
                    TRACE,
                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
                tested_but_hard_failure.append(encoding_iana)
                continue

        mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
        if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
            tested_but_soft_failure.append(encoding_iana)
            logger.log(
                TRACE,
                "%s was excluded because of initial chaos probing. Gave up %i time(s). "
                "Computed mean chaos is %f %%.",
                encoding_iana,
                early_stop_count,
                round(mean_mess_ratio * 100, ndigits=3),
            )
            # Preparing those fallbacks in case we got nothing.
            if (
                enable_fallback
                and encoding_iana in ["ascii", "utf_8", specified_encoding]
                and not lazy_str_hard_failure
            ):
                fallback_entry = CharsetMatch(
                    sequences, encoding_iana, threshold, False, [], decoded_payload
                )
                if encoding_iana == specified_encoding:
                    fallback_specified = fallback_entry
                elif encoding_iana == "ascii":
                    fallback_ascii = fallback_entry
                else:
                    fallback_u8 = fallback_entry
            continue

        logger.log(
            TRACE,
            "%s passed initial chaos probing. Mean measured chaos is %f %%",
            encoding_iana,
            round(mean_mess_ratio * 100, ndigits=3),
        )

        if not is_multi_byte_decoder:
            target_languages: List[str] = encoding_languages(encoding_iana)
        else:
            target_languages = mb_encoding_languages(encoding_iana)

        if target_languages:
            logger.log(
                TRACE,
                "{} should target any language(s) of {}".format(
                    encoding_iana, str(target_languages)
                ),
            )

        cd_ratios = []

        # We shall skip the CD when its about ASCII
        # Most of the time its not relevant to run "language-detection" on it.
        if encoding_iana != "ascii":
            for chunk in md_chunks:
                chunk_languages = coherence_ratio(
                    chunk,
                    language_threshold,
                    ",".join(target_languages) if target_languages else None,
                )

                cd_ratios.append(chunk_languages)

        cd_ratios_merged = merge_coherence_ratios(cd_ratios)

        if cd_ratios_merged:
            logger.log(
                TRACE,
                "We detected language {} using {}".format(
                    cd_ratios_merged, encoding_iana
                ),
            )

        results.append(
            CharsetMatch(
                sequences,
                encoding_iana,
                mean_mess_ratio,
                bom_or_sig_available,
                cd_ratios_merged,
                decoded_payload,
            )
        )

        # Early exit: a specified/ascii/utf_8 match with a mean mess ratio under 0.1 is
        # returned alone, without testing the remaining encodings.
        if (
            encoding_iana in [specified_encoding, "ascii", "utf_8"]
            and mean_mess_ratio < 0.1
        ):
            logger.debug(
                "Encoding detection: %s is most likely the one.", encoding_iana
            )
            if explain:
                logger.removeHandler(explain_handler)
                logger.setLevel(previous_logger_level)
            return CharsetMatches([results[encoding_iana]])

        # Early exit: the encoding announced by the SIG/BOM passed the probing.
        if encoding_iana == sig_encoding:
            logger.debug(
                "Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
                "the beginning of the sequence.",
                encoding_iana,
            )
            if explain:
                logger.removeHandler(explain_handler)
                logger.setLevel(previous_logger_level)
            return CharsetMatches([results[encoding_iana]])

    # Nothing matched: use one remembered fallback, preferring specified, then utf_8, then ascii.
    if len(results) == 0:
        if fallback_u8 or fallback_ascii or fallback_specified:
            logger.log(
                TRACE,
                "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
            )

        if fallback_specified:
            logger.debug(
                "Encoding detection: %s will be used as a fallback match",
                fallback_specified.encoding,
            )
            results.append(fallback_specified)
        elif (
            # NOTE(review): the first two alternatives both require fallback_u8 to be set, so
            # the whole condition looks equivalent to its last line. Confirm before simplifying.
            (fallback_u8 and fallback_ascii is None)
            or (
                fallback_u8
                and fallback_ascii
                and fallback_u8.fingerprint != fallback_ascii.fingerprint
            )
            or (fallback_u8 is not None)
        ):
            logger.debug("Encoding detection: utf_8 will be used as a fallback match")
            results.append(fallback_u8)
        elif fallback_ascii:
            logger.debug("Encoding detection: ascii will be used as a fallback match")
            results.append(fallback_ascii)

    if results:
        logger.debug(
            "Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
            results.best().encoding,  # type: ignore
            len(results) - 1,
        )
    else:
        logger.debug("Encoding detection: Unable to determine any suitable charset.")

    if explain:
        logger.removeHandler(explain_handler)
        logger.setLevel(previous_logger_level)

    return results
|
500 |
+
|
501 |
+
|
502 |
+
def from_fp(
    fp: BinaryIO,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Run the detection on the content of an already opened binary stream.
    The stream is read with a single read() call and the resulting payload is handed to from_bytes
    together with every tuning option, untouched.
    The stream is never closed here; that stays the caller's responsibility.
    """
    payload = fp.read()

    return from_bytes(
        payload,
        steps=steps,
        chunk_size=chunk_size,
        threshold=threshold,
        cp_isolation=cp_isolation,
        cp_exclusion=cp_exclusion,
        preemptive_behaviour=preemptive_behaviour,
        explain=explain,
        language_threshold=language_threshold,
        enable_fallback=enable_fallback,
    )
|
530 |
+
|
531 |
+
|
532 |
+
def from_path(
    path: Union[str, bytes, PathLike],  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Open the file located at the given path in binary mode and run the detection on it through from_fp.
    Every tuning option is forwarded as received. The file is closed before returning.
    Can raise IOError.
    """
    with open(path, "rb") as stream:
        return from_fp(
            stream,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
            language_threshold=language_threshold,
            enable_fallback=enable_fallback,
        )
|
561 |
+
|
562 |
+
|
563 |
+
def is_binary(
    fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes],  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = False,
) -> bool:
    """
    Tell whether the given input (path, raw payload or binary stream) looks like binary content, aka. not text.
    The regular detection is executed with the received options; the only difference in the defaults is that
    the fallback matches are disabled (enable_fallback=False), which makes the check stricter.
    Returns True when the detection yields no match at all.
    """
    # Pick the entry point that matches the kind of input: str/PathLike -> path,
    # bytes/bytearray -> payload, anything else is treated as a binary stream.
    if isinstance(fp_or_path_or_payload, (str, PathLike)):
        detector = from_path
    elif isinstance(fp_or_path_or_payload, (bytes, bytearray)):
        detector = from_bytes
    else:
        detector = from_fp

    guesses = detector(
        fp_or_path_or_payload,
        steps=steps,
        chunk_size=chunk_size,
        threshold=threshold,
        cp_isolation=cp_isolation,
        cp_exclusion=cp_exclusion,
        preemptive_behaviour=preemptive_behaviour,
        explain=explain,
        language_threshold=language_threshold,
        enable_fallback=enable_fallback,
    )

    return not guesses
|
pproyect/test1/Lib/site-packages/charset_normalizer/cd.py
ADDED
@@ -0,0 +1,395 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import importlib
|
2 |
+
from codecs import IncrementalDecoder
|
3 |
+
from collections import Counter
|
4 |
+
from functools import lru_cache
|
5 |
+
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
|
6 |
+
|
7 |
+
from .constant import (
|
8 |
+
FREQUENCIES,
|
9 |
+
KO_NAMES,
|
10 |
+
LANGUAGE_SUPPORTED_COUNT,
|
11 |
+
TOO_SMALL_SEQUENCE,
|
12 |
+
ZH_NAMES,
|
13 |
+
)
|
14 |
+
from .md import is_suspiciously_successive_range
|
15 |
+
from .models import CoherenceMatches
|
16 |
+
from .utils import (
|
17 |
+
is_accentuated,
|
18 |
+
is_latin,
|
19 |
+
is_multi_byte_encoding,
|
20 |
+
is_unicode_range_secondary,
|
21 |
+
unicode_range,
|
22 |
+
)
|
23 |
+
|
24 |
+
|
25 |
+
def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    List the unicode ranges that dominate a single byte code page.
    Every byte value from 0x40 up to 0xFE is decoded on its own; the ranges that are not flagged as
    secondary are tallied and only those covering at least 15% of the tallied characters are kept.
    The result is sorted alphabetically. Raises IOError for a multi-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    codec_module = importlib.import_module("encodings.{}".format(iana_name))
    incremental: IncrementalDecoder = codec_module.IncrementalDecoder(errors="ignore")

    occurrences: TypeCounter[str] = Counter()

    for byte_value in range(0x40, 0xFF):
        decoded: str = incremental.decode(bytes([byte_value]))

        # Undecodable bytes are silently dropped by the decoder (errors="ignore").
        if not decoded:
            continue

        range_name: Optional[str] = unicode_range(decoded)

        if range_name is None:
            continue

        if is_unicode_range_secondary(range_name) is not False:
            continue

        occurrences[range_name] += 1

    total: int = sum(occurrences.values())

    return sorted(
        range_name
        for range_name, hits in occurrences.items()
        if hits / total >= 0.15
    )
|
62 |
+
|
63 |
+
|
64 |
+
def unicode_range_languages(primary_range: str) -> List[str]:
    """
    List the languages of FREQUENCIES that have at least one character belonging to the given unicode range.
    Languages come out in the iteration order of FREQUENCIES.
    """
    return [
        language
        for language, characters in FREQUENCIES.items()
        if any(unicode_range(character) == primary_range for character in characters)
    ]
|
77 |
+
|
78 |
+
|
79 |
+
@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Associate a single-byte code page with the language(s) it is heavily linked to.
    The first unicode range of the code page whose name does not contain "Latin" drives the lookup;
    when there is none, the single pseudo-language "Latin Based" is returned.
    """
    primary_range: Optional[str] = next(
        (
            candidate
            for candidate in encoding_unicode_range(iana_name)
            if "Latin" not in candidate
        ),
        None,
    )

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
|
97 |
+
|
98 |
+
|
99 |
+
@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Associate a multi-byte code page with the language it is heavily linked to.
    Japanese is checked first, then Chinese, then Korean; an empty list means no association is known.
    """
    japanese_prefixes = ("shift_", "iso2022_jp", "euc_j")

    if iana_name.startswith(japanese_prefixes) or iana_name == "cp932":
        return ["Japanese"]

    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]

    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
|
118 |
+
|
119 |
+
|
120 |
+
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Summarise the alphabet of a supported language with two flags:
    whether it contains at least one accentuated character, and whether it is made of Latin characters only.
    """
    alphabet = FREQUENCIES[language]

    has_accents: bool = any(is_accentuated(letter) for letter in alphabet)
    # A single character reported as non-Latin is enough to lose the "pure Latin" flag.
    only_latin: bool = not any(is_latin(letter) is False for letter in alphabet)

    return has_accents, only_latin
|
135 |
+
|
136 |
+
|
137 |
+
def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    List the languages that are compatible with the given characters, best candidate first.
    A language is kept when at least 20% of its known characters appear in the given characters.
    Languages without accents are discarded when the given characters contain accents, and
    non pure-Latin languages are discarded when ignore_non_latin is set.
    """
    source_have_accents = any(is_accentuated(character) for character in characters)

    scored: List[Tuple[str, float]] = []

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        excluded_by_script = ignore_non_latin and target_pure_latin is False
        excluded_by_accents = target_have_accents is False and source_have_accents

        if excluded_by_script or excluded_by_accents:
            continue

        matched: int = sum(1 for c in language_characters if c in characters)
        ratio: float = matched / len(language_characters)

        if ratio >= 0.2:
            scored.append((language, ratio))

    # Stable descending sort: ties keep the iteration order of FREQUENCIES.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    return [language for language, _ in scored]
|
170 |
+
|
171 |
+
|
172 |
+
def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Determine if an ordered characters list (by occurrence, from most frequent to rarest) matches a particular language.
    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    Beware that this function is not strict on the match in order to ease the detection. (Meaning close match is 1.)

    :param language: A language name that must be a key of FREQUENCIES.
    :param ordered_characters: Characters sorted by decreasing number of occurrences.
    :return: Share of the given characters that were approved. 0. when no character is given.
    :raises ValueError: When the language is not available in FREQUENCIES.
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    ordered_characters_count: int = len(ordered_characters)

    # Nothing to compare: answer "no correspondence" instead of dividing by zero below.
    if ordered_characters_count == 0:
        return 0.0

    language_characters: List[str] = FREQUENCIES[language]
    target_language_characters_count: int = len(language_characters)

    # Rank of each character in the language table (first occurrence wins, like list.index),
    # built once so the loop does not scan the table for every character.
    rank_in_language: Dict[str, int] = {}
    for rank, language_character in enumerate(language_characters):
        rank_in_language.setdefault(language_character, rank)

    large_alphabet: bool = target_language_characters_count > 26

    # Scale factor projecting a rank in the given list onto the language table.
    expected_projection_ratio: float = (
        target_language_characters_count / ordered_characters_count
    )

    character_approved_count: int = 0

    for character_rank, character in enumerate(ordered_characters):
        if character not in rank_in_language:
            continue

        character_rank_in_language: int = rank_in_language[character]
        character_rank_projection: int = int(character_rank * expected_projection_ratio)
        rank_distance: int = abs(character_rank_projection - character_rank_in_language)

        # Small alphabets: a character that is too far from its expected rank is rejected.
        if large_alphabet is False and rank_distance > 4:
            continue

        # Large alphabets: being within a third of the table is good enough.
        if large_alphabet is True and rank_distance < target_language_characters_count / 3:
            character_approved_count += 1
            continue

        characters_before_source: List[str] = language_characters[
            0:character_rank_in_language
        ]
        characters_after_source: List[str] = language_characters[
            character_rank_in_language:
        ]
        characters_before: List[str] = ordered_characters[0:character_rank]
        characters_after: List[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        # NOTE: characters_after_source always contains the character itself, so this
        # branch is kept only as a guard for the division below.
        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / ordered_characters_count
|
250 |
+
|
251 |
+
|
252 |
+
def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Split a decoded text into layers, one per alphabet / unicode range, and return them as a list of str.
    Only alphabetic characters are kept and they are lowered. A character joins the first existing layer
    whose range is not considered suspicious next to its own range; otherwise it opens a new layer.
    Ex. a text containing English/Latin with a bit of Hebrew gives two items: the latin letters and the hebrew ones.
    """
    buckets: Dict[str, List[str]] = {}

    for symbol in decoded_sequence:
        if not symbol.isalpha():
            continue

        symbol_range: Optional[str] = unicode_range(symbol)

        if symbol_range is None:
            continue

        # First compatible layer already discovered, or the character's own range.
        target_range: str = next(
            (
                known_range
                for known_range in buckets
                if is_suspiciously_successive_range(known_range, symbol_range) is False
            ),
            symbol_range,
        )

        buckets.setdefault(target_range, []).append(symbol.lower())

    return ["".join(pieces) for pieces in buckets.values()]
|
289 |
+
|
290 |
+
|
291 |
+
def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge several outputs of coherence_ratio into a single one.
    The ratios of a language are averaged (rounded to 4 digits) and the languages are
    returned from the highest mean ratio to the lowest, in the same shape as coherence_ratio.
    """
    grouped: Dict[str, List[float]] = {}

    for coherence_matches in results:
        for language, ratio in coherence_matches:
            grouped.setdefault(language, []).append(ratio)

    averaged = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in grouped.items()
    ]
    averaged.sort(key=lambda entry: entry[1], reverse=True)

    return averaged
|
317 |
+
|
318 |
+
|
319 |
+
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". When a language shows up more than once once the em-dash is stripped,
    every language is reduced to its best ratio under its em-dash free name.
    Otherwise the given results are returned untouched.
    """
    ratios_by_name: Dict[str, List[float]] = {}

    for language, ratio in results:
        ratios_by_name.setdefault(language.replace("—", ""), []).append(ratio)

    has_alternative = any(len(ratios) > 1 for ratios in ratios_by_name.values())

    if not has_alternative:
        return results

    return [(name, max(ratios)) for name, ratios in ratios_by_name.items()]
|
344 |
+
|
345 |
+
|
346 |
+
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence, analysing it layer by layer.
    A layer = characters extracted by alphabet/range (see alpha_unicode_split).
    lg_inclusion is an optional comma separated list of languages to test instead of guessing them;
    the special entry "Latin Based" only restricts the guess to pure Latin languages.
    Matches are returned from the highest ratio to the lowest.
    """
    requested_languages = [] if lg_inclusion is None else lg_inclusion.split(",")

    latin_only: bool = "Latin Based" in requested_languages
    if latin_only:
        requested_languages.remove("Latin Based")

    found: List[Tuple[str, float]] = []
    strong_match_count: int = 0

    for layer in alpha_unicode_split(decoded_sequence):
        ranked = Counter(layer).most_common()

        # Layers that are too short are not reliable enough to be scored.
        if sum(count for _, count in ranked) <= TOO_SMALL_SEQUENCE:
            continue

        ordered_by_popularity: List[str] = [symbol for symbol, _ in ranked]

        candidates = requested_languages or alphabet_languages(
            ordered_by_popularity, latin_only
        )

        for language in candidates:
            score: float = characters_popularity_compare(
                language, ordered_by_popularity
            )

            if score < threshold:
                continue

            if score >= 0.8:
                strong_match_count += 1

            found.append((language, round(score, 4)))

            # Three strong matches are enough, stop probing the languages of this layer.
            if strong_match_count >= 3:
                break

    deduplicated = filter_alt_coherence_matches(found)

    return sorted(deduplicated, key=lambda match: match[1], reverse=True)
|
pproyect/test1/Lib/site-packages/charset_normalizer/cli/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .__main__ import cli_detect, query_yes_no
|
2 |
+
|
3 |
+
__all__ = (
|
4 |
+
"cli_detect",
|
5 |
+
"query_yes_no",
|
6 |
+
)
|
pproyect/test1/Lib/site-packages/charset_normalizer/cli/__main__.py
ADDED
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import sys
|
3 |
+
from json import dumps
|
4 |
+
from os.path import abspath, basename, dirname, join, realpath
|
5 |
+
from platform import python_version
|
6 |
+
from typing import List, Optional
|
7 |
+
from unicodedata import unidata_version
|
8 |
+
|
9 |
+
import charset_normalizer.md as md_module
|
10 |
+
from charset_normalizer import from_fp
|
11 |
+
from charset_normalizer.models import CliDetectionResult
|
12 |
+
from charset_normalizer.version import __version__
|
13 |
+
|
14 |
+
|
15 |
+
def query_yes_no(question: str, default: str = "yes") -> bool:
    """Ask a yes/no question via input() and return the answer as a boolean.

    "question" is the text shown to the user, followed by a hint of the accepted answers.
    "default" is the answer assumed when the user just hits <Enter>.
            It must be "yes" (the default), "no" or None (meaning
            that the question is asked again until an answer is typed).

    Returns True for "yes" and False for "no". Raises ValueError on any other default.

    Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
    """
    answers = {"yes": True, "y": True, "ye": True, "no": False, "n": False}

    if default is None:
        suffix = " [y/n] "
    elif default in ("yes", "no"):
        suffix = " [Y/n] " if default == "yes" else " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    while True:
        sys.stdout.write(question + suffix)
        reply = input().lower()

        if reply == "" and default is not None:
            return answers[default]

        if reply in answers:
            return answers[reply]

        sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
|
46 |
+
|
47 |
+
|
48 |
+
def cli_detect(argv: Optional[List[str]] = None) -> int:
    """
    CLI assistant using ARGV and ArgumentParser.
    Each given file is analysed with from_fp; the outcome is printed to STDOUT, either as JSON
    (default) or as the bare charset name(s) (--minimal). With --normalize the best guess is
    written next to the original file as UTF-8 (or over it with --replace).

    :param argv: Arguments to parse, None lets argparse read sys.argv.
    :return: 0 if everything is fine, anything else equal trouble
             (1: invalid combination of arguments, 2: the normalized file could not be written)
    """
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode."
    )

    parser.add_argument(
        "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display complementary information about file if any. "
        "Stdout will contain logs about the detection process.",
    )
    parser.add_argument(
        "-a",
        "--with-alternative",
        action="store_true",
        default=False,
        dest="alternatives",
        help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        action="store_true",
        default=False,
        dest="normalize",
        help="Permit to normalize input file. If not set, program does not write anything.",
    )
    parser.add_argument(
        "-m",
        "--minimal",
        action="store_true",
        default=False,
        dest="minimal",
        help="Only output the charset detected to STDOUT. Disabling JSON output.",
    )
    parser.add_argument(
        "-r",
        "--replace",
        action="store_true",
        default=False,
        dest="replace",
        help="Replace file when trying to normalize it instead of creating a new one.",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        dest="force",
        help="Replace file without asking if you are sure, use this flag with caution.",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        action="store",
        default=0.2,
        type=float,
        dest="threshold",
        help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
            __version__,
            python_version(),
            unidata_version,
            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
        ),
        help="Show version information and exit.",
    )

    args = parser.parse_args(argv)

    # Reject the flag combinations that make no sense before touching any file content.
    if args.replace is True and args.normalize is False:
        print("Use --replace in addition of --normalize only.", file=sys.stderr)
        return 1

    if args.force is True and args.replace is False:
        print("Use --force in addition of --replace only.", file=sys.stderr)
        return 1

    if args.threshold < 0.0 or args.threshold > 1.0:
        print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
        return 1

    # One entry per reported guess: the best guess of each file, plus its alternatives if asked.
    x_ = []

    for my_file in args.files:
        matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)

        best_guess = matches.best()

        if best_guess is None:
            print(
                'Unable to identify originating encoding for "{}". {}'.format(
                    my_file.name,
                    "Maybe try increasing maximum amount of chaos."
                    if args.threshold < 1.0
                    else "",
                ),
                file=sys.stderr,
            )
            x_.append(
                CliDetectionResult(
                    abspath(my_file.name),
                    None,
                    [],
                    [],
                    "Unknown",
                    [],
                    False,
                    1.0,
                    0.0,
                    None,
                    True,
                )
            )
        else:
            # Keep a handle on the record of THIS file: the normalized path must be
            # attached to it, not to the first record of the whole run.
            primary_result = CliDetectionResult(
                abspath(my_file.name),
                best_guess.encoding,
                best_guess.encoding_aliases,
                [
                    cp
                    for cp in best_guess.could_be_from_charset
                    if cp != best_guess.encoding
                ],
                best_guess.language,
                best_guess.alphabets,
                best_guess.bom,
                best_guess.percent_chaos,
                best_guess.percent_coherence,
                None,
                True,
            )
            x_.append(primary_result)

            if len(matches) > 1 and args.alternatives:
                for el in matches:
                    if el != best_guess:
                        x_.append(
                            CliDetectionResult(
                                abspath(my_file.name),
                                el.encoding,
                                el.encoding_aliases,
                                [
                                    cp
                                    for cp in el.could_be_from_charset
                                    if cp != el.encoding
                                ],
                                el.language,
                                el.alphabets,
                                el.bom,
                                el.percent_chaos,
                                el.percent_coherence,
                                None,
                                False,
                            )
                        )

            if args.normalize is True:
                if best_guess.encoding.startswith("utf") is True:
                    print(
                        '"{}" file does not need to be normalized, as it already came from unicode.'.format(
                            my_file.name
                        ),
                        file=sys.stderr,
                    )
                    if my_file.closed is False:
                        my_file.close()
                    continue

                dir_path = dirname(realpath(my_file.name))
                file_name = basename(realpath(my_file.name))

                o_: List[str] = file_name.split(".")

                if args.replace is False:
                    # New file: the detected encoding is inserted before the last name part.
                    o_.insert(-1, best_guess.encoding)
                    if my_file.closed is False:
                        my_file.close()
                elif (
                    args.force is False
                    and query_yes_no(
                        'Are you sure to normalize "{}" by replacing it ?'.format(
                            my_file.name
                        ),
                        "no",
                    )
                    is False
                ):
                    # The user declined the in-place replacement.
                    if my_file.closed is False:
                        my_file.close()
                    continue

                try:
                    primary_result.unicode_path = join(dir_path, ".".join(o_))

                    with open(primary_result.unicode_path, "w", encoding="utf-8") as fp:
                        fp.write(str(best_guess))
                except IOError as e:
                    print(str(e), file=sys.stderr)
                    if my_file.closed is False:
                        my_file.close()
                    return 2

        if my_file.closed is False:
            my_file.close()

    if args.minimal is False:
        # A single record is printed as an object, several records as a list.
        print(
            dumps(
                [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
                ensure_ascii=True,
                indent=4,
            )
        )
    else:
        for my_file in args.files:
            print(
                ", ".join(
                    [
                        el.encoding or "undefined"
                        for el in x_
                        if el.path == abspath(my_file.name)
                    ]
                )
            )

    return 0
|
293 |
+
|
294 |
+
|
295 |
+
if __name__ == "__main__":
|
296 |
+
cli_detect()
|
pproyect/test1/Lib/site-packages/charset_normalizer/cli/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (331 Bytes). View file
|
|
pproyect/test1/Lib/site-packages/charset_normalizer/cli/__pycache__/__main__.cpython-310.pyc
ADDED
Binary file (6.52 kB). View file
|
|
pproyect/test1/Lib/site-packages/charset_normalizer/constant.py
ADDED
@@ -0,0 +1,1995 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
|
3 |
+
from encodings.aliases import aliases
|
4 |
+
from re import IGNORECASE, compile as re_compile
|
5 |
+
from typing import Dict, List, Set, Union
|
6 |
+
|
7 |
+
# Contain for each eligible encoding a list of/item bytes SIG/BOM
|
8 |
+
ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
|
9 |
+
"utf_8": BOM_UTF8,
|
10 |
+
"utf_7": [
|
11 |
+
b"\x2b\x2f\x76\x38",
|
12 |
+
b"\x2b\x2f\x76\x39",
|
13 |
+
b"\x2b\x2f\x76\x2b",
|
14 |
+
b"\x2b\x2f\x76\x2f",
|
15 |
+
b"\x2b\x2f\x76\x38\x2d",
|
16 |
+
],
|
17 |
+
"gb18030": b"\x84\x31\x95\x33",
|
18 |
+
"utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
|
19 |
+
"utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
|
20 |
+
}
|
21 |
+
|
22 |
+
TOO_SMALL_SEQUENCE: int = 32
|
23 |
+
TOO_BIG_SEQUENCE: int = int(10e6)
|
24 |
+
|
25 |
+
UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
|
26 |
+
|
27 |
+
# Up-to-date Unicode ucd/15.0.0
|
28 |
+
UNICODE_RANGES_COMBINED: Dict[str, range] = {
|
29 |
+
"Control character": range(32),
|
30 |
+
"Basic Latin": range(32, 128),
|
31 |
+
"Latin-1 Supplement": range(128, 256),
|
32 |
+
"Latin Extended-A": range(256, 384),
|
33 |
+
"Latin Extended-B": range(384, 592),
|
34 |
+
"IPA Extensions": range(592, 688),
|
35 |
+
"Spacing Modifier Letters": range(688, 768),
|
36 |
+
"Combining Diacritical Marks": range(768, 880),
|
37 |
+
"Greek and Coptic": range(880, 1024),
|
38 |
+
"Cyrillic": range(1024, 1280),
|
39 |
+
"Cyrillic Supplement": range(1280, 1328),
|
40 |
+
"Armenian": range(1328, 1424),
|
41 |
+
"Hebrew": range(1424, 1536),
|
42 |
+
"Arabic": range(1536, 1792),
|
43 |
+
"Syriac": range(1792, 1872),
|
44 |
+
"Arabic Supplement": range(1872, 1920),
|
45 |
+
"Thaana": range(1920, 1984),
|
46 |
+
"NKo": range(1984, 2048),
|
47 |
+
"Samaritan": range(2048, 2112),
|
48 |
+
"Mandaic": range(2112, 2144),
|
49 |
+
"Syriac Supplement": range(2144, 2160),
|
50 |
+
"Arabic Extended-B": range(2160, 2208),
|
51 |
+
"Arabic Extended-A": range(2208, 2304),
|
52 |
+
"Devanagari": range(2304, 2432),
|
53 |
+
"Bengali": range(2432, 2560),
|
54 |
+
"Gurmukhi": range(2560, 2688),
|
55 |
+
"Gujarati": range(2688, 2816),
|
56 |
+
"Oriya": range(2816, 2944),
|
57 |
+
"Tamil": range(2944, 3072),
|
58 |
+
"Telugu": range(3072, 3200),
|
59 |
+
"Kannada": range(3200, 3328),
|
60 |
+
"Malayalam": range(3328, 3456),
|
61 |
+
"Sinhala": range(3456, 3584),
|
62 |
+
"Thai": range(3584, 3712),
|
63 |
+
"Lao": range(3712, 3840),
|
64 |
+
"Tibetan": range(3840, 4096),
|
65 |
+
"Myanmar": range(4096, 4256),
|
66 |
+
"Georgian": range(4256, 4352),
|
67 |
+
"Hangul Jamo": range(4352, 4608),
|
68 |
+
"Ethiopic": range(4608, 4992),
|
69 |
+
"Ethiopic Supplement": range(4992, 5024),
|
70 |
+
"Cherokee": range(5024, 5120),
|
71 |
+
"Unified Canadian Aboriginal Syllabics": range(5120, 5760),
|
72 |
+
"Ogham": range(5760, 5792),
|
73 |
+
"Runic": range(5792, 5888),
|
74 |
+
"Tagalog": range(5888, 5920),
|
75 |
+
"Hanunoo": range(5920, 5952),
|
76 |
+
"Buhid": range(5952, 5984),
|
77 |
+
"Tagbanwa": range(5984, 6016),
|
78 |
+
"Khmer": range(6016, 6144),
|
79 |
+
"Mongolian": range(6144, 6320),
|
80 |
+
"Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400),
|
81 |
+
"Limbu": range(6400, 6480),
|
82 |
+
"Tai Le": range(6480, 6528),
|
83 |
+
"New Tai Lue": range(6528, 6624),
|
84 |
+
"Khmer Symbols": range(6624, 6656),
|
85 |
+
"Buginese": range(6656, 6688),
|
86 |
+
"Tai Tham": range(6688, 6832),
|
87 |
+
"Combining Diacritical Marks Extended": range(6832, 6912),
|
88 |
+
"Balinese": range(6912, 7040),
|
89 |
+
"Sundanese": range(7040, 7104),
|
90 |
+
"Batak": range(7104, 7168),
|
91 |
+
"Lepcha": range(7168, 7248),
|
92 |
+
"Ol Chiki": range(7248, 7296),
|
93 |
+
"Cyrillic Extended-C": range(7296, 7312),
|
94 |
+
"Georgian Extended": range(7312, 7360),
|
95 |
+
"Sundanese Supplement": range(7360, 7376),
|
96 |
+
"Vedic Extensions": range(7376, 7424),
|
97 |
+
"Phonetic Extensions": range(7424, 7552),
|
98 |
+
"Phonetic Extensions Supplement": range(7552, 7616),
|
99 |
+
"Combining Diacritical Marks Supplement": range(7616, 7680),
|
100 |
+
"Latin Extended Additional": range(7680, 7936),
|
101 |
+
"Greek Extended": range(7936, 8192),
|
102 |
+
"General Punctuation": range(8192, 8304),
|
103 |
+
"Superscripts and Subscripts": range(8304, 8352),
|
104 |
+
"Currency Symbols": range(8352, 8400),
|
105 |
+
"Combining Diacritical Marks for Symbols": range(8400, 8448),
|
106 |
+
"Letterlike Symbols": range(8448, 8528),
|
107 |
+
"Number Forms": range(8528, 8592),
|
108 |
+
"Arrows": range(8592, 8704),
|
109 |
+
"Mathematical Operators": range(8704, 8960),
|
110 |
+
"Miscellaneous Technical": range(8960, 9216),
|
111 |
+
"Control Pictures": range(9216, 9280),
|
112 |
+
"Optical Character Recognition": range(9280, 9312),
|
113 |
+
"Enclosed Alphanumerics": range(9312, 9472),
|
114 |
+
"Box Drawing": range(9472, 9600),
|
115 |
+
"Block Elements": range(9600, 9632),
|
116 |
+
"Geometric Shapes": range(9632, 9728),
|
117 |
+
"Miscellaneous Symbols": range(9728, 9984),
|
118 |
+
"Dingbats": range(9984, 10176),
|
119 |
+
"Miscellaneous Mathematical Symbols-A": range(10176, 10224),
|
120 |
+
"Supplemental Arrows-A": range(10224, 10240),
|
121 |
+
"Braille Patterns": range(10240, 10496),
|
122 |
+
"Supplemental Arrows-B": range(10496, 10624),
|
123 |
+
"Miscellaneous Mathematical Symbols-B": range(10624, 10752),
|
124 |
+
"Supplemental Mathematical Operators": range(10752, 11008),
|
125 |
+
"Miscellaneous Symbols and Arrows": range(11008, 11264),
|
126 |
+
"Glagolitic": range(11264, 11360),
|
127 |
+
"Latin Extended-C": range(11360, 11392),
|
128 |
+
"Coptic": range(11392, 11520),
|
129 |
+
"Georgian Supplement": range(11520, 11568),
|
130 |
+
"Tifinagh": range(11568, 11648),
|
131 |
+
"Ethiopic Extended": range(11648, 11744),
|
132 |
+
"Cyrillic Extended-A": range(11744, 11776),
|
133 |
+
"Supplemental Punctuation": range(11776, 11904),
|
134 |
+
"CJK Radicals Supplement": range(11904, 12032),
|
135 |
+
"Kangxi Radicals": range(12032, 12256),
|
136 |
+
"Ideographic Description Characters": range(12272, 12288),
|
137 |
+
"CJK Symbols and Punctuation": range(12288, 12352),
|
138 |
+
"Hiragana": range(12352, 12448),
|
139 |
+
"Katakana": range(12448, 12544),
|
140 |
+
"Bopomofo": range(12544, 12592),
|
141 |
+
"Hangul Compatibility Jamo": range(12592, 12688),
|
142 |
+
"Kanbun": range(12688, 12704),
|
143 |
+
"Bopomofo Extended": range(12704, 12736),
|
144 |
+
"CJK Strokes": range(12736, 12784),
|
145 |
+
"Katakana Phonetic Extensions": range(12784, 12800),
|
146 |
+
"Enclosed CJK Letters and Months": range(12800, 13056),
|
147 |
+
"CJK Compatibility": range(13056, 13312),
|
148 |
+
"CJK Unified Ideographs Extension A": range(13312, 19904),
|
149 |
+
"Yijing Hexagram Symbols": range(19904, 19968),
|
150 |
+
"CJK Unified Ideographs": range(19968, 40960),
|
151 |
+
"Yi Syllables": range(40960, 42128),
|
152 |
+
"Yi Radicals": range(42128, 42192),
|
153 |
+
"Lisu": range(42192, 42240),
|
154 |
+
"Vai": range(42240, 42560),
|
155 |
+
"Cyrillic Extended-B": range(42560, 42656),
|
156 |
+
"Bamum": range(42656, 42752),
|
157 |
+
"Modifier Tone Letters": range(42752, 42784),
|
158 |
+
"Latin Extended-D": range(42784, 43008),
|
159 |
+
"Syloti Nagri": range(43008, 43056),
|
160 |
+
"Common Indic Number Forms": range(43056, 43072),
|
161 |
+
"Phags-pa": range(43072, 43136),
|
162 |
+
"Saurashtra": range(43136, 43232),
|
163 |
+
"Devanagari Extended": range(43232, 43264),
|
164 |
+
"Kayah Li": range(43264, 43312),
|
165 |
+
"Rejang": range(43312, 43360),
|
166 |
+
"Hangul Jamo Extended-A": range(43360, 43392),
|
167 |
+
"Javanese": range(43392, 43488),
|
168 |
+
"Myanmar Extended-B": range(43488, 43520),
|
169 |
+
"Cham": range(43520, 43616),
|
170 |
+
"Myanmar Extended-A": range(43616, 43648),
|
171 |
+
"Tai Viet": range(43648, 43744),
|
172 |
+
"Meetei Mayek Extensions": range(43744, 43776),
|
173 |
+
"Ethiopic Extended-A": range(43776, 43824),
|
174 |
+
"Latin Extended-E": range(43824, 43888),
|
175 |
+
"Cherokee Supplement": range(43888, 43968),
|
176 |
+
"Meetei Mayek": range(43968, 44032),
|
177 |
+
"Hangul Syllables": range(44032, 55216),
|
178 |
+
"Hangul Jamo Extended-B": range(55216, 55296),
|
179 |
+
"High Surrogates": range(55296, 56192),
|
180 |
+
"High Private Use Surrogates": range(56192, 56320),
|
181 |
+
"Low Surrogates": range(56320, 57344),
|
182 |
+
"Private Use Area": range(57344, 63744),
|
183 |
+
"CJK Compatibility Ideographs": range(63744, 64256),
|
184 |
+
"Alphabetic Presentation Forms": range(64256, 64336),
|
185 |
+
"Arabic Presentation Forms-A": range(64336, 65024),
|
186 |
+
"Variation Selectors": range(65024, 65040),
|
187 |
+
"Vertical Forms": range(65040, 65056),
|
188 |
+
"Combining Half Marks": range(65056, 65072),
|
189 |
+
"CJK Compatibility Forms": range(65072, 65104),
|
190 |
+
"Small Form Variants": range(65104, 65136),
|
191 |
+
"Arabic Presentation Forms-B": range(65136, 65280),
|
192 |
+
"Halfwidth and Fullwidth Forms": range(65280, 65520),
|
193 |
+
"Specials": range(65520, 65536),
|
194 |
+
"Linear B Syllabary": range(65536, 65664),
|
195 |
+
"Linear B Ideograms": range(65664, 65792),
|
196 |
+
"Aegean Numbers": range(65792, 65856),
|
197 |
+
"Ancient Greek Numbers": range(65856, 65936),
|
198 |
+
"Ancient Symbols": range(65936, 66000),
|
199 |
+
"Phaistos Disc": range(66000, 66048),
|
200 |
+
"Lycian": range(66176, 66208),
|
201 |
+
"Carian": range(66208, 66272),
|
202 |
+
"Coptic Epact Numbers": range(66272, 66304),
|
203 |
+
"Old Italic": range(66304, 66352),
|
204 |
+
"Gothic": range(66352, 66384),
|
205 |
+
"Old Permic": range(66384, 66432),
|
206 |
+
"Ugaritic": range(66432, 66464),
|
207 |
+
"Old Persian": range(66464, 66528),
|
208 |
+
"Deseret": range(66560, 66640),
|
209 |
+
"Shavian": range(66640, 66688),
|
210 |
+
"Osmanya": range(66688, 66736),
|
211 |
+
"Osage": range(66736, 66816),
|
212 |
+
"Elbasan": range(66816, 66864),
|
213 |
+
"Caucasian Albanian": range(66864, 66928),
|
214 |
+
"Vithkuqi": range(66928, 67008),
|
215 |
+
"Linear A": range(67072, 67456),
|
216 |
+
"Latin Extended-F": range(67456, 67520),
|
217 |
+
"Cypriot Syllabary": range(67584, 67648),
|
218 |
+
"Imperial Aramaic": range(67648, 67680),
|
219 |
+
"Palmyrene": range(67680, 67712),
|
220 |
+
"Nabataean": range(67712, 67760),
|
221 |
+
"Hatran": range(67808, 67840),
|
222 |
+
"Phoenician": range(67840, 67872),
|
223 |
+
"Lydian": range(67872, 67904),
|
224 |
+
"Meroitic Hieroglyphs": range(67968, 68000),
|
225 |
+
"Meroitic Cursive": range(68000, 68096),
|
226 |
+
"Kharoshthi": range(68096, 68192),
|
227 |
+
"Old South Arabian": range(68192, 68224),
|
228 |
+
"Old North Arabian": range(68224, 68256),
|
229 |
+
"Manichaean": range(68288, 68352),
|
230 |
+
"Avestan": range(68352, 68416),
|
231 |
+
"Inscriptional Parthian": range(68416, 68448),
|
232 |
+
"Inscriptional Pahlavi": range(68448, 68480),
|
233 |
+
"Psalter Pahlavi": range(68480, 68528),
|
234 |
+
"Old Turkic": range(68608, 68688),
|
235 |
+
"Old Hungarian": range(68736, 68864),
|
236 |
+
"Hanifi Rohingya": range(68864, 68928),
|
237 |
+
"Rumi Numeral Symbols": range(69216, 69248),
|
238 |
+
"Yezidi": range(69248, 69312),
|
239 |
+
"Arabic Extended-C": range(69312, 69376),
|
240 |
+
"Old Sogdian": range(69376, 69424),
|
241 |
+
"Sogdian": range(69424, 69488),
|
242 |
+
"Old Uyghur": range(69488, 69552),
|
243 |
+
"Chorasmian": range(69552, 69600),
|
244 |
+
"Elymaic": range(69600, 69632),
|
245 |
+
"Brahmi": range(69632, 69760),
|
246 |
+
"Kaithi": range(69760, 69840),
|
247 |
+
"Sora Sompeng": range(69840, 69888),
|
248 |
+
"Chakma": range(69888, 69968),
|
249 |
+
"Mahajani": range(69968, 70016),
|
250 |
+
"Sharada": range(70016, 70112),
|
251 |
+
"Sinhala Archaic Numbers": range(70112, 70144),
|
252 |
+
"Khojki": range(70144, 70224),
|
253 |
+
"Multani": range(70272, 70320),
|
254 |
+
"Khudawadi": range(70320, 70400),
|
255 |
+
"Grantha": range(70400, 70528),
|
256 |
+
"Newa": range(70656, 70784),
|
257 |
+
"Tirhuta": range(70784, 70880),
|
258 |
+
"Siddham": range(71040, 71168),
|
259 |
+
"Modi": range(71168, 71264),
|
260 |
+
"Mongolian Supplement": range(71264, 71296),
|
261 |
+
"Takri": range(71296, 71376),
|
262 |
+
"Ahom": range(71424, 71504),
|
263 |
+
"Dogra": range(71680, 71760),
|
264 |
+
"Warang Citi": range(71840, 71936),
|
265 |
+
"Dives Akuru": range(71936, 72032),
|
266 |
+
"Nandinagari": range(72096, 72192),
|
267 |
+
"Zanabazar Square": range(72192, 72272),
|
268 |
+
"Soyombo": range(72272, 72368),
|
269 |
+
"Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
|
270 |
+
"Pau Cin Hau": range(72384, 72448),
|
271 |
+
"Devanagari Extended-A": range(72448, 72544),
|
272 |
+
"Bhaiksuki": range(72704, 72816),
|
273 |
+
"Marchen": range(72816, 72896),
|
274 |
+
"Masaram Gondi": range(72960, 73056),
|
275 |
+
"Gunjala Gondi": range(73056, 73136),
|
276 |
+
"Makasar": range(73440, 73472),
|
277 |
+
"Kawi": range(73472, 73568),
|
278 |
+
"Lisu Supplement": range(73648, 73664),
|
279 |
+
"Tamil Supplement": range(73664, 73728),
|
280 |
+
"Cuneiform": range(73728, 74752),
|
281 |
+
"Cuneiform Numbers and Punctuation": range(74752, 74880),
|
282 |
+
"Early Dynastic Cuneiform": range(74880, 75088),
|
283 |
+
"Cypro-Minoan": range(77712, 77824),
|
284 |
+
"Egyptian Hieroglyphs": range(77824, 78896),
|
285 |
+
"Egyptian Hieroglyph Format Controls": range(78896, 78944),
|
286 |
+
"Anatolian Hieroglyphs": range(82944, 83584),
|
287 |
+
"Bamum Supplement": range(92160, 92736),
|
288 |
+
"Mro": range(92736, 92784),
|
289 |
+
"Tangsa": range(92784, 92880),
|
290 |
+
"Bassa Vah": range(92880, 92928),
|
291 |
+
"Pahawh Hmong": range(92928, 93072),
|
292 |
+
"Medefaidrin": range(93760, 93856),
|
293 |
+
"Miao": range(93952, 94112),
|
294 |
+
"Ideographic Symbols and Punctuation": range(94176, 94208),
|
295 |
+
"Tangut": range(94208, 100352),
|
296 |
+
"Tangut Components": range(100352, 101120),
|
297 |
+
"Khitan Small Script": range(101120, 101632),
|
298 |
+
"Tangut Supplement": range(101632, 101760),
|
299 |
+
"Kana Extended-B": range(110576, 110592),
|
300 |
+
"Kana Supplement": range(110592, 110848),
|
301 |
+
"Kana Extended-A": range(110848, 110896),
|
302 |
+
"Small Kana Extension": range(110896, 110960),
|
303 |
+
"Nushu": range(110960, 111360),
|
304 |
+
"Duployan": range(113664, 113824),
|
305 |
+
"Shorthand Format Controls": range(113824, 113840),
|
306 |
+
"Znamenny Musical Notation": range(118528, 118736),
|
307 |
+
"Byzantine Musical Symbols": range(118784, 119040),
|
308 |
+
"Musical Symbols": range(119040, 119296),
|
309 |
+
"Ancient Greek Musical Notation": range(119296, 119376),
|
310 |
+
"Kaktovik Numerals": range(119488, 119520),
|
311 |
+
"Mayan Numerals": range(119520, 119552),
|
312 |
+
"Tai Xuan Jing Symbols": range(119552, 119648),
|
313 |
+
"Counting Rod Numerals": range(119648, 119680),
|
314 |
+
"Mathematical Alphanumeric Symbols": range(119808, 120832),
|
315 |
+
"Sutton SignWriting": range(120832, 121520),
|
316 |
+
"Latin Extended-G": range(122624, 122880),
|
317 |
+
"Glagolitic Supplement": range(122880, 122928),
|
318 |
+
"Cyrillic Extended-D": range(122928, 123024),
|
319 |
+
"Nyiakeng Puachue Hmong": range(123136, 123216),
|
320 |
+
"Toto": range(123536, 123584),
|
321 |
+
"Wancho": range(123584, 123648),
|
322 |
+
"Nag Mundari": range(124112, 124160),
|
323 |
+
"Ethiopic Extended-B": range(124896, 124928),
|
324 |
+
"Mende Kikakui": range(124928, 125152),
|
325 |
+
"Adlam": range(125184, 125280),
|
326 |
+
"Indic Siyaq Numbers": range(126064, 126144),
|
327 |
+
"Ottoman Siyaq Numbers": range(126208, 126288),
|
328 |
+
"Arabic Mathematical Alphabetic Symbols": range(126464, 126720),
|
329 |
+
"Mahjong Tiles": range(126976, 127024),
|
330 |
+
"Domino Tiles": range(127024, 127136),
|
331 |
+
"Playing Cards": range(127136, 127232),
|
332 |
+
"Enclosed Alphanumeric Supplement": range(127232, 127488),
|
333 |
+
"Enclosed Ideographic Supplement": range(127488, 127744),
|
334 |
+
"Miscellaneous Symbols and Pictographs": range(127744, 128512),
|
335 |
+
"Emoticons range(Emoji)": range(128512, 128592),
|
336 |
+
"Ornamental Dingbats": range(128592, 128640),
|
337 |
+
"Transport and Map Symbols": range(128640, 128768),
|
338 |
+
"Alchemical Symbols": range(128768, 128896),
|
339 |
+
"Geometric Shapes Extended": range(128896, 129024),
|
340 |
+
"Supplemental Arrows-C": range(129024, 129280),
|
341 |
+
"Supplemental Symbols and Pictographs": range(129280, 129536),
|
342 |
+
"Chess Symbols": range(129536, 129648),
|
343 |
+
"Symbols and Pictographs Extended-A": range(129648, 129792),
|
344 |
+
"Symbols for Legacy Computing": range(129792, 130048),
|
345 |
+
"CJK Unified Ideographs Extension B": range(131072, 173792),
|
346 |
+
"CJK Unified Ideographs Extension C": range(173824, 177984),
|
347 |
+
"CJK Unified Ideographs Extension D": range(177984, 178208),
|
348 |
+
"CJK Unified Ideographs Extension E": range(178208, 183984),
|
349 |
+
"CJK Unified Ideographs Extension F": range(183984, 191472),
|
350 |
+
"CJK Compatibility Ideographs Supplement": range(194560, 195104),
|
351 |
+
"CJK Unified Ideographs Extension G": range(196608, 201552),
|
352 |
+
"CJK Unified Ideographs Extension H": range(201552, 205744),
|
353 |
+
"Tags": range(917504, 917632),
|
354 |
+
"Variation Selectors Supplement": range(917760, 918000),
|
355 |
+
"Supplementary Private Use Area-A": range(983040, 1048576),
|
356 |
+
"Supplementary Private Use Area-B": range(1048576, 1114112),
|
357 |
+
}
|
358 |
+
|
359 |
+
|
360 |
+
UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
|
361 |
+
"Supplement",
|
362 |
+
"Extended",
|
363 |
+
"Extensions",
|
364 |
+
"Modifier",
|
365 |
+
"Marks",
|
366 |
+
"Punctuation",
|
367 |
+
"Symbols",
|
368 |
+
"Forms",
|
369 |
+
"Operators",
|
370 |
+
"Miscellaneous",
|
371 |
+
"Drawing",
|
372 |
+
"Block",
|
373 |
+
"Shapes",
|
374 |
+
"Supplemental",
|
375 |
+
"Tags",
|
376 |
+
]
|
377 |
+
|
378 |
+
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
|
379 |
+
r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
|
380 |
+
IGNORECASE,
|
381 |
+
)
|
382 |
+
|
383 |
+
IANA_NO_ALIASES = [
|
384 |
+
"cp720",
|
385 |
+
"cp737",
|
386 |
+
"cp856",
|
387 |
+
"cp874",
|
388 |
+
"cp875",
|
389 |
+
"cp1006",
|
390 |
+
"koi8_r",
|
391 |
+
"koi8_t",
|
392 |
+
"koi8_u",
|
393 |
+
]
|
394 |
+
|
395 |
+
IANA_SUPPORTED: List[str] = sorted(
|
396 |
+
filter(
|
397 |
+
lambda x: x.endswith("_codec") is False
|
398 |
+
and x not in {"rot_13", "tactis", "mbcs"},
|
399 |
+
list(set(aliases.values())) + IANA_NO_ALIASES,
|
400 |
+
)
|
401 |
+
)
|
402 |
+
|
403 |
+
IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
|
404 |
+
|
405 |
+
# pre-computed code page that are similar using the function cp_similarity.
|
406 |
+
IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
|
407 |
+
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
|
408 |
+
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
|
409 |
+
"cp1125": ["cp866"],
|
410 |
+
"cp1140": ["cp037", "cp1026", "cp273", "cp500"],
|
411 |
+
"cp1250": ["iso8859_2"],
|
412 |
+
"cp1251": ["kz1048", "ptcp154"],
|
413 |
+
"cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
|
414 |
+
"cp1253": ["iso8859_7"],
|
415 |
+
"cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
|
416 |
+
"cp1257": ["iso8859_13"],
|
417 |
+
"cp273": ["cp037", "cp1026", "cp1140", "cp500"],
|
418 |
+
"cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
|
419 |
+
"cp500": ["cp037", "cp1026", "cp1140", "cp273"],
|
420 |
+
"cp850": ["cp437", "cp857", "cp858", "cp865"],
|
421 |
+
"cp857": ["cp850", "cp858", "cp865"],
|
422 |
+
"cp858": ["cp437", "cp850", "cp857", "cp865"],
|
423 |
+
"cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
|
424 |
+
"cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
|
425 |
+
"cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
|
426 |
+
"cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
|
427 |
+
"cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
|
428 |
+
"cp866": ["cp1125"],
|
429 |
+
"iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
|
430 |
+
"iso8859_11": ["tis_620"],
|
431 |
+
"iso8859_13": ["cp1257"],
|
432 |
+
"iso8859_14": [
|
433 |
+
"iso8859_10",
|
434 |
+
"iso8859_15",
|
435 |
+
"iso8859_16",
|
436 |
+
"iso8859_3",
|
437 |
+
"iso8859_9",
|
438 |
+
"latin_1",
|
439 |
+
],
|
440 |
+
"iso8859_15": [
|
441 |
+
"cp1252",
|
442 |
+
"cp1254",
|
443 |
+
"iso8859_10",
|
444 |
+
"iso8859_14",
|
445 |
+
"iso8859_16",
|
446 |
+
"iso8859_3",
|
447 |
+
"iso8859_9",
|
448 |
+
"latin_1",
|
449 |
+
],
|
450 |
+
"iso8859_16": [
|
451 |
+
"iso8859_14",
|
452 |
+
"iso8859_15",
|
453 |
+
"iso8859_2",
|
454 |
+
"iso8859_3",
|
455 |
+
"iso8859_9",
|
456 |
+
"latin_1",
|
457 |
+
],
|
458 |
+
"iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
|
459 |
+
"iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
|
460 |
+
"iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
|
461 |
+
"iso8859_7": ["cp1253"],
|
462 |
+
"iso8859_9": [
|
463 |
+
"cp1252",
|
464 |
+
"cp1254",
|
465 |
+
"cp1258",
|
466 |
+
"iso8859_10",
|
467 |
+
"iso8859_14",
|
468 |
+
"iso8859_15",
|
469 |
+
"iso8859_16",
|
470 |
+
"iso8859_3",
|
471 |
+
"iso8859_4",
|
472 |
+
"latin_1",
|
473 |
+
],
|
474 |
+
"kz1048": ["cp1251", "ptcp154"],
|
475 |
+
"latin_1": [
|
476 |
+
"cp1252",
|
477 |
+
"cp1254",
|
478 |
+
"cp1258",
|
479 |
+
"iso8859_10",
|
480 |
+
"iso8859_14",
|
481 |
+
"iso8859_15",
|
482 |
+
"iso8859_16",
|
483 |
+
"iso8859_3",
|
484 |
+
"iso8859_4",
|
485 |
+
"iso8859_9",
|
486 |
+
],
|
487 |
+
"mac_iceland": ["mac_roman", "mac_turkish"],
|
488 |
+
"mac_roman": ["mac_iceland", "mac_turkish"],
|
489 |
+
"mac_turkish": ["mac_iceland", "mac_roman"],
|
490 |
+
"ptcp154": ["cp1251", "kz1048"],
|
491 |
+
"tis_620": ["iso8859_11"],
|
492 |
+
}
|
493 |
+
|
494 |
+
|
495 |
+
CHARDET_CORRESPONDENCE: Dict[str, str] = {
|
496 |
+
"iso2022_kr": "ISO-2022-KR",
|
497 |
+
"iso2022_jp": "ISO-2022-JP",
|
498 |
+
"euc_kr": "EUC-KR",
|
499 |
+
"tis_620": "TIS-620",
|
500 |
+
"utf_32": "UTF-32",
|
501 |
+
"euc_jp": "EUC-JP",
|
502 |
+
"koi8_r": "KOI8-R",
|
503 |
+
"iso8859_1": "ISO-8859-1",
|
504 |
+
"iso8859_2": "ISO-8859-2",
|
505 |
+
"iso8859_5": "ISO-8859-5",
|
506 |
+
"iso8859_6": "ISO-8859-6",
|
507 |
+
"iso8859_7": "ISO-8859-7",
|
508 |
+
"iso8859_8": "ISO-8859-8",
|
509 |
+
"utf_16": "UTF-16",
|
510 |
+
"cp855": "IBM855",
|
511 |
+
"mac_cyrillic": "MacCyrillic",
|
512 |
+
"gb2312": "GB2312",
|
513 |
+
"gb18030": "GB18030",
|
514 |
+
"cp932": "CP932",
|
515 |
+
"cp866": "IBM866",
|
516 |
+
"utf_8": "utf-8",
|
517 |
+
"utf_8_sig": "UTF-8-SIG",
|
518 |
+
"shift_jis": "SHIFT_JIS",
|
519 |
+
"big5": "Big5",
|
520 |
+
"cp1250": "windows-1250",
|
521 |
+
"cp1251": "windows-1251",
|
522 |
+
"cp1252": "Windows-1252",
|
523 |
+
"cp1253": "windows-1253",
|
524 |
+
"cp1255": "windows-1255",
|
525 |
+
"cp1256": "windows-1256",
|
526 |
+
"cp1254": "Windows-1254",
|
527 |
+
"cp949": "CP949",
|
528 |
+
}
|
529 |
+
|
530 |
+
|
531 |
+
COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
|
532 |
+
"<",
|
533 |
+
">",
|
534 |
+
"=",
|
535 |
+
":",
|
536 |
+
"/",
|
537 |
+
"&",
|
538 |
+
";",
|
539 |
+
"{",
|
540 |
+
"}",
|
541 |
+
"[",
|
542 |
+
"]",
|
543 |
+
",",
|
544 |
+
"|",
|
545 |
+
'"',
|
546 |
+
"-",
|
547 |
+
}
|
548 |
+
|
549 |
+
|
550 |
+
KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
|
551 |
+
ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
|
552 |
+
|
553 |
+
# Logging LEVEL below DEBUG
|
554 |
+
TRACE: int = 5
|
555 |
+
|
556 |
+
|
557 |
+
# Language label that contain the em dash "—"
|
558 |
+
# character are to be considered alternative seq to origin
|
559 |
+
FREQUENCIES: Dict[str, List[str]] = {
|
560 |
+
"English": [
|
561 |
+
"e",
|
562 |
+
"a",
|
563 |
+
"t",
|
564 |
+
"i",
|
565 |
+
"o",
|
566 |
+
"n",
|
567 |
+
"s",
|
568 |
+
"r",
|
569 |
+
"h",
|
570 |
+
"l",
|
571 |
+
"d",
|
572 |
+
"c",
|
573 |
+
"u",
|
574 |
+
"m",
|
575 |
+
"f",
|
576 |
+
"p",
|
577 |
+
"g",
|
578 |
+
"w",
|
579 |
+
"y",
|
580 |
+
"b",
|
581 |
+
"v",
|
582 |
+
"k",
|
583 |
+
"x",
|
584 |
+
"j",
|
585 |
+
"z",
|
586 |
+
"q",
|
587 |
+
],
|
588 |
+
"English—": [
|
589 |
+
"e",
|
590 |
+
"a",
|
591 |
+
"t",
|
592 |
+
"i",
|
593 |
+
"o",
|
594 |
+
"n",
|
595 |
+
"s",
|
596 |
+
"r",
|
597 |
+
"h",
|
598 |
+
"l",
|
599 |
+
"d",
|
600 |
+
"c",
|
601 |
+
"m",
|
602 |
+
"u",
|
603 |
+
"f",
|
604 |
+
"p",
|
605 |
+
"g",
|
606 |
+
"w",
|
607 |
+
"b",
|
608 |
+
"y",
|
609 |
+
"v",
|
610 |
+
"k",
|
611 |
+
"j",
|
612 |
+
"x",
|
613 |
+
"z",
|
614 |
+
"q",
|
615 |
+
],
|
616 |
+
"German": [
|
617 |
+
"e",
|
618 |
+
"n",
|
619 |
+
"i",
|
620 |
+
"r",
|
621 |
+
"s",
|
622 |
+
"t",
|
623 |
+
"a",
|
624 |
+
"d",
|
625 |
+
"h",
|
626 |
+
"u",
|
627 |
+
"l",
|
628 |
+
"g",
|
629 |
+
"o",
|
630 |
+
"c",
|
631 |
+
"m",
|
632 |
+
"b",
|
633 |
+
"f",
|
634 |
+
"k",
|
635 |
+
"w",
|
636 |
+
"z",
|
637 |
+
"p",
|
638 |
+
"v",
|
639 |
+
"ü",
|
640 |
+
"ä",
|
641 |
+
"ö",
|
642 |
+
"j",
|
643 |
+
],
|
644 |
+
"French": [
|
645 |
+
"e",
|
646 |
+
"a",
|
647 |
+
"s",
|
648 |
+
"n",
|
649 |
+
"i",
|
650 |
+
"t",
|
651 |
+
"r",
|
652 |
+
"l",
|
653 |
+
"u",
|
654 |
+
"o",
|
655 |
+
"d",
|
656 |
+
"c",
|
657 |
+
"p",
|
658 |
+
"m",
|
659 |
+
"é",
|
660 |
+
"v",
|
661 |
+
"g",
|
662 |
+
"f",
|
663 |
+
"b",
|
664 |
+
"h",
|
665 |
+
"q",
|
666 |
+
"à",
|
667 |
+
"x",
|
668 |
+
"è",
|
669 |
+
"y",
|
670 |
+
"j",
|
671 |
+
],
|
672 |
+
"Dutch": [
|
673 |
+
"e",
|
674 |
+
"n",
|
675 |
+
"a",
|
676 |
+
"i",
|
677 |
+
"r",
|
678 |
+
"t",
|
679 |
+
"o",
|
680 |
+
"d",
|
681 |
+
"s",
|
682 |
+
"l",
|
683 |
+
"g",
|
684 |
+
"h",
|
685 |
+
"v",
|
686 |
+
"m",
|
687 |
+
"u",
|
688 |
+
"k",
|
689 |
+
"c",
|
690 |
+
"p",
|
691 |
+
"b",
|
692 |
+
"w",
|
693 |
+
"j",
|
694 |
+
"z",
|
695 |
+
"f",
|
696 |
+
"y",
|
697 |
+
"x",
|
698 |
+
"ë",
|
699 |
+
],
|
700 |
+
"Italian": [
|
701 |
+
"e",
|
702 |
+
"i",
|
703 |
+
"a",
|
704 |
+
"o",
|
705 |
+
"n",
|
706 |
+
"l",
|
707 |
+
"t",
|
708 |
+
"r",
|
709 |
+
"s",
|
710 |
+
"c",
|
711 |
+
"d",
|
712 |
+
"u",
|
713 |
+
"p",
|
714 |
+
"m",
|
715 |
+
"g",
|
716 |
+
"v",
|
717 |
+
"f",
|
718 |
+
"b",
|
719 |
+
"z",
|
720 |
+
"h",
|
721 |
+
"q",
|
722 |
+
"è",
|
723 |
+
"à",
|
724 |
+
"k",
|
725 |
+
"y",
|
726 |
+
"ò",
|
727 |
+
],
|
728 |
+
"Polish": [
|
729 |
+
"a",
|
730 |
+
"i",
|
731 |
+
"o",
|
732 |
+
"e",
|
733 |
+
"n",
|
734 |
+
"r",
|
735 |
+
"z",
|
736 |
+
"w",
|
737 |
+
"s",
|
738 |
+
"c",
|
739 |
+
"t",
|
740 |
+
"k",
|
741 |
+
"y",
|
742 |
+
"d",
|
743 |
+
"p",
|
744 |
+
"m",
|
745 |
+
"u",
|
746 |
+
"l",
|
747 |
+
"j",
|
748 |
+
"ł",
|
749 |
+
"g",
|
750 |
+
"b",
|
751 |
+
"h",
|
752 |
+
"ą",
|
753 |
+
"ę",
|
754 |
+
"ó",
|
755 |
+
],
|
756 |
+
"Spanish": [
|
757 |
+
"e",
|
758 |
+
"a",
|
759 |
+
"o",
|
760 |
+
"n",
|
761 |
+
"s",
|
762 |
+
"r",
|
763 |
+
"i",
|
764 |
+
"l",
|
765 |
+
"d",
|
766 |
+
"t",
|
767 |
+
"c",
|
768 |
+
"u",
|
769 |
+
"m",
|
770 |
+
"p",
|
771 |
+
"b",
|
772 |
+
"g",
|
773 |
+
"v",
|
774 |
+
"f",
|
775 |
+
"y",
|
776 |
+
"ó",
|
777 |
+
"h",
|
778 |
+
"q",
|
779 |
+
"í",
|
780 |
+
"j",
|
781 |
+
"z",
|
782 |
+
"á",
|
783 |
+
],
|
784 |
+
"Russian": [
|
785 |
+
"о",
|
786 |
+
"а",
|
787 |
+
"е",
|
788 |
+
"и",
|
789 |
+
"н",
|
790 |
+
"с",
|
791 |
+
"т",
|
792 |
+
"р",
|
793 |
+
"в",
|
794 |
+
"л",
|
795 |
+
"к",
|
796 |
+
"м",
|
797 |
+
"д",
|
798 |
+
"п",
|
799 |
+
"у",
|
800 |
+
"г",
|
801 |
+
"я",
|
802 |
+
"ы",
|
803 |
+
"з",
|
804 |
+
"б",
|
805 |
+
"й",
|
806 |
+
"ь",
|
807 |
+
"ч",
|
808 |
+
"х",
|
809 |
+
"ж",
|
810 |
+
"ц",
|
811 |
+
],
|
812 |
+
# Jap-Kanji
|
813 |
+
"Japanese": [
|
814 |
+
"人",
|
815 |
+
"一",
|
816 |
+
"大",
|
817 |
+
"亅",
|
818 |
+
"丁",
|
819 |
+
"丨",
|
820 |
+
"竹",
|
821 |
+
"笑",
|
822 |
+
"口",
|
823 |
+
"日",
|
824 |
+
"今",
|
825 |
+
"二",
|
826 |
+
"彳",
|
827 |
+
"行",
|
828 |
+
"十",
|
829 |
+
"土",
|
830 |
+
"丶",
|
831 |
+
"寸",
|
832 |
+
"寺",
|
833 |
+
"時",
|
834 |
+
"乙",
|
835 |
+
"丿",
|
836 |
+
"乂",
|
837 |
+
"气",
|
838 |
+
"気",
|
839 |
+
"冂",
|
840 |
+
"巾",
|
841 |
+
"亠",
|
842 |
+
"市",
|
843 |
+
"目",
|
844 |
+
"儿",
|
845 |
+
"見",
|
846 |
+
"八",
|
847 |
+
"小",
|
848 |
+
"凵",
|
849 |
+
"県",
|
850 |
+
"月",
|
851 |
+
"彐",
|
852 |
+
"門",
|
853 |
+
"間",
|
854 |
+
"木",
|
855 |
+
"東",
|
856 |
+
"山",
|
857 |
+
"出",
|
858 |
+
"本",
|
859 |
+
"中",
|
860 |
+
"刀",
|
861 |
+
"分",
|
862 |
+
"耳",
|
863 |
+
"又",
|
864 |
+
"取",
|
865 |
+
"最",
|
866 |
+
"言",
|
867 |
+
"田",
|
868 |
+
"心",
|
869 |
+
"思",
|
870 |
+
"刂",
|
871 |
+
"前",
|
872 |
+
"京",
|
873 |
+
"尹",
|
874 |
+
"事",
|
875 |
+
"生",
|
876 |
+
"厶",
|
877 |
+
"云",
|
878 |
+
"会",
|
879 |
+
"未",
|
880 |
+
"来",
|
881 |
+
"白",
|
882 |
+
"冫",
|
883 |
+
"楽",
|
884 |
+
"灬",
|
885 |
+
"馬",
|
886 |
+
"尸",
|
887 |
+
"尺",
|
888 |
+
"駅",
|
889 |
+
"明",
|
890 |
+
"耂",
|
891 |
+
"者",
|
892 |
+
"了",
|
893 |
+
"阝",
|
894 |
+
"都",
|
895 |
+
"高",
|
896 |
+
"卜",
|
897 |
+
"占",
|
898 |
+
"厂",
|
899 |
+
"广",
|
900 |
+
"店",
|
901 |
+
"子",
|
902 |
+
"申",
|
903 |
+
"奄",
|
904 |
+
"亻",
|
905 |
+
"俺",
|
906 |
+
"上",
|
907 |
+
"方",
|
908 |
+
"冖",
|
909 |
+
"学",
|
910 |
+
"衣",
|
911 |
+
"艮",
|
912 |
+
"食",
|
913 |
+
"自",
|
914 |
+
],
|
915 |
+
# Jap-Katakana
|
916 |
+
"Japanese—": [
|
917 |
+
"ー",
|
918 |
+
"ン",
|
919 |
+
"ス",
|
920 |
+
"・",
|
921 |
+
"ル",
|
922 |
+
"ト",
|
923 |
+
"リ",
|
924 |
+
"イ",
|
925 |
+
"ア",
|
926 |
+
"ラ",
|
927 |
+
"ッ",
|
928 |
+
"ク",
|
929 |
+
"ド",
|
930 |
+
"シ",
|
931 |
+
"レ",
|
932 |
+
"ジ",
|
933 |
+
"タ",
|
934 |
+
"フ",
|
935 |
+
"ロ",
|
936 |
+
"カ",
|
937 |
+
"テ",
|
938 |
+
"マ",
|
939 |
+
"ィ",
|
940 |
+
"グ",
|
941 |
+
"バ",
|
942 |
+
"ム",
|
943 |
+
"プ",
|
944 |
+
"オ",
|
945 |
+
"コ",
|
946 |
+
"デ",
|
947 |
+
"ニ",
|
948 |
+
"ウ",
|
949 |
+
"メ",
|
950 |
+
"サ",
|
951 |
+
"ビ",
|
952 |
+
"ナ",
|
953 |
+
"ブ",
|
954 |
+
"ャ",
|
955 |
+
"エ",
|
956 |
+
"ュ",
|
957 |
+
"チ",
|
958 |
+
"キ",
|
959 |
+
"ズ",
|
960 |
+
"ダ",
|
961 |
+
"パ",
|
962 |
+
"ミ",
|
963 |
+
"ェ",
|
964 |
+
"ョ",
|
965 |
+
"ハ",
|
966 |
+
"セ",
|
967 |
+
"ベ",
|
968 |
+
"ガ",
|
969 |
+
"モ",
|
970 |
+
"ツ",
|
971 |
+
"ネ",
|
972 |
+
"ボ",
|
973 |
+
"ソ",
|
974 |
+
"ノ",
|
975 |
+
"ァ",
|
976 |
+
"ヴ",
|
977 |
+
"ワ",
|
978 |
+
"ポ",
|
979 |
+
"ペ",
|
980 |
+
"ピ",
|
981 |
+
"ケ",
|
982 |
+
"ゴ",
|
983 |
+
"ギ",
|
984 |
+
"ザ",
|
985 |
+
"ホ",
|
986 |
+
"ゲ",
|
987 |
+
"ォ",
|
988 |
+
"ヤ",
|
989 |
+
"ヒ",
|
990 |
+
"ユ",
|
991 |
+
"ヨ",
|
992 |
+
"ヘ",
|
993 |
+
"ゼ",
|
994 |
+
"ヌ",
|
995 |
+
"ゥ",
|
996 |
+
"ゾ",
|
997 |
+
"ヶ",
|
998 |
+
"ヂ",
|
999 |
+
"ヲ",
|
1000 |
+
"ヅ",
|
1001 |
+
"ヵ",
|
1002 |
+
"ヱ",
|
1003 |
+
"ヰ",
|
1004 |
+
"ヮ",
|
1005 |
+
"ヽ",
|
1006 |
+
"゠",
|
1007 |
+
"ヾ",
|
1008 |
+
"ヷ",
|
1009 |
+
"ヿ",
|
1010 |
+
"ヸ",
|
1011 |
+
"ヹ",
|
1012 |
+
"ヺ",
|
1013 |
+
],
|
1014 |
+
# Jap-Hiragana
|
1015 |
+
"Japanese——": [
|
1016 |
+
"の",
|
1017 |
+
"に",
|
1018 |
+
"る",
|
1019 |
+
"た",
|
1020 |
+
"と",
|
1021 |
+
"は",
|
1022 |
+
"し",
|
1023 |
+
"い",
|
1024 |
+
"を",
|
1025 |
+
"で",
|
1026 |
+
"て",
|
1027 |
+
"が",
|
1028 |
+
"な",
|
1029 |
+
"れ",
|
1030 |
+
"か",
|
1031 |
+
"ら",
|
1032 |
+
"さ",
|
1033 |
+
"っ",
|
1034 |
+
"り",
|
1035 |
+
"す",
|
1036 |
+
"あ",
|
1037 |
+
"も",
|
1038 |
+
"こ",
|
1039 |
+
"ま",
|
1040 |
+
"う",
|
1041 |
+
"く",
|
1042 |
+
"よ",
|
1043 |
+
"き",
|
1044 |
+
"ん",
|
1045 |
+
"め",
|
1046 |
+
"お",
|
1047 |
+
"け",
|
1048 |
+
"そ",
|
1049 |
+
"つ",
|
1050 |
+
"だ",
|
1051 |
+
"や",
|
1052 |
+
"え",
|
1053 |
+
"ど",
|
1054 |
+
"わ",
|
1055 |
+
"ち",
|
1056 |
+
"み",
|
1057 |
+
"せ",
|
1058 |
+
"じ",
|
1059 |
+
"ば",
|
1060 |
+
"へ",
|
1061 |
+
"び",
|
1062 |
+
"ず",
|
1063 |
+
"ろ",
|
1064 |
+
"ほ",
|
1065 |
+
"げ",
|
1066 |
+
"む",
|
1067 |
+
"べ",
|
1068 |
+
"ひ",
|
1069 |
+
"ょ",
|
1070 |
+
"ゆ",
|
1071 |
+
"ぶ",
|
1072 |
+
"ご",
|
1073 |
+
"ゃ",
|
1074 |
+
"ね",
|
1075 |
+
"ふ",
|
1076 |
+
"ぐ",
|
1077 |
+
"ぎ",
|
1078 |
+
"ぼ",
|
1079 |
+
"ゅ",
|
1080 |
+
"づ",
|
1081 |
+
"ざ",
|
1082 |
+
"ぞ",
|
1083 |
+
"ぬ",
|
1084 |
+
"ぜ",
|
1085 |
+
"ぱ",
|
1086 |
+
"ぽ",
|
1087 |
+
"ぷ",
|
1088 |
+
"ぴ",
|
1089 |
+
"ぃ",
|
1090 |
+
"ぁ",
|
1091 |
+
"ぇ",
|
1092 |
+
"ぺ",
|
1093 |
+
"ゞ",
|
1094 |
+
"ぢ",
|
1095 |
+
"ぉ",
|
1096 |
+
"ぅ",
|
1097 |
+
"ゐ",
|
1098 |
+
"ゝ",
|
1099 |
+
"ゑ",
|
1100 |
+
"゛",
|
1101 |
+
"゜",
|
1102 |
+
"ゎ",
|
1103 |
+
"ゔ",
|
1104 |
+
"゚",
|
1105 |
+
"ゟ",
|
1106 |
+
"゙",
|
1107 |
+
"ゕ",
|
1108 |
+
"ゖ",
|
1109 |
+
],
|
1110 |
+
"Portuguese": [
|
1111 |
+
"a",
|
1112 |
+
"e",
|
1113 |
+
"o",
|
1114 |
+
"s",
|
1115 |
+
"i",
|
1116 |
+
"r",
|
1117 |
+
"d",
|
1118 |
+
"n",
|
1119 |
+
"t",
|
1120 |
+
"m",
|
1121 |
+
"u",
|
1122 |
+
"c",
|
1123 |
+
"l",
|
1124 |
+
"p",
|
1125 |
+
"g",
|
1126 |
+
"v",
|
1127 |
+
"b",
|
1128 |
+
"f",
|
1129 |
+
"h",
|
1130 |
+
"ã",
|
1131 |
+
"q",
|
1132 |
+
"é",
|
1133 |
+
"ç",
|
1134 |
+
"á",
|
1135 |
+
"z",
|
1136 |
+
"í",
|
1137 |
+
],
|
1138 |
+
"Swedish": [
|
1139 |
+
"e",
|
1140 |
+
"a",
|
1141 |
+
"n",
|
1142 |
+
"r",
|
1143 |
+
"t",
|
1144 |
+
"s",
|
1145 |
+
"i",
|
1146 |
+
"l",
|
1147 |
+
"d",
|
1148 |
+
"o",
|
1149 |
+
"m",
|
1150 |
+
"k",
|
1151 |
+
"g",
|
1152 |
+
"v",
|
1153 |
+
"h",
|
1154 |
+
"f",
|
1155 |
+
"u",
|
1156 |
+
"p",
|
1157 |
+
"ä",
|
1158 |
+
"c",
|
1159 |
+
"b",
|
1160 |
+
"ö",
|
1161 |
+
"å",
|
1162 |
+
"y",
|
1163 |
+
"j",
|
1164 |
+
"x",
|
1165 |
+
],
|
1166 |
+
"Chinese": [
|
1167 |
+
"的",
|
1168 |
+
"一",
|
1169 |
+
"是",
|
1170 |
+
"不",
|
1171 |
+
"了",
|
1172 |
+
"在",
|
1173 |
+
"人",
|
1174 |
+
"有",
|
1175 |
+
"我",
|
1176 |
+
"他",
|
1177 |
+
"这",
|
1178 |
+
"个",
|
1179 |
+
"们",
|
1180 |
+
"中",
|
1181 |
+
"来",
|
1182 |
+
"上",
|
1183 |
+
"大",
|
1184 |
+
"为",
|
1185 |
+
"和",
|
1186 |
+
"国",
|
1187 |
+
"地",
|
1188 |
+
"到",
|
1189 |
+
"以",
|
1190 |
+
"说",
|
1191 |
+
"时",
|
1192 |
+
"要",
|
1193 |
+
"就",
|
1194 |
+
"出",
|
1195 |
+
"会",
|
1196 |
+
"可",
|
1197 |
+
"也",
|
1198 |
+
"你",
|
1199 |
+
"对",
|
1200 |
+
"生",
|
1201 |
+
"能",
|
1202 |
+
"而",
|
1203 |
+
"子",
|
1204 |
+
"那",
|
1205 |
+
"得",
|
1206 |
+
"于",
|
1207 |
+
"着",
|
1208 |
+
"下",
|
1209 |
+
"自",
|
1210 |
+
"之",
|
1211 |
+
"年",
|
1212 |
+
"过",
|
1213 |
+
"发",
|
1214 |
+
"后",
|
1215 |
+
"作",
|
1216 |
+
"里",
|
1217 |
+
"用",
|
1218 |
+
"道",
|
1219 |
+
"行",
|
1220 |
+
"所",
|
1221 |
+
"然",
|
1222 |
+
"家",
|
1223 |
+
"种",
|
1224 |
+
"事",
|
1225 |
+
"成",
|
1226 |
+
"方",
|
1227 |
+
"多",
|
1228 |
+
"经",
|
1229 |
+
"么",
|
1230 |
+
"去",
|
1231 |
+
"法",
|
1232 |
+
"学",
|
1233 |
+
"如",
|
1234 |
+
"都",
|
1235 |
+
"同",
|
1236 |
+
"现",
|
1237 |
+
"当",
|
1238 |
+
"没",
|
1239 |
+
"动",
|
1240 |
+
"面",
|
1241 |
+
"起",
|
1242 |
+
"看",
|
1243 |
+
"定",
|
1244 |
+
"天",
|
1245 |
+
"分",
|
1246 |
+
"还",
|
1247 |
+
"进",
|
1248 |
+
"好",
|
1249 |
+
"小",
|
1250 |
+
"部",
|
1251 |
+
"其",
|
1252 |
+
"些",
|
1253 |
+
"主",
|
1254 |
+
"样",
|
1255 |
+
"理",
|
1256 |
+
"心",
|
1257 |
+
"她",
|
1258 |
+
"本",
|
1259 |
+
"前",
|
1260 |
+
"开",
|
1261 |
+
"但",
|
1262 |
+
"因",
|
1263 |
+
"只",
|
1264 |
+
"从",
|
1265 |
+
"想",
|
1266 |
+
"实",
|
1267 |
+
],
|
1268 |
+
"Ukrainian": [
|
1269 |
+
"о",
|
1270 |
+
"а",
|
1271 |
+
"н",
|
1272 |
+
"і",
|
1273 |
+
"и",
|
1274 |
+
"р",
|
1275 |
+
"в",
|
1276 |
+
"т",
|
1277 |
+
"е",
|
1278 |
+
"с",
|
1279 |
+
"к",
|
1280 |
+
"л",
|
1281 |
+
"у",
|
1282 |
+
"д",
|
1283 |
+
"м",
|
1284 |
+
"п",
|
1285 |
+
"з",
|
1286 |
+
"я",
|
1287 |
+
"ь",
|
1288 |
+
"б",
|
1289 |
+
"г",
|
1290 |
+
"й",
|
1291 |
+
"ч",
|
1292 |
+
"х",
|
1293 |
+
"ц",
|
1294 |
+
"ї",
|
1295 |
+
],
|
1296 |
+
"Norwegian": [
|
1297 |
+
"e",
|
1298 |
+
"r",
|
1299 |
+
"n",
|
1300 |
+
"t",
|
1301 |
+
"a",
|
1302 |
+
"s",
|
1303 |
+
"i",
|
1304 |
+
"o",
|
1305 |
+
"l",
|
1306 |
+
"d",
|
1307 |
+
"g",
|
1308 |
+
"k",
|
1309 |
+
"m",
|
1310 |
+
"v",
|
1311 |
+
"f",
|
1312 |
+
"p",
|
1313 |
+
"u",
|
1314 |
+
"b",
|
1315 |
+
"h",
|
1316 |
+
"å",
|
1317 |
+
"y",
|
1318 |
+
"j",
|
1319 |
+
"ø",
|
1320 |
+
"c",
|
1321 |
+
"æ",
|
1322 |
+
"w",
|
1323 |
+
],
|
1324 |
+
"Finnish": [
|
1325 |
+
"a",
|
1326 |
+
"i",
|
1327 |
+
"n",
|
1328 |
+
"t",
|
1329 |
+
"e",
|
1330 |
+
"s",
|
1331 |
+
"l",
|
1332 |
+
"o",
|
1333 |
+
"u",
|
1334 |
+
"k",
|
1335 |
+
"ä",
|
1336 |
+
"m",
|
1337 |
+
"r",
|
1338 |
+
"v",
|
1339 |
+
"j",
|
1340 |
+
"h",
|
1341 |
+
"p",
|
1342 |
+
"y",
|
1343 |
+
"d",
|
1344 |
+
"ö",
|
1345 |
+
"g",
|
1346 |
+
"c",
|
1347 |
+
"b",
|
1348 |
+
"f",
|
1349 |
+
"w",
|
1350 |
+
"z",
|
1351 |
+
],
|
1352 |
+
"Vietnamese": [
|
1353 |
+
"n",
|
1354 |
+
"h",
|
1355 |
+
"t",
|
1356 |
+
"i",
|
1357 |
+
"c",
|
1358 |
+
"g",
|
1359 |
+
"a",
|
1360 |
+
"o",
|
1361 |
+
"u",
|
1362 |
+
"m",
|
1363 |
+
"l",
|
1364 |
+
"r",
|
1365 |
+
"à",
|
1366 |
+
"đ",
|
1367 |
+
"s",
|
1368 |
+
"e",
|
1369 |
+
"v",
|
1370 |
+
"p",
|
1371 |
+
"b",
|
1372 |
+
"y",
|
1373 |
+
"ư",
|
1374 |
+
"d",
|
1375 |
+
"á",
|
1376 |
+
"k",
|
1377 |
+
"ộ",
|
1378 |
+
"ế",
|
1379 |
+
],
|
1380 |
+
"Czech": [
|
1381 |
+
"o",
|
1382 |
+
"e",
|
1383 |
+
"a",
|
1384 |
+
"n",
|
1385 |
+
"t",
|
1386 |
+
"s",
|
1387 |
+
"i",
|
1388 |
+
"l",
|
1389 |
+
"v",
|
1390 |
+
"r",
|
1391 |
+
"k",
|
1392 |
+
"d",
|
1393 |
+
"u",
|
1394 |
+
"m",
|
1395 |
+
"p",
|
1396 |
+
"í",
|
1397 |
+
"c",
|
1398 |
+
"h",
|
1399 |
+
"z",
|
1400 |
+
"á",
|
1401 |
+
"y",
|
1402 |
+
"j",
|
1403 |
+
"b",
|
1404 |
+
"ě",
|
1405 |
+
"é",
|
1406 |
+
"ř",
|
1407 |
+
],
|
1408 |
+
"Hungarian": [
|
1409 |
+
"e",
|
1410 |
+
"a",
|
1411 |
+
"t",
|
1412 |
+
"l",
|
1413 |
+
"s",
|
1414 |
+
"n",
|
1415 |
+
"k",
|
1416 |
+
"r",
|
1417 |
+
"i",
|
1418 |
+
"o",
|
1419 |
+
"z",
|
1420 |
+
"á",
|
1421 |
+
"é",
|
1422 |
+
"g",
|
1423 |
+
"m",
|
1424 |
+
"b",
|
1425 |
+
"y",
|
1426 |
+
"v",
|
1427 |
+
"d",
|
1428 |
+
"h",
|
1429 |
+
"u",
|
1430 |
+
"p",
|
1431 |
+
"j",
|
1432 |
+
"ö",
|
1433 |
+
"f",
|
1434 |
+
"c",
|
1435 |
+
],
|
1436 |
+
"Korean": [
|
1437 |
+
"이",
|
1438 |
+
"다",
|
1439 |
+
"에",
|
1440 |
+
"의",
|
1441 |
+
"는",
|
1442 |
+
"로",
|
1443 |
+
"하",
|
1444 |
+
"을",
|
1445 |
+
"가",
|
1446 |
+
"고",
|
1447 |
+
"지",
|
1448 |
+
"서",
|
1449 |
+
"한",
|
1450 |
+
"은",
|
1451 |
+
"기",
|
1452 |
+
"으",
|
1453 |
+
"년",
|
1454 |
+
"대",
|
1455 |
+
"사",
|
1456 |
+
"시",
|
1457 |
+
"를",
|
1458 |
+
"리",
|
1459 |
+
"도",
|
1460 |
+
"인",
|
1461 |
+
"스",
|
1462 |
+
"일",
|
1463 |
+
],
|
1464 |
+
"Indonesian": [
|
1465 |
+
"a",
|
1466 |
+
"n",
|
1467 |
+
"e",
|
1468 |
+
"i",
|
1469 |
+
"r",
|
1470 |
+
"t",
|
1471 |
+
"u",
|
1472 |
+
"s",
|
1473 |
+
"d",
|
1474 |
+
"k",
|
1475 |
+
"m",
|
1476 |
+
"l",
|
1477 |
+
"g",
|
1478 |
+
"p",
|
1479 |
+
"b",
|
1480 |
+
"o",
|
1481 |
+
"h",
|
1482 |
+
"y",
|
1483 |
+
"j",
|
1484 |
+
"c",
|
1485 |
+
"w",
|
1486 |
+
"f",
|
1487 |
+
"v",
|
1488 |
+
"z",
|
1489 |
+
"x",
|
1490 |
+
"q",
|
1491 |
+
],
|
1492 |
+
"Turkish": [
|
1493 |
+
"a",
|
1494 |
+
"e",
|
1495 |
+
"i",
|
1496 |
+
"n",
|
1497 |
+
"r",
|
1498 |
+
"l",
|
1499 |
+
"ı",
|
1500 |
+
"k",
|
1501 |
+
"d",
|
1502 |
+
"t",
|
1503 |
+
"s",
|
1504 |
+
"m",
|
1505 |
+
"y",
|
1506 |
+
"u",
|
1507 |
+
"o",
|
1508 |
+
"b",
|
1509 |
+
"ü",
|
1510 |
+
"ş",
|
1511 |
+
"v",
|
1512 |
+
"g",
|
1513 |
+
"z",
|
1514 |
+
"h",
|
1515 |
+
"c",
|
1516 |
+
"p",
|
1517 |
+
"ç",
|
1518 |
+
"ğ",
|
1519 |
+
],
|
1520 |
+
"Romanian": [
|
1521 |
+
"e",
|
1522 |
+
"i",
|
1523 |
+
"a",
|
1524 |
+
"r",
|
1525 |
+
"n",
|
1526 |
+
"t",
|
1527 |
+
"u",
|
1528 |
+
"l",
|
1529 |
+
"o",
|
1530 |
+
"c",
|
1531 |
+
"s",
|
1532 |
+
"d",
|
1533 |
+
"p",
|
1534 |
+
"m",
|
1535 |
+
"ă",
|
1536 |
+
"f",
|
1537 |
+
"v",
|
1538 |
+
"î",
|
1539 |
+
"g",
|
1540 |
+
"b",
|
1541 |
+
"ș",
|
1542 |
+
"ț",
|
1543 |
+
"z",
|
1544 |
+
"h",
|
1545 |
+
"â",
|
1546 |
+
"j",
|
1547 |
+
],
|
1548 |
+
"Farsi": [
|
1549 |
+
"ا",
|
1550 |
+
"ی",
|
1551 |
+
"ر",
|
1552 |
+
"د",
|
1553 |
+
"ن",
|
1554 |
+
"ه",
|
1555 |
+
"و",
|
1556 |
+
"م",
|
1557 |
+
"ت",
|
1558 |
+
"ب",
|
1559 |
+
"س",
|
1560 |
+
"ل",
|
1561 |
+
"ک",
|
1562 |
+
"ش",
|
1563 |
+
"ز",
|
1564 |
+
"ف",
|
1565 |
+
"گ",
|
1566 |
+
"ع",
|
1567 |
+
"خ",
|
1568 |
+
"ق",
|
1569 |
+
"ج",
|
1570 |
+
"آ",
|
1571 |
+
"پ",
|
1572 |
+
"ح",
|
1573 |
+
"ط",
|
1574 |
+
"ص",
|
1575 |
+
],
|
1576 |
+
"Arabic": [
|
1577 |
+
"ا",
|
1578 |
+
"ل",
|
1579 |
+
"ي",
|
1580 |
+
"م",
|
1581 |
+
"و",
|
1582 |
+
"ن",
|
1583 |
+
"ر",
|
1584 |
+
"ت",
|
1585 |
+
"ب",
|
1586 |
+
"ة",
|
1587 |
+
"ع",
|
1588 |
+
"د",
|
1589 |
+
"س",
|
1590 |
+
"ف",
|
1591 |
+
"ه",
|
1592 |
+
"ك",
|
1593 |
+
"ق",
|
1594 |
+
"أ",
|
1595 |
+
"ح",
|
1596 |
+
"ج",
|
1597 |
+
"ش",
|
1598 |
+
"ط",
|
1599 |
+
"ص",
|
1600 |
+
"ى",
|
1601 |
+
"خ",
|
1602 |
+
"إ",
|
1603 |
+
],
|
1604 |
+
"Danish": [
|
1605 |
+
"e",
|
1606 |
+
"r",
|
1607 |
+
"n",
|
1608 |
+
"t",
|
1609 |
+
"a",
|
1610 |
+
"i",
|
1611 |
+
"s",
|
1612 |
+
"d",
|
1613 |
+
"l",
|
1614 |
+
"o",
|
1615 |
+
"g",
|
1616 |
+
"m",
|
1617 |
+
"k",
|
1618 |
+
"f",
|
1619 |
+
"v",
|
1620 |
+
"u",
|
1621 |
+
"b",
|
1622 |
+
"h",
|
1623 |
+
"p",
|
1624 |
+
"å",
|
1625 |
+
"y",
|
1626 |
+
"ø",
|
1627 |
+
"æ",
|
1628 |
+
"c",
|
1629 |
+
"j",
|
1630 |
+
"w",
|
1631 |
+
],
|
1632 |
+
"Serbian": [
|
1633 |
+
"а",
|
1634 |
+
"и",
|
1635 |
+
"о",
|
1636 |
+
"е",
|
1637 |
+
"н",
|
1638 |
+
"р",
|
1639 |
+
"с",
|
1640 |
+
"у",
|
1641 |
+
"т",
|
1642 |
+
"к",
|
1643 |
+
"ј",
|
1644 |
+
"в",
|
1645 |
+
"д",
|
1646 |
+
"м",
|
1647 |
+
"п",
|
1648 |
+
"л",
|
1649 |
+
"г",
|
1650 |
+
"з",
|
1651 |
+
"б",
|
1652 |
+
"a",
|
1653 |
+
"i",
|
1654 |
+
"e",
|
1655 |
+
"o",
|
1656 |
+
"n",
|
1657 |
+
"ц",
|
1658 |
+
"ш",
|
1659 |
+
],
|
1660 |
+
"Lithuanian": [
|
1661 |
+
"i",
|
1662 |
+
"a",
|
1663 |
+
"s",
|
1664 |
+
"o",
|
1665 |
+
"r",
|
1666 |
+
"e",
|
1667 |
+
"t",
|
1668 |
+
"n",
|
1669 |
+
"u",
|
1670 |
+
"k",
|
1671 |
+
"m",
|
1672 |
+
"l",
|
1673 |
+
"p",
|
1674 |
+
"v",
|
1675 |
+
"d",
|
1676 |
+
"j",
|
1677 |
+
"g",
|
1678 |
+
"ė",
|
1679 |
+
"b",
|
1680 |
+
"y",
|
1681 |
+
"ų",
|
1682 |
+
"š",
|
1683 |
+
"ž",
|
1684 |
+
"c",
|
1685 |
+
"ą",
|
1686 |
+
"į",
|
1687 |
+
],
|
1688 |
+
"Slovene": [
|
1689 |
+
"e",
|
1690 |
+
"a",
|
1691 |
+
"i",
|
1692 |
+
"o",
|
1693 |
+
"n",
|
1694 |
+
"r",
|
1695 |
+
"s",
|
1696 |
+
"l",
|
1697 |
+
"t",
|
1698 |
+
"j",
|
1699 |
+
"v",
|
1700 |
+
"k",
|
1701 |
+
"d",
|
1702 |
+
"p",
|
1703 |
+
"m",
|
1704 |
+
"u",
|
1705 |
+
"z",
|
1706 |
+
"b",
|
1707 |
+
"g",
|
1708 |
+
"h",
|
1709 |
+
"č",
|
1710 |
+
"c",
|
1711 |
+
"š",
|
1712 |
+
"ž",
|
1713 |
+
"f",
|
1714 |
+
"y",
|
1715 |
+
],
|
1716 |
+
"Slovak": [
|
1717 |
+
"o",
|
1718 |
+
"a",
|
1719 |
+
"e",
|
1720 |
+
"n",
|
1721 |
+
"i",
|
1722 |
+
"r",
|
1723 |
+
"v",
|
1724 |
+
"t",
|
1725 |
+
"s",
|
1726 |
+
"l",
|
1727 |
+
"k",
|
1728 |
+
"d",
|
1729 |
+
"m",
|
1730 |
+
"p",
|
1731 |
+
"u",
|
1732 |
+
"c",
|
1733 |
+
"h",
|
1734 |
+
"j",
|
1735 |
+
"b",
|
1736 |
+
"z",
|
1737 |
+
"á",
|
1738 |
+
"y",
|
1739 |
+
"ý",
|
1740 |
+
"í",
|
1741 |
+
"č",
|
1742 |
+
"é",
|
1743 |
+
],
|
1744 |
+
"Hebrew": [
|
1745 |
+
"י",
|
1746 |
+
"ו",
|
1747 |
+
"ה",
|
1748 |
+
"ל",
|
1749 |
+
"ר",
|
1750 |
+
"ב",
|
1751 |
+
"ת",
|
1752 |
+
"מ",
|
1753 |
+
"א",
|
1754 |
+
"ש",
|
1755 |
+
"נ",
|
1756 |
+
"ע",
|
1757 |
+
"ם",
|
1758 |
+
"ד",
|
1759 |
+
"ק",
|
1760 |
+
"ח",
|
1761 |
+
"פ",
|
1762 |
+
"ס",
|
1763 |
+
"כ",
|
1764 |
+
"ג",
|
1765 |
+
"ט",
|
1766 |
+
"צ",
|
1767 |
+
"ן",
|
1768 |
+
"ז",
|
1769 |
+
"ך",
|
1770 |
+
],
|
1771 |
+
"Bulgarian": [
|
1772 |
+
"а",
|
1773 |
+
"и",
|
1774 |
+
"о",
|
1775 |
+
"е",
|
1776 |
+
"н",
|
1777 |
+
"т",
|
1778 |
+
"р",
|
1779 |
+
"с",
|
1780 |
+
"в",
|
1781 |
+
"л",
|
1782 |
+
"к",
|
1783 |
+
"д",
|
1784 |
+
"п",
|
1785 |
+
"м",
|
1786 |
+
"з",
|
1787 |
+
"г",
|
1788 |
+
"я",
|
1789 |
+
"ъ",
|
1790 |
+
"у",
|
1791 |
+
"б",
|
1792 |
+
"ч",
|
1793 |
+
"ц",
|
1794 |
+
"й",
|
1795 |
+
"ж",
|
1796 |
+
"щ",
|
1797 |
+
"х",
|
1798 |
+
],
|
1799 |
+
"Croatian": [
|
1800 |
+
"a",
|
1801 |
+
"i",
|
1802 |
+
"o",
|
1803 |
+
"e",
|
1804 |
+
"n",
|
1805 |
+
"r",
|
1806 |
+
"j",
|
1807 |
+
"s",
|
1808 |
+
"t",
|
1809 |
+
"u",
|
1810 |
+
"k",
|
1811 |
+
"l",
|
1812 |
+
"v",
|
1813 |
+
"d",
|
1814 |
+
"m",
|
1815 |
+
"p",
|
1816 |
+
"g",
|
1817 |
+
"z",
|
1818 |
+
"b",
|
1819 |
+
"c",
|
1820 |
+
"č",
|
1821 |
+
"h",
|
1822 |
+
"š",
|
1823 |
+
"ž",
|
1824 |
+
"ć",
|
1825 |
+
"f",
|
1826 |
+
],
|
1827 |
+
"Hindi": [
|
1828 |
+
"क",
|
1829 |
+
"र",
|
1830 |
+
"स",
|
1831 |
+
"न",
|
1832 |
+
"त",
|
1833 |
+
"म",
|
1834 |
+
"ह",
|
1835 |
+
"प",
|
1836 |
+
"य",
|
1837 |
+
"ल",
|
1838 |
+
"व",
|
1839 |
+
"ज",
|
1840 |
+
"द",
|
1841 |
+
"ग",
|
1842 |
+
"ब",
|
1843 |
+
"श",
|
1844 |
+
"ट",
|
1845 |
+
"अ",
|
1846 |
+
"ए",
|
1847 |
+
"थ",
|
1848 |
+
"भ",
|
1849 |
+
"ड",
|
1850 |
+
"च",
|
1851 |
+
"ध",
|
1852 |
+
"ष",
|
1853 |
+
"इ",
|
1854 |
+
],
|
1855 |
+
"Estonian": [
|
1856 |
+
"a",
|
1857 |
+
"i",
|
1858 |
+
"e",
|
1859 |
+
"s",
|
1860 |
+
"t",
|
1861 |
+
"l",
|
1862 |
+
"u",
|
1863 |
+
"n",
|
1864 |
+
"o",
|
1865 |
+
"k",
|
1866 |
+
"r",
|
1867 |
+
"d",
|
1868 |
+
"m",
|
1869 |
+
"v",
|
1870 |
+
"g",
|
1871 |
+
"p",
|
1872 |
+
"j",
|
1873 |
+
"h",
|
1874 |
+
"ä",
|
1875 |
+
"b",
|
1876 |
+
"õ",
|
1877 |
+
"ü",
|
1878 |
+
"f",
|
1879 |
+
"c",
|
1880 |
+
"ö",
|
1881 |
+
"y",
|
1882 |
+
],
|
1883 |
+
"Thai": [
|
1884 |
+
"า",
|
1885 |
+
"น",
|
1886 |
+
"ร",
|
1887 |
+
"อ",
|
1888 |
+
"ก",
|
1889 |
+
"เ",
|
1890 |
+
"ง",
|
1891 |
+
"ม",
|
1892 |
+
"ย",
|
1893 |
+
"ล",
|
1894 |
+
"ว",
|
1895 |
+
"ด",
|
1896 |
+
"ท",
|
1897 |
+
"ส",
|
1898 |
+
"ต",
|
1899 |
+
"ะ",
|
1900 |
+
"ป",
|
1901 |
+
"บ",
|
1902 |
+
"ค",
|
1903 |
+
"ห",
|
1904 |
+
"แ",
|
1905 |
+
"จ",
|
1906 |
+
"พ",
|
1907 |
+
"ช",
|
1908 |
+
"ข",
|
1909 |
+
"ใ",
|
1910 |
+
],
|
1911 |
+
"Greek": [
|
1912 |
+
"α",
|
1913 |
+
"τ",
|
1914 |
+
"ο",
|
1915 |
+
"ι",
|
1916 |
+
"ε",
|
1917 |
+
"ν",
|
1918 |
+
"ρ",
|
1919 |
+
"σ",
|
1920 |
+
"κ",
|
1921 |
+
"η",
|
1922 |
+
"π",
|
1923 |
+
"ς",
|
1924 |
+
"υ",
|
1925 |
+
"μ",
|
1926 |
+
"λ",
|
1927 |
+
"ί",
|
1928 |
+
"ό",
|
1929 |
+
"ά",
|
1930 |
+
"γ",
|
1931 |
+
"έ",
|
1932 |
+
"δ",
|
1933 |
+
"ή",
|
1934 |
+
"ω",
|
1935 |
+
"χ",
|
1936 |
+
"θ",
|
1937 |
+
"ύ",
|
1938 |
+
],
|
1939 |
+
"Tamil": [
|
1940 |
+
"க",
|
1941 |
+
"த",
|
1942 |
+
"ப",
|
1943 |
+
"ட",
|
1944 |
+
"ர",
|
1945 |
+
"ம",
|
1946 |
+
"ல",
|
1947 |
+
"ன",
|
1948 |
+
"வ",
|
1949 |
+
"ற",
|
1950 |
+
"ய",
|
1951 |
+
"ள",
|
1952 |
+
"ச",
|
1953 |
+
"ந",
|
1954 |
+
"இ",
|
1955 |
+
"ண",
|
1956 |
+
"அ",
|
1957 |
+
"ஆ",
|
1958 |
+
"ழ",
|
1959 |
+
"ங",
|
1960 |
+
"எ",
|
1961 |
+
"உ",
|
1962 |
+
"ஒ",
|
1963 |
+
"ஸ",
|
1964 |
+
],
|
1965 |
+
"Kazakh": [
|
1966 |
+
"а",
|
1967 |
+
"ы",
|
1968 |
+
"е",
|
1969 |
+
"н",
|
1970 |
+
"т",
|
1971 |
+
"р",
|
1972 |
+
"л",
|
1973 |
+
"і",
|
1974 |
+
"д",
|
1975 |
+
"с",
|
1976 |
+
"м",
|
1977 |
+
"қ",
|
1978 |
+
"к",
|
1979 |
+
"о",
|
1980 |
+
"б",
|
1981 |
+
"и",
|
1982 |
+
"у",
|
1983 |
+
"ғ",
|
1984 |
+
"ж",
|
1985 |
+
"ң",
|
1986 |
+
"з",
|
1987 |
+
"ш",
|
1988 |
+
"й",
|
1989 |
+
"п",
|
1990 |
+
"г",
|
1991 |
+
"ө",
|
1992 |
+
],
|
1993 |
+
}
|
1994 |
+
|
1995 |
+
LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
|
pproyect/test1/Lib/site-packages/charset_normalizer/legacy.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any, Dict, Optional, Union
|
2 |
+
from warnings import warn
|
3 |
+
|
4 |
+
from .api import from_bytes
|
5 |
+
from .constant import CHARDET_CORRESPONDENCE
|
6 |
+
|
7 |
+
|
8 |
+
def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> Dict[str, Optional[Union[str, float]]]:
    """
    Legacy, chardet-style detection entry point.

    Runs ``from_bytes`` on the payload, takes its ``best()`` match and reports it
    as a plain dictionary. Kept mostly backward-compatible with chardet so that
    projects can migrate easily; it is deprecated but not planned for removal.

    :param byte_str: The byte sequence to examine (``bytes`` or ``bytearray``).
    :param should_rename_legacy: When exactly ``False`` (the default), the detected
        encoding name is translated through ``CHARDET_CORRESPONDENCE`` if it has an
        entry there; otherwise the name is returned as detected.
    :param kwargs: Accepted for signature compatibility only; any extra argument
        is ignored and reported through a warning.
    :return: A dict with the keys ``"encoding"``, ``"language"`` and
        ``"confidence"``. When no match is found, encoding and confidence are
        ``None`` and language is an empty string.
    :raises TypeError: If ``byte_str`` is neither ``bytes`` nor ``bytearray``.
    """
    if kwargs:
        ignored_names = ",".join(list(kwargs.keys()))
        warn(
            f"charset-normalizer disregard arguments '{ignored_names}' in legacy function detect()"
        )

    if isinstance(byte_str, bytearray):
        # Work on an immutable copy of the caller's buffer.
        byte_str = bytes(byte_str)
    elif not isinstance(byte_str, bytes):
        raise TypeError(  # pragma: nocover
            "Expected object of type bytes or bytearray, got: "
            "{0}".format(type(byte_str))
        )

    best_guess = from_bytes(byte_str).best()

    if best_guess is None:
        # Nothing matched: report the "unknown" triple.
        encoding = None
        language = ""
        confidence = None
    else:
        encoding = best_guess.encoding
        language = "" if best_guess.language == "Unknown" else best_guess.language
        # Confidence is reported as the complement of the match's chaos value.
        confidence = 1.0 - best_guess.chaos

        # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig gets stripped
        # in the detection/normalization process, but chardet does return 'utf-8-sig'
        # and it is a valid codec name.
        if encoding == "utf_8" and best_guess.bom:
            encoding += "_sig"

    if should_rename_legacy is False:
        # Fall back to the detected name when there is no chardet spelling for it.
        encoding = CHARDET_CORRESPONDENCE.get(encoding, encoding)

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }
|