sashtech commited on
Commit
7b96a1b
·
verified ·
1 Parent(s): f89abc8

Upload 9 files

Browse files
language_tool_python/__main__.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LanguageTool command line."""
2
+
3
+ import argparse
4
+ import locale
5
+ import re
6
+ import sys
7
+
8
+ from .server import LanguageTool
9
+ from .utils import LanguageToolError
10
+
11
+ import pkg_resources
12
+ __version__ = pkg_resources.require("language_tool_python")[0].version
13
+
14
+
15
def parse_args():
    """Parse and validate the command-line options.

    Returns the populated ``argparse.Namespace``. The process exits through
    ``ArgumentParser.error`` when ``--enabled-only`` is combined with
    ``--disable`` or is given without any ``--enable`` rule.
    """
    cli = argparse.ArgumentParser(
        prog='language_tool_python',
        description=__doc__.strip() if __doc__ else None,
    )
    add = cli.add_argument

    # Input selection and decoding.
    add('files', nargs='+', help='plain text file or "-" for stdin')
    add('-c', '--encoding', help='input encoding')
    add('-l', '--language', metavar='CODE',
        help='language code of the input or "auto"')
    add('-m', '--mother-tongue', metavar='CODE',
        help='language code of your first language')

    # Rule selection; repeated options accumulate into a single set.
    add('-d', '--disable', metavar='RULES', type=get_rules,
        action=RulesAction, default=set(),
        help='list of rule IDs to be disabled')
    add('-e', '--enable', metavar='RULES', type=get_rules,
        action=RulesAction, default=set(),
        help='list of rule IDs to be enabled')
    add('--enabled-only', action='store_true',
        help='disable all rules except those specified in --enable')

    add('--version', action='version',
        version='%(prog)s {}'.format(__version__),
        help='show version')
    add('-a', '--apply', action='store_true',
        help='automatically apply suggestions if available')
    add('-s', '--spell-check-off', dest='spell_check',
        action='store_false',
        help='disable spell-checking rules')
    add('--ignore-lines',
        help='ignore lines that match this regular expression')

    # Remote server location.
    add('--remote-host',
        help='hostname of the remote LanguageTool server')
    add('--remote-port',
        help='port of the remote LanguageTool server')

    options = cli.parse_args()

    # ``cli.error`` prints the message and exits, so the checks can be flat.
    if options.enabled_only and options.disable:
        cli.error('--enabled-only cannot be used with --disable')
    if options.enabled_only and not options.enable:
        cli.error('--enabled-only requires --enable')

    return options
62
+
63
+
64
class RulesAction(argparse.Action):
    """Argparse action that merges each occurrence's values into one set."""

    def __call__(self, parser, namespace, values, option_string=None):
        # The destination already holds a set (the option's default);
        # extend it in place so repeated options accumulate.
        collected = getattr(namespace, self.dest)
        collected.update(values)
67
+
68
+
69
def get_rules(rules: str) -> set:
    """Split a free-form list of rule IDs into a set of upper-cased IDs.

    An ID is any run of word characters and hyphens; everything else acts
    as a separator.
    """
    rule_ids = re.findall(r"[\w\-]+", rules)
    return set(map(str.upper, rule_ids))
71
+
72
+
73
def get_text(filename, encoding, ignore):
    """Read a file and return its text with ignored lines blanked out.

    ``filename`` is passed straight to ``open``. When ``ignore`` is a
    non-empty regular expression, every line it matches at its start is
    replaced by a bare newline.
    """
    kept = []
    with open(filename, encoding=encoding) as handle:
        for line in handle.readlines():
            if ignore and re.match(ignore, line):
                kept.append('\n')
            else:
                kept.append(line)
    return ''.join(kept)
78
+
79
+
80
def print_unicode(text):
    """Print ``text`` to standard output.

    The former Python 2 branch (encoding to UTF-8 before printing) was
    unreachable: this module already relies on Python-3-only syntax such as
    function annotations, so the text is printed as-is.
    """
    print(text)
86
+
87
+
88
def main():
    """Check (or auto-correct) every file named on the command line.

    Returns the process exit status:

    * ``0`` - every file was processed and no rule matched;
    * ``1`` - language detection was unavailable or failed, or a file could
      not be decoded or checked and no match had been reported;
    * ``2`` - at least one rule match was reported.
    """
    args = parse_args()

    # The remote server address is the same for every file, so build it once.
    remote_server = None
    if args.remote_host is not None:
        remote_server = args.remote_host
        if args.remote_port is not None:
            remote_server += ':{}'.format(args.remote_port)

    status = 0

    for filename in args.files:
        if len(args.files) > 1:
            print(filename, file=sys.stderr)

        if filename == '-':
            filename = sys.stdin.fileno()
            encoding = args.encoding or (
                sys.stdin.encoding if sys.stdin.isatty()
                else locale.getpreferredencoding()
            )
        else:
            encoding = args.encoding or 'utf-8'

        # Read the input before creating a LanguageTool instance, so an
        # undecodable file does not cost a server start-up.
        try:
            text = get_text(filename, encoding, ignore=args.ignore_lines)
        except UnicodeError as exception:
            print('{}: {}'.format(filename, exception), file=sys.stderr)
            # Report the failure in the exit status instead of exiting 0.
            status = status or 1
            continue

        # The context manager closes the instance on every path out of the
        # block, including the early returns below.
        with LanguageTool(
            motherTongue=args.mother_tongue,
            remote_server=remote_server,
        ) as lang_tool:
            if args.language:
                if args.language.lower() == 'auto':
                    try:
                        from guess_language import guess_language
                    except ImportError:
                        print('guess_language is unavailable.',
                              file=sys.stderr)
                        return 1

                    language = guess_language(text)
                    print('Detected language: {}'.format(language),
                          file=sys.stderr)
                    if not language:
                        return 1
                    lang_tool.language = language
                else:
                    lang_tool.language = args.language

            if not args.spell_check:
                lang_tool.disable_spellchecking()

            # Assigning ``language`` clears the rule sets, so the
            # command-line rules are applied afterwards.
            lang_tool.disabled_rules.update(args.disable)
            lang_tool.enabled_rules.update(args.enable)
            lang_tool.enabled_rules_only = args.enabled_only

            try:
                if args.apply:
                    print_unicode(lang_tool.correct(text))
                else:
                    for match in lang_tool.check(text):
                        rule_id = match.ruleId

                        replacement_text = ', '.join(
                            "'{}'".format(word)
                            for word in match.replacements).strip()

                        message = match.message

                        # Messages that end with punctuation already
                        # include the suggestion.
                        if replacement_text and not message.endswith(
                                ('.', '?')):
                            message += '; suggestions: ' + replacement_text

                        print_unicode('{}: {}: {}'.format(
                            filename,
                            rule_id,
                            message))

                        status = 2
            except LanguageToolError as exception:
                print('{}: {}'.format(filename, exception), file=sys.stderr)
                # Keep a "matches found" status if one was already set.
                status = status or 1
                continue

    return status
176
+
177
+
178
+ sys.exit(main())
language_tool_python/config_file.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict
2
+
3
+ import atexit
4
+ import os
5
+ import tempfile
6
+
7
# Property names accepted in a LanguageTool server configuration file.
# Each name is listed once and every entry is followed by a comma: a missing
# comma silently concatenates two adjacent string literals into one key.
ALLOWED_CONFIG_KEYS = {
    'maxTextLength',
    'maxTextHardLength',
    'secretTokenKey',
    'maxCheckTimeMillis',
    'maxErrorsPerWordRate',
    'maxSpellingSuggestions',
    'maxCheckThreads',
    'cacheSize',
    'cacheTTLSeconds',
    'requestLimit',
    'requestLimitInBytes',
    'timeoutRequestLimit',
    'requestLimitPeriodInSeconds',
    'languageModel',
    'word2vecModel',
    'fasttextModel',
    'fasttextBinary',
    'maxWorkQueueSize',
    'rulesFile',
    'warmUp',
    'blockedReferrers',
    'premiumOnly',
    'disabledRuleIds',
    'pipelineCaching',
    'maxPipelinePoolSize',
    'pipelineExpireTimeInSeconds',
    'pipelinePrewarming',
}


class LanguageToolConfig:
    """A validated server configuration written to a temporary file.

    ``config`` maps property names (see ``ALLOWED_CONFIG_KEYS``) to values.
    ``path`` is the name of a temporary file holding one ``key=value`` line
    per entry; the file is removed when the interpreter exits.
    """
    config: Dict[str, Any]
    path: str

    def __init__(self, config: Dict[str, Any]):
        # NOTE: validation uses ``assert`` so callers keep seeing
        # AssertionError; it is skipped when Python runs with -O.
        assert set(config.keys()) <= ALLOWED_CONFIG_KEYS, f"unexpected keys in config: {set(config.keys()) - ALLOWED_CONFIG_KEYS}"
        assert len(config), "config cannot be empty"
        self.config = config
        self.path = self._create_temp_file()

    def _create_temp_file(self) -> str:
        """Write the configuration to a temporary file and return its path."""
        # ``delete=False`` keeps the file on disk after the handle is closed;
        # the ``with`` block closes the handle even if a write fails.
        with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
            # Write key=value entries as lines in the temporary file.
            for key, value in self.config.items():
                next_line = f'{key}={value}\n'
                tmp_file.write(next_line.encode())
            tmp_path = tmp_file.name

        def _remove_temp_file():
            # The file may already have been deleted; that is not an error
            # worth a traceback at interpreter exit.
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                pass

        # Remove file when program exits.
        atexit.register(_remove_temp_file)

        return tmp_path
37
+
38
+
39
+
40
+ """
41
+ ❯ /usr/bin/java -cp /Users/johnmorris/.cache/language_tool_python/LanguageTool-5.6/languagetool-server.jar org.languagetool.server.HTTPServer --help
42
+ Usage: HTTPServer [--config propertyFile] [--port|-p port] [--public]
43
+ --config FILE a Java property file (one key=value entry per line) with values for:
44
+ 'maxTextLength' - maximum text length, longer texts will cause an error (optional)
45
+ 'maxTextHardLength' - maximum text length, applies even to users with a special secret 'token' parameter (optional)
46
+ 'secretTokenKey' - secret JWT token key, if set by user and valid, maxTextLength can be increased by the user (optional)
47
+ 'maxCheckTimeMillis' - maximum time in milliseconds allowed per check (optional)
48
+ 'maxErrorsPerWordRate' - checking will stop with error if there are more rules matches per word (optional)
49
+ 'maxSpellingSuggestions' - only this many spelling errors will have suggestions for performance reasons (optional,
50
+ affects Hunspell-based languages only)
51
+ 'maxCheckThreads' - maximum number of threads working in parallel (optional)
52
+ 'cacheSize' - size of internal cache in number of sentences (optional, default: 0)
53
+ 'cacheTTLSeconds' - how many seconds sentences are kept in cache (optional, default: 300 if 'cacheSize' is set)
54
+ 'requestLimit' - maximum number of requests per requestLimitPeriodInSeconds (optional)
55
+ 'requestLimitInBytes' - maximum aggregated size of requests per requestLimitPeriodInSeconds (optional)
56
+ 'timeoutRequestLimit' - maximum number of timeout request (optional)
57
+ 'requestLimitPeriodInSeconds' - time period to which requestLimit and timeoutRequestLimit applies (optional)
58
+ 'languageModel' - a directory with '1grams', '2grams', '3grams' sub directories which contain a Lucene index
59
+ each with ngram occurrence counts; activates the confusion rule if supported (optional)
60
+ 'word2vecModel' - a directory with word2vec data (optional), see
61
+ https://github.com/languagetool-org/languagetool/blob/master/languagetool-standalone/CHANGES.md#word2vec
62
+ 'fasttextModel' - a model file for better language detection (optional), see
63
+ https://fasttext.cc/docs/en/language-identification.html
64
+ 'fasttextBinary' - compiled fasttext executable for language detection (optional), see
65
+ https://fasttext.cc/docs/en/support.html
66
+ 'maxWorkQueueSize' - reject request if request queue gets larger than this (optional)
67
+ 'rulesFile' - a file containing rules configuration, such as .langugagetool.cfg (optional)
68
+ 'warmUp' - set to 'true' to warm up server at start, i.e. run a short check with all languages (optional)
69
+ 'blockedReferrers' - a comma-separated list of HTTP referrers (and 'Origin' headers) that are blocked and will not be served (optional)
70
+ 'premiumOnly' - activate only the premium rules (optional)
71
+ 'disabledRuleIds' - a comma-separated list of rule ids that are turned off for this server (optional)
72
+ 'pipelineCaching' - set to 'true' to enable caching of internal pipelines to improve performance
73
+ 'maxPipelinePoolSize' - cache size if 'pipelineCaching' is set
74
+ 'pipelineExpireTimeInSeconds' - time after which pipeline cache items expire
75
+ 'pipelinePrewarming' - set to 'true' to fill pipeline cache on start (can slow down start a lot)
76
+ Spellcheck-only languages: You can add simple spellcheck-only support for languages that LT doesn't
77
+ support by defining two optional properties:
78
+ 'lang-xx' - set name of the language, use language code instead of 'xx', e.g. lang-tr=Turkish
79
+ 'lang-xx-dictPath' - absolute path to the hunspell .dic file, use language code instead of 'xx', e.g.
80
+ lang-tr-dictPath=/path/to/tr.dic. Note that the same directory also needs to
81
+ contain a common_words.txt file with the most common 10,000 words (used for better language detection)
82
+ --port, -p PRT port to bind to, defaults to 8081 if not specified
83
+ --public allow this server process to be connected from anywhere; if not set,
84
+ it can only be connected from the computer it was started on
85
+ --allow-origin [ORIGIN] set the Access-Control-Allow-Origin header in the HTTP response,
86
+ used for direct (non-proxy) JavaScript-based access from browsers.
87
+ Example: --allow-origin "https://my-website.org"
88
+ Don't set a parameter for `*`, i.e. access from all websites.
89
+ --verbose, -v in case of exceptions, log the input text (up to 500 characters)
90
+ --languageModel a directory with '1grams', '2grams', '3grams' sub directories (per language)
91
+ which contain a Lucene index (optional, overwrites 'languageModel'
92
+ parameter in properties files)
93
+ --word2vecModel a directory with word2vec data (optional), see
94
+ https://github.com/languagetool-org/languagetool/blob/master/languagetool-standalone/CHANGES.md#word2vec
95
+ --premiumAlways activate the premium rules even when user has no username/password - useful for API servers
96
+ """
language_tool_python/console_mode.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Write to stdout without causing UnicodeEncodeError."""
3
+
4
+ import sys
5
+
6
+
7
# Install a transliterating stdout wrapper only when stdout would raise on
# unencodable characters ('strict') and its encoding is known and is not a
# UTF variant. ``encoding`` may be missing or None on custom streams; with no
# known target encoding there is nothing to transliterate to, so such streams
# are left untouched instead of failing with AttributeError at import time.
if (getattr(sys.stdout, 'errors', '') == 'strict' and
        (getattr(sys.stdout, 'encoding', '') or '') and
        not sys.stdout.encoding.lower().startswith('utf')):
    try:
        import translit
        sys.stdout = translit.StreamFilter(sys.stdout)
    except ImportError:
        import codecs
        import unicodedata
        import warnings

        # Typographic quotes mapped to their plain ASCII counterparts.
        TRANSLIT_MAP = {
            0x2018: "'",
            0x2019: "'",
            0x201c: '"',
            0x201d: '"',
        }

        def simplify(s):
            """Replace curly quotes and strip combining marks from ``s``."""
            s = s.translate(TRANSLIT_MAP)
            return ''.join([c for c in unicodedata.normalize('NFKD', s)
                            if not unicodedata.combining(c)])

        def simple_translit_error_handler(error):
            """Codec error handler: substitute a simplified form of the
            characters that could not be encoded."""
            if not isinstance(error, UnicodeEncodeError):
                raise error
            chunk = error.object[error.start:error.end]
            repl = simplify(chunk)
            # Anything still unencodable becomes a backslash escape.
            repl = (repl.encode(error.encoding, 'backslashreplace')
                    .decode(error.encoding))
            return repl, error.end

        class SimpleTranslitStreamFilter:

            """Filter a stream through simple transliteration."""
            errors = 'simple_translit'

            def __init__(self, target):
                self.target = target

            def __getattr__(self, name):
                # Delegate everything not overridden here to the real stream.
                return getattr(self.target, name)

            def write(self, s):
                self.target.write(self.downgrade(s))

            def writelines(self, lines):
                self.target.writelines(
                    [self.downgrade(line) for line in lines])

            def downgrade(self, s):
                """Round-trip ``s`` through the target encoding using the
                transliterating error handler."""
                return (s.encode(self.target.encoding, self.errors)
                        .decode(self.target.encoding))

        codecs.register_error(SimpleTranslitStreamFilter.errors,
                              simple_translit_error_handler)
        sys.stdout = SimpleTranslitStreamFilter(sys.stdout)
        warnings.warn('translit is unavailable', ImportWarning)
language_tool_python/download_lt.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """Download latest LanguageTool distribution."""
4
+
5
+ import logging
6
+ import os
7
+ import re
8
+ import requests
9
+ import subprocess
10
+ import sys
11
+ import tempfile
12
+ import tqdm
13
+ from typing import Optional
14
+ import zipfile
15
+
16
+ from distutils.spawn import find_executable
17
+ from urllib.parse import urljoin
18
+ from .utils import (
19
+ find_existing_language_tool_downloads,
20
+ get_language_tool_download_path,
21
+ LTP_JAR_DIR_PATH_ENV_VAR
22
+ )
23
+
24
+ # Create logger for this file.
25
+ logging.basicConfig(format='%(message)s')
26
+ logger = logging.getLogger(__name__)
27
+ logger.setLevel(logging.INFO)
28
+
29
+
30
+ # Get download host from environment or default.
31
+ BASE_URL = os.environ.get('LTP_DOWNLOAD_HOST', 'https://www.languagetool.org/download/')
32
+ FILENAME = 'LanguageTool-{version}.zip'
33
+
34
+ LTP_DOWNLOAD_VERSION = '6.4'
35
+
36
# Quoted banner, e.g. `java version "1.8.0_60"` or `openjdk version "17"`.
JAVA_VERSION_REGEX = re.compile(
    r'^(?:java|openjdk) version "(?P<major1>\d+)(|\.(?P<major2>\d+)\.[^"]+)"',
    re.MULTILINE)

# Fallback for banners the strict pattern rejects, e.g. `openjdk 11.0.2`,
# `openjdk version "22-ea"` or `java version 14.0.1`.
# "version " is an optional *word*: it must be a non-capturing group, because
# the character class `[version ]` would match only one optional character.
# The opening quote and the minor component are optional as well.
JAVA_VERSION_REGEX_UPDATED = re.compile(
    r'^(?:java|openjdk) (?:version )?"?(?P<major1>\d+)(?:\.(?P<major2>\d+))?',
    re.MULTILINE)


def parse_java_version(version_text):
    """Return Java version (major1, major2).

    ``version_text`` is the output of ``java -version``. The second element
    is 0 when the banner carries no minor component.

    Raises SystemExit when no version banner can be found.

    >>> parse_java_version('''java version "1.6.0_65"
    ... Java(TM) SE Runtime Environment (build 1.6.0_65-b14-462-11M4609)
    ... Java HotSpot(TM) 64-Bit Server VM (build 20.65-b04-462, mixed mode))
    ... ''')
    (1, 6)

    >>> parse_java_version('''
    ... openjdk version "1.8.0_60"
    ... OpenJDK Runtime Environment (build 1.8.0_60-b27)
    ... OpenJDK 64-Bit Server VM (build 25.60-b23, mixed mode))
    ... ''')
    (1, 8)

    >>> parse_java_version('openjdk 11.0.2 2019-01-15')
    (11, 0)

    """
    # Try the strict quoted form first; fall back to the lenient pattern.
    match = (
        JAVA_VERSION_REGEX.search(version_text)
        or JAVA_VERSION_REGEX_UPDATED.search(version_text)
    )
    if not match:
        raise SystemExit(
            'Could not parse Java version from """{}""".'.format(version_text))
    major1 = int(match.group('major1'))
    # ``major2`` is None when the banner has no minor component.
    major2 = int(match.group('major2')) if match.group('major2') else 0
    return (major1, major2)
73
+
74
+
75
def confirm_java_compatibility():
    """Return True when the installed Java is version 8 or newer.

    Raises ModuleNotFoundError when no ``java`` executable is found and
    SystemError when the detected version is older than 8.
    """
    java_path = find_executable('java')
    if not java_path:
        raise ModuleNotFoundError(
            'No java install detected. '
            'Please install java to use language-tool-python.'
        )

    # ``java -version`` prints its banner on stderr, so merge it into stdout.
    banner = subprocess.check_output(
        [java_path, '-version'],
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    major_version, minor_version = parse_java_version(banner)

    # Some installs of java show the version number like `14.0.1` and others
    # show `1.14.0.1` (with a leading 1). Both are supported as long as the
    # effective major version is >= 8.
    # (See softwareengineering.stackexchange.com/questions/175075/why-is-java-version-1-x-referred-to-as-java-x)
    legacy_scheme_ok = major_version == 1 and minor_version >= 8
    modern_scheme_ok = major_version >= 8
    if not (legacy_scheme_ok or modern_scheme_ok):
        raise SystemError('Detected java {}.{}. LanguageTool requires Java >= 8.'.format(major_version, minor_version))
    return True
100
+
101
+
102
def get_common_prefix(z):
    """Return the archive's first entry if every other entry starts with it.

    ``z`` must provide ``namelist()``. Returns None for an empty archive or
    when at least one entry does not share that prefix.
    """
    entries = z.namelist()
    if not entries:
        return None

    candidate, *others = entries
    for entry in others:
        if not entry.startswith(candidate):
            return None
    return candidate
108
+
109
+
110
def http_get(url, out_file, proxies=None):
    """Stream the contents of ``url`` into ``out_file`` with a progress bar.

    ``out_file`` is a writable binary file object. ``proxies`` is passed
    through to ``requests.get``.

    Raises Exception when the server answers 403 (how a missing file is
    reported on AWS) and ``requests.HTTPError`` for any other HTTP error
    status, so an error page is never written into ``out_file``.
    """
    req = requests.get(url, stream=True, proxies=proxies)
    if req.status_code == 403:  # Not found on AWS
        raise Exception('Could not find at URL {}.'.format(url))
    # Reject every other 4xx/5xx response before writing anything.
    req.raise_for_status()

    content_length = req.headers.get('Content-Length')
    total = int(content_length) if content_length is not None else None

    # The context manager closes the bar even if the transfer fails midway.
    with tqdm.tqdm(unit="B", unit_scale=True, total=total,
                   desc=f'Downloading LanguageTool {LTP_DOWNLOAD_VERSION}') as progress:
        for chunk in req.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                progress.update(len(chunk))
                out_file.write(chunk)
125
+
126
+
127
def unzip_file(temp_file, directory_to_extract_to):
    """Extract the zip archive at ``temp_file.name`` into a directory."""
    archive_path = temp_file.name
    logger.info(
        'Unzipping {} to {}.'.format(archive_path, directory_to_extract_to)
    )
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(directory_to_extract_to)
134
+
135
+
136
def download_zip(url, directory):
    """Download the zip file at ``url`` and extract it into ``directory``.

    The archive goes to a temporary file that is always removed afterwards,
    including when the download or the extraction raises.
    """
    # ``delete=False`` because the file is reopened by name for extraction.
    downloaded_file = tempfile.NamedTemporaryFile(suffix='.zip', delete=False)
    try:
        try:
            # Download file.
            http_get(url, downloaded_file)
        finally:
            # Close the file so we can extract it (and to flush its contents).
            downloaded_file.close()
        # Extract zip file to path.
        unzip_file(downloaded_file, directory)
    finally:
        # Remove the temporary file whether or not the steps above succeeded.
        os.remove(downloaded_file.name)
    # Tell the user the download path.
    logger.info('Downloaded {} to {}.'.format(url, directory))
149
+
150
+
151
def download_lt(language_tool_version: Optional[str] = LTP_DOWNLOAD_VERSION):
    """Download the requested LanguageTool release unless already present.

    Java compatibility is checked first. Nothing is downloaded when the
    jar-directory environment variable is set, when no version is requested,
    or when the release is already among the existing downloads.
    """
    confirm_java_compatibility()

    target_dir = get_language_tool_download_path()

    # A user-supplied jar directory takes precedence over the download
    # directory, so there is nothing to fetch.
    if os.environ.get(LTP_JAR_DIR_PATH_ENV_VAR):
        return

    # Make download path, if it doesn't exist.
    os.makedirs(target_dir, exist_ok=True)
    assert os.path.isdir(target_dir)

    existing_downloads = find_existing_language_tool_downloads(target_dir)

    if not language_tool_version:
        return

    archive_name = FILENAME.format(version=language_tool_version)
    archive_url = urljoin(BASE_URL, archive_name)
    # The archive unpacks into a directory named after the file, minus ".zip".
    release_dir = os.path.splitext(archive_name)[0]
    if os.path.join(target_dir, release_dir) in existing_downloads:
        return

    download_zip(archive_url, target_dir)
179
+ if __name__ == '__main__':
180
+ sys.exit(download_lt())
language_tool_python/language_tag.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ from functools import total_ordering
4
+
5
@total_ordering
class LanguageTag:
    """Language tag supported by LanguageTool.

    ``tag`` is normalized against ``languages`` (the supported tags):
    matching ignores case and treats ``-`` and ``_`` alike, and a tag with an
    unsupported region falls back to its bare language code. Instances
    compare with plain strings and with other ``LanguageTag`` objects.
    """
    _LANGUAGE_RE = re.compile(r"^([a-z]{2,3})(?:[_-]([a-z]{2}))?$", re.I)

    def __init__(self, tag, languages):
        self.tag = tag
        self.languages = languages
        self.normalized_tag = self._normalize(tag)

    def __eq__(self, other_tag):
        return self.normalized_tag == self._normalize(self._as_text(other_tag))

    def __lt__(self, other_tag):
        # Uses ``other_tag``; the former reference to an undefined ``other``
        # raised NameError on every ordering comparison.
        return str(self) < self._normalize(self._as_text(other_tag))

    def __str__(self):
        return self.normalized_tag

    def __repr__(self):
        return '<LanguageTag "{}">'.format(str(self))

    @staticmethod
    def _as_text(tag):
        """Return the tag string of a LanguageTag; pass other values through."""
        # ``_normalize`` calls string methods on its argument, so another
        # LanguageTag has to be unwrapped first.
        return tag.normalized_tag if isinstance(tag, LanguageTag) else tag

    def _normalize(self, tag):
        """Return the supported spelling of ``tag``.

        Raises ValueError when ``tag`` is empty or is not supported.
        """
        if not tag:
            raise ValueError('empty language tag')
        # Lookup table from canonical key (lower case, '_' separator) to the
        # spelling LanguageTool reports.
        languages = {language.lower().replace('-', '_'): language
                     for language in self.languages}
        try:
            return languages[tag.lower().replace('-', '_')]
        except KeyError:
            try:
                # Fall back to the bare language code, e.g. "en" for "en-XX".
                return languages[self._LANGUAGE_RE.match(tag).group(1).lower()]
            except (KeyError, AttributeError):
                raise ValueError('unsupported language: {!r}'.format(tag))
language_tool_python/match.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unicodedata
2
+ from collections import OrderedDict
3
+ from functools import total_ordering
4
+
5
def get_match_ordered_dict():
    """Return a new OrderedDict mapping each match field name to its type.

    The insertion order is the order in which the fields are listed.
    """
    slots = OrderedDict()
    slots['ruleId'] = str
    slots['message'] = str
    slots['replacements'] = list
    slots['offsetInContext'] = int
    slots['context'] = str
    slots['offset'] = int
    slots['errorLength'] = int
    slots['category'] = str
    slots['ruleIssueType'] = str
    slots['sentence'] = str
    return slots
19
+
20
+ """ Sample match JSON:
21
+ {
22
+ 'message': 'Possible spelling mistake found.',
23
+ 'shortMessage': 'Spelling mistake',
24
+ 'replacements': [{'value': 'newt'}, {'value': 'not'}, {'value': 'new', 'shortDescription': 'having just been made'}, {'value': 'news'}, {'value': 'foot', 'shortDescription': 'singular'}, {'value': 'root', 'shortDescription': 'underground organ of a plant'}, {'value': 'boot'}, {'value': 'noon'}, {'value': 'loot', 'shortDescription': 'plunder'}, {'value': 'moot'}, {'value': 'Root'}, {'value': 'soot', 'shortDescription': 'carbon black'}, {'value': 'newts'}, {'value': 'nook'}, {'value': 'Lieut'}, {'value': 'coot'}, {'value': 'hoot'}, {'value': 'toot'}, {'value': 'snoot'}, {'value': 'neut'}, {'value': 'nowt'}, {'value': 'Noor'}, {'value': 'noob'}],
25
+ 'offset': 8,
26
+ 'length': 4,
27
+ 'context': {'text': 'This is noot okay. ', 'offset': 8, 'length': 4}, 'sentence': 'This is noot okay.',
28
+ 'type': {'typeName': 'Other'},
29
+ 'rule': {'id': 'MORFOLOGIK_RULE_EN_US', 'description': 'Possible spelling mistake', 'issueType': 'misspelling', 'category': {'id': 'TYPOS', 'name': 'Possible Typo'}},
30
+ 'ignoreForIncompleteSentence': False,
31
+ 'contextForSureMatch': 0
32
+ }
33
+
34
+ """
35
def auto_type(obj):
    """Convert ``obj`` to an int or, failing that, a float.

    Returns ``obj`` unchanged when neither conversion applies. This includes
    values such as ``None`` or a list, for which ``int``/``float`` raise
    TypeError rather than ValueError.
    """
    for convert in (int, float):
        try:
            return convert(obj)
        except (ValueError, TypeError):
            continue
    return obj
43
+
44
@total_ordering
class Match:
    """Hold information about where a rule matches text.

    Only the fields named by ``get_match_ordered_dict()`` are stored; each
    value is converted with the type listed there. Matches compare and
    iterate as the sequence of those field values, in that order.
    """

    def __init__(self, attrib):
        # Work on a shallow copy: the keys rewritten below must not leak into
        # the caller's dict, otherwise building a second Match from the same
        # dict fails (its 'rule' entry would already be gone).
        attrib = dict(attrib)
        # Process rule.
        attrib['category'] = attrib['rule']['category']['id']
        attrib['ruleId'] = attrib['rule']['id']
        attrib['ruleIssueType'] = attrib['rule']['issueType']
        del attrib['rule']
        # Process context.
        attrib['offsetInContext'] = attrib['context']['offset']
        attrib['context'] = attrib['context']['text']
        # Process replacements.
        attrib['replacements'] = [r['value'] for r in attrib['replacements']]
        # Rename error length.
        attrib['errorLength'] = attrib['length']
        # Normalize unicode
        attrib['message'] = unicodedata.normalize("NFKC", attrib['message'])
        # Store objects on self (unknown keys are dropped by __setattr__).
        for k, v in attrib.items():
            setattr(self, k, v)

    def __repr__(self):
        def _ordered_dict_repr():
            # Known fields first, in declaration order, then anything else
            # present in the instance dict.
            slots = list(get_match_ordered_dict())
            slots += list(set(self.__dict__).difference(slots))
            attrs = [slot for slot in slots
                     if slot in self.__dict__ and not slot.startswith('_')]
            return '{{{}}}'.format(
                ', '.join([
                    '{!r}: {!r}'.format(attr, getattr(self, attr))
                    for attr in attrs
                ])
            )

        return '{}({})'.format(self.__class__.__name__, _ordered_dict_repr())

    def __str__(self):
        ruleId = self.ruleId
        s = 'Offset {}, length {}, Rule ID: {}'.format(
            self.offset, self.errorLength, ruleId)
        if self.message:
            s += '\nMessage: {}'.format(self.message)
        if self.replacements:
            s += '\nSuggestion: {}'.format('; '.join(self.replacements))
        # Context line followed by a caret line underlining the error span.
        s += '\n{}\n{}'.format(
            self.context, ' ' * self.offsetInContext + '^' * self.errorLength
        )
        return s

    @property
    def matchedText(self):
        """ Returns the text that garnered the error (without its surrounding context).
        """
        return self.context[self.offsetInContext:self.offsetInContext+self.errorLength]

    def __eq__(self, other):
        return list(self) == list(other)

    def __lt__(self, other):
        return list(self) < list(other)

    def __iter__(self):
        return iter(getattr(self, attr) for attr in get_match_ordered_dict())

    def __setattr__(self, key, value):
        # Silently ignore attributes that are not known match fields; coerce
        # the others to their declared type.
        try:
            value = get_match_ordered_dict()[key](value)
        except KeyError:
            return
        super().__setattr__(key, value)

    def __getattr__(self, name):
        # Only reached when normal lookup fails: a known field that was never
        # set reads as None, any other name is a genuine AttributeError.
        if name not in get_match_ordered_dict():
            raise AttributeError('{!r} object has no attribute {!r}'
                                 .format(self.__class__.__name__, name))
        return None
language_tool_python/server.py ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+ import atexit
4
+ import http.client
5
+ import json
6
+ import os
7
+ import re
8
+ import requests
9
+ import socket
10
+ import subprocess
11
+ import threading
12
+ import urllib.parse
13
+
14
+ from .config_file import LanguageToolConfig
15
+ from .download_lt import download_lt, LTP_DOWNLOAD_VERSION
16
+ from .language_tag import LanguageTag
17
+ from .match import Match
18
+ from .utils import (
19
+ correct,
20
+ parse_url, get_locale_language,
21
+ get_language_tool_directory, get_server_cmd,
22
+ FAILSAFE_LANGUAGE, startupinfo,
23
+ LanguageToolError, ServerError, PathError
24
+ )
25
+
26
+
27
+ DEBUG_MODE = False
28
+
29
+ # Keep track of running server PIDs in a global list. This way,
30
+ # we can ensure they're killed on exit.
31
+ RUNNING_SERVER_PROCESSES: List[subprocess.Popen] = []
32
+
33
+
34
+ class LanguageTool:
35
+ """Main class used for checking text against different rules.
36
+ LanguageTool v2 API documentation:
37
+ https://languagetool.org/http-api/swagger-ui/#!/default/post_check
38
+ """
39
+ _MIN_PORT = 8081
40
+ _MAX_PORT = 8999
41
+ _TIMEOUT = 5 * 60
42
+ _remote = False
43
+ _port = _MIN_PORT
44
+ _server: subprocess.Popen = None
45
+ _consumer_thread: threading.Thread = None
46
+ _PORT_RE = re.compile(r"(?:https?://.*:|port\s+)(\d+)", re.I)
47
+
48
+ def __init__(
49
+ self, language=None, motherTongue=None,
50
+ remote_server=None, newSpellings=None,
51
+ new_spellings_persist=True,
52
+ host=None, config=None,
53
+ language_tool_download_version: str = LTP_DOWNLOAD_VERSION
54
+ ):
55
+ self.language_tool_download_version = language_tool_download_version
56
+ self._new_spellings = None
57
+ self._new_spellings_persist = new_spellings_persist
58
+ self._host = host or socket.gethostbyname('localhost')
59
+
60
+ if remote_server:
61
+ assert config is None, "cannot pass config file to remote server"
62
+ self.config = LanguageToolConfig(config) if config else None
63
+
64
+ if remote_server is not None:
65
+ self._remote = True
66
+ self._url = parse_url(remote_server)
67
+ self._url = urllib.parse.urljoin(self._url, 'v2/')
68
+ self._update_remote_server_config(self._url)
69
+ elif not self._server_is_alive():
70
+ self._start_server_on_free_port()
71
+ if language is None:
72
+ try:
73
+ language = get_locale_language()
74
+ except ValueError:
75
+ language = FAILSAFE_LANGUAGE
76
+ if newSpellings:
77
+ self._new_spellings = newSpellings
78
+ self._register_spellings(self._new_spellings)
79
+ self._language = LanguageTag(language, self._get_languages())
80
+ self.motherTongue = motherTongue
81
+ self.disabled_rules = set()
82
+ self.enabled_rules = set()
83
+ self.disabled_categories = set()
84
+ self.enabled_categories = set()
85
+ self.enabled_rules_only = False
86
+ self.preferred_variants = set()
87
+
88
+ def __enter__(self):
89
+ return self
90
+
91
+ def __exit__(self, exc_type, exc_val, exc_tb):
92
+ self.close()
93
+
94
+ def __del__(self):
95
+ self.close()
96
+
97
+ def __repr__(self):
98
+ return '{}(language={!r}, motherTongue={!r})'.format(
99
+ self.__class__.__name__, self.language, self.motherTongue)
100
+
101
+ def close(self):
102
+ if self._server_is_alive():
103
+ self._terminate_server()
104
+ if not self._new_spellings_persist and self._new_spellings:
105
+ self._unregister_spellings()
106
+ self._new_spellings = []
107
+
108
@property
def language(self):
    """The language to be used."""
    return self._language


@language.setter
def language(self, language):
    """Validate and store a new language, then reset the rule selections.

    The tag is checked against the languages reported by the server; the
    disabled and enabled rule sets are emptied afterwards.
    """
    supported = self._get_languages()
    self._language = LanguageTag(language, supported)
    for rule_ids in (self.disabled_rules, self.enabled_rules):
        rule_ids.clear()
118
+
119
@property
def motherTongue(self):
    """The user's mother tongue or None.

    The mother tongue may also be used as a source language for
    checking bilingual texts.
    """
    return self._motherTongue


@motherTongue.setter
def motherTongue(self, motherTongue):
    """Store the mother tongue; ``None`` clears it, anything else is
    validated against the languages reported by the server."""
    if motherTongue is None:
        self._motherTongue = None
    else:
        self._motherTongue = LanguageTag(motherTongue, self._get_languages())
133
+
134
+ @property
135
+ def _spell_checking_categories(self):
136
+ return {'TYPOS'}
137
+
138
def check(self, text: str) -> List[Match]:
    """Match text against enabled rules.

    Queries the ``check`` endpoint below ``self._url`` with the parameters
    built by ``_create_params`` and wraps every entry of the response's
    ``matches`` list in a ``Match``.
    """
    endpoint = urllib.parse.urljoin(self._url, 'check')
    request_params = self._create_params(text)
    payload = self._query_server(endpoint, request_params)
    return list(map(Match, payload['matches']))
144
+
145
+ def _create_params(self, text: str) -> Dict[str, str]:
146
+ params = {'language': str(self.language), 'text': text}
147
+ if self.motherTongue is not None:
148
+ params['motherTongue'] = self.motherTongue
149
+ if self.disabled_rules:
150
+ params['disabledRules'] = ','.join(self.disabled_rules)
151
+ if self.enabled_rules:
152
+ params['enabledRules'] = ','.join(self.enabled_rules)
153
+ if self.enabled_rules_only:
154
+ params['enabledOnly'] = 'true'
155
+ if self.disabled_categories:
156
+ params['disabledCategories'] = ','.join(self.disabled_categories)
157
+ if self.enabled_categories:
158
+ params['enabledCategories'] = ','.join(self.enabled_categories)
159
+ if self.preferred_variants:
160
+ params['preferredVariants'] = ','.join(self.preferred_variants)
161
+ return params
162
+
163
def correct(self, text: str) -> str:
    """Automatically apply suggestions to the text.

    Runs ``check`` on the text and hands the text plus the resulting
    matches to the module-level ``correct`` helper.
    """
    found_matches = self.check(text)
    return correct(text, found_matches)
166
+
167
def enable_spellchecking(self):
    """Enable spell-checking rules.

    Removes the spell-checking categories from ``disabled_categories``.
    """
    spelling_categories = self._spell_checking_categories
    self.disabled_categories.difference_update(spelling_categories)
172
+
173
def disable_spellchecking(self):
    """Disable spell-checking rules.

    Adds the spell-checking categories to ``disabled_categories``.
    """
    spelling_categories = self._spell_checking_categories
    self.disabled_categories.update(spelling_categories)
176
+
177
@staticmethod
def _get_valid_spelling_file_path() -> str:
    """Return the path of the English hunspell spelling file.

    The file is looked up below the LanguageTool directory.

    Raises:
        FileNotFoundError: if the file does not exist at that location.
    """
    relative_path = "org/languagetool/resource/en/hunspell/spelling.txt"
    spelling_file_path = os.path.join(
        get_language_tool_directory(), relative_path
    )
    if os.path.exists(spelling_file_path):
        return spelling_file_path
    message = (
        "Failed to find the spellings file at {}\n "
        "Please file an issue at "
        "https://github.com/jxmorris12/language_tool_python/issues"
    ).format(spelling_file_path)
    raise FileNotFoundError(message)
190
+
191
def _register_spellings(self, spellings):
    """Append ``spellings`` to the spelling file.

    A newline is written first, then the words separated by newlines
    (no trailing newline).
    """
    target_path = self._get_valid_spelling_file_path()
    with open(target_path, "a+", encoding='utf-8') as handle:
        appended_text = "\n" + "\n".join(list(spellings))
        handle.write(appended_text)
    if DEBUG_MODE:
        print("Registered new spellings at {}".format(target_path))
201
+
202
def _unregister_spellings(self):
    """Remove the words appended by ``_register_spellings``.

    The spelling file is cut just before the N-th newline counted from its
    end, where N is ``len(self._new_spellings)``; that newline is the one
    ``_register_spellings`` wrote ahead of the first new word.

    If the file holds fewer newlines than expected, it is left untouched
    because the appended block cannot be located safely.
    """
    spelling_file_path = self._get_valid_spelling_file_path()
    # Operate on raw bytes: text-mode tell() values are opaque cookies, so
    # arithmetic on them (and seeking into the middle of a multi-byte UTF-8
    # character) is not reliable. The byte 0x0A never occurs inside a
    # multi-byte UTF-8 sequence, so scanning bytes for b'\n' is safe.
    with open(spelling_file_path, 'rb+') as spellings_file:
        content = spellings_file.read()
        cut = len(content)
        for _ in range(len(self._new_spellings)):
            cut = content.rfind(b'\n', 0, cut)
            if cut == -1:
                break
        if cut != -1:
            # Where text-mode writes translate "\n" to "\r\n", also drop
            # the carriage return belonging to the removed newline.
            if (os.linesep == '\r\n' and cut > 0
                    and content[cut - 1:cut] == b'\r'):
                cut -= 1
            spellings_file.truncate(cut)
    if DEBUG_MODE:
        print(
            "Unregistered new spellings at {}".format(spelling_file_path)
        )
218
+
219
+ def _get_languages(self) -> set:
220
+ """Get supported languages (by querying the server)."""
221
+ self._start_server_if_needed()
222
+ url = urllib.parse.urljoin(self._url, 'languages')
223
+ languages = set()
224
+ for e in self._query_server(url, num_tries=1):
225
+ languages.add(e.get('code'))
226
+ languages.add(e.get('longCode'))
227
+ languages.add("auto")
228
+ return languages
229
+
230
+ def _start_server_if_needed(self):
231
+ # Start server.
232
+ if not self._server_is_alive() and self._remote is False:
233
+ self._start_server_on_free_port()
234
+
235
+ def _update_remote_server_config(self, url):
236
+ self._url = url
237
+ self._remote = True
238
+
239
def _query_server(self, url, params=None, num_tries=2):
    """GET ``url`` and return the decoded JSON body.

    Up to ``num_tries`` attempts are made. After an I/O or HTTP failure a
    local server (``self._remote is False``) is terminated and restarted
    before the next attempt.

    Raises:
        LanguageToolError: if the response is not valid JSON, or if the
            final attempt fails with an I/O or HTTP error.
    """
    if DEBUG_MODE:
        print('_query_server url:', url, 'params:', params)
    final_attempt = num_tries - 1
    for attempt in range(num_tries):
        try:
            with requests.get(
                url, params=params, timeout=self._TIMEOUT
            ) as response:
                try:
                    return response.json()
                except json.decoder.JSONDecodeError as decode_error:
                    if DEBUG_MODE:
                        print(
                            'URL {} and params {} returned invalid JSON '
                            'response: {}'.format(url, params, decode_error)
                        )
                        print(response)
                        print(response.content)
                    raise LanguageToolError(response.content.decode())
        except (IOError, http.client.HTTPException) as failure:
            if self._remote is False:
                self._terminate_server()
                self._start_local_server()
            if attempt >= final_attempt:
                raise LanguageToolError(
                    '{}: {}'.format(self._url, failure)
                )
265
+
266
+ def _start_server_on_free_port(self):
267
+ while True:
268
+ self._url = 'http://{}:{}/v2/'.format(self._host, self._port)
269
+ try:
270
+ self._start_local_server()
271
+ break
272
+ except ServerError:
273
+ if self._MIN_PORT <= self._port < self._MAX_PORT:
274
+ self._port += 1
275
+ else:
276
+ raise
277
+
278
def _start_local_server(self):
    """Spawn the LanguageTool server process and wait for it to announce
    its port.

    Steps, in order: make sure LanguageTool is downloaded, build the server
    command, start the process with all three standard streams piped, then
    read its stdout line by line until ``_PORT_RE`` matches.

    Raises:
        LanguageToolError: if the announced port differs from
            ``self._port``, or if stdout ended without a port and the
            terminated process's error output does not name ``self._port``.
        ServerError: if no server object is left afterwards and no path
            error was recorded.
        Exception: wrapping the ``PathError`` raised while building the
            server command.
    """
    # Before starting local server, download language tool if needed.
    download_lt(self.language_tool_download_version)
    err = None
    try:
        if DEBUG_MODE:
            if self._port:
                print(
                    'language_tool_python initializing with port:',
                    self._port
                )
            if self.config:
                print(
                    'language_tool_python initializing '
                    'with temporary config file:',
                    self.config.path
                )
        server_cmd = get_server_cmd(self._port, self.config)
    except PathError as e:
        # Can't find path to LanguageTool.
        err = e
    else:
        # Need to PIPE all handles: http://bugs.python.org/issue3905
        self._server = subprocess.Popen(
            server_cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            startupinfo=startupinfo
        )
        # Track the process in the module-level list so the atexit hook
        # can terminate it.
        global RUNNING_SERVER_PROCESSES
        RUNNING_SERVER_PROCESSES.append(self._server)

        match = None
        # Read stdout until the server reports its port or the stream ends.
        while True:
            line = self._server.stdout.readline()
            if not line:
                break
            match = self._PORT_RE.search(line)
            if match:
                port = int(match.group(1))
                if port != self._port:
                    raise LanguageToolError(
                        'requested port {}, but got {}'
                        .format(self._port, port)
                    )
                break
        if not match:
            # stdout ended without a port: terminate the process (which
            # also resets self._server to None) and inspect its error
            # output instead.
            err_msg = self._terminate_server()
            match = self._PORT_RE.search(err_msg)
            if not match:
                raise LanguageToolError(err_msg)
            port = int(match.group(1))
            if port != self._port:
                raise LanguageToolError(err_msg)
            # NOTE(review): falling through here means the error output
            # named our own port - presumably "port already in use";
            # confirm against the server's actual messages.

    if self._server:
        # Keep draining stdout in a daemon thread so the pipe buffer
        # cannot fill up.
        self._consumer_thread = threading.Thread(
            target=lambda: _consume(self._server.stdout))
        self._consumer_thread.daemon = True
        self._consumer_thread.start()
    else:
        # Couldn't start the server, so maybe there is already one running.
        if err:
            raise Exception(err)
        else:
            raise ServerError(
                'Server running; don\'t start a server here.'
            )
348
+
349
+ def _server_is_alive(self):
350
+ return self._server and self._server.poll() is None
351
+
352
+ def _terminate_server(self):
353
+ LanguageToolError_message = ''
354
+ try:
355
+ self._server.terminate()
356
+ except OSError:
357
+ pass
358
+ try:
359
+ LanguageToolError_message = self._server.communicate()[1].strip()
360
+ except (IOError, ValueError):
361
+ pass
362
+ try:
363
+ self._server.stdout.close()
364
+ except IOError:
365
+ pass
366
+ try:
367
+ self._server.stdin.close()
368
+ except IOError:
369
+ pass
370
+ try:
371
+ self._server.stderr.close()
372
+ except IOError:
373
+ pass
374
+ self._server = None
375
+ return LanguageToolError_message
376
+
377
+
378
class LanguageToolPublicAPI(LanguageTool):
    """Language tool client of the official API."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments, fixing ``remote_server`` to the public
        languagetool.org endpoint."""
        public_api_url = 'https://languagetool.org/api/'
        super().__init__(*args, remote_server=public_api_url, **kwargs)
384
+
385
+
386
@atexit.register
def terminate_server():
    """Terminate the server.

    Registered with ``atexit``: calls ``terminate()`` on every process
    recorded in ``RUNNING_SERVER_PROCESSES``.
    """
    for server_process in RUNNING_SERVER_PROCESSES:
        server_process.terminate()
391
+
392
+
393
+ def _consume(stdout):
394
+ """Consume/ignore the rest of the server output.
395
+ Without this, the server will end up hanging due to the buffer
396
+ filling up.
397
+ """
398
+ while stdout.readline():
399
+ pass
language_tool_python/utils.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+
3
+ import glob
4
+ import locale
5
+ import os
6
+ import subprocess
7
+ import urllib.parse
8
+ import urllib.request
9
+
10
+ from .config_file import LanguageToolConfig
11
+ from .match import Match
12
+ from .which import which
13
+
14
# Jar file names (glob patterns) probed, in this order, by get_jar_info()
# when locating the LanguageTool jar.
JAR_NAMES = [
    'languagetool-server.jar',
    'languagetool-standalone*.jar',  # 2.1
    'LanguageTool.jar',
    'LanguageTool.uno.jar'
]
# Language code used when the locale language cannot be determined.
FAILSAFE_LANGUAGE = 'en'

LTP_PATH_ENV_VAR = "LTP_PATH"  # LanguageTool download path

# Directory containing the LanguageTool jar file:
LTP_JAR_DIR_PATH_ENV_VAR = "LTP_JAR_DIR_PATH"

# https://mail.python.org/pipermail/python-dev/2011-July/112551.html

# ``startupinfo`` is handed to subprocess.Popen when the server is started.
# NOTE(review): on Windows STARTF_USESHOWWINDOW presumably keeps a console
# window from popping up for the Java process - confirm.
if os.name == 'nt':
    startupinfo = subprocess.STARTUPINFO()
    startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
else:
    startupinfo = None
34
+
35
+
36
class LanguageToolError(Exception):
    """Base class for all errors raised by this package."""


class ServerError(LanguageToolError):
    """Raised when a local LanguageTool server cannot be started."""


class JavaError(LanguageToolError):
    """Raised when no Java executable can be found."""


class PathError(LanguageToolError):
    """Raised when the LanguageTool jar cannot be located."""
50
+
51
+
52
def parse_url(url_str):
    """ Parses a URL string, and adds 'http' if necessary.

    The ``http://`` prefix is added unless the string already starts with
    an ``http://`` or ``https://`` scheme (checked case-insensitively).
    A plain substring test is not enough: a host such as
    ``httpbin.local:8081`` contains "http" but carries no scheme.

    Returns:
        The URL as re-assembled by ``urllib.parse.urlparse(...).geturl()``.
    """
    if not url_str.lower().startswith(('http://', 'https://')):
        url_str = 'http://' + url_str

    return urllib.parse.urlparse(url_str).geturl()
58
+
59
+
60
def _4_bytes_encoded_positions(text: str) -> List[int]:
    """Return a list of positions of 4-byte encoded characters in the text.

    Positions are counted the way LanguageTool counts them: every character
    that needs 4 bytes in UTF-8 takes up two index slots instead of one.
    """
    positions = []
    extra_slots = 0
    for index, char in enumerate(text):
        if len(char.encode('utf-8')) == 4:
            positions.append(index + extra_slots)
            # 4-byte characters are two units long in LanguageTool but a
            # single character in Python.
            extra_slots += 1
    return positions


def correct(text: str, matches: List["Match"]) -> str:
    """Automatically apply suggestions to the text.

    For every match that has replacements, the first replacement is
    substituted for the flagged span. A match is skipped if its span no
    longer holds the originally flagged characters.

    The ``matches`` passed in are not modified, so the same list can be
    applied again; earlier versions shifted ``match.offset`` in place,
    which made a second call use wrong offsets.

    Args:
        text: The text the matches were computed for.
        matches: Matches providing ``offset``, ``errorLength`` and
            ``replacements``.

    Returns:
        The corrected text.
    """
    # Offsets reported for the text count 4-byte characters twice; compute
    # their positions once instead of once per match.
    wide_positions = _4_bytes_encoded_positions(text)
    ltext = list(text)

    # (python_offset, match) pairs for the matches that can be applied.
    # NOTE(review): the ``<=`` comparison and the unadjusted errorLength
    # are kept as they were; a match that starts on, or spans, a 4-byte
    # character may still be misplaced - confirm with such input.
    applicable = [
        (match.offset - sum(1 for pos in wide_positions
                            if pos <= match.offset), match)
        for match in matches if match.replacements
    ]
    # Snapshot the flagged spans before any replacement shifts the text.
    errors = [ltext[start:start + match.errorLength]
              for start, match in applicable]

    correct_offset = 0
    for (start, match), error in zip(applicable, errors):
        frompos = correct_offset + start
        topos = frompos + match.errorLength
        if ltext[frompos:topos] != error:
            continue
        repl = match.replacements[0]
        ltext[frompos:topos] = list(repl)
        correct_offset += len(repl) - len(error)
    return ''.join(ltext)
94
+
95
+
96
def get_language_tool_download_path() -> str:
    """Return the LanguageTool download path.

    The environment variable named by ``LTP_PATH_ENV_VAR`` wins; otherwise
    ``~/.cache/language_tool_python`` is used.
    """
    try:
        return os.environ[LTP_PATH_ENV_VAR]
    except KeyError:
        home_dir = os.path.expanduser("~")
        return os.path.join(home_dir, ".cache", "language_tool_python")
103
+
104
+
105
def find_existing_language_tool_downloads(download_folder: str) -> List[str]:
    """List the directories in ``download_folder`` named ``LanguageTool*``."""
    pattern = os.path.join(download_folder, 'LanguageTool*')
    return list(filter(os.path.isdir, glob.glob(pattern)))
112
+
113
+
114
def get_language_tool_directory() -> str:
    """Get LanguageTool directory.

    Returns:
        The path of the newest ``LanguageTool*`` directory inside the
        download folder. Numbers in the directory name are compared as
        numbers, so ``LanguageTool-6.10`` ranks above ``LanguageTool-6.9``
        (a plain string comparison would rank it below).

    Raises:
        NotADirectoryError: if the download folder is not a directory.
        FileNotFoundError: if it contains no LanguageTool directory.
    """
    # Local import: only the version-aware ordering below needs it.
    import re

    download_folder = get_language_tool_download_path()
    if not os.path.isdir(download_folder):
        raise NotADirectoryError(
            "LanguageTool directory path is not a valid directory {}."
            .format(download_folder)
        )
    language_tool_path_list = find_existing_language_tool_downloads(
        download_folder
    )

    if not language_tool_path_list:
        raise FileNotFoundError(
            'LanguageTool not found in {}.'.format(download_folder)
        )

    def version_key(path):
        # re.split with one capturing group alternates text / digit runs,
        # so odd indexes are always the numeric parts and keys compare
        # element by element without mixing str and int.
        tokens = re.split(r'(\d+)', os.path.basename(path))
        return [int(token) if index % 2 else token
                for index, token in enumerate(tokens)]

    # Return the latest version found in the directory.
    return max(language_tool_path_list, key=version_key)
133
+
134
+
135
def get_server_cmd(
    port: int = None, config: LanguageToolConfig = None
) -> List[str]:
    """Build the command line that launches the LanguageTool HTTP server.

    ``-p <port>`` and ``--config <path>`` are appended only when the
    respective argument is not ``None``.
    """
    java_path, jar_path = get_jar_info()
    command = [
        java_path, '-cp', jar_path, 'org.languagetool.server.HTTPServer'
    ]
    if port is not None:
        command.extend(('-p', str(port)))
    if config is not None:
        command.extend(('--config', config.path))
    return command
149
+
150
+
151
def get_jar_info() -> Tuple[str, str]:
    """Locate the Java executable and the LanguageTool jar.

    The jar directory comes from the environment variable named by
    ``LTP_JAR_DIR_PATH_ENV_VAR`` if it is defined; only otherwise is the
    download directory consulted. (Previously the download directory was
    resolved unconditionally, so a missing download raised even though the
    environment variable pointed at a valid jar directory.)

    Returns:
        ``(java_path, jar_path)``; the jar is the first existing file
        matching the patterns in ``JAR_NAMES``, tried in order.

    Raises:
        JavaError: if no ``java`` executable is found.
        PathError: if no jar is found in the jar directory.
    """
    java_path = which('java')
    if not java_path:
        raise JavaError("can't find Java")

    # Use the env var to the jar directory if it is defined
    # otherwise look in the download directory
    jar_dir_name = os.environ.get(LTP_JAR_DIR_PATH_ENV_VAR)
    if jar_dir_name is None:
        jar_dir_name = get_language_tool_directory()

    for jar_name in JAR_NAMES:
        for candidate in glob.glob(os.path.join(jar_dir_name, jar_name)):
            if os.path.isfile(candidate):
                return java_path, candidate
    raise PathError("can't find languagetool-standalone in {!r}"
                    .format(jar_dir_name))
175
+
176
+
177
def get_locale_language():
    """Get the language code for the current locale setting.

    Uses ``locale.getlocale()`` and falls back to
    ``locale.getdefaultlocale()`` when that yields no language.
    """
    language_code = locale.getlocale()[0]
    if not language_code:
        language_code = locale.getdefaultlocale()[0]
    return language_code
language_tool_python/which.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """Cross-platform which command."""
4
+
5
+ import os
6
+ import sys
7
+
8
+
9
# Names exported by ``from ... import *``.
__all__ = ['which']

# When true, the Windows get_path_list() also searches %WINDIR%\Sysnative
# or, if that directory does not exist, %WINDIR%\SysWOW64.
WIN_ALLOW_CROSS_ARCH = True
12
+
13
def which(program):
    """Identify the location of an executable file.

    A ``program`` that contains a directory part is checked as given;
    a bare name is looked up in every directory of the search path.
    Returns the first hit, or ``None`` if there is none.
    """
    if os.path.split(program)[0]:
        candidates = [program]
    else:
        candidates = (
            os.path.join(directory, program)
            for directory in get_path_list()
        )
    for candidate in candidates:
        located = find_exe(candidate)
        if located:
            return located
    return None
25
+
26
+
27
def is_exe(path):
    """Return whether ``path`` is a regular file with execute permission."""
    if not os.path.isfile(path):
        return False
    return os.access(path, os.X_OK)
29
+
30
+
31
+ def _get_path_list():
32
+ return os.environ['PATH'].split(os.pathsep)
33
+
34
+
35
# Platform-specific helpers: find_exe() resolves one candidate path,
# get_path_list() yields the directories to search.
if os.name == 'nt':
    def find_exe(program):
        """Return the executable for ``program``, or None.

        A name with an extension is checked as given; otherwise every
        extension listed in PATHEXT is tried (lower-cased) in order.
        """
        root, ext = os.path.splitext(program)
        if ext:
            if is_exe(program):
                return program
        else:
            for ext in os.environ['PATHEXT'].split(os.pathsep):
                program_path = root + ext.lower()
                if is_exe(program_path):
                    return program_path
        return None

    def get_path_list():
        """Return the search path, extended by the alternate system
        directory when WIN_ALLOW_CROSS_ARCH is set."""
        paths = _get_path_list()
        if WIN_ALLOW_CROSS_ARCH:
            # Sysnative, if present, is searched first ...
            alt_sys_path = os.path.expandvars(r"$WINDIR\Sysnative")
            if os.path.isdir(alt_sys_path):
                paths.insert(0, alt_sys_path)
            else:
                # ... otherwise SysWOW64, if present, is searched last.
                alt_sys_path = os.path.expandvars(r"$WINDIR\SysWOW64")
                if os.path.isdir(alt_sys_path):
                    paths.append(alt_sys_path)
        return paths

else:
    def find_exe(program):
        """Return ``program`` if it is an executable file, else None."""
        return program if is_exe(program) else None

    # No extra directories outside Windows: use the plain PATH list.
    get_path_list = _get_path_list
65
+
66
+
67
def main():
    """Print the resolved path of every command-line argument that can be
    located; arguments that cannot be found produce no output."""
    resolved_paths = (which(argument) for argument in sys.argv[1:])
    for path in resolved_paths:
        if path:
            print(path)
72
+
73
+
74
# Script entry point: run main() and pass its return value to sys.exit().
if __name__ == '__main__':
    sys.exit(main())