# sashtech's picture
# Upload 9 files
# 7b96a1b verified
from typing import Dict, List
import atexit
import http.client
import json
import os
import re
import requests
import socket
import subprocess
import threading
import urllib.parse
from .config_file import LanguageToolConfig
from .download_lt import download_lt, LTP_DOWNLOAD_VERSION
from .language_tag import LanguageTag
from .match import Match
from .utils import (
correct,
parse_url, get_locale_language,
get_language_tool_directory, get_server_cmd,
FAILSAFE_LANGUAGE, startupinfo,
LanguageToolError, ServerError, PathError
)
# When set to True, code in this module prints diagnostic messages
# (server start-up details, queried URLs, spelling-file updates).
DEBUG_MODE = False
# Keep track of running server processes (subprocess.Popen objects) in a
# global list. This way, we can ensure they're killed on exit.
RUNNING_SERVER_PROCESSES: List[subprocess.Popen] = []
class LanguageTool:
    """Main class used for checking text against different rules.

    Unless ``remote_server`` is given, a LanguageTool server is started as
    a local subprocess and queried over HTTP.  The instance can be used as
    a context manager; leaving the ``with`` block calls :meth:`close`.

    LanguageTool v2 API documentation:
    https://languagetool.org/http-api/swagger-ui/#!/default/post_check
    """

    # Inclusive range of ports tried when starting a local server.
    _MIN_PORT = 8081
    _MAX_PORT = 8999
    # Timeout (in seconds) handed to ``requests.get``.
    _TIMEOUT = 5 * 60
    _remote = False
    _port = _MIN_PORT
    _server: subprocess.Popen = None
    _consumer_thread: threading.Thread = None
    # Extracts the port number from a server output line.
    _PORT_RE = re.compile(r"(?:https?://.*:|port\s+)(\d+)", re.I)

    def __init__(
        self, language=None, motherTongue=None,
        remote_server=None, newSpellings=None,
        new_spellings_persist=True,
        host=None, config=None,
        language_tool_download_version: str = LTP_DOWNLOAD_VERSION
    ):
        """Set up the client and, if needed, start a local server.

        Args:
            language: Language code to check against.  When ``None`` the
                locale language is used, falling back to
                ``FAILSAFE_LANGUAGE`` if it cannot be determined.
            motherTongue: Optional mother-tongue language code.
            remote_server: URL of an already running server.  When given,
                no local server is started.
            newSpellings: Words to append to the local spelling file.
            new_spellings_persist: When ``False``, the words added through
                ``newSpellings`` are removed again by :meth:`close`.
            host: Host used to build the local server URL.  Defaults to
                the address ``localhost`` resolves to.
            config: Configuration handed to ``LanguageToolConfig``.  Must
                not be combined with ``remote_server``.
            language_tool_download_version: Version passed to
                ``download_lt`` before a local server is started.
        """
        self.language_tool_download_version = language_tool_download_version
        self._new_spellings = None
        self._new_spellings_persist = new_spellings_persist
        self._host = host or socket.gethostbyname('localhost')

        if remote_server:
            assert config is None, "cannot pass config file to remote server"
        self.config = LanguageToolConfig(config) if config else None

        if remote_server is not None:
            self._remote = True
            self._url = parse_url(remote_server)
            self._url = urllib.parse.urljoin(self._url, 'v2/')
            self._update_remote_server_config(self._url)
        elif not self._server_is_alive():
            self._start_server_on_free_port()

        if language is None:
            try:
                language = get_locale_language()
            except ValueError:
                language = FAILSAFE_LANGUAGE

        if newSpellings:
            self._new_spellings = newSpellings
            self._register_spellings(self._new_spellings)

        self._language = LanguageTag(language, self._get_languages())
        self.motherTongue = motherTongue
        self.disabled_rules = set()
        self.enabled_rules = set()
        self.disabled_categories = set()
        self.enabled_categories = set()
        self.enabled_rules_only = False
        self.preferred_variants = set()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __del__(self):
        self.close()

    def __repr__(self):
        return '{}(language={!r}, motherTongue={!r})'.format(
            self.__class__.__name__, self.language, self.motherTongue)

    def close(self):
        """Stop the local server (if running) and undo temporary spellings."""
        if self._server_is_alive():
            self._terminate_server()
        if not self._new_spellings_persist and self._new_spellings:
            self._unregister_spellings()
            self._new_spellings = []

    @property
    def language(self):
        """The language to be used."""
        return self._language

    @language.setter
    def language(self, language):
        self._language = LanguageTag(language, self._get_languages())
        # Rule selections made for the previous language are dropped.
        self.disabled_rules.clear()
        self.enabled_rules.clear()

    @property
    def motherTongue(self):
        """The user's mother tongue or None.

        The mother tongue may also be used as a source language for
        checking bilingual texts.
        """
        return self._motherTongue

    @motherTongue.setter
    def motherTongue(self, motherTongue):
        self._motherTongue = (
            None if motherTongue is None
            else LanguageTag(motherTongue, self._get_languages())
        )

    @property
    def _spell_checking_categories(self):
        return {'TYPOS'}

    def check(self, text: str) -> List[Match]:
        """Match text against enabled rules."""
        url = urllib.parse.urljoin(self._url, 'check')
        response = self._query_server(url, self._create_params(text))
        matches = response['matches']
        return [Match(match) for match in matches]

    def _create_params(self, text: str) -> Dict[str, str]:
        """Build the query parameters of a ``check`` request for *text*."""
        params = {'language': str(self.language), 'text': text}
        if self.motherTongue is not None:
            # Send the tag as text, exactly like ``language`` above.
            params['motherTongue'] = str(self.motherTongue)
        if self.disabled_rules:
            params['disabledRules'] = ','.join(self.disabled_rules)
        if self.enabled_rules:
            params['enabledRules'] = ','.join(self.enabled_rules)
        if self.enabled_rules_only:
            params['enabledOnly'] = 'true'
        if self.disabled_categories:
            params['disabledCategories'] = ','.join(self.disabled_categories)
        if self.enabled_categories:
            params['enabledCategories'] = ','.join(self.enabled_categories)
        if self.preferred_variants:
            params['preferredVariants'] = ','.join(self.preferred_variants)
        return params

    def correct(self, text: str) -> str:
        """Automatically apply suggestions to the text."""
        return correct(text, self.check(text))

    def enable_spellchecking(self):
        """Enable spell-checking rules."""
        self.disabled_categories.difference_update(
            self._spell_checking_categories
        )

    def disable_spellchecking(self):
        """Disable spell-checking rules."""
        self.disabled_categories.update(self._spell_checking_categories)

    @staticmethod
    def _get_valid_spelling_file_path() -> str:
        """Return the path of the English spelling file.

        Raises:
            FileNotFoundError: If the file does not exist inside the
                LanguageTool directory.
        """
        library_path = get_language_tool_directory()
        spelling_file_path = os.path.join(
            library_path, "org/languagetool/resource/en/hunspell/spelling.txt"
        )
        if not os.path.exists(spelling_file_path):
            raise FileNotFoundError(
                "Failed to find the spellings file at {}\n "
                "Please file an issue at "
                "https://github.com/jxmorris12/language_tool_python/issues"
                .format(spelling_file_path))
        return spelling_file_path

    def _register_spellings(self, spellings):
        """Append *spellings* to the spelling file, one word per line."""
        spelling_file_path = self._get_valid_spelling_file_path()
        with open(spelling_file_path, "a+", encoding='utf-8') as spellings_file:
            # Every word is preceded by a newline, so removing them later
            # means cutting at the N-th newline from the end of the file.
            spellings_file.write("\n" + "\n".join(spellings))
        if DEBUG_MODE:
            print("Registered new spellings at {}".format(spelling_file_path))

    def _unregister_spellings(self):
        """Remove the words previously appended by ``_register_spellings``.

        The file is processed as bytes: offset arithmetic on a text-mode
        file is unreliable and breaks on multi-byte UTF-8 characters.
        """
        spelling_file_path = self._get_valid_spelling_file_path()
        count = len(self._new_spellings)
        if count:
            with open(spelling_file_path, 'rb+') as spellings_file:
                content = spellings_file.read()
                # One "\n<word>" was appended per spelling; everything from
                # the count-th newline from the end onwards is ours.
                parts = content.rsplit(b'\n', count)
                # Leave the file untouched if it holds fewer newlines than
                # expected (it was modified by something else).
                if len(parts) == count + 1:
                    kept = parts[0]
                    # Text-mode appends write "\r\n" where that is the
                    # platform line separator; drop the orphaned "\r".
                    if os.linesep == '\r\n' and kept.endswith(b'\r'):
                        kept = kept[:-1]
                    spellings_file.truncate(len(kept))
        if DEBUG_MODE:
            print(
                "Unregistered new spellings at {}".format(spelling_file_path)
            )

    def _get_languages(self) -> set:
        """Get supported languages (by querying the server)."""
        self._start_server_if_needed()
        url = urllib.parse.urljoin(self._url, 'languages')
        languages = set()
        for e in self._query_server(url, num_tries=1):
            languages.add(e.get('code'))
            languages.add(e.get('longCode'))
        languages.add("auto")
        return languages

    def _start_server_if_needed(self):
        """Start a local server unless one is alive or a remote is used."""
        if not self._server_is_alive() and self._remote is False:
            self._start_server_on_free_port()

    def _update_remote_server_config(self, url):
        self._url = url
        self._remote = True

    def _query_server(self, url, params=None, num_tries=2):
        """GET *url* and return the decoded JSON response.

        On a connection-level failure a local server is restarted before
        the next attempt.

        Raises:
            LanguageToolError: If the response is not valid JSON, or if
                every attempt failed.
        """
        if DEBUG_MODE:
            print('_query_server url:', url, 'params:', params)
        for n in range(num_tries):
            try:
                with (
                    requests.get(url, params=params, timeout=self._TIMEOUT)
                ) as response:
                    try:
                        return response.json()
                    except json.decoder.JSONDecodeError as e:
                        if DEBUG_MODE:
                            print(
                                'URL {} and params {} '
                                'returned invalid JSON response: {}'
                                .format(url, params, e)
                            )
                            print(response)
                            print(response.content)
                        raise LanguageToolError(
                            response.content.decode()
                        ) from e
            except (IOError, http.client.HTTPException) as e:
                if self._remote is False:
                    self._terminate_server()
                    self._start_local_server()
                if n + 1 >= num_tries:
                    raise LanguageToolError(
                        '{}: {}'.format(self._url, e)
                    ) from e

    def _start_server_on_free_port(self):
        """Start a local server, moving to the next port on ServerError."""
        while True:
            self._url = 'http://{}:{}/v2/'.format(self._host, self._port)
            try:
                self._start_local_server()
                break
            except ServerError:
                if self._MIN_PORT <= self._port < self._MAX_PORT:
                    self._port += 1
                else:
                    raise

    def _start_local_server(self):
        """Launch the server subprocess on ``self._port``.

        Raises:
            LanguageToolError: If the server reports a different port or
                does not report a port at all.
            ServerError: If no server process is left running afterwards.
            Exception: If the server command could not be built.
        """
        # Before starting local server, download language tool if needed.
        download_lt(self.language_tool_download_version)
        err = None
        try:
            if DEBUG_MODE:
                if self._port:
                    print(
                        'language_tool_python initializing with port:',
                        self._port
                    )
                if self.config:
                    print(
                        'language_tool_python initializing '
                        'with temporary config file:',
                        self.config.path
                    )
            server_cmd = get_server_cmd(self._port, self.config)
        except PathError as e:
            # Can't find path to LanguageTool.
            err = e
        else:
            # Need to PIPE all handles: http://bugs.python.org/issue3905
            self._server = subprocess.Popen(
                server_cmd,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
                startupinfo=startupinfo
            )
            RUNNING_SERVER_PROCESSES.append(self._server)

            # Read the server output until it announces its port.
            match = None
            while True:
                line = self._server.stdout.readline()
                if not line:
                    break
                match = self._PORT_RE.search(line)
                if match:
                    port = int(match.group(1))
                    if port != self._port:
                        raise LanguageToolError(
                            'requested port {}, but got {}'
                            .format(self._port, port)
                        )
                    break
            if not match:
                # stdout ended without a port: stop the process and look
                # for the port in its error output instead.
                err_msg = self._terminate_server()
                match = self._PORT_RE.search(err_msg)
                if not match:
                    raise LanguageToolError(err_msg)
                port = int(match.group(1))
                if port != self._port:
                    raise LanguageToolError(err_msg)

        if self._server:
            # Bind the pipe now rather than looking up ``self._server``
            # inside the thread, where it may already have been reset.
            self._consumer_thread = threading.Thread(
                target=_consume, args=(self._server.stdout,))
            self._consumer_thread.daemon = True
            self._consumer_thread.start()
        else:
            # Couldn't start the server, so maybe there is already one running.
            if err:
                raise Exception(err) from err
            else:
                raise ServerError(
                    'Server running; don\'t start a server here.'
                )

    def _server_is_alive(self):
        """Return True if a local server process exists and has not exited."""
        return self._server is not None and self._server.poll() is None

    def _terminate_server(self):
        """Terminate the local server and return its stripped stderr text.

        Returns an empty string when there is no server process or its
        error output could not be collected.
        """
        server = self._server
        if server is None:
            # Nothing to stop (e.g. already closed).
            return ''
        error_message = ''
        try:
            server.terminate()
        except OSError:
            pass
        try:
            error_message = server.communicate()[1].strip()
        except (IOError, ValueError):
            pass
        for stream in (server.stdout, server.stdin, server.stderr):
            try:
                stream.close()
            except IOError:
                pass
        self._server = None
        # Stop tracking the process once it has really exited; keep it in
        # the list otherwise so the exit hook can still terminate it.
        if server.poll() is not None:
            try:
                RUNNING_SERVER_PROCESSES.remove(server)
            except ValueError:
                pass
        return error_message
class LanguageToolPublicAPI(LanguageTool):
    """Language tool client of the official API."""

    def __init__(self, *args, **kwargs):
        # All caller arguments are forwarded unchanged; only the remote
        # server address is fixed to the public endpoint.
        public_api_url = 'https://languagetool.org/api/'
        super().__init__(*args, remote_server=public_api_url, **kwargs)
@atexit.register
def terminate_server():
    """Terminate every server process still tracked at interpreter exit.

    Termination is best-effort: a process that cannot be signalled must
    not prevent the remaining ones from being terminated.
    """
    # Iterate over a snapshot so the list may be modified while we run.
    for proc in list(RUNNING_SERVER_PROCESSES):
        try:
            proc.terminate()
        except OSError:
            # The process is already gone or cannot be signalled.
            continue
def _consume(stdout):
"""Consume/ignore the rest of the server output.
Without this, the server will end up hanging due to the buffer
filling up.
"""
while stdout.readline():
pass