# Page-header residue from the web listing this file was copied from ("Spaces: Running"); not part of the module.
import re
import string
from collections import Counter
# Ordered rewrite rules for Cyrillic -> Latin, applied one after another by
# transliterate_with_rules().  Each rule is a dict with the keys
# 'find_what', 'replacer' and 're' (True = regular expression, False = plain
# substring replacement).
#
# The order is significant:
#   1. every Cyrillic letter is replaced by a Latin letter; Я/Ю/Ё become the
#      umlaut vowels Ä/Ü/Ö and Ь becomes a bare combining acute (U+0301);
#   2. the regex rules then rewrite those umlaut vowels as "j" + vowel when
#      they stand alone, start a word, or follow another vowel.
_cyr2lat = [
    # Pass 1: plain letter substitutions (Cyrillic on the left, Latin on the right).
    {'find_what': cyr, 'replacer': lat, 're': False}
    for cyr, lat in (
        ('А', 'A'), ('а', 'a'),
        ('О', 'O'), ('о', 'o'),
        ('У', 'U'), ('у', 'u'),
        ('Ы', 'Y'), ('ы', 'y'),
        ('И', 'I'), ('и', 'i'),
        ('Е', 'E'), ('е', 'e'),
        ('Б', 'B'), ('б', 'b'),
        ('В', 'V'), ('в', 'v'),
        ('Г', 'G'), ('г', 'g'),
        ('Д', 'D'), ('д', 'd'),
        ('З', 'Z'), ('з', 'z'),
        ('К', 'K'), ('к', 'k'),
        ('Л', 'L'), ('л', 'l'),
        ('М', 'M'), ('м', 'm'),
        ('Н', 'N'), ('н', 'n'),
        ('П', 'P'), ('п', 'p'),
        ('Р', 'R'), ('р', 'r'),
        ('С', 'S'), ('с', 's'),
        ('Т', 'T'), ('т', 't'),
        ('Ф', 'F'), ('ф', 'f'),
        ('Х', 'H'), ('х', 'h'),
        ('Ц', 'C'), ('ц', 'c'),
        ('Ч', 'Č'), ('ч', 'č'),
        ('Ш', 'Š'), ('ш', 'š'),
        ('Ж', 'Ž'), ('ж', 'ž'),
        ('Щ', 'Čš'), ('щ', 'čš'),
        # Soft sign -> combining acute accent; written as an escape because
        # the bare character is invisible inside quotes.
        ('Ь', '\u0301'), ('ь', '\u0301'),
        ('Й', 'J'), ('й', 'j'),
        # Hard sign is dropped.
        ('Ъ', ''), ('ъ', ''),
        ('Э', 'Ě'), ('э', 'ě'),
        ('Я', 'Ä'), ('я', 'ä'),
        ('Ю', 'Ü'), ('ю', 'ü'),
        ('Ё', 'Ö'), ('ё', 'ö'),
    )
] + [
    # Pass 2: regex rules turning umlaut vowels into "j" + vowel.
    {'find_what': pattern, 'replacer': replacement, 're': True}
    for pattern, replacement in (
        # The vowel is a whole word on its own.
        ('\\bö\\b', 'jo'),
        ('\\bÖ\\b', 'Jo'),
        ('\\bü\\b', 'ju'),
        ('\\bÜ\\b', 'Ju'),
        ('\\bä\\b', 'ja'),
        ('\\bÄ\\b', 'Ja'),
        # Word-initial vowel followed by a lowercase letter.
        ('(\\bö)([a-zöäüšžčě])', 'jo\\2'),
        ('(\\bä)([a-zöäüšžčě])', 'ja\\2'),
        ('(\\bü)([a-zöäüšžčě])', 'ju\\2'),
        ('(\\bÖ)([a-zöäüšžčě])', 'Jo\\2'),
        ('(\\bÄ)([a-zöäüšžčě])', 'Ja\\2'),
        ('(\\bÜ)([a-zöäüšžčě])', 'Ju\\2'),
        # Word-initial vowel followed by an uppercase letter.
        ('(\\bö)([A-ZÖÄÜŠŽČĚ])', 'jo\\2'),
        ('(\\bä)([A-ZÖÄÜŠŽČĚ])', 'ja\\2'),
        ('(\\bü)([A-ZÖÄÜŠŽČĚ])', 'ju\\2'),
        ('(\\bÖ)([A-ZÖÄÜŠŽČĚ])', 'JO\\2'),
        ('(\\bÄ)([A-ZÖÄÜŠŽČĚ])', 'JA\\2'),
        ('(\\bÜ)([A-ZÖÄÜŠŽČĚ])', 'JU\\2'),
        # Vowel directly after another vowel.
        ('([aouiěyeöüäAOUIĚYEÖÜÄ])(ä)', '\\1ja'),
        ('([aouiěyeöüäAOUIĚYEÖÜÄ])(Ä)', '\\1JA'),
        ('([aouiěyeöüäAOUIĚYEÖÜÄ])(ö)', '\\1jo'),
        ('([aouiěyeöüäAOUIĚYEÖÜÄ])(Ö)', '\\1JO'),
        ('([aouiěyeöüäAOUIĚYEÖÜÄ])(ü)', '\\1ju'),
        ('([aouiěyeöüäAOUIĚYEÖÜÄ])(Ü)', '\\1JU'),
    )
]
# Optional post-pass for cyr2lat(joint_acute=True): replace a base consonant
# followed by a separate combining acute (U+0301) with a single precomposed
# letter.  For t/d the precomposed form carries a caron (ť/ď).
#
# The combining mark is spelled as an explicit escape: a decomposed and a
# precomposed letter look identical in most editors, and a Unicode-normalising
# tool would otherwise silently turn these rules into no-ops.
_cyr2lat_joint_acutes = [
    {'find_what': base + '\u0301', 'replacer': joint, 're': False}
    for base, joint in (
        ('s', 'ś'), ('z', 'ź'), ('c', 'ć'), ('n', 'ń'),
        ('r', 'ŕ'), ('t', 'ť'), ('d', 'ď'), ('l', 'ĺ'),
        ('S', 'Ś'), ('Z', 'Ź'), ('C', 'Ć'), ('N', 'Ń'),
        ('T', 'Ť'), ('D', 'Ď'), ('L', 'Ĺ'), ('R', 'Ŕ'),
    )
]
# Optional post-pass for cyr2lat(first_e_with_hacek=False): strip the hacek
# from an "ě"/"Ě" that stands at the start of a word.
_cyr2lat_first_e = [
    {'find_what': '\\bĚ', 'replacer': 'E', 're': True},
    {'find_what': '\\bě', 'replacer': 'e', 're': True},
]
# Optional post-pass for cyr2lat(soft_l_after_vowels=False): after one of the
# vowels y/i/ě/e the acute is removed from a soft "l".
# Each pattern accepts both spellings of the soft l:
#   joint acute (precomposed letter) | disjoint acute (l + U+0301)
_cyr2lat_soft_l_after_vowels = [
    {'find_what': '([yiěeYIĚE])(Ĺ|L\u0301)', 'replacer': '\\1L', 're': True},
    {'find_what': '([yiěeYIĚE])(ĺ|l\u0301)', 'replacer': '\\1l', 're': True},
]
# Ordered rewrite rules for Latin -> Cyrillic (consumed by lat2cyr()).
# Rule format is the same as in the Cyrillic -> Latin table: a dict with
# 'find_what', 'replacer' and 're'.
#
# The order is significant:
#   1. precomposed soft consonants are split into base letter + combining
#      acute (U+0301), so that a single rule can map the acute to "ь";
#   2. umlaut vowels, "j", the acute and the digraph "čš" are mapped before
#      the plain single letters they contain;
#   3. once everything is Cyrillic, "й" + vowel is folded into я / ё / ю.
#
# Notes:
#   * The acute always becomes lowercase "ь": an uppercase soft sign cannot
#     be told apart on the Latin side, so a second rule for "Ь" would never
#     fire and is not listed.
#   * "йо" is folded into "ё" (the Cyrillic -> Latin direction writes ё as
#     "jo"); genuine "йо" spellings are therefore not restored.
#   * todo: introduce Ъ when appropriate
_lat2cyr = [
    # Step 1: precomposed letter -> base letter + explicit combining acute.
    {'find_what': joint, 'replacer': base + '\u0301', 're': False}
    for joint, base in (
        ('Ŕ', 'R'), ('Ĺ', 'L'), ('Ď', 'D'), ('Ť', 'T'),
        ('Ń', 'N'), ('Ć', 'C'), ('Ź', 'Z'), ('Ś', 'S'),
        ('ĺ', 'l'), ('ď', 'd'), ('ť', 't'), ('ŕ', 'r'),
        ('ń', 'n'), ('ć', 'c'), ('ź', 'z'), ('ś', 's'),
    )
] + [
    # Step 2: plain substitutions (Latin on the left, Cyrillic on the right).
    {'find_what': lat, 'replacer': cyr, 're': False}
    for lat, cyr in (
        ('ö', 'ё'), ('Ö', 'Ё'),
        ('ü', 'ю'), ('Ü', 'Ю'),
        ('ä', 'я'), ('Ä', 'Я'),
        ('ě', 'э'), ('Ě', 'Э'),
        ('j', 'й'), ('J', 'Й'),
        ('\u0301', 'ь'),
        # The digraph must be handled before its parts "č" and "š".
        ('čš', 'щ'), ('Čš', 'Щ'),
        ('ž', 'ж'), ('Ž', 'Ж'),
        ('š', 'ш'), ('Š', 'Ш'),
        ('č', 'ч'), ('Č', 'Ч'),
        ('c', 'ц'), ('C', 'Ц'),
        ('h', 'х'), ('H', 'Х'),
        ('f', 'ф'), ('F', 'Ф'),
        ('t', 'т'), ('T', 'Т'),
        ('s', 'с'), ('S', 'С'),
        ('r', 'р'), ('R', 'Р'),
        ('p', 'п'), ('P', 'П'),
        ('n', 'н'), ('N', 'Н'),
        ('m', 'м'), ('M', 'М'),
        ('l', 'л'), ('L', 'Л'),
        ('k', 'к'), ('K', 'К'),
        ('z', 'з'), ('Z', 'З'),
        ('d', 'д'), ('D', 'Д'),
        ('g', 'г'), ('G', 'Г'),
        ('v', 'в'), ('V', 'В'),
        ('b', 'б'), ('B', 'Б'),
        ('e', 'е'), ('E', 'Е'),
        ('i', 'и'), ('I', 'И'),
        ('y', 'ы'), ('Y', 'Ы'),
        ('u', 'у'), ('U', 'У'),
        ('o', 'о'), ('O', 'О'),
        ('a', 'а'), ('A', 'А'),
    )
] + [
    # Step 3: ya, yo, yu -- fold "й" + vowel (already Cyrillic) into one letter.
    {'find_what': pattern, 'replacer': replacement, 're': True}
    for pattern, replacement in (
        ('Й[Аа]', 'Я'),
        ('й[Аа]', 'я'),
        ('Й[Оо]', 'Ё'),
        ('й[Оо]', 'ё'),
        ('Й[Уу]', 'Ю'),
        ('й[Уу]', 'ю'),
    )
]
def transliterate_with_rules(text, rules):
    """Apply an ordered list of rewrite rules to *text* and return the result.

    Args:
        text: The string to rewrite.
        rules: Iterable of dicts with the keys ``'find_what'`` and
            ``'replacer'``, plus an optional ``'re'`` flag.  When the flag is
            truthy the rule is applied with ``re.sub`` (so ``replacer`` may
            contain group references such as ``\\1``); otherwise it is a
            plain ``str.replace`` of every occurrence.

    Returns:
        The text after all rules have been applied in order; each rule sees
        the output of the previous one.
    """
    for item in rules:
        if item.get('re'):
            text = re.sub(item['find_what'], item['replacer'], text)
        else:
            text = text.replace(item['find_what'], item['replacer'])
    return text
def cyr2lat(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels=True):
    """Transliterate Cyrillic *text* to Latin using the module's rule tables.

    The base table ``_cyr2lat`` is always applied; the flags select optional
    follow-up passes, run in the order listed below.

    Args:
        text: The string to transliterate.
        joint_acute: When true, also apply ``_cyr2lat_joint_acutes``
            (letter + combining acute -> single precomposed letter).
        first_e_with_hacek: When false, also apply ``_cyr2lat_first_e``
            (word-initial "ě"/"Ě" -> "e"/"E").
        soft_l_after_vowels: When false, also apply
            ``_cyr2lat_soft_l_after_vowels`` (soft l after y/i/ě/e -> plain l).

    Returns:
        The transliterated string.
    """
    text = transliterate_with_rules(text, _cyr2lat)
    if joint_acute:
        text = transliterate_with_rules(text, _cyr2lat_joint_acutes)
    # The next two flags are "keep" switches: their pass runs only when the
    # flag is turned off.
    if not first_e_with_hacek:
        text = transliterate_with_rules(text, _cyr2lat_first_e)
    if not soft_l_after_vowels:
        text = transliterate_with_rules(text, _cyr2lat_soft_l_after_vowels)
    return text
def lat2cyr(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels=True):
    """Transliterate Latin *text* to Cyrillic using the ``_lat2cyr`` rule table.

    Args:
        text: The string to transliterate.
        joint_acute: Accepted for signature parity with ``cyr2lat``;
            currently ignored.
        first_e_with_hacek: Accepted for signature parity; currently ignored.
        soft_l_after_vowels: Accepted for signature parity; currently ignored.

    Returns:
        The transliterated string.
    """
    # todo: support all the optional settings
    return transliterate_with_rules(text, _lat2cyr)
# Character inventories used by detect_script(); membership is tested against
# lower-cased text, so only lowercase letters are listed.
# All 33 letters of the Russian alphabet (the original list omitted "й").
CYR_CHARS = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
# Taken from the standard library so that no letter can be dropped by a typo
# (the hand-written list was missing "s").
BASIC_LAT_CHARS = string.ascii_lowercase
# Accented letters of the Latin orthography; š/ž/č are included because the
# Cyrillic -> Latin table emits them for ш/ж/ч.
ACCENT_LAT_CHARS = 'ěäüöšžčśźćńŕťďĺ'
LAT_CHARS = BASIC_LAT_CHARS + ACCENT_LAT_CHARS
def detect_script(text: str, min_prevalence: float = 2.0, min_detectable: float = 0.1) -> str:
    """Detect the script of the text.

    Every character of the lower-cased text is counted as Cyrillic (present in
    ``CYR_CHARS``), Latin (present in ``LAT_CHARS``) or other.

    Args:
        text: The string to classify.
        min_prevalence: Factor by which one script's count must reach or
            exceed the other's for the text to be assigned to that script.
        min_detectable: Minimal share of Cyrillic + Latin characters among
            all characters; below it the result is ``'unk'``.

    Returns:
        One of:
        - cyr - Cyrillic
        - lat - Latin
        - mix - Mixed Cyrillic and Latin script
        - unk - Unknown script (probably neither Latin nor Cyrillic)
    """
    cyr, lat, other = 0, 0, 0
    for char, cnt in Counter(text.lower()).items():
        if char in CYR_CHARS:
            cyr += cnt
        elif char in LAT_CHARS:
            lat += cnt
        else:
            other += cnt

    letters = cyr + lat
    # No recognised letter at all (this also covers empty input).  Checked
    # explicitly so that min_detectable <= 0 cannot fall through to the
    # comparisons below, where 0 >= 0 would report 'cyr'.
    if not letters:
        return 'unk'
    if letters / (letters + other) < min_detectable:
        return 'unk'
    if cyr >= lat * min_prevalence:
        return 'cyr'
    if lat >= cyr * min_prevalence:
        return 'lat'
    return 'mix'