Spaces:
Sleeping
Sleeping
File size: 4,458 Bytes
80d8416 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |
"""Hyphenation module"""
import string
from hyphen import Hyphenator, dictools
from modules.console_colors import (
ULTRASINGER_HEAD,
blue_highlighted,
)
# PyHyphen fetches its dictionaries from
# 'https://cgit.freedesktop.org/libreoffice/dictionaries/plain/'.
# This is an updated replacement for PyHyphen's dictools language list so the
# dictionaries can be installed. Order matters: lookups that fall back to
# "first match" walk this list from the top. Entries are grouped by initial
# letter, one group per line.
LANGUAGES = [
    "af_ZA", "an_ES", "ar",
    "be_BY", "bg_BG", "bn_BD", "bo", "br_FR", "bs_BA",
    "ca", "ckb", "cs_CZ",
    "da_DK", "de",
    "el_GR", "en", "eo", "es", "et_EE",
    "fa_IR", "fr_FR",
    "gd_GB", "gl", "gu_IN", "gug",
    "he_IL", "hi_IN", "hr_HR", "hu_HU",
    "id", "is", "it_IT",
    "kmr_Latn", "ko_KR",
    "lo_LA", "lt_LT", "lv_LV",
    "mn_MN",
    "ne_NP", "nl_NL", "no",
    "oc_FR",
    "pl_PL", "pt_BR", "pt_PT",
    "ro", "ru_RU",
    "si_LK", "sk_SK", "sl_SI", "sq_AL", "sr", "sv_SE", "sw_TZ",
    "te_IN", "th_TH", "tr_TR",
    "uk_UA",
    "vi",
    "zu_ZA",
]
def language_check(language="en") -> str | None:
    """Resolve a language code to the dictionary key used for hyphenation.

    Candidates are looked up in this order, first hit wins:

    1. an installed dictionary named ``<language>_<LANGUAGE>`` (e.g. ``de_DE``),
    2. the first installed regional dictionary for the language,
    3. a downloadable key equal to ``language`` itself (e.g. ``en``),
    4. a downloadable key named ``<language>_<LANGUAGE>``,
    5. the first downloadable key for the language.

    Args:
        language: Language code to resolve, e.g. ``"en"`` or ``"de"``.

    Returns:
        The chosen dictionary key, or ``None`` when the language is empty or
        no installed/downloadable dictionary matches it. The chosen key is
        also printed to the console.
    """
    if not language:
        return None

    def belongs_to_language(key: str) -> bool:
        # Accept the code itself or a "<code>_<variant>" key. A bare prefix
        # test would let "km" (Khmer) select "kmr_Latn" (a different language).
        return key == language or key.startswith(f"{language}_")

    predicted_region = f"{language}_{language.upper()}"

    installed_region_keys = [
        key
        for key in dictools.list_installed()
        if "_" in key and belongs_to_language(key)
    ]
    downloadable_keys = [key for key in LANGUAGES if belongs_to_language(key)]

    if predicted_region in installed_region_keys:
        lang_region = predicted_region
    elif installed_region_keys:
        # No predicted region installed: take the first installed region.
        lang_region = installed_region_keys[0]
    elif language in downloadable_keys:
        # The bare code is itself a downloadable key.
        lang_region = language
    elif predicted_region in downloadable_keys:
        lang_region = predicted_region
    elif downloadable_keys:
        # Fall back to the first downloadable variant of the language.
        lang_region = downloadable_keys[0]
    else:
        return None

    print(
        f"{ULTRASINGER_HEAD} Hyphenate using language code: {blue_highlighted(lang_region)}"
    )
    return lang_region
def contains_punctuation(word: str) -> bool:
    """Return True when at least one character of ``word`` is ASCII punctuation."""
    punctuation_marks = set(string.punctuation)
    return not punctuation_marks.isdisjoint(word)
def clean_word(word: str):
    """Strip punctuation and spaces from ``word``, remembering what was removed.

    Returns a 3-tuple: the cleaned string, the indices (in ``word``) of the
    removed characters, and the removed characters themselves in order.
    """
    stripped_chars = set(string.punctuation) | {" "}
    kept_chars = []
    removed_indices = []
    removed_symbols = []

    for position, char in enumerate(word):
        if char in stripped_chars:
            removed_indices.append(position)
            removed_symbols.append(char)
        else:
            kept_chars.append(char)

    return "".join(kept_chars), removed_indices, removed_symbols
def insert_removed_symbols(separated_array, removed_indices, symbols):
    """Put previously removed symbols back into the syllables.

    Args:
        separated_array: Syllables of the cleaned word, in order.
        removed_indices: Positions the symbols had in the original word.
        symbols: The removed symbols, in the same order as ``removed_indices``.

    Returns:
        A new list of syllables with every symbol restored at its original
        position. Symbols located after the last character are appended to the
        last syllable; if there are no syllables at all, the leftover symbols
        form a single entry.
    """
    removed_positions = set(removed_indices)
    result = []
    symbol_index = 0
    position = 0  # index into the original (uncleaned) word

    for syllable in separated_array:
        parts = []
        for char in syllable:
            # `while`, not `if`: several symbols may sit directly next to each
            # other (e.g. "a--b") and all of them belong before this character.
            while position in removed_positions:
                parts.append(symbols[symbol_index])
                symbol_index += 1
                position += 1
            parts.append(char)
            position += 1
        result.append("".join(parts))

    # Whatever is left was located after the last character of the word.
    trailing = "".join(symbols[symbol_index:])
    if trailing:
        if result:
            result[-1] += trailing
        else:
            result.append(trailing)

    return result
def create_hyphenator(lang_region: str) -> Hyphenator:
    """Build a ``Hyphenator`` for the given dictionary key (e.g. ``"en_US"``)."""
    return Hyphenator(lang_region)
def hyphenation(word: str, hyphenator: Hyphenator) -> list[str] | None:
    """Split ``word`` into syllables, keeping its punctuation in place.

    Punctuation and spaces are stripped before hyphenating and re-inserted
    into the resulting syllables afterwards.

    Returns:
        The list of syllables, or ``None`` when the cleaned word is longer
        than 100 characters or does not split into more than one syllable.
    """
    bare_word, symbol_positions, symbols = clean_word(word)

    # The hyphenator raises on words longer than 100 characters, so skip them.
    if len(bare_word) > 100:
        return None

    syllables = hyphenator.syllables(bare_word)
    if len(syllables) <= 1:
        return None

    return insert_removed_symbols(list(syllables), symbol_positions, symbols)
|