Spaces:
Sleeping
Sleeping
File size: 8,164 Bytes
9fab08e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
"""
Copyright @ nguyenvanhieu.vn
This Python code does not preserve lower/upper case.
It will be updated when time permits.
"""
import re
# Accented Vietnamese letters (lowercase, then uppercase, then the six bare
# modified capitals) and, position by position, their unaccented ASCII
# replacements in unsignChars below.
uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ"
# Built from explicit run lengths so each run visibly matches the size of the
# corresponding vowel family in uniChars (17 a, 11 e, 1 d, 5 i, 17 o, 11 u,
# 5 y per case).  The previous literal had only three 'I's for the five
# uppercase I-vowels, which misaligned the I/O section of the mapping.
unsignChars = (
    "a" * 17 + "e" * 11 + "d" + "i" * 5 + "o" * 17 + "u" * 11 + "y" * 5
    + "A" * 17 + "E" * 11 + "D" + "I" * 5 + "O" * 17 + "U" * 11 + "Y" * 5
    + "AADOOU"
)
def loaddicchar():
    """Build the table that maps composite Vietnamese vowels to precomposed ones.

    A "composite" vowel here is a base vowel (plain, or already carrying its
    circumflex/breve/horn) followed by one combining tone mark -- the
    two-code-point form that legacy Vietnamese encodings produce (presumably
    what the "windown1525" in the converter's name refers to).  The value is
    the single precomposed code point for the same letter.

    The table is generated from explicit code points instead of two
    look-alike string literals, so it cannot be silently turned into an
    identity mapping by an editor or tool that normalizes the source file.

    :return: dict with 120 entries (12 vowels x 5 tones x 2 cases), each
        mapping a 2-character composite to its 1-character precomposed form.
    """
    # Function-scope import keeps this block self-contained.
    import unicodedata

    # a, a-circumflex, a-breve, e, e-circumflex, i, o, o-circumflex, o-horn,
    # u, u-horn, y
    bases = 'a\u00e2\u0103e\u00eaio\u00f4\u01a1u\u01b0y'
    # Combining grave, acute, hook above, tilde, dot below.
    tones = '\u0300\u0301\u0309\u0303\u0323'

    dic = {}
    for base in bases + bases.upper():
        for tone in tones:
            composite = base + tone
            # NFC folds base + tone mark into the single precomposed letter.
            dic[composite] = unicodedata.normalize('NFC', composite)
    return dic


dicchar = loaddicchar()

# Derived from the table so the pattern and the lookup can never disagree.
# All keys are exactly two code points, so alternation order is irrelevant.
_composite_vowel_re = re.compile('|'.join(map(re.escape, dicchar)))


def convertwindown1525toutf8(txt):
    """Replace every composite Vietnamese vowel in *txt* with its precomposed form.

    Only the base-vowel + tone-mark pairs listed in ``dicchar`` are rewritten;
    all other text, including already precomposed letters, is returned
    unchanged.

    :param txt: input string
    :return: a new string with composite vowels replaced
    """
    return _composite_vowel_re.sub(lambda match: dicchar[match.group()], txt)
"""
Start section: convert a sentence to the Telex keystrokes you would type with Unikey turned off
Example: thủy = thuyr, tượng = tuwowngj
"""
# Vowel table.  Each row is one vowel family:
#   columns 0-5: the vowel with no tone, grave, acute, hook, tilde, dot below
#   column 6:    the Telex spelling of the toneless vowel
bang_nguyen_am = [
    ['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
    ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
    ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
    ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
    ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
    ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
    ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
    ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
    ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
    ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
    ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
    ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y'],
]

# Telex tone keys, indexed by the tone column of bang_nguyen_am (0 = no tone).
bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']

# Reverse lookup: vowel character -> (row, tone column) in bang_nguyen_am.
# The last column of each row is the Telex spelling, not a vowel, so it is
# left out.
nguyen_am_to_ids = {
    vowel: (row_number, tone_column)
    for row_number, family in enumerate(bang_nguyen_am)
    for tone_column, vowel in enumerate(family[:-1])
}
def vn_word_to_telex_type(word):
    """Spell one word as the Telex key sequence that would produce it.

    Every vowel found in ``nguyen_am_to_ids`` is replaced by the Telex
    spelling in the last column of its ``bang_nguyen_am`` row.  The tone key
    for the last toned vowel in the word is appended once, at the end of the
    word.  Characters that are not in the vowel table are copied unchanged.

    :param word: a single word
    :return: the Telex spelling, e.g. ``thuyr`` for ``thủy``
    """
    tone_column = 0  # 0 means "no tone", which maps to the empty tone key
    pieces = []
    for letter in word:
        row, column = nguyen_am_to_ids.get(letter, (-1, -1))
        if row == -1:
            # Not a known vowel: keep the character as it is.
            pieces.append(letter)
        else:
            if column != 0:
                tone_column = column
            pieces.append(bang_nguyen_am[row][-1])
    pieces.append(bang_ky_tu_dau[tone_column])
    return ''.join(pieces)
def vn_sentence_to_telex_type(sentence):
    """Convert an accented Vietnamese sentence to its Telex spelling.

    The sentence is split on whitespace, each word is converted with
    ``vn_word_to_telex_type``, and the results are joined with single spaces.

    :param sentence: the input sentence
    :return: the sentence with every word in Telex form
    """
    return ' '.join(vn_word_to_telex_type(token) for token in sentence.split())
"""
Start section: normalize a sentence to the old-style tone-mark placement: use òa, úy instead of oà, uý
See: https://vi.wikipedia.org/wiki/Quy_t%E1%BA%AFc_%C4%91%E1%BA%B7t_d%E1%BA%A5u_thanh_trong_ch%E1%BB%AF_qu%E1%BB%91c_ng%E1%BB%AF
"""
def chuan_hoa_dau_tu_tieng_viet(word):
    """Move the tone mark of one word to its old-style position.

    Words rejected by ``is_valid_vietnam_word`` are returned unchanged.
    Otherwise every vowel is stripped of its tone, the last tone seen is
    remembered, and that tone is put back on one vowel chosen as follows:

    * fewer than two counted vowels: only words containing ``qu``/``gi`` are
      rewritten (see the inline comments); any other word is returned as is;
    * if a counted vowel is ``ê`` or ``ơ``, the first such vowel gets the tone;
    * exactly two counted vowels with the second one ending the word: the
      first vowel gets the tone;
    * every other case: the second counted vowel gets the tone.

    A ``u`` right after ``q`` or an ``i`` right after ``g`` is not counted as
    a vowel when it sits at index 1.

    :param word: a single word; only the lowercase vowels in
        ``nguyen_am_to_ids`` are recognized
    :return: the word with its tone mark repositioned
    """
    if not is_valid_vietnam_word(word):
        return word

    letters = list(word)
    tone = 0  # tone column in bang_nguyen_am; 0 = no tone found
    vowel_positions = []
    has_qu_or_gi = False

    for pos, letter in enumerate(letters):
        row, column = nguyen_am_to_ids.get(letter, (-1, -1))
        if row == -1:
            continue
        if row == 9 and pos != 0 and letters[pos - 1] == 'q':  # "qu"
            letters[pos] = 'u'
            has_qu_or_gi = True
        elif row == 5 and pos != 0 and letters[pos - 1] == 'g':  # "gi"
            letters[pos] = 'i'
            has_qu_or_gi = True
        if column != 0:
            # Remember the tone and reduce the vowel to its toneless form.
            tone = column
            letters[pos] = bang_nguyen_am[row][0]
        if not (has_qu_or_gi and pos == 1):
            vowel_positions.append(pos)

    if len(vowel_positions) < 2:
        if not has_qu_or_gi:
            return word
        if len(letters) == 2:
            # The word is just "qu"/"gi": the tone goes on the second letter.
            row, _ = nguyen_am_to_ids.get(letters[1])
            letters[1] = bang_nguyen_am[row][tone]
        else:
            row, _ = nguyen_am_to_ids.get(letters[2], (-1, -1))
            if row != -1:
                # A vowel follows "qu"/"gi": it carries the tone.
                letters[2] = bang_nguyen_am[row][tone]
            elif letters[1] == 'i':
                letters[1] = bang_nguyen_am[5][tone]
            else:
                letters[1] = bang_nguyen_am[9][tone]
        return ''.join(letters)

    # ê and ơ take the tone ahead of any position-based rule.
    for pos in vowel_positions:
        row, _ = nguyen_am_to_ids[letters[pos]]
        if row in (4, 8):
            letters[pos] = bang_nguyen_am[row][tone]
            return ''.join(letters)

    ends_with_vowel_pair = (
        len(vowel_positions) == 2 and vowel_positions[-1] == len(letters) - 1
    )
    target = vowel_positions[0] if ends_with_vowel_pair else vowel_positions[1]
    row, _ = nguyen_am_to_ids[letters[target]]
    letters[target] = bang_nguyen_am[row][tone]
    return ''.join(letters)
def is_valid_vietnam_word(word):
    """Tell whether all vowels of *word* sit next to each other.

    A word passes when the characters found in ``nguyen_am_to_ids`` form one
    unbroken run.  Words with no vowel or a single vowel also pass.

    :param word: a single word
    :return: ``False`` if two vowels are separated by a non-vowel character,
        ``True`` otherwise
    """
    vowel_positions = [
        pos for pos, letter in enumerate(word) if letter in nguyen_am_to_ids
    ]
    return all(
        later - earlier == 1
        for earlier, later in zip(vowel_positions, vowel_positions[1:])
    )
def chuan_hoa_dau_cau_tieng_viet(sentence):
    """Normalize a Vietnamese sentence to the old-style tone-mark placement.

    The sentence is lowercased, split on whitespace, each word is passed
    through ``chuan_hoa_dau_tu_tieng_viet``, and the words are joined back
    with single spaces.

    :param sentence: the input sentence
    :return: the lowercased sentence with tone marks repositioned
    """
    lowered = sentence.lower()
    return ' '.join(chuan_hoa_dau_tu_tieng_viet(token) for token in lowered.split())