Update Utils/phonemize/cotlet_utils.py
Browse files
Utils/phonemize/cotlet_utils.py
CHANGED
@@ -27,6 +27,7 @@ formal_to_informal3 = {
|
|
27 |
"$":"どる",
|
28 |
"#":"はっしゅたぐ",
|
29 |
"何が":"なにが",
|
|
|
30 |
|
31 |
"何も":"なにも",
|
32 |
"何か":"なにか",
|
@@ -137,8 +138,9 @@ mapper = dict([
|
|
137 |
|
138 |
("その節","そのせつ"),
|
139 |
|
140 |
-
("
|
141 |
("何する","なにする"),
|
|
|
142 |
|
143 |
("心さん","しんさん"),
|
144 |
("心ちゃん","しんちゃん"),
|
@@ -500,6 +502,15 @@ def hira2ipa(text, roma_mapper=roma_mapper):
|
|
500 |
|
501 |
import re
|
502 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
503 |
|
504 |
k_mapper = dict([
|
505 |
("ゔぁ","ba"),
|
@@ -1011,7 +1022,16 @@ spaces = dict([
|
|
1011 |
(" gaːɽɯ "," ga aɽɯ "),
|
1012 |
(" waːɽɯ "," wa aɽɯ "),
|
1013 |
(" gaːrɯ "," ga arɯ "),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1014 |
(" waːrɯ "," wa arɯ "),
|
|
|
1015 |
|
1016 |
(" gaːɽi"," ga aɽi"),
|
1017 |
(" waːɽi"," wa aɽi"),
|
@@ -1028,22 +1048,58 @@ spaces = dict([
|
|
1028 |
(" gaːʔta"," ga aʔta"),
|
1029 |
(" waːʔta"," wa aʔta"),
|
1030 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1031 |
("ɕiːk ","ɕi ik"),
|
1032 |
("ɕijoː neɴ","ɕoɯneɴ"),
|
1033 |
("aːna","a ana"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1034 |
|
1035 |
|
1036 |
])
|
1037 |
|
1038 |
|
1039 |
-
def random_space_fix(text):
|
1040 |
-
orig = text
|
1041 |
-
|
1042 |
-
for k, v in spaces.items():
|
1043 |
-
text = text.replace(k, v)
|
1044 |
-
|
1045 |
-
return fix_wagas(text)
|
1046 |
-
|
1047 |
def fix_wagas(text):
|
1048 |
|
1049 |
pattern = r'\b(gaːɽɯ|waːɽɯ|gaːrɯ|waːrɯ|gaːɽi|waːɽi|gaːri|waːri)\b(?!\s*\w)'
|
@@ -1055,3 +1111,13 @@ def fix_wagas(text):
|
|
1055 |
return word
|
1056 |
|
1057 |
return re.sub(pattern, replace_match, text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
"$":"どる",
|
28 |
"#":"はっしゅたぐ",
|
29 |
"何が":"なにが",
|
30 |
+
"何に":"なにに",
|
31 |
|
32 |
"何も":"なにも",
|
33 |
"何か":"なにか",
|
|
|
138 |
|
139 |
("その節","そのせつ"),
|
140 |
|
141 |
+
("何しに","なにしに"),
|
142 |
("何する","なにする"),
|
143 |
+
# ("何しよう","なにしよう"),
|
144 |
|
145 |
("心さん","しんさん"),
|
146 |
("心ちゃん","しんちゃん"),
|
|
|
502 |
|
503 |
import re
|
504 |
|
505 |
+
def replace_ending(text):
    """Rewrite a trailing ``niɴ`` as ``dʑiɴ``.

    The substitution applies only when ``niɴ`` sits at the end of *text*
    (``$`` also matches just before a single final newline) and is not
    immediately preceded by ``naɴ`` or ``nan``. Any other input is
    returned unchanged.
    """
    # Two negative lookbehinds reject "naɴniɴ" and "nanniɴ"; the `$`
    # anchor limits the match to the tail of the text.
    trailing_nin = re.compile(r'(?<!naɴ)(?<!nan)niɴ$')
    return trailing_nin.sub('dʑiɴ', text)
|
512 |
+
|
513 |
+
|
514 |
|
515 |
k_mapper = dict([
|
516 |
("ゔぁ","ba"),
|
|
|
1022 |
(" gaːɽɯ "," ga aɽɯ "),
|
1023 |
(" waːɽɯ "," wa aɽɯ "),
|
1024 |
(" gaːrɯ "," ga arɯ "),
|
1025 |
+
("gaːrɯɕi","ga arɯɕi"),
|
1026 |
+
("gaːrɯnaɽa","ga arɯ naɽa"),
|
1027 |
+
("gaːrɯɴ","ga arɯɴ"),
|
1028 |
+
("gaːreba","ga areba"),
|
1029 |
+
("gaːrɯkedo","ga arɯ kedo"),
|
1030 |
+
("gaːrinagaɽa","ga ari nagaɽa"),
|
1031 |
+
("gaːrɯ wa","ga arɯ wa"),
|
1032 |
+
("gaːrɯwa","ga arɯ wa"),
|
1033 |
(" waːrɯ "," wa arɯ "),
|
1034 |
+
("waːmaɽi ","wa amaɽi "),
|
1035 |
|
1036 |
(" gaːɽi"," ga aɽi"),
|
1037 |
(" waːɽi"," wa aɽi"),
|
|
|
1048 |
(" gaːʔta"," ga aʔta"),
|
1049 |
(" waːʔta"," wa aʔta"),
|
1050 |
|
1051 |
+
("gaːʔmaɽi","ga aʔmaɽi"),
|
1052 |
+
("waːʔmaɽi","wa aʔmaɽi"),
|
1053 |
+
|
1054 |
+
("gaːsakaʔta","ga asakaʔta"),
|
1055 |
+
("waːsakaʔta","wa asakaʔta"),
|
1056 |
+
|
1057 |
+
(" waːme "," wa ame "),
|
1058 |
+
(" gaːme "," ga ame "),
|
1059 |
+
|
1060 |
+
("aɽaːɽa","aɽa aɽa"),
|
1061 |
+
("waːi rʲoɯ","wa aiɽa"),
|
1062 |
+
("gaːi rʲoɯ","ga aiɽa"),
|
1063 |
+
("gaːi ","ga ai "),
|
1064 |
+
("ai rʲoɯ", "aiɽa"),
|
1065 |
+
("ai joʔte","aiɽaʔte"),
|
1066 |
+
("arɯihaːi rʲoɯ","arɯi wa aiɽa"),
|
1067 |
+
("rʲoɯ ɕiki","joɕiki"),
|
1068 |
+
("waːkarɯi","wa akarɯi"),
|
1069 |
+
("gaːkarɯi","ga akarɯi"),
|
1070 |
+
|
1071 |
+
("waːi o","wa ai o"),
|
1072 |
+
("madaːtama","mada atama"),
|
1073 |
+
("hahasaɴ","kaːsaɴ"),
|
1074 |
+
("ohahasaɴ","okaːsaɴ"),
|
1075 |
+
("hahatɕaɴ","kaːtɕaɴ"),
|
1076 |
+
("ohahatɕaɴ","okaːtɕaɴ"),
|
1077 |
+
|
1078 |
+
("nani mo ka mo","nanimokamo"),
|
1079 |
+
|
1080 |
+
(" nihon "," niʔpon "),
|
1081 |
+
("niʔponniɴ","nihondʑiɴ"),
|
1082 |
+
("arɽaːfɯ","arɽaː"),
|
1083 |
+
|
1084 |
("ɕiːk ","ɕi ik"),
|
1085 |
("ɕijoː neɴ","ɕoɯneɴ"),
|
1086 |
("aːna","a ana"),
|
1087 |
+
("naɴ ɕijoɯ","nani ɕijoɯ"),
|
1088 |
+
("hana ni sɯrɯ","wa nani sɯrɯ"),
|
1089 |
+
("naɴ ɕite","nani ɕite"),
|
1090 |
+
|
1091 |
+
("hatsɯ taimeɴ","ɕotaimeɴ"),
|
1092 |
+
("soŋkei go","soŋkeigo"),
|
1093 |
+
(" go ","go "),
|
1094 |
+
(" keiː "," kei i "),
|
1095 |
+
(" kaŋgae hoɯ ","kaŋgaekata"),
|
1096 |
+
("tomete kɯdasai","jamete kɯdasai"),
|
1097 |
+
(" bɯkaikedo"," fɯkaikedo"),
|
1098 |
|
1099 |
|
1100 |
])
|
1101 |
|
1102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1103 |
def fix_wagas(text):
|
1104 |
|
1105 |
pattern = r'\b(gaːɽɯ|waːɽɯ|gaːrɯ|waːrɯ|gaːɽi|waːɽi|gaːri|waːri)\b(?!\s*\w)'
|
|
|
1111 |
return word
|
1112 |
|
1113 |
return re.sub(pattern, replace_match, text)
|
1114 |
+
|
1115 |
+
def random_space_fix(text):
    """Apply the ``spaces`` replacement table to *text*, then ``fix_wagas``.

    Each key of the module-level ``spaces`` mapping is replaced by its
    value using plain substring replacement, in the mapping's iteration
    order, so a later entry operates on the output of earlier ones. The
    result is then passed through ``fix_wagas`` and returned.
    """
    # The previous version stored an unused copy of the input (`orig`);
    # it was never read, so it has been removed.
    for wrong, right in spaces.items():
        text = text.replace(wrong, right)

    return fix_wagas(text)
|
1122 |
+
|
1123 |
+
|