Respair committed on
Commit
c8d3855
·
verified ·
1 Parent(s): 962a288

Update Utils/phonemize/cotlet_utils.py

Browse files
Files changed (1) hide show
  1. Utils/phonemize/cotlet_utils.py +75 -9
Utils/phonemize/cotlet_utils.py CHANGED
@@ -27,6 +27,7 @@ formal_to_informal3 = {
27
  "$":"どる",
28
  "#":"はっしゅたぐ",
29
  "何が":"なにが",
 
30
 
31
  "何も":"なにも",
32
  "何か":"なにか",
@@ -137,8 +138,9 @@ mapper = dict([
137
 
138
  ("その節","そのせつ"),
139
 
140
- ("何し","なにし"),
141
  ("何する","なにする"),
 
142
 
143
  ("心さん","しんさん"),
144
  ("心ちゃん","しんちゃん"),
@@ -500,6 +502,15 @@ def hira2ipa(text, roma_mapper=roma_mapper):
500
 
501
  import re
502
 
 
 
 
 
 
 
 
 
 
503
 
504
  k_mapper = dict([
505
  ("ゔぁ","ba"),
@@ -1011,7 +1022,16 @@ spaces = dict([
1011
  (" gaːɽɯ "," ga aɽɯ "),
1012
  (" waːɽɯ "," wa aɽɯ "),
1013
  (" gaːrɯ "," ga arɯ "),
 
 
 
 
 
 
 
 
1014
  (" waːrɯ "," wa arɯ "),
 
1015
 
1016
  (" gaːɽi"," ga aɽi"),
1017
  (" waːɽi"," wa aɽi"),
@@ -1028,22 +1048,58 @@ spaces = dict([
1028
  (" gaːʔta"," ga aʔta"),
1029
  (" waːʔta"," wa aʔta"),
1030
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1031
  ("ɕiːk ","ɕi ik"),
1032
  ("ɕijoː neɴ","ɕoɯneɴ"),
1033
  ("aːna","a ana"),
 
 
 
 
 
 
 
 
 
 
 
1034
 
1035
 
1036
  ])
1037
 
1038
 
1039
- def random_space_fix(text):
1040
- orig = text
1041
-
1042
- for k, v in spaces.items():
1043
- text = text.replace(k, v)
1044
-
1045
- return fix_wagas(text)
1046
-
1047
  def fix_wagas(text):
1048
 
1049
  pattern = r'\b(gaːɽɯ|waːɽɯ|gaːrɯ|waːrɯ|gaːɽi|waːɽi|gaːri|waːri)\b(?!\s*\w)'
@@ -1055,3 +1111,13 @@ def fix_wagas(text):
1055
  return word
1056
 
1057
  return re.sub(pattern, replace_match, text)
 
 
 
 
 
 
 
 
 
 
 
27
  "$":"どる",
28
  "#":"はっしゅたぐ",
29
  "何が":"なにが",
30
+ "何に":"なにに",
31
 
32
  "何も":"なにも",
33
  "何か":"なにか",
 
138
 
139
  ("その節","そのせつ"),
140
 
141
+ ("何しに","なにしに"),
142
  ("何する","なにする"),
143
+ # ("何しよう","なにしよう"),
144
 
145
  ("心さん","しんさん"),
146
  ("心ちゃん","しんちゃん"),
 
502
 
503
  import re
504
 
505
+ def replace_ending(text):
506
+ # Pattern explanation:
507
+ # (?<!naɴ)(?<!nan) - negative lookbehind to ensure "naɴ" or "nan" doesn't precede
508
+ # niɴ - the target ending to replace
509
+ # $ - end of string
510
+ pattern = r'(?<!naɴ)(?<!nan)niɴ$'
511
+ return re.sub(pattern, 'dʑiɴ', text)
512
+
513
+
514
 
515
  k_mapper = dict([
516
  ("ゔぁ","ba"),
 
1022
  (" gaːɽɯ "," ga aɽɯ "),
1023
  (" waːɽɯ "," wa aɽɯ "),
1024
  (" gaːrɯ "," ga arɯ "),
1025
+ ("gaːrɯɕi","ga arɯɕi"),
1026
+ ("gaːrɯnaɽa","ga arɯ naɽa"),
1027
+ ("gaːrɯɴ","ga arɯɴ"),
1028
+ ("gaːreba","ga areba"),
1029
+ ("gaːrɯkedo","ga arɯ kedo"),
1030
+ ("gaːrinagaɽa","ga ari nagaɽa"),
1031
+ ("gaːrɯ wa","ga arɯ wa"),
1032
+ ("gaːrɯwa","ga arɯ wa"),
1033
  (" waːrɯ "," wa arɯ "),
1034
+ ("waːmaɽi ","wa amaɽi "),
1035
 
1036
  (" gaːɽi"," ga aɽi"),
1037
  (" waːɽi"," wa aɽi"),
 
1048
  (" gaːʔta"," ga aʔta"),
1049
  (" waːʔta"," wa aʔta"),
1050
 
1051
+ ("gaːʔmaɽi","ga aʔmaɽi"),
1052
+ ("waːʔmaɽi","wa aʔmaɽi"),
1053
+
1054
+ ("gaːsakaʔta","ga asakaʔta"),
1055
+ ("waːsakaʔta","wa asakaʔta"),
1056
+
1057
+ (" waːme "," wa ame "),
1058
+ (" gaːme "," ga ame "),
1059
+
1060
+ ("aɽaːɽa","aɽa aɽa"),
1061
+ ("waːi rʲoɯ","wa aiɽa"),
1062
+ ("gaːi rʲoɯ","ga aiɽa"),
1063
+ ("gaːi ","ga ai "),
1064
+ ("ai rʲoɯ", "aiɽa"),
1065
+ ("ai joʔte","aiɽaʔte"),
1066
+ ("arɯihaːi rʲoɯ","arɯi wa aiɽa"),
1067
+ ("rʲoɯ ɕiki","joɕiki"),
1068
+ ("waːkarɯi","wa akarɯi"),
1069
+ ("gaːkarɯi","ga akarɯi"),
1070
+
1071
+ ("waːi o","wa ai o"),
1072
+ ("madaːtama","mada atama"),
1073
+ ("hahasaɴ","kaːsaɴ"),
1074
+ ("ohahasaɴ","okaːsaɴ"),
1075
+ ("hahatɕaɴ","kaːtɕaɴ"),
1076
+ ("ohahatɕaɴ","okaːtɕaɴ"),
1077
+
1078
+ ("nani mo ka mo","nanimokamo"),
1079
+
1080
+ (" nihon "," niʔpon "),
1081
+ ("niʔponniɴ","nihondʑiɴ"),
1082
+ ("arɽaːfɯ","arɽaː"),
1083
+
1084
  ("ɕiːk ","ɕi ik"),
1085
  ("ɕijoː neɴ","ɕoɯneɴ"),
1086
  ("aːna","a ana"),
1087
+ ("naɴ ɕijoɯ","nani ɕijoɯ"),
1088
+ ("hana ni sɯrɯ","wa nani sɯrɯ"),
1089
+ ("naɴ ɕite","nani ɕite"),
1090
+
1091
+ ("hatsɯ taimeɴ","ɕotaimeɴ"),
1092
+ ("soŋkei go","soŋkeigo"),
1093
+ (" go ","go "),
1094
+ (" keiː "," kei i "),
1095
+ (" kaŋgae hoɯ ","kaŋgaekata"),
1096
+ ("tomete kɯdasai","jamete kɯdasai"),
1097
+ (" bɯkaikedo"," fɯkaikedo"),
1098
 
1099
 
1100
  ])
1101
 
1102
 
 
 
 
 
 
 
 
 
1103
  def fix_wagas(text):
1104
 
1105
  pattern = r'\b(gaːɽɯ|waːɽɯ|gaːrɯ|waːrɯ|gaːɽi|waːɽi|gaːri|waːri)\b(?!\s*\w)'
 
1111
  return word
1112
 
1113
  return re.sub(pattern, replace_match, text)
1114
+
1115
+ def random_space_fix(text):
1116
+ orig = text
1117
+
1118
+ for k, v in spaces.items():
1119
+ text = text.replace(k, v)
1120
+
1121
+ return fix_wagas(text)
1122
+
1123
+