ciyidogan committed on
Commit
c461a97
·
verified ·
1 Parent(s): 3486391

Update tts_preprocessor.py

Browse files
Files changed (1) hide show
  1. tts_preprocessor.py +87 -52
tts_preprocessor.py CHANGED
@@ -134,52 +134,77 @@ class TTSPreprocessor:
134
 
135
  return re.sub(pattern, replace_number, text)
136
 
 
 
 
 
 
 
 
 
 
 
137
  def _process_currency(self, text: str) -> str:
138
  """Process currency symbols and amounts based on locale"""
139
  currency_data = self.locale_data.get("currency", {})
140
 
141
- # Replace currency symbols
142
- for symbol, word in currency_data.get("symbols", {}).items():
 
 
 
 
 
 
 
 
143
  text = text.replace(symbol, f" {word} ")
144
 
 
 
 
 
 
 
 
 
 
 
145
  # Process currency codes
146
- for code, word in currency_data.get("codes", {}).items():
147
- pattern = rf'(\d+)\s*{code}\b'
148
  text = re.sub(pattern, rf'\1 {word}', text, flags=re.IGNORECASE)
149
 
150
  return text
151
 
152
- def _process_time(self, text: str) -> str:
153
- """Process time formats based on locale"""
154
- time_format = self.locale_data.get("time", {}).get("format", "word")
155
 
156
- def replace_time(match):
157
- hour, minute = match.groups()
158
- hour_int = int(hour)
159
- minute_int = int(minute)
160
 
161
- if time_format == "word":
162
- try:
163
- hour_word = num2words(hour_int, lang=self.language)
164
- minute_word = num2words(minute_int, lang=self.language) if minute_int > 0 else ""
165
-
166
- if minute_int == 0:
167
- return hour_word
168
- else:
169
- separator = self.locale_data.get("time", {}).get("separator", " ")
170
- return f"{hour_word}{separator}{minute_word}"
171
- except NotImplementedError:
172
- return f"{hour} {minute}"
173
- else:
174
- return f"{hour} {minute}"
175
-
176
- pattern = r'(\d{1,2}):(\d{2})'
177
- return re.sub(pattern, replace_time, text)
178
 
179
  def _process_date(self, text: str) -> str:
180
  """Process date formats based on locale"""
181
  months = self.locale_data.get("months", {})
182
- date_format = self.locale_data.get("date", {}).get("format", "YYYY-MM-DD")
 
 
 
183
 
184
  # Convert ISO format dates
185
  def replace_date(match):
@@ -187,9 +212,11 @@ class TTSPreprocessor:
187
  month_name = months.get(month, month)
188
 
189
  # Format based on locale preference
190
- if "DD MMMM YYYY" in date_format:
 
191
  return f"{int(day)} {month_name} {year}"
192
- elif "MMMM DD, YYYY" in date_format:
 
193
  return f"{month_name} {int(day)}, {year}"
194
  else:
195
  return match.group()
@@ -197,27 +224,35 @@ class TTSPreprocessor:
197
  pattern = r'(\d{4})-(\d{2})-(\d{2})'
198
  return re.sub(pattern, replace_date, text)
199
 
200
- def _process_codes(self, text: str) -> str:
201
- """Process codes like PNR, flight numbers - language agnostic"""
202
- def spell_code(match):
203
- code = match.group()
204
- return ' '.join(code)
205
-
206
- # Match uppercase letters followed by numbers
207
- pattern = r'\b[A-Z]{2,5}\d{2,5}\b'
208
- return re.sub(pattern, spell_code, text)
209
-
210
- def _process_percentage(self, text: str) -> str:
211
- """Process percentage symbols based on locale"""
212
- percentage = self.locale_data.get("percentage", {})
213
- prefix = percentage.get("prefix", "")
214
- suffix = percentage.get("suffix", "")
215
 
216
- if prefix:
217
- pattern = r'%\s*(\d+)'
218
- replacement = rf'{prefix} \1'
219
  else:
220
- pattern = r'(\d+)\s*%'
221
- replacement = rf'\1 {suffix}'
 
 
 
 
 
222
 
223
- return re.sub(pattern, replacement, text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
  return re.sub(pattern, replace_number, text)
136
 
137
+ def _process_codes(self, text: str) -> str:
138
+ """Process codes like PNR, flight numbers - language agnostic"""
139
+ def spell_code(match):
140
+ code = match.group()
141
+ return ' '.join(code)
142
+
143
+ # Match uppercase letters followed by numbers
144
+ pattern = r'\b[A-Z]{2,5}\d{2,5}\b'
145
+ return re.sub(pattern, spell_code, text)
146
+
147
  def _process_currency(self, text: str) -> str:
148
  """Process currency symbols and amounts based on locale"""
149
  currency_data = self.locale_data.get("currency", {})
150
 
151
+ if not isinstance(currency_data, dict):
152
+ return text
153
+
154
+ symbol = currency_data.get("symbol", "")
155
+ word = currency_data.get("word", "")
156
+ code = currency_data.get("code", "")
157
+ position = currency_data.get("position", "before")
158
+
159
+ if symbol and word:
160
+ # Replace standalone symbols
161
  text = text.replace(symbol, f" {word} ")
162
 
163
+ # Replace symbol with amount
164
+ if position == "before":
165
+ # $100 -> 100 dollar
166
+ pattern = rf'{re.escape(symbol)}\s*(\d+(?:[.,]\d+)?)'
167
+ text = re.sub(pattern, rf'\1 {word}', text)
168
+ else:
169
+ # 100₺ -> 100 lira
170
+ pattern = rf'(\d+(?:[.,]\d+)?)\s*{re.escape(symbol)}'
171
+ text = re.sub(pattern, rf'\1 {word}', text)
172
+
173
  # Process currency codes
174
+ if code and word:
175
+ pattern = rf'(\d+(?:[.,]\d+)?)\s*{code}\b'
176
  text = re.sub(pattern, rf'\1 {word}', text, flags=re.IGNORECASE)
177
 
178
  return text
179
 
180
+ def _process_percentage(self, text: str) -> str:
181
+ """Process percentage symbols based on locale"""
182
+ percentage = self.locale_data.get("percentage", {})
183
 
184
+ if not isinstance(percentage, dict):
185
+ return text
 
 
186
 
187
+ word = percentage.get("word", "percent")
188
+ position = percentage.get("position", "after")
189
+
190
+ if position == "before":
191
+ # %50 -> yüzde 50
192
+ pattern = r'%\s*(\d+(?:[.,]\d+)?)'
193
+ replacement = rf'{word} \1'
194
+ else:
195
+ # 50% -> 50 percent
196
+ pattern = r'(\d+(?:[.,]\d+)?)\s*%'
197
+ replacement = rf'\1 {word}'
198
+
199
+ return re.sub(pattern, replacement, text)
 
 
 
 
200
 
201
  def _process_date(self, text: str) -> str:
202
  """Process date formats based on locale"""
203
  months = self.locale_data.get("months", {})
204
+ date_format = self.locale_data.get("date_format", "YYYY-MM-DD")
205
+
206
+ if not isinstance(months, dict):
207
+ return text
208
 
209
  # Convert ISO format dates
210
  def replace_date(match):
 
212
  month_name = months.get(month, month)
213
 
214
  # Format based on locale preference
215
+ if "DD.MM.YYYY" in date_format:
216
+ # Turkish format with month name
217
  return f"{int(day)} {month_name} {year}"
218
+ elif "MM/DD/YYYY" in date_format:
219
+ # US format with month name
220
  return f"{month_name} {int(day)}, {year}"
221
  else:
222
  return match.group()
 
224
  pattern = r'(\d{4})-(\d{2})-(\d{2})'
225
  return re.sub(pattern, replace_date, text)
226
 
227
+ def _process_time(self, text: str) -> str:
228
+ """Process time formats based on locale"""
229
+ time_data = self.locale_data.get("time", {})
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
+ if not isinstance(time_data, dict):
232
+ time_format = "word"
233
+ separator = " "
234
  else:
235
+ time_format = time_data.get("format", "word")
236
+ separator = time_data.get("separator", " ")
237
+
238
+ def replace_time(match):
239
+ hour, minute = match.groups()
240
+ hour_int = int(hour)
241
+ minute_int = int(minute)
242
 
243
+ if time_format == "word":
244
+ try:
245
+ hour_word = num2words(hour_int, lang=self.language)
246
+ minute_word = num2words(minute_int, lang=self.language) if minute_int > 0 else ""
247
+
248
+ if minute_int == 0:
249
+ return hour_word
250
+ else:
251
+ return f"{hour_word}{separator}{minute_word}"
252
+ except NotImplementedError:
253
+ return f"{hour} {minute}"
254
+ else:
255
+ return f"{hour} {minute}"
256
+
257
+ pattern = r'(\d{1,2}):(\d{2})'
258
+ return re.sub(pattern, replace_time, text)