Spaces:
Building
Building
Delete tts_preprocessor.py
Browse files- tts_preprocessor.py +0 -232
tts_preprocessor.py
DELETED
@@ -1,232 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
TTS Text Preprocessing Utilities with Multilingual Support
|
3 |
-
"""
|
4 |
-
|
5 |
-
import re
|
6 |
-
import json
|
7 |
-
from typing import Dict, Set, Optional
|
8 |
-
from num2words import num2words
|
9 |
-
from pathlib import Path
|
10 |
-
from locale_manager import LocaleManager
|
11 |
-
|
12 |
-
class TTSPreprocessor:
    """Rewrite text so a TTS engine reads numbers, money, dates, etc. naturally.

    Each transformation is opt-in: callers pass a set of the ``PREPROCESS_*``
    flag values to :meth:`preprocess`. Locale-specific words and formats are
    read from ``self.locale_data``, which ``__init__`` obtains from
    ``LocaleManager.get_locale(language)``.
    """

    # Preprocessing flags (values accepted in the ``flags`` set of preprocess()).
    PREPROCESS_NUMBERS = "numbers"
    PREPROCESS_CURRENCY = "currency"
    PREPROCESS_TIME = "time"
    PREPROCESS_DATE = "date"
    PREPROCESS_CODES = "codes"
    PREPROCESS_PERCENTAGE = "percentage"

    def __init__(self, language: str = "tr"):
        """Store the language code and load its locale data.

        Args:
            language: Language code handed to ``LocaleManager.get_locale`` and
                later to ``num2words`` (defaults to ``"tr"``).
        """
        self.language = language
        self.locale_data = LocaleManager.get_locale(language)

    def preprocess(self, text: str, flags: Set[str]) -> str:
        """Apply every transformation whose flag is present in ``flags``.

        Steps run in a fixed order: currency, time, date, codes, percentage
        and finally numbers.

        Args:
            text: Input text.
            flags: Set of ``PREPROCESS_*`` values selecting the steps to run.

        Returns:
            The transformed text; unchanged when no known flag is given.
        """
        ordered_steps = (
            (self.PREPROCESS_CURRENCY, self._process_currency),
            (self.PREPROCESS_TIME, self._process_time),
            (self.PREPROCESS_DATE, self._process_date),
            (self.PREPROCESS_CODES, self._process_codes),
            (self.PREPROCESS_PERCENTAGE, self._process_percentage),
            # Numbers should be processed last to avoid conflicts with the
            # more specific patterns above.
            (self.PREPROCESS_NUMBERS, self._process_numbers),
        )
        for flag, step in ordered_steps:
            if flag in flags:
                text = step(text)
        return text

    def _to_words(self, number: int) -> str:
        """Spell ``number`` in ``self.language``, falling back to English.

        The fallback is taken only when ``num2words`` raises
        ``NotImplementedError`` for the configured language.
        """
        try:
            return num2words(number, lang=self.language)
        except NotImplementedError:
            # Fallback to English if language not supported
            return num2words(number, lang='en')

    def _process_numbers(self, text: str) -> str:
        """Convert numbers to words based on locale.

        Integers in ``0..small_number_threshold`` (default 100) are left as
        digits. Larger integers are spelled out. Decimals are rendered as
        ``"<integer words> <decimal_word> <hundredths words>"`` where the
        fraction is truncated/padded to two digits (``1.5`` -> 50,
        ``1.999`` -> 99).

        Number format is chosen by language: ``"tr"`` uses ``1.234,56``,
        everything else uses ``1,234.56``.
        NOTE(review): the locale data also appears to carry separator
        settings; they are not consulted here - confirm whether they should be.

        Raises:
            KeyError: if ``locale_data["numbers"]["decimal_word"]`` is missing.
        """
        decimal_word = self.locale_data["numbers"]["decimal_word"]
        threshold = self.locale_data.get("small_number_threshold", 100)
        turkish_format = self.language == "tr"

        def replace_number(match):
            raw = match.group()

            # Normalize number format to plain "1234.56"
            if turkish_format:
                # Turkish: 1.234,56 -> 1234.56
                normalized = raw.replace('.', '').replace(',', '.')
            else:
                # English: 1,234.56 -> 1234.56
                normalized = raw.replace(',', '')

            try:
                # Work on the digit strings rather than on a float: float
                # subtraction truncates wrongly (1.15 -> 14 hundredths) and
                # loses precision for integers above 2**53.
                integer_text, _, fraction_text = normalized.partition('.')
                integer_part = int(integer_text)

                if not fraction_text.strip('0'):
                    # Whole number (no fraction, or a fraction of only zeros).
                    # Keep small numbers as is based on threshold
                    if 0 <= integer_part <= threshold:
                        return str(integer_part)
                    return self._to_words(integer_part)

                # Handle decimal: first two fractional digits, zero-padded.
                decimal_part = int((fraction_text + "0")[:2])
                int_words = self._to_words(integer_part)
                dec_words = self._to_words(decimal_part)
                return f"{int_words} {decimal_word} {dec_words}"
            except Exception:
                # Best effort: on any conversion failure emit the normalized
                # digits instead of aborting the whole text. Unlike a bare
                # ``except`` this lets KeyboardInterrupt/SystemExit propagate.
                return normalized

        # Match numbers with locale-specific format
        if turkish_format:
            pattern = r'\b\d{1,3}(?:\.\d{3})*(?:,\d+)?\b|\b\d+(?:,\d+)?\b'
        else:
            pattern = r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b|\b\d+(?:\.\d+)?\b'

        return re.sub(pattern, replace_number, text)

    def _process_codes(self, text: str) -> str:
        """Spell out codes such as PNRs or flight numbers character by character.

        A code is 2-5 uppercase ASCII letters immediately followed by 2-5
        digits (e.g. ``TK1234`` -> ``T K 1 2 3 4``). Language agnostic.
        """
        # Match uppercase letters followed by numbers
        pattern = r'\b[A-Z]{2,5}\d{2,5}\b'
        return re.sub(pattern, lambda match: ' '.join(match.group()), text)

    def _process_currency(self, text: str) -> str:
        """Replace currency symbols and codes with the locale's currency word.

        Reads ``symbol``, ``word``, ``code`` and ``position`` (default
        ``"before"``) from ``locale_data["currency"]``. Amounts attached to
        the symbol are rewritten as ``"<amount> <word>"``; any symbol left
        over is replaced by ``" <word> "``. ``"<amount> <code>"`` is matched
        case-insensitively. Returns ``text`` unchanged when the currency
        entry is not a dict or has no ``word``.
        """
        currency_data = self.locale_data.get("currency", {})

        if not isinstance(currency_data, dict):
            return text

        symbol = currency_data.get("symbol", "")
        word = currency_data.get("word", "")
        code = currency_data.get("code", "")
        position = currency_data.get("position", "before")

        if not word:
            return text

        # Digits with any number of ./, separated groups, so grouped amounts
        # such as 1,234.56 are kept in one piece.
        amount = r'\d+(?:[.,]\d+)*'

        def amount_then_word(match):
            # A callable avoids interpreting backslashes in ``word`` as
            # regex replacement escapes.
            return f"{match.group(1)} {word}"

        if symbol:
            escaped_symbol = re.escape(symbol)
            if position == "before":
                # $100 -> 100 dollar
                pattern = rf'{escaped_symbol}\s*({amount})'
            else:
                # 100₺ -> 100 lira
                pattern = rf'({amount})\s*{escaped_symbol}'
            # Amounts must be handled BEFORE the blanket symbol replacement;
            # otherwise no symbol is left for the pattern to match.
            text = re.sub(pattern, amount_then_word, text)

            # Replace standalone symbols
            text = text.replace(symbol, f" {word} ")

        # Process currency codes
        if code:
            pattern = rf'({amount})\s*{re.escape(code)}\b'
            text = re.sub(pattern, amount_then_word, text, flags=re.IGNORECASE)

        return text

    def _process_percentage(self, text: str) -> str:
        """Replace ``%`` next to a number with the locale's percentage word.

        Reads ``word`` (default ``"percent"``) and ``position`` (default
        ``"after"``) from ``locale_data["percentage"]``. With position
        ``"before"`` the sign precedes the number (``%50`` -> ``<word> 50``);
        otherwise it follows it (``50%`` -> ``50 <word>``). Returns ``text``
        unchanged when the percentage entry is not a dict.
        """
        percentage = self.locale_data.get("percentage", {})

        if not isinstance(percentage, dict):
            return text

        word = percentage.get("word", "percent")
        position = percentage.get("position", "after")

        if position == "before":
            # %50 -> yüzde 50
            pattern = r'%\s*(\d+(?:[.,]\d+)?)'

            def render(match):
                return f"{word} {match.group(1)}"
        else:
            # 50% -> 50 percent
            pattern = r'(\d+(?:[.,]\d+)?)\s*%'

            def render(match):
                return f"{match.group(1)} {word}"

        return re.sub(pattern, render, text)

    def _process_date(self, text: str) -> str:
        """Rewrite ISO dates (``YYYY-MM-DD``) using month names from the locale.

        The month name is looked up in ``locale_data["months"]`` by the
        two-digit month string as it appears in the text; an unknown key
        falls back to that string. The output order depends on
        ``locale_data["date_format"]``:

        * contains ``"DD.MM.YYYY"`` -> ``"<day> <month name> <year>"``
        * contains ``"MM/DD/YYYY"`` -> ``"<month name> <day>, <year>"``
        * anything else -> the date is left untouched.

        Returns ``text`` unchanged when the months entry is not a dict.
        """
        months = self.locale_data.get("months", {})
        date_format = self.locale_data.get("date_format", "YYYY-MM-DD")

        if not isinstance(months, dict):
            return text

        # Convert ISO format dates
        def replace_date(match):
            year, month, day = match.groups()
            month_name = months.get(month, month)

            # Format based on locale preference
            if "DD.MM.YYYY" in date_format:
                # Turkish format with month name
                return f"{int(day)} {month_name} {year}"
            if "MM/DD/YYYY" in date_format:
                # US format with month name
                return f"{month_name} {int(day)}, {year}"
            return match.group()

        pattern = r'(\d{4})-(\d{2})-(\d{2})'
        return re.sub(pattern, replace_date, text)

    def _process_time(self, text: str) -> str:
        """Rewrite ``H:MM`` / ``HH:MM`` times for speech.

        Reads ``format`` (default ``"word"``) and ``separator`` (default a
        single space) from ``locale_data["time"]``; the defaults are also
        used when that entry is not a dict. With format ``"word"`` hour and
        minute are spelled via ``num2words`` and joined by the separator
        (minute omitted when it is zero). With any other format, or when
        ``num2words`` does not support the language, the digits are kept and
        the colon becomes a space.
        """
        time_data = self.locale_data.get("time", {})

        if isinstance(time_data, dict):
            time_format = time_data.get("format", "word")
            separator = time_data.get("separator", " ")
        else:
            time_format = "word"
            separator = " "

        def replace_time(match):
            hour, minute = match.groups()
            digits_only = f"{hour} {minute}"

            if time_format != "word":
                return digits_only

            try:
                hour_word = num2words(int(hour), lang=self.language)
                if int(minute) == 0:
                    return hour_word
                minute_word = num2words(int(minute), lang=self.language)
                return f"{hour_word}{separator}{minute_word}"
            except NotImplementedError:
                return digits_only

        pattern = r'(\d{1,2}):(\d{2})'
        return re.sub(pattern, replace_time, text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|