ciyidogan committed on
Commit
1ad909f
·
verified ·
1 Parent(s): 269eb02

Delete tts_preprocessor.py

Browse files
Files changed (1) hide show
  1. tts_preprocessor.py +0 -232
tts_preprocessor.py DELETED
@@ -1,232 +0,0 @@
1
- """
2
- TTS Text Preprocessing Utilities with Multilingual Support
3
- """
4
-
5
- import re
6
- import json
7
- from typing import Dict, Set, Optional
8
- from num2words import num2words
9
- from pathlib import Path
10
- from locale_manager import LocaleManager
11
-
12
class TTSPreprocessor:
    """Rewrite text so a TTS engine can read it naturally in a given locale.

    Each preprocessing step is opt-in and selected through the
    ``PREPROCESS_*`` flag constants passed to :meth:`preprocess`.
    Locale-specific words and formats are read from ``self.locale_data``,
    which ``__init__`` obtains from ``LocaleManager.get_locale(language)``.
    """

    # Preprocessing flags
    PREPROCESS_NUMBERS = "numbers"
    PREPROCESS_CURRENCY = "currency"
    PREPROCESS_TIME = "time"
    PREPROCESS_DATE = "date"
    PREPROCESS_CODES = "codes"
    PREPROCESS_PERCENTAGE = "percentage"

    def __init__(self, language: str = "tr"):
        """Store the language code and load its locale data.

        Args:
            language: Language code handed to ``LocaleManager.get_locale``
                and to ``num2words`` as ``lang``.
        """
        self.language = language
        self.locale_data = LocaleManager.get_locale(language)

    def preprocess(self, text: str, flags: Set[str]) -> str:
        """Apply the preprocessing steps selected by *flags*.

        Steps always run in the same order: currency, time, date, codes,
        percentage and finally numbers.

        Args:
            text: Text to rewrite.
            flags: Collection of ``PREPROCESS_*`` values to enable.

        Returns:
            The rewritten text; *text* unchanged when it is empty or when
            no flags are given.
        """
        if not text or not flags:
            return text

        # Numbers are processed last so that the specialised steps see the
        # original digits before they are spelled out.
        steps = (
            (self.PREPROCESS_CURRENCY, self._process_currency),
            (self.PREPROCESS_TIME, self._process_time),
            (self.PREPROCESS_DATE, self._process_date),
            (self.PREPROCESS_CODES, self._process_codes),
            (self.PREPROCESS_PERCENTAGE, self._process_percentage),
            (self.PREPROCESS_NUMBERS, self._process_numbers),
        )
        for flag, step in steps:
            if flag in flags:
                text = step(text)
        return text

    def _to_words(self, number: int) -> str:
        """Spell *number* in ``self.language``, falling back to English."""
        try:
            return num2words(number, lang=self.language)
        except NotImplementedError:
            # Fallback to English if language not supported
            return num2words(number, lang='en')

    def _process_numbers(self, text: str) -> str:
        """Convert numbers to words using the locale's number format.

        The decimal and thousands separators come from
        ``locale_data["numbers"]``. Whole numbers between 0 and
        ``small_number_threshold`` (default 100) are kept as digits.
        Decimals are spoken as ``<integer> <decimal_word> <fraction>``,
        where the fraction is read from the digits as written (leading
        zeros are spoken individually), so no float rounding is involved.
        A number that cannot be converted is left exactly as it appeared.
        """
        number_format = self.locale_data["numbers"]
        decimal_sep = number_format["decimal_separator"]
        thousands_sep = number_format["thousands_separator"]
        decimal_word = number_format["decimal_word"]
        threshold = self.locale_data.get("small_number_threshold", 100)

        def replace_number(match):
            original = match.group()
            digits = original.replace(thousands_sep, '') if thousands_sep else original
            if decimal_sep:
                int_str, _, frac_str = digits.partition(decimal_sep)
            else:
                int_str, frac_str = digits, ''

            try:
                integer_part = int(int_str)

                # "12", "12.0" and "12.00" are all whole numbers.
                if not frac_str.strip('0'):
                    # Keep small numbers as is based on threshold
                    if 0 <= integer_part <= threshold:
                        return str(integer_part)
                    return self._to_words(integer_part)

                leading_zeros = len(frac_str) - len(frac_str.lstrip('0'))
                spoken = [self._to_words(integer_part), decimal_word]
                spoken.extend(self._to_words(0) for _ in range(leading_zeros))
                spoken.append(self._to_words(int(frac_str)))
                return ' '.join(spoken)
            except (ValueError, ArithmeticError, LookupError, TypeError):
                # Best effort: an unconvertible number stays as written.
                return original

        # Match numbers with locale-specific format
        fraction = rf'(?:{re.escape(decimal_sep)}\d+)?' if decimal_sep else ''
        plain = rf'\b\d+{fraction}\b'
        if thousands_sep:
            group = re.escape(thousands_sep)
            pattern = rf'\b\d{{1,3}}(?:{group}\d{{3}})*{fraction}\b|{plain}'
        else:
            pattern = plain

        return re.sub(pattern, replace_number, text)

    def _process_codes(self, text: str) -> str:
        """Spell out codes such as PNRs or flight numbers character by character.

        A code is 2-5 uppercase ASCII letters immediately followed by 2-5
        digits; its characters are separated by single spaces.
        """
        # Match uppercase letters followed by numbers
        pattern = r'\b[A-Z]{2,5}\d{2,5}\b'
        return re.sub(pattern, lambda match: ' '.join(match.group()), text)

    def _process_currency(self, text: str) -> str:
        """Replace currency symbols and codes with the locale's currency word.

        Reads ``symbol``, ``word``, ``code`` and ``position`` from
        ``locale_data["currency"]``. Amounts written with the symbol or
        with the code become ``<amount> <word>``; any symbol left over is
        replaced by the word surrounded by spaces.
        """
        currency_data = self.locale_data.get("currency", {})

        if not isinstance(currency_data, dict):
            return text

        symbol = currency_data.get("symbol", "")
        word = currency_data.get("word", "")
        code = currency_data.get("code", "")
        position = currency_data.get("position", "before")

        if not word:
            return text

        # Digits with any number of "." / "," separated groups, so that
        # "1,234.56" is taken as one amount.
        amount = r'\d+(?:[.,]\d+)*'

        def amount_then_word(match):
            # A function replacement keeps backslashes in `word` literal.
            return f"{match.group(1)} {word}"

        if symbol:
            escaped_symbol = re.escape(symbol)
            if position == "before":
                # $100 -> 100 dollar
                pattern = rf'{escaped_symbol}\s*({amount})'
            else:
                # 100₺ -> 100 lira
                pattern = rf'({amount})\s*{escaped_symbol}'
            # Amounts must be handled before the blanket replacement below,
            # otherwise no symbol would be left for this pattern to match.
            text = re.sub(pattern, amount_then_word, text)

            # Replace standalone symbols
            text = text.replace(symbol, f" {word} ")

        # Process currency codes
        if code:
            pattern = rf'({amount})\s*{re.escape(code)}\b'
            text = re.sub(pattern, amount_then_word, text, flags=re.IGNORECASE)

        return text

    def _process_percentage(self, text: str) -> str:
        """Replace the percent sign with the locale's word for it.

        ``locale_data["percentage"]`` supplies ``word`` (default
        ``"percent"``) and ``position`` (default ``"after"``). With
        ``"before"``, ``%50`` becomes ``<word> 50``; otherwise ``50%``
        becomes ``50 <word>``.
        """
        percentage = self.locale_data.get("percentage", {})

        if not isinstance(percentage, dict):
            return text

        word = percentage.get("word", "percent")
        position = percentage.get("position", "after")
        number = r'\d+(?:[.,]\d+)?'

        if position == "before":
            # %50 -> yüzde 50
            return re.sub(
                rf'%\s*({number})',
                lambda match: f"{word} {match.group(1)}",
                text,
            )

        # 50% -> 50 percent
        return re.sub(
            rf'({number})\s*%',
            lambda match: f"{match.group(1)} {word}",
            text,
        )

    def _process_date(self, text: str) -> str:
        """Rewrite ISO ``YYYY-MM-DD`` dates with the locale's month names.

        ``locale_data["months"]`` maps month numbers to names; the
        zero-padded string, the unpadded string and the integer are all
        tried as keys. ``locale_data["date_format"]`` selects the output
        order. Dates with an impossible month or day, and locales with
        any other date format, are left untouched.
        """
        months = self.locale_data.get("months", {})
        date_format = self.locale_data.get("date_format", "YYYY-MM-DD")

        if not isinstance(months, dict):
            return text

        def month_name_for(month):
            for key in (month, str(int(month)), int(month)):
                if key in months:
                    return months[key]
            return month

        # Convert ISO format dates
        def replace_date(match):
            year, month, day = match.groups()
            if not (1 <= int(month) <= 12 and 1 <= int(day) <= 31):
                return match.group()
            month_name = month_name_for(month)

            # Format based on locale preference
            if "DD.MM.YYYY" in date_format:
                # Turkish format with month name
                return f"{int(day)} {month_name} {year}"
            if "MM/DD/YYYY" in date_format:
                # US format with month name
                return f"{month_name} {int(day)}, {year}"
            return match.group()

        # Digit guards keep the pattern from firing inside longer digit runs.
        pattern = r'(?<!\d)(\d{4})-(\d{2})-(\d{2})(?!\d)'
        return re.sub(pattern, replace_date, text)

    def _process_time(self, text: str) -> str:
        """Rewrite ``H:MM`` / ``HH:MM`` clock times for speech.

        ``locale_data["time"]`` supplies ``format`` (default ``"word"``)
        and ``separator`` (default a single space). In ``"word"`` format
        the hour and minute are spelled with ``num2words`` and a zero
        minute is omitted; otherwise, or when the language is not
        supported, the time becomes ``"<hour> <minute>"`` in digits.
        Values outside 00:00-23:59 are left untouched.
        """
        time_data = self.locale_data.get("time", {})

        if isinstance(time_data, dict):
            time_format = time_data.get("format", "word")
            separator = time_data.get("separator", " ")
        else:
            time_format = "word"
            separator = " "

        def replace_time(match):
            hour, minute = match.groups()
            hour_int = int(hour)
            minute_int = int(minute)

            if hour_int > 23 or minute_int > 59:
                # Not a clock time (e.g. a ratio or score); leave it alone.
                return match.group()

            if time_format != "word":
                return f"{hour} {minute}"

            try:
                hour_word = num2words(hour_int, lang=self.language)
                if minute_int == 0:
                    return hour_word
                minute_word = num2words(minute_int, lang=self.language)
                return f"{hour_word}{separator}{minute_word}"
            except NotImplementedError:
                return f"{hour} {minute}"

        # Digit guards keep the pattern from firing inside longer digit runs.
        pattern = r'(?<!\d)(\d{1,2}):(\d{2})(?!\d)'
        return re.sub(pattern, replace_time, text)