Spaces:
Building
Building
Create tts_preprocessor.py
Browse files- tts_preprocessor.py +87 -0
tts_preprocessor.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
TTS Text Preprocessing Utilities with Multilingual Support
|
3 |
+
"""
|
4 |
+
|
5 |
+
import re
|
6 |
+
import json
|
7 |
+
from typing import Dict, Set, Optional
|
8 |
+
from num2words import num2words
|
9 |
+
from pathlib import Path
|
10 |
+
|
11 |
+
class TTSPreprocessor:
|
12 |
+
"""Text preprocessor for TTS providers with multilingual support"""
|
13 |
+
|
14 |
+
# Preprocessing flags
|
15 |
+
PREPROCESS_NUMBERS = "numbers"
|
16 |
+
PREPROCESS_CURRENCY = "currency"
|
17 |
+
PREPROCESS_TIME = "time"
|
18 |
+
PREPROCESS_DATE = "date"
|
19 |
+
PREPROCESS_CODES = "codes"
|
20 |
+
PREPROCESS_PERCENTAGE = "percentage"
|
21 |
+
|
22 |
+
def __init__(self, language: str = "tr"):
    """Create a preprocessor bound to a single language.

    Args:
        language: Locale identifier handed to ``_load_locale``, which
            looks for ``locales/<language>.json``. Defaults to ``"tr"``.
    """
    # Remember the requested code, then resolve its locale table.
    self.language = language
    locale_table = self._load_locale(language)
    self.locale_data = locale_table
|
25 |
+
|
26 |
+
def _load_locale(self, language: str) -> Dict:
    """Load the locale table for ``language`` from ``locales/<language>.json``.

    The ``locales`` directory is resolved relative to this module. If no
    file exists for ``language``, ``locales/en.json`` is tried instead.
    If the chosen file cannot be read, is not valid JSON, or does not
    contain a JSON object at the top level, a minimal default structure
    is returned so callers always receive a dict.

    Args:
        language: Locale identifier used as the JSON file stem.

    Returns:
        The parsed locale dict, or the built-in minimal defaults (with
        ``language_code`` set to ``language``) when loading fails.
    """
    locales_dir = Path(__file__).parent / "locales"
    locale_path = locales_dir / f"{language}.json"

    # Fallback to English if locale not found
    if not locale_path.exists():
        print(f"⚠️ Locale file not found for {language}, falling back to English")
        locale_path = locales_dir / "en.json"

    try:
        with open(locale_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # A list/str/number at the top level would break later lookups
        # such as locale_data["numbers"]; treat it like a corrupt file.
        if not isinstance(data, dict):
            raise ValueError(
                f"expected a JSON object, got {type(data).__name__}"
            )
        return data
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: malformed JSON
        # (json.JSONDecodeError), bad encoding (UnicodeDecodeError), or
        # the non-object check above.
        print(f"❌ Error loading locale {language}: {e}")
        # Return minimal default structure
        return {
            "language_code": language,
            "currency": {"symbols": {}, "codes": {}},
            "months": {},
            "numbers": {
                "decimal_separator": ".",
                "thousands_separator": ",",
                "decimal_word": "point",
            },
            "small_number_threshold": 100,
        }
|
52 |
+
|
53 |
+
def preprocess(self, text: str, flags: Optional[Set[str]]) -> str:
    """Run the preprocessing steps selected by ``flags`` over ``text``.

    Steps run in a fixed order regardless of how ``flags`` is ordered:
    currency, time, date, codes, percentage, and finally numbers.

    Args:
        text: Input text to normalise.
        flags: Collection of ``PREPROCESS_*`` values naming the steps to
            apply. ``None`` or an empty collection means "no steps".

    Returns:
        The transformed text. ``text`` is returned unchanged when it is
        empty or when no flags are given.
    """
    # Nothing to do: avoids a TypeError on `x in None` and skips
    # pointless work on empty input.
    if not text or not flags:
        return text

    if self.PREPROCESS_CURRENCY in flags:
        text = self._process_currency(text)

    if self.PREPROCESS_TIME in flags:
        text = self._process_time(text)

    if self.PREPROCESS_DATE in flags:
        text = self._process_date(text)

    if self.PREPROCESS_CODES in flags:
        text = self._process_codes(text)

    if self.PREPROCESS_PERCENTAGE in flags:
        text = self._process_percentage(text)

    # Numbers should be processed last to avoid conflicts
    if self.PREPROCESS_NUMBERS in flags:
        text = self._process_numbers(text)

    return text
|
76 |
+
|
77 |
+
def _process_numbers(self, text: str) -> str:
|
78 |
+
"""Convert numbers to words based on locale"""
|
79 |
+
decimal_sep = self.locale_data["numbers"]["decimal_separator"]
|
80 |
+
thousands_sep = self.locale_data["numbers"]["thousands_separator"]
|
81 |
+
decimal_word = self.locale_data["numbers"]["decimal_word"]
|
82 |
+
threshold = self.locale_data.get("small_number_threshold", 100)
|
83 |
+
|
84 |
+
def replace_number(match):
|
85 |
+
num_str = match.group()
|
86 |
+
|
87 |
+
# Normalize number format
|