ciyidogan committed on
Commit
81e4201
·
verified ·
1 Parent(s): 3d5e78f

Create tts_preprocessor.py

Browse files
Files changed (1) hide show
  1. tts_preprocessor.py +87 -0
tts_preprocessor.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ TTS Text Preprocessing Utilities with Multilingual Support
3
+ """
4
+
5
+ import re
6
+ import json
7
+ from typing import Dict, Set, Optional
8
+ from num2words import num2words
9
+ from pathlib import Path
10
+
11
+ class TTSPreprocessor:
12
+ """Text preprocessor for TTS providers with multilingual support"""
13
+
14
+ # Preprocessing flags
15
+ PREPROCESS_NUMBERS = "numbers"
16
+ PREPROCESS_CURRENCY = "currency"
17
+ PREPROCESS_TIME = "time"
18
+ PREPROCESS_DATE = "date"
19
+ PREPROCESS_CODES = "codes"
20
+ PREPROCESS_PERCENTAGE = "percentage"
21
+
22
def __init__(self, language: str = "tr"):
    """Create a preprocessor bound to one language.

    Args:
        language: Locale code passed to ``_load_locale`` to obtain the
            locale data. Defaults to ``"tr"``.
    """
    # Keep the requested code on the instance, then resolve its locale data.
    self.language = language
    self.locale_data = self._load_locale(self.language)
25
+
26
def _load_locale(self, language: str) -> Dict:
    """Load the locale definition for ``language`` from the ``locales`` folder.

    Lookup order:

    1. ``locales/<language>.json`` next to this module.
    2. ``locales/en.json`` when the requested file is missing, unreadable,
       malformed, or does not hold a JSON object.
    3. A minimal built-in structure when English cannot be loaded either.

    Args:
        language: Locale code used as the JSON file stem (e.g. ``"tr"``).

    Returns:
        The parsed locale dictionary, or the built-in minimal structure
        (tagged with the requested ``language``) as a last resort.
    """
    locales_dir = Path(__file__).parent / "locales"
    requested_path = locales_dir / f"{language}.json"
    english_path = locales_dir / "en.json"

    if requested_path.exists():
        candidates = [requested_path]
        # A broken locale file should degrade to English exactly like a
        # missing one, instead of dropping straight to the minimal defaults.
        if requested_path != english_path:
            candidates.append(english_path)
    else:
        print(f"⚠️ Locale file not found for {language}, falling back to English")
        candidates = [english_path]

    for candidate in candidates:
        try:
            with open(candidate, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # OSError: missing or unreadable file. ValueError: invalid JSON or
            # invalid UTF-8 (JSONDecodeError and UnicodeDecodeError derive from it).
            print(f"❌ Error loading locale {language}: {e}")
            continue

        # Callers index the result by key, so anything but a JSON object
        # (e.g. a top-level list) is treated as a failed load.
        if isinstance(data, dict):
            return data
        print(
            f"❌ Error loading locale {language}: "
            f"{candidate.name} does not contain a JSON object"
        )

    # Return minimal default structure
    return {
        "language_code": language,
        "currency": {"symbols": {}, "codes": {}},
        "months": {},
        "numbers": {
            "decimal_separator": ".",
            "thousands_separator": ",",
            "decimal_word": "point",
        },
        "small_number_threshold": 100,
    }
52
+
53
def preprocess(self, text: str, flags: Set[str]) -> str:
    """Run every preprocessing step whose flag is present in ``flags``.

    Steps run in a fixed order: currency, time, date, codes, percentage
    and finally plain numbers. Each step receives the output of the
    previous one.

    Args:
        text: The text to normalise.
        flags: The ``PREPROCESS_*`` values selecting which steps to apply.

    Returns:
        The text after all selected steps have been applied; unchanged
        when no flag matches.
    """
    # Order matters: plain numbers go last to avoid conflicts with the
    # more specific steps before them.
    ordered_steps = (
        ("PREPROCESS_CURRENCY", "_process_currency"),
        ("PREPROCESS_TIME", "_process_time"),
        ("PREPROCESS_DATE", "_process_date"),
        ("PREPROCESS_CODES", "_process_codes"),
        ("PREPROCESS_PERCENTAGE", "_process_percentage"),
        ("PREPROCESS_NUMBERS", "_process_numbers"),
    )

    result = text
    for flag_attr, handler_name in ordered_steps:
        if getattr(self, flag_attr) not in flags:
            continue
        # Handlers are looked up only when their flag is requested.
        result = getattr(self, handler_name)(result)
    return result
76
+
77
+ def _process_numbers(self, text: str) -> str:
78
+ """Convert numbers to words based on locale"""
79
+ decimal_sep = self.locale_data["numbers"]["decimal_separator"]
80
+ thousands_sep = self.locale_data["numbers"]["thousands_separator"]
81
+ decimal_word = self.locale_data["numbers"]["decimal_word"]
82
+ threshold = self.locale_data.get("small_number_threshold", 100)
83
+
84
+ def replace_number(match):
85
+ num_str = match.group()
86
+
87
+ # Normalize number format