"""
TTS Text Preprocessing Utilities with Multilingual Support
"""
import re
import json
from typing import Dict, Set, Optional
from pathlib import Path

from num2words import num2words

from config.locale_manager import LocaleManager


class TTSPreprocessor:
    """Text preprocessor for TTS providers with multilingual support.

    Rewrites currency amounts, times, ISO dates, alphanumeric codes,
    percentages and numbers into a form that is easier for a TTS engine to
    pronounce.  Which rewrites are applied is selected by the
    ``PREPROCESS_*`` flags passed to :meth:`preprocess`; wording and
    formatting details come from the locale data returned by
    ``LocaleManager.get_locale(language)``.
    """

    # Preprocessing flags
    PREPROCESS_NUMBERS = "numbers"
    PREPROCESS_CURRENCY = "currency"
    PREPROCESS_TIME = "time"
    PREPROCESS_DATE = "date"
    PREPROCESS_CODES = "codes"
    PREPROCESS_PERCENTAGE = "percentage"

    # Amount as written next to a currency symbol/code; allows several
    # separator groups so "1,234.56" is captured as a single amount.
    _AMOUNT_PATTERN = r'\d+(?:[.,]\d+)*'

    def __init__(self, language: str = "tr"):
        """Store the language code and load its locale data.

        Args:
            language: Language code handed to ``LocaleManager.get_locale``
                and to ``num2words``.
        """
        self.language = language
        self.locale_data = LocaleManager.get_locale(language)

    def preprocess(self, text: str, flags: Set[str]) -> str:
        """Apply the preprocessing steps selected by ``flags``.

        Args:
            text: Input text.
            flags: Any combination of the ``PREPROCESS_*`` constants.

        Returns:
            The text after all selected steps have been applied.
        """
        if self.PREPROCESS_CURRENCY in flags:
            text = self._process_currency(text)

        if self.PREPROCESS_TIME in flags:
            text = self._process_time(text)

        if self.PREPROCESS_DATE in flags:
            text = self._process_date(text)

        if self.PREPROCESS_CODES in flags:
            text = self._process_codes(text)

        if self.PREPROCESS_PERCENTAGE in flags:
            text = self._process_percentage(text)

        # Numbers should be processed last to avoid conflicts
        if self.PREPROCESS_NUMBERS in flags:
            text = self._process_numbers(text)

        return text

    def _to_words(self, value: int) -> str:
        """Spell out an integer, falling back to English if the language is unsupported."""
        try:
            return num2words(value, lang=self.language)
        except NotImplementedError:
            return num2words(value, lang='en')

    def _fraction_to_words(self, digits: str) -> str:
        """Spell out the digits that follow the decimal separator.

        One- and two-digit fractions without a leading zero are read as a
        single number ("14" -> fourteen).  Anything else is read digit by
        digit so that leading zeros and long fractions keep their value
        ("05" -> zero five).
        """
        if len(digits) <= 2 and not digits.startswith('0'):
            return self._to_words(int(digits))
        return ' '.join(self._to_words(int(digit)) for digit in digits)

    def _process_numbers(self, text: str) -> str:
        """Convert numbers to words based on locale.

        Integers between 0 and the locale's ``small_number_threshold``
        (default 100) are left as digits; larger integers and all decimals
        are spelled out.  Number format (decimal / thousands separators) is
        taken from the locale data.

        Raises:
            KeyError: If the locale data lacks the ``numbers`` section or
                one of its separator / decimal-word entries.
        """
        numbers = self.locale_data["numbers"]
        decimal_sep = numbers["decimal_separator"]
        thousands_sep = numbers["thousands_separator"]
        decimal_word = numbers["decimal_word"]
        threshold = self.locale_data.get("small_number_threshold", 100)

        # Build the pattern from the locale separators, e.g. for Turkish
        # "1.234,56" and for English "1,234.56".
        fraction = rf'(?:{re.escape(decimal_sep)}\d+)?' if decimal_sep else ''
        plain = rf'\b\d+{fraction}\b'
        if thousands_sep:
            grouped = (
                rf'\b\d{{1,3}}(?:{re.escape(thousands_sep)}\d{{3}})*'
                rf'{fraction}\b'
            )
            pattern = rf'{grouped}|{plain}'
        else:
            pattern = plain

        def replace_number(match):
            original = match.group()

            # Split into integer and fraction digits without going through
            # float, which would introduce rounding errors (4.35 -> 4.34...).
            normalized = original
            if thousands_sep:
                normalized = normalized.replace(thousands_sep, '')
            if decimal_sep:
                int_digits, _, frac_digits = normalized.partition(decimal_sep)
            else:
                int_digits, frac_digits = normalized, ''

            try:
                integer_part = int(int_digits)

                # No fraction, or an all-zero fraction such as "5.0"
                if not frac_digits.strip('0'):
                    # Keep small numbers as digits based on threshold
                    if 0 <= integer_part <= threshold:
                        return str(integer_part)
                    return self._to_words(integer_part)

                int_words = self._to_words(integer_part)
                frac_words = self._fraction_to_words(frac_digits)
                return f"{int_words} {decimal_word} {frac_words}"
            except Exception:
                # Best effort: if the number cannot be spelled out (e.g. it
                # is too large for num2words), leave the text untouched.
                return original

        return re.sub(pattern, replace_number, text)

    def _process_codes(self, text: str) -> str:
        """Process codes like PNR, flight numbers - language agnostic.

        A code is 2-5 uppercase ASCII letters followed by 2-5 digits; its
        characters are separated by spaces so they are read one by one.
        """
        # Match uppercase letters followed by numbers
        pattern = r'\b[A-Z]{2,5}\d{2,5}\b'
        return re.sub(pattern, lambda match: ' '.join(match.group()), text)

    def _process_currency(self, text: str) -> str:
        """Process currency symbols and amounts based on locale.

        Uses the locale's ``currency`` entry (``symbol``, ``word``, ``code``,
        ``position``).  Amounts attached to the symbol or followed by the
        currency code become "<amount> <word>"; any symbol left over is
        replaced by the word on its own.
        """
        currency_data = self.locale_data.get("currency", {})

        if not isinstance(currency_data, dict):
            return text

        symbol = currency_data.get("symbol", "")
        word = currency_data.get("word", "")
        code = currency_data.get("code", "")
        position = currency_data.get("position", "before")

        amount = self._AMOUNT_PATTERN

        def amount_with_word(match):
            # Callable replacement: the locale word is inserted literally,
            # never interpreted as a regex replacement template.
            return f"{match.group(1)} {word}"

        if symbol and word:
            escaped_symbol = re.escape(symbol)

            # Amounts must be handled before standalone symbols; otherwise
            # the symbol is already gone when the amount pattern runs.
            if position == "before":
                # $100 -> 100 dollar
                pattern = rf'{escaped_symbol}\s*({amount})'
            else:
                # 100₺ -> 100 lira
                pattern = rf'({amount})\s*{escaped_symbol}'
            text = re.sub(pattern, amount_with_word, text)

            # Replace remaining standalone symbols
            text = text.replace(symbol, f" {word} ")

        # Process currency codes
        if code and word:
            pattern = rf'({amount})\s*{re.escape(code)}\b'
            text = re.sub(pattern, amount_with_word, text, flags=re.IGNORECASE)

        return text

    def _process_percentage(self, text: str) -> str:
        """Process percentage symbols based on locale.

        Uses the locale's ``percentage`` entry: ``word`` (default
        "percent") and ``position`` ("before" for "%50", otherwise the
        number precedes the sign).
        """
        percentage = self.locale_data.get("percentage", {})

        if not isinstance(percentage, dict):
            return text

        word = percentage.get("word", "percent")
        position = percentage.get("position", "after")

        if position == "before":
            # %50 -> yüzde 50
            pattern = r'%\s*(\d+(?:[.,]\d+)?)'

            def replacement(match):
                return f"{word} {match.group(1)}"
        else:
            # 50% -> 50 percent
            pattern = r'(\d+(?:[.,]\d+)?)\s*%'

            def replacement(match):
                return f"{match.group(1)} {word}"

        return re.sub(pattern, replacement, text)

    def _process_date(self, text: str) -> str:
        """Process date formats based on locale.

        ISO dates (``YYYY-MM-DD``) are rewritten with the locale's month
        name when the locale's ``date_format`` contains ``DD.MM.YYYY``
        (day month year) or ``MM/DD/YYYY`` (month day, year).  With any
        other ``date_format`` the date is left unchanged.
        """
        months = self.locale_data.get("months", {})
        date_format = self.locale_data.get("date_format", "YYYY-MM-DD")

        if not isinstance(months, dict):
            return text

        def month_label(month):
            # Accept "01", "1" and 1 as keys of the months table.
            for key in (month, str(int(month)), int(month)):
                if key in months:
                    return months[key]
            return month

        # Convert ISO format dates
        def replace_date(match):
            year, month, day = match.groups()
            month_name = month_label(month)

            # Format based on locale preference
            if "DD.MM.YYYY" in date_format:
                # Turkish format with month name
                return f"{int(day)} {month_name} {year}"
            if "MM/DD/YYYY" in date_format:
                # US format with month name
                return f"{month_name} {int(day)}, {year}"
            return match.group()

        # Lookarounds keep the pattern from matching inside longer digit runs.
        pattern = r'(?<!\d)(\d{4})-(\d{2})-(\d{2})(?!\d)'
        return re.sub(pattern, replace_date, text)

    def _process_time(self, text: str) -> str:
        """Process time formats based on locale.

        ``H:MM`` / ``HH:MM`` values with hour 0-23 and minute 0-59 are
        rewritten.  With the locale's time ``format`` set to "word" (the
        default) hour and minute are spelled out and joined by the locale's
        ``separator``; a zero minute is omitted.  Otherwise, or when the
        language is not supported by num2words, the colon is replaced by a
        space.
        """
        time_data = self.locale_data.get("time", {})

        if not isinstance(time_data, dict):
            time_format = "word"
            separator = " "
        else:
            time_format = time_data.get("format", "word")
            separator = time_data.get("separator", " ")

        def replace_time(match):
            hour, minute = match.groups()
            hour_int = int(hour)
            minute_int = int(minute)

            # Not a clock time (e.g. a score like 25:23) - leave untouched.
            if hour_int > 23 or minute_int > 59:
                return match.group()

            if time_format != "word":
                return f"{hour} {minute}"

            try:
                hour_word = num2words(hour_int, lang=self.language)
                if minute_int == 0:
                    return hour_word
                minute_word = num2words(minute_int, lang=self.language)
                return f"{hour_word}{separator}{minute_word}"
            except NotImplementedError:
                return f"{hour} {minute}"

        # Lookarounds keep the pattern from matching inside longer digit runs.
        pattern = r'(?<!\d)(\d{1,2}):(\d{2})(?!\d)'
        return re.sub(pattern, replace_time, text)