Spaces:
Building
Building
"""
TTS Text Preprocessing Utilities with Multilingual Support
"""
import json
import re
from pathlib import Path
from typing import Dict, Optional, Set, Tuple

from num2words import num2words

from config.locale_manager import LocaleManager

class TTSPreprocessor:
    """Text preprocessor for TTS providers with multilingual support.

    Rewrites currency, time, date, code, percentage and number notation in a
    text into a form that is easier to read aloud. Formatting rules are read
    from the locale data returned by ``LocaleManager.get_locale``.
    """

    # Preprocessing flags
    PREPROCESS_NUMBERS = "numbers"
    PREPROCESS_CURRENCY = "currency"
    PREPROCESS_TIME = "time"
    PREPROCESS_DATE = "date"
    PREPROCESS_CODES = "codes"
    PREPROCESS_PERCENTAGE = "percentage"

    # Digits followed by any number of "."/"," separated digit runs, so a
    # grouped amount such as "1,234.56" is captured as a whole.
    _AMOUNT_PATTERN = r'\d+(?:[.,]\d+)*'

    def __init__(self, language: str = "tr"):
        """Store *language* and load its locale data via ``LocaleManager``."""
        self.language = language
        self.locale_data = LocaleManager.get_locale(language)

    def preprocess(self, text: str, flags: Set[str]) -> str:
        """Apply the preprocessing steps selected by *flags* to *text*.

        Args:
            text: The text to rewrite.
            flags: Any combination of the ``PREPROCESS_*`` constants.

        Returns:
            The rewritten text; *text* unchanged when *text* or *flags* is
            empty.
        """
        if not text or not flags:
            return text

        # Numbers should be processed last to avoid conflicts: the other steps
        # still need to see the original digits.
        steps = (
            (self.PREPROCESS_CURRENCY, self._process_currency),
            (self.PREPROCESS_TIME, self._process_time),
            (self.PREPROCESS_DATE, self._process_date),
            (self.PREPROCESS_CODES, self._process_codes),
            (self.PREPROCESS_PERCENTAGE, self._process_percentage),
            (self.PREPROCESS_NUMBERS, self._process_numbers),
        )
        for flag, step in steps:
            if flag in flags:
                text = step(text)
        return text

    def _number_format(self) -> Tuple[str, str, str]:
        """Return ``(decimal_sep, thousands_sep, decimal_word)`` for the locale.

        Values are read from ``locale_data["numbers"]``. Missing values fall
        back to Turkish conventions when the language is ``"tr"`` and to
        English conventions otherwise. ``thousands_sep`` is returned as ``""``
        (grouping disabled) when it is blank or equal to the decimal separator.
        """
        if self.language == "tr":
            default_decimal, default_thousands, default_word = ",", ".", "virgül"
        else:
            default_decimal, default_thousands, default_word = ".", ",", "point"

        numbers = self.locale_data.get("numbers")
        if not isinstance(numbers, dict):
            numbers = {}

        decimal_sep = numbers.get("decimal_separator") or default_decimal
        thousands_sep = numbers.get("thousands_separator", default_thousands)
        decimal_word = numbers.get("decimal_word") or default_word

        # A whitespace separator would glue neighbouring, unrelated numbers
        # together ("555 123"), and one equal to the decimal separator is
        # ambiguous, so neither is used for grouping.
        if (
            not isinstance(thousands_sep, str)
            or not thousands_sep.strip()
            or thousands_sep == decimal_sep
        ):
            thousands_sep = ""
        return decimal_sep, thousands_sep, decimal_word

    def _to_words(self, number: int) -> str:
        """Spell *number* in the configured language, falling back to English."""
        try:
            return num2words(number, lang=self.language)
        except NotImplementedError:
            # Fallback to English if language not supported
            return num2words(number, lang='en')

    def _process_numbers(self, text: str) -> str:
        """Convert numbers to words based on locale.

        Whole numbers from 0 up to ``small_number_threshold`` (default 100)
        are kept as plain digits; larger ones are spelled out. Decimals are
        spoken as ``<integer> <decimal_word> <hundredths>``, where the
        fractional part is cut or zero-padded to two digits.
        """
        decimal_sep, thousands_sep, decimal_word = self._number_format()
        threshold = self.locale_data.get("small_number_threshold", 100)

        def replace_number(match):
            raw = match.group()
            digits = raw.replace(thousands_sep, "") if thousands_sep else raw
            integer_digits, _, fraction_digits = digits.partition(decimal_sep)
            try:
                # Work on the digit strings rather than floats: float maths
                # truncated e.g. 2.3 to 29 hundredths and lost precision on
                # very large integers.
                integer_part = int(integer_digits)
                if not fraction_digits.strip("0"):
                    # Whole number (no fraction, or only zeros after the
                    # separator). Keep small numbers as is based on threshold.
                    if 0 <= integer_part <= threshold:
                        return str(integer_part)
                    return self._to_words(integer_part)
                hundredths = int((fraction_digits + "0")[:2])
                int_words = self._to_words(integer_part)
                dec_words = self._to_words(hundredths)
                return f"{int_words} {decimal_word} {dec_words}"
            except Exception:
                # Best effort: if the number cannot be spelled, leave the
                # original text untouched instead of a half-normalised form.
                return raw

        # Match numbers with locale-specific format: grouped thousands first,
        # then plain digit runs, each with an optional decimal part.
        dec = re.escape(decimal_sep)
        plain = rf'\b\d+(?:{dec}\d+)?\b'
        if thousands_sep:
            grp = re.escape(thousands_sep)
            pattern = rf'\b\d{{1,3}}(?:{grp}\d{{3}})*(?:{dec}\d+)?\b|{plain}'
        else:
            pattern = plain
        return re.sub(pattern, replace_number, text)

    def _process_codes(self, text: str) -> str:
        """Process codes like PNR, flight numbers - language agnostic.

        A code is 2-5 uppercase ASCII letters directly followed by 2-5 digits;
        its characters are separated by spaces so they are read one by one.
        """
        def spell_code(match):
            return ' '.join(match.group())

        # Match uppercase letters followed by numbers
        pattern = r'\b[A-Z]{2,5}\d{2,5}\b'
        return re.sub(pattern, spell_code, text)

    def _process_currency(self, text: str) -> str:
        """Process currency symbols and amounts based on locale.

        Reads ``symbol``, ``word``, ``code`` and ``position`` from
        ``locale_data["currency"]``. Amounts carrying the symbol on the
        configured side become ``"<amount> <word>"``, any symbol left over is
        replaced by the word, and ``"<amount> <code>"`` is rewritten the same
        way (case-insensitively).
        """
        currency_data = self.locale_data.get("currency", {})
        if not isinstance(currency_data, dict):
            return text

        symbol = currency_data.get("symbol", "")
        word = currency_data.get("word", "")
        code = currency_data.get("code", "")
        position = currency_data.get("position", "before")
        amount = self._AMOUNT_PATTERN

        def amount_then_word(match):
            # A function replacement keeps backslashes in `word` literal.
            return f"{match.group(1)} {word}"

        if symbol and word:
            escaped_symbol = re.escape(symbol)
            if position == "before":
                # $100 -> 100 dollar
                pattern = rf'{escaped_symbol}\s*({amount})'
            else:
                # 100₺ -> 100 lira
                pattern = rf'({amount})\s*{escaped_symbol}'
            # Amounts must be handled before the standalone replacement below,
            # otherwise the symbol is already gone and nothing would match.
            text = re.sub(pattern, amount_then_word, text)
            # Replace standalone symbols
            text = text.replace(symbol, f" {word} ")

        # Process currency codes
        if code and word:
            pattern = rf'({amount})\s*{re.escape(code)}\b'
            text = re.sub(pattern, amount_then_word, text, flags=re.IGNORECASE)
        return text

    def _process_percentage(self, text: str) -> str:
        """Process percentage symbols based on locale.

        Reads ``word`` (default ``"percent"``) and ``position`` (default
        ``"after"``) from ``locale_data["percentage"]``.
        """
        percentage = self.locale_data.get("percentage", {})
        if not isinstance(percentage, dict):
            return text

        word = percentage.get("word", "percent")
        position = percentage.get("position", "after")

        if position == "before":
            # %50 -> yüzde 50
            pattern = r'%\s*(\d+(?:[.,]\d+)?)'

            def spoken(match):
                return f"{word} {match.group(1)}"
        else:
            # 50% -> 50 percent
            pattern = r'(\d+(?:[.,]\d+)?)\s*%'

            def spoken(match):
                return f"{match.group(1)} {word}"

        return re.sub(pattern, spoken, text)

    def _process_date(self, text: str) -> str:
        """Process date formats based on locale.

        ISO dates (``YYYY-MM-DD``) are rewritten with the month name from
        ``locale_data["months"]``: day-month-year when ``date_format``
        contains ``DD.MM.YYYY``, month-day-year when it contains
        ``MM/DD/YYYY``. With any other ``date_format``, or for a month outside
        1-12 or a day outside 1-31, the date is left unchanged.
        """
        months = self.locale_data.get("months", {})
        date_format = self.locale_data.get("date_format", "YYYY-MM-DD")
        if not isinstance(months, dict):
            return text

        # Convert ISO format dates
        def replace_date(match):
            year, month, day = match.groups()
            if not (1 <= int(month) <= 12 and 1 <= int(day) <= 31):
                return match.group()
            # Accept both zero-padded ("01") and plain ("1") month keys.
            month_name = months.get(month, months.get(str(int(month)), month))
            # Format based on locale preference
            if "DD.MM.YYYY" in date_format:
                # Turkish format with month name
                return f"{int(day)} {month_name} {year}"
            if "MM/DD/YYYY" in date_format:
                # US format with month name
                return f"{month_name} {int(day)}, {year}"
            return match.group()

        # The lookarounds stop a match from starting or ending inside a longer
        # run of digits.
        pattern = r'(?<!\d)(\d{4})-(\d{2})-(\d{2})(?!\d)'
        return re.sub(pattern, replace_date, text)

    def _process_time(self, text: str) -> str:
        """Process time formats based on locale.

        Reads ``format`` (default ``"word"``) and ``separator`` (default
        ``" "``) from ``locale_data["time"]``. In ``"word"`` format ``H:MM``
        is spelled out with ``num2words`` and the minutes are dropped when
        they are zero. In any other format, or when ``num2words`` does not
        support the language, the result is ``"<hour> <minute>"`` in digits.
        """
        time_data = self.locale_data.get("time", {})
        if isinstance(time_data, dict):
            time_format = time_data.get("format", "word")
            separator = time_data.get("separator", " ")
        else:
            time_format = "word"
            separator = " "

        def replace_time(match):
            hour, minute = match.groups()
            if time_format != "word":
                return f"{hour} {minute}"
            try:
                hour_word = num2words(int(hour), lang=self.language)
                if int(minute) == 0:
                    return hour_word
                minute_word = num2words(int(minute), lang=self.language)
            except NotImplementedError:
                return f"{hour} {minute}"
            return f"{hour_word}{separator}{minute_word}"

        # The lookarounds stop a match from starting or ending inside a longer
        # run of digits (e.g. "123:45").
        pattern = r'(?<!\d)(\d{1,2}):(\d{2})(?!\d)'
        return re.sub(pattern, replace_time, text)