Spaces:

UcsTurkey
/

flare

Building

App Files Files Community

ciyidogan committed 11 days ago

Commit

1ad909f

verified ·

1 Parent(s): 269eb02

Delete tts_preprocessor.py

Browse files

Files changed (1) hide show

tts_preprocessor.py +0 -232

tts_preprocessor.py DELETED Viewed

@@ -1,232 +0,0 @@
-"""
-TTS Text Preprocessing Utilities with Multilingual Support
-"""
-import re
-import json
-from typing import Dict, Set, Optional
-from num2words import num2words
-from pathlib import Path
-from locale_manager import LocaleManager
-class TTSPreprocessor:
-    """Rewrite numbers, money, dates, times and codes so a TTS engine reads them naturally.
-
-    The rules are driven by ``self.locale_data``, the mapping returned by
-    ``LocaleManager.get_locale(language)``. The keys read by this class are:
-
-    * ``numbers.decimal_word`` and ``small_number_threshold`` (numbers)
-    * ``currency``: ``symbol``, ``word``, ``code``, ``position``
-    * ``percentage``: ``word``, ``position``
-    * ``months`` and ``date_format`` (dates)
-    * ``time``: ``format``, ``separator``
-    """
-
-    # Preprocessing flags accepted by preprocess()
-    PREPROCESS_NUMBERS = "numbers"
-    PREPROCESS_CURRENCY = "currency"
-    PREPROCESS_TIME = "time"
-    PREPROCESS_DATE = "date"
-    PREPROCESS_CODES = "codes"
-    PREPROCESS_PERCENTAGE = "percentage"
-
-    # A run of digits with any number of '.'/',' separated digit groups, so
-    # that "1,234.56" is captured as one amount instead of stopping at "1,234".
-    _AMOUNT_PATTERN = r'\d+(?:[.,]\d+)*'
-
-    def __init__(self, language: str = "tr"):
-        """Store the language code and load its locale data.
-
-        Args:
-            language: Language code passed to ``LocaleManager.get_locale`` and
-                to ``num2words`` (default ``"tr"``).
-        """
-        self.language = language
-        self.locale_data = LocaleManager.get_locale(language)
-
-    def preprocess(self, text: str, flags: Set[str]) -> str:
-        """Apply the preprocessing steps selected by ``flags``.
-
-        Args:
-            text: Text to rewrite.
-            flags: Collection of ``PREPROCESS_*`` values naming the steps to run.
-
-        Returns:
-            The rewritten text. ``text`` is returned unchanged when it is
-            empty or when no flags are given.
-        """
-        if not text or not flags:
-            return text
-        # Order matters: numbers run last so they do not consume the digits
-        # that the more specific steps (currency, time, date, ...) rely on.
-        pipeline = (
-            (self.PREPROCESS_CURRENCY, self._process_currency),
-            (self.PREPROCESS_TIME, self._process_time),
-            (self.PREPROCESS_DATE, self._process_date),
-            (self.PREPROCESS_CODES, self._process_codes),
-            (self.PREPROCESS_PERCENTAGE, self._process_percentage),
-            (self.PREPROCESS_NUMBERS, self._process_numbers),
-        )
-        for flag, step in pipeline:
-            if flag in flags:
-                text = step(text)
-        return text
-
-    def _number_to_words(self, value: int) -> str:
-        """Spell ``value`` in ``self.language``, falling back to English."""
-        try:
-            return num2words(value, lang=self.language)
-        except NotImplementedError:
-            # num2words has no converter for this language
-            return num2words(value, lang='en')
-
-    def _process_numbers(self, text: str) -> str:
-        """Convert numbers above the small-number threshold to words.
-
-        Integers from 0 up to ``small_number_threshold`` (default 100) stay as
-        digits. Decimals are spoken as ``<integer> <decimal_word> <hundredths>``,
-        where the fraction is truncated or padded to two digits.
-
-        Args:
-            text: Text to rewrite.
-
-        Returns:
-            ``text`` with the matched numbers replaced.
-
-        Raises:
-            KeyError: If the locale data has no ``numbers.decimal_word`` entry.
-        """
-        decimal_word = self.locale_data["numbers"]["decimal_word"]
-        threshold = self.locale_data.get("small_number_threshold", 100)
-        turkish_format = self.language == "tr"
-
-        def replace_number(match):
-            raw = match.group()
-            if turkish_format:
-                # Turkish: 1.234,56 -> 1234.56
-                normalized = raw.replace('.', '').replace(',', '.')
-            else:
-                # English: 1,234.56 -> 1234.56
-                normalized = raw.replace(',', '')
-            # Work on the digit strings rather than a float: float arithmetic
-            # turns 2.07 into 6 hundredths and rounds very long integers.
-            int_digits, _, frac_digits = normalized.partition('.')
-            try:
-                integer_part = int(int_digits)
-                if not frac_digits.strip('0'):
-                    # Whole number (including forms such as "5.00")
-                    if 0 <= integer_part <= threshold:
-                        return str(integer_part)
-                    return self._number_to_words(integer_part)
-                # First two fractional digits, right-padded: "5" -> 50, "07" -> 7
-                decimal_part = int((frac_digits + "0")[:2])
-                int_words = self._number_to_words(integer_part)
-                dec_words = self._number_to_words(decimal_part)
-                return f"{int_words} {decimal_word} {dec_words}"
-            except Exception:
-                # Best effort: a number that cannot be converted is left
-                # exactly as it appeared in the input.
-                return raw
-
-        # Match grouped numbers first, then plain digit runs
-        if turkish_format:
-            pattern = r'\b\d{1,3}(?:\.\d{3})*(?:,\d+)?\b|\b\d+(?:,\d+)?\b'
-        else:
-            pattern = r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b|\b\d+(?:\.\d+)?\b'
-        return re.sub(pattern, replace_number, text)
-
-    def _process_codes(self, text: str) -> str:
-        """Spell out codes such as PNRs or flight numbers character by character.
-
-        A code is 2-5 uppercase ASCII letters followed by 2-5 digits, e.g.
-        ``AB123`` becomes ``A B 1 2 3``.
-        """
-        def spell_code(match):
-            return ' '.join(match.group())
-
-        pattern = r'\b[A-Z]{2,5}\d{2,5}\b'
-        return re.sub(pattern, spell_code, text)
-
-    def _process_currency(self, text: str) -> str:
-        """Replace the locale's currency symbol and code with the currency word.
-
-        Amounts adjacent to the symbol are rewritten as ``<amount> <word>``
-        first; any symbol left over is then replaced by the word on its own.
-        Amounts followed by the currency code are matched case-insensitively.
-
-        Args:
-            text: Text to rewrite.
-
-        Returns:
-            The rewritten text, or ``text`` unchanged when the locale has no
-            usable ``currency`` entry or no currency word.
-        """
-        currency_data = self.locale_data.get("currency", {})
-        if not isinstance(currency_data, dict):
-            return text
-        symbol = currency_data.get("symbol", "")
-        word = currency_data.get("word", "")
-        code = currency_data.get("code", "")
-        position = currency_data.get("position", "before")
-        if not word:
-            return text
-        amount = self._AMOUNT_PATTERN
-
-        def amount_then_word(match):
-            # A callable replacement keeps backslashes in `word` literal
-            return f"{match.group(1)} {word}"
-
-        if symbol:
-            escaped_symbol = re.escape(symbol)
-            if position == "before":
-                # $100 -> 100 dollar
-                pattern = rf'{escaped_symbol}\s*({amount})'
-            else:
-                # 100₺ -> 100 lira
-                pattern = rf'({amount})\s*{escaped_symbol}'
-            # Amounts must be handled before the blanket replacement below,
-            # otherwise no symbol is left for this pattern to match.
-            text = re.sub(pattern, amount_then_word, text)
-            # Replace the remaining standalone symbols
-            text = text.replace(symbol, f" {word} ")
-        if code:
-            pattern = rf'({amount})\s*{re.escape(code)}\b'
-            text = re.sub(pattern, amount_then_word, text, flags=re.IGNORECASE)
-        return text
-
-    def _process_percentage(self, text: str) -> str:
-        """Replace the percent sign with the locale's percentage word.
-
-        With ``position == "before"`` the input form ``%50`` becomes
-        ``<word> 50``; otherwise ``50%`` becomes ``50 <word>``. The word
-        defaults to ``"percent"`` and the position to ``"after"``.
-        """
-        percentage = self.locale_data.get("percentage", {})
-        if not isinstance(percentage, dict):
-            return text
-        word = percentage.get("word", "percent")
-        position = percentage.get("position", "after")
-        amount = self._AMOUNT_PATTERN
-        if position == "before":
-            # %50 -> yüzde 50
-            pattern = rf'%\s*({amount})'
-
-            def replace_percentage(match):
-                return f"{word} {match.group(1)}"
-        else:
-            # 50% -> 50 percent
-            pattern = rf'({amount})\s*%'
-
-            def replace_percentage(match):
-                return f"{match.group(1)} {word}"
-        return re.sub(pattern, replace_percentage, text)
-
-    def _process_date(self, text: str) -> str:
-        """Rewrite ISO dates (``YYYY-MM-DD``) using the locale's month names.
-
-        The output order follows ``date_format``: a value containing
-        ``DD.MM.YYYY`` gives ``<day> <month> <year>``, one containing
-        ``MM/DD/YYYY`` gives ``<month> <day>, <year>``. Any other format, and
-        any match whose month or day is out of range, is left unchanged.
-        """
-        months = self.locale_data.get("months", {})
-        date_format = self.locale_data.get("date_format", "YYYY-MM-DD")
-        if not isinstance(months, dict):
-            return text
-        day_first = "DD.MM.YYYY" in date_format
-        month_first = "MM/DD/YYYY" in date_format
-        if not (day_first or month_first):
-            return text
-
-        def replace_date(match):
-            year, month, day = match.groups()
-            if not (1 <= int(month) <= 12 and 1 <= int(day) <= 31):
-                # Not a calendar date (e.g. a part number); leave it alone
-                return match.group()
-            # Fall back to the numeric month when the locale has no name for it
-            month_name = months.get(month, month)
-            if day_first:
-                return f"{int(day)} {month_name} {year}"
-            return f"{month_name} {int(day)}, {year}"
-
-        # Lookarounds stop the pattern from matching inside longer digit runs
-        pattern = r'(?<!\d)(\d{4})-(\d{2})-(\d{2})(?!\d)'
-        return re.sub(pattern, replace_date, text)
-
-    def _process_time(self, text: str) -> str:
-        """Rewrite ``H:MM`` / ``HH:MM`` clock times.
-
-        With the ``"word"`` format (the default) the hour and minute are
-        spelled out and joined by the locale separator; a zero minute is
-        omitted. With any other format, or when num2words does not support
-        the language, the result is ``"<hour> <minute>"``. Values outside
-        00:00-23:59 and times with a seconds part are left unchanged.
-        """
-        time_data = self.locale_data.get("time", {})
-        if not isinstance(time_data, dict):
-            time_data = {}
-        time_format = time_data.get("format", "word")
-        separator = time_data.get("separator", " ")
-
-        def replace_time(match):
-            hour, minute = match.groups()
-            hour_int = int(hour)
-            minute_int = int(minute)
-            if hour_int > 23 or minute_int > 59:
-                # Not a clock time (e.g. a score or ratio)
-                return match.group()
-            if time_format != "word":
-                return f"{hour} {minute}"
-            try:
-                hour_word = num2words(hour_int, lang=self.language)
-                if minute_int == 0:
-                    return hour_word
-                minute_word = num2words(minute_int, lang=self.language)
-                return f"{hour_word}{separator}{minute_word}"
-            except NotImplementedError:
-                return f"{hour} {minute}"
-
-        # The lookarounds reject digit neighbours and HH:MM:SS forms, which
-        # would otherwise be half-converted ("12:30:45" -> "<words>:45").
-        pattern = r'(?<!\d)(?<!\d:)(\d{1,2}):(\d{2})(?!\d)(?!:\d)'
-        return re.sub(pattern, replace_time, text)