"""
TTS Text Preprocessing Utilities with Multilingual Support
"""

import re
import json
from typing import Dict, Set, Optional
from num2words import num2words
from pathlib import Path
from config.locale_manager import LocaleManager

class TTSPreprocessor:
    """Text preprocessor for TTS providers with multilingual support.

    Rewrites currency amounts, clock times, ISO dates, alphanumeric codes,
    percentages and plain numbers into text that is easier to pronounce.
    Locale-specific words and formats are read from the mapping returned by
    ``LocaleManager.get_locale`` for the selected language.
    """

    # Preprocessing flags
    PREPROCESS_NUMBERS = "numbers"
    PREPROCESS_CURRENCY = "currency"
    PREPROCESS_TIME = "time"
    PREPROCESS_DATE = "date"
    PREPROCESS_CODES = "codes"
    PREPROCESS_PERCENTAGE = "percentage"

    def __init__(self, language: str = "tr"):
        """Store *language* and load its locale data via ``LocaleManager``."""
        self.language = language
        self.locale_data = LocaleManager.get_locale(language)

    def preprocess(self, text: str, flags: Set[str]) -> str:
        """Apply the preprocessing steps selected by *flags* to *text*.

        Args:
            text: Input text.
            flags: Collection of ``PREPROCESS_*`` values; an empty or ``None``
                value leaves the text unchanged.

        Returns:
            The text after every selected step has been applied.
        """
        if not flags:
            return text

        # Fixed order: numbers run last so that digits belonging to amounts,
        # times, dates, codes and percentages are handled by their own step
        # first.
        pipeline = (
            (self.PREPROCESS_CURRENCY, self._process_currency),
            (self.PREPROCESS_TIME, self._process_time),
            (self.PREPROCESS_DATE, self._process_date),
            (self.PREPROCESS_CODES, self._process_codes),
            (self.PREPROCESS_PERCENTAGE, self._process_percentage),
            (self.PREPROCESS_NUMBERS, self._process_numbers),
        )
        for flag, step in pipeline:
            if flag in flags:
                text = step(text)

        return text

    def _number_to_words(self, number: int) -> str:
        """Spell *number* in the instance language, falling back to English."""
        try:
            return num2words(number, lang=self.language)
        except NotImplementedError:
            # Fallback to English if language not supported
            return num2words(number, lang='en')

    def _process_numbers(self, text: str) -> str:
        """Convert numbers to words based on locale.

        Whole numbers from 0 up to ``small_number_threshold`` (default 100)
        are kept as digits; larger whole numbers are spelled out. Numbers with
        a fractional part are spelled as ``<integer> <decimal_word>
        <hundredths>``, where the fraction is truncated to two digits.

        The "1.234,56" format is used for Turkish and for any locale whose
        ``decimal_separator`` is a comma; every other locale uses "1,234.56".
        If a number cannot be converted, the matched text is left untouched.
        """
        numbers = self.locale_data["numbers"]
        decimal_sep = numbers["decimal_separator"]
        decimal_word = numbers["decimal_word"]
        threshold = self.locale_data.get("small_number_threshold", 100)

        comma_decimal = self.language == "tr" or decimal_sep == ","

        def replace_number(match):
            original = match.group()

            # Normalize to plain digits with '.' as the decimal mark
            if comma_decimal:
                # 1.234,56 -> 1234.56
                normalized = original.replace('.', '').replace(',', '.')
            else:
                # 1,234.56 -> 1234.56
                normalized = original.replace(',', '')

            # Work on the digit strings rather than a float: float arithmetic
            # mis-truncates fractions (2.07 -> 6 hundredths) and corrupts
            # integers above 2**53.
            integer_digits, _, fraction_digits = normalized.partition('.')

            try:
                integer_part = int(integer_digits)

                # "5", "5.0" and "5.00" all count as the whole number 5
                if not fraction_digits.strip('0'):
                    # Keep small numbers as digits based on threshold
                    if 0 <= integer_part <= threshold:
                        return str(integer_part)
                    return self._number_to_words(integer_part)

                # Truncate the fraction to hundredths: "5" -> 50, "07" -> 7
                hundredths = int((fraction_digits + '0')[:2])
                int_words = self._number_to_words(integer_part)
                dec_words = self._number_to_words(hundredths)
                return f"{int_words} {decimal_word} {dec_words}"
            except Exception:
                # Deliberately best-effort: one unconvertible number must not
                # abort preprocessing of the whole text.
                return original

        # Match numbers with locale-specific format
        if comma_decimal:
            pattern = r'\b\d{1,3}(?:\.\d{3})*(?:,\d+)?\b|\b\d+(?:,\d+)?\b'
        else:
            pattern = r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b|\b\d+(?:\.\d+)?\b'

        return re.sub(pattern, replace_number, text)

    def _process_codes(self, text: str) -> str:
        """Spell out codes like PNR or flight numbers - language agnostic.

        A code is 2-5 uppercase ASCII letters followed by 2-5 digits; its
        characters are separated by spaces ("AB123" -> "A B 1 2 3").
        """
        def spell_code(match):
            return ' '.join(match.group())

        # Match uppercase letters followed by numbers
        pattern = r'\b[A-Z]{2,5}\d{2,5}\b'
        return re.sub(pattern, spell_code, text)

    def _process_currency(self, text: str) -> str:
        """Replace currency symbols and codes with the locale's currency word.

        Reads ``symbol``, ``word``, ``code`` and ``position`` from the
        locale's ``currency`` mapping. An amount written with the symbol or
        followed by the code becomes ``<amount> <word>``; any symbol left
        without an adjacent amount becomes `` <word> ``. The text is returned
        unchanged when the mapping is missing, malformed, or has no word.
        """
        currency_data = self.locale_data.get("currency", {})

        if not isinstance(currency_data, dict):
            return text

        symbol = currency_data.get("symbol", "")
        word = currency_data.get("word", "")
        code = currency_data.get("code", "")
        position = currency_data.get("position", "before")

        if not word:
            return text

        def amount_then_word(match):
            # A function replacement keeps the word literal, whatever
            # characters it contains.
            return f"{match.group(1)} {word}"

        if symbol:
            escaped_symbol = re.escape(symbol)

            # Symbol + amount must be handled BEFORE standalone symbols;
            # otherwise the symbol is already gone when these patterns run.
            if position == "before":
                # $100 -> 100 dollar; repeated separator groups keep
                # "$1,234.56" together as one amount
                pattern = rf'{escaped_symbol}\s*(\d+(?:[.,]\d+)*)'
            else:
                # 100₺ -> 100 lira
                pattern = rf'(\d+(?:[.,]\d+)?)\s*{escaped_symbol}'
            text = re.sub(pattern, amount_then_word, text)

            # Replace remaining standalone symbols
            text = text.replace(symbol, f" {word} ")

        # Process currency codes (e.g. "100 USD"), case-insensitively
        if code:
            pattern = rf'(\d+(?:[.,]\d+)?)\s*{re.escape(code)}\b'
            text = re.sub(pattern, amount_then_word, text, flags=re.IGNORECASE)

        return text

    def _process_percentage(self, text: str) -> str:
        """Replace the percent sign next to a number with the locale's word.

        Reads ``word`` (default "percent") and ``position`` (default "after")
        from the locale's ``percentage`` mapping. With position "before" the
        input form is "%50" and the word is placed before the number;
        otherwise the input form is "50%" and the word follows the number.
        """
        percentage = self.locale_data.get("percentage", {})

        if not isinstance(percentage, dict):
            return text

        word = percentage.get("word", "percent")
        position = percentage.get("position", "after")

        if position == "before":
            # %50 -> yüzde 50
            pattern = r'%\s*(\d+(?:[.,]\d+)?)'

            def replacement(match):
                return f"{word} {match.group(1)}"
        else:
            # 50% -> 50 percent
            pattern = r'(\d+(?:[.,]\d+)?)\s*%'

            def replacement(match):
                return f"{match.group(1)} {word}"

        return re.sub(pattern, replacement, text)

    def _process_date(self, text: str) -> str:
        """Rewrite ISO dates (YYYY-MM-DD) using the locale's month names.

        The output order follows the locale's ``date_format``: a format
        containing "DD.MM.YYYY" gives "<day> <month> <year>", one containing
        "MM/DD/YYYY" gives "<month> <day>, <year>", and any other format
        leaves the date unchanged. Month names are looked up in the locale's
        ``months`` mapping by the two-digit month, then by the unpadded month;
        if neither key exists the digits are kept.
        """
        months = self.locale_data.get("months", {})
        date_format = self.locale_data.get("date_format", "YYYY-MM-DD")

        if not isinstance(months, dict):
            return text

        if "DD.MM.YYYY" in date_format:
            day_first = True
        elif "MM/DD/YYYY" in date_format:
            day_first = False
        else:
            # No spoken form defined for this format
            return text

        def replace_date(match):
            year, month, day = match.groups()

            month_name = months.get(month)
            if month_name is None:
                # Also accept tables keyed "1".."12" instead of "01".."12"
                month_name = months.get(str(int(month)), month)

            if day_first:
                return f"{int(day)} {month_name} {year}"
            return f"{month_name} {int(day)}, {year}"

        # Lookarounds stop the pattern from matching inside longer digit runs
        pattern = r'(?<!\d)(\d{4})-(\d{2})-(\d{2})(?!\d)'
        return re.sub(pattern, replace_date, text)

    def _process_time(self, text: str) -> str:
        """Rewrite clock times (H:MM or HH:MM) based on locale.

        Reads ``format`` (default "word") and ``separator`` (default " ")
        from the locale's ``time`` mapping. In "word" format the hour and
        minute are spelled out and joined by the separator, with the minute
        omitted when it is zero; in any other format, or when the language is
        not supported by ``num2words``, the result is "<hour> <minute>" in
        digits. Matches that are not valid clock times (hour above 24 or
        minute above 59) are left unchanged.
        """
        time_data = self.locale_data.get("time", {})

        if isinstance(time_data, dict):
            time_format = time_data.get("format", "word")
            separator = time_data.get("separator", " ")
        else:
            time_format = "word"
            separator = " "

        def replace_time(match):
            hour, minute = match.groups()
            hour_int = int(hour)
            minute_int = int(minute)

            # Things like "99:99" are not clock times; leave them alone
            if hour_int > 24 or minute_int > 59:
                return match.group()

            digits = f"{hour} {minute}"
            if time_format != "word":
                return digits

            try:
                hour_word = num2words(hour_int, lang=self.language)
                if minute_int == 0:
                    return hour_word
                minute_word = num2words(minute_int, lang=self.language)
            except NotImplementedError:
                return digits

            return f"{hour_word}{separator}{minute_word}"

        # Lookarounds stop the pattern from matching inside longer digit runs
        pattern = r'(?<!\d)(\d{1,2}):(\d{2})(?!\d)'
        return re.sub(pattern, replace_time, text)