"""
Text Preprocessing Module
=========================

Handles text normalization, translation, chunking, and optimization for TTS processing.
Implements caching and batch processing for improved performance.
"""

import re
import logging
from typing import List, Dict
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor
import time

import inflect
import requests
from requests.exceptions import RequestException

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class TextProcessor:
    """High-performance text processor with caching and optimization."""
    
    def __init__(self, max_chunk_length: int = 200, overlap_words: int = 5, 
                 translation_timeout: int = 10):
        """
        Initialize the text processor.
        
        Args:
            max_chunk_length: Maximum characters per chunk
            overlap_words: Number of words to overlap between chunks
            translation_timeout: Timeout for translation requests in seconds
        """
        self.max_chunk_length = max_chunk_length
        self.overlap_words = overlap_words
        self.translation_timeout = translation_timeout
        self.inflect_engine = inflect.engine()
        self.number_cache: Dict[str, str] = {}
        
        # Thread pool reserved for future parallel/batch processing; the
        # current pipeline below is synchronous and does not use it
        self.executor = ThreadPoolExecutor(max_workers=4)
        
    # NOTE: lru_cache on an instance method keys entries on (self, text) and
    # holds a reference to self for the cache's lifetime; acceptable for a
    # single long-lived processor, and clear_cache() releases the entries.
    @lru_cache(maxsize=1000)
    def _cached_translate(self, text: str) -> str:
        """
        Cached translation function to avoid repeated API calls.
        
        Args:
            text: Text to translate
            
        Returns:
            Translated text in Armenian
        """
        if not text.strip():
            return text
            
        try:
            response = requests.get(
                "https://translate.googleapis.com/translate_a/single",
                params={
                    'client': 'gtx',
                    'sl': 'auto',
                    'tl': 'hy',
                    'dt': 't',
                    'q': text,
                },
                timeout=self.translation_timeout,
            )
            response.raise_for_status()
            data = response.json()
            # The endpoint returns a list of translated segments; join them
            # so multi-sentence inputs are not silently truncated.
            translation = "".join(segment[0] for segment in data[0])
            logger.debug(f"Translated '{text}' to '{translation}'")
            return translation
            
        except (RequestException, ValueError, IndexError, TypeError) as e:
            logger.warning(f"Translation failed for '{text}': {e}")
            return text  # Return original text if translation fails
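
    # Assumed response shape from the unofficial endpoint (observed behavior,
    # not a documented contract):
    #   [[["<translated seg 1>", "<source seg 1>", ...],
    #     ["<translated seg 2>", "<source seg 2>", ...]], ...]
    # data[0] lists the segments and element 0 of each is the translated
    # text, which is why the segments are joined above.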
    
    def _convert_number_to_armenian_words(self, number: int) -> str:
        """
        Convert number to Armenian words with caching.
        
        Args:
            number: Integer to convert
            
        Returns:
            Number as Armenian words
        """
        cache_key = str(number)
        if cache_key in self.number_cache:
            return self.number_cache[cache_key]
            
        try:
            # Convert to English words first
            english_words = self.inflect_engine.number_to_words(number)
            # Translate to Armenian
            armenian_words = self._cached_translate(english_words)
            
            # Cache the result
            self.number_cache[cache_key] = armenian_words
            return armenian_words
            
        except Exception as e:
            logger.warning(f"Number conversion failed for {number}: {e}")
            return str(number)  # Fallback to original number
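
    # Example flow (the Armenian result depends on the live translation
    # endpoint, so no exact output is shown):
    #   42 -> inflect: "forty-two" -> _cached_translate(...) -> Armenian words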
    
    def _normalize_text(self, text: str) -> str:
        """
        Normalize text by handling numbers, punctuation, and special characters.
        
        Args:
            text: Input text to normalize
            
        Returns:
            Normalized text
        """
        if not text:
            return ""
            
        # Convert to string and strip
        text = str(text).strip()
        
        # Replace each run of digits with its Armenian spelled-out form.
        # Converting runs separately keeps mixed tokens such as "10:30" or
        # "2023-2024" from collapsing into a single number, and preserves
        # the non-digit characters around them.
        return re.sub(
            r'\d+',
            lambda m: self._convert_number_to_armenian_words(int(m.group())),
            text,
        )
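
    # Illustrative effect (Armenian output depends on the live endpoint):
    #   "Call me at 10"  -> "Call me at <Armenian for ten>"
    #   "10:30"          -> "<Armenian for ten>:<Armenian for thirty>"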
    
    def _split_into_sentences(self, text: str) -> List[str]:
        """
        Split text into sentences using multiple delimiters.
        
        Args:
            text: Text to split
            
        Returns:
            List of sentences
        """
        # Latin and Armenian sentence-ending punctuation
        sentence_endings = r'[.!?։՞՜]+'
        sentences = re.split(sentence_endings, text)
        
        # Clean and filter empty sentences
        sentences = [s.strip() for s in sentences if s.strip()]
        return sentences
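
    # Example (deterministic):
    #   _split_into_sentences("Hi there. How are you?")
    #   -> ["Hi there", "How are you"]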
    
    def chunk_text(self, text: str) -> List[str]:
        """
        Intelligently chunk text for optimal TTS processing.
        
        This method implements sophisticated chunking that:
        1. Respects sentence boundaries
        2. Maintains semantic coherence
        3. Includes overlap for smooth transitions
        4. Optimizes chunk sizes for the TTS model
        
        Args:
            text: Input text to chunk
            
        Returns:
            List of text chunks optimized for TTS
        """
        if not text or len(text) <= self.max_chunk_length:
            return [text] if text else []
        
        sentences = self._split_into_sentences(text)
        if not sentences:
            return [text]
        
        chunks = []
        current_chunk = ""
        
        for sentence in sentences:
            # If a single sentence is too long, split it on clause boundaries
            if len(sentence) > self.max_chunk_length:
                # Split on commas, semicolons, and the Armenian conjunctions
                # ev (and), kam (or), bayts (but)
                clauses = re.split(r'[,;]|\sև\s|\sկամ\s|\sբայց\s', sentence)
                for clause in clauses:
                    clause = clause.strip()
                    if not clause:
                        continue
                        
                    candidate = (current_chunk + " " + clause).strip()
                    if len(candidate) <= self.max_chunk_length:
                        current_chunk = candidate
                    else:
                        if current_chunk:
                            chunks.append(current_chunk)
                        current_chunk = clause
            else:
                # Try to add whole sentence
                test_chunk = (current_chunk + " " + sentence).strip()
                if len(test_chunk) <= self.max_chunk_length:
                    current_chunk = test_chunk
                else:
                    # Current chunk is full, start new one
                    if current_chunk:
                        chunks.append(current_chunk)
                    current_chunk = sentence
        
        # Add final chunk
        if current_chunk:
            chunks.append(current_chunk)
        
        # Implement overlap for smooth transitions
        if len(chunks) > 1:
            chunks = self._add_overlap(chunks)
        
        logger.info(f"Split text into {len(chunks)} chunks")
        return chunks
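
    # Worked example (deterministic; no translation involved):
    #   tp = TextProcessor(max_chunk_length=30, overlap_words=1)
    #   tp.chunk_text("One two three. Four five six. Seven eight nine.")
    #   -> ["One two three Four five six", "six Seven eight nine"]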
    
    def _add_overlap(self, chunks: List[str]) -> List[str]:
        """
        Add overlapping words between chunks for smoother transitions.
        
        Args:
            chunks: List of text chunks
            
        Returns:
            Chunks with added overlap
        """
        if len(chunks) <= 1:
            return chunks
            
        overlapped_chunks = [chunks[0]]
        
        for i in range(1, len(chunks)):
            prev_words = chunks[i-1].split()
            current_chunk = chunks[i]
            
            # Take the last few words of the previous chunk; slicing clamps
            # automatically when the chunk has fewer words than requested
            overlap_words = prev_words[-self.overlap_words:]
            overlap_text = " ".join(overlap_words)
            
            # Prepend overlap to current chunk
            overlapped_chunk = f"{overlap_text} {current_chunk}".strip()
            overlapped_chunks.append(overlapped_chunk)
        
        return overlapped_chunks
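
    # Example of the overlap mechanics (deterministic):
    #   _add_overlap(["a b c d e f", "g h i"]) with overlap_words=2
    #   -> ["a b c d e f", "e f g h i"]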
    
    def process_text(self, text: str) -> str:
        """
        Main text processing pipeline.
        
        Args:
            text: Raw input text
            
        Returns:
            Processed and normalized text ready for TTS
        """
        start_time = time.time()
        
        if not text or not text.strip():
            return ""
        
        try:
            # Normalize the text
            processed_text = self._normalize_text(text)
            
            processing_time = time.time() - start_time
            logger.info(f"Text processed in {processing_time:.3f}s")
            
            return processed_text
            
        except Exception as e:
            logger.error(f"Text processing failed: {e}")
            return str(text)  # Return original text as fallback
    
    def process_chunks(self, text: str) -> List[str]:
        """
        Process text and return optimized chunks for TTS.
        
        Args:
            text: Input text
            
        Returns:
            List of processed text chunks
        """
        # First normalize the text
        processed_text = self.process_text(text)
        
        # Then chunk it
        chunks = self.chunk_text(processed_text)
        
        return chunks
    
    def clear_cache(self):
        """Clear all caches to free memory."""
        # The lru_cache lives on the class-level function, so this also
        # clears cached translations made by other instances
        self._cached_translate.cache_clear()
        self.number_cache.clear()
        logger.info("Caches cleared")
    
    def get_cache_stats(self) -> Dict[str, int]:
        """Get statistics about cache usage."""
        cache_info = self._cached_translate.cache_info()
        return {
            "translation_cache_size": cache_info.currsize,
            "number_cache_size": len(self.number_cache),
            "lru_cache_hits": cache_info.hits,
            "lru_cache_misses": cache_info.misses,
        }
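

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative): exercises the public pipeline end to
# end. process_chunks() calls the live, unofficial translation endpoint for
# any digits in the input, so outputs depend on network availability; on
# failure the processor falls back to the original text.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    processor = TextProcessor(max_chunk_length=120, overlap_words=3)

    sample = "The meeting starts at 10 and ends at 12."
    for idx, chunk in enumerate(processor.process_chunks(sample), start=1):
        print(f"chunk {idx}: {chunk}")

    print(processor.get_cache_stats())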