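"""Multilingual text preprocessing utilities.

Provides the TextPreprocessor class for cleaning, normalizing, and stemming
text in English, Spanish, French, Italian, Portuguese, Russian, and Turkish.
"""
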
import re
import nltk
import logging
from typing import Set, Dict, Optional
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from TurkishStemmer import TurkishStemmer
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
import unicodedata
import warnings

# Suppress BeautifulSoup warning about markup resembling a filename
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

# Download the NLTK data used for tokenization and stop-word removal
try:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
except Exception as e:
    print(f"Warning: Could not download NLTK data: {str(e)}")

# Configure logging
logging.basicConfig(level=logging.WARNING)

class TextPreprocessor:
    """
    A comprehensive text preprocessor for multilingual text cleaning and normalization.
    Supports multiple languages and provides various text cleaning operations.
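
    Example (illustrative; exact output depends on the installed NLTK data
    and Snowball stemmer version):
        >>> pre = TextPreprocessor(languages={'en'})
        >>> pre.preprocess_text("Cats aren't running!", 'en')
        'cat run'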
    """
    
    SUPPORTED_LANGUAGES = {'en', 'es', 'fr', 'it', 'pt', 'ru', 'tr'}
    
    # Common contractions mapping (can be extended)
    CONTRACTIONS = {
        "ain't": "is not", "aren't": "are not", "can't": "cannot", 
        "couldn't": "could not", "didn't": "did not", "doesn't": "does not",
        "don't": "do not", "hadn't": "had not", "hasn't": "has not",
        "haven't": "have not", "he'd": "he would", "he'll": "he will",
        "he's": "he is", "i'd": "i would", "i'll": "i will", "i'm": "i am",
        "i've": "i have", "isn't": "is not", "it's": "it is",
        "let's": "let us", "shouldn't": "should not", "that's": "that is",
        "there's": "there is", "they'd": "they would", "they'll": "they will",
        "they're": "they are", "they've": "they have", "wasn't": "was not",
        "we'd": "we would", "we're": "we are", "we've": "we have",
        "weren't": "were not", "what's": "what is", "where's": "where is",
        "who's": "who is", "won't": "will not", "wouldn't": "would not",
        "you'd": "you would", "you'll": "you will", "you're": "you are",
        "you've": "you have"
    }
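    # Note: several expansions are heuristic ("he'd" can also mean "he had",
    # "it's" can mean "it has"); this is usually acceptable for bag-of-words use.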
    
    def __init__(self, languages: Optional[Set[str]] = None):
        """
        Initialize the text preprocessor with specified languages.
        
        Args:
            languages: Set of language codes to support. If None, all supported languages are used.
        """
        self.languages = languages or self.SUPPORTED_LANGUAGES
        self._initialize_resources()
        
    def _initialize_resources(self):
        """Initialize language-specific resources like stop words and stemmers."""
        # Initialize logging
        self.logger = logging.getLogger(__name__)
        
        # Initialize stop words for each language
        self.stop_words = {}
        nltk_langs = {
            'en': 'english', 'es': 'spanish', 'fr': 'french',
            'it': 'italian', 'pt': 'portuguese', 'ru': 'russian'
        }
        
        for lang, nltk_name in nltk_langs.items():
            if lang in self.languages:
                try:
                    self.stop_words[lang] = set(stopwords.words(nltk_name))
                except Exception as e:
                    self.logger.warning(f"Could not load stop words for {lang}: {str(e)}")
                    self.stop_words[lang] = set()
        
        # Add Turkish stop words manually
        if 'tr' in self.languages:
            self.stop_words['tr'] = {
                'acaba', 'ama', 'aslında', 'az', 'bazı', 'belki', 'biri', 'birkaç', 
                'birşey', 'biz', 'bu', 'çok', 'çünkü', 'da', 'daha', 'de', 'defa', 
                'diye', 'eğer', 'en', 'gibi', 'hem', 'hep', 'hepsi', 'her', 'hiç', 
                'için', 'ile', 'ise', 'kez', 'ki', 'kim', 'mı', 'mu', 'mü', 'nasıl', 
                'ne', 'neden', 'nerde', 'nerede', 'nereye', 'niçin', 'niye', 'o', 
                'sanki', 'şey', 'siz', 'şu', 'tüm', 've', 'veya', 'ya', 'yani'
            }
        
        # Initialize stemmers
        self.stemmers = {}
        for lang, name in [
            ('en', 'english'), ('es', 'spanish'), ('fr', 'french'),
            ('it', 'italian'), ('pt', 'portuguese'), ('ru', 'russian')
        ]:
            if lang in self.languages:
                self.stemmers[lang] = SnowballStemmer(name)
        
        # Initialize Turkish stemmer separately
        if 'tr' in self.languages:
            self.stemmers['tr'] = TurkishStemmer()
    
    def remove_html(self, text: str) -> str:
        """Remove HTML tags from text."""
        return BeautifulSoup(text, "html.parser").get_text()
    
    def expand_contractions(self, text: str) -> str:
        """Expand contractions in English text."""
        for contraction, expansion in self.CONTRACTIONS.items():
            # re.escape keeps user-added contractions with regex metacharacters safe
            text = re.sub(rf'\b{re.escape(contraction)}\b', expansion, text,
                          flags=re.IGNORECASE)
        return text
    
    def remove_accents(self, text: str) -> str:
        """Remove accents from text while preserving base characters."""
        return ''.join(c for c in unicodedata.normalize('NFKD', text)
                      if not unicodedata.combining(c))
    
    def clean_text(self, text: str, lang: str = 'en', 
                  remove_stops: bool = True, 
                  remove_numbers: bool = True,
                  remove_urls: bool = True,
                  remove_emails: bool = True,
                  remove_mentions: bool = True,
                  remove_hashtags: bool = True,
                  expand_contractions: bool = True,
                  remove_accents: bool = False,
                  min_word_length: int = 2) -> str:
        """
        Clean and normalize text with configurable options.
        
        Args:
            text: Input text to clean
            lang: Language code of the text
            remove_stops: Whether to remove stop words
            remove_numbers: Whether to remove numbers
            remove_urls: Whether to remove URLs
            remove_emails: Whether to remove email addresses
            remove_mentions: Whether to remove social media mentions
            remove_hashtags: Whether to remove hashtags
            expand_contractions: Whether to expand contractions (English only)
            remove_accents: Whether to remove accents from characters
            min_word_length: Minimum length of words to keep
            
        Returns:
            Cleaned text string
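
        Example (illustrative; assumes the English NLTK stop-word list is
        available, in which case "now" is dropped as a stop word):
            >>> TextPreprocessor({'en'}).clean_text("Visit http://example.com now!!!")
            'visit'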
        """
        try:
            # Convert to string and lowercase
            text = str(text).lower().strip()
            
            # Remove HTML tags if any HTML-like content is detected
            if '<' in text and '>' in text:
                text = self.remove_html(text)
            
            # Remove URLs if requested
            if remove_urls:
                text = re.sub(r'http\S+|www\S+', '', text)
            
            # Remove email addresses if requested
            if remove_emails:
                text = re.sub(r'\S+@\S+', '', text)
            
            # Remove mentions if requested
            if remove_mentions:
                text = re.sub(r'@\w+', '', text)
            
            # Remove hashtags if requested
            if remove_hashtags:
                text = re.sub(r'#\w+', '', text)
            
            # Remove numbers if requested
            if remove_numbers:
                text = re.sub(r'\d+', '', text)
            
            # Expand contractions for English text
            if lang == 'en' and expand_contractions:
                text = self.expand_contractions(text)
            
            # Remove accents if requested
            if remove_accents:
                text = self.remove_accents(text)
            
            # Language-specific character cleaning
            if lang == 'tr':
                text = re.sub(r'[^a-zA-ZçğıöşüÇĞİÖŞÜ\s]', '', text)
            elif lang == 'ru':
                text = re.sub(r'[^а-яА-ЯёЁ\s]', '', text)  # ё/Ё fall outside the а-я range
            else:
                text = re.sub(r'[^\w\s]', '', text)
            
            # Tokenize, falling back to simple whitespace splitting if NLTK fails
            try:
                words = word_tokenize(text)
            except Exception as e:
                self.logger.debug(f"Word tokenization failed, falling back to simple split: {str(e)}")
                words = text.split()
            
            # Remove stop words if requested
            if remove_stops and lang in self.stop_words:
                words = [w for w in words if w not in self.stop_words[lang]]
            
            # Remove words shorter than min_word_length
            words = [w for w in words if len(w) >= min_word_length]
            
            # Rejoin words
            return ' '.join(words)
            
        except Exception as e:
            self.logger.warning(f"Error in text cleaning: {str(e)}")
            return text
    
    def stem_text(self, text: str, lang: str = 'en') -> str:
        """
        Apply language-specific stemming to text.
        
        Args:
            text: Input text to stem
            lang: Language code of the text
            
        Returns:
            Stemmed text string
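
        Example (Snowball output for English; exact stems may vary by version):
            >>> TextPreprocessor({'en'}).stem_text("cats running quickly", 'en')
            'cat run quick'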
        """
        try:
            if lang not in self.stemmers:
                return text
                
            words = text.split()
            stemmed_words = [self.stemmers[lang].stem(word) for word in words]
            return ' '.join(stemmed_words)
            
        except Exception as e:
            self.logger.warning(f"Error in text stemming: {str(e)}")
            return text
    
    def preprocess_text(self, text: str, lang: str = 'en',
                        clean_options: Optional[Dict] = None,
                        do_stemming: bool = True) -> str:
        """
        Complete preprocessing pipeline combining cleaning and stemming.
        
        Args:
            text: Input text to preprocess
            lang: Language code of the text
            clean_options: Dictionary of options to pass to clean_text
            do_stemming: Whether to apply stemming
            
        Returns:
            Preprocessed text string
        """
        # Use default cleaning options if none provided
        clean_options = clean_options or {}
        
        # Clean text
        cleaned_text = self.clean_text(text, lang, **clean_options)
        
        # Apply stemming if requested
        if do_stemming:
            cleaned_text = self.stem_text(cleaned_text, lang)
        
        return cleaned_text.strip()

# Usage example
if __name__ == "__main__":
    # Initialize preprocessor
    preprocessor = TextPreprocessor()
    
    # Example texts in different languages
    examples = {
        'en': "Here's an example! This is a test text with @mentions and #hashtags http://example.com",
        'es': "¡Hola! Este es un ejemplo de texto en español con números 12345",
        'fr': "Voici un exemple de texte en français avec des accents é è à",
        'tr': "Bu bir Türkçe örnek metindir ve bazı özel karakterler içerir."
    }
    
    # Process each example
    for lang, text in examples.items():
        print(f"\nProcessing {lang} text:")
        print("Original:", text)
        processed = preprocessor.preprocess_text(text, lang)
        print("Processed:", processed)