import re
import nltk
import logging
from typing import Set, Dict, Optional
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from TurkishStemmer import TurkishStemmer
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
import unicodedata
import warnings
# Suppress BeautifulSoup warning about markup resembling a filename
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
# Download required NLTK data
try:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
except Exception as e:
    print(f"Warning: Could not download NLTK data: {str(e)}")
# Configure logging
logging.basicConfig(level=logging.WARNING)
class TextPreprocessor:
    """
    A comprehensive text preprocessor for multilingual text cleaning and normalization.
    Supports multiple languages and provides various text cleaning operations.
    """

    SUPPORTED_LANGUAGES = {'en', 'es', 'fr', 'it', 'pt', 'ru', 'tr'}

    # Common contractions mapping (can be extended)
    CONTRACTIONS = {
        "ain't": "is not", "aren't": "are not", "can't": "cannot",
        "couldn't": "could not", "didn't": "did not", "doesn't": "does not",
        "don't": "do not", "hadn't": "had not", "hasn't": "has not",
        "haven't": "have not", "he'd": "he would", "he'll": "he will",
        "he's": "he is", "i'd": "i would", "i'll": "i will", "i'm": "i am",
        "i've": "i have", "isn't": "is not", "it's": "it is",
        "let's": "let us", "shouldn't": "should not", "that's": "that is",
        "there's": "there is", "they'd": "they would", "they'll": "they will",
        "they're": "they are", "they've": "they have", "wasn't": "was not",
        "we'd": "we would", "we're": "we are", "we've": "we have",
        "weren't": "were not", "what's": "what is", "where's": "where is",
        "who's": "who is", "won't": "will not", "wouldn't": "would not",
        "you'd": "you would", "you'll": "you will", "you're": "you are",
        "you've": "you have"
    }
    def __init__(self, languages: Optional[Set[str]] = None):
        """
        Initialize the text preprocessor with specified languages.
        Args:
            languages: Set of language codes to support. If None, all supported languages are used.
        """
        self.languages = languages or self.SUPPORTED_LANGUAGES
        self._initialize_resources()

    def _initialize_resources(self):
        """Initialize language-specific resources like stop words and stemmers."""
        # Initialize logging
        self.logger = logging.getLogger(__name__)

        # Initialize stop words for each language
        self.stop_words = {}
        nltk_langs = {
            'en': 'english', 'es': 'spanish', 'fr': 'french',
            'it': 'italian', 'pt': 'portuguese', 'ru': 'russian'
        }
        for lang, nltk_name in nltk_langs.items():
            if lang in self.languages:
                try:
                    self.stop_words[lang] = set(stopwords.words(nltk_name))
                except Exception as e:
                    self.logger.warning(f"Could not load stop words for {lang}: {str(e)}")
                    self.stop_words[lang] = set()

        # Add Turkish stop words manually
        if 'tr' in self.languages:
            self.stop_words['tr'] = {
                'acaba', 'ama', 'aslında', 'az', 'bazı', 'belki', 'biri', 'birkaç',
                'birşey', 'biz', 'bu', 'çok', 'çünkü', 'da', 'daha', 'de', 'defa',
                'diye', 'eğer', 'en', 'gibi', 'hem', 'hep', 'hepsi', 'her', 'hiç',
                'için', 'ile', 'ise', 'kez', 'ki', 'kim', 'mı', 'mu', 'mü', 'nasıl',
                'ne', 'neden', 'nerde', 'nerede', 'nereye', 'niçin', 'niye', 'o',
                'sanki', 'şey', 'siz', 'şu', 'tüm', 've', 'veya', 'ya', 'yani'
            }

        # Initialize stemmers
        self.stemmers = {}
        for lang, name in [
            ('en', 'english'), ('es', 'spanish'), ('fr', 'french'),
            ('it', 'italian'), ('pt', 'portuguese'), ('ru', 'russian')
        ]:
            if lang in self.languages:
                self.stemmers[lang] = SnowballStemmer(name)

        # Initialize Turkish stemmer separately
        if 'tr' in self.languages:
            self.stemmers['tr'] = TurkishStemmer()
    def remove_html(self, text: str) -> str:
        """Remove HTML tags from text."""
        return BeautifulSoup(text, "html.parser").get_text()

    def expand_contractions(self, text: str) -> str:
        """Expand contractions in English text."""
        for contraction, expansion in self.CONTRACTIONS.items():
            text = re.sub(rf'\b{contraction}\b', expansion, text, flags=re.IGNORECASE)
        return text

    def remove_accents(self, text: str) -> str:
        """Remove accents from text while preserving base characters."""
        return ''.join(c for c in unicodedata.normalize('NFKD', text)
                       if not unicodedata.combining(c))
    def clean_text(self, text: str, lang: str = 'en',
                   remove_stops: bool = True,
                   remove_numbers: bool = True,
                   remove_urls: bool = True,
                   remove_emails: bool = True,
                   remove_mentions: bool = True,
                   remove_hashtags: bool = True,
                   expand_contractions: bool = True,
                   remove_accents: bool = False,
                   min_word_length: int = 2) -> str:
        """
        Clean and normalize text with configurable options.
        Args:
            text: Input text to clean
            lang: Language code of the text
            remove_stops: Whether to remove stop words
            remove_numbers: Whether to remove numbers
            remove_urls: Whether to remove URLs
            remove_emails: Whether to remove email addresses
            remove_mentions: Whether to remove social media mentions
            remove_hashtags: Whether to remove hashtags
            expand_contractions: Whether to expand contractions (English only)
            remove_accents: Whether to remove accents from characters
            min_word_length: Minimum length of words to keep
        Returns:
            Cleaned text string
        """
        try:
            # Convert to string and lowercase
            text = str(text).lower().strip()

            # Remove HTML tags if any HTML-like content is detected
            if '<' in text and '>' in text:
                text = self.remove_html(text)

            # Remove URLs if requested
            if remove_urls:
                text = re.sub(r'http\S+|www\S+', '', text)

            # Remove email addresses if requested
            if remove_emails:
                text = re.sub(r'\S+@\S+', '', text)

            # Remove mentions if requested
            if remove_mentions:
                text = re.sub(r'@\w+', '', text)

            # Remove hashtags if requested
            if remove_hashtags:
                text = re.sub(r'#\w+', '', text)

            # Remove numbers if requested
            if remove_numbers:
                text = re.sub(r'\d+', '', text)

            # Expand contractions for English text
            if lang == 'en' and expand_contractions:
                text = self.expand_contractions(text)

            # Remove accents if requested
            if remove_accents:
                text = self.remove_accents(text)

            # Language-specific character cleaning
            if lang == 'tr':
                text = re.sub(r'[^a-zA-ZçğıöşüÇĞİÖŞÜ\s]', '', text)
            elif lang == 'ru':
                text = re.sub(r'[^а-яёА-ЯЁ\s]', '', text)
            else:
                text = re.sub(r'[^\w\s]', '', text)

            # Tokenize, falling back to a simple whitespace split if NLTK tokenization fails
            try:
                words = word_tokenize(text)
            except Exception as e:
                self.logger.debug(f"Word tokenization failed, falling back to simple split: {str(e)}")
                words = text.split()

            # Remove stop words if requested
            if remove_stops and lang in self.stop_words:
                words = [w for w in words if w not in self.stop_words[lang]]

            # Drop words shorter than the minimum length
            words = [w for w in words if len(w) >= min_word_length]

            # Rejoin words
            return ' '.join(words)
        except Exception as e:
            self.logger.warning(f"Error in text cleaning: {str(e)}")
            return text
    def stem_text(self, text: str, lang: str = 'en') -> str:
        """
        Apply language-specific stemming to text.
        Args:
            text: Input text to stem
            lang: Language code of the text
        Returns:
            Stemmed text string
        """
        try:
            if lang not in self.stemmers:
                return text
            words = text.split()
            stemmed_words = [self.stemmers[lang].stem(word) for word in words]
            return ' '.join(stemmed_words)
        except Exception as e:
            self.logger.warning(f"Error in text stemming: {str(e)}")
            return text
    def preprocess_text(self, text: str, lang: str = 'en',
                        clean_options: Optional[Dict] = None,
                        do_stemming: bool = True) -> str:
        """
        Complete preprocessing pipeline combining cleaning and stemming.
        Args:
            text: Input text to preprocess
            lang: Language code of the text
            clean_options: Dictionary of options to pass to clean_text
            do_stemming: Whether to apply stemming
        Returns:
            Preprocessed text string
        """
        # Use default cleaning options if none provided
        clean_options = clean_options or {}

        # Clean text
        cleaned_text = self.clean_text(text, lang, **clean_options)

        # Apply stemming if requested
        if do_stemming:
            cleaned_text = self.stem_text(cleaned_text, lang)

        return cleaned_text.strip()
# Usage example
if __name__ == "__main__":
    # Initialize preprocessor
    preprocessor = TextPreprocessor()

    # Example texts in different languages
    examples = {
        'en': "Here's an example! This is a test text with @mentions and #hashtags http://example.com",
        'es': "¡Hola! Este es un ejemplo de texto en español con números 12345",
        'fr': "Voici un exemple de texte en français avec des accents é è à",
        'tr': "Bu bir Türkçe örnek metindir ve bazı özel karakterler içerir."
    }

    # Process each example
    for lang, text in examples.items():
        print(f"\nProcessing {lang} text:")
        print("Original:", text)
        processed = preprocessor.preprocess_text(text, lang)
        print("Processed:", processed)