import re import emoji import nltk from nltk.tokenize import word_tokenize from nltk.corpus import stopwords from nltk.stem import WordNetLemmatizer from textblob import TextBlob from typing import List, Union import pandas as pd class TextPreprocessor: def __init__(self): # Download required NLTK data nltk.download('punkt') nltk.download('stopwords') nltk.download('wordnet') self.stop_words = set(stopwords.words('english')) self.lemmatizer = WordNetLemmatizer() def remove_urls(self, text: str) -> str: """Remove URLs from text.""" url_pattern = re.compile(r'https?://\S+|www\.\S+') return url_pattern.sub('', text) def remove_emojis(self, text: str) -> str: """Remove emojis from text.""" return emoji.replace_emoji(text, replace='') def remove_special_chars(self, text: str) -> str: """Remove special characters and numbers.""" return re.sub(r'[^a-zA-Z\s]', '', text) def remove_extra_spaces(self, text: str) -> str: """Remove extra spaces.""" return re.sub(r'\s+', ' ', text).strip() def lemmatize_text(self, text: str) -> str: """Lemmatize text.""" # Simple word tokenization using split tokens = text.split() return ' '.join([self.lemmatizer.lemmatize(token) for token in tokens]) def remove_stopwords(self, text: str) -> str: """Remove stopwords from text.""" # Simple word tokenization using split tokens = text.split() return ' '.join([token for token in tokens if token.lower() not in self.stop_words]) def correct_spelling(self, text: str) -> str: """Correct spelling in text.""" return str(TextBlob(text).correct()) def preprocess_text(self, text: str, remove_urls: bool = True, remove_emojis: bool = True, remove_special_chars: bool = True, remove_stopwords: bool = True, lemmatize: bool = True, correct_spelling: bool = False) -> str: """Apply all preprocessing steps to text.""" if not isinstance(text, str): return "" text = text.lower() if remove_urls: text = self.remove_urls(text) if remove_emojis: text = self.remove_emojis(text) if remove_special_chars: text = self.remove_special_chars(text) if remove_stopwords: text = self.remove_stopwords(text) if lemmatize: text = self.lemmatize_text(text) if correct_spelling: text = self.correct_spelling(text) text = self.remove_extra_spaces(text) return text def preprocess_dataframe(self, df: pd.DataFrame, text_column: str, **kwargs) -> pd.DataFrame: """Preprocess text column in a dataframe.""" df = df.copy() df[text_column] = df[text_column].apply( lambda x: self.preprocess_text(x, **kwargs) ) return df