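"""Text preprocessing utilities for NLP pipelines.

Provides a TextPreprocessor class that chains common cleaning steps
(URL/emoji removal, special-character stripping, stopword removal,
lemmatization, optional spelling correction) for single strings and
pandas DataFrames.
"""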
import re

import emoji
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

class TextPreprocessor:
    def __init__(self):
        # Download required NLTK data (quiet=True suppresses the
        # repeated download messages on every instantiation)
        nltk.download('stopwords', quiet=True)
        nltk.download('wordnet', quiet=True)

        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        
    def remove_urls(self, text: str) -> str:
        """Remove URLs from text."""
        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        return url_pattern.sub('', text)
    
    def remove_emojis(self, text: str) -> str:
        """Remove emojis from text."""
        return emoji.replace_emoji(text, replace='')
    
    def remove_special_chars(self, text: str) -> str:
        """Remove special characters and numbers."""
        return re.sub(r'[^a-zA-Z\s]', '', text)
    
    def remove_extra_spaces(self, text: str) -> str:
        """Remove extra spaces."""
        return re.sub(r'\s+', ' ', text).strip()
    
    def lemmatize_text(self, text: str) -> str:
        """Lemmatize text with WordNet (noun POS by default)."""
        # Simple whitespace tokenization; note that lemmatize() defaults
        # to noun POS, so inflected verbs (e.g. "running") pass through.
        tokens = text.split()
        return ' '.join([self.lemmatizer.lemmatize(token) for token in tokens])
    
    def remove_stopwords(self, text: str) -> str:
        """Remove stopwords from text."""
        # Simple word tokenization using split
        tokens = text.split()
        return ' '.join([token for token in tokens if token.lower() not in self.stop_words])
    
    def correct_spelling(self, text: str) -> str:
        """Correct spelling via TextBlob (slow on large corpora, so
        disabled by default in preprocess_text)."""
        return str(TextBlob(text).correct())
    
    def preprocess_text(self, text: str, 
                       remove_urls: bool = True,
                       remove_emojis: bool = True,
                       remove_special_chars: bool = True,
                       remove_stopwords: bool = True,
                       lemmatize: bool = True,
                       correct_spelling: bool = False) -> str:
        """Apply all preprocessing steps to text."""
        if not isinstance(text, str):
            return ""
            
        text = text.lower()
        
        if remove_urls:
            text = self.remove_urls(text)
        if remove_emojis:
            text = self.remove_emojis(text)
        if remove_special_chars:
            text = self.remove_special_chars(text)
        if remove_stopwords:
            text = self.remove_stopwords(text)
        if lemmatize:
            text = self.lemmatize_text(text)
        if correct_spelling:
            text = self.correct_spelling(text)
            
        text = self.remove_extra_spaces(text)
        return text
    
    def preprocess_dataframe(self, df: pd.DataFrame, 
                           text_column: str,
                           **kwargs) -> pd.DataFrame:
        """Preprocess text column in a dataframe."""
        df = df.copy()
        df[text_column] = df[text_column].apply(
            lambda x: self.preprocess_text(x, **kwargs)
        )
        return df
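

if __name__ == "__main__":
    # Minimal usage sketch. The sample strings and the "review" column
    # name below are illustrative only, not part of the class itself.
    preprocessor = TextPreprocessor()

    sample = "Check out https://example.com 🚀 It's AMAZING!!! 123"
    print(preprocessor.preprocess_text(sample))
    # URL, emoji, digits, and stopwords removed -> "check amazing"

    df = pd.DataFrame({"review": [sample, "Great product, would buy again!"]})
    cleaned = preprocessor.preprocess_dataframe(df, text_column="review")
    print(cleaned["review"].tolist())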