# Source scraped from a Hugging Face Space (status: Sleeping; file size: 3,230 bytes; rev 469c254).
import re
import emoji
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from typing import List, Union
import pandas as pd
class TextPreprocessor:
    """Pipeline for cleaning raw text ahead of NLP modelling.

    Individually toggleable steps (see :meth:`preprocess_text`):
    lowercasing, URL removal, emoji removal, special-character removal,
    stopword removal, lemmatization, and optional spelling correction.
    """

    # Compile regex patterns once at class creation instead of on every call.
    _URL_RE = re.compile(r'https?://\S+|www\.\S+')
    _NON_ALPHA_RE = re.compile(r'[^a-zA-Z\s]')
    _WS_RE = re.compile(r'\s+')

    def __init__(self):
        # Download required NLTK data. quiet=True suppresses the repeated
        # console output that the bare download() call emits on every
        # instantiation.
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        nltk.download('wordnet', quiet=True)
        # Open Multilingual WordNet — required by WordNetLemmatizer on
        # newer NLTK releases; harmless no-op on older ones.
        nltk.download('omw-1.4', quiet=True)
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def remove_urls(self, text: str) -> str:
        """Strip http(s):// and www.-style URLs from *text*."""
        return self._URL_RE.sub('', text)

    def remove_emojis(self, text: str) -> str:
        """Strip all emoji characters from *text*."""
        return emoji.replace_emoji(text, replace='')

    def remove_special_chars(self, text: str) -> str:
        """Remove everything except ASCII letters and whitespace.

        Punctuation, digits, and non-ASCII characters are all dropped.
        """
        return self._NON_ALPHA_RE.sub('', text)

    def remove_extra_spaces(self, text: str) -> str:
        """Collapse runs of whitespace into single spaces and trim the ends."""
        return self._WS_RE.sub(' ', text).strip()

    def lemmatize_text(self, text: str) -> str:
        """Lemmatize each whitespace-separated token (default noun POS)."""
        # Hoist the bound-method lookup out of the loop.
        lemmatize = self.lemmatizer.lemmatize
        return ' '.join(lemmatize(token) for token in text.split())

    def remove_stopwords(self, text: str) -> str:
        """Drop English stopwords (case-insensitive membership test)."""
        return ' '.join(
            token for token in text.split()
            if token.lower() not in self.stop_words
        )

    def correct_spelling(self, text: str) -> str:
        """Correct spelling via TextBlob (slow; disabled by default below)."""
        return str(TextBlob(text).correct())

    def preprocess_text(self, text: str,
                        remove_urls: bool = True,
                        remove_emojis: bool = True,
                        remove_special_chars: bool = True,
                        remove_stopwords: bool = True,
                        lemmatize: bool = True,
                        correct_spelling: bool = False) -> str:
        """Apply the selected preprocessing steps to *text*, in pipeline order.

        Non-string input (e.g. NaN from a DataFrame column) yields "".
        The text is always lowercased first and whitespace-normalised last.
        """
        if not isinstance(text, str):
            return ""
        text = text.lower()
        if remove_urls:
            text = self.remove_urls(text)
        if remove_emojis:
            text = self.remove_emojis(text)
        if remove_special_chars:
            text = self.remove_special_chars(text)
        if remove_stopwords:
            text = self.remove_stopwords(text)
        if lemmatize:
            text = self.lemmatize_text(text)
        if correct_spelling:
            text = self.correct_spelling(text)
        text = self.remove_extra_spaces(text)
        return text

    def preprocess_dataframe(self, df: pd.DataFrame,
                             text_column: str,
                             **kwargs) -> pd.DataFrame:
        """Return a copy of *df* with *text_column* preprocessed in place.

        Extra keyword arguments are forwarded to :meth:`preprocess_text`.
        """
        df = df.copy()
        df[text_column] = df[text_column].apply(
            lambda x: self.preprocess_text(x, **kwargs)
        )
        return df