# SAMH / utils.py
# (Hugging Face viewer metadata removed: "Upload 35 files", commit 6c17133 verified)
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
# Download the NLTK data files required by TextPreprocessor below.
# nltk.download is a no-op when the corpus is already present;
# quiet=True suppresses the progress chatter on every import of this module.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
# NOTE(review): recent NLTK versions expect omw-1.4 alongside wordnet for
# WordNetLemmatizer lemma lookups — harmless if unused by older versions.
nltk.download('omw-1.4', quiet=True)
# Custom transformer for text preprocessing
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Sklearn-compatible transformer that normalizes raw text documents.

    Each document is lowercased, stripped of punctuation and digits,
    whitespace-tokenized, filtered of English stopwords, and lemmatized
    with WordNet before being rejoined into a single cleaned string.
    """

    def __init__(self):
        # Loaded once per instance; requires the NLTK corpora downloaded above.
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def preprocess_text(self, text):
        """Return the cleaned, lemmatized form of one document string."""
        lowered = text.lower()
        # One-pass punctuation strip (equivalent to deleting every char in
        # string.punctuation), then remove any remaining digit runs.
        no_punct = lowered.translate(str.maketrans('', '', string.punctuation))
        no_digits = re.sub(r'\d+', '', no_punct)
        # Stopword filtering happens on the raw token; only kept tokens
        # are lemmatized, matching the original processing order.
        kept = []
        for token in no_digits.split():
            if token in self.stop_words:
                continue
            kept.append(self.lemmatizer.lemmatize(token))
        return ' '.join(kept)

    def fit(self, X, y=None):
        """Stateless transformer: nothing to fit, return self for chaining."""
        return self

    def transform(self, X, y=None):
        """Apply preprocess_text to every document in the iterable X."""
        return [self.preprocess_text(doc) for doc in X]
# Model pipeline
# End-to-end text-classification pipeline: clean the raw documents,
# turn them into sparse TF-IDF features, then fit a random forest.
# Step names ('preprocessor', 'vectorizer', 'classifier') are the keys
# used by Pipeline.named_steps / set_params, so keep them stable.
pipeline = Pipeline([
    ('preprocessor', TextPreprocessor()),      # custom cleaner defined above
    ('vectorizer', TfidfVectorizer()),         # default TF-IDF settings
    ('classifier', RandomForestClassifier())   # default hyperparameters
])