# Text-classification pipeline: NLTK preprocessing -> TF-IDF -> RandomForest.
# (Non-code page metadata from the original source listing removed.)
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
# Download the NLTK corpora this module needs at import time:
# - 'stopwords' backs stopwords.words('english') in TextPreprocessor
# - 'wordnet' and 'omw-1.4' back WordNetLemmatizer; on NLTK >= 3.6.6 the
#   lemmatizer raises LookupError without 'omw-1.4', so download it too.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn-compatible transformer that normalizes raw text.

    Each document is lowercased, stripped of punctuation and digits,
    whitespace-tokenized, filtered against the English stopword list,
    and lemmatized with WordNet before being re-joined into one string.
    """

    def __init__(self):
        # English stopword set and WordNet lemmatizer, built once per instance.
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def preprocess_text(self, text):
        """Return a cleaned, lemmatized version of a single document."""
        lowered = text.lower()
        # Strip all punctuation characters, then all digit runs.
        no_punct = re.sub(f'[{re.escape(string.punctuation)}]', '', lowered)
        no_digits = re.sub(r'\d+', '', no_punct)
        # Whitespace tokenization, stopword removal, and lemmatization.
        kept = [
            self.lemmatizer.lemmatize(token)
            for token in no_digits.split()
            if token not in self.stop_words
        ]
        return ' '.join(kept)

    def fit(self, X, y=None):
        """Stateless transformer: fitting is a no-op."""
        return self

    def transform(self, X, y=None):
        """Apply preprocess_text to every document in X."""
        return [self.preprocess_text(doc) for doc in X]
# Full model: clean text -> TF-IDF feature vectors -> random-forest classifier.
pipeline = Pipeline(
    steps=[
        ('preprocessor', TextPreprocessor()),
        ('vectorizer', TfidfVectorizer()),
        ('classifier', RandomForestClassifier()),
    ]
)