# SAMH/data_pipeline/data_preprocessor.py
# Text-cleaning pipeline: loads a CSV, preprocesses its 'statement' column
# with NLTK (stopword removal + lemmatization), and writes cleaned_data.csv.
import os
import sys
import re
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Add the root directory to sys.path so the local logging_config package
# resolves when this file is run directly as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from logging_config.logger_config import get_logger
# Download necessary NLTK data files on import. NOTE(review): this runs a
# network check on every import; nltk skips re-downloading if already present.
nltk.download('stopwords')
nltk.download('wordnet')
# Module-level logger shared by the class and functions below.
logger = get_logger(__name__)
# Custom Preprocessor Class
class TextPreprocessor:
    """Cleans raw text for modeling.

    Pipeline: lowercase -> strip punctuation -> strip digits ->
    whitespace-tokenize -> drop English stopwords -> WordNet-lemmatize.
    Requires the NLTK 'stopwords' and 'wordnet' corpora (downloaded at
    module import).
    """

    # Hoisted, precompiled patterns: the originals were recompiled on every
    # call, and preprocess_text runs once per DataFrame row.
    _PUNCT_RE = re.compile(f'[{re.escape(string.punctuation)}]')
    _DIGIT_RE = re.compile(r'\d+')

    def __init__(self):
        # English stopword set and lemmatizer, built once per instance.
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        logger.info("TextPreprocessor initialized.")

    def preprocess_text(self, text):
        """Return a cleaned, space-joined version of *text*.

        Args:
            text: the raw text; non-string values are coerced with str().

        Returns:
            str: lowercased text with punctuation, digits, and English
            stopwords removed, and each remaining word lemmatized.
        """
        # Robustness fix: upstream dropna() usually removes NaN, but any
        # surviving non-string cell (float NaN, numeric) would crash .lower().
        if not isinstance(text, str):
            text = str(text)
        # Lowercase the text
        text = text.lower()
        # Remove punctuation (deleted, not replaced by a space — matches the
        # original behavior, so "don't" becomes "dont").
        text = self._PUNCT_RE.sub('', text)
        # Remove numbers
        text = self._DIGIT_RE.sub('', text)
        # Tokenize on whitespace, then drop stopwords and lemmatize.
        words = [
            self.lemmatizer.lemmatize(word)
            for word in text.split()
            if word not in self.stop_words
        ]
        # Join words back into a single string
        return ' '.join(words)
def load_and_preprocess_data(file_path):
    """Load a CSV, clean its 'statement' column, and write cleaned_data.csv.

    Args:
        file_path: path to the input CSV; must contain a 'statement' column.

    Returns:
        str | None: path of the written cleaned CSV, or None when the
        required column is missing (unchanged failure behavior — callers
        that ignored the original None return are unaffected).
    """
    # Load the data
    logger.info(f"Loading data from {file_path}")
    df = pd.read_csv(file_path)

    # Validate the schema before doing any work on the frame (moved ahead
    # of dropna so invalid input fails fast).
    if 'statement' not in df.columns:
        logger.error("The required column 'statement' is missing from the dataset.")
        return None

    # dropping missing values
    logger.info("Dropping missing values")
    df.dropna(inplace=True)

    # Apply the preprocessing to the 'statement' column
    preprocessor = TextPreprocessor()
    logger.info("Starting text preprocessing...")
    df['cleaned_statement'] = df['statement'].apply(preprocessor.preprocess_text)
    logger.info("Text preprocessing completed.")

    # Fix: to_csv raises OSError if './data' does not exist — create it first.
    out_dir = './data'
    os.makedirs(out_dir, exist_ok=True)
    cleaned_file_path = os.path.join(out_dir, 'cleaned_data.csv')
    df.to_csv(cleaned_file_path, index=False)
    logger.info(f"Cleaned data saved to {cleaned_file_path}")
    return cleaned_file_path
if __name__ == "__main__":
    # Script entry point: clean the raw combined dataset shipped under ./data.
    raw_dataset = os.path.join("./data", "Combined_Data.csv")
    load_and_preprocess_data(raw_dataset)