# data_pipeline/data_preprocessor.py
import os
import sys
import re
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Add the project root to sys.path so sibling packages (e.g. logging_config) can be imported
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from logging_config.logger_config import get_logger

# Download the NLTK corpora required at runtime (no-ops if already present)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)  # required for WordNet lookups on newer NLTK releases

# Get the logger
logger = get_logger(__name__)

# Custom preprocessor class
class TextPreprocessor:
    """Cleans raw text via lowercasing, punctuation and digit removal,
    stopword filtering, and WordNet lemmatization."""

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        logger.info("TextPreprocessor initialized.")

    def preprocess_text(self, text):
        # Lowercase the text
        text = text.lower()
        # Remove punctuation
        text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
        # Remove numbers
        text = re.sub(r'\d+', '', text)
        # Tokenize on whitespace
        words = text.split()
        # Drop stopwords and lemmatize the remaining tokens
        words = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stop_words]
        # Join the tokens back into a single string
        cleaned_text = ' '.join(words)
        return cleaned_text
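
# A minimal illustration of the pipeline above (output traced by hand through the steps):
#   >>> TextPreprocessor().preprocess_text("The 3 cats are running!")
#   'cat running'
# "the"/"are" are stopwords, "3" and "!" are stripped, and "cats" lemmatizes to "cat".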

def load_and_preprocess_data(file_path):
    """Loads a CSV, cleans its 'statement' column, and saves the result."""
    # Load the data
    logger.info(f"Loading data from {file_path}")
    df = pd.read_csv(file_path)

    # Drop rows with missing values
    logger.info("Dropping missing values")
    df.dropna(inplace=True)

    # Abort early if the required column is absent
    if 'statement' not in df.columns:
        logger.error("The required column 'statement' is missing from the dataset.")
        return

    # Initialize the text preprocessor
    preprocessor = TextPreprocessor()

    # Apply the preprocessing to the 'statement' column
    logger.info("Starting text preprocessing...")
    df['cleaned_statement'] = df['statement'].apply(preprocessor.preprocess_text)
    logger.info("Text preprocessing completed.")

    # Save the cleaned data to a new CSV file
    cleaned_file_path = os.path.join('./data', 'cleaned_data.csv')
    os.makedirs('./data', exist_ok=True)  # ensure the output directory exists
    df.to_csv(cleaned_file_path, index=False)
    logger.info(f"Cleaned data saved to {cleaned_file_path}")

if __name__ == "__main__":
    # Path to the downloaded dataset
    dataset_path = os.path.join("./data", "Combined_Data.csv")
    # Preprocess the data
    load_and_preprocess_data(dataset_path)
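
# Note: run this from the repository root (e.g. `python data_pipeline/data_preprocessor.py`,
# matching the file's path in this repo) so the relative './data' input and output paths
# resolve against the current working directory.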