# SAMH/data_pipeline/data_preprocessor.py
# Text-cleaning pipeline: loads a CSV, preprocesses its 'statement' column
# with NLTK (stopword removal + lemmatization), and writes cleaned_data.csv.
import os
import sys
import re
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Add the root directory to sys.path so the local logging_config package
# resolves when this file is run directly as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from logging_config.logger_config import get_logger
# Download necessary NLTK data files on import. NOTE(review): this runs a
# network check on every import; nltk skips re-downloading if already present.
nltk.download('stopwords')
nltk.download('wordnet')
# Module-level logger shared by the class and functions below.
logger = get_logger(__name__)
# Custom Preprocessor Class
class TextPreprocessor:
    """Cleans raw text for modeling.

    Pipeline: lowercase -> strip punctuation -> strip digits ->
    whitespace-tokenize -> drop English stopwords -> WordNet-lemmatize.
    Requires the NLTK 'stopwords' and 'wordnet' corpora (downloaded at
    module import).
    """

    # Hoisted, precompiled patterns: the originals were recompiled on every
    # call, and preprocess_text runs once per DataFrame row.
    _PUNCT_RE = re.compile(f'[{re.escape(string.punctuation)}]')
    _DIGIT_RE = re.compile(r'\d+')

    def __init__(self):
        # English stopword set and lemmatizer, built once per instance.
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        logger.info("TextPreprocessor initialized.")

    def preprocess_text(self, text):
        """Return a cleaned, space-joined version of *text*.

        Args:
            text: the raw text; non-string values are coerced with str().

        Returns:
            str: lowercased text with punctuation, digits, and English
            stopwords removed, and each remaining word lemmatized.
        """
        # Robustness fix: upstream dropna() usually removes NaN, but any
        # surviving non-string cell (float NaN, numeric) would crash .lower().
        if not isinstance(text, str):
            text = str(text)
        # Lowercase the text
        text = text.lower()
        # Remove punctuation (deleted, not replaced by a space — matches the
        # original behavior, so "don't" becomes "dont").
        text = self._PUNCT_RE.sub('', text)
        # Remove numbers
        text = self._DIGIT_RE.sub('', text)
        # Tokenize on whitespace, then drop stopwords and lemmatize.
        words = [
            self.lemmatizer.lemmatize(word)
            for word in text.split()
            if word not in self.stop_words
        ]
        # Join words back into a single string
        return ' '.join(words)
def load_and_preprocess_data(file_path):
    """Load a CSV, clean its 'statement' column, and write cleaned_data.csv.

    Args:
        file_path: path to the input CSV; must contain a 'statement' column.

    Returns:
        str | None: path of the written cleaned CSV, or None when the
        required column is missing (unchanged failure behavior — callers
        that ignored the original None return are unaffected).
    """
    # Load the data
    logger.info(f"Loading data from {file_path}")
    df = pd.read_csv(file_path)

    # Validate the schema before doing any work on the frame (moved ahead
    # of dropna so invalid input fails fast).
    if 'statement' not in df.columns:
        logger.error("The required column 'statement' is missing from the dataset.")
        return None

    # dropping missing values
    logger.info("Dropping missing values")
    df.dropna(inplace=True)

    # Apply the preprocessing to the 'statement' column
    preprocessor = TextPreprocessor()
    logger.info("Starting text preprocessing...")
    df['cleaned_statement'] = df['statement'].apply(preprocessor.preprocess_text)
    logger.info("Text preprocessing completed.")

    # Fix: to_csv raises OSError if './data' does not exist — create it first.
    out_dir = './data'
    os.makedirs(out_dir, exist_ok=True)
    cleaned_file_path = os.path.join(out_dir, 'cleaned_data.csv')
    df.to_csv(cleaned_file_path, index=False)
    logger.info(f"Cleaned data saved to {cleaned_file_path}")
    return cleaned_file_path
if __name__ == "__main__":
    # Script entry point: clean the raw combined dataset shipped under ./data.
    raw_dataset = os.path.join("./data", "Combined_Data.csv")
    load_and_preprocess_data(raw_dataset)