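"""Text preprocessing pipeline.

Loads a CSV dataset, cleans the free-text 'statement' column (lowercasing,
punctuation/number removal, stopword filtering, lemmatization), and writes
the cleaned rows to a new CSV under ./data.
"""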
import os
import sys
import re
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Add the root directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from logging_config.logger_config import get_logger


# Download the NLTK data files needed at runtime ('omw-1.4' is required by
# the WordNet lemmatizer on some newer NLTK releases)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Get the logger
logger = get_logger(__name__)

# Custom Preprocessor Class
class TextPreprocessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        logger.info("TextPreprocessor initialized.")
    
    def preprocess_text(self, text):
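        """Clean a single text string.

        Steps: lowercase, strip punctuation and digits, split on whitespace,
        drop English stopwords, and lemmatize the remaining tokens. Expects
        `text` to be a string (drop missing values before calling).
        """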
        # logger.info(f"Original text: {text}")
        # Lowercase the text
        text = text.lower()
        # logger.info(f"Lowercased text: {text}")
        
        # Remove punctuation
        text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
        # logger.info(f"Text after punctuation removal: {text}")
        
        # Remove numbers
        text = re.sub(r'\d+', '', text)
        # logger.info(f"Text after number removal: {text}")
        
        # Tokenize the text
        words = text.split()
        # logger.info(f"Tokenized text: {words}")
        
        # Remove stopwords and apply lemmatization
        words = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stop_words]
        # logger.info(f"Text after stopword removal and lemmatization: {words}")
        
        # Join words back into a single string
        cleaned_text = ' '.join(words)
        # logger.info(f"Cleaned text: {cleaned_text}")
        
        return cleaned_text
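
# Illustrative check of the pipeline above (output assumes NLTK's default
# English stopword list and the lemmatizer's default noun mode):
#   TextPreprocessor().preprocess_text("The 3 cats were running!")  # -> 'cat running'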

def load_and_preprocess_data(file_path):
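    """Load the CSV at `file_path`, clean its 'statement' column, and save
    the result to ./data/cleaned_data.csv.

    Logs an error and returns early if the 'statement' column is missing.
    """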
    # Load the data
    logger.info(f"Loading data from {file_path}")
    df = pd.read_csv(file_path)
    # Drop rows with missing values
    logger.info("Dropping missing values")
    df.dropna(inplace=True)
    
    # Check if the necessary column exists
    if 'statement' not in df.columns:
        logger.error("The required column 'statement' is missing from the dataset.")
        return
    
    # Initialize the text preprocessor
    preprocessor = TextPreprocessor()
    
    # Apply the preprocessing to the 'statement' column
    logger.info("Starting text preprocessing...")
    df['cleaned_statement'] = df['statement'].apply(preprocessor.preprocess_text)
    logger.info("Text preprocessing completed.")
    
    # Save the cleaned data to a new CSV file
    os.makedirs('./data', exist_ok=True)  # make sure the output directory exists
    cleaned_file_path = os.path.join('./data', 'cleaned_data.csv')
    df.to_csv(cleaned_file_path, index=False)
    logger.info(f"Cleaned data saved to {cleaned_file_path}")

if __name__ == "__main__":
    # Path to the downloaded dataset
    dataset_path = os.path.join("./data", "Combined_Data.csv")
    
    # Preprocess the data
    load_and_preprocess_data(dataset_path)