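"""Run inference with the most recent saved text-classification model.

Finds the newest model_v*.joblib file under ./models, cleans an input
string (lowercasing, punctuation and digit removal, stopword filtering,
WordNet lemmatization), and prints the model's prediction.
"""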
import os
import sys
import re
import string
import joblib
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from glob import glob

# Add the root directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from logging_config.logger_config import get_logger

# Download the NLTK data these steps rely on (quiet=True suppresses the
# progress output). Some NLTK versions also need 'omw-1.4' for WordNet
# lemmatization, so it is fetched here as well.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Get the logger
logger = get_logger(__name__)

# Custom Preprocessor Class
class TextPreprocessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        logger.info("TextPreprocessor initialized.")
    
    def preprocess_text(self, text):
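        """Normalize raw text: lowercase, strip punctuation and digits,
        drop English stopwords, and lemmatize the remaining tokens."""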
        logger.info(f"Original text: {text}")
        # Lowercase the text
        text = text.lower()
        logger.info(f"Lowercased text: {text}")
        
        # Remove punctuation
        text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
        logger.info(f"Text after punctuation removal: {text}")
        
        # Remove numbers
        text = re.sub(r'\d+', '', text)
        logger.info(f"Text after number removal: {text}")
        
        # Tokenize the text
        words = text.split()
        logger.info(f"Tokenized text: {words}")
        
        # Remove stopwords and apply lemmatization
        words = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stop_words]
        logger.info(f"Text after stopword removal and lemmatization: {words}")
        
        # Join words back into a single string
        cleaned_text = ' '.join(words)
        logger.info(f"Cleaned text: {cleaned_text}")
        
        return cleaned_text

def get_latest_model_path(models_dir='./models'):
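    """Return the path to the newest model_v*.joblib file in models_dir.

    "Newest" is judged by filesystem ctime; if the version number in the
    filename is authoritative, parsing and sorting on it would be more robust.
    """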
    model_files = glob(os.path.join(models_dir, 'model_v*.joblib'))
    if not model_files:
        logger.error("No model files found in the models directory.")
        raise FileNotFoundError("No model files found in the models directory.")
    
    latest_model_file = max(model_files, key=os.path.getctime)
    logger.info(f"Latest model file found: {latest_model_file}")
    return latest_model_file

def load_model():
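    """Load the latest model from disk with joblib and return it."""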
    model_path = get_latest_model_path()
    logger.info(f"Loading model from {model_path}")
    return joblib.load(model_path)

def predict(text, model):
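    """Clean `text` with TextPreprocessor and return the model's prediction."""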
    # Initialize the text preprocessor
    preprocessor = TextPreprocessor()
    
    # Preprocess the input text
    logger.info("Preprocessing input text...")
    cleaned_text = preprocessor.preprocess_text(text)
    
    # Make a prediction
    logger.info("Making prediction...")
    prediction = model.predict([cleaned_text])
    
    logger.info(f"Prediction: {prediction}")
    return prediction[0]

if __name__ == "__main__":
    # Example text input
    example_text = "I love programming in Python."
    
    # Load the latest model
    model = load_model()
    
    # Make a prediction
    prediction = predict(example_text, model)
    
    # Print the prediction
    print(f"Prediction: {prediction}")