arkodeep committed
Commit 0d85011 · verified · Parent: 1fddcc9

Upload 3 files
training/train_model_legacy.py ADDED
@@ -0,0 +1,193 @@
+ import os
+ import pandas as pd
+ import nltk, logging, pickle
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from nltk.corpus import stopwords
+ from nltk.stem.porter import PorterStemmer
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
+ from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier
+ from sklearn.svm import SVC
+ from sklearn.naive_bayes import MultinomialNB
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ def transform_text(text):
+     """Lowercase, tokenize, drop non-alphanumeric tokens and stopwords, then stem."""
+     ps = PorterStemmer()
+     tokens = nltk.word_tokenize(text.lower())
+
+     # Keep alphanumeric tokens only (this also drops punctuation)
+     tokens = [t for t in tokens if t.isalnum()]
+
+     # Remove stopwords; build the set once instead of once per token
+     stop_words = set(stopwords.words('english'))
+     tokens = [t for t in tokens if t not in stop_words]
+
+     # Stem what remains
+     return " ".join(ps.stem(t) for t in tokens)
+
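+ # Illustrative check (not in the original commit; exact stems depend on the
+ # NLTK/Porter version):
+ #
+ #     >>> transform_text("Free entry!! Win a cash prize now")
+ #     'free entri win cash prize'
+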
+ def plot_dataset_insights(df):
+     plt.figure(figsize=(15, 5))
+
+     plt.subplot(131)
+     sns.histplot(data=df, x='num_characters', hue='target', bins=50)
+     plt.title('Message Length Distribution')
+
+     plt.subplot(132)
+     df['target'].value_counts().plot(kind='bar')
+     plt.title('Class Distribution')
+
+     plt.subplot(133)
+     sns.boxplot(data=df, x='target', y='num_words')
+     plt.title('Word Count by Class')
+
+     plt.tight_layout()
+     plt.savefig('./graphs/dataset_insights.png')
+     plt.close()
+
+ def plot_word_clouds(df):
+     from wordcloud import WordCloud
+     plt.figure(figsize=(15, 5))
+
+     for idx, label in enumerate(['ham', 'spam']):
+         # Get text for current label
+         text = ' '.join(df[df['target'] == label]['transformed_text'])
+
+         if not text.strip():
+             logger.warning(f"No text found for label: {label}")
+             continue
+
+         try:
+             wordcloud = WordCloud(width=800, height=400).generate(text)
+             plt.subplot(1, 2, idx+1)
+             plt.imshow(wordcloud)
+             plt.axis('off')
+             plt.title(f'Word Cloud - {label.upper()}')
+         except Exception as e:
+             logger.error(f"Error generating wordcloud for {label}: {e}")
+
+     plt.savefig('./graphs/wordclouds.png')
+     plt.close()
+
+ def plot_performance_metrics(y_test, y_pred, model):
+     plt.figure(figsize=(15, 5))
+
+     plt.subplot(131)
+     cm = confusion_matrix(y_test, y_pred)
+     sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
+     plt.title('Confusion Matrix')
+
+     plt.subplot(132)
+     performance_df = pd.DataFrame({
+         'Metric': ['Accuracy', 'Precision'],
+         'Score': [accuracy_score(y_test, y_pred), precision_score(y_test, y_pred)]
+     })
+     sns.barplot(x='Metric', y='Score', data=performance_df)
+     plt.title('Model Performance')
+
+     plt.subplot(133)
+     # Feature importances come from the ExtraTrees member of the fitted ensemble
+     etc = model.named_estimators_['et']
+     importances = pd.Series(etc.feature_importances_)
+     importances.nlargest(10).plot(kind='bar')
+     plt.title('Top 10 Important Features')
+
+     plt.tight_layout()
+     plt.savefig('./graphs/performance_metrics.png')
+     plt.close()
+
+ def save_metrics(metrics):
+     with open('./models/metrics.txt', 'w') as f:
+         for metric, value in metrics.items():
+             f.write(f"{metric}: {value:.4f}\n")
+
+ def main():
+     try:
+         # Ensure output directories exist before writing plots/models
+         os.makedirs('./graphs', exist_ok=True)
+         os.makedirs('./models', exist_ok=True)
+
+         # Load and preprocess data
+         logger.info("Loading data...")
+         df = pd.read_csv('./data/spam.csv', encoding='latin-1')
+         df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, errors='ignore')
+         df = df.rename(columns={'v1': 'target', 'v2': 'text'})
+
+         logger.info(f"Target value counts:\n{df['target'].value_counts()}")
+
+         # Add numerical features
+         df['num_characters'] = df['text'].apply(len)
+         df['num_words'] = df['text'].apply(lambda x: len(nltk.word_tokenize(x)))
+         df['num_sentences'] = df['text'].apply(lambda x: len(nltk.sent_tokenize(x)))
+
+         logger.info("Transforming text...")
+         df['transformed_text'] = df['text'].apply(transform_text)
+
+         # Verify transformed text
+         logger.info(f"Sample transformed text:\n{df['transformed_text'].head()}")
+
+         logger.info("Generating visualizations...")
+         plot_dataset_insights(df)
+         plot_word_clouds(df)
+
+         # Text vectorization
+         tfidf = TfidfVectorizer(max_features=3000)
+         X = tfidf.fit_transform(df['transformed_text']).toarray()
+         # Convert target to numeric for the model (ham -> 0, spam -> 1)
+         y = (df['target'] == 'spam').astype(int)
+         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
+
+         # Create ensemble
+         logger.info("Training model...")
+         svc = SVC(kernel='sigmoid', gamma=1.0, probability=True)
+         mnb = MultinomialNB()
+         etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
+
+         voting = VotingClassifier([('svm', svc), ('nb', mnb), ('et', etc)], voting='soft')
+         voting.fit(X_train, y_train)
+
+         y_pred = voting.predict(X_test)
+
+         metrics = {
+             "Accuracy": accuracy_score(y_test, y_pred),
+             "Precision": precision_score(y_test, y_pred)
+         }
+
+         save_metrics(metrics)
+         for metric, value in metrics.items():
+             logger.info(f"{metric}: {value:.4f}")
+
+         plot_performance_metrics(y_test, y_pred, voting)
+
+         logger.info("Saving models...")
+         with open('./models/vectorizer.pkl', 'wb') as f:
+             pickle.dump(tfidf, f)
+         with open('./models/model.pkl', 'wb') as f:
+             pickle.dump(voting, f)
+
+         logger.info("Training completed successfully")
+
+     except Exception as e:
+         logger.error(f"An error occurred: {e}")
+         raise
+
+ if __name__ == "__main__":
+     try:
+         nltk.download('punkt')
+         # Newer NLTK releases split the punkt model; the lite script in this
+         # commit also fetches 'punkt_tab', so do the same here.
+         nltk.download('punkt_tab')
+         nltk.download('stopwords')
+         main()
+     except Exception as e:
+         print(f"Fatal error: {e}")
training/train_model_lite.py ADDED
@@ -0,0 +1,212 @@
+ import os
+ import pandas as pd
+ import nltk, logging, pickle
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from nltk.corpus import stopwords
+ from sklearn.model_selection import train_test_split
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.svm import SVC
+ from sklearn.naive_bayes import MultinomialNB
+ from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
+ from sklearn.metrics import (confusion_matrix, classification_report,
+                              accuracy_score, precision_score, f1_score)
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Download required NLTK data once ('punkt_tab' is needed by newer NLTK releases)
+ try:
+     nltk.download('punkt')
+     nltk.download('punkt_tab')
+     nltk.download('wordnet')
+     nltk.download('stopwords')
+ except Exception as e:
+     logger.error(f"Failed to download NLTK data: {e}")
+
+ def improved_transform_text(text):
+     """Lowercase, tokenize, keep alphanumeric non-stopword tokens, lemmatize."""
+     try:
+         from nltk.stem import WordNetLemmatizer
+         lemmatizer = WordNetLemmatizer()
+         stop_words = set(stopwords.words('english'))
+
+         text = str(text).lower()
+         words = nltk.word_tokenize(text)
+
+         # isalnum() already excludes punctuation-only tokens
+         words = [lemmatizer.lemmatize(word) for word in words
+                  if word.isalnum() and word not in stop_words]
+
+         return " ".join(words)
+     except Exception as e:
+         logger.error(f"Error in text transformation: {e}")
+         return text
+
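+ # Illustrative check (not in the original commit): unlike the Porter stemmer
+ # in train_model_legacy.py, the lemmatizer keeps dictionary forms:
+ #
+ #     >>> improved_transform_text("Winning entries are prizes")
+ #     'winning entry prize'
+ #
+ # ("entries" -> "entry", "prizes" -> "prize"; "are" is a stopword)
+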
+ def extract_features(df):
+     """Add simple numeric features; note these feed the plots, not the model."""
+     try:
+         df['text_length'] = df['text'].str.len()
+         df['word_count'] = df['text'].str.split().str.len()
+         df['unique_word_count'] = df['text'].apply(lambda x: len(set(str(x).split())))
+         df['uppercase_count'] = df['text'].apply(lambda x: sum(1 for c in str(x) if c.isupper()))
+         df['special_char_count'] = df['text'].apply(lambda x: sum(not c.isalnum() for c in str(x)))
+         return df
+     except Exception as e:
+         logger.error(f"Error in feature extraction: {e}")
+         return df
+
+ def create_optimized_ensemble():
+     try:
+         svc = SVC(kernel='rbf', C=10, gamma='auto', probability=True, random_state=42)
+         mnb = MultinomialNB(alpha=0.1)
+         etc = ExtraTreesClassifier(n_estimators=200, max_depth=None,
+                                    min_samples_split=2, random_state=42)
+
+         # Soft voting with hand-tuned weights favouring the SVC and ExtraTrees
+         estimators = [('svc', svc), ('mnb', mnb), ('etc', etc)]
+         voting_clf = VotingClassifier(estimators=estimators,
+                                       voting='soft',
+                                       weights=[2, 1, 2])
+         return voting_clf
+     except Exception as e:
+         logger.error(f"Error creating ensemble: {e}")
+         raise
+
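+ # For reference: with voting='soft' and weights [2, 1, 2], sklearn predicts
+ # the argmax of the weighted average of per-class probabilities, i.e.
+ #     p(c) = (2 * p_svc(c) + 1 * p_mnb(c) + 2 * p_etc(c)) / 5
+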
+ def plot_dataset_insights(df):
+     plt.figure(figsize=(15, 5))
+
+     # Message length distribution
+     plt.subplot(131)
+     sns.histplot(data=df, x='text_length', hue='target', bins=50)
+     plt.title('Message Length Distribution')
+
+     # Class distribution
+     plt.subplot(132)
+     df['target'].value_counts().plot(kind='bar')
+     plt.title('Class Distribution')
+
+     # Word count distribution
+     plt.subplot(133)
+     sns.boxplot(data=df, x='target', y='word_count')
+     plt.title('Word Count by Class')
+
+     plt.tight_layout()
+     plt.savefig('./graphs/dataset_insights.png')
+     plt.close()
+
+ def plot_word_clouds(df):
+     from wordcloud import WordCloud
+
+     plt.figure(figsize=(15, 5))
+
+     for idx, label in enumerate(['ham', 'spam']):
+         text = ' '.join(df[df['target'] == label]['transformed_text'])
+         if not text.strip():
+             # Guard against empty input (WordCloud raises on empty text);
+             # mirrors the check in train_model_legacy.py
+             logger.warning(f"No text found for label: {label}")
+             continue
+         wordcloud = WordCloud(width=800, height=400).generate(text)
+
+         plt.subplot(1, 2, idx+1)
+         plt.imshow(wordcloud)
+         plt.axis('off')
+         plt.title(f'Word Cloud - {label.upper()}')
+
+     plt.savefig('./graphs/wordclouds.png')
+     plt.close()
+
+ def plot_performance_metrics(y_test, y_pred, model):
+     plt.figure(figsize=(15, 5))
+
+     # Confusion Matrix
+     plt.subplot(131)
+     cm = confusion_matrix(y_test, y_pred)
+     sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
+     plt.title('Confusion Matrix')
+
+     # Classification Report Visualization
+     plt.subplot(132)
+     report = classification_report(y_test, y_pred, output_dict=True)
+     sns.heatmap(pd.DataFrame(report).iloc[:-1, :].T, annot=True, cmap='RdYlGn')
+     plt.title('Classification Report')
+
+     # Feature Importance (for ExtraTreesClassifier)
+     plt.subplot(133)
+     etc = model.named_estimators_['etc']
+     importances = pd.Series(etc.feature_importances_)
+     importances.nlargest(10).plot(kind='bar')
+     plt.title('Top 10 Important Features')
+
+     plt.tight_layout()
+     plt.savefig('./graphs/performance_metrics.png')
+     plt.close()
+
+ def save_metrics(metrics):
+     with open('./models/metrics.txt', 'w') as f:
+         for metric, value in metrics.items():
+             f.write(f"{metric}: {value:.4f}\n")
+
+ def main():
+     try:
+         # Ensure output directories exist before writing plots/models
+         os.makedirs('./graphs', exist_ok=True)
+         os.makedirs('./models', exist_ok=True)
+
+         # Load and preprocess data
+         df = pd.read_csv('./data/spam.csv', encoding='latin-1')
+         df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, errors='ignore')
+         df = df.rename(columns={'v1': 'target', 'v2': 'text'})
+
+         logger.info("Preprocessing text...")
+         df['transformed_text'] = df['text'].apply(improved_transform_text)
+         df = extract_features(df)
+
+         logger.info("Generating dataset insights...")
+         plot_dataset_insights(df)
+         plot_word_clouds(df)
+
+         # Vectorization with optimized parameters
+         tfidf = TfidfVectorizer(
+             max_features=5000,
+             ngram_range=(1, 3),
+             min_df=2,
+             max_df=0.95
+         )
+
+         # Kept sparse: all three estimators accept scipy sparse input
+         X = tfidf.fit_transform(df['transformed_text'])
+         y = (df['target'] == 'spam').astype(int)
+
+         X_train, X_test, y_train, y_test = train_test_split(
+             X, y, test_size=0.2, random_state=42, stratify=y
+         )
+
+         logger.info("Training model...")
+         model = create_optimized_ensemble()
+         model.fit(X_train, y_train)
+
+         y_pred = model.predict(X_test)
+
+         metrics = {
+             "Accuracy": accuracy_score(y_test, y_pred),
+             "Precision": precision_score(y_test, y_pred),
+             "F1": f1_score(y_test, y_pred)
+         }
+
+         # Save metrics to file
+         save_metrics(metrics)
+
+         for metric, value in metrics.items():
+             logger.info(f"{metric}: {value:.4f}")
+
+         plot_performance_metrics(y_test, y_pred, model)
+
+         with open('./models/vectorizer_optimized.pkl', 'wb') as f:
+             pickle.dump(tfidf, f)
+         with open('./models/model_optimized.pkl', 'wb') as f:
+             pickle.dump(model, f)
+
+         logger.info(f"Training completed. Metrics:\n{metrics}")
+
+     except Exception as e:
+         logger.error(f"An error occurred: {e}")
+         raise
+
+ if __name__ == "__main__":
+     main()
training/train_model_mbo.py ADDED
@@ -0,0 +1,265 @@
+ import os
+ import numpy as np
+ import pandas as pd
+ import string, logging, pickle, torch
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from sklearn.model_selection import train_test_split, cross_val_score
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.svm import SVC
+ from sklearn.naive_bayes import MultinomialNB
+ from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
+ from sklearn.metrics import (confusion_matrix, classification_report,
+                              accuracy_score, precision_score, f1_score)
+ from torch.cuda import is_available as cuda_available
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ class MonarchButterflyOptimizer:
+     """Simplified Monarch Butterfly Optimization over a dict of parameter bounds."""
+
+     def __init__(self, bounds, n_butterflies=20, p_period=1.2, migration_ratio=0.85, max_iter=30, use_gpu=False):
+         self.bounds = bounds
+         self.n_butterflies = n_butterflies
+         self.p_period = p_period
+         self.migration_ratio = migration_ratio
+         self.max_iter = max_iter
+         self.best_solution = None
+         self.best_fitness = float('-inf')
+
+         # GPU setup (torch is used only as a random-number source here)
+         self.use_gpu = use_gpu and cuda_available()
+         self.device = torch.device('cuda' if self.use_gpu else 'cpu')
+         logger.info(f"Using device: {self.device}")
+
+     def initialize(self):
+         try:
+             population = []
+             for _ in range(self.n_butterflies):
+                 butterfly = {}
+                 for param, (low, high) in self.bounds.items():
+                     # Integer bounds get integer samples; float bounds get uniform floats
+                     if isinstance(low, int) and isinstance(high, int):
+                         butterfly[param] = int(torch.randint(low, high + 1, (1,), device=self.device).item())
+                     else:
+                         butterfly[param] = float(torch.rand(1, device=self.device).item() * (high - low) + low)
+                 population.append(butterfly)
+             return population
+         except RuntimeError as e:
+             logger.error(f"CUDA error during initialization: {e}")
+             self.device = torch.device('cpu')
+             logger.info("Falling back to CPU")
+             return self.initialize()
+
+     def migration(self, population):
+         try:
+             new_population = []
+             migration_tensor = torch.rand(len(population), device=self.device)
+
+             for idx, butterfly in enumerate(population):
+                 if migration_tensor[idx].item() < self.migration_ratio:
+                     # Move this butterfly toward the best solution found so far
+                     new_butterfly = {}
+                     for param in butterfly:
+                         r = torch.rand(1, device=self.device).item()
+                         new_val = butterfly[param] + self.p_period * r * (self.best_solution[param] - butterfly[param])
+                         new_butterfly[param] = self.clip(new_val, param)
+                     new_population.append(new_butterfly)
+                 else:
+                     new_population.append(butterfly.copy())
+             return new_population
+         except RuntimeError as e:
+             logger.error(f"CUDA error during migration: {e}")
+             self.device = torch.device('cpu')
+             logger.info("Falling back to CPU")
+             return self.migration(population)
+
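+     # Migration update used above: x_new = x + p_period * r * (x_best - x)
+     # with r ~ U(0, 1). Since p_period may exceed 1, a butterfly can
+     # overshoot the current best, which preserves some exploration.
+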
+     def clip(self, value, param):
+         low, high = self.bounds[param]
+         if isinstance(low, int) and isinstance(high, int):
+             return int(np.clip(value, low, high))
+         return float(np.clip(value, low, high))
+
+     def optimize(self, fitness_func):
+         population = self.initialize()
+
+         for _ in range(self.max_iter):
+             # Evaluate the population and track the best butterfly
+             for butterfly in population:
+                 fitness = fitness_func(butterfly)
+                 if fitness > self.best_fitness:
+                     self.best_fitness = fitness
+                     self.best_solution = butterfly.copy()
+
+             population = self.migration(population)
+
+         return self.best_solution, self.best_fitness
+
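+ # Minimal usage sketch (illustrative, not part of the original commit):
+ #
+ #     bounds = {'x': (0.0, 3.0), 'k': (1, 5)}
+ #     mbo = MonarchButterflyOptimizer(bounds, n_butterflies=10, max_iter=10)
+ #     best, fit = mbo.optimize(lambda b: -(b['x'] - 2.0) ** 2 + b['k'])
+ #
+ # best should land near {'x': 2.0, 'k': 5}, where the toy fitness peaks.
+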
+ def plot_dataset_insights(df):
+     plt.figure(figsize=(15, 5))
+
+     plt.subplot(131)
+     sns.histplot(data=df, x='feature_length', hue='target', bins=50)
+     plt.title('Message Length Distribution')
+
+     plt.subplot(132)
+     df['target'].value_counts().plot(kind='bar')
+     plt.title('Class Distribution')
+
+     plt.subplot(133)
+     sns.boxplot(data=df, x='target', y='word_count')
+     plt.title('Word Count by Class')
+
+     plt.tight_layout()
+     plt.savefig('./graphs/dataset_insights.png')
+     plt.close()
+
+ def plot_word_clouds(df):
+     from wordcloud import WordCloud
+     plt.figure(figsize=(15, 5))
+
+     for idx, label in enumerate(['ham', 'spam']):
+         text = ' '.join(df[df['target'] == label]['transformed_text'])
+         wordcloud = WordCloud(width=800, height=400).generate(text)
+
+         plt.subplot(1, 2, idx+1)
+         plt.imshow(wordcloud)
+         plt.axis('off')
+         plt.title(f'Word Cloud - {label.upper()}')
+
+     plt.savefig('./graphs/wordclouds.png')
+     plt.close()
+
+ def plot_performance_metrics(y_test, y_pred, model):
+     plt.figure(figsize=(15, 5))
+
+     plt.subplot(131)
+     cm = confusion_matrix(y_test, y_pred)
+     sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
+     plt.title('Confusion Matrix')
+
+     plt.subplot(132)
+     report = classification_report(y_test, y_pred, output_dict=True)
+     sns.heatmap(pd.DataFrame(report).iloc[:-1, :].T, annot=True, cmap='RdYlGn')
+     plt.title('Classification Report')
+
+     plt.subplot(133)
+     etc = model.named_estimators_['etc']
+     importances = pd.Series(etc.feature_importances_)
+     importances.nlargest(10).plot(kind='bar')
+     plt.title('Top 10 Important Features')
+
+     plt.tight_layout()
+     plt.savefig('./graphs/performance_metrics.png')
+     plt.close()
+
+ def save_metrics(metrics):
+     with open('./models/metrics.txt', 'w') as f:
+         for metric, value in metrics.items():
+             f.write(f"{metric}: {value:.4f}\n")
+
+ def create_optimized_ensemble(X_train, y_train, mbo_params):
+     param_bounds = {
+         'svc_C': (0.1, 20.0),
+         'svc_gamma': (0.001, 1.0),
+         'mnb_alpha': (0.1, 2.0),
+         'etc_n_estimators': (100, 300),
+         'w1': (0, 5),
+         'w2': (0, 5),
+         'w3': (0, 5)
+     }
+
+     mbo = MonarchButterflyOptimizer(
+         param_bounds,
+         n_butterflies=int(mbo_params.get('n_butterflies', 20)),
+         p_period=float(mbo_params.get('p_period', 1.2)),
+         migration_ratio=float(mbo_params.get('migration_ratio', 0.85)),
+         max_iter=int(mbo_params.get('max_iter', 30)),
+         use_gpu=bool(mbo_params.get('use_gpu', False))
+     )
+
+     def fitness_function(params):
+         # Fitness = mean 5-fold CV accuracy of the candidate ensemble
+         svc = SVC(kernel='rbf', C=params['svc_C'],
+                   gamma=params['svc_gamma'], probability=True)
+         mnb = MultinomialNB(alpha=params['mnb_alpha'])
+         etc = ExtraTreesClassifier(n_estimators=int(params['etc_n_estimators']))
+
+         estimators = [('svc', svc), ('mnb', mnb), ('etc', etc)]
+         weights = [params['w1'], params['w2'], params['w3']]
+         if sum(weights) == 0:
+             # All-zero weights are invalid for VotingClassifier
+             return float('-inf')
+
+         clf = VotingClassifier(estimators=estimators, voting='soft', weights=weights)
+         scores = cross_val_score(clf, X_train, y_train, cv=5)
+         return np.mean(scores)
+
+     # Run MBO with the configured optimizer
+     best_params, _ = mbo.optimize(fitness_function)
+
+     # Create final model with optimized parameters
+     svc = SVC(kernel='rbf', C=best_params['svc_C'],
+               gamma=best_params['svc_gamma'], probability=True)
+     mnb = MultinomialNB(alpha=best_params['mnb_alpha'])
+     etc = ExtraTreesClassifier(n_estimators=int(best_params['etc_n_estimators']))
+
+     estimators = [('svc', svc), ('mnb', mnb), ('etc', etc)]
+     weights = [best_params['w1'], best_params['w2'], best_params['w3']]
+
+     return VotingClassifier(estimators=estimators, voting='soft', weights=weights)
+
+ def main(mbo_params=None):
+     try:
+         # Ensure output directories exist before writing plots/models
+         os.makedirs('./graphs', exist_ok=True)
+         os.makedirs('./models', exist_ok=True)
+
+         logger.info("Loading data...")
+         # Load and preprocess data
+         df = pd.read_csv('./data/spam.csv', encoding='latin-1')
+         df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, errors='ignore')
+         df = df.rename(columns={'v1': 'target', 'v2': 'text'})
+
+         logger.info("Preprocessing text...")
+         # Lightweight cleanup only: lowercase and strip punctuation
+         df['transformed_text'] = df['text'].apply(
+             lambda x: x.lower().translate(str.maketrans('', '', string.punctuation)))
+         df['word_count'] = df['transformed_text'].str.split().str.len()
+         df['feature_length'] = df['transformed_text'].apply(len)
+
+         logger.info("Generating visualizations...")
+         plot_dataset_insights(df)
+         plot_word_clouds(df)
+
+         tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
+         X = tfidf.fit_transform(df['transformed_text'])
+         y = (df['target'] == 'spam').astype(int)
+
+         X_train, X_test, y_train, y_test = train_test_split(
+             X, y, test_size=0.2, random_state=42, stratify=y
+         )
+
+         logger.info("Training model with MBO...")
+         if mbo_params and mbo_params.get('use_gpu'):
+             logger.info("GPU acceleration enabled")
+         model = create_optimized_ensemble(X_train, y_train, mbo_params or {})
+
+         model.fit(X_train, y_train)
+
+         y_pred = model.predict(X_test)
+
+         metrics = {
+             "Accuracy": accuracy_score(y_test, y_pred),
+             "Precision": precision_score(y_test, y_pred),
+             "F1": f1_score(y_test, y_pred)
+         }
+
+         save_metrics(metrics)
+         for metric, value in metrics.items():
+             logger.info(f"{metric}: {value:.4f}")
+
+         plot_performance_metrics(y_test, y_pred, model)
+
+         logger.info("Saving models...")
+         with open('./models/vectorizer_mbo.pkl', 'wb') as f:
+             pickle.dump(tfidf, f)
+         with open('./models/model_mbo.pkl', 'wb') as f:
+             pickle.dump(model, f)
+
+         logger.info("MBO optimization completed successfully")
+
+     except Exception as e:
+         logger.error(f"An error occurred: {e}")
+         raise
+
+ if __name__ == "__main__":
+     main()
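+ # Hypothetical override of the MBO defaults (keys match the mbo_params.get
+ # calls in create_optimized_ensemble):
+ #     main({'n_butterflies': 10, 'max_iter': 5, 'use_gpu': True})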