Upload 3 files
- training/train_model_legacy.py +193 -0
- training/train_model_lite.py +212 -0
- training/train_model_mbo.py +265 -0
training/train_model_legacy.py
ADDED
@@ -0,0 +1,193 @@
import os
import pandas as pd
import nltk, string, logging, pickle
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def transform_text(text):
    """Lowercase, tokenize, keep alphanumeric tokens, drop stopwords, stem."""
    ps = PorterStemmer()
    # Build the stopword set once; calling stopwords.words() per token is slow.
    stop_words = set(stopwords.words('english'))

    text = text.lower()
    tokens = nltk.word_tokenize(text)

    # Keep alphanumeric tokens only (this already drops punctuation tokens).
    tokens = [t for t in tokens if t.isalnum()]

    # Remove stopwords.
    tokens = [t for t in tokens if t not in stop_words]

    # Stem what remains.
    return " ".join(ps.stem(t) for t in tokens)
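# Illustrative example (not in the original script): on a typical message,
#   transform_text("Did you WIN a FREE prize?? Call now!")
# lowercases and tokenizes, drops the non-alphanumeric "??"/"!" tokens and the
# stopwords "did"/"you"/"a"/"now", then stems, yielding roughly "win free prize call".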

def plot_dataset_insights(df):
    plt.figure(figsize=(15, 5))

    plt.subplot(131)
    sns.histplot(data=df, x='num_characters', hue='target', bins=50)
    plt.title('Message Length Distribution')

    plt.subplot(132)
    df['target'].value_counts().plot(kind='bar')
    plt.title('Class Distribution')

    plt.subplot(133)
    sns.boxplot(data=df, x='target', y='num_words')
    plt.title('Word Count by Class')

    plt.tight_layout()
    plt.savefig('./graphs/dataset_insights.png')
    plt.close()

def plot_word_clouds(df):
    from wordcloud import WordCloud
    plt.figure(figsize=(15, 5))

    for idx, label in enumerate(['ham', 'spam']):
        # Get text for current label
        text = ' '.join(df[df['target'] == label]['transformed_text'])

        if not text.strip():
            logger.warning(f"No text found for label: {label}")
            continue

        try:
            wordcloud = WordCloud(width=800, height=400).generate(text)
            plt.subplot(1, 2, idx + 1)
            plt.imshow(wordcloud)
            plt.axis('off')
            plt.title(f'Word Cloud - {label.upper()}')
        except Exception as e:
            logger.error(f"Error generating wordcloud for {label}: {e}")

    plt.savefig('./graphs/wordclouds.png')
    plt.close()

def plot_performance_metrics(y_test, y_pred, model):
    plt.figure(figsize=(15, 5))

    plt.subplot(131)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')

    plt.subplot(132)
    performance_df = pd.DataFrame({
        'Metric': ['Accuracy', 'Precision'],
        'Score': [accuracy_score(y_test, y_pred), precision_score(y_test, y_pred)]
    })
    sns.barplot(x='Metric', y='Score', data=performance_df)
    plt.title('Model Performance')

    plt.subplot(133)
    etc = model.named_estimators_['et']
    importances = pd.Series(etc.feature_importances_)
    importances.nlargest(10).plot(kind='bar')
    plt.title('Top 10 Important Features')

    plt.tight_layout()
    plt.savefig('./graphs/performance_metrics.png')
    plt.close()

def save_metrics(metrics):
    with open('./models/metrics.txt', 'w') as f:
        for metric, value in metrics.items():
            f.write(f"{metric}: {value:.4f}\n")

def main():
    try:
        # Output directories must exist before the savefig/pickle calls below.
        os.makedirs('./graphs', exist_ok=True)
        os.makedirs('./models', exist_ok=True)

        # Load and preprocess data
        logger.info("Loading data...")
        df = pd.read_csv('./data/spam.csv', encoding='latin-1')
        df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
        df = df.rename(columns={'v1': 'target', 'v2': 'text'})

        logger.info(f"Target value counts:\n{df['target'].value_counts()}")

        # Add numerical features
        df['num_characters'] = df['text'].apply(len)
        df['num_words'] = df['text'].apply(lambda x: len(nltk.word_tokenize(x)))
        df['num_sentences'] = df['text'].apply(lambda x: len(nltk.sent_tokenize(x)))

        logger.info("Transforming text...")
        df['transformed_text'] = df['text'].apply(transform_text)

        # Verify transformed text
        logger.info(f"Sample transformed text:\n{df['transformed_text'].head()}")

        logger.info("Generating visualizations...")
        plot_dataset_insights(df)
        plot_word_clouds(df)

        # Text vectorization
        tfidf = TfidfVectorizer(max_features=3000)
        X = tfidf.fit_transform(df['transformed_text']).toarray()
        # Convert target to numeric for the model
        y = (df['target'] == 'spam').astype(int)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

        # Create ensemble
        logger.info("Training model...")
        svc = SVC(kernel='sigmoid', gamma=1.0, probability=True)
        mnb = MultinomialNB()
        etc = ExtraTreesClassifier(n_estimators=50, random_state=2)

        voting = VotingClassifier([('svm', svc), ('nb', mnb), ('et', etc)], voting='soft')
        voting.fit(X_train, y_train)

        y_pred = voting.predict(X_test)

        metrics = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred)
        }

        save_metrics(metrics)
        for metric, value in metrics.items():
            logger.info(f"{metric}: {value:.4f}")

        plot_performance_metrics(y_test, y_pred, voting)

        logger.info("Saving models...")
        # Context managers close the pickle files deterministically.
        with open('./models/vectorizer.pkl', 'wb') as f:
            pickle.dump(tfidf, f)
        with open('./models/model.pkl', 'wb') as f:
            pickle.dump(voting, f)

        logger.info("Training completed successfully")

    except Exception as e:
        logger.error(f"An error occurred: {e}")
        raise

if __name__ == "__main__":
    try:
        nltk.download('punkt')
        # punkt_tab is additionally required by word_tokenize on newer NLTK releases.
        nltk.download('punkt_tab')
        nltk.download('stopwords')
        main()
    except Exception as e:
        print(f"Fatal error: {e}")
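
The saved vectorizer/model pair must be applied with the same preprocessing at inference time. A minimal loading sketch (not part of the upload; it assumes the script has already been run so ./models is populated, train_model_legacy.py is importable from the working directory, and the NLTK data is downloaded):

import pickle
from train_model_legacy import transform_text  # reuse the training-time preprocessing

with open('./models/vectorizer.pkl', 'rb') as f:
    tfidf = pickle.load(f)
with open('./models/model.pkl', 'rb') as f:
    model = pickle.load(f)

msg = "WINNER!! Claim your free prize now"
X = tfidf.transform([transform_text(msg)]).toarray()  # dense, matching training
print("spam" if model.predict(X)[0] == 1 else "ham")
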
training/train_model_lite.py
ADDED
@@ -0,0 +1,212 @@
import os
import pandas as pd
import nltk, string, logging, pickle
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Download required NLTK data (once; the original file downloaded the same
# corpora twice, both bare and inside this try block)
try:
    nltk.download('punkt')
    nltk.download('punkt_tab')  # needed by word_tokenize on newer NLTK releases
    nltk.download('wordnet')
    nltk.download('stopwords')
except Exception as e:
    logger.error(f"Failed to download NLTK data: {e}")

def improved_transform_text(text):
    try:
        lemmatizer = WordNetLemmatizer()
        # Build the stopword set once; a per-token stopwords.words() call is slow.
        stop_words = set(stopwords.words('english'))

        text = str(text).lower()
        words = nltk.word_tokenize(text)

        # isalnum() already excludes punctuation tokens.
        words = [lemmatizer.lemmatize(word) for word in words
                 if word.isalnum() and word not in stop_words]

        return " ".join(words)
    except Exception as e:
        logger.error(f"Error in text transformation: {e}")
        return text
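# Illustrative example (not in the original file): unlike the Porter stemmer in
# train_model_legacy.py, the lemmatizer keeps dictionary forms, e.g.
#   improved_transform_text("He was charging the batteries")
# yields roughly "charging battery" (nouns are lemmatized; verbs keep their
# surface form unless pos= is passed), where Porter stemming gives "charg batteri".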

def extract_features(df):
    try:
        df['text_length'] = df['text'].str.len()
        df['word_count'] = df['text'].str.split().str.len()
        df['unique_word_count'] = df['text'].apply(lambda x: len(set(str(x).split())))
        df['uppercase_count'] = df['text'].apply(lambda x: sum(1 for c in str(x) if c.isupper()))
        df['special_char_count'] = df['text'].apply(lambda x: sum(not c.isalnum() for c in str(x)))
        return df
    except Exception as e:
        logger.error(f"Error in feature extraction: {e}")
        return df

def create_optimized_ensemble():
    try:
        svc = SVC(kernel='rbf', C=10, gamma='auto', probability=True, random_state=42)
        mnb = MultinomialNB(alpha=0.1)
        etc = ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                   min_samples_split=2, random_state=42)

        estimators = [('svc', svc), ('mnb', mnb), ('etc', etc)]
        voting_clf = VotingClassifier(estimators=estimators,
                                      voting='soft',
                                      weights=[2, 1, 2])
        return voting_clf
    except Exception as e:
        logger.error(f"Error creating ensemble: {e}")
        raise
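# Note (added for clarity): with voting='soft' and weights=[2, 1, 2], the
# ensemble's class probability is the weighted average
#   P(c) = (2*P_svc(c) + 1*P_mnb(c) + 2*P_etc(c)) / 5
# and the argmax over classes is the prediction, so the SVC and ExtraTrees
# members each count double relative to Naive Bayes.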

def plot_dataset_insights(df):
    plt.figure(figsize=(15, 5))

    # Message length distribution
    plt.subplot(131)
    sns.histplot(data=df, x='text_length', hue='target', bins=50)
    plt.title('Message Length Distribution')

    # Class distribution
    plt.subplot(132)
    df['target'].value_counts().plot(kind='bar')
    plt.title('Class Distribution')

    # Word count distribution
    plt.subplot(133)
    sns.boxplot(data=df, x='target', y='word_count')
    plt.title('Word Count by Class')

    plt.tight_layout()
    plt.savefig('./graphs/dataset_insights.png')
    plt.close()

def plot_word_clouds(df):
    from wordcloud import WordCloud

    plt.figure(figsize=(15, 5))

    for idx, label in enumerate(['ham', 'spam']):
        text = ' '.join(df[df['target'] == label]['transformed_text'])
        wordcloud = WordCloud(width=800, height=400).generate(text)

        plt.subplot(1, 2, idx + 1)
        plt.imshow(wordcloud)
        plt.axis('off')
        plt.title(f'Word Cloud - {label.upper()}')

    plt.savefig('./graphs/wordclouds.png')
    plt.close()

def plot_performance_metrics(y_test, y_pred, model):
    plt.figure(figsize=(15, 5))

    # Confusion matrix
    plt.subplot(131)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')

    # Classification report visualization
    plt.subplot(132)
    report = classification_report(y_test, y_pred, output_dict=True)
    sns.heatmap(pd.DataFrame(report).iloc[:-1, :].T, annot=True, cmap='RdYlGn')
    plt.title('Classification Report')

    # Feature importance (from the ExtraTreesClassifier member)
    plt.subplot(133)
    etc = model.named_estimators_['etc']
    importances = pd.Series(etc.feature_importances_)
    importances.nlargest(10).plot(kind='bar')
    plt.title('Top 10 Important Features')

    plt.tight_layout()
    plt.savefig('./graphs/performance_metrics.png')
    plt.close()

def save_metrics(metrics):
    with open('./models/metrics.txt', 'w') as f:
        for metric, value in metrics.items():
            f.write(f"{metric}: {value:.4f}\n")

def main():
    try:
        # Output directories must exist before the savefig/pickle calls below.
        os.makedirs('./graphs', exist_ok=True)
        os.makedirs('./models', exist_ok=True)

        # Load and preprocess data
        df = pd.read_csv('./data/spam.csv', encoding='latin-1')
        df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, errors='ignore')
        df = df.rename(columns={'v1': 'target', 'v2': 'text'})

        logger.info("Preprocessing text...")
        df['transformed_text'] = df['text'].apply(improved_transform_text)
        df = extract_features(df)

        logger.info("Generating dataset insights...")
        plot_dataset_insights(df)
        plot_word_clouds(df)

        # Vectorization with optimized parameters
        tfidf = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 3),
            min_df=2,
            max_df=0.95
        )

        X = tfidf.fit_transform(df['transformed_text'])
        y = (df['target'] == 'spam').astype(int)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        logger.info("Training model...")
        model = create_optimized_ensemble()
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        metrics = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "F1": f1_score(y_test, y_pred)
        }

        # Save metrics to file
        save_metrics(metrics)

        for metric, value in metrics.items():
            logger.info(f"{metric}: {value:.4f}")

        plot_performance_metrics(y_test, y_pred, model)

        with open('./models/vectorizer_optimized.pkl', 'wb') as f:
            pickle.dump(tfidf, f)
        with open('./models/model_optimized.pkl', 'wb') as f:
            pickle.dump(model, f)

        logger.info(f"Training completed. Metrics:\n{metrics}")

    except Exception as e:
        logger.error(f"An error occurred: {e}")
        raise

if __name__ == "__main__":
    main()
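
The lite variant widens the vectorizer to 5,000 features over unigram-to-trigram n-grams, while min_df=2 drops terms seen in only one message and max_df=0.95 drops near-ubiquitous ones. A tiny sketch (toy corpus, illustrative only) of how ngram_range=(1, 3) expands the vocabulary:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["free prize now", "call now free"]
vec = TfidfVectorizer(ngram_range=(1, 3))
vec.fit(docs)
print(sorted(vec.get_feature_names_out()))
# ['call', 'call now', 'call now free', 'free', 'free prize', 'free prize now',
#  'now', 'now free', 'prize', 'prize now']
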
training/train_model_mbo.py
ADDED
@@ -0,0 +1,265 @@
import os
import numpy as np
import pandas as pd
import string, logging, pickle, torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score
from torch.cuda import is_available as cuda_available

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class MonarchButterflyOptimizer:
    def __init__(self, bounds, n_butterflies=20, p_period=1.2, migration_ratio=0.85, max_iter=30, use_gpu=False):
        self.bounds = bounds
        self.n_butterflies = n_butterflies
        self.p_period = p_period
        self.migration_ratio = migration_ratio
        self.max_iter = max_iter
        self.best_solution = None
        self.best_fitness = float('-inf')

        # GPU setup
        self.use_gpu = use_gpu and cuda_available()
        self.device = torch.device('cuda' if self.use_gpu else 'cpu')
        logger.info(f"Using device: {self.device}")

    def initialize(self):
        """Sample an initial population uniformly within the per-parameter bounds."""
        try:
            population = []
            for _ in range(self.n_butterflies):
                butterfly = {}
                for param, (low, high) in self.bounds.items():
                    if isinstance(low, int) and isinstance(high, int):
                        butterfly[param] = int(torch.randint(low, high + 1, (1,), device=self.device).item())
                    else:
                        butterfly[param] = float(torch.rand(1, device=self.device).item() * (high - low) + low)
                population.append(butterfly)
            return population
        except RuntimeError as e:
            logger.error(f"CUDA error during initialization: {e}")
            self.device = torch.device('cpu')
            logger.info("Falling back to CPU")
            return self.initialize()

    def migration(self, population):
        """Move a random fraction of butterflies toward the current best solution."""
        try:
            new_population = []
            migration_tensor = torch.rand(len(population), device=self.device)

            for idx, butterfly in enumerate(population):
                if migration_tensor[idx].item() < self.migration_ratio:
                    new_butterfly = {}
                    for param in butterfly:
                        r = torch.rand(1, device=self.device).item()
                        new_val = butterfly[param] + self.p_period * r * (self.best_solution[param] - butterfly[param])
                        new_butterfly[param] = self.clip(new_val, param)
                    new_population.append(new_butterfly)
                else:
                    new_population.append(butterfly.copy())
            return new_population
        except RuntimeError as e:
            logger.error(f"CUDA error during migration: {e}")
            self.device = torch.device('cpu')
            logger.info("Falling back to CPU")
            return self.migration(population)

    def clip(self, value, param):
        low, high = self.bounds[param]
        if isinstance(low, int) and isinstance(high, int):
            return int(np.clip(value, low, high))
        return np.clip(value, low, high)

    def optimize(self, fitness_func):
        """Evaluate the population, track the best butterfly, and migrate for max_iter rounds."""
        population = self.initialize()

        for _ in range(self.max_iter):
            for butterfly in population:
                fitness = fitness_func(butterfly)
                if fitness > self.best_fitness:
                    self.best_fitness = fitness
                    self.best_solution = butterfly.copy()

            population = self.migration(population)

        return self.best_solution, self.best_fitness
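# Illustrative usage (not part of the pipeline): the optimizer maximizes a
# fitness function over the bounded parameter dict, e.g.
#   opt = MonarchButterflyOptimizer({'x': (-5.0, 5.0)}, n_butterflies=10, max_iter=20)
#   best, fit = opt.optimize(lambda b: -(b['x'] - 2.0) ** 2)  # best['x'] drifts toward 2.0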

def plot_dataset_insights(df):
    plt.figure(figsize=(15, 5))

    plt.subplot(131)
    sns.histplot(data=df, x='feature_length', hue='target', bins=50)
    plt.title('Message Length Distribution')

    plt.subplot(132)
    df['target'].value_counts().plot(kind='bar')
    plt.title('Class Distribution')

    plt.subplot(133)
    sns.boxplot(data=df, x='target', y='word_count')
    plt.title('Word Count by Class')

    plt.tight_layout()
    plt.savefig('./graphs/dataset_insights.png')
    plt.close()

def plot_word_clouds(df):
    from wordcloud import WordCloud
    plt.figure(figsize=(15, 5))

    for idx, label in enumerate(['ham', 'spam']):
        text = ' '.join(df[df['target'] == label]['transformed_text'])
        wordcloud = WordCloud(width=800, height=400).generate(text)

        plt.subplot(1, 2, idx + 1)
        plt.imshow(wordcloud)
        plt.axis('off')
        plt.title(f'Word Cloud - {label.upper()}')

    plt.savefig('./graphs/wordclouds.png')
    plt.close()

def plot_performance_metrics(y_test, y_pred, model):
    plt.figure(figsize=(15, 5))

    plt.subplot(131)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')

    plt.subplot(132)
    report = classification_report(y_test, y_pred, output_dict=True)
    sns.heatmap(pd.DataFrame(report).iloc[:-1, :].T, annot=True, cmap='RdYlGn')
    plt.title('Classification Report')

    plt.subplot(133)
    etc = model.named_estimators_['etc']
    importances = pd.Series(etc.feature_importances_)
    importances.nlargest(10).plot(kind='bar')
    plt.title('Top 10 Important Features')

    plt.tight_layout()
    plt.savefig('./graphs/performance_metrics.png')
    plt.close()

def save_metrics(metrics):
    with open('./models/metrics.txt', 'w') as f:
        for metric, value in metrics.items():
            f.write(f"{metric}: {value:.4f}\n")

def create_optimized_ensemble(X_train, y_train, mbo_params):
    param_bounds = {
        'svc_C': (0.1, 20.0),
        'svc_gamma': (0.001, 1.0),
        'mnb_alpha': (0.1, 2.0),
        'etc_n_estimators': (100, 300),
        'w1': (0, 5),
        'w2': (0, 5),
        'w3': (0, 5)
    }

    mbo = MonarchButterflyOptimizer(
        param_bounds,
        n_butterflies=int(mbo_params.get('n_butterflies', 20)),
        p_period=float(mbo_params.get('p_period', 1.2)),
        migration_ratio=float(mbo_params.get('migration_ratio', 0.85)),
        max_iter=int(mbo_params.get('max_iter', 30)),
        use_gpu=bool(mbo_params.get('use_gpu', False))
    )

    def fitness_function(params):
        svc = SVC(kernel='rbf', C=params['svc_C'],
                  gamma=params['svc_gamma'], probability=True)
        mnb = MultinomialNB(alpha=params['mnb_alpha'])
        etc = ExtraTreesClassifier(n_estimators=int(params['etc_n_estimators']))

        estimators = [('svc', svc), ('mnb', mnb), ('etc', etc)]
        weights = [params['w1'], params['w2'], params['w3']]
        if sum(weights) == 0:
            # All-zero weights would make the soft-voting average undefined.
            return float('-inf')

        clf = VotingClassifier(estimators=estimators, voting='soft', weights=weights)
        scores = cross_val_score(clf, X_train, y_train, cv=5)
        return np.mean(scores)

    # Run the configured optimizer. (The original code built a second
    # MonarchButterflyOptimizer here with default settings, which silently
    # discarded mbo_params.)
    best_params, _ = mbo.optimize(fitness_function)

    # Create the final model with the optimized parameters
    svc = SVC(kernel='rbf', C=best_params['svc_C'],
              gamma=best_params['svc_gamma'], probability=True)
    mnb = MultinomialNB(alpha=best_params['mnb_alpha'])
    etc = ExtraTreesClassifier(n_estimators=int(best_params['etc_n_estimators']))

    estimators = [('svc', svc), ('mnb', mnb), ('etc', etc)]
    weights = [best_params['w1'], best_params['w2'], best_params['w3']]

    return VotingClassifier(estimators=estimators, voting='soft', weights=weights)
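# Cost note (added): each fitness evaluation runs 5-fold cross-validation, so
# the defaults (20 butterflies x 30 iterations) imply roughly 20 * 30 * 5 = 3,000
# ensemble fits; lower n_butterflies/max_iter in mbo_params for quick runs.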

def main(mbo_params=None):
    try:
        # Output directories must exist before the savefig/pickle calls below.
        os.makedirs('./graphs', exist_ok=True)
        os.makedirs('./models', exist_ok=True)

        logger.info("Loading data...")
        # Load and preprocess data
        df = pd.read_csv('./data/spam.csv', encoding='latin-1')
        df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
        df = df.rename(columns={'v1': 'target', 'v2': 'text'})

        logger.info("Preprocessing text...")
        # Lightweight preprocessing: lowercase and strip punctuation (no NLTK here).
        df['transformed_text'] = df['text'].apply(
            lambda x: x.lower().translate(str.maketrans('', '', string.punctuation)))
        df['word_count'] = df['transformed_text'].str.split().str.len()
        df['feature_length'] = df['transformed_text'].apply(len)

        logger.info("Generating visualizations...")
        plot_dataset_insights(df)
        plot_word_clouds(df)

        tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
        X = tfidf.fit_transform(df['transformed_text'])
        y = (df['target'] == 'spam').astype(int)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        logger.info("Training model with MBO...")
        if mbo_params and mbo_params.get('use_gpu'):
            logger.info("GPU acceleration enabled")
        model = create_optimized_ensemble(X_train, y_train, mbo_params or {})

        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        metrics = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "F1": f1_score(y_test, y_pred)
        }

        save_metrics(metrics)
        for metric, value in metrics.items():
            logger.info(f"{metric}: {value:.4f}")

        plot_performance_metrics(y_test, y_pred, model)

        logger.info("Saving models...")
        with open('./models/vectorizer_mbo.pkl', 'wb') as f:
            pickle.dump(tfidf, f)
        with open('./models/model_mbo.pkl', 'wb') as f:
            pickle.dump(model, f)

        logger.info("MBO optimization completed successfully")

    except Exception as e:
        logger.error(f"An error occurred: {e}")
        raise

if __name__ == "__main__":
    main()