Spaces:
Sleeping
Sleeping
File size: 5,512 Bytes
052e8a9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.utils import shuffle
from sklearn.model_selection import learning_curve
import gender_guesser.detector as gender
def read_datasets(genuine_path="data/users.csv", fake_path="data/fusers.csv"):
    """Load genuine and fake user profiles and build their labels.

    Args:
        genuine_path: CSV file holding the genuine user profiles.
        fake_path: CSV file holding the fake user profiles.

    Returns:
        A tuple ``(x, y)``. ``x`` is a DataFrame with the genuine rows
        followed by the fake rows, and ``y`` is a list of labels aligned
        with those rows (1 = genuine, 0 = fake).
    """
    genuine_users = pd.read_csv(genuine_path)
    fake_users = pd.read_csv(fake_path)
    # ignore_index gives every row a unique label; without it both frames
    # keep their own row labels, so the combined frame can contain
    # duplicates and label-based lookups become ambiguous.
    x = pd.concat([genuine_users, fake_users], ignore_index=True)
    y = [1] * len(genuine_users) + [0] * len(fake_users)
    return x, y
def predict_sex(names):
    """Encode the likely sex of each user from the first word of their name.

    Args:
        names: Iterable of display names. Entries that are not strings
            (e.g. the NaN pandas produces for an empty CSV cell) or that
            contain no visible characters are treated as unknown.

    Returns:
        A list with one integer per input name: 2 when the detector answers
        'female', 1 when it answers 'male', and 0 for anything else.
    """
    sex_predictor = gender.Detector(case_sensitive=False)
    # Only the two unambiguous answers get their own code; every other
    # detector answer (including the 'mostly_*' ones) falls back to 0.
    codes_by_sex = {'female': 2, 'male': 1}
    sex_code = []
    for name in names:
        # split() without an argument tolerates leading or repeated
        # whitespace, which split(' ') would turn into an empty first token.
        tokens = name.split() if isinstance(name, str) else []
        if not tokens:
            sex_code.append(0)
            continue
        sex = sex_predictor.get_gender(tokens[0])
        sex_code.append(codes_by_sex.get(sex, 0))
    return sex_code
def extract_features(x):
    """Build the feature matrix used by the classifier.

    Args:
        x: DataFrame of user profiles. It must contain a 'name' column and
            the five count columns selected below.

    Returns:
        A new DataFrame holding the five count columns followed by
        'sex_code'. The input frame is left unmodified.
    """
    count_columns = ['statuses_count', 'followers_count', 'friends_count',
                     'favourites_count', 'listed_count']
    # Work on a copy so the caller's frame does not silently gain a column.
    features = x[count_columns].copy()
    features['sex_code'] = predict_sex(x['name'])
    return features
# Rest of your code...
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation scores against training-set size.

    Opens a new figure, runs ``learning_curve`` on the estimator, and draws
    each mean score as a line with a shaded band one standard deviation wide
    on either side.

    Args:
        estimator: Estimator handed to ``learning_curve``.
        title: Figure title.
        X: Feature matrix.
        y: Labels.
        ylim: Optional ``(low, high)`` pair applied to the y axis.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        n_jobs: Parallelism setting forwarded to ``learning_curve``.
        train_sizes: Training-set sizes forwarded to ``learning_curve``.

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can call ``show()``.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One entry per curve: (mean, std, colour, legend label).
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1),
         "g", "Cross-validation score"),
    ]

    plt.grid()
    # Shaded bands first, then the lines, in the same order for both curves.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1,
                         color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as a colour-mapped image with a colour bar.

    Both axes are ticked with the labels 'Fake' and 'Genuine'. The function
    draws through ``pyplot`` and does not call ``show()`` itself.

    Args:
        cm: Confusion matrix to display.
        title: Title placed above the image.
        cmap: Colour map passed to ``imshow``.
    """
    class_labels = ['Fake', 'Genuine']
    positions = np.arange(len(class_labels))

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # Same tick positions on both axes; only the x labels are rotated.
    plt.xticks(positions, class_labels, rotation=45)
    plt.yticks(positions, class_labels)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
def plot_roc_curve(y_test, y_pred):
    """Print the ROC points, then plot the ROC curve with its AUC and show it.

    Args:
        y_test: True labels.
        y_pred: Predicted values passed to ``roc_curve`` as the scores.
    """
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    print("False Positive rate: ", fpr)
    print("True Positive rate: ", tpr)

    area = auc(fpr, tpr)
    curve_label = 'AUC = %0.2f' % area

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label=curve_label)
    plt.legend(loc='lower right')
    # Dashed diagonal as a reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'r--')

    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
def train(X_train, y_train, X_test, y_test=None):
    """Fit a Random Forest, report cross-validation scores, and predict X_test.

    Side effects: prints the fitted classifier and its 5-fold scores, shows
    the learning-curve figure, and pickles the fitted classifier to
    'data.pkl' in the working directory.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Feature matrix to predict.
        y_test: True labels for ``X_test``, returned unchanged next to the
            predictions. When omitted, the module-level ``y_test`` is used,
            which is what this function relied on before the parameter
            existed.

    Returns:
        A tuple ``(y_test, y_pred)``.

    Raises:
        NameError: If ``y_test`` is omitted and no module-level ``y_test``
            exists.
    """
    import pickle

    if y_test is None:
        # Backward compatibility for callers that pass three arguments and
        # expect the global test labels to be handed back.
        try:
            y_test = globals()['y_test']
        except KeyError:
            raise NameError("name 'y_test' is not defined") from None

    clf = RandomForestClassifier(n_estimators=40, oob_score=True)
    clf.fit(X_train, y_train)
    print("The best classifier is: ", clf)

    # Estimate score
    scores = cross_val_score(clf, X_train, y_train, cv=5)
    print(scores)
    # Report the mean with a band of two standard deviations; the earlier
    # "/ 2" halved the spread and understated the variation between folds.
    print('Estimated score: %0.5f (+/- %0.5f)' % (scores.mean(), scores.std() * 2))

    title = 'Learning Curves (Random Forest)'
    plot_learning_curve(clf, title, X_train, y_train, cv=5)
    plt.show()

    # Predict
    y_pred = clf.predict(X_test)

    # Persist the fitted model.
    with open('data.pkl', 'wb') as file:
        pickle.dump(clf, file)

    return y_test, y_pred
# Script body: load the profiles, derive features, train the classifier and
# report its performance on the held-out 20% split.
print("Reading datasets...\n")
x, y = read_datasets()
# A bare expression is only echoed in a notebook; print it so the script
# shows the raw-data summary as well.
print(x.describe())

print("Extracting features...\n")
x = extract_features(x)
print(x.columns)
print(x.describe())

print("Splitting datasets into train and test dataset...\n")
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=44)

print("Training datasets...\n")
y_test, y_pred = train(X_train, y_train, X_test)
print('Classification Accuracy on Test dataset: ', accuracy_score(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix, without normalization')
print(cm)
# Give each plot its own figure; otherwise both matrices and the ROC curve
# are drawn on top of each other in a single figure.
plt.figure()
plot_confusion_matrix(cm)

# Normalise each row by the number of true samples in that class.
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Normalized confusion matrix')
print(cm_normalized)
plt.figure()
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')

print(classification_report(y_test, y_pred, target_names=['Fake', 'Genuine']))
# NOTE(review): y_pred holds hard class labels, so the ROC curve has a single
# operating point; class probabilities would give a fuller curve, but
# train() does not return them.
plt.figure()
plot_roc_curve(y_test, y_pred)