Mekam committed on
Commit
913f112
·
verified ·
1 Parent(s): 13dff20

Create salary_prediction

Browse files
Files changed (1) hide show
  1. salary_prediction +62 -0
salary_prediction ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
##### RUN SCRIPT #######
# Trains a degree-2 polynomial regression on the salary dataset and prints the
# R^2 score obtained on the training and test splits.
#
# NOTE(review): the imputer, one-hot encoder and label encoders below are all
# fitted on the full dataset *before* the train/test split, so information from
# the test rows leaks into preprocessing. Consider fitting them on the training
# split only (e.g. inside a sklearn Pipeline).

import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from joblib import dump
import os

data = pd.read_csv("src/datasets/Salary Prediction of Data Professions.csv")

# Features: every column except the one at position 7 (the target), then drop
# positions 0, 1 and 3 of what remains. Target: the column at position 7.
X = data.iloc[:, data.columns != data.columns[7]].values
X = X[:, [i for i in range(X.shape[1]) if i not in [0, 1, 3]]]
Y = data.iloc[:, 7].values
print("initial array", X[0])

# Replace NaN in feature columns 3, 5, 6 and 7 with the column mean.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, [3, 5, 6, 7]])
X[:, [3, 5, 6, 7]] = imputer.transform(X[:, [3, 5, 6, 7]])
print("after managing missing values", X[0])

# Column 1 holds dd-mm-YYYY dates; turn them into integer timestamps.
# The floor division by 10**9 yields whole seconds assuming the parsed
# datetimes have nanosecond resolution -- TODO confirm for the pandas version
# in use.
X[:, 1] = pd.to_datetime(X[:, 1], format='%d-%m-%Y').astype('int64') // 10**9
print("after modifying date to timestamp", X[0])


# One-hot encode column 0 (presumably the sex column, per the message printed
# below); the encoded columns come first and the other columns are passed
# through unchanged after them.
ct = ColumnTransformer(transformers=[('encode', OneHotEncoder(), [0])], remainder='passthrough')
X = ct.fit_transform(X)
X = np.array(X)
print("encoding the sex M and F", X[0])


# Integer-encode columns 3 and 5 of the one-hot-expanded array (presumably the
# position and department columns, per the message printed below -- verify the
# indices against the number of one-hot columns actually produced).
label_encoder_2 = LabelEncoder()
label_encoder_4 = LabelEncoder()
X[:, 3] = label_encoder_2.fit_transform(X[:, 3])
X[:, 5] = label_encoder_4.fit_transform(X[:, 5])
print("encoding position and departement", X[0])

# 80/20 split with a fixed seed so runs are reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


# Standardise the target using statistics from the training split only.
scaler = StandardScaler()
Y_train = scaler.fit_transform(Y_train.reshape(-1, 1)).ravel()
Y_test = scaler.transform(Y_test.reshape(-1, 1)).ravel()
print("Y train", Y_train[0])

polynomial = LinearRegression()
poly_reg = PolynomialFeatures(degree=2)
X_train_poly = poly_reg.fit_transform(X_train)
# Only transform the test split: the feature expansion must be the one fitted
# on the training data, never re-fitted on test data.
X_test_poly = poly_reg.transform(X_test)

polynomial_model = polynomial.fit(X_train_poly, Y_train)

# LinearRegression.score returns the coefficient of determination (R^2), not a
# classification accuracy, despite the variable names.
poly_train_accuracy = polynomial_model.score(X_train_poly, Y_train)
poly_test_accuracy = polynomial_model.score(X_test_poly, Y_test)


print('poly_train_accuracy', poly_train_accuracy)
print('poly_test_accuracy', poly_test_accuracy)