File size: 2,067 Bytes
913f112 dda36fd 913f112 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
##### RUN SCRIPT #######
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from joblib import dump
import os
# Read the raw dataset; the path is relative to the current working directory.
data = pd.read_csv("Salary Prediction of Data Professions.csv")

# The column at position 7 is the regression target.
Y = data.iloc[:, 7].values

# Features start as every column whose label differs from the target's label.
is_feature = data.columns != data.columns[7]
X = data.iloc[:, is_feature].values

# Then positions 0, 1 and 3 of that reduced matrix are discarded. A boolean
# mask selects exactly the columns the original index list would have kept.
# NOTE(review): presumably these are identifier-like columns - confirm
# against the CSV header.
keep = ~np.isin(np.arange(X.shape[1]), [0, 1, 3])
X = X[:, keep]
print("initial array",X[0])
# Replace NaN entries in feature columns 3, 5, 6 and 7 with the column mean.
# Fitting and transforming operate on the same slice, so a single
# fit_transform call is equivalent to fit followed by transform.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split performed later in the script.
imputed_columns = [3, 5, 6, 7]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, imputed_columns] = imputer.fit_transform(X[:, imputed_columns])
print("after managing missing values",X[0])

# Parse column 1 as day-month-year dates and store them as integers:
# the datetime values are cast to int64 and floor-divided by 10**9
# (presumably nanoseconds -> whole seconds since the epoch; this depends on
# the datetime resolution pandas uses).
parsed_dates = pd.to_datetime(X[:, 1], format='%d-%m-%Y')
X[:, 1] = parsed_dates.astype('int64') // 10**9
print("after modifying date to timestamp",X[0])
# One-hot encode feature column 0 and pass every other column through
# unchanged. The result is copied into a plain NumPy array.
ct = ColumnTransformer(
    transformers=[('encode', OneHotEncoder(), [0])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))
print("encoding the sex M and F",X[0])

# Integer-encode the two remaining categorical columns in place, one
# LabelEncoder per column so each keeps its own class mapping.
# NOTE(review): indices 3 and 5 refer to the layout *after* one-hot encoding;
# they assume column 0 expanded into exactly two indicator columns - confirm.
label_encoder_2 = LabelEncoder()
label_encoder_4 = LabelEncoder()
for _encoder, _column in ((label_encoder_2, 3), (label_encoder_4, 5)):
    X[:, _column] = _encoder.fit_transform(X[:, _column])
print("encoding position and departement",X[0])
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Standardise the target. StandardScaler expects 2-D input, so each target
# vector is reshaped to a single column and flattened again afterwards.
# The scaler is fitted on the training targets only and then reused, as is,
# for the test targets.
scaler = StandardScaler()
train_target_column = Y_train.reshape(-1, 1)
test_target_column = Y_test.reshape(-1, 1)
scaler.fit(train_target_column)
Y_train = scaler.transform(train_target_column).ravel()
Y_test = scaler.transform(test_target_column).ravel()
print("Y train",Y_train[0])
# Degree-2 polynomial regression: expand the features with PolynomialFeatures,
# then fit an ordinary least-squares model on the expanded training matrix.
polynomial = LinearRegression()
poly_reg = PolynomialFeatures(degree=2)

# Fit the feature expansion on the training data only. The test data must go
# through `transform`, not `fit_transform`, so the fitted transformer is
# reused rather than re-fitted on held-out rows.
X_train_poly = poly_reg.fit_transform(X_train)
X_test_poly = poly_reg.transform(X_test)

# `fit` returns the estimator itself, so `polynomial_model` and `polynomial`
# refer to the same fitted object.
polynomial_model = polynomial.fit(X_train_poly, Y_train)

# For a regressor, `score` is the coefficient of determination (R^2), not a
# classification accuracy; the variable names are kept as they were.
poly_train_accuracy = polynomial_model.score(X_train_poly, Y_train)
poly_test_accuracy = polynomial_model.score(X_test_poly, Y_test)
print('poly_train_accuracy',poly_train_accuracy)
print('poly_test_accuracy',poly_test_accuracy)