|
|
|
|
|
|
|
import pandas as pd |
|
from sklearn.impute import SimpleImputer |
|
import numpy as np |
|
from sklearn.compose import ColumnTransformer |
|
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler,PolynomialFeatures |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.linear_model import LinearRegression |
|
from joblib import dump |
|
import os |
|
|
|
# Load the raw dataset; the path is resolved relative to the working directory.
data = pd.read_csv("Salary Prediction of Data Professions.csv")

# The column at position 7 is the regression target; all remaining columns
# are candidate features.
target_label = data.columns[7]
feature_matrix = data.iloc[:, data.columns != target_label].values

# Discard feature positions 0, 1 and 3. These positions are counted AFTER the
# target column has been removed from the frame.
keep_mask = ~np.isin(np.arange(feature_matrix.shape[1]), [0, 1, 3])
X = feature_matrix[:, keep_mask]

Y = data.iloc[:, 7].values

print("initial array", X[0])
|
|
|
# Replace NaN entries in feature columns 3, 5, 6 and 7 with the per-column mean.
# NOTE(review): the imputer is fitted on the whole dataset, i.e. before the
# train/test split performed later in this script, so test rows contribute to
# the imputed means. Confirm whether that leakage is acceptable.
imputed_columns = [3, 5, 6, 7]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, imputed_columns] = imputer.fit_transform(X[:, imputed_columns])

print("after managing missing values", X[0])
|
|
|
# Column 1 holds dates written as day-month-year. Parse them, view the result
# as integer nanoseconds since the epoch, and floor-divide down to seconds.
parsed_dates = pd.to_datetime(X[:, 1], format='%d-%m-%Y')
epoch_nanoseconds = parsed_dates.astype('int64')
X[:, 1] = epoch_nanoseconds // 10**9

print("after modifying date to timestamp", X[0])
|
|
|
|
|
# One-hot encode column 0 (the sex column, per the log label below). The
# remaining columns are passed through unchanged and end up to the right of
# the generated indicator columns, so later column indices are shifted.
one_hot = OneHotEncoder()
ct = ColumnTransformer(
    transformers=[('encode', one_hot, [0])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

print("encoding the sex M and F", X[0])
|
|
|
|
|
# Map the categorical values in columns 3 and 5 (indices taken after the
# one-hot step) to integer codes, using one dedicated encoder per column so
# each fitted mapping stays available afterwards.
label_encoder_2 = LabelEncoder()
label_encoder_4 = LabelEncoder()

for column_index, encoder in ((3, label_encoder_2), (5, label_encoder_4)):
    X[:, column_index] = encoder.fit_transform(X[:, column_index])

print("encoding position and departement", X[0])
|
|
|
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Standardise the target. The scaler learns its statistics from the training
# targets only and is then applied to both partitions. It expects a 2-D
# input, hence the reshape to a single column and the flattening afterwards.
scaler = StandardScaler()
scaler.fit(Y_train.reshape(-1, 1))
Y_train = scaler.transform(Y_train.reshape(-1, 1)).ravel()
Y_test = scaler.transform(Y_test.reshape(-1, 1)).ravel()

print("Y train", Y_train[0])
|
|
|
# Degree-2 polynomial regression: expand the features with PolynomialFeatures
# and fit an ordinary least-squares model on the expanded matrix.
polynomial = LinearRegression()
poly_reg = PolynomialFeatures(degree=2)

# Fit the expander on the training features only, then reuse the fitted
# transformer for the test features. The test partition must never be used
# to (re)fit any preprocessing step.
X_train_poly = poly_reg.fit_transform(X_train)
X_test_poly = poly_reg.transform(X_test)

polynomial_model = polynomial.fit(X_train_poly, Y_train)

# LinearRegression.score returns the coefficient of determination (R^2), not
# a classification accuracy; the variable names and log labels are kept as-is
# for compatibility with existing output.
poly_train_accuracy = polynomial_model.score(X_train_poly, Y_train)
poly_test_accuracy = polynomial_model.score(X_test_poly, Y_test)

print('poly_train_accuracy', poly_train_accuracy)
print('poly_test_accuracy', poly_test_accuracy)