import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Load dataset
file_path = "CAR/CTP_Model1.csv"
data = pd.read_csv(file_path, low_memory=False)

# Function to remove outliers using the IQR rule
def remove_outliers_iqr(df, column, multiplier=1.5):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Remove outliers and unrealistic prices
data = remove_outliers_iqr(data, 'price', multiplier=2)
data = data[data['price'] > 100]

# Feature engineering
def create_features(df):
    df = df.copy()
    current_year = 2024
    df['age'] = current_year - df['year']
    df['age_squared'] = df['age'] ** 2
    # Cap mileage-per-year to limit the influence of odometer outliers; +1 avoids division by zero
    df['mileage_per_year'] = np.clip(df['odometer'] / (df['age'] + 1), 0, 200000)
    return df

data = create_features(data)

# Handle categorical features
categorical_features = ['make', 'model', 'condition', 'fuel', 'title_status',
                        'transmission', 'drive', 'size', 'type', 'paint_color']
label_encoders = {}
encoding_dict = {}  # To save mappings for the app

for feature in categorical_features:
    if feature in data.columns:
        le = LabelEncoder()
        # Cast to string so missing values encode consistently instead of raising on mixed types
        data[feature] = le.fit_transform(data[feature].astype(str))
        label_encoders[feature] = le
        # Save mapping for later use
        encoding_dict[feature] = dict(zip(le.classes_, le.transform(le.classes_)))

# Save the encoding dictionary to a CSV (keep the index so the class labels survive alongside their codes)
encoding_df = pd.DataFrame.from_dict(encoding_dict, orient='index').transpose()
encoding_df.to_csv("categorical_encodings.csv")

# Prepare features and labels (only keep categorical columns that actually exist in the data)
numeric_features = ['year', 'odometer', 'age', 'age_squared', 'mileage_per_year']
features = numeric_features + [f for f in categorical_features if f in data.columns]
X = data[features]
y = np.log1p(data['price'])  # Log-transform the price for better model performance

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with scaling and regression
model = Pipeline([
    ('scaler', RobustScaler()),
    ('regressor', RandomForestRegressor(
        n_estimators=300,
        max_depth=25,
        random_state=42,
        n_jobs=-1))
])

# Train the model
model.fit(X_train, y_train)

# Evaluate the model (metrics are computed on the log1p(price) scale)
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")

# Save the model (the categorical encodings were exported to CSV above)
joblib.dump(model, "car_price_modelv3.pkl")
print("Model saved successfully.")

viz_path = '/Users/estebanm/Desktop/carShopping_tool/CAR/visualizations'
os.makedirs(viz_path, exist_ok=True)

# 1. Price Distribution Plot
plt.figure(figsize=(10, 6))
sns.histplot(data=data, x='price', bins=50)
plt.title('Price Distribution')
plt.savefig(os.path.join(viz_path, 'price_distribution_plot.png'))
plt.close()
# 2. Actual vs Predicted Plot (back-transform from the log scale for readability)
actual_prices = np.expm1(y_test)
predicted_prices = np.expm1(y_pred)
plt.figure(figsize=(10, 6))
plt.scatter(actual_prices, predicted_prices, alpha=0.5)
plt.plot([actual_prices.min(), actual_prices.max()],
         [actual_prices.min(), actual_prices.max()], 'r--')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Prices')
plt.savefig(os.path.join(viz_path, 'actual_vs_predicted_scatter.png'))
plt.close()

# 3. Feature Importance Plot
feature_importance = model.named_steps['regressor'].feature_importances_
feature_names = features  # same order as the columns the pipeline was trained on
plt.figure(figsize=(12, 6))
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
importance_df = importance_df.sort_values('importance', ascending=True)
plt.barh(importance_df['feature'], importance_df['importance'])
plt.title('Feature Importance')
plt.savefig(os.path.join(viz_path, 'feature_importance_plot.png'))
plt.close()

# 4. Residuals Distribution Plot
residuals = actual_prices - predicted_prices
plt.figure(figsize=(10, 6))
sns.histplot(residuals, bins=50)
plt.title('Residuals Distribution')
plt.xlabel('Residuals')
plt.savefig(os.path.join(viz_path, 'residuals_distribution_plot.png'))
plt.close()
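
# ---------------------------------------------------------------------------
# Example usage (sketch): score a single new listing with the trained pipeline.
# The field values below are hypothetical and assume the listing only uses
# labels that appeared in the training data; unseen labels would need a
# fallback before calling le.transform. In the app, the pipeline can instead
# be reloaded with joblib.load("car_price_modelv3.pkl").
# ---------------------------------------------------------------------------
new_listing = {
    'year': 2018, 'odometer': 45000, 'make': 'toyota', 'model': 'camry',
    'condition': 'good', 'fuel': 'gas', 'title_status': 'clean',
    'transmission': 'automatic', 'drive': 'fwd', 'size': 'mid-size',
    'type': 'sedan', 'paint_color': 'white',
}
new_df = create_features(pd.DataFrame([new_listing]))

# Encode categoricals with the fitted LabelEncoders (same string cast as training)
for feature, le in label_encoders.items():
    if feature in new_df.columns:
        new_df[feature] = le.transform(new_df[feature].astype(str))

# Predict on the same feature columns and undo the log1p transform
predicted_log_price = model.predict(new_df[features])
print(f"Estimated price: ${np.expm1(predicted_log_price[0]):,.0f}")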