# Streamlit app: explore the California Housing dataset and compare simple and
# multilinear regression models for the median house value (MedHouseVal).
# Run locally with: streamlit run <this_script>.py
import streamlit as st
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error, r2_score

# Load the data
california = fetch_california_housing()
df = pd.DataFrame(california.data, columns=california.feature_names)
df['MedHouseVal'] = california.target

# Prepare the data for the model
X = df[['MedInc']]
y = df['MedHouseVal']

# Pairplot to visualize relationships between features and the target
# (drawn on a random sample to keep rendering fast; plt.show() only has an
# effect when the file is run as a plain Python script, not under Streamlit)
sns.pairplot(df.sample(1000, random_state=42))
plt.show()

# Scatter plots of specific features against the target variable
features = ['MedInc', 'AveRooms', 'AveOccup', 'HouseAge']
for feature in features:
    plt.figure(figsize=(6, 4))
    plt.scatter(df[feature], df['MedHouseVal'], alpha=0.3)
    plt.title(f'MedHouseVal vs {feature}')
    plt.xlabel(feature)
    plt.ylabel('MedHouseVal')
    plt.show()

# Step 5
# Select the predictor and target variable
X = df[['MedInc']]
y = df['MedHouseVal']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training and testing data split done.")

# Steps 6, 7 and 8
# Linear regression model
model = LinearRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Plot the regression line
plt.figure(figsize=(8, 6))
plt.scatter(X_test, y_test, color='blue', alpha=0.3, label='Actual')
plt.plot(X_test, y_pred, color='red', linewidth=2, label='Predicted')
plt.title('Simple Linear Regression: MedInc vs MedHouseVal')
plt.xlabel('MedInc')
plt.ylabel('MedHouseVal')
plt.legend()
plt.show()

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the sizes of the training and testing sets
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

# Create the linear regression model
model = LinearRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

# Print the coefficients
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate RMSE and R-squared
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared: {r2}")

# Scatter plot of actual vs. predicted values (single-predictor model)
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue', alpha=0.3)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], linestyle='--', color='green', lw=2)
plt.title('Simple Linear Regression: Actual vs. Predicted MedHouseVal')
plt.xlabel('Actual MedHouseVal')
plt.ylabel('Predicted MedHouseVal')
plt.show()
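# Illustrative sanity check (uses y_test and y_pred from the fit above): recompute
# RMSE and R-squared directly with numpy to confirm the sklearn metric values.
manual_rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))
ss_res = np.sum((y_test - y_pred) ** 2)
ss_tot = np.sum((y_test - y_test.mean()) ** 2)
manual_r2 = 1 - ss_res / ss_tot
print(f"Manual RMSE check: {manual_rmse}")
print(f"Manual R-squared check: {manual_r2}")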
# Compare the simple and multilinear models using their RMSE and R-squared values

# Simple Linear Regression
# Select a single predictor
X_single = df[['MedInc']]
y = df['MedHouseVal']

# Split the data into training and testing sets
X_train_single, X_test_single, y_train_single, y_test_single = train_test_split(X_single, y, test_size=0.2, random_state=42)

# Create the linear regression model
model_single = LinearRegression()

# Fit the model on the training data
model_single.fit(X_train_single, y_train_single)

# Make predictions on the test data
y_pred_single = model_single.predict(X_test_single)

# Evaluate the model
mse_single = mean_squared_error(y_test_single, y_pred_single)
rmse_single = np.sqrt(mse_single)
r2_single = r2_score(y_test_single, y_pred_single)
print(f"Simple Linear Regression - RMSE: {rmse_single}")
print(f"Simple Linear Regression - R-squared: {r2_single}")

# Multilinear Regression
# Select multiple predictors
X_multi = df[['MedInc', 'AveRooms', 'HouseAge', 'AveOccup']]
y = df['MedHouseVal']

# Split the data into training and testing sets
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X_multi, y, test_size=0.2, random_state=42)

# Create the linear regression model
model_multi = LinearRegression()

# Fit the model on the training data
model_multi.fit(X_train_multi, y_train_multi)

# Make predictions on the test data
y_pred_multi = model_multi.predict(X_test_multi)

# Evaluate the model
mse_multi = mean_squared_error(y_test_multi, y_pred_multi)
rmse_multi = np.sqrt(mse_multi)
r2_multi = r2_score(y_test_multi, y_pred_multi)
print(f"Multilinear Regression - RMSE: {rmse_multi}")
print(f"Multilinear Regression - R-squared: {r2_multi}")

# Residual plot for multilinear regression
residuals = y_test_multi - y_pred_multi
plt.figure(figsize=(8, 6))
plt.scatter(y_pred_multi, residuals, color='blue', alpha=0.3)
plt.hlines(y=0, xmin=y_pred_multi.min(), xmax=y_pred_multi.max(), colors='red', linestyles='--', lw=2)
plt.title('Residual Plot: Multilinear Regression')
plt.xlabel('Predicted MedHouseVal')
plt.ylabel('Residuals')
plt.show()

# Save the model
with open("linear_regression_model.pkl", "wb") as file:
    pickle.dump(model, file)

# Load the model
with open("linear_regression_model.pkl", "rb") as file:
    model = pickle.load(file)
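# Illustrative round-trip check: the reloaded model should reproduce the predictions
# made before saving (model, X_test and y_pred come from the single-predictor fit above).
assert np.allclose(model.predict(X_test), y_pred), "Reloaded model predictions differ"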
# Sidebar for user input features
st.sidebar.header('User Input Features')
selected_feature = st.sidebar.selectbox('Select feature for visualization', df.columns)
selected_target = st.sidebar.selectbox('Select target variable', df.columns, index=len(df.columns) - 1)  # default to MedHouseVal

st.write(df)

# Visualization of the selected feature
st.subheader(f'Distribution of {selected_feature}')
fig, ax = plt.subplots()
ax.hist(df[selected_feature], bins=30, edgecolor='black')
st.pyplot(fig)

# Scatter plot of the selected feature vs the selected target
st.subheader(f'Scatter plot of {selected_feature} vs {selected_target}')
fig_scatter, ax_scatter = plt.subplots()
ax_scatter.scatter(df[selected_feature], df[selected_target], alpha=0.3)
ax_scatter.set_xlabel(selected_feature)
ax_scatter.set_ylabel(selected_target)

# Optionally overlay a fitted regression line on the scatter plot
show_regression = st.checkbox('Show Regression Line')
if show_regression and selected_feature in df.columns and selected_target == 'MedHouseVal':
    X_feature = df[[selected_feature]]
    y_feature = df[selected_target]
    model_feature = LinearRegression()
    model_feature.fit(X_feature, y_feature)
    line = model_feature.predict(X_feature)
    ax_scatter.plot(df[selected_feature], line, color='red', linewidth=2)
st.pyplot(fig_scatter)

# Simple Linear Regression
X_single = df[['MedInc']]
y = df['MedHouseVal']
X_train_single, X_test_single, y_train_single, y_test_single = train_test_split(X_single, y, test_size=0.2, random_state=42)
model_single = LinearRegression()
model_single.fit(X_train_single, y_train_single)
y_pred_single = model_single.predict(X_test_single)
r2_single = r2_score(y_test_single, y_pred_single)

# Plot the regression line for simple linear regression
fig, ax = plt.subplots()
ax.scatter(X_test_single, y_test_single, color='blue', alpha=0.3, label='Actual')
ax.plot(X_test_single, y_pred_single, color='red', linewidth=2, label='Predicted')
ax.set_title('Simple Linear Regression: MedInc vs MedHouseVal')
ax.set_xlabel('MedInc')
ax.set_ylabel('MedHouseVal')
ax.legend()
st.pyplot(fig)

# Multilinear Regression
X_multi = df[['MedInc', 'AveRooms', 'HouseAge', 'AveOccup']]
y = df['MedHouseVal']
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X_multi, y, test_size=0.2, random_state=42)
model_multi = LinearRegression()
model_multi.fit(X_train_multi, y_train_multi)
y_pred_multi = model_multi.predict(X_test_multi)
r2_multi = r2_score(y_test_multi, y_pred_multi)

# Checkbox for the multilinear regression plot
show_multilinear_plot = st.checkbox('Show Multilinear Regression Plot')
if show_multilinear_plot:
    fig, ax = plt.subplots()
    ax.scatter(y_test_multi, y_pred_multi, color='blue', alpha=0.3)
    ax.plot([y_test_multi.min(), y_test_multi.max()], [y_test_multi.min(), y_test_multi.max()], linestyle='--', color='green', lw=2)
    ax.set_title('Multilinear Regression: Actual vs. Predicted MedHouseVal')
    ax.set_xlabel('Actual MedHouseVal')
    ax.set_ylabel('Predicted MedHouseVal')
    st.pyplot(fig)

# Compare R-squared values
st.subheader('R-squared Comparison')
st.write(f"Simple Linear Regression R-squared: {r2_single:.4f}")
st.write(f"Multilinear Regression R-squared: {r2_multi:.4f}")

# Prediction
st.subheader('Predict Median House Value')
input_values = {}
for feature in X_multi.columns:
    input_values[feature] = st.number_input(f'Enter {feature}', value=float(df[feature].mean()))

if st.button('Predict'):
    # Build a single-row DataFrame so the columns match the features the model was trained on
    input_data = pd.DataFrame([input_values])
    prediction = model_multi.predict(input_data)
    st.write(f'Predicted Median House Value: {prediction[0]}')
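# Minimal sketch (not wired into the app): the models above are refit on every
# Streamlit rerun; a cached helper like the one below could fit them once per
# session instead. Assumes Streamlit >= 1.18 for st.cache_resource; the function
# name fit_cached_model is illustrative only.
@st.cache_resource
def fit_cached_model(feature_cols: tuple) -> LinearRegression:
    """Fit a LinearRegression on the given feature columns and cache the result."""
    X_cached = df[list(feature_cols)]
    y_cached = df['MedHouseVal']
    X_tr, _, y_tr, _ = train_test_split(X_cached, y_cached, test_size=0.2, random_state=42)
    cached_model = LinearRegression()
    cached_model.fit(X_tr, y_tr)
    return cached_model

# Example usage (equivalent to model_multi above):
# cached_multi = fit_cached_model(('MedInc', 'AveRooms', 'HouseAge', 'AveOccup'))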