Spaces:
Sleeping
Sleeping
import streamlit as st | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from sklearn.linear_model import LinearRegression | |
from sklearn.model_selection import train_test_split | |
from sklearn.datasets import fetch_california_housing | |
import pickle | |
from sklearn import datasets | |
from sklearn.metrics import mean_squared_error, r2_score | |
# Fetch the California housing dataset and wrap it in a DataFrame whose
# columns are the dataset's feature names plus the target ('MedHouseVal').
california = fetch_california_housing()
df = pd.DataFrame(data=california.data, columns=california.feature_names)
df = df.assign(MedHouseVal=california.target)

# Model inputs: a one-column predictor frame and the target series.
y = df['MedHouseVal']
X = df[['MedInc']]
# Pair plot to visualise relationships between the main features and the
# target. pandas' scatter_matrix is used so no extra plotting dependency is
# needed, and the figure actually contains data before it is shown.
pairplot_columns = ['MedInc', 'AveRooms', 'AveOccup', 'HouseAge', 'MedHouseVal']

# Draw from a fixed-seed sample: a 5x5 grid over every row is slow to render
# and the overall relationships are already visible on a subset.
pairplot_sample = df[pairplot_columns].sample(n=min(len(df), 2000), random_state=42)

pd.plotting.scatter_matrix(pairplot_sample, figsize=(10, 8), alpha=0.3, diagonal='hist')
plt.show()
# One scatter figure per selected feature, each plotted against the target
# column 'MedHouseVal'.
features = ['MedInc', 'AveRooms', 'AveOccup', 'HouseAge']
for feature in features:
    _, feature_ax = plt.subplots(figsize=(6, 4))
    feature_ax.scatter(df[feature], df['MedHouseVal'], alpha=0.3)
    feature_ax.set_title(f'MedHouseVal vs {feature}')
    feature_ax.set_xlabel(feature)
    feature_ax.set_ylabel('MedHouseVal')
    plt.show()
# Step 5: pick the target and the single predictor, then hold out 20% of the
# rows for testing (fixed seed so the split is reproducible).
y = df['MedHouseVal']
X = df[['MedInc']]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Training and testing data split done.")
# Steps 6, 7 and 8: fit a linear regression on the training split, score it
# on the test split, and draw the fitted line over the test points.
model = LinearRegression()
model.fit(X_train, y_train)

# Test-set predictions and the error metrics derived from them.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

# Regression-line figure: blue points are actual values, red line is the fit.
_, regression_ax = plt.subplots(figsize=(8, 6))
regression_ax.scatter(X_test, y_test, color='blue', alpha=0.3, label='Actual')
regression_ax.plot(X_test, y_pred, color='red', linewidth=2, label='Predicted')
regression_ax.set_title('Simple Linear Regression: MedInc vs MedHouseVal')
regression_ax.set_xlabel('MedInc')
regression_ax.set_ylabel('MedHouseVal')
regression_ax.legend()
plt.show()
# Split the data into training (80%) and testing (20%) sets.
# NOTE: X is the single 'MedInc' column here, so the model fitted below is a
# one-predictor regression; the plot title reflects that.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the sizes of the training and testing sets
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

# Create the linear regression model and fit it on the training data
model = LinearRegression()
model.fit(X_train, y_train)

# Print the fitted parameters
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate RMSE and R-squared
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared: {r2}")

# Scatter plot of actual vs. predicted values with the y = x reference line.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue', alpha=0.3)
# Colour is given only once: combining a 'k--' format string with
# color='green' defines the colour twice and makes matplotlib warn.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], linestyle='--', lw=2, color='green')
plt.title('Simple Linear Regression: Actual vs. Predicted MedHouseVal')
plt.xlabel('Actual MedHouseVal')
plt.ylabel('Predicted MedHouseVal')
plt.show()
# Performance comparison, part 1: simple linear regression.
# A single predictor ('MedInc') is used to model 'MedHouseVal'.
y = df['MedHouseVal']
X_single = df[['MedInc']]

# 80/20 train/test split with a fixed seed.
(
    X_train_single,
    X_test_single,
    y_train_single,
    y_test_single,
) = train_test_split(X_single, y, test_size=0.2, random_state=42)

# Fit on the training rows, then predict the held-out rows.
model_single = LinearRegression()
model_single.fit(X_train_single, y_train_single)
y_pred_single = model_single.predict(X_test_single)

# RMSE and R-squared on the test split.
mse_single = mean_squared_error(y_test_single, y_pred_single)
r2_single = r2_score(y_test_single, y_pred_single)
rmse_single = np.sqrt(mse_single)

print(f"Simple Linear Regression - RMSE: {rmse_single}")
print(f"Simple Linear Regression - R-squared: {r2_single}")
# Performance comparison, part 2: multilinear regression.
# Four predictors are used to model 'MedHouseVal'.
y = df['MedHouseVal']
X_multi = df[['MedInc', 'AveRooms', 'HouseAge', 'AveOccup']]

# 80/20 train/test split with a fixed seed.
(
    X_train_multi,
    X_test_multi,
    y_train_multi,
    y_test_multi,
) = train_test_split(X_multi, y, test_size=0.2, random_state=42)

# Fit on the training rows, then predict the held-out rows.
model_multi = LinearRegression()
model_multi.fit(X_train_multi, y_train_multi)
y_pred_multi = model_multi.predict(X_test_multi)

# RMSE and R-squared on the test split.
mse_multi = mean_squared_error(y_test_multi, y_pred_multi)
r2_multi = r2_score(y_test_multi, y_pred_multi)
rmse_multi = np.sqrt(mse_multi)

print(f"Multilinear Regression - RMSE: {rmse_multi}")
print(f"Multilinear Regression - R-squared: {r2_multi}")
# Residual plot for the multilinear model: residual (actual - predicted)
# against the predicted value, with a dashed zero line for reference.
residuals = y_test_multi - y_pred_multi

_, residual_ax = plt.subplots(figsize=(8, 6))
residual_ax.scatter(y_pred_multi, residuals, color='blue', alpha=0.3)

lowest_prediction = y_pred_multi.min()
highest_prediction = y_pred_multi.max()
residual_ax.hlines(
    y=0,
    xmin=lowest_prediction,
    xmax=highest_prediction,
    colors='red',
    linestyles='--',
    lw=2,
)

residual_ax.set_title('Residual Plot: Multilinear Regression')
residual_ax.set_xlabel('Predicted MedHouseVal')
residual_ax.set_ylabel('Residuals')
plt.show()
# Persist the object currently bound to `model`, then read it straight back.
model_path = "linear_regression_model.pkl"

# Serialise to bytes first, then write them out in one call.
serialized_model = pickle.dumps(model)
with open(model_path, "wb") as file:
    file.write(serialized_model)

# NOTE: unpickling executes code embedded in the file, so this is only safe
# because the file was written by this same script a moment ago.
with open(model_path, "rb") as file:
    model = pickle.loads(file.read())
# ---- Streamlit UI: data exploration and regression plots ----

# Sidebar controls: the column to explore and the column to plot it against.
st.sidebar.header('User Input Features')
selected_feature = st.sidebar.selectbox('Select feature for visualization', df.columns)
selected_target = st.sidebar.selectbox('Select target variable', df.columns)

# Show the full table.
st.write(df)

# Histogram of the selected feature.
st.subheader(f'Distribution of {selected_feature}')
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(df[selected_feature], bins=30, edgecolor='black')
st.pyplot(hist_fig)

# Scatter plot of the selected feature vs the selected target.
# The figure gets its own names (scatter_fig / scatter_ax) so that later
# plt.subplots() calls cannot rebind it before it has been rendered.
st.subheader(f'Scatter plot of {selected_feature} vs {selected_target}')
scatter_fig, scatter_ax = plt.subplots()
scatter_ax.scatter(df[selected_feature], df[selected_target], alpha=0.3)
scatter_ax.set_xlabel(selected_feature)
scatter_ax.set_ylabel(selected_target)

# Optional regression line, drawn on the scatter plot above. The checkbox is
# read before the figure is rendered so the line ends up on this figure.
# The overlay is only offered when the target is 'MedHouseVal'.
show_regression = st.checkbox('Show Regression Line')
if show_regression and selected_target == 'MedHouseVal':
    X_feature = df[[selected_feature]]
    model_feature = LinearRegression()
    model_feature.fit(X_feature, df[selected_target])
    line = model_feature.predict(X_feature)
    scatter_ax.plot(df[selected_feature], line, color='red', linewidth=2)
st.pyplot(scatter_fig)

# Simple Linear Regression ('MedInc' only), scored on a held-out 20% split.
X_single = df[['MedInc']]
y = df['MedHouseVal']
X_train_single, X_test_single, y_train_single, y_test_single = train_test_split(X_single, y, test_size=0.2, random_state=42)
model_single = LinearRegression()
model_single.fit(X_train_single, y_train_single)
y_pred_single = model_single.predict(X_test_single)
r2_single = r2_score(y_test_single, y_pred_single)

# Plot the regression line for simple linear regression
fig, ax = plt.subplots()
ax.scatter(X_test_single, y_test_single, color='blue', alpha=0.3, label='Actual')
ax.plot(X_test_single, y_pred_single, color='red', linewidth=2, label='Predicted')
ax.set_title('Simple Linear Regression: MedInc vs MedHouseVal')
ax.set_xlabel('MedInc')
ax.set_ylabel('MedHouseVal')
ax.legend()
st.pyplot(fig)

# Multilinear Regression (four predictors), same split settings.
X_multi = df[['MedInc', 'AveRooms', 'HouseAge', 'AveOccup']]
y = df['MedHouseVal']
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X_multi, y, test_size=0.2, random_state=42)
model_multi = LinearRegression()
model_multi.fit(X_train_multi, y_train_multi)
y_pred_multi = model_multi.predict(X_test_multi)
r2_multi = r2_score(y_test_multi, y_pred_multi)
# Optional plot: actual vs. predicted values of the multilinear model, with a
# dashed y = x reference line.
show_multilinear_plot = st.checkbox('Show Multilinear Regression Plot')
if show_multilinear_plot:
    fig, ax = plt.subplots()
    ax.scatter(y_test_multi, y_pred_multi, color='blue', alpha=0.3)
    # Both ends of the reference line use the range of the actual values.
    actual_range = [y_test_multi.min(), y_test_multi.max()]
    # Colour is given only once: a 'k--' format string combined with
    # color='green' defines the colour twice and makes matplotlib warn.
    ax.plot(actual_range, actual_range, linestyle='--', lw=2, color='green')
    ax.set_title('Multilinear Regression: Actual vs. Predicted MedHouseVal')
    ax.set_xlabel('Actual MedHouseVal')
    ax.set_ylabel('Predicted MedHouseVal')
    st.pyplot(fig)
# Side-by-side R-squared of the two models, formatted to four decimals.
simple_r2_message = f"Simple Linear Regression R-squared: {r2_single:.4f}"
multi_r2_message = f"Multilinear Regression R-squared: {r2_multi:.4f}"

st.subheader('R-squared Comparison')
st.write(simple_r2_message)
st.write(multi_r2_message)
# Prediction form: one numeric input per predictor of the multilinear model,
# each pre-filled with that column's mean.
st.subheader('Predict Median House Value')
input_values = {
    feature: st.number_input(f'Enter {feature}', value=float(df[feature].mean()))
    for feature in X_multi.columns
}

if st.button('Predict'):
    # Build a one-row DataFrame with the training column names and order.
    # model_multi was fitted on a DataFrame, so a bare array would trigger
    # scikit-learn's "X does not have valid feature names" warning and would
    # depend on the dict's insertion order matching the training columns.
    input_data = pd.DataFrame([input_values], columns=X_multi.columns)
    prediction = model_multi.predict(input_data)
    st.write(f'Predicted Median House Value: {prediction[0]}')