Spaces:

YoneSlapWind80085
/

streamlittt

Sleeping

App Files Files Community

streamlittt / app.py

YoneSlapWind80085

Update app.py

154b2d9 verified about 1 year ago

raw

history blame

8.98 kB

	import streamlit as st
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	from sklearn.linear_model import LinearRegression
	from sklearn.model_selection import train_test_split
	from sklearn.datasets import fetch_california_housing
	import pickle

	from sklearn import datasets

	from sklearn.metrics import mean_squared_error, r2_score

	# Fetch the California housing dataset and assemble a single DataFrame:
	# the feature columns first, then the target stored as 'MedHouseVal'.
	california = fetch_california_housing()
	df = pd.DataFrame(
	    data=california.data,
	    columns=california.feature_names,
	)
	df['MedHouseVal'] = california.target

	# Initial modelling inputs: the target column and a one-column predictor frame.
	y = df['MedHouseVal']
	X = df[['MedInc']]

	# Pairplot to visualize relationships between features and the target
	# NOTE(review): no plotting call precedes this show() in this file, so the
	# pair plot itself appears to be missing — confirm and restore or remove.
	plt.show()


	# Opens a new 10x8-inch figure and shows it without drawing anything on it.
	# NOTE(review): presumably a plot was meant to be drawn here — confirm.
	plt.figure(figsize=(10, 8))
	plt.show()

	# Scatter plot for specific features against the target variable
	features = ['MedInc', 'AveRooms', 'AveOccup', 'HouseAge']
	for feature in features:
	    # One figure per feature. These statements form the loop body and must
	    # be indented under the `for`; otherwise the script does not parse.
	    plt.figure(figsize=(6, 4))
	    plt.scatter(df[feature], df['MedHouseVal'], alpha=0.3)
	    plt.title(f'MedHouseVal vs {feature}')
	    plt.xlabel(feature)
	    plt.ylabel('MedHouseVal')
	    plt.show()
	# Step 5: choose the target and the single predictor column.
	y = df['MedHouseVal']
	X = df[['MedInc']]

	# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
	X_train, X_test, y_train, y_test = train_test_split(
	    X,
	    y,
	    test_size=0.2,
	    random_state=42,
	)
	print("Training and testing data split done.")

	# Steps 6-8: fit a linear regression model on the training rows
	# (fit() returns the estimator itself), then predict the held-out rows.
	model = LinearRegression().fit(X_train, y_train)
	y_pred = model.predict(X_test)

	# Score the held-out predictions: R-squared, MSE, and RMSE as sqrt(MSE).
	r2 = r2_score(y_test, y_pred)
	mse = mean_squared_error(y_test, y_pred)
	rmse = np.sqrt(mse)




	# Held-out points in blue, with the model's predictions for the same rows
	# drawn as a red line.
	plt.figure(figsize=(8, 6))
	plt.title('Simple Linear Regression: MedInc vs MedHouseVal')
	plt.xlabel('MedInc')
	plt.ylabel('MedHouseVal')
	plt.scatter(X_test, y_test, label='Actual', alpha=0.3, color='blue')
	plt.plot(X_test, y_pred, label='Predicted', linewidth=2, color='red')
	plt.legend()
	plt.show()

	# Split X and y into training (80%) and testing (20%) parts with a fixed seed.
	X_train, X_test, y_train, y_test = train_test_split(
	    X, y, test_size=0.2, random_state=42
	)

	# Report how many rows ended up in each part.
	print(f"Training set size: {X_train.shape[0]} samples")
	print(f"Testing set size: {X_test.shape[0]} samples")

	# Fit a fresh linear regression on the training rows.
	model = LinearRegression().fit(X_train, y_train)

	# Show the learned parameters.
	print(f"Coefficients: {model.coef_}")
	print(f"Intercept: {model.intercept_}")

	# Predict the held-out rows and score them (RMSE is sqrt of MSE).
	y_pred = model.predict(X_test)
	r2 = r2_score(y_test, y_pred)
	mse = mean_squared_error(y_test, y_pred)
	rmse = np.sqrt(mse)

	print(f"Root Mean Squared Error (RMSE): {rmse}")
	print(f"R-squared: {r2}")

	# Scatter plot of actual vs. predicted values for the held-out rows.
	plt.figure(figsize=(8, 6))
	plt.scatter(y_test, y_pred, color='blue', alpha=0.3)
	# Reference diagonal where predicted == actual. Style and colour are given
	# as keywords only: combining the 'k--' format string with color='green'
	# specifies the colour twice and makes matplotlib emit a warning.
	plt.plot(
	    [y_test.min(), y_test.max()],
	    [y_test.min(), y_test.max()],
	    linestyle='--',
	    color='green',
	    lw=2,
	)
	# `model` was fitted on X, which holds only the 'MedInc' column, so this
	# is labelled as simple (not multilinear) regression.
	plt.title('Simple Linear Regression: Actual vs. Predicted MedHouseVal')
	plt.xlabel('Actual MedHouseVal')
	plt.ylabel('Predicted MedHouseVal')
	plt.show()

	# Compare the two models by RMSE and R-squared, starting with the
	# single-predictor (MedInc) regression.
	y = df['MedHouseVal']
	X_single = df[['MedInc']]

	# 80/20 train/test split with a fixed seed.
	X_train_single, X_test_single, y_train_single, y_test_single = train_test_split(
	    X_single, y, test_size=0.2, random_state=42
	)

	# Fit on the training rows, then predict the held-out rows.
	model_single = LinearRegression().fit(X_train_single, y_train_single)
	y_pred_single = model_single.predict(X_test_single)

	# Held-out scores (RMSE is sqrt of MSE).
	r2_single = r2_score(y_test_single, y_pred_single)
	mse_single = mean_squared_error(y_test_single, y_pred_single)
	rmse_single = np.sqrt(mse_single)

	print(f"Simple Linear Regression - RMSE: {rmse_single}")
	print(f"Simple Linear Regression - R-squared: {r2_single}")

	# Multi-predictor regression: four feature columns against MedHouseVal.
	y = df['MedHouseVal']
	X_multi = df[['MedInc', 'AveRooms', 'HouseAge', 'AveOccup']]

	# 80/20 train/test split with a fixed seed.
	X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
	    X_multi, y, test_size=0.2, random_state=42
	)

	# Fit on the training rows, then predict the held-out rows.
	model_multi = LinearRegression().fit(X_train_multi, y_train_multi)
	y_pred_multi = model_multi.predict(X_test_multi)

	# Held-out scores (RMSE is sqrt of MSE).
	r2_multi = r2_score(y_test_multi, y_pred_multi)
	mse_multi = mean_squared_error(y_test_multi, y_pred_multi)
	rmse_multi = np.sqrt(mse_multi)

	print(f"Multilinear Regression - RMSE: {rmse_multi}")
	print(f"Multilinear Regression - R-squared: {r2_multi}")

	# Residuals (actual minus predicted) of the multi-predictor model, plotted
	# against the predictions, with a dashed red line at zero spanning the
	# range of predicted values.
	residuals = y_test_multi - y_pred_multi
	plt.figure(figsize=(8, 6))
	plt.title('Residual Plot: Multilinear Regression')
	plt.xlabel('Predicted MedHouseVal')
	plt.ylabel('Residuals')
	plt.scatter(y_pred_multi, residuals, alpha=0.3, color='blue')
	plt.hlines(
	    y=0,
	    xmin=y_pred_multi.min(),
	    xmax=y_pred_multi.max(),
	    colors='red',
	    linestyles='--',
	    lw=2,
	)
	plt.show()

	# Save the model
	# The dump/load calls are the bodies of their `with` statements and must be
	# indented under them; otherwise the script does not parse.
	with open("linear_regression_model.pkl", "wb") as file:
	    pickle.dump(model, file)

	# Load the model
	# NOTE: unpickling can execute code stored in the file, so this is only
	# appropriate for a file this script wrote itself (as it does just above),
	# never for an untrusted one.
	with open("linear_regression_model.pkl", "rb") as file:
	    model = pickle.load(file)

	# Sidebar controls: pick which DataFrame columns the plots below will use.
	st.sidebar.header('User Input Features')
	column_choices = df.columns
	selected_feature = st.sidebar.selectbox(
	    'Select feature for visualization',
	    column_choices,
	)
	selected_target = st.sidebar.selectbox(
	    'Select target variable',
	    column_choices,
	)


	# Render the whole DataFrame on the page.
	st.write(df)

	# Visualization of selected feature: a 30-bin histogram of the chosen column.
	st.subheader(f'Distribution of {selected_feature}')
	fig, ax = plt.subplots()
	ax.hist(df[selected_feature], bins=30, edgecolor='black')
	st.pyplot(fig)
	# pyplot keeps every figure it creates registered until it is closed, and
	# Streamlit re-runs this script on each interaction, so release the figure
	# once it has been rendered.
	plt.close(fig)

	# Scatter of the chosen feature against the chosen target. The figure is
	# only built here; this section does not pass it to st.pyplot.
	st.subheader(f'Scatter plot of {selected_feature} vs {selected_target}')
	fig, ax = plt.subplots()
	ax.set_xlabel(selected_feature)
	ax.set_ylabel(selected_target)
	ax.scatter(df[selected_feature], df[selected_target], alpha=0.3)



	# Simple linear regression for the app: predict MedHouseVal from MedInc alone.
	y = df['MedHouseVal']
	X_single = df[['MedInc']]

	# 80/20 train/test split with a fixed seed.
	X_train_single, X_test_single, y_train_single, y_test_single = train_test_split(
	    X_single, y, test_size=0.2, random_state=42
	)

	# Fit on the training rows, then predict the held-out rows.
	model_single = LinearRegression().fit(X_train_single, y_train_single)
	y_pred_single = model_single.predict(X_test_single)

	# R-squared on the held-out rows.
	r2_single = r2_score(y_test_single, y_pred_single)

	# Plot the regression line for simple linear regression
	# This plot uses its own figure/axes names. Rebinding `fig`/`ax` here would
	# discard the feature-vs-target scatter figure built earlier, which the
	# 'Show Regression Line' section further down still draws on and renders.
	fig_single, ax_single = plt.subplots()
	ax_single.scatter(X_test_single, y_test_single, color='blue', alpha=0.3, label='Actual')
	ax_single.plot(X_test_single, y_pred_single, color='red', linewidth=2, label='Predicted')
	ax_single.set_title('Simple Linear Regression: MedInc vs MedHouseVal')
	ax_single.set_xlabel('MedInc')
	ax_single.set_ylabel('MedHouseVal')
	ax_single.legend()
	st.pyplot(fig_single)
	# Already rendered; close it so figures do not pile up across Streamlit re-runs.
	plt.close(fig_single)

	# Multi-predictor regression for the app: four feature columns against MedHouseVal.
	y = df['MedHouseVal']
	X_multi = df[['MedInc', 'AveRooms', 'HouseAge', 'AveOccup']]

	# 80/20 train/test split with a fixed seed.
	X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
	    X_multi, y, test_size=0.2, random_state=42
	)

	# Fit on the training rows, then predict the held-out rows.
	model_multi = LinearRegression().fit(X_train_multi, y_train_multi)
	y_pred_multi = model_multi.predict(X_test_multi)

	# R-squared on the held-out rows.
	r2_multi = r2_score(y_test_multi, y_pred_multi)

	# Show regression line if selected
	show_regression = st.checkbox('Show Regression Line')
	if show_regression and selected_feature in df.columns and selected_target == 'MedHouseVal':
	    # These statements are the body of the `if` and must be indented under
	    # it; otherwise the script does not parse.
	    # Fit a one-feature model on the full dataset and draw its fitted
	    # values over the feature on the current axes.
	    X_feature = df[[selected_feature]]
	    y = df[selected_target]
	    model_feature = LinearRegression()
	    model_feature.fit(X_feature, y)
	    line = model_feature.predict(X_feature)
	    ax.plot(df[selected_feature], line, color='red', linewidth=2)

	# Render the current figure whether or not the line was added.
	st.pyplot(fig)
	# Already rendered; close it so figures do not pile up across Streamlit re-runs.
	plt.close(fig)

	# Add checkbox for multilinear regression plot
	show_multilinear_plot = st.checkbox('Show Multilinear Regression Plot')

	if show_multilinear_plot:
	    # These statements are the body of the `if` and must be indented under
	    # it; otherwise the script does not parse.
	    fig, ax = plt.subplots()
	    ax.scatter(y_test_multi, y_pred_multi, color='blue', alpha=0.3)
	    # Reference diagonal where predicted == actual. Style and colour are
	    # given as keywords only: combining the 'k--' format string with
	    # color='green' specifies the colour twice and makes matplotlib warn.
	    ax.plot(
	        [y_test_multi.min(), y_test_multi.max()],
	        [y_test_multi.min(), y_test_multi.max()],
	        linestyle='--',
	        color='green',
	        lw=2,
	    )
	    ax.set_title('Multilinear Regression: Actual vs. Predicted MedHouseVal')
	    ax.set_xlabel('Actual MedHouseVal')
	    ax.set_ylabel('Predicted MedHouseVal')
	    st.pyplot(fig)
	    # Already rendered; close it so figures do not pile up across re-runs.
	    plt.close(fig)

	# Format both held-out R-squared scores to four decimals, then show them
	# under a shared heading.
	simple_r2_text = f"Simple Linear Regression R-squared: {r2_single:.4f}"
	multi_r2_text = f"Multilinear Regression R-squared: {r2_multi:.4f}"
	st.subheader('R-squared Comparison')
	st.write(simple_r2_text)
	st.write(multi_r2_text)

	# Prediction
	st.subheader('Predict Median House Value')

	# One numeric input per model feature, pre-filled with that column's mean.
	# Entries are inserted in X_multi.columns order.
	input_values = {}
	for feature in X_multi.columns:
	    input_values[feature] = st.number_input(f'Enter {feature}', value=float(df[feature].mean()))

	if st.button('Predict'):
	    # Build a one-row DataFrame carrying the training column names.
	    # model_multi is fitted on a DataFrame, and predicting on a bare array
	    # makes scikit-learn warn that X lacks valid feature names.
	    input_data = pd.DataFrame([input_values], columns=X_multi.columns)
	    prediction = model_multi.predict(input_data)
	    st.write(f'Predicted Median House Value: {prediction[0]}')