Upload application21.py
Browse files- application21.py +131 -0
application21.py
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from ast import increment_lineno
|
2 |
+
from statistics import LinearRegression
|
3 |
+
import numpy as np
|
4 |
+
import pandas as pd
|
5 |
+
import matplotlib
|
6 |
+
matplotlib.use('Agg')
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
plt.switch_backend('Agg')
|
9 |
+
import seaborn as sns
|
10 |
+
import warnings
|
11 |
+
warnings.filterwarnings('ignore')
|
12 |
+
|
13 |
+
sns.set(style='whitegrid')
|
14 |
+
|
15 |
+
print('Import and setup completed successfully.')
|
16 |
+
|
17 |
+
# Location of the sleep dataset CSV.  The path is specific to one machine;
# point it at a local copy of the file before running elsewhere.
file_path = r'C:\Users\Donte Patton\Downloads\dataset_2191_sleep.csv'

# Load the CSV into the DataFrame that the rest of the script works on.
# NOTE(review): 'total_sleep' is parsed with pd.to_numeric further down, which
# suggests some values are not plain numbers -- if missing entries use a
# placeholder string, consider passing na_values=... here.  Confirm against
# the file.
df = pd.read_csv(file_path, encoding='ascii', delimiter=',')

print('Dataset loaded successfully. Showing first few rows:')
print(df.head())
|
24 |
+
|
25 |
+
# Print a structural summary of the loaded DataFrame (df.info() writes to
# stdout itself, so it is not wrapped in print()).
print('Dataset Info:')
df.info()

# Report how many NaN entries each column contains.
print('\nMissing values in each column:')
print(df.isnull().sum())

# Remove every row that has a NaN in any column, modifying df in place.
df.dropna(inplace=True)
print('\nDataframe shape after dropping missing values:', df.shape)

# No dtype conversion is performed at this point, so the label describes the
# only transformation applied so far (the row drop above).
print('\nData types after dropping missing values:')
print(df.dtypes)
|
38 |
+
|
39 |
+
# Correlation heatmap over the numeric columns of df.  The figure is drawn
# only when at least four numeric columns exist; otherwise a message is
# printed instead.
# NOTE(review): the file selects the non-interactive 'Agg' backend near the
# top, so plt.show() will not open a window -- confirm whether the figure
# should be saved instead.
numeric_df = df.select_dtypes(include=[np.number])

if len(numeric_df.columns) < 4:
    print('Not enough numeric columns for a correlation heatmap.')
else:
    plt.figure(figsize=(12, 10))
    # Pairwise correlation matrix of the numeric columns.
    corr = numeric_df.corr()
    sns.heatmap(
        corr,
        annot=True,
        cmap='coolwarm',
        fmt='.2f',
    )
    plt.title('Correlation Heatmap of Numeric Variables')
    plt.show()
|
49 |
+
|
50 |
+
# Pair plot across the float64/int64 columns of df.  At least two such
# columns are required; with fewer, a message is printed instead.
numeric_cols = list(df.select_dtypes(include=['float64', 'int64']).columns)

if len(numeric_cols) <= 1:
    print('Not enough numeric columns for pair plot.')
else:
    sns.pairplot(df[numeric_cols])
    # y > 1 places the title slightly above the top of the grid.
    plt.suptitle('Pair Plot of Numeric Features', y=1.02)
    plt.show()
|
58 |
+
|
59 |
+
# Histogram (30 bins, with a KDE overlay) of the 'body_weight' column.
plt.figure(figsize=(8, 6))
# histplot draws on the current axes and returns them, so the title and axis
# labels are applied directly to that Axes object.
sns.histplot(df['body_weight'], kde=True, bins=30).set(
    title='Distribution of Body Weight',
    xlabel='Body Weight (kg)',
    ylabel='Frequency',
)
plt.show()
|
66 |
+
|
67 |
+
# Bar chart of the mean 'body_weight' for each 'predation_index' value.
# The aggregation is computed first; reset_index() turns the group key back
# into an ordinary column so it can be referenced by name when plotting.
body_weight_by_predation = (
    df.groupby('predation_index')['body_weight']
    .mean()
    .reset_index()
)

plt.figure(figsize=(10, 6))
sns.barplot(
    data=body_weight_by_predation,
    x='predation_index',
    y='body_weight',
    palette='viridis',
).set(
    title='Average Body Weight by Predation Index',
    xlabel='Predation Index',
    ylabel='Average Body Weight (kg)',
)
plt.show()
|
75 |
+
|
76 |
+
# Count plot: number of rows in df for each 'predation_index' value.
plt.figure(figsize=(8, 6))
sns.countplot(
    data=df,
    x='predation_index',
    palette='Set2',
).set(
    title='Count of Records by Predation Index',
    xlabel='Predation Index',
    ylabel='Count',
)
plt.show()
|
83 |
+
|
84 |
+
from sklearn.model_selection import train_test_split
|
85 |
+
from sklearn.linear_model import LinearRegression
|
86 |
+
from sklearn.metrics import r2_score, mean_squared_error
|
87 |
+
|
88 |
+
# Predictor columns for the regression; the target column is 'total_sleep'.
features = ['body_weight', 'brain_weight', 'predation_index', 'sleep_exposure_index', 'danger_index']

# Work on a copy so the coercion below leaves df itself untouched.
model_df = df.copy()

# Parse 'total_sleep' as numbers; any value that cannot be parsed becomes NaN
# (errors='coerce') instead of raising.
model_df['total_sleep'] = pd.to_numeric(model_df['total_sleep'], errors='coerce')

# Drop every row that has a NaN in any column, which includes the rows whose
# 'total_sleep' could not be parsed above.
model_df = model_df.dropna()

# Design matrix and target.  X is built from the `features` list so the
# column selection is declared in exactly one place.
X = model_df[features]
y = model_df['total_sleep']
|
104 |
+
|
105 |
+
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print('Training set shape:', X_train.shape)
print('Testing set shape:', X_test.shape)

# Fit an ordinary linear regression on the training portion.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predict the held-out rows and score the predictions.
y_pred = regressor.predict(X_test)
r2 = r2_score(y_test, y_pred)
# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'R^2 score for the predictor: {r2:.3f}')
print(f'RMSE for the predictor: {rmse:.3f}')
|
123 |
+
|
124 |
+
# Scatter of held-out target values against the model's predictions.
# The regression target is 'total_sleep', so the axis labels and title name
# that quantity (they previously referred to CO2, which this script does not
# model).
# NOTE(review): the file selects the non-interactive 'Agg' backend near the
# top, so plt.show() will not open a window -- confirm whether the figure
# should be saved instead.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.5, color='teal')
# Dashed red diagonal y = x: points on this line are perfect predictions.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual Total Sleep')
plt.ylabel('Predicted Total Sleep')
plt.title('Actual vs Predicted Total Sleep')
plt.show()
|
131 |
+
|