HemanthSai7 committed
Commit ab977d5 (verified)
Parent: a8abec7

Update src/streamlit_app.py

Files changed (1)
  1. src/streamlit_app.py +347 -37

src/streamlit_app.py CHANGED
@@ -1,40 +1,350 @@
- import altair as alt
+ import streamlit as st
  import numpy as np
+ import matplotlib.pyplot as plt
+ from scipy.stats import norm
+ from scipy.optimize import minimize
  import pandas as pd
- import streamlit as st
-
- """
- # Welcome to Streamlit!
-
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
- forums](https://discuss.streamlit.io).
-
- In the meantime, below is an example of what you can do with just a few lines of code:
- """
-
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-
- indices = np.linspace(0, 1, num_points)
- theta = 2 * np.pi * num_turns * indices
- radius = indices
-
- x = radius * np.cos(theta)
- y = radius * np.sin(theta)
-
- df = pd.DataFrame({
-     "x": x,
-     "y": y,
-     "idx": indices,
-     "rand": np.random.randn(num_points),
- })
-
- st.altair_chart(alt.Chart(df, height=700, width=700)
-     .mark_point(filled=True)
-     .encode(
-         x=alt.X("x", axis=None),
-         y=alt.Y("y", axis=None),
-         color=alt.Color("idx", legend=None, scale=alt.Scale()),
-         size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-     ))
+
+ # Set page config
+ st.set_page_config(page_title="Gaussian Distribution & Overfitting Demo", layout="wide")
+
+ st.title("Gaussian Distribution & Overfitting in ML")
+ st.markdown("Interactive demonstration of concepts from PRML Chapter 1")
+
+ # Sidebar for navigation
+ page = st.sidebar.selectbox("Select Demo",
+                             ["Gaussian Distribution Basics",
+                              "Maximum Likelihood Bias",
+                              "Polynomial Curve Fitting",
+                              "Probabilistic Curve Fitting",
+                              "Regularized Curve Fitting"])
+
+ if page == "Gaussian Distribution Basics":
+     st.header("1.2.4 The Gaussian Distribution")
+
+     col1, col2 = st.columns(2)
+
+     with col1:
+         st.subheader("Parameters")
+         mu = st.slider("Mean (μ)", -5.0, 5.0, 0.0, 0.1)
+         sigma = st.slider("Standard Deviation (σ)", 0.1, 5.0, 1.0, 0.1)
+
+         st.latex(r"N(x|\mu, \sigma^2) = \frac{1}{(2\pi\sigma^2)^{1/2}} \exp\left\{-\frac{1}{2\sigma^2}(x-\mu)^2\right\}")
+
+     with col2:
+         st.subheader("Gaussian Distribution Plot")
+         x = np.linspace(mu - 4*sigma, mu + 4*sigma, 1000)
+         y = norm.pdf(x, mu, sigma)
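+         # Note: scipy's norm.pdf takes the standard deviation (scale), not the
+         # variance, so this evaluates N(x|μ, σ²) exactly as written above.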
+
+         fig, ax = plt.subplots(figsize=(8, 6))
+         ax.plot(x, y, 'b-', linewidth=2, label=f'N({mu:.1f}, {sigma:.1f}²)')
+         ax.fill_between(x, y, alpha=0.3)
+         ax.axvline(mu, color='r', linestyle='--', label=f'Mean = {mu:.1f}')
+         ax.axvline(mu - sigma, color='g', linestyle='--', alpha=0.5)
+         ax.axvline(mu + sigma, color='g', linestyle='--', alpha=0.5, label=f'±σ = ±{sigma:.1f}')
+         ax.set_xlabel('x')
+         ax.set_ylabel('p(x)')
+         ax.legend()
+         ax.grid(True, alpha=0.3)
+         st.pyplot(fig)
+
+ elif page == "Maximum Likelihood Bias":
+     st.header("Maximum Likelihood Bias in Variance Estimation")
+     st.markdown("This demo shows how the maximum likelihood estimator systematically underestimates the true variance.")
+
+     col1, col2 = st.columns(2)
+
+     with col1:
+         st.subheader("Simulation Parameters")
+         true_mu = st.slider("True Mean", -2.0, 2.0, 0.0, 0.1)
+         true_sigma = st.slider("True Std Dev", 0.5, 3.0, 1.0, 0.1)
+         n_samples = st.slider("Number of Samples (N)", 2, 100, 10, 1)
+         n_experiments = st.slider("Number of Experiments", 100, 1000, 500, 100)
+
+         if st.button("Run Simulation"):
+             # Run multiple experiments
+             ml_means = []
+             ml_vars = []
+             unbiased_vars = []
+
+             for _ in range(n_experiments):
+                 # Generate random samples
+                 samples = np.random.normal(true_mu, true_sigma, n_samples)
+
+                 # ML estimates
+                 ml_mean = np.mean(samples)
+                 ml_var = np.var(samples, ddof=0)        # ML estimate (divides by N)
+                 unbiased_var = np.var(samples, ddof=1)  # unbiased estimate (divides by N-1)
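+                 # The ML variance is biased low: E[σ²_ML] = ((N-1)/N)·σ²
+                 # (PRML eq. 1.58); ddof=1 applies Bessel's correction and
+                 # removes the bias.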
+
+                 ml_means.append(ml_mean)
+                 ml_vars.append(ml_var)
+                 unbiased_vars.append(unbiased_var)
+
+             # Store results in session state
+             st.session_state.ml_means = ml_means
+             st.session_state.ml_vars = ml_vars
+             st.session_state.unbiased_vars = unbiased_vars
+             st.session_state.true_var = true_sigma**2
+             st.session_state.n_samples_used = n_samples
+
+         # Results section below the parameters
+         if 'ml_vars' in st.session_state:
+             st.markdown("---")  # Separator line
+             st.subheader("Results")
+
+             # Calculate averages
+             avg_ml_var = np.mean(st.session_state.ml_vars)
+             avg_unbiased_var = np.mean(st.session_state.unbiased_vars)
+             true_var = st.session_state.true_var
+             n_samples_used = st.session_state.n_samples_used
+             expected_ml_var = (n_samples_used - 1) / n_samples_used * true_var
+
+             # Display metrics
+             col3, col4, col5, col6 = st.columns(4)
+             with col3:
+                 st.metric("Average ML Variance", f"{avg_ml_var:.4f}")
+             with col4:
+                 st.metric("Average Unbiased Variance", f"{avg_unbiased_var:.4f}")
+             with col5:
+                 st.metric("True Variance", f"{true_var:.4f}")
+             with col6:
+                 st.metric("Expected ML Variance", f"{expected_ml_var:.4f}",
+                           f"{(expected_ml_var - true_var) / true_var * 100:.1f}%")
+
+             # Bias factor
+             st.info(f"Bias Factor: (N-1)/N = {n_samples_used-1}/{n_samples_used} = {(n_samples_used-1)/n_samples_used:.3f}")
+
+     with col2:
+         if 'ml_vars' in st.session_state:
+             st.subheader("Variance Distribution")
+
+             # Get values for plotting
+             true_var = st.session_state.true_var
+             n_samples_used = st.session_state.n_samples_used
+             expected_ml_var = (n_samples_used - 1) / n_samples_used * true_var
+
+             # Histogram of the estimates across experiments
+             fig, ax = plt.subplots(figsize=(10, 8))
+             ax.hist(st.session_state.ml_vars, bins=30, alpha=0.5, label='ML Variance', density=True)
+             ax.hist(st.session_state.unbiased_vars, bins=30, alpha=0.5, label='Unbiased Variance', density=True)
+             ax.axvline(true_var, color='r', linestyle='--', linewidth=2, label='True Variance')
+             ax.axvline(expected_ml_var, color='g', linestyle='--', linewidth=2, label='Expected ML Variance')
+             ax.set_xlabel('Variance Estimate', fontsize=12)
+             ax.set_ylabel('Density', fontsize=12)
+             ax.legend(fontsize=11)
+             ax.grid(True, alpha=0.3)
+             ax.set_title(f'Distribution of Variance Estimates (N={n_samples_used})', fontsize=14)
+             st.pyplot(fig)
+
+ elif page == "Polynomial Curve Fitting":
+     st.header("Polynomial Curve Fitting and Overfitting")
+
+     # The noise-free target function underlying the synthetic data
+     def true_function(x):
+         return np.sin(2 * np.pi * x)
+
+     col1, col2 = st.columns([1, 2])
+
+     with col1:
+         st.subheader("Parameters")
+         n_data_points = st.slider("Number of Data Points", 5, 50, 15, 1)
+         noise_level = st.slider("Noise Level", 0.0, 0.5, 0.2, 0.05)
+         polynomial_degree = st.slider("Polynomial Degree (M)", 0, 15, 3, 1)
+
+         if st.button("Generate New Data"):
+             np.random.seed(None)  # Reseed from system entropy
+             x_train = np.random.uniform(0, 1, n_data_points)
+             y_train = true_function(x_train) + np.random.normal(0, noise_level, n_data_points)
+             st.session_state.x_train = x_train
+             st.session_state.y_train = y_train
+
+     # Initialize data if it does not exist yet
+     if 'x_train' not in st.session_state:
+         np.random.seed(42)
+         x_train = np.random.uniform(0, 1, n_data_points)
+         y_train = true_function(x_train) + np.random.normal(0, noise_level, n_data_points)
+         st.session_state.x_train = x_train
+         st.session_state.y_train = y_train
+
+     with col2:
+         st.subheader("Polynomial Fit")
+
+         # Fit polynomial
+         X_train = np.vander(st.session_state.x_train, polynomial_degree + 1, increasing=True)
+         w = np.linalg.lstsq(X_train, st.session_state.y_train, rcond=None)[0]
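+         # np.vander(..., increasing=True) builds the design matrix whose
+         # columns are 1, x, x², ..., x^M; lstsq then returns the w minimizing
+         # the sum-of-squares error ||Xw - t||², i.e. the ML fit under Gaussian noise.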
175
+
176
+ # Plot
177
+ x_plot = np.linspace(0, 1, 200)
178
+ X_plot = np.vander(x_plot, polynomial_degree + 1, increasing=True)
179
+ y_pred = X_plot @ w
180
+ y_true = true_function(x_plot)
181
+
182
+ fig, ax = plt.subplots(figsize=(10, 6))
183
+ ax.plot(x_plot, y_true, 'g-', linewidth=2, label='True Function')
184
+ ax.plot(x_plot, y_pred, 'r-', linewidth=2, label=f'Polynomial (M={polynomial_degree})')
185
+ ax.scatter(st.session_state.x_train, st.session_state.y_train,
186
+ c='blue', s=50, alpha=0.8, edgecolors='black', label='Training Data')
187
+ ax.set_xlabel('x')
188
+ ax.set_ylabel('y')
189
+ ax.set_ylim(-1.5, 1.5)
190
+ ax.legend()
191
+ ax.grid(True, alpha=0.3)
192
+ ax.set_title(f'Polynomial Degree M = {polynomial_degree}')
193
+ st.pyplot(fig)
194
+
195
+ # Calculate training error
196
+ y_train_pred = X_train @ w
197
+ train_rmse = np.sqrt(np.mean((st.session_state.y_train - y_train_pred)**2))
198
+ st.metric("Training RMSE", f"{train_rmse:.4f}")
+
+ elif page == "Probabilistic Curve Fitting":
+     st.header("Probabilistic View of Curve Fitting")
+     st.latex(r"p(t|x,\mathbf{w},\beta) = N(t|y(x,\mathbf{w}), \beta^{-1})")
+
+     col1, col2 = st.columns([1, 2])
+
+     with col1:
+         st.subheader("Parameters")
+         n_data_points = st.slider("Number of Data Points", 5, 50, 20, 1)
+         true_noise = st.slider("True Noise (σ)", 0.1, 0.5, 0.2, 0.05)
+         polynomial_degree = st.slider("Polynomial Degree", 0, 9, 3, 1)
+         show_uncertainty = st.checkbox("Show Predictive Distribution", True)
+
+         if st.button("Generate Data"):
+             np.random.seed(None)
+             x_train = np.random.uniform(0, 1, n_data_points)
+             y_train = np.sin(2 * np.pi * x_train) + np.random.normal(0, true_noise, n_data_points)
+             st.session_state.prob_x_train = x_train
+             st.session_state.prob_y_train = y_train
+
+     # Initialize data
+     if 'prob_x_train' not in st.session_state:
+         np.random.seed(42)
+         x_train = np.random.uniform(0, 1, n_data_points)
+         y_train = np.sin(2 * np.pi * x_train) + np.random.normal(0, true_noise, n_data_points)
+         st.session_state.prob_x_train = x_train
+         st.session_state.prob_y_train = y_train
+
+     with col2:
+         st.subheader("Maximum Likelihood Fit")
+
+         # Fit polynomial and estimate noise
+         X_train = np.vander(st.session_state.prob_x_train, polynomial_degree + 1, increasing=True)
+         w_ml = np.linalg.lstsq(X_train, st.session_state.prob_y_train, rcond=None)[0]
+
+         # Estimate noise variance (beta^-1)
+         y_train_pred = X_train @ w_ml
+         residuals = st.session_state.prob_y_train - y_train_pred
+         sigma_ml = np.sqrt(np.mean(residuals**2))
+         beta_ml = 1 / (sigma_ml**2)
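+         # Maximizing the likelihood w.r.t. β gives 1/β_ML as the mean squared
+         # residual, 1/β_ML = (1/N) Σ_n {y(x_n, w_ML) - t_n}² (PRML eq. 1.63),
+         # which is exactly what sigma_ml² holds here.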
+
+         # Plot
+         x_plot = np.linspace(0, 1, 200)
+         X_plot = np.vander(x_plot, polynomial_degree + 1, increasing=True)
+         y_mean = X_plot @ w_ml
+
+         fig, ax = plt.subplots(figsize=(10, 6))
+
+         # Plot uncertainty bands if requested
+         if show_uncertainty:
+             y_std = np.sqrt(1 / beta_ml)
+             ax.fill_between(x_plot, y_mean - 2*y_std, y_mean + 2*y_std,
+                             alpha=0.3, color='red', label='±2σ predictive')
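+             # This ML band has constant width in x; a fully Bayesian treatment
+             # (PRML §1.2.6) would widen it away from the data, since it also
+             # accounts for uncertainty in w, not just observation noise.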
+
+         ax.plot(x_plot, np.sin(2 * np.pi * x_plot), 'g-', linewidth=2, label='True Function')
+         ax.plot(x_plot, y_mean, 'r-', linewidth=2, label=f'ML Fit (M={polynomial_degree})')
+         ax.scatter(st.session_state.prob_x_train, st.session_state.prob_y_train,
+                    c='blue', s=50, alpha=0.8, edgecolors='black', label='Training Data')
+
+         ax.set_xlabel('x')
+         ax.set_ylabel('t')
+         ax.legend()
+         ax.grid(True, alpha=0.3)
+         st.pyplot(fig)
+
+         # Display estimated parameters
+         col3, col4 = st.columns(2)
+         with col3:
+             st.metric("ML Noise Estimate (σ)", f"{sigma_ml:.3f}")
+         with col4:
+             st.metric("True Noise (σ)", f"{true_noise:.3f}")
+
+ elif page == "Regularized Curve Fitting":
+     st.header("Regularized Curve Fitting (MAP Estimation)")
+     st.latex(r"E(\mathbf{w}) = \frac{\beta}{2}\sum_{n=1}^{N}\{y(x_n,\mathbf{w})-t_n\}^2 + \frac{\alpha}{2}\mathbf{w}^T\mathbf{w}")
+
+     col1, col2 = st.columns([1, 2])
+
+     with col1:
+         st.subheader("Parameters")
+         n_data_points = st.slider("Data Points", 10, 50, 15, 1)
+         noise_level = st.slider("Noise", 0.1, 0.5, 0.3, 0.05)
+         polynomial_degree = st.slider("Degree (M)", 0, 15, 9, 1)
+         log_lambda = st.slider("log₁₀(λ)", -8.0, 2.0, -3.0, 0.5)
+         regularization = 10**log_lambda
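+         # λ is swept on a log scale because its useful range spans many orders
+         # of magnitude; PRML plots the same sweep against ln λ.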
+
+         if st.button("New Data"):
+             np.random.seed(None)
+             x_train = np.random.uniform(0, 1, n_data_points)
+             y_train = np.sin(2 * np.pi * x_train) + np.random.normal(0, noise_level, n_data_points)
+             st.session_state.reg_x_train = x_train
+             st.session_state.reg_y_train = y_train
+
+     # Initialize
+     if 'reg_x_train' not in st.session_state:
+         np.random.seed(42)
+         x_train = np.random.uniform(0, 1, n_data_points)
+         y_train = np.sin(2 * np.pi * x_train) + np.random.normal(0, noise_level, n_data_points)
+         st.session_state.reg_x_train = x_train
+         st.session_state.reg_y_train = y_train
+
+     with col2:
+         st.subheader("Regularized Fit")
+
+         # Fit with regularization
+         X_train = np.vander(st.session_state.reg_x_train, polynomial_degree + 1, increasing=True)
+
+         # Ridge regression (L2 regularization)
+         XtX = X_train.T @ X_train
+         Xty = X_train.T @ st.session_state.reg_y_train
+         w_reg = np.linalg.solve(XtX + regularization * np.eye(polynomial_degree + 1), Xty)
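+         # Setting the gradient of the regularized error to zero yields the
+         # closed-form normal equations (λI + X^T X) w = X^T t solved here;
+         # larger λ shrinks the weights toward zero.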
+
+         # Plot
+         x_plot = np.linspace(0, 1, 200)
+         X_plot = np.vander(x_plot, polynomial_degree + 1, increasing=True)
+         y_pred = X_plot @ w_reg
+
+         fig, ax = plt.subplots(figsize=(10, 6))
+         ax.plot(x_plot, np.sin(2 * np.pi * x_plot), 'g-', linewidth=2, label='True Function')
+         ax.plot(x_plot, y_pred, 'r-', linewidth=2, label=f'Regularized (λ={regularization:.1e})')
+         ax.scatter(st.session_state.reg_x_train, st.session_state.reg_y_train,
+                    c='blue', s=50, alpha=0.8, edgecolors='black', label='Training Data')
+         ax.set_xlabel('x')
+         ax.set_ylabel('t')
+         ax.set_ylim(-1.5, 1.5)
+         ax.legend()
+         ax.grid(True, alpha=0.3)
+         ax.set_title(f'M = {polynomial_degree}, λ = {regularization:.1e}')
+         st.pyplot(fig)
+
+         # Metrics
+         train_pred = X_train @ w_reg
+         train_rmse = np.sqrt(np.mean((st.session_state.reg_y_train - train_pred)**2))
+         weight_norm = np.linalg.norm(w_reg)
+
+         col3, col4 = st.columns(2)
+         with col3:
+             st.metric("Training RMSE", f"{train_rmse:.4f}")
+         with col4:
+             st.metric("||w||", f"{weight_norm:.2f}")
+
+ # Add information footer
+ st.markdown("---")
+ st.markdown("### Key Concepts Demonstrated:")
+ st.markdown("""
+ - **Gaussian Distribution**: Fundamental probability distribution with mean μ and variance σ²
+ - **Maximum Likelihood Bias**: ML estimation systematically underestimates the variance by a factor of (N-1)/N
+ - **Overfitting**: High-degree polynomials fit the training data perfectly but generalize poorly
+ - **Probabilistic Curve Fitting**: Views regression as estimating the conditional distribution p(t|x)
+ - **Regularization**: Adding the penalty term α||w||² prevents overfitting (equivalent to MAP estimation with a Gaussian prior on w)
+ """)