Test and summary sections. Some refactoring/updates
probability/14_binomial_distribution.py
CHANGED
@@ -13,7 +13,7 @@
 
 import marimo
 
-__generated_with = "0.11.
+__generated_with = "0.11.24"
 app = marimo.App(width="medium", app_title="Binomial Distribution")
 
 
@@ -248,7 +248,7 @@ def _(mo):
 
        There is an easy way to calculate the expectation of a binomial and a hard way. The easy way is to leverage the fact that a binomial is the sum of Bernoulli indicator random variables $X = \sum_{i=1}^{n} Y_i$ where $Y_i$ is an indicator of whether the $i$-th experiment was a success: $Y_i \sim \text{Bernoulli}(p)$.
 
-       Since the expectation of the sum of random variables is the sum of expectations, we can add the expectation, $E[Y_i] = p$, of each of the Bernoullis:
+       Since the [expectation of the sum](http://marimo.app/https://github.com/marimo-team/learn/blob/main/probability/11_expectation.py) of random variables is the sum of expectations, we can add the expectation, $E[Y_i] = p$, of each of the Bernoullis:
 
        \begin{align}
        E[X] &= E\Big[\sum_{i=1}^{n} Y_i\Big] && \text{Since }X = \sum_{i=1}^{n} Y_i \\
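The identity $E[X] = np$ that this hunk's cell derives is easy to sanity-check outside the notebook; a minimal sketch (not part of the diff), assuming only `numpy` and `scipy`:

```python
import numpy as np
from scipy import stats

n, p = 10, 0.3  # example parameters, not taken from the notebook
rng = np.random.default_rng(0)

# Build X as a sum of n Bernoulli(p) indicators, replicated many times
bernoulli_sums = rng.binomial(1, p, size=(100_000, n)).sum(axis=1)

print(bernoulli_sums.mean())   # close to n * p = 3.0
print(stats.binom.mean(n, p))  # exactly n * p = 3.0
```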
@@ -285,7 +285,7 @@ def _(mo):
 
 
 @app.cell
-def _(stats, x):
+def _(stats):
     # define variables for x, n, and p
     _n = 5  # Integer value for n
     _p = 0.6
@@ -295,7 +295,7 @@ def _(stats, x):
     p_x = stats.binom.pmf(_x, _n, _p)
 
     # use the probability for future work
-    print(f'P(X = {
+    print(f'P(X = {_x}) = {p_x:.4f}')
     return (p_x,)
 
 
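For checking the printed value without running the notebook, the same PMF call works standalone; `x = 2` below is an assumed example value (the notebook computes `_x` in lines this diff elides):

```python
from scipy import stats

n, p = 5, 0.6  # mirrors _n and _p in the cell above
x = 2          # assumed example value; the cell derives _x elsewhere

p_x = stats.binom.pmf(x, n, p)    # P(X = 2) for X ~ Bin(5, 0.6)
print(f'P(X = {x}) = {p_x:.4f}')  # 0.2304 = C(5,2) * 0.6**2 * 0.4**3
```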
@@ -306,8 +306,7 @@ def _(mo):
 
 
 @app.cell
-def _(n, np, p, plt, stats):
-    # Ensure n is an integer to prevent TypeError
+def _(n, p, stats):
     n_int = int(n)
 
     # samples from the binomial distribution
@@ -315,7 +314,11 @@ def _(n, np, p, plt, stats):
 
     # Print the samples
     print(samples)
+    return n_int, samples
 
+
+@app.cell(hide_code=True)
+def _(n_int, np, p, plt, samples, stats):
     # Plot histogram of samples
     plt.figure(figsize=(10, 5))
     plt.hist(samples, bins=np.arange(-0.5, n_int+1.5, 1), alpha=0.7, color='royalblue',
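This refactor splits sampling and plotting into separate cells so the samples can be reused downstream. A self-contained sketch of the same flow, with assumed values for `n` and `p` (the notebook reads them from UI elements):

```python
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

n_int, p = 10, 0.5  # assumed values, not from the notebook
samples = stats.binom.rvs(n_int, p, size=1000)  # 1000 draws from Bin(10, 0.5)

plt.figure(figsize=(10, 5))
# Integer-centered bins, matching the notebook's np.arange(-0.5, n_int + 1.5, 1)
plt.hist(samples, bins=np.arange(-0.5, n_int + 1.5, 1), alpha=0.7, color='royalblue')
plt.xlabel('Number of successes')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
```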
@@ -343,7 +346,7 @@ def _(n, np, p, plt, stats):
 
     plt.tight_layout()
     plt.gca()
-    return
+    return pmf_values, x_values
 
 
 @app.cell(hide_code=True)
@@ -376,7 +379,105 @@ def _(mo):
     return
 
 
-@app.cell
+@app.cell(hide_code=True)
+def _(alpha_slider, chart, equation, mo, mu_slider):
+    mo.vstack(
+        [
+            mo.md(f"## Negative Binomial Distribution (Poisson + Overdispersion)\n{equation}"),
+            mo.hstack([mu_slider, alpha_slider], justify="start"),
+            chart,
+        ], justify='space-around'
+    ).center()
+    return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(
+        r"""
+        ## 🤔 Test Your Understanding
+        Pick which of these statements about binomial distributions you think are correct:
+
+        /// details | The variance of a binomial distribution is always equal to its mean
+        ❌ Incorrect! The variance is $np(1-p)$ while the mean is $np$. They're only equal when $p=0$ (a degenerate case in which both are zero).
+        ///
+
+        /// details | If $X \sim \text{Bin}(n, p)$ and $Y \sim \text{Bin}(n, 1-p)$, then $X$ and $Y$ have the same variance
+        ✅ Correct! $\text{Var}(X) = np(1-p)$ and $\text{Var}(Y) = n(1-p)p$, which are the same.
+        ///
+
+        /// details | As the number of trials increases, the binomial distribution approaches a normal distribution
+        ✅ Correct! For large $n$, the binomial distribution can be approximated by a normal distribution with the same mean and variance.
+        ///
+
+        /// details | The PMF of a binomial distribution is symmetric when $p = 0.5$
+        ✅ Correct! When $p = 0.5$, the PMF is symmetric around $n/2$.
+        ///
+
+        /// details | The sum of two independent binomial random variables with the same $p$ is also a binomial random variable
+        ✅ Correct! If $X \sim \text{Bin}(n_1, p)$ and $Y \sim \text{Bin}(n_2, p)$ are independent, then $X + Y \sim \text{Bin}(n_1 + n_2, p)$.
+        ///
+
+        /// details | The maximum value of the PMF for $\text{Bin}(n,p)$ always occurs at $k = np$
+        ❌ Incorrect! The mode (maximum of the PMF) is $\lfloor (n+1)p \rfloor$ in general; when $(n+1)p$ is an integer, both $(n+1)p$ and $(n+1)p-1$ are modes.
+        ///
+        """
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(
+        r"""
+        ## Summary
+
+        We've explored the binomial distribution, and it's one of the most practical probability distributions you'll encounter: any time you're counting successes in a fixed number of trials (like the coin flips we discussed), this is your go-to distribution.
+
+        The expectation is simply $np$: a clean, intuitive formula. And in the visualization earlier, adjusting the parameters showed how the distribution's shape changes, becoming more symmetric as $n$ increases.
+
+        The key things to take away:
+
+        - The binomial distribution models the number of successes in $n$ independent trials, each with probability $p$ of success
+
+        - Its PMF is given by the formula $P(X=k) = {n \choose k}p^k(1-p)^{n-k}$, which lets us calculate exactly how likely any specific number of successes is
+
+        - The expected value is $E[X] = np$ and the variance is $Var(X) = np(1-p)$
+
+        - It's related to other distributions: it's a sum of Bernoulli random variables, and it connects to both the negative binomial and Poisson distributions
+
+        - In Python, `scipy.stats.binom` makes working with binomial distributions straightforward: you can generate random samples and calculate probabilities with just a few lines of code
+
+        You'll see the binomial distribution pop up everywhere, from computer science to quality control, epidemiology, and data science. Any time you have binary outcomes over multiple trials, this distribution has you covered.
+        """
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""Appendix code (helper functions, variables, etc.):""")
+    return
+
+
+@app.cell
+def _():
+    import marimo as mo
+    return (mo,)
+
+
+@app.cell(hide_code=True)
+def _():
+    import numpy as np
+    import matplotlib.pyplot as plt
+    import scipy.stats as stats
+    import pandas as pd
+    import altair as alt
+    from wigglystuff import TangleSlider
+    return TangleSlider, alt, np, pd, plt, stats
+
+
+@app.cell(hide_code=True)
 def _(mo):
     alpha_slider = mo.ui.slider(
         value=0.1,
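Two of the quiz answers above lend themselves to a quick numerical check; a minimal sketch (not part of the diff) for the sum-of-binomials property and the mode formula:

```python
import numpy as np
from scipy import stats

# Sum property: Bin(3, 0.4) + Bin(7, 0.4) behaves like Bin(10, 0.4)
rng = np.random.default_rng(1)
sums = rng.binomial(3, 0.4, 200_000) + rng.binomial(7, 0.4, 200_000)
print(sums.mean(), stats.binom.mean(10, 0.4))  # both close to 4.0

# Mode: argmax of the PMF vs floor((n+1)p); here (n+1)p = 4.4, so mode = 4
n, p = 10, 0.4
k = np.arange(n + 1)
print(k[np.argmax(stats.binom.pmf(k, n, p))], int(np.floor((n + 1) * p)))
```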
@@ -390,7 +491,7 @@ def _(mo):
     return alpha_slider, mu_slider
 
 
-@app.cell
+@app.cell(hide_code=True)
 def _():
     equation = """
     $$
@@ -404,7 +505,7 @@ def _():
     return (equation,)
 
 
-@app.cell
+@app.cell(hide_code=True)
 def _(alpha_slider, alt, mu_slider, np, pd, stats):
     mu = mu_slider.value
     alpha = alpha_slider.value
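This cell builds the negative binomial chart from `mu` and `alpha`. The conversion it presumably relies on isn't shown in the diff, but a common (mu, alpha) → scipy `nbinom` mapping can be sketched as follows; `mu = 5.0` is an assumed slider value, `alpha = 0.1` matches the slider default, and the parameterization assumed here gives variance $\mu + \alpha\mu^2$:

```python
import numpy as np
from scipy import stats

mu, alpha = 5.0, 0.1  # mu assumed; alpha = 0.1 matches the slider default

# Assumed (mu, alpha) -> (n, p) mapping for scipy's nbinom:
# dispersion n = 1/alpha, success probability p = n / (n + mu)
n = 1.0 / alpha
p = n / (n + mu)

x = np.arange(0, 16)
print(stats.nbinom.pmf(x, n, p).round(4))  # heavier tail (overdispersed)...
print(stats.poisson.pmf(x, mu).round(4))   # ...than Poisson with the same mean
```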
@@ -440,66 +541,5 @@ def _(alpha_slider, alt, mu_slider, np, pd, stats):
     return alpha, base, chart, chart_nb, chart_poi, df, mu, n, p, r1k, x
 
 
-@app.cell
-def _(alpha_slider, chart, equation, mo, mu_slider):
-    mo.vstack(
-        [
-            mo.md(f"## Negative Binomial Distribution (Poisson + Overdispersion)\n{equation}"),
-            mo.hstack([mu_slider, alpha_slider], justify="start"),
-            chart,
-        ], justify='space-around'
-    ).center()
-    return
-
-
-@app.cell(hide_code=True)
-def _(mo):
-    mo.md(
-        r"""
-        ## Key Takeaways
-
-        The binomial distribution is a fundamental discrete probability distribution that models the number of successes in a fixed number of independent trials, each with the same probability of success.
-
-        Here's what we've learned:
-
-        1. **Binomial Distribution Definition**: It models the number of successes in $n$ independent trials, each with probability $p$ of success.
-
-        2. **PMF Formula**: $P(X=k) = {n \choose k}p^k(1-p)^{n-k}$, which calculates the probability of getting exactly $k$ successes.
-
-        3. **Key Properties**:
-            - Expected value: $E[X] = np$
-            - Variance: $Var(X) = np(1-p)$
-
-        4. **Relation to Other Distributions**:
-            - Sum of Bernoulli random variables
-            - Related to negative binomial and Poisson distributions
-
-        5. **Practical Usage**:
-            - Easily model in Python using `scipy.stats.binom`
-            - Generate random samples and calculate probabilities
-
-        The binomial distribution is widely used in many fields including computer science, quality control, epidemiology, and data science to model scenarios with binary outcomes over multiple trials.
-        """
-    )
-    return
-
-
-@app.cell
-def _():
-    import marimo as mo
-    return (mo,)
-
-
-@app.cell
-def _():
-    import numpy as np
-    import matplotlib.pyplot as plt
-    import scipy.stats as stats
-    import pandas as pd
-    import altair as alt
-    from wigglystuff import TangleSlider
-    return TangleSlider, alt, np, pd, plt, stats
-
-
 if __name__ == "__main__":
     app.run()
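The deleted Key Takeaways cell (its content now lives in the new Summary cell) listed $E[X] = np$ and $Var(X) = np(1-p)$; both are one scipy call away if a reviewer wants to confirm:

```python
from scipy import stats

n, p = 5, 0.6  # example values matching the PMF cell earlier
mean, var = stats.binom.stats(n, p, moments='mv')

print(mean, n * p)            # 3.0  3.0
print(var, n * p * (1 - p))   # 1.2  1.2
```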