updates
Browse files
app.py
CHANGED
@@ -32,10 +32,16 @@ def __(mo):
|
|
32 |
r"""
|
33 |
# Visualizing text embeddings using MotherDuck and marimo
|
34 |
|
35 |
-
> Text embeddings have become a crucial tool in AI/ML applications, allowing us to convert text into numerical vectors that capture semantic meaning. These vectors are often used for semantic search, but in this blog post, we'll explore how to visualize and explore text embeddings interactively using MotherDuck and marimo.
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
!!! Info
|
38 |
-
**This marimo application
|
39 |
"""
|
40 |
)
|
41 |
return
|
@@ -101,6 +107,38 @@ def __(demo_with_embeddings, mo, my_db):
|
|
101 |
return (embeddings,)
|
102 |
|
103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
@app.cell
|
105 |
def __(np):
|
106 |
def umap_reduce(np_array, metric="cosine"):
|
@@ -156,7 +194,7 @@ def __(mo):
|
|
156 |
label="Cluster Size (min, max)",
|
157 |
)
|
158 |
metric_dropdown = mo.ui.dropdown(
|
159 |
-
["cosine", "euclidean", "manhattan"
|
160 |
value="cosine",
|
161 |
label="Distance Metric",
|
162 |
)
|
@@ -185,7 +223,7 @@ def __(
|
|
185 |
umap_reduce,
|
186 |
):
|
187 |
with mo.status.spinner("Clustering points...") as _s:
|
188 |
-
import numba
|
189 |
|
190 |
embeddings_array = embeddings["text_embedding"].to_numpy()
|
191 |
hdb_labels = cluster_points(
|
|
|
32 |
r"""
|
33 |
# Visualizing text embeddings using MotherDuck and marimo
|
34 |
|
35 |
+
> Text embeddings have become a crucial tool in AI/ML applications, allowing us to convert text into numerical vectors that capture semantic meaning. These vectors are often used for semantic search, but in ~~this blog post~~ this marimo app, we'll explore how to visualize and explore text embeddings interactively using MotherDuck and marimo.
|
36 |
+
|
37 |
+
This app lets you visualize and explore text embeddings from Hacker News posts about **databases**. You can:
|
38 |
+
|
39 |
+
- See how different posts cluster together based on semantic similarity
|
40 |
+
- Adjust clustering parameters in real-time
|
41 |
+
- Explore relationships between posts through an interactive visualization
|
42 |
|
43 |
!!! Info
|
44 |
+
**This marimo application is based on [this blog](https://motherduck.com/blog/MotherDuck-Visualize-Embeddings-Marimo/).** We recommend looking through the blog first.
|
45 |
"""
|
46 |
)
|
47 |
return
|
|
|
107 |
return (embeddings,)
|
108 |
|
109 |
|
110 |
+
@app.cell
|
111 |
+
def __(mo):
|
112 |
+
mo.md(
|
113 |
+
"""
|
114 |
+
## Making Sense of High-Dimensional Data
|
115 |
+
|
116 |
+
Text embeddings typically have hundreds of dimensions (512 in our case), making them impossible to visualize directly. We'll use two techniques to make them interpretable:
|
117 |
+
|
118 |
+
1. **Dimensionality Reduction**: UMAP will convert our 512D vectors into 2D points while preserving relationships between texts
|
119 |
+
2. **Clustering**: HDBSCAN will group similar texts together
|
120 |
+
"""
|
121 |
+
)
|
122 |
+
return
|
123 |
+
|
124 |
+
|
125 |
+
@app.cell(hide_code=True)
|
126 |
+
def __(cluster_points, mo, umap_reduce):
|
127 |
+
def md_help(cls):
|
128 |
+
import inspect
|
129 |
+
|
130 |
+
return f"def {cls.__name__} {inspect.signature(cls)}:\n {cls.__doc__}"
|
131 |
+
|
132 |
+
|
133 |
+
mo.accordion(
|
134 |
+
{
|
135 |
+
"`umap_reduce`": md_help(umap_reduce),
|
136 |
+
"`cluster_points`": md_help(cluster_points),
|
137 |
+
}
|
138 |
+
)
|
139 |
+
return (md_help,)
|
140 |
+
|
141 |
+
|
142 |
@app.cell
|
143 |
def __(np):
|
144 |
def umap_reduce(np_array, metric="cosine"):
|
|
|
194 |
label="Cluster Size (min, max)",
|
195 |
)
|
196 |
metric_dropdown = mo.ui.dropdown(
|
197 |
+
["cosine", "euclidean", "manhattan"],
|
198 |
value="cosine",
|
199 |
label="Distance Metric",
|
200 |
)
|
|
|
223 |
umap_reduce,
|
224 |
):
|
225 |
with mo.status.spinner("Clustering points...") as _s:
|
226 |
+
import numba
|
227 |
|
228 |
embeddings_array = embeddings["text_embedding"].to_numpy()
|
229 |
hdb_labels = cluster_points(
|