mylessss committed on
Commit
1505be8
·
1 Parent(s): d86024f
Files changed (1) hide show
  1. app.py +42 -4
app.py CHANGED
@@ -32,10 +32,16 @@ def __(mo):
32
  r"""
33
  # Visualizing text embeddings using MotherDuck and marimo
34
 
35
- > Text embeddings have become a crucial tool in AI/ML applications, allowing us to convert text into numerical vectors that capture semantic meaning. These vectors are often used for semantic search, but in this blog post, we'll explore how to visualize and explore text embeddings interactively using MotherDuck and marimo.
 
 
 
 
 
 
36
 
37
  !!! Info
38
- **This marimo application is the result [this blog](https://motherduck.com/blog/MotherDuck-Visualize-Embeddings-Marimo/).** It is recommend to go through the blog first.
39
  """
40
  )
41
  return
@@ -101,6 +107,38 @@ def __(demo_with_embeddings, mo, my_db):
101
  return (embeddings,)
102
 
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  @app.cell
105
  def __(np):
106
  def umap_reduce(np_array, metric="cosine"):
@@ -156,7 +194,7 @@ def __(mo):
156
  label="Cluster Size (min, max)",
157
  )
158
  metric_dropdown = mo.ui.dropdown(
159
- ["cosine", "euclidean", "manhattan", "mahalanobis"],
160
  value="cosine",
161
  label="Distance Metric",
162
  )
@@ -185,7 +223,7 @@ def __(
185
  umap_reduce,
186
  ):
187
  with mo.status.spinner("Clustering points...") as _s:
188
- import numba # <- FYI, this module takes a while to load, be patient
189
 
190
  embeddings_array = embeddings["text_embedding"].to_numpy()
191
  hdb_labels = cluster_points(
 
32
  r"""
33
  # Visualizing text embeddings using MotherDuck and marimo
34
 
35
+ > Text embeddings have become a crucial tool in AI/ML applications, allowing us to convert text into numerical vectors that capture semantic meaning. These vectors are often used for semantic search, but in ~~this blog post~~ this marimo app, we'll explore how to visualize and explore text embeddings interactively using MotherDuck and marimo.
36
+
37
+ This app lets you visualize and explore text embeddings from Hacker News posts about **databases**. You can:
38
+
39
+ - See how different posts cluster together based on semantic similarity
40
+ - Adjust clustering parameters in real-time
41
+ - Explore relationships between posts through an interactive visualization
42
 
43
  !!! Info
44
+ **This marimo application is based on [this blog](https://motherduck.com/blog/MotherDuck-Visualize-Embeddings-Marimo/).** We recommend looking through the blog first.
45
  """
46
  )
47
  return
 
107
  return (embeddings,)
108
 
109
 
110
+ @app.cell
111
+ def __(mo):
112
+ mo.md(
113
+ """
114
+ ## Making Sense of High-Dimensional Data
115
+
116
+ Text embeddings typically have hundreds of dimensions (512 in our case), making them impossible to visualize directly. We'll use two techniques to make them interpretable:
117
+
118
+ 1. **Dimensionality Reduction**: UMAP will convert our 512D vectors into 2D points while preserving relationships between texts
119
+ 2. **Clustering**: HDBSCAN will group similar texts together
120
+ """
121
+ )
122
+ return
123
+
124
+
125
+ @app.cell(hide_code=True)
126
+ def __(cluster_points, mo, umap_reduce):
127
+ def md_help(cls):
128
+ import inspect
129
+
130
+ return f"def {cls.__name__} {inspect.signature(cls)}:\n {cls.__doc__}"
131
+
132
+
133
+ mo.accordion(
134
+ {
135
+ "`umap_reduce`": md_help(umap_reduce),
136
+ "`cluster_points`": md_help(cluster_points),
137
+ }
138
+ )
139
+ return (md_help,)
140
+
141
+
142
  @app.cell
143
  def __(np):
144
  def umap_reduce(np_array, metric="cosine"):
 
194
  label="Cluster Size (min, max)",
195
  )
196
  metric_dropdown = mo.ui.dropdown(
197
+ ["cosine", "euclidean", "manhattan"],
198
  value="cosine",
199
  label="Distance Metric",
200
  )
 
223
  umap_reduce,
224
  ):
225
  with mo.status.spinner("Clustering points...") as _s:
226
+ import numba
227
 
228
  embeddings_array = embeddings["text_embedding"].to_numpy()
229
  hdb_labels = cluster_points(