updates
Browse files- README.md +7 -6
- app.py +189 -366
- requirements.txt +9 -4
README.md
CHANGED
@@ -1,13 +1,14 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: docker
|
7 |
-
pinned:
|
8 |
license: mit
|
9 |
-
short_description:
|
10 |
---
|
11 |
|
12 |
Check out marimo at <https://github.com/marimo-team/marimo>
|
|
|
13 |
Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>
|
|
|
1 |
---
|
2 |
+
title: motherduck embedding explorer
|
3 |
+
emoji: 🦆
|
4 |
+
colorFrom: purple
|
5 |
+
colorTo: indigo
|
6 |
sdk: docker
|
7 |
+
pinned: false
|
8 |
license: mit
|
9 |
+
short_description: motherduck embedding explorer
|
10 |
---
|
11 |
|
12 |
Check out marimo at <https://github.com/marimo-team/marimo>
|
13 |
+
Check out motherduck at <https://motherduck.com>
|
14 |
Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>
|
app.py
CHANGED
@@ -1,469 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import marimo
|
2 |
|
3 |
-
__generated_with = "0.9.
|
4 |
-
app = marimo.App()
|
5 |
|
6 |
|
7 |
@app.cell
|
8 |
def __():
|
9 |
import marimo as mo
|
10 |
|
11 |
-
mo.md("# Welcome to marimo! 🌊🍃")
|
12 |
return (mo,)
|
13 |
|
14 |
|
15 |
-
@app.cell
|
16 |
def __(mo):
|
17 |
-
slider = mo.ui.slider(1, 22)
|
18 |
-
return (slider,)
|
19 |
-
|
20 |
-
|
21 |
-
@app.cell
|
22 |
-
def __(mo, slider):
|
23 |
mo.md(
|
24 |
-
|
25 |
-
|
26 |
|
27 |
-
|
28 |
-
automatically** when you modify them or
|
29 |
-
interact with UI elements, like this slider: {slider}.
|
30 |
|
31 |
-
|
32 |
"""
|
33 |
)
|
34 |
return
|
35 |
|
36 |
|
37 |
-
@app.cell(hide_code=True)
|
38 |
-
def __(mo):
|
39 |
-
mo.accordion(
|
40 |
-
{
|
41 |
-
"Tip: disabling automatic execution": mo.md(
|
42 |
-
rf"""
|
43 |
-
marimo lets you disable automatic execution: just go into the
|
44 |
-
notebook settings and set
|
45 |
-
|
46 |
-
"Runtime > On Cell Change" to "lazy".
|
47 |
-
|
48 |
-
When the runtime is lazy, after running a cell, marimo marks its
|
49 |
-
descendants as stale instead of automatically running them. The
|
50 |
-
lazy runtime puts you in control over when cells are run, while
|
51 |
-
still giving guarantees about the notebook state.
|
52 |
-
"""
|
53 |
-
)
|
54 |
-
}
|
55 |
-
)
|
56 |
-
return
|
57 |
-
|
58 |
-
|
59 |
-
@app.cell(hide_code=True)
|
60 |
-
def __(mo):
|
61 |
-
mo.md(
|
62 |
-
"""
|
63 |
-
Tip: This is a tutorial notebook. You can create your own notebooks
|
64 |
-
by entering `marimo edit` at the command line.
|
65 |
-
"""
|
66 |
-
).callout()
|
67 |
-
return
|
68 |
-
|
69 |
-
|
70 |
@app.cell(hide_code=True)
|
71 |
def __(mo):
|
72 |
mo.md(
|
73 |
"""
|
74 |
-
##
|
75 |
-
|
76 |
-
A marimo notebook is made up of small blocks of Python code called
|
77 |
-
cells.
|
78 |
|
79 |
-
|
80 |
-
a cell that defines a global variable is run, marimo
|
81 |
-
**automatically runs** all cells that reference that variable.
|
82 |
|
83 |
-
|
84 |
-
|
85 |
-
|
|
|
86 |
"""
|
87 |
)
|
88 |
return
|
89 |
|
90 |
|
91 |
-
@app.cell(hide_code=True)
|
92 |
-
def __(changed, mo):
|
93 |
-
(
|
94 |
-
mo.md(
|
95 |
-
f"""
|
96 |
-
**✨ Nice!** The value of `changed` is now {changed}.
|
97 |
-
|
98 |
-
When you updated the value of the variable `changed`, marimo
|
99 |
-
**reacted** by running this cell automatically, because this cell
|
100 |
-
references the global variable `changed`.
|
101 |
-
|
102 |
-
Reactivity ensures that your notebook state is always
|
103 |
-
consistent, which is crucial for doing good science; it's also what
|
104 |
-
enables marimo notebooks to double as tools and apps.
|
105 |
-
"""
|
106 |
-
)
|
107 |
-
if changed
|
108 |
-
else mo.md(
|
109 |
-
"""
|
110 |
-
**🌊 See it in action.** In the next cell, change the value of the
|
111 |
-
variable `changed` to `True`, then click the run button.
|
112 |
-
"""
|
113 |
-
)
|
114 |
-
)
|
115 |
-
return
|
116 |
-
|
117 |
-
|
118 |
@app.cell
|
119 |
-
def __():
|
120 |
-
changed = False
|
121 |
-
return (changed,)
|
122 |
-
|
123 |
-
|
124 |
-
@app.cell(hide_code=True)
|
125 |
def __(mo):
|
126 |
-
mo.
|
127 |
-
{
|
128 |
-
"Tip: execution order": (
|
129 |
-
"""
|
130 |
-
The order of cells on the page has no bearing on
|
131 |
-
the order in which cells are executed: marimo knows that a cell
|
132 |
-
reading a variable must run after the cell that defines it. This
|
133 |
-
frees you to organize your code in the way that makes the most
|
134 |
-
sense for you.
|
135 |
-
"""
|
136 |
-
)
|
137 |
-
}
|
138 |
-
)
|
139 |
-
return
|
140 |
-
|
141 |
-
|
142 |
-
@app.cell(hide_code=True)
|
143 |
-
def __(mo):
|
144 |
-
mo.md(
|
145 |
"""
|
146 |
-
|
147 |
-
constraint on how names appear in cells: no two cells may define the same
|
148 |
-
variable.
|
149 |
"""
|
150 |
)
|
151 |
-
return
|
152 |
|
153 |
|
154 |
-
@app.cell
|
155 |
-
def __(mo):
|
156 |
-
mo.accordion(
|
157 |
-
{
|
158 |
-
"Tip: encapsulation": (
|
159 |
-
"""
|
160 |
-
By encapsulating logic in functions, classes, or Python modules,
|
161 |
-
you can minimize the number of global variables in your notebook.
|
162 |
-
"""
|
163 |
-
)
|
164 |
-
}
|
165 |
-
)
|
166 |
-
return
|
167 |
-
|
168 |
-
|
169 |
-
@app.cell(hide_code=True)
|
170 |
-
def __(mo):
|
171 |
-
mo.accordion(
|
172 |
-
{
|
173 |
-
"Tip: private variables": (
|
174 |
-
"""
|
175 |
-
Variables prefixed with an underscore are "private" to a cell, so
|
176 |
-
they can be defined by multiple cells.
|
177 |
-
"""
|
178 |
-
)
|
179 |
-
}
|
180 |
-
)
|
181 |
-
return
|
182 |
-
|
183 |
-
|
184 |
-
@app.cell(hide_code=True)
|
185 |
def __(mo):
|
186 |
-
mo.
|
187 |
"""
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
`marimo.ui`.
|
197 |
"""
|
198 |
)
|
199 |
return
|
200 |
|
201 |
|
202 |
@app.cell
|
203 |
-
def __(mo):
|
204 |
-
mo.
|
205 |
-
|
206 |
-
|
|
|
|
|
|
|
|
|
207 |
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
return (
|
212 |
|
213 |
|
214 |
@app.cell
|
215 |
-
def __(
|
216 |
-
|
217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
219 |
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
|
|
224 |
|
|
|
|
|
|
|
225 |
|
226 |
-
|
227 |
-
def __(icon, mo, repetitions):
|
228 |
-
mo.md("# " + icon.value * repetitions.value)
|
229 |
-
return
|
230 |
|
231 |
|
232 |
-
@app.cell
|
233 |
def __(mo):
|
234 |
-
mo.
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
The Python files generated by marimo are:
|
243 |
-
|
244 |
-
- easily versioned with git, yielding minimal diffs
|
245 |
-
- legible for both humans and machines
|
246 |
-
- formattable using your tool of choice,
|
247 |
-
- usable as Python scripts, with UI elements taking their default
|
248 |
-
values, and
|
249 |
-
- importable by other modules (more on that in the future).
|
250 |
-
"""
|
251 |
)
|
252 |
-
|
|
|
|
|
|
|
|
|
|
|
253 |
|
254 |
|
255 |
-
@app.cell
|
256 |
def __(mo):
|
257 |
mo.md(
|
258 |
-
"""
|
259 |
-
##
|
260 |
-
|
261 |
-
marimo notebooks can double as apps. Click the app window icon in the
|
262 |
-
bottom-right to see this notebook in "app view."
|
263 |
|
264 |
-
|
265 |
-
Of course, you can use marimo just to level-up your
|
266 |
-
notebooking, without ever making apps.
|
267 |
"""
|
268 |
)
|
269 |
return
|
270 |
|
271 |
|
272 |
-
@app.cell
|
273 |
-
def __(
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
|
|
|
|
|
|
289 |
|
290 |
-
```
|
291 |
-
marimo run notebook.py
|
292 |
-
```
|
293 |
|
294 |
-
|
295 |
-
|
|
|
|
|
296 |
|
297 |
-
**Convert a Jupyter notebook.** Convert a Jupyter notebook to a marimo
|
298 |
-
notebook using `marimo convert`:
|
299 |
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
|
306 |
-
- `dataflow`: more on marimo's automatic execution
|
307 |
-
- `ui`: how to use UI elements
|
308 |
-
- `markdown`: how to write markdown, with interpolated values and
|
309 |
-
LaTeX
|
310 |
-
- `plots`: how plotting works in marimo
|
311 |
-
- `sql`: how to use SQL
|
312 |
-
- `layout`: layout elements in marimo
|
313 |
-
- `fileformat`: how marimo's file format works
|
314 |
-
- `markdown-format`: for using `.md` files in marimo
|
315 |
-
- `for-jupyter-users`: if you are coming from Jupyter
|
316 |
|
317 |
-
|
|
|
|
|
|
|
|
|
|
|
318 |
|
319 |
-
```
|
320 |
-
marimo tutorial dataflow
|
321 |
-
```
|
322 |
|
323 |
-
|
324 |
-
|
325 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
326 |
)
|
327 |
-
|
|
|
|
|
328 |
|
329 |
|
330 |
@app.cell(hide_code=True)
|
331 |
def __(mo):
|
332 |
mo.md(
|
333 |
-
"""
|
334 |
-
##
|
335 |
|
336 |
-
|
337 |
"""
|
338 |
)
|
339 |
return
|
340 |
|
341 |
|
342 |
@app.cell
|
343 |
-
def __(
|
344 |
-
|
345 |
return
|
346 |
|
347 |
|
348 |
-
@app.cell
|
349 |
-
def __(
|
350 |
-
|
351 |
return
|
352 |
|
353 |
|
354 |
-
@app.cell
|
355 |
def __(mo):
|
356 |
-
|
357 |
-
|
358 |
-
The name "marimo" is a reference to a type of algae that, under
|
359 |
-
the right conditions, clumps together to form a small sphere
|
360 |
-
called a "marimo moss ball". Made of just strands of algae, these
|
361 |
-
beloved assemblages are greater than the sum of their parts.
|
362 |
-
"""
|
363 |
-
)
|
364 |
return
|
365 |
|
366 |
|
367 |
-
@app.cell
|
368 |
def __():
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
1. _Run a cell_ by clicking the play ( ▷ ) button on the top
|
386 |
-
right of a cell, or by inputting `Ctrl/Cmd+Enter`.
|
387 |
-
|
388 |
-
2. _Run a stale cell_ by clicking the yellow run button on the
|
389 |
-
right of the cell, or by inputting `Ctrl/Cmd+Enter`. A cell is
|
390 |
-
stale when its code has been modified but not run.
|
391 |
-
|
392 |
-
3. _Run all stale cells_ by clicking the play ( ▷ ) button on
|
393 |
-
the bottom right of the screen, or input `Ctrl/Cmd+Shift+r`.
|
394 |
-
"""
|
395 |
-
),
|
396 |
-
"Console Output": (
|
397 |
-
"""
|
398 |
-
Console output (e.g., `print()` statements) is shown below a
|
399 |
-
cell.
|
400 |
-
"""
|
401 |
-
),
|
402 |
-
"Creating, Moving, and Deleting Cells": (
|
403 |
-
"""
|
404 |
-
1. _Create_ a new cell above or below a given one by clicking
|
405 |
-
the plus button to the left of the cell, which appears on
|
406 |
-
mouse hover.
|
407 |
-
|
408 |
-
2. _Move_ a cell up or down by dragging on the handle to the
|
409 |
-
right of the cell, which appears on mouse hover.
|
410 |
-
|
411 |
-
3. _Delete_ a cell by clicking the trash bin icon. Bring it
|
412 |
-
back by clicking the undo button on the bottom right of the
|
413 |
-
screen, or with `Ctrl/Cmd+Shift+z`.
|
414 |
-
"""
|
415 |
-
),
|
416 |
-
"Disabling Automatic Execution": (
|
417 |
-
"""
|
418 |
-
Via the notebook settings (gear icon) or footer panel, you
|
419 |
-
can disable automatic execution. This is helpful when
|
420 |
-
working with expensive notebooks or notebooks that have
|
421 |
-
side-effects like database transactions.
|
422 |
-
"""
|
423 |
-
),
|
424 |
-
"Disabling Cells": (
|
425 |
-
"""
|
426 |
-
You can disable a cell via the cell context menu.
|
427 |
-
marimo will never run a disabled cell or any cells that depend on it.
|
428 |
-
This can help prevent accidental execution of expensive computations
|
429 |
-
when editing a notebook.
|
430 |
-
"""
|
431 |
-
),
|
432 |
-
"Code Folding": (
|
433 |
-
"""
|
434 |
-
You can collapse or fold the code in a cell by clicking the arrow
|
435 |
-
icons in the line number column to the left, or by using keyboard
|
436 |
-
shortcuts.
|
437 |
-
|
438 |
-
Use the command palette (`Ctrl/Cmd+k`) or a keyboard shortcut to
|
439 |
-
quickly fold or unfold all cells.
|
440 |
-
"""
|
441 |
-
),
|
442 |
-
"Code Formatting": (
|
443 |
-
"""
|
444 |
-
If you have [ruff](https://github.com/astral-sh/ruff) installed,
|
445 |
-
you can format a cell with the keyboard shortcut `Ctrl/Cmd+b`.
|
446 |
-
"""
|
447 |
-
),
|
448 |
-
"Command Palette": (
|
449 |
-
"""
|
450 |
-
Use `Ctrl/Cmd+k` to open the command palette.
|
451 |
-
"""
|
452 |
-
),
|
453 |
-
"Keyboard Shortcuts": (
|
454 |
-
"""
|
455 |
-
Open the notebook menu (top-right) or input `Ctrl/Cmd+Shift+h` to
|
456 |
-
view a list of all keyboard shortcuts.
|
457 |
-
"""
|
458 |
-
),
|
459 |
-
"Configuration": (
|
460 |
-
"""
|
461 |
-
Configure the editor by clicking the gears icon near the top-right
|
462 |
-
of the screen.
|
463 |
-
"""
|
464 |
-
),
|
465 |
-
}
|
466 |
-
return (tips,)
|
467 |
|
468 |
|
469 |
if __name__ == "__main__":
|
|
|
1 |
+
# /// script
|
2 |
+
# requires-python = ">=3.12"
|
3 |
+
# dependencies = [
|
4 |
+
# "altair==5.4.1",
|
5 |
+
# "duckdb==1.1.3",
|
6 |
+
# "hdbscan==0.8.39",
|
7 |
+
# "marimo",
|
8 |
+
# "numba==0.60.0",
|
9 |
+
# "numpy==2.0.2",
|
10 |
+
# "polars==1.17.1",
|
11 |
+
# "pyarrow==18.0.0",
|
12 |
+
# "scikit-learn==1.5.2",
|
13 |
+
# "umap-learn==0.5.7",
|
14 |
+
# ]
|
15 |
+
# ///
|
16 |
+
|
17 |
import marimo
|
18 |
|
19 |
+
__generated_with = "0.9.33"
|
20 |
+
app = marimo.App(width="medium")
|
21 |
|
22 |
|
23 |
@app.cell
|
24 |
def __():
|
25 |
import marimo as mo
|
26 |
|
|
|
27 |
return (mo,)
|
28 |
|
29 |
|
30 |
+
@app.cell(hide_code=True)
|
31 |
def __(mo):
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
mo.md(
|
33 |
+
r"""
|
34 |
+
# Visualizing text embeddings using MotherDuck and marimo
|
35 |
|
36 |
+
> Text embeddings have become a crucial tool in AI/ML applications, allowing us to convert text into numerical vectors that capture semantic meaning. These vectors are often used for semantic search, but in this blog post, we'll explore how to visualize and explore text embeddings interactively using MotherDuck and marimo.
|
|
|
|
|
37 |
|
38 |
+
[_Read the full blog here._](https://motherduck.com/blog/MotherDuck-Visualize-Embeddings-Marimo/)
|
39 |
"""
|
40 |
)
|
41 |
return
|
42 |
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
@app.cell(hide_code=True)
|
45 |
def __(mo):
|
46 |
mo.md(
|
47 |
"""
|
48 |
+
## Connecting to MotherDuck and Loading Sample Data
|
|
|
|
|
|
|
49 |
|
50 |
+
This data has already been pre-computed, but you can fork and edit this notebook to run with your own data!
|
|
|
|
|
51 |
|
52 |
+
```sql
|
53 |
+
ATTACH IF NOT EXISTS 'md:my_db'
|
54 |
+
SELECT * FROM my_db.demo_with_embeddings;
|
55 |
+
```
|
56 |
"""
|
57 |
)
|
58 |
return
|
59 |
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
@app.cell
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
def __(mo):
|
63 |
+
_df = mo.sql(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
"""
|
65 |
+
ATTACH IF NOT EXISTS 'md:my_db'
|
|
|
|
|
66 |
"""
|
67 |
)
|
68 |
+
return (my_db,)
|
69 |
|
70 |
|
71 |
+
@app.cell
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
def __(mo):
|
73 |
+
_df = mo.sql(
|
74 |
"""
|
75 |
+
-- Commented out as we have already run the embeddings for showcasing purposes.
|
76 |
+
|
77 |
+
-- CREATE OR REPLACE TABLE my_db.demo_embedding_data AS
|
78 |
+
-- SELECT DISTINCT ON (url) * -- Remove duplicate URLs
|
79 |
+
-- FROM 'hf://datasets/julien040/hacker-news-posts/story.parquet'
|
80 |
+
-- WHERE contains(title, 'database') -- Filter for posts about databases
|
81 |
+
-- AND score > 5 -- Only include popular posts
|
82 |
+
-- LIMIT 50000;
|
|
|
83 |
"""
|
84 |
)
|
85 |
return
|
86 |
|
87 |
|
88 |
@app.cell
|
89 |
+
def __(demo_with_embeddings, mo, my_db):
|
90 |
+
embeddings = mo.sql(
|
91 |
+
f"""
|
92 |
+
-- Commented out as we have already run the embeddings for showcasing purposes.
|
93 |
+
-- CREATE TABLE my_db.demo_with_embeddings AS
|
94 |
+
-- SELECT *, embedding(title) as text_embedding
|
95 |
+
-- FROM my_db.demo_embedding_data
|
96 |
+
-- LIMIT 1500;
|
97 |
|
98 |
+
SELECT title, text_embedding, * EXCLUDE(id, title, text_embedding, comments) FROM my_db.demo_with_embeddings;
|
99 |
+
"""
|
100 |
+
)
|
101 |
+
return (embeddings,)
|
102 |
|
103 |
|
104 |
@app.cell
|
105 |
+
def __(PCA, hdbscan, np, umap):
|
106 |
+
def umap_reduce(np_array, metric="cosine"):
|
107 |
+
"""
|
108 |
+
Reduce the dimensionality of the embeddings to 2D using
|
109 |
+
UMAP algorithm. UMAP preserves both local and global structure
|
110 |
+
of the high-dimensional data.
|
111 |
+
"""
|
112 |
+
reducer = umap.UMAP(
|
113 |
+
n_components=2, # Reduce to 2D for visualization
|
114 |
+
metric=metric, # Default: cosine similarity for text embeddings
|
115 |
+
n_neighbors=80, # Higher values = more global structure
|
116 |
+
min_dist=0.1, # Controls how tightly points cluster
|
117 |
+
)
|
118 |
+
return reducer.fit_transform(np_array)
|
119 |
|
120 |
+
def cluster_points(np_array, min_cluster_size=4, max_cluster_size=50):
|
121 |
+
"""
|
122 |
+
Cluster the embeddings using HDBSCAN algorithm.
|
123 |
+
We first reduce dimensionality to 50D with PCA to speed up clustering,
|
124 |
+
while still preserving most of the important information.
|
125 |
+
"""
|
126 |
+
pca = PCA(n_components=50)
|
127 |
+
np_array = pca.fit_transform(np_array)
|
128 |
|
129 |
+
hdb = hdbscan.HDBSCAN(
|
130 |
+
min_samples=3, # Minimum points to form dense region
|
131 |
+
min_cluster_size=min_cluster_size, # Minimum size of a cluster
|
132 |
+
max_cluster_size=max_cluster_size, # Maximum size of a cluster
|
133 |
+
).fit(np_array)
|
134 |
|
135 |
+
return np.where(
|
136 |
+
hdb.labels_ == -1, "outlier", "cluster_" + hdb.labels_.astype(str)
|
137 |
+
)
|
138 |
|
139 |
+
return cluster_points, umap_reduce
|
|
|
|
|
|
|
140 |
|
141 |
|
142 |
+
@app.cell
|
143 |
def __(mo):
|
144 |
+
cluster_size_slider = mo.ui.range_slider(
|
145 |
+
start=1,
|
146 |
+
stop=80,
|
147 |
+
value=(4, 50),
|
148 |
+
step=1,
|
149 |
+
show_value=True,
|
150 |
+
debounce=True,
|
151 |
+
label="Cluster Size (min, max)",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
)
|
153 |
+
metric_dropdown = mo.ui.dropdown(
|
154 |
+
["cosine", "euclidean", "manhattan", "mahalanobis"],
|
155 |
+
value="cosine",
|
156 |
+
label="Distance Metric",
|
157 |
+
)
|
158 |
+
return cluster_size_slider, metric_dropdown
|
159 |
|
160 |
|
161 |
+
@app.cell
|
162 |
def __(mo):
|
163 |
mo.md(
|
164 |
+
r"""
|
165 |
+
## Processing the Data
|
|
|
|
|
|
|
166 |
|
167 |
+
Now we'll transform our high-dimensional embeddings into something we can visualize, using `umap_reduce` and `cluster_points`. More details on this step [in the blog](https://motherduck.com/blog/MotherDuck-Visualize-Embeddings-Marimo/).
|
|
|
|
|
168 |
"""
|
169 |
)
|
170 |
return
|
171 |
|
172 |
|
173 |
+
@app.cell
|
174 |
+
def __(
|
175 |
+
cluster_points,
|
176 |
+
cluster_size_slider,
|
177 |
+
embeddings,
|
178 |
+
metric_dropdown,
|
179 |
+
mo,
|
180 |
+
umap_reduce,
|
181 |
+
):
|
182 |
+
with mo.status.spinner("Clustering points...") as _s:
|
183 |
+
embeddings_array = embeddings["text_embedding"].to_numpy()
|
184 |
+
hdb_labels = cluster_points(
|
185 |
+
embeddings_array,
|
186 |
+
min_cluster_size=cluster_size_slider.value[0],
|
187 |
+
max_cluster_size=cluster_size_slider.value[1],
|
188 |
+
)
|
189 |
+
_s.update("Reducing dimensionality...")
|
190 |
+
embeddings_2d = umap_reduce(embeddings_array, metric=metric_dropdown.value)
|
191 |
+
mo.show_code()
|
192 |
+
return embeddings_2d, embeddings_array, hdb_labels
|
193 |
|
|
|
|
|
|
|
194 |
|
195 |
+
@app.cell
|
196 |
+
def __(cluster_size_slider, metric_dropdown, mo):
|
197 |
+
mo.hstack([cluster_size_slider, metric_dropdown])
|
198 |
+
return
|
199 |
|
|
|
|
|
200 |
|
201 |
+
@app.cell
|
202 |
+
def __(embeddings, embeddings_2d, hdb_labels, pl):
|
203 |
+
data = embeddings.lazy() # Lazy evaluation for performance
|
204 |
+
data = data.with_columns(
|
205 |
+
text_embedding_2d_1=embeddings_2d[:, 0],
|
206 |
+
text_embedding_2d_2=embeddings_2d[:, 1],
|
207 |
+
cluster=hdb_labels,
|
208 |
+
)
|
209 |
+
data = data.unique(subset=["url"], maintain_order=True) # Remove duplicate URLs
|
210 |
+
data = data.drop(["text_embedding"]) # Drop unused columns
|
211 |
+
data = data.filter(pl.col("cluster") != "outlier") # Filter out outliers
|
212 |
+
data = data.collect() # Collect the data
|
213 |
+
return (data,)
|
214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
215 |
|
216 |
+
@app.cell
|
217 |
+
def __(data):
|
218 |
+
data.select(
|
219 |
+
"title", "cluster", "text_embedding_2d_1", "text_embedding_2d_2", "score"
|
220 |
+
)
|
221 |
+
return
|
222 |
|
|
|
|
|
|
|
223 |
|
224 |
+
@app.cell
|
225 |
+
def __(alt, data, mo):
|
226 |
+
chart = (
|
227 |
+
alt.Chart(data)
|
228 |
+
.mark_point()
|
229 |
+
.encode(
|
230 |
+
x=alt.X("text_embedding_2d_1").scale(zero=False),
|
231 |
+
y=alt.Y("text_embedding_2d_2").scale(zero=False),
|
232 |
+
color="cluster",
|
233 |
+
tooltip=["title", "score", "cluster"],
|
234 |
+
)
|
235 |
)
|
236 |
+
chart = mo.ui.altair_chart(chart)
|
237 |
+
mo.show_code()
|
238 |
+
return (chart,)
|
239 |
|
240 |
|
241 |
@app.cell(hide_code=True)
|
242 |
def __(mo):
|
243 |
mo.md(
|
244 |
+
r"""
|
245 |
+
## Creating an Interactive Visualization
|
246 |
|
247 |
+
We will plot the 2D representation of the text embeddings, colored by the clusters identified by HDBSCAN. You can select points on the chart to explore the text embeddings further. 👇
|
248 |
"""
|
249 |
)
|
250 |
return
|
251 |
|
252 |
|
253 |
@app.cell
|
254 |
+
def __(chart):
|
255 |
+
chart
|
256 |
return
|
257 |
|
258 |
|
259 |
+
@app.cell
|
260 |
+
def __(chart):
|
261 |
+
chart.value
|
262 |
return
|
263 |
|
264 |
|
265 |
+
@app.cell
|
266 |
def __(mo):
|
267 |
+
# Empty space for the table
|
268 |
+
mo.Html("<div style='height: 400px;'></div>")
|
|
|
|
|
|
|
|
|
|
|
|
|
269 |
return
|
270 |
|
271 |
|
272 |
+
@app.cell
|
273 |
def __():
|
274 |
+
# Data manipulation and database connections
|
275 |
+
import polars as pl
|
276 |
+
import duckdb
|
277 |
+
import numba # <- FYI, this module takes a while to load, be patient
|
278 |
+
import pyarrow
|
279 |
+
|
280 |
+
# Visualization
|
281 |
+
import altair as alt
|
282 |
+
|
283 |
+
# ML tools for dimensionality reduction and clustering
|
284 |
+
import umap # For reducing high-dimensional embeddings to 2D
|
285 |
+
import hdbscan # For clustering similar embeddings
|
286 |
+
import numpy as np
|
287 |
+
from sklearn.decomposition import PCA
|
288 |
+
|
289 |
+
return PCA, alt, duckdb, hdbscan, np, numba, pl, pyarrow, umap
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
290 |
|
291 |
|
292 |
if __name__ == "__main__":
|
requirements.txt
CHANGED
@@ -1,5 +1,10 @@
|
|
|
|
|
|
|
|
1 |
marimo
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
|
|
|
|
|
1 |
+
altair==5.4.1
|
2 |
+
duckdb==1.1.3
|
3 |
+
hdbscan==0.8.39
|
4 |
marimo
|
5 |
+
numba==0.60.0
|
6 |
+
numpy==2.0.2
|
7 |
+
polars==1.17.1
|
8 |
+
pyarrow==18.0.0
|
9 |
+
scikit-learn==1.5.2
|
10 |
+
umap-learn==0.5.7
|