Xianbao QIAN
committed on
Commit
·
c80b461
1
Parent(s):
2ea1dfc
use client side rendering for the homepage.
Browse files- @types/parquetjs-lite.d.ts +7 -0
- Dockerfile +3 -0
- bun.lockb +0 -0
- package.json +1 -1
- {tables → public}/ancestor_children.example.yaml +0 -0
- {tables → public}/ancestor_children.parquet +0 -0
- {tables → public}/datasets.example.yaml +0 -0
- {tables → public}/datasets.parquet +0 -0
- {tables → public}/models.example.yaml +0 -0
- {tables → public}/models.parquet +0 -0
- {tables → public}/parents.example.yaml +0 -0
- {tables → public}/parents.parquet +0 -0
- {tables → public}/spaces.example.yaml +0 -0
- {tables → public}/spaces.parquet +0 -0
- python/0_download_files.py +6 -6
- python/1_parents.py +3 -3
- python/2_ancestors.py +3 -3
- src/app/page.tsx +103 -48
@types/parquetjs-lite.d.ts
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
declare module 'parquetjs-lite' {
|
2 |
+
export class ParquetReader {
|
3 |
+
static openFile(filePath: string): Promise<ParquetReader>;
|
4 |
+
getCursor(): any;
|
5 |
+
close(): Promise<void>;
|
6 |
+
}
|
7 |
+
}
|
Dockerfile
CHANGED
@@ -34,6 +34,9 @@ RUN \
|
|
34 |
addgroup --system --gid 1001 nodejs; \
|
35 |
adduser --system --uid 1001 nextjs
|
36 |
|
|
|
|
|
|
|
37 |
# Automatically leverage output traces to reduce image size
|
38 |
COPY --from=builder --link --chown=1001:1001 /app/.next/standalone ./
|
39 |
COPY --from=builder --link --chown=1001:1001 /app/.next/static ./.next/static
|
|
|
34 |
addgroup --system --gid 1001 nodejs; \
|
35 |
adduser --system --uid 1001 nextjs
|
36 |
|
37 |
+
# Public files are served by nextjs.
|
38 |
+
COPY --from=builder --link /app/public ./public
|
39 |
+
|
40 |
# Automatically leverage output traces to reduce image size
|
41 |
COPY --from=builder --link --chown=1001:1001 /app/.next/standalone ./
|
42 |
COPY --from=builder --link --chown=1001:1001 /app/.next/static ./.next/static
|
bun.lockb
CHANGED
Binary files a/bun.lockb and b/bun.lockb differ
|
|
package.json
CHANGED
@@ -9,8 +9,8 @@
|
|
9 |
"lint": "next lint"
|
10 |
},
|
11 |
"dependencies": {
|
|
|
12 |
"next": "14.2.15",
|
13 |
-
"parquetjs-lite": "^0.8.7",
|
14 |
"react": "^18",
|
15 |
"react-dom": "^18"
|
16 |
},
|
|
|
9 |
"lint": "next lint"
|
10 |
},
|
11 |
"dependencies": {
|
12 |
+
"@duckdb/duckdb-wasm": "^1.29.0",
|
13 |
"next": "14.2.15",
|
|
|
14 |
"react": "^18",
|
15 |
"react-dom": "^18"
|
16 |
},
|
{tables → public}/ancestor_children.example.yaml
RENAMED
File without changes
|
{tables → public}/ancestor_children.parquet
RENAMED
File without changes
|
{tables → public}/datasets.example.yaml
RENAMED
File without changes
|
{tables → public}/datasets.parquet
RENAMED
File without changes
|
{tables → public}/models.example.yaml
RENAMED
File without changes
|
{tables → public}/models.parquet
RENAMED
File without changes
|
{tables → public}/parents.example.yaml
RENAMED
File without changes
|
{tables → public}/parents.parquet
RENAMED
File without changes
|
{tables → public}/spaces.example.yaml
RENAMED
File without changes
|
{tables → public}/spaces.parquet
RENAMED
File without changes
|
python/0_download_files.py
CHANGED
@@ -7,8 +7,8 @@ import random
|
|
7 |
import argparse
|
8 |
import yaml
|
9 |
|
10 |
-
# Create the "
|
11 |
-
os.makedirs("
|
12 |
|
13 |
# URLs of the files to download
|
14 |
urls = [
|
@@ -18,7 +18,7 @@ urls = [
|
|
18 |
]
|
19 |
|
20 |
def download_file(url, overwrite=True):
|
21 |
-
filename = os.path.join("
|
22 |
|
23 |
if not overwrite and os.path.exists(filename):
|
24 |
print(f"File already exists: {filename}. Skipping download.")
|
@@ -56,7 +56,7 @@ def main(overwrite):
|
|
56 |
|
57 |
# Process each downloaded Parquet file
|
58 |
for url in urls:
|
59 |
-
filename = os.path.join("
|
60 |
table_name = os.path.splitext(os.path.basename(filename))[0]
|
61 |
|
62 |
# Connect to the Parquet file using DuckDB
|
@@ -86,8 +86,8 @@ def main(overwrite):
|
|
86 |
yaml_content = yaml_content.rstrip() # Remove trailing spaces
|
87 |
yaml_content += "\n"
|
88 |
|
89 |
-
# Save the YAML content to a file in the "
|
90 |
-
yaml_file = os.path.join("
|
91 |
with open(yaml_file, "w") as file:
|
92 |
file.write(yaml_content)
|
93 |
|
|
|
7 |
import argparse
|
8 |
import yaml
|
9 |
|
10 |
+
# Create the "public" folders if they don't exist
|
11 |
+
os.makedirs("public", exist_ok=True)
|
12 |
|
13 |
# URLs of the files to download
|
14 |
urls = [
|
|
|
18 |
]
|
19 |
|
20 |
def download_file(url, overwrite=True):
|
21 |
+
filename = os.path.join("public", url.split("/")[-1].split("?")[0])
|
22 |
|
23 |
if not overwrite and os.path.exists(filename):
|
24 |
print(f"File already exists: {filename}. Skipping download.")
|
|
|
56 |
|
57 |
# Process each downloaded Parquet file
|
58 |
for url in urls:
|
59 |
+
filename = os.path.join("public", url.split("/")[-1].split("?")[0])
|
60 |
table_name = os.path.splitext(os.path.basename(filename))[0]
|
61 |
|
62 |
# Connect to the Parquet file using DuckDB
|
|
|
86 |
yaml_content = yaml_content.rstrip() # Remove trailing spaces
|
87 |
yaml_content += "\n"
|
88 |
|
89 |
+
# Save the YAML content to a file in the "public" folder
|
90 |
+
yaml_file = os.path.join("public", f"{table_name}.example.yaml")
|
91 |
with open(yaml_file, "w") as file:
|
92 |
file.write(yaml_content)
|
93 |
|
python/1_parents.py
CHANGED
@@ -23,7 +23,7 @@ query = """
|
|
23 |
_id,
|
24 |
id,
|
25 |
extract_base_models(tags) AS base_models
|
26 |
-
FROM parquet_scan('
|
27 |
"""
|
28 |
|
29 |
start_time = time.time()
|
@@ -32,7 +32,7 @@ start_time = time.time()
|
|
32 |
con.execute(f"CREATE VIEW parent_models AS {query}")
|
33 |
|
34 |
# Write the view to a parquet file
|
35 |
-
con.execute("COPY parent_models TO '
|
36 |
|
37 |
end_time = time.time()
|
38 |
execution_time = end_time - start_time
|
@@ -55,5 +55,5 @@ result = con.execute("""
|
|
55 |
LIMIT 10
|
56 |
""").fetchall()
|
57 |
|
58 |
-
with open("
|
59 |
yaml.safe_dump(result, f, default_flow_style=False)
|
|
|
23 |
_id,
|
24 |
id,
|
25 |
extract_base_models(tags) AS base_models
|
26 |
+
FROM parquet_scan('public/models.parquet')
|
27 |
"""
|
28 |
|
29 |
start_time = time.time()
|
|
|
32 |
con.execute(f"CREATE VIEW parent_models AS {query}")
|
33 |
|
34 |
# Write the view to a parquet file
|
35 |
+
con.execute("COPY parent_models TO 'public/parents.parquet' (FORMAT 'parquet')")
|
36 |
|
37 |
end_time = time.time()
|
38 |
execution_time = end_time - start_time
|
|
|
55 |
LIMIT 10
|
56 |
""").fetchall()
|
57 |
|
58 |
+
with open("public/parents.example.yaml", "w") as f:
|
59 |
yaml.safe_dump(result, f, default_flow_style=False)
|
python/2_ancestors.py
CHANGED
@@ -22,7 +22,7 @@ total_start_time = time.perf_counter()
|
|
22 |
# Load parents.parquet into an in-memory table
|
23 |
load_parents_query = """
|
24 |
CREATE TABLE parents_in_memory AS
|
25 |
-
SELECT * FROM parquet_scan('
|
26 |
"""
|
27 |
execute_with_timing(load_parents_query, "Loaded parents.parquet into RAM")
|
28 |
|
@@ -115,7 +115,7 @@ final_output_query = """
|
|
115 |
FROM ancestor_children ac
|
116 |
LEFT JOIN direct_children_mapping dcm ON ac.ancestor = dcm.parent
|
117 |
ORDER BY all_children_count DESC
|
118 |
-
) TO '
|
119 |
"""
|
120 |
con.execute(final_output_query)
|
121 |
end_time = time.perf_counter()
|
@@ -131,7 +131,7 @@ sample_query = """
|
|
131 |
LIMIT 10
|
132 |
"""
|
133 |
sample_data = con.execute(sample_query).fetchall()
|
134 |
-
with open("
|
135 |
yaml.safe_dump(sample_data, f, default_flow_style=False)
|
136 |
end_time = time.perf_counter()
|
137 |
logging.info(f"Written sample data to YAML file in {end_time - start_time:.6f} seconds.")
|
|
|
22 |
# Load parents.parquet into an in-memory table
|
23 |
load_parents_query = """
|
24 |
CREATE TABLE parents_in_memory AS
|
25 |
+
SELECT * FROM parquet_scan('public/parents.parquet')
|
26 |
"""
|
27 |
execute_with_timing(load_parents_query, "Loaded parents.parquet into RAM")
|
28 |
|
|
|
115 |
FROM ancestor_children ac
|
116 |
LEFT JOIN direct_children_mapping dcm ON ac.ancestor = dcm.parent
|
117 |
ORDER BY all_children_count DESC
|
118 |
+
) TO 'public/ancestor_children.parquet' (FORMAT 'parquet')
|
119 |
"""
|
120 |
con.execute(final_output_query)
|
121 |
end_time = time.perf_counter()
|
|
|
131 |
LIMIT 10
|
132 |
"""
|
133 |
sample_data = con.execute(sample_query).fetchall()
|
134 |
+
with open("public/ancestor_children.example.yaml", "w") as f:
|
135 |
yaml.safe_dump(sample_data, f, default_flow_style=False)
|
136 |
end_time = time.perf_counter()
|
137 |
logging.info(f"Written sample data to YAML file in {end_time - start_time:.6f} seconds.")
|
src/app/page.tsx
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
import {
|
|
|
4 |
|
5 |
type ModelData = {
|
6 |
ancestor: string;
|
@@ -10,41 +11,87 @@ type ModelData = {
|
|
10 |
direct_children_count: number | null;
|
11 |
};
|
12 |
|
13 |
-
export default
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
}
|
33 |
-
|
|
|
34 |
|
35 |
-
|
|
|
|
|
36 |
|
37 |
-
|
38 |
-
const top10Models = data
|
39 |
-
.sort((a, b) => b.all_children.length - a.all_children.length)
|
40 |
-
.slice(0, 10);
|
41 |
|
42 |
-
|
|
|
|
|
|
|
43 |
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
<table className="table-auto border-collapse w-full">
|
49 |
<thead>
|
50 |
<tr>
|
@@ -54,7 +101,7 @@ export default async function Home() {
|
|
54 |
</tr>
|
55 |
</thead>
|
56 |
<tbody>
|
57 |
-
{
|
58 |
<tr key={index} className="border-t border-gray-200 dark:border-gray-700">
|
59 |
<td className="px-4 py-2">{model.ancestor}</td>
|
60 |
<td className="px-4 py-2 text-right">{model.direct_children_count ?? 0}</td>
|
@@ -63,18 +110,26 @@ export default async function Home() {
|
|
63 |
))}
|
64 |
</tbody>
|
65 |
</table>
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
}
|
|
|
1 |
+
'use client';
|
2 |
+
|
3 |
+
import { useState, useEffect } from 'react';
|
4 |
+
import * as duckdb from '@duckdb/duckdb-wasm';
|
5 |
|
6 |
type ModelData = {
|
7 |
ancestor: string;
|
|
|
11 |
direct_children_count: number | null;
|
12 |
};
|
13 |
|
14 |
+
export default function Home() {
|
15 |
+
const [allModels, setAllModels] = useState<ModelData[]>([]);
|
16 |
+
const [currentPage, setCurrentPage] = useState(1);
|
17 |
+
const [pageSize, setPageSize] = useState(100);
|
18 |
+
const [filterText, setFilterText] = useState('');
|
19 |
+
|
20 |
+
useEffect(() => {
|
21 |
+
async function fetchData() {
|
22 |
+
const JSDELIVR_BUNDLES = duckdb.getJsDelivrBundles();
|
23 |
+
|
24 |
+
// Select a bundle based on browser checks
|
25 |
+
const bundle = await duckdb.selectBundle(JSDELIVR_BUNDLES);
|
26 |
+
|
27 |
+
const worker_url = URL.createObjectURL(
|
28 |
+
new Blob([`importScripts("${bundle.mainWorker!}");`], { type: 'text/javascript' })
|
29 |
+
);
|
30 |
+
|
31 |
+
// Instantiate the asynchronous version of DuckDB-Wasm
|
32 |
+
const worker = new Worker(worker_url);
|
33 |
+
const logger = new duckdb.ConsoleLogger();
|
34 |
+
const db = new duckdb.AsyncDuckDB(logger, worker);
|
35 |
+
await db.instantiate(bundle.mainModule, bundle.pthreadWorker);
|
36 |
+
|
37 |
+
// Register the Parquet file using the URL
|
38 |
+
await db.registerFileURL(
|
39 |
+
'ancestor_children.parquet',
|
40 |
+
`${window.location.origin}/ancestor_children.parquet`,
|
41 |
+
duckdb.DuckDBDataProtocol.HTTP,
|
42 |
+
false
|
43 |
+
);
|
44 |
+
|
45 |
+
// Execute the SQL query using the registered Parquet file
|
46 |
+
const query = `
|
47 |
+
SELECT
|
48 |
+
ancestor,
|
49 |
+
direct_children,
|
50 |
+
all_children,
|
51 |
+
CAST(all_children_count AS INTEGER) AS all_children_count,
|
52 |
+
CAST(direct_children_count AS INTEGER) AS direct_children_count
|
53 |
+
FROM 'ancestor_children.parquet'
|
54 |
+
`;
|
55 |
+
const conn = await db.connect();
|
56 |
+
const result = await conn.query(query);
|
57 |
+
|
58 |
+
// Convert the result to a JavaScript array
|
59 |
+
const data: ModelData[] = result.toArray();
|
60 |
+
|
61 |
+
// Close the connection and terminate the worker
|
62 |
+
await conn.close();
|
63 |
+
await db.terminate();
|
64 |
+
|
65 |
+
setAllModels(data);
|
66 |
}
|
67 |
+
fetchData();
|
68 |
+
}, []);
|
69 |
|
70 |
+
const filteredModels = allModels.filter((model) =>
|
71 |
+
model.ancestor.toLowerCase().includes(filterText.toLowerCase())
|
72 |
+
);
|
73 |
|
74 |
+
const totalPages = Math.ceil(filteredModels.length / pageSize);
|
|
|
|
|
|
|
75 |
|
76 |
+
const paginatedModels = filteredModels.slice(
|
77 |
+
(currentPage - 1) * pageSize,
|
78 |
+
currentPage * pageSize
|
79 |
+
);
|
80 |
|
81 |
+
return (
|
82 |
+
<main className="container mx-auto py-8 text-gray-900 dark:text-white">
|
83 |
+
<h1 className="text-4xl font-bold mb-4">All Models</h1>
|
84 |
+
<div className="mb-4">
|
85 |
+
<input
|
86 |
+
type="text"
|
87 |
+
placeholder="Filter by model name"
|
88 |
+
value={filterText}
|
89 |
+
onChange={(e) => setFilterText(e.target.value)}
|
90 |
+
className="px-4 py-2 border border-gray-300 rounded-md"
|
91 |
+
/>
|
92 |
+
</div>
|
93 |
+
{paginatedModels.length > 0 ? (
|
94 |
+
<>
|
95 |
<table className="table-auto border-collapse w-full">
|
96 |
<thead>
|
97 |
<tr>
|
|
|
101 |
</tr>
|
102 |
</thead>
|
103 |
<tbody>
|
104 |
+
{paginatedModels.map((model, index) => (
|
105 |
<tr key={index} className="border-t border-gray-200 dark:border-gray-700">
|
106 |
<td className="px-4 py-2">{model.ancestor}</td>
|
107 |
<td className="px-4 py-2 text-right">{model.direct_children_count ?? 0}</td>
|
|
|
110 |
))}
|
111 |
</tbody>
|
112 |
</table>
|
113 |
+
<div className="mt-4">
|
114 |
+
<button
|
115 |
+
onClick={() => setCurrentPage((prev) => Math.max(prev - 1, 1))}
|
116 |
+
disabled={currentPage === 1}
|
117 |
+
className="px-4 py-2 bg-blue-500 text-white rounded-md mr-2"
|
118 |
+
>
|
119 |
+
Previous
|
120 |
+
</button>
|
121 |
+
<button
|
122 |
+
onClick={() => setCurrentPage((prev) => Math.min(prev + 1, totalPages))}
|
123 |
+
disabled={currentPage === totalPages}
|
124 |
+
className="px-4 py-2 bg-blue-500 text-white rounded-md"
|
125 |
+
>
|
126 |
+
Next
|
127 |
+
</button>
|
128 |
+
</div>
|
129 |
+
</>
|
130 |
+
) : (
|
131 |
+
<p>No data found.</p>
|
132 |
+
)}
|
133 |
+
</main>
|
134 |
+
);
|
135 |
}
|