Spaces:
Running
Running
Deploy (see actual commits on https://github.com/mlcommons/croissant).
Browse files- app.py +0 -1
- core/constants.py +1 -1
- core/state.py +11 -3
- deploy_to_hf.sh +5 -2
- events/record_sets.py +14 -0
- views/foo.py.py +36 -0
- views/overview.py +44 -23
- views/record_sets.py +172 -63
app.py
CHANGED
@@ -20,7 +20,6 @@ col1.header("Croissant Editor")
|
|
20 |
init_state()
|
21 |
|
22 |
user = get_cached_user()
|
23 |
-
print("USER", user)
|
24 |
|
25 |
if OAUTH_CLIENT_ID and not user:
|
26 |
query_params = st.experimental_get_query_params()
|
|
|
20 |
init_state()
|
21 |
|
22 |
user = get_cached_user()
|
|
|
23 |
|
24 |
if OAUTH_CLIENT_ID and not user:
|
25 |
query_params = st.experimental_get_query_params()
|
core/constants.py
CHANGED
@@ -33,5 +33,5 @@ DF_HEIGHT = 150
|
|
33 |
OVERVIEW = "Overview"
|
34 |
METADATA = "Metadata"
|
35 |
RESOURCES = "Resources"
|
36 |
-
RECORD_SETS = "
|
37 |
TABS = [OVERVIEW, METADATA, RESOURCES, RECORD_SETS]
|
|
|
33 |
OVERVIEW = "Overview"
|
34 |
METADATA = "Metadata"
|
35 |
RESOURCES = "Resources"
|
36 |
+
RECORD_SETS = "Record Sets"
|
37 |
TABS = [OVERVIEW, METADATA, RESOURCES, RECORD_SETS]
|
core/state.py
CHANGED
@@ -168,7 +168,7 @@ class RecordSet:
|
|
168 |
"""Record Set analogue for editor"""
|
169 |
|
170 |
name: str = ""
|
171 |
-
data: Any = None
|
172 |
description: str | None = None
|
173 |
is_enumeration: bool | None = None
|
174 |
key: str | list[str] | None = None
|
@@ -208,9 +208,14 @@ class Metadata:
|
|
208 |
"""Renames a RecordSet by changing all the references to this RecordSet."""
|
209 |
for i, record_set in enumerate(self.record_sets):
|
210 |
for j, field in enumerate(record_set.fields):
|
|
|
211 |
# Update source
|
212 |
source = field.source
|
213 |
-
if
|
|
|
|
|
|
|
|
|
214 |
new_uid = source.uid.replace(old_name, new_name, 1)
|
215 |
self.record_sets[i].fields[j].source.uid = new_uid
|
216 |
# Update references
|
@@ -218,7 +223,10 @@ class Metadata:
|
|
218 |
if (
|
219 |
references
|
220 |
and references.uid
|
221 |
-
and
|
|
|
|
|
|
|
222 |
):
|
223 |
new_uid = references.uid.replace(old_name, new_name, 1)
|
224 |
self.record_sets[i].fields[j].references.uid = new_uid
|
|
|
168 |
"""Record Set analogue for editor"""
|
169 |
|
170 |
name: str = ""
|
171 |
+
data: list[Any] | None = None
|
172 |
description: str | None = None
|
173 |
is_enumeration: bool | None = None
|
174 |
key: str | list[str] | None = None
|
|
|
208 |
"""Renames a RecordSet by changing all the references to this RecordSet."""
|
209 |
for i, record_set in enumerate(self.record_sets):
|
210 |
for j, field in enumerate(record_set.fields):
|
211 |
+
possible_uid = f"{old_name}/"
|
212 |
# Update source
|
213 |
source = field.source
|
214 |
+
if (
|
215 |
+
source
|
216 |
+
and source.uid
|
217 |
+
and (source.uid.startswith(possible_uid) or source.uid == old_name)
|
218 |
+
):
|
219 |
new_uid = source.uid.replace(old_name, new_name, 1)
|
220 |
self.record_sets[i].fields[j].source.uid = new_uid
|
221 |
# Update references
|
|
|
223 |
if (
|
224 |
references
|
225 |
and references.uid
|
226 |
+
and (
|
227 |
+
references.uid.startswith(possible_uid)
|
228 |
+
or references.uid == old_name
|
229 |
+
)
|
230 |
):
|
231 |
new_uid = references.uid.replace(old_name, new_name, 1)
|
232 |
self.record_sets[i].fields[j].references.uid = new_uid
|
deploy_to_hf.sh
CHANGED
@@ -3,12 +3,15 @@ echo "Deleting $HF_REPO..."
|
|
3 |
rm -rf ${HF_REPO}
|
4 |
git clone [email protected]:spaces/marcenacp/croissant-editor ${HF_REPO}
|
5 |
echo "Copying files from $PWD to $HF_REPO..."
|
6 |
-
rsync -aP --exclude="README.md" --exclude="*node_modules*" --exclude="*__pycache__*" . ${HF_REPO}
|
7 |
cd ${HF_REPO}
|
8 |
-
|
|
|
|
|
9 |
echo "Warning: if it fails, you may need to follow https://huggingface.co/docs/hub/security-git-ssh#generating-a-new-ssh-keypair"
|
10 |
echo "On Hugging Face Spaces, you might have to set the following environment variables:"
|
11 |
echo "- REDIRECT_URI"
|
12 |
echo "- OAUTH_STATE"
|
13 |
echo "- OAUTH_CLIENT_ID"
|
14 |
echo "- OAUTH_CLIENT_SECRET"
|
|
|
|
3 |
rm -rf ${HF_REPO}
|
4 |
git clone [email protected]:spaces/marcenacp/croissant-editor ${HF_REPO}
|
5 |
echo "Copying files from $PWD to $HF_REPO..."
|
6 |
+
rsync -aP --exclude="README.md" --exclude="*node_modules*" --exclude="cypress/*" --exclude="*__pycache__*" . ${HF_REPO}
|
7 |
cd ${HF_REPO}
|
8 |
+
git add .
|
9 |
+
git commit -m "Deploy (see actual commits on https://github.com/mlcommons/croissant)."
|
10 |
+
echo "Now push with: 'cd $HF_REPO && git push'."
|
11 |
echo "Warning: if it fails, you may need to follow https://huggingface.co/docs/hub/security-git-ssh#generating-a-new-ssh-keypair"
|
12 |
echo "On Hugging Face Spaces, you might have to set the following environment variables:"
|
13 |
echo "- REDIRECT_URI"
|
14 |
echo "- OAUTH_STATE"
|
15 |
echo "- OAUTH_CLIENT_ID"
|
16 |
echo "- OAUTH_CLIENT_SECRET"
|
17 |
+
echo "Visit: https://huggingface.co/spaces/marcenacp/croissant-editor"
|
events/record_sets.py
CHANGED
@@ -13,6 +13,8 @@ class RecordSetEvent(enum.Enum):
|
|
13 |
NAME = "NAME"
|
14 |
DESCRIPTION = "DESCRIPTION"
|
15 |
IS_ENUMERATION = "IS_ENUMERATION"
|
|
|
|
|
16 |
|
17 |
|
18 |
def handle_record_set_change(event: RecordSetEvent, record_set: RecordSet, key: str):
|
@@ -28,4 +30,16 @@ def handle_record_set_change(event: RecordSetEvent, record_set: RecordSet, key:
|
|
28 |
record_set.description = value
|
29 |
elif event == RecordSetEvent.IS_ENUMERATION:
|
30 |
record_set.is_enumeration = value
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
expand_record_set(record_set=record_set)
|
|
|
13 |
NAME = "NAME"
|
14 |
DESCRIPTION = "DESCRIPTION"
|
15 |
IS_ENUMERATION = "IS_ENUMERATION"
|
16 |
+
HAS_DATA = "HAS_DATA"
|
17 |
+
CHANGE_DATA = "CHANGE_DATA"
|
18 |
|
19 |
|
20 |
def handle_record_set_change(event: RecordSetEvent, record_set: RecordSet, key: str):
|
|
|
30 |
record_set.description = value
|
31 |
elif event == RecordSetEvent.IS_ENUMERATION:
|
32 |
record_set.is_enumeration = value
|
33 |
+
elif event == RecordSetEvent.HAS_DATA:
|
34 |
+
if value:
|
35 |
+
record_set.data = []
|
36 |
+
else:
|
37 |
+
record_set.data = None
|
38 |
+
elif event == RecordSetEvent.CHANGE_DATA:
|
39 |
+
for index, new_value in value["edited_rows"].items():
|
40 |
+
record_set.data[index] = {**record_set.data[index], **new_value}
|
41 |
+
for row in value["added_rows"]:
|
42 |
+
record_set.data.append(row)
|
43 |
+
# Delete from the highest index down: every index in "deleted_rows" is
# meant for the list as it was before any deletion, so removing rows in
# ascending order would shift the remaining targets and drop the wrong rows.
for row in sorted(value["deleted_rows"], reverse=True):
    del record_set.data[row]
|
45 |
expand_record_set(record_set=record_set)
|
views/foo.py.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import multiprocessing
|
2 |
+
import time
|
3 |
+
from typing import TypedDict
|
4 |
+
|
5 |
+
|
6 |
+
class _Result(TypedDict):
|
7 |
+
bar: int
|
8 |
+
|
9 |
+
|
10 |
+
def bar(result):
|
11 |
+
while True:
|
12 |
+
time.sleep(1)
|
13 |
+
result["bar"] += 1
|
14 |
+
print(result["bar"])
|
15 |
+
if result["bar"] > 5:
|
16 |
+
return
|
17 |
+
|
18 |
+
|
19 |
+
def foo():
|
20 |
+
"""Generates the data and waits at most _TIMEOUT_SECONDS."""
|
21 |
+
with multiprocessing.Manager() as manager:
|
22 |
+
result: _Result = manager.dict(bar=0)
|
23 |
+
process = multiprocessing.Process(target=bar, args=(result,))
|
24 |
+
process.start()
|
25 |
+
if not process.is_alive():
|
26 |
+
return result
|
27 |
+
time.sleep(3)
|
28 |
+
if process.is_alive():
|
29 |
+
process.kill()
|
30 |
+
result["exception"] = TimeoutError(
|
31 |
+
"The generation took too long and was killed."
|
32 |
+
)
|
33 |
+
return _Result(**result)
|
34 |
+
|
35 |
+
|
36 |
+
print("FINAL RESULT", foo().get("bar"))
|
views/overview.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
from typing import Any
|
2 |
|
3 |
import streamlit as st
|
@@ -8,12 +9,22 @@ from utils import needed_field
|
|
8 |
from views.metadata import handle_metadata_change
|
9 |
from views.metadata import MetadataEvent
|
10 |
|
|
|
11 |
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
|
19 |
def render_overview():
|
@@ -21,7 +32,7 @@ def render_overview():
|
|
21 |
col1, col2 = st.columns([1, 1], gap="medium")
|
22 |
with col1:
|
23 |
key = "metadata-name"
|
24 |
-
st.text_input(
|
25 |
label=needed_field("Name"),
|
26 |
key=key,
|
27 |
value=metadata.name,
|
@@ -29,8 +40,10 @@ def render_overview():
|
|
29 |
on_change=handle_metadata_change,
|
30 |
args=(MetadataEvent.NAME, metadata, key),
|
31 |
)
|
|
|
|
|
32 |
key = "metadata-url"
|
33 |
-
st.text_input(
|
34 |
label=needed_field("URL"),
|
35 |
key=key,
|
36 |
value=metadata.url,
|
@@ -38,6 +51,8 @@ def render_overview():
|
|
38 |
on_change=handle_metadata_change,
|
39 |
args=(MetadataEvent.URL, metadata, key),
|
40 |
)
|
|
|
|
|
41 |
key = "metadata-description"
|
42 |
st.text_area(
|
43 |
label="Description",
|
@@ -47,29 +62,35 @@ def render_overview():
|
|
47 |
on_change=handle_metadata_change,
|
48 |
args=(MetadataEvent.DESCRIPTION, metadata, key),
|
49 |
)
|
50 |
-
|
51 |
-
st.
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
57 |
with col2:
|
58 |
user_started_editing = metadata.record_sets or metadata.distribution
|
59 |
if user_started_editing:
|
60 |
-
|
61 |
try:
|
62 |
issues = metadata.to_canonical().issues
|
63 |
if issues.errors:
|
64 |
-
|
65 |
for error in issues.errors:
|
66 |
-
|
67 |
if issues.warnings:
|
68 |
-
|
69 |
for warning in issues.warnings:
|
70 |
-
|
71 |
-
if not issues.errors and not issues.warnings:
|
72 |
-
st.write("No validation issues detected!")
|
73 |
except mlc.ValidationError as exception:
|
74 |
-
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import dataclasses
|
2 |
from typing import Any
|
3 |
|
4 |
import streamlit as st
|
|
|
9 |
from views.metadata import handle_metadata_change
|
10 |
from views.metadata import MetadataEvent
|
11 |
|
12 |
+
_NON_RELEVANT_METADATA = ["name", "distribution", "record_sets", "rdf"]
|
13 |
|
14 |
+
_INFO_TEXT = """Croissant files are composed of three layers:
|
15 |
+
|
16 |
+
- **Metadata** about the dataset covering Responsible AI, licensing and attributes of
|
17 |
+
[sc\:Dataset](https://schema.org/Dataset).
|
18 |
+
- **Resources**: The contents of a dataset as the underlying files
|
19 |
+
([`FileObject`](https://github.com/mlcommons/croissant/blob/main/docs/croissant-spec.md#fileobject))
|
20 |
+
and/or sets of files ([`FileSet`](https://github.com/mlcommons/croissant/blob/main/docs/croissant-spec.md#fileset)).
|
21 |
+
- **RecordSets**: the sets of structured records obtained from one or more resources
|
22 |
+
(typically a file or set of files) and the structure of these records,
|
23 |
+
expressed as a set of fields (e.g., the columns of a table).
|
24 |
+
|
25 |
+
The next three tabs will guide you through filling those layers. The errors if any will
|
26 |
+
be displayed on this page. Once you are ready, you can download the dataset by clicking
|
27 |
+
the export button in the upper right corner."""
|
28 |
|
29 |
|
30 |
def render_overview():
|
|
|
32 |
col1, col2 = st.columns([1, 1], gap="medium")
|
33 |
with col1:
|
34 |
key = "metadata-name"
|
35 |
+
name = st.text_input(
|
36 |
label=needed_field("Name"),
|
37 |
key=key,
|
38 |
value=metadata.name,
|
|
|
40 |
on_change=handle_metadata_change,
|
41 |
args=(MetadataEvent.NAME, metadata, key),
|
42 |
)
|
43 |
+
if not name:
|
44 |
+
st.stop()
|
45 |
key = "metadata-url"
|
46 |
+
url = st.text_input(
|
47 |
label=needed_field("URL"),
|
48 |
key=key,
|
49 |
value=metadata.url,
|
|
|
51 |
on_change=handle_metadata_change,
|
52 |
args=(MetadataEvent.URL, metadata, key),
|
53 |
)
|
54 |
+
if not url:
|
55 |
+
st.stop()
|
56 |
key = "metadata-description"
|
57 |
st.text_area(
|
58 |
label="Description",
|
|
|
62 |
on_change=handle_metadata_change,
|
63 |
args=(MetadataEvent.DESCRIPTION, metadata, key),
|
64 |
)
|
65 |
+
st.divider()
|
66 |
+
left, middle, right = st.columns([1, 1, 1])
|
67 |
+
fields = [
|
68 |
+
field
|
69 |
+
for field, value in dataclasses.asdict(metadata).items()
|
70 |
+
if value and field not in _NON_RELEVANT_METADATA
|
71 |
+
]
|
72 |
+
left.metric("Number of metadata", len(fields))
|
73 |
+
middle.metric("Number of resources", len(metadata.distribution))
|
74 |
+
right.metric("Number of RecordSets", len(metadata.record_sets))
|
75 |
with col2:
|
76 |
user_started_editing = metadata.record_sets or metadata.distribution
|
77 |
if user_started_editing:
|
78 |
+
warning = ""
|
79 |
try:
|
80 |
issues = metadata.to_canonical().issues
|
81 |
if issues.errors:
|
82 |
+
warning += "**Errors**\n"
|
83 |
for error in issues.errors:
|
84 |
+
warning += f"{error}\n"
|
85 |
if issues.warnings:
|
86 |
+
warning += "**Warnings**\n"
|
87 |
# Use a distinct loop variable: reusing `warning` here would rebind the
# accumulated message string to each individual issue and lose the text
# collected so far.
for issue in issues.warnings:
    warning += f"{issue}\n"
|
|
|
|
|
89 |
except mlc.ValidationError as exception:
|
90 |
+
warning += "**Errors**\n"
|
91 |
+
warning += f"{str(exception)}\n"
|
92 |
+
if warning:
|
93 |
+
st.warning(warning, icon="⚠️")
|
94 |
+
else:
|
95 |
+
st.success("No validation issues detected!", icon="✅")
|
96 |
+
st.info(_INFO_TEXT, icon="💡")
|
views/record_sets.py
CHANGED
@@ -1,4 +1,7 @@
|
|
1 |
-
|
|
|
|
|
|
|
2 |
|
3 |
import numpy as np
|
4 |
import pandas as pd
|
@@ -28,6 +31,65 @@ DATA_TYPES = [
|
|
28 |
mlc.DataType.URL,
|
29 |
]
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
def _handle_close_fields():
|
33 |
st.session_state[SelectedRecordSet] = None
|
@@ -116,23 +178,22 @@ def _handle_fields_change(record_set_key: int, record_set: RecordSet):
|
|
116 |
name=added_row.get(FieldDataFrame.NAME),
|
117 |
description=added_row.get(FieldDataFrame.DESCRIPTION),
|
118 |
data_types=[added_row.get(FieldDataFrame.DATA_TYPE)],
|
119 |
-
source=mlc.Source(
|
120 |
-
uid="foo",
|
121 |
-
node_type="distribution",
|
122 |
-
extract=mlc.Extract(column=""),
|
123 |
-
),
|
124 |
references=mlc.Source(),
|
125 |
)
|
126 |
st.session_state[Metadata].add_field(record_set_key, field)
|
127 |
for field_key in result["deleted_rows"]:
|
128 |
st.session_state[Metadata].remove_field(record_set_key, field_key)
|
|
|
|
|
|
|
129 |
|
130 |
|
131 |
class FieldDataFrame:
|
132 |
"""Names of the columns in the pd.DataFrame for `fields`."""
|
133 |
|
134 |
-
NAME = "
|
135 |
-
DESCRIPTION = "
|
136 |
DATA_TYPE = "Data type"
|
137 |
SOURCE_UID = "Source"
|
138 |
SOURCE_EXTRACT = "Source extract"
|
@@ -144,17 +205,14 @@ class FieldDataFrame:
|
|
144 |
def render_record_sets():
|
145 |
col1, col2 = st.columns([1, 1])
|
146 |
with col1:
|
147 |
-
|
|
|
148 |
with col2:
|
149 |
_render_right_panel()
|
150 |
|
151 |
|
152 |
def _render_left_panel():
|
153 |
"""Left panel: visualization of all RecordSets as expandable forms."""
|
154 |
-
distribution = st.session_state[Metadata].distribution
|
155 |
-
if not distribution:
|
156 |
-
st.markdown("Please add resources first.")
|
157 |
-
return
|
158 |
record_sets = st.session_state[Metadata].record_sets
|
159 |
record_set: RecordSet
|
160 |
for record_set_key, record_set in enumerate(record_sets):
|
@@ -188,12 +246,20 @@ def _render_left_panel():
|
|
188 |
on_change=handle_record_set_change,
|
189 |
args=(RecordSetEvent.IS_ENUMERATION, record_set, key),
|
190 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
|
192 |
joins = _find_joins(record_set.fields)
|
193 |
has_join = st.checkbox(
|
194 |
-
"Whether the RecordSet contains joins. To add a new join, add a"
|
195 |
-
|
196 |
-
" another RecordSet
|
197 |
key=f"{prefix}-has-joins",
|
198 |
value=bool(joins),
|
199 |
disabled=True,
|
@@ -248,8 +314,7 @@ def _render_left_panel():
|
|
248 |
)
|
249 |
st.data_editor(
|
250 |
fields,
|
251 |
-
|
252 |
-
use_container_width=not fields.empty,
|
253 |
num_rows="dynamic",
|
254 |
key=data_editor_key,
|
255 |
column_config={
|
@@ -273,6 +338,26 @@ def _render_left_panel():
|
|
273 |
on_change=_handle_fields_change,
|
274 |
args=(record_set_key, record_set),
|
275 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
276 |
|
277 |
st.button(
|
278 |
"Edit fields details",
|
@@ -297,56 +382,80 @@ def _render_right_panel():
|
|
297 |
record_set = selected.record_set
|
298 |
record_set_key = selected.record_set_key
|
299 |
with st.expander("**Fields**", expanded=True):
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
key = f"{prefix}-name"
|
305 |
-
col1.text_input(
|
306 |
-
needed_field("Name"),
|
307 |
-
placeholder="Name without special character.",
|
308 |
-
key=key,
|
309 |
-
value=field.name,
|
310 |
-
on_change=handle_field_change,
|
311 |
-
args=(FieldEvent.NAME, field, key),
|
312 |
)
|
313 |
-
key = f"{
|
314 |
-
|
315 |
-
|
316 |
-
|
|
|
|
|
317 |
key=key,
|
318 |
-
|
319 |
-
|
320 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
321 |
)
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
328 |
else:
|
329 |
data_type_index = None
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
st.divider()
|
350 |
|
351 |
st.button(
|
352 |
"Close",
|
|
|
1 |
+
import multiprocessing
|
2 |
+
import textwrap
|
3 |
+
import time
|
4 |
+
from typing import TypedDict
|
5 |
|
6 |
import numpy as np
|
7 |
import pandas as pd
|
|
|
31 |
mlc.DataType.URL,
|
32 |
]
|
33 |
|
34 |
+
_NUM_RECORDS = 3
|
35 |
+
_TIMEOUT_SECONDS = 1
|
36 |
+
|
37 |
+
|
38 |
+
class _Result(TypedDict):
    """Outcome of a preview generation."""

    # Preview records written by `_generate_data`; None when none were produced.
    df: pd.DataFrame | None
    # Error raised during the generation, or the TimeoutError set when the
    # worker was killed; None otherwise.
    exception: Exception | None


@st.cache_data(show_spinner="Generating the dataset...")
def _generate_data_with_timeout(record_set: RecordSet) -> _Result:
    """Generates the data and waits at most _TIMEOUT_SECONDS.

    The generation runs in a separate process so that it can be killed when it
    exceeds the timeout.

    Args:
        record_set: The RecordSet to preview.

    Returns:
        A plain-dict copy of the result shared with the worker: `df` and
        `exception` as written by `_generate_data`, or a `TimeoutError` under
        `exception` when the worker was killed.
    """
    with multiprocessing.Manager() as manager:
        result: _Result = manager.dict(df=None, exception=None)
        process = multiprocessing.Process(
            target=_generate_data, args=(record_set, result)
        )
        process.start()
        # join() returns as soon as the worker exits, so a fast generation does
        # not block the UI for the whole timeout.
        process.join(_TIMEOUT_SECONDS)
        if process.is_alive():
            process.kill()
            # Reap the killed child so it does not linger as a zombie.
            process.join()
            result["exception"] = TimeoutError(
                "The generation took too long and was killed. Please, use the CLI as"
                " described in"
                " https://github.com/mlcommons/croissant/tree/main/python/mlcroissant#verifyload-a-croissant-dataset."
            )
        # Copy out of the proxy before the manager shuts down.
        return _Result(**result)
|
62 |
+
|
63 |
+
|
64 |
+
def _generate_data(record_set: RecordSet, result: _Result) -> pd.DataFrame | None:
|
65 |
+
"""Generates the first _NUM_RECORDS records."""
|
66 |
+
try:
|
67 |
+
metadata: Metadata = st.session_state[Metadata]
|
68 |
+
if not metadata:
|
69 |
+
raise ValueError(
|
70 |
+
"The dataset is still incomplete. Please, go to the overview to see"
|
71 |
+
" errors."
|
72 |
+
)
|
73 |
+
croissant = metadata.to_canonical()
|
74 |
+
if croissant:
|
75 |
+
dataset = mlc.Dataset.from_metadata(croissant)
|
76 |
+
records = iter(dataset.records(record_set=record_set.name))
|
77 |
+
df = []
|
78 |
+
for i, record in enumerate(iter(records)):
|
79 |
+
if i >= _NUM_RECORDS:
|
80 |
+
break
|
81 |
+
# Decode bytes as str:
|
82 |
+
for key, value in record.items():
|
83 |
+
if isinstance(value, bytes):
|
84 |
+
try:
|
85 |
+
record[key] = value.decode("utf-8")
|
86 |
+
except:
|
87 |
+
pass
|
88 |
+
df.append(record)
|
89 |
+
result["df"] = pd.DataFrame(df)
|
90 |
+
except Exception as exception:
|
91 |
+
result["exception"] = exception
|
92 |
+
|
93 |
|
94 |
def _handle_close_fields():
|
95 |
st.session_state[SelectedRecordSet] = None
|
|
|
178 |
name=added_row.get(FieldDataFrame.NAME),
|
179 |
description=added_row.get(FieldDataFrame.DESCRIPTION),
|
180 |
data_types=[added_row.get(FieldDataFrame.DATA_TYPE)],
|
181 |
+
source=mlc.Source(),
|
|
|
|
|
|
|
|
|
182 |
references=mlc.Source(),
|
183 |
)
|
184 |
st.session_state[Metadata].add_field(record_set_key, field)
|
185 |
for field_key in result["deleted_rows"]:
|
186 |
st.session_state[Metadata].remove_field(record_set_key, field_key)
|
187 |
+
# Reset the in-line data if it exists.
|
188 |
+
if record_set.data:
|
189 |
+
record_set.data = []
|
190 |
|
191 |
|
192 |
class FieldDataFrame:
|
193 |
"""Names of the columns in the pd.DataFrame for `fields`."""
|
194 |
|
195 |
+
NAME = "Field name"
|
196 |
+
DESCRIPTION = "Field description"
|
197 |
DATA_TYPE = "Data type"
|
198 |
SOURCE_UID = "Source"
|
199 |
SOURCE_EXTRACT = "Source extract"
|
|
|
205 |
def render_record_sets():
|
206 |
col1, col2 = st.columns([1, 1])
|
207 |
with col1:
|
208 |
+
with st.spinner("Generating the dataset..."):
|
209 |
+
_render_left_panel()
|
210 |
with col2:
|
211 |
_render_right_panel()
|
212 |
|
213 |
|
214 |
def _render_left_panel():
|
215 |
"""Left panel: visualization of all RecordSets as expandable forms."""
|
|
|
|
|
|
|
|
|
216 |
record_sets = st.session_state[Metadata].record_sets
|
217 |
record_set: RecordSet
|
218 |
for record_set_key, record_set in enumerate(record_sets):
|
|
|
246 |
on_change=handle_record_set_change,
|
247 |
args=(RecordSetEvent.IS_ENUMERATION, record_set, key),
|
248 |
)
|
249 |
+
key = f"{prefix}-has-data"
|
250 |
+
st.checkbox(
|
251 |
+
"Whether the RecordSet has in-line data",
|
252 |
+
key=key,
|
253 |
+
value=bool(record_set.data),
|
254 |
+
on_change=handle_record_set_change,
|
255 |
+
args=(RecordSetEvent.HAS_DATA, record_set, key),
|
256 |
+
)
|
257 |
|
258 |
joins = _find_joins(record_set.fields)
|
259 |
has_join = st.checkbox(
|
260 |
+
"Whether the RecordSet contains joins. To add a new join, add a field"
|
261 |
+
" with a source in `RecordSet`/`FileSet`/`FileObject` and a reference"
|
262 |
+
" to another `RecordSet`/`FileSet`/`FileObject`.",
|
263 |
key=f"{prefix}-has-joins",
|
264 |
value=bool(joins),
|
265 |
disabled=True,
|
|
|
314 |
)
|
315 |
st.data_editor(
|
316 |
fields,
|
317 |
+
use_container_width=True,
|
|
|
318 |
num_rows="dynamic",
|
319 |
key=data_editor_key,
|
320 |
column_config={
|
|
|
338 |
on_change=_handle_fields_change,
|
339 |
args=(record_set_key, record_set),
|
340 |
)
|
341 |
+
result: _Result = _generate_data_with_timeout(record_set)
|
342 |
+
df, exception = result.get("df"), result.get("exception")
|
343 |
+
if exception is None and df is not None and not df.empty:
|
344 |
+
st.markdown("Previsualize the data:")
|
345 |
+
st.dataframe(df, use_container_width=True)
|
346 |
+
# The generation is not triggered if record_set has in-line `data`.
|
347 |
+
elif not record_set.data:
|
348 |
+
left, right = st.columns([1, 10])
|
349 |
+
if exception:
|
350 |
+
left.button(
|
351 |
+
"⚠️",
|
352 |
+
key=f"idea-{prefix}",
|
353 |
+
disabled=True,
|
354 |
+
help=textwrap.dedent(f"""**Error**:
|
355 |
+
```
|
356 |
+
{exception}
|
357 |
+
```
|
358 |
+
"""),
|
359 |
+
)
|
360 |
+
right.markdown("No preview is possible.")
|
361 |
|
362 |
st.button(
|
363 |
"Edit fields details",
|
|
|
382 |
record_set = selected.record_set
|
383 |
record_set_key = selected.record_set_key
|
384 |
with st.expander("**Fields**", expanded=True):
|
385 |
+
if isinstance(record_set.data, list):
|
386 |
+
st.markdown(
|
387 |
+
f"{needed_field('Data')}. This RecordSet is marked as having in-line"
|
388 |
+
" data. Please, list the data below:"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
389 |
)
|
390 |
+
key = f"{record_set_key}-fields-data"
|
391 |
+
columns = [field.name for field in record_set.fields]
|
392 |
+
st.data_editor(
|
393 |
+
pd.DataFrame(record_set.data, columns=columns),
|
394 |
+
use_container_width=True,
|
395 |
+
num_rows="dynamic",
|
396 |
key=key,
|
397 |
+
column_config={
|
398 |
+
field.name: st.column_config.TextColumn(
|
399 |
+
field.name,
|
400 |
+
help=field.description,
|
401 |
+
required=True,
|
402 |
+
)
|
403 |
+
for field in record_set.fields
|
404 |
+
},
|
405 |
+
on_change=handle_record_set_change,
|
406 |
+
args=(RecordSetEvent.CHANGE_DATA, record_set, key),
|
407 |
)
|
408 |
+
else:
|
409 |
+
for field_key, field in enumerate(record_set.fields):
|
410 |
+
prefix = f"{record_set_key}-{field.name}-{field_key}"
|
411 |
+
col1, col2, col3 = st.columns([1, 1, 1])
|
412 |
+
|
413 |
+
key = f"{prefix}-name"
|
414 |
+
col1.text_input(
|
415 |
+
needed_field("Name"),
|
416 |
+
placeholder="Name without special character.",
|
417 |
+
key=key,
|
418 |
+
value=field.name,
|
419 |
+
on_change=handle_field_change,
|
420 |
+
args=(FieldEvent.NAME, field, key),
|
421 |
+
)
|
422 |
+
key = f"{prefix}-description"
|
423 |
+
col2.text_input(
|
424 |
+
"Description",
|
425 |
+
placeholder="Provide a clear description of the RecordSet.",
|
426 |
+
key=key,
|
427 |
+
on_change=handle_field_change,
|
428 |
+
value=field.description,
|
429 |
+
args=(FieldEvent.DESCRIPTION, field, key),
|
430 |
+
)
|
431 |
+
if field.data_types:
|
432 |
+
data_type = field.data_types[0]
|
433 |
+
if isinstance(data_type, str):
|
434 |
+
data_type = term.URIRef(data_type)
|
435 |
+
if data_type in DATA_TYPES:
|
436 |
+
data_type_index = DATA_TYPES.index(data_type)
|
437 |
+
else:
|
438 |
+
data_type_index = None
|
439 |
else:
|
440 |
data_type_index = None
|
441 |
+
key = f"{prefix}-datatypes"
|
442 |
+
col3.selectbox(
|
443 |
+
needed_field("Data type"),
|
444 |
+
index=data_type_index,
|
445 |
+
options=DATA_TYPES,
|
446 |
+
key=key,
|
447 |
+
on_change=handle_field_change,
|
448 |
+
args=(FieldEvent.DATA_TYPE, field, key),
|
449 |
+
)
|
450 |
+
possible_sources = _get_possible_sources(metadata)
|
451 |
+
render_source(
|
452 |
+
record_set_key, record_set, field, field_key, possible_sources
|
453 |
+
)
|
454 |
+
render_references(
|
455 |
+
record_set_key, record_set, field, field_key, possible_sources
|
456 |
+
)
|
457 |
+
|
458 |
+
st.divider()
|
|
|
|
|
459 |
|
460 |
st.button(
|
461 |
"Close",
|