File size: 4,500 Bytes
edf454b
6a31b9a
 
cb5b71d
 
bc133ae
cb5b71d
 
 
 
 
 
edf454b
cb5b71d
edf454b
 
 
 
 
 
 
 
 
 
 
bc133ae
 
 
6a31b9a
 
cb308b8
 
 
 
 
c439062
cb308b8
 
 
 
 
 
 
 
 
cb5b71d
 
 
 
 
edf454b
cb5b71d
 
 
bc133ae
cb5b71d
 
 
 
edf454b
 
cb5b71d
 
 
 
 
bc133ae
cb5b71d
 
 
edf454b
cb308b8
 
 
 
c439062
 
 
cb308b8
 
 
 
 
 
 
 
bc133ae
 
 
 
 
 
 
 
 
 
 
cb5b71d
 
 
edf454b
cb5b71d
 
 
edf454b
cb5b71d
edf454b
cb5b71d
edf454b
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import dataclasses
from typing import Any

import streamlit as st

from core.constants import NAMES_INFO
from core.state import Metadata
import mlcroissant as mlc
from utils import needed_field
from views.metadata import handle_metadata_change
from views.metadata import MetadataEvent

# Field names that `_relevant_fields` leaves out of its result.
_NON_RELEVANT_METADATA = [
    "name",
    "distribution",
    "record_sets",
    "rdf",
]

# Markdown text displayed in the info box of the overview.
# NOTE: the rendered text contains a literal backslash before the colon in
# `sc:Dataset`. It is written as `\\:` here because `\:` is not a valid Python
# escape sequence and triggers a warning in a non-raw string literal.
_INFO_TEXT = """Croissant files are composed of three layers:

- **Metadata** about the dataset covering Responsible AI, licensing and attributes of
                [sc\\:Dataset](https://schema.org/Dataset).
- **Resources**: The contents of a dataset as the underlying files
                ([`FileObject`](https://github.com/mlcommons/croissant/blob/main/docs/croissant-spec.md#fileobject))
                and/or sets of files ([`FileSet`](https://github.com/mlcommons/croissant/blob/main/docs/croissant-spec.md#fileset)).
- **RecordSets**: the sets of structured records obtained from one or more resources
                (typically a file or set of files) and the structure of these records,
                expressed as a set of fields (e.g., the columns of a table).

The next three tabs will guide you through filling those layers. Any error will be
displayed on the overview. Once the dataset is finished, you can download the dataset by
clicking the export button in the upper right corner."""


def _relevant_fields(class_or_instance: Any):
    """Lists the dataclass field names that are relevant for the overview.

    Fields named in `_NON_RELEVANT_METADATA` are always left out.

    Args:
        class_or_instance: A dataclass type, or an instance of a dataclass.

    Returns:
        A list of field names, in declaration order. For a class, every
        non-excluded field is listed. For an instance, only the non-excluded
        fields whose current value is truthy are listed.

    Raises:
        TypeError: If the argument is neither a dataclass nor an instance of
            one (raised by `dataclasses.fields`).
    """
    candidates = [
        field.name
        for field in dataclasses.fields(class_or_instance)
        if field.name not in _NON_RELEVANT_METADATA
    ]
    if isinstance(class_or_instance, type):
        return candidates
    # Read the attributes directly instead of going through
    # `dataclasses.asdict`: only the truthiness of each top-level value is
    # needed, and `asdict` would recursively deep-copy the whole object
    # (including the excluded fields) on every call.
    return [name for name in candidates if getattr(class_or_instance, name)]


def render_overview():
    """Renders the overview: dataset inputs, completion metrics and validation.

    The `Metadata` object is read from `st.session_state`. The left column shows
    the name and description inputs followed by four metrics (completion
    percentage, number of filled metadata fields, number of resources and number
    of RecordSets). The right column shows the validation outcome, but only once
    the metadata has at least one resource or RecordSet, and then the general
    information text.
    """
    metadata: Metadata = st.session_state[Metadata]
    left, right = st.columns([1, 1], gap="medium")

    with left:
        name_key = "metadata-name"
        dataset_name = st.text_input(
            label=needed_field("Name"),
            key=name_key,
            value=metadata.name,
            help=f"The name of the dataset. {NAMES_INFO}",
            placeholder="Dataset",
            on_change=handle_metadata_change,
            args=(MetadataEvent.NAME, metadata, name_key),
        )
        # Without a name, `st.stop()` is called before anything else is drawn.
        if not dataset_name:
            st.stop()

        description_key = "metadata-description"
        st.text_area(
            label="Description",
            key=description_key,
            value=metadata.description,
            placeholder="Provide a description of the dataset.",
            on_change=handle_metadata_change,
            args=(MetadataEvent.DESCRIPTION, metadata, description_key),
        )
        st.divider()

        metric_columns = st.columns([1, 1, 1, 1])
        completion_col, fields_col, resources_col, record_sets_col = metric_columns
        filled_count = len(_relevant_fields(metadata))
        metadata_weight = len(_relevant_fields(Metadata))

        # Completion formula:
        # - Metadata contributes one point per filled field.
        # - Resources and RecordSets each weigh as much as the whole Metadata
        #   layer, and count fully as soon as at least one exists.
        score = filled_count
        if metadata.distribution:
            score += metadata_weight
        if metadata.record_sets:
            score += metadata_weight
        completion = int(score * 100 / (3 * metadata_weight))

        completion_help = (
            "Approximation of the total completion based on the number of fields"
            " that are filled."
        )
        completion_col.metric("Completion", f"{completion}%", help=completion_help)
        fields_col.metric("Metadata fields", filled_count)
        resources_col.metric("Resources", len(metadata.distribution))
        record_sets_col.metric("RecordSets", len(metadata.record_sets))

    with right:
        # Validation feedback is shown only after the user added a RecordSet
        # or a resource.
        if metadata.record_sets or metadata.distribution:
            parts = []
            try:
                errors = metadata.to_canonical().issues.errors
                if errors:
                    parts.append("**Errors**\n")
                    parts.extend(f"{error}\n" for error in errors)
            except mlc.ValidationError as exception:
                parts.append("**Errors**\n")
                parts.append(f"{str(exception)}\n")
            warning = "".join(parts)
            if not warning:
                st.success("No validation issues detected!", icon="✅")
            else:
                st.warning(warning, icon="⚠️")
        st.info(_INFO_TEXT, icon="💡")