Spaces:
Running
Running
File size: 4,979 Bytes
041af8a cb5b71d bbea1cc cc8c6fc fe3ba5f dc92053 cb5b71d 041af8a cb5b71d bc133ae fe3ba5f bc133ae cb5b71d bc133ae bbea1cc 5a782ad bbea1cc cb5b71d bc133ae cb5b71d cc8c6fc cb5b71d bc133ae 041af8a 0c5b67f fe3ba5f 041af8a cc8c6fc fe3ba5f 041af8a 0c5b67f 041af8a 0c5b67f bc133ae fe3ba5f 0c5b67f 041af8a 0c5b67f 041af8a 0c5b67f 041af8a bc133ae 36f4fe3 bc133ae cb5b71d bc133ae cb5b71d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
import logging
import requests
import streamlit as st
from core.constants import OAUTH_CLIENT_ID
from core.past_projects import save_current_project
from core.path import get_resource_path
from core.query_params import set_project
from core.state import CurrentProject
from core.state import Metadata
import mlcroissant as mlc
from views.load import render_load
from views.previous_files import render_previous_files
# URL prefix that a user-supplied Hugging Face dataset link must start with;
# render_splash() strips it to obtain the dataset name.
_HUGGING_FACE_URL = "https://huggingface.co/datasets/"
# Canonical example datasets offered on the splash page. Each key is listed in
# the "Canonical dataset" select box and, lower-cased, names the dataset's
# folder in the mlcommons/croissant GitHub repository. Each value lists the
# supplementary files (paths relative to that folder) that are downloaded
# alongside the dataset's metadata.json.
_DATASETS = {
"Titanic": ["data/embarkation_ports.csv", "data/genders.csv"],
"FLORES-200": [],
"GPT-3": [],
"COCO2014": [],
"PASS": [],
"MovieLens": [],
"Bigcode-The-Stack": [],
}
# Introductory text (with Markdown links) that render_splash() displays in an
# info box at the top of the splash page.
_INFO = """[Croissant](https://mlcommons.org/croissant) 🥐 is a high-level format for
machine learning datasets built
on [schema.org](https://schema.org/) and its Dataset vocabulary. A croissant
configuration file combines metadata, resource file descriptions, data structure, and
default ML semantics of dataset. You can familiarize yourself with the editor by
exploring the provided examples.
The editor supports creating a new configuration from scratch, as well as uploading
an existing Croissant JSON-MD file. Finally, you can also select any of your
past projects from the list.
You can change the project you are currently editing at any time by clicking
the Menu button and then choosing one of the options on this page."""
# Seconds to wait on any single HTTP request. `requests` has no default
# timeout, so without this a stalled server would hang the app indefinitely.
_REQUEST_TIMEOUT_SECONDS = 10


def _load_hugging_face_dataset(url: str) -> None:
    """Open the Croissant metadata of a Hugging Face dataset as the current project.

    Args:
        url: A dataset URL that starts with `_HUGGING_FACE_URL`.

    On success the metadata is stored in the session state, the project is
    saved and the script is rerun. On failure an error is shown in the UI and
    the function returns without touching the session state.
    """
    # Strip only the leading prefix; `str.replace` would also remove any later
    # occurrence of the prefix inside the dataset name.
    name = url[len(_HUGGING_FACE_URL) :]
    api_url = f"https://datasets-server.huggingface.co/croissant?dataset={name}"
    try:
        payload = requests.get(
            api_url, headers=None, timeout=_REQUEST_TIMEOUT_SECONDS
        ).json()
    except (requests.RequestException, ValueError):
        # Network failure, timeout, or a body that is not JSON at all.
        logging.exception("Could not fetch Croissant metadata from %s", api_url)
        st.error(f"Could not fetch Croissant metadata from {api_url}.")
        return
    try:
        metadata = mlc.Metadata.from_json(mlc.Issues(), payload, None)
        st.session_state[Metadata] = Metadata.from_canonical(metadata)
        save_current_project()
    except Exception:
        st.error(f"Malformed JSON: {payload}")
        return
    # Rerun outside the `try` so the rerun signal can never be swallowed by
    # the error handling above.
    st.rerun()


def render_splash():
    """Render the splash page of the editor.

    The left column lets the user create an empty project, load one of the
    canonical examples listed in `_DATASETS`, load a dataset from a Hugging
    Face URL, or use the loader rendered by `render_load()`. The right column
    lists recent projects via `render_previous_files()`.
    """
    st.info(_INFO, icon="💡")
    if OAUTH_CLIENT_ID:
        st.info(
            "**Disclaimer**: Do not put sensitive information or datasets here. The"
            " storage on Hugging Face Spaces is ephemeral. If you want to host your own"
            " version locally, build the app from [the GitHub"
            " repository](https://github.com/mlcommons/croissant/tree/main/editor)."
        )
    col1, col2 = st.columns([1, 1], gap="large")
    with col1:
        with st.expander("**Create a new dataset**", expanded=True):

            def create_new_croissant():
                """Button callback: start an empty project and save it."""
                st.session_state[Metadata] = Metadata()
                save_current_project()

            st.button(
                "Create",
                on_click=create_new_croissant,
                type="primary",
            )
        with st.expander("**Load an existing dataset**", expanded=True):

            def create_example(dataset: str):
                """Button callback: load the canonical example `dataset` from GitHub."""
                base = f"https://raw.githubusercontent.com/mlcommons/croissant/main/datasets/{dataset.lower()}"
                url = f"{base}/metadata.json"
                try:
                    response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
                    response.raise_for_status()
                    metadata = mlc.Metadata.from_json(
                        mlc.Issues(), response.json(), None
                    )
                    st.session_state[Metadata] = Metadata.from_canonical(metadata)
                    save_current_project()
                    # Write supplementary files.
                    for file in _DATASETS.get(dataset, []):
                        path = get_resource_path(file)
                        file_response = requests.get(
                            f"{base}/{file}", timeout=_REQUEST_TIMEOUT_SECONDS
                        )
                        # Do not persist an HTTP error page as if it were the
                        # data file.
                        file_response.raise_for_status()
                        path.write_bytes(file_response.content)
                except Exception:
                    # Deliberately broad: this is a UI callback, so any failure
                    # is logged with its traceback and reported to the user.
                    logging.exception("Could not load example dataset %r", dataset)
                    st.error(
                        "Sorry, it seems that the example is broken... Can you please"
                        " [open an issue on"
                        " GitHub](https://github.com/mlcommons/croissant/issues/new)?"
                    )

            dataset = st.selectbox(
                label="Canonical dataset",
                options=_DATASETS.keys(),
            )
            st.button(
                f"{dataset} dataset",
                on_click=create_example,
                type="primary",
                args=(dataset,),
            )
            url = st.text_input(
                label="Hugging Face dataset",
                placeholder="Example: https://huggingface.co/datasets/mnist",
            )
            if url.startswith(_HUGGING_FACE_URL):
                _load_hugging_face_dataset(url)
            elif url:
                st.error(
                    f"Unknown URL {url}. Hugging Face URLS should look like"
                    f" {_HUGGING_FACE_URL}somedataset."
                )
            # NOTE(review): the reviewed copy of this file had lost its
            # indentation; rendering the loader inside this expander is
            # assumed from context - confirm against the intended layout.
            render_load()
    with col2:
        with st.expander("**Recent projects**", expanded=True):
            render_previous_files()
|