croissant-editor / events /metadata.py
marcenacp's picture
Deploy (see actual commits on https://github.com/mlcommons/croissant).
73ebcab
import datetime
import enum
import streamlit as st
from core.names import find_unique_name
from core.state import Metadata
import mlcroissant as mlc
# The list of licenses below is taken from the page at LICENSES_URL.
LICENSES_URL = "https://huggingface.co/docs/hub/repositories-licenses"
# Maps a human-readable license name (key) to its short license code (value).
# The insertion order is significant: find_license_index reports positions in
# this order.
LICENSES = {
"Unknown": "unknown",
"Other": "other",
"Apache license 2.0": "apache-2.0",
"MIT": "mit",
"OpenRAIL license family": "openrail",
"BigScience OpenRAIL-M": "bigscience-openrail-m",
"CreativeML OpenRAIL-M": "creativeml-openrail-m",
"BigScience BLOOM RAIL 1.0": "bigscience-bloom-rail-1.0",
"BigCode Open RAIL-M v1": "bigcode-openrail-m",
"Academic Free License v3.0": "afl-3.0",
"Artistic license 2.0": "artistic-2.0",
"Boost Software License 1.0": "bsl-1.0",
"BSD license family": "bsd",
"BSD 2-clause “Simplified” license": "bsd-2-clause",
"BSD 3-clause “New” or “Revised” license": "bsd-3-clause",
"BSD 3-clause Clear license": "bsd-3-clause-clear",
"Computational Use of Data Agreement": "c-uda",
"Creative Commons license family": "cc",
"Creative Commons Zero v1.0 Universal": "cc0-1.0",
"Creative Commons Attribution 2.0": "cc-by-2.0",
"Creative Commons Attribution 2.5": "cc-by-2.5",
"Creative Commons Attribution 3.0": "cc-by-3.0",
"Creative Commons Attribution 4.0": "cc-by-4.0",
"Creative Commons Attribution Share Alike 3.0": "cc-by-sa-3.0",
"Creative Commons Attribution Share Alike 4.0": "cc-by-sa-4.0",
"Creative Commons Attribution Non Commercial 2.0": "cc-by-nc-2.0",
"Creative Commons Attribution Non Commercial 3.0": "cc-by-nc-3.0",
"Creative Commons Attribution Non Commercial 4.0": "cc-by-nc-4.0",
"Creative Commons Attribution No Derivatives 4.0": "cc-by-nd-4.0",
"Creative Commons Attribution Non Commercial No Derivatives 3.0": "cc-by-nc-nd-3.0",
"Creative Commons Attribution Non Commercial No Derivatives 4.0": "cc-by-nc-nd-4.0",
"Creative Commons Attribution Non Commercial Share Alike 2.0": "cc-by-nc-sa-2.0",
"Creative Commons Attribution Non Commercial Share Alike 3.0": "cc-by-nc-sa-3.0",
"Creative Commons Attribution Non Commercial Share Alike 4.0": "cc-by-nc-sa-4.0",
"Community Data License Agreement – Sharing, Version 1.0": "cdla-sharing-1.0",
"Community Data License Agreement – Permissive, Version 1.0": "cdla-permissive-1.0",
"Community Data License Agreement – Permissive, Version 2.0": "cdla-permissive-2.0",
"Do What The F*ck You Want To Public License": "wtfpl",
"Educational Community License v2.0": "ecl-2.0",
"Eclipse Public License 1.0": "epl-1.0",
"Eclipse Public License 2.0": "epl-2.0",
"European Union Public License 1.1": "eupl-1.1",
"GNU Affero General Public License v3.0": "agpl-3.0",
"GNU Free Documentation License family": "gfdl",
"GNU General Public License family": "gpl",
"GNU General Public License v2.0": "gpl-2.0",
"GNU General Public License v3.0": "gpl-3.0",
"GNU Lesser General Public License family": "lgpl",
"GNU Lesser General Public License v2.1": "lgpl-2.1",
"GNU Lesser General Public License v3.0": "lgpl-3.0",
"ISC": "isc",
"LaTeX Project Public License v1.3c": "lppl-1.3c",
"Microsoft Public License": "ms-pl",
"Mozilla Public License 2.0": "mpl-2.0",
"Open Data Commons License Attribution family": "odc-by",
"Open Database License family": "odbl",
"Open Rail++-M License": "openrail++",
"Open Software License 3.0": "osl-3.0",
"PostgreSQL License": "postgresql",
"SIL Open Font License 1.1": "ofl-1.1",
"University of Illinois/NCSA Open Source License": "ncsa",
"The Unlicense": "unlicense",
"zLib License": "zlib",
"Open Data Commons Public Domain Dedication and License": "pddl",
"Lesser General Public License For Linguistic Resources": "lgpl-lr",
"DeepFloyd IF Research License Agreement": "deepfloyd-if-license",
"Llama 2 Community License Agreement": "llama2",
}
def find_license_index(code: str) -> int | None:
    """Returns the position of `code` among the values of LICENSES.

    Positions are zero-based and follow the insertion order of LICENSES.

    Args:
        code: The license code to look for (a value of LICENSES).

    Returns:
        The index of the first entry whose code equals `code`, or None when no
        entry matches.
    """
    matching_positions = (
        position
        for position, candidate in enumerate(LICENSES.values())
        if candidate == code
    )
    # `next` with a default yields None instead of raising when nothing matches.
    return next(matching_positions, None)
class MetadataEvent(enum.Enum):
    """Kinds of edits that can be applied to the metadata.

    Every member's value is the member's own name as a string.
    """

    # Edits to a single field of the metadata.
    NAME = "NAME"
    DESCRIPTION = "DESCRIPTION"
    DATE_PUBLISHED = "DATE_PUBLISHED"
    URL = "URL"
    LICENSE = "LICENSE"
    CITE_AS = "CITE_AS"
    VERSION = "VERSION"
    DATA_BIASES = "DATA_BIASES"
    DATA_COLLECTION = "DATA_COLLECTION"
    PERSONAL_SENSITIVE_INFORMATION = "PERSONAL_SENSITIVE_INFORMATION"

    # Edits to the creator of the dataset.
    CREATOR_ADD = "CREATOR_ADD"
    CREATOR_NAME = "CREATOR_NAME"
    CREATOR_URL = "CREATOR_URL"
    CREATOR_REMOVE = "CREATOR_REMOVE"
def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
    """Applies the edit described by `event` to `metadata`, in place.

    For every event except CREATOR_ADD and CREATOR_REMOVE, the new value is
    read from `st.session_state[key]`. Events that are not handled below leave
    `metadata` untouched.

    Args:
        event: The kind of edit to apply.
        metadata: The metadata object to update.
        key: The key in `st.session_state` under which the new value is stored.
    """
    # Events whose session-state value is copied as-is to one attribute.
    copied_attributes = {
        MetadataEvent.DESCRIPTION: "description",
        MetadataEvent.CITE_AS: "cite_as",
        MetadataEvent.URL: "url",
        MetadataEvent.VERSION: "version",
        MetadataEvent.DATA_BIASES: "data_biases",
        MetadataEvent.DATA_COLLECTION: "data_collection",
        MetadataEvent.PERSONAL_SENSITIVE_INFORMATION: (
            "personal_sensitive_information"
        ),
    }
    if event in copied_attributes:
        setattr(metadata, copied_attributes[event], st.session_state[key])
    elif event == MetadataEvent.NAME:
        metadata.name = find_unique_name(set(), st.session_state[key])
    elif event == MetadataEvent.LICENSE:
        # The session state holds the license's display name; store its code.
        # Names that are not in LICENSES result in None.
        metadata.license = LICENSES.get(st.session_state[key])
    elif event == MetadataEvent.DATE_PUBLISHED:
        date = st.session_state[key]
        if date is None:
            # An empty date widget yields None; reading `.year` on it would
            # raise AttributeError inside the callback.
            # NOTE(review): assumes `Metadata.date_published` accepts None --
            # confirm against core.state.
            metadata.date_published = None
        else:
            # Keep only the calendar date; the time part is set to midnight.
            metadata.date_published = datetime.datetime(
                date.year, date.month, date.day
            )
    elif event == MetadataEvent.CREATOR_ADD:
        # Replaces any existing creators with a single empty person.
        metadata.creators = [mlc.Person()]
    elif event == MetadataEvent.CREATOR_REMOVE:
        metadata.creators = []
    elif event == MetadataEvent.CREATOR_NAME:
        # Only the first creator is edited; it is created when missing.
        if metadata.creators:
            metadata.creators[0].name = st.session_state[key]
        else:
            metadata.creators = [mlc.Person(name=st.session_state[key])]
    elif event == MetadataEvent.CREATOR_URL:
        # Only the first creator is edited; it is created when missing.
        if metadata.creators:
            metadata.creators[0].url = st.session_state[key]
        else:
            metadata.creators = [mlc.Person(url=st.session_state[key])]