import datetime
import enum

import streamlit as st

from core.names import find_unique_name
from core.state import Metadata
import mlcroissant as mlc

# List from:
LICENSES_URL = "https://huggingface.co/docs/hub/repositories-licenses"

# Maps the human-readable license name to its short license code.
LICENSES = {
    "Unknown": "unknown",
    "Other": "other",
    "Apache license 2.0": "apache-2.0",
    "MIT": "mit",
    "OpenRAIL license family": "openrail",
    "BigScience OpenRAIL-M": "bigscience-openrail-m",
    "CreativeML OpenRAIL-M": "creativeml-openrail-m",
    "BigScience BLOOM RAIL 1.0": "bigscience-bloom-rail-1.0",
    "BigCode Open RAIL-M v1": "bigcode-openrail-m",
    "Academic Free License v3.0": "afl-3.0",
    "Artistic license 2.0": "artistic-2.0",
    "Boost Software License 1.0": "bsl-1.0",
    "BSD license family": "bsd",
    "BSD 2-clause “Simplified” license": "bsd-2-clause",
    "BSD 3-clause “New” or “Revised” license": "bsd-3-clause",
    "BSD 3-clause Clear license": "bsd-3-clause-clear",
    "Computational Use of Data Agreement": "c-uda",
    "Creative Commons license family": "cc",
    "Creative Commons Zero v1.0 Universal": "cc0-1.0",
    "Creative Commons Attribution 2.0": "cc-by-2.0",
    "Creative Commons Attribution 2.5": "cc-by-2.5",
    "Creative Commons Attribution 3.0": "cc-by-3.0",
    "Creative Commons Attribution 4.0": "cc-by-4.0",
    "Creative Commons Attribution Share Alike 3.0": "cc-by-sa-3.0",
    "Creative Commons Attribution Share Alike 4.0": "cc-by-sa-4.0",
    "Creative Commons Attribution Non Commercial 2.0": "cc-by-nc-2.0",
    "Creative Commons Attribution Non Commercial 3.0": "cc-by-nc-3.0",
    "Creative Commons Attribution Non Commercial 4.0": "cc-by-nc-4.0",
    "Creative Commons Attribution No Derivatives 4.0": "cc-by-nd-4.0",
    "Creative Commons Attribution Non Commercial No Derivatives 3.0": "cc-by-nc-nd-3.0",
    "Creative Commons Attribution Non Commercial No Derivatives 4.0": "cc-by-nc-nd-4.0",
    "Creative Commons Attribution Non Commercial Share Alike 2.0": "cc-by-nc-sa-2.0",
    "Creative Commons Attribution Non Commercial Share Alike 3.0": "cc-by-nc-sa-3.0",
    "Creative Commons Attribution Non Commercial Share Alike 4.0": "cc-by-nc-sa-4.0",
    "Community Data License Agreement – Sharing, Version 1.0": "cdla-sharing-1.0",
    "Community Data License Agreement – Permissive, Version 1.0": "cdla-permissive-1.0",
    "Community Data License Agreement – Permissive, Version 2.0": "cdla-permissive-2.0",
    "Do What The F*ck You Want To Public License": "wtfpl",
    "Educational Community License v2.0": "ecl-2.0",
    "Eclipse Public License 1.0": "epl-1.0",
    "Eclipse Public License 2.0": "epl-2.0",
    "European Union Public License 1.1": "eupl-1.1",
    "GNU Affero General Public License v3.0": "agpl-3.0",
    "GNU Free Documentation License family": "gfdl",
    "GNU General Public License family": "gpl",
    "GNU General Public License v2.0": "gpl-2.0",
    "GNU General Public License v3.0": "gpl-3.0",
    "GNU Lesser General Public License family": "lgpl",
    "GNU Lesser General Public License v2.1": "lgpl-2.1",
    "GNU Lesser General Public License v3.0": "lgpl-3.0",
    "ISC": "isc",
    "LaTeX Project Public License v1.3c": "lppl-1.3c",
    "Microsoft Public License": "ms-pl",
    "Mozilla Public License 2.0": "mpl-2.0",
    "Open Data Commons License Attribution family": "odc-by",
    "Open Database License family": "odbl",
    "Open Rail++-M License": "openrail++",
    "Open Software License 3.0": "osl-3.0",
    "PostgreSQL License": "postgresql",
    "SIL Open Font License 1.1": "ofl-1.1",
    "University of Illinois/NCSA Open Source License": "ncsa",
    "The Unlicense": "unlicense",
    "zLib License": "zlib",
    "Open Data Commons Public Domain Dedication and License": "pddl",
    "Lesser General Public License For Linguistic Resources": "lgpl-lr",
    "DeepFloyd IF Research License Agreement": "deepfloyd-if-license",
    "Llama 2 Community License Agreement": "llama2",
}


def find_license_index(code: str) -> int | None:
    """Finds the position of a license code within LICENSES.

    Args:
        code: Short license code, i.e. one of the values of LICENSES.

    Returns:
        The zero-based index of the first entry of LICENSES (in insertion
        order) whose value equals `code`, or None when no entry matches.
    """
    for index, license_code in enumerate(LICENSES.values()):
        if license_code == code:
            return index
    return None


class MetadataEvent(enum.Enum):
    """Event that triggers a metadata change."""

    NAME = "NAME"
    DESCRIPTION = "DESCRIPTION"
    DATE_PUBLISHED = "DATE_PUBLISHED"
    URL = "URL"
    LICENSE = "LICENSE"
    CITE_AS = "CITE_AS"
    VERSION = "VERSION"
    DATA_BIASES = "DATA_BIASES"
    DATA_COLLECTION = "DATA_COLLECTION"
    PERSONAL_SENSITIVE_INFORMATION = "PERSONAL_SENSITIVE_INFORMATION"
    CREATOR_ADD = "CREATOR_ADD"
    CREATOR_NAME = "CREATOR_NAME"
    CREATOR_URL = "CREATOR_URL"
    CREATOR_REMOVE = "CREATOR_REMOVE"


# Events whose handling is a plain copy of the widget value onto the metadata
# attribute of the given name, without any conversion.
_PASS_THROUGH_ATTRIBUTES = {
    MetadataEvent.DESCRIPTION: "description",
    MetadataEvent.CITE_AS: "cite_as",
    MetadataEvent.URL: "url",
    MetadataEvent.VERSION: "version",
    MetadataEvent.DATA_BIASES: "data_biases",
    MetadataEvent.DATA_COLLECTION: "data_collection",
    MetadataEvent.PERSONAL_SENSITIVE_INFORMATION: "personal_sensitive_information",
}


def handle_metadata_change(
    event: MetadataEvent, metadata: Metadata, key: str
) -> None:
    """Applies a widget change to `metadata`, mutating it in place.

    Args:
        event: The kind of change that happened; selects which attribute of
            `metadata` is updated.
        metadata: The metadata object to update.
        key: Key in `st.session_state` under which the new value is stored.
            It is not read for CREATOR_ADD and CREATOR_REMOVE.

    Only one creator is managed: CREATOR_ADD replaces the creators with a
    single empty person, CREATOR_REMOVE clears them, and CREATOR_NAME /
    CREATOR_URL edit the first creator (creating it when there is none).
    An event that matches none of the cases leaves `metadata` untouched.
    """
    pass_through_attribute = _PASS_THROUGH_ATTRIBUTES.get(event)
    if pass_through_attribute is not None:
        setattr(metadata, pass_through_attribute, st.session_state[key])
    elif event == MetadataEvent.NAME:
        # The name is normalized by find_unique_name against an empty set of
        # already-used names.
        metadata.name = find_unique_name(set(), st.session_state[key])
    elif event == MetadataEvent.LICENSE:
        # The widget holds the human-readable name; store the short code
        # (None when the name is not in LICENSES).
        metadata.license = LICENSES.get(st.session_state[key])
    elif event == MetadataEvent.DATE_PUBLISHED:
        picked = st.session_state[key]
        if picked is None:
            # An empty date input yields None; clear the date instead of
            # failing on `None.year`.
            metadata.date_published = None
        else:
            # Stored as a datetime at midnight of the picked day.
            metadata.date_published = datetime.datetime(
                picked.year, picked.month, picked.day
            )
    elif event == MetadataEvent.CREATOR_ADD:
        metadata.creators = [mlc.Person()]
    elif event == MetadataEvent.CREATOR_REMOVE:
        metadata.creators = []
    elif event == MetadataEvent.CREATOR_NAME:
        if metadata.creators:
            metadata.creators[0].name = st.session_state[key]
        else:
            metadata.creators = [mlc.Person(name=st.session_state[key])]
    elif event == MetadataEvent.CREATOR_URL:
        if metadata.creators:
            metadata.creators[0].url = st.session_state[key]
        else:
            metadata.creators = [mlc.Person(url=st.session_state[key])]