# -- Hugging Face Space app module (page-header scrape artifacts removed) --
import logging
import time

import pandas as pd
import streamlit as st
from datasets import DatasetDict, load_dataset
############################################################
# the dataset of observations (hf dataset in our space)
dataset_id = "Saving-Willy/temp_dataset"
data_files = "data/train-00000-of-00001.parquet"
############################################################

m_logger = logging.getLogger(__name__)
# we can set the log level locally for funcs in this module
m_logger.setLevel(logging.INFO)

# pandas dtypes for the presentation dataframe; used to build an empty but
# schema-compliant frame when the dataset cannot be downloaded.
# BUG FIX: 'timestamp' is not a recognised pandas dtype string, so
# .astype(presentation_data_schema) raised TypeError on the empty-data path;
# 'datetime64[ns]' is the correct dtype for a datetime column.
presentation_data_schema = {
    'lat': 'float',
    'lon': 'float',
    'species': 'str',
    'author_email': 'str',
    'date': 'datetime64[ns]',
}
def try_download_dataset(dataset_id: str, data_files: str) -> dict:
    """
    Attempts to download a dataset from Hugging Face, catching any errors that occur.

    Args:
        dataset_id (str): The ID of the dataset to download.
        data_files (str): The data files associated with the dataset.

    Returns:
        dict: A dictionary containing the dataset metadata if the download is successful,
        or an empty dictionary if an error occurs.
    """
    m_logger.info(f"Starting to download dataset {dataset_id} from Hugging Face")
    t1 = time.time()
    try:
        metadata: DatasetDict = load_dataset(dataset_id, data_files=data_files)
    except ValueError as e:
        elap = time.time() - t1
        msg = f"Error downloading dataset: {e}. (after {elap:.2f}s)."
        st.error(msg)
        m_logger.error(msg)
        metadata = {}
    except Exception as e:
        # catch all (other) exceptions and log them, handle them once isolated
        elap = time.time() - t1
        msg = f"!!Unknown Error!! downloading dataset: {e}. (after {elap:.2f}s)."
        st.error(msg)
        m_logger.error(msg)
        metadata = {}
    else:
        # BUG FIX: the success message was previously logged unconditionally,
        # i.e. even after an exception; the else clause restricts it to the
        # successful-download path only.
        elap = time.time() - t1
        m_logger.info(f"Downloaded dataset: (after {elap:.2f}s). ")
    return metadata
def get_dataset() -> pd.DataFrame:
    """
    Downloads the dataset from Hugging Face and prepares it for use.
    If the dataset is not available, it creates an empty DataFrame with the specified schema.

    Returns:
        pd.DataFrame: A DataFrame containing the dataset, or an empty DataFrame if the dataset is not available.
    """
    # load/download data from huggingface dataset
    metadata = try_download_dataset(dataset_id, data_files)

    if metadata:
        # remap the HF dataset columns onto names folium/streamlit maps expect
        train = metadata["train"]
        return pd.DataFrame({
            'lat': train["latitude"],
            'lon': train["longitude"],
            'species': train["selected_class"],
            'author_email': train["author_email"],
            'date': train["date"],
        })

    # download failed: fall back to an empty, schema-compliant dataframe
    return pd.DataFrame(columns=presentation_data_schema).astype(presentation_data_schema)