import streamlit as st
import time
import logging
import pandas as pd
from datasets import DatasetDict, load_dataset
############################################################
# the dataset of observations (hf dataset in our space)
dataset_id = "Saving-Willy/temp_dataset"
data_files = "data/train-00000-of-00001.parquet"
############################################################
m_logger = logging.getLogger(__name__)
# we can set the log level locally for funcs in this module
# m_logger.setLevel(logging.DEBUG)
m_logger.setLevel(logging.INFO)
presentation_data_schema = {
    'lat': 'float',
    'lon': 'float',
    'species': 'str',
    'author_email': 'str',
    # pandas has no 'timestamp' dtype string; datetime64[ns] is the
    # astype-compatible dtype for the observation date column
    'date': 'datetime64[ns]',
}
def try_download_dataset(dataset_id: str, data_files: str) -> dict:
"""
Attempts to download a dataset from Hugging Face, catching any errors that occur.
Args:
dataset_id (str): The ID of the dataset to download.
data_files (str): The data files associated with the dataset.
Returns:
dict: A dictionary containing the dataset metadata if the download is successful,
or an empty dictionary if an error occurs.
"""
m_logger.info(f"Starting to download dataset {dataset_id} from Hugging Face")
t1 = time.time()
    try:
        metadata: DatasetDict = load_dataset(dataset_id, data_files=data_files)
        elap = time.time() - t1
    except ValueError as e:
        elap = time.time() - t1
        msg = f"Error downloading dataset: {e}. (after {elap:.2f}s)."
        st.error(msg)
        m_logger.error(msg)
        metadata = {}
    except Exception as e:
        # catch-all: log any other exception so it can be handled
        # specifically once the failure mode is identified
        elap = time.time() - t1
        msg = f"!!Unknown Error!! downloading dataset: {e}. (after {elap:.2f}s)."
        st.error(msg)
        m_logger.error(msg)
        metadata = {}
msg = f"Downloaded dataset: (after {elap:.2f}s). "
m_logger.info(msg)
#st.write(msg)
return metadata
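
# Illustrative usage (a sketch, not executed at import time): assuming the
# module-level dataset_id/data_files above point at a reachable parquet file
# on the Hub, the helper can be exercised on its own:
#
#   ds = try_download_dataset(dataset_id, data_files)
#   if ds:
#       print(f"downloaded {ds['train'].num_rows} rows")
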
def get_dataset() -> pd.DataFrame:
"""
Downloads the dataset from Hugging Face and prepares it for use.
If the dataset is not available, it creates an empty DataFrame with the specified schema.
Returns:
pd.DataFrame: A DataFrame containing the dataset, or an empty DataFrame if the dataset is not available.
"""
# load/download data from huggingface dataset
metadata = try_download_dataset(dataset_id, data_files)
if not metadata:
# create an empty, but compliant dataframe
df = pd.DataFrame(columns=presentation_data_schema).astype(presentation_data_schema)
else:
# make a pandas df that is compliant with folium/streamlit maps
        df = pd.DataFrame({
            'lat': metadata["train"]["latitude"],
            'lon': metadata["train"]["longitude"],
            'species': metadata["train"]["selected_class"],
            'author_email': metadata["train"]["author_email"],
            'date': metadata["train"]["date"],
        })
    return df
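
if __name__ == "__main__":
    # Minimal smoke test (a sketch): assumes network access to the Hugging
    # Face Hub. On any download failure get_dataset() falls back to the
    # empty, schema-compliant DataFrame, so this should always print a frame.
    observations = get_dataset()
    print(f"loaded {len(observations)} observations")
    print(observations.dtypes)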