import streamlit as st
import time
import logging
import pandas as pd
from datasets import load_dataset
from datasets import DatasetDict

############################################################
# the dataset of observations (hf dataset in our space)
dataset_id = "Saving-Willy/temp_dataset"
data_files = "data/train-00000-of-00001.parquet"
############################################################

m_logger = logging.getLogger(__name__)
# we can set the log level locally for funcs in this module
# m_logger.setLevel(logging.DEBUG)
m_logger.setLevel(logging.INFO)

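# expected columns and pandas dtypes for the presentation dataframe
# consumed by the folium/streamlit map views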
presentation_data_schema = {
    'lat': 'float',
    'lon': 'float',
    'species': 'str',
    'author_email': 'str',
    'date': 'datetime64[ns]',  # pandas has no 'timestamp' dtype; use datetime64
}

def try_download_dataset(dataset_id: str, data_files: str) -> dict:
    """
    Attempt to download a dataset from Hugging Face, catching any errors that occur.

    Args:
        dataset_id (str): The ID of the dataset to download.
        data_files (str): The data files associated with the dataset.

    Returns:
        dict: The loaded dataset (a DatasetDict) if the download succeeds,
            or an empty dictionary if an error occurs.
    """

    m_logger.info(f"Starting to download dataset {dataset_id} from Hugging Face")
    t1 = time.time()
    try:
        metadata: DatasetDict = load_dataset(dataset_id, data_files=data_files)
        elap = time.time() - t1
    except ValueError as e:
        elap = time.time() - t1
        msg = f"Error downloading dataset: {e}. (after {elap:.2f}s)."
        st.error(msg)
        m_logger.error(msg)
        metadata = {}
    except Exception as e:
        # catch all (other) exceptions and log them; handle them once isolated
        elap = time.time() - t1
        msg = f"!!Unknown Error!! downloading dataset: {e}. (after {elap:.2f}s)."
        st.error(msg)
        m_logger.error(msg)
        metadata = {}
    else:
        # only report success when the download actually succeeded
        msg = f"Downloaded dataset (after {elap:.2f}s)."
        m_logger.info(msg)
        #st.write(msg)
    return metadata

def get_dataset() -> pd.DataFrame:
    """
    Download the dataset from Hugging Face and prepare it for presentation.

    If the dataset is not available, an empty DataFrame with the expected
    schema is created instead.

    Returns:
        pd.DataFrame: A DataFrame containing the observations, or an empty
            (schema-compliant) DataFrame if the dataset is not available.
    """
    # load/download data from huggingface dataset
    metadata = try_download_dataset(dataset_id, data_files)
    
    if not metadata:
        # create an empty, but compliant dataframe
        df = pd.DataFrame(columns=presentation_data_schema).astype(presentation_data_schema)
    else:
        # make a pandas df that is compliant with folium/streamlit maps
        df = pd.DataFrame({
            'lat': metadata["train"]["latitude"],
            'lon': metadata["train"]["longitude"],
            'species': metadata["train"]["selected_class"],
            'author_email': metadata["train"]["author_email"],
            'date': metadata["train"]["date"],
        })
    return df
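
if __name__ == "__main__":
    # Minimal smoke test: an illustrative sketch, not part of the original
    # module. It assumes network access to the Hugging Face Hub; when run
    # with plain `python` (outside `streamlit run`), Streamlit calls such
    # as st.error() typically just log a warning rather than render anything.
    df = get_dataset()
    print(f"loaded {len(df)} observations; columns: {list(df.columns)}")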