vancauwe committed on
Commit
6f0e32c
Β·
1 Parent(s): 44e5f5e

feat: extract dataset manipulation from map features

Browse files
src/dataset/download.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import time
3
+ import logging
4
+ import pandas as pd
5
+ from datasets import load_dataset
6
+ from datasets import DatasetDict, Dataset
7
+
8
############################################################
# the dataset of observations (hf dataset in our space)
dataset_id = "Saving-Willy/temp_dataset"
data_files = "data/train-00000-of-00001.parquet"
############################################################

m_logger = logging.getLogger(__name__)
# we can set the log level locally for funcs in this module
#g_m_logger.setLevel(logging.DEBUG)
m_logger.setLevel(logging.INFO)

# dtype schema for the map-presentation dataframe; also used to build an
# empty-but-compliant dataframe when the dataset download fails
presentation_data_schema = {
    'lat': 'float',
    'lon': 'float',
    'species': 'str',
}
24
+
25
def try_download_dataset(dataset_id:str, data_files:str) -> dict:
    """
    Attempts to download a dataset from Hugging Face, catching any errors that occur.

    Errors are reported to the user via `st.error` and logged; on failure an
    empty dict is returned so callers can fall back gracefully.

    Args:
        dataset_id (str): The ID of the dataset to download.
        data_files (str): The data files associated with the dataset.
    Returns:
        dict: A dictionary containing the dataset metadata if the download is successful,
        or an empty dictionary if an error occurs.

    """
    m_logger.info(f"Starting to download dataset {dataset_id} from Hugging Face")
    t1 = time.time()
    metadata: dict = {}
    try:
        metadata = load_dataset(dataset_id, data_files=data_files)
    except ValueError as e:
        elap = time.time() - t1
        msg = f"Error downloading dataset: {e}. (after {elap:.2f}s)."
        st.error(msg)
        m_logger.error(msg)
    except Exception as e:
        # catch all (other) exceptions and log them, handle them once isolated
        elap = time.time() - t1
        msg = f"!!Unknown Error!! downloading dataset: {e}. (after {elap:.2f}s)."
        st.error(msg)
        m_logger.error(msg)
    else:
        # bug fix: previously the success message below was emitted even when
        # an exception was caught; report success only on the success path
        elap = time.time() - t1
        msg = f"Downloaded dataset: (after {elap:.2f}s). "
        m_logger.info(msg)
        st.write(msg)
    return metadata
62
+
63
def get_dataset():
    """
    Load/download the observation dataset from Hugging Face and return it as
    a dataframe compliant with folium/streamlit maps.

    Returns:
        pd.DataFrame: columns 'lat', 'lon', 'species'. Empty (but with the
        compliant schema/dtypes) when the download failed.
    """
    metadata = try_download_dataset(dataset_id, data_files)

    if metadata:
        # build the map-compatible dataframe from the 'train' split columns
        train_split = metadata["train"]
        df = pd.DataFrame(
            {
                'lat': train_split["latitude"],
                'lon': train_split["longitude"],
                'species': train_split["selected_class"],
            }
        )
    else:
        # download failed: fall back to an empty, but compliant, dataframe
        df = pd.DataFrame(columns=presentation_data_schema).astype(presentation_data_schema)
    return df
src/dataset/requests.py ADDED
File without changes
src/maps/obs_map.py CHANGED
@@ -1,18 +1,13 @@
1
  from typing import Tuple
2
  import logging
3
 
4
- import pandas as pd
5
- from datasets import load_dataset
6
- from datasets import DatasetDict, Dataset
7
-
8
- import time
9
-
10
  import streamlit as st
11
  import folium
12
  from streamlit_folium import st_folium
13
 
14
  import whale_viewer as viewer
15
  from utils.fix_tabrender import js_show_zeroheight_iframe
 
16
 
17
  m_logger = logging.getLogger(__name__)
18
  # we can set the log level locally for funcs in this module
@@ -66,13 +61,6 @@ _colors = [
66
 
67
  whale2color = {k: v for k, v in zip(viewer.WHALE_CLASSES, _colors)}
68
 
69
- presentation_data_schema = {
70
- 'lat': 'float',
71
- 'lon': 'float',
72
- 'species': 'str',
73
- }
74
-
75
-
76
  def create_map(tile_name:str, location:Tuple[float], zoom_start: int = 7) -> folium.Map:
77
  """
78
  Create a folium map with the specified tile layer
@@ -124,48 +112,8 @@ def create_map(tile_name:str, location:Tuple[float], zoom_start: int = 7) -> fol
124
  #folium.LayerControl().add_to(m)
125
  return m
126
 
127
- def try_download_dataset(dataset_id:str, data_files:str) -> dict:
128
- """
129
- Attempts to download a dataset from Hugging Face, catching any errors that occur.
130
-
131
- Args:
132
- dataset_id (str): The ID of the dataset to download.
133
- data_files (str): The data files associated with the dataset.
134
- Returns:
135
- dict: A dictionary containing the dataset metadata if the download is successful,
136
- or an empty dictionary if an error occurs.
137
 
138
- """
139
-
140
- m_logger.info(f"Starting to download dataset {dataset_id} from Hugging Face")
141
- t1 = time.time()
142
- try:
143
- metadata:DatasetDict = load_dataset(dataset_id, data_files=data_files)
144
- t2 = time.time(); elap = t2 - t1
145
- except ValueError as e:
146
- t2 = time.time(); elap = t2 - t1
147
- msg = f"Error downloading dataset: {e}. (after {elap:.2f}s)."
148
- st.error(msg)
149
- m_logger.error(msg)
150
- metadata = {}
151
- except Exception as e:
152
- # catch all (other) exceptions and log them, handle them once isolated
153
- t2 = time.time(); elap = t2 - t1
154
- msg = f"!!Unknown Error!! downloading dataset: {e}. (after {elap:.2f}s)."
155
- st.error(msg)
156
- m_logger.error(msg)
157
- metadata = {}
158
-
159
-
160
- msg = f"Downloaded dataset: (after {elap:.2f}s). "
161
- m_logger.info(msg)
162
- st.write(msg)
163
- return metadata
164
-
165
-
166
- def present_obs_map(dataset_id:str = "Saving-Willy/Happywhale-kaggle",
167
- data_files:str = "data/train-00000-of-00001.parquet",
168
- dbg_show_extra:bool = False) -> dict:
169
  """
170
  Render map plus tile selector, with markers for whale observations
171
 
@@ -186,19 +134,7 @@ def present_obs_map(dataset_id:str = "Saving-Willy/Happywhale-kaggle",
186
 
187
  """
188
 
189
- # load/download data from huggingface dataset
190
- metadata = try_download_dataset(dataset_id, data_files)
191
-
192
- if not metadata:
193
- # create an empty, but compliant dataframe
194
- _df = pd.DataFrame(columns=presentation_data_schema).astype(presentation_data_schema)
195
- else:
196
- # make a pandas df that is compliant with folium/streamlit maps
197
- _df = pd.DataFrame({
198
- 'lat': metadata["train"]["latitude"],
199
- 'lon': metadata["train"]["longitude"],
200
- 'species': metadata["train"]["selected_class"],}
201
- )
202
 
203
  if dbg_show_extra:
204
  # add a few samples to visualise colours
 
1
  from typing import Tuple
2
  import logging
3
 
 
 
 
 
 
 
4
  import streamlit as st
5
  import folium
6
  from streamlit_folium import st_folium
7
 
8
  import whale_viewer as viewer
9
  from utils.fix_tabrender import js_show_zeroheight_iframe
10
+ from dataset.download import get_dataset
11
 
12
  m_logger = logging.getLogger(__name__)
13
  # we can set the log level locally for funcs in this module
 
61
 
62
  whale2color = {k: v for k, v in zip(viewer.WHALE_CLASSES, _colors)}
63
 
 
 
 
 
 
 
 
64
  def create_map(tile_name:str, location:Tuple[float], zoom_start: int = 7) -> folium.Map:
65
  """
66
  Create a folium map with the specified tile layer
 
112
  #folium.LayerControl().add_to(m)
113
  return m
114
 
 
 
 
 
 
 
 
 
 
 
115
 
116
+ def present_obs_map(dbg_show_extra:bool = False) -> dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  """
118
  Render map plus tile selector, with markers for whale observations
119
 
 
134
 
135
  """
136
 
137
+ _df = get_dataset()
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  if dbg_show_extra:
140
  # add a few samples to visualise colours
src/pages/2_🌍_map.py CHANGED
@@ -6,8 +6,6 @@ st.set_page_config(
6
  layout="wide",
7
  )
8
 
9
- from utils.st_logs import parse_log_buffer, init_logging_session_states
10
-
11
  from maps.obs_map import add_obs_map_header
12
  from maps.alps_map import present_alps_map
13
  from maps.obs_map import present_obs_map
@@ -16,10 +14,6 @@ from datasets import disable_caching
16
  disable_caching()
17
 
18
  ############################################################
19
- # TO- DO: MAKE ENV FILE
20
- # the dataset of observations (hf dataset in our space)
21
- dataset_id = "Saving-Willy/temp_dataset"
22
- data_files = "data/train-00000-of-00001.parquet"
23
  USE_BASIC_MAP = False
24
  DEV_SIDEBAR_LIB = True
25
  ############################################################
@@ -35,10 +29,7 @@ with tab_map_ui_cols[1]:
35
 
36
  if show_db_points:
37
  # show a nicer map, observations marked, tileset selectable.
38
- st_observation = present_obs_map(
39
- dataset_id=dataset_id, data_files=data_files,
40
- dbg_show_extra=dbg_show_extra)
41
-
42
  else:
43
  # development map.
44
  st_observation = present_alps_map()
 
6
  layout="wide",
7
  )
8
 
 
 
9
  from maps.obs_map import add_obs_map_header
10
  from maps.alps_map import present_alps_map
11
  from maps.obs_map import present_obs_map
 
14
  disable_caching()
15
 
16
  ############################################################
 
 
 
 
17
  USE_BASIC_MAP = False
18
  DEV_SIDEBAR_LIB = True
19
  ############################################################
 
29
 
30
  if show_db_points:
31
  # show a nicer map, observations marked, tileset selectable.
32
+ st_observation = present_obs_map(dbg_show_extra=dbg_show_extra)
 
 
 
33
  else:
34
  # development map.
35
  st_observation = present_alps_map()
src/pages/5_🀝_requests.py CHANGED
@@ -5,4 +5,13 @@ st.set_page_config(
5
  page_icon="🀝",
6
  )
7
 
8
- from utils.st_logs import parse_log_buffer, init_logging_session_states
 
 
 
 
 
 
 
 
 
 
5
  page_icon="🀝",
6
  )
7
 
8
+ from utils.st_logs import parse_log_buffer, init_logging_session_states
9
+
10
+ from datasets import disable_caching
11
+ disable_caching()
12
+
13
+ ############################################################
14
+ # the dataset of observations (hf dataset in our space)
15
+ dataset_id = "Saving-Willy/temp_dataset"
16
+ data_files = "data/train-00000-of-00001.parquet"
17
+ ############################################################