vancauwe committed
Commit 41c5156 · 1 Parent(s): 6f0e32c

feat: requests basic architecture

src/dataset/cleaner.py ADDED
@@ -0,0 +1,15 @@
+import pandas as pd
+
+def clean_lat_long(df): # Ensure lat and lon are numeric, coerce errors to NaN
+    df['lat'] = pd.to_numeric(df['lat'], errors='coerce')
+    df['lon'] = pd.to_numeric(df['lon'], errors='coerce')
+
+    # Drop rows with NaN in lat or lon
+    df = df.dropna(subset=['lat', 'lon']).reset_index(drop=True)
+    return df
+
+def clean_date(df): # Ensure dates are valid datetimes, coerce errors to NaT
+    df['date'] = pd.to_datetime(df['date'], errors='coerce')
+    # Drop rows with NaT in date
+    df = df.dropna(subset=['date']).reset_index(drop=True)
+    return df
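A minimal usage sketch for the new cleaners, chained in the same order the requests module uses them below; the sample rows are invented for illustration and are not part of the dataset.

import pandas as pd
from dataset.cleaner import clean_lat_long, clean_date

# Invented raw rows: one valid, one with an unparseable latitude
raw = pd.DataFrame({
    'lat': ['59.3', 'not-a-number'],
    'lon': [10.5, 4.2],
    'species': ['humpback_whale', 'fin_whale'],
    'date': ['2023-06-01', '2023-07-15'],
})

df = clean_date(clean_lat_long(raw))
print(df.dtypes)   # lat/lon become float64, date becomes datetime64[ns]
print(len(df))     # 1 — the row with the bad latitude is dropped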
src/dataset/download.py CHANGED
@@ -20,6 +20,8 @@ presentation_data_schema = {
     'lat': 'float',
     'lon': 'float',
     'species': 'str',
+    'author_email': 'str',
+    'date' : 'timestamp',
     }
 
 def try_download_dataset(dataset_id:str, data_files:str) -> dict:
@@ -72,6 +74,8 @@ def get_dataset():
     df = pd.DataFrame({
         'lat': metadata["train"]["latitude"],
         'lon': metadata["train"]["longitude"],
-        'species': metadata["train"]["selected_class"],}
+        'species': metadata["train"]["selected_class"],
+        'author_email': metadata["train"]["author_email"],
+        'date': metadata["train"]["date"],}
     )
     return df
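For orientation, a hedged check of what get_dataset() is expected to return after this change; the assertion simply mirrors the schema extension above and is not part of the commit.

from dataset.download import get_dataset

df = get_dataset()
# The two new fields should now travel alongside the original columns
assert {'lat', 'lon', 'species', 'author_email', 'date'} <= set(df.columns)
print(df.dtypes)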
src/dataset/fake_data.py ADDED
@@ -0,0 +1,66 @@
+import pandas as pd
+import numpy as np
+import random
+from datetime import datetime, timedelta
+
+def generate_fake_data(df, num_fake):
+
+    # Options for random generation
+    species_options = [
+        "beluga",
+        "blue_whale",
+        "bottlenose_dolphin",
+        "brydes_whale",
+        "commersons_dolphin",
+        "common_dolphin",
+        "cuviers_beaked_whale",
+        "dusky_dolphin",
+        "false_killer_whale",
+        "fin_whale",
+        "frasiers_dolphin",
+        "gray_whale",
+        "humpback_whale",
+        "killer_whale",
+        "long_finned_pilot_whale",
+        "melon_headed_whale",
+        "minke_whale",
+        "pantropic_spotted_dolphin",
+        "pygmy_killer_whale",
+        "rough_toothed_dolphin",
+        "sei_whale",
+        "short_finned_pilot_whale",
+        "southern_right_whale",
+        "spinner_dolphin",
+        "spotted_dolphin",
+        "white_sided_dolphin",
+    ]
+    email_options = [
+
+
+    ]
+
+    def random_ocean_coord():
+        """Generate random ocean-friendly coordinates."""
+        lat = random.uniform(-60, 60)  # avoid poles
+        lon = random.uniform(-180, 180)
+        return lat, lon
+
+    def random_date(start_year=2018, end_year=2025):
+        """Generate a random date."""
+        start = datetime(start_year, 1, 1)
+        end = datetime(end_year, 1, 1)
+        return start + timedelta(days=random.randint(0, (end - start).days))
+
+    # Generate num_fake new observations
+    new_data = []
+    for _ in range(num_fake):
+        lat, lon = random_ocean_coord()
+        species = random.choice(species_options)
+        email = random.choice(email_options)
+        date = random_date()
+        new_data.append([lat, lon, species, email, date])
+
+    # Create a DataFrame and append
+    new_df = pd.DataFrame(new_data, columns=['lat', 'lon', 'species', 'author_email', 'date'])
+    df = pd.concat([df, new_df], ignore_index=True)
+    return df
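A short usage sketch for generate_fake_data, assuming email_options has been filled in (the addresses are not visible in this view of the diff); it pads a frame with synthetic sightings for exercising the UI.

import pandas as pd
from dataset.fake_data import generate_fake_data

# Illustrative starting point: an empty frame with the expected columns
seed = pd.DataFrame(columns=['lat', 'lon', 'species', 'author_email', 'date'])
df = generate_fake_data(seed, 20)

print(len(df))                            # 20 synthetic rows appended
print(df['lat'].between(-60, 60).all())   # coordinates stay away from the poles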
src/dataset/requests.py CHANGED
@@ -0,0 +1,50 @@
+import streamlit as st
+import pandas as pd
+from dataset.cleaner import clean_lat_long, clean_date
+from dataset.download import get_dataset
+from dataset.fake_data import generate_fake_data
+
+def default_data_view():
+    df = get_dataset()
+    df = generate_fake_data(df, 100)
+    df = clean_lat_long(df)
+    df = clean_date(df)
+    return df
+
+def filter_data(df):
+    # Apply the filters cumulatively so date, lon and lat all narrow the result
+    df_filtered = df
+    if st.session_state.date_range:
+        df_filtered = df_filtered[
+            (df_filtered['date'] >= pd.to_datetime(st.session_state.date_range[0])) &
+            (df_filtered['date'] <= pd.to_datetime(st.session_state.date_range[1]))
+        ]
+    if st.session_state.lon_range:
+        df_filtered = df_filtered[
+            (df_filtered['lon'] >= st.session_state.lon_range[0]) &
+            (df_filtered['lon'] <= st.session_state.lon_range[1])
+        ]
+    if st.session_state.lat_range:
+        df_filtered = df_filtered[
+            (df_filtered['lat'] >= st.session_state.lat_range[0]) &
+            (df_filtered['lat'] <= st.session_state.lat_range[1])
+        ]
+    return df_filtered
+
+def show_specie_author(df):
+    df = df.groupby(['species', 'author_email']).size().reset_index(name='counts')
+    for specie in df["species"].unique():
+        st.subheader(f"Species: {specie}")
+        specie_data = df[df['species'] == specie]
+        for _, row in specie_data.iterrows():
+            key = f"{specie}_{row['author_email']}"
+            label = f"{row['author_email']} ({row['counts']})"
+            st.session_state.checkbox_states[key] = st.checkbox(label, key=key)
+
+def show_new_data_view(df):
+    df = filter_data(df)
+    df_ordered = show_specie_author(df)
+    return df_ordered
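filter_data reads date_range, lat_range and lon_range from st.session_state; a minimal sketch of that contract as it might appear inside a Streamlit page (the sample ranges are invented), before the page further down wires the same keys to sidebar widgets.

import pandas as pd
import streamlit as st
from dataset.requests import default_data_view, filter_data

# Invented bounds; the requests page below seeds the same keys from sliders
st.session_state.date_range = (pd.Timestamp('2020-01-01'), pd.Timestamp('2024-12-31'))
st.session_state.lat_range = (-30.0, 30.0)
st.session_state.lon_range = (-90.0, 90.0)

df = default_data_view()
st.write(filter_data(df))   # only rows inside the chosen date / lat / lon window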
src/maps/obs_map.py CHANGED
@@ -135,7 +135,7 @@ def present_obs_map(dbg_show_extra:bool = False) -> dict:
     """
 
     _df = get_dataset()
-
+    print(_df)
     if dbg_show_extra:
         # add a few samples to visualise colours
         _df.loc[len(_df)] = {'lat': 0, 'lon': 0, 'species': 'rough_toothed_dolphin'}
src/pages/5_🤝_requests.py CHANGED
@@ -6,12 +6,63 @@ st.set_page_config(
 )
 
 from utils.st_logs import parse_log_buffer, init_logging_session_states
-
+from dataset.requests import default_data_view, show_new_data_view
 from datasets import disable_caching
 disable_caching()
 
-############################################################
-# the dataset of observations (hf dataset in our space)
-dataset_id = "Saving-Willy/temp_dataset"
-data_files = "data/train-00000-of-00001.parquet"
-############################################################
+st.title("Requests")
+
+# Initialize the default data view
+df = default_data_view()
+print(df)
+
+if 'checkbox_states' not in st.session_state:
+    st.session_state.checkbox_states = {}
+
+if 'lat_range' not in st.session_state:
+    st.session_state.lat_range = (float(df['lat'].min()), float(df['lat'].max()))
+
+if 'lon_range' not in st.session_state:
+    st.session_state.lon_range = (df['lon'].min(), df['lon'].max())
+
+if 'date_range' not in st.session_state:
+    st.session_state.date_range = (df['date'].min(), df['date'].max())
+
+# Request button at the bottom
+if st.button("Request (Bottom)"):
+    selected = [k for k, v in st.session_state.checkbox_states.items() if v]
+    if selected:
+        st.success(f"Request submitted for: {', '.join(selected)}")
+    else:
+        st.warning("No selections made.")
+
+# Latitude range filter
+lat_min, lat_max = float(df['lat'].min()), float(df['lat'].max())
+lat_range = st.sidebar.slider("Latitude range",
+                              min_value=lat_min,
+                              max_value=lat_max,
+                              value=(lat_min, lat_max),
+                              key='lat_range')
+
+# Longitude range filter
+lon_min, lon_max = float(df['lon'].min()), float(df['lon'].max())
+lon_range = st.sidebar.slider("Longitude range",
+                              min_value=lon_min,
+                              max_value=lon_max,
+                              value=(lon_min, lon_max),
+                              key='lon_range')
+
+# Date range filter
+date_min, date_max = df['date'].min(), df['date'].max()
+date_range = st.sidebar.date_input("Date range",
+                                   value=(date_min, date_max),
+                                   min_value=date_min,
+                                   max_value=date_max,
+                                   key='date_range')
+
+# Show authors per species
+show_new_data_view(df)