vancauwe committed
Commit de2a82e · 1 Parent(s): 14b60f7

feat: add types and home page text

README.md CHANGED
@@ -28,7 +28,7 @@ pip install -r requirements.txt
  ```
 
  ```
- streamlit run src/main.py
+ streamlit run src/home.py
  ```
 
 
requirements.txt CHANGED
@@ -13,6 +13,9 @@ datasets==3.0.2
  ## FSM
  transitions==0.9.2
 
+ # data manipulation
+ pandas==2.2.3
+
  # running ML models
 
  ## to use ML models hosted on HF
@@ -28,9 +31,6 @@ pillow==10.4.0
  opencv-python-headless==4.5.5.64
  albumentations==1.1.0
 
- # for states
- transitions==0.9.2
-
  # for env variables
  python-dotenv==1.1.0
 
src/dataset/cleaner.py CHANGED
@@ -1,8 +1,9 @@
  import pandas as pd
 
- def clean_lat_long(df): # Ensure lat and lon are numeric, coerce errors to NaN
+ def clean_lat_long(df) -> pd.DataFrame:
      """
      Clean latitude and longitude columns in the DataFrame.
+     Ensure lat and lon are numeric, coerce errors to NaN
      Args:
          df (pd.DataFrame): DataFrame containing latitude and longitude columns.
      Returns:
@@ -15,7 +16,7 @@ def clean_lat_long(df): # Ensure lat and lon are numeric, coerce errors to NaN
      df = df.dropna(subset=['lat', 'lon']).reset_index(drop=True)
      return df
 
- def clean_date(df): # Ensure lat and lon are numeric, coerce errors to NaN
+ def clean_date(df) -> pd.DataFrame: # Ensure lat and lon are numeric, coerce errors to NaN
      """
      Clean date column in the DataFrame.
      Args:
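Editor's note: the hunk above only shows the signature, docstring, and the final `dropna` line of `clean_lat_long`. A minimal sketch of the cleaning step it describes, assuming the coercion is done with `pd.to_numeric` (the actual implementation is not visible in this diff):

```python
import pandas as pd

def clean_lat_long(df) -> pd.DataFrame:
    # Assumption: coerce lat/lon to numeric; invalid entries become NaN
    df['lat'] = pd.to_numeric(df['lat'], errors='coerce')
    df['lon'] = pd.to_numeric(df['lon'], errors='coerce')
    # Shown verbatim in the hunk: drop rows without valid coordinates
    df = df.dropna(subset=['lat', 'lon']).reset_index(drop=True)
    return df
```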
src/dataset/{requests.py → data_requests.py} RENAMED
@@ -4,7 +4,7 @@ from dataset.cleaner import clean_lat_long, clean_date
  from dataset.download import get_dataset
  from dataset.fake_data import generate_fake_data
 
- def data_prep():
+ def data_prep() -> pd.DataFrame:
      """
      Prepares the dataset for use in the application.
      Downloads the dataset and cleans the data (and generates fake data if needed).
@@ -18,7 +18,7 @@ def data_prep():
      df = clean_date(df)
      return df
 
- def filter_data(df):
+ def filter_data(df) -> pd.DataFrame:
      """
      Filter the DataFrame based on user-selected ranges for latitude, longitude, and date.
      Args:
@@ -51,8 +51,9 @@ def show_specie_author(df):
      label = f"{row['author_email']} ({row['counts']})"
      st.session_state.checkbox_states[key] = st.checkbox(label, key=key)
 
- def show_new_data_view(df):
+ def show_new_data_view(df) -> pd.DataFrame:
      """
+     Show the new filtered data view on the UI.
      Filter the dataframe based on the state of the localisation sliders and selected timeframe by the user.
      Then, show the results of the filtering grouped by species then by authors.
      Authors are matched to a checkbox component so the user can click it if he/she/they wish to request data from this author.
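Editor's note: the docstrings above describe `filter_data` as filtering on user-selected latitude, longitude, and date ranges. A rough sketch of that idea, assuming hypothetical `st.session_state` keys (none of these key names appear in the diff):

```python
import pandas as pd
import streamlit as st

def filter_data(df) -> pd.DataFrame:
    # Hypothetical session-state keys for the sliders and the date selection
    lat_min, lat_max = st.session_state.get("lat_range", (-90.0, 90.0))
    lon_min, lon_max = st.session_state.get("lon_range", (-180.0, 180.0))
    date_start, date_end = st.session_state.get("date_range", (df["date"].min(), df["date"].max()))

    # Keep only rows inside all three user-selected ranges
    mask = (
        df["lat"].between(lat_min, lat_max)
        & df["lon"].between(lon_min, lon_max)
        & df["date"].between(date_start, date_end)
    )
    return df[mask].reset_index(drop=True)
```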
src/dataset/download.py CHANGED
@@ -3,7 +3,7 @@ import time
  import logging
  import pandas as pd
  from datasets import load_dataset
- from datasets import DatasetDict, Dataset
+ from datasets import DatasetDict
 
  ############################################################
  # the dataset of observations (hf dataset in our space)
@@ -62,7 +62,7 @@ def try_download_dataset(dataset_id:str, data_files:str) -> dict:
      #st.write(msg)
      return metadata
 
- def get_dataset():
+ def get_dataset() -> pd.DataFrame:
      """
      Downloads the dataset from Hugging Face and prepares it for use.
      If the dataset is not available, it creates an empty DataFrame with the specified schema.
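Editor's note: `get_dataset` now advertises a `pd.DataFrame` return, and its docstring says it falls back to an empty DataFrame with the project schema when the dataset is unavailable. A sketch of that pattern, assuming `presentation_data_schema` is a column-to-dtype dict (the column names mirror the old hard-coded list in `fake_data.py`; the dtypes and the dataset id are placeholders, not taken from this diff):

```python
import pandas as pd
from datasets import load_dataset

# Assumed schema; fake_data.py imports `presentation_data_schema` from this module
presentation_data_schema = {
    "lat": "float64",
    "lon": "float64",
    "species": "object",
    "author_email": "object",
    "date": "object",
}

def get_dataset() -> pd.DataFrame:
    try:
        # Placeholder dataset id; the real id is configured elsewhere in download.py
        ds = load_dataset("some-org/observations", split="train")
        return ds.to_pandas()
    except Exception:
        # Fall back to an empty, correctly typed DataFrame
        return pd.DataFrame(columns=list(presentation_data_schema)).astype(presentation_data_schema)
```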
src/dataset/fake_data.py CHANGED
@@ -1,11 +1,14 @@
  import pandas as pd
- import numpy as np
  import random
  from datetime import datetime, timedelta
 
- def generate_fake_data(df, num_fake):
+ from download import presentation_data_schema
+ from whale_viewer import WHALE_CLASSES
+
+ def generate_fake_data(df, num_fake) -> pd.DataFrame:
      """
      Generate fake data for the dataset.
+
      Args:
          df (pd.DataFrame): Original DataFrame to append fake data to.
          num_fake (int): Number of fake observations to generate.
@@ -14,34 +17,7 @@ def generate_fake_data(df, num_fake):
      """
 
      # Options for random generation
-     species_options = [
-         "beluga",
-         "blue_whale",
-         "bottlenose_dolphin",
-         "brydes_whale",
-         "commersons_dolphin",
-         "common_dolphin",
-         "cuviers_beaked_whale",
-         "dusky_dolphin",
-         "false_killer_whale",
-         "fin_whale",
-         "frasiers_dolphin",
-         "gray_whale",
-         "humpback_whale",
-         "killer_whale",
-         "long_finned_pilot_whale",
-         "melon_headed_whale",
-         "minke_whale",
-         "pantropic_spotted_dolphin",
-         "pygmy_killer_whale",
-         "rough_toothed_dolphin",
-         "sei_whale",
-         "short_finned_pilot_whale",
-         "southern_right_whale",
-         "spinner_dolphin",
-         "spotted_dolphin",
-         "white_sided_dolphin",
-     ]
+     species_options = WHALE_CLASSES
      email_options = [
@@ -67,6 +43,6 @@ def generate_fake_data(df, num_fake):
      date = random_date()
      new_data.append([lat, lon, species, email, date])
 
-     new_df = pd.DataFrame(new_data, columns=['lat', 'lon', 'species', 'author_email', 'date'])
+     new_df = pd.DataFrame(new_data, columns=presentation_data_schema).astype(presentation_data_schema)
      df = pd.concat([df, new_df], ignore_index=True)
      return df
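Editor's note: the new `new_df` line relies on passing a dict as `columns=`: pandas uses the dict's keys as column names, and the follow-up `.astype()` applies the dtypes. A tiny illustration of that behaviour, with a made-up two-column schema standing in for `presentation_data_schema`:

```python
import pandas as pd

# Stand-in schema; the real presentation_data_schema lives in download.py
schema = {"lat": "float64", "species": "object"}

new_data = [[46.2, "humpback_whale"], [60.1, "beluga"]]

# The dict's keys become the column names; .astype() then enforces the dtypes
new_df = pd.DataFrame(new_data, columns=schema).astype(schema)
print(new_df.dtypes)  # lat -> float64, species -> object
```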
src/home.py CHANGED
@@ -24,18 +24,58 @@ init_logging_session_states() # logging init should be early
  if "input_author_email" not in st.session_state:
      st.session_state.input_author_email = ""
 
- st.write("# Welcome to Cetacean Research Data Infrastructure! 🐬˚˖ 🐋✧˚.⋆")
+ st.write("""
+ # Welcome! 🐬˚✧˚.⋆🐋
 
- st.sidebar.success("Here are the pages.")
+ # Cetacean Conservation Community
+ """)
+
+ st.sidebar.success("Explore the pages: there are machine learning models, data requests, maps and more!")
+ st.sidebar.image(
+     "src/images/logo/sdsc-horizontal.png",
+     width=200
+ )
 
  st.markdown(
  """
- About: blablabla
+ ## 💙 Research Data Infrastructure
+
+ ˖°𓇼🌊⋆🐚🫧 This interface is a Proof of Concept of a Community-driven Research Data Infrastructure (RDI) for the Cetacean Conservation Community.
+ This PoC will happily be made into a production-ready RDI if the community is interested.
+
+ 👤 The intended users of this interface are the researchers and conservationists working on cetacean conservation.
+ In its current state, the interface is designed to be user-friendly, allowing users to upload images of cetaceans and receive species classification results.
+
+ 🤝 We value community contributions and encourage anyone interested to reach out on [the main repository's Github issues](https://github.com/sdsc-ordes/saving-willy/issues).
+
+ 🌍 The goal of this RDI is to explore community methods for sharing code and data.
+
+
+ ## 💻 Sharing Code
+
+ Through the platform of Hugging Face 🤗, machine learning models are published so they can be used for inference on this UI or by other users.
+ Currently, a demonstration model is available for cetacean species classification.
+ The model is based on the [HappyWhale](https://www.kaggle.com/competitions/happy-whale-and-dolphin) competition with the most recent weights.
+ Since part of the model was not made public, the classifier should not be used for inference and is purely demonstrative.
+
+ 🏆 Ideally, through new Kaggle challenges or ongoing development in research groups, new models can be brought to Hugging Face and onto the UI.
+
+
+ ## 💎 Sharing Data
+
+ The dataset is hosted on Hugging Face 🤗 as well, in order to share the metadata of the images which have been classified by the model.
+ Making the metadata public is the choice of the researcher, who can use the model for inference without making the image metadata public afterwards.
+ Of course, we encourage open data. Please note that the original images are never made public in the current-state RDI.
+
+ 💪 The RDI also explores how to share data after inference, with a simple data request page where researchers can filter the existing metadata from the Hugging Face dataset and then easily select the entries of interest to them.
+ Ideally, the Request button would either start a Discord channel discussion between the parties concerned by the data request, or generate an e-mail to the interested parties. This design is still under consideration.
+
  """
  )
 
 
 
+
  g_logger.info("App started.")
  g_logger.warning(f"[D] Streamlit version: {st.__version__}. Python version: {os.sys.version}")
 
src/images/logo/sdsc-horizontal.png ADDED

Git LFS Details

  • SHA256: a4a40e28f815045ff6251fbc937edf4423da7e36ad9b0418458f5e1eb767f6e2
  • Pointer size: 130 Bytes
  • Size of remote file: 37.4 kB
src/pages/1_🐋_about.py CHANGED
@@ -8,26 +8,39 @@ st.set_page_config(
  st.markdown(
  """
  # About
- We created this web app in a hackathon.
+ We created this web app in [a hackathon](https://sdsc-hackathons.ch/projectPage?projectRef=vUt8BfDJXaAs0UfOesXI|XyWLFpqjq3CX3zrM4uz8).
+
  This interface is a Proof of Concept of a Community-driven Research Data Infrastructure for the Cetacean Conservation Community.
 
- Please reach out for feedback, suggestions, or if you want to join the project.
+ Please reach out on [the project Github issues](https://github.com/sdsc-ordes/saving-willy/issues) for feedback, suggestions, or if you want to join the project.
 
  # Open Source Resources
 
- The space is hosted on Hugging Face.
- The code is available on Github.
- All model codes are open.
+ ## UI Code
+ - The [space is hosted on Hugging Face](https://huggingface.co/spaces/Saving-Willy/saving-willy-space).
+ - The [UI code is available on Github](https://github.com/sdsc-ordes/saving-willy).
+ - The [development space](https://huggingface.co/spaces/Saving-Willy/saving-willy-dev) is also hosted publicly on Hugging Face.
+
+ ## The Machine Learning Models
+ - The [model](https://huggingface.co/Saving-Willy/cetacean-classifier) is hosted on Hugging Face.
+ - The [original Kaggle model code](https://github.com/knshnb/kaggle-happywhale-1st-place) is open on Github as well.
+
+ ## The Data
+
+ (temporary setup, a more stable database is probably desired.)
+ - The dataset is hosted on Hugging Face.
+ - The [dataset syncing code](https://github.com/vancauwe/saving-willy-data-sync) is available on Github.
 
  # Credits and Thanks
 
- Developers:
- - Rob Mills
- - Laure Vancauwenberghe
+ ## Developers
+ - [Rob Mills](https://github.com/rmm-ch)
+ - [Laure Vancauwenberghe](https://github.com/vancauwe)
 
- Thanks to:
- - EDMAKTUB for their advice.
- - SDSC for the hackathon that started the project.
+ ## Special Thanks
+ - [EDMAKTUB](https://edmaktub.org) for their advice.
+ - [Swiss Data Science Center](https://www.datascience.ch) for [the hackathon that started the project](https://sdsc-hackathons.ch/projectPage?projectRef=vUt8BfDJXaAs0UfOesXI|XyWLFpqjq3CX3zrM4uz8).
+ - [HappyWhale](https://happywhale.com) for launching [the Kaggle challenge that led to model development](https://www.kaggle.com/competitions/happy-whale-and-dolphin).
 
  """
  )
src/pages/3_🤝_data requests.py CHANGED
@@ -5,7 +5,7 @@ st.set_page_config(
      page_icon="🤝",
  )
 
- from dataset.requests import data_prep, show_new_data_view
+ from dataset.data_requests import data_prep, show_new_data_view
 
  st.title("Data Requests")
  st.write("This page is ensure findability of data across the community.")