feat: add types and home page text
Files changed:
- README.md +1 -1
- requirements.txt +3 -3
- src/dataset/cleaner.py +3 -2
- src/dataset/{requests.py → data_requests.py} +4 -3
- src/dataset/download.py +2 -2
- src/dataset/fake_data.py +7 -31
- src/home.py +43 -3
- src/images/logo/sdsc-horizontal.png +3 -0
- src/pages/1_π_about.py +24 -11
- src/pages/3_π€_data requests.py +1 -1
README.md  CHANGED

@@ -28,7 +28,7 @@ pip install -r requirements.txt
 ```
 
 ```
-streamlit run src/
+streamlit run src/home.py
 ```
 
requirements.txt  CHANGED

@@ -13,6 +13,9 @@ datasets==3.0.2
 ## FSM
 transitions==0.9.2
 
+# data manipulation
+pandas==2.2.3
+
 # running ML models
 
 ## to use ML models hosted on HF
@@ -28,9 +31,6 @@ pillow==10.4.0
 opencv-python-headless==4.5.5.64
 albumentations==1.1.0
 
-# for states
-transitions==0.9.2
-
 # for env variables
 python-dotenv==1.1.0
 
src/dataset/cleaner.py  CHANGED

@@ -1,8 +1,9 @@
 import pandas as pd
 
-def clean_lat_long(df): # Ensure lat and lon are numeric, coerce errors to NaN
+def clean_lat_long(df) -> pd.DataFrame:
     """
     Clean latitude and longitude columns in the DataFrame.
+    Ensure lat and lon are numeric, coerce errors to NaN
     Args:
         df (pd.DataFrame): DataFrame containing latitude and longitude columns.
     Returns:
@@ -15,7 +16,7 @@ def clean_lat_long(df): # Ensure lat and lon are numeric, coerce errors to NaN
     df = df.dropna(subset=['lat', 'lon']).reset_index(drop=True)
     return df
 
-def clean_date(df): # Ensure lat and lon are numeric, coerce errors to NaN
+def clean_date(df) -> pd.DataFrame: # Ensure lat and lon are numeric, coerce errors to NaN
     """
     Clean date column in the DataFrame.
     Args:
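For context, a minimal sketch of what `clean_lat_long` plausibly does, based on its docstring and the surviving `dropna` line in the diff; the `pd.to_numeric` coercion step is an assumption, it is not visible in the commit:

```python
import pandas as pd

def clean_lat_long(df) -> pd.DataFrame:
    # Assumed step: coerce lat/lon to numeric, invalid entries become NaN
    df['lat'] = pd.to_numeric(df['lat'], errors='coerce')
    df['lon'] = pd.to_numeric(df['lon'], errors='coerce')
    # Line visible in the diff: drop rows where either coordinate could not be parsed
    df = df.dropna(subset=['lat', 'lon']).reset_index(drop=True)
    return df
```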
src/dataset/{requests.py → data_requests.py}  RENAMED

@@ -4,7 +4,7 @@ from dataset.cleaner import clean_lat_long, clean_date
 from dataset.download import get_dataset
 from dataset.fake_data import generate_fake_data
 
-def data_prep():
+def data_prep() -> pd.DataFrame:
     """
     Prepares the dataset for use in the application.
     Downloads the dataset and cleans the data (and generates fake data if needed).
@@ -18,7 +18,7 @@ def data_prep():
     df = clean_date(df)
     return df
 
-def filter_data(df):
+def filter_data(df) -> pd.DataFrame:
     """
     Filter the DataFrame based on user-selected ranges for latitude, longitude, and date.
     Args:
@@ -51,8 +51,9 @@ def show_specie_author(df):
         label = f"{row['author_email']} ({row['counts']})"
         st.session_state.checkbox_states[key] = st.checkbox(label, key=key)
 
-def show_new_data_view(df):
+def show_new_data_view(df) -> pd.DataFrame:
     """
+    Show the new filtered data view on the UI.
     Filter the dataframe based on the state of the localisation sliders and selected timeframe by the user.
     Then, show the results of the filtering grouped by species then by authors.
     Authors are matched to a checkbox component so the user can click it if he/she/they wish to request data from this author.
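As an illustration of the filtering described in the docstring, here is a hedged sketch of how `filter_data` might apply the user-selected ranges; the session-state keys (`lat_range`, `lon_range`, `date_range`) and column names are hypothetical, only the intent is taken from the diff:

```python
import streamlit as st
import pandas as pd

def filter_data(df) -> pd.DataFrame:
    # Hypothetical session-state keys; the real widget names are not shown in the commit
    lat_min, lat_max = st.session_state["lat_range"]
    lon_min, lon_max = st.session_state["lon_range"]
    start, end = st.session_state["date_range"]

    # Keep only observations inside the selected lat/lon box and timeframe
    mask = (
        df["lat"].between(lat_min, lat_max)
        & df["lon"].between(lon_min, lon_max)
        & df["date"].between(pd.Timestamp(start), pd.Timestamp(end))
    )
    return df[mask].reset_index(drop=True)
```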
src/dataset/download.py  CHANGED

@@ -3,7 +3,7 @@ import time
 import logging
 import pandas as pd
 from datasets import load_dataset
-from datasets import DatasetDict
+from datasets import DatasetDict
 
 ############################################################
 # the dataset of observations (hf dataset in our space)
@@ -62,7 +62,7 @@ def try_download_dataset(dataset_id:str, data_files:str) -> dict:
     #st.write(msg)
     return metadata
 
-def get_dataset():
+def get_dataset() -> pd.DataFrame:
     """
     Downloads the dataset from Hugging Face and prepares it for use.
     If the dataset is not available, it creates an empty DataFrame with the specified schema.
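A rough sketch of the download-with-fallback behaviour that the `get_dataset` docstring describes; the dataset id, split, and the shape of `presentation_data_schema` below are assumptions for illustration, not lines taken from the commit:

```python
import pandas as pd
from datasets import load_dataset

# Hypothetical values; the real id and schema live elsewhere in download.py
DATASET_ID = "Saving-Willy/observations"
presentation_data_schema = {"lat": float, "lon": float, "species": str,
                            "author_email": str, "date": str}

def get_dataset() -> pd.DataFrame:
    try:
        # Pull the observation metadata hosted on Hugging Face
        ds = load_dataset(DATASET_ID, split="train")
        df = ds.to_pandas()
    except Exception:
        # Fall back to an empty DataFrame with the expected columns
        df = pd.DataFrame(columns=list(presentation_data_schema)).astype(presentation_data_schema)
    return df
```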
src/dataset/fake_data.py  CHANGED

@@ -1,11 +1,14 @@
 import pandas as pd
-import numpy as np
 import random
 from datetime import datetime, timedelta
 
-def generate_fake_data(df, num_fake):
+from download import presentation_data_schema
+from whale_viewer import WHALE_CLASSES
+
+def generate_fake_data(df, num_fake) -> pd.DataFrame:
     """
     Generate fake data for the dataset.
+
     Args:
         df (pd.DataFrame): Original DataFrame to append fake data to.
         num_fake (int): Number of fake observations to generate.
@@ -14,34 +17,7 @@ def generate_fake_data(df, num_fake):
     """
 
     # Options for random generation
-    species_options = [
-        "beluga",
-        "blue_whale",
-        "bottlenose_dolphin",
-        "brydes_whale",
-        "commersons_dolphin",
-        "common_dolphin",
-        "cuviers_beaked_whale",
-        "dusky_dolphin",
-        "false_killer_whale",
-        "fin_whale",
-        "frasiers_dolphin",
-        "gray_whale",
-        "humpback_whale",
-        "killer_whale",
-        "long_finned_pilot_whale",
-        "melon_headed_whale",
-        "minke_whale",
-        "pantropic_spotted_dolphin",
-        "pygmy_killer_whale",
-        "rough_toothed_dolphin",
-        "sei_whale",
-        "short_finned_pilot_whale",
-        "southern_right_whale",
-        "spinner_dolphin",
-        "spotted_dolphin",
-        "white_sided_dolphin",
-    ]
+    species_options = WHALE_CLASSES
     email_options = [
@@ -67,6 +43,6 @@ def generate_fake_data(df, num_fake):
         date = random_date()
         new_data.append([lat, lon, species, email, date])
 
-    new_df = pd.DataFrame(new_data, columns=
+    new_df = pd.DataFrame(new_data, columns=presentation_data_schema).astype(presentation_data_schema)
     df = pd.concat([df, new_df], ignore_index=True)
     return df
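To make the generation step concrete, a hedged sketch of the loop around the surviving lines (`random_date()`, the `new_data.append(...)`, and the final DataFrame construction); the coordinate ranges and the 365-day window are illustrative assumptions, and `generate_rows` is a hypothetical helper name:

```python
import random
from datetime import datetime, timedelta

def random_date() -> datetime:
    # Assumed helper: a random timestamp within roughly the last year
    return datetime.now() - timedelta(days=random.uniform(0, 365))

def generate_rows(num_fake, species_options, email_options):
    new_data = []
    for _ in range(num_fake):
        lat = random.uniform(-90, 90)       # illustrative range
        lon = random.uniform(-180, 180)     # illustrative range
        species = random.choice(species_options)
        email = random.choice(email_options)
        date = random_date()
        new_data.append([lat, lon, species, email, date])
    return new_data
```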
src/home.py  CHANGED

@@ -24,18 +24,58 @@ init_logging_session_states() # logging init should be early
 if "input_author_email" not in st.session_state:
     st.session_state.input_author_email = ""
 
-st.write("
-
+st.write("""
+# Welcome ! π¬Λβ§Λ.βπ
+
+# Cetacean Conservation Community
+""")
+
+st.sidebar.success("Explore the pages: there are machine learning models, data requests, maps and more !")
+st.sidebar.image(
+    "src/images/logo/sdsc-horizontal.png",
+    width=200
+)
 
 st.markdown(
     """
-
+    ## π Research Data Infrastructure
+
+    ΛΒ°πΌπβππ«§ This interface is a Proof of Concept of a Community-driven Research Data Infrastructure (RDI) for the Cetacean Conservation Community.
+    This PoC will happily be made into a production-ready RDI if the community is interested.
+
+    π€ The intended users of this interface are the researchers and conservationists working on cetacean conservation.
+    In its current state, the interface is designed to be user-friendly, allowing users to upload images of cetaceans and receive species classification results.
+
+    π€ We value community contributions and encourage anyone interested to reach out on [the main repository's Github issues](https://github.com/sdsc-ordes/saving-willy/issues).
+
+    π The goal of this RDI is to explore community methods for sharing code and data.
+
+    ## π» Sharing Code
+
+    Through the platform of Hugging Face π€, machine learning models are published so they can be used for inference on this UI or by other users.
+    Currently, a demonstration model is available for cetacean species classification.
+    The model is based on the [HappyWhale](https://www.kaggle.com/competitions/happy-whale-and-dolphin) competition with the most recent weights.
+    Since part of the model was not made public, the classifier should not be used for inference and is purely demonstrative.
+
+    π Ideally, through new Kaggle challenges or ongoing development in research groups, new models can be brought to Hugging Face and onto the UI.
+
+    ## π Sharing Data
+
+    The dataset is hosted on Hugging Face π€ as well, in order to share the metadata of the images which have been classified by the model.
+    Making the metadata public is the choice of the researcher, who can choose to use the model for inference without making the image metadata public afterwards.
+    Of course, we encourage open data. Please note that the original images are never made public in the current-state RDI.
+
+    πͺ The RDI also explores how to share data after inference, with a simple data request page where researchers can filter the existing metadata from the Hugging Face dataset, and then easily select those of interest for them.
+    Ideally, the Request button would either start a Discord channel discussion between concerned parties of the data request, or generate an e-mail with interested parties. This design is still under conception.
+
     """
 )
 
 
+
 g_logger.info("App started.")
 g_logger.warning(f"[D] Streamlit version: {st.__version__}. Python version: {os.sys.version}")
 
src/images/logo/sdsc-horizontal.png  ADDED

(binary image, stored with Git LFS)
src/pages/1_π_about.py  CHANGED

@@ -8,26 +8,39 @@ st.set_page_config(
 st.markdown(
     """
     # About
-    We created this web app in a hackathon.
+    We created this web app in [a hackathon](https://sdsc-hackathons.ch/projectPage?projectRef=vUt8BfDJXaAs0UfOesXI|XyWLFpqjq3CX3zrM4uz8).
+
     This interface is a Proof of Concept of a Community-driven Research Data Infrastructure for the Cetacean Conservation Community.
 
-    Please reach out for feedback, suggestions, or if you want to join the project.
+    Please reach out on [the project Github issues](https://github.com/sdsc-ordes/saving-willy/issues) for feedback, suggestions, or if you want to join the project.
 
     # Open Source Resources
 
-
-    The
-
+    ## UI Code
+    - The [space is hosted on Hugging Face](https://huggingface.co/spaces/Saving-Willy/saving-willy-space).
+    - The [UI code is available on Github](https://github.com/sdsc-ordes/saving-willy).
+    - The [development space](https://huggingface.co/spaces/Saving-Willy/saving-willy-dev) is also hosted publicly on Hugging Face.
+
+    ## The Machine Learning Models
+    - The [model](https://huggingface.co/Saving-Willy/cetacean-classifier) is hosted on Hugging Face.
+    - The [original Kaggle model code](https://github.com/knshnb/kaggle-happywhale-1st-place) is open on Github as well.
+
+    ## The Data
+
+    (temporary setup, a more stable database is probably desired.)
+    - The dataset is hosted on Hugging Face.
+    - The [dataset syncing code](https://github.com/vancauwe/saving-willy-data-sync) is available on Github.
 
     # Credits and Thanks
 
-    Developers
-    - Rob Mills
-    - Laure Vancauwenberghe
+    ## Developers
+    - [Rob Mills](https://github.com/rmm-ch)
+    - [Laure Vancauwenberghe](https://github.com/vancauwe)
 
-
-    - EDMAKTUB for their advice.
-    -
+    ## Special Thanks
+    - [EDMAKTUB](https://edmaktub.org) for their advice.
+    - [Swiss Data Science Center](https://www.datascience.ch) for [the hackathon that started the project](https://sdsc-hackathons.ch/projectPage?projectRef=vUt8BfDJXaAs0UfOesXI|XyWLFpqjq3CX3zrM4uz8).
+    - [HappyWhale](https://happywhale.com) for launching [the Kaggle challenge that led to model development](https://www.kaggle.com/competitions/happy-whale-and-dolphin).
 
     """
 )
src/pages/3_π€_data requests.py  CHANGED

@@ -5,7 +5,7 @@ st.set_page_config(
     page_icon="π€",
 )
 
-from dataset.
+from dataset.data_requests import data_prep, show_new_data_view
 
 st.title("Data Requests")
 st.write("This page is ensure findability of data across the community.")
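For orientation, a hedged sketch of how the rest of this page presumably wires the two imported functions together; the lines below the diff context are not part of the commit, so this wiring is an assumption:

```python
import streamlit as st
from dataset.data_requests import data_prep, show_new_data_view

st.title("Data Requests")

# Assumed wiring: prepare the observation metadata, then render the
# filterable, checkbox-based view described in show_new_data_view's docstring
df = data_prep()
show_new_data_view(df)
```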