vancauwe committed
Commit de2a82e · 1 Parent(s): 14b60f7

feat: add types and home page text

README.md CHANGED
@@ -28,7 +28,7 @@ pip install -r requirements.txt
  ```
 
  ```
- streamlit run src/main.py
+ streamlit run src/home.py
  ```
 
 
requirements.txt CHANGED
@@ -13,6 +13,9 @@ datasets==3.0.2
  ## FSM
  transitions==0.9.2
 
+ # data manipulation
+ pandas==2.2.3
+
  # running ML models
 
  ## to use ML models hosted on HF
@@ -28,9 +31,6 @@ pillow==10.4.0
  opencv-python-headless==4.5.5.64
  albumentations==1.1.0
 
- # for states
- transitions==0.9.2
-
  # for env variables
  python-dotenv==1.1.0
 
src/dataset/cleaner.py CHANGED
@@ -1,8 +1,9 @@
  import pandas as pd
 
- def clean_lat_long(df): # Ensure lat and lon are numeric, coerce errors to NaN
+ def clean_lat_long(df) -> pd.DataFrame:
      """
      Clean latitude and longitude columns in the DataFrame.
+     Ensure lat and lon are numeric, coerce errors to NaN
      Args:
          df (pd.DataFrame): DataFrame containing latitude and longitude columns.
      Returns:
@@ -15,7 +16,7 @@ def clean_lat_long(df): # Ensure lat and lon are numeric, coerce errors to NaN
      df = df.dropna(subset=['lat', 'lon']).reset_index(drop=True)
      return df
 
- def clean_date(df): # Ensure lat and lon are numeric, coerce errors to NaN
+ def clean_date(df) -> pd.DataFrame: # Ensure lat and lon are numeric, coerce errors to NaN
      """
      Clean date column in the DataFrame.
      Args:
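Editor's note: the hunk above only shows the signature, docstring, and the final `dropna` line of `clean_lat_long`. A minimal sketch of the cleaning step it describes, assuming the coercion is done with `pd.to_numeric` (the actual implementation is not visible in this diff):

```python
import pandas as pd

def clean_lat_long(df) -> pd.DataFrame:
    # Assumption: coerce lat/lon to numeric; invalid entries become NaN
    df['lat'] = pd.to_numeric(df['lat'], errors='coerce')
    df['lon'] = pd.to_numeric(df['lon'], errors='coerce')
    # Shown verbatim in the hunk: drop rows without valid coordinates
    df = df.dropna(subset=['lat', 'lon']).reset_index(drop=True)
    return df
```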
src/dataset/{requests.py → data_requests.py} RENAMED
@@ -4,7 +4,7 @@ from dataset.cleaner import clean_lat_long, clean_date
  from dataset.download import get_dataset
  from dataset.fake_data import generate_fake_data
 
- def data_prep():
+ def data_prep() -> pd.DataFrame:
      """
      Prepares the dataset for use in the application.
      Downloads the dataset and cleans the data (and generates fake data if needed).
@@ -18,7 +18,7 @@ def data_prep():
      df = clean_date(df)
      return df
 
- def filter_data(df):
+ def filter_data(df) -> pd.DataFrame:
      """
      Filter the DataFrame based on user-selected ranges for latitude, longitude, and date.
      Args:
@@ -51,8 +51,9 @@ def show_specie_author(df):
      label = f"{row['author_email']} ({row['counts']})"
      st.session_state.checkbox_states[key] = st.checkbox(label, key=key)
 
- def show_new_data_view(df):
+ def show_new_data_view(df) -> pd.DataFrame:
      """
+     Show the new filtered data view on the UI.
      Filter the dataframe based on the state of the localisation sliders and selected timeframe by the user.
      Then, show the results of the filtering grouped by species then by authors.
      Authors are matched to a checkbox component so the user can click it if he/she/they wish to request data from this author.
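Editor's note: the docstrings above describe `filter_data` as filtering on user-selected latitude, longitude, and date ranges. A rough sketch of that idea, assuming hypothetical `st.session_state` keys (none of these key names appear in the diff):

```python
import pandas as pd
import streamlit as st

def filter_data(df) -> pd.DataFrame:
    # Hypothetical session-state keys for the sliders and the date selection
    lat_min, lat_max = st.session_state.get("lat_range", (-90.0, 90.0))
    lon_min, lon_max = st.session_state.get("lon_range", (-180.0, 180.0))
    date_start, date_end = st.session_state.get("date_range", (df["date"].min(), df["date"].max()))

    # Keep only rows inside all three user-selected ranges
    mask = (
        df["lat"].between(lat_min, lat_max)
        & df["lon"].between(lon_min, lon_max)
        & df["date"].between(date_start, date_end)
    )
    return df[mask].reset_index(drop=True)
```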
src/dataset/download.py CHANGED
@@ -3,7 +3,7 @@ import time
  import logging
  import pandas as pd
  from datasets import load_dataset
- from datasets import DatasetDict, Dataset
+ from datasets import DatasetDict
 
  ############################################################
  # the dataset of observations (hf dataset in our space)
@@ -62,7 +62,7 @@ def try_download_dataset(dataset_id:str, data_files:str) -> dict:
      #st.write(msg)
      return metadata
 
- def get_dataset():
+ def get_dataset() -> pd.DataFrame:
      """
      Downloads the dataset from Hugging Face and prepares it for use.
      If the dataset is not available, it creates an empty DataFrame with the specified schema.
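Editor's note: `get_dataset` now advertises a `pd.DataFrame` return, and its docstring says it falls back to an empty DataFrame with the project schema when the dataset is unavailable. A sketch of that pattern, assuming `presentation_data_schema` is a column-to-dtype dict (the column names mirror the old hard-coded list in `fake_data.py`; the dtypes and the dataset id are placeholders, not taken from this diff):

```python
import pandas as pd
from datasets import load_dataset

# Assumed schema; fake_data.py imports `presentation_data_schema` from this module
presentation_data_schema = {
    "lat": "float64",
    "lon": "float64",
    "species": "object",
    "author_email": "object",
    "date": "object",
}

def get_dataset() -> pd.DataFrame:
    try:
        # Placeholder dataset id; the real id is configured elsewhere in download.py
        ds = load_dataset("some-org/observations", split="train")
        return ds.to_pandas()
    except Exception:
        # Fall back to an empty, correctly typed DataFrame
        return pd.DataFrame(columns=list(presentation_data_schema)).astype(presentation_data_schema)
```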
src/dataset/fake_data.py CHANGED
@@ -1,11 +1,14 @@
  import pandas as pd
- import numpy as np
  import random
  from datetime import datetime, timedelta
 
- def generate_fake_data(df, num_fake):
+ from download import presentation_data_schema
+ from whale_viewer import WHALE_CLASSES
+
+ def generate_fake_data(df, num_fake) -> pd.DataFrame:
      """
      Generate fake data for the dataset.
+
      Args:
          df (pd.DataFrame): Original DataFrame to append fake data to.
          num_fake (int): Number of fake observations to generate.
@@ -14,34 +17,7 @@ def generate_fake_data(df, num_fake):
      """
 
      # Options for random generation
-     species_options = [
-         "beluga",
-         "blue_whale",
-         "bottlenose_dolphin",
-         "brydes_whale",
-         "commersons_dolphin",
-         "common_dolphin",
-         "cuviers_beaked_whale",
-         "dusky_dolphin",
-         "false_killer_whale",
-         "fin_whale",
-         "frasiers_dolphin",
-         "gray_whale",
-         "humpback_whale",
-         "killer_whale",
-         "long_finned_pilot_whale",
-         "melon_headed_whale",
-         "minke_whale",
-         "pantropic_spotted_dolphin",
-         "pygmy_killer_whale",
-         "rough_toothed_dolphin",
-         "sei_whale",
-         "short_finned_pilot_whale",
-         "southern_right_whale",
-         "spinner_dolphin",
-         "spotted_dolphin",
-         "white_sided_dolphin",
-     ]
+     species_options = WHALE_CLASSES
      email_options = [
@@ -67,6 +43,6 @@ def generate_fake_data(df, num_fake):
      date = random_date()
      new_data.append([lat, lon, species, email, date])
 
-     new_df = pd.DataFrame(new_data, columns=['lat', 'lon', 'species', 'author_email', 'date'])
+     new_df = pd.DataFrame(new_data, columns=presentation_data_schema).astype(presentation_data_schema)
      df = pd.concat([df, new_df], ignore_index=True)
      return df
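Editor's note: the new `new_df` line relies on passing a dict as `columns=`: pandas uses the dict's keys as column names, and the follow-up `.astype()` applies the dtypes. A tiny illustration of that behaviour, with a made-up two-column schema standing in for `presentation_data_schema`:

```python
import pandas as pd

# Stand-in schema; the real presentation_data_schema lives in download.py
schema = {"lat": "float64", "species": "object"}

new_data = [[46.2, "humpback_whale"], [60.1, "beluga"]]

# The dict's keys become the column names; .astype() then enforces the dtypes
new_df = pd.DataFrame(new_data, columns=schema).astype(schema)
print(new_df.dtypes)  # lat -> float64, species -> object
```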
src/home.py CHANGED
@@ -24,18 +24,58 @@ init_logging_session_states() # logging init should be early
  if "input_author_email" not in st.session_state:
      st.session_state.input_author_email = ""
 
- st.write("# Welcome to Cetacean Research Data Infrastructure! 🐬˚˖ 🐋✧˚.⋆")
+ st.write("""
+ # Welcome! 🐬˚✧˚.⋆🐋
 
- st.sidebar.success("Here are the pages.")
+ # Cetacean Conservation Community
+ """)
+
+ st.sidebar.success("Explore the pages: there are machine learning models, data requests, maps and more!")
+ st.sidebar.image(
+     "src/images/logo/sdsc-horizontal.png",
+     width=200
+ )
 
  st.markdown(
  """
- About: blablabla
+ ## 💙 Research Data Infrastructure
+
+ ˖°𓇼🌊⋆🐚🫧 This interface is a Proof of Concept of a Community-driven Research Data Infrastructure (RDI) for the Cetacean Conservation Community.
+ This PoC will happily be made into a production-ready RDI if the community is interested.
+
+ 👤 The intended users of this interface are the researchers and conservationists working on cetacean conservation.
+ In its current state, the interface is designed to be user-friendly, allowing users to upload images of cetaceans and receive species classification results.
+
+ 🤝 We value community contributions and encourage anyone interested to reach out on [the main repository's Github issues](https://github.com/sdsc-ordes/saving-willy/issues).
+
+ 🌍 The goal of this RDI is to explore community methods for sharing code and data.
+
+
+ ## 💻 Sharing Code
+
+ Through the platform of Hugging Face 🤗, machine learning models are published so they can be used for inference on this UI or by other users.
+ Currently, a demonstration model is available for cetacean species classification.
+ The model is based on the [HappyWhale](https://www.kaggle.com/competitions/happy-whale-and-dolphin) competition with the most recent weights.
+ Since part of the model was not made public, the classifier should not be used for inference and is purely demonstrative.
+
+ 🏆 Ideally, through new Kaggle challenges or ongoing development in research groups, new models can be brought to Hugging Face and onto the UI.
+
+
+ ## 💎 Sharing Data
+
+ The dataset is hosted on Hugging Face 🤗 as well, in order to share the metadata of the images which have been classified by the model.
+ Making the metadata public is the choice of the researcher, who can use the model for inference without making the image metadata public afterwards.
+ Of course, we encourage open data. Please note that the original images are never made public in the current-state RDI.
+
+ 💪 The RDI also explores how to share data after inference, with a simple data request page where researchers can filter the existing metadata from the Hugging Face dataset and then easily select the entries of interest to them.
+ Ideally, the Request button would either start a Discord channel discussion between the parties concerned by the data request, or generate an e-mail to the interested parties. This design is still under consideration.
+
  """
  )
 
 
 
+
  g_logger.info("App started.")
  g_logger.warning(f"[D] Streamlit version: {st.__version__}. Python version: {os.sys.version}")
 
src/images/logo/sdsc-horizontal.png ADDED

Git LFS Details

  • SHA256: a4a40e28f815045ff6251fbc937edf4423da7e36ad9b0418458f5e1eb767f6e2
  • Pointer size: 130 Bytes
  • Size of remote file: 37.4 kB
src/pages/1_🐋_about.py CHANGED
@@ -8,26 +8,39 @@ st.set_page_config(
  st.markdown(
  """
  # About
- We created this web app in a hackathon.
+ We created this web app in [a hackathon](https://sdsc-hackathons.ch/projectPage?projectRef=vUt8BfDJXaAs0UfOesXI|XyWLFpqjq3CX3zrM4uz8).
+
  This interface is a Proof of Concept of a Community-driven Research Data Infrastructure for the Cetacean Conservation Community.
 
- Please reach out for feedback, suggestions, or if you want to join the project.
+ Please reach out on [the project Github issues](https://github.com/sdsc-ordes/saving-willy/issues) for feedback, suggestions, or if you want to join the project.
 
  # Open Source Resources
 
- The space is hosted on Hugging Face.
- The code is available on Github.
- All model codes are open.
+ ## UI Code
+ - The [space is hosted on Hugging Face](https://huggingface.co/spaces/Saving-Willy/saving-willy-space).
+ - The [UI code is available on Github](https://github.com/sdsc-ordes/saving-willy).
+ - The [development space](https://huggingface.co/spaces/Saving-Willy/saving-willy-dev) is also hosted publicly on Hugging Face.
+
+ ## The Machine Learning Models
+ - The [model](https://huggingface.co/Saving-Willy/cetacean-classifier) is hosted on Hugging Face.
+ - The [original Kaggle model code](https://github.com/knshnb/kaggle-happywhale-1st-place) is open on Github as well.
+
+ ## The Data
+
+ (temporary setup, a more stable database is probably desired.)
+ - The dataset is hosted on Hugging Face.
+ - The [dataset syncing code](https://github.com/vancauwe/saving-willy-data-sync) is available on Github.
 
  # Credits and Thanks
 
- Developers:
- - Rob Mills
- - Laure Vancauwenberghe
+ ## Developers
+ - [Rob Mills](https://github.com/rmm-ch)
+ - [Laure Vancauwenberghe](https://github.com/vancauwe)
 
- Thanks to:
- - EDMAKTUB for their advice.
- - SDSC for the hackathon that started the project.
+ ## Special Thanks
+ - [EDMAKTUB](https://edmaktub.org) for their advice.
+ - [Swiss Data Science Center](https://www.datascience.ch) for [the hackathon that started the project](https://sdsc-hackathons.ch/projectPage?projectRef=vUt8BfDJXaAs0UfOesXI|XyWLFpqjq3CX3zrM4uz8).
+ - [HappyWhale](https://happywhale.com) for launching [the Kaggle challenge that led to model development](https://www.kaggle.com/competitions/happy-whale-and-dolphin).
 
  """
  )
src/pages/3_🤝_data requests.py CHANGED
@@ -5,7 +5,7 @@ st.set_page_config(
      page_icon="🤝",
  )
 
- from dataset.requests import data_prep, show_new_data_view
+ from dataset.data_requests import data_prep, show_new_data_view
 
  st.title("Data Requests")
  st.write("This page is ensure findability of data across the community.")