adityajain07 committed
Commit d6c6696 · verified · 1 Parent(s): 1655813

Upload folder using huggingface_hub

README.md CHANGED
@@ -1,12 +1,28 @@
  ---
- title: Mila Global Moth Classifier
- emoji: 👁
- colorFrom: green
- colorTo: green
  sdk: gradio
  sdk_version: 4.42.0
- app_file: app.py
- pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ title: Mila_Global_Moth_Classifier
+ app_file: gradio_demo.py
  sdk: gradio
  sdk_version: 4.42.0
  ---
+ # Global Moth Model
+ Research related to the development of a global moth species classification model for automated moth monitoring.
+
+ ## Process
+ The steps below are carried out to train a global model.
+
+ ### Checklist preparation
+ 1. **Fetch Leps Checklist**: Download the Lepidoptera taxonomy from GBIF ([DOI](https://www.gbif.org/occurrence/download/)).
+ 2. **Fetch DwC-A**: Fetch the Darwin Core Archive from GBIF for the order Lepidoptera ([DOI](https://doi.org/10.15468/dl.6j5bzj)).
+ 3. **Curate Moth Checklist** (`prepare_gbif_checklist.py`): Clean and curate the Lepidoptera checklist so that it contains only moth species, removing all non-species taxa and the butterfly families; a condensed sketch follows this list. A curated list is [here](https://docs.google.com/spreadsheets/d/1E6Zn2hXbHGMMAiPhtDXFO9_hDtl68lG5fx2vg0jyBvg/edit?usp=sharing).
+
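+ The sketch below condenses the curation logic in `prepare_gbif_checklist.py`; the file names here are placeholders:
+
+ ```python
+ import pandas as pd
+
+ # Butterfly families to exclude (all remaining Lepidoptera are treated as moths)
+ BUTTERFLY_FAMILIES = [
+     "Hesperiidae", "Lycaenidae", "Nymphalidae", "Papilionidae",
+     "Pieridae", "Riodinidae", "Hedylidae",
+ ]
+
+ checklist = pd.read_csv("gbif_leps_checklist.csv")  # placeholder path
+ checklist = checklist.loc[checklist["taxonRank"] == "SPECIES"]  # drop non-species taxa
+ checklist = checklist.loc[~checklist["family"].isin(BUTTERFLY_FAMILIES)]
+ checklist.to_csv("gbif_moth_checklist.csv", index=False)
+ ```
+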
+ ### Dataset download and curation
+ The next steps, to download and curate the data, follow the dataset tools documented [here](https://github.com/RolnickLab/ami-ml/tree/main/src/dataset_tools).
+
+ 1. **Fetch GBIF images**: Download the images from GBIF using the command `ami-dataset fetch-images`. An example slurm script with the argument options is provided (`job_fetch_images.sh`). Loading the DwC-A file requires about 300GB of RAM; there are likely smarter ways to load the archive in multiple smaller chunks of memory, but we haven't explored them ourselves (one possible approach is sketched after this list).
+ 2. **Verify images**: Verify the downloaded images for corruption (`job_verify_images.sh`).
+ 3. **Delete corrupted images**: Remove images that failed verification (`job_delete_images.sh`).
+ 4. **Lifestage prediction**: Run the lifestage prediction model on images without a lifestage tag, in order to remove non-adult moth images from the dataset (`job_predict_lifestage.sh`).
+ 5. **Final clean dataset**: Create the final list of images retained after image verification and lifestage prediction (`job_clean_dataset.sh`).
+ 6. **Dataset splits**: Create dataset splits for model training (`job_split_dataset.sh`).
+
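+ One way to avoid loading the whole occurrence table at once would be to stream it in chunks with pandas. This is an untested sketch, assuming the archive's `occurrence.txt` has been extracted and is tab-separated:
+
+ ```python
+ import pandas as pd
+
+ # Stream the DwC-A occurrence table in fixed-size chunks instead of all at once
+ for chunk in pd.read_csv("occurrence.txt", sep="\t", chunksize=1_000_000):
+     species_rows = chunk[chunk["taxonRank"] == "SPECIES"]
+     # ... collect image URLs / taxon keys for this chunk ...
+ ```
+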
+ ### Model training
__pycache__/gradio_demo.cpython-311.pyc ADDED
Binary file (1.74 kB)
 
__pycache__/model_inference.cpython-311.pyc ADDED
Binary file (7.93 kB)
 
analyze_checklist.ipynb ADDED
@@ -0,0 +1,155 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "True"
+       ]
+      },
+      "execution_count": 1,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "# System packages\n",
+     "import sys\n",
+     "import os\n",
+     "\n",
+     "# 3rd party packages\n",
+     "import pandas as pd\n",
+     "import dotenv\n",
+     "import json\n",
+     "\n",
+     "# Our main package (coming soon!)\n",
+     "# import ami_ml\n",
+     "\n",
+     "# Local development packages not yet in the main package\n",
+     "sys.path.append(\"./\")\n",
+     "\n",
+     "# Auto reload your development packages\n",
+     "%load_ext autoreload\n",
+     "%autoreload 2\n",
+     "\n",
+     "# Load secrets and config from optional .env file\n",
+     "dotenv.load_dotenv()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "No. of accepted moth species: 46983\n",
+       "No. of unique genera: 9413\n",
+       "No. of unique families: 124\n"
+      ]
+     }
+    ],
+    "source": [
+     "# Read the global moth checklist\n",
+     "moth_checklist_df = pd.read_csv(os.getenv(\"GLOBAL_MOTH_CHECKLIST\"))\n",
+     "\n",
+     "# Get statistics regarding accepted moth species\n",
+     "accepted_moths = moth_checklist_df[moth_checklist_df[\"taxonomicStatus\"] == \"ACCEPTED\"]\n",
+     "num_genus = set(accepted_moths[\"genus\"])\n",
+     "num_family = set(accepted_moths[\"family\"])\n",
+     "print(f\"No. of accepted moth species: {accepted_moths.shape[0]}\")\n",
+     "print(f\"No. of unique genera: {len(num_genus)}\")\n",
+     "print(f\"No. of unique families: {len(num_family)}\")\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 29,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Save the accepted taxon keys to json file\n",
+     "unique_accepted_keys = list(accepted_moths[\"acceptedTaxonKey\"])\n",
+     "file_path = os.getenv(\"ACCEPTED_KEY_LIST\")\n",
+     "with open(file_path, \"w\") as file:\n",
+     "    json.dump(unique_accepted_keys, file)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 30,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Test the json file read\n",
+     "with open(os.getenv(\"ACCEPTED_KEY_LIST\")) as f:\n",
+     "    keys_list = json.load(f)\n",
+     "    keys_list = [int(x) for x in keys_list]"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "Calculate the total occurrences for all accepted taxon keys, with a cap of 1000"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 8,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "The total occurrences with a cap of thousand images is 3898528.\n"
+      ]
+     }
+    ],
+    "source": [
+     "num_occ = list(accepted_moths[\"numberOfOccurrences\"])\n",
+     "num_occ_limit = []\n",
+     "for count in num_occ:\n",
+     "    if count <= 1000: num_occ_limit.append(count)\n",
+     "    else: num_occ_limit.append(1000)\n",
+     "\n",
+     "print(f\"The total occurrences with a cap of thousand images is {sum(num_occ_limit)}.\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.11.9"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
gbif_data_stats.sh ADDED
@@ -0,0 +1,16 @@
+ #!/bin/bash
+
+ # Load absolute data paths
+ set -o allexport
+ source .env
+ set +o allexport
+
+ # Calculate data statistics
+ datasets_count=$(ls $GLOBAL_MODEL_DATASET_PATH | wc -l)
+ num_images=$(find $GLOBAL_MODEL_DATASET_PATH -type f | wc -l)
+ dataset_size=$(du -sh $GLOBAL_MODEL_DATASET_PATH)
+
+ # Print statistics
+ echo "Number of dataset sources: $datasets_count"
+ echo "Number of images: $num_images"
+ echo "Dataset size: $dataset_size"
gradio_demo.py ADDED
@@ -0,0 +1,40 @@
+ import os
+
+ import gradio as gr
+ import PIL
+ import torch
+ from dotenv import load_dotenv
+ from model_inference import ModelInference
+
+ # Load secrets and config from optional .env file
+ load_dotenv()
+ GLOBAL_MODEL = os.getenv("GLOBAL_MODEL")
+ CATEGORY_MAP = os.getenv("CATEGORY_MAP_JSON")
+ CATEG_TO_NAME_MAP = os.getenv("CATEG_TO_NAME_MAP")
+
+
+ # Model prediction function
+ def predict_species(image: PIL.Image.Image) -> dict[str, float]:
+     """Moth species prediction"""
+
+     # Build the model class
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     fgrained_classifier = ModelInference(
+         GLOBAL_MODEL, "timm_resnet50", CATEGORY_MAP, CATEG_TO_NAME_MAP, device, topk=5
+     )
+
+     # Predict on image
+     sp_pred = fgrained_classifier.predict(image)
+
+     return sp_pred
+
+
+ demo = gr.Interface(
+     fn=predict_species,
+     inputs=gr.Image(type="pil"),
+     outputs=gr.Label(),
+     title="Mila Global Moth Species Classifier",
+ )
+
+ if __name__ == "__main__":
+     demo.launch(share=True)
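Note that `predict_species` constructs `ModelInference` (and therefore reloads the model weights) on every request. If startup cost matters, a cached variant along the following lines may help; this is a sketch, not part of the committed code:

```python
from functools import lru_cache

@lru_cache(maxsize=1)
def get_classifier() -> ModelInference:
    """Build the classifier once and reuse it across requests."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return ModelInference(
        GLOBAL_MODEL, "timm_resnet50", CATEGORY_MAP, CATEG_TO_NAME_MAP, device, topk=5
    )

def predict_species_cached(image: PIL.Image.Image) -> dict[str, float]:
    return get_classifier().predict(image)
```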
job_clean_dataset.sh ADDED
@@ -0,0 +1,31 @@
+ #!/bin/bash
+ #SBATCH --job-name=clean_dataset
+ #SBATCH --ntasks=1
+ #SBATCH --time=3:00:00
+ #SBATCH --partition=long-cpu   # Ask for long-cpu job
+ #SBATCH --cpus-per-task=2      # Ask for 2 CPUs
+ #SBATCH --mem=300G             # Ask for 300 GB of RAM
+ #SBATCH --output=clean_dataset_%j.out
+
+ # 1. Load the required modules
+ module load miniconda/3
+
+ # 2. Load your environment
+ conda activate ami-ml
+
+ # 3. Load the environment variables outside of python script
+ set -o allexport
+ source .env
+ set +o allexport
+
+ # Keep track of time
+ SECONDS=0
+
+ # 4. Launch your script
+ ami-dataset clean-dataset \
+     --dwca-file $DWCA_FILE \
+     --verified-data-csv $VERIFICATION_RESULTS \
+     --life-stage-predictions $LIFESTAGE_RESULTS
+
+ # Print time taken to execute the script
+ echo "Time taken to clean the dataset: $SECONDS seconds"
job_copy_data_to_object_storage.sh ADDED
@@ -0,0 +1,28 @@
+ #!/bin/bash
+ #SBATCH --job-name=upload_dataset
+ #SBATCH --ntasks=1
+ #SBATCH --time=96:00:00
+ #SBATCH --partition=long-cpu   # Ask for long-cpu job
+ #SBATCH --cpus-per-task=2      # Ask for 2 CPUs
+ #SBATCH --mem=4G               # Ask for 4 GB of RAM
+ #SBATCH --output=upload_dataset_%j.out
+
+ # 1. Load the required modules
+ module load miniconda/3
+
+ # 2. Load your environment
+ conda activate ami-ml
+
+ # 3. Load the environment variables outside of python script
+ set -o allexport
+ source .env
+ set +o allexport
+
+ # Keep track of time
+ SECONDS=0
+
+ # 4. Launch your script
+ aws s3 sync $GLOBAL_MODEL_DIR $GLOBAL_MODEL_OBJECT_STORE
+
+ # Print time taken to execute the script
+ echo "Time taken to upload the dataset: $SECONDS seconds"
job_create_webdataset.sh ADDED
@@ -0,0 +1,40 @@
+ #!/bin/bash
+ #SBATCH --job-name=create_webdataset
+ #SBATCH --ntasks=1
+ #SBATCH --time=72:00:00
+ #SBATCH --partition=long-cpu   # Ask for long-cpu job
+ #SBATCH --cpus-per-task=4      # Ask for 4 CPUs
+ #SBATCH --mem=10G              # Ask for 10 GB of RAM
+ #SBATCH --output=create_webdataset_%j.out
+
+ # 1. Load the required modules
+ module load miniconda/3
+
+ # 2. Load your environment
+ conda activate ami-ml
+
+ # 3. Load the environment variables outside of python script
+ set -o allexport
+ source .env
+ set +o allexport
+
+ # Keep track of time
+ SECONDS=0
+
+ # 4. Launch your script
+ ami-dataset create-webdataset \
+     --annotations-csv $SAMPLE_TRAIN_CSV \
+     --webdataset-pattern $SAMPLE_TRAIN_WBDS \
+     --wandb-run wbds_train_sample \
+     --dataset-path $GLOBAL_MODEL_DATASET_PATH \
+     --image-path-column image_path \
+     --label-column acceptedTaxonKey \
+     --columns-to-json $COLUMNS_TO_JSON \
+     --resize-min-size 450 \
+     --wandb-entity $WANDB_ENTITY \
+     --wandb-project $WANDB_PROJECT
+     # --save-category-map-json $CATEGORY_MAP_JSON
+
+ # Print time taken to execute the script
+ echo "Time taken to create the webdataset: $SECONDS seconds"
job_delete_images.sh ADDED
@@ -0,0 +1,30 @@
+ #!/bin/bash
+ #SBATCH --job-name=delete_corrupted_images
+ #SBATCH --ntasks=1
+ #SBATCH --time=4:00:00
+ #SBATCH --partition=long-cpu   # Ask for long-cpu job
+ #SBATCH --cpus-per-task=2      # Ask for 2 CPUs
+ #SBATCH --mem=4G               # Ask for 4 GB of RAM
+ #SBATCH --output=delete_corrupted_images_%j.out
+
+ # 1. Load the required modules
+ module load miniconda/3
+
+ # 2. Load your environment
+ conda activate ami-ml
+
+ # 3. Load the environment variables outside of python script
+ set -o allexport
+ source .env
+ set +o allexport
+
+ # Keep track of time
+ SECONDS=0
+
+ # 4. Launch your script
+ ami-dataset delete-images \
+     --error-images-csv $VERIFICATION_ERROR_RESULTS \
+     --base-path $GLOBAL_MODEL_DATASET_PATH
+
+ # Print time taken to execute the script
+ echo "Time taken to delete the corrupted images: $SECONDS seconds"
job_fetch_images.sh ADDED
@@ -0,0 +1,31 @@
+ #!/bin/bash
+ #SBATCH --job-name=fetch_gbif_images
+ #SBATCH --partition=long-cpu   # Ask for long-cpu job
+ #SBATCH --cpus-per-task=1      # Ask for 1 CPU
+ #SBATCH --mem=300G             # Ask for 300 GB of RAM
+ #SBATCH --output=fetch_gbif_images_%j.out
+
+ # 1. Load the required modules
+ module load miniconda/3
+
+ # 2. Load your environment
+ conda activate ami-ml
+
+ # 3. Load the environment variables outside of python script
+ set -o allexport
+ source .env
+ set +o allexport
+
+ # Keep track of time
+ SECONDS=0
+
+ # 4. Launch your script
+ ami-dataset fetch-images \
+     --dataset-path $GLOBAL_MODEL_DATASET_PATH \
+     --dwca-file $DWCA_FILE \
+     --num-images-per-category 1000 \
+     --num-workers 4 \
+     --subset-list $ACCEPTED_KEY_LIST
+
+ # Print time taken to execute the script
+ echo "Time taken: $SECONDS seconds"
job_gradio_demo.sh ADDED
@@ -0,0 +1,17 @@
+ #!/bin/bash
+ #SBATCH --job-name=gradio_demo
+ #SBATCH --ntasks=1
+ #SBATCH --time=120:00:00
+ #SBATCH --partition=long-cpu   # Ask for long-cpu job
+ #SBATCH --cpus-per-task=1      # Ask for 1 CPU
+ #SBATCH --mem=5G               # Ask for 5 GB of RAM
+ #SBATCH --output=gradio_demo_%j.out
+
+ # 1. Load the required modules
+ module load miniconda/3
+
+ # 2. Load your environment
+ conda activate ami-ml
+
+ # 3. Run the demo
+ gradio global_moth_model/gradio_demo.py
job_predict_lifestage.sh ADDED
@@ -0,0 +1,40 @@
+ #!/bin/bash
+ #SBATCH --job-name=lifestage_prediction
+ #SBATCH --ntasks=1
+ #SBATCH --time=24:00:00
+ #SBATCH --mem=16G
+ #SBATCH --partition=long       # Ask for long job
+ #SBATCH --cpus-per-task=4      # Ask for 4 CPUs
+ #SBATCH --gres=gpu:1           # Ask for 1 GPU
+ #SBATCH --output=lifestage_prediction_%j.out
+
+ # 1. Load the required modules
+ module load miniconda/3
+
+ # 2. Load your environment
+ conda activate ami-ml
+
+ # 3. Load the environment variables outside of python script
+ set -o allexport
+ source .env
+ set +o allexport
+
+ # Keep track of time
+ SECONDS=0
+
+ # 4. Launch your script
+ ami-dataset predict-lifestage \
+     --verified-data-csv $VERIFICATION_RESULTS_P2 \
+     --results-csv $LIFESTAGE_RESULTS_P2 \
+     --wandb-run lifestage_prediction_p2 \
+     --dataset-path $GLOBAL_MODEL_DATASET_PATH \
+     --model-path $LIFESTAGE_MODEL \
+     --category-map-json $LIFESTAGE_CATEGORY_MAP \
+     --wandb-entity $WANDB_ENTITY \
+     --wandb-project $WANDB_PROJECT \
+     --log-frequence 25 \
+     --batch-size 1024 \
+     --num-classes 2
+
+ # Print time taken to execute the script
+ echo "Time taken to run life stage prediction: $SECONDS seconds"
job_split_dataset.sh ADDED
@@ -0,0 +1,32 @@
+ #!/bin/bash
+ #SBATCH --job-name=split_dataset
+ #SBATCH --ntasks=1
+ #SBATCH --time=2:00:00
+ #SBATCH --partition=long-cpu   # Ask for long-cpu job
+ #SBATCH --cpus-per-task=2      # Ask for 2 CPUs
+ #SBATCH --mem=6G               # Ask for 6 GB of RAM
+ #SBATCH --output=split_dataset_%j.out
+
+ # 1. Load the required modules
+ module load miniconda/3
+
+ # 2. Load your environment
+ conda activate ami-ml
+
+ # 3. Load the environment variables outside of python script
+ set -o allexport
+ source .env
+ set +o allexport
+
+ # Keep track of time
+ SECONDS=0
+
+ # 4. Launch your script
+ ami-dataset split-dataset \
+     --dataset-csv $FINAL_CLEAN_DATASET \
+     --split-prefix $SPLIT_PREFIX \
+     --max-instances 1000 \
+     --min-instances 4
+
+ # Print time taken to execute the script
+ echo "Time taken to split the dataset: $SECONDS seconds"
job_verify_images.sh ADDED
@@ -0,0 +1,34 @@
+ #!/bin/bash
+ #SBATCH --job-name=verify_gbif_images
+ #SBATCH --ntasks=1
+ #SBATCH --time=24:00:00
+ #SBATCH --partition=long-cpu   # Ask for long-cpu job
+ #SBATCH --cpus-per-task=16     # Ask for 16 CPUs
+ #SBATCH --mem=300G             # Ask for 300 GB of RAM
+ #SBATCH --output=verify_gbif_images_%j.out
+
+ # 1. Load the required modules
+ module load miniconda/3
+
+ # 2. Load your environment
+ conda activate ami-ml
+
+ # 3. Load the environment variables outside of python script
+ set -o allexport
+ source .env
+ set +o allexport
+
+ # Keep track of time
+ SECONDS=0
+
+ # 4. Launch your script
+ ami-dataset verify-images \
+     --dataset-path $GLOBAL_MODEL_DATASET_PATH \
+     --dwca-file $DWCA_FILE \
+     --num-workers 16 \
+     --results-csv $VERIFICATION_RESULTS \
+     --resume-from-ckpt $VERIFICATION_RESULTS \
+     --subset-list $ACCEPTED_KEY_LIST
+
+ # Print time taken to execute the script
+ echo "Time taken to verify images: $SECONDS seconds"
key_to_name_map.py ADDED
@@ -0,0 +1,31 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+
+ """Create a mapping from taxon keys to species names"""
+
+ # System packages
+ import json
+ import os
+ from pathlib import Path
+
+ # 3rd party packages
+ import pandas as pd
+ from dotenv import load_dotenv
+
+ # Load secrets and config from optional .env file
+ load_dotenv()
+
+ # Variable definitions
+ GLOBAL_MODEL_DIR = os.getenv("GLOBAL_MODEL_DIR")
+ moth_list = pd.read_csv(Path(GLOBAL_MODEL_DIR) / "gbif_moth_checklist_07242024.csv")
+ map_dict = {}
+ map_file = Path(GLOBAL_MODEL_DIR) / "categ_to_name_map.json"
+
+ # Build the dict
+ for _, row in moth_list.iterrows():
+     map_dict[int(row["acceptedTaxonKey"])] = row["species"]
+
+ # Save the dict
+ with open(map_file, "w") as file:
+     json.dump(map_dict, file, indent=2)
model_inference.py ADDED
@@ -0,0 +1,153 @@
+ import json
+
+ import PIL
+ import timm
+ import torch
+ from torchvision import transforms
+
+
+ class ModelInference:
+     """Model inference class definition"""
+
+     def __init__(
+         self,
+         model_path: str,
+         model_type: str,
+         category_map_json: str,
+         categ_to_name_map_json: str,
+         device: str,
+         input_size: int = 128,
+         topk: int = 10,
+     ):
+         self.device = device
+         self.topk = topk
+         self.input_size = input_size
+         self.model_type = model_type
+         self.image = None
+         self.id2categ = self._load_category_map(category_map_json)
+         self.categ2name = self._load_categ_to_name_map(categ_to_name_map_json)
+         self.model = self._load_model(model_path, num_classes=len(self.id2categ))
+         self.model.eval()
+
+     def _load_categ_to_name_map(self, categ_to_name_map_json: str):
+         with open(categ_to_name_map_json, "r") as f:
+             categ_to_name_map = json.load(f)
+
+         return categ_to_name_map
+
+     def _load_category_map(self, category_map_json: str):
+         with open(category_map_json, "r") as f:
+             categories_map = json.load(f)
+
+         id2categ = {categories_map[categ]: categ for categ in categories_map}
+         return id2categ
+
+     def _pad_to_square(self):
+         """Padding transformation to make the image square"""
+         width, height = self.image.size
+         if height < width:
+             return transforms.Pad(padding=[0, 0, 0, width - height])
+         elif height > width:
+             return transforms.Pad(padding=[0, 0, height - width, 0])
+         else:
+             return transforms.Pad(padding=[0, 0, 0, 0])
+
+     def get_transforms(self):
+         mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
+         return transforms.Compose(
+             [
+                 self._pad_to_square(),
+                 transforms.ToTensor(),
+                 transforms.Resize((self.input_size, self.input_size), antialias=True),
+                 transforms.Normalize(mean, std),
+             ]
+         )
+
+     def _load_model(self, model_path: str, num_classes: int, pretrained: bool = True):
+         if self.model_type == "resnet50":
+             model = timm.create_model(
+                 "resnet50", pretrained=pretrained, num_classes=num_classes
+             )
+
+         elif self.model_type == "timm_resnet50":
+             model = timm.create_model(
+                 "resnet50", pretrained=pretrained, num_classes=num_classes
+             )
+
+         elif self.model_type == "timm_convnext-t":
+             model = timm.create_model(
+                 "convnext_tiny_in22k", pretrained=pretrained, num_classes=num_classes
+             )
+
+         elif self.model_type == "timm_convnext-b":
+             model = timm.create_model(
+                 "convnext_base_in22k", pretrained=pretrained, num_classes=num_classes
+             )
+
+         elif self.model_type == "efficientnetv2-b3":
+             model = timm.create_model(
+                 "tf_efficientnetv2_b3", pretrained=pretrained, num_classes=num_classes
+             )
+
+         elif self.model_type == "timm_mobilenetv3large":
+             model = timm.create_model(
+                 "mobilenetv3_large_100", pretrained=pretrained, num_classes=num_classes
+             )
+
+         elif self.model_type == "timm_vit-b16-128":
+             model = timm.create_model(
+                 "vit_base_patch16_224_in21k",
+                 pretrained=pretrained,
+                 img_size=128,
+                 num_classes=num_classes,
+             )
+
+         else:
+             raise RuntimeError(f"Model {self.model_type} not implemented")
+
+         # Load model weights
+         model.load_state_dict(
+             torch.load(model_path, map_location=torch.device(self.device))
+         )
+         # Parallelize inference if multiple GPUs available
+         if torch.cuda.device_count() > 1:
+             model = torch.nn.DataParallel(model)
+
+         model = model.to(self.device)
+         return model
+
+     def predict(self, image: PIL.Image.Image):
+         with torch.no_grad():
+             # Process the image for prediction
+             self.image = image
+             transforms = self.get_transforms()
+             image = transforms(image)
+             image = image.to(self.device)
+             image = image.unsqueeze_(0)
+
+             # Model prediction on the image
+             predictions = self.model(image)
+             predictions = torch.nn.functional.softmax(predictions, dim=1)
+             predictions = predictions.cpu()
+             if self.topk == 0 or self.topk > len(
+                 predictions[0]
+             ):  # topk=0 means get all predictions
+                 predictions = torch.topk(predictions, len(predictions[0]))
+             else:
+                 predictions = torch.topk(predictions, self.topk)
+
+             # Process the results
+             values, indices = (
+                 predictions.values.numpy()[0],
+                 predictions.indices.numpy()[0],
+             )
+             pred_results = {}
+
+             for i in range(len(indices)):
+                 idx, value = indices[i], values[i]
+                 categ = self.id2categ[idx]
+                 sp_name = self.categ2name[categ]
+                 pred_results[sp_name] = value
+                 # pred_results.append([sp_name, round(value*100, 2)])
+
+             return pred_results
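For reference, a minimal standalone use of `ModelInference`, mirroring how `gradio_demo.py` drives it; the file paths below are placeholders for the `.env`-configured artifacts:

```python
import PIL.Image

classifier = ModelInference(
    model_path="global_moth_model.pth",        # placeholder
    model_type="timm_resnet50",
    category_map_json="category_map.json",     # placeholder
    categ_to_name_map_json="categ_to_name_map.json",  # placeholder
    device="cpu",
    topk=5,
)
predictions = classifier.predict(PIL.Image.open("moth.jpg"))  # {species name: score}
```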
prepare_gbif_checklist.py ADDED
@@ -0,0 +1,69 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+
+ """Prepare the GBIF checklist for the global moth model"""
+
+ # System packages
+ import os
+ from pathlib import Path
+
+ # 3rd party packages
+ import pandas as pd
+ from dotenv import load_dotenv
+
+ # Load secrets and config from optional .env file
+ load_dotenv()
+
+
+ def remove_non_species_taxon(checklist: pd.DataFrame) -> pd.DataFrame:
+     """
+     Remove all non-species taxa from the checklist
+     """
+
+     # Keep only rows where the taxon rank is "SPECIES"
+     checklist = checklist.loc[checklist["taxonRank"] == "SPECIES"]
+
+     return checklist
+
+
+ def remove_butterflies(checklist: pd.DataFrame) -> pd.DataFrame:
+     """
+     Remove all butterflies from the checklist
+     """
+
+     # List of butterfly families
+     butterfly_fm = [
+         "Hesperiidae",
+         "Lycaenidae",
+         "Nymphalidae",
+         "Papilionidae",
+         "Pieridae",
+         "Riodinidae",
+         "Hedylidae",
+     ]
+
+     # Remove butterfly families
+     checklist = checklist.loc[~checklist["family"].isin(butterfly_fm)]
+
+     return checklist
+
+
+ if __name__ == "__main__":
+     GLOBAL_MODEL_DIR = os.getenv("GLOBAL_MODEL_DIR")
+
+     # Remove non-species taxa
+     checklist = "gbif_leps_checklist_07242024_original.csv"
+     checklist_pd = pd.read_csv(Path(GLOBAL_MODEL_DIR) / checklist)
+     leps_checklist_pd = remove_non_species_taxon(checklist_pd)
+     leps_checklist_pd.to_csv(
+         Path(GLOBAL_MODEL_DIR) / "gbif_leps_checklist_07242024.csv", index=False
+     )
+
+     # Remove butterflies
+     checklist = "gbif_leps_checklist_07242024.csv"
+     checklist_pd = pd.read_csv(Path(GLOBAL_MODEL_DIR) / checklist)
+     moth_checklist_pd = remove_butterflies(checklist_pd)
+     moth_checklist_pd.to_csv(
+         Path(GLOBAL_MODEL_DIR) / "gbif_moth_checklist_07242024.csv", index=False
+     )
split_verification_list.py ADDED
@@ -0,0 +1,32 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+
+ """Split the image verification list into multiple parts"""
+
+ # System packages
+ import os
+ from pathlib import Path
+
+ # 3rd party packages
+ import pandas as pd
+ from dotenv import load_dotenv
+
+ # Load secrets and config from optional .env file
+ load_dotenv()
+
+ # Load the list
+ img_verf_df = pd.read_csv(os.getenv("VERIFICATION_RESULTS"))
+ img_verf_lstage_nan_df = img_verf_df[img_verf_df.lifeStage.isnull()].copy()
+
+ # Slice the list
+ num_entries = img_verf_lstage_nan_df.shape[0]
+ half = int(num_entries / 2)
+ img_verf_lstage_nan_p1 = img_verf_lstage_nan_df.iloc[:half, :].copy()
+ img_verf_lstage_nan_p2 = img_verf_lstage_nan_df.iloc[half:, :].copy()
+
+ # Save the two splits
+ save_dir = os.getenv("GLOBAL_MODEL_DIR")
+ fname = Path(os.getenv("VERIFICATION_RESULTS")).stem
+ img_verf_lstage_nan_p1.to_csv(Path(save_dir) / str(fname + "_p1" + ".csv"), index=False)
+ img_verf_lstage_nan_p2.to_csv(Path(save_dir) / str(fname + "_p2" + ".csv"), index=False)
test.py ADDED
@@ -0,0 +1,28 @@
+ # import webdataset as wds
+
+ # dataset_path = "/home/mila/a/aditya.jain/scratch/global_model/webdataset/train/train450-000000.tar"
+
+ # # Create a WebDataset reader
+ # dataset = wds.WebDataset(dataset_path)
+
+ # for sample in dataset:
+ #     a = 2
+ #     for key, value in sample.items():
+ #         print(f"{key}: {type(value)}")
+
+ import json
+
+ categ_map_f = "/home/mila/a/aditya.jain/scratch/global_model/category_map.json"
+ new_categ_map = {}
+
+ with open(categ_map_f, "r") as f:
+     category_map = json.load(f)
+
+ for key in category_map.keys():
+     new_key = str(int(float(key)))
+     new_categ_map[new_key] = category_map[key]
+
+
+ with open("/home/mila/a/aditya.jain/scratch/global_model/category_map_v2.json", "w") as f:
+     json.dump(new_categ_map, f)