Initial commit
- README.md +103 -3
- brain2vec_PCA.py +194 -0
- create_csv.py +39 -0
- inputs_example.csv +6 -0
- model.py +121 -0
- requirements.txt +15 -0
README.md
CHANGED
@@ -1,3 +1,103 @@
---
license: mit
language:
- en
task_categories:
- image-classification
tags:
- medical
- brain-data
- mri
pretty_name: 3D Brain Structure MRI PCA
---

## 🧠 Model Summary
# brain2vec
A linear PCA model for brain structure T1 MRIs. The model takes a 3D MRI NIfTI file, compresses it to 1200 latent dimensions, and reconstructs the image.


# Training data
[Radiata brain-structure](https://huggingface.co/datasets/radiata-ai/brain-structure): 3,066 scans from 2,085 individuals in the 'train' split. Mean age = 45.1 ± 24.5 years, including 2,847 scans from cognitively normal subjects and 219 scans from individuals with an Alzheimer's disease clinical diagnosis.

# Example usage
```
# get the brain2vec model repository
git clone https://huggingface.co/radiata-ai/brain2vec
cd brain2vec

# set up a virtual environment
python3 -m venv venv_brain2vec
source venv_brain2vec/bin/activate

# install Python libraries
pip install -r requirements.txt

# create inputs.csv, listing the scan paths and other info
# (this script downloads the radiata-ai/brain-structure dataset)
python create_csv.py

mkdir ae_cache
mkdir ae_output

# fit the PCA model and run inference in one pass
# (saves pca_embeddings.npy and pca_reconstructions.npy to --output_dir)
nohup python brain2vec_PCA.py \
    --inputs_csv ./inputs.csv \
    --cache_dir ./ae_cache \
    --output_dir ./ae_output \
    > train_log.txt 2>&1 &
```
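
The run writes its outputs as NumPy arrays into `--output_dir`. A minimal sketch for loading them afterwards (file names as written by brain2vec_PCA.py):

```
import numpy as np

Z = np.load("ae_output/pca_embeddings.npy")             # (n_scans, 1200)
X_recon = np.load("ae_output/pca_reconstructions.npy")  # (n_scans, 614400)

# reshape one flattened reconstruction back to the volume grid
vol = X_recon[0].reshape(80, 96, 80)
print(Z.shape, vol.shape)
```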
# Methods
Input transform: volumes resampled to pixdim=2 and padded/cropped to shape (80, 96, 80).

Training configuration:
- 10 epochs
- max_batch_size: 2
- batch_size: 16
- lr: 1e-4

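The PCA "autoencoder" is fully linear, so encode and decode reduce to mean-centered matrix products. A toy NumPy sketch of the algebra (stand-in arrays, not the fitted model):

```
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((4, 10))   # stand-in for flattened MRIs
mean = X.mean(axis=0)
Q, _ = np.linalg.qr(rng.standard_normal((10, 3)))
components = Q.T                   # stand-in for ipca.components_, shape (k, n_features)

Z = (X - mean) @ components.T      # encode: project onto the components
X_recon = Z @ components + mean    # decode: linear reconstruction
print(Z.shape, X_recon.shape)      # (4, 3) (4, 10)
```
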
# References
- Puglisi
- Pinaya

# Citation
```
@dataset{Radiata-Brain-Structure,
  author    = {Jesse Brown and Clayton Young},
  title     = {Brain-Structure: Processed Structural MRI Brain Scans Across the Lifespan},
  year      = {2025},
  url       = {https://huggingface.co/datasets/radiata-ai/brain-structure},
  note      = {Version 1.0},
  publisher = {Hugging Face}
}
```

# License
MIT License

Copyright (c) 2025

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
brain2vec_PCA.py
ADDED
@@ -0,0 +1,194 @@
#!/usr/bin/env python3

"""
brain2vec_PCA.py

This script demonstrates how to:
  1) Load a dataset of MRI volumes using MONAI transforms (as in brain2vec_linearAE.py).
  2) Flatten each 3D volume into a 1D vector (614,400 features for 80x96x80).
  3) Perform IncrementalPCA to reduce dimensionality to 1200 components.
  4) Provide a 'forward()' method that returns (reconstruction, embedding),
     mimicking the interface of a linear autoencoder.
"""

import os
import argparse
import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader

from monai import transforms
from monai.data import Dataset, PersistentDataset

from sklearn.decomposition import IncrementalPCA

###################################################################
# Constants for the typical config
###################################################################
RESOLUTION = 2
INPUT_SHAPE_AE = (80, 96, 80)
N_COMPONENTS = 1200

###################################################################
# Helper classes/functions
###################################################################
def get_dataset_from_pd(df: pd.DataFrame, transforms_fn, cache_dir: str):
    """
    Returns a monai.data.PersistentDataset if `cache_dir` is defined
    (to speed up repeated loading), otherwise a plain monai.data.Dataset.
    """
    if cache_dir and cache_dir.strip():
        os.makedirs(cache_dir, exist_ok=True)
        dataset = PersistentDataset(data=df.to_dict(orient='records'),
                                    transform=transforms_fn,
                                    cache_dir=cache_dir)
    else:
        dataset = Dataset(data=df.to_dict(orient='records'),
                          transform=transforms_fn)
    return dataset


class PCAAutoencoder:
    """
    A PCA 'autoencoder' using IncrementalPCA for memory efficiency, providing:
      - fit(X): partial fit on batches
      - transform(X): get embeddings
      - inverse_transform(Z): reconstruct from embeddings
      - forward(X): returns (X_recon, Z) for a direct API
        similar to a shallow linear AE.
    """
    def __init__(self, n_components=N_COMPONENTS, batch_size=128):
        self.n_components = n_components
        self.batch_size = batch_size
        self.ipca = IncrementalPCA(n_components=self.n_components)

    def fit(self, X: np.ndarray):
        """
        Incrementally fit the PCA model on batches of data.
        X: shape (n_samples, n_features).
        """
        n_samples = X.shape[0]
        for start_idx in range(0, n_samples, self.batch_size):
            end_idx = min(start_idx + self.batch_size, n_samples)
            self.ipca.partial_fit(X[start_idx:end_idx])

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Projects data into the PCA latent space in batches.
        Returns Z: shape (n_samples, n_components).
        """
        results = []
        n_samples = X.shape[0]
        for start_idx in range(0, n_samples, self.batch_size):
            end_idx = min(start_idx + self.batch_size, n_samples)
            Z_chunk = self.ipca.transform(X[start_idx:end_idx])
            results.append(Z_chunk)
        return np.vstack(results)

    def inverse_transform(self, Z: np.ndarray) -> np.ndarray:
        """
        Reconstructs data from the PCA latent space in batches.
        Returns X_recon: shape (n_samples, n_features).
        """
        results = []
        n_samples = Z.shape[0]
        for start_idx in range(0, n_samples, self.batch_size):
            end_idx = min(start_idx + self.batch_size, n_samples)
            X_chunk = self.ipca.inverse_transform(Z[start_idx:end_idx])
            results.append(X_chunk)
        return np.vstack(results)

    def forward(self, X: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        """
        Mimics a linear AE's forward(), returning (X_recon, Z).
        """
        Z = self.transform(X)
        X_recon = self.inverse_transform(Z)
        return X_recon, Z


def load_and_flatten_dataset(csv_path: str, cache_dir: str, transforms_fn) -> np.ndarray:
    """
    Loads the dataset from csv_path, applies the MONAI transforms,
    and flattens each 3D MRI into a 1D vector of length 80*96*80.
    Returns a numpy array X with shape (n_samples, 614400).
    """
    df = pd.read_csv(csv_path)
    dataset = get_dataset_from_pd(df, transforms_fn, cache_dir)

    # Collect the flattened volumes in a list, then stack.
    X_list = []

    # A single-worker DataLoader for clarity; if memory allows,
    # a multi-worker DataLoader can speed this up.
    loader = DataLoader(dataset, batch_size=1, num_workers=0)

    for batch in loader:
        # batch["image"] shape: (1, 1, 80, 96, 80)
        img = batch["image"].squeeze(0)  # shape: (1, 80, 96, 80)
        img_np = img.numpy()             # convert to np array, shape: (1, D, H, W)
        flattened = img_np.flatten()     # shape: (614400,)
        X_list.append(flattened)

    X = np.vstack(X_list)  # shape: (n_samples, 614400)
    return X


def main():
    parser = argparse.ArgumentParser(description="PCA Autoencoder with MONAI transforms example.")
    parser.add_argument("--inputs_csv", type=str, required=True, help="CSV with 'image_path' column.")
    parser.add_argument("--cache_dir", type=str, default="", help="Cache directory for MONAI PersistentDataset.")
    parser.add_argument("--output_dir", type=str, default="./pca_outputs", help="Where to save PCA model and embeddings.")
    parser.add_argument("--batch_size_ipca", type=int, default=128, help="Batch size for IncrementalPCA partial_fit().")
    parser.add_argument("--n_components", type=int, default=1200, help="Number of PCA components.")
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    # Same transforms as in brain2vec_linearAE.py
    transforms_fn = transforms.Compose([
        transforms.CopyItemsD(keys={'image_path'}, names=['image']),
        transforms.LoadImageD(image_only=True, keys=['image']),
        transforms.EnsureChannelFirstD(keys=['image']),
        transforms.SpacingD(pixdim=RESOLUTION, keys=['image']),
        transforms.ResizeWithPadOrCropD(spatial_size=INPUT_SHAPE_AE, mode='minimum', keys=['image']),
        transforms.ScaleIntensityD(minv=0, maxv=1, keys=['image']),
    ])

    print("Loading and flattening dataset from:", args.inputs_csv)
    X = load_and_flatten_dataset(args.inputs_csv, args.cache_dir, transforms_fn)
    print(f"Dataset shape after flattening: {X.shape}")

    # Build the PCAAutoencoder
    model = PCAAutoencoder(n_components=args.n_components, batch_size=args.batch_size_ipca)

    # Fit the PCA model
    print("Fitting IncrementalPCA in batches...")
    model.fit(X)
    print("Done fitting PCA. Transforming data to embeddings...")

    # Get embeddings & reconstructions
    X_recon, Z = model.forward(X)
    print("Embeddings shape:", Z.shape)
    print("Reconstruction shape:", X_recon.shape)

    # Save outputs
    embeddings_path = os.path.join(args.output_dir, "pca_embeddings.npy")
    recons_path = os.path.join(args.output_dir, "pca_reconstructions.npy")
    np.save(embeddings_path, Z)
    np.save(recons_path, X_recon)
    print(f"Saved embeddings to {embeddings_path} and reconstructions to {recons_path}")

    # To store the fitted PCA components for future use:
    # from joblib import dump
    # ipca_model_path = os.path.join(args.output_dir, "pca_model.joblib")
    # dump(model.ipca, ipca_model_path)
    # print(f"Saved PCA model to {ipca_model_path}")


if __name__ == "__main__":
    main()
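
A quick way to sanity-check the PCAAutoencoder interface on synthetic data (a sketch only; real inputs come from load_and_flatten_dataset, and partial_fit requires the batch size to be at least n_components):

```
import numpy as np
from brain2vec_PCA import PCAAutoencoder

X = np.random.rand(64, 1000).astype(np.float32)  # 64 fake flattened scans
model = PCAAutoencoder(n_components=16, batch_size=32)
model.fit(X)
X_recon, Z = model.forward(X)
print(Z.shape, X_recon.shape)                    # (64, 16) (64, 1000)
```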
create_csv.py
ADDED
@@ -0,0 +1,39 @@
#!/usr/bin/env python3
import os
import pandas as pd
from datasets import load_dataset

def row_to_dict(row, split_name):
    return {
        "image_uid": row["id"],
        "age": int(row["metadata"]["age"]),
        "sex": 1 if row["metadata"]["sex"].lower() == "male" else 2,
        "image_path": os.path.abspath(row["nii_filepath"]),
        "split": split_name
    }

def main():
    # Load the datasets
    ds_train = load_dataset("radiata-ai/brain-structure", split="train", trust_remote_code=True)
    ds_val = load_dataset("radiata-ai/brain-structure", split="validation", trust_remote_code=True)
    ds_test = load_dataset("radiata-ai/brain-structure", split="test", trust_remote_code=True)

    rows = []

    # Process each split
    for data_row in ds_train:
        rows.append(row_to_dict(data_row, "train"))
    for data_row in ds_val:
        rows.append(row_to_dict(data_row, "validation"))
    for data_row in ds_test:
        rows.append(row_to_dict(data_row, "test"))

    # Create a DataFrame and write it to CSV
    df = pd.DataFrame(rows)
    output_csv = "inputs.csv"
    df.to_csv(output_csv, index=False)
    print(f"CSV file created: {output_csv}")

if __name__ == "__main__":
    main()
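
The resulting inputs.csv has the columns image_uid, age, sex, image_path, and split (see inputs_example.csv below). A small sketch for selecting one split before fitting:

```
import pandas as pd

df = pd.read_csv("inputs.csv")
train_df = df[df["split"] == "train"]  # columns as written by create_csv.py
train_df.to_csv("inputs_train.csv", index=False)
print(len(train_df), "training scans")
```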
inputs_example.csv
ADDED
@@ -0,0 +1,6 @@
image_uid,age,sex,image_path,split
0,81,2,/Users/jbrown2/.cache/huggingface/datasets/downloads/extracted/6429865a89f9ae54df1c3c2db5d0f1f25cf7dd43cb87704d76ed08cf8c194aba/OASIS-2/sub-OASIS20133/ses-03/anat/msub-OASIS20133_ses-03_T1w_brain_affine_mni.nii.gz,train
1,78,2,/Users/jbrown2/.cache/huggingface/datasets/downloads/extracted/6429865a89f9ae54df1c3c2db5d0f1f25cf7dd43cb87704d76ed08cf8c194aba/OASIS-2/sub-OASIS20133/ses-01/anat/msub-OASIS20133_ses-01_T1w_brain_affine_mni.nii.gz,train
2,87,1,/Users/jbrown2/.cache/huggingface/datasets/downloads/extracted/6429865a89f9ae54df1c3c2db5d0f1f25cf7dd43cb87704d76ed08cf8c194aba/OASIS-2/sub-OASIS20105/ses-02/anat/msub-OASIS20105_ses-02_T1w_brain_affine_mni.nii.gz,train
3,86,1,/Users/jbrown2/.cache/huggingface/datasets/downloads/extracted/6429865a89f9ae54df1c3c2db5d0f1f25cf7dd43cb87704d76ed08cf8c194aba/OASIS-2/sub-OASIS20105/ses-01/anat/msub-OASIS20105_ses-01_T1w_brain_affine_mni.nii.gz,train
4,84,1,/Users/jbrown2/.cache/huggingface/datasets/downloads/extracted/6429865a89f9ae54df1c3c2db5d0f1f25cf7dd43cb87704d76ed08cf8c194aba/OASIS-2/sub-OASIS20102/ses-02/anat/msub-OASIS20102_ses-02_T1w_brain_affine_mni.nii.gz,train
model.py
ADDED
@@ -0,0 +1,121 @@
# model.py
import os
from typing import Optional

import torch
import torch.nn as nn
from monai.transforms import (
    Compose,
    CopyItemsD,
    LoadImageD,
    EnsureChannelFirstD,
    SpacingD,
    ResizeWithPadOrCropD,
    ScaleIntensityD,
)

# Constants for the typical config
RESOLUTION = 2
INPUT_SHAPE_AE = (80, 96, 80)

# Define the exact transform pipeline for input MRI
transforms_fn = Compose([
    CopyItemsD(keys={'image_path'}, names=['image']),
    LoadImageD(image_only=True, keys=['image']),
    EnsureChannelFirstD(keys=['image']),
    SpacingD(pixdim=RESOLUTION, keys=['image']),
    ResizeWithPadOrCropD(spatial_size=INPUT_SHAPE_AE, mode='minimum', keys=['image']),
    ScaleIntensityD(minv=0, maxv=1, keys=['image']),
])

def preprocess_mri(image_path: str, device: str = "cpu") -> torch.Tensor:
    """
    Preprocess an MRI using MONAI transforms to produce
    a 5D tensor (batch=1, channels=1, D, H, W) for inference.
    """
    data_dict = {"image_path": image_path}
    output_dict = transforms_fn(data_dict)
    image_tensor = output_dict["image"]       # shape: (1, D, H, W)
    image_tensor = image_tensor.unsqueeze(0)  # => (batch=1, channel=1, D, H, W)
    return image_tensor.to(device)


class ShallowLinearAutoencoder(nn.Module):
    """
    A purely linear autoencoder with one hidden layer.
    - Flatten input into a vector
    - Linear encoder (no activation)
    - Linear decoder (no activation)
    - Reshape output to original volume shape
    """
    def __init__(self, input_shape=(80, 96, 80), hidden_size=1200):
        super().__init__()
        self.input_shape = input_shape
        self.input_dim = input_shape[0] * input_shape[1] * input_shape[2]
        self.hidden_size = hidden_size

        # Encoder (no activation for PCA-like behavior)
        self.encoder = nn.Sequential(
            nn.Flatten(),
            nn.Linear(self.input_dim, self.hidden_size),
        )

        # Decoder (no activation)
        self.decoder = nn.Sequential(
            nn.Linear(self.hidden_size, self.input_dim),
        )

    def encode(self, x: torch.Tensor):
        return self.encoder(x)

    def decode(self, z: torch.Tensor):
        out = self.decoder(z)
        # Reshape to (N, 1, D, H, W)
        return out.view(-1, 1, *self.input_shape)

    def forward(self, x: torch.Tensor):
        """
        Return (reconstruction, embedding, None) to keep a similar API
        to the old VAE-based code, though there's no σ for sampling.
        """
        z = self.encode(x)
        reconstruction = self.decode(z)
        return reconstruction, z, None


class Brain2vec(nn.Module):
    """
    A wrapper around the ShallowLinearAutoencoder, providing a from_pretrained(...)
    method for model loading, mirroring the old usage with AutoencoderKL.
    """
    def __init__(self, device: str = "cpu"):
        super().__init__()
        # Instantiate the shallow linear model
        self.model = ShallowLinearAutoencoder(input_shape=INPUT_SHAPE_AE, hidden_size=1200)
        self.to(device)

    def forward(self, x: torch.Tensor):
        """
        Forward pass that returns (reconstruction, embedding, None).
        """
        return self.model(x)

    @staticmethod
    def from_pretrained(
        checkpoint_path: Optional[str] = None,
        device: str = "cpu"
    ) -> nn.Module:
        """
        Load a pretrained ShallowLinearAutoencoder if a checkpoint path is provided.
        Args:
            checkpoint_path (Optional[str]): path to a .pth checkpoint
            device (str): "cpu", "cuda", etc.
        """
        model = Brain2vec(device=device)
        if checkpoint_path is not None:
            if not os.path.exists(checkpoint_path):
                raise FileNotFoundError(f"Checkpoint {checkpoint_path} not found.")
            state_dict = torch.load(checkpoint_path, map_location=device)
            model.load_state_dict(state_dict)
        model.eval()
        return model
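
A short sketch of the intended inference path with these helpers (the scan path is a placeholder; passing checkpoint_path=None builds an untrained model):

```
import torch
from model import Brain2vec, preprocess_mri

# load weights if a checkpoint is available; None builds an untrained model
model = Brain2vec.from_pretrained(checkpoint_path=None, device="cpu")

x = preprocess_mri("/path/to/scan_T1w_brain_affine_mni.nii.gz")  # (1, 1, 80, 96, 80)
with torch.no_grad():
    reconstruction, z, _ = model(x)
print(z.shape)  # (1, 1200)
```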
requirements.txt
ADDED
@@ -0,0 +1,15 @@
# requirements.txt

# PyTorch (CUDA or CPU version). For GPU install, see PyTorch docs for the correct wheel.
torch>=1.12

# MONAI v1.2+ has the 'generative' subpackage with AutoencoderKL, PatchDiscriminator, etc.
monai-weekly
monai-generative

# scikit-learn provides IncrementalPCA (used by brain2vec_PCA.py)
scikit-learn

# Common Python libraries
pandas
numpy
nibabel
matplotlib
datasets