jesseab committed
Commit 653468a · 1 Parent(s): bb04d63

Code changes

Files changed (2):
  1. brain2vec_PCA.py   +108 -55
  2. requirements.txt   +2 -1
brain2vec_PCA.py CHANGED
@@ -3,12 +3,16 @@
 """
 pca_autoencoder.py
 
-This script demonstrates how to:
-  1) Load a dataset of MRI volumes using MONAI transforms (as in brain2vec_linearAE.py).
-  2) Flatten each 3D volume into a 1D vector (614,400 features if 80x96x80).
-  3) Perform IncrementalPCA to reduce dimensionality to 1200 components.
-  4) Provide a 'forward()' method that returns (reconstruction, embedding),
-     mimicking the interface of a linear autoencoder.
+Adjustments requested:
+  1. Only fit on scans with a 'train' label in the inputs.csv 'split' column.
+  2. An option to either run incremental PCA or standard PCA.
+
+Example usage:
+  python pca_autoencoder.py \
+      --inputs_csv /path/to/inputs.csv \
+      --output_dir ./pca_outputs \
+      --pca_type standard \
+      --n_components 100
 """
 
 import os
@@ -22,17 +26,20 @@ from torch.utils.data import DataLoader
 from monai import transforms
 from monai.data import Dataset, PersistentDataset
 
-from sklearn.decomposition import IncrementalPCA
+# We'll import both PCA classes, and decide which to use based on CLI arg.
+from sklearn.decomposition import PCA, IncrementalPCA
+
 
 ###################################################################
 # Constants for your typical config
 ###################################################################
 RESOLUTION = 2
 INPUT_SHAPE_AE = (80, 96, 80)
-N_COMPONENTS = 1200
+DEFAULT_N_COMPONENTS = 1200
+
 
 ###################################################################
-# Helper classes/functions
+# Helper: get_dataset_from_pd (same as in brain2vec_linearAE.py)
 ###################################################################
 def get_dataset_from_pd(df: pd.DataFrame, transforms_fn, cache_dir: str):
     """
@@ -50,35 +57,57 @@ def get_dataset_from_pd(df: pd.DataFrame, transforms_fn, cache_dir: str):
     return dataset
 
 
+###################################################################
+# PCAAutoencoder
+###################################################################
 class PCAAutoencoder:
     """
-    A PCA 'autoencoder' using IncrementalPCA for memory efficiency,
-    providing:
-      - fit(X): partial fit on batches
+    A PCA 'autoencoder' that can use either standard PCA or IncrementalPCA:
+      - fit(X): trains the model
       - transform(X): get embeddings
-      - inverse_transform(Z): reconstruct from embeddings
-      - forward(X): returns (X_recon, Z) for a direct API
-        similar to a shallow linear AE.
+      - inverse_transform(Z): reconstruct data from embeddings
+      - forward(X): returns (X_recon, Z)
+
+    If using standard PCA, we do a single call to .fit(X).
+    If using incremental PCA, we do .partial_fit on data in batches.
     """
-    def __init__(self, n_components=N_COMPONENTS, batch_size=128):
+    def __init__(self, n_components=DEFAULT_N_COMPONENTS, batch_size=128, pca_type='incremental'):
+        """
+        Args:
+            n_components (int): number of principal components to keep
+            batch_size (int): chunk size for either partial_fit or chunked .transform
+            pca_type (str): 'incremental' or 'standard'
+        """
         self.n_components = n_components
         self.batch_size = batch_size
-        self.ipca = IncrementalPCA(n_components=self.n_components)
+        self.pca_type = pca_type.lower()
+
+        if self.pca_type == 'standard':
+            self.ipca = PCA(n_components=self.n_components, svd_solver='randomized')
+        else:
+            # default to incremental
+            self.ipca = IncrementalPCA(n_components=self.n_components)
 
     def fit(self, X: np.ndarray):
         """
-        Incrementally fit the PCA model on batches of data.
-        X: shape (n_samples, n_features).
+        Fit the PCA model. If incremental, calls partial_fit in batches.
+        If standard, calls .fit once on the entire data matrix.
+        X: shape (n_samples, n_features)
         """
-        n_samples = X.shape[0]
-        for start_idx in range(0, n_samples, self.batch_size):
-            end_idx = min(start_idx + self.batch_size, n_samples)
-            self.ipca.partial_fit(X[start_idx:end_idx])
+        if self.pca_type == 'standard':
+            # Potentially large memory usage, so be sure your system can handle it.
+            self.ipca.fit(X)
+        else:
+            # IncrementalPCA
+            n_samples = X.shape[0]
+            for start_idx in range(0, n_samples, self.batch_size):
+                end_idx = min(start_idx + self.batch_size, n_samples)
+                self.ipca.partial_fit(X[start_idx:end_idx])
 
     def transform(self, X: np.ndarray) -> np.ndarray:
         """
-        Projects data into the PCA latent space in batches.
-        Returns Z: shape (n_samples, n_components).
+        Project data into the PCA latent space in batches for memory efficiency.
+        Returns Z with shape (n_samples, n_components)
         """
        results = []
        n_samples = X.shape[0]
@@ -91,7 +120,7 @@ class PCAAutoencoder:
     def inverse_transform(self, Z: np.ndarray) -> np.ndarray:
         """
         Reconstruct data from PCA latent space in batches.
-        Returns X_recon: shape (n_samples, n_features).
+        Returns X_recon with shape (n_samples, n_features).
         """
         results = []
         n_samples = Z.shape[0]
@@ -110,46 +139,65 @@ class PCAAutoencoder:
         return X_recon, Z
 
 
+###################################################################
+# Load and Flatten Data
+###################################################################
 def load_and_flatten_dataset(csv_path: str, cache_dir: str, transforms_fn) -> np.ndarray:
     """
-    Loads the dataset from csv_path, applies the monai transforms,
-    and flattens each 3D MRI into a 1D vector of shape (80*96*80).
-    Returns a numpy array X with shape (n_samples, 614400).
+    1) Reads CSV.
+    2) Filters rows if 'split' in columns => only keep 'split' == 'train'.
+    3) Applies transforms to each image, flattening them into a 1D vector (614,400).
+    4) Returns a NumPy array X: shape (n_samples, 614400).
     """
     df = pd.read_csv(csv_path)
-    dataset = get_dataset_from_pd(df, transforms_fn, cache_dir)
 
-    # We'll put the flattened data into this list, then stack.
-    X_list = []
+    # Filter only 'train' if the split column exists
+    if 'split' in df.columns:
+        df = df[df['split'] == 'train']
+    # If there is no 'split' column, we assume the entire CSV is for training.
 
-    # If memory allows, you can simply do a single-threaded loop
-    # or multi-worker DataLoader for speed.
-    # We'll demonstrate a simple single-worker here for clarity.
+    dataset = get_dataset_from_pd(df, transforms_fn, cache_dir)
     loader = DataLoader(dataset, batch_size=1, num_workers=0)
 
+    # We'll store each flattened volume in a list, then stack
+    X_list = []
     for batch in loader:
-        # batch["image"] shape: (1, 1, 80, 96, 80)
-        img = batch["image"].squeeze(0)  # shape: (1, 80, 96, 80)
-        img_np = img.numpy()  # convert to np array, shape: (1, D, H, W)
-        flattened = img_np.flatten()  # shape: (614400,)
+        # batch["image"] shape => (1, 1, 80, 96, 80)
+        img = batch["image"].squeeze(0)  # => (1, 80, 96, 80)
+        img_np = img.numpy()
+        flattened = img_np.flatten()     # => (614400,)
         X_list.append(flattened)
 
-    X = np.vstack(X_list)  # shape: (n_samples, 614400)
+    if len(X_list) == 0:
+        raise ValueError("No training samples found (split='train'). Check your CSV or 'split' values.")
+
+    X = np.vstack(X_list)
     return X
 
 
+###################################################################
+# Main
+###################################################################
 def main():
-    parser = argparse.ArgumentParser(description="PCA Autoencoder with MONAI transforms example.")
-    parser.add_argument("--inputs_csv", type=str, required=True, help="CSV with 'image_path' column.")
-    parser.add_argument("--cache_dir", type=str, default="", help="Cache directory for MONAI PersistentDataset.")
-    parser.add_argument("--output_dir", type=str, default="./pca_outputs", help="Where to save PCA model and embeddings.")
-    parser.add_argument("--batch_size_ipca", type=int, default=128, help="Batch size for IncrementalPCA partial_fit().")
-    parser.add_argument("--n_components", type=int, default=1200, help="Number of PCA components.")
+    parser = argparse.ArgumentParser(description="PCA Autoencoder with MONAI transforms and 'split' filtering.")
+    parser.add_argument("--inputs_csv", type=str, required=True,
+                        help="Path to CSV with at least 'image_path' column, optional 'split' column.")
+    parser.add_argument("--cache_dir", type=str, default="",
+                        help="Cache directory for MONAI PersistentDataset (optional).")
+    parser.add_argument("--output_dir", type=str, default="./pca_outputs",
+                        help="Where to save PCA model and embeddings.")
+    parser.add_argument("--batch_size_ipca", type=int, default=128,
+                        help="Batch size for partial_fit or chunked transform.")
+    parser.add_argument("--n_components", type=int, default=1200,
+                        help="Number of PCA components to keep.")
+    parser.add_argument("--pca_type", type=str, default="incremental",
+                        choices=["incremental", "standard"],
+                        help="Which PCA algorithm to use: 'incremental' or 'standard'.")
     args = parser.parse_args()
 
     os.makedirs(args.output_dir, exist_ok=True)
 
-    # Same transforms as in brain2vec_linearAE.py
+    # define transforms as in brain2vec_linearAE.py
     transforms_fn = transforms.Compose([
         transforms.CopyItemsD(keys={'image_path'}, names=['image']),
         transforms.LoadImageD(image_only=True, keys=['image']),
@@ -163,27 +211,32 @@ def main():
     X = load_and_flatten_dataset(args.inputs_csv, args.cache_dir, transforms_fn)
     print(f"Dataset shape after flattening: {X.shape}")
 
-    # Build PCAAutoencoder
-    model = PCAAutoencoder(n_components=args.n_components, batch_size=args.batch_size_ipca)
+    # Build the PCAAutoencoder with chosen type
+    model = PCAAutoencoder(
+        n_components=args.n_components,
+        batch_size=args.batch_size_ipca,
+        pca_type=args.pca_type
+    )
 
     # Fit the PCA model
-    print("Fitting IncrementalPCA in batches...")
+    print(f"Fitting {args.pca_type.capitalize()}PCA in batches...")
     model.fit(X)
     print("Done fitting PCA. Transforming data to embeddings...")
 
     # Get embeddings & reconstruction
     X_recon, Z = model.forward(X)
-    print("Embeddings shape:", Z.shape)
-    print("Reconstruction shape:", X_recon.shape)
+    print("Embeddings shape:", Z.shape)            # (n_samples, n_components)
+    print("Reconstruction shape:", X_recon.shape)  # (n_samples, 614400)
 
-    # Optional: Save
+    # Save
     embeddings_path = os.path.join(args.output_dir, "pca_embeddings.npy")
     recons_path = os.path.join(args.output_dir, "pca_reconstructions.npy")
     np.save(embeddings_path, Z)
     np.save(recons_path, X_recon)
-    print(f"Saved embeddings to {embeddings_path} and reconstructions to {recons_path}")
+    print(f"Saved embeddings to {embeddings_path}")
+    print(f"Saved reconstructions to {recons_path}")
 
-    # If you want to store the actual PCA components for future usage:
+    # Optionally save the actual PCA model with joblib
     # from joblib import dump
     # ipca_model_path = os.path.join(args.output_dir, "pca_model.joblib")
     # dump(model.ipca, ipca_model_path)
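
The tail of main() leaves the joblib dump of the fitted model commented out. If those lines are enabled, the model can be reloaded later to embed or reconstruct further scans; a minimal sketch (not part of the commit), assuming the default output paths above and using random data in place of a properly transformed, flattened scan:

    import numpy as np
    from joblib import load

    # Reload the fitted PCA/IncrementalPCA object and the stored embeddings
    # (paths assume the defaults written by main(), with the dump lines uncommented).
    ipca = load("./pca_outputs/pca_model.joblib")
    Z = np.load("./pca_outputs/pca_embeddings.npy")   # (n_samples, n_components)

    # Stand-in for one new scan, already transformed and flattened to 614,400 voxels.
    X_new = np.random.rand(1, 80 * 96 * 80).astype(np.float32)

    Z_new = ipca.transform(X_new)                     # (1, n_components)
    X_rec = ipca.inverse_transform(Z_new)             # (1, 614400)
    volume = X_rec[0].reshape(80, 96, 80)             # back to the volume grid
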
requirements.txt CHANGED
@@ -12,4 +12,5 @@ pandas
 numpy
 nibabel
 matplotlib
-datasets
+datasets
+scikit-learn
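
requirements.txt gains scikit-learn because the script now imports PCA and IncrementalPCA from sklearn.decomposition (the datasets entry is only re-added, apparently to restore the trailing newline). A quick import check after pip install -r requirements.txt:

    # Confirm the new dependency resolves.
    from sklearn.decomposition import PCA, IncrementalPCA
    print(PCA.__name__, IncrementalPCA.__name__)
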