# %%
# Train and evaluate local-compartment classifiers (axon/dendrite/soma/glia)
# on level-2 node features from 3 labeled bounding boxes, then publish the
# final LDA model to the Hugging Face hub via skops.
from pathlib import Path

import caveclient as cc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer
from skops.io import dump

client = cc.CAVEclient("minnie65_phase3_v1")

out_path = Path("./troglobyte-sandbox/models/")
model_name = "local_compartment_classifier_bd_boxes"

data_path = Path("./troglobyte-sandbox/data/bounding_box_labels")
files = list(data_path.glob("*.csv"))

# %%
# Labels are indexed by (bbox_id, root_id). pandas appends ".1" to duplicated
# column names on read; strip that suffix back off.
label_df = pd.read_csv(out_path / model_name / "labels.csv", index_col=[0, 1])
label_df = label_df.rename(columns=lambda x: x.replace(".1", ""))

# # %%
# X_df = wrangler.features_.copy()
# X_df = X_df.drop(columns=[col for col in X_df.columns if "rep_coord" in col])

# %%
# Level-2 node features; one index level is "object_id" (the root id) — the
# other level is presumably the level-2 node id (TODO confirm).
X_df = pd.read_csv(out_path / model_name / "features_new.csv", index_col=[0, 1])


# %%
def box_train_test_split(
    train_box_indices, test_box_indices, X_df, label_df, label_column
):
    """Split level-2 features and labels into train/test sets by bounding box.

    Parameters
    ----------
    train_box_indices, test_box_indices : array-like of int
        0-based box indices; `label_df`'s "bbox_id" level is 1-based, hence
        the `+ 1` below.
    X_df : pd.DataFrame
        Level-2 node features with an "object_id" index level.
    label_df : pd.DataFrame
        Labels indexed by (bbox_id, root_id), with a "root_id" column.
    label_column : str
        Which label column to use (e.g. "axon_label" or "simple_label").

    Returns
    -------
    train_X_df, test_X_df : pd.DataFrame
        Feature rows with NaNs and unlabeled nodes dropped.
    train_l2_y, test_l2_y : np.ndarray of str
        Per-level-2-node labels aligned to the feature rows.
    """
    train_label_df = label_df.loc[train_box_indices + 1].droplevel("bbox_id")
    test_label_df = label_df.loc[test_box_indices + 1].droplevel("bbox_id")
    train_X_df = X_df.loc[train_label_df["root_id"]]
    test_X_df = X_df.loc[test_label_df["root_id"]]
    train_X_df = train_X_df.dropna()
    test_X_df = test_X_df.dropna()
    # Each level-2 node inherits the label of its object (root).
    train_l2_y = train_X_df.index.get_level_values("object_id").map(
        train_label_df[label_column]
    )
    test_l2_y = test_X_df.index.get_level_values("object_id").map(
        test_label_df[label_column]
    )
    # TODO do something more fair here w/ evaluation on the uncertains
    train_X_df = train_X_df.loc[train_l2_y.notna()]
    train_l2_y = train_l2_y[train_l2_y.notna()].values.astype(str)
    test_X_df = test_X_df.loc[test_l2_y.notna()]
    test_l2_y = test_l2_y[test_l2_y.notna()].values.astype(str)
    return train_X_df, test_X_df, train_l2_y, test_l2_y


def aggregate_votes_by_object(X_df, l2_node_predictions):
    """Aggregate level-2 node predictions to one label per object by vote.

    Objects with 3 or fewer node-level predictions are dropped as having
    insufficient data.

    Returns
    -------
    max_predictions : pd.DataFrame
        Indexed by object_id; winning "label", its vote "count", and the
        "proportion" of votes it received.
    object_prediction_probs : pd.DataFrame
        Per-object vote proportions over all labels (rows sum to 1).
    """
    l2_node_predictions = pd.Series(
        index=X_df.index, data=l2_node_predictions, name="label"
    )
    object_prediction_counts = (
        l2_node_predictions.groupby(level="object_id").value_counts().to_frame()
    )
    object_n_predictions = object_prediction_counts.groupby("object_id").sum()
    # Require more than 3 node-level votes before trusting an object call.
    sufficient_data_index = object_n_predictions.query("count > 3").index
    object_prediction_counts = object_prediction_counts.loc[sufficient_data_index]
    object_prediction_probs = object_prediction_counts.unstack(fill_value=0)
    object_prediction_probs = object_prediction_probs.div(
        object_prediction_probs.sum(axis=1), axis=0
    )
    object_prediction_counts.reset_index(drop=False, inplace=True)
    max_locs = object_prediction_counts.groupby("object_id")["count"].idxmax()
    max_predictions = object_prediction_counts.loc[max_locs]
    max_predictions["proportion"] = (
        max_predictions["count"]
        / object_n_predictions.loc[max_predictions["object_id"]]["count"].values
    )
    max_predictions = max_predictions.set_index("object_id")
    return max_predictions, object_prediction_probs


# %%
# models to evaluate
def get_lda(n_classes):
    """Quantile-normalize features, then LDA with `n_classes - 1` components."""
    lda = Pipeline(
        [
            ("transformer", QuantileTransformer(output_distribution="normal")),
            ("lda", LinearDiscriminantAnalysis(n_components=n_classes - 1)),
        ]
    )
    return lda


rf = RandomForestClassifier(n_estimators=500, max_depth=4)

box_indices = np.arange(1, 4)

# Leave-one-box-out cross-validation over the 3 labeled boxes.
rows = []
for fold, (train_box_indices, test_box_indices) in enumerate(
    KFold(n_splits=3).split(box_indices.reshape(-1, 1))
):
    for label_column in ["axon_label", "simple_label"]:
        train_X_df, test_X_df, train_l2_y, test_l2_y = box_train_test_split(
            train_box_indices, test_box_indices, X_df, label_df, label_column
        )
        n_classes = label_df[label_column].nunique()
        models = {"rf": rf, "lda": get_lda(n_classes)}
        for model_type, model in models.items():
            model.fit(train_X_df, train_l2_y)
            train_preds = model.predict(train_X_df)
            test_preds = model.predict(test_X_df)

            # evaluate at the L2 level
            train_report = classification_report(
                train_l2_y, train_preds, output_dict=True
            )
            rows.append(
                {
                    "model": model_type,
                    "fold": fold,
                    "accuracy": train_report["accuracy"],
                    "macro_f1": train_report["macro avg"]["f1-score"],
                    "weighted_f1": train_report["weighted avg"]["f1-score"],
                    "evaluation": "train",
                    "labeling": label_column,
                    "level": "level2",
                }
            )
            test_report = classification_report(
                test_l2_y, test_preds, output_dict=True
            )
            rows.append(
                {
                    "model": model_type,
                    "fold": fold,
                    "accuracy": test_report["accuracy"],
                    "macro_f1": test_report["macro avg"]["f1-score"],
                    "weighted_f1": test_report["weighted avg"]["f1-score"],
                    "evaluation": "test",
                    "labeling": label_column,
                    "level": "level2",
                }
            )

            # evaluate at the object level
            train_object_predictions, train_object_probs = aggregate_votes_by_object(
                train_X_df, train_preds
            )
            train_object_y = (
                label_df.droplevel(0)
                .loc[train_object_predictions.index, label_column]
                .values.astype(str)
            )
            train_object_report = classification_report(
                train_object_y, train_object_predictions["label"], output_dict=True
            )
            rows.append(
                {
                    "model": model_type + "-vote",
                    "fold": fold,
                    "accuracy": train_object_report["accuracy"],
                    "macro_f1": train_object_report["macro avg"]["f1-score"],
                    "weighted_f1": train_object_report["weighted avg"]["f1-score"],
                    "evaluation": "train",
                    "labeling": label_column,
                    "level": "root",
                }
            )
            test_object_predictions, test_object_probs = aggregate_votes_by_object(
                test_X_df, test_preds
            )
            test_object_y = (
                label_df.droplevel(0)
                .loc[test_object_predictions.index, label_column]
                .values.astype(str)
            )
            test_object_report = classification_report(
                test_object_y, test_object_predictions["label"], output_dict=True
            )
            rows.append(
                {
                    "model": model_type + "-vote",
                    "fold": fold,
                    "accuracy": test_object_report["accuracy"],
                    # FIX: these two previously read from train_object_report
                    # (copy-paste bug), misreporting the test-set F1 scores.
                    "macro_f1": test_object_report["macro avg"]["f1-score"],
                    "weighted_f1": test_object_report["weighted avg"]["f1-score"],
                    "evaluation": "test",
                    "labeling": label_column,
                    "level": "root",
                }
            )

# %%
evaluation_df = pd.DataFrame(rows)

sns.set_context("talk")

fig, axs = plt.subplots(2, 3, figsize=(15, 10), constrained_layout=True, sharey="col")
for i, labeling in enumerate(["simple_label", "axon_label"]):
    for j, metric in enumerate(["accuracy", "weighted_f1", "macro_f1"]):
        ax = axs[i, j]
        show_legend = (i == 0) & (j == 0)
        sns.stripplot(
            data=evaluation_df.query("labeling == @labeling"),
            x="model",
            y=metric,
            hue="evaluation",
            ax=ax,
            legend=show_legend,
            s=10,
            jitter=True,
        )
        ax.spines[["right", "top"]].set_visible(False)
        if j == 1:
            ax.set_title("Labeling: " + labeling)

# %%
# NOTE(review): `model` is whatever was fit last in the CV loop above — the
# LDA pipeline, only because "lda" is last in the `models` dict. Fragile;
# consider fitting a dedicated LDA here instead.
lda = model
train_X_transformed = lda.transform(train_X_df)

# %%
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
sns.scatterplot(
    x=train_X_transformed[:, 0],
    y=train_X_transformed[:, 1],
    hue=train_l2_y,
    ax=ax,
    s=10,
    alpha=0.7,
)
ax.set(xticks=[], yticks=[], xlabel="LDA1", ylabel="LDA2")
ax.spines[["right", "top"]].set_visible(False)

# %%
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
sns.scatterplot(
    x=train_X_transformed[:, 0],
    y=train_X_transformed[:, 2],
    hue=train_l2_y,
    ax=ax,
    s=10,
    alpha=0.7,
)
ax.set(xticks=[], yticks=[], xlabel="LDA1", ylabel="LDA3")
ax.spines[["right", "top"]].set_visible(False)

# %%
# Fit the final model on all three boxes (empty test set).
# NOTE(review): `label_column` and `n_classes` are leftovers from the CV loop
# ("simple_label" on the last iteration) — confirm this is the intended
# labeling for the published model.
final_lda = Pipeline(
    [
        ("transformer", QuantileTransformer(output_distribution="normal")),
        ("lda", LinearDiscriminantAnalysis(n_components=n_classes - 1)),
    ]
)
train_X_df, test_X_df, train_l2_y, test_l2_y = box_train_test_split(
    np.array([0, 1, 2]), np.array([]), X_df, label_df, label_column
)
final_lda.fit(train_X_df, train_l2_y)
report = classification_report(
    train_l2_y, final_lda.predict(train_X_df), output_dict=True
)

# %%
# Split the sklearn report dict into overall metrics and per-class metrics.
report_table = pd.DataFrame(report).T
report_overall = report_table.loc[["accuracy", "macro avg", "weighted avg"]]
report_overall.index.name = "type"
report_overall.reset_index(inplace=True)
report_by_class = report_table.drop(index=["accuracy", "macro avg", "weighted avg"])
report_by_class.index.name = "class"
report_by_class.reset_index(inplace=True)

# %%
# FIX: this path was previously defined only inside a comment, but it is used
# by `hub_utils.init` below — define it for real (the dump itself stays
# commented out, as before).
model_pickle_file = out_path / model_name / f"{model_name}.skops"
# with open(model_pickle_file, mode="bw") as f:
#     dump(final_lda, file=f)

# %%
import os

from skops import card, hub_utils

hub_out_path = Path(
    "troglobyte-sandbox/models/local_compartment_classifier_bd_boxes/hub_model"
)
if not hub_out_path.exists():
    hub_utils.init(
        model=model_pickle_file,
        requirements=["scikit-learn", "caveclient"],
        dst=hub_out_path,
        task="tabular-classification",
        data=train_X_df,
    )
hub_utils.add_files(__file__, dst=hub_out_path, exist_ok=True)

# if True:
if not os.path.exists(hub_out_path / "README.md"):
    # FIX: describe the published model (`final_lda`), not the leftover
    # CV-loop model from above.
    model_card = card.Card(final_lda, metadata=card.metadata_from_config(hub_out_path))
    model_card.metadata.license = "mit"
    # FIX: added the missing spaces at the implicit-concatenation seams
    # (previously rendered "soma, orglia" and "features.The model").
    model_description = (
        "This is a model trained to classify pieces of neuron as axon, dendrite, "
        "soma, or glia, based only on their local shape and synapse features. "
        "The model is a linear discriminant classifier which was trained on "
        "compartment labels generated by Bethanny Danskin for 3 6x6x6 um boxes "
        "in the Minnie65 Phase3 dataset."
    )
    model_card_authors = "bdpedigo"
    model_card.add(
        model_card_authors=model_card_authors,
        model_description=model_description,
    )
    model_card.add_table(
        folded=False,
        **{
            "Classification Report (overall)": report_overall,
            "Classification Report (by class)": report_by_class,
        },
    )
    model_card.save(hub_out_path / "README.md")

from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    repo_id=f"bdpedigo/{model_name}",
    folder_path=hub_out_path,
    # filename=f"{model_name}.skops",
)

# hub_utils.push(
#     repo_id=f"bdpedigo/{model_name}",
#     source=hub_out_path,
#     create_remote=False,
#     private=False,
# )

# %%
# Refit without synapse features to gauge how much they contribute.
syn_features = [col for col in X_df.columns if "syn" in col]
train_X_df_no_syn = train_X_df.drop(columns=syn_features)
final_lda_no_syn = Pipeline(
    [
        ("transformer", QuantileTransformer(output_distribution="normal")),
        ("lda", LinearDiscriminantAnalysis(n_components=n_classes - 1)),
    ]
)
final_lda_no_syn.fit(train_X_df_no_syn, train_l2_y)
print(classification_report(train_l2_y, final_lda_no_syn.predict(train_X_df_no_syn)))

with open(out_path / model_name / f"{model_name}_no_syn.skops", mode="bw") as f:
    dump(final_lda_no_syn, file=f)