Spaces:

InstaDeepAI
/

folding-studio-demo

Running

App Files Files Community

Add more correlation metrics to correlation tab

by jfaustin - opened 5 days ago

base: refs/heads/main

←

from: refs/pr/8

Discussion Files changed

+140

-52

Files changed (2) hide show

folding_studio_demo/app.py +39 -10
folding_studio_demo/correlate.py +101 -42

folding_studio_demo/app.py CHANGED Viewed

@@ -9,8 +9,12 @@ from gradio_molecule3d import Molecule3D
 from folding_studio_demo.correlate import (
     SCORE_COLUMNS,
     fake_predict_and_correlate,
-    make_correlation_plot,
 )
 from folding_studio_demo.predict import predict, predict_comparison
@@ -107,7 +111,6 @@ def simple_prediction(api_key: str) -> None:
         elem_classes="gradient-button",
         elem_id="predict-btn",
         variant="primary",
-        # css=f".gradio-container #predict-btn {{background: linear-gradient(90deg, {BLUE}, {PURPLE});}}",
     )
     with gr.Row():
@@ -145,7 +148,6 @@ def model_comparison(api_key: str) -> None:
         elem_classes=["gradient-button"],
         elem_id="compare-models-btn",
         variant="primary",
-        # css=f".gradio-container #compare-models-btn {{background: linear-gradient(90deg, {BLUE}, {PURPLE});}}"
     )
     with gr.Row():
@@ -181,6 +183,7 @@ def create_correlation_tab():
         of binding strength.
     """)
     spr_data_with_scores = pd.read_csv("spr_af_scores_mapped.csv")
     prettified_columns = {
         "antibody_name": "Antibody Name",
         "KD (nM)": "KD (nM)",
@@ -209,12 +212,19 @@ def create_correlation_tab():
             "Predict structures of all complexes",
             elem_classes="gradient-button",
             variant="primary",
-            # css=f".gradio-container #fake-predict-btn {{background: linear-gradient(90deg, {BLUE}, {PURPLE});}}",
         )
     with gr.Row():
         prediction_dataframe = gr.Dataframe(label="Predicted Structures Data")
     with gr.Row():
-        correlation_ranking_plot = gr.Plot(label="Correlation ranking")
     with gr.Row():
         with gr.Column():
             with gr.Row():
@@ -225,6 +235,13 @@ def create_correlation_tab():
                 # Add checkbox for log scale and update plot when either input changes
             with gr.Row():
                 log_scale = gr.Checkbox(label="Display x-axis on logarithmic scale", value=False)
         with gr.Column():
             correlation_plot = gr.Plot(label="Correlation with binding affinity")
@@ -232,21 +249,33 @@ def create_correlation_tab():
         fn=lambda x: fake_predict_and_correlate(
             spr_data_with_scores, SCORE_COLUMNS, ["Antibody Name", "KD (nM)"]
         ),
-        inputs=None,
         outputs=[prediction_dataframe, correlation_ranking_plot, correlation_plot],
     )
-    def update_plot(score, use_log):
-        return make_correlation_plot(spr_data_with_scores, score, use_log)
     correlation_column.change(
-        fn=update_plot,
         inputs=[correlation_column, log_scale],
         outputs=correlation_plot,
     )
     log_scale.change(
-        fn=update_plot,
         inputs=[correlation_column, log_scale],
         outputs=correlation_plot,
     )

 from folding_studio_demo.correlate import (
     SCORE_COLUMNS,
+    SCORE_COLUMN_NAMES,
     fake_predict_and_correlate,
+    make_regression_plot,
+    compute_correlation_data,
+    plot_correlation_ranking,
+    get_score_description
 )
 from folding_studio_demo.predict import predict, predict_comparison
         elem_classes="gradient-button",
         elem_id="predict-btn",
         variant="primary",
     )
     with gr.Row():
         elem_classes=["gradient-button"],
         elem_id="compare-models-btn",
         variant="primary",
     )
     with gr.Row():
         of binding strength.
     """)
     spr_data_with_scores = pd.read_csv("spr_af_scores_mapped.csv")
+    spr_data_with_scores = spr_data_with_scores.rename(columns=SCORE_COLUMN_NAMES)
     prettified_columns = {
         "antibody_name": "Antibody Name",
         "KD (nM)": "KD (nM)",
             "Predict structures of all complexes",
             elem_classes="gradient-button",
             variant="primary",
         )
     with gr.Row():
         prediction_dataframe = gr.Dataframe(label="Predicted Structures Data")
     with gr.Row():
+        with gr.Row():
+            correlation_type = gr.Radio(
+                choices=["Spearman", "Pearson", "R²"],
+                value="Spearman",
+                label="Correlation Type",
+                interactive=True
+            )
+        with gr.Row():
+            correlation_ranking_plot = gr.Plot(label="Correlation ranking")
     with gr.Row():
         with gr.Column():
             with gr.Row():
                 # Add checkbox for log scale and update plot when either input changes
             with gr.Row():
                 log_scale = gr.Checkbox(label="Display x-axis on logarithmic scale", value=False)
+            with gr.Row():
+                score_description = gr.Markdown(get_score_description(correlation_column.value))
+                correlation_column.change(
+                    fn=lambda x: get_score_description(x),
+                    inputs=correlation_column,
+                    outputs=score_description
+                )
         with gr.Column():
             correlation_plot = gr.Plot(label="Correlation with binding affinity")
         fn=lambda x: fake_predict_and_correlate(
             spr_data_with_scores, SCORE_COLUMNS, ["Antibody Name", "KD (nM)"]
         ),
+        inputs=[correlation_type],
         outputs=[prediction_dataframe, correlation_ranking_plot, correlation_plot],
     )
+    def update_regression_plot(score, use_log):
+        return make_regression_plot(spr_data_with_scores, score, use_log)
+    def update_correlation_plot(correlation_type):
+        logger.info(f"Updating correlation plot for {correlation_type}")
+        corr_data = compute_correlation_data(spr_data_with_scores, SCORE_COLUMNS)
+        logger.info(f"Correlation data: {corr_data}")
+        return plot_correlation_ranking(corr_data, correlation_type)
     correlation_column.change(
+        fn=update_regression_plot,
         inputs=[correlation_column, log_scale],
         outputs=correlation_plot,
     )
+    correlation_type.change(
+        fn=update_correlation_plot,
+        inputs=[correlation_type],
+        outputs=correlation_ranking_plot,
+    )
     log_scale.change(
+        fn=update_regression_plot,
         inputs=[correlation_column, log_scale],
         outputs=correlation_plot,
     )

folding_studio_demo/correlate.py CHANGED Viewed

@@ -1,45 +1,90 @@
 import logging
 import pandas as pd
 import numpy as np
 import plotly.graph_objects as go
-from scipy.stats import spearmanr
 logger = logging.getLogger(__name__)
-SCORE_COLUMNS = [
-        "confidence_score_boltz",
-        "ptm_boltz",
-        "iptm_boltz",
-        "complex_plddt_boltz",
-        "complex_iplddt_boltz",
-        "complex_pde_boltz",
-        "complex_ipde_boltz",
-        "interchain_pae_monomer",
-        "interface_pae_monomer",
-        "overall_pae_monomer",
-        "interface_plddt_monomer",
-        "average_plddt_monomer",
-        "ptm_monomer",
-        "interface_ptm_monomer",
-        "interchain_pae_multimer",
-        "interface_pae_multimer",
-        "overall_pae_multimer",
-        "interface_plddt_multimer",
-        "average_plddt_multimer",
-        "ptm_multimer",
-        "interface_ptm_multimer"
-    ]
-def fake_predict_and_correlate(spr_data_with_scores: pd.DataFrame, score_cols: list[str], main_cols: list[str]) -> tuple[pd.DataFrame, go.Figure]:
-    """Fake predict structures of all complexes and correlate the results."""
     corr_data = []
     spr_data_with_scores["log_kd"] = np.log10(spr_data_with_scores["KD (nM)"])
     kd_col = "KD (nM)"
-    for score_col in score_cols:
-        logger.info(f"Computing correlation between {score_col} and KD (nM)")
-        res = spearmanr(spr_data_with_scores[kd_col], spr_data_with_scores[score_col])
-        corr_data.append({"score": score_col, "correlation": res.statistic, "p-value": res.pvalue})
-        logger.info(f"Correlation between {score_col} and KD (nM): {res.statistic}")
     corr_data = pd.DataFrame(corr_data)
     # Find the lines in corr_data with NaN values and remove them
@@ -47,34 +92,48 @@ def fake_predict_and_correlate(spr_data_with_scores: pd.DataFrame, score_cols: l
     # Sort correlation data by correlation value
     corr_data = corr_data.sort_values('correlation', ascending=True)
     # Create bar plot of correlations
     corr_ranking_plot = go.Figure(data=[
         go.Bar(
-            x=corr_data["correlation"],
-            y=corr_data["score"],
-            name="correlation",
             orientation='h',
             hovertemplate="<i>Score:</i> %{y}<br><i>Correlation:</i> %{x:.3f}<br>"
         )
     ])
     corr_ranking_plot.update_layout(
         title="Correlation with Binding Affinity",
-        yaxis_title="Score Type",
-        xaxis_title="Spearman Correlation",
         template="simple_white",
         showlegend=False
     )
     cols_to_show = main_cols[:]
     cols_to_show.extend(score_cols)
-    corr_plot = make_correlation_plot(spr_data_with_scores, score_cols[0], use_log=False)
     return spr_data_with_scores[cols_to_show].round(2), corr_ranking_plot, corr_plot
-def make_correlation_plot(spr_data_with_scores: pd.DataFrame, score: str, use_log: bool) -> go.Figure:
-    """Select the correlation plot to display."""
-    # corr_plot is a scatter plot of the correlation between the binding affinity and each of the scores
     scatter =  go.Scatter(
             x=spr_data_with_scores["KD (nM)"],
             y=spr_data_with_scores[score],
@@ -97,11 +156,11 @@ def make_correlation_plot(spr_data_with_scores: pd.DataFrame, score: str, use_lo
         ),
         xaxis_type="log" if use_log else "linear"  # Set x-axis to logarithmic scale
     )
-    # compute the correlation line
     corr_line = np.polyfit(spr_data_with_scores["KD (nM)"], spr_data_with_scores[score], 1)
     corr_line_x = np.linspace(min(spr_data_with_scores["KD (nM)"]), max(spr_data_with_scores["KD (nM)"]), 100)
     corr_line_y = corr_line[0] * corr_line_x + corr_line[1]
-    # add the correlation line to the plot
     corr_plot.add_trace(go.Scatter(
         x=corr_line_x,
         y=corr_line_y,

 import logging
 import pandas as pd
+from pathlib import Path
 import numpy as np
 import plotly.graph_objects as go
+from scipy.stats import spearmanr, pearsonr, linregress
 logger = logging.getLogger(__name__)
+SCORE_COLUMN_NAMES = {
+    "confidence_score_boltz": "Boltz Confidence Score",
+    "ptm_boltz": "Boltz pTM Score",
+    "iptm_boltz": "Boltz ipTM Score",
+    "complex_plddt_boltz": "Boltz Complex pLDDT",
+    "complex_iplddt_boltz": "Boltz Complex ipLDDT",
+    "complex_pde_boltz": "Boltz Complex pDE",
+    "complex_ipde_boltz": "Boltz Complex ipDE",
+    "interchain_pae_monomer": "AlphaFold2 GapTrick Interchain PAE",
+    "interface_pae_monomer": "AlphaFold2 GapTrick Interface PAE",
+    "overall_pae_monomer": "AlphaFold2 GapTrick Overall PAE",
+    "interface_plddt_monomer": "AlphaFold2 GapTrick Interface pLDDT",
+    "average_plddt_monomer": "AlphaFold2 GapTrick Average pLDDT",
+    "ptm_monomer": "AlphaFold2 GapTrick pTM Score",
+    "interface_ptm_monomer": "AlphaFold2 GapTrick Interface pTM",
+    "interchain_pae_multimer": "AlphaFold2 Multimer Interchain PAE",
+    "interface_pae_multimer": "AlphaFold2 Multimer Interface PAE",
+    "overall_pae_multimer": "AlphaFold2 Multimer Overall PAE",
+    "interface_plddt_multimer": "AlphaFold2 Multimer Interface pLDDT",
+    "average_plddt_multimer": "AlphaFold2 Multimer Average pLDDT",
+    "ptm_multimer": "AlphaFold2 Multimer pTM Score",
+    "interface_ptm_multimer": "AlphaFold2 Multimer Interface pTM"
+}
+SCORE_COLUMNS = list(SCORE_COLUMN_NAMES.values())
def get_score_description(score: str) -> str:
    """Return a one-sentence, human-readable description of a score column.

    Args:
        score: Display name of the score, i.e. one of the values of
            ``SCORE_COLUMN_NAMES`` (for example ``"Boltz pTM Score"``).

    Returns:
        The description for ``score``, or a generic fallback sentence when
        the name is not known.
    """
    # Every key must be unique: a repeated key in a dict literal silently
    # keeps only the last value, which hides one description and mislabels
    # another.
    descriptions = {
        "Boltz Confidence Score": "The Boltz model confidence score provides an overall assessment of prediction quality (0-1, higher is better).",
        "Boltz pTM Score": "The Boltz model predicted TM-score (pTM) assesses the overall fold accuracy of the predicted structure (0-1, higher is better).",
        "Boltz ipTM Score": "The Boltz model interface pTM score (ipTM) specifically evaluates the accuracy of interface regions (0-1, higher is better).",
        "Boltz Complex pLDDT": "The Boltz model Complex pLDDT measures confidence in local structure predictions across the entire complex (0-100, higher is better).",
        "Boltz Complex ipLDDT": "The Boltz model Complex interface pLDDT (ipLDDT) focuses on confidence in interface region predictions (0-100, higher is better).",
        "Boltz Complex pDE": "The Boltz model Complex predicted distance error (pDE) estimates the confidence in predicted distances between residues (0-1, higher is better).",
        "Boltz Complex ipDE": "The Boltz model Complex interface pDE (ipDE) estimates confidence in predicted distances specifically at interfaces (0-1, higher is better).",
        "AlphaFold2 GapTrick Interchain PAE": "The AlphaFold2 GapTrick model interchain predicted aligned error (PAE) estimates position errors between chains in monomeric predictions (lower is better).",
        "AlphaFold2 GapTrick Interface PAE": "The AlphaFold2 GapTrick model interface PAE estimates position errors specifically at interfaces in monomeric predictions (lower is better).",
        "AlphaFold2 GapTrick Overall PAE": "The AlphaFold2 GapTrick model overall PAE estimates position errors across the entire structure in monomeric predictions (lower is better).",
        "AlphaFold2 GapTrick Interface pLDDT": "The AlphaFold2 GapTrick model interface pLDDT measures confidence in interface region predictions for monomeric models (0-100, higher is better).",
        "AlphaFold2 GapTrick Average pLDDT": "The AlphaFold2 GapTrick model average pLDDT provides the mean confidence across all residues in monomeric predictions (0-100, higher is better).",
        "AlphaFold2 GapTrick pTM Score": "The AlphaFold2 GapTrick model pTM score assesses overall fold accuracy in monomeric predictions (0-1, higher is better).",
        "AlphaFold2 GapTrick Interface pTM": "The AlphaFold2 GapTrick model interface pTM specifically evaluates accuracy of interface regions in monomeric predictions (0-1, higher is better).",
        "AlphaFold2 Multimer Interchain PAE": "The AlphaFold2 Multimer model interchain PAE estimates position errors between chains in multimeric predictions (lower is better).",
        "AlphaFold2 Multimer Interface PAE": "The AlphaFold2 Multimer model interface PAE estimates position errors specifically at interfaces in multimeric predictions (lower is better).",
        "AlphaFold2 Multimer Overall PAE": "The AlphaFold2 Multimer model overall PAE estimates position errors across the entire structure in multimeric predictions (lower is better).",
        "AlphaFold2 Multimer Interface pLDDT": "The AlphaFold2 Multimer model interface pLDDT measures confidence in interface region predictions for multimeric models (0-100, higher is better).",
        "AlphaFold2 Multimer Average pLDDT": "The AlphaFold2 Multimer model average pLDDT provides the mean confidence across all residues in multimeric predictions (0-100, higher is better).",
        "AlphaFold2 Multimer pTM Score": "The AlphaFold2 Multimer model pTM score assesses overall fold accuracy in multimeric predictions (0-1, higher is better).",
        "AlphaFold2 Multimer Interface pTM": "The AlphaFold2 Multimer model interface pTM specifically evaluates accuracy of interface regions in multimeric predictions (0-1, higher is better).",
    }
    return descriptions.get(score, "No description available for this score.")
+def compute_correlation_data(spr_data_with_scores: pd.DataFrame, score_cols: list[str]) -> pd.DataFrame:
+    corr_data_file = Path("corr_data.csv")
+    if corr_data_file.exists():
+        logger.info(f"Loading correlation data from {corr_data_file}")
+        return pd.read_csv(corr_data_file)
     corr_data = []
     spr_data_with_scores["log_kd"] = np.log10(spr_data_with_scores["KD (nM)"])
     kd_col = "KD (nM)"
+    corr_funcs = {}
+    corr_funcs["Spearman"] = spearmanr
+    corr_funcs["Pearson"] = pearsonr
+    corr_funcs["R²"] = linregress
+    for correlation_type, corr_func in corr_funcs.items():
+        for score_col in score_cols:
+            logger.info(f"Computing {correlation_type} correlation between {score_col} and KD (nM)")
+            res = corr_func(spr_data_with_scores[kd_col], spr_data_with_scores[score_col])
+            logger.info(f"Correlation function: {corr_func}")
+            correlation_value = res.rvalue**2 if correlation_type == "R²" else res.statistic
+            corr_data.append({
+                "correlation_type": correlation_type,
+                "score": score_col,
+                "correlation": correlation_value,
+                "p-value": res.pvalue
+            })
+            logger.info(f"Correlation {correlation_type} between {score_col} and KD (nM): {correlation_value}")
     corr_data = pd.DataFrame(corr_data)
     # Find the lines in corr_data with NaN values and remove them
     # Sort correlation data by correlation value
     corr_data = corr_data.sort_values('correlation', ascending=True)
+    corr_data.to_csv("corr_data.csv", index=False)
+    return corr_data
def plot_correlation_ranking(corr_data: pd.DataFrame, correlation_type: str) -> go.Figure:
    """Build a horizontal bar chart of score correlations for one metric.

    Args:
        corr_data: Table with at least the columns ``correlation_type``,
            ``score`` and ``correlation``.
        correlation_type: Value of the ``correlation_type`` column to keep;
            also used as the trace name and the x-axis title.

    Returns:
        A figure with one horizontal bar per remaining row, in the row order
        of ``corr_data``.
    """
    # Keep only the rows computed with the requested metric.
    selected = corr_data.loc[corr_data["correlation_type"] == correlation_type]

    bars = go.Bar(
        x=selected["correlation"],
        y=selected["score"],
        name=correlation_type,
        text=selected["correlation"],
        orientation='h',
        hovertemplate="<i>Score:</i> %{y}<br><i>Correlation:</i> %{x:.3f}<br>",
    )

    figure = go.Figure(data=[bars])
    figure.update_layout(
        title="Correlation with Binding Affinity",
        yaxis_title="Score",
        xaxis_title=correlation_type,
        template="simple_white",
        showlegend=False,
    )
    return figure
def fake_predict_and_correlate(
    spr_data_with_scores: pd.DataFrame,
    score_cols: list[str],
    main_cols: list[str],
    correlation_type: str = "Spearman",
) -> tuple[pd.DataFrame, go.Figure, go.Figure]:
    """Fake predict structures of all complexes and correlate the results.

    Args:
        spr_data_with_scores: Table holding the ``main_cols`` and
            ``score_cols`` columns.
        score_cols: Score columns to correlate and to display; the first one
            is used for the initial regression plot.
        main_cols: Columns shown before the score columns in the returned
            table.
        correlation_type: Metric shown in the ranking plot. Defaults to
            ``"Spearman"``, the value that was previously hard-coded, so
            existing callers are unaffected.

    Returns:
        A 3-tuple ``(table, ranking_plot, regression_plot)``: the selected
        columns rounded to 2 decimals, the correlation ranking bar chart, and
        the regression plot for ``score_cols[0]`` on a linear x-axis.
    """
    corr_data = compute_correlation_data(spr_data_with_scores, score_cols)
    corr_ranking_plot = plot_correlation_ranking(corr_data, correlation_type)

    # Build a fresh list so the caller's ``main_cols`` is never modified.
    cols_to_show = [*main_cols, *score_cols]

    corr_plot = make_regression_plot(spr_data_with_scores, score_cols[0], use_log=False)
    return spr_data_with_scores[cols_to_show].round(2), corr_ranking_plot, corr_plot
+def make_regression_plot(spr_data_with_scores: pd.DataFrame, score: str, use_log: bool) -> go.Figure:
+    """Select the regression plot to display."""
+    # corr_plot is a scatter plot of the regression between the binding affinity and each of the scores
     scatter =  go.Scatter(
             x=spr_data_with_scores["KD (nM)"],
             y=spr_data_with_scores[score],
         ),
         xaxis_type="log" if use_log else "linear"  # Set x-axis to logarithmic scale
     )
+    # compute the regression line
     corr_line = np.polyfit(spr_data_with_scores["KD (nM)"], spr_data_with_scores[score], 1)
     corr_line_x = np.linspace(min(spr_data_with_scores["KD (nM)"]), max(spr_data_with_scores["KD (nM)"]), 100)
     corr_line_y = corr_line[0] * corr_line_x + corr_line[1]
+    # add the regression line to the plot
     corr_plot.add_trace(go.Scatter(
         x=corr_line_x,
         y=corr_line_y,