jfaustin committed on
Commit d861d5c · 2 Parent(s): 24f13c2 6354ea8

Merge remote-tracking branch 'origin/main' into pr/12

folding_studio_demo/app.py CHANGED
@@ -39,8 +39,8 @@ MOLECULE_REPS = [


  MODEL_CHOICES = [
- # ("AlphaFold2", FoldingModel.AF2),
- # ("OpenFold", FoldingModel.OPENFOLD),
  # ("SoloSeq", FoldingModel.SOLOSEQ),
  ("Boltz-1", FoldingModel.BOLTZ),
  ("Chai-1", FoldingModel.CHAI),
@@ -49,6 +49,15 @@ MODEL_CHOICES = [
49
 
50
  DEFAULT_SEQ = "MALWMRLLPLLALLALWGPDPAAA"
51
  MODEL_EXAMPLES = {
52
  FoldingModel.BOLTZ: [
53
  ["Monomer", f">A|protein\n{DEFAULT_SEQ}"],
54
  ["Multimer", f">A|protein\n{DEFAULT_SEQ}\n>B|protein\n{DEFAULT_SEQ}"],
@@ -70,27 +79,31 @@ def sequence_input(dropdown: gr.Dropdown | None = None) -> gr.Textbox:
70
  Returns:
71
  gr.Textbox: Sequence input component
72
  """
73
- sequence = gr.Textbox(
74
- label="Protein Sequence",
75
- lines=2,
76
- placeholder="Enter a protein sequence or upload a FASTA file",
77
- )
78
- dummy = gr.Textbox(label="Complex type", visible=False)
79
 
80
- examples = gr.Examples(
81
- examples=MODEL_EXAMPLES[FoldingModel.BOLTZ],
82
- inputs=[dummy, sequence],
83
- )
84
  if dropdown is not None:
85
  dropdown.change(
86
  fn=lambda x: gr.Dataset(samples=MODEL_EXAMPLES[x]),
87
  inputs=[dropdown],
88
  outputs=[examples.dataset],
89
  )
90
- file_input = gr.File(
91
- label="Upload a FASTA file",
92
- file_types=[".fasta", ".fa"],
93
- )
94
 
95
  def _process_file(file: gr.File | None) -> gr.Textbox:
96
  if file is None:
@@ -115,7 +128,7 @@ def simple_prediction(api_key: str) -> None:
115
  """
116
  gr.Markdown(
117
  """
118
- ### Predict a Protein Structure
119
 
120
  It will be run in the background and the results will be displayed in the output section.
121
  The output will contain the protein structure and the pLDDT plot.
@@ -157,7 +170,19 @@ def model_comparison(api_key: str) -> None:
157
  Args:
158
  api_key (str): Folding Studio API key
159
  """
161
  with gr.Row():
162
  models = gr.CheckboxGroup(
163
  label="Model",
@@ -176,6 +201,9 @@ def model_comparison(api_key: str) -> None:
176
  variant="primary",
177
  )
178
  with gr.Row():
179
  chai_predictions = gr.CheckboxGroup(label="Chai", visible=False)
180
  protenix_predictions = gr.CheckboxGroup(label="Protenix", visible=False)
181
  boltz_predictions = gr.CheckboxGroup(label="Boltz", visible=False)
@@ -186,28 +214,50 @@ def model_comparison(api_key: str) -> None:
186
  metrics_plot = gr.Plot(label="pLDDT")
187
 
188
  # Store the initial predictions
189
- aligned_paths = gr.State()
190
- plddt_fig = gr.State()
191
 
192
  predict_btn.click(
193
  fn=predict_comparison,
194
  inputs=[sequence, api_key, models],
195
  outputs=[
196
  chai_predictions,
197
  boltz_predictions,
198
  protenix_predictions,
199
- aligned_paths,
200
- plddt_fig,
201
  ],
 
202
  )
203
 
204
  # Handle checkbox changes
205
- for checkbox in [chai_predictions, boltz_predictions, protenix_predictions]:
206
  checkbox.change(
207
  fn=filter_predictions,
208
  inputs=[
209
- aligned_paths,
210
- plddt_fig,
 
 
211
  chai_predictions,
212
  boltz_predictions,
213
  protenix_predictions,
@@ -242,63 +292,64 @@ def create_correlation_tab():
242
  "antigen_sequence": "Antigen Sequence",
243
  }
244
  spr_data_with_scores = spr_data_with_scores.rename(columns=prettified_columns)
245
- with gr.Row():
246
- columns = [
247
- "Antibody Name",
248
- "KD (nM)",
249
- "Antibody VH Sequence",
250
- "Antibody VL Sequence",
251
- "Antigen Sequence",
252
- ]
253
- # Display dataframe with floating point values rounded to 2 decimal places
254
- spr_data = gr.DataFrame(
255
- value=spr_data_with_scores[columns].round(2),
256
- label="Experimental Antibody-Antigen Binding Affinity Data",
257
- )
258
 
259
  gr.Markdown("# Prediction and correlation")
260
  with gr.Row():
261
- fake_predict_btn = gr.Button(
262
- "Predict structures of all complexes",
263
- elem_classes="gradient-button",
264
- variant="primary",
 
 
265
  )
 
266
  with gr.Row():
267
- prediction_dataframe = gr.Dataframe(label="Predicted Structures Data")
268
- with gr.Row():
269
- with gr.Row():
270
- correlation_type = gr.Radio(
271
- choices=["Spearman", "Pearson", "R²"],
272
- value="Spearman",
273
- label="Correlation Type",
274
- interactive=True,
275
  )
276
- with gr.Row():
277
- correlation_ranking_plot = gr.Plot(label="Correlation ranking")
278
- with gr.Row():
279
- with gr.Column():
280
- with gr.Row():
281
- # User can select the columns to display in the correlation plot
282
- correlation_column = gr.Dropdown(
283
- label="Score data to display",
284
- choices=SCORE_COLUMNS,
285
- multiselect=False,
286
- value=SCORE_COLUMNS[0],
287
- )
288
- # Add checkbox for log scale and update plot when either input changes
289
- with gr.Row():
290
- log_scale = gr.Checkbox(
291
- label="Display x-axis on logarithmic scale", value=False
292
- )
293
- with gr.Row():
294
- score_description = gr.Markdown(
295
- get_score_description(correlation_column.value)
296
- )
297
- correlation_column.change(
298
- fn=lambda x: get_score_description(x),
299
- inputs=correlation_column,
300
- outputs=score_description,
301
- )
302
  with gr.Column():
303
  regression_plot = gr.Plot(label="Correlation with binding affinity")
304
 
@@ -333,7 +384,7 @@ def create_correlation_tab():
333
 
334
  log_scale.change(
335
  fn=update_regression_plot,
336
- inputs=[correlation_column, log_scale],
337
  outputs=regression_plot,
338
  )
339
 
@@ -360,7 +411,7 @@ def __main__():
360
  )
361
  api_key = gr.Textbox(label="Folding Studio API Key", type="password")
362
  gr.Markdown("## Demo Usage")
363
- with gr.Tab("🚀 Simple Prediction"):
364
  simple_prediction(api_key)
365
  with gr.Tab("📊 Model Comparison"):
366
  model_comparison(api_key)
 


  MODEL_CHOICES = [
+ ("AlphaFold2", FoldingModel.AF2),
+ ("OpenFold", FoldingModel.OPENFOLD),
  # ("SoloSeq", FoldingModel.SOLOSEQ),
  ("Boltz-1", FoldingModel.BOLTZ),
  ("Chai-1", FoldingModel.CHAI),


  DEFAULT_SEQ = "MALWMRLLPLLALLALWGPDPAAA"
  MODEL_EXAMPLES = {
+ FoldingModel.AF2: [
+ ["Monomer", f">A\n{DEFAULT_SEQ}"],
+ ["Multimer", f">A\n{DEFAULT_SEQ}\n>B\n{DEFAULT_SEQ}"],
+ ],
+ FoldingModel.OPENFOLD: [
+ ["Monomer", f">A\n{DEFAULT_SEQ}"],
+ ["Multimer", f">A\n{DEFAULT_SEQ}\n>B\n{DEFAULT_SEQ}"],
+ ],
+ FoldingModel.SOLOSEQ: [["Monomer", f">A\n{DEFAULT_SEQ}"]],
  FoldingModel.BOLTZ: [
  ["Monomer", f">A|protein\n{DEFAULT_SEQ}"],
  ["Multimer", f">A|protein\n{DEFAULT_SEQ}\n>B|protein\n{DEFAULT_SEQ}"],
 
79
  Returns:
80
  gr.Textbox: Sequence input component
81
  """
82
+ with gr.Row(equal_height=True):
83
+ with gr.Column():
84
+ sequence = gr.Textbox(
85
+ label="Protein Sequence",
86
+ lines=2,
87
+ placeholder="Enter a protein sequence or upload a FASTA file",
88
+ )
89
+ dummy = gr.Textbox(label="Complex type", visible=False)
90
+
91
+ examples = gr.Examples(
92
+ examples=MODEL_EXAMPLES[FoldingModel.BOLTZ],
93
+ inputs=[dummy, sequence],
94
+ )
95
+ file_input = gr.File(
96
+ label="Upload a FASTA file",
97
+ file_types=[".fasta", ".fa"],
98
+ scale=0,
99
+ )
100
 
101
  if dropdown is not None:
102
  dropdown.change(
103
  fn=lambda x: gr.Dataset(samples=MODEL_EXAMPLES[x]),
104
  inputs=[dropdown],
105
  outputs=[examples.dataset],
106
  )
107
 
108
  def _process_file(file: gr.File | None) -> gr.Textbox:
109
  if file is None:
 
128
  """
129
  gr.Markdown(
130
  """
131
+ ## Predict a Protein Structure
132
 
133
  It will be run in the background and the results will be displayed in the output section.
134
  The output will contain the protein structure and the pLDDT plot.
 
170
  Args:
171
  api_key (str): Folding Studio API key
172
  """
173
+ gr.Markdown(
174
+ """
175
+ ## Compare Folding Models
176
 
177
+ Select multiple models to compare their predictions on your protein sequence.
178
+ You can either enter the sequence directly or upload a FASTA file.
179
+
180
+ The selected models will run in parallel and generate:
181
+ - 3D structures of your protein that you can visualize and compare
182
+ - pLDDT confidence scores plotted for each residue
183
+
184
+ """
185
+ )
186
  with gr.Row():
187
  models = gr.CheckboxGroup(
188
  label="Model",
 
201
  variant="primary",
202
  )
203
  with gr.Row():
204
+ af2_predictions = gr.CheckboxGroup(label="AlphaFold2", visible=False)
205
+ openfold_predictions = gr.CheckboxGroup(label="OpenFold", visible=False)
206
+ solo_predictions = gr.CheckboxGroup(label="SoloSeq", visible=False)
207
  chai_predictions = gr.CheckboxGroup(label="Chai", visible=False)
208
  protenix_predictions = gr.CheckboxGroup(label="Protenix", visible=False)
209
  boltz_predictions = gr.CheckboxGroup(label="Boltz", visible=False)
 
214
  metrics_plot = gr.Plot(label="pLDDT")
215
 
216
  # Store the initial predictions
217
+ prediction_outputs = gr.State()
 
218
 
219
  predict_btn.click(
220
  fn=predict_comparison,
221
  inputs=[sequence, api_key, models],
222
  outputs=[
223
+ prediction_outputs,
224
+ af2_predictions,
225
+ openfold_predictions,
226
+ solo_predictions,
227
+ chai_predictions,
228
+ boltz_predictions,
229
+ protenix_predictions,
230
+ ],
231
+ ).then(
232
+ fn=filter_predictions,
233
+ inputs=[
234
+ prediction_outputs,
235
+ af2_predictions,
236
+ openfold_predictions,
237
+ solo_predictions,
238
  chai_predictions,
239
  boltz_predictions,
240
  protenix_predictions,
 
 
241
  ],
242
+ outputs=[mol_outputs, metrics_plot],
243
  )
244
 
245
  # Handle checkbox changes
246
+ for checkbox in [
247
+ af2_predictions,
248
+ openfold_predictions,
249
+ solo_predictions,
250
+ chai_predictions,
251
+ boltz_predictions,
252
+ protenix_predictions,
253
+ ]:
254
  checkbox.change(
255
  fn=filter_predictions,
256
  inputs=[
257
+ prediction_outputs,
258
+ af2_predictions,
259
+ openfold_predictions,
260
+ solo_predictions,
261
  chai_predictions,
262
  boltz_predictions,
263
  protenix_predictions,
 
292
  "antigen_sequence": "Antigen Sequence",
293
  }
294
  spr_data_with_scores = spr_data_with_scores.rename(columns=prettified_columns)
295
+ columns = [
296
+ "Antibody Name",
297
+ "KD (nM)",
298
+ "Antibody VH Sequence",
299
+ "Antibody VL Sequence",
300
+ "Antigen Sequence",
301
+ ]
302
+ # Display dataframe with floating point values rounded to 2 decimal places
303
+ spr_data = gr.DataFrame(
304
+ value=spr_data_with_scores[columns].round(2),
305
+ label="Experimental Antibody-Antigen Binding Affinity Data",
306
+ )
 
307
 
308
  gr.Markdown("# Prediction and correlation")
309
+
310
+ fake_predict_btn = gr.Button(
311
+ "Predict structures of all complexes",
312
+ elem_classes="gradient-button",
313
+ variant="primary",
314
+ )
315
+ prediction_dataframe = gr.Dataframe(
316
+ label="Predicted Structures Data", visible=False
317
+ )
318
+ prediction_dataframe.change(
319
+ fn=lambda x: gr.Dataframe(x, visible=True),
320
+ inputs=[prediction_dataframe],
321
+ outputs=[prediction_dataframe],
322
+ )
323
  with gr.Row():
324
+ correlation_type = gr.Radio(
325
+ choices=["Spearman", "Pearson", "R²"],
326
+ value="Spearman",
327
+ label="Correlation Type",
328
+ interactive=True,
329
+ scale=0,
330
  )
331
+ correlation_ranking_plot = gr.Plot(label="Correlation ranking")
332
  with gr.Row():
333
+ with gr.Column(scale=0):
334
+ # User can select the columns to display in the correlation plot
335
+ correlation_column = gr.Dropdown(
336
+ label="Score data to display",
337
+ choices=SCORE_COLUMNS,
338
+ multiselect=False,
339
+ value=SCORE_COLUMNS[0],
340
+ )
341
+ # Add checkbox for log scale and update plot when either input changes
342
+ log_scale = gr.Checkbox(
343
+ label="Display x-axis on logarithmic scale", value=False
344
+ )
345
+ score_description = gr.Markdown(
346
+ get_score_description(correlation_column.value)
347
+ )
348
+ correlation_column.change(
349
+ fn=lambda x: get_score_description(x),
350
+ inputs=correlation_column,
351
+ outputs=score_description,
352
  )
353
  with gr.Column():
354
  regression_plot = gr.Plot(label="Correlation with binding affinity")
355
 
 
384
 
385
  log_scale.change(
386
  fn=update_regression_plot,
387
+ inputs=[correlation_column, log_scale],
388
  outputs=regression_plot,
389
  )
390
 
 
411
  )
412
  api_key = gr.Textbox(label="Folding Studio API Key", type="password")
413
  gr.Markdown("## Demo Usage")
414
+ with gr.Tab("🚀 Basic Folding"):
415
  simple_prediction(api_key)
416
  with gr.Tab("📊 Model Comparison"):
417
  model_comparison(api_key)
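
In the app.py changes above, the two separate gr.State holders (aligned_paths and plddt_fig) are replaced by a single prediction_outputs state, and the predict button chains a .then() call so filter_predictions runs as soon as predict_comparison has filled that state. A minimal, self-contained sketch of this Gradio pattern; the run/filter_items functions and component names are placeholders rather than the demo's real ones:

    import gradio as gr

    def run(n_items: float) -> dict:
        # Stand-in for predict_comparison: build a dict that lives in gr.State.
        return {i: f"item-{i}" for i in range(int(n_items))}

    def filter_items(store: dict | None, selected: list[int]) -> str:
        # Stand-in for filter_predictions: read the state plus the checkbox values.
        if not store:
            return ""
        return "\n".join(store[i] for i in selected if i in store)

    with gr.Blocks() as demo:
        n = gr.Slider(1, 5, value=3, step=1, label="How many items")
        run_btn = gr.Button("Run")
        choices = gr.CheckboxGroup(choices=[0, 1, 2, 3, 4], value=[0, 1, 2], label="Show")
        output = gr.Textbox(label="Filtered output")
        store = gr.State()  # one state object, like prediction_outputs above

        # Populate the state, then immediately filter it, mirroring
        # predict_btn.click(...).then(fn=filter_predictions, ...).
        run_btn.click(fn=run, inputs=[n], outputs=[store]).then(
            fn=filter_items, inputs=[store, choices], outputs=[output]
        )
        # Re-filter whenever a checkbox changes, like the checkbox.change loop.
        choices.change(fn=filter_items, inputs=[store, choices], outputs=[output])

    if __name__ == "__main__":
        demo.launch()
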
folding_studio_demo/correlate.py CHANGED
@@ -1,9 +1,10 @@
1
  import logging
2
- import pandas as pd
3
  from pathlib import Path
 
4
  import numpy as np
 
5
  import plotly.graph_objects as go
6
- from scipy.stats import spearmanr, pearsonr, linregress
7
 
8
  logger = logging.getLogger(__name__)
9
 
@@ -16,7 +17,7 @@ SCORE_COLUMN_NAMES = {
16
  "complex_pde_boltz": "Boltz Complex pDE",
17
  "complex_ipde_boltz": "Boltz Complex ipDE",
18
  "interchain_pae_monomer": "AlphaFold2 GapTrick Interchain PAE",
19
- "interface_pae_monomer": "AlphaFold2 GapTrick Interface PAE",
20
  "overall_pae_monomer": "AlphaFold2 GapTrick Overall PAE",
21
  "interface_plddt_monomer": "AlphaFold2 GapTrick Interface pLDDT",
22
  "average_plddt_monomer": "AlphaFold2 GapTrick Average pLDDT",
@@ -24,15 +25,16 @@ SCORE_COLUMN_NAMES = {
24
  "interface_ptm_monomer": "AlphaFold2 GapTrick Interface pTM",
25
  "interchain_pae_multimer": "AlphaFold2 Multimer Interchain PAE",
26
  "interface_pae_multimer": "AlphaFold2 Multimer Interface PAE",
27
- "overall_pae_multimer": "AlphaFold2 Multimer Overall PAE",
28
  "interface_plddt_multimer": "AlphaFold2 Multimer Interface pLDDT",
29
  "average_plddt_multimer": "AlphaFold2 Multimer Average pLDDT",
30
  "ptm_multimer": "AlphaFold2 Multimer pTM Score",
31
- "interface_ptm_multimer": "AlphaFold2 Multimer Interface pTM"
32
  }
33
 
34
  SCORE_COLUMNS = list(SCORE_COLUMN_NAMES.values())
35
 
 
36
  def get_score_description(score: str) -> str:
37
  descriptions = {
38
  "Boltz Confidence Score": "The Boltz model confidence score provides an overall assessment of prediction quality (0-1, higher is better).",
@@ -49,22 +51,24 @@ def get_score_description(score: str) -> str:
49
  "AlphaFold2 GapTrick Average pLDDT": "The AlphaFold2 GapTrick model average pLDDT provides the mean confidence across all residues in monomeric predictions (0-100, higher is better).",
50
  "AlphaFold2 GapTrick pTM Score": "The AlphaFold2 GapTrick model pTM score assesses overall fold accuracy in monomeric predictions (0-1, higher is better).",
51
  "AlphaFold2 GapTrick Interface pTM": "The AlphaFold2 GapTrick model interface pTM specifically evaluates accuracy of interface regions in monomeric predictions (0-1, higher is better).",
52
- "AlphaFold2 GapTrick Interchain PAE": "The AlphaFold2 GapTrick model interchain PAE estimates position errors between chains in multimeric predictions (lower is better).",
53
- "AlphaFold2 Multimer Interface PAE": "The AlphaFold2 Multimer model interface PAE estimates position errors specifically at interfaces in multimeric predictions (lower is better).",
54
  "AlphaFold2 Multimer Overall PAE": "The AlphaFold2 Multimer model overall PAE estimates position errors across the entire structure in multimeric predictions (lower is better).",
55
  "AlphaFold2 Multimer Interface pLDDT": "The AlphaFold2 Multimer model interface pLDDT measures confidence in interface region predictions for multimeric models (0-100, higher is better).",
56
  "AlphaFold2 Multimer Average pLDDT": "The AlphaFold2 Multimer model average pLDDT provides the mean confidence across all residues in multimeric predictions (0-100, higher is better).",
57
  "AlphaFold2 Multimer pTM Score": "The AlphaFold2 Multimer model pTM score assesses overall fold accuracy in multimeric predictions (0-1, higher is better).",
58
- "AlphaFold2 Multimer Interface pTM": "The AlphaFold2 Multimer model interface pTM specifically evaluates accuracy of interface regions in multimeric predictions (0-1, higher is better)."
59
  }
60
  return descriptions.get(score, "No description available for this score.")
61
 
62
- def compute_correlation_data(spr_data_with_scores: pd.DataFrame, score_cols: list[str]) -> pd.DataFrame:
63
  corr_data_file = Path("corr_data.csv")
64
  if corr_data_file.exists():
65
  logger.info(f"Loading correlation data from {corr_data_file}")
66
  return pd.read_csv(corr_data_file)
67
-
68
  corr_data = []
69
  spr_data_with_scores["log_kd"] = np.log10(spr_data_with_scores["KD (nM)"])
70
  kd_col = "KD (nM)"
@@ -74,53 +78,71 @@ def compute_correlation_data(spr_data_with_scores: pd.DataFrame, score_cols: lis
74
  corr_funcs["R²"] = linregress
75
  for correlation_type, corr_func in corr_funcs.items():
76
  for score_col in score_cols:
77
- logger.info(f"Computing {correlation_type} correlation between {score_col} and KD (nM)")
78
- res = corr_func(spr_data_with_scores[kd_col], spr_data_with_scores[score_col])
79
  logger.info(f"Correlation function: {corr_func}")
80
- correlation_value = res.rvalue**2 if correlation_type == "R²" else res.statistic
81
- corr_data.append({
82
- "correlation_type": correlation_type,
83
- "score": score_col,
84
- "correlation": correlation_value,
85
- "p-value": res.pvalue
86
- })
87
- logger.info(f"Correlation {correlation_type} between {score_col} and KD (nM): {correlation_value}")
88
 
89
  corr_data = pd.DataFrame(corr_data)
90
  # Find the lines in corr_data with NaN values and remove them
91
  corr_data = corr_data[corr_data["correlation"].notna()]
92
  # Sort correlation data by correlation value
93
- corr_data = corr_data.sort_values('correlation', ascending=True)
94
-
95
  corr_data.to_csv("corr_data.csv", index=False)
96
-
97
  return corr_data
98
 
99
- def plot_correlation_ranking(corr_data: pd.DataFrame, correlation_type: str) -> go.Figure:
100
  # Create bar plot of correlations
101
  data = corr_data[corr_data["correlation_type"] == correlation_type]
102
- corr_ranking_plot = go.Figure(data=[
103
- go.Bar(
104
- x=data["correlation"],
105
- y=data["score"],
106
- name=correlation_type,
107
- text=data["correlation"],
108
- orientation='h',
109
- hovertemplate="<i>Score:</i> %{y}<br><i>Correlation:</i> %{x:.3f}<br>"
110
- )
111
- ])
 
 
112
  corr_ranking_plot.update_layout(
113
  title="Correlation with Binding Affinity",
114
  yaxis_title="Score",
115
  xaxis_title=correlation_type,
116
  template="simple_white",
117
- showlegend=False
118
  )
119
  return corr_ranking_plot
120
 
121
- def fake_predict_and_correlate(spr_data_with_scores: pd.DataFrame, score_cols: list[str], main_cols: list[str]) -> tuple[pd.DataFrame, go.Figure]:
122
  """Fake predict structures of all complexes and correlate the results."""
123
-
124
  corr_data = compute_correlation_data(spr_data_with_scores, score_cols)
125
  corr_ranking_plot = plot_correlation_ranking(corr_data, "Spearman")
126
 
@@ -131,17 +153,20 @@ def fake_predict_and_correlate(spr_data_with_scores: pd.DataFrame, score_cols: l
131
 
132
  return spr_data_with_scores[cols_to_show].round(2), corr_ranking_plot, corr_plot
133
 
134
- def make_regression_plot(spr_data_with_scores: pd.DataFrame, score: str, use_log: bool) -> go.Figure:
135
  """Select the regression plot to display."""
136
  # corr_plot is a scatter plot of the regression between the binding affinity and each of the scores
137
- scatter = go.Scatter(
138
- x=spr_data_with_scores["KD (nM)"],
139
- y=spr_data_with_scores[score],
140
- name=f"Samples",
141
- mode='markers', # Only show markers/dots, no lines
142
- hovertemplate="<i>Score:</i> %{y}<br><i>KD:</i> %{x:.2f}<br>",
143
- marker=dict(color='#1f77b4') # Set color to match default first color
144
- )
145
  corr_plot = go.Figure(data=scatter)
146
  corr_plot.update_layout(
147
  xaxis_title="KD (nM)",
@@ -154,7 +179,7 @@ def make_regression_plot(spr_data_with_scores: pd.DataFrame, score: str, use_log
154
  xanchor="right",
155
  x=1,
156
  ),
157
- xaxis_type="log" if use_log else "linear" # Set x-axis to logarithmic scale
158
  )
159
  # compute the regression line
160
  if use_log:
@@ -162,23 +187,25 @@ def make_regression_plot(spr_data_with_scores: pd.DataFrame, score: str, use_log
162
  x_vals = np.log10(spr_data_with_scores["KD (nM)"])
163
  else:
164
  x_vals = spr_data_with_scores["KD (nM)"]
165
-
166
  # Fit line to data
167
  corr_line = np.polyfit(x_vals, spr_data_with_scores[score], 1)
168
-
169
  # Generate x points for line
170
  corr_line_x = np.linspace(min(x_vals), max(x_vals), 100)
171
  corr_line_y = corr_line[0] * corr_line_x + corr_line[1]
172
-
173
  # Convert back from log space if needed
174
  if use_log:
175
  corr_line_x = 10**corr_line_x
176
  # add the regression line to the plot
177
- corr_plot.add_trace(go.Scatter(
178
- x=corr_line_x,
179
- y=corr_line_y,
180
- mode='lines',
181
- name=f"Regression line",
182
- line=dict(color='#1f77b4') # Set same color as scatter points
183
- ))
184
- return corr_plot
 
 
 
1
  import logging
 
2
  from pathlib import Path
3
+
4
  import numpy as np
5
+ import pandas as pd
6
  import plotly.graph_objects as go
7
+ from scipy.stats import linregress, pearsonr, spearmanr
8
 
9
  logger = logging.getLogger(__name__)
10
 
 
17
  "complex_pde_boltz": "Boltz Complex pDE",
18
  "complex_ipde_boltz": "Boltz Complex ipDE",
19
  "interchain_pae_monomer": "AlphaFold2 GapTrick Interchain PAE",
20
+ "interface_pae_monomer": "AlphaFold2 GapTrick Interface PAE",
21
  "overall_pae_monomer": "AlphaFold2 GapTrick Overall PAE",
22
  "interface_plddt_monomer": "AlphaFold2 GapTrick Interface pLDDT",
23
  "average_plddt_monomer": "AlphaFold2 GapTrick Average pLDDT",
 
25
  "interface_ptm_monomer": "AlphaFold2 GapTrick Interface pTM",
26
  "interchain_pae_multimer": "AlphaFold2 Multimer Interchain PAE",
27
  "interface_pae_multimer": "AlphaFold2 Multimer Interface PAE",
28
+ "overall_pae_multimer": "AlphaFold2 Multimer Overall PAE",
29
  "interface_plddt_multimer": "AlphaFold2 Multimer Interface pLDDT",
30
  "average_plddt_multimer": "AlphaFold2 Multimer Average pLDDT",
31
  "ptm_multimer": "AlphaFold2 Multimer pTM Score",
32
+ "interface_ptm_multimer": "AlphaFold2 Multimer Interface pTM",
33
  }
34
 
35
  SCORE_COLUMNS = list(SCORE_COLUMN_NAMES.values())
36
 
37
+
38
  def get_score_description(score: str) -> str:
39
  descriptions = {
40
  "Boltz Confidence Score": "The Boltz model confidence score provides an overall assessment of prediction quality (0-1, higher is better).",
 
51
  "AlphaFold2 GapTrick Average pLDDT": "The AlphaFold2 GapTrick model average pLDDT provides the mean confidence across all residues in monomeric predictions (0-100, higher is better).",
52
  "AlphaFold2 GapTrick pTM Score": "The AlphaFold2 GapTrick model pTM score assesses overall fold accuracy in monomeric predictions (0-1, higher is better).",
53
  "AlphaFold2 GapTrick Interface pTM": "The AlphaFold2 GapTrick model interface pTM specifically evaluates accuracy of interface regions in monomeric predictions (0-1, higher is better).",
54
+ "AlphaFold2 Multimer Interface PAE": "The AlphaFold2 Multimer model interface PAE estimates position errors specifically at interfaces in multimeric predictions (lower is better).",
 
55
  "AlphaFold2 Multimer Overall PAE": "The AlphaFold2 Multimer model overall PAE estimates position errors across the entire structure in multimeric predictions (lower is better).",
56
  "AlphaFold2 Multimer Interface pLDDT": "The AlphaFold2 Multimer model interface pLDDT measures confidence in interface region predictions for multimeric models (0-100, higher is better).",
57
  "AlphaFold2 Multimer Average pLDDT": "The AlphaFold2 Multimer model average pLDDT provides the mean confidence across all residues in multimeric predictions (0-100, higher is better).",
58
  "AlphaFold2 Multimer pTM Score": "The AlphaFold2 Multimer model pTM score assesses overall fold accuracy in multimeric predictions (0-1, higher is better).",
59
+ "AlphaFold2 Multimer Interface pTM": "The AlphaFold2 Multimer model interface pTM specifically evaluates accuracy of interface regions in multimeric predictions (0-1, higher is better).",
60
  }
61
  return descriptions.get(score, "No description available for this score.")
62
 
63
+
64
+ def compute_correlation_data(
65
+ spr_data_with_scores: pd.DataFrame, score_cols: list[str]
66
+ ) -> pd.DataFrame:
67
  corr_data_file = Path("corr_data.csv")
68
  if corr_data_file.exists():
69
  logger.info(f"Loading correlation data from {corr_data_file}")
70
  return pd.read_csv(corr_data_file)
71
+
72
  corr_data = []
73
  spr_data_with_scores["log_kd"] = np.log10(spr_data_with_scores["KD (nM)"])
74
  kd_col = "KD (nM)"
 
78
  corr_funcs["R²"] = linregress
79
  for correlation_type, corr_func in corr_funcs.items():
80
  for score_col in score_cols:
81
+ logger.info(
82
+ f"Computing {correlation_type} correlation between {score_col} and KD (nM)"
83
+ )
84
+ res = corr_func(
85
+ spr_data_with_scores[kd_col], spr_data_with_scores[score_col]
86
+ )
87
  logger.info(f"Correlation function: {corr_func}")
88
+ correlation_value = (
89
+ res.rvalue**2 if correlation_type == "R²" else res.statistic
90
+ )
91
+ corr_data.append(
92
+ {
93
+ "correlation_type": correlation_type,
94
+ "score": score_col,
95
+ "correlation": correlation_value,
96
+ "p-value": res.pvalue,
97
+ }
98
+ )
99
+ logger.info(
100
+ f"Correlation {correlation_type} between {score_col} and KD (nM): {correlation_value}"
101
+ )
102
 
103
  corr_data = pd.DataFrame(corr_data)
104
  # Find the lines in corr_data with NaN values and remove them
105
  corr_data = corr_data[corr_data["correlation"].notna()]
106
  # Sort correlation data by correlation value
107
+ corr_data = corr_data.sort_values("correlation", ascending=True)
108
+
109
  corr_data.to_csv("corr_data.csv", index=False)
110
+
111
  return corr_data
112
 
113
+
114
+ def plot_correlation_ranking(
115
+ corr_data: pd.DataFrame, correlation_type: str
116
+ ) -> go.Figure:
117
  # Create bar plot of correlations
118
  data = corr_data[corr_data["correlation_type"] == correlation_type]
119
+ corr_ranking_plot = go.Figure(
120
+ data=[
121
+ go.Bar(
122
+ x=data["correlation"],
123
+ y=data["score"],
124
+ name=correlation_type,
125
+ text=data["correlation"],
126
+ orientation="h",
127
+ hovertemplate="<i>Score:</i> %{y}<br><i>Correlation:</i> %{x:.3f}<br>",
128
+ )
129
+ ]
130
+ )
131
  corr_ranking_plot.update_layout(
132
  title="Correlation with Binding Affinity",
133
  yaxis_title="Score",
134
  xaxis_title=correlation_type,
135
  template="simple_white",
136
+ showlegend=False,
137
  )
138
  return corr_ranking_plot
139
 
140
+
141
+ def fake_predict_and_correlate(
142
+ spr_data_with_scores: pd.DataFrame, score_cols: list[str], main_cols: list[str]
143
+ ) -> tuple[pd.DataFrame, go.Figure]:
144
  """Fake predict structures of all complexes and correlate the results."""
145
+
146
  corr_data = compute_correlation_data(spr_data_with_scores, score_cols)
147
  corr_ranking_plot = plot_correlation_ranking(corr_data, "Spearman")
148
 
 
153
 
154
  return spr_data_with_scores[cols_to_show].round(2), corr_ranking_plot, corr_plot
155
 
156
+
157
+ def make_regression_plot(
158
+ spr_data_with_scores: pd.DataFrame, score: str, use_log: bool
159
+ ) -> go.Figure:
160
  """Select the regression plot to display."""
161
  # corr_plot is a scatter plot of the regression between the binding affinity and each of the scores
162
+ scatter = go.Scatter(
163
+ x=spr_data_with_scores["KD (nM)"],
164
+ y=spr_data_with_scores[score],
165
+ name=f"Samples",
166
+ mode="markers", # Only show markers/dots, no lines
167
+ hovertemplate="<i>Score:</i> %{y}<br><i>KD:</i> %{x:.2f}<br>",
168
+ marker=dict(color="#1f77b4"), # Set color to match default first color
169
+ )
170
  corr_plot = go.Figure(data=scatter)
171
  corr_plot.update_layout(
172
  xaxis_title="KD (nM)",
 
179
  xanchor="right",
180
  x=1,
181
  ),
182
+ xaxis_type="log" if use_log else "linear", # Set x-axis to logarithmic scale
183
  )
184
  # compute the regression line
185
  if use_log:
 
187
  x_vals = np.log10(spr_data_with_scores["KD (nM)"])
188
  else:
189
  x_vals = spr_data_with_scores["KD (nM)"]
190
+
191
  # Fit line to data
192
  corr_line = np.polyfit(x_vals, spr_data_with_scores[score], 1)
193
+
194
  # Generate x points for line
195
  corr_line_x = np.linspace(min(x_vals), max(x_vals), 100)
196
  corr_line_y = corr_line[0] * corr_line_x + corr_line[1]
197
+
198
  # Convert back from log space if needed
199
  if use_log:
200
  corr_line_x = 10**corr_line_x
201
  # add the regression line to the plot
202
+ corr_plot.add_trace(
203
+ go.Scatter(
204
+ x=corr_line_x,
205
+ y=corr_line_y,
206
+ mode="lines",
207
+ name=f"Regression line",
208
+ line=dict(color="#1f77b4"), # Set same color as scatter points
209
+ )
210
+ )
211
+ return corr_plot
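
compute_correlation_data treats scipy's linregress like the two correlation functions and squares its rvalue to obtain R², while the Spearman and Pearson values come straight from the result's statistic attribute. A standalone sketch of that dispatch on synthetic data (the kd and score columns are made up for illustration; the attribute names assume scipy >= 1.9):

    import numpy as np
    import pandas as pd
    from scipy.stats import linregress, pearsonr, spearmanr

    rng = np.random.default_rng(0)
    df = pd.DataFrame({"kd": rng.lognormal(size=50)})
    df["score"] = -np.log10(df["kd"]) + rng.normal(scale=0.2, size=50)

    corr_funcs = {"Spearman": spearmanr, "Pearson": pearsonr, "R²": linregress}
    rows = []
    for name, func in corr_funcs.items():
        res = func(df["kd"], df["score"])
        # linregress exposes rvalue/pvalue; spearmanr and pearsonr expose statistic/pvalue
        value = res.rvalue**2 if name == "R²" else res.statistic
        rows.append({"correlation_type": name, "correlation": value, "p-value": res.pvalue})

    print(pd.DataFrame(rows).sort_values("correlation", ascending=True))
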
folding_studio_demo/model_fasta_validators.py CHANGED
@@ -248,15 +248,15 @@ class ChaiFastaValidator(BaseFastaValidator):
  )
  seen_names.add(name)
  # validate sequence format
- # sequence = str(record.seq).strip()
- # if (
- # entity_type in {EntityType.PEPTIDE, EntityType.PROTEIN}
- # and not get_entity_type(sequence) == entity_type
- # ):
- # return (
- # False,
- # f"CHAI Validation Error: Sequence type mismatch. Expected '{entity_type}' but found '{get_entity_type(sequence)}'",
- # )

  return True, None
 
  )
  seen_names.add(name)
  # validate sequence format
+ sequence = str(record.seq).strip()
+ if (
+ entity_type in {EntityType.PEPTIDE, EntityType.PROTEIN}
+ and not get_entity_type(sequence) == entity_type
+ ):
+ return (
+ False,
+ f"CHAI Validation Error: Sequence type mismatch. Expected '{entity_type}' but found '{get_entity_type(sequence)}'",
+ )

  return True, None
 
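The block re-enabled above makes the Chai validator check each FASTA record's declared entity type against what the sequence itself looks like, via the demo's get_entity_type and EntityType helpers. Those helpers are not reproduced here, so the following rough approximation substitutes a simple amino-acid-alphabet heuristic (looks_like_protein and PROTEIN_ONLY are stand-ins, not the demo's logic):

    from io import StringIO

    from Bio import SeqIO

    PROTEIN_ONLY = set("EFILPQ")  # residues that never appear in nucleotide alphabets

    def looks_like_protein(sequence: str) -> bool:
        # Heuristic stand-in for get_entity_type(sequence) == EntityType.PROTEIN.
        return any(char in PROTEIN_ONLY for char in sequence.upper())

    def validate(fasta_text: str) -> tuple[bool, str | None]:
        for record in SeqIO.parse(StringIO(fasta_text), "fasta"):
            declared = record.description.split("|")[1] if "|" in record.description else "protein"
            sequence = str(record.seq).strip()
            if declared == "protein" and not looks_like_protein(sequence):
                return False, f"Sequence type mismatch for record '{record.id}'"
        return True, None

    print(validate(">A|protein\nMALWMRLLPLLALLALWGPDPAAA"))  # (True, None)
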
folding_studio_demo/models.py CHANGED
@@ -1,17 +1,26 @@
1
  """Models for the Folding Studio API."""
2
 
 
3
  import logging
4
  import os
5
  from pathlib import Path
6
  from typing import Any
7
 
8
  import gradio as gr
9
  import numpy as np
 
10
  from folding_studio.client import Client
 
 
11
  from folding_studio.query import Query
12
  from folding_studio.query.boltz import BoltzQuery
13
  from folding_studio.query.chai import ChaiQuery
14
  from folding_studio.query.protenix import ProtenixQuery
 
 
15
 
16
  from folding_studio_demo.model_fasta_validators import (
17
  BaseFastaValidator,
@@ -20,15 +29,29 @@ from folding_studio_demo.model_fasta_validators import (
20
  ProtenixFastaValidator,
21
  )
22
 
23
  logger = logging.getLogger(__name__)
24
 
25
 
26
  class AF3Model:
27
- def __init__(
28
- self, api_key: str, model_name: str, query: Query, validator: BaseFastaValidator
29
- ):
30
  self.api_key = api_key
31
- self.model_name = model_name
32
  self.query = query
33
  self.validator = validator
34
 
@@ -116,8 +139,10 @@ class AF3Model:
116
 
117
 
118
  class ChaiModel(AF3Model):
 
 
119
  def __init__(self, api_key: str):
120
- super().__init__(api_key, "Chai", ChaiQuery, ChaiFastaValidator())
121
 
122
  def call(
123
  self, seq_file: Path | str, output_dir: Path, format_fasta: bool = False
@@ -158,8 +183,10 @@ class ChaiModel(AF3Model):
158
 
159
 
160
  class ProtenixModel(AF3Model):
 
 
161
  def __init__(self, api_key: str):
162
- super().__init__(api_key, "Protenix", ProtenixQuery, ProtenixFastaValidator())
163
 
164
  def call(
165
  self, seq_file: Path | str, output_dir: Path, format_fasta: bool = False
@@ -179,8 +206,10 @@ class ProtenixModel(AF3Model):
179
 
180
 
181
  class BoltzModel(AF3Model):
 
 
182
  def __init__(self, api_key: str):
183
- super().__init__(api_key, "Boltz", BoltzQuery, BoltzFastaValidator())
184
 
185
  def call(
186
  self, seq_file: Path | str, output_dir: Path, format_fasta: bool = False
@@ -205,3 +234,113 @@ class BoltzModel(AF3Model):
205
  }
206
  for cif_path in prediction_paths
207
  }
1
  """Models for the Folding Studio API."""
2
 
3
+ import json
4
  import logging
5
  import os
6
+ import sys
7
+ import time
8
+ from io import StringIO
9
  from pathlib import Path
10
  from typing import Any
11
 
12
  import gradio as gr
13
  import numpy as np
14
+ from folding_studio import single_job_prediction
15
  from folding_studio.client import Client
16
+ from folding_studio.commands.experiment import results as get_results
17
+ from folding_studio.commands.experiment import status as get_status
18
  from folding_studio.query import Query
19
  from folding_studio.query.boltz import BoltzQuery
20
  from folding_studio.query.chai import ChaiQuery
21
  from folding_studio.query.protenix import ProtenixQuery
22
+ from folding_studio_data_models import AF2Parameters, OpenFoldParameters
23
+ from folding_studio_data_models.parameters.base import BaseFoldingParameters
24
 
25
  from folding_studio_demo.model_fasta_validators import (
26
  BaseFastaValidator,
 
29
  ProtenixFastaValidator,
30
  )
31
 
32
+
33
+ class Capturing(list):
34
+ """Capture stdout output."""
35
+
36
+ def __enter__(self):
37
+ self._stdout = sys.stdout
38
+ sys.stdout = self._stringio = StringIO()
39
+ return self
40
+
41
+ def __exit__(self, *args):
42
+ self.extend(self._stringio.getvalue().splitlines())
43
+ del self._stringio # free up some memory
44
+ sys.stdout = self._stdout
45
+
46
+
47
  logger = logging.getLogger(__name__)
48
 
49
 
50
  class AF3Model:
51
+ model_name = None
52
+
53
+ def __init__(self, api_key: str, query: Query, validator: BaseFastaValidator):
54
  self.api_key = api_key
 
55
  self.query = query
56
  self.validator = validator
57
 
 
139
 
140
 
141
  class ChaiModel(AF3Model):
142
+ model_name = "Chai"
143
+
144
  def __init__(self, api_key: str):
145
+ super().__init__(api_key, ChaiQuery, ChaiFastaValidator())
146
 
147
  def call(
148
  self, seq_file: Path | str, output_dir: Path, format_fasta: bool = False
 
183
 
184
 
185
  class ProtenixModel(AF3Model):
186
+ model_name = "Protenix"
187
+
188
  def __init__(self, api_key: str):
189
+ super().__init__(api_key, ProtenixQuery, ProtenixFastaValidator())
190
 
191
  def call(
192
  self, seq_file: Path | str, output_dir: Path, format_fasta: bool = False
 
206
 
207
 
208
  class BoltzModel(AF3Model):
209
+ model_name = "Boltz"
210
+
211
  def __init__(self, api_key: str):
212
+ super().__init__(api_key, BoltzQuery, BoltzFastaValidator())
213
 
214
  def call(
215
  self, seq_file: Path | str, output_dir: Path, format_fasta: bool = False
 
234
  }
235
  for cif_path in prediction_paths
236
  }
237
+
238
+
239
+ class OldModel:
240
+ model_name = None
241
+
242
+ def __init__(self, api_key: str):
243
+ self.api_key = api_key
244
+
245
+ def call(
246
+ self,
247
+ seq_file: Path | str,
248
+ output_dir: Path,
249
+ parameters: BaseFoldingParameters,
250
+ *args,
251
+ **kwargs,
252
+ ) -> None:
253
+ """Predict protein structure from amino acid sequence using AF2 model.
254
+
255
+ Args:
256
+ seq_file (Path | str): Path to FASTA file containing amino acid sequence
257
+ output_dir (Path): Path to output directory
258
+ """
259
+ output = single_job_prediction(
260
+ fasta_file=seq_file,
261
+ parameters=parameters,
262
+ )
263
+ experiment_id = output["message"]["experiment_id"]
264
+ done = False
265
+ while not done:
266
+ with Capturing() as output:
267
+ get_status(experiment_id)
268
+ status = output[0]
269
+ logger.info(f"Experiment {experiment_id} status: {status}")
270
+ if status == "Done":
271
+ done = True
272
+ logger.info("Downloading results")
273
+ get_results(
274
+ experiment_id,
275
+ force=True,
276
+ unzip=True,
277
+ output=output_dir / "results.zip",
278
+ )
279
+ logger.info("Results downloaded to %s", output_dir)
280
+ else:
281
+ logger.info("Sleeping for 10 seconds")
282
+ time.sleep(10)
283
+
284
+ def format_fasta(self, seq_file: Path | str) -> None:
285
+ """Format sequence to FASTA format.
286
+
287
+ Args:
288
+ seq_file (Path | str): Path to FASTA file
289
+ """
290
+ return
291
+
292
+ def predictions(self, output_dir: Path) -> dict[int, dict[str, Any]]:
293
+ """Get the path to the prediction.
294
+
295
+ Args:
296
+ output_dir (Path): Path to output directory
297
+
298
+ Returns:
299
+ dict[int, dict[str, Any]]: Dictionary mapping model indices to their prediction paths and metrics
300
+ """
301
+ prediction_paths = list(
302
+ (output_dir / "results").rglob("relaxed_model_[0-9]_ptm_pred_0.pdb")
303
+ )
304
+ metrics_path = output_dir / "results" / "metrics_per_model.json"
305
+ if not metrics_path.exists():
306
+ return {}
307
+ with open(metrics_path, "r") as f:
308
+ metrics = json.load(f)
309
+ output = {
310
+ int(pred_path.stem.split("_")[2]): {
311
+ "prediction_path": pred_path,
312
+ "metrics": metrics[f"model_{int(pred_path.stem.split('_')[2])}_ptm"],
313
+ }
314
+ for pred_path in prediction_paths
315
+ }
316
+ return output
317
+
318
+ def has_prediction(self, output_dir: Path) -> bool:
319
+ """Check if prediction exists in output directory."""
320
+ return len(self.predictions(output_dir)) > 0
321
+
322
+ def check_file_description(self, seq_file: Path | str) -> tuple[bool, str | None]:
323
+ """Check if the file description is correct.
324
+
325
+ Args:
326
+ seq_file (Path | str): Path to FASTA file
327
+
328
+ Returns:
329
+ tuple[bool, str | None]: Tuple containing a boolean indicating if the format is correct and an error message if not
330
+ """
331
+
332
+ return True, None
333
+
334
+
335
+ class AF2Model(OldModel):
336
+ model_name = "AlphaFold2"
337
+
338
+ def call(self, seq_file: Path | str, output_dir: Path, *args, **kwargs) -> None:
339
+ super().call(seq_file, output_dir, AF2Parameters(), *args, **kwargs)
340
+
341
+
342
+ class OpenFoldModel(OldModel):
343
+ model_name = "OpenFold"
344
+
345
+ def call(self, seq_file: Path | str, output_dir: Path, *args, **kwargs) -> None:
346
+ super().call(seq_file, output_dir, OpenFoldParameters(), *args, **kwargs)
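
OldModel.call above polls the legacy experiment endpoints by wrapping the CLI-style status command in the new Capturing helper, because that command prints its result to stdout instead of returning it. A stripped-down version of the same capture-and-poll loop, with fake_status standing in for folding_studio.commands.experiment.status:

    import sys
    import time
    from io import StringIO

    class Capturing(list):
        """Capture anything printed to stdout inside the with-block."""

        def __enter__(self):
            self._stdout = sys.stdout
            sys.stdout = self._stringio = StringIO()
            return self

        def __exit__(self, *args):
            self.extend(self._stringio.getvalue().splitlines())
            del self._stringio  # free up some memory
            sys.stdout = self._stdout

    _calls = 0

    def fake_status(experiment_id: str) -> None:
        # Stand-in for get_status(experiment_id), which prints rather than returns.
        global _calls
        _calls += 1
        print("Done" if _calls >= 3 else "Running")

    def wait_until_done(experiment_id: str, poll_seconds: float = 0.1) -> None:
        while True:
            with Capturing() as output:
                fake_status(experiment_id)
            status = output[0]
            print(f"Experiment {experiment_id} status: {status}")
            if status == "Done":
                return
            time.sleep(poll_seconds)

    wait_until_done("exp-123")
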
folding_studio_demo/predict.py CHANGED
@@ -1,9 +1,11 @@
1
  """Predict protein structure using Folding Studio."""
2
 
 
3
  import hashlib
4
  import logging
5
  from io import StringIO
6
  from pathlib import Path
 
7
 
8
  import gradio as gr
9
  import numpy as np
@@ -12,7 +14,13 @@ from Bio import SeqIO
12
  from Bio.PDB import PDBIO, MMCIFParser, PDBParser, Superimposer
13
  from folding_studio_data_models import FoldingModel
14
 
15
- from folding_studio_demo.models import BoltzModel, ChaiModel, ProtenixModel
16
 
17
  logger = logging.getLogger(__name__)
18
 
@@ -85,20 +93,22 @@ def convert_cif_to_pdb(cif_path: str, pdb_path: str) -> None:
85
  def create_plddt_figure(
86
  plddt_vals: list[list[float]],
87
  model_name: str,
 
88
  residue_codes: list[list[str]] = None,
89
  ) -> go.Figure:
90
  """Create a plot of metrics."""
91
  plddt_traces = []
92
- for i, plddt_val in enumerate(plddt_vals):
 
93
  # Create hover text with residue codes if available
94
  if residue_codes and i < len(residue_codes):
95
  hover_text = [
96
- f"<i>pLDDT</i>: {plddt:.2f}<br><i>Residue:</i> {code} {idx}"
97
  for idx, (plddt, code) in enumerate(zip(plddt_val, residue_codes[i]))
98
  ]
99
  else:
100
  hover_text = [
101
- f"<i>pLDDT</i>: {plddt:.2f}<br><i>Residue index:</i> {idx}"
102
  for idx, plddt in enumerate(plddt_val)
103
  ]
104
 
@@ -108,7 +118,7 @@ def create_plddt_figure(
108
  y=plddt_val,
109
  hovertemplate="%{text}<extra></extra>",
110
  text=hover_text,
111
- name=f"{model_name} {i}",
112
  visible=True,
113
  )
114
  )
@@ -150,8 +160,19 @@ def _write_fasta_file(
150
  return seq_id, seq_file
151
 
152
 
153
- def extract_plddt_from_cif(cif_path):
154
155
 
156
  # Lists to store pLDDT values and residue codes
157
  plddt_values = []
@@ -206,6 +227,10 @@ def predict(
206
  model = ChaiModel(api_key)
207
  elif model_type == FoldingModel.PROTENIX:
208
  model = ProtenixModel(api_key)
209
  else:
210
  raise ValueError(f"Model {model_type} not supported")
211
 
@@ -235,22 +260,36 @@ def predict(
235
  progress(
236
  0.4 + (0.4 * i / total_predictions), desc=f"Converting model {model_idx}..."
237
  )
238
- cif_path = prediction["prediction_path"]
239
- logger.info(f"CIF file: {cif_path}")
240
-
241
- converted_pdb_path = str(
242
- output_dir / f"{model.model_name}_prediction_{model_idx}.pdb"
243
- )
244
- convert_cif_to_pdb(str(cif_path), str(converted_pdb_path))
245
- plddt_vals, residue_codes = extract_plddt_from_cif(cif_path)
246
- pdb_paths.append(converted_pdb_path)
 
 
247
  model_plddt_vals.append(plddt_vals)
248
  model_residue_codes.append(residue_codes)
249
 
250
  progress(0.8, desc="Generating plots...")
 
251
  plddt_fig = create_plddt_figure(
252
  plddt_vals=model_plddt_vals,
253
  model_name=model.model_name,
 
254
  residue_codes=model_residue_codes,
255
  )
256
 
@@ -258,11 +297,13 @@ def predict(
258
  return pdb_paths, plddt_fig
259
 
260
 
261
- def align_structures(pdb_paths: list[str]) -> list[str]:
 
 
262
  """Align multiple PDB structures to the first structure.
263
 
264
  Args:
265
- pdb_paths (list[str]): List of paths to PDB files to align
266
 
267
  Returns:
268
  list[str]: List of paths to aligned PDB files
@@ -271,39 +312,47 @@ def align_structures(pdb_paths: list[str]) -> list[str]:
271
  parser = PDBParser()
272
  io = PDBIO()
273
 
274
- # Parse the reference structure (first one)
275
- ref_structure = parser.get_structure("reference", pdb_paths[0])
276
  ref_atoms = [atom for atom in ref_structure.get_atoms() if atom.get_name() == "CA"]
277
 
278
- aligned_paths = [pdb_paths[0]] # First structure is already aligned
279
 
280
- # Align each subsequent structure to the reference
281
- for i, pdb_path in enumerate(pdb_paths[1:], start=1):
282
- # Parse the structure to align
283
- structure = parser.get_structure(f"model_{i}", pdb_path)
284
- atoms = [atom for atom in structure.get_atoms() if atom.get_name() == "CA"]
285
 
286
- # Create superimposer
287
- sup = Superimposer()
288
 
289
- # Set the reference and moving atoms
290
- sup.set_atoms(ref_atoms, atoms)
291
 
292
- # Apply the transformation to all atoms in the structure
293
- sup.apply(structure.get_atoms())
 
 
294
 
295
- # Save the aligned structure
296
- aligned_path = str(Path(pdb_path).parent / f"aligned_{Path(pdb_path).name}")
297
- io.set_structure(structure)
298
- io.save(aligned_path)
299
- aligned_paths.append(aligned_path)
300
 
301
- return aligned_paths
302
 
303
 
304
  def filter_predictions(
305
- aligned_paths: list[str],
306
- plddt_fig: go.Figure,
 
 
307
  chai_selected: list[int],
308
  boltz_selected: list[int],
309
  protenix_selected: list[int],
@@ -316,7 +365,7 @@ def filter_predictions(
316
  chai_selected (list[int]): Selected Chai model indices
317
  boltz_selected (list[int]): Selected Boltz model indices
318
  protenix_selected (list[int]): Selected Protenix model indices
319
- model_predictions (dict[FoldingModel, list[int]]): Dictionary mapping models to their prediction indices
320
 
321
  Returns:
322
  tuple[list[str], go.Figure]: Filtered PDB paths and updated pLDDT plot
@@ -325,26 +374,30 @@ def filter_predictions(
325
  filtered_fig = go.Figure()
326
 
327
  # Keep track of which traces to show
328
- visible_paths = []
329
 
330
  # Helper function to check if a trace should be visible
331
- def should_show_trace(trace_name: str) -> bool:
332
- model_name = trace_name.split()[0]
333
- model_idx = int(trace_name.split()[1])
334
-
335
- if model_name == "Chai" and model_idx in chai_selected:
336
  return True
337
- if model_name == "Boltz" and model_idx in boltz_selected:
338
  return True
339
- if model_name == "Protenix" and model_idx in protenix_selected:
340
  return True
341
  return False
342
 
343
  # Filter traces and paths
344
- for i, trace in enumerate(plddt_fig.data):
345
- if should_show_trace(trace.name):
346
- filtered_fig.add_trace(trace)
347
- visible_paths.append(aligned_paths[i])
 
348
 
349
  # Update layout
350
  filtered_fig.update_layout(
@@ -355,21 +408,58 @@ def filter_predictions(
355
  template="simple_white",
356
  legend=dict(yanchor="bottom", y=0.01, xanchor="left", x=0.99),
357
  )
 
358
 
359
- return visible_paths, filtered_fig
 
360
 
361
 
362
  def predict_comparison(
363
  sequence: str, api_key: str, model_types: list[FoldingModel], progress=gr.Progress()
364
  ) -> tuple[
365
- list[str],
366
- go.Figure,
 
 
367
  gr.CheckboxGroup,
368
  gr.CheckboxGroup,
369
  gr.CheckboxGroup,
370
- list[str],
371
- go.Figure,
372
- dict,
373
  ]:
374
  """Predict protein structure from amino acid sequence using multiple models.
375
 
@@ -381,68 +471,94 @@ def predict_comparison(
381
 
382
  Returns:
383
  tuple containing:
384
- - list[str]: Aligned PDB paths
385
- - go.Figure: pLDDT plot
 
 
386
  - gr.CheckboxGroup: Chai predictions checkbox group
387
  - gr.CheckboxGroup: Boltz predictions checkbox group
388
  - gr.CheckboxGroup: Protenix predictions checkbox group
389
- - list[str]: Original PDB paths
390
- - go.Figure: Original pLDDT plot
391
- - dict: Model predictions mapping
392
  """
393
  if not api_key:
394
  raise gr.Error("Missing API key, please enter a valid API key")
395
 
396
- # Set up unique output directory based on sequence hash
397
- pdb_paths = []
398
- plddt_traces = []
399
- total_models = len(model_types)
400
  model_predictions = {}
401
 
402
- for i, model_type in enumerate(model_types):
403
- progress(i / total_models, desc=f"Running {model_type} prediction...")
404
- model_pdb_paths, model_plddt_traces = predict(
405
- sequence, api_key, model_type, format_fasta=True
406
- )
407
- pdb_paths += model_pdb_paths
408
- plddt_traces += model_plddt_traces.data
409
- model_predictions[model_type] = [int(Path(p).stem[-1]) for p in model_pdb_paths]
410
 
411
  progress(0.9, desc="Aligning structures...")
412
- aligned_paths = align_structures(pdb_paths)
413
- plddt_fig = go.Figure(data=plddt_traces)
414
- plddt_fig.update_layout(
415
- title="pLDDT",
416
- xaxis_title="Residue index",
417
- yaxis_title="pLDDT",
418
- height=500,
419
- template="simple_white",
420
- legend=dict(yanchor="bottom", y=0.01, xanchor="left", x=0.99),
421
- )
422
 
423
  progress(1.0, desc="Done!")
424
 
425
  # Create checkbox groups for each model type
426
  chai_predictions = gr.CheckboxGroup(
427
  visible=model_predictions.get(FoldingModel.CHAI) is not None,
428
- choices=model_predictions.get(FoldingModel.CHAI, []),
429
- value=model_predictions.get(FoldingModel.CHAI, []),
430
  )
431
  boltz_predictions = gr.CheckboxGroup(
432
  visible=model_predictions.get(FoldingModel.BOLTZ) is not None,
433
- choices=model_predictions.get(FoldingModel.BOLTZ, []),
434
- value=model_predictions.get(FoldingModel.BOLTZ, []),
435
  )
436
  protenix_predictions = gr.CheckboxGroup(
437
  visible=model_predictions.get(FoldingModel.PROTENIX) is not None,
438
- choices=model_predictions.get(FoldingModel.PROTENIX, []),
439
- value=model_predictions.get(FoldingModel.PROTENIX, []),
440
  )
441
 
442
  return (
443
  chai_predictions,
444
  boltz_predictions,
445
  protenix_predictions,
446
- aligned_paths,
447
- plddt_fig,
448
  )
 
1
  """Predict protein structure using Folding Studio."""
2
 
3
+ import concurrent.futures
4
  import hashlib
5
  import logging
6
  from io import StringIO
7
  from pathlib import Path
8
+ from typing import Any
9
 
10
  import gradio as gr
11
  import numpy as np
 
14
  from Bio.PDB import PDBIO, MMCIFParser, PDBParser, Superimposer
15
  from folding_studio_data_models import FoldingModel
16
 
17
+ from folding_studio_demo.models import (
18
+ AF2Model,
19
+ BoltzModel,
20
+ ChaiModel,
21
+ OpenFoldModel,
22
+ ProtenixModel,
23
+ )
24
 
25
  logger = logging.getLogger(__name__)
26
 
 
93
  def create_plddt_figure(
94
  plddt_vals: list[list[float]],
95
  model_name: str,
96
+ indexes: list[int],
97
  residue_codes: list[list[str]] = None,
98
  ) -> go.Figure:
99
  """Create a plot of metrics."""
100
  plddt_traces = []
101
+
102
+ for i, (plddt_val, index) in enumerate(zip(plddt_vals, indexes)):
103
  # Create hover text with residue codes if available
104
  if residue_codes and i < len(residue_codes):
105
  hover_text = [
106
+ f"<i>{model_name} {index}</i><br><i>pLDDT</i>: {plddt:.2f}<br><i>Residue:</i> {code} {idx}"
107
  for idx, (plddt, code) in enumerate(zip(plddt_val, residue_codes[i]))
108
  ]
109
  else:
110
  hover_text = [
111
+ f"<i>{model_name} {index}</i><br><i>pLDDT</i>: {plddt:.2f}<br><i>Residue index:</i> {idx}"
112
  for idx, plddt in enumerate(plddt_val)
113
  ]
114
 
 
118
  y=plddt_val,
119
  hovertemplate="%{text}<extra></extra>",
120
  text=hover_text,
121
+ name=f"{model_name} {index}",
122
  visible=True,
123
  )
124
  )
 
160
  return seq_id, seq_file
161
 
162
 
163
+ def extract_plddt_from_structure(structure_path: str) -> tuple[list[float], list[str]]:
164
+ """Extract pLDDT values and residue codes from a structure file.
165
+
166
+ Args:
167
+ structure_path (Path): Path to structure file
168
+
169
+ Returns:
170
+ tuple[list[float], list[str]]: Tuple containing lists of pLDDT values and residue codes
171
+ """
172
+ if Path(structure_path).suffix == ".cif":
173
+ structure = MMCIFParser().get_structure("structure", structure_path)
174
+ else:
175
+ structure = PDBParser().get_structure("structure", structure_path)
176
 
177
  # Lists to store pLDDT values and residue codes
178
  plddt_values = []
 
227
  model = ChaiModel(api_key)
228
  elif model_type == FoldingModel.PROTENIX:
229
  model = ProtenixModel(api_key)
230
+ elif model_type == FoldingModel.AF2:
231
+ model = AF2Model(api_key)
232
+ elif model_type == FoldingModel.OPENFOLD:
233
+ model = OpenFoldModel(api_key)
234
  else:
235
  raise ValueError(f"Model {model_type} not supported")
236
 
 
260
  progress(
261
  0.4 + (0.4 * i / total_predictions), desc=f"Converting model {model_idx}..."
262
  )
263
+ prediction_path = prediction["prediction_path"]
264
+ logger.info(f"Prediction file: {prediction_path}")
265
+ if Path(prediction_path).suffix == ".cif":
266
+ converted_pdb_path = str(
267
+ output_dir / f"{model.model_name}_prediction_{model_idx}.pdb"
268
+ )
269
+ convert_cif_to_pdb(str(prediction_path), str(converted_pdb_path))
270
+ pdb_paths.append(converted_pdb_path)
271
+ else:
272
+ pdb_paths.append(str(prediction_path))
273
+ plddt_vals, residue_codes = extract_plddt_from_structure(prediction_path)
274
  model_plddt_vals.append(plddt_vals)
275
  model_residue_codes.append(residue_codes)
276
 
277
  progress(0.8, desc="Generating plots...")
278
+ indexes = []
279
+ for pdb_path in pdb_paths:
280
+ if model_type in [
281
+ FoldingModel.AF2,
282
+ FoldingModel.OPENFOLD,
283
+ FoldingModel.SOLOSEQ,
284
+ ]:
285
+ indexes.append(int(Path(pdb_path).stem.split("_")[2]))
286
+ else:
287
+ indexes.append(int(Path(pdb_path).stem[-1]))
288
+
289
  plddt_fig = create_plddt_figure(
290
  plddt_vals=model_plddt_vals,
291
  model_name=model.model_name,
292
+ indexes=indexes,
293
  residue_codes=model_residue_codes,
294
  )
295
 
 
297
  return pdb_paths, plddt_fig
298
 
299
 
300
+ def align_structures(
301
+ model_predictions: dict[FoldingModel, dict[int, dict[str, Any]]],
302
+ ) -> list[str]:
303
  """Align multiple PDB structures to the first structure.
304
 
305
  Args:
306
+ model_predictions (dict[FoldingModel, dict[int, dict[str, Any]]]): Dictionary mapping models to their prediction indices
307
 
308
  Returns:
309
  list[str]: List of paths to aligned PDB files
 
312
  parser = PDBParser()
313
  io = PDBIO()
314
 
315
+ # Get the first structure as reference
316
+ first_model = next(iter(model_predictions.keys()))
317
+ first_pred = next(iter(model_predictions[first_model].values()))
318
+ ref_pdb_path = first_pred["pdb_path"]
319
+
320
+ # Parse reference structure and get CA atoms
321
+ ref_structure = parser.get_structure("reference", ref_pdb_path)
322
  ref_atoms = [atom for atom in ref_structure.get_atoms() if atom.get_name() == "CA"]
323
 
324
+ for model_type in model_predictions.keys():
325
+ for index, prediction in model_predictions[model_type].items():
326
+ pdb_path = prediction["pdb_path"]
327
+
328
+ # Parse the structure to align
329
+ structure = parser.get_structure(f"{model_type}_{index}", pdb_path)
330
+ atoms = [atom for atom in structure.get_atoms() if atom.get_name() == "CA"]
331
 
332
+ # Create superimposer
333
+ sup = Superimposer()
334
 
335
+ # Set the reference and moving atoms
336
+ sup.set_atoms(ref_atoms, atoms)
337
 
338
+ # Apply the transformation to all atoms in the structure
339
+ sup.apply(structure.get_atoms())
340
 
341
+ # Save the aligned structure
342
+ aligned_path = str(Path(pdb_path).parent / f"aligned_{Path(pdb_path).name}")
343
+ io.set_structure(structure)
344
+ io.save(aligned_path)
345
 
346
+ model_predictions[model_type][index]["pdb_path"] = aligned_path
347
 
348
+ return model_predictions
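
align_structures now walks the whole model_predictions mapping and superimposes every structure onto the first prediction using Cα atoms only. A minimal Biopython sketch of that superposition step for a single pair of PDB files (the function name and file paths are illustrative, and, like the code above, it assumes both structures expose the same number of Cα atoms):

    from pathlib import Path

    from Bio.PDB import PDBIO, PDBParser, Superimposer

    def align_to_reference(ref_pdb: str, mobile_pdb: str) -> str:
        parser = PDBParser(QUIET=True)
        io = PDBIO()

        ref_structure = parser.get_structure("reference", ref_pdb)
        mobile_structure = parser.get_structure("mobile", mobile_pdb)

        # Superimpose on C-alpha atoms only, as align_structures does.
        ref_atoms = [a for a in ref_structure.get_atoms() if a.get_name() == "CA"]
        mobile_atoms = [a for a in mobile_structure.get_atoms() if a.get_name() == "CA"]

        sup = Superimposer()
        sup.set_atoms(ref_atoms, mobile_atoms)
        sup.apply(mobile_structure.get_atoms())
        print(f"RMSD after superposition: {sup.rms:.2f}")

        # Save next to the input, mirroring the aligned_ prefix used above.
        aligned_path = str(Path(mobile_pdb).parent / f"aligned_{Path(mobile_pdb).name}")
        io.set_structure(mobile_structure)
        io.save(aligned_path)
        return aligned_path
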
349
 
350
 
351
  def filter_predictions(
352
+ model_predictions: dict[FoldingModel, dict[int, dict[str, Any]]],
353
+ af2_selected: list[int],
354
+ openfold_selected: list[int],
355
+ solo_selected: list[int],
356
  chai_selected: list[int],
357
  boltz_selected: list[int],
358
  protenix_selected: list[int],
 
365
  chai_selected (list[int]): Selected Chai model indices
366
  boltz_selected (list[int]): Selected Boltz model indices
367
  protenix_selected (list[int]): Selected Protenix model indices
368
+ model_predictions (dict[FoldingModel, dict[int, dict[str, Any]]]): Dictionary mapping models to their prediction indices
369
 
370
  Returns:
371
  tuple[list[str], go.Figure]: Filtered PDB paths and updated pLDDT plot
 
374
  filtered_fig = go.Figure()
375
 
376
  # Keep track of which traces to show
377
+ filtered_paths = []
378
 
379
  # Helper function to check if a trace should be visible
380
+ def should_show_trace(model_name, pred_index: int) -> bool:
381
+ if model_name == FoldingModel.CHAI and pred_index in chai_selected:
382
+ return True
383
+ if model_name == FoldingModel.BOLTZ and pred_index in boltz_selected:
384
+ return True
385
+ if model_name == FoldingModel.PROTENIX and pred_index in protenix_selected:
386
+ return True
387
+ if model_name == FoldingModel.AF2 and pred_index in af2_selected:
388
  return True
389
+ if model_name == FoldingModel.OPENFOLD and pred_index in openfold_selected:
390
  return True
391
+ if model_name == FoldingModel.SOLOSEQ and pred_index in solo_selected:
392
  return True
393
  return False
394
 
395
  # Filter traces and paths
396
+ for model_type in model_predictions.keys():
397
+ for index, prediction in model_predictions[model_type].items():
398
+ if should_show_trace(model_type, index):
399
+ filtered_fig.add_trace(prediction["plddt_trace"])
400
+ filtered_paths.append(prediction["pdb_path"])
401
 
402
  # Update layout
403
  filtered_fig.update_layout(
 
408
  template="simple_white",
409
  legend=dict(yanchor="bottom", y=0.01, xanchor="left", x=0.99),
410
  )
411
+ return filtered_paths, filtered_fig
412
 
413
+
414
+ def run_prediction(
415
+ sequence: str,
416
+ api_key: str,
417
+ model_type: FoldingModel,
418
+ format_fasta: bool = False,
419
+ ) -> dict[FoldingModel, dict[int, dict[str, Any]]]:
420
+ """Run a single prediction.
421
+
422
+ Args:
423
+ sequence (str): Amino acid sequence to predict structure for
424
+ api_key (str): Folding API key
425
+ model_type (FoldingModel): Folding model to use
426
+ format_fasta (bool): Whether to format the FASTA file
427
+
428
+ Returns:
429
+ Tuple containing:
430
+ - List of PDB paths
431
+ - pLDDT plot
432
+ - Dictionary mapping model to prediction indices
433
+ """
434
+ model_pdb_paths, model_plddt_traces = predict(
435
+ sequence, api_key, model_type, format_fasta=format_fasta
436
+ )
437
+ model_pdb_paths = sorted(model_pdb_paths)
438
+ model_predictions = {}
439
+ for pdb_path, plddt_trace in zip(model_pdb_paths, model_plddt_traces.data):
440
+ if model_type in [
441
+ FoldingModel.AF2,
442
+ FoldingModel.OPENFOLD,
443
+ FoldingModel.SOLOSEQ,
444
+ ]:
445
+ index = int(Path(pdb_path).stem.split("_")[2])
446
+ else:
447
+ index = int(Path(pdb_path).stem[-1])
448
+
449
+ model_predictions[index] = {"pdb_path": pdb_path, "plddt_trace": plddt_trace}
450
+ return model_predictions
451
 
452
 
453
  def predict_comparison(
454
  sequence: str, api_key: str, model_types: list[FoldingModel], progress=gr.Progress()
455
  ) -> tuple[
456
+ dict[FoldingModel, dict[int, dict[str, Any]]],
457
+ gr.CheckboxGroup,
458
+ gr.CheckboxGroup,
459
+ gr.CheckboxGroup,
460
  gr.CheckboxGroup,
461
  gr.CheckboxGroup,
462
  gr.CheckboxGroup,
463
  ]:
464
  """Predict protein structure from amino acid sequence using multiple models.
465
 
 
471
 
472
  Returns:
473
  tuple containing:
474
+ - dict[FoldingModel, dict[int, dict[str, Any]]]: Model predictions mapping
475
+ - gr.CheckboxGroup: AF2 predictions checkbox group
476
+ - gr.CheckboxGroup: OpenFold predictions checkbox group
477
+ - gr.CheckboxGroup: SoloSeq predictions checkbox group
478
  - gr.CheckboxGroup: Chai predictions checkbox group
479
  - gr.CheckboxGroup: Boltz predictions checkbox group
480
  - gr.CheckboxGroup: Protenix predictions checkbox group
 
  """
482
  if not api_key:
483
  raise gr.Error("Missing API key, please enter a valid API key")
484
 
485
+ progress(0, desc="Starting parallel predictions...")
486
+
487
+ # Run predictions in parallel
 
488
  model_predictions = {}
489
 
490
+ with concurrent.futures.ThreadPoolExecutor() as executor:
491
+ # Create a future for each model prediction
492
+ future_to_model = {
493
+ executor.submit(
494
+ run_prediction, sequence, api_key, model_type, True
495
+ ): model_type
496
+ for model_type in model_types
497
+ }
498
+
499
+ # Process results as they complete
500
+ total_models = len(model_types)
501
+ completed = 0
502
+
503
+ for future in concurrent.futures.as_completed(future_to_model):
504
+ model_type = future_to_model[future]
505
+ try:
506
+ model_preds = future.result()
507
+ model_predictions[model_type] = model_preds
508
+
509
+ completed += 1
510
+ progress(
511
+ completed / total_models,
512
+ desc=f"Completed {model_type} prediction...",
513
+ )
514
+ except Exception as e:
515
+ logger.error(f"Prediction failed for {model_type}: {str(e)}")
516
+ raise gr.Error(f"Prediction failed for {model_type}: {str(e)}")
517
 
518
  progress(0.9, desc="Aligning structures...")
519
+
520
+ model_predictions = align_structures(model_predictions)
521
 
522
  progress(1.0, desc="Done!")
523
 
524
  # Create checkbox groups for each model type
525
+ af2_predictions = gr.CheckboxGroup(
526
+ visible=model_predictions.get(FoldingModel.AF2) is not None,
527
+ choices=list(model_predictions.get(FoldingModel.AF2, {}).keys()),
528
+ value=list(model_predictions.get(FoldingModel.AF2, {}).keys()),
529
+ )
530
+ openfold_predictions = gr.CheckboxGroup(
531
+ visible=model_predictions.get(FoldingModel.OPENFOLD) is not None,
532
+ choices=list(model_predictions.get(FoldingModel.OPENFOLD, {}).keys()),
533
+ value=list(model_predictions.get(FoldingModel.OPENFOLD, {}).keys()),
534
+ )
535
+ solo_predictions = gr.CheckboxGroup(
536
+ visible=model_predictions.get(FoldingModel.SOLOSEQ) is not None,
537
+ choices=list(model_predictions.get(FoldingModel.SOLOSEQ, {}).keys()),
538
+ value=list(model_predictions.get(FoldingModel.SOLOSEQ, {}).keys()),
539
+ )
540
  chai_predictions = gr.CheckboxGroup(
541
  visible=model_predictions.get(FoldingModel.CHAI) is not None,
542
+ choices=list(model_predictions.get(FoldingModel.CHAI, {}).keys()),
543
+ value=list(model_predictions.get(FoldingModel.CHAI, {}).keys()),
544
  )
545
  boltz_predictions = gr.CheckboxGroup(
546
  visible=model_predictions.get(FoldingModel.BOLTZ) is not None,
547
+ choices=list(model_predictions.get(FoldingModel.BOLTZ, {}).keys()),
548
+ value=list(model_predictions.get(FoldingModel.BOLTZ, {}).keys()),
549
  )
550
  protenix_predictions = gr.CheckboxGroup(
551
  visible=model_predictions.get(FoldingModel.PROTENIX) is not None,
552
+ choices=list(model_predictions.get(FoldingModel.PROTENIX, {}).keys()),
553
+ value=list(model_predictions.get(FoldingModel.PROTENIX, {}).keys()),
554
  )
555
 
556
  return (
557
+ model_predictions,
558
+ af2_predictions,
559
+ openfold_predictions,
560
+ solo_predictions,
561
  chai_predictions,
562
  boltz_predictions,
563
  protenix_predictions,
 
 
564
  )
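
predict_comparison now fans the selected models out over a ThreadPoolExecutor and gathers their results with as_completed, reporting progress per finished model and surfacing the first failure as a gr.Error. The shape of that pattern in isolation, with fake_run_prediction standing in for run_prediction(sequence, api_key, model_type, True):

    import concurrent.futures
    import random
    import time

    def fake_run_prediction(model_type: str) -> dict[int, str]:
        # Stand-in worker: pretend to fold and return index -> pdb path.
        time.sleep(random.uniform(0.1, 0.3))
        return {0: f"{model_type}_prediction_0.pdb", 1: f"{model_type}_prediction_1.pdb"}

    def predict_all(model_types: list[str]) -> dict[str, dict[int, str]]:
        model_predictions: dict[str, dict[int, str]] = {}
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_model = {
                executor.submit(fake_run_prediction, model_type): model_type
                for model_type in model_types
            }
            completed = 0
            for future in concurrent.futures.as_completed(future_to_model):
                model_type = future_to_model[future]
                try:
                    model_predictions[model_type] = future.result()
                except Exception as exc:
                    # The demo re-raises this as gr.Error to show it in the UI.
                    raise RuntimeError(f"Prediction failed for {model_type}: {exc}") from exc
                completed += 1
                print(f"Completed {model_type} ({completed}/{len(model_types)})")
        return model_predictions

    print(predict_all(["boltz", "chai", "af2"]))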