Spaces:

InstaDeepAI
/

folding-studio-demo

Running

App Files Files Community

jfaustin

AchilleSoulieID committed 3 days ago

Commit

b26b7a0

verified ·

1 Parent(s): e967c14

improve model comparison (#10)

Browse files

- improve model comparison (6187a6421d0fce2de6e37381ac15baa56c12ea39)

Co-authored-by: Achille Soulie <[email protected]>

Files changed (4) hide show

folding_studio_demo/app.py +85 -29
folding_studio_demo/model_fasta_validators.py +9 -9
folding_studio_demo/models.py +207 -0
folding_studio_demo/predict.py +240 -216

folding_studio_demo/app.py CHANGED Viewed

@@ -4,19 +4,20 @@ import logging
 import gradio as gr
 import pandas as pd
 from folding_studio_data_models import FoldingModel
 from gradio_molecule3d import Molecule3D
 from folding_studio_demo.correlate import (
-    SCORE_COLUMNS,
     SCORE_COLUMN_NAMES,
     fake_predict_and_correlate,
     make_regression_plot,
-    compute_correlation_data,
     plot_correlation_ranking,
-    get_score_description
 )
-from folding_studio_demo.predict import predict, predict_comparison
 logger = logging.getLogger(__name__)
@@ -24,8 +25,8 @@ logger = logging.getLogger(__name__)
 MOLECULE_REPS = [
     {
         "model": 0,
-        "chain": "",
-        "resname": "",
         "style": "cartoon",
         "color": "alphafold",
         # "residue_range": "",
@@ -36,7 +37,6 @@ MOLECULE_REPS = [
     }
 ]
-DEFAULT_PROTEIN_SEQ = ">protein description\nMALWMRLLPLLALLALWGPDPAAA"
 MODEL_CHOICES = [
     # ("AlphaFold2", FoldingModel.AF2),
@@ -47,8 +47,24 @@ MODEL_CHOICES = [
     ("Protenix", FoldingModel.PROTENIX),
 ]
-def sequence_input() -> gr.Textbox:
     """Sequence input component.
     Returns:
@@ -56,10 +72,21 @@ def sequence_input() -> gr.Textbox:
     """
     sequence = gr.Textbox(
         label="Protein Sequence",
-        value=DEFAULT_PROTEIN_SEQ,
         lines=2,
         placeholder="Enter a protein sequence or upload a FASTA file",
     )
     file_input = gr.File(
         label="Upload a FASTA file",
         file_types=[".fasta", ".fa"],
@@ -104,7 +131,7 @@ def simple_prediction(api_key: str) -> None:
             value=FoldingModel.BOLTZ,
         )
         with gr.Column():
-            sequence = sequence_input()
     predict_btn = gr.Button(
         "Predict",
@@ -132,10 +159,9 @@ def model_comparison(api_key: str) -> None:
     """
     with gr.Row():
-        models = gr.Dropdown(
             label="Model",
             choices=MODEL_CHOICES,
-            multiselect=True,
             scale=0,
             min_width=300,
             value=[FoldingModel.BOLTZ, FoldingModel.CHAI, FoldingModel.PROTENIX],
@@ -149,22 +175,46 @@ def model_comparison(api_key: str) -> None:
         elem_id="compare-models-btn",
         variant="primary",
     )
     with gr.Row():
         mol_outputs = Molecule3D(
-            label="Protein Structure",
-            reps=MOLECULE_REPS,
-            file_count="multiple",
         )
-        # metrics_plot = gr.Plot(label="pLDDT")
     predict_btn.click(
         fn=predict_comparison,
         inputs=[sequence, api_key, models],
-        outputs=[mol_outputs],
     )
 def create_correlation_tab():
     gr.Markdown("# Correlation with experimental binding affinity data")
@@ -221,7 +271,7 @@ def create_correlation_tab():
                 choices=["Spearman", "Pearson", "R²"],
                 value="Spearman",
                 label="Correlation Type",
-                interactive=True
             )
         with gr.Row():
             correlation_ranking_plot = gr.Plot(label="Correlation ranking")
@@ -230,17 +280,24 @@ def create_correlation_tab():
             with gr.Row():
                 # User can select the columns to display in the correlation plot
                 correlation_column = gr.Dropdown(
-                    label="Score data to display", choices=SCORE_COLUMNS, multiselect=False, value=SCORE_COLUMNS[0]
                 )
                 # Add checkbox for log scale and update plot when either input changes
             with gr.Row():
-                log_scale = gr.Checkbox(label="Display x-axis on logarithmic scale", value=False)
             with gr.Row():
-                score_description = gr.Markdown(get_score_description(correlation_column.value))
                 correlation_column.change(
                     fn=lambda x: get_score_description(x),
                     inputs=correlation_column,
-                    outputs=score_description
                 )
         with gr.Column():
             correlation_plot = gr.Plot(label="Correlation with binding affinity")
@@ -252,10 +309,10 @@ def create_correlation_tab():
         inputs=[correlation_type],
         outputs=[prediction_dataframe, correlation_ranking_plot, correlation_plot],
     )
     def update_regression_plot(score, use_log):
         return make_regression_plot(spr_data_with_scores, score, use_log)
     def update_correlation_plot(correlation_type):
         logger.info(f"Updating correlation plot for {correlation_type}")
         corr_data = compute_correlation_data(spr_data_with_scores, SCORE_COLUMNS)
@@ -273,16 +330,15 @@ def create_correlation_tab():
         inputs=[correlation_type],
         outputs=correlation_ranking_plot,
     )
     log_scale.change(
         fn=update_regression_plot,
-        inputs=[correlation_column, log_scale],
         outputs=correlation_plot,
     )
 def __main__():
     theme = gr.themes.Ocean(
         primary_hue="blue",
         secondary_hue="purple",

 import gradio as gr
 import pandas as pd
+import plotly.graph_objects as go
 from folding_studio_data_models import FoldingModel
 from gradio_molecule3d import Molecule3D
 from folding_studio_demo.correlate import (
     SCORE_COLUMN_NAMES,
+    SCORE_COLUMNS,
+    compute_correlation_data,
     fake_predict_and_correlate,
+    get_score_description,
     make_regression_plot,
     plot_correlation_ranking,
 )
+from folding_studio_demo.predict import filter_predictions, predict, predict_comparison
 logger = logging.getLogger(__name__)
 MOLECULE_REPS = [
     {
         "model": 0,
+        # "chain": "",
+        # "resname": "",
         "style": "cartoon",
         "color": "alphafold",
         # "residue_range": "",
     }
 ]
 MODEL_CHOICES = [
     # ("AlphaFold2", FoldingModel.AF2),
     ("Protenix", FoldingModel.PROTENIX),
 ]
+DEFAULT_SEQ = "MALWMRLLPLLALLALWGPDPAAA"
+MODEL_EXAMPLES = {
+    FoldingModel.BOLTZ: [
+        ["Monomer", f">A|protein\n{DEFAULT_SEQ}"],
+        ["Multimer", f">A|protein\n{DEFAULT_SEQ}\n>B|protein\n{DEFAULT_SEQ}"],
+    ],
+    FoldingModel.CHAI: [
+        ["Monomer", f">protein|name=A\n{DEFAULT_SEQ}"],
+        ["Multimer", f">protein|name=A\n{DEFAULT_SEQ}\n>protein|name=B\n{DEFAULT_SEQ}"],
+    ],
+    FoldingModel.PROTENIX: [
+        ["Monomer", f">A|protein\n{DEFAULT_SEQ}"],
+        ["Multimer", f">A|protein\n{DEFAULT_SEQ}\n>B|protein\n{DEFAULT_SEQ}"],
+    ],
+}
+def sequence_input(dropdown: gr.Dropdown | None = None) -> gr.Textbox:
     """Sequence input component.
     Returns:
     """
     sequence = gr.Textbox(
         label="Protein Sequence",
         lines=2,
         placeholder="Enter a protein sequence or upload a FASTA file",
     )
+    dummy = gr.Textbox(label="Complex type", visible=False)
+    examples = gr.Examples(
+        examples=MODEL_EXAMPLES[FoldingModel.BOLTZ],
+        inputs=[dummy, sequence],
+    )
+    if dropdown is not None:
+        dropdown.change(
+            fn=lambda x: gr.Dataset(samples=MODEL_EXAMPLES[x]),
+            inputs=[dropdown],
+            outputs=[examples.dataset],
+        )
     file_input = gr.File(
         label="Upload a FASTA file",
         file_types=[".fasta", ".fa"],
             value=FoldingModel.BOLTZ,
         )
         with gr.Column():
+            sequence = sequence_input(dropdown)
     predict_btn = gr.Button(
         "Predict",
     """
     with gr.Row():
+        models = gr.CheckboxGroup(
             label="Model",
             choices=MODEL_CHOICES,
             scale=0,
             min_width=300,
             value=[FoldingModel.BOLTZ, FoldingModel.CHAI, FoldingModel.PROTENIX],
         elem_id="compare-models-btn",
         variant="primary",
     )
+    with gr.Row():
+        chai_predictions = gr.CheckboxGroup(label="Chai", visible=False)
+        protenix_predictions = gr.CheckboxGroup(label="Protenix", visible=False)
+        boltz_predictions = gr.CheckboxGroup(label="Boltz", visible=False)
     with gr.Row():
         mol_outputs = Molecule3D(
+            label="Protein Structure", reps=MOLECULE_REPS, height=1000
         )
+        metrics_plot = gr.Plot(label="pLDDT")
+    # Store the initial predictions
+    aligned_paths = gr.State()
+    plddt_fig = gr.State()
     predict_btn.click(
         fn=predict_comparison,
         inputs=[sequence, api_key, models],
+        outputs=[
+            chai_predictions,
+            boltz_predictions,
+            protenix_predictions,
+            aligned_paths,
+            plddt_fig,
+        ],
     )
+    # Handle checkbox changes
+    for checkbox in [chai_predictions, boltz_predictions, protenix_predictions]:
+        checkbox.change(
+            fn=filter_predictions,
+            inputs=[
+                aligned_paths,
+                plddt_fig,
+                chai_predictions,
+                boltz_predictions,
+                protenix_predictions,
+            ],
+            outputs=[mol_outputs, metrics_plot],
+        )
 def create_correlation_tab():
     gr.Markdown("# Correlation with experimental binding affinity data")
                 choices=["Spearman", "Pearson", "R²"],
                 value="Spearman",
                 label="Correlation Type",
+                interactive=True,
             )
         with gr.Row():
             correlation_ranking_plot = gr.Plot(label="Correlation ranking")
             with gr.Row():
                 # User can select the columns to display in the correlation plot
                 correlation_column = gr.Dropdown(
+                    label="Score data to display",
+                    choices=SCORE_COLUMNS,
+                    multiselect=False,
+                    value=SCORE_COLUMNS[0],
                 )
                 # Add checkbox for log scale and update plot when either input changes
             with gr.Row():
+                log_scale = gr.Checkbox(
+                    label="Display x-axis on logarithmic scale", value=False
+                )
             with gr.Row():
+                score_description = gr.Markdown(
+                    get_score_description(correlation_column.value)
+                )
                 correlation_column.change(
                     fn=lambda x: get_score_description(x),
                     inputs=correlation_column,
+                    outputs=score_description,
                 )
         with gr.Column():
             correlation_plot = gr.Plot(label="Correlation with binding affinity")
         inputs=[correlation_type],
         outputs=[prediction_dataframe, correlation_ranking_plot, correlation_plot],
     )
     def update_regression_plot(score, use_log):
         return make_regression_plot(spr_data_with_scores, score, use_log)
     def update_correlation_plot(correlation_type):
         logger.info(f"Updating correlation plot for {correlation_type}")
         corr_data = compute_correlation_data(spr_data_with_scores, SCORE_COLUMNS)
         inputs=[correlation_type],
         outputs=correlation_ranking_plot,
     )
     log_scale.change(
         fn=update_regression_plot,
+        inputs=[correlation_column, log_scale],
         outputs=correlation_plot,
     )
 def __main__():
     theme = gr.themes.Ocean(
         primary_hue="blue",
         secondary_hue="purple",

folding_studio_demo/model_fasta_validators.py CHANGED Viewed

@@ -248,15 +248,15 @@ class ChaiFastaValidator(BaseFastaValidator):
                     )
                 seen_names.add(name)
                 # validate sequence format
-                sequence = str(record.seq).strip()
-                if (
-                    entity_type in {EntityType.PEPTIDE, EntityType.PROTEIN}
-                    and not get_entity_type(sequence) == entity_type
-                ):
-                    return (
-                        False,
-                        f"CHAI Validation Error: Sequence type mismatch. Expected '{entity_type}' but found '{get_entity_type(sequence)}'",
-                    )
         return True, None

                     )
                 seen_names.add(name)
                 # validate sequence format
+                # sequence = str(record.seq).strip()
+                # if (
+                #     entity_type in {EntityType.PEPTIDE, EntityType.PROTEIN}
+                #     and not get_entity_type(sequence) == entity_type
+                # ):
+                #     return (
+                #         False,
+                #         f"CHAI Validation Error: Sequence type mismatch. Expected '{entity_type}' but found '{get_entity_type(sequence)}'",
+                #     )
         return True, None

folding_studio_demo/models.py ADDED Viewed

	@@ -0,0 +1,207 @@

+"""Models for the Folding Studio API."""
+import logging
+import os
+from pathlib import Path
+from typing import Any
+import gradio as gr
+import numpy as np
+from folding_studio.client import Client
+from folding_studio.query import Query
+from folding_studio.query.boltz import BoltzQuery
+from folding_studio.query.chai import ChaiQuery
+from folding_studio.query.protenix import ProtenixQuery
+from folding_studio_demo.model_fasta_validators import (
+    BaseFastaValidator,
+    BoltzFastaValidator,
+    ChaiFastaValidator,
+    ProtenixFastaValidator,
+)
+logger = logging.getLogger(__name__)
+class AF3Model:
+    def __init__(
+        self, api_key: str, model_name: str, query: Query, validator: BaseFastaValidator
+    ):
+        self.api_key = api_key
+        self.model_name = model_name
+        self.query = query
+        self.validator = validator
+    def call(
+        self, seq_file: Path | str, output_dir: Path, format_fasta: bool = False
+    ) -> None:
+        """Predict protein structure from amino acid sequence using AF3 model.
+        Args:
+            seq_file (Path | str): Path to FASTA file containing amino acid sequence
+            output_dir (Path): Path to output directory
+            format_fasta (bool): Whether to reformat the FASTA file in place when validation fails instead of raising an error
+        """
+        # Validate FASTA format before calling
+        is_valid, error_msg = self.check_file_description(seq_file)
+        if format_fasta and not is_valid:
+            logger.info("Invalid FASTA file format, forcing formatting...")
+            self.format_fasta(seq_file)
+        elif not is_valid:
+            logger.error(error_msg)
+            raise gr.Error(error_msg)
+        # Create a client using API key
+        logger.info("Authenticating client with API key")
+        client = Client.from_api_key(api_key=self.api_key)
+        # Define query
+        query: Query = self.query.from_file(path=seq_file, query_name="gradio")
+        query.save_parameters(output_dir)
+        logger.info("Payload: %s", query.payload)
+        # Send a request
+        logger.info(f"Sending {self.model_name} request to Folding Studio API")
+        response = client.send_request(
+            query, project_code=os.environ["FOLDING_PROJECT_CODE"]
+        )
+        # Access confidence data
+        logger.info("Confidence data: %s", response.confidence_data)
+        response.download_results(output_dir=output_dir, force=True, unzip=True)
+        logger.info("Results downloaded to %s", output_dir)
+    def format_fasta(self, seq_file: Path | str) -> None:
+        """Format sequence to FASTA format.
+        Args:
+            seq_file (Path | str): Path to FASTA file
+        """
+        formatted_fasta = self.validator.transform_fasta(seq_file)
+        with open(seq_file, "w") as f:
+            f.write(formatted_fasta)
+    def predictions(self, output_dir: Path) -> list[Path]:
+        """Get the path to the prediction.
+        Args:
+            output_dir (Path): Path to output directory
+        Returns:
+            list[Path]: List of paths to predictions
+        """
+        raise NotImplementedError()
+    def has_prediction(self, output_dir: Path) -> bool:
+        """Check if prediction exists in output directory."""
+        return len(self.predictions(output_dir)) > 0
+    def check_file_description(self, seq_file: Path | str) -> tuple[bool, str | None]:
+        """Check if the file description is correct.
+        Args:
+            seq_file (Path | str): Path to FASTA file
+        Returns:
+            tuple[bool, str | None]: Tuple containing a boolean indicating if the format is correct and an error message if not
+        """
+        is_valid, error_msg = self.validator.is_valid_fasta(seq_file)
+        if not is_valid:
+            return False, error_msg
+        return True, None
+class ChaiModel(AF3Model):
+    def __init__(self, api_key: str):
+        super().__init__(api_key, "Chai", ChaiQuery, ChaiFastaValidator())
+    def call(
+        self, seq_file: Path | str, output_dir: Path, format_fasta: bool = False
+    ) -> None:
+        """Predict protein structure from amino acid sequence using Chai model.
+        Args:
+            seq_file (Path | str): Path to FASTA file containing amino acid sequence
+            output_dir (Path): Path to output directory
+            format_fasta (bool): Whether to format the FASTA file
+        """
+        super().call(seq_file, output_dir, format_fasta)
+    def predictions(self, output_dir: Path) -> dict[Path, dict[str, Any]]:
+        """Get the path to the prediction."""
+        prediction = next(output_dir.rglob("pred.model_idx_[0-9].cif"), None)
+        if prediction is None:
+            return {}
+        cif_files = {
+            int(f.stem.split("model_idx_")[1]): f
+            for f in prediction.parent.glob("pred.model_idx_*.cif")
+        }
+        # Get all npz files and extract their indices
+        npz_files = {
+            int(f.stem.split("model_idx_")[1]): f
+            for f in prediction.parent.glob("scores.model_idx_*.npz")
+        }
+        # Find common indices and create pairs
+        common_indices = sorted(set(cif_files.keys()) & set(npz_files.keys()))
+        return {
+            idx: {"prediction_path": cif_files[idx], "metrics": np.load(npz_files[idx])}
+            for idx in common_indices
+        }
+class ProtenixModel(AF3Model):
+    def __init__(self, api_key: str):
+        super().__init__(api_key, "Protenix", ProtenixQuery, ProtenixFastaValidator())
+    def call(
+        self, seq_file: Path | str, output_dir: Path, format_fasta: bool = False
+    ) -> None:
+        """Predict protein structure from amino acid sequence using Protenix model.
+        Args:
+            seq_file (Path | str): Path to FASTA file containing amino acid sequence
+            output_dir (Path): Path to output directory
+            format_fasta (bool): Whether to format the FASTA file
+        """
+        super().call(seq_file, output_dir, format_fasta)
+    def predictions(self, output_dir: Path) -> list[Path]:
+        """Get the path to the prediction."""
+        return list(output_dir.rglob("*_model_[0-9].cif"))
+class BoltzModel(AF3Model):
+    def __init__(self, api_key: str):
+        super().__init__(api_key, "Boltz", BoltzQuery, BoltzFastaValidator())
+    def call(
+        self, seq_file: Path | str, output_dir: Path, format_fasta: bool = False
+    ) -> None:
+        """Predict protein structure from amino acid sequence using Boltz model.
+        Args:
+            seq_file (Path | str): Path to FASTA file containing amino acid sequence
+            output_dir (Path): Path to output directory
+            format_fasta (bool): Whether to format the FASTA file
+        """
+        super().call(seq_file, output_dir, format_fasta)
+    def predictions(self, output_dir: Path) -> list[Path]:
+        """Get the path to the prediction."""
+        prediction_paths = list(output_dir.rglob("*_model_[0-9].cif"))
+        return {
+            int(cif_path.stem[-1]): {
+                "prediction_path": cif_path,
+                "metrics": np.load(list(cif_path.parent.glob("plddt_*.npz"))[0]),
+            }
+            for cif_path in prediction_paths
+        }

folding_studio_demo/predict.py CHANGED Viewed

@@ -2,29 +2,17 @@
 import hashlib
 import logging
-import os
 from io import StringIO
 from pathlib import Path
-from typing import Any
 import gradio as gr
 import numpy as np
 import plotly.graph_objects as go
 from Bio import SeqIO
 from Bio.PDB import PDBIO, MMCIFParser, PDBParser, Superimposer
-from folding_studio.client import Client
-from folding_studio.query import Query
-from folding_studio.query.boltz import BoltzQuery
-from folding_studio.query.chai import ChaiQuery
-from folding_studio.query.protenix import ProtenixQuery
 from folding_studio_data_models import FoldingModel
-from folding_studio_demo.model_fasta_validators import (
-    BaseFastaValidator,
-    BoltzFastaValidator,
-    ChaiFastaValidator,
-    ProtenixFastaValidator,
-)
 logger = logging.getLogger(__name__)
@@ -34,6 +22,48 @@ SEQUENCE_DIR.mkdir(parents=True, exist_ok=True)
 OUTPUT_DIR = Path("output")
 OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
 def convert_cif_to_pdb(cif_path: str, pdb_path: str) -> None:
     """Convert a .cif file to .pdb format using Biopython.
@@ -52,29 +82,46 @@ def convert_cif_to_pdb(cif_path: str, pdb_path: str) -> None:
     io.save(pdb_path)
-def add_plddt_plot(plddt_vals: list[list[float]], model_name: str) -> go.Figure:
     """Create a plot of metrics."""
-    visible = True
-    plddt_traces = [
-        go.Scatter(
-            x=np.arange(len(plddt_val)),
-            y=plddt_val,
-            hovertemplate="<i>pLDDT</i>: %{y:.2f} <br><i>Residue index:</i> %{x}<br>",
-            name=f"{model_name} {i}",
-            visible=visible,
         )
-        for i, plddt_val in enumerate(plddt_vals)
-    ]
     plddt_fig = go.Figure(data=plddt_traces)
     plddt_fig.update_layout(
         title="pLDDT",
-        xaxis_title="Residue index",
         yaxis_title="pLDDT",
         height=500,
         template="simple_white",
         legend=dict(yanchor="bottom", y=0.01, xanchor="left", x=0.99),
     )
     return plddt_fig
@@ -103,178 +150,12 @@ def _write_fasta_file(
     return seq_id, seq_file
-class AF3Model:
-    def __init__(
-        self, api_key: str, model_name: str, query: Query, validator: BaseFastaValidator
-    ):
-        self.api_key = api_key
-        self.model_name = model_name
-        self.query = query
-        self.validator = validator
-    def call(self, seq_file: Path | str, output_dir: Path) -> None:
-        """Predict protein structure from amino acid sequence using AF3 model.
-        Args:
-            seq_file (Path | str): Path to FASTA file containing amino acid sequence
-            output_dir (Path): Path to output directory
-        """
-        # Validate FASTA format before calling
-        is_valid, error_msg = self.check_file_description(seq_file)
-        if not is_valid:
-            logger.error(error_msg)
-            raise gr.Error(error_msg)
-        # Create a client using API key
-        logger.info("Authenticating client with API key")
-        client = Client.from_api_key(api_key=self.api_key)
-        # Define query
-        query: Query = self.query.from_file(path=seq_file, query_name="gradio")
-        query.save_parameters(output_dir)
-        logger.info("Payload: %s", query.payload)
-        # Send a request
-        logger.info(f"Sending {self.model_name} request to Folding Studio API")
-        response = client.send_request(
-            query, project_code=os.environ["FOLDING_PROJECT_CODE"]
-        )
-        # Access confidence data
-        logger.info("Confidence data: %s", response.confidence_data)
-        response.download_results(output_dir=output_dir, force=True, unzip=True)
-        logger.info("Results downloaded to %s", output_dir)
-    def format_fasta(self, sequence: str) -> str:
-        """Format sequence to FASTA format."""
-        return f">{self.model_name}\n{sequence}"
-    def predictions(self, output_dir: Path) -> list[Path]:
-        """Get the path to the prediction."""
-        raise NotImplementedError("Not implemented")
-    def has_prediction(self, output_dir: Path) -> bool:
-        """Check if prediction exists in output directory."""
-        return len(self.predictions(output_dir)) > 0
-    def check_file_description(self, seq_file: Path | str) -> tuple[bool, str | None]:
-        """Check if the file description is correct.
-        Args:
-            seq_file (Path | str): Path to FASTA file
-        Returns:
-            tuple[bool, str | None]: Tuple containing a boolean indicating if the format is correct and an error message if not
-        """
-        is_valid, error_msg = self.validator.is_valid_fasta(seq_file)
-        if not is_valid:
-            return False, error_msg
-        return True, None
-class ChaiModel(AF3Model):
-    def __init__(self, api_key: str):
-        super().__init__(api_key, "Chai", ChaiQuery, ChaiFastaValidator())
-    def call(self, seq_file: Path | str, output_dir: Path) -> None:
-        """Predict protein structure from amino acid sequence using Chai model.
-        Args:
-            seq_file (Path | str): Path to FASTA file containing amino acid sequence
-            output_dir (Path): Path to output directory
-        """
-        super().call(seq_file, output_dir)
-    def _get_chai_paired_files(self, directory: Path) -> list[tuple[Path, Path]]:
-        """Get pairs of .cif and .npz files with matching model indices.
-        Args:
-            directory (Path): Directory containing the prediction files
-        Returns:
-            list[tuple[Path, Path]]: List of tuples containing (cif_path, npz_path) pairs
-        """
-        # Get all cif files and extract their indices
-    def predictions(self, output_dir: Path) -> dict[Path, dict[str, Any]]:
-        """Get the path to the prediction."""
-        prediction = next(output_dir.rglob("pred.model_idx_[0-9].cif"), None)
-        if prediction is None:
-            return {}
-        cif_files = {
-            int(f.stem.split("model_idx_")[1]): f
-            for f in prediction.parent.glob("pred.model_idx_*.cif")
-        }
-        # Get all npz files and extract their indices
-        npz_files = {
-            int(f.stem.split("model_idx_")[1]): f
-            for f in prediction.parent.glob("scores.model_idx_*.npz")
-        }
-        # Find common indices and create pairs
-        common_indices = sorted(set(cif_files.keys()) & set(npz_files.keys()))
-        return {
-            idx: {"prediction_path": cif_files[idx], "metrics": np.load(npz_files[idx])}
-            for idx in common_indices
-        }
-class ProtenixModel(AF3Model):
-    def __init__(self, api_key: str):
-        super().__init__(api_key, "Protenix", ProtenixQuery, ProtenixFastaValidator())
-    def call(self, seq_file: Path | str, output_dir: Path) -> None:
-        """Predict protein structure from amino acid sequence using Protenix model.
-        Args:
-            seq_file (Path | str): Path to FASTA file containing amino acid sequence
-            output_dir (Path): Path to output directory
-        """
-        super().call(seq_file, output_dir)
-    def predictions(self, output_dir: Path) -> list[Path]:
-        """Get the path to the prediction."""
-        return list(output_dir.rglob("*_model_[0-9].cif"))
-class BoltzModel(AF3Model):
-    def __init__(self, api_key: str):
-        super().__init__(api_key, "Boltz", BoltzQuery, BoltzFastaValidator())
-    def call(self, seq_file: Path | str, output_dir: Path) -> None:
-        """Predict protein structure from amino acid sequence using Boltz model.
-        Args:
-            seq_file (Path | str): Path to FASTA file containing amino acid sequence
-            output_dir (Path): Path to output directory
-        """
-        super().call(seq_file, output_dir)
-    def predictions(self, output_dir: Path) -> list[Path]:
-        """Get the path to the prediction."""
-        prediction_paths = list(output_dir.rglob("*_model_[0-9].cif"))
-        return {
-            int(cif_path.stem[-1]): {
-                "prediction_path": cif_path,
-                "metrics": np.load(list(cif_path.parent.glob("plddt_*.npz"))[0]),
-            }
-            for cif_path in prediction_paths
-        }
 def extract_plddt_from_cif(cif_path):
     structure = MMCIFParser().get_structure("structure", cif_path)
-    # Dictionary to store pLDDT values per residue
     plddt_values = []
     # Iterate through all atoms
     for model in structure:
@@ -285,17 +166,27 @@ def extract_plddt_from_cif(cif_path):
                     # The B-factor contains the pLDDT value
                     plddt = residue["CA"].get_bfactor()
                     plddt_values.append(plddt)
-    return plddt_values
-def predict(sequence: str, api_key: str, model_type: FoldingModel) -> tuple[str, str]:
     """Predict protein structure from amino acid sequence using Boltz model.
     Args:
         sequence (str): Amino acid sequence to predict structure for
         api_key (str): Folding API key
         model (FoldingModel): Folding model to use
     Returns:
         tuple[str, str]: Tuple containing the path to the PDB file and the pLDDT plot
@@ -303,6 +194,7 @@ def predict(sequence: str, api_key: str, model_type: FoldingModel) -> tuple[str,
     if not api_key:
         raise gr.Error("Missing API key, please enter a valid API key")
     # Set up unique output directory based on sequence hash
     seq_id, seq_file = _write_fasta_file(sequence)
     output_dir = OUTPUT_DIR / seq_id / model_type
@@ -319,15 +211,16 @@ def predict(sequence: str, api_key: str, model_type: FoldingModel) -> tuple[str,
     # Check if prediction already exists
     if not model.has_prediction(output_dir):
-        # Run Boltz prediction
         logger.info(f"Predicting {seq_id}")
-        model.call(seq_file=seq_file, output_dir=output_dir)
         logger.info("Prediction done. Output directory: %s", output_dir)
     else:
         logger.info("Prediction already exists. Output directory: %s", output_dir)
-    # output_dir = Path("boltz_results")  # debug
     # Convert output CIF to PDB
     if not model.has_prediction(output_dir):
         raise gr.Error("No prediction found")
@@ -335,23 +228,34 @@ def predict(sequence: str, api_key: str, model_type: FoldingModel) -> tuple[str,
     predictions = model.predictions(output_dir)
     pdb_paths = []
     model_plddt_vals = []
-    for model_idx, prediction in predictions.items():
-        cif_path = prediction["prediction_path"]
-        logger.info(
-            "CIF file: %s",
         )
         converted_pdb_path = str(
             output_dir / f"{model.model_name}_prediction_{model_idx}.pdb"
         )
         convert_cif_to_pdb(str(cif_path), str(converted_pdb_path))
-        plddt_vals = extract_plddt_from_cif(cif_path)
         pdb_paths.append(converted_pdb_path)
         model_plddt_vals.append(plddt_vals)
-    plddt_plot = add_plddt_plot(
-        plddt_vals=model_plddt_vals, model_name=model.model_name
     )
-    return pdb_paths, plddt_plot
 def align_structures(pdb_paths: list[str]) -> list[str]:
@@ -397,28 +301,148 @@ def align_structures(pdb_paths: list[str]) -> list[str]:
     return aligned_paths
 def predict_comparison(
-    sequence: str, api_key: str, model_types: list[FoldingModel]
-) -> tuple[str, str]:
-    """Predict protein structure from amino acid sequence using Boltz model.
     Args:
         sequence (str): Amino acid sequence to predict structure for
         api_key (str): Folding API key
-        model (FoldingModel): Folding model to use
     Returns:
-        tuple[str, str]: Tuple containing the path to the PDB file and the pLDDT plot
     """
     if not api_key:
         raise gr.Error("Missing API key, please enter a valid API key")
     # Set up unique output directory based on sequence hash
     pdb_paths = []
-    for model_type in model_types:
-        model_pdb_paths, _ = predict(sequence, api_key, model_type)
         pdb_paths += model_pdb_paths
     aligned_paths = align_structures(pdb_paths)
-    return aligned_paths

 import hashlib
 import logging
 from io import StringIO
 from pathlib import Path
 import gradio as gr
 import numpy as np
 import plotly.graph_objects as go
 from Bio import SeqIO
 from Bio.PDB import PDBIO, MMCIFParser, PDBParser, Superimposer
 from folding_studio_data_models import FoldingModel
+from folding_studio_demo.models import BoltzModel, ChaiModel, ProtenixModel
 logger = logging.getLogger(__name__)
 OUTPUT_DIR = Path("output")
 OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Three-letter residue names mapped to one-letter amino acid codes.
# Covers the 20 standard residues, selenocysteine (SEC), pyrrolysine (PYL),
# the ambiguity codes (ASX, GLX, XLE) and the unknown placeholders (XAA, UNK).
THREE_TO_ONE_LETTER = {
    "ALA": "A",
    "ARG": "R",
    "ASN": "N",
    "ASP": "D",
    "CYS": "C",
    "GLN": "Q",
    "GLU": "E",
    "GLY": "G",
    "HIS": "H",
    "ILE": "I",
    "LEU": "L",
    "LYS": "K",
    "MET": "M",
    "PHE": "F",
    "PRO": "P",
    "SER": "S",
    "THR": "T",
    "TRP": "W",
    "TYR": "Y",
    "VAL": "V",
    "SEC": "U",
    "PYL": "O",
    "ASX": "B",
    "GLX": "Z",
    "XAA": "X",
    "XLE": "J",
    "UNK": "X",
}


def convert_to_one_letter(resname: str) -> str:
    """Convert three-letter amino acid code to one-letter code.

    The lookup ignores letter case and surrounding whitespace, so ``"ala"``
    and ``" ALA "`` both resolve to ``"A"``.

    Args:
        resname (str): Three-letter amino acid code

    Returns:
        str: One-letter amino acid code, or ``"X"`` when the residue name is
            not a key of ``THREE_TO_ONE_LETTER``
    """
    # Normalise first: the table keys are upper-case with no padding.
    return THREE_TO_ONE_LETTER.get(resname.strip().upper(), "X")
 def convert_cif_to_pdb(cif_path: str, pdb_path: str) -> None:
     """Convert a .cif file to .pdb format using Biopython.
     io.save(pdb_path)
def create_plddt_figure(
    plddt_vals: list[list[float]],
    model_name: str,
    residue_codes: list[list[str]] = None,
) -> go.Figure:
    """Create a pLDDT plot with one trace per prediction.

    Args:
        plddt_vals (list[list[float]]): pLDDT values, one inner list per
            prediction
        model_name (str): Model name; each trace is named
            ``"<model_name> <i>"`` where ``i`` is the position of the
            prediction in ``plddt_vals``
        residue_codes (list[list[str]], optional): Residue codes, one inner
            list per prediction, used to enrich the hover text. Defaults to
            None, in which case the hover text only shows the residue index.

    Returns:
        go.Figure: Figure holding one scatter trace per prediction
    """
    plddt_traces = []
    for i, plddt_val in enumerate(plddt_vals):
        codes = residue_codes[i] if residue_codes and i < len(residue_codes) else None

        # Only use the residue codes when there is exactly one per pLDDT value.
        # zip() would otherwise truncate the hover text and leave the trailing
        # points of the trace without any hover information.
        if codes is not None and len(codes) == len(plddt_val):
            hover_text = [
                f"<i>pLDDT</i>: {plddt:.2f}<br><i>Residue:</i> {code} {idx}"
                for idx, (plddt, code) in enumerate(zip(plddt_val, codes))
            ]
        else:
            hover_text = [
                f"<i>pLDDT</i>: {plddt:.2f}<br><i>Residue index:</i> {idx}"
                for idx, plddt in enumerate(plddt_val)
            ]

        plddt_traces.append(
            go.Scatter(
                x=np.arange(len(plddt_val)),
                y=plddt_val,
                # Show only the prepared text; <extra></extra> hides the
                # secondary hover box.
                hovertemplate="%{text}<extra></extra>",
                text=hover_text,
                name=f"{model_name} {i}",
                visible=True,
            )
        )

    plddt_fig = go.Figure(data=plddt_traces)
    plddt_fig.update_layout(
        title="pLDDT",
        xaxis_title="Residue",
        yaxis_title="pLDDT",
        height=500,
        template="simple_white",
        legend=dict(yanchor="bottom", y=0.01, xanchor="left", x=0.99),
    )
    return plddt_fig
     return seq_id, seq_file
 def extract_plddt_from_cif(cif_path):
     structure = MMCIFParser().get_structure("structure", cif_path)
+    # Lists to store pLDDT values and residue codes
     plddt_values = []
+    residue_codes = []
     # Iterate through all atoms
     for model in structure:
                     # The B-factor contains the pLDDT value
                     plddt = residue["CA"].get_bfactor()
                     plddt_values.append(plddt)
+                    # Get residue code and convert to one-letter code
+                    residue_codes.append(convert_to_one_letter(residue.get_resname()))
+    return plddt_values, residue_codes
+def predict(
+    sequence: str,
+    api_key: str,
+    model_type: FoldingModel,
+    format_fasta: bool = False,
+    progress=gr.Progress(),
+) -> tuple[str, str]:
     """Predict protein structure from amino acid sequence using the selected folding model.
     Args:
         sequence (str): Amino acid sequence to predict structure for
         api_key (str): Folding API key
         model_type (FoldingModel): Folding model to use
+        format_fasta (bool): Whether to format the FASTA file
+        progress (gr.Progress): Gradio progress tracker
     Returns:
         tuple[list[str], go.Figure]: Tuple containing the paths to the converted PDB files and the pLDDT figure
     if not api_key:
         raise gr.Error("Missing API key, please enter a valid API key")
+    progress(0, desc="Setting up prediction...")
     # Set up unique output directory based on sequence hash
     seq_id, seq_file = _write_fasta_file(sequence)
     output_dir = OUTPUT_DIR / seq_id / model_type
     # Check if prediction already exists
     if not model.has_prediction(output_dir):
+        progress(0.2, desc="Running prediction...")
+        # Run prediction
         logger.info(f"Predicting {seq_id}")
+        model.call(seq_file=seq_file, output_dir=output_dir, format_fasta=format_fasta)
         logger.info("Prediction done. Output directory: %s", output_dir)
     else:
+        progress(0.2, desc="Using existing prediction...")
         logger.info("Prediction already exists. Output directory: %s", output_dir)
+    progress(0.4, desc="Processing results...")
     # Convert output CIF to PDB
     if not model.has_prediction(output_dir):
         raise gr.Error("No prediction found")
     predictions = model.predictions(output_dir)
     pdb_paths = []
     model_plddt_vals = []
+    model_residue_codes = []
+    total_predictions = len(predictions)
+    for i, (model_idx, prediction) in enumerate(predictions.items()):
+        progress(
+            0.4 + (0.4 * i / total_predictions), desc=f"Converting model {model_idx}..."
         )
+        cif_path = prediction["prediction_path"]
+        logger.info(f"CIF file: {cif_path}")
         converted_pdb_path = str(
             output_dir / f"{model.model_name}_prediction_{model_idx}.pdb"
         )
         convert_cif_to_pdb(str(cif_path), str(converted_pdb_path))
+        plddt_vals, residue_codes = extract_plddt_from_cif(cif_path)
         pdb_paths.append(converted_pdb_path)
         model_plddt_vals.append(plddt_vals)
+        model_residue_codes.append(residue_codes)
+    progress(0.8, desc="Generating plots...")
+    plddt_fig = create_plddt_figure(
+        plddt_vals=model_plddt_vals,
+        model_name=model.model_name,
+        residue_codes=model_residue_codes,
     )
+    progress(1.0, desc="Done!")
+    return pdb_paths, plddt_fig
 def align_structures(pdb_paths: list[str]) -> list[str]:
     return aligned_paths
def filter_predictions(
    aligned_paths: list[str],
    plddt_fig: go.Figure,
    chai_selected: list[int],
    boltz_selected: list[int],
    protenix_selected: list[int],
) -> tuple[list[str], go.Figure]:
    """Filter predictions based on selected checkboxes.

    Traces of ``plddt_fig`` and entries of ``aligned_paths`` are matched by
    position: the i-th trace is assumed to describe the i-th aligned
    structure.

    Args:
        aligned_paths (list[str]): List of aligned PDB paths
        plddt_fig (go.Figure): Original pLDDT plot; trace names are expected
            to look like ``"<model name> <prediction index>"``
        chai_selected (list[int]): Selected Chai model indices
        boltz_selected (list[int]): Selected Boltz model indices
        protenix_selected (list[int]): Selected Protenix model indices

    Returns:
        tuple[list[str], go.Figure]: Filtered PDB paths and updated pLDDT plot
    """
    # Selected prediction indices keyed by the model name that prefixes the
    # trace names. A missing selection (None) is treated as "nothing selected".
    selected_by_model = {
        "Chai": set(chai_selected or ()),
        "Boltz": set(boltz_selected or ()),
        "Protenix": set(protenix_selected or ()),
    }

    def should_show_trace(trace_name: str) -> bool:
        """Tell whether the trace named ``trace_name`` is currently selected."""
        parts = (trace_name or "").split()
        if len(parts) < 2:
            return False
        try:
            model_idx = int(parts[1])
        except ValueError:
            # Not a "<model> <index>" name: it cannot match any checkbox.
            return False
        return model_idx in selected_by_model.get(parts[0], ())

    # Keep only the selected traces together with their aligned structure
    filtered_fig = go.Figure()
    visible_paths = []
    for i, trace in enumerate(plddt_fig.data):
        if should_show_trace(trace.name):
            filtered_fig.add_trace(trace)
            visible_paths.append(aligned_paths[i])

    # A fresh figure has no layout, so re-apply the one used for the pLDDT plot
    filtered_fig.update_layout(
        title="pLDDT",
        xaxis_title="Residue index",
        yaxis_title="pLDDT",
        height=500,
        template="simple_white",
        legend=dict(yanchor="bottom", y=0.01, xanchor="left", x=0.99),
    )
    return visible_paths, filtered_fig
 def predict_comparison(
     sequence: str, api_key: str, model_types: list[FoldingModel], progress=gr.Progress()
 ) -> tuple[
     gr.CheckboxGroup,
     gr.CheckboxGroup,
     gr.CheckboxGroup,
     list[str],
     go.Figure,
 ]:
     """Predict protein structure from amino acid sequence using multiple models.

     Args:
         sequence (str): Amino acid sequence to predict structure for
         api_key (str): Folding API key
         model_types (list[FoldingModel]): List of folding models to use
         progress (gr.Progress): Gradio progress tracker

     Returns:
         tuple containing, in this order:
             - gr.CheckboxGroup: Chai predictions checkbox group
             - gr.CheckboxGroup: Boltz predictions checkbox group
             - gr.CheckboxGroup: Protenix predictions checkbox group
             - list[str]: Aligned PDB paths
             - go.Figure: pLDDT plot with the traces of every model

     Raises:
         gr.Error: If no API key is provided
     """
     if not api_key:
         raise gr.Error("Missing API key, please enter a valid API key")

     def prediction_index(pdb_path: str) -> int:
         """Return the integer that ends the file name (without extension)."""
         stem = Path(pdb_path).stem
         # Take the whole run of trailing digits, not just the last character,
         # so that prediction 10 is not mistaken for prediction 0.
         digits = stem[len(stem.rstrip("0123456789")):]
         return int(digits)

     pdb_paths = []
     plddt_traces = []
     total_models = len(model_types)
     # Prediction indices available for each model that was run
     model_predictions = {}
     for i, model_type in enumerate(model_types):
         progress(i / total_models, desc=f"Running {model_type} prediction...")
         model_pdb_paths, model_plddt_fig = predict(
             sequence, api_key, model_type, format_fasta=True
         )
         pdb_paths += model_pdb_paths
         plddt_traces += model_plddt_fig.data
         model_predictions[model_type] = [prediction_index(p) for p in model_pdb_paths]

     progress(0.9, desc="Aligning structures...")
     aligned_paths = align_structures(pdb_paths)

     # Merge the traces of every model into a single figure
     plddt_fig = go.Figure(data=plddt_traces)
     plddt_fig.update_layout(
         title="pLDDT",
         xaxis_title="Residue index",
         yaxis_title="pLDDT",
         height=500,
         template="simple_white",
         legend=dict(yanchor="bottom", y=0.01, xanchor="left", x=0.99),
     )
     progress(1.0, desc="Done!")

     def checkbox_group(model_type: FoldingModel) -> gr.CheckboxGroup:
         """Build the checkbox group of one model, hidden if it was not run."""
         indices = model_predictions.get(model_type)
         available = [] if indices is None else indices
         return gr.CheckboxGroup(
             visible=indices is not None,
             choices=available,
             value=available,
         )

     # Create checkbox groups for each model type
     chai_predictions = checkbox_group(FoldingModel.CHAI)
     boltz_predictions = checkbox_group(FoldingModel.BOLTZ)
     protenix_predictions = checkbox_group(FoldingModel.PROTENIX)
     return (
         chai_predictions,
         boltz_predictions,
         protenix_predictions,
         aligned_paths,
         plddt_fig,
     )