Spaces:

InstaDeepAI
/

folding-studio-demo

Running

App Files Files Community

jfaustin

AchilleSoulieID committed on 4 days ago

Commit

f601557

verified ·

1 Parent(s): 142035a

Add model comparison (#3)

Browse files

- add model comparison (485bd69b80145fab090b2e9d368082247e7a4d93)

Co-authored-by: Achille Soulie <[email protected]>

Files changed (2) hide show

folding_studio_demo/app.py +39 -22
folding_studio_demo/predict.py +173 -28

folding_studio_demo/app.py CHANGED Viewed

@@ -3,12 +3,16 @@
 import logging
 import gradio as gr
 from folding_studio_data_models import FoldingModel
 from gradio_molecule3d import Molecule3D
-import pandas as pd
-from folding_studio_demo.predict import predict
-from folding_studio_demo.correlate import fake_predict_and_correlate, SCORE_COLUMNS, select_correlation_plot
 logger = logging.getLogger(__name__)
@@ -119,7 +123,7 @@ def model_comparison(api_key: str) -> None:
     """
     with gr.Row():
-        model = gr.Dropdown(
             label="Model",
             choices=MODEL_CHOICES,
             multiselect=True,
@@ -133,13 +137,18 @@ def model_comparison(api_key: str) -> None:
     predict_btn = gr.Button("Compare Models")
     with gr.Row():
-        mol_output = Molecule3D(label="Protein Structure", reps=MOLECULE_REPS)
-        metrics_plot = gr.Plot(label="pLDDT")
     predict_btn.click(
-        fn=predict,
-        inputs=[sequence, api_key, model],
-        outputs=[mol_output, metrics_plot],
     )
@@ -147,12 +156,12 @@ def create_correlation_tab():
     gr.Markdown("# Correlation with experimental binding affinity data")
     spr_data_with_scores = pd.read_csv("spr_af_scores_mapped.csv")
     prettified_columns = {
-            "antibody_name": "Antibody Name",
-            "KD (nM)": "KD (nM)",
-            "antibody_vh_sequence": "Antibody VH Sequence",
-            "antibody_vl_sequence": "Antibody VL Sequence",
-            "antigen_sequence": "Antigen Sequence"
-        }
     spr_data_with_scores = spr_data_with_scores.rename(columns=prettified_columns)
     with gr.Row():
         columns = [
@@ -160,10 +169,13 @@ def create_correlation_tab():
             "KD (nM)",
             "Antibody VH Sequence",
             "Antibody VL Sequence",
-            "Antigen Sequence"
         ]
         # Display dataframe with floating point values rounded to 2 decimal places
-        spr_data = gr.DataFrame(value=spr_data_with_scores[columns].round(2), label="Experimental Antibody-Antigen Binding Affinity Data")
     gr.Markdown("# Prediction and correlation")
     with gr.Row():
@@ -174,22 +186,27 @@ def create_correlation_tab():
         correlation_ranking_plot = gr.Plot(label="Correlation ranking")
     with gr.Row():
         # User can select the columns to display in the correlation plot
-        correlation_column = gr.Dropdown(label="Score data to display", choices=SCORE_COLUMNS, multiselect=False)
         correlation_plot = gr.Plot(label="Correlation with binding affinity")
     fake_predict_btn.click(
-        fn=lambda x: fake_predict_and_correlate(spr_data_with_scores, SCORE_COLUMNS, ["Antibody Name", "KD (nM)"]),
         inputs=None,
-        outputs=[prediction_dataframe, correlation_ranking_plot]
     )
     # Call function to update the correlation plot when the user selects the columns
     correlation_column.change(
         fn=lambda score: select_correlation_plot(spr_data_with_scores, score),
         inputs=correlation_column,
-        outputs=correlation_plot
     )
 def __main__():
     with gr.Blocks(title="Folding Studio Demo") as demo:
         gr.Markdown(

 import logging
 import gradio as gr
+import pandas as pd
 from folding_studio_data_models import FoldingModel
 from gradio_molecule3d import Molecule3D
+from folding_studio_demo.correlate import (
+    SCORE_COLUMNS,
+    fake_predict_and_correlate,
+    select_correlation_plot,
+)
+from folding_studio_demo.predict import predict, predict_comparison
 logger = logging.getLogger(__name__)
     """
     with gr.Row():
+        models = gr.Dropdown(
             label="Model",
             choices=MODEL_CHOICES,
             multiselect=True,
     predict_btn = gr.Button("Compare Models")
     with gr.Row():
+        mol_outputs = Molecule3D(
+            label="Protein Structure",
+            reps=MOLECULE_REPS,
+            file_count="multiple",
+        )
+        # metrics_plot = gr.Plot(label="pLDDT")
     predict_btn.click(
+        fn=predict_comparison,
+        inputs=[sequence, api_key, models],
+        outputs=[mol_outputs],
     )
     gr.Markdown("# Correlation with experimental binding affinity data")
     spr_data_with_scores = pd.read_csv("spr_af_scores_mapped.csv")
     prettified_columns = {
+        "antibody_name": "Antibody Name",
+        "KD (nM)": "KD (nM)",
+        "antibody_vh_sequence": "Antibody VH Sequence",
+        "antibody_vl_sequence": "Antibody VL Sequence",
+        "antigen_sequence": "Antigen Sequence",
+    }
     spr_data_with_scores = spr_data_with_scores.rename(columns=prettified_columns)
     with gr.Row():
         columns = [
             "KD (nM)",
             "Antibody VH Sequence",
             "Antibody VL Sequence",
+            "Antigen Sequence",
         ]
         # Display dataframe with floating point values rounded to 2 decimal places
+        spr_data = gr.DataFrame(
+            value=spr_data_with_scores[columns].round(2),
+            label="Experimental Antibody-Antigen Binding Affinity Data",
+        )
     gr.Markdown("# Prediction and correlation")
     with gr.Row():
         correlation_ranking_plot = gr.Plot(label="Correlation ranking")
     with gr.Row():
         # User can select the columns to display in the correlation plot
+        correlation_column = gr.Dropdown(
+            label="Score data to display", choices=SCORE_COLUMNS, multiselect=False
+        )
         correlation_plot = gr.Plot(label="Correlation with binding affinity")
     fake_predict_btn.click(
+        fn=lambda x: fake_predict_and_correlate(
+            spr_data_with_scores, SCORE_COLUMNS, ["Antibody Name", "KD (nM)"]
+        ),
         inputs=None,
+        outputs=[prediction_dataframe, correlation_ranking_plot],
     )
     # Call function to update the correlation plot when the user selects the columns
     correlation_column.change(
         fn=lambda score: select_correlation_plot(spr_data_with_scores, score),
         inputs=correlation_column,
+        outputs=correlation_plot,
     )
 def __main__():
     with gr.Blocks(title="Folding Studio Demo") as demo:
         gr.Markdown(

folding_studio_demo/predict.py CHANGED Viewed

@@ -3,13 +3,15 @@
 import hashlib
 import logging
 import os
 from pathlib import Path
 import gradio as gr
 import numpy as np
 import plotly.graph_objects as go
 from Bio import SeqIO
-from Bio.PDB import PDBIO, MMCIFParser
 from folding_studio.client import Client
 from folding_studio.query import Query
 from folding_studio.query.boltz import BoltzQuery
@@ -50,18 +52,21 @@ def convert_cif_to_pdb(cif_path: str, pdb_path: str) -> None:
     io.save(pdb_path)
-def add_plddt_plot(plddt_vals: list[float]) -> str:
     """Create a plot of metrics."""
     visible = True
-    plddt_trace = go.Scatter(
-        x=np.arange(len(plddt_vals)),
-        y=plddt_vals,
-        hovertemplate="<i>pLDDT</i>: %{y:.2f} <br><i>Residue index:</i> %{x}<br>",
-        name="seq",
-        visible=visible,
-    )
-    plddt_fig = go.Figure(data=[plddt_trace])
     plddt_fig.update_layout(
         title="pLDDT",
         xaxis_title="Residue index",
@@ -85,7 +90,13 @@ def _write_fasta_file(
     Returns:
         tuple[str, Path]: Tuple containing the sequence ID and the path to the FASTA file
     """
-    seq_id = hashlib.sha1(sequence.encode()).hexdigest()
     seq_file = directory / f"sequence_{seq_id}.fasta"
     with open(seq_file, "w") as f:
         f.write(sequence)
@@ -146,7 +157,7 @@ class AF3Model:
     def has_prediction(self, output_dir: Path) -> bool:
         """Check if prediction exists in output directory."""
-        return any(self.predictions(output_dir))
     def check_file_description(self, seq_file: Path | str) -> tuple[bool, str | None]:
         """Check if the file description is correct.
@@ -157,10 +168,6 @@ class AF3Model:
         Returns:
             tuple[bool, str | None]: Tuple containing a boolean indicating if the format is correct and an error message if not
         """
-        input_rep = list(SeqIO.parse(seq_file, "fasta"))
-        if not input_rep:
-            error_msg = f"{self.model_name.upper()} Validation Error: No sequence found"
-            return False, error_msg
         is_valid, error_msg = self.validator.is_valid_fasta(seq_file)
         if not is_valid:
@@ -182,9 +189,41 @@ class ChaiModel(AF3Model):
         """
         super().call(seq_file, output_dir)
-    def predictions(self, output_dir: Path) -> list[Path]:
         """Get the path to the prediction."""
-        return list(output_dir.rglob("*_model_[0-9].cif"))
 class ProtenixModel(AF3Model):
@@ -221,7 +260,33 @@ class BoltzModel(AF3Model):
     def predictions(self, output_dir: Path) -> list[Path]:
         """Get the path to the prediction."""
-        return list(output_dir.rglob("*_model_[0-9].cif"))
 def predict(sequence: str, api_key: str, model_type: FoldingModel) -> tuple[str, str]:
@@ -235,6 +300,8 @@ def predict(sequence: str, api_key: str, model_type: FoldingModel) -> tuple[str,
     Returns:
         tuple[str, str]: Tuple containing the path to the PDB file and the pLDDT plot
     """
     # Set up unique output directory based on sequence hash
     seq_id, seq_file = _write_fasta_file(sequence)
@@ -265,15 +332,93 @@ def predict(sequence: str, api_key: str, model_type: FoldingModel) -> tuple[str,
     if not model.has_prediction(output_dir):
         raise gr.Error("No prediction found")
-    pred_cif = model.predictions(output_dir)[0]
-    logger.info("Output file: %s", pred_cif)
-    converted_pdb_path = str(output_dir / f"pred_{seq_id}.pdb")
-    convert_cif_to_pdb(str(pred_cif), str(converted_pdb_path))
-    logger.info("Converted PDB file: %s", converted_pdb_path)
-    plddt_file = list(pred_cif.parent.glob("plddt_*.npz"))[0]
-    logger.info("plddt file: %s", plddt_file)
-    plddt_vals = np.load(plddt_file)["plddt"]
-    return converted_pdb_path, add_plddt_plot(plddt_vals=plddt_vals)

 import hashlib
 import logging
 import os
+from io import StringIO
 from pathlib import Path
+from typing import Any
 import gradio as gr
 import numpy as np
 import plotly.graph_objects as go
 from Bio import SeqIO
+from Bio.PDB import PDBIO, MMCIFParser, PDBParser, Superimposer
 from folding_studio.client import Client
 from folding_studio.query import Query
 from folding_studio.query.boltz import BoltzQuery
     io.save(pdb_path)
+def add_plddt_plot(plddt_vals: list[list[float]], model_name: str) -> go.Figure:
     """Create a plot of metrics."""
     visible = True
+    plddt_traces = [
+        go.Scatter(
+            x=np.arange(len(plddt_val)),
+            y=plddt_val,
+            hovertemplate="<i>pLDDT</i>: %{y:.2f} <br><i>Residue index:</i> %{x}<br>",
+            name=f"{model_name} {i}",
+            visible=visible,
+        )
+        for i, plddt_val in enumerate(plddt_vals)
+    ]
+    plddt_fig = go.Figure(data=plddt_traces)
     plddt_fig.update_layout(
         title="pLDDT",
         xaxis_title="Residue index",
     Returns:
         tuple[str, Path]: Tuple containing the sequence ID and the path to the FASTA file
     """
+    input_rep = list(SeqIO.parse(StringIO(sequence), "fasta"))
+    if not input_rep:
+        raise gr.Error("No sequence found")
+    seq_id = hashlib.sha256(
+        "_".join([str(records.seq) for records in input_rep]).encode()
+    ).hexdigest()
     seq_file = directory / f"sequence_{seq_id}.fasta"
     with open(seq_file, "w") as f:
         f.write(sequence)
     def has_prediction(self, output_dir: Path) -> bool:
+        """Report whether at least one prediction is found for ``output_dir``.
+
+        The answer is derived from the size of whatever ``self.predictions``
+        returns for that directory.
+        """
+        found = self.predictions(output_dir)
+        return len(found) > 0
     def check_file_description(self, seq_file: Path | str) -> tuple[bool, str | None]:
         """Check if the file description is correct.
         Returns:
             tuple[bool, str | None]: Tuple containing a boolean indicating if the format is correct and an error message if not
         """
         is_valid, error_msg = self.validator.is_valid_fasta(seq_file)
         if not is_valid:
         """
         super().call(seq_file, output_dir)
+    def _get_chai_paired_files(self, directory: Path) -> list[tuple[Path, Path]]:
+        """Get pairs of .cif and .npz files with matching model indices.
+
+        Only files sitting directly in ``directory`` are considered (no
+        recursion). A structure file ``pred.model_idx_<n>.cif`` is paired with
+        the score file ``scores.model_idx_<n>.npz`` carrying the same ``<n>``.
+        Files without a counterpart, or whose ``<n>`` is not an integer, are
+        left out.
+
+        Args:
+            directory (Path): Directory containing the prediction files
+
+        Returns:
+            list[tuple[Path, Path]]: List of tuples containing (cif_path, npz_path)
+                pairs, ordered by ascending model index; empty when nothing
+                matches.
+        """
+
+        def _by_index(pattern: str) -> dict[int, Path]:
+            # Map the integer that follows "model_idx_" to the file carrying it.
+            indexed = {}
+            for path in directory.glob(pattern):
+                suffix = path.stem.split("model_idx_")[1]
+                if suffix.isdecimal():
+                    indexed[int(suffix)] = path
+            return indexed
+
+        cif_files = _by_index("pred.model_idx_*.cif")
+        npz_files = _by_index("scores.model_idx_*.npz")
+        # Keep only indices present on both sides, in a stable order.
+        common_indices = sorted(cif_files.keys() & npz_files.keys())
+        return [(cif_files[idx], npz_files[idx]) for idx in common_indices]
+    def predictions(self, output_dir: Path) -> dict[int, dict[str, Any]]:
+        """Collect the predictions found under ``output_dir``, keyed by model index.
+
+        The first file matching ``pred.model_idx_[0-9].cif`` anywhere below
+        ``output_dir`` designates the directory to read; only that directory
+        is then scanned. Each ``pred.model_idx_<n>.cif`` is paired with the
+        ``scores.model_idx_<n>.npz`` of the same index, and indices that lack
+        either file are dropped.
+
+        Args:
+            output_dir (Path): Directory searched recursively for predictions
+
+        Returns:
+            dict[int, dict[str, Any]]: For each model index (ascending), a dict
+                with "prediction_path" (path of the CIF file) and "metrics"
+                (the object returned by ``np.load`` for the score file).
+                Empty when no prediction file is found.
+        """
+        prediction = next(output_dir.rglob("pred.model_idx_[0-9].cif"), None)
+        if prediction is None:
+            return {}
+        run_dir = prediction.parent
+        # Index the structure files by the integer that follows "model_idx_".
+        cif_files = {
+            int(f.stem.split("model_idx_")[1]): f
+            for f in run_dir.glob("pred.model_idx_*.cif")
+        }
+        # Same indexing for the score files.
+        npz_files = {
+            int(f.stem.split("model_idx_")[1]): f
+            for f in run_dir.glob("scores.model_idx_*.npz")
+        }
+        # Keep only the indices that have both a structure and its scores.
+        common_indices = sorted(cif_files.keys() & npz_files.keys())
+        return {
+            idx: {"prediction_path": cif_files[idx], "metrics": np.load(npz_files[idx])}
+            for idx in common_indices
+        }
 class ProtenixModel(AF3Model):
     def predictions(self, output_dir: Path) -> dict[int, dict[str, Any]]:
+        """Collect the predictions found under ``output_dir``, keyed by model index.
+
+        Every file matching ``*_model_[0-9].cif`` below ``output_dir`` is a
+        prediction; its index is the last character of the file stem. The
+        pLDDT file attached to a prediction is ``plddt_<cif stem>.npz`` in the
+        same directory when it exists, so that each model gets its own scores.
+        Otherwise the first ``plddt_*.npz`` of that directory (in sorted
+        order) is used.
+
+        Args:
+            output_dir (Path): Directory searched recursively for predictions
+
+        Returns:
+            dict[int, dict[str, Any]]: For each model index, a dict with
+                "prediction_path" (path of the CIF file) and "metrics" (the
+                object returned by ``np.load`` for the pLDDT file). Empty
+                when no prediction file is found.
+
+        Raises:
+            IndexError: If a prediction has no ``plddt_*.npz`` file next to it
+        """
+        predictions = {}
+        for cif_path in output_dir.rglob("*_model_[0-9].cif"):
+            run_dir = cif_path.parent
+            # Prefer the scores named after this very structure: taking the
+            # first glob hit would hand the same file to every model.
+            plddt_path = run_dir / f"plddt_{cif_path.stem}.npz"
+            if not plddt_path.is_file():
+                # Fallback for other naming schemes; sorted so the choice is
+                # stable across runs.
+                plddt_path = sorted(run_dir.glob("plddt_*.npz"))[0]
+            predictions[int(cif_path.stem[-1])] = {
+                "prediction_path": cif_path,
+                "metrics": np.load(plddt_path),
+            }
+        return predictions
+def extract_plddt_from_cif(cif_path):
+    """Read one confidence value per residue from the CA atoms of an mmCIF file.
+
+    The file is parsed with ``MMCIFParser``. For each residue that has an atom
+    named "CA", the B-factor of that atom is collected; residues without a
+    "CA" atom are skipped. Every model and chain of the structure is walked
+    in order.
+
+    NOTE(review): the B-factor is taken to hold the pLDDT score, as this
+    function's name implies - confirm for each model's output files.
+
+    Args:
+        cif_path: Path of the mmCIF file to read
+
+    Returns:
+        list: B-factor of the CA atom of each residue, in traversal order
+    """
+    parsed = MMCIFParser().get_structure("structure", cif_path)
+    return [
+        residue["CA"].get_bfactor()
+        for model in parsed
+        for chain in model
+        for residue in chain
+        if "CA" in residue
+    ]
 def predict(sequence: str, api_key: str, model_type: FoldingModel) -> tuple[str, str]:
     Returns:
         tuple[str, str]: Tuple containing the path to the PDB file and the pLDDT plot
     """
+    if not api_key:
+        raise gr.Error("Missing API key, please enter a valid API key")
     # Set up unique output directory based on sequence hash
     seq_id, seq_file = _write_fasta_file(sequence)
     if not model.has_prediction(output_dir):
         raise gr.Error("No prediction found")
+    predictions = model.predictions(output_dir)
+    pdb_paths = []
+    model_plddt_vals = []
+    for model_idx, prediction in predictions.items():
+        cif_path = prediction["prediction_path"]
+        logger.info(
+            "CIF file: %s",
+        )
+        converted_pdb_path = str(
+            output_dir / f"{model.model_name}_prediction_{model_idx}.pdb"
+        )
+        convert_cif_to_pdb(str(cif_path), str(converted_pdb_path))
+        plddt_vals = extract_plddt_from_cif(cif_path)
+        pdb_paths.append(converted_pdb_path)
+        model_plddt_vals.append(plddt_vals)
+    plddt_plot = add_plddt_plot(
+        plddt_vals=model_plddt_vals, model_name=model.model_name
+    )
+    return pdb_paths, plddt_plot
+def align_structures(pdb_paths: list[str]) -> list[str]:
+    """Align multiple PDB structures to the first structure.
+
+    Every structure after the first is superimposed onto the first one using
+    the atoms named "CA" of both structures, then written next to its source
+    file as ``aligned_<original file name>``. The first structure is the
+    reference and its path is returned unchanged.
+
+    NOTE(review): ``Superimposer.set_atoms`` is given both CA lists as-is;
+    confirm that all compared structures expose the same number of CA atoms.
+
+    Args:
+        pdb_paths (list[str]): List of paths to PDB files to align
+
+    Returns:
+        list[str]: List of paths to aligned PDB files, in input order; empty
+            when ``pdb_paths`` is empty
+    """
+    # Nothing to align: without this guard the reference lookup below fails
+    # with an IndexError.
+    if not pdb_paths:
+        return []
+    parser = PDBParser()
+    io = PDBIO()
+    # Parse the reference structure (first one)
+    ref_structure = parser.get_structure("reference", pdb_paths[0])
+    ref_atoms = [atom for atom in ref_structure.get_atoms() if atom.get_name() == "CA"]
+    aligned_paths = [pdb_paths[0]]  # The reference needs no transformation
+    # Align each subsequent structure to the reference
+    for i, pdb_path in enumerate(pdb_paths[1:], start=1):
+        structure = parser.get_structure(f"model_{i}", pdb_path)
+        atoms = [atom for atom in structure.get_atoms() if atom.get_name() == "CA"]
+        # Fit the transformation on the CA atoms only...
+        sup = Superimposer()
+        sup.set_atoms(ref_atoms, atoms)
+        # ...then move every atom of the structure with it.
+        sup.apply(structure.get_atoms())
+        # Save the aligned structure beside the original file
+        aligned_path = str(Path(pdb_path).parent / f"aligned_{Path(pdb_path).name}")
+        io.set_structure(structure)
+        io.save(aligned_path)
+        aligned_paths.append(aligned_path)
+    return aligned_paths
+def predict_comparison(
+    sequence: str, api_key: str, model_types: list[FoldingModel]
+) -> list[str]:
+    """Predict a structure with each selected model and align the results.
+
+    Runs ``predict`` once per entry of ``model_types``, gathers every PDB path
+    returned, and superimposes them with ``align_structures`` so that they can
+    be shown together. The pLDDT plot returned by each ``predict`` call is
+    discarded.
+
+    Args:
+        sequence (str): Amino acid sequence to predict structure for
+        api_key (str): Folding API key
+        model_types (list[FoldingModel]): Folding models to compare
+
+    Returns:
+        list[str]: Paths to the aligned PDB files, as returned by
+            ``align_structures``
+
+    Raises:
+        gr.Error: If the API key is missing or no model is selected
+    """
+    if not api_key:
+        raise gr.Error("Missing API key, please enter a valid API key")
+    # An empty selection would produce no structure to align or display.
+    if not model_types:
+        raise gr.Error("No model selected, please select at least one model")
+    pdb_paths = []
+    for model_type in model_types:
+        # Only the structure paths are needed here; the plot is dropped.
+        model_pdb_paths, _ = predict(sequence, api_key, model_type)
+        pdb_paths.extend(model_pdb_paths)
+    return align_structures(pdb_paths)