|
import warnings |
|
warnings.filterwarnings("ignore") |
|
import os |
|
import re |
|
import sys |
|
import shutil |
|
import random |
|
import subprocess |
|
import torch |
|
import numpy as np |
|
import pandas as pd |
|
import MDAnalysis as mda |
|
from typing import Optional |
|
|
|
from pathlib import Path |
|
from tempfile import NamedTemporaryFile |
|
|
|
import huggingface_hub |
|
from huggingface_hub.utils import GatedRepoError |
|
from huggingface_hub import get_hf_file_metadata, hf_hub_download, login |
|
|
|
import spaces |
|
import gradio as gr |
|
|
|
# Torch device that the ESM-2 model and the tokenized sequence tensors are moved to.
DEVICE = torch.device('cpu')
# Git URL of the Dyna-1 source repository (not referenced elsewhere in the visible code).
REPO_URL = "https://github.com/WaymentSteeleLab/Dyna-1.git"
# HuggingFace Hub repo id that the Dyna-1 weight files are downloaded from.
DYNA_MODEL_ID = "gelnesr/Dyna-1"
|
|
|
def setup_environment():
    """Ensure the Dyna-1 weight files are present and return the Dyna-1 directory.

    Each expected file that is missing under ``Dyna-1/model/weights/``
    (relative to the current working directory) is downloaded from the
    HuggingFace repo ``DYNA_MODEL_ID``. A failed download is reported on
    stdout and does not abort the remaining downloads.

    Returns:
        Path: ``<current working directory>/Dyna-1``.
    """
    dyna1_dir = Path(os.getcwd()) / "Dyna-1"
    required_files = ["dyna1.pt", "dyna1-esm2.pt", "config.json"]

    for filename in required_files:
        # Nothing to do for files that are already on disk.
        if os.path.exists(f'Dyna-1/model/weights/{filename}'):
            continue

        print(f"Downloading {filename} from HuggingFace...")
        try:
            hf_hub_download(
                repo_id=DYNA_MODEL_ID,
                filename=filename,
                repo_type='model',
                local_dir=f'{dyna1_dir}/model/weights/',
            )
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error downloading {filename}: {str(e)}")
        else:
            print(f"Successfully downloaded {filename}")

    return dyna1_dir
|
|
|
# Runs at import time: downloads any missing weight files and yields the Dyna-1 directory.
dyna1_dir = setup_environment()
# Put that directory first on sys.path; the imports that follow (`model.model`, `utils`)
# are presumably resolved from it — confirm against the repository layout.
sys.path.insert(0, str(dyna1_dir))
|
|
|
from model.model import ESM_model |
|
from esm.sdk.api import ESMProtein |
|
from esm.utils.structure.protein_chain import ProteinChain |
|
from transformers import AutoTokenizer |
|
import utils |
|
|
|
def check_permissions(token: Optional[str] = None) -> None:
    """Check that the given HuggingFace token can access the gated ESM-3 weights.

    Args:
        token: HuggingFace access token of the logged-in user.

    Raises:
        gr.Error: If no token is supplied, or if the Hub reports the
            ``EvolutionaryScale/esm3-sm-open-v1`` repository as gated for
            this token.
    """
    if token is None:
        raise gr.Error("Please log in to use this Space")

    url = huggingface_hub.hf_hub_url(
        repo_id="EvolutionaryScale/esm3-sm-open-v1",
        repo_type='model',
        filename="config.json",
    )
    try:
        # Pass the user's token explicitly so the check reflects *their*
        # access rather than whatever credentials the process itself holds.
        get_hf_file_metadata(url=url, token=token)
    except GatedRepoError as err:
        raise gr.Error("You must have access to ... to run this Space. Please go through the gating process and come back.") from err
|
|
|
# The 20 standard amino-acid one-letter codes, matched case-insensitively.
# re.ASCII keeps IGNORECASE from accepting non-ASCII look-alikes (e.g. the Kelvin sign for "k").
_PROTEIN_SEQUENCE_RE = re.compile(r'[acdefghiklmnpqrstvwy]*', re.I | re.ASCII)


def validate_sequence(sequence):
    """Validate a protein sequence and return it upper-cased.

    Leading and trailing whitespace (for example a newline left over from
    pasting) is removed before validation, so it can never end up in the
    returned sequence.

    Args:
        sequence: Raw sequence text, or ``None``.

    Returns:
        The upper-cased sequence, or ``None`` when the input is empty or
        contains only whitespace.

    Raises:
        gr.Error: If the sequence contains anything other than the 20
            standard amino-acid letters.
    """
    if not sequence:
        return None

    cleaned = sequence.strip()
    if not cleaned:
        return None

    # fullmatch (rather than ``^...$``) so a trailing newline cannot slip through.
    if _PROTEIN_SEQUENCE_RE.fullmatch(cleaned) is None:
        raise gr.Error('Invalid protein sequence. Please use standard amino acid letters.')
    return cleaned.upper()
|
|
|
def process_structure(pdb_input, chain_id='A'):
    """Load one chain of a structure and wrap it as an ``ESMProtein``.

    Args:
        pdb_input: One of
            * a 4-character string, treated as a PDB ID and fetched via
              ``ProteinChain.from_rcsb``;
            * an object with a ``name`` attribute, treated as a path to a
              PDB file on disk;
            * any other ``str`` / bytes value, treated as raw PDB content.
        chain_id: Chain to extract from the structure.

    Returns:
        ``(protein, protein_chain)``, or ``(None, None)`` when ``pdb_input``
        is empty.

    Raises:
        gr.Error: If the structure cannot be fetched or parsed.
    """
    if not pdb_input:
        return None, None

    if isinstance(pdb_input, str) and len(pdb_input) == 4:
        try:
            protein_chain = ProteinChain.from_rcsb(pdb_input.upper(), chain_id=chain_id)
        except Exception as e:
            raise gr.Error(f"Error fetching PDB {pdb_input}: {str(e)}") from e
    else:
        temp_path = None
        try:
            if hasattr(pdb_input, 'name'):
                with open(pdb_input.name, 'rb') as f:
                    pdb_content = f.read()
            else:
                pdb_content = pdb_input.encode() if isinstance(pdb_input, str) else pdb_input

            # The content is copied to a scratch ``.pdb`` file that is handed
            # to the parser by path; the ``with`` block guarantees the handle
            # is closed before parsing, even if the write fails.
            with NamedTemporaryFile(suffix='.pdb', delete=False) as temp_pdb:
                temp_path = temp_pdb.name
                temp_pdb.write(pdb_content)

            protein_chain = ProteinChain.from_pdb(temp_path, chain_id=chain_id)
        except Exception as e:
            raise gr.Error(f"Error processing PDB file: {str(e)}") from e
        finally:
            # Single cleanup point for both the success and the failure path.
            if temp_path is not None and os.path.exists(temp_path):
                os.unlink(temp_path)

    protein = ESMProtein.from_protein_chain(protein_chain)
    return protein, protein_chain
|
|
|
def write_probabilities_to_pdb(protein, probabilities, output_path):
    """Write probabilities to PDB B-factors and save the file.

    The protein is first written to a scratch PDB file, re-read with
    MDAnalysis, and every atom of the i-th protein residue receives the
    i-th probability as its temperature factor before the selection is
    written to ``output_path``.

    Args:
        protein: Object exposing ``to_pdb(path)``.
        probabilities: Per-residue values, in residue order. Pairing uses
            ``zip``, so any surplus residues or values are ignored.
        output_path: Destination path of the annotated PDB file.

    Returns:
        ``output_path``.
    """
    # Only a unique path is needed here: ``protein.to_pdb`` writes the file
    # itself, so our own handle is closed right away instead of leaking.
    with NamedTemporaryFile(suffix='.pdb', delete=False) as temp_pdb:
        temp_path = temp_pdb.name

    try:
        protein.to_pdb(temp_path)

        curr = mda.Universe(temp_path)
        curr.add_TopologyAttr('bfactors')
        protein_out = curr.select_atoms("protein")

        for residue, prob in zip(protein_out.residues, probabilities):
            for atom in residue.atoms:
                atom.tempfactor = prob

        protein_out.write(output_path)
    finally:
        # Remove the scratch file whether or not any step above failed.
        if os.path.exists(temp_path):
            os.unlink(temp_path)

    return output_path
|
|
|
def _safe_filename_stem(raw):
    """Reduce arbitrary text to a stem that is safe to use as a file name."""
    # Anything that is not a word character, dot, hyphen or space (notably
    # path separators) collapses to "_"; surrounding spaces/dots are dropped
    # so the result cannot be blank, hidden, or a relative path component.
    cleaned = re.sub(r'[^\w.\- ]+', '_', str(raw))
    return cleaned.strip(' .')


def handle_name(name=None, pdb_input=None, model_version="ESM3"):
    """Processes the output file name given inputs of name and pdb; otherwise generates a random number

    Precedence for the stem: the explicit ``name``; else a 4-character PDB
    ID; else the stem of an uploaded file's ``name``; else a random integer
    in ``[0, 100000]``. Because the result is used to build output file
    paths, the stem is sanitized so user input cannot contain path
    separators; a stem that sanitizes to nothing falls back to the random
    number.

    Args:
        name: Optional user-provided job name.
        pdb_input: Optional PDB ID string or uploaded-file object.
        model_version: ``"ESM3"`` or anything else (treated as ESM-2).

    Returns:
        ``"<stem>-Dyna1"`` for ESM3, ``"<stem>-Dyna1-ESM2"`` otherwise.
    """
    candidate = None
    if name:
        candidate = name
    elif pdb_input:
        if isinstance(pdb_input, str) and len(pdb_input) == 4:
            candidate = pdb_input
        elif hasattr(pdb_input, 'name'):
            candidate = Path(pdb_input.name).stem

    pdb_name = _safe_filename_stem(candidate) if candidate else ''
    if not pdb_name:
        pdb_name = str(random.randint(0, 100000))

    return f'{pdb_name}-Dyna1{"" if model_version == "ESM3" else "-ESM2"}'
|
|
|
@spaces.GPU(duration=50)
def run_model(model, model_version='ESM2', seq_input=None, struct_input=None, sequence_id=None):
    """Run one forward pass of the Dyna-1 model and return detached CPU logits.

    Args:
        model: Callable model. For ``"ESM3"`` it is called as
            ``model((seq_input, struct_input), sequence_id)``, otherwise as
            ``model(seq_input, sequence_id)``.
        model_version: ``"ESM3"`` selects the sequence+structure call form.
        seq_input: Sequence token tensor, or ``None``.
        struct_input: Structure token tensor (ESM3 only), or ``None``.
        sequence_id: Mask tensor forwarded to the model unchanged.

    Returns:
        The model output moved to CPU and detached from the autograd graph.
    """
    # Inference only: skip autograd bookkeeping so no graph is built or kept.
    with torch.no_grad():
        if model_version == "ESM3":
            logits = model((seq_input, struct_input), sequence_id)
        else:
            logits = model(seq_input, sequence_id)
    return logits.cpu().detach()
|
|
|
def _load_dyna1_model(model_version):
    """Instantiate the Dyna-1 model for ``model_version``, load its weights, and set eval mode."""
    if model_version == "ESM3":
        model = ESM_model(method='esm3')
        weights_path = 'Dyna-1/model/weights/dyna1.pt'
    else:
        model = ESM_model(method='esm2', nheads=8, nlayers=12, layer=30).to(DEVICE)
        weights_path = 'Dyna-1/model/weights/dyna1-esm2.pt'
    model.load_state_dict(torch.load(weights_path, map_location=torch.device('cpu')), strict=False)
    model.eval()
    return model


def predict_dynamics(sequence=None, pdb_input=None, chain_id='A', use_pdb_seq=False, model_version="ESM3", name=None, oauth_token: Optional[str] = None):
    """Predict per-residue probabilities and write them to output files.

    Args:
        sequence: Optional protein sequence (required for ``"ESM2"``).
        pdb_input: Optional PDB ID or uploaded file; only used for ``"ESM3"``.
        chain_id: Chain to read from the structure.
        use_pdb_seq: For ESM3 with a structure, feed the structure's own
            sequence tokens to the model (ignored when ``sequence`` is given,
            which then takes over as the sequence input).
        model_version: ``"ESM3"`` or ``"ESM2"``.
        name: Optional job name used for the output file names.
        oauth_token: Accepted for interface compatibility; not used here.

    Returns:
        ``(csv_path, pdb_path)``. ``pdb_path`` is ``None`` unless a structure
        was processed with ESM3.

    Raises:
        gr.Error: For invalid input, or any failure while loading, running,
            or saving.
    """
    try:
        if model_version == "ESM2" and not sequence:
            raise ValueError("ESM-2 model requires a sequence input. Please provide a protein sequence.")

        if model_version == "ESM3" and not (sequence or pdb_input):
            raise ValueError("ESM-3 model requires either a sequence, structure (PDB ID/file), or both. Please provide at least one input.")

        base_name = handle_name(name, pdb_input, model_version)

        seq_input, struct_input = None, None
        sequence = validate_sequence(sequence) if sequence else None
        protein = None
        pdb_seq = None
        use_structure = bool(pdb_input) and model_version == "ESM3"

        if not (sequence or use_structure):
            raise ValueError('Please provide a sequence' + (' or structure input' if model_version == "ESM3" else ''))

        # Fetch and check the structure *before* loading the model so that a
        # bad PDB ID or a length mismatch fails fast and cheaply.
        if use_structure:
            protein, _ = process_structure(pdb_input, chain_id)
            pdb_seq = protein.sequence
            if sequence and len(pdb_seq) != len(sequence):
                raise ValueError('Length of provided sequence does not match length of structure input.')

        model = _load_dyna1_model(model_version)

        if use_structure:
            encoder = model.model.encode(protein)
            # [1:-1] drops the first and last token of each track
            # (presumably BOS/EOS special tokens — confirm).
            struct_input = encoder.structure[1:-1].unsqueeze(0)
            seq_input = encoder.sequence[1:-1].unsqueeze(0)
            # 4099 is presumably a padding token id — confirm against the model code.
            sequence_id = seq_input != 4099

            if not use_pdb_seq:
                seq_input = None

        if sequence:
            tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
            token_seq = tokenizer.encode(sequence, add_special_tokens=False, return_tensors='np')
            seq_input = torch.from_numpy(token_seq).to(DEVICE)
            sequence_id = seq_input != 4099

        logits = run_model(model, model_version, seq_input, struct_input, sequence_id)

        probabilities = utils.prob_adjusted(logits).numpy()

        # A user-supplied sequence wins; otherwise the structure's sequence is reported.
        seq_to_use = sequence if sequence else pdb_seq
        results_df = pd.DataFrame({
            'position': np.arange(1, len(probabilities) + 1),
            'residue': np.array(list(seq_to_use)),
            'p_exchange': probabilities,
        })

        csv_output = f"{base_name}.csv"
        pdb_output = None
        started = []  # output paths this call began writing, for cleanup on failure
        try:
            # Write straight to the final paths: renaming a file out of the
            # temp directory can fail when it lives on another filesystem.
            started.append(csv_output)
            results_df.to_csv(csv_output, index=False)

            if protein is not None and model_version == "ESM3":
                pdb_path = f"{base_name}.pdb"
                started.append(pdb_path)
                pdb_output = write_probabilities_to_pdb(protein, probabilities, pdb_path)
        except Exception as e:
            for path in started:
                if os.path.exists(path):
                    os.unlink(path)
            raise gr.Error(f"Error saving output files: {str(e)}") from e

        return csv_output, pdb_output

    except gr.Error:
        # Already user-facing; do not wrap it a second time.
        raise
    except Exception as e:
        raise gr.Error(str(e)) from e
|
|
|
# Custom stylesheet for the app (font stack, tab margins, gap spacing).
# NOTE(review): `css` is not passed to `gr.Blocks(...)` below and is not referenced
# anywhere else in the visible code, so it appears to be unused — confirm whether
# `css=css` was intended.
css = """
.gradio-container {
    font-family: 'Inter', system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
}
.tabs {
    margin-top: 0;
    margin-bottom: 0;
}
.gap {
    gap: 1rem;
}
"""

# Top-level Blocks container of the app, created with the Soft theme.
dyna1_app = gr.Blocks(theme=gr.themes.Soft(), mode="light")
|
|
|
# Build the UI tree and wire its events inside the Blocks context.
with dyna1_app:

    # --- Header: title, tagline and project links ---
    gr.Markdown("# Dyna-1")
    gr.Markdown("## Predict micro-millisecond protein dynamics from sequence and/or structure")
    gr.Markdown("""[Paper](https://www.biorxiv.org/content/10.1101/2025.03.19.642801v1) |
    [GitHub](https://github.com/WaymentSteeleLab/Dyna-1) |
    [Model](https://huggingface.co/gelnesr/Dyna-1) |
    [Datasets](https://huggingface.co/datasets/gelnesr/RelaxDB) |
    [Colab](https://colab.research.google.com/github/WaymentSteeleLab/Dyna-1/blob/main/colab/Dyna_1.ipynb)""")

    gr.Markdown("""
    Dyna-1 predicts the probability that each residue experiences micro-millisecond motions.
    You can provide either a protein sequence, a structure (PDB ID or file), or both for the best performance.
    """)
    # Instructions text next to the project image.
    with gr.Row():
        gr.Markdown("""
        ## Instructions
        - Authorize access to ESM-3 by logging in to HuggingFace (required for ESM-3)
        - Enter a protein sequence using standard amino acid letters (optional)
        - Provide a PDB ID (e.g., "1ubq") or upload a PDB file (optional)
        - Specify the chain ID if using a structure (default: A)
        - Choose whether to use the sequence from the PDB structure

        You can toggle between using the ESM-3 and ESM-2 versions of the Dyna-1 model. To run with ESM-3, make sure you already
        have access to the `EvolutionaryScale/esm3-sm-open-v1` weights [here](https://huggingface.co/EvolutionaryScale/esm3-sm-open-v1).

        Note: The model will automatically set up the required environment on first run.

        Use of this HF Space is subject to a [Non-Commercial Use License](https://github.com/WaymentSteeleLab/Dyna-1/blob/main/LICENSE.txt).
        """)
        gr.Image(f"assets/dyna1.png", show_label=False)

    # --- Login and model selection ---
    gr.LoginButton()
    model_choice = gr.Dropdown(
        choices=["ESM3", "ESM2"],
        value="ESM3",
        label="Choose model version"
    )

    with gr.Tabs() as tabs:
        with gr.Tab("Input"):
            # ESM-3 inputs (shown by default): sequence and/or structure.
            with gr.Column(visible=True) as esm3_inputs:
                name_input = gr.Text(
                    label="Job Name (optional)",
                    placeholder="Enter name for the job. This will specify the output files. Leave blank to use PDB ID or a random number"
                )
                sequence_input_esm3 = gr.Textbox(
                    label="Protein Sequence",
                    placeholder="Enter protein sequence using standard amino acid letters",
                    lines=1
                )
                pdb_id = gr.Text(
                    label="PDB ID",
                    placeholder="Enter 4-letter PDB ID (e.g. 1UBQ)"
                )
                use_pdb_seq = gr.Checkbox(
                    label="Use sequence from PDB",
                    value=False
                )
                pdb_file = gr.File(
                    label="Or upload PDB file",
                    file_count="single"
                )
                chain_id = gr.Text(
                    label="Chain ID",
                    value="A",
                    placeholder="Enter chain ID"
                )
                submit_btn_esm3 = gr.Button("Predict", variant="primary")

            # ESM-2 inputs (hidden by default): sequence only.
            with gr.Column(visible=False) as esm2_inputs:
                name_input_esm2 = gr.Text(
                    label="Output Name (optional)",
                    placeholder="Enter name for the job. Leave blank to use a random number"
                )
                sequence_input_esm2 = gr.Textbox(
                    label="Protein Sequence",
                    placeholder="Enter protein sequence using standard amino acid letters",
                    lines=1
                )
                submit_btn_esm2 = gr.Button("Predict", variant="primary")

            # Example inputs, shown only together with the ESM-3 form.
            # NOTE(review): no click/select handler is registered for `examples` in the
            # visible code, and the samples use "-" as a placeholder value, so selecting a
            # sample may not populate the inputs — confirm.
            with gr.Row(visible=True) as examples_esm3:
                label = gr.Textbox(label="Label", visible=False)

                examples = gr.Dataset(
                    components=[label, sequence_input_esm3, pdb_id, chain_id, use_pdb_seq],
                    samples=[
                        ["Structure and its sequence", "-", "1ubq", "A", True],
                        ["Structure and unique sequence", "MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG", "1ubq", "A", False],
                        ["Sequence only", "MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG", "-", "-", False],
                        ["Structure only", "-", "1ubq", "A", False]
                    ],
                    label="Examples"
                )

        with gr.Tab("Results"):
            # ESM-3 produces a CSV and an annotated PDB; ESM-2 produces a CSV only.
            with gr.Row(visible=True) as results_esm3:
                csv_output_esm3 = gr.File(label="Download Results (.csv)")
                pdb_output_esm3 = gr.File(label="Download PDB")
            with gr.Row(visible=False) as results_esm2:
                csv_output_esm2 = gr.File(label="Download Results (.csv)")

    def toggle_model_inputs(choice):
        """Return visibility updates for the model-specific containers.

        The five updates are ordered to match the `outputs` list of
        `model_choice.change` below: ESM-3 inputs, ESM-2 inputs, ESM-3
        results, ESM-2 results, ESM-3 examples.
        """
        if choice == "ESM3":
            return gr.update(visible=True), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)
        else:
            return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)

    # Swap the visible form / results / examples whenever the model selection changes.
    model_choice.change(
        fn=toggle_model_inputs,
        inputs=model_choice,
        outputs=[esm3_inputs, esm2_inputs, results_esm3, results_esm2, examples_esm3]
    )

    def predict_esm3(name, sequence, pdb_id, pdb_file, chain_id, use_pdb_seq, oauth_token: gr.OAuthToken | None = None):
        """Click handler for the ESM-3 form.

        Requires a logged-in user, checks access to the gated ESM-3 weights
        via `check_permissions`, then runs `predict_dynamics` with the PDB
        ID taking precedence over an uploaded file.

        `oauth_token` is not listed in the click `inputs`; it is presumably
        injected by Gradio from the `gr.OAuthToken` annotation — confirm.

        Returns:
            `[csv_path, pdb_path]` for the two ESM-3 result components.

        Raises:
            gr.Error: If the user is not logged in (other errors come from
                the called functions).
        """
        if oauth_token is None:
            raise gr.Error("Please log in to use this Space")
        token_value = oauth_token.token
        check_permissions(token_value)

        csv_output, pdb_output = predict_dynamics(
            sequence=sequence,
            pdb_input=pdb_id if pdb_id else pdb_file,
            chain_id=chain_id,
            use_pdb_seq=use_pdb_seq,
            model_version="ESM3",
            name=name,
            oauth_token=token_value
        )
        return [csv_output, pdb_output]

    def predict_esm2(name, sequence):
        """Click handler for the ESM-2 form: sequence-only prediction.

        Returns:
            `[csv_path]` for the ESM-2 result component; the PDB slot
            returned by `predict_dynamics` is discarded.
        """
        csv_output, _ = predict_dynamics(
            sequence=sequence,
            pdb_input=None,
            chain_id=None,
            use_pdb_seq=False,
            model_version="ESM2",
            name=name
        )
        return [csv_output]

    # Wire each form's Predict button to its handler and result components.
    submit_btn_esm3.click(
        fn=predict_esm3,
        inputs=[name_input, sequence_input_esm3, pdb_id, pdb_file, chain_id, use_pdb_seq],
        outputs=[csv_output_esm3, pdb_output_esm3]
    )

    submit_btn_esm2.click(
        fn=predict_esm2,
        inputs=[name_input_esm2, sequence_input_esm2],
        outputs=[csv_output_esm2]
    )

    # --- Footer: credits and citation ---
    gr.Markdown("""
    ---
    This HuggingFace Space was created by Gina El Nesr [@ginaelnesr](https://twitter.com/ginaelnesr).
    """)

    gr.Markdown("""If you are using our code, datasets, or model, please use the following citation:
    ```bibtex
    @article {Dyna-1,
        author = {Wayment-Steele, Hannah K. and El Nesr, Gina and Hettiarachchi, Ramith and Kariyawasam, Hasindu and Ovchinnikov, Sergey and Kern, Dorothee},
        title = {Learning millisecond protein dynamics from what is missing in NMR spectra},
        year = {2025},
        doi = {10.1101/2025.03.19.642801},
        journal = {bioRxiv}
    }
    ```
    """)
|
|
|
# Start the Gradio server only when this file is executed as a script (not on import).
if __name__ == "__main__":
    dyna1_app.launch(
        share=True
    )
|
|