Spaces:

ThorbenF
/

test_webpage

Sleeping

App Files Community

ThorbenF committed on Dec 3, 2024

Commit

11bcc1a

1 Parent(s): 01ff8b6

Update

Browse files

Files changed (4) hide show

.ipynb_checkpoints/app-checkpoint.py +96 -58
.ipynb_checkpoints/requirements-checkpoint.txt +2 -1
app.py +96 -58
requirements.txt +2 -1

.ipynb_checkpoints/app-checkpoint.py CHANGED Viewed

@@ -4,7 +4,6 @@ from model_loader import load_model
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from torch.utils.data import DataLoader
 import re
@@ -14,53 +13,25 @@ import pandas as pd
 import copy
 import transformers, datasets
-from transformers.modeling_outputs import TokenClassifierOutput
-from transformers.models.t5.modeling_t5 import T5Config, T5PreTrainedModel, T5Stack
-from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
-from transformers import T5EncoderModel, T5Tokenizer
-from transformers.models.esm.modeling_esm import EsmPreTrainedModel, EsmModel
 from transformers import AutoTokenizer
-from transformers import TrainingArguments, Trainer, set_seed
 from transformers import DataCollatorForTokenClassification
-from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple, Union
-# for custom DataCollator
-from transformers.data.data_collator import DataCollatorMixin
-from transformers.tokenization_utils_base import PreTrainedTokenizerBase
-from transformers.utils import PaddingStrategy
 from datasets import Dataset
 from scipy.special import expit
 import requests
-from gradio_molecule3d import Molecule3D
-#import peft
-#from peft import get_peft_config, PeftModel, PeftConfig, inject_adapter_in_model, LoraConfig
 # Configuration
 checkpoint = 'ThorbenF/prot_t5_xl_uniref50'
 max_length = 1500
-# Default representations for molecule rendering
-reps = [
-    {
-        "model": 0,
-        "chain": "",
-        "resname": "",
-        "style": "cartoon",
-        "color": "spectrum",
-        "residue_range": "",
-        "around": 0,
-        "byres": False,
-        "visible": True
-    }
-]
 # Load model and move to device
 model, tokenizer = load_model(checkpoint, max_length)
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -142,9 +113,7 @@ def predict_protein_sequence(test_one_letter_sequence):
     normalized_scores = normalize_scores(logits)
     test_one_letter_sequence = test_one_letter_sequence.replace(" ", "")
-    result_str = "\n".join([f"{aa}: {score:.2f}" for aa, score in zip(test_one_letter_sequence, normalized_scores)])
-    return result_str
 def fetch_pdb(pdb_id):
     try:
@@ -169,14 +138,88 @@ def fetch_pdb(pdb_id):
         print(f"Error fetching PDB: {e}")
         return None
-def process_input(sequence, pdb_id):
-    # Predict binding sites
-    binding_site_predictions = predict_protein_sequence(sequence)
     # Fetch PDB file
     pdb_path = fetch_pdb(pdb_id)
-    return binding_site_predictions, pdb_path
 # Create Gradio interface
 with gr.Blocks() as demo:
@@ -184,18 +227,11 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
-            # Sequence input
-            sequence_input = gr.Textbox(
-                lines=2,
-                placeholder="Enter protein sequence here...",
-                label="Protein Sequence"
-            )
-            # PDB ID input
             pdb_input = gr.Textbox(
-                lines=1,
-                placeholder="Enter PDB ID here...",
-                label="PDB ID for 3D Visualization"
             )
             # Predict button
@@ -210,24 +246,26 @@ with gr.Blocks() as demo:
             # 3D Molecule visualization
             molecule_output = Molecule3D(
                 label="Protein Structure",
-                reps=reps
             )
     # Prediction logic
     predict_btn.click(
-        process_input,
-        inputs=[sequence_input, pdb_input],
-        outputs=[predictions_output, molecule_output]
     )
     # Add some example inputs
     gr.Markdown("## Examples")
     gr.Examples(
         examples=[
-            ["MKVLWAALLVTFLAGCQAKVEQAVETEPEPELRQQTEWQSGQRWELALGRFWDYLRWVQTLSEQVQEELLSSQVTQELRALMDETMKELKAYKSELEEQLTPVAEETRARLSKELQAAQARLGADMEDVCGRLVQYRGEVQAMLGQSTEELRVRLASHLRKLRKRLLRDADDLQKRLAVYQAGAREGAERGLSAIRERLGPLVEQGRVRAATVGSLAGQPLQERAQAWGERLRARMEEMGSRTRDRLDEVKEQVAEVRAKLEEQAQQRL", "1ABC"],
         ],
-        inputs=[sequence_input, pdb_input],
-        outputs=[predictions_output, molecule_output]
     )
 demo.launch()

 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.utils.data import DataLoader
 import re
 import copy
 import transformers, datasets
 from transformers import AutoTokenizer
 from transformers import DataCollatorForTokenClassification
 from datasets import Dataset
 from scipy.special import expit
 import requests
+# Biopython imports
+from Bio.PDB import PDBParser, Select
+from Bio.PDB.DSSP import DSSP
+from gradio_molecule3d import Molecule3D
 # Configuration
 checkpoint = 'ThorbenF/prot_t5_xl_uniref50'
 max_length = 1500
 # Load model and move to device
 model, tokenizer = load_model(checkpoint, max_length)
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     normalized_scores = normalize_scores(logits)
     test_one_letter_sequence = test_one_letter_sequence.replace(" ", "")
+    return test_one_letter_sequence, normalized_scores
 def fetch_pdb(pdb_id):
     try:
         print(f"Error fetching PDB: {e}")
         return None
+def extract_protein_sequence(pdb_path):
+    """
+    Extract the longest protein sequence from a PDB file
+    """
+    parser = PDBParser(QUIET=1)
+    structure = parser.get_structure('protein', pdb_path)
+    class ProteinSelect(Select):
+        def accept_residue(self, residue):
+            # Only accept standard amino acids
+            standard_aa = set('ACDEFGHIKLMNPQRSTVWY')
+            return residue.get_resname() in standard_aa
+    # Find the longest protein chain
+    longest_sequence = ""
+    longest_chain = None
+    for model in structure:
+        for chain in model:
+            sequence = ""
+            for residue in chain:
+                if Select().accept_residue(residue):
+                    sequence += residue.get_resname()
+            # Convert 3-letter amino acid codes to 1-letter
+            aa_dict = {
+                'ALA':'A', 'CYS':'C', 'ASP':'D', 'GLU':'E', 'PHE':'F',
+                'GLY':'G', 'HIS':'H', 'ILE':'I', 'LYS':'K', 'LEU':'L',
+                'MET':'M', 'ASN':'N', 'PRO':'P', 'GLN':'Q', 'ARG':'R',
+                'SER':'S', 'THR':'T', 'VAL':'V', 'TRP':'W', 'TYR':'Y'
+            }
+            one_letter_sequence = ''.join([aa_dict.get(res, 'X') for res in sequence])
+            # Track the longest sequence
+            if len(one_letter_sequence) > len(longest_sequence) and \
+               10 < len(one_letter_sequence) < 1500:
+                longest_sequence = one_letter_sequence
+                longest_chain = chain
+    return longest_sequence, longest_chain
+def process_pdb(pdb_id):
     # Fetch PDB file
     pdb_path = fetch_pdb(pdb_id)
+    if not pdb_path:
+        return "Failed to fetch PDB file", None, None
+    # Extract protein sequence and chain
+    protein_sequence, chain = extract_protein_sequence(pdb_path)
+    if not protein_sequence:
+        return "No suitable protein sequence found", None, None
+    # Predict binding sites
+    sequence, normalized_scores = predict_protein_sequence(protein_sequence)
+    # Prepare representations for coloring residues
+    reps = []
+    for i, (res, score) in enumerate(zip(sequence, normalized_scores), start=1):
+        # Map score to a color gradient from blue (low) to red (high)
+        color_intensity = int(score * 255)
+        color = f'rgb({color_intensity}, 0, {255-color_intensity})'
+        rep = {
+            "model": 0,
+            "chain": chain.id,
+            "resname": res,
+            "resnum": i,
+            "style": "cartoon",
+            "color": color,
+            "residue_range": f"{i}-{i}",
+            "around": 0,
+            "byres": True,
+            "visible": True
+        }
+        reps.append(rep)
+    # Prepare result string
+    result_str = "\n".join([f"{aa}: {score:.2f}" for aa, score in zip(sequence, normalized_scores)])
+    return result_str, reps, pdb_path
 # Create Gradio interface
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
+            # PDB ID input with default suggestion
             pdb_input = gr.Textbox(
+                value="2IWI",
+                label="PDB ID",
+                placeholder="Enter PDB ID here..."
             )
             # Predict button
             # 3D Molecule visualization
             molecule_output = Molecule3D(
                 label="Protein Structure",
+                reps=[]  # Start with empty representations
             )
     # Prediction logic
     predict_btn.click(
+        process_pdb,
+        inputs=[pdb_input],
+        outputs=[predictions_output, molecule_output, molecule_output]
     )
     # Add some example inputs
     gr.Markdown("## Examples")
     gr.Examples(
         examples=[
+            ["2IWI"],
+            ["1ABC"],
+            ["4HHB"]
         ],
+        inputs=[pdb_input],
+        outputs=[predictions_output, molecule_output, molecule_output]
     )
 demo.launch()

.ipynb_checkpoints/requirements-checkpoint.txt CHANGED Viewed

@@ -9,4 +9,5 @@ scikit-learn>=0.24.0
 sentencepiece
 huggingface_hub>=0.15.0
 requests
-gradio_molecule3d

 sentencepiece
 huggingface_hub>=0.15.0
 requests
+gradio_molecule3d
+biopython>=1.81

app.py CHANGED Viewed

@@ -4,7 +4,6 @@ from model_loader import load_model
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from torch.utils.data import DataLoader
 import re
@@ -14,53 +13,25 @@ import pandas as pd
 import copy
 import transformers, datasets
-from transformers.modeling_outputs import TokenClassifierOutput
-from transformers.models.t5.modeling_t5 import T5Config, T5PreTrainedModel, T5Stack
-from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
-from transformers import T5EncoderModel, T5Tokenizer
-from transformers.models.esm.modeling_esm import EsmPreTrainedModel, EsmModel
 from transformers import AutoTokenizer
-from transformers import TrainingArguments, Trainer, set_seed
 from transformers import DataCollatorForTokenClassification
-from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple, Union
-# for custom DataCollator
-from transformers.data.data_collator import DataCollatorMixin
-from transformers.tokenization_utils_base import PreTrainedTokenizerBase
-from transformers.utils import PaddingStrategy
 from datasets import Dataset
 from scipy.special import expit
 import requests
-from gradio_molecule3d import Molecule3D
-#import peft
-#from peft import get_peft_config, PeftModel, PeftConfig, inject_adapter_in_model, LoraConfig
 # Configuration
 checkpoint = 'ThorbenF/prot_t5_xl_uniref50'
 max_length = 1500
-# Default representations for molecule rendering
-reps = [
-    {
-        "model": 0,
-        "chain": "",
-        "resname": "",
-        "style": "cartoon",
-        "color": "spectrum",
-        "residue_range": "",
-        "around": 0,
-        "byres": False,
-        "visible": True
-    }
-]
 # Load model and move to device
 model, tokenizer = load_model(checkpoint, max_length)
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -142,9 +113,7 @@ def predict_protein_sequence(test_one_letter_sequence):
     normalized_scores = normalize_scores(logits)
     test_one_letter_sequence = test_one_letter_sequence.replace(" ", "")
-    result_str = "\n".join([f"{aa}: {score:.2f}" for aa, score in zip(test_one_letter_sequence, normalized_scores)])
-    return result_str
 def fetch_pdb(pdb_id):
     try:
@@ -169,14 +138,88 @@ def fetch_pdb(pdb_id):
         print(f"Error fetching PDB: {e}")
         return None
-def process_input(sequence, pdb_id):
-    # Predict binding sites
-    binding_site_predictions = predict_protein_sequence(sequence)
     # Fetch PDB file
     pdb_path = fetch_pdb(pdb_id)
-    return binding_site_predictions, pdb_path
 # Create Gradio interface
 with gr.Blocks() as demo:
@@ -184,18 +227,11 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
-            # Sequence input
-            sequence_input = gr.Textbox(
-                lines=2,
-                placeholder="Enter protein sequence here...",
-                label="Protein Sequence"
-            )
-            # PDB ID input
             pdb_input = gr.Textbox(
-                lines=1,
-                placeholder="Enter PDB ID here...",
-                label="PDB ID for 3D Visualization"
             )
             # Predict button
@@ -210,24 +246,26 @@ with gr.Blocks() as demo:
             # 3D Molecule visualization
             molecule_output = Molecule3D(
                 label="Protein Structure",
-                reps=reps
             )
     # Prediction logic
     predict_btn.click(
-        process_input,
-        inputs=[sequence_input, pdb_input],
-        outputs=[predictions_output, molecule_output]
     )
     # Add some example inputs
     gr.Markdown("## Examples")
     gr.Examples(
         examples=[
-            ["MKVLWAALLVTFLAGCQAKVEQAVETEPEPELRQQTEWQSGQRWELALGRFWDYLRWVQTLSEQVQEELLSSQVTQELRALMDETMKELKAYKSELEEQLTPVAEETRARLSKELQAAQARLGADMEDVCGRLVQYRGEVQAMLGQSTEELRVRLASHLRKLRKRLLRDADDLQKRLAVYQAGAREGAERGLSAIRERLGPLVEQGRVRAATVGSLAGQPLQERAQAWGERLRARMEEMGSRTRDRLDEVKEQVAEVRAKLEEQAQQRL", "1ABC"],
         ],
-        inputs=[sequence_input, pdb_input],
-        outputs=[predictions_output, molecule_output]
     )
 demo.launch()

 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.utils.data import DataLoader
 import re
 import copy
 import transformers, datasets
 from transformers import AutoTokenizer
 from transformers import DataCollatorForTokenClassification
 from datasets import Dataset
 from scipy.special import expit
 import requests
+# Biopython imports
+from Bio.PDB import PDBParser, Select
+from Bio.PDB.DSSP import DSSP
+from gradio_molecule3d import Molecule3D
 # Configuration
 checkpoint = 'ThorbenF/prot_t5_xl_uniref50'
 max_length = 1500
 # Load model and move to device
 model, tokenizer = load_model(checkpoint, max_length)
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     normalized_scores = normalize_scores(logits)
     test_one_letter_sequence = test_one_letter_sequence.replace(" ", "")
+    return test_one_letter_sequence, normalized_scores
 def fetch_pdb(pdb_id):
     try:
         print(f"Error fetching PDB: {e}")
         return None
+def extract_protein_sequence(pdb_path):
+    """
+    Extract the longest protein sequence from a PDB file
+    """
+    parser = PDBParser(QUIET=1)
+    structure = parser.get_structure('protein', pdb_path)
+    class ProteinSelect(Select):
+        def accept_residue(self, residue):
+            # Only accept standard amino acids
+            standard_aa = set('ACDEFGHIKLMNPQRSTVWY')
+            return residue.get_resname() in standard_aa
+    # Find the longest protein chain
+    longest_sequence = ""
+    longest_chain = None
+    for model in structure:
+        for chain in model:
+            sequence = ""
+            for residue in chain:
+                if Select().accept_residue(residue):
+                    sequence += residue.get_resname()
+            # Convert 3-letter amino acid codes to 1-letter
+            aa_dict = {
+                'ALA':'A', 'CYS':'C', 'ASP':'D', 'GLU':'E', 'PHE':'F',
+                'GLY':'G', 'HIS':'H', 'ILE':'I', 'LYS':'K', 'LEU':'L',
+                'MET':'M', 'ASN':'N', 'PRO':'P', 'GLN':'Q', 'ARG':'R',
+                'SER':'S', 'THR':'T', 'VAL':'V', 'TRP':'W', 'TYR':'Y'
+            }
+            one_letter_sequence = ''.join([aa_dict.get(res, 'X') for res in sequence])
+            # Track the longest sequence
+            if len(one_letter_sequence) > len(longest_sequence) and \
+               10 < len(one_letter_sequence) < 1500:
+                longest_sequence = one_letter_sequence
+                longest_chain = chain
+    return longest_sequence, longest_chain
+def process_pdb(pdb_id):
     # Fetch PDB file
     pdb_path = fetch_pdb(pdb_id)
+    if not pdb_path:
+        return "Failed to fetch PDB file", None, None
+    # Extract protein sequence and chain
+    protein_sequence, chain = extract_protein_sequence(pdb_path)
+    if not protein_sequence:
+        return "No suitable protein sequence found", None, None
+    # Predict binding sites
+    sequence, normalized_scores = predict_protein_sequence(protein_sequence)
+    # Prepare representations for coloring residues
+    reps = []
+    for i, (res, score) in enumerate(zip(sequence, normalized_scores), start=1):
+        # Map score to a color gradient from blue (low) to red (high)
+        color_intensity = int(score * 255)
+        color = f'rgb({color_intensity}, 0, {255-color_intensity})'
+        rep = {
+            "model": 0,
+            "chain": chain.id,
+            "resname": res,
+            "resnum": i,
+            "style": "cartoon",
+            "color": color,
+            "residue_range": f"{i}-{i}",
+            "around": 0,
+            "byres": True,
+            "visible": True
+        }
+        reps.append(rep)
+    # Prepare result string
+    result_str = "\n".join([f"{aa}: {score:.2f}" for aa, score in zip(sequence, normalized_scores)])
+    return result_str, reps, pdb_path
 # Create Gradio interface
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
+            # PDB ID input with default suggestion
             pdb_input = gr.Textbox(
+                value="2IWI",
+                label="PDB ID",
+                placeholder="Enter PDB ID here..."
             )
             # Predict button
             # 3D Molecule visualization
             molecule_output = Molecule3D(
                 label="Protein Structure",
+                reps=[]  # Start with empty representations
             )
     # Prediction logic
     predict_btn.click(
+        process_pdb,
+        inputs=[pdb_input],
+        outputs=[predictions_output, molecule_output, molecule_output]
     )
     # Add some example inputs
     gr.Markdown("## Examples")
     gr.Examples(
         examples=[
+            ["2IWI"],
+            ["1ABC"],
+            ["4HHB"]
         ],
+        inputs=[pdb_input],
+        outputs=[predictions_output, molecule_output, molecule_output]
     )
 demo.launch()

requirements.txt CHANGED Viewed

@@ -9,4 +9,5 @@ scikit-learn>=0.24.0
 sentencepiece
 huggingface_hub>=0.15.0
 requests
-gradio_molecule3d

 sentencepiece
 huggingface_hub>=0.15.0
 requests
+gradio_molecule3d
+biopython>=1.81