ThorbenF committed on
Commit
8bef2d8
·
1 Parent(s): 1f960e0
Files changed (2) hide show
  1. .ipynb_checkpoints/app-checkpoint.py +22 -5
  2. app.py +22 -5
.ipynb_checkpoints/app-checkpoint.py CHANGED
@@ -40,8 +40,6 @@ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
40
  model.to(device)
41
  model.eval()
42
 
43
- reps = [{"model": 0, "style": "cartoon", "color": "spectrum"}]
44
-
45
  # Function to fetch a PDB file
46
  def fetch_pdb(pdb_id):
47
  pdb_url = f'https://files.rcsb.org/download/{pdb_id}.pdb'
@@ -60,7 +58,6 @@ def normalize_scores(scores):
60
  max_score = np.max(scores)
61
  return (scores - min_score) / (max_score - min_score) if max_score > min_score else scores
62
 
63
- # Extract sequence and predict binding scores
64
  def process_pdb(pdb_id, segment):
65
  pdb_path = fetch_pdb(pdb_id)
66
  if not pdb_path:
@@ -70,25 +67,45 @@ def process_pdb(pdb_id, segment):
70
  structure = parser.get_structure('protein', pdb_path)
71
  chain = structure[0][segment]
72
 
73
- sequence = "".join(residue.get_resname().strip() for residue in chain)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
 
75
  input_ids = tokenizer(" ".join(sequence), return_tensors="pt").input_ids.to(device)
76
  with torch.no_grad():
77
  outputs = model(input_ids).logits.detach().cpu().numpy().squeeze()
78
 
 
79
  scores = expit(outputs[:, 1] - outputs[:, 0])
80
  normalized_scores = normalize_scores(scores)
81
 
 
82
  result_str = "\n".join([
83
  f"{res.get_resname()} {res.id[1]} {sequence[i]} {normalized_scores[i]:.2f}"
84
- for i, res in enumerate(chain)
85
  ])
86
 
 
87
  with open(f"{pdb_id}_predictions.txt", "w") as f:
88
  f.write(result_str)
89
 
90
  return result_str, pdb_path, f"{pdb_id}_predictions.txt"
91
 
 
 
92
  # Gradio UI
93
  with gr.Blocks() as demo:
94
  gr.Markdown("# Protein Binding Site Prediction")
 
40
  model.to(device)
41
  model.eval()
42
 
 
 
43
  # Function to fetch a PDB file
44
  def fetch_pdb(pdb_id):
45
  pdb_url = f'https://files.rcsb.org/download/{pdb_id}.pdb'
 
58
  max_score = np.max(scores)
59
  return (scores - min_score) / (max_score - min_score) if max_score > min_score else scores
60
 
 
61
  def process_pdb(pdb_id, segment):
62
  pdb_path = fetch_pdb(pdb_id)
63
  if not pdb_path:
 
67
  structure = parser.get_structure('protein', pdb_path)
68
  chain = structure[0][segment]
69
 
70
+ # Comprehensive amino acid mapping
71
+ aa_dict = {
72
+ 'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F',
73
+ 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
74
+ 'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R',
75
+ 'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y',
76
+ 'MSE': 'M', 'SEP': 'S', 'TPO': 'T', 'CSO': 'C', 'PTR': 'Y', 'HYP': 'P'
77
+ }
78
+
79
+ # Exclude non-amino acid residues
80
+ sequence = "".join(
81
+ aa_dict[residue.get_resname().strip()]
82
+ for residue in chain
83
+ if residue.get_resname().strip() in aa_dict
84
+ )
85
 
86
+ # Prepare input for model prediction
87
  input_ids = tokenizer(" ".join(sequence), return_tensors="pt").input_ids.to(device)
88
  with torch.no_grad():
89
  outputs = model(input_ids).logits.detach().cpu().numpy().squeeze()
90
 
91
+ # Calculate scores and normalize them
92
  scores = expit(outputs[:, 1] - outputs[:, 0])
93
  normalized_scores = normalize_scores(scores)
94
 
95
+ # Prepare the result string, including only amino acid residues
96
  result_str = "\n".join([
97
  f"{res.get_resname()} {res.id[1]} {sequence[i]} {normalized_scores[i]:.2f}"
98
+ for i, res in enumerate(chain) if res.get_resname().strip() in aa_dict
99
  ])
100
 
101
+ # Save predictions to file
102
  with open(f"{pdb_id}_predictions.txt", "w") as f:
103
  f.write(result_str)
104
 
105
  return result_str, pdb_path, f"{pdb_id}_predictions.txt"
106
 
107
+ reps = [{"model": 0, "style": "cartoon", "color": "spectrum"}]
108
+
109
  # Gradio UI
110
  with gr.Blocks() as demo:
111
  gr.Markdown("# Protein Binding Site Prediction")
app.py CHANGED
@@ -40,8 +40,6 @@ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
40
  model.to(device)
41
  model.eval()
42
 
43
- reps = [{"model": 0, "style": "cartoon", "color": "spectrum"}]
44
-
45
  # Function to fetch a PDB file
46
  def fetch_pdb(pdb_id):
47
  pdb_url = f'https://files.rcsb.org/download/{pdb_id}.pdb'
@@ -60,7 +58,6 @@ def normalize_scores(scores):
60
  max_score = np.max(scores)
61
  return (scores - min_score) / (max_score - min_score) if max_score > min_score else scores
62
 
63
- # Extract sequence and predict binding scores
64
  def process_pdb(pdb_id, segment):
65
  pdb_path = fetch_pdb(pdb_id)
66
  if not pdb_path:
@@ -70,25 +67,45 @@ def process_pdb(pdb_id, segment):
70
  structure = parser.get_structure('protein', pdb_path)
71
  chain = structure[0][segment]
72
 
73
- sequence = "".join(residue.get_resname().strip() for residue in chain)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
 
75
  input_ids = tokenizer(" ".join(sequence), return_tensors="pt").input_ids.to(device)
76
  with torch.no_grad():
77
  outputs = model(input_ids).logits.detach().cpu().numpy().squeeze()
78
 
 
79
  scores = expit(outputs[:, 1] - outputs[:, 0])
80
  normalized_scores = normalize_scores(scores)
81
 
 
82
  result_str = "\n".join([
83
  f"{res.get_resname()} {res.id[1]} {sequence[i]} {normalized_scores[i]:.2f}"
84
- for i, res in enumerate(chain)
85
  ])
86
 
 
87
  with open(f"{pdb_id}_predictions.txt", "w") as f:
88
  f.write(result_str)
89
 
90
  return result_str, pdb_path, f"{pdb_id}_predictions.txt"
91
 
 
 
92
  # Gradio UI
93
  with gr.Blocks() as demo:
94
  gr.Markdown("# Protein Binding Site Prediction")
 
40
  model.to(device)
41
  model.eval()
42
 
 
 
43
  # Function to fetch a PDB file
44
  def fetch_pdb(pdb_id):
45
  pdb_url = f'https://files.rcsb.org/download/{pdb_id}.pdb'
 
58
  max_score = np.max(scores)
59
  return (scores - min_score) / (max_score - min_score) if max_score > min_score else scores
60
 
 
61
  def process_pdb(pdb_id, segment):
62
  pdb_path = fetch_pdb(pdb_id)
63
  if not pdb_path:
 
67
  structure = parser.get_structure('protein', pdb_path)
68
  chain = structure[0][segment]
69
 
70
+ # Comprehensive amino acid mapping
71
+ aa_dict = {
72
+ 'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F',
73
+ 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
74
+ 'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R',
75
+ 'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y',
76
+ 'MSE': 'M', 'SEP': 'S', 'TPO': 'T', 'CSO': 'C', 'PTR': 'Y', 'HYP': 'P'
77
+ }
78
+
79
+ # Exclude non-amino acid residues
80
+ sequence = "".join(
81
+ aa_dict[residue.get_resname().strip()]
82
+ for residue in chain
83
+ if residue.get_resname().strip() in aa_dict
84
+ )
85
 
86
+ # Prepare input for model prediction
87
  input_ids = tokenizer(" ".join(sequence), return_tensors="pt").input_ids.to(device)
88
  with torch.no_grad():
89
  outputs = model(input_ids).logits.detach().cpu().numpy().squeeze()
90
 
91
+ # Calculate scores and normalize them
92
  scores = expit(outputs[:, 1] - outputs[:, 0])
93
  normalized_scores = normalize_scores(scores)
94
 
95
+ # Prepare the result string, including only amino acid residues
96
  result_str = "\n".join([
97
  f"{res.get_resname()} {res.id[1]} {sequence[i]} {normalized_scores[i]:.2f}"
98
+ for i, res in enumerate(chain) if res.get_resname().strip() in aa_dict
99
  ])
100
 
101
+ # Save predictions to file
102
  with open(f"{pdb_id}_predictions.txt", "w") as f:
103
  f.write(result_str)
104
 
105
  return result_str, pdb_path, f"{pdb_id}_predictions.txt"
106
 
107
+ reps = [{"model": 0, "style": "cartoon", "color": "spectrum"}]
108
+
109
  # Gradio UI
110
  with gr.Blocks() as demo:
111
  gr.Markdown("# Protein Binding Site Prediction")