ThorbenF committed on
Commit
a28eeb5
·
1 Parent(s): a653779
Files changed (2) hide show
  1. .ipynb_checkpoints/app-checkpoint.py +84 -51
  2. app.py +84 -51
.ipynb_checkpoints/app-checkpoint.py CHANGED
@@ -23,10 +23,9 @@ from scipy.special import expit
23
  import requests
24
 
25
  # Biopython imports
26
- from Bio.PDB import PDBParser, Select
27
  from Bio.PDB.DSSP import DSSP
28
-
29
- from gradio_molecule3d import Molecule3D
30
 
31
  # Configuration
32
  checkpoint = 'ThorbenF/prot_t5_xl_uniref50'
@@ -38,6 +37,79 @@ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
38
  model.to(device)
39
  model.eval()
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def create_dataset(tokenizer, seqs, labels, checkpoint):
42
  tokenized = tokenizer(seqs, max_length=max_length, padding=False, truncation=True)
43
  dataset = Dataset.from_dict(tokenized)
@@ -138,59 +210,20 @@ def fetch_pdb(pdb_id):
138
  print(f"Error fetching PDB: {e}")
139
  return None
140
 
141
- def extract_protein_sequence(pdb_path):
142
- """
143
- Extract the longest protein sequence from a PDB file
144
- """
145
- parser = PDBParser(QUIET=1)
146
- structure = parser.get_structure('protein', pdb_path)
147
-
148
- class ProteinSelect(Select):
149
- def accept_residue(self, residue):
150
- # Only accept standard amino acids
151
- standard_aa = set('ACDEFGHIKLMNPQRSTVWY')
152
- return residue.get_resname() in standard_aa
153
-
154
- # Find the longest protein chain
155
- longest_sequence = ""
156
- longest_chain = None
157
- for model in structure:
158
- for chain in model:
159
- sequence = ""
160
- for residue in chain:
161
- if Select().accept_residue(residue):
162
- sequence += residue.get_resname()
163
-
164
- # Convert 3-letter amino acid codes to 1-letter
165
- aa_dict = {
166
- 'ALA':'A', 'CYS':'C', 'ASP':'D', 'GLU':'E', 'PHE':'F',
167
- 'GLY':'G', 'HIS':'H', 'ILE':'I', 'LYS':'K', 'LEU':'L',
168
- 'MET':'M', 'ASN':'N', 'PRO':'P', 'GLN':'Q', 'ARG':'R',
169
- 'SER':'S', 'THR':'T', 'VAL':'V', 'TRP':'W', 'TYR':'Y'
170
- }
171
-
172
- one_letter_sequence = ''.join([aa_dict.get(res, 'X') for res in sequence])
173
-
174
- # Track the longest sequence
175
- if len(one_letter_sequence) > len(longest_sequence) and \
176
- 10 < len(one_letter_sequence) < 1500:
177
- longest_sequence = one_letter_sequence
178
- longest_chain = chain
179
-
180
- return longest_sequence, longest_chain, pdb_path
181
-
182
  def process_pdb(pdb_id):
183
  # Fetch PDB file
184
- pdb_path = fetch_pdb(pdb_id)
 
 
185
 
186
- if not pdb_path:
187
- return "Failed to fetch PDB file", pdb_path
188
 
189
  # Extract protein sequence and chain
190
- protein_sequence, chain, pdb_file = extract_protein_sequence(pdb_path)
191
 
192
  if not protein_sequence:
193
- return "No suitable protein sequence found", pdb_file
194
 
195
  # Predict binding sites
196
  sequence, normalized_scores = predict_protein_sequence(protein_sequence)
@@ -198,7 +231,7 @@ def process_pdb(pdb_id):
198
  # Prepare result string
199
  result_str = "\n".join([f"{aa}: {score:.2f}" for aa, score in zip(sequence, normalized_scores)])
200
 
201
- return result_str, pdb_file
202
 
203
  # Create Gradio interface
204
  with gr.Blocks() as demo:
@@ -246,4 +279,4 @@ with gr.Blocks() as demo:
246
  outputs=[predictions_output, molecule_output]
247
  )
248
 
249
- demo.launch(share=True)
 
23
  import requests
24
 
25
  # Biopython imports
26
+ from Bio.PDB import PDBParser, Select, PDBIO
27
  from Bio.PDB.DSSP import DSSP
28
+ import Bio.PDB.PDBList as PDBList
 
29
 
30
  # Configuration
31
  checkpoint = 'ThorbenF/prot_t5_xl_uniref50'
 
37
  model.to(device)
38
  model.eval()
39
 
40
+ def is_valid_sequence_length(length: int) -> bool:
41
+ """Check if sequence length is within valid range."""
42
+ return 100 <= length <= 1500
43
+
44
+ def is_nucleic_acid_chain(chain) -> bool:
45
+ """Check if chain contains nucleic acids."""
46
+ nucleic_acids = {'A', 'C', 'G', 'T', 'U', 'DA', 'DC', 'DG', 'DT', 'DU', 'UNK'}
47
+ return any(residue.get_resname().strip() in nucleic_acids for residue in chain)
48
+
49
+ def extract_protein_sequence(pdb_path):
50
+ """
51
+ Extract the longest protein sequence from a PDB file with improved logic
52
+ """
53
+ parser = PDBParser(QUIET=1)
54
+ structure = parser.get_structure('protein', pdb_path)
55
+
56
+ # Comprehensive amino acid mapping
57
+ aa_dict = {
58
+ # Standard amino acids (20 canonical)
59
+ 'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F',
60
+ 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
61
+ 'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R',
62
+ 'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y',
63
+
64
+ # Modified amino acids and alternative names
65
+ 'MSE': 'M', # Selenomethionine
66
+ 'SEP': 'S', # Phosphoserine
67
+ 'TPO': 'T', # Phosphothreonine
68
+ 'CSO': 'C', # S-hydroxycysteine
69
+ 'PTR': 'Y', # Phosphotyrosine
70
+ 'HYP': 'P', # Hydroxyproline
71
+ }
72
+
73
+ # Water, ion and small-molecule ligand exclusion set
74
+ ligand_exclusion_set = {'HOH', 'WAT', 'DOD', 'SO4', 'PO4', 'GOL', 'ACT', 'EDO'}
75
+
76
+ # Find the longest protein chain
77
+ longest_sequence = ""
78
+ longest_chain = None
79
+
80
+ for model in structure:
81
+ for chain in model:
82
+ # Skip nucleic acid chains
83
+ if is_nucleic_acid_chain(chain):
84
+ continue
85
+
86
+ # Extract and convert sequence
87
+ sequence = ""
88
+ for residue in chain:
89
+ # Check if residue is a standard amino acid or a known modified amino acid
90
+ res_name = residue.get_resname().strip()
91
+ if res_name in aa_dict:
92
+ sequence += aa_dict[res_name]
93
+
94
+ # Check for valid length and update longest sequence
95
+ if (10 < len(sequence) < 1500 and
96
+ len(sequence) > len(longest_sequence)):
97
+ longest_sequence = sequence
98
+ longest_chain = chain
99
+
100
+ if not longest_sequence:
101
+ return None, None, pdb_path
102
+
103
+ # Save filtered PDB if needed
104
+ if longest_chain:
105
+ io = PDBIO()
106
+ io.set_structure(longest_chain.get_parent().get_parent())
107
+ filtered_pdb_path = pdb_path.replace('.pdb', '_filtered.pdb')
108
+ io.save(filtered_pdb_path)
109
+ return longest_sequence, longest_chain, filtered_pdb_path
110
+
111
+ return longest_sequence, longest_chain, pdb_path
112
+
113
  def create_dataset(tokenizer, seqs, labels, checkpoint):
114
  tokenized = tokenizer(seqs, max_length=max_length, padding=False, truncation=True)
115
  dataset = Dataset.from_dict(tokenized)
 
210
  print(f"Error fetching PDB: {e}")
211
  return None
212
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  def process_pdb(pdb_id):
214
  # Fetch PDB file
215
+ # Use PDBList to download the file if it doesn't exist locally
216
+ pdbl = PDBList.PDBList()
217
+ pdb_path = pdbl.retrieve_pdb_file(pdb_id, pdir='pdb_files', file_format='pdb')
218
 
219
+ if not pdb_path or not os.path.exists(pdb_path):
220
+ return "Failed to fetch PDB file", None
221
 
222
  # Extract protein sequence and chain
223
+ protein_sequence, chain, filtered_pdb_path = extract_protein_sequence(pdb_path)
224
 
225
  if not protein_sequence:
226
+ return "No suitable protein sequence found", None
227
 
228
  # Predict binding sites
229
  sequence, normalized_scores = predict_protein_sequence(protein_sequence)
 
231
  # Prepare result string
232
  result_str = "\n".join([f"{aa}: {score:.2f}" for aa, score in zip(sequence, normalized_scores)])
233
 
234
+ return result_str, filtered_pdb_path
235
 
236
  # Create Gradio interface
237
  with gr.Blocks() as demo:
 
279
  outputs=[predictions_output, molecule_output]
280
  )
281
 
282
+ demo.launch()
app.py CHANGED
@@ -23,10 +23,9 @@ from scipy.special import expit
23
  import requests
24
 
25
  # Biopython imports
26
- from Bio.PDB import PDBParser, Select
27
  from Bio.PDB.DSSP import DSSP
28
-
29
- from gradio_molecule3d import Molecule3D
30
 
31
  # Configuration
32
  checkpoint = 'ThorbenF/prot_t5_xl_uniref50'
@@ -38,6 +37,79 @@ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
38
  model.to(device)
39
  model.eval()
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def create_dataset(tokenizer, seqs, labels, checkpoint):
42
  tokenized = tokenizer(seqs, max_length=max_length, padding=False, truncation=True)
43
  dataset = Dataset.from_dict(tokenized)
@@ -138,59 +210,20 @@ def fetch_pdb(pdb_id):
138
  print(f"Error fetching PDB: {e}")
139
  return None
140
 
141
- def extract_protein_sequence(pdb_path):
142
- """
143
- Extract the longest protein sequence from a PDB file
144
- """
145
- parser = PDBParser(QUIET=1)
146
- structure = parser.get_structure('protein', pdb_path)
147
-
148
- class ProteinSelect(Select):
149
- def accept_residue(self, residue):
150
- # Only accept standard amino acids
151
- standard_aa = set('ACDEFGHIKLMNPQRSTVWY')
152
- return residue.get_resname() in standard_aa
153
-
154
- # Find the longest protein chain
155
- longest_sequence = ""
156
- longest_chain = None
157
- for model in structure:
158
- for chain in model:
159
- sequence = ""
160
- for residue in chain:
161
- if Select().accept_residue(residue):
162
- sequence += residue.get_resname()
163
-
164
- # Convert 3-letter amino acid codes to 1-letter
165
- aa_dict = {
166
- 'ALA':'A', 'CYS':'C', 'ASP':'D', 'GLU':'E', 'PHE':'F',
167
- 'GLY':'G', 'HIS':'H', 'ILE':'I', 'LYS':'K', 'LEU':'L',
168
- 'MET':'M', 'ASN':'N', 'PRO':'P', 'GLN':'Q', 'ARG':'R',
169
- 'SER':'S', 'THR':'T', 'VAL':'V', 'TRP':'W', 'TYR':'Y'
170
- }
171
-
172
- one_letter_sequence = ''.join([aa_dict.get(res, 'X') for res in sequence])
173
-
174
- # Track the longest sequence
175
- if len(one_letter_sequence) > len(longest_sequence) and \
176
- 10 < len(one_letter_sequence) < 1500:
177
- longest_sequence = one_letter_sequence
178
- longest_chain = chain
179
-
180
- return longest_sequence, longest_chain, pdb_path
181
-
182
  def process_pdb(pdb_id):
183
  # Fetch PDB file
184
- pdb_path = fetch_pdb(pdb_id)
 
 
185
 
186
- if not pdb_path:
187
- return "Failed to fetch PDB file", pdb_path
188
 
189
  # Extract protein sequence and chain
190
- protein_sequence, chain, pdb_file = extract_protein_sequence(pdb_path)
191
 
192
  if not protein_sequence:
193
- return "No suitable protein sequence found", pdb_file
194
 
195
  # Predict binding sites
196
  sequence, normalized_scores = predict_protein_sequence(protein_sequence)
@@ -198,7 +231,7 @@ def process_pdb(pdb_id):
198
  # Prepare result string
199
  result_str = "\n".join([f"{aa}: {score:.2f}" for aa, score in zip(sequence, normalized_scores)])
200
 
201
- return result_str, pdb_file
202
 
203
  # Create Gradio interface
204
  with gr.Blocks() as demo:
@@ -246,4 +279,4 @@ with gr.Blocks() as demo:
246
  outputs=[predictions_output, molecule_output]
247
  )
248
 
249
- demo.launch(share=True)
 
23
  import requests
24
 
25
  # Biopython imports
26
+ from Bio.PDB import PDBParser, Select, PDBIO
27
  from Bio.PDB.DSSP import DSSP
28
+ import Bio.PDB.PDBList as PDBList
 
29
 
30
  # Configuration
31
  checkpoint = 'ThorbenF/prot_t5_xl_uniref50'
 
37
  model.to(device)
38
  model.eval()
39
 
40
+ def is_valid_sequence_length(length: int) -> bool:
41
+ """Check if sequence length is within valid range."""
42
+ return 100 <= length <= 1500
43
+
44
+ def is_nucleic_acid_chain(chain) -> bool:
45
+ """Check if chain contains nucleic acids."""
46
+ nucleic_acids = {'A', 'C', 'G', 'T', 'U', 'DA', 'DC', 'DG', 'DT', 'DU', 'UNK'}
47
+ return any(residue.get_resname().strip() in nucleic_acids for residue in chain)
48
+
49
+ def extract_protein_sequence(pdb_path):
50
+ """
51
+ Extract the longest protein sequence from a PDB file with improved logic
52
+ """
53
+ parser = PDBParser(QUIET=1)
54
+ structure = parser.get_structure('protein', pdb_path)
55
+
56
+ # Comprehensive amino acid mapping
57
+ aa_dict = {
58
+ # Standard amino acids (20 canonical)
59
+ 'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F',
60
+ 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
61
+ 'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R',
62
+ 'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y',
63
+
64
+ # Modified amino acids and alternative names
65
+ 'MSE': 'M', # Selenomethionine
66
+ 'SEP': 'S', # Phosphoserine
67
+ 'TPO': 'T', # Phosphothreonine
68
+ 'CSO': 'C', # S-hydroxycysteine
69
+ 'PTR': 'Y', # Phosphotyrosine
70
+ 'HYP': 'P', # Hydroxyproline
71
+ }
72
+
73
+ # Water, ion and small-molecule ligand exclusion set
74
+ ligand_exclusion_set = {'HOH', 'WAT', 'DOD', 'SO4', 'PO4', 'GOL', 'ACT', 'EDO'}
75
+
76
+ # Find the longest protein chain
77
+ longest_sequence = ""
78
+ longest_chain = None
79
+
80
+ for model in structure:
81
+ for chain in model:
82
+ # Skip nucleic acid chains
83
+ if is_nucleic_acid_chain(chain):
84
+ continue
85
+
86
+ # Extract and convert sequence
87
+ sequence = ""
88
+ for residue in chain:
89
+ # Check if residue is a standard amino acid or a known modified amino acid
90
+ res_name = residue.get_resname().strip()
91
+ if res_name in aa_dict:
92
+ sequence += aa_dict[res_name]
93
+
94
+ # Check for valid length and update longest sequence
95
+ if (10 < len(sequence) < 1500 and
96
+ len(sequence) > len(longest_sequence)):
97
+ longest_sequence = sequence
98
+ longest_chain = chain
99
+
100
+ if not longest_sequence:
101
+ return None, None, pdb_path
102
+
103
+ # Save filtered PDB if needed
104
+ if longest_chain:
105
+ io = PDBIO()
106
+ io.set_structure(longest_chain.get_parent().get_parent())
107
+ filtered_pdb_path = pdb_path.replace('.pdb', '_filtered.pdb')
108
+ io.save(filtered_pdb_path)
109
+ return longest_sequence, longest_chain, filtered_pdb_path
110
+
111
+ return longest_sequence, longest_chain, pdb_path
112
+
113
  def create_dataset(tokenizer, seqs, labels, checkpoint):
114
  tokenized = tokenizer(seqs, max_length=max_length, padding=False, truncation=True)
115
  dataset = Dataset.from_dict(tokenized)
 
210
  print(f"Error fetching PDB: {e}")
211
  return None
212
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  def process_pdb(pdb_id):
214
  # Fetch PDB file
215
+ # Use PDBList to download the file if it doesn't exist locally
216
+ pdbl = PDBList.PDBList()
217
+ pdb_path = pdbl.retrieve_pdb_file(pdb_id, pdir='pdb_files', file_format='pdb')
218
 
219
+ if not pdb_path or not os.path.exists(pdb_path):
220
+ return "Failed to fetch PDB file", None
221
 
222
  # Extract protein sequence and chain
223
+ protein_sequence, chain, filtered_pdb_path = extract_protein_sequence(pdb_path)
224
 
225
  if not protein_sequence:
226
+ return "No suitable protein sequence found", None
227
 
228
  # Predict binding sites
229
  sequence, normalized_scores = predict_protein_sequence(protein_sequence)
 
231
  # Prepare result string
232
  result_str = "\n".join([f"{aa}: {score:.2f}" for aa, score in zip(sequence, normalized_scores)])
233
 
234
+ return result_str, filtered_pdb_path
235
 
236
  # Create Gradio interface
237
  with gr.Blocks() as demo:
 
279
  outputs=[predictions_output, molecule_output]
280
  )
281
 
282
+ demo.launch()