yinuozhang committed on
Commit
0f1a97c
·
1 Parent(s): 4ae3df6

add more UAA and description

Browse files
Files changed (3) hide show
  1. __init__.py +0 -0
  2. app.py +123 -21
  3. swisssidechain.py +0 -0
__init__.py ADDED
File without changes
app.py CHANGED
@@ -23,6 +23,7 @@ import matplotlib.patches as patches
23
  from io import BytesIO
24
  import tempfile
25
  from rdkit import Chem
 
26
 
27
  class PeptideAnalyzer:
28
  def __init__(self):
@@ -66,6 +67,44 @@ class PeptideAnalyzer:
66
  'Aib': 'Ŷ', 'Dtg': 'Ĝ', 'Cmt': 'Ĉ', 'Eal': 'Ė', 'Nml': "Ŀ", 'Nma': 'Ṃ',
67
  'Kpg': 'Ƙ', 'Tpb': 'Ṯ', 'Cyl': 'Ċ', 'Nle': 'Ł', 'Hph': 'Ĥ', 'Cys-Cys': 'CC', 'cys-cys': 'cc',
68
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  def preprocess_complex_residues(self, smiles):
70
  """Identify and protect complex residues with internal peptide bonds - improved to prevent overlaps"""
71
  complex_positions = []
@@ -233,7 +272,6 @@ class PeptideAnalyzer:
233
  return False
234
 
235
  def is_cyclic(self, smiles):
236
- """Improved cyclic peptide detection"""
237
  # Check for C-terminal carboxyl
238
  if smiles.endswith('C(=O)O'):
239
  return False, [], []
@@ -270,10 +308,9 @@ class PeptideAnalyzer:
270
  return content
271
 
272
  def identify_residue(self, segment):
273
- """Identify residue with Pro reconstruction"""
274
- # Only clean terminal carboxyl if this is the last segment
275
  if 'complex_type' in segment:
276
  return segment['complex_type'], []
 
277
  content = self.clean_terminal_carboxyl(segment)
278
  mods = self.get_modifications(segment)
279
 
@@ -285,9 +322,10 @@ class PeptideAnalyzer:
285
  print("DIRECT MATCH: Found Pro at end")
286
  return 'Pro', mods
287
 
288
- # Eal - Glu(OAll) - Multiple patterns
289
  if 'CCC(=O)OCC=C' in content or 'CC(=O)OCC=C' in content or 'C=CCOC(=O)CC' in content:
290
  return 'Eal', mods
 
291
  # Proline (P)
292
  if any([
293
  (segment.get('bond_after', '').startswith(f'N{n}C(=O)') and 'CCC' in content and
@@ -315,20 +353,20 @@ class PeptideAnalyzer:
315
  if ('N1[C@H](CCC1)' in content):
316
  return 'pro', mods
317
 
318
- # Tryptophan (W) - more specific indole pattern
319
  if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
320
  'c[nH]c' in content.replace(' ', ''):
321
  if '[C@H](CC' in content: # D-form
322
  return 'trp', mods
323
  return 'Trp', mods
324
 
325
- # Lysine (K) - both patterns
326
  if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
327
  if '[C@H](CCCCN)' in content: # D-form
328
  return 'lys', mods
329
  return 'Lys', mods
330
 
331
- # Arginine (R) - both patterns
332
  if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
333
  if '[C@H](CCCNC(=N)N)' in content: # D-form
334
  return 'arg', mods
@@ -370,7 +408,6 @@ class PeptideAnalyzer:
370
  if ('CC(C)[C@@H]' in content or 'CC(C)[C@H]' in content or
371
  '[C@H](C(C)C)' in content or '[C@@H](C(C)C)' in content or
372
  'C(C)C[C@H]' in content or 'C(C)C[C@@H]' in content):
373
-
374
  if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]', 'CCC(=O)']):
375
  if '[C@H]' in content and not '[C@@H]' in content: # D-form
376
  return 'val', mods
@@ -385,7 +422,6 @@ class PeptideAnalyzer:
385
  'CC[C@H](C)[C@@H]' in content, 'CC[C@@H](C)[C@H]' in content,
386
  'CC[C@H](C)[C@H]' in content, 'CC[C@@H](C)[C@@H]' in content])
387
  and 'CC(C)C' not in content): # Exclude valine pattern
388
-
389
  if any(['[C@H]([C@@H](CC)C)' in content, '[C@H](CC)C' in content,
390
  '[C@H]([C@@H](C)CC)' in content, '[C@H]([C@H](C)CC)' in content,
391
  'C[C@@H](CC)[C@H]' in content, 'C[C@H](CC)[C@H]' in content,
@@ -393,6 +429,7 @@ class PeptideAnalyzer:
393
  # D-form
394
  return 'ile', mods
395
  return 'Ile', mods
 
396
  # Tpb - Thr(PO(OBzl)OH)
397
  if re.search(r'\(C\)OP\(=O\)\(O\)OCc[0-9]ccccc[0-9]', content) or 'OP(=O)(O)OCC' in content:
398
  return 'Tpb', mods
@@ -503,7 +540,24 @@ class PeptideAnalyzer:
503
  if 'CCCNC(=O)' in content and 'CCCCCCCCCCCC' in content:
504
  return 'Kpg', mods
505
 
506
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
  return None, mods
508
 
509
  def get_modifications(self, segment):
@@ -564,7 +618,48 @@ class PeptideAnalyzer:
564
  'residues': sequence,
565
  'details': "\n".join(logs)
566
  }
567
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
568
  def annotate_cyclic_structure(mol, sequence):
569
  """Create structure visualization"""
570
  AllChem.Compute2DCoords(mol)
@@ -814,11 +909,14 @@ def process_input(
814
  generate_3d=False,
815
  use_uff=False
816
  ):
817
- """Process input and create visualizations using PeptideAnalyzer"""
818
  analyzer = PeptideAnalyzer()
819
  temp_dir = tempfile.mkdtemp() if generate_3d else None
820
  structure_files = []
821
 
 
 
 
822
  # Handle direct SMILES input
823
  if smiles_input:
824
  smiles = smiles_input.strip()
@@ -885,11 +983,11 @@ def process_input(
885
  summary += f"- {os.path.basename(filepath)}\n"
886
 
887
  #return summary, img_cyclic, img_linear, structure_files if structure_files else None
888
- return summary, img_cyclic, structure_files or None
889
 
890
  except Exception as e:
891
  #return f"Error processing SMILES: {str(e)}", None, None, []
892
- return f"Error processing SMILES: {str(e)}", None, []
893
  # Handle file input
894
  if file_obj is not None:
895
  try:
@@ -921,16 +1019,17 @@ def process_input(
921
  output_text += f"Error processing SMILES: {smiles} - {str(e)}\n"
922
  output_text += "-" * 50 + "\n"
923
 
924
- return output_text, None, None, []
925
 
926
  except Exception as e:
927
- return f"Error processing file: {str(e)}", None, None, []
 
928
 
929
  return (
930
  output_text or "No analysis done.",
931
- img_cyclic if 'img_cyclic' in locals() else None,
932
  #img_linear if 'img_linear' in locals() else None,
933
- structure_files if structure_files else []
934
  )
935
 
936
  iface = gr.Interface(
@@ -961,7 +1060,10 @@ iface = gr.Interface(
961
  #gr.File(
962
  #label="3D Structure Files",
963
  #file_count="multiple"
964
- #)
 
 
 
965
  ],
966
  title="Peptide Structure Analyzer and Visualizer",
967
  description='''
@@ -970,8 +1072,6 @@ iface = gr.Interface(
970
  2. Determines if the peptide is cyclic
971
  3. Parses the amino acid sequence
972
  4. Creates 2D structure visualization with residue annotations
973
- 5. Optional linear representation
974
- 6. Optional 3D structure generation (ETKDG and UFF methods)
975
 
976
  Input: Either enter a SMILES string directly or upload a text file containing SMILES strings
977
 
@@ -993,6 +1093,8 @@ if __name__ == "__main__":
993
  iface.launch(share=True)
994
 
995
  """
 
 
996
  gr.Checkbox(
997
  label="Generate 3D structure (sdf file format)",
998
  value=False
 
23
  from io import BytesIO
24
  import tempfile
25
  from rdkit import Chem
26
+ from swisssidechain import all_aminos
27
 
28
  class PeptideAnalyzer:
29
  def __init__(self):
 
67
  'Aib': 'Ŷ', 'Dtg': 'Ĝ', 'Cmt': 'Ĉ', 'Eal': 'Ė', 'Nml': "Ŀ", 'Nma': 'Ṃ',
68
  'Kpg': 'Ƙ', 'Tpb': 'Ṯ', 'Cyl': 'Ċ', 'Nle': 'Ł', 'Hph': 'Ĥ', 'Cys-Cys': 'CC', 'cys-cys': 'cc',
69
  }
70
+
71
+ self._build_swisssidechain_lookups()
72
+
73
def _build_swisssidechain_lookups(self):
    """Index the SwissSidechain catalogue so residues can be found by SMILES.

    Creates two attributes on the analyzer:

    * ``exact_smiles_lookup`` maps a catalogue SMILES string to its code.
    * ``clean_smiles_lookup`` maps the stereo-stripped SMILES to the list
      of codes sharing that spelling, in catalogue order.

    Each catalogue code is also registered in ``three_to_one`` together
    with its letter.
    """
    # Bind the attributes up front and fill them through local aliases.
    by_exact = self.exact_smiles_lookup = {}
    by_stripped = self.clean_smiles_lookup = {}

    # Only the records are needed; the catalogue keys are not used here.
    for record in all_aminos.values():
        code = record["Code"]
        letter = record["Letter"]
        smiles = record["SMILES"]

        self.three_to_one[code] = letter
        by_exact[smiles] = code

        # Several entries can share one stereo-free spelling, so the
        # stripped lookup keeps a list of codes per key.
        stripped = self._remove_stereochemistry(smiles)
        by_stripped.setdefault(stripped, []).append(code)
95
+
96
+ def _remove_stereochemistry(self, smiles):
97
+ """Remove stereochemistry from SMILES"""
98
+ cleaned = smiles
99
+ stereochemistry_patterns = [
100
+ '[C@@H]', '[C@H]', '[C@@]', '[C@]',
101
+ '[S@@]', '[S@]', '[N@@]', '[N@]',
102
+ '@@', '@'
103
+ ]
104
+ for pattern in stereochemistry_patterns:
105
+ cleaned = cleaned.replace(pattern, pattern.replace('@@', '').replace('@', '').replace('[', '').replace(']', ''))
106
+ return cleaned
107
+
108
  def preprocess_complex_residues(self, smiles):
109
  """Identify and protect complex residues with internal peptide bonds - improved to prevent overlaps"""
110
  complex_positions = []
 
272
  return False
273
 
274
  def is_cyclic(self, smiles):
 
275
  # Check for C-terminal carboxyl
276
  if smiles.endswith('C(=O)O'):
277
  return False, [], []
 
308
  return content
309
 
310
  def identify_residue(self, segment):
 
 
311
  if 'complex_type' in segment:
312
  return segment['complex_type'], []
313
+
314
  content = self.clean_terminal_carboxyl(segment)
315
  mods = self.get_modifications(segment)
316
 
 
322
  print("DIRECT MATCH: Found Pro at end")
323
  return 'Pro', mods
324
 
325
+ # Eal - Glu(OAll)
326
  if 'CCC(=O)OCC=C' in content or 'CC(=O)OCC=C' in content or 'C=CCOC(=O)CC' in content:
327
  return 'Eal', mods
328
+
329
  # Proline (P)
330
  if any([
331
  (segment.get('bond_after', '').startswith(f'N{n}C(=O)') and 'CCC' in content and
 
353
  if ('N1[C@H](CCC1)' in content):
354
  return 'pro', mods
355
 
356
+ # Tryptophan (W)
357
  if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
358
  'c[nH]c' in content.replace(' ', ''):
359
  if '[C@H](CC' in content: # D-form
360
  return 'trp', mods
361
  return 'Trp', mods
362
 
363
+ # Lysine (K)
364
  if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
365
  if '[C@H](CCCCN)' in content: # D-form
366
  return 'lys', mods
367
  return 'Lys', mods
368
 
369
+ # Arginine (R)
370
  if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
371
  if '[C@H](CCCNC(=N)N)' in content: # D-form
372
  return 'arg', mods
 
408
  if ('CC(C)[C@@H]' in content or 'CC(C)[C@H]' in content or
409
  '[C@H](C(C)C)' in content or '[C@@H](C(C)C)' in content or
410
  'C(C)C[C@H]' in content or 'C(C)C[C@@H]' in content):
 
411
  if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]', 'CCC(=O)']):
412
  if '[C@H]' in content and not '[C@@H]' in content: # D-form
413
  return 'val', mods
 
422
  'CC[C@H](C)[C@@H]' in content, 'CC[C@@H](C)[C@H]' in content,
423
  'CC[C@H](C)[C@H]' in content, 'CC[C@@H](C)[C@@H]' in content])
424
  and 'CC(C)C' not in content): # Exclude valine pattern
 
425
  if any(['[C@H]([C@@H](CC)C)' in content, '[C@H](CC)C' in content,
426
  '[C@H]([C@@H](C)CC)' in content, '[C@H]([C@H](C)CC)' in content,
427
  'C[C@@H](CC)[C@H]' in content, 'C[C@H](CC)[C@H]' in content,
 
429
  # D-form
430
  return 'ile', mods
431
  return 'Ile', mods
432
+
433
  # Tpb - Thr(PO(OBzl)OH)
434
  if re.search(r'\(C\)OP\(=O\)\(O\)OCc[0-9]ccccc[0-9]', content) or 'OP(=O)(O)OCC' in content:
435
  return 'Tpb', mods
 
540
  if 'CCCNC(=O)' in content and 'CCCCCCCCCCCC' in content:
541
  return 'Kpg', mods
542
 
543
+ #======================Other UAAs from the SwissSidechain==========================================
544
+ # ADD SWISSSIDECHAIN MATCHING AT THE VERY END - only if nothing else matched
545
+ if content in self.exact_smiles_lookup:
546
+ return self.exact_smiles_lookup[content], mods
547
+
548
+ # Fall back to a lookup that ignores stereochemistry differences
549
+ content_clean = self._remove_stereochemistry(content)
550
+ if content_clean in self.clean_smiles_lookup:
551
+ matches = self.clean_smiles_lookup[content_clean]
552
+ if len(matches) == 1:
553
+ return matches[0], mods
554
+ else:
555
+ # Prefer L-forms (non-D prefixed codes) over D-forms
556
+ l_forms = [m for m in matches if not m.startswith('D')]
557
+ if l_forms:
558
+ return l_forms[0], mods
559
+ return matches[0], mods
560
+
561
  return None, mods
562
 
563
  def get_modifications(self, segment):
 
618
  'residues': sequence,
619
  'details': "\n".join(logs)
620
  }
621
+
622
def get_uaa_information(self):
    """Return the Markdown notes describing the supported non-standard residues.

    The text lists the curated UAA codes, their single-letter symbols and
    the Cys-Cys special case, then links to the SwissSidechain table for
    all other mappings.

    Returns:
        A Markdown-formatted string that starts and ends with a newline.
    """
    # The notes are assembled from flush-left lines so the Markdown does
    # not depend on how deeply this method is indented.
    lines = (
        "",
        "## Supported Non-Standard Amino Acids (UAAs) (Common)",
        "",
        "- **Kpg** - Lys(palmitoyl-Glu-OtBu)",
        "- **Cmt** - Cys(Mmt)",
        "- **Eal** - Glu(OAll)",
        "- **Tpb** - Thr(PO(OBzl)OH)",
        "- **Dtg** - Asp(OtBu)-(Dmb)Gly",
        "- **Aib** - α-Aminoisobutyric acid",
        "- **Nle** - Norleucine",
        "- **Hph** - Homophenylalanine",
        "- **Cyl** - Cycloleucine",
        "- **Nml** - N-methylleucine",
        "- **Nma** - N-methylalanine",
        "",
        "### Special Cases:",
        "- **Cys-Cys** - Disulfide-bridged cysteine dimer",
        "---",
        "",
        "## Three-to-One Letter Code Mapping",
        "",
        "### Standard Amino Acids:",
        "**L-amino acids:** A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y",
        "**D-amino acids:** a, c, d, e, f, g, h, i, k, l, m, n, p, q, r, s, t, v, w, y",
        "",
        "### UAA Single Letter Codes:",
        "| UAA | Code | UAA | Code | UAA | Code |",
        "|-----|------|-----|------|-----|------|",
        "| Aib | Ŷ | Dtg | Ĝ | Cmt | Ĉ |",
        "| Eal | Ė | Nml | Ŀ | Nma | Ṃ |",
        "| Kpg | Ƙ | Tpb | Ṯ | Cyl | Ċ |",
        "| Nle | Ł | Hph | Ĥ | | |",
        "",
        "### Special Cases:",
        "- **Cys-Cys:** CC (L-form) or cc (D-form)",
        "",
        # Markdown links are written [text](url); the reversed (text)[url]
        # form shows up as literal text instead of a link.
        "## For other mappings, please refer to the "
        "[SwissSidechain website]"
        "(https://www.swisssidechain.ch/browse/family/table.php?family=all)",
        "",
    )
    return "\n".join(lines)
662
+
663
  def annotate_cyclic_structure(mol, sequence):
664
  """Create structure visualization"""
665
  AllChem.Compute2DCoords(mol)
 
909
  generate_3d=False,
910
  use_uff=False
911
  ):
912
+ """Actual Execution Command."""
913
  analyzer = PeptideAnalyzer()
914
  temp_dir = tempfile.mkdtemp() if generate_3d else None
915
  structure_files = []
916
 
917
+ # Retrieve UAA information
918
+ uaa_info = analyzer.get_uaa_information()
919
+
920
  # Handle direct SMILES input
921
  if smiles_input:
922
  smiles = smiles_input.strip()
 
983
  summary += f"- {os.path.basename(filepath)}\n"
984
 
985
  #return summary, img_cyclic, img_linear, structure_files if structure_files else None
986
+ return summary, img_cyclic,
987
 
988
  except Exception as e:
989
  #return f"Error processing SMILES: {str(e)}", None, None, []
990
+ return f"Error processing SMILES: {str(e)}", None, uaa_info
991
  # Handle file input
992
  if file_obj is not None:
993
  try:
 
1019
  output_text += f"Error processing SMILES: {smiles} - {str(e)}\n"
1020
  output_text += "-" * 50 + "\n"
1021
 
1022
+ return output_text, None, uaa_info
1023
 
1024
  except Exception as e:
1025
+ #return f"Error processing file: {str(e)}", None, None, []
1026
+ return f"Error processing file: {str(e)}", None, uaa_info
1027
 
1028
  return (
1029
  output_text or "No analysis done.",
1030
+ img_cyclic if 'img_cyclic' in locals() else None, uaa_info
1031
  #img_linear if 'img_linear' in locals() else None,
1032
+ #structure_files if structure_files else []
1033
  )
1034
 
1035
  iface = gr.Interface(
 
1060
  #gr.File(
1061
  #label="3D Structure Files",
1062
  #file_count="multiple"
1063
+ #),
1064
+ gr.Markdown(
1065
+ label="Side Notes for Non-Standard Amino Acids",
1066
+ )
1067
  ],
1068
  title="Peptide Structure Analyzer and Visualizer",
1069
  description='''
 
1072
  2. Determines if the peptide is cyclic
1073
  3. Parses the amino acid sequence
1074
  4. Creates 2D structure visualization with residue annotations
 
 
1075
 
1076
  Input: Either enter a SMILES string directly or upload a text file containing SMILES strings
1077
 
 
1093
  iface.launch(share=True)
1094
 
1095
  """
1096
+ 5. Optional linear representation
1097
+ 6. Optional 3D structure generation (ETKDG and UFF methods)
1098
  gr.Checkbox(
1099
  label="Generate 3D structure (sdf file format)",
1100
  value=False
swisssidechain.py ADDED
The diff for this file is too large to render. See raw diff