Spaces:
Running
Running
Commit
·
0f1a97c
1
Parent(s):
4ae3df6
add more UAA and description
Browse files- __init__.py +0 -0
- app.py +123 -21
- swisssidechain.py +0 -0
__init__.py
ADDED
File without changes
|
app.py
CHANGED
@@ -23,6 +23,7 @@ import matplotlib.patches as patches
|
|
23 |
from io import BytesIO
|
24 |
import tempfile
|
25 |
from rdkit import Chem
|
|
|
26 |
|
27 |
class PeptideAnalyzer:
|
28 |
def __init__(self):
|
@@ -66,6 +67,44 @@ class PeptideAnalyzer:
|
|
66 |
'Aib': 'Ŷ', 'Dtg': 'Ĝ', 'Cmt': 'Ĉ', 'Eal': 'Ė', 'Nml': "Ŀ", 'Nma': 'Ṃ',
|
67 |
'Kpg': 'Ƙ', 'Tpb': 'Ṯ', 'Cyl': 'Ċ', 'Nle': 'Ł', 'Hph': 'Ĥ', 'Cys-Cys': 'CC', 'cys-cys': 'cc',
|
68 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
def preprocess_complex_residues(self, smiles):
|
70 |
"""Identify and protect complex residues with internal peptide bonds - improved to prevent overlaps"""
|
71 |
complex_positions = []
|
@@ -233,7 +272,6 @@ class PeptideAnalyzer:
|
|
233 |
return False
|
234 |
|
235 |
def is_cyclic(self, smiles):
|
236 |
-
"""Improved cyclic peptide detection"""
|
237 |
# Check for C-terminal carboxyl
|
238 |
if smiles.endswith('C(=O)O'):
|
239 |
return False, [], []
|
@@ -270,10 +308,9 @@ class PeptideAnalyzer:
|
|
270 |
return content
|
271 |
|
272 |
def identify_residue(self, segment):
|
273 |
-
"""Identify residue with Pro reconstruction"""
|
274 |
-
# Only clean terminal carboxyl if this is the last segment
|
275 |
if 'complex_type' in segment:
|
276 |
return segment['complex_type'], []
|
|
|
277 |
content = self.clean_terminal_carboxyl(segment)
|
278 |
mods = self.get_modifications(segment)
|
279 |
|
@@ -285,9 +322,10 @@ class PeptideAnalyzer:
|
|
285 |
print("DIRECT MATCH: Found Pro at end")
|
286 |
return 'Pro', mods
|
287 |
|
288 |
-
# Eal - Glu(OAll)
|
289 |
if 'CCC(=O)OCC=C' in content or 'CC(=O)OCC=C' in content or 'C=CCOC(=O)CC' in content:
|
290 |
return 'Eal', mods
|
|
|
291 |
# Proline (P)
|
292 |
if any([
|
293 |
(segment.get('bond_after', '').startswith(f'N{n}C(=O)') and 'CCC' in content and
|
@@ -315,20 +353,20 @@ class PeptideAnalyzer:
|
|
315 |
if ('N1[C@H](CCC1)' in content):
|
316 |
return 'pro', mods
|
317 |
|
318 |
-
# Tryptophan (W)
|
319 |
if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
|
320 |
'c[nH]c' in content.replace(' ', ''):
|
321 |
if '[C@H](CC' in content: # D-form
|
322 |
return 'trp', mods
|
323 |
return 'Trp', mods
|
324 |
|
325 |
-
# Lysine (K)
|
326 |
if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
|
327 |
if '[C@H](CCCCN)' in content: # D-form
|
328 |
return 'lys', mods
|
329 |
return 'Lys', mods
|
330 |
|
331 |
-
# Arginine (R)
|
332 |
if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
|
333 |
if '[C@H](CCCNC(=N)N)' in content: # D-form
|
334 |
return 'arg', mods
|
@@ -370,7 +408,6 @@ class PeptideAnalyzer:
|
|
370 |
if ('CC(C)[C@@H]' in content or 'CC(C)[C@H]' in content or
|
371 |
'[C@H](C(C)C)' in content or '[C@@H](C(C)C)' in content or
|
372 |
'C(C)C[C@H]' in content or 'C(C)C[C@@H]' in content):
|
373 |
-
|
374 |
if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]', 'CCC(=O)']):
|
375 |
if '[C@H]' in content and not '[C@@H]' in content: # D-form
|
376 |
return 'val', mods
|
@@ -385,7 +422,6 @@ class PeptideAnalyzer:
|
|
385 |
'CC[C@H](C)[C@@H]' in content, 'CC[C@@H](C)[C@H]' in content,
|
386 |
'CC[C@H](C)[C@H]' in content, 'CC[C@@H](C)[C@@H]' in content])
|
387 |
and 'CC(C)C' not in content): # Exclude valine pattern
|
388 |
-
|
389 |
if any(['[C@H]([C@@H](CC)C)' in content, '[C@H](CC)C' in content,
|
390 |
'[C@H]([C@@H](C)CC)' in content, '[C@H]([C@H](C)CC)' in content,
|
391 |
'C[C@@H](CC)[C@H]' in content, 'C[C@H](CC)[C@H]' in content,
|
@@ -393,6 +429,7 @@ class PeptideAnalyzer:
|
|
393 |
# D-form
|
394 |
return 'ile', mods
|
395 |
return 'Ile', mods
|
|
|
396 |
# Tpb - Thr(PO(OBzl)OH)
|
397 |
if re.search(r'\(C\)OP\(=O\)\(O\)OCc[0-9]ccccc[0-9]', content) or 'OP(=O)(O)OCC' in content:
|
398 |
return 'Tpb', mods
|
@@ -503,7 +540,24 @@ class PeptideAnalyzer:
|
|
503 |
if 'CCCNC(=O)' in content and 'CCCCCCCCCCCC' in content:
|
504 |
return 'Kpg', mods
|
505 |
|
506 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
507 |
return None, mods
|
508 |
|
509 |
def get_modifications(self, segment):
|
@@ -564,7 +618,48 @@ class PeptideAnalyzer:
|
|
564 |
'residues': sequence,
|
565 |
'details': "\n".join(logs)
|
566 |
}
|
567 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
568 |
def annotate_cyclic_structure(mol, sequence):
|
569 |
"""Create structure visualization"""
|
570 |
AllChem.Compute2DCoords(mol)
|
@@ -814,11 +909,14 @@ def process_input(
|
|
814 |
generate_3d=False,
|
815 |
use_uff=False
|
816 |
):
|
817 |
-
"""
|
818 |
analyzer = PeptideAnalyzer()
|
819 |
temp_dir = tempfile.mkdtemp() if generate_3d else None
|
820 |
structure_files = []
|
821 |
|
|
|
|
|
|
|
822 |
# Handle direct SMILES input
|
823 |
if smiles_input:
|
824 |
smiles = smiles_input.strip()
|
@@ -885,11 +983,11 @@ def process_input(
|
|
885 |
summary += f"- {os.path.basename(filepath)}\n"
|
886 |
|
887 |
#return summary, img_cyclic, img_linear, structure_files if structure_files else None
|
888 |
-
return summary, img_cyclic,
|
889 |
|
890 |
except Exception as e:
|
891 |
#return f"Error processing SMILES: {str(e)}", None, None, []
|
892 |
-
return f"Error processing SMILES: {str(e)}", None,
|
893 |
# Handle file input
|
894 |
if file_obj is not None:
|
895 |
try:
|
@@ -921,16 +1019,17 @@ def process_input(
|
|
921 |
output_text += f"Error processing SMILES: {smiles} - {str(e)}\n"
|
922 |
output_text += "-" * 50 + "\n"
|
923 |
|
924 |
-
return output_text, None,
|
925 |
|
926 |
except Exception as e:
|
927 |
-
return f"Error processing file: {str(e)}", None, None, []
|
|
|
928 |
|
929 |
return (
|
930 |
output_text or "No analysis done.",
|
931 |
-
img_cyclic if 'img_cyclic' in locals() else None,
|
932 |
#img_linear if 'img_linear' in locals() else None,
|
933 |
-
structure_files if structure_files else []
|
934 |
)
|
935 |
|
936 |
iface = gr.Interface(
|
@@ -961,7 +1060,10 @@ iface = gr.Interface(
|
|
961 |
#gr.File(
|
962 |
#label="3D Structure Files",
|
963 |
#file_count="multiple"
|
964 |
-
#)
|
|
|
|
|
|
|
965 |
],
|
966 |
title="Peptide Structure Analyzer and Visualizer",
|
967 |
description='''
|
@@ -970,8 +1072,6 @@ iface = gr.Interface(
|
|
970 |
2. Determines if the peptide is cyclic
|
971 |
3. Parses the amino acid sequence
|
972 |
4. Creates 2D structure visualization with residue annotations
|
973 |
-
5. Optional linear representation
|
974 |
-
6. Optional 3D structure generation (ETKDG and UFF methods)
|
975 |
|
976 |
Input: Either enter a SMILES string directly or upload a text file containing SMILES strings
|
977 |
|
@@ -993,6 +1093,8 @@ if __name__ == "__main__":
|
|
993 |
iface.launch(share=True)
|
994 |
|
995 |
"""
|
|
|
|
|
996 |
gr.Checkbox(
|
997 |
label="Generate 3D structure (sdf file format)",
|
998 |
value=False
|
|
|
23 |
from io import BytesIO
|
24 |
import tempfile
|
25 |
from rdkit import Chem
|
26 |
+
from swisssidechain import all_aminos
|
27 |
|
28 |
class PeptideAnalyzer:
|
29 |
def __init__(self):
|
|
|
67 |
'Aib': 'Ŷ', 'Dtg': 'Ĝ', 'Cmt': 'Ĉ', 'Eal': 'Ė', 'Nml': "Ŀ", 'Nma': 'Ṃ',
|
68 |
'Kpg': 'Ƙ', 'Tpb': 'Ṯ', 'Cyl': 'Ċ', 'Nle': 'Ł', 'Hph': 'Ĥ', 'Cys-Cys': 'CC', 'cys-cys': 'cc',
|
69 |
}
|
70 |
+
|
71 |
+
self._build_swisssidechain_lookups()
|
72 |
+
|
73 |
+
def _build_swisssidechain_lookups(self):
    """Populate the SwissSidechain lookup tables used for UAA matching.

    For every entry of ``all_aminos`` this:

    * registers the entry's three-letter code -> letter in
      ``self.three_to_one``;
    * maps the entry's SMILES string to its code in
      ``self.exact_smiles_lookup``;
    * appends the code to the list stored in ``self.clean_smiles_lookup``
      under the stereochemistry-stripped form of that SMILES (several
      codes may share one stripped form, hence a list).
    """
    exact_by_smiles = {}
    codes_by_clean_smiles = {}
    self.exact_smiles_lookup = exact_by_smiles
    self.clean_smiles_lookup = codes_by_clean_smiles

    for entry in all_aminos.values():
        # Read all three fields up front, before touching any table.
        code, letter, smiles = entry["Code"], entry["Letter"], entry["SMILES"]

        self.three_to_one[code] = letter
        exact_by_smiles[smiles] = code

        # Group codes by their stereo-free SMILES key.
        stripped = self._remove_stereochemistry(smiles)
        codes_by_clean_smiles.setdefault(stripped, []).append(code)
|
95 |
+
|
96 |
+
def _remove_stereochemistry(self, smiles):
|
97 |
+
"""Remove stereochemistry from SMILES"""
|
98 |
+
cleaned = smiles
|
99 |
+
stereochemistry_patterns = [
|
100 |
+
'[C@@H]', '[C@H]', '[C@@]', '[C@]',
|
101 |
+
'[S@@]', '[S@]', '[N@@]', '[N@]',
|
102 |
+
'@@', '@'
|
103 |
+
]
|
104 |
+
for pattern in stereochemistry_patterns:
|
105 |
+
cleaned = cleaned.replace(pattern, pattern.replace('@@', '').replace('@', '').replace('[', '').replace(']', ''))
|
106 |
+
return cleaned
|
107 |
+
|
108 |
def preprocess_complex_residues(self, smiles):
|
109 |
"""Identify and protect complex residues with internal peptide bonds - improved to prevent overlaps"""
|
110 |
complex_positions = []
|
|
|
272 |
return False
|
273 |
|
274 |
def is_cyclic(self, smiles):
|
|
|
275 |
# Check for C-terminal carboxyl
|
276 |
if smiles.endswith('C(=O)O'):
|
277 |
return False, [], []
|
|
|
308 |
return content
|
309 |
|
310 |
def identify_residue(self, segment):
|
|
|
|
|
311 |
if 'complex_type' in segment:
|
312 |
return segment['complex_type'], []
|
313 |
+
|
314 |
content = self.clean_terminal_carboxyl(segment)
|
315 |
mods = self.get_modifications(segment)
|
316 |
|
|
|
322 |
print("DIRECT MATCH: Found Pro at end")
|
323 |
return 'Pro', mods
|
324 |
|
325 |
+
# Eal - Glu(OAll)
|
326 |
if 'CCC(=O)OCC=C' in content or 'CC(=O)OCC=C' in content or 'C=CCOC(=O)CC' in content:
|
327 |
return 'Eal', mods
|
328 |
+
|
329 |
# Proline (P)
|
330 |
if any([
|
331 |
(segment.get('bond_after', '').startswith(f'N{n}C(=O)') and 'CCC' in content and
|
|
|
353 |
if ('N1[C@H](CCC1)' in content):
|
354 |
return 'pro', mods
|
355 |
|
356 |
+
# Tryptophan (W)
|
357 |
if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
|
358 |
'c[nH]c' in content.replace(' ', ''):
|
359 |
if '[C@H](CC' in content: # D-form
|
360 |
return 'trp', mods
|
361 |
return 'Trp', mods
|
362 |
|
363 |
+
# Lysine (K)
|
364 |
if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
|
365 |
if '[C@H](CCCCN)' in content: # D-form
|
366 |
return 'lys', mods
|
367 |
return 'Lys', mods
|
368 |
|
369 |
+
# Arginine (R)
|
370 |
if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
|
371 |
if '[C@H](CCCNC(=N)N)' in content: # D-form
|
372 |
return 'arg', mods
|
|
|
408 |
if ('CC(C)[C@@H]' in content or 'CC(C)[C@H]' in content or
|
409 |
'[C@H](C(C)C)' in content or '[C@@H](C(C)C)' in content or
|
410 |
'C(C)C[C@H]' in content or 'C(C)C[C@@H]' in content):
|
|
|
411 |
if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]', 'CCC(=O)']):
|
412 |
if '[C@H]' in content and not '[C@@H]' in content: # D-form
|
413 |
return 'val', mods
|
|
|
422 |
'CC[C@H](C)[C@@H]' in content, 'CC[C@@H](C)[C@H]' in content,
|
423 |
'CC[C@H](C)[C@H]' in content, 'CC[C@@H](C)[C@@H]' in content])
|
424 |
and 'CC(C)C' not in content): # Exclude valine pattern
|
|
|
425 |
if any(['[C@H]([C@@H](CC)C)' in content, '[C@H](CC)C' in content,
|
426 |
'[C@H]([C@@H](C)CC)' in content, '[C@H]([C@H](C)CC)' in content,
|
427 |
'C[C@@H](CC)[C@H]' in content, 'C[C@H](CC)[C@H]' in content,
|
|
|
429 |
# D-form
|
430 |
return 'ile', mods
|
431 |
return 'Ile', mods
|
432 |
+
|
433 |
# Tpb - Thr(PO(OBzl)OH)
|
434 |
if re.search(r'\(C\)OP\(=O\)\(O\)OCc[0-9]ccccc[0-9]', content) or 'OP(=O)(O)OCC' in content:
|
435 |
return 'Tpb', mods
|
|
|
540 |
if 'CCCNC(=O)' in content and 'CCCCCCCCCCCC' in content:
|
541 |
return 'Kpg', mods
|
542 |
|
543 |
+
#======================Other UAAs from the SwissSidechain==========================================
|
544 |
+
# ADD SWISSSIDECHAIN MATCHING AT THE VERY END - only if nothing else matched
|
545 |
+
if content in self.exact_smiles_lookup:
|
546 |
+
return self.exact_smiles_lookup[content], mods
|
547 |
+
|
548 |
+
# Look up ignoring stereochemistry differences
|
549 |
+
content_clean = self._remove_stereochemistry(content)
|
550 |
+
if content_clean in self.clean_smiles_lookup:
|
551 |
+
matches = self.clean_smiles_lookup[content_clean]
|
552 |
+
if len(matches) == 1:
|
553 |
+
return matches[0], mods
|
554 |
+
else:
|
555 |
+
# Prefer L-forms (non-D prefixed codes) over D-forms
|
556 |
+
l_forms = [m for m in matches if not m.startswith('D')]
|
557 |
+
if l_forms:
|
558 |
+
return l_forms[0], mods
|
559 |
+
return matches[0], mods
|
560 |
+
|
561 |
return None, mods
|
562 |
|
563 |
def get_modifications(self, segment):
|
|
|
618 |
'residues': sequence,
|
619 |
'details': "\n".join(logs)
|
620 |
}
|
621 |
+
|
622 |
+
def get_uaa_information(self):
    """Return the Markdown help text describing supported non-standard amino acids.

    The text lists the commonly supported UAAs, the single-letter codes
    assigned to them, and a link to the SwissSidechain table for all other
    mappings.  The trailing link uses standard Markdown ``[text](url)``
    syntax so that it renders as a clickable link.

    Returns:
        A Markdown-formatted string.
    """
    uaa_info = """
## Supported Non-Standard Amino Acids (UAAs) (Common)

- **Kpg** - Lys(palmitoyl-Glu-OtBu)
- **Cmt** - Cys(Mmt)
- **Eal** - Glu(OAll)
- **Tpb** - Thr(PO(OBzl)OH)
- **Dtg** - Asp(OtBu)-(Dmb)Gly
- **Aib** - α-Aminoisobutyric acid
- **Nle** - Norleucine
- **Hph** - Homophenylalanine
- **Cyl** - Cycloleucine
- **Nml** - N-methylleucine
- **Nma** - N-methylalanine

### Special Cases:
- **Cys-Cys** - Disulfide-bridged cysteine dimer
---

## Three-to-One Letter Code Mapping

### Standard Amino Acids:
**L-amino acids:** A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y
**D-amino acids:** a, c, d, e, f, g, h, i, k, l, m, n, p, q, r, s, t, v, w, y

### UAA Single Letter Codes:
| UAA | Code | UAA | Code | UAA | Code |
|-----|------|-----|------|-----|------|
| Aib | Ŷ | Dtg | Ĝ | Cmt | Ĉ |
| Eal | Ė | Nml | Ŀ | Nma | Ṃ |
| Kpg | Ƙ | Tpb | Ṯ | Cyl | Ċ |
| Nle | Ł | Hph | Ĥ | | |

### Special Cases:
- **Cys-Cys:** CC (L-form) or cc (D-form)

## For other mappings, please refer to the [SwissSideChain website](https://www.swisssidechain.ch/browse/family/table.php?family=all)
"""
    return uaa_info
|
662 |
+
|
663 |
def annotate_cyclic_structure(mol, sequence):
|
664 |
"""Create structure visualization"""
|
665 |
AllChem.Compute2DCoords(mol)
|
|
|
909 |
generate_3d=False,
|
910 |
use_uff=False
|
911 |
):
|
912 |
+
"""Actual Execution Command."""
|
913 |
analyzer = PeptideAnalyzer()
|
914 |
temp_dir = tempfile.mkdtemp() if generate_3d else None
|
915 |
structure_files = []
|
916 |
|
917 |
+
# Retrieve UAA information
|
918 |
+
uaa_info = analyzer.get_uaa_information()
|
919 |
+
|
920 |
# Handle direct SMILES input
|
921 |
if smiles_input:
|
922 |
smiles = smiles_input.strip()
|
|
|
983 |
summary += f"- {os.path.basename(filepath)}\n"
|
984 |
|
985 |
#return summary, img_cyclic, img_linear, structure_files if structure_files else None
|
986 |
+
return summary, img_cyclic,
|
987 |
|
988 |
except Exception as e:
|
989 |
#return f"Error processing SMILES: {str(e)}", None, None, []
|
990 |
+
return f"Error processing SMILES: {str(e)}", None, uaa_info
|
991 |
# Handle file input
|
992 |
if file_obj is not None:
|
993 |
try:
|
|
|
1019 |
output_text += f"Error processing SMILES: {smiles} - {str(e)}\n"
|
1020 |
output_text += "-" * 50 + "\n"
|
1021 |
|
1022 |
+
return output_text, None, uaa_info
|
1023 |
|
1024 |
except Exception as e:
|
1025 |
+
#return f"Error processing file: {str(e)}", None, None, []
|
1026 |
+
return f"Error processing file: {str(e)}", None, uaa_info
|
1027 |
|
1028 |
return (
|
1029 |
output_text or "No analysis done.",
|
1030 |
+
img_cyclic if 'img_cyclic' in locals() else None, uaa_info
|
1031 |
#img_linear if 'img_linear' in locals() else None,
|
1032 |
+
#structure_files if structure_files else []
|
1033 |
)
|
1034 |
|
1035 |
iface = gr.Interface(
|
|
|
1060 |
#gr.File(
|
1061 |
#label="3D Structure Files",
|
1062 |
#file_count="multiple"
|
1063 |
+
#),
|
1064 |
+
gr.Markdown(
|
1065 |
+
label="Side Notes for Non-Standard Amino Acids",
|
1066 |
+
)
|
1067 |
],
|
1068 |
title="Peptide Structure Analyzer and Visualizer",
|
1069 |
description='''
|
|
|
1072 |
2. Determines if the peptide is cyclic
|
1073 |
3. Parses the amino acid sequence
|
1074 |
4. Creates 2D structure visualization with residue annotations
|
|
|
|
|
1075 |
|
1076 |
Input: Either enter a SMILES string directly or upload a text file containing SMILES strings
|
1077 |
|
|
|
1093 |
iface.launch(share=True)
|
1094 |
|
1095 |
"""
|
1096 |
+
5. Optional linear representation
|
1097 |
+
6. Optional 3D structure generation (ETKDG and UFF methods)
|
1098 |
gr.Checkbox(
|
1099 |
label="Generate 3D structure (sdf file format)",
|
1100 |
value=False
|
swisssidechain.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|