Spaces:

ChatterjeeLab
/

SMILES2PEPTIDE

Running

App Files Files Community

yinuozhang committed on May 30

Commit

0f1a97c

1 Parent(s): 4ae3df6

add more UAA and description

Browse files

Files changed (3) hide show

__init__.py +0 -0
app.py +123 -21
swisssidechain.py +0 -0

__init__.py ADDED Viewed

File without changes

app.py CHANGED Viewed

@@ -23,6 +23,7 @@ import matplotlib.patches as patches
 from io import BytesIO
 import tempfile
 from rdkit import Chem
 class PeptideAnalyzer:
     def __init__(self):
@@ -66,6 +67,44 @@ class PeptideAnalyzer:
             'Aib': 'Ŷ', 'Dtg': 'Ĝ', 'Cmt': 'Ĉ', 'Eal': 'Ė', 'Nml': "Ŀ", 'Nma': 'Ṃ',
             'Kpg': 'Ƙ', 'Tpb': 'Ṯ', 'Cyl': 'Ċ', 'Nle': 'Ł', 'Hph': 'Ĥ', 'Cys-Cys': 'CC', 'cys-cys': 'cc',
         }
     def preprocess_complex_residues(self, smiles):
         """Identify and protect complex residues with internal peptide bonds - improved to prevent overlaps"""
         complex_positions = []
@@ -233,7 +272,6 @@ class PeptideAnalyzer:
         return False
     def is_cyclic(self, smiles):
-        """Improved cyclic peptide detection"""
         # Check for C-terminal carboxyl
         if smiles.endswith('C(=O)O'):
             return False, [], []
@@ -270,10 +308,9 @@ class PeptideAnalyzer:
         return content
     def identify_residue(self, segment):
-        """Identify residue with Pro reconstruction"""
-        # Only clean terminal carboxyl if this is the last segment
         if 'complex_type' in segment:
             return segment['complex_type'], []
         content = self.clean_terminal_carboxyl(segment)
         mods = self.get_modifications(segment)
@@ -285,9 +322,10 @@ class PeptideAnalyzer:
             print("DIRECT MATCH: Found Pro at end")
             return 'Pro', mods
-        # Eal - Glu(OAll) - Multiple patterns
         if 'CCC(=O)OCC=C' in content or 'CC(=O)OCC=C' in content or 'C=CCOC(=O)CC' in content:
             return 'Eal', mods
         # Proline (P)
         if any([
             (segment.get('bond_after', '').startswith(f'N{n}C(=O)') and 'CCC' in content and
@@ -315,20 +353,20 @@ class PeptideAnalyzer:
         if ('N1[C@H](CCC1)' in content):
             return 'pro', mods
-        # Tryptophan (W) - more specific indole pattern
         if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
         'c[nH]c' in content.replace(' ', ''):
             if '[C@H](CC' in content:  # D-form
                 return 'trp', mods
             return 'Trp', mods
-        # Lysine (K) - both patterns
         if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
             if '[C@H](CCCCN)' in content:  # D-form
                 return 'lys', mods
             return 'Lys', mods
-        # Arginine (R) - both patterns
         if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
             if '[C@H](CCCNC(=N)N)' in content:  # D-form
                 return 'arg', mods
@@ -370,7 +408,6 @@ class PeptideAnalyzer:
         if ('CC(C)[C@@H]' in content or 'CC(C)[C@H]' in content or
             '[C@H](C(C)C)' in content or '[C@@H](C(C)C)' in content or
             'C(C)C[C@H]' in content or 'C(C)C[C@@H]' in content):
             if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]', 'CCC(=O)']):
                 if '[C@H]' in content and not '[C@@H]' in content:  # D-form
                     return 'val', mods
@@ -385,7 +422,6 @@ class PeptideAnalyzer:
                 'CC[C@H](C)[C@@H]' in content, 'CC[C@@H](C)[C@H]' in content,
                 'CC[C@H](C)[C@H]' in content, 'CC[C@@H](C)[C@@H]' in content])
             and 'CC(C)C' not in content):  # Exclude valine pattern
             if any(['[C@H]([C@@H](CC)C)' in content, '[C@H](CC)C' in content,
                     '[C@H]([C@@H](C)CC)' in content, '[C@H]([C@H](C)CC)' in content,
                     'C[C@@H](CC)[C@H]' in content, 'C[C@H](CC)[C@H]' in content,
@@ -393,6 +429,7 @@ class PeptideAnalyzer:
                 # D-form
                 return 'ile', mods
             return 'Ile', mods
         # Tpb - Thr(PO(OBzl)OH)
         if re.search(r'\(C\)OP\(=O\)\(O\)OCc[0-9]ccccc[0-9]', content) or 'OP(=O)(O)OCC' in content:
             return 'Tpb', mods
@@ -503,7 +540,24 @@ class PeptideAnalyzer:
         if 'CCCNC(=O)' in content and 'CCCCCCCCCCCC' in content:
             return 'Kpg', mods
         return None, mods
     def get_modifications(self, segment):
@@ -564,7 +618,48 @@ class PeptideAnalyzer:
             'residues': sequence,
             'details': "\n".join(logs)
         }
 def annotate_cyclic_structure(mol, sequence):
     """Create structure visualization"""
     AllChem.Compute2DCoords(mol)
@@ -814,11 +909,14 @@ def process_input(
     generate_3d=False,
     use_uff=False
 ):
-    """Process input and create visualizations using PeptideAnalyzer"""
     analyzer = PeptideAnalyzer()
     temp_dir = tempfile.mkdtemp() if generate_3d else None
     structure_files = []
     # Handle direct SMILES input
     if smiles_input:
         smiles = smiles_input.strip()
@@ -885,11 +983,11 @@ def process_input(
                     summary += f"- {os.path.basename(filepath)}\n"
             #return summary, img_cyclic, img_linear, structure_files if structure_files else None
-            return summary, img_cyclic, structure_files or None
         except Exception as e:
             #return f"Error processing SMILES: {str(e)}", None, None, []
-            return f"Error processing SMILES: {str(e)}", None, []
     # Handle file input
     if file_obj is not None:
         try:
@@ -921,16 +1019,17 @@ def process_input(
                     output_text += f"Error processing SMILES: {smiles} - {str(e)}\n"
                     output_text += "-" * 50 + "\n"
-            return output_text, None, None, []
         except Exception as e:
-            return f"Error processing file: {str(e)}", None, None, []
     return (
             output_text or "No analysis done.",
-            img_cyclic if 'img_cyclic' in locals() else None,
             #img_linear if 'img_linear' in locals() else None,
-            structure_files if structure_files else []
         )
 iface = gr.Interface(
@@ -961,7 +1060,10 @@ iface = gr.Interface(
         #gr.File(
             #label="3D Structure Files",
             #file_count="multiple"
-        #)
     ],
     title="Peptide Structure Analyzer and Visualizer",
     description='''
@@ -970,8 +1072,6 @@ iface = gr.Interface(
     2. Determines if the peptide is cyclic
     3. Parses the amino acid sequence
     4. Creates 2D structure visualization with residue annotations
-    5. Optional linear representation
-    6. Optional 3D structure generation (ETKDG and UFF methods)
     Input: Either enter a SMILES string directly or upload a text file containing SMILES strings
@@ -993,6 +1093,8 @@ if __name__ == "__main__":
     iface.launch(share=True)
 """
 gr.Checkbox(
     label="Generate 3D structure (sdf file format)",
     value=False

 from io import BytesIO
 import tempfile
 from rdkit import Chem
+from swisssidechain import all_aminos
 class PeptideAnalyzer:
     def __init__(self):
             'Aib': 'Ŷ', 'Dtg': 'Ĝ', 'Cmt': 'Ĉ', 'Eal': 'Ė', 'Nml': "Ŀ", 'Nma': 'Ṃ',
             'Kpg': 'Ƙ', 'Tpb': 'Ṯ', 'Cyl': 'Ċ', 'Nle': 'Ł', 'Hph': 'Ĥ', 'Cys-Cys': 'CC', 'cys-cys': 'cc',
         }
+        self._build_swisssidechain_lookups()
+    def _build_swisssidechain_lookups(self):
+        """Side chain lookups for SwissSidechain UAAs"""
+        # Exact SMILES match
+        self.exact_smiles_lookup = {}
+        # Clean SMILES lookup (without stereochemistry)
+        self.clean_smiles_lookup = {}
+        for uaa_name, uaa_data in all_aminos.items():
+            code = uaa_data["Code"]
+            letter = uaa_data["Letter"]
+            smiles = uaa_data["SMILES"]
+            self.three_to_one[code] = letter
+            self.exact_smiles_lookup[smiles] = code
+            # Clean SMILES (no stereochemistry)
+            clean_smiles = self._remove_stereochemistry(smiles)
+            if clean_smiles not in self.clean_smiles_lookup:
+                self.clean_smiles_lookup[clean_smiles] = []
+            self.clean_smiles_lookup[clean_smiles].append(code)
+    def _remove_stereochemistry(self, smiles):
+        """Remove stereochemistry from SMILES"""
+        cleaned = smiles
+        stereochemistry_patterns = [
+            '[C@@H]', '[C@H]', '[C@@]', '[C@]',
+            '[S@@]', '[S@]', '[N@@]', '[N@]',
+            '@@', '@'
+        ]
+        for pattern in stereochemistry_patterns:
+            cleaned = cleaned.replace(pattern, pattern.replace('@@', '').replace('@', '').replace('[', '').replace(']', ''))
+        return cleaned
     def preprocess_complex_residues(self, smiles):
         """Identify and protect complex residues with internal peptide bonds - improved to prevent overlaps"""
         complex_positions = []
         return False
     def is_cyclic(self, smiles):
         # Check for C-terminal carboxyl
         if smiles.endswith('C(=O)O'):
             return False, [], []
         return content
     def identify_residue(self, segment):
         if 'complex_type' in segment:
             return segment['complex_type'], []
         content = self.clean_terminal_carboxyl(segment)
         mods = self.get_modifications(segment)
             print("DIRECT MATCH: Found Pro at end")
             return 'Pro', mods
+        # Eal - Glu(OAll)
         if 'CCC(=O)OCC=C' in content or 'CC(=O)OCC=C' in content or 'C=CCOC(=O)CC' in content:
             return 'Eal', mods
         # Proline (P)
         if any([
             (segment.get('bond_after', '').startswith(f'N{n}C(=O)') and 'CCC' in content and
         if ('N1[C@H](CCC1)' in content):
             return 'pro', mods
+        # Tryptophan (W)
         if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
         'c[nH]c' in content.replace(' ', ''):
             if '[C@H](CC' in content:  # D-form
                 return 'trp', mods
             return 'Trp', mods
+        # Lysine (K)
         if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
             if '[C@H](CCCCN)' in content:  # D-form
                 return 'lys', mods
             return 'Lys', mods
+        # Arginine (R)
         if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
             if '[C@H](CCCNC(=N)N)' in content:  # D-form
                 return 'arg', mods
         if ('CC(C)[C@@H]' in content or 'CC(C)[C@H]' in content or
             '[C@H](C(C)C)' in content or '[C@@H](C(C)C)' in content or
             'C(C)C[C@H]' in content or 'C(C)C[C@@H]' in content):
             if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]', 'CCC(=O)']):
                 if '[C@H]' in content and not '[C@@H]' in content:  # D-form
                     return 'val', mods
                 'CC[C@H](C)[C@@H]' in content, 'CC[C@@H](C)[C@H]' in content,
                 'CC[C@H](C)[C@H]' in content, 'CC[C@@H](C)[C@@H]' in content])
             and 'CC(C)C' not in content):  # Exclude valine pattern
             if any(['[C@H]([C@@H](CC)C)' in content, '[C@H](CC)C' in content,
                     '[C@H]([C@@H](C)CC)' in content, '[C@H]([C@H](C)CC)' in content,
                     'C[C@@H](CC)[C@H]' in content, 'C[C@H](CC)[C@H]' in content,
                 # D-form
                 return 'ile', mods
             return 'Ile', mods
         # Tpb - Thr(PO(OBzl)OH)
         if re.search(r'\(C\)OP\(=O\)\(O\)OCc[0-9]ccccc[0-9]', content) or 'OP(=O)(O)OCC' in content:
             return 'Tpb', mods
         if 'CCCNC(=O)' in content and 'CCCCCCCCCCCC' in content:
             return 'Kpg', mods
+        #======================Other UAAs from the SwissSidechain==========================================
+        # SwissSidechain matching runs at the very end - only if nothing else matched
+        if content in self.exact_smiles_lookup:
+            return self.exact_smiles_lookup[content], mods
+        # Look up again, ignoring stereochemistry differences
+        content_clean = self._remove_stereochemistry(content)
+        if content_clean in self.clean_smiles_lookup:
+            matches = self.clean_smiles_lookup[content_clean]
+            if len(matches) == 1:
+                return matches[0], mods
+            else:
+                # Prefer L-forms (non-D prefixed codes) over D-forms
+                l_forms = [m for m in matches if not m.startswith('D')]
+                if l_forms:
+                    return l_forms[0], mods
+                return matches[0], mods
         return None, mods
     def get_modifications(self, segment):
             'residues': sequence,
             'details': "\n".join(logs)
         }
+    def get_uaa_information(self):
+        uaa_info = """
+                ## Supported Non-Standard Amino Acids (UAAs) (Common)
+                - **Kpg** - Lys(palmitoyl-Glu-OtBu)
+                - **Cmt** - Cys(Mmt)
+                - **Eal** - Glu(OAll)
+                - **Tpb** - Thr(PO(OBzl)OH)
+                - **Dtg** - Asp(OtBu)-(Dmb)Gly
+                - **Aib** - α-Aminoisobutyric acid
+                - **Nle** - Norleucine
+                - **Hph** - Homophenylalanine
+                - **Cyl** - Cycloleucine
+                - **Nml** - N-methylleucine
+                - **Nma** - N-methylalanine
+                ### Special Cases:
+                - **Cys-Cys** - Disulfide-bridged cysteine dimer
+                ---
+                ## Three-to-One Letter Code Mapping
+                ### Standard Amino Acids:
+                **L-amino acids:** A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y
+                **D-amino acids:** a, c, d, e, f, g, h, i, k, l, m, n, p, q, r, s, t, v, w, y
+                ### UAA Single Letter Codes:
+                | UAA | Code | UAA | Code | UAA | Code |
+                |-----|------|-----|------|-----|------|
+                | Aib | Ŷ | Dtg | Ĝ | Cmt | Ĉ |
+                | Eal | Ė | Nml | Ŀ | Nma | Ṃ |
+                | Kpg | Ƙ | Tpb | Ṯ | Cyl | Ċ |
+                | Nle | Ł | Hph | Ĥ | | |
+                ### Special Cases:
+                - **Cys-Cys:** CC (L-form) or cc (D-form)
+                ## For other mappings, please refer to the (SwissSideChain webside)[https://www.swisssidechain.ch/browse/family/table.php?family=all]
+                """
+        return uaa_info
 def annotate_cyclic_structure(mol, sequence):
     """Create structure visualization"""
     AllChem.Compute2DCoords(mol)
     generate_3d=False,
     use_uff=False
 ):
+    """Actual Execution Command."""
     analyzer = PeptideAnalyzer()
     temp_dir = tempfile.mkdtemp() if generate_3d else None
     structure_files = []
+    # Retrieve UAA information
+    uaa_info = analyzer.get_uaa_information()
     # Handle direct SMILES input
     if smiles_input:
         smiles = smiles_input.strip()
                     summary += f"- {os.path.basename(filepath)}\n"
             #return summary, img_cyclic, img_linear, structure_files if structure_files else None
+            return summary, img_cyclic,
         except Exception as e:
             #return f"Error processing SMILES: {str(e)}", None, None, []
+            return f"Error processing SMILES: {str(e)}", None, uaa_info
     # Handle file input
     if file_obj is not None:
         try:
                     output_text += f"Error processing SMILES: {smiles} - {str(e)}\n"
                     output_text += "-" * 50 + "\n"
+            return output_text, None, uaa_info
         except Exception as e:
+            #return f"Error processing file: {str(e)}", None, None, []
+            return f"Error processing file: {str(e)}", None, uaa_info
     return (
             output_text or "No analysis done.",
+            img_cyclic if 'img_cyclic' in locals() else None, uaa_info
             #img_linear if 'img_linear' in locals() else None,
+            #structure_files if structure_files else []
         )
 iface = gr.Interface(
         #gr.File(
             #label="3D Structure Files",
             #file_count="multiple"
+        #),
+        gr.Markdown(
+            label="Side Notes for Non-Standard Amino Acids",
+        )
     ],
     title="Peptide Structure Analyzer and Visualizer",
     description='''
     2. Determines if the peptide is cyclic
     3. Parses the amino acid sequence
     4. Creates 2D structure visualization with residue annotations
     Input: Either enter a SMILES string directly or upload a text file containing SMILES strings
     iface.launch(share=True)
 """
+5. Optional linear representation
+6. Optional 3D structure generation (ETKDG and UFF methods)
 gr.Checkbox(
     label="Generate 3D structure (sdf file format)",
     value=False

swisssidechain.py ADDED Viewed

The diff for this file is too large to render. See raw diff