Spaces:

ChatterjeeLab
/

SMILES2PEPTIDE

Running

App Files Files Community

yinuozhang committed on Dec 9, 2024

Commit

d7b6536

1 Parent(s): 84bbd6a

add uaa and sdf

Browse files

Files changed (1) hide show

app.py +546 -71

app.py CHANGED Viewed

@@ -19,11 +19,19 @@ class PeptideAnalyzer:
         self.bond_patterns = [
             (r'OC\(=O\)', 'ester'),  # Ester bond
             (r'N\(C\)C\(=O\)', 'n_methyl'),  # N-methylated peptide bond
-            (r'N[12]C\(=O\)', 'proline'),  # Proline peptide bond
             (r'NC\(=O\)', 'peptide'),  # Standard peptide bond
             (r'C\(=O\)N\(C\)', 'n_methyl_reverse'),  # Reverse N-methylated
             (r'C\(=O\)N[12]?', 'peptide_reverse')  # Reverse peptide bond
         ]
     def is_peptide(self, smiles):
         """Check if the SMILES represents a peptide structure"""
@@ -44,47 +52,25 @@ class PeptideAnalyzer:
         return False
     def is_cyclic(self, smiles):
-        """
-        Determine if SMILES represents a cyclic peptide by checking head-tail connection.
-        Returns: (is_cyclic, peptide_cycles, aromatic_cycles)
-        """
-        # First find aromatic rings
-        aromatic_cycles = []
-        for match in re.finditer(r'c[12]ccccc[12]', smiles):
-            number = match.group(0)[1]
-            if number not in aromatic_cycles:
-                aromatic_cycles.append(str(number))
-        # Find potential cycle numbers and their contexts
-        cycle_closures = []
-        # Look for cycle starts and corresponding ends
-        cycle_patterns = [
-            # Pattern pairs (start, end)
-            (r'[^\d](\d)[A-Z@]', r'C\1=O$'),  # Classic C=O ending
-            (r'[^\d](\d)[A-Z@]', r'N\1C\(=O\)'),  # N1C(=O) pattern
-            (r'[^\d](\d)[A-Z@]', r'N\1C$'),  # Simple N1C ending
-            (r'[^\d](\d)C\(=O\)', r'N\1[A-Z]'),  # Reverse connection
-            (r'H(\d)', r'N\1C'),  # H1...N1C pattern
-            (r'[^\d](\d)(?:C|N|O)', r'(?:C|N)\1(?:\(|$)'),  # Generic cycle closure
-        ]
-        for start_pat, end_pat in cycle_patterns:
-            start_matches = re.finditer(start_pat, smiles)
-            for start_match in start_matches:
-                number = start_match.group(1)
-                if number not in aromatic_cycles:  # Skip aromatic ring numbers
-                    # Look for corresponding end pattern
-                    end_match = re.search(end_pat.replace('\\1', number), smiles)
-                    if end_match and end_match.start() > start_match.start():
-                        cycle_closures.append(number)
-                        break
-        # Remove duplicates and aromatic numbers
-        peptide_cycles = list(set(cycle_closures) - set(aromatic_cycles))
-        is_cyclic = len(peptide_cycles) > 0
         return is_cyclic, peptide_cycles, aromatic_cycles
     def split_on_bonds(self, smiles):
@@ -158,17 +144,353 @@ class PeptideAnalyzer:
         return segments
     def identify_residue(self, segment):
         """Identify residue with Pro reconstruction"""
-        content = segment['content']
         mods = self.get_modifications(segment)
-        # Special handling for Pro: reconstruct the complete pattern
-        if (segment.get('bond_after') == 'N2C(=O)' and 'CCC' in content) or \
-            ('CCCN2' in content and content.endswith('=O')):  # End case
-        # Reconstruct the complete Pro pattern
-            if '[C@@H]2' in content or '[C@H]2' in content:
-                return 'Pro', mods
         if ('C[C@H](CCCC)' in content or 'C[C@@H](CCCC)' in content) and 'CC(C)' not in content:
             return 'Nle', mods
@@ -215,34 +537,39 @@ class PeptideAnalyzer:
             return 'Leu', mods
         if '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content:
             return 'Leu', mods
-        if ('C(C)C[C@H]' in content or 'C(C)C[C@@H]' in content) and 'CC(C)C' not in content:
-            return 'Ile', mods
         if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content:
             return 'Thr', mods
         if '[C@H](Cc2ccccc2)' in content or '[C@@H](Cc2ccccc2)' in content:
             return 'Phe', mods
-        if '[C@H](C(C)C)' in content or '[C@@H](C(C)C)' in content:
-            if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]']):
                 return 'Val', mods
         if '[C@H](COC(C)(C)C)' in content or '[C@@H](COC(C)(C)C)' in content:
             return 'O-tBu', mods
         if ('[C@H](C)' in content or '[C@@H](C)' in content):
-            if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O']):
                 return 'Ala', mods
         # Tyrosine (Tyr) - 4-hydroxybenzyl side chain
-        if ('Cc2ccc(O)cc2' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
             return 'Tyr', mods
-        # Tryptophan (Trp) - Indole side chain
-        if ('Cc2c[nH]c3ccccc23' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
-            return 'Trp', mods
         # Serine (Ser) - Hydroxymethyl side chain
         if '[C@H](CO)' in content or '[C@@H](CO)' in content:
@@ -250,7 +577,7 @@ class PeptideAnalyzer:
                 return 'Ser', mods
         # Threonine (Thr) - 1-hydroxyethyl side chain
-        if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content:
             return 'Thr', mods
         # Cysteine (Cys) - Thiol side chain
@@ -277,10 +604,6 @@ class PeptideAnalyzer:
         if ('CCC(=O)O' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
             return 'Glu', mods
-        # Lysine (Lys) - 4-aminobutyl side chain
-        if ('C[C@H](CCCCN)' in content or 'C[C@@H](CCCCN)' in content):
-            return 'Lys', mods
         # Arginine (Arg) - 3-guanidinopropyl side chain
         if ('CCCNC(=N)N' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
             return 'Arg', mods
@@ -302,7 +625,7 @@ class PeptideAnalyzer:
         return mods
     def analyze_structure(self, smiles):
-        """Main analysis function"""
         print("\nAnalyzing structure:", smiles)
         # Split into segments
@@ -328,11 +651,25 @@ class PeptideAnalyzer:
                 print(f"Warning: Could not identify residue in segment: {segment['content']}")
         # Check if cyclic
-        is_cyclic = 'N1' in smiles or 'N2' in smiles
-        final_sequence = f"cyclo({'-'.join(sequence)})" if is_cyclic else '-'.join(sequence)
-        print(f"\nFinal sequence: {final_sequence}")
-        return final_sequence
 """
 def annotate_cyclic_structure(mol, sequence):
@@ -586,9 +923,119 @@ def create_enhanced_linear_viz(sequence, smiles):
     plt.tight_layout()
     return fig
-def process_input(smiles_input=None, file_obj=None, show_linear=False, show_segment_details=False):
     """Process input and create visualizations using PeptideAnalyzer"""
     analyzer = PeptideAnalyzer()
     # Handle direct SMILES input
     if smiles_input:
@@ -597,13 +1044,32 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False, show_segm
         # First check if it's a peptide using analyzer's method
         if not analyzer.is_peptide(smiles):
             return "Error: Input SMILES does not appear to be a peptide structure.", None, None
         try:
             # Create molecule
             mol = Chem.MolFromSmiles(smiles)
             if mol is None:
                 return "Error: Invalid SMILES notation.", None, None
             # Use analyzer to get sequence
             segments = analyzer.split_on_bonds(smiles)
@@ -666,7 +1132,7 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False, show_segm
                 summary += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
                 #summary += f"Aromatic Cycles: {', '.join(aromatic_cycles)}\n"
-            return summary + output_text, img_cyclic, img_linear
         except Exception as e:
             return f"Error processing SMILES: {str(e)}", None, None
@@ -757,6 +1223,14 @@ iface = gr.Interface(
         gr.Checkbox(
             label="Show segment details",
             value=False
         )
     ],
     outputs=[
@@ -781,6 +1255,7 @@ iface = gr.Interface(
     3. Parses the amino acid sequence
     4. Creates 2D structure visualization with residue annotations
     5. Optional linear representation
     Input: Either enter a SMILES string directly or upload a text file containing SMILES strings

         self.bond_patterns = [
             (r'OC\(=O\)', 'ester'),  # Ester bond
             (r'N\(C\)C\(=O\)', 'n_methyl'),  # N-methylated peptide bond
+            (r'N[0-9]C\(=O\)', 'proline'),  # Proline peptide bond
             (r'NC\(=O\)', 'peptide'),  # Standard peptide bond
             (r'C\(=O\)N\(C\)', 'n_methyl_reverse'),  # Reverse N-methylated
             (r'C\(=O\)N[12]?', 'peptide_reverse')  # Reverse peptide bond
         ]
+        # Three to one letter code mapping
+        self.three_to_one = {
+            'Ala': 'A', 'Cys': 'C', 'Asp': 'D', 'Glu': 'E',
+            'Phe': 'F', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
+            'Lys': 'K', 'Leu': 'L', 'Met': 'M', 'Asn': 'N',
+            'Pro': 'P', 'Gln': 'Q', 'Arg': 'R', 'Ser': 'S',
+            'Thr': 'T', 'Val': 'V', 'Trp': 'W', 'Tyr': 'Y'
+        }
     def is_peptide(self, smiles):
         """Check if the SMILES represents a peptide structure"""
         return False
     def is_cyclic(self, smiles):
         """Heuristically decide whether a peptide SMILES string is cyclic.

         This is a purely textual check on the SMILES string; no molecule
         is parsed.

         Args:
             smiles: SMILES string of the peptide.

         Returns:
             A tuple ``(is_cyclic, peptide_cycles, aromatic_cycles)``:
             ``is_cyclic`` is True when at least one non-aromatic ring
             number was found, ``peptide_cycles`` lists those ring numbers
             (single-character strings, first-seen order, no duplicates) and
             ``aromatic_cycles`` lists the ring numbers used by the
             recognised aromatic ring patterns.
         """
         # A string ending in a free carboxyl is treated as linear, whatever
         # ring digits appear elsewhere in it.
         if smiles.endswith('C(=O)O'):
             return False, [], []

         # Ring-closure digits that do not directly follow an aromatic 'c'.
         # The digit is captured explicitly: without a group, findall would
         # return the preceding character as well (e.g. 'N1'), which could
         # never match the single digits collected in aromatic_cycles below.
         ring_numbers = re.findall(r'(?:^|[^c])([0-9])(?=[A-Z@\(\)])', smiles)

         # Digits used by benzene-like (c1ccccc1) and indole-like
         # (c2c[nH]c3) ring fragments.
         aromatic_cycles = []
         for fragment in re.findall(r'c[0-9](?:ccccc|c\[nH\]c)[0-9]', smiles):
             aromatic_cycles.extend(re.findall(r'[0-9]', fragment))
         # dict.fromkeys de-duplicates while keeping first-seen order.
         aromatic_cycles = list(dict.fromkeys(aromatic_cycles))

         # Whatever ring numbers remain are attributed to the peptide itself.
         peptide_cycles = list(dict.fromkeys(
             number for number in ring_numbers if number not in aromatic_cycles
         ))
         return bool(peptide_cycles), peptide_cycles, aromatic_cycles
     def split_on_bonds(self, smiles):
         return segments
+    def clean_terminal_carboxyl(self, segment):
+        """Remove C-terminal carboxyl only if it's the true terminus"""
+        content = segment['content']
+        # Only clean if:
+        # 1. Contains C(=O)O
+        # 2. No bond_after exists (meaning it's the last segment)
+        # 3. C(=O)O is at the end of the content
+        if 'C(=O)O' in content and not segment.get('bond_after'):
+            print('recognized?')
+            # Remove C(=O)O pattern regardless of position
+            cleaned = re.sub(r'\(C\(=O\)O\)', '', content)
+            # Remove any leftover empty parentheses
+            cleaned = re.sub(r'\(\)', '', cleaned)
+            print(cleaned)
+            return cleaned
+        return content
     def identify_residue(self, segment):
         """Identify residue with Pro reconstruction"""
+        # Only clean terminal carboxyl if this is the last segment
+        content = self.clean_terminal_carboxyl(segment)
         mods = self.get_modifications(segment)
+        # UAA pattern matching section - before regular residues
+        # Phenylglycine and derivatives
+        if 'c1ccccc1' in content:
+            if '[C@@H](c1ccccc1)' in content or '[C@H](c1ccccc1)' in content:
+                return '4', mods  # Base phenylglycine
+        # 4-substituted phenylalanines
+        if 'Cc1ccc' in content:
+            if 'OMe' in content or 'OCc1ccc' in content:
+                return '0A1', mods  # 4-methoxy-Phenylalanine
+            elif 'Clc1ccc' in content:
+                return '200', mods  # 4-chloro-Phenylalanine
+            elif 'Brc1ccc' in content:
+                return '4BF', mods  # 4-Bromo-phenylalanine
+            elif 'C#Nc1ccc' in content:
+                return '4CF', mods  # 4-cyano-phenylalanine
+            elif 'Ic1ccc' in content:
+                return 'PHI', mods  # 4-Iodo-phenylalanine
+            elif 'Fc1ccc' in content:
+                return 'PFF', mods  # 4-Fluoro-phenylalanine
+        # Modified tryptophans
+        if 'c[nH]c2' in content:
+            if 'Oc2cccc2' in content:
+                return '0AF', mods  # 7-hydroxy-tryptophan
+            elif 'Fc2cccc2' in content:
+                return '4FW', mods  # 4-fluoro-tryptophan
+            elif 'Clc2cccc2' in content:
+                return '6CW', mods  # 6-chloro-tryptophan
+            elif 'Brc2cccc2' in content:
+                return 'BTR', mods  # 6-bromo-tryptophan
+            elif 'COc2cccc2' in content:
+                return 'MOT5', mods  # 5-Methoxy-tryptophan
+            elif 'Cc2cccc2' in content:
+                return 'MTR5', mods  # 5-Methyl-tryptophan
+        # Special amino acids
+        if 'CC(C)(C)[C@@H]' in content or 'CC(C)(C)[C@H]' in content:
+            return 'BUG', mods  # Tertleucine
+        if 'CCCNC(=N)N' in content:
+            return 'CIR', mods  # Citrulline
+        if '[SeH]' in content:
+            return 'CSE', mods  # Selenocysteine
+        if '[NH3]CC[C@@H]' in content or '[NH3]CC[C@H]' in content:
+            return 'DAB', mods  # Diaminobutyric acid
+        if 'C1CCCCC1' in content:
+            if 'C1CCCCC1[C@@H]' in content or 'C1CCCCC1[C@H]' in content:
+                return 'CHG', mods  # Cyclohexylglycine
+            elif 'C1CCCCC1C[C@@H]' in content or 'C1CCCCC1C[C@H]' in content:
+                return 'ALC', mods  # 3-cyclohexyl-alanine
+        # Naphthalene derivatives
+        if 'c1cccc2c1cccc2' in content:
+            if 'c1cccc2c1cccc2[C@@H]' in content or 'c1cccc2c1cccc2[C@H]' in content:
+                return 'NAL', mods  # 2-Naphthyl-alanine
+        # Heteroaromatic derivatives
+        if 'c1cncc' in content:
+            return 'PYR4', mods  # 3-(4-Pyridyl)-alanine
+        if 'c1cscc' in content:
+            return 'THA3', mods  # 3-(3-thienyl)-alanine
+        if 'c1nnc' in content:
+            return 'TRZ4', mods  # 3-(1,2,4-Triazol-1-yl)-alanine
+        # Modified serines and threonines
+        if 'OP(O)(O)O' in content:
+            if '[C@@H](COP' in content or '[C@H](COP' in content:
+                return 'SEP', mods  # phosphoserine
+            elif '[C@@H](OP' in content or '[C@H](OP' in content:
+                return 'TPO', mods  # phosphothreonine
+        # Specialized ring systems
+        if 'c1c2ccccc2cc2c1cccc2' in content:
+            return 'ANTH', mods  # 3-(9-anthryl)-alanine
+        if 'c1csc2c1cccc2' in content:
+            return 'BTH3', mods  # 3-(3-benzothienyl)-alanine
+        if '[C@]12C[C@H]3C[C@@H](C2)C[C@@H](C1)C3' in content:
+            return 'ADAM', mods  # Adamanthane
+        # Fluorinated derivatives
+        if 'FC(F)(F)' in content:
+            if 'CC(F)(F)F' in content:
+                return 'FLA', mods  # Trifluoro-alanine
+            if 'C(F)(F)F)c1' in content:
+                if 'c1ccccc1C(F)(F)F' in content:
+                    return 'TFG2', mods  # 2-(Trifluoromethyl)-phenylglycine
+                if 'c1cccc(c1)C(F)(F)F' in content:
+                    return 'TFG3', mods  # 3-(Trifluoromethyl)-phenylglycine
+                if 'c1ccc(cc1)C(F)(F)F' in content:
+                    return 'TFG4', mods  # 4-(Trifluoromethyl)-phenylglycine
+        # Multiple halogen patterns
+        if 'F' in content and 'c1' in content:
+            if 'c1ccc(c(c1)F)F' in content:
+                return 'F2F', mods  # 3,4-Difluoro-phenylalanine
+            if 'cc(F)cc(c1)F' in content:
+                return 'WFP', mods  # 3,5-Difluoro-phenylalanine
+        if 'Cl' in content and 'c1' in content:
+            if 'c1ccc(cc1Cl)Cl' in content:
+                return 'CP24', mods  # 2,4-dichloro-phenylalanine
+            if 'c1ccc(c(c1)Cl)Cl' in content:
+                return 'CP34', mods  # 3,4-dichloro-phenylalanine
+        # Hydroxy and amino derivatives
+        if 'O' in content and 'c1' in content:
+            if 'c1cc(O)cc(c1)O' in content:
+                return '3FG', mods  # (2s)-amino(3,5-dihydroxyphenyl)-ethanoic acid
+            if 'c1ccc(c(c1)O)O' in content:
+                return 'DAH', mods  # 3,4-Dihydroxy-phenylalanine
+        # Cyclic amino acids
+        if 'C1CCCC1' in content:
+            return 'CPA3', mods  # 3-Cyclopentyl-alanine
+        if 'C1CCCCC1' in content:
+            if 'CC1CCCCC1' in content:
+                return 'ALC', mods  # 3-cyclohexyl-alanine
+            else:
+                return 'CHG', mods  # Cyclohexylglycine
+        # Chain-length variants
+        if 'CCC[C@@H]' in content or 'CCC[C@H]' in content:
+            return 'NLE', mods  # Norleucine
+        if 'CC[C@@H]' in content or 'CC[C@H]' in content:
+            if not any(x in content for x in ['CC(C)', 'COC', 'CN(']):
+                return 'ABA', mods  # 2-Aminobutyric acid
+        # Modified histidines
+        if 'c1cnc' in content:
+            if '[C@@H]1CN[C@@H](N1)F' in content:
+                return '2HF', mods  # 2-fluoro-l-histidine
+            if 'c1cnc([nH]1)F' in content:
+                return '2HF1', mods  # 2-fluoro-l-histidine variant
+            if 'c1c[nH]c(n1)F' in content:
+                return '2HF2', mods  # 2-fluoro-l-histidine variant
+        # Sulfur and selenium containing
+        if '[SeH]' in content:
+            return 'CSE', mods  # Selenocysteine
+        if 'S' in content:
+            if 'CSCc1ccccc1' in content:
+                return 'BCS', mods  # benzylcysteine
+            if 'CCSC' in content:
+                return 'ESC', mods  # Ethionine
+            if 'CCS' in content:
+                return 'HCS', mods  # homocysteine
+        # Additional modifications
+        if 'CN=[N]=N' in content:
+            return 'AZDA', mods  # azido-alanine
+        if '[NH]=[C](=[NH2])=[NH2]' in content:
+            if 'CCC[NH]=' in content:
+                return 'AGM', mods  # 5-methyl-arginine
+            if 'CC[NH]=' in content:
+                return 'GDPR', mods  # 2-Amino-3-guanidinopropionic acid
+        if 'CCON' in content:
+            return 'CAN', mods  # canaline
+        if '[C@@H]1C=C[C@@H](C=C1)' in content:
+            return 'ACZ', mods  # cis-amiclenomycin
+        if 'CCC(=O)[NH3]' in content:
+            return 'ONL', mods  # 5-oxo-l-norleucine
+        if 'c1ccncc1' in content:
+            return 'PYR4', mods  # 3-(4-Pyridyl)-alanine
+        if 'c1ccco1' in content:
+            return 'FUA2', mods  # (2-furyl)-alanine
+        if 'c1ccc' in content:
+            if 'c1ccc(cc1)c1ccccc1' in content:
+                return 'BIF', mods  # 4,4-biphenylalanine
+            if 'c1ccc(cc1)C(=O)c1ccccc1' in content:
+                return 'PBF', mods  # 4-benzoyl-phenylalanine
+            if 'c1ccc(cc1)C(C)(C)C' in content:
+                return 'TBP4', mods  # 4-tert-butyl-phenylalanine
+            if 'c1ccc(cc1)[C](=[NH2])=[NH2]' in content:
+                return '0BN', mods  # 4-carbamimidoyl-l-phenylalanine
+            if 'c1cccc(c1)[C](=[NH2])=[NH2]' in content:
+                return 'APM', mods  # m-amidinophenyl-3-alanine
+        # Multiple hydroxy patterns
+        if 'O' in content:
+            if '[C@H]([C@H](C)O)O' in content:
+                return 'ILX', mods  # 4,5-dihydroxy-isoleucine
+            if '[C@H]([C@@H](C)O)O' in content:
+                return 'ALO', mods  # Allo-threonine
+            if '[C@H](COP(O)(O)O)' in content:
+                return 'SEP', mods  # phosphoserine
+            if '[C@H]([C@@H](C)OP(O)(O)O)' in content:
+                return 'TPO', mods  # phosphothreonine
+            if '[C@H](c1ccc(O)cc1)O' in content:
+                return 'OMX', mods  # (betar)-beta-hydroxy-l-tyrosine
+            if '[C@H](c1ccc(c(Cl)c1)O)O' in content:
+                return 'OMY', mods  # (betar)-3-chloro-beta-hydroxy-l-tyrosine
+        # Heterocyclic patterns
+        if 'n1' in content:
+            if 'n1cccn1' in content:
+                return 'PYZ1', mods  # 3-(1-Pyrazolyl)-alanine
+            if 'n1nncn1' in content:
+                return 'TEZA', mods  # 3-(2-Tetrazolyl)-alanine
+            if 'c2c(n1)cccc2' in content:
+                return 'QU32', mods  # 3-(2-Quinolyl)-alanine
+            if 'c1cnc2c(c1)cccc2' in content:
+                return 'QU33', mods  # 3-(3-quinolyl)-alanine
+            if 'c1ccnc2c1cccc2' in content:
+                return 'QU34', mods  # 3-(4-quinolyl)-alanine
+            if 'c1ccc2c(c1)nccc2' in content:
+                return 'QU35', mods  # 3-(5-Quinolyl)-alanine
+            if 'c1ccc2c(c1)cncc2' in content:
+                return 'QU36', mods  # 3-(6-Quinolyl)-alanine
+            if 'c1cnc2c(n1)cccc2' in content:
+                return 'QX32', mods  # 3-(2-quinoxalyl)-alanine
+        # Multiple nitrogen patterns
+        if 'N' in content:
+            if '[NH3]CC[C@@H]' in content:
+                return 'DAB', mods  # Diaminobutyric acid
+            if '[NH3]C[C@@H]' in content:
+                return 'DPP', mods  # 2,3-Diaminopropanoic acid
+            if '[NH3]CCCCCC[C@@H]' in content:
+                return 'HHK', mods  # (2s)-2,8-diaminooctanoic acid
+            if 'CCC[NH]=[C](=[NH2])=[NH2]' in content:
+                return 'GBUT', mods  # 2-Amino-4-guanidinobutryric acid
+            if '[NH]=[C](=S)=[NH2]' in content:
+                return 'THIC', mods  # Thio-citrulline
+        # Chain modified amino acids
+        if 'CC' in content:
+            if 'CCCC[C@@H]' in content:
+                return 'AHP', mods  # 2-Aminoheptanoic acid
+            if 'CCC([C@@H])(C)C' in content:
+                return 'I2M', mods  # 3-methyl-l-alloisoleucine
+            if 'CC[C@H]([C@@H])C' in content:
+                return 'IIL', mods  # Allo-Isoleucine
+            if '[C@H](CCC(C)C)' in content:
+                return 'HLEU', mods  # Homoleucine
+            if '[C@@H]([C@@H](C)O)C' in content:
+                return 'HLU', mods  # beta-hydroxyleucine
+        # Modified glutamate/aspartate patterns
+        if '[C@@H]' in content:
+            if '[C@@H](C[C@@H](F))' in content:
+                return 'FGA4', mods  # 4-Fluoro-glutamic acid
+            if '[C@@H](C[C@@H](O))' in content:
+                return '3GL', mods  # 4-hydroxy-glutamic-acid
+            if '[C@@H](C[C@H](C))' in content:
+                return 'LME', mods  # (3r)-3-methyl-l-glutamic acid
+            if '[C@@H](CC[C@H](C))' in content:
+                return 'MEG', mods  # (3s)-3-methyl-l-glutamic acid
+        # Sulfur and selenium modifications
+        if 'S' in content:
+            if 'SCC[C@@H]' in content:
+                return 'HSER', mods  # homoserine
+            if 'SCCN' in content:
+                return 'SLZ', mods  # thialysine
+            if 'SC(=O)' in content:
+                return 'CSA', mods  # s-acetonylcysteine
+            if '[S@@](=O)' in content:
+                return 'SME', mods  # Methionine sulfoxide
+            if 'S(=O)(=O)' in content:
+                return 'OMT', mods  # Methionine sulfone
+        # Double bond containing
+        if 'C=' in content:
+            if 'C=C[C@@H]' in content:
+                return '2AG', mods  # 2-Allyl-glycine
+            if 'C=C[C@@H]' in content:
+                return 'LVG', mods  # vinylglycine
+            if 'C=Cc1ccccc1' in content:
+                return 'STYA', mods  # Styrylalanine
+        # Special cases
+        if '[C@@H]1Cc2c(C1)cccc2' in content:
+            return 'IGL', mods  # alpha-amino-2-indanacetic acid
+        if '[C](=[C](=O)=O)=O' in content:
+            return '26P', mods  # 2-amino-6-oxopimelic acid
+        if '[C](=[C](=O)=O)=C' in content:
+            return '2NP', mods  # l-2-amino-6-methylene-pimelic acid
+        if 'c2cnc[nH]2' in content:
+            return 'HIS', mods  # histidine core
+        if 'c1cccc2c1cc(O)cc2' in content:
+            return 'NAO1', mods  # 5-hydroxy-1-naphthalene
+        if 'c1ccc2c(c1)cc(O)cc2' in content:
+            return 'NAO2', mods  # 6-hydroxy-2-naphthalene
+        # Proline (P) - flexible ring numbers
+        if any([
+            # Check for any ring number in bond patterns
+            (segment.get('bond_after', '').startswith(f'N{n}C(=O)') and 'CCC' in content and
+            any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
+            for n in '123456789'
+        ]) or any([
+            # Check ending patterns with any ring number
+            (f'CCCN{n}' in content and content.endswith('=O') and
+            any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
+            for n in '123456789'
+        ]) or any([
+            # Handle CCC[C@H]n patterns
+            (content == f'CCC[C@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
+            (content == f'CCC[C@@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
+            # N-terminal Pro with any ring number
+            (f'N{n}CCC[C@H]{n}' in content) or
+            (f'N{n}CCC[C@@H]{n}' in content)
+            for n in '123456789'
+        ]):
+            return 'Pro', mods
+        # Tryptophan (W) - more specific indole pattern
+        if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
+        'c[nH]c' in content.replace(' ', ''):
+            return 'Trp', mods
+        # Lysine (K) - both patterns
+        if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
+            return 'Lys', mods
+        # Arginine (R) - both patterns
+        if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
+            return 'Arg', mods
         if ('C[C@H](CCCC)' in content or 'C[C@@H](CCCC)' in content) and 'CC(C)' not in content:
             return 'Nle', mods
             return 'Leu', mods
         if '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content:
             return 'Leu', mods
         if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content:
             return 'Thr', mods
         if '[C@H](Cc2ccccc2)' in content or '[C@@H](Cc2ccccc2)' in content:
             return 'Phe', mods
+        if ('[C@H](C(C)C)' in content or       # With outer parentheses
+            '[C@@H](C(C)C)' in content or      # With outer parentheses
+            '[C@H]C(C)C' in content or         # Without outer parentheses
+            '[C@@H]C(C)C' in content):         # Without outer parentheses
+            if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]']):  # Still check not Leu
                 return 'Val', mods
         if '[C@H](COC(C)(C)C)' in content or '[C@@H](COC(C)(C)C)' in content:
             return 'O-tBu', mods
+        if any([
+            'CC[C@H](C)' in content,
+            'CC[C@@H](C)' in content,
+            'C(C)C[C@H]' in content and 'CC(C)C' not in content,
+            'C(C)C[C@@H]' in content and 'CC(C)C' not in content
+        ]):
+            return 'Ile', mods
         if ('[C@H](C)' in content or '[C@@H](C)' in content):
+            if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
                 return 'Ala', mods
         # Tyrosine (Tyr) - 4-hydroxybenzyl side chain
+        if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
             return 'Tyr', mods
         # Serine (Ser) - Hydroxymethyl side chain
         if '[C@H](CO)' in content or '[C@@H](CO)' in content:
                 return 'Ser', mods
         # Threonine (Thr) - 1-hydroxyethyl side chain
+        if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content or '[C@@H](C)O' in content or '[C@H](C)O' in content:
             return 'Thr', mods
         # Cysteine (Cys) - Thiol side chain
         if ('CCC(=O)O' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
             return 'Glu', mods
         # Arginine (Arg) - 3-guanidinopropyl side chain
         if ('CCCNC(=N)N' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
             return 'Arg', mods
         return mods
     def analyze_structure(self, smiles):
+        """Main analysis function with debug output"""
         print("\nAnalyzing structure:", smiles)
         # Split into segments
                 print(f"Warning: Could not identify residue in segment: {segment['content']}")
         # Check if cyclic
+        is_cyclic, peptide_cycles, aromatic_cycles = self.is_cyclic(smiles)
+        three_letter = '-'.join(sequence)
+        one_letter = ''.join(self.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence)
+        if is_cyclic:
+            three_letter = f"cyclo({three_letter})"
+            one_letter = f"cyclo({one_letter})"
+        print(f"\nFinal sequence: {three_letter}")
+        print(f"One-letter code: {one_letter}")
+        print(f"Is cyclic: {is_cyclic}")
+        #print(f"Peptide cycles: {peptide_cycles}")
+        #print(f"Aromatic cycles: {aromatic_cycles}")
+        return {
+            'three_letter': three_letter,
+            'one_letter': one_letter,
+            'is_cyclic': is_cyclic
+        }
 """
 def annotate_cyclic_structure(mol, sequence):
     plt.tight_layout()
     return fig
+class PeptideStructureGenerator:
+    """A class to generate 3D structures of peptides using different embedding methods"""
+    @staticmethod
+    def prepare_molecule(smiles):
+        """Prepare molecule with proper hydrogen handling"""
+        mol = Chem.MolFromSmiles(smiles, sanitize=False)
+        if mol is None:
+            raise ValueError("Failed to create molecule from SMILES")
+        # Calculate valence for each atom
+        for atom in mol.GetAtoms():
+            atom.UpdatePropertyCache(strict=False)
+        # Sanitize with reduced requirements
+        Chem.SanitizeMol(mol,
+                        sanitizeOps=Chem.SANITIZE_FINDRADICALS|
+                                  Chem.SANITIZE_KEKULIZE|
+                                  Chem.SANITIZE_SETAROMATICITY|
+                                  Chem.SANITIZE_SETCONJUGATION|
+                                  Chem.SANITIZE_SETHYBRIDIZATION|
+                                  Chem.SANITIZE_CLEANUPCHIRALITY)
+        mol = Chem.AddHs(mol)
+        return mol
+    @staticmethod
+    def get_etkdg_params(attempt=0):
+        """Get ETKDG parameters with optional modifications based on attempt number"""
+        params = AllChem.ETKDGv3()
+        params.randomSeed = -1
+        params.maxIterations = 200
+        params.numThreads = 4  # Reduced for web interface
+        params.useBasicKnowledge = True
+        params.enforceChirality = True
+        params.useExpTorsionAnglePrefs = True
+        params.useSmallRingTorsions = True
+        params.useMacrocycleTorsions = True
+        params.ETversion = 2
+        params.pruneRmsThresh = -1
+        params.embedRmsThresh = 0.5
+        if attempt > 10:
+            params.bondLength = 1.5 + (attempt - 10) * 0.02
+            params.useExpTorsionAnglePrefs = False
+        return params
+    def generate_structure_etkdg(self, smiles, max_attempts=20):
+        """Generate a 3D structure using ETKDG without UFF optimization.
+
+        Each attempt rebuilds the molecule with ``prepare_molecule`` and embeds
+        it with the parameters from ``get_etkdg_params(attempt)``. The first
+        attempt for which ``AllChem.EmbedMolecule`` returns 0 wins.
+
+        Args:
+            smiles: SMILES string passed to ``prepare_molecule``.
+            max_attempts: Maximum number of embedding attempts.
+
+        Returns:
+            The molecule from the first successful attempt.
+
+        Raises:
+            ValueError: If no attempt succeeded. The last exception caught
+                during the attempts (if any) is chained as ``__cause__`` so
+                the underlying failure is not lost.
+        """
+        last_error = None
+        for attempt in range(max_attempts):
+            try:
+                mol = self.prepare_molecule(smiles)
+                params = self.get_etkdg_params(attempt)
+                if AllChem.EmbedMolecule(mol, params) == 0:
+                    return mol
+            except Exception as exc:
+                # Deliberate best-effort retry: remember the failure and let
+                # the next attempt try a different parameter set.
+                last_error = exc
+        raise ValueError("Failed to generate structure with ETKDG") from last_error
+    def generate_structure_uff(self, smiles, max_attempts=20):
+        """Generate a 3D structure using ETKDG followed by UFF optimization.
+
+        Every attempt is evaluated (there is no early exit); among the
+        attempts that embed and whose UFF optimization returns 0, the one
+        with the lowest force-field energy is kept.
+
+        Args:
+            smiles: SMILES string passed to ``prepare_molecule``.
+            max_attempts: Number of embed-and-optimize attempts to run.
+
+        Returns:
+            A copy of the lowest-energy optimized molecule.
+
+        Raises:
+            ValueError: If no attempt produced an optimized structure. The
+                last exception caught during the attempts (if any) is
+                chained as ``__cause__``.
+        """
+        best_mol = None
+        lowest_energy = float('inf')
+        last_error = None
+        for attempt in range(max_attempts):
+            try:
+                test_mol = self.prepare_molecule(smiles)
+                params = self.get_etkdg_params(attempt)
+                if AllChem.EmbedMolecule(test_mol, params) != 0:
+                    continue
+                res = AllChem.UFFOptimizeMolecule(test_mol, maxIters=2000,
+                                                 vdwThresh=10.0, confId=0,
+                                                 ignoreInterfragInteractions=True)
+                # Only results for which the optimizer returned 0 are ranked.
+                if res != 0:
+                    continue
+                ff = AllChem.UFFGetMoleculeForceField(test_mol)
+                if not ff:
+                    continue
+                current_energy = ff.CalcEnergy()
+                if current_energy < lowest_energy:
+                    lowest_energy = current_energy
+                    best_mol = Chem.Mol(test_mol)
+            except Exception as exc:
+                # Deliberate best-effort retry: remember the failure and move
+                # on to the next attempt.
+                last_error = exc
+        if best_mol is None:
+            raise ValueError("Failed to generate optimized structure") from last_error
+        return best_mol
+    @staticmethod
+    def mol_to_sdf_bytes(mol):
+        """Serialize an RDKit molecule to SDF and return the content as bytes.
+
+        Args:
+            mol: The molecule handed to ``Chem.SDWriter.write``.
+
+        Returns:
+            The SDF text produced by the writer, encoded as UTF-8 bytes.
+        """
+        # SDWriter emits text, so it is given a text-mode buffer (StringIO,
+        # as in RDKit's documented usage) rather than BytesIO; the text is
+        # encoded once at the end. Imported locally to keep this block
+        # self-contained.
+        from io import StringIO
+        sio = StringIO()
+        writer = Chem.SDWriter(sio)
+        try:
+            writer.write(mol)
+        finally:
+            # Always close the writer, even if write() raises.
+            writer.close()
+        return sio.getvalue().encode('utf-8')
+def process_input(smiles_input=None, file_obj=None, show_linear=False,
+                 show_segment_details=False, generate_3d=False, use_uff=False):
+    """Process input and create visualizations using PeptideAnalyzer.
+
+    Args:
+        smiles_input: SMILES string entered directly; handled when truthy.
+        file_obj: NOTE(review): usage is outside the visible hunk - confirm.
+        show_linear: NOTE(review): usage is outside the visible hunk - confirm.
+        show_segment_details: NOTE(review): usage is outside the visible
+            hunk - confirm.
+        generate_3d: When true, generate an ETKDG structure and add it to
+            the returned structure files as SDF bytes.
+        use_uff: When true (and ``generate_3d`` is set), also add a
+            UFF-optimized structure.
+
+    Returns:
+        A 4-tuple ``(text, cyclic_image, linear_image, structure_files)``.
+        Error paths return the message followed by ``None, None, []`` so
+        that every return carries the same number of values.
+    """
     analyzer = PeptideAnalyzer()
+    # Collected as (filename, sdf_bytes) pairs.
+    # NOTE(review): confirm the file output component accepts this shape.
+    structure_files = []
     # Handle direct SMILES input
     if smiles_input:
         # First check if it's a peptide using analyzer's method
         if not analyzer.is_peptide(smiles):
+            return "Error: Input SMILES does not appear to be a peptide structure.", None, None, []
         try:
             # Create molecule
             mol = Chem.MolFromSmiles(smiles)
             if mol is None:
+                return "Error: Invalid SMILES notation.", None, None, []
+            # Generate 3D structures if requested
+            if generate_3d:
+                generator = PeptideStructureGenerator()
+                try:
+                    # Generate ETKDG structure
+                    mol_etkdg = generator.generate_structure_etkdg(smiles)
+                    etkdg_bytes = generator.mol_to_sdf_bytes(mol_etkdg)
+                    structure_files.append(("structure_etkdg.sdf", etkdg_bytes))
+                    # Generate UFF structure if requested
+                    if use_uff:
+                        mol_uff = generator.generate_structure_uff(smiles)
+                        uff_bytes = generator.mol_to_sdf_bytes(mol_uff)
+                        structure_files.append(("structure_uff.sdf", uff_bytes))
+                except Exception as e:
+                    return f"Error generating 3D structures: {str(e)}", None, None, []
             # Use analyzer to get sequence
             segments = analyzer.split_on_bonds(smiles)
                 summary += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
                 #summary += f"Aromatic Cycles: {', '.join(aromatic_cycles)}\n"
+            return summary + output_text, img_cyclic, img_linear, structure_files
         except Exception as e:
+            return f"Error processing SMILES: {str(e)}", None, None, []
         gr.Checkbox(
             label="Show segment details",
             value=False
+        ),
+        gr.Checkbox(
+            label="Generate 3D structure (sdf file format)",
+            value=False
+        ),
+        gr.Checkbox(
+            label="Use UFF optimization",
+            value=False
         )
     ],
     outputs=[
     3. Parses the amino acid sequence
     4. Creates 2D structure visualization with residue annotations
     5. Optional linear representation
+    6. Optional 3D structure generation (ETKDG and UFF methods)
     Input: Either enter a SMILES string directly or upload a text file containing SMILES strings