yinuozhang committed on
Commit
d7b6536
·
1 Parent(s): 84bbd6a

add uaa and sdf

Browse files
Files changed (1) hide show
  1. app.py +546 -71
app.py CHANGED
@@ -19,11 +19,19 @@ class PeptideAnalyzer:
19
  self.bond_patterns = [
20
  (r'OC\(=O\)', 'ester'), # Ester bond
21
  (r'N\(C\)C\(=O\)', 'n_methyl'), # N-methylated peptide bond
22
- (r'N[12]C\(=O\)', 'proline'), # Proline peptide bond
23
  (r'NC\(=O\)', 'peptide'), # Standard peptide bond
24
  (r'C\(=O\)N\(C\)', 'n_methyl_reverse'), # Reverse N-methylated
25
  (r'C\(=O\)N[12]?', 'peptide_reverse') # Reverse peptide bond
26
  ]
 
 
 
 
 
 
 
 
27
 
28
  def is_peptide(self, smiles):
29
  """Check if the SMILES represents a peptide structure"""
@@ -44,47 +52,25 @@ class PeptideAnalyzer:
44
  return False
45
 
46
  def is_cyclic(self, smiles):
47
- """
48
- Determine if SMILES represents a cyclic peptide by checking head-tail connection.
49
- Returns: (is_cyclic, peptide_cycles, aromatic_cycles)
50
- """
51
- # First find aromatic rings
52
- aromatic_cycles = []
53
- for match in re.finditer(r'c[12]ccccc[12]', smiles):
54
- number = match.group(0)[1]
55
- if number not in aromatic_cycles:
56
- aromatic_cycles.append(str(number))
57
-
58
- # Find potential cycle numbers and their contexts
59
- cycle_closures = []
60
-
61
- # Look for cycle starts and corresponding ends
62
- cycle_patterns = [
63
- # Pattern pairs (start, end)
64
- (r'[^\d](\d)[A-Z@]', r'C\1=O$'), # Classic C=O ending
65
- (r'[^\d](\d)[A-Z@]', r'N\1C\(=O\)'), # N1C(=O) pattern
66
- (r'[^\d](\d)[A-Z@]', r'N\1C$'), # Simple N1C ending
67
- (r'[^\d](\d)C\(=O\)', r'N\1[A-Z]'), # Reverse connection
68
- (r'H(\d)', r'N\1C'), # H1...N1C pattern
69
- (r'[^\d](\d)(?:C|N|O)', r'(?:C|N)\1(?:\(|$)'), # Generic cycle closure
70
- ]
71
-
72
- for start_pat, end_pat in cycle_patterns:
73
- start_matches = re.finditer(start_pat, smiles)
74
- for start_match in start_matches:
75
- number = start_match.group(1)
76
- if number not in aromatic_cycles: # Skip aromatic ring numbers
77
- # Look for corresponding end pattern
78
- end_match = re.search(end_pat.replace('\\1', number), smiles)
79
- if end_match and end_match.start() > start_match.start():
80
- cycle_closures.append(number)
81
- break
82
 
83
- # Remove duplicates and aromatic numbers
84
- peptide_cycles = list(set(cycle_closures) - set(aromatic_cycles))
 
 
 
 
85
 
86
- is_cyclic = len(peptide_cycles) > 0
 
87
 
 
88
  return is_cyclic, peptide_cycles, aromatic_cycles
89
 
90
  def split_on_bonds(self, smiles):
@@ -158,17 +144,353 @@ class PeptideAnalyzer:
158
 
159
  return segments
160
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  def identify_residue(self, segment):
162
  """Identify residue with Pro reconstruction"""
163
- content = segment['content']
 
164
  mods = self.get_modifications(segment)
165
 
166
- # Special handling for Pro: reconstruct the complete pattern
167
- if (segment.get('bond_after') == 'N2C(=O)' and 'CCC' in content) or \
168
- ('CCCN2' in content and content.endswith('=O')): # End case
169
- # Reconstruct the complete Pro pattern
170
- if '[C@@H]2' in content or '[C@H]2' in content:
171
- return 'Pro', mods
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
  if ('C[C@H](CCCC)' in content or 'C[C@@H](CCCC)' in content) and 'CC(C)' not in content:
174
  return 'Nle', mods
@@ -215,34 +537,39 @@ class PeptideAnalyzer:
215
  return 'Leu', mods
216
  if '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content:
217
  return 'Leu', mods
218
-
219
- if ('C(C)C[C@H]' in content or 'C(C)C[C@@H]' in content) and 'CC(C)C' not in content:
220
- return 'Ile', mods
221
-
222
  if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content:
223
  return 'Thr', mods
224
 
225
  if '[C@H](Cc2ccccc2)' in content or '[C@@H](Cc2ccccc2)' in content:
226
  return 'Phe', mods
227
 
228
- if '[C@H](C(C)C)' in content or '[C@@H](C(C)C)' in content:
229
- if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]']):
 
 
 
230
  return 'Val', mods
231
 
232
  if '[C@H](COC(C)(C)C)' in content or '[C@@H](COC(C)(C)C)' in content:
233
  return 'O-tBu', mods
234
 
 
 
 
 
 
 
 
 
235
  if ('[C@H](C)' in content or '[C@@H](C)' in content):
236
- if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O']):
237
  return 'Ala', mods
238
 
239
  # Tyrosine (Tyr) - 4-hydroxybenzyl side chain
240
- if ('Cc2ccc(O)cc2' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
241
  return 'Tyr', mods
242
-
243
- # Tryptophan (Trp) - Indole side chain
244
- if ('Cc2c[nH]c3ccccc23' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
245
- return 'Trp', mods
246
 
247
  # Serine (Ser) - Hydroxymethyl side chain
248
  if '[C@H](CO)' in content or '[C@@H](CO)' in content:
@@ -250,7 +577,7 @@ class PeptideAnalyzer:
250
  return 'Ser', mods
251
 
252
  # Threonine (Thr) - 1-hydroxyethyl side chain
253
- if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content:
254
  return 'Thr', mods
255
 
256
  # Cysteine (Cys) - Thiol side chain
@@ -277,10 +604,6 @@ class PeptideAnalyzer:
277
  if ('CCC(=O)O' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
278
  return 'Glu', mods
279
 
280
- # Lysine (Lys) - 4-aminobutyl side chain
281
- if ('C[C@H](CCCCN)' in content or 'C[C@@H](CCCCN)' in content):
282
- return 'Lys', mods
283
-
284
  # Arginine (Arg) - 3-guanidinopropyl side chain
285
  if ('CCCNC(=N)N' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
286
  return 'Arg', mods
@@ -302,7 +625,7 @@ class PeptideAnalyzer:
302
  return mods
303
 
304
  def analyze_structure(self, smiles):
305
- """Main analysis function"""
306
  print("\nAnalyzing structure:", smiles)
307
 
308
  # Split into segments
@@ -328,11 +651,25 @@ class PeptideAnalyzer:
328
  print(f"Warning: Could not identify residue in segment: {segment['content']}")
329
 
330
  # Check if cyclic
331
- is_cyclic = 'N1' in smiles or 'N2' in smiles
332
- final_sequence = f"cyclo({'-'.join(sequence)})" if is_cyclic else '-'.join(sequence)
 
 
 
 
 
333
 
334
- print(f"\nFinal sequence: {final_sequence}")
335
- return final_sequence
 
 
 
 
 
 
 
 
 
336
 
337
  """
338
  def annotate_cyclic_structure(mol, sequence):
@@ -586,9 +923,119 @@ def create_enhanced_linear_viz(sequence, smiles):
586
  plt.tight_layout()
587
  return fig
588
 
589
- def process_input(smiles_input=None, file_obj=None, show_linear=False, show_segment_details=False):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
590
  """Process input and create visualizations using PeptideAnalyzer"""
591
  analyzer = PeptideAnalyzer()
 
592
 
593
  # Handle direct SMILES input
594
  if smiles_input:
@@ -597,13 +1044,32 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False, show_segm
597
  # First check if it's a peptide using analyzer's method
598
  if not analyzer.is_peptide(smiles):
599
  return "Error: Input SMILES does not appear to be a peptide structure.", None, None
600
-
601
  try:
602
  # Create molecule
603
  mol = Chem.MolFromSmiles(smiles)
604
  if mol is None:
605
  return "Error: Invalid SMILES notation.", None, None
606
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
607
  # Use analyzer to get sequence
608
  segments = analyzer.split_on_bonds(smiles)
609
 
@@ -666,7 +1132,7 @@ def process_input(smiles_input=None, file_obj=None, show_linear=False, show_segm
666
  summary += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
667
  #summary += f"Aromatic Cycles: {', '.join(aromatic_cycles)}\n"
668
 
669
- return summary + output_text, img_cyclic, img_linear
670
 
671
  except Exception as e:
672
  return f"Error processing SMILES: {str(e)}", None, None
@@ -757,6 +1223,14 @@ iface = gr.Interface(
757
  gr.Checkbox(
758
  label="Show segment details",
759
  value=False
 
 
 
 
 
 
 
 
760
  )
761
  ],
762
  outputs=[
@@ -781,6 +1255,7 @@ iface = gr.Interface(
781
  3. Parses the amino acid sequence
782
  4. Creates 2D structure visualization with residue annotations
783
  5. Optional linear representation
 
784
 
785
  Input: Either enter a SMILES string directly or upload a text file containing SMILES strings
786
 
 
19
  self.bond_patterns = [
20
  (r'OC\(=O\)', 'ester'), # Ester bond
21
  (r'N\(C\)C\(=O\)', 'n_methyl'), # N-methylated peptide bond
22
+ (r'N[0-9]C\(=O\)', 'proline'), # Proline peptide bond
23
  (r'NC\(=O\)', 'peptide'), # Standard peptide bond
24
  (r'C\(=O\)N\(C\)', 'n_methyl_reverse'), # Reverse N-methylated
25
  (r'C\(=O\)N[12]?', 'peptide_reverse') # Reverse peptide bond
26
  ]
27
+ # Three to one letter code mapping
28
+ self.three_to_one = {
29
+ 'Ala': 'A', 'Cys': 'C', 'Asp': 'D', 'Glu': 'E',
30
+ 'Phe': 'F', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
31
+ 'Lys': 'K', 'Leu': 'L', 'Met': 'M', 'Asn': 'N',
32
+ 'Pro': 'P', 'Gln': 'Q', 'Arg': 'R', 'Ser': 'S',
33
+ 'Thr': 'T', 'Val': 'V', 'Trp': 'W', 'Tyr': 'Y'
34
+ }
35
 
36
  def is_peptide(self, smiles):
37
  """Check if the SMILES represents a peptide structure"""
 
52
  return False
53
 
54
  def is_cyclic(self, smiles):
55
+ """Improved cyclic peptide detection"""
56
+ # Check for C-terminal carboxyl
57
+ if smiles.endswith('C(=O)O'):
58
+ return False, [], []
59
+
60
+ # Find all numbers used in ring closures
61
+ ring_numbers = re.findall(r'(?:^|[^c])[0-9](?=[A-Z@\(\)])', smiles)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
+ # Find aromatic ring numbers
64
+ aromatic_matches = re.findall(r'c[0-9](?:ccccc|c\[nH\]c)[0-9]', smiles)
65
+ aromatic_cycles = []
66
+ for match in aromatic_matches:
67
+ numbers = re.findall(r'[0-9]', match)
68
+ aromatic_cycles.extend(numbers)
69
 
70
+ # Numbers that aren't part of aromatic rings are peptide cycles
71
+ peptide_cycles = [n for n in ring_numbers if n not in aromatic_cycles]
72
 
73
+ is_cyclic = len(peptide_cycles) > 0 and not smiles.endswith('C(=O)O')
74
  return is_cyclic, peptide_cycles, aromatic_cycles
75
 
76
  def split_on_bonds(self, smiles):
 
144
 
145
  return segments
146
 
147
def clean_terminal_carboxyl(self, segment):
    """Strip a branched C-terminal carboxyl group from the last segment.

    Cleaning only happens when the segment content contains ``C(=O)O`` and
    the segment has no (truthy) ``bond_after`` entry, i.e. nothing follows
    it.  In that case every parenthesised ``(C(=O)O)`` branch is removed,
    followed by any literal ``()`` left in the string.  A carboxyl written
    without surrounding parentheses is left untouched.

    Args:
        segment: Mapping with a ``'content'`` SMILES fragment and an
            optional ``'bond_after'`` entry.

    Returns:
        The cleaned content string, or the original content when the
        segment does not qualify for cleaning.
    """
    content = segment['content']

    # Guard clause: not a terminal segment, or no carboxyl present at all.
    if 'C(=O)O' not in content or segment.get('bond_after'):
        return content

    # Plain substring replacement is equivalent to the literal regexes used
    # previously; the stray debug print() calls have been dropped so that
    # identification no longer writes to stdout.
    without_carboxyl = content.replace('(C(=O)O)', '')
    return without_carboxyl.replace('()', '')
164
+
165
  def identify_residue(self, segment):
166
  """Identify residue with Pro reconstruction"""
167
+ # Only clean terminal carboxyl if this is the last segment
168
+ content = self.clean_terminal_carboxyl(segment)
169
  mods = self.get_modifications(segment)
170
 
171
+ # UAA pattern matching section - before regular residues
172
+ # Phenylglycine and derivatives
173
+ if 'c1ccccc1' in content:
174
+ if '[C@@H](c1ccccc1)' in content or '[C@H](c1ccccc1)' in content:
175
+ return '4', mods # Base phenylglycine
176
+
177
+ # 4-substituted phenylalanines
178
+ if 'Cc1ccc' in content:
179
+ if 'OMe' in content or 'OCc1ccc' in content:
180
+ return '0A1', mods # 4-methoxy-Phenylalanine
181
+ elif 'Clc1ccc' in content:
182
+ return '200', mods # 4-chloro-Phenylalanine
183
+ elif 'Brc1ccc' in content:
184
+ return '4BF', mods # 4-Bromo-phenylalanine
185
+ elif 'C#Nc1ccc' in content:
186
+ return '4CF', mods # 4-cyano-phenylalanine
187
+ elif 'Ic1ccc' in content:
188
+ return 'PHI', mods # 4-Iodo-phenylalanine
189
+ elif 'Fc1ccc' in content:
190
+ return 'PFF', mods # 4-Fluoro-phenylalanine
191
+
192
+ # Modified tryptophans
193
+ if 'c[nH]c2' in content:
194
+ if 'Oc2cccc2' in content:
195
+ return '0AF', mods # 7-hydroxy-tryptophan
196
+ elif 'Fc2cccc2' in content:
197
+ return '4FW', mods # 4-fluoro-tryptophan
198
+ elif 'Clc2cccc2' in content:
199
+ return '6CW', mods # 6-chloro-tryptophan
200
+ elif 'Brc2cccc2' in content:
201
+ return 'BTR', mods # 6-bromo-tryptophan
202
+ elif 'COc2cccc2' in content:
203
+ return 'MOT5', mods # 5-Methoxy-tryptophan
204
+ elif 'Cc2cccc2' in content:
205
+ return 'MTR5', mods # 5-Methyl-tryptophan
206
+
207
+ # Special amino acids
208
+ if 'CC(C)(C)[C@@H]' in content or 'CC(C)(C)[C@H]' in content:
209
+ return 'BUG', mods # Tertleucine
210
+
211
+ if 'CCCNC(=N)N' in content:
212
+ return 'CIR', mods # Citrulline
213
+
214
+ if '[SeH]' in content:
215
+ return 'CSE', mods # Selenocysteine
216
+
217
+ if '[NH3]CC[C@@H]' in content or '[NH3]CC[C@H]' in content:
218
+ return 'DAB', mods # Diaminobutyric acid
219
+
220
+ if 'C1CCCCC1' in content:
221
+ if 'C1CCCCC1[C@@H]' in content or 'C1CCCCC1[C@H]' in content:
222
+ return 'CHG', mods # Cyclohexylglycine
223
+ elif 'C1CCCCC1C[C@@H]' in content or 'C1CCCCC1C[C@H]' in content:
224
+ return 'ALC', mods # 3-cyclohexyl-alanine
225
+
226
+ # Naphthalene derivatives
227
+ if 'c1cccc2c1cccc2' in content:
228
+ if 'c1cccc2c1cccc2[C@@H]' in content or 'c1cccc2c1cccc2[C@H]' in content:
229
+ return 'NAL', mods # 2-Naphthyl-alanine
230
+
231
+ # Heteroaromatic derivatives
232
+ if 'c1cncc' in content:
233
+ return 'PYR4', mods # 3-(4-Pyridyl)-alanine
234
+ if 'c1cscc' in content:
235
+ return 'THA3', mods # 3-(3-thienyl)-alanine
236
+ if 'c1nnc' in content:
237
+ return 'TRZ4', mods # 3-(1,2,4-Triazol-1-yl)-alanine
238
+
239
+ # Modified serines and threonines
240
+ if 'OP(O)(O)O' in content:
241
+ if '[C@@H](COP' in content or '[C@H](COP' in content:
242
+ return 'SEP', mods # phosphoserine
243
+ elif '[C@@H](OP' in content or '[C@H](OP' in content:
244
+ return 'TPO', mods # phosphothreonine
245
+
246
+ # Specialized ring systems
247
+ if 'c1c2ccccc2cc2c1cccc2' in content:
248
+ return 'ANTH', mods # 3-(9-anthryl)-alanine
249
+ if 'c1csc2c1cccc2' in content:
250
+ return 'BTH3', mods # 3-(3-benzothienyl)-alanine
251
+ if '[C@]12C[C@H]3C[C@@H](C2)C[C@@H](C1)C3' in content:
252
+ return 'ADAM', mods # Adamanthane
253
+
254
+ # Fluorinated derivatives
255
+ if 'FC(F)(F)' in content:
256
+ if 'CC(F)(F)F' in content:
257
+ return 'FLA', mods # Trifluoro-alanine
258
+ if 'C(F)(F)F)c1' in content:
259
+ if 'c1ccccc1C(F)(F)F' in content:
260
+ return 'TFG2', mods # 2-(Trifluoromethyl)-phenylglycine
261
+ if 'c1cccc(c1)C(F)(F)F' in content:
262
+ return 'TFG3', mods # 3-(Trifluoromethyl)-phenylglycine
263
+ if 'c1ccc(cc1)C(F)(F)F' in content:
264
+ return 'TFG4', mods # 4-(Trifluoromethyl)-phenylglycine
265
+
266
+ # Multiple halogen patterns
267
+ if 'F' in content and 'c1' in content:
268
+ if 'c1ccc(c(c1)F)F' in content:
269
+ return 'F2F', mods # 3,4-Difluoro-phenylalanine
270
+ if 'cc(F)cc(c1)F' in content:
271
+ return 'WFP', mods # 3,5-Difluoro-phenylalanine
272
+ if 'Cl' in content and 'c1' in content:
273
+ if 'c1ccc(cc1Cl)Cl' in content:
274
+ return 'CP24', mods # 2,4-dichloro-phenylalanine
275
+ if 'c1ccc(c(c1)Cl)Cl' in content:
276
+ return 'CP34', mods # 3,4-dichloro-phenylalanine
277
+
278
+ # Hydroxy and amino derivatives
279
+ if 'O' in content and 'c1' in content:
280
+ if 'c1cc(O)cc(c1)O' in content:
281
+ return '3FG', mods # (2s)-amino(3,5-dihydroxyphenyl)-ethanoic acid
282
+ if 'c1ccc(c(c1)O)O' in content:
283
+ return 'DAH', mods # 3,4-Dihydroxy-phenylalanine
284
+
285
+ # Cyclic amino acids
286
+ if 'C1CCCC1' in content:
287
+ return 'CPA3', mods # 3-Cyclopentyl-alanine
288
+ if 'C1CCCCC1' in content:
289
+ if 'CC1CCCCC1' in content:
290
+ return 'ALC', mods # 3-cyclohexyl-alanine
291
+ else:
292
+ return 'CHG', mods # Cyclohexylglycine
293
+
294
+ # Chain-length variants
295
+ if 'CCC[C@@H]' in content or 'CCC[C@H]' in content:
296
+ return 'NLE', mods # Norleucine
297
+ if 'CC[C@@H]' in content or 'CC[C@H]' in content:
298
+ if not any(x in content for x in ['CC(C)', 'COC', 'CN(']):
299
+ return 'ABA', mods # 2-Aminobutyric acid
300
+
301
+ # Modified histidines
302
+ if 'c1cnc' in content:
303
+ if '[C@@H]1CN[C@@H](N1)F' in content:
304
+ return '2HF', mods # 2-fluoro-l-histidine
305
+ if 'c1cnc([nH]1)F' in content:
306
+ return '2HF1', mods # 2-fluoro-l-histidine variant
307
+ if 'c1c[nH]c(n1)F' in content:
308
+ return '2HF2', mods # 2-fluoro-l-histidine variant
309
+
310
+ # Sulfur and selenium containing
311
+ if '[SeH]' in content:
312
+ return 'CSE', mods # Selenocysteine
313
+ if 'S' in content:
314
+ if 'CSCc1ccccc1' in content:
315
+ return 'BCS', mods # benzylcysteine
316
+ if 'CCSC' in content:
317
+ return 'ESC', mods # Ethionine
318
+ if 'CCS' in content:
319
+ return 'HCS', mods # homocysteine
320
+
321
+ # Additional modifications
322
+ if 'CN=[N]=N' in content:
323
+ return 'AZDA', mods # azido-alanine
324
+ if '[NH]=[C](=[NH2])=[NH2]' in content:
325
+ if 'CCC[NH]=' in content:
326
+ return 'AGM', mods # 5-methyl-arginine
327
+ if 'CC[NH]=' in content:
328
+ return 'GDPR', mods # 2-Amino-3-guanidinopropionic acid
329
+
330
+ if 'CCON' in content:
331
+ return 'CAN', mods # canaline
332
+ if '[C@@H]1C=C[C@@H](C=C1)' in content:
333
+ return 'ACZ', mods # cis-amiclenomycin
334
+ if 'CCC(=O)[NH3]' in content:
335
+ return 'ONL', mods # 5-oxo-l-norleucine
336
+ if 'c1ccncc1' in content:
337
+ return 'PYR4', mods # 3-(4-Pyridyl)-alanine
338
+ if 'c1ccco1' in content:
339
+ return 'FUA2', mods # (2-furyl)-alanine
340
+
341
+ if 'c1ccc' in content:
342
+ if 'c1ccc(cc1)c1ccccc1' in content:
343
+ return 'BIF', mods # 4,4-biphenylalanine
344
+ if 'c1ccc(cc1)C(=O)c1ccccc1' in content:
345
+ return 'PBF', mods # 4-benzoyl-phenylalanine
346
+ if 'c1ccc(cc1)C(C)(C)C' in content:
347
+ return 'TBP4', mods # 4-tert-butyl-phenylalanine
348
+ if 'c1ccc(cc1)[C](=[NH2])=[NH2]' in content:
349
+ return '0BN', mods # 4-carbamimidoyl-l-phenylalanine
350
+ if 'c1cccc(c1)[C](=[NH2])=[NH2]' in content:
351
+ return 'APM', mods # m-amidinophenyl-3-alanine
352
+
353
+ # Multiple hydroxy patterns
354
+ if 'O' in content:
355
+ if '[C@H]([C@H](C)O)O' in content:
356
+ return 'ILX', mods # 4,5-dihydroxy-isoleucine
357
+ if '[C@H]([C@@H](C)O)O' in content:
358
+ return 'ALO', mods # Allo-threonine
359
+ if '[C@H](COP(O)(O)O)' in content:
360
+ return 'SEP', mods # phosphoserine
361
+ if '[C@H]([C@@H](C)OP(O)(O)O)' in content:
362
+ return 'TPO', mods # phosphothreonine
363
+ if '[C@H](c1ccc(O)cc1)O' in content:
364
+ return 'OMX', mods # (betar)-beta-hydroxy-l-tyrosine
365
+ if '[C@H](c1ccc(c(Cl)c1)O)O' in content:
366
+ return 'OMY', mods # (betar)-3-chloro-beta-hydroxy-l-tyrosine
367
+
368
+ # Heterocyclic patterns
369
+ if 'n1' in content:
370
+ if 'n1cccn1' in content:
371
+ return 'PYZ1', mods # 3-(1-Pyrazolyl)-alanine
372
+ if 'n1nncn1' in content:
373
+ return 'TEZA', mods # 3-(2-Tetrazolyl)-alanine
374
+ if 'c2c(n1)cccc2' in content:
375
+ return 'QU32', mods # 3-(2-Quinolyl)-alanine
376
+ if 'c1cnc2c(c1)cccc2' in content:
377
+ return 'QU33', mods # 3-(3-quinolyl)-alanine
378
+ if 'c1ccnc2c1cccc2' in content:
379
+ return 'QU34', mods # 3-(4-quinolyl)-alanine
380
+ if 'c1ccc2c(c1)nccc2' in content:
381
+ return 'QU35', mods # 3-(5-Quinolyl)-alanine
382
+ if 'c1ccc2c(c1)cncc2' in content:
383
+ return 'QU36', mods # 3-(6-Quinolyl)-alanine
384
+ if 'c1cnc2c(n1)cccc2' in content:
385
+ return 'QX32', mods # 3-(2-quinoxalyl)-alanine
386
+
387
+ # Multiple nitrogen patterns
388
+ if 'N' in content:
389
+ if '[NH3]CC[C@@H]' in content:
390
+ return 'DAB', mods # Diaminobutyric acid
391
+ if '[NH3]C[C@@H]' in content:
392
+ return 'DPP', mods # 2,3-Diaminopropanoic acid
393
+ if '[NH3]CCCCCC[C@@H]' in content:
394
+ return 'HHK', mods # (2s)-2,8-diaminooctanoic acid
395
+ if 'CCC[NH]=[C](=[NH2])=[NH2]' in content:
396
+ return 'GBUT', mods # 2-Amino-4-guanidinobutryric acid
397
+ if '[NH]=[C](=S)=[NH2]' in content:
398
+ return 'THIC', mods # Thio-citrulline
399
+
400
+ # Chain modified amino acids
401
+ if 'CC' in content:
402
+ if 'CCCC[C@@H]' in content:
403
+ return 'AHP', mods # 2-Aminoheptanoic acid
404
+ if 'CCC([C@@H])(C)C' in content:
405
+ return 'I2M', mods # 3-methyl-l-alloisoleucine
406
+ if 'CC[C@H]([C@@H])C' in content:
407
+ return 'IIL', mods # Allo-Isoleucine
408
+ if '[C@H](CCC(C)C)' in content:
409
+ return 'HLEU', mods # Homoleucine
410
+ if '[C@@H]([C@@H](C)O)C' in content:
411
+ return 'HLU', mods # beta-hydroxyleucine
412
+
413
+ # Modified glutamate/aspartate patterns
414
+ if '[C@@H]' in content:
415
+ if '[C@@H](C[C@@H](F))' in content:
416
+ return 'FGA4', mods # 4-Fluoro-glutamic acid
417
+ if '[C@@H](C[C@@H](O))' in content:
418
+ return '3GL', mods # 4-hydroxy-glutamic-acid
419
+ if '[C@@H](C[C@H](C))' in content:
420
+ return 'LME', mods # (3r)-3-methyl-l-glutamic acid
421
+ if '[C@@H](CC[C@H](C))' in content:
422
+ return 'MEG', mods # (3s)-3-methyl-l-glutamic acid
423
+
424
+ # Sulfur and selenium modifications
425
+ if 'S' in content:
426
+ if 'SCC[C@@H]' in content:
427
+ return 'HSER', mods # homoserine
428
+ if 'SCCN' in content:
429
+ return 'SLZ', mods # thialysine
430
+ if 'SC(=O)' in content:
431
+ return 'CSA', mods # s-acetonylcysteine
432
+ if '[S@@](=O)' in content:
433
+ return 'SME', mods # Methionine sulfoxide
434
+ if 'S(=O)(=O)' in content:
435
+ return 'OMT', mods # Methionine sulfone
436
+
437
+ # Double bond containing
438
+ if 'C=' in content:
439
+ if 'C=C[C@@H]' in content:
440
+ return '2AG', mods # 2-Allyl-glycine
441
+ if 'C=C[C@@H]' in content:
442
+ return 'LVG', mods # vinylglycine
443
+ if 'C=Cc1ccccc1' in content:
444
+ return 'STYA', mods # Styrylalanine
445
+
446
+ # Special cases
447
+ if '[C@@H]1Cc2c(C1)cccc2' in content:
448
+ return 'IGL', mods # alpha-amino-2-indanacetic acid
449
+ if '[C](=[C](=O)=O)=O' in content:
450
+ return '26P', mods # 2-amino-6-oxopimelic acid
451
+ if '[C](=[C](=O)=O)=C' in content:
452
+ return '2NP', mods # l-2-amino-6-methylene-pimelic acid
453
+ if 'c2cnc[nH]2' in content:
454
+ return 'HIS', mods # histidine core
455
+ if 'c1cccc2c1cc(O)cc2' in content:
456
+ return 'NAO1', mods # 5-hydroxy-1-naphthalene
457
+ if 'c1ccc2c(c1)cc(O)cc2' in content:
458
+ return 'NAO2', mods # 6-hydroxy-2-naphthalene
459
+
460
+ # Proline (P) - flexible ring numbers
461
+ if any([
462
+ # Check for any ring number in bond patterns
463
+ (segment.get('bond_after', '').startswith(f'N{n}C(=O)') and 'CCC' in content and
464
+ any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
465
+ for n in '123456789'
466
+ ]) or any([
467
+ # Check ending patterns with any ring number
468
+ (f'CCCN{n}' in content and content.endswith('=O') and
469
+ any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
470
+ for n in '123456789'
471
+ ]) or any([
472
+ # Handle CCC[C@H]n patterns
473
+ (content == f'CCC[C@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
474
+ (content == f'CCC[C@@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
475
+ # N-terminal Pro with any ring number
476
+ (f'N{n}CCC[C@H]{n}' in content) or
477
+ (f'N{n}CCC[C@@H]{n}' in content)
478
+ for n in '123456789'
479
+ ]):
480
+ return 'Pro', mods
481
+
482
+ # Tryptophan (W) - more specific indole pattern
483
+ if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
484
+ 'c[nH]c' in content.replace(' ', ''):
485
+ return 'Trp', mods
486
+
487
+ # Lysine (K) - both patterns
488
+ if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
489
+ return 'Lys', mods
490
+
491
+ # Arginine (R) - both patterns
492
+ if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
493
+ return 'Arg', mods
494
 
495
  if ('C[C@H](CCCC)' in content or 'C[C@@H](CCCC)' in content) and 'CC(C)' not in content:
496
  return 'Nle', mods
 
537
  return 'Leu', mods
538
  if '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content:
539
  return 'Leu', mods
540
+
 
 
 
541
  if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content:
542
  return 'Thr', mods
543
 
544
  if '[C@H](Cc2ccccc2)' in content or '[C@@H](Cc2ccccc2)' in content:
545
  return 'Phe', mods
546
 
547
+ if ('[C@H](C(C)C)' in content or # With outer parentheses
548
+ '[C@@H](C(C)C)' in content or # With outer parentheses
549
+ '[C@H]C(C)C' in content or # Without outer parentheses
550
+ '[C@@H]C(C)C' in content): # Without outer parentheses
551
+ if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]']): # Still check not Leu
552
  return 'Val', mods
553
 
554
  if '[C@H](COC(C)(C)C)' in content or '[C@@H](COC(C)(C)C)' in content:
555
  return 'O-tBu', mods
556
 
557
+ if any([
558
+ 'CC[C@H](C)' in content,
559
+ 'CC[C@@H](C)' in content,
560
+ 'C(C)C[C@H]' in content and 'CC(C)C' not in content,
561
+ 'C(C)C[C@@H]' in content and 'CC(C)C' not in content
562
+ ]):
563
+ return 'Ile', mods
564
+
565
  if ('[C@H](C)' in content or '[C@@H](C)' in content):
566
+ if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
567
  return 'Ala', mods
568
 
569
  # Tyrosine (Tyr) - 4-hydroxybenzyl side chain
570
+ if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
571
  return 'Tyr', mods
572
+
 
 
 
573
 
574
  # Serine (Ser) - Hydroxymethyl side chain
575
  if '[C@H](CO)' in content or '[C@@H](CO)' in content:
 
577
  return 'Ser', mods
578
 
579
  # Threonine (Thr) - 1-hydroxyethyl side chain
580
+ if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content or '[C@@H](C)O' in content or '[C@H](C)O' in content:
581
  return 'Thr', mods
582
 
583
  # Cysteine (Cys) - Thiol side chain
 
604
  if ('CCC(=O)O' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
605
  return 'Glu', mods
606
 
 
 
 
 
607
  # Arginine (Arg) - 3-guanidinopropyl side chain
608
  if ('CCCNC(=N)N' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
609
  return 'Arg', mods
 
625
  return mods
626
 
627
  def analyze_structure(self, smiles):
628
+ """Main analysis function with debug output"""
629
  print("\nAnalyzing structure:", smiles)
630
 
631
  # Split into segments
 
651
  print(f"Warning: Could not identify residue in segment: {segment['content']}")
652
 
653
  # Check if cyclic
654
+ is_cyclic, peptide_cycles, aromatic_cycles = self.is_cyclic(smiles)
655
+ three_letter = '-'.join(sequence)
656
+ one_letter = ''.join(self.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence)
657
+
658
+ if is_cyclic:
659
+ three_letter = f"cyclo({three_letter})"
660
+ one_letter = f"cyclo({one_letter})"
661
 
662
+ print(f"\nFinal sequence: {three_letter}")
663
+ print(f"One-letter code: {one_letter}")
664
+ print(f"Is cyclic: {is_cyclic}")
665
+ #print(f"Peptide cycles: {peptide_cycles}")
666
+ #print(f"Aromatic cycles: {aromatic_cycles}")
667
+
668
+ return {
669
+ 'three_letter': three_letter,
670
+ 'one_letter': one_letter,
671
+ 'is_cyclic': is_cyclic
672
+ }
673
 
674
  """
675
  def annotate_cyclic_structure(mol, sequence):
 
923
  plt.tight_layout()
924
  return fig
925
 
926
class PeptideStructureGenerator:
    """Generate 3D structures of peptides with RDKit (ETKDG, optionally UFF)."""

    @staticmethod
    def prepare_molecule(smiles):
        """Build a hydrogen-complete RDKit molecule from a SMILES string.

        The SMILES is parsed without sanitization, property caches are
        refreshed non-strictly, and only a selected subset of sanitization
        steps is applied before explicit hydrogens are added.

        Raises:
            ValueError: If RDKit cannot parse the SMILES.
        """
        mol = Chem.MolFromSmiles(smiles, sanitize=False)
        if mol is None:
            raise ValueError("Failed to create molecule from SMILES")

        # Compute valence information without strict checking.
        for atom in mol.GetAtoms():
            atom.UpdatePropertyCache(strict=False)

        # Partial sanitization: only the operations listed here are run.
        Chem.SanitizeMol(
            mol,
            sanitizeOps=Chem.SANITIZE_FINDRADICALS
            | Chem.SANITIZE_KEKULIZE
            | Chem.SANITIZE_SETAROMATICITY
            | Chem.SANITIZE_SETCONJUGATION
            | Chem.SANITIZE_SETHYBRIDIZATION
            | Chem.SANITIZE_CLEANUPCHIRALITY,
        )

        return Chem.AddHs(mol)

    @staticmethod
    def get_etkdg_params(attempt=0):
        """Return ETKDGv3 embedding parameters for the given retry number.

        Attempts above 10 switch off the experimental torsion preferences
        and set a progressively larger ``bondLength`` value.
        """
        params = AllChem.ETKDGv3()
        params.randomSeed = -1
        params.maxIterations = 200
        params.numThreads = 4  # Reduced for web interface
        params.useBasicKnowledge = True
        params.enforceChirality = True
        params.useExpTorsionAnglePrefs = True
        params.useSmallRingTorsions = True
        params.useMacrocycleTorsions = True
        params.ETversion = 2
        params.pruneRmsThresh = -1
        # NOTE(review): confirm embedRmsThresh is an attribute RDKit's
        # EmbedParameters actually reads; it may be a silent no-op.
        params.embedRmsThresh = 0.5

        if attempt > 10:
            # NOTE(review): same concern for bondLength - verify that the
            # embedder honours it before relying on this relaxation.
            params.bondLength = 1.5 + (attempt - 10) * 0.02
            params.useExpTorsionAnglePrefs = False

        return params

    def generate_structure_etkdg(self, smiles, max_attempts=20):
        """Embed a 3D conformer with ETKDG (no force-field optimization).

        Retries up to ``max_attempts`` times with parameters from
        ``get_etkdg_params`` and returns the first molecule whose embedding
        succeeds.

        Raises:
            ValueError: If no attempt succeeds; the last exception raised
                during the attempts (if any) is chained as the cause.
        """
        last_error = None

        for attempt in range(max_attempts):
            try:
                mol = self.prepare_molecule(smiles)
                params = self.get_etkdg_params(attempt)
                if AllChem.EmbedMolecule(mol, params) == 0:
                    return mol
            except Exception as exc:
                # Best-effort retry, but remember why it failed so the
                # final error is diagnosable instead of silently swallowed.
                last_error = exc

        raise ValueError("Failed to generate structure with ETKDG") from last_error

    def generate_structure_uff(self, smiles, max_attempts=20):
        """Embed with ETKDG, optimize with UFF, and keep the lowest energy.

        Every attempt that embeds and whose UFF optimization returns 0 is
        scored with the UFF force-field energy; the molecule with the lowest
        energy over all attempts is returned.

        Raises:
            ValueError: If no attempt yields an optimized structure; the
                last exception raised during the attempts (if any) is
                chained as the cause.
        """
        best_mol = None
        lowest_energy = float('inf')
        last_error = None

        for attempt in range(max_attempts):
            try:
                candidate = self.prepare_molecule(smiles)
                params = self.get_etkdg_params(attempt)

                if AllChem.EmbedMolecule(candidate, params) != 0:
                    continue

                status = AllChem.UFFOptimizeMolecule(
                    candidate, maxIters=2000, vdwThresh=10.0, confId=0,
                    ignoreInterfragInteractions=True,
                )
                if status != 0:
                    continue

                force_field = AllChem.UFFGetMoleculeForceField(candidate)
                if not force_field:
                    continue

                energy = force_field.CalcEnergy()
                if energy < lowest_energy:
                    lowest_energy = energy
                    best_mol = Chem.Mol(candidate)  # keep an independent copy
            except Exception as exc:
                # Best-effort retry; keep the cause for the final error.
                last_error = exc

        if best_mol is None:
            raise ValueError("Failed to generate optimized structure") from last_error

        return best_mol

    @staticmethod
    def mol_to_sdf_bytes(mol):
        """Serialize an RDKit molecule to SDF and return it as UTF-8 bytes.

        The SDF is written into a text buffer and encoded afterwards, since
        RDKit's SDWriter expects a text-mode file object rather than a
        binary one such as BytesIO.
        """
        from io import StringIO  # local import: only BytesIO is known to be in file scope

        buffer = StringIO()
        writer = Chem.SDWriter(buffer)
        try:
            writer.write(mol)
        finally:
            # close() flushes the record into the buffer.
            writer.close()
        return buffer.getvalue().encode('utf-8')
1033
+
1034
+ def process_input(smiles_input=None, file_obj=None, show_linear=False,
1035
+ show_segment_details=False, generate_3d=False, use_uff=False):
1036
  """Process input and create visualizations using PeptideAnalyzer"""
1037
  analyzer = PeptideAnalyzer()
1038
+ structure_files = []
1039
 
1040
  # Handle direct SMILES input
1041
  if smiles_input:
 
1044
  # First check if it's a peptide using analyzer's method
1045
  if not analyzer.is_peptide(smiles):
1046
  return "Error: Input SMILES does not appear to be a peptide structure.", None, None
1047
+
1048
  try:
1049
  # Create molecule
1050
  mol = Chem.MolFromSmiles(smiles)
1051
  if mol is None:
1052
  return "Error: Invalid SMILES notation.", None, None
1053
 
1054
+ # Generate 3D structures if requested
1055
+ if generate_3d:
1056
+ generator = PeptideStructureGenerator()
1057
+
1058
+ try:
1059
+ # Generate ETKDG structure
1060
+ mol_etkdg = generator.generate_structure_etkdg(smiles)
1061
+ etkdg_bytes = generator.mol_to_sdf_bytes(mol_etkdg)
1062
+ structure_files.append(("structure_etkdg.sdf", etkdg_bytes))
1063
+
1064
+ # Generate UFF structure if requested
1065
+ if use_uff:
1066
+ mol_uff = generator.generate_structure_uff(smiles)
1067
+ uff_bytes = generator.mol_to_sdf_bytes(mol_uff)
1068
+ structure_files.append(("structure_uff.sdf", uff_bytes))
1069
+
1070
+ except Exception as e:
1071
+ return f"Error generating 3D structures: {str(e)}", None, None, []
1072
+
1073
  # Use analyzer to get sequence
1074
  segments = analyzer.split_on_bonds(smiles)
1075
 
 
1132
  summary += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
1133
  #summary += f"Aromatic Cycles: {', '.join(aromatic_cycles)}\n"
1134
 
1135
+ return summary + output_text, img_cyclic, img_linear, structure_files
1136
 
1137
  except Exception as e:
1138
  return f"Error processing SMILES: {str(e)}", None, None
 
1223
  gr.Checkbox(
1224
  label="Show segment details",
1225
  value=False
1226
+ ),
1227
+ gr.Checkbox(
1228
+ label="Generate 3D structure (sdf file format)",
1229
+ value=False
1230
+ ),
1231
+ gr.Checkbox(
1232
+ label="Use UFF optimization",
1233
+ value=False
1234
  )
1235
  ],
1236
  outputs=[
 
1255
  3. Parses the amino acid sequence
1256
  4. Creates 2D structure visualization with residue annotations
1257
  5. Optional linear representation
1258
+ 6. Optional 3D structure generation (ETKDG and UFF methods)
1259
 
1260
  Input: Either enter a SMILES string directly or upload a text file containing SMILES strings
1261