lkjjj26 committed on
Commit
e38319c
·
1 Parent(s): 1742853

update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -90
app.py CHANGED
@@ -1,9 +1,10 @@
1
  from transformers import pipeline
2
  from rcsbsearchapi import AttributeQuery
3
- from rcsbsearchapi.search import SequenceQuery
4
  import os
5
  from dotenv import load_dotenv
6
  from shiny import App, render, ui, reactive
 
7
  import pandas as pd
8
  import warnings
9
  import re
@@ -22,7 +23,7 @@ warnings.filterwarnings('ignore')
22
  load_dotenv()
23
 
24
  class PDBSearchAssistant:
25
- def __init__(self, model_name="google/flan-t5-large"):
26
  # Set up HuggingFace pipeline with better model
27
  self.pipe = pipeline(
28
  "text2text-generation",
@@ -30,7 +31,7 @@ class PDBSearchAssistant:
30
  max_new_tokens=1024,
31
  temperature=0.1,
32
  torch_dtype="auto",
33
- device="cuda"
34
  )
35
 
36
  self.prompt_template = """
@@ -159,18 +160,26 @@ class PDBSearchAssistant:
159
  # Clean and normalize remaining response
160
  # Remove all resolution entries to avoid confusion
161
  cleaned_response = re.sub(r'[Rr]esolution:\s*\d+(?:\.\d+)?(?:\s*Å?)?\s*', '', response)
162
-
 
163
  # Split remaining response into clean key-value pairs
164
  response_pairs = {}
165
  for pair in re.finditer(r'(\w+):\s*([^:]+?)(?=\s+\w+:|$)', cleaned_response):
166
  key, value = pair.groups()
 
167
  key = key.lower()
168
  value = value.strip()
169
  if value.lower() not in ['none', 'n/a']:
170
  response_pairs[key] = value
171
 
172
  print("Parsed response pairs:", response_pairs) # Debug print
173
-
 
 
 
 
 
 
174
  # Extract sequence and similarity from cleaned pairs
175
  if 'sequence' in response_pairs:
176
  sequence = response_pairs['sequence']
@@ -241,7 +250,7 @@ class PDBSearchAssistant:
241
  protein_entity_query = AttributeQuery(
242
  attribute="rcsb_entity_container_identifiers.entity_names.value",
243
  operator="contains_words",
244
- value=protein_name
245
  )
246
  queries.append(protein_entity_query)
247
 
@@ -342,18 +351,20 @@ class PDBSearchAssistant:
342
  print("Final query:", final_query)
343
 
344
  # Execute search
345
- session = final_query.exec()
346
  results = []
347
 
348
  # Process results with additional information
349
  # search_engine = ProteinSearchEngine()
350
 
 
351
  try:
352
  for entry in session:
353
  try:
354
  # PDB ID 추출 방식 개선
355
  if isinstance(entry, dict):
356
- pdb_id = entry.get('identifier')
 
357
  elif hasattr(entry, 'identifier'):
358
  pdb_id = entry.identifier
359
  else:
@@ -363,7 +374,11 @@ class PDBSearchAssistant:
363
 
364
  if not pdb_id or len(pdb_id) != 4: # PDB ID는 항상 4자리
365
  continue
366
-
 
 
 
 
367
  # RCSB PDB REST API를 직접 사용하여 구조 정보 가져오기
368
  structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
369
  response = requests.get(structure_url)
@@ -381,20 +396,18 @@ class PDBSearchAssistant:
381
  'Resolution': f"{structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0]:.2f}Å",
382
  'Method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
383
  'Release Date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
384
-
385
-
386
  }
387
 
388
  results.append(result)
389
-
390
- # Limit to top 10 results
391
- if len(results) >= 10:
392
  break
393
 
394
  except Exception as e:
395
  print(f"Error processing entry: {str(e)}")
396
  continue
397
-
398
  except Exception as e:
399
  print(f"Error processing results: {str(e)}")
400
  print(f"Error type: {type(e)}")
@@ -421,46 +434,45 @@ class PDBSearchAssistant:
421
  pdb_path = self.pdbl.retrieve_pdb_file(
422
  pdb_id,
423
  pdir=self.pdb_dir,
424
- file_format="pdb"
425
  )
426
 
 
 
 
 
 
427
  if not pdb_path or not os.path.exists(pdb_path):
428
  print(f"Failed to download PDB file for {pdb_id}")
429
 
430
- structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
431
- response = requests.get(structure_url)
432
- structure_data = response.json() if response.status_code == 200 else {}
433
-
434
-
435
- sequence_url = f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/1"
436
- seq_response = requests.get(sequence_url)
437
- seq_data = seq_response.json() if response.status_code == 200 else {}
438
- sequence = seq_data.get('entity_poly', {}).get('pdbx_seq_one_letter_code', 'N/A')
439
-
440
  sequences = []
441
 
442
- chain_info = {
443
- 'chain_id': "A", # chain.id, 임의 설정 api 3개써서 가져오기는 가능
444
- 'entity_id': '1', # Default entity ID
445
- 'description': structure_data.get('struct', {}).get('title', 'N/A'),
446
- 'sequence': sequence,
447
- 'length': len(sequence),
448
- 'resolution': structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0],
449
- 'method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
450
- 'release_date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
451
- }
452
- sequences.append(chain_info)
 
 
 
 
 
 
 
 
 
453
  return sequences
454
 
455
  # Parse structure
456
  parser = PDB.PDBParser(QUIET=True)
457
  structure = parser.get_structure(pdb_id, pdb_path)
458
 
459
- # Get structure info from RCSB API for additional details
460
- structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
461
- response = requests.get(structure_url)
462
- structure_data = response.json() if response.status_code == 200 else {}
463
-
464
  sequences = []
465
  # Extract sequences from each chain
466
  for model in structure:
@@ -543,9 +555,29 @@ class PDBSearchAssistant:
543
  print(f"Error processing query: {str(e)}")
544
  return {"type": "structure", "results": []}
545
 
546
- def render_html(pdb_id):
547
- if pdb_id is None:
548
  return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
549
  html_content = f"""
550
  <!DOCTYPE html>
551
  <html>
@@ -561,12 +593,16 @@ def render_html(pdb_id):
561
  </style>
562
  </head>
563
  <body>
 
 
 
564
  <div class="viewer_3Dmoljs"
565
  data-pdb="{pdb_id}"
566
  data-backgroundcolor="0xffffff"
567
  data-style="cartoon:color=spectrum"
568
  data-spin="axis:y;speed:0.2">
569
  </div>
 
570
  </body>
571
  </html>
572
  """
@@ -582,49 +618,14 @@ def render_html(pdb_id):
582
  return f'<iframe style="width: 100%; height: 480px; border: none;" srcdoc=\'{escaped_content}\'></iframe>'
583
 
584
  def create_interactive_table(df):
585
- if df.empty:
586
- return go.Figure()
587
-
588
  # Reorder columns - Add '# of atoms of protein' to the column order
589
  column_order = ['PDB ID', 'Resolution', 'Title','# of total residues', '# of atoms of protein', 'Method','Release Date']
590
  df = df[column_order]
591
 
592
  # Release Date 형식 변경 (YYYY-MM-DD)
593
  df['Release Date'] = pd.to_datetime(df['Release Date']).dt.strftime('%Y-%m-%d')
 
594
 
595
- # Create interactive table
596
- table = go.Figure(data=[go.Table(
597
- header=dict(
598
- values=list(df.columns),
599
- fill_color='paleturquoise',
600
- align='center',
601
- font=dict(size=16),
602
- ),
603
- cells=dict(
604
- values=[
605
- [f'<a href="https://www.rcsb.org/structure/{cell}">{cell}</a>'
606
- if i == 0 else cell
607
- for cell in df[col]]
608
- for i, col in enumerate(df.columns)
609
- ],
610
- align='center',
611
- font=dict(size=15),
612
- height=35
613
- ),
614
- columnwidth=[80, 80, 400, 100, 100, 100, 100], # Updated columnwidth to include new column
615
- customdata=[['html'] * len(df) if i == 0 else [''] * len(df)
616
- for i in range(len(df.columns))],
617
- hoverlabel=dict(bgcolor='white')
618
- )])
619
-
620
- # Update table layout
621
- table.update_layout(
622
- margin=dict(l=20, r=20, t=20, b=20),
623
- height=450,
624
- autosize=True
625
- )
626
-
627
- return table
628
 
629
  # Simplified Shiny app UI definition
630
  app_ui = ui.page_fluid(
@@ -643,6 +644,15 @@ app_ui = ui.page_fluid(
643
  color: #0a58ca;
644
  text-decoration: underline;
645
  }
 
 
 
 
 
 
 
 
 
646
  .shiny-input-container {
647
  max-width: 100%;
648
  margin: 0 auto;
@@ -970,10 +980,13 @@ app_ui = ui.page_fluid(
970
  align-items: ;
971
  justify-content: flex-start;
972
  gap: 5px;
 
973
  margin-bottom: 20px;
974
  margin-left: 20px;
975
  }
976
-
 
 
977
  .pdb-select-label {
978
  font-weight: bold;
979
  margin: 0;
@@ -1046,6 +1059,7 @@ app_ui = ui.page_fluid(
1046
  ui.tags.ul(
1047
  ui.tags.li("Sequence of PDB ID 8ET6"),
1048
  ui.tags.li("Spike protein"),
 
1049
  ui.tags.li("Human insulin"),
1050
  ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
1051
  ui.tags.li("Find structures containing sequence with similarity 90% FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKR"),
@@ -1060,8 +1074,11 @@ app_ui = ui.page_fluid(
1060
  ui.column(12,
1061
  ui.div(
1062
  {"class": "results-section"},
1063
- ui.h4("Top 10 PDBs Results"),
1064
- output_widget("results_table"),
 
 
 
1065
  ui.download_button("download", "Download Results",
1066
  class_="btn btn-info")
1067
  )
@@ -1075,7 +1092,7 @@ app_ui = ui.page_fluid(
1075
  "Select PDB ID",
1076
  {"class": "pdb-select-label"}
1077
  ),
1078
- ui.input_select(
1079
  "selected_pdb",
1080
  "", # Label is empty as we're using a separate label
1081
  choices=[],
@@ -1144,21 +1161,21 @@ def server(input, output, session):
1144
  status_store.set("Ready")
1145
  pdb_ids = df['PDB ID'].tolist()
1146
  @output
1147
- @render_widget
1148
  def results_table():
1149
- return create_interactive_table(df)
1150
 
1151
  if pdb_ids:
1152
  pdb_ids_store.set(pdb_ids)
1153
  # Update only one dropdown
1154
- ui.update_select(
1155
  "selected_pdb",
1156
  choices=pdb_ids,
1157
- selected=pdb_ids[0]
1158
  )
1159
  else:
1160
  pdb_ids_store.set([])
1161
- ui.update_select(
1162
  "selected_pdb",
1163
  choices=[],
1164
  selected=None
@@ -1202,8 +1219,11 @@ def server(input, output, session):
1202
  @render.ui
1203
  def output_iframe():
1204
  selected_pdb = input.selected_pdb()
 
 
 
1205
  if selected_pdb:
1206
- return ui.HTML(render_html(selected_pdb))
1207
  return ui.HTML("")
1208
 
1209
  @output
 
1
  from transformers import pipeline
2
  from rcsbsearchapi import AttributeQuery
3
+ from rcsbsearchapi.search import SequenceQuery, SeqMotifQuery
4
  import os
5
  from dotenv import load_dotenv
6
  from shiny import App, render, ui, reactive
7
+ from itables.shiny import DT
8
  import pandas as pd
9
  import warnings
10
  import re
 
23
  load_dotenv()
24
 
25
  class PDBSearchAssistant:
26
+ def __init__(self, model_name="google/flan-t5-large"): # google/flan-t5-large or Rostlab/prot_t5_xl_uniref50 11GB
27
  # Set up HuggingFace pipeline with better model
28
  self.pipe = pipeline(
29
  "text2text-generation",
 
31
  max_new_tokens=1024,
32
  temperature=0.1,
33
  torch_dtype="auto",
34
+ device="cpu" # cuda or cpu
35
  )
36
 
37
  self.prompt_template = """
 
160
  # Clean and normalize remaining response
161
  # Remove all resolution entries to avoid confusion
162
  cleaned_response = re.sub(r'[Rr]esolution:\s*\d+(?:\.\d+)?(?:\s*Å?)?\s*', '', response)
163
+ print("cleaned_responese :", cleaned_response)
164
+
165
  # Split remaining response into clean key-value pairs
166
  response_pairs = {}
167
  for pair in re.finditer(r'(\w+):\s*([^:]+?)(?=\s+\w+:|$)', cleaned_response):
168
  key, value = pair.groups()
169
+ print(key, value)
170
  key = key.lower()
171
  value = value.strip()
172
  if value.lower() not in ['none', 'n/a']:
173
  response_pairs[key] = value
174
 
175
  print("Parsed response pairs:", response_pairs) # Debug print
176
+
177
+ # case LLM remove all input, if input has any param word -> replace input to value
178
+ if not response_pairs:
179
+ if 'protein' in response:
180
+ response_pairs['protein'] = response
181
+ print("Replaced response pairs:", response_pairs) # Debug print
182
+
183
  # Extract sequence and similarity from cleaned pairs
184
  if 'sequence' in response_pairs:
185
  sequence = response_pairs['sequence']
 
250
  protein_entity_query = AttributeQuery(
251
  attribute="rcsb_entity_container_identifiers.entity_names.value",
252
  operator="contains_words",
253
+ value=protein_name
254
  )
255
  queries.append(protein_entity_query)
256
 
 
351
  print("Final query:", final_query)
352
 
353
  # Execute search
354
+ session = final_query.exec(results_verbosity="minimal") # query return identifier, score
355
  results = []
356
 
357
  # Process results with additional information
358
  # search_engine = ProteinSearchEngine()
359
 
360
+
361
  try:
362
  for entry in session:
363
  try:
364
  # PDB ID 추출 방식 개선
365
  if isinstance(entry, dict):
366
+ if entry.get('score') > 0.75:
367
+ pdb_id = entry.get('identifier')
368
  elif hasattr(entry, 'identifier'):
369
  pdb_id = entry.identifier
370
  else:
 
374
 
375
  if not pdb_id or len(pdb_id) != 4: # PDB ID는 항상 4자리
376
  continue
377
+
378
+ # thresh hold
379
+ if len(results) > 1 and results[-1]["PDB ID"] == pdb_id:
380
+ break
381
+
382
  # RCSB PDB REST API를 직접 사용하여 구조 정보 가져오기
383
  structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
384
  response = requests.get(structure_url)
 
396
  'Resolution': f"{structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0]:.2f}Å",
397
  'Method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
398
  'Release Date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
 
 
399
  }
400
 
401
  results.append(result)
402
+
403
+ # Limit to max 500
404
+ if len(results) >= 500:
405
  break
406
 
407
  except Exception as e:
408
  print(f"Error processing entry: {str(e)}")
409
  continue
410
+
411
  except Exception as e:
412
  print(f"Error processing results: {str(e)}")
413
  print(f"Error type: {type(e)}")
 
434
  pdb_path = self.pdbl.retrieve_pdb_file(
435
  pdb_id,
436
  pdir=self.pdb_dir,
437
+ file_format="pdb"
438
  )
439
 
440
+ # Get structure info from RCSB API for additional details
441
+ structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
442
+ response = requests.get(structure_url)
443
+ structure_data = response.json() if response.status_code == 200 else {}
444
+
445
  if not pdb_path or not os.path.exists(pdb_path):
446
  print(f"Failed to download PDB file for {pdb_id}")
447
 
 
 
 
 
 
 
 
 
 
 
448
  sequences = []
449
 
450
+ entity_ids = structure_data.get('rcsb_entry_container_identifiers', {}).get('polymer_entity_ids', {})
451
+ for i in entity_ids:
452
+ sequence_url = f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/{i}"
453
+ seq_response = requests.get(sequence_url)
454
+ seq_data = seq_response.json() if response.status_code == 200 else {}
455
+ sequence = seq_data.get('entity_poly', {}).get('pdbx_seq_one_letter_code_can', 'N/A') # pdbx_seq_one_letter_code
456
+
457
+ chain_info = {
458
+ 'chain_id': seq_data.get('entity_poly', {}).get('pdbx_strand_id', 'N/A'), # chain.id
459
+ 'entity_id': i, # Default entity ID
460
+ 'description': structure_data.get('struct', {}).get('title', 'N/A'),
461
+ 'sequence': sequence,
462
+ 'length': len(sequence),
463
+ 'resolution': structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0],
464
+ 'method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
465
+ 'release_date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
466
+ }
467
+ sequences.append(chain_info)
468
+ print("not Bio pdb list")
469
+
470
  return sequences
471
 
472
  # Parse structure
473
  parser = PDB.PDBParser(QUIET=True)
474
  structure = parser.get_structure(pdb_id, pdb_path)
475
 
 
 
 
 
 
476
  sequences = []
477
  # Extract sequences from each chain
478
  for model in structure:
 
555
  print(f"Error processing query: {str(e)}")
556
  return {"type": "structure", "results": []}
557
 
558
+ def render_html(pdb_id, chain_count):
559
+ if pdb_id is None or chain_count <= 0:
560
  return ""
561
+
562
+ chains = [chr(65 + i) for i in range(chain_count)]
563
+
564
+ # chain block
565
+ chain_html_blocks = "".join([
566
+ f"""
567
+ <div>
568
+ {pdb_id} {chain}
569
+ </div>
570
+ <div class="viewer_3Dmoljs"
571
+ data-pdb="{pdb_id}"
572
+ data-select="chain:{chain}"
573
+ data-backgroundcolor="0xffffff"
574
+ data-style="cartoon:color=spectrum"
575
+ data-spin="axis:y;speed:0.2">
576
+ </div>
577
+ """
578
+ for chain in chains
579
+ ])
580
+
581
  html_content = f"""
582
  <!DOCTYPE html>
583
  <html>
 
593
  </style>
594
  </head>
595
  <body>
596
+ <div>
597
+ {pdb_id}
598
+ </div>
599
  <div class="viewer_3Dmoljs"
600
  data-pdb="{pdb_id}"
601
  data-backgroundcolor="0xffffff"
602
  data-style="cartoon:color=spectrum"
603
  data-spin="axis:y;speed:0.2">
604
  </div>
605
+ {chain_html_blocks}
606
  </body>
607
  </html>
608
  """
 
618
  return f'<iframe style="width: 100%; height: 480px; border: none;" srcdoc=\'{escaped_content}\'></iframe>'
619
 
620
  def create_interactive_table(df):
 
 
 
621
  # Reorder columns - Add '# of atoms of protein' to the column order
622
  column_order = ['PDB ID', 'Resolution', 'Title','# of total residues', '# of atoms of protein', 'Method','Release Date']
623
  df = df[column_order]
624
 
625
  # Release Date 형식 변경 (YYYY-MM-DD)
626
  df['Release Date'] = pd.to_datetime(df['Release Date']).dt.strftime('%Y-%m-%d')
627
+ return df
628
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
629
 
630
  # Simplified Shiny app UI definition
631
  app_ui = ui.page_fluid(
 
644
  color: #0a58ca;
645
  text-decoration: underline;
646
  }
647
+ .dt-layout-cell {
648
+ overflow-x: auto;
649
+ max-width :100%;
650
+ max-height: 600px;
651
+ }
652
+ table colgroup col[data-dt-column="2"] {
653
+ width: 450px !important;
654
+ min-width: 450px !important;
655
+ }
656
  .shiny-input-container {
657
  max-width: 100%;
658
  margin: 0 auto;
 
980
  align-items: ;
981
  justify-content: flex-start;
982
  gap: 5px;
983
+ margin-top: 20px;
984
  margin-bottom: 20px;
985
  margin-left: 20px;
986
  }
987
+ .pdb-selector .form-group.shiny-input-container{
988
+ margin-left: 250px;
989
+ }
990
  .pdb-select-label {
991
  font-weight: bold;
992
  margin: 0;
 
1059
  ui.tags.ul(
1060
  ui.tags.li("Sequence of PDB ID 8ET6"),
1061
  ui.tags.li("Spike protein"),
1062
+ ui.tags.li("Membrane protein"),
1063
  ui.tags.li("Human insulin"),
1064
  ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
1065
  ui.tags.li("Find structures containing sequence with similarity 90% FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKR"),
 
1074
  ui.column(12,
1075
  ui.div(
1076
  {"class": "results-section"},
1077
+ ui.h4("PDB Search Results"),
1078
+ ui.output_ui(
1079
+ "results_table",
1080
+ # {"class": "resres"}
1081
+ ), #output_widget("results_table"),
1082
  ui.download_button("download", "Download Results",
1083
  class_="btn btn-info")
1084
  )
 
1092
  "Select PDB ID",
1093
  {"class": "pdb-select-label"}
1094
  ),
1095
+ ui.input_selectize(
1096
  "selected_pdb",
1097
  "", # Label is empty as we're using a separate label
1098
  choices=[],
 
1161
  status_store.set("Ready")
1162
  pdb_ids = df['PDB ID'].tolist()
1163
  @output
1164
+ @render.ui #render_widget
1165
  def results_table():
1166
+ return ui.HTML(DT(create_interactive_table(df))) #create_interactive_table(df)
1167
 
1168
  if pdb_ids:
1169
  pdb_ids_store.set(pdb_ids)
1170
  # Update only one dropdown
1171
+ ui.update_selectize(
1172
  "selected_pdb",
1173
  choices=pdb_ids,
1174
+ selected=pdb_ids[0] # matching entity 1
1175
  )
1176
  else:
1177
  pdb_ids_store.set([])
1178
+ ui.update_selectize(
1179
  "selected_pdb",
1180
  choices=[],
1181
  selected=None
 
1219
  @render.ui
1220
  def output_iframe():
1221
  selected_pdb = input.selected_pdb()
1222
+ sequences = assistant.get_sequences_by_pdb_id(selected_pdb)
1223
+ chain_cnt = len(sequences)
1224
+
1225
  if selected_pdb:
1226
+ return ui.HTML(render_html(selected_pdb, chain_cnt))
1227
  return ui.HTML("")
1228
 
1229
  @output