lkjjj26 committed on
Commit
6365aaf
·
verified ·
1 Parent(s): e0471a2

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -44
app.py CHANGED
@@ -36,7 +36,7 @@ class PDBSearchAssistant:
36
  Extract specific search parameters from the protein-related query:
37
  1. Protein name or type
38
  2. Resolution cutoff (in Å)
39
- 3. Sequence information
40
  4. Specific PDB ID
41
  5. Experimental method (X-RAY, EM, NMR)
42
  6. Organism/Species information
@@ -58,6 +58,46 @@ class PDBSearchAssistant:
58
  PDB_ID: none
59
  Method: X-RAY
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  Now analyze:
62
  Query: {query}
63
  """
@@ -380,52 +420,65 @@ class PDBSearchAssistant:
380
  print(f"Error in direct API call for PDB ID {pdb_id}: {str(e)}")
381
  return []
382
 
383
- def analyze_query_type(self, query):
384
- """Analyze query type and extract relevant information"""
385
- print(f"\nAnalyzing query: '{query}'") # 입력된 쿼리 출력
386
- query = query.lower().strip()
387
- print(f"Lowercase query: '{query}'") # 소문자로 변환된 쿼리 출력
388
-
389
- # Check for sequence query pattern
390
- sequence_patterns = [
391
- r"sequence\s+of\s+pdb\s+id\s+([a-zA-Z0-9]{4})",
392
- r"sequence\s+for\s+pdb\s+id\s+([a-zA-Z0-9]{4})",
393
- r"get\s+sequence\s+([a-zA-Z0-9]{4})",
394
- r"([a-zA-Z0-9]{4})\s+sequence"
395
- ]
396
-
397
- for i, pattern in enumerate(sequence_patterns):
398
- print(f"Trying pattern {i+1}: {pattern}") # 각 패턴 시도 출력
399
- match = re.search(pattern, query)
400
- if match:
401
- pdb_id = match.group(1).upper()
402
- print(f"Match found! PDB ID: {pdb_id}") # 매치된 PDB ID 출력
 
 
 
 
403
  return {
404
  "type": "sequence",
405
- "pdb_id": pdb_id
406
  }
407
-
408
- print("No sequence pattern matched, treating as structure search") # 구조 검색으로 처리
409
- return {
410
- "type": "structure",
411
- "query": query
412
- }
413
-
414
- def process_query(self, query):
415
- """Process query and return appropriate results"""
416
- query_info = self.analyze_query_type(query)
417
-
418
- if query_info["type"] == "sequence":
419
- return {
420
- "type": "sequence",
421
- "results": self.get_sequences_by_pdb_id(query_info["pdb_id"])
422
- }
423
- else:
424
  return {
425
  "type": "structure",
426
- "results": self.search_pdb(query_info["query"])
427
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428
 
 
429
 
430
  def create_interactive_table(df):
431
  if df.empty:
@@ -610,8 +663,7 @@ app_ui = ui.page_fluid(
610
  ui.tags.ul(
611
  ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
612
  ui.tags.li("Find structures containing sequence MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"),
613
- ui.tags.li("Sequence of PDB ID 8ET6"),
614
- ui.tags.li("Get sequence 7BZ5")
615
  )
616
  )
617
  )
@@ -707,8 +759,6 @@ def server(input, output, session):
707
  @render.text
708
  def sequence_output():
709
  current_results = results_store.get()
710
- print(current_results["type"])
711
- print(current_results["results"])
712
  if current_results["type"] == "sequence":
713
  sequences = current_results["results"]
714
  if not sequences:
 
36
  Extract specific search parameters from the protein-related query:
37
  1. Protein name or type
38
  2. Resolution cutoff (in Å)
39
+ 3. Protein sequence information
40
  4. Specific PDB ID
41
  5. Experimental method (X-RAY, EM, NMR)
42
  6. Organism/Species information
 
58
  PDB_ID: none
59
  Method: X-RAY
60
 
61
+ Query: "Get sequence of PDB ID 8ET6"
62
+ Protein: none
63
+ Organism: none
64
+ Resolution: none
65
+ Sequence: none
66
+ PDB_ID: 8ET6
67
+ Method: none
68
+
69
+ Query: "Sequence of 7BZ5"
70
+ Protein: none
71
+ Organism: none
72
+ Resolution: none
73
+ Sequence: none
74
+ PDB_ID: 7BZ5
75
+ Method: none
76
+
77
+ Query: "7BZ5"
78
+ Protein: none
79
+ Organism: none
80
+ Resolution: none
81
+ Sequence: none
82
+ PDB_ID: 7BZ5
83
+ Method: none
84
+
85
+ Query: "6KAO"
86
+ Protein: none
87
+ Organism: none
88
+ Resolution: none
89
+ Sequence: none
90
+ PDB_ID: 6KAO
91
+ Method: none
92
+
93
+ Query: "Find structures containing sequence MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"
94
+ Protein: none
95
+ Organism: none
96
+ Resolution: none
97
+ Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL
98
+ PDB_ID: none
99
+ Method: none
100
+
101
  Now analyze:
102
  Query: {query}
103
  """
 
420
  print(f"Error in direct API call for PDB ID {pdb_id}: {str(e)}")
421
  return []
422
 
423
+ def process_query(self, query):
424
+ """Process query and return results"""
425
+ try:
426
+ # Get search parameters from LLM
427
+ formatted_prompt = self.prompt_template.format(query=query)
428
+ response = self.pipe(formatted_prompt)[0]['generated_text']
429
+ print("Generated parameters:", response)
430
+
431
+ # Parse LLM response for PDB ID
432
+ pdb_id = None
433
+ for line in response.split('\n'):
434
+ if 'PDB_ID:' in line:
435
+ value = line.split('PDB_ID:')[1].strip()
436
+ if value.lower() not in ['none', 'n/a']:
437
+ pdb_id = value.upper()
438
+ break
439
+
440
+ # Check if query is asking for sequence
441
+ sequence_keywords = ['sequence', 'seq']
442
+ is_sequence_query = any(keyword in query.lower() for keyword in sequence_keywords)
443
+
444
+ if is_sequence_query and pdb_id:
445
+ # Get sequences for the PDB ID
446
+ sequences = self.get_sequences_by_pdb_id(pdb_id)
447
  return {
448
  "type": "sequence",
449
+ "results": sequences
450
  }
451
+
452
+ # If not a sequence query or no PDB ID found, proceed with normal structure search
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453
  return {
454
  "type": "structure",
455
+ "results": self.search_pdb(query)
456
  }
457
+
458
+ except Exception as e:
459
+ print(f"Error processing query: {str(e)}")
460
+ return {"type": "structure", "results": []}
461
+
462
+ def pdbsummary(name):
463
+
464
+ search_engine = ProteinSearchEngine()
465
+
466
+ query = ProteinQuery(
467
+ name,
468
+ max_resolution= 5.0
469
+ )
470
+
471
+ results = search_engine.search(query)
472
+
473
+ answer = ""
474
+ for i, structure in enumerate(results, 1):
475
+ answer += f"\n{i}. PDB ID : {structure.pdb_id}\n"
476
+ answer += f"\nResolution : {structure.resolution:.2f} A \n"
477
+ answer += f"Method : {structure.method}\n Title : {structure.title}\n"
478
+ answer += f"Release Date : {structure.release_date}\n Sequence length: {len(structure.sequence)} aa\n"
479
+ answer += f" Sequence:\n {structure.sequence}\n"
480
 
481
+ return answer
482
 
483
  def create_interactive_table(df):
484
  if df.empty:
 
663
  ui.tags.ul(
664
  ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
665
  ui.tags.li("Find structures containing sequence MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"),
666
+ ui.tags.li("Sequence of PDB ID 8ET6")
 
667
  )
668
  )
669
  )
 
759
  @render.text
760
  def sequence_output():
761
  current_results = results_store.get()
 
 
762
  if current_results["type"] == "sequence":
763
  sequences = current_results["results"]
764
  if not sequences: