Spaces:

lkjjj26
/

query

Sleeping

App Files Files Community

lkjjj26 committed on Jan 22

Commit

e38319c

1 Parent(s): 1742853

update app.py

Browse files

Files changed (1) hide show

app.py +110 -90

app.py CHANGED Viewed

@@ -1,9 +1,10 @@
 from transformers import pipeline
 from rcsbsearchapi import AttributeQuery
-from rcsbsearchapi.search import SequenceQuery
 import os
 from dotenv import load_dotenv
 from shiny import App, render, ui, reactive
 import pandas as pd
 import warnings
 import re
@@ -22,7 +23,7 @@ warnings.filterwarnings('ignore')
 load_dotenv()
 class PDBSearchAssistant:
-    def __init__(self, model_name="google/flan-t5-large"):
         # Set up HuggingFace pipeline with better model
         self.pipe = pipeline(
             "text2text-generation",
@@ -30,7 +31,7 @@ class PDBSearchAssistant:
             max_new_tokens=1024,
             temperature=0.1,
             torch_dtype="auto",
-            device="cuda"
         )
         self.prompt_template = """
@@ -159,18 +160,26 @@ class PDBSearchAssistant:
             # Clean and normalize remaining response
             # Remove all resolution entries to avoid confusion
             cleaned_response = re.sub(r'[Rr]esolution:\s*\d+(?:\.\d+)?(?:\s*Å?)?\s*', '', response)
             # Split remaining response into clean key-value pairs
             response_pairs = {}
             for pair in re.finditer(r'(\w+):\s*([^:]+?)(?=\s+\w+:|$)', cleaned_response):
                 key, value = pair.groups()
                 key = key.lower()
                 value = value.strip()
                 if value.lower() not in ['none', 'n/a']:
                     response_pairs[key] = value
             print("Parsed response pairs:", response_pairs)  # Debug print
             # Extract sequence and similarity from cleaned pairs
             if 'sequence' in response_pairs:
                 sequence = response_pairs['sequence']
@@ -241,7 +250,7 @@ class PDBSearchAssistant:
                     protein_entity_query = AttributeQuery(
                         attribute="rcsb_entity_container_identifiers.entity_names.value",
                         operator="contains_words",
-                        value=protein_name
                     )
                     queries.append(protein_entity_query)
@@ -342,18 +351,20 @@ class PDBSearchAssistant:
                     print("Final query:", final_query)
                     # Execute search
-                    session = final_query.exec()
                     results = []
                     # Process results with additional information
                     # search_engine = ProteinSearchEngine()
                     try:
                         for entry in session:
                             try:
                                 # PDB ID 추출 방식 개선
                                 if isinstance(entry, dict):
-                                    pdb_id = entry.get('identifier')
                                 elif hasattr(entry, 'identifier'):
                                     pdb_id = entry.identifier
                                 else:
@@ -363,7 +374,11 @@ class PDBSearchAssistant:
                                 if not pdb_id or len(pdb_id) != 4:  # PDB ID는 항상 4자리
                                     continue
                                 # RCSB PDB REST API를 직접 사용하여 구조 정보 가져오기
                                 structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
                                 response = requests.get(structure_url)
@@ -381,20 +396,18 @@ class PDBSearchAssistant:
                                     'Resolution': f"{structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0]:.2f}Å",
                                     'Method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
                                     'Release Date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
                                 }
                                 results.append(result)
-                                # Limit to top 10 results
-                                if len(results) >= 10:
                                     break
                             except Exception as e:
                                 print(f"Error processing entry: {str(e)}")
                                 continue
                     except Exception as e:
                         print(f"Error processing results: {str(e)}")
                         print(f"Error type: {type(e)}")
@@ -421,46 +434,45 @@ class PDBSearchAssistant:
             pdb_path = self.pdbl.retrieve_pdb_file(
                 pdb_id,
                 pdir=self.pdb_dir,
-                file_format="pdb"
             )
             if not pdb_path or not os.path.exists(pdb_path):
                 print(f"Failed to download PDB file for {pdb_id}")
-                structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
-                response = requests.get(structure_url)
-                structure_data = response.json() if response.status_code == 200 else {}
-                sequence_url = f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/1"
-                seq_response = requests.get(sequence_url)
-                seq_data = seq_response.json() if response.status_code == 200 else {}
-                sequence = seq_data.get('entity_poly', {}).get('pdbx_seq_one_letter_code', 'N/A')
                 sequences = []
-                chain_info = {
-                            'chain_id': "A", # chain.id, 임의 설정 api 3개써서 가져오기는 가능
-                            'entity_id': '1',  # Default entity ID
-                            'description': structure_data.get('struct', {}).get('title', 'N/A'),
-                            'sequence': sequence,
-                            'length': len(sequence),
-                            'resolution': structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0],
-                            'method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
-                            'release_date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
-                        }
-                sequences.append(chain_info)
                 return sequences
             # Parse structure
             parser = PDB.PDBParser(QUIET=True)
             structure = parser.get_structure(pdb_id, pdb_path)
-            # Get structure info from RCSB API for additional details
-            structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
-            response = requests.get(structure_url)
-            structure_data = response.json() if response.status_code == 200 else {}
             sequences = []
             # Extract sequences from each chain
             for model in structure:
@@ -543,9 +555,29 @@ class PDBSearchAssistant:
             print(f"Error processing query: {str(e)}")
             return {"type": "structure", "results": []}
-def render_html(pdb_id):
-    if pdb_id is None:
         return ""
     html_content = f"""
     <!DOCTYPE html>
     <html>
@@ -561,12 +593,16 @@ def render_html(pdb_id):
         </style>
     </head>
     <body>
         <div class="viewer_3Dmoljs"
              data-pdb="{pdb_id}"
              data-backgroundcolor="0xffffff"
              data-style="cartoon:color=spectrum"
              data-spin="axis:y;speed:0.2">
         </div>
     </body>
     </html>
     """
@@ -582,49 +618,14 @@ def render_html(pdb_id):
     return f'<iframe style="width: 100%; height: 480px; border: none;" srcdoc=\'{escaped_content}\'></iframe>'
 def create_interactive_table(df):
-    if df.empty:
-        return go.Figure()
     # Reorder columns - Add '# of atoms of protein' to the column order
     column_order = ['PDB ID', 'Resolution', 'Title','# of total residues', '# of atoms of protein', 'Method','Release Date']
     df = df[column_order]
     # Release Date 형식 변경 (YYYY-MM-DD)
     df['Release Date'] = pd.to_datetime(df['Release Date']).dt.strftime('%Y-%m-%d')
-    # Create interactive table
-    table = go.Figure(data=[go.Table(
-        header=dict(
-            values=list(df.columns),
-            fill_color='paleturquoise',
-            align='center',
-            font=dict(size=16),
-        ),
-        cells=dict(
-            values=[
-                [f'<a href="https://www.rcsb.org/structure/{cell}">{cell}</a>'
-                 if i == 0 else cell
-                 for cell in df[col]]
-                for i, col in enumerate(df.columns)
-            ],
-            align='center',
-            font=dict(size=15),
-            height=35
-        ),
-        columnwidth=[80, 80, 400, 100, 100, 100, 100],  # Updated columnwidth to include new column
-        customdata=[['html'] * len(df) if i == 0 else [''] * len(df)
-                   for i in range(len(df.columns))],
-        hoverlabel=dict(bgcolor='white')
-    )])
-    # Update table layout
-    table.update_layout(
-        margin=dict(l=20, r=20, t=20, b=20),
-        height=450,
-        autosize=True
-    )
-    return table
 # Simplified Shiny app UI definition
 app_ui = ui.page_fluid(
@@ -643,6 +644,15 @@ app_ui = ui.page_fluid(
                 color: #0a58ca;
                 text-decoration: underline;
             }
             .shiny-input-container {
                 max-width: 100%;
                 margin: 0 auto;
@@ -970,10 +980,13 @@ app_ui = ui.page_fluid(
                 align-items: ;
                 justify-content: flex-start;
                 gap: 5px;
                 margin-bottom: 20px;
                 margin-left: 20px;
             }
             .pdb-select-label {
                 font-weight: bold;
                 margin: 0;
@@ -1046,6 +1059,7 @@ app_ui = ui.page_fluid(
                             ui.tags.ul(
                                 ui.tags.li("Sequence of PDB ID 8ET6"),
                                 ui.tags.li("Spike protein"),
                                 ui.tags.li("Human insulin"),
                                 ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
                                 ui.tags.li("Find structures containing sequence with similarity 90% FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKR"),
@@ -1060,8 +1074,11 @@ app_ui = ui.page_fluid(
                 ui.column(12,
                     ui.div(
                         {"class": "results-section"},
-                        ui.h4("Top 10 PDBs Results"),
-                        output_widget("results_table"),
                         ui.download_button("download", "Download Results",
                                          class_="btn btn-info")
                     )
@@ -1075,7 +1092,7 @@ app_ui = ui.page_fluid(
                         "Select PDB ID",
                         {"class": "pdb-select-label"}
                     ),
-                    ui.input_select(
                         "selected_pdb",
                         "",  # Label is empty as we're using a separate label
                         choices=[],
@@ -1144,21 +1161,21 @@ def server(input, output, session):
                 status_store.set("Ready")
                 pdb_ids = df['PDB ID'].tolist()
                 @output
-                @render_widget
                 def results_table():
-                    return create_interactive_table(df)
         if pdb_ids:
             pdb_ids_store.set(pdb_ids)
             # Update only one dropdown
-            ui.update_select(
                 "selected_pdb",
                 choices=pdb_ids,
-                selected=pdb_ids[0]
             )
         else:
             pdb_ids_store.set([])
-            ui.update_select(
                 "selected_pdb",
                 choices=[],
                 selected=None
@@ -1202,8 +1219,11 @@ def server(input, output, session):
     @render.ui
     def output_iframe():
         selected_pdb = input.selected_pdb()
         if selected_pdb:
-            return ui.HTML(render_html(selected_pdb))
         return ui.HTML("")
     @output

 from transformers import pipeline
 from rcsbsearchapi import AttributeQuery
+from rcsbsearchapi.search import SequenceQuery, SeqMotifQuery
 import os
 from dotenv import load_dotenv
 from shiny import App, render, ui, reactive
+from itables.shiny import DT
 import pandas as pd
 import warnings
 import re
 load_dotenv()
 class PDBSearchAssistant:
+    def __init__(self, model_name="google/flan-t5-large"): # google/flan-t5-large or Rostlab/prot_t5_xl_uniref50 11GB
         # Set up HuggingFace pipeline with better model
         self.pipe = pipeline(
             "text2text-generation",
             max_new_tokens=1024,
             temperature=0.1,
             torch_dtype="auto",
+            device="cpu" # cuda or cpu
         )
         self.prompt_template = """
             # Clean and normalize remaining response
             # Remove all resolution entries to avoid confusion
             cleaned_response = re.sub(r'[Rr]esolution:\s*\d+(?:\.\d+)?(?:\s*Å?)?\s*', '', response)
+            print("cleaned_responese :",  cleaned_response)
             # Split remaining response into clean key-value pairs
             response_pairs = {}
             for pair in re.finditer(r'(\w+):\s*([^:]+?)(?=\s+\w+:|$)', cleaned_response):
                 key, value = pair.groups()
+                print(key, value)
                 key = key.lower()
                 value = value.strip()
                 if value.lower() not in ['none', 'n/a']:
                     response_pairs[key] = value
             print("Parsed response pairs:", response_pairs)  # Debug print
+            # case LLM remove all input, if input has any param word -> replace input to value
+            if not response_pairs:
+                if 'protein' in response:
+                    response_pairs['protein'] = response
+                    print("Replaced response pairs:", response_pairs)  # Debug print
             # Extract sequence and similarity from cleaned pairs
             if 'sequence' in response_pairs:
                 sequence = response_pairs['sequence']
                     protein_entity_query = AttributeQuery(
                         attribute="rcsb_entity_container_identifiers.entity_names.value",
                         operator="contains_words",
+                        value=protein_name
                     )
                     queries.append(protein_entity_query)
                     print("Final query:", final_query)
                     # Execute search
+                    session = final_query.exec(results_verbosity="minimal") # query return identifier, score
                     results = []
                     # Process results with additional information
                     # search_engine = ProteinSearchEngine()
                     try:
                         for entry in session:
                             try:
                                 # PDB ID 추출 방식 개선
                                 if isinstance(entry, dict):
+                                    if entry.get('score') > 0.75:
+                                        pdb_id = entry.get('identifier')
                                 elif hasattr(entry, 'identifier'):
                                     pdb_id = entry.identifier
                                 else:
                                 if not pdb_id or len(pdb_id) != 4:  # PDB ID는 항상 4자리
                                     continue
+                                # thresh hold
+                                if len(results) > 1 and results[-1]["PDB ID"] == pdb_id:
+                                    break
                                 # RCSB PDB REST API를 직접 사용하여 구조 정보 가져오기
                                 structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
                                 response = requests.get(structure_url)
                                     'Resolution': f"{structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0]:.2f}Å",
                                     'Method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
                                     'Release Date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
                                 }
                                 results.append(result)
+                                # Limit to max 500
+                                if len(results) >= 500:
                                     break
                             except Exception as e:
                                 print(f"Error processing entry: {str(e)}")
                                 continue
                     except Exception as e:
                         print(f"Error processing results: {str(e)}")
                         print(f"Error type: {type(e)}")
             pdb_path = self.pdbl.retrieve_pdb_file(
                 pdb_id,
                 pdir=self.pdb_dir,
+                file_format="pdb"
             )
+            # Get structure info from RCSB API for additional details
+            structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
+            response = requests.get(structure_url)
+            structure_data = response.json() if response.status_code == 200 else {}
             if not pdb_path or not os.path.exists(pdb_path):
                 print(f"Failed to download PDB file for {pdb_id}")
                 sequences = []
+                entity_ids = structure_data.get('rcsb_entry_container_identifiers', {}).get('polymer_entity_ids', {})
+                for i in entity_ids:
+                    sequence_url = f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/{i}"
+                    seq_response = requests.get(sequence_url)
+                    seq_data = seq_response.json() if response.status_code == 200 else {}
+                    sequence = seq_data.get('entity_poly', {}).get('pdbx_seq_one_letter_code_can', 'N/A') # pdbx_seq_one_letter_code
+                    chain_info = {
+                                'chain_id': seq_data.get('entity_poly', {}).get('pdbx_strand_id', 'N/A'), # chain.id
+                                'entity_id': i,  # Default entity ID
+                                'description': structure_data.get('struct', {}).get('title', 'N/A'),
+                                'sequence': sequence,
+                                'length': len(sequence),
+                                'resolution': structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0],
+                                'method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
+                                'release_date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
+                            }
+                    sequences.append(chain_info)
+                print("not Bio pdb list")
                 return sequences
             # Parse structure
             parser = PDB.PDBParser(QUIET=True)
             structure = parser.get_structure(pdb_id, pdb_path)
             sequences = []
             # Extract sequences from each chain
             for model in structure:
             print(f"Error processing query: {str(e)}")
             return {"type": "structure", "results": []}
+def render_html(pdb_id, chain_count):
+    if pdb_id is None or chain_count <= 0:
         return ""
+    chains = [chr(65 + i) for i in range(chain_count)]
+    # chain block
+    chain_html_blocks = "".join([
+        f"""
+        <div>
+        {pdb_id} {chain}
+        </div>
+        <div class="viewer_3Dmoljs"
+             data-pdb="{pdb_id}"
+             data-select="chain:{chain}"
+             data-backgroundcolor="0xffffff"
+             data-style="cartoon:color=spectrum"
+             data-spin="axis:y;speed:0.2">
+        </div>
+        """
+        for chain in chains
+    ])
     html_content = f"""
     <!DOCTYPE html>
     <html>
         </style>
     </head>
     <body>
+        <div>
+            {pdb_id}
+        </div>
         <div class="viewer_3Dmoljs"
              data-pdb="{pdb_id}"
              data-backgroundcolor="0xffffff"
              data-style="cartoon:color=spectrum"
              data-spin="axis:y;speed:0.2">
         </div>
+        {chain_html_blocks}
     </body>
     </html>
     """
     return f'<iframe style="width: 100%; height: 480px; border: none;" srcdoc=\'{escaped_content}\'></iframe>'
 def create_interactive_table(df):
     """Prepare the search-results frame for display in the results table.

     Restricts the frame to the display columns in a fixed order and rewrites
     'Release Date' as YYYY-MM-DD text. The input frame is left untouched.

     Args:
         df: DataFrame containing at least the columns listed in
             ``column_order``.

     Returns:
         A new DataFrame holding only the display columns, in display order.

     Raises:
         KeyError: if any display column is missing from ``df``.
     """
     # Reorder columns - Add '# of atoms of protein' to the column order
     column_order = ['PDB ID', 'Resolution', 'Title','# of total residues', '# of atoms of protein', 'Method','Release Date']
     # Explicit copy so the assignment below never writes through to the caller's frame
     df = df[column_order].copy()
     # Reformat Release Date as YYYY-MM-DD. Rows whose date cannot be parsed
     # (e.g. an 'N/A' placeholder) are shown as 'N/A' instead of raising.
     release_dates = pd.to_datetime(df['Release Date'], errors='coerce')
     df['Release Date'] = release_dates.dt.strftime('%Y-%m-%d').fillna('N/A')
     return df
 # Simplified Shiny app UI definition
 app_ui = ui.page_fluid(
                 color: #0a58ca;
                 text-decoration: underline;
             }
+            .dt-layout-cell {
+                overflow-x: auto;
+                max-width :100%;
+                max-height: 600px;
+            }
+            table colgroup col[data-dt-column="2"] {
+                width: 450px !important;
+                min-width: 450px !important;
+            }
             .shiny-input-container {
                 max-width: 100%;
                 margin: 0 auto;
                 align-items: ;
                 justify-content: flex-start;
                 gap: 5px;
+                margin-top: 20px;
                 margin-bottom: 20px;
                 margin-left: 20px;
             }
+            .pdb-selector .form-group.shiny-input-container{
+                margin-left: 250px;
+            }
             .pdb-select-label {
                 font-weight: bold;
                 margin: 0;
                             ui.tags.ul(
                                 ui.tags.li("Sequence of PDB ID 8ET6"),
                                 ui.tags.li("Spike protein"),
+                                ui.tags.li("Membrane protein"),
                                 ui.tags.li("Human insulin"),
                                 ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
                                 ui.tags.li("Find structures containing sequence with similarity 90% FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKR"),
                 ui.column(12,
                     ui.div(
                         {"class": "results-section"},
+                        ui.h4("PDB Search Results"),
+                        ui.output_ui(
+                            "results_table",
+                            # {"class": "resres"}
+                        ), #output_widget("results_table"),
                         ui.download_button("download", "Download Results",
                                          class_="btn btn-info")
                     )
                         "Select PDB ID",
                         {"class": "pdb-select-label"}
                     ),
+                    ui.input_selectize(
                         "selected_pdb",
                         "",  # Label is empty as we're using a separate label
                         choices=[],
                 status_store.set("Ready")
                 pdb_ids = df['PDB ID'].tolist()
                 @output
                 @render.ui
                 def results_table():
                     """Render the current search results as an itables DataTable."""
                     # `df` comes from the enclosing scope; it is put into display
                     # shape first, then wrapped by DT and emitted as raw HTML.
                     display_df = create_interactive_table(df)
                     table_markup = DT(display_df)
                     return ui.HTML(table_markup)
         if pdb_ids:
             pdb_ids_store.set(pdb_ids)
             # Update only one dropdown
+            ui.update_selectize(
                 "selected_pdb",
                 choices=pdb_ids,
+                selected=pdb_ids[0] # matching entity 1
             )
         else:
             pdb_ids_store.set([])
+            ui.update_selectize(
                 "selected_pdb",
                 choices=[],
                 selected=None
     @render.ui
     def output_iframe():
         """Build the 3D structure viewer for the currently selected PDB ID.

         Returns:
             ``ui.HTML`` wrapping the markup produced by ``render_html`` for
             the selected entry, or empty HTML when nothing is selected.
         """
         selected_pdb = input.selected_pdb()
         # Guard before the sequence lookup: with no selection there is nothing
         # to look up, and the lookup must not run with an empty/None PDB ID.
         if not selected_pdb:
             return ui.HTML("")
         sequences = assistant.get_sequences_by_pdb_id(selected_pdb)
         # One entry per returned sequence record; render_html uses this count
         # to decide how many per-chain viewers to emit.
         chain_cnt = len(sequences)
         return ui.HTML(render_html(selected_pdb, chain_cnt))
     @output