Spaces:

lkjjj26
/

query

Sleeping

File size: 14,689 Bytes

from transformers import pipeline
from rcsbsearchapi import TextQuery, AttributeQuery, Query
from rcsbsearchapi.search import Sort, SequenceQuery
import os
from dotenv import load_dotenv
from shiny import App, render, ui, reactive
import pandas as pd
import warnings
import re
from  UniprotKB_P_Sequence_RCSB_API_test import ProteinQuery, ProteinSearchEngine
import plotly.graph_objects as go
from shinywidgets import output_widget, render_widget
warnings.filterwarnings('ignore')

# Load environment variables from .env file
load_dotenv()

# os.environ["TRANSFORMERS_CACHE"] = "./transformers_cache"
# os.makedirs("./transformers_cache", exist_ok=True)

class PDBSearchAssistant:
    def __init__(self, model_name="google/flan-t5-large"):
        # Set up HuggingFace pipeline with better model
        self.pipe = pipeline(
            "text2text-generation",
            model=model_name,
            max_new_tokens=512,
            temperature=0.3,
            torch_dtype="auto",
            device="cpu"
        )
        
        self.prompt_template = """
            Extract specific search parameters from the query, if present:
            1. Resolution cutoff (in Å)
            2. Sequence information
            3. Specific PDB ID
            4. Experimental method (X-RAY, EM, NMR)

            Format:
            Resolution: [maximum resolution in Å, if mentioned]
            Sequence: [any sequence mentioned]
            PDB_ID: [specific PDB ID if mentioned]
            Method: [experimental method if mentioned]

            Examples:
            Query: "Find X-ray structures better than 2.5Å resolution"
            Resolution: 2.5
            Sequence: none
            PDB_ID: none
            Method: X-RAY

            Query: "Show me NMR structures of kinases"
            Resolution: none
            Sequence: none
            PDB_ID: none
            Method: NMR

            Now analyze:
            Query: {query}
            """

    def search_pdb(self, query):
        try:
            # Get search parameters from LLM
            formatted_prompt = self.prompt_template.format(query=query)
            response = self.pipe(formatted_prompt)[0]['generated_text']
            print("Generated parameters:", response)
            
            # Parse LLM response
            resolution_limit = None
            pdb_id = None
            sequence = None
            method = None
            has_resolution_query = False
            resolution_direction = "less"
            
            # Check if query contains resolution-related terms
            resolution_terms = {
                'better': 'less',
                'best': 'less',
                'highest': 'less',
                'good': 'less',
                'fine': 'less',
                'worse': 'greater',
                'worst': 'greater',
                'lowest': 'greater',
                'poor': 'greater',
                'resolution': None,
                'å': None,
                'angstrom': None,
                'than': None,
                'under': 'less',
                'below': 'less',
                'above': 'greater',
                'over': 'greater'
            }
            
            # Check if the original query mentions resolution
            query_lower = query.lower()
            
            # Determine resolution direction from query
            for term, direction in resolution_terms.items():
                if term in query_lower:
                    has_resolution_query = True
                    if direction:  # if not None
                        resolution_direction = direction
            
            # Also check for numerical values with Å
            if re.search(r'\d+\.?\d*\s*å?', query_lower):
                has_resolution_query = True
            
            # Clean and parse LLM response
            for line in response.split('\n'):
                if 'Resolution:' in line:
                    value = line.split('Resolution:')[1].strip()
                    if value.lower() not in ['none', 'n/a'] and has_resolution_query:
                        try:
                            # Extract just the number
                            res_value = ''.join(c for c in value if c.isdigit() or c == '.')
                            resolution_limit = float(res_value)
                        except ValueError:
                            pass
                elif 'Method:' in line:
                    value = line.split('Method:')[1].strip()
                    if value.lower() not in ['none', 'n/a']:
                        method = value.upper()
                elif 'Sequence:' in line:
                    value = line.split('Sequence:')[1].strip()
                    if value.lower() not in ['none', 'n/a']:
                        sequence = value
                elif 'PDB_ID:' in line:
                    value = line.split('PDB_ID:')[1].strip()
                    if value.lower() not in ['none', 'n/a']:
                        pdb_id = value
            
            # Build search query
            queries = []
            
            # Check if the query contains a protein sequence pattern
            # Check for amino acid sequence (minimum 25 residues)
            query_words = query.split()
            for word in query_words:
                # Check if the word consists of valid amino acid letters
                if (len(word) >= 25 and  # minimum 25 residues requirement
                    all(c in 'ACDEFGHIKLMNPQRSTVWY' for c in word.upper()) and
                    sum(c.isupper() for c in word) / len(word) > 0.8):
                    sequence = word
                    break
            
            # If sequence is found, use SequenceQuery
            if sequence:
                if len(sequence) < 25:
                    print("Warning: Sequence must be at least 25 residues long. Skipping sequence search.")
                    sequence = None
                else:
                    print(f"Adding sequence search with identity 100% for sequence: {sequence}")
                    sequence_query = SequenceQuery(
                        sequence,
                        identity_cutoff=1.0,  # 100% identity
                        evalue_cutoff=1,
                        sequence_type="protein"
                    )
                    queries.append(sequence_query)
            # If no sequence, proceed with text search
            else:
                # Clean the original query and add text search
                clean_query = query.lower()
                
                # Remove resolution numbers and terms if they exist
                if has_resolution_query:
                    clean_query = re.sub(r'\d+\.?\d*\s*å?', '', clean_query)
                    for term in resolution_terms:
                        clean_query = clean_query.replace(term, '')
                
                # Clean up extra spaces and trim
                clean_query = ' '.join(clean_query.split())
                
                print("Cleaned query:", clean_query)
                
                # Add text search if query is not empty
                if clean_query.strip():
                    text_query = AttributeQuery(
                        attribute="struct.title",
                        operator="contains_phrase",
                        value=clean_query
                    )
                    queries.append(text_query)
            
            # Add resolution filter if specified
            if resolution_limit and has_resolution_query:
                operator = "less_or_equal" if resolution_direction == "less" else "greater_or_equal"
                print(f"Adding resolution filter: {operator} {resolution_limit}Å")
                resolution_query = AttributeQuery(
                    attribute="rcsb_entry_info.resolution_combined",
                    operator=operator,
                    value=resolution_limit
                )
                queries.append(resolution_query)
            
            # Add PDB ID search if specified
            if pdb_id:
                print(f"Searching for specific PDB ID: {pdb_id}")
                id_query = AttributeQuery(
                    attribute="rcsb_id",
                    operator="exact_match",
                    value=pdb_id.upper()
                )
                queries = [id_query]  # Override other queries for direct PDB ID search
            
            # Add experimental method filter if specified
            if method:
                print(f"Adding experimental method filter: {method}")
                method_query = AttributeQuery(
                    attribute="exptl.method",
                    operator="exact_match",
                    value=method
                )
                queries.append(method_query)
            
            # Combine queries with AND operator
            if queries:
                final_query = queries[0]
                for q in queries[1:]:
                    final_query = final_query & q
                
                print("Final query:", final_query)
                
                # Execute search
                session = final_query.exec()
                results = []
                
                # Process results safely with additional information
                try:
                    for entry in session:
                        # Handle both string and object types
                        if isinstance(entry, str):
                            result = {
                                'PDB ID': entry
                            }
                        else:
                            # Handle object type
                            result = {
                                'PDB ID': entry.identifier
                            }
                        
                        results.append(result)
                except Exception as e:
                    print(f"Error processing results: {str(e)}")
                    # If error occurs during processing, at least return PDB IDs
                    if isinstance(entry, str):
                        results.append({'PDB ID': entry})
                
                print(f"Found {len(results)} structures")
                return results
            
            return []
            
        except Exception as e:
            print(f"Error during search: {str(e)}")
            print(f"Error type: {type(e)}")
            return []

def pdbsummary(name):

    search_engine = ProteinSearchEngine()

    query = ProteinQuery(
        name,
        max_resolution= 5.0
    )

    results = search_engine.search(query)

    answer = ""
    for i, structure in enumerate(results, 1):
        answer += f"\n{i}. PDB ID : {structure.pdb_id}\n"
        answer += f"\nResolution : {structure.resolution:.2f} A \n"
        answer += f"Method : {structure.method}\n Title : {structure.title}\n"
        answer += f"Release Date : {structure.release_date}\n Sequence length: {len(structure.sequence)} aa\n"
        answer += f"    Sequence:\n {structure.sequence}\n"

    return answer

def create_interactive_table(df):
    if df.empty:
        return go.Figure()
    
    # Create interactive table
    table = go.Figure(data=[go.Table(
        header=dict(
            values=list(df.columns),
            fill_color='paleturquoise',
            align='left',
            font=dict(size=14),
        ),
        cells=dict(
            values=[df[col] for col in df.columns],
            align='left',
            font=dict(size=13),
            height=30
        ),
        columnwidth=[len(str(max(df[col], key=len))) for col in df.columns]
    )])
    
    # Update table layout
    table.update_layout(
        margin=dict(l=0, r=0, t=0, b=0),
        height=400,
        autosize=True
    )
    
    return table

# Simplified Shiny app UI definition
app_ui = ui.page_fluid(
    ui.tags.head(
        ui.tags.style("""
            .table a {
                color: #0d6efd;
                text-decoration: none;
            }
            .table a:hover {
                color: #0a58ca;
                text-decoration: underline;
            }
        """)
    ),
    ui.h2("Advanced PDB Structure Search Tool"),
    ui.row(
        ui.column(12,
            ui.input_text("query", "Search Query", 
                         value="Human insulin"),
        )
    ),
    ui.row(
        ui.column(12,
            ui.p("Example queries:"),
            ui.tags.ul(
                ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
                ui.tags.li("Find structures containing sequence MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"),

            ),
        )
    ),
    ui.row(
        ui.column(12,
            ui.input_action_button("search", "Search", class_="btn-primary"),
        )
    ),
    ui.row(
        ui.column(12,
            ui.h4("Search Parameters:"),
            ui.output_text("search_conditions"),
        )
    ),
    ui.row(
        ui.column(12,
            ui.h4("Top 10 Results:"),
            output_widget("results_table"),
            ui.download_button("download", "Download Results")
        )
    )
)

def server(input, output, session):
    assistant = PDBSearchAssistant()
    results_store = reactive.Value([])
    
    @reactive.Effect
    @reactive.event(input.search)
    def _():
        results = assistant.search_pdb(query=input.query())
        results_store.set(results)
        
        # Convert results to DataFrame and add hyperlinks
        df = pd.DataFrame(results)
        if not df.empty:
            df['PDB ID'] = df['PDB ID'].apply(
                lambda x: f'<a href="https://www.rcsb.org/3d-view/{x}" target="_blank">{pdbsummary(x)}</a>'
            )
        
        @output
        @render_widget
        def results_table():
            return create_interactive_table(df) # id 순으로 정렬되는거인듯 Top rank 순은 아님
    
    @output
    @render.text
    def search_conditions():
        results = results_store.get()
        return f"""
        Applied Search Conditions:
        - Query: {input.query()}
        - Total structures found: {len(results)}
        """
    
    @output
    @render.download(filename="pdb_search_results.csv")
    def download():
        df = pd.DataFrame(results_store.get())
        return df.to_csv(index=False)

app = App(app_ui, server)

if __name__ == "__main__":
    import nest_asyncio
    nest_asyncio.apply()
    app.run(host="0.0.0.0", port=7860)