"""Search UniProt for a protein and list the matching PDB structures."""

import sys
from dataclasses import dataclass
from typing import Dict, List, Optional

import requests

# Seconds to wait for any single HTTP request; without a timeout `requests`
# blocks indefinitely when a server stops responding.
REQUEST_TIMEOUT = 30

# Only this many of the top UniProt hits are expanded into PDB lookups.
MAX_UNIPROT_ENTRIES = 5

PDB_SEARCH_URL = "https://search.rcsb.org/rcsbsearch/v2/query"
PDB_DATA_URL = "https://data.rcsb.org/rest/v1/core"


@dataclass
class ProteinQuery:
    """Search criteria passed to ``ProteinSearchEngine.search``."""

    # Free-text protein name; sent to UniProt as a quoted phrase.
    name: str
    # Optional organism name used to narrow the UniProt search.
    organism: Optional[str] = None
    # Not read by any code in this file.
    mutations: Optional[List[str]] = None
    # Inclusive resolution bounds; ``None`` disables the bound.
    min_resolution: Optional[float] = None
    max_resolution: Optional[float] = None


@dataclass
class ProteinStructure:
    """Summary of one PDB entry as returned by ``ProteinSearchEngine``."""

    pdb_id: str
    # 0.0 when the entry reports no resolution (see ``_fetch_structure``).
    resolution: float
    # One-letter sequence of polymer entity 1, or "" if it could not be fetched.
    sequence: str
    title: str
    method: str
    release_date: str


class ProteinSearchEngine:
    """Look up proteins in UniProt and collect their PDB structures."""

    def __init__(self, debug=False):
        """Create an engine.

        Args:
            debug: When true, diagnostic messages are written to stderr.
        """
        self.debug = debug
        self.uniprot_api = "https://rest.uniprot.org/uniprotkb"
        self.pdb_api = "https://data.rcsb.org/graphql"

    def _debug_print(self, message: str) -> None:
        """Write *message* to stderr when debugging is enabled."""
        if self.debug:
            # stderr keeps diagnostics out of any piped stdout output.
            print(message, file=sys.stderr)

    def _get_uniprot_data(self, query: ProteinQuery) -> Dict:
        """Search UniProt for the basic protein information.

        Args:
            query: Criteria; only ``name`` and ``organism`` are used here.

        Returns:
            The decoded JSON response, or an empty dict when UniProt answers
            with a non-200 status.

        Raises:
            requests.RequestException: On connection failure or timeout.
        """
        search_query = f'"{query.name}"'
        if query.organism:
            # `organism_name` is the field name in the current UniProt REST
            # API; the legacy `organism` field is rejected as invalid.
            search_query += f' AND organism_name:"{query.organism}"'

        params = {"query": search_query, "format": "json"}
        self._debug_print(f"UniProt search query: {search_query}")

        response = requests.get(
            f"{self.uniprot_api}/search",
            params=params,
            timeout=REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            # Mirror the PDB path: a failed lookup yields "no results"
            # instead of a JSON decoding error on an error page.
            self._debug_print(f"Error querying UniProt: {response.text}")
            return {}

        data = response.json()
        self._debug_print(
            f"UniProt results count: {len(data.get('results', []))}"
        )
        return data

    def _fetch_structure(self, pdb_id: str) -> Optional[ProteinStructure]:
        """Fetch entry details and the entity-1 sequence for one PDB ID.

        Returns:
            The populated structure, or ``None`` when the entry request does
            not return status 200.
        """
        entry_response = requests.get(
            f"{PDB_DATA_URL}/entry/{pdb_id}", timeout=REQUEST_TIMEOUT
        )
        if entry_response.status_code != 200:
            return None
        entry = entry_response.json()

        # Only the first polymer entity is queried.
        # NOTE(review): entity 1 is not necessarily the chain that matched
        # the UniProt accession - confirm whether that matters to callers.
        entity_response = requests.get(
            f"{PDB_DATA_URL}/polymer_entity/{pdb_id}/1",
            timeout=REQUEST_TIMEOUT,
        )
        sequence = ""
        if entity_response.status_code == 200:
            entity_poly = entity_response.json().get("entity_poly") or {}
            sequence = entity_poly.get("pdbx_seq_one_letter_code", "")

        # `or` also covers keys that are present but null/empty, which would
        # otherwise fail on the `[0]` index below.
        resolutions = (entry.get("rcsb_entry_info") or {}).get(
            "resolution_combined"
        ) or [0.0]
        experiments = entry.get("exptl") or [{}]

        # NOTE(review): entries without a reported resolution keep the
        # historical 0.0 placeholder, so they sort first and pass any
        # max_resolution filter.
        return ProteinStructure(
            pdb_id=pdb_id,
            resolution=float(resolutions[0]),
            sequence=sequence,
            method=experiments[0].get("method", ""),
            title=(entry.get("struct") or {}).get("title", ""),
            release_date=(entry.get("rcsb_accession_info") or {}).get(
                "initial_release_date", ""
            ),
        )

    def _get_pdb_structures(
        self, uniprot_id: str, uniprot_sequence: Optional[str] = None
    ) -> List[ProteinStructure]:
        """Find PDB entries that reference a UniProt accession.

        Args:
            uniprot_id: UniProt accession to match exactly.
            uniprot_sequence: Accepted for caller compatibility; unused.

        Returns:
            One ``ProteinStructure`` per hit whose entry details could be
            fetched; an empty list when the search does not return 200.

        Raises:
            requests.RequestException: On connection failure or timeout.
        """
        reference_prefix = (
            "rcsb_polymer_entity_container_identifiers"
            ".reference_sequence_identifiers"
        )
        # NOTE(review): no pagination options are sent, so presumably only
        # the service's default first page of hits is returned - confirm.
        query = {
            "query": {
                "type": "group",
                "logical_operator": "and",
                "nodes": [
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": f"{reference_prefix}.database_accession",
                            "operator": "exact_match",
                            "value": uniprot_id,
                        },
                    },
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": f"{reference_prefix}.database_name",
                            "operator": "exact_match",
                            "value": "UniProt",
                        },
                    },
                ],
            },
            "return_type": "entry",
        }

        response = requests.post(
            PDB_SEARCH_URL, json=query, timeout=REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            self._debug_print(f"Error querying PDB: {response.text}")
            return []

        structures = []
        for hit in response.json().get("result_set", []):
            structure = self._fetch_structure(hit["identifier"])
            if structure is not None:
                structures.append(structure)
        return structures

    @staticmethod
    def _within_resolution_bounds(
        structure: ProteinStructure, query: ProteinQuery
    ) -> bool:
        """Return whether *structure* satisfies the query's resolution bounds."""
        # `is not None` rather than truthiness so a bound of 0.0 is honoured.
        if (
            query.min_resolution is not None
            and structure.resolution < query.min_resolution
        ):
            return False
        if (
            query.max_resolution is not None
            and structure.resolution > query.max_resolution
        ):
            return False
        return True

    def search(self, query: ProteinQuery) -> List[ProteinStructure]:
        """Search for protein structures matching *query*.

        Returns:
            Unique structures (by PDB ID) within the resolution bounds,
            sorted by ascending resolution; empty when UniProt has no hits.

        Raises:
            requests.RequestException: On connection failure or timeout.
        """
        # 1. Look up the protein in UniProt.
        uniprot_data = self._get_uniprot_data(query)
        entries = uniprot_data.get("results")
        if not entries:
            self._debug_print("No UniProt results found")
            return []

        # 2. Collect PDB structures for the top UniProt entries. The same PDB
        #    entry can reference several accessions, so keep each ID once.
        seen_ids = set()
        all_structures = []
        for entry in entries[:MAX_UNIPROT_ENTRIES]:
            uniprot_id = entry["primaryAccession"]
            sequence = entry.get("sequence", {}).get("value", "")
            self._debug_print(f"Processing UniProt ID: {uniprot_id}")
            self._debug_print(
                f"UniProt Sequence ({len(sequence)} aa):\n{sequence}"
            )

            for structure in self._get_pdb_structures(uniprot_id, sequence):
                if structure.pdb_id in seen_ids:
                    continue
                seen_ids.add(structure.pdb_id)
                all_structures.append(structure)

        self._debug_print(f"Total structures found: {len(all_structures)}")

        # 3. Filter by resolution.
        filtered_structures = [
            structure
            for structure in all_structures
            if self._within_resolution_bounds(structure, query)
        ]
        self._debug_print(
            f"Structures after resolution filter: {len(filtered_structures)}"
        )

        # 4. Sort by resolution, best (smallest) first.
        filtered_structures.sort(key=lambda x: x.resolution)
        return filtered_structures


def main():
    """Run a sample hemoglobin search and write the results to a text file."""
    search_engine = ProteinSearchEngine(debug=True)

    # Broad search: accept anything at 5 angstrom resolution or better.
    query = ProteinQuery(
        name="human hemoglobin A",
        max_resolution=5.0,  # relaxed resolution limit
    )

    results = search_engine.search(query)

    # Explicit UTF-8 because the report contains the non-ASCII "Å" symbol.
    with open('protein_search_results.txt', 'w', encoding='utf-8') as f:
        f.write(f"Search Query: {query.name}\n")
        if query.organism:
            f.write(f"Organism: {query.organism}\n")
        f.write(f"Resolution Filter: <= {query.max_resolution} Å\n\n")

        f.write(f"Found {len(results)} structures matching the criteria:\n")
        for i, structure in enumerate(results, 1):
            f.write(f"\n{i}. PDB ID: {structure.pdb_id}\n")
            f.write(f"   Resolution: {structure.resolution:.2f} Å\n")
            f.write(f"   Method: {structure.method}\n")
            f.write(f"   Title: {structure.title}\n")
            f.write(f"   Release Date: {structure.release_date}\n")
            f.write(f"   Sequence Length: {len(structure.sequence)} aa\n")
            f.write(f"   Sequence:\n{structure.sequence}\n")
            f.write("-" * 80 + "\n")

    print("Results have been saved to 'protein_search_results.txt'")


if __name__ == "__main__":
    main()