|
|
|
import subprocess |
|
import os |
|
import pandas as pd |
|
import pickle |
|
|
|
def _parse_fasta_record(fasta_text):
    """
    Splits one FASTA record into its identifier and its sequence.

    Parameters:
    - fasta_text (str): Text of a single FASTA record (header line plus
      sequence lines).

    Returns:
    - tuple: (identifier, sequence). The identifier is the first
      whitespace-delimited token of the header line without the leading
      '>'; the sequence is all following lines joined with every
      whitespace character removed.

    Raises:
    - ValueError: If the text does not start with a '>' header line, or
      if the identifier or the sequence is empty.
    """
    header, _, body = fasta_text.strip().partition("\n")
    if not header.startswith(">"):
        raise ValueError(f"Expected a FASTA header line, got: {header!r}")

    # Tokenise the header line on its own, so that a header without a
    # description (">ID" directly followed by the sequence) still yields
    # just the ID instead of the ID glued to the sequence.
    header_fields = header[1:].split(None, 1)
    sequence = "".join(body.split())
    if not header_fields or not sequence:
        raise ValueError(f"Incomplete FASTA record: {fasta_text!r}")

    return header_fields[0], sequence


def get_sequences_from_blastdb(
    database_path, entries, working_dir="ncbi-blast-2.16.0+/swissprot"
):
    """
    Retrieves sequences for a list of entries from a BLAST database.

    Each entry is looked up with one call to
    ``blastdbcmd -db <database_path> -entry <entry>``.

    Parameters:
    - database_path (str): Path to the BLAST database (without file extension).
      It is resolved relative to the working directory in effect after the
      optional directory change described below.
    - entries (list): List of entry IDs to query.
    - working_dir (str or None): Directory the process switches to (via
      os.chdir) before querying. Defaults to the directory this script has
      always used. Pass None to leave the working directory untouched.

    Returns:
    - dict: A dictionary with entry IDs as keys and sequences as values.
      Entries for which blastdbcmd exits with a non-zero status are mapped
      to None and an error line is printed.

    Raises:
    - ValueError: If blastdbcmd's output is not a usable FASTA record, or
      if the identifier in the returned header differs from the requested
      entry.

    NOTE: when working_dir is not None the change of directory is NOT undone
    on return, so relative paths used by the caller afterwards are resolved
    against working_dir.
    """
    sequences = {}

    if working_dir is not None:
        os.chdir(working_dir)

    for entry in entries:
        # Only the external call is guarded: a failed lookup is an expected,
        # per-entry condition, whereas a parsing problem signals unexpected
        # output and must not be mistaken for a missing entry.
        try:
            completed = subprocess.run(
                ["blastdbcmd", "-db", database_path, "-entry", entry],
                capture_output=True, text=True, check=True,
            )
        except subprocess.CalledProcessError as e:
            print(f"Error retrieving entry {entry}: {e}")
            sequences[entry] = None
            continue

        record_id, sequence = _parse_fasta_record(completed.stdout)

        # Explicit check instead of `assert`, which is removed under
        # `python -O` and would let a wrong sequence through silently.
        if record_id != entry:
            raise ValueError(
                f"blastdbcmd returned record {record_id!r} "
                f"for requested entry {entry!r}"
            )

        sequences[entry] = sequence

    return sequences
|
|
|
|
|
def main():
    """
    Fetches sequences for the listed query IDs and stores them as a pickle.

    Reads "blast_outputs/ht_uniprot_query.txt" as a header-less CSV, uses
    its first column as the entry IDs, looks them up in the "swissprot"
    database through get_sequences_from_blastdb, and dumps the resulting
    dictionary to "blast_outputs/best_htg_alignments_swissprot_seqs.pkl".
    The pickle is then loaded back once; the loaded object is not used.

    NOTE: get_sequences_from_blastdb changes the process working directory,
    so the relative pickle path is resolved against the directory it
    switches to, not the one the CSV was read from.
    """
    query_table = pd.read_csv("blast_outputs/ht_uniprot_query.txt", header=None)
    entry_ids = list(query_table[0])

    blast_db = "swissprot"
    pickle_path = "blast_outputs/best_htg_alignments_swissprot_seqs.pkl"

    id_to_sequence = get_sequences_from_blastdb(blast_db, entry_ids)

    with open(pickle_path, "wb") as sink:
        pickle.dump(id_to_sequence, sink)

    # Load the file that was just written; the result is discarded.
    with open(pickle_path, "rb") as source:
        reloaded = pickle.load(source)
|
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()