Fill-Mask
Transformers
Safetensors
esm
FusOn-pLM / fuson_plm /data /blast /extract_blast_seqs.py
root
uploading data folder
1e6a1f0
# Quick script to extract sequences for a list of entry IDs from a local BLAST database (SwissProt) using blastdbcmd.
import subprocess
import os
import pandas as pd
import pickle
def _parse_fasta_record(fasta_text, entry):
    """
    Pulls the sequence out of the FASTA text printed by blastdbcmd for one entry.
    Parameters:
    - fasta_text (str): Raw stdout of the blastdbcmd call.
    - entry (str): The entry ID that was queried; the record's ID must match it.
    Returns:
    - str: The sequence with all line breaks removed.
    Raises:
    - ValueError: If the text is not a FASTA record with a sequence body, or if
      the ID in the header differs from `entry`.
    """
    header, _, body = fasta_text.strip().partition("\n")
    if not header.startswith(">") or not body:
        raise ValueError(
            f"Unexpected blastdbcmd output for entry {entry}: {header!r}"
        )
    # The record ID is the first space-delimited token of the header, minus '>'
    record_id = header[1:].split(" ", 1)[0]
    if record_id != entry:
        # Explicit raise instead of `assert`, which is stripped under `python -O`
        raise ValueError(
            f"blastdbcmd returned ID {record_id!r} for requested entry {entry!r}"
        )
    # Sequence lines are wrapped in the output; join them into one string
    return body.replace("\n", "")


def get_sequences_from_blastdb(database_path, entries, blast_dir="ncbi-blast-2.16.0+/swissprot"):
    """
    Retrieves sequences for a list of entries from a BLAST database.
    Parameters:
    - database_path (str): Path to the BLAST database (without file extension).
      blastdbcmd is run inside `blast_dir`, so a relative path is interpreted
      from there.
    - entries (list): List of entry IDs to query.
    - blast_dir (str or None): Working directory for each blastdbcmd call.
      Defaults to the directory this script previously chdir'ed into. Pass
      None to run in the current working directory.
    Returns:
    - dict: A dictionary with entry IDs as keys and sequences as values.
      An entry maps to None when blastdbcmd exits with a non-zero status.
    Raises:
    - ValueError: If blastdbcmd succeeds but its output is not a FASTA record
      for the requested entry.
    - FileNotFoundError: If the blastdbcmd executable or `blast_dir` is missing
      (propagated from subprocess.run).
    """
    sequences = {}
    for entry in entries:
        try:
            # Run blastdbcmd command to retrieve the sequence for each entry.
            # `cwd=` scopes the directory change to the child process, so the
            # caller's working directory (and its relative paths) is untouched.
            completed = subprocess.run(
                ["blastdbcmd", "-db", database_path, "-entry", entry],
                capture_output=True, text=True, check=True,
                cwd=blast_dir,
            )
        except subprocess.CalledProcessError as e:
            print(f"Error retrieving entry {entry}: {e}")
            sequences[entry] = None  # Store None if there's an error for this entry
            continue
        # Store the output in the dictionary (entry ID as key, sequence as value),
        # after checking the returned ID is the one that was asked for
        sequences[entry] = _parse_fasta_record(completed.stdout, entry)
    return sequences
def main():
    """
    Looks up the SwissProt sequence of every ID listed in
    blast_outputs/ht_uniprot_query.txt and pickles the resulting
    {ID: sequence} dictionary.
    """
    output_path = "blast_outputs/best_htg_alignments_swissprot_seqs.pkl"

    # The query file has no header row; its first column holds the IDs of the
    # head and tail genes that produced the top alignments
    query_table = pd.read_csv("blast_outputs/ht_uniprot_query.txt", header=None)
    entry_ids = list(query_table[0])

    # "swissprot" is the BLAST database name, given without a file extension
    id_to_sequence = get_sequences_from_blastdb("swissprot", entry_ids)

    with open(output_path, "wb") as out_file:
        pickle.dump(id_to_sequence, out_file)

    # Read the file straight back as a check that it can be unpickled
    with open(output_path, "rb") as in_file:
        reloaded = pickle.load(in_file)
# Run the extraction only when executed as a script, not when imported
if __name__ == "__main__":
    main()