Fill-Mask
Transformers
Safetensors
esm
File size: 2,191 Bytes
1e6a1f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# Quick script to just get sequences out of a local BLAST database (SwissProt) using blastdbcmd
import subprocess
import os
import pandas as pd
import pickle

def _parse_fasta_record(fasta_text, entry):
    """
    Extracts the sequence from one FASTA record after validating its ID.

    Parameters:
    - fasta_text (str): Raw FASTA text (a '>' header line followed by sequence lines).
    - entry (str): Entry ID that the header is expected to start with.

    Returns:
    - str: The sequence with all line breaks removed ("" if there are no sequence lines).

    Raises:
    - ValueError: If the header line is missing or its ID differs from `entry`.
    """
    # Isolate the header line *before* extracting the ID, so a header without a
    # description (i.e. no space) cannot swallow the sequence lines into the ID.
    header, _, body = fasta_text.strip().partition("\n")
    if not header.startswith(">"):
        raise ValueError(
            f"Unexpected blastdbcmd output for entry {entry}: {header!r}"
        )

    record_id = header[1:].split(" ", 1)[0]
    # Explicit check instead of `assert`, which is stripped when Python runs with -O.
    if record_id != entry:
        raise ValueError(
            f"blastdbcmd returned ID {record_id!r} but {entry!r} was requested"
        )

    return body.replace("\n", "")


def get_sequences_from_blastdb(database_path, entries, blast_dir="ncbi-blast-2.16.0+/swissprot"):
    """
    Retrieves sequences for a list of entries from a BLAST database.

    Each entry is looked up with a separate `blastdbcmd -db ... -entry ...` call.
    The command is run with `blast_dir` as its working directory; the working
    directory of the calling Python process is left untouched.

    Parameters:
    - database_path (str): Path to the BLAST database (without file extension),
      resolved relative to `blast_dir` when it is not absolute.
    - entries (list): List of entry IDs to query.
    - blast_dir (str): Directory in which blastdbcmd is executed.

    Returns:
    - dict: A dictionary with entry IDs as keys and sequences as values.
      Entries for which blastdbcmd exits with an error map to None.

    Raises:
    - ValueError: If blastdbcmd succeeds but its output has no FASTA header or
      the header ID does not match the requested entry.
    """
    sequences = {}
    for entry in entries:
        try:
            # Run blastdbcmd command to retrieve the sequence for each entry.
            # `cwd=` scopes the directory change to the child process only.
            result = subprocess.run(
                ["blastdbcmd", "-db", database_path, "-entry", entry],
                capture_output=True, text=True, check=True,
                cwd=blast_dir,
            )
        except subprocess.CalledProcessError as e:
            print(f"Error retrieving entry {entry}: {e}")
            sequences[entry] = None  # Store None if there's an error for this entry
            continue

        # Store the sequence under the requested ID once the header is verified
        sequences[entry] = _parse_fasta_record(result.stdout, entry)

    return sequences


def main():
    """
    Looks up SwissProt sequences for the head and tail genes that produced the
    top alignments and pickles the resulting {entry ID: sequence} dictionary.

    The entry IDs are read from the first column of
    blast_outputs/ht_uniprot_query.txt (no header row).

    NOTE(review): all paths here are relative, so where the pickle lands depends
    on the working directory in effect after get_sequences_from_blastdb returns
    -- confirm this is the intended location.
    """
    pickle_path = "blast_outputs/best_htg_alignments_swissprot_seqs.pkl"

    # First column of the query file holds the entry IDs to fetch
    query_table = pd.read_csv("blast_outputs/ht_uniprot_query.txt", header=None)
    entries = list(query_table[0])

    # BLAST database name, given without a file extension
    sequences_dict = get_sequences_from_blastdb("swissprot", entries)

    with open(pickle_path, "wb") as out_file:
        pickle.dump(sequences_dict, out_file)

    # Read the file back to confirm that what was just written can be loaded
    with open(pickle_path, "rb") as in_file:
        d = pickle.load(in_file)

# Run the retrieval only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()