#!/bin/bash
#
# Runs jackhmmer search with bitscore thresholds
#
#SBATCH --cluster=<clustername>
#SBATCH --partition=<partitionname>
#SBATCH --account=<accountname>
#SBATCH --job-name=jackhmmer
#SBATCH --output=jackhmmer.out
#SBATCH --gres=gpu:0 # Number of GPU(s) per node.
#SBATCH --cpus-per-task=4 # CPU cores/threads
#SBATCH --mem=48000M # memory per node
#SBATCH --time=0-24:00 # Max time (DD-HH:MM)
#SBATCH --ntasks=1 # Single task; parallelism comes from --cpus-per-task, not ntasks
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
## USAGE
## Create a directory and put the wild-type query sequence in <dir>/wt.fasta
## sbatch jackhmmer.sh <dir> <bitscore_threshold> <niter> <seqdb>
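## Example invocation (placeholder paths; adjust to your setup):
##   sbatch jackhmmer.sh runs/myprotein 0.5 5 /path/to/uniref90.fasta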
dir=$1
bitscore=$2 # bits per query residue, e.g. 0.5 (converted to an absolute threshold below)
niter=$3
seqdb=$4 # location for e.g. uniref100 or uniref90 fasta files
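# Optional sanity check (an addition, not part of the original pipeline):
# abort early if an argument is missing or the query FASTA cannot be found.
if [[ -z "$seqdb" || ! -f "$1/wt.fasta" ]]; then
    echo "usage: sbatch jackhmmer.sh <dir> <bitscore_threshold> <niter> <seqdb>" >&2
    exit 1
fi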
query="$dir/wt.fasta"
tblout="$dir/targets.tblout"
alignmentfile="$dir/alignment.sto"
hmmprefix="$dir/iter"
aliprefix="$dir/iter"
wtseq=$(sed 1d "$query" | tr -d '\n') # query sequence without the FASTA header (newlines stripped in case the sequence is line-wrapped)
seqlen=${#wtseq}
bitscore=$(echo "$seqlen*$bitscore" | bc) # convert the per-residue threshold to an absolute bitscore
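# e.g. a 0.5 bits-per-residue threshold on a 200-residue query gives an absolute threshold of 100 bits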
echo "absolute bitscore threshold: $bitscore"
# EVcouplings defaults for gap penalties and substitution matrix
jackhmmer -N $niter \
    --incT $bitscore --incdomT $bitscore -T $bitscore --domT $bitscore \
    --popen 0.02 --pextend 0.4 --mx BLOSUM62 \
    --tblout $tblout -A $alignmentfile --noali --notextw \
    --chkhmm $hmmprefix --chkali $aliprefix \
    --cpu $SLURM_CPUS_PER_TASK \
    $query $seqdb
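# --chkhmm and --chkali write per-iteration checkpoints ($hmmprefix-<n>.hmm and $aliprefix-<n>.sto);
# the checkpoint alignments are converted to a2m by the loop at the end of this script.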
# convert tblout to target id list
targetidfile="$dir/target_ids.txt"
python scripts/tblout2ids.py $tblout $targetidfile
# fetch sequences
fastafile="$dir/target_seqs.fasta"
txtfile="$dir/target_seqs.txt"
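# esl-sfetch needs an SSI index for the sequence database; build one if it is missing
# (this guard is an addition to the original script)
[ -f "$seqdb.ssi" ] || esl-sfetch --index "$seqdb"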
esl-sfetch -o $fastafile -f $seqdb $targetidfile
python scripts/fasta2txt.py $fastafile $txtfile
# split into train and validation
python scripts/randsplit.py $txtfile 0.2
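# convert the final Stockholm alignment and each per-iteration checkpoint alignment to a2m format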
python src/sto2a2m.py $query $alignmentfile ${dir}/alignment
for (( i=1; i<=$niter; i++ ))
do
python src/sto2a2m.py $query $aliprefix-$i.sto $aliprefix-$i
done