gzhong's picture
Upload folder using huggingface_hub
7718235 verified
'''
Script for converting .a2m alignment file format to .ali.
To be used with the integrative Potts model coDCA package.
'''
import argparse
from Bio import SeqIO
alphabet = "ACDEFGHIKLMNPQRSTVWY"
aa_to_int = {c: i for i, c in enumerate(alphabet)}
def seq2ints(s, focus_cols):
ints = [1+aa_to_int.get(s[i], 20) for i in focus_cols]
return ' '.join([str(a) for a in ints])
def seqdistance(s1, s2):
assert len(s1) == len(s2)
diff = 0
for i in range(len(s1)):
if s1[i] != s2[i]:
diff += 1
return float(diff) / float(len(s1))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('input_file', type=str)
parser.add_argument('output_file', type=str)
parser.add_argument('--theta', default=0.2, type=float)
args = parser.parse_args()
records = SeqIO.parse(args.input_file, "fasta")
count = 0
wt = str(list(records)[0].seq)
focus_cols = [i for i in range(len(wt)) if wt[i].isupper()]
records = SeqIO.parse(args.input_file, "fasta")
with open(args.output_file, 'w+') as seq_list:
for record in records:
seq_list.write(seq2ints(str(record.seq), focus_cols))
seq_list.write('\n')
count += 1
print("Converted %i records" % count)