|
'''
Script for converting the .a2m alignment file format to .ali.

To be used with the integrative Potts model coDCA package.
'''
|
|
|
import argparse |
|
from Bio import SeqIO |
|
|
|
# One-letter codes of the 20 standard amino acids, in alphabetical order.
alphabet = "ACDEFGHIKLMNPQRSTVWY"

# Zero-based position of each residue letter within `alphabet`.
aa_to_int = {residue: index for index, residue in enumerate(alphabet)}
|
|
|
|
|
def seq2ints(s, focus_cols):
    """Encode the selected columns of a sequence as space-separated integers.

    Characters present in ``aa_to_int`` are written as their index plus one
    (so codes start at 1). Any other character -- gap symbols, lowercase
    letters, ambiguity codes -- is written as ``len(aa_to_int) + 1``, which
    is 21 for the 20-letter alphabet.

    Args:
        s: Sequence string to encode.
        focus_cols: Iterable of zero-based positions in ``s`` to encode.

    Returns:
        The integer codes joined by single spaces, in ``focus_cols`` order.
        An empty ``focus_cols`` yields an empty string.

    Raises:
        IndexError: If a position in ``focus_cols`` lies beyond the end
            of ``s``.
    """
    # Code shared by every character outside the alphabet; derived from the
    # mapping so it stays one past the last real residue index.
    unknown = len(aa_to_int)
    return ' '.join(str(1 + aa_to_int.get(s[i], unknown)) for i in focus_cols)
|
|
|
def seqdistance(s1, s2):
    """Return the fraction of positions at which two sequences differ.

    This is the Hamming distance divided by the sequence length.

    Args:
        s1: First sequence.
        s2: Second sequence; must have the same length as ``s1``.

    Returns:
        A float in ``[0.0, 1.0]``. Two empty sequences are considered
        identical and give ``0.0``.

    Raises:
        ValueError: If the sequences have different lengths.
    """
    length = len(s1)
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if length != len(s2):
        raise ValueError(
            "sequences must have equal length (got %d and %d)"
            % (length, len(s2))
        )
    # Avoid dividing by zero: nothing to compare means nothing differs.
    if length == 0:
        return 0.0
    mismatches = sum(1 for a, b in zip(s1, s2) if a != b)
    return float(mismatches) / float(length)
|
|
|
|
|
if __name__ == '__main__':
    # Command line: <input_file> <output_file> [--theta FLOAT]
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file', type=str)
    parser.add_argument('output_file', type=str)
    # NOTE: --theta is accepted for compatibility but is not used by this
    # script.
    parser.add_argument('--theta', default=0.2, type=float)
    args = parser.parse_args()

    # Parse the alignment a single time. The first record is pulled off the
    # iterator to define the columns to keep, then written out together with
    # the remaining records.
    records = SeqIO.parse(args.input_file, "fasta")
    first = next(records, None)
    if first is None:
        # Fail with a readable message before creating the output file.
        raise SystemExit("No records found in %s" % args.input_file)

    # Reference sequence (presumably the wild type, going by the name);
    # only its uppercase positions are converted for every record.
    wt = str(first.seq)
    focus_cols = [i for i, residue in enumerate(wt) if residue.isupper()]

    count = 0
    with open(args.output_file, 'w') as seq_list:
        record = first
        while record is not None:
            # One line of space-separated integer codes per record.
            seq_list.write(seq2ints(str(record.seq), focus_cols))
            seq_list.write('\n')
            count += 1
            record = next(records, None)

    print("Converted %i records" % count)
|
|