|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
use strict; |
|
use warnings; |
|
# Defaults that are also quoted in the help text below.
my $numres=100;     # residues per output line (option -l)
my $desclen=1000;   # maximum number of characters kept from a name line (option -d)
my $ARGC=scalar(@ARGV);

# At least an input file and an output file are required; otherwise print the
# help text and stop.  The heredoc interpolates $numres and $desclen so the
# documented defaults always match the values set above.
if ($ARGC<2) {
    die(<<"END_USAGE");

reformat.pl from HHsuite3
Read a multiple alignment in one format and write it in another format
Usage: reformat.pl [informat] [outformat] infile outfile [options]
  or   reformat.pl [informat] [outformat] 'fileglob' .ext [options]

Available input formats:
   fas:     aligned fasta; lower and upper case equivalent, '.' and '-' equivalent
   a2m:     aligned fasta; inserts: lower case, matches: upper case, deletes: '-',
            gaps aligned to inserts: '.'
   a3m:     like a2m, but gaps aligned to inserts MAY be omitted
   sto:     Stockholm format; sequences in several blocks with sequence name at
            beginning of line (hmmer output)
   psi:     format as read by PSI-BLAST using the -B option (like sto with -M first -r)
   clu:     Clustal format; sequences in several blocks with sequence name at beginning
            of line
Available output formats:
   fas:     aligned fasta; all gaps '-'
   a2m:     aligned fasta; inserts: lower case, matches: upper case, deletes: '-', gaps
            aligned to inserts: '.'
   a3m:     like a2m, but gaps aligned to inserts are omitted
   sto:     Stockholm format; sequences in just one block, one line per sequence
   psi:     format as read by PSI-BLAST using the -B option
   clu:     Clustal format
If no input or output format is given the file extension is interpreted as format
specification ('aln' as 'clu')

Options:
  -v int    verbose mode (0:off, 1:on)
  -num      add number prefix to sequence names: 'name', '1:name' '2:name' etc
  -noss     remove secondary structure sequences (beginning with >ss_)
  -sa       do not remove solvent accessibility sequences (beginning with >sa_)
  -M first  make all columns with residue in first sequence match columns
            (default for output format a2m or a3m)
  -M int    make all columns with less than X% gaps match columns
            (for output format a2m or a3m)
  -r        remove all lower case residues (insert states)
            (AFTER -M option has been processed)
  -r int    remove all lower case columns with more than X% gaps
  -g ''     suppress all gaps
  -g '-'    write all gaps as '-'
  -uc       write all residues in upper case (AFTER all other options have been processed)
  -lc       write all residues in lower case (AFTER all other options have been processed)
  -l        number of residues per line (for Clustal, FASTA, A2M, A3M formats)
            (default=$numres)
  -d        maximum number of characters in nameline (default=$desclen)

Examples: reformat.pl 1hjra.a3m 1hjra.a2m
          (same as reformat.pl a3m a2m 1hjra.a3m 1hjra.a2m)
          reformat.pl test.a3m test.fas -num -r 90
          reformat.pl fas sto '*.fasta' .stockholm

END_USAGE
}
|
|
|
# ---------------------------------------------------------------------------
# File-level state shared by the option parser and by reformat().
# ---------------------------------------------------------------------------

# Positional arguments: formats and file names (empty until resolved).
my ($informat, $outformat, $infile, $outfile) = ("", "", "", "");

# Boolean switches.
my $num  = 0;               # -num : append a running number to sequence names
my $noss = 0;               # -noss: drop ss_* sequences
my $nosa = 1;               # cleared by -sa: keep sa_* sequences

# Scratch variables used while reading input.
my $line;
my $options = "";           # remaining (not yet consumed) command-line text

# The alignment currently being processed.
my (@names, @seqs);
my ($n, $k);                # number of sequences / running index

# Column-editing options.
my $remove_inserts = 0;     # -r
my $remove_gapped  = 0;     # -r int
my $matchmode      = "";    # "", "first" or "gaprule" (-M)
my $match_gaprule  = 0;     # percentage given with -M int

my $v      = 2;             # verbosity
my $update = 0;             # -u: skip files whose output already exists
my $nss    = -1;            # index recorded for an ss_* sequence, -1 if none
my $lname;                  # -lname: width of the name column
my $titleline;              # leading '#...' line of a FASTA-like input

# Format keys accepted on input and on output.
my @informats  = qw(fas a2m a3m sto psi clu);
my @outformats = qw(fas a2m a3m sto psi clu ufas);
my ($found, $element);

# "default" means: leave gap characters / letter case as the output format dictates.
my $gap  = "default";
my $case = "default";
|
|
|
|
|
# Flatten the command line into one string in which every argument is padded
# with a blank on both sides.  Each option can then be recognised -- and removed
# from the string -- with a simple ' -opt ' pattern; whatever is left afterwards
# are the positional arguments (formats and file names).
$options .= join("", map { " $_ " } @ARGV);

# The order of the tests matters: the variants that take a value must be tried
# before the bare switch of the same name (-r int before -r, -v int before -v).
if ($options=~s/ -i\s+(\S+) / /)       {$infile=$1;}
if ($options=~s/ -o\s+(\S+) / /)       {$outfile=$1;}
if ($options=~s/ -num / /)             {$num=1; $desclen=505;}
if ($options=~s/ -noss / /)            {$noss=1;}
if ($options=~s/ -sa / /)              {$nosa=0;}
if ($options=~s/ -g\s+\'?(\S*)\'? / /) {$gap=$1;}
if ($options=~s/ -r\s+(\d+) / /)       {$remove_gapped=$1;}
if ($options=~s/ -r / /)               {$remove_inserts=1;}
if ($options=~s/ -lc / /)              {$case="lc";}
if ($options=~s/ -uc / /)              {$case="uc";}
if ($options=~s/ -v\s*(\d+) / /)       {$v=$1;}
if ($options=~s/ -v / /)               {$v=2;}
if ($options=~s/ -M\s+(\d+) / /)       {$matchmode="gaprule"; $match_gaprule=$1;}
# '-M first' carries no number: the pattern has no capture group, so $1 must
# not be read here and $match_gaprule keeps its current value.
if ($options=~s/ -M\s+first / /)       {$matchmode="first";}
if ($options=~s/ -u / /)               {$update=1;}
if ($options=~s/ -l\s+(\S+) / /)       {$numres=$1;}
if ($options=~s/ -lname\s+(\S+) / /)   {$lname=$1;}
if ($options=~s/ -d\s+(\S+) / /)       {$desclen=$1;}

# $numres is later used as a line length and $desclen as a substr() length, so
# reject anything that is not a plain integer with a clear message up front.
if ($numres!~/^\d+$/ || $numres<1) {
    die("Error: option -l requires a positive integer, got '$numres'\n");
}
if ($desclen!~/^\d+$/) {
    die("Error: option -d requires a non-negative integer, got '$desclen'\n");
}
|
|
|
|
|
# Derive a format key from the extension of a file name.
#   $filename : file name (or, in batch mode, a bare extension such as '.a3m')
# Returns the lower-cased extension with the common aliases mapped to their
# format key ('aln' -> 'clu'; 'fa', 'fasta', 'afa', 'afas', 'afasta' -> 'fas'),
# or nothing when the name has no extension.  The result is not checked against
# the list of supported formats; the caller does that.
sub _format_from_filename
{
    my ($filename) = @_;
    return unless $filename =~ /\S*\.(\S+?)$/;
    my $ext = lc($1);
    return "clu" if $ext eq "aln";
    return "fas" if $ext =~ /^(?:fa|fasta|afa|afas|afasta)$/;
    return $ext;
}

# The positional arguments are taken from the END of what is left of the
# command line, in this order: outfile, infile, outformat, informat.
# Files given with -o / -i are not looked for again.
if ($outfile eq "") {
    if ($options=~s/(\S+)\s*$//) {
        $outfile=$1;
    } else {
        die("Error: no output file given: '$options'\n");
    }
}
if ($infile eq "") {
    if ($options=~s/(\S+)\s*$//) {
        $infile=$1;
    } else {
        die("Error: no input file given: '$options'\n");
    }
}

# Output format: explicit argument, else the extension of the output file,
# else FASTA.
if ($options=~s/(\S+)\s*$//) {
    $outformat=$1;
} else {
    my $guess=_format_from_filename($outfile);
    if (defined $guess) {
        $outformat=$guess;
    } else {
        print ("Using FASTA output format: '$options'\n"); $outformat="fas";
    }
}

# Input format: explicit argument, else the extension of the input file,
# else FASTA.  The same alias table as for the output format is used.
if ($options=~s/(\S+)\s*$//) {
    $informat=$1;
} else {
    my $guess=_format_from_filename($infile);
    if (defined $guess) {
        $informat=$guess;
    } else {
        print ("Using FASTA input format: '$options'\n"); $informat="fas";
    }
}

# Anything still left over was not understood; warn but carry on.
if ($options!~/^\s*$/) {
    $options=~s/^\s*(.*?)\s*$/$1/g;
    print("\nWARNING: unknown options '$options'\n");
}
|
|
|
|
|
# Both format keys must be among the supported ones; stop with an error otherwise.
$found = scalar(grep { $informat eq $_ } @informats) ? 1 : 0;
die("\nError: $informat is not a valid input format option\n") unless $found;

$found = scalar(grep { $outformat eq $_ } @outformats) ? 1 : 0;
die("\nError: $outformat is not a valid output format option\n") unless $found;

# 'ufas' is FASTA output with every gap character suppressed.
$gap = "" if $outformat eq "ufas";
|
|
|
|
|
# Batch mode is selected by a '*' in the input name or by an output argument
# that is a bare extension ('.a3m').  An output PATH that merely starts with a
# dot ('./out.a3m', '../out.a3m') is a normal file name, not an extension.
if ($infile=~/\*/ || $outfile=~/^\.[^\/]*$/)
{
    # Every input file is written to <input base name>.<output extension>.
    my ($outext) = $outfile=~/.*\.(\S*)$/;
    if (!defined $outext) {
        die("Error: in batch mode the output argument must be a file extension such as '.a3m', got '$outfile'\n");
    }
    my @infiles=glob($infile);
    printf("%i files to reformat\n",scalar(@infiles));
    foreach my $file (@infiles)
    {
        # Base name = everything before the last '.'; a name without an
        # extension is used as it is.  The captures are copied into a lexical
        # right away because $1 does not survive the block it was set in.
        my ($base) = $file=~/(\S+)\.\S+/;
        ($base) = $file=~/(\S+)/ unless defined $base;
        $outfile="$base.$outext";
        if ($update && -e $outfile) {next;}   # -u: do not redo existing outputs
        if ($v>=3) {print("Reformatting $file from $informat to $outformat ...\n");}
        # Called with '&' so that the call does not depend on any prototype
        # declared for reformat(), which is defined further down in the file.
        &reformat($file,$outfile);
    }
    exit 0;
}
else
{
    # Single-file mode.
    if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");}
    &reformat($infile,$outfile);
    exit 0;
}
|
|
|
|
|
|
|
|
|
|
|
# reformat($infile, $outfile)
#
# Read the alignment in $infile (format $informat), apply the column and name
# edits selected on the command line and write the result to $outfile (format
# $outformat).
#
#   $infile  : name of the alignment file to read
#   $outfile : name of the file to write; when omitted the file-level $outfile
#              is used
#
# Works on the file-level state (@names, @seqs, $n, $titleline, $nss), which is
# reset on every call so that nothing leaks from one file to the next in batch
# mode.  The option variables ($matchmode, $lname, ...) are only read.
# Dies on unreadable or malformed input and on unwritable output; exits with
# status 1 when an aligned output format is requested but the sequences differ
# in length.  Returns nothing.
sub reformat
{
    $infile    = $_[0];
    $outfile   = $_[1] if defined $_[1];
    $nss       = -1;
    $titleline = "";
    @names     = ();
    @seqs      = ();
    $n         = 0;

    # $numres is used as a line length further down; an invalid value would
    # otherwise end in an endless loop or a broken pattern.
    if ($numres !~ /^\d+$/ || $numres < 1) {
        die("Error: number of residues per line must be a positive integer, got '$numres'\n");
    }

    # ---- read the input alignment ------------------------------------------
    open(my $in_fh, "<", $infile) or die ("ERROR: cannot open $infile: $!\n");
    if ($informat eq "fas" || $informat eq "a2m" || $informat eq "a3m") {
        _read_fasta_like($in_fh);
    } elsif ($informat eq "sto") {
        _read_stockholm($in_fh);
    } elsif ($informat eq "clu") {
        _read_blocked_alignment($in_fh, 1);
    } elsif ($informat eq "psi") {
        _read_blocked_alignment($in_fh, 0);
    }
    close($in_fh);

    if ($n==0) {die("\nERROR: input file $infile contains no sequences\n");}

    # ---- normalise the residue strings -------------------------------------
    # Only a2m/a3m give letter case a meaning; everything else is upper-cased.
    if ($informat ne "a3m" && $informat ne "a2m") {
        for my $seq (@seqs) {$seq =~ tr/a-z/A-Z/;}
    }
    # Drop everything that is neither residue nor gap (line breaks, the '>'
    # record separator, ...) and write '~' gaps as '-'.
    for my $seq (@seqs) {
        $seq =~ tr/A-Za-z0-9.~-//cd;
        $seq =~ tr/~/-/;
    }

    # Effective match-state rule for THIS file.  Output formats a2m/a3m default
    # to "first".  The rule is settled before deciding about gap insertion,
    # because reassigning match states needs aligned columns, and it is kept in
    # a local variable so one file cannot change the behaviour for the next.
    my $mode = $matchmode;
    if ($mode eq "" && ($outformat eq "a3m" || $outformat eq "a2m")) {$mode="first";}

    # ---- a3m input: re-insert the omitted '.' gaps -------------------------
    # Unnecessary only if inserts are removed anyway AND no match-state rule
    # has to look at the columns.
    if ($informat eq "a3m" && (!$remove_inserts || $mode)) {
        print("inserting gaps...\n");
        # Splitting "#seq#" at every match-state character (upper case, gap,
        # digit) yields, alternately, an insert string and a match character.
        my $match_char = qr/([A-Z]|-|~|[0-9])/;
        my @max_len;    # longest piece seen at each position over all sequences

        for my $i (0 .. $n-1) {
            my @pieces = split($match_char, "#".$seqs[$i]."#");
            for my $j (0 .. $#pieces) {
                my $len = length($pieces[$j]);
                if (!defined $max_len[$j] || $len > $max_len[$j]) {$max_len[$j]=$len;}
            }
        }
        # Pad every piece with '.' up to the longest piece at its position.
        for my $i (0 .. $n-1) {
            my @pieces = split($match_char, "#".$seqs[$i]."#");
            for my $j (0 .. $#pieces) {
                my $missing = $max_len[$j] - length($pieces[$j]);
                $pieces[$j] .= "." x $missing if $missing > 0;
            }
            $seqs[$i] = join("", @pieces);
            $seqs[$i] =~ tr/\#//d;
        }
    }

    # ---- match-state assignment --------------------------------------------
    if ($mode eq "gaprule") {
        # Match columns are those with fewer than $match_gaprule percent gaps.
        my @gaps      = _count_gaps_per_column();
        my $threshold = 0.01*$match_gaprule*$n;
        my $width     = 0;
        for my $seq (@seqs) {$width = length($seq) if length($seq) > $width;}
        my @is_match = map { (!defined $gaps[$_] || $gaps[$_] < $threshold) ? 1 : 0 } 0 .. $width-1;
        _assign_match_states(\@is_match);
    }
    elsif ($mode eq "first") {
        # Match columns are those in which the first sequence that is not an
        # ss_/aa_/sa_ annotation has a residue.
        my $reference = "";
        for my $i (0 .. $n-1) {
            if ($names[$i] !~ /^(ss_|aa_|sa_)/) {$reference = $seqs[$i]; last;}
        }
        my @is_match = map { ($_ eq "." || $_ eq "-") ? 0 : 1 } split(//, $reference);
        _assign_match_states(\@is_match);
    }

    # ---- -r int: remove columns with too many gaps -------------------------
    if ($remove_gapped) {
        my @gaps      = _count_gaps_per_column();
        my $threshold = 0.01*$remove_gapped*$n;
        for my $i (0 .. $n-1) {
            my $kept = "";
            my $col  = 0;
            for my $ch (split(//, $seqs[$i])) {
                $kept .= $ch if !defined $gaps[$col] || $gaps[$col] < $threshold;
                $col++;
            }
            $seqs[$i] = $kept;
        }
    }

    # ---- -r: remove insert states ------------------------------------------
    if ($remove_inserts) {
        for my $seq (@seqs) {$seq =~ tr/a-z.//d;}
    }

    # ---- drop sequences without any residue --------------------------------
    my $nin = $n;       # number of sequences read, for the final report
    my (@kept_names, @kept_seqs);
    for my $i (0 .. $n-1) {
        if (($seqs[$i] =~ tr/a-zA-Z0-9//) == 0) {
            if ($v>=2) {print("Sequence contains only gaps and is removed: $names[$i]\n");}
            next;
        }
        push(@kept_names, $names[$i]);
        push(@kept_seqs,  $seqs[$i]);
    }
    @names = @kept_names;
    @seqs  = @kept_seqs;
    $n     = scalar(@seqs);
    if ($n==0) {die("\nERROR: all sequences in $infile contain only gaps\n");}

    # Cut name lines to the maximum length.
    for my $name (@names) {$name = substr($name, 0, $desclen);}

    # ---- gap characters and letter case of the output ----------------------
    if ($outformat eq "a3m") {
        for my $seq (@seqs) {$seq =~ tr/.//d;}
    } elsif ($outformat eq "fas" || $outformat eq "clu" || $outformat eq "sto" || $outformat eq "psi" ) {
        for my $seq (@seqs) {$seq =~ tr/./-/;}
    }
    if ($gap ne "default") {
        for my $seq (@seqs) {$seq =~ s/\./$gap/g; $seq =~ s/-/$gap/g;}
    }
    if ($case eq "uc") {
        for my $seq (@seqs) {$seq =~ tr/a-z/A-Z/;}
    } elsif ($case eq "lc") {
        for my $seq (@seqs) {$seq =~ tr/A-Z/a-z/;}
    }

    # ---- all aligned output formats need sequences of equal length ---------
    if ($outformat ne "a3m" && $outformat ne "ufas") {
        my $len = length($seqs[0]);
        for my $i (1 .. $n-1) {
            next if length($seqs[$i]) == $len;
            # The file name is passed as an argument, not interpolated into the
            # format string, so a '%' in it cannot corrupt the message.
            printf("\nError: Sequences in %s do not all have same length, e.g. >%-.20s (len=%i) and >%-.20s (len=%i)\n",
                   $infile, $names[0], $len, $names[$i], length($seqs[$i]));
            if ($v>=3) {
                printf("%.20s %s\n%.20s %s\n\n", $names[0], $seqs[0], $names[$i], $seqs[$i]);
            }
            exit 1;
        }
    }

    # Remove html tags from the names.
    for my $name (@names) {$name =~ s/<[A-Za-z\/].*?>//g;}

    # ---- locate the special sequences --------------------------------------
    my $ndssp  = -1;    # ss_dssp
    my $npred  = -1;    # ss_pred
    my $nconf  = -1;    # ss_conf
    my $nquery = -1;    # first sequence that is none of the above, sa_dssp or aa_*
    for my $i (0 .. $n-1) {
        if    ($names[$i] =~ /^ss_dssp/) {$ndssp = $i;}
        elsif ($names[$i] =~ /^sa_dssp/) { }    # recognised only so it never becomes the query
        elsif ($names[$i] =~ /^ss_pred/) {$npred = $i;}
        elsif ($names[$i] =~ /^ss_conf/) {$nconf = $i;}
        elsif ($nquery==-1 && $names[$i] !~ /^aa_/) {$nquery = $i;}
    }

    # ---- write the output alignment ----------------------------------------
    open(my $out_fh, ">", $outfile) or die ("cannot open $outfile:$!\n");
    if ($outformat eq "sto" || $outformat eq "psi") {
        my $width   = $lname || 32;
        my $row_fmt = "%-${width}.${width}s %s\n";

        if ($outformat eq "sto") {
            print {$out_fh} "# STOCKHOLM 1.0\n\n";
            if ($nquery >= 0) {
                # Description = name line of the query without its identifier.
                if ($names[$nquery] =~ /^\S+\s+(.*)/) {
                    printf {$out_fh} $row_fmt, "#=GF DE", $1;
                }
                # Reference line: the query with its insert states blanked out.
                (my $refline = $seqs[$nquery]) =~ s/[a-z]/-/g;
                printf {$out_fh} $row_fmt, "#=GC RF", $refline;
            }
            if ($ndssp >= 0) {
                printf {$out_fh} $row_fmt, "#=GC SS_cons", $seqs[$ndssp];
            }
        }
        if ($num) {
            _number_names(qr/^(\S{1,25})\S+/, $ndssp, $npred, $nconf, $nquery);
        }
        for my $i (0 .. $n-1) {
            next if $i==$ndssp || $i==$npred || $i==$nconf;
            my ($id) = $names[$i] =~ /\s*(\S+)/;    # first word of the name line
            $id = "" unless defined $id;
            printf {$out_fh} $row_fmt, $id, $seqs[$i];
        }
        if ($outformat eq "sto") {print {$out_fh} "//\n";}
    }
    elsif ($outformat eq "clu") {
        my $width   = $lname || 18;
        my $row_fmt = "%-${width}.${width}s %s\n";

        print {$out_fh} "CLUSTAL\n\n\n";
        if ($num) {
            _number_names(qr/^(\S{1,10})\S*/, $ndssp, $npred, $nconf, $nquery);
        }
        # Clustal rows carry only the first word of the name.
        for my $name (@names) {$name =~ s/\s*(\S+).*/$1/;}
        # One block per $numres columns.  All sequences have the same length
        # here (checked above), so the first one determines the block count.
        my $total = length($seqs[0]);
        for (my $pos=0; $pos<$total; $pos+=$numres) {
            for my $i (0 .. $n-1) {
                printf {$out_fh} $row_fmt, $names[$i], substr($seqs[$i], $pos, $numres);
            }
            print {$out_fh} "\n";
        }
    }
    else {
        # fas, a2m, a3m, ufas
        if ($num) {
            _number_names(qr/^(\S{1,25})\S+/, $ndssp, $npred, $nconf, $nquery);
        }
        if ($titleline ne "" && $outformat eq "a3m") {
            printf {$out_fh} "%s\n", $titleline;
        }
        for my $i (0 .. $n-1) {
            # Break after every $numres residues, but only when more residues
            # follow; otherwise a sequence whose length is a multiple of
            # $numres would be followed by an empty line.
            (my $wrapped = $seqs[$i]) =~ s/(\S{$numres})(?=\S)/$1\n/g;
            printf {$out_fh} ">%s\n%s\n", $names[$i], $wrapped;
        }
    }
    # Buffered write errors (e.g. disk full) only surface when closing.
    close($out_fh) or die ("cannot close $outfile: $!\n");

    # ---- report ------------------------------------------------------------
    if ($v>=2) {
        if ($nin==1) {
            print("Reformatted $infile with 1 sequence from $informat to $outformat and written to file $outfile\n");
        } else {
            if ($nin != $n) {printf("Removed %i sequences which contained no residues\n",$nin-$n); }
            print("Reformatted $infile with $n sequences from $informat to $outformat and written to file $outfile\n");
        }
    }

    return;
}

# Read a FASTA-like file (fas, a2m, a3m) from $fh into @names / @seqs and set
# $n.  A leading '#...' line is stored in $titleline.  Residues in front of the
# first '>' form a sequence named after the input file.  aa_* records are always
# skipped, sa_* records unless -sa was given, ss_* records if -noss was given.
# The residue strings still contain line breaks and record separators; the
# caller strips them.
sub _read_fasta_like
{
    my ($fh) = @_;
    local $/ = ">";     # read one '>'-terminated record at a time; restored on return

    my $head = <$fh>;   # everything up to and including the first '>'
    return unless defined $head;    # empty file: leave $n at 0

    if ($head =~ s/^(\#.*)//) {$titleline=$1;}
    $head =~ s/(\n\#.*)*\n//;       # further comment lines and the first line break
    if ($head ne ">") {
        # Headerless first sequence: name it after the file (directory and
        # extension removed).  $infile itself is left untouched.
        my ($basename) = "/$infile." =~ /^.*\/(.*?)\..*/;
        $names[$n] = $basename;
        $head =~ tr/\n //d;
        $seqs[$n] = $head;
        $n++;
    }

    while (my $record = <$fh>) {
        $record =~ s/\n\#.*//g;     # comment lines inside the record
        # A '>' that does not start a line is not a record separator: remove it
        # and glue the next chunk on.
        while ($record =~ s/(.)>/$1/) {
            my $more = <$fh>;
            last unless defined $more;
            $record .= $more;
        }
        next if $record =~ /^aa_/;
        next if $record =~ /^sa_/ && $nosa;
        if ($record =~ /^ss_/) {
            next if $noss;
            $nss = $n;
        }
        # The first line is the name; what remains is the sequence.  A record
        # without a name is named by its index.
        $record =~ s/^\s*(.*)//;
        my $name = $1;
        $names[$n] = (defined $name && $name ne "") ? $name : $n;
        $seqs[$n]  = $record;
        $n++;
    }
    return;
}

# Read a Stockholm file from $fh into @names / @seqs and set $n.  Rows with the
# same name in later blocks are concatenated.  '#=GC SS_cons' is read as a
# sequence called ss_dssp; all other '#' lines are ignored and '//' ends the
# alignment.  Annotation rows are filtered as for FASTA input.
sub _read_stockholm
{
    my ($fh) = @_;
    my %index_of;           # sequence name -> index into @names / @seqs
    my $first_block = 1;    # cleared at a blank line, set whenever a new name is stored

    while (my $text = <$fh>) {
        $text =~ tr/\r//d;
        $text =~ s/\s+/ /g;
        $text =~ s/^\#=GC SS_cons/ss_dssp/;
        next if $text =~ /^\#/;
        last if $text =~ /^\/\//;
        if ($text =~ /^\s*$/) {$first_block=0; next;}

        my ($name, $residues) = $text =~ /^\s*(\S+)\s+(\S+)/
            or die ("\nERROR found in stockholm format in $infile, line $.: '$text'\n");

        if (!exists $index_of{$name}) {
            next if $text =~ /^aa_/;
            next if $text =~ /^sa_/ && $nosa;
            if ($text =~ /^ss_/) {
                next if $noss;
                $nss = $n;
            }
            $names[$n] = $name;
            $seqs[$n]  = $residues;
            $index_of{$name} = $n++;
            $first_block = 1;
        } else {
            if ($first_block) {die ("\nERROR: sequence $name appears more than once per block\n");}
            $seqs[$index_of{$name}] .= $residues;
        }
    }
    return;
}

# Read a block-wise alignment from $fh into @names / @seqs and set $n.
#   $is_clustal : true for Clustal input, false for PSI-BLAST input
# Blocks are separated by blank lines, and every block must contain the same
# number of rows in the same order.  Rows starting with aa_ or sa_ are always
# skipped.  For Clustal input the header, '#' lines and conservation lines are
# ignored, '//' ends the alignment, and rows without a blank between name and
# residues are accepted with a warning.
sub _read_blocked_alignment
{
    my ($fh, $is_clustal) = @_;
    my $row_re        = qr/^(\S+)\s+([ a-zA-Z0-9.-]+?)(?:\s+\d+)?$/;
    my $annotation_re = qr/^(?:aa|ss|sa)_/;
    my $residues_per_line = 50;     # updated from every regular row
    my $block = 1;                  # number of the block being read
    my $row   = 0;                  # rows stored from the current block

    while (my $text = <$fh>) {
        $text =~ tr/\r//d;
        if ($is_clustal) {
            next if $text =~ /CLUSTAL/i;
            next if $text =~ /^\#/;
            last if $text =~ /^\/\//;
        }
        if ($text =~ /^\s*$/) {
            # A blank line after at least one row closes the block.
            if ($row) {
                if ($n && $n!=$row) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
                $block++;
                $n   = $row;
                $row = 0;
            }
            next;
        }

        # The captures are copied at once: the matches below would reset them.
        my ($name, $residues) = $text =~ $row_re;

        if (defined $name || !$is_clustal) {
            next if $noss && $text =~ $annotation_re;
            next if $text =~ /^(?:aa|sa)_/;
            $nss = $n if $text =~ /^ss_/;
            if (!defined $name) {
                # Only reachable for PSI-BLAST input.
                die ("\nError found in PSI-BLAST format in $infile, line $.: '$text'\n");
            }
            $residues =~ tr/ //d;
            $residues_per_line = length($residues);
        } else {
            # Clustal line that is not a regular row.
            next if $text =~ /^[*.: ]*$/;       # conservation line
            next if $noss && $text =~ $annotation_re;
            chomp($text);
            # Last resort: name directly followed by a full line of residues.
            ($name, $residues) = $text =~ /^(\S{1,20})([a-zA-Z0-9.-]{$residues_per_line})(?:\s+\d+)?$/
                or die ("\nError found in Clustal format in $infile, line $.: '$text'\n");
            print("WARNING: Found no space between name and residues in $infile, line $.: '$text'\n");
        }

        if ($block==1) {
            $names[$row] = $name;
            $seqs[$row]  = $residues;
        } else {
            $seqs[$row] .= $residues;
            if ($names[$row] ne $name) {
                print("WARNING: name of sequence $row in block 1 ($names[$row]) is not the same as in block $block ($name) in $infile\n");
            }
        }
        $row++;
    }
    # The last block need not be followed by a blank line.
    if ($row && $n && $n!=$row) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
    if (!$n) {$n=$row;}
    return;
}

# Count, for every column, in how many of the $n sequences it holds a gap
# ('.' or '-').  Returns the list of counts indexed by column; columns without
# any gap are undef.
sub _count_gaps_per_column
{
    my @gaps;
    for my $i (0 .. $n-1) {
        my $col = 0;
        for my $ch (split(//, $seqs[$i])) {
            $gaps[$col]++ if $ch eq "." || $ch eq "-";
            $col++;
        }
    }
    return @gaps;
}

# Rewrite all $n sequences according to a column mask.
#   $is_match : reference to an array with a true value for every match column;
#               columns beyond its end count as insert columns
# Match columns get upper-case residues and '-' gaps, insert columns lower-case
# residues and '.' gaps.
sub _assign_match_states
{
    my ($is_match) = @_;
    for my $i (0 .. $n-1) {
        my $rebuilt = "";
        my $col     = 0;
        for my $ch (split(//, $seqs[$i])) {
            if ($is_match->[$col]) {
                $rebuilt .= ($ch eq ".") ? "-" : uc($ch);
            } else {
                $rebuilt .= ($ch eq "-") ? "." : lc($ch);
            }
            $col++;
        }
        $seqs[$i] = $rebuilt;
    }
    return;
}

# Implements -num: replace the tail of every sequence identifier by '#<number>',
# counting from 2; a '#<number>' left from an earlier run is removed first.
#   $head_re : pattern whose first capture group is the part of the identifier
#              to keep
#   @skip    : indices of sequences that keep their name and are not counted
#              (negative indices are ignored)
# NOTE(review): with the pattern used for FASTA/Stockholm/PSI output
# (^(\S{1,25})\S+) an identifier of up to 25 characters loses its last
# character, and a one-character identifier is not numbered at all.  This is
# the behaviour of the original code and is kept; confirm whether \S* (as in
# the Clustal variant) was intended.
sub _number_names
{
    my ($head_re, @skip) = @_;
    my %is_skipped = map { $_ => 1 } @skip;
    my $number = 2;
    for my $i (0 .. $n-1) {
        next if $is_skipped{$i};
        $names[$i] =~ s/^(\S+)\#\d+/$1/;
        $names[$i] =~ s/$head_re/$1\#$number/;
        $number++;
    }
    return;
}
|
|