# Uploaded by aakash0017 using huggingface_hub (commit b7731cd).
# Copyright 2008-2016 by Peter Cock. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.Align support for "emboss" alignment output from EMBOSS tools.
This module contains a parser for the EMBOSS srspair/pair/simple file format,
for example from the needle, water, and stretcher tools.
"""
from Bio.Align import Alignment
from Bio.Align import interfaces
from Bio.Seq import Seq, reverse_complement
from Bio.SeqRecord import SeqRecord
class AlignmentIterator(interfaces.AlignmentIterator):
    """Emboss alignment iterator.

    For reading the (pairwise) alignments from EMBOSS tools in what they
    call the "pairs" and "simple" formats.
    """

    fmt = "EMBOSS"

    def _read_header(self, stream):
        """Parse the file header and store its fields in ``self.metadata``.

        The header is delimited by two lines consisting of ``#`` characters.
        Every line in between must start with ``"# "`` and has the form
        ``# key: value``.  The keys ``Program``, ``Rundate``, ``Report_file``
        and ``Align_format`` are stored under the same name; ``Commandline``
        (possibly continued over several indented lines) is stored under
        ``"Command line"``.  ``Align_format`` defaults to ``"srspair"``.

        Raises ValueError if the stream is empty or a line does not fit the
        expected layout.
        """
        try:
            line = next(stream)
        except StopIteration:
            raise ValueError("Empty file.") from None
        if line.rstrip() != "########################################":
            raise ValueError("Unexpected line: %s" % line)
        # assume srspair format (default) if not specified explicitly in
        # the output file
        self.metadata = {}
        self.metadata["Align_format"] = "srspair"
        commandline = None
        for line in stream:
            if line.rstrip() == "########################################":
                break
            if not line.startswith("# "):
                raise ValueError("Unexpected line: %s" % line)
            if commandline is not None:
                # Continuation lines of the command line are indented after
                # the leading "#"; regular "# key: value" lines are not.
                if line.startswith("#    "):
                    commandline += " " + line[1:].strip()
                    continue
                # First non-continuation line: the command line is complete.
                self.metadata["Command line"] = commandline
                commandline = None
            key, value = line[2:].split(":", 1)
            if key in ("Program", "Rundate", "Report_file", "Align_format"):
                self.metadata[key] = value.strip()
            elif key == "Commandline":
                commandline = value.strip()
        if commandline is not None:
            # The header ended while the command line was still being
            # collected; store it rather than silently dropping it.
            self.metadata["Command line"] = commandline

    def _read_next_alignment(self, stream):
        """Parse the next alignment from the stream.

        Reads the per-alignment metadata block (delimited by ``#====...``
        lines), then the aligned sequence blocks, and builds an Alignment
        of SeqRecord objects.  Numeric metadata values are stored in the
        alignment's ``annotations``; the match line, if present, is stored
        as the ``"emboss_consensus"`` column annotation.

        Returns the Alignment, or None if the stream ends before a complete
        metadata block is found.

        Raises ValueError on lines that do not fit the expected layout, or
        if the number of sequences or the alignment length is missing.
        """
        number_of_sequences = None
        annotations = {}
        identifiers = []
        ncols = None
        # Skip ahead to the start of the alignment metadata.
        for line in stream:
            line = line.rstrip("\r\n")
            if not line:
                continue
            elif line.startswith("#---------------------------------------"):
                # may appear between alignments
                continue
            elif line.startswith("#======================================="):
                # found the alignment metadata start
                break
            else:
                raise ValueError("Unexpected line: %s" % line)
        # Parse the alignment metadata. If the stream was exhausted above,
        # this loop does not execute and its else-clause returns None.
        for line in stream:
            line = line.rstrip("\r\n")
            if line == "#=======================================":
                # reached the end of alignment metadata
                break
            elif line.strip() == "#":
                continue
            elif not line.startswith("# "):
                raise ValueError("Unexpected line: %s" % line)
            try:
                key, value = line[2:].split(":", 1)
            except ValueError:
                # An equal sign is used for Longest_Identity,
                # Longest_Similarity, Shortest_Identity, and
                # Shortest_Similarity, which are included if command line
                # argument -nobrief was used.
                key, value = line[2:].split(" = ", 1)
            if key == "Aligned_sequences":
                number_of_sequences = int(value.strip())
                assert len(identifiers) == 0
                # Should now expect the record identifiers...
                for i, line in enumerate(stream):
                    if not line.startswith("# "):
                        raise ValueError("Unexpected line: %s" % line)
                    number, identifier = line[2:].split(":")
                    assert i + 1 == int(number)
                    identifiers.append(identifier.strip())
                    if len(identifiers) == number_of_sequences:
                        break
            elif key == "Matrix":
                annotations[key] = value.strip()
            elif key in ("Gap_penalty", "Extend_penalty", "Score"):
                annotations[key] = float(value.strip())
            elif key == "Length":
                ncols = int(value.strip())
            elif key in ("Identity", "Similarity", "Gaps"):
                # Values look like "count/total ..."; keep only the count.
                annotations[key] = int(value.strip().split("/")[0])
            # TODO:
            # The following are generated if the -nobrief command line
            # argument used. We could simply calculate them from the
            # alignment, but then we have to define what we mean by
            # "similar". For now, simply store them as an annotation.
            elif key in (
                "Longest_Identity",
                "Longest_Similarity",
                "Shortest_Identity",
                "Shortest_Similarity",
            ):
                annotations[key] = value.strip()
            else:
                raise ValueError("Failed to parse line '%s'" % line)
        else:
            # Stream ended without a closing metadata delimiter: no more
            # alignments.
            return
        if len(identifiers) == 0:
            raise ValueError("Number of sequences missing!")
        if ncols is None:
            raise ValueError("Length of alignment missing!")
        sequences = [""] * number_of_sequences
        aligned_sequences = [""] * number_of_sequences
        consensus = ""
        starts = [0] * number_of_sequences
        ends = [0] * number_of_sequences
        column = 0
        index = 0
        for line in stream:
            line = line.rstrip("\r\n")
            # parse the sequences
            if not line:
                # empty line
                if index == number_of_sequences:
                    # reached the end of an alignment block
                    if column == ncols:
                        # reached the end of the sequences
                        break
                    index = 0
                continue
            # The first 21 characters hold the identifier and start
            # coordinate; they are blank on the match line.
            prefix = line[:21].strip()
            if prefix == "":
                # match line
                consensus += line[21:71]
            else:
                identifier, start = prefix.split(None, 1)
                assert identifiers[index].startswith(identifier)
                aligned_sequence, end = line[21:].split(None, 1)
                start = int(start)
                end = int(end)
                length = len(sequences[index])
                sequence = aligned_sequence.replace("-", "")
                if length == 0 and len(sequence) > 0:
                    # First residues seen for this sequence.
                    if start < end:
                        start -= 1  # Python counting
                        assert end == start + len(sequence)
                    else:
                        end -= 1  # Python counting
                        assert end == start - len(sequence)
                    # Record the start
                    starts[index] = start
                else:
                    if starts[index] <= ends[index]:
                        # forward strand
                        if (
                            self.metadata["Align_format"] == "srspair"
                            and len(sequence) == 0
                        ):
                            assert start == ends[index]
                            assert end == start
                        else:
                            start -= 1
                            assert end == start + len(sequence)
                    else:
                        # reverse strand
                        if (
                            self.metadata["Align_format"] == "srspair"
                            and len(sequence) == 0
                        ):
                            assert start - 1 == ends[index]
                            assert end == start
                        else:
                            end -= 1
                            assert end == start - len(sequence)
                # Record the end
                ends[index] = end
                sequences[index] += sequence
                aligned_sequences[index] += aligned_sequence
                if index == 0:
                    column += len(aligned_sequence)
                else:
                    assert column == len(aligned_sequences[index])
                index += 1
        coordinates = Alignment.infer_coordinates(aligned_sequences)
        records = []
        n = len(sequences)
        for i in range(n):
            start = starts[i]
            end = ends[i]
            if start < end:
                coordinates[i, :] += start
                data = sequences[i]
            else:
                # start >= end: coordinates run backwards, so store the
                # reverse complement and flip the coordinates.
                start, end = end, start
                coordinates[i, :] = end - coordinates[i, :]
                data = reverse_complement(sequences[i])
            if start == 0:
                sequence = Seq(data)
            else:
                # create a partially defined sequence
                sequence = Seq({start: data}, length=end)
            record = SeqRecord(sequence, identifiers[i])
            records.append(record)
        alignment = Alignment(records, coordinates)
        if annotations:
            alignment.annotations = annotations
        if consensus:
            alignment.column_annotations = {"emboss_consensus": consensus}
        return alignment