# Hosting-page residue, kept only as provenance (not part of the module):
#   aakash0017's picture / Upload folder using huggingface_hub / b7731cd
# Copyright 2021 by Michiel de Hoon. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.Align support for tabular output from BLAST or FASTA.
This module contains a parser for tabular output from BLAST run with the
'-outfmt 7' argument, as well as tabular output from William Pearson's
FASTA alignment tools using the '-m 8CB' or '-m 8CC' arguments.
"""
import re
import enum
import numpy
from Bio.Align import Alignment
from Bio.Align import interfaces
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
class State(enum.Enum):
    """Enumerate alignment states needed when parsing a BTOP string.

    The BTOP parser keeps one of these values as "what kind of alignment
    segment was being extended by the previous token", so it can tell
    whether the next token continues that segment or opens a new one.
    """

    # Inside an aligned run (a numeric run length or a pair of letters).
    MATCH = enum.auto()
    # Inside a run of tokens whose first character is "-".
    QUERY_GAP = enum.auto()
    # Inside a run of tokens whose second character is "-".
    TARGET_GAP = enum.auto()
    # No token has been processed yet.
    NONE = enum.auto()
class AlignmentIterator(interfaces.AlignmentIterator):
    """Alignment iterator for tabular output from BLAST or FASTA.

    For reading (pairwise) alignments from tabular output generated by BLAST
    run with the '-outfmt 7' argument, as well as tabular output generated by
    William Pearson's FASTA alignment programs with the '-m 8CB' or '-m 8CC'
    output formats.
    """

    fmt = "Tabular"

    def _read_header(self, stream):
        """Read the first line of the stream and parse the header it starts.

        Arguments:
         - stream - iterator yielding the lines of the file.

        Raises ValueError if the stream is empty or if its first line does
        not start with "# ".
        """
        try:
            line = next(stream)
        except StopIteration:
            raise ValueError("Empty file.") from None
        if not line.startswith("# "):
            raise ValueError("Missing header.")
        line = line.rstrip()
        self._parse_header(stream, line)

    def _parse_header(self, stream, line):
        """Parse one block of "# " comment lines preceding the data rows.

        Arguments:
         - stream - iterator yielding the lines following ``line``;
         - line   - first line of the block, with trailing whitespace removed.

        If the first word of ``line`` is a known BLAST program name, the line
        is taken as "# <program> <version>". Otherwise the line is stored as
        the FASTA command line and the program and version are read from the
        next line. Lines are then consumed up to and including the
        "# <N> hits found" line (or until the stream is exhausted).

        Sets ``self.metadata`` and ``self._final_prefix`` and, when the
        corresponding lines are present, ``self._query_id``,
        ``self._query_description``, ``self._query_size`` and
        ``self._fields``.

        Raises ValueError if the stream ends right after a FASTA command line.
        """
        metadata = {}
        blast_programs = (
            "BLASTN",
            "BLASTP",
            "BLASTX",
            "TBLASTN",
            "TBLASTX",
            "DELTABLAST",
            "PSIBLAST",
            "RPSBLAST",
            "RPSTBLASTN",
        )
        try:
            program, version = line[2:].split(None, 1)
            if program not in blast_programs:
                raise ValueError("Not a BLAST program")
        except ValueError:
            # FASTA
            metadata["Command line"] = line[2:]
            try:
                line = next(stream)
            except StopIteration:
                # A bare StopIteration would silently end iteration when
                # this method is reached from the iterator protocol.
                raise ValueError("Truncated header.") from None
            assert line.startswith("# ")
            metadata["Program"], metadata["Version"] = line[2:].rstrip().split(None, 1)
            self._final_prefix = "# FASTA processed "
        else:
            # BLAST
            metadata["Program"], metadata["Version"] = program, version
            self._final_prefix = "# BLAST processed "
        for line in stream:
            line = line.strip()
            assert line.startswith("# ")
            try:
                # Split on the first ": " only, so that values which
                # themselves contain ": " (e.g. a query description) are
                # kept intact instead of being mistaken for the final line.
                prefix, value = line[2:].split(": ", 1)
            except ValueError:
                suffix = " hits found"
                assert line.endswith(suffix)
                # The count itself is not stored; the conversion only checks
                # that the line really has the form "# <N> hits found".
                int(line[2 : -len(suffix)])
                break
            if prefix == "Query":
                if metadata["Program"] == "FASTA":
                    # FASTA appends " - <size> <unit>" to the query line.
                    query_line, query_size = value.rsplit(" - ", 1)
                    query_size, unit = query_size.split()
                    self._query_size = int(query_size)
                    assert unit in ("nt", "aa")
                else:
                    query_line = value
                    self._query_size = None
                try:
                    self._query_id, self._query_description = query_line.split(None, 1)
                except ValueError:
                    # The query line consists of an identifier only.
                    self._query_id = query_line.strip()
                    self._query_description = None
            elif prefix == "Database":
                metadata["Database"] = value
            elif prefix == "Fields":
                self._fields = value.split(", ")
            elif prefix == "RID":
                metadata["RID"] = value
        self.metadata = metadata

    def _read_next_alignment(self, stream):
        """Read the next data row and return it as an Alignment object.

        Arguments:
         - stream - iterator yielding the remaining lines of the file.

        Header blocks encountered before the data row are parsed with
        ``_parse_header``. Returns None when the closing
        "# BLAST processed <N> queries" / "# FASTA processed <N> queries"
        line is found (the per-file parsing state is then deleted), or when
        the stream is exhausted before any further data row is found.

        Raises ValueError if the "Fields" header names a column this parser
        does not know.
        """
        for line in stream:
            line = line.rstrip()
            if line.startswith("# "):
                if line.startswith(self._final_prefix) and line.endswith(" queries"):
                    del self._fields
                    del self._query_id
                    del self._query_description
                    del self._query_size
                    del self._final_prefix
                    return None
                self._parse_header(stream, line)
            else:
                break
        else:
            # No data row left and no closing line either: treat the end of
            # the stream as the end of the alignments rather than falling
            # through with a missing or stale ``line``.
            return None
        alignment_length = None
        score = None
        query_id = None
        target_id = None
        query_start = None
        query_end = None
        target_start = None
        target_end = None
        query_sequence = None
        target_sequence = None
        target_length = None
        coordinates = None
        query_size = self._query_size
        columns = line.split("\t")
        assert len(columns) == len(self._fields)
        annotations = {}
        query_annotations = {}
        target_annotations = {}
        for column, field in zip(columns, self._fields):
            if field == "query id":
                query_id = column
                if self._query_id is not None:
                    assert query_id == self._query_id
            elif field == "subject id":
                target_id = column
            elif field == "% identity":
                annotations[field] = float(column)
            elif field == "alignment length":
                alignment_length = int(column)
            elif field == "mismatches":
                annotations[field] = int(column)
            elif field == "gap opens":
                annotations[field] = int(column)
            elif field == "q. start":
                query_start = int(column)
            elif field == "q. end":
                query_end = int(column)
            elif field == "s. start":
                target_start = int(column)
            elif field == "s. end":
                target_end = int(column)
            elif field == "evalue":
                annotations["evalue"] = float(column)
            elif field == "bit score":
                annotations["bit score"] = float(column)
            elif field == "BTOP":
                coordinates = self.parse_btop(column)
            elif field == "aln_code":
                coordinates = self.parse_cigar(column)
            elif field == "query gi":
                query_annotations["gi"] = column
            elif field == "query acc.":
                query_annotations["acc."] = column
            elif field == "query acc.ver":
                query_annotations["acc.ver"] = column
                # Fall back on the accession if no "query id" column was seen.
                if query_id is None:
                    query_id = column
            elif field == "query length":
                if query_size is None:
                    query_size = int(column)
                else:
                    assert query_size == int(column)
            elif field == "subject ids":
                target_annotations["ids"] = column
            elif field == "subject gi":
                target_annotations["gi"] = column
            elif field == "subject gis":
                target_annotations["gis"] = column
            elif field == "subject acc.":
                target_annotations["acc."] = column
            elif field == "subject accs.":
                target_annotations["accs."] = column
            elif field == "subject tax ids":
                target_annotations["tax ids"] = column
            elif field == "subject sci names":
                target_annotations["sci names"] = column
            elif field == "subject com names":
                target_annotations["com names"] = column
            elif field == "subject blast names":
                target_annotations["blast names"] = column
            elif field == "subject super kingdoms":
                target_annotations["super kingdoms"] = column
            elif field == "subject title":
                target_annotations["title"] = column
            elif field == "subject titles":
                target_annotations["titles"] = column
            elif field == "subject strand":
                target_annotations["strand"] = column
            elif field == "% subject coverage":
                target_annotations["% coverage"] = float(column)
            elif field == "subject acc.ver":
                target_annotations["acc.ver"] = column
                # Fall back on the accession if no "subject id" column was seen.
                if target_id is None:
                    target_id = column
            elif field == "subject length":
                target_length = int(column)
            elif field == "query seq":
                query_sequence = column
            elif field == "subject seq":
                target_sequence = column
            elif field == "score":
                score = int(column)
            elif field == "identical":
                annotations[field] = int(column)
            elif field == "positives":
                annotations[field] = int(column)
            elif field == "gaps":
                annotations[field] = int(column)
            elif field == "% positives":
                annotations[field] = float(column)
            elif field == "% hsp coverage":
                annotations[field] = float(column)
            elif field == "query/sbjct frames":
                annotations[field] = column
            elif field == "query frame":
                query_annotations["frame"] = column
            elif field == "sbjct frame":
                target_annotations["frame"] = column
            else:
                raise ValueError("Unexpected field '%s'" % field)
        program = self.metadata["Program"]
        if coordinates is None:
            if alignment_length is not None:
                annotations["alignment length"] = alignment_length
        # otherwise, get it from alignment.shape
        # Lower the smaller value of each start/end pair by one, which turns
        # the reported pair into a zero-based, end-exclusive interval
        # (assumes the file reports one-based, inclusive positions).
        if query_start is not None and query_end is not None:
            if query_start < query_end:
                query_start -= 1
            else:
                query_end -= 1
        if target_start is not None and target_end is not None:
            if target_start < target_end:
                target_start -= 1
            else:
                target_end -= 1
        if coordinates is None or program in ("BLASTX", "TBLASTX"):
            # The query positions are not folded into the coordinates here;
            # they are stored as annotations instead.
            if query_start is not None:
                query_annotations["start"] = query_start
            if query_end is not None:
                query_annotations["end"] = query_end
        elif coordinates is not None:
            if query_start < query_end:
                coordinates[1, :] += query_start
            else:
                # mapped to reverse strand
                coordinates[1, :] = query_start - coordinates[1, :]
        if coordinates is None or program in ("TBLASTN", "TBLASTX"):
            # Same as above, for the target positions.
            if target_start is not None:
                target_annotations["start"] = target_start
            if target_end is not None:
                target_annotations["end"] = target_end
        elif coordinates is not None:
            coordinates[0, :] += target_start
        if query_sequence is None:
            if query_size is None:
                query_seq = None
            else:
                # Sequence contents unknown, but the length is known.
                query_seq = Seq(None, length=query_size)
        else:
            query_sequence = query_sequence.replace("-", "")
            if program == "TBLASTN":
                assert len(query_sequence) == query_end - query_start
                # Partially defined sequence: only the aligned part is known.
                query_seq = Seq({query_start: query_sequence}, length=query_size)
            elif program == "TBLASTX":
                query_annotations["start"] = query_start
                query_annotations["end"] = query_end
                query_seq = Seq(query_sequence)
            else:
                raise Exception("Unknown program %s" % program)
        query = SeqRecord(query_seq, id=query_id)
        if self._query_description is not None:
            query.description = self._query_description
        if query_annotations:
            query.annotations = query_annotations
        if self.metadata["Program"] in ("TBLASTN", "TBLASTX"):
            target_annotations["length"] = target_length
            if target_sequence is None:
                target_seq = None
            else:
                target_sequence = target_sequence.replace("-", "")
                target_seq = Seq(target_sequence)
        else:
            if target_sequence is None:
                if target_end is None:
                    target_seq = None
                else:
                    target_seq = Seq(None, length=target_end)
            else:
                target_sequence = target_sequence.replace("-", "")
                if target_start is not None and target_end is not None:
                    assert len(target_sequence) == target_end - target_start
                target_seq = Seq({target_start: target_sequence}, length=target_end)
        target = SeqRecord(target_seq, id=target_id)
        if target_annotations:
            target.annotations = target_annotations
        records = [target, query]
        alignment = Alignment(records, coordinates)
        alignment.annotations = annotations
        if score is not None:
            alignment.score = score
        return alignment

    def parse_btop(self, btop):
        """Parse a BTOP string and return alignment coordinates.

        A BTOP (Blast trace-back operations) string is used by BLAST to
        describe a sequence alignment.

        Arguments:
         - btop - the BTOP string.

        Returns a numpy array with two rows (target first, query second).
        Both rows start at 0 and get one column per boundary between
        consecutive aligned runs, query-gap runs and target-gap runs.
        """
        target_coordinates = []
        query_coordinates = []
        target_coordinates.append(0)
        query_coordinates.append(0)
        state = State.NONE
        tokens = re.findall(r"([A-Z-*]{2}|\d+)", btop)
        # each token is now
        # - an integer
        # - a pair of characters, which may include dashes
        for token in tokens:
            if token.startswith("-"):
                # Dash in first position: only the target advances.
                if state != State.QUERY_GAP:
                    target_coordinates.append(target_coordinates[-1])
                    query_coordinates.append(query_coordinates[-1])
                    state = State.QUERY_GAP
                target_coordinates[-1] += 1
            elif token.endswith("-"):
                # Dash in second position: only the query advances.
                if state != State.TARGET_GAP:
                    target_coordinates.append(target_coordinates[-1])
                    query_coordinates.append(query_coordinates[-1])
                    state = State.TARGET_GAP
                query_coordinates[-1] += 1
            else:
                try:
                    length = int(token)
                except ValueError:
                    # pair of mismatched letters
                    length = 1
                if state == State.MATCH:
                    # Extend the current aligned run instead of opening a
                    # new column.
                    target_coordinates[-1] += length
                    query_coordinates[-1] += length
                else:
                    target_coordinates.append(target_coordinates[-1] + length)
                    query_coordinates.append(query_coordinates[-1] + length)
                    state = State.MATCH
        coordinates = numpy.array([target_coordinates, query_coordinates])
        return coordinates

    def parse_cigar(self, cigar):
        """Parse a CIGAR string and return alignment coordinates.

        A CIGAR string, as defined by the SAM Sequence Alignment/Map format,
        describes a sequence alignment as a series of lengths and operation
        (alignment/insertion/deletion) codes.

        Arguments:
         - cigar - the CIGAR string; only the M, I and D operations are
           recognized.

        Returns a numpy array with two rows (target first, query second).
        Both rows start at 0 and get one column per operation: "M" advances
        both rows, "I" advances the target row only, and "D" advances the
        query row only.
        """
        target_coordinates = []
        query_coordinates = []
        target_coordinate = 0
        query_coordinate = 0
        target_coordinates.append(target_coordinate)
        query_coordinates.append(query_coordinate)
        tokens = re.findall(r"(M|D|I|\d+)", cigar)
        # each token is now
        # - the length of the operation
        # - the operation
        for length, operation in zip(tokens[::2], tokens[1::2]):
            length = int(length)
            if operation == "M":
                target_coordinate += length
                query_coordinate += length
            elif operation == "I":
                target_coordinate += length
            elif operation == "D":
                query_coordinate += length
            target_coordinates.append(target_coordinate)
            query_coordinates.append(query_coordinate)
        coordinates = numpy.array([target_coordinates, query_coordinates])
        return coordinates