# NOTE: the three lines below are web-upload artifacts (HuggingFace page
# metadata) accidentally captured with the file; commented out so the
# module parses:
# aakash0017's picture
# Upload folder using huggingface_hub
# b7731cd
# Copyright 2012 by Wibowo Arindrarto. All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SearchIO parser for Exonerate plain text output format."""
import re
from itertools import chain
from ._base import (
_BaseExonerateParser,
_BaseExonerateIndexer,
_STRAND_MAP,
_parse_hit_or_query_line,
)
from .exonerate_vulgar import _RE_VULGAR
__all__ = ("ExonerateTextParser", "ExonerateTextIndexer")
# --- Module-level regular expressions used by the parser below ---

# for capturing sequences in alignment blocks
# e.g. ' 529 : ATCCCTTATCTCTTTATCTTGTA : 472'
_RE_ALN_ROW = re.compile(r"\s*\d+\s+: (.*) :\s+\d+")
# for splitting the line based on intron annotations
# e.g. ' >>>> Target Intron 1 >>>> ' or 'gt.........................ag'
# the optional two-character [atgc ] groups absorb the splice-site letters
# flanking the intron marker
_RE_EXON = re.compile(
    r"[atgc ]{2}?(?:(?:[<>]+ \w+ Intron \d+ [<>]+)|(?:\.+))[atgc ]{2}?"
)
# captures the intron length
# from e.g. '61 bp // 154295 bp' (joint intron lengths) or '177446 bp'
_RE_EXON_LEN = re.compile(r"(?:(\d+) bp // (\d+) bp)|(?:(\d+) bp)")
# for splitting lines in the NER model, e.g. '--< 43 >--'
_RE_NER = re.compile(r"--<\s+\d+\s+>--")
# for capturing NER gap lengths (the number between the angle brackets)
_RE_NER_LEN = re.compile(r"--<\s+(\d+)\s+>--")
# regexes for capturing the letters inside curly braces
# no. of letters is either 1 or 2, since they are split codons
# _RE_SCODON_START is anchored at the END of a fragment ($),
# _RE_SCODON_END at the START of a fragment (^)
_RE_SCODON_START = re.compile(r"\{(\w{1,2})\}$")
_RE_SCODON_END = re.compile(r"^\{(\w{1,2})\}")
def _flip_codons(codon_seq, target_seq):
"""Flips the codon characters from one seq to another (PRIVATE)."""
a, b = "", ""
for char1, char2 in zip(codon_seq, target_seq):
# no need to do anything if the codon seq line has nothing
if char1 == " ":
a += char1
b += char2
else:
a += char2
b += char1
return a, b
def _get_block_coords(parsed_seq, row_dict, has_ner=False):
    """Return a list of start, end coordinates for each given block in the sequence (PRIVATE)."""
    # NER models use a different inter-block separator pattern
    splitter = _RE_NER if has_ner else _RE_EXON
    # use the query line for reference
    seq = parsed_seq[row_dict["query"]]
    coords = []
    search_from = 0
    for block in re.split(splitter, seq):
        # locate each split block within the full row; searching from the
        # previous block's start keeps repeated substrings in order
        search_from += seq[search_from:].find(block)
        coords.append((search_from, search_from + len(block)))
    return coords
def _get_inter_coords(coords, strand=1):
"""Return list of pairs covering intervening ranges (PRIVATE).
From the given pairs of coordinates, returns a list of pairs
covering the intervening ranges.
"""
# adapted from Python's itertools guide
# if strand is -1, adjust coords to the ends and starts are chained
if strand == -1:
sorted_coords = [(max(a, b), min(a, b)) for a, b in coords]
inter_coords = list(chain(*sorted_coords))[1:-1]
return list(zip(inter_coords[1::2], inter_coords[::2]))
else:
inter_coords = list(chain(*coords))[1:-1]
return list(zip(inter_coords[::2], inter_coords[1::2]))
def _stitch_rows(raw_rows):
"""Stitches together the parsed alignment rows and returns them in a list (PRIVATE)."""
# deal with possible codon surprise!
# (i.e. alignments with codons using cdna2genome model)
# by creating additional rows to contain the codons
try:
max_len = max(len(x) for x in raw_rows)
for row in raw_rows:
assert len(row) == max_len
except AssertionError:
for idx, row in enumerate(raw_rows):
if len(row) != max_len:
# codons must be present in the query and hit (so +2)
assert len(row) + 2 == max_len
# add additional empty lines to contain codons
raw_rows[idx] = [" " * len(row[0])] + row + [" " * len(row[0])]
cmbn_rows = []
for idx, row in enumerate(raw_rows[0]):
cmbn_row = "".join(aln_row[idx] for aln_row in raw_rows)
cmbn_rows.append(cmbn_row)
# the real aligned sequence is always the 'outer' one, so we want
# to flip them with their 'inner' pairs
if len(cmbn_rows) == 5:
# flip query sequence
cmbn_rows[0], cmbn_rows[1] = _flip_codons(cmbn_rows[0], cmbn_rows[1])
# flip hit sequence
cmbn_rows[4], cmbn_rows[3] = _flip_codons(cmbn_rows[4], cmbn_rows[3])
return cmbn_rows
def _get_row_dict(row_len, model):
"""Return a dictionary of row indices for parsing alignment blocks (PRIVATE)."""
idx = {}
# 3 lines, usually in dna vs dna models
if row_len == 3:
idx["query"] = 0
idx["midline"] = 1
idx["hit"] = 2
idx["qannot"], idx["hannot"] = None, None
# 4 lines, in protein vs dna models or dna vs protein models
# TODO: currently we check this from the model string; is there
# a better way to do it?
elif row_len == 4:
if "protein2" in model:
idx["query"] = 0
idx["midline"] = 1
idx["hit"] = 2
idx["hannot"] = 3
idx["qannot"] = None
elif "2protein" in model:
idx["query"] = 1
idx["midline"] = 2
idx["hit"] = 3
idx["hannot"] = None
idx["qannot"] = 0
else:
raise ValueError("Unexpected model: " + model)
# 5 lines, translated dna vs translated dna
elif row_len == 5:
# set sequence indexes
idx["qannot"] = 0
idx["query"] = 1
idx["midline"] = 2
idx["hit"] = 3
idx["hannot"] = 4
else:
raise ValueError("Unexpected row count in alignment block: %i" % row_len)
return idx
def _get_blocks(rows, coords, idx):
"""Return a list of dictionaries of sequences split by the coordinates (PRIVATE)."""
for idx_name in ("query", "hit", "midline", "qannot", "hannot"):
assert idx_name in idx
blocks = []
for start, end in coords:
block = {}
# get seqs according to index
block["query"] = rows[idx["query"]][start:end]
block["hit"] = rows[idx["hit"]][start:end]
block["similarity"] = rows[idx["midline"]][start:end]
if idx["qannot"] is not None:
block["query_annotation"] = rows[idx["qannot"]][start:end]
if idx["hannot"] is not None:
block["hit_annotation"] = rows[idx["hannot"]][start:end]
blocks.append(block)
return blocks
def _get_scodon_moves(tmp_seq_blocks):
    """Get a dictionary of split codon locations relative to each fragment end (PRIVATE).

    For every block, two (left, right) pairs are appended per sequence type:
    one for a brace-marked codon fragment at the block's end, one for a
    brace-marked fragment at the block's start; (0, 0) means no split codon.
    """
    scodon_moves = {"query": [], "hit": []}
    for seq_type in scodon_moves:
        pair_coords = []
        for block in tmp_seq_blocks:
            seq = block[seq_type]
            # check both ends of the sequence for residues in curly braces;
            # _RE_SCODON_START is anchored at the string end, _RE_SCODON_END
            # at the string start
            tail_match = _RE_SCODON_START.search(seq)
            head_match = _RE_SCODON_END.search(seq)
            if tail_match:
                pair_coords.append((len(tail_match.group(1)), 0))
            else:
                pair_coords.append((0, 0))
            if head_match:
                pair_coords.append((0, len(head_match.group(1))))
            else:
                pair_coords.append((0, 0))
        scodon_moves[seq_type] = pair_coords
    return scodon_moves
def _clean_blocks(tmp_seq_blocks):
"""Remove curly braces (split codon markers) from the given sequences (PRIVATE)."""
seq_blocks = []
for seq_block in tmp_seq_blocks:
for line_name in seq_block:
seq_block[line_name] = (
seq_block[line_name].replace("{", "").replace("}", "")
)
seq_blocks.append(seq_block)
return seq_blocks
def _comp_intron_lens(seq_type, inter_blocks, raw_inter_lens):
"""Return the length of introns between fragments (PRIVATE)."""
# set opposite type, for setting introns
opp_type = "hit" if seq_type == "query" else "query"
# list of flags to denote if an intron follows a block
# it reads e.g. this line:
# "ATGTT{TT} >>>> Target Intron 1 >>>> {G}TGTGTGTACATT"
# and sets the opposing sequence type's intron (since this
# line is present on the opposite sequence type line)
has_intron_after = ["Intron" in x[seq_type] for x in inter_blocks]
assert len(has_intron_after) == len(raw_inter_lens)
# create list containing coord adjustments incorporating
# intron lengths
inter_lens = []
for flag, parsed_len in zip(has_intron_after, raw_inter_lens):
if flag:
# joint introns
if all(parsed_len[:2]):
# intron len is [0] if opp_type is query, otherwise it's [1]
intron_len = (
int(parsed_len[0]) if opp_type == "query" else int(parsed_len[1])
)
# single hit/query introns
elif parsed_len[2]:
intron_len = int(parsed_len[2])
else:
raise ValueError("Unexpected intron parsing result: %r" % parsed_len)
else:
intron_len = 0
inter_lens.append(intron_len)
return inter_lens
def _comp_coords(hsp, seq_type, inter_lens):
"""Fill the block coordinates of the given hsp dictionary (PRIVATE)."""
assert seq_type in ("hit", "query")
# manually fill the first coord
seq_step = 1 if hsp["%s_strand" % seq_type] >= 0 else -1
fstart = hsp["%s_start" % seq_type]
# fend is fstart + number of residues in the sequence, minus gaps
fend = (
fstart
+ len(hsp[seq_type][0].replace("-", "").replace(">", "").replace("<", ""))
* seq_step
)
coords = [(fstart, fend)]
# and start from the second block, after the first inter seq
for idx, block in enumerate(hsp[seq_type][1:]):
bstart = coords[-1][1] + inter_lens[idx] * seq_step
bend = bstart + seq_step * len(block.replace("-", ""))
coords.append((bstart, bend))
# adjust the coords so the smallest is [0], if strand is -1
# couldn't do this in the previous steps since we need the initial
# block ordering
if seq_step != 1:
for idx, coord in enumerate(coords):
coords[idx] = coords[idx][1], coords[idx][0]
return coords
def _comp_split_codons(hsp, seq_type, scodon_moves):
"""Compute positions of split codons, store in given HSP dictionary (PRIVATE)."""
scodons = []
for idx in range(len(scodon_moves[seq_type])):
pair = scodon_moves[seq_type][idx]
if not any(pair):
continue
else:
assert not all(pair)
a, b = pair
anchor_pair = hsp["%s_ranges" % seq_type][idx // 2]
strand = 1 if hsp["%s_strand" % seq_type] >= 0 else -1
if a:
func = max if strand == 1 else min
anchor = func(anchor_pair)
start_c, end_c = anchor + a * strand * -1, anchor
elif b:
func = min if strand == 1 else max
anchor = func(anchor_pair)
start_c, end_c = anchor + b * strand, anchor
scodons.append((min(start_c, end_c), max(start_c, end_c)))
return scodons
class ExonerateTextParser(_BaseExonerateParser):
    """Parser for Exonerate plain text output."""

    # marker line that starts each human-readable alignment section
    _ALN_MARK = "C4 Alignment:"

    def parse_alignment_block(self, header):
        """Parse alignment block, return query result, hits, hsps.

        ``header`` is a dictionary with ``qresult``, ``hit`` and ``hsp``
        sub-dictionaries, pre-filled by the base class from the block
        header.  The returned dictionary carries the same three keys, with
        the hsp's strands, coordinates, sequences, annotations, and (for
        non-NER models) split-codon positions filled in.
        """
        qresult = header["qresult"]
        hit = header["hit"]
        hsp = header["hsp"]
        # check for values that must have been set by previous methods
        for val_name in (
            "query_start",
            "query_end",
            "hit_start",
            "hit_end",
            "query_strand",
            "hit_strand",
        ):
            assert val_name in hsp, hsp
        # get the alignment rows
        # and stitch them so we have the full sequences in single strings
        raw_aln_blocks, vulgar_comp = self._read_alignment()
        # cmbn_rows still has split codon markers (curly braces)
        cmbn_rows = _stitch_rows(raw_aln_blocks)
        row_dict = _get_row_dict(len(cmbn_rows), qresult["model"])
        # get the sequence blocks
        has_ner = "NER" in qresult["model"].upper()
        seq_coords = _get_block_coords(cmbn_rows, row_dict, has_ner)
        tmp_seq_blocks = _get_blocks(cmbn_rows, seq_coords, row_dict)
        # get split codon temp coords for later use
        # this result in pairs of base movement for both ends of each row
        scodon_moves = _get_scodon_moves(tmp_seq_blocks)
        # remove the split codon markers
        seq_blocks = _clean_blocks(tmp_seq_blocks)
        # adjust strands (map the raw strand strings to -1/0/1 ints)
        hsp["query_strand"] = _STRAND_MAP[hsp["query_strand"]]
        hsp["hit_strand"] = _STRAND_MAP[hsp["hit_strand"]]
        # cast coords into ints
        hsp["query_start"] = int(hsp["query_start"])
        hsp["query_end"] = int(hsp["query_end"])
        hsp["hit_start"] = int(hsp["hit_start"])
        hsp["hit_end"] = int(hsp["hit_end"])
        # cast score into ints
        hsp["score"] = int(hsp["score"])
        # set sequences
        hsp["query"] = [x["query"] for x in seq_blocks]
        hsp["hit"] = [x["hit"] for x in seq_blocks]
        hsp["aln_annotation"] = {}
        # set the molecule type
        # currently only limited to models with protein queries
        if (
            "protein2" in qresult["model"]
            or "coding2" in qresult["model"]
            or "2protein" in qresult["model"]
        ):
            hsp["molecule_type"] = "protein"
        # get the annotations if they exist
        for annot_type in ("similarity", "query_annotation", "hit_annotation"):
            try:
                hsp["aln_annotation"][annot_type] = [x[annot_type] for x in seq_blocks]
            except KeyError:
                pass
        # use vulgar coordinates if vulgar line is present and return
        # if vulgar_comp is not None:
        #     hsp = parse_vulgar_comp(hsp, vulgar_comp)
        #     return {'qresult': qresult, 'hit': hit, 'hsp': hsp}
        # otherwise we need to get the coordinates from the alignment
        # get the intervening blocks first, so we can use them
        # to adjust the coordinates
        if not has_ner:
            # get intervening coordinates and blocks, only if model is not ner
            # ner models have a much more simple coordinate calculation
            inter_coords = _get_inter_coords(seq_coords)
            inter_blocks = _get_blocks(cmbn_rows, inter_coords, row_dict)
            # returns a three-component tuple of intron lengths
            # first two component filled == intron in hit and query
            # last component filled == intron in hit or query
            raw_inter_lens = re.findall(_RE_EXON_LEN, cmbn_rows[row_dict["midline"]])
        # compute start and end coords for each block
        for seq_type in ("query", "hit"):
            # ner blocks and intron blocks require different adjustments
            if not has_ner:
                opp_type = "hit" if seq_type == "query" else "query"
                inter_lens = _comp_intron_lens(seq_type, inter_blocks, raw_inter_lens)
            else:
                # for NER blocks, the length of the inter-fragment gaps is
                # written on the same strand, so opp_type is seq_type
                opp_type = seq_type
                inter_lens = [
                    int(x)
                    for x in re.findall(_RE_NER_LEN, cmbn_rows[row_dict[seq_type]])
                ]
            # check that inter_lens's length is len opp_type block - 1
            if len(inter_lens) != len(hsp[opp_type]) - 1:
                raise ValueError(
                    "Length mismatch: %r vs %r"
                    % (len(inter_lens), len(hsp[opp_type]) - 1)
                )
            # fill the hsp query and hit coordinates
            hsp["%s_ranges" % opp_type] = _comp_coords(hsp, opp_type, inter_lens)
            # and fill the split codon coordinates, if model != ner
            # can't do this in the if-else clause above since we need to
            # compute the ranges first
            if not has_ner:
                hsp["%s_split_codons" % opp_type] = _comp_split_codons(
                    hsp, opp_type, scodon_moves
                )
        # now that we've finished parsing coords, we can set the hit and start
        # coord according to Biopython's convention (start <= end)
        for seq_type in ("query", "hit"):
            if hsp["%s_strand" % seq_type] == -1:
                n_start = "%s_start" % seq_type
                n_end = "%s_end" % seq_type
                hsp[n_start], hsp[n_end] = hsp[n_end], hsp[n_start]
        return {"qresult": qresult, "hit": hit, "hsp": hsp}

    def _read_alignment(self):
        """Read the raw alignment block strings, returns them in a list (PRIVATE).

        Consumes lines from ``self.handle`` (via ``self.line``) until the
        next ``C4 Alignment:`` marker or EOF.  Returns a two-tuple of the
        collected raw alignment blocks and, if a ``vulgar`` line was seen,
        its component string (otherwise ``None``).
        """
        raw_aln_blocks = []
        # flag to check whether we're in an alignment row
        in_aln_row = False
        # flag for vulgar line, if present, we can parse coordinates from it
        vulgar_comp = None
        while True:
            match = re.search(_RE_ALN_ROW, self.line.strip())
            # if we have a match, set flags and values
            # (start column and width of the sequence within the row)
            if match and not in_aln_row:
                start_idx = self.line.index(match.group(1))
                row_len = len(match.group(1))
                in_aln_row = True
                raw_aln_block = []
            # if we're in an alignment row, grab the sequence
            # (all rows of a block share the first row's column span)
            if in_aln_row:
                raw_aln_block.append(self.line[start_idx : start_idx + row_len])
            # reset flags and values if the line matches, we're in an alignment
            # row, and there are more than 1 line in rows
            if match and in_aln_row and len(raw_aln_block) > 1:
                raw_aln_blocks.append(raw_aln_block)
                start_idx = None
                row_len = None
                in_aln_row = False
            self.line = self.handle.readline()
            # try to parse vulgar line if present
            if self.line.startswith("vulgar"):
                vulgar = re.search(_RE_VULGAR, self.line)
                vulgar_comp = vulgar.group(10)
            if not self.line or self.line.startswith(self._ALN_MARK):
                # HACK: this is so that the parse_qresult method does not
                # yield the objects before appending the last HSP. We are doing
                # this to keep the parser compatible with outputs without
                # human-readable alignment outputs. This also relies on the
                # fact that repeated readline() always returns '' on EOF.
                if not self.line:
                    self.line = "mock"
                break
        return raw_aln_blocks, vulgar_comp
class ExonerateTextIndexer(_BaseExonerateIndexer):
    """Indexer class for Exonerate plain text."""

    # parser class used to materialize indexed records
    _parser = ExonerateTextParser
    # byte marker that starts each alignment (and thus each indexed record)
    _query_mark = b"C4 Alignment"

    def get_qresult_id(self, pos):
        """Return the query ID from the nearest "Query:" line.

        ``pos`` is a byte offset just past a ``C4 Alignment`` marker; the
        handle is scanned forward from there until a ``Query:`` line is
        found, and the ID is extracted from that line.
        """
        handle = self._handle
        handle.seek(pos)
        sentinel = b"Query:"
        while True:
            line = handle.readline().strip()
            if line.startswith(sentinel):
                break
            if not line:
                # readline() returning b"" means EOF was reached before any
                # "Query:" line.  NOTE(review): a blank line inside the
                # header would also trigger this after strip() — presumably
                # none occur before "Query:" in exonerate output; confirm.
                raise StopIteration
        qid, desc = _parse_hit_or_query_line(line.decode())
        return qid

    def get_raw(self, offset):
        """Return the raw string of a QueryResult object from the given offset.

        Accumulates bytes from ``offset`` up to (excluding) the first
        ``C4 Alignment`` marker belonging to a different query ID, or EOF.
        """
        handle = self._handle
        handle.seek(offset)
        qresult_key = None
        qresult_raw = b""
        while True:
            line = handle.readline()
            if not line:
                break
            elif line.startswith(self._query_mark):
                cur_pos = handle.tell()
                if qresult_key is None:
                    qresult_key = self.get_qresult_id(cur_pos)
                else:
                    curr_key = self.get_qresult_id(cur_pos)
                    # a new query ID means the current record has ended
                    if curr_key != qresult_key:
                        break
                # get_qresult_id moved the handle; rewind to just after the
                # marker line before continuing to accumulate
                handle.seek(cur_pos)
            qresult_raw += line
        return qresult_raw
# When executed directly (not imported as a module), run this module's
# doctests via Biopython's doctest helper.
if __name__ == "__main__":
    from Bio._utils import run_doctest

    run_doctest()