|
"""Helpers for handling fasta files.""" |
|
|
|
from __future__ import annotations |
|
|
|
from pathlib import Path |
|
|
|
from Bio import SeqIO |
|
from Bio.SeqRecord import SeqRecord |
|
from folding_studio_data_models.exceptions import FastaValidationError |
|
|
|
|
|
def validate_fasta(
    fasta_input: str | Path,
    allow_multimer: bool = True,
    max_aa_length: int | None = None,
    str_output: bool = False,
) -> SeqRecord | list[SeqRecord] | str:
    """Parse a FASTA input and check it against the supported constraints.

    The input is handed as-is to ``SeqIO.parse(..., "fasta")``; every parsed
    record is then checked, in order, for: presence of at least one record,
    non-empty sequences, monomer-only restriction (when requested) and
    maximum sequence length (when requested).

    Args:
        fasta_input (str | Path): Input fasta, forwarded unchanged to
            ``SeqIO.parse``.
        allow_multimer (bool, optional): Whether more than one record is
            accepted. Defaults to True.
        max_aa_length (int | None, optional): Maximum sequence length allowed
            for any single record. ``None`` disables the check. Defaults to
            None.
        str_output (bool, optional): If True, return the validated records
            re-serialized as a FASTA-formatted string (one
            ``>description`` line followed by one sequence line per record)
            instead of SeqRecord objects. Defaults to False.

    Raises:
        FastaValidationError: If no sequence is found in the FASTA content.
        FastaValidationError: If one or more sequences have 0 residues.
        FastaValidationError: If only monomers are supported but a multimer
            was given.
        FastaValidationError: If a sequence is longer than ``max_aa_length``.

    Returns:
        SeqRecord | list[SeqRecord] | str: A string when ``str_output`` is
            True (regardless of ``allow_multimer``); otherwise the list of
            all records when ``allow_multimer`` is True, or the single
            record when it is False.
    """
    parsed: list[SeqRecord] = list(SeqIO.parse(fasta_input, "fasta"))

    # Each check raises, so independent guard clauses are evaluated in the
    # same order as a chained if/elif would be.
    if len(parsed) == 0:
        raise FastaValidationError("No sequence found in the FASTA content.")

    sequence_lengths = [len(entry.seq) for entry in parsed]

    if 0 in sequence_lengths:
        raise FastaValidationError("One or more sequence has 0 residue.")

    if len(parsed) > 1 and not allow_multimer:
        raise FastaValidationError(
            "Only monomer are supported but a multimer was given."
        )

    if max_aa_length is not None:
        for length in sequence_lengths:
            if length > max_aa_length:
                raise FastaValidationError(
                    "Unuspported sequence lenght in FASTA content. "
                    f"Max supported sequence lenght is {max_aa_length}AA."
                )

    if str_output:
        # Rebuild a minimal FASTA text: header line + single sequence line.
        fasta_entries = [
            f">{entry.description}\n{entry.seq!s}" for entry in parsed
        ]
        return "\n".join(fasta_entries)

    # With multimers disallowed, validation above guarantees exactly one
    # record, so returning the first one is returning the only one.
    return parsed if allow_multimer else parsed[0]
|
|