jfaustin's picture
add dockerfile and folding studio cli
44459bb
raw
history blame
2.13 kB
"""Helpers for handling fasta files."""
from __future__ import annotations
from pathlib import Path
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from folding_studio_data_models.exceptions import FastaValidationError
def validate_fasta(
    fasta_input: str | Path,
    allow_multimer: bool = True,
    max_aa_length: int | None = None,
    str_output: bool = False,
) -> SeqRecord | list[SeqRecord] | str:
    """Parse a FASTA input and check it against the supported constraints.

    The input is read with ``SeqIO.parse(fasta_input, "fasta")``. The checks
    then run in a fixed order and the first one that fails raises.

    Args:
        fasta_input (str | Path): FASTA source handed to ``SeqIO.parse``.
        allow_multimer (bool, optional): Whether more than one record is
            accepted. Also selects the return shape when ``str_output`` is
            False. Defaults to True.
        max_aa_length (int | None, optional): Upper bound on the length of
            each individual sequence. No bound is enforced when None.
            Defaults to None.
        str_output (bool, optional): Return the validated records rendered
            as FASTA text instead of ``SeqRecord`` objects. Defaults to False.

    Raises:
        FastaValidationError: If no record is found in the input.
        FastaValidationError: If at least one record has an empty sequence.
        FastaValidationError: If several records are found while
            ``allow_multimer`` is False.
        FastaValidationError: If at least one sequence is longer than
            ``max_aa_length``.

    Returns:
        SeqRecord | list[SeqRecord] | str: When ``str_output`` is True, a
        string with one ``>description`` line followed by one sequence line
        per record. Otherwise the list of records when ``allow_multimer`` is
        True, or the single record when it is False.
    """
    entries: list[SeqRecord] = list(SeqIO.parse(fasta_input, "fasta"))

    # Guard clauses, in the same priority order as the documented errors.
    if not entries:
        raise FastaValidationError("No sequence found in the FASTA content.")

    if any(len(entry.seq) == 0 for entry in entries):
        raise FastaValidationError("One or more sequence has 0 residue.")

    if not allow_multimer and len(entries) > 1:
        raise FastaValidationError(
            "Only monomer are supported but a multimer was given."
        )

    if max_aa_length is not None:
        # The limit applies to each chain on its own, not to the total length.
        exceeds_limit = any(len(entry.seq) > max_aa_length for entry in entries)
        if exceeds_limit:
            raise FastaValidationError(
                "Unuspported sequence lenght in FASTA content. "
                f"Max supported sequence lenght is {max_aa_length}AA."
            )

    if str_output:
        # Each record becomes a two-line chunk: header line, then the sequence.
        rendered = [
            f">{entry.description}\n{str(entry.seq)}" for entry in entries
        ]
        return "\n".join(rendered)

    # With multimers disallowed, the check above guarantees exactly one record.
    return entries if allow_multimer else entries[0]