"""Helpers for handling fasta files.""" from __future__ import annotations from pathlib import Path from Bio import SeqIO from Bio.SeqRecord import SeqRecord from folding_studio_data_models.exceptions import FastaValidationError def validate_fasta( fasta_input: str | Path, allow_multimer: bool = True, max_aa_length: int | None = None, str_output: bool = False, ) -> SeqRecord | list[SeqRecord] | str: """Validate a fasta content. Args: fasta_input (str | Path): Input fasta. allow_multimer (bool, optional): Allow mutlimer in the fasta representation. Defaults to True. max_aa_length (int | None, optional): Max AA lenght supported. Defaults to None. Raises: FastaValidationError: If no sequence found in the FASTA content. FastaValidationError: If one or more sequence has 0 residue. FastaValidationError: If only monomer are supported but a multimer was given. FastaValidationError: If unuspported sequence lenght in FASTA content Returns: SeqRecord | list[SeqRecord]: Unique SeqRecord if `allow_multimer` set to False, otherwise a list of SeqRecord. """ records: list[SeqRecord] = list(SeqIO.parse(fasta_input, "fasta")) if not records: raise FastaValidationError("No sequence found in the FASTA content.") elif any(len(record.seq) == 0 for record in records): raise FastaValidationError("One or more sequence has 0 residue.") elif not allow_multimer and len(records) > 1: raise FastaValidationError( "Only monomer are supported but a multimer was given." ) elif max_aa_length is not None and any( len(record.seq) > max_aa_length for record in records ): raise FastaValidationError( "Unuspported sequence lenght in FASTA content. " f"Max supported sequence lenght is {max_aa_length}AA." ) if str_output: return "\n".join( f">{record.description}\n{str(record.seq)}" for record in records ) elif allow_multimer: return records return records[0]