File size: 2,134 Bytes
44459bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
"""Helpers for handling fasta files."""

from __future__ import annotations

from pathlib import Path

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from folding_studio_data_models.exceptions import FastaValidationError


def validate_fasta(
    fasta_input: str | Path,
    allow_multimer: bool = True,
    max_aa_length: int | None = None,
    str_output: bool = False,
) -> SeqRecord | list[SeqRecord] | str:
    """Parse a FASTA input and check it against basic sanity rules.

    The checks run in a fixed order (at least one record, no empty sequence,
    monomer constraint, per-sequence length limit) and the first one that
    fails raises.

    Args:
        fasta_input (str | Path): FASTA source, handed as-is to
            `Bio.SeqIO.parse` with the "fasta" format.
        allow_multimer (bool, optional): Accept inputs holding more than one
            record. Defaults to True.
        max_aa_length (int | None, optional): Upper bound applied to the
            length of each individual sequence. None disables the check.
            Defaults to None.
        str_output (bool, optional): Return the validated records rendered
            as FASTA text instead of SeqRecord objects. Defaults to False.

    Raises:
        FastaValidationError: If no record could be parsed.
        FastaValidationError: If at least one record has an empty sequence.
        FastaValidationError: If `allow_multimer` is False and more than one
            record was parsed.
        FastaValidationError: If `max_aa_length` is set and at least one
            sequence is longer than it.

    Returns:
        SeqRecord | list[SeqRecord] | str: When `str_output` is True, a
            string with one ">description" line followed by one sequence
            line per record, joined by newlines. Otherwise the list of
            records if `allow_multimer` is True, else the single parsed
            record.
    """
    parsed: list[SeqRecord] = list(SeqIO.parse(fasta_input, "fasta"))
    # Measure every sequence once; both the emptiness and the size checks
    # below rely on these lengths.
    lengths = [len(entry.seq) for entry in parsed]

    if len(parsed) == 0:
        raise FastaValidationError("No sequence found in the FASTA content.")

    if 0 in lengths:
        raise FastaValidationError("One or more sequence has 0 residue.")

    if len(parsed) > 1 and not allow_multimer:
        raise FastaValidationError(
            "Only monomer are supported but a multimer was given."
        )

    if max_aa_length is not None:
        too_long = any(size > max_aa_length for size in lengths)
        if too_long:
            raise FastaValidationError(
                "Unuspported sequence lenght in FASTA content. "
                f"Max supported sequence lenght is {max_aa_length}AA."
            )

    if str_output:
        # Re-serialise as alternating header / sequence lines.
        fasta_lines: list[str] = []
        for entry in parsed:
            fasta_lines.extend((f">{entry.description}", str(entry.seq)))
        return "\n".join(fasta_lines)

    # In monomer mode the checks above guarantee exactly one record.
    return parsed if allow_multimer else parsed[0]