Deleting autotokenizer
- config_utils.py +0 -769
- general_utils.py +0 -309
- sequtils.py +0 -980
- tokenizer.py +0 -363
- tokenizer_config.json +1 -7
config_utils.py
DELETED
@@ -1,769 +0,0 @@
# Config utils
import yaml
import pathlib
from os.path import join
import os
import numpy as np
import torch
import argparse
from multiprocessing import cpu_count
from transformers import TrainingArguments
from copy import deepcopy
import re
import sys


def add_hf_args_to_parser(parser):
    # Create a temporary TrainingArguments to access default values and descriptions
    hf_args = TrainingArguments(output_dir="/tmp")  # Dummy output_dir
    # Iterate over all public attributes
    for attr in dir(hf_args):
        if not attr.startswith("_"):
            default = getattr(hf_args, attr)
            # You can add more sophisticated handling based on attribute types here
            if isinstance(default, (int, float, str, bool)):
                help_str = f"Auto-generated help for {attr}"
                parser.add_argument(f"--{attr}", type=type(default), default=default, help=help_str)

    return parser


class BaseConfig:
    """Base class for managing and validating configurations."""

    numpy_dtype_mapping = {1: np.int8,
                           2: np.int16,
                           8: np.int64,
                           4: np.int32}

    def __init__(self):
        super().__init__()

    def cast_to_expected_type(self, parameter_class: str, parameter_name: str, value: any) -> any:
        """
        Cast the given value to the expected type.

        :param parameter_class: The class/category of the parameter.
        :type parameter_class: str
        :param parameter_name: The name of the parameter.
        :type parameter_name: str
        :param value: The value to be cast.
        :type value: any
        :return: Value cast to the expected type.
        :rtype: any
        :raises ValueError: If casting fails.
        """
        expected_type = self.parameters[parameter_class][parameter_name]['type']

        if expected_type in ["integer", "int"]:
            try:
                return int(value)
            except ValueError:
                raise ValueError(f"Failed to cast value '{value}' to integer for parameter '{parameter_name}' in class '{parameter_class}'.")
        elif expected_type == "float":
            try:
                return float(value)
            except ValueError:
                raise ValueError(f"Failed to cast value '{value}' to float for parameter '{parameter_name}' in class '{parameter_class}'.")
        elif expected_type in ["string", "str"]:
            return str(value)
        elif expected_type in ["boolean", "bool"]:
            if isinstance(value, bool):
                return value
            elif str(value).lower() == "true":
                return True
            elif str(value).lower() == "false":
                return False
            else:
                raise ValueError(f"Failed to cast value '{value}' to boolean for parameter '{parameter_name}' in class '{parameter_class}'.")
        elif expected_type == "type":
            # For this type, we will simply return the value without casting.
            # It assumes the configuration provides valid Python types.
            return value
        elif expected_type == "list":
            if isinstance(value, list):
                return value
            else:
                raise ValueError(f"Failed to validate value '{value}' as a list for parameter '{parameter_name}' in class '{parameter_class}'.")
        elif expected_type == "tuple":
            if isinstance(value, tuple):
                return value
            else:
                raise ValueError(f"Failed to validate value '{value}' as a tuple for parameter '{parameter_name}' in class '{parameter_class}'.")
        elif expected_type == "set":
            if isinstance(value, set):
                return value
            else:
                raise ValueError(f"Failed to validate value '{value}' as a set for parameter '{parameter_name}' in class '{parameter_class}'.")
        elif expected_type == "dict":
            if isinstance(value, dict):
                return value
            else:
                raise ValueError(f"Failed to validate value '{value}' as a dict for parameter '{parameter_name}' in class '{parameter_class}'.")
        else:
            raise ValueError(f"Unknown expected type '{expected_type}' for parameter '{parameter_name}' in class '{parameter_class}'.")

    def get_parameter(self, parameter_class: str, parameter_name: str) -> any:
        """
        Retrieve the default value of a specified parameter.

        :param parameter_class: The class/category of the parameter (e.g., 'segmentation').
        :type parameter_class: str
        :param parameter_name: The name of the parameter.
        :type parameter_name: str
        :return: Default value of the parameter, cast to the expected type.
        :rtype: any
        """
        default_value = self.parameters[parameter_class][parameter_name]['default']
        return self.cast_to_expected_type(parameter_class, parameter_name, default_value)

    def validate_type(self, parameter_class: str, parameter_name: str, value: any) -> bool:
        """
        Validate the type of a given value against the expected type.

        :param parameter_class: The class/category of the parameter.
        :type parameter_class: str
        :param parameter_name: The name of the parameter.
        :type parameter_name: str
        :param value: The value to be validated.
        :type value: any
        :return: True if the value is of the expected type, otherwise False.
        :rtype: bool
        """
        expected_type = self.parameters[parameter_class][parameter_name]['type']

        if expected_type == "integer" and not isinstance(value, int):
            return False
        elif expected_type == "float" and not isinstance(value, float):
            return False
        elif expected_type == "string" and not isinstance(value, str):
            return False
        else:
            return True

    def validate_value(self, parameter_class: str, parameter_name: str, value: any) -> bool:
        """
        Validate the value of a parameter against its constraints.

        :param parameter_class: The class/category of the parameter.
        :type parameter_class: str
        :param parameter_name: The name of the parameter.
        :type parameter_name: str
        :param value: The value to be validated.
        :type value: any
        :return: True if the value meets the constraints, otherwise False.
        :rtype: bool
        """
        constraints = self.parameters[parameter_class][parameter_name].get('constraints', {})

        if 'options' in constraints and value not in constraints['options']:
            return False
        if 'min' in constraints and value < constraints['min']:
            return False
        if 'max' in constraints and value > constraints['max']:
            return False
        return True

    def validate(self, parameter_class: str, parameter_name: str, value: any):
        """
        Validate both the type and value of a parameter.

        :param parameter_class: The class/category of the parameter.
        :type parameter_class: str
        :param parameter_name: The name of the parameter.
        :type parameter_name: str
        :param value: The value to be validated.
        :type value: any
        :raises TypeError: If the value is not of the expected type.
        :raises ValueError: If the value does not meet the parameter's constraints.
        """
        if not self.validate_type(parameter_class, parameter_name, value):
            raise TypeError(f"Invalid type for {parameter_name} for parameter class '{parameter_class}'. Expected {self.parameters[parameter_class][parameter_name]['type']}.")

        if not self.validate_value(parameter_class, parameter_name, value):
            raise ValueError(f"Invalid value for {parameter_name} for parameter class '{parameter_class}'. Constraints: {self.parameters[parameter_class][parameter_name].get('constraints', {})}.")

    def describe(self, parameter_class: str, parameter_name: str) -> str:
        """
        Retrieve the description of a parameter.

        :param parameter_class: The class/category of the parameter.
        :type parameter_class: str
        :param parameter_name: The name of the parameter.
        :type parameter_name: str
        :return: Description of the parameter.
        :rtype: str
        """
        return self.parameters[parameter_class][parameter_name]['description']

    @staticmethod
    def rename_non_unique_parameters(config: dict) -> tuple[dict, dict, dict]:
        """
        Rename parameters in the configuration to ensure uniqueness across different groups.

        This method identifies parameters with the same name across different groups and renames them
        by prefixing the group name. This is to prevent conflicts when parameters are used in a context
        where the group name is not specified.

        :param config: A dictionary where each key is a group name and each value is a dict
                       of parameters for that group.
        :type config: dict

        :return: A tuple containing:
                 - renamed_config: A dictionary with the same structure as the input, but with non-unique parameter
                   names renamed. The structure is {group_name: {param_name: param_info}}.
                 - cmd_argument2group_param: A dictionary mapping the new parameter names to their original group
                   and parameter name. The structure is {new_param_name: [group_name, original_param_name]}.
                 - group2param2cmdarg: A dictionary mapping each group to a dict that maps the original parameter
                   names to the new parameter names. The structure is {group_name: {original_param_name: new_param_name}}.
        :rtype: tuple[dict, dict, dict]
        """

        # Identify non-unique parameter names
        param_counts = {}
        for group_name, parameters in config.items():
            for param_name in parameters.keys():
                param_counts[param_name] = param_counts.get(param_name, 0) + 1

        non_unique_params = {param for param, count in param_counts.items() if count > 1}

        cmd_argument2group_param = {}
        group2param2cmdarg = {}
        for group_name, parameters in config.items():
            group2param2cmdarg[group_name] = {}
            for param_name in parameters.keys():
                group2param2cmdarg[group_name][param_name] = param_name

        # Rename only the non-unique parameters
        renamed_config = {}
        for group_name, parameters in config.items():
            renamed_group = {}
            for param_name, param_info in parameters.items():
                new_param_name = f"{group_name}_{param_name}" if param_name in non_unique_params else param_name
                cmd_argument2group_param[new_param_name] = [group_name, param_name]
                group2param2cmdarg[group_name][param_name] = new_param_name

                renamed_group[new_param_name] = param_info
            renamed_config[group_name] = renamed_group
        return renamed_config, cmd_argument2group_param, group2param2cmdarg

    @staticmethod
    def create_parser(config: dict) -> argparse.ArgumentParser:
        """
        Create and configure an argparse parser based on the given configuration.

        This method sets up a command-line argument parser with arguments defined in the configuration.
        Each top-level key in the configuration represents a group of related arguments.

        :param config: A dictionary where each key is a group name and each value is a dict
                       of parameters for that group. Each parameter's information should include
                       its type, default value, and help description.
        :type config: dict

        :return: Configured argparse.ArgumentParser instance with arguments added as specified
                 in the configuration.
        :rtype: argparse.ArgumentParser

        :raises ValueError: If an unknown or unsupported type is specified for a parameter.
        """
        parser = argparse.ArgumentParser(description="Command-line parser for project settings")
        # Mapping of type strings to Python types
        type_mapping = {
            'integer': int,
            'int': int,
            'float': float,
            'string': str,
            'str': str,
            'bool': bool,
            'boolean': bool,
            'list': list
            # Complex types like 'dict' and 'type' are intentionally excluded
        }

        # List of types to handle as strings
        handle_as_string = ['dict', 'type', 'list']
        excluded_parameters = ['vocabmap', 'np_tokentype', 'pretraining_dataset_data', 'optim']

        for group_name, parameters in config.items():
            group = parser.add_argument_group(group_name)
            for param_name, param_info in parameters.items():
                param_type_str = param_info['type']
                description = param_info['description']
                escaped_description = re.sub(r"([^%])%", r"\1%%", description)
                if param_name in excluded_parameters:
                    continue
                if param_type_str in handle_as_string:
                    # Handle these types as strings in argparse, conversion will be done later in the program
                    param_type = str
                elif param_type_str not in type_mapping:
                    raise ValueError(f"Unknown or unsupported type '{param_type_str}' for parameter '{param_name}'")
                else:
                    param_type = type_mapping[param_type_str]

                #print(f'The current type is: {param_type}')
                default_param = param_info['default']
                description = param_info['description']
                kwargs = {
                    'type': param_type,
                    'default': param_info['default'],
                    'help': escaped_description
                }  # Add constraints if they exist
                """
                if 'constraints' in param_info:
                    constraints = param_info['constraints']
                    if 'min' in constraints:
                        kwargs['type'] = lambda x: eval(param_type_str)(x) if eval(param_type_str)(x) >= constraints['min'] else sys.exit(f"Value for {param_name} must be at least {constraints['min']}")
                    if 'max' in constraints:
                        kwargs['type'] = lambda x: eval(param_type_str)(x) if eval(param_type_str)(x) <= constraints['max'] else sys.exit(f"Value for {param_name} must be at most {constraints['max']}")
                    if 'options' in constraints:
                        kwargs['choices'] = constraints['options']
                """
                # Add argument to the group
                group.add_argument(f'--{param_name}', **kwargs)
        #parser = add_hf_args_to_parser(parser)

        return parser


class SeqConfig(BaseConfig):
    """Class to manage and validate sequence processing configurations."""

    def __init__(self):
        super().__init__()
        self.default_seq_config_file = self._get_default_sequence_processing_config_file()
        with open(self.default_seq_config_file, 'r') as file:
            self.parameters = yaml.safe_load(file)

        # Some postprocessing steps
        self.parameters['tokenization']['shift']['constraints']['max'] = self.parameters['tokenization']['kmer']['default'] - 1
        # If the k-mer parameter is updated later, this constraint should be recomputed accordingly.

        self.get_and_set_segmentation_parameters()
        self.get_and_set_tokenization_parameters()
        self.get_and_set_computational_parameters()

    def _get_default_sequence_processing_config_file(self) -> str:
        """
        Retrieve the default sequence processing configuration file.

        :return: Path to the configuration file.
        :rtype: str
        """
        current_path = pathlib.Path(__file__).parent
        prokbert_seq_config_file = join(current_path, 'configs', 'sequence_processing.yaml')
        self.current_path = current_path

        try:
            # Attempt to read the environment variable
            prokbert_seq_config_file = os.environ['SEQ_CONFIG_FILE']
        except KeyError:
            # Handle the case when the environment variable is not found
            pass
            # print("SEQ_CONFIG_FILE environment variable has not been set. Using default value: {0}".format(prokbert_seq_config_file))
        return prokbert_seq_config_file

    def get_and_set_segmentation_parameters(self, parameters: dict = {}) -> dict:
        """
        Retrieve and validate the provided parameters for segmentation.

        :param parameters: A dictionary of parameters to be validated.
        :type parameters: dict
        :return: A dictionary of validated segmentation parameters.
        :rtype: dict
        :raises ValueError: If an invalid segmentation parameter is provided.
        """
        segmentation_params = {k: self.get_parameter('segmentation', k) for k in self.parameters['segmentation']}

        for param, param_value in parameters.items():
            if param not in segmentation_params:
                raise ValueError(f"The provided {param} is an INVALID segmentation parameter! The valid parameters are: {list(segmentation_params.keys())}")
            self.validate('segmentation', param, param_value)
            segmentation_params[param] = param_value
        self.segmentation_params = segmentation_params

        return segmentation_params

    def get_and_set_tokenization_parameters(self, parameters: dict = {}) -> dict:
        # Update the other parameters if necessary, i.e. if the k-mer has been changed, then the shift is updated and we run a parameter check at the end

        tokenization_params = {k: self.get_parameter('tokenization', k) for k in self.parameters['tokenization']}
        for param, param_value in parameters.items():
            if param not in tokenization_params:
                raise ValueError(f"The provided {param} is an INVALID tokenization parameter! The valid parameters are: {list(tokenization_params.keys())}")
            self.validate('tokenization', param, param_value)
            tokenization_params[param] = param_value

        # Load and check the vocab file. It is assumed that it is an ordered dictionary
        vocabfile = tokenization_params['vocabfile']
        act_kmer = tokenization_params['kmer']
        if vocabfile == 'auto':
            vocabfile_path = join(self.current_path, 'data/prokbert_vocabs/', f'prokbert-base-dna{act_kmer}', 'vocab.txt')
            tokenization_params['vocabfile'] = vocabfile_path
        else:
            vocabfile_path = vocabfile
        with open(vocabfile_path) as vocabfile_in:
            vocabmap = {line.strip(): i for i, line in enumerate(vocabfile_in)}
        tokenization_params['vocabmap'] = vocabmap

        # Loading the vocab
        self.tokenization_params = tokenization_params
        return tokenization_params

    def get_and_set_computational_parameters(self, parameters: dict = {}) -> dict:
        """ Reading and validating the computational parameters
        """

        computational_params = {k: self.get_parameter('computation', k) for k in self.parameters['computation']}
        core_count = cpu_count()

        if computational_params['cpu_cores_for_segmentation'] == -1:
            computational_params['cpu_cores_for_segmentation'] = core_count

        if computational_params['cpu_cores_for_tokenization'] == -1:
            computational_params['cpu_cores_for_tokenization'] = core_count

        for param, param_value in parameters.items():
            if param not in computational_params:
                raise ValueError(f"The provided {param} is an INVALID computation parameter! The valid parameters are: {list(computational_params.keys())}")
            self.validate('computation', param, param_value)
            computational_params[param] = param_value

        np_tokentype = SeqConfig.numpy_dtype_mapping[computational_params['numpy_token_integer_prec_byte']]
        computational_params['np_tokentype'] = np_tokentype
        self.computational_params = computational_params
        return computational_params

    def get_maximum_segment_length_from_token_count_from_params(self):
        """Calculating the maximum length of the segment from the token count """
        max_token_counts = self.tokenization_params['token_limit']
        shift = self.tokenization_params['shift']
        kmer = self.tokenization_params['kmer']
        return self.get_maximum_segment_length_from_token_count(max_token_counts, shift, kmer)

    def get_maximum_token_count_from_max_length_from_params(self):
        """Calculating the maximum token count from the maximum segment length """

        max_segment_length = self.tokenization_params['max_segment_length']
        shift = self.tokenization_params['shift']
        kmer = self.tokenization_params['kmer']
        max_token_count = self.get_maximum_token_count_from_max_length(max_segment_length, shift, kmer)

        return max_token_count

    def get_cmd_arg_parser(self) -> tuple[argparse.ArgumentParser, dict, dict]:
        """
        Create and return a command-line argument parser for ProkBERT configurations, along with mappings
        between command-line arguments and configuration parameters.

        This method combines sequence configuration parameters with training configuration parameters
        and sets up a command-line argument parser using these combined settings. It ensures that parameter
        names are unique across different groups by renaming any non-unique parameters.

        :return: A tuple containing:
                 - Configured argparse.ArgumentParser instance for handling ProkBERT configurations.
                 - A dictionary mapping new command-line arguments to their original group and parameter name.
                 - A dictionary mapping each group to a dict that maps the original parameter names
                   to the new command-line argument names.
        :rtype: tuple[argparse.ArgumentParser, dict, dict]

        Note: The method assumes that the configuration parameters for training and sequence configuration
        are available within the class.
        """
        combined_params = deepcopy(self.parameters)
        combined_params['Sequence'] = {}
        combined_params['Sequence']['fasta_file_dir'] = {'default': 'None',
                                                         'description': 'Directory where the input fasta file are located for the pretraining',
                                                         'type': 'string'}
        combined_params['Sequence']['out'] = {'default': 'pretrain.h5',
                                              'description': 'Output path',
                                              'type': 'string'}

        combined_params, cmd_argument2group_param, group2param2cmdarg = BaseConfig.rename_non_unique_parameters(combined_params)

        parser = BaseConfig.create_parser(combined_params)
        return parser, cmd_argument2group_param, group2param2cmdarg

    @staticmethod
    def get_maximum_segment_length_from_token_count(max_token_counts, shift, kmer):
        """Calculates how long a sequence can be covered by the given token count
        """

        max_segment_length = (max_token_counts - 3) * shift + kmer
        return max_segment_length

    @staticmethod
    def get_maximum_token_count_from_max_length(max_segment_length, shift, kmer):
        """Calculates how many tokens are needed to cover a segment of the given length
        """
        max_token_count = int(np.ceil((max_segment_length - kmer) / shift + 3))
        return max_token_count


class ProkBERTConfig(BaseConfig):
    """Class to manage and validate pretraining configurations."""

    torch_dtype_mapping = {1: torch.uint8,
                           2: torch.int16,
                           8: torch.int64,
                           4: torch.int32}

    def __init__(self):
        super().__init__()

        self.default_pretrain_config_file = self._get_default_pretrain_config_file()
        with open(self.default_pretrain_config_file, 'r') as file:
            self.parameters = yaml.safe_load(file)

        # Load and validate each parameter set
        self.data_collator_params = self.get_set_parameters('data_collator')
        self.model_params = self.get_set_parameters('model')
        self.dataset_params = self.get_set_parameters('dataset')
        self.pretraining_params = self.get_set_parameters('pretraining')
        self.finetuning_params = self.get_set_parameters('finetuning')
        # Getting the sequtils params as well

        self.def_seq_config = SeqConfig()
        self.segmentation_params = self.def_seq_config.get_and_set_segmentation_parameters(self.parameters['segmentation'])
        self.tokenization_params = self.def_seq_config.get_and_set_tokenization_parameters(self.parameters['tokenization'])
        self.computation_params = self.def_seq_config.get_and_set_computational_parameters(self.parameters['computation'])

        self.default_torchtype = ProkBERTConfig.torch_dtype_mapping[self.computation_params['numpy_token_integer_prec_byte']]

        hf_training_args = TrainingArguments("working_dir")
        self.hf_training_args_dict = hf_training_args.to_dict()

    def _get_default_pretrain_config_file(self) -> str:
        """
        Retrieve the default pretraining configuration file.

        :return: Path to the configuration file.
        :rtype: str
        """
        current_path = pathlib.Path(__file__).parent
        pretrain_config_file = join(current_path, 'configs', 'pretraining.yaml')

        try:
            # Attempt to read the environment variable
            pretrain_config_file = os.environ['PRETRAIN_CONFIG_FILE']
        except KeyError:
            # Handle the case when the environment variable is not found
            pass
            # print(f"PRETRAIN_CONFIG_FILE environment variable has not been set. Using default value: {pretrain_config_file}")
        return pretrain_config_file

    def get_set_parameters(self, parameter_class: str, parameters: dict = {}) -> dict:
        """
        Retrieve and validate the provided parameters for a given parameter class.

        :param parameter_class: The class/category of the parameter (e.g., 'data_collator').
        :type parameter_class: str
        :param parameters: A dictionary of parameters to be validated.
        :type parameters: dict
        :return: A dictionary of validated parameters.
        :rtype: dict
        :raises ValueError: If an invalid parameter is provided.
        """
        class_params = {k: self.get_parameter(parameter_class, k) for k in self.parameters[parameter_class]}

        # First validate the default class parameters as well
        for param, param_value in class_params.items():
            self.validate(parameter_class, param, param_value)

        for param, param_value in parameters.items():
            if param not in class_params and (parameter_class != 'pretraining'):
                raise ValueError(f"The provided {param} is an INVALID {parameter_class} parameter! The valid parameters are: {list(class_params.keys())}")
            else:
                if parameter_class == 'pretraining' or parameter_class == 'finetuning':
                    if param in self.hf_training_args_dict or param in class_params:
                        if param in class_params:
                            self.validate(parameter_class, param, param_value)
                        class_params[param] = param_value
                    else:
                        raise ValueError(f"The provided {param} is an INVALID {parameter_class} parameter! In addition is not a valid training argument.")
                else:
                    self.validate(parameter_class, param, param_value)
                    class_params[param] = param_value

        return class_params

    def get_and_set_model_parameters(self, parameters: dict = {}) -> dict:
        """ Setting the model parameters """

        # Here we include the additional training arguments available for the trainer

        self.model_params = self.get_set_parameters('model', parameters)

        return self.model_params

    def get_and_set_dataset_parameters(self, parameters: dict = {}) -> dict:
        """ Setting the dataset parameters """

        self.dataset_params = self.get_set_parameters('dataset', parameters)

        return self.dataset_params

    def get_and_set_pretraining_parameters(self, parameters: dict = {}) -> dict:
        """ Setting the pretraining parameters """
        self.pretraining_params = self.get_set_parameters('pretraining', parameters)

        return self.pretraining_params

    def get_and_set_datacollator_parameters(self, parameters: dict = {}) -> dict:
        """ Setting the data collator parameters """
        self.data_collator_params = self.get_set_parameters('data_collator', parameters)
        return self.data_collator_params

    def get_and_set_segmentation_parameters(self, parameters: dict = {}) -> dict:
        self.segmentation_params = self.def_seq_config.get_and_set_segmentation_parameters(parameters)

        return self.segmentation_params

    def get_and_set_tokenization_parameters(self, parameters: dict = {}) -> dict:
        self.tokenization_params = self.def_seq_config.get_and_set_tokenization_parameters(parameters)

        return self.tokenization_params

    def get_and_set_computation_params(self, parameters: dict = {}) -> dict:
        self.computation_params = self.def_seq_config.get_and_set_computational_parameters(parameters)
        return self.computation_params

    def get_and_set_finetuning_parameters(self, parameters: dict = {}) -> dict:
        """ Setting the finetuning parameters """

        # Here we include the additional training arguments available for the trainer

        self.finetuning_params = self.get_set_parameters('finetuning', parameters)

        return self.finetuning_params

    def get_inference_parameters(self):
        # Instantiate TrainingArguments to access default values
        hf_defaults = TrainingArguments(output_dir="/tmp")  # Dummy output_dir for initialization

        return {
            'inference': {
                'fastain': {
                    'default': None,
                    'type': 'str',
                    'description': 'Path to the input data for inference.'
                },
                'out': {
                    'default': None,
                    'type': 'str',
                    'description': 'Output path for the inference results.'
                },
                'per_device_eval_batch_size': {
                    'default': hf_defaults.per_device_eval_batch_size,
                    'type': 'int',
                    'description': 'Batch size per device during evaluation.'
                },
                'ddp_backend': {
                    'default': hf_defaults.ddp_backend,
                    'type': 'str',
                    'description': 'The backend to use for distributed training.'
                },
                'dataloader_drop_last': {
                    'default': hf_defaults.dataloader_drop_last,
                    'type': 'bool',
                    'description': 'Drop the last incomplete batch if it is not divisible by the batch size.'
                },
                'torch_compile': {
                    'default': getattr(hf_defaults, 'torch_compile', False),  # Fallback for compatibility
                    'type': 'bool',
                    'description': 'Whether to use TorchScript’s JIT compilation to accelerate training.'
                },
                'torch_compile_mode': {
                    'default': getattr(hf_defaults, 'torch_compile_mode', 'eager'),  # Fallback for compatibility
                    'type': 'str',
                    'description': 'The JIT mode to use for compiling PyTorch operations.'
                }
            }
        }

    def get_cmd_arg_parser(self, keyset=[]) -> tuple[argparse.ArgumentParser, dict, dict]:
        """
        Create and return a command-line argument parser for ProkBERT configurations, along with mappings
        between command-line arguments and configuration parameters.

        This method combines sequence configuration parameters with training configuration parameters
        and sets up a command-line argument parser using these combined settings. It ensures that parameter
        names are unique across different groups by renaming any non-unique parameters.

        :return: A tuple containing:
                 - Configured argparse.ArgumentParser instance for handling ProkBERT configurations.
                 - A dictionary mapping new command-line arguments to their original group and parameter name.
                 - A dictionary mapping each group to a dict that maps the original parameter names
                   to the new command-line argument names.
        :rtype: tuple[argparse.ArgumentParser, dict, dict]

        Note: The method assumes that the configuration parameters for training and sequence configuration
        are available within the class.
        """
        if len(keyset) == 0:
            trainin_conf_keysets = ['data_collator', 'model', 'dataset', 'pretraining', 'finetuning']
        else:
            trainin_conf_keysets = keyset

        inference_params = self.get_inference_parameters()
        seq_config = deepcopy(self.def_seq_config.parameters)
        default_other_config = deepcopy(self.parameters)
        combined_params = {}
        for k, v in seq_config.items():
            combined_params[k] = v
        for k in trainin_conf_keysets:
            combined_params[k] = default_other_config[k]
        combined_params.update(inference_params)
        combined_params, cmd_argument2group_param, group2param2cmdarg = BaseConfig.rename_non_unique_parameters(combined_params)
        parser = BaseConfig.create_parser(combined_params)

        return parser, cmd_argument2group_param, group2param2cmdarg


def get_user_provided_args(args, parser):
    """
    Extract arguments provided by the user from the parsed arguments.

    Args:
        args (argparse.Namespace): Parsed command-line arguments.
        parser (argparse.ArgumentParser): The argument parser instance.

    Returns:
        dict: A dictionary of user-provided arguments and their values.
    """

    user_provided_args = {}
    for action in parser._actions:
        arg_name = action.dest
        default_value = action.default
        user_value = getattr(args, arg_name, None)
        if user_value != default_value:
            user_provided_args[arg_name] = user_value

    return user_provided_args
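For orientation, the short sketch below shows the kind of calls the configuration classes removed above supported. It is illustrative only and not part of this commit: the prokbert.config_utils import path, the presence of the bundled YAML configs (sequence_processing.yaml, pretraining.yaml), and the availability of a 6-mer vocabulary file are assumptions inferred from the code, not stated by this diff.

# Illustrative sketch, not part of this diff. Assumes the deleted module is
# importable as prokbert.config_utils and that the packaged YAML configs and
# k-mer vocabularies referenced above are present.
from prokbert.config_utils import SeqConfig, ProkBERTConfig

# Sequence-level settings: override a few tokenization parameters and let the
# class validate them against the constraints loaded from the YAML file.
seq_config = SeqConfig()
tokenization_params = seq_config.get_and_set_tokenization_parameters({'kmer': 6, 'shift': 1})
print(tokenization_params['vocabfile'])  # 'auto' resolved to the bundled vocab.txt path

# Training-level settings plus a combined command-line parser over all groups.
prokbert_config = ProkBERTConfig()
parser, arg2group_param, group2arg = prokbert_config.get_cmd_arg_parser()
args = parser.parse_args([])  # defaults only; real runs would pass CLI flags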
general_utils.py
DELETED
@@ -1,309 +0,0 @@
# coding=utf-8

import pandas as pd
import os
import numpy as np
import subprocess
import shutil
""" Library for general utils, such as dataframe properties checking,
creating directories, checking files, etc.
"""


def check_expected_columns(df: pd.DataFrame, expected_columns: list) -> bool:
    """Checks if a DataFrame contains the expected columns.

    :param df: The input DataFrame to be checked.
    :type df: pd.DataFrame
    :param expected_columns: A list of columns that are expected to be present in the DataFrame.
    :type expected_columns: list
    :returns: True if all expected columns are present in the DataFrame, False otherwise.
    :rtype: bool
    :raises ValueError: If any of the expected columns are not present in the DataFrame.

    Examples
    --------
    >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
    >>> check_expected_columns(df, ['A', 'B'])
    True

    >>> check_expected_columns(df, ['A', 'C'])
    ValueError: The following columns are missing: ['C']
    """

    missing_columns = [col for col in expected_columns if col not in df.columns]

    if missing_columns:
        raise ValueError(f"The following columns are missing: {missing_columns}")

    return True


def is_valid_primary_key(df: pd.DataFrame, column_name: str) -> bool:
    """Checks if a specified column in a DataFrame can serve as a valid primary key.

    :param df: The input DataFrame to be checked.
    :type df: pd.DataFrame
    :param column_name: The name of the column to check.
    :type column_name: str
    :returns: True if the column can serve as a valid primary key, False otherwise.
    :rtype: bool
    :raises ValueError: If the specified column does not exist in the DataFrame.

    Examples
    --------
    >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    >>> is_valid_primary_key(df, 'A')
    True

    >>> df = pd.DataFrame({'A': [1, 2, 2], 'B': [4, 5, 6]})
    >>> is_valid_primary_key(df, 'A')
    False
    """

    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")

    # Check for NaN values
    if df[column_name].isnull().any():
        return False

    # Check for unique values
    if not df[column_name].is_unique:
        return False

    return True


def get_non_empty_files(start_path: str, extensions: tuple = ('.fasta', '.fna')) -> str:
    """Generator that yields non-empty files from a specified directory and its subdirectories based on the given extensions.

    :param start_path: The path to the directory from which to start the search.
    :type start_path: str
    :param extensions: A tuple of file extensions to look for (default is ('.fasta', '.fna')).
                       The function also automatically checks for compressed versions with '.gz'.
    :type extensions: tuple
    :returns: Yields filenames that match the specified extensions and are non-empty.
    :rtype: str
    """

    for dirpath, _, filenames in os.walk(start_path):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            if any(filename.endswith(ext) or filename.endswith(ext + '.gz') for ext in extensions) and os.path.getsize(filepath) > 0:
                yield filename


def truncate_zero_columns(arr: np.ndarray) -> np.ndarray:
    """Truncate all trailing columns composed entirely of zeros in a given 2D numpy array.

    :param arr: Input 2D numpy array.
    :type arr: np.ndarray
    :returns: A new array with trailing zero columns removed.
    :rtype: np.ndarray
    """

    # Iterate over columns from the end
    for idx in range(arr.shape[1] - 1, -1, -1):
        if np.any(arr[:, idx]):
            return arr[:, :(idx + 1)]
    return np.empty((arr.shape[0], 0))


import os

def create_directory_for_filepath(filepath: str) -> None:
    """Given a file path, creates the underlying directory structure if it doesn't already exist.

    :param filepath: The path to the file for which the directory structure should be created.
    :type filepath: str
    :raises ValueError: If the provided path is empty or None.
    :raises OSError: If there's an error creating the directory structure.
    """

    if not filepath:
        raise ValueError("The provided filepath is empty or None.")

    directory = os.path.dirname(filepath)

    if directory and not os.path.exists(directory):
        try:
            os.makedirs(directory)
            print(f"Directory structure {directory} created successfully.")
        except OSError as e:
            raise OSError(f"Error creating directory structure {directory}. Error: {e}")

# Example usage:
# create_directory_for_filepath("/path/to/directory/that/might/not/exist/filename.txt")

def check_file_exists(file_path: str) -> bool:
    """Checks if the provided file path exists.

    :param file_path: Path to the file.
    :type file_path: str
    :returns: True if the file exists, raises ValueError otherwise.
    :rtype: bool
    """
    if os.path.exists(file_path):
        return True
    else:
        raise ValueError(f"The provided file path '{file_path}' does not exist.")

def count_gpus(method="clinfo"):
    """
    Count the number of available GPUs using the specified method.

    This function counts the number of NVIDIA and AMD GPUs using the chosen method. By default, it uses the 'clinfo'
    method for AMD GPUs.

    :param method: The method to use for GPU counting. Choose between 'clinfo' (default) and 'rocm'.
    :type method: str, optional

    :return: The total number of GPUs detected.
    :rtype: int

    :raises ValueError: If an unknown method is provided.

    :raises Exception: If an error occurs while querying AMD GPUs using the specified method.

    .. note::
        - The 'clinfo' method queries AMD GPUs by running the 'clinfo' command.
        - The 'rocm' method queries AMD GPUs by running the 'rocm-smi --list' command.
    """
    import torch
    import subprocess

    # Count NVIDIA GPUs
    nvidia_gpu_count = torch.cuda.device_count()

    # Count AMD GPUs
    amd_gpu_count = 0
    try:
        if method == "clinfo":
            clinfo_output = subprocess.check_output('clinfo').decode('utf-8')
            amd_gpu_count = clinfo_output.lower().count('device type: gpu')
        elif method == "rocm":
            rocm_output = subprocess.check_output('rocm-smi --list', shell=True).decode('utf-8')
            amd_gpu_count = len(rocm_output.strip().split('\n'))
        else:
            raise ValueError("Unknown method provided. Choose between 'clinfo' and 'rocm'.")
    except Exception as e:
        print(f"Error querying AMD GPUs using method '{method}': {e}")

    total_gpus = nvidia_gpu_count + amd_gpu_count

    return total_gpus


def create_hard_links(source_directory: str, target_directory: str, blacklist: list = []) -> None:
    """Creates hard links for all files from the source directory to the target directory.

    :param source_directory: The directory containing the original files.
    :type source_directory: str
    :param target_directory: The directory where hard links will be created.
    :type target_directory: str
    :param blacklist: List of filenames to exclude from creating hard links.
    :type blacklist: list
    :returns: None
    """

    # Ensure the provided directories exist
    if not os.path.exists(source_directory):
        raise ValueError(f"The source directory '{source_directory}' does not exist.")
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)

    # Iterate through the files in the source directory
    for filename in os.listdir(source_directory):
        source_file_path = os.path.join(source_directory, filename)
        target_file_path = os.path.join(target_directory, filename)

        # Check for files to skip
        if (filename.startswith('.') or
                filename.startswith('_') or
                os.path.isdir(source_file_path) or
                filename in blacklist):
            continue

        # Create a hard link
        os.link(source_file_path, target_file_path)

    return f"Hard links created in {target_directory} from {source_directory}."

# Example usage
# create_hard_links("/path/to/source_directory", "/path/to/target_directory", blacklist=["file_to_skip.txt"])

def create_selected_hard_links(source_directory: str, target_directory: str, filenames: list) -> None:
    """Creates hard links for the specified files from the source directory to the target directory.

    :param source_directory: The directory containing the original files.
    :type source_directory: str
    :param target_directory: The directory where hard links will be created.
    :type target_directory: str
    :param filenames: List of filenames for which hard links should be created.
    :type filenames: list
    :returns: None
    """

    # Ensure the provided directories exist
    if not os.path.exists(source_directory):
        raise ValueError(f"The source directory '{source_directory}' does not exist.")
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)

    # Iterate through the specified filenames
    for filename in filenames:
        source_file_path = os.path.join(source_directory, filename)
        target_file_path = os.path.join(target_directory, filename)

        # Ensure the file exists in the source directory
        if not os.path.isfile(source_file_path):
            print(f"Warning: {filename} does not exist in the source directory. Skipping.")
            continue

        # Create a hard link
        try:
            os.link(source_file_path, target_file_path)
        except FileExistsError:
            print(f'The target hard link {target_file_path} exist. Skipping...')

    return f"Hard links for specified files created in {target_directory} from {source_directory}."

def remove_hidden_files(directory: str) -> None:
    """Removes all files recursively in a folder that start with '.' or '_'.

    :param directory: The directory from which hidden files should be removed.
    :type directory: str
    :returns: None
    """

    # Ensure the directory exists
    if not os.path.exists(directory):
        raise ValueError(f"The directory '{directory}' does not exist.")

    # Use os.walk to iterate through all subdirectories and files
    for dirpath, dirnames, filenames in os.walk(directory, topdown=False):

        # Filter out directories starting with '.' or '_'
        dirnames[:] = [d for d in dirnames if not d.startswith('.') and not d.startswith('_')]

        # Remove files starting with '.' or '_'
        for filename in filenames:
            if filename.startswith('.') or filename.startswith('_'):
                file_path = os.path.join(dirpath, filename)
                os.remove(file_path)
                print(f"Removed: {file_path}")

    print(f"All hidden files removed from {directory}.")
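A similarly hedged sketch of the helpers removed in general_utils.py; the prokbert.general_utils import path and the genomes directory are assumed placeholders for illustration, not details taken from this diff.

# Illustrative sketch, not part of this diff. Assumes the deleted module is
# importable as prokbert.general_utils; '/path/to/genomes' is a placeholder.
import pandas as pd
from prokbert.general_utils import (check_expected_columns, is_valid_primary_key,
                                    get_non_empty_files)

df = pd.DataFrame({'sequence_id': [0, 1, 2], 'sequence': ['ATCG', 'GGTA', 'TTAA']})
check_expected_columns(df, ['sequence_id', 'sequence'])  # True, raises ValueError if a column is missing
is_valid_primary_key(df, 'sequence_id')                  # True: unique and free of NaNs

# Iterate over the non-empty FASTA files under a directory tree.
for fasta_name in get_non_empty_files('/path/to/genomes'):
    print(fasta_name)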
sequtils.py
DELETED
@@ -1,980 +0,0 @@
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# coding=utf-8

""" Library for sequence processing """


import os
import sys
import pandas as pd
from multiprocessing import Pool
import multiprocessing
from os.path import join, isfile, splitext
from os import listdir
import random
from Bio import SeqIO
import numpy as np
import math
import gzip
from mimetypes import guess_type
from functools import partial
import operator
import pathlib
#from typing import Dict, List, Type, Tuple
from itertools import product
from typing import List, Union, Dict, Any, Optional, Tuple, Type, Set
from .general_utils import *
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from scipy.ndimage import convolve1d
import h5py

def load_contigs(
        fasta_files_list: Union[List[str], str],
        adding_reverse_complement: bool = True,
        IsAddHeader: bool = False,
        AsDataFrame: bool = False,
        to_uppercase: bool = False,
        is_add_sequence_id: bool = False
) -> Union[List[Union[str, List[str]]], pd.DataFrame]:
    """
    Loads contigs from a list of FASTA files.

    :param fasta_files_list: List of paths to FASTA files or a single file path. Compressed (gz) FASTA files are accepted.
    :type fasta_files_list: Union[List[str], str]
    :param adding_reverse_complement: If True, adds the reverse complement of each sequence. Defaults to True.
    :type adding_reverse_complement: bool
    :param IsAddHeader: If True, includes the FASTA ID and description in the output. Defaults to False.
    :type IsAddHeader: bool
    :param AsDataFrame: If True, returns the sequences as a pandas DataFrame. Defaults to False.
    :type AsDataFrame: bool
    :param to_uppercase: If True, converts sequences to uppercase. Defaults to False.
    :type to_uppercase: bool
    :param is_add_sequence_id: If True, adds a unique integer sequence ID to each sequence. Defaults to False.
    :type is_add_sequence_id: bool
    :return: The loaded sequences. Each sequence is represented as a string if IsAddHeader is False, or as a list
             [sequence_id, fasta_id, description, source_file, sequence, orientation] if IsAddHeader is True and is_add_sequence_id is True.
             If AsDataFrame is True, the sequences are returned as a DataFrame.
    :rtype: Union[List[Union[str, List[str]]], pd.DataFrame]

    Example:
    >>> fasta_files = ['path/to/file1.fasta', 'path/to/file2.fasta.gz']
    >>> load_contigs(fasta_files, adding_reverse_complement=False, IsAddHeader=True, AsDataFrame=True, to_uppercase=True, is_add_sequence_id=True)
    # Returns a DataFrame with the sequences from the specified FASTA files, all in uppercase, with unique sequence IDs.
    """

    logging.info('Loading sequence data into memory!')
    if isinstance(fasta_files_list, str):
        logging.info('Since the fasta_files_list is a string, not a list, we convert it to a list.')
        fasta_files_list = [fasta_files_list]

    sequences = []
    sequence_id = 0
    df_cols = ['sequence_id', 'fasta_id', 'description', 'source_file', 'sequence', 'orientation'] if (IsAddHeader and is_add_sequence_id) else ['fasta_id', 'description', 'source_file', 'sequence', 'orientation'] if IsAddHeader else ['sequence']
    for act_assembly in fasta_files_list:
        # Determine the file encoding based on the file extension
        encoding = guess_type(act_assembly)[1]
        _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open
        with _open(act_assembly) as f_assembly:
            # Parse the fasta file
            contigs = list(SeqIO.parse(f_assembly, "fasta"))
        for contig in contigs:
            act_seq = str(contig.seq)[:] if not to_uppercase else str(contig.seq).upper()[:]
            act_header = str(contig.id)
            act_description = str(contig.description)
            if adding_reverse_complement:
                # Compute the reverse complement of the sequence
                act_reverse_complement = str(contig.seq.reverse_complement()) if not to_uppercase else str(contig.seq.reverse_complement()).upper()

            if IsAddHeader:
                # Include sequence ID (if applicable), fasta ID, description, source file, sequence, and orientation in the output
                entry = [sequence_id] if is_add_sequence_id else []
                entry.extend([act_header, act_description, act_assembly, act_seq, 'forward'])
                sequences.append(entry)
                if adding_reverse_complement:
                    entry = [sequence_id + 1] if is_add_sequence_id else []
                    entry.extend([act_header, act_description, act_assembly, act_reverse_complement, 'reverse'])
                    sequences.append(entry)
                    if is_add_sequence_id:
                        sequence_id += 2
                else:
                    sequence_id += 1
            else:
                # Only include the sequence in the output
                sequences.append(act_seq)
                if adding_reverse_complement:
                    sequences.append(act_reverse_complement)

    if AsDataFrame:
        # Convert the sequences to a DataFrame
        sequences = pd.DataFrame(sequences, columns=df_cols)
    return sequences


def segment_sequence_contiguous(
        sequence: str,
        params: Dict[str, Any],
        sequence_id: Optional[Any] = np.nan
) -> List[Dict[str, Any]]:
    """
    Creates end-to-end, disjoint segments of a sequence without overlaps.

    Segments smaller than the predefined minimum length will be discarded.
    This function returns a list of segments along with their positions in the original sequence.

    :param sequence: The input nucleotide sequence to be segmented.
    :type sequence: str
    :param params: Dictionary containing the segmentation parameters. Must include 'min_length' and 'max_length' keys
                   specifying the minimum and maximum lengths of the segments, respectively. Can contain other parameters.
    :type params: Dict[str, Any]
    :param sequence_id: An identifier for the sequence, optional. Defaults to NaN.
    :type sequence_id: Optional[Any]
    :return: A list of dictionaries, each representing a segment. Each dictionary contains the segment's sequence,
             start position, end position, and sequence ID.
    :rtype: List[Dict[str, Any]]

    Example:
    >>> params = {'min_length': 0, 'max_length': 100}
    >>> segment_sequence_contiguous('ATCGATCGA', params)
    [{'segment': 'ATCGATCGA', 'segment_start': 0, 'segment_end': 9, 'sequence_id': np.nan}]
    """

    # Extract segmentation parameters
    min_segment_len = params['min_length']
    max_segment_len = params['max_length']

    # Ensure the sequence is treated as a string
|
150 |
-
if isinstance(sequence, str):
|
151 |
-
act_seq = sequence
|
152 |
-
L = len(sequence)
|
153 |
-
|
154 |
-
segments = []
|
155 |
-
for i in range(0, L, max_segment_len):
|
156 |
-
act_start_pos = i
|
157 |
-
act_end_pos = min(i + max_segment_len, L)
|
158 |
-
act_segment = sequence[act_start_pos:act_end_pos]
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
# Add segment to the list if it's longer than the minimum length
|
163 |
-
if len(act_segment) >= min_segment_len:
|
164 |
-
new_record = {
|
165 |
-
'segment': act_segment,
|
166 |
-
'segment_start': act_start_pos,
|
167 |
-
'segment_end': act_end_pos,
|
168 |
-
'sequence_id': sequence_id
|
169 |
-
}
|
170 |
-
segments.append(new_record)
|
171 |
-
|
172 |
-
return segments
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
def segment_sequences_random(
|
177 |
-
sequences: Union[pd.DataFrame, List[str]],
|
178 |
-
params: Dict[str, Union[int, float, str, Dict, List, Tuple]]
|
179 |
-
) -> List[Dict[str, Union[int, str]]]:
|
180 |
-
"""
|
181 |
-
Randomly segments the input sequences.
|
182 |
-
|
183 |
-
This function accepts either a list of sequences or a DataFrame containing sequences.
|
184 |
-
If a DataFrame is provided, it's assumed to have preprocessed sequences with "sequence" and "sequence_id" columns,
|
185 |
-
where "sequence_id" is a valid primary key. The function returns a list of dictionaries,
|
186 |
-
each containing details of a segment including its sequence, start position, end position,
|
187 |
-
associated sequence ID, and a segment ID (not generated in this function).
|
188 |
-
|
189 |
-
:param sequences: A DataFrame containing sequences with "sequence" and "sequence_id" columns or a list of sequences.
|
190 |
-
:type sequences: Union[pd.DataFrame, List[str]]
|
191 |
-
:param params: Dictionary containing segmentation parameters such as 'coverage', 'min_length', and 'max_length'.
|
192 |
-
:type params: Dict[str, Union[int, float, str, Dict, List, Tuple]]
|
193 |
-
:return: A list of dictionaries with each containing details of a segment.
|
194 |
-
:rtype: List[Dict[str, Union[int, str]]]
|
195 |
-
|
196 |
-
Notes:
|
197 |
-
- The actual number of segments may differ from the expected number due to random sampling and sequences
|
198 |
-
being shorter than the specified segment size.
|
199 |
-
- Segment IDs are not generated by this function.
|
200 |
-
"""
|
201 |
-
|
202 |
-
# Calculate sequence lengths and cumulative sum of lengths
|
203 |
-
sequences['seq_lengths'] = sequences.apply(lambda x: len(x['sequence']), axis=1)
|
204 |
-
sequences['length_cum_sum'] = sequences['seq_lengths'].cumsum()
|
205 |
-
Lseqs = sum(sequences['seq_lengths'])
|
206 |
-
|
207 |
-
# Calculate the number of segments to sample based on expected coverage.
|
208 |
-
# Note: The actual number might be biased if many sequences are "short" compared to the segment sizes.
|
209 |
-
N_segments = int(np.ceil(params['coverage'] * Lseqs / params['max_length']))
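# Illustrative sizing example (numbers assumed for this note, not from the original code):
# with coverage=2.0, a total sequence length of 1,000,000 and max_length=512,
# N_segments = ceil(2.0 * 1000000 / 512) = 3907 segments are sampled.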
|
210 |
-
logging.info(f'Sampling {N_segments} segments from {len(sequences)} sequences.')
|
211 |
-
|
212 |
-
# Generate random starting coordinates for segments
|
213 |
-
start_coords = list(np.sort(np.int64(np.random.uniform(0, sequences['length_cum_sum'].max(), N_segments))))
|
214 |
-
segmentdb = []
|
215 |
-
|
216 |
-
for sid, act_sampling_coord in enumerate(start_coords):
|
217 |
-
|
218 |
-
diff = act_sampling_coord - sequences['length_cum_sum']
|
219 |
-
|
220 |
-
# Find the sequence in which the current segment starts
|
221 |
-
for i in range(len(sequences['length_cum_sum'])):
|
222 |
-
if diff[i] < 0:
|
223 |
-
break
|
224 |
-
|
225 |
-
act_sequence_id = sequences['sequence_id'].iloc[i]
|
226 |
-
rel_coord = act_sampling_coord - sequences['length_cum_sum'].iloc[i] + sequences['seq_lengths'].iloc[i]
|
227 |
-
|
228 |
-
segment_end = min(rel_coord + params['max_length'], sequences['seq_lengths'].iloc[i])
|
229 |
-
|
230 |
-
# Skip the segment if it's shorter than the minimum segment length
|
231 |
-
if segment_end - rel_coord < params['min_length']:
|
232 |
-
pred_segment = sequences['sequence'].iloc[i][rel_coord:segment_end]
|
233 |
-
minimum_len = params['min_length']
|
234 |
-
logging.info(f'Too short segment, skipping! Sampled segment: {pred_segment}, segment end coordinate: {segment_end}, relative coordinate: {rel_coord}, minimum length: {minimum_len}')
|
235 |
-
continue
|
236 |
-
|
237 |
-
new_segment = sequences['sequence'].iloc[i][rel_coord:segment_end]
|
238 |
-
new_record = {
|
239 |
-
'sequence_id': act_sequence_id,
|
240 |
-
'segment_start': rel_coord,
|
241 |
-
'segment_end': segment_end,
|
242 |
-
'segment': new_segment,
|
243 |
-
'segment_id': str(sid)
|
244 |
-
}
|
245 |
-
|
246 |
-
segmentdb.append(new_record)
|
247 |
-
|
248 |
-
return segmentdb
|
249 |
-
|
250 |
-
def segment_sequences(
|
251 |
-
sequences: Union[List[str], pd.DataFrame],
|
252 |
-
params: Dict[str, Union[int, float, str, ]],
|
253 |
-
AsDataFrame: bool = False
|
254 |
-
) -> Union[List[str], pd.DataFrame]:
|
255 |
-
"""
|
256 |
-
Segments sequences based on the provided parameters.
|
257 |
-
|
258 |
-
This function assumes that the sequence is quality controlled and preprocessed, i.e., it is a valid nucleotide sequence.
|
259 |
-
If sequences are provided as a DataFrame, then it is assumed that there is a "sequence_id" and
|
260 |
-
a "sequence" attribute. The "sequence_id" should be a valid primary key.
|
261 |
-
If the output is requested as a DataFrame, then the IDs are added as well.
|
262 |
-
|
263 |
-
:param sequences: A list of sequences or a DataFrame containing sequences.
|
264 |
-
If a DataFrame, it must have "sequence_id" and "sequence" attributes.
|
265 |
-
:type sequences: Union[List[str], pd.DataFrame]
|
266 |
-
:param params: Dictionary containing the segmentation parameters.
|
267 |
-
- 'type' (str): The type of segmentation ('contiguous' or 'random').
|
268 |
-
- 'min_length' (int): Minimum length of a segment.
|
269 |
-
- 'max_length' (int): Maximum length of a segment.
|
270 |
-
- 'coverage' (float): Coverage percentage for random segmentation.
|
271 |
-
:type params: Dict[str, Union[int, float, str, Dict[str, int], List[int], Tuple[int, int]]]
|
272 |
-
:param AsDataFrame: If True, the output will be a DataFrame. If False, it will be a list. Defaults to False.
|
273 |
-
:type AsDataFrame: bool
|
274 |
-
:return: List of segmented sequences or a DataFrame with segmented sequences and their corresponding information based on the `AsDataFrame` parameter.
|
275 |
-
:rtype: Union[List[str], pd.DataFrame]
|
276 |
-
:raises ValueError: If the provided sequences DataFrame does not have the required attributes.
|
277 |
-
:raises ValueError: If the "sequence_id" column is not a valid primary key.
|
278 |
-
|
279 |
-
Examples:
|
280 |
-
>>> segment_sequences(['AATCAATTTTATTT', 'AGCCGATTCAATTGCATTATTT'], {'type': 'contiguous', 'min_length': 1, 'max_length': 1000, 'coverage': 1.0})
|
281 |
-
"""
|
282 |
-
|
283 |
-
segmentation_type = params['type']
|
284 |
-
|
285 |
-
# Check that the primary key and sequence attributes are present.
|
286 |
-
expected_attributes = ['sequence_id', 'sequence']
|
287 |
-
return_cols = ['segment_id', 'sequence_id', 'segment_start', 'segment_end', 'segment']
|
288 |
-
|
289 |
-
if isinstance(sequences, list):
|
290 |
-
logging.info('Sequences is a list, therefore ignoring ids and tracking information. ')
|
291 |
-
IsSequenceId = None
|
292 |
-
IsSeqList = True
|
293 |
-
elif isinstance(sequences, pd.DataFrame):
|
294 |
-
#logging.info('Sequences is a list, therefore adding tracking information.')
|
295 |
-
logging.info('Checking input DataFrame!')
|
296 |
-
check_expected_columns(sequences, expected_attributes)
|
297 |
-
logging.info('Checking input sequence_id is valid primary key in the DataFrame')
|
298 |
-
is_valid_primary_key(sequences, 'sequence_id')
|
299 |
-
IsSequenceId = True
|
300 |
-
IsSeqList=False
|
301 |
-
|
302 |
-
segments = []
|
303 |
-
if segmentation_type == 'contiguous':
|
304 |
-
if IsSeqList:
|
305 |
-
if IsSequenceId:
|
306 |
-
for act_seq_id, seq in enumerate(sequences):
|
307 |
-
act_segments = segment_sequence_contiguous(seq, params, act_seq_id)
|
308 |
-
segments.extend(act_segments)
|
309 |
-
else:
|
310 |
-
for seq in sequences:
|
311 |
-
act_segments = segment_sequence_contiguous(seq, params)
|
312 |
-
segments.extend(act_segments)
|
313 |
-
else:
|
314 |
-
for _, rec in sequences.iterrows():
|
315 |
-
act_seq = rec['sequence']
|
316 |
-
act_seq_id = rec['sequence_id']
|
317 |
-
act_segments = segment_sequence_contiguous(act_seq, params, act_seq_id)
|
318 |
-
segments.extend(act_segments)
|
319 |
-
|
320 |
-
elif segmentation_type == 'random':
|
321 |
-
if IsSeqList:
|
322 |
-
sequence_df = pd.DataFrame(sequences,
|
323 |
-
columns = ['sequence'])
|
324 |
-
sequence_df['sequence_id'] = list(range(len(sequences)))
|
325 |
-
segments = segment_sequences_random(sequence_df, params)
|
326 |
-
|
327 |
-
else:
|
328 |
-
segments = segment_sequences_random(sequences, params)
|
329 |
-
if AsDataFrame:
|
330 |
-
#logging.info('Creating a DataFrame from the segments. ')
|
331 |
-
segment_db = pd.DataFrame(segments)
|
332 |
-
segment_ids = list(range(len(segment_db)))
|
333 |
-
segment_db['segment_id'] = segment_ids
|
334 |
-
segment_db = segment_db[return_cols]
|
335 |
-
|
336 |
-
else:
|
337 |
-
segment_db = [seg['segment'] for seg in segments]
|
338 |
-
return segment_db
|
339 |
-
|
340 |
-
def lca_kmer_tokenize_segment(segment: str, offset: int, params: Dict[str, Dict[str, int] | int | float]):
|
341 |
-
# calculate the tokenization for one offset value
|
342 |
-
shift = params['shift']
|
343 |
-
max_segment_length = params['max_segment_length']
|
344 |
-
max_unknown_token_proportion = params['max_unknown_token_proportion']
|
345 |
-
kmer = params['kmer']
|
346 |
-
token_limit = params['token_limit']
|
347 |
-
vocabmap = params['vocabmap']
|
348 |
-
add_special_token = params['add_special_token']
|
349 |
-
if len(segment) > max_segment_length:
|
350 |
-
raise ValueError(f'The segment is longer ({len(segment)}) than the maximum allowed segment length ({max_segment_length}).')
|
351 |
-
|
352 |
-
kmers = [segment[i:i + kmer] for i in range(offset, len(segment) - kmer + 1, shift)]
|
353 |
-
|
354 |
-
return kmers
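# Illustrative example for the function above (parameter values are assumptions for this note):
# with kmer=5, shift=2 and offset=0, the segment 'TCTTTGCTAAGA' is split into the k-mer list
# ['TCTTT', 'TTTGC', 'TGCTA', 'CTAAG'].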
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
def lca_tokenize_segment(segment: str, params: Dict[str, Dict[str, int] | int | float]) -> Tuple[List[List[int]], List[List[str]]]:
|
361 |
-
"""
|
362 |
-
Tokenizes a single segment using Local Context Aware (LCA) tokenization.
|
363 |
-
The segment is first split into k-mers with specified shifts and then tokenized into token vectors.
|
364 |
-
|
365 |
-
:param segment: The input nucleotide sequence segment to be tokenized.
|
366 |
-
:type segment: str
|
367 |
-
:param params: Dictionary containing the tokenization parameters.
|
368 |
-
- 'shift' (int): The k-mer shift parameter.
|
369 |
-
- 'max_segment_length' (int): Maximum allowable segment length.
|
370 |
-
- 'max_unknown_token_proportion' (float): Maximum allowable proportion of unknown tokens in a segment.
|
371 |
-
- 'kmer' (int): Size of the k-mer.
|
372 |
-
- 'token_limit' (int): Maximum number of tokens allowed in the tokenized output.
|
373 |
-
- 'vocabmap' (dict[str, int]): Dictionary mapping k-mers to their respective token values.
|
374 |
-
:type params: dict
|
375 |
-
:returns: A tuple containing:
|
376 |
-
- list[list[int]]: List of tokenized segments (each segment as a list of integers).
|
377 |
-
- list[list[str]]: List of k-merized segments with different shifts (each segment as a list of strings).
|
378 |
-
:rtype: Tuple[List[List[int]], List[List[str]]]
|
379 |
-
:raises ValueError: If the segment length exceeds the `max_segment_length`.
|
380 |
-
|
381 |
-
Examples:
|
382 |
-
>>> vocabmap_example = {"[CLS]": 2, "[SEP]": 3, "[UNK]": 0, "TCTTT": 4, "CTTTG": 5, "TTTGC": 6, "TTGCT": 7}
|
383 |
-
>>> segment_example = 'TCTTTGCTAAG'
|
384 |
-
>>> params_example = {'shift': 1, 'max_segment_length': 512, 'max_unknown_token_proportion': 0.2, 'kmer': 5, 'token_limit': 10, 'vocabmap': vocabmap_example}
|
385 |
-
>>> lca_tokenize_segment(segment_example, params_example)
|
386 |
-
([[2, 4, 5, 6, 7, 3]], [['TCTTT', 'CTTTG', 'TTTGC', 'TTGCT']])
|
387 |
-
"""
|
388 |
-
|
389 |
-
|
390 |
-
#logging.info('Tokenizing a segment')
|
391 |
-
shift = params['shift']
|
392 |
-
max_segment_length = params['max_segment_length']
|
393 |
-
max_unknown_token_proportion = params['max_unknown_token_proportion']
|
394 |
-
kmer = params['kmer']
|
395 |
-
token_limit = params['token_limit']
|
396 |
-
vocabmap = params['vocabmap']
|
397 |
-
add_special_token = params['add_special_token']
|
398 |
-
if len(segment) > max_segment_length:
|
399 |
-
raise ValueError(f'The segment is longer ({len(segment)}) than the maximum allowed segment length ({max_segment_length}).')
|
400 |
-
|
401 |
-
kmers_offset = []
|
402 |
-
# For every possible offset we should get a k-mer vector.
|
403 |
-
# If the segment is too short or empty we might have a problem, so ensure the segment is long enough before tokenization.
|
404 |
-
for offset in range(shift):
|
405 |
-
kmers = [segment[i:i + kmer] for i in range(offset, len(segment) - kmer + 1, shift)]
|
406 |
-
kmers_offset.append(kmers)
|
407 |
-
# Mapping the k-mers into numbers
|
408 |
-
tokenized_segments = tokenize_kmerized_segment_list(kmers_offset, vocabmap, token_limit, max_unknown_token_proportion, add_special_token)
|
409 |
-
return tokenized_segments, kmers_offset
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
def tokenize_kmerized_segment_list(kmerized_segments: List[List[str]],
|
414 |
-
vocabmap: Dict[str, int],
|
415 |
-
token_limit: int,
|
416 |
-
max_unknown_token_proportion: float,
|
417 |
-
add_special_tokens: bool = True) -> List[List[int]]:
|
418 |
-
"""Tokenizes or vectorizes a list of k-merized segments into a list of token vectors. If the expected number of
|
419 |
-
tokens in a segment exceeds the maximum allowed tokens (`token_limit`), the function raises an error. For segments
|
420 |
-
where unknown k-mers exceed the proportion set by `max_unknown_token_proportion`, the output is a special token
|
421 |
-
sequence indicating an empty sentence.
|
422 |
-
|
423 |
-
:param kmerized_segments: List containing k-merized segments.
|
424 |
-
:type kmerized_segments: List[List[str]]
|
425 |
-
:param vocabmap: Dictionary that maps k-mers to their respective token values.
|
426 |
-
:type vocabmap: Dict[str, int]
|
427 |
-
:param token_limit: Maximum number of tokens allowed in the tokenized output.
|
428 |
-
:type token_limit: int
|
429 |
-
:param max_unknown_token_proportion: Maximum allowable proportion of unknown tokens in a segment.
|
430 |
-
:type max_unknown_token_proportion: float
|
431 |
-
:param add_special_tokens: Whether to add special tokens (`[CLS]` and `[SEP]`) to the tokenized segments.
|
432 |
-
:type add_special_tokens: bool, optional (default=True)
|
433 |
-
:returns: List containing tokenized segments.
|
434 |
-
:rtype: List[List[int]]
|
435 |
-
:raises ValueError: If the expected number of tokens in a segment exceeds `token_limit`.
|
436 |
-
|
437 |
-
Examples
|
438 |
-
--------
|
439 |
-
|
440 |
-
>>> vocabmap_example = {"[CLS]": 2, "[SEP]": 3, "[UNK]": 0, "TCTTTG": 4, "CTTTGC": 5, "TTTGCT": 6, "TTGCTA": 7}
|
441 |
-
>>> kmerized_segment_example = [['TCTTTG', 'CTTTGC', 'TTTGCT', 'TTGCTA']]
|
442 |
-
>>> tokenize_kmerized_segment_list(kmerized_segment_example, vocabmap_example, 10, 0.2)
|
443 |
-
[[2, 4, 5, 6, 7, 3]]
|
444 |
-
"""
|
445 |
-
|
446 |
-
tokenized_segments = []
|
447 |
-
if add_special_tokens:
|
448 |
-
empty_sentence = [2, 3]
|
449 |
-
else:
|
450 |
-
empty_sentence = []
|
451 |
-
|
452 |
-
for act_kmer_list in kmerized_segments:
|
453 |
-
if add_special_tokens:
|
454 |
-
tokenized_kmerized_segment = [vocabmap['[CLS]']]
|
455 |
-
else:
|
456 |
-
tokenized_kmerized_segment = []
|
457 |
-
unkcount=0
|
458 |
-
L_kmerized_segment = len(act_kmer_list)
|
459 |
-
unkw_tsh_count = int(L_kmerized_segment*max_unknown_token_proportion)
|
460 |
-
if len(act_kmer_list)+2 > token_limit:
|
461 |
-
raise ValueError(f'The expected number of tokens in the segment ({L_kmerized_segment+2}) is larger than the maximum allowed number of tokens ({token_limit}).')
|
462 |
-
if L_kmerized_segment == 0:
|
463 |
-
logging.info("It's an empty sentence")
|
464 |
-
tokenized_kmerized_segment = empty_sentence
|
465 |
-
tokenized_segments.append(empty_sentence)
|
466 |
-
continue
|
467 |
-
for kmer in act_kmer_list:
|
468 |
-
try:
|
469 |
-
tokenized_kmerized_segment.append(vocabmap[kmer.upper()])
|
470 |
-
except KeyError:
|
471 |
-
tokenized_kmerized_segment.append(vocabmap['[UNK]'])
|
472 |
-
unkcount+=1
|
473 |
-
if unkcount > unkw_tsh_count:
|
474 |
-
tokenized_segments.append(empty_sentence)
|
475 |
-
continue
|
476 |
-
if add_special_tokens:
|
477 |
-
tokenized_kmerized_segment.append(vocabmap['[SEP]'])
|
478 |
-
tokenized_segments.append(tokenized_kmerized_segment)
|
479 |
-
|
480 |
-
return tokenized_segments
|
481 |
-
|
482 |
-
def process_batch_tokenize_segments_with_ids(
|
483 |
-
segments: List[str],
|
484 |
-
segment_ids: List[Any],
|
485 |
-
tokenization_params: Dict[str, Any],
|
486 |
-
np_token_type: type = np.uint16
|
487 |
-
) -> Dict[Any, List[np.ndarray]]:
|
488 |
-
"""
|
489 |
-
Tokenizes a batch of segments and associates them with their provided IDs.
|
490 |
-
|
491 |
-
This function generates vector representations for a collection of segments, assuming the segments
|
492 |
-
have undergone quality control. The result is a dictionary where the keys are segment IDs, and the values
|
493 |
-
are lists of potential vector representations for the segment, with each list element corresponding to
|
494 |
-
a specific shift.
|
495 |
-
|
496 |
-
The vector representations are converted to numpy arrays. The output is not a 2D rectangular array but
|
497 |
-
a dictionary mapping each segment ID to its tokenized representations.
|
498 |
-
|
499 |
-
:param segments: A list of preprocessed and validated segments.
|
500 |
-
:type segments: List[str]
|
501 |
-
:param segment_ids: A list of segment IDs corresponding to each segment in `segments`.
|
502 |
-
:type segment_ids: List[Any]
|
503 |
-
:param tokenization_params: A dictionary containing tokenization parameters.
|
504 |
-
:type tokenization_params: Dict[str, Any]
|
505 |
-
:param np_token_type: Numpy data type for the tokenized segments. Defaults to np.uint16.
|
506 |
-
:type np_token_type: type, optional
|
507 |
-
:return: A dictionary with segment IDs as keys and lists of numpy arrays representing tokenized segments as values.
|
508 |
-
:rtype: Dict[Any, List[np.ndarray]]
|
509 |
-
|
510 |
-
Example:
|
511 |
-
>>> segments = ['ACTG', 'TGCA']
|
512 |
-
>>> segment_ids = [1, 2]
|
513 |
-
>>> tokenization_params = {'max_segment_length': 50, ...}
|
514 |
-
>>> tokenized_segments = process_batch_tokenize_segments_with_ids(
|
515 |
-
segments, segment_ids, tokenization_params
|
516 |
-
)
|
517 |
-
"""
|
518 |
-
tokenized_segments_with_ids = {}
|
519 |
-
for i, segment in enumerate(segments):
|
520 |
-
act_id = segment_ids[i]
|
521 |
-
tokenized_segments_with_ids[act_id] = []
|
522 |
-
max_segment_length = tokenization_params['max_segment_length']
|
523 |
-
if len(segment) > max_segment_length:
|
524 |
-
raise ValueError(f'The segment is longer ({len(segment)}) than the maximum allowed segment length ({max_segment_length}).')
|
525 |
-
|
526 |
-
tokenized_segment, _ = lca_tokenize_segment(segment, tokenization_params)
|
527 |
-
tokenized_segment = [np.array(act_segment, dtype=np_token_type) for act_segment in tokenized_segment]
|
528 |
-
tokenized_segments_with_ids[act_id] = tokenized_segment
|
529 |
-
return tokenized_segments_with_ids
|
530 |
-
|
531 |
-
def batch_tokenize_segments_with_ids(
|
532 |
-
segment_data: Union[Tuple[List[str], List[Any]], pd.DataFrame],
|
533 |
-
tokenization_params: Dict[str, Any],
|
534 |
-
num_cores: int = 1,
|
535 |
-
batch_size: int = 10000,
|
536 |
-
np_token_type: type = np.uint16
|
537 |
-
) -> Dict[Any, List[np.ndarray]]:
|
538 |
-
"""
|
539 |
-
Parallel tokenization of segments with associated IDs.
|
540 |
-
|
541 |
-
This function splits the input data into batches and uses multiprocessing to tokenize
|
542 |
-
the segments in parallel. It supports both list/tuple inputs and pandas DataFrames.
|
543 |
-
|
544 |
-
:param segment_data: Either a tuple/list containing two elements (segments, segment_ids),
|
545 |
-
or a pandas DataFrame with 'segment' and 'segment_id' columns.
|
546 |
-
:type segment_data: Union[Tuple[List[str], List[Any]], pd.DataFrame]
|
547 |
-
:param tokenization_params: Dictionary containing tokenization parameters.
|
548 |
-
:type tokenization_params: Dict[str, Any]
|
549 |
-
:param num_cores: Number of CPU cores to use for parallel processing. Defaults to 1.
|
550 |
-
:type num_cores: int, optional
|
551 |
-
:param batch_size: Number of segments to process in each batch. Defaults to 10,000.
|
552 |
-
:type batch_size: int, optional
|
553 |
-
:param np_token_type: Numpy data type for the tokenized segments. Defaults to np.uint16.
|
554 |
-
:type np_token_type: type, optional
|
555 |
-
:return: A dictionary where keys are segment IDs and values are lists of numpy arrays representing tokenized segments.
|
556 |
-
:rtype: Dict[Any, List[np.ndarray]]
|
557 |
-
:raises ValueError: If the input data is neither a tuple/list nor a pandas DataFrame.
|
558 |
-
|
559 |
-
Example:
|
560 |
-
>>> segments = ['ACTG', 'TGCA']
|
561 |
-
>>> segment_ids = [1, 2]
|
562 |
-
>>> tokenization_params = {'max_segment_length': 50, ...}
|
563 |
-
>>> tokenized_data = batch_tokenize_segments_with_ids(
|
564 |
-
(segments, segment_ids),
|
565 |
-
tokenization_params,
|
566 |
-
num_cores=4,
|
567 |
-
batch_size=1000
|
568 |
-
)
|
569 |
-
"""
|
570 |
-
if isinstance(segment_data, tuple) or isinstance(segment_data, list):
|
571 |
-
segments = segment_data[0]
|
572 |
-
segment_ids = segment_data[1]
|
573 |
-
elif isinstance(segment_data, pd.DataFrame):
|
574 |
-
segments = list(segment_data['segment'])
|
575 |
-
segment_ids = list(segment_data['segment_id'])
|
576 |
-
else:
|
577 |
-
raise ValueError(f'The input should be either a pandas DataFrame or a tuple/list, not {type(segment_data)}')
|
578 |
-
|
579 |
-
Ndata = len(segments)
|
580 |
-
batch_intervals = [(i, min(i + batch_size, Ndata)) for i in range(0, Ndata, batch_size)]
|
581 |
-
params = [
|
582 |
-
(segments[interval[0]:interval[1]],
|
583 |
-
segment_ids[interval[0]:interval[1]],
|
584 |
-
tokenization_params,
|
585 |
-
np_token_type)
|
586 |
-
for interval in batch_intervals
|
587 |
-
]
|
588 |
-
with Pool(processes=num_cores) as pool:
|
589 |
-
result_list = pool.starmap(process_batch_tokenize_segments_with_ids, params)
|
590 |
-
|
591 |
-
tokenized_sets = {}
|
592 |
-
for d in result_list:
|
593 |
-
tokenized_sets.update(d)
|
594 |
-
|
595 |
-
return tokenized_sets
|
596 |
-
|
597 |
-
|
598 |
-
def get_rectangular_array_from_tokenized_dataset(tokenized_segments_data: Dict[int, List[np.ndarray]], shift: int, max_token_count: int, truncate_zeros: bool = True, randomize: bool = True, numpy_dtype: Type = np.uint16) -> Tuple[np.ndarray, pd.DataFrame]:
|
599 |
-
"""Create a rectangular numpy array that can be used as input to a Language Model (LM) from tokenized segment data.
|
600 |
-
|
601 |
-
:param tokenized_segments_data: A dictionary where keys are segment ids and values are lists of possible LCA tokenized vectors.
|
602 |
-
:type tokenized_segments_data: Dict[int, List[np.ndarray]]
|
603 |
-
|
604 |
-
:param shift: Number of LCA offsets.
|
605 |
-
:type shift: int
|
606 |
-
|
607 |
-
:param max_token_count: Maximum allowed token count in the output numpy array.
|
608 |
-
:type max_token_count: int
|
609 |
-
|
610 |
-
:param truncate_zeros: If True, truncate columns from the end of the numpy array that only contain zeros. (default=True)
|
611 |
-
:type truncate_zeros: bool, optional
|
612 |
-
|
613 |
-
:param randomize: If True, randomize the order of the rows in the output numpy array. (default=True)
|
614 |
-
:type randomize: bool, optional
|
615 |
-
|
616 |
-
:param numpy_dtype: Data type of the values in the output numpy array. (default=np.uint16)
|
617 |
-
:type numpy_dtype: Type, optional
|
618 |
-
|
619 |
-
:returns: A rectangular numpy array suitable for input to an LM.
|
620 |
-
:rtype: np.ndarray
|
621 |
-
|
622 |
-
:returns: A dataframe that describes which row in the numpy array corresponds to which segment and its LCA offset.
|
623 |
-
Columns are: ['torch_id', 'segment_id', 'offset']
|
624 |
-
:rtype: pd.DataFrame
|
625 |
-
|
626 |
-
"""
|
627 |
-
|
628 |
-
|
629 |
-
expected_length = len(tokenized_segments_data)*shift
|
630 |
-
X=np.full((expected_length,max_token_count),0, dtype=numpy_dtype)
|
631 |
-
torch_db = []
|
632 |
-
torch_id = 0
|
633 |
-
for segment_id, tokenized_vectors in tokenized_segments_data.items():
|
634 |
-
for offset in range(shift):
|
635 |
-
segment_vector = tokenized_vectors[offset]
|
636 |
-
X[torch_id,0:segment_vector.shape[0]] = segment_vector
|
637 |
-
torch_db.append([torch_id, segment_id, offset])
|
638 |
-
torch_id+=1
|
639 |
-
torch_tokenized_segment_db = pd.DataFrame(torch_db,
|
640 |
-
columns = ['torch_id', 'segment_id', 'offset'])
|
641 |
-
|
642 |
-
if randomize:
|
643 |
-
logging.info('Doing randomization!')
|
644 |
-
perm = np.random.permutation(expected_length)
|
645 |
-
X = X[perm,:]
|
646 |
-
torch_tokenized_segment_db.rename({'torch_id': 'original_torch_id'}, axis=1, inplace=True)
|
647 |
-
torch_tokenized_segment_db = torch_tokenized_segment_db.iloc[perm,:].reset_index().drop('index', axis=1).reset_index().rename({'index' : 'torch_id'}, axis=1)
|
648 |
-
|
649 |
-
if truncate_zeros:
|
650 |
-
logging.info('Truncating all-zero columns')
|
651 |
-
X = truncate_zero_columns(X)
|
652 |
-
return X, torch_tokenized_segment_db
|
653 |
-
|
654 |
-
|
655 |
-
def pretty_print_overlapping_sequence(segment, segment_kmers, tokenizer_params):
|
656 |
-
"""
|
657 |
-
Format the sequence for pretty printing with overlapping k-mers.
|
658 |
-
|
659 |
-
:param segment: DNA sequence.
|
660 |
-
:type segment: str
|
661 |
-
|
662 |
-
:param segment_kmers: List of k-mers in the segment.
|
663 |
-
:type segment_kmers: list
|
664 |
-
|
665 |
-
:param tokenizer_params: Dictionary containing tokenization parameters.
|
666 |
-
:type tokenizer_params: dict
|
667 |
-
|
668 |
-
:return: List of formatted strings representing the sequence with overlapping k-mers.
|
669 |
-
:rtype: list
|
670 |
-
"""
|
671 |
-
|
672 |
-
shift = tokenizer_params['shift']
|
673 |
-
k = tokenizer_params['kmer']
|
674 |
-
sep_c = 2
|
675 |
-
lines = []
|
676 |
-
base_offset = len(str( int((k+3)/shift))) + 3
|
677 |
-
first_line = ' '*base_offset + segment
|
678 |
-
lines.append(first_line)
|
679 |
-
nr_lines = int(np.ceil((k+sep_c)/shift))
|
680 |
-
logging.info('Number of lines needed to cover the sequence: {0}'.format(nr_lines))
|
681 |
-
|
682 |
-
for line_id in range(nr_lines):
|
683 |
-
|
684 |
-
line_mers = [k_mer for j, k_mer in enumerate(segment_kmers) if j%nr_lines== line_id]
|
685 |
-
act_line = str(line_id) + '. ' + ' '*(line_id*shift) + (' '*(sep_c)).join(line_mers)
|
686 |
-
lines.append(act_line)
|
687 |
-
lines = '\n'.join(lines)
|
688 |
-
return lines
|
689 |
-
|
690 |
-
|
691 |
-
def generate_kmers(abc: Set[str], k: int) -> List[str]:
|
692 |
-
"""
|
693 |
-
Generates all possible k-mers from a given alphabet.
|
694 |
-
|
695 |
-
:param abc: The alphabet.
|
696 |
-
:type abc: Set[str]
|
697 |
-
:param k: Length of the k-mers.
|
698 |
-
:type k: int
|
699 |
-
:return: List of all possible k-mers.
|
700 |
-
:rtype: List[str]
|
701 |
-
"""
|
702 |
-
return [''.join(p) for p in product(abc, repeat=k)]
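# Illustrative example: generate_kmers({'A', 'C'}, 2) returns the four 2-mers
# 'AA', 'AC', 'CA', 'CC' (the ordering follows the iteration order of the input set,
# so it may vary between runs).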
|
703 |
-
|
704 |
-
def save_to_hdf(X: np.ndarray, hdf_file_path: str, database: pd.DataFrame = None, compression: bool = False, pd_chunksize: int = 10_000_000) -> None:
|
705 |
-
"""Save a numpy array and an optional pandas DataFrame to an HDF5 file.
|
706 |
-
|
707 |
-
:param X: 2D numpy array to be saved.
|
708 |
-
:type X: np.ndarray
|
709 |
-
:param hdf_file_path: Path to the HDF5 file.
|
710 |
-
:type hdf_file_path: str
|
711 |
-
:param database: Pandas DataFrame to be saved. Defaults to None.
|
712 |
-
:type database: pd.DataFrame
|
713 |
-
:param compression: Whether to apply compression. Defaults to False.
|
714 |
-
:type compression: bool
|
715 |
-
:param pd_chunksize: Number of rows per chunk for saving the DataFrame. Defaults to 10,000,000.
|
716 |
-
:type pd_chunksize: int
|
717 |
-
:raises ValueError: If the provided numpy array is not 2D.
|
718 |
-
:raises OSError: If there's an error creating the directory structure or removing an existing HDF5 file.
|
719 |
-
Example:
|
720 |
-
|
721 |
-
>>> import numpy as np
|
722 |
-
>>> import pandas as pd
|
723 |
-
>>> array = np.random.random((100, 100))
|
724 |
-
>>> df = pd.DataFrame({'A': range(1, 101), 'B': range(101, 201)})
|
725 |
-
>>> save_to_hdf(array, "sample.hdf5", database=df, compression=True)
|
726 |
-
"""
|
727 |
-
|
728 |
-
# Check if X is a 2D numpy array
|
729 |
-
if len(X.shape) != 2:
|
730 |
-
raise ValueError("The provided numpy array is not 2D.")
|
731 |
-
|
732 |
-
# If HDF5 file exists, attempt to delete it
|
733 |
-
if os.path.exists(hdf_file_path):
|
734 |
-
try:
|
735 |
-
os.remove(hdf_file_path)
|
736 |
-
logging.info(f"Existing HDF5 file {hdf_file_path} removed successfully.")
|
737 |
-
except Exception as e:
|
738 |
-
raise OSError(f"Error removing existing HDF5 file {hdf_file_path}. Error: {e}")
|
739 |
-
|
740 |
-
# Create directory structure for HDF5 file
|
741 |
-
create_directory_for_filepath(hdf_file_path)
|
742 |
-
|
743 |
-
# Save the numpy array to HDF5
|
744 |
-
with h5py.File(hdf_file_path, 'w') as hdf:
|
745 |
-
try:
|
746 |
-
grp = hdf.create_group("training_data")
|
747 |
-
except ValueError:
|
748 |
-
del hdf['training_data']
grp = hdf.create_group("training_data")
|
749 |
-
|
750 |
-
if compression:
|
751 |
-
grp.create_dataset("X", data=X, compression="lzf", chunks=True)
|
752 |
-
else:
|
753 |
-
grp.create_dataset("X", data=X, chunks=True)
|
754 |
-
|
755 |
-
logging.info(f"Numpy array saved to {hdf_file_path} successfully.")
|
756 |
-
|
757 |
-
# Save the pandas DataFrame to HDF5, if provided
|
758 |
-
if database is not None:
|
759 |
-
logging.info("Adding database into the HDF5 file!")
|
760 |
-
num_chunks = int(np.ceil(len(database) / pd_chunksize))
|
761 |
-
logging.info(f'Number of chunks: {num_chunks}')
|
762 |
-
chunk_grouping = np.arange(len(database)) // pd_chunksize
|
763 |
-
chunkseqs = database.groupby(chunk_grouping)
|
764 |
-
for i, (_, chunk) in enumerate(chunkseqs):
|
765 |
-
logging.info(f'Writing database chunk {i} into {hdf_file_path}')
|
766 |
-
if compression:
|
767 |
-
chunk.to_hdf(hdf_file_path, f'database_{i}', format='table', data_columns=True, mode='a', complib='lzo')
|
768 |
-
else:
|
769 |
-
chunk.to_hdf(hdf_file_path, f'database_{i}', format='table', data_columns=True, mode='a')
|
770 |
-
|
771 |
-
logging.info('Database addition finished!')
|
772 |
-
|
773 |
-
|
774 |
-
|
775 |
-
def dataframe_to_seqrecords(
|
776 |
-
df: pd.DataFrame,
|
777 |
-
fastaidcol: str = 'test_fastaid',
|
778 |
-
sequencecol: str = 'sequence'
|
779 |
-
) -> List[SeqRecord]:
|
780 |
-
"""
|
781 |
-
Convert a DataFrame with sequence information into a list of SeqRecord objects.
|
782 |
-
|
783 |
-
:param df: DataFrame containing at least two columns: one for sequence IDs and one for sequences.
|
784 |
-
:type df: pd.DataFrame
|
785 |
-
:param fastaidcol: Name of the column in `df` that contains sequence IDs. Defaults to 'test_fastaid'.
|
786 |
-
:type fastaidcol: str, optional
|
787 |
-
:param sequencecol: Name of the column in `df` that contains nucleotide sequences. Defaults to 'sequence'.
|
788 |
-
:type sequencecol: str, optional
|
789 |
-
:return: A list of SeqRecord objects constructed from the DataFrame.
|
790 |
-
:rtype: List[SeqRecord]
|
791 |
-
|
792 |
-
Example:
|
793 |
-
>>> import pandas as pd
|
794 |
-
>>> data = {'test_fastaid': ['seq1', 'seq2'], 'sequence': ['ATCG', 'GGTA']}
|
795 |
-
>>> df = pd.DataFrame(data)
|
796 |
-
>>> seq_records = dataframe_to_seqrecords(df)
|
797 |
-
>>> seq_records[0].id
|
798 |
-
'seq1'
|
799 |
-
"""
|
800 |
-
seq_records = []
|
801 |
-
for _, row in df.iterrows():
|
802 |
-
seq = Seq(row[sequencecol])
|
803 |
-
record = SeqRecord(seq, id=str(row[fastaidcol]), description="")
|
804 |
-
seq_records.append(record)
|
805 |
-
return seq_records
|
806 |
-
|
807 |
-
|
808 |
-
def write_seqrecords_to_fasta(
|
809 |
-
seq_records: List[SeqRecord],
|
810 |
-
file_name: str
|
811 |
-
) -> None:
|
812 |
-
"""
|
813 |
-
Write a list of SeqRecord objects to a FASTA file.
|
814 |
-
|
815 |
-
:param seq_records: List of SeqRecord objects to be written to file.
|
816 |
-
:type seq_records: List[SeqRecord]
|
817 |
-
:param file_name: Name or path of the file to write the FASTA records.
|
818 |
-
:type file_name: str
|
819 |
-
:return: None
|
820 |
-
:rtype: None
|
821 |
-
|
822 |
-
Example:
|
823 |
-
>>> from Bio.Seq import Seq
|
824 |
-
>>> from Bio.SeqRecord import SeqRecord
|
825 |
-
>>> seq_records = [SeqRecord(Seq('ATCG'), id='seq1'), SeqRecord(Seq('GGTA'), id='seq2')]
|
826 |
-
>>> write_seqrecords_to_fasta(seq_records, 'output.fasta')
|
827 |
-
"""
|
828 |
-
SeqIO.write(seq_records, file_name, "fasta")
|
829 |
-
|
830 |
-
|
831 |
-
def dump_records_to_files(
|
832 |
-
seq_records: List[SeqRecord],
|
833 |
-
folder_path: str
|
834 |
-
) -> None:
|
835 |
-
"""
|
836 |
-
Write each SeqRecord to a separate FASTA file in the specified folder.
|
837 |
-
|
838 |
-
:param seq_records: List of SeqRecord objects to be written individually.
|
839 |
-
:type seq_records: List[SeqRecord]
|
840 |
-
:param folder_path: Path to the folder where the files should be saved.
|
841 |
-
The folder will be created if it does not exist.
|
842 |
-
:type folder_path: str
|
843 |
-
:return: None
|
844 |
-
:rtype: None
|
845 |
-
|
846 |
-
Example:
|
847 |
-
>>> from Bio.Seq import Seq
|
848 |
-
>>> from Bio.SeqRecord import SeqRecord
|
849 |
-
>>> seq_records = [SeqRecord(Seq('ATCG'), id='seq1'), SeqRecord(Seq('GGTA'), id='seq2')]
|
850 |
-
>>> dump_records_to_files(seq_records, 'sequences_folder')
|
851 |
-
"""
|
852 |
-
# Ensure the folder exists
|
853 |
-
os.makedirs(folder_path, exist_ok=True)
|
854 |
-
|
855 |
-
for record in seq_records:
|
856 |
-
file_path = os.path.join(folder_path, f"{record.id}.fasta")
|
857 |
-
SeqIO.write(record, file_path, "fasta")
|
858 |
-
|
859 |
-
|
860 |
-
def split_seqrecords_to_fasta_chunks(
|
861 |
-
seq_records: List[SeqRecord],
|
862 |
-
output_folder: str,
|
863 |
-
chunk_size_mb: int = 10
|
864 |
-
) -> None:
|
865 |
-
"""
|
866 |
-
Splits a list of SeqRecord objects into multiple FASTA files, each less than a specified size in MB.
|
867 |
-
|
868 |
-
:param seq_records: List of SeqRecord objects to be split into chunks.
|
869 |
-
:type seq_records: List[SeqRecord]
|
870 |
-
:param output_folder: The output folder where the FASTA files will be saved.
|
871 |
-
:type output_folder: str
|
872 |
-
:param chunk_size_mb: Maximum size of each FASTA file in megabytes. Defaults to 10 MB.
|
873 |
-
:type chunk_size_mb: int, optional
|
874 |
-
:return: None
|
875 |
-
:rtype: None
|
876 |
-
|
877 |
-
Example:
|
878 |
-
>>> seq_records = [...] # A list of SeqRecord objects
|
879 |
-
>>> split_seqrecords_to_fasta_chunks(seq_records, 'output_chunks', chunk_size_mb=5)
|
880 |
-
|
881 |
-
Notes:
|
882 |
-
- The last chunk may be smaller than the specified `chunk_size_mb`.
|
883 |
-
- The function approximates the size of each record for chunking.
|
884 |
-
"""
|
885 |
-
# Ensure output folder exists
|
886 |
-
os.makedirs(output_folder, exist_ok=True)
|
887 |
-
|
888 |
-
current_chunk = []
|
889 |
-
current_chunk_size = 0 # in bytes
|
890 |
-
chunk_id = 1 # Identifier for chunks/files
|
891 |
-
for record in seq_records:
|
892 |
-
# Approximate size of the record in bytes
|
893 |
-
record_size = len(str(record.seq)) + len(record.id) + 2 # Adding buffer for '>' and '\n'
|
894 |
-
|
895 |
-
# Check if adding this record exceeds the chunk size
|
896 |
-
if current_chunk_size + record_size > chunk_size_mb * 1024 * 1024:
|
897 |
-
file_path = os.path.join(output_folder, f"chunk_{chunk_id}.fasta")
|
898 |
-
SeqIO.write(current_chunk, file_path, "fasta")
|
899 |
-
current_chunk = []
|
900 |
-
current_chunk_size = 0
|
901 |
-
chunk_id += 1
|
902 |
-
|
903 |
-
current_chunk.append(record)
|
904 |
-
current_chunk_size += record_size
|
905 |
-
|
906 |
-
# Write any remaining records to the last chunk
|
907 |
-
if current_chunk:
|
908 |
-
file_path = os.path.join(output_folder, f"chunk_{chunk_id}.fasta")
|
909 |
-
SeqIO.write(current_chunk, file_path, "fasta")
|
910 |
-
|
911 |
-
|
912 |
-
def filter_short_sequences(
|
913 |
-
seq_records: List[SeqRecord],
|
914 |
-
length_threshold: int
|
915 |
-
) -> List[SeqRecord]:
|
916 |
-
"""
|
917 |
-
Filters out SeqRecord objects with sequences shorter than a specified threshold.
|
918 |
-
|
919 |
-
:param seq_records: List of SeqRecord objects.
|
920 |
-
:type seq_records: List[SeqRecord]
|
921 |
-
:param length_threshold: The minimum length of sequences to be retained.
|
922 |
-
:type length_threshold: int
|
923 |
-
:return: A list of SeqRecord objects that meet or exceed the length threshold.
|
924 |
-
:rtype: List[SeqRecord]
|
925 |
-
|
926 |
-
Example:
|
927 |
-
>>> from Bio.Seq import Seq
|
928 |
-
>>> from Bio.SeqRecord import SeqRecord
|
929 |
-
>>> records = [
|
930 |
-
... SeqRecord(Seq('ATCG'), id='seq1'),
|
931 |
-
... SeqRecord(Seq('AT'), id='seq2')
|
932 |
-
... ]
|
933 |
-
>>> filtered_records = filter_short_sequences(records, 3)
|
934 |
-
>>> len(filtered_records)
|
935 |
-
1
|
936 |
-
>>> filtered_records[0].id
|
937 |
-
'seq1'
|
938 |
-
"""
|
939 |
-
filtered_records = [record for record in seq_records if len(record.seq) >= length_threshold]
|
940 |
-
return filtered_records
|
941 |
-
|
942 |
-
|
943 |
-
|
944 |
-
def get_token_counts_for_segment(Lseg, kmer, shift, offset):
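# Number of k-mer tokens obtainable from a segment of length Lseg with the given k-mer size
# and shift (the offset argument is accepted but unused in the calculation);
# e.g. Lseg=10, kmer=6, shift=1 gives int((10 - 6) / 1 + 1) = 5 tokens.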
|
945 |
-
nr_tokens = int((Lseg -kmer)/shift + 1)
|
946 |
-
return nr_tokens
|
947 |
-
|
948 |
-
def get_seq_coordinates(token_pos, kmer, shift, offset):
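# Maps a token index back to the half-open [seq_start, seq_end) window it covers in the
# underlying sequence, given the k-mer size, shift and LCA offset.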
|
949 |
-
seq_start = int(token_pos*shift + offset)
|
950 |
-
seq_end = int(token_pos*shift+kmer + offset)
|
951 |
-
return seq_start, seq_end
|
952 |
-
|
953 |
-
def get_token_coordinates(seq_pos, kmer, shift, offset, Lseg):
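# Returns the range of token indices whose k-mer windows can overlap the sequence position
# seq_pos, clamped to the valid token range of a segment of length Lseg.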
|
954 |
-
|
955 |
-
nrtokens = get_token_counts_for_segment(Lseg, kmer, shift, offset)
|
956 |
-
|
957 |
-
token_pos_end = int((seq_pos+offset - kmer) / shift)
|
958 |
-
token_pos_start = int((seq_pos + offset) / shift)
|
959 |
-
|
960 |
-
if token_pos_end<0:
|
961 |
-
token_pos_end=0
|
962 |
-
if token_pos_start >= nrtokens:
|
963 |
-
token_pos_start = nrtokens-1
|
964 |
-
|
965 |
-
return token_pos_start, token_pos_end
|
966 |
-
|
967 |
-
def sliding_window_average(arr, window_size=6):
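# Simple moving average of a 1-D array with a uniform window; 'valid' mode means the output
# has len(arr) - window_size + 1 elements (no padding at the edges).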
|
968 |
-
# Create a window for averaging
|
969 |
-
window = np.ones(window_size) / window_size
|
970 |
-
# Use 'valid' mode to slide the window over the array without padding
|
971 |
-
result = np.convolve(arr, window, mode='valid')
|
972 |
-
return result
|
973 |
-
|
974 |
-
def convolve_expression_array(expression_array, window_size=6, step=2):
|
975 |
-
# Define the averaging window
|
976 |
-
window = np.ones(window_size) / window_size
|
977 |
-
# Apply convolution along each row (axis=1)
|
978 |
-
convolved_array = convolve1d(expression_array, window, axis=1, mode='reflect')
|
979 |
-
# Downsample by step size
|
980 |
-
return convolved_array[:, ::step]
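# Shape note (illustrative): for an expression array of shape (n_samples, 100) with
# window_size=6 and step=2, the smoothed output keeps the same number of rows and is
# downsampled to 50 columns.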
tokenizer.py
DELETED
@@ -1,363 +0,0 @@
|
|
1 |
-
import collections
|
2 |
-
import os
|
3 |
-
import json
|
4 |
-
from copy import deepcopy
|
5 |
-
from typing import List, Optional, Tuple, Dict
|
6 |
-
from transformers import PreTrainedTokenizer
|
7 |
-
from transformers.utils.hub import cached_file, hf_hub_url
|
8 |
-
|
9 |
-
from .config_utils import SeqConfig
|
10 |
-
from .sequtils import generate_kmers, lca_kmer_tokenize_segment
|
11 |
-
|
12 |
-
# Define the names of the vocabulary files
|
13 |
-
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
|
14 |
-
|
15 |
-
# Define the mapping for pretrained vocabulary files
|
16 |
-
PRETRAINED_VOCAB_FILES_MAP = {
|
17 |
-
"vocab_file": {
|
18 |
-
"lca-mini-k6s1": "lca-base-dna6/vocab.txt",
|
19 |
-
"lca-mini-k6s2": "lca-base-dna6/vocab.txt",
|
20 |
-
"lca-mini-k1s1": "lca-base-dna1/vocab.txt",
|
21 |
-
}
|
22 |
-
}
|
23 |
-
|
24 |
-
# Define positional embedding sizes for pretrained models
|
25 |
-
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
26 |
-
"lca-mini-k6s1": 1024,
|
27 |
-
"lca-mini-k1s1": 1024,
|
28 |
-
"lca-mini-k6s2": 2048,
|
29 |
-
}
|
30 |
-
|
31 |
-
# Define initial configuration for pretrained models
|
32 |
-
PRETRAINED_INIT_CONFIGURATION = {
|
33 |
-
"lca-mini-k6s1": {"do_upper_case": True},
|
34 |
-
"lca-mini-k1s1": {"do_upper_case": True},
|
35 |
-
"lca-mini-k6s2": {"do_upper_case": True},
|
36 |
-
}
|
37 |
-
|
38 |
-
# Utility function to load vocabulary from a file
|
39 |
-
def load_vocab(vocab_file):
|
40 |
-
"""Loads a vocabulary file into a dictionary."""
|
41 |
-
vocab = collections.OrderedDict()
|
42 |
-
with open(vocab_file, "r", encoding="utf-8") as reader:
|
43 |
-
tokens = reader.readlines()
|
44 |
-
for index, token in enumerate(tokens):
|
45 |
-
vocab[token.rstrip("\n")] = index
|
46 |
-
return vocab
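# Note: each line of the vocabulary file holds one token, and the token's ID is its
# zero-based line index in the file.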
|
47 |
-
|
48 |
-
class LCATokenizer(PreTrainedTokenizer):
|
49 |
-
"""
|
50 |
-
Custom tokenizer for LCA (Local Context Aware) tasks.
|
51 |
-
Handles specific tokenization processes, including k-mer tokenization with configurable shifts.
|
52 |
-
|
53 |
-
Attributes:
|
54 |
-
vocab_files_names (dict): Mapping of vocabulary file names.
|
55 |
-
pretrained_vocab_files_map (dict): Mapping of pretrained vocabulary files.
|
56 |
-
pretrained_init_configuration (dict): Initial configuration for pretrained models.
|
57 |
-
max_model_input_sizes (dict): Maximum input sizes for pretrained models.
|
58 |
-
"""
|
59 |
-
|
60 |
-
vocab_files_names = VOCAB_FILES_NAMES
|
61 |
-
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
62 |
-
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
|
63 |
-
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
64 |
-
|
65 |
-
nucleotide_abc = {"A", "T", "C", "G"}
|
66 |
-
extended_nucleotide_abc = {"A", "T", "C", "G", "*"}
|
67 |
-
sequence_unk_token = 'N'
|
68 |
-
|
69 |
-
default_unk_token = "[UNK]"
|
70 |
-
default_sep_token = "[SEP]"
|
71 |
-
default_pad_token = "[PAD]"
|
72 |
-
default_cls_token = "[CLS]"
|
73 |
-
default_mask_token = "[MASK]"
|
74 |
-
|
75 |
-
def __init__(
|
76 |
-
self,
|
77 |
-
config: Dict = {},
|
78 |
-
operation_space: str = "kmer",
|
79 |
-
**kwargs,
|
80 |
-
):
|
81 |
-
"""
|
82 |
-
Initializes the LCATokenizer with configuration and operation space.
|
83 |
-
|
84 |
-
Args:
|
85 |
-
config (dict): Tokenization parameters like k-mer size and shift.
|
86 |
-
operation_space (str): Defines operation mode ('kmer' or 'sequence').
|
87 |
-
kwargs: Additional arguments for PreTrainedTokenizer.
|
88 |
-
"""
|
89 |
-
self.defconfig = SeqConfig()
|
90 |
-
config = self.defconfig.get_and_set_tokenization_parameters(config)
|
91 |
-
self.config = config
|
92 |
-
self.operation_space = operation_space
|
93 |
-
|
94 |
-
# Set default tokens
|
95 |
-
kwargs.setdefault("cls_token", self.default_cls_token)
|
96 |
-
kwargs.setdefault("unk_token", self.default_unk_token)
|
97 |
-
kwargs.setdefault("sep_token", self.default_sep_token)
|
98 |
-
kwargs.setdefault("pad_token", self.default_pad_token)
|
99 |
-
kwargs.setdefault("mask_token", self.default_mask_token)
|
100 |
-
|
101 |
-
# Load vocabulary
|
102 |
-
vocab_file = self.config["vocabfile"]
|
103 |
-
self.vocab = self.config["vocabmap"]
|
104 |
-
self.id2token = {v: k for k, v in self.vocab.items()}
|
105 |
-
self.max_len = self.config["max_segment_length"]
|
106 |
-
|
107 |
-
super().__init__(**kwargs)
|
108 |
-
|
109 |
-
# Handle extended vocabulary for sequence mode
|
110 |
-
if self.operation_space == 'sequence':
|
111 |
-
token_extension = sorted(list(set(generate_kmers(LCATokenizer.extended_nucleotide_abc, self.config['kmer'])) - \
|
112 |
-
set(generate_kmers(LCATokenizer.nucleotide_abc, self.config['kmer'])) ))
|
113 |
-
self.extended_vocab = deepcopy(self.vocab)
|
114 |
-
for token in token_extension:
|
115 |
-
self.extended_vocab[token] = 4
|
116 |
-
|
117 |
-
self.unk_token = LCATokenizer.sequence_unk_token * self.config['shift']
|
118 |
-
self.mask_token = '*'
|
119 |
-
self.extended_vocab[self.mask_token] = self.vocab['[MASK]']
|
120 |
-
|
121 |
-
full_unk = 'N' * self.config['kmer']
|
122 |
-
self.vocab[full_unk] = 1
|
123 |
-
self.id2token[1] = full_unk
|
124 |
-
self.full_unk_token = full_unk
|
125 |
-
|
126 |
-
else:
|
127 |
-
self.extended_vocab = self.vocab
|
128 |
-
self.unk_token = '[UNK]'
|
129 |
-
|
130 |
-
self.unkown_tokenid = self.vocab['[UNK]']
|
131 |
-
self.sep_token = '[SEP]'
|
132 |
-
self.cls_token = '[CLS]'
|
133 |
-
self.pad_token = '[PAD]'
|
134 |
-
self.mask_token = '[MASK]'
|
135 |
-
self.special_tokens = list(self.special_tokens_map.values())
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
def _tokenize(self, text, **kwargs):
|
140 |
-
"""
|
141 |
-
Tokenizes the input text using LCA tokenization with an optional offset.
|
142 |
-
|
143 |
-
Args:
|
144 |
-
text (str): The input DNA sequence to tokenize.
|
145 |
-
kwargs: Additional arguments, including:
|
146 |
-
- offset (int): The starting position for tokenization. Default is 0.
|
147 |
-
|
148 |
-
Returns:
|
149 |
-
List[str]: A list of tokens generated from the input text.
|
150 |
-
"""
|
151 |
-
offset = kwargs.get("offset", 0)
|
152 |
-
#if offset < 0 or offset >= self.config.get("shift", 1):
|
153 |
-
# raise ValueError(f"Invalid offset: {offset}. Must be between 0 and {self.config['shift'] - 1}.")
|
154 |
-
|
155 |
-
return lca_kmer_tokenize_segment(text, offset, self.config)
|
156 |
-
|
157 |
-
def _convert_token_to_id(self, token: str) -> int:
|
158 |
-
"""
|
159 |
-
Converts a token to its corresponding ID using the vocabulary.
|
160 |
-
|
161 |
-
Args:
|
162 |
-
token (str): The token to convert.
|
163 |
-
|
164 |
-
Returns:
|
165 |
-
int: Token ID, or the unknown token ID if the token is not in the vocabulary.
|
166 |
-
"""
|
167 |
-
return self.extended_vocab.get(token, self.unkown_tokenid)
|
168 |
-
|
169 |
-
def _convert_id_to_token(self, index: int) -> str:
|
170 |
-
"""
|
171 |
-
Converts an ID to its corresponding token using the vocabulary.
|
172 |
-
|
173 |
-
Args:
|
174 |
-
index (int): The ID to convert.
|
175 |
-
|
176 |
-
Returns:
|
177 |
-
str: Corresponding token, or the unknown token if the ID is not in the vocabulary.
|
178 |
-
"""
|
179 |
-
|
180 |
-
|
181 |
-
return self.id2token.get(index, self.unk_token)
|
182 |
-
|
183 |
-
def __len__(self) -> int:
|
184 |
-
"""
|
185 |
-
Returns the length of the tokenizer's vocabulary.
|
186 |
-
|
187 |
-
The length returned is the number of entries in the base vocabulary;
|
188 |
-
the extended vocabulary used in 'sequence' mode is not counted.
|
189 |
-
|
190 |
-
:return: The size of the vocabulary.
|
191 |
-
:rtype: int
|
192 |
-
"""
|
193 |
-
return len(self.vocab)
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
def tokenize(self, text: str, **kwargs) -> List[str]:
|
198 |
-
"""
|
199 |
-
Tokenizes the input text using LCA tokenization.
|
200 |
-
|
201 |
-
Args:
|
202 |
-
text (str): The input DNA sequence to tokenize.
|
203 |
-
kwargs: Additional arguments, including:
|
204 |
-
- offset (int): The starting position for tokenization. Default is 0.
|
205 |
-
|
206 |
-
Returns:
|
207 |
-
List[str]: A list of tokens generated from the input text.
|
208 |
-
"""
|
209 |
-
return self._tokenize(text, **kwargs)
|
210 |
-
|
211 |
-
def encode(self, text: str, **kwargs) -> List[int]:
|
212 |
-
"""
|
213 |
-
Extends the base `encode` method to support an `offset` parameter for custom tokenization logic.
|
214 |
-
|
215 |
-
Args:
|
216 |
-
text (str): Input text (DNA sequence).
|
217 |
-
offset (int): Offset parameter for the LCA tokenization. Defaults to 0.
|
218 |
-
kwargs: Additional arguments passed to the base `encode` method.
|
219 |
-
|
220 |
-
Returns:
|
221 |
-
List[int]: Encoded token IDs.
|
222 |
-
"""
|
223 |
-
# Inject the offset into kwargs for the tokenizer
|
224 |
-
offset = kwargs.get("offset", 0)
|
225 |
-
kwargs["offset"] = offset
|
226 |
-
return super().encode(text, **kwargs)
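# Illustrative usage sketch (assumes an already-constructed LCATokenizer instance named
# `tokenizer`; the input sequence is made up for this note):
#     ids = tokenizer.encode("ATCGATTTCGAT", offset=0)
# With shift > 1, calling encode with different `offset` values produces the different
# k-mer framings of the same segment.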
|
227 |
-
|
228 |
-
def build_inputs_with_special_tokens(
|
229 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
230 |
-
) -> List[int]:
|
231 |
-
"""
|
232 |
-
Builds inputs by adding special tokens to a sequence or pair of sequences.
|
233 |
-
|
234 |
-
Args:
|
235 |
-
token_ids_0 (List[int]): List of token IDs for the first sequence.
|
236 |
-
token_ids_1 (List[int], optional): List of token IDs for the second sequence.
|
237 |
-
|
238 |
-
Returns:
|
239 |
-
List[int]: Input IDs with special tokens.
|
240 |
-
"""
|
241 |
-
if token_ids_1 is None:
|
242 |
-
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
243 |
-
|
244 |
-
input_ids = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + token_ids_1 + [self.sep_token_id]
|
245 |
-
#token_type_ids = [0 for i in range(len(input_ids))]
|
246 |
-
return input_ids
|
247 |
-
|
248 |
-
def create_token_type_ids_from_sequences(
|
249 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
250 |
-
) -> List[int]:
|
251 |
-
"""
|
252 |
-
Create the token type IDs corresponding to the sequences passed. [What are token type
|
253 |
-
IDs?](../glossary#token-type-ids)
|
254 |
-
|
255 |
-
Should be overridden in a subclass if the model has a special way of building those.
|
256 |
-
|
257 |
-
Args:
|
258 |
-
token_ids_0 (`List[int]`): The first tokenized sequence.
|
259 |
-
token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
|
260 |
-
|
261 |
-
Returns:
|
262 |
-
`List[int]`: The token type ids.
|
263 |
-
"""
|
264 |
-
if token_ids_1 is None:
|
265 |
-
return (len(token_ids_0)+2) * [0]
|
266 |
-
return [0] * len(token_ids_0) + [1] * len(token_ids_1)
|
267 |
-
|
268 |
-
def batch_encode_plus(self, *args, **kwargs):
|
269 |
-
"""
|
270 |
-
Extends the base `batch_encode_plus` method to add custom functionality if needed.
|
271 |
-
|
272 |
-
Args:
|
273 |
-
*args: Positional arguments passed to the base method.
|
274 |
-
**kwargs: Keyword arguments passed to the base method.
|
275 |
-
|
276 |
-
Returns:
|
277 |
-
dict: A dictionary containing the results of batch encoding.
|
278 |
-
"""
|
279 |
-
# Call the parent method to handle the batch encoding
|
280 |
-
#print('Running batch encoding with ids')
|
281 |
-
act_outputs = super().batch_encode_plus(*args, **kwargs)
|
282 |
-
return act_outputs
|
283 |
-
|
284 |
-
|
285 |
-
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
286 |
-
"""
|
287 |
-
Saves the tokenizer's vocabulary to a file.
|
288 |
-
|
289 |
-
Args:
|
290 |
-
save_directory (str): Directory to save the vocabulary file.
|
291 |
-
filename_prefix (str, optional): Prefix for the filename. Default is None.
|
292 |
-
|
293 |
-
Returns:
|
294 |
-
Tuple[str]: Path to the saved vocabulary file.
|
295 |
-
"""
|
296 |
-
if filename_prefix is None:
|
297 |
-
filename_prefix = ""
|
298 |
-
vocab_file_path = os.path.join(save_directory, filename_prefix + "vocab.txt")
|
299 |
-
with open(vocab_file_path, "w") as f:
|
300 |
-
for token in self.vocab:
|
301 |
-
f.write(token + "\n")
|
302 |
-
return (vocab_file_path,)
|
303 |
-
|
304 |
-
def save_pretrained(self, save_directory: str, **kwargs):
|
305 |
-
"""
|
306 |
-
Saves the tokenizer configuration and vocabulary to a directory.
|
307 |
-
|
308 |
-
Args:
|
309 |
-
save_directory (str): Directory to save the tokenizer files.
|
310 |
-
"""
|
311 |
-
if not os.path.exists(save_directory):
|
312 |
-
os.makedirs(save_directory)
|
313 |
-
super().save_pretrained(save_directory, **kwargs)
|
314 |
-
|
315 |
-
tokenizer_config_path = os.path.join(save_directory, "tokenizer_config.json")
|
316 |
-
if os.path.exists(tokenizer_config_path):
|
317 |
-
with open(tokenizer_config_path, "r") as f:
|
318 |
-
tokenizer_config = json.load(f)
|
319 |
-
else:
|
320 |
-
tokenizer_config = {}
|
321 |
-
|
322 |
-
tokenizer_config.update({
|
323 |
-
"kmer": self.config.get("kmer", 6),
|
324 |
-
"shift": self.config.get("shift", 1),
|
325 |
-
})
|
326 |
-
|
327 |
-
with open(tokenizer_config_path, "w") as f:
|
328 |
-
json.dump(tokenizer_config, f, indent=2)
|
329 |
-
|
330 |
-
@classmethod
|
331 |
-
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
332 |
-
"""
|
333 |
-
Loads a tokenizer from the pretrained model directory or Hugging Face Hub.
|
334 |
-
|
335 |
-
Args:
|
336 |
-
pretrained_model_name_or_path (str): Path or model name on Hugging Face Hub.
|
337 |
-
kwargs: Additional arguments for initialization.
|
338 |
-
|
339 |
-
Returns:
|
340 |
-
LCATokenizer: The loaded tokenizer instance.
|
341 |
-
"""
|
342 |
-
tokenizer_config_file = hf_hub_url(
|
343 |
-
pretrained_model_name_or_path, filename="tokenizer_config.json"
|
344 |
-
)
|
345 |
-
resolved_tokenizer_config_file = cached_file(
|
346 |
-
pretrained_model_name_or_path, filename="tokenizer_config.json"
|
347 |
-
)
|
348 |
-
|
349 |
-
with open(resolved_tokenizer_config_file, "r") as f:
|
350 |
-
tokenizer_config = json.load(f)
|
351 |
-
|
352 |
-
kmer = tokenizer_config.pop("kmer", 6)
|
353 |
-
shift = tokenizer_config.pop("shift", 1)
|
354 |
-
base_tokenization_config = {'kmer': kmer, 'shift': shift}
|
355 |
-
defconfig = SeqConfig()
|
356 |
-
config = defconfig.get_and_set_tokenization_parameters(base_tokenization_config)
|
357 |
-
|
358 |
-
tokenizer = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
|
359 |
-
tokenizer.config = config
|
360 |
-
|
361 |
-
return tokenizer
|
362 |
-
|
363 |
-
|
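For context, a minimal usage sketch of the offset-aware API removed above, assuming the parent revision of this commit (with tokenizer.py and its SeqConfig/sequtils dependencies still present) is available locally. The checkpoint path and the example sequence are placeholders; LCATokenizer, encode(..., offset=...) and the kmer/shift defaults are taken from the deleted code:

# Sketch only: the path and the DNA sequence below are hypothetical.
from tokenizer import LCATokenizer  # module deleted in this commit

tok = LCATokenizer.from_pretrained("./lca-checkpoint")   # hypothetical local checkpoint
seq = "ATGCGTACGTTAGCA"                                  # toy DNA sequence

ids = tok.encode(seq)                    # offset defaults to 0
ids_shifted = tok.encode(seq, offset=1)  # start k-mer extraction one base later

Per the deleted code, the offset is injected into the kwargs by encode, forwarded through tokenize to _tokenize, and ultimately passed to lca_kmer_tokenize_segment together with the kmer/shift settings persisted in tokenizer_config.json.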
tokenizer_config.json
CHANGED
@@ -1,10 +1,4 @@
 {
-  "auto_map": {
-    "AutoTokenizer": [
-      "tokenizer.LCATokenizer",
-      null
-    ]
-  },
   "clean_up_tokenization_spaces": true,
   "cls_token": "[CLS]",
   "mask_token": "[MASK]",
@@ -15,4 +9,4 @@
   "unk_token": "[UNK]",
   "kmer": 6,
   "shift": 1
-}
+}
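With the auto_map entry removed (and tokenizer.py deleted above), AutoTokenizer no longer resolves the custom LCATokenizer class for this repository; the second hunk rewrites only the closing brace, which usually reflects a trailing-newline or whitespace change. A minimal sketch of the loading difference, using "user/model" as a placeholder repository id; which stock tokenizer class the remaining configuration resolves to depends on fields not shown in this diff:

from transformers import AutoTokenizer

# Before this commit: auto_map pointed AutoTokenizer at tokenizer.LCATokenizer,
# so loading required opting in to remote code execution.
# tok = AutoTokenizer.from_pretrained("user/model", trust_remote_code=True)

# After this commit: the custom mapping is gone, so the standard loading path is
# used, and the leftover "kmer"/"shift" keys in tokenizer_config.json are carried
# along as plain init kwargs rather than driving custom tokenization.
tok = AutoTokenizer.from_pretrained("user/model")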