Elias Buerger committed
Commit 28f312f
Parent(s): e4995f0

slim down
Browse files
- Dockerfile +0 -34
- app.py +2 -0
- environment.yml +0 -39
- protxlstm/applications/generation_utils/create_sequence_df.py +0 -85
- protxlstm/applications/generation_utils/score_hamming.py +0 -80
- protxlstm/applications/generation_utils/score_hmmer.py +0 -102
- protxlstm/applications/generation_utils/score_structure.py +0 -55
- protxlstm/applications/sample_sequences.py +0 -200
- protxlstm/applications/score_sequences.py +0 -58
- protxlstm/data.py +0 -60
- protxlstm/dataloaders.py +0 -249
- protxlstm/fim.py +0 -203
- protxlstm/index.html +0 -16
- protxlstm/models/llama.py +0 -342
- protxlstm/models/mamba.py +0 -833
- protxlstm/plot_utils.py +0 -26
- protxlstm/train.py +0 -338
- protxlstm/trainer.py +0 -123
- run.sh +0 -6
Dockerfile
DELETED
@@ -1,34 +0,0 @@
-FROM continuumio/anaconda3:main
-
-WORKDIR /code
-COPY ./environment.yml /code/environment.yml
-
-# Create the environment using the environment.yml file
-RUN conda env create -f /code/environment.yml
-
-# Set up a new user named "user" with user ID 1000
-RUN useradd -m -u 1000 user
-# Switch to the "user" user
-USER user
-# Set home to the user's home directory
-ENV HOME=/home/user \
-    PYTHONPATH=$HOME/app \
-    PYTHONUNBUFFERED=1 \
-    GRADIO_ALLOW_FLAGGING=never \
-    GRADIO_NUM_PORTS=1 \
-    GRADIO_SERVER_NAME=0.0.0.0 \
-    GRADIO_THEME=huggingface \
-    SYSTEM=spaces
-
-# Set the working directory to the user's home directory
-WORKDIR $HOME/app
-
-# Copy the current directory contents into the container at $HOME/app setting the owner to the user
-COPY --chown=user . $HOME/app
-
-# cgjs, u+x
-RUN chmod u+x $HOME/app/run.sh
-RUN chmod -R 777 $HOME/
-
-
-CMD ["./run.sh"]
app.py
CHANGED
@@ -111,11 +111,13 @@ if __name__ == "__main__":
             placeholder=DEFAULT_SEQUENCE,
         )
         st.session_state.context_sequences = context_sequence_str.split(",") + [st.session_state.target_sequence]
+        msa_file = None
     elif context_type == 'Use MSA file':
         msa_file = st.file_uploader("Choose MSA file")
         st.session_state.num_context_sequences = st.number_input("How many of these sequences should be used?", min_value=0, step=1, value=25)
     else:
         st.session_state.context_sequences = [st.session_state.target_sequence]
+        msa_file = None
 
     if st.session_state.target_sequence != "":
         with st.container():
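(The two added `msa_file = None` lines initialize `msa_file` in the branches that never create it; presumably this prevents a NameError when code further down references `msa_file` after the user picks a context type other than 'Use MSA file'.)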
environment.yml
DELETED
@@ -1,39 +0,0 @@
-name: prot_xlstm_app
-channels:
-  - pytorch
-  - nvidia
-  - conda-forge
-  - defaults
-dependencies:
-  - cuda=12.1
-  - cuda-nvcc=12.1
-  - gxx_linux-64=11.2.0
-  - python=3.11
-  - pip
-  - pytorch=2.2.0
-  - pytorch-cuda=12.1
-  - cmake
-  - ninja
-  - pip:
-    - accelerate>=0.26.0
-    - biopython #==1.83
-    - bottleneck #==1.4.2
-    - dacite #==1.8.1
-    - ipykernel #==6.29.3
-    - mamba_ssm==1.2.0
-    - matplotlib #==3.8.4
-    - numpy<2.0 #==1.26.4
-    - omegaconf #==2.3.0
-    - pandas #==2.2.2
-    - pyhmmer #==0.10.15
-    - rich #==13.7.1
-    - scipy #==1.13.0
-    - seaborn #==0.13.2
-    - torchmetrics #==1.2.1
-    - tqdm #==4.66.4
-    - transformers==4.44.2
-    - tueplots #==0.0.17
-    - wandb #==0.17.0
-    - streamlit #==1.43.2
-
-
protxlstm/applications/generation_utils/create_sequence_df.py
DELETED
@@ -1,85 +0,0 @@
-import numpy as np
-import pickle
-import pandas as pd
-
-from protxlstm.dataloaders import ProteinMemmapDataset
-from protxlstm.utils import decode_sequence, reorder_masked_sequence
-
-
-def create_sequence_df(model_name, family_idx, parameters_list=None, num_sequences=100, data_dir="./data/"):
-
-    # load dataset
-    dataset = ProteinMemmapDataset(
-        msa_memmap_path=f"{data_dir}open_protein_set_memmap.dat",
-        msa_memmap_meta_path=f"{data_dir}open_protein_set_memmap_indices.csv",
-        subset_path=f"{data_dir}/cluster_testing_set.txt",
-        sample=False,
-        max_msa_len=-1,
-        reverse=False,
-        seed=0,
-        troubleshoot=False,
-        fim_strategy="multiple_span",
-        always_mask=False,
-        max_position_embeddings=2048,
-        max_seq_position_embeddings=512,
-        add_position_ids="1d",
-        mask_fraction=0.2,
-        max_patches=5,
-    )
-
-    family_id = list(dataset.dataset_meta["msa_id"])[family_idx]
-
-    if model_name == "natural":
-
-        data = dataset[family_idx]
-        sequence_df = pd.DataFrame(columns=["family", "family_id", "sequence", "sequence_length"])
-        tokens = data["input_ids"][None,:]
-        all_context = decode_sequence(tokens[0].cpu().numpy())
-        list_sequences_msa = [reorder_masked_sequence(elem+"<cls>") for elem in all_context.split("<cls>")[1:-1]]
-
-        rd_idxs = np.random.choice(len(list_sequences_msa), num_sequences, replace=False)
-        natural_sequences = [seq for i, seq in enumerate(list_sequences_msa) if i in rd_idxs]
-
-        df_dict = {"family": [family_idx]*len(natural_sequences),
-                   "family_id": [family_id]*len(natural_sequences),
-                   "sequence": natural_sequences,
-                   "sequence_length": [len(seq) for seq in natural_sequences]}
-
-        sequence_df = pd.concat([sequence_df, pd.DataFrame(df_dict)], ignore_index=True)
-
-    else:
-
-        sequence_df = pd.DataFrame(columns=["family", "family_id", "n_seqs_ctx", "temperature", "top_k", "top_p", "original_sequence", "sequence", "sequence_length", "perplexity"])
-
-        if parameters_list is None:
-            parameters_list = [(10,1.,10,1.), (10,1.,15,1.), (10,1.,10,0.95), (10,0.9,10,0.95), (10,0.8,10,0.9),
-                               (100,1.,10,1.), (100,1.,15,1.), (100,1.,10,0.95), (100,0.9,10,0.95), (100,0.8,10,0.9),
-                               (500,1.,10,1.), (500,1.,15,1.), (500,1.,10,0.95), (500,0.9,10,0.95), (500,0.8,10,0.9),
-                               (1000,1.,10,1.), (1000,1.,15,1.), (1000,1.,10,0.95), (1000,0.9,10,0.95), (1000,0.8,10,0.9),
-                               (-1,1.,10,1.), (-1,1.,15,1.), (-1,1.,10,0.95), (-1,0.9,10,0.95), (-1,0.8,10,0.9)]
-
-        for param in parameters_list:
-            n_seqs_ctx, temperature, top_k, top_p = param
-
-            with open(f"evaluation/generation/generated_sequences/{model_name}/{family_idx}_{param}_{num_sequences}", "rb") as f:
-                gen_seqs = pickle.load(f)
-
-            original_sequences = list(gen_seqs[family_idx][param].keys())
-            reordered_sequences = [reorder_masked_sequence(seq) for seq in original_sequences]
-            perplexities = [gen_seqs[family_idx][param][seq]["perplexity"] for seq in original_sequences]
-            df_dict = {"family": [family_idx]*len(original_sequences),
-                       "family_id": [family_id]*len(original_sequences),
-                       "n_seqs_ctx": [n_seqs_ctx]*len(original_sequences),
-                       "temperature": [temperature]*len(original_sequences),
-                       "top_k": [top_k]*len(original_sequences),
-                       "top_p": [top_p]*len(original_sequences),
-                       "original_sequence": original_sequences,
-                       "sequence": reordered_sequences,
-                       "sequence_length": [len(seq) for seq in reordered_sequences],
-                       "perplexity": perplexities
-                       }
-
-            sequence_df = pd.concat([sequence_df, pd.DataFrame(df_dict)], ignore_index=True)
-
-    return sequence_df
-
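For reference, a minimal usage sketch of the deleted helper, assuming the memmap dataset files exist under ./data/ (and, for the non-"natural" branch, that sequences were pickled beforehand by sample_sequences.py); num_sequences must not exceed the family's MSA size:

    from protxlstm.applications.generation_utils.create_sequence_df import create_sequence_df

    # Collect 100 natural sequences of family 0 into a scoring DataFrame
    df = create_sequence_df("natural", family_idx=0, num_sequences=100, data_dir="./data/")
    print(df[["family_id", "sequence_length"]].describe())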
protxlstm/applications/generation_utils/score_hamming.py
DELETED
@@ -1,80 +0,0 @@
-import numpy as np
-from tqdm import tqdm
-import pandas as pd
-from Bio import Align
-
-from protxlstm.dataloaders import ProteinMemmapDataset
-from protxlstm.utils import decode_sequence, reorder_masked_sequence
-
-
-aligner = Align.PairwiseAligner()
-aligner.mode = 'global'
-aligner.match_score = 1
-aligner.mismatch_score = -1
-aligner.open_gap_score = -1
-aligner.extend_gap_score = -1
-
-def align_sequences(ref_seq, query_seq, print_alignments=False):
-    def hamming_str(s1, s2):
-        assert len(s1) == len(s2)
-        return sum(np.array(list(s1)) != np.array(list(s2)))/len(s1)
-    alignments = aligner.align(ref_seq, query_seq)
-    if print_alignments:
-        print("Score = %.1f:" % alignments[0].score)
-        print(alignments[0])
-    return hamming_str(alignments[0][0], alignments[0][1]), alignments[0][0], alignments[0][1]
-
-
-def score_hamming(sequence_df, family_idx, data_dir=f"./data/"):
-
-    assert len(set(list(sequence_df["family"]))) == 1 and sequence_df["family"].iloc[0] == family_idx
-
-    # load dataset
-    dataset = ProteinMemmapDataset(
-        msa_memmap_path=f"{data_dir}open_protein_set_memmap.dat",
-        msa_memmap_meta_path=f"{data_dir}open_protein_set_memmap_indices.csv",
-        subset_path=f"{data_dir}/cluster_testing_set.txt",
-        sample=False,
-        max_msa_len=-1,
-        reverse=False,
-        seed=0,
-        troubleshoot=False,
-        fim_strategy="multiple_span",
-        always_mask=False,
-        max_position_embeddings=2048,
-        max_seq_position_embeddings=512,
-        add_position_ids="1d",
-        mask_fraction=0.2,
-        max_patches=5,
-    )
-
-    # Select a sample of the dataset to be the input
-    data = dataset[family_idx]
-    tokens = data["input_ids"][None,:]
-    all_context = decode_sequence(tokens[0].cpu().numpy())
-    list_sequences_msa = [reorder_masked_sequence(elem+"<cls>") for elem in all_context.split("<cls>")[1:-1]]
-
-    # sequence_df["hamming"] = pd.Series(dtype=object)
-    sequence_df["min_hamming"] = pd.Series()
-    sequence_df["median_hamming"] = pd.Series()
-    sequence_df["mean_hamming"] = pd.Series()
-    sequence_df["std_hamming"] = pd.Series()
-
-    for seq in tqdm(list(sequence_df["sequence"])):
-
-        all_hamming = []
-        for ctx_seq in list_sequences_msa:
-            if ctx_seq == seq:
-                continue
-            else:
-                hamming, _, _ = align_sequences(ctx_seq, seq, print_alignments=False)
-                all_hamming.append(hamming)
-
-        # sequence_df.loc[sequence_df["sequence"] == seq, "hamming"] = [all_hamming]
-        sequence_df.loc[sequence_df["sequence"] == seq, "min_hamming"] = np.min(all_hamming)
-        sequence_df.loc[sequence_df["sequence"] == seq, "median_hamming"] = np.median(all_hamming)
-        sequence_df.loc[sequence_df["sequence"] == seq, "mean_hamming"] = np.mean(all_hamming)
-        sequence_df.loc[sequence_df["sequence"] == seq, "std_hamming"] = np.std(all_hamming)
-
-    return sequence_df
-
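A small sketch of the pairwise-alignment Hamming primitive this file built on, mirroring the deleted align_sequences (the two sequences below are made-up toy strings):

    import numpy as np
    from Bio import Align

    aligner = Align.PairwiseAligner()
    aligner.mode = "global"
    aligner.match_score = 1
    aligner.mismatch_score = -1
    aligner.open_gap_score = -1
    aligner.extend_gap_score = -1

    a, b = "MKTAYIAKQR", "MKSAYIAKQR"   # toy sequences
    aln = aligner.align(a, b)[0]
    s1, s2 = aln[0], aln[1]             # aligned strings of equal length
    hamming = sum(np.array(list(s1)) != np.array(list(s2))) / len(s1)
    print(f"normalized Hamming distance: {hamming:.2f}")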
protxlstm/applications/generation_utils/score_hmmer.py
DELETED
@@ -1,102 +0,0 @@
-import string
-from Bio import SeqIO
-import pyhmmer
-from tqdm import tqdm
-
-alphabet = pyhmmer.easel.Alphabet.amino()
-
-# This is an efficient way to delete lowercase characters and insertion characters from a string
-deletekeys = dict.fromkeys(string.ascii_lowercase)
-deletekeys["."] = None
-deletekeys["*"] = None
-translation = str.maketrans(deletekeys)
-
-def remove_insertions(sequence: str) -> str:
-    """ Removes any insertions into the sequence. Needed to load aligned sequences in an MSA. """
-    return sequence.translate(translation)
-
-def read_msa(filename: str):
-    """ Reads the sequences from an MSA file, automatically removes insertions."""
-    return [(record.description, remove_insertions(str(record.seq))) for record in SeqIO.parse(filename, "fasta")]
-
-def read_msa_unaligned(filename: str):
-    """ Reads the sequences from an MSA file, removes only . - and * characters."""
-    return [(record.description, str(record.seq).replace(".","").replace("-","").replace("*","").upper()) for record in SeqIO.parse(filename, "fasta")]
-
-def check_msa(msa):
-    """ Checks if there are any repeated sequences in the MSA"""
-    seqs = set()
-    for el in msa:
-        seqs.add(el[1])
-    assert len(seqs) == len(msa), "There are repeated sequences in the MSA"
-
-def make_hmm_from_a3m_msa(msa_filepath, hmm_filename=None):
-    # Load MSA from a3m
-    msa_tup = read_msa(msa_filepath)
-    # check_msa(msa_tup)
-    # Create digitized MSA block
-    all_seqs = [pyhmmer.easel.TextSequence(name=str(i).encode("utf-8"), sequence=seq) for i, (idz, seq) in enumerate(msa_tup)]
-    msa = pyhmmer.easel.TextMSA(name=b"msa", sequences=all_seqs)
-    msa = msa.digitize(alphabet)
-    # Fit HMM
-    builder = pyhmmer.plan7.Builder(alphabet)
-    background = pyhmmer.plan7.Background(alphabet)
-    hmm, _, _ = builder.build_msa(msa, background)
-    if hmm_filename is not None:
-        with open(f"{hmm_filename}.hmm", "wb") as output_file:
-            hmm.write(output_file)
-    return hmm
-
-def align_and_score_sequences_in_a3m_with_hmm(hmm, sequences_path=None, sequences_list=None):
-    if sequences_list is not None:
-        msa = sequences_list
-        all_seqs = [pyhmmer.easel.TextSequence(name=str(i).encode("utf-8"), sequence=seq) for i, seq in enumerate(sequences_list)]
-    elif sequences_path is not None:
-        # Load sequences from a3m
-        msa = read_msa_unaligned(sequences_path)
-        all_seqs = [pyhmmer.easel.TextSequence(name=str(i).encode("utf-8"), sequence=seq) for i, (idz, seq) in enumerate(msa)]
-    else:
-        raise NotImplementedError("Missing sequences to align/score")
-    # Create digitized Sequence block
-    seq_block = pyhmmer.easel.TextSequenceBlock(all_seqs)
-    seq_block = seq_block.digitize(alphabet)
-    # Get all hits from the hmm
-    background = pyhmmer.plan7.Background(alphabet)
-    pipeline = pyhmmer.plan7.Pipeline(alphabet, background=background, bias_filter=False, F1=1.0, F2=1.0, F3=1.0)
-    hits = pipeline.search_hmm(hmm, seq_block)
-    if len(hits) != len(msa):
-        print(f"Number of hits: {len(hits)} is different from the number of sequences in the MSA: {len(msa)}")
-    # Extract hits
-    all_hits = {}
-    for hit in hits:
-        idz, score, evalue = hit.name, hit.score, hit.evalue
-        i = int(idz.decode("utf-8"))
-        seq = msa[i][1] if sequences_path is not None else sequences_list[i]
-        all_hits[seq] = {"score": score, "evalue": evalue}
-    return all_hits
-
-
-def score_hmmer(sequence_df, family_idx, data_dir=f"./data/"):
-
-    assert len(set(list(sequence_df["family"]))) == 1 and sequence_df["family"].iloc[0] == family_idx
-
-    family_id = sequence_df["family_id"].iloc[0]
-    msa_filepath = f"{data_dir}/a3m_files/{family_id}/a3m/uniclust30.a3m"
-    try:
-        hmm = make_hmm_from_a3m_msa(msa_filepath)
-    except:
-        raise Exception(f"Missing MSA of family {family_id}")
-
-    # align sequences
-    sequences = list(sequence_df["sequence"])
-    scores = align_and_score_sequences_in_a3m_with_hmm(hmm, sequences_list=sequences)
-
-    # save the scores associated to each sequence in the main df in the columns "score" and "evalue"
-    for seq in tqdm(sequences):
-        sequence_df.loc[sequence_df["sequence"] == seq, "score_gen"] = scores[seq]["score"] if seq in scores.keys() else 0
-        sequence_df.loc[sequence_df["sequence"] == seq, "evalue_gen"] = scores[seq]["evalue"] if seq in scores.keys() else 1
-
-    return sequence_df
-
-
-
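A minimal sketch of the pyhmmer workflow in this file, using the deleted helpers directly: fit a profile HMM on an a3m MSA, then score unaligned sequences against it (the MSA path and query sequence are placeholders; sequences that do not hit the profile simply never appear in the returned dict, which is why score_hmmer falls back to score 0 / e-value 1):

    from protxlstm.applications.generation_utils.score_hmmer import (
        make_hmm_from_a3m_msa,
        align_and_score_sequences_in_a3m_with_hmm,
    )

    hmm = make_hmm_from_a3m_msa("data/a3m_files/<family_id>/a3m/uniclust30.a3m")  # placeholder path
    hits = align_and_score_sequences_in_a3m_with_hmm(hmm, sequences_list=["MKTAYIAKQR"])  # toy query
    for seq, h in hits.items():
        print(seq, h["score"], h["evalue"])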
protxlstm/applications/generation_utils/score_structure.py
DELETED
@@ -1,55 +0,0 @@
-from Bio.PDB import PDBParser
-import torch
-from tqdm import tqdm
-from transformers import EsmForProteinFolding
-
-from protxlstm.utils import MASK_TO_ID
-
-
-pdb_parser = PDBParser()
-
-
-def compute_structure(seq, model):
-    def keep_sequence(seq, l):
-        if len(seq) > l:
-            return False
-        for mm in list(MASK_TO_ID.keys())+["<eos>", "<pad>", "<unk>", "<mask>", "<cls>", "<null_1>", ".", "-"]:
-            if mm in seq:
-                return False
-        return True
-    keep = keep_sequence(seq, l=750)
-    if keep:
-        with torch.no_grad():
-            output = model.infer([seq])
-        # pdb = model.output_to_pdb(output)
-        ptm = output["ptm"].item()
-        pae = output["predicted_aligned_error"].cpu().numpy()
-        mean_plddt = ((output["plddt"] * output["atom37_atom_exists"]).sum(dim=(1, 2)) / output["atom37_atom_exists"].sum(dim=(1, 2))).item()
-        pos_plddt = ((output["plddt"] * output["atom37_atom_exists"]).sum(dim=(2,)) / output["atom37_atom_exists"].sum(dim=(2,))).cpu().numpy()
-    else:
-        print(f"Sequence is invalid.")
-        ptm, pae, mean_plddt, pos_plddt = 0, 0, 0, 0
-    return ptm, pae, mean_plddt, pos_plddt
-
-
-def score_structure(sequence_df, family_idx):
-
-    assert len(set(list(sequence_df["family"]))) == 1 and sequence_df["family"].iloc[0] == family_idx
-
-    device = "cuda:0"
-
-    # Import the folding model
-    model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=True)
-
-    model = model.cuda(device)
-    model.esm = model.esm.half()
-    torch.backends.cuda.matmul.allow_tf32 = True
-
-    sequences = list(sequence_df["sequence"])
-    for seq in tqdm(sequences):
-
-        ptm, pae, mean_plddt, pos_plddt = compute_structure(seq, model)
-        sequence_df.loc[sequence_df["sequence"] == seq, "ptm"] = ptm
-        sequence_df.loc[sequence_df["sequence"] == seq, "mean_plddt"] = mean_plddt
-
-    return sequence_df
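A sketch of the ESMFold scoring step in isolation, following the same calls as the deleted compute_structure (requires a CUDA GPU and downloads facebook/esmfold_v1; the sequence below is a toy placeholder):

    import torch
    from transformers import EsmForProteinFolding

    model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=True).cuda()
    model.esm = model.esm.half()

    with torch.no_grad():
        out = model.infer(["MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"])  # toy sequence
    print("pTM:", out["ptm"].item())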
protxlstm/applications/sample_sequences.py
DELETED
@@ -1,200 +0,0 @@
-import torch
-from tqdm import tqdm
-import pickle
-import os
-import argparse
-import json
-
-from protxlstm.dataloaders import ProteinMemmapDataset
-from protxlstm.generation import generate_sequence
-from protxlstm.utils import (
-    AA_TO_ID,
-    load_model,
-)
-from protxlstm.models.xlstm import xLSTMLMHeadModel
-from protxlstm.models.mamba import MambaLMHeadModelwithPosids
-
-
-def sample_sequences(dataset,
-                     model,
-                     family_idx,
-                     params,
-                     n_samples_per_family,
-                     max_length=1000,
-                     chunk_chunk_size=2**15,
-                     save_path=None,
-                     device="cuda:0"):
-    """
-    Function to sample sequences from the model. Given a dataset, a list of families (their indexes in the dataset)
-    and a set of generating parameters, it generates `n_samples_per_family` sequences for each family and each parameter set.
-    The function returns a dictionary with the following structure:
-        gen_seqs = {family_idx: {parameters: {sequence: perplexity}}}
-    The parameters are in a list of tuples with the following structure:
-        parameters_list = [(nr_seqs_ctx, temperature, top_k, top_p)]
-    """
-    gen_seqs = {}
-    gen_seqs[family_idx] = {}
-    gen_seqs[family_idx][params] = {}
-    print(f"Sampling sequences for family {family_idx} and parameters {params}.")
-
-    n_seqs_ctx, temperature, top_k, top_p = params
-    for _ in tqdm(range(n_samples_per_family)):
-        # Sample the dataset to get the input
-        data = dataset[family_idx]
-        tokens = data["input_ids"][None,:].to(device)
-        pos_ids = data["position_ids"][None,:].to(device)
-
-        start_seqs = torch.argwhere(tokens[0]==0)[:,0].cpu().numpy()
-
-        n_seqs_ctx = len(start_seqs) if len(start_seqs) < n_seqs_ctx else n_seqs_ctx
-        L = start_seqs[n_seqs_ctx]+1
-        context_tokens = tokens[:,:L]
-        context_pos_ids = pos_ids[:,:L]
-        is_fim = {}
-
-        # Generate the new sequence
-        output = generate_sequence(model,
-                                   context_tokens,
-                                   position_ids=context_pos_ids,
-                                   is_fim=is_fim,
-                                   max_length=(L+max_length),
-                                   temperature=temperature,
-                                   top_k=top_k,
-                                   top_p=top_p,
-                                   return_dict_in_generate=True,
-                                   output_scores=True,
-                                   eos_token_id=torch.tensor([AA_TO_ID["<cls>"]]).to(device),
-                                   chunk_chunk_size=chunk_chunk_size,
-                                   device=device)
-
-        # Get the perplexity of the generated sequence
-        output_seq = output["generated"]
-        loss = torch.nn.functional.cross_entropy(torch.from_numpy(output["scores"]).permute(0, 2, 1),
-                                                 torch.from_numpy(output["generated_tokens"][0][None,:]))
-
-        # save only sequences with length < max_length
-        if len(output_seq[0]) < max_length:
-
-            gen_seqs[family_idx][params][output_seq[0]] = {"perplexity": torch.exp(loss).item()}
-
-    if save_path is not None:
-        if not os.path.exists("evaluation/generation/generated_sequences"):
-            os.mkdir("evaluation/generation/generated_sequences")
-        if not os.path.exists(save_path):
-            os.mkdir(save_path)
-        with open(f'{save_path}/{family_idx}_{params}_{n_samples_per_family}', "wb") as f:
-            pickle.dump(gen_seqs, f)
-        print(f"Sequences saved for family {family_idx} and parameters {params}")
-
-    return gen_seqs
-
-def generate_sequences(model_name,
-                       checkpoint,
-                       family_idxs=[],
-                       parameters_list=[],
-                       n_samples_per_family=100,
-                       chunk_size=1024,
-                       chunk_chunk_size=2**15,
-                       data_dir="data/",
-                       device="cuda:0"
-                       ):
-
-    # Load the test dataset
-    fim_strategy = "multiple_span"
-    mask_fraction = 0.2
-
-    dataset = ProteinMemmapDataset(
-        msa_memmap_path=f"{data_dir}open_protein_set_memmap.dat",
-        msa_memmap_meta_path=f"{data_dir}open_protein_set_memmap_indices.csv",
-        subset_path=f"{data_dir}cluster_testing_set.txt",
-        sample=False,
-        max_msa_len=-1,
-        reverse=False,
-        seed=0,
-        troubleshoot=False,
-        fim_strategy=fim_strategy,
-        always_mask=False,
-        max_position_embeddings=2048,
-        max_seq_position_embeddings=512,
-        add_position_ids="1d",
-        mask_fraction=mask_fraction
-    )
-
-    if model_name == "xlstm":
-        model_class = xLSTMLMHeadModel
-    elif model_name == "mamba":
-        model_class = MambaLMHeadModelwithPosids
-
-    save_path = f"evaluation/generation/generated_sequences/{checkpoint.split('/')[-1]}"
-
-    if model_name == "xlstm":
-        config_update_kwargs = {
-            "mlstm_backend": "chunkwise_variable",
-            "mlstm_chunksize": chunk_size,
-            "mlstm_return_last_state": True
-        }
-    else:
-        config_update_kwargs = {}
-
-
-    # load the model
-    model = load_model(checkpoint,
-                       model_class=model_class,
-                       device=device,
-                       dtype=torch.bfloat16,
-                       **config_update_kwargs,
-                       )
-    model = model.eval()
-    print("Model loaded.")
-
-    for family_idx in family_idxs:
-        for params in parameters_list:
-            params = tuple(params)
-            if not os.path.exists(f'{save_path}/{family_idx}_{params}_{n_samples_per_family}'):
-                gen_seqs = sample_sequences(
-                    dataset=dataset,
-                    model=model,
-                    family_idx=family_idx,
-                    params=params,
-                    n_samples_per_family=n_samples_per_family,
-                    chunk_chunk_size=chunk_chunk_size,
-                    save_path=save_path,
-                    device=device)
-
-                print(f"Sampled {len(gen_seqs[family_idx][params])} valid sequences.")
-            else:
-                print(f"Sequences for family {family_idx} and parameters {params} already exist.")
-
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser(
-        description="Generate sequences."
-    )
-    parser.add_argument("--model_name", type=str, help="Either 'xlstm' or 'mamba'.")
-    parser.add_argument("--checkpoint", type=str, help="Path to model checkpoint.")
-    parser.add_argument("--family_idxs", type=str, help="List of family indices.")
-    parser.add_argument("--parameters_list", type=str, help="List of sampling parameters.")
-    parser.add_argument("--n_samples_per_family", type=int, default=100, help="Number of sequences to sample per family and parameter set.")
-    parser.add_argument("--chunk_size", type=int, default=1024, help="Chunk size for xLSTM context encoding.")
-    parser.add_argument("--chunk_chunk_size", type=int, default=2*15, help="Length of context sequence part processed at once.")
-    parser.add_argument("--data_dir", type=str, default="data/", help="Path to dataset.")
-    parser.add_argument("--device", type=str, default="cuda:0", help="Device.")
-
-    args = parser.parse_args()
-
-    family_idxs = json.loads(args.family_idxs)
-    parameters_list = json.loads(args.parameters_list)
-
-    # Run sequence generation
-    generate_sequences(
-        model_name=args.model_name,
-        checkpoint=args.checkpoint,
-        family_idxs=family_idxs,
-        parameters_list=parameters_list,
-        n_samples_per_family=args.n_samples_per_family,
-        chunk_size=args.chunk_size,
-        chunk_chunk_size=args.chunk_chunk_size,
-        data_dir=args.data_dir,
-        device=args.device,
-    )
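For reference, the script could also be driven directly from Python rather than the CLI; a direct-call sketch (the checkpoint path reuses the protxlstm_26M_30B name seen in protxlstm/index.html below and is a placeholder, as are the family indices):

    from protxlstm.applications.sample_sequences import generate_sequences

    generate_sequences(
        model_name="xlstm",
        checkpoint="checkpoints/protxlstm_26M_30B",  # placeholder checkpoint path
        family_idxs=[0, 1],
        parameters_list=[(100, 1.0, 10, 1.0)],       # (n_seqs_ctx, temperature, top_k, top_p)
        n_samples_per_family=10,
        data_dir="data/",
        device="cuda:0",
    )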
protxlstm/applications/score_sequences.py
DELETED
@@ -1,58 +0,0 @@
-import argparse
-import os
-import pickle
-
-from generation_utils.create_sequence_df import create_sequence_df
-from generation_utils.score_hamming import score_hamming
-from generation_utils.score_hmmer import score_hmmer
-from generation_utils.score_structure import score_structure
-
-
-def score_sequences(model_name,
-                    family_idx,
-                    num_sequences=100,
-                    data_dir="data/"):
-
-    if os.path.isfile(f"evaluation/generation/evaluations/{model_name}/sequence_df_{family_idx}"):
-        with open(f"evaluation/generation/evaluations/{model_name}/sequence_df_{family_idx}", "rb") as f:
-            sequence_df = pickle.load(f)
-    else:
-        sequence_df = create_sequence_df(model_name, family_idx, data_dir=data_dir, num_sequences=num_sequences)
-        if not os.path.exists("evaluation/generation/evaluations/"):
-            os.mkdir("evaluation/generation/evaluations/")
-        if not os.path.exists(f"evaluation/generation/evaluations/{model_name}/"):
-            os.mkdir(f"evaluation/generation/evaluations/{model_name}/")
-        with open(f"evaluation/generation/evaluations/{model_name}/sequence_df_{family_idx}", "wb") as f:
-            pickle.dump(sequence_df, f)
-
-    if not "min_hamming" in sequence_df.columns:
-        sequence_df = score_hamming(sequence_df, family_idx, data_dir)
-        with open(f"evaluation/generation/evaluations/{model_name}/sequence_df_{family_idx}", "wb") as f:
-            pickle.dump(sequence_df, f)
-
-    if not "score_gen" in sequence_df.columns:
-        sequence_df = score_hmmer(sequence_df, family_idx, data_dir)
-        with open(f"evaluation/generation/evaluations/{model_name}/sequence_df_{family_idx}", "wb") as f:
-            pickle.dump(sequence_df, f)
-
-    if not "ptm" in sequence_df.columns:
-        sequence_df = score_structure(sequence_df, family_idx)
-        with open(f"evaluation/generation/evaluations/{model_name}/sequence_df_{family_idx}", "wb") as f:
-            pickle.dump(sequence_df, f)
-
-    return sequence_df
-
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser(
-        description="Generate sequences."
-    )
-    parser.add_argument("--model_name", type=str, help="Either 'xlstm' or 'mamba'.")
-    parser.add_argument("--family_idx", type=int, help="Family index.")
-    parser.add_argument("--num_sequences", type=int, default=100, help="Number of sequences.")
-    parser.add_argument("--data_dir", type=str, default="./data/", help="Path to dataset.")
-
-    args = parser.parse_args()
-
-    sequence_df = score_sequences(args.model_name, args.family_idx, args.num_sequences, args.data_dir)
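The wrapper pickles the DataFrame after each stage and checks for the stage's output columns on reload, so it can be re-run to resume a partially completed evaluation. A direct-call sketch (note the file imports generation_utils relative to its own directory, so it was presumably run with protxlstm/applications/ as the working directory):

    from score_sequences import score_sequences

    # Runs (or resumes) Hamming, HMMER, and structure scoring for one family
    df = score_sequences("xlstm", family_idx=0, num_sequences=100, data_dir="./data/")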
protxlstm/data.py
DELETED
@@ -1,60 +0,0 @@
-import csv
-import os
-
-import numpy as np
-from tqdm import tqdm
-
-from protxlstm.utils import load_sequences_from_msa_file, tokenizer
-
-def process_msa(msa_item):
-    msa_name, msa_path = msa_item
-    # Load an a3m file with all the context sequences
-    msa = load_sequences_from_msa_file(msa_path)
-    # Tokenize the sequences and concatenate them into a single array
-    tokens = tokenizer(msa, concatenate=True)
-    tokens = tokens.numpy()[0]
-    return msa_name, tokens
-
-def main(data_dir, output_dir):
-    msa_paths = {k: os.path.join(data_dir, k, 'a3m/uniclust30.a3m') for k in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, k))}
-    msa_items = list(msa_paths.items())
-
-    dataset_dictionary = {}
-    total_length = 0
-
-    # First pass: calculate total length of all concatenated arrays
-    for item in tqdm(msa_items):
-        try:
-            k, v = process_msa(item)
-            dataset_dictionary[k] = v
-            total_length += len(v)
-        except:
-            print(f"Error processing {item}")
-
-    # Initialize the memmap array with the calculated total length
-    memmap_path = os.path.join(output_dir, 'open_protein_set_memmap.dat')
-    concatenated_array = np.memmap(memmap_path, dtype='int8', mode='w+', shape=(total_length,))
-
-    with open(f'{output_dir}/open_protein_set_memmap_indices.csv', 'w', newline='') as csvfile:
-        csvwriter = csv.writer(csvfile)
-
-        csvwriter.writerow(['msa_id', 'Start', 'End'])
-
-        start_index = 0
-        for key, array in dataset_dictionary.items():
-            end_index = start_index + len(array) - 1
-            concatenated_array[start_index:end_index + 1] = array  # Write to memmap
-            csvwriter.writerow([key, start_index, end_index])
-            start_index = end_index + 1
-
-    # Ensure the data is written to disk
-    concatenated_array.flush()
-
-
-if __name__ == "__main__":
-    data_dir = 'data/a3m_files'
-    output_dir = 'data/'
-    main(data_dir, output_dir)
-
-
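The memmap plus index CSV written here is what ProteinMemmapDataset (below) consumes. A sketch of reading one cluster back by hand, assuming the files were built by this script (note the End column is stored inclusive):

    import numpy as np
    import pandas as pd

    tokens = np.memmap("data/open_protein_set_memmap.dat", dtype=np.int8, mode="r")
    meta = pd.read_csv("data/open_protein_set_memmap_indices.csv")

    row = meta.iloc[0]
    cluster_tokens = tokens[row.Start : row.End + 1]  # End is inclusive in the CSV
    print(row.msa_id, len(cluster_tokens))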
protxlstm/dataloaders.py
DELETED
@@ -1,249 +0,0 @@
-# Original code from ProtMamba under Apache License 2.0.
-#
-# Modifications made by Niklas Schmidinger, Lisa Schneckenreiter and Sohvi Luukkonen
-#  - Uniclust30_Dataset renamed to ProteinMemmapDataset
-#  - Dataset input file format changed for more efficient dataloading
-#  - Option to use only a subset
-#  - DataCollatorForUniclust30Dataset renamed to ProteinDataCollator
-#  - Add sequence padding
-
-import numpy as np
-import pandas as pd
-import torch
-from torch.utils.data import DataLoader, Dataset
-from typing import Dict, Optional, Sequence
-
-from protxlstm.fim import MultipleSpanFIM, NoFIM, SingleSpanFIM
-from protxlstm.utils import AA_TO_ID
-
-
-# Make dataset
-class ProteinMemmapDataset(Dataset):
-    """
-    ProteinMemmapDataset is a PyTorch Dataset class for handling memory-mapped datasets of protein multiple sequence alignments (MSAs).
-
-    This class imports MSA data stored in memmap format and associated metadata CSVs. It supports flexible
-    data sampling strategies and inpainting methods for sequence manipulation and training purposes.
-
-    Args:
-        msa_memmap_path (str): Path to the memory-mapped file containing the MSA clusters.
-        msa_memmap_meta_path (str): Path to the CSV file with metadata linking MSA Cluster IDs and indices in the memmap array.
-        subset_path (str, optional): Path to a CSV file specifying a subset of cluster IDs to use.
-        sample (bool, optional): If True, randomly samples sequences from each cluster; otherwise, loads all sequences and shuffles them.
-        max_msa_len (int, optional): Maximum length of the MSA sequences to include. Defaults to -1 (no limit).
-        reverse (bool, optional): If True, reverses sequences with a probability of 0.5 and moves the last token to the front.
-        seed (int, optional): Random seed for reproducibility. Defaults to 42.
-        troubleshoot (bool, optional): If True, prints debugging information. Defaults to False.
-        fim_strategy (str, optional): Strategy for inpainting ("no-scramble", "one_span", or "multiple_span").
-        max_patches (int, optional): Number of patches for inpainting. Used when fim_strategy is "multiple_span".
-        mask_fraction (float, optional): Fraction of the patches to mask. Used when fim_strategy is "multiple_span".
-        always_mask (bool, optional): If True, ensures masking is applied in the inpainting process.
-        max_position_embeddings (int, optional): Maximum position embeddings. Defaults to 2048.
-        max_seq_position_embeddings (int, optional): Maximum sequence position embeddings for 2D positional IDs. Defaults to 512.
-        add_position_ids (str, optional): Type of position IDs to add ("none", "1d", or "2d"). Defaults to "1d".
-    """
-
-    _FIM = {"no-scramble": NoFIM, "one_span": SingleSpanFIM, "multiple_span": MultipleSpanFIM}
-    _POSIDS = {"none", "1d", "2d"}
-
-    def __init__(self,
-                 msa_memmap_path=None,
-                 msa_memmap_meta_path=None,
-                 subset_path=None,
-                 sample=False,
-                 max_msa_len=-1,
-                 reverse=False,
-                 seed=42,
-                 troubleshoot=False,
-                 fim_strategy="no-scramble",
-                 max_patches=5,
-                 mask_fraction=0.2,
-                 always_mask=False,
-                 max_position_embeddings=2048,
-                 max_seq_position_embeddings=512,
-                 add_position_ids="1d", ):
-
-        np.random.seed(seed)
-
-        if msa_memmap_path:
-            self.dataset = np.memmap(msa_memmap_path, dtype=np.int8, mode='r')
-            self.dataset_meta = pd.read_csv(msa_memmap_meta_path)
-            if subset_path:
-                subset_ids = pd.read_csv(subset_path, header=None, names=['ID'])['ID'].tolist()
-                self.dataset_meta = self.dataset_meta[self.dataset_meta['msa_id'].isin(subset_ids)]
-        else:
-            self.dataset = None
-
-        self.sample = sample
-        self.max_msa_len = max_msa_len
-        self.reverse = reverse
-        self.fim_strategy = fim_strategy
-        if fim_strategy in ProteinMemmapDataset._FIM:
-            self.fim = ProteinMemmapDataset._FIM[fim_strategy](max_patches=max_patches,
-                                                               mask_fraction=mask_fraction,
-                                                               always_mask=always_mask,
-                                                               add_position_ids=add_position_ids != "none",
-                                                               troubleshoot=troubleshoot)
-        else:
-            raise ValueError(f'Fill in the middle stragy "{fim_strategy}" not recognized.')
-
-        self.max_position_embeddings = max_position_embeddings
-        self.max_seq_position_embeddings = max_seq_position_embeddings
-        self.add_position_ids = add_position_ids
-
-        self.troubleshoot = troubleshoot
-
-    def __len__(self):
-        # meta dataframe has one row for each MSA cluster
-        return len(self.dataset_meta)
-
-    def __getitem__(self, idx):
-        # get all the sequences in the cluster
-        sequences = self.get_sequences(idx)
-        # get total number of sequences in the cluster and choose how many to sample
-        orig_num_sequences = len(self.get_index_start_of_sequences(sequences))
-        num_sequences = np.random.randint(1, orig_num_sequences + 1) if self.sample else orig_num_sequences
-        # sample the sequences
-        sequences, position_ids = self.sample_sequences(sequences, num_sequences)
-        # with probability 0.5, reverse the sequences and move the last token to the front
-        sequences, position_ids = self.reverse_sequences(sequences, position_ids) if (
-                self.reverse and np.random.rand() > 0.5) else sequences, position_ids
-        # limit the length of the MSA
-        sequences = sequences[:self.max_msa_len] if self.max_msa_len > 0 else sequences
-        if self.add_position_ids != "none":
-            position_ids = position_ids[:self.max_msa_len] if self.max_msa_len > 0 else position_ids
-        # convert to tensor
-        sequences = torch.asarray(sequences, dtype=torch.int64)
-        position_ids = torch.asarray(position_ids, dtype=torch.int64).clamp(0,
-                self.max_position_embeddings - 1) if self.add_position_ids != "none" else None
-
-        if self.troubleshoot:
-            print(
-                f"Cluster {idx} has {orig_num_sequences} sequences, of which {num_sequences} sampled now. Total MSA length: {len(sequences)}")
-        if self.add_position_ids == "1d":
-            return dict(input_ids=sequences, position_ids=position_ids, labels=sequences)
-        if self.add_position_ids == "2d":
-            seq_position_ids = (sequences == AA_TO_ID["<cls>"]).int().cumsum(-1).clamp(0,
-                    self.max_seq_position_embeddings - 1).contiguous()
-            return dict(input_ids=sequences, position_ids=position_ids, seq_position_ids=seq_position_ids,
-                        labels=sequences)
-        return dict(input_ids=sequences, labels=sequences)
-
-    def get_msa_id(self, idx):
-        """Get the MSA ID in the cluster with index `idx`."""
-        cluster_meta = self.dataset_meta.iloc[idx]
-        return cluster_meta.msa_id
-
-    def get_idx_from_msa_id(self, msa_id):
-        """Get `idx` with the MSA ID"""
-        return self.dataset_meta[self.dataset_meta.msa_id == msa_id].index[0]
-
-    def get_sequences(self, idx):
-        """Get the sequences in the cluster with index `idx`."""
-        cluster_meta = self.dataset_meta.iloc[idx]
-        sequences = self.dataset[cluster_meta.Start : cluster_meta.End]
-        return sequences
-
-    def get_index_start_of_sequences(self, sequences):
-        """Get the positions of the start of each sequence in the cluster."""
-        return np.where(sequences == 0)[0]
-
-    def reverse_sequences(self, sequence, position_ids=None):
-        """Reverse the sequences and move the last token to the front."""
-        sequence = sequence[::-1]
-        if position_ids is not None:
-            position_ids = position_ids[::-1]
-        return np.concatenate([sequence[-1:], sequence[:-1]]), np.concatenate(
-            [position_ids[-1:], position_ids[:-1]]) if position_ids is not None else None
-
-    def sample_sequences(self, sequences, num_sequences, shuffle=True):
-        """Sample `num_sequences` from the sequences in the cluster."""
-        L = len(sequences)
-        # get the indexes of the start of each sequence
-        inds = self.get_index_start_of_sequences(sequences)
-        # check that there are sequences in the cluster and that there are enough of them
-        assert len(inds) > 0, "No sequences found in cluster."
-        assert len(inds) >= num_sequences, "Not enough sequences in cluster."
-        # sample n_sequences randomly from the sequences
-        if shuffle:
-            which_seqs = np.random.choice(np.arange(len(inds)), num_sequences, replace=False)
-        else:
-            which_seqs = np.arange(len(inds))[-num_sequences:]
-        # get the tuples of start and end indexes of the sequences
-        tuples = [(inds[i], inds[i + 1]) if i < len(inds) - 1 else (inds[i], L) for i in which_seqs]
-        if self.troubleshoot:
-            print(f"Sampled sequences: {tuples}")
-        # concatenate the sequences
-        sequences, position_ids = self.fim.apply(sequences, tuples)
-        return sequences, position_ids
-
-
-
-def make_dataloader(dataset):
-    """Basic function to make a dataloader.
-    """
-    dataloader = DataLoader(dataset)
-    return dataloader
-
-
-class ProteinDataCollator(object):
-    """
-    Collate examples into a batch, and pad batch to a specified maximum sequence length,
-    or to the longest sequence in the batch if max_sequence_length is None.
-    """
-    def __init__(self, max_sequence_length: Optional[int] = None):
-        """
-        Initialize the collator with an optional max_sequence_length.
-
-        Args:
-            max_sequence_length (Optional[int]): The maximum sequence length to pad/truncate to.
-                If None, pad to the longest sequence in the batch.
-        """
-        self.max_sequence_length = max_sequence_length
-
-    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
-
-        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "input_ids"))
-
-        longest_seq = max(len(seq) for seq in input_ids)
-        if self.max_sequence_length is None:
-            max_len = longest_seq
-        else:
-            max_len = self.max_sequence_length
-
-        input_ids = self.pad_sequences(input_ids, max_len, padding_value=AA_TO_ID["<pad>"])
-
-        labels = self.pad_sequences(labels, longest_seq, padding_value=AA_TO_ID["<pad>"])
-        labels = self.pad_sequences(labels, max_len, padding_value=-100)
-
-        return_dict = dict(
-            input_ids=input_ids,
-            labels=labels,
-            attention_mask=input_ids.ne(AA_TO_ID["<pad>"])
-        )
-
-        if "position_ids" in instances[0]:
-
-            position_ids = [instance["position_ids"] for instance in instances]
-            position_ids = self.pad_sequences(position_ids, max_len, padding_value=0)
-            return_dict["position_ids"] = position_ids
-
-        if "seq_position_ids" in instances[0]:
-            seq_position_ids = [instance["seq_position_ids"] for instance in instances]
-            seq_position_ids = self.pad_sequences(seq_position_ids, max_len, padding_value=0)
-            return_dict["seq_position_ids"] = seq_position_ids
-
-        return return_dict
-
-    def pad_sequences(self, seqs, max_length, padding_value):
-        # truncate long sequences (redundant, already done in __getitem__, maybe safe to remove)
-        seqs = [seq[:max_length] for seq in seqs]
-
-        # pad to same length
-        seqs = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=padding_value)
-
-        # pad to max length
-        padding = max_length - seqs.size(1)
-        seqs = torch.nn.functional.pad(seqs, (0, padding), value=padding_value)
-
-        return seqs
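A minimal sketch of wiring the deleted dataset and collator together (paths assume the memmap built by protxlstm/data.py above):

    from torch.utils.data import DataLoader
    from protxlstm.dataloaders import ProteinMemmapDataset, ProteinDataCollator

    dataset = ProteinMemmapDataset(
        msa_memmap_path="data/open_protein_set_memmap.dat",
        msa_memmap_meta_path="data/open_protein_set_memmap_indices.csv",
        max_msa_len=2048,
        fim_strategy="multiple_span",
        add_position_ids="1d",
    )
    loader = DataLoader(dataset, batch_size=2,
                        collate_fn=ProteinDataCollator(max_sequence_length=2048))
    batch = next(iter(loader))
    print(batch["input_ids"].shape, batch["position_ids"].shape)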
protxlstm/fim.py
DELETED
@@ -1,203 +0,0 @@
|
|
1 |
-
|
2 |
-
# Original code from ProtMamba under Apache License 2.0.
|
3 |
-
|
4 |
-
from protxlstm.utils import MASK_TO_ID, AA_TO_ID
|
5 |
-
import numpy as np
|
6 |
-
|
7 |
-
class AbstractFIM(object):
|
8 |
-
def __init__(self,
|
9 |
-
max_patches=5,
|
10 |
-
mask_fraction=0.2,
|
11 |
-
always_mask=False,
|
12 |
-
mask_tokens=MASK_TO_ID,
|
13 |
-
eos_token=AA_TO_ID["<eos>"],
|
14 |
-
add_position_ids=False,
|
15 |
-
troubleshoot=False):
|
16 |
-
"""
|
17 |
-
This class is designed to concatenate sequences based on different scrambling strategies.
|
18 |
-
It takes a list of sequences, tuples indicating the start and end indices of each sequence,
|
19 |
-
an optional number of patches to sample, and a scrambling strategy as inputs.
|
20 |
-
"""
|
21 |
-
self.troubleshoot = troubleshoot
|
22 |
-
self.max_patches = max_patches
|
23 |
-
self.mask_fraction = mask_fraction
|
24 |
-
self.mask_tokens = mask_tokens
|
25 |
-
assert len(
|
26 |
-
self.mask_tokens) >= self.max_patches, "Number of mask tokens must be bigger than max number of patches."
|
27 |
-
self.eos_token = eos_token
|
28 |
-
self.add_position_ids = add_position_ids
|
29 |
-
self.always_mask = always_mask
|
30 |
-
|
31 |
-
def apply(self, sequences, tuples):
|
32 |
-
"""
|
33 |
-
This function concatenates the sequences scrambling each one according to the scrambling strategy.
|
34 |
-
"""
|
35 |
-
input_ids, position_ids = [], []
|
36 |
-
for t in tuples:
|
37 |
-
seq, pos = self.fim(sequences, t)
|
38 |
-
input_ids.extend(seq)
|
39 |
-
if self.add_position_ids:
|
40 |
-
position_ids.extend(pos)
|
41 |
-
if self.add_position_ids:
|
42 |
-
return input_ids, position_ids
|
43 |
-
return input_ids, None
|
44 |
-
|
45 |
-
def fim(self, sequences, t):
|
46 |
-
"""
|
47 |
-
This function concatenates the sequence's parts based on the scrambling strategy.
|
48 |
-
"""
|
49 |
-
raise NotImplementedError
|
50 |
-
|
51 |
-
|
52 |
-
class NoFIM(AbstractFIM):
|
53 |
-
def __init__(self,
|
54 |
-
max_patches=5,
|
55 |
-
mask_fraction=0.2,
|
56 |
-
always_mask=False,
|
57 |
-
mask_tokens=MASK_TO_ID,
|
58 |
-
eos_token=AA_TO_ID["<eos>"],
|
59 |
-
add_position_ids=False,
|
60 |
-
troubleshoot=False):
|
61 |
-
super().__init__(max_patches, mask_fraction, always_mask, mask_tokens, eos_token, add_position_ids, troubleshoot)
|
62 |
-
|
63 |
-
def fim(self, sequences, t):
|
64 |
-
"""
|
65 |
-
This function keeps the sequence identical without any scrambling.
|
66 |
-
"""
|
67 |
-
if self.add_position_ids:
|
68 |
-
position_ids = np.arange(t[0], t[1]) - t[0]
|
69 |
-
return sequences[t[0]:t[1]], position_ids
|
70 |
-
return sequences[t[0]:t[1]], None
|
71 |
-
|
72 |
-
|
73 |
-
class SingleSpanFIM(AbstractFIM):
|
74 |
-
|
75 |
-
def __init__(self,
|
76 |
-
max_patches=5,
|
77 |
-
mask_fraction=0.2,
|
78 |
-
always_mask=False,
|
79 |
-
mask_tokens=MASK_TO_ID,
|
80 |
-
eos_token=AA_TO_ID["<eos>"],
|
81 |
-
add_position_ids=False,
|
82 |
-
troubleshoot=False):
|
83 |
-
super().__init__(max_patches, mask_fraction, always_mask, mask_tokens, eos_token, add_position_ids, troubleshoot)
|
84 |
-
|
85 |
-
def fim(self, sequences, t):
|
86 |
-
"""
|
87 |
-
This function creates and concatenates parts of the sequences based on the OpenAI scrambling strategy.
|
88 |
-
It randomly selects two indices within the range of the given tuple,
|
89 |
-
splits the sequence into three parts based on these indices, and then concatenates them with the
|
90 |
-
masked patch at the end
|
91 |
-
"""
|
92 |
-
new_tuple = tuple(np.sort(np.random.choice(np.arange(t[0] + 1, t[1]), 2, replace=False)))
|
93 |
-
part1 = sequences[t[0]:new_tuple[0]]
|
94 |
-
part2 = sequences[new_tuple[0]:new_tuple[1]]
|
95 |
-
part3 = sequences[new_tuple[1]:t[1]]
|
96 |
-
sequence = np.concatenate([part1, [self.mask_tokens["<mask-1>"]], part3, [self.mask_tokens["<mask-1>"]], part2])
|
97 |
-
position_ids_sequence = None
|
98 |
-
if self.add_position_ids:
|
99 |
-
position_ids = np.arange(t[0], t[1]) - t[0]
|
100 |
-
position_ids_part1 = position_ids[t[0]:new_tuple[0]]
|
101 |
-
position_ids_part2 = position_ids[new_tuple[0]:new_tuple[1]]
|
102 |
-
position_ids_part3 = position_ids[new_tuple[1]:t[1]]
|
103 |
-
position_ids_sequence = np.concatenate(
|
104 |
-
[position_ids_part1, [position_ids_part2[0]], position_ids_part3, [position_ids_part2[0]],
|
105 |
-
position_ids_part2])
|
106 |
-
|
107 |
-
return sequence, position_ids_sequence
|
108 |
-
|
109 |
-
|
110 |
-
class MultipleSpanFIM(AbstractFIM):
|
111 |
-
def __init__(self,
|
112 |
-
max_patches=5,
|
                 mask_fraction=0.2,
                 always_mask=False,
                 mask_tokens=MASK_TO_ID,
                 eos_token=AA_TO_ID["<eos>"],
                 add_position_ids=False,
                 troubleshoot=False):
        super().__init__(max_patches, mask_fraction, always_mask, mask_tokens, eos_token, add_position_ids, troubleshoot)

    def fim(self, sequences, t):
        """
        This function creates and concatenates parts of the sequences based on the inpaint scrambling strategy.
        It randomly selects `2*num_patches` indices within the range of the given tuple,
        splits the sequence into unmasked and masked parts based on these indices, and then concatenates them.
        The number of patches is sampled from a Poisson distribution with upper limit `self.max_patches` and average 1.
        The concatenation is done by joining all unmasked parts (interleaved with mask tokens) and afterwards
        all masked parts (interleaved with mask tokens). At the end of the unmasked parts, a special token is added
        to indicate the end of the unmasked parts, and at the end of the masked parts, a special token is added
        to indicate the end of the masked parts.
        """
        def sample_lengths(start, end):
            """
            Sample a length uniformly from 1 to max_L*self.mask_fraction (must be bigger than 1).
            If the length is larger than max_L, return max_L.
            """
            max_L = end - start
            length = np.random.randint(1, max(int(max_L * self.mask_fraction), 2))
            return min(length, max_L)

        # sample num_patches from a discrete Poisson distribution with upper limit max_patches
        num_patches = 1000
        while num_patches > self.max_patches:
            num_patches = np.random.poisson(1)
        if self.always_mask:
            num_patches = max(num_patches, 1)
        # sample num_patches starting points for the masked positions (+ final position)
        start_patches = list(np.sort(np.random.choice(np.arange(t[0] + 1, t[1]),
                                                      num_patches,
                                                      replace=False))) + [t[1]]
        # sample num_patches lengths of the patches
        len_patches = [sample_lengths(start_patches[i], start_patches[i + 1])
                       for i in range(len(start_patches) - 1)]
        # create masked tuples with start and end indices of the patches
        masked_tuples = [(start_patches[i], start_patches[i] + len_patches[i]) for i in range(len(start_patches) - 1)]
        # split the sequences into unmasked and masked parts
        unmasked_sequence, masked_sequence, unmasked_position_ids, masked_position_ids = self.split_sequences(sequences,
                                                                                                              t,
                                                                                                              masked_tuples)

        if self.troubleshoot:
            print(f"For sequence in {t}: sampled {num_patches=}, {start_patches=}, {len_patches=}, {masked_tuples=}")
        # concatenate the unmasked and masked parts
        return unmasked_sequence + masked_sequence, unmasked_position_ids + masked_position_ids if self.add_position_ids else None

    def split_sequences(self, sequences, t, masked_tuples):
        """
        This function splits the sequences into unmasked and masked parts based on the given tuples.
        Args:
            t (tuple): The start and end index of each sequence.
            masked_tuples (list): A list of tuples specifying the indices for masked regions.
        Returns:
            unmasked_parts (list): The unmasked parts of the sequences interleaved with mask_tokens.
            masked_parts (list): The masked parts of the sequences interleaved with mask_tokens.
        """
        unmasked_parts, masked_parts = [], []
        unmasked_positions, masked_positions = [], []
        position_ids = None
        start, end = t
        if self.add_position_ids:
            position_ids = np.arange(start, end) - start
        for i, region in enumerate(masked_tuples):
            mask_token = self.mask_tokens[f"<mask-{i + 1}>"]
            unmasked_parts.extend(sequences[start:region[0]])
            unmasked_parts.append(mask_token)
            masked_parts.append(mask_token)
            masked_parts.extend(sequences[region[0]:region[1]])
            if self.add_position_ids:
                unmasked_positions.extend(position_ids[start-t[0]:region[0]-t[0]])
                unmasked_positions.append(position_ids[region[0]-t[0]])
                masked_positions.append(position_ids[region[0]-t[0]])
                masked_positions.extend(position_ids[region[0]-t[0]:region[1]-t[0]])

            start = region[1]
        unmasked_parts.extend(sequences[start:end])
        if self.add_position_ids:
            unmasked_positions.extend(position_ids[start-t[0]:end-t[0]])
        if len(masked_tuples) > 0:
            unmasked_parts.append(self.eos_token)
            if self.add_position_ids:
                unmasked_positions.append(0)
        return unmasked_parts, masked_parts, unmasked_positions, masked_positions
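The inpainting rearrangement above is easiest to see on a concrete toy input. The following standalone sketch is not part of the repository: the special-token ids and the patch positions are invented for illustration, whereas `fim` above samples them via a Poisson/uniform scheme and takes the token ids from MASK_TO_ID and AA_TO_ID.

# Toy rendition of the FIM/inpainting rearrangement performed by split_sequences.
MASK_1, MASK_2, EOS = 101, 102, 100  # made-up special-token ids

def inpaint_concat(seq, masked_tuples):
    """Emit unmasked parts interleaved with mask tokens, then <eos>,
    then each masked part behind its mask token."""
    unmasked, masked = [], []
    start = 0
    for i, (a, b) in enumerate(masked_tuples):
        mask_token = (MASK_1, MASK_2)[i]
        unmasked.extend(seq[start:a])      # visible context up to the patch
        unmasked.append(mask_token)        # placeholder where the patch was cut
        masked.append(mask_token)          # patch content, announced by its token
        masked.extend(seq[a:b])
        start = b
    unmasked.extend(seq[start:])
    unmasked.append(EOS)                   # end of the visible part
    return unmasked + masked

print(inpaint_concat(list(range(10)), [(2, 4), (7, 8)]))
# [0, 1, 101, 4, 5, 6, 102, 8, 9, 100, 101, 2, 3, 102, 7]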
protxlstm/index.html
DELETED
@@ -1,16 +0,0 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
 <head>
  <title>Index of /research/Bio-xLSTM/downloads/Prot-xLSTM/checkpoints/protxlstm_26M_30B</title>
 </head>
 <body>
<h1>Index of /research/Bio-xLSTM/downloads/Prot-xLSTM/checkpoints/protxlstm_26M_30B</h1>
<pre><img src="/icons/blank.gif" alt="Icon "> <a href="?C=N;O=D">Name</a> <a href="?C=M;O=A">Last modified</a> <a href="?C=S;O=A">Size</a> <a href="?C=D;O=A">Description</a><hr><img src="/icons/back.gif" alt="[PARENTDIR]"> <a href="/research/Bio-xLSTM/downloads/Prot-xLSTM/checkpoints/">Parent Directory</a> -
<img src="/icons/unknown.gif" alt="[ ]"> <a href="config.json">config.json</a> 2024-11-04 14:36 1.8K
<img src="/icons/unknown.gif" alt="[ ]"> <a href="optimizer.pt">optimizer.pt</a> 2024-11-04 14:36 198M
<img src="/icons/binary.gif" alt="[ ]"> <a href="pytorch_model.bin">pytorch_model.bin</a> 2024-11-04 14:36 99M
<img src="/icons/unknown.gif" alt="[ ]"> <a href="rng_state.pth">rng_state.pth</a> 2024-11-04 14:36 14K
<img src="/icons/unknown.gif" alt="[ ]"> <a href="scheduler.pt">scheduler.pt</a> 2024-11-04 14:36 1.0K
<img src="/icons/unknown.gif" alt="[ ]"> <a href="trainer_state.json">trainer_state.json</a> 2024-11-04 14:36 2.4M
<hr></pre>
</body></html>
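The deleted listing above documents which files make up the protxlstm_26M_30B checkpoint (configuration, weights, optimizer and trainer state). For orientation only, here is a minimal sketch of inspecting such a directory with plain PyTorch, assuming it has been downloaded to a hypothetical local path; the repository itself wraps loading in protxlstm.utils.load_model (used in protxlstm/train.py below).

import json
import torch

ckpt_dir = "checkpoints/protxlstm_26M_30B"  # hypothetical local download path
with open(f"{ckpt_dir}/config.json") as f:
    config = json.load(f)                   # model hyperparameters
state_dict = torch.load(f"{ckpt_dir}/pytorch_model.bin", map_location="cpu")
print(sum(p.numel() for p in state_dict.values()))  # parameter count, ~26M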
protxlstm/models/llama.py
DELETED
@@ -1,342 +0,0 @@
import json
import math
import os
from collections import namedtuple
from typing import Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import PretrainedConfig

from protxlstm.xlstm.components.rotary_position import compute_freqs_cis

# Note: generation capabilities are not implemented for the transformer

class TransformerConfig(PretrainedConfig):

    model_type = "llama"

    def __init__(
        self,
        d_model,
        n_layer,
        n_heads,
        n_kv_heads,
        bidirectional,
        vocab_size,
        hidden_dim,
        multiple_of,  # MLP hidden layer size will be multiple of
        norm_eps,
        max_length,
        dropout,
        max_position_embeddings,
        rope_base_frequency,
        **kwargs
    ):
        super().__init__(**kwargs)

        # default hyperparameters for the Llama 7B model
        self.dim = d_model
        self.n_layers = n_layer
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.causal_attention = not bidirectional
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.multiple_of = multiple_of
        self.norm_eps = norm_eps
        self.max_seq_len = max_length
        self.dropout = dropout
        self.max_position_embeddings = max_position_embeddings
        self.rope_base_frequency = rope_base_frequency

class RMSNorm_transformer(torch.nn.Module):
    def __init__(self, dim: int, eps: float):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float()).type_as(x)
        return output * self.weight

def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
    t = torch.arange(end, device=freqs.device)  # type: ignore
    freqs = torch.outer(t, freqs).float()  # type: ignore
    freqs_cos = torch.cos(freqs)  # real part
    freqs_sin = torch.sin(freqs)  # imaginary part
    return freqs_cos, freqs_sin

def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
    ndim = x.ndim
    assert 0 <= 1 < ndim
    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
    return freqs_cis.view(shape)

def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cos: torch.Tensor,
    freqs_sin: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:

    # reshape xq and xk to match the complex representation
    xq_r, xq_i = xq.float().reshape(xq.shape[:-1] + (-1, 2)).unbind(-1)
    xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1)

    # reshape freqs_cos and freqs_sin for broadcasting
    freqs_cos = reshape_for_broadcast(freqs_cos, xq_r)
    freqs_sin = reshape_for_broadcast(freqs_sin, xq_r)

    # apply rotation using real numbers
    xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin
    xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos
    xk_out_r = xk_r * freqs_cos - xk_i * freqs_sin
    xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos

    # flatten last two dimensions
    xq_out = torch.stack([xq_out_r, xq_out_i], dim=-1).flatten(3)
    xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3)

    return xq_out.type_as(xq), xk_out.type_as(xk)

def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
    bs, slen, n_kv_heads, head_dim = x.shape
    if n_rep == 1:
        return x
    return (
        x[:, :, :, None, :]
        .expand(bs, slen, n_kv_heads, n_rep, head_dim)
        .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
    )

class Attention(nn.Module):
    def __init__(self, args: TransformerConfig):
        super().__init__()
        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
        assert args.n_heads % self.n_kv_heads == 0
        model_parallel_size = 1
        self.n_local_heads = args.n_heads // model_parallel_size
        self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
        self.n_rep = self.n_local_heads // self.n_local_kv_heads
        self.head_dim = args.dim // args.n_heads
        self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False)
        self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False)
        self.attn_dropout = nn.Dropout(args.dropout)
        self.resid_dropout = nn.Dropout(args.dropout)
        self.dropout = args.dropout
        self.causal_attention = args.causal_attention

        # use flash attention or a manual implementation?
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash and self.causal_attention:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            mask = torch.full((1, 1, args.max_seq_len, args.max_seq_len), float("-inf"))
            mask = torch.triu(mask, diagonal=1)
            self.register_buffer("mask", mask)

    def forward(
        self,
        x: torch.Tensor,
        freqs_cos: torch.Tensor,
        freqs_sin: torch.Tensor,
    ):
        bsz, seqlen, _ = x.shape

        # QKV
        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
        xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
        xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)

        # RoPE relative positional embeddings
        xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)

        # grouped multiquery attention: expand out keys and values
        xk = repeat_kv(xk, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
        xv = repeat_kv(xv, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)

        # make heads into a batch dimension
        xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
        xk = xk.transpose(1, 2)
        xv = xv.transpose(1, 2)

        # flash implementation
        if self.flash:
            output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None, dropout_p=self.dropout if self.training else 0.0, is_causal=self.causal_attention)
        else:
            # manual implementation
            scores = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim)
            if self.causal_attention:
                scores = scores + self.mask[:, :, :seqlen, :seqlen]  # (bs, n_local_heads, seqlen, cache_len + seqlen)
            scores = F.softmax(scores.float(), dim=-1).type_as(xq)
            scores = self.attn_dropout(scores)
            output = torch.matmul(scores, xv)  # (bs, n_local_heads, seqlen, head_dim)

        # restore time as batch dimension and concat heads
        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)

        # final projection into the residual stream
        output = self.wo(output)
        output = self.resid_dropout(output)
        return output

class FeedForward(nn.Module):
    def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float):
        super().__init__()
        if hidden_dim is None:
            hidden_dim = 4 * dim
            hidden_dim = int(2 * hidden_dim / 3)
            hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.dropout(self.w2(F.silu(self.w1(x)) * self.w3(x)))

class TransformerBlock(nn.Module):
    def __init__(self, layer_id: int, args: TransformerConfig):
        super().__init__()
        self.n_heads = args.n_heads
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads
        self.attention = Attention(args)
        self.feed_forward = FeedForward(
            dim=args.dim,
            hidden_dim=args.hidden_dim,
            multiple_of=args.multiple_of,
            dropout=args.dropout,
        )
        self.layer_id = layer_id
        self.attention_norm = RMSNorm_transformer(args.dim, eps=args.norm_eps)
        self.ffn_norm = RMSNorm_transformer(args.dim, eps=args.norm_eps)

    def forward(self, x, freqs_cos, freqs_sin):
        h = x + self.attention.forward(self.attention_norm(x), freqs_cos, freqs_sin)
        out = h + self.feed_forward.forward(self.ffn_norm(h))
        return out

class Transformer(nn.Module):

    last_loss: Optional[torch.Tensor]

    def __init__(self, params: TransformerConfig):
        super().__init__()
        self.params = params
        self.vocab_size = params.vocab_size
        self.n_layers = params.n_layers

        self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
        self.dropout = nn.Dropout(params.dropout)
        self.layers = torch.nn.ModuleList()
        for layer_id in range(params.n_layers):
            self.layers.append(TransformerBlock(layer_id, params))
        self.layer_head_dim = self.layers[0].head_dim

        self.norm = RMSNorm_transformer(params.dim, eps=params.norm_eps)
        self.output = nn.Linear(params.dim, params.vocab_size, bias=False)

        # share the unembedding parameters with the embedding parameters
        self.tok_embeddings.weight = self.output.weight  # https://paperswithcode.com/method/weight-tying

        # some useful precompute for the RoPE relative positional embeddings
        # freqs_cos, freqs_sin = precompute_freqs_cis(self.params.dim // self.params.n_heads, self.params.max_seq_len)
        # self.register_buffer("freqs_cos", freqs_cos, persistent=False)
        # self.register_buffer("freqs_sin", freqs_sin, persistent=False)

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('w3.weight') or pn.endswith('wo.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * params.n_layers))

        # Initialize attribute for the loss of the last forward call. This will be set if the forward is called with a targets tensor.
        self.last_loss = None

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, tokens: torch.Tensor, targets: Optional[torch.Tensor] = None, **kwargs) -> torch.Tensor:
        _bsz, seqlen = tokens.shape
        h = self.tok_embeddings(tokens)
        h = self.dropout(h)
        # freqs_cos = self.freqs_cos[:seqlen]
        # freqs_sin = self.freqs_sin[:seqlen]

        if 'position_ids' in kwargs:
            freqs_cos, freqs_sin = compute_freqs_cis(kwargs.pop("position_ids"), self.layer_head_dim, theta=self.params.rope_base_frequency)
        else:
            raise ValueError('Llama model only implemented with RoPEs')

        freqs_cos = freqs_cos.squeeze()
        freqs_sin = freqs_sin.squeeze()

        for layer in self.layers:
            h = layer(h, freqs_cos, freqs_sin)
        h = self.norm(h)

        if targets is not None:
            logits = self.output(h)
            self.last_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            logits = self.output(h)
            self.last_loss = None

        return logits

class TransformerLMHeadModel(nn.Module):

    def __init__(
        self,
        config: TransformerConfig,
    ) -> None:

        super().__init__()

        self.config = config

        self.backbone = Transformer(config)

    def forward(self, input_ids, position_ids=None, inference_params=None, num_last_tokens=0):
        """
        num_last_tokens: if > 0, only return the logits for the last n tokens
        """

        lm_logits = self.backbone(input_ids, position_ids=position_ids)

        CausalLMOutput = namedtuple("CausalLMOutput", ["loss", "logits"])
        return CausalLMOutput(loss=None, logits=lm_logits)

    def save_pretrained(self, save_directory):
        """
        Save the model and its configuration file to a directory.
        """

        # Ensure save_directory exists
        os.makedirs(save_directory, exist_ok=True)

        # Save the model's state_dict
        model_path = os.path.join(save_directory, "pytorch_model.bin")
        torch.save(self.state_dict(), model_path)

        # Save the configuration of the model
        config_path = os.path.join(save_directory, "config.json")
        with open(config_path, "w") as f:
            json.dump(self.config.to_dict(), f)
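For reference, the deleted Transformer stack can be smoke-tested end to end at the parent revision of this commit. The sketch below uses invented toy hyperparameters (not a configuration from the paper) and assumes that protxlstm's `compute_freqs_cis` returns per-position cos/sin tables compatible with `apply_rotary_emb`, i.e. shape (seq_len, head_dim // 2) after the squeeze.

import torch
from protxlstm.models.llama import TransformerConfig, TransformerLMHeadModel

# Toy sizes, chosen only so the shapes work out.
config = TransformerConfig(
    d_model=64, n_layer=2, n_heads=4, n_kv_heads=2, bidirectional=False,
    vocab_size=40, hidden_dim=128, multiple_of=8, norm_eps=1e-5,
    max_length=128, dropout=0.0, max_position_embeddings=128,
    rope_base_frequency=10000,
)
model = TransformerLMHeadModel(config)
input_ids = torch.randint(0, 40, (1, 16))
position_ids = torch.arange(16).unsqueeze(0)  # forward raises without RoPE positions
out = model(input_ids, position_ids=position_ids)
print(out.logits.shape)  # expected: torch.Size([1, 16, 40])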
protxlstm/models/mamba.py
DELETED
@@ -1,833 +0,0 @@
# Original code from ProtMamba under Apache License 2.0.

import json
import os
from collections import namedtuple
from dataclasses import dataclass, field
from functools import partial

from mamba_ssm.models.config_mamba import MambaConfig
from mamba_ssm.modules.mamba_simple import Block, Mamba
from mamba_ssm.models.mixer_seq_simple import MixerModel, _init_weights
from mamba_ssm.ops.triton.layernorm import RMSNorm, layer_norm_fn, rms_norm_fn
from mamba_ssm.utils.hf import load_config_hf, load_state_dict_hf
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint
from transformers import PretrainedConfig

from protxlstm.generation import GenerationMixinSafe

@dataclass
class MambaConfig(PretrainedConfig):
    d_model: int = 2560
    n_layer: int = 64
    vocab_size: int = 50277
    ssm_cfg: dict = field(default_factory=dict)
    rms_norm: bool = True
    residual_in_fp32: bool = True
    fused_add_norm: bool = True
    pad_vocab_size_multiple: int = 8
    max_position_embeddings: int = 2048

def create_block(
    d_model,
    ssm_cfg=None,
    norm_epsilon=1e-5,
    rms_norm=False,
    residual_in_fp32=False,
    fused_add_norm=False,
    layer_idx=None,
    device=None,
    dtype=None,
    checkpoint_mixer=False,
):
    if ssm_cfg is None:
        ssm_cfg = {}
    factory_kwargs = {"device": device, "dtype": dtype}
    mixer_cls = partial(Mamba, layer_idx=layer_idx, **ssm_cfg, **factory_kwargs)
    norm_cls = partial(
        nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon, **factory_kwargs
    )
    block = Block(
        d_model,
        mixer_cls,
        norm_cls=norm_cls,
        fused_add_norm=fused_add_norm,
        residual_in_fp32=residual_in_fp32,
    )
    block.layer_idx = layer_idx
    if checkpoint_mixer:
        block.mixer = CheckpointedModule(block.mixer)
    return block

class CheckpointedModule(torch.nn.Module):
    def __init__(self, layer):
        super().__init__()
        self.ckpt_layer = layer

    def forward(self, x, *args, **kwargs):
        return checkpoint(self.ckpt_layer, x, use_reentrant=False)

    # def state_dict(self, **kwargs):
    #     # Get the state dict of the underlying layer
    #     layer_state_dict = self.ckpt_layer.state_dict(**kwargs)
    #     # Create a new state dict with the original keys
    #     state_dict = {k.replace('ckpt_layer.', ''): v for k, v in layer_state_dict.items()}
    #     return state_dict

class MixerModelSafe(MixerModel):
    """
    Overwrite the forward method to allow saving intermediate layers.
    """

    def forward(self, input_ids, inference_params=None, save_layer=[]):
        hidden_states = self.embedding(input_ids)
        residual = None
        if len(save_layer) > 0:
            hidden_states_dict = {}
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states, residual, inference_params=inference_params
            )
            if i + 1 in save_layer:
                hidden_states_dict[i + 1] = (
                    hidden_states.detach().cpu().to(torch.float).numpy()
                )
        if len(save_layer) > 0:
            return hidden_states_dict

        if not self.fused_add_norm:
            residual = (
                (hidden_states + residual) if residual is not None else hidden_states
            )
            hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
        else:
            # Set prenorm=False here since we don't need the residual
            fused_add_norm_fn = (
                rms_norm_fn if isinstance(self.norm_f, RMSNorm) else layer_norm_fn
            )
            hidden_states = fused_add_norm_fn(
                hidden_states,
                self.norm_f.weight,
                self.norm_f.bias,
                eps=self.norm_f.eps,
                residual=residual,
                prenorm=False,
                residual_in_fp32=self.residual_in_fp32,
            )
        return hidden_states

class MixerModelWithPosids(nn.Module):
    r"""Mixer model for Mamba but we add positional encodings to the input embeddings."""

    def __init__(
        self,
        d_model: int,
        n_layer: int,
        vocab_size: int,
        max_position_embeddings: int,
        ssm_cfg=None,
        norm_epsilon: float = 1e-5,
        rms_norm: bool = False,
        initializer_cfg=None,
        fused_add_norm=False,
        residual_in_fp32=False,
        device=None,
        dtype=None,
        checkpoint_mixer=False,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.residual_in_fp32 = residual_in_fp32

        self.embedding = nn.Embedding(vocab_size, d_model // 2, **factory_kwargs)
        self.position_embedding = nn.Embedding(
            max_position_embeddings, d_model - d_model // 2, **factory_kwargs
        )

        # We change the order of residual and layer norm:
        # Instead of LN -> Attn / MLP -> Add, we do:
        # Add -> LN -> Attn / MLP / Mixer, returning both the residual branch (output of Add) and
        # the main branch (output of MLP / Mixer). The model definition is unchanged.
        # This is for performance reason: we can fuse add + layer_norm.
        self.fused_add_norm = fused_add_norm
        if self.fused_add_norm:
            if layer_norm_fn is None or rms_norm_fn is None:
                raise ImportError("Failed to import Triton LayerNorm / RMSNorm kernels")

        self.layers = nn.ModuleList(
            [
                create_block(
                    d_model,
                    ssm_cfg=ssm_cfg,
                    norm_epsilon=norm_epsilon,
                    rms_norm=rms_norm,
                    residual_in_fp32=residual_in_fp32,
                    fused_add_norm=fused_add_norm,
                    layer_idx=i,
                    checkpoint_mixer=checkpoint_mixer,
                    **factory_kwargs,
                )
                for i in range(n_layer)
            ]
        )

        self.norm_f = (nn.LayerNorm if not rms_norm else RMSNorm)(
            d_model, eps=norm_epsilon, **factory_kwargs
        )

        self.apply(
            partial(
                _init_weights,
                n_layer=n_layer,
                **(initializer_cfg if initializer_cfg is not None else {}),
            )
        )

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        return {
            i: layer.allocate_inference_cache(
                batch_size, max_seqlen, dtype=dtype, **kwargs
            )
            for i, layer in enumerate(self.layers)
        }

    def forward(self, input_ids, position_ids, inference_params=None, save_layer=[]):
        hidden_states = torch.cat(
            [
                self.embedding(input_ids),
                self.position_embedding(position_ids),
            ],
            -1,
        )
        residual = None
        if len(save_layer) > 0:
            hidden_states_dict = {}
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states, residual, inference_params=inference_params
            )
            if i + 1 in save_layer:
                hidden_states_dict[i + 1] = (
                    hidden_states.detach().cpu().to(torch.float).numpy()
                )
        if len(save_layer) > 0:
            return hidden_states_dict

        if not self.fused_add_norm:
            residual = (
                (hidden_states + residual) if residual is not None else hidden_states
            )
            hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
        else:
            fused_add_norm_fn = (
                rms_norm_fn if isinstance(self.norm_f, RMSNorm) else layer_norm_fn
            )
            hidden_states = fused_add_norm_fn(
                hidden_states,
                self.norm_f.weight,
                self.norm_f.bias,
                eps=self.norm_f.eps,
                residual=residual,
                prenorm=False,
                residual_in_fp32=self.residual_in_fp32,
            )
        return hidden_states

class MixerModelWith2DPosids(nn.Module):
    r"""Mixer model for Mamba but we add positional encodings to the input embeddings."""

    def __init__(
        self,
        d_model: int,
        n_layer: int,
        vocab_size: int,
        max_position_embeddings: int,
        max_sequence_position_embeddings: int = 512,
        ssm_cfg=None,
        norm_epsilon: float = 1e-5,
        rms_norm: bool = False,
        initializer_cfg=None,
        fused_add_norm=False,
        residual_in_fp32=False,
        device=None,
        dtype=None,
        checkpoint_mixer=False,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.residual_in_fp32 = residual_in_fp32

        self.embedding = nn.Embedding(
            vocab_size, d_model - 2 * d_model // 4, **factory_kwargs
        )
        self.position_embedding = nn.Embedding(
            max_position_embeddings, d_model // 4, **factory_kwargs
        )
        self.seq_position_embedding = nn.Embedding(
            max_sequence_position_embeddings, d_model // 4, **factory_kwargs
        )
        self.d_embeddings = d_model - 2 * d_model // 4

        # We change the order of residual and layer norm:
        # Instead of LN -> Attn / MLP -> Add, we do:
        # Add -> LN -> Attn / MLP / Mixer, returning both the residual branch (output of Add) and
        # the main branch (output of MLP / Mixer). The model definition is unchanged.
        # This is for performance reason: we can fuse add + layer_norm.
        self.fused_add_norm = fused_add_norm
        if self.fused_add_norm:
            if layer_norm_fn is None or rms_norm_fn is None:
                raise ImportError("Failed to import Triton LayerNorm / RMSNorm kernels")

        self.layers = nn.ModuleList(
            [
                create_block(
                    d_model,
                    ssm_cfg=ssm_cfg,
                    norm_epsilon=norm_epsilon,
                    rms_norm=rms_norm,
                    residual_in_fp32=residual_in_fp32,
                    fused_add_norm=fused_add_norm,
                    layer_idx=i,
                    checkpoint_mixer=checkpoint_mixer,
                    **factory_kwargs,
                )
                for i in range(n_layer)
            ]
        )

        self.norm_f = (nn.LayerNorm if not rms_norm else RMSNorm)(
            d_model, eps=norm_epsilon, **factory_kwargs
        )

        self.apply(
            partial(
                _init_weights,
                n_layer=n_layer,
                **(initializer_cfg if initializer_cfg is not None else {}),
            )
        )

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        return {
            i: layer.allocate_inference_cache(
                batch_size, max_seqlen, dtype=dtype, **kwargs
            )
            for i, layer in enumerate(self.layers)
        }

    def forward(
        self,
        input_ids,
        position_ids,
        seq_position_ids,
        inference_params=None,
        save_layer=[],
    ):
        hidden_states = torch.cat(
            [
                self.embedding(input_ids),
                self.position_embedding(position_ids),
                self.seq_position_embedding(seq_position_ids),
            ],
            -1,
        )
        residual = None
        if len(save_layer) > 0:
            hidden_states_dict = {}
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states, residual, inference_params=inference_params
            )
            if i + 1 in save_layer:
                hidden_states_dict[i + 1] = (
                    hidden_states.detach().cpu().to(torch.float).numpy()
                )
        if len(save_layer) > 0:
            return hidden_states_dict

        if not self.fused_add_norm:
            residual = (
                (hidden_states + residual) if residual is not None else hidden_states
            )
            hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
        else:
            fused_add_norm_fn = (
                rms_norm_fn if isinstance(self.norm_f, RMSNorm) else layer_norm_fn
            )
            hidden_states = fused_add_norm_fn(
                hidden_states,
                self.norm_f.weight,
                self.norm_f.bias,
                eps=self.norm_f.eps,
                residual=residual,
                prenorm=False,
                residual_in_fp32=self.residual_in_fp32,
            )
        return hidden_states

class MambaLMHeadModelSafe(nn.Module, GenerationMixinSafe):

    def __init__(
        self,
        config: MambaConfig,
        initializer_cfg=None,
        device=None,
        dtype=None,
        checkpoint_mixer=False,
    ) -> None:
        self.config = config
        d_model = config.d_model
        n_layer = config.n_layer
        vocab_size = config.vocab_size
        ssm_cfg = config.ssm_cfg
        rms_norm = config.rms_norm
        residual_in_fp32 = config.residual_in_fp32
        fused_add_norm = config.fused_add_norm
        pad_vocab_size_multiple = config.pad_vocab_size_multiple
        factory_kwargs = {"device": device, "dtype": dtype}
        if checkpoint_mixer:
            raise NotImplementedError(
                "Checkpointing is not yet supported for MambaLMHeadModelSafe"
            )

        super().__init__()
        if vocab_size % pad_vocab_size_multiple != 0:
            vocab_size += pad_vocab_size_multiple - (
                vocab_size % pad_vocab_size_multiple
            )
        self.backbone = MixerModelSafe(
            d_model=d_model,
            n_layer=n_layer,
            vocab_size=vocab_size,
            ssm_cfg=ssm_cfg,
            rms_norm=rms_norm,
            initializer_cfg=initializer_cfg,
            fused_add_norm=fused_add_norm,
            residual_in_fp32=residual_in_fp32,
            **factory_kwargs,
        )
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False, **factory_kwargs)

        # Initialize weights and apply final processing
        self.apply(
            partial(
                _init_weights,
                n_layer=n_layer,
                **(initializer_cfg if initializer_cfg is not None else {}),
            )
        )
        self.tie_weights()

    def tie_weights(self):
        self.lm_head.weight = self.backbone.embedding.weight

    def clip_grad_norm_(self, max_norm, norm_type=2.0):
        r"""Clip the norm of the gradients for the model.
        Args:
            max_norm (float or int): The maximum norm of the gradients.
                The gradients are modified in-place.
            norm_type (float or int): The type of the used p-norm. Can be 'inf' for infinity norm.
        Returns:
            Total norm of the parameters (viewed as a single vector).
        """
        return torch.nn.utils.clip_grad_value_(self.parameters(), max_norm)

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        return self.backbone.allocate_inference_cache(
            batch_size, max_seqlen, dtype=dtype, **kwargs
        )

    def forward(
        self,
        input_ids,
        position_ids=None,
        inference_params=None,
        num_last_tokens=0,
        save_layer=[],
        *args,
        **kwargs,
    ):
        """
        "position_ids" is just to be compatible with Transformer generation. We don't use it.
        num_last_tokens: if > 0, only return the logits for the last n tokens
        """
        return self.protected_forward(
            input_ids, position_ids, inference_params, num_last_tokens, save_layer
        )

    def protected_forward(
        self,
        input_ids,
        position_ids=None,
        inference_params=None,
        num_last_tokens=0,
        save_layer=[],
    ):
        hidden_states = self.backbone(
            input_ids, inference_params=inference_params, save_layer=save_layer
        )
        if len(save_layer) > 0:
            return hidden_states
        if num_last_tokens > 0:
            hidden_states = hidden_states[:, -num_last_tokens:]
        lm_logits = self.lm_head(hidden_states)
        CausalLMOutput = namedtuple("CausalLMOutput", ["loss", "logits"])
        return CausalLMOutput(loss=None, logits=lm_logits)

    @classmethod
    def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs):
        config_data = load_config_hf(pretrained_model_name)
        config = MambaConfig(**config_data)
        model = cls(config, device=device, dtype=dtype, **kwargs)
        model.load_state_dict(
            load_state_dict_hf(pretrained_model_name, device=device, dtype=dtype),
            strict=False,
        )
        return model

    def save_pretrained(self, save_directory):
        """
        Minimal implementation of save_pretrained for MambaLMHeadModel.
        Save the model and its configuration file to a directory.
        """
        # Ensure save_directory exists
        os.makedirs(save_directory, exist_ok=True)

        # Save the model's state_dict
        model_path = os.path.join(save_directory, "pytorch_model.bin")
        torch.save(self.state_dict(), model_path)

        # Save the configuration of the model
        config_path = os.path.join(save_directory, "config.json")
        with open(config_path, "w") as f:
            json.dump(self.config.__dict__, f)

class MambaLMHeadModelwithPosids(nn.Module, GenerationMixinSafe):

    def __init__(
        self,
        config: MambaConfig,
        initializer_cfg=None,
        device=None,
        dtype=None,
        checkpoint_mixer=False,
    ) -> None:
        self.config = config
        d_model = config.d_model
        n_layer = config.n_layer
        vocab_size = config.vocab_size
        max_position_embeddings = config.max_position_embeddings
        ssm_cfg = config.ssm_cfg
        rms_norm = config.rms_norm
        residual_in_fp32 = config.residual_in_fp32
        fused_add_norm = config.fused_add_norm
        pad_vocab_size_multiple = config.pad_vocab_size_multiple
        factory_kwargs = {"device": device, "dtype": dtype}

        super().__init__()
        if vocab_size % pad_vocab_size_multiple != 0:
            vocab_size += pad_vocab_size_multiple - (
                vocab_size % pad_vocab_size_multiple
            )
        self.backbone = MixerModelWithPosids(
            d_model=d_model,
            n_layer=n_layer,
            vocab_size=vocab_size,
            max_position_embeddings=max_position_embeddings,
            ssm_cfg=ssm_cfg,
            rms_norm=rms_norm,
            initializer_cfg=initializer_cfg,
            fused_add_norm=fused_add_norm,
            residual_in_fp32=residual_in_fp32,
            checkpoint_mixer=checkpoint_mixer,
            **factory_kwargs,
        )
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False, **factory_kwargs)

        # Initialize weights and apply final processing
        self.apply(
            partial(
                _init_weights,
                n_layer=n_layer,
                **(initializer_cfg if initializer_cfg is not None else {}),
            )
        )
        self.tie_weights()

    def tie_weights(self):
        self.lm_head.weight = self.backbone.embedding.weight

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        return self.backbone.allocate_inference_cache(
            batch_size, max_seqlen, dtype=dtype, **kwargs
        )

    def forward(
        self,
        input_ids,
        position_ids=None,
        inference_params=None,
        num_last_tokens=0,
        save_layer=[],
        *args,
        **kwargs,
    ):
        """
        "position_ids" is just to be compatible with Transformer generation. We don't use it.
        num_last_tokens: if > 0, only return the logits for the last n tokens
        """
        return self.protected_forward(
            input_ids, position_ids, inference_params, num_last_tokens, save_layer
        )

    def protected_forward(
        self,
        input_ids,
        position_ids=None,
        inference_params=None,
        num_last_tokens=0,
        save_layer=[],
    ):
        hidden_states = self.backbone(
            input_ids,
            position_ids=position_ids,
            inference_params=inference_params,
            save_layer=save_layer,
        )
        if len(save_layer) > 0:
            return hidden_states
        hidden_states = hidden_states[:, :, : self.config.d_model // 2]
        if num_last_tokens > 0:
            hidden_states = hidden_states[:, -num_last_tokens:]
        lm_logits = self.lm_head(hidden_states)
        CausalLMOutput = namedtuple("CausalLMOutput", ["loss", "logits"])
        return CausalLMOutput(loss=None, logits=lm_logits)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name,
        device=None,
        dtype=None,
        checkpoint_mixer=False,
        **kwargs,
    ):
        config_data = load_config_hf(pretrained_model_name)
        config = MambaConfig(**config_data)
        model = cls(
            config,
            device=device,
            dtype=dtype,
            checkpoint_mixer=checkpoint_mixer,
            **kwargs,
        )
        state_dict = load_state_dict_hf(
            pretrained_model_name, device=device, dtype=dtype
        )
        if state_dict.keys() != model.state_dict().keys():
            if checkpoint_mixer:
                for key in model.state_dict().keys():
                    if "ckpt_layer" in key:
                        state_dict[key] = state_dict.pop(key.replace("ckpt_layer.", ""))
                print(
                    "Using a model that was pretrained without gradient checkpointing and now want to use it. Changed the keys of the state_dict to match the model's keys."
                )
            else:
                for key in list(state_dict.keys()):
                    if "ckpt_layer" in key:
                        state_dict[key.replace("ckpt_layer.", "")] = state_dict.pop(key)
                print(
                    "Using a model that was pretrained with gradient checkpointing but now do not want to use it. Changed the keys of the state_dict to match the model's keys."
                )
            assert (
                state_dict.keys() == model.state_dict().keys()
            ), "The keys of the state_dict do not match the model's keys."
        model.load_state_dict(state_dict)
        return model

    def save_pretrained(self, save_directory):
        """
        Minimal implementation of save_pretrained for MambaLMHeadModel.
        Save the model and its configuration file to a directory.
        """
        # Ensure save_directory exists
        os.makedirs(save_directory, exist_ok=True)

        # Save the model's state_dict
        model_path = os.path.join(save_directory, "pytorch_model.bin")
        torch.save(self.state_dict(), model_path)

        # Save the configuration of the model
        config_path = os.path.join(save_directory, "config.json")
        with open(config_path, "w") as f:
            json.dump(self.config.__dict__, f)

class MambaLMHeadModelwith2DPosids(nn.Module, GenerationMixinSafe):

    def __init__(
        self,
        config: MambaConfig,
        initializer_cfg=None,
        device=None,
        dtype=None,
        checkpoint_mixer=False,
    ) -> None:
        self.config = config
        d_model = config.d_model
        n_layer = config.n_layer
        vocab_size = config.vocab_size
        max_position_embeddings = config.max_position_embeddings
        ssm_cfg = config.ssm_cfg
        rms_norm = config.rms_norm
        residual_in_fp32 = config.residual_in_fp32
        fused_add_norm = config.fused_add_norm
        pad_vocab_size_multiple = config.pad_vocab_size_multiple
        factory_kwargs = {"device": device, "dtype": dtype}

        super().__init__()
        if vocab_size % pad_vocab_size_multiple != 0:
            vocab_size += pad_vocab_size_multiple - (
                vocab_size % pad_vocab_size_multiple
            )
        self.backbone = MixerModelWith2DPosids(
            d_model=d_model,
            n_layer=n_layer,
            vocab_size=vocab_size,
            max_position_embeddings=max_position_embeddings,
            ssm_cfg=ssm_cfg,
            rms_norm=rms_norm,
            initializer_cfg=initializer_cfg,
            fused_add_norm=fused_add_norm,
            residual_in_fp32=residual_in_fp32,
            checkpoint_mixer=checkpoint_mixer,
            **factory_kwargs,
        )
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False, **factory_kwargs)

        # Initialize weights and apply final processing
        self.apply(
            partial(
                _init_weights,
                n_layer=n_layer,
                **(initializer_cfg if initializer_cfg is not None else {}),
            )
        )
        self.tie_weights()

    def tie_weights(self):
        self.lm_head.weight = self.backbone.embedding.weight

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        return self.backbone.allocate_inference_cache(
            batch_size, max_seqlen, dtype=dtype, **kwargs
        )

    def forward(
        self,
        input_ids,
        position_ids=None,
        seq_position_ids=None,
        inference_params=None,
        num_last_tokens=0,
        save_layer=[],
        *args,
        **kwargs,
    ):
        """
        "position_ids" is just to be compatible with Transformer generation. We don't use it.
        num_last_tokens: if > 0, only return the logits for the last n tokens
        """
        return self.protected_forward(
            input_ids,
            position_ids,
            seq_position_ids,
            inference_params,
            num_last_tokens,
            save_layer,
        )

    def protected_forward(
        self,
        input_ids,
        position_ids=None,
        seq_position_ids=None,
        inference_params=None,
        num_last_tokens=0,
        save_layer=[],
    ):
        hidden_states = self.backbone(
            input_ids,
            position_ids=position_ids,
            seq_position_ids=seq_position_ids,
            inference_params=inference_params,
            save_layer=save_layer,
        )
        if len(save_layer) > 0:
            return hidden_states
        hidden_states = hidden_states[:, :, : self.backbone.d_embeddings]
        if num_last_tokens > 0:
            hidden_states = hidden_states[:, -num_last_tokens:]
        lm_logits = self.lm_head(hidden_states)
        CausalLMOutput = namedtuple("CausalLMOutput", ["loss", "logits"])
        return CausalLMOutput(loss=None, logits=lm_logits)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name,
        device=None,
        dtype=None,
        checkpoint_mixer=False,
        **kwargs,
    ):
        config_data = load_config_hf(pretrained_model_name)
        config = MambaConfig(**config_data)
        model = cls(
            config,
            device=device,
            dtype=dtype,
            checkpoint_mixer=checkpoint_mixer,
            **kwargs,
        )
        state_dict = load_state_dict_hf(
            pretrained_model_name, device=device, dtype=dtype
        )
        if state_dict.keys() != model.state_dict().keys():
            if checkpoint_mixer:
                for key in model.state_dict().keys():
                    if "ckpt_layer" in key:
                        state_dict[key] = state_dict.pop(key.replace("ckpt_layer.", ""))
                print(
                    "Using a model that was pretrained without gradient checkpointing and now want to use it. Changed the keys of the state_dict to match the model's keys."
                )
            else:
                for key in list(state_dict.keys()):
                    if "ckpt_layer" in key:
                        state_dict[key.replace("ckpt_layer.", "")] = state_dict.pop(key)
                print(
                    "Using a model that was pretrained with gradient checkpointing but now do not want to use it. Changed the keys of the state_dict to match the model's keys."
                )
            assert (
                state_dict.keys() == model.state_dict().keys()
            ), "The keys of the state_dict do not match the model's keys."
        model.load_state_dict(state_dict)
        return model

    def save_pretrained(self, save_directory):
        """
        Minimal implementation of save_pretrained for MambaLMHeadModel.
        Save the model and its configuration file to a directory.
        """
        # Ensure save_directory exists
        os.makedirs(save_directory, exist_ok=True)

        # Save the model's state_dict
        model_path = os.path.join(save_directory, "pytorch_model.bin")
        torch.save(self.state_dict(), model_path)

        # Save the configuration of the model
        config_path = os.path.join(save_directory, "config.json")
        with open(config_path, "w") as f:
            json.dump(self.config.__dict__, f)
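The three LM-head variants above differ only in how they embed position information, which is also how they are selected via the `_mamba_model` mapping in protxlstm/train.py below. A hedged usage sketch for the 1D-position variant, valid only at the parent revision of this commit; it assumes the mamba_ssm CUDA kernels are installed and uses a made-up checkpoint path.

import torch
from protxlstm.models.mamba import MambaLMHeadModelwithPosids

model = MambaLMHeadModelwithPosids.from_pretrained(
    "path/to/protmamba_checkpoint",   # hypothetical checkpoint directory
    device="cuda",
    dtype=torch.bfloat16,
)
input_ids = torch.randint(0, 30, (1, 32), device="cuda")
position_ids = torch.arange(32, device="cuda").unsqueeze(0)  # per-residue positions
print(model(input_ids, position_ids=position_ids).logits.shape)  # (1, 32, padded vocab)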
protxlstm/plot_utils.py
DELETED
@@ -1,26 +0,0 @@

cd = {  # use dependent on model-type!!
    "xLSTM": "#3073AD",
    "Transformers": "#4B9D7A",
    "Mamba": "#DF8953",
    "S4": "#D275AB",
    "Hyena": "#E86A61",
}

def setup_matplotlib():
    import matplotlib.pyplot as plt
    from tueplots import bundles, axes
    bundles.icml2022()
    plt.rcParams.update(bundles.icml2022())
    plt.rcParams.update(axes.lines(base_width=0.5))
    plt.rcParams["text.usetex"] = False
    plt.rcParams['font.family'] = "sans-serif"
    plt.rcParams['font.serif'] = 'Arial'
    plt.rcParams['legend.edgecolor'] = 'grey'
    plt.rcParams['legend.framealpha'] = 0.7
    plt.rcParams['lines.linewidth'] = 1.2
    plt.rcParams['axes.grid'] = True
    plt.rcParams['axes.grid.axis'] = 'both'
    plt.rcParams['grid.alpha'] = 0.2
    plt.rcParams['axes.grid'] = True
    plt.rcParams['axes.prop_cycle'] = plt.cycler(color=cd.values())
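As a usage note, `setup_matplotlib` mutates the global rcParams, so any plot created afterwards picks up the ICML style and the per-model color cycle automatically. A minimal sketch, assuming `tueplots` is installed and using invented example values (valid at the parent revision, where the module still exists):

import matplotlib.pyplot as plt
from protxlstm.plot_utils import setup_matplotlib

setup_matplotlib()                                    # ICML bundle + model color cycle
plt.plot([1, 2, 3], [0.9, 0.7, 0.6], label="xLSTM")   # made-up example values
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend()
plt.savefig("example_curve.png")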
protxlstm/train.py
DELETED
@@ -1,338 +0,0 @@
# Original code from ProtMamba under Apache License 2.0.
#
# Modifications made by Niklas Schmidinger, Lisa Schneckenreiter and Sohvi Luukkonen
# - Extended to training of xlstm and transformer-based models
# - Predefined splits instead of on-the-fly creation
# - Option to overwrite config parameters from the command line
# - wandb logging

import argparse
import os

import torch
from omegaconf import OmegaConf
from transformers import TrainingArguments

from protxlstm.dataloaders import ProteinMemmapDataset, ProteinDataCollator
from protxlstm.models.xlstm import xLSTMConfig, xLSTMLMHeadModel
from protxlstm.models.llama import TransformerConfig, TransformerLMHeadModel
from protxlstm.trainer import ProtTrainer, EarlyStoppingCallback, get_last_checkpoint
from protxlstm.utils import (
    AA_TO_ID,
    compute_metrics,
    is_zero_rank,
    parse_override_args,
    print_number_of_parameters,
    print_zero_rank,
    set_optimizer_and_scheduler,
    setup_wandb,
    load_model,
)

def run(config):
    """
    Run training loop.

    Args:
        config (dict): dictionary with the configuration parameters.
    """

    if config.model_type == 'llama':
        pe_kwargs = {
            'max_position_embeddings': config["model"]["max_position_embeddings"],
            'add_position_ids': '1d',
        }
    elif config.model_type == 'mamba':
        from protxlstm.models.mamba import MambaConfig, MambaLMHeadModelSafe, MambaLMHeadModelwithPosids, MambaLMHeadModelwith2DPosids
        pe_kwargs = {
            'max_position_embeddings': config["model"]["max_position_embeddings"],
            'max_seq_position_embeddings': config["model"]["max_seq_position_embeddings"],
            'add_position_ids': config["model"]["add_position_ids"]
        }
    else:
        position_embeddings = config["model"]["position_embeddings"]
        assert position_embeddings in ["none", "abs_1d", "abs_2d", "rot_1d", "rot_2d"]
        if position_embeddings != "none":
            position_embeddings = position_embeddings.split("_")[-1]
        pe_kwargs = {
            'max_position_embeddings': config["model"]["max_position_embeddings"],
            'max_seq_position_embeddings': config["model"]["max_seq_position_embeddings"],
            'add_position_ids': position_embeddings
        }

    # Setup WandB
    wandb_run_name = setup_wandb(config)

    # Load datasets
    dataset_params = {
        "msa_memmap_path": config["msa_memmap_path"],
        "msa_memmap_meta_path": config["msa_memmap_meta_path"],
        "sample": config["sample_sequences"],
        "max_msa_len": config["max_msa_len"],
        "reverse": False,
        "seed": config["seed_sequence_sampling"],
        "troubleshoot": False,
        "fim_strategy": config["fim_strategy"],
        "always_mask": config["always_mask"],
        **pe_kwargs,
    }
    train_dataset = ProteinMemmapDataset(subset_path=config["train_set"], **dataset_params)
    valid_dataset = ProteinMemmapDataset(subset_path=config["valid_set"], **dataset_params)
    train_eval_dataset = ProteinMemmapDataset(subset_path=config["train_eval_set"], **dataset_params)

    print(f'Train set size: {len(train_dataset)} Train eval set size: {len(train_eval_dataset)} Valid set size: {len(valid_dataset)}')

    assert (
        len(AA_TO_ID) == config["model"]["vocab_size"]
    ), f"Vocab size in the config file does not match the one in the code. It should be {len(AA_TO_ID)}."

    # Create data collator for batched training
    data_collator = ProteinDataCollator(max_sequence_length=config["max_msa_len"])

    # Check datatypes
    if config["dtype"] == "float32":
        dtype = torch.float32
    elif config["dtype"] == "bfloat16":
        dtype = torch.bfloat16
    else:
        raise ValueError("dtype must be either float32 or bfloat16")

    # Initialize model
    if config.model_type == 'xlstm':

        # Load model for finetuning
        if config.finetune_model_path:
            # These fields are updated in the config loaded from the checkpoint
            config_update_kwargs = {
                "mlstm_backend": config["model"]["mlstm_block"]["mlstm"]["backend"],
                "mlstm_chunksize": config["model"]["mlstm_block"]["mlstm"]["chunk_size"],
                "checkpoint_blocks": config["model"]["checkpoint_blocks"],
                "rope_base_frequency": config["model"]["rope_base_frequency"]
            }
            model = load_model(
                config.finetune_model_path,
                model_class=xLSTMLMHeadModel,
                device="cuda",
                dtype=dtype,
                **config_update_kwargs
            )
        else:
            # Create new model
            xlstm_config = xLSTMConfig().init_from_dict(config["model"])
            model = xLSTMLMHeadModel(xlstm_config)

    elif config.model_type == 'mamba':

        _mamba_model = {
            "none": MambaLMHeadModelSafe,
            "1d": MambaLMHeadModelwithPosids,
            "2d": MambaLMHeadModelwith2DPosids,
        }
        Mamba = _mamba_model[config['model']["add_position_ids"]]

        # Load model for finetuning
        if config.finetune_model_path:
            model = load_model(
                config.finetune_model_path,
                model_class=Mamba,
                device="cuda",
                dtype=dtype,
                checkpoint_mixer=config["checkpoint_mixer"],
            )
        else:
            # Create new model
            mamba_config = MambaConfig(d_model=config['model']["d_model"],
                                       n_layer=config['model']["n_layer"],
                                       vocab_size=config['model']["vocab_size"],
                                       residual_in_fp32=config['model']["residual_in_fp32"])
            model = Mamba(mamba_config, dtype=dtype, checkpoint_mixer=config['model']["checkpoint_mixer"])

    elif config.model_type == 'llama':

        llama_config = TransformerConfig(
            d_model=config["model"]["d_model"],
            n_layer=config["model"]["n_layer"],
            n_heads=config["model"]["n_heads"],
            n_kv_heads=config["model"]["n_kv_heads"],
            bidirectional=config["model"]["bidirectional"],
            hidden_dim=config["model"]["hidden_dim"],
            multiple_of=config["model"]["multiple_of"],
            norm_eps=config["model"]["norm_eps"],
            max_length=config["model"]["max_length"],
            vocab_size=config["model"]["vocab_size"],
            dropout=config["model"]["dropout"],
            max_position_embeddings=config["model"]["max_position_embeddings"],
            rope_base_frequency=config["model"]["rope_base_frequency"],
        )

        model = TransformerLMHeadModel(llama_config)

    else:
        raise ValueError(f"Unsupported model_type: {config.model_type}. Expected 'xlstm', 'mamba', or 'llama'.")

    # TODO: Improve what we want to print
    if is_zero_rank():
        print_number_of_parameters(model)
    print_zero_rank(f"dtype: {config['dtype']}")
    print_zero_rank(f"Epochs: {config['num_epochs']}")
    print_zero_rank(f"Batch size per GPU: {config['batch_size']}")
    print_zero_rank(f"Gradient accumulation steps: {config['gradient_accumulation_steps']}")
    eff_batch_size = config["batch_size"] * config["gradient_accumulation_steps"]
    nr_gpus = torch.cuda.device_count()
    print_zero_rank(f"GPUs: {nr_gpus}")
    eff_batch_size *= nr_gpus
    print_zero_rank(f"Effective batch size: {eff_batch_size}")
    print_zero_rank(
        f"Steps per training epoch: {len(train_dataset) // config['batch_size']}, eff. steps: {len(train_dataset) // eff_batch_size}"
    )
    print_zero_rank(f"Steps per evaluation epoch: {len(valid_dataset) // config['batch_size']}")
    print_zero_rank(f"Max MSA length: {config['max_msa_len']}")
    ev_epochs = round(
        config["eval_steps"] * config["batch_size"] / len(train_dataset), 3
    )
    print_zero_rank(
        f"Evaluation every {config['eval_steps']} steps, i.e. {ev_epochs} epochs. Effectively every {config['eval_steps']*config['gradient_accumulation_steps']} steps, i.e. {ev_epochs*config['gradient_accumulation_steps']} epochs."
    )
    if config.model_type == 'xlstm' and config["model"]["checkpoint_blocks"]:
        print_zero_rank("Using gradient checkpointing")
    if config["compute_only_fim_loss"]:
        print_zero_rank("Computing only FIM loss for training")

    # Training callbacks
    es_callback = EarlyStoppingCallback(
        train_path=config["output_dir"] + '/' + wandb_run_name, config=config
    )
    callbacks = [es_callback]

    # Optimizer and schedulers
    optimizer, scheduler = set_optimizer_and_scheduler(
        config,
        len(train_dataset),
        model.parameters()
    )

    # Find checkpoint if available
    last_checkpoint = None
    if config.finetune_model_path is None:
        path = os.path.join(config["output_dir"], wandb_run_name)
        if os.path.exists(path):
            last_checkpoint = get_last_checkpoint(path)
            if last_checkpoint is None:
                print_zero_rank("No checkpoint found, starting training from scratch.")
            else:
                print_zero_rank(f"Resuming training from the last checkpoint: {last_checkpoint}")

    # Create trainer
    trainer = ProtTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset={"valid": valid_dataset, "train": train_eval_dataset},
        optimizers=(optimizer, scheduler),
        args=TrainingArguments(
            run_name=wandb_run_name,
            local_rank=int(os.getenv('LOCAL_RANK', '0')),
            learning_rate=config["learning_rate"],
            num_train_epochs=config["num_epochs"],
            per_device_train_batch_size=config["batch_size"],
            per_device_eval_batch_size=config["batch_size"],
            gradient_accumulation_steps=config["gradient_accumulation_steps"],
            eval_accumulation_steps=config["eval_accumulation_steps"],
            eval_strategy="steps",
            max_grad_norm=config["max_grad_norm"],
            bf16=config["dtype"] == "bfloat16",
            dataloader_num_workers=32,
            logging_steps=config["logging_steps"],
            eval_steps=config["eval_steps"],
            save_steps=config["save_steps"],
            output_dir=config["output_dir"] + '/' + wandb_run_name,
            logging_dir=config["output_dir"] + '/' + wandb_run_name,
            report_to="wandb" if is_zero_rank() else None,
            log_on_each_node=False,
            overwrite_output_dir=False,
            push_to_hub=False,
            label_names=["labels"],
        ),
        compute_only_fim_loss=config["compute_only_fim_loss"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=callbacks,
    )

    # Train model
    while True:
        if last_checkpoint is None and trainer.state.global_step == 0:
            eval_results = trainer.evaluate()
            print_zero_rank(
                f">>> Initial validation perplexity: {eval_results['eval_valid_perplexity/batch']:.2f}"
            )
        else:
            print_zero_rank(f"Resuming training from the last checkpoint: {last_checkpoint}")
        # Train
        trainer.train(resume_from_checkpoint=last_checkpoint)

        # Break training when the number of epochs is reached
        if (
            not es_callback.should_restart
            or trainer.state.epoch >= config["num_epochs"]
        ):
            eval_results = trainer.evaluate()
            print_zero_rank(
                f">>> Final perplexity: {eval_results['eval_valid_perplexity/batch']:.2f}"
            )
            break
        # If the training was interrupted because of a loss spike, restart from the last checkpoint
        last_checkpoint = es_callback.checkpoint_path

    return trainer

if __name__ == "__main__":

    # Default configuration file paths
    default_model_config = "configs/xlstm_default_config.yaml"
    default_train_config = "configs/train_default_config.yaml"

    parser = argparse.ArgumentParser(
        description="Train or finetune a model with the provided configuration."
    )
    parser.add_argument(
        "--model_config_path",
        type=str,
        default=default_model_config,
        help=f"Path to the model configuration file (default: {default_model_config})"
    )
    parser.add_argument(
        "--train_config_path",
        type=str,
        default=default_train_config,
        help=f"Path to the training and dataset configuration file (default: {default_train_config})"
    )
    parser.add_argument(
        "overrides",
        nargs=argparse.REMAINDER,
        help="Override configuration values using key=value format.",
    )

    args = parser.parse_args()

    # Check that the config files exist, or raise an error
    if not os.path.exists(args.model_config_path):
        raise FileNotFoundError(f"Model config file not found: {args.model_config_path}")
    if not os.path.exists(args.train_config_path):
        raise FileNotFoundError(f"Train config file not found: {args.train_config_path}")

    # Load the model and training configurations
    model_config = OmegaConf.load(args.model_config_path)
    train_config = OmegaConf.load(args.train_config_path)

    # Merge the model and training configurations
    config = OmegaConf.merge(model_config, train_config)

    # Parse overrides
    if args.overrides:
        overrides = parse_override_args(args.overrides)
        config.merge_with(OmegaConf.create(overrides))

    # Run the training/finetuning process
    run(config)
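The `__main__` block above merges two YAML configs and then folds in command-line overrides produced by `parse_override_args` (defined in `protxlstm.utils`, not shown in this diff). A minimal sketch of the same merge pattern using only OmegaConf's public API; `OmegaConf.from_dotlist` stands in for `parse_override_args`, and the config keys are illustrative:

from omegaconf import OmegaConf

# Stand-ins for OmegaConf.load(...) on the two YAML files.
model_config = OmegaConf.create({"model": {"d_model": 512, "n_layer": 16}})
train_config = OmegaConf.create({"batch_size": 4, "learning_rate": 1e-4})

# Later arguments win on key conflicts, as in train.py's OmegaConf.merge call.
config = OmegaConf.merge(model_config, train_config)

# Command-line overrides arrive as ["key=value", ...] via argparse.REMAINDER.
overrides = ["model.d_model=1024", "batch_size=8"]
config.merge_with(OmegaConf.from_dotlist(overrides))

assert config.model.d_model == 1024 and config.batch_size == 8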
protxlstm/trainer.py
DELETED
@@ -1,123 +0,0 @@
# Original code from ProtMamba under Apache License 2.0.
#
# Modifications made by Niklas Schmidinger, Lisa Schneckenreiter and Sohvi Luukkonen
# - MambaTrainer renamed to ProtTrainer

import os
import re

import torch
from transformers import Trainer, TrainerCallback

from protxlstm.utils import AA_TO_ID, find_fim_indices

class ProtTrainer(Trainer):
    """
    Base HuggingFace Trainer used for training.

    from https://github.com/havenhq/mamba-chat/blob/main/trainer/mamba_trainer.py"""
    def __init__(self, compute_only_fim_loss, **kwargs):
        super().__init__(**kwargs)
        self.compute_only_fim_loss = compute_only_fim_loss

    def compute_loss(self, model, inputs, return_outputs=False):
        input_ids = inputs.pop("input_ids")
        labels = inputs.pop("labels")
        if "seq_position_ids" in inputs and "position_ids" in inputs:
            position_ids = inputs.pop("position_ids")
            seq_position_ids = inputs.pop("seq_position_ids")
            output = model(input_ids, position_ids=position_ids, seq_position_ids=seq_position_ids)
        elif "position_ids" in inputs:
            position_ids = inputs.pop("position_ids")
            output = model(input_ids, position_ids=position_ids)
        else:
            output = model(input_ids)
        lm_logits = output.logits

        labels = labels.to(lm_logits.device)
        shift_logits = lm_logits[:, :-1, :].contiguous()
        labels = labels[:, 1:].contiguous()

        loss_fct = torch.nn.CrossEntropyLoss()
        if self.compute_only_fim_loss:
            # start and end tokens
            is_cls_tokens = (labels == AA_TO_ID["<cls>"])
            is_eos_tokens = (labels == AA_TO_ID["<eos>"])
            bool_fim = find_fim_indices(is_cls_tokens, is_eos_tokens)
            # include also the cls token
            bool_fim = bool_fim | is_cls_tokens
            inds = torch.where(bool_fim)
            lm_loss = loss_fct(shift_logits[inds[0], inds[1], :], labels[bool_fim])
        else:
            lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1))

        return (lm_loss, output) if return_outputs else lm_loss

    def save_model(self, output_dir, _internal_call):
        if int(os.getenv('LOCAL_RANK', '0')) == 0:
            self.model.save_pretrained(output_dir)

PREFIX_CHECKPOINT_DIR = "checkpoint"
_re_checkpoint = re.compile(r"^" + PREFIX_CHECKPOINT_DIR + r"\-(\d+)$")

def get_last_checkpoint(folder, max_steps=None):
    content = os.listdir(folder)
    checkpoints = [
        path
        for path in content
        if _re_checkpoint.search(path) is not None and os.path.isdir(os.path.join(folder, path))
    ]
    if len(checkpoints) == 0:
        return

    max_steps = max_steps if max_steps is not None else float("inf")
    def func(x):
        num = int(_re_checkpoint.search(x).groups()[0])
        return num if num < max_steps else -1
    return os.path.join(folder, max(checkpoints, key=func))

class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, train_path, config=None):
        self.step_counter_reset = 0
        self.step_counter_stop = 0
        self.best_loss = None
        self.train_path = train_path
        self.patience = config["patience"]
        self.metric_name = config["early_stopping_metric"]
        self.checkpoint_path = None
        self.should_restart = False
        self.eval_steps = config["eval_steps"]
        self.loss_increase_factor = config["loss_increase_factor"]

    def get_checkpoint_path(self, max_steps):
        last_checkpoint = None
        if os.path.exists(self.train_path):
            last_checkpoint = get_last_checkpoint(self.train_path, max_steps)
            if last_checkpoint is None:
                print("No checkpoint found, starting training from scratch.")
            else:
                print(f"Max checkpoint allowed: {max_steps}, restarting from {last_checkpoint}.")
        return last_checkpoint

    def on_evaluate(self, args, state, control, model, metrics, **kwargs):
        if self.metric_name in metrics:
            if self.best_loss is None:
                self.best_loss = metrics[self.metric_name]
            elif self.best_loss * self.loss_increase_factor < metrics[self.metric_name]:
                self.step_counter += 1
                if self.step_counter >= self.patience:
                    checkpoint_path = self.get_checkpoint_path(max_steps=(state.global_step - self.patience * self.eval_steps))
                    control.should_training_stop = True
                    self.checkpoint_path = checkpoint_path
                    self.should_restart = True
            else:
                self.step_counter = 0
                self.best_loss = min(self.best_loss, metrics[self.metric_name])
                self.should_restart = False

    def on_train_begin(self, args, state, control, **kwargs):
        self.step_counter = 0
        self.best_loss = None
        self.should_restart = False
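When `compute_only_fim_loss` is set, `compute_loss` restricts the cross-entropy to the fill-in-the-middle region (plus the `<cls>` positions) rather than averaging over every token. `find_fim_indices` comes from `protxlstm.utils` and is not shown in this diff, so the sketch below hand-builds the boolean mask with toy tensors; only the masked-indexing pattern mirrors the branch above:

import torch

batch, seq_len, vocab = 2, 6, 40
torch.manual_seed(0)
shift_logits = torch.randn(batch, seq_len, vocab)   # stand-in for shifted model logits
labels = torch.randint(0, vocab, (batch, seq_len))  # stand-in for shifted labels

# Pretend positions 2..4 of each row form the FIM span (incl. the <cls> slot);
# in the repo this mask comes from find_fim_indices(is_cls_tokens, is_eos_tokens).
bool_fim = torch.zeros(batch, seq_len, dtype=torch.bool)
bool_fim[:, 2:5] = True

loss_fct = torch.nn.CrossEntropyLoss()
inds = torch.where(bool_fim)
fim_loss = loss_fct(shift_logits[inds[0], inds[1], :], labels[bool_fim])

# For comparison: the unmasked loss over all positions, as in the else branch.
full_loss = loss_fct(shift_logits.view(-1, vocab), labels.view(-1))
print(fim_loss.item(), full_loss.item())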
run.sh
DELETED
@@ -1,6 +0,0 @@
#!/bin/bash
CONDA_ENV=$(head -1 /code/environment.yml | cut -d" " -f2)
eval "$(conda shell.bash hook)"
conda activate $CONDA_ENV

streamlit run app.py --server.port 7860 --server.address 0.0.0.0