inwaves committed on
Commit
c13ef0b
1 Parent(s): 0cab14c

WIP getting the Pile dataset up and running

Files changed (4):
  1. main.py +127 -20
  2. model.py +16 -8
  3. requirements.txt +6 -5
  4. utils.py +27 -19
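
Since the commit message only flags this as work in progress, here is a minimal sketch (not part of the commit) of the streaming access pattern the new setup() in main.py relies on; the exact fields of each streamed example are an assumption here, not something the diff shows:

from datasets import load_dataset

# Stream The Pile instead of downloading it in full: load_dataset returns
# IterableDatasets keyed by split when streaming=True.
ds = load_dataset("the_pile", streaming=True)

# Each streamed example is a plain dict of raw text plus metadata (assumed layout).
sample = next(iter(ds["train"]))
print(sample["text"][:200])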
main.py CHANGED
@@ -1,44 +1,151 @@
+import argparse
 import torch as t
 import torch.nn as nn
 import torch.functional as F
 import torch.optim as optim
-import argparse
+from tqdm import tqdm
+import wandb
+
+from typing import Tuple
+from torch.utils.data.dataloader import DataLoader
+from datasets import load_dataset
 from utils import OsSoluConfig
 from model import OsSoluModel
-from typing import Tuple

-def parse_arguments() -> argparse.Namespace:
-    # TODO: command-line args for hparams
+WANDB_PROJECT_NAME = "os_solu"
+DEVICE = "cuda" if t.cuda.is_available() else "cpu"
+
+def parse_arguments() -> dict:
+    """Parses command-line arguments for this model run. Arguments of type string have allowed values,
+    which are enforced. Default parameter values are provided such that fields in the config are never None.
+
+    Raises:
+        ValueError: optimiser type must be adam or sgd.
+        ValueError: attention type must be rotary or unidirectional.
+
+    Returns:
+        dict: a dictionary containing the command-line arguments parsed by this function.
+    """
     parser = argparse.ArgumentParser(description="Parse command-line arguments for this model.")
+    parser.add_argument("--batch_size", type=int, default=256, help="Batch size used in training.")
     parser.add_argument("--d_model", type=int, default=512, help="Hidden size of the model.")
-    parser.add_argument("--vocab_size", type=int, default=65536, help="Vocabulary size of the input sequence.")
-    parser.add_argument("--learning_rate", type=float, default=1e-3, help="Learning rate for the optimiser.")
-    parser.add_argument("--num_embeddings", type=int, default=1024, help="Number of embeddings.")
-    parser.add_argument("--num_blocks", type=int, default=1, help="Number of transformer blocks.")
     parser.add_argument("--dropout", type=float, default=0.1, help="Probability of dropout.")
+    parser.add_argument("--learning_rate", type=float, default=1e-3, help="Learning rate for the optimiser.")
     parser.add_argument("--ln_eps", type=float, default=1e-3, help="Layer norm epsilon.")
-    parser.add_argument("--num_heads", type=int, default=4, help="Number of attention heads in each attention layer.")
-    parser.add_argument("--self_attention_type", type=str, default="unidirectional", help="What type of attention to use: rotary or unidirectional. ")
     parser.add_argument("--max_positional_embeddings", type=int, default=1024, help="Maximum number of positional embeddings.")
-    args = parser.parse_args()
+    parser.add_argument("--nonlinearity", type=str, default="solu", help=" Nonlinearity to use inside MLP block: must be relu or solu.")
+    parser.add_argument("--num_blocks", type=int, default=1, help="Number of transformer blocks.")
+    parser.add_argument("--num_embeddings", type=int, default=1024, help="Number of embeddings.")
+    parser.add_argument("--num_epochs", type=int, default=5, help="Number of epochs to run for.")
+    parser.add_argument("--num_heads", type=int, default=4, help="Number of attention heads in each attention layer.")
+    parser.add_argument("--optimiser_type", type=str, default="adam", help="Optimiser type.")
+    parser.add_argument("--self_attention_type", type=str, default="unidirectional", help="What type of attention to use: rotary or unidirectional.")
+    parser.add_argument("--vocab_size", type=int, default=65536, help="Vocabulary size of the input sequence.")
+    args = vars(parser.parse_args())
+
+    # Parse string arguments.
+    allowed_values = {
+        "optimiser_type": ["adam", "sgd"],
+        "self_attention_type": ["unidirectional", "rotary"],
+        "nonlinearity": ["relu", "solu"],
+    }
+
+    for key, values in allowed_values.items():
+        if args[key] not in values:
+            raise ValueError(f"{key} should be one of {values}.")
+
     return args

-def train(config: OsSoluConfig, model: OsSoluModel) -> OsSoluModel:
+def train(config: OsSoluConfig, model: OsSoluModel, train_dataloader: DataLoader) -> OsSoluModel:
+    """Trains a model using the config and training dataset provided.
+
+    Args:
+        config (OsSoluConfig): The config object.
+        model (OsSoluModel): The model to train.
+        train_dataloader (t.utils.data.DataLoader): The training dataset provided as a torch DataLoader object.
+
+    Returns:
+        OsSoluModel: The trained model.
+    """
     # TODO: training loop
-
+    train_loss_fn = t.nn.CrossEntropyLoss()
+    wandb.watch(model, criterion=train_loss_fn, log="all", log_freq=10, log_graph=True)
+
+    # Initialise optimiser.
+    opt = optim.Adam if config.optimiser_type.lower() == "adam" else optim.SGD
+    optimiser = opt(model.parameters(), lr=config.learning_rate)
+
+    # Train loop.
+    examples_seen = 0
+    for epoch in range(config.num_epochs):
+        for i, (data, target) in enumerate(tqdm(train_dataloader)):
+            print(data, target)
+            data = data.to(DEVICE)
+            target = target.to(DEVICE)
+
+            predictions = model(data)
+            accuracy = (predictions.argmax(dim=-1) == target).sum() / len(data)
+            optimiser.zero_grad()
+            loss = train_loss_fn(target, predictions)
+            loss.backward()
+            optimiser.step()
+
+            wandb.log(dict(train_loss=loss, train_accuracy=accuracy, elapsed=time.time() - start_time), step=examples_seen)
+            examples_seen += len(data)
+
     return model

-def eval():
-    pass
+def eval(model: OsSoluModel, test_dataloader: DataLoader) -> None:
+    """Evaluates a trained model on the test dataset provided.
+
+    Args:
+        model (OsSoluModel): The trained model.
+        test_dataset (t.utils.data.Dataset): The dataset on which to evaluate the model.
+    """
+    test_loss_fn = t.nn.CrossEntropyLoss()
+
+    # Eval loop.
+    examples_seen = 0
+    total_loss, num_correct = 0, 0
+    model.eval()
+    with t.inference_mode():
+        for i, (data, target) in enumerate(tqdm(test_dataloader)):
+            data = data.to(DEVICE)
+            target = target.to(DEVICE)
+
+            predictions = model(data)
+            num_correct += (predictions.argmax(dim=-1) == target).sum().item()
+            total_loss += test_loss_fn(target, predictions).item()
+            examples_seen += len(data)
+        wandb.log(dict(test_loss=total_loss, test_accuracy=num_correct / examples_seen, elapsed=time.time() - start_time), step=examples_seen)
+
+    # Save the model's state on disk, then upload to wandb.
+    filename = f"{wandb.run.dir}/model_state_dict.pt"
+    t.save(model.state_dict(), filename)
+    wandb.save(filename)
+

 def setup() -> Tuple[OsSoluConfig, OsSoluModel]:
-    # TODO: wandb logging
+    """This function delegates the setup to various helper functions.
+
+    Returns:
+        Tuple[OsSoluConfig, OsSoluModel, datasets.iterable_dataset.IterableDataset, datasets.iterable_dataset.IterableDataset]: A tuple containing a config, a model, a training dataset and a test dataset.
+    """
     args = parse_arguments()
+    wandb.init(project=WANDB_PROJECT_NAME, config=args)
     config = OsSoluConfig(args)
     model = OsSoluModel(config)
-    return config, model
+
+    # Load and prep data.
+    ds = load_dataset("the_pile", streaming=True)
+    train_dataset = ds["train"].with_format("torch")
+    train_dataloader = DataLoader(train_dataset, batch_size=config.batch_size)
+
+    test_dataset = ds["test"].with_format("torch")
+    test_dataloader = DataLoader(test_dataset, batch_size=config.batch_size)
+    return config, model, (train_dataloader, test_dataloader)

 if __name__=="__main__":
-    config, model = setup()
-    trained_model = train(config, model)
-    eval()
+    config, model, (train_dataloader, test_dataloader) = setup()
+    trained_model = train(config, model, train_dataloader)
+    eval(trained_model, test_dataloader)
model.py CHANGED
@@ -8,28 +8,35 @@ from einops import rearrange, repeat, reduce
 from utils import OsSoluConfig


+
 class OsSoluModel(nn.Module):
+    """An open-source implementation of a SoLU-based transformer. This is a GPT-style architecture model
+    where the nonlinearity in the MLP block is replaced with SoLU(x) = x * softmax(x)."""
     def __init__(self, config: OsSoluConfig) -> None:
         super().__init__()
-        normalised_shape = None # TODO: normalised_shape should be defined properly
         self.config = config
         self.embed_positions = nn.Embedding(config.max_positional_embeddings, config.d_model)
         self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model)
         self.dropout = nn.Dropout(config.dropout)
         self.transformer_blocks = nn.ModuleList([GPT2Block(config) for _ in range(config.num_blocks)])
-        self.final_ln = nn.LayerNorm(normalized_shape, config.ln_eps)
-        self.unembed = nn
+        self.final_ln = nn.LayerNorm(config.d_model, config.ln_eps)

     def forward(self, x: t.Tensor) -> t.Tensor:
         positional_embeddings = self.embed_positions(t.arange(x.size(1)))
         token_embeddings = self.embed_tokens(x)
         embeddings = positional_embeddings + token_embeddings
         out = self.dropout(embeddings)
-        out = self.transformer_blocks(out)
+        for block in self.transformer_blocks:
+            out = block(out)
+
+        # Unembedding is not separate, so we just einsum with token embedding weights.
+        out = einsum("vocab hidden, batch seq hidden -> batch seq vocab", self.embed_tokens.weight, out)
+        return out

 class SoLU(nn.Module):
+    """A simple wrapper around the SoLU function such that it can be used as a layer in a model."""
     def __init__(self):
-        pass
+        super().__init__()

     def forward(self, x: t.Tensor) -> t.Tensor:
         return x * x.softmax(dim=-1)
@@ -39,12 +46,13 @@ class GPT2Block(nn.Module):
         super().__init__()
         self.config = config

-        self.layer_norm1 = nn.LayerNorm(normalized_shape, config.ln_eps)
+        self.layer_norm1 = nn.LayerNorm(config.d_model, config.ln_eps)
         self.attention = UnidirectionalAttention(config) if config.self_attention_type == "unidirectional" else RotaryAttention(config)
+        nonlinearity = SoLU() if config.nonlinearity == "solu" else nn.ReLU()
         self.MLP = nn.Sequential(
-            nn.LayerNorm(normalized_shape, config.ln_eps),
+            nn.LayerNorm(config.d_model, config.ln_eps),
             nn.Linear(config.d_model, 4*config.d_model),
-            SoLU(),
+            nonlinearity,
             nn.Linear(4*config.d_model, config.d_model),
             nn.Dropout(config.dropout)
         )
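
As a quick illustration of the SoLU nonlinearity that model.py wires into the MLP block (the numbers below are just an example, not taken from the repo):

import torch as t

x = t.tensor([1.0, 2.0, 3.0])
# softmax(x) ≈ [0.090, 0.245, 0.665], so SoLU(x) = x * softmax(x) ≈ [0.090, 0.489, 1.996]
solu = x * x.softmax(dim=-1)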
requirements.txt CHANGED
@@ -1,13 +1,14 @@
-torch
-wandb
+datasets
 einops
 fancy_einsum
-tqdm
 ipykernel
-notebook
 ipywidgets
 jupyter
 matplotlib
+notebook
 numpy-stl
+plotly
+torch
+tqdm
 wandb
-plotly
+zstandard
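
Of the new entries, datasets and zstandard are the ones this commit actually needs: datasets provides load_dataset with streaming support, and zstandard is presumably required to decompress The Pile's zstd-compressed JSONL shards on the fly. The remaining churn comes from sorting the list alphabetically and dropping a duplicate wandb entry rather than from new dependencies.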
utils.py CHANGED
@@ -1,27 +1,35 @@
-import argparse
-
 class OsSoluConfig:
+    """A class to hold hyperparameters for the model itself and for the training process."""
+
+    batch_size: int # Training data batch size.
     d_model: int # Hidden size of the model.
-    vocab_size: int # Vocabulary size of the input sequence. Unsure about this.
-    learning_rate: float # Learning rate for the optimiser.
-    num_embeddings: int # Number of embeddings. Unsure about this.
-    num_blocks: int # Number of transformer blocks.
     dropout: float # Probability of dropout.
+    learning_rate: float # Learning rate for the optimiser.
     ln_eps: float # Layer norm epsilon.
+    max_positional_embeddings: int # Maximum number of positional embeddings.
+    nonlinearity: str # Nonlinearity to use inside MLP block: must be ReLU or SoLU.
+    num_blocks: int # Number of transformer blocks.
+    num_embeddings: int # Number of embeddings. Unsure about this.
+    num_epochs: int # Number of epochs for this run.
     num_heads: int # Number of attention heads in each attention layer.
     self_attention_type: str # What type of attention to use: rotary or unidirectional.
-    max_positional_embeddings: int # Maximum number of positional embeddings.
-
-    def __init__(self, args: argparse.Namespace) -> None:
+    optimiser_type: str # Optimiser type: SGD, Adam.
+    vocab_size: int # Vocabulary size of the input sequence. Unsure about this.
+
+    def __init__(self, args: dict) -> None:
         """Initialise this config class with values provided by a command-line argument parser.
         Values are never None here, as we provide suitable defaults in the parser call."""
-        self.d_model = args.d_model
-        self.vocab_size = args.vocab_size
-        self.learning_rate = args.learning_rate
-        self.num_embeddings = args.num_embeddings
-        self.num_blocks = args.num_blocks
-        self.dropout = args.dropout
-        self.ln_eps = args.ln_eps
-        self.num_heads = args.num_heads
-        self.self_attention_type = args.self_attention_type
-        self.max_positional_embeddings = args.max_positional_embeddings
+        self.batch_size = args["batch_size"]
+        self.d_model = args["d_model"]
+        self.dropout = args["dropout"]
+        self.learning_rate = args["learning_rate"]
+        self.ln_eps = args["ln_eps"]
+        self.max_positional_embeddings = args["max_positional_embeddings"]
+        self.nonlinearity = args["nonlinearity"]
+        self.num_blocks = args["num_blocks"]
+        self.num_embeddings = args["num_embeddings"]
+        self.num_epochs = args["num_epochs"]
+        self.num_heads = args["num_heads"]
+        self.optimiser_type = args["optimiser_type"]
+        self.self_attention_type = args["self_attention_type"]
+        self.vocab_size = args["vocab_size"]