dsmueller committed on
Commit 9e70bac · 1 Parent(s): be97146

Add new files and update dependencies
Files changed (6)
  1. .gitignore +4 -0
  2. Dockerfile +44 -0
  3. app.py +124 -0
  4. playground.ipynb +64 -0
  5. poetry.lock +0 -0
  6. pyproject.toml +20 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
+ .venv/
+ __pycache__/
+ .env
+ cache
Dockerfile ADDED
@@ -0,0 +1,44 @@
+ # Use an official Python runtime as a parent image
+ FROM python:3.11.1
+
+ # Set up user
+ RUN useradd -m -u 1000 user
+ USER user
+
+ # Install Cargo (Rust's package manager), needed for hf_transfer
+ # RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
+
+ # Set up environment variables
+ # ENV HOME=/home/user \
+ #     PATH=/home/user/.local/bin:/home/user/.cargo/bin:$PATH
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:/home/user/.cargo/bin:$PATH
+
+ # Set the working directory in the container
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at /home/user/app
+ COPY --chown=user . $HOME/app
+
+ # Set user to root
+ USER root
+
+ # Install any needed packages specified in requirements.txt
+ RUN python -m venv /venv && \
+     /venv/bin/pip install --no-cache-dir -r requirements.txt
+
+ # Fix broken OpenCV installation for docker container
+ # RUN apt-get update && apt-get install -y libgl1 && apt-get install -y python3-opencv
+ # RUN /venv/bin/pip install opencv-python
+
+ # Set user back to non-root
+ USER user
+
+ # Make port 80 available to the world outside this container
+ EXPOSE 80
+
+ # Generate requirements.txt using pip freeze
+ RUN /venv/bin/pip freeze > requirements.txt
+
+ # Run train_llm.py when the container launches
+ CMD ["/bin/bash", "-c", "source /venv/bin/activate && python train_llm.py"]
app.py ADDED
@@ -0,0 +1,124 @@
+ from datasets import load_dataset
+ from trl import SFTTrainer
+ from peft import LoraConfig
+
+ import os
+ from uuid import uuid4
+ import pandas as pd
+
+ import subprocess
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ def max_token_len(dataset):
+     max_seq_length = 0
+     for row in dataset:
+         tokens = len(tokenizer(row['text'])['input_ids'])
+         if tokens > max_seq_length:
+             max_seq_length = tokens
+     return max_seq_length
+
+ # model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'
+ model_name = 'mistralai/Mistral-7B-v0.1'
+ # model_name = 'distilbert-base-uncased'
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model_max_length = tokenizer.model_max_length
+ print("Model Max Length:", model_max_length)
+
+ # dataset = load_dataset("imdb", split="train")
+ dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
+ dataset = load_dataset(dataset_name)
+
+ # Write dataset files into data directory
+ data_directory = './fine_tune_data/'
+
+ # Create the data directory if it doesn't exist
+ os.makedirs(data_directory, exist_ok=True)
+
+ # Write the train data to a CSV file
+ train_data='train_data'
+ train_filename = os.path.join(data_directory, train_data)
+ dataset['train'].to_pandas().to_csv(train_filename+'.csv', columns=['text'], index=False)
+ max_token_length_train=max_token_len(dataset['train'])
+ print('Max token length train: '+str(max_token_length_train))
+
+ # Write the validation data to a CSV file
+ validation_data='validation_data'
+ validation_filename = os.path.join(data_directory, validation_data)
+ dataset['validation'].to_pandas().to_csv(validation_filename+'.csv', columns=['text'], index=False)
+ max_token_length_validation=max_token_len(dataset['validation'])
+ print('Max token length validation: '+str(max_token_length_validation))
+
+ max_token_length=max(max_token_length_train,max_token_length_validation)
+ if max_token_length > model_max_length:
+     raise ValueError("Maximum token length exceeds model limits.")
+ block_size=2*max_token_length
+ print('Block size: '+str(block_size))
+
+ # Define project parameters
+ username='ai-aerospace'
+ project_name='./llms/'+'ams_data_train-100_'+str(uuid4())
+ repo_name='ams-data-train-100-'+str(uuid4())
+
+ model_params={
+     "project_name": project_name,
+     "model_name": model_name,
+     "repo_id": username+'/'+repo_name,
+     "train_data": train_data,
+     "validation_data": validation_data,
+     "data_directory": data_directory,
+     "block_size": block_size,
+     "model_max_length": max_token_length,
+     "logging_steps": -1,
+     "evaluation_strategy": "epoch",
+     "save_total_limit": 1,
+     "save_strategy": "epoch",
+     "mixed_precision": "fp16",
+     "lr": 0.00003,
+     "epochs": 3,
+     "batch_size": 2,
+     "warmup_ratio": 0.1,
+     "gradient_accumulation": 1,
+     "optimizer": "adamw_torch",
+     "scheduler": "linear",
+     "weight_decay": 0,
+     "max_grad_norm": 1,
+     "seed": 42,
+     "quantization": "int4",
+     "lora_r": 16,
+     "lora_alpha": 32,
+     "lora_dropout": 0.05
+ }
+ for key, value in model_params.items():
+     os.environ[key] = str(value)
+
+ print(model_params)
+
+ ### Load model
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     load_in_4bit=True
+ )
+
+ ### Start trainer
+ # trainer = SFTTrainer(
+ #     model_name,
+ #     train_dataset=dataset,
+ #     dataset_text_field="text",
+ #     max_seq_length=512,
+ # )
+
+ peft_config = LoraConfig(
+     r=model_params['lora_r'],
+     lora_alpha=model_params['lora_alpha'],
+     lora_dropout=model_params['lora_dropout']
+ )
+
+ trainer = SFTTrainer(
+     model,
+     train_dataset=dataset['train'],
+     dataset_text_field="text",
+     peft_config=peft_config,
+     max_seq_length=model_params['model_max_length']
+ )
+
+ trainer.train()
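
Note: once `trainer.train()` finishes, a natural follow-up (not included in this commit) is to persist the LoRA adapter and reload it on top of the base model for a quick smoke test. The sketch below is a hedged continuation of app.py: `adapter_dir` and the prompt are illustrative assumptions, while `trainer.save_model` and `PeftModel.from_pretrained` are the standard Trainer/PEFT entry points.

```python
# Hedged sketch, not part of this commit: save the LoRA adapter produced by
# SFTTrainer above and reload it on top of the base model for a quick check.
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

adapter_dir = "./fine_tune_data/adapter"  # assumed output path, not from the commit

# Save only the adapter weights (the quantized base model stays untouched)
trainer.save_model(adapter_dir)

# Reload: base model first, then attach the trained adapter
base = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", load_in_4bit=True)
tuned = PeftModel.from_pretrained(base, adapter_dir)

# Quick generation smoke test
tok = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
inputs = tok("Aerospace mechanisms are", return_tensors="pt").to(base.device)
output = tuned.generate(**inputs, max_new_tokens=40)
print(tok.decode(output[0], skip_special_tokens=True))
```

Saving this way writes only the small adapter files, so the full 7B checkpoint never needs to be duplicated on disk.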
playground.ipynb ADDED
@@ -0,0 +1,64 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from transformers import AutoModel\n",
+     "import torch"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def calculate_memory_required(model_name):\n",
+     "    model = AutoModel.from_pretrained(model_name)\n",
+     "\n",
+     "    # Calculate total parameters (assuming model parameters and gradients are in FP32)\n",
+     "    total_params = sum(p.numel() for p in model.parameters())\n",
+     "    total_memory_params = total_params * 4  # 4 bytes for FP32\n",
+     "\n",
+     "    # Optimizer states (e.g., for Adam, it's roughly the same as the model parameters)\n",
+     "    optimizer_memory = total_memory_params * 2  # Adam stores two values per parameter\n",
+     "\n",
+     "    # Batch size and sequence length\n",
+     "    batch_size = 32\n",
+     "    sequence_length = 512\n",
+     "    # Estimate activation memory (very rough estimate)\n",
+     "    activation_memory_per_example = sequence_length * model.config.hidden_size * 4  # 4 bytes for FP32\n",
+     "    total_activation_memory = batch_size * activation_memory_per_example\n",
+     "\n",
+     "    # Total estimated memory\n",
+     "    total_estimated_memory = total_memory_params + optimizer_memory + total_activation_memory\n",
+     "\n",
+     "    print(f\"Estimated memory for model and gradients: {total_memory_params / (1024 ** 3):.2f} GB\")\n",
+     "    print(f\"Estimated memory for optimizer states: {optimizer_memory / (1024 ** 3):.2f} GB\")\n",
+     "    print(f\"Estimated memory for activations: {total_activation_memory / (1024 ** 3):.2f} GB\")\n",
+     "    print(f\"Total estimated memory: {total_estimated_memory / (1024 ** 3):.2f} GB\")\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Load model\n",
+     "model_name = 'mistralai/Mistral-7B-v0.1'\n",
+     "calculate_memory_required(model_name)\n"
+    ]
+   }
+  ],
+  "metadata": {
+   "language_info": {
+    "name": "python"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
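
Note: the notebook's formula can also be evaluated by hand without downloading any weights. The sketch below is not part of the commit; the parameter count (~7.24e9) and hidden size (4096) for mistralai/Mistral-7B-v0.1 are assumed approximate figures, and the batch size and sequence length mirror the notebook's defaults.

```python
# Hedged back-of-the-envelope version of the notebook's estimate for Mistral-7B-v0.1,
# using assumed approximate figures instead of loading the model with AutoModel.
total_params = 7.24e9          # approximate parameter count (assumption)
hidden_size = 4096             # Mistral-7B hidden size (assumption)
batch_size, sequence_length = 32, 512
bytes_fp32 = 4

model_memory = total_params * bytes_fp32                                  # weights in FP32
optimizer_memory = model_memory * 2                                       # Adam: two states per parameter
activation_memory = batch_size * sequence_length * hidden_size * bytes_fp32  # rough activation estimate

total = model_memory + optimizer_memory + activation_memory
print(f"Weights: {model_memory / 1024**3:.1f} GB, "
      f"optimizer: {optimizer_memory / 1024**3:.1f} GB, "
      f"activations: {activation_memory / 1024**3:.1f} GB, "
      f"total: {total / 1024**3:.1f} GB")
```

By this rough formula the full-precision, full-parameter setup lands around 80 GB, which is consistent with app.py loading the base model in 4-bit and training only LoRA adapters.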
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,20 @@
+ [tool.poetry]
+ name = "fine-tuning-playground"
+ version = "0.1.0"
+ description = ""
+ authors = ["dsmueller <[email protected]>"]
+ readme = "README.md"
+
+ [tool.poetry.dependencies]
+ python = "^3.11"
+ trl = "^0.7.6"
+ datasets = "^2.16.0"
+ transformers = "^4.36.2"
+ torch = "^2.1.2"
+ ipykernel = "^6.27.1"
+ peft = "^0.7.1"
+
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"