dsmueller committed on
Commit 9e70bac · 1 Parent(s): be97146

Add new files and update dependencies
Files changed (6)
  1. .gitignore +4 -0
  2. Dockerfile +44 -0
  3. app.py +124 -0
  4. playground.ipynb +64 -0
  5. poetry.lock +0 -0
  6. pyproject.toml +20 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
+ .venv/
+ __pycache__/
+ .env
+ cache
Dockerfile ADDED
@@ -0,0 +1,44 @@
+ # Use an official Python runtime as a parent image
+ FROM python:3.11.1
+
+ # Set up user
+ RUN useradd -m -u 1000 user
+ USER user
+
+ # Install Cargo (Rust's package manager), needed for hf_transfer
+ # RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
+
+ # Set up environment variables
+ # ENV HOME=/home/user \
+ #     PATH=/home/user/.local/bin:/home/user/.cargo/bin:$PATH
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:/home/user/.cargo/bin:$PATH
+
+ # Set the working directory in the container
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at /home/user/app
+ COPY --chown=user . $HOME/app
+
+ # Set user to root
+ USER root
+
+ # Install any needed packages specified in requirements.txt
+ RUN python -m venv /venv && \
+     /venv/bin/pip install --no-cache-dir -r requirements.txt
+
+ # Fix broken OpenCV installation for docker container
+ # RUN apt-get update && apt-get install -y libgl1 && apt-get install -y python3-opencv
+ # RUN /venv/bin/pip install opencv-python
+
+ # Set user back to non-root
+ USER user
+
+ # Make port 80 available to the world outside this container
+ EXPOSE 80
+
+ # Generate requirements.txt using pip freeze
+ RUN /venv/bin/pip freeze > requirements.txt
+
+ # Run train_llm.py when the container launches
+ CMD ["/bin/bash", "-c", "source /venv/bin/activate && python train_llm.py"]
app.py ADDED
@@ -0,0 +1,124 @@
+ from datasets import load_dataset
+ from trl import SFTTrainer
+ from peft import LoraConfig
+
+ import os
+ from uuid import uuid4
+ import pandas as pd
+
+ import subprocess
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ def max_token_len(dataset):
+     max_seq_length = 0
+     for row in dataset:
+         tokens = len(tokenizer(row['text'])['input_ids'])
+         if tokens > max_seq_length:
+             max_seq_length = tokens
+     return max_seq_length
+
+ # model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'
+ model_name = 'mistralai/Mistral-7B-v0.1'
+ # model_name = 'distilbert-base-uncased'
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model_max_length = tokenizer.model_max_length
+ print("Model Max Length:", model_max_length)
+
+ # dataset = load_dataset("imdb", split="train")
+ dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
+ dataset = load_dataset(dataset_name)
+
+ # Write dataset files into data directory
+ data_directory = './fine_tune_data/'
+
+ # Create the data directory if it doesn't exist
+ os.makedirs(data_directory, exist_ok=True)
+
+ # Write the train data to a CSV file
+ train_data='train_data'
+ train_filename = os.path.join(data_directory, train_data)
+ dataset['train'].to_pandas().to_csv(train_filename+'.csv', columns=['text'], index=False)
+ max_token_length_train=max_token_len(dataset['train'])
+ print('Max token length train: '+str(max_token_length_train))
+
+ # Write the validation data to a CSV file
+ validation_data='validation_data'
+ validation_filename = os.path.join(data_directory, validation_data)
+ dataset['validation'].to_pandas().to_csv(validation_filename+'.csv', columns=['text'], index=False)
+ max_token_length_validation=max_token_len(dataset['validation'])
+ print('Max token length validation: '+str(max_token_length_validation))
+
+ max_token_length=max(max_token_length_train,max_token_length_validation)
+ if max_token_length > model_max_length:
+     raise ValueError("Maximum token length exceeds model limits.")
+ block_size=2*max_token_length
+ print('Block size: '+str(block_size))
+
+ # Define project parameters
+ username='ai-aerospace'
+ project_name='./llms/'+'ams_data_train-100_'+str(uuid4())
+ repo_name='ams-data-train-100-'+str(uuid4())
+
+ model_params={
+     "project_name": project_name,
+     "model_name": model_name,
+     "repo_id": username+'/'+repo_name,
+     "train_data": train_data,
+     "validation_data": validation_data,
+     "data_directory": data_directory,
+     "block_size": block_size,
+     "model_max_length": max_token_length,
+     "logging_steps": -1,
+     "evaluation_strategy": "epoch",
+     "save_total_limit": 1,
+     "save_strategy": "epoch",
+     "mixed_precision": "fp16",
+     "lr": 0.00003,
+     "epochs": 3,
+     "batch_size": 2,
+     "warmup_ratio": 0.1,
+     "gradient_accumulation": 1,
+     "optimizer": "adamw_torch",
+     "scheduler": "linear",
+     "weight_decay": 0,
+     "max_grad_norm": 1,
+     "seed": 42,
+     "quantization": "int4",
+     "lora_r": 16,
+     "lora_alpha": 32,
+     "lora_dropout": 0.05
+ }
+ for key, value in model_params.items():
+     os.environ[key] = str(value)
+
+ print(model_params)
+
+ ### Load model
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     load_in_4bit=True
+ )
+
+ ### Start trainer
+ # trainer = SFTTrainer(
+ #     model_name,
+ #     train_dataset=dataset,
+ #     dataset_text_field="text",
+ #     max_seq_length=512,
+ # )
+
+ peft_config = LoraConfig(
+     r=model_params['lora_r'],
+     lora_alpha=model_params['lora_alpha'],
+     lora_dropout=model_params['lora_dropout']
+ )
+
+ trainer = SFTTrainer(
+     model,
+     train_dataset=dataset['train'],
+     dataset_text_field="text",
+     peft_config=peft_config,
+     max_seq_length=model_params['model_max_length']
+ )
+
+ trainer.train()
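
Note: once `trainer.train()` finishes, a natural follow-up (not included in this commit) is to persist the LoRA adapter and reload it on top of the base model for a quick smoke test. The sketch below is a hedged continuation of app.py: `adapter_dir` and the prompt are illustrative assumptions, while `trainer.save_model` and `PeftModel.from_pretrained` are the standard Trainer/PEFT entry points.

```python
# Hedged sketch, not part of this commit: save the LoRA adapter produced by
# SFTTrainer above and reload it on top of the base model for a quick check.
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

adapter_dir = "./fine_tune_data/adapter"  # assumed output path, not from the commit

# Save only the adapter weights (the quantized base model stays untouched)
trainer.save_model(adapter_dir)

# Reload: base model first, then attach the trained adapter
base = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", load_in_4bit=True)
tuned = PeftModel.from_pretrained(base, adapter_dir)

# Quick generation smoke test
tok = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
inputs = tok("Aerospace mechanisms are", return_tensors="pt").to(base.device)
output = tuned.generate(**inputs, max_new_tokens=40)
print(tok.decode(output[0], skip_special_tokens=True))
```

Saving this way writes only the small adapter files, so the full 7B checkpoint never needs to be duplicated on disk.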
playground.ipynb ADDED
@@ -0,0 +1,64 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from transformers import AutoModel\n",
+     "import torch"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def calculate_memory_required(model_name):\n",
+     "    model = AutoModel.from_pretrained(model_name)\n",
+     "\n",
+     "    # Calculate total parameters (assuming model parameters and gradients are in FP32)\n",
+     "    total_params = sum(p.numel() for p in model.parameters())\n",
+     "    total_memory_params = total_params * 4  # 4 bytes for FP32\n",
+     "\n",
+     "    # Optimizer states (e.g., for Adam, it's roughly the same as the model parameters)\n",
+     "    optimizer_memory = total_memory_params * 2  # Adam stores two values per parameter\n",
+     "\n",
+     "    # Batch size and sequence length\n",
+     "    batch_size = 32\n",
+     "    sequence_length = 512\n",
+     "    # Estimate activation memory (very rough estimate)\n",
+     "    activation_memory_per_example = sequence_length * model.config.hidden_size * 4  # 4 bytes for FP32\n",
+     "    total_activation_memory = batch_size * activation_memory_per_example\n",
+     "\n",
+     "    # Total estimated memory\n",
+     "    total_estimated_memory = total_memory_params + optimizer_memory + total_activation_memory\n",
+     "\n",
+     "    print(f\"Estimated memory for model and gradients: {total_memory_params / (1024 ** 3):.2f} GB\")\n",
+     "    print(f\"Estimated memory for optimizer states: {optimizer_memory / (1024 ** 3):.2f} GB\")\n",
+     "    print(f\"Estimated memory for activations: {total_activation_memory / (1024 ** 3):.2f} GB\")\n",
+     "    print(f\"Total estimated memory: {total_estimated_memory / (1024 ** 3):.2f} GB\")\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Load model\n",
+     "model_name = 'mistralai/Mistral-7B-v0.1'\n",
+     "calculate_memory_required(model_name)\n"
+    ]
+   }
+  ],
+  "metadata": {
+   "language_info": {
+    "name": "python"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
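
Note: the notebook's formula can also be evaluated by hand without downloading any weights. The sketch below is not part of the commit; the parameter count (~7.24e9) and hidden size (4096) for mistralai/Mistral-7B-v0.1 are assumed approximate figures, and the batch size and sequence length mirror the notebook's defaults.

```python
# Hedged back-of-the-envelope version of the notebook's estimate for Mistral-7B-v0.1,
# using assumed approximate figures instead of loading the model with AutoModel.
total_params = 7.24e9          # approximate parameter count (assumption)
hidden_size = 4096             # Mistral-7B hidden size (assumption)
batch_size, sequence_length = 32, 512
bytes_fp32 = 4

model_memory = total_params * bytes_fp32                                  # weights in FP32
optimizer_memory = model_memory * 2                                       # Adam: two states per parameter
activation_memory = batch_size * sequence_length * hidden_size * bytes_fp32  # rough activation estimate

total = model_memory + optimizer_memory + activation_memory
print(f"Weights: {model_memory / 1024**3:.1f} GB, "
      f"optimizer: {optimizer_memory / 1024**3:.1f} GB, "
      f"activations: {activation_memory / 1024**3:.1f} GB, "
      f"total: {total / 1024**3:.1f} GB")
```

By this rough formula the full-precision, full-parameter setup lands around 80 GB, which is consistent with app.py loading the base model in 4-bit and training only LoRA adapters.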
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,20 @@
+ [tool.poetry]
+ name = "fine-tuning-playground"
+ version = "0.1.0"
+ description = ""
+ authors = ["dsmueller <[email protected]>"]
+ readme = "README.md"
+
+ [tool.poetry.dependencies]
+ python = "^3.11"
+ trl = "^0.7.6"
+ datasets = "^2.16.0"
+ transformers = "^4.36.2"
+ torch = "^2.1.2"
+ ipykernel = "^6.27.1"
+ peft = "^0.7.1"
+
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"