Add new files and update dependencies
- .gitignore +4 -0
- Dockerfile +44 -0
- app.py +124 -0
- playground.ipynb +64 -0
- poetry.lock +0 -0
- pyproject.toml +20 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
+.venv/
+__pycache__/
+.env
+cache
Dockerfile
ADDED
@@ -0,0 +1,44 @@
+# Use an official Python runtime as a parent image
+FROM python:3.11.1
+
+# Set up user
+RUN useradd -m -u 1000 user
+USER user
+
+# Install Cargo (Rust's package manager), needed for hf_transfer
+# RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
+
+# Set up environment variables
+# ENV HOME=/home/user \
+#     PATH=/home/user/.local/bin:/home/user/.cargo/bin:$PATH
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:/home/user/.cargo/bin:$PATH
+
+# Set the working directory in the container
+WORKDIR $HOME/app
+
+# Copy the current directory contents into the container at /home/user/app
+COPY --chown=user . $HOME/app
+
+# Set user to root
+USER root
+
+# Install any needed packages specified in requirements.txt
+RUN python -m venv /venv && \
+    /venv/bin/pip install --no-cache-dir -r requirements.txt
+
+# Fix broken OpenCV installation for docker container
+# RUN apt-get update && apt-get install -y libgl1 && apt-get install -y python3-opencv
+# RUN /venv/bin/pip install opencv-python
+
+# Set user back to non-root
+USER user
+
+# Make port 80 available to the world outside this container
+EXPOSE 80
+
+# Generate requirements.txt using pip freeze
+RUN /venv/bin/pip freeze > requirements.txt
+
+# Run train_llm.py when the container launches
+CMD ["/bin/bash", "-c", "source /venv/bin/activate && python train_llm.py"]
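Note: the CMD above launches train_llm.py, which is not among the files added in this commit; the training script added here is app.py. If train_llm.py does not already exist elsewhere in the repository, the launch line would presumably need to point at app.py instead (an assumption, not something this commit states):

    CMD ["/bin/bash", "-c", "source /venv/bin/activate && python app.py"]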
app.py
ADDED
@@ -0,0 +1,124 @@
+from datasets import load_dataset
+from trl import SFTTrainer
+from peft import LoraConfig
+
+import os
+from uuid import uuid4
+import pandas as pd
+
+import subprocess
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+def max_token_len(dataset):
+    max_seq_length = 0
+    for row in dataset:
+        tokens = len(tokenizer(row['text'])['input_ids'])
+        if tokens > max_seq_length:
+            max_seq_length = tokens
+    return max_seq_length
+
+# model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'
+model_name = 'mistralai/Mistral-7B-v0.1'
+# model_name = 'distilbert-base-uncased'
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model_max_length = tokenizer.model_max_length
+print("Model Max Length:", model_max_length)
+
+# dataset = load_dataset("imdb", split="train")
+dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
+dataset = load_dataset(dataset_name)  # load all splits; 'train' and 'validation' are indexed below
+
+# Write dataset files into data directory
+data_directory = './fine_tune_data/'
+
+# Create the data directory if it doesn't exist
+os.makedirs(data_directory, exist_ok=True)
+
+# Write the train data to a CSV file
+train_data='train_data'
+train_filename = os.path.join(data_directory, train_data)
+dataset['train'].to_pandas().to_csv(train_filename+'.csv', columns=['text'], index=False)
+max_token_length_train=max_token_len(dataset['train'])
+print('Max token length train: '+str(max_token_length_train))
+
+# Write the validation data to a CSV file
+validation_data='validation_data'
+validation_filename = os.path.join(data_directory, validation_data)
+dataset['validation'].to_pandas().to_csv(validation_filename+'.csv', columns=['text'], index=False)
+max_token_length_validation=max_token_len(dataset['validation'])
+print('Max token length validation: '+str(max_token_length_validation))
+
+max_token_length=max(max_token_length_train,max_token_length_validation)
+if max_token_length > model_max_length:
+    raise ValueError("Maximum token length exceeds model limits.")
+block_size=2*max_token_length
+print('Block size: '+str(block_size))
+
+# Define project parameters
+username='ai-aerospace'
+project_name='./llms/'+'ams_data_train-100_'+str(uuid4())
+repo_name='ams-data-train-100-'+str(uuid4())
+
+model_params={
+    "project_name": project_name,
+    "model_name": model_name,
+    "repo_id": username+'/'+repo_name,
+    "train_data": train_data,
+    "validation_data": validation_data,
+    "data_directory": data_directory,
+    "block_size": block_size,
+    "model_max_length": max_token_length,
+    "logging_steps": -1,
+    "evaluation_strategy": "epoch",
+    "save_total_limit": 1,
+    "save_strategy": "epoch",
+    "mixed_precision": "fp16",
+    "lr": 0.00003,
+    "epochs": 3,
+    "batch_size": 2,
+    "warmup_ratio": 0.1,
+    "gradient_accumulation": 1,
+    "optimizer": "adamw_torch",
+    "scheduler": "linear",
+    "weight_decay": 0,
+    "max_grad_norm": 1,
+    "seed": 42,
+    "quantization": "int4",
+    "lora_r": 16,
+    "lora_alpha": 32,
+    "lora_dropout": 0.05
+}
+for key, value in model_params.items():
+    os.environ[key] = str(value)
+
+print(model_params)
+
+### Load model
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    load_in_4bit=True
+)
+
+### Start trainer
+# trainer = SFTTrainer(
+#     model_name,
+#     train_dataset=dataset,
+#     dataset_text_field="text",
+#     max_seq_length=512,
+# )
+
+peft_config = LoraConfig(
+    r=model_params['lora_r'],
+    lora_alpha=model_params['lora_alpha'],
+    lora_dropout=model_params['lora_dropout']
+)
+
+trainer = SFTTrainer(
+    model,
+    train_dataset=dataset['train'],
+    dataset_text_field="text",
+    peft_config=peft_config,
+    max_seq_length=model_params['model_max_length']
+)
+
+trainer.train()
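The max_token_len helper above tokenizes one row at a time, which gets slow on larger datasets. As a minimal alternative sketch (not part of the commit), assuming the tokenizer and loaded dataset objects defined in app.py, the whole text column can be encoded in one batched call:

    # Hypothetical batched variant of max_token_len; not in the commit.
    # `split` is one split of the loaded DatasetDict, e.g. dataset['train'].
    def max_token_len_batched(split, tokenizer):
        encodings = tokenizer(split['text'])  # split['text'] is the full list of strings
        return max(len(ids) for ids in encodings['input_ids'])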
playground.ipynb
ADDED
@@ -0,0 +1,64 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoModel\n",
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def calculate_memory_required(model_name):\n",
+    "    model = AutoModel.from_pretrained(model_name)\n",
+    "\n",
+    "    # Calculate total parameters (assuming model parameters and gradients are in FP32)\n",
+    "    total_params = sum(p.numel() for p in model.parameters())\n",
+    "    total_memory_params = total_params * 4  # 4 bytes for FP32\n",
+    "\n",
+    "    # Optimizer states (e.g., for Adam, it's roughly the same as the model parameters)\n",
+    "    optimizer_memory = total_memory_params * 2  # Adam stores two values per parameter\n",
+    "\n",
+    "    # Batch size and sequence length\n",
+    "    batch_size = 32\n",
+    "    sequence_length = 512\n",
+    "    # Estimate activation memory (very rough estimate)\n",
+    "    activation_memory_per_example = sequence_length * model.config.hidden_size * 4  # 4 bytes for FP32\n",
+    "    total_activation_memory = batch_size * activation_memory_per_example\n",
+    "\n",
+    "    # Total estimated memory\n",
+    "    total_estimated_memory = total_memory_params + optimizer_memory + total_activation_memory\n",
+    "\n",
+    "    print(f\"Estimated memory for model and gradients: {total_memory_params / (1024 ** 3):.2f} GB\")\n",
+    "    print(f\"Estimated memory for optimizer states: {optimizer_memory / (1024 ** 3):.2f} GB\")\n",
+    "    print(f\"Estimated memory for activations: {total_activation_memory / (1024 ** 3):.2f} GB\")\n",
+    "    print(f\"Total estimated memory: {total_estimated_memory / (1024 ** 3):.2f} GB\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load model\n",
+    "model_name = 'mistralai/Mistral-7B-v0.1'\n",
+    "calculate_memory_required(model_name)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
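The notebook's estimate can be sanity-checked without downloading any weights. A back-of-the-envelope sketch (not part of the commit), plugging in Mistral-7B's publicly listed figures of roughly 7.24 billion parameters and hidden size 4096:

    # Rough arithmetic mirroring calculate_memory_required, using assumed
    # Mistral-7B figures instead of loading the model.
    total_params = 7.24e9          # approximate parameter count (model card)
    hidden_size = 4096             # hidden dimension (model config)
    batch_size, sequence_length = 32, 512

    params_gb = total_params * 4 / 1024**3       # FP32 weights: ~27 GB
    optimizer_gb = params_gb * 2                 # Adam states: ~54 GB
    activations_gb = batch_size * sequence_length * hidden_size * 4 / 1024**3  # ~0.25 GB
    print(f"total ~{params_gb + optimizer_gb + activations_gb:.0f} GB")        # ~81 GB

Full FP32 fine-tuning of this model is therefore far beyond a single consumer GPU, which is consistent with app.py loading the model with load_in_4bit=True and training LoRA adapters instead.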
poetry.lock
ADDED
The diff for this file is too large to render.
pyproject.toml
ADDED
@@ -0,0 +1,20 @@
+[tool.poetry]
+name = "fine-tuning-playground"
+version = "0.1.0"
+description = ""
+authors = ["dsmueller <[email protected]>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.11"
+trl = "^0.7.6"
+datasets = "^2.16.0"
+transformers = "^4.36.2"
+torch = "^2.1.2"
+ipykernel = "^6.27.1"
+peft = "^0.7.1"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
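As a quick sanity check (not part of the commit), the pinned packages can be compared against what is actually installed in the environment:

    # Hypothetical check that installed versions satisfy the pins above.
    import importlib.metadata as md
    for pkg in ["trl", "datasets", "transformers", "torch", "peft", "ipykernel"]:
        print(pkg, md.version(pkg))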