Spaces:
Runtime error
Runtime error
Initial commit
Browse files- .gitignore +3 -0
- Dockerfile +34 -0
- README.md +5 -5
- train_llm.py +69 -0
.gitignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.venv/
|
| 2 |
+
__pycache__/
|
| 3 |
+
.env
|
Dockerfile
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use an official Python runtime as a parent image
|
| 2 |
+
FROM python:3.11.1
|
| 3 |
+
|
| 4 |
+
# Set the working directory in the container
|
| 5 |
+
WORKDIR /app
|
| 6 |
+
|
| 7 |
+
# Install poetry
|
| 8 |
+
# RUN pip3 install poetry==1.7.1
|
| 9 |
+
|
| 10 |
+
# Copy the current directory contents into the container at /app
|
| 11 |
+
COPY . .
|
| 12 |
+
|
| 13 |
+
# Install dependencies
|
| 14 |
+
# RUN poetry config virtualenvs.create false \
|
| 15 |
+
# && poetry install --no-interaction --no-ansi
|
| 16 |
+
# Streamlit must be installed separately. Potentially this will cause an issue with dependencies in the future, but it's the only way it works.
|
| 17 |
+
# RUN pip3 install streamlit
|
| 18 |
+
|
| 19 |
+
# Install dependencies
|
| 20 |
+
RUN pip3 install -r requirements.txt
|
| 21 |
+
|
| 22 |
+
# Make a port available to the world outside this container
|
| 23 |
+
# The EXPOSE instruction informs Docker that the container listens on the specified network ports at runtime. Your container needs to listen on Streamlit’s (default) port 8501.
|
| 24 |
+
EXPOSE 8501
|
| 25 |
+
|
| 26 |
+
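# The HEALTHCHECK instruction tells Docker how to test a container to check that it is still working. Your container needs to listen on Streamlit’s (default) port 8501. NOTE(review): the CMD below runs train_llm.py rather than a Streamlit app — confirm that something actually serves this health endpoint.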
|
| 27 |
+
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
| 28 |
+
|
| 29 |
+
# Run the command inside your image filesystem.
|
| 30 |
+
CMD ["python", "train_llm.py"]
|
| 31 |
+
|
| 32 |
+
# Execute with:
|
| 33 |
+
# docker build -t <image_name> .
|
| 34 |
+
# docker run -p 8501:8501 <image_name>
|
README.md
CHANGED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
---
|
| 2 |
title: Autotrain Playground
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
-
|
| 9 |
---
|
| 10 |
|
| 11 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
title: Autotrain Playground
|
| 3 |
+
emoji: 🚀
|
| 4 |
+
colorFrom: gray
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
app_port: 8501
|
| 9 |
---
|
| 10 |
|
| 11 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
train_llm.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import jsonlines
|
| 3 |
+
from uuid import uuid4
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
from datasets import load_dataset
|
| 7 |
+
import subprocess
|
| 8 |
+
from tqdm.notebook import tqdm
|
| 9 |
+
|
| 10 |
+
# from dotenv import load_dotenv,find_dotenv
|
| 11 |
+
# load_dotenv(find_dotenv(),override=True)
|
| 12 |
+
|
| 13 |
+
# Load dataset
|
| 14 |
+
dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
|
| 15 |
+
dataset=load_dataset(dataset_name)
|
| 16 |
+
|
| 17 |
+
# Write dataset files into data directory
|
| 18 |
+
data_directory = '../fine_tune_data/'
|
| 19 |
+
|
| 20 |
+
# Create the data directory if it doesn't exist
|
| 21 |
+
os.makedirs(data_directory, exist_ok=True)
|
| 22 |
+
|
| 23 |
+
# Write the train data to a CSV file
|
| 24 |
+
train_data='train_data.csv'
|
| 25 |
+
train_filename = os.path.join(data_directory, train_data)
|
| 26 |
+
dataset['train'].to_pandas().to_csv(train_filename, columns=['text'], index=False)
|
| 27 |
+
|
| 28 |
+
# Write the validation data to a CSV file
|
| 29 |
+
validation_data='validation_data.csv'
|
| 30 |
+
validation_filename = os.path.join(data_directory, validation_data)
|
| 31 |
+
dataset['validation'].to_pandas().to_csv(validation_filename, columns=['text'], index=False)
|
| 32 |
+
|
| 33 |
+
# Define project parameters
|
| 34 |
+
username='ai-aerospace'
|
| 35 |
+
project_name='./llms/'+'ams_data_train-100_'+str(uuid4())
|
| 36 |
+
repo_name='ams_data_train-100_'+str(uuid4())
|
| 37 |
+
|
| 38 |
+
model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'
|
| 39 |
+
# model_name='mistralai/Mistral-7B-v0.1'
|
| 40 |
+
|
| 41 |
+
# Save parameters to environment variables
|
| 42 |
+
os.environ["project_name"] = project_name
|
| 43 |
+
os.environ["model_name"] = model_name
|
| 44 |
+
os.environ["repo_id"] = username+'/'+repo_name
|
| 45 |
+
os.environ["train_data"] = train_data
|
| 46 |
+
os.environ["validation_data"] = validation_data
|
| 47 |
+
|
| 48 |
+
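# Activate the .venv and execute the autotrain script. NOTE(review): `source` is a bash builtin and subprocess.run(shell=True) runs the command via /bin/sh on POSIX — confirm the container's sh supports it (`.` is the portable spelling).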
|
| 49 |
+
# !autotrain llm --train --project_name my-llm --model TinyLlama/TinyLlama-1.1B-Chat-v0.1 --data_path . --use-peft --use_int4 --learning_rate 2e-4 --train_batch_size 6 --num_train_epochs 3 --trainer sft
|
| 50 |
+
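# The training dataset must be located in the data_path folder. NOTE(review): this note originally said the file must be called training.csv, but this script writes train_data.csv and passes that name via --train_split — confirm which name autotrain expects.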
|
| 51 |
+
command="""
|
| 52 |
+
source ../.venv/bin/activate && autotrain llm --train \
|
| 53 |
+
--project_name ${project_name} \
|
| 54 |
+
--model ${model_name} \
|
| 55 |
+
--data_path ../fine_tune_data \
|
| 56 |
+
--train_split ${train_data} \
|
| 57 |
+
--valid_split ${validation_data} \
|
| 58 |
+
--use-peft \
|
| 59 |
+
--learning_rate 2e-4 \
|
| 60 |
+
--train_batch_size 6 \
|
| 61 |
+
--num_train_epochs 3 \
|
| 62 |
+
--trainer sft \
|
| 63 |
+
--push_to_hub \
|
| 64 |
+
--repo_id ${repo_id} \
|
| 65 |
+
--token $HUGGINGFACE_TOKEN
|
| 66 |
+
"""
|
| 67 |
+
|
| 68 |
+
# Use subprocess.run() to execute the command
|
| 69 |
+
subprocess.run(command, shell=True, check=True, env=os.environ)
|