Safetensors
qwenva / deepspeed_pretrain.py
jiangchengchengNLP's picture
Upload 5 files
4ab846c verified
raw
history blame
6.11 kB
import os
from tqdm import tqdm
import shutil
# Point Hugging Face downloads at a mirror. This is set before the qwenva
# import below -- presumably because that import triggers model/tokenizer
# loading that reads HF_ENDPOINT; TODO confirm.
os.environ['HF_ENDPOINT']="https://hf-mirror.com"
# Project-local objects: tokenizer, image processor and the model itself.
from qwenva import tokenizer
from qwenva import processor
from qwenva import qwenva
# Directory against which each sample's 'image' filename is resolved.
images_file_path="/root/autodl-tmp/images"
import torch
from torch.utils.data import Dataset, DataLoader
import os
import json
from PIL import Image
import json
# Load the conversation records used for training. Each record is later
# indexed with 'conversations' and 'image'.
with open('/root/autodl-tmp/chat.json', 'r', encoding='utf-8') as f:
    chat_data = json.load(f)
# Id of the first token produced when encoding the literal '<image>' marker.
# NOTE(review): this assumes the tokenizer does not prepend special tokens and
# maps '<image>' to a single id -- confirm against the tokenizer config.
image_token=tokenizer.encode('<image>')[0]
# Id used to right-pad sequences up to the fixed training length.
pad_token=tokenizer.pad_token_id
def process_data(sample,max_len=8012):
    """Tokenize one multi-turn conversation into fixed-length training tensors.

    The turns in ``sample['conversations']`` are rendered incrementally with
    ``tokenizer.apply_chat_template``. Tokens added by a ``'human'`` turn
    (rendered with the generation prompt) are masked with ``-100`` in the
    labels; tokens added by a ``'gpt'`` turn are copied into the labels.
    Turns with any other ``'from'`` value are ignored. The result is then
    truncated or right-padded to exactly ``max_len`` positions.

    Args:
        sample: Mapping with a ``'conversations'`` list whose items carry
            ``'from'`` (``'human'`` or ``'gpt'``) and ``'value'``.
        max_len: Length of every returned sequence.

    Returns:
        Dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``
        (1-D tensors of length ``max_len``) and ``'image_idx'`` (0-dim
        tensor holding the position of the first image token).

    Raises:
        ValueError: If the image token never appears in the tokenized
            conversation, so no image position can be reported.
    """
    messages=[]
    input_ids=[]
    labels=[]
    image_index=None
    for item in sample['conversations']:
        speaker=item['from']
        if speaker=='human':
            prev_len=len(input_ids)
            messages.append({'role':'user','content':item['value']})
            input_ids=tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True
            )
            # Prompt tokens never contribute to the loss.
            labels+=[-100]*(len(input_ids)-prev_len)
            if image_index is None:
                try:
                    image_index=input_ids.index(image_token)
                except ValueError:
                    print("image token not found")
                else:
                    # The image position keeps the image token id as its
                    # label instead of -100.
                    # NOTE(review): presumably the model treats this slot
                    # specially -- confirm against the model's forward.
                    labels[image_index]=image_token
        elif speaker=='gpt':
            prev_len=len(input_ids)
            messages.append({'role':'assistant','content':item['value']})
            input_ids=tokenizer.apply_chat_template(
                messages
            )
            # Everything the assistant turn appended is a prediction target.
            labels+=input_ids[prev_len:]
    if image_index is None:
        # Fail with a clear message instead of an unbound-variable error
        # when building the tensor below.
        raise ValueError("image token not found in conversation")
    # Pad or truncate so every sample has the same length.
    # NOTE(review): truncation can cut the image token off while image_idx
    # still points past max_len -- confirm max_len is large enough.
    seq_len=len(input_ids)
    if seq_len>max_len:
        input_ids=input_ids[:max_len]
        labels=labels[:max_len]
        attention_mask=[1]*max_len
    else:
        pad_count=max_len-seq_len
        attention_mask=[1]*seq_len+[0]*pad_count
        input_ids=input_ids+[pad_token]*pad_count
        labels=labels+[-100]*(max_len-len(labels))
    # Convert to tensors.
    return {
        'input_ids':torch.tensor(input_ids),
        'attention_mask':torch.tensor(attention_mask),
        'labels':torch.tensor(labels),
        'image_idx':torch.tensor(image_index)
    }
import os
import torch
from torch.utils.data import Dataset
from PIL import Image
class MyDataset(Dataset):
    """Dataset pairing each tokenized conversation with its processed image.

    Args:
        images_file_path: Directory that each record's ``'image'`` filename
            is joined onto.
        data: Indexable collection of records; each record is passed to
            ``process_data`` and must provide an ``'image'`` entry.
        max_len: Sequence length forwarded to ``process_data``.
    """

    def __init__(self, images_file_path,data,max_len=1024):
        self.images_file_path = images_file_path
        self.data = data
        self.max_len=max_len

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the token tensors for record ``index`` plus ``'input_pixel'``."""
        sample=self.data[index]
        output_=process_data(sample,max_len=self.max_len)
        img_path=os.path.join(self.images_file_path,sample['image'])
        # Keep the file open only while the processor reads it, so the handle
        # is released deterministically.
        with Image.open(img_path) as img:
            input_pixel= processor(images=img, return_tensors="pt")
        # squeeze() drops every size-1 dimension -- presumably just the
        # leading batch dimension added by the processor; TODO confirm.
        output_['input_pixel']=input_pixel['pixel_values'].squeeze()
        return output_
# Build the training set with sequences fixed to 360 tokens, then wrap it in
# a shuffling loader that yields batches of 8 samples.
dataset=MyDataset(
    images_file_path=images_file_path,
    data=chat_data,
    max_len=360,
)
train_loader=DataLoader(
    dataset=dataset,
    batch_size=8,
    shuffle=True,
)
import deepspeed
import argparse
# Select the device: CUDA when available, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
qwenva=qwenva.to(device)
# Wrap the model in a DeepSpeed engine configured from the JSON file; only the
# engine and the optimizer it returns are kept.
model_engine,optimizer,_,_=deepspeed.initialize(
    args=argparse.Namespace(),
    model=qwenva,
    model_parameters=qwenva.parameters(),
    config_params="./deepspeed_config.json",
)
#checkpoint_path = "/root/autodl-tmp/best_model_2"
#model_engine.load_checkpoint(checkpoint_path)
import torch.optim as optim
import torch.nn as nn
from torch.amp import autocast, GradScaler
# Token-level cross entropy. Positions labelled -100 (prompt and padding) are
# skipped; -100 is already the default ignore_index and is spelled out here.
loss_fn = nn.CrossEntropyLoss(ignore_index=-100)
# Number of batches whose gradients are accumulated before each engine step.
accumulation_steps = 2
# Training function
def train(model_engine, train_dataloader, optimizer, loss_fn, device, epochs,
          save_every=24807, checkpoint_dir="/root/autodl-tmp/best_model_instruct"):
    """Run the next-token training loop on a DeepSpeed engine.

    For every batch the engine produces logits, the loss is computed between
    the logits at positions ``[:-1]`` and the labels at positions ``[1:]``,
    and ``model_engine.backward`` is called. ``model_engine.step`` is called
    once every ``accumulation_steps`` batches (module-level constant). A batch
    whose index is not a multiple of ``accumulation_steps`` is back-propagated
    but not stepped inside this loop.

    NOTE(review): DeepSpeed normally handles gradient accumulation through the
    ``gradient_accumulation_steps`` config entry and expects ``step()`` after
    every ``backward()``; confirm this manual scheme matches the config file.

    Args:
        model_engine: DeepSpeed engine, called as
            ``model_engine(input_ids, attention_mask, input_pixel, image_idx)``.
        train_dataloader: Sized iterable of batches with keys ``'input_ids'``,
            ``'attention_mask'``, ``'input_pixel'``, ``'labels'`` and
            ``'image_idx'``.
        optimizer: Optimizer whose gradients are zeroed at the start of each
            epoch.
        loss_fn: Callable taking flattened logits and flattened labels.
        device: Device every batch tensor is moved to.
        epochs: Number of passes over ``train_dataloader``.
        save_every: Save a checkpoint each time this many batches of an epoch
            have been processed; a falsy value disables saving.
        checkpoint_dir: Directory that is wiped and rewritten on every save.
    """
    model_engine.train()
    for epoch in range(epochs):
        # Progress bar advancing once per batch.
        with tqdm(total=len(train_dataloader), desc=f'Epoch {epoch + 1}/{epochs}', unit='batch') as pbar:
            optimizer.zero_grad()
            for batch_idx, batch in enumerate(train_dataloader):
                # Move the batch to the target device.
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                input_pixel = batch['input_pixel'].to(device)
                labels = batch['labels'].to(device)
                image_idx=batch['image_idx'].to(device)
                logits = model_engine(input_ids, attention_mask, input_pixel,image_idx)
                # Shift by one so position t is scored against token t+1.
                shifted_logits=logits[:, :-1, :].reshape(-1, logits.shape[-1])
                shifted_labels=labels[:, 1:].reshape(-1)
                loss= loss_fn(shifted_logits, shifted_labels)
                # Back-propagate through the engine.
                model_engine.backward(loss)
                if (batch_idx+1)%accumulation_steps==0:
                    model_engine.step()
                pbar.update(1)
                pbar.set_postfix(loss=loss.item())  # show the current loss
                if save_every and (batch_idx+1)%save_every==0:
                    # Start from an empty directory so only the newest
                    # checkpoint is kept.
                    if os.path.exists(checkpoint_dir):
                        shutil.rmtree(checkpoint_dir)
                    os.makedirs(checkpoint_dir)
                    model_engine.save_checkpoint(checkpoint_dir)
                    print(f" model saved at batch {batch_idx+1}")
# Run a single training epoch with the objects built above.
train(
    model_engine=model_engine,
    train_dataloader=train_loader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    device=device,
    epochs=1,
)