# Data preparation for instruction-tuning the qwenva model: loads the chat
# annotations and turns each conversation into fixed-length training tensors.
import json
import os
import shutil

# Keep this assignment above the qwenva import: presumably qwenva pulls in
# Hugging Face libraries that read HF_ENDPOINT when they are first imported.
os.environ['HF_ENDPOINT']="https://hf-mirror.com"

import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from qwenva import tokenizer
from qwenva import processor
from qwenva import qwenva

images_file_path="/root/autodl-tmp/images"

with open('/root/autodl-tmp/chat.json', 'r', encoding='utf-8') as f:
    chat_data = json.load(f)

# NOTE(review): the literal passed to encode() is empty in this copy of the
# file; an image placeholder string was probably stripped somewhere along the
# way. Confirm and restore it -- encoding an empty string may yield no tokens,
# in which case the [0] below fails.
image_token=tokenizer.encode('')[0]
pad_token=tokenizer.pad_token_id


def process_data(sample,max_len=8012):
    """Tokenize one conversation sample into fixed-length training tensors.

    The conversation is replayed turn by turn through the tokenizer's chat
    template. Tokens contributed by a ``'human'`` turn are masked with -100
    in the labels; tokens contributed by a ``'gpt'`` turn are copied into the
    labels unchanged. The position of the first ``image_token`` keeps the
    image token id as its label instead of -100.

    Args:
        sample: Mapping with a ``'conversations'`` list; every entry is a
            mapping with ``'from'`` (``'human'`` or ``'gpt'``) and ``'value'``.
            Entries with any other ``'from'`` value are ignored.
        max_len: Length every returned sequence is padded or truncated to.

    Returns:
        dict with tensors ``'input_ids'``, ``'attention_mask'``, ``'labels'``
        (each of length ``max_len``) and the scalar tensor ``'image_idx'``
        holding the index of the first image token in the untruncated
        sequence.

    Raises:
        ValueError: if no human turn produced a sequence containing
            ``image_token``. Previously this case printed a message and then
            failed with an unbound-variable error when building the result.
    """
    messages=[]
    input_ids=[]
    labels=[]
    image_index=None

    for item in sample['conversations']:
        previous_len=len(input_ids)
        if item['from']=='human':
            messages.append({'role':'user','content':item['value']})
            input_ids=tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True
            )
            # Everything added by a user turn (including the generation
            # prompt) is excluded from the loss.
            labels+=[-100]*(len(input_ids)-previous_len)
            if image_index is None:
                try:
                    image_index=input_ids.index(image_token)
                except ValueError:
                    print("image token not found")
                else:
                    labels[image_index]=image_token
        elif item['from']=='gpt':
            messages.append({'role':'assistant','content':item['value']})
            input_ids=tokenizer.apply_chat_template(
                messages
            )
            # The assistant reply is what the model is trained to produce.
            labels+=input_ids[previous_len:]

    if image_index is None:
        raise ValueError("image token not found in any human turn of the sample")

    # Pad or truncate so that every sample has exactly max_len positions.
    if len(input_ids)>max_len:
        input_ids=input_ids[:max_len]
        labels=labels[:max_len]
        attention_mask=[1]*len(input_ids)
    else:
        pad_len=max_len-len(input_ids)
        attention_mask=[1]*len(input_ids)+[0]*pad_len
        input_ids=input_ids+[pad_token]*pad_len
        labels=labels+[-100]*(max_len-len(labels))

    # Convert to tensors.
    return {
        'input_ids':torch.tensor(input_ids),
        'attention_mask':torch.tensor(attention_mask),
        'labels':torch.tensor(labels),
        'image_idx':torch.tensor(image_index)
    }
from PIL import Image


class MyDataset(Dataset):
    """Dataset pairing each tokenized conversation with its image tensor.

    Args:
        images_file_path: Directory that the samples' ``'image'`` entries are
            joined onto.
        data: Indexable collection of samples; each one is passed to
            ``process_data`` and must provide an ``'image'`` entry.
        max_len: Sequence length forwarded to ``process_data``.
    """

    def __init__(self, images_file_path,data,max_len=1024):
        self.images_file_path = images_file_path
        self.data = data
        self.max_len=max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``process_data`` dict for ``index`` plus ``'input_pixel'``."""
        sample=self.data[index]
        output_=process_data(sample,max_len=self.max_len)
        img_path=os.path.join(self.images_file_path,sample['image'])
        # Close the file handle deterministically and normalise the mode so
        # grayscale / RGBA / palette files do not reach the processor as-is.
        with Image.open(img_path) as img:
            input_pixel= processor(images=img.convert('RGB'), return_tensors="pt")
        # Drop only the leading batch axis added by return_tensors="pt".
        output_['input_pixel']=input_pixel['pixel_values'].squeeze(0)
        return output_


dataset=MyDataset(images_file_path,chat_data,max_len=360)
train_loader=DataLoader(dataset,batch_size=8,shuffle=True)

import deepspeed
import argparse

# Select the device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
qwenva=qwenva.to(device)
model_engine,optimizer,_,_=deepspeed.initialize(
    model=qwenva,
    args=argparse.Namespace(),
    model_parameters=qwenva.parameters(),
    config_params="./deepspeed_config.json"
)
# To resume from a saved checkpoint:
#checkpoint_path = "/root/autodl-tmp/best_model_2"
#model_engine.load_checkpoint(checkpoint_path)
import torch.optim as optim
import torch.nn as nn
from torch.amp import autocast, GradScaler

loss_fn = nn.CrossEntropyLoss()
accumulation_steps = 2


# Training function.
def train(model_engine, train_dataloader, optimizer, loss_fn, device, epochs,
          save_every=24807, save_dir="/root/autodl-tmp/best_model_instruct"):
    """Run the training loop on ``train_dataloader`` for ``epochs`` epochs.

    Each batch is moved to ``device``, passed through ``model_engine``, and
    the loss between the logits at position ``t`` and the label at position
    ``t + 1`` is back-propagated. ``model_engine.step()`` is called every
    ``accumulation_steps`` batches (module-level setting).

    Args:
        model_engine: Engine returned by ``deepspeed.initialize``.
        train_dataloader: Yields dicts with ``'input_ids'``,
            ``'attention_mask'``, ``'input_pixel'``, ``'labels'`` and
            ``'image_idx'`` tensors.
        optimizer: Optimizer returned by ``deepspeed.initialize``; only
            ``zero_grad()`` is called on it here.
        loss_fn: Callable taking ``(flattened_logits, flattened_labels)``.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_dataloader``.
        save_every: A checkpoint is written whenever the 1-based batch index
            within an epoch is a multiple of this value.
        save_dir: Directory that is wiped and rewritten on every checkpoint.
    """
    model_engine.train()
    for epoch in range(epochs):
        # Show a progress bar with tqdm.
        with tqdm(total=len(train_dataloader), desc=f'Epoch {epoch + 1}/{epochs}', unit='batch') as pbar:
            optimizer.zero_grad()
            for batch_idx, batch in enumerate(train_dataloader):
                # Move the batch to the target device.
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                input_pixel = batch['input_pixel'].to(device)
                labels = batch['labels'].to(device)
                image_idx=batch['image_idx'].to(device)
                logits = model_engine(input_ids, attention_mask, input_pixel,image_idx)

                # Next-token loss: drop the last logit and the first label so
                # position t is scored against token t + 1.
                loss= loss_fn(logits[:, :-1, :].reshape(-1, logits.shape[-1]), labels[:, 1:].reshape(-1).clone())

                # Backward pass.
                model_engine.backward(loss)
                # NOTE(review): DeepSpeed has its own gradient_accumulation_steps
                # setting and its docs call step() after every backward();
                # confirm this manual accumulation does not stack with the
                # value in deepspeed_config.json.
                if (batch_idx+1)%accumulation_steps==0:
                    model_engine.step()
                pbar.update(1)
                pbar.set_postfix(loss=loss.item())  # show the current loss
                if (batch_idx+1)%save_every==0:
                    # Remove any existing checkpoint folder and recreate it.
                    if os.path.exists(save_dir):
                        shutil.rmtree(save_dir)
                    os.makedirs(save_dir)
                    model_engine.save_checkpoint(save_dir)
                    print(f" model saved at batch {batch_idx+1}")


train(model_engine, train_loader, optimizer, loss_fn, device, epochs=1)