---
base_model: HuggingFaceM4/Idefics3-8B-Llama3
library_name: peft
license: apache-2.0
tags:
- generated_from_trainer
model-index:
- name: idefics3-llama-gui-dense-descriptions
  results: []
datasets:
- Agent-Eval-Refine/GUI-Dense-Descriptions
language:
- en
---
|
|
# idefics3-llama-gui-dense-descriptions |
|
|
|
This model is a fine-tuned version of [HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) on the [Agent-Eval-Refine/GUI-Dense-Descriptions](https://huggingface.co/datasets/Agent-Eval-Refine/GUI-Dense-Descriptions) dataset.
|
|
|
## Finetuning script |
|
|
|
```python
# !pip install git+https://github.com/andimarafioti/transformers.git@e1b7c0a05ab65e4ddb62a407fe12f8ec13a916f0
# !pip install accelerate datasets peft bitsandbytes
# !pip install flash-attn --no-build-isolation

import os

import torch
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoProcessor,
    BitsAndBytesConfig,
    Idefics3ForConditionalGeneration,
    Trainer,
    TrainingArguments,
)

notebook_login()

gui_dense_desc_dataset = load_dataset("Agent-Eval-Refine/GUI-Dense-Descriptions")
train_ds = gui_dense_desc_dataset["train"]

# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "2"

USE_LORA = False
USE_QLORA = True
model_id = "HuggingFaceM4/Idefics3-8B-Llama3"

processor = AutoProcessor.from_pretrained(model_id)

if USE_QLORA or USE_LORA:
    lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules=[
            "down_proj",
            "o_proj",
            "k_proj",
            "q_proj",
            "gate_proj",
            "up_proj",
            "v_proj",
        ],
        use_dora=not USE_QLORA,  # DoRA only for plain (non-quantized) LoRA
        init_lora_weights="gaussian",
    )
    lora_config.inference_mode = False
    if USE_QLORA:
        # 4-bit NF4 quantization with double quantization for QLoRA
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

    model = Idefics3ForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=bnb_config if USE_QLORA else None,
        _attn_implementation="flash_attention_2",
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    model.add_adapter(lora_config)
    model.enable_adapters()
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)
    print(model.get_nb_trainable_parameters())
else:
    model = Idefics3ForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        _attn_implementation="flash_attention_2",
        device_map="auto",
    )

# Freeze the vision encoder so that only the language model is fine-tuned
for param in model.model.vision_model.parameters():
    param.requires_grad = False

image_token_id = processor.tokenizer.additional_special_tokens_ids[
    processor.tokenizer.additional_special_tokens.index("<image>")
]


def collate_fn(examples):
    texts = []
    images = []
    for example in examples:
        image = example["image"]
        image_description = example["text"]
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {
                        "type": "text",
                        "text": "Provide a detailed description of the image.",
                    },
                ],
            },
            {
                "role": "assistant",
                "content": [{"type": "text", "text": image_description}],
            },
        ]
        text = processor.apply_chat_template(messages, add_generation_prompt=False)
        texts.append(text.strip())
        images.append([image])

    batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
    # Mask padding and image tokens so they don't contribute to the loss
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100
    labels[labels == image_token_id] = -100
    batch["labels"] = labels

    return batch


training_args = TrainingArguments(
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    warmup_steps=50,
    learning_rate=1e-4,
    weight_decay=0.01,
    logging_steps=5,
    save_strategy="steps",
    save_steps=250,
    save_total_limit=1,
    optim="adamw_torch",
    bf16=True,
    output_dir="./idefics3-llama-gui-dense-descriptions",
    hub_model_id="idefics3-llama-gui-dense-descriptions",
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_ds,
)

trainer.train()

trainer.push_to_hub()
```
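
Note that the script attaches the LoRA configuration twice: once via `add_adapter`/`enable_adapters` and again when `get_peft_model` wraps the model. This mirrors the reference Idefics fine-tuning notebooks and trains fine, but a leaner variant (a sketch of an assumption, not the exact script that produced this checkpoint) wraps the model only once:

```python
# Minimal sketch (assumption): attach the LoRA adapter exactly once.
# `model` and `lora_config` are as defined in the script above.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```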
|
|
|
Training took approximately 40 minutes on 2× H100 GPUs (80 GB each).
|
|
|
## Intended usage |
|
|
|
```python
import torch
from peft import PeftModel
from transformers import AutoProcessor, Idefics3ForConditionalGeneration
from transformers.image_utils import load_image

adapter_path = "Maverick17/idefics3-llama-gui-dense-descriptions"
base_model_id = "HuggingFaceM4/Idefics3-8B-Llama3"

# Load the base model
model = Idefics3ForConditionalGeneration.from_pretrained(
    base_model_id,
    _attn_implementation="flash_attention_2",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Merge the LoRA adapter into the base model
peft_model = PeftModel.from_pretrained(model, adapter_path)
merged_model = peft_model.merge_and_unload()

processor = AutoProcessor.from_pretrained(base_model_id)

image = load_image("path/to/ui/image.png")

# Create inputs
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {
                "type": "text",
                "text": "Provide a detailed description of the image.",
            },
        ],
    },
]

prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt")
inputs = {k: v.to("cuda") for k, v in inputs.items()}

generation_args = {
    "max_new_tokens": 1024,
    "repetition_penalty": 1.0,
    "do_sample": False,
}
generation_args.update(inputs)

# Generate with the merged model, then decode only the newly generated tokens
generated_ids = merged_model.generate(**generation_args)

generated_texts = processor.batch_decode(
    generated_ids[:, generation_args["input_ids"].size(1) :], skip_special_tokens=True
)

print(generated_texts[0].strip())
```
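
If you run inference repeatedly, you can persist the merged weights once and reload them as a regular Transformers checkpoint, skipping the merge step on later loads; keeping the adapter un-merged via `PeftModel.from_pretrained` as above also works if you prefer to leave the base weights untouched. A minimal sketch (the output directory name is an arbitrary assumption):

```python
# Save the merged model and processor for direct reuse (hypothetical path)
save_dir = "idefics3-llama-gui-dense-descriptions-merged"
merged_model.save_pretrained(save_dir)
processor.save_pretrained(save_dir)

# Later, load it like any regular checkpoint:
# model = Idefics3ForConditionalGeneration.from_pretrained(
#     save_dir, torch_dtype=torch.bfloat16, device_map="auto"
# )
```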
|
|
|
## Training procedure |
|
|
|
### Training hyperparameters |
|
|
|
The following hyperparameters were used during training:
- learning_rate: 0.0001
- train_batch_size: 2
- eval_batch_size: 8
- seed: 42
- gradient_accumulation_steps: 8
- total_train_batch_size: 16
- optimizer: AdamW (adamw_torch) with betas=(0.9, 0.999) and epsilon=1e-08
- lr_scheduler_type: linear
- lr_scheduler_warmup_steps: 50
- num_epochs: 1
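
The total batch size is consistent with a single data-parallel process: `device_map="auto"` shards the model across the two GPUs rather than replicating it, so each optimizer update sees per_device_train_batch_size × gradient_accumulation_steps examples (a reading of the numbers above, not something stated in the training logs):

```python
# How total_train_batch_size = 16 falls out of the settings above
# (assumes one data-parallel process, since device_map="auto" shards the model)
per_device_train_batch_size = 2
gradient_accumulation_steps = 8
num_data_parallel_processes = 1
total_train_batch_size = (
    per_device_train_batch_size
    * gradient_accumulation_steps
    * num_data_parallel_processes
)
assert total_train_batch_size == 16
```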
|
|
|
### Framework versions |
|
|
|
- PEFT 0.13.0
- Transformers 4.44.0.dev0
- Pytorch 2.4.1+cu121
- Datasets 3.0.1
- Tokenizers 0.19.1
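
To approximate this environment, the pinned releases can be installed as below; the Transformers build comes from the development commit referenced at the top of the fine-tuning script, and the exact PyTorch CUDA wheel (here cu121) may differ on your system. A sketch, not a tested lockfile:

```python
# !pip install peft==0.13.0 datasets==3.0.1 tokenizers==0.19.1 torch==2.4.1
# !pip install git+https://github.com/andimarafioti/transformers.git@e1b7c0a05ab65e4ddb62a407fe12f8ec13a916f0
```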