mylesgoose committed · Commit 72940e6 · verified · 1 Parent(s): 5fd87da

Update README.md

Files changed (1): README.md (+116 -5)
README.md CHANGED
@@ -1,5 +1,116 @@
- ---
- license: other
- license_name: meta
- license_link: https://ai.meta.com/llama/licence
- ---

---
license: other
license_name: meta
license_link: https://ai.meta.com/llama/licence
datasets:
- toshi456/llava_pretrain_blip_laion_cc_sbu_558k_ja
base_model: mylesgoose/Meta-Llama-3.1-8B-Instruct-goose-abliterated
---
Install https://github.com/LLaVA-VL/LLaVA-NeXT/tree/main before running the example below. Thanks to that team for their fantastic work.
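
If you want a quick check that the install worked before running the full example, a minimal sketch like the one below should do. It only assumes the package is importable as `llava` and that the `llava_llama_3` conversation template used in the example is registered.

```python
# Minimal install sanity check: confirm LLaVA-NeXT is importable and that the
# conversation template used in the example below is registered.
from llava.conversation import conv_templates

assert "llava_llama_3" in conv_templates, "LLaVA-NeXT install looks incomplete"
print(f"LLaVA-NeXT OK, {len(conv_templates)} conversation templates available")
```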

You can test the model with something like this:

![image/jpeg](https://cdn-uploads.huggingface.co/production/uploads/65069ffda7ba30bf62aea321/XJRK1McipixmNVUyiL5v1.jpeg)

```python
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
from llava.conversation import conv_templates, SeparatorStyle

from PIL import Image
import requests
import copy
import torch

pretrained = "mylesgoose/Meta-Llama-3.1-8B-Instruct-goose-abliterated-pre-llava"
model_name = "llava_llama3"
device = "cuda"
device_map = "auto"
# Add any other llava_model_args you want to pass to load_pretrained_model.
tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map)

model.eval()
model.tie_weights()

# PIL cannot open a URL directly, so download the image first.
url = "https://cdn-uploads.huggingface.co/production/uploads/65069ffda7ba30bf62aea321/XJRK1McipixmNVUyiL5v1.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
image_tensor = process_images([image], image_processor, model.config)
image_tensor = [_image.to(dtype=torch.float16, device=device) for _image in image_tensor]

conv_template = "llava_llama_3"
question = DEFAULT_IMAGE_TOKEN + "\nWhat is shown in this image? Is there anything strange about this image? Is this normal behaviour?"
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
prompt_question = conv.get_prompt()

# tokenizer_image_token returns only the input ids (with the image placeholder
# spliced in), so build the attention mask manually.
input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
input_ids = input_ids.unsqueeze(0).to(device)
attention_mask = torch.ones_like(input_ids)
image_sizes = [image.size]

cont = model.generate(
    input_ids,
    images=image_tensor,
    image_sizes=image_sizes,
    attention_mask=attention_mask,
    do_sample=True,
    temperature=0.9,
    max_new_tokens=256,
)
text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)
print(text_outputs)
```
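
As background on the `tokenizer_image_token` call above: it splits the prompt on the `<image>` placeholder (`DEFAULT_IMAGE_TOKEN`), tokenizes the text pieces, and splices in the `IMAGE_TOKEN_INDEX` sentinel (a negative id, -200 in LLaVA) where the image goes; the projected vision features are substituted at that slot during generation. A small illustration, reusing the `tokenizer` loaded above:

```python
# Where does the image placeholder land in the input ids?
demo_ids = tokenizer_image_token(
    DEFAULT_IMAGE_TOKEN + "\nDescribe this image.",
    tokenizer,
    IMAGE_TOKEN_INDEX,
    return_tensors="pt",
)
# Exactly one position holds the sentinel; vision features are inserted there.
print((demo_ids == IMAGE_TOKEN_INDEX).nonzero(as_tuple=True)[0])
```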

I trained the Llama 3.1 model by integrating the Google SigLIP vision encoder. This is a base model: it has only the encoder integrated into it, and it has not been trained on any closed datasets, only what is listed above. The pretraining script I used is below, followed by a sketch of the projector it trains.

```bash
LLM_VERSION="mylesgoose/Meta-Llama-3.1-8B-Instruct-goose-abliterated"
LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
VISION_MODEL_VERSION="google/siglip-so400m-patch14-384"
VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"

############### Pretrain ################

PROMPT_VERSION=plain

BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain"
echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"

deepspeed llava/train/train_mem.py \
    --deepspeed scripts/zero3.json \
    --model_name_or_path ${LLM_VERSION} \
    --version ${PROMPT_VERSION} \
    --data_path ./data/llava_data/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json \
    --image_folder ./data/llava_data/LLaVA-Pretrain/images \
    --vision_tower ${VISION_MODEL_VERSION} \
    --mm_tunable_parts="mm_mlp_adapter" \
    --mm_vision_select_layer -2 \
    --mm_projector_type mlp2x_gelu \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --bf16 True \
    --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \
    --num_train_epochs 1 \
    --per_device_train_batch_size 6 \
    --per_device_eval_batch_size 6 \
    --gradient_accumulation_steps 6 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 500 \
    --learning_rate 1e-3 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 131072 \
    --gradient_checkpointing True \
    --dataloader_num_workers 6 \
    --lazy_preprocess True \
    --report_to wandb \
    --run_name $BASE_RUN_NAME \
    --attn_implementation flash_attention_2
```
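
For context on what this pretraining stage actually updates: with `--mm_tunable_parts="mm_mlp_adapter"` only the `mlp2x_gelu` projector is trained, a small two-layer MLP that maps SigLIP patch features into the Llama embedding space, while the vision tower and the language model stay frozen. The sketch below illustrates that shape; it is not the exact LLaVA-NeXT class, and the 1152 (SigLIP so400m), 4096 (Llama 3.1 8B), and 729-patch sizes are assumptions based on the model names above.

```python
import torch
import torch.nn as nn

# Illustrative stand-in for the mlp2x_gelu projector trained by the script above.
# Assumed sizes: SigLIP so400m hidden size 1152 -> Llama 3.1 8B hidden size 4096.
class Mlp2xGeluProjector(nn.Module):
    def __init__(self, vision_hidden_size: int = 1152, llm_hidden_size: int = 4096):
        super().__init__()
        self.proj = nn.Sequential(
            nn.Linear(vision_hidden_size, llm_hidden_size),
            nn.GELU(),
            nn.Linear(llm_hidden_size, llm_hidden_size),
        )

    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
        # image_features: (batch, num_patches, vision_hidden_size)
        return self.proj(image_features)

projector = Mlp2xGeluProjector()
dummy_patches = torch.randn(1, 729, 1152)  # assumed patch count for siglip-so400m-patch14-384
print(projector(dummy_patches).shape)  # torch.Size([1, 729, 4096])
```

Since only `mm_mlp_adapter` is tunable here, these projector weights are what gets saved to `./checkpoints/projectors/${BASE_RUN_NAME}`.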