mylesgoose
/

Meta-Llama-3.1-8B-Instruct-goose-abliterated-pre-llava

Safetensors

llama

Model card Files Files and versions Community

mylesgoose commited on Sep 10, 2024

Commit

72940e6

verified ·

1 Parent(s): 5fd87da

Update README.md

Browse files

Files changed (1) hide show

README.md +116 -5

README.md CHANGED Viewed

@@ -1,5 +1,116 @@
----
-license: other
-license_name: meta
-license_link: https://ai.meta.com/llama/licence
----

+---
+license: other
+license_name: meta
+license_link: https://ai.meta.com/llama/licence
+datasets:
+- toshi456/llava_pretrain_blip_laion_cc_sbu_558k_ja
+base_model: mylesgoose/Meta-Llama-3.1-8B-Instruct-goose-abliterated
+---
+Install [LLaVA-NeXT](https://github.com/LLaVA-VL/LLaVA-NeXT/tree/main) before running the example below. Thanks to that team for their fantastic work.
+You can test the model with something like this:
+![image/jpeg](https://cdn-uploads.huggingface.co/production/uploads/65069ffda7ba30bf62aea321/XJRK1McipixmNVUyiL5v1.jpeg)
+""""
+from llava.model.builder import load_pretrained_model
+from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
+from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
+from llava.conversation import conv_templates, SeparatorStyle
+from PIL import Image
+import requests
+import copy
+import torch
+pretrained = "mylesgoose/Meta-Llama-3.1-8B-Instruct-goose-abliterated-pre-llava"
+model_name = "llava_llama3"
+device = "cuda"
+device_map = "auto"
+tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map) # Add any other thing you want to pass in llava_model_args
+model.eval()
+model.tie_weights()
+image = Image.open("https://cdn-uploads.huggingface.co/production/uploads/65069ffda7ba30bf62aea321/XJRK1McipixmNVUyiL5v1.jpeg")
+image_tensor = process_images([image], image_processor, model.config)
+image_tensor = [_image.to(dtype=torch.float16, device=device) for _image in image_tensor]
+conv_template = "llava_llama_3"
+question = DEFAULT_IMAGE_TOKEN + "\nWhat is shown in this image? Is there anything strange about this image? Is this normal behaviour"
+conv = copy.deepcopy(conv_templates[conv_template])
+conv.append_message(conv.roles[0], question)
+conv.append_message(conv.roles[1], None)
+prompt_question = conv.get_prompt()
+# Check if tokenizer_image_token returns the attention mask
+input_ids, attention_mask = tokenizer_image_token(
+    prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
+)
+input_ids = input_ids.unsqueeze(0).to(device)
+image_sizes = [image.size]
+# If attention_mask is not returned, create it manually (adjust as needed)
+if attention_mask is None:
+    attention_mask = torch.ones_like(input_ids)
+    attention_mask[:, :IMAGE_TOKEN_INDEX] = 1
+    attention_mask[:, IMAGE_TOKEN_INDEX+1:] = 1
+cont = model.generate(
+    input_ids,
+    images=image_tensor,
+    image_sizes=image_sizes,
+    attention_mask=attention_mask,
+    do_sample=True,
+    temperature=0.9,
+    max_new_tokens=256,
+)
+text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)
+print(text_outputs)
+"""
+I trained the Llama 3.1 model, integrating the Google vision encoder (`google/siglip-so400m-patch14-384`). This is a base model: it has only the encoder integrated into it. It has not been trained on any closed datasets, nor on any data other than what is listed. The pretraining script is shown below.
+LLM_VERSION="mylesgoose/Meta-Llama-3.1-8B-Instruct-goose-abliterated"
+LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
+VISION_MODEL_VERSION="google/siglip-so400m-patch14-384"
+VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
+############### Pretrain ################
+PROMPT_VERSION=plain
+BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain"
+echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"
+deepspeed llava/train/train_mem.py \
+    --deepspeed scripts/zero3.json \
+    --model_name_or_path ${LLM_VERSION} \
+    --version ${PROMPT_VERSION} \
+    --data_path ./data/llava_data/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json \
+    --image_folder ./data/llava_data/LLaVA-Pretrain/images \
+    --vision_tower ${VISION_MODEL_VERSION} \
+    --mm_tunable_parts="mm_mlp_adapter" \
+    --mm_vision_select_layer -2 \
+    --mm_projector_type mlp2x_gelu \
+    --mm_use_im_start_end False \
+    --mm_use_im_patch_token False \
+    --bf16 True \
+    --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 6 \
+    --per_device_eval_batch_size 6 \
+    --gradient_accumulation_steps 6 \
+    --evaluation_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 500 \
+    --learning_rate 1e-3 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --tf32 True \
+    --model_max_length 131072 \
+    --gradient_checkpointing True \
+    --dataloader_num_workers 6 \
+    --lazy_preprocess True \
+    --report_to wandb \
+    --run_name $BASE_RUN_NAME \
+    --attn_implementation flash_attention_2