---
library_name: transformers
pipeline_tag: text-generation
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
base_model:
- Qwen/Qwen2-VL-7B-Instruct
---

This model is for debugging. It is randomly initialized, using the config from [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) but with a much smaller size.

Usage:

```python
from PIL import Image
import requests
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

model_id = "yujiepan/qwen2-vl-tiny-random"

# Load the model in half-precision on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id, torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_id)

# Download the example image
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
image = Image.open(requests.get(url, stream=True).raw)

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
# Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'

inputs = processor(
    text=[text_prompt], images=[image], padding=True, return_tensors="pt"
)
inputs = inputs.to("cuda")

# Generate and decode only the newly produced tokens
output_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(inputs.input_ids, output_ids)
]
output_text = processor.batch_decode(
    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
print(output_text)
```
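Since the weights are random, the generated text is meaningless; the checkpoint is only useful for smoke-testing a pipeline. As a quick sanity check that the tiny variant was loaded, you can count parameters (a minimal sketch, reusing the `model` object from the snippet above):

```python
# Sanity check (assumes `model` from the snippet above): the tiny variant
# should have on the order of millions of parameters, not billions.
num_params = sum(p.numel() for p in model.parameters())
print(f"{num_params / 1e6:.2f}M parameters")
```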
Code to create this repo:

```python
import os

import requests
import torch
from PIL import Image
from transformers import AutoConfig, AutoProcessor, GenerationConfig, set_seed
from transformers.models.qwen2_vl import Qwen2VLForConditionalGeneration

model_id = "Qwen/Qwen2-VL-7B-Instruct"
repo_id = "yujiepan/qwen2-vl-tiny-random"
save_path = f"/tmp/{repo_id}"

# Shrink both the language model and the vision tower
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
config.hidden_size = 16
config.intermediate_size = 32
config.num_attention_heads = 2
config.num_hidden_layers = 2
config.num_key_value_heads = 1
config.vision_config.embed_dim = 16
config.vision_config.num_heads = 2
config.vision_config.hidden_size = 16
config.vision_config.depth = 2
config.rope_scaling['mrope_section'] = [1, 1, 2]  # must sum to head_dim // 2 == 4 here

model = Qwen2VLForConditionalGeneration(config=config)
model = model.to(torch.bfloat16).cuda().eval()
model.generation_config = GenerationConfig.from_pretrained(
    model_id, trust_remote_code=True,
)

# Reinitialize all weights with a fixed seed for reproducibility
set_seed(42)
with torch.no_grad():
    for _, p in sorted(model.named_parameters()):
        torch.nn.init.uniform_(p, -0.3, 0.3)

processor = AutoProcessor.from_pretrained(model_id)
model.save_pretrained(save_path)
processor.save_pretrained(save_path)
os.system(f"ls -alh {save_path}")


def try_inference():
    url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
    image = Image.open(requests.get(url, stream=True).raw)
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
    processor = AutoProcessor.from_pretrained(save_path)
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        save_path, torch_dtype=torch.bfloat16, device_map="cuda"
    )
    text_prompt = processor.apply_chat_template(
        conversation, add_generation_prompt=True
    )
    inputs = processor(
        text=[text_prompt], images=[image], padding=True, return_tensors="pt"
    )
    inputs = inputs.to("cuda")
    output_ids = model.generate(**inputs, max_new_tokens=16)
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(inputs.input_ids, output_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    print(output_text)


try_inference()
```
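The script only saves the artifacts under `/tmp/...`; publishing them to the Hub is a separate step. A minimal sketch, assuming you are already authenticated (e.g. via `huggingface-cli login`):

```python
# Hypothetical publishing step, not part of the script above; requires prior
# authentication and reuses the `repo_id` variable defined earlier.
model.push_to_hub(repo_id)
processor.push_to_hub(repo_id)
```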