---
library_name: transformers
tags:
- llama-factory
- yi-vl
- llava
license: other
language:
- zh
- en
pipeline_tag: visual-question-answering
---
This is the Hugging Face `transformers` version of the [Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B) model.
You may use this model for fine-tuning on downstream tasks; we recommend using our efficient fine-tuning toolkit, [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) (see the example command below the model details).
- **Developed by:** [01-AI](https://www.01.ai/).
- **Language(s) (NLP):** Chinese/English
- **License:** [Yi Series Model License](https://huggingface.co/01-ai/Yi-VL-34B/blob/main/LICENSE)
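A minimal LoRA fine-tuning sketch with the LLaMA-Factory CLI. The `mllm_demo` dataset and the hyperparameters below are illustrative placeholders; replace them with your own multimodal dataset and settings.

```bash
llamafactory-cli train \
    --stage sft \
    --do_train \
    --model_name_or_path BUAADreamer/Yi-VL-34B-hf \
    --dataset mllm_demo \
    --template yi_vl \
    --visual_inputs \
    --finetuning_type lora \
    --output_dir saves/yi-vl-34b-lora \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --learning_rate 1e-5 \
    --num_train_epochs 3 \
    --fp16
```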
Usage:
```python
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq, LlavaConfig
import transformers
from torch import nn
# Yi-VL uses a different multimodal projector (Linear -> LayerNorm -> GELU -> Linear -> LayerNorm)
# than the stock LLaVA implementation, so it must be patched in before loading the checkpoint.
class LlavaMultiModalProjectorYiVL(nn.Module):
    def __init__(self, config: "LlavaConfig"):
        super().__init__()
        # Module names must stay "linear_1" ... "linear_4" to match the checkpoint weights,
        # even though linear_2 and linear_4 are LayerNorm layers.
        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
        self.linear_2 = nn.LayerNorm(config.text_config.hidden_size, bias=True)
        self.linear_3 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
        self.linear_4 = nn.LayerNorm(config.text_config.hidden_size, bias=True)
        self.act = nn.GELU()

    def forward(self, image_features):
        hidden_states = self.linear_1(image_features)
        hidden_states = self.linear_2(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_3(hidden_states)
        hidden_states = self.linear_4(hidden_states)
        return hidden_states

# Monkey patching LlavaMultiModalProjector is mandatory before loading the model.
transformers.models.llava.modeling_llava.LlavaMultiModalProjector = LlavaMultiModalProjectorYiVL
model_id = "BUAADreamer/Yi-VL-34B-hf"
messages = [
    {"role": "user", "content": "<image>What's in the picture?"}
]
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"

model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
).to(0)
processor = AutoProcessor.from_pretrained(model_id)

text = [processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)]
images = [Image.open(requests.get(image_file, stream=True).raw)]
inputs = processor(text=text, images=images, return_tensors="pt").to(0, torch.float16)

output = model.generate(**inputs, max_new_tokens=200)
# batch_decode returns a list of strings; keep the first (and only) sample.
output = processor.batch_decode(output, skip_special_tokens=True)[0]
print(output.split("Assistant:")[-1].strip())
```
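The 34B checkpoint in float16 occupies roughly 70 GB, so `.to(0)` only works on a device with that much memory. A sketch of an alternative, assuming multiple GPUs and the `accelerate` package are available, that shards the model automatically instead of placing it on a single device:

```python
# Replace the single-device placement above with automatic sharding across available GPUs.
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
)
# Put the inputs on the device of the first model shard; accelerate dispatches the rest.
inputs = processor(text=text, images=images, return_tensors="pt").to(model.device, torch.float16)
output = model.generate(**inputs, max_new_tokens=200)
```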
Alternatively, you can launch a web demo using the CLI command from [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory):
```bash
llamafactory-cli webchat \
--model_name_or_path BUAADreamer/Yi-VL-34B-hf \
--template yi_vl \
--visual_inputs
```