|
--- |
|
library_name: transformers |
|
tags: |
|
- llama-factory |
|
- yi-vl |
|
- llava |
|
license: other |
|
language: |
|
- zh |
|
- en |
|
pipeline_tag: visual-question-answering |
|
--- |
|
|
|
This is the Hugging Face `transformers` version of the [Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B) model.
|
|
|
You may use this model for fine-tuning on downstream tasks. We recommend using our efficient fine-tuning toolkit, [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory); a minimal training command is sketched below.
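For example, a LoRA fine-tuning run could be launched with `llamafactory-cli train` roughly as follows. This is a minimal sketch, not a tuned recipe: the `mllm_demo` dataset, output directory, and hyperparameters are placeholders to replace with your own.

```bash
llamafactory-cli train \
    --stage sft \
    --do_train \
    --model_name_or_path BUAADreamer/Yi-VL-34B-hf \
    --dataset mllm_demo \
    --template yi_vl \
    --visual_inputs \
    --finetuning_type lora \
    --output_dir saves/yi-vl-34b/lora/sft \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --learning_rate 1e-4 \
    --num_train_epochs 3 \
    --fp16
```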
|
|
|
- **Developed by:** [01-AI](https://www.01.ai/). |
|
- **Language(s) (NLP):** Chinese/English |
|
- **License:** [Yi Series Model License](https://huggingface.co/01-ai/Yi-VL-34B/blob/main/LICENSE) |
|
|
|
Usage: |
|
|
|
```python |
|
import requests
import torch
import transformers
from PIL import Image
from torch import nn
from transformers import AutoModelForVision2Seq, AutoProcessor, LlavaConfig


class LlavaMultiModalProjectorYiVL(nn.Module):
    def __init__(self, config: "LlavaConfig"):
        super().__init__()
        # The LayerNorm layers are named linear_2 / linear_4 so that the
        # parameter names match the keys in the converted checkpoint.
        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
        self.linear_2 = nn.LayerNorm(config.text_config.hidden_size, bias=True)
        self.linear_3 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
        self.linear_4 = nn.LayerNorm(config.text_config.hidden_size, bias=True)
        self.act = nn.GELU()

    def forward(self, image_features):
        hidden_states = self.linear_1(image_features)
        hidden_states = self.linear_2(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_3(hidden_states)
        hidden_states = self.linear_4(hidden_states)
        return hidden_states


# Monkey patching LlavaMultiModalProjector is mandatory: Yi-VL applies a
# LayerNorm after each linear layer, which the stock LLaVA projector lacks,
# so the patch must be in place before the model is loaded.
transformers.models.llava.modeling_llava.LlavaMultiModalProjector = LlavaMultiModalProjectorYiVL

model_id = "BUAADreamer/Yi-VL-34B-hf"

messages = [
    {"role": "user", "content": "What's in the picture?"}
]
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"

# Load the model in half precision on GPU 0.
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
).to(0)
processor = AutoProcessor.from_pretrained(model_id)

# Build the prompt with the model's chat template and fetch the image.
text = [processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)]
images = [Image.open(requests.get(image_file, stream=True).raw)]
inputs = processor(text=text, images=images, return_tensors="pt").to(0, torch.float16)

# Generate a response and keep only the assistant's turn.
output = model.generate(**inputs, max_new_tokens=200)
output = processor.batch_decode(output, skip_special_tokens=True)[0]
print(output.split("Assistant:")[-1].strip())
|
``` |
|
|
|
Alternatively, you can launch a web demo with the CLI command from [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory):
|
|
|
```bash |
|
llamafactory-cli webchat \
    --model_name_or_path BUAADreamer/Yi-VL-34B-hf \
    --template yi_vl \
    --visual_inputs
|
``` |