aarbelle committed · Commit ad5cce3 · verified · 1 Parent(s): 377b16e

Update README.md

Files changed (1): README.md (+14 -5)

README.md CHANGED
@@ -54,6 +54,7 @@ Granite Vision model is supported natively `transformers` from the `main` branch
 
 ```python
 from transformers import AutoProcessor, AutoModelForVision2Seq
+from huggingface_hub import hf_hub_download
 import torch
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -63,14 +64,19 @@ processor = AutoProcessor.from_pretrained(model_path)
 model = AutoModelForVision2Seq.from_pretrained(model_path).to(device)
 
 # prepare image and text prompt, using the appropriate prompt template
-url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
+
+img_path = hf_hub_download(repo_id=model_path, filename='example.png')
 
 conversation = [
+    {
+        "role": "system",
+        "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
+    },
     {
         "role": "user",
         "content": [
-            {"type": "image", "url": url},
-            {"type": "text", "text": "What is shown in this image?"},
+            {"type": "image", "url": img_path},
+            {"type": "text", "text": "What is the highest scoring model on ChartQA and what is its score?"},
         ],
     },
 ]
@@ -101,6 +107,8 @@ Then, copy the snippet from the section that is relevant for your use case.
 ```python
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
+from huggingface_hub import hf_hub_download
+from PIL import Image
 
 model_path = "ibm-granite/granite-vision-3.1-2b-preview"
 
@@ -118,9 +126,10 @@ sampling_params = SamplingParams(
 image_token = "<image>"
 system_prompt = "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
 
-question = "What type of flower is this?"
+question = "What is the highest scoring model on ChartQA and what is its score?"
 prompt = f"{system_prompt}<|user|>\n{image_token}\n{question}\n<|assistant|>\n"
-image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+img_path = hf_hub_download(repo_id=model_path, filename='example.png')
+image = Image.open(img_path).convert("RGB")
 print(image)
 
 # Build the inputs to vLLM; the image is passed as `multi_modal_data`.
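
For reference, here is a minimal, self-contained sketch of how the updated `transformers` example reads end to end after this change. The `apply_chat_template`, `generate`, and `decode` calls are assumed from the unchanged remainder of the README and are not part of this diff:

```python
# Sketch only: mirrors the updated README snippet; the generation/decoding
# calls below are assumptions about the unchanged part of the example.
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from huggingface_hub import hf_hub_download

device = "cuda" if torch.cuda.is_available() else "cpu"
model_path = "ibm-granite/granite-vision-3.1-2b-preview"
processor = AutoProcessor.from_pretrained(model_path)
model = AutoModelForVision2Seq.from_pretrained(model_path).to(device)

# Download the example image that this commit switches the README to use.
img_path = hf_hub_download(repo_id=model_path, filename='example.png')

conversation = [
    {
        "role": "system",
        "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "url": img_path},
            {"type": "text", "text": "What is the highest scoring model on ChartQA and what is its score?"},
        ],
    },
]

# Tokenize the chat (text + image) and generate an answer.
inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(device)
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))
```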
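
Likewise, a hedged sketch of the updated vLLM example carried through to the `multi_modal_data` step that the README's closing comment refers to. The `LLM(...)` construction and `SamplingParams` values are assumptions, not part of this diff:

```python
# Sketch only: the LLM() arguments and sampling settings are assumed;
# only the imports, prompt, and image loading reflect this commit.
from vllm import LLM, SamplingParams
from huggingface_hub import hf_hub_download
from PIL import Image

model_path = "ibm-granite/granite-vision-3.1-2b-preview"
llm = LLM(model=model_path)                                        # assumed construction
sampling_params = SamplingParams(temperature=0.2, max_tokens=64)   # assumed values

image_token = "<image>"
system_prompt = "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n"

question = "What is the highest scoring model on ChartQA and what is its score?"
prompt = f"{system_prompt}<|user|>\n{image_token}\n{question}\n<|assistant|>\n"
img_path = hf_hub_download(repo_id=model_path, filename='example.png')
image = Image.open(img_path).convert("RGB")

# Build the inputs to vLLM; the image is passed as `multi_modal_data`.
inputs = {
    "prompt": prompt,
    "multi_modal_data": {"image": image},
}
outputs = llm.generate(inputs, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
```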