vllm
patrickvonplaten committed on
Commit 2cb116a
1 Parent(s): afe31b3

Update README.md

Files changed (1)
  1. README.md +22 -10
README.md CHANGED
@@ -80,26 +80,38 @@ You can also make use of a ready-to-go [docker image](https://hub.docker.com/lay
```py
from vllm import LLM
from vllm.sampling_params import SamplingParams
+ from vllm.inputs.data import TokensPrompt
+ import requests
+ from PIL import Image
+ from io import BytesIO
+ from vllm.multimodal import MultiModalDataBuiltins

- model_name = "mistralai/Pixtral-12B-Base-2409"
+ from mistral_common.protocol.instruct.messages import TextChunk, ImageURLChunk

+ model_name = "mistralai/Pixtral-12B-Base-2409"
sampling_params = SamplingParams(max_tokens=8192)

llm = LLM(model=model_name, tokenizer_mode="mistral")

- prompt = "Describe this image in one sentence."
- image_url = "https://picsum.photos/id/237/200/300"
+ url = "https://huggingface.co/datasets/patrickvonplaten/random_img/resolve/main/yosemite.png"
+ response = requests.get(url)
+ image = Image.open(BytesIO(response.content))
+
+ prompt = "The image shows a"
+
+ user_content = [ImageURLChunk(image_url=url), TextChunk(text=prompt)]

- messages = [
-     {
-         "role": "user",
-         "content": [{"type": "text", "text": prompt}, {"type": "image_url", "image_url": {"url": image_url}}]
-     },
- ]
+ tokenizer = llm.llm_engine.tokenizer.tokenizer.mistral.instruct_tokenizer
+ tokens, _ = tokenizer.encode_user_content(user_content, False)

- outputs = llm.generate(messages, sampling_params=sampling_params)
+ prompt = TokensPrompt(
+     prompt_token_ids=tokens, multi_modal_data=MultiModalDataBuiltins(image=[image])
+ )
+ outputs = llm.generate(prompt, sampling_params=sampling_params)

print(outputs[0].outputs[0].text)
+ # ' view of a river flowing through the landscape, with prominent rock formations visible on either side of the river. The scene is captured using the UWA 14-24mm zoom lens, which provides a wide-angle perspective,
+ # allowing for a comprehensive view of the surroundings. The photo is credited to Greg Dowdy.
```
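The replacement snippet reaches into vLLM internals (`llm.llm_engine.tokenizer.tokenizer.mistral.instruct_tokenizer`) to borrow the underlying `mistral_common` tokenizer; that attribute chain is an implementation detail and may move between vLLM releases. As a minimal sketch, the same encoding step can presumably be done with `mistral_common` directly, assuming the package exposes `MistralTokenizer.from_model` and accepts `"pixtral"` as a model name:

```py
# Sketch only: build the multimodal prompt tokens with mistral_common directly,
# instead of reaching through vLLM's private tokenizer attributes.
# Assumptions: MistralTokenizer.from_model exists and "pixtral" resolves to the
# Pixtral tokenizer; encode_user_content mirrors the call used in the diff above.
from mistral_common.protocol.instruct.messages import ImageURLChunk, TextChunk
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

url = "https://huggingface.co/datasets/patrickvonplaten/random_img/resolve/main/yosemite.png"

mm_tokenizer = MistralTokenizer.from_model("pixtral").instruct_tokenizer
tokens, _ = mm_tokenizer.encode_user_content(
    [ImageURLChunk(image_url=url), TextChunk(text="The image shows a")], False
)
# `tokens` can then be wrapped in TokensPrompt exactly as in the diff above.
```

Either way, the substance of the change is the same: Pixtral-12B-Base-2409 is a base model, so the example feeds a pre-tokenized completion-style prompt ("The image shows a") through `TokensPrompt` with `multi_modal_data`, rather than a chat-style `messages` list.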

### Mistral-inference