RaushanTurganbay HF staff committed on
Commit
f00efbd
·
verified ·
1 Parent(s): 5d2b324

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +65 -7
README.md CHANGED
@@ -5,6 +5,9 @@ license: other
5
  license_name: tongyi-qianwen-research
6
  license_link: LICENSE
7
  pipeline_tag: image-text-to-text
 
 
 
8
  ---
9
 
10
  # LLaVA Interleave Model Card
@@ -42,10 +45,23 @@ import requests
42
 
43
  model_id = "llava-hf/llava-interleave-qwen-7b-dpo-hf"
44
  pipe = pipeline("image-to-text", model=model_id)
45
- url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
46
 
 
47
  image = Image.open(requests.get(url, stream=True).raw)
48
- prompt = "<|im_start|>user <image>\nWhat does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud<|im_end|><|im_start|>assistant"
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
51
  print(outputs)
@@ -63,10 +79,6 @@ import torch
63
  from transformers import AutoProcessor, LlavaForConditionalGeneration
64
 
65
  model_id = "llava-hf/llava-interleave-qwen-7b-dpo-hf"
66
-
67
- prompt = "<|im_start|>user <image>\nWhat are these?|im_end|><|im_start|>assistant"
68
- image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
69
-
70
  model = LlavaForConditionalGeneration.from_pretrained(
71
  model_id,
72
  torch_dtype=torch.float16,
@@ -75,13 +87,29 @@ model = LlavaForConditionalGeneration.from_pretrained(
75
 
76
  processor = AutoProcessor.from_pretrained(model_id)
77
 
 
 
 
 
78
 
 
 
 
 
 
 
 
 
 
 
79
  raw_image = Image.open(requests.get(image_file, stream=True).raw)
80
  inputs = processor(prompt, raw_image, return_tensors='pt').to(0, torch.float16)
81
 
82
  output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
83
  print(processor.decode(output[0][2:], skip_special_tokens=True))
84
  ```
 
 
85
  When prompting with videos/3D/multi-view input, prompt as follows:
86
 
87
  ```python
@@ -89,17 +117,47 @@ When prompting with videos/3D/multi-view input, prompt like following:
89
 
90
  image_tokens = "<image>" * n
91
  prompt = f"<|im_start|>user {image_tokens}\nWhat are these?<|im_end|><|im_start|>assistant"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  ```
93
 
94
  When prompting with interleaved images and videos, prompt as follows:
95
 
96
  ```python
97
  # two interleaved images
98
- prompt = "<|im_start|>user <image><image>\nWhat are these?|im_end|><|im_start|>assistant"
99
 
100
  # two interleaved videos, if you downsampled n frames in total from both videos
101
  image_tokens = "<image>" * n
102
  prompt = f"<|im_start|>user {image_tokens}\nWhat are these?<|im_end|><|im_start|>assistant"
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  ```
104
 
105
 
 
5
  license_name: tongyi-qianwen-research
6
  license_link: LICENSE
7
  pipeline_tag: image-text-to-text
8
+ tags:
9
+ - vision
10
+ - image-text-to-text
11
  ---
12
 
13
  # LLaVA Interleave Model Card
 
45
 
46
  model_id = "llava-hf/llava-interleave-qwen-7b-dpo-hf"
47
  pipe = pipeline("image-to-text", model=model_id)
 
48
 
49
+ url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
50
  image = Image.open(requests.get(url, stream=True).raw)
51
+
52
+ # Define a chat history and use `apply_chat_template` to get a correctly formatted prompt
53
+ # Each value in "content" has to be a list of dicts with types ("text", "image")
54
+ conversation = [
55
+ {
56
+
57
+ "role": "user",
58
+ "content": [
59
+ {"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},
60
+ {"type": "image"},
61
+ ],
62
+ },
63
+ ]
64
+ prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
65
 
66
  outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
67
  print(outputs)
 
79
  from transformers import AutoProcessor, LlavaForConditionalGeneration
80
 
81
  model_id = "llava-hf/llava-interleave-qwen-7b-dpo-hf"
 
 
 
 
82
  model = LlavaForConditionalGeneration.from_pretrained(
83
  model_id,
84
  torch_dtype=torch.float16,
 
87
 
88
  processor = AutoProcessor.from_pretrained(model_id)
89
 
90
+ # Define a chat history and use `apply_chat_template` to get a correctly formatted prompt
91
+ # Each value in "content" has to be a list of dicts with types ("text", "image")
92
+ conversation = [
93
+ {
94
 
95
+ "role": "user",
96
+ "content": [
97
+ {"type": "text", "text": "What are these?"},
98
+ {"type": "image"},
99
+ ],
100
+ },
101
+ ]
102
+ prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
103
+
104
+ image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
105
  raw_image = Image.open(requests.get(image_file, stream=True).raw)
106
  inputs = processor(prompt, raw_image, return_tensors='pt').to(0, torch.float16)
107
 
108
  output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
109
  print(processor.decode(output[0][2:], skip_special_tokens=True))
110
  ```
111
+
112
+
113
  When prompting with videos/3D/multi-view input, prompt as follows:
114
 
115
  ```python
 
117
 
118
  image_tokens = "<image>" * n
119
  prompt = f"<|im_start|>user {image_tokens}\nWhat are these?<|im_end|><|im_start|>assistant"
120
+
121
+ # With the chat template, if you sampled n frames you need n image entries in one conversation turn (5 frames in this example)
122
+ conversation = [
123
+ {
124
+
125
+ "role": "user",
126
+ "content": [
127
+ {"type": "text", "text": "What are these?"},
128
+ {"type": "image"},
129
+ {"type": "image"},
130
+ {"type": "image"},
131
+ {"type": "image"},
132
+ {"type": "image"},
133
+ ],
134
+ },
135
+ ]
136
+ prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
137
  ```
138
 
139
  When prompting with interleaved images and videos, prompt as follows:
140
 
141
  ```python
142
  # two interleaved images
143
+ prompt = "<|im_start|>user <image><image>\nWhat is the difference between these two images?<|im_end|><|im_start|>assistant"
144
 
145
  # two interleaved videos, if you downsampled n frames in total from both videos
146
  image_tokens = "<image>" * n
147
  prompt = f"<|im_start|>user {image_tokens}\nWhat are these?<|im_end|><|im_start|>assistant"
148
+
149
+ # The chat template in interleaved format works the same as when sampling videos: just pass in as many images as you want for a prompt
150
+ conversation = [
151
+ {
152
+
153
+ "role": "user",
154
+ "content": [
155
+ {"type": "text", "text": "What is the difference between these two images?"},
156
+ {"type": "image"},
157
+ {"type": "image"},
158
+ ],
159
+ },
160
+ ]
161
  ```
162
 
163