BleachNick committed · Commit ab9a3c5 · Parent(s): 1a96366
Update README.md

README.md CHANGED
````diff
@@ -7,8 +7,15 @@ library_name: transformers
 
 # Model Card for MMICL
 
+# News 🚀
+1. [09-19] We have converted the MMICL demo to a permanent link: [Demo for MMICL](http://www.testmmicl.work). The Vicuna version of MMICL and Chat Mode are still under development, so they may require careful adjustment of the generation parameters and may not work correctly.
+2. [09-15] Our [paper](https://arxiv.org/abs/2309.07915) has been uploaded to arXiv.
+3. [09-01] The [MIC](https://huggingface.co/datasets/BleachNick/MIC_full) data has been released on the Hugging Face Hub.
+4. [08-23] Reached 1st place on [MME](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation) and 1st place on [MMBench](https://opencompass.org.cn/leaderboard-multimodal).
+5. [08-21] The [MMICL-FLANT5XXL](https://huggingface.co/BleachNick/MMICL-Instructblip-T5-xxl) and [MMICL-Tiny](https://huggingface.co/BleachNick/MMICL-Instructblip-T5-xl) models have been released on the Hugging Face Hub.
+
 ## Temporary Demo for MMICL
-[Playground for MMICL-FLANT5XXL](
+[Playground for MMICL-FLANT5XXL](http://www.testmmicl.work/)
 It supports multi-image input as well as video input.
 <!-- Provide a quick summary of what the model is/does. -->
 
@@ -54,29 +61,33 @@ import transformers
 from PIL import Image
 import torch
 model_type="instructblip"
-model_ckpt="
-
-config = InstructBlipConfig.from_pretrained(
+model_ckpt="/home/haozhezhao/MMICL-Instructblip-T5-xxl"
+processor_ckpt = "Salesforce/instructblip-flan-t5-xxl"
+config = InstructBlipConfig.from_pretrained(model_ckpt)
 
 if 'instructblip' in model_type:
     model = InstructBlipForConditionalGeneration.from_pretrained(
         model_ckpt,
         config=config).to('cuda:0',dtype=torch.bfloat16)
 
-
-sp = [
-
+image_placeholder="图"
+sp = [image_placeholder]+[f"<image{i}>" for i in range(20)]
 processor = InstructBlipProcessor.from_pretrained(
-
+    processor_ckpt
 )
-
-
 sp = sp+processor.tokenizer.additional_special_tokens[len(sp):]
 processor.tokenizer.add_special_tokens({'additional_special_tokens':sp})
+if model.qformer.embeddings.word_embeddings.weight.shape[0] != len(processor.qformer_tokenizer):
+    model.qformer.resize_token_embeddings(len(processor.qformer_tokenizer))
+replace_token="".join(32*[image_placeholder])
+
 
+image = Image.open("images/cal_num1.png")
+image1 = Image.open("images/cal_num2.png")
+image2 = Image.open("images/cal_num3.png")
+images = [image,image1,image2]
 
-prompt = ['Use the image 0: <image0
-# images try to load the images to be a list of PIL.Image object.
+prompt = [f'Use the image 0: <image0>{replace_token},image 1: <image1>{replace_token} and image 2: <image2>{replace_token} as a visual aid to help you calculate the equation accurately. image 0 is 2+1=3.\nimage 1 is 5+6=11.\nimage 2 is"']
 prompt = " ".join(prompt)
 
 inputs = processor(images=images, text=prompt, return_tensors="pt")
@@ -90,10 +101,16 @@ outputs = model.generate(
     pixel_values = inputs['pixel_values'],
     input_ids = inputs['input_ids'],
     attention_mask = inputs['attention_mask'],
-    img_mask = inputs['img_mask']
+    img_mask = inputs['img_mask'],
+    do_sample=False,
+    max_length=50,
+    min_length=1,
+    set_min_padding_size=False,
 )
 generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
 print(generated_text)
+# output: 3x6=18"
+
 
 ```
 
````
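The placeholder scheme this commit introduces is easier to see outside the diff: each image appears in the prompt as `image k: <imagek>` followed by 32 copies of the placeholder character `图`, one slot for each visual embedding spliced in for that image. A minimal sketch of the expansion (the `image_slot` helper is ours, for illustration only; it is not in the README):

```python
# Sketch of how the updated example's prompt string is assembled.
image_placeholder = "图"
replace_token = "".join(32 * [image_placeholder])  # 32 placeholder slots per image

def image_slot(i: int) -> str:
    # Hypothetical helper: "<image{i}>" is a special token tagging which
    # image the run of placeholders that follows refers to.
    return f"image {i}: <image{i}>{replace_token}"

prompt = (
    f"Use the {image_slot(0)},{image_slot(1)} and {image_slot(2)} "
    "as a visual aid to help you calculate the equation accurately. "
    'image 0 is 2+1=3.\nimage 1 is 5+6=11.\nimage 2 is"'
)
print(prompt[:60])  # -> Use the image 0: <image0>图图图…
```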
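Read end to end, the post-commit example assembles into the following script. This is a consolidated sketch of the hunks above, not the verbatim README: it assumes MMICL's modified `transformers` fork (the stock `InstructBlipForConditionalGeneration.generate` accepts neither `img_mask` nor `set_min_padding_size`), and the two device-transfer lines are our assumption, since the diff elides the lines between the second and third hunks.

```python
import torch
from PIL import Image
# These classes are imported earlier in the README (not shown in this diff);
# MMICL requires its modified transformers fork for the extra generate kwargs.
from transformers import (InstructBlipConfig, InstructBlipProcessor,
                          InstructBlipForConditionalGeneration)

model_type = "instructblip"
model_ckpt = "/home/haozhezhao/MMICL-Instructblip-T5-xxl"
processor_ckpt = "Salesforce/instructblip-flan-t5-xxl"

config = InstructBlipConfig.from_pretrained(model_ckpt)
if "instructblip" in model_type:
    model = InstructBlipForConditionalGeneration.from_pretrained(
        model_ckpt, config=config).to("cuda:0", dtype=torch.bfloat16)

# Register the visual placeholder plus 20 per-image index tags as special tokens.
image_placeholder = "图"
sp = [image_placeholder] + [f"<image{i}>" for i in range(20)]
processor = InstructBlipProcessor.from_pretrained(processor_ckpt)
sp = sp + processor.tokenizer.additional_special_tokens[len(sp):]
processor.tokenizer.add_special_tokens({"additional_special_tokens": sp})
# Grow the Q-Former embedding table if the new tokens enlarged its vocabulary.
if model.qformer.embeddings.word_embeddings.weight.shape[0] != len(processor.qformer_tokenizer):
    model.qformer.resize_token_embeddings(len(processor.qformer_tokenizer))
replace_token = "".join(32 * [image_placeholder])

images = [Image.open(f"images/cal_num{i}.png") for i in (1, 2, 3)]
prompt = (
    f"Use the image 0: <image0>{replace_token},"
    f"image 1: <image1>{replace_token} and image 2: <image2>{replace_token} "
    "as a visual aid to help you calculate the equation accurately. "
    'image 0 is 2+1=3.\nimage 1 is 5+6=11.\nimage 2 is"'
)

inputs = processor(images=images, text=prompt, return_tensors="pt")
inputs = inputs.to("cuda:0")                                        # assumed: the diff
inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)  # elides this transfer

outputs = model.generate(
    pixel_values=inputs["pixel_values"],
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    img_mask=inputs["img_mask"],
    do_sample=False,
    max_length=50,
    min_length=1,
    set_min_padding_size=False,
)
print(processor.batch_decode(outputs, skip_special_tokens=True)[0].strip())
# README's expected output: 3x6=18"
```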
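Since decoding is greedy (`do_sample=False`), repeated runs on the same hardware should reproduce the `# output: 3x6=18"` line; the trailing quotation mark appears to echo the stray `"` at the end of the prompt string.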