anas-awadalla committed on
Commit
e548f8b
·
1 Parent(s): f569b1b

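Add third and fourth in-context demo examples and switch to the RedPajama-INCITE-Instruct-3B language model and checkpoint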

Browse files
app.py CHANGED
@@ -7,20 +7,22 @@ import os
7
  login(token=os.environ["HUGGINGFACE_TOKEN"])
8
 
9
  demo_imgs = [
10
- ["images/chinchilla_web-1024x683.jpg", "images/shiba-inu-dog-in-the-snow.jpg"],
11
- ["images/900.jpeg", "images/hummus.jpg"],
12
- ["images/COCO_train2014_000000572279.jpg", "images/COCO_train2014_000000194806.jpg"],
13
  [
14
  "images/bcee7a-20190225-a-london-underground-sign.jpg",
15
  "images/istockphoto-622434332-1024x1024.jpg",
16
  ],
17
- ["images/dogs.jpeg", "images/pandas.jpg"],
18
  ["images/11887_pesto-pasta_Rita-1x1-1-501c953b29074ab193e2b5ad36e64648.jpg", "images/hummus.jpg"],
19
  ]
20
  demo_texts = [
21
  [
22
  "Output: This is a chinchilla. They are mainly found in Chile.",
23
  "Output: This is a shiba. They are very popular in Japan.",
 
 
24
  ],
25
  [
26
  "Output: a pink flamingo standing in a body of water.",
@@ -31,9 +33,11 @@ demo_texts = [
31
  [
32
  "Question: Describe the scene. Answer: A white airplane being repaired on the runway. 'Cargo' is written on it in red.",
33
  "Question: What is the man trying to catch? Answer: The man is catching a white kite that his friend is flying. The two men are on a beach.",
 
 
34
  ],
35
  ['Output: "Underground"', 'Output: "Congress Ave"'],
36
- ["Output: 2 dogs", "Output: 3 pandas"],
37
  ]
38
 
39
  # cd to open_flamingo dir and pip install .
@@ -50,12 +54,12 @@ with open("bad_words.txt", "r") as f:
50
  model, image_processor, tokenizer = create_model_and_transforms(
51
  clip_vision_encoder_pretrained="openai",
52
  clip_vision_encoder_path="ViT-L-14",
53
- lang_encoder_path="togethercomputer/RedPajama-INCITE-Base-3B-v1",
54
- tokenizer_path="togethercomputer/RedPajama-INCITE-Base-3B-v1",
55
  cross_attn_every_n_layers=2,
56
  )
57
 
58
- checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-4B-vitl-rpj3b", "checkpoint.pt")
59
  model.load_state_dict(torch.load(checkpoint_path), strict=False)
60
 
61
  model.eval()
@@ -97,6 +101,28 @@ def generate(
97
  if example_two_text is None
98
  else f"Output: {example_two_text}"
99
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  if (
102
  example_one_image is None
@@ -107,6 +133,10 @@ def generate(
107
  raise gr.Error("Please fill in all the fields (image and text).")
108
 
109
  demo_plus_text = f"<image>{example_one_text}<|endofchunk|><image>{example_two_text}<|endofchunk|>"
 
 
 
 
110
  demo_plus_text += (
111
  "<image>Output:" if idx != 2 else f"<image>Question: {text.strip()} Answer:"
112
  )
@@ -117,7 +147,14 @@ def generate(
117
  input_ids = lang_x["input_ids"]
118
  attention_mask = lang_x["attention_mask"]
119
 
120
- vision_x = [image_processor(example_one_image).unsqueeze(0), image_processor(example_two_image).unsqueeze(0), image_processor(image).unsqueeze(0)]
 
 
 
 
 
 
 
121
  vision_x = torch.cat(vision_x, dim=0)
122
  vision_x = vision_x.unsqueeze(1).unsqueeze(0)
123
  print(vision_x.shape)
@@ -165,12 +202,11 @@ def generate(
165
 
166
 
167
  with gr.Blocks() as demo:
168
- # As a consequence, you should treat this model as a research prototype and not as a production-ready model. Before using this demo please familiarize yourself with our [model card](https://github.com/mlfoundations/open_flamingo/blob/main/MODEL_CARD.md) and [terms and conditions](https://github.com/mlfoundations/open_flamingo/blob/main/TERMS_AND_CONDITIONS.md)
169
  gr.Markdown(
170
  """
171
  # 🦩 OpenFlamingo Demo
172
 
173
- Blog posts: #1 [An open-source framework for training vision-language models with in-context learning](https://laion.ai/blog/open-flamingo/) // #2 [OpenFlamingo v2: New Models and Enhanced Training Setup]()\n
174
  GitHub: [open_flamingo](https://github.com/mlfoundations/open_flamingo)
175
 
176
  In this demo we implement an interactive interface that showcases the in-context learning capabilities of the OpenFlamingo-4B model, a large multimodal model trained on top of
 
7
  login(token=os.environ["HUGGINGFACE_TOKEN"])
8
 
9
  demo_imgs = [
10
+ ["images/chinchilla_web-1024x683.jpg", "images/shiba-inu-dog-in-the-snow.jpg", "images/900.jpeg", "images/dogs.jpeg"],
11
+ ["images/900.jpeg", "images/hummus.jpg", "images/london-underground-sign.jpg", "images/COCO_train2014_000000194806.jpg"],
12
+ ["images/COCO_train2014_000000572279.jpg", "images/COCO_train2014_000000194806.jpg", "images/istockphoto-622434332-1024x1024.jpg", "images/11887_pesto-pasta_Rita-1x1-1-501c953b29074ab193e2b5ad36e64648.jpg"],
13
  [
14
  "images/bcee7a-20190225-a-london-underground-sign.jpg",
15
  "images/istockphoto-622434332-1024x1024.jpg",
16
  ],
17
+ ["images/dogs.jpeg", "images/pandas.jpg", "images/900.jpeg", "images/mhJ2yWNwMtNcmijZqVEDDW-320-80.jpg"],
18
  ["images/11887_pesto-pasta_Rita-1x1-1-501c953b29074ab193e2b5ad36e64648.jpg", "images/hummus.jpg"],
19
  ]
20
  demo_texts = [
21
  [
22
  "Output: This is a chinchilla. They are mainly found in Chile.",
23
  "Output: This is a shiba. They are very popular in Japan.",
24
+ "Output: This is a flamingo. They are found in South America.",
25
+ "Output: These are labrador retrievers. They are found in the UK.",
26
  ],
27
  [
28
  "Output: a pink flamingo standing in a body of water.",
 
33
  [
34
  "Question: Describe the scene. Answer: A white airplane being repaired on the runway. 'Cargo' is written on it in red.",
35
  "Question: What is the man trying to catch? Answer: The man is catching a white kite that his friend is flying. The two men are on a beach.",
36
+ "Question: What does the sign say? Answer: Congress Ave",
37
+ "Question: What is this dish? Answer: This is pesto pasta topped with cheese and basil.",
38
  ],
39
  ['Output: "Underground"', 'Output: "Congress Ave"'],
40
+ ["Output: 2 dogs", "Output: 3 pandas", "Output: 1 flamingo", "Output: 5 fingers"],
41
  ]
42
 
43
  # cd to open_flamingo dir and pip install .
 
54
  model, image_processor, tokenizer = create_model_and_transforms(
55
  clip_vision_encoder_pretrained="openai",
56
  clip_vision_encoder_path="ViT-L-14",
57
+ lang_encoder_path="togethercomputer/RedPajama-INCITE-Instruct-3B-v1",
58
+ tokenizer_path="togethercomputer/RedPajama-INCITE-Instruct-3B-v1",
59
  cross_attn_every_n_layers=2,
60
  )
61
 
62
+ checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-4B-vitl-rpj3b-langinstruct", "checkpoint.pt")
63
  model.load_state_dict(torch.load(checkpoint_path), strict=False)
64
 
65
  model.eval()
 
101
  if example_two_text is None
102
  else f"Output: {example_two_text}"
103
  )
104
+
105
+ if idx != -1:
106
+ example_three_image = (
107
+ Image.open(demo_imgs[idx][2])
108
+ if example_three_image is None
109
+ else example_three_image
110
+ )
111
+ example_three_text = (
112
+ demo_texts[idx][2]
113
+ if example_three_text is None
114
+ else f"Output: {example_three_text}"
115
+ )
116
+ example_four_image = (
117
+ Image.open(demo_imgs[idx][3])
118
+ if example_four_image is None
119
+ else example_four_image
120
+ )
121
+ example_four_text = (
122
+ demo_texts[idx][3]
123
+ if example_four_text is None
124
+ else f"Output: {example_four_text}"
125
+ )
126
 
127
  if (
128
  example_one_image is None
 
133
  raise gr.Error("Please fill in all the fields (image and text).")
134
 
135
  demo_plus_text = f"<image>{example_one_text}<|endofchunk|><image>{example_two_text}<|endofchunk|>"
136
+
137
+ if idx != -1:
138
+ demo_plus_text += f"<image>{example_three_text}<|endofchunk|><image>{example_four_text}<|endofchunk|>"
139
+
140
  demo_plus_text += (
141
  "<image>Output:" if idx != 2 else f"<image>Question: {text.strip()} Answer:"
142
  )
 
147
  input_ids = lang_x["input_ids"]
148
  attention_mask = lang_x["attention_mask"]
149
 
150
+ vision_x = [image_processor(example_one_image).unsqueeze(0), image_processor(example_two_image).unsqueeze(0)]
151
+
152
+ if idx != -1:
153
+ vision_x.append(image_processor(example_three_image).unsqueeze(0))
154
+ vision_x.append(image_processor(example_four_image).unsqueeze(0))
155
+
156
+ vision_x.append(image_processor(image).unsqueeze(0))
157
+
158
  vision_x = torch.cat(vision_x, dim=0)
159
  vision_x = vision_x.unsqueeze(1).unsqueeze(0)
160
  print(vision_x.shape)
 
202
 
203
 
204
  with gr.Blocks() as demo:
 
205
  gr.Markdown(
206
  """
207
  # 🦩 OpenFlamingo Demo
208
 
209
+ Blog posts: #1 [An open-source framework for training vision-language models with in-context learning](https://laion.ai/blog/open-flamingo/) // #2 [OpenFlamingo v2: New Models and Enhanced Training Setup]()
210
  GitHub: [open_flamingo](https://github.com/mlfoundations/open_flamingo)
211
 
212
  In this demo we implement an interactive interface that showcases the in-context learning capabilities of the OpenFlamingo-4B model, a large multimodal model trained on top of
images/4645808729_2dfc59b6a5_z.jpg ADDED
images/5944609705_4664531909_z.jpg ADDED
images/mhJ2yWNwMtNcmijZqVEDDW-320-80.jpg ADDED