cooelf committed
Commit 830cde5 · 1 Parent(s): 5cbd183

update answer

Files changed (1): app.py (+42, −20)
app.py CHANGED
@@ -10,12 +10,15 @@ from timm.data import resolve_data_config
 from timm.data.transforms_factory import create_transform
 
 rationale_model_dir = "cooelf/MM-CoT-UnifiedQA-Base-Rationale-Joint"
+answer_model_dir = "cooelf/MM-CoT-UnifiedQA-Base-Answer-Joint"
+
 vit_model = timm.create_model("vit_base_patch16_384", pretrained=True, num_classes=0)
 vit_model.eval()
 config = resolve_data_config({}, model=vit_model)
 transform = create_transform(**config)
 tokenizer = T5Tokenizer.from_pretrained(rationale_model_dir)
 r_model = T5ForMultimodalGeneration.from_pretrained(rationale_model_dir, patch_size=(577, 768))
+a_model = T5ForMultimodalGeneration.from_pretrained(answer_model_dir, patch_size=(577, 768))
 
 def inference_chat(input_image, input_text):
     with torch.no_grad():
@@ -25,7 +28,6 @@ def inference_chat(input_image, input_text):
         out = vit_model.forward_features(input)
         image_features = out.detach()
 
-        input_ids = tokenizer(input_text, return_tensors='pt', padding=True).input_ids
         source = tokenizer.batch_encode_plus(
             [input_text],
             max_length=512,
@@ -44,17 +46,38 @@ def inference_chat(input_image, input_text):
             num_beams=1,
             do_sample=False
         )
-        gpt3_out = tokenizer.batch_decode(rationale, skip_special_tokens=True)[0]
-        gpt3_out1 = gpt3_out
-        return gpt3_out, gpt3_out, gpt3_out1
+        rationale = tokenizer.batch_decode(rationale, skip_special_tokens=True)[0]
+
+        input_text = input_text + "\n" + rationale + "\nAnswer:"
+        source = tokenizer.batch_encode_plus(
+            [input_text],
+            max_length=512,
+            pad_to_max_length=True,
+            truncation=True,
+            padding="max_length",
+            return_tensors="pt",
+        )
+        source_ids = source["input_ids"]
+        source_mask = source["attention_mask"]
+        answer = a_model.generate(
+            input_ids=source_ids,
+            attention_mask=source_mask,
+            image_ids=image_features,
+            max_length=64,
+            num_beams=1,
+            do_sample=False
+        )
+
+        answer = tokenizer.batch_decode(answer, skip_special_tokens=True)[0]
+        return rationale, answer
 
 
-title = """# VQA with VLE and LLM"""
-description = """**VLE** (Visual-Language Encoder) is an image-text multimodal understanding model built on the pre-trained text and image encoders. See https://github.com/iflytek/VLE for more details.
-We demonstrate visual question answering systems built with VLE and LLM."""
-description1 = """**VQA**: The image and the question are fed to a VQA model (VLEForVQA) and the model predicts the answer.
+title = """# Multimodal-CoT"""
+# description = """**VLE** (Visual-Language Encoder) is an image-text multimodal understanding model built on the pre-trained text and image encoders. See https://github.com/iflytek/VLE for more details.
+# We demonstrate visual question answering systems built with VLE and LLM."""
+# description1 = """**VQA**: The image and the question are fed to a VQA model (VLEForVQA) and the model predicts the answer.
 
-**VQA+LLM**: We feed the caption, question, and answers predicted by the VQA model to the LLM and ask the LLM to generate the final answer. The outputs from VQA+LLM may vary due to the decoding strategy of the LLM."""
+# **VQA+LLM**: We feed the caption, question, and answers predicted by the VQA model to the LLM and ask the LLM to generate the final answer. The outputs from VQA+LLM may vary due to the decoding strategy of the LLM."""
 
 with gr.Blocks(
     css="""
@@ -70,10 +93,10 @@ with gr.Blocks(
 
     with gr.Row():
         with gr.Column(scale=1):
-            image_input = gr.Image(type="pil", label="VQA Image Input")
+            image_input = gr.Image(type="pil", label="Image")
             with gr.Row():
                 with gr.Column(scale=1):
-                    chat_input = gr.Textbox(lines=1, label="VQA Question Input")
+                    chat_input = gr.Textbox(lines=1, label="Question")
                     with gr.Row():
                         clear_button = gr.Button(value="Clear", interactive=True, width=30)
                         submit_button = gr.Button(
@@ -89,9 +112,8 @@ with gr.Blocks(
         '''
         with gr.Column():
             gr.Markdown(description1)
-            caption_output = gr.Textbox(lines=0, label="VQA")
-            caption_output_v1 = gr.Textbox(lines=0, label="VQA + LLM (short answer)")
-            gpt3_output_v1 = gr.Textbox(lines=0, label="VQA+LLM (long answer)")
+            rationale = gr.Textbox(lines=0, label="Rationale")
+            answer = gr.Textbox(lines=0, label="Answer")
 
     chat_input.submit(
         inference_chat,
@@ -99,12 +121,12 @@ with gr.Blocks(
             image_input,
             chat_input,
         ],
-        [caption_output, gpt3_output_v1, caption_output_v1],
+        [rationale, answer],
     )
     clear_button.click(
-        lambda: ("", [], "", "", ""),
+        lambda: ("", [], "", ""),
         [],
-        [chat_input, state, caption_output, gpt3_output_v1, caption_output_v1],
+        [chat_input, state, rationale, answer],
         queue=False,
    )
     submit_button.click(
@@ -113,12 +135,12 @@ with gr.Blocks(
            image_input,
            chat_input,
        ],
-        [caption_output, gpt3_output_v1, caption_output_v1],
+        [rationale, answer],
     )
-    examples = [['api/61.png', "Question: Think about the magnetic force between the magnets in each pair. Which of the following statements is true?\nContext: The images below show two pairs of magnets. The magnets in different pairs do not affect each other. All the magnets shown are made of the same material, but some of them are different sizes and shapes.\nOptions: (A) The magnitude of the magnetic force is the same in both pairs. (B) The magnitude of the magnetic force is smaller in Pair 1. (C) The magnitude of the magnetic force is smaller in Pair 2.\nSolution:", "2", "2", "2"],
+    examples = [['api/61.png', "Question: Think about the magnetic force between the magnets in each pair. Which of the following statements is true?\nContext: The images below show two pairs of magnets. The magnets in different pairs do not affect each other. All the magnets shown are made of the same material, but some of them are different sizes and shapes.\nOptions: (A) The magnitude of the magnetic force is the same in both pairs. (B) The magnitude of the magnetic force is smaller in Pair 1. (C) The magnitude of the magnetic force is smaller in Pair 2.\nSolution:", "Magnet sizes affect the magnitude of the magnetic force. Imagine magnets that are the same shape and made of the same material. The smaller the magnets, the smaller the magnitude of the magnetic force between them.\nMagnet A is the same size in both pairs. But Magnet B is smaller in Pair 2 than in Pair 1. So, the magnitude of the magnetic force is smaller in Pair 2 than in Pair 1.", "The answer is (C)."],
     ]
     examples = gr.Examples(
-        examples=examples, inputs=[image_input, chat_input, caption_output, caption_output_v1, gpt3_output_v1],
+        examples=examples, inputs=[image_input, chat_input, rationale, answer],
     )
 
     iface.queue(concurrency_count=1, api_open=False, max_size=10)
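
Taken together, the change replaces the single rationale pass with the two-stage Multimodal-CoT pipeline: r_model first generates a rationale, the rationale is appended to the prompt as "<question>\n<rationale>\nAnswer:", and a_model then generates the final answer; both stages decode greedily (num_beams=1, do_sample=False). Below is a minimal sketch of driving the updated inference_chat outside Gradio. It assumes the module-level setup from the diff above has already run; the image path is the one shipped with the demo, and the abbreviated question text is illustrative.

# Minimal usage sketch (not part of the commit): drives the two-stage
# pipeline defined in app.py above. Assumes vit_model, transform, tokenizer,
# r_model, and a_model have been initialized exactly as in the diff.
from PIL import Image

image = Image.open("api/61.png").convert("RGB")  # example image from the demo
question = (
    "Question: Think about the magnetic force between the magnets in each pair. "
    "Which of the following statements is true?\n"
    "Options: (A) same in both pairs (B) smaller in Pair 1 (C) smaller in Pair 2\n"
    "Solution:"
)

# Stage 1 generates the rationale; stage 2 re-encodes the prompt with the
# rationale appended and generates the answer string.
rationale, answer = inference_chat(image, question)
print("Rationale:", rationale)
print("Answer:", answer)  # expected along the lines of "The answer is (C)."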