Niki Zhang committed · Commit 631bbe0 (verified) · Parent: b0229d6

Update app.py

Files changed (1): app.py (+68, -46)
app.py CHANGED
@@ -395,15 +395,15 @@ def init_openai_api_key(api_key=""):
 
         global gpt_state
         gpt_state = 1
-        return [gr.update(visible=True)] + [gr.update(visible=False)] + [gr.update(visible=True)]*3 + [gr.update(visible=False)] + [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None] + [gr.update(visible=True)]*2
+        return [gr.update(visible=True)] + [gr.update(visible=False)] + [gr.update(visible=True)]*3 + [gr.update(visible=False)] + [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None] + [gr.update(visible=True)]*3
     else:
         gpt_state = 0
-        return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available'] + [gr.update(visible=False)]*2
+        return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available'] + [gr.update(visible=False)]*3
 
 def init_wo_openai_api_key():
     global gpt_state
     gpt_state = 0
-    return [gr.update(visible=False)]*4 + [gr.update(visible=True)] + [gr.update(visible=False)] + [gr.update(visible=True)] + [gr.update(visible=False)]*2 + [None, None, None] + [gr.update(visible=False)]*2
+    return [gr.update(visible=False)]*4 + [gr.update(visible=True)] + [gr.update(visible=False)] + [gr.update(visible=True)] + [gr.update(visible=False)]*2 + [None, None, None] + [gr.update(visible=False)]*3
 
 def get_click_prompt(chat_input, click_state, click_mode):
     inputs = json.loads(chat_input)
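Note on the three return changes above: Gradio maps a handler's returned list positionally onto the components in its `outputs=` list, so wiring in one new output component (the `d3_model` row registered later in this commit) means every return branch must grow by exactly one update, hence the trailing `*2` becoming `*3`. A minimal sketch of the pattern, with hypothetical component names:

```python
# Minimal sketch, not the app's code: Gradio matches returned values to
# `outputs` by position, so each visibility list needs one entry per
# output component. Component names here are hypothetical.
import gradio as gr

def show_all():
    # two visibility updates + one value, matching outputs=[box_a, box_b, status]
    return [gr.update(visible=True)] * 2 + ["ready"]

with gr.Blocks() as demo:
    box_a = gr.Textbox(visible=False)
    box_b = gr.Textbox(visible=False)
    status = gr.Textbox()
    gr.Button("Show").click(show_all, inputs=None, outputs=[box_a, box_b, status])
```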
@@ -441,14 +441,19 @@ def update_click_state(click_state, caption, click_mode):
     else:
         raise NotImplementedError
 
-def chat_input_callback(*args):
-    visual_chatgpt, chat_input, click_state, state, aux_state = args
+async def chat_input_callback(*args):
+    visual_chatgpt, chat_input, click_state, state, aux_state, language, autoplay = args
     if visual_chatgpt is not None:
-        return visual_chatgpt.run_text(chat_input, state, aux_state)
+        state, _, aux_state, _ = visual_chatgpt.run_text(chat_input, state, aux_state)
+        last_text, last_response = state[-1]
+        print("last response", last_response)
+        audio = await texttospeech(last_response, language, autoplay)
+        return state, state, aux_state, audio
     else:
         response = "Text refiner is not initilzed, please input openai api key."
         state = state + [(chat_input, response)]
-        return state, state
+        audio = await texttospeech(response, language, autoplay)
+        return state, state, None, audio
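`chat_input_callback` now awaits a `texttospeech` helper that is defined elsewhere in app.py and is not shown in this diff; its result is routed to the `output_audio` component in the event wiring further down. As a rough sketch of a compatible helper, assuming the edge-tts package and an HTML audio output (the voice mapping is invented for illustration):

```python
# Hypothetical sketch of a compatible texttospeech(text, language, autoplay)
# helper; the real one lives elsewhere in app.py. Assumes the edge-tts package
# and that the return value feeds an HTML component as an <audio> tag.
import base64
import edge_tts

VOICES = {"English": "en-US-AriaNeural", "Chinese": "zh-CN-XiaoxiaoNeural"}  # assumed mapping

async def texttospeech(text, language, autoplay):
    if not text:
        return None
    communicate = edge_tts.Communicate(text, VOICES.get(language, "en-US-AriaNeural"))
    audio_bytes = b""
    async for chunk in communicate.stream():  # collect streamed MP3 chunks
        if chunk["type"] == "audio":
            audio_bytes += chunk["data"]
    b64 = base64.b64encode(audio_bytes).decode()
    attr = "autoplay" if autoplay else ""
    return f'<audio controls {attr} src="data:audio/mp3;base64,{b64}"></audio>'
```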
 
@@ -777,6 +782,7 @@ def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuali
 
     prompt = get_sketch_prompt(mask)
     boxes = prompt['input_boxes']
+    boxes = boxes[0]
 
     controls = {'length': length,
                 'sentiment': sentiment,
@@ -797,18 +803,23 @@ def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuali
 
     enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
     out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki)[0]
+
+    # if visual_chatgpt is not None:
+    #     print('inference_click: add caption to chatGPT memory')
+    #     new_crop_save_path = get_new_image_name('chat_image', func_name='crop')
 
     # Update components and states
     state.append((f'Box: {boxes}', None))
-    state.append((None, f'raw_caption: {out["generated_captions"]["raw_caption"]}'))
+    state.append((None, f'{out["generated_captions"]["raw_caption"]}'))
     text = out['generated_captions']['raw_caption']
     input_mask = np.array(out['mask'].convert('P'))
-    image_input = mask_painter(np.array(image_input), input_mask)
-
-    origin_image_input = image_input
+    # image_input = mask_painter(np.array(image_input), input_mask, background_alpha=0)
+    image_input = Image.fromarray(np.array(image_input))
+    draw = ImageDraw.Draw(image_input)
+    draw.rectangle(boxes, outline='red', width=2)
 
-    fake_click_index = (int((boxes[0][0] + boxes[0][2]) / 2), int((boxes[0][1] + boxes[0][3]) / 2))
-    image_input = create_bubble_frame(image_input, "", fake_click_index, input_mask)
+    # fake_click_index = (int((boxes[0][0] + boxes[0][2]) / 2), int((boxes[0][1] + boxes[0][3]) / 2))
+    # image_input = create_bubble_frame(image_input, "", fake_click_index, input_mask)
 
     yield state, state, image_input
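The hunk above swaps the mask overlay and bubble frame for a plain red bounding box. `ImageDraw.rectangle` takes a single box as `[x0, y0, x1, y1]`, which is also why the earlier hunk unwraps `boxes = boxes[0]` before drawing. A self-contained sketch with made-up coordinates:

```python
# Standalone sketch of the new overlay (coordinates invented for illustration).
from PIL import Image, ImageDraw

image = Image.new("RGB", (256, 256), "white")  # stands in for image_input
box = [40, 40, 200, 160]                       # one box: [x0, y0, x1, y1]
draw = ImageDraw.Draw(image)
draw.rectangle(box, outline="red", width=2)    # same call as in the diff
image.save("boxed.png")
```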
 
@@ -819,10 +830,10 @@ def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuali
     new_cap = refined_caption['caption']
     if refined_caption['wiki']:
         state = state + [(None, "Wiki: {}".format(refined_caption['wiki']))]
-    state = state + [(None, f"caption: {new_cap}")]
-    refined_image_input = create_bubble_frame(origin_image_input, new_cap, fake_click_index, input_mask)
+    state = state + [(None, f"{new_cap}")]
+    # refined_image_input = create_bubble_frame(origin_image_input, new_cap, fake_click_index, input_mask)
 
-    yield state, state, refined_image_input
+    yield state, state, image_input
 
 def clear_chat_memory(visual_chatgpt, keep_global=False):
     if visual_chatgpt is not None:
@@ -852,26 +863,16 @@ def export_chat_log(chat_state):
         return None
 
 
-def cap_everything(image_input, visual_chatgpt, text_refiner, input_language, input_audio, input_mic, use_mic, agree):
-
-    model = build_caption_anything_with_models(
-        args,
-        api_key="",
-        captioner=shared_captioner,
-        sam_model=shared_sam_model,
-        ocr_reader=shared_ocr_reader,
-        text_refiner=text_refiner,
-        session_id=iface.app_id
-    )
-    paragraph = model.inference_cap_everything(image_input, verbose=True)
+async def cap_everything(paragraph, visual_chatgpt, language, autoplay):
+
     # state = state + [(None, f"Caption Everything: {paragraph}")]
     Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
     AI_prompt = "Received."
     visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
     visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
     # waveform_visual, audio_output = tts.predict(paragraph, input_language, input_audio, input_mic, use_mic, agree)
-    waveform_visual, audio_output = None, None
-    return paragraph, waveform_visual, audio_output
+    audio_output = await texttospeech(paragraph, language, autoplay)
+    return paragraph, audio_output
 
 def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner, paragraph):
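`cap_everything` no longer rebuilds the captioning model; it takes the precomputed `paragraph`, primes the agent's memory with a Human/AI exchange, and becomes a coroutine. Gradio supports this directly: handlers declared `async def` are awaited on Gradio's event loop, so the `.click()` wiring below needs no special treatment. A tiny self-contained illustration:

```python
# Minimal sketch: Gradio awaits async event handlers natively.
import asyncio
import gradio as gr

async def slow_echo(text):
    await asyncio.sleep(0.1)  # stands in for the awaited TTS call
    return text

with gr.Blocks() as demo:
    inp = gr.Textbox(label="In")
    out = gr.Textbox(label="Out")
    gr.Button("Echo").click(slow_echo, inputs=inp, outputs=out)
```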
 
@@ -1038,10 +1039,23 @@ def create_ui():
                 clear_button_image = gr.Button(value="Clear Image", interactive=True)
 
             with gr.Tab("Trajectory (beta)") as traj_tab:
-                sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=20,
+                sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=10,
                                                elem_id="image_sketcher")
                 with gr.Row():
+                    clear_button_sketcher = gr.Button(value="Clear Sketch", interactive=True)
                     submit_button_sketcher = gr.Button(value="Submit", interactive=True)
+                with gr.Row(scale=1.0):
+                    with gr.Row(scale=0.8):
+                        focus_type_sketch = gr.Radio(
+                            choices=["CFV-D", "CFV-DA", "CFV-DAI", "PFV-DDA"],
+                            value="CFV-D",
+                            label="Information Type",
+                            interactive=True)
+                        Input_sketch = gr.Radio(
+                            choices=["Trace+Seg", "Trace"],
+                            value="Trace+Seg",
+                            label="Caption Type",
+                            interactive=True)
 
             with gr.Column(visible=False) as modules_need_gpt1:
                 with gr.Row(scale=1.0):
@@ -1070,16 +1084,16 @@ def create_ui():
                     enable_wiki = gr.Radio(
                         choices=["Yes", "No"],
                         value="No",
-                        label="Enable Wiki",
+                        label="Expert",
                         interactive=True)
 
-            with gr.Row(scale=1.0):
-                gr.Examples(
+            with gr.Column(visible=True) as modules_not_need_gpt3:
+                gr.Examples(
                     examples=examples,
                     inputs=[example_image],
                 )
 
-        # with gr.Column(visible=True) as modules_not_need_gpt3:
+
 
 
         with gr.Column(scale=0.5):
@@ -1108,7 +1122,7 @@ def create_ui():
 
             with gr.Column() as modules_need_gpt0:
                 with gr.Column(visible=False, scale=1.0) as modules_need_gpt2:
-                    paragraph_output = gr.Textbox(lines=10, label="Describe Everything", max_lines=10)
+                    paragraph_output = gr.Textbox(lines=16, label="Describe Everything", max_lines=16)
                     cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)
 
             with gr.Column(visible=False) as modules_not_need_gpt2:
@@ -1146,7 +1160,7 @@ def create_ui():
     # this part is for 3d generate.
     ###############################################################################
 
-    with gr.Row(variant="panel"):
+    with gr.Row(variant="panel") as d3_model:
         with gr.Column():
             with gr.Row():
                 input_image = gr.Image(
@@ -1268,19 +1282,27 @@ def create_ui():
         outputs=[input_text, input_language, input_audio, input_mic, use_mic, agree, output_waveform, output_audio],
         queue=False
     )
+
+    clear_button_sketcher.click(
+        lambda x: (x),
+        [origin_image],
+        [sketcher_input],
+        queue=False,
+        show_progress=False
+    )
 
 
     openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
                           outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
-                                   modules_not_need_gpt2, tts_interface, module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box, language_select, autoplay])
+                                   modules_not_need_gpt2, tts_interface, module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box, language_select, autoplay, d3_model])
     enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
                                 outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
                                          modules_not_need_gpt,
-                                         modules_not_need_gpt2, tts_interface, module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box, language_select, autoplay])
+                                         modules_not_need_gpt2, tts_interface, module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box, language_select, autoplay, d3_model])
     disable_chatGPT_button.click(init_wo_openai_api_key,
                                  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
                                           modules_not_need_gpt,
-                                          modules_not_need_gpt2, tts_interface, module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box, language_select, autoplay])
+                                          modules_not_need_gpt2, tts_interface, module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box, language_select, autoplay, d3_model])
 
     enable_chatGPT_button.click(
         lambda: (None, [], [], [[], [], []], "", "", ""),
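The new `clear_button_sketcher` wiring above resets the sketch pad with a pass-through lambda: re-emitting the stored original image as the sketcher's value discards the drawn strokes. The same pattern in isolation (assuming Gradio 3.x sketch-capable image components; names are illustrative, not the app's `ImageSketcher`):

```python
# Minimal reset-pattern sketch (Gradio 3.x-style sketch tool; hypothetical names).
import gradio as gr

with gr.Blocks() as demo:
    original = gr.State()  # holds the untouched image, like origin_image above
    sketcher = gr.Image(source="upload", tool="sketch", type="pil")
    # Pass the stored image straight back into the sketcher to clear strokes.
    gr.Button("Clear Sketch").click(lambda x: x, [original], [sketcher])
```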
@@ -1297,8 +1319,8 @@ def create_ui():
         show_progress=False
     )
 
-    cap_everything_button.click(cap_everything, [origin_image, visual_chatgpt, text_refiner, input_language, input_audio, input_mic, use_mic, agree],
-                                [paragraph_output, output_waveform, output_audio])
+    cap_everything_button.click(cap_everything, [paragraph, visual_chatgpt, language, auto_play],
+                                [paragraph_output, output_audio])
 
     clear_button_click.click(
         lambda x: ([[], [], []], x),
@@ -1348,11 +1370,11 @@ def create_ui():
     sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
                           [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
                            image_embedding, original_size, input_size, name_label, artist_label, year_label, material_label, name_label_base, artist_label_base, year_label_base, material_label_base, paragraph])
-    chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state],
-                      [chatbot, state, aux_state])
+    chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state, language, auto_play],
+                      [chatbot, state, aux_state, output_audio])
     chat_input.submit(lambda: "", None, chat_input)
-    submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state],
-                             [chatbot, state, aux_state])
+    submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state, language, auto_play],
+                             [chatbot, state, aux_state, output_audio])
     submit_button_text.click(lambda: "", None, chat_input)
     example_image.change(upload_callback, [example_image, state, visual_chatgpt, openai_api_key],
                          [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
 