Niki Zhang committed
Commit 5abd550 · verified · 1 Parent(s): ab4f7f8

Update app.py

Files changed (1)
  1. app.py +259 -190
app.py CHANGED
@@ -25,6 +25,7 @@ import easyocr
25
  import re
26
  import edge_tts
27
  import asyncio
 
28
  # import tts
29
 
30
  ###############################################################################
@@ -32,27 +33,14 @@ import asyncio
32
  ###############################################################################
33
 
34
 
 
35
 
36
-
37
- # import uuid
38
- # from diffusers import AnimateDiffPipeline, MotionAdapter, EulerDiscreteScheduler
39
- # from diffusers.utils import export_to_video
40
- # from safetensors.torch import load_file
41
- #from diffusers.models.modeling_outputs import Transformer2DModelOutput
42
-
43
-
44
- import random
45
- import uuid
46
- import json
47
- from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
48
-
49
-
50
-
51
-
52
  import imageio
53
  import numpy as np
54
  import torch
55
  import rembg
 
56
  from torchvision.transforms import v2
57
  from pytorch_lightning import seed_everything
58
  from omegaconf import OmegaConf
@@ -297,6 +285,7 @@ def make3d(images):
297
  ############# above part is for 3D generate #############
298
  ###############################################################################
299
 
 
300
  ###############################################################################
301
  ############# this part is for text to image #############
302
  ###############################################################################
@@ -418,6 +407,36 @@ filtered_language_dict = {
418
  'Cantonese': 'zh-HK-HiuGaaiNeural'
419
  }
420
421
  gpt_state = 0
422
  VOICE = "en-GB-SoniaNeural"
423
  article = """
@@ -463,6 +482,7 @@ class ImageSketcher(gr.Image):
463
  mask[..., -1] = 255
464
  mask = self.postprocess(mask)
465
  x['mask'] = mask
 
466
  return super().preprocess(x)
467
 
468
 
@@ -512,15 +532,18 @@ def init_openai_api_key(api_key=""):
512
 
513
  global gpt_state
514
  gpt_state=1
515
- return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
 
516
  else:
517
  gpt_state=0
518
- return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
 
519
 
520
  def init_wo_openai_api_key():
521
  global gpt_state
522
  gpt_state=0
523
- return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3
 
524
 
525
  def get_click_prompt(chat_input, click_state, click_mode):
526
  inputs = json.loads(chat_input)
@@ -677,17 +700,16 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
677
  point_prompt = f'You should primarily use tools on the selected regional image (description: {text}, path: {new_crop_save_path}), which is a part of the whole image (path: {visual_chatgpt.current_image}). If human mentioned some objects not in the selected region, you can use tools on the whole image.'
678
  visual_chatgpt.point_prompt = point_prompt
679
 
680
- generated_caption = text
681
- print(generated_caption)
682
  print("new crop save",new_crop_save_path)
683
 
684
- yield state, state, click_state, image_input_nobackground, image_input_withbackground, generated_caption, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
685
 
686
 
687
 
688
 
689
 
690
- async def submit_caption(image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
691
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
692
  autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path):
693
  print("state",state)
@@ -702,23 +724,57 @@ async def submit_caption(image_input, state, generated_caption, text_refiner, vi
702
  print("click_index",click_index)
703
  print("input_points_state",input_points_state)
704
  print("input_labels_state",input_labels_state)
705
-
 
706
 
707
- input_mask = input_mask_state
708
- input_points = input_points_state
709
- input_labels = input_labels_state
710
711
 
712
-
713
- focus_map = {
714
- "CFV-D":0,
715
- "CFV-DA":1,
716
- "CFV-DAI":2,
717
- "PFV-DDA":3
718
- }
719
-
720
  mapped_value = focus_map.get(focus_type, -1)
721
- print("mapped value",mapped_value)
722
 
723
  controls = {
724
  'length': length,
@@ -726,95 +782,21 @@ async def submit_caption(image_input, state, generated_caption, text_refiner, vi
726
  'factuality': factuality,
727
  'language': language
728
  }
729
- '''
730
- prompt_list = [
731
- 'Wiki_caption: {Wiki_caption}, you have to generate a caption according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
732
- 'Wiki_caption: {Wiki_caption}, you have to select sentences from wiki caption that describe the surrounding objects that may be associated with the picture object. Around {length} words of {sentiment} sentiment in {language}.',
733
- 'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.',
734
- 'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.'
735
- ]
736
-
737
- prompt_list = [
738
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the object but does not include analysis)as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
739
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
740
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
741
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects that may be related to the selected object and list one fact of selected object, one fact of related object and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.'
742
- ]
743
- '''
744
- prompt_list = [
745
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis)as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
746
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
747
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
748
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects that may be related to the selected object and list one fact of selected object, one fact of related object and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
749
- ]
750
  if mapped_value != -1:
751
- prompt= prompt_list[mapped_value].format(
752
- raw_caption=generated_caption,
753
  Wiki_caption=paragraph,
754
  length=controls['length'],
755
  sentiment=controls['sentiment'],
756
  language=controls['language']
757
  )
758
-
759
  else:
760
- print("error prompting")
761
  prompt = "Invalid focus type."
762
 
763
  if controls['factuality'] == "Imagination":
764
- prompt += "Assuming that I am someone who has viewed a lot of art and has a lot of experience viewing art. Explain artistic features (composition, color, style, or use of light) and discuss the symbolism of the content and its influence on later artistic movements"
765
-
766
- print("Prompt:", prompt)
767
- print("click",click_index)
768
-
769
- origin_image_input = image_input
770
-
771
-
772
 
773
- image_input = create_bubble_frame(np.array(image_input), generated_caption, click_index, input_mask,
774
- input_points=input_points, input_labels=input_labels)
775
-
776
- if generated_caption:
777
- # state = state + [(None, f"RAW_Caption: {generated_caption}")]
778
-
779
-
780
- if not args.disable_gpt and text_refiner:
781
- print("new crop save",new_crop_save_path)
782
- focus_info=get_image_gpt(openai_api_key,new_crop_save_path,prompt)
783
- if focus_info.startswith('"') and focus_info.endswith('"'):
784
- focus_info=focus_info[1:-1]
785
- focus_info=focus_info.replace('#', '')
786
- # state = state + [(None, f"Wiki: {paragraph}")]
787
- state = state + [(None, f"{focus_info}")]
788
- print("new_cap",focus_info)
789
- read_info = re.sub(r'[#[\]!*]','',focus_info)
790
- read_info = emoji.replace_emoji(read_info,replace="")
791
- print("read info",read_info)
792
-
793
- # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
794
- # input_points=input_points, input_labels=input_labels)
795
- try:
796
- audio_output = await texttospeech(read_info, language,autoplay)
797
- # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
798
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
799
-
800
- except Exception as e:
801
- state = state + [(None, f"Error during TTS prediction: {str(e)}")]
802
- print(f"Error during TTS prediction: {str(e)}")
803
- # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
804
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
805
-
806
- else:
807
- try:
808
- audio_output = await texttospeech(focus_info, language, autoplay)
809
- # waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
810
- waveform_visual, audio_output=None,None
811
- # return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
812
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
813
-
814
- except Exception as e:
815
- state = state + [(None, f"Error during TTS prediction: {str(e)}")]
816
- print(f"Error during TTS prediction: {str(e)}")
817
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
818
 
819
 
820
  def encode_image(image_path):
@@ -892,14 +874,19 @@ def get_sketch_prompt(mask: Image.Image):
892
 
893
  return prompt
894
 
 
895
 
896
- def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
897
- original_size, input_size, text_refiner):
898
  image_input, mask = sketcher_image['image'], sketcher_image['mask']
899
-
 
 
900
  prompt = get_sketch_prompt(mask)
901
  boxes = prompt['input_boxes']
902
  boxes = boxes[0]
 
 
903
 
904
  controls = {'length': length,
905
  'sentiment': sentiment,
@@ -919,38 +906,77 @@ def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuali
919
  model.setup(image_embedding, original_size, input_size, is_image_set=True)
920
 
921
  enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
922
- out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki)[0]
923
 
924
- # if visual_chatgpt is not None:
925
- # print('inference_click: add caption to chatGPT memory')
926
- # new_crop_save_path = get_new_image_name('chat_image', func_name='crop')
927
928
  # Update components and states
929
  state.append((f'Box: {boxes}', None))
930
- state.append((None, f'{out["generated_captions"]["raw_caption"]}'))
931
- text = out['generated_captions']['raw_caption']
932
- input_mask = np.array(out['mask'].convert('P'))
933
- # image_input = mask_painter(np.array(image_input), input_mask, background_alpha=0 )
934
- image_input = Image.fromarray(np.array(image_input))
935
- draw = ImageDraw.Draw(image_input)
936
- draw.rectangle(boxes, outline='red', width=2)
937
-
938
  # fake_click_index = (int((boxes[0][0] + boxes[0][2]) / 2), int((boxes[0][1] + boxes[0][3]) / 2))
939
  # image_input = create_bubble_frame(image_input, "", fake_click_index, input_mask)
940
 
941
- yield state, state, image_input
942
 
943
- if not args.disable_gpt and model.text_refiner:
944
- refined_caption = model.text_refiner.inference(query=text, controls=controls, context=out['context_captions'],
945
- enable_wiki=enable_wiki)
946
 
947
- new_cap = refined_caption['caption']
948
- if refined_caption['wiki']:
949
- state = state + [(None, "Wiki: {}".format(refined_caption['wiki']))]
950
- state = state + [(None, f"{new_cap}")]
951
- # refined_image_input = create_bubble_frame(origin_image_input, new_cap, fake_click_index, input_mask)
952
 
953
- yield state, state, image_input
954
 
955
  def clear_chat_memory(visual_chatgpt, keep_global=False):
956
  if visual_chatgpt is not None:
@@ -1020,32 +1046,55 @@ def get_style():
1020
  #image_sketcher [data-testid="image"], #image_sketcher [data-testid="image"] > div{min-height: 500px}
1021
  #image_upload{min-height:500px}
1022
  #image_upload [data-testid="image"], #image_upload [data-testid="image"] > div{min-height: 500px}
1023
  '''
1024
  elif current_version <= version.parse('3.27'):
1025
  style = '''
1026
  #image_sketcher{min-height:500px}
1027
  #image_upload{min-height:500px}
1028
  '''
1029
  else:
1030
  style = None
1031
 
1032
  return style
1033
 
1034
- def handle_like_dislike(like_data, like_state, dislike_state):
1035
- if like_data.liked:
1036
- if like_data.index not in like_state:
1037
- like_state.append(like_data.index)
1038
- message = f"Liked: {like_data.value} at index {like_data.index}"
1039
- else:
1040
- message = "You already liked this item"
1041
- else:
1042
- if like_data.index not in dislike_state:
1043
- dislike_state.append(like_data.index)
1044
- message = f"Disliked: {like_data.value} at index {like_data.index}"
1045
- else:
1046
- message = "You already disliked this item"
1047
 
1048
- return like_state, dislike_state
1049
 
1050
  async def texttospeech(text,language,autoplay):
1051
  voice=filtered_language_dict[language]
@@ -1060,9 +1109,11 @@ async def texttospeech(text,language,autoplay):
1060
  if autoplay:
1061
  audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls autoplay {audio_style}></audio>'
1062
  else:
1063
- audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls {audio_style}></audio>'
 
1064
  return audio_player
1065
 
 
1066
  def create_ui():
1067
  title = """<p><h1 align="center">EyeSee Anything in Art</h1></p>
1068
  """
@@ -1093,7 +1144,6 @@ def create_ui():
1093
  visual_chatgpt = gr.State(None)
1094
  original_size = gr.State(None)
1095
  input_size = gr.State(None)
1096
- generated_caption = gr.State("")
1097
  paragraph = gr.State("")
1098
  aux_state = gr.State([])
1099
  click_index_state = gr.State((0, 0))
@@ -1102,15 +1152,33 @@ def create_ui():
1102
  input_labels_state = gr.State([])
1103
  new_crop_save_path = gr.State(None)
1104
  image_input_nobackground = gr.State(None)
1105
- like_state=gr.State([])
1106
- dislike_state=gr.State([])
1107
-
1108
-
1109
 
1110
  gr.Markdown(title)
1111
  gr.Markdown(description)
1112
 
1113
- with gr.Row():
1114
  with gr.Column(scale=1.0):
1115
  with gr.Column(visible=False) as modules_not_need_gpt:
1116
  with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
@@ -1156,11 +1224,12 @@ def create_ui():
1156
  clear_button_image = gr.Button(value="Clear Image", interactive=True)
1157
 
1158
  with gr.Tab("Trajectory (beta)") as traj_tab:
1159
- sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=10,
1160
  elem_id="image_sketcher")
1161
- with gr.Row():
1162
- clear_button_sketcher = gr.Button(value="Clear Sketch", interactive=True)
1163
  submit_button_sketcher = gr.Button(value="Submit", interactive=True)
 
1164
  with gr.Row(scale=1.0):
1165
  with gr.Row(scale=0.8):
1166
  focus_type_sketch = gr.Radio(
@@ -1171,7 +1240,7 @@ def create_ui():
1171
  Input_sketch = gr.Radio(
1172
  choices=["Trace+Seg", "Trace"],
1173
  value="Trace+Seg",
1174
- label="Caption Type",
1175
  interactive=True)
1176
 
1177
  with gr.Column(visible=False) as modules_need_gpt1:
@@ -1203,26 +1272,17 @@ def create_ui():
1203
  value="No",
1204
  label="Expert",
1205
  interactive=True)
1206
-
1207
  with gr.Column(visible=True) as modules_not_need_gpt3:
1208
  gr.Examples(
1209
- examples=examples,
1210
- inputs=[example_image],
1211
- )
 
1212
 
1213
 
1214
 
1215
 
1216
- with gr.Column(scale=0.5):
1217
- with gr.Row(align="right",visible=False) as language_select:
1218
- language = gr.Dropdown(
1219
- ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
1220
- value="English", label="Language", interactive=True)
1221
-
1222
- with gr.Row(align="right",visible=False) as autoplay:
1223
- auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
1224
- output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
1225
-
1226
  with gr.Column(visible=True) as module_key_input:
1227
  openai_api_key = gr.Textbox(
1228
  placeholder="Input openAI API key",
@@ -1243,7 +1303,7 @@ def create_ui():
1243
  cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)
1244
 
1245
  with gr.Column(visible=False) as modules_not_need_gpt2:
1246
- with gr.Blocks(css=css):
1247
  chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True).style(height=600, scale=0.5)
1248
  with gr.Column(visible=False) as modules_need_gpt3:
1249
  chat_input = gr.Textbox(show_label=False, placeholder="Enter text and press Enter").style(
@@ -1251,6 +1311,9 @@ def create_ui():
1251
  with gr.Row():
1252
  clear_button_text = gr.Button(value="Clear Text", interactive=True)
1253
  submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
 
 
 
1254
  with gr.Row():
1255
  export_button = gr.Button(value="Export Chat Log", interactive=True, variant="primary")
1256
  with gr.Row():
@@ -1421,7 +1484,7 @@ def create_ui():
1421
  # this part is for 3d generate.
1422
  ###############################################################################
1423
 
1424
- with gr.Row(variant="panel") as d3_model:
1425
  with gr.Column():
1426
  with gr.Row():
1427
  input_image = gr.Image(
@@ -1529,7 +1592,7 @@ def create_ui():
1529
 
1530
  def clear_tts_fields():
1531
  return [gr.update(value=""), gr.update(value=""), None, None, gr.update(value=False), gr.update(value=True), None, None]
1532
-
1533
  # submit_tts.click(
1534
  # tts.predict,
1535
  # inputs=[input_text, input_language, input_audio, input_mic, use_mic, agree],
@@ -1544,6 +1607,9 @@ def create_ui():
1544
  queue=False
1545
  )
1546
 
 
 
 
1547
  clear_button_sketcher.click(
1548
  lambda x: (x),
1549
  [origin_image],
@@ -1552,18 +1618,21 @@ def create_ui():
1552
  show_progress=False
1553
  )
1554
 
 
 
 
1555
 
1556
  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
1557
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
1558
- modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay,d3_model])
1559
  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
1560
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
1561
  modules_not_need_gpt,
1562
- modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay,d3_model])
1563
  disable_chatGPT_button.click(init_wo_openai_api_key,
1564
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
1565
  modules_not_need_gpt,
1566
- modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay,d3_model])
1567
 
1568
  enable_chatGPT_button.click(
1569
  lambda: (None, [], [], [[], [], []], "", "", ""),
@@ -1677,7 +1746,7 @@ def create_ui():
1677
  image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
1678
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
1679
  ],
1680
- outputs=[chatbot, state, click_state, image_input, input_image, generated_caption, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground],
1681
  show_progress=False, queue=True
1682
  )
1683
 
@@ -1685,7 +1754,7 @@ def create_ui():
1685
  submit_button_click.click(
1686
  submit_caption,
1687
  inputs=[
1688
- image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
1689
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
1690
  auto_play,paragraph,focus_type,openai_api_key,new_crop_save_path
1691
  ],
@@ -1701,10 +1770,10 @@ def create_ui():
1701
  submit_button_sketcher.click(
1702
  inference_traject,
1703
  inputs=[
1704
- sketcher_input, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
1705
- original_size, input_size, text_refiner
1706
  ],
1707
- outputs=[chatbot, state, sketcher_input],
1708
  show_progress=False, queue=True
1709
  )
1710
 
 
25
  import re
26
  import edge_tts
27
  import asyncio
28
+ import cv2
29
  # import tts
30
 
31
  ###############################################################################
 
33
  ###############################################################################
34
 
35
 
36
+ # import spaces #
37
 
38
+ import os
39
  import imageio
40
  import numpy as np
41
  import torch
42
  import rembg
43
+ from PIL import Image
44
  from torchvision.transforms import v2
45
  from pytorch_lightning import seed_everything
46
  from omegaconf import OmegaConf
 
285
  ############# above part is for 3D generate #############
286
  ###############################################################################
287
 
288
+
289
  ###############################################################################
290
  ############# this part is for text to image #############
291
  ###############################################################################
 
407
  'Cantonese': 'zh-HK-HiuGaaiNeural'
408
  }
409
 
410
+ focus_map = {
411
+ "CFV-D":0,
412
+ "CFV-DA":1,
413
+ "CFV-DAI":2,
414
+ "PFV-DDA":3
415
+ }
416
+
417
+ '''
418
+ prompt_list = [
419
+ 'Wiki_caption: {Wiki_caption}, you have to generate a caption according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
420
+ 'Wiki_caption: {Wiki_caption}, you have to select sentences from wiki caption that describe the surrounding objects that may be associated with the picture object. Around {length} words of {sentiment} sentiment in {language}.',
421
+ 'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.',
422
+ 'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.'
423
+ ]
424
+
425
+ prompt_list = [
426
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the object but does not include analysis)as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
427
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
428
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
429
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects that may be related to the selected object and list one fact of selected object, one fact of related object and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.'
430
+ ]
431
+ '''
432
+ prompt_list = [
433
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis)as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
434
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
435
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
436
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects that may be related to the selected object and list one fact of selected object, one fact of related object and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
437
+ ]
438
+
439
+
440
  gpt_state = 0
441
  VOICE = "en-GB-SoniaNeural"
442
  article = """
 
482
  mask[..., -1] = 255
483
  mask = self.postprocess(mask)
484
  x['mask'] = mask
485
+
486
  return super().preprocess(x)
487
 
488
 
 
532
 
533
  global gpt_state
534
  gpt_state=1
535
+ # return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
536
+ return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*2
537
  else:
538
  gpt_state=0
539
+ # return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
540
+ return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*2
541
 
542
  def init_wo_openai_api_key():
543
  global gpt_state
544
  gpt_state=0
545
+ # return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3
546
+ return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*2
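Note: the trailing visibility updates in both returns drop from *3 to *2 because the separate language_select and autoplay rows are replaced later in this commit by the single top_row container, so one fewer component needs toggling. A quick illustrative tally (not part of the commit) confirming that the 14 return values still line up with the 14-component outputs= lists wired in create_ui further down:

    # Illustrative count only: values returned by the new init_openai_api_key.
    visibility_updates = 1 + 1 + 3 + 1 + 3   # gr.update(...) entries before the states
    states = 3                               # text_refiner, visual_chatgpt, notification text
    trailing_updates = 2                     # d3_model, top_row (was 3 before this commit)
    print(visibility_updates + states + trailing_updates)  # -> 14, matching outputs=[...]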
547
 
548
  def get_click_prompt(chat_input, click_state, click_mode):
549
  inputs = json.loads(chat_input)
 
700
  point_prompt = f'You should primarily use tools on the selected regional image (description: {text}, path: {new_crop_save_path}), which is a part of the whole image (path: {visual_chatgpt.current_image}). If human mentioned some objects not in the selected region, you can use tools on the whole image.'
701
  visual_chatgpt.point_prompt = point_prompt
702
 
703
+
 
704
  print("new crop save",new_crop_save_path)
705
 
706
+ yield state, state, click_state, image_input_nobackground, image_input_withbackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
707
 
708
 
709
 
710
 
711
 
712
+ async def submit_caption(state, text_refiner, length, sentiment, factuality, language,
713
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
714
  autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path):
715
  print("state",state)
 
724
  print("click_index",click_index)
725
  print("input_points_state",input_points_state)
726
  print("input_labels_state",input_labels_state)
727
+
728
+ prompt=generate_prompt(focus_type,paragraph,length,sentiment,factuality,language)
729
 
730
+ print("Prompt:", prompt)
731
+ print("click",click_index)
 
732
 
733
+ # image_input = create_bubble_frame(np.array(image_input), generated_caption, click_index, input_mask,
734
+ # input_points=input_points, input_labels=input_labels)
735
+
736
+
737
+ if not args.disable_gpt and text_refiner:
738
+ print("new crop save",new_crop_save_path)
739
+ focus_info=get_image_gpt(openai_api_key,new_crop_save_path,prompt)
740
+ if focus_info.startswith('"') and focus_info.endswith('"'):
741
+ focus_info=focus_info[1:-1]
742
+ focus_info=focus_info.replace('#', '')
743
+ # state = state + [(None, f"Wiki: {paragraph}")]
744
+ state = state + [(None, f"{focus_info}")]
745
+ print("new_cap",focus_info)
746
+ read_info = re.sub(r'[#[\]!*]','',focus_info)
747
+ read_info = emoji.replace_emoji(read_info,replace="")
748
+ print("read info",read_info)
749
+
750
+ # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
751
+ # input_points=input_points, input_labels=input_labels)
752
+ try:
753
+ audio_output = await texttospeech(read_info, language,autoplay)
754
+ # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
755
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
756
+
757
+ except Exception as e:
758
+ state = state + [(None, f"Error during TTS prediction: {str(e)}")]
759
+ print(f"Error during TTS prediction: {str(e)}")
760
+ # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
761
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
762
 
763
+ else:
764
+ try:
765
+ audio_output = await texttospeech(focus_info, language, autoplay)
766
+ # waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
767
+ # return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
768
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
769
+
770
+ except Exception as e:
771
+ state = state + [(None, f"Error during TTS prediction: {str(e)}")]
772
+ print(f"Error during TTS prediction: {str(e)}")
773
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
774
+
775
+ def generate_prompt(focus_type, paragraph,length, sentiment, factuality, language):
776
+
777
  mapped_value = focus_map.get(focus_type, -1)
 
778
 
779
  controls = {
780
  'length': length,
 
782
  'factuality': factuality,
783
  'language': language
784
  }
785
+
786
  if mapped_value != -1:
787
+ prompt = prompt_list[mapped_value].format(
 
788
  Wiki_caption=paragraph,
789
  length=controls['length'],
790
  sentiment=controls['sentiment'],
791
  language=controls['language']
792
  )
 
793
  else:
 
794
  prompt = "Invalid focus type."
795
 
796
  if controls['factuality'] == "Imagination":
797
+ prompt += " Assuming that I am someone who has viewed a lot of art and has a lot of experience viewing art. Explain artistic features (composition, color, style, or use of light) and discuss the symbolism of the content and its influence on later artistic movements."
798
 
799
+ return prompt
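For reference, a minimal self-contained sketch of what the new generate_prompt helper does with the module-level focus_map and prompt_list added near the top of the file (template text abbreviated here; input values are illustrative only):

    # Abbreviated stand-ins for the real module-level focus_map and prompt_list.
    focus_map = {"CFV-D": 0, "CFV-DA": 1, "CFV-DAI": 2, "PFV-DDA": 3}
    prompt_list = [
        "Wiki_caption: {Wiki_caption}, list one fact ... in {language}, about {length} words.",
        "Wiki_caption: {Wiki_caption}, list one fact and one analysis ...",
        "Wiki_caption: {Wiki_caption}, list one fact, one analysis and one interpretation ...",
        "Wiki_caption: {Wiki_caption}, also cover related objects ...",
    ]

    focus_type, paragraph, factuality = "CFV-DA", "A short wiki paragraph about the artwork.", "Imagination"
    idx = focus_map.get(focus_type, -1)
    if idx != -1:
        # str.format silently ignores keywords a given template does not use.
        prompt = prompt_list[idx].format(Wiki_caption=paragraph, length=80,
                                         sentiment="Positive", language="English")
    else:
        prompt = "Invalid focus type."
    if factuality == "Imagination":
        prompt += " Assuming that I am someone who has viewed a lot of art..."
    print(prompt)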
800
 
801
 
802
  def encode_image(image_path):
 
874
 
875
  return prompt
876
 
877
+ submit_traj=0
878
 
879
+ async def inference_traject(origin_image,sketcher_image, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
880
+ original_size, input_size, text_refiner,focus_type,paragraph,openai_api_key,autoplay,trace_type):
881
  image_input, mask = sketcher_image['image'], sketcher_image['mask']
882
+
883
+ crop_save_path=""
884
+
885
  prompt = get_sketch_prompt(mask)
886
  boxes = prompt['input_boxes']
887
  boxes = boxes[0]
888
+ global submit_traj
889
+ submit_traj=1
890
 
891
  controls = {'length': length,
892
  'sentiment': sentiment,
 
906
  model.setup(image_embedding, original_size, input_size, is_image_set=True)
907
 
908
  enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
909
+ out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki,verbose=True)[0]
910
 
911
+ print(trace_type)
912
+
913
+ if trace_type=="Trace+Seg":
914
+ input_mask = np.array(out['mask'].convert('P'))
915
+ image_input = mask_painter(np.array(image_input), input_mask, background_alpha=0 )
916
+ crop_save_path=out['crop_save_path']
917
+
918
+ else:
919
+ image_input = Image.fromarray(np.array(origin_image))
920
+ draw = ImageDraw.Draw(image_input)
921
+ draw.rectangle(boxes, outline='red', width=2)
922
+ cropped_image = origin_image.crop(boxes)
923
+ cropped_image.save('temp.png')
924
+ crop_save_path='temp.png'
925
+
926
+ print("crop_save_path",out['crop_save_path'])
927
+
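The Trace branch above outlines the traced box on a copy of the original image and crops that region to temp.png for GPT captioning. The same steps in isolation, as a runnable sketch with placeholder image and box values:

    from PIL import Image, ImageDraw

    # Placeholder inputs standing in for origin_image and the box from get_sketch_prompt.
    origin_image = Image.new("RGB", (640, 480), "white")
    boxes = (100, 80, 300, 260)          # (x0, y0, x1, y1)

    annotated = origin_image.copy()      # keep the original untouched
    ImageDraw.Draw(annotated).rectangle(boxes, outline="red", width=2)

    cropped = origin_image.crop(boxes)
    cropped.save("temp.png")             # crop_save_path used by the Trace branch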
928
  # Update components and states
929
  state.append((f'Box: {boxes}', None))
930
+
931
  # fake_click_index = (int((boxes[0][0] + boxes[0][2]) / 2), int((boxes[0][1] + boxes[0][3]) / 2))
932
  # image_input = create_bubble_frame(image_input, "", fake_click_index, input_mask)
933
 
934
+ prompt=generate_prompt(focus_type, paragraph, length, sentiment, factuality, language)
935
+ width, height = sketcher_image['image'].size
936
+ sketcher_image['mask'] = np.zeros((height, width, 4), dtype=np.uint8)
937
+ sketcher_image['mask'][..., -1] = 255
938
+ sketcher_image['image']=image_input
939
+
940
+
941
+ if not args.disable_gpt and text_refiner:
942
+ focus_info=get_image_gpt(openai_api_key,crop_save_path,prompt)
943
+ if focus_info.startswith('"') and focus_info.endswith('"'):
944
+ focus_info=focus_info[1:-1]
945
+ focus_info=focus_info.replace('#', '')
946
+ state = state + [(None, f"{focus_info}")]
947
+ print("new_cap",focus_info)
948
+ read_info = re.sub(r'[#[\]!*]','',focus_info)
949
+ read_info = emoji.replace_emoji(read_info,replace="")
950
+ print("read info",read_info)
951
+
952
+ # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
953
+ # input_points=input_points, input_labels=input_labels)
954
+ try:
955
+ audio_output = await texttospeech(read_info, language,autoplay)
956
+ # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
957
+ return state, state,image_input,audio_output
958
+
959
+
960
+ except Exception as e:
961
+ state = state + [(None, f"Error during TTS prediction: {str(e)}")]
962
+ print(f"Error during TTS prediction: {str(e)}")
963
+ # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
964
+ return state, state, image_input,audio_output
965
+
966
+
967
+ else:
968
+ try:
969
+ audio_output = await texttospeech(focus_info, language, autoplay)
970
+ # waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
971
+ # return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
972
+ return state, state, image_input,audio_output
973
 
 
 
 
974
 
975
+ except Exception as e:
976
+ state = state + [(None, f"Error during TTS prediction: {str(e)}")]
977
+ print(f"Error during TTS prediction: {str(e)}")
978
+ return state, state, image_input,audio_output
 
979
 
 
980
 
981
  def clear_chat_memory(visual_chatgpt, keep_global=False):
982
  if visual_chatgpt is not None:
 
1046
  #image_sketcher [data-testid="image"], #image_sketcher [data-testid="image"] > div{min-height: 500px}
1047
  #image_upload{min-height:500px}
1048
  #image_upload [data-testid="image"], #image_upload [data-testid="image"] > div{min-height: 500px}
1049
+ .custom-language {
1050
+ width: 20%;
1051
+ }
1052
+
1053
+ .custom-autoplay {
1054
+ width: 40%;
1055
+ }
1056
+
1057
+ .custom-output {
1058
+ width: 30%;
1059
+ }
1060
+
1061
  '''
1062
  elif current_version <= version.parse('3.27'):
1063
  style = '''
1064
  #image_sketcher{min-height:500px}
1065
  #image_upload{min-height:500px}
1066
+ .custom-language {
1067
+ width: 20%;
1068
+ }
1069
+
1070
+ .custom-autoplay {
1071
+ width: 40%;
1072
+ }
1073
+
1074
+ .custom-output {
1075
+ width: 30%;
1076
+ }
1077
  '''
1078
  else:
1079
  style = None
1080
 
1081
  return style
1082
 
1083
+ # def handle_like_dislike(like_data, like_state, dislike_state):
1084
+ # if like_data.liked:
1085
+ # if like_data.index not in like_state:
1086
+ # like_state.append(like_data.index)
1087
+ # message = f"Liked: {like_data.value} at index {like_data.index}"
1088
+ # else:
1089
+ # message = "You already liked this item"
1090
+ # else:
1091
+ # if like_data.index not in dislike_state:
1092
+ # dislike_state.append(like_data.index)
1093
+ # message = f"Disliked: {like_data.value} at index {like_data.index}"
1094
+ # else:
1095
+ # message = "You already disliked this item"
1096
 
1097
+ # return like_state, dislike_state
1098
 
1099
  async def texttospeech(text,language,autoplay):
1100
  voice=filtered_language_dict[language]
 
1109
  if autoplay:
1110
  audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls autoplay {audio_style}></audio>'
1111
  else:
1112
+ audio_player=None
1113
+ # audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls {audio_style}></audio>'
1114
  return audio_player
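The base64 audio string embedded above is produced earlier in texttospeech (outside this hunk). A minimal sketch of how edge_tts can yield such a string, assuming the same voice names used in filtered_language_dict; the file name and helper are illustrative, not from this commit:

    import asyncio
    import base64

    import edge_tts

    async def synthesize_b64(text: str, voice: str = "en-GB-SoniaNeural") -> str:
        # Synthesize speech to a local file, then base64-encode it so it can be
        # embedded in an <audio src="data:audio/...;base64,..."> player.
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save("tts_output.mp3")
        with open("tts_output.mp3", "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

    audio = asyncio.run(synthesize_b64("Hello from the EyeSee demo."))
    print(audio[:60], "...")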
1115
 
1116
+
1117
  def create_ui():
1118
  title = """<p><h1 align="center">EyeSee Anything in Art</h1></p>
1119
  """
 
1144
  visual_chatgpt = gr.State(None)
1145
  original_size = gr.State(None)
1146
  input_size = gr.State(None)
 
1147
  paragraph = gr.State("")
1148
  aux_state = gr.State([])
1149
  click_index_state = gr.State((0, 0))
 
1152
  input_labels_state = gr.State([])
1153
  new_crop_save_path = gr.State(None)
1154
  image_input_nobackground = gr.State(None)
 
 
 
 
1155
 
1156
  gr.Markdown(title)
1157
  gr.Markdown(description)
1158
+ with gr.Row(align="right", visible=False, elem_id="top_row") as top_row:
1159
+ language = gr.Dropdown(
1160
+ ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
1161
+ value="English", label="Language", interactive=True, scale=0.2, elem_classes="custom-language"
1162
+ )
1163
+ auto_play = gr.Checkbox(
1164
+ label="Check to autoplay audio", value=False, scale=0.4, elem_classes="custom-autoplay"
1165
+ )
1166
+ output_audio = gr.HTML(
1167
+ label="Synthesised Audio", scale=0.3, elem_classes="custom-output"
1168
+ )
1169
+
1170
 
1171
+ # with gr.Row(align="right",visible=False) as language_select:
1172
+ # language = gr.Dropdown(
1173
+ # ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
1174
+ # value="English", label="Language", interactive=True)
1175
+
1176
+ # with gr.Row(align="right",visible=False) as autoplay:
1177
+ # auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
1178
+ # output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
1179
+
1180
+ with gr.Row():
1181
+
1182
  with gr.Column(scale=1.0):
1183
  with gr.Column(visible=False) as modules_not_need_gpt:
1184
  with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
 
1224
  clear_button_image = gr.Button(value="Clear Image", interactive=True)
1225
 
1226
  with gr.Tab("Trajectory (beta)") as traj_tab:
1227
+ sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=10,
1228
  elem_id="image_sketcher")
1229
+ example_image = gr.Image(type="pil", interactive=False, visible=False)
1230
+ with gr.Row():
1231
  submit_button_sketcher = gr.Button(value="Submit", interactive=True)
1232
+ clear_button_sketcher = gr.Button(value="Clear Sketch", interactive=True)
1233
  with gr.Row(scale=1.0):
1234
  with gr.Row(scale=0.8):
1235
  focus_type_sketch = gr.Radio(
 
1240
  Input_sketch = gr.Radio(
1241
  choices=["Trace+Seg", "Trace"],
1242
  value="Trace+Seg",
1243
+ label="Trace Type",
1244
  interactive=True)
1245
 
1246
  with gr.Column(visible=False) as modules_need_gpt1:
 
1272
  value="No",
1273
  label="Expert",
1274
  interactive=True)
 
1275
  with gr.Column(visible=True) as modules_not_need_gpt3:
1276
  gr.Examples(
1277
+ examples=examples,
1278
+ inputs=[example_image],
1279
+ )
1280
+
1281
 
1282
 
1283
 
1284
 
1285
+ with gr.Column(scale=0.5):
1286
  with gr.Column(visible=True) as module_key_input:
1287
  openai_api_key = gr.Textbox(
1288
  placeholder="Input openAI API key",
 
1303
  cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)
1304
 
1305
  with gr.Column(visible=False) as modules_not_need_gpt2:
1306
+ with gr.Blocks():
1307
  chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True).style(height=600, scale=0.5)
1308
  with gr.Column(visible=False) as modules_need_gpt3:
1309
  chat_input = gr.Textbox(show_label=False, placeholder="Enter text and press Enter").style(
 
1311
  with gr.Row():
1312
  clear_button_text = gr.Button(value="Clear Text", interactive=True)
1313
  submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
1314
+ upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
1315
+ downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
1316
+
1317
  with gr.Row():
1318
  export_button = gr.Button(value="Export Chat Log", interactive=True, variant="primary")
1319
  with gr.Row():
 
1484
  # this part is for 3d generate.
1485
  ###############################################################################
1486
 
1487
+ with gr.Row(variant="panel",visible=False) as d3_model:
1488
  with gr.Column():
1489
  with gr.Row():
1490
  input_image = gr.Image(
 
1592
 
1593
  def clear_tts_fields():
1594
  return [gr.update(value=""), gr.update(value=""), None, None, gr.update(value=False), gr.update(value=True), None, None]
1595
+
1596
  # submit_tts.click(
1597
  # tts.predict,
1598
  # inputs=[input_text, input_language, input_audio, input_mic, use_mic, agree],
 
1607
  queue=False
1608
  )
1609
 
1610
+
1611
+
1612
+
1613
  clear_button_sketcher.click(
1614
  lambda x: (x),
1615
  [origin_image],
 
1618
  show_progress=False
1619
  )
1620
 
1621
+
1622
+
1623
+
1624
 
1625
  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
1626
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
1627
+ modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
1628
  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
1629
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
1630
  modules_not_need_gpt,
1631
+ modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
1632
  disable_chatGPT_button.click(init_wo_openai_api_key,
1633
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
1634
  modules_not_need_gpt,
1635
+ modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
1636
 
1637
  enable_chatGPT_button.click(
1638
  lambda: (None, [], [], [[], [], []], "", "", ""),
 
1746
  image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
1747
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
1748
  ],
1749
+ outputs=[chatbot, state, click_state, image_input, input_image, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground],
1750
  show_progress=False, queue=True
1751
  )
1752
 
 
1754
  submit_button_click.click(
1755
  submit_caption,
1756
  inputs=[
1757
+ state, text_refiner,length, sentiment, factuality, language,
1758
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
1759
  auto_play,paragraph,focus_type,openai_api_key,new_crop_save_path
1760
  ],
 
1770
  submit_button_sketcher.click(
1771
  inference_traject,
1772
  inputs=[
1773
+ origin_image,sketcher_input, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
1774
+ original_size, input_size, text_refiner,focus_type_sketch,paragraph,openai_api_key,auto_play,Input_sketch
1775
  ],
1776
+ outputs=[chatbot, state, sketcher_input,output_audio],
1777
  show_progress=False, queue=True
1778
  )
1779