Niki Zhang committed on
Commit d1292a4 · verified · 1 Parent(s): e73c6e2

Update app.py

Files changed (1)
  1. app.py +187 -120
app.py CHANGED
@@ -1,3 +1,4 @@
  from math import inf
  import os
  import base64
@@ -9,6 +10,7 @@ import requests
  from packaging import version
  from PIL import Image, ImageDraw
  import functools
  from langchain.llms.openai import OpenAI
  from caption_anything.model import CaptionAnything
  from caption_anything.utils.image_editing_utils import create_bubble_frame
@@ -20,7 +22,10 @@ from caption_anything.segmenter import build_segmenter
  from caption_anything.utils.chatbot import ConversationBot, build_chatbot_tools, get_new_image_name
  from segment_anything import sam_model_registry
  import easyocr
- import tts

  ###############################################################################
  ############# this part is for 3D generate #############
@@ -279,9 +284,25 @@ def make3d(images):
  ############# above part is for 3D generate #############
  ###############################################################################


  gpt_state = 0
- pre_click_index=(inf, inf)
  article = """
  <div style='margin:20px auto;'>
  <p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
@@ -374,15 +395,15 @@ def init_openai_api_key(api_key=""):

  global gpt_state
  gpt_state=1
- return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=True)]+ [gr.update(visible=False)]*2 + [text_refiner, visual_chatgpt, None]
  else:
  gpt_state=0
- return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']

  def init_wo_openai_api_key():
  global gpt_state
  gpt_state=0
- return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]

  def get_click_prompt(chat_input, click_state, click_mode):
  inputs = json.loads(chat_input)
@@ -467,7 +488,12 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
  # artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
  paragraph = get_image_gpt(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")

- state = [(None, 'Received new image, resize it to width {} and height {}: '.format(image_input.size[0], image_input.size[1]))]

  return state, state, image_input, click_state, image_input, image_input, image_input, image_embedding, \
  original_size, input_size, f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Material: {material}",f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Material: {material}",paragraph
@@ -539,12 +565,11 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language



- def submit_caption(image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
- input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key,new_crop_save_path):
  print("state",state)
-
- global pre_click_index
  click_index = click_index_state

  # if pre_click_index==click_index:
@@ -553,7 +578,6 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
  # else:
  # pre_click_index = click_index
  print("click_index",click_index)
- print("pre_click_index",pre_click_index)
  print("input_points_state",input_points_state)
  print("input_labels_state",input_labels_state)

@@ -630,29 +654,34 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
  focus_info=get_image_gpt(openai_api_key,new_crop_save_path,prompt)
  if focus_info.startswith('"') and focus_info.endswith('"'):
  focus_info=focus_info[1:-1]
-
  # state = state + [(None, f"Wiki: {paragraph}")]
  state = state + [(None, f"{focus_info}")]
  print("new_cap",focus_info)

  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
  # input_points=input_points, input_labels=input_labels)
  try:
- waveform_visual, audio_output = tts.predict(focus_info, input_language, input_audio, input_mic, use_mic, agree)
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output

  except Exception as e:
  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
  print(f"Error during TTS prediction: {str(e)}")
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output

  else:
  try:
- waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
  # return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output

  except Exception as e:
  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
@@ -834,7 +863,8 @@ def cap_everything(image_input, visual_chatgpt, text_refiner,input_language, inp
  AI_prompt = "Received."
  visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
  visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
- waveform_visual, audio_output=tts.predict(paragraph, input_language, input_audio, input_mic, use_mic, agree)
  return paragraph,waveform_visual, audio_output

  def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragraph):
@@ -877,6 +907,37 @@ def get_style():

  return style


  def create_ui():
  title = """<p><h1 align="center">EyeSee Anything in Art</h1></p>
@@ -884,17 +945,20 @@ def create_ui():
  description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """

  examples = [
- ["test_images/img36.webp"],
- ["test_images/MUS.png"],
- ["test_images/图片2.png"],
- ["test_images/img5.jpg"],
- ["test_images/img14.jpg"],
- ["test_images/qingming3.jpeg"],

  ]

  with gr.Blocks(
- css=get_style()
  ) as iface:
  state = gr.State([])
  out_state = gr.State(None)
@@ -914,6 +978,8 @@ def create_ui():
  input_labels_state = gr.State([])
  new_crop_save_path = gr.State(None)
  image_input_nobackground = gr.State(None)


@@ -924,19 +990,15 @@ def create_ui():
  with gr.Column(scale=1.0):
  with gr.Column(visible=False) as modules_not_need_gpt:
  with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
- image_intro=gr.HTML()
  image_input_base = gr.Image(type="pil", interactive=True, elem_id="image_upload")
  example_image = gr.Image(type="pil", interactive=False, visible=False)
  with gr.Row():
  name_label_base = gr.Button(value="Name: ")
  artist_label_base = gr.Button(value="Artist: ")
  year_label_base = gr.Button(value="Year: ")
- material_label_base = gr.Button(value="Material: ")

-
-
  with gr.Tab("Click") as click_tab:
- image_intro_click=gr.HTML()
  image_input = gr.Image(type="pil", interactive=True, elem_id="image_upload")
  example_image = gr.Image(type="pil", interactive=False, visible=False)
  with gr.Row():
@@ -945,11 +1007,14 @@ def create_ui():
  year_label = gr.Button(value="Year: ")
  material_label = gr.Button(value="Material: ")
  with gr.Row(scale=1.0):
- focus_type = gr.Radio(
- choices=["CFV-D", "CFV-DA", "CFV-DAI","PFV-DDA"],
- value="CFV-D",
- label="Focus Type",
- interactive=True)
  with gr.Row(scale=1.0):
  with gr.Row(scale=0.4):
  point_prompt = gr.Radio(
@@ -965,53 +1030,62 @@ def create_ui():
  with gr.Row(scale=0.4):
  clear_button_click = gr.Button(value="Clear Clicks", interactive=True)
  clear_button_image = gr.Button(value="Clear Image", interactive=True)
- submit_button_click=gr.Button(value="Submit", interactive=True)
- with gr.Tab("Trajectory (beta)"):
  sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=20,
  elem_id="image_sketcher")
  with gr.Row():
  submit_button_sketcher = gr.Button(value="Submit", interactive=True)

- with gr.Column(visible=False) as modules_need_gpt1:
- with gr.Row(scale=1.0):
- language = gr.Dropdown(
- ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
- value="English", label="Language", interactive=True)
- sentiment = gr.Radio(
- choices=["Positive", "Natural", "Negative"],
- value="Natural",
- label="Sentiment",
- interactive=True,
- )
  with gr.Row(scale=1.0):
- factuality = gr.Radio(
- choices=["Factual", "Imagination"],
- value="Factual",
- label="Factuality",
- interactive=True,
- )
- length = gr.Slider(
- minimum=10,
- maximum=80,
- value=10,
- step=1,
- interactive=True,
- label="Generated Caption Length",
- )
- # Whether to merge wiki content into the caption
- enable_wiki = gr.Radio(
- choices=["Yes", "No"],
- value="No",
- label="Enable Wiki",
- interactive=True)
-
- # with gr.Column(visible=True) as modules_not_need_gpt3:
- gr.Examples(
  examples=examples,
  inputs=[example_image],
  )

  with gr.Column(scale=0.5):
  with gr.Column(visible=True) as module_key_input:
  openai_api_key = gr.Textbox(
  placeholder="Input openAI API key",
@@ -1027,39 +1101,39 @@ def create_ui():
  notification_box = gr.Textbox(lines=1, label="Notification", max_lines=5, show_label=False)

  with gr.Column():
- with gr.Column(visible=False) as modules_need_gpt2:
  paragraph_output = gr.Textbox(lines=7, label="Describe Everything", max_lines=7)
- with gr.Column(visible=False) as modules_need_gpt0:
  cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)

  with gr.Column(visible=False) as modules_not_need_gpt2:
- chatbot = gr.Chatbot(label="Chatbox", ).style(height=550, scale=0.5)
- with gr.Column(visible=False) as modules_need_gpt3:
- chat_input = gr.Textbox(show_label=False, placeholder="Enter text and press Enter").style(
- container=False)
- with gr.Row():
- clear_button_text = gr.Button(value="Clear Text", interactive=True)
- submit_button_text = gr.Button(value="Submit", interactive=True, variant="primary")
- with gr.Row():
- export_button = gr.Button(value="Export Chat Log", interactive=True, variant="primary")
- with gr.Row():
- chat_log_file = gr.File(label="Download Chat Log")

- with gr.Column(scale=0.5):
  # TTS interface hidden initially
- with gr.Column(visible=False) as tts_interface:
- input_text = gr.Textbox(label="Text Prompt", value="Hello, World !, here is an example of light voice cloning. Try to upload your best audio samples quality")
- input_language = gr.Dropdown(label="Language", choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"], value="en")
- input_audio = gr.Audio(label="Reference Audio", type="filepath", value="examples/female.wav")
- input_mic = gr.Audio(source="microphone", type="filepath", label="Use Microphone for Reference")
- use_mic = gr.Checkbox(label="Check to use Microphone as Reference", value=False)
- agree = gr.Checkbox(label="Agree", value=True)
- output_waveform = gr.Video(label="Waveform Visual")
- output_audio = gr.HTML(label="Synthesised Audio")

- with gr.Row():
- submit_tts = gr.Button(value="Submit", interactive=True)
- clear_tts = gr.Button(value="Clear", interactive=True)

@@ -1154,6 +1228,8 @@ def create_ui():


  mv_images = gr.State()

  submit.click(fn=check_input_image, inputs=[new_crop_save_path], outputs=[processed_image]).success(
  fn=generate_mvs,
@@ -1174,12 +1250,12 @@ def create_ui():
  def clear_tts_fields():
  return [gr.update(value=""), gr.update(value=""), None, None, gr.update(value=False), gr.update(value=True), None, None]

- submit_tts.click(
- tts.predict,
- inputs=[input_text, input_language, input_audio, input_mic, use_mic, agree],
- outputs=[output_waveform, output_audio],
- queue=True
- )

  clear_tts.click(
  clear_tts_fields,
@@ -1191,15 +1267,15 @@ def create_ui():

  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
- modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box])
  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
  modules_not_need_gpt,
- modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box])
  disable_chatGPT_button.click(init_wo_openai_api_key,
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
  modules_not_need_gpt,
- modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box])

  enable_chatGPT_button.click(
  lambda: (None, [], [], [[], [], []], "", "", ""),
@@ -1299,6 +1375,7 @@ def create_ui():
  return [gr.update(visible=False)]*4


  click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
  base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])

@@ -1322,26 +1399,16 @@ def create_ui():
  inputs=[
  image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
- input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key,new_crop_save_path
  ],
  outputs=[
  chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,
- output_waveform, output_audio
  ],
  show_progress=True,
  queue=True
  )
-
-
- focus_type.change(
- lambda x: ([[], [], []], x),
- [image_input_nobackground],
- [click_state, image_input],
- queue=False,
- show_progress=False
- )
-
-

  submit_button_sketcher.click(
  inference_traject,
@@ -1370,4 +1437,4 @@ def create_ui():
  if __name__ == '__main__':
  iface = create_ui()
  iface.queue(concurrency_count=5, api_open=False, max_size=10)
- iface.launch(server_name="0.0.0.0", enable_queue=True)

+ from io import BytesIO
  from math import inf
  import os
  import base64

  from packaging import version
  from PIL import Image, ImageDraw
  import functools
+ import emoji
  from langchain.llms.openai import OpenAI
  from caption_anything.model import CaptionAnything
  from caption_anything.utils.image_editing_utils import create_bubble_frame

  from caption_anything.utils.chatbot import ConversationBot, build_chatbot_tools, get_new_image_name
  from segment_anything import sam_model_registry
  import easyocr
+ import re
+ import edge_tts
+ import asyncio
+ # import tts

  ###############################################################################
  ############# this part is for 3D generate #############
  ############# above part is for 3D generate #############
  ###############################################################################

+ css = """
+ #warning {background-color: #FFCCCB}
+ .chatbot {
+ padding: 0 !important;
+ margin: 0 !important;
+ }
+ """
+ filtered_language_dict = {
+ 'English': 'en-US-JennyNeural',
+ 'Chinese': 'zh-CN-XiaoxiaoNeural',
+ 'French': 'fr-FR-DeniseNeural',
+ 'Spanish': 'es-MX-DaliaNeural',
+ 'Arabic': 'ar-SA-ZariyahNeural',
+ 'Portuguese': 'pt-BR-FranciscaNeural',
+ 'Cantonese': 'zh-HK-HiuGaaiNeural'
+ }

  gpt_state = 0
+ VOICE = "en-GB-SoniaNeural"
  article = """
  <div style='margin:20px auto;'>
  <p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
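A minimal sketch (not part of this commit) for checking that each voice short name configured in filtered_language_dict above actually exists in the edge-tts catalogue; it assumes the edge-tts package is installed and that filtered_language_dict can be imported from app.py:

import asyncio
import edge_tts
from app import filtered_language_dict  # hypothetical import of the dict added above

async def check_voices():
    # list_voices() returns the catalogue of available neural voices.
    available = {v["ShortName"] for v in await edge_tts.list_voices()}
    for language, voice in filtered_language_dict.items():
        status = "ok" if voice in available else "missing"
        print(f"{language}: {voice} -> {status}")

asyncio.run(check_voices())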
 

  global gpt_state
  gpt_state=1
+ return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*2
  else:
  gpt_state=0
+ return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*2

  def init_wo_openai_api_key():
  global gpt_state
  gpt_state=0
+ return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*2

  def get_click_prompt(chat_input, click_state, click_mode):
  inputs = json.loads(chat_input)

  # artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
  paragraph = get_image_gpt(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")

+ state = [
+ (
+ None,
+ f"🤖 Hi, I am EyeSee. Let's explore this painting {name} together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant information."
+ )
+ ]

  return state, state, image_input, click_state, image_input, image_input, image_input, image_embedding, \
  original_size, input_size, f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Material: {material}",f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Material: {material}",paragraph
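The long return lists in init_openai_api_key above are matched positionally against the outputs= list of whichever event wires them up; gr.update(visible=...) toggles a component without replacing its value. A minimal analogue with hypothetical component names, not the commit's actual components:

import gradio as gr

def toggle(show_first):
    # One gr.update per output component, in the same order as outputs=[...].
    return [gr.update(visible=show_first), gr.update(visible=not show_first)]

with gr.Blocks() as demo:
    show_first = gr.Checkbox(label="Show panel A", value=True)
    panel_a = gr.Textbox(label="A")
    panel_b = gr.Textbox(label="B", visible=False)
    show_first.change(toggle, inputs=[show_first], outputs=[panel_a, panel_b])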
 


+ async def submit_caption(image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
+ autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path):
  print("state",state)
+
  click_index = click_index_state

  # if pre_click_index==click_index:

  # else:
  # pre_click_index = click_index
  print("click_index",click_index)
  print("input_points_state",input_points_state)
  print("input_labels_state",input_labels_state)

  focus_info=get_image_gpt(openai_api_key,new_crop_save_path,prompt)
  if focus_info.startswith('"') and focus_info.endswith('"'):
  focus_info=focus_info[1:-1]
+ focus_info=focus_info.replace('#', '')
  # state = state + [(None, f"Wiki: {paragraph}")]
  state = state + [(None, f"{focus_info}")]
  print("new_cap",focus_info)
+ read_info = re.sub(r'[#[\]!*]','',focus_info)
+ read_info = emoji.replace_emoji(read_info,replace="")
+ print("read info",read_info)

  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
  # input_points=input_points, input_labels=input_labels)
  try:
+ audio_output = await texttospeech(read_info, language,autoplay)
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output

  except Exception as e:
  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
  print(f"Error during TTS prediction: {str(e)}")
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output

  else:
  try:
+ audio_output = await texttospeech(focus_info, language, autoplay)
+ # waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
+ waveform_visual, audio_output=None,None
  # return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output

  except Exception as e:
  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
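The read_info lines above strip markdown punctuation and emoji from the caption before it is passed to TTS. A small sketch of the same cleanup on a made-up string, assuming the emoji package (2.x, which provides replace_emoji) is installed:

import re
import emoji

focus_info = "**Great** painting! 🤖 [detail #3]"          # made-up example text
read_info = re.sub(r'[#[\]!*]', '', focus_info)            # drop #, [, ], !, *
read_info = emoji.replace_emoji(read_info, replace="")     # drop emoji
print(read_info)  # "Great painting  detail 3" (whitespace aside)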
 
  AI_prompt = "Received."
  visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
  visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
+ # waveform_visual, audio_output=tts.predict(paragraph, input_language, input_audio, input_mic, use_mic, agree)
+ waveform_visual, audio_output=None,None
  return paragraph,waveform_visual, audio_output

  def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragraph):

  return style

+ def handle_like_dislike(like_data, like_state, dislike_state):
+ if like_data.liked:
+ if like_data.index not in like_state:
+ like_state.append(like_data.index)
+ message = f"Liked: {like_data.value} at index {like_data.index}"
+ else:
+ message = "You already liked this item"
+ else:
+ if like_data.index not in dislike_state:
+ dislike_state.append(like_data.index)
+ message = f"Disliked: {like_data.value} at index {like_data.index}"
+ else:
+ message = "You already disliked this item"
+
+ return like_state, dislike_state
+
+ async def texttospeech(text,language,autoplay):
+ voice=filtered_language_dict[language]
+ communicate = edge_tts.Communicate(text, voice)
+ file_path="output.wav"
+ await communicate.save(file_path)
+ with open(file_path, "rb") as audio_file:
+ audio_bytes = BytesIO(audio_file.read())
+ audio = base64.b64encode(audio_bytes.read()).decode("utf-8")
+ print("tts....")
+ audio_style = 'style="width:250px;"'
+ if autoplay:
+ audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls autoplay {audio_style}></audio>'
+ else:
+ audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls {audio_style}></audio>'
+ return audio_player

  def create_ui():
  title = """<p><h1 align="center">EyeSee Anything in Art</h1></p>
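A minimal sketch (not part of this commit) of driving the new edge_tts-based texttospeech coroutine outside Gradio; it assumes texttospeech and its filtered_language_dict lookup can be imported from app.py and that edge-tts can reach its service:

import asyncio
from app import texttospeech  # hypothetical import of the coroutine added above

async def demo():
    # Returns an HTML <audio> tag with the synthesized speech embedded as base64.
    player_html = await texttospeech("Hello from EyeSee", "English", autoplay=False)
    print(player_html[:80], "...")

asyncio.run(demo())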
 
  description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """

  examples = [
+ ["test_images/pearl.jpg"],
+ ["test_images/ambass.jpg"],
+ ["test_images/Picture0.png"],
+ ["test_images/Picture1.png"],
+ ["test_images/Picture2.png"],
+ ["test_images/Picture3.png"],
+ ["test_images/Picture4.png"],
+ ["test_images/Picture5.png"],

  ]

  with gr.Blocks(
+ css=get_style(),
+ theme=gr.themes.Base()
  ) as iface:
  state = gr.State([])
  out_state = gr.State(None)
  input_labels_state = gr.State([])
  new_crop_save_path = gr.State(None)
  image_input_nobackground = gr.State(None)
+ like_state=gr.State([])
+ dislike_state=gr.State([])

  with gr.Column(scale=1.0):
  with gr.Column(visible=False) as modules_not_need_gpt:
  with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
  image_input_base = gr.Image(type="pil", interactive=True, elem_id="image_upload")
  example_image = gr.Image(type="pil", interactive=False, visible=False)
  with gr.Row():
  name_label_base = gr.Button(value="Name: ")
  artist_label_base = gr.Button(value="Artist: ")
  year_label_base = gr.Button(value="Year: ")
+ material_label_base = gr.Button(value="Material: ")

  with gr.Tab("Click") as click_tab:
  image_input = gr.Image(type="pil", interactive=True, elem_id="image_upload")
  example_image = gr.Image(type="pil", interactive=False, visible=False)
  with gr.Row():
  year_label = gr.Button(value="Year: ")
  material_label = gr.Button(value="Material: ")
  with gr.Row(scale=1.0):
+ with gr.Row(scale=0.8):
+ focus_type = gr.Radio(
+ choices=["CFV-D", "CFV-DA", "CFV-DAI","PFV-DDA"],
+ value="CFV-D",
+ label="Information Type",
+ interactive=True)
+ with gr.Row(scale=0.2):
+ submit_button_click=gr.Button(value="Submit", interactive=True,variant='primary',size="sm")
  with gr.Row(scale=1.0):
  with gr.Row(scale=0.4):
  point_prompt = gr.Radio(
  with gr.Row(scale=0.4):
  clear_button_click = gr.Button(value="Clear Clicks", interactive=True)
  clear_button_image = gr.Button(value="Clear Image", interactive=True)
+
+ with gr.Tab("Trajectory (beta)") as traj_tab:
  sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=20,
  elem_id="image_sketcher")
  with gr.Row():
  submit_button_sketcher = gr.Button(value="Submit", interactive=True)

+ with gr.Column(visible=False) as modules_need_gpt1:
+ with gr.Row(scale=1.0):
+ sentiment = gr.Radio(
+ choices=["Positive", "Natural", "Negative"],
+ value="Natural",
+ label="Sentiment",
+ interactive=True,
+ )
+ with gr.Row(scale=1.0):
+ factuality = gr.Radio(
+ choices=["Factual", "Imagination"],
+ value="Factual",
+ label="Factuality",
+ interactive=True,
+ )
+ length = gr.Slider(
+ minimum=10,
+ maximum=80,
+ value=10,
+ step=1,
+ interactive=True,
+ label="Generated Caption Length",
+ )
+ # Whether to merge wiki content into the caption
+ enable_wiki = gr.Radio(
+ choices=["Yes", "No"],
+ value="No",
+ label="Enable Wiki",
+ interactive=True)
+
  with gr.Row(scale=1.0):
+ gr.Examples(
  examples=examples,
  inputs=[example_image],
  )

+ # with gr.Column(visible=True) as modules_not_need_gpt3:
+
+
  with gr.Column(scale=0.5):
+ with gr.Row(align="right",visible=False) as language_select:
+ language = gr.Dropdown(
+ ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
+ value="English", label="Language", interactive=True)
+
+ with gr.Row(align="right",visible=False) as autoplay:
+ auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
+ output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
+
  with gr.Column(visible=True) as module_key_input:
  openai_api_key = gr.Textbox(
  placeholder="Input openAI API key",
  notification_box = gr.Textbox(lines=1, label="Notification", max_lines=5, show_label=False)

  with gr.Column():
+ with gr.Column(visible=False,scale=1.0) as modules_need_gpt2:
  paragraph_output = gr.Textbox(lines=7, label="Describe Everything", max_lines=7)
+ with gr.Column(visible=False,scale=0.2) as modules_need_gpt0:
  cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)

  with gr.Column(visible=False) as modules_not_need_gpt2:
+ with gr.Blocks(css=css):
+ chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True).style(height=600, scale=0.5)
+ with gr.Column(visible=False) as modules_need_gpt3:
+ chat_input = gr.Textbox(show_label=False, placeholder="Enter text and press Enter").style(
+ container=False)
+ with gr.Row():
+ clear_button_text = gr.Button(value="Clear Text", interactive=True)
+ submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
+ with gr.Row():
+ export_button = gr.Button(value="Export Chat Log", interactive=True, variant="primary")
+ with gr.Row():
+ chat_log_file = gr.File(label="Download Chat Log")

  # TTS interface hidden initially
+ with gr.Column(visible=False) as tts_interface:
+ input_text = gr.Textbox(label="Text Prompt", value="Hello, World !, here is an example of light voice cloning. Try to upload your best audio samples quality")
+ input_language = gr.Dropdown(label="Language", choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"], value="en")
+ input_audio = gr.Audio(label="Reference Audio", type="filepath", value="examples/female.wav")
+ input_mic = gr.Audio(source="microphone", type="filepath", label="Use Microphone for Reference")
+ use_mic = gr.Checkbox(label="Check to use Microphone as Reference", value=False)
+ agree = gr.Checkbox(label="Agree", value=True)
+ output_waveform = gr.Video(label="Waveform Visual")
+ # output_audio = gr.HTML(label="Synthesised Audio")

+ with gr.Row():
+ submit_tts = gr.Button(value="Submit", interactive=True)
+ clear_tts = gr.Button(value="Clear", interactive=True)



  mv_images = gr.State()
+
+ chatbot.like(handle_like_dislike, inputs=[like_state, dislike_state], outputs=[like_state, dislike_state])

  submit.click(fn=check_input_image, inputs=[new_crop_save_path], outputs=[processed_image]).success(
  fn=generate_mvs,
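chatbot.like above wires the thumbs-up/down icons to handle_like_dislike; in Gradio the event payload arrives as a gr.LikeData object carrying .index, .value and .liked. A minimal analogue of such a handler, with hypothetical names rather than the commit's exact wiring:

import gradio as gr

def on_like(evt: gr.LikeData, likes: list, dislikes: list):
    # evt.liked is True for a like, False for a dislike; evt.index points at the message.
    (likes if evt.liked else dislikes).append(evt.index)
    return likes, dislikes

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(likeable=True)
    likes, dislikes = gr.State([]), gr.State([])
    chatbot.like(on_like, inputs=[likes, dislikes], outputs=[likes, dislikes])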
 
  def clear_tts_fields():
  return [gr.update(value=""), gr.update(value=""), None, None, gr.update(value=False), gr.update(value=True), None, None]

+ # submit_tts.click(
+ # tts.predict,
+ # inputs=[input_text, input_language, input_audio, input_mic, use_mic, agree],
+ # outputs=[output_waveform, output_audio],
+ # queue=True
+ # )

  clear_tts.click(
  clear_tts_fields,

  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
+ modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay])
  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
  modules_not_need_gpt,
+ modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay])
  disable_chatGPT_button.click(init_wo_openai_api_key,
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
  modules_not_need_gpt,
+ modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay])

  enable_chatGPT_button.click(
  lambda: (None, [], [], [[], [], []], "", "", ""),
  return [gr.update(visible=False)]*4


+ traj_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
  click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
  base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])

  inputs=[
  image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
+ auto_play,paragraph,focus_type,openai_api_key,new_crop_save_path
  ],
  outputs=[
  chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,
+ output_audio
  ],
  show_progress=True,
  queue=True
  )
+

  submit_button_sketcher.click(
  inference_traject,
  if __name__ == '__main__':
  iface = create_ui()
  iface.queue(concurrency_count=5, api_open=False, max_size=10)
+ iface.launch(server_name="0.0.0.0", enable_queue=True)
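The click handlers wired above now target the async submit_caption; Gradio accepts async def callbacks and awaits them on its event loop, so the awaited TTS call does not block other requests. A minimal analogue with hypothetical names, not the commit's components:

import asyncio
import gradio as gr

async def slow_echo(text):
    await asyncio.sleep(1)  # stands in for the awaited TTS call
    return f"echo: {text}"

with gr.Blocks() as demo:
    box = gr.Textbox(label="Text")
    out = gr.Textbox(label="Result")
    gr.Button("Submit").click(slow_echo, inputs=[box], outputs=[out])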