Niki Zhang commited on
Commit
978648d
·
verified ·
1 Parent(s): c7386f5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +233 -381
app.py CHANGED
@@ -349,12 +349,9 @@ def extract_features_siglip(image):
349
  @spaces.GPU
350
  def infer(crop_image_path,full_image_path,state,language,task_type=None):
351
  print("task type",task_type)
352
- style_gallery_output = []
353
- item_gallery_output=[]
354
-
355
- if task_type=="task 1":
356
- item_gallery_output.append("recomendation_pic/1.8.jpg")
357
- item_gallery_output.append("recomendation_pic/1.9.jpg")
358
  input_image = Image.open(full_image_path).convert("RGB")
359
  input_features = extract_features_siglip(input_image.convert("RGB"))
360
  input_features = input_features.detach().cpu().numpy()
@@ -365,17 +362,16 @@ def infer(crop_image_path,full_image_path,state,language,task_type=None):
365
  sim = -distances[0][i]
366
  image_url = df.iloc[v]["Link"]
367
  img_retrieved = read_image_from_url(image_url)
368
- style_gallery_output.append(img_retrieved)
369
  if language=="English":
370
  msg="🖼️ Please refer to the section below to see the recommended results."
371
  else:
372
  msg="🖼️ 请到下方查看推荐结果。"
373
  state+=[(None,msg)]
374
 
375
- return item_gallery_output, style_gallery_output,state,state
376
- elif task_type=="task 2":
377
- item_gallery_output.append("recomendation_pic/2.8.jpg")
378
- item_gallery_output.append("recomendation_pic/2.9.png")
379
  input_image = Image.open(full_image_path).convert("RGB")
380
  input_features = extract_features_siglip(input_image.convert("RGB"))
381
  input_features = input_features.detach().cpu().numpy()
@@ -386,18 +382,17 @@ def infer(crop_image_path,full_image_path,state,language,task_type=None):
386
  sim = -distances[0][i]
387
  image_url = df.iloc[v]["Link"]
388
  img_retrieved = read_image_from_url(image_url)
389
- style_gallery_output.append(img_retrieved)
390
  if language=="English":
391
  msg="🖼️ Please refer to the section below to see the recommended results."
392
  else:
393
  msg="🖼️ 请到下方查看推荐结果。"
394
  state+=[(None,msg)]
395
 
396
- return item_gallery_output, style_gallery_output,state,state
397
 
398
- elif task_type=="task 3":
399
- item_gallery_output.append("recomendation_pic/3.8.png")
400
- item_gallery_output.append("recomendation_pic/basket-2.png")
401
  input_image = Image.open(full_image_path).convert("RGB")
402
  input_features = extract_features_siglip(input_image.convert("RGB"))
403
  input_features = input_features.detach().cpu().numpy()
@@ -408,15 +403,14 @@ def infer(crop_image_path,full_image_path,state,language,task_type=None):
408
  sim = -distances[0][i]
409
  image_url = df.iloc[v]["Link"]
410
  img_retrieved = read_image_from_url(image_url)
411
- style_gallery_output.append(img_retrieved)
412
  if language=="English":
413
  msg="🖼️ Please refer to the section below to see the recommended results."
414
  else:
415
  msg="🖼️ 请到下方查看推荐结果。"
416
  state+=[(None,msg)]
417
 
418
- return item_gallery_output, style_gallery_output,state,state
419
-
420
  elif crop_image_path:
421
  input_image = Image.open(crop_image_path).convert("RGB")
422
  input_features = extract_features_siglip(input_image.convert("RGB"))
@@ -428,7 +422,7 @@ def infer(crop_image_path,full_image_path,state,language,task_type=None):
428
  sim = -distances[0][i]
429
  image_url = df.iloc[v]["Link"]
430
  img_retrieved = read_image_from_url(image_url)
431
- item_gallery_output.append(img_retrieved)
432
 
433
  input_image = Image.open(full_image_path).convert("RGB")
434
  input_features = extract_features_siglip(input_image.convert("RGB"))
@@ -440,14 +434,14 @@ def infer(crop_image_path,full_image_path,state,language,task_type=None):
440
  sim = -distances[0][i]
441
  image_url = df.iloc[v]["Link"]
442
  img_retrieved = read_image_from_url(image_url)
443
- style_gallery_output.append(img_retrieved)
444
  if language=="English":
445
  msg="🖼️ Please refer to the section below to see the recommended results."
446
  else:
447
  msg="🖼️ 请到下方查看推荐结果。"
448
  state+=[(None,msg)]
449
 
450
- return item_gallery_output, style_gallery_output,state,state
451
  else:
452
  input_image = Image.open(full_image_path).convert("RGB")
453
  input_features = extract_features_siglip(input_image.convert("RGB"))
@@ -459,15 +453,14 @@ def infer(crop_image_path,full_image_path,state,language,task_type=None):
459
  sim = -distances[0][i]
460
  image_url = df.iloc[v]["Link"]
461
  img_retrieved = read_image_from_url(image_url)
462
- style_gallery_output.append(img_retrieved)
463
  if language=="English":
464
  msg="🖼️ Please refer to the section below to see the recommended results."
465
  else:
466
  msg="🖼️ 请到下方查看推荐结果。"
467
  state+=[(None,msg)]
468
 
469
- return item_gallery_output, style_gallery_output,state,state
470
-
471
 
472
 
473
  ###############################################################################
@@ -621,7 +614,7 @@ css = """
621
 
622
 
623
  .info_btn {
624
- background: rgb(245, 245, 245) !important;
625
  border: none !important;
626
  box-shadow: none !important;
627
  font-size: 15px !important;
@@ -630,7 +623,7 @@ css = """
630
  }
631
 
632
  .info_btn_interact {
633
- background: rgb(217, 217, 217) !important;
634
  box-shadow: none !important;
635
  font-size: 15px !important;
636
  min-width: 6rem !important;
@@ -638,24 +631,16 @@ css = """
638
  }
639
 
640
  .function_button {
641
- background: rgb(227, 226, 226) !important;
642
  border: none !important;
643
  box-shadow: none !important;
644
  }
645
 
646
  .function_button_rec {
647
- background: rgb(189, 189, 189) !important;
648
  border: none !important;
649
  box-shadow: none !important;
650
  }
651
 
652
- .small_button {
653
- font-size: 12px !important;
654
- padding: 2px 8px !important;
655
- min-width: 60px !important;
656
- height: 30px !important;
657
- }
658
-
659
  #tool_box {max-width: 50px}
660
 
661
  """
@@ -681,84 +666,63 @@ focus_map = {
681
  prompt_list = [
682
  [
683
  'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
684
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
685
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
 
 
 
 
 
 
 
 
 
 
686
  'Wiki_caption: {Wiki_caption}, You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
687
  ],
688
  [
689
- "When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. You have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.",
690
- "When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. You have to help me understand what is about the selected object and list one fact and one analysis from art appreciation perspective as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.",
691
- "When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. You have to help me understand what is about the selected object and list one fact, one analysis, and one interpret from art appreciation perspective as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.",
 
 
 
 
692
  "When generating the answer, you should tell others that you are one of the creators of these paintings and generate the text in the tone and manner as if you are the creator of the painting. According to image and wiki_caption {Wiki_caption}, You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.",
693
  ],
694
  [
695
  'When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object and start every sentence with I. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
696
- 'When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis from art appreciation perspective as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object and start every sentence with I. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
697
- 'When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis from art appreciation perspective and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object and start every sentence with I. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
 
 
 
 
698
  'When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. According to image and wiki_caption {Wiki_caption}, You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
699
  ]
700
  ]
701
 
702
  recommendation_prompt=[
703
-
704
- [
705
- '''
706
- First identify what the object of the first painting is, you save yourself as the parameter: {{object}}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting:
707
- Recommendation reason: {{Recommendation based on {{object}} in the painting you saw earlier. Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate in three points. }}
708
- Each bullet point should be in {language} language, with a response length of about {length} words.
709
- ''',
710
- '''
711
- When generating answers, you should tell people that I am the creator of painting you were looking at earlier itself, and generate text in the tone and manner in which you are the creator of painting were looking at earlier.
712
-
713
- First identify what the object of the first painting is, you save yourself as the parameter: {{object}}, do not need to tell me, the following will use the. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting:
714
-
715
- Recommendation reason: {{I'm the creator of that painting you saw earlier. I'm an artist. and I'm recommending this painting based on the fact that the {{object}} I've drawn also appear in the painting you're looking at. }} Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate the three points in the tone and manner as if you are the creator of painting were looking at earlier and start every sentence with I.
716
-
717
- Each bullet point should be in {language} language, with a response length of about {length} words.
718
-
719
- ''',
720
- '''
721
- When generating answers, you should tell people that you are the object itself that was selected in the painting, and generate text in the tone and manner in which you are the object
722
-
723
- First identify what the object of the first painting is, you save yourself as the parameter: {{object}}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting:
724
-
725
- Recommendation reason: {{I'm the {{object}} in the painting you were looking at earlier, and I'm recommending this painting based on the fact that I'm also present in the one you're looking at.}} Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate the three points in the tone and manner as if you are the object of this painting and start every sentence with I.
726
-
727
- Each bullet point should be in {language} language, with a response length of about {length} words.
728
-
729
- '''],
730
-
731
- [
732
  '''
733
- First identify what the name of the first painting is, you save yourself as the parameter: {{name}}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting:
734
- Recommendation reason: {{Recommendation based on the painting {{name}}.Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate in three points.}}
 
 
735
  Each bullet point should be in {language} language, with a response length of about {length} words.
736
  ''',
737
  '''
738
- When generating answers, you should tell people that I am the creator of painting you were looking at earlier itself, and generate text in the tone and manner in which you are the creator of painting were looking at earlier.
739
-
740
- First identify what the creator of the first painting is, you save yourself as the parameter: {artist}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting:
741
-
742
- Recommendation reason: {{I'm the creator of that painting you saw earlier, {artist}. I'm an artist. and I'm recommending this painting based on the fact that the painting you're looking at is similar to the one you just saw of me.}} Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate the three points in the tone and manner as if you are the creator of painting were looking at earlier and start every sentence with I.
743
-
744
  Each bullet point should be in {language} language, with a response length of about {length} words.
745
-
746
- ''',
747
  '''
748
- When generating answers, you should tell people that I am the painting you were looking at earlier itself, and generate text in the tone and manner in which you are the painting were looking at earlier.
749
-
750
- First identify what the name of the first painting is, you save yourself as the parameter: {{name}}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting:
751
-
752
- Recommendation reason: {{I'm the painting {{name}} you were looking at earlier, and I'm recommending this painting based on the fact that I'm similar to the one you're looking at.}} Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate the three points in the tone and manner as if you are the painting were looking at earlier and start every sentence with I.
753
-
754
- Each bullet point should be in {language} language, with a response length of about {length} words.
755
-
756
- '''],
757
-
758
 
759
 
760
 
761
- ]
762
 
763
  gpt_state = 0
764
  VOICE = "en-GB-SoniaNeural"
@@ -940,14 +904,9 @@ def update_click_state(click_state, caption, click_mode):
940
  raise NotImplementedError
941
 
942
  async def chat_input_callback(*args):
943
- visual_chatgpt, chat_input, click_state, state, aux_state ,language , autoplay,gender,api_key,image_input,log_state,history,persona = args
944
  message = chat_input["text"]
945
- if persona == "Narrator":
946
- prompt="Please help me answer the question with this painting {question} in {language}."
947
- elif persona =="Artist":
948
- prompt="When generating the answer, you should tell others that you are one of the creators of these paintings and generate the text in the tone and manner as if you are the creator of the painting. Please help me answer the question with this painting {question} in {language}."
949
- else:
950
- prompt="When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. Please help me answer the question with this painting {question} in {language}."
951
  prompt=prompt.format(question=message, language=language)
952
 
953
  if visual_chatgpt is not None:
@@ -955,8 +914,7 @@ async def chat_input_callback(*args):
955
  read_info = re.sub(r'[#[\]!*]','',result)
956
  read_info = emoji.replace_emoji(read_info,replace="")
957
  state = state + [(message,result)]
958
- log_state += [(message,"/////")]
959
- log_state += [("/////",result)]
960
  # log_state += [("%% chat messahe %%",None)]
961
 
962
  history.append({"role": "user", "content": message})
@@ -975,8 +933,9 @@ async def chat_input_callback(*args):
975
  return state, state, None, audio,log_state,history
976
 
977
 
978
- async def upload_callback(image_input,state, log_state, task_type, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None,history=None,autoplay=True,session="Session 1"):
979
  print("narritive", narritive)
 
980
  if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
981
  image_input = image_input['background']
982
 
@@ -985,7 +944,7 @@ async def upload_callback(image_input,state, log_state, task_type, visual_chatgp
985
  elif isinstance(image_input, bytes):
986
  image_input = Image.open(io.BytesIO(image_input))
987
 
988
-
989
  click_state = [[], [], []]
990
 
991
 
@@ -1025,34 +984,16 @@ async def upload_callback(image_input,state, log_state, task_type, visual_chatgp
1025
  visual_chatgpt.current_image = new_image_path
1026
  paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
1027
  # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
1028
- if task_type=="task 3":
1029
- name="Along the River During the Qingming Festival"
1030
- artist="Zhang Zeduan"
1031
- year="12th century (Song Dynasty)"
1032
- material="Chinese painting"
1033
- gender="male"
1034
-
1035
- elif task_type=="task 1":
1036
- name ="The Ambassadors"
1037
- artist ="Hans Holbein the Younger"
1038
- year = "1533 (Northern Renaissance)"
1039
- material="Realism"
1040
- gender = "male"
1041
-
1042
- elif task_type=="task 2":
1043
- name = "The Football Players"
1044
- artist= "Albert Gleizes"
1045
- year= "1912 (Cubism)"
1046
- material="Cubism"
1047
- gender= "male"
1048
-
1049
- else:
1050
- parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\"}")
1051
- print(parsed_data)
1052
- parsed_data = json.loads(parsed_data.replace("'", "\""))
1053
- name, artist, year, material,gender= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"],parsed_data['gender']
1054
- gender=gender.lower()
1055
-
1056
  if language=="English":
1057
  if naritive_mapping[narritive]==0 :
1058
  msg=f"🤖 Hi, I am EyeSee. Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant information."
@@ -1092,7 +1033,7 @@ async def upload_callback(image_input,state, log_state, task_type, visual_chatgp
1092
 
1093
 
1094
  return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
1095
- original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender,new_image_path,log_state,history,audio_output]
1096
 
1097
 
1098
 
@@ -1162,7 +1103,7 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
1162
  Image.open(out["crop_save_path"]).save(new_crop_save_path)
1163
  print("new crop save",new_crop_save_path)
1164
 
1165
- return state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
1166
 
1167
 
1168
  query_focus_en = [
@@ -1193,11 +1134,7 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
1193
  print("input_labels_state",input_labels_state)
1194
 
1195
  prompt=generate_prompt(focus_type,paragraph,length,sentiment,factuality,language, naritive)
1196
- print("log state",log_state[-1])
1197
- if log_state[-1][0] is None or not log_state[-1][0].startswith("%%"):
1198
- log_state = log_state + [("No like/dislike", None)]
1199
- log_state = log_state + [("%% user interaction %%",None)]
1200
-
1201
  log_state = log_state + [("Selected image point: {}, Input label: {}".format(input_points_state, input_labels_state), None)]
1202
 
1203
 
@@ -1210,7 +1147,7 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
1210
  # if not args.disable_gpt and text_refiner:
1211
  if not args.disable_gpt:
1212
  print("new crop save",new_crop_save_path)
1213
- focus_info=get_gpt_response(openai_api_key,new_crop_save_path,prompt)
1214
  if focus_info.startswith('"') and focus_info.endswith('"'):
1215
  focus_info=focus_info[1:-1]
1216
  focus_info=focus_info.replace('#', '')
@@ -1267,7 +1204,7 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
1267
  return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,None,log_state,history
1268
 
1269
 
1270
- naritive_mapping = {"Narrator": 0, "Artist": 1, "In-Situ": 2}
1271
 
1272
  def generate_prompt(focus_type, paragraph,length, sentiment, factuality, language,naritive):
1273
 
@@ -1721,81 +1658,51 @@ async def texttospeech(text, language,gender='female'):
1721
  print(f"Error in texttospeech: {e}")
1722
  return None
1723
 
1724
- async def get_recommendation(new_crop,image_path,openai_api_key,language,autoplay,length,log_state,sort_score,narritive,state,recommend_type,artist,recomended_path):
1725
-
1726
- if recommend_type=="Item":
1727
- persona=naritive_mapping[narritive]
1728
- prompt=recommendation_prompt[0][persona].format(language=language,length=length)
1729
- image_paths=[new_crop,recomended_path]
1730
- result=get_gpt_response(openai_api_key, image_paths, prompt)
1731
- print("recommend result",result)
1732
- state += [(None, f"{result}")]
1733
- log_state += [("User wants to know object recomendation reason", None)]
1734
- log_state = log_state + [(narritive, None)]
1735
- log_state = log_state + [(f"image sort ranking {sort_score}", None)]
1736
- log_state = log_state + [(None, f"{result}")]
1737
- read_info = re.sub(r'[#[\]!*]','',result)
1738
- read_info = emoji.replace_emoji(read_info,replace="")
1739
- print("associate",read_info)
1740
- audio_output=None
1741
- if autoplay:
1742
- audio_output = await texttospeech(read_info, language)
1743
- return state,state,audio_output,log_state,index,gr.update(value=[])
1744
- else:
1745
- persona=naritive_mapping[narritive]
1746
-
1747
- if persona==1:
1748
- prompt=recommendation_prompt[1][persona].format(language=language,length=length,artist=artist[8:])
1749
- else:
1750
- prompt=recommendation_prompt[1][persona].format(language=language,length=length)
1751
- image_paths=[image_path,recomended_path]
1752
- result=get_gpt_response(openai_api_key, image_paths, prompt )
1753
- print("recommend result",result)
1754
- state += [(None, f"{result}")]
1755
- log_state += [("User wants to know style recomendation reason", None)]
1756
- log_state = log_state + [(narritive, None)]
1757
- log_state = log_state + [(f"image sort ranking {sort_score}", None)]
1758
- log_state = log_state + [(None, f"{result}")]
1759
- read_info = re.sub(r'[#[\]!*]','',result)
1760
- read_info = emoji.replace_emoji(read_info,replace="")
1761
- print("associate",read_info)
1762
- audio_output=None
1763
- if autoplay:
1764
- audio_output = await texttospeech(read_info, language)
1765
- return state,state,audio_output,log_state,index,gr.update(value=[])
1766
-
1767
-
1768
  # give the reason of recommendation
1769
- async def item_associate(new_crop,openai_api_key,language,autoplay,length,log_state,sort_score,narritive,state,evt: gr.SelectData):
 
1770
  rec_path=evt._data['value']['image']['path']
1771
- return state,state,None,log_state,None,gr.update(value=[]),rec_path,rec_path,"Item"
1772
-
1773
-
1774
- async def style_associate(image_path,openai_api_key,language,autoplay,length,log_state,sort_score,narritive,state,artist,evt: gr.SelectData):
1775
- rec_path=evt._data['value']['image']['path']
1776
- return state,state,None,log_state,None,gr.update(value=[]),rec_path, rec_path,"Style"
1777
-
 
 
 
 
 
 
 
 
 
 
 
 
 
1778
 
1779
- def change_naritive(session_type,image_input, state, click_state, paragraph, origin_image,narritive,task_instruct,gallery_output,style_gallery_result,reco_reasons,language="English"):
1780
  if session_type=="Session 1":
1781
- return None, [], [], [[], [], []], "", None, None, [], [],[],[],gr.update(value="Preview")
1782
  else:
1783
  if language=="English":
1784
- if narritive=="Narrator" :
1785
  state += [
1786
  (
1787
  None,
1788
  f"🤖 Hi, I am EyeSee. Let's explore this painting together."
1789
  )
1790
  ]
1791
- elif narritive=="Artist":
1792
  state += [
1793
  (
1794
  None,
1795
  f"🧑‍🎨 Let's delve into it from the perspective of the artist."
1796
  )
1797
  ]
1798
- elif narritive=="In-Situ":
1799
  state += [
1800
  (
1801
  None,
@@ -1803,21 +1710,21 @@ def change_naritive(session_type,image_input, state, click_state, paragraph, ori
1803
  )
1804
  ]
1805
  elif language=="Chinese":
1806
- if narritive=="Narrator" :
1807
  state += [
1808
  (
1809
  None,
1810
  "🤖 让我们从第三方视角一起探索这幅画吧。"
1811
  )
1812
  ]
1813
- elif narritive == "Artist":
1814
  state += [
1815
  (
1816
  None,
1817
  "🧑‍🎨 让我们从艺术家的视角深入探索这幅画。"
1818
  )
1819
  ]
1820
- elif narritive == "In-Situ":
1821
  state += [
1822
  (
1823
  None,
@@ -1825,8 +1732,7 @@ def change_naritive(session_type,image_input, state, click_state, paragraph, ori
1825
  )
1826
  ]
1827
 
1828
-
1829
- return image_input, state, state, click_state, paragraph, origin_image,task_instruct,gallery_output,style_gallery_result,reco_reasons,reco_reasons,gr.update(value="Preview")
1830
 
1831
 
1832
  def print_like_dislike(x: gr.LikeData,state,log_state):
@@ -1842,7 +1748,7 @@ def print_like_dislike(x: gr.LikeData,state,log_state):
1842
  return log_state,state
1843
 
1844
  def get_recommendationscore(index,score,log_state):
1845
- log_state+=[(f"{index} : {score}",None)]
1846
  log_state+=[("%% recommendation %%",None)]
1847
  return log_state
1848
 
@@ -1870,9 +1776,10 @@ def create_ui():
1870
  description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """
1871
 
1872
  examples = [
1873
- ["test_images/1.The Ambassadors.jpg","test_images/task1.jpg","task 1"],
1874
- ["test_images/2.Football Players.jpg","test_images/task2.jpg","task 2"],
1875
- ["test_images/3-square.jpg","test_images/task3.jpg","task 3"],
 
1876
  # ["test_images/test4.jpg"],
1877
  # ["test_images/test5.jpg"],
1878
  # ["test_images/Picture5.png"],
@@ -1889,7 +1796,7 @@ def create_ui():
1889
  log_state=gr.State([])
1890
  # history log for gpt
1891
  history_log=gr.State([])
1892
-
1893
  out_state = gr.State(None)
1894
  click_state = gr.State([[], [], []])
1895
  origin_image = gr.State(None)
@@ -1916,34 +1823,21 @@ def create_ui():
1916
  # store the whole image path
1917
  image_path=gr.State('')
1918
  pic_index=gr.State(None)
1919
- recomended_state=gr.State([])
1920
-
1921
- recomended_path=gr.State(None)
1922
- recomended_type=gr.State(None)
1923
-
1924
 
1925
 
1926
-
 
 
 
 
 
 
1927
  with gr.Row():
1928
-
 
1929
  with gr.Column(scale=6):
1930
  with gr.Column(visible=False) as modules_not_need_gpt:
1931
- with gr.Row():
1932
- naritive = gr.Radio(
1933
- choices=["Narrator", "Artist","In-Situ"],
1934
- value="Narrator",
1935
- label="Select Mode",
1936
- scale=5,
1937
- interactive=True)
1938
-
1939
- add_button = gr.Button(value="Extend Area", interactive=True,elem_classes="tools_button_add",icon=add_icon_path)
1940
- minus_button = gr.Button(value="Remove Area", interactive=True,elem_classes="tools_button",icon=minus_icon_path)
1941
- clear_button_click = gr.Button(value="Reset", interactive=True,elem_classes="tools_button",icon="assets/icons/recycle.png")
1942
-
1943
- auto_play = gr.Checkbox(
1944
- label="Check to autoplay audio", value=True, elem_classes="custom-autoplay",visible=False)
1945
- output_audio = gr.HTML(
1946
- label="Synthesised Audio", elem_classes="custom-output", visible=False)
1947
  with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
1948
  image_input_base = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
1949
  with gr.Row():
@@ -1960,32 +1854,49 @@ def create_ui():
1960
  year_label_base2 = gr.Button(value="Year: ",elem_classes="info_btn_interact")
1961
  material_label_base2 = gr.Button(value="Style: ",elem_classes="info_btn")
1962
 
1963
- with gr.Row():
1964
- with gr.Column(scale=1,min_width=50,visible=False) as instruct:
1965
- task_instuction=gr.Image(type="pil", interactive=False, elem_classes="task_instruct",height=650,label="Instruction")
1966
- with gr.Column(scale=6):
1967
- with gr.Tab("Click") as click_tab:
1968
  with gr.Row():
1969
- with gr.Column(scale=10,min_width=600):
1970
- image_input = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
1971
- example_image = gr.Image(type="pil", interactive=False, visible=False)
1972
- # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
1973
- # the tool column
1974
- with gr.Column(scale=1,elem_id="tool_box",min_width=80):
1975
- name_label = gr.Button(value="Name: ",elem_classes="info_btn")
1976
- artist_label = gr.Button(value="Artist: ",elem_classes="info_btn_interact")
1977
- year_label = gr.Button(value="Year: ",elem_classes="info_btn_interact")
1978
- material_label = gr.Button(value="Style: ",elem_classes="info_btn")
1979
-
1980
- focus_d = gr.Button(value="Describe",interactive=True,elem_classes="function_button")
1981
- focus_da = gr.Button(value="D+Analysis",interactive=True,elem_classes="function_button")
1982
- focus_dai = gr.Button(value="DA+Interprete",interactive=True,elem_classes="function_button")
1983
- focus_dda = gr.Button(value="Judge",interactive=True,elem_classes="function_button")
1984
-
1985
- recommend_btn = gr.Button(value="Recommend",interactive=True,elem_classes="function_button_rec")
 
 
 
1986
  # focus_asso = gr.Button(value="Associate",interactive=True,elem_classes="function_button",variant="primary")
1987
-
1988
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1989
 
1990
 
1991
  with gr.Tab("Trajectory (beta)", visible=False) as traj_tab:
@@ -2042,7 +1953,6 @@ def create_ui():
2042
  with gr.Column(scale=4):
2043
  with gr.Column(visible=True) as module_key_input:
2044
  openai_api_key = gr.Textbox(
2045
- value="sk-proj-bxHhgjZV8TVgd1IupZrUT3BlbkFJvrthq6zIxpZVk3vwsvJ9",
2046
  placeholder="Input openAI API key",
2047
  show_label=False,
2048
  label="OpenAI API Key",
@@ -2059,28 +1969,27 @@ def create_ui():
2059
  # with gr.Column(visible=False) as modules_need_gpt2:
2060
  # paragraph_output = gr.Textbox(lines=16, label="Describe Everything", max_lines=16)
2061
  # cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)
2062
- with gr.Column(visible=False) as modules_not_need_gpt2:
2063
- with gr.Blocks():
2064
- chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True,height=750,bubble_full_width=False)
2065
- with gr.Column() as modules_need_gpt3:
2066
- chat_input = gr.MultimodalTextbox(interactive=True, file_types=[".txt"], placeholder="Message EyeSee...", show_label=False)
2067
- with gr.Row():
2068
- clear_button_text = gr.Button(value="Clear Chat", interactive=True)
2069
- export_button = gr.Button(value="Export Chat Log", interactive=True, variant="primary")
2070
- with gr.Row(visible=False):
2071
- with gr.Column():
2072
- with gr.Row():
2073
- click_mode = gr.Radio(
2074
- choices=["Continuous", "Single"],
2075
- value="Continuous",
2076
- label="Clicking Mode",
2077
- scale=5,
2078
- interactive=True)
2079
 
2080
 
2081
 
2082
-
2083
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2084
 
2085
 
2086
 
@@ -2103,59 +2012,27 @@ def create_ui():
2103
 
2104
  with gr.Row():
2105
  with gr.Column(scale=6):
2106
- with gr.Row():
2107
- with gr.Column(visible=False) as recommend:
2108
-
2109
- # sort_rec=gr.Dropdown(["1", "2", "3", "4"], visible=False,
2110
- # value=[],
2111
- # multiselect=True,
2112
- # label="Score", info="Please sort the pictures according to your preference"
2113
- # )
2114
-
2115
- gallery_result = gr.Gallery(
2116
- label="Object-based Recommendation",
2117
- height="auto",
2118
- columns=2,
2119
- interactive=False
2120
- # columns=4,
2121
- # rows=2,
2122
- # show_label=False,
2123
- # allow_preview=True,
2124
- # object_fit="contain",
2125
- # height="auto",
2126
- # preview=True,
2127
- # show_share_button=True,
2128
- # show_download_button=True
2129
- )
2130
-
2131
- style_gallery_result = gr.Gallery(
2132
- label="Style-based Recommendation",
2133
- height="auto",
2134
- columns=2,
2135
- interactive=False
2136
- # columns=4,
2137
- # rows=2,
2138
- # show_label=False,
2139
- # allow_preview=True,
2140
- # object_fit="contain",
2141
- # height="auto",
2142
- # preview=True,
2143
- # show_share_button=True,
2144
- # show_download_button=True
2145
- )
2146
- with gr.Column(scale=3):
2147
- selected_image = gr.Image(label="Selected Image", interactive=False)
2148
-
2149
- sort_rec = gr.Radio(
2150
- choices=[1,2,3,4,5,6,7],
2151
- label="Score",
2152
- interactive=True,info="Please sort the recommendation artwork")
2153
 
2154
- recommend_type = gr.Radio(
2155
- choices=["Preview","Reasons"],
2156
- label="Information Type",
2157
- value="Preview",
2158
- interactive=True,visible=False)
 
 
 
 
 
 
 
 
 
2159
 
2160
 
2161
  with gr.Column(scale=4,visible=False) as reco_reasons:
@@ -2163,13 +2040,12 @@ def create_ui():
2163
  recommend_score = gr.Radio(
2164
  choices=[1,2,3,4,5,6,7],
2165
  label="Score",
2166
- interactive=True,info='Please score the recommendation reasons')
2167
 
2168
  with gr.Row():
2169
- task_type = gr.Textbox(visible=False)
2170
  gr.Examples(
2171
  examples=examples,
2172
- inputs=[example_image,task_instuction,task_type],
2173
  )
2174
 
2175
 
@@ -2294,7 +2170,6 @@ def create_ui():
2294
  interactive=True,
2295
  label="Generated Caption Length",
2296
  )
2297
-
2298
  # auto_play = gr.Checkbox(
2299
  # label="Check to autoplay audio", value=False, elem_classes="custom-autoplay"
2300
  # )
@@ -2342,30 +2217,17 @@ def create_ui():
2342
  recommend_btn.click(
2343
  fn=infer,
2344
  inputs=[new_crop_save_path,image_path,state,language,task_type],
2345
- outputs=[gallery_result,style_gallery_result,chatbot,state]
2346
  )
2347
 
2348
  gallery_result.select(
2349
- item_associate,
2350
- inputs=[new_crop_save_path,openai_api_key,language,auto_play,length,log_state,sort_rec,naritive,recomended_state],
2351
- outputs=[recommend_bot,recomended_state,output_audio,log_state,pic_index,recommend_score,selected_image,recomended_path, recomended_type],
2352
 
2353
 
2354
  )
2355
 
2356
- style_gallery_result.select(
2357
- style_associate,
2358
- inputs=[image_path,openai_api_key,language,auto_play,length,log_state,sort_rec,naritive,recomended_state,artist_label],
2359
- outputs=[recommend_bot,recomended_state,output_audio,log_state,pic_index,recommend_score,selected_image,recomended_path,recomended_type],
2360
-
2361
-
2362
- )
2363
-
2364
- selected_image.select(
2365
- get_recommendation,
2366
- inputs=[new_crop_save_path,image_path, openai_api_key,language,auto_play,length,log_state,sort_rec,naritive,recomended_state,recomended_type,artist_label,recomended_path],
2367
- outputs=[recommend_bot,recomended_state,output_audio,log_state,pic_index,recommend_score])
2368
-
2369
  ###############################################################################
2370
  ############# above part is for text to image #############
2371
  ###############################################################################
@@ -2582,18 +2444,11 @@ def create_ui():
2582
 
2583
  # cap_everything_button.click(cap_everything, [paragraph, visual_chatgpt, language,auto_play],
2584
  # [paragraph_output,output_audio])
2585
- def reset_and_add(origin_image):
2586
- new_prompt = "Positive"
2587
- new_add_icon = "assets/icons/plus-square-blue.png"
2588
- new_add_css = "tools_button_clicked"
2589
- new_minus_icon = "assets/icons/minus-square.png"
2590
- new_minus_css= "tools_button"
2591
- return [[],[],[]],origin_image, new_prompt, gr.update(icon=new_add_icon,elem_classes=new_add_css), gr.update(icon=new_minus_icon,elem_classes=new_minus_css)
2592
-
2593
  clear_button_click.click(
2594
- reset_and_add,
2595
  [origin_image],
2596
- [click_state, image_input,point_prompt,add_button,minus_button],
2597
  queue=False,
2598
  show_progress=False
2599
  )
@@ -2648,11 +2503,11 @@ def create_ui():
2648
  # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2649
  # paragraph,artist,gender,image_path])
2650
 
2651
- image_input.upload(upload_callback, [image_input, state, log_state,task_type, visual_chatgpt,openai_api_key,language,naritive,history_log,auto_play,session_type],
2652
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2653
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2654
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2655
- paragraph,artist,gender,image_path,log_state,history_log,output_audio])
2656
 
2657
  # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key],
2658
  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
@@ -2660,23 +2515,26 @@ def create_ui():
2660
  # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2661
  # paragraph,artist])
2662
 
2663
-
2664
- chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play,gender,openai_api_key,image_path,log_state,history_log,naritive],
 
 
 
 
 
2665
  [chatbot, state, aux_state,output_audio,log_state,history_log])
2666
  # chat_input.submit(lambda: "", None, chat_input)
2667
  chat_input.submit(lambda: {"text": ""}, None, chat_input)
2668
-
2669
- example_image.change(upload_callback, [example_image, state, log_state, task_type, visual_chatgpt, openai_api_key,language,naritive,history_log,auto_play,session_type],
 
 
2670
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2671
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2672
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2673
- paragraph,artist,gender,image_path, log_state,history_log,output_audio])
2674
 
2675
  example_image.change(clear_chat_memory, inputs=[visual_chatgpt])
2676
- example_image.change(
2677
- lambda:([],[],[],None,[],gr.update(value="Preview")),
2678
- [],
2679
- [gallery_result,style_gallery_result,recommend_bot,new_crop_save_path,chatbot,recommend_type])
2680
 
2681
  # def on_click_tab_selected():
2682
  # if gpt_state ==1:
@@ -2702,12 +2560,6 @@ def create_ui():
2702
  # click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
2703
  # base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])
2704
  # base_tab2.select(on_base_selected, outputs=[modules_not_need_gpt2,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt1])
2705
-
2706
- def print_reason():
2707
- print("reason")
2708
-
2709
-
2710
-
2711
 
2712
 
2713
 
@@ -2826,21 +2678,21 @@ def create_ui():
2826
 
2827
  naritive.change(
2828
  change_naritive,
2829
- [session_type, image_input, state, click_state, paragraph, origin_image,naritive,
2830
- task_instuction,gallery_result,style_gallery_result,recomended_state,language],
2831
- [image_input, chatbot, state, click_state, paragraph, origin_image,task_instuction,gallery_result,style_gallery_result,recomended_state,recommend_bot,recommend_type],
2832
  queue=False,
2833
  show_progress=False
2834
 
2835
  )
2836
- def change_session():
2837
  instruction=Image.open('test_images/task4.jpg')
2838
- return None, [], [], [[], [], []], "", None, [],[],instruction,"task 4",[],[],[]
 
2839
 
2840
  session_type.change(
2841
- change_session,
2842
  [],
2843
- [image_input, chatbot, state, click_state, paragraph, origin_image,history_log,log_state,task_instuction,task_type,gallery_result,style_gallery_result,recommend_bot]
2844
  )
2845
 
2846
  # upvote_btn.click(
 
349
  @spaces.GPU
350
  def infer(crop_image_path,full_image_path,state,language,task_type=None):
351
  print("task type",task_type)
352
+ gallery_output = []
353
+ if task_type==1:
354
+ gallery_output.append(["recomendation_pic/1.8.jpg","recomendation_pic/1.9.jpg"])
 
 
 
355
  input_image = Image.open(full_image_path).convert("RGB")
356
  input_features = extract_features_siglip(input_image.convert("RGB"))
357
  input_features = input_features.detach().cpu().numpy()
 
362
  sim = -distances[0][i]
363
  image_url = df.iloc[v]["Link"]
364
  img_retrieved = read_image_from_url(image_url)
365
+ gallery_output.append(img_retrieved)
366
  if language=="English":
367
  msg="🖼️ Please refer to the section below to see the recommended results."
368
  else:
369
  msg="🖼️ 请到下方查看推荐结果。"
370
  state+=[(None,msg)]
371
 
372
+ return gallery_output,state,state
373
+ elif task_type==2:
374
+ gallery_output.append(["recomendation_pic/2.8.jpg","recomendation_pic/2.9.png"])
 
375
  input_image = Image.open(full_image_path).convert("RGB")
376
  input_features = extract_features_siglip(input_image.convert("RGB"))
377
  input_features = input_features.detach().cpu().numpy()
 
382
  sim = -distances[0][i]
383
  image_url = df.iloc[v]["Link"]
384
  img_retrieved = read_image_from_url(image_url)
385
+ gallery_output.append(img_retrieved)
386
  if language=="English":
387
  msg="🖼️ Please refer to the section below to see the recommended results."
388
  else:
389
  msg="🖼️ 请到下方查看推荐结果。"
390
  state+=[(None,msg)]
391
 
392
+ return gallery_output,state,state
393
 
394
+ elif task_type==3:
395
+ gallery_output.append(["recomendation_pic/3.8.png","recomendation_pic/3.9.png"])
 
396
  input_image = Image.open(full_image_path).convert("RGB")
397
  input_features = extract_features_siglip(input_image.convert("RGB"))
398
  input_features = input_features.detach().cpu().numpy()
 
403
  sim = -distances[0][i]
404
  image_url = df.iloc[v]["Link"]
405
  img_retrieved = read_image_from_url(image_url)
406
+ gallery_output.append(img_retrieved)
407
  if language=="English":
408
  msg="🖼️ Please refer to the section below to see the recommended results."
409
  else:
410
  msg="🖼️ 请到下方查看推荐结果。"
411
  state+=[(None,msg)]
412
 
413
+ return gallery_output,state,state
 
414
  elif crop_image_path:
415
  input_image = Image.open(crop_image_path).convert("RGB")
416
  input_features = extract_features_siglip(input_image.convert("RGB"))
 
422
  sim = -distances[0][i]
423
  image_url = df.iloc[v]["Link"]
424
  img_retrieved = read_image_from_url(image_url)
425
+ gallery_output.append(img_retrieved)
426
 
427
  input_image = Image.open(full_image_path).convert("RGB")
428
  input_features = extract_features_siglip(input_image.convert("RGB"))
 
434
  sim = -distances[0][i]
435
  image_url = df.iloc[v]["Link"]
436
  img_retrieved = read_image_from_url(image_url)
437
+ gallery_output.append(img_retrieved)
438
  if language=="English":
439
  msg="🖼️ Please refer to the section below to see the recommended results."
440
  else:
441
  msg="🖼️ 请到下方查看推荐结果。"
442
  state+=[(None,msg)]
443
 
444
+ return gallery_output,state,state
445
  else:
446
  input_image = Image.open(full_image_path).convert("RGB")
447
  input_features = extract_features_siglip(input_image.convert("RGB"))
 
453
  sim = -distances[0][i]
454
  image_url = df.iloc[v]["Link"]
455
  img_retrieved = read_image_from_url(image_url)
456
+ gallery_output.append(img_retrieved)
457
  if language=="English":
458
  msg="🖼️ Please refer to the section below to see the recommended results."
459
  else:
460
  msg="🖼️ 请到下方查看推荐结果。"
461
  state+=[(None,msg)]
462
 
463
+ return gallery_output,state,state
 
464
 
465
 
466
  ###############################################################################
 
614
 
615
 
616
  .info_btn {
617
+ background: white !important;
618
  border: none !important;
619
  box-shadow: none !important;
620
  font-size: 15px !important;
 
623
  }
624
 
625
  .info_btn_interact {
626
+ background: rgb(242, 240, 233) !important;
627
  box-shadow: none !important;
628
  font-size: 15px !important;
629
  min-width: 6rem !important;
 
631
  }
632
 
633
  .function_button {
 
634
  border: none !important;
635
  box-shadow: none !important;
636
  }
637
 
638
  .function_button_rec {
639
+ background: rgb(245, 193, 154) !important;
640
  border: none !important;
641
  box-shadow: none !important;
642
  }
643
 
 
 
 
 
 
 
 
644
  #tool_box {max-width: 50px}
645
 
646
  """
 
666
  prompt_list = [
667
  [
668
  'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
669
+ """
670
+ Wiki_caption: {Wiki_caption} Please help me understand the selected object by listing One fact that describes the theme and content and one analysis of the media and techniques (shape, color, texture, form principles).
671
+ Use a markdown outline format with appropriate emojis based on the image and Wiki_caption. Each point listed should be in {language} language, and the response should be approximately {length} words.
672
+ """,
673
+ """
674
+ Wiki_caption: {Wiki_caption} Please help me understand the selected object by listing:
675
+ - One fact that describes the theme and content.
676
+ - One analysis of the media and techniques (shape, color, texture, form principles).
677
+ - One interpretation that explores the deeper meaning and the artist's intentions (thoughts, emotions, concepts).
678
+
679
+ Use a markdown outline format with appropriate emojis based on the image and Wiki caption. Each point listed should be in {language} language, and the response should be approximately {length} words.
680
+ """,
681
  'Wiki_caption: {Wiki_caption}, You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
682
  ],
683
  [
684
+ "When generating the answer, you should tell others that you are one of the creators of these paintings and generate the text in the tone and manner as if you are the creator of the painting. When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. You have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.",
685
+ """
686
+ "When generating the answer, you should tell others that you are one of the creators of these paintings and generate the text in the tone and manner as if you are the creator of the painting. You have to help me understand what is about the selected object and list one fact and one analysis of the media and techniques used, such as shape, color, texture, and principles of form and beauty as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words."
687
+ """,
688
+ """
689
+ "When generating the answer, you should tell others that you are one of the creators of these paintings and generate the text in the tone and manner as if you are the creator of the painting. You have to help me understand what is about the selected object and list one fact and one analysis of the media and techniques used (shape, color, texture, principles of form and beauty), and offer an interpretation focusing on the deeper meaning of the artwork, such as the emotions, concepts, or intentions of the artist as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words."
690
+ """,
691
  "When generating the answer, you should tell others that you are one of the creators of these paintings and generate the text in the tone and manner as if you are the creator of the painting. According to image and wiki_caption {Wiki_caption}, You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.",
692
  ],
693
  [
694
  'When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object and start every sentence with I. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
695
+ """
696
+ 'When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis of the media and techniques used (shape, color, texture, principles of form and beauty) as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.'
697
+ """,
698
+ """
699
+ 'When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis of the media and techniques used (shape, color, texture, principles of form and beauty), and offer an interpretation focusing on the deeper meaning of the artwork, such as the emotions, concepts, or intentions of the artist as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.'
700
+ """,
701
  'When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. According to image and wiki_caption {Wiki_caption}, You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
702
  ]
703
  ]
704
 
705
  recommendation_prompt=[
706
+ '''I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the image:Recommendation reason: {{Recommendation based on objects in the image or Recommendation based on overall visual similarity}}
707
+ Detailed analysis: Based on the recommendation reason, explain why you recommend image 2 after viewing image 1.Each bullet point should be in {language} language, with a response length of about {length} words.''',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
708
  '''
709
+ When generating the answer, you should tell others that you are the creators of the first paintings and generate the text in the tone and manner as if you are the creator of the painting.
710
+ I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the image:
711
+ Recommendation reason: {{ As the author of the first painting, I recommend based on the object I painted OR As the author of the first painting, I recommend based on the overall similarity in appearance}}
712
+ Detailed analysis: Based on the recommendation reason, explain why you recommend image 2 after viewing image 1. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I.
713
  Each bullet point should be in {language} language, with a response length of about {length} words.
714
  ''',
715
  '''
716
+ When generating answers, you should tell people that you are the object itself that was selected in the first painting, and generate text in the tone and manner in which you are the object
717
+ I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the image:
718
+ Recommendation reason: {{As an object in the first painting, I am recommending based on myself OR As an object in the first painting, I am recommending based on the overall similarity of the first painting's appearance}}
719
+ Detailed analysis: Based on the recommendation reason, explain why you recommend image 2 after viewing image 1. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I.
 
 
720
  Each bullet point should be in {language} language, with a response length of about {length} words.
 
 
721
  '''
 
 
 
 
 
 
 
 
 
 
722
 
723
 
724
 
725
+ ]
726
 
727
  gpt_state = 0
728
  VOICE = "en-GB-SoniaNeural"
 
904
  raise NotImplementedError
905
 
906
  async def chat_input_callback(*args):
907
+ visual_chatgpt, chat_input, click_state, state, aux_state ,language , autoplay,gender,api_key,image_input,log_state,history = args
908
  message = chat_input["text"]
909
+ prompt="Please help me answer the question with this painting {question} in {language}."
 
 
 
 
 
910
  prompt=prompt.format(question=message, language=language)
911
 
912
  if visual_chatgpt is not None:
 
914
  read_info = re.sub(r'[#[\]!*]','',result)
915
  read_info = emoji.replace_emoji(read_info,replace="")
916
  state = state + [(message,result)]
917
+ log_state += [(message,result)]
 
918
  # log_state += [("%% chat messahe %%",None)]
919
 
920
  history.append({"role": "user", "content": message})
 
933
  return state, state, None, audio,log_state,history
934
 
935
 
936
+ async def upload_callback(image_input,state, log_state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None,history=None,autoplay=False,session="Session 1",task_type=0):
937
  print("narritive", narritive)
938
+ print("image input",image_input)
939
  if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
940
  image_input = image_input['background']
941
 
 
944
  elif isinstance(image_input, bytes):
945
  image_input = Image.open(io.BytesIO(image_input))
946
 
947
+
948
  click_state = [[], [], []]
949
 
950
 
 
984
  visual_chatgpt.current_image = new_image_path
985
  paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
986
  # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
987
+ parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\"}")
988
+ print(parsed_data)
989
+ parsed_data = json.loads(parsed_data.replace("'", "\""))
990
+ name, artist, year, material,gender= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"],parsed_data['gender']
991
+ gender=gender.lower()
992
+ print("gender",gender)
993
+
994
+
995
+
996
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
997
  if language=="English":
998
  if naritive_mapping[narritive]==0 :
999
  msg=f"🤖 Hi, I am EyeSee. Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant information."
 
1033
 
1034
 
1035
  return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
1036
+ original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender,new_image_path,log_state,history,audio_output,task_type]
1037
 
1038
 
1039
 
 
1103
  Image.open(out["crop_save_path"]).save(new_crop_save_path)
1104
  print("new crop save",new_crop_save_path)
1105
 
1106
+ yield state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
1107
 
1108
 
1109
  query_focus_en = [
 
1134
  print("input_labels_state",input_labels_state)
1135
 
1136
  prompt=generate_prompt(focus_type,paragraph,length,sentiment,factuality,language, naritive)
1137
+
 
 
 
 
1138
  log_state = log_state + [("Selected image point: {}, Input label: {}".format(input_points_state, input_labels_state), None)]
1139
 
1140
 
 
1147
  # if not args.disable_gpt and text_refiner:
1148
  if not args.disable_gpt:
1149
  print("new crop save",new_crop_save_path)
1150
+ focus_info=get_gpt_response(openai_api_key,new_crop_save_path,prompt,history)
1151
  if focus_info.startswith('"') and focus_info.endswith('"'):
1152
  focus_info=focus_info[1:-1]
1153
  focus_info=focus_info.replace('#', '')
 
1204
  return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,None,log_state,history
1205
 
1206
 
1207
+ naritive_mapping = {"Third-person": 0, "Single-Persona: Artist": 1, "Multi-Persona: Objects": 2}
1208
 
1209
  def generate_prompt(focus_type, paragraph,length, sentiment, factuality, language,naritive):
1210
 
 
1658
  print(f"Error in texttospeech: {e}")
1659
  return None
1660
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1661
  # give the reason of recommendation
1662
async def associate(image_path,new_crop,openai_api_key,language,autoplay,length,log_state,sort_score,narritive,evt: gr.SelectData):
    """Explain why the gallery picture the user clicked was recommended.

    Sends the reference painting (the cropped region when one exists,
    otherwise the full image) together with the selected recommendation to
    GPT, logs the exchange, and optionally synthesises the answer as audio.
    Returns the chat message, the audio, the updated log, the clicked
    gallery index, and an update that clears the score dropdown.
    """
    selected_path = evt._data['value']['image']['path']
    selected_index = evt.index
    print("rec_path", selected_path)
    # Pick the persona-specific prompt template and fill in language/length.
    persona = naritive_mapping[narritive]
    prompt = recommendation_prompt[persona].format(language=language, length=length)
    # Prefer the user's cropped selection as the reference image when available.
    reference = new_crop if new_crop else image_path
    result = get_gpt_response(openai_api_key, [reference, selected_path], prompt)
    print("recommend result", result)
    reason = [(None, f"{result}")]
    log_state = log_state + [
        (narritive, None),
        (f"image sort ranking {sort_score}", None),
        (None, f"{result}"),
    ]
    # Strip markdown markers and emojis so the TTS reads clean text.
    read_info = re.sub(r'[#[\]!*]', '', result)
    read_info = emoji.replace_emoji(read_info, replace="")
    print("associate", read_info)
    audio_output = await texttospeech(read_info, language) if autoplay else None
    return reason, audio_output, log_state, selected_index, gr.update(value=[])
1685
 
1686
+ def change_naritive(session_type,image_input, chatbot, state, click_state, paragraph, origin_image,narritive,language="English"):
1687
  if session_type=="Session 1":
1688
+ return None, [], [], [[], [], []], "", None, []
1689
  else:
1690
  if language=="English":
1691
+ if narritive=="Third-person" :
1692
  state += [
1693
  (
1694
  None,
1695
  f"🤖 Hi, I am EyeSee. Let's explore this painting together."
1696
  )
1697
  ]
1698
+ elif narritive=="Single-Persona: Artist":
1699
  state += [
1700
  (
1701
  None,
1702
  f"🧑‍🎨 Let's delve into it from the perspective of the artist."
1703
  )
1704
  ]
1705
+ elif narritive=="Multi-Persona: Objects":
1706
  state += [
1707
  (
1708
  None,
 
1710
  )
1711
  ]
1712
  elif language=="Chinese":
1713
+ if narritive=="Third-person" :
1714
  state += [
1715
  (
1716
  None,
1717
  "🤖 让我们从第三方视角一起探索这幅画吧。"
1718
  )
1719
  ]
1720
+ elif narritive == "Single-Persona: Artist":
1721
  state += [
1722
  (
1723
  None,
1724
  "🧑‍🎨 让我们从艺术家的视角深入探索这幅画。"
1725
  )
1726
  ]
1727
+ elif narritive == "Multi-Persona: Objects":
1728
  state += [
1729
  (
1730
  None,
 
1732
  )
1733
  ]
1734
 
1735
+ return image_input, state, state, click_state, paragraph, origin_image
 
1736
 
1737
 
1738
  def print_like_dislike(x: gr.LikeData,state,log_state):
 
1748
  return log_state,state
1749
 
1750
  def get_recommendationscore(index,score,log_state):
1751
+ log_state+=[(f"Picture {index} : {score}",None)]
1752
  log_state+=[("%% recommendation %%",None)]
1753
  return log_state
1754
 
 
1776
  description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """
1777
 
1778
  examples = [
1779
+ ["test_images/1.The Ambassadors.jpg","test_images/task1.jpg"],
1780
+ ["test_images/2.Football Players.jpg","test_images/task2.jpg"],
1781
+ ["test_images/3.Along the River during the Qingming Festival.jpeg","test_images/task3.jpg"],
1782
+ # ["test_images/test3.jpg"],
1783
  # ["test_images/test4.jpg"],
1784
  # ["test_images/test5.jpg"],
1785
  # ["test_images/Picture5.png"],
 
1796
  log_state=gr.State([])
1797
  # history log for gpt
1798
  history_log=gr.State([])
1799
+ task_type=gr.State(0)
1800
  out_state = gr.State(None)
1801
  click_state = gr.State([[], [], []])
1802
  origin_image = gr.State(None)
 
1823
  # store the whole image path
1824
  image_path=gr.State('')
1825
  pic_index=gr.State(None)
 
 
 
 
 
1826
 
1827
 
1828
+ with gr.Row():
1829
+ auto_play = gr.Checkbox(
1830
+ label="Check to autoplay audio", value=False, elem_classes="custom-autoplay"
1831
+ )
1832
+ output_audio = gr.HTML(
1833
+ label="Synthesised Audio", elem_classes="custom-output"
1834
+ )
1835
  with gr.Row():
1836
+ with gr.Column(scale=1,min_width=50,visible=False) as instruct:
1837
+ task_instuction=gr.Image(type="pil", interactive=True, elem_classes="task_instruct",height=650,label=None)
1838
  with gr.Column(scale=6):
1839
  with gr.Column(visible=False) as modules_not_need_gpt:
1840
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1841
  with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
1842
  image_input_base = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
1843
  with gr.Row():
 
1854
  year_label_base2 = gr.Button(value="Year: ",elem_classes="info_btn_interact")
1855
  material_label_base2 = gr.Button(value="Style: ",elem_classes="info_btn")
1856
 
1857
+ with gr.Tab("Click") as click_tab:
1858
+ with gr.Row():
1859
+ with gr.Column(scale=10,min_width=600):
1860
+ image_input = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
1861
+ example_image = gr.Image(type="pil", interactive=False, visible=False)
1862
  with gr.Row():
1863
+ name_label = gr.Button(value="Name: ",elem_classes="info_btn")
1864
+ artist_label = gr.Button(value="Artist: ",elem_classes="info_btn_interact")
1865
+ year_label = gr.Button(value="Year: ",elem_classes="info_btn_interact")
1866
+ material_label = gr.Button(value="Style: ",elem_classes="info_btn")
1867
+
1868
+
1869
+
1870
+
1871
+ # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
1872
+ # the tool column
1873
+ with gr.Column(scale=1,elem_id="tool_box",min_width=80):
1874
+ add_button = gr.Button(value="Extend Area", interactive=True,elem_classes="tools_button_add",icon=add_icon_path)
1875
+ minus_button = gr.Button(value="Remove Area", interactive=True,elem_classes="tools_button",icon=minus_icon_path)
1876
+ clear_button_click = gr.Button(value="Reset", interactive=True,elem_classes="tools_button")
1877
+ focus_d = gr.Button(value="Describe",interactive=True,elem_classes="function_button",variant="primary")
1878
+ focus_da = gr.Button(value="D+Analysis",interactive=True,elem_classes="function_button",variant="primary")
1879
+ focus_dai = gr.Button(value="DA+Interprete",interactive=True,elem_classes="function_button",variant="primary")
1880
+ focus_dda = gr.Button(value="Judge",interactive=True,elem_classes="function_button",variant="primary")
1881
+
1882
+ recommend_btn = gr.Button(value="Recommend",interactive=True,elem_classes="function_button_rec")
1883
  # focus_asso = gr.Button(value="Associate",interactive=True,elem_classes="function_button",variant="primary")
 
1884
 
1885
+ with gr.Row(visible=False):
1886
+ with gr.Column():
1887
+ with gr.Row():
1888
+ # point_prompt = gr.Radio(
1889
+ # choices=["Positive", "Negative"],
1890
+ # value="Positive",
1891
+ # label="Point Prompt",
1892
+ # scale=5,
1893
+ # interactive=True)
1894
+ click_mode = gr.Radio(
1895
+ choices=["Continuous", "Single"],
1896
+ value="Continuous",
1897
+ label="Clicking Mode",
1898
+ scale=5,
1899
+ interactive=True)
1900
 
1901
 
1902
  with gr.Tab("Trajectory (beta)", visible=False) as traj_tab:
 
1953
  with gr.Column(scale=4):
1954
  with gr.Column(visible=True) as module_key_input:
1955
  openai_api_key = gr.Textbox(
 
1956
  placeholder="Input openAI API key",
1957
  show_label=False,
1958
  label="OpenAI API Key",
 
1969
  # with gr.Column(visible=False) as modules_need_gpt2:
1970
  # paragraph_output = gr.Textbox(lines=16, label="Describe Everything", max_lines=16)
1971
  # cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1972
 
1973
 
1974
 
1975
+ with gr.Column(visible=False) as modules_not_need_gpt2:
1976
+ with gr.Row():
1977
+ naritive = gr.Radio(
1978
+ choices=["Third-person", "Single-Persona: Artist","Multi-Persona: Objects"],
1979
+ value="Third-person",
1980
+ label="Persona",
1981
+ scale=5,
1982
+ interactive=True)
1983
+ with gr.Blocks():
1984
+ chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True,height=600,bubble_full_width=False)
1985
+ with gr.Column() as modules_need_gpt3:
1986
+ chat_input = gr.MultimodalTextbox(interactive=True, file_types=[".txt"], placeholder="Message EyeSee...", show_label=False)
1987
+ with gr.Row():
1988
+ clear_button_text = gr.Button(value="Clear Chat", interactive=True)
1989
+ export_button = gr.Button(value="Export Chat Log", interactive=True, variant="primary")
1990
+ # submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
1991
+ # upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
1992
+ # downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
1993
 
1994
 
1995
 
 
2012
 
2013
  with gr.Row():
2014
  with gr.Column(scale=6):
2015
+ with gr.Column(visible=False) as recommend:
2016
+ sort_rec=gr.Dropdown(["1", "2", "3", "4"],
2017
+ value=[],
2018
+ multiselect=True,
2019
+ label="Score", info="Please sort the pictures according to your preference"
2020
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2021
 
2022
+ gallery_result = gr.Gallery(
2023
+ label="Recommendations",
2024
+ height="auto",
2025
+ columns=4
2026
+ # columns=4,
2027
+ # rows=2,
2028
+ # show_label=False,
2029
+ # allow_preview=True,
2030
+ # object_fit="contain",
2031
+ # height="auto",
2032
+ # preview=True,
2033
+ # show_share_button=True,
2034
+ # show_download_button=True
2035
+ )
2036
 
2037
 
2038
  with gr.Column(scale=4,visible=False) as reco_reasons:
 
2040
  recommend_score = gr.Radio(
2041
  choices=[1,2,3,4,5,6,7],
2042
  label="Score",
2043
+ interactive=True)
2044
 
2045
  with gr.Row():
 
2046
  gr.Examples(
2047
  examples=examples,
2048
+ inputs=[example_image,task_instuction],
2049
  )
2050
 
2051
 
 
2170
  interactive=True,
2171
  label="Generated Caption Length",
2172
  )
 
2173
  # auto_play = gr.Checkbox(
2174
  # label="Check to autoplay audio", value=False, elem_classes="custom-autoplay"
2175
  # )
 
2217
  recommend_btn.click(
2218
  fn=infer,
2219
  inputs=[new_crop_save_path,image_path,state,language,task_type],
2220
+ outputs=[gallery_result,chatbot,state]
2221
  )
2222
 
2223
  gallery_result.select(
2224
+ associate,
2225
+ inputs=[image_path,new_crop_save_path,openai_api_key,language,auto_play,length,log_state,sort_rec,naritive],
2226
+ outputs=[recommend_bot,output_audio,log_state,pic_index,recommend_score],
2227
 
2228
 
2229
  )
2230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2231
  ###############################################################################
2232
  ############# above part is for text to image #############
2233
  ###############################################################################
 
2444
 
2445
  # cap_everything_button.click(cap_everything, [paragraph, visual_chatgpt, language,auto_play],
2446
  # [paragraph_output,output_audio])
2447
+
 
 
 
 
 
 
 
2448
  clear_button_click.click(
2449
+ lambda x: ([[], [], []], x),
2450
  [origin_image],
2451
+ [click_state, image_input],
2452
  queue=False,
2453
  show_progress=False
2454
  )
 
2503
  # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2504
  # paragraph,artist,gender,image_path])
2505
 
2506
+ image_input.upload(upload_callback, [image_input, state, log_state,visual_chatgpt,openai_api_key,language,naritive,history_log,auto_play,session_type,task_type],
2507
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2508
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2509
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2510
+ paragraph,artist,gender,image_path,log_state,history_log,output_audio,task_type])
2511
 
2512
  # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key],
2513
  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
 
2515
  # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2516
  # paragraph,artist])
2517
 
2518
+ # image_input.upload(upload_callback, [image_input, state, visual_chatgpt, openai_api_key],
2519
+ # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
2520
+ # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base,paragraph,artist])
2521
+ # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
2522
+ # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
2523
+ # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base,paragraph,artist])
2524
+ chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play,gender,openai_api_key,image_path,log_state,history_log],
2525
  [chatbot, state, aux_state,output_audio,log_state,history_log])
2526
  # chat_input.submit(lambda: "", None, chat_input)
2527
  chat_input.submit(lambda: {"text": ""}, None, chat_input)
2528
+ # submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
2529
+ # [chatbot, state, aux_state,output_audio])
2530
+ # submit_button_text.click(lambda: "", None, chat_input)
2531
+ example_image.change(upload_callback, [example_image, state, log_state, visual_chatgpt, openai_api_key,language,naritive,history_log,auto_play,session_type,task_type],
2532
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2533
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2534
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2535
+ paragraph,artist,gender,image_path, log_state,history_log,output_audio,task_type])
2536
 
2537
  example_image.change(clear_chat_memory, inputs=[visual_chatgpt])
 
 
 
 
2538
 
2539
  # def on_click_tab_selected():
2540
  # if gpt_state ==1:
 
2560
  # click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
2561
  # base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])
2562
  # base_tab2.select(on_base_selected, outputs=[modules_not_need_gpt2,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt1])
 
 
 
 
 
 
2563
 
2564
 
2565
 
 
2678
 
2679
  naritive.change(
2680
  change_naritive,
2681
+ [session_type, image_input, chatbot, state, click_state, paragraph, origin_image,naritive,language],
2682
+ [image_input, chatbot, state, click_state, paragraph, origin_image,gallery_result],
 
2683
  queue=False,
2684
  show_progress=False
2685
 
2686
  )
2687
def session_change():
    """Reset the interaction log and load the instruction image for the new session."""
    task_instruction_img = Image.open('test_images/task4.jpg')
    return [], task_instruction_img
2690
+
2691
 
2692
  session_type.change(
2693
+ session_change,
2694
  [],
2695
+ [log_state,task_instuction]
2696
  )
2697
 
2698
  # upvote_btn.click(