Niki Zhang committed on
Commit 235ef66 · verified · 1 Parent(s): 978648d

Update app.py

Files changed (1):
  1. app.py  +374 −208

app.py CHANGED
@@ -349,9 +349,12 @@ def extract_features_siglip(image):
 @spaces.GPU
 def infer(crop_image_path,full_image_path,state,language,task_type=None):
     print("task type",task_type)
-    gallery_output = []
-    if task_type==1:
-        gallery_output.append(["recomendation_pic/1.8.jpg","recomendation_pic/1.9.jpg"])
         input_image = Image.open(full_image_path).convert("RGB")
         input_features = extract_features_siglip(input_image.convert("RGB"))
         input_features = input_features.detach().cpu().numpy()
@@ -362,16 +365,17 @@ def infer(crop_image_path,full_image_path,state,language,task_type=None):
             sim = -distances[0][i]
             image_url = df.iloc[v]["Link"]
             img_retrieved = read_image_from_url(image_url)
-            gallery_output.append(img_retrieved)
         if language=="English":
             msg="🖼️ Please refer to the section below to see the recommended results."
         else:
             msg="🖼️ 请到下方查看推荐结果。"
         state+=[(None,msg)]

-        return gallery_output,state,state
-    elif task_type==2:
-        gallery_output.append(["recomendation_pic/2.8.jpg","recomendation_pic/2.9.png"])
         input_image = Image.open(full_image_path).convert("RGB")
         input_features = extract_features_siglip(input_image.convert("RGB"))
         input_features = input_features.detach().cpu().numpy()
@@ -382,17 +386,18 @@ def infer(crop_image_path,full_image_path,state,language,task_type=None):
             sim = -distances[0][i]
             image_url = df.iloc[v]["Link"]
             img_retrieved = read_image_from_url(image_url)
-            gallery_output.append(img_retrieved)
         if language=="English":
             msg="🖼️ Please refer to the section below to see the recommended results."
         else:
             msg="🖼️ 请到下方查看推荐结果。"
         state+=[(None,msg)]

-        return gallery_output,state,state

-    elif task_type==3:
-        gallery_output.append(["recomendation_pic/3.8.png","recomendation_pic/3.9.png"])
         input_image = Image.open(full_image_path).convert("RGB")
         input_features = extract_features_siglip(input_image.convert("RGB"))
         input_features = input_features.detach().cpu().numpy()
@@ -403,14 +408,15 @@ def infer(crop_image_path,full_image_path,state,language,task_type=None):
             sim = -distances[0][i]
             image_url = df.iloc[v]["Link"]
             img_retrieved = read_image_from_url(image_url)
-            gallery_output.append(img_retrieved)
         if language=="English":
             msg="🖼️ Please refer to the section below to see the recommended results."
         else:
             msg="🖼️ 请到下方查看推荐结果。"
         state+=[(None,msg)]

-        return gallery_output,state,state

     elif crop_image_path:
         input_image = Image.open(crop_image_path).convert("RGB")
         input_features = extract_features_siglip(input_image.convert("RGB"))
@@ -422,7 +428,7 @@ def infer(crop_image_path,full_image_path,state,language,task_type=None):
             sim = -distances[0][i]
             image_url = df.iloc[v]["Link"]
             img_retrieved = read_image_from_url(image_url)
-            gallery_output.append(img_retrieved)

         input_image = Image.open(full_image_path).convert("RGB")
         input_features = extract_features_siglip(input_image.convert("RGB"))
@@ -434,14 +440,14 @@ def infer(crop_image_path,full_image_path,state,language,task_type=None):
             sim = -distances[0][i]
             image_url = df.iloc[v]["Link"]
             img_retrieved = read_image_from_url(image_url)
-            gallery_output.append(img_retrieved)
         if language=="English":
             msg="🖼️ Please refer to the section below to see the recommended results."
         else:
             msg="🖼️ 请到下方查看推荐结果。"
         state+=[(None,msg)]

-        return gallery_output,state,state
     else:
         input_image = Image.open(full_image_path).convert("RGB")
         input_features = extract_features_siglip(input_image.convert("RGB"))
@@ -453,14 +459,15 @@ def infer(crop_image_path,full_image_path,state,language,task_type=None):
             sim = -distances[0][i]
             image_url = df.iloc[v]["Link"]
             img_retrieved = read_image_from_url(image_url)
-            gallery_output.append(img_retrieved)
         if language=="English":
             msg="🖼️ Please refer to the section below to see the recommended results."
         else:
             msg="🖼️ 请到下方查看推荐结果。"
         state+=[(None,msg)]

-        return gallery_output,state,state


 ###############################################################################
@@ -614,7 +621,7 @@ css = """


 .info_btn {
-    background: white !important;
     border: none !important;
     box-shadow: none !important;
     font-size: 15px !important;
@@ -623,7 +630,7 @@ css = """
 }

 .info_btn_interact {
-    background: rgb(242, 240, 233) !important;
     box-shadow: none !important;
     font-size: 15px !important;
     min-width: 6rem !important;
@@ -631,16 +638,24 @@ css = """
 }

 .function_button {
     border: none !important;
     box-shadow: none !important;
 }

 .function_button_rec {
-    background: rgb(245, 193, 154) !important;
     border: none !important;
     box-shadow: none !important;
 }

 #tool_box {max-width: 50px}

 """
@@ -703,26 +718,65 @@ Use a markdown outline format with appropriate emojis based on the image and Wik
 ]

 recommendation_prompt=[
-    '''I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the image:Recommendation reason: {{Recommendation based on objects in the image or Recommendation based on overall visual similarity}}
-    Detailed analysis: Based on the recommendation reason, explain why you recommend image 2 after viewing image 1.Each bullet point should be in {language} language, with a response length of about {length} words.''',
     '''
-    When generating the answer, you should tell others that you are the creators of the first paintings and generate the text in the tone and manner as if you are the creator of the painting.
-    I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the image:
-    Recommendation reason: {{ As the author of the first painting, I recommend based on the object I painted OR As the author of the first painting, I recommend based on the overall similarity in appearance}}
-    Detailed analysis: Based on the recommendation reason, explain why you recommend image 2 after viewing image 1. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I.
     Each bullet point should be in {language} language, with a response length of about {length} words.
     ''',
     '''
-    When generating answers, you should tell people that you are the object itself that was selected in the first painting, and generate text in the tone and manner in which you are the object
-    I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the image:
-    Recommendation reason: {{As an object in the first painting, I am recommending based on myself OR As an object in the first painting, I am recommending based on the overall similarity of the first painting's appearance}}
-    Detailed analysis: Based on the recommendation reason, explain why you recommend image 2 after viewing image 1. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I.
     Each bullet point should be in {language} language, with a response length of about {length} words.
     '''

-]

 gpt_state = 0
 VOICE = "en-GB-SoniaNeural"
@@ -904,9 +958,14 @@ def update_click_state(click_state, caption, click_mode):
     raise NotImplementedError

 async def chat_input_callback(*args):
-    visual_chatgpt, chat_input, click_state, state, aux_state ,language , autoplay,gender,api_key,image_input,log_state,history = args
     message = chat_input["text"]
-    prompt="Please help me answer the question with this painting {question} in {language}."
     prompt=prompt.format(question=message, language=language)

     if visual_chatgpt is not None:
@@ -914,7 +973,8 @@ async def chat_input_callback(*args):
     read_info = re.sub(r'[#[\]!*]','',result)
     read_info = emoji.replace_emoji(read_info,replace="")
     state = state + [(message,result)]
-    log_state += [(message,result)]
     # log_state += [("%% chat messahe %%",None)]

     history.append({"role": "user", "content": message})
@@ -933,9 +993,8 @@ async def chat_input_callback(*args):
     return state, state, None, audio,log_state,history


-async def upload_callback(image_input,state, log_state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None,history=None,autoplay=False,session="Session 1",task_type=0):
     print("narritive", narritive)
-    print("image input",image_input)
     if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
         image_input = image_input['background']

@@ -944,7 +1003,7 @@ async def upload_callback(image_input,state, log_state, visual_chatgpt=None, ope
     elif isinstance(image_input, bytes):
         image_input = Image.open(io.BytesIO(image_input))

-
     click_state = [[], [], []]

@@ -984,16 +1043,34 @@ async def upload_callback(image_input,state, log_state, visual_chatgpt=None, ope
     visual_chatgpt.current_image = new_image_path
     paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
     # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
-    parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\"}")
-    print(parsed_data)
-    parsed_data = json.loads(parsed_data.replace("'", "\""))
-    name, artist, year, material,gender= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"],parsed_data['gender']
-    gender=gender.lower()
-    print("gender",gender)
     if language=="English":
         if naritive_mapping[narritive]==0 :
             msg=f"🤖 Hi, I am EyeSee. Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant information."
@@ -1033,7 +1110,7 @@ async def upload_callback(image_input,state, log_state, visual_chatgpt=None, ope


     return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
-        original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender,new_image_path,log_state,history,audio_output,task_type]

@@ -1103,7 +1180,7 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
     Image.open(out["crop_save_path"]).save(new_crop_save_path)
     print("new crop save",new_crop_save_path)

-    yield state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground


 query_focus_en = [
@@ -1134,7 +1211,11 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
     print("input_labels_state",input_labels_state)

     prompt=generate_prompt(focus_type,paragraph,length,sentiment,factuality,language, naritive)
-
     log_state = log_state + [("Selected image point: {}, Input label: {}".format(input_points_state, input_labels_state), None)]


@@ -1147,7 +1228,7 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
     # if not args.disable_gpt and text_refiner:
     if not args.disable_gpt:
         print("new crop save",new_crop_save_path)
-        focus_info=get_gpt_response(openai_api_key,new_crop_save_path,prompt,history)
         if focus_info.startswith('"') and focus_info.endswith('"'):
             focus_info=focus_info[1:-1]
         focus_info=focus_info.replace('#', '')
@@ -1204,7 +1285,7 @@ async def submit_caption(naritive, state,length, sentiment, factuality, language
     return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,None,log_state,history


-naritive_mapping = {"Third-person": 0, "Single-Persona: Artist": 1, "Multi-Persona: Objects": 2}

 def generate_prompt(focus_type, paragraph,length, sentiment, factuality, language,naritive):
@@ -1658,51 +1739,81 @@ async def texttospeech(text, language,gender='female'):
         print(f"Error in texttospeech: {e}")
         return None

 # give the reason of recommendation
-async def associate(image_path,new_crop,openai_api_key,language,autoplay,length,log_state,sort_score,narritive,evt: gr.SelectData):
-    persona=naritive_mapping[narritive]
     rec_path=evt._data['value']['image']['path']
-    index=evt.index
-    print("rec_path",rec_path)
-    prompt=recommendation_prompt[persona].format(language=language,length=length)
-    if new_crop:
-        image_paths=[new_crop,rec_path]
-    else:
-        image_paths=[image_path,rec_path]
-    result=get_gpt_response(openai_api_key, image_paths, prompt)
-    print("recommend result",result)
-    reason = [(None, f"{result}")]
-    log_state = log_state + [(narritive, None)]
-    log_state = log_state + [(f"image sort ranking {sort_score}", None)]
-    log_state = log_state + [(None, f"{result}")]
-    read_info = re.sub(r'[#[\]!*]','',result)
-    read_info = emoji.replace_emoji(read_info,replace="")
-    print("associate",read_info)
-    audio_output=None
-    if autoplay:
-        audio_output = await texttospeech(read_info, language)
-    return reason,audio_output,log_state,index,gr.update(value=[])

-def change_naritive(session_type,image_input, chatbot, state, click_state, paragraph, origin_image,narritive,language="English"):
     if session_type=="Session 1":
-        return None, [], [], [[], [], []], "", None, []
     else:
         if language=="English":
-            if narritive=="Third-person" :
                 state += [
                     (
                         None,
                         f"🤖 Hi, I am EyeSee. Let's explore this painting together."
                     )
                 ]
-            elif narritive=="Single-Persona: Artist":
                 state += [
                     (
                         None,
                         f"🧑‍🎨 Let's delve into it from the perspective of the artist."
                     )
                 ]
-            elif narritive=="Multi-Persona: Objects":
                 state += [
                     (
                         None,
@@ -1710,21 +1821,21 @@ def change_naritive(session_type,image_input, chatbot, state, click_state, parag
                     )
                 ]
         elif language=="Chinese":
-            if narritive=="Third-person" :
                 state += [
                     (
                         None,
                         "🤖 让我们从第三方视角一起探索这幅画吧。"
                     )
                 ]
-            elif narritive == "Single-Persona: Artist":
                 state += [
                     (
                         None,
                         "🧑‍🎨 让我们从艺术家的视角深入探索这幅画。"
                     )
                 ]
-            elif narritive == "Multi-Persona: Objects":
                 state += [
                     (
                         None,
@@ -1732,7 +1843,8 @@ def change_naritive(session_type,image_input, chatbot, state, click_state, parag
                     )
                 ]

-    return image_input, state, state, click_state, paragraph, origin_image


 def print_like_dislike(x: gr.LikeData,state,log_state):
@@ -1748,7 +1860,7 @@ def print_like_dislike(x: gr.LikeData,state,log_state):
     return log_state,state

 def get_recommendationscore(index,score,log_state):
-    log_state+=[(f"Picture {index} : {score}",None)]
     log_state+=[("%% recommendation %%",None)]
     return log_state

@@ -1776,10 +1888,9 @@ def create_ui():
     description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """

     examples = [
-        ["test_images/1.The Ambassadors.jpg","test_images/task1.jpg"],
-        ["test_images/2.Football Players.jpg","test_images/task2.jpg"],
-        ["test_images/3.Along the River during the Qingming Festival.jpeg","test_images/task3.jpg"],
-        # ["test_images/test3.jpg"],
         # ["test_images/test4.jpg"],
         # ["test_images/test5.jpg"],
         # ["test_images/Picture5.png"],
@@ -1796,7 +1907,7 @@ def create_ui():
     log_state=gr.State([])
     # history log for gpt
     history_log=gr.State([])
-    task_type=gr.State(0)
     out_state = gr.State(None)
     click_state = gr.State([[], [], []])
     origin_image = gr.State(None)
@@ -1823,21 +1934,34 @@ def create_ui():
     # store the whole image path
     image_path=gr.State('')
     pic_index=gr.State(None)

-    with gr.Row():
-        auto_play = gr.Checkbox(
-            label="Check to autoplay audio", value=False, elem_classes="custom-autoplay"
-        )
-        output_audio = gr.HTML(
-            label="Synthesised Audio", elem_classes="custom-output"
-        )
     with gr.Row():
-        with gr.Column(scale=1,min_width=50,visible=False) as instruct:
-            task_instuction=gr.Image(type="pil", interactive=True, elem_classes="task_instruct",height=650,label=None)
         with gr.Column(scale=6):
             with gr.Column(visible=False) as modules_not_need_gpt:
-
                 with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
                     image_input_base = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
                     with gr.Row():
@@ -1854,49 +1978,32 @@ def create_ui():
                         year_label_base2 = gr.Button(value="Year: ",elem_classes="info_btn_interact")
                         material_label_base2 = gr.Button(value="Style: ",elem_classes="info_btn")

-                with gr.Tab("Click") as click_tab:
-                    with gr.Row():
-                        with gr.Column(scale=10,min_width=600):
-                            image_input = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
-                            example_image = gr.Image(type="pil", interactive=False, visible=False)
                     with gr.Row():
-                        name_label = gr.Button(value="Name: ",elem_classes="info_btn")
-                        artist_label = gr.Button(value="Artist: ",elem_classes="info_btn_interact")
-                        year_label = gr.Button(value="Year: ",elem_classes="info_btn_interact")
-                        material_label = gr.Button(value="Style: ",elem_classes="info_btn")
-
-                    # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
-                    # the tool column
-                    with gr.Column(scale=1,elem_id="tool_box",min_width=80):
-                        add_button = gr.Button(value="Extend Area", interactive=True,elem_classes="tools_button_add",icon=add_icon_path)
-                        minus_button = gr.Button(value="Remove Area", interactive=True,elem_classes="tools_button",icon=minus_icon_path)
-                        clear_button_click = gr.Button(value="Reset", interactive=True,elem_classes="tools_button")
-                        focus_d = gr.Button(value="Describe",interactive=True,elem_classes="function_button",variant="primary")
-                        focus_da = gr.Button(value="D+Analysis",interactive=True,elem_classes="function_button",variant="primary")
-                        focus_dai = gr.Button(value="DA+Interprete",interactive=True,elem_classes="function_button",variant="primary")
-                        focus_dda = gr.Button(value="Judge",interactive=True,elem_classes="function_button",variant="primary")
-
-                        recommend_btn = gr.Button(value="Recommend",interactive=True,elem_classes="function_button_rec")
                         # focus_asso = gr.Button(value="Associate",interactive=True,elem_classes="function_button",variant="primary")

-                with gr.Row(visible=False):
-                    with gr.Column():
-                        with gr.Row():
-                            # point_prompt = gr.Radio(
-                            #     choices=["Positive", "Negative"],
-                            #     value="Positive",
-                            #     label="Point Prompt",
-                            #     scale=5,
-                            #     interactive=True)
-                            click_mode = gr.Radio(
-                                choices=["Continuous", "Single"],
-                                value="Continuous",
-                                label="Clicking Mode",
-                                scale=5,
-                                interactive=True)


                 with gr.Tab("Trajectory (beta)", visible=False) as traj_tab:
@@ -1953,6 +2060,7 @@ def create_ui():
     with gr.Column(scale=4):
         with gr.Column(visible=True) as module_key_input:
             openai_api_key = gr.Textbox(
                 placeholder="Input openAI API key",
                 show_label=False,
                 label="OpenAI API Key",
@@ -1969,27 +2077,28 @@ def create_ui():
     # with gr.Column(visible=False) as modules_need_gpt2:
     #     paragraph_output = gr.Textbox(lines=16, label="Describe Everything", max_lines=16)
     #     cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)

-    with gr.Column(visible=False) as modules_not_need_gpt2:
-        with gr.Row():
-            naritive = gr.Radio(
-                choices=["Third-person", "Single-Persona: Artist","Multi-Persona: Objects"],
-                value="Third-person",
-                label="Persona",
-                scale=5,
-                interactive=True)
-        with gr.Blocks():
-            chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True,height=600,bubble_full_width=False)
-        with gr.Column() as modules_need_gpt3:
-            chat_input = gr.MultimodalTextbox(interactive=True, file_types=[".txt"], placeholder="Message EyeSee...", show_label=False)
-            with gr.Row():
-                clear_button_text = gr.Button(value="Clear Chat", interactive=True)
-                export_button = gr.Button(value="Export Chat Log", interactive=True, variant="primary")
-                # submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
-        # upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
-        # downvote_btn = gr.Button(value="👎 Downvote", interactive=True)

@@ -2012,27 +2121,59 @@ def create_ui():

     with gr.Row():
         with gr.Column(scale=6):
-            with gr.Column(visible=False) as recommend:
-                sort_rec=gr.Dropdown(["1", "2", "3", "4"],
-                    value=[],
-                    multiselect=True,
-                    label="Score", info="Please sort the pictures according to your preference"
-                )

-                gallery_result = gr.Gallery(
-                    label="Recommendations",
-                    height="auto",
-                    columns=4
-                    # columns=4,
-                    # rows=2,
-                    # show_label=False,
-                    # allow_preview=True,
-                    # object_fit="contain",
-                    # height="auto",
-                    # preview=True,
-                    # show_share_button=True,
-                    # show_download_button=True
-                )


         with gr.Column(scale=4,visible=False) as reco_reasons:
@@ -2040,12 +2181,13 @@ def create_ui():
             recommend_score = gr.Radio(
                 choices=[1,2,3,4,5,6,7],
                 label="Score",
-                interactive=True)

     with gr.Row():
         gr.Examples(
             examples=examples,
-            inputs=[example_image,task_instuction],
         )

@@ -2170,6 +2312,7 @@ def create_ui():
         interactive=True,
         label="Generated Caption Length",
     )
     # auto_play = gr.Checkbox(
     #     label="Check to autoplay audio", value=False, elem_classes="custom-autoplay"
     # )
@@ -2217,17 +2360,30 @@ def create_ui():
     recommend_btn.click(
         fn=infer,
         inputs=[new_crop_save_path,image_path,state,language,task_type],
-        outputs=[gallery_result,chatbot,state]
     )

     gallery_result.select(
-        associate,
-        inputs=[image_path,new_crop_save_path,openai_api_key,language,auto_play,length,log_state,sort_rec,naritive],
-        outputs=[recommend_bot,output_audio,log_state,pic_index,recommend_score],


     )

 ###############################################################################
 ############# above part is for text to image #############
 ###############################################################################
@@ -2444,11 +2600,18 @@ def create_ui():

     # cap_everything_button.click(cap_everything, [paragraph, visual_chatgpt, language,auto_play],
     #     [paragraph_output,output_audio])
-
     clear_button_click.click(
-        lambda x: ([[], [], []], x),
         [origin_image],
-        [click_state, image_input],
         queue=False,
         show_progress=False
     )
@@ -2503,11 +2666,11 @@ def create_ui():
     #     name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
     #     paragraph,artist,gender,image_path])

-    image_input.upload(upload_callback, [image_input, state, log_state,visual_chatgpt,openai_api_key,language,naritive,history_log,auto_play,session_type,task_type],
         [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
         image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
         name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
-        paragraph,artist,gender,image_path,log_state,history_log,output_audio,task_type])

     # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key],
     #     [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
@@ -2515,26 +2678,23 @@ def create_ui():
     #     name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
     #     paragraph,artist])

-    # image_input.upload(upload_callback, [image_input, state, visual_chatgpt, openai_api_key],
-    #     [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
-    #     image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base,paragraph,artist])
-    # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
-    #     [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
-    #     image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base,paragraph,artist])
-    chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play,gender,openai_api_key,image_path,log_state,history_log],
         [chatbot, state, aux_state,output_audio,log_state,history_log])
     # chat_input.submit(lambda: "", None, chat_input)
     chat_input.submit(lambda: {"text": ""}, None, chat_input)
-    # submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
-    #     [chatbot, state, aux_state,output_audio])
-    # submit_button_text.click(lambda: "", None, chat_input)
-    example_image.change(upload_callback, [example_image, state, log_state, visual_chatgpt, openai_api_key,language,naritive,history_log,auto_play,session_type,task_type],
         [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
         image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
         name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
-        paragraph,artist,gender,image_path, log_state,history_log,output_audio,task_type])

     example_image.change(clear_chat_memory, inputs=[visual_chatgpt])

     # def on_click_tab_selected():
     #     if gpt_state ==1:
@@ -2560,6 +2720,12 @@ def create_ui():
     #     click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
     #     base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])
     #     base_tab2.select(on_base_selected, outputs=[modules_not_need_gpt2,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt1])


@@ -2678,21 +2844,21 @@ def create_ui():

     naritive.change(
         change_naritive,
-        [session_type, image_input, chatbot, state, click_state, paragraph, origin_image,naritive,language],
-        [image_input, chatbot, state, click_state, paragraph, origin_image,gallery_result],
         queue=False,
         show_progress=False
     )
-    def session_change():
         instruction=Image.open('test_images/task4.jpg')
-        return [],instruction
-
     session_type.change(
-        session_change,
         [],
-        [log_state,task_instuction]
     )

     # upvote_btn.click(
app.py (after change)

 @spaces.GPU
 def infer(crop_image_path,full_image_path,state,language,task_type=None):
     print("task type",task_type)
+    style_gallery_output = []
+    item_gallery_output=[]
+
+    if task_type=="task 1":
+        item_gallery_output.append("recomendation_pic/1.8.jpg")
+        item_gallery_output.append("recomendation_pic/1.9.jpg")
         input_image = Image.open(full_image_path).convert("RGB")
         input_features = extract_features_siglip(input_image.convert("RGB"))
         input_features = input_features.detach().cpu().numpy()

             sim = -distances[0][i]
             image_url = df.iloc[v]["Link"]
             img_retrieved = read_image_from_url(image_url)
+            style_gallery_output.append(img_retrieved)
         if language=="English":
             msg="🖼️ Please refer to the section below to see the recommended results."
         else:
             msg="🖼️ 请到下方查看推荐结果。"
         state+=[(None,msg)]

+        return item_gallery_output, style_gallery_output,state,state
+    elif task_type=="task 2":
+        item_gallery_output.append("recomendation_pic/2.8.jpg")
+        item_gallery_output.append("recomendation_pic/2.9.png")
         input_image = Image.open(full_image_path).convert("RGB")
         input_features = extract_features_siglip(input_image.convert("RGB"))
         input_features = input_features.detach().cpu().numpy()

             sim = -distances[0][i]
             image_url = df.iloc[v]["Link"]
             img_retrieved = read_image_from_url(image_url)
+            style_gallery_output.append(img_retrieved)
         if language=="English":
             msg="🖼️ Please refer to the section below to see the recommended results."
         else:
             msg="🖼️ 请到下方查看推荐结果。"
         state+=[(None,msg)]

+        return item_gallery_output, style_gallery_output,state,state

+    elif task_type=="task 3":
+        item_gallery_output.append("recomendation_pic/3.8.png")
+        item_gallery_output.append("recomendation_pic/basket-2.png")
         input_image = Image.open(full_image_path).convert("RGB")
         input_features = extract_features_siglip(input_image.convert("RGB"))
         input_features = input_features.detach().cpu().numpy()

             sim = -distances[0][i]
             image_url = df.iloc[v]["Link"]
             img_retrieved = read_image_from_url(image_url)
+            style_gallery_output.append(img_retrieved)
         if language=="English":
             msg="🖼️ Please refer to the section below to see the recommended results."
         else:
             msg="🖼️ 请到下方查看推荐结果。"
         state+=[(None,msg)]

+        return item_gallery_output, style_gallery_output,state,state
+
     elif crop_image_path:
         input_image = Image.open(crop_image_path).convert("RGB")
         input_features = extract_features_siglip(input_image.convert("RGB"))

             sim = -distances[0][i]
             image_url = df.iloc[v]["Link"]
             img_retrieved = read_image_from_url(image_url)
+            item_gallery_output.append(img_retrieved)

         input_image = Image.open(full_image_path).convert("RGB")
         input_features = extract_features_siglip(input_image.convert("RGB"))

             sim = -distances[0][i]
             image_url = df.iloc[v]["Link"]
             img_retrieved = read_image_from_url(image_url)
+            style_gallery_output.append(img_retrieved)
         if language=="English":
             msg="🖼️ Please refer to the section below to see the recommended results."
         else:
             msg="🖼️ 请到下方查看推荐结果。"
         state+=[(None,msg)]

+        return item_gallery_output, style_gallery_output,state,state
     else:
         input_image = Image.open(full_image_path).convert("RGB")
         input_features = extract_features_siglip(input_image.convert("RGB"))

             sim = -distances[0][i]
             image_url = df.iloc[v]["Link"]
             img_retrieved = read_image_from_url(image_url)
+            style_gallery_output.append(img_retrieved)
         if language=="English":
             msg="🖼️ Please refer to the section below to see the recommended results."
         else:
             msg="🖼️ 请到下方查看推荐结果。"
         state+=[(None,msg)]

+        return item_gallery_output, style_gallery_output,state,state
+


 ###############################################################################
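For orientation, a minimal sketch of the retrieval pattern infer() is built on: embed the query image with SigLIP, search a prebuilt index of collection embeddings, and fetch the linked images for the gallery. extract_features_siglip, read_image_from_url, the FAISS-style index and the metadata table df are assumed to exist at module scope as in app.py; k=4 is an illustrative value, not the app's setting.

    import numpy as np
    from PIL import Image

    def retrieve_similar(image_path, k=4):
        query = Image.open(image_path).convert("RGB")
        feats = extract_features_siglip(query)              # torch tensor, shape (1, dim)
        feats = np.float32(feats.detach().cpu().numpy())
        distances, indices = index.search(feats, k)         # FAISS-style search: both arrays are (1, k)
        gallery = []
        for i, v in enumerate(indices[0]):
            sim = -distances[0][i]                          # app.py negates distance as a similarity score
            gallery.append(read_image_from_url(df.iloc[v]["Link"]))
        return gallery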
 
 .info_btn {
+    background: rgb(245, 245, 245) !important;
     border: none !important;
     box-shadow: none !important;
     font-size: 15px !important;

 }

 .info_btn_interact {
+    background: rgb(217, 217, 217) !important;
     box-shadow: none !important;
     font-size: 15px !important;
     min-width: 6rem !important;

 }

 .function_button {
+    background: rgb(227, 226, 226) !important;
     border: none !important;
     box-shadow: none !important;
 }

 .function_button_rec {
+    background: rgb(189, 189, 189) !important;
     border: none !important;
     box-shadow: none !important;
 }

+.small_button {
+    font-size: 12px !important;
+    padding: 2px 8px !important;
+    min-width: 60px !important;
+    height: 30px !important;
+}
+
 #tool_box {max-width: 50px}

 """
 
 ]

 recommendation_prompt=[
+    [
+    '''
+    First identify what the object of the first painting is, you save yourself as the parameter: {{object}}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting:
+    Recommendation reason: {{Recommendation based on {{object}} in the painting you saw earlier. Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate in three points. }}
+    Each bullet point should be in {language} language, with a response length of about {length} words.
+    ''',
+    '''
+    When generating answers, you should tell people that I am the creator of painting you were looking at earlier itself, and generate text in the tone and manner in which you are the creator of painting were looking at earlier.
+
+    First identify what the object of the first painting is, you save yourself as the parameter: {{object}}, do not need to tell me, the following will use the. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting:
+
+    Recommendation reason: {{I'm the creator of that painting you saw earlier. I'm an artist. and I'm recommending this painting based on the fact that the {{object}} I've drawn also appear in the painting you're looking at. }} Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate the three points in the tone and manner as if you are the creator of painting were looking at earlier and start every sentence with I.
+
+    Each bullet point should be in {language} language, with a response length of about {length} words.
+
+    ''',
+    '''
+    When generating answers, you should tell people that you are the object itself that was selected in the painting, and generate text in the tone and manner in which you are the object
+
+    First identify what the object of the first painting is, you save yourself as the parameter: {{object}}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting:
+
+    Recommendation reason: {{I'm the {{object}} in the painting you were looking at earlier, and I'm recommending this painting based on the fact that I'm also present in the one you're looking at.}} Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate the three points in the tone and manner as if you are the object of this painting and start every sentence with I.
+
+    Each bullet point should be in {language} language, with a response length of about {length} words.
+
+    '''],
+
+    [
     '''
+    First identify what the name of the first painting is, you save yourself as the parameter: {{name}}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting:
+    Recommendation reason: {{Recommendation based on the painting {{name}}.Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate in three points.}}
     Each bullet point should be in {language} language, with a response length of about {length} words.
     ''',
     '''
+    When generating answers, you should tell people that I am the creator of painting you were looking at earlier itself, and generate text in the tone and manner in which you are the creator of painting were looking at earlier.
+
+    First identify what the creator of the first painting is, you save yourself as the parameter: {artist}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting:
+
+    Recommendation reason: {{I'm the creator of that painting you saw earlier, {artist}. I'm an artist. and I'm recommending this painting based on the fact that the painting you're looking at is similar to the one you just saw of me.}} Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate the three points in the tone and manner as if you are the creator of painting were looking at earlier and start every sentence with I.
+
     Each bullet point should be in {language} language, with a response length of about {length} words.
+
+    ''',
     '''
+    When generating answers, you should tell people that I am the painting you were looking at earlier itself, and generate text in the tone and manner in which you are the painting were looking at earlier.
+
+    First identify what the name of the first painting is, you save yourself as the parameter: {{name}}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting:
+
+    Recommendation reason: {{I'm the painting {{name}} you were looking at earlier, and I'm recommending this painting based on the fact that I'm similar to the one you're looking at.}} Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate the three points in the tone and manner as if you are the painting were looking at earlier and start every sentence with I.
+
+    Each bullet point should be in {language} language, with a response length of about {length} words.
+
+    '''],

+]

 gpt_state = 0
 VOICE = "en-GB-SoniaNeural"
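For reference, a small sketch of how this nested table is indexed by get_recommendation() further down in the diff: the outer index picks the recommendation type (0 for object/item-based, 1 for style-based), the inner index picks the persona via naritive_mapping, and only the style-based Artist entry consumes an {artist} slot. The values below are illustrative.

    persona = naritive_mapping["Artist"]                    # -> 1
    prompt = recommendation_prompt[1][persona].format(
        language="English",                                 # fills the {language} slot
        length=60,                                          # fills the {length} slot
        artist="Hans Holbein the Younger",                  # only this entry has an {artist} slot
    )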
 
     raise NotImplementedError

 async def chat_input_callback(*args):
+    visual_chatgpt, chat_input, click_state, state, aux_state ,language , autoplay,gender,api_key,image_input,log_state,history,persona = args
     message = chat_input["text"]
+    if persona == "Narrator":
+        prompt="Please help me answer the question with this painting {question} in {language}."
+    elif persona =="Artist":
+        prompt="When generating the answer, you should tell others that you are one of the creators of these paintings and generate the text in the tone and manner as if you are the creator of the painting. Please help me answer the question with this painting {question} in {language}."
+    else:
+        prompt="When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. Please help me answer the question with this painting {question} in {language}."
     prompt=prompt.format(question=message, language=language)

     if visual_chatgpt is not None:

     read_info = re.sub(r'[#[\]!*]','',result)
     read_info = emoji.replace_emoji(read_info,replace="")
     state = state + [(message,result)]
+    log_state += [(message,"/////")]
+    log_state += [("/////",result)]
     # log_state += [("%% chat messahe %%",None)]

     history.append({"role": "user", "content": message})

     return state, state, None, audio,log_state,history


+async def upload_callback(image_input,state, log_state, task_type, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None,history=None,autoplay=True,session="Session 1"):
     print("narritive", narritive)

     if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
         image_input = image_input['background']

     elif isinstance(image_input, bytes):
         image_input = Image.open(io.BytesIO(image_input))

+
     click_state = [[], [], []]

     visual_chatgpt.current_image = new_image_path
     paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
     # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
+    if task_type=="task 3":
+        name="Along the River During the Qingming Festival"
+        artist="Zhang Zeduan"
+        year="12th century (Song Dynasty)"
+        material="Chinese painting"
+        gender="male"
+
+    elif task_type=="task 1":
+        name ="The Ambassadors"
+        artist ="Hans Holbein the Younger"
+        year = "1533 (Northern Renaissance)"
+        material="Realism"
+        gender = "male"
+
+    elif task_type=="task 2":
+        name = "The Football Players"
+        artist= "Albert Gleizes"
+        year= "1912 (Cubism)"
+        material="Cubism"
+        gender= "male"
+
+    else:
+        parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\"}")
+        print(parsed_data)
+        parsed_data = json.loads(parsed_data.replace("'", "\""))
+        name, artist, year, material,gender= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"],parsed_data['gender']
+        gender=gender.lower()
+
     if language=="English":
         if naritive_mapping[narritive]==0 :
             msg=f"🤖 Hi, I am EyeSee. Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant information."

     return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
+        original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender,new_image_path,log_state,history,audio_output]
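Isolating the fallback branch above as a hedged sketch: the reply is requested as a dictionary-formatted string, and single quotes are swapped for double quotes before json.loads, exactly as in the diff. The try/except and the "Unknown" defaults are illustrative additions, not part of app.py, and metadata_prompt stands for the long request string above.

    import json

    raw = get_gpt_response(openai_api_key, new_image_path, metadata_prompt)
    try:
        parsed = json.loads(raw.replace("'", '"'))
        name, artist = parsed["name"], parsed["artist"]
        gender = parsed["gender"].lower()
    except (json.JSONDecodeError, KeyError):
        name = artist = gender = "Unknown"                  # degrade gracefully on a malformed reply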
     Image.open(out["crop_save_path"]).save(new_crop_save_path)
     print("new crop save",new_crop_save_path)

+    return state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground


 query_focus_en = [

     print("input_labels_state",input_labels_state)

     prompt=generate_prompt(focus_type,paragraph,length,sentiment,factuality,language, naritive)
+    print("log state",log_state[-1])
+    if log_state[-1][0] is None or not log_state[-1][0].startswith("%%"):
+        log_state = log_state + [("No like/dislike", None)]
+        log_state = log_state + [("%% user interaction %%",None)]
+
     log_state = log_state + [("Selected image point: {}, Input label: {}".format(input_points_state, input_labels_state), None)]


     # if not args.disable_gpt and text_refiner:
     if not args.disable_gpt:
         print("new crop save",new_crop_save_path)
+        focus_info=get_gpt_response(openai_api_key,new_crop_save_path,prompt)
         if focus_info.startswith('"') and focus_info.endswith('"'):
             focus_info=focus_info[1:-1]
         focus_info=focus_info.replace('#', '')

     return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,None,log_state,history


+naritive_mapping = {"Narrator": 0, "Artist": 1, "In-Situ": 2}

 def generate_prompt(focus_type, paragraph,length, sentiment, factuality, language,naritive):

         print(f"Error in texttospeech: {e}")
         return None

+async def get_recommendation(new_crop,image_path,openai_api_key,language,autoplay,length,log_state,sort_score,narritive,state,recommend_type,artist,recomended_path):
+
+    if recommend_type=="Item":
+        persona=naritive_mapping[narritive]
+        prompt=recommendation_prompt[0][persona].format(language=language,length=length)
+        image_paths=[new_crop,recomended_path]
+        result=get_gpt_response(openai_api_key, image_paths, prompt)
+        print("recommend result",result)
+        state += [(None, f"{result}")]
+        log_state += [("User wants to know object recomendation reason", None)]
+        log_state = log_state + [(narritive, None)]
+        log_state = log_state + [(f"image sort ranking {sort_score}", None)]
+        log_state = log_state + [(None, f"{result}")]
+        read_info = re.sub(r'[#[\]!*]','',result)
+        read_info = emoji.replace_emoji(read_info,replace="")
+        print("associate",read_info)
+        audio_output=None
+        if autoplay:
+            audio_output = await texttospeech(read_info, language)
+        return state,state,audio_output,log_state,index,gr.update(value=[])
+    else:
+        persona=naritive_mapping[narritive]
+
+        if persona==1:
+            prompt=recommendation_prompt[1][persona].format(language=language,length=length,artist=artist[8:])
+        else:
+            prompt=recommendation_prompt[1][persona].format(language=language,length=length)
+        image_paths=[image_path,recomended_path]
+        result=get_gpt_response(openai_api_key, image_paths, prompt )
+        print("recommend result",result)
+        state += [(None, f"{result}")]
+        log_state += [("User wants to know style recomendation reason", None)]
+        log_state = log_state + [(narritive, None)]
+        log_state = log_state + [(f"image sort ranking {sort_score}", None)]
+        log_state = log_state + [(None, f"{result}")]
+        read_info = re.sub(r'[#[\]!*]','',result)
+        read_info = emoji.replace_emoji(read_info,replace="")
+        print("associate",read_info)
+        audio_output=None
+        if autoplay:
+            audio_output = await texttospeech(read_info, language)
+        return state,state,audio_output,log_state,index,gr.update(value=[])
+
+
 # give the reason of recommendation
+async def item_associate(new_crop,openai_api_key,language,autoplay,length,log_state,sort_score,narritive,state,evt: gr.SelectData):
     rec_path=evt._data['value']['image']['path']
+    return state,state,None,log_state,None,gr.update(value=[]),rec_path,rec_path,"Item"
+
+
+async def style_associate(image_path,openai_api_key,language,autoplay,length,log_state,sort_score,narritive,state,artist,evt: gr.SelectData):
+    rec_path=evt._data['value']['image']['path']
+    return state,state,None,log_state,None,gr.update(value=[]),rec_path, rec_path,"Style"
+
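The two associate callbacks above follow Gradio's gallery-select pattern. A sketch under current Gradio releases, where the selection is exposed on gr.SelectData as evt.index and evt.value (the private evt._data used in the diff appears to reach the same payload):

    import gradio as gr

    def on_gallery_select(evt: gr.SelectData):
        # evt.value is the selected gallery item; depending on the Gradio
        # version it is a file path string or a dict with an "image" entry.
        value = evt.value
        return value["image"]["path"] if isinstance(value, dict) else value

    # e.g. gallery_result.select(on_gallery_select, inputs=None, outputs=[selected_image])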
 
+def change_naritive(session_type,image_input, state, click_state, paragraph, origin_image,narritive,task_instruct,gallery_output,style_gallery_result,reco_reasons,language="English"):
     if session_type=="Session 1":
+        return None, [], [], [[], [], []], "", None, None, [], [],[],[],gr.update(value="Preview")
     else:
         if language=="English":
+            if narritive=="Narrator" :
                 state += [
                     (
                         None,
                         f"🤖 Hi, I am EyeSee. Let's explore this painting together."
                     )
                 ]
+            elif narritive=="Artist":
                 state += [
                     (
                         None,
                         f"🧑‍🎨 Let's delve into it from the perspective of the artist."
                     )
                 ]
+            elif narritive=="In-Situ":
                 state += [
                     (
                         None,
                     )
                 ]
         elif language=="Chinese":
+            if narritive=="Narrator" :
                 state += [
                     (
                         None,
                         "🤖 让我们从第三方视角一起探索这幅画吧。"
                     )
                 ]
+            elif narritive == "Artist":
                 state += [
                     (
                         None,
                         "🧑‍🎨 让我们从艺术家的视角深入探索这幅画。"
                     )
                 ]
+            elif narritive == "In-Situ":
                 state += [
                     (
                         None,
                     )
                 ]

+
+    return image_input, state, state, click_state, paragraph, origin_image,task_instruct,gallery_output,style_gallery_result,reco_reasons,reco_reasons,gr.update(value="Preview")


 def print_like_dislike(x: gr.LikeData,state,log_state):

     return log_state,state

 def get_recommendationscore(index,score,log_state):
+    log_state+=[(f"{index} : {score}",None)]
     log_state+=[("%% recommendation %%",None)]
     return log_state
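Both change_naritive() and get_recommendation() reset components by returning gr.update(...) rather than new component instances. A minimal sketch of the pattern (reset_info_type is a hypothetical helper; recommend_type and sort_rec are the radios defined later in the diff):

    def reset_info_type():
        # Patch properties of live components instead of recreating them.
        return gr.update(value="Preview"), gr.update(value=[])

    # e.g. recommend_type.change(reset_info_type, inputs=[], outputs=[recommend_type, sort_rec])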
 
 
     description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """

     examples = [
+        ["test_images/1.The Ambassadors.jpg","test_images/task1.jpg","task 1"],
+        ["test_images/2.Football Players.jpg","test_images/task2.jpg","task 2"],
+        ["test_images/3-square.jpg","test_images/task3.jpg","task 3"],
         # ["test_images/test4.jpg"],
         # ["test_images/test5.jpg"],
         # ["test_images/Picture5.png"],

     log_state=gr.State([])
     # history log for gpt
     history_log=gr.State([])
+
     out_state = gr.State(None)
     click_state = gr.State([[], [], []])
     origin_image = gr.State(None)

     # store the whole image path
     image_path=gr.State('')
     pic_index=gr.State(None)
+    recomended_state=gr.State([])
+
+    recomended_path=gr.State(None)
+    recomended_type=gr.State(None)
+
     with gr.Row():
+
         with gr.Column(scale=6):
             with gr.Column(visible=False) as modules_not_need_gpt:
+                with gr.Row():
+                    naritive = gr.Radio(
+                        choices=["Narrator", "Artist","In-Situ"],
+                        value="Narrator",
+                        label="Select Mode",
+                        scale=5,
+                        interactive=True)
+
+                    add_button = gr.Button(value="Extend Area", interactive=True,elem_classes="tools_button_add",icon=add_icon_path)
+                    minus_button = gr.Button(value="Remove Area", interactive=True,elem_classes="tools_button",icon=minus_icon_path)
+                    clear_button_click = gr.Button(value="Reset", interactive=True,elem_classes="tools_button",icon="assets/icons/recycle.png")
+
+                    auto_play = gr.Checkbox(
+                        label="Check to autoplay audio", value=True, elem_classes="custom-autoplay",visible=False)
+                    output_audio = gr.HTML(
+                        label="Synthesised Audio", elem_classes="custom-output", visible=False)
                 with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
                     image_input_base = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
                     with gr.Row():

                         year_label_base2 = gr.Button(value="Year: ",elem_classes="info_btn_interact")
                         material_label_base2 = gr.Button(value="Style: ",elem_classes="info_btn")

+                with gr.Row():
+                    with gr.Column(scale=1,min_width=50,visible=False) as instruct:
+                        task_instuction=gr.Image(type="pil", interactive=False, elem_classes="task_instruct",height=650,label="Instruction")
+                    with gr.Column(scale=6):
+                        with gr.Tab("Click") as click_tab:
                             with gr.Row():
+                                with gr.Column(scale=10,min_width=600):
+                                    image_input = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650)
+                                    example_image = gr.Image(type="pil", interactive=False, visible=False)
+                                # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
+                                # the tool column
+                                with gr.Column(scale=1,elem_id="tool_box",min_width=80):
+                                    name_label = gr.Button(value="Name: ",elem_classes="info_btn")
+                                    artist_label = gr.Button(value="Artist: ",elem_classes="info_btn_interact")
+                                    year_label = gr.Button(value="Year: ",elem_classes="info_btn_interact")
+                                    material_label = gr.Button(value="Style: ",elem_classes="info_btn")
+
+                                    focus_d = gr.Button(value="Describe",interactive=True,elem_classes="function_button")
+                                    focus_da = gr.Button(value="D+Analysis",interactive=True,elem_classes="function_button")
+                                    focus_dai = gr.Button(value="DA+Interprete",interactive=True,elem_classes="function_button")
+                                    focus_dda = gr.Button(value="Judge",interactive=True,elem_classes="function_button")
+
+                                    recommend_btn = gr.Button(value="Recommend",interactive=True,elem_classes="function_button_rec")
                                     # focus_asso = gr.Button(value="Associate",interactive=True,elem_classes="function_button",variant="primary")
+


                 with gr.Tab("Trajectory (beta)", visible=False) as traj_tab:
 
  with gr.Column(scale=4):
  with gr.Column(visible=True) as module_key_input:
  openai_api_key = gr.Textbox(
+ value="",  # do not hardcode a real API key here
  placeholder="Input openAI API key",
  show_label=False,
  label="OpenAI API Key",
 
  # with gr.Column(visible=False) as modules_need_gpt2:
  # paragraph_output = gr.Textbox(lines=16, label="Describe Everything", max_lines=16)
  # cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)
+ with gr.Column(visible=False) as modules_not_need_gpt2:
+ with gr.Blocks():
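  # Note: opening a nested gr.Blocks() inside an existing layout is unusual;
  # gr.Group() would be the more typical container for this chat panel.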
+ chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True,height=750,bubble_full_width=False)
+ with gr.Column() as modules_need_gpt3:
+ chat_input = gr.MultimodalTextbox(interactive=True, file_types=[".txt"], placeholder="Message EyeSee...", show_label=False)
+ with gr.Row():
+ clear_button_text = gr.Button(value="Clear Chat", interactive=True)
+ export_button = gr.Button(value="Export Chat Log", interactive=True, variant="primary")
+ with gr.Row(visible=False):
+ with gr.Column():
+ with gr.Row():
+ click_mode = gr.Radio(
+ choices=["Continuous", "Single"],
+ value="Continuous",
+ label="Clicking Mode",
+ scale=5,
+ interactive=True)
 
  with gr.Row():
  with gr.Column(scale=6):
+ with gr.Row():
+ with gr.Column(visible=False) as recommend:
+
+ # sort_rec=gr.Dropdown(["1", "2", "3", "4"], visible=False,
+ # value=[],
+ # multiselect=True,
+ # label="Score", info="Please sort the pictures according to your preference"
+ # )
+
+ gallery_result = gr.Gallery(
+ label="Object-based Recommendation",
+ height="auto",
+ columns=2,
+ interactive=False
+ # columns=4,
+ # rows=2,
+ # show_label=False,
+ # allow_preview=True,
+ # object_fit="contain",
+ # height="auto",
+ # preview=True,
+ # show_share_button=True,
+ # show_download_button=True
+ )

+ style_gallery_result = gr.Gallery(
+ label="Style-based Recommendation",
+ height="auto",
+ columns=2,
+ interactive=False
+ # columns=4,
+ # rows=2,
+ # show_label=False,
+ # allow_preview=True,
+ # object_fit="contain",
+ # height="auto",
+ # preview=True,
+ # show_share_button=True,
+ # show_download_button=True
+ )
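  # The two galleries mirror the two retrieval passes in infer(): object-based
  # results come from the cropped region, style-based results from the full image.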
+ with gr.Column(scale=3):
+ selected_image = gr.Image(label="Selected Image", interactive=False)
+
+ sort_rec = gr.Radio(
+ choices=[1,2,3,4,5,6,7],
+ label="Score",
+ interactive=True,info="Please rank the recommended artwork")
+
+ recommend_type = gr.Radio(
+ choices=["Preview","Reasons"],
+ label="Information Type",
+ value="Preview",
+ interactive=True,visible=False)
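  # sort_rec records the user's 1-7 ranking of a recommended artwork; recommend_type
  # (hidden until relevant) appears to toggle between a plain preview and generated reasons.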
  with gr.Column(scale=4,visible=False) as reco_reasons:
 
  recommend_score = gr.Radio(
  choices=[1,2,3,4,5,6,7],
  label="Score",
+ interactive=True,info='Please score the recommendation reasons')

  with gr.Row():
+ task_type = gr.Textbox(visible=False)
  gr.Examples(
  examples=examples,
+ inputs=[example_image,task_instuction,task_type],
  )
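  # Selecting an example fills the hidden example_image, the task instruction image,
  # and the hidden task_type textbox; task_type in turn selects the task-specific
  # branch inside infer().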
  interactive=True,
  label="Generated Caption Length",
  )
+
  # auto_play = gr.Checkbox(
  # label="Check to autoplay audio", value=False, elem_classes="custom-autoplay"
  # )
 
  recommend_btn.click(
  fn=infer,
  inputs=[new_crop_save_path,image_path,state,language,task_type],
+ outputs=[gallery_result,style_gallery_result,chatbot,state]
  )
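  # A minimal sketch (an assumption, not the committed code) of the return shape
  # infer() now needs to satisfy the four outputs above:
  # def infer(crop_image_path, full_image_path, state, language, task_type=None):
  #     ...
  #     return object_gallery, style_gallery, state, state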

  gallery_result.select(
+ item_associate,
+ inputs=[new_crop_save_path,openai_api_key,language,auto_play,length,log_state,sort_rec,naritive,recomended_state],
+ outputs=[recommend_bot,recomended_state,output_audio,log_state,pic_index,recommend_score,selected_image,recomended_path, recomended_type],
  )
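  # item_associate presumably generates commentary for the clicked object-based
  # recommendation; it also records which picture was picked (pic_index,
  # recomended_path) and tags the selection type so later callbacks know the context.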

+ style_gallery_result.select(
+ style_associate,
+ inputs=[image_path,openai_api_key,language,auto_play,length,log_state,sort_rec,naritive,recomended_state,artist_label],
+ outputs=[recommend_bot,recomended_state,output_audio,log_state,pic_index,recommend_score,selected_image,recomended_path,recomended_type],
+ )
+
+ selected_image.select(
+ get_recommendation,
+ inputs=[new_crop_save_path,image_path, openai_api_key,language,auto_play,length,log_state,sort_rec,naritive,recomended_state,recomended_type,artist_label,recomended_path],
+ outputs=[recommend_bot,recomended_state,output_audio,log_state,pic_index,recommend_score])
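  # Clicking the preview image hands everything gathered so far (crop, full image,
  # selection type and path) to get_recommendation, which presumably produces the
  # recommendation reasons that the user then rates via recommend_score.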
+
  ###############################################################################
  ############# above part is for text to image #############
  ###############################################################################
 

  # cap_everything_button.click(cap_everything, [paragraph, visual_chatgpt, language,auto_play],
  # [paragraph_output,output_audio])
+ def reset_and_add(origin_image):
+ new_prompt = "Positive"
+ new_add_icon = "assets/icons/plus-square-blue.png"
+ new_add_css = "tools_button_clicked"
+ new_minus_icon = "assets/icons/minus-square.png"
+ new_minus_css= "tools_button"
+ return [[],[],[]],origin_image, new_prompt, gr.update(icon=new_add_icon,elem_classes=new_add_css), gr.update(icon=new_minus_icon,elem_classes=new_minus_css)
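  # The 5-tuple above maps onto [click_state, image_input, point_prompt, add_button,
  # minus_button] in the wiring below: clicks are cleared, the original image is
  # restored, and the +/- buttons are reset to their default icons and styles.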
+
  clear_button_click.click(
+ reset_and_add,
  [origin_image],
+ [click_state, image_input,point_prompt,add_button,minus_button],
  queue=False,
  show_progress=False
  )
 
  # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
  # paragraph,artist,gender,image_path])

+ image_input.upload(upload_callback, [image_input, state, log_state,task_type, visual_chatgpt,openai_api_key,language,naritive,history_log,auto_play,session_type],
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+ paragraph,artist,gender,image_path,log_state,history_log,output_audio])
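  # upload_callback fans out to over 30 outputs: the chat state, the image shown in
  # every tab, the image embedding and sizes, the four sets of metadata label
  # buttons, and the narration audio.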

  # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key],
  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
  # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
  # paragraph,artist])

+
+ chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play,gender,openai_api_key,image_path,log_state,history_log,naritive],
  [chatbot, state, aux_state,output_audio,log_state,history_log])
  # chat_input.submit(lambda: "", None, chat_input)
  chat_input.submit(lambda: {"text": ""}, None, chat_input)
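  # The second .submit() simply clears the MultimodalTextbox by returning an
  # empty {"text": ""} payload once the message has been dispatched.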
+
+ example_image.change(upload_callback, [example_image, state, log_state, task_type, visual_chatgpt, openai_api_key,language,naritive,history_log,auto_play,session_type],
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+ paragraph,artist,gender,image_path, log_state,history_log,output_audio])

  example_image.change(clear_chat_memory, inputs=[visual_chatgpt])
+ example_image.change(
+ lambda:([],[],[],None,[],gr.update(value="Preview")),
+ [],
+ [gallery_result,style_gallery_result,recommend_bot,new_crop_save_path,chatbot,recommend_type])
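  # Switching examples resets the recommendation UI: both galleries, the
  # recommendation chat, the stored crop path and the main chat are emptied,
  # and recommend_type snaps back to "Preview".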

  # def on_click_tab_selected():
  # if gpt_state ==1:
  # click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
  # base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])
  # base_tab2.select(on_base_selected, outputs=[modules_not_need_gpt2,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt1])
+
+ def print_reason():
+ print("reason")
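  # print_reason looks like a leftover debug stub; nothing in this diff wires it to an event.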
  naritive.change(
  change_naritive,
+ [session_type, image_input, state, click_state, paragraph, origin_image,naritive,
+ task_instuction,gallery_result,style_gallery_result,recomended_state,language],
+ [image_input, chatbot, state, click_state, paragraph, origin_image,task_instuction,gallery_result,style_gallery_result,recomended_state,recommend_bot,recommend_type],
  queue=False,
  show_progress=False
  )
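  # Changing the narrative perspective rebuilds the view: change_naritive receives
  # the current session/image/chat state and returns refreshed versions of the same
  # components, clearing both recommendation galleries in the process.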
+ def change_session():
  instruction=Image.open('test_images/task4.jpg')
+ return None, [], [], [[], [], []], "", None, [],[],instruction,"task 4",[],[],[]
 
  session_type.change(
+ change_session,
  [],
+ [image_input, chatbot, state, click_state, paragraph, origin_image,history_log,log_state,task_instuction,task_type,gallery_result,style_gallery_result,recommend_bot]
  )
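  # change_session's 13 return values line up one-to-one with the outputs list above,
  # wiping the session back to a blank state and preloading the task-4 instruction image.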
  # upvote_btn.click(