Niki Zhang committed: Update app.py

app.py CHANGED
Original app.py (left-hand side of the diff; removed lines are marked -):

@@ -476,15 +476,55 @@ print("4")
 476  css = """
 477  #warning {background-color: #FFCCCB}
 478  .tools_button {
 479  background: white;
 480  border: none !important;
 481  box-shadow: none !important;
 482  }
 483
-484  .
 485  background: white;
 486  border: none !important;
 487  box-shadow: none !important;
 488  }
 489
 490  .function_button {
@@ -496,20 +536,20 @@ css = """
 496
 497  """
 498  filtered_language_dict = {
-499  'English': 'en-US-JennyNeural',
-500  'Chinese': 'zh-CN-XiaoxiaoNeural',
-501  'French': 'fr-FR-DeniseNeural',
-502  'Spanish': 'es-MX-DaliaNeural',
-503  'Arabic': 'ar-SA-ZariyahNeural',
-504  'Portuguese': 'pt-BR-FranciscaNeural',
-505  'Cantonese': 'zh-HK-HiuGaaiNeural'
 506  }
 507
 508  focus_map = {
 509  "D":0,
 510  "DA":1,
 511  "DAI":2,
-512  "
 513  }
 514
 515  '''

@@ -528,11 +568,27 @@ prompt_list = [
 528  ]
 529  '''
 530  prompt_list = [
-531
-532
-533  'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact
-534  'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and
 535  ]
 536
 537
 538  gpt_state = 0
@@ -665,11 +721,11 @@ def init_openai_api_key(api_key=""):
 665  global gpt_state
 666  gpt_state=1
 667  # return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
-668  return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]
 669  else:
 670  gpt_state=0
 671  # return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
-672  return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]
 673
 674  def init_wo_openai_api_key():
 675  global gpt_state

@@ -714,7 +770,7 @@ def update_click_state(click_state, caption, click_mode):
 714  raise NotImplementedError
 715
 716  async def chat_input_callback(*args):
-717  visual_chatgpt, chat_input, click_state, state, aux_state ,language , autoplay = args
 718  message = chat_input["text"]
 719  if visual_chatgpt is not None:
 720  state, _, aux_state, _ = visual_chatgpt.run_text(message, state, aux_state)

@@ -724,12 +780,12 @@ async def chat_input_callback(*args):
 724  return state, state, aux_state, None
 725
 726  else:
-727  audio = await texttospeech(last_response,language,autoplay)
 728  return state, state, aux_state, audio
 729  else:
 730  response = "Text refiner is not initilzed, please input openai api key."
 731  state = state + [(chat_input, response)]
-732  audio = await texttospeech(response,language,autoplay)
 733  return state, state, None, audio
 734
 735
@@ -774,37 +830,63 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
 774  visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
 775  print("memory",visual_chatgpt.agent.memory)
 776  # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
-777  parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\" }")
 778  parsed_data = json.loads(parsed_data.replace("'", "\""))
-779  name, artist, year, material= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"]
-780
 781
-782  if
-783 through -802  (removed lines; content not recoverable from the diff view)
 803
 804
 805
 806  return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
-807  original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist]
 808
 809
 810
@@ -842,14 +924,23 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
 842
 843  enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
 844  out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
-845  state = state + [("You've selected image point at {}, ".format(prompt["input_point"]), None)]
 846
-847
-848
 849  text = out['generated_captions']['raw_caption']
 850  input_mask = np.array(out['mask'].convert('P'))
 851  image_input_nobackground = mask_painter(np.array(image_input), input_mask,background_alpha=0)
-852  image_input_withbackground=mask_painter(np.array(image_input), input_mask)
 853
 854  click_index_state = click_index
 855  input_mask_state = input_mask

@@ -878,9 +969,9 @@ query_focus = {
 878  }
 879
 880
-881  async def submit_caption(state,length, sentiment, factuality, language,
 882  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
-883  autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path):
 884
 885
 886  state = state + [(query_focus[focus_type], None)]

@@ -896,7 +987,7 @@ async def submit_caption(state,length, sentiment, factuality, language,
 896  print("input_points_state",input_points_state)
 897  print("input_labels_state",input_labels_state)
 898
-899  prompt=generate_prompt(focus_type,paragraph,length,sentiment,factuality,language)
 900
 901  print("Prompt:", prompt)
 902  print("click",click_index)
@@ -918,6 +1009,16 @@ async def submit_caption(state,length, sentiment, factuality, language,
 918  read_info = re.sub(r'[#[\]!*]','',focus_info)
 919  read_info = emoji.replace_emoji(read_info,replace="")
 920  print("read info",read_info)
 921
 922  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
 923  # input_points=input_points, input_labels=input_labels)

@@ -925,25 +1026,26 @@ async def submit_caption(state,length, sentiment, factuality, language,
 925  if autoplay==False:
 926  return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None
 927
-928  audio_output = await texttospeech(read_info, language, autoplay)
 929  print("done")
 930  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
-931  return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
 932
 933  except Exception as e:
 934  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
 935  print(f"Error during TTS prediction: {str(e)}")
 936  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
-937  return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
 938
 939  else:
 940  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
 941  print(f"Error during TTS prediction: {str(e)}")
-942  return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None
 943
 944
 945
-946
 947
 948  mapped_value = focus_map.get(focus_type, -1)
 949

@@ -953,9 +1055,13 @@ def generate_prompt(focus_type, paragraph,length, sentiment, factuality, languag
 953  'factuality': factuality,
 954  'language': language
 955  }
 956
 957  if mapped_value != -1:
-958  prompt = prompt_list[mapped_value].format(
 959  Wiki_caption=paragraph,
 960  length=controls['length'],
 961  sentiment=controls['sentiment'],

@@ -964,8 +1070,8 @@ def generate_prompt(focus_type, paragraph,length, sentiment, factuality, languag
 964  else:
 965  prompt = "Invalid focus type."
 966
-967  if controls['factuality'] == "Imagination":
-968
 969
 970  return prompt
 971
@@ -1061,103 +1167,103 @@ def get_sketch_prompt(mask: Image.Image):
 1061
 1062  submit_traj=0
 1063
-1064  async def inference_traject(origin_image,sketcher_image, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
-1065 through -1148  (removed body of inference_traject; content not recoverable from the diff view)
-1149  else:
-1150  try:
-1151  audio_output = await texttospeech(focus_info, language, autoplay)
-1152  # waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
-1153  # return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
-1154  return state, state, image_input,audio_output
 1155
 1156
-1157 through -1160  (removed lines; content not shown in the diff view)
 1161
 1162
 1163  def clear_chat_memory(visual_chatgpt, keep_global=False):
@@ -1172,7 +1278,7 @@ def clear_chat_memory(visual_chatgpt, keep_global=False):
 1172  visual_chatgpt.global_prompt = ""
 1173
 1174
-1175  def export_chat_log(chat_state, paragraph, liked, disliked):
 1176  try:
 1177  if not chat_state:
 1178  return None

@@ -1201,41 +1307,44 @@ def export_chat_log(chat_state, paragraph, liked, disliked):
 1201  temp_file.write(chat_log.encode('utf-8'))
 1202  temp_file_path = temp_file.name
 1203  print(temp_file_path)
-1204
 1205  except Exception as e:
 1206  print(f"An error occurred while exporting the chat log: {e}")
 1207  return None
 1208
 1209  async def get_artistinfo(artist_name,api_key,state,language,autoplay,length):
-1210  prompt=f"Provide a concise summary of about {length} words in {language} on the painter {artist_name}, covering his biography, major works, artistic style, significant contributions to the art world, and any major awards or recognitions he has received."
 1211  res=get_gpt_response(api_key,None,prompt)
-1212  state = state + [(None,
 1213  read_info = re.sub(r'[#[\]!*]','',res)
 1214  read_info = emoji.replace_emoji(read_info,replace="")
 1215
 1216
 1217  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
 1218  # input_points=input_points, input_labels=input_labels)
-1219
-1220
 1221  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
-1222
 1223
 1224
 1225  async def get_yearinfo(year,api_key,state,language,autoplay,length):
-1226  prompt = f"Provide a concise summary of about {length} words in {language} on the art historical period associated with the year {year}, covering its major characteristics, influential artists, notable works, and its significance in the broader context of art history."
 1227  res=get_gpt_response(api_key,None,prompt)
-1228  state = state + [(None,
 1229  read_info = re.sub(r'[#[\]!*]','',res)
 1230  read_info = emoji.replace_emoji(read_info,replace="")
 1231
 1232
 1233  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
 1234  # input_points=input_points, input_labels=input_labels)
-1235
-1236
 1237  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
-1238
 1239
 1240
 1241
@@ -1364,10 +1473,10 @@ def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragr
 1364
 1365  # return like_state, dislike_state
 1366
-1367  async def texttospeech(text, language, autoplay):
 1368  try:
 1369  if autoplay:
-1370  voice = filtered_language_dict[language]
 1371  communicate = edge_tts.Communicate(text, voice)
 1372  file_path = "output.wav"
 1373  await communicate.save(file_path)

@@ -1385,6 +1494,35 @@ async def texttospeech(text, language, autoplay):
 1385  print(f"Error in texttospeech: {e}")
 1386  return None
 1387
 1388  def print_like_dislike(x: gr.LikeData,like_res,dislike_res,state):
 1389  print(x.index, x.value, x.liked)
 1390  if x.liked == True:
@@ -1398,14 +1536,15 @@ def print_like_dislike(x: gr.LikeData,like_res,dislike_res,state):
 1398  return like_res,dislike_res,state
 1399
 1400
 1401  def toggle_icons_and_update_prompt(point_prompt):
 1402  new_prompt = "Negative" if point_prompt == "Positive" else "Positive"
 1403  new_add_icon = "assets/icons/plus-square-blue.png" if new_prompt == "Positive" else "assets/icons/plus-square.png"
 1404  new_minus_icon = "assets/icons/minus-square.png" if new_prompt == "Positive" else "assets/icons/minus-square-blue.png"
-1405
-1406
-1407
-1408  return new_prompt, gr.update(icon=new_add_icon), gr.update(icon=new_minus_icon)
 1409
 1410  add_icon_path="assets/icons/plus-square-blue.png"
 1411  minus_icon_path="assets/icons/minus-square.png"

@@ -1420,12 +1559,11 @@ def create_ui():
 1420
 1421  examples = [
 1422  ["test_images/ambass.jpg"],
-1423  ["test_images/
-1424  ["test_images/
-1425  ["test_images/
-1426  ["test_images/
-1427  ["test_images/
-1428  ["test_images/Picture4.png"],
 1429  ["test_images/Picture5.png"],
 1430
 1431  ]
@@ -1457,6 +1595,9 @@ def create_ui():
 1457  gr.Markdown(title)
 1458  gr.Markdown(description)
 1459  point_prompt = gr.State("Positive")
 1460  # with gr.Row(align="right", visible=False, elem_id="top_row") as top_row:
 1461  # with gr.Column(scale=0.5):
 1462  # # gr.Markdown("Left side content")

@@ -1479,8 +1620,8 @@ def create_ui():
 1479  value="English", label="Language", interactive=True, elem_classes="custom-language"
 1480  )
 1481  length = gr.Slider(
-1482  minimum=
-1483  maximum=
 1484  value=80,
 1485  step=1,
 1486  interactive=True,
@@ -1507,45 +1648,47 @@ def create_ui():
 1507  with gr.Column(scale=6):
 1508  with gr.Column(visible=False) as modules_not_need_gpt:
 1509  with gr.Tab("Base(GPT Power)") as base_tab:
-1510  image_input_base = gr.Image(type="pil", interactive=True,
 1511  with gr.Row():
 1512  name_label_base = gr.Button(value="Name: ",elem_classes="info_btn")
-1513  artist_label_base = gr.Button(value="Artist: ",elem_classes="
-1514  year_label_base = gr.Button(value="Year: ",elem_classes="
 1515  material_label_base = gr.Button(value="Style: ",elem_classes="info_btn")
 1516
 1517  with gr.Tab("Base2") as base_tab2:
-1518  image_input_base_2 = gr.Image(type="pil", interactive=True,
 1519  with gr.Row():
 1520  name_label_base2 = gr.Button(value="Name: ",elem_classes="info_btn")
-1521  artist_label_base2 = gr.Button(value="Artist: ",elem_classes="
-1522  year_label_base2 = gr.Button(value="Year: ",elem_classes="
 1523  material_label_base2 = gr.Button(value="Style: ",elem_classes="info_btn")
 1524
 1525  with gr.Tab("Click") as click_tab:
 1526  with gr.Row():
 1527  with gr.Column(scale=10,min_width=600):
-1528  image_input = gr.Image(type="pil", interactive=True,
 1529  example_image = gr.Image(type="pil", interactive=False, visible=False)
 1530  with gr.Row():
 1531  name_label = gr.Button(value="Name: ",elem_classes="info_btn")
-1532  artist_label = gr.Button(value="Artist: ",elem_classes="
-1533  year_label = gr.Button(value="Year: ",elem_classes="
 1534  material_label = gr.Button(value="Style: ",elem_classes="info_btn")
 1535
 1536
 1537  # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
 1538  # the tool column
 1539  with gr.Column(scale=1,elem_id="tool_box",min_width=80):
-1540  add_button = gr.Button(value="", interactive=True,elem_classes="
-1541  minus_button = gr.Button(value="", interactive=True,elem_classes="tools_button",icon=minus_icon_path)
 1542  clear_button_click = gr.Button(value="Reset", interactive=True,elem_classes="tools_button")
 1543  clear_button_image = gr.Button(value="Change", interactive=True,elem_classes="tools_button")
-1544  focus_d = gr.Button(value="D",interactive=True,elem_classes="function_button")
-1545  focus_da = gr.Button(value="DA",interactive=True,elem_classes="function_button")
-1546  focus_dai = gr.Button(value="DAI",interactive=True,elem_classes="function_button")
-1547  focus_dda = gr.Button(value="
-1548
 1549
 1550  with gr.Row(visible=False):
 1551  with gr.Column():
@@ -1608,7 +1751,22 @@ def create_ui():
 1608  value="No",
 1609  label="Expert",
 1610  interactive=True)
-1611
 1612
 1613  with gr.Column(visible=True) as modules_not_need_gpt3:
 1614  gr.Examples(

@@ -1671,7 +1829,7 @@ def create_ui():
 1671  ############# this part is for text to image #############
 1672  ###############################################################################
 1673
-1674  with gr.Row(variant="panel") as text2image_model:
 1675
 1676  with gr.Column():
 1677  with gr.Column():

@@ -1719,7 +1877,7 @@ def create_ui():
 1719  value=0,
 1720  )
 1721  randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-1722  with gr.Row(
 1723  width = gr.Slider(
 1724  label="Width",
 1725  minimum=100,
@@ -1749,21 +1907,21 @@ def create_ui():
 1749  step=1,
 1750  value=8,
 1751  )
-1752  with gr.Column():
-1753 through -1766  (removed lines; content not recoverable from the diff view)
 1767
 1768  with gr.Row():
 1769  naritive = gr.Radio(
@@ -1814,8 +1972,16 @@ def create_ui():
 1814  recommend_btn.click(
 1815  fn=infer,
 1816  inputs=[new_crop_save_path],
-1817  outputs=[
 1818  )
 1819
 1820  ###############################################################################
 1821  ############# above part is for text to image #############
@@ -1966,11 +2132,11 @@ def create_ui():
 1966
 1967  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
 1968  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
-1969  modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row])
 1970  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
 1971  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
 1972  modules_not_need_gpt,
-1973  modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row])
 1974  # openai_api_key.submit(init_openai_api_key,
 1975  # outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
 1976  # modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])

@@ -2077,7 +2243,7 @@ def create_ui():
 2077  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
 2078  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
 2079  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
-2080  paragraph,artist])
 2081
 2082  # image_input_base_2.upload(upload_callback, [image_input_base_2, state, visual_chatgpt,openai_api_key],
 2083  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,

@@ -2103,7 +2269,7 @@ def create_ui():
 2103  # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
 2104  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
 2105  # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base,paragraph,artist])
-2106  chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
 2107  [chatbot, state, aux_state,output_audio])
 2108  # chat_input.submit(lambda: "", None, chat_input)
 2109  chat_input.submit(lambda: {"text": ""}, None, chat_input)
@@ -2114,7 +2280,7 @@ def create_ui():
 2114  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
 2115  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
 2116  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
-2117  paragraph,artist])
 2118
 2119  example_image.change(clear_chat_memory, inputs=[visual_chatgpt])
 2120

@@ -2161,11 +2327,11 @@ def create_ui():
 2161  focus_d.click(
 2162  submit_caption,
 2163  inputs=[
-2164  state,length, sentiment, factuality, language,
-2165  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state, auto_play, paragraph,focus_d,openai_api_key,new_crop_save_path
 2166  ],
 2167  outputs=[
-2168  chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio
 2169  ],
 2170  show_progress=True,
 2171  queue=True

@@ -2178,7 +2344,7 @@ def create_ui():
 2178  focus_da.click(
 2179  submit_caption,
 2180  inputs=[
-2181  state,length, sentiment, factuality, language,
 2182  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,auto_play, paragraph,focus_da,openai_api_key,new_crop_save_path
 2183  ],
 2184  outputs=[

@@ -2192,7 +2358,7 @@ def create_ui():
 2192  focus_dai.click(
 2193  submit_caption,
 2194  inputs=[
-2195  state,length, sentiment, factuality, language,
 2196  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
 2197  auto_play, paragraph,focus_dai,openai_api_key,new_crop_save_path
 2198  ],

@@ -2207,7 +2373,7 @@ def create_ui():
 2207  focus_dda.click(
 2208  submit_caption,
 2209  inputs=[
-2210  state,length, sentiment, factuality, language,
 2211  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
 2212  auto_play, paragraph,focus_dda,openai_api_key,new_crop_save_path
 2213  ],
@@ -2252,11 +2418,20 @@ def create_ui():
 2252
 2253  export_button.click(
 2254  export_chat_log,
-2255  inputs=[state,paragraph,like_res,dislike_res],
-2256  outputs=[chat_log_file],
 2257  queue=True
 2258  )
 2259
 2260  # upvote_btn.click(
 2261  # handle_liked,
 2262  # inputs=[state,like_res],

@@ -2281,4 +2456,4 @@ if __name__ == '__main__':
 2281  iface = create_ui()
 2282  iface.queue(api_open=False, max_size=10)
 2283  # iface.queue(concurrency_count=5, api_open=False, max_size=10)
-2284  iface.launch(server_name="0.0.0.0"
css = """
|
477 |
#warning {background-color: #FFCCCB}
|
478 |
.tools_button {
|
479 |
+
display: flex;
|
480 |
+
flex-direction: column;
|
481 |
+
align-items: center;
|
482 |
+
justify-content: center;
|
483 |
background: white;
|
484 |
border: none !important;
|
485 |
box-shadow: none !important;
|
486 |
+
text-align: center;
|
487 |
+
color: black;
|
488 |
}
|
489 |
|
490 |
+
.tools_button_clicked {
|
491 |
+
display: flex;
|
492 |
+
flex-direction: column;
|
493 |
+
align-items: center;
|
494 |
+
justify-content: center;
|
495 |
+
background: white;
|
496 |
+
border: none !important;
|
497 |
+
box-shadow: none !important;
|
498 |
+
text-align: center;
|
499 |
+
color: rgb(18,150,219);
|
500 |
+
}
|
501 |
+
|
502 |
+
.tools_button_add {
|
503 |
+
display: flex;
|
504 |
+
flex-direction: column;
|
505 |
+
align-items: center;
|
506 |
+
justify-content: center;
|
507 |
background: white;
|
508 |
border: none !important;
|
509 |
box-shadow: none !important;
|
510 |
+
text-align: center;
|
511 |
+
color: rgb(18,150,219);
|
512 |
+
}
|
513 |
+
|
514 |
+
|
515 |
+
.image_upload {
|
516 |
+
height: 650px;
|
517 |
+
}
|
518 |
+
|
519 |
+
.info_btn {
|
520 |
+
background: white !important;
|
521 |
+
border: none !important;
|
522 |
+
box-shadow: none !important;
|
523 |
+
}
|
524 |
+
|
525 |
+
info_btn_interact {
|
526 |
+
background: white !important;
|
527 |
+
box-shadow: none !important;
|
528 |
}
|
529 |
|
530 |
.function_button {
|
|
|
536 |
|
537 |
"""
|
538 |
 538  filtered_language_dict = {
+539  'English': {'female': 'en-US-JennyNeural', 'male': 'en-US-GuyNeural'},
+540  'Chinese': {'female': 'zh-CN-XiaoxiaoNeural', 'male': 'zh-CN-YunxiNeural'},
+541  'French': {'female': 'fr-FR-DeniseNeural', 'male': 'fr-FR-HenriNeural'},
+542  'Spanish': {'female': 'es-MX-DaliaNeural', 'male': 'es-MX-JorgeNeural'},
+543  'Arabic': {'female': 'ar-SA-ZariyahNeural', 'male': 'ar-SA-HamedNeural'},
+544  'Portuguese': {'female': 'pt-BR-FranciscaNeural', 'male': 'pt-BR-AntonioNeural'},
+545  'Cantonese': {'female': 'zh-HK-HiuGaaiNeural', 'male': 'zh-HK-WanLungNeural'}
 546  }
 547
 548  focus_map = {
 549  "D":0,
 550  "DA":1,
 551  "DAI":2,
+552  "Judge":3
 553  }
 554
 555  '''
 568  ]
 569  '''
 570  prompt_list = [
+571  [
+572
+573  'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
+574  'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
+575  'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
+576  'You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
+577  ],
+578  [
+579  'When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. You have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
+580  'When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. You have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
+581  'When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. You have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
+582  'You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
+583  ],
+584  [
+585  'When generating answers, you should tell people that you are the object or the person itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object or the person and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
+586  'When generating answers, you should tell people that you are the object or the person itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object or the person and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
+587  'When generating answers, you should tell people that you are the object or the person itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object or the person and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.',
+588  'You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
+589  ]
 590  ]
+591
 592
 593
 594  gpt_state = 0
 721  global gpt_state
 722  gpt_state=1
 723  # return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
+724  return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*2
 725  else:
 726  gpt_state=0
 727  # return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
+728  return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*2
 729
 730  def init_wo_openai_api_key():
 731  global gpt_state
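Note: the only change in this hunk is the trailing multiplier on the returned list of gr.update objects (now *2), which has to track the number of output components wired to init_openai_api_key. A minimal, self-contained sketch of that contract, with illustrative component names rather than the app's own:

```python
import gradio as gr

def toggle(show):
    # One return value per output component; lists concatenate exactly as in app.py.
    return [gr.update(visible=show)] * 2 + [gr.update(visible=not show)]

with gr.Blocks() as demo:
    show_box = gr.Checkbox(value=True, label="Show first two")
    a, b, c = gr.Textbox(label="A"), gr.Textbox(label="B"), gr.Textbox(label="C")
    # Three outputs, three returned updates; a mismatch raises an error at event time.
    show_box.change(toggle, inputs=[show_box], outputs=[a, b, c])
# demo.launch()
```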
 770  raise NotImplementedError
 771
 772  async def chat_input_callback(*args):
+773  visual_chatgpt, chat_input, click_state, state, aux_state ,language , autoplay,gender = args
 774  message = chat_input["text"]
 775  if visual_chatgpt is not None:
 776  state, _, aux_state, _ = visual_chatgpt.run_text(message, state, aux_state)
 780  return state, state, aux_state, None
 781
 782  else:
+783  audio = await texttospeech(last_response,language,autoplay,gender)
 784  return state, state, aux_state, audio
 785  else:
 786  response = "Text refiner is not initilzed, please input openai api key."
 787  state = state + [(chat_input, response)]
+788  audio = await texttospeech(response,language,autoplay,gender)
 789  return state, state, None, audio
 790
 791
 830  visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
 831  print("memory",visual_chatgpt.agent.memory)
 832  # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
+833  parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\",\"gender\": \"The gender of the author\" }")
 834  parsed_data = json.loads(parsed_data.replace("'", "\""))
+835  name, artist, year, material,gender= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"],parsed_data['gender']
+836  gender=gender.lower()
+837  print("gender",gender)
+838
 839
+840  if language=="English":
+841  if narritive=="Third" :
+842  state = [
+843  (
+844  None,
+845  f"🤖 Hi, I am EyeSee. Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant information."
+846  )
+847  ]
+848  elif narritive=="Artist":
+849  state = [
+850  (
+851  None,
+852  f"🧑🎨 Hello, I am the {artist}. Welcome to explore my painting, '{name}'. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant insights and thoughts behind my creation."
+853  )
+854  ]
+855  elif narritive=="Item":
+856  state = [
+857  (
+858  None,
+859  f"🎨 Hello, Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with relevant insights and thoughts from the perspective of the objects within the painting"
+860  )
+861  ]
+862  elif language=="Chinese":
+863  if narritive == "Third":
+864  state = [
+865  (
+866  None,
+867  f"🤖 你好,我是 EyeSee。让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会为你提供相关的信息。"
+868  )
+869  ]
+870  elif narritive == "Artist":
+871  state = [
+872  (
+873  None,
+874  f"🧑🎨 你好,我是{artist}。欢迎探索我的画作《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会为你提供我的创作背后的相关见解和想法。"
+875  )
+876  ]
+877  elif narritive == "Item":
+878  state = [
+879  (
+880  None,
+881  f"🎨 你好,让我们一起探索这幅画《{name}》。你可以点击你感兴趣的区域,并选择四种信息类型之一:描述、分析、解读和评判。根据你的选择,我会从画面上事物的视角为你提供相关的见解和想法。"
+882  )
+883  ]
+884
 885
 886
 887
 888  return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
+889  original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist, gender]
 890
 891
 892
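Note: upload_callback now asks GPT for a fifth field, "gender", and lowercases it before storing it in state. A hedged sketch of just the parsing step; the get() fallback default is an assumption for robustness, not something the commit adds:

```python
import json

def parse_painting_info(raw: str):
    # The GPT reply sometimes uses single quotes, hence the replace() before
    # json.loads, mirroring the call in upload_callback.
    data = json.loads(raw.replace("'", '"'))
    gender = str(data.get("gender", "female")).lower()
    return data["name"], data["artist"], data["year"], data["style"], gender

print(parse_painting_info(
    '{"name": "Mona Lisa", "artist": "Leonardo da Vinci", '
    '"year": "c. 1503 (High Renaissance)", "style": "Sfumato", "gender": "Male"}'))
```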
 925  enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
 926  out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
+927  # state = state + [("You've selected image point at {}, ".format(prompt["input_point"]), None)]
+928
+929  # state = state + [("Selected image point: {}, Input label: {}".format(
+930  # prompt["input_point"],
+931  # '+' if prompt["input_label"] == "1" else '-'
+932  # ), None)]
 933
+934  output_label = ['+' if label == 1 else '-' for label in prompt["input_label"]]
+935
+936  state = state + [("Image point: {}, Input label: {}".format(prompt["input_point"], output_label), None)]
+937
+938
+939
+940  # update_click_state(click_state, out['generated_captions']['raw_caption'], click_mode)
 941  text = out['generated_captions']['raw_caption']
 942  input_mask = np.array(out['mask'].convert('P'))
 943  image_input_nobackground = mask_painter(np.array(image_input), input_mask,background_alpha=0)
 944
 945  click_index_state = click_index
 946  input_mask_state = input_mask
 969  }
 970
 971
+972  async def submit_caption(naritive, state,length, sentiment, factuality, language,
 973  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
+974  autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path, gender):
 975
 976
 977  state = state + [(query_focus[focus_type], None)]
 987  print("input_points_state",input_points_state)
 988  print("input_labels_state",input_labels_state)
 989
+990  prompt=generate_prompt(focus_type,paragraph,length,sentiment,factuality,language, naritive)
 991
 992  print("Prompt:", prompt)
 993  print("click",click_index)
 1009  read_info = re.sub(r'[#[\]!*]','',focus_info)
 1010  read_info = emoji.replace_emoji(read_info,replace="")
 1011  print("read info",read_info)
+1012  if naritive=="Item":
+1013  parsed_data = get_gpt_response(openai_api_key, new_crop_save_path,prompt = f"Based on the information {focus_info}, return the gender of this item, returns its most likely gender, do not return unknown, in the format {{\"gender\": \"<gender>\"}}")
+1014  parsed_data = json.loads(parsed_data)
+1015
+1016  try:
+1017  gender=parsed_data['gender']
+1018  gender=gender.lower()
+1019  except:
+1020  print("error gpt responese")
+1021  print("item gender",gender)
 1022
 1023  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
 1024  # input_points=input_points, input_labels=input_labels)
 1026  if autoplay==False:
 1027  return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None
 1028
+1029  audio_output = await texttospeech(read_info, language, autoplay,gender)
 1030  print("done")
 1031  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+1032  return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output,gender,focus_info
 1033
 1034  except Exception as e:
 1035  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
 1036  print(f"Error during TTS prediction: {str(e)}")
 1037  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
+1038  return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output,gender,focus_info
 1039
 1040  else:
 1041  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
 1042  print(f"Error during TTS prediction: {str(e)}")
+1043  return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None,None,focus_info
 1044
 1045
 1046
+1047
+1048  def generate_prompt(focus_type, paragraph,length, sentiment, factuality, language,naritive):
 1049
 1050  mapped_value = focus_map.get(focus_type, -1)
 1051
 1055  'factuality': factuality,
 1056  'language': language
 1057  }
+1058
+1059  naritive_mapping = {"Third": 0, "Artist": 1, "Item": 2}
+1060
+1061  naritive_value=naritive_mapping[naritive]
 1062
 1063  if mapped_value != -1:
+1064  prompt = prompt_list[naritive_value][mapped_value].format(
 1065  Wiki_caption=paragraph,
 1066  length=controls['length'],
 1067  sentiment=controls['sentiment'],
 1070  else:
 1071  prompt = "Invalid focus type."
 1072
+1073  # if controls['factuality'] == "Imagination":
+1074  # prompt += " Assuming that I am someone who has viewed a lot of art and has a lot of experience viewing art. Explain artistic features (composition, color, style, or use of light) and discuss the symbolism of the content and its influence on later artistic movements."
 1075
 1076  return prompt
 1077
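Note: generate_prompt now indexes prompt_list twice: the narrative ("Third" / "Artist" / "Item") picks the row and the focus button ("D" / "DA" / "DAI" / "Judge") picks the column. A trimmed sketch of that lookup, with placeholder strings standing in for the full templates above:

```python
focus_map = {"D": 0, "DA": 1, "DAI": 2, "Judge": 3}
naritive_mapping = {"Third": 0, "Artist": 1, "Item": 2}

# Placeholders instead of the real prompt templates.
prompt_list = [[f"{n}/{f} ..." for f in focus_map] for n in naritive_mapping]

def pick_prompt(naritive, focus_type):
    mapped_value = focus_map.get(focus_type, -1)
    if mapped_value == -1:
        return "Invalid focus type."
    # Row = narrative, column = focus type, exactly as in generate_prompt.
    return prompt_list[naritive_mapping[naritive]][mapped_value]

print(pick_prompt("Artist", "DA"))  # -> "Artist/DA ..."
```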
 1168  submit_traj=0
 1169
+1170  # async def inference_traject(naritive, origin_image,sketcher_image, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
+1171  # original_size, input_size, text_refiner,focus_type,paragraph,openai_api_key,autoplay,trace_type):
+1172  # image_input, mask = sketcher_image['background'], sketcher_image['layers'][0]
 1173
+1174  # crop_save_path=""
 1175
+1176  # prompt = get_sketch_prompt(mask)
+1177  # boxes = prompt['input_boxes']
+1178  # boxes = boxes[0]
+1179
+1180  # controls = {'length': length,
+1181  # 'sentiment': sentiment,
+1182  # 'factuality': factuality,
+1183  # 'language': language}
+1184
+1185  # model = build_caption_anything_with_models(
+1186  # args,
+1187  # api_key="",
+1188  # captioner=shared_captioner,
+1189  # sam_model=shared_sam_model,
+1190  # ocr_reader=shared_ocr_reader,
+1191  # text_refiner=text_refiner,
+1192  # session_id=iface.app_id
+1193  # )
+1194
+1195  # model.setup(image_embedding, original_size, input_size, is_image_set=True)
+1196
+1197  # enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
+1198  # out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki,verbose=True)[0]
 1199
+1200  # print(trace_type)
 1201
+1202  # if trace_type=="Trace+Seg":
+1203  # input_mask = np.array(out['mask'].convert('P'))
+1204  # image_input = mask_painter(np.array(image_input), input_mask, background_alpha=0)
+1205  # d3_input=mask_painter(np.array(image_input), input_mask)
+1206  # crop_save_path=out['crop_save_path']
 1207
+1208  # else:
+1209  # image_input = Image.fromarray(np.array(origin_image))
+1210  # draw = ImageDraw.Draw(image_input)
+1211  # draw.rectangle(boxes, outline='red', width=2)
+1212  # d3_input=image_input
+1213  # cropped_image = origin_image.crop(boxes)
+1214  # cropped_image.save('temp.png')
+1215  # crop_save_path='temp.png'
 1216
+1217  # print("crop_svae_path",out['crop_save_path'])
 1218
+1219  # # Update components and states
+1220  # state.append((f'Box: {boxes}', None))
 1221
+1222  # # fake_click_index = (int((boxes[0][0] + boxes[0][2]) / 2), int((boxes[0][1] + boxes[0][3]) / 2))
+1223  # # image_input = create_bubble_frame(image_input, "", fake_click_index, input_mask)
 1224
+1225  # prompt=generate_prompt(focus_type, paragraph, length, sentiment, factuality, language,naritive)
 1226
 1227
+1228  # # if not args.disable_gpt and text_refiner:
+1229  # if not args.disable_gpt:
+1230  # focus_info=get_gpt_response(openai_api_key,crop_save_path,prompt)
+1231  # if focus_info.startswith('"') and focus_info.endswith('"'):
+1232  # focus_info=focus_info[1:-1]
+1233  # focus_info=focus_info.replace('#', '')
+1234  # state = state + [(None, f"{focus_info}")]
+1235  # print("new_cap",focus_info)
+1236  # read_info = re.sub(r'[#[\]!*]','',focus_info)
+1237  # read_info = emoji.replace_emoji(read_info,replace="")
+1238  # print("read info",read_info)
+1239
+1240  # # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
+1241  # # input_points=input_points, input_labels=input_labels)
+1242  # try:
+1243  # audio_output = await texttospeech(read_info, language,autoplay,gender)
+1244  # # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+1245  # return state, state,image_input,audio_output,crop_save_path,d3_input
+1246
+1247
+1248  # except Exception as e:
+1249  # state = state + [(None, f"Error during TTS prediction: {str(e)}")]
+1250  # print(f"Error during TTS prediction: {str(e)}")
+1251  # # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
+1252  # return state, state, image_input,audio_output,crop_save_path
 1253
 1254
+1255  # else:
+1256  # try:
+1257  # audio_output = await texttospeech(focus_info, language, autoplay)
+1258  # # waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
+1259  # # return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+1260  # return state, state, image_input,audio_output
 1261
 1262
+1263  # except Exception as e:
+1264  # state = state + [(None, f"Error during TTS prediction: {str(e)}")]
+1265  # print(f"Error during TTS prediction: {str(e)}")
+1266  # return state, state, image_input,audio_output
 1267
 1268
 1269  def clear_chat_memory(visual_chatgpt, keep_global=False):
 1278  visual_chatgpt.global_prompt = ""
 1279
 1280
+1281  def export_chat_log(chat_state, paragraph, liked, disliked,log_list):
 1282  try:
 1283  if not chat_state:
 1284  return None
 1307  temp_file.write(chat_log.encode('utf-8'))
 1308  temp_file_path = temp_file.name
 1309  print(temp_file_path)
+1310  log_list.append(temp_file_path)
+1311  return log_list,log_list
 1312  except Exception as e:
 1313  print(f"An error occurred while exporting the chat log: {e}")
 1314  return None
 1315
 1316  async def get_artistinfo(artist_name,api_key,state,language,autoplay,length):
+1317  prompt = f"Provide a concise summary of about {length} words in {language} on the painter {artist_name}, covering his biography, major works, artistic style, significant contributions to the art world, and any major awards or recognitions he has received. Start your response with 'Artist Background: '."
 1318  res=get_gpt_response(api_key,None,prompt)
+1319  state = state + [(None, res)]
 1320  read_info = re.sub(r'[#[\]!*]','',res)
 1321  read_info = emoji.replace_emoji(read_info,replace="")
 1322
 1323
 1324  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
 1325  # input_points=input_points, input_labels=input_labels)
+1326  if autoplay:
+1327  audio_output = await texttospeech(read_info, language,autoplay)
 1328  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+1329  return state, state,audio_output
+1330  return state, state,None
 1331
 1332
 1333  async def get_yearinfo(year,api_key,state,language,autoplay,length):
+1334  prompt = f"Provide a concise summary of about {length} words in {language} on the art historical period associated with the year {year}, covering its major characteristics, influential artists, notable works, and its significance in the broader context of art history with 'History Background: '."
 1335  res=get_gpt_response(api_key,None,prompt)
+1336  state = state + [(None, res)]
 1337  read_info = re.sub(r'[#[\]!*]','',res)
 1338  read_info = emoji.replace_emoji(read_info,replace="")
 1339
 1340
 1341  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
 1342  # input_points=input_points, input_labels=input_labels)
+1343  if autoplay:
+1344  audio_output = await texttospeech(read_info, language,autoplay)
 1345  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+1346  return state, state,audio_output
+1347  return state, state,None
 1348
 1349
 1350
 1473
 1474  # return like_state, dislike_state
 1475
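Note: export_chat_log (shown above) now takes a log_list state, appends each exported temp-file path to it, and returns the list twice so it can refresh both the state and the file component. A minimal sketch of that accumulation pattern; the function and variable names mirror the commit, the surrounding Gradio wiring is assumed:

```python
import tempfile

def export_chat_log_sketch(chat_log: str, log_list: list):
    # Write the log to a temp file and remember its path across repeated exports.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
        temp_file.write(chat_log.encode("utf-8"))
        temp_file_path = temp_file.name
    log_list.append(temp_file_path)
    return log_list, log_list  # one copy for the gr.State, one for the output component

logs = []
export_chat_log_sketch("User: hi\nBot: hello", logs)
print(logs)
```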
+1476  async def texttospeech(text, language, autoplay,gender='female'):
 1477  try:
 1478  if autoplay:
+1479  voice = filtered_language_dict[language][gender]
 1480  communicate = edge_tts.Communicate(text, voice)
 1481  file_path = "output.wav"
 1482  await communicate.save(file_path)
 1494  print(f"Error in texttospeech: {e}")
 1495  return None
 1496
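Note: texttospeech now does a two-level lookup, language then gender, before handing the voice to edge-tts. A short sketch of the lookup and the edge-tts call; the defensive fallback to the female voice is an assumption the commit itself does not add:

```python
import asyncio
import edge_tts

filtered_language_dict = {
    'English': {'female': 'en-US-JennyNeural', 'male': 'en-US-GuyNeural'},
}

async def speak(text, language='English', gender='female', out_path='output.wav'):
    voices = filtered_language_dict[language]
    voice = voices.get(gender, voices['female'])  # fallback is an assumption
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(out_path)
    return out_path

# asyncio.run(speak("Hello from EyeSee"))
```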
+1497  async def associate(focus_info,openai_api_key,language,state,autoplay,evt: gr.SelectData):
+1498  rec_path=evt._data['value']['image']['path']
+1499  print("rec_path",rec_path)
+1500  prompt="""
+1501  The information and image I gave you are 2 different paintings. Please analyze the relationship between the image and the information {focus_info}. Discuss their similarities and differences in terms of style, themes, colors, and any other relevant aspects. Provide a detailed analysis that highlights how the information fits into or contrasts with the recommended painting. Consider the following points in your analysis:
+1502  - Artistic style and techniques
+1503  - Themes and subjects
+1504  - Color palettes and compositions
+1505  - Historical and cultural contexts
+1506  - Symbolism and meanings
+1507
+1508  Based on your analysis, provide insights into how the information enhances or contrasts with the recommended painting, and suggest any interesting interpretations or observations. Return your response in {language}
+1509
+1510  """
+1511  prompt=prompt.format(focus_info=focus_info,language=language)
+1512  result=get_gpt_response(openai_api_key, rec_path, prompt)
+1513  state = state + [(None, f"{result}")]
+1514  read_info = re.sub(r'[#[\]!*]','',result)
+1515  read_info = emoji.replace_emoji(result,replace="")
+1516  if autoplay:
+1517  audio_output = await texttospeech(read_info, language, autoplay)
+1518  return state,state,audio_output
+1519  return state,state,None
+1520
+1521
+1522
+1523
+1524
+1525
 1526  def print_like_dislike(x: gr.LikeData,like_res,dislike_res,state):
 1527  print(x.index, x.value, x.liked)
 1528  if x.liked == True:
 1536  return like_res,dislike_res,state
 1537
 1538
+1539
 1540  def toggle_icons_and_update_prompt(point_prompt):
 1541  new_prompt = "Negative" if point_prompt == "Positive" else "Positive"
 1542  new_add_icon = "assets/icons/plus-square-blue.png" if new_prompt == "Positive" else "assets/icons/plus-square.png"
+1543  new_add_css = "tools_button_clicked" if new_prompt == "Positive" else "tools_button"
 1544  new_minus_icon = "assets/icons/minus-square.png" if new_prompt == "Positive" else "assets/icons/minus-square-blue.png"
+1545  new_minus_css= "tools_button" if new_prompt == "Positive" else "tools_button_clicked"
+1546
+1547  return new_prompt, gr.update(icon=new_add_icon,elem_classes=new_add_css), gr.update(icon=new_minus_icon,elem_classes=new_minus_css)
 1548
 1549  add_icon_path="assets/icons/plus-square-blue.png"
 1550  minus_icon_path="assets/icons/minus-square.png"
 1559
 examples = [
     ["test_images/ambass.jpg"],
+    ["test_images/test1.png"],
+    ["test_images/test2.png"],
+    ["test_images/test3.png"],
+    ["test_images/test4.png"],
+    ["test_images/test5.png"],
     ["test_images/Picture5.png"],

 ]

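The `examples` list only stores image paths; it is handed to a `gr.Examples` block later in create_ui(). A minimal illustration of that hand-off (the input component name here is illustrative):

    # Hypothetical usage; the real gr.Examples call appears further down in the UI code.
    gr.Examples(examples=examples, inputs=[image_input])
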
         gr.Markdown(title)
         gr.Markdown(description)
         point_prompt = gr.State("Positive")
+        log_list = gr.State([])
+        gender = gr.State('female')
+        focus_info = gr.State('')
         # with gr.Row(align="right", visible=False, elem_id="top_row") as top_row:
         #     with gr.Column(scale=0.5):
         #         # gr.Markdown("Left side content")
[... lines 1604-1619 unchanged ...]
value="English", label="Language", interactive=True, elem_classes="custom-language"
|
1621 |
)
|
1622 |
length = gr.Slider(
|
1623 |
+
minimum=60,
|
1624 |
+
maximum=120,
|
1625 |
value=80,
|
1626 |
step=1,
|
1627 |
interactive=True,
|
|
|
1648 |
         with gr.Column(scale=6):
             with gr.Column(visible=False) as modules_not_need_gpt:
                 with gr.Tab("Base(GPT Power)") as base_tab:
+                    image_input_base = gr.Image(type="pil", interactive=True, elem_classes="image_upload")
                     with gr.Row():
                         name_label_base = gr.Button(value="Name: ", elem_classes="info_btn")
+                        artist_label_base = gr.Button(value="Artist: ", elem_classes="info_btn_interact")
+                        year_label_base = gr.Button(value="Year: ", elem_classes="info_btn_interact")
                         material_label_base = gr.Button(value="Style: ", elem_classes="info_btn")

                 with gr.Tab("Base2") as base_tab2:
+                    image_input_base_2 = gr.Image(type="pil", interactive=True, elem_classes="image_upload")
                     with gr.Row():
                         name_label_base2 = gr.Button(value="Name: ", elem_classes="info_btn")
+                        artist_label_base2 = gr.Button(value="Artist: ", elem_classes="info_btn_interact")
+                        year_label_base2 = gr.Button(value="Year: ", elem_classes="info_btn_interact")
                         material_label_base2 = gr.Button(value="Style: ", elem_classes="info_btn")

                 with gr.Tab("Click") as click_tab:
                     with gr.Row():
                         with gr.Column(scale=10, min_width=600):
+                            image_input = gr.Image(type="pil", interactive=True, elem_classes="image_upload")
                             example_image = gr.Image(type="pil", interactive=False, visible=False)
                             with gr.Row():
                                 name_label = gr.Button(value="Name: ", elem_classes="info_btn")
+                                artist_label = gr.Button(value="Artist: ", elem_classes="info_btn_interact")
+                                year_label = gr.Button(value="Year: ", elem_classes="info_btn_interact")
                                 material_label = gr.Button(value="Style: ", elem_classes="info_btn")


                             # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
                         # the tool column
                         with gr.Column(scale=1, elem_id="tool_box", min_width=80):
+                            add_button = gr.Button(value="Extend Area", interactive=True, elem_classes="tools_button_add", icon=add_icon_path)
+                            minus_button = gr.Button(value="Remove Area", interactive=True, elem_classes="tools_button", icon=minus_icon_path)
                             clear_button_click = gr.Button(value="Reset", interactive=True, elem_classes="tools_button")
                             clear_button_image = gr.Button(value="Change", interactive=True, elem_classes="tools_button")
+                            focus_d = gr.Button(value="D", interactive=True, elem_classes="function_button", variant="primary")
+                            focus_da = gr.Button(value="DA", interactive=True, elem_classes="function_button", variant="primary")
+                            focus_dai = gr.Button(value="DAI", interactive=True, elem_classes="function_button", variant="primary")
+                            focus_dda = gr.Button(value="Judge", interactive=True, elem_classes="function_button", variant="primary")
+
+                            recommend_btn = gr.Button(value="Recommend", interactive=True, elem_classes="function_button", variant="primary")
+                            # focus_asso = gr.Button(value="Associate", interactive=True, elem_classes="function_button", variant="primary")

             with gr.Row(visible=False):
                 with gr.Column():
[... lines 1695-1750 unchanged ...]
value="No",
|
1752 |
label="Expert",
|
1753 |
interactive=True)
|
1754 |
+
|
1755 |
+
with gr.Column(visible=False) as recommend:
|
1756 |
+
gallery_result = gr.Gallery(
|
1757 |
+
label="Result",
|
1758 |
+
height="auto",
|
1759 |
+
columns=4
|
1760 |
+
# columns=4,
|
1761 |
+
# rows=2,
|
1762 |
+
# show_label=False,
|
1763 |
+
# allow_preview=True,
|
1764 |
+
# object_fit="contain",
|
1765 |
+
# height="auto",
|
1766 |
+
# preview=True,
|
1767 |
+
# show_share_button=True,
|
1768 |
+
# show_download_button=True
|
1769 |
+
)
|
1770 |
|
1771 |
             with gr.Column(visible=True) as modules_not_need_gpt3:
                 gr.Examples(
[... lines 1773-1828 unchanged ...]
         ############# this part is for text to image #############
         ###############################################################################

+        with gr.Row(variant="panel", visible=False) as text2image_model:

             with gr.Column():
                 with gr.Column():
[... lines 1836-1876 unchanged ...]
                     value=0,
                 )
                 randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
+                with gr.Row():
                     width = gr.Slider(
                         label="Width",
                         minimum=100,
[... lines 1884-1906 unchanged ...]
                         step=1,
                         value=8,
                     )
+            # with gr.Column():
+            #     result = gr.Gallery(
+            #         label="Result",
+            #         height="auto",
+            #         columns=4
+            #         # columns=4,
+            #         # rows=2,
+            #         # show_label=False,
+            #         # allow_preview=True,
+            #         # object_fit="contain",
+            #         # height="auto",
+            #         # preview=True,
+            #         # show_share_button=True,
+            #         # show_download_button=True
+            #     )

         with gr.Row():
             naritive = gr.Radio(
[... lines 1928-1971 unchanged ...]
     recommend_btn.click(
         fn=infer,
         inputs=[new_crop_save_path],
+        outputs=[gallery_result]
     )
+
+    gallery_result.select(
+        associate,
+        inputs=[focus_info, openai_api_key, language, state, auto_play],
+        outputs=[chatbot, state, output_audio],
+    )

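Because `associate` declares an `evt: gr.SelectData` parameter, Gradio injects the selection event automatically, which is why only the five remaining arguments appear in `inputs` above. A self-contained sketch of the same pattern with hypothetical names:

    import gradio as gr

    def on_select(note_prefix, evt: gr.SelectData):
        # evt.index / evt.value describe the clicked gallery item
        return f"{note_prefix}: clicked item {evt.index}"

    with gr.Blocks() as demo:
        gallery = gr.Gallery()
        note = gr.Textbox()
        prefix = gr.State("selection")
        gallery.select(on_select, inputs=[prefix], outputs=[note])
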
     ###############################################################################
     ############# above part is for text to image #############
[... lines 1988-2131 unchanged ...]

     openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
                           outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
+                                   modules_not_need_gpt2, tts_interface, module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box, top_row, recommend])
     enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
                                 outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
                                          modules_not_need_gpt,
+                                         modules_not_need_gpt2, tts_interface, module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box, top_row, recommend])
     # openai_api_key.submit(init_openai_api_key,
     #                       outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
     #                                modules_not_need_gpt2, tts_interface, module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box, d3_model, top_row])

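Both bindings above rely on `init_openai_api_key` returning one value (mostly `gr.update(visible=...)`) per component in the `outputs` list, in the same order. A minimal standalone illustration of that pattern, not the app's actual handler:

    import gradio as gr

    def toggle(show_first):
        # one update per output component, in order
        return [gr.update(visible=show_first), gr.update(visible=not show_first)]

    with gr.Blocks() as demo:
        show_first = gr.Checkbox(label="Show panel A")
        panel_a = gr.Textbox(label="A", visible=False)
        panel_b = gr.Textbox(label="B", visible=True)
        show_first.change(toggle, inputs=[show_first], outputs=[panel_a, panel_b])
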
                          [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input, image_input_base_2,
                           image_embedding, original_size, input_size, name_label, artist_label, year_label, material_label, name_label_base, artist_label_base, year_label_base, material_label_base, \
                           name_label_base2, artist_label_base2, year_label_base2, material_label_base2, name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+                          paragraph, artist, gender])

     # image_input_base_2.upload(upload_callback, [image_input_base_2, state, visual_chatgpt, openai_api_key],
     #                           [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input, image_input_base_2,
[... lines 2250-2268 unchanged ...]
     # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
     #                       [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
     #                        image_embedding, original_size, input_size, name_label, artist_label, year_label, material_label, name_label_base, artist_label_base, year_label_base, material_label_base, paragraph, artist])
+    chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state, language, auto_play, gender],
                       [chatbot, state, aux_state, output_audio])
     # chat_input.submit(lambda: "", None, chat_input)
     chat_input.submit(lambda: {"text": ""}, None, chat_input)
[... lines 2276-2279 unchanged ...]
                           [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input, image_input_base_2,
                            image_embedding, original_size, input_size, name_label, artist_label, year_label, material_label, name_label_base, artist_label_base, year_label_base, material_label_base, \
                            name_label_base2, artist_label_base2, year_label_base2, material_label_base2, name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+                           paragraph, artist, gender])

     example_image.change(clear_chat_memory, inputs=[visual_chatgpt])

[... lines 2287-2326 unchanged ...]
     focus_d.click(
         submit_caption,
         inputs=[
+            naritive, state, length, sentiment, factuality, language,
+            out_state, click_index_state, input_mask_state, input_points_state, input_labels_state, auto_play, paragraph, focus_d, openai_api_key, new_crop_save_path, gender
         ],
         outputs=[
+            chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, output_audio, focus_info
         ],
         show_progress=True,
         queue=True
[... lines 2338-2343 unchanged ...]
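All four focus buttons reuse `submit_caption`; each binding passes the clicked button component itself (here `focus_d`) as one of the inputs, so the handler receives that button's label and can pick the matching focus level. A standalone sketch of the pattern with illustrative names:

    import gradio as gr

    def describe(level):
        # `level` receives the value (label) of whichever button was clicked
        return f"Generating a '{level}' description..."

    with gr.Blocks() as demo:
        output = gr.Textbox()
        for label in ["D", "DA", "DAI", "Judge"]:
            button = gr.Button(value=label)
            button.click(describe, inputs=[button], outputs=[output])
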
     focus_da.click(
         submit_caption,
         inputs=[
+            naritive, state, length, sentiment, factuality, language,
             out_state, click_index_state, input_mask_state, input_points_state, input_labels_state, auto_play, paragraph, focus_da, openai_api_key, new_crop_save_path
         ],
         outputs=[
[... lines 2351-2357 unchanged ...]

     focus_dai.click(
         submit_caption,
         inputs=[
+            naritive, state, length, sentiment, factuality, language,
             out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
             auto_play, paragraph, focus_dai, openai_api_key, new_crop_save_path
         ],
[... lines 2365-2372 unchanged ...]
     focus_dda.click(
         submit_caption,
         inputs=[
+            naritive, state, length, sentiment, factuality, language,
             out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
             auto_play, paragraph, focus_dda, openai_api_key, new_crop_save_path
         ],
[... lines 2380-2417 unchanged ...]

     export_button.click(
         export_chat_log,
+        inputs=[state, paragraph, like_res, dislike_res, log_list],
+        outputs=[chat_log_file, log_list],
         queue=True
     )

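`export_chat_log` itself is outside this hunk; given the binding above it has to return two values, something the file component can serve for download plus the updated `log_list` state. Purely as a hypothetical sketch of that contract (not the app's real implementation):

    import json, tempfile

    def export_chat_log(state, paragraph, like_res, dislike_res, log_list):
        # Hypothetical shape: append one record, dump it to a temp file,
        # and return (file_path, updated_log_list) for the two outputs.
        log_list = log_list + [{"chat": state, "paragraph": paragraph,
                                "likes": like_res, "dislikes": dislike_res}]
        with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False,
                                         encoding="utf-8") as f:
            json.dump(log_list, f, ensure_ascii=False, indent=2, default=str)
        return f.name, log_list
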
+    naritive.change(
+        # reset the session when the narrative perspective changes:
+        # six values, one per output component listed below
+        lambda: (None, [], [], [[], [], []], "", ""),
+        [],
+        [image_input, chatbot, state, click_state, paragraph_output, origin_image],
+        queue=False,
+        show_progress=False
+    )

     # upvote_btn.click(
     #     handle_liked,
     #     inputs=[state, like_res],
[... lines 2438-2455 unchanged ...]
 iface = create_ui()
 iface.queue(api_open=False, max_size=10)
 # iface.queue(concurrency_count=5, api_open=False, max_size=10)
+iface.launch(server_name="0.0.0.0")