Niki Zhang committed
Commit 5abd550 · verified · 1 Parent(s): ab4f7f8

Update app.py

Files changed (1)
  1. app.py +259 -190
app.py CHANGED
@@ -25,6 +25,7 @@ import easyocr
25
  import re
26
  import edge_tts
27
  import asyncio
 
28
  # import tts
29
 
30
  ###############################################################################
@@ -32,27 +33,14 @@ import asyncio
32
  ###############################################################################
33
 
34
 
 
35
 
36
-
37
- # import uuid
38
- # from diffusers import AnimateDiffPipeline, MotionAdapter, EulerDiscreteScheduler
39
- # from diffusers.utils import export_to_video
40
- # from safetensors.torch import load_file
41
- #from diffusers.models.modeling_outputs import Transformer2DModelOutput
42
-
43
-
44
- import random
45
- import uuid
46
- import json
47
- from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
48
-
49
-
50
-
51
-
52
  import imageio
53
  import numpy as np
54
  import torch
55
  import rembg
 
56
  from torchvision.transforms import v2
57
  from pytorch_lightning import seed_everything
58
  from omegaconf import OmegaConf
@@ -297,6 +285,7 @@ def make3d(images):
297
  ############# above part is for 3D generate #############
298
  ###############################################################################
299
 
 
300
  ###############################################################################
301
  ############# this part is for text to image #############
302
  ###############################################################################
@@ -418,6 +407,36 @@ filtered_language_dict = {
418
  'Cantonese': 'zh-HK-HiuGaaiNeural'
419
  }
420
421
  gpt_state = 0
422
  VOICE = "en-GB-SoniaNeural"
423
  article = """
@@ -463,6 +482,7 @@ class ImageSketcher(gr.Image):
463
  mask[..., -1] = 255
464
  mask = self.postprocess(mask)
465
  x['mask'] = mask
 
466
  return super().preprocess(x)
467
 
468
 
@@ -512,15 +532,18 @@ def init_openai_api_key(api_key=""):
512
 
513
  global gpt_state
514
  gpt_state=1
515
- return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
 
516
  else:
517
  gpt_state=0
518
- return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
 
519
 
520
  def init_wo_openai_api_key():
521
  global gpt_state
522
  gpt_state=0
523
- return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3
 
524
 
525
  def get_click_prompt(chat_input, click_state, click_mode):
526
  inputs = json.loads(chat_input)
@@ -677,17 +700,16 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
677
  point_prompt = f'You should primarily use tools on the selected regional image (description: {text}, path: {new_crop_save_path}), which is a part of the whole image (path: {visual_chatgpt.current_image}). If human mentioned some objects not in the selected region, you can use tools on the whole image.'
678
  visual_chatgpt.point_prompt = point_prompt
679
 
680
- generated_caption = text
681
- print(generated_caption)
682
  print("new crop save",new_crop_save_path)
683
 
684
- yield state, state, click_state, image_input_nobackground, image_input_withbackground, generated_caption, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
685
 
686
 
687
 
688
 
689
 
690
- async def submit_caption(image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
691
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
692
  autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path):
693
  print("state",state)
@@ -702,23 +724,57 @@ async def submit_caption(image_input, state, generated_caption, text_refiner, vi
702
  print("click_index",click_index)
703
  print("input_points_state",input_points_state)
704
  print("input_labels_state",input_labels_state)
705
-
 
706
 
707
- input_mask = input_mask_state
708
- input_points = input_points_state
709
- input_labels = input_labels_state
710
711
 
712
-
713
- focus_map = {
714
- "CFV-D":0,
715
- "CFV-DA":1,
716
- "CFV-DAI":2,
717
- "PFV-DDA":3
718
- }
719
-
720
  mapped_value = focus_map.get(focus_type, -1)
721
- print("mapped value",mapped_value)
722
 
723
  controls = {
724
  'length': length,
@@ -726,95 +782,21 @@ async def submit_caption(image_input, state, generated_caption, text_refiner, vi
726
  'factuality': factuality,
727
  'language': language
728
  }
729
- '''
730
- prompt_list = [
731
- 'Wiki_caption: {Wiki_caption}, you have to generate a caption according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
732
- 'Wiki_caption: {Wiki_caption}, you have to select sentences from wiki caption that describe the surrounding objects that may be associated with the picture object. Around {length} words of {sentiment} sentiment in {language}.',
733
- 'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.',
734
- 'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.'
735
- ]
736
-
737
- prompt_list = [
738
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the object but does not include analysis)as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
739
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
740
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
741
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects that may be related to the selected object and list one fact of selected object, one fact of related object and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.'
742
- ]
743
- '''
744
- prompt_list = [
745
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis)as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
746
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
747
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
748
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects that may be related to the selected object and list one fact of selected object, one fact of related object and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
749
- ]
750
  if mapped_value != -1:
751
- prompt= prompt_list[mapped_value].format(
752
- raw_caption=generated_caption,
753
  Wiki_caption=paragraph,
754
  length=controls['length'],
755
  sentiment=controls['sentiment'],
756
  language=controls['language']
757
  )
758
-
759
  else:
760
- print("error prompting")
761
  prompt = "Invalid focus type."
762
 
763
  if controls['factuality'] == "Imagination":
764
- prompt += "Assuming that I am someone who has viewed a lot of art and has a lot of experience viewing art. Explain artistic features (composition, color, style, or use of light) and discuss the symbolism of the content and its influence on later artistic movements"
765
-
766
- print("Prompt:", prompt)
767
- print("click",click_index)
768
-
769
- origin_image_input = image_input
770
-
771
-
772
 
773
- image_input = create_bubble_frame(np.array(image_input), generated_caption, click_index, input_mask,
774
- input_points=input_points, input_labels=input_labels)
775
-
776
- if generated_caption:
777
- # state = state + [(None, f"RAW_Caption: {generated_caption}")]
778
-
779
-
780
- if not args.disable_gpt and text_refiner:
781
- print("new crop save",new_crop_save_path)
782
- focus_info=get_image_gpt(openai_api_key,new_crop_save_path,prompt)
783
- if focus_info.startswith('"') and focus_info.endswith('"'):
784
- focus_info=focus_info[1:-1]
785
- focus_info=focus_info.replace('#', '')
786
- # state = state + [(None, f"Wiki: {paragraph}")]
787
- state = state + [(None, f"{focus_info}")]
788
- print("new_cap",focus_info)
789
- read_info = re.sub(r'[#[\]!*]','',focus_info)
790
- read_info = emoji.replace_emoji(read_info,replace="")
791
- print("read info",read_info)
792
-
793
- # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
794
- # input_points=input_points, input_labels=input_labels)
795
- try:
796
- audio_output = await texttospeech(read_info, language,autoplay)
797
- # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
798
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
799
-
800
- except Exception as e:
801
- state = state + [(None, f"Error during TTS prediction: {str(e)}")]
802
- print(f"Error during TTS prediction: {str(e)}")
803
- # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
804
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
805
-
806
- else:
807
- try:
808
- audio_output = await texttospeech(focus_info, language, autoplay)
809
- # waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
810
- waveform_visual, audio_output=None,None
811
- # return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
812
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
813
-
814
- except Exception as e:
815
- state = state + [(None, f"Error during TTS prediction: {str(e)}")]
816
- print(f"Error during TTS prediction: {str(e)}")
817
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
818
 
819
 
820
  def encode_image(image_path):
@@ -892,14 +874,19 @@ def get_sketch_prompt(mask: Image.Image):
892
 
893
  return prompt
894
 
 
895
 
896
- def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
897
- original_size, input_size, text_refiner):
898
  image_input, mask = sketcher_image['image'], sketcher_image['mask']
899
-
 
 
900
  prompt = get_sketch_prompt(mask)
901
  boxes = prompt['input_boxes']
902
  boxes = boxes[0]
 
 
903
 
904
  controls = {'length': length,
905
  'sentiment': sentiment,
@@ -919,38 +906,77 @@ def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuali
919
  model.setup(image_embedding, original_size, input_size, is_image_set=True)
920
 
921
  enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
922
- out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki)[0]
923
 
924
- # if visual_chatgpt is not None:
925
- # print('inference_click: add caption to chatGPT memory')
926
- # new_crop_save_path = get_new_image_name('chat_image', func_name='crop')
927
928
  # Update components and states
929
  state.append((f'Box: {boxes}', None))
930
- state.append((None, f'{out["generated_captions"]["raw_caption"]}'))
931
- text = out['generated_captions']['raw_caption']
932
- input_mask = np.array(out['mask'].convert('P'))
933
- # image_input = mask_painter(np.array(image_input), input_mask, background_alpha=0 )
934
- image_input = Image.fromarray(np.array(image_input))
935
- draw = ImageDraw.Draw(image_input)
936
- draw.rectangle(boxes, outline='red', width=2)
937
-
938
  # fake_click_index = (int((boxes[0][0] + boxes[0][2]) / 2), int((boxes[0][1] + boxes[0][3]) / 2))
939
  # image_input = create_bubble_frame(image_input, "", fake_click_index, input_mask)
940
 
941
- yield state, state, image_input
942
 
943
- if not args.disable_gpt and model.text_refiner:
944
- refined_caption = model.text_refiner.inference(query=text, controls=controls, context=out['context_captions'],
945
- enable_wiki=enable_wiki)
946
 
947
- new_cap = refined_caption['caption']
948
- if refined_caption['wiki']:
949
- state = state + [(None, "Wiki: {}".format(refined_caption['wiki']))]
950
- state = state + [(None, f"{new_cap}")]
951
- # refined_image_input = create_bubble_frame(origin_image_input, new_cap, fake_click_index, input_mask)
952
 
953
- yield state, state, image_input
954
 
955
  def clear_chat_memory(visual_chatgpt, keep_global=False):
956
  if visual_chatgpt is not None:
@@ -1020,32 +1046,55 @@ def get_style():
1020
  #image_sketcher [data-testid="image"], #image_sketcher [data-testid="image"] > div{min-height: 500px}
1021
  #image_upload{min-height:500px}
1022
  #image_upload [data-testid="image"], #image_upload [data-testid="image"] > div{min-height: 500px}
1023
  '''
1024
  elif current_version <= version.parse('3.27'):
1025
  style = '''
1026
  #image_sketcher{min-height:500px}
1027
  #image_upload{min-height:500px}
1028
  '''
1029
  else:
1030
  style = None
1031
 
1032
  return style
1033
 
1034
- def handle_like_dislike(like_data, like_state, dislike_state):
1035
- if like_data.liked:
1036
- if like_data.index not in like_state:
1037
- like_state.append(like_data.index)
1038
- message = f"Liked: {like_data.value} at index {like_data.index}"
1039
- else:
1040
- message = "You already liked this item"
1041
- else:
1042
- if like_data.index not in dislike_state:
1043
- dislike_state.append(like_data.index)
1044
- message = f"Disliked: {like_data.value} at index {like_data.index}"
1045
- else:
1046
- message = "You already disliked this item"
1047
 
1048
- return like_state, dislike_state
1049
 
1050
  async def texttospeech(text,language,autoplay):
1051
  voice=filtered_language_dict[language]
@@ -1060,9 +1109,11 @@ async def texttospeech(text,language,autoplay):
1060
  if autoplay:
1061
  audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls autoplay {audio_style}></audio>'
1062
  else:
1063
- audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls {audio_style}></audio>'
 
1064
  return audio_player
1065
 
 
1066
  def create_ui():
1067
  title = """<p><h1 align="center">EyeSee Anything in Art</h1></p>
1068
  """
@@ -1093,7 +1144,6 @@ def create_ui():
1093
  visual_chatgpt = gr.State(None)
1094
  original_size = gr.State(None)
1095
  input_size = gr.State(None)
1096
- generated_caption = gr.State("")
1097
  paragraph = gr.State("")
1098
  aux_state = gr.State([])
1099
  click_index_state = gr.State((0, 0))
@@ -1102,15 +1152,33 @@ def create_ui():
1102
  input_labels_state = gr.State([])
1103
  new_crop_save_path = gr.State(None)
1104
  image_input_nobackground = gr.State(None)
1105
- like_state=gr.State([])
1106
- dislike_state=gr.State([])
1107
-
1108
-
1109
 
1110
  gr.Markdown(title)
1111
  gr.Markdown(description)
1112
 
1113
- with gr.Row():
1114
  with gr.Column(scale=1.0):
1115
  with gr.Column(visible=False) as modules_not_need_gpt:
1116
  with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
@@ -1156,11 +1224,12 @@ def create_ui():
1156
  clear_button_image = gr.Button(value="Clear Image", interactive=True)
1157
 
1158
  with gr.Tab("Trajectory (beta)") as traj_tab:
1159
- sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=10,
1160
  elem_id="image_sketcher")
1161
- with gr.Row():
1162
- clear_button_sketcher = gr.Button(value="Clear Sketch", interactive=True)
1163
  submit_button_sketcher = gr.Button(value="Submit", interactive=True)
 
1164
  with gr.Row(scale=1.0):
1165
  with gr.Row(scale=0.8):
1166
  focus_type_sketch = gr.Radio(
@@ -1171,7 +1240,7 @@ def create_ui():
1171
  Input_sketch = gr.Radio(
1172
  choices=["Trace+Seg", "Trace"],
1173
  value="Trace+Seg",
1174
- label="Caption Type",
1175
  interactive=True)
1176
 
1177
  with gr.Column(visible=False) as modules_need_gpt1:
@@ -1203,26 +1272,17 @@ def create_ui():
1203
  value="No",
1204
  label="Expert",
1205
  interactive=True)
1206
-
1207
  with gr.Column(visible=True) as modules_not_need_gpt3:
1208
  gr.Examples(
1209
- examples=examples,
1210
- inputs=[example_image],
1211
- )
 
1212
 
1213
 
1214
 
1215
 
1216
- with gr.Column(scale=0.5):
1217
- with gr.Row(align="right",visible=False) as language_select:
1218
- language = gr.Dropdown(
1219
- ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
1220
- value="English", label="Language", interactive=True)
1221
-
1222
- with gr.Row(align="right",visible=False) as autoplay:
1223
- auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
1224
- output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
1225
-
1226
  with gr.Column(visible=True) as module_key_input:
1227
  openai_api_key = gr.Textbox(
1228
  placeholder="Input openAI API key",
@@ -1243,7 +1303,7 @@ def create_ui():
1243
  cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)
1244
 
1245
  with gr.Column(visible=False) as modules_not_need_gpt2:
1246
- with gr.Blocks(css=css):
1247
  chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True).style(height=600, scale=0.5)
1248
  with gr.Column(visible=False) as modules_need_gpt3:
1249
  chat_input = gr.Textbox(show_label=False, placeholder="Enter text and press Enter").style(
@@ -1251,6 +1311,9 @@ def create_ui():
1251
  with gr.Row():
1252
  clear_button_text = gr.Button(value="Clear Text", interactive=True)
1253
  submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
 
 
 
1254
  with gr.Row():
1255
  export_button = gr.Button(value="Export Chat Log", interactive=True, variant="primary")
1256
  with gr.Row():
@@ -1421,7 +1484,7 @@ def create_ui():
1421
  # this part is for 3d generate.
1422
  ###############################################################################
1423
 
1424
- with gr.Row(variant="panel") as d3_model:
1425
  with gr.Column():
1426
  with gr.Row():
1427
  input_image = gr.Image(
@@ -1529,7 +1592,7 @@ def create_ui():
1529
 
1530
  def clear_tts_fields():
1531
  return [gr.update(value=""), gr.update(value=""), None, None, gr.update(value=False), gr.update(value=True), None, None]
1532
-
1533
  # submit_tts.click(
1534
  # tts.predict,
1535
  # inputs=[input_text, input_language, input_audio, input_mic, use_mic, agree],
@@ -1544,6 +1607,9 @@ def create_ui():
1544
  queue=False
1545
  )
1546
 
 
 
 
1547
  clear_button_sketcher.click(
1548
  lambda x: (x),
1549
  [origin_image],
@@ -1552,18 +1618,21 @@ def create_ui():
1552
  show_progress=False
1553
  )
1554
 
 
 
 
1555
 
1556
  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
1557
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
1558
- modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay,d3_model])
1559
  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
1560
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
1561
  modules_not_need_gpt,
1562
- modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay,d3_model])
1563
  disable_chatGPT_button.click(init_wo_openai_api_key,
1564
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
1565
  modules_not_need_gpt,
1566
- modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay,d3_model])
1567
 
1568
  enable_chatGPT_button.click(
1569
  lambda: (None, [], [], [[], [], []], "", "", ""),
@@ -1677,7 +1746,7 @@ def create_ui():
1677
  image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
1678
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
1679
  ],
1680
- outputs=[chatbot, state, click_state, image_input, input_image, generated_caption, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground],
1681
  show_progress=False, queue=True
1682
  )
1683
 
@@ -1685,7 +1754,7 @@ def create_ui():
1685
  submit_button_click.click(
1686
  submit_caption,
1687
  inputs=[
1688
- image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
1689
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
1690
  auto_play,paragraph,focus_type,openai_api_key,new_crop_save_path
1691
  ],
@@ -1701,10 +1770,10 @@ def create_ui():
1701
  submit_button_sketcher.click(
1702
  inference_traject,
1703
  inputs=[
1704
- sketcher_input, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
1705
- original_size, input_size, text_refiner
1706
  ],
1707
- outputs=[chatbot, state, sketcher_input],
1708
  show_progress=False, queue=True
1709
  )
1710
 
 
25
  import re
26
  import edge_tts
27
  import asyncio
28
+ import cv2
29
  # import tts
30
 
31
  ###############################################################################
 
33
  ###############################################################################
34
 
35
 
36
+ # import spaces #
37
 
38
+ import os
39
  import imageio
40
  import numpy as np
41
  import torch
42
  import rembg
43
+ from PIL import Image
44
  from torchvision.transforms import v2
45
  from pytorch_lightning import seed_everything
46
  from omegaconf import OmegaConf
 
285
  ############# above part is for 3D generate #############
286
  ###############################################################################
287
 
288
+
289
  ###############################################################################
290
  ############# this part is for text to image #############
291
  ###############################################################################
 
407
  'Cantonese': 'zh-HK-HiuGaaiNeural'
408
  }
409
 
410
+ focus_map = {
411
+ "CFV-D":0,
412
+ "CFV-DA":1,
413
+ "CFV-DAI":2,
414
+ "PFV-DDA":3
415
+ }
416
+
417
+ '''
418
+ prompt_list = [
419
+ 'Wiki_caption: {Wiki_caption}, you have to generate a caption according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
420
+ 'Wiki_caption: {Wiki_caption}, you have to select sentences from wiki caption that describe the surrounding objects that may be associated with the picture object. Around {length} words of {sentiment} sentiment in {language}.',
421
+ 'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.',
422
+ 'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.'
423
+ ]
424
+
425
+ prompt_list = [
426
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the object but does not include analysis)as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
427
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
428
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
429
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects that may be related to the selected object and list one fact of selected object, one fact of related object and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.'
430
+ ]
431
+ '''
432
+ prompt_list = [
433
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis)as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
434
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
435
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
436
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects that may be related to the selected object and list one fact of selected object, one fact of related object and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
437
+ ]
438
+
439
+
440
  gpt_state = 0
441
  VOICE = "en-GB-SoniaNeural"
442
  article = """
 
482
  mask[..., -1] = 255
483
  mask = self.postprocess(mask)
484
  x['mask'] = mask
485
+
486
  return super().preprocess(x)
487
 
488
 
 
532
 
533
  global gpt_state
534
  gpt_state=1
535
+ # return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
536
+ return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*2
537
  else:
538
  gpt_state=0
539
+ # return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
540
+ return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*2
541
 
542
  def init_wo_openai_api_key():
543
  global gpt_state
544
  gpt_state=0
545
+ # return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3
546
+ return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*2
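Note: the trailing visibility updates in both returns drop from *3 to *2 because the separate language_select and autoplay rows are replaced later in this commit by the single top_row container, so one fewer component needs toggling. A quick illustrative tally (not part of the commit) confirming that the 14 return values still line up with the 14-component outputs= lists wired in create_ui further down:

    # Illustrative count only: values returned by the new init_openai_api_key.
    visibility_updates = 1 + 1 + 3 + 1 + 3   # gr.update(...) entries before the states
    states = 3                               # text_refiner, visual_chatgpt, notification text
    trailing_updates = 2                     # d3_model, top_row (was 3 before this commit)
    print(visibility_updates + states + trailing_updates)  # -> 14, matching outputs=[...]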
547
 
548
  def get_click_prompt(chat_input, click_state, click_mode):
549
  inputs = json.loads(chat_input)
 
700
  point_prompt = f'You should primarily use tools on the selected regional image (description: {text}, path: {new_crop_save_path}), which is a part of the whole image (path: {visual_chatgpt.current_image}). If human mentioned some objects not in the selected region, you can use tools on the whole image.'
701
  visual_chatgpt.point_prompt = point_prompt
702
 
703
+
 
704
  print("new crop save",new_crop_save_path)
705
 
706
+ yield state, state, click_state, image_input_nobackground, image_input_withbackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
707
 
708
 
709
 
710
 
711
 
712
+ async def submit_caption(state, text_refiner, length, sentiment, factuality, language,
713
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
714
  autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path):
715
  print("state",state)
 
724
  print("click_index",click_index)
725
  print("input_points_state",input_points_state)
726
  print("input_labels_state",input_labels_state)
727
+
728
+ prompt=generate_prompt(focus_type,paragraph,length,sentiment,factuality,language)
729
 
730
+ print("Prompt:", prompt)
731
+ print("click",click_index)
 
732
 
733
+ # image_input = create_bubble_frame(np.array(image_input), generated_caption, click_index, input_mask,
734
+ # input_points=input_points, input_labels=input_labels)
735
+
736
+
737
+ if not args.disable_gpt and text_refiner:
738
+ print("new crop save",new_crop_save_path)
739
+ focus_info=get_image_gpt(openai_api_key,new_crop_save_path,prompt)
740
+ if focus_info.startswith('"') and focus_info.endswith('"'):
741
+ focus_info=focus_info[1:-1]
742
+ focus_info=focus_info.replace('#', '')
743
+ # state = state + [(None, f"Wiki: {paragraph}")]
744
+ state = state + [(None, f"{focus_info}")]
745
+ print("new_cap",focus_info)
746
+ read_info = re.sub(r'[#[\]!*]','',focus_info)
747
+ read_info = emoji.replace_emoji(read_info,replace="")
748
+ print("read info",read_info)
749
+
750
+ # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
751
+ # input_points=input_points, input_labels=input_labels)
752
+ try:
753
+ audio_output = await texttospeech(read_info, language,autoplay)
754
+ # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
755
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
756
+
757
+ except Exception as e:
758
+ state = state + [(None, f"Error during TTS prediction: {str(e)}")]
759
+ print(f"Error during TTS prediction: {str(e)}")
760
+ # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
761
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
762
 
763
+ else:
764
+ try:
765
+ audio_output = await texttospeech(focus_info, language, autoplay)
766
+ # waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
767
+ # return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
768
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
769
+
770
+ except Exception as e:
771
+ state = state + [(None, f"Error during TTS prediction: {str(e)}")]
772
+ print(f"Error during TTS prediction: {str(e)}")
773
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
774
+
775
+ def generate_prompt(focus_type, paragraph,length, sentiment, factuality, language):
776
+
777
  mapped_value = focus_map.get(focus_type, -1)
 
778
 
779
  controls = {
780
  'length': length,
 
782
  'factuality': factuality,
783
  'language': language
784
  }
785
+
786
  if mapped_value != -1:
787
+ prompt = prompt_list[mapped_value].format(
 
788
  Wiki_caption=paragraph,
789
  length=controls['length'],
790
  sentiment=controls['sentiment'],
791
  language=controls['language']
792
  )
 
793
  else:
 
794
  prompt = "Invalid focus type."
795
 
796
  if controls['factuality'] == "Imagination":
797
+ prompt += " Assuming that I am someone who has viewed a lot of art and has a lot of experience viewing art. Explain artistic features (composition, color, style, or use of light) and discuss the symbolism of the content and its influence on later artistic movements."
798
 
799
+ return prompt
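For reference, a minimal self-contained sketch of what the new generate_prompt helper does with the module-level focus_map and prompt_list added near the top of the file (template text abbreviated here; input values are illustrative only):

    # Abbreviated stand-ins for the real module-level focus_map and prompt_list.
    focus_map = {"CFV-D": 0, "CFV-DA": 1, "CFV-DAI": 2, "PFV-DDA": 3}
    prompt_list = [
        "Wiki_caption: {Wiki_caption}, list one fact ... in {language}, about {length} words.",
        "Wiki_caption: {Wiki_caption}, list one fact and one analysis ...",
        "Wiki_caption: {Wiki_caption}, list one fact, one analysis and one interpretation ...",
        "Wiki_caption: {Wiki_caption}, also cover related objects ...",
    ]

    focus_type, paragraph, factuality = "CFV-DA", "A short wiki paragraph about the artwork.", "Imagination"
    idx = focus_map.get(focus_type, -1)
    if idx != -1:
        # str.format silently ignores keywords a given template does not use.
        prompt = prompt_list[idx].format(Wiki_caption=paragraph, length=80,
                                         sentiment="Positive", language="English")
    else:
        prompt = "Invalid focus type."
    if factuality == "Imagination":
        prompt += " Assuming that I am someone who has viewed a lot of art..."
    print(prompt)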
800
 
801
 
802
  def encode_image(image_path):
 
874
 
875
  return prompt
876
 
877
+ submit_traj=0
878
 
879
+ async def inference_traject(origin_image,sketcher_image, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
880
+ original_size, input_size, text_refiner,focus_type,paragraph,openai_api_key,autoplay,trace_type):
881
  image_input, mask = sketcher_image['image'], sketcher_image['mask']
882
+
883
+ crop_save_path=""
884
+
885
  prompt = get_sketch_prompt(mask)
886
  boxes = prompt['input_boxes']
887
  boxes = boxes[0]
888
+ global submit_traj
889
+ submit_traj=1
890
 
891
  controls = {'length': length,
892
  'sentiment': sentiment,
 
906
  model.setup(image_embedding, original_size, input_size, is_image_set=True)
907
 
908
  enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
909
+ out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki,verbose=True)[0]
910
 
911
+ print(trace_type)
912
+
913
+ if trace_type=="Trace+Seg":
914
+ input_mask = np.array(out['mask'].convert('P'))
915
+ image_input = mask_painter(np.array(image_input), input_mask, background_alpha=0 )
916
+ crop_save_path=out['crop_save_path']
917
+
918
+ else:
919
+ image_input = Image.fromarray(np.array(origin_image))
920
+ draw = ImageDraw.Draw(image_input)
921
+ draw.rectangle(boxes, outline='red', width=2)
922
+ cropped_image = origin_image.crop(boxes)
923
+ cropped_image.save('temp.png')
924
+ crop_save_path='temp.png'
925
+
926
+ print("crop_save_path",out['crop_save_path'])
927
+
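The Trace branch above outlines the traced box on a copy of the original image and crops that region to temp.png for GPT captioning. The same steps in isolation, as a runnable sketch with placeholder image and box values:

    from PIL import Image, ImageDraw

    # Placeholder inputs standing in for origin_image and the box from get_sketch_prompt.
    origin_image = Image.new("RGB", (640, 480), "white")
    boxes = (100, 80, 300, 260)          # (x0, y0, x1, y1)

    annotated = origin_image.copy()      # keep the original untouched
    ImageDraw.Draw(annotated).rectangle(boxes, outline="red", width=2)

    cropped = origin_image.crop(boxes)
    cropped.save("temp.png")             # crop_save_path used by the Trace branch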
928
  # Update components and states
929
  state.append((f'Box: {boxes}', None))
930
+
931
  # fake_click_index = (int((boxes[0][0] + boxes[0][2]) / 2), int((boxes[0][1] + boxes[0][3]) / 2))
932
  # image_input = create_bubble_frame(image_input, "", fake_click_index, input_mask)
933
 
934
+ prompt=generate_prompt(focus_type, paragraph, length, sentiment, factuality, language)
935
+ width, height = sketcher_image['image'].size
936
+ sketcher_image['mask'] = np.zeros((height, width, 4), dtype=np.uint8)
937
+ sketcher_image['mask'][..., -1] = 255
938
+ sketcher_image['image']=image_input
939
+
940
+
941
+ if not args.disable_gpt and text_refiner:
942
+ focus_info=get_image_gpt(openai_api_key,crop_save_path,prompt)
943
+ if focus_info.startswith('"') and focus_info.endswith('"'):
944
+ focus_info=focus_info[1:-1]
945
+ focus_info=focus_info.replace('#', '')
946
+ state = state + [(None, f"{focus_info}")]
947
+ print("new_cap",focus_info)
948
+ read_info = re.sub(r'[#[\]!*]','',focus_info)
949
+ read_info = emoji.replace_emoji(read_info,replace="")
950
+ print("read info",read_info)
951
+
952
+ # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
953
+ # input_points=input_points, input_labels=input_labels)
954
+ try:
955
+ audio_output = await texttospeech(read_info, language,autoplay)
956
+ # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
957
+ return state, state,image_input,audio_output
958
+
959
+
960
+ except Exception as e:
961
+ state = state + [(None, f"Error during TTS prediction: {str(e)}")]
962
+ print(f"Error during TTS prediction: {str(e)}")
963
+ # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
964
+ return state, state, image_input,audio_output
965
+
966
+
967
+ else:
968
+ try:
969
+ audio_output = await texttospeech(focus_info, language, autoplay)
970
+ # waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
971
+ # return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
972
+ return state, state, image_input,audio_output
973
 
 
 
 
974
 
975
+ except Exception as e:
976
+ state = state + [(None, f"Error during TTS prediction: {str(e)}")]
977
+ print(f"Error during TTS prediction: {str(e)}")
978
+ return state, state, image_input,audio_output
 
979
 
 
980
 
981
  def clear_chat_memory(visual_chatgpt, keep_global=False):
982
  if visual_chatgpt is not None:
 
1046
  #image_sketcher [data-testid="image"], #image_sketcher [data-testid="image"] > div{min-height: 500px}
1047
  #image_upload{min-height:500px}
1048
  #image_upload [data-testid="image"], #image_upload [data-testid="image"] > div{min-height: 500px}
1049
+ .custom-language {
1050
+ width: 20%;
1051
+ }
1052
+
1053
+ .custom-autoplay {
1054
+ width: 40%;
1055
+ }
1056
+
1057
+ .custom-output {
1058
+ width: 30%;
1059
+ }
1060
+
1061
  '''
1062
  elif current_version <= version.parse('3.27'):
1063
  style = '''
1064
  #image_sketcher{min-height:500px}
1065
  #image_upload{min-height:500px}
1066
+ .custom-language {
1067
+ width: 20%;
1068
+ }
1069
+
1070
+ .custom-autoplay {
1071
+ width: 40%;
1072
+ }
1073
+
1074
+ .custom-output {
1075
+ width: 30%;
1076
+ }
1077
  '''
1078
  else:
1079
  style = None
1080
 
1081
  return style
1082
 
1083
+ # def handle_like_dislike(like_data, like_state, dislike_state):
1084
+ # if like_data.liked:
1085
+ # if like_data.index not in like_state:
1086
+ # like_state.append(like_data.index)
1087
+ # message = f"Liked: {like_data.value} at index {like_data.index}"
1088
+ # else:
1089
+ # message = "You already liked this item"
1090
+ # else:
1091
+ # if like_data.index not in dislike_state:
1092
+ # dislike_state.append(like_data.index)
1093
+ # message = f"Disliked: {like_data.value} at index {like_data.index}"
1094
+ # else:
1095
+ # message = "You already disliked this item"
1096
 
1097
+ # return like_state, dislike_state
1098
 
1099
  async def texttospeech(text,language,autoplay):
1100
  voice=filtered_language_dict[language]
 
1109
  if autoplay:
1110
  audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls autoplay {audio_style}></audio>'
1111
  else:
1112
+ audio_player=None
1113
+ # audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls {audio_style}></audio>'
1114
  return audio_player
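The base64 audio string embedded above is produced earlier in texttospeech (outside this hunk). A minimal sketch of how edge_tts can yield such a string, assuming the same voice names used in filtered_language_dict; the file name and helper are illustrative, not from this commit:

    import asyncio
    import base64

    import edge_tts

    async def synthesize_b64(text: str, voice: str = "en-GB-SoniaNeural") -> str:
        # Synthesize speech to a local file, then base64-encode it so it can be
        # embedded in an <audio src="data:audio/...;base64,..."> player.
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save("tts_output.mp3")
        with open("tts_output.mp3", "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

    audio = asyncio.run(synthesize_b64("Hello from the EyeSee demo."))
    print(audio[:60], "...")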
1115
 
1116
+
1117
  def create_ui():
1118
  title = """<p><h1 align="center">EyeSee Anything in Art</h1></p>
1119
  """
 
1144
  visual_chatgpt = gr.State(None)
1145
  original_size = gr.State(None)
1146
  input_size = gr.State(None)
 
1147
  paragraph = gr.State("")
1148
  aux_state = gr.State([])
1149
  click_index_state = gr.State((0, 0))
 
1152
  input_labels_state = gr.State([])
1153
  new_crop_save_path = gr.State(None)
1154
  image_input_nobackground = gr.State(None)
 
 
 
 
1155
 
1156
  gr.Markdown(title)
1157
  gr.Markdown(description)
1158
+ with gr.Row(align="right", visible=False, elem_id="top_row") as top_row:
1159
+ language = gr.Dropdown(
1160
+ ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
1161
+ value="English", label="Language", interactive=True, scale=0.2, elem_classes="custom-language"
1162
+ )
1163
+ auto_play = gr.Checkbox(
1164
+ label="Check to autoplay audio", value=False, scale=0.4, elem_classes="custom-autoplay"
1165
+ )
1166
+ output_audio = gr.HTML(
1167
+ label="Synthesised Audio", scale=0.3, elem_classes="custom-output"
1168
+ )
1169
+
1170
 
1171
+ # with gr.Row(align="right",visible=False) as language_select:
1172
+ # language = gr.Dropdown(
1173
+ # ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
1174
+ # value="English", label="Language", interactive=True)
1175
+
1176
+ # with gr.Row(align="right",visible=False) as autoplay:
1177
+ # auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
1178
+ # output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
1179
+
1180
+ with gr.Row():
1181
+
1182
  with gr.Column(scale=1.0):
1183
  with gr.Column(visible=False) as modules_not_need_gpt:
1184
  with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
 
1224
  clear_button_image = gr.Button(value="Clear Image", interactive=True)
1225
 
1226
  with gr.Tab("Trajectory (beta)") as traj_tab:
1227
+ sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=10,
1228
  elem_id="image_sketcher")
1229
+ example_image = gr.Image(type="pil", interactive=False, visible=False)
1230
+ with gr.Row():
1231
  submit_button_sketcher = gr.Button(value="Submit", interactive=True)
1232
+ clear_button_sketcher = gr.Button(value="Clear Sketch", interactive=True)
1233
  with gr.Row(scale=1.0):
1234
  with gr.Row(scale=0.8):
1235
  focus_type_sketch = gr.Radio(
 
1240
  Input_sketch = gr.Radio(
1241
  choices=["Trace+Seg", "Trace"],
1242
  value="Trace+Seg",
1243
+ label="Trace Type",
1244
  interactive=True)
1245
 
1246
  with gr.Column(visible=False) as modules_need_gpt1:
 
1272
  value="No",
1273
  label="Expert",
1274
  interactive=True)
 
1275
  with gr.Column(visible=True) as modules_not_need_gpt3:
1276
  gr.Examples(
1277
+ examples=examples,
1278
+ inputs=[example_image],
1279
+ )
1280
+
1281
 
1282
 
1283
 
1284
 
1285
+ with gr.Column(scale=0.5):
1286
  with gr.Column(visible=True) as module_key_input:
1287
  openai_api_key = gr.Textbox(
1288
  placeholder="Input openAI API key",
 
1303
  cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)
1304
 
1305
  with gr.Column(visible=False) as modules_not_need_gpt2:
1306
+ with gr.Blocks():
1307
  chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True).style(height=600, scale=0.5)
1308
  with gr.Column(visible=False) as modules_need_gpt3:
1309
  chat_input = gr.Textbox(show_label=False, placeholder="Enter text and press Enter").style(
 
1311
  with gr.Row():
1312
  clear_button_text = gr.Button(value="Clear Text", interactive=True)
1313
  submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
1314
+ upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
1315
+ downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
1316
+
1317
  with gr.Row():
1318
  export_button = gr.Button(value="Export Chat Log", interactive=True, variant="primary")
1319
  with gr.Row():
 
1484
  # this part is for 3d generate.
1485
  ###############################################################################
1486
 
1487
+ with gr.Row(variant="panel",visible=False) as d3_model:
1488
  with gr.Column():
1489
  with gr.Row():
1490
  input_image = gr.Image(
 
1592
 
1593
  def clear_tts_fields():
1594
  return [gr.update(value=""), gr.update(value=""), None, None, gr.update(value=False), gr.update(value=True), None, None]
1595
+
1596
  # submit_tts.click(
1597
  # tts.predict,
1598
  # inputs=[input_text, input_language, input_audio, input_mic, use_mic, agree],
 
1607
  queue=False
1608
  )
1609
 
1610
+
1611
+
1612
+
1613
  clear_button_sketcher.click(
1614
  lambda x: (x),
1615
  [origin_image],
 
1618
  show_progress=False
1619
  )
1620
 
1621
+
1622
+
1623
+
1624
 
1625
  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
1626
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
1627
+ modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
1628
  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
1629
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
1630
  modules_not_need_gpt,
1631
+ modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
1632
  disable_chatGPT_button.click(init_wo_openai_api_key,
1633
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
1634
  modules_not_need_gpt,
1635
+ modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
1636
 
1637
  enable_chatGPT_button.click(
1638
  lambda: (None, [], [], [[], [], []], "", "", ""),
 
1746
  image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
1747
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
1748
  ],
1749
+ outputs=[chatbot, state, click_state, image_input, input_image, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground],
1750
  show_progress=False, queue=True
1751
  )
1752
 
 
1754
  submit_button_click.click(
1755
  submit_caption,
1756
  inputs=[
1757
+ state, text_refiner,length, sentiment, factuality, language,
1758
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
1759
  auto_play,paragraph,focus_type,openai_api_key,new_crop_save_path
1760
  ],
 
1770
  submit_button_sketcher.click(
1771
  inference_traject,
1772
  inputs=[
1773
+ origin_image,sketcher_input, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
1774
+ original_size, input_size, text_refiner,focus_type_sketch,paragraph,openai_api_key,auto_play,Input_sketch
1775
  ],
1776
+ outputs=[chatbot, state, sketcher_input,output_audio],
1777
  show_progress=False, queue=True
1778
  )
1779