Niki Zhang committed on
Commit d1292a4 · verified · 1 Parent(s): e73c6e2

Update app.py

Files changed (1)
  1. app.py +187 -120
app.py CHANGED
@@ -1,3 +1,4 @@
  from math import inf
  import os
  import base64
@@ -9,6 +10,7 @@ import requests
  from packaging import version
  from PIL import Image, ImageDraw
  import functools
  from langchain.llms.openai import OpenAI
  from caption_anything.model import CaptionAnything
  from caption_anything.utils.image_editing_utils import create_bubble_frame
@@ -20,7 +22,10 @@ from caption_anything.segmenter import build_segmenter
  from caption_anything.utils.chatbot import ConversationBot, build_chatbot_tools, get_new_image_name
  from segment_anything import sam_model_registry
  import easyocr
- import tts

  ###############################################################################
  ############# this part is for 3D generate #############
@@ -279,9 +284,25 @@ def make3d(images):
  ############# above part is for 3D generate #############
  ###############################################################################


  gpt_state = 0
- pre_click_index=(inf, inf)
  article = """
  <div style='margin:20px auto;'>
  <p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
@@ -374,15 +395,15 @@ def init_openai_api_key(api_key=""):

  global gpt_state
  gpt_state=1
- return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=True)]+ [gr.update(visible=False)]*2 + [text_refiner, visual_chatgpt, None]
  else:
  gpt_state=0
- return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']

  def init_wo_openai_api_key():
  global gpt_state
  gpt_state=0
- return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]

  def get_click_prompt(chat_input, click_state, click_mode):
  inputs = json.loads(chat_input)
@@ -467,7 +488,12 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
  # artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
  paragraph = get_image_gpt(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")

- state = [(None, 'Received new image, resize it to width {} and height {}: '.format(image_input.size[0], image_input.size[1]))]

  return state, state, image_input, click_state, image_input, image_input, image_input, image_embedding, \
  original_size, input_size, f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Material: {material}",f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Material: {material}",paragraph
@@ -539,12 +565,11 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language



- def submit_caption(image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
- input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key,new_crop_save_path):
  print("state",state)
-
- global pre_click_index
  click_index = click_index_state

  # if pre_click_index==click_index:
@@ -553,7 +578,6 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
  # else:
  # pre_click_index = click_index
  print("click_index",click_index)
- print("pre_click_index",pre_click_index)
  print("input_points_state",input_points_state)
  print("input_labels_state",input_labels_state)

@@ -630,29 +654,34 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
  focus_info=get_image_gpt(openai_api_key,new_crop_save_path,prompt)
  if focus_info.startswith('"') and focus_info.endswith('"'):
  focus_info=focus_info[1:-1]
-
  # state = state + [(None, f"Wiki: {paragraph}")]
  state = state + [(None, f"{focus_info}")]
  print("new_cap",focus_info)

  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
  # input_points=input_points, input_labels=input_labels)
  try:
- waveform_visual, audio_output = tts.predict(focus_info, input_language, input_audio, input_mic, use_mic, agree)
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output

  except Exception as e:
  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
  print(f"Error during TTS prediction: {str(e)}")
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output

  else:
  try:
- waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
  # return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output

  except Exception as e:
  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
@@ -834,7 +863,8 @@ def cap_everything(image_input, visual_chatgpt, text_refiner,input_language, inp
  AI_prompt = "Received."
  visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
  visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
- waveform_visual, audio_output=tts.predict(paragraph, input_language, input_audio, input_mic, use_mic, agree)
  return paragraph,waveform_visual, audio_output

  def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragraph):
@@ -877,6 +907,37 @@ def get_style():

  return style


  def create_ui():
  title = """<p><h1 align="center">EyeSee Anything in Art</h1></p>
@@ -884,17 +945,20 @@ def create_ui():
  description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """

  examples = [
- ["test_images/img36.webp"],
- ["test_images/MUS.png"],
- ["test_images/图片2.png"],
- ["test_images/img5.jpg"],
- ["test_images/img14.jpg"],
- ["test_images/qingming3.jpeg"],

  ]

  with gr.Blocks(
- css=get_style()
  ) as iface:
  state = gr.State([])
  out_state = gr.State(None)
@@ -914,6 +978,8 @@ def create_ui():
  input_labels_state = gr.State([])
  new_crop_save_path = gr.State(None)
  image_input_nobackground = gr.State(None)


@@ -924,19 +990,15 @@ def create_ui():
  with gr.Column(scale=1.0):
  with gr.Column(visible=False) as modules_not_need_gpt:
  with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
- image_intro=gr.HTML()
  image_input_base = gr.Image(type="pil", interactive=True, elem_id="image_upload")
  example_image = gr.Image(type="pil", interactive=False, visible=False)
  with gr.Row():
  name_label_base = gr.Button(value="Name: ")
  artist_label_base = gr.Button(value="Artist: ")
  year_label_base = gr.Button(value="Year: ")
- material_label_base = gr.Button(value="Material: ")

-
-
  with gr.Tab("Click") as click_tab:
- image_intro_click=gr.HTML()
  image_input = gr.Image(type="pil", interactive=True, elem_id="image_upload")
  example_image = gr.Image(type="pil", interactive=False, visible=False)
  with gr.Row():
@@ -945,11 +1007,14 @@ def create_ui():
  year_label = gr.Button(value="Year: ")
  material_label = gr.Button(value="Material: ")
  with gr.Row(scale=1.0):
- focus_type = gr.Radio(
- choices=["CFV-D", "CFV-DA", "CFV-DAI","PFV-DDA"],
- value="CFV-D",
- label="Focus Type",
- interactive=True)
  with gr.Row(scale=1.0):
  with gr.Row(scale=0.4):
  point_prompt = gr.Radio(
@@ -965,53 +1030,62 @@ def create_ui():
  with gr.Row(scale=0.4):
  clear_button_click = gr.Button(value="Clear Clicks", interactive=True)
  clear_button_image = gr.Button(value="Clear Image", interactive=True)
- submit_button_click=gr.Button(value="Submit", interactive=True)
- with gr.Tab("Trajectory (beta)"):
  sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=20,
  elem_id="image_sketcher")
  with gr.Row():
  submit_button_sketcher = gr.Button(value="Submit", interactive=True)

- with gr.Column(visible=False) as modules_need_gpt1:
- with gr.Row(scale=1.0):
- language = gr.Dropdown(
- ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
- value="English", label="Language", interactive=True)
- sentiment = gr.Radio(
- choices=["Positive", "Natural", "Negative"],
- value="Natural",
- label="Sentiment",
- interactive=True,
- )
  with gr.Row(scale=1.0):
- factuality = gr.Radio(
- choices=["Factual", "Imagination"],
- value="Factual",
- label="Factuality",
- interactive=True,
- )
- length = gr.Slider(
- minimum=10,
- maximum=80,
- value=10,
- step=1,
- interactive=True,
- label="Generated Caption Length",
- )
- # Whether to merge wiki content into the caption
- enable_wiki = gr.Radio(
- choices=["Yes", "No"],
- value="No",
- label="Enable Wiki",
- interactive=True)
-
- # with gr.Column(visible=True) as modules_not_need_gpt3:
- gr.Examples(
  examples=examples,
  inputs=[example_image],
  )

  with gr.Column(scale=0.5):
  with gr.Column(visible=True) as module_key_input:
  openai_api_key = gr.Textbox(
  placeholder="Input openAI API key",
@@ -1027,39 +1101,39 @@ def create_ui():
  notification_box = gr.Textbox(lines=1, label="Notification", max_lines=5, show_label=False)

  with gr.Column():
- with gr.Column(visible=False) as modules_need_gpt2:
  paragraph_output = gr.Textbox(lines=7, label="Describe Everything", max_lines=7)
- with gr.Column(visible=False) as modules_need_gpt0:
  cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)

  with gr.Column(visible=False) as modules_not_need_gpt2:
- chatbot = gr.Chatbot(label="Chatbox", ).style(height=550, scale=0.5)
- with gr.Column(visible=False) as modules_need_gpt3:
- chat_input = gr.Textbox(show_label=False, placeholder="Enter text and press Enter").style(
- container=False)
- with gr.Row():
- clear_button_text = gr.Button(value="Clear Text", interactive=True)
- submit_button_text = gr.Button(value="Submit", interactive=True, variant="primary")
- with gr.Row():
- export_button = gr.Button(value="Export Chat Log", interactive=True, variant="primary")
- with gr.Row():
- chat_log_file = gr.File(label="Download Chat Log")

- with gr.Column(scale=0.5):
  # TTS interface hidden initially
- with gr.Column(visible=False) as tts_interface:
- input_text = gr.Textbox(label="Text Prompt", value="Hello, World !, here is an example of light voice cloning. Try to upload your best audio samples quality")
- input_language = gr.Dropdown(label="Language", choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"], value="en")
- input_audio = gr.Audio(label="Reference Audio", type="filepath", value="examples/female.wav")
- input_mic = gr.Audio(source="microphone", type="filepath", label="Use Microphone for Reference")
- use_mic = gr.Checkbox(label="Check to use Microphone as Reference", value=False)
- agree = gr.Checkbox(label="Agree", value=True)
- output_waveform = gr.Video(label="Waveform Visual")
- output_audio = gr.HTML(label="Synthesised Audio")

- with gr.Row():
- submit_tts = gr.Button(value="Submit", interactive=True)
- clear_tts = gr.Button(value="Clear", interactive=True)

@@ -1154,6 +1228,8 @@ def create_ui():


  mv_images = gr.State()

  submit.click(fn=check_input_image, inputs=[new_crop_save_path], outputs=[processed_image]).success(
  fn=generate_mvs,
@@ -1174,12 +1250,12 @@ def create_ui():
  def clear_tts_fields():
  return [gr.update(value=""), gr.update(value=""), None, None, gr.update(value=False), gr.update(value=True), None, None]

- submit_tts.click(
- tts.predict,
- inputs=[input_text, input_language, input_audio, input_mic, use_mic, agree],
- outputs=[output_waveform, output_audio],
- queue=True
- )

  clear_tts.click(
  clear_tts_fields,
@@ -1191,15 +1267,15 @@ def create_ui():

  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
- modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box])
  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
  modules_not_need_gpt,
- modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box])
  disable_chatGPT_button.click(init_wo_openai_api_key,
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
  modules_not_need_gpt,
- modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box])

  enable_chatGPT_button.click(
  lambda: (None, [], [], [[], [], []], "", "", ""),
@@ -1299,6 +1375,7 @@ def create_ui():
  return [gr.update(visible=False)]*4


  click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
  base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])

@@ -1322,26 +1399,16 @@ def create_ui():
  inputs=[
  image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
- input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key,new_crop_save_path
  ],
  outputs=[
  chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,
- output_waveform, output_audio
  ],
  show_progress=True,
  queue=True
  )
-
-
- focus_type.change(
- lambda x: ([[], [], []], x),
- [image_input_nobackground],
- [click_state, image_input],
- queue=False,
- show_progress=False
- )
-
-

  submit_button_sketcher.click(
  inference_traject,
@@ -1370,4 +1437,4 @@ def create_ui():
  if __name__ == '__main__':
  iface = create_ui()
  iface.queue(concurrency_count=5, api_open=False, max_size=10)
- iface.launch(server_name="0.0.0.0", enable_queue=True)

+ from io import BytesIO
  from math import inf
  import os
  import base64

  from packaging import version
  from PIL import Image, ImageDraw
  import functools
+ import emoji
  from langchain.llms.openai import OpenAI
  from caption_anything.model import CaptionAnything
  from caption_anything.utils.image_editing_utils import create_bubble_frame

  from caption_anything.utils.chatbot import ConversationBot, build_chatbot_tools, get_new_image_name
  from segment_anything import sam_model_registry
  import easyocr
+ import re
+ import edge_tts
+ import asyncio
+ # import tts

  ###############################################################################
  ############# this part is for 3D generate #############
  ############# above part is for 3D generate #############
  ###############################################################################

+ css = """
+ #warning {background-color: #FFCCCB}
+ .chatbot {
+ padding: 0 !important;
+ margin: 0 !important;
+ }
+ """
+ filtered_language_dict = {
+ 'English': 'en-US-JennyNeural',
+ 'Chinese': 'zh-CN-XiaoxiaoNeural',
+ 'French': 'fr-FR-DeniseNeural',
+ 'Spanish': 'es-MX-DaliaNeural',
+ 'Arabic': 'ar-SA-ZariyahNeural',
+ 'Portuguese': 'pt-BR-FranciscaNeural',
+ 'Cantonese': 'zh-HK-HiuGaaiNeural'
+ }

  gpt_state = 0
+ VOICE = "en-GB-SoniaNeural"
  article = """
  <div style='margin:20px auto;'>
  <p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
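A minimal sketch (not part of this commit) for checking that each voice short name configured in filtered_language_dict above actually exists in the edge-tts catalogue; it assumes the edge-tts package is installed and that filtered_language_dict can be imported from app.py:

import asyncio
import edge_tts
from app import filtered_language_dict  # hypothetical import of the dict added above

async def check_voices():
    # list_voices() returns the catalogue of available neural voices.
    available = {v["ShortName"] for v in await edge_tts.list_voices()}
    for language, voice in filtered_language_dict.items():
        status = "ok" if voice in available else "missing"
        print(f"{language}: {voice} -> {status}")

asyncio.run(check_voices())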
 

  global gpt_state
  gpt_state=1
+ return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*2
  else:
  gpt_state=0
+ return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*2

  def init_wo_openai_api_key():
  global gpt_state
  gpt_state=0
+ return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*2

  def get_click_prompt(chat_input, click_state, click_mode):
  inputs = json.loads(chat_input)

  # artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
  paragraph = get_image_gpt(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")

+ state = [
+ (
+ None,
+ f"🤖 Hi, I am EyeSee. Let's explore this painting {name} together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant information."
+ )
+ ]

  return state, state, image_input, click_state, image_input, image_input, image_input, image_embedding, \
  original_size, input_size, f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Material: {material}",f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Material: {material}",paragraph
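The long return lists in init_openai_api_key above are matched positionally against the outputs= list of whichever event wires them up; gr.update(visible=...) toggles a component without replacing its value. A minimal analogue with hypothetical component names, not the commit's actual components:

import gradio as gr

def toggle(show_first):
    # One gr.update per output component, in the same order as outputs=[...].
    return [gr.update(visible=show_first), gr.update(visible=not show_first)]

with gr.Blocks() as demo:
    show_first = gr.Checkbox(label="Show panel A", value=True)
    panel_a = gr.Textbox(label="A")
    panel_b = gr.Textbox(label="B", visible=False)
    show_first.change(toggle, inputs=[show_first], outputs=[panel_a, panel_b])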
 


+ async def submit_caption(image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
+ autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path):
  print("state",state)
+
  click_index = click_index_state

  # if pre_click_index==click_index:

  # else:
  # pre_click_index = click_index
  print("click_index",click_index)
  print("input_points_state",input_points_state)
  print("input_labels_state",input_labels_state)

  focus_info=get_image_gpt(openai_api_key,new_crop_save_path,prompt)
  if focus_info.startswith('"') and focus_info.endswith('"'):
  focus_info=focus_info[1:-1]
+ focus_info=focus_info.replace('#', '')
  # state = state + [(None, f"Wiki: {paragraph}")]
  state = state + [(None, f"{focus_info}")]
  print("new_cap",focus_info)
+ read_info = re.sub(r'[#[\]!*]','',focus_info)
+ read_info = emoji.replace_emoji(read_info,replace="")
+ print("read info",read_info)

  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
  # input_points=input_points, input_labels=input_labels)
  try:
+ audio_output = await texttospeech(read_info, language,autoplay)
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output

  except Exception as e:
  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
  print(f"Error during TTS prediction: {str(e)}")
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output

  else:
  try:
+ audio_output = await texttospeech(focus_info, language, autoplay)
+ # waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
+ waveform_visual, audio_output=None,None
  # return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output

  except Exception as e:
  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
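The read_info lines above strip markdown punctuation and emoji from the caption before it is passed to TTS. A small sketch of the same cleanup on a made-up string, assuming the emoji package (2.x, which provides replace_emoji) is installed:

import re
import emoji

focus_info = "**Great** painting! 🤖 [detail #3]"          # made-up example text
read_info = re.sub(r'[#[\]!*]', '', focus_info)            # drop #, [, ], !, *
read_info = emoji.replace_emoji(read_info, replace="")     # drop emoji
print(read_info)  # "Great painting  detail 3" (whitespace aside)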
 
  AI_prompt = "Received."
  visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
  visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
+ # waveform_visual, audio_output=tts.predict(paragraph, input_language, input_audio, input_mic, use_mic, agree)
+ waveform_visual, audio_output=None,None
  return paragraph,waveform_visual, audio_output

  def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragraph):

  return style

+ def handle_like_dislike(like_data, like_state, dislike_state):
+ if like_data.liked:
+ if like_data.index not in like_state:
+ like_state.append(like_data.index)
+ message = f"Liked: {like_data.value} at index {like_data.index}"
+ else:
+ message = "You already liked this item"
+ else:
+ if like_data.index not in dislike_state:
+ dislike_state.append(like_data.index)
+ message = f"Disliked: {like_data.value} at index {like_data.index}"
+ else:
+ message = "You already disliked this item"
+
+ return like_state, dislike_state
+
+ async def texttospeech(text,language,autoplay):
+ voice=filtered_language_dict[language]
+ communicate = edge_tts.Communicate(text, voice)
+ file_path="output.wav"
+ await communicate.save(file_path)
+ with open(file_path, "rb") as audio_file:
+ audio_bytes = BytesIO(audio_file.read())
+ audio = base64.b64encode(audio_bytes.read()).decode("utf-8")
+ print("tts....")
+ audio_style = 'style="width:250px;"'
+ if autoplay:
+ audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls autoplay {audio_style}></audio>'
+ else:
+ audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls {audio_style}></audio>'
+ return audio_player

  def create_ui():
  title = """<p><h1 align="center">EyeSee Anything in Art</h1></p>
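A minimal sketch (not part of this commit) of driving the new edge_tts-based texttospeech coroutine outside Gradio; it assumes texttospeech and its filtered_language_dict lookup can be imported from app.py and that edge-tts can reach its service:

import asyncio
from app import texttospeech  # hypothetical import of the coroutine added above

async def demo():
    # Returns an HTML <audio> tag with the synthesized speech embedded as base64.
    player_html = await texttospeech("Hello from EyeSee", "English", autoplay=False)
    print(player_html[:80], "...")

asyncio.run(demo())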
 
  description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """

  examples = [
+ ["test_images/pearl.jpg"],
+ ["test_images/ambass.jpg"],
+ ["test_images/Picture0.png"],
+ ["test_images/Picture1.png"],
+ ["test_images/Picture2.png"],
+ ["test_images/Picture3.png"],
+ ["test_images/Picture4.png"],
+ ["test_images/Picture5.png"],

  ]

  with gr.Blocks(
+ css=get_style(),
+ theme=gr.themes.Base()
  ) as iface:
  state = gr.State([])
  out_state = gr.State(None)
  input_labels_state = gr.State([])
  new_crop_save_path = gr.State(None)
  image_input_nobackground = gr.State(None)
+ like_state=gr.State([])
+ dislike_state=gr.State([])

  with gr.Column(scale=1.0):
  with gr.Column(visible=False) as modules_not_need_gpt:
  with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
  image_input_base = gr.Image(type="pil", interactive=True, elem_id="image_upload")
  example_image = gr.Image(type="pil", interactive=False, visible=False)
  with gr.Row():
  name_label_base = gr.Button(value="Name: ")
  artist_label_base = gr.Button(value="Artist: ")
  year_label_base = gr.Button(value="Year: ")
+ material_label_base = gr.Button(value="Material: ")

  with gr.Tab("Click") as click_tab:
  image_input = gr.Image(type="pil", interactive=True, elem_id="image_upload")
  example_image = gr.Image(type="pil", interactive=False, visible=False)
  with gr.Row():
  year_label = gr.Button(value="Year: ")
  material_label = gr.Button(value="Material: ")
  with gr.Row(scale=1.0):
+ with gr.Row(scale=0.8):
+ focus_type = gr.Radio(
+ choices=["CFV-D", "CFV-DA", "CFV-DAI","PFV-DDA"],
+ value="CFV-D",
+ label="Information Type",
+ interactive=True)
+ with gr.Row(scale=0.2):
+ submit_button_click=gr.Button(value="Submit", interactive=True,variant='primary',size="sm")
  with gr.Row(scale=1.0):
  with gr.Row(scale=0.4):
  point_prompt = gr.Radio(
  with gr.Row(scale=0.4):
  clear_button_click = gr.Button(value="Clear Clicks", interactive=True)
  clear_button_image = gr.Button(value="Clear Image", interactive=True)
+
+ with gr.Tab("Trajectory (beta)") as traj_tab:
  sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=20,
  elem_id="image_sketcher")
  with gr.Row():
  submit_button_sketcher = gr.Button(value="Submit", interactive=True)

+ with gr.Column(visible=False) as modules_need_gpt1:
+ with gr.Row(scale=1.0):
+ sentiment = gr.Radio(
+ choices=["Positive", "Natural", "Negative"],
+ value="Natural",
+ label="Sentiment",
+ interactive=True,
+ )
+ with gr.Row(scale=1.0):
+ factuality = gr.Radio(
+ choices=["Factual", "Imagination"],
+ value="Factual",
+ label="Factuality",
+ interactive=True,
+ )
+ length = gr.Slider(
+ minimum=10,
+ maximum=80,
+ value=10,
+ step=1,
+ interactive=True,
+ label="Generated Caption Length",
+ )
+ # Whether to merge wiki content into the caption
+ enable_wiki = gr.Radio(
+ choices=["Yes", "No"],
+ value="No",
+ label="Enable Wiki",
+ interactive=True)
+
  with gr.Row(scale=1.0):
+ gr.Examples(
  examples=examples,
  inputs=[example_image],
  )

+ # with gr.Column(visible=True) as modules_not_need_gpt3:
+
+
  with gr.Column(scale=0.5):
+ with gr.Row(align="right",visible=False) as language_select:
+ language = gr.Dropdown(
+ ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
+ value="English", label="Language", interactive=True)
+
+ with gr.Row(align="right",visible=False) as autoplay:
+ auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
+ output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
+
  with gr.Column(visible=True) as module_key_input:
  openai_api_key = gr.Textbox(
  placeholder="Input openAI API key",
  notification_box = gr.Textbox(lines=1, label="Notification", max_lines=5, show_label=False)

  with gr.Column():
+ with gr.Column(visible=False,scale=1.0) as modules_need_gpt2:
  paragraph_output = gr.Textbox(lines=7, label="Describe Everything", max_lines=7)
+ with gr.Column(visible=False,scale=0.2) as modules_need_gpt0:
  cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)

  with gr.Column(visible=False) as modules_not_need_gpt2:
+ with gr.Blocks(css=css):
+ chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True).style(height=600, scale=0.5)
+ with gr.Column(visible=False) as modules_need_gpt3:
+ chat_input = gr.Textbox(show_label=False, placeholder="Enter text and press Enter").style(
+ container=False)
+ with gr.Row():
+ clear_button_text = gr.Button(value="Clear Text", interactive=True)
+ submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
+ with gr.Row():
+ export_button = gr.Button(value="Export Chat Log", interactive=True, variant="primary")
+ with gr.Row():
+ chat_log_file = gr.File(label="Download Chat Log")

  # TTS interface hidden initially
+ with gr.Column(visible=False) as tts_interface:
+ input_text = gr.Textbox(label="Text Prompt", value="Hello, World !, here is an example of light voice cloning. Try to upload your best audio samples quality")
+ input_language = gr.Dropdown(label="Language", choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"], value="en")
+ input_audio = gr.Audio(label="Reference Audio", type="filepath", value="examples/female.wav")
+ input_mic = gr.Audio(source="microphone", type="filepath", label="Use Microphone for Reference")
+ use_mic = gr.Checkbox(label="Check to use Microphone as Reference", value=False)
+ agree = gr.Checkbox(label="Agree", value=True)
+ output_waveform = gr.Video(label="Waveform Visual")
+ # output_audio = gr.HTML(label="Synthesised Audio")

+ with gr.Row():
+ submit_tts = gr.Button(value="Submit", interactive=True)
+ clear_tts = gr.Button(value="Clear", interactive=True)



  mv_images = gr.State()
+
+ chatbot.like(handle_like_dislike, inputs=[like_state, dislike_state], outputs=[like_state, dislike_state])

  submit.click(fn=check_input_image, inputs=[new_crop_save_path], outputs=[processed_image]).success(
  fn=generate_mvs,
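chatbot.like above wires the thumbs-up/down icons to handle_like_dislike; in Gradio the event payload arrives as a gr.LikeData object carrying .index, .value and .liked. A minimal analogue of such a handler, with hypothetical names rather than the commit's exact wiring:

import gradio as gr

def on_like(evt: gr.LikeData, likes: list, dislikes: list):
    # evt.liked is True for a like, False for a dislike; evt.index points at the message.
    (likes if evt.liked else dislikes).append(evt.index)
    return likes, dislikes

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(likeable=True)
    likes, dislikes = gr.State([]), gr.State([])
    chatbot.like(on_like, inputs=[likes, dislikes], outputs=[likes, dislikes])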
 
  def clear_tts_fields():
  return [gr.update(value=""), gr.update(value=""), None, None, gr.update(value=False), gr.update(value=True), None, None]

+ # submit_tts.click(
+ # tts.predict,
+ # inputs=[input_text, input_language, input_audio, input_mic, use_mic, agree],
+ # outputs=[output_waveform, output_audio],
+ # queue=True
+ # )

  clear_tts.click(
  clear_tts_fields,

  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
+ modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay])
  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
  modules_not_need_gpt,
+ modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay])
  disable_chatGPT_button.click(init_wo_openai_api_key,
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
  modules_not_need_gpt,
+ modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay])

  enable_chatGPT_button.click(
  lambda: (None, [], [], [[], [], []], "", "", ""),
  return [gr.update(visible=False)]*4


+ traj_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
  click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
  base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])

  inputs=[
  image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
+ auto_play,paragraph,focus_type,openai_api_key,new_crop_save_path
  ],
  outputs=[
  chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,
+ output_audio
  ],
  show_progress=True,
  queue=True
  )
+

  submit_button_sketcher.click(
  inference_traject,
  if __name__ == '__main__':
  iface = create_ui()
  iface.queue(concurrency_count=5, api_open=False, max_size=10)
+ iface.launch(server_name="0.0.0.0", enable_queue=True)
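The click handlers wired above now target the async submit_caption; Gradio accepts async def callbacks and awaits them on its event loop, so the awaited TTS call does not block other requests. A minimal analogue with hypothetical names, not the commit's components:

import asyncio
import gradio as gr

async def slow_echo(text):
    await asyncio.sleep(1)  # stands in for the awaited TTS call
    return f"echo: {text}"

with gr.Blocks() as demo:
    box = gr.Textbox(label="Text")
    out = gr.Textbox(label="Result")
    gr.Button("Submit").click(slow_echo, inputs=[box], outputs=[out])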