Niki Zhang committed
Commit b6190c4 · verified · 1 Parent(s): 9029eb6

Update app.py

Files changed (1)
  1. app.py +557 -478
app.py CHANGED
@@ -28,7 +28,6 @@ import re
28
  import edge_tts
29
  from langchain import __version__
30
  import torch
31
- import gradio as gr
32
  from transformers import AutoProcessor, SiglipModel
33
  import faiss
34
  from huggingface_hub import hf_hub_download
@@ -38,6 +37,8 @@ import requests
38
  import spaces
39
  # Print the current version of LangChain
40
  print(f"Current LangChain version: {__version__}")
 
 
41
  # import tts
42
 
43
  ###############################################################################
@@ -46,9 +47,9 @@ print(f"Current LangChain version: {__version__}")
46
 
47
 
48
  # import spaces #
49
- import threading
50
 
51
- lock = threading.Lock()
52
  import os
53
  # import uuid
54
  # from diffusers import AnimateDiffPipeline, MotionAdapter, EulerDiscreteScheduler
@@ -94,220 +95,220 @@ from huggingface_hub import hf_hub_download
94
 
95
 
96
 
97
- def get_render_cameras(batch_size=1, M=120, radius=2.5, elevation=10.0, is_flexicubes=False):
98
- """
99
- Get the rendering camera parameters.
100
- """
101
- c2ws = get_circular_camera_poses(M=M, radius=radius, elevation=elevation)
102
- if is_flexicubes:
103
- cameras = torch.linalg.inv(c2ws)
104
- cameras = cameras.unsqueeze(0).repeat(batch_size, 1, 1, 1)
105
- else:
106
- extrinsics = c2ws.flatten(-2)
107
- intrinsics = FOV_to_intrinsics(50.0).unsqueeze(0).repeat(M, 1, 1).float().flatten(-2)
108
- cameras = torch.cat([extrinsics, intrinsics], dim=-1)
109
- cameras = cameras.unsqueeze(0).repeat(batch_size, 1, 1)
110
- return cameras
111
-
112
-
113
- def images_to_video(images, output_path, fps=30):
114
- # images: (N, C, H, W)
115
- os.makedirs(os.path.dirname(output_path), exist_ok=True)
116
- frames = []
117
- for i in range(images.shape[0]):
118
- frame = (images[i].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8).clip(0, 255)
119
- assert frame.shape[0] == images.shape[2] and frame.shape[1] == images.shape[3], \
120
- f"Frame shape mismatch: {frame.shape} vs {images.shape}"
121
- assert frame.min() >= 0 and frame.max() <= 255, \
122
- f"Frame value out of range: {frame.min()} ~ {frame.max()}"
123
- frames.append(frame)
124
- imageio.mimwrite(output_path, np.stack(frames), fps=fps, codec='h264')
125
-
126
-
127
- ###############################################################################
128
- # Configuration.
129
- ###############################################################################
130
-
131
- import shutil
132
-
133
- def find_cuda():
134
- # Check if CUDA_HOME or CUDA_PATH environment variables are set
135
- cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH')
136
-
137
- if cuda_home and os.path.exists(cuda_home):
138
- return cuda_home
139
-
140
- # Search for the nvcc executable in the system's PATH
141
- nvcc_path = shutil.which('nvcc')
142
-
143
- if nvcc_path:
144
- # Remove the 'bin/nvcc' part to get the CUDA installation path
145
- cuda_path = os.path.dirname(os.path.dirname(nvcc_path))
146
- return cuda_path
147
-
148
- return None
149
-
150
- cuda_path = find_cuda()
151
-
152
- if cuda_path:
153
- print(f"CUDA installation found at: {cuda_path}")
154
- else:
155
- print("CUDA installation not found")
156
-
157
- config_path = 'configs/instant-nerf-base.yaml'
158
- config = OmegaConf.load(config_path)
159
- config_name = os.path.basename(config_path).replace('.yaml', '')
160
- model_config = config.model_config
161
- infer_config = config.infer_config
162
-
163
- IS_FLEXICUBES = True if config_name.startswith('instant-mesh') else False
164
-
165
- device = torch.device('cuda')
166
-
167
- # load diffusion model
168
- print('Loading diffusion model ...')
169
- pipeline = DiffusionPipeline.from_pretrained(
170
- "sudo-ai/zero123plus-v1.2",
171
- custom_pipeline="zero123plus",
172
- torch_dtype=torch.float16,
173
- )
174
- pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(
175
- pipeline.scheduler.config, timestep_spacing='trailing'
176
- )
177
-
178
- # load custom white-background UNet
179
- unet_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="diffusion_pytorch_model.bin", repo_type="model")
180
- state_dict = torch.load(unet_ckpt_path, map_location='cpu')
181
- pipeline.unet.load_state_dict(state_dict, strict=True)
182
-
183
- pipeline = pipeline.to(device)
184
-
185
- # load reconstruction model
186
- print('Loading reconstruction model ...')
187
- model_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="instant_nerf_base.ckpt", repo_type="model")
188
- model0 = instantiate_from_config(model_config)
189
- state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict']
190
- state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.') and 'source_camera' not in k}
191
- model0.load_state_dict(state_dict, strict=True)
192
-
193
- model0 = model0.to(device)
194
-
195
- print('Loading Finished!')
196
-
197
-
198
- def check_input_image(input_image):
199
- if input_image is None:
200
- raise gr.Error("No image uploaded!")
201
- image = None
202
- else:
203
- image = Image.open(input_image)
204
- return image
205
 
206
- def preprocess(input_image, do_remove_background):
207
 
208
- rembg_session = rembg.new_session() if do_remove_background else None
209
 
210
- if do_remove_background:
211
- input_image = remove_background(input_image, rembg_session)
212
- input_image = resize_foreground(input_image, 0.85)
213
 
214
- return input_image
215
 
216
 
217
- # @spaces.GPU
218
- def generate_mvs(input_image, sample_steps, sample_seed):
219
 
220
- seed_everything(sample_seed)
221
 
222
- # sampling
223
- z123_image = pipeline(
224
- input_image,
225
- num_inference_steps=sample_steps
226
- ).images[0]
227
 
228
- show_image = np.asarray(z123_image, dtype=np.uint8)
229
- show_image = torch.from_numpy(show_image) # (960, 640, 3)
230
- show_image = rearrange(show_image, '(n h) (m w) c -> (n m) h w c', n=3, m=2)
231
- show_image = rearrange(show_image, '(n m) h w c -> (n h) (m w) c', n=2, m=3)
232
- show_image = Image.fromarray(show_image.numpy())
233
 
234
- return z123_image, show_image
235
 
236
 
237
- # @spaces.GPU
238
- def make3d(images):
239
 
240
- global model0
241
- if IS_FLEXICUBES:
242
- model0.init_flexicubes_geometry(device)
243
- model0 = model0.eval()
244
 
245
- images = np.asarray(images, dtype=np.float32) / 255.0
246
- images = torch.from_numpy(images).permute(2, 0, 1).contiguous().float() # (3, 960, 640)
247
- images = rearrange(images, 'c (n h) (m w) -> (n m) c h w', n=3, m=2) # (6, 3, 320, 320)
248
 
249
- input_cameras = get_zero123plus_input_cameras(batch_size=1, radius=4.0).to(device)
250
- render_cameras = get_render_cameras(batch_size=1, radius=2.5, is_flexicubes=IS_FLEXICUBES).to(device)
251
 
252
- images = images.unsqueeze(0).to(device)
253
- images = v2.functional.resize(images, (320, 320), interpolation=3, antialias=True).clamp(0, 1)
254
 
255
- mesh_fpath = tempfile.NamedTemporaryFile(suffix=f".obj", delete=False).name
256
- print(mesh_fpath)
257
- mesh_basename = os.path.basename(mesh_fpath).split('.')[0]
258
- mesh_dirname = os.path.dirname(mesh_fpath)
259
- video_fpath = os.path.join(mesh_dirname, f"{mesh_basename}.mp4")
260
- mesh_glb_fpath = os.path.join(mesh_dirname, f"{mesh_basename}.glb")
261
 
262
- with torch.no_grad():
263
- # get triplane
264
- planes = model0.forward_planes(images, input_cameras)
265
 
266
- # # get video
267
- # chunk_size = 20 if IS_FLEXICUBES else 1
268
- # render_size = 384
269
 
270
- # frames = []
271
- # for i in tqdm(range(0, render_cameras.shape[1], chunk_size)):
272
- # if IS_FLEXICUBES:
273
- # frame = model.forward_geometry(
274
- # planes,
275
- # render_cameras[:, i:i+chunk_size],
276
- # render_size=render_size,
277
- # )['img']
278
- # else:
279
- # frame = model.synthesizer(
280
- # planes,
281
- # cameras=render_cameras[:, i:i+chunk_size],
282
- # render_size=render_size,
283
- # )['images_rgb']
284
- # frames.append(frame)
285
- # frames = torch.cat(frames, dim=1)
286
-
287
- # images_to_video(
288
- # frames[0],
289
- # video_fpath,
290
- # fps=30,
291
- # )
292
-
293
- # print(f"Video saved to {video_fpath}")
294
-
295
- # get mesh
296
- mesh_out = model0.extract_mesh(
297
- planes,
298
- use_texture_map=False,
299
- **infer_config,
300
- )
301
-
302
- vertices, faces, vertex_colors = mesh_out
303
- vertices = vertices[:, [1, 2, 0]]
304
 
305
- save_glb(vertices, faces, vertex_colors, mesh_glb_fpath)
306
- save_obj(vertices, faces, vertex_colors, mesh_fpath)
307
 
308
- print(f"Mesh saved to {mesh_fpath}")
309
 
310
- return mesh_fpath, mesh_glb_fpath
311
 
312
 
313
  ###############################################################################
@@ -471,10 +472,13 @@ examples = [
471
 
472
  css = """
473
  #warning {background-color: #FFCCCB}
474
- .chatbot {
475
- padding: 0 !important;
476
- margin: 0 !important;
477
- }
478
  """
479
  filtered_language_dict = {
480
  'English': 'en-US-JennyNeural',
@@ -487,10 +491,10 @@ filtered_language_dict = {
487
  }
488
 
489
  focus_map = {
490
- "CFV-D":0,
491
- "CFV-DA":1,
492
- "CFV-DAI":2,
493
- "PFV-DDA":3
494
  }
495
 
496
  '''
@@ -616,17 +620,17 @@ def init_openai_api_key(api_key=""):
616
  global gpt_state
617
  gpt_state=1
618
  # return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
619
- return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*2
620
  else:
621
  gpt_state=0
622
  # return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
623
- return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*2
624
 
625
  def init_wo_openai_api_key():
626
  global gpt_state
627
  gpt_state=0
628
  # return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3
629
- return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*2
630
 
631
  def get_click_prompt(chat_input, click_state, click_mode):
632
  inputs = json.loads(chat_input)
@@ -666,15 +670,17 @@ def update_click_state(click_state, caption, click_mode):
666
 
667
  async def chat_input_callback(*args):
668
  visual_chatgpt, chat_input, click_state, state, aux_state ,language , autoplay = args
 
669
  if visual_chatgpt is not None:
670
- state, _, aux_state, _ = visual_chatgpt.run_text(chat_input, state, aux_state)
671
  last_text, last_response = state[-1]
672
  print("last response",last_response)
673
- if autoplay:
674
- audio = await texttospeech(last_response,language,autoplay)
 
675
  else:
676
- audio=None
677
- return state, state, aux_state, audio
678
  else:
679
  response = "Text refiner is not initilzed, please input openai api key."
680
  state = state + [(chat_input, response)]
@@ -722,9 +728,9 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
722
  visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
723
  print("memory",visual_chatgpt.agent.memory)
724
  # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
725
- parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and material used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"material\": \"Material used in the painting\" }")
726
  parsed_data = json.loads(parsed_data.replace("'", "\""))
727
- name, artist, year, material= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["material"]
728
  # artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
729
 
730
 
@@ -736,7 +742,7 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
736
  ]
737
 
738
  return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
739
- original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Material: {material}"]*4 + [paragraph,artist]
740
 
741
 
742
 
@@ -774,7 +780,8 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
774
 
775
  enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
776
  out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
777
-
 
778
  state = state + [("Image point: {}, Input label: {}".format(prompt["input_point"], prompt["input_label"]), None)]
779
  update_click_state(click_state, out['generated_captions']['raw_caption'], click_mode)
780
  text = out['generated_captions']['raw_caption']
@@ -798,13 +805,11 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
798
 
799
  print("new crop save",new_crop_save_path)
800
 
801
- yield state, state, click_state, image_input_nobackground, image_input_withbackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
802
 
803
 
804
 
805
-
806
-
807
- async def submit_caption(state, text_refiner, length, sentiment, factuality, language,
808
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
809
  autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path):
810
  print("state",state)
@@ -846,6 +851,9 @@ async def submit_caption(state, text_refiner, length, sentiment, factuality, lan
846
  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
847
  # input_points=input_points, input_labels=input_labels)
848
  try:
849
  audio_output = await texttospeech(read_info, language, autoplay)
850
  print("done")
851
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
@@ -858,16 +866,11 @@ async def submit_caption(state, text_refiner, length, sentiment, factuality, lan
858
  return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
859
 
860
  else:
861
- try:
862
- audio_output = await texttospeech(focus_info, language, autoplay)
863
- # waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
864
- # return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
865
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
866
 
867
- except Exception as e:
868
- state = state + [(None, f"Error during TTS prediction: {str(e)}")]
869
- print(f"Error during TTS prediction: {str(e)}")
870
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
871
 
872
  def generate_prompt(focus_type, paragraph,length, sentiment, factuality, language):
873
 
@@ -1069,7 +1072,7 @@ async def inference_traject(origin_image,sketcher_image, enable_wiki, language,
1069
  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
1070
  print(f"Error during TTS prediction: {str(e)}")
1071
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
1072
- return state, state, image_input,audio_output,crop_save_path,d3_input
1073
 
1074
 
1075
  else:
@@ -1222,58 +1225,58 @@ def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragr
1222
  # return state,dislike_res
1223
 
1224
 
1225
- def get_style():
1226
- current_version = version.parse(gr.__version__)
1227
- print(current_version)
1228
- if current_version <= version.parse('3.24.1'):
1229
- style = '''
1230
- #image_sketcher{min-height:500px}
1231
- #image_sketcher [data-testid="image"], #image_sketcher [data-testid="image"] > div{min-height: 500px}
1232
- #image_upload{min-height:500px}
1233
- #image_upload [data-testid="image"], #image_upload [data-testid="image"] > div{min-height: 500px}
1234
- .custom-language {
1235
- width: 20%;
1236
- }
1237
-
1238
- .custom-autoplay {
1239
- width: 40%;
1240
- }
1241
-
1242
- .custom-output {
1243
- width: 30%;
1244
- }
1245
-
1246
- '''
1247
- elif current_version <= version.parse('3.27'):
1248
- style = '''
1249
- #image_sketcher{min-height:500px}
1250
- #image_upload{min-height:500px}
1251
- .custom-language {
1252
- width: 20%;
1253
- }
1254
-
1255
- .custom-autoplay {
1256
- width: 40%;
1257
- }
1258
-
1259
- .custom-output {
1260
- width: 30%;
1261
- }
1262
- .custom-gallery {
1263
- display: flex;
1264
- flex-wrap: wrap;
1265
- justify-content: space-between;
1266
- }
1267
-
1268
- .custom-gallery img {
1269
- width: 48%;
1270
- margin-bottom: 10px;
1271
- }
1272
- '''
1273
- else:
1274
- style = None
1275
 
1276
- return style
1277
 
1278
  # def handle_like_dislike(like_data, like_state, dislike_state):
1279
  # if like_data.liked:
@@ -1323,9 +1326,21 @@ def print_like_dislike(x: gr.LikeData,like_res,dislike_res,state):
1323
  dislike_res.append(x.value)
1324
  state = state + [(None, f"Disliked Received 👎")]
1325
  return like_res,dislike_res,state
1326
-
1327
-
1328
1329
 
1330
  def create_ui():
1331
  title = """<p><h1 align="center">EyeSee Anything in Art</h1></p>
@@ -1345,7 +1360,7 @@ def create_ui():
1345
  ]
1346
 
1347
  with gr.Blocks(
1348
- css=get_style(),
1349
  theme=gr.themes.Base()
1350
  ) as iface:
1351
  state = gr.State([])
@@ -1370,6 +1385,7 @@ def create_ui():
1370
  dislike_res=gr.State([])
1371
  gr.Markdown(title)
1372
  gr.Markdown(description)
 
1373
  # with gr.Row(align="right", visible=False, elem_id="top_row") as top_row:
1374
  # with gr.Column(scale=0.5):
1375
  # # gr.Markdown("Left side content")
@@ -1392,9 +1408,9 @@ def create_ui():
1392
  value="English", label="Language", interactive=True, elem_classes="custom-language"
1393
  )
1394
  length = gr.Slider(
1395
- minimum=20,
1396
- maximum=100,
1397
- value=40,
1398
  step=1,
1399
  interactive=True,
1400
  label="Generated Caption Length",
@@ -1416,8 +1432,7 @@ def create_ui():
1416
  # auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
1417
  # output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
1418
 
1419
- with gr.Row():
1420
-
1421
  with gr.Column(scale=6):
1422
  with gr.Column(visible=False) as modules_not_need_gpt:
1423
  with gr.Tab("Base(GPT Power)") as base_tab:
@@ -1426,7 +1441,7 @@ def create_ui():
1426
  name_label_base = gr.Button(value="Name: ")
1427
  artist_label_base = gr.Button(value="Artist: ")
1428
  year_label_base = gr.Button(value="Year: ")
1429
- material_label_base = gr.Button(value="Material: ")
1430
 
1431
  with gr.Tab("Base2") as base_tab2:
1432
  image_input_base_2 = gr.Image(type="pil", interactive=True, elem_id="image_upload")
@@ -1434,52 +1449,55 @@ def create_ui():
1434
  name_label_base2 = gr.Button(value="Name: ")
1435
  artist_label_base2 = gr.Button(value="Artist: ")
1436
  year_label_base2 = gr.Button(value="Year: ")
1437
- material_label_base2 = gr.Button(value="Material: ")
1438
 
1439
  with gr.Tab("Click") as click_tab:
1440
- image_input = gr.Image(type="pil", interactive=True, elem_id="image_upload")
1441
- example_image = gr.Image(type="pil", interactive=False, visible=False)
1442
- # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
1443
- with gr.Row():
1444
- name_label = gr.Button(value="Name: ")
1445
- artist_label = gr.Button(value="Artist: ")
1446
- year_label = gr.Button(value="Year: ")
1447
- material_label = gr.Button(value="Material: ")
1448
  with gr.Row():
1449
- with gr.Column():
1450
  with gr.Row():
1451
- focus_type = gr.Radio(
1452
- choices=["CFV-D", "CFV-DA", "CFV-DAI","PFV-DDA"],
1453
- value="CFV-D",
1454
- label="Information Type",
1455
- interactive=True,
1456
- scale=4)
1457
-
1458
  with gr.Row():
1459
- point_prompt = gr.Radio(
1460
- choices=["Positive", "Negative"],
1461
- value="Positive",
1462
- label="Point Prompt",
1463
- scale=5,
1464
- interactive=True)
1465
  click_mode = gr.Radio(
1466
  choices=["Continuous", "Single"],
1467
  value="Continuous",
1468
  label="Clicking Mode",
1469
  scale=5,
1470
  interactive=True)
1471
- with gr.Column():
1472
- with gr.Row():
1473
- submit_button_click=gr.Button(value="Submit", interactive=True,variant='primary',scale=2)
1474
- with gr.Row():
1475
- clear_button_click = gr.Button(value="Clear Clicks", interactive=True,scale=2)
1476
- clear_button_image = gr.Button(value="Clear Image", interactive=True,scale=2)
1477
-
1478
- with gr.Tab("Trajectory (beta)") as traj_tab:
1479
  # sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=10,
1480
  # elem_id="image_sketcher")
1481
- sketcher_input = gr.ImageEditor(type="pil", interactive=True,
1482
- elem_id="image_sketcher")
1483
  with gr.Row():
1484
  name_label_traj = gr.Button(value="Name: ")
1485
  artist_label_traj = gr.Button(value="Artist: ")
@@ -1489,28 +1507,16 @@ def create_ui():
1489
  with gr.Row():
1490
  clear_button_sketcher = gr.Button(value="Clear Sketch", interactive=True)
1491
  submit_button_sketcher = gr.Button(value="Submit", interactive=True)
1492
- with gr.Row():
1493
- with gr.Row():
1494
- focus_type_sketch = gr.Radio(
1495
- choices=["CFV-D", "CFV-DA", "CFV-DAI","PFV-DDA"],
1496
- value="CFV-D",
1497
- label="Information Type",
1498
- interactive=True)
1499
- Input_sketch = gr.Radio(
1500
- choices=["Trace+Seg", "Trace"],
1501
- value="Trace",
1502
- label="Trace Type",
1503
- interactive=True)
1504
 
1505
  with gr.Column(visible=False,scale=4) as modules_need_gpt1:
1506
- with gr.Row():
1507
  sentiment = gr.Radio(
1508
  choices=["Positive", "Natural", "Negative"],
1509
  value="Natural",
1510
  label="Sentiment",
1511
  interactive=True,
1512
  )
1513
- with gr.Row():
1514
  factuality = gr.Radio(
1515
  choices=["Factual", "Imagination"],
1516
  value="Factual",
@@ -1531,6 +1537,8 @@ def create_ui():
1531
  value="No",
1532
  label="Expert",
1533
  interactive=True)
1534
  with gr.Column(visible=True) as modules_not_need_gpt3:
1535
  gr.Examples(
1536
  examples=examples,
@@ -1541,7 +1549,7 @@ def create_ui():
1541
 
1542
 
1543
 
1544
- with gr.Column(scale=5):
1545
  with gr.Column(visible=True) as module_key_input:
1546
  openai_api_key = gr.Textbox(
1547
  placeholder="Input openAI API key",
@@ -1563,20 +1571,16 @@ def create_ui():
1563
 
1564
  with gr.Column(visible=False) as modules_not_need_gpt2:
1565
  with gr.Blocks():
1566
- chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True,height=600)
1567
  with gr.Column(visible=False) as modules_need_gpt3:
1568
- chat_input = gr.Textbox(show_label=False, placeholder="Enter text and press Enter",container=False)
1569
  with gr.Row():
1570
- clear_button_text = gr.Button(value="Clear Text", interactive=True)
1571
- submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
 
1572
  # upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
1573
  # downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
1574
-
1575
-
1576
- with gr.Row():
1577
- export_button = gr.Button(value="Export Chat Log", interactive=True, variant="primary")
1578
- with gr.Row():
1579
- chat_log_file = gr.File(label="Download Chat Log")
1580
 
1581
  # TTS interface hidden initially
1582
  with gr.Column(visible=False) as tts_interface:
@@ -1689,6 +1693,15 @@ def create_ui():
1689
  # show_share_button=True,
1690
  # show_download_button=True
1691
  )
1692
 
1693
 
1694
 
@@ -1727,7 +1740,7 @@ def create_ui():
1727
  # outputs=[result, seed],
1728
  # api_name="run",
1729
  # )
1730
- run_button.click(
1731
  fn=infer,
1732
  inputs=[new_crop_save_path],
1733
  outputs=[result]
@@ -1742,106 +1755,106 @@ def create_ui():
1742
  # this part is for 3d generate.
1743
  ###############################################################################
1744
 
1745
- with gr.Row(variant="panel",visible=False) as d3_model:
1746
- with gr.Column():
1747
- with gr.Row():
1748
- input_image = gr.Image(
1749
- label="Input Image",
1750
- image_mode="RGBA",
1751
- sources="upload",
1752
- #width=256,
1753
- #height=256,
1754
- type="pil",
1755
- elem_id="content_image",
1756
- )
1757
- processed_image = gr.Image(
1758
- label="Processed Image",
1759
- image_mode="RGBA",
1760
- #width=256,
1761
- #height=256,
1762
- type="pil",
1763
- interactive=False
1764
- )
1765
- with gr.Row():
1766
- with gr.Group():
1767
- do_remove_background = gr.Checkbox(
1768
- label="Remove Background", value=True
1769
- )
1770
- sample_seed = gr.Number(value=42, label="Seed Value", precision=0)
1771
 
1772
- sample_steps = gr.Slider(
1773
- label="Sample Steps",
1774
- minimum=30,
1775
- maximum=75,
1776
- value=75,
1777
- step=5
1778
- )
1779
 
1780
- with gr.Row():
1781
- submit = gr.Button("Generate", elem_id="generate", variant="primary")
1782
 
1783
- with gr.Row(variant="panel"):
1784
- gr.Examples(
1785
- examples=[
1786
- os.path.join("examples", img_name) for img_name in sorted(os.listdir("examples"))
1787
- ],
1788
- inputs=[input_image],
1789
- label="Examples",
1790
- cache_examples=False,
1791
- examples_per_page=16
1792
- )
1793
-
1794
- with gr.Column():
1795
 
1796
- with gr.Row():
1797
 
1798
- with gr.Column():
1799
- mv_show_images = gr.Image(
1800
- label="Generated Multi-views",
1801
- type="pil",
1802
- width=379,
1803
- interactive=False
1804
- )
1805
 
1806
- # with gr.Column():
1807
- # output_video = gr.Video(
1808
- # label="video", format="mp4",
1809
- # width=379,
1810
- # autoplay=True,
1811
- # interactive=False
1812
- # )
1813
 
1814
- with gr.Row():
1815
- with gr.Tab("OBJ"):
1816
- output_model_obj = gr.Model3D(
1817
- label="Output Model (OBJ Format)",
1818
- interactive=False,
1819
- )
1820
- gr.Markdown("Note: Downloaded .obj model will be flipped. Export .glb instead or manually flip it before usage.")
1821
- with gr.Tab("GLB"):
1822
- output_model_glb = gr.Model3D(
1823
- label="Output Model (GLB Format)",
1824
- interactive=False,
1825
- )
1826
- gr.Markdown("Note: The model shown here has a darker appearance. Download to get correct results.")
1827
 
1828
 
1829
 
1830
 
1831
- mv_images = gr.State()
1832
 
1833
- chatbot.like(print_like_dislike, inputs=[like_res,dislike_res,state], outputs=[like_res,dislike_res,chatbot])
1834
 
1835
- submit.click(fn=check_input_image, inputs=[new_crop_save_path], outputs=[processed_image]).success(
1836
- fn=generate_mvs,
1837
- inputs=[processed_image, sample_steps, sample_seed],
1838
- outputs=[mv_images, mv_show_images]
1839
 
1840
- ).success(
1841
- fn=make3d,
1842
- inputs=[mv_images],
1843
- outputs=[output_model_obj, output_model_glb]
1844
- )
1845
 
1846
  ###############################################################################
1847
  # above part is for 3d generate.
@@ -1868,13 +1881,13 @@ def create_ui():
1868
 
1869
 
1870
 
1871
- clear_button_sketcher.click(
1872
- lambda x: (x),
1873
- [origin_image],
1874
- [sketcher_input],
1875
- queue=False,
1876
- show_progress=False
1877
- )
1878
 
1879
 
1880
 
@@ -1882,11 +1895,11 @@ def create_ui():
1882
 
1883
  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
1884
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
1885
- modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
1886
  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
1887
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
1888
  modules_not_need_gpt,
1889
- modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
1890
  # openai_api_key.submit(init_openai_api_key,
1891
  # outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
1892
  # modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
@@ -1898,7 +1911,7 @@ def create_ui():
1898
  disable_chatGPT_button.click(init_wo_openai_api_key,
1899
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
1900
  modules_not_need_gpt,
1901
- modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
1902
 
1903
  artist_label_base2.click(
1904
  get_artistinfo,
@@ -1995,23 +2008,23 @@ def create_ui():
1995
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
1996
  paragraph,artist])
1997
 
1998
- image_input_base_2.upload(upload_callback, [image_input_base_2, state, visual_chatgpt,openai_api_key],
1999
- [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2000
- image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2001
- name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2002
- paragraph,artist])
2003
 
2004
- image_input.upload(upload_callback, [image_input, state, visual_chatgpt,openai_api_key],
2005
- [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2006
- image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2007
- name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2008
- paragraph,artist])
2009
 
2010
- sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key],
2011
- [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2012
- image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2013
- name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
2014
- paragraph,artist])
2015
 
2016
  # image_input.upload(upload_callback, [image_input, state, visual_chatgpt, openai_api_key],
2017
  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
@@ -2022,9 +2035,9 @@ def create_ui():
2022
  chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
2023
  [chatbot, state, aux_state,output_audio])
2024
  chat_input.submit(lambda: "", None, chat_input)
2025
- submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
2026
- [chatbot, state, aux_state,output_audio])
2027
- submit_button_text.click(lambda: "", None, chat_input)
2028
  example_image.change(upload_callback, [example_image, state, visual_chatgpt, openai_api_key],
2029
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2030
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
@@ -2068,37 +2081,103 @@ def create_ui():
2068
  image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
2069
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
2070
  ],
2071
- outputs=[chatbot, state, click_state, image_input, input_image, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground],
2072
  show_progress=False, queue=True
2073
  )
2074
 
2075
 
2076
- submit_button_click.click(
2077
  submit_caption,
2078
  inputs=[
2079
- state, text_refiner,length, sentiment, factuality, language,
2080
- out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
2081
- auto_play,paragraph,focus_type,openai_api_key,new_crop_save_path
2082
  ],
2083
  outputs=[
2084
- chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,
2085
- output_audio
2086
  ],
2087
  show_progress=True,
2088
  queue=True
2089
  )
2090
-
2091
 
2092
- submit_button_sketcher.click(
2093
- inference_traject,
2094
- inputs=[
2095
- origin_image,sketcher_input, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
2096
- original_size, input_size, text_refiner,focus_type_sketch,paragraph,openai_api_key,auto_play,Input_sketch
2097
- ],
2098
- outputs=[chatbot, state, sketcher_input,output_audio,new_crop_save_path,input_image],
2099
- show_progress=False, queue=True
2100
  )
2101
2102
  export_button.click(
2103
  export_chat_log,
2104
  inputs=[state,paragraph,like_res,dislike_res],
@@ -2129,4 +2208,4 @@ if __name__ == '__main__':
2129
  iface = create_ui()
2130
  iface.queue(api_open=False, max_size=10)
2131
  # iface.queue(concurrency_count=5, api_open=False, max_size=10)
2132
- iface.launch(server_name="0.0.0.0")
 
28
  import edge_tts
29
  from langchain import __version__
30
  import torch
 
31
  from transformers import AutoProcessor, SiglipModel
32
  import faiss
33
  from huggingface_hub import hf_hub_download
 
37
  import spaces
38
  # Print the current version of LangChain
39
  print(f"Current LangChain version: {__version__}")
40
+
41
+ print("testing testing")
42
  # import tts
43
 
44
  ###############################################################################
 
47
 
48
 
49
  # import spaces #
50
+ # import threading
51
 
52
+ # lock = threading.Lock()
53
  import os
54
  # import uuid
55
  # from diffusers import AnimateDiffPipeline, MotionAdapter, EulerDiscreteScheduler
 
95
 
96
 
97
 
98
+ # def get_render_cameras(batch_size=1, M=120, radius=2.5, elevation=10.0, is_flexicubes=False):
99
+ # """
100
+ # Get the rendering camera parameters.
101
+ # """
102
+ # c2ws = get_circular_camera_poses(M=M, radius=radius, elevation=elevation)
103
+ # if is_flexicubes:
104
+ # cameras = torch.linalg.inv(c2ws)
105
+ # cameras = cameras.unsqueeze(0).repeat(batch_size, 1, 1, 1)
106
+ # else:
107
+ # extrinsics = c2ws.flatten(-2)
108
+ # intrinsics = FOV_to_intrinsics(50.0).unsqueeze(0).repeat(M, 1, 1).float().flatten(-2)
109
+ # cameras = torch.cat([extrinsics, intrinsics], dim=-1)
110
+ # cameras = cameras.unsqueeze(0).repeat(batch_size, 1, 1)
111
+ # return cameras
112
+
113
+
114
+ # def images_to_video(images, output_path, fps=30):
115
+ # # images: (N, C, H, W)
116
+ # os.makedirs(os.path.dirname(output_path), exist_ok=True)
117
+ # frames = []
118
+ # for i in range(images.shape[0]):
119
+ # frame = (images[i].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8).clip(0, 255)
120
+ # assert frame.shape[0] == images.shape[2] and frame.shape[1] == images.shape[3], \
121
+ # f"Frame shape mismatch: {frame.shape} vs {images.shape}"
122
+ # assert frame.min() >= 0 and frame.max() <= 255, \
123
+ # f"Frame value out of range: {frame.min()} ~ {frame.max()}"
124
+ # frames.append(frame)
125
+ # imageio.mimwrite(output_path, np.stack(frames), fps=fps, codec='h264')
126
+
127
+
128
+ # ###############################################################################
129
+ # # Configuration.
130
+ # ###############################################################################
131
+
132
+ # import shutil
133
+
134
+ # def find_cuda():
135
+ # # Check if CUDA_HOME or CUDA_PATH environment variables are set
136
+ # cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH')
137
+
138
+ # if cuda_home and os.path.exists(cuda_home):
139
+ # return cuda_home
140
+
141
+ # # Search for the nvcc executable in the system's PATH
142
+ # nvcc_path = shutil.which('nvcc')
143
+
144
+ # if nvcc_path:
145
+ # # Remove the 'bin/nvcc' part to get the CUDA installation path
146
+ # cuda_path = os.path.dirname(os.path.dirname(nvcc_path))
147
+ # return cuda_path
148
+
149
+ # return None
150
+
151
+ # cuda_path = find_cuda()
152
+
153
+ # if cuda_path:
154
+ # print(f"CUDA installation found at: {cuda_path}")
155
+ # else:
156
+ # print("CUDA installation not found")
157
+
158
+ # config_path = 'configs/instant-nerf-base.yaml'
159
+ # config = OmegaConf.load(config_path)
160
+ # config_name = os.path.basename(config_path).replace('.yaml', '')
161
+ # model_config = config.model_config
162
+ # infer_config = config.infer_config
163
+
164
+ # IS_FLEXICUBES = True if config_name.startswith('instant-mesh') else False
165
+
166
+ # device = torch.device('cuda')
167
+
168
+ # # load diffusion model
169
+ # print('Loading diffusion model ...')
170
+ # pipeline = DiffusionPipeline.from_pretrained(
171
+ # "sudo-ai/zero123plus-v1.2",
172
+ # custom_pipeline="zero123plus",
173
+ # torch_dtype=torch.float16,
174
+ # )
175
+ # pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(
176
+ # pipeline.scheduler.config, timestep_spacing='trailing'
177
+ # )
178
+
179
+ # # load custom white-background UNet
180
+ # unet_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="diffusion_pytorch_model.bin", repo_type="model")
181
+ # state_dict = torch.load(unet_ckpt_path, map_location='cpu')
182
+ # pipeline.unet.load_state_dict(state_dict, strict=True)
183
+
184
+ # pipeline = pipeline.to(device)
185
+
186
+ # # load reconstruction model
187
+ # print('Loading reconstruction model ...')
188
+ # model_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="instant_nerf_base.ckpt", repo_type="model")
189
+ # model0 = instantiate_from_config(model_config)
190
+ # state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict']
191
+ # state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.') and 'source_camera' not in k}
192
+ # model0.load_state_dict(state_dict, strict=True)
193
+
194
+ # model0 = model0.to(device)
195
+
196
+ # print('Loading Finished!')
197
+
198
+
199
+ # def check_input_image(input_image):
200
+ # if input_image is None:
201
+ # raise gr.Error("No image uploaded!")
202
+ # image = None
203
+ # else:
204
+ # image = Image.open(input_image)
205
+ # return image
206
 
207
+ # def preprocess(input_image, do_remove_background):
208
 
209
+ # rembg_session = rembg.new_session() if do_remove_background else None
210
 
211
+ # if do_remove_background:
212
+ # input_image = remove_background(input_image, rembg_session)
213
+ # input_image = resize_foreground(input_image, 0.85)
214
 
215
+ # return input_image
216
 
217
 
218
+ # # @spaces.GPU
219
+ # def generate_mvs(input_image, sample_steps, sample_seed):
220
 
221
+ # seed_everything(sample_seed)
222
 
223
+ # # sampling
224
+ # z123_image = pipeline(
225
+ # input_image,
226
+ # num_inference_steps=sample_steps
227
+ # ).images[0]
228
 
229
+ # show_image = np.asarray(z123_image, dtype=np.uint8)
230
+ # show_image = torch.from_numpy(show_image) # (960, 640, 3)
231
+ # show_image = rearrange(show_image, '(n h) (m w) c -> (n m) h w c', n=3, m=2)
232
+ # show_image = rearrange(show_image, '(n m) h w c -> (n h) (m w) c', n=2, m=3)
233
+ # show_image = Image.fromarray(show_image.numpy())
234
 
235
+ # return z123_image, show_image
236
 
237
 
238
+ # # @spaces.GPU
239
+ # def make3d(images):
240
 
241
+ # global model0
242
+ # if IS_FLEXICUBES:
243
+ # model0.init_flexicubes_geometry(device)
244
+ # model0 = model0.eval()
245
 
246
+ # images = np.asarray(images, dtype=np.float32) / 255.0
247
+ # images = torch.from_numpy(images).permute(2, 0, 1).contiguous().float() # (3, 960, 640)
248
+ # images = rearrange(images, 'c (n h) (m w) -> (n m) c h w', n=3, m=2) # (6, 3, 320, 320)
249
 
250
+ # input_cameras = get_zero123plus_input_cameras(batch_size=1, radius=4.0).to(device)
251
+ # render_cameras = get_render_cameras(batch_size=1, radius=2.5, is_flexicubes=IS_FLEXICUBES).to(device)
252
 
253
+ # images = images.unsqueeze(0).to(device)
254
+ # images = v2.functional.resize(images, (320, 320), interpolation=3, antialias=True).clamp(0, 1)
255
 
256
+ # mesh_fpath = tempfile.NamedTemporaryFile(suffix=f".obj", delete=False).name
257
+ # print(mesh_fpath)
258
+ # mesh_basename = os.path.basename(mesh_fpath).split('.')[0]
259
+ # mesh_dirname = os.path.dirname(mesh_fpath)
260
+ # video_fpath = os.path.join(mesh_dirname, f"{mesh_basename}.mp4")
261
+ # mesh_glb_fpath = os.path.join(mesh_dirname, f"{mesh_basename}.glb")
262
 
263
+ # with torch.no_grad():
264
+ # # get triplane
265
+ # planes = model0.forward_planes(images, input_cameras)
266
 
267
+ # # # get video
268
+ # # chunk_size = 20 if IS_FLEXICUBES else 1
269
+ # # render_size = 384
270
 
271
+ # # frames = []
272
+ # # for i in tqdm(range(0, render_cameras.shape[1], chunk_size)):
273
+ # # if IS_FLEXICUBES:
274
+ # # frame = model.forward_geometry(
275
+ # # planes,
276
+ # # render_cameras[:, i:i+chunk_size],
277
+ # # render_size=render_size,
278
+ # # )['img']
279
+ # # else:
280
+ # # frame = model.synthesizer(
281
+ # # planes,
282
+ # # cameras=render_cameras[:, i:i+chunk_size],
283
+ # # render_size=render_size,
284
+ # # )['images_rgb']
285
+ # # frames.append(frame)
286
+ # # frames = torch.cat(frames, dim=1)
287
+
288
+ # # images_to_video(
289
+ # # frames[0],
290
+ # # video_fpath,
291
+ # # fps=30,
292
+ # # )
293
+
294
+ # # print(f"Video saved to {video_fpath}")
295
+
296
+ # # get mesh
297
+ # mesh_out = model0.extract_mesh(
298
+ # planes,
299
+ # use_texture_map=False,
300
+ # **infer_config,
301
+ # )
302
+
303
+ # vertices, faces, vertex_colors = mesh_out
304
+ # vertices = vertices[:, [1, 2, 0]]
305
 
306
+ # save_glb(vertices, faces, vertex_colors, mesh_glb_fpath)
307
+ # save_obj(vertices, faces, vertex_colors, mesh_fpath)
308
 
309
+ # print(f"Mesh saved to {mesh_fpath}")
310
 
311
+ # return mesh_fpath, mesh_glb_fpath
312
 
313
 
314
  ###############################################################################
 
472
 
473
  css = """
474
  #warning {background-color: #FFCCCB}
475
+ .tools_button {
476
+ background: white;
477
+ border: none !important;
478
+ box-shadow: none !important;
479
+ }
480
+ #tool_box {max-width: 50px}
481
+
482
  """
483
  filtered_language_dict = {
484
  'English': 'en-US-JennyNeural',
 
491
  }
492
 
493
  focus_map = {
494
+ "D":0,
495
+ "DA":1,
496
+ "DAI":2,
497
+ "DDA":3
498
  }
499
 
500
  '''
 
620
  global gpt_state
621
  gpt_state=1
622
  # return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
623
+ return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]
624
  else:
625
  gpt_state=0
626
  # return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
627
+ return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]
628
 
629
  def init_wo_openai_api_key():
630
  global gpt_state
631
  gpt_state=0
632
  # return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3
633
+ return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]
634
 
635
  def get_click_prompt(chat_input, click_state, click_mode):
636
  inputs = json.loads(chat_input)
 
670
 
671
  async def chat_input_callback(*args):
672
  visual_chatgpt, chat_input, click_state, state, aux_state ,language , autoplay = args
673
+ message = chat_input["text"]
674
  if visual_chatgpt is not None:
675
+ state, _, aux_state, _ = visual_chatgpt.run_text(message, state, aux_state)
676
  last_text, last_response = state[-1]
677
  print("last response",last_response)
678
+ if autoplay==False:
679
+ return state, state, aux_state, None
680
+
681
  else:
682
+ audio = await texttospeech(last_response,language,autoplay)
683
+ return state, state, aux_state, audio
684
  else:
685
  response = "Text refiner is not initilzed, please input openai api key."
686
  state = state + [(chat_input, response)]
 
728
  visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
729
  print("memory",visual_chatgpt.agent.memory)
730
  # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
731
+ parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and painting style used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"style\": \"Painting style used in the painting\" }")
732
  parsed_data = json.loads(parsed_data.replace("'", "\""))
733
+ name, artist, year, material= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"]
734
  # artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
735
 
736
 
 
742
  ]
743
 
744
  return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
745
+ original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist]
746
 
747
 
748
 
 
780
 
781
  enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
782
  out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
783
+ # state = state + [("You've selected image point at {}, ".format(prompt["input_point"]), None)]
784
+
785
  state = state + [("Image point: {}, Input label: {}".format(prompt["input_point"], prompt["input_label"]), None)]
786
  update_click_state(click_state, out['generated_captions']['raw_caption'], click_mode)
787
  text = out['generated_captions']['raw_caption']
 
805
 
806
  print("new crop save",new_crop_save_path)
807
 
808
+ yield state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
809
 
810
 
811
 
812
+ async def submit_caption(state,length, sentiment, factuality, language,
813
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
814
  autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path):
815
  print("state",state)
 
851
  # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
852
  # input_points=input_points, input_labels=input_labels)
853
  try:
854
+ if autoplay==False:
855
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None
856
+
857
  audio_output = await texttospeech(read_info, language, autoplay)
858
  print("done")
859
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
 
866
  return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
867
 
868
  else:
869
+ state = state + [(None, f"Error during TTS prediction: {str(e)}")]
870
+ print(f"Error during TTS prediction: {str(e)}")
871
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None
872
+
873
874
 
875
  def generate_prompt(focus_type, paragraph,length, sentiment, factuality, language):
876
 
 
1072
  state = state + [(None, f"Error during TTS prediction: {str(e)}")]
1073
  print(f"Error during TTS prediction: {str(e)}")
1074
  # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
1075
+ return state, state, image_input,audio_output,crop_save_path
1076
 
1077
 
1078
  else:
 
1225
  # return state,dislike_res
1226
 
1227
 
1228
+ # def get_style():
1229
+ # current_version = version.parse(gr.__version__)
1230
+ # print(current_version)
1231
+ # if current_version <= version.parse('3.24.1'):
1232
+ # style = '''
1233
+ # #image_sketcher{min-height:500px}
1234
+ # #image_sketcher [data-testid="image"], #image_sketcher [data-testid="image"] > div{min-height: 500px}
1235
+ # #image_upload{min-height:500px}
1236
+ # #image_upload [data-testid="image"], #image_upload [data-testid="image"] > div{min-height: 500px}
1237
+ # .custom-language {
1238
+ # width: 20%;
1239
+ # }
1240
+
1241
+ # .custom-autoplay {
1242
+ # width: 40%;
1243
+ # }
1244
+
1245
+ # .custom-output {
1246
+ # width: 30%;
1247
+ # }
1248
+
1249
+ # '''
1250
+ # elif current_version <= version.parse('3.27'):
1251
+ # style = '''
1252
+ # #image_sketcher{min-height:500px}
1253
+ # #image_upload{min-height:500px}
1254
+ # .custom-language {
1255
+ # width: 20%;
1256
+ # }
1257
+
1258
+ # .custom-autoplay {
1259
+ # width: 40%;
1260
+ # }
1261
+
1262
+ # .custom-output {
1263
+ # width: 30%;
1264
+ # }
1265
+ # .custom-gallery {
1266
+ # display: flex;
1267
+ # flex-wrap: wrap;
1268
+ # justify-content: space-between;
1269
+ # }
1270
+
1271
+ # .custom-gallery img {
1272
+ # width: 48%;
1273
+ # margin-bottom: 10px;
1274
+ # }
1275
+ # '''
1276
+ # else:
1277
+ # style = None
1278
 
1279
+ # return style
1280
 
1281
  # def handle_like_dislike(like_data, like_state, dislike_state):
1282
  # if like_data.liked:
 
1326
  dislike_res.append(x.value)
1327
  state = state + [(None, f"Disliked Received 👎")]
1328
  return like_res,dislike_res,state
1329
+
1330
+
1331
+ def toggle_icons_and_update_prompt(point_prompt):
1332
+ new_prompt = "Negative" if point_prompt == "Positive" else "Positive"
1333
+ new_add_icon = "assets/icons/plus-square-blue.png" if point_prompt == "Positive" else "assets/icons/plus-square.png"
1334
+ new_minus_icon = "assets/icons/minus-square.png" if point_prompt == "Positive" else "assets/icons/minus-square-blue.png"
1335
+ print(point_prompt)
1336
+ print(new_prompt)
1337
 
1338
+ return new_prompt, gr.update(icon=new_add_icon), gr.update(icon=new_minus_icon)
1339
+
1340
+ add_icon_path="assets/icons/plus-square-blue.png"
1341
+ minus_icon_path="assets/icons/minus-square.png"
1342
+
1343
+ print("this is a print test")
1344
 
1345
  def create_ui():
1346
  title = """<p><h1 align="center">EyeSee Anything in Art</h1></p>
 
1360
  ]
1361
 
1362
  with gr.Blocks(
1363
+ css=css,
1364
  theme=gr.themes.Base()
1365
  ) as iface:
1366
  state = gr.State([])
 
1385
  dislike_res=gr.State([])
1386
  gr.Markdown(title)
1387
  gr.Markdown(description)
1388
+ point_prompt = gr.State("Positive")
1389
  # with gr.Row(align="right", visible=False, elem_id="top_row") as top_row:
1390
  # with gr.Column(scale=0.5):
1391
  # # gr.Markdown("Left side content")
 
1408
  value="English", label="Language", interactive=True, elem_classes="custom-language"
1409
  )
1410
  length = gr.Slider(
1411
+ minimum=40,
1412
+ maximum=200,
1413
+ value=80,
1414
  step=1,
1415
  interactive=True,
1416
  label="Generated Caption Length",
 
1432
  # auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
1433
  # output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
1434
 
1435
+ with gr.Row():
 
1436
  with gr.Column(scale=6):
1437
  with gr.Column(visible=False) as modules_not_need_gpt:
1438
  with gr.Tab("Base(GPT Power)") as base_tab:
 
1441
  name_label_base = gr.Button(value="Name: ")
1442
  artist_label_base = gr.Button(value="Artist: ")
1443
  year_label_base = gr.Button(value="Year: ")
1444
+ material_label_base = gr.Button(value="Style: ")
1445
 
1446
  with gr.Tab("Base2") as base_tab2:
1447
  image_input_base_2 = gr.Image(type="pil", interactive=True, elem_id="image_upload")
 
1449
  name_label_base2 = gr.Button(value="Name: ")
1450
  artist_label_base2 = gr.Button(value="Artist: ")
1451
  year_label_base2 = gr.Button(value="Year: ")
1452
+ material_label_base2 = gr.Button(value="Style: ")
1453
 
1454
  with gr.Tab("Click") as click_tab:
 
 
 
 
 
 
 
 
1455
  with gr.Row():
1456
+ with gr.Column(scale=10,min_width=450):
1457
+ image_input = gr.Image(type="pil", interactive=True, elem_id="image_upload")
1458
+ example_image = gr.Image(type="pil", interactive=False, visible=False)
1459
  with gr.Row():
1460
+ name_label = gr.Button(value="Name: ")
1461
+ artist_label = gr.Button(value="Artist: ")
1462
+ year_label = gr.Button(value="Year: ")
1463
+ material_label = gr.Button(value="Style: ")
1464
+
1465
+
1466
+ # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
1467
+ # the tool column
1468
+ with gr.Column(scale=1,elem_id="tool_box",min_width=100):
1469
+ add_button = gr.Button(value="", interactive=True,elem_classes="tools_button",icon=add_icon_path)
1470
+ minus_button = gr.Button(value="", interactive=True,elem_classes="tools_button",icon=minus_icon_path)
1471
+ clear_button_click = gr.Button(value="Reset", interactive=True,elem_classes="tools_button")
1472
+ clear_button_image = gr.Button(value="Change Image", interactive=True,elem_classes="tools_button")
1473
+ focus_d = gr.Button(value="D",interactive=True,elem_classes="function_button")
1474
+ focus_da = gr.Button(value="DA",interactive=True,elem_classes="function_button")
1475
+ focus_dai = gr.Button(value="DAI",interactive=True,elem_classes="function_button")
1476
+ focus_dda = gr.Button(value="DDA",interactive=True,elem_classes="function_button")
1477
+ recommend_btn = gr.Button(value="Recommend",interactive=True,elem_classes="function_button")
1478
+
1479
+ with gr.Row(visible=False):
1480
+ with gr.Column():
1481
  with gr.Row():
1482
+ # point_prompt = gr.Radio(
1483
+ # choices=["Positive", "Negative"],
1484
+ # value="Positive",
1485
+ # label="Point Prompt",
1486
+ # scale=5,
1487
+ # interactive=True)
1488
  click_mode = gr.Radio(
1489
  choices=["Continuous", "Single"],
1490
  value="Continuous",
1491
  label="Clicking Mode",
1492
  scale=5,
1493
  interactive=True)
1494
+
1495
+
1496
+ with gr.Tab("Trajectory (beta)", visible=False) as traj_tab:
 
 
 
 
 
1497
  # sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=10,
1498
  # elem_id="image_sketcher")
1499
+ sketcher_input = gr.ImageEditor(type="pil", interactive=True
1500
+ )
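# Editor's note (not in the original commit): swapping ImageSketcher for
# gr.ImageEditor changes the component's value shape. In Gradio 4.x an
# ImageEditor with type="pil" is expected to yield a dict-like value with
# "background", "layers" and "composite" entries rather than a single PIL
# image, so any downstream handler (e.g. the commented-out
# submit_button_sketcher wiring further below) would presumably need to read
# the flattened sketch, roughly like this hypothetical helper:
#
#     def get_sketch_image(editor_value):
#         """Return the flattened sketch as a PIL image (assumption)."""
#         return editor_value["composite"] if isinstance(editor_value, dict) else editor_value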
  with gr.Row():
  name_label_traj = gr.Button(value="Name: ")
  artist_label_traj = gr.Button(value="Artist: ")

  with gr.Row():
  clear_button_sketcher = gr.Button(value="Clear Sketch", interactive=True)
  submit_button_sketcher = gr.Button(value="Submit", interactive=True)

  with gr.Column(visible=False,scale=4) as modules_need_gpt1:
+ with gr.Row(visible=False):
  sentiment = gr.Radio(
  choices=["Positive", "Natural", "Negative"],
  value="Natural",
  label="Sentiment",
  interactive=True,
  )
+
  factuality = gr.Radio(
  choices=["Factual", "Imagination"],
  value="Factual",

  value="No",
  label="Expert",
  interactive=True)
+
  with gr.Column(visible=True) as modules_not_need_gpt3:
  gr.Examples(
  examples=examples,

+ with gr.Column(scale=4):
  with gr.Column(visible=True) as module_key_input:
  openai_api_key = gr.Textbox(
  placeholder="Input openAI API key",

  with gr.Column(visible=False) as modules_not_need_gpt2:
  with gr.Blocks():
+ chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True,height=600,bubble_full_width=False)
  with gr.Column(visible=False) as modules_need_gpt3:
+ chat_input = gr.MultimodalTextbox(interactive=True, file_types=[".txt"], placeholder="Message EyeSee...", show_label=False)
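# Editor's note (not in the original commit): gr.MultimodalTextbox submits a
# dict-like value, typically {"text": ..., "files": [...]}, so the
# chat_input_callback wired further below is assumed to unpack the text and
# any attached .txt files itself. The Chatbot above is created with
# likeable=True, which pairs with the currently commented-out
# chatbot.like(print_like_dislike, ...) hook near the end of this diff; a
# minimal sketch of such a handler, under that assumption:
#
#     def _example_like_handler(like_data: gr.LikeData):
#         # like_data.index / like_data.liked identify the rated message
#         print(like_data.index, like_data.liked)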
  with gr.Row():
+ clear_button_text = gr.Button(value="Clear Chat", interactive=True)
+ export_button = gr.Button(value="Export Chat Log", interactive=True, variant="primary")
+ # submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
  # upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
  # downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
+

  # TTS interface hidden initially
  with gr.Column(visible=False) as tts_interface:

  # show_share_button=True,
  # show_download_button=True
  )
+
+ with gr.Row():
+ naritive = gr.Radio(
+ choices=["Third", "Artist","Item"],
+ value="Third",
+ label="Narrative",
+ scale=5,
+ interactive=True)
+ chat_log_file = gr.File(label="Download Chat Log",scale=5)

  # outputs=[result, seed],
  # api_name="run",
  # )
+ recommend_btn.click(
  fn=infer,
  inputs=[new_crop_save_path],
  outputs=[result]

  # this part is for 3d generate.
  ###############################################################################

+ # with gr.Row(variant="panel",visible=False) as d3_model:
+ # with gr.Column():
+ # with gr.Row():
+ # input_image = gr.Image(
+ # label="Input Image",
+ # image_mode="RGBA",
+ # sources="upload",
+ # #width=256,
+ # #height=256,
+ # type="pil",
+ # elem_id="content_image",
+ # )
+ # processed_image = gr.Image(
+ # label="Processed Image",
+ # image_mode="RGBA",
+ # #width=256,
+ # #height=256,
+ # type="pil",
+ # interactive=False
+ # )
+ # with gr.Row():
+ # with gr.Group():
+ # do_remove_background = gr.Checkbox(
+ # label="Remove Background", value=True
+ # )
+ # sample_seed = gr.Number(value=42, label="Seed Value", precision=0)

+ # sample_steps = gr.Slider(
+ # label="Sample Steps",
+ # minimum=30,
+ # maximum=75,
+ # value=75,
+ # step=5
+ # )

+ # with gr.Row():
+ # submit = gr.Button("Generate", elem_id="generate", variant="primary")

+ # with gr.Row(variant="panel"):
+ # gr.Examples(
+ # examples=[
+ # os.path.join("examples", img_name) for img_name in sorted(os.listdir("examples"))
+ # ],
+ # inputs=[input_image],
+ # label="Examples",
+ # cache_examples=False,
+ # examples_per_page=16
+ # )
+
+ # with gr.Column():

+ # with gr.Row():

+ # with gr.Column():
+ # mv_show_images = gr.Image(
+ # label="Generated Multi-views",
+ # type="pil",
+ # width=379,
+ # interactive=False
+ # )

+ # # with gr.Column():
+ # # output_video = gr.Video(
+ # # label="video", format="mp4",
+ # # width=379,
+ # # autoplay=True,
+ # # interactive=False
+ # # )

+ # with gr.Row():
+ # with gr.Tab("OBJ"):
+ # output_model_obj = gr.Model3D(
+ # label="Output Model (OBJ Format)",
+ # interactive=False,
+ # )
+ # gr.Markdown("Note: Downloaded .obj model will be flipped. Export .glb instead or manually flip it before usage.")
+ # with gr.Tab("GLB"):
+ # output_model_glb = gr.Model3D(
+ # label="Output Model (GLB Format)",
+ # interactive=False,
+ # )
+ # gr.Markdown("Note: The model shown here has a darker appearance. Download to get correct results.")

+ # mv_images = gr.State()

+ # chatbot.like(print_like_dislike, inputs=[like_res,dislike_res,state], outputs=[like_res,dislike_res,chatbot])

+ # submit.click(fn=check_input_image, inputs=[new_crop_save_path], outputs=[processed_image]).success(
+ # fn=generate_mvs,
+ # inputs=[processed_image, sample_steps, sample_seed],
+ # outputs=[mv_images, mv_show_images]

+ # ).success(
+ # fn=make3d,
+ # inputs=[mv_images],
+ # outputs=[output_model_obj, output_model_glb]
+ # )

  ###############################################################################
  # above part is for 3d generate.

+ # clear_button_sketcher.click(
+ # lambda x: (x),
+ # [origin_image],
+ # [sketcher_input],
+ # queue=False,
+ # show_progress=False
+ # )

  openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
+ modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row])
  enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
  modules_not_need_gpt,
+ modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row])
  # openai_api_key.submit(init_openai_api_key,
  # outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
  # modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])

  disable_chatGPT_button.click(init_wo_openai_api_key,
  outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
  modules_not_need_gpt,
+ modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row])

  artist_label_base2.click(
  get_artistinfo,

  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
  paragraph,artist])

+ # image_input_base_2.upload(upload_callback, [image_input_base_2, state, visual_chatgpt,openai_api_key],
+ # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
+ # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
+ # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+ # paragraph,artist])

+ # image_input.upload(upload_callback, [image_input, state, visual_chatgpt,openai_api_key],
+ # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
+ # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
+ # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+ # paragraph,artist])

+ # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key],
+ # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
+ # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
+ # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
+ # paragraph,artist])

  # image_input.upload(upload_callback, [image_input, state, visual_chatgpt, openai_api_key],
  # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,

  chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
  [chatbot, state, aux_state,output_audio])
  chat_input.submit(lambda: "", None, chat_input)
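# Editor's note (not in the original commit): chat_input deliberately has two
# submit listeners. The first routes the message through chat_input_callback;
# the second, chat_input.submit(lambda: "", None, chat_input), simply clears
# the textbox after sending. A hypothetical equivalent that makes the ordering
# explicit would chain the clear step with .then():
#
#     chat_input.submit(
#         chat_input_callback,
#         [visual_chatgpt, chat_input, click_state, state, aux_state, language, auto_play],
#         [chatbot, state, aux_state, output_audio],
#     ).then(lambda: "", None, chat_input)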
+ # submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
+ # [chatbot, state, aux_state,output_audio])
+ # submit_button_text.click(lambda: "", None, chat_input)
  example_image.change(upload_callback, [example_image, state, visual_chatgpt, openai_api_key],
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \

  image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
  ],
+ outputs=[chatbot, state, click_state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground],
  show_progress=False, queue=True
  )

+ focus_d.click(
  submit_caption,
  inputs=[
+ state,length, sentiment, factuality, language,
+ out_state, click_index_state, input_mask_state, input_points_state, input_labels_state, auto_play, paragraph,focus_d,openai_api_key,new_crop_save_path
  ],
  outputs=[
+ chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio
  ],
  show_progress=True,
  queue=True
  )
+
+ focus_da.click(
+ submit_caption,
+ inputs=[
+ state,length, sentiment, factuality, language,
+ out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,auto_play, paragraph,focus_da,openai_api_key,new_crop_save_path
+ ],
+ outputs=[
+ chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio
+ ],
+ show_progress=True,
+ queue=True
+ )
+
+ focus_dai.click(
+ submit_caption,
+ inputs=[
+ state,length, sentiment, factuality, language,
+ out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
+ auto_play, paragraph,focus_dai,openai_api_key,new_crop_save_path
+ ],
+ outputs=[
+ chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio
+ ],
+ show_progress=True,
+ queue=True
  )

+
+ focus_dda.click(
+ submit_caption,
+ inputs=[
+ state,length, sentiment, factuality, language,
+ out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
+ auto_play, paragraph,focus_dda,openai_api_key,new_crop_save_path
+ ],
+ outputs=[
+ chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,output_audio
+ ],
+ show_progress=True,
+ queue=True
+ )
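# Editor's note (not in the original commit): the four focus buttons (D, DA,
# DAI, DDA) all route through the same submit_caption handler; the only
# difference is which button component is passed as the focus argument, so the
# handler can read the clicked button's value to pick the caption focus. A
# hypothetical, behaviour-preserving way to register the four listeners in one
# loop (assuming the same surrounding gr.Blocks scope and component names):
#
#     for focus_btn in (focus_d, focus_da, focus_dai, focus_dda):
#         focus_btn.click(
#             submit_caption,
#             inputs=[state, length, sentiment, factuality, language,
#                     out_state, click_index_state, input_mask_state,
#                     input_points_state, input_labels_state,
#                     auto_play, paragraph, focus_btn, openai_api_key,
#                     new_crop_save_path],
#             outputs=[chatbot, state, click_index_state, input_mask_state,
#                      input_points_state, input_labels_state, out_state,
#                      output_audio],
#             show_progress=True, queue=True,
#         )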
+
+ add_button.click(
+ toggle_icons_and_update_prompt,
+ inputs=[point_prompt],
+ outputs=[point_prompt,add_button,minus_button],
+ show_progress=True,
+ queue=True
+ )
+
+ minus_button.click(
+ toggle_icons_and_update_prompt,
+ inputs=[point_prompt],
+ outputs=[point_prompt,add_button,minus_button],
+ show_progress=True,
+ queue=True
+ )
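# Editor's note (not in the original commit): toggle_icons_and_update_prompt is
# defined elsewhere in app.py; from this wiring it is assumed to flip the
# point_prompt state between "Positive" and "Negative" (the point label used
# when clicking on the image) and to refresh the add/minus button icons
# accordingly. A rough, hypothetical sketch of such a function:
#
#     def _example_toggle_point_prompt(current_prompt):
#         new_prompt = "Negative" if current_prompt == "Positive" else "Positive"
#         # Return updates for [point_prompt, add_button, minus_button];
#         # the icon paths stand in for add_icon_path / minus_icon_path above.
#         return (new_prompt,
#                 gr.update(icon=add_icon_path),
#                 gr.update(icon=minus_icon_path))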
+
+ # submit_button_sketcher.click(
+ # inference_traject,
+ # inputs=[
+ # origin_image,sketcher_input, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
+ # original_size, input_size, text_refiner,focus_type_sketch,paragraph,openai_api_key,auto_play,Input_sketch
+ # ],
+ # outputs=[chatbot, state, sketcher_input,output_audio,new_crop_save_path],
+ # show_progress=False, queue=True
+ # )
+
  export_button.click(
  export_chat_log,
  inputs=[state,paragraph,like_res,dislike_res],

  iface = create_ui()
  iface.queue(api_open=False, max_size=10)
  # iface.queue(concurrency_count=5, api_open=False, max_size=10)
+ iface.launch(server_name="0.0.0.0",show_error=True)
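# Editor's note (not in the original commit): server_name="0.0.0.0" binds the
# app to all network interfaces, which is the usual setting for Hugging Face
# Spaces / Docker deployments. For local runs, a fixed port or a public share
# link could be requested with the standard Gradio launch arguments, e.g.:
#
#     iface.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)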