EyeSee_chi

Running

File size: 42,265 Bytes

ee87a3a
d29ec53
ee87a3a
 
 
 
f2e1e32
ee87a3a
 
 
f950d25
ee87a3a
 
 
 
 
 
 
 
 
 
cf1091a
 
 
5ac1132
cf1091a
 
 
 
 
 
ee87a3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f950d25
 
 
 
 
 
 
 
 
 
 
 
 
05b6ef0
ee87a3a
 
 
f950d25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee87a3a
 
5ac1132
 
 
 
ee87a3a
d29ec53
cf1091a
ee87a3a
 
d29ec53
 
5ac1132
ee87a3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b9eedbd
ee87a3a
 
 
 
 
d29ec53
ee87a3a
 
 
 
 
 
 
 
 
 
 
 
d29ec53
ee87a3a
 
 
 
 
 
 
 
 
 
d29ec53
 
 
 
b9eedbd
d29ec53
ee87a3a
 
d29ec53
 
 
ee87a3a
c5a524a
 
ee87a3a
 
f950d25
ee87a3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d29ec53
ee87a3a
 
 
c5a524a
 
ee87a3a
 
 
f950d25
 
 
 
 
 
 
ee87a3a
c5a524a
 
 
 
 
ee87a3a
f2e1e32
 
b9eedbd
f2e1e32
b9eedbd
f2e1e32
 
 
 
f950d25
cf1091a
b9eedbd
f950d25
 
 
 
 
 
 
d29ec53
 
 
 
 
f950d25
d29ec53
 
f950d25
 
 
 
 
 
 
 
d29ec53
b9eedbd
 
 
 
d29ec53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b9eedbd
d29ec53
 
 
 
 
 
 
 
f950d25
f2e1e32
 
 
 
f950d25
 
 
b9eedbd
 
 
d29ec53
 
 
 
f950d25
cf1091a
d29ec53
abf795d
cf1091a
 
 
abf795d
f950d25
cf1091a
 
 
abf795d
cf1091a
 
 
abf795d
f950d25
f2e1e32
d29ec53
 
 
 
 
 
 
 
 
f2e1e32
d29ec53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f2e1e32
 
ee87a3a
2a1fba2
ee87a3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f2e1e32
ee87a3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
abf795d
ee87a3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
abf795d
 
ee87a3a
d29ec53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee87a3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7e57ef3
ee87a3a
7e57ef3
ee87a3a
 
5cd1774
 
 
ee87a3a
 
 
5cd1774
ee87a3a
 
 
 
 
 
f950d25
ee87a3a
 
 
 
 
 
 
f2e1e32
d29ec53
ee87a3a
f950d25
 
 
 
b9eedbd
d29ec53
5ac1132
ee87a3a
 
 
 
 
 
 
abf795d
d29ec53
abf795d
 
d29ec53
abf795d
d29ec53
ee87a3a
 
d29ec53
 
 
 
 
 
ee87a3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f2e1e32
ee87a3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf1091a
ee87a3a
 
 
 
 
d29ec53
ee87a3a
 
 
 
 
cf1091a
ee87a3a
 
 
 
 
 
 
 
 
 
 
 
 
 
abf795d
 
 
 
 
 
 
ee87a3a
 
 
 
 
 
 
 
abf795d
 
cf1091a
 
 
 
 
 
 
 
 
5ac1132
cf1091a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee87a3a
cf1091a
05b6ef0
ee87a3a
5ac1132
05b6ef0
ee87a3a
 
5ac1132
ee87a3a
 
 
cf1091a
ee87a3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
abf795d
 
ee87a3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
abf795d
d29ec53
ee87a3a
 
d29ec53
 
 
 
 
 
 
 
 
 
ee87a3a
 
 
 
 
 
d29ec53
 
 
 
ee87a3a
f2e1e32
abf795d
 
 
 
 
 
 
5ac1132
abf795d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f2e1e32
 
 
 
f950d25
 
f2e1e32
b9eedbd
f2e1e32
 
 
f950d25
f2e1e32
cf1091a
 
 
 
b9eedbd
cf1091a
 
 
 
 
 
 
 
f2e1e32
 
ee87a3a
 
 
 
 
 
 
 
 
 
 
abf795d
5ac1132
cf1091a
 
ee87a3a
 
 
 
 
 
5ac1132

import os
import base64
import json
import gradio as gr
import numpy as np
from gradio import processing_utils
import requests
from packaging import version
from PIL import Image, ImageDraw
import functools
from langchain.llms.openai import OpenAI
from caption_anything.model import CaptionAnything
from caption_anything.utils.image_editing_utils import create_bubble_frame
from caption_anything.utils.utils import mask_painter, seg_model_map, prepare_segmenter, image_resize
from caption_anything.utils.parser import parse_augment
from caption_anything.captioner import build_captioner
from caption_anything.text_refiner import build_text_refiner
from caption_anything.segmenter import build_segmenter
from caption_anything.utils.chatbot import ConversationBot, build_chatbot_tools, get_new_image_name
from segment_anything import sam_model_registry
import easyocr
import tts 


# Module-level flag toggled by the API-key callbacks below:
# 1 after a key has been accepted, 0 otherwise.
gpt_state = 0

# HTML snippet with the Coqui model licence notice.
# NOTE(review): not referenced in the part of the file visible here; presumably rendered by the UI.
article = """
<div style='margin:20px auto;'>
<p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
</div>
"""

# Parse the project options, then override the segmenter settings for this demo.
args = parse_augment()
args.segmenter = "huge"
args.segmenter_checkpoint = "sam_vit_h_4b8939.pth"
args.clip_filter = True
# NOTE(review): the checkpoint is assigned unconditionally two lines above,
# so the `is None` branch below cannot run as written.
if args.segmenter_checkpoint is None:
    _, segmenter_checkpoint = prepare_segmenter(args.segmenter)
else:
    segmenter_checkpoint = args.segmenter_checkpoint
    
# The models below are built once at import time and passed to every
# build_caption_anything_with_models() call made by the callbacks.
shared_captioner = build_captioner(args.captioner, args.device, args)
shared_sam_model = sam_model_registry[seg_model_map[args.segmenter]](checkpoint=segmenter_checkpoint).to(args.device)
# EasyOCR language codes (presumably Traditional Chinese plus English; confirm against EasyOCR docs).
ocr_lang = ["ch_tra", "en"]
shared_ocr_reader = easyocr.Reader(ocr_lang)
# args.chat_tools_dict is a comma-separated list; each entry is split on '_' and
# its first two pieces become a key/value pair (e.g. "A_b,C_d" -> {"A": "b", "C": "d"}).
tools_dict = {e.split('_')[0].strip(): e.split('_')[1].strip() for e in args.chat_tools_dict.split(',')}
shared_chatbot_tools = build_chatbot_tools(tools_dict)


class ImageSketcher(gr.Image):
    """
    gr.Image variant pinned to the 'sketch' tool.

    Works around gradio.Image failing to upload when tool == 'sketch' by
    supplying a blank mask whenever the incoming payload has none.
    """

    is_template = True  # Magic to make this work with gradio.Block, don't remove unless you know what you're doing.

    def __init__(self, **kwargs):
        super().__init__(tool="sketch", **kwargs)

    def preprocess(self, x):
        """Fill in a missing mask for uploaded/webcam sketches, then defer to gr.Image."""
        sketching_on_upload = self.tool == 'sketch' and self.source in ("upload", "webcam")
        if sketching_on_upload:
            assert isinstance(x, dict)
            if x['mask'] is None:
                picture = processing_utils.decode_base64_to_image(x['image'])
                w, h = picture.size
                # RGBA canvas: colour channels zeroed, alpha channel fully set.
                blank = np.zeros((h, w, 4), dtype=np.uint8)
                blank[..., -1] = 255
                x['mask'] = self.postprocess(blank)
        return super().preprocess(x)


def build_caption_anything_with_models(args, api_key="", captioner=None, sam_model=None, ocr_reader=None, text_refiner=None,
                                       session_id=None):
    """Assemble a CaptionAnything instance around already-loaded models.

    A segmenter is built from ``sam_model``; the captioner, OCR reader and
    text refiner are forwarded unchanged. When ``session_id`` is given, a
    one-line notice is printed.
    """
    sam_segmenter = build_segmenter(args.segmenter, args.device, args, model=sam_model)
    if session_id is not None:
        print(f'Init caption anything for session {session_id}')
    return CaptionAnything(
        args,
        api_key,
        captioner=captioner,
        segmenter=sam_segmenter,
        ocr_reader=ocr_reader,
        text_refiner=text_refiner,
    )


def validate_api_key(api_key):
    """Check an OpenAI API key by issuing one trivial completion.

    Args:
        api_key: Candidate key; it is stringified and stripped before use.

    Returns:
        True when the test call completes, False when anything raises.

    The key itself is never written to stdout: it is a secret and the
    process log is typically visible to operators of the hosting service.
    """
    api_key = str(api_key).strip()
    try:
        test_llm = OpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=api_key)
        test_llm("Test API call")
    except Exception as e:
        # Any failure (bad key, network, quota) counts as "not usable".
        print(f"API key validation failed: {e}")
        return False
    return True


def init_openai_api_key(api_key=""):
    """Initialise the GPT-backed helpers from a user-supplied OpenAI key.

    The key must be longer than 30 characters and pass validate_api_key().
    On success a text refiner and a ConversationBot are built and the
    module-level ``gpt_state`` flag is set to 1; otherwise both stay None
    and ``gpt_state`` is set to 0.

    Returns:
        A 12-item list: nine ``gr.update`` visibility toggles followed by
        the text refiner, the chatbot and an error message (None on success).

    The key is never printed, since it is a secret.
    """
    # Declared up front: `global` applies to the whole function body anyway.
    global gpt_state

    text_refiner = None
    visual_chatgpt = None
    if api_key and len(api_key) > 30:
        if validate_api_key(api_key):
            try:
                text_refiner = build_text_refiner(args.text_refiner, args.device, args, api_key)
                # Smoke test; an explicit raise survives `python -O`, unlike assert.
                if not len(text_refiner.llm('hi')) > 0:
                    raise RuntimeError("text refiner returned an empty reply")
                visual_chatgpt = ConversationBot(shared_chatbot_tools, api_key)
            except Exception as e:
                print(f"Error initializing TextRefiner or ConversationBot: {e}")
                text_refiner = None
                visual_chatgpt = None
        else:
            print("Invalid API key.")
    else:
        print("API key is too short.")

    openai_available = text_refiner is not None
    if openai_available:
        gpt_state = 1
        return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=True)]+ [gr.update(visible=False)]*2 + [text_refiner, visual_chatgpt, None]
    else:
        gpt_state = 0
        return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']
        
def init_wo_openai_api_key():
    """Switch the UI to the no-GPT layout and clear the GPT-related state.

    Sets the module-level ``gpt_state`` flag to 0 and returns nine
    ``gr.update`` visibility toggles followed by three None values.
    """
    global gpt_state
    gpt_state = 0

    visibility_updates = (
        [gr.update(visible=False)] * 4
        + [gr.update(visible=True)]
        + [gr.update(visible=False)]
        + [gr.update(visible=True)]
        + [gr.update(visible=False)] * 2
    )
    cleared_state = [None, None, None]
    return visibility_updates + cleared_state

def get_click_prompt(chat_input, click_state, click_mode):
    """Turn a JSON list of ``[x, y, label]`` clicks into a click prompt dict.

    In 'Continuous' mode the clicks are appended to the lists already held in
    ``click_state[0]`` / ``click_state[1]``; in 'Single' mode those two slots
    are replaced by fresh lists. Any other mode raises NotImplementedError.
    """
    clicks = json.loads(chat_input)

    if click_mode not in ('Continuous', 'Single'):
        raise NotImplementedError

    accumulate = click_mode == 'Continuous'
    if accumulate:
        coords = click_state[0]
        flags = click_state[1]
    else:
        coords, flags = [], []

    for click in clicks:
        coords.append(click[:2])
        flags.append(click[2])

    if not accumulate:
        click_state[0] = coords
        click_state[1] = flags

    return {
        "prompt_type": ["click"],
        "input_point": click_state[0],
        "input_label": click_state[1],
        "multimask_output": "True",
    }


def update_click_state(click_state, caption, click_mode):
    """Record ``caption`` in ``click_state[2]``.

    'Single' replaces the stored captions with ``[caption]``; 'Continuous'
    appends to the existing list; any other mode raises NotImplementedError.
    """
    if click_mode == 'Single':
        click_state[2] = [caption]
        return
    if click_mode != 'Continuous':
        raise NotImplementedError
    click_state[2].append(caption)

def chat_input_callback(*args):
    """Route a chat message to the chatbot, or answer with a setup hint.

    Expects exactly five positional values: the chatbot (or None), the user
    message, the click state, the chat history and the auxiliary state.
    Without a chatbot, a fixed reply is added to a copy of the history and
    that list is returned twice.
    """
    visual_chatgpt, chat_input, click_state, state, aux_state = args
    if visual_chatgpt is None:
        fallback = "Text refiner is not initilzed, please input openai api key."
        history = state + [(chat_input, fallback)]
        return history, history
    return visual_chatgpt.run_text(chat_input, state, aux_state)


        
def _parse_artwork_metadata(raw_reply):
    """Best-effort extraction of (name, artist, year, material) from a model reply.

    The reply is expected to hold one JSON object. It is parsed as-is first;
    only if that fails are single quotes swapped for double quotes, so valid
    JSON containing an apostrophe is not corrupted. Missing fields, or a reply
    that cannot be parsed, yield the string "None".
    """
    fields = ("name", "artist", "year", "material")
    data = {}
    if isinstance(raw_reply, str):
        # Ignore anything around the outermost braces (e.g. markdown fences).
        start, end = raw_reply.find("{"), raw_reply.rfind("}")
        candidate = raw_reply[start:end + 1] if 0 <= start < end else raw_reply
        for text in (candidate, candidate.replace("'", "\"")):
            try:
                loaded = json.loads(text)
            except json.JSONDecodeError:
                continue
            if isinstance(loaded, dict):
                data = loaded
                break
    return tuple(data.get(field, "None") for field in fields)


def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None,language="English"):
    """Handle a newly uploaded image: resize it, embed it and describe it.

    Args:
        image_input: The uploaded image, or a dict with 'image' and 'mask'
            keys when it comes from the sketcher.
        state: Incoming chat history; it is replaced, not extended.
        visual_chatgpt: Chatbot instance, or None when GPT is unavailable.
        openai_api_key: Key forwarded to get_image_gpt().
        language: Language requested for the whole-image description.

    Returns:
        A 13-tuple: the new chat history (twice), the resized image, a fresh
        click state, the resized image three more times, the image embedding,
        the original and input sizes, the artwork-info HTML (twice) and the
        image description. The last three are empty strings when
        ``visual_chatgpt`` is None.
    """
    if isinstance(image_input, dict):  # if upload from sketcher_input, input contains image and mask
        image_input = image_input['image']

    click_state = [[], [], []]
    image_input = image_resize(image_input, res=1024)

    model = build_caption_anything_with_models(
        args,
        api_key="",
        captioner=shared_captioner,
        sam_model=shared_sam_model,
        ocr_reader=shared_ocr_reader,
        session_id=iface.app_id
    )
    model.segmenter.set_image(image_input)
    image_embedding = model.image_embedding
    original_size = model.original_size
    input_size = model.input_size

    # Defaults for the no-GPT path; previously these names were unbound there
    # and the return statement raised UnboundLocalError.
    artwork_info = ""
    paragraph = ""

    if visual_chatgpt is not None:
        print('upload_callback: add caption to chatGPT memory')
        new_image_path = get_new_image_name('chat_image', func_name='upload')
        image_input.save(new_image_path)
        visual_chatgpt.current_image = new_image_path
        img_caption = model.captioner.inference(image_input, filter=False, args={'text_prompt':''})['caption']
        Human_prompt = f'\nHuman: The description of the image with path {new_image_path} is: {img_caption}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
        AI_prompt = "Received."
        visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
        visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt 
        raw_metadata = get_image_gpt(openai_api_key, new_image_path,"Please provide the name, artist, year of creation, and material used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\",\"artist\": \"Name of the artist\", \"year\": \"Year of creation\", \"material\": \"Material used in the painting\" }.")
        name, artist, year, material = _parse_artwork_metadata(raw_metadata)
        artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
        paragraph = get_image_gpt(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
    
    state = [(None, 'Received new image, resize it to width {} and height {}: '.format(image_input.size[0], image_input.size[1]))]

    return state, state, image_input, click_state, image_input, image_input, image_input, image_embedding, \
        original_size, input_size, artwork_info,artwork_info,paragraph




def inference_click(image_input, point_prompt, click_mode, enable_wiki, language, sentiment, factuality,
                    length, image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
                    out_state, click_index_state, input_mask_state, input_points_state, input_labels_state, evt: gr.SelectData):
    """Segment and caption the region under a click on the image.

    The click position comes from ``evt.index``; ``point_prompt`` selects the
    label (1 for 'Positive', 0 otherwise). The raw caption is produced with
    GPT disabled; refinement happens later in submit_caption().

    Yields:
        One 11-tuple: the chat history (twice), the click state, the image
        with the mask painted on, the raw caption, the click index, the mask,
        the input points, the input labels, the raw inference output and the
        path of the saved crop. The crop path is None when ``visual_chatgpt``
        is None; previously that case raised UnboundLocalError.
    """
    click_index = evt.index

    label = 1 if point_prompt == 'Positive' else 0
    coordinate = "[[{}, {}, {}]]".format(str(click_index[0]), str(click_index[1]), label)

    prompt = get_click_prompt(coordinate, click_state, click_mode)
    input_points = prompt['input_point']
    input_labels = prompt['input_label']

    controls = {'length': length,
                'sentiment': sentiment,
                'factuality': factuality,
                'language': language}

    model = build_caption_anything_with_models(
        args,
        api_key="",
        captioner=shared_captioner,
        sam_model=shared_sam_model,
        ocr_reader=shared_ocr_reader,
        text_refiner=text_refiner,
        session_id=iface.app_id
    )

    model.setup(image_embedding, original_size, input_size, is_image_set=True)

    # Normalise the checkbox/radio value to a real bool.
    enable_wiki = enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes']
    out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]

    state = state + [("Image point: {}, Input label: {}".format(prompt["input_point"], prompt["input_label"]), None)]
    generated_caption = out['generated_captions']['raw_caption']
    update_click_state(click_state, generated_caption, click_mode)
    input_mask = np.array(out['mask'].convert('P'))
    image_input = mask_painter(np.array(image_input), input_mask)

    # Stays None unless a chatbot is present to receive the crop.
    new_crop_save_path = None
    if visual_chatgpt is not None:
        print('inference_click: add caption to chatGPT memory')
        new_crop_save_path = get_new_image_name('chat_image', func_name='crop')
        Image.open(out["crop_save_path"]).save(new_crop_save_path)
        region_prompt = f'You should primarly use tools on the selected regional image (description: {generated_caption}, path: {new_crop_save_path}), which is a part of the whole image (path: {visual_chatgpt.current_image}). If human mentioned some objects not in the selected region, you can use tools on the whole image.'
        visual_chatgpt.point_prompt = region_prompt

    print(generated_caption)
    print("new crop save",new_crop_save_path)

    yield state, state, click_state, image_input, generated_caption, click_index, input_mask, input_points, input_labels, out, new_crop_save_path




def _narrate(text, state, input_language, input_audio, input_mic, use_mic, agree):
    """Run TTS on ``text``; on failure log it, note it in the history and return no audio."""
    try:
        waveform_visual, audio_output = tts.predict(text, input_language, input_audio, input_mic, use_mic, agree)
    except Exception as e:
        state = state + [(None, f"Error during TTS prediction: {str(e)}")]
        print(f"Error during TTS prediction: {str(e)}")
        return state, None, None
    return state, waveform_visual, audio_output


def submit_caption(image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language, 
                   out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
                   input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key,new_crop_save_path):
    """Turn the raw caption of the selected region into a bubble image plus audio.

    When GPT is enabled and a text refiner exists, the saved crop is sent to
    get_image_gpt() with a prompt chosen by ``focus_type`` and the reply is
    used for the bubble and the narration; otherwise the raw caption is used.

    Returns:
        A 10-tuple: chat history (twice), the image to display, the click
        index, mask, points, labels and raw-output states (passed through
        unchanged), then the waveform visual and audio output (both None if
        TTS failed). With an empty ``generated_caption`` the inputs are
        returned unchanged with no audio; previously that case returned None.

    ``visual_chatgpt``, ``enable_wiki`` and ``input_text`` are accepted for
    interface compatibility but not read.
    """
    print("state",state)

    if not generated_caption:
        # Nothing has been captioned yet: keep every output slot populated.
        return (state, state, image_input, click_index_state, input_mask_state,
                input_points_state, input_labels_state, out_state, None, None)

    focus_map = {
        "Inside the Mark": 0,
        "Around the Mark": 1,
        "Outside the Mark": 2
    }

    mapped_value = focus_map.get(focus_type, -1)
    print("mapped value",mapped_value)

    controls = {
        'length': length,
        'sentiment': sentiment,
        'factuality': factuality,
        'language': language
    }

    prompt_list = [
        'Wiki_caption: {Wiki_caption}, you have to generate a caption according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
        'Wiki_caption: {Wiki_caption}, you have to select sentences from wiki caption that describe the surrounding objects that may be associated with the picture object. Around {length} words of {sentiment} sentiment in {language}.',
        'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.'
    ]

    if mapped_value != -1:
        prompt = prompt_list[mapped_value].format(
            Wiki_caption=paragraph,
            length=controls['length'],
            sentiment=controls['sentiment'],
            language=controls['language']
        )
    else:
        print("error prompting")
        prompt = "Invalid focus type."

    if controls['factuality'] == "Imagination":
        prompt += "Assuming that I am someone who has viewed a lot of art and has a lot of experience viewing art.  Explain artistic features (composition, color, style, or use of light) and discuss the symbolism of the content and its influence on later artistic movements"

    print("Prompt:", prompt)
    print("click",click_index_state)

    state = state + [(None, f"RAW_Caption: {generated_caption}")]

    if not args.disable_gpt and text_refiner:
        print("new crop save",new_crop_save_path)
        focus_info = get_image_gpt(openai_api_key,new_crop_save_path,prompt)

        state = state + [(None, f"Wiki: {paragraph}")]
        state = state + [(None, f"Focus_Caption: {focus_info}")]
        print("new_cap",focus_info)
        narrated_text = focus_info
    else:
        narrated_text = generated_caption

    # Only the frame that is actually returned gets rendered.
    framed_image = create_bubble_frame(np.array(image_input), narrated_text, click_index_state, input_mask_state,
                                       input_points=input_points_state, input_labels=input_labels_state)

    state, waveform_visual, audio_output = _narrate(narrated_text, state, input_language, input_audio,
                                                    input_mic, use_mic, agree)
    return (state, state, framed_image, click_index_state, input_mask_state,
            input_points_state, input_labels_state, out_state, waveform_visual, audio_output)


def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 text string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
    
def get_image_gpt(api_key, image_path,prompt,enable_wiki=None, timeout=60):
    """Send an image plus a text prompt to the OpenAI chat completions API.

    Args:
        api_key: OpenAI API key used as the bearer token.
        image_path: Path of the image file to attach.
        prompt: Text instruction sent alongside the image.
        enable_wiki: Unused; kept so existing callers keep working.
        timeout: Seconds to wait for the HTTP call. Without it a stalled
            connection would block the callback indefinitely.

    Returns:
        The text content of the first choice in the reply.

    Raises:
        RuntimeError: When the API reply carries an ``error`` object instead
            of choices (previously surfaced as an opaque KeyError).
    """
    import mimetypes

    # Getting the base64 string
    base64_image = encode_image(image_path)

    # Label the data URL with the file's real type; fall back to JPEG when
    # the extension is unknown or not an image type.
    mime_type = mimetypes.guess_type(image_path)[0]
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 300
    }

    # Sending the request to the OpenAI API
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload,
                             timeout=timeout)
    result = response.json()
    print(result)

    if isinstance(result, dict) and result.get("error"):
        error = result["error"]
        message = error.get("message", error) if isinstance(error, dict) else error
        raise RuntimeError(f"OpenAI API request failed: {message}")

    return result['choices'][0]['message']['content']




def get_sketch_prompt(mask: Image.Image):
    """
    Get the prompt for the sketcher.
    TODO: This is a temporary solution. We should cluster the sketch and get the bounding box of each cluster.

    Only the first channel of ``mask`` is inspected; every non-zero pixel is
    treated as part of the sketch.

    Returns:
        A dict with 'prompt_type' == ['box'] and 'input_boxes' holding one
        ``[x1, y1, x2, y2]`` box of plain Python ints (inclusive pixel bounds).

    Raises:
        ValueError: When the mask has no non-zero pixel, i.e. nothing was drawn.
    """

    mask = np.asarray(mask)[..., 0]

    # Get the bounding box of the sketch
    y, x = np.where(mask != 0)
    if x.size == 0:
        # Same exception type numpy would raise on the empty reduction, but
        # with a message that says what went wrong.
        raise ValueError("Sketch mask is empty: draw on the image before requesting a caption.")

    # Plain ints keep numpy scalar types out of the prompt and of any text
    # rendered from it.
    x1, y1 = int(np.min(x)), int(np.min(y))
    x2, y2 = int(np.max(x)), int(np.max(y))

    prompt = {
        'prompt_type': ['box'],
        'input_boxes': [
            [x1, y1, x2, y2]
        ]
    }

    return prompt


def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
                      original_size, input_size, text_refiner):
    """Caption the region enclosed by a sketch.

    The sketch mask is reduced to a bounding box, the box is segmented and
    captioned with GPT disabled, and a first result is yielded. If GPT is
    enabled and the model has a text refiner, a second result with the
    refined caption follows. ``state`` is extended in place for the first
    result.
    """
    picture = sketcher_image['image']
    sketch_mask = sketcher_image['mask']

    prompt = get_sketch_prompt(sketch_mask)
    boxes = prompt['input_boxes']

    controls = dict(length=length, sentiment=sentiment, factuality=factuality, language=language)

    model = build_caption_anything_with_models(
        args,
        api_key="",
        captioner=shared_captioner,
        sam_model=shared_sam_model,
        ocr_reader=shared_ocr_reader,
        text_refiner=text_refiner,
        session_id=iface.app_id
    )
    model.setup(image_embedding, original_size, input_size, is_image_set=True)

    # Normalise the UI value to a real bool.
    enable_wiki = enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes']
    out = model.inference(picture, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki)[0]

    # Update components and states
    state.append((f'Box: {boxes}', None))
    raw_caption = out["generated_captions"]["raw_caption"]
    state.append((None, f'raw_caption: {raw_caption}'))

    segment_mask = np.array(out['mask'].convert('P'))
    painted = mask_painter(np.array(picture), segment_mask)

    # Anchor the bubble at the centre of the box, as if it had been clicked.
    first_box = boxes[0]
    box_center = (int((first_box[0] + first_box[2]) / 2), int((first_box[1] + first_box[3]) / 2))

    yield state, state, create_bubble_frame(painted, "", box_center, segment_mask)

    if args.disable_gpt or not model.text_refiner:
        return

    refined = model.text_refiner.inference(query=raw_caption, controls=controls, context=out['context_captions'],
                                           enable_wiki=enable_wiki)
    new_cap = refined['caption']
    if refined['wiki']:
        state = state + [(None, "Wiki: {}".format(refined['wiki']))]
    state = state + [(None, f"caption: {new_cap}")]

    yield state, state, create_bubble_frame(painted, new_cap, box_center, segment_mask)

def clear_chat_memory(visual_chatgpt, keep_global=False):
    """Reset the chatbot's conversation memory; a None chatbot is a no-op.

    With ``keep_global`` the agent's memory buffer is re-seeded from the
    stored global prompt. Without it the current image and the global prompt
    are cleared instead.
    """
    if visual_chatgpt is None:
        return

    visual_chatgpt.memory.clear()
    visual_chatgpt.point_prompt = ""

    if not keep_global:
        visual_chatgpt.current_image = None
        visual_chatgpt.global_prompt = ""
        return

    visual_chatgpt.agent.memory.buffer = visual_chatgpt.global_prompt

def cap_everything(image_input, visual_chatgpt, text_refiner,input_language, input_audio, input_mic, use_mic, agree):
    """Caption the whole image, share the caption with the chatbot and narrate it.

    Args:
        image_input: Image passed to ``inference_cap_everything``.
        visual_chatgpt: Chatbot whose memory receives the caption, or None.
            With None the memory update is skipped, matching the guard used
            by the other callbacks in this file; previously this raised
            AttributeError.
        text_refiner: Text refiner forwarded to the model builder.
        input_language, input_audio, input_mic, use_mic, agree: Forwarded
            unchanged to ``tts.predict``.

    Returns:
        ``(paragraph, waveform_visual, audio_output)``.
    """
    model = build_caption_anything_with_models(
        args,
        api_key="",
        captioner=shared_captioner,
        sam_model=shared_sam_model,
        ocr_reader=shared_ocr_reader,
        text_refiner=text_refiner,
        session_id=iface.app_id
    )
    paragraph = model.inference_cap_everything(image_input, verbose=True)

    if visual_chatgpt is not None:
        Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
        AI_prompt = "Received."
        visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
        visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt 

    waveform_visual, audio_output = tts.predict(paragraph, input_language, input_audio, input_mic, use_mic, agree)
    return paragraph, waveform_visual, audio_output

def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragraph):
    """Caption the whole image and share the caption with the chatbot, without TTS.

    Args:
        image_input: Image passed to ``inference_cap_everything``.
        visual_chatgpt: Chatbot whose memory receives the caption, or None.
            With None the memory update is skipped, matching the guard used
            by the other callbacks in this file; previously this raised
            AttributeError.
        text_refiner: Text refiner forwarded to the model builder.
        paragraph: Incoming value is ignored; it is overwritten by the newly
            generated caption.

    Returns:
        The generated whole-image caption.
    """
    model = build_caption_anything_with_models(
        args,
        api_key="",
        captioner=shared_captioner,
        sam_model=shared_sam_model,
        ocr_reader=shared_ocr_reader,
        text_refiner=text_refiner,
        session_id=iface.app_id
    )
    paragraph = model.inference_cap_everything(image_input, verbose=True)

    if visual_chatgpt is not None:
        Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
        AI_prompt = "Received."
        visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
        visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt 

    return paragraph


    
def get_style():
    """Pick the CSS override matching the installed gradio version.

    Versions up to 3.24.1 get the full min-height rules, versions up to 3.27
    get the short form, and anything newer gets None.
    """
    installed = version.parse(gr.__version__)

    if installed <= version.parse('3.24.1'):
        return '''
        #image_sketcher{min-height:500px}
        #image_sketcher [data-testid="image"], #image_sketcher [data-testid="image"] > div{min-height: 500px}
        #image_upload{min-height:500px}
        #image_upload [data-testid="image"], #image_upload [data-testid="image"] > div{min-height: 500px}
        '''

    if installed <= version.parse('3.27'):
        return '''
        #image_sketcher{min-height:500px}
        #image_upload{min-height:500px}
        '''

    return None


def create_ui():
    """Assemble the Gradio ``Blocks`` app for EyeSee Anything in Art and return it.

    The function declares the per-session ``gr.State`` holders, lays out three
    columns (image tabs and caption controls, API-key / chat widgets, and the
    text-to-speech panel) and wires every widget event to its callback.

    Returns:
        The constructed ``gr.Blocks`` instance. This function neither queues
        nor launches it.
    """
    title = """<p><h1 align="center">EyeSee Anything in Art</h1></p>
    """
    description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """

    examples = [
        ["test_images/img36.webp"],
        ["test_images/MUS.png"],
        ["test_images/图片2.png"],
        ["test_images/img5.jpg"],
        ["test_images/img14.jpg"],
        ["test_images/qingming3.jpeg"],
    ]

    with gr.Blocks(
            css=get_style()
    ) as iface:
        # ------------------------------------------------------------------
        # Per-session state holders
        # ------------------------------------------------------------------
        state = gr.State([])
        out_state = gr.State(None)
        click_state = gr.State([[], [], []])
        origin_image = gr.State(None)
        image_embedding = gr.State(None)
        text_refiner = gr.State(None)
        visual_chatgpt = gr.State(None)
        original_size = gr.State(None)
        input_size = gr.State(None)
        generated_caption = gr.State("")
        paragraph = gr.State("")
        aux_state = gr.State([])
        click_index_state = gr.State((0, 0))
        input_mask_state = gr.State(np.zeros((1, 1)))
        input_points_state = gr.State([])
        input_labels_state = gr.State([])
        new_crop_save_path = gr.State(None)

        gr.Markdown(title)
        gr.Markdown(description)

        # ------------------------------------------------------------------
        # Layout
        # ------------------------------------------------------------------
        with gr.Row():
            # Left column: image tabs and caption-style controls.
            with gr.Column(scale=1.0):
                with gr.Column(visible=False) as modules_not_need_gpt:
                    with gr.Tab("Base(GPT Power)", visible=False) as base_tab:
                        image_intro = gr.HTML()
                        image_input_base = gr.Image(type="pil", interactive=True, elem_id="image_upload")
                        # NOTE: this name is rebound in the "Click" tab below, so only the
                        # second hidden image is used by gr.Examples and the change events.
                        example_image = gr.Image(type="pil", interactive=False, visible=False)

                    with gr.Tab("Click") as click_tab:
                        image_intro_click = gr.HTML()
                        image_input = gr.Image(type="pil", interactive=True, elem_id="image_upload")
                        example_image = gr.Image(type="pil", interactive=False, visible=False)
                        with gr.Row(scale=1.0):
                            focus_type = gr.Radio(
                                choices=["Inside the Mark", "Around the Mark", "Outside the Mark"],
                                value="Inside the Mark",
                                label="Focus Type",
                                interactive=True)
                        with gr.Row(scale=1.0):
                            with gr.Row(scale=0.4):
                                point_prompt = gr.Radio(
                                    choices=["Positive", "Negative"],
                                    value="Positive",
                                    label="Point Prompt",
                                    interactive=True)
                                click_mode = gr.Radio(
                                    choices=["Continuous", "Single"],
                                    value="Continuous",
                                    label="Clicking Mode",
                                    interactive=True)
                            with gr.Row(scale=0.4):
                                clear_button_click = gr.Button(value="Clear Clicks", interactive=True)
                                clear_button_image = gr.Button(value="Clear Image", interactive=True)
                                submit_button_click = gr.Button(value="Submit", interactive=True)
                    with gr.Tab("Trajectory (beta)"):
                        sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=20,
                                                       elem_id="image_sketcher")
                        with gr.Row():
                            submit_button_sketcher = gr.Button(value="Submit", interactive=True)

                with gr.Column(visible=False) as modules_need_gpt1:
                    with gr.Row(scale=1.0):
                        language = gr.Dropdown(
                            ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
                            value="English", label="Language", interactive=True)
                        sentiment = gr.Radio(
                            choices=["Positive", "Natural", "Negative"],
                            value="Natural",
                            label="Sentiment",
                            interactive=True,
                        )
                    with gr.Row(scale=1.0):
                        factuality = gr.Radio(
                            choices=["Factual", "Imagination"],
                            value="Factual",
                            label="Factuality",
                            interactive=True,
                        )
                        length = gr.Slider(
                            minimum=10,
                            maximum=80,
                            value=10,
                            step=1,
                            interactive=True,
                            label="Generated Caption Length",
                        )
                        # Whether to integrate wiki content into the caption.
                        enable_wiki = gr.Radio(
                            choices=["Yes", "No"],
                            value="No",
                            label="Enable Wiki",
                            interactive=True)

                gr.Examples(
                    examples=examples,
                    inputs=[example_image],
                )

            # Middle column: API key entry, notification, paragraph output and chat.
            with gr.Column(scale=0.5):
                with gr.Column(visible=True) as module_key_input:
                    openai_api_key = gr.Textbox(
                        placeholder="Input openAI API key",
                        show_label=False,
                        label="OpenAI API Key",
                        lines=1,
                        type="password")
                    with gr.Row(scale=0.5):
                        enable_chatGPT_button = gr.Button(value="Run with ChatGPT", interactive=True, variant='primary')
                        disable_chatGPT_button = gr.Button(value="Run without ChatGPT (Faster)", interactive=True,
                                                           variant='primary')
                with gr.Column(visible=False) as module_notification_box:
                    notification_box = gr.Textbox(lines=1, label="Notification", max_lines=5, show_label=False)

                with gr.Column():
                    with gr.Column(visible=False) as modules_need_gpt2:
                        paragraph_output = gr.Textbox(lines=7, label="Describe Everything", max_lines=7)
                    with gr.Column(visible=False) as modules_need_gpt0:
                        cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)

                with gr.Column(visible=False) as modules_not_need_gpt2:
                    chatbot = gr.Chatbot(label="Chatbox", ).style(height=550, scale=0.5)
                    with gr.Column(visible=False) as modules_need_gpt3:
                        chat_input = gr.Textbox(show_label=False, placeholder="Enter text and press Enter").style(
                            container=False)
                        with gr.Row():
                            clear_button_text = gr.Button(value="Clear Text", interactive=True)
                            submit_button_text = gr.Button(value="Submit", interactive=True, variant="primary")

            # Right column: text-to-speech panel, hidden initially.
            with gr.Column(scale=0.5):
                with gr.Column(visible=False) as tts_interface:
                    input_text = gr.Textbox(label="Text Prompt", value="Hello, World !, here is an example of light voice cloning. Try to upload your best audio samples quality")
                    input_language = gr.Dropdown(label="Language", choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"], value="en")
                    input_audio = gr.Audio(label="Reference Audio", type="filepath", value="examples/female.wav")
                    input_mic = gr.Audio(source="microphone", type="filepath", label="Use Microphone for Reference")
                    use_mic = gr.Checkbox(label="Check to use Microphone as Reference", value=False)
                    agree = gr.Checkbox(label="Agree", value=True)
                    output_waveform = gr.Video(label="Waveform Visual")
                    output_audio = gr.HTML(label="Synthesised Audio")

                    with gr.Row():
                        submit_tts = gr.Button(value="Submit", interactive=True)
                        clear_tts = gr.Button(value="Clear", interactive=True)

        # ------------------------------------------------------------------
        # Shared component lists, defined once so the repeated event
        # registrations below cannot drift apart.
        # ------------------------------------------------------------------
        tts_inputs = [input_text, input_language, input_audio, input_mic, use_mic, agree]
        tts_outputs = [output_waveform, output_audio]

        # Outputs of the "with / without ChatGPT" initialisers.
        gpt_init_outputs = [
            modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
            modules_not_need_gpt, modules_not_need_gpt2, tts_interface, module_key_input,
            module_notification_box, text_refiner, visual_chatgpt, notification_box,
        ]

        # Components wiped when the session/image is reset.
        session_reset_outputs = [image_input, chatbot, state, click_state, paragraph_output, origin_image]

        # Outputs of upload_callback, identical for every image source.
        upload_outputs = [
            chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
            image_embedding, original_size, input_size, image_intro, image_intro_click, paragraph,
        ]

        chat_inputs = [visual_chatgpt, chat_input, click_state, state, aux_state]
        chat_outputs = [chatbot, state, aux_state]

        def reset_session_values():
            """Return one fresh value per component in ``session_reset_outputs``."""
            # Exactly six values for six outputs; the original returned a
            # spurious seventh "" that had no matching component.
            return None, [], [], [[], [], []], "", ""

        def clear_tts_fields():
            """Return reset values for the eight TTS widgets, in ``clear_tts`` output order."""
            return [gr.update(value=""), gr.update(value=""), None, None, gr.update(value=False), gr.update(value=True), None, None]

        # ------------------------------------------------------------------
        # Event wiring (registration order preserved from the original)
        # ------------------------------------------------------------------
        submit_tts.click(
            tts.predict,
            inputs=tts_inputs,
            outputs=tts_outputs,
            queue=True
        )

        clear_tts.click(
            clear_tts_fields,
            inputs=None,
            outputs=tts_inputs + tts_outputs,
            queue=False
        )

        openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key], outputs=gpt_init_outputs)
        enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key], outputs=gpt_init_outputs)
        disable_chatGPT_button.click(init_wo_openai_api_key, outputs=gpt_init_outputs)

        enable_chatGPT_button.click(
            reset_session_values,
            [],
            session_reset_outputs,
            queue=False,
            show_progress=False
        )
        openai_api_key.submit(
            reset_session_values,
            [],
            session_reset_outputs,
            queue=False,
            show_progress=False
        )

        cap_everything_button.click(
            cap_everything,
            [origin_image, visual_chatgpt, text_refiner, input_language, input_audio, input_mic, use_mic, agree],
            [paragraph_output, output_waveform, output_audio])

        # "Clear Clicks": drop the click history and restore the untouched image.
        clear_button_click.click(
            lambda x: ([[], [], []], x),
            [origin_image],
            [click_state, image_input],
            queue=False,
            show_progress=False
        )
        clear_button_click.click(functools.partial(clear_chat_memory, keep_global=True), inputs=[visual_chatgpt])

        clear_button_image.click(
            reset_session_values,
            [],
            session_reset_outputs,
            queue=False,
            show_progress=False
        )
        clear_button_image.click(clear_chat_memory, inputs=[visual_chatgpt])

        # "Clear Text": click_state is reset to three empty lists, matching its
        # initial value and every other reset (the original used four here).
        clear_button_text.click(
            lambda: ([], [], [[], [], []]),
            [],
            [chatbot, state, click_state],
            queue=False,
            show_progress=False
        )
        clear_button_text.click(clear_chat_memory, inputs=[visual_chatgpt])

        image_input.clear(
            reset_session_values,
            [],
            session_reset_outputs,
            queue=False,
            show_progress=False
        )
        image_input.clear(clear_chat_memory, inputs=[visual_chatgpt])

        # Every image source feeds the same upload callback.
        image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt, openai_api_key],
                                upload_outputs)
        image_input.upload(upload_callback, [image_input, state, visual_chatgpt, openai_api_key],
                           upload_outputs)
        sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
                              upload_outputs)

        # Chat: run the callback, then empty the textbox.
        chat_input.submit(chat_input_callback, chat_inputs, chat_outputs)
        chat_input.submit(lambda: "", None, chat_input)
        submit_button_text.click(chat_input_callback, chat_inputs, chat_outputs)
        submit_button_text.click(lambda: "", None, chat_input)

        example_image.change(upload_callback, [example_image, state, visual_chatgpt, openai_api_key],
                             upload_outputs)
        example_image.change(clear_chat_memory, inputs=[visual_chatgpt])

        def on_click_tab_selected():
            """Visibility updates for the "Click" tab, based on the global ``gpt_state``."""
            if gpt_state == 1:
                print(gpt_state)
                print("using gpt")
                return [gr.update(visible=True)] * 2 + [gr.update(visible=False)] * 2
            else:
                print("no gpt")
                print("gpt_state", gpt_state)
                return [gr.update(visible=False)] + [gr.update(visible=True)] + [gr.update(visible=False)] * 2

        def on_base_selected():
            """Visibility updates for the "Base" tab, based on the global ``gpt_state``."""
            if gpt_state == 1:
                print(gpt_state)
                print("using gpt")
                return [gr.update(visible=True)] * 2 + [gr.update(visible=False)] * 2
            else:
                print("no gpt")
                return [gr.update(visible=False)] * 4

        # The output order differs between the two tabs on purpose: each handler
        # returns [visible..., hidden...] positionally.
        click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1, modules_not_need_gpt2, modules_need_gpt0, modules_need_gpt2])
        base_tab.select(on_base_selected, outputs=[modules_need_gpt0, modules_need_gpt2, modules_not_need_gpt2, modules_need_gpt1])

        # Clicking on the image runs point-prompted inference.
        image_input.select(
            inference_click,
            inputs=[
                origin_image, point_prompt, click_mode, enable_wiki, language, sentiment, factuality, length,
                image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
                out_state, click_index_state, input_mask_state, input_points_state, input_labels_state
            ],
            outputs=[chatbot, state, click_state, image_input, generated_caption, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, new_crop_save_path],
            show_progress=False, queue=True
        )

        submit_button_click.click(
            submit_caption,
            inputs=[
                image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
                out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
                input_text, input_language, input_audio, input_mic, use_mic, agree, paragraph, focus_type, openai_api_key, new_crop_save_path
            ],
            outputs=[
                chatbot, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,
                output_waveform, output_audio
            ],
            show_progress=True,
            queue=True
        )

        submit_button_sketcher.click(
            inference_traject,
            inputs=[
                sketcher_input, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
                original_size, input_size, text_refiner
            ],
            outputs=[chatbot, state, sketcher_input],
            show_progress=False, queue=True
        )

        return iface


if __name__ == '__main__':
    # `iface` is intentionally a module-level name: cap_everything_withoutsound
    # reads `iface.app_id` when building its captioning model.
    iface = create_ui()
    # Queueing is enabled here; the deprecated `enable_queue=True` argument to
    # launch() is therefore redundant and has been dropped.
    iface.queue(concurrency_count=5, api_open=False, max_size=10)
    iface.launch(server_name="0.0.0.0")