Niki Zhang committed: Update app.py

app.py CHANGED
@@ -25,6 +25,7 @@ import easyocr
import re
import edge_tts
import asyncio
# import tts

###############################################################################

@@ -32,27 +33,14 @@ import asyncio
###############################################################################



-
- # import uuid
- # from diffusers import AnimateDiffPipeline, MotionAdapter, EulerDiscreteScheduler
- # from diffusers.utils import export_to_video
- # from safetensors.torch import load_file
- #from diffusers.models.modeling_outputs import Transformer2DModelOutput
-
- import random
- import uuid
- import json
- from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
-
import imageio
import numpy as np
import torch
import rembg
from torchvision.transforms import v2
from pytorch_lightning import seed_everything
from omegaconf import OmegaConf

@@ -297,6 +285,7 @@ def make3d(images):
############# above part is for 3D generate #############
###############################################################################

###############################################################################
############# this part is for text to image #############
###############################################################################
@@ -418,6 +407,36 @@ filtered_language_dict = {
'Cantonese': 'zh-HK-HiuGaaiNeural'
}

gpt_state = 0
VOICE = "en-GB-SoniaNeural"
article = """

@@ -463,6 +482,7 @@ class ImageSketcher(gr.Image):
mask[..., -1] = 255
mask = self.postprocess(mask)
x['mask'] = mask
return super().preprocess(x)


@@ -512,15 +532,18 @@ def init_openai_api_key(api_key=""):

global gpt_state
gpt_state=1
- return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
else:
gpt_state=0
- return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3

def init_wo_openai_api_key():
global gpt_state
gpt_state=0
- return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3

def get_click_prompt(chat_input, click_state, click_mode):
inputs = json.loads(chat_input)
@@ -677,17 +700,16 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
point_prompt = f'You should primarly use tools on the selected regional image (description: {text}, path: {new_crop_save_path}), which is a part of the whole image (path: {visual_chatgpt.current_image}). If human mentioned some objects not in the selected region, you can use tools on the whole image.'
visual_chatgpt.point_prompt = point_prompt

-
- print(generated_caption)
print("new crop save",new_crop_save_path)

- yield state, state, click_state, image_input_nobackground, image_input_withbackground,




- async def submit_caption(
out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path):
print("state",state)

@@ -702,23 +724,57 @@ async def submit_caption(image_input, state, generated_caption, text_refiner, vi
print("click_index",click_index)
print("input_points_state",input_points_state)
print("input_labels_state",input_labels_state)
-

-
-
- input_labels = input_labels_state


-
-
mapped_value = focus_map.get(focus_type, -1)
- print("mapped value",mapped_value)

controls = {
'length': length,
@@ -726,95 +782,21 @@ async def submit_caption(image_input, state, generated_caption, text_refiner, vi
'factuality': factuality,
'language': language
}
-
- prompt_list = [
- 'Wiki_caption: {Wiki_caption}, you have to generate a caption according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
- 'Wiki_caption: {Wiki_caption}, you have to select sentences from wiki caption that describe the surrounding objects that may be associated with the picture object. Around {length} words of {sentiment} sentiment in {language}.',
- 'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.',
- 'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.'
- ]
-
- prompt_list = [
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the object but does not include analysis)as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects that may be related to the selected object and list one fact of selected object, one fact of related object and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.'
- ]
- '''
- prompt_list = [
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis)as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
- 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects that may be related to the selected object and list one fact of selected object, one fact of related object and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
- ]
if mapped_value != -1:
- prompt= prompt_list[mapped_value].format(
- raw_caption=generated_caption,
Wiki_caption=paragraph,
length=controls['length'],
sentiment=controls['sentiment'],
language=controls['language']
)
-
else:
- print("error prompting")
prompt = "Invalid focus type."

if controls['factuality'] == "Imagination":
- prompt += "Assuming that I am someone who has viewed a lot of art and has a lot of experience viewing art.
-
- print("Prompt:", prompt)
- print("click",click_index)
-
- origin_image_input = image_input
-

-
- input_points=input_points, input_labels=input_labels)
-
- if generated_caption:
- # state = state + [(None, f"RAW_Caption: {generated_caption}")]
-
- if not args.disable_gpt and text_refiner:
- print("new crop save",new_crop_save_path)
- focus_info=get_image_gpt(openai_api_key,new_crop_save_path,prompt)
- if focus_info.startswith('"') and focus_info.endswith('"'):
- focus_info=focus_info[1:-1]
- focus_info=focus_info.replace('#', '')
- # state = state + [(None, f"Wiki: {paragraph}")]
- state = state + [(None, f"{focus_info}")]
- print("new_cap",focus_info)
- read_info = re.sub(r'[#[\]!*]','',focus_info)
- read_info = emoji.replace_emoji(read_info,replace="")
- print("read info",read_info)
-
- # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
- # input_points=input_points, input_labels=input_labels)
- try:
- audio_output = await texttospeech(read_info, language,autoplay)
- # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
-
- except Exception as e:
- state = state + [(None, f"Error during TTS prediction: {str(e)}")]
- print(f"Error during TTS prediction: {str(e)}")
- # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
-
- else:
- try:
- audio_output = await texttospeech(focus_info, language, autoplay)
- # waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
- waveform_visual, audio_output=None,None
- # return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
-
- except Exception as e:
- state = state + [(None, f"Error during TTS prediction: {str(e)}")]
- print(f"Error during TTS prediction: {str(e)}")
- return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None


def encode_image(image_path):
@@ -892,14 +874,19 @@ def get_sketch_prompt(mask: Image.Image):

return prompt


- def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
- original_size, input_size, text_refiner):
image_input, mask = sketcher_image['image'], sketcher_image['mask']
-
prompt = get_sketch_prompt(mask)
boxes = prompt['input_boxes']
boxes = boxes[0]

controls = {'length': length,
'sentiment': sentiment,

@@ -919,38 +906,77 @@ def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuali
model.setup(image_embedding, original_size, input_size, is_image_set=True)

enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
- out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki)[0]

-
# Update components and states
state.append((f'Box: {boxes}', None))
-
- text = out['generated_captions']['raw_caption']
- input_mask = np.array(out['mask'].convert('P'))
- # image_input = mask_painter(np.array(image_input), input_mask, background_alpha=0 )
- image_input = Image.fromarray(np.array(image_input))
- draw = ImageDraw.Draw(image_input)
- draw.rectangle(boxes, outline='red', width=2)
-
# fake_click_index = (int((boxes[0][0] + boxes[0][2]) / 2), int((boxes[0][1] + boxes[0][3]) / 2))
# image_input = create_bubble_frame(image_input, "", fake_click_index, input_mask)

-
- if not args.disable_gpt and model.text_refiner:
- refined_caption = model.text_refiner.inference(query=text, controls=controls, context=out['context_captions'],
- enable_wiki=enable_wiki)

-
- # refined_image_input = create_bubble_frame(origin_image_input, new_cap, fake_click_index, input_mask)

- yield state, state, image_input

def clear_chat_memory(visual_chatgpt, keep_global=False):
if visual_chatgpt is not None:
@@ -1020,32 +1046,55 @@ def get_style():
#image_sketcher [data-testid="image"], #image_sketcher [data-testid="image"] > div{min-height: 500px}
#image_upload{min-height:500px}
#image_upload [data-testid="image"], #image_upload [data-testid="image"] > div{min-height: 500px}
'''
elif current_version <= version.parse('3.27'):
style = '''
#image_sketcher{min-height:500px}
#image_upload{min-height:500px}
'''
else:
style = None

return style

- def handle_like_dislike(like_data, like_state, dislike_state):
-
-
-
-
-

async def texttospeech(text,language,autoplay):
voice=filtered_language_dict[language]

@@ -1060,9 +1109,11 @@ async def texttospeech(text,language,autoplay):
if autoplay:
audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls autoplay {audio_style}></audio>'
else:
- audio_player
return audio_player

def create_ui():
title = """<p><h1 align="center">EyeSee Anything in Art</h1></p>
"""

@@ -1093,7 +1144,6 @@ def create_ui():
visual_chatgpt = gr.State(None)
original_size = gr.State(None)
input_size = gr.State(None)
- generated_caption = gr.State("")
paragraph = gr.State("")
aux_state = gr.State([])
click_index_state = gr.State((0, 0))
@@ -1102,15 +1152,33 @@ def create_ui():
input_labels_state = gr.State([])
new_crop_save_path = gr.State(None)
image_input_nobackground = gr.State(None)
- like_state=gr.State([])
- dislike_state=gr.State([])
-

gr.Markdown(title)
gr.Markdown(description)

- with gr.Row():
with gr.Column(scale=1.0):
with gr.Column(visible=False) as modules_not_need_gpt:
with gr.Tab("Base(GPT Power)",visible=False) as base_tab:

@@ -1156,11 +1224,12 @@ def create_ui():
clear_button_image = gr.Button(value="Clear Image", interactive=True)

with gr.Tab("Trajectory (beta)") as traj_tab:
- sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=10,
elem_id="image_sketcher")
-
submit_button_sketcher = gr.Button(value="Submit", interactive=True)
with gr.Row(scale=1.0):
with gr.Row(scale=0.8):
focus_type_sketch = gr.Radio(

@@ -1171,7 +1240,7 @@ def create_ui():
Input_sketch = gr.Radio(
choices=["Trace+Seg", "Trace"],
value="Trace+Seg",
- label="
interactive=True)

with gr.Column(visible=False) as modules_need_gpt1:
@@ -1203,26 +1272,17 @@ def create_ui():
value="No",
label="Expert",
interactive=True)
-
with gr.Column(visible=True) as modules_not_need_gpt3:
gr.Examples(
-
-
-

- with gr.Column(scale=0.5):
- with gr.Row(align="right",visible=False) as language_select:
- language = gr.Dropdown(
- ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
- value="English", label="Language", interactive=True)
-
- with gr.Row(align="right",visible=False) as autoplay:
- auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
- output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
-
with gr.Column(visible=True) as module_key_input:
openai_api_key = gr.Textbox(
placeholder="Input openAI API key",

@@ -1243,7 +1303,7 @@ def create_ui():
cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)

with gr.Column(visible=False) as modules_not_need_gpt2:
- with gr.Blocks(
chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True).style(height=600, scale=0.5)
with gr.Column(visible=False) as modules_need_gpt3:
chat_input = gr.Textbox(show_label=False, placeholder="Enter text and press Enter").style(

@@ -1251,6 +1311,9 @@ def create_ui():
with gr.Row():
clear_button_text = gr.Button(value="Clear Text", interactive=True)
submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
with gr.Row():
export_button = gr.Button(value="Export Chat Log", interactive=True, variant="primary")
with gr.Row():
@@ -1421,7 +1484,7 @@ def create_ui():
# this part is for 3d generate.
###############################################################################

- with gr.Row(variant="panel") as d3_model:
with gr.Column():
with gr.Row():
input_image = gr.Image(

@@ -1529,7 +1592,7 @@ def create_ui():

def clear_tts_fields():
return [gr.update(value=""), gr.update(value=""), None, None, gr.update(value=False), gr.update(value=True), None, None]
-
# submit_tts.click(
# tts.predict,
# inputs=[input_text, input_language, input_audio, input_mic, use_mic, agree],

@@ -1544,6 +1607,9 @@ def create_ui():
queue=False
)

clear_button_sketcher.click(
lambda x: (x),
[origin_image],
@@ -1552,18 +1618,21 @@ def create_ui():
show_progress=False
)


openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
- modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,
enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
modules_not_need_gpt,
- modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,
disable_chatGPT_button.click(init_wo_openai_api_key,
outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
modules_not_need_gpt,
- modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,

enable_chatGPT_button.click(
lambda: (None, [], [], [[], [], []], "", "", ""),

@@ -1677,7 +1746,7 @@ def create_ui():
image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
],
- outputs=[chatbot, state, click_state, image_input, input_image,
show_progress=False, queue=True
)

@@ -1685,7 +1754,7 @@ def create_ui():
submit_button_click.click(
submit_caption,
inputs=[
-
out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
auto_play,paragraph,focus_type,openai_api_key,new_crop_save_path
],

@@ -1701,10 +1770,10 @@ def create_ui():
submit_button_sketcher.click(
inference_traject,
inputs=[
- sketcher_input, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
- original_size, input_size, text_refiner
],
- outputs=[chatbot, state, sketcher_input],
show_progress=False, queue=True
)
@@ +25,7 @@
import re
import edge_tts
import asyncio
+ import cv2
# import tts

###############################################################################

@@ +33,14 @@
###############################################################################


+ # import spaces #

+ import os
import imageio
import numpy as np
import torch
import rembg
+ from PIL import Image
from torchvision.transforms import v2
from pytorch_lightning import seed_everything
from omegaconf import OmegaConf

@@ +285,7 @@
############# above part is for 3D generate #############
###############################################################################

+
###############################################################################
############# this part is for text to image #############
###############################################################################
@@ +407,36 @@
'Cantonese': 'zh-HK-HiuGaaiNeural'
}

+ focus_map = {
+ "CFV-D":0,
+ "CFV-DA":1,
+ "CFV-DAI":2,
+ "PFV-DDA":3
+ }
+
+ '''
+ prompt_list = [
+ 'Wiki_caption: {Wiki_caption}, you have to generate a caption according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
+ 'Wiki_caption: {Wiki_caption}, you have to select sentences from wiki caption that describe the surrounding objects that may be associated with the picture object. Around {length} words of {sentiment} sentiment in {language}.',
+ 'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.',
+ 'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.'
+ ]
+
+ prompt_list = [
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the object but does not include analysis)as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects that may be related to the selected object and list one fact of selected object, one fact of related object and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.'
+ ]
+ '''
+ prompt_list = [
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis)as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.',
+ 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and the objects that may be related to the selected object and list one fact of selected object, one fact of related object and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.'
+ ]
+
+
gpt_state = 0
VOICE = "en-GB-SoniaNeural"
article = """
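The focus_map added above indexes into whichever prompt_list is left active (the third, uncommented one), and the generate_prompt helper added later in this diff does the lookup and .format() call. A standalone sketch of that lookup-and-format step, with shortened templates and an illustrative helper name (build_prompt is not part of the commit):

```python
# Minimal sketch of the focus_map -> prompt_list lookup-and-format step.
# Templates are shortened for illustration; the committed templates are much longer.
focus_map = {"CFV-D": 0, "CFV-DA": 1, "CFV-DAI": 2, "PFV-DDA": 3}

prompt_list = [
    'Wiki_caption: {Wiki_caption}. List one fact. About {length} words, {sentiment} sentiment, in {language}.',
    'Wiki_caption: {Wiki_caption}. List one fact and one analysis. About {length} words, {sentiment} sentiment, in {language}.',
    'Wiki_caption: {Wiki_caption}. List one fact, one analysis, one interpretation. About {length} words, {sentiment} sentiment, in {language}.',
    'Wiki_caption: {Wiki_caption}. List facts about the object and related objects plus one analysis. About {length} words, {sentiment} sentiment, in {language}.',
]

def build_prompt(focus_type, wiki_caption, length="60", sentiment="positive", language="English"):
    """Return the formatted template for a focus type, or a fallback for unknown types."""
    idx = focus_map.get(focus_type, -1)
    if idx == -1:
        return "Invalid focus type."
    return prompt_list[idx].format(
        Wiki_caption=wiki_caption, length=length, sentiment=sentiment, language=language
    )

if __name__ == "__main__":
    print(build_prompt("CFV-DA", "A small oil painting of a harbor at dusk."))
```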
@@ +482,7 @@
mask[..., -1] = 255
mask = self.postprocess(mask)
x['mask'] = mask
+
return super().preprocess(x)


@@ +532,18 @@

global gpt_state
gpt_state=1
+ # return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
+ return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*2
else:
gpt_state=0
+ # return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
+ return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*2

def init_wo_openai_api_key():
global gpt_state
gpt_state=0
+ # return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3
+ return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*2

def get_click_prompt(chat_input, click_state, click_mode):
inputs = json.loads(chat_input)
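These init_* functions return one value per component listed in the outputs= of the .submit/.click wiring near the end of the diff, which is why the trailing [gr.update(visible=...)]*3 blocks shrink to *2 here as the output list changes to end in d3_model and top_row. A minimal, self-contained sketch of that Gradio pattern (component and function names are illustrative, not the app's):

```python
# Minimal sketch of "return exactly one gr.update per wired output component".
# Names here are illustrative; the real app toggles many more components.
import gradio as gr

def set_key(api_key):
    ok = bool(api_key.strip())
    # Exactly three values for the three outputs wired below.
    return [gr.update(visible=ok), gr.update(visible=not ok), "key set" if ok else "key missing"]

with gr.Blocks() as demo:
    key_box = gr.Textbox(label="OpenAI API key")
    with gr.Column(visible=False) as gpt_panel:
        gr.Markdown("GPT-powered modules")
    with gr.Column(visible=True) as key_panel:
        gr.Markdown("Enter a key to continue")
    status = gr.Textbox(label="Status")
    key_box.submit(set_key, inputs=[key_box], outputs=[gpt_panel, key_panel, status])

# demo.launch()
```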
@@ +700,16 @@
point_prompt = f'You should primarly use tools on the selected regional image (description: {text}, path: {new_crop_save_path}), which is a part of the whole image (path: {visual_chatgpt.current_image}). If human mentioned some objects not in the selected region, you can use tools on the whole image.'
visual_chatgpt.point_prompt = point_prompt

+
print("new crop save",new_crop_save_path)

+ yield state, state, click_state, image_input_nobackground, image_input_withbackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground




+ async def submit_caption(state, text_refiner, length, sentiment, factuality, language,
out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path):
print("state",state)

@@ +724,57 @@
print("click_index",click_index)
print("input_points_state",input_points_state)
print("input_labels_state",input_labels_state)
+
+ prompt=generate_prompt(paragraph,focus_type,length,sentiment,factuality,language)

+ print("Prompt:", prompt)
+ print("click",click_index)

+ # image_input = create_bubble_frame(np.array(image_input), generated_caption, click_index, input_mask,
+ # input_points=input_points, input_labels=input_labels)
+
+
+ if not args.disable_gpt and text_refiner:
+ print("new crop save",new_crop_save_path)
+ focus_info=get_image_gpt(openai_api_key,new_crop_save_path,prompt)
+ if focus_info.startswith('"') and focus_info.endswith('"'):
+ focus_info=focus_info[1:-1]
+ focus_info=focus_info.replace('#', '')
+ # state = state + [(None, f"Wiki: {paragraph}")]
+ state = state + [(None, f"{focus_info}")]
+ print("new_cap",focus_info)
+ read_info = re.sub(r'[#[\]!*]','',focus_info)
+ read_info = emoji.replace_emoji(read_info,replace="")
+ print("read info",read_info)
+
+ # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
+ # input_points=input_points, input_labels=input_labels)
+ try:
+ audio_output = await texttospeech(read_info, language,autoplay)
+ # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
+
+ except Exception as e:
+ state = state + [(None, f"Error during TTS prediction: {str(e)}")]
+ print(f"Error during TTS prediction: {str(e)}")
+ # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output

+ else:
+ try:
+ audio_output = await texttospeech(focus_info, language, autoplay)
+ # waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
+ # return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
+
+ except Exception as e:
+ state = state + [(None, f"Error during TTS prediction: {str(e)}")]
+ print(f"Error during TTS prediction: {str(e)}")
+ return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
+
+ def generate_prompt(focus_type, paragraph,length, sentiment, factuality, language):
+
mapped_value = focus_map.get(focus_type, -1)

controls = {
'length': length,

@@ +782,21 @@
'factuality': factuality,
'language': language
}
+
if mapped_value != -1:
+ prompt = prompt_list[mapped_value].format(
Wiki_caption=paragraph,
length=controls['length'],
sentiment=controls['sentiment'],
language=controls['language']
)
else:
prompt = "Invalid focus type."

if controls['factuality'] == "Imagination":
+ prompt += " Assuming that I am someone who has viewed a lot of art and has a lot of experience viewing art. Explain artistic features (composition, color, style, or use of light) and discuss the symbolism of the content and its influence on later artistic movements."

+ return prompt


def encode_image(image_path):
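Before the caption reaches TTS, the new submit_caption strips markdown markup and emojis from focus_info so they are not read aloud; a standalone sketch of that cleanup follows (it assumes the third-party emoji package is installed). Note also that generate_prompt is defined with the order (focus_type, paragraph, ...) while submit_caption passes (paragraph, focus_type, ...) positionally, so keyword arguments are the unambiguous way to call it.

```python
# Standalone sketch of the text cleanup submit_caption applies before TTS:
# drop markdown markup characters and emojis so they are not spoken.
# Assumes the third-party "emoji" package (pip install emoji).
import re
import emoji

def clean_for_tts(focus_info: str) -> str:
    # Same character class as the committed code: '#', '[', ']', '!', '*'
    read_info = re.sub(r'[#[\]!*]', '', focus_info)
    # Remove emojis entirely rather than letting the TTS engine spell them out
    return emoji.replace_emoji(read_info, replace="")

if __name__ == "__main__":
    print(clean_for_tts("## *Fact* 🎨: The painting shows [a harbor]!"))
```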
@@ +874,19 @@

return prompt

+ submit_traj=0

+ async def inference_traject(origin_image,sketcher_image, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
+ original_size, input_size, text_refiner,focus_type,paragraph,openai_api_key,autoplay,trace_type):
image_input, mask = sketcher_image['image'], sketcher_image['mask']
+
+ crop_save_path=""
+
prompt = get_sketch_prompt(mask)
boxes = prompt['input_boxes']
boxes = boxes[0]
+ global submit_traj
+ submit_traj=1

controls = {'length': length,
'sentiment': sentiment,

@@ +906,77 @@
model.setup(image_embedding, original_size, input_size, is_image_set=True)

enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
+ out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki,verbose=True)[0]

+ print(trace_type)
+
+ if trace_type=="Trace+Seg":
+ input_mask = np.array(out['mask'].convert('P'))
+ image_input = mask_painter(np.array(image_input), input_mask, background_alpha=0 )
+ crop_save_path=out['crop_save_path']
+
+ else:
+ image_input = Image.fromarray(np.array(origin_image))
+ draw = ImageDraw.Draw(image_input)
+ draw.rectangle(boxes, outline='red', width=2)
+ cropped_image = origin_image.crop(boxes)
+ cropped_image.save('temp.png')
+ crop_save_path='temp.png'
+
+ print("crop_svae_path",out['crop_save_path'])
+
# Update components and states
state.append((f'Box: {boxes}', None))
+
# fake_click_index = (int((boxes[0][0] + boxes[0][2]) / 2), int((boxes[0][1] + boxes[0][3]) / 2))
# image_input = create_bubble_frame(image_input, "", fake_click_index, input_mask)

+ prompt=generate_prompt(focus_type, paragraph, length, sentiment, factuality, language)
+ width, height = sketcher_image['image'].size
+ sketcher_image['mask'] = np.zeros((height, width, 4), dtype=np.uint8)
+ sketcher_image['mask'][..., -1] = 255
+ sketcher_image['image']=image_input
+
+
+ if not args.disable_gpt and text_refiner:
+ focus_info=get_image_gpt(openai_api_key,crop_save_path,prompt)
+ if focus_info.startswith('"') and focus_info.endswith('"'):
+ focus_info=focus_info[1:-1]
+ focus_info=focus_info.replace('#', '')
+ state = state + [(None, f"{focus_info}")]
+ print("new_cap",focus_info)
+ read_info = re.sub(r'[#[\]!*]','',focus_info)
+ read_info = emoji.replace_emoji(read_info,replace="")
+ print("read info",read_info)
+
+ # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
+ # input_points=input_points, input_labels=input_labels)
+ try:
+ audio_output = await texttospeech(read_info, language,autoplay)
+ # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+ return state, state,image_input,audio_output
+
+
+ except Exception as e:
+ state = state + [(None, f"Error during TTS prediction: {str(e)}")]
+ print(f"Error during TTS prediction: {str(e)}")
+ # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
+ return state, state, image_input,audio_output
+
+
+ else:
+ try:
+ audio_output = await texttospeech(focus_info, language, autoplay)
+ # waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
+ # return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+ return state, state, image_input,audio_output


+ except Exception as e:
+ state = state + [(None, f"Error during TTS prediction: {str(e)}")]
+ print(f"Error during TTS prediction: {str(e)}")
+ return state, state, image_input,audio_output


def clear_chat_memory(visual_chatgpt, keep_global=False):
if visual_chatgpt is not None:
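In the plain Trace branch above, inference_traject outlines the predicted box on the original image and crops that region to a temporary file whose path is then handed to get_image_gpt. A minimal PIL-only sketch of that step; the box coordinates, helper name, and output path here are illustrative:

```python
# Minimal PIL sketch of the "Trace" branch: outline the box on the image and
# save the cropped region so it can be passed on as an image path.
from PIL import Image, ImageDraw

def trace_crop(origin_image: Image.Image, box, out_path="temp.png"):
    """box is (left, upper, right, lower) in pixel coordinates."""
    annotated = origin_image.copy()
    draw = ImageDraw.Draw(annotated)
    draw.rectangle(box, outline="red", width=2)   # visual feedback on the sketcher image
    origin_image.crop(box).save(out_path)         # region handed to the captioner
    return annotated, out_path

if __name__ == "__main__":
    img = Image.new("RGB", (200, 150), "white")
    annotated, path = trace_crop(img, (20, 30, 120, 100))
    print(annotated.size, path)
```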
@@ +1046,55 @@
#image_sketcher [data-testid="image"], #image_sketcher [data-testid="image"] > div{min-height: 500px}
#image_upload{min-height:500px}
#image_upload [data-testid="image"], #image_upload [data-testid="image"] > div{min-height: 500px}
+ .custom-language {
+ width: 20%;
+ }
+
+ .custom-autoplay {
+ width: 40%;
+ }
+
+ .custom-output {
+ width: 30%;
+ }
+
'''
elif current_version <= version.parse('3.27'):
style = '''
#image_sketcher{min-height:500px}
#image_upload{min-height:500px}
+ .custom-language {
+ width: 20%;
+ }
+
+ .custom-autoplay {
+ width: 40%;
+ }
+
+ .custom-output {
+ width: 30%;
+ }
'''
else:
style = None

return style

+ # def handle_like_dislike(like_data, like_state, dislike_state):
+ # if like_data.liked:
+ # if like_data.index not in like_state:
+ # like_state.append(like_data.index)
+ # message = f"Liked: {like_data.value} at index {like_data.index}"
+ # else:
+ # message = "You already liked this item"
+ # else:
+ # if like_data.index not in dislike_state:
+ # dislike_state.append(like_data.index)
+ # message = f"Disliked: {like_data.value} at index {like_data.index}"
+ # else:
+ # message = "You already disliked this item"

+ # return like_state, dislike_state

async def texttospeech(text,language,autoplay):
voice=filtered_language_dict[language]

@@ +1109,11 @@
if autoplay:
audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls autoplay {audio_style}></audio>'
else:
+ audio_player=None
+ # audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls {audio_style}></audio>'
return audio_player

+
def create_ui():
title = """<p><h1 align="center">EyeSee Anything in Art</h1></p>
"""
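texttospeech inlines the synthesized speech as base64 inside an audio tag and, after this change, returns None when autoplay is off. A hedged, self-contained sketch of that edge_tts-to-HTML flow; the voice, temporary file name, and mp3 MIME type are illustrative (the app derives the voice from filtered_language_dict and builds a wav data URL):

```python
# Hedged sketch of the edge_tts -> base64 <audio> flow used by texttospeech.
# The voice and temporary file name are illustrative.
import asyncio
import base64

import edge_tts

async def speak_to_html(text: str, voice: str = "en-GB-SoniaNeural", autoplay: bool = True):
    tmp = "tts_output.mp3"
    await edge_tts.Communicate(text, voice).save(tmp)       # synthesize to a local file
    with open(tmp, "rb") as f:
        audio = base64.b64encode(f.read()).decode("utf-8")  # inline the audio as base64
    if not autoplay:
        return None                                         # mirrors the fixed else-branch
    return f'<audio src="data:audio/mp3;base64,{audio}" controls autoplay></audio>'

if __name__ == "__main__":
    print(asyncio.run(speak_to_html("Hello from the gallery.", autoplay=False)))
```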
@@ +1144,6 @@
visual_chatgpt = gr.State(None)
original_size = gr.State(None)
input_size = gr.State(None)
paragraph = gr.State("")
aux_state = gr.State([])
click_index_state = gr.State((0, 0))

@@ +1152,33 @@
input_labels_state = gr.State([])
new_crop_save_path = gr.State(None)
image_input_nobackground = gr.State(None)

gr.Markdown(title)
gr.Markdown(description)
+ with gr.Row(align="right", visible=False, elem_id="top_row") as top_row:
+ language = gr.Dropdown(
+ ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
+ value="English", label="Language", interactive=True, scale=0.2, elem_classes="custom-language"
+ )
+ auto_play = gr.Checkbox(
+ label="Check to autoplay audio", value=False, scale=0.4, elem_classes="custom-autoplay"
+ )
+ output_audio = gr.HTML(
+ label="Synthesised Audio", scale=0.3, elem_classes="custom-output"
+ )
+

+ # with gr.Row(align="right",visible=False) as language_select:
+ # language = gr.Dropdown(
+ # ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
+ # value="English", label="Language", interactive=True)
+
+ # with gr.Row(align="right",visible=False) as autoplay:
+ # auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
+ # output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
+
+ with gr.Row():
+
with gr.Column(scale=1.0):
with gr.Column(visible=False) as modules_not_need_gpt:
with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
@@ +1224,12 @@
clear_button_image = gr.Button(value="Clear Image", interactive=True)

with gr.Tab("Trajectory (beta)") as traj_tab:
+ sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=10,
elem_id="image_sketcher")
+ example_image = gr.Image(type="pil", interactive=False, visible=False)
+ with gr.Row():
submit_button_sketcher = gr.Button(value="Submit", interactive=True)
+ clear_button_sketcher = gr.Button(value="Clear Sketch", interactive=True)
with gr.Row(scale=1.0):
with gr.Row(scale=0.8):
focus_type_sketch = gr.Radio(

@@ +1240,7 @@
Input_sketch = gr.Radio(
choices=["Trace+Seg", "Trace"],
value="Trace+Seg",
+ label="Trace Type",
interactive=True)

with gr.Column(visible=False) as modules_need_gpt1:

@@ +1272,17 @@
value="No",
label="Expert",
interactive=True)
with gr.Column(visible=True) as modules_not_need_gpt3:
gr.Examples(
+ examples=examples,
+ inputs=[example_image],
+ )
+



+ with gr.Column(scale=0.5):
with gr.Column(visible=True) as module_key_input:
openai_api_key = gr.Textbox(
placeholder="Input openAI API key",
@@ +1303,7 @@
cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)

with gr.Column(visible=False) as modules_not_need_gpt2:
+ with gr.Blocks():
chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True).style(height=600, scale=0.5)
with gr.Column(visible=False) as modules_need_gpt3:
chat_input = gr.Textbox(show_label=False, placeholder="Enter text and press Enter").style(

@@ +1311,9 @@
with gr.Row():
clear_button_text = gr.Button(value="Clear Text", interactive=True)
submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
+ upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
+ downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
+
with gr.Row():
export_button = gr.Button(value="Export Chat Log", interactive=True, variant="primary")
with gr.Row():

@@ +1484,7 @@
# this part is for 3d generate.
###############################################################################

+ with gr.Row(variant="panel",visible=False) as d3_model:
with gr.Column():
with gr.Row():
input_image = gr.Image(

@@ +1592,7 @@

def clear_tts_fields():
return [gr.update(value=""), gr.update(value=""), None, None, gr.update(value=False), gr.update(value=True), None, None]
+
# submit_tts.click(
# tts.predict,
# inputs=[input_text, input_language, input_audio, input_mic, use_mic, agree],

@@ +1607,9 @@
queue=False
)

+
+
+
clear_button_sketcher.click(
lambda x: (x),
[origin_image],
@@ +1618,21 @@
show_progress=False
)

+
+
+

openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
+ modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
modules_not_need_gpt,
+ modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
disable_chatGPT_button.click(init_wo_openai_api_key,
outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
modules_not_need_gpt,
+ modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])

enable_chatGPT_button.click(
lambda: (None, [], [], [[], [], []], "", "", ""),

@@ +1746,7 @@
image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
],
+ outputs=[chatbot, state, click_state, image_input, input_image, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground],
show_progress=False, queue=True
)

@@ +1754,7 @@
submit_button_click.click(
submit_caption,
inputs=[
+ state, text_refiner,length, sentiment, factuality, language,
out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
auto_play,paragraph,focus_type,openai_api_key,new_crop_save_path
],

@@ +1770,10 @@
submit_button_sketcher.click(
inference_traject,
inputs=[
+ origin_image,sketcher_input, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
+ original_size, input_size, text_refiner,focus_type_sketch,paragraph,openai_api_key,auto_play,Input_sketch
],
+ outputs=[chatbot, state, sketcher_input,output_audio],
show_progress=False, queue=True
)