Niki Zhang committed: Update app.py

app.py CHANGED
@@ -395,15 +395,15 @@ def init_openai_api_key(api_key=""):
 
 global gpt_state
 gpt_state=1
-return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*
+return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
 else:
 gpt_state=0
-return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*
+return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
 
 def init_wo_openai_api_key():
 global gpt_state
 gpt_state=0
-return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*
+return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3
 
 def get_click_prompt(chat_input, click_state, click_mode):
 inputs = json.loads(chat_input)
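These return statements emit one `gr.update(...)` per output component, so the list length must stay in sync with the `outputs=[...]` wiring further down (which this commit extends with `d3_model`); a mismatch surfaces as a runtime error when the event fires. Below is a minimal, self-contained sketch of that pattern with hypothetical component names, not the app's real layout.

```python
# Sketch of the positional gr.update pattern (hypothetical components).
import gradio as gr

def toggle_panels(api_key):
    has_key = bool(api_key.strip())
    # One update per entry in `outputs`, in the same order.
    return [gr.update(visible=has_key), gr.update(visible=not has_key)]

with gr.Blocks() as demo:
    key_box = gr.Textbox(label="OpenAI API key")
    with gr.Column(visible=False) as gpt_panel:
        gr.Markdown("Modules that need a key")
    with gr.Column(visible=True) as no_key_panel:
        gr.Markdown("Modules that work without a key")
    key_box.submit(toggle_panels, inputs=[key_box], outputs=[gpt_panel, no_key_panel])

if __name__ == "__main__":
    demo.launch()
```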
@@ -441,14 +441,19 @@ def update_click_state(click_state, caption, click_mode):
 else:
 raise NotImplementedError
 
-def chat_input_callback(*args):
-visual_chatgpt, chat_input, click_state, state, aux_state = args
+async def chat_input_callback(*args):
+visual_chatgpt, chat_input, click_state, state, aux_state ,language , autoplay = args
 if visual_chatgpt is not None:
-
+state, _, aux_state, _ = visual_chatgpt.run_text(chat_input, state, aux_state)
+last_text, last_response = state[-1]
+print("last response",last_response)
+audio = await texttospeech(last_response,language,autoplay)
+return state, state, aux_state,audio
 else:
 response = "Text refiner is not initilzed, please input openai api key."
 state = state + [(chat_input, response)]
-
+audio = await texttospeech(response,language,autoplay)
+return state, state, None,audio
 
 
 
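`chat_input_callback` becomes an async handler that updates the chat state and also returns an audio payload; Gradio accepts `async def` event handlers, so the awaited TTS call does not block other requests. The sketch below shows the rough shape under stated assumptions: `texttospeech` is a stub standing in for the app's real TTS helper (not shown in this diff), and `visual_chatgpt.run_text` is assumed to return the tuple unpacked above.

```python
import asyncio

async def texttospeech(text, language, autoplay):
    # Stand-in for the app's TTS helper; the real one presumably returns an
    # audio payload suitable for a gr.Audio component.
    await asyncio.sleep(0)
    return None

async def chat_input_callback(visual_chatgpt, chat_input, click_state, state, aux_state, language, autoplay):
    if visual_chatgpt is not None:
        # run_text is assumed to return (state, ..., aux_state, ...) as in the diff.
        state, _, aux_state, _ = visual_chatgpt.run_text(chat_input, state, aux_state)
        _, last_response = state[-1]
    else:
        last_response = "Text refiner is not initialized, please input an OpenAI API key."
        state = state + [(chat_input, last_response)]
        aux_state = None
    audio = await texttospeech(last_response, language, autoplay)
    # Order matches the event wiring: [chatbot, state, aux_state, output_audio].
    return state, state, aux_state, audio
```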
@@ -777,6 +782,7 @@ def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuali
 
 prompt = get_sketch_prompt(mask)
 boxes = prompt['input_boxes']
+boxes = boxes[0]
 
 controls = {'length': length,
 'sentiment': sentiment,
@@ -797,18 +803,23 @@ def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuali
 
 enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
 out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki)[0]
+
+# if visual_chatgpt is not None:
+# print('inference_click: add caption to chatGPT memory')
+# new_crop_save_path = get_new_image_name('chat_image', func_name='crop')
 
 # Update components and states
 state.append((f'Box: {boxes}', None))
-state.append((None, f'
+state.append((None, f'{out["generated_captions"]["raw_caption"]}'))
 text = out['generated_captions']['raw_caption']
 input_mask = np.array(out['mask'].convert('P'))
-image_input = mask_painter(np.array(image_input), input_mask)
-
-
+# image_input = mask_painter(np.array(image_input), input_mask, background_alpha=0 )
+image_input = Image.fromarray(np.array(image_input))
+draw = ImageDraw.Draw(image_input)
+draw.rectangle(boxes, outline='red', width=2)
 
-fake_click_index = (int((boxes[0][0] + boxes[0][2]) / 2), int((boxes[0][1] + boxes[0][3]) / 2))
-image_input = create_bubble_frame(image_input, "", fake_click_index, input_mask)
+# fake_click_index = (int((boxes[0][0] + boxes[0][2]) / 2), int((boxes[0][1] + boxes[0][3]) / 2))
+# image_input = create_bubble_frame(image_input, "", fake_click_index, input_mask)
 
 yield state, state, image_input
 
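The trajectory path now skips `mask_painter` and `create_bubble_frame` and simply outlines the prompt box with PIL's `ImageDraw`. A minimal sketch of that overlay on a synthetic image (the image and box coordinates are placeholders):

```python
import numpy as np
from PIL import Image, ImageDraw

image_input = Image.new("RGB", (256, 256), "white")  # stand-in for the uploaded image
box = [40, 60, 200, 180]                              # [x0, y0, x1, y1], like boxes above

# Mirror the diff's round-trip through numpy before drawing.
image_input = Image.fromarray(np.array(image_input))
draw = ImageDraw.Draw(image_input)
draw.rectangle(box, outline="red", width=2)
image_input.save("boxed.png")
```

Because `boxes = boxes[0]` is applied earlier in this commit, `draw.rectangle(boxes, ...)` receives a single `[x0, y0, x1, y1]` box rather than a list of boxes.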
@@ -819,10 +830,10 @@ def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuali
 new_cap = refined_caption['caption']
 if refined_caption['wiki']:
 state = state + [(None, "Wiki: {}".format(refined_caption['wiki']))]
-state = state + [(None, f"
-refined_image_input = create_bubble_frame(origin_image_input, new_cap, fake_click_index, input_mask)
+state = state + [(None, f"{new_cap}")]
+# refined_image_input = create_bubble_frame(origin_image_input, new_cap, fake_click_index, input_mask)
 
-yield state, state,
+yield state, state, image_input
 
 def clear_chat_memory(visual_chatgpt, keep_global=False):
 if visual_chatgpt is not None:
@@ -852,26 +863,16 @@ def export_chat_log(chat_state):
 return None
 
 
-def cap_everything(
-
-model = build_caption_anything_with_models(
-args,
-api_key="",
-captioner=shared_captioner,
-sam_model=shared_sam_model,
-ocr_reader=shared_ocr_reader,
-text_refiner=text_refiner,
-session_id=iface.app_id
-)
-paragraph = model.inference_cap_everything(image_input, verbose=True)
+async def cap_everything(paragraph, visual_chatgpt,language,autoplay):
+
 # state = state + [(None, f"Caption Everything: {paragraph}")]
 Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
 AI_prompt = "Received."
 visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
 visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
 # waveform_visual, audio_output=tts.predict(paragraph, input_language, input_audio, input_mic, use_mic, agree)
-
-return paragraph,
+audio_output=await texttospeech(paragraph,language,autoplay)
+return paragraph,audio_output
 
 def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragraph):
 
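`cap_everything` no longer rebuilds the captioning model; it now takes the precomputed `paragraph`, seeds the agent memory, and synthesizes audio, matching the two-element outputs wiring added later (`[paragraph_output, output_audio]`). A compressed, self-contained sketch with a stubbed `texttospeech` and a dummy agent object; the real prompt text is longer than shown here.

```python
import asyncio
from types import SimpleNamespace

async def texttospeech(text, language, autoplay):
    await asyncio.sleep(0)  # placeholder for the app's real TTS helper
    return None

async def cap_everything(paragraph, visual_chatgpt, language, autoplay):
    # Seed the agent's memory with the paragraph so later tool calls can use it.
    human_prompt = (
        f"\nThe description of the image with path {visual_chatgpt.current_image} is:\n"
        f"{paragraph}\nIf you understand, say \"Received\".\n"
    )
    visual_chatgpt.global_prompt = human_prompt + "AI: Received."
    visual_chatgpt.agent.memory.buffer += visual_chatgpt.global_prompt
    audio_output = await texttospeech(paragraph, language, autoplay)
    # Two return values line up with outputs=[paragraph_output, output_audio].
    return paragraph, audio_output

# Tiny smoke test with a dummy agent object.
dummy = SimpleNamespace(
    current_image="chat_image/example.png",
    global_prompt="",
    agent=SimpleNamespace(memory=SimpleNamespace(buffer="")),
)
print(asyncio.run(cap_everything("A small test paragraph.", dummy, "English", False)))
```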
@@ -1038,10 +1039,23 @@ def create_ui():
 clear_button_image = gr.Button(value="Clear Image", interactive=True)
 
 with gr.Tab("Trajectory (beta)") as traj_tab:
-sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=
+sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=10,
 elem_id="image_sketcher")
 with gr.Row():
+clear_button_sketcher = gr.Button(value="Clear Sketch", interactive=True)
 submit_button_sketcher = gr.Button(value="Submit", interactive=True)
+with gr.Row(scale=1.0):
+with gr.Row(scale=0.8):
+focus_type_sketch = gr.Radio(
+choices=["CFV-D", "CFV-DA", "CFV-DAI","PFV-DDA"],
+value="CFV-D",
+label="Information Type",
+interactive=True)
+Input_sketch = gr.Radio(
+choices=["Trace+Seg", "Trace"],
+value="Trace+Seg",
+label="Caption Type",
+interactive=True)
 
 with gr.Column(visible=False) as modules_need_gpt1:
 with gr.Row(scale=1.0):
@@ -1070,16 +1084,16 @@ def create_ui():
 enable_wiki = gr.Radio(
 choices=["Yes", "No"],
 value="No",
-label="
+label="Expert",
 interactive=True)
 
-
-
+with gr.Column(visible=True) as modules_not_need_gpt3:
+gr.Examples(
 examples=examples,
 inputs=[example_image],
 )
 
-
+
 
 
 with gr.Column(scale=0.5):
@@ -1108,7 +1122,7 @@ def create_ui():
 
 with gr.Column() as modules_need_gpt0:
 with gr.Column(visible=False,scale=1.0) as modules_need_gpt2:
-paragraph_output = gr.Textbox(lines=
+paragraph_output = gr.Textbox(lines=16, label="Describe Everything", max_lines=16)
 cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)
 
 with gr.Column(visible=False) as modules_not_need_gpt2:
@@ -1146,7 +1160,7 @@ def create_ui():
 # this part is for 3d generate.
 ###############################################################################
 
-with gr.Row(variant="panel"):
+with gr.Row(variant="panel") as d3_model:
 with gr.Column():
 with gr.Row():
 input_image = gr.Image(
@@ -1268,19 +1282,27 @@ def create_ui():
 outputs=[input_text, input_language, input_audio, input_mic, use_mic, agree, output_waveform, output_audio],
 queue=False
 )
+
+clear_button_sketcher.click(
+lambda x: (x),
+[origin_image],
+[sketcher_input],
+queue=False,
+show_progress=False
+)
 
 
 openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
 outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
-modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay])
+modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay,d3_model])
 enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
 outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
 modules_not_need_gpt,
-modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay])
+modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay,d3_model])
 disable_chatGPT_button.click(init_wo_openai_api_key,
 outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
 modules_not_need_gpt,
-modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay])
+modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay,d3_model])
 
 enable_chatGPT_button.click(
 lambda: (None, [], [], [[], [], []], "", "", ""),
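The new Clear Sketch handler is an identity lambda that copies `origin_image` back into the sketcher. A minimal sketch of that reset pattern with hypothetical components:

```python
# "Reset one component from another" pattern used by the Clear Sketch button.
import gradio as gr

with gr.Blocks() as demo:
    origin_image = gr.Image(label="Original")
    sketch_pad = gr.Image(label="Sketch pad", interactive=True)
    clear_btn = gr.Button("Clear Sketch")
    # The lambda just forwards its input; Gradio writes it into the output component.
    clear_btn.click(lambda x: x, [origin_image], [sketch_pad], queue=False, show_progress=False)

if __name__ == "__main__":
    demo.launch()
```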
@@ -1297,8 +1319,8 @@ def create_ui():
 show_progress=False
 )
 
-cap_everything_button.click(cap_everything, [
-[paragraph_output,
+cap_everything_button.click(cap_everything, [paragraph, visual_chatgpt, language,auto_play],
+[paragraph_output,output_audio])
 
 clear_button_click.click(
 lambda x: ([[], [], []], x),
@@ -1348,11 +1370,11 @@ def create_ui():
 sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
 [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
 image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base,paragraph])
-chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state],
-[chatbot, state, aux_state])
+chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
+[chatbot, state, aux_state,output_audio])
 chat_input.submit(lambda: "", None, chat_input)
-submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state],
-[chatbot, state, aux_state])
+submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
+[chatbot, state, aux_state,output_audio])
 submit_button_text.click(lambda: "", None, chat_input)
 example_image.change(upload_callback, [example_image, state, visual_chatgpt, openai_api_key],
 [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,