Niki Zhang committed: Update app.py
app.py CHANGED
@@ -1,3 +1,4 @@
+from io import BytesIO
 from math import inf
 import os
 import base64
@@ -9,6 +10,7 @@ import requests
 from packaging import version
 from PIL import Image, ImageDraw
 import functools
+import emoji
 from langchain.llms.openai import OpenAI
 from caption_anything.model import CaptionAnything
 from caption_anything.utils.image_editing_utils import create_bubble_frame
@@ -20,7 +22,10 @@ from caption_anything.segmenter import build_segmenter
 from caption_anything.utils.chatbot import ConversationBot, build_chatbot_tools, get_new_image_name
 from segment_anything import sam_model_registry
 import easyocr
-import
+import re
+import edge_tts
+import asyncio
+# import tts
 
 ###############################################################################
 ############# this part is for 3D generate #############
@@ -279,9 +284,25 @@ def make3d(images):
 ############# above part is for 3D generate #############
 ###############################################################################
 
+css = """
+#warning {background-color: #FFCCCB}
+.chatbot {
+    padding: 0 !important;
+    margin: 0 !important;
+}
+"""
+filtered_language_dict = {
+    'English': 'en-US-JennyNeural',
+    'Chinese': 'zh-CN-XiaoxiaoNeural',
+    'French': 'fr-FR-DeniseNeural',
+    'Spanish': 'es-MX-DaliaNeural',
+    'Arabic': 'ar-SA-ZariyahNeural',
+    'Portuguese': 'pt-BR-FranciscaNeural',
+    'Cantonese': 'zh-HK-HiuGaaiNeural'
+}
 
 gpt_state = 0
-
+VOICE = "en-GB-SoniaNeural"
 article = """
 <div style='margin:20px auto;'>
 <p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
@@ -374,15 +395,15 @@ def init_openai_api_key(api_key=""):
 
         global gpt_state
         gpt_state=1
-        return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=
+        return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*2
     else:
         gpt_state=0
-        return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']
+        return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*2
 
 def init_wo_openai_api_key():
     global gpt_state
     gpt_state=0
-    return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]
+    return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*2
 
 def get_click_prompt(chat_input, click_state, click_mode):
     inputs = json.loads(chat_input)
@@ -467,7 +488,12 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
     # artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
     paragraph = get_image_gpt(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
 
-    state = [
+    state = [
+        (
+            None,
+            f"🤖 Hi, I am EyeSee. Let's explore this painting {name} together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant information."
+        )
+    ]
 
     return state, state, image_input, click_state, image_input, image_input, image_input, image_embedding, \
         original_size, input_size, f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Material: {material}",f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Material: {material}",paragraph
@@ -539,12 +565,11 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
 
 
 
-def submit_caption(image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
+async def submit_caption(image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
                    out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
-
+                   autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path):
     print("state",state)
-
-    global pre_click_index
+
     click_index = click_index_state
 
     # if pre_click_index==click_index:
@@ -553,7 +578,6 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
     # else:
    #     pre_click_index = click_index
     print("click_index",click_index)
-    print("pre_click_index",pre_click_index)
     print("input_points_state",input_points_state)
     print("input_labels_state",input_labels_state)
 
@@ -630,29 +654,34 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
         focus_info=get_image_gpt(openai_api_key,new_crop_save_path,prompt)
         if focus_info.startswith('"') and focus_info.endswith('"'):
             focus_info=focus_info[1:-1]
-
+        focus_info=focus_info.replace('#', '')
         # state = state + [(None, f"Wiki: {paragraph}")]
         state = state + [(None, f"{focus_info}")]
         print("new_cap",focus_info)
+        read_info = re.sub(r'[#[\]!*]','',focus_info)
+        read_info = emoji.replace_emoji(read_info,replace="")
+        print("read info",read_info)
 
         # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
         #                                           input_points=input_points, input_labels=input_labels)
         try:
-
+            audio_output = await texttospeech(read_info, language,autoplay)
             # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
-            return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,
+            return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
 
         except Exception as e:
             state = state + [(None, f"Error during TTS prediction: {str(e)}")]
             print(f"Error during TTS prediction: {str(e)}")
             # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
-            return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,
+            return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
 
     else:
         try:
-
+            audio_output = await texttospeech(focus_info, language, autoplay)
+            # waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
+            waveform_visual, audio_output=None,None
             # return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
-            return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,
+            return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, audio_output
 
         except Exception as e:
             state = state + [(None, f"Error during TTS prediction: {str(e)}")]
@@ -834,7 +863,8 @@ def cap_everything(image_input, visual_chatgpt, text_refiner,input_language, inp
     AI_prompt = "Received."
     visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
     visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
-    waveform_visual, audio_output=tts.predict(paragraph, input_language, input_audio, input_mic, use_mic, agree)
+    # waveform_visual, audio_output=tts.predict(paragraph, input_language, input_audio, input_mic, use_mic, agree)
+    waveform_visual, audio_output=None,None
     return paragraph,waveform_visual, audio_output
 
 def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragraph):
@@ -877,6 +907,37 @@ def get_style():
 
     return style
 
+def handle_like_dislike(like_data, like_state, dislike_state):
+    if like_data.liked:
+        if like_data.index not in like_state:
+            like_state.append(like_data.index)
+            message = f"Liked: {like_data.value} at index {like_data.index}"
+        else:
+            message = "You already liked this item"
+    else:
+        if like_data.index not in dislike_state:
+            dislike_state.append(like_data.index)
+            message = f"Disliked: {like_data.value} at index {like_data.index}"
+        else:
+            message = "You already disliked this item"
+
+    return like_state, dislike_state
+
+async def texttospeech(text,language,autoplay):
+    voice=filtered_language_dict[language]
+    communicate = edge_tts.Communicate(text, voice)
+    file_path="output.wav"
+    await communicate.save(file_path)
+    with open(file_path, "rb") as audio_file:
+        audio_bytes = BytesIO(audio_file.read())
+    audio = base64.b64encode(audio_bytes.read()).decode("utf-8")
+    print("tts....")
+    audio_style = 'style="width:250px;"'
+    if autoplay:
+        audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls autoplay {audio_style}></audio>'
+    else:
+        audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls {audio_style}></audio>'
+    return audio_player
 
 def create_ui():
     title = """<p><h1 align="center">EyeSee Anything in Art</h1></p>
@@ -884,17 +945,20 @@ def create_ui():
     description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """
 
     examples = [
-        ["test_images/
-        ["test_images/
-        ["test_images
-        ["test_images/
-        ["test_images/
-        ["test_images/
+        ["test_images/pearl.jpg"],
+        ["test_images/ambass.jpg"],
+        ["test_images/Picture0.png"],
+        ["test_images/Picture1.png"],
+        ["test_images/Picture2.png"],
+        ["test_images/Picture3.png"],
+        ["test_images/Picture4.png"],
+        ["test_images/Picture5.png"],
 
     ]
 
     with gr.Blocks(
-        css=get_style()
+        css=get_style(),
+        theme=gr.themes.Base()
     ) as iface:
         state = gr.State([])
         out_state = gr.State(None)
@@ -914,6 +978,8 @@ def create_ui():
         input_labels_state = gr.State([])
         new_crop_save_path = gr.State(None)
         image_input_nobackground = gr.State(None)
+        like_state=gr.State([])
+        dislike_state=gr.State([])
 
 
 
@@ -924,19 +990,15 @@ def create_ui():
         with gr.Column(scale=1.0):
             with gr.Column(visible=False) as modules_not_need_gpt:
                 with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
-                    image_intro=gr.HTML()
                     image_input_base = gr.Image(type="pil", interactive=True, elem_id="image_upload")
                     example_image = gr.Image(type="pil", interactive=False, visible=False)
                     with gr.Row():
                         name_label_base = gr.Button(value="Name: ")
                         artist_label_base = gr.Button(value="Artist: ")
                         year_label_base = gr.Button(value="Year: ")
-                        material_label_base = gr.Button(value="Material: ")
+                        material_label_base = gr.Button(value="Material: ")
 
-
-
                 with gr.Tab("Click") as click_tab:
-                    image_intro_click=gr.HTML()
                     image_input = gr.Image(type="pil", interactive=True, elem_id="image_upload")
                     example_image = gr.Image(type="pil", interactive=False, visible=False)
                     with gr.Row():
@@ -945,11 +1007,14 @@ def create_ui():
                         year_label = gr.Button(value="Year: ")
                         material_label = gr.Button(value="Material: ")
                     with gr.Row(scale=1.0):
-
-
-
-
-
+                        with gr.Row(scale=0.8):
+                            focus_type = gr.Radio(
+                                choices=["CFV-D", "CFV-DA", "CFV-DAI","PFV-DDA"],
+                                value="CFV-D",
+                                label="Information Type",
+                                interactive=True)
+                        with gr.Row(scale=0.2):
+                            submit_button_click=gr.Button(value="Submit", interactive=True,variant='primary',size="sm")
                     with gr.Row(scale=1.0):
                         with gr.Row(scale=0.4):
                             point_prompt = gr.Radio(
@@ -965,53 +1030,62 @@ def create_ui():
                         with gr.Row(scale=0.4):
                             clear_button_click = gr.Button(value="Clear Clicks", interactive=True)
                             clear_button_image = gr.Button(value="Clear Image", interactive=True)
-
-                with gr.Tab("Trajectory (beta)"):
+
+                with gr.Tab("Trajectory (beta)") as traj_tab:
                     sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=20,
                                                    elem_id="image_sketcher")
                     with gr.Row():
                         submit_button_sketcher = gr.Button(value="Submit", interactive=True)
 
-
-
-
-
-
-
-
-
-
-
-
+            with gr.Column(visible=False) as modules_need_gpt1:
+                with gr.Row(scale=1.0):
+                    sentiment = gr.Radio(
+                        choices=["Positive", "Natural", "Negative"],
+                        value="Natural",
+                        label="Sentiment",
+                        interactive=True,
+                    )
+                with gr.Row(scale=1.0):
+                    factuality = gr.Radio(
+                        choices=["Factual", "Imagination"],
+                        value="Factual",
+                        label="Factuality",
+                        interactive=True,
+                    )
+                    length = gr.Slider(
+                        minimum=10,
+                        maximum=80,
+                        value=10,
+                        step=1,
+                        interactive=True,
+                        label="Generated Caption Length",
+                    )
+                    # 是否启用wiki内容整合到caption中
+                    enable_wiki = gr.Radio(
+                        choices=["Yes", "No"],
+                        value="No",
+                        label="Enable Wiki",
+                        interactive=True)
+
             with gr.Row(scale=1.0):
-
-                choices=["Factual", "Imagination"],
-                value="Factual",
-                label="Factuality",
-                interactive=True,
-                )
-                length = gr.Slider(
-                minimum=10,
-                maximum=80,
-                value=10,
-                step=1,
-                interactive=True,
-                label="Generated Caption Length",
-                )
-                # 是否启用wiki内容整合到caption中
-                enable_wiki = gr.Radio(
-                choices=["Yes", "No"],
-                value="No",
-                label="Enable Wiki",
-                interactive=True)
-
-                # with gr.Column(visible=True) as modules_not_need_gpt3:
-                gr.Examples(
+                gr.Examples(
                     examples=examples,
                     inputs=[example_image],
                 )
 
+                # with gr.Column(visible=True) as modules_not_need_gpt3:
+
+
             with gr.Column(scale=0.5):
+                with gr.Row(align="right",visible=False) as language_select:
+                    language = gr.Dropdown(
+                        ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
+                        value="English", label="Language", interactive=True)
+
+                with gr.Row(align="right",visible=False) as autoplay:
+                    auto_play = gr.Checkbox(label="Check to autoplay audio", value=False,scale=0.4)
+                    output_audio = gr.HTML(label="Synthesised Audio",scale=0.6)
+
                 with gr.Column(visible=True) as module_key_input:
                     openai_api_key = gr.Textbox(
                         placeholder="Input openAI API key",
@@ -1027,39 +1101,39 @@ def create_ui():
                     notification_box = gr.Textbox(lines=1, label="Notification", max_lines=5, show_label=False)
 
                 with gr.Column():
-                    with gr.Column(visible=False) as modules_need_gpt2:
+                    with gr.Column(visible=False,scale=1.0) as modules_need_gpt2:
                         paragraph_output = gr.Textbox(lines=7, label="Describe Everything", max_lines=7)
-                    with gr.Column(visible=False) as modules_need_gpt0:
+                    with gr.Column(visible=False,scale=0.2) as modules_need_gpt0:
                         cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)
 
                 with gr.Column(visible=False) as modules_not_need_gpt2:
-
-
-
-
-
-
-
-
-
-
-
+                    with gr.Blocks(css=css):
+                        chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True).style(height=600, scale=0.5)
+                        with gr.Column(visible=False) as modules_need_gpt3:
+                            chat_input = gr.Textbox(show_label=False, placeholder="Enter text and press Enter").style(
+                                container=False)
+                            with gr.Row():
+                                clear_button_text = gr.Button(value="Clear Text", interactive=True)
+                                submit_button_text = gr.Button(value="Send", interactive=True, variant="primary")
+                            with gr.Row():
+                                export_button = gr.Button(value="Export Chat Log", interactive=True, variant="primary")
+                            with gr.Row():
+                                chat_log_file = gr.File(label="Download Chat Log")
 
-                with gr.Column(scale=0.5):
                 # TTS interface hidden initially
-
-
-
-
-
-
-
-
-
+                with gr.Column(visible=False) as tts_interface:
+                    input_text = gr.Textbox(label="Text Prompt", value="Hello, World !, here is an example of light voice cloning. Try to upload your best audio samples quality")
+                    input_language = gr.Dropdown(label="Language", choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"], value="en")
+                    input_audio = gr.Audio(label="Reference Audio", type="filepath", value="examples/female.wav")
+                    input_mic = gr.Audio(source="microphone", type="filepath", label="Use Microphone for Reference")
+                    use_mic = gr.Checkbox(label="Check to use Microphone as Reference", value=False)
+                    agree = gr.Checkbox(label="Agree", value=True)
+                    output_waveform = gr.Video(label="Waveform Visual")
+                    # output_audio = gr.HTML(label="Synthesised Audio")
 
-
-
-
+                    with gr.Row():
+                        submit_tts = gr.Button(value="Submit", interactive=True)
+                        clear_tts = gr.Button(value="Clear", interactive=True)
 
 
 
@@ -1154,6 +1228,8 @@ def create_ui():
 
 
         mv_images = gr.State()
+
+        chatbot.like(handle_like_dislike, inputs=[like_state, dislike_state], outputs=[like_state, dislike_state])
 
         submit.click(fn=check_input_image, inputs=[new_crop_save_path], outputs=[processed_image]).success(
             fn=generate_mvs,
@@ -1174,12 +1250,12 @@ def create_ui():
         def clear_tts_fields():
             return [gr.update(value=""), gr.update(value=""), None, None, gr.update(value=False), gr.update(value=True), None, None]
 
-        submit_tts.click(
-
-
-
-
-        )
+        # submit_tts.click(
+        #     tts.predict,
+        #     inputs=[input_text, input_language, input_audio, input_mic, use_mic, agree],
+        #     outputs=[output_waveform, output_audio],
+        #     queue=True
+        # )
 
         clear_tts.click(
             clear_tts_fields,
@@ -1191,15 +1267,15 @@ def create_ui():
 
         openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
                               outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
-                                       modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box])
+                                       modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay])
         enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
                                     outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
                                              modules_not_need_gpt,
-                                             modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box])
+                                             modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay])
         disable_chatGPT_button.click(init_wo_openai_api_key,
                                      outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
                                               modules_not_need_gpt,
-                                              modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box])
+                                              modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay])
 
         enable_chatGPT_button.click(
             lambda: (None, [], [], [[], [], []], "", "", ""),
@@ -1299,6 +1375,7 @@ def create_ui():
             return [gr.update(visible=False)]*4
 
 
+        traj_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
         click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2])
         base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1])
 
@@ -1322,26 +1399,16 @@ def create_ui():
             inputs=[
                 image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
                 out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
-
+                auto_play,paragraph,focus_type,openai_api_key,new_crop_save_path
             ],
             outputs=[
                 chatbot, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,
-
+                output_audio
             ],
             show_progress=True,
             queue=True
         )
-
-
-        focus_type.change(
-            lambda x: ([[], [], []], x),
-            [image_input_nobackground],
-            [click_state, image_input],
-            queue=False,
-            show_progress=False
-        )
-
-
+
 
         submit_button_sketcher.click(
             inference_traject,
@@ -1370,4 +1437,4 @@ def create_ui():
 if __name__ == '__main__':
     iface = create_ui()
     iface.queue(concurrency_count=5, api_open=False, max_size=10)
-    iface.launch(server_name="0.0.0.0", enable_queue=True)
+    iface.launch(server_name="0.0.0.0", enable_queue=True)
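Note on the TTS change: the commit swaps the Coqui-based tts.predict calls for an async helper built on edge_tts, which saves the synthesized speech to a file and returns an HTML audio player with the sound embedded as a base64 data URI. A minimal standalone sketch of that same pattern is below; the sample text is illustrative, while the voice name and output path mirror values used in the app.

import asyncio
import base64
import edge_tts

async def synthesize(text, voice="en-US-JennyNeural", path="output.wav"):
    # edge_tts.Communicate streams speech from the Edge TTS service; save() writes it to disk.
    await edge_tts.Communicate(text, voice).save(path)
    with open(path, "rb") as f:
        audio_b64 = base64.b64encode(f.read()).decode("utf-8")
    # Embed the audio as a data URI, as the app's texttospeech() helper does for its gr.HTML output.
    return f'<audio src="data:audio/wav;base64,{audio_b64}" controls></audio>'

if __name__ == "__main__":
    print(asyncio.run(synthesize("Hello from EyeSee.")))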
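Note on the new visibility toggles: init_openai_api_key and init_wo_openai_api_key now return two extra gr.update(visible=...) values, and language_select and autoplay are appended to the outputs of the three handlers that call them. In Gradio, return values are matched to the outputs list by position, so every return path has to grow by the same number of entries. The sketch below illustrates that pattern only; the component names are placeholders, not the app's.

import gradio as gr

def toggle_extras(api_key):
    show = bool(api_key)
    # One gr.update per entry in outputs=[...], in the same order.
    return gr.update(visible=show), gr.update(visible=show)

with gr.Blocks() as demo:
    api_key = gr.Textbox(label="API key")
    with gr.Row(visible=False) as language_row:
        gr.Dropdown(["English", "Chinese"], value="English", label="Language")
    with gr.Row(visible=False) as autoplay_row:
        gr.Checkbox(label="Autoplay audio", value=False)
    # Both rows become visible once a key is submitted.
    api_key.submit(toggle_extras, inputs=[api_key], outputs=[language_row, autoplay_row])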