Niki Zhang committed: Update app.py

app.py CHANGED
@@ -395,15 +395,15 @@ def init_openai_api_key(api_key=""):
 
 global gpt_state
 gpt_state=1
-return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*
+return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=False)]*3 + [text_refiner, visual_chatgpt, None]+[gr.update(visible=True)]*3
 else:
 gpt_state=0
-return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*
+return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']+[gr.update(visible=False)]*3
 
 def init_wo_openai_api_key():
 global gpt_state
 gpt_state=0
-return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*
+return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]+[gr.update(visible=False)]*3
 
 def get_click_prompt(chat_input, click_state, click_mode):
 inputs = json.loads(chat_input)
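These return statements emit one `gr.update(...)` per output component, so the list length must stay in sync with the `outputs=[...]` wiring further down (which this commit extends with `d3_model`); a mismatch surfaces as a runtime error when the event fires. Below is a minimal, self-contained sketch of that pattern with hypothetical component names, not the app's real layout.

```python
# Sketch of the positional gr.update pattern (hypothetical components).
import gradio as gr

def toggle_panels(api_key):
    has_key = bool(api_key.strip())
    # One update per entry in `outputs`, in the same order.
    return [gr.update(visible=has_key), gr.update(visible=not has_key)]

with gr.Blocks() as demo:
    key_box = gr.Textbox(label="OpenAI API key")
    with gr.Column(visible=False) as gpt_panel:
        gr.Markdown("Modules that need a key")
    with gr.Column(visible=True) as no_key_panel:
        gr.Markdown("Modules that work without a key")
    key_box.submit(toggle_panels, inputs=[key_box], outputs=[gpt_panel, no_key_panel])

if __name__ == "__main__":
    demo.launch()
```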
@@ -441,14 +441,19 @@ def update_click_state(click_state, caption, click_mode):
 else:
 raise NotImplementedError
 
-def chat_input_callback(*args):
-visual_chatgpt, chat_input, click_state, state, aux_state = args
+async def chat_input_callback(*args):
+visual_chatgpt, chat_input, click_state, state, aux_state ,language , autoplay = args
 if visual_chatgpt is not None:
-
+state, _, aux_state, _ = visual_chatgpt.run_text(chat_input, state, aux_state)
+last_text, last_response = state[-1]
+print("last response",last_response)
+audio = await texttospeech(last_response,language,autoplay)
+return state, state, aux_state,audio
 else:
 response = "Text refiner is not initilzed, please input openai api key."
 state = state + [(chat_input, response)]
-
+audio = await texttospeech(response,language,autoplay)
+return state, state, None,audio
 
 
 
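`chat_input_callback` becomes an async handler that updates the chat state and also returns an audio payload; Gradio accepts `async def` event handlers, so the awaited TTS call does not block other requests. The sketch below shows the rough shape under stated assumptions: `texttospeech` is a stub standing in for the app's real TTS helper (not shown in this diff), and `visual_chatgpt.run_text` is assumed to return the tuple unpacked above.

```python
import asyncio

async def texttospeech(text, language, autoplay):
    # Stand-in for the app's TTS helper; the real one presumably returns an
    # audio payload suitable for a gr.Audio component.
    await asyncio.sleep(0)
    return None

async def chat_input_callback(visual_chatgpt, chat_input, click_state, state, aux_state, language, autoplay):
    if visual_chatgpt is not None:
        # run_text is assumed to return (state, ..., aux_state, ...) as in the diff.
        state, _, aux_state, _ = visual_chatgpt.run_text(chat_input, state, aux_state)
        _, last_response = state[-1]
    else:
        last_response = "Text refiner is not initialized, please input an OpenAI API key."
        state = state + [(chat_input, last_response)]
        aux_state = None
    audio = await texttospeech(last_response, language, autoplay)
    # Order matches the event wiring: [chatbot, state, aux_state, output_audio].
    return state, state, aux_state, audio
```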
@@ -777,6 +782,7 @@ def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuali
 
 prompt = get_sketch_prompt(mask)
 boxes = prompt['input_boxes']
+boxes = boxes[0]
 
 controls = {'length': length,
 'sentiment': sentiment,
@@ -797,18 +803,23 @@ def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuali
 
 enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
 out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki)[0]
+
+# if visual_chatgpt is not None:
+# print('inference_click: add caption to chatGPT memory')
+# new_crop_save_path = get_new_image_name('chat_image', func_name='crop')
 
 # Update components and states
 state.append((f'Box: {boxes}', None))
-state.append((None, f'
+state.append((None, f'{out["generated_captions"]["raw_caption"]}'))
 text = out['generated_captions']['raw_caption']
 input_mask = np.array(out['mask'].convert('P'))
-image_input = mask_painter(np.array(image_input), input_mask)
-
-
+# image_input = mask_painter(np.array(image_input), input_mask, background_alpha=0 )
+image_input = Image.fromarray(np.array(image_input))
+draw = ImageDraw.Draw(image_input)
+draw.rectangle(boxes, outline='red', width=2)
 
-fake_click_index = (int((boxes[0][0] + boxes[0][2]) / 2), int((boxes[0][1] + boxes[0][3]) / 2))
-image_input = create_bubble_frame(image_input, "", fake_click_index, input_mask)
+# fake_click_index = (int((boxes[0][0] + boxes[0][2]) / 2), int((boxes[0][1] + boxes[0][3]) / 2))
+# image_input = create_bubble_frame(image_input, "", fake_click_index, input_mask)
 
 yield state, state, image_input
 
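The trajectory path now skips `mask_painter` and `create_bubble_frame` and simply outlines the prompt box with PIL's `ImageDraw`. A minimal sketch of that overlay on a synthetic image (the image and box coordinates are placeholders):

```python
import numpy as np
from PIL import Image, ImageDraw

image_input = Image.new("RGB", (256, 256), "white")  # stand-in for the uploaded image
box = [40, 60, 200, 180]                              # [x0, y0, x1, y1], like boxes above

# Mirror the diff's round-trip through numpy before drawing.
image_input = Image.fromarray(np.array(image_input))
draw = ImageDraw.Draw(image_input)
draw.rectangle(box, outline="red", width=2)
image_input.save("boxed.png")
```

Because `boxes = boxes[0]` is applied earlier in this commit, `draw.rectangle(boxes, ...)` receives a single `[x0, y0, x1, y1]` box rather than a list of boxes.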
@@ -819,10 +830,10 @@ def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuali
 new_cap = refined_caption['caption']
 if refined_caption['wiki']:
 state = state + [(None, "Wiki: {}".format(refined_caption['wiki']))]
-state = state + [(None, f"
-refined_image_input = create_bubble_frame(origin_image_input, new_cap, fake_click_index, input_mask)
+state = state + [(None, f"{new_cap}")]
+# refined_image_input = create_bubble_frame(origin_image_input, new_cap, fake_click_index, input_mask)
 
-yield state, state,
+yield state, state, image_input
 
 def clear_chat_memory(visual_chatgpt, keep_global=False):
 if visual_chatgpt is not None:
@@ -852,26 +863,16 @@ def export_chat_log(chat_state):
 return None
 
 
-def cap_everything(
-
-model = build_caption_anything_with_models(
-args,
-api_key="",
-captioner=shared_captioner,
-sam_model=shared_sam_model,
-ocr_reader=shared_ocr_reader,
-text_refiner=text_refiner,
-session_id=iface.app_id
-)
-paragraph = model.inference_cap_everything(image_input, verbose=True)
+async def cap_everything(paragraph, visual_chatgpt,language,autoplay):
+
 # state = state + [(None, f"Caption Everything: {paragraph}")]
 Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
 AI_prompt = "Received."
 visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
 visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
 # waveform_visual, audio_output=tts.predict(paragraph, input_language, input_audio, input_mic, use_mic, agree)
-
-return paragraph,
+audio_output=await texttospeech(paragraph,language,autoplay)
+return paragraph,audio_output
 
 def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragraph):
 
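`cap_everything` no longer rebuilds the captioning model; it now takes the precomputed `paragraph`, seeds the agent memory, and synthesizes audio, matching the two-element outputs wiring added later (`[paragraph_output, output_audio]`). A compressed, self-contained sketch with a stubbed `texttospeech` and a dummy agent object; the real prompt text is longer than shown here.

```python
import asyncio
from types import SimpleNamespace

async def texttospeech(text, language, autoplay):
    await asyncio.sleep(0)  # placeholder for the app's real TTS helper
    return None

async def cap_everything(paragraph, visual_chatgpt, language, autoplay):
    # Seed the agent's memory with the paragraph so later tool calls can use it.
    human_prompt = (
        f"\nThe description of the image with path {visual_chatgpt.current_image} is:\n"
        f"{paragraph}\nIf you understand, say \"Received\".\n"
    )
    visual_chatgpt.global_prompt = human_prompt + "AI: Received."
    visual_chatgpt.agent.memory.buffer += visual_chatgpt.global_prompt
    audio_output = await texttospeech(paragraph, language, autoplay)
    # Two return values line up with outputs=[paragraph_output, output_audio].
    return paragraph, audio_output

# Tiny smoke test with a dummy agent object.
dummy = SimpleNamespace(
    current_image="chat_image/example.png",
    global_prompt="",
    agent=SimpleNamespace(memory=SimpleNamespace(buffer="")),
)
print(asyncio.run(cap_everything("A small test paragraph.", dummy, "English", False)))
```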
@@ -1038,10 +1039,23 @@ def create_ui():
 clear_button_image = gr.Button(value="Clear Image", interactive=True)
 
 with gr.Tab("Trajectory (beta)") as traj_tab:
-sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=
+sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=10,
 elem_id="image_sketcher")
 with gr.Row():
+clear_button_sketcher = gr.Button(value="Clear Sketch", interactive=True)
 submit_button_sketcher = gr.Button(value="Submit", interactive=True)
+with gr.Row(scale=1.0):
+with gr.Row(scale=0.8):
+focus_type_sketch = gr.Radio(
+choices=["CFV-D", "CFV-DA", "CFV-DAI","PFV-DDA"],
+value="CFV-D",
+label="Information Type",
+interactive=True)
+Input_sketch = gr.Radio(
+choices=["Trace+Seg", "Trace"],
+value="Trace+Seg",
+label="Caption Type",
+interactive=True)
 
 with gr.Column(visible=False) as modules_need_gpt1:
 with gr.Row(scale=1.0):
@@ -1070,16 +1084,16 @@ def create_ui():
 enable_wiki = gr.Radio(
 choices=["Yes", "No"],
 value="No",
-label="
+label="Expert",
 interactive=True)
 
-
-
+with gr.Column(visible=True) as modules_not_need_gpt3:
+gr.Examples(
 examples=examples,
 inputs=[example_image],
 )
 
-
+
 
 
 with gr.Column(scale=0.5):
@@ -1108,7 +1122,7 @@ def create_ui():
 
 with gr.Column() as modules_need_gpt0:
 with gr.Column(visible=False,scale=1.0) as modules_need_gpt2:
-paragraph_output = gr.Textbox(lines=
+paragraph_output = gr.Textbox(lines=16, label="Describe Everything", max_lines=16)
 cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True)
 
 with gr.Column(visible=False) as modules_not_need_gpt2:
@@ -1146,7 +1160,7 @@ def create_ui():
 # this part is for 3d generate.
 ###############################################################################
 
-with gr.Row(variant="panel"):
+with gr.Row(variant="panel") as d3_model:
 with gr.Column():
 with gr.Row():
 input_image = gr.Image(
@@ -1268,19 +1282,27 @@ def create_ui():
 outputs=[input_text, input_language, input_audio, input_mic, use_mic, agree, output_waveform, output_audio],
 queue=False
 )
+
+clear_button_sketcher.click(
+lambda x: (x),
+[origin_image],
+[sketcher_input],
+queue=False,
+show_progress=False
+)
 
 
 openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
 outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
-modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay])
+modules_not_need_gpt2, tts_interface,module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay,d3_model])
 enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
 outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
 modules_not_need_gpt,
-modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay])
+modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay,d3_model])
 disable_chatGPT_button.click(init_wo_openai_api_key,
 outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
 modules_not_need_gpt,
-modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay])
+modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,language_select,autoplay,d3_model])
 
 enable_chatGPT_button.click(
 lambda: (None, [], [], [[], [], []], "", "", ""),
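The new Clear Sketch handler is an identity lambda that copies `origin_image` back into the sketcher. A minimal sketch of that reset pattern with hypothetical components:

```python
# "Reset one component from another" pattern used by the Clear Sketch button.
import gradio as gr

with gr.Blocks() as demo:
    origin_image = gr.Image(label="Original")
    sketch_pad = gr.Image(label="Sketch pad", interactive=True)
    clear_btn = gr.Button("Clear Sketch")
    # The lambda just forwards its input; Gradio writes it into the output component.
    clear_btn.click(lambda x: x, [origin_image], [sketch_pad], queue=False, show_progress=False)

if __name__ == "__main__":
    demo.launch()
```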
@@ -1297,8 +1319,8 @@ def create_ui():
 show_progress=False
 )
 
-cap_everything_button.click(cap_everything, [
-[paragraph_output,
+cap_everything_button.click(cap_everything, [paragraph, visual_chatgpt, language,auto_play],
+[paragraph_output,output_audio])
 
 clear_button_click.click(
 lambda x: ([[], [], []], x),
@@ -1348,11 +1370,11 @@ def create_ui():
 sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
 [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
 image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base,paragraph])
-chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state],
-[chatbot, state, aux_state])
+chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
+[chatbot, state, aux_state,output_audio])
 chat_input.submit(lambda: "", None, chat_input)
-submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state],
-[chatbot, state, aux_state])
+submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
+[chatbot, state, aux_state,output_audio])
 submit_button_text.click(lambda: "", None, chat_input)
 example_image.change(upload_callback, [example_image, state, visual_chatgpt, openai_api_key],
 [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,