Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -635,20 +635,25 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
|
|
635 |
state = state + [(None, f"{focus_info}")]
|
636 |
print("new_cap",focus_info)
|
637 |
|
638 |
-
refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
|
639 |
-
|
640 |
try:
|
641 |
-
waveform_visual, audio_output = tts.predict(focus_info
|
642 |
-
return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
|
|
|
|
|
643 |
except Exception as e:
|
644 |
state = state + [(None, f"Error during TTS prediction: {str(e)}")]
|
645 |
print(f"Error during TTS prediction: {str(e)}")
|
646 |
-
return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
|
|
|
647 |
|
648 |
else:
|
649 |
try:
|
650 |
waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
|
651 |
-
return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
|
|
|
|
|
652 |
except Exception as e:
|
653 |
state = state + [(None, f"Error during TTS prediction: {str(e)}")]
|
654 |
print(f"Error during TTS prediction: {str(e)}")
|
|
|
635 |
state = state + [(None, f"{focus_info}")]
|
636 |
print("new_cap",focus_info)
|
637 |
|
638 |
+
# refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
|
639 |
+
# input_points=input_points, input_labels=input_labels)
|
640 |
try:
|
641 |
+
waveform_visual, audio_output = tts.predict(focus_info, input_language, input_audio, input_mic, use_mic, agree)
|
642 |
+
# return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
|
643 |
+
return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
|
644 |
+
|
645 |
except Exception as e:
|
646 |
state = state + [(None, f"Error during TTS prediction: {str(e)}")]
|
647 |
print(f"Error during TTS prediction: {str(e)}")
|
648 |
+
# return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
|
649 |
+
return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
|
650 |
|
651 |
else:
|
652 |
try:
|
653 |
waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
|
654 |
+
# return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
|
655 |
+
return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
|
656 |
+
|
657 |
except Exception as e:
|
658 |
state = state + [(None, f"Error during TTS prediction: {str(e)}")]
|
659 |
print(f"Error during TTS prediction: {str(e)}")
|