Niki Zhang committed
Update app.py
app.py CHANGED
@@ -714,7 +714,7 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
     new_image_path = get_new_image_name('chat_image', func_name='upload')
     image_input.save(new_image_path)
     visual_chatgpt.current_image = new_image_path
-    paragraph =
+    paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
     # img_caption = model.captioner.inference(image_input, filter=False, args={'text_prompt':''})['caption']
     Human_prompt = f'\nHuman: The description of the image with path {new_image_path} is: {paragraph}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
     AI_prompt = "Received."
@@ -722,7 +722,7 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
     visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
     print("memory",visual_chatgpt.agent.memory)
     # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
-    parsed_data =
+    parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and material used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"material\": \"Material used in the painting\" }")
     parsed_data = json.loads(parsed_data.replace("'", "\""))
     name, artist, year, material= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["material"]
     # artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
@@ -832,7 +832,7 @@ async def submit_caption(state, text_refiner, length, sentiment, factuality, lan
     # if not args.disable_gpt and text_refiner:
     if not args.disable_gpt:
         print("new crop save",new_crop_save_path)
-        focus_info=
+        focus_info=get_gpt_response(openai_api_key,new_crop_save_path,prompt)
         if focus_info.startswith('"') and focus_info.endswith('"'):
             focus_info=focus_info[1:-1]
         focus_info=focus_info.replace('#', '')
@@ -900,52 +900,66 @@ def encode_image(image_path):
     with open(image_path, "rb") as image_file:
         return base64.b64encode(image_file.read()).decode('utf-8')
 
-def
-
-
-
-
+def get_gpt_response(api_key, image_path, prompt, enable_wiki=None):
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {api_key}"
+    }
 
     headers = {
         "Content-Type": "application/json",
         "Authorization": f"Bearer {api_key}"
     }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if image_path:
+        base64_image = encode_image(image_path)
+        payload = {
+            "model": "gpt-4o",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{base64_image}"
+                            }
                         }
-
-
-
-
-
-
+                    ]
+                }
+            ],
+            "max_tokens": 300
+        }
+    else:
+        payload = {
+            "model": "gpt-4o",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt
+                        }
+                    ]
+                }
+            ],
+            "max_tokens": 300
+        }
 
     # Sending the request to the OpenAI API
     response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
     result = response.json()
-
-    content = result['choices'][0]['message']['content']
-    # Assume the model returns a valid JSON string in 'content'
+
     try:
+        content = result['choices'][0]['message']['content']
         return content
-    except json.JSONDecodeError:
-        return {"error": "Failed to parse model output"}
-
+    except (KeyError, IndexError, json.JSONDecodeError) as e:
+        return json.dumps({"error": "Failed to parse model output", "details": str(e)})
 
 
 
@@ -1033,7 +1047,7 @@ async def inference_traject(origin_image,sketcher_image, enable_wiki, language,
 
     # if not args.disable_gpt and text_refiner:
     if not args.disable_gpt:
-        focus_info=
+        focus_info=get_gpt_response(openai_api_key,crop_save_path,prompt)
         if focus_info.startswith('"') and focus_info.endswith('"'):
             focus_info=focus_info[1:-1]
         focus_info=focus_info.replace('#', '')
@@ -1119,8 +1133,41 @@ def export_chat_log(chat_state, paragraph, liked, disliked):
         print(f"An error occurred while exporting the chat log: {e}")
         return None
 
+async def get_artistinfo(artist_name,api_key,state,language,autoplay,length):
+    prompt=f"Provide a concise summary of about {length} words on the painter {artist_name}, covering his biography, major works, artistic style, significant contributions to the art world, and any major awards or recognitions he has received."
+    res=get_gpt_response(api_key,None,prompt)
+    state = state + [(None, f"Artist Info:{res}")]
+    read_info = re.sub(r'[#[\]!*]','',res)
+    read_info = emoji.replace_emoji(read_info,replace="")
+
+
+    # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
+    #                                           input_points=input_points, input_labels=input_labels)
+
+    audio_output = await texttospeech(read_info, language,autoplay)
+    # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+    return state, state,audio_output
 
 
+async def get_yearinfo(year,api_key,state,language,autoplay,length):
+    prompt = f"Provide a concise summary of about {length} words on the art historical period associated with the year {year}, covering its major characteristics, influential artists, notable works, and its significance in the broader context of art history."
+    res=get_gpt_response(api_key,None,prompt)
+    state = state + [(None, f"Artist Info:{res}")]
+    read_info = re.sub(r'[#[\]!*]','',res)
+    read_info = emoji.replace_emoji(read_info,replace="")
+
+
+    # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
+    #                                           input_points=input_points, input_labels=input_labels)
+
+    audio_output = await texttospeech(read_info, language,autoplay)
+    # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+    return state, state,audio_output
+
+
+
+
+
 async def cap_everything(paragraph, visual_chatgpt,language,autoplay):
 
     # state = state + [(None, f"Caption Everything: {paragraph}")]
@@ -1853,6 +1900,39 @@ def create_ui():
                          modules_not_need_gpt,
                          modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
 
+    artist_label_base2.click(
+        get_artistinfo,
+        inputs=[artist_label_base2,openai_api_key,state,language,auto_play,length],
+        outputs=[chatbot,state,output_audio]
+    )
+    artist_label.click(
+        get_artistinfo,
+        inputs=[artist_label,openai_api_key,state,language,auto_play,length],
+        outputs=[chatbot,state,output_audio]
+    )
+    artist_label_traj.click(
+        get_artistinfo,
+        inputs=[artist_label_traj,openai_api_key,state,language,auto_play,length],
+        outputs=[chatbot,state,output_audio]
+    )
+
+    year_label_base2.click(
+        get_yearinfo,
+        inputs=[year_label_base2,openai_api_key,state,language,auto_play,length],
+        outputs=[chatbot,state,output_audio]
+    )
+    year_label.click(
+        get_yearinfo,
+        inputs=[year_label,openai_api_key,state,language,auto_play,length],
+        outputs=[chatbot,state,output_audio]
+    )
+    year_label_traj.click(
+        get_yearinfo,
+        inputs=[year_label_traj,openai_api_key,state,language,auto_play,length],
+        outputs=[chatbot,state,output_audio]
+    )
+
+
     enable_chatGPT_button.click(
         lambda: (None, [], [], [[], [], []], "", "", ""),
         [],
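For context, every captioning and Q&A call in the hunks above now routes through the new `get_gpt_response` helper. (Note that the helper builds `headers` twice; the second assignment, kept as context from the old function body, is redundant but harmless.) Below is a minimal smoke-test sketch of how the helper can be exercised outside the Gradio app; the `from app import ...` path, the sample file name, and the environment variable are assumptions for illustration, not part of the commit.

```python
# Hypothetical smoke test for the helper added in this commit. Assumes app.py
# is importable, OPENAI_API_KEY is set, and sample_painting.jpg exists.
import os

from app import get_gpt_response  # assumed import path

api_key = os.environ["OPENAI_API_KEY"]

# With an image path, the helper sends a gpt-4o vision request: the prompt
# text plus the base64-encoded image, capped at 300 output tokens.
caption = get_gpt_response(api_key, "sample_painting.jpg",
                           "What's going on in this picture? in English")
print(caption)

# With image_path=None it falls through to the text-only payload, which is
# how the new get_artistinfo and get_yearinfo helpers call it.
bio = get_gpt_response(api_key, None,
                       "Provide a concise summary of about 60 words on the painter Claude Monet.")
print(bio)
```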
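One caveat on the `upload_callback` hunk: `json.loads(parsed_data.replace("'", "\""))` turns every single quote into a double quote, so a reply that is already valid JSON but contains an apostrophe inside a value (a title such as L'Absinthe) becomes unparseable. A more tolerant parse is sketched below; `parse_artwork_info` is a hypothetical helper, not part of the commit, that tries strict JSON first and falls back to a Python-literal read for single-quoted dictionaries.

```python
import ast
import json

def parse_artwork_info(raw: str) -> dict:
    """Hypothetical fallback parser for the artwork-info reply (not in the
    commit): strict JSON first, then a Python-literal parse, then a
    'None'-filled default matching the prompt's contract."""
    try:
        return json.loads(raw)  # handles apostrophes such as "L'Absinthe"
    except json.JSONDecodeError:
        pass
    try:
        parsed = ast.literal_eval(raw)  # handles single-quoted dicts
        if isinstance(parsed, dict):
            return parsed
    except (ValueError, SyntaxError):
        pass
    return {"name": "None", "artist": "None", "year": "None", "material": "None"}
```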
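The `create_ui` wiring leans on a Gradio behaviour worth spelling out: a component listed in `inputs` hands its current value to the handler, so clicking `artist_label` passes the displayed artist name straight into `get_artistinfo` (and likewise the year labels into `get_yearinfo`). Note also that `get_yearinfo` reuses the `Artist Info:` chat prefix from `get_artistinfo`; a `Year Info:` label looks like the intent. A self-contained toy of the same click pattern, with component names and handler invented for illustration:

```python
# Toy reproduction of the event wiring used in this commit (not from app.py):
# a Button listed in `inputs` passes its own value (its label) to the handler.
import gradio as gr

def lookup_artist(name, history):
    # Mirrors the state-append style of get_artistinfo, without the GPT call.
    history = history + [(None, f"Artist Info: {name}")]
    return history, history

with gr.Blocks() as demo:
    artist_label = gr.Button("Claude Monet")
    chatbot = gr.Chatbot()
    state = gr.State([])
    artist_label.click(lookup_artist,
                       inputs=[artist_label, state],
                       outputs=[chatbot, state])

if __name__ == "__main__":
    demo.launch()
```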