Niki Zhang committed on
Commit
b96896b
·
verified ·
1 Parent(s): 29ba522

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -37
app.py CHANGED
@@ -714,7 +714,7 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
714
  new_image_path = get_new_image_name('chat_image', func_name='upload')
715
  image_input.save(new_image_path)
716
  visual_chatgpt.current_image = new_image_path
717
- paragraph = get_image_gpt(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
718
  # img_caption = model.captioner.inference(image_input, filter=False, args={'text_prompt':''})['caption']
719
  Human_prompt = f'\nHuman: The description of the image with path {new_image_path} is: {paragraph}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
720
  AI_prompt = "Received."
@@ -722,7 +722,7 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
722
  visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
723
  print("memory",visual_chatgpt.agent.memory)
724
  # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
725
- parsed_data = get_image_gpt(openai_api_key, new_image_path,"Please provide the name, artist, year of creation, and material used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\",\"artist\": \"Name of the artist\", \"year\": \"Year of creation\", \"material\": \"Material used in the painting\" }.")
726
  parsed_data = json.loads(parsed_data.replace("'", "\""))
727
  name, artist, year, material= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["material"]
728
  # artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
@@ -832,7 +832,7 @@ async def submit_caption(state, text_refiner, length, sentiment, factuality, lan
832
  # if not args.disable_gpt and text_refiner:
833
  if not args.disable_gpt:
834
  print("new crop save",new_crop_save_path)
835
- focus_info=get_image_gpt(openai_api_key,new_crop_save_path,prompt)
836
  if focus_info.startswith('"') and focus_info.endswith('"'):
837
  focus_info=focus_info[1:-1]
838
  focus_info=focus_info.replace('#', '')
@@ -900,52 +900,66 @@ def encode_image(image_path):
900
  with open(image_path, "rb") as image_file:
901
  return base64.b64encode(image_file.read()).decode('utf-8')
902
 
903
- def get_image_gpt(api_key, image_path,prompt,enable_wiki=None):
904
- # Getting the base64 string
905
- base64_image = encode_image(image_path)
906
-
907
-
908
 
909
  headers = {
910
  "Content-Type": "application/json",
911
  "Authorization": f"Bearer {api_key}"
912
  }
913
 
914
- prompt_text = prompt
915
-
916
- payload = {
917
- "model": "gpt-4o",
918
- "messages": [
919
- {
920
- "role": "user",
921
- "content": [
922
- {
923
- "type": "text",
924
- "text": prompt_text
925
- },
926
- {
927
- "type": "image_url",
928
- "image_url": {
929
- "url": f"data:image/jpeg;base64,{base64_image}"
 
930
  }
931
- }
932
- ]
933
- }
934
- ],
935
- "max_tokens": 300
936
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
937
 
938
  # Sending the request to the OpenAI API
939
  response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
940
  result = response.json()
941
- print(result)
942
- content = result['choices'][0]['message']['content']
943
- # Assume the model returns a valid JSON string in 'content'
944
  try:
 
945
  return content
946
- except json.JSONDecodeError:
947
- return {"error": "Failed to parse model output"}
948
-
949
 
950
 
951
 
@@ -1033,7 +1047,7 @@ async def inference_traject(origin_image,sketcher_image, enable_wiki, language,
1033
 
1034
  # if not args.disable_gpt and text_refiner:
1035
  if not args.disable_gpt:
1036
- focus_info=get_image_gpt(openai_api_key,crop_save_path,prompt)
1037
  if focus_info.startswith('"') and focus_info.endswith('"'):
1038
  focus_info=focus_info[1:-1]
1039
  focus_info=focus_info.replace('#', '')
@@ -1119,8 +1133,41 @@ def export_chat_log(chat_state, paragraph, liked, disliked):
1119
  print(f"An error occurred while exporting the chat log: {e}")
1120
  return None
1121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1122
 
1123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1124
  async def cap_everything(paragraph, visual_chatgpt,language,autoplay):
1125
 
1126
  # state = state + [(None, f"Caption Everything: {paragraph}")]
@@ -1853,6 +1900,39 @@ def create_ui():
1853
  modules_not_need_gpt,
1854
  modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
1855
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1856
  enable_chatGPT_button.click(
1857
  lambda: (None, [], [], [[], [], []], "", "", ""),
1858
  [],
 
714
  new_image_path = get_new_image_name('chat_image', func_name='upload')
715
  image_input.save(new_image_path)
716
  visual_chatgpt.current_image = new_image_path
717
+ paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
718
  # img_caption = model.captioner.inference(image_input, filter=False, args={'text_prompt':''})['caption']
719
  Human_prompt = f'\nHuman: The description of the image with path {new_image_path} is: {paragraph}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
720
  AI_prompt = "Received."
 
722
  visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
723
  print("memory",visual_chatgpt.agent.memory)
724
  # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
725
+ parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and material used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"material\": \"Material used in the painting\" }")
726
  parsed_data = json.loads(parsed_data.replace("'", "\""))
727
  name, artist, year, material= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["material"]
728
  # artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
 
832
  # if not args.disable_gpt and text_refiner:
833
  if not args.disable_gpt:
834
  print("new crop save",new_crop_save_path)
835
+ focus_info=get_gpt_response(openai_api_key,new_crop_save_path,prompt)
836
  if focus_info.startswith('"') and focus_info.endswith('"'):
837
  focus_info=focus_info[1:-1]
838
  focus_info=focus_info.replace('#', '')
 
900
  with open(image_path, "rb") as image_file:
901
  return base64.b64encode(image_file.read()).decode('utf-8')
902
 
903
def get_gpt_response(api_key, image_path, prompt, enable_wiki=None):
    """Send `prompt` (optionally accompanied by an image) to the OpenAI chat API.

    Parameters
    ----------
    api_key : str
        OpenAI API key used in the Bearer authorization header.
    image_path : str or None
        Path of an image to attach as a base64 data URL; when None a
        text-only request is sent.
    prompt : str
        The user prompt text.
    enable_wiki :
        Unused; kept for backward compatibility with existing callers.

    Returns
    -------
    str
        The model's reply content, or a JSON-encoded error string when the
        API response cannot be parsed (e.g. the API returned an error object).
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # Build the user message content once: text, plus an optional inline
    # image.  This replaces two near-identical payload blocks (and a
    # duplicated `headers` definition) from the previous revision.
    content = [
        {
            "type": "text",
            "text": prompt
        }
    ]
    if image_path:
        base64_image = encode_image(image_path)
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image}"
            }
        })

    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": content
            }
        ],
        "max_tokens": 300
    }

    # Sending the request to the OpenAI API
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    result = response.json()

    try:
        return result['choices'][0]['message']['content']
    except (KeyError, IndexError, json.JSONDecodeError) as e:
        # Unexpected response shape (e.g. an API error object without
        # 'choices') — report it instead of raising.
        return json.dumps({"error": "Failed to parse model output", "details": str(e)})
 
963
 
964
 
965
 
 
1047
 
1048
  # if not args.disable_gpt and text_refiner:
1049
  if not args.disable_gpt:
1050
+ focus_info=get_gpt_response(openai_api_key,crop_save_path,prompt)
1051
  if focus_info.startswith('"') and focus_info.endswith('"'):
1052
  focus_info=focus_info[1:-1]
1053
  focus_info=focus_info.replace('#', '')
 
1133
  print(f"An error occurred while exporting the chat log: {e}")
1134
  return None
1135
 
1136
async def get_artistinfo(artist_name,api_key,state,language,autoplay,length):
    """Fetch a short GPT-written biography of `artist_name`, append it to the
    chat state, and return the updated state twice (chatbot + state outputs)
    along with a spoken rendition of the summary."""
    bio_prompt = f"Provide a concise summary of about {length} words on the painter {artist_name}, covering his biography, major works, artistic style, significant contributions to the art world, and any major awards or recognitions he has received."
    summary = get_gpt_response(api_key, None, bio_prompt)
    updated_state = state + [(None, f"Artist Info:{summary}")]
    # Strip markdown/markup characters and emoji so text-to-speech reads cleanly.
    spoken_text = emoji.replace_emoji(re.sub(r'[#[\]!*]', '', summary), replace="")
    audio_output = await texttospeech(spoken_text, language, autoplay)
    return updated_state, updated_state, audio_output
1150
 
1151
 
1152
async def get_yearinfo(year,api_key,state,language,autoplay,length):
    """Summarize the art-historical period associated with `year` via GPT,
    append the summary to the chat state, and synthesize it as speech.

    Parameters
    ----------
    year : str
        Year (or year/period label) to summarize.
    api_key : str
        OpenAI API key forwarded to `get_gpt_response`.
    state : list
        Chatbot history; a new `(None, message)` pair is appended.
    language : str
        Target language for text-to-speech.
    autoplay : bool
        Whether the generated audio should auto-play.
    length : int or str
        Approximate word count requested from the model.

    Returns
    -------
    tuple
        (state, state, audio_output) — the updated history for both the
        chatbot and state outputs, plus the synthesized audio.
    """
    prompt = f"Provide a concise summary of about {length} words on the art historical period associated with the year {year}, covering its major characteristics, influential artists, notable works, and its significance in the broader context of art history."
    res = get_gpt_response(api_key, None, prompt)
    # Fixed copy-paste label: this reports year/period info, not artist info.
    state = state + [(None, f"Year Info:{res}")]
    # Strip markdown/markup characters and emoji before text-to-speech.
    read_info = re.sub(r'[#[\]!*]', '', res)
    read_info = emoji.replace_emoji(read_info, replace="")
    audio_output = await texttospeech(read_info, language, autoplay)
    return state, state, audio_output
1166
+
1167
+
1168
+
1169
+
1170
+
1171
  async def cap_everything(paragraph, visual_chatgpt,language,autoplay):
1172
 
1173
  # state = state + [(None, f"Caption Everything: {paragraph}")]
 
1900
  modules_not_need_gpt,
1901
  modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
1902
 
1903
+ artist_label_base2.click(
1904
+ get_artistinfo,
1905
+ inputs=[artist_label_base2,openai_api_key,state,language,auto_play,length],
1906
+ outputs=[chatbot,state,output_audio]
1907
+ )
1908
+ artist_label.click(
1909
+ get_artistinfo,
1910
+ inputs=[artist_label,openai_api_key,state,language,auto_play,length],
1911
+ outputs=[chatbot,state,output_audio]
1912
+ )
1913
+ artist_label_traj.click(
1914
+ get_artistinfo,
1915
+ inputs=[artist_label_traj,openai_api_key,state,language,auto_play,length],
1916
+ outputs=[chatbot,state,output_audio]
1917
+ )
1918
+
1919
+ year_label_base2.click(
1920
+ get_yearinfo,
1921
+ inputs=[year_label_base2,openai_api_key,state,language,auto_play,length],
1922
+ outputs=[chatbot,state,output_audio]
1923
+ )
1924
+ year_label.click(
1925
+ get_yearinfo,
1926
+ inputs=[year_label,openai_api_key,state,language,auto_play,length],
1927
+ outputs=[chatbot,state,output_audio]
1928
+ )
1929
+ year_label_traj.click(
1930
+ get_yearinfo,
1931
+ inputs=[year_label_traj,openai_api_key,state,language,auto_play,length],
1932
+ outputs=[chatbot,state,output_audio]
1933
+ )
1934
+
1935
+
1936
  enable_chatGPT_button.click(
1937
  lambda: (None, [], [], [[], [], []], "", "", ""),
1938
  [],