Niki Zhang committed
Update app.py
app.py CHANGED
@@ -714,7 +714,7 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
     new_image_path = get_new_image_name('chat_image', func_name='upload')
     image_input.save(new_image_path)
     visual_chatgpt.current_image = new_image_path
-    paragraph =
+    paragraph = get_gpt_response(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
     # img_caption = model.captioner.inference(image_input, filter=False, args={'text_prompt':''})['caption']
     Human_prompt = f'\nHuman: The description of the image with path {new_image_path} is: {paragraph}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
     AI_prompt = "Received."
@@ -722,7 +722,7 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
     visual_chatgpt.agent.memory.save_context({"input": Human_prompt}, {"output": AI_prompt})
     print("memory",visual_chatgpt.agent.memory)
     # visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
-    parsed_data =
+    parsed_data = get_gpt_response(openai_api_key, new_image_path,"Please provide the name, artist, year of creation (including the art historical period), and material used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\", \"artist\": \"Name of the artist\", \"year\": \"Year of creation (Art historical period)\", \"material\": \"Material used in the painting\" }")
     parsed_data = json.loads(parsed_data.replace("'", "\""))
     name, artist, year, material= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["material"]
     # artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
@@ -832,7 +832,7 @@ async def submit_caption(state, text_refiner, length, sentiment, factuality, lan
     # if not args.disable_gpt and text_refiner:
     if not args.disable_gpt:
         print("new crop save",new_crop_save_path)
-        focus_info=
+        focus_info=get_gpt_response(openai_api_key,new_crop_save_path,prompt)
         if focus_info.startswith('"') and focus_info.endswith('"'):
             focus_info=focus_info[1:-1]
         focus_info=focus_info.replace('#', '')
@@ -900,52 +900,66 @@ def encode_image(image_path):
     with open(image_path, "rb") as image_file:
         return base64.b64encode(image_file.read()).decode('utf-8')
 
-def
-
-
-
-
+def get_gpt_response(api_key, image_path, prompt, enable_wiki=None):
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {api_key}"
+    }
 
     headers = {
         "Content-Type": "application/json",
         "Authorization": f"Bearer {api_key}"
     }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if image_path:
+        base64_image = encode_image(image_path)
+        payload = {
+            "model": "gpt-4o",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{base64_image}"
+                            }
                         }
-
-
-
-
-
-
+                    ]
+                }
+            ],
+            "max_tokens": 300
+        }
+    else:
+        payload = {
+            "model": "gpt-4o",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt
+                        }
+                    ]
+                }
+            ],
+            "max_tokens": 300
+        }
 
     # Sending the request to the OpenAI API
     response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
     result = response.json()
-
-    content = result['choices'][0]['message']['content']
-    # Assume the model returns a valid JSON string in 'content'
+
     try:
+        content = result['choices'][0]['message']['content']
         return content
-    except json.JSONDecodeError:
-        return {"error": "Failed to parse model output"}
-
+    except (KeyError, IndexError, json.JSONDecodeError) as e:
+        return json.dumps({"error": "Failed to parse model output", "details": str(e)})
 
 
 
@@ -1033,7 +1047,7 @@ async def inference_traject(origin_image,sketcher_image, enable_wiki, language,
 
     # if not args.disable_gpt and text_refiner:
     if not args.disable_gpt:
-        focus_info=
+        focus_info=get_gpt_response(openai_api_key,crop_save_path,prompt)
         if focus_info.startswith('"') and focus_info.endswith('"'):
             focus_info=focus_info[1:-1]
         focus_info=focus_info.replace('#', '')
@@ -1119,8 +1133,41 @@ def export_chat_log(chat_state, paragraph, liked, disliked):
         print(f"An error occurred while exporting the chat log: {e}")
         return None
 
+async def get_artistinfo(artist_name,api_key,state,language,autoplay,length):
+    prompt=f"Provide a concise summary of about {length} words on the painter {artist_name}, covering his biography, major works, artistic style, significant contributions to the art world, and any major awards or recognitions he has received."
+    res=get_gpt_response(api_key,None,prompt)
+    state = state + [(None, f"Artist Info:{res}")]
+    read_info = re.sub(r'[#[\]!*]','',res)
+    read_info = emoji.replace_emoji(read_info,replace="")
+
+
+    # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
+    #                                           input_points=input_points, input_labels=input_labels)
+
+    audio_output = await texttospeech(read_info, language,autoplay)
+    # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+    return state, state,audio_output
 
 
+async def get_yearinfo(year,api_key,state,language,autoplay,length):
+    prompt = f"Provide a concise summary of about {length} words on the art historical period associated with the year {year}, covering its major characteristics, influential artists, notable works, and its significance in the broader context of art history."
+    res=get_gpt_response(api_key,None,prompt)
+    state = state + [(None, f"Artist Info:{res}")]
+    read_info = re.sub(r'[#[\]!*]','',res)
+    read_info = emoji.replace_emoji(read_info,replace="")
+
+
+    # refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
+    #                                           input_points=input_points, input_labels=input_labels)
+
+    audio_output = await texttospeech(read_info, language,autoplay)
+    # return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
+    return state, state,audio_output
+
+
+
+
+
 async def cap_everything(paragraph, visual_chatgpt,language,autoplay):
 
     # state = state + [(None, f"Caption Everything: {paragraph}")]
@@ -1853,6 +1900,39 @@ def create_ui():
                          modules_not_need_gpt,
                          modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,d3_model,top_row])
 
+    artist_label_base2.click(
+        get_artistinfo,
+        inputs=[artist_label_base2,openai_api_key,state,language,auto_play,length],
+        outputs=[chatbot,state,output_audio]
+    )
+    artist_label.click(
+        get_artistinfo,
+        inputs=[artist_label,openai_api_key,state,language,auto_play,length],
+        outputs=[chatbot,state,output_audio]
+    )
+    artist_label_traj.click(
+        get_artistinfo,
+        inputs=[artist_label_traj,openai_api_key,state,language,auto_play,length],
+        outputs=[chatbot,state,output_audio]
+    )
+
+    year_label_base2.click(
+        get_yearinfo,
+        inputs=[year_label_base2,openai_api_key,state,language,auto_play,length],
+        outputs=[chatbot,state,output_audio]
+    )
+    year_label.click(
+        get_yearinfo,
+        inputs=[year_label,openai_api_key,state,language,auto_play,length],
+        outputs=[chatbot,state,output_audio]
+    )
+    year_label_traj.click(
+        get_yearinfo,
+        inputs=[year_label_traj,openai_api_key,state,language,auto_play,length],
+        outputs=[chatbot,state,output_audio]
+    )
+
+
     enable_chatGPT_button.click(
         lambda: (None, [], [], [[], [], []], "", "", ""),
         [],
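For context, every captioning and Q&A call in the hunks above now routes through the new `get_gpt_response` helper. (Note that the helper builds `headers` twice; the second assignment, kept as context from the old function body, is redundant but harmless.) Below is a minimal smoke-test sketch of how the helper can be exercised outside the Gradio app; the `from app import ...` path, the sample file name, and the environment variable are assumptions for illustration, not part of the commit.

```python
# Hypothetical smoke test for the helper added in this commit. Assumes app.py
# is importable, OPENAI_API_KEY is set, and sample_painting.jpg exists.
import os

from app import get_gpt_response  # assumed import path

api_key = os.environ["OPENAI_API_KEY"]

# With an image path, the helper sends a gpt-4o vision request: the prompt
# text plus the base64-encoded image, capped at 300 output tokens.
caption = get_gpt_response(api_key, "sample_painting.jpg",
                           "What's going on in this picture? in English")
print(caption)

# With image_path=None it falls through to the text-only payload, which is
# how the new get_artistinfo and get_yearinfo helpers call it.
bio = get_gpt_response(api_key, None,
                       "Provide a concise summary of about 60 words on the painter Claude Monet.")
print(bio)
```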
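One caveat on the `upload_callback` hunk: `json.loads(parsed_data.replace("'", "\""))` turns every single quote into a double quote, so a reply that is already valid JSON but contains an apostrophe inside a value (a title such as L'Absinthe) becomes unparseable. A more tolerant parse is sketched below; `parse_artwork_info` is a hypothetical helper, not part of the commit, that tries strict JSON first and falls back to a Python-literal read for single-quoted dictionaries.

```python
import ast
import json

def parse_artwork_info(raw: str) -> dict:
    """Hypothetical fallback parser for the artwork-info reply (not in the
    commit): strict JSON first, then a Python-literal parse, then a
    'None'-filled default matching the prompt's contract."""
    try:
        return json.loads(raw)  # handles apostrophes such as "L'Absinthe"
    except json.JSONDecodeError:
        pass
    try:
        parsed = ast.literal_eval(raw)  # handles single-quoted dicts
        if isinstance(parsed, dict):
            return parsed
    except (ValueError, SyntaxError):
        pass
    return {"name": "None", "artist": "None", "year": "None", "material": "None"}
```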
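The `create_ui` wiring leans on a Gradio behaviour worth spelling out: a component listed in `inputs` hands its current value to the handler, so clicking `artist_label` passes the displayed artist name straight into `get_artistinfo` (and likewise the year labels into `get_yearinfo`). Note also that `get_yearinfo` reuses the `Artist Info:` chat prefix from `get_artistinfo`; a `Year Info:` label looks like the intent. A self-contained toy of the same click pattern, with component names and handler invented for illustration:

```python
# Toy reproduction of the event wiring used in this commit (not from app.py):
# a Button listed in `inputs` passes its own value (its label) to the handler.
import gradio as gr

def lookup_artist(name, history):
    # Mirrors the state-append style of get_artistinfo, without the GPT call.
    history = history + [(None, f"Artist Info: {name}")]
    return history, history

with gr.Blocks() as demo:
    artist_label = gr.Button("Claude Monet")
    chatbot = gr.Chatbot()
    state = gr.State([])
    artist_label.click(lookup_artist,
                       inputs=[artist_label, state],
                       outputs=[chatbot, state])

if __name__ == "__main__":
    demo.launch()
```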