Spaces:

simonlee-cb
/

chat-image-edit

Running

App Files Community

simonlee-cb committed on Mar 10

Commit

583b7ad

1 Parent(s): c16bc85

refactor: clean up

Browse files

Files changed (7) hide show

image_edit_chat.py +4 -11
image_edit_demo.py +2 -3
src/agents/image-edit-agent.py +0 -105
src/agents/image_edit_agent.py +140 -0
src/agents/mask_generation_agent.py +31 -108
src/services/generate_mask.py +0 -2
src/services/google_cloud_image_upload.py +3 -1

image_edit_chat.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import gradio as gr
-from src.agents.mask_generation_agent import mask_generation_agent, ImageEditDeps
 import os
 from src.hopter.client import Hopter, Environment
 from src.services.generate_mask import GenerateMaskService
@@ -9,10 +9,9 @@ from pydantic_ai.messages import (
     ToolCallPart,
     ToolReturnPart
 )
-from src.agents.mask_generation_agent import EditImageResult
-from pydantic_ai.agent import Agent
 from pydantic_ai.models.openai import OpenAIModel
-model = OpenAIModel(
     "gpt-4o",
     api_key=os.environ.get("OPENAI_API_KEY"),
 )
@@ -56,12 +55,6 @@ EXAMPLES = [
     }
 ]
-simple_agent = Agent(
-    model,
-    system_prompt="You are a helpful assistant that can answer questions and help with tasks.",
-    deps_type=ImageEditDeps
-)
 load_dotenv()
 def build_user_message(chat_input):
@@ -142,7 +135,7 @@ async def stream_from_agent(chat_input, chatbot, past_messages, current_image):
     )
     # Run the agent
-    async with mask_generation_agent.run_stream(
         messages,
         deps=deps,
         message_history=past_messages

 import gradio as gr
+from src.agents.image_edit_agent import image_edit_agent, ImageEditDeps, EditImageResult
 import os
 from src.hopter.client import Hopter, Environment
 from src.services.generate_mask import GenerateMaskService
     ToolCallPart,
     ToolReturnPart
 )
 from pydantic_ai.models.openai import OpenAIModel
+model = OpenAIModel(
     "gpt-4o",
     api_key=os.environ.get("OPENAI_API_KEY"),
 )
     }
 ]
 load_dotenv()
 def build_user_message(chat_input):
     )
     # Run the agent
+    async with image_edit_agent.run_stream(
         messages,
         deps=deps,
         message_history=past_messages

image_edit_demo.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import gradio as gr
-from src.agents.mask_generation_agent import mask_generation_agent, ImageEditDeps
 import os
 from src.hopter.client import Hopter, Environment
 from src.services.generate_mask import GenerateMaskService
@@ -7,7 +7,6 @@ from dotenv import load_dotenv
 from pydantic_ai.messages import (
     ToolReturnPart
 )
-from src.agents.mask_generation_agent import EditImageResult
 from src.utils import upload_image
 load_dotenv()
@@ -31,7 +30,7 @@ async def process_edit(image, instruction):
         hopter_client=hopter,
         mask_service=mask_service
     )
-    result = await mask_generation_agent.run(
         messages,
         deps=deps
     )

 import gradio as gr
+from src.agents.image_edit_agent import image_edit_agent, ImageEditDeps, EditImageResult
 import os
 from src.hopter.client import Hopter, Environment
 from src.services.generate_mask import GenerateMaskService
 from pydantic_ai.messages import (
     ToolReturnPart
 )
 from src.utils import upload_image
 load_dotenv()
         hopter_client=hopter,
         mask_service=mask_service
     )
+    result = await image_edit_agent.run(
         messages,
         deps=deps
     )

src/agents/image-edit-agent.py DELETED Viewed

@@ -1,105 +0,0 @@
-from pydantic_ai import Agent, RunContext
-from pydantic_ai.settings import ModelSettings
-from pydantic_ai.models.openai import OpenAIModel
-from dotenv import load_dotenv
-import os
-import asyncio
-from src.utils import image_path_to_base64
-from dataclasses import dataclass
-load_dotenv()
-@dataclass
-class ImageEditDeps:
-    edit_instruction: str
-    image_url: str
-model = OpenAIModel(
-    "gpt-4o",
-    api_key=os.environ.get("OPENAI_API_KEY"),
-)
-image_edit_agent = Agent(
-    model,
-    system_prompt=[
-        'Be concise, reply with one sentence.',
-        "You are an image editing agent. You will be given an image and an editing instruction. Use the tools available to you and come up with a plan to edit the image according to the instruction."
-    ],
-    deps_type=ImageEditDeps
-)
-@image_edit_agent.tool
-async def identify_editing_subject(ctx: RunContext[ImageEditDeps]) -> str:
-    """
-    Identify the subject of the image editing instruction.
-    Args:
-        instruction: The image editing instruction.
-        image_url: The URL of the image.
-    Returns:
-        The subject of the image editing instruction.
-    """
-    messages = [
-        {
-            "type": "text",
-            "text": ctx.deps.edit_instruction
-        },
-        {
-            "type": "image_url",
-            "image_url": {
-                "url": ctx.deps.image_url
-            }
-        }
-    ]
-    r = await mask_generation_agent.run(messages, usage=ctx.usage, deps=ctx.deps)
-    return r.data
-mask_generation_agent = Agent(
-    model,
-    system_prompt=[
-        "I will give you an editing instruction of the image. Please output the object needed to be edited.",
-        "You only need to output the basic description of the object in no more than 5 words.",
-        "The output should only contain one noun.",
-        "For example, the editing instruction is 'Change the white cat to a black dog'. Then you need to output: 'white cat'. Only output the new content. Do not output anything else."
-    ],
-    deps_type=ImageEditDeps
-)
-@mask_generation_agent.tool
-async def generate_mask(ctx: RunContext[ImageEditDeps], mask_subject: str) -> str:
-    """
-    Generate a mask for the image editing instruction.
-    """
-    pass
-async def main():
-    image_file_path = "./assets/lakeview.jpg"
-    image_base64 = image_path_to_base64(image_file_path)
-    image_url = f"data:image/jpeg;base64,{image_base64}"
-    prompt = "remove the light post"
-    messages = [
-        {
-            "type": "text",
-            "text": prompt
-        },
-        {
-            "type": "image_url",
-            "image_url": {
-                "url": image_url
-            }
-        }
-    ]
-    deps = ImageEditDeps(
-        edit_instruction=prompt,
-        image_url=image_url
-    )
-    r = await mask_generation_agent.run(messages, deps=deps)
-    print(r.data)
-if __name__ == "__main__":
-    asyncio.run(main())

src/agents/image_edit_agent.py ADDED Viewed

@@ -0,0 +1,140 @@

+from pydantic_ai import Agent, RunContext
+from pydantic_ai.models.openai import OpenAIModel
+from dotenv import load_dotenv
+import os
+import asyncio
+from dataclasses import dataclass
+from typing import Optional
+import logfire
+from src.services.generate_mask import GenerateMaskService
+from src.hopter.client import Hopter, Environment, MagicReplaceInput, SuperResolutionInput
+from src.utils import image_path_to_uri, download_image_to_data_uri, upload_image
+import base64
+import tempfile
+load_dotenv()
+logfire.configure(token=os.environ.get("LOGFIRE_TOKEN"))
+logfire.instrument_openai()
+system_prompt = """
+I will give you an editing instruction of the image.
+if the edit instruction involved modifying parts of the image, please generate a mask for it.
+if images are not provided, ask the user to provide an image.
+"""
+@dataclass
+class ImageEditDeps:
+    edit_instruction: str
+    hopter_client: Hopter
+    mask_service: GenerateMaskService
+    image_url: Optional[str] = None
+model = OpenAIModel(
+    "gpt-4o",
+    api_key=os.environ.get("OPENAI_API_KEY"),
+)
+@dataclass
+class EditImageResult:
+    edited_image_url: str
+image_edit_agent = Agent(
+    model,
+    system_prompt=system_prompt,
+    deps_type=ImageEditDeps
+)
+def upload_image_from_base64(base64_image: str) -> str:
+    image_format = base64_image.split(",")[0]
+    image_data = base64.b64decode(base64_image.split(",")[1])
+    suffix = ".jpg" if image_format == "image/jpeg" else ".png"
+    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
+        temp_filename = temp_file.name
+        with open(temp_filename, "wb") as f:
+            f.write(image_data)
+    return upload_image(temp_filename)
+@image_edit_agent.tool
+async def edit_object(ctx: RunContext[ImageEditDeps]) -> EditImageResult:
+    """
+    Use this tool to edit an object in the image. for example:
+    - remove the pole
+    - replace the dog with a cat
+    - change the background to a beach
+    - remove the person in the image
+    - change the hair color to red
+    - change the hat to a cap
+    """
+    edit_instruction = ctx.deps.edit_instruction
+    image_url = ctx.deps.image_url
+    mask_service = ctx.deps.mask_service
+    hopter_client = ctx.deps.hopter_client
+    image_uri = download_image_to_data_uri(image_url)
+    # Generate mask
+    mask_instruction = mask_service.get_mask_generation_instruction(edit_instruction, image_url)
+    mask = mask_service.generate_mask(mask_instruction, image_uri)
+    # Magic replace
+    input = MagicReplaceInput(image=image_uri, mask=mask, prompt=mask_instruction.target_caption)
+    result = hopter_client.magic_replace(input)
+    uploaded_image = upload_image_from_base64(result.base64_image)
+    return EditImageResult(edited_image_url=uploaded_image)
+@image_edit_agent.tool
+async def super_resolution(ctx: RunContext[ImageEditDeps]) -> EditImageResult:
+    """
+    run super resolution, upscale, or enhance the image
+    """
+    image_url = ctx.deps.image_url
+    hopter_client = ctx.deps.hopter_client
+    image_uri = download_image_to_data_uri(image_url)
+    input = SuperResolutionInput(image_b64=image_uri, scale=4, use_face_enhancement=False)
+    result = hopter_client.super_resolution(input)
+    uploaded_image = upload_image_from_base64(result.scaled_image)
+    return EditImageResult(edited_image_url=uploaded_image)
+async def main():
+    image_file_path = "./assets/lakeview.jpg"
+    image_url = image_path_to_uri(image_file_path)
+    prompt = "remove the light post"
+    messages = [
+        {
+            "type": "text",
+            "text": prompt
+        },
+        {
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }
+    ]
+    # Initialize services
+    hopter = Hopter(api_key=os.environ.get("HOPTER_API_KEY"), environment=Environment.STAGING)
+    mask_service = GenerateMaskService(hopter=hopter)
+    # Initialize dependencies
+    deps = ImageEditDeps(
+        edit_instruction=prompt,
+        image_url=image_url,
+        hopter_client=hopter,
+        mask_service=mask_service
+    )
+    async with image_edit_agent.run_stream(
+        messages,
+        deps=deps
+    ) as result:
+        async for message in result.stream():
+            print(message)
+if __name__ == "__main__":
+    asyncio.run(main())

src/agents/mask_generation_agent.py CHANGED Viewed

@@ -8,11 +8,9 @@ from typing import Optional
 import logfire
 from src.services.generate_mask import GenerateMaskService
 from src.hopter.client import Hopter, Environment, MagicReplaceInput, SuperResolutionInput
-from src.services.image_uploader import ImageUploader
 from src.utils import image_path_to_uri, download_image_to_data_uri, upload_image
 import base64
 import tempfile
-from PIL import Image
 load_dotenv()
@@ -20,127 +18,52 @@ logfire.configure(token=os.environ.get("LOGFIRE_TOKEN"))
 logfire.instrument_openai()
 system_prompt = """
-I will give you an editing instruction of the image.
-if the edit instruction involved modifying parts of the image, please generate a mask for it.
-if images are not provided, ask the user to provide an image.
 """
-@dataclass
-class ImageEditDeps:
-    edit_instruction: str
-    hopter_client: Hopter
-    mask_service: GenerateMaskService
-    image_url: Optional[str] = None
 model = OpenAIModel(
     "gpt-4o",
     api_key=os.environ.get("OPENAI_API_KEY"),
 )
 @dataclass
 class MaskGenerationResult:
     mask_image_base64: str
-@dataclass
-class EditImageResult:
-    edited_image_url: str
 mask_generation_agent = Agent(
     model,
-    system_prompt=system_prompt,
-    deps_type=ImageEditDeps
 )
-def upload_image_from_base64(base64_image: str) -> str:
-    image_format = base64_image.split(",")[0]
-    image_data = base64.b64decode(base64_image.split(",")[1])
-    suffix = ".jpg" if image_format == "image/jpeg" else ".png"
-    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
-        temp_filename = temp_file.name
-        with open(temp_filename, "wb") as f:
-            f.write(image_data)
-    return upload_image(temp_filename)
-@mask_generation_agent.tool
-async def edit_object(ctx: RunContext[ImageEditDeps]) -> EditImageResult:
-    """
-    Use this tool to edit an object in the image. for example:
-    - remove the pole
-    - replace the dog with a cat
-    - change the background to a beach
-    - remove the person in the image
-    - change the hair color to red
-    - change the hat to a cap
-    """
-    edit_instruction = ctx.deps.edit_instruction
-    image_url = ctx.deps.image_url
-    mask_service = ctx.deps.mask_service
-    hopter_client = ctx.deps.hopter_client
-    image_uri = download_image_to_data_uri(image_url)
-    # Generate mask
-    mask_instruction = mask_service.get_mask_generation_instruction(edit_instruction, image_url)
-    mask = mask_service.generate_mask(mask_instruction, image_uri)
-    # Magic replace
-    input = MagicReplaceInput(image=image_uri, mask=mask, prompt=mask_instruction.target_caption)
-    result = hopter_client.magic_replace(input)
-    uploaded_image = upload_image_from_base64(result.base64_image)
-    return EditImageResult(edited_image_url=uploaded_image)
 @mask_generation_agent.tool
-async def super_resolution(ctx: RunContext[ImageEditDeps]) -> EditImageResult:
     """
-    run super resolution, upscale, or enhance the image
     """
-    image_url = ctx.deps.image_url
-    hopter_client = ctx.deps.hopter_client
-    image_uri = download_image_to_data_uri(image_url)
-    input = SuperResolutionInput(image_b64=image_uri, scale=4, use_face_enhancement=False)
-    result = hopter_client.super_resolution(input)
-    uploaded_image = upload_image_from_base64(result.scaled_image)
-    return EditImageResult(edited_image_url=uploaded_image)
-async def main():
-    image_file_path = "./assets/lakeview.jpg"
-    image_url = image_path_to_uri(image_file_path)
-    prompt = "remove the light post"
-    messages = [
-        {
-            "type": "text",
-            "text": prompt
-        },
-        {
-            "type": "image_url",
-            "image_url": {
-                "url": image_url
-            }
-        }
-    ]
-    # Initialize services
-    hopter = Hopter(api_key=os.environ.get("HOPTER_API_KEY"), environment=Environment.STAGING)
-    mask_service = GenerateMaskService(hopter=hopter)
-    # Initialize dependencies
-    deps = ImageEditDeps(
-        edit_instruction=prompt,
-        image_url=image_url,
-        hopter_client=hopter,
-        mask_service=mask_service
-    )
-    async with mask_generation_agent.run_stream(
-        messages,
-        deps=deps
-    ) as result:
-        async for message in result.stream():
-            print(message)
-if __name__ == "__main__":
-    asyncio.run(main())

 import logfire
 from src.services.generate_mask import GenerateMaskService
 from src.hopter.client import Hopter, Environment, MagicReplaceInput, SuperResolutionInput
 from src.utils import image_path_to_uri, download_image_to_data_uri, upload_image
 import base64
 import tempfile
 load_dotenv()
 logfire.instrument_openai()
 system_prompt = """
+I will give you an editing instruction of the image. Perform the following tasks:
+<task_1>
+Please output which type of editing category it is in.
+You can choose from the following categories:
+1. Addition: Adding new objects within the images, e.g., add a bird
+2. Remove: Removing objects, e.g., remove the mask
+3. Local: Replace local parts of an object and later the object's attributes (e.g., make it smile) or alter an object's visual appearance without affecting its structure (e.g., change the cat to a dog)
+4. Global: Edit the entire image, e.g., let's see it in winter
+5. Background: Change the scene's background, e.g., have her walk on water, change the background to a beach, make the hedgehog in France, etc.
+Only output a single word, e.g., 'Addition'.
+</task_1>
+<task_2>
+Please output the subject needed to be edited. You only need to output the basic description of the object in no more than 5 words.  The output should only contain one noun.
+For example, the editing instruction is 'Change the white cat to a black dog'. Then you need to output: 'white cat'. Only output the new content. Do not output anything else.
+</task_2>
+<task_3>
+Please describe the new content that should be present in the image after applying the instruction.
+For example, if the original image content shows a grandmother wearing a mask and the instruction is 'remove the mask', your output should be: 'a grandmother'.
+The output should only include elements that remain in the image after the edit and should not mention elements that have been changed or removed, such as 'mask' in this example.
+Do not output 'sorry, xxx', even if it's a guess, directly output the answer you think is correct.
+</task_3>
 """
 model = OpenAIModel(
     "gpt-4o",
     api_key=os.environ.get("OPENAI_API_KEY"),
 )
 @dataclass
 class MaskGenerationResult:
     mask_image_base64: str
 mask_generation_agent = Agent(
     model,
+    system_prompt=system_prompt
 )
 @mask_generation_agent.tool
+async def generate_mask(edit_instruction: str, image_url: str) -> MaskGenerationResult:
     """
+    Use this tool to generate a mask for the image.
     """
+    pass

src/services/generate_mask.py CHANGED Viewed

@@ -1,8 +1,6 @@
-from pydantic import BaseModel, Field
 from openai import OpenAI
 import os
 from dotenv import load_dotenv
-import base64
 import asyncio
 from src.hopter.client import Hopter, RamGroundedSamInput, Environment
 from src.models.generate_mask_instruction import GenerateMaskInstruction

 from openai import OpenAI
 import os
 from dotenv import load_dotenv
 import asyncio
 from src.hopter.client import Hopter, RamGroundedSamInput, Environment
 from src.models.generate_mask_instruction import GenerateMaskInstruction

src/services/google_cloud_image_upload.py CHANGED Viewed

@@ -3,10 +3,12 @@ from PIL import Image
 import os
 import uuid
 import tempfile
 def get_credentials():
     credentials_json_string = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
     # create a temp file with the credentials
     with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".json") as temp_file:
         temp_file.write(credentials_json_string)

 import os
 import uuid
 import tempfile
+from dotenv import load_dotenv
+load_dotenv()
 def get_credentials():
     credentials_json_string = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
     # create a temp file with the credentials
     with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".json") as temp_file:
         temp_file.write(credentials_json_string)