from openai import OpenAI
import os
from dotenv import load_dotenv
import asyncio
from src.hopter.client import Hopter, RamGroundedSamInput, Environment
from src.models.generate_mask_instruction import GenerateMaskInstruction
from src.services.openai_file_upload import OpenAIFileUpload
from src.utils import download_image_to_data_uri
# Load variables from a .env file (if present) into the process environment.
# The code below reads OPENAI_API_KEY and HOPTER_API_KEY from os.environ.
load_dotenv()

# System prompt sent as the "system" message when asking the chat model to turn
# a free-form edit instruction into a structured GenerateMaskInstruction.
# It asks for three things: the edit category, the subject to be edited, and a
# description of the image content after the edit.
# NOTE(review): in the "Local" category, "later the object's attributes" looks
# like a typo for "alter" -- confirm before changing, since this text is
# runtime input to the model.
# NOTE(review): "Only output the new content." in the subject example seems to
# belong to the third task (new-content description) -- confirm intent.
system_prompt = """
I will give you an editing instruction of the image. Perform the following tasks:
Please output which type of editing category it is in.
You can choose from the following categories:
1. Addition: Adding new objects within the images, e.g., add a bird
2. Remove: Removing objects, e.g., remove the mask
3. Local: Replace local parts of an object and later the object's attributes (e.g., make it smile) or alter an object's visual appearance without affecting its structure (e.g., change the cat to a dog)
4. Global: Edit the entire image, e.g., let's see it in winter
5. Background: Change the scene's background, e.g., have her walk on water, change the background to a beach, make the hedgehog in France, etc.
Only output a single word, e.g., 'Addition'.
Please output the subject needed to be edited. You only need to output the basic description of the object in no more than 5 words. The output should only contain one noun.
For example, the editing instruction is 'Change the white cat to a black dog'. Then you need to output: 'white cat'. Only output the new content. Do not output anything else.
Please describe the new content that should be present in the image after applying the instruction.
For example, if the original image content shows a grandmother wearing a mask and the instruction is 'remove the mask', your output should be: 'a grandmother'.
The output should only include elements that remain in the image after the edit and should not mention elements that have been changed or removed, such as 'mask' in this example.
Do not output 'sorry, xxx', even if it's a guess, directly output the answer you think is correct.
"""
class GenerateMaskService:
    """Turn a natural-language image edit instruction into a mask.

    The work happens in two steps:

    1. ``get_mask_generation_instruction`` asks an OpenAI chat model to parse
       the edit instruction (together with the image) into a structured
       ``GenerateMaskInstruction``.
    2. ``generate_mask`` passes the subject from that instruction, plus the
       image, to the Hopter client and returns the resulting mask.
    """

    def __init__(self, hopter: Hopter):
        """Create the service.

        Args:
            hopter: Client used to run mask generation.
        """
        self.llm = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
        self.model = "gpt-4o"
        self.openai_file_upload = OpenAIFileUpload()
        self.hopter = hopter

    def get_mask_generation_instruction(
        self, edit_instruction: str, image_url: str
    ) -> GenerateMaskInstruction:
        """Parse an edit instruction into a structured mask instruction.

        Args:
            edit_instruction: Free-form description of the desired edit.
            image_url: URL of the image the instruction refers to; it is sent
                to the model as an ``image_url`` content part.

        Returns:
            The ``GenerateMaskInstruction`` parsed from the model response.

        Raises:
            ValueError: If the model response carries no parsed instruction
                (for example because the model refused to answer).
        """
        messages = [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": edit_instruction},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            },
        ]
        response = self.llm.beta.chat.completions.parse(
            model=self.model,
            messages=messages,
            response_format=GenerateMaskInstruction,
        )
        message = response.choices[0].message
        instruction = message.parsed
        if instruction is None:
            # ``parsed`` is empty when the model declines to produce structured
            # output. Fail here with the reason instead of returning None and
            # letting a later ``.subject`` access raise an opaque AttributeError.
            refusal = getattr(message, "refusal", None)
            raise ValueError(
                "Model did not return a mask generation instruction: "
                f"{refusal or 'no parsed content'}"
            )
        return instruction

    def generate_mask(
        self, mask_instruction: GenerateMaskInstruction, image_url: str
    ) -> str:
        """
        Generate a mask for the image editing instruction.

        Args:
            mask_instruction (GenerateMaskInstruction): The mask generation
                instruction; its ``subject`` is used as the text prompt.
            image_url (str): URL of the image to generate the mask for.

        Returns:
            str: The mask image in base64 format.
        """
        image_uri = download_image_to_data_uri(image_url)
        # Named ``sam_input`` so the builtin ``input`` is not shadowed.
        sam_input = RamGroundedSamInput(
            text_prompt=mask_instruction.subject, image_b64=image_uri
        )
        generate_mask_result = self.hopter.generate_mask(sam_input)
        return generate_mask_result.mask_b64
async def main():
    """Run a manual end-to-end demo against the Hopter staging environment.

    Uploads a local sample image, asks the model for a mask generation
    instruction for a fixed edit request, then generates and prints the mask.
    """
    hopter_client = Hopter(
        api_key=os.environ.get("HOPTER_API_KEY"),
        environment=Environment.STAGING,
    )
    mask_service = GenerateMaskService(hopter_client)

    prompt = "remove the light post"
    source_path = "./assets/lakeview.jpg"

    # Upload while the file is still open; the returned value is used as the
    # image URL for both subsequent steps.
    with open(source_path, "rb") as fh:
        uploaded_url = mask_service.openai_file_upload.upload_image(
            fh.read(), "vision"
        )

    parsed_instruction = mask_service.get_mask_generation_instruction(
        prompt, uploaded_url
    )
    print(parsed_instruction)

    mask_b64 = mask_service.generate_mask(parsed_instruction, uploaded_url)
    print(mask_b64)


if __name__ == "__main__":
    asyncio.run(main())