"""Turn a natural-language image-edit instruction into a segmentation mask.

The module asks an OpenAI chat model to classify the edit and name the subject
to be edited, then passes that subject to Hopter's mask generation endpoint.
"""

import asyncio
import os

from dotenv import load_dotenv
from openai import OpenAI

from src.hopter.client import Hopter, RamGroundedSamInput, Environment
from src.models.generate_mask_instruction import GenerateMaskInstruction
from src.services.openai_file_upload import OpenAIFileUpload
from src.utils import download_image_to_data_uri

# Populate os.environ from a local .env file before any API key is read.
load_dotenv()

# System prompt sent verbatim to the chat model. It is assembled from adjacent
# string literals purely for readability; the resulting value (including the
# leading space, the single embedded newline and the trailing space) is
# unchanged.
system_prompt = (
    " I will give you an editing instruction of the image. "
    "Perform the following tasks: "
    "Please output which type of editing category it is in. "
    "You can choose from the following categories: "
    "1. Addition: Adding new objects within the images, e.g., add a bird "
    "2. Remove: Removing objects, e.g., remove the mask "
    "3. Local: Replace local parts of an object and later the object's "
    "attributes (e.g., make it smile) or alter an object's visual appearance "
    "without affecting its structure (e.g., change the cat to a dog) "
    "4. Global: Edit the entire image, e.g., let's see it in winter "
    "5. Background: Change the scene's background, e.g., have her walk on "
    "water, change the background to a beach, make the hedgehog in France, "
    "etc. "
    "Only output a single word, e.g., 'Addition'. "
    "Please output the subject needed to be edited. "
    "You only need to output the basic description of the object in no more "
    "than 5 words. "
    "The output should only contain one noun. "
    "For example, the editing instruction is 'Change the white cat to a "
    "black dog'. "
    "Then you need to output: 'white cat'. "
    "Only output the new content. "
    "Do not output anything else. "
    "Please describe the new content that should be present in the image "
    "after applying the instruction. "
    "For example, if the original image content shows a grandmother wearing "
    "a mask and the instruction is 'remove the mask', your output should be: "
    "'a grandmother'. "
    "The output should only include elements that remain in the image after "
    "the edit and should not mention elements that have been changed or "
    "removed, such as 'mask' in this example. \n"
    "Do not output 'sorry, xxx', even if it's a guess, directly output the "
    "answer you think is correct. "
)


class GenerateMaskService:
    """Derive a mask instruction from an edit request and generate the mask."""

    def __init__(self, hopter: Hopter):
        """Create the OpenAI client and keep a reference to the Hopter client.

        Args:
            hopter (Hopter): Client used for the mask generation call.
        """
        self.llm = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
        self.model = "gpt-4o"
        self.openai_file_upload = OpenAIFileUpload()
        self.hopter = hopter

    def get_mask_generation_instruction(
        self, edit_instruction: str, image_url: str
    ) -> GenerateMaskInstruction:
        """Ask the chat model for a structured mask generation instruction.

        Args:
            edit_instruction (str): The user's editing instruction.
            image_url (str): URL of the image, sent to the model as an
                ``image_url`` content part.

        Returns:
            GenerateMaskInstruction: The model response parsed into the
            structured instruction type.

        Raises:
            ValueError: If the response carries no parsed instruction (for
                example because the model refused to answer).
        """
        messages = [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": edit_instruction},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            },
        ]
        response = self.llm.beta.chat.completions.parse(
            model=self.model,
            messages=messages,
            response_format=GenerateMaskInstruction,
        )
        message = response.choices[0].message
        # `parsed` is optional in the SDK; fail here with a clear message
        # rather than handing None to callers that expect an instruction.
        if message.parsed is None:
            raise ValueError(
                "Model returned no parsed mask instruction "
                f"(refusal: {message.refusal!r})"
            )
        return message.parsed

    def generate_mask(
        self, mask_instruction: GenerateMaskInstruction, image_url: str
    ) -> str:
        """
        Generate a mask for the image editing instruction.

        Args:
            mask_instruction (GenerateMaskInstruction): The mask generation
                instruction; its ``subject`` is used as the text prompt.
            image_url (str): URL of the image to generate the mask for.

        Returns:
            str: The mask image in base64 format.
        """
        image_uri = download_image_to_data_uri(image_url)
        sam_input = RamGroundedSamInput(
            text_prompt=mask_instruction.subject, image_b64=image_uri
        )
        generate_mask_result = self.hopter.generate_mask(sam_input)
        return generate_mask_result.mask_b64


async def main():
    """Run a manual end-to-end check against a local sample image.

    Kept as a coroutine so the existing ``asyncio.run(main())`` entry point
    keeps working, although every call made here is synchronous.
    """
    service = GenerateMaskService(
        Hopter(
            api_key=os.environ.get("HOPTER_API_KEY"),
            environment=Environment.STAGING,
        )
    )

    edit_instruction = "remove the light post"
    image_file_path = "./assets/lakeview.jpg"

    # NOTE(review): assumes upload_image returns a URL usable both by the chat
    # model and by download_image_to_data_uri - confirm against OpenAIFileUpload.
    with open(image_file_path, "rb") as image_file:
        image_url = service.openai_file_upload.upload_image(
            image_file.read(), "vision"
        )

    instruction = service.get_mask_generation_instruction(
        edit_instruction, image_url
    )
    print(instruction)

    mask = service.generate_mask(instruction, image_url)
    print(mask)


if __name__ == "__main__":
    asyncio.run(main())