Spaces:

slabstech
/

ghost-vision

Paused

App Files Files Community

sachin commited on 26 days ago

Commit

e9cc529

1 Parent(s): c66a631

improve memory management

Browse files

Files changed (1) hide show

intruct.py +157 -529

intruct.py CHANGED Viewed

@@ -1,231 +1,155 @@
-from fastapi import FastAPI, File, UploadFile, Form
-from fastapi.responses import StreamingResponse
 import io
 import math
-from PIL import Image, ImageOps, ImageDraw
 import torch
-from diffusers import StableDiffusionInstructPix2PixPipeline, StableDiffusionInpaintPipeline
-from fastapi import FastAPI, Response
-from fastapi.responses import FileResponse
-import torch
-from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, EulerDiscreteScheduler
 from huggingface_hub import hf_hub_download, login
 from safetensors.torch import load_file
-from io import BytesIO
-import os
-import base64
-from typing import List
-from fastapi import FastAPI, File, UploadFile, HTTPException
-from fastapi.responses import StreamingResponse
-from PIL import Image, ImageDraw, ImageFilter
-import io
-import torch
-import numpy as np
-from diffusers import StableDiffusionInpaintPipeline
-import cv2
-from fastapi import FastAPI, File, UploadFile, HTTPException
-from fastapi.responses import StreamingResponse, JSONResponse
-import torch
-from PIL import Image
-import io
-import numpy as np
-import cv2
 from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
 from sam2.sam2_image_predictor import SAM2ImagePredictor
 # Initialize FastAPI app
 app = FastAPI()
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# Load Grounding DINO model and processor at startup
-dino_model_id = "IDEA-Research/grounding-dino-base"
-dino_processor = AutoProcessor.from_pretrained(dino_model_id)
-dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(dino_model_id).to(device)
-# Load SAM 2 model at startup
-#sam_checkpoint = "sam2.1_hiera_tiny.pt"  # Replace with your checkpoint path
-sam_predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-tiny")
-sam_predictor.model.to(device)
-# Default text query
 DEFAULT_TEXT_QUERY = "a tank."
 def process_image_with_dino(image: Image.Image, text_query: str = DEFAULT_TEXT_QUERY):
-    """Detect objects using Grounding DINO."""
-    inputs = dino_processor(images=image, text=text_query, return_tensors="pt").to(device)
     with torch.no_grad():
-        outputs = dino_model(**inputs)
-    # Post-process results
-    results = dino_processor.post_process_grounded_object_detection(
-        outputs,
-        inputs.input_ids,
-        threshold=0.4,
-        text_threshold=0.3,
-        target_sizes=[image.size[::-1]]  # [width, height]
     )
-    return results[0]  # Single image result
 def segment_with_sam(image: Image.Image, boxes: list):
-    """Segment detected objects using SAM 2 and return a mask."""
     image_np = np.array(image)
-    sam_predictor.set_image(image_np)
     if not boxes:
-        return np.zeros(image_np.shape[:2], dtype=bool)  # Empty mask if no boxes
-    # Convert boxes to [x_min, y_min, x_max, y_max] tensor and move to device
     boxes_tensor = torch.tensor(
         [[box["x_min"], box["y_min"], box["x_max"], box["y_max"]] for box in boxes],
         dtype=torch.float32
     ).to(device)
-    # Predict with SAM 2 using boxes directly
-    masks, _, _ = sam_predictor.predict(
-        point_coords=None,
-        point_labels=None,
-        box=boxes_tensor,  # Use 'box' argument instead of 'boxes'
-        multimask_output=False
-    )
-    return masks[0]  # Return the first mask directly (already a NumPy array)
 def create_background_mask(image_np: np.ndarray, mask: np.ndarray) -> np.ndarray:
-    """Create an RGB mask for background removal (object preserved)."""
-    mask_inv = np.logical_not(mask).astype(np.uint8) * 255  # Invert mask (background is white)
-    mask_rgb = cv2.cvtColor(mask_inv, cv2.COLOR_GRAY2RGB)  # Convert to RGB
     return mask_rgb
 def create_object_mask(image_np: np.ndarray, mask: np.ndarray) -> np.ndarray:
-    """Create an RGB mask for object removal (background preserved)."""
-    mask_rgb = cv2.cvtColor(mask.astype(np.uint8) * 255, cv2.COLOR_GRAY2RGB)  # Object is white, background black
     return mask_rgb
-model_id_runway = "runwayml/stable-diffusion-inpainting"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-try:
-    pipe_runway = StableDiffusionInpaintPipeline.from_pretrained(model_id_runway)
-    pipe_runway.to(device)
-except Exception as e:
-    raise RuntimeError(f"Failed to load model: {e}")
-# Load the pre-trained InstructPix2Pix model for editing
-model_id = "timbrooks/instruct-pix2pix"
-pipe_edit = StableDiffusionInstructPix2PixPipeline.from_pretrained(
-    model_id, torch_dtype=torch.float16, safety_checker=None
-).to("cuda")
-# Load the pre-trained Inpainting model
-inpaint_model_id = "stabilityai/stable-diffusion-2-inpainting"
-pipe_inpaint = StableDiffusionInpaintPipeline.from_pretrained(
-    inpaint_model_id, torch_dtype=torch.float16, safety_checker=None
-).to("cuda")
-# Default configuration values
-DEFAULT_STEPS = 50
-DEFAULT_TEXT_CFG = 7.5
-DEFAULT_IMAGE_CFG = 1.5
-DEFAULT_SEED = 1371
-HF_TOKEN = os.getenv("HF_TOKEN")
-def load_model():
-    try:
-        # Login to Hugging Face if token is provided
-        if HF_TOKEN:
-            login(token=HF_TOKEN)
-        base = "stabilityai/stable-diffusion-xl-base-1.0"
-        repo = "ByteDance/SDXL-Lightning"
-        ckpt = "sdxl_lightning_4step_unet.safetensors"
-        # Load model with explicit error handling
-        unet = UNet2DConditionModel.from_config(
-            base,
-            subfolder="unet"
-        ).to("cuda", torch.float16)
-        unet.load_state_dict(load_file(hf_hub_download(repo, ckpt), device="cuda"))
-        pipe = StableDiffusionXLPipeline.from_pretrained(
-            base,
-            unet=unet,
-            torch_dtype=torch.float16,
-            variant="fp16"
-        ).to("cuda")
-        # Configure scheduler
-        pipe.scheduler = EulerDiscreteScheduler.from_config(
-            pipe.scheduler.config,
-            timestep_spacing="trailing"
-        )
-        return pipe
-    except Exception as e:
-        raise Exception(f"Failed to load model: {str(e)}")
-# Load model at startup with error handling
-try:
-    pipe_generate = load_model()
-except Exception as e:
-    print(f"Model initialization failed: {str(e)}")
-    raise
-@app.get("/generate")
-async def generate_image(prompt: str):
-    try:
-        # Generate image
-        image = pipe_generate(
-            prompt,
-            num_inference_steps=4,
-            guidance_scale=0
-        ).images[0]
-        # Save image to buffer
-        buffer = BytesIO()
-        image.save(buffer, format="PNG")
-        buffer.seek(0)
-        return Response(content=buffer.getvalue(), media_type="image/png")
-    except Exception as e:
-        return {"error": str(e)}
-@app.get("/health")
-async def health_check():
-    return {"status": "healthy"}
 def process_image(input_image: Image.Image, instruction: str, steps: int, text_cfg_scale: float, image_cfg_scale: float, seed: int):
-    """
-    Process the input image with the given instruction using InstructPix2Pix.
-    """
-    # Resize image to fit model requirements
     width, height = input_image.size
     factor = 512 / max(width, height)
     factor = math.ceil(min(width, height) * factor / 64) * 64 / min(width, height)
     width = int((width * factor) // 64) * 64
     height = int((height * factor) // 64) * 64
     input_image = ImageOps.fit(input_image, (width, height), method=Image.Resampling.LANCZOS)
     if not instruction:
         return input_image
-    # Set the random seed for reproducibility
     generator = torch.manual_seed(seed)
-    # Generate the edited image
-    edited_image = pipe_edit(
         instruction,
         image=input_image,
         guidance_scale=text_cfg_scale,
@@ -233,9 +157,25 @@ def process_image(input_image: Image.Image, instruction: str, steps: int, text_c
         num_inference_steps=steps,
         generator=generator,
     ).images[0]
     return edited_image
 @app.post("/edit-image/")
 async def edit_image(
     file: UploadFile = File(...),
@@ -245,79 +185,43 @@ async def edit_image(
     image_cfg_scale: float = Form(default=DEFAULT_IMAGE_CFG),
     seed: int = Form(default=DEFAULT_SEED)
 ):
-    """
-    Endpoint to edit an image based on a text instruction.
-    """
-    # Read and convert the uploaded image
-    image_data = await file.read()
-    input_image = Image.open(io.BytesIO(image_data)).convert("RGB")
-    # Process the image
-    edited_image = process_image(input_image, instruction, steps, text_cfg_scale, image_cfg_scale, seed)
-    # Convert the edited image to bytes
-    img_byte_arr = io.BytesIO()
-    edited_image.save(img_byte_arr, format="PNG")
-    img_byte_arr.seek(0)
-    # Return the image as a streaming response
-    return StreamingResponse(img_byte_arr, media_type="image/png")
-# New endpoint for inpainting
 @app.post("/inpaint/")
 async def inpaint_image(
     file: UploadFile = File(...),
     prompt: str = Form(...),
-    mask_coordinates: str = Form(...),  # Format: "x1,y1,x2,y2" (top-left and bottom-right of the rectangle to inpaint)
     steps: int = Form(default=DEFAULT_STEPS),
     guidance_scale: float = Form(default=7.5),
     seed: int = Form(default=DEFAULT_SEED)
 ):
-    """
-    Endpoint to perform inpainting on an image.
-    - file: The input image to inpaint.
-    - prompt: The text prompt describing what to generate in the inpainted area.
-    - mask_coordinates: Coordinates of the rectangular area to inpaint (format: "x1,y1,x2,y2").
-    - steps: Number of inference steps.
-    - guidance_scale: Guidance scale for the inpainting process.
-    - seed: Random seed for reproducibility.
-    """
     try:
-        # Read and convert the uploaded image
         image_data = await file.read()
         input_image = Image.open(io.BytesIO(image_data)).convert("RGB")
-        # Resize image to fit model requirements (must be divisible by 8 for inpainting)
         width, height = input_image.size
         factor = 512 / max(width, height)
         factor = math.ceil(min(width, height) * factor / 8) * 8 / min(width, height)
         width = int((width * factor) // 8) * 8
         height = int((height * factor) // 8) * 8
         input_image = ImageOps.fit(input_image, (width, height), method=Image.Resampling.LANCZOS)
-        # Create a mask for inpainting
-        mask = Image.new("L", (width, height), 0)  # Black image (0 = no inpainting)
         draw = ImageDraw.Draw(mask)
-        # Parse the mask coordinates
-        try:
-            x1, y1, x2, y2 = map(int, mask_coordinates.split(","))
-            # Adjust coordinates based on resized image
-            x1 = int(x1 * factor)
-            y1 = int(y1 * factor)
-            x2 = int(x2 * factor)
-            y2 = int(y2 * factor)
-        except ValueError:
-            return {"error": "Invalid mask coordinates format. Use 'x1,y1,x2,y2'."}
-        # Draw a white rectangle on the mask (255 = area to inpaint)
         draw.rectangle([x1, y1, x2, y2], fill=255)
-        # Set the random seed for reproducibility
         generator = torch.manual_seed(seed)
-        # Perform inpainting
-        inpainted_image = pipe_inpaint(
             prompt=prompt,
             image=input_image,
             mask_image=mask,
@@ -325,332 +229,56 @@ async def inpaint_image(
             guidance_scale=guidance_scale,
             generator=generator,
         ).images[0]
-        # Convert the inpainted image to bytes
         img_byte_arr = io.BytesIO()
         inpainted_image.save(img_byte_arr, format="PNG")
         img_byte_arr.seek(0)
-        # Return the image as a streaming response
         return StreamingResponse(img_byte_arr, media_type="image/png")
-    except Exception as e:
-        return {"error": str(e)}
-@app.get("/")
-async def root():
-    """
-    Root endpoint for basic health check.
-    """
-    return {"message": "InstructPix2Pix API is running. Use POST /edit-image/ or /inpaint/ to edit images."}
-# Helper functions
-def prepare_guided_image(original_image: Image, reference_image: Image, mask_image: Image) -> Image:
-    original_array = np.array(original_image)
-    reference_array = np.array(reference_image)
-    mask_array = np.array(mask_image) / 255.0
-    mask_array = mask_array[:, :, np.newaxis]
-    blended_array = original_array * (1 - mask_array) + reference_array * mask_array
-    return Image.fromarray(blended_array.astype(np.uint8))
-def soften_mask(mask_image: Image, softness: int = 5) -> Image:
-    from PIL import ImageFilter
-    return mask_image.filter(ImageFilter.GaussianBlur(radius=softness))
-def generate_rectangular_mask(image_size: tuple, x1: int = 100, y1: int = 100, x2: int = 200, y2: int = 200) -> Image:
-    mask = Image.new("L", image_size, 0)
-    draw = ImageDraw.Draw(mask)
-    draw.rectangle([x1, y1, x2, y2], fill=255)
-    return mask
-def segment_tank(tank_image: Image) -> tuple[Image, Image]:
-    tank_array = np.array(tank_image.convert("RGB"))
-    tank_array = cv2.cvtColor(tank_array, cv2.COLOR_RGB2BGR)
-    hsv = cv2.cvtColor(tank_array, cv2.COLOR_BGR2HSV)
-    lower_snow = np.array([0, 0, 180])
-    upper_snow = np.array([180, 50, 255])
-    snow_mask = cv2.inRange(hsv, lower_snow, upper_snow)
-    tank_mask = cv2.bitwise_not(snow_mask)
-    kernel = np.ones((5, 5), np.uint8)
-    tank_mask = cv2.erode(tank_mask, kernel, iterations=1)
-    tank_mask = cv2.dilate(tank_mask, kernel, iterations=1)
-    tank_mask_image = Image.fromarray(tank_mask, mode="L")
-    tank_array_rgb = np.array(tank_image.convert("RGB"))
-    mask_array = tank_mask / 255.0
-    mask_array = mask_array[:, :, np.newaxis]
-    segmented_tank = (tank_array_rgb * mask_array).astype(np.uint8)
-    alpha = tank_mask
-    segmented_tank_rgba = np.zeros((tank_image.height, tank_image.width, 4), dtype=np.uint8)
-    segmented_tank_rgba[:, :, :3] = segmented_tank
-    segmented_tank_rgba[:, :, 3] = alpha
-    segmented_tank_image = Image.fromarray(segmented_tank_rgba, mode="RGBA")
-    return segmented_tank_image, tank_mask_image
-async def apply_camouflage_to_tank(tank_image: Image) -> Image:
-    segmented_tank, tank_mask = segment_tank(tank_image)
-    segmented_tank.save("segmented_tank.png")
-    tank_mask.save("tank_mask.png")
-    camouflaged_tank = pipe_runway(
-        prompt="Apply a grassy camouflage pattern with shades of green and brown to the tank, preserving its structure.",
-        image=segmented_tank.convert("RGB"),
-        mask_image=tank_mask,
-        strength=0.5,
-        guidance_scale=8.0,
-        num_inference_steps=50,
-        negative_prompt="snow, ice, rock, stone, boat, unrelated objects"
-    ).images[0]
-    camouflaged_tank_rgba = np.zeros((camouflaged_tank.height, camouflaged_tank.width, 4), dtype=np.uint8)
-    camouflaged_tank_rgba[:, :, :3] = np.array(camouflaged_tank)
-    camouflaged_tank_rgba[:, :, 3] = np.array(tank_mask)
-    camouflaged_tank_image = Image.fromarray(camouflaged_tank_rgba, mode="RGBA")
-    camouflaged_tank_image.save("camouflaged_tank.png")
-    return camouflaged_tank_image
-def fit_image_to_mask(original_image: Image, reference_image: Image, mask_x1: int, mask_y1: int, mask_x2: int, mask_y2: int) -> tuple:
-    mask_width = mask_x2 - mask_x1
-    mask_height = mask_y2 - mask_y1
-    if mask_width <= 0 or mask_height <= 0:
-        raise ValueError("Mask dimensions must be positive")
-    ref_width, ref_height = reference_image.size
-    aspect_ratio = ref_width / ref_height
-    if mask_width / mask_height > aspect_ratio:
-        new_height = mask_height
-        new_width = int(new_height * aspect_ratio)
-    else:
-        new_width = mask_width
-        new_height = int(new_width / aspect_ratio)
-    reference_image_resized = reference_image.resize((new_width, new_height), Image.Resampling.LANCZOS)
-    guided_image = original_image.copy().convert("RGB")
-    paste_x = mask_x1 + (mask_width - new_width) // 2
-    paste_y = mask_y1 + (mask_height - new_height) // 2
-    guided_image.paste(reference_image_resized, (paste_x, paste_y), reference_image_resized)
-    mask_image = generate_rectangular_mask(original_image.size, mask_x1, mask_y1, mask_x2, mask_y2)
-    return guided_image, mask_image
-# Endpoints
-@app.post("/inpaint/")
-async def inpaint_image(
-    image: UploadFile = File(...),
-    mask: UploadFile = File(...),
-    prompt: str = "Fill the masked area with appropriate content."
-):
-    try:
-        image_bytes = await image.read()
-        mask_bytes = await mask.read()
-        original_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-        mask_image = Image.open(io.BytesIO(mask_bytes)).convert("L")
-        if original_image.size != mask_image.size:
-            raise HTTPException(status_code=400, detail="Image and mask dimensions must match.")
-        result = pipe_runway(prompt=prompt, image=original_image, mask_image=mask_image).images[0]
-        result_bytes = io.BytesIO()
-        result.save(result_bytes, format="PNG")
-        result_bytes.seek(0)
-        return StreamingResponse(
-            result_bytes,
-            media_type="image/png",
-            headers={"Content-Disposition": "attachment; filename=inpainted_image.png"}
-        )
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error during inpainting: {e}")
-@app.post("/inpaint-with-reference/")
-async def inpaint_with_reference(
-    image: UploadFile = File(...),
-    reference_image: UploadFile = File(...),
-    prompt: str = "Integrate the reference content naturally into the masked area, matching style and lighting.",
-    mask_x1: int = 100,
-    mask_y1: int = 100,
-    mask_x2: int = 200,
-    mask_y2: int = 200
-):
-    try:
-        image_bytes = await image.read()
-        reference_bytes = await reference_image.read()
-        original_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-        reference_image = Image.open(io.BytesIO(reference_bytes)).convert("RGB")
-        if original_image.size != reference_image.size:
-            reference_image = reference_image.resize(original_image.size, Image.Resampling.LANCZOS)
-        mask_image = generate_rectangular_mask(original_image.size, mask_x1, mask_y1, mask_x2, mask_y2)
-        softened_mask = soften_mask(mask_image, softness=5)
-        guided_image = prepare_guided_image(original_image, reference_image, softened_mask)
-        result = pipe_runway(
-            prompt=prompt,
-            image=guided_image,
-            mask_image=softened_mask,
-            strength=0.75,
-            guidance_scale=7.5
-        ).images[0]
-        result_bytes = io.BytesIO()
-        result.save(result_bytes, format="PNG")
-        result_bytes.seek(0)
-        return StreamingResponse(
-            result_bytes,
-            media_type="image/png",
-            headers={"Content-Disposition": "attachment; filename=natural_inpaint_image.png"}
-        )
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error during natural inpainting: {e}")
-@app.post("/fit-image-to-mask/")
-async def fit_image_to_mask_endpoint(
-    image: UploadFile = File(...),
-    reference_image: UploadFile = File(...),
-    mask_x1: int = 200,
-    mask_y1: int = 200,
-    mask_x2: int = 500,
-    mask_y2: int = 500
-):
-    try:
-        image_bytes = await image.read()
-        reference_bytes = await reference_image.read()
-        original_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-        reference_image = Image.open(io.BytesIO(reference_bytes)).convert("RGB")
-        camouflaged_tank = await apply_camouflage_to_tank(reference_image)
-        guided_image, mask_image = fit_image_to_mask(original_image, camouflaged_tank, mask_x1, mask_y1, mask_x2, mask_y2)
-        guided_image.save("guided_image_before_blending.png")
-        softened_mask = soften_mask(mask_image, softness=2)
-        result = pipe_runway(
-            prompt="Blend the camouflaged tank into the grassy field with trees, ensuring a non-snowy environment, matching the style, lighting, and surroundings.",
-            image=guided_image,
-            mask_image=softened_mask,
-            strength=0.2,
-            guidance_scale=7.5,
-            num_inference_steps=50,
-            negative_prompt="snow, ice, rock, stone, boat, unrelated objects"
-        ).images[0]
-        result_bytes = io.BytesIO()
-        result.save(result_bytes, format="PNG")
-        result_bytes.seek(0)
-        return StreamingResponse(
-            result_bytes,
-            media_type="image/png",
-            headers={"Content-Disposition": "attachment; filename=fitted_image.png"}
-        )
-    except ValueError as ve:
-        raise HTTPException(status_code=400, detail=f"ValueError in processing: {str(ve)}")
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error during fitting and inpainting: {str(e)}")
 @app.post("/detect-json/")
-async def detect_json(
-    file: UploadFile = File(..., description="Image file to process"),
-    text_query: str = DEFAULT_TEXT_QUERY
-):
-    """Endpoint to detect objects and return bounding box information as JSON."""
     try:
-        # Read and convert the uploaded image
         image_data = await file.read()
         image = Image.open(io.BytesIO(image_data)).convert("RGB")
-        # Process with Grounding DINO
         results = process_image_with_dino(image, text_query)
-        # Format results as JSON-compatible data
-        detections = []
-        for box, label, score in zip(results["boxes"], results["labels"], results["scores"]):
-            x_min, y_min, x_max, y_max = box.tolist()
-            detections.append({
                 "label": label,
-                "score": float(score),  # Convert tensor to float
-                "box": {
-                    "x_min": x_min,
-                    "y_min": y_min,
-                    "x_max": x_max,
-                    "y_max": y_max
-                }
-            })
         return JSONResponse(content={"detections": detections})
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error processing image: {str(e)}")
 @app.post("/segment-image/")
-async def segment_image(
-    file: UploadFile = File(..., description="Image file to process"),
-    text_query: str = DEFAULT_TEXT_QUERY
-):
-    """Endpoint to segment objects and return the image with background removed."""
     try:
-        # Read and convert the uploaded image
         image_data = await file.read()
         image = Image.open(io.BytesIO(image_data)).convert("RGB")
-        # Detect objects with Grounding DINO
         results = process_image_with_dino(image, text_query)
-        # Extract boxes for segmentation, move to CPU
         boxes = [
             {"x_min": box[0].item(), "y_min": box[1].item(), "x_max": box[2].item(), "y_max": box[3].item()}
-            for box in results["boxes"].cpu()  # Move tensor to CPU here
         ]
-        # Segment with SAM 2
         mask = segment_with_sam(image, boxes)
-        # Create background mask and apply it
         image_np = np.array(image)
         background_mask = create_background_mask(image_np, mask)
         segmented_image = cv2.bitwise_and(image_np, background_mask)
-        # Convert to PIL Image and save to bytes
         output_image = Image.fromarray(segmented_image)
         img_byte_arr = io.BytesIO()
         output_image.save(img_byte_arr, format="PNG")
         img_byte_arr.seek(0)
-        return StreamingResponse(
-            img_byte_arr,
-            media_type="image/png",
-            headers={"Content-Disposition": "attachment; filename=segmented_image.png"}
-        )
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error processing image: {str(e)}")
-@app.post("/mask-object/")
-async def mask_object(
-    file: UploadFile = File(..., description="Image file to process"),
-    text_query: str = DEFAULT_TEXT_QUERY
-):
-    """Endpoint to mask the detected object and return the image with the object removed."""
-    try:
-        # Read and convert the uploaded image
-        image_data = await file.read()
-        image = Image.open(io.BytesIO(image_data)).convert("RGB")
-        # Detect objects with Grounding DINO
-        results = process_image_with_dino(image, text_query)
-        # Extract boxes for segmentation, move to CPU
-        boxes = [
-            {"x_min": box[0].item(), "y_min": box[1].item(), "x_max": box[2].item(), "y_max": box[3].item()}
-            for box in results["boxes"].cpu()  # Move tensor to CPU here
-        ]
-        # Segment with SAM 2
-        mask = segment_with_sam(image, boxes)
-        # Create object mask and apply it
-        image_np = np.array(image)
-        object_mask = create_object_mask(image_np, mask)
-        masked_image = cv2.bitwise_and(image_np, object_mask)
-        # Convert to PIL Image and save to bytes
-        output_image = Image.fromarray(masked_image)
-        img_byte_arr = io.BytesIO()
-        output_image.save(img_byte_arr, format="PNG")
-        img_byte_arr.seek(0)
-        return StreamingResponse(
-            img_byte_arr,
-            media_type="image/png",
-            headers={"Content-Disposition": "attachment; filename=masked_object_image.png"}
-        )
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error processing image: {str(e)}")
 if __name__ == "__main__":
     import uvicorn

+from fastapi import FastAPI, File, UploadFile, Form, HTTPException
+from fastapi.responses import StreamingResponse, JSONResponse, Response
 import io
 import math
+from PIL import Image, ImageOps, ImageDraw, ImageFilter
 import torch
+import numpy as np
+from diffusers import (
+    StableDiffusionInstructPix2PixPipeline,
+    StableDiffusionInpaintPipeline,
+    StableDiffusionXLPipeline,
+    UNet2DConditionModel,
+    EulerDiscreteScheduler,
+)
 from huggingface_hub import hf_hub_download, login
 from safetensors.torch import load_file
 from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
 from sam2.sam2_image_predictor import SAM2ImagePredictor
+import cv2
+import os
+from typing import Optional
 # Initialize FastAPI app
 app = FastAPI()
+# Device configuration
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# Model variables (initially None, loaded lazily)
+pipe_edit = None  # InstructPix2Pix
+pipe_inpaint = None  # Stable Diffusion Inpainting
+pipe_generate = None  # Stable Diffusion XL
+pipe_runway = None  # Runway Inpainting
+dino_processor = None  # Grounding DINO processor
+dino_model = None  # Grounding DINO model
+sam_predictor = None  # SAM 2 predictor
+# Default configuration values
+DEFAULT_STEPS = 50
+DEFAULT_TEXT_CFG = 7.5
+DEFAULT_IMAGE_CFG = 1.5
+DEFAULT_SEED = 1371
 DEFAULT_TEXT_QUERY = "a tank."
+HF_TOKEN = os.getenv("HF_TOKEN")
+# Helper functions for lazy loading
+def load_instruct_pix2pix() -> StableDiffusionInstructPix2PixPipeline:
+    global pipe_edit
+    if pipe_edit is None:
+        model_id = "timbrooks/instruct-pix2pix"
+        pipe_edit = StableDiffusionInstructPix2PixPipeline.from_pretrained(
+            model_id, torch_dtype=torch.float16, safety_checker=None
+        ).to(device)
+    return pipe_edit
+def load_inpaint_pipeline() -> StableDiffusionInpaintPipeline:
+    global pipe_inpaint
+    if pipe_inpaint is None:
+        inpaint_model_id = "stabilityai/stable-diffusion-2-inpainting"
+        pipe_inpaint = StableDiffusionInpaintPipeline.from_pretrained(
+            inpaint_model_id, torch_dtype=torch.float16, safety_checker=None
+        ).to(device)
+    return pipe_inpaint
+def load_generate_pipeline() -> StableDiffusionXLPipeline:
+    global pipe_generate
+    if pipe_generate is None:
+        try:
+            if HF_TOKEN:
+                login(token=HF_TOKEN)
+            base = "stabilityai/stable-diffusion-xl-base-1.0"
+            repo = "ByteDance/SDXL-Lightning"
+            ckpt = "sdxl_lightning_4step_unet.safetensors"
+            unet = UNet2DConditionModel.from_config(base, subfolder="unet").to(device, torch.float16)
+            unet.load_state_dict(load_file(hf_hub_download(repo, ckpt), device=device))
+            pipe_generate = StableDiffusionXLPipeline.from_pretrained(
+                base, unet=unet, torch_dtype=torch.float16, variant="fp16"
+            ).to(device)
+            pipe_generate.scheduler = EulerDiscreteScheduler.from_config(
+                pipe_generate.scheduler.config, timestep_spacing="trailing"
+            )
+        except Exception as e:
+            raise RuntimeError(f"Failed to load generate pipeline: {str(e)}")
+    return pipe_generate
+def load_runway_inpaint() -> StableDiffusionInpaintPipeline:
+    global pipe_runway
+    if pipe_runway is None:
+        model_id_runway = "runwayml/stable-diffusion-inpainting"
+        pipe_runway = StableDiffusionInpaintPipeline.from_pretrained(model_id_runway).to(device)
+    return pipe_runway
+def load_dino() -> tuple[AutoProcessor, AutoModelForZeroShotObjectDetection]:
+    global dino_processor, dino_model
+    if dino_processor is None or dino_model is None:
+        dino_model_id = "IDEA-Research/grounding-dino-base"
+        dino_processor = AutoProcessor.from_pretrained(dino_model_id)
+        dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(dino_model_id).to(device)
+    return dino_processor, dino_model
+def load_sam() -> SAM2ImagePredictor:
+    global sam_predictor
+    if sam_predictor is None:
+        sam_predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-tiny")
+        sam_predictor.model.to(device)
+    return sam_predictor
+# Image processing helper functions (unchanged, included for completeness)
 def process_image_with_dino(image: Image.Image, text_query: str = DEFAULT_TEXT_QUERY):
+    processor, model = load_dino()
+    inputs = processor(images=image, text=text_query, return_tensors="pt").to(device)
     with torch.no_grad():
+        outputs = model(**inputs)
+    results = processor.post_process_grounded_object_detection(
+        outputs, inputs.input_ids, threshold=0.4, text_threshold=0.3, target_sizes=[image.size[::-1]]
     )
+    return results[0]
 def segment_with_sam(image: Image.Image, boxes: list):
+    predictor = load_sam()
     image_np = np.array(image)
+    predictor.set_image(image_np)
     if not boxes:
+        return np.zeros(image_np.shape[:2], dtype=bool)
     boxes_tensor = torch.tensor(
         [[box["x_min"], box["y_min"], box["x_max"], box["y_max"]] for box in boxes],
         dtype=torch.float32
     ).to(device)
+    masks, _, _ = predictor.predict(point_coords=None, point_labels=None, box=boxes_tensor, multimask_output=False)
+    return masks[0]
 def create_background_mask(image_np: np.ndarray, mask: np.ndarray) -> np.ndarray:
+    mask_inv = np.logical_not(mask).astype(np.uint8) * 255
+    mask_rgb = cv2.cvtColor(mask_inv, cv2.COLOR_GRAY2RGB)
     return mask_rgb
 def create_object_mask(image_np: np.ndarray, mask: np.ndarray) -> np.ndarray:
+    mask_rgb = cv2.cvtColor(mask.astype(np.uint8) * 255, cv2.COLOR_GRAY2RGB)
     return mask_rgb
 def process_image(input_image: Image.Image, instruction: str, steps: int, text_cfg_scale: float, image_cfg_scale: float, seed: int):
     width, height = input_image.size
     factor = 512 / max(width, height)
     factor = math.ceil(min(width, height) * factor / 64) * 64 / min(width, height)
     width = int((width * factor) // 64) * 64
     height = int((height * factor) // 64) * 64
     input_image = ImageOps.fit(input_image, (width, height), method=Image.Resampling.LANCZOS)
     if not instruction:
         return input_image
     generator = torch.manual_seed(seed)
+    pipe = load_instruct_pix2pix()
+    edited_image = pipe(
         instruction,
         image=input_image,
         guidance_scale=text_cfg_scale,
         num_inference_steps=steps,
         generator=generator,
     ).images[0]
     return edited_image
+# Endpoints
+@app.get("/generate")
+async def generate_image(prompt: str):
+    try:
+        pipe = load_generate_pipeline()
+        image = pipe(prompt, num_inference_steps=4, guidance_scale=0).images[0]
+        buffer = io.BytesIO()
+        image.save(buffer, format="PNG")
+        buffer.seek(0)
+        return Response(content=buffer.getvalue(), media_type="image/png")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error generating image: {str(e)}")
+@app.get("/health")
+async def health_check():
+    return {"status": "healthy"}
 @app.post("/edit-image/")
 async def edit_image(
     file: UploadFile = File(...),
     image_cfg_scale: float = Form(default=DEFAULT_IMAGE_CFG),
     seed: int = Form(default=DEFAULT_SEED)
 ):
+    try:
+        image_data = await file.read()
+        input_image = Image.open(io.BytesIO(image_data)).convert("RGB")
+        edited_image = process_image(input_image, instruction, steps, text_cfg_scale, image_cfg_scale, seed)
+        img_byte_arr = io.BytesIO()
+        edited_image.save(img_byte_arr, format="PNG")
+        img_byte_arr.seek(0)
+        return StreamingResponse(img_byte_arr, media_type="image/png")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error editing image: {str(e)}")
 @app.post("/inpaint/")
 async def inpaint_image(
     file: UploadFile = File(...),
     prompt: str = Form(...),
+    mask_coordinates: str = Form(...),
     steps: int = Form(default=DEFAULT_STEPS),
     guidance_scale: float = Form(default=7.5),
     seed: int = Form(default=DEFAULT_SEED)
 ):
     try:
         image_data = await file.read()
         input_image = Image.open(io.BytesIO(image_data)).convert("RGB")
         width, height = input_image.size
         factor = 512 / max(width, height)
         factor = math.ceil(min(width, height) * factor / 8) * 8 / min(width, height)
         width = int((width * factor) // 8) * 8
         height = int((height * factor) // 8) * 8
         input_image = ImageOps.fit(input_image, (width, height), method=Image.Resampling.LANCZOS)
+        mask = Image.new("L", (width, height), 0)
         draw = ImageDraw.Draw(mask)
+        x1, y1, x2, y2 = map(int, mask_coordinates.split(","))
+        x1, y1, x2, y2 = int(x1 * factor), int(y1 * factor), int(x2 * factor), int(y2 * factor)
         draw.rectangle([x1, y1, x2, y2], fill=255)
         generator = torch.manual_seed(seed)
+        pipe = load_inpaint_pipeline()
+        inpainted_image = pipe(
             prompt=prompt,
             image=input_image,
             mask_image=mask,
             guidance_scale=guidance_scale,
             generator=generator,
         ).images[0]
         img_byte_arr = io.BytesIO()
         inpainted_image.save(img_byte_arr, format="PNG")
         img_byte_arr.seek(0)
         return StreamingResponse(img_byte_arr, media_type="image/png")
+    except ValueError:
+        raise HTTPException(status_code=400, detail="Invalid mask coordinates format. Use 'x1,y1,x2,y2'.")
     except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error inpainting image: {str(e)}")
 @app.post("/detect-json/")
+async def detect_json(file: UploadFile = File(...), text_query: str = DEFAULT_TEXT_QUERY):
     try:
         image_data = await file.read()
         image = Image.open(io.BytesIO(image_data)).convert("RGB")
         results = process_image_with_dino(image, text_query)
+        detections = [
+            {
                 "label": label,
+                "score": float(score),
+                "box": {"x_min": box[0].item(), "y_min": box[1].item(), "x_max": box[2].item(), "y_max": box[3].item()}
+            }
+            for box, label, score in zip(results["boxes"].cpu(), results["labels"], results["scores"])
+        ]
         return JSONResponse(content={"detections": detections})
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error processing image: {str(e)}")
 @app.post("/segment-image/")
+async def segment_image(file: UploadFile = File(...), text_query: str = DEFAULT_TEXT_QUERY):
     try:
         image_data = await file.read()
         image = Image.open(io.BytesIO(image_data)).convert("RGB")
         results = process_image_with_dino(image, text_query)
         boxes = [
             {"x_min": box[0].item(), "y_min": box[1].item(), "x_max": box[2].item(), "y_max": box[3].item()}
+            for box in results["boxes"].cpu()
         ]
         mask = segment_with_sam(image, boxes)
         image_np = np.array(image)
         background_mask = create_background_mask(image_np, mask)
         segmented_image = cv2.bitwise_and(image_np, background_mask)
         output_image = Image.fromarray(segmented_image)
         img_byte_arr = io.BytesIO()
         output_image.save(img_byte_arr, format="PNG")
         img_byte_arr.seek(0)
+        return StreamingResponse(img_byte_arr, media_type="image/png")
     except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error segmenting image: {str(e)}")
+# Add other endpoints (e.g., /mask-object/, /fit-image-to-mask/) with similar lazy loading patterns as needed
 if __name__ == "__main__":
     import uvicorn