Spaces:
Paused
Paused
from fastapi import FastAPI, File, UploadFile, Form | |
from fastapi.responses import StreamingResponse | |
import io | |
import math | |
from PIL import Image, ImageOps, ImageDraw | |
import torch | |
from diffusers import StableDiffusionInstructPix2PixPipeline, StableDiffusionInpaintPipeline | |
from fastapi import FastAPI, Response | |
from fastapi.responses import FileResponse | |
import torch | |
from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, EulerDiscreteScheduler | |
from huggingface_hub import hf_hub_download, login | |
from safetensors.torch import load_file | |
from io import BytesIO | |
import os | |
import base64 | |
from typing import List | |
from fastapi import FastAPI, File, UploadFile, HTTPException | |
from fastapi.responses import StreamingResponse | |
from PIL import Image, ImageDraw, ImageFilter | |
import io | |
import torch | |
import numpy as np | |
from diffusers import StableDiffusionInpaintPipeline | |
import cv2 | |
# Initialize FastAPI app
app = FastAPI()

# Pick the accelerator once; every pipeline in this section is placed on it.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is requested only on GPU; on CPU fall back to float32
# (NOTE(review): many CPU kernels lack float16 support, confirm if CPU serving matters).
_pipeline_dtype = torch.float16 if device == "cuda" else torch.float32

# RunwayML inpainting pipeline, loaded at the library's default precision.
model_id_runway = "runwayml/stable-diffusion-inpainting"
try:
    pipe_runway = StableDiffusionInpaintPipeline.from_pretrained(model_id_runway)
    pipe_runway.to(device)
except Exception as e:
    # Chain the cause so the original traceback is preserved in the logs.
    raise RuntimeError(f"Failed to load model: {e}") from e

# Load the pre-trained InstructPix2Pix model for editing
model_id = "timbrooks/instruct-pix2pix"
pipe_edit = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    model_id, torch_dtype=_pipeline_dtype, safety_checker=None
).to(device)

# Load the pre-trained Inpainting model
inpaint_model_id = "stabilityai/stable-diffusion-2-inpainting"
pipe_inpaint = StableDiffusionInpaintPipeline.from_pretrained(
    inpaint_model_id, torch_dtype=_pipeline_dtype, safety_checker=None
).to(device)

# Default configuration values shared by the editing / inpainting endpoints
DEFAULT_STEPS = 50
DEFAULT_TEXT_CFG = 7.5
DEFAULT_IMAGE_CFG = 1.5
DEFAULT_SEED = 1371

# Optional Hugging Face access token, read from the environment (None if unset)
HF_TOKEN = os.getenv("HF_TOKEN")
def load_model():
    """Build the SDXL-Lightning text-to-image pipeline.

    Logs in to the Hugging Face Hub when ``HF_TOKEN`` is set, creates an SDXL
    UNet from the base model's config, loads the 4-step SDXL-Lightning UNet
    weights into it, and wraps it in a ``StableDiffusionXLPipeline`` that uses
    an Euler scheduler with ``timestep_spacing="trailing"``.

    The pipeline is placed on the module-level ``device`` (float16 on CUDA,
    float32 otherwise) instead of a hard-coded ``"cuda"``.

    Returns:
        The configured ``StableDiffusionXLPipeline``.

    Raises:
        RuntimeError: if any step of the download / load fails; the original
            exception is attached as ``__cause__``.
    """
    try:
        # Login to Hugging Face if token is provided
        if HF_TOKEN:
            login(token=HF_TOKEN)

        base = "stabilityai/stable-diffusion-xl-base-1.0"
        repo = "ByteDance/SDXL-Lightning"
        ckpt = "sdxl_lightning_4step_unet.safetensors"

        # On CUDA this is identical to the previous hard-coded configuration.
        dtype = torch.float16 if device == "cuda" else torch.float32

        # Build an empty UNet from the base config, then fill in the
        # Lightning-distilled weights.
        unet = UNet2DConditionModel.from_config(base, subfolder="unet").to(device, dtype)
        unet.load_state_dict(load_file(hf_hub_download(repo, ckpt), device=device))

        pipe = StableDiffusionXLPipeline.from_pretrained(
            base,
            unet=unet,
            torch_dtype=dtype,
            variant="fp16",
        ).to(device)

        # Configure scheduler
        pipe.scheduler = EulerDiscreteScheduler.from_config(
            pipe.scheduler.config,
            timestep_spacing="trailing",
        )
        return pipe
    except Exception as e:
        # RuntimeError is still caught by callers using `except Exception`.
        raise RuntimeError(f"Failed to load model: {e}") from e
# Build the text-to-image pipeline once at import time. A failure is printed
# and then re-raised unchanged, so the process does not start without it.
try:
    _startup_pipeline = load_model()
except Exception as e:
    print(f"Model initialization failed: {str(e)}")
    raise
else:
    pipe_generate = _startup_pipeline
async def generate_image(prompt: str):
    """Generate a PNG image for ``prompt`` with the module-level ``pipe_generate``.

    Args:
        prompt: Text description of the image to generate.

    Returns:
        A ``Response`` carrying the PNG bytes (``image/png``) on success.
        On failure, a JSON body ``{"error": <message>}`` with the same shape
        as before, but now sent with HTTP status 500 instead of 200 so
        clients can detect the failure from the status code.
    """
    try:
        # 4 inference steps with guidance disabled
        # (NOTE(review): presumably chosen to match the 4-step Lightning checkpoint, confirm).
        image = pipe_generate(
            prompt,
            num_inference_steps=4,
            guidance_scale=0,
        ).images[0]

        # Serialise to PNG in memory; getvalue() returns the full buffer
        # regardless of the stream position, so no seek is needed.
        buffer = BytesIO()
        image.save(buffer, format="PNG")
        return Response(content=buffer.getvalue(), media_type="image/png")
    except Exception as e:
        # Local import keeps this block independent of import order in the module.
        from fastapi.responses import JSONResponse

        return JSONResponse(status_code=500, content={"error": str(e)})
async def health_check():
    """Return a fixed payload indicating the service is up."""
    return dict(status="healthy")
def process_image(input_image: Image.Image, instruction: str, steps: int, text_cfg_scale: float, image_cfg_scale: float, seed: int):
    """Edit ``input_image`` according to ``instruction`` using InstructPix2Pix.

    The image is first rescaled so that its longer side is roughly 512 px and
    both sides are multiples of 64, then passed to ``pipe_edit``.

    Args:
        input_image: Source image.
        instruction: Text instruction; when empty, the resized image is
            returned without running the model.
        steps: Number of inference steps.
        text_cfg_scale: Guidance scale for the text instruction.
        image_cfg_scale: Guidance scale for the input image.
        seed: Seed passed to ``torch.manual_seed`` for reproducibility.

    Returns:
        The edited (or merely resized) PIL image.
    """
    # Resize image to fit model requirements
    width, height = input_image.size
    factor = 512 / max(width, height)
    # Round the shorter side up to a multiple of 64 and derive the final scale from it.
    factor = math.ceil(min(width, height) * factor / 64) * 64 / min(width, height)
    width = int((width * factor) // 64) * 64
    height = int((height * factor) // 64) * 64
    input_image = ImageOps.fit(input_image, (width, height), method=Image.Resampling.LANCZOS)

    if not instruction:
        return input_image

    # Set the random seed for reproducibility (seeds torch's global generator)
    generator = torch.manual_seed(seed)

    # Generate the edited image
    edited_image = pipe_edit(
        instruction,
        image=input_image,
        guidance_scale=text_cfg_scale,
        image_guidance_scale=image_cfg_scale,
        num_inference_steps=steps,
        generator=generator,
    ).images[0]
    return edited_image


# This module later redefines the global name `process_image` with a different
# (image, text_query) signature. Bind the editing implementation to a private
# name now so `edit_image` keeps calling the right function at request time.
_instruct_pix2pix_process_image = process_image


async def edit_image(
    file: UploadFile = File(...),
    instruction: str = Form(...),
    steps: int = Form(default=DEFAULT_STEPS),
    text_cfg_scale: float = Form(default=DEFAULT_TEXT_CFG),
    image_cfg_scale: float = Form(default=DEFAULT_IMAGE_CFG),
    seed: int = Form(default=DEFAULT_SEED)
):
    """
    Endpoint to edit an image based on a text instruction.

    Reads the uploaded file, converts it to RGB, runs the InstructPix2Pix
    edit, and streams the result back as a PNG.
    """
    # Read and convert the uploaded image
    image_data = await file.read()
    input_image = Image.open(io.BytesIO(image_data)).convert("RGB")

    # Process the image via the privately bound editing function (see note above)
    edited_image = _instruct_pix2pix_process_image(
        input_image, instruction, steps, text_cfg_scale, image_cfg_scale, seed
    )

    # Convert the edited image to bytes
    img_byte_arr = io.BytesIO()
    edited_image.save(img_byte_arr, format="PNG")
    img_byte_arr.seek(0)

    # Return the image as a streaming response
    return StreamingResponse(img_byte_arr, media_type="image/png")
# New endpoint for inpainting
async def inpaint_image(
    file: UploadFile = File(...),
    prompt: str = Form(...),
    mask_coordinates: str = Form(...),  # Format: "x1,y1,x2,y2" (top-left and bottom-right of the rectangle to inpaint)
    steps: int = Form(default=DEFAULT_STEPS),
    guidance_scale: float = Form(default=7.5),
    seed: int = Form(default=DEFAULT_SEED)
):
    """
    Endpoint to perform inpainting on an image.

    - file: The input image to inpaint.
    - prompt: The text prompt describing what to generate in the inpainted area.
    - mask_coordinates: Coordinates of the rectangular area to inpaint (format: "x1,y1,x2,y2"),
      expressed in pixels of the uploaded image. Corners may be given in either order.
    - steps: Number of inference steps.
    - guidance_scale: Guidance scale for the inpainting process.
    - seed: Random seed for reproducibility.

    Returns a PNG ``StreamingResponse`` on success. Errors keep the JSON body
    ``{"error": <message>}`` but are sent with status 400 (malformed
    coordinates) or 500 (any other failure) rather than 200.
    """
    # Local import keeps this block independent of import order in the module.
    from fastapi.responses import JSONResponse

    # Validate the cheap textual input before decoding or resizing the image.
    try:
        x1, y1, x2, y2 = map(int, mask_coordinates.split(","))
    except ValueError:
        return JSONResponse(
            status_code=400,
            content={"error": "Invalid mask coordinates format. Use 'x1,y1,x2,y2'."},
        )

    try:
        # Read and convert the uploaded image
        image_data = await file.read()
        input_image = Image.open(io.BytesIO(image_data)).convert("RGB")

        # Resize image to fit model requirements (must be divisible by 8 for inpainting)
        width, height = input_image.size
        factor = 512 / max(width, height)
        factor = math.ceil(min(width, height) * factor / 8) * 8 / min(width, height)
        width = int((width * factor) // 8) * 8
        height = int((height * factor) // 8) * 8
        input_image = ImageOps.fit(input_image, (width, height), method=Image.Resampling.LANCZOS)

        # Scale the rectangle into resized-image coordinates and order the
        # corners so a reversed rectangle is still drawn instead of failing.
        left, right = sorted((int(x1 * factor), int(x2 * factor)))
        top, bottom = sorted((int(y1 * factor), int(y2 * factor)))

        # Create a mask for inpainting: black (0) = keep, white (255) = area to inpaint
        mask = Image.new("L", (width, height), 0)
        ImageDraw.Draw(mask).rectangle([left, top, right, bottom], fill=255)

        # Set the random seed for reproducibility
        generator = torch.manual_seed(seed)

        # Perform inpainting
        inpainted_image = pipe_inpaint(
            prompt=prompt,
            image=input_image,
            mask_image=mask,
            num_inference_steps=steps,
            guidance_scale=guidance_scale,
            generator=generator,
        ).images[0]

        # Convert the inpainted image to bytes
        img_byte_arr = io.BytesIO()
        inpainted_image.save(img_byte_arr, format="PNG")
        img_byte_arr.seek(0)

        # Return the image as a streaming response
        return StreamingResponse(img_byte_arr, media_type="image/png")
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
async def root():
    """
    Root endpoint: returns a static message describing the service.
    """
    banner = "InstructPix2Pix API is running. Use POST /edit-image/ or /inpaint/ to edit images."
    return {"message": banner}
# Helper functions
def prepare_guided_image(original_image: Image.Image, reference_image: Image.Image, mask_image: Image.Image) -> Image.Image:
    """Alpha-blend ``reference_image`` over ``original_image`` using ``mask_image``.

    Each output pixel is ``original * (1 - m) + reference * m`` where ``m`` is
    the mask value scaled to ``[0, 1]`` (0 keeps the original, 255 takes the
    reference).

    Args:
        original_image: Base image.
        reference_image: Image blended in where the mask is bright; must have
            the same size as ``original_image``.
        mask_image: Blend weights; converted to single-channel "L" here so a
            multi-channel mask no longer breaks the broadcast.

    Returns:
        The blended image as 8-bit data.

    Raises:
        ValueError: if the three images do not share the same dimensions.
    """
    if not (original_image.size == reference_image.size == mask_image.size):
        raise ValueError("original, reference and mask images must have identical dimensions")

    original_array = np.asarray(original_image, dtype=np.float64)
    reference_array = np.asarray(reference_image, dtype=np.float64)

    # Per-pixel weights with a trailing channel axis so they broadcast over RGB.
    weights = np.asarray(mask_image.convert("L"), dtype=np.float64)[:, :, np.newaxis] / 255.0

    blended_array = original_array * (1.0 - weights) + reference_array * weights

    # Round to nearest instead of truncating: floating-point error can leave a
    # value like 254.999..., which a plain uint8 cast would darken to 254.
    blended_array = np.clip(np.rint(blended_array), 0, 255)
    return Image.fromarray(blended_array.astype(np.uint8))
def soften_mask(mask_image: Image, softness: int = 5) -> Image:
    """Return ``mask_image`` blurred with a Gaussian filter of radius ``softness``."""
    from PIL import ImageFilter

    blur = ImageFilter.GaussianBlur(radius=softness)
    return mask_image.filter(blur)
def generate_rectangular_mask(image_size: tuple, x1: int = 100, y1: int = 100, x2: int = 200, y2: int = 200) -> Image:
    """Create an "L" mask of ``image_size`` that is 255 inside the given rectangle and 0 elsewhere.

    ``(x1, y1)`` and ``(x2, y2)`` are the rectangle's two corners as passed to
    ``ImageDraw.rectangle``.
    """
    canvas = Image.new("L", image_size, 0)
    ImageDraw.Draw(canvas).rectangle([x1, y1, x2, y2], fill=255)
    return canvas
def segment_tank(tank_image: Image) -> tuple[Image, Image]:
    """Separate the subject from a bright, low-saturation background by HSV thresholding.

    Pixels with saturation <= 50 and value >= 180 (any hue) are classed as
    background ("snow"); everything else is kept. The keep-mask is cleaned
    with one 5x5 erosion followed by one 5x5 dilation.

    Returns:
        ``(segmented_rgba, mask)``: an RGBA image whose RGB is zeroed outside
        the mask and whose alpha is the mask, plus the mask as an "L" image.
    """
    rgb = np.array(tank_image.convert("RGB"))

    # OpenCV path: RGB -> BGR -> HSV, then threshold the background range.
    bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    hsv = cv2.cvtColor(bgr, cv2.COLOR_BGR2HSV)
    snow_mask = cv2.inRange(hsv, np.array([0, 0, 180]), np.array([180, 50, 255]))

    # Keep everything that is not background, then erode + dilate once each.
    tank_mask = cv2.bitwise_not(snow_mask)
    kernel = np.ones((5, 5), np.uint8)
    for morph_op in (cv2.erode, cv2.dilate):
        tank_mask = morph_op(tank_mask, kernel, iterations=1)
    tank_mask_image = Image.fromarray(tank_mask, mode="L")

    # Zero out RGB outside the mask.
    keep = (tank_mask / 255.0)[:, :, np.newaxis]
    segmented_rgb = (rgb * keep).astype(np.uint8)

    # Assemble RGBA with the mask as the alpha channel.
    rgba = np.zeros((tank_image.height, tank_image.width, 4), dtype=np.uint8)
    rgba[:, :, :3] = segmented_rgb
    rgba[:, :, 3] = tank_mask
    return Image.fromarray(rgba, mode="RGBA"), tank_mask_image
async def apply_camouflage_to_tank(tank_image: Image.Image) -> Image.Image:
    """Repaint the segmented subject of ``tank_image`` with a camouflage pattern.

    Segments the subject with ``segment_tank``, runs ``pipe_runway`` over the
    segmented image restricted to the subject mask, and returns an RGBA image
    whose alpha channel is that mask.

    Side effects: writes ``segmented_tank.png``, ``tank_mask.png`` and
    ``camouflaged_tank.png`` to the current working directory on every call
    (NOTE(review): these look like debugging artefacts, and concurrent
    requests would overwrite each other's files).

    Args:
        tank_image: Image containing the subject to camouflage.

    Returns:
        RGBA image at the same size as ``tank_image``.
    """
    segmented_tank, tank_mask = segment_tank(tank_image)
    segmented_tank.save("segmented_tank.png")
    tank_mask.save("tank_mask.png")

    camouflaged_tank = pipe_runway(
        prompt="Apply a grassy camouflage pattern with shades of green and brown to the tank, preserving its structure.",
        image=segmented_tank.convert("RGB"),
        mask_image=tank_mask,
        strength=0.5,
        guidance_scale=8.0,
        num_inference_steps=50,
        negative_prompt="snow, ice, rock, stone, boat, unrelated objects"
    ).images[0]

    # No height/width is passed, so the pipeline may return its own working
    # resolution rather than the input size. Bring the result back to the
    # mask's size, otherwise the alpha assignment below fails with a shape
    # mismatch for any input that is not already at that resolution.
    if camouflaged_tank.size != tank_mask.size:
        camouflaged_tank = camouflaged_tank.resize(tank_mask.size, Image.Resampling.LANCZOS)

    # Re-attach the subject mask as the alpha channel.
    camouflaged_tank_rgba = np.zeros((camouflaged_tank.height, camouflaged_tank.width, 4), dtype=np.uint8)
    camouflaged_tank_rgba[:, :, :3] = np.array(camouflaged_tank.convert("RGB"))
    camouflaged_tank_rgba[:, :, 3] = np.array(tank_mask)
    camouflaged_tank_image = Image.fromarray(camouflaged_tank_rgba, mode="RGBA")
    camouflaged_tank_image.save("camouflaged_tank.png")
    return camouflaged_tank_image
def fit_image_to_mask(original_image: Image.Image, reference_image: Image.Image, mask_x1: int, mask_y1: int, mask_x2: int, mask_y2: int) -> tuple:
    """Paste ``reference_image`` into a rectangle of ``original_image``, preserving aspect ratio.

    The reference is scaled to the largest size that fits inside the
    rectangle ``(mask_x1, mask_y1)-(mask_x2, mask_y2)`` and centred in it.

    Args:
        original_image: Background image; not modified (an RGB copy is used).
        reference_image: Image to insert. If it has an alpha channel
            ("RGBA"/"LA") that alpha is used as the paste mask; otherwise it
            is pasted fully opaque.
        mask_x1, mask_y1, mask_x2, mask_y2: Target rectangle corners.

    Returns:
        ``(guided_image, mask_image)``: the composited RGB image and an "L"
        mask that is white over the whole target rectangle.

    Raises:
        ValueError: if the rectangle has non-positive width or height.
    """
    mask_width = mask_x2 - mask_x1
    mask_height = mask_y2 - mask_y1
    if mask_width <= 0 or mask_height <= 0:
        raise ValueError("Mask dimensions must be positive")

    ref_width, ref_height = reference_image.size
    aspect_ratio = ref_width / ref_height

    # Fit by height when the rectangle is relatively wider than the reference,
    # otherwise fit by width.
    if mask_width / mask_height > aspect_ratio:
        new_height = mask_height
        new_width = int(new_height * aspect_ratio)
    else:
        new_width = mask_width
        new_height = int(new_width / aspect_ratio)

    # int() truncation can give 0 for extreme aspect ratios, which resize() rejects.
    new_width = max(1, new_width)
    new_height = max(1, new_height)

    reference_image_resized = reference_image.resize((new_width, new_height), Image.Resampling.LANCZOS)

    # convert() already returns a new image, so the original is left untouched.
    guided_image = original_image.convert("RGB")
    paste_x = mask_x1 + (mask_width - new_width) // 2
    paste_y = mask_y1 + (mask_height - new_height) // 2

    # Only use the reference as its own transparency mask when it actually
    # carries alpha; an RGB image is not a valid paste mask and would raise.
    paste_mask = reference_image_resized if reference_image_resized.mode in ("RGBA", "LA") else None
    guided_image.paste(reference_image_resized, (paste_x, paste_y), paste_mask)

    mask_image = generate_rectangular_mask(original_image.size, mask_x1, mask_y1, mask_x2, mask_y2)
    return guided_image, mask_image
# Endpoints
async def inpaint_image(
    image: UploadFile = File(...),
    mask: UploadFile = File(...),
    prompt: str = "Fill the masked area with appropriate content."
):
    """Inpaint ``image`` where ``mask`` is white, guided by ``prompt``.

    Args:
        image: Uploaded image; converted to RGB.
        mask: Uploaded mask; converted to "L". Must have the same pixel
            dimensions as ``image``.
        prompt: Text describing what to generate in the masked area.

    Returns:
        A PNG ``StreamingResponse`` served as ``inpainted_image.png``.

    Raises:
        HTTPException: 400 when image and mask sizes differ; 500 for any
            other failure during decoding or inference.
    """
    try:
        image_bytes = await image.read()
        mask_bytes = await mask.read()
        original_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        mask_image = Image.open(io.BytesIO(mask_bytes)).convert("L")

        if original_image.size != mask_image.size:
            raise HTTPException(status_code=400, detail="Image and mask dimensions must match.")

        result = pipe_runway(prompt=prompt, image=original_image, mask_image=mask_image).images[0]

        result_bytes = io.BytesIO()
        result.save(result_bytes, format="PNG")
        result_bytes.seek(0)
        return StreamingResponse(
            result_bytes,
            media_type="image/png",
            headers={"Content-Disposition": "attachment; filename=inpainted_image.png"}
        )
    except HTTPException:
        # HTTPException is itself an Exception; without this clause the 400
        # raised above would be caught below and reported as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error during inpainting: {e}") from e
async def inpaint_with_reference(
    image: UploadFile = File(...),
    reference_image: UploadFile = File(...),
    prompt: str = "Integrate the reference content naturally into the masked area, matching style and lighting.",
    mask_x1: int = 100,
    mask_y1: int = 100,
    mask_x2: int = 200,
    mask_y2: int = 200
):
    """Inpaint a rectangular region of ``image`` using ``reference_image`` as a visual guide.

    The reference is resized to the base image's size if needed, blended into
    the base through a blurred rectangular mask, and the blend is passed to
    ``pipe_runway`` with the same softened mask.

    Returns a PNG ``StreamingResponse`` served as ``natural_inpaint_image.png``.
    Any failure is reported as an HTTP 500.
    """
    try:
        # Decode both uploads (base first, then reference) as RGB.
        base_bytes = await image.read()
        guide_bytes = await reference_image.read()
        base_picture = Image.open(io.BytesIO(base_bytes)).convert("RGB")
        guide_picture = Image.open(io.BytesIO(guide_bytes)).convert("RGB")

        # The blend below needs both pictures at the same size.
        if base_picture.size != guide_picture.size:
            guide_picture = guide_picture.resize(base_picture.size, Image.Resampling.LANCZOS)

        # Rectangle mask with feathered edges.
        hard_mask = generate_rectangular_mask(base_picture.size, mask_x1, mask_y1, mask_x2, mask_y2)
        feathered_mask = soften_mask(hard_mask, softness=5)

        blended_input = prepare_guided_image(base_picture, guide_picture, feathered_mask)

        output = pipe_runway(
            prompt=prompt,
            image=blended_input,
            mask_image=feathered_mask,
            strength=0.75,
            guidance_scale=7.5
        ).images[0]

        png_stream = io.BytesIO()
        output.save(png_stream, format="PNG")
        png_stream.seek(0)

        download_headers = {"Content-Disposition": "attachment; filename=natural_inpaint_image.png"}
        return StreamingResponse(png_stream, media_type="image/png", headers=download_headers)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error during natural inpainting: {e}")
async def fit_image_to_mask_endpoint(
    image: UploadFile = File(...),
    reference_image: UploadFile = File(...),
    mask_x1: int = 200,
    mask_y1: int = 200,
    mask_x2: int = 500,
    mask_y2: int = 500
):
    """Camouflage the reference subject, fit it into a rectangle of ``image``, and blend it in.

    Steps: decode both uploads, run ``apply_camouflage_to_tank`` on the
    reference, place the result into the target rectangle with
    ``fit_image_to_mask``, then run a low-strength ``pipe_runway`` pass over
    a lightly blurred mask. Also writes ``guided_image_before_blending.png``
    to the working directory.

    Returns a PNG ``StreamingResponse`` served as ``fitted_image.png``.
    ``ValueError`` is reported as HTTP 400, anything else as HTTP 500.
    """
    try:
        # Decode uploads (base first, then reference) as RGB.
        base_bytes = await image.read()
        subject_bytes = await reference_image.read()
        base_picture = Image.open(io.BytesIO(base_bytes)).convert("RGB")
        subject_picture = Image.open(io.BytesIO(subject_bytes)).convert("RGB")

        camouflaged_subject = await apply_camouflage_to_tank(subject_picture)

        composed, region_mask = fit_image_to_mask(
            base_picture, camouflaged_subject, mask_x1, mask_y1, mask_x2, mask_y2
        )
        composed.save("guided_image_before_blending.png")

        feathered_mask = soften_mask(region_mask, softness=2)

        blend_prompt = "Blend the camouflaged tank into the grassy field with trees, ensuring a non-snowy environment, matching the style, lighting, and surroundings."
        output = pipe_runway(
            prompt=blend_prompt,
            image=composed,
            mask_image=feathered_mask,
            strength=0.2,
            guidance_scale=7.5,
            num_inference_steps=50,
            negative_prompt="snow, ice, rock, stone, boat, unrelated objects"
        ).images[0]

        png_stream = io.BytesIO()
        output.save(png_stream, format="PNG")
        png_stream.seek(0)

        download_headers = {"Content-Disposition": "attachment; filename=fitted_image.png"}
        return StreamingResponse(png_stream, media_type="image/png", headers=download_headers)
    except ValueError as ve:
        raise HTTPException(status_code=400, detail=f"ValueError in processing: {str(ve)}")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error during fitting and inpainting: {str(e)}")
from fastapi import FastAPI, File, UploadFile, HTTPException | |
from fastapi.responses import StreamingResponse, JSONResponse | |
import torch | |
from PIL import Image, ImageDraw, ImageFont | |
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection | |
import io | |
# Zero-shot object detection setup (Grounding DINO).

# Default text query (can be overridden via endpoint parameters)
DEFAULT_TEXT_QUERY = "a tank."  # Adjust based on your use case

model_id_segment = "IDEA-Research/grounding-dino-base"

# Prefer the GPU when one is available.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Load processor and model once at import time, then place the model on the device.
processor_segment = AutoProcessor.from_pretrained(model_id_segment)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id_segment)
model = model.to(device)
def process_image(
    image: Image.Image,
    text_query: str = DEFAULT_TEXT_QUERY,
    threshold: float = 0.4,
    text_threshold: float = 0.3,
):
    """Run Grounding DINO on ``image`` and return the post-processed detections.

    NOTE: this definition rebinds the module-level name ``process_image``.

    Args:
        image: Image to analyse.
        text_query: Phrase(s) describing the objects to find. It is
            lower-cased and given a trailing "." if missing
            (NOTE(review): this is the form Grounding DINO queries are
            documented to need, confirm against the model card).
        threshold: Minimum box confidence to keep a detection (previously a
            hard-coded 0.4).
        text_threshold: Minimum text-match confidence (previously a
            hard-coded 0.3).

    Returns:
        The list produced by
        ``processor_segment.post_process_grounded_object_detection``, with
        one entry per input image.
    """
    # Normalise user-supplied queries; an empty query is passed through as-is.
    query = text_query.strip().lower()
    if query and not query.endswith("."):
        query += "."

    # Prepare inputs for the model
    inputs = processor_segment(images=image, text=query, return_tensors="pt").to(device)

    # Perform inference without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)

    # Post-process results; PIL's size is (width, height), so reversing it
    # yields the (height, width) pair passed as the target size.
    results = processor_segment.post_process_grounded_object_detection(
        outputs,
        inputs.input_ids,
        threshold=threshold,
        text_threshold=text_threshold,
        target_sizes=[image.size[::-1]],
    )
    return results
def draw_detections(image: Image.Image, results: list) -> Image.Image:
    """Draw bounding boxes and labels on a copy of ``image``.

    Args:
        image: Source image; it is not modified.
        results: Detection results, each a mapping with ``"boxes"``,
            ``"labels"`` and ``"scores"`` sequences of equal length.

    Returns:
        A new image with one outlined rectangle and one "label score"
        caption per detection.
    """
    output_image = image.copy()
    draw = ImageDraw.Draw(output_image)

    # Try to load a TrueType font; fall back to PIL's built-in font only when
    # the font file cannot be opened, rather than swallowing every exception.
    try:
        font = ImageFont.truetype("arial.ttf", 20)
    except OSError:
        font = ImageFont.load_default()

    # Colors for different objects; labels not listed here are drawn in green
    colors = {"a tank": "red"}  # Add more as needed, e.g., {"a cat": "red", "a remote control": "blue"}

    # Draw bounding boxes and labels
    for detection in results:
        boxes = detection["boxes"]
        labels = detection["labels"]
        scores = detection["scores"]
        for box, label, score in zip(boxes, labels, scores):
            x_min, y_min, x_max, y_max = box.tolist()
            color = colors.get(label, "green")

            # Draw rectangle
            draw.rectangle(
                [(x_min, y_min), (x_max, y_max)],
                outline=color,
                width=2
            )

            # Measure the caption
            label_text = f"{label} {score:.2f}"
            bbox = draw.textbbox((x_min, y_min - 20), label_text, font=font)
            text_width = bbox[2] - bbox[0]
            text_height = bbox[3] - bbox[1]

            # Place the caption just above the box, clamped to the top edge so
            # captions for boxes near the top stay on the canvas.
            caption_top = max(0, y_min - text_height - 5)
            caption_bottom = caption_top + text_height + 5

            # Draw background rectangle for text
            draw.rectangle(
                [(x_min, caption_top), (x_min + text_width, caption_bottom)],
                fill=color
            )

            # Draw text
            draw.text(
                (x_min, caption_top),
                label_text,
                fill="white",
                font=font
            )
    return output_image
async def detect_image(
    file: UploadFile = File(..., description="Image file to process"),
    text_query: str = DEFAULT_TEXT_QUERY
):
    """
    Endpoint to detect objects in an image and return the annotated image.

    Args:
        file: Uploaded image file.
        text_query: Text query for objects to detect (e.g., "a tank.").

    Returns:
        StreamingResponse with the annotated PNG, served as
        ``detected_objects.png``. Any failure is reported as HTTP 500.
    """
    try:
        # Decode the upload as RGB.
        raw_bytes = await file.read()
        picture = Image.open(io.BytesIO(raw_bytes)).convert("RGB")

        # Detect, then draw the detections on a copy of the picture.
        found = process_image(picture, text_query)
        annotated = draw_detections(picture, found)

        # Serialise to PNG in memory and rewind for streaming.
        png_stream = io.BytesIO()
        annotated.save(png_stream, format="PNG")
        png_stream.seek(0)

        download_headers = {"Content-Disposition": "attachment; filename=detected_objects.png"}
        return StreamingResponse(png_stream, media_type="image/png", headers=download_headers)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing image: {str(e)}")
async def detect_json(
    file: UploadFile = File(..., description="Image file to process"),
    text_query: str = DEFAULT_TEXT_QUERY
):
    """
    Endpoint to detect objects in an image and return bounding box information as JSON.

    Args:
        file: Uploaded image file.
        text_query: Text query for objects to detect (e.g., "a tank.").

    Returns:
        JSONResponse of the form ``{"detections": [...]}`` where each entry
        has ``label``, ``score`` and a ``box`` with ``x_min``/``y_min``/
        ``x_max``/``y_max``. Any failure is reported as HTTP 500.
    """

    def _to_record(box, label, score):
        # One JSON-serialisable record per detection.
        x_min, y_min, x_max, y_max = box.tolist()
        return {
            "label": label,
            "score": float(score),  # Convert tensor to float
            "box": {
                "x_min": x_min,
                "y_min": y_min,
                "x_max": x_max,
                "y_max": y_max
            }
        }

    try:
        # Decode the upload as RGB.
        raw_bytes = await file.read()
        picture = Image.open(io.BytesIO(raw_bytes)).convert("RGB")

        found = process_image(picture, text_query)

        # Flatten every result entry into a single list of records.
        detections = [
            _to_record(box, label, score)
            for entry in found
            for box, label, score in zip(entry["boxes"], entry["labels"], entry["scores"])
        ]
        return JSONResponse(content={"detections": detections})
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing image: {str(e)}")
if __name__ == "__main__":
    # When executed as a script, serve the app on all interfaces at port 7860.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=7860)