Spaces:

petergpt
/

bgremoval

Runtime error

File size: 5,177 Bytes

c9473c9
b7a75e4
de63122
 
623e1bf
 
 
 
de63122
 
 
 
4d7e87d
b7a75e4
 
 
de63122
 
 
 
 
 
 
 
 
b7a75e4
 
 
 
 
 
 
4d7e87d
 
 
 
 
de63122
4d7e87d
623e1bf
de63122
623e1bf
 
 
 
de63122
 
623e1bf
 
4d7e87d
c9473c9
 
f397a20
c9473c9
 
b7a75e4
de63122
4d7e87d
 
de63122
 
 
 
 
 
 
4d7e87d
 
 
 
b7a75e4
f397a20
c9473c9
f397a20
c9473c9
 
 
 
 
 
b7a75e4
4d7e87d
 
 
 
de63122
c333b0b
 
4d7e87d
 
de63122
 
4d7e87d
 
 
 
de63122
4d7e87d
de63122
4d7e87d
de63122
4d7e87d
 
 
de63122
4d7e87d
 
c333b0b
f397a20
36a76ae
c333b0b
de63122
36a76ae
4d7e87d
36a76ae
 
 
 
de63122
36a76ae
de63122
4d7e87d
 
36a76ae
4d7e87d
36a76ae
 
1018e38
36a76ae
de63122
36a76ae
 
 
4d7e87d
36a76ae
 
 
 
4d7e87d
 
 
36a76ae
 
 
623e1bf
 
c9473c9
e041428
c9473c9
de63122
 
623e1bf
 
b7a75e4

import time
import gc
import torch

from PIL import Image
from torchvision import transforms
import gradio as gr

from transformers import AutoConfig, AutoModelForImageSegmentation

# 1) Wrap config loading in a helper that monkey-patches a dummy get_text_config().

def load_model():
    """Load the BiRefNet_lite segmentation model, ready for inference.

    The remote config is patched before the weights are loaded:
    ``is_encoder_decoder`` is forced to ``False`` and a stub
    ``get_text_config`` is attached to the config instance. The stub returns
    an object exposing ``tie_word_embeddings = False``; it is there so the
    weight-tying step run by the Hugging Face loading code does not fail.

    Returns:
        tuple: ``(model, device)`` where ``model`` is in eval mode and
        ``device`` is ``'cuda'`` when CUDA is available, otherwise ``'cpu'``.
    """
    repo_id = "zhengpeng7/BiRefNet_lite"

    def _stub_text_config(decoder=True):
        # Minimal stand-in: only the attribute below is provided.
        class DummyTextConfig:
            tie_word_embeddings = False

        return DummyTextConfig()

    cfg = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
    cfg.is_encoder_decoder = False
    # Instance-level patch: the stub is a plain function, not a bound method.
    cfg.get_text_config = _stub_text_config

    net = AutoModelForImageSegmentation.from_pretrained(
        repo_id,
        config=cfg,
        trust_remote_code=True,
    )

    if torch.cuda.is_available():
        target = 'cuda'
    else:
        target = 'cpu'

    net.to(target)
    net.eval()
    return net, target

# 2) Load the model once at import time. `birefnet` and `device` are module
#    globals shared by the request handlers defined further down.
birefnet, device = load_model()

# 3) Preprocessing pipeline: resize every image to a fixed 1024x1024 input,
#    convert it to a tensor, then normalize each channel with the mean/std
#    values listed below.
image_size = (1024, 1024)
transform_image = transforms.Compose(
    [
        transforms.Resize(size=image_size),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

def run_inference(images, model, device):
    """Cut the foreground out of every image in a single batched forward pass.

    Each image is preprocessed with ``transform_image``, the batch is moved to
    ``device`` and fed to ``model``. The sigmoid of the model's last output is
    used as a soft mask: it is resized back to the image's original size and
    used to paste the image onto a fully transparent RGBA canvas.

    Args:
        images: Iterable of PIL images (``extract_objects`` passes RGB images).
        model: Callable segmentation model; ``model(batch)[-1]`` is taken as
            the mask logits.
        device: Device the input batch is moved to.

    Returns:
        list: One RGBA PIL image per input, in input order. An empty input
        yields an empty list.

    Raises:
        torch.OutOfMemoryError: Re-raised after the input batch has been
            dropped and the CUDA cache emptied, so callers can retry with a
            smaller batch.
    """
    # Materialize once so the input can be iterated more than once.
    images = list(images)

    # torch.stack() rejects an empty sequence, so an empty batch is answered
    # directly instead of crashing.
    if not images:
        return []

    original_sizes = [img.size for img in images]
    input_tensor = torch.stack([transform_image(img) for img in images]).to(device)

    try:
        with torch.no_grad():
            output = model(input_tensor)
            # The last output element is used as the mask prediction.
            preds = output[-1].sigmoid().cpu()
    except torch.OutOfMemoryError:
        del input_tensor
        torch.cuda.empty_cache()
        raise

    # Post-process: one converter instance is enough for the whole batch.
    to_pil = transforms.ToPILImage()
    results = []
    for img, size, pred in zip(images, original_sizes, preds):
        # assumes each prediction squeezes to a single-channel map — TODO confirm
        mask = to_pil(pred.squeeze()).resize(size)
        cutout = Image.new("RGBA", size, (0, 0, 0, 0))
        cutout.paste(img, mask=mask)
        results.append(cutout)

    # Cleanup: drop the tensors before asking the allocator to release memory.
    del input_tensor, preds
    gc.collect()
    torch.cuda.empty_cache()

    return results

def binary_search_max(images):
    """Binary-search the largest prefix of ``images`` that runs without OOM.

    Every probe reloads the model into the module globals ``birefnet`` and
    ``device`` and then runs ``run_inference`` on ``images[:mid]``. A probe
    that raises ``torch.OutOfMemoryError`` shrinks the upper bound; a probe
    that succeeds is remembered and raises the lower bound.

    Args:
        images: Sequence of PIL images to probe with.

    Returns:
        tuple: ``(results, count)`` for the largest successful prefix, or
        ``(None, 0)`` when no probe succeeded (including empty input).
    """
    global birefnet, device

    lo, hi = 1, len(images)
    best, best_count = None, 0

    while lo <= hi:
        mid = (lo + hi) // 2
        try:
            # Reload the model before each probe (intended to avoid leftover
            # memory fragmentation).
            # NOTE(review): the previous model is still referenced by the
            # global until load_model() returns, so two copies may coexist
            # briefly — confirm this is acceptable on the target GPU.
            birefnet, device = load_model()
            outputs = run_inference(images[:mid], birefnet, device)
        except torch.OutOfMemoryError:
            hi = mid - 1
        else:
            best, best_count = outputs, mid
            lo = mid + 1

    return best, best_count

def extract_objects(filepaths):
    """Gradio handler: remove the background from every uploaded image.

    All images are first processed as one batch. If that attempt runs out of
    memory, ``binary_search_max`` is used to find the largest prefix of the
    upload that can be processed, and those results are returned instead.

    Args:
        filepaths: Paths of the uploaded image files. ``None`` or an empty
            list means nothing was uploaded.

    Returns:
        tuple: ``(images, summary)`` where ``images`` is a list of RGBA PIL
        images (possibly empty) and ``summary`` is a human-readable timing and
        status message.
    """
    # A missing or empty upload would otherwise crash (iterating None, or
    # stacking an empty batch), so answer it directly.
    if not filepaths:
        return [], "No images were uploaded, so there was nothing to process."

    images = []
    for path in filepaths:
        # convert() returns an independent image, so the source file can be
        # closed as soon as the conversion is done.
        with Image.open(path) as src:
            images.append(src.convert("RGB"))
    start_time = time.time()

    # First attempt: all images at once
    try:
        results = run_inference(images, birefnet, device)
    except torch.OutOfMemoryError:
        initial_attempt_time = time.time() - start_time
    else:
        total_time = time.time() - start_time
        summary = f"Total request time: {total_time:.2f}s\nProcessed {len(images)} images successfully."
        return results, summary

    # The fallback deliberately runs OUTSIDE the except clause: while a
    # handler is active the exception keeps the failed attempt's stack frames
    # (and the tensors they reference) alive, which would starve the retry.
    best, best_count = binary_search_max(images)
    total_time = time.time() - start_time

    if best is None:
        # Not even 1 image can be processed
        summary = (
            f"Initial attempt OOM after {initial_attempt_time:.2f}s.\n"
            f"Could not process even a single image.\n"
            f"Total time including fallback attempts: {total_time:.2f}s."
        )
        return [], summary

    summary = (
        f"Initial attempt OOM after {initial_attempt_time:.2f}s.\n"
        f"Found that {best_count} images can be processed without OOM.\n"
        f"Total time including fallback attempts: {total_time:.2f}s.\n"
        f"Next time, try using up to {best_count} images."
    )
    return best, summary

# Gradio UI: a multi-file upload feeds extract_objects, whose two return
# values are shown in a gallery (images) and a textbox (summary message).
_upload_box = gr.Files(
    label="Upload Multiple Images",
    type="filepath",
    file_count="multiple",
)
_result_views = [
    gr.Gallery(label="Processed Images"),
    gr.Textbox(label="Timing Info"),
]

iface = gr.Interface(
    title="BiRefNet Bulk Background Removal (with fallback)",
    description="Upload multiple images. If OOM occurs, we fallback to smaller batches.",
    fn=extract_objects,
    inputs=_upload_box,
    outputs=_result_views,
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()