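# Gradio app: generate a detailed, factual caption for an uploaded image with
# THUDM/cogvlm-chat-hf (CogVLM), intended for use as an image-description prompt.
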
from typing import Any
import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, LlamaTokenizer

# Constants
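# Greedy decoding (do_sample=False), capped at 256 newly generated tokens.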
DEFAULT_PARAMS = {
    "do_sample": False,
    "max_new_tokens": 256,
}
DEFAULT_QUERY = (
    "Provide a factual description of this image in up to two paragraphs. "
    "Include details on objects, background, scenery, interactions, gestures, poses, and any visible text content. "
    "Specify the number of repeated objects. "
    "Describe the dominant colors, color contrasts, textures, and materials. "
    "Mention the composition, including the arrangement of elements and focus points. "
    "Note the camera angle or perspective, and provide any identifiable contextual information. "
    "Include details on the style, lighting, and shadows. "
    "Avoid subjective interpretations or speculation."
)

DTYPE = torch.bfloat16
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
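# CogVLM ships its own modeling code (hence trust_remote_code=True) and uses the
# Vicuna-7B v1.5 tokenizer for its language backbone.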
tokenizer = LlamaTokenizer.from_pretrained(
    pretrained_model_name_or_path="lmsys/vicuna-7b-v1.5",
)
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="THUDM/cogvlm-chat-hf",
    torch_dtype=DTYPE,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)

model = model.to(device=DEVICE)

# spaces.GPU requests a ZeroGPU device only for the duration of each call;
# torch.no_grad() disables gradient tracking since this is inference-only.
@spaces.GPU
@torch.no_grad()
def generate_caption(
    image: Image.Image,
    params: dict[str, Any] = DEFAULT_PARAMS,
) -> str:
    # Debugging: Check image size and format
    print(f"Uploaded image format: {image.format}, size: {image.size}")
    
    # Ensure the image is in RGB mode; the vision encoder expects three channels
    if image.mode != "RGB":
        image = image.convert("RGB")
        print(f"Image converted to RGB mode: {image.mode}")

    # build_conversation_input_ids is provided by CogVLM's remote code; it tokenizes
    # the query and preprocesses the image into the tensors the model expects.
    inputs = model.build_conversation_input_ids(
        tokenizer=tokenizer,
        query=DEFAULT_QUERY,
        history=[],
        images=[image],
    )
    
    # Debugging: Check tensor shapes
    print(f"Input IDs shape: {inputs['input_ids'].shape}")
    print(f"Images tensor shape: {inputs['images'][0].shape}")

    # Add a batch dimension and move tensors to the target device; the model expects
    # `images` as a nested list with one list of image tensors per batch item.
    inputs = {
        "input_ids": inputs["input_ids"].unsqueeze(0).to(device=DEVICE),
        "token_type_ids": inputs["token_type_ids"].unsqueeze(0).to(device=DEVICE),
        "attention_mask": inputs["attention_mask"].unsqueeze(0).to(device=DEVICE),
        "images": [[inputs["images"][0].to(device=DEVICE, dtype=DTYPE)]],
    }

    outputs = model.generate(**inputs, **params)
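    # generate() returns the prompt tokens followed by the newly generated ones;
    # slice off the prompt so only the caption itself is decoded.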
    outputs = outputs[:, inputs["input_ids"].shape[1] :]
    result = tokenizer.decode(outputs[0])

    # Strip the boilerplate prefix and the end-of-sequence token, then capitalize
    # only the first character (str.capitalize() would lowercase the rest).
    result = result.replace("This image showcases", "").strip().removesuffix("</s>").strip()
    result = result[:1].upper() + result[1:]
    return result

# CSS for design enhancements: a card-style container with fixed-height image input and caption boxes
css = """
  #container {
    background-color: #f9f9f9;
    padding: 20px;
    border-radius: 15px;
    border: 2px solid #333; /* Darker outline */
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2); /* Enhanced shadow */
    max-width: 450px;
    margin: auto;
  }
  #input_image {
    margin-top: 15px;
    border: 2px solid #333; /* Darker outline */
    border-radius: 8px;
    height: 180px; /* Fixed height */
    object-fit: contain; /* Ensure image fits within the fixed height */
  }
  #output_caption {
    margin-top: 15px;
    border: 2px solid #333; /* Darker outline */
    border-radius: 8px;
    height: 180px; /* Fixed height */
    overflow-y: auto; /* Scrollable if content exceeds height */
  }
  #run_button {
    background-color: #fff; /* Light button background */
    color: black; /* Dark button text */
    border-radius: 10px;
    padding: 10px;
    cursor: pointer;
    transition: background-color 0.3s ease;
    margin-top: 15px;
  }
  #run_button:hover {
    background-color: #333; /* Darker background on hover */
  }
"""

# Gradio interface with vertical alignment and fixed image input height
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="container"):
        input_image = gr.Image(type="pil", elem_id="input_image")
        run_button = gr.Button(value="Generate Prompt", elem_id="run_button")
        output_caption = gr.Textbox(label="Womener AI", show_copy_button=True, elem_id="output_caption", lines=6)

    run_button.click(
        fn=generate_caption,
        inputs=[input_image],
        outputs=output_caption,
    )

demo.launch(share=False)