import gradio as gr
from transformers import (
    AutoProcessor,
    AutoTokenizer,
    AutoImageProcessor,
    AutoModelForCausalLM,
    BlipForConditionalGeneration,
    Blip2ForConditionalGeneration,
    VisionEncoderDecoderModel,
    BitsAndBytesConfig,
)
import torch
import open_clip
from huggingface_hub import hf_hub_download

# 8-bit quantization config (load_in_8bit is assumed from the "_8_bit" naming used below);
# fp32 CPU offload keeps the layers that are mapped to "cpu" in full precision
quantizer_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)

# Manual device map: keep the transformer blocks on GPU 0 and offload the LM head to the CPU
device_map = {
    "transformer.word_embeddings": 0,
    "transformer.word_embeddings_layernorm": 0,
    "lm_head": "cpu",
    "transformer.h": 0,
    "transformer.ln_f": 0,
}
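# NOTE: these module names follow the generic 8-bit CPU-offload example from the transformers
# docs (BLOOM-style naming); for Blip2 they would likely need to be adapted to its actual
# submodules, or replaced with device_map="auto" so accelerate infers the placement itself.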
# Load the Blip2 model
preprocessor_blip2_8_bit = AutoProcessor.from_pretrained("Salesforce/blip2-opt-6.7b")
model_blip2_8_bit = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-6.7b",
    quantization_config=quantizer_config,
    device_map=device_map,
)
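# NOTE: quantizer_config is assumed to be intended as the quantization_config here (it was
# otherwise unused). 8-bit loading via bitsandbytes generally requires a CUDA GPU; on a
# CPU-only Space the quantization_config argument would have to be dropped.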
# Load the Blip base model
# preprocessor_blip_base = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
# model_blip_base = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
# # Load the Blip large model
# preprocessor_blip_large = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
# model_blip_large = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
# # Load the GIT coco model
# preprocessor_git_large_coco = AutoProcessor.from_pretrained("microsoft/git-large-coco")
# model_git_large_coco = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco")
# # Load the CLIP model
# model_oc_coca, _, transform_oc_coca = open_clip.create_model_and_transforms(
# model_name="coca_ViT-L-14",
# pretrained="mscoco_finetuned_laion2B-s13B-b90k"
# )
device = "cuda" if torch.cuda.is_available() else "cpu"
# Transfer the models to the device
# (model_blip2_8_bit is already placed by device_map and cannot be moved with .to())
# model_blip2_8_bit.to(device)
# model_blip_base.to(device)
# model_blip_large.to(device)
# model_git_large_coco.to(device)
# model_oc_coca.to(device)
def generate_caption(
    preprocessor,
    model,
    image,
    tokenizer=None,
    use_float_16=False,
    max_length=32,
):
    """
    Generate a caption for the given image.

    Parameters
    ----------
    preprocessor: AutoProcessor
        The preprocessor for the model.
    model: BlipForConditionalGeneration
        The model to use.
    image: PIL.Image
        The image to generate a caption for.
    tokenizer: AutoTokenizer
        The tokenizer to use. If None, the default tokenizer for the model will be used.
    use_float_16: bool
        Whether to use float16 precision. This can speed up inference, but may lead to worse results.
    max_length: int
        Maximum length (in tokens) of the generated caption.

    Returns
    -------
    str
        The generated caption.
    """
    inputs = preprocessor(image, return_tensors="pt").to(device)
    if use_float_16:
        inputs = inputs.to(torch.float16)
    generated_ids = model.generate(
        pixel_values=inputs.pixel_values,
        # attention_mask=inputs.attention_mask,
        max_length=max_length,
        use_cache=True,
    )
    # Decode the generated token ids into text
    if tokenizer is None:
        generated_caption = preprocessor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    else:
        generated_caption = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_caption

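# Example usage (hypothetical local test; "example.jpg" is a placeholder path):
# from PIL import Image
# print(generate_caption(preprocessor_blip2_8_bit, model_blip2_8_bit,
#                        Image.open("example.jpg"), use_float_16=True))
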
def generate_captions_clip(
    model,
    transform,
    image
):
    """
    Generate a caption for the given image using the open_clip CoCa model.

    Parameters
    ----------
    model: open_clip CoCa model
        The captioning model to use.
    transform: Callable
        The transform to apply to the image before passing it to the model.
    image: PIL.Image
        The image to generate a caption for.

    Returns
    -------
    str
        The generated caption.
    """
    img = transform(image).unsqueeze(0).to(device)
    with torch.no_grad(), torch.cuda.amp.autocast():
        generated = model.generate(img, seq_len=32, do_sample=True, temperature=0.9)
    generated_caption = model.decode(generated[0].detach()).split("<end_of_text>")[0].replace("<start_of_text>", "")
    return generated_caption

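# Example usage (only once the CoCa model load above is uncommented):
# print(generate_captions_clip(model_oc_coca, transform_oc_coca, Image.open("example.jpg")))
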
def generate_captions(
    image,
    max_length=32,
    temperature=1.0,
):
    """
    Generate captions for the given image with every enabled model.

    Parameters
    ----------
    image: PIL.Image
        The image to generate captions for.
    max_length: int
        Maximum length (in tokens) of the generated captions.
    temperature: float
        Sampling temperature; only relevant for the sampling-based captioners (currently disabled).

    Returns
    -------
    str
        The generated caption.
    """
    # Generate a caption for the image using the Blip2 model
    caption_blip2_8_bit = generate_caption(
        preprocessor_blip2_8_bit, model_blip2_8_bit, image, use_float_16=True, max_length=max_length
    ).strip()
    # Generate a caption for the image using the Blip base model
    # caption_blip_base = generate_caption(preprocessor_blip_base, model_blip_base, image).strip()
    # Generate a caption for the image using the Blip large model
    # caption_blip_large = generate_caption(preprocessor_blip_large, model_blip_large, image).strip()
    # Generate a caption for the image using the GIT coco model
    # caption_git_large_coco = generate_caption(preprocessor_git_large_coco, model_git_large_coco, image).strip()
    # Generate a caption for the image using the CoCa (open_clip) model
    # caption_oc_coca = generate_captions_clip(model_oc_coca, transform_oc_coca, image).strip()
    return caption_blip2_8_bit

# Create the interface
iface = gr.Interface(
    fn=generate_captions,
    # Define the inputs: image, max caption length, sampling temperature
    inputs=[
        gr.inputs.Image(type="pil", label="Image"),
        gr.inputs.Slider(minimum=16, maximum=64, step=2, default=32, label="Max Length"),
        gr.inputs.Slider(minimum=0.5, maximum=1.5, step=0.1, default=1.0, label="Temperature"),
    ],
    # Define the outputs
    outputs=[
        gr.outputs.Textbox(label="Blip2 8-bit"),
        # gr.outputs.Textbox(label="Blip base"),
        # gr.outputs.Textbox(label="Blip large"),
        # gr.outputs.Textbox(label="GIT large coco"),
        # gr.outputs.Textbox(label="CLIP"),
    ],
    title="Image Captioning",
    description="Generate captions for images using the Blip2 model. The Blip base, Blip large, GIT large coco, and CoCa (open_clip) captioners are included in the code but currently disabled.",
    enable_queue=True,
)
# Launch the interface
iface.launch()
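# For a temporary public link during development, one could call iface.launch(share=True) instead.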