Spaces:

iamrobotbear
/

caption-match

Runtime error

File size: 3,039 Bytes

c617ba2
 
 
 
 
6c00d80
c617ba2
 
 
 
 
6c00d80
afa4c81
587c009
c617ba2
 
 
d72dfa9
 
 
 
 
 
 
c617ba2
 
 
587c009
6c00d80
c617ba2
17fbc96
587c009
c617ba2
 
 
587c009
 
 
c617ba2
587c009
c617ba2
587c009
 
c617ba2
587c009
 
 
 
 
 
 
 
 
 
 
 
 
 
c617ba2
 
afa4c81
c617ba2
ca6ee07
 
 
6c00d80

import gradio as gr
import torch
from PIL import Image
from lavis.models import load_model_and_preprocess
from lavis.processors import load_processor
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model and preprocessors for Image-Text Matching (LAVIS)
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = "cpu"

# BLIP-2 image-text matching model together with its image ("vis") and text
# preprocessors, loaded in evaluation mode on the selected device.
model_itm, vis_processors, text_processors = load_model_and_preprocess(
    "blip2_image_text_matching",
    "pretrain",
    device=device,
    is_eval=True,
)

# Load tokenizer and model for Image Captioning (TextCaps)
tokenizer_caption = AutoTokenizer.from_pretrained("microsoft/git-large-r-textcaps")
model_caption = AutoModelForCausalLM.from_pretrained("microsoft/git-large-r-textcaps")
model_caption = model_caption.to(device)

# List of statements for Image-Text Matching
# Each entry is scored against the uploaded image; results are reported in
# the order the statements appear here.
statements = [
    "cartoon, figurine, or toy",
    "appears to be for children",
    "includes children",
    "is sexual",
    (
        "depicts a child or portrays objects, images, or cartoon figures "
        "that primarily appeal to persons below the legal purchase age"
    ),
    "uses the name of or depicts Santa Claus",
    'promotes alcohol use as a "rite of passage" to adulthood',
]

# Function to generate image captions using TextCaps
def generate_image_captions(image):
    """Generate a caption for ``image`` with the GIT TextCaps checkpoint.

    GIT is a vision-language model, so ``generate`` has to be fed
    ``pixel_values`` produced by the checkpoint's image processor. A text
    tokenizer cannot encode an image; it is used here only to decode the
    generated token ids back into text.

    Args:
        image: The image to caption (the caller in this file passes a PIL
            image).

    Returns:
        The generated caption as a string, with special tokens removed.
    """
    # Local import keeps this function self-contained; transformers is
    # already a dependency of this file.
    from transformers import AutoProcessor

    # Load the processor on first use and cache it on the function object so
    # later requests do not reload it.
    processor = getattr(generate_image_captions, "_processor", None)
    if processor is None:
        processor = AutoProcessor.from_pretrained("microsoft/git-large-r-textcaps")
        generate_image_captions._processor = processor

    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated_ids = model_caption.generate(pixel_values=pixel_values, max_length=50)

    return tokenizer_caption.decode(generated_ids[0], skip_special_tokens=True)

# Main function to perform image captioning and image-text matching
def _itm_match_probability(img, text):
    """Return the ITM match probability of ``text`` for the image tensor ``img``.

    The text is run through the ITM text preprocessor, the model is called
    with the ITM head, and the softmax value at class index 1 (reported by
    this app as the matching probability) is returned as a Python float.
    """
    txt = text_processors["eval"](text)
    # Inference only: avoid building an autograd graph for every statement.
    with torch.no_grad():
        itm_output = model_itm({"image": img, "text_input": txt}, match_head="itm")
    itm_scores = torch.nn.functional.softmax(itm_output, dim=1)
    return itm_scores[:, 1].item()


def process_images_and_statements(image):
    """Caption an image and score it against every predefined statement.

    Args:
        image: Image as a NumPy array (as delivered by the Gradio image
            input), or ``None`` when nothing was uploaded.

    Returns:
        A newline-separated report: the first line holds the generated
        caption and its matching probability, followed by one line per entry
        of ``statements`` with that statement's matching probability.
    """
    # Submitting the form without an image yields None; report it instead of
    # failing on ``None.astype``.
    if image is None:
        return "No image provided."

    # Let PIL infer the mode from the array shape, then normalise to RGB so
    # grayscale or RGBA inputs are handled as well.
    pil_image = Image.fromarray(image.astype("uint8")).convert("RGB")
    img = vis_processors["eval"](pil_image).unsqueeze(0).to(device)

    # Generate image captions using TextCaps
    captions = generate_image_captions(pil_image)

    # Score the generated caption itself against the image
    score = _itm_match_probability(img, captions)
    results = [f'Image Caption: "{captions}" with a matching probability of {score:.3%}']

    # Compute ITM scores for predefined statements
    for statement in statements:
        score = _itm_match_probability(img, statement)
        results.append(
            f'The combination of image, caption ("{captions}"), and statement ("{statement}") is matched with a probability of {score:.3%}'
        )

    return "\n".join(results)

# Gradio interface
# Components are taken from the top-level gradio namespace: the legacy
# ``gr.inputs`` / ``gr.outputs`` modules are deprecated in Gradio 3 and were
# removed in Gradio 4, where referencing them fails at startup.
# ``type="numpy"`` makes explicit that the callback receives a NumPy array.
image_input = gr.Image(type="numpy")
output = gr.Textbox(label="Results")

iface = gr.Interface(
    fn=process_images_and_statements,
    inputs=image_input,
    outputs=output,
    title="Image Captioning and Image-Text Matching",
)
iface.launch()