"""Gradio demo that scores an uploaded image as "advertisement" vs "not an
advertisement" using ImageBind joint text/vision embeddings."""

import gradio as gr
import torch

from imagebind import data
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Instantiate the pretrained ImageBind model once at startup (it is large;
# reloading per request would be prohibitively slow).
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)

# Candidate text prompts; softmax over image-text similarity picks between them.
text_list = [
    "An Advertisement(branding, text, promotions, lifestyle depiction, contextual cues, and visual composition)",
    "Not an Advertisement",
]


def classify_image(image_path):
    """Run ImageBind on one uploaded image and return the softmax scores.

    Args:
        image_path: Filesystem path of the uploaded file, or None if the
            upload was cleared. (NOTE(review): assumes the Gradio File
            component delivers a path string — confirm against the installed
            Gradio version; older versions pass a tempfile wrapper with a
            ``.name`` attribute.)

    Returns:
        A markdown string with the Vision x Text softmax scores, or a prompt
        to upload when no file is present.
    """
    if image_path is None:
        return "Please upload an image."
    inputs = {
        ModalityType.TEXT: data.load_and_transform_text(text_list, device),
        ModalityType.VISION: data.load_and_transform_vision_data([image_path], device),
    }
    # Inference only — no gradients needed.
    with torch.no_grad():
        embeddings = model(inputs)
    # Compute the image-text similarity softmax once and reuse it.
    scores = torch.softmax(
        embeddings[ModalityType.VISION] @ embeddings[ModalityType.TEXT].T,
        dim=-1,
    )
    print("Vision x Text: ", scores)
    return f"Output = {scores}"


with gr.Blocks() as demo:
    gr.Markdown(
        """
        Zocket ImageBind made AdBind
        """
    )
    image = gr.File(type="filepath")
    result = gr.Markdown()
    # Re-classify whenever the uploaded file changes, instead of running
    # inference once at build time on the component object itself.
    image.change(classify_image, inputs=image, outputs=result)

if __name__ == "__main__":
    demo.launch()