# rscir / app.py
# Bill Psomas
# minor fix
# 2019c6b
import os
import numpy as np
import torch
from PIL import Image
import open_clip
import gradio as gr
import pickle
# Load pre-trained model.
# create_model_and_transforms returns (model, preprocess_train, preprocess_val);
# the third element is the inference-time image transform, NOT a tokenizer, so it
# is bound to `preprocess` and the tokenizer is fetched with get_tokenizer.
model, _, preprocess = open_clip.create_model_and_transforms('ViT-L-14', pretrained='openai')
tokenizer = open_clip.get_tokenizer('ViT-L-14')
# The model is only used for inference in this script.
model.eval()
# Load features
def load_features(pickle_file):
    """Return the object stored in the pickle file at ``pickle_file``.

    The file is opened in binary mode and closed again before returning.
    """
    # NOTE: unpickling can execute arbitrary code; only point this at trusted files.
    with open(pickle_file, mode='rb') as handle:
        return pickle.load(handle)
# Calculate similarity
def calculate_similarity(image_features, text_feature, lambda_val=0.5, query_image_feature=None):
    """Score every database item against a query.

    Args:
        image_features: database feature matrix, one row per item
            (presumably shape (N, D) -- confirm against the feature file).
        text_feature: feature of the text query, one row (1, D) as produced
            by the caller in this file.
        lambda_val: weight of the text term; the other term gets
            ``1 - lambda_val``.
        query_image_feature: optional feature of the query image, same
            layout as ``text_feature``. When given, the result is the
            composed score
            ``(1 - lambda_val) * sim(db, query_image) + lambda_val * sim(db, text)``.

    Returns:
        Tensor of scores with one row per database item (shape (N, 1) for
        single-row queries).

    Note:
        Without ``query_image_feature`` the function keeps its historical
        output for backward compatibility: the text-to-database similarity
        weighted by ``1 - lambda_val`` plus ``lambda_val`` times the text
        feature's similarity with itself. That second term is the same for
        every database item, so it never changes the ranking, and the query
        image plays no role. Pass ``query_image_feature`` to get a genuinely
        composed score.
    """
    db_vs_text = image_features @ text_feature.T

    if query_image_feature is None:
        # Legacy three-argument behaviour, reproduced exactly.
        text_self_similarity = text_feature @ text_feature.T
        return (1 - lambda_val) * db_vs_text + lambda_val * text_self_similarity

    db_vs_image = image_features @ query_image_feature.T
    return (1 - lambda_val) * db_vs_image + lambda_val * db_vs_text
# Load the precomputed database once at start-up. The pickle holds a mapping
# with the feature matrix under 'feats' and the image file paths under 'paths';
# retrieval indexes both with the same row index.
features = load_features('features/patternnet_clip.pkl')
image_paths = features['paths']
image_features = torch.tensor(features['feats'])
def image_text_retrieval(image, text, lambda_val):
    """Return the database images that best match a composed image + text query.

    Args:
        image: query image as a PIL image (the Gradio input uses type="pil").
        text: text query string.
        lambda_val: weight of the text similarity; the image similarity gets
            ``1 - lambda_val``.

    Returns:
        List of up to five PIL images opened from ``image_paths``, best match
        first.

    Raises:
        gr.Error: if no query image was supplied.
    """
    if image is None:
        raise gr.Error("Please upload a query image.")

    # Build the inference-time transform for the loaded model. With no explicit
    # mean/std, open_clip's image_transform falls back to the OpenAI CLIP
    # statistics, which is what the 'openai' weights expect -- TODO confirm
    # against the installed open_clip version.
    preprocess = open_clip.image_transform(model.visual.image_size, is_train=False)
    image_batch = preprocess(image).unsqueeze(0)

    # tokenize() already returns a (batch, context_length) tensor, so no extra
    # batch dimension is added. This is open_clip's default CLIP tokenizer.
    text_tokens = open_clip.tokenize([text or ""])

    # Inference only: avoid building an autograd graph.
    with torch.no_grad():
        image_feature = model.encode_image(image_batch)
        text_feature = model.encode_text(text_tokens)

    # Normalise both query features so the two similarity terms are on a
    # comparable scale, then match the dtype of the stored database features.
    # NOTE(review): assumes the stored features are L2-normalised CLIP
    # embeddings -- confirm against the feature-extraction script.
    image_feature = image_feature / image_feature.norm(dim=-1, keepdim=True)
    text_feature = text_feature / text_feature.norm(dim=-1, keepdim=True)
    image_feature = image_feature.cpu().to(image_features.dtype)
    text_feature = text_feature.cpu().to(image_features.dtype)

    # Composed score: the query image and the text each vote on every database
    # item. This is computed here because the three-argument form of
    # calculate_similarity has no slot for the query image feature.
    image_similarities = image_features @ image_feature.T
    text_similarities = image_features @ text_feature.T
    similarities = (1 - lambda_val) * image_similarities + lambda_val * text_similarities

    # Scores come out as (N, 1); flatten so topk ranks across database items,
    # and never ask for more results than the database holds.
    scores = similarities.flatten()
    top_k = min(5, scores.numel())
    top_indices = scores.topk(top_k).indices.tolist()

    # Retrieve top images
    return [Image.open(image_paths[i]) for i in top_indices]
# Create Gradio interface: callback invoked for every submitted query.
def demo(image, text, lambda_val):
    """Forward the Gradio inputs to ``image_text_retrieval`` and return its result."""
    retrieved = image_text_retrieval(image, text, lambda_val)
    return retrieved
# Input widgets, listed in the positional order of demo's parameters.
query_image_input = gr.Image(type="pil", label="Query Image")
query_text_input = gr.Textbox(lines=2, placeholder="Enter text query...", label="Text Query")
lambda_slider = gr.Slider(minimum=0, maximum=1, value=0.5, label="Lambda Value (Image-Text Weight)")

# Output widget that displays the list of images returned by demo.
results_gallery = gr.Gallery(label="Retrieved Images")

iface = gr.Interface(
    fn=demo,
    inputs=[query_image_input, query_text_input, lambda_slider],
    outputs=results_gallery,
    title="Composed Image Retrieval for Remote Sensing",
    description="Upload a query image, enter a text query, and adjust the lambda value to retrieve images based on both image and text inputs.",
)

# Start serving the app.
iface.launch()