Spaces:
Running
Running
import cv2 as cv | |
import numpy as np | |
import torch | |
from gensim import models | |
import xgboost as xgb | |
import XGBoost_utils | |
import sys | |
import joblib | |
from DL_models import CustomResNet | |
# Ad/Brand Gaze Prediction
# Currently the model can only process magazine images or images with full-page counterpages.
# Indicate where the ad is with the ad_location parameter: left -> ad_location=0, right -> ad_location=1; otherwise, set it to None.
def _load_page_image(source):
    """Return ``source`` as an RGB page image.

    A ``str`` is treated as a file path: the image is read with OpenCV,
    converted from BGR to RGB and resized so that height and width are each
    rounded up to the next multiple of 32 (the text-detection step needs
    multiples of 32 in both dimensions). Any other value is returned
    unchanged, i.e. it is taken to be an image the caller already prepared.

    Raises:
        FileNotFoundError: if ``source`` is a path OpenCV cannot read.
    """
    if not isinstance(source, str):
        return source
    img = cv.imread(source)
    if img is None:
        # cv.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image file: {source!r}")
    img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
    height, width = img.shape[:2]
    rows = 32 * int(np.ceil(height / 32))
    cols = 32 * int(np.ceil(width / 32))
    # cv.resize takes the target size as (width, height).
    return cv.resize(img, (cols, rows))


def _salience_cluster_features(salience_map, threshold, num_clusters, enhance_rate):
    """Cluster one saliency map and return ``(scores, widths, sig_obj)``.

    Thin wrapper around ``XGBoost_utils.salience_matrix_conv`` followed by
    ``XGBoost_utils.img_clusters``; scores and widths are returned as numpy
    arrays, ``sig_obj`` is the fourth value returned by ``img_clusters``.
    """
    vecs, km = XGBoost_utils.salience_matrix_conv(
        salience_map, threshold, num_clusters, enhance_rate=enhance_rate)
    _, scores, widths, sig_obj = XGBoost_utils.img_clusters(
        num_clusters, salience_map, km.labels_, km.cluster_centers_, vecs)
    return np.array(scores), np.array(widths), sig_obj


def Ad_Gaze_Prediction(input_ad_path, input_ctpg_path, ad_location,
                       text_detection_model_path, LDA_model_pth, training_ad_text_dictionary_path, training_lang_preposition_path,
                       training_language, ad_embeddings, ctpg_embeddings,
                       surface_sizes=None, Product_Group=None, TextBoxes=None, Obj_and_Topics=None,
                       obj_detection_model_pth=None, num_topic=20, Gaze_Time_Type='Brand', Info_printing=True):
    """Predict the Brand or Ad gaze time with an ensemble of 10 XGBoost models.

    Args:
        input_ad_path: Path of the ad image (``str``) or an already-loaded
            image. It is also forwarded as-is to
            ``XGBoost_utils.filesize_individual``.
        input_ctpg_path: Path of the counterpage image, an already-loaded
            image, or ``None`` when there is no counterpage. With ``None`` the
            ad's saliency map is split into a left and a right half and all
            counterpage features are set to zero.
        ad_location: 0 if the ad is on the left, 1 if on the right; any other
            value yields the indicator ``[1, 1]``. Only consulted when a
            counterpage is given.
        text_detection_model_path: Forwarded to
            ``XGBoost_utils.text_detection_east``.
        LDA_model_pth: Path loaded with ``gensim.models.LdaModel.load``
            (only when ``Obj_and_Topics`` is ``None``).
        training_ad_text_dictionary_path: Path loaded with ``torch.load``
            (only when ``Obj_and_Topics`` is ``None``).
        training_lang_preposition_path: Path loaded with ``torch.load``
            (only when ``Obj_and_Topics`` is ``None``).
        training_language: Forwarded to
            ``XGBoost_utils.object_and_topic_variables``.
        ad_embeddings: Input to each PCA model's ``transform``; the first four
            components of the first row are used as the ad topic features.
        ctpg_embeddings: Same as ``ad_embeddings`` for the counterpage.
        surface_sizes: Four surface-size features. When ``None`` they are
            obtained interactively via ``XGBoost_utils.Region_Selection``
            (ad, brand and text regions).
        Product_Group: Product category indicator; when ``None`` it comes from
            ``XGBoost_utils.product_category()``.
        TextBoxes: Optional pair ``(ad_num_textboxes, ctpg_num_textboxes)``;
            when ``None`` both are detected from the images.
        Obj_and_Topics: Optional 4-tuple whose first two items are the ad and
            counterpage object counts. The last two items (topic weights) are
            accepted for compatibility but unused, because the topic features
            fed to the models come from the PCA of the embeddings.
        obj_detection_model_pth: Path of an object-detection model for
            ``torch.load``; when ``None`` ``yolov5s`` is fetched via
            ``torch.hub``.
        num_topic: Forwarded to ``XGBoost_utils.object_and_topic_variables``.
        Gaze_Time_Type: ``'Brand'`` or ``'Ad'``; selects the model directory.
        Info_printing: Print progress messages when true.

    Returns:
        float: ``exp(mean of the 10 model outputs) - 1``.

    Raises:
        ValueError: if ``Gaze_Time_Type`` is neither ``'Brand'`` nor ``'Ad'``.
        FileNotFoundError: if an image path cannot be read.
    """
    # Validate first: previously an unknown type only failed (unbound `med`)
    # after all the expensive feature extraction had already run.
    model_dirs = {'Brand': 'Brand_Gaze_Model', 'Ad': 'Ad_Gaze_Model'}
    if Gaze_Time_Type not in model_dirs:
        raise ValueError(
            f"Gaze_Time_Type must be 'Brand' or 'Ad', got {Gaze_Time_Type!r}")
    model_dir = model_dirs[Gaze_Time_Type]

    def say(*parts):
        # Progress output, silenced by Info_printing=False.
        if Info_printing:
            print(*parts)

    ##Image Loading
    say('Loading Image ......')
    ad_img = _load_page_image(input_ad_path)
    # No counterpage <=> the ad is handled as a full (left + right) page ad.
    has_ctpg = input_ctpg_path is not None
    ctpg_img = _load_page_image(input_ctpg_path) if has_ctpg else None
    say()

    ##File Size
    say('Calculating complexity (filsize) ......')
    # NOTE(review): the raw argument is forwarded even when it is an image
    # rather than a path -- confirm filesize_individual supports that.
    filesize_ad = XGBoost_utils.filesize_individual(input_ad_path)
    filesize_ctpg = XGBoost_utils.filesize_individual(input_ctpg_path) if has_ctpg else 0
    say()

    ##Salience
    say('Processing Salience Information ......')
    # Both saliency maps are computed before any clustering, as before.
    S_map_ad = XGBoost_utils.Itti_Saliency(ad_img, scale_final=3)
    if has_ctpg:
        S_map_ctpg = XGBoost_utils.Itti_Saliency(ctpg_img, scale_final=3)
    # Clustering settings ("K-Mean" in the original notes).
    threshold = 0.001
    enhance_rate = 1
    num_clusters = 3
    if has_ctpg:
        ad_sal, ad_widths, ad_sig_obj = _salience_cluster_features(
            S_map_ad, threshold, num_clusters, enhance_rate)
        ad_width = np.log(ad_widths + 1)
        ctpg_sal, ctpg_widths, ctpg_sig_obj = _salience_cluster_features(
            S_map_ctpg, threshold, num_clusters, enhance_rate)
        ctpg_width = np.log(ctpg_widths + 1)
    else:
        # Full-page ad: cluster each half separately and add the results.
        half = S_map_ad.shape[1] // 2
        scores_left, widths_left, D_left = _salience_cluster_features(
            S_map_ad[:, :half], threshold, num_clusters, enhance_rate)
        scores_right, widths_right, D_right = _salience_cluster_features(
            S_map_ad[:, half:], threshold, num_clusters, enhance_rate)
        ad_sal = scores_left + scores_right
        ad_width = np.log(widths_left + widths_right + 1)
        ad_sig_obj = D_left + D_right
        ctpg_sal = np.zeros_like(ad_sal)
        ctpg_width = np.zeros_like(ad_width)
        ctpg_sig_obj = 0
    say()

    ##Number of Textboxes
    say('Processing Textboxes ......')
    if TextBoxes is None:
        #Need multiples of 32 in both dimensions
        ad_num_textboxes = XGBoost_utils.text_detection_east(ad_img, text_detection_model_path)
        if has_ctpg:
            ctpg_num_textboxes = XGBoost_utils.text_detection_east(ctpg_img, text_detection_model_path)
        else:
            ctpg_num_textboxes = 0
    else:
        ad_num_textboxes, ctpg_num_textboxes = TextBoxes
    say()

    ##Objects and Topic Difference
    say('Processing Object and Topic Information ......')
    say('Loading Object Detection Model')
    # Only the object counts from this step reach the models: the topic
    # weights and topic difference are always replaced by the PCA-based
    # values computed in the prediction loop below.
    if Obj_and_Topics is None:
        # SECURITY: torch.load / LdaModel.load unpickle their input; only
        # point these paths at trusted files.
        if obj_detection_model_pth is None:
            model_obj = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True, trust_repo=True)
        else:
            model_obj = torch.load(obj_detection_model_pth)
        model_lda = models.LdaModel.load(LDA_model_pth)
        dictionary = torch.load(training_ad_text_dictionary_path)
        dutch_preposition = torch.load(training_lang_preposition_path)
        ad_num_objs, ctpg_num_objs, _, _ = XGBoost_utils.object_and_topic_variables(
            ad_img, ctpg_img, has_ctpg, dictionary,
            dutch_preposition, training_language, model_obj,
            model_lda, num_topic)
    else:
        ad_num_objs, ctpg_num_objs, _, _ = Obj_and_Topics
    say()

    ##Left and Right Indicator
    say('Getting Left/Right Indicator ......')
    if not has_ctpg:
        Left_right_indicator = [1, 1]
    elif ad_location == 0:
        Left_right_indicator = [1, 0]
    elif ad_location == 1:
        Left_right_indicator = [0, 1]
    else:
        Left_right_indicator = [1, 1]
    say()

    ##Product Category
    say('Getting Product Category Indicator ......')
    if Product_Group is None:
        group_ind = XGBoost_utils.product_category()
    else:
        group_ind = Product_Group
    say()

    ##Surface Sizes
    say('Getting Surface Sizes ......')
    if surface_sizes is None:
        # The selection helper is given a BGR image (OpenCV's channel order).
        ad_img = cv.cvtColor(ad_img, cv.COLOR_RGB2BGR)
        print('Please select the bounding box for your ad (from top left to bottom right)')
        A = XGBoost_utils.Region_Selection(ad_img)
        print()
        print('Please select the bounding box for brands (from top left to bottom right)')
        B = XGBoost_utils.Region_Selection(ad_img)
        print()
        print('Please select the bounding box for texts (from top left to bottom right)')
        T = XGBoost_utils.Region_Selection(ad_img)
        # Brand %, remaining %, text % of the ad region, then 5 per page side.
        surface_sizes = [B/A*100, (1-B/A-T/A)*100, T/A*100, sum(Left_right_indicator)*5]

    ##Get All things together
    say('Predicting ......')
    # Loop-invariant pieces are built once instead of once per model.
    # SECURITY: torch.load / joblib.load unpickle; use trusted model files only.
    # NOTE(review): hoisting assumes typ_cat does not mutate the medoid.
    med = torch.load(model_dir + '/typicality_train_medoid')
    base_features = (list(surface_sizes) + [filesize_ad, filesize_ctpg]
                     + list(ad_sal) + list(ctpg_sal) + list(ad_width) + list(ctpg_width)
                     + [ad_sig_obj, ctpg_sig_obj]
                     + [ad_num_textboxes, ctpg_num_textboxes, ad_num_objs, ctpg_num_objs])
    # Columns of the feature row that enter the typicality measure.
    typ_columns = [0, 1, 2, 3, 4, 6, 7, 8, 12, 13, 14, 18, 20, 22]
    num_models = 10
    gaze = 0
    for i in range(num_models):
        #Var construction
        pca_topic_transform = joblib.load('Topic_Embedding_PCAs/pca_model_' + str(i) + '.pkl')
        ad_topic_weights = pca_topic_transform.transform(ad_embeddings)[:, :4][0]
        ctpg_topics_curr = pca_topic_transform.transform(ctpg_embeddings)[:, :4][0]
        topic_Diff = np.linalg.norm(ad_topic_weights - ctpg_topics_curr)

        X = base_features + list(group_ind) + list(ad_topic_weights)
        X = np.array(X).reshape(1, len(X))
        X_for_typ = list(X[0, typ_columns]) + list(group_ind) + list(ad_topic_weights)
        X_for_typ = np.array(X_for_typ).reshape(1, len(X_for_typ))
        typ = XGBoost_utils.typ_cat(med, X_for_typ, group_ind, np.abs)

        Var = (base_features + Left_right_indicator + list(ad_topic_weights)
               + list(group_ind) + [topic_Diff.item(), typ.item()])
        Var = np.array(Var).reshape(1, len(Var))

        xgb_model = xgb.XGBRegressor()
        # PCA files are numbered from 0, XGBoost model files from 1.
        xgb_model.load_model(model_dir + '/10_models/Model_' + str(i + 1) + '.json')
        gaze += xgb_model.predict(Var)
    gaze = gaze / num_models

    # The ensemble mean is mapped back with exp(x) - 1.
    predicted_gaze = (np.exp(gaze) - 1).item()
    say('The predicted ' + Gaze_Time_Type + ' gaze time is: ', predicted_gaze)
    return predicted_gaze
def CNN_Prediction(adv_imgs, ctpg_imgs, ad_locations, Gaze_Type='AG'):
    """Average the gaze predictions of the 10 fine-tuned CNN models.

    Args:
        adv_imgs: Ad image input, passed straight to ``CustomResNet``.
        ctpg_imgs: Counterpage image input, passed straight to
            ``CustomResNet``.
        ad_locations: Ad location input, passed straight to ``CustomResNet``.
        Gaze_Type: ``'AG'`` or ``'BG'``; selects the model directory
            ``CNN_Gaze_Model/Fine-tune_<Gaze_Type>`` and the rescaling
            constants applied to the raw network output.

    Returns:
        The mean over the 10 models of ``exp(pred * a + b) - 1``, as returned
        by torch (tensor on the selected device).

    Raises:
        ValueError: if ``Gaze_Type`` is neither ``'AG'`` nor ``'BG'``.
    """
    # (a, b) used in exp(pred * a + b) - 1 for each gaze type.
    rescale_constants = {
        'AG': (0.2590, 1.1781),
        'BG': (0.2100, 0.3541),
    }
    # Fail early: previously an unknown type left a_temp/b_temp unbound.
    if Gaze_Type not in rescale_constants:
        raise ValueError(f"Gaze_Type must be 'AG' or 'BG', got {Gaze_Type!r}")
    a_temp, b_temp = rescale_constants[Gaze_Type]

    if torch.cuda.is_available():
        device = 'cuda'
    elif torch.backends.mps.is_available():
        device = 'mps'
    else:
        device = 'cpu'

    num_models = 10
    gaze = 0
    for i in range(num_models):
        net = CustomResNet()
        # Weights are loaded on the CPU first, then the model is moved.
        net.load_state_dict(torch.load('CNN_Gaze_Model/Fine-tune_'+Gaze_Type+'/Model_'+str(i)+'.pth',map_location=torch.device('cpu')))
        net = net.to(device)
        # Inference must run in eval mode so BatchNorm/Dropout layers (if
        # any) use their trained statistics instead of training behaviour.
        net.eval()
        # NOTE(review): the inputs are not moved to `device` here -- confirm
        # the caller or CustomResNet.forward takes care of that.
        with torch.no_grad():
            pred = net(adv_imgs, ctpg_imgs, ad_locations)
        pred = torch.exp(pred*a_temp+b_temp) - 1
        gaze += pred/num_models
    return gaze
def HeatMap_CNN(adv_imgs, ctpg_imgs, ad_locations, Gaze_Type='AG'):
    """Build a Grad-CAM style heat-map overlay for the first ad image.

    Uses only the first fine-tuned model (``Model_0.pth``) of the given
    ``Gaze_Type``. The prediction is back-propagated, the gradients returned
    by ``net.get_activations_gradient()`` are averaged per channel and used
    to weight the activations from ``net.get_activations``; the channel mean,
    clipped at zero and normalised, is colour-mapped and blended over
    ``adv_imgs[0]``.

    Args:
        adv_imgs: Ad image batch; divided by 255 before it is fed to the
            network, and ``adv_imgs[0]`` is read as a channel-first image for
            the overlay.
        ctpg_imgs: Counterpage image batch; divided by 255 before use.
        ad_locations: Ad location input, passed straight to the network.
        Gaze_Type: Selects the directory ``CNN_Gaze_Model/Fine-tune_<Gaze_Type>``.

    Returns:
        ``numpy.ndarray`` of dtype ``uint8``: the heat-map blended over the
        image, rescaled so that its maximum is 255.
    """
    if torch.cuda.is_available():
        device = 'cuda'
    elif torch.backends.mps.is_available():
        device = 'mps'
    else:
        device = 'cpu'

    net = CustomResNet()
    net.load_state_dict(torch.load('CNN_Gaze_Model/Fine-tune_'+Gaze_Type+'/Model_'+str(0)+'.pth',map_location=torch.device('cpu')))
    net = net.to(device)
    # Eval mode keeps BatchNorm/Dropout layers (if any) deterministic;
    # gradients are still tracked, which backward() below needs.
    net.eval()

    # The same scaled input must feed both the forward pass and the
    # activation lookup; previously the activations came from the unscaled
    # images and did not correspond to the gradients.
    scaled_adv_imgs = adv_imgs/255.0
    pred = net(scaled_adv_imgs,ctpg_imgs/255.0,ad_locations)
    pred.backward()
    # pull the gradients out of the model
    gradients = net.get_activations_gradient()
    # pool the gradients across the channels
    pooled_gradients = torch.mean(gradients, dim=[0, 2, 3])
    # get the activations of the last convolutional layer
    activations = net.get_activations(scaled_adv_imgs).detach()
    # weight the channels by corresponding gradients (broadcast over
    # batch/height/width; works for any channel count, not only 512)
    activations = activations * pooled_gradients.to(activations.device).view(1, -1, 1, 1)
    # average the channels of the activations
    heatmap = torch.mean(activations, dim=1).squeeze().to('cpu')
    # relu on top of the heatmap
    # expression (2) in https://arxiv.org/pdf/1610.02391.pdf
    heatmap = torch.clamp(heatmap, min=0)
    # normalize the heatmap; skip when it is all zeros to avoid 0/0 = NaN
    peak = torch.max(heatmap)
    if peak > 0:
        heatmap = heatmap / peak

    # Channel-first tensor -> height x width x channel uint8 array.
    img = torch.permute(adv_imgs[0],(1,2,0)).to(torch.uint8).cpu().numpy()
    # Swaps the first and third channels so the image matches the channel
    # order of the colour map produced by OpenCV below.
    # NOTE(review): presumably adv_imgs is RGB, making img BGR here -- confirm.
    img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
    # cv.resize takes the target size as (width, height).
    heatmap = cv.resize(heatmap.numpy(), (img.shape[1], img.shape[0]))
    heatmap = np.uint8(255 * heatmap)
    heatmap = cv.applyColorMap(heatmap, cv.COLORMAP_TURBO)
    # Blend, then rescale so the brightest value maps to 255.
    superimposed_img = heatmap * 0.8 + img * 0.5
    superimposed_img /= np.max(superimposed_img)
    superimposed_img = np.uint8(255 * superimposed_img)
    return superimposed_img