File size: 4,387 Bytes
26fd00c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import torch 
import numpy as np
import os 
import re

def _load_state(path, ema):
	"""Load the checkpoint at *path* and return the weight dict to combine.

	When *ema* is True the EMA weights stored under ``extra_state -> ema``
	are returned, otherwise the regular ``model`` weights.
	"""
	ckpt = torch.load(path, map_location='cpu')
	return ckpt['extra_state']['ema'] if ema else ckpt['model']


def average(checkpoints, lambdas=None, num_models=6, output_dir=None, filename=None, skip_keys=None, ema=False):
	"""Average (or linearly interpolate) the weights of several checkpoints.

	Two modes, selected by *num_models*:

	* ``num_models == 1`` -- weighted average of all *checkpoints*, one
	  weight per checkpoint taken from *lambdas*; a single file
	  ``{filename}.pt`` is written to *output_dir*.
	* otherwise -- pairwise interpolation between ``checkpoints[0]`` and
	  ``checkpoints[1]``: for every ``l`` in *lambdas* a file
	  ``{filename}_l{l:.2f}.pt`` holding ``l * ckpt0 + (1 - l) * ckpt1``
	  is written.

	Non-weight entries (config, optimizer state, ...) of ``checkpoints[0]``
	are copied into every output checkpoint unchanged.

	Args:
		checkpoints: list of checkpoint file paths.
		lambdas: per-checkpoint weights (mode 1) or interpolation
			coefficients (mode 2). Defaults to ``[0.5, 0.5]``.
		num_models: mode switch, see above.
		output_dir: directory the merged checkpoint(s) are written to.
		filename: base name (without extension) of the output file(s).
		skip_keys: optional regex patterns / substrings; matching keys are
			not averaged and instead keep a rescaled copy of the first
			model's value.
		ema: combine the EMA weights instead of the raw model weights.
	"""
	# A literal list default would be shared (and mutable) across calls;
	# use the None-sentinel idiom instead.
	if lambdas is None:
		lambdas = [0.5, 0.5]

	ckpt = torch.load(checkpoints[0], map_location='cpu')
	key = 'extra_state' if ema else 'model'
	state = ckpt['extra_state']['ema'] if ema else ckpt['model']

	print(lambdas)

	if num_models == 1:
		# --- Mode 1: weighted average of an arbitrary number of models ---
		average_state = {k: v.clone() * lambdas[0] for k, v in state.items()}
		for i in range(1, len(checkpoints)):
			skip_keys_list = set()
			print(checkpoints[i], lambdas[i])
			statei = _load_state(checkpoints[i], ema)
			for k in average_state:
				# A key participates in averaging unless it matches one of
				# the skip patterns (regex match or plain substring).
				keep = skip_keys is None or (
					not any(re.match(sk, k) for sk in skip_keys)
					and not any(sk in k for sk in skip_keys)
				)
				if k in statei and keep:
					try:
						average_state[k] += lambdas[i] * statei[k].clone()
					except RuntimeError:
						# Shape mismatch between checkpoints: fall back to
						# rescaling the running value so the accumulated
						# weight still sums to the same total.
						print(k, average_state[k].shape, statei[k].shape)
						average_state[k] += lambdas[i] * average_state[k].clone()
				else:
					# Key missing from this checkpoint or explicitly
					# skipped: rescale the running value instead.
					average_state[k] += lambdas[i] * average_state[k].clone()
					skip_keys_list.add(k)

			state_dict = average_state
			print(skip_keys_list)

		if ema:
			save_obj = {key: {'ema': state_dict, 'epoch': 0}}
			for k, v in ckpt['extra_state'].items():
				if k != 'ema':
					# BUGFIX: was `save_obj['extra_state'] = v`, which
					# replaced the whole extra_state dict (dropping the
					# averaged EMA weights) with the last non-ema entry.
					save_obj['extra_state'][k] = v
					print(k)
			for k, v in ckpt.items():
				if k != key:
					save_obj[k] = v
					print(k)
		else:
			save_obj = {key: state_dict}
			for k, v in ckpt.items():
				if k != key:
					save_obj[k] = v
				print(k)
		output_path = os.path.join(output_dir, '{}.pt'.format(filename))
		print('saving', output_path)
		torch.save(save_obj, output_path)

	else:
		# --- Mode 2: interpolate between the first two checkpoints ---
		state_dict1 = state
		state_dict2 = _load_state(checkpoints[1], ema)
		for l in lambdas:
			average_state = {k: v * l for k, v in state_dict1.items()}
			for k in average_state:
				if k in state_dict2:
					average_state[k] += (1 - l) * state_dict2[k]
				else:
					# Key absent from the second model: blend the first
					# model's value with itself so the parameter keeps its
					# original scale.
					average_state[k] += (1 - l) * state_dict1[k]

			state_dict = average_state

			if ema:
				save_obj = {key: {'ema': state_dict}}
				for k, v in ckpt['extra_state'].items():
					if k != 'ema':
						save_obj['extra_state'][k] = v
						print(k)
				for k, v in ckpt.items():
					if k != key:
						save_obj[k] = v
						print(k)
			else:
				save_obj = {key: state_dict}
				for k, v in ckpt.items():
					if k != key:
						save_obj[k] = v
						print(k)
			output_path = os.path.join(output_dir, '{}_l{:.2f}.pt'.format(filename, l))
			print('saving', output_path)
			torch.save(save_obj, output_path)






# average of several models 

# lambdas = [1/4, 1/4, 1/4, 1/4]

# num_models=1
# output_dir='/lus/scratch/NAT/gda2204/SHARED/logs/ofa/pretrained_models/average_models/'
# filename='avg_caprefsnlivqa'

# checkpoints = [
# 			'/lus/scratch/NAT/gda2204/SHARED/logs/ofa/checkpoints/caption/caption_stage_1_ofaplus_base_pretrain_s2_hsep1_bs16_shuf/10_0.06_6000/checkpoint_best.pt',
# 			'/lus/scratch/NAT/gda2204/SHARED/logs/ofa/checkpoints/refcocoplus/refcocoplus_ofaplus_base_pretrain_s2_hsep1_fix_lr5e5_bs8_4_shuf/10_5e-5_512/checkpoint_best.pt',
# 			'/lus/scratch/NAT/gda2204/SHARED/logs/ofa/checkpoints/snli_ve/snli_ve_ofaplus_base_pretrain_s2_hsep1/10_5e-5/checkpoint_best.pt',
# 			'/lus/scratch/NAT/gda2204/SHARED/logs/ofa/checkpoints/vqa/vqa_ofaplus_base_pretrain_s2_bs16_lr1e4_shuf_hsep1/20_0.04_1e-4_480/checkpoint_best.pt',
# 			]

# Script configuration: interpolate between two fine-tuned checkpoints
# (VQA and captioning) at six mixing coefficients.
# num_models != 1 selects the pairwise-interpolation branch of average(),
# so only the first two entries of `checkpoints` are used.
num_models=6
output_dir='/lus/scratch/NAT/gda2204/SHARED/logs/ofa/pretrained_models/average_models/'
filename='avg_capvqa'
# One output file per coefficient: avg_capvqa_l0.00.pt ... avg_capvqa_l1.00.pt,
# each holding l * checkpoints[0] + (1 - l) * checkpoints[1].
lambdas = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]

checkpoints = ['/lus/scratch/NAT/gda2204/SHARED/logs/ofa/checkpoints/vqa/vqa_ofaplus_base_pretrain_s2_bs16_lr1e4_shuf_hsep1/20_0.04_1e-4_480/checkpoint_best.pt',
	       '/lus/scratch/NAT/gda2204/SHARED/logs/ofa/checkpoints/caption/caption_stage_1_ofaplus_base_pretrain_s2_hsep1_bs16_shuf/10_0.06_6000/checkpoint_best.pt',
    ]


# NOTE(review): runs at import time — consider an `if __name__ == "__main__":`
# guard if this module is ever imported rather than executed as a script.
average(checkpoints, lambdas=lambdas, num_models=num_models, output_dir=output_dir, filename=filename)