Upload 4 files
- dataset.py +109 -0
- inference_img.py +111 -0
- inference_video.py +297 -0
- train.py +155 -0
dataset.py
ADDED
@@ -0,0 +1,109 @@
import os
import cv2
import ast
import torch
import numpy as np
import random
from torch.utils.data import DataLoader, Dataset

cv2.setNumThreads(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class VimeoDataset(Dataset):
    def __init__(self, dataset_name, batch_size=32):
        self.batch_size = batch_size
        self.dataset_name = dataset_name
        self.h = 256
        self.w = 448
        self.data_root = 'vimeo_triplet'
        self.image_root = os.path.join(self.data_root, 'sequences')
        train_fn = os.path.join(self.data_root, 'tri_trainlist.txt')
        test_fn = os.path.join(self.data_root, 'tri_testlist.txt')
        with open(train_fn, 'r') as f:
            self.trainlist = f.read().splitlines()
        with open(test_fn, 'r') as f:
            self.testlist = f.read().splitlines()
        self.load_data()

    def __len__(self):
        return len(self.meta_data)

    def load_data(self):
        cnt = int(len(self.trainlist) * 0.95)
        if self.dataset_name == 'train':
            self.meta_data = self.trainlist[:cnt]
        elif self.dataset_name == 'test':
            self.meta_data = self.testlist
        else:
            self.meta_data = self.trainlist[cnt:]

    def crop(self, img0, gt, img1, h, w):
        ih, iw, _ = img0.shape
        x = np.random.randint(0, ih - h + 1)
        y = np.random.randint(0, iw - w + 1)
        img0 = img0[x:x+h, y:y+w, :]
        img1 = img1[x:x+h, y:y+w, :]
        gt = gt[x:x+h, y:y+w, :]
        return img0, gt, img1

    def getimg(self, index):
        imgpath = os.path.join(self.image_root, self.meta_data[index])
        imgpaths = [imgpath + '/im1.png', imgpath + '/im2.png', imgpath + '/im3.png']

        # Load images
        img0 = cv2.imread(imgpaths[0])
        gt = cv2.imread(imgpaths[1])
        img1 = cv2.imread(imgpaths[2])
        timestep = 0.5
        return img0, gt, img1, timestep

# RIFEm with Vimeo-Septuplet
# imgpaths = [imgpath + '/im1.png', imgpath + '/im2.png', imgpath + '/im3.png', imgpath + '/im4.png', imgpath + '/im5.png', imgpath + '/im6.png', imgpath + '/im7.png']
# ind = [0, 1, 2, 3, 4, 5, 6]
# random.shuffle(ind)
# ind = ind[:3]
# ind.sort()
# img0 = cv2.imread(imgpaths[ind[0]])
# gt = cv2.imread(imgpaths[ind[1]])
# img1 = cv2.imread(imgpaths[ind[2]])
# timestep = (ind[1] - ind[0]) * 1.0 / (ind[2] - ind[0] + 1e-6)

    def __getitem__(self, index):
        img0, gt, img1, timestep = self.getimg(index)
        if self.dataset_name == 'train':
            img0, gt, img1 = self.crop(img0, gt, img1, 224, 224)
            if random.uniform(0, 1) < 0.5:
                img0 = img0[:, :, ::-1]
                img1 = img1[:, :, ::-1]
                gt = gt[:, :, ::-1]
            if random.uniform(0, 1) < 0.5:
                img0 = img0[::-1]
                img1 = img1[::-1]
                gt = gt[::-1]
            if random.uniform(0, 1) < 0.5:
                img0 = img0[:, ::-1]
                img1 = img1[:, ::-1]
                gt = gt[:, ::-1]
            if random.uniform(0, 1) < 0.5:
                tmp = img1
                img1 = img0
                img0 = tmp
                timestep = 1 - timestep
            # random rotation
            p = random.uniform(0, 1)
            if p < 0.25:
                img0 = cv2.rotate(img0, cv2.ROTATE_90_CLOCKWISE)
                gt = cv2.rotate(gt, cv2.ROTATE_90_CLOCKWISE)
                img1 = cv2.rotate(img1, cv2.ROTATE_90_CLOCKWISE)
            elif p < 0.5:
                img0 = cv2.rotate(img0, cv2.ROTATE_180)
                gt = cv2.rotate(gt, cv2.ROTATE_180)
                img1 = cv2.rotate(img1, cv2.ROTATE_180)
            elif p < 0.75:
                img0 = cv2.rotate(img0, cv2.ROTATE_90_COUNTERCLOCKWISE)
                gt = cv2.rotate(gt, cv2.ROTATE_90_COUNTERCLOCKWISE)
                img1 = cv2.rotate(img1, cv2.ROTATE_90_COUNTERCLOCKWISE)
        img0 = torch.from_numpy(img0.copy()).permute(2, 0, 1)
        img1 = torch.from_numpy(img1.copy()).permute(2, 0, 1)
        gt = torch.from_numpy(gt.copy()).permute(2, 0, 1)
        timestep = torch.tensor(timestep).reshape(1, 1, 1)
        return torch.cat((img0, img1, gt), 0), timestep
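A minimal sketch of how this dataset is typically consumed, assuming the vimeo_triplet directory sits next to the script; the batch size, worker count, and loop below are illustrative and not part of the commit:

from torch.utils.data import DataLoader
from dataset import VimeoDataset

# Illustrative values: batch size and num_workers are assumptions.
train_set = VimeoDataset('train')
train_loader = DataLoader(train_set, batch_size=8, shuffle=True, num_workers=2, drop_last=True)

for batch, timestep in train_loader:
    # batch: uint8 tensor of shape (B, 9, 224, 224) -- img0, img1, gt stacked along the channel axis
    # timestep: tensor of shape (B, 1, 1, 1), fixed at 0.5 for the triplet dataset
    imgs, gt = batch[:, :6], batch[:, 6:9]
    break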
inference_img.py
ADDED
@@ -0,0 +1,111 @@
import os
import cv2
import torch
import argparse
from torch.nn import functional as F
import warnings
warnings.filterwarnings("ignore")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.set_grad_enabled(False)
if torch.cuda.is_available():
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

parser = argparse.ArgumentParser(description='Interpolation for a pair of images')
parser.add_argument('--img', dest='img', nargs=2, required=True)
parser.add_argument('--exp', default=4, type=int)
parser.add_argument('--ratio', default=0, type=float, help='inference ratio between two images with 0 - 1 range')
parser.add_argument('--rthreshold', default=0.02, type=float, help='returns image when actual ratio falls in given range threshold')
parser.add_argument('--rmaxcycles', default=8, type=int, help='limit max number of bisectional cycles')
parser.add_argument('--model', dest='modelDir', type=str, default='train_log', help='directory with trained model files')

args = parser.parse_args()

try:
    try:
        try:
            from model.RIFE_HDv2 import Model
            model = Model()
            model.load_model(args.modelDir, -1)
            print("Loaded v2.x HD model.")
        except:
            from train_log.RIFE_HDv3 import Model
            model = Model()
            model.load_model(args.modelDir, -1)
            print("Loaded v3.x HD model.")
    except:
        from model.RIFE_HD import Model
        model = Model()
        model.load_model(args.modelDir, -1)
        print("Loaded v1.x HD model")
except:
    from model.RIFE import Model
    model = Model()
    model.load_model(args.modelDir, -1)
    print("Loaded ArXiv-RIFE model")
model.eval()
model.device()

if args.img[0].endswith('.exr') and args.img[1].endswith('.exr'):
    img0 = cv2.imread(args.img[0], cv2.IMREAD_COLOR | cv2.IMREAD_ANYDEPTH)
    img1 = cv2.imread(args.img[1], cv2.IMREAD_COLOR | cv2.IMREAD_ANYDEPTH)
    img0 = (torch.tensor(img0.transpose(2, 0, 1)).to(device)).unsqueeze(0)
    img1 = (torch.tensor(img1.transpose(2, 0, 1)).to(device)).unsqueeze(0)

else:
    img0 = cv2.imread(args.img[0], cv2.IMREAD_UNCHANGED)
    img1 = cv2.imread(args.img[1], cv2.IMREAD_UNCHANGED)
    img0 = (torch.tensor(img0.transpose(2, 0, 1)).to(device) / 255.).unsqueeze(0)
    img1 = (torch.tensor(img1.transpose(2, 0, 1)).to(device) / 255.).unsqueeze(0)

n, c, h, w = img0.shape
ph = ((h - 1) // 32 + 1) * 32
pw = ((w - 1) // 32 + 1) * 32
padding = (0, pw - w, 0, ph - h)
img0 = F.pad(img0, padding)
img1 = F.pad(img1, padding)


if args.ratio:
    img_list = [img0]
    img0_ratio = 0.0
    img1_ratio = 1.0
    if args.ratio <= img0_ratio + args.rthreshold / 2:
        middle = img0
    elif args.ratio >= img1_ratio - args.rthreshold / 2:
        middle = img1
    else:
        tmp_img0 = img0
        tmp_img1 = img1
        for inference_cycle in range(args.rmaxcycles):
            middle = model.inference(tmp_img0, tmp_img1)
            middle_ratio = ( img0_ratio + img1_ratio ) / 2
            if args.ratio - (args.rthreshold / 2) <= middle_ratio <= args.ratio + (args.rthreshold / 2):
                break
            if args.ratio > middle_ratio:
                tmp_img0 = middle
                img0_ratio = middle_ratio
            else:
                tmp_img1 = middle
                img1_ratio = middle_ratio
    img_list.append(middle)
    img_list.append(img1)
else:
    img_list = [img0, img1]
    for i in range(args.exp):
        tmp = []
        for j in range(len(img_list) - 1):
            mid = model.inference(img_list[j], img_list[j + 1])
            tmp.append(img_list[j])
            tmp.append(mid)
        tmp.append(img1)
        img_list = tmp

if not os.path.exists('output'):
    os.mkdir('output')
for i in range(len(img_list)):
    if args.img[0].endswith('.exr') and args.img[1].endswith('.exr'):
        cv2.imwrite('output/img{}.exr'.format(i), (img_list[i][0]).cpu().numpy().transpose(1, 2, 0)[:h, :w], [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF])
    else:
        cv2.imwrite('output/img{}.png'.format(i), (img_list[i][0] * 255).byte().cpu().numpy().transpose(1, 2, 0)[:h, :w])
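Based on the flags defined above, a typical invocation (with placeholder file names) would be:

    python3 inference_img.py --img frame0.png frame1.png --exp=2

which writes the two inputs plus the interpolated frames (2^exp + 1 images in total, or a single ratio-matched frame when --ratio is given) to an output/ directory next to the script.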
inference_video.py
ADDED
@@ -0,0 +1,297 @@
import os
import cv2
import torch
import argparse
import numpy as np
from tqdm import tqdm
from torch.nn import functional as F
import warnings
import _thread
import skvideo.io
from queue import Queue, Empty
from model.pytorch_msssim import ssim_matlab

warnings.filterwarnings("ignore")

def transferAudio(sourceVideo, targetVideo):
    import shutil
    import moviepy.editor
    tempAudioFileName = "./temp/audio.mkv"

    # split audio from original video file and store in "temp" directory
    if True:

        # clear old "temp" directory if it exits
        if os.path.isdir("temp"):
            # remove temp directory
            shutil.rmtree("temp")
        # create new "temp" directory
        os.makedirs("temp")
        # extract audio from video
        os.system('ffmpeg -y -i "{}" -c:a copy -vn {}'.format(sourceVideo, tempAudioFileName))

    targetNoAudio = os.path.splitext(targetVideo)[0] + "_noaudio" + os.path.splitext(targetVideo)[1]
    os.rename(targetVideo, targetNoAudio)
    # combine audio file and new video file
    os.system('ffmpeg -y -i "{}" -i {} -c copy "{}"'.format(targetNoAudio, tempAudioFileName, targetVideo))

    if os.path.getsize(targetVideo) == 0: # if ffmpeg failed to merge the video and audio together try converting the audio to aac
        tempAudioFileName = "./temp/audio.m4a"
        os.system('ffmpeg -y -i "{}" -c:a aac -b:a 160k -vn {}'.format(sourceVideo, tempAudioFileName))
        os.system('ffmpeg -y -i "{}" -i {} -c copy "{}"'.format(targetNoAudio, tempAudioFileName, targetVideo))
        if (os.path.getsize(targetVideo) == 0): # if aac is not supported by selected format
            os.rename(targetNoAudio, targetVideo)
            print("Audio transfer failed. Interpolated video will have no audio")
        else:
            print("Lossless audio transfer failed. Audio was transcoded to AAC (M4A) instead.")

            # remove audio-less video
            os.remove(targetNoAudio)
    else:
        os.remove(targetNoAudio)

    # remove temp directory
    shutil.rmtree("temp")

parser = argparse.ArgumentParser(description='Interpolation for a pair of images')
parser.add_argument('--video', dest='video', type=str, default=None)
parser.add_argument('--output', dest='output', type=str, default=None)
parser.add_argument('--img', dest='img', type=str, default=None)
parser.add_argument('--montage', dest='montage', action='store_true', help='montage origin video')
parser.add_argument('--model', dest='modelDir', type=str, default='train_log', help='directory with trained model files')
parser.add_argument('--fp16', dest='fp16', action='store_true', help='fp16 mode for faster and more lightweight inference on cards with Tensor Cores')
parser.add_argument('--UHD', dest='UHD', action='store_true', help='support 4k video')
parser.add_argument('--scale', dest='scale', type=float, default=1.0, help='Try scale=0.5 for 4k video')
parser.add_argument('--skip', dest='skip', action='store_true', help='whether to remove static frames before processing')
parser.add_argument('--fps', dest='fps', type=int, default=None)
parser.add_argument('--png', dest='png', action='store_true', help='whether to vid_out png format vid_outs')
parser.add_argument('--ext', dest='ext', type=str, default='mp4', help='vid_out video extension')
parser.add_argument('--exp', dest='exp', type=int, default=1)
args = parser.parse_args()
assert (not args.video is None or not args.img is None)
if args.skip:
    print("skip flag is abandoned, please refer to issue #207.")
if args.UHD and args.scale==1.0:
    args.scale = 0.5
assert args.scale in [0.25, 0.5, 1.0, 2.0, 4.0]
if not args.img is None:
    args.png = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.set_grad_enabled(False)
if torch.cuda.is_available():
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    if(args.fp16):
        torch.set_default_tensor_type(torch.cuda.HalfTensor)

try:
    try:
        try:
            from model.RIFE_HDv2 import Model
            model = Model()
            model.load_model(args.modelDir, -1)
            print("Loaded v2.x HD model.")
        except:
            from train_log.RIFE_HDv3 import Model
            model = Model()
            model.load_model(args.modelDir, -1)
            print("Loaded v3.x HD model.")
    except:
        from model.RIFE_HD import Model
        model = Model()
        model.load_model(args.modelDir, -1)
        print("Loaded v1.x HD model")
except:
    from model.RIFE import Model
    model = Model()
    model.load_model(args.modelDir, -1)
    print("Loaded ArXiv-RIFE model")
model.eval()
model.device()

if not args.video is None:
    videoCapture = cv2.VideoCapture(args.video)
    fps = videoCapture.get(cv2.CAP_PROP_FPS)
    tot_frame = videoCapture.get(cv2.CAP_PROP_FRAME_COUNT)
    videoCapture.release()
    if args.fps is None:
        fpsNotAssigned = True
        args.fps = fps * (2 ** args.exp)
    else:
        fpsNotAssigned = False
    videogen = skvideo.io.vreader(args.video)
    lastframe = next(videogen)
    fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
    video_path_wo_ext, ext = os.path.splitext(args.video)
    print('{}.{}, {} frames in total, {}FPS to {}FPS'.format(video_path_wo_ext, args.ext, tot_frame, fps, args.fps))
    if args.png == False and fpsNotAssigned == True:
        print("The audio will be merged after interpolation process")
    else:
        print("Will not merge audio because using png or fps flag!")
else:
    videogen = []
    for f in os.listdir(args.img):
        if 'png' in f:
            videogen.append(f)
    tot_frame = len(videogen)
    videogen.sort(key= lambda x:int(x[:-4]))
    lastframe = cv2.imread(os.path.join(args.img, videogen[0]), cv2.IMREAD_UNCHANGED)[:, :, ::-1].copy()
    videogen = videogen[1:]
h, w, _ = lastframe.shape
vid_out_name = None
vid_out = None
if args.png:
    if not os.path.exists('vid_out'):
        os.mkdir('vid_out')
else:
    if args.output is not None:
        vid_out_name = args.output
    else:
        vid_out_name = '{}_{}X_{}fps.{}'.format(video_path_wo_ext, (2 ** args.exp), int(np.round(args.fps)), args.ext)
    vid_out = cv2.VideoWriter(vid_out_name, fourcc, args.fps, (w, h))

def clear_write_buffer(user_args, write_buffer):
    cnt = 0
    while True:
        item = write_buffer.get()
        if item is None:
            break
        if user_args.png:
            cv2.imwrite('vid_out/{:0>7d}.png'.format(cnt), item[:, :, ::-1])
            cnt += 1
        else:
            vid_out.write(item[:, :, ::-1])

def build_read_buffer(user_args, read_buffer, videogen):
    try:
        for frame in videogen:
            if not user_args.img is None:
                frame = cv2.imread(os.path.join(user_args.img, frame), cv2.IMREAD_UNCHANGED)[:, :, ::-1].copy()
            if user_args.montage:
                frame = frame[:, left: left + w]
            read_buffer.put(frame)
    except:
        pass
    read_buffer.put(None)

def make_inference(I0, I1, n):
    global model
    middle = model.inference(I0, I1, args.scale)
    if n == 1:
        return [middle]
    first_half = make_inference(I0, middle, n=n//2)
    second_half = make_inference(middle, I1, n=n//2)
    if n%2:
        return [*first_half, middle, *second_half]
    else:
        return [*first_half, *second_half]

def pad_image(img):
    if(args.fp16):
        return F.pad(img, padding).half()
    else:
        return F.pad(img, padding)

if args.montage:
    left = w // 4
    w = w // 2
tmp = max(32, int(32 / args.scale))
ph = ((h - 1) // tmp + 1) * tmp
pw = ((w - 1) // tmp + 1) * tmp
padding = (0, pw - w, 0, ph - h)
pbar = tqdm(total=tot_frame)
if args.montage:
    lastframe = lastframe[:, left: left + w]
write_buffer = Queue(maxsize=500)
read_buffer = Queue(maxsize=500)
_thread.start_new_thread(build_read_buffer, (args, read_buffer, videogen))
_thread.start_new_thread(clear_write_buffer, (args, write_buffer))

I1 = torch.from_numpy(np.transpose(lastframe, (2,0,1))).to(device, non_blocking=True).unsqueeze(0).float() / 255.
I1 = pad_image(I1)
temp = None # save lastframe when processing static frame

while True:
    if temp is not None:
        frame = temp
        temp = None
    else:
        frame = read_buffer.get()
    if frame is None:
        break
    I0 = I1
    I1 = torch.from_numpy(np.transpose(frame, (2,0,1))).to(device, non_blocking=True).unsqueeze(0).float() / 255.
    I1 = pad_image(I1)
    I0_small = F.interpolate(I0, (32, 32), mode='bilinear', align_corners=False)
    I1_small = F.interpolate(I1, (32, 32), mode='bilinear', align_corners=False)
    ssim = ssim_matlab(I0_small[:, :3], I1_small[:, :3])

    break_flag = False
    if ssim > 0.996:
        frame = read_buffer.get() # read a new frame
        if frame is None:
            break_flag = True
            frame = lastframe
        else:
            temp = frame
        I1 = torch.from_numpy(np.transpose(frame, (2,0,1))).to(device, non_blocking=True).unsqueeze(0).float() / 255.
        I1 = pad_image(I1)
        I1 = model.inference(I0, I1, args.scale)
        I1_small = F.interpolate(I1, (32, 32), mode='bilinear', align_corners=False)
        ssim = ssim_matlab(I0_small[:, :3], I1_small[:, :3])
        frame = (I1[0] * 255).byte().cpu().numpy().transpose(1, 2, 0)[:h, :w]

    if ssim < 0.2:
        output = []
        for i in range((2 ** args.exp) - 1):
            output.append(I0)
        '''
        output = []
        step = 1 / (2 ** args.exp)
        alpha = 0
        for i in range((2 ** args.exp) - 1):
            alpha += step
            beta = 1-alpha
            output.append(torch.from_numpy(np.transpose((cv2.addWeighted(frame[:, :, ::-1], alpha, lastframe[:, :, ::-1], beta, 0)[:, :, ::-1].copy()), (2,0,1))).to(device, non_blocking=True).unsqueeze(0).float() / 255.)
        '''
    else:
        output = make_inference(I0, I1, 2**args.exp-1) if args.exp else []

    if args.montage:
        write_buffer.put(np.concatenate((lastframe, lastframe), 1))
        for mid in output:
            mid = (((mid[0] * 255.).byte().cpu().numpy().transpose(1, 2, 0)))
            write_buffer.put(np.concatenate((lastframe, mid[:h, :w]), 1))
    else:
        write_buffer.put(lastframe)
        for mid in output:
            mid = (((mid[0] * 255.).byte().cpu().numpy().transpose(1, 2, 0)))
            write_buffer.put(mid[:h, :w])
    pbar.update(1)
    lastframe = frame
    if break_flag:
        break

if args.montage:
    write_buffer.put(np.concatenate((lastframe, lastframe), 1))
else:
    write_buffer.put(lastframe)

write_buffer.put(None)

import time
while(not write_buffer.empty()):
    time.sleep(0.1)
pbar.close()
if not vid_out is None:
    vid_out.release()

# move audio to new video file if appropriate
if args.png == False and fpsNotAssigned == True and not args.video is None:
    try:
        transferAudio(args.video, vid_out_name)
    except:
        print("Audio transfer failed. Interpolated video will have no audio")
        targetNoAudio = os.path.splitext(vid_out_name)[0] + "_noaudio" + os.path.splitext(vid_out_name)[1]
        os.rename(targetNoAudio, vid_out_name)
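Given the parser above, typical invocations (with placeholder input names) would be:

    python3 inference_video.py --exp=1 --video=input.mp4
    python3 inference_video.py --exp=2 --img=frames/ --png

The first doubles the frame rate of a video and, since neither --fps nor --png is set, re-attaches the original audio track via transferAudio; the second reads numbered PNG frames from a directory and writes interpolated PNGs to vid_out/.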
train.py
ADDED
@@ -0,0 +1,155 @@
import os
import cv2
import math
import time
import torch
import torch.distributed as dist
import numpy as np
import random
import argparse

from model.RIFE import Model
from dataset import *
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data.distributed import DistributedSampler

device = torch.device("cuda")

log_path = 'train_log'

def get_learning_rate(step):
    if step < 2000:
        mul = step / 2000.
        return 3e-4 * mul
    else:
        mul = np.cos((step - 2000) / (args.epoch * args.step_per_epoch - 2000.) * math.pi) * 0.5 + 0.5
        return (3e-4 - 3e-6) * mul + 3e-6

def flow2rgb(flow_map_np):
    h, w, _ = flow_map_np.shape
    rgb_map = np.ones((h, w, 3)).astype(np.float32)
    normalized_flow_map = flow_map_np / (np.abs(flow_map_np).max())

    rgb_map[:, :, 0] += normalized_flow_map[:, :, 0]
    rgb_map[:, :, 1] -= 0.5 * (normalized_flow_map[:, :, 0] + normalized_flow_map[:, :, 1])
    rgb_map[:, :, 2] += normalized_flow_map[:, :, 1]
    return rgb_map.clip(0, 1)

def train(model, local_rank):
    if local_rank == 0:
        writer = SummaryWriter('train')
        writer_val = SummaryWriter('validate')
    else:
        writer = None
        writer_val = None
    step = 0
    nr_eval = 0
    dataset = VimeoDataset('train')
    sampler = DistributedSampler(dataset)
    train_data = DataLoader(dataset, batch_size=args.batch_size, num_workers=8, pin_memory=True, drop_last=True, sampler=sampler)
    args.step_per_epoch = train_data.__len__()
    dataset_val = VimeoDataset('validation')
    val_data = DataLoader(dataset_val, batch_size=16, pin_memory=True, num_workers=8)
    print('training...')
    time_stamp = time.time()
    for epoch in range(args.epoch):
        sampler.set_epoch(epoch)
        for i, data in enumerate(train_data):
            data_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            data_gpu, timestep = data
            data_gpu = data_gpu.to(device, non_blocking=True) / 255.
            timestep = timestep.to(device, non_blocking=True)
            imgs = data_gpu[:, :6]
            gt = data_gpu[:, 6:9]
            learning_rate = get_learning_rate(step) * args.world_size / 4
            pred, info = model.update(imgs, gt, learning_rate, training=True) # pass timestep if you are training RIFEm
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            if step % 200 == 1 and local_rank == 0:
                writer.add_scalar('learning_rate', learning_rate, step)
                writer.add_scalar('loss/l1', info['loss_l1'], step)
                writer.add_scalar('loss/tea', info['loss_tea'], step)
                writer.add_scalar('loss/distill', info['loss_distill'], step)
            if step % 1000 == 1 and local_rank == 0:
                gt = (gt.permute(0, 2, 3, 1).detach().cpu().numpy() * 255).astype('uint8')
                mask = (torch.cat((info['mask'], info['mask_tea']), 3).permute(0, 2, 3, 1).detach().cpu().numpy() * 255).astype('uint8')
                pred = (pred.permute(0, 2, 3, 1).detach().cpu().numpy() * 255).astype('uint8')
                merged_img = (info['merged_tea'].permute(0, 2, 3, 1).detach().cpu().numpy() * 255).astype('uint8')
                flow0 = info['flow'].permute(0, 2, 3, 1).detach().cpu().numpy()
                flow1 = info['flow_tea'].permute(0, 2, 3, 1).detach().cpu().numpy()
                for i in range(5):
                    imgs = np.concatenate((merged_img[i], pred[i], gt[i]), 1)[:, :, ::-1]
                    writer.add_image(str(i) + '/img', imgs, step, dataformats='HWC')
                    writer.add_image(str(i) + '/flow', np.concatenate((flow2rgb(flow0[i]), flow2rgb(flow1[i])), 1), step, dataformats='HWC')
                    writer.add_image(str(i) + '/mask', mask[i], step, dataformats='HWC')
                writer.flush()
            if local_rank == 0:
                print('epoch:{} {}/{} time:{:.2f}+{:.2f} loss_l1:{:.4e}'.format(epoch, i, args.step_per_epoch, data_time_interval, train_time_interval, info['loss_l1']))
            step += 1
        nr_eval += 1
        if nr_eval % 5 == 0:
            evaluate(model, val_data, step, local_rank, writer_val)
        model.save_model(log_path, local_rank)
        dist.barrier()

def evaluate(model, val_data, nr_eval, local_rank, writer_val):
    loss_l1_list = []
    loss_distill_list = []
    loss_tea_list = []
    psnr_list = []
    psnr_list_teacher = []
    time_stamp = time.time()
    for i, data in enumerate(val_data):
        data_gpu, timestep = data
        data_gpu = data_gpu.to(device, non_blocking=True) / 255.
        imgs = data_gpu[:, :6]
        gt = data_gpu[:, 6:9]
        with torch.no_grad():
            pred, info = model.update(imgs, gt, training=False)
            merged_img = info['merged_tea']
        loss_l1_list.append(info['loss_l1'].cpu().numpy())
        loss_tea_list.append(info['loss_tea'].cpu().numpy())
        loss_distill_list.append(info['loss_distill'].cpu().numpy())
        for j in range(gt.shape[0]):
            psnr = -10 * math.log10(torch.mean((gt[j] - pred[j]) * (gt[j] - pred[j])).cpu().data)
            psnr_list.append(psnr)
            psnr = -10 * math.log10(torch.mean((merged_img[j] - gt[j]) * (merged_img[j] - gt[j])).cpu().data)
            psnr_list_teacher.append(psnr)
        gt = (gt.permute(0, 2, 3, 1).cpu().numpy() * 255).astype('uint8')
        pred = (pred.permute(0, 2, 3, 1).cpu().numpy() * 255).astype('uint8')
        merged_img = (merged_img.permute(0, 2, 3, 1).cpu().numpy() * 255).astype('uint8')
        flow0 = info['flow'].permute(0, 2, 3, 1).cpu().numpy()
        flow1 = info['flow_tea'].permute(0, 2, 3, 1).cpu().numpy()
        if i == 0 and local_rank == 0:
            for j in range(10):
                imgs = np.concatenate((merged_img[j], pred[j], gt[j]), 1)[:, :, ::-1]
                writer_val.add_image(str(j) + '/img', imgs.copy(), nr_eval, dataformats='HWC')
                writer_val.add_image(str(j) + '/flow', flow2rgb(flow0[j][:, :, ::-1]), nr_eval, dataformats='HWC')

    eval_time_interval = time.time() - time_stamp

    if local_rank != 0:
        return
    writer_val.add_scalar('psnr', np.array(psnr_list).mean(), nr_eval)
    writer_val.add_scalar('psnr_teacher', np.array(psnr_list_teacher).mean(), nr_eval)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--epoch', default=300, type=int)
    parser.add_argument('--batch_size', default=16, type=int, help='minibatch size')
    parser.add_argument('--local_rank', default=0, type=int, help='local rank')
    parser.add_argument('--world_size', default=4, type=int, help='world size')
    args = parser.parse_args()
    torch.distributed.init_process_group(backend="nccl", world_size=args.world_size)
    torch.cuda.set_device(args.local_rank)
    seed = 1234
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = True
    model = Model(args.local_rank)
    train(model, args.local_rank)
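train.py expects a distributed launch: it reads --local_rank, initializes an NCCL process group sized by --world_size, and shards the Vimeo dataset with a DistributedSampler. A plausible single-node launch on 4 GPUs (the launcher command is an assumption, not part of this commit) would be:

    python3 -m torch.distributed.launch --nproc_per_node=4 train.py --world_size=4

Checkpoints are written to train_log/ and TensorBoard logs to the train/ and validate/ directories.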