Upload 26 files
TAD Bot Face Detection algorithm
- detectfaces.py +102 -0
- face_detection.py +23 -0
- main.py +602 -0
- models/.DS_Store +0 -0
- models/PosterV2_7cls.py +441 -0
- models/PosterV2_8cls.py +317 -0
- models/__pycache__/PosterV2_7cls.cpython-310.pyc +0 -0
- models/__pycache__/PosterV2_7cls.cpython-311.pyc +0 -0
- models/__pycache__/ir50.cpython-310.pyc +0 -0
- models/__pycache__/ir50.cpython-311.pyc +0 -0
- models/__pycache__/mobilefacenet.cpython-310.pyc +0 -0
- models/__pycache__/mobilefacenet.cpython-311.pyc +0 -0
- models/__pycache__/vit_model.cpython-310.pyc +0 -0
- models/__pycache__/vit_model.cpython-311.pyc +0 -0
- models/ir50.py +272 -0
- models/matrix.py +62 -0
- models/mobilefacenet.py +193 -0
- models/pretrain/.DS_Store +0 -0
- models/pretrain/.gitignore +2 -0
- models/pretrain/ir50.pth +3 -0
- models/pretrain/mobilefacenet_model_best.pth.tar +3 -0
- models/vit_model.py +828 -0
- models/vit_model_8.py +828 -0
- prediction.py +103 -0
- raf-db-model_best.pth +3 -0
- requirements.txt +131 -0
detectfaces.py
ADDED
@@ -0,0 +1,102 @@
from main import *
import cv2
import time

model_path = "raf-db-model_best.pth"

if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = pyramid_trans_expr2(img_size=224, num_classes=7)

model = torch.nn.DataParallel(model)
model = model.to(device)
currtime = time.strftime("%H:%M:%S")
print(currtime)


def main():
    if model_path is not None:
        if os.path.isfile(model_path):
            print("=> loading checkpoint '{}'".format(model_path))
            checkpoint = torch.load(model_path, map_location=device)
            best_acc = checkpoint["best_acc"]
            best_acc = best_acc.to()
            print(f"best_acc:{best_acc}")
            model.load_state_dict(checkpoint["state_dict"])
            print(
                "=> loaded checkpoint '{}' (epoch {})".format(
                    model_path, checkpoint["epoch"]
                )
            )
        else:
            print("=> no checkpoint found at '{}'".format(model_path))
        imagecapture(model)
        return


def imagecapture(model):
    currtimeimg = time.strftime("%H:%M:%S")
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("Error: Could not open webcam.")
        exit()

    face_cascade = cv2.CascadeClassifier(
        cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
    )

    start_time = None
    capturing = False

    while True:
        from prediction import predict

        ret, frame = cap.read()

        if not ret:
            print("Error: Could not read frame.")
            break

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        faces = face_cascade.detectMultiScale(
            gray, scaleFactor=1.3, minNeighbors=5, minSize=(30, 30)
        )

        # Display the frame
        cv2.imshow("Webcam", frame)

        # If faces are detected, start the timer
        if len(faces) > 0:
            print(f"[!]Face detected at {currtimeimg}")
            face_region = frame[
                faces[0][1] : faces[0][1] + faces[0][3],
                faces[0][0] : faces[0][0] + faces[0][2],
            ]  # Crop the face region
            face_pil_image = Image.fromarray(
                cv2.cvtColor(face_region, cv2.COLOR_BGR2RGB)
            )  # Convert to PIL image
            print("[!]Start Expressions")
            print(f"-->Prediction starting at {currtimeimg}")
            predictions = predict(model, image_path=face_pil_image)
            print(f"-->Done prediction at {currtimeimg}")

            # Reset capturing
            capturing = False

        # Break the loop if the 'q' key is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    # Release the webcam and close the OpenCV window
    cap.release()
    cv2.destroyAllWindows()


if __name__ == "__main__":
    main()
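For a quick sanity check without a webcam, the same checkpoint-loading and prediction path can be exercised on a single saved photo. The sketch below is illustrative and not part of the commit: it assumes you run it from the repository root, that `sample_face.jpg` is any local test image (hypothetical name), and that `prediction.predict` accepts a PIL image through its `image_path` argument, exactly as the loop above calls it.

# Hypothetical smoke test (not part of the upload): one prediction on a saved image.
import torch
from PIL import Image

from main import pyramid_trans_expr2   # re-exported via main.py's star import
from prediction import predict

device = "cuda" if torch.cuda.is_available() else "cpu"

model = torch.nn.DataParallel(pyramid_trans_expr2(img_size=224, num_classes=7)).to(device)
checkpoint = torch.load("raf-db-model_best.pth", map_location=device)
model.load_state_dict(checkpoint["state_dict"])

face = Image.open("sample_face.jpg").convert("RGB")  # hypothetical input file
predictions = predict(model, image_path=face)        # same call pattern as imagecapture()
print(predictions)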
face_detection.py
ADDED
@@ -0,0 +1,23 @@
from deepface import DeepFace
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import time


def face_detection(img_path):
    currtime = time.strftime("%H:%M:%S")
    face_objs = DeepFace.extract_faces(np.array(img_path), detector_backend="mtcnn", enforce_detection=False)

    coordinates = face_objs[0]["facial_area"]
    image = img_path
    cropped_image = image.crop(
        (
            coordinates["x"],
            coordinates["y"],
            coordinates["x"] + coordinates["w"],
            coordinates["y"] + coordinates["h"],
        )
    )
    cropped_image.save(f"Images/test_{currtime}.jpg")
    return cropped_image
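A usage sketch for the helper above (illustrative, not part of the commit): `face_detection` expects a PIL image, since it crops via `Image.crop`, and it writes the crop into an `Images/` folder that must already exist. The input file name `group_photo.jpg` is hypothetical.

# Hypothetical call site: crop the first face DeepFace finds in a photo.
import os
from PIL import Image

from face_detection import face_detection

os.makedirs("Images", exist_ok=True)                   # the function saves into Images/
photo = Image.open("group_photo.jpg").convert("RGB")   # hypothetical input image
face_crop = face_detection(photo)                      # returns the cropped PIL image
face_crop.show()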
main.py
ADDED
@@ -0,0 +1,602 @@
import shutil
import warnings
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from PIL import Image

warnings.filterwarnings("ignore")
import torch.utils.data as data
import os
import argparse
from sklearn.metrics import f1_score, confusion_matrix
from data_preprocessing.sam import SAM
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import matplotlib.pyplot as plt
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import numpy as np
import datetime
from torchsampler import ImbalancedDatasetSampler
from models.PosterV2_7cls import *


warnings.filterwarnings("ignore", category=UserWarning)

now = datetime.datetime.now()
time_str = now.strftime("[%m-%d]-[%H-%M]-")
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using device: {device}")

parser = argparse.ArgumentParser()
parser.add_argument("--data", type=str, default=r"raf-db/DATASET")
parser.add_argument(
    "--data_type",
    default="RAF-DB",
    choices=["RAF-DB", "AffectNet-7", "CAER-S"],
    type=str,
    help="dataset option",
)
parser.add_argument(
    "--checkpoint_path", type=str, default="./checkpoint/" + time_str + "model.pth"
)
parser.add_argument(
    "--best_checkpoint_path",
    type=str,
    default="./checkpoint/" + time_str + "model_best.pth",
)
parser.add_argument(
    "-j",
    "--workers",
    default=4,
    type=int,
    metavar="N",
    help="number of data loading workers",
)
parser.add_argument(
    "--epochs", default=200, type=int, metavar="N", help="number of total epochs to run"
)
parser.add_argument(
    "--start-epoch",
    default=0,
    type=int,
    metavar="N",
    help="manual epoch number (useful on restarts)",
)
parser.add_argument("-b", "--batch-size", default=2, type=int, metavar="N")
parser.add_argument(
    "--optimizer", type=str, default="adam", help="Optimizer, adam or sgd."
)

parser.add_argument(
    "--lr", "--learning-rate", default=0.000035, type=float, metavar="LR", dest="lr"
)
parser.add_argument("--momentum", default=0.9, type=float, metavar="M")
parser.add_argument(
    "--wd", "--weight-decay", default=1e-4, type=float, metavar="W", dest="weight_decay"
)
parser.add_argument(
    "-p", "--print-freq", default=30, type=int, metavar="N", help="print frequency"
)
parser.add_argument(
    "--resume", default=None, type=str, metavar="PATH", help="path to checkpoint"
)
parser.add_argument(
    "-e", "--evaluate", default=None, type=str, help="evaluate model on test set"
)
parser.add_argument("--beta", type=float, default=0.6)
parser.add_argument("--gpu", type=str, default="0")

parser.add_argument(
    "-i", "--image", type=str, help="upload a single image to test the prediction"
)
parser.add_argument("-t", "--test", type=str, help="test model on single image")
args = parser.parse_args()


def main():
    # os.environ["CUDA_VISIBLE_DEVICES"] = device
    best_acc = 0
    # print("Training time: " + now.strftime("%m-%d %H:%M"))

    # create model
    model = pyramid_trans_expr2(img_size=224, num_classes=7)

    model = torch.nn.DataParallel(model)
    model = model.to(device)

    criterion = torch.nn.CrossEntropyLoss()

    if args.optimizer == "adamw":
        base_optimizer = torch.optim.AdamW
    elif args.optimizer == "adam":
        base_optimizer = torch.optim.Adam
    elif args.optimizer == "sgd":
        base_optimizer = torch.optim.SGD
    else:
        raise ValueError("Optimizer not supported.")

    optimizer = SAM(
        model.parameters(),
        base_optimizer,
        lr=args.lr,
        rho=0.05,
        adaptive=False,
    )
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.98)
    recorder = RecorderMeter(args.epochs)
    recorder1 = RecorderMeter1(args.epochs)

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint["epoch"]
            best_acc = checkpoint["best_acc"]
            recorder = checkpoint["recorder"]
            recorder1 = checkpoint["recorder1"]
            best_acc = best_acc.to()
            model.load_state_dict(checkpoint["state_dict"])
            optimizer.load_state_dict(checkpoint["optimizer"])
            print(
                "=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint["epoch"]
                )
            )
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, "train")

    valdir = os.path.join(args.data, "test")

    if args.evaluate is None:
        if args.data_type == "RAF-DB":
            train_dataset = datasets.ImageFolder(
                traindir,
                transforms.Compose(
                    [
                        transforms.Resize((224, 224)),
                        transforms.RandomHorizontalFlip(),
                        transforms.ToTensor(),
                        transforms.Normalize(
                            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                        ),
                        transforms.RandomErasing(scale=(0.02, 0.1)),
                    ]
                ),
            )
        else:
            train_dataset = datasets.ImageFolder(
                traindir,
                transforms.Compose(
                    [
                        transforms.Resize((224, 224)),
                        transforms.RandomHorizontalFlip(),
                        transforms.ToTensor(),
                        transforms.Normalize(
                            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                        ),
                        transforms.RandomErasing(p=1, scale=(0.05, 0.05)),
                    ]
                ),
            )

        if args.data_type == "AffectNet-7":
            train_loader = torch.utils.data.DataLoader(
                train_dataset,
                sampler=ImbalancedDatasetSampler(train_dataset),
                batch_size=args.batch_size,
                shuffle=False,
                num_workers=args.workers,
                pin_memory=True,
            )

        else:
            train_loader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=args.batch_size,
                shuffle=True,
                num_workers=args.workers,
                pin_memory=True,
            )

    test_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose(
            [
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                ),
            ]
        ),
    )

    val_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True,
    )

    if args.evaluate is not None:
        from validation import validate

        if os.path.isfile(args.evaluate):
            print("=> loading checkpoint '{}'".format(args.evaluate))
            checkpoint = torch.load(args.evaluate, map_location=device)
            best_acc = checkpoint["best_acc"]
            best_acc = best_acc.to()
            print(f"best_acc:{best_acc}")
            model.load_state_dict(checkpoint["state_dict"])
            print(
                "=> loaded checkpoint '{}' (epoch {})".format(
                    args.evaluate, checkpoint["epoch"]
                )
            )
        else:
            print("=> no checkpoint found at '{}'".format(args.evaluate))
        validate(val_loader, model, criterion, args)
        return

    if args.test is not None:
        from prediction import predict

        if os.path.isfile(args.test):
            print("=> loading checkpoint '{}'".format(args.test))
            checkpoint = torch.load(args.test, map_location=device)
            best_acc = checkpoint["best_acc"]
            best_acc = best_acc.to()
            print(f"best_acc:{best_acc}")
            model.load_state_dict(checkpoint["state_dict"])
            print(
                "=> loaded checkpoint '{}' (epoch {})".format(
                    args.test, checkpoint["epoch"]
                )
            )
        else:
            print("=> no checkpoint found at '{}'".format(args.test))
        predict(model, image_path=args.image)

        return
    matrix = None

    for epoch in range(args.start_epoch, args.epochs):
        current_learning_rate = optimizer.state_dict()["param_groups"][0]["lr"]
        print("Current learning rate: ", current_learning_rate)
        txt_name = "./log/" + time_str + "log.txt"
        with open(txt_name, "a") as f:
            f.write("Current learning rate: " + str(current_learning_rate) + "\n")

        # train for one epoch
        train_acc, train_los = train(
            train_loader, model, criterion, optimizer, epoch, args
        )

        # evaluate on validation set
        val_acc, val_los, output, target, D = validate(
            val_loader, model, criterion, args
        )

        scheduler.step()

        recorder.update(epoch, train_los, train_acc, val_los, val_acc)
        recorder1.update(output, target)

        curve_name = time_str + "cnn.png"
        recorder.plot_curve(os.path.join("./log/", curve_name))

        # remember best acc and save checkpoint
        is_best = val_acc > best_acc
        best_acc = max(val_acc, best_acc)

        print("Current best accuracy: ", best_acc.item())

        if is_best:
            matrix = D

        print("Current best matrix: ", matrix)

        txt_name = "./log/" + time_str + "log.txt"
        with open(txt_name, "a") as f:
            f.write("Current best accuracy: " + str(best_acc.item()) + "\n")

        save_checkpoint(
            {
                "epoch": epoch + 1,
                "state_dict": model.state_dict(),
                "best_acc": best_acc,
                "optimizer": optimizer.state_dict(),
                "recorder1": recorder1,
                "recorder": recorder,
            },
            is_best,
            args,
        )


def train(train_loader, model, criterion, optimizer, epoch, args):
    losses = AverageMeter("Loss", ":.4f")
    top1 = AverageMeter("Accuracy", ":6.3f")
    progress = ProgressMeter(
        len(train_loader), [losses, top1], prefix="Epoch: [{}]".format(epoch)
    )

    # switch to train mode
    model.train()

    for i, (images, target) in enumerate(train_loader):
        images = images.to(device)
        target = target.to(device)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, _ = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        # optimizer.step()
        optimizer.first_step(zero_grad=True)
        images = images.to(device)
        target = target.to(device)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, _ = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.second_step(zero_grad=True)

        # print loss and accuracy
        if i % args.print_freq == 0:
            progress.display(i)

    return top1.avg, losses.avg


def save_checkpoint(state, is_best, args):
    torch.save(state, args.checkpoint_path)
    if is_best:
        best_state = state.pop("optimizer")
        torch.save(best_state, args.best_checkpoint_path)


class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self, name, fmt=":f"):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})"
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print_txt = "\t".join(entries)
        print(print_txt)
        txt_name = "./log/" + time_str + "log.txt"
        with open(txt_name, "a") as f:
            f.write(print_txt + "\n")

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = "{:" + str(num_digits) + "d}"
        return "[" + fmt + "/" + fmt.format(num_batches) + "]"


def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].contiguous().view(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res


labels = ["A", "B", "C", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O"]


class RecorderMeter1(object):
    """Computes and stores the minimum loss value and its epoch index"""

    def __init__(self, total_epoch):
        self.reset(total_epoch)

    def reset(self, total_epoch):
        self.total_epoch = total_epoch
        self.current_epoch = 0
        self.epoch_losses = np.zeros(
            (self.total_epoch, 2), dtype=np.float32
        )  # [epoch, train/val]
        self.epoch_accuracy = np.zeros(
            (self.total_epoch, 2), dtype=np.float32
        )  # [epoch, train/val]

    def update(self, output, target):
        self.y_pred = output
        self.y_true = target

    def plot_confusion_matrix(self, cm, title="Confusion Matrix", cmap=plt.cm.binary):
        plt.imshow(cm, interpolation="nearest", cmap=cmap)
        y_true = self.y_true
        y_pred = self.y_pred

        plt.title(title)
        plt.colorbar()
        xlocations = np.array(range(len(labels)))
        plt.xticks(xlocations, labels, rotation=90)
        plt.yticks(xlocations, labels)
        plt.ylabel("True label")
        plt.xlabel("Predicted label")

        cm = confusion_matrix(y_true, y_pred)
        np.set_printoptions(precision=2)
        cm_normalized = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
        plt.figure(figsize=(12, 8), dpi=120)

        ind_array = np.arange(len(labels))
        x, y = np.meshgrid(ind_array, ind_array)
        for x_val, y_val in zip(x.flatten(), y.flatten()):
            c = cm_normalized[y_val][x_val]
            if c > 0.01:
                plt.text(
                    x_val,
                    y_val,
                    "%0.2f" % (c,),
                    color="red",
                    fontsize=7,
                    va="center",
                    ha="center",
                )
        # offset the tick
        tick_marks = np.arange(len(7))
        plt.gca().set_xticks(tick_marks, minor=True)
        plt.gca().set_yticks(tick_marks, minor=True)
        plt.gca().xaxis.set_ticks_position("none")
        plt.gca().yaxis.set_ticks_position("none")
        plt.grid(True, which="minor", linestyle="-")
        plt.gcf().subplots_adjust(bottom=0.15)

        plot_confusion_matrix(cm_normalized, title="Normalized confusion matrix")
        # show confusion matrix
        plt.savefig("./log/confusion_matrix.png", format="png")
        # fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
        print("Saved figure")
        plt.show()

    def matrix(self):
        target = self.y_true
        output = self.y_pred
        im_re_label = np.array(target)
        im_pre_label = np.array(output)
        y_ture = im_re_label.flatten()
        # im_re_label.transpose()
        y_pred = im_pre_label.flatten()
        im_pre_label.transpose()


class RecorderMeter(object):
    """Computes and stores the minimum loss value and its epoch index"""

    def __init__(self, total_epoch):
        self.reset(total_epoch)

    def reset(self, total_epoch):
        self.total_epoch = total_epoch
        self.current_epoch = 0
        self.epoch_losses = np.zeros(
            (self.total_epoch, 2), dtype=np.float32
        )  # [epoch, train/val]
        self.epoch_accuracy = np.zeros(
            (self.total_epoch, 2), dtype=np.float32
        )  # [epoch, train/val]

    def update(self, idx, train_loss, train_acc, val_loss, val_acc):
        self.epoch_losses[idx, 0] = train_loss * 30
        self.epoch_losses[idx, 1] = val_loss * 30
        self.epoch_accuracy[idx, 0] = train_acc
        self.epoch_accuracy[idx, 1] = val_acc
        self.current_epoch = idx + 1

    def plot_curve(self, save_path):
        title = "the accuracy/loss curve of train/val"
        dpi = 80
        width, height = 1800, 800
        legend_fontsize = 10
        figsize = width / float(dpi), height / float(dpi)

        fig = plt.figure(figsize=figsize)
        x_axis = np.array([i for i in range(self.total_epoch)])  # epochs
        y_axis = np.zeros(self.total_epoch)

        plt.xlim(0, self.total_epoch)
        plt.ylim(0, 100)
        interval_y = 5
        interval_x = 5
        plt.xticks(np.arange(0, self.total_epoch + interval_x, interval_x))
        plt.yticks(np.arange(0, 100 + interval_y, interval_y))
        plt.grid()
        plt.title(title, fontsize=20)
        plt.xlabel("the training epoch", fontsize=16)
        plt.ylabel("accuracy", fontsize=16)

        y_axis[:] = self.epoch_accuracy[:, 0]
        plt.plot(x_axis, y_axis, color="g", linestyle="-", label="train-accuracy", lw=2)
        plt.legend(loc=4, fontsize=legend_fontsize)

        y_axis[:] = self.epoch_accuracy[:, 1]
        plt.plot(x_axis, y_axis, color="y", linestyle="-", label="valid-accuracy", lw=2)
        plt.legend(loc=4, fontsize=legend_fontsize)

        y_axis[:] = self.epoch_losses[:, 0]
        plt.plot(x_axis, y_axis, color="g", linestyle=":", label="train-loss-x30", lw=2)
        plt.legend(loc=4, fontsize=legend_fontsize)

        y_axis[:] = self.epoch_losses[:, 1]
        plt.plot(x_axis, y_axis, color="y", linestyle=":", label="valid-loss-x30", lw=2)
        plt.legend(loc=4, fontsize=legend_fontsize)

        if save_path is not None:
            fig.savefig(save_path, dpi=dpi, bbox_inches="tight")
            print("Saved figure")
        plt.close(fig)


if __name__ == "__main__":
    main()
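The train() loop above runs every batch through the model twice because SAM (Sharpness-Aware Minimization) needs one backward pass to compute the weight perturbation and a second one at the perturbed weights. The sketch below isolates that pattern on a toy model; it is illustrative only and assumes the SAM wrapper from data_preprocessing/sam.py exposes the same first_step/second_step API that train() calls.

# Illustrative SAM two-pass update on a toy model (assumes the SAM API used in train()).
import torch
from data_preprocessing.sam import SAM  # shipped separately; not part of this upload

toy = torch.nn.Linear(10, 7)
criterion = torch.nn.CrossEntropyLoss()
optimizer = SAM(toy.parameters(), torch.optim.Adam, lr=3.5e-5, rho=0.05, adaptive=False)

x = torch.randn(4, 10)
y = torch.randint(0, 7, (4,))

# First pass: perturb the weights toward the locally sharpest point.
criterion(toy(x), y).backward()
optimizer.first_step(zero_grad=True)

# Second pass: take the actual descent step from the perturbed weights.
criterion(toy(x), y).backward()
optimizer.second_step(zero_grad=True)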
models/.DS_Store
ADDED
Binary file (8.2 kB)
models/PosterV2_7cls.py
ADDED
@@ -0,0 +1,441 @@
import os
import torch
import torch.nn as nn
from torch.nn import functional as F
from .mobilefacenet import MobileFaceNet
from .ir50 import Backbone
from .vit_model import VisionTransformer, PatchEmbed
from timm.models.layers import trunc_normal_, DropPath
from thop import profile


def load_pretrained_weights(model, checkpoint):
    import collections

    if "state_dict" in checkpoint:
        state_dict = checkpoint["state_dict"]
    else:
        state_dict = checkpoint
    model_dict = model.state_dict()
    new_state_dict = collections.OrderedDict()
    matched_layers, discarded_layers = [], []
    for k, v in state_dict.items():
        # If the pretrained state_dict was saved as nn.DataParallel,
        # keys would contain "module.", which should be ignored.
        if k.startswith("module."):
            k = k[7:]
        if k in model_dict and model_dict[k].size() == v.size():
            new_state_dict[k] = v
            matched_layers.append(k)
        else:
            discarded_layers.append(k)
    # new_state_dict.requires_grad = False
    model_dict.update(new_state_dict)

    model.load_state_dict(model_dict)
    print("load_weight", len(matched_layers))
    return model


def window_partition(x, window_size, h_w, w_w):
    """
    Args:
        x: (B, H, W, C)
        window_size: window size

    Returns:
        local window features (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.view(B, h_w, window_size, w_w, window_size, C)
    windows = (
        x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    )
    return windows


class window(nn.Module):
    def __init__(self, window_size, dim):
        super(window, self).__init__()
        self.window_size = window_size
        self.norm = nn.LayerNorm(dim)

    def forward(self, x):
        x = x.permute(0, 2, 3, 1)
        B, H, W, C = x.shape
        x = self.norm(x)
        shortcut = x
        h_w = int(torch.div(H, self.window_size).item())
        w_w = int(torch.div(W, self.window_size).item())
        x_windows = window_partition(x, self.window_size, h_w, w_w)
        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)
        return x_windows, shortcut


class WindowAttentionGlobal(nn.Module):
    """
    Global window attention based on: "Hatamizadeh et al.,
    Global Context Vision Transformers <https://arxiv.org/abs/2206.09959>"
    """

    def __init__(
        self,
        dim,
        num_heads,
        window_size,
        qkv_bias=True,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ):
        """
        Args:
            dim: feature size dimension.
            num_heads: number of attention head.
            window_size: window size.
            qkv_bias: bool argument for query, key, value learnable bias.
            qk_scale: bool argument to scaling query, key.
            attn_drop: attention dropout rate.
            proj_drop: output dropout rate.
        """

        super().__init__()
        window_size = (window_size, window_size)
        self.window_size = window_size
        self.num_heads = num_heads
        head_dim = torch.div(dim, num_heads)
        self.scale = qk_scale or head_dim**-0.5
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
        )
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))
        coords_flatten = torch.flatten(coords, 1)
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
        relative_coords[:, :, 0] += self.window_size[0] - 1
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)
        self.register_buffer("relative_position_index", relative_position_index)
        self.qkv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        trunc_normal_(self.relative_position_bias_table, std=0.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, q_global):
        # print(f'q_global.shape:{q_global.shape}')
        # print(f'x.shape:{x.shape}')
        B_, N, C = x.shape
        B = q_global.shape[0]
        head_dim = int(torch.div(C, self.num_heads).item())
        B_dim = int(torch.div(B_, B).item())
        kv = (
            self.qkv(x)
            .reshape(B_, N, 2, self.num_heads, head_dim)
            .permute(2, 0, 3, 1, 4)
        )
        k, v = kv[0], kv[1]
        q_global = q_global.repeat(1, B_dim, 1, 1, 1)
        q = q_global.reshape(B_, self.num_heads, N, head_dim)
        q = q * self.scale
        attn = q @ k.transpose(-2, -1)
        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.view(-1)
        ].view(
            self.window_size[0] * self.window_size[1],
            self.window_size[0] * self.window_size[1],
            -1,
        )
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
        attn = attn + relative_position_bias.unsqueeze(0)
        attn = self.softmax(attn)
        attn = self.attn_drop(attn)
        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


def _to_channel_last(x):
    """
    Args:
        x: (B, C, H, W)

    Returns:
        x: (B, H, W, C)
    """
    return x.permute(0, 2, 3, 1)


def _to_channel_first(x):
    return x.permute(0, 3, 1, 2)


def _to_query(x, N, num_heads, dim_head):
    B = x.shape[0]
    x = x.reshape(B, 1, N, num_heads, dim_head).permute(0, 1, 3, 2, 4)
    return x


class Mlp(nn.Module):
    """
    Multi-Layer Perceptron (MLP) block
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        drop=0.0,
    ):
        """
        Args:
            in_features: input features dimension.
            hidden_features: hidden features dimension.
            out_features: output features dimension.
            act_layer: activation function.
            drop: dropout rate.
        """

        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


def window_reverse(windows, window_size, H, W, h_w, w_w):
    """
    Args:
        windows: local window features (num_windows*B, window_size, window_size, C)
        window_size: Window size
        H: Height of image
        W: Width of image

    Returns:
        x: (B, H, W, C)
    """
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(B, h_w, w_w, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x


class feedforward(nn.Module):
    def __init__(
        self,
        dim,
        window_size,
        mlp_ratio=4.0,
        act_layer=nn.GELU,
        drop=0.0,
        drop_path=0.0,
        layer_scale=None,
    ):
        super(feedforward, self).__init__()
        if layer_scale is not None and type(layer_scale) in [int, float]:
            self.layer_scale = True
            self.gamma1 = nn.Parameter(
                layer_scale * torch.ones(dim), requires_grad=True
            )
            self.gamma2 = nn.Parameter(
                layer_scale * torch.ones(dim), requires_grad=True
            )
        else:
            self.gamma1 = 1.0
            self.gamma2 = 1.0
        self.window_size = window_size
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop,
        )
        self.norm = nn.LayerNorm(dim)
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

    def forward(self, attn_windows, shortcut):
        B, H, W, C = shortcut.shape
        h_w = int(torch.div(H, self.window_size).item())
        w_w = int(torch.div(W, self.window_size).item())
        x = window_reverse(attn_windows, self.window_size, H, W, h_w, w_w)
        x = shortcut + self.drop_path(self.gamma1 * x)
        x = x + self.drop_path(self.gamma2 * self.mlp(self.norm(x)))
        return x


class pyramid_trans_expr2(nn.Module):
    def __init__(
        self,
        img_size=224,
        num_classes=7,
        window_size=[28, 14, 7],
        num_heads=[2, 4, 8],
        dims=[64, 128, 256],
        embed_dim=768,
    ):
        super().__init__()

        self.img_size = img_size
        self.num_heads = num_heads
        self.dim_head = []
        for num_head, dim in zip(num_heads, dims):
            self.dim_head.append(int(torch.div(dim, num_head).item()))
        self.num_classes = num_classes
        self.window_size = window_size
        self.N = [win * win for win in window_size]
        self.face_landback = MobileFaceNet([112, 112], 136)

        mobilefacenet_path = os.path.join(
            os.getcwd(), "models/pretrain/mobilefacenet_model_best.pth.tar"
        )
        ir50_path = os.path.join(os.getcwd(), "models/pretrain/ir50.pth")

        print(mobilefacenet_path)
        face_landback_checkpoint = torch.load(
            mobilefacenet_path,
            map_location=lambda storage, loc: storage,
        )
        self.face_landback.load_state_dict(face_landback_checkpoint["state_dict"])

        for param in self.face_landback.parameters():
            param.requires_grad = False

        self.VIT = VisionTransformer(depth=2, embed_dim=embed_dim)

        self.ir_back = Backbone(50, 0.0, "ir")
        ir_checkpoint = torch.load(
            ir50_path,
            map_location=lambda storage, loc: storage,
        )

        self.ir_back = load_pretrained_weights(self.ir_back, ir_checkpoint)

        self.attn1 = WindowAttentionGlobal(
            dim=dims[0], num_heads=num_heads[0], window_size=window_size[0]
        )
        self.attn2 = WindowAttentionGlobal(
            dim=dims[1], num_heads=num_heads[1], window_size=window_size[1]
        )
        self.attn3 = WindowAttentionGlobal(
            dim=dims[2], num_heads=num_heads[2], window_size=window_size[2]
        )
        self.window1 = window(window_size=window_size[0], dim=dims[0])
        self.window2 = window(window_size=window_size[1], dim=dims[1])
        self.window3 = window(window_size=window_size[2], dim=dims[2])
        self.conv1 = nn.Conv2d(
            in_channels=dims[0],
            out_channels=dims[0],
            kernel_size=3,
            stride=2,
            padding=1,
        )
        self.conv2 = nn.Conv2d(
            in_channels=dims[1],
            out_channels=dims[1],
            kernel_size=3,
            stride=2,
            padding=1,
        )
        self.conv3 = nn.Conv2d(
            in_channels=dims[2],
            out_channels=dims[2],
            kernel_size=3,
            stride=2,
            padding=1,
        )

        dpr = [x.item() for x in torch.linspace(0, 0.5, 5)]
        self.ffn1 = feedforward(
            dim=dims[0], window_size=window_size[0], layer_scale=1e-5, drop_path=dpr[0]
        )
        self.ffn2 = feedforward(
            dim=dims[1], window_size=window_size[1], layer_scale=1e-5, drop_path=dpr[1]
        )
        self.ffn3 = feedforward(
            dim=dims[2], window_size=window_size[2], layer_scale=1e-5, drop_path=dpr[2]
        )

        self.last_face_conv = nn.Conv2d(
            in_channels=512, out_channels=256, kernel_size=3, padding=1
        )

        self.embed_q = nn.Sequential(
            nn.Conv2d(dims[0], 768, kernel_size=3, stride=2, padding=1),
            nn.Conv2d(768, 768, kernel_size=3, stride=2, padding=1),
        )
        self.embed_k = nn.Sequential(
            nn.Conv2d(dims[1], 768, kernel_size=3, stride=2, padding=1)
        )
        self.embed_v = PatchEmbed(img_size=14, patch_size=14, in_c=256, embed_dim=768)

    def forward(self, x):
        x_face = F.interpolate(x, size=112)
        x_face1, x_face2, x_face3 = self.face_landback(x_face)
        x_face3 = self.last_face_conv(x_face3)
        x_face1, x_face2, x_face3 = (
            _to_channel_last(x_face1),
            _to_channel_last(x_face2),
            _to_channel_last(x_face3),
        )

        q1, q2, q3 = (
            _to_query(x_face1, self.N[0], self.num_heads[0], self.dim_head[0]),
            _to_query(x_face2, self.N[1], self.num_heads[1], self.dim_head[1]),
            _to_query(x_face3, self.N[2], self.num_heads[2], self.dim_head[2]),
        )

        x_ir1, x_ir2, x_ir3 = self.ir_back(x)

        x_ir1, x_ir2, x_ir3 = self.conv1(x_ir1), self.conv2(x_ir2), self.conv3(x_ir3)
        x_window1, shortcut1 = self.window1(x_ir1)
        x_window2, shortcut2 = self.window2(x_ir2)
        x_window3, shortcut3 = self.window3(x_ir3)

        o1, o2, o3 = (
            self.attn1(x_window1, q1),
            self.attn2(x_window2, q2),
            self.attn3(x_window3, q3),
        )

        o1, o2, o3 = (
            self.ffn1(o1, shortcut1),
            self.ffn2(o2, shortcut2),
            self.ffn3(o3, shortcut3),
        )

        o1, o2, o3 = _to_channel_first(o1), _to_channel_first(o2), _to_channel_first(o3)

        o1, o2, o3 = (
            self.embed_q(o1).flatten(2).transpose(1, 2),
            self.embed_k(o2).flatten(2).transpose(1, 2),
            self.embed_v(o3),
        )

        o = torch.cat([o1, o2, o3], dim=1)

        out = self.VIT(o)
        return out


def compute_param_flop():
    model = pyramid_trans_expr2()
    img = torch.rand(size=(1, 3, 224, 224))
    flops, params = profile(model, inputs=(img,))
    print(f"flops:{flops/1000**3}G,params:{params/1000**2}M")
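compute_param_flop() above already shows the expected input shape; the sketch below runs the same thing as a plain forward pass instead of a FLOP count. It is illustrative, not part of the commit: it assumes the script is launched from the repository root so the models/pretrain/ checkpoints loaded in __init__ resolve, and that the VisionTransformer head emits one logit per expression class.

# Illustrative forward pass through the 7-class POSTER V2 model.
# Run from the repo root so models/pretrain/ir50.pth and the MobileFaceNet
# checkpoint are found when pyramid_trans_expr2 is constructed.
import torch
from models.PosterV2_7cls import pyramid_trans_expr2

model = pyramid_trans_expr2(img_size=224, num_classes=7).eval()

with torch.no_grad():
    dummy = torch.rand(1, 3, 224, 224)   # same input shape as compute_param_flop()
    logits = model(dummy)

print(logits.shape)  # expected: torch.Size([1, 7]), one score per expression class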
models/PosterV2_8cls.py
ADDED
@@ -0,0 +1,317 @@
import torch
import torch.nn as nn
from torch.nn import functional as F
from .mobilefacenet import MobileFaceNet
from .ir50 import Backbone
from .vit_model_8 import VisionTransformer, PatchEmbed
from timm.models.layers import trunc_normal_, DropPath
from thop import profile

def load_pretrained_weights(model, checkpoint):
    import collections
    if 'state_dict' in checkpoint:
        state_dict = checkpoint['state_dict']
    else:
        state_dict = checkpoint
    model_dict = model.state_dict()
    new_state_dict = collections.OrderedDict()
    matched_layers, discarded_layers = [], []
    for k, v in state_dict.items():
        # If the pretrained state_dict was saved as nn.DataParallel,
        # keys would contain "module.", which should be ignored.
        if k.startswith('module.'):
            k = k[7:]
        if k in model_dict and model_dict[k].size() == v.size():
            new_state_dict[k] = v
            matched_layers.append(k)
        else:
            discarded_layers.append(k)
    # new_state_dict.requires_grad = False
    model_dict.update(new_state_dict)

    model.load_state_dict(model_dict)
    print('load_weight', len(matched_layers))
    return model

def window_partition(x, window_size, h_w, w_w):
    """
    Args:
        x: (B, H, W, C)
        window_size: window size

    Returns:
        local window features (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.view(B, h_w, window_size, w_w, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows

class window(nn.Module):
    def __init__(self, window_size, dim):
        super(window, self).__init__()
        self.window_size = window_size
        self.norm = nn.LayerNorm(dim)
    def forward(self, x):
        x = x.permute(0, 2, 3, 1)
        B, H, W, C = x.shape
        x = self.norm(x)
        shortcut = x
        h_w = int(torch.div(H, self.window_size).item())
        w_w = int(torch.div(W, self.window_size).item())
        x_windows = window_partition(x, self.window_size, h_w, w_w)
        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)
        return x_windows, shortcut

class WindowAttentionGlobal(nn.Module):
    """
    Global window attention based on: "Hatamizadeh et al.,
    Global Context Vision Transformers <https://arxiv.org/abs/2206.09959>"
    """

    def __init__(self,
                 dim,
                 num_heads,
                 window_size,
                 qkv_bias=True,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.,
                 ):
        """
        Args:
            dim: feature size dimension.
            num_heads: number of attention head.
            window_size: window size.
            qkv_bias: bool argument for query, key, value learnable bias.
            qk_scale: bool argument to scaling query, key.
            attn_drop: attention dropout rate.
            proj_drop: output dropout rate.
        """

        super().__init__()
        window_size = (window_size, window_size)
        self.window_size = window_size
        self.num_heads = num_heads
        head_dim = torch.div(dim, num_heads)
        self.scale = qk_scale or head_dim ** -0.5
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))
        coords_flatten = torch.flatten(coords, 1)
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
        relative_coords[:, :, 0] += self.window_size[0] - 1
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)
        self.register_buffer("relative_position_index", relative_position_index)
        self.qkv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, q_global):
        # print(f'q_global.shape:{q_global.shape}')
        # print(f'x.shape:{x.shape}')
        B_, N, C = x.shape
        B = q_global.shape[0]
        head_dim = int(torch.div(C, self.num_heads).item())
        B_dim = int(torch.div(B_, B).item())
        kv = self.qkv(x).reshape(B_, N, 2, self.num_heads, head_dim).permute(2, 0, 3, 1, 4)
        k, v = kv[0], kv[1]
        q_global = q_global.repeat(1, B_dim, 1, 1, 1)
        q = q_global.reshape(B_, self.num_heads, N, head_dim)
        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))
        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
        attn = attn + relative_position_bias.unsqueeze(0)
        attn = self.softmax(attn)
        attn = self.attn_drop(attn)
        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

def _to_channel_last(x):
    """
    Args:
        x: (B, C, H, W)

    Returns:
        x: (B, H, W, C)
    """
    return x.permute(0, 2, 3, 1)

def _to_channel_first(x):
    return x.permute(0, 3, 1, 2)

def _to_query(x, N, num_heads, dim_head):
    B = x.shape[0]
    x = x.reshape(B, 1, N, num_heads, dim_head).permute(0, 1, 3, 2, 4)
    return x

class Mlp(nn.Module):
    """
    Multi-Layer Perceptron (MLP) block
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        """
        Args:
            in_features: input features dimension.
            hidden_features: hidden features dimension.
            out_features: output features dimension.
            act_layer: activation function.
            drop: dropout rate.
        """

        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x

def window_reverse(windows, window_size, H, W, h_w, w_w):
    """
    Args:
        windows: local window features (num_windows*B, window_size, window_size, C)
        window_size: Window size
        H: Height of image
        W: Width of image

    Returns:
        x: (B, H, W, C)
    """
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(B, h_w, w_w, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x

class feedforward(nn.Module):
    def __init__(self, dim, window_size, mlp_ratio=4., act_layer=nn.GELU, drop=0., drop_path=0., layer_scale=None):
        super(feedforward, self).__init__()
        if layer_scale is not None and type(layer_scale) in [int, float]:
            self.layer_scale = True
            self.gamma1 = nn.Parameter(layer_scale * torch.ones(dim), requires_grad=True)
            self.gamma2 = nn.Parameter(layer_scale * torch.ones(dim), requires_grad=True)
        else:
            self.gamma1 = 1.0
            self.gamma2 = 1.0
        self.window_size = window_size
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
        self.norm = nn.LayerNorm(dim)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
    def forward(self, attn_windows, shortcut):
        B, H, W, C = shortcut.shape
        h_w = int(torch.div(H, self.window_size).item())
        w_w = int(torch.div(W, self.window_size).item())
        x = window_reverse(attn_windows, self.window_size, H, W, h_w, w_w)
        x = shortcut + self.drop_path(self.gamma1 * x)
        x = x + self.drop_path(self.gamma2 * self.mlp(self.norm(x)))
        return x

class pyramid_trans_expr2(nn.Module):
    def __init__(self, img_size=224, num_classes=8, window_size=[28,14,7], num_heads=[2, 4, 8], dims=[64, 128, 256], embed_dim=768):
        super().__init__()

        self.img_size = img_size
        self.num_heads = num_heads
        self.dim_head = []
        for num_head, dim in zip(num_heads, dims):
            self.dim_head.append(int(torch.div(dim, num_head).item()))
        self.num_classes = num_classes
        self.window_size = window_size
        self.N = [win * win for win in window_size]
        self.face_landback = MobileFaceNet([112, 112], 136)
        face_landback_checkpoint = torch.load(r'./pretrain/mobilefacenet_model_best.pth.tar',
                                              map_location=lambda storage, loc: storage)
        self.face_landback.load_state_dict(face_landback_checkpoint['state_dict'])

        for param in self.face_landback.parameters():
            param.requires_grad = False

        self.VIT = VisionTransformer(depth=2, embed_dim=embed_dim, num_classes=num_classes)

        self.ir_back = Backbone(50, 0.0, 'ir')
        ir_checkpoint = torch.load(r'./pretrain/ir50.pth', map_location=lambda storage, loc: storage)

        self.ir_back = load_pretrained_weights(self.ir_back, ir_checkpoint)

        self.attn1 = WindowAttentionGlobal(dim=dims[0], num_heads=num_heads[0], window_size=window_size[0])
        self.attn2 = WindowAttentionGlobal(dim=dims[1], num_heads=num_heads[1], window_size=window_size[1])
        self.attn3 = WindowAttentionGlobal(dim=dims[2], num_heads=num_heads[2], window_size=window_size[2])
        self.window1 = window(window_size=window_size[0], dim=dims[0])
        self.window2 = window(window_size=window_size[1], dim=dims[1])
        self.window3 = window(window_size=window_size[2], dim=dims[2])
        self.conv1 = nn.Conv2d(in_channels=dims[0], out_channels=dims[0], kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(in_channels=dims[1], out_channels=dims[1], kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(in_channels=dims[2], out_channels=dims[2], kernel_size=3, stride=2, padding=1)

        dpr = [x.item() for x in torch.linspace(0, 0.5, 5)]
        self.ffn1 = feedforward(dim=dims[0], window_size=window_size[0], layer_scale=1e-5, drop_path=dpr[0])
        self.ffn2 = feedforward(dim=dims[1], window_size=window_size[1], layer_scale=1e-5, drop_path=dpr[1])
        self.ffn3 = feedforward(dim=dims[2], window_size=window_size[2], layer_scale=1e-5, drop_path=dpr[2])

        self.last_face_conv = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=3, padding=1)

        self.embed_q = nn.Sequential(nn.Conv2d(dims[0], 768, kernel_size=3, stride=2, padding=1),
|
280 |
+
nn.Conv2d(768, 768, kernel_size=3, stride=2, padding=1))
|
281 |
+
self.embed_k = nn.Sequential(nn.Conv2d(dims[1], 768, kernel_size=3, stride=2, padding=1))
|
282 |
+
self.embed_v = PatchEmbed(img_size=14, patch_size=14, in_c=256, embed_dim=768)
|
283 |
+
|
284 |
+
def forward(self, x):
|
285 |
+
x_face = F.interpolate(x, size=112)
|
286 |
+
x_face1 , x_face2, x_face3 = self.face_landback(x_face)
|
287 |
+
x_face3 = self.last_face_conv(x_face3)
|
288 |
+
x_face1, x_face2, x_face3 = _to_channel_last(x_face1), _to_channel_last(x_face2), _to_channel_last(x_face3)
|
289 |
+
|
290 |
+
q1, q2, q3 = _to_query(x_face1, self.N[0], self.num_heads[0], self.dim_head[0]), \
|
291 |
+
_to_query(x_face2, self.N[1], self.num_heads[1], self.dim_head[1]), \
|
292 |
+
_to_query(x_face3, self.N[2], self.num_heads[2], self.dim_head[2])
|
293 |
+
|
294 |
+
x_ir1, x_ir2, x_ir3 = self.ir_back(x)
|
295 |
+
x_ir1, x_ir2, x_ir3 = self.conv1(x_ir1), self.conv2(x_ir2), self.conv3(x_ir3)
|
296 |
+
x_window1, shortcut1 = self.window1(x_ir1)
|
297 |
+
x_window2, shortcut2 = self.window2(x_ir2)
|
298 |
+
x_window3, shortcut3 = self.window3(x_ir3)
|
299 |
+
|
300 |
+
o1, o2, o3 = self.attn1(x_window1, q1), self.attn2(x_window2, q2), self.attn3(x_window3, q3)
|
301 |
+
|
302 |
+
o1, o2, o3 = self.ffn1(o1, shortcut1), self.ffn2(o2, shortcut2), self.ffn3(o3, shortcut3)
|
303 |
+
|
304 |
+
o1, o2, o3 = _to_channel_first(o1), _to_channel_first(o2), _to_channel_first(o3)
|
305 |
+
|
306 |
+
o1, o2, o3 = self.embed_q(o1).flatten(2).transpose(1, 2), self.embed_k(o2).flatten(2).transpose(1, 2), self.embed_v(o3)
|
307 |
+
|
308 |
+
o = torch.cat([o1, o2, o3], dim=1)
|
309 |
+
|
310 |
+
out = self.VIT(o)
|
311 |
+
return out
|
312 |
+
|
313 |
+
def compute_param_flop():
|
314 |
+
model = pyramid_trans_expr2()
|
315 |
+
img = torch.rand(size=(1,3,224,224))
|
316 |
+
flops, params = profile(model, inputs=(img,))
|
317 |
+
print(f'flops:{flops/1000**3}G,params:{params/1000**2}M')
|
models/__pycache__/PosterV2_7cls.cpython-310.pyc
ADDED
Binary file (12.1 kB). View file
|
|
models/__pycache__/PosterV2_7cls.cpython-311.pyc
ADDED
Binary file (24.9 kB). View file
|
|
models/__pycache__/ir50.cpython-310.pyc
ADDED
Binary file (6.01 kB). View file
|
|
models/__pycache__/ir50.cpython-311.pyc
ADDED
Binary file (12 kB). View file
|
|
models/__pycache__/mobilefacenet.cpython-310.pyc
ADDED
Binary file (6.5 kB). View file
|
|
models/__pycache__/mobilefacenet.cpython-311.pyc
ADDED
Binary file (12.6 kB). View file
|
|
models/__pycache__/vit_model.cpython-310.pyc
ADDED
Binary file (19.6 kB). View file
|
|
models/__pycache__/vit_model.cpython-311.pyc
ADDED
Binary file (34.9 kB). View file
|
|
models/ir50.py
ADDED
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torch.nn import Linear, Conv2d, BatchNorm1d, BatchNorm2d, PReLU, ReLU, Sigmoid, Dropout2d, Dropout, AvgPool2d, \
|
2 |
+
MaxPool2d, AdaptiveAvgPool2d, Sequential, Module, Parameter
|
3 |
+
import torch.nn.functional as F
|
4 |
+
import torch
|
5 |
+
from collections import namedtuple
|
6 |
+
import math
|
7 |
+
import pdb
|
8 |
+
|
9 |
+
|
10 |
+
################################## Original Arcface Model #############################################################
|
11 |
+
|
12 |
+
class Flatten(Module):
|
13 |
+
def forward(self, input):
|
14 |
+
return input.view(input.size(0), -1)
|
15 |
+
|
16 |
+
|
17 |
+
def l2_norm(input, axis=1):
|
18 |
+
norm = torch.norm(input, 2, axis, True)
|
19 |
+
output = torch.div(input, norm)
|
20 |
+
return output
|
21 |
+
|
22 |
+
|
23 |
+
class SEModule(Module):
|
24 |
+
def __init__(self, channels, reduction):
|
25 |
+
super(SEModule, self).__init__()
|
26 |
+
self.avg_pool = AdaptiveAvgPool2d(1)
|
27 |
+
self.fc1 = Conv2d(
|
28 |
+
channels, channels // reduction, kernel_size=1, padding=0, bias=False)
|
29 |
+
self.relu = ReLU(inplace=True)
|
30 |
+
self.fc2 = Conv2d(
|
31 |
+
channels // reduction, channels, kernel_size=1, padding=0, bias=False)
|
32 |
+
self.sigmoid = Sigmoid()
|
33 |
+
|
34 |
+
def forward(self, x):
|
35 |
+
module_input = x
|
36 |
+
x = self.avg_pool(x)
|
37 |
+
x = self.fc1(x)
|
38 |
+
x = self.relu(x)
|
39 |
+
x = self.fc2(x)
|
40 |
+
x = self.sigmoid(x)
|
41 |
+
return module_input * x
|
42 |
+
|
43 |
+
|
44 |
+
# i = 0
|
45 |
+
|
46 |
+
class bottleneck_IR(Module):
|
47 |
+
def __init__(self, in_channel, depth, stride):
|
48 |
+
super(bottleneck_IR, self).__init__()
|
49 |
+
if in_channel == depth:
|
50 |
+
self.shortcut_layer = MaxPool2d(1, stride)
|
51 |
+
else:
|
52 |
+
self.shortcut_layer = Sequential(
|
53 |
+
Conv2d(in_channel, depth, (1, 1), stride, bias=False), BatchNorm2d(depth))
|
54 |
+
self.res_layer = Sequential(
|
55 |
+
BatchNorm2d(in_channel),
|
56 |
+
Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), PReLU(depth),
|
57 |
+
Conv2d(depth, depth, (3, 3), stride, 1, bias=False), BatchNorm2d(depth))
|
58 |
+
i = 0
|
59 |
+
|
60 |
+
def forward(self, x):
|
61 |
+
shortcut = self.shortcut_layer(x)
|
62 |
+
# print(shortcut.shape)
|
63 |
+
# print('---s---')
|
64 |
+
res = self.res_layer(x)
|
65 |
+
# print(res.shape)
|
66 |
+
# print('---r---')
|
67 |
+
# i = i + 50
|
68 |
+
# print(i)
|
69 |
+
# print('50')
|
70 |
+
return res + shortcut
|
71 |
+
|
72 |
+
|
73 |
+
class bottleneck_IR_SE(Module):
|
74 |
+
def __init__(self, in_channel, depth, stride):
|
75 |
+
super(bottleneck_IR_SE, self).__init__()
|
76 |
+
if in_channel == depth:
|
77 |
+
self.shortcut_layer = MaxPool2d(1, stride)
|
78 |
+
else:
|
79 |
+
self.shortcut_layer = Sequential(
|
80 |
+
Conv2d(in_channel, depth, (1, 1), stride, bias=False),
|
81 |
+
BatchNorm2d(depth))
|
82 |
+
self.res_layer = Sequential(
|
83 |
+
BatchNorm2d(in_channel),
|
84 |
+
Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False),
|
85 |
+
PReLU(depth),
|
86 |
+
Conv2d(depth, depth, (3, 3), stride, 1, bias=False),
|
87 |
+
BatchNorm2d(depth),
|
88 |
+
SEModule(depth, 16)
|
89 |
+
)
|
90 |
+
|
91 |
+
def forward(self, x):
|
92 |
+
shortcut = self.shortcut_layer(x)
|
93 |
+
res = self.res_layer(x)
|
94 |
+
return res + shortcut
|
95 |
+
|
96 |
+
|
97 |
+
class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])):
|
98 |
+
'''A named tuple describing a ResNet block.'''
|
99 |
+
# print('50')
|
100 |
+
|
101 |
+
|
102 |
+
def get_block(in_channel, depth, num_units, stride=2):
|
103 |
+
return [Bottleneck(in_channel, depth, stride)] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)]
|
104 |
+
|
105 |
+
|
106 |
+
def get_blocks(num_layers):
|
107 |
+
if num_layers == 50:
|
108 |
+
blocks1 = [
|
109 |
+
get_block(in_channel=64, depth=64, num_units=3),
|
110 |
+
# get_block(in_channel=64, depth=128, num_units=4),
|
111 |
+
# get_block(in_channel=128, depth=256, num_units=14),
|
112 |
+
# get_block(in_channel=256, depth=512, num_units=3)
|
113 |
+
]
|
114 |
+
blocks2 = [
|
115 |
+
# get_block(in_channel=64, depth=64, num_units=3),
|
116 |
+
get_block(in_channel=64, depth=128, num_units=4),
|
117 |
+
# get_block(in_channel=128, depth=256, num_units=14),
|
118 |
+
# get_block(in_channel=256, depth=512, num_units=3)
|
119 |
+
]
|
120 |
+
blocks3 = [
|
121 |
+
# get_block(in_channel=64, depth=64, num_units=3),
|
122 |
+
# get_block(in_channel=64, depth=128, num_units=4),
|
123 |
+
get_block(in_channel=128, depth=256, num_units=14),
|
124 |
+
# get_block(in_channel=256, depth=512, num_units=3)
|
125 |
+
]
|
126 |
+
|
127 |
+
elif num_layers == 100:
|
128 |
+
blocks = [
|
129 |
+
get_block(in_channel=64, depth=64, num_units=3),
|
130 |
+
get_block(in_channel=64, depth=128, num_units=13),
|
131 |
+
get_block(in_channel=128, depth=256, num_units=30),
|
132 |
+
get_block(in_channel=256, depth=512, num_units=3)
|
133 |
+
]
|
134 |
+
elif num_layers == 152:
|
135 |
+
blocks = [
|
136 |
+
get_block(in_channel=64, depth=64, num_units=3),
|
137 |
+
get_block(in_channel=64, depth=128, num_units=8),
|
138 |
+
get_block(in_channel=128, depth=256, num_units=36),
|
139 |
+
get_block(in_channel=256, depth=512, num_units=3)
|
140 |
+
]
|
141 |
+
return blocks1, blocks2, blocks3
|
142 |
+
|
143 |
+
|
144 |
+
class Backbone(Module):
|
145 |
+
def __init__(self, num_layers, drop_ratio, mode='ir'):
|
146 |
+
super(Backbone, self).__init__()
|
147 |
+
# assert num_layers in [50, 100, 152], 'num_layers should be 50,100, or 152'
|
148 |
+
assert mode in ['ir', 'ir_se'], 'mode should be ir or ir_se'
|
149 |
+
blocks1, blocks2, blocks3 = get_blocks(num_layers)
|
150 |
+
# blocks2 = get_blocks(num_layers)
|
151 |
+
if mode == 'ir':
|
152 |
+
unit_module = bottleneck_IR
|
153 |
+
elif mode == 'ir_se':
|
154 |
+
unit_module = bottleneck_IR_SE
|
155 |
+
self.input_layer = Sequential(Conv2d(3, 64, (3, 3), 1, 1, bias=False),
|
156 |
+
BatchNorm2d(64),
|
157 |
+
PReLU(64))
|
158 |
+
self.output_layer = Sequential(BatchNorm2d(512),
|
159 |
+
Dropout(drop_ratio),
|
160 |
+
Flatten(),
|
161 |
+
Linear(512 * 7 * 7, 512),
|
162 |
+
BatchNorm1d(512))
|
163 |
+
modules1 = []
|
164 |
+
for block in blocks1:
|
165 |
+
for bottleneck in block:
|
166 |
+
modules1.append(
|
167 |
+
unit_module(bottleneck.in_channel,
|
168 |
+
bottleneck.depth,
|
169 |
+
bottleneck.stride))
|
170 |
+
|
171 |
+
modules2 = []
|
172 |
+
for block in blocks2:
|
173 |
+
for bottleneck in block:
|
174 |
+
modules2.append(
|
175 |
+
unit_module(bottleneck.in_channel,
|
176 |
+
bottleneck.depth,
|
177 |
+
bottleneck.stride))
|
178 |
+
|
179 |
+
modules3 = []
|
180 |
+
for block in blocks3:
|
181 |
+
for bottleneck in block:
|
182 |
+
modules3.append(
|
183 |
+
unit_module(bottleneck.in_channel,
|
184 |
+
bottleneck.depth,
|
185 |
+
bottleneck.stride))
|
186 |
+
# modules4 = []
|
187 |
+
# for block in blocks4:
|
188 |
+
# for bottleneck in block:
|
189 |
+
# modules4.append(
|
190 |
+
# unit_module(bottleneck.in_channel,
|
191 |
+
# bottleneck.depth,
|
192 |
+
# bottleneck.stride))
|
193 |
+
self.body1 = Sequential(*modules1)
|
194 |
+
self.body2 = Sequential(*modules2)
|
195 |
+
self.body3 = Sequential(*modules3)
|
196 |
+
# self.body4 = Sequential(*modules4)
|
197 |
+
|
198 |
+
def forward(self, x):
|
199 |
+
x = F.interpolate(x, size=112)
|
200 |
+
x = self.input_layer(x)
|
201 |
+
x1 = self.body1(x)
|
202 |
+
x2 = self.body2(x1)
|
203 |
+
x3 = self.body3(x2)
|
204 |
+
|
205 |
+
# x = self.output_layer(x)
|
206 |
+
# return l2_norm(x)
|
207 |
+
|
208 |
+
return x1, x2, x3
|
209 |
+
|
210 |
+
def load_pretrained_weights(model, checkpoint):
|
211 |
+
import collections
|
212 |
+
if 'state_dict' in checkpoint:
|
213 |
+
state_dict = checkpoint['state_dict']
|
214 |
+
else:
|
215 |
+
state_dict = checkpoint
|
216 |
+
model_dict = model.state_dict()
|
217 |
+
new_state_dict = collections.OrderedDict()
|
218 |
+
matched_layers, discarded_layers = [], []
|
219 |
+
for i, (k, v) in enumerate(state_dict.items()):
|
220 |
+
# print(i)
|
221 |
+
|
222 |
+
# If the pretrained state_dict was saved as nn.DataParallel,
|
223 |
+
# keys would contain "module.", which should be ignored.
|
224 |
+
if k.startswith('module.'):
|
225 |
+
k = k[7:]
|
226 |
+
if k in model_dict and model_dict[k].size() == v.size():
|
227 |
+
|
228 |
+
new_state_dict[k] = v
|
229 |
+
matched_layers.append(k)
|
230 |
+
else:
|
231 |
+
# print(k)
|
232 |
+
discarded_layers.append(k)
|
233 |
+
# new_state_dict.requires_grad = False
|
234 |
+
model_dict.update(new_state_dict)
|
235 |
+
model.load_state_dict(model_dict)
|
236 |
+
print('load_weight', len(matched_layers))
|
237 |
+
return model
|
238 |
+
|
239 |
+
# model = Backbone(50, 0.0, 'ir')
|
240 |
+
# ir_checkpoint = torch.load(r'C:\Users\86187\Desktop\project\mixfacial\models\pretrain\new_ir50.pth')
|
241 |
+
# print('hello')
|
242 |
+
# i1, i2, i3 = 0, 0, 0
|
243 |
+
# ir_checkpoint = torch.load(r'C:\Users\86187\Desktop\project\mixfacial\models\pretrain\ir50.pth', map_location=lambda storage, loc: storage)
|
244 |
+
# for (k1, v1), (k2, v2) in zip(model.state_dict().items(), ir_checkpoint.items()):
|
245 |
+
# print(f'k1:{k1}, k2:{k2}')
|
246 |
+
# model.state_dict()[k1] = v2
|
247 |
+
|
248 |
+
# torch.save(model.state_dict(), r'C:\Users\86187\Desktop\project\mixfacial\models\pretrain\new_ir50.pth')
|
249 |
+
# print(k)
|
250 |
+
# if k.startswith('body1'):
|
251 |
+
# i1+=1
|
252 |
+
# if k.startswith('body2'):
|
253 |
+
# i2+=1
|
254 |
+
# if k.startswith('body3'):
|
255 |
+
# i3+=1
|
256 |
+
# print(f'i1:{i1}, i2:{i2}, i3:{i3}')
|
257 |
+
|
258 |
+
# print('-'*100)
|
259 |
+
# ir_checkpoint = torch.load(r'C:\Users\86187\Desktop\project\mixfacial\models\pretrain\ir50.pth', map_location=lambda storage, loc: storage)
|
260 |
+
# le = 0
|
261 |
+
# for k, v in ir_checkpoint.items():
|
262 |
+
# # print(k)
|
263 |
+
# if k.startswith('body'):
|
264 |
+
# if le < i1:
|
265 |
+
# le += 1
|
266 |
+
# key = k.split('.')[0] + str(1) + k.split('.')[1:]
|
267 |
+
# print(key)
|
268 |
+
# # ir_checkpoint = ir_checkpoint["model"]
|
269 |
+
# model = load_pretrained_weights(model, ir_checkpoint)
|
270 |
+
# img = torch.rand(size=(2,3,224,224))
|
271 |
+
# out1, out2, out3 = model(img)
|
272 |
+
# print(out1.shape, out2.shape, out3.shape)
|
models/matrix.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import itertools
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
import numpy as np
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
|
6 |
+
plt.rcParams['font.sans-serif'] = ['SimHei']
|
7 |
+
plt.rcParams['axes.unicode_minus'] = False
|
8 |
+
|
9 |
+
|
10 |
+
# -*- coding:utf-8 -*-
|
11 |
+
|
12 |
+
def plot_confusion_matrix(cm, classes,
|
13 |
+
normalize=False,
|
14 |
+
title='Confusion matrix',
|
15 |
+
cmap=plt.cm.Blues):
|
16 |
+
"""
|
17 |
+
This function prints and plots the confusion matrix.
|
18 |
+
Normalization can be applied by setting `normalize=True`.
|
19 |
+
"""
|
20 |
+
if normalize:
|
21 |
+
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
|
22 |
+
print("Normalized confusion matrix")
|
23 |
+
else:
|
24 |
+
print('Confusion matrix, without normalization')
|
25 |
+
|
26 |
+
print(cm)
|
27 |
+
|
28 |
+
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
|
29 |
+
plt.title(title)
|
30 |
+
plt.colorbar()
|
31 |
+
tick_marks = np.arange(len(classes))
|
32 |
+
plt.xticks(tick_marks, classes, fontsize=16)
|
33 |
+
plt.yticks(tick_marks, classes, fontsize=16)
|
34 |
+
|
35 |
+
fmt = '.2f' if normalize else 'd'
|
36 |
+
thresh = cm.max() / 2.
|
37 |
+
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
|
38 |
+
plt.text(j, i, format(cm[i, j], fmt),
|
39 |
+
horizontalalignment="center",
|
40 |
+
color="white" if cm[i, j] > thresh else "black")
|
41 |
+
|
42 |
+
plt.tight_layout()
|
43 |
+
plt.ylabel('True Label',fontsize=12)
|
44 |
+
plt.xlabel('Predicted Label',fontsize=12)
|
45 |
+
plt.show()
|
46 |
+
|
47 |
+
|
48 |
+
|
49 |
+
cnf_matrix = np.array([[ 299 , 6 , 5 , 3 , 1 , 4, 11],
|
50 |
+
[ 9, 51 , 0, 2 , 8, 2 , 2],
|
51 |
+
[ 2 , 1 ,120 , 6 ,13 , 9 , 9],
|
52 |
+
[ 5 , 1 , 7 ,1148 , 2 , 4 , 18],
|
53 |
+
[ 0 , 0 , 9 , 4 ,442 , 1 , 22],
|
54 |
+
[ 2 ,0 , 7 , 3 , 0 ,145 , 5],
|
55 |
+
[ 10 ,0, 6 ,11, 29 , 0, 624]])
|
56 |
+
|
57 |
+
class_names = ["SU", 'FE', 'AN', 'HA', 'SA', 'DI', 'NE']
|
58 |
+
|
59 |
+
|
60 |
+
plt.figure(dpi=200)
|
61 |
+
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
|
62 |
+
title=None)
|
models/mobilefacenet.py
ADDED
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torch.nn import Linear, Conv2d, BatchNorm1d, BatchNorm2d, PReLU, ReLU, Sigmoid, Dropout2d, Dropout, AvgPool2d, \
|
2 |
+
MaxPool2d, AdaptiveAvgPool2d, Sequential, Module, Parameter
|
3 |
+
import torch.nn.functional as F
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
from collections import namedtuple
|
7 |
+
import math
|
8 |
+
import pdb
|
9 |
+
|
10 |
+
|
11 |
+
################################## Original Arcface Model #############################################################
|
12 |
+
######## ccc#######################
|
13 |
+
class Flatten(Module):
|
14 |
+
def forward(self, input):
|
15 |
+
return input.view(input.size(0), -1)
|
16 |
+
|
17 |
+
|
18 |
+
################################## MobileFaceNet #############################################################
|
19 |
+
|
20 |
+
class Conv_block(Module):
|
21 |
+
def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1):
|
22 |
+
super(Conv_block, self).__init__()
|
23 |
+
self.conv = Conv2d(in_c, out_channels=out_c, kernel_size=kernel, groups=groups, stride=stride, padding=padding,
|
24 |
+
bias=False)
|
25 |
+
self.bn = BatchNorm2d(out_c)
|
26 |
+
self.prelu = PReLU(out_c)
|
27 |
+
|
28 |
+
def forward(self, x):
|
29 |
+
x = self.conv(x)
|
30 |
+
x = self.bn(x)
|
31 |
+
x = self.prelu(x)
|
32 |
+
return x
|
33 |
+
|
34 |
+
|
35 |
+
class Linear_block(Module):
|
36 |
+
def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1):
|
37 |
+
super(Linear_block, self).__init__()
|
38 |
+
self.conv = Conv2d(in_c, out_channels=out_c, kernel_size=kernel, groups=groups, stride=stride, padding=padding,
|
39 |
+
bias=False)
|
40 |
+
self.bn = BatchNorm2d(out_c)
|
41 |
+
|
42 |
+
def forward(self, x):
|
43 |
+
x = self.conv(x)
|
44 |
+
x = self.bn(x)
|
45 |
+
return x
|
46 |
+
|
47 |
+
|
48 |
+
class Depth_Wise(Module):
|
49 |
+
def __init__(self, in_c, out_c, residual=False, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=1):
|
50 |
+
super(Depth_Wise, self).__init__()
|
51 |
+
self.conv = Conv_block(in_c, out_c=groups, kernel=(1, 1), padding=(0, 0), stride=(1, 1))
|
52 |
+
self.conv_dw = Conv_block(groups, groups, groups=groups, kernel=kernel, padding=padding, stride=stride)
|
53 |
+
self.project = Linear_block(groups, out_c, kernel=(1, 1), padding=(0, 0), stride=(1, 1))
|
54 |
+
self.residual = residual
|
55 |
+
|
56 |
+
def forward(self, x):
|
57 |
+
if self.residual:
|
58 |
+
short_cut = x
|
59 |
+
x = self.conv(x)
|
60 |
+
x = self.conv_dw(x)
|
61 |
+
x = self.project(x)
|
62 |
+
if self.residual:
|
63 |
+
output = short_cut + x
|
64 |
+
else:
|
65 |
+
output = x
|
66 |
+
return output
|
67 |
+
|
68 |
+
|
69 |
+
class Residual(Module):
|
70 |
+
def __init__(self, c, num_block, groups, kernel=(3, 3), stride=(1, 1), padding=(1, 1)):
|
71 |
+
super(Residual, self).__init__()
|
72 |
+
modules = []
|
73 |
+
for _ in range(num_block):
|
74 |
+
modules.append(
|
75 |
+
Depth_Wise(c, c, residual=True, kernel=kernel, padding=padding, stride=stride, groups=groups))
|
76 |
+
self.model = Sequential(*modules)
|
77 |
+
|
78 |
+
def forward(self, x):
|
79 |
+
return self.model(x)
|
80 |
+
|
81 |
+
|
82 |
+
class GNAP(Module):
|
83 |
+
def __init__(self, embedding_size):
|
84 |
+
super(GNAP, self).__init__()
|
85 |
+
assert embedding_size == 512
|
86 |
+
self.bn1 = BatchNorm2d(512, affine=False)
|
87 |
+
self.pool = nn.AdaptiveAvgPool2d((1, 1))
|
88 |
+
|
89 |
+
self.bn2 = BatchNorm1d(512, affine=False)
|
90 |
+
|
91 |
+
def forward(self, x):
|
92 |
+
x = self.bn1(x)
|
93 |
+
x_norm = torch.norm(x, 2, 1, True)
|
94 |
+
x_norm_mean = torch.mean(x_norm)
|
95 |
+
weight = x_norm_mean / x_norm
|
96 |
+
x = x * weight
|
97 |
+
x = self.pool(x)
|
98 |
+
x = x.view(x.shape[0], -1)
|
99 |
+
feature = self.bn2(x)
|
100 |
+
return feature
|
101 |
+
|
102 |
+
|
103 |
+
class GDC(Module):
|
104 |
+
def __init__(self, embedding_size):
|
105 |
+
super(GDC, self).__init__()
|
106 |
+
self.conv_6_dw = Linear_block(512, 512, groups=512, kernel=(7, 7), stride=(1, 1), padding=(0, 0))
|
107 |
+
self.conv_6_flatten = Flatten()
|
108 |
+
self.linear = Linear(512, embedding_size, bias=False)
|
109 |
+
# self.bn = BatchNorm1d(embedding_size, affine=False)
|
110 |
+
self.bn = BatchNorm1d(embedding_size)
|
111 |
+
|
112 |
+
def forward(self, x):
|
113 |
+
x = self.conv_6_dw(x) #### [B, 512, 1, 1]
|
114 |
+
x = self.conv_6_flatten(x) #### [B, 512]
|
115 |
+
x = self.linear(x) #### [B, 136]
|
116 |
+
x = self.bn(x)
|
117 |
+
return x
|
118 |
+
|
119 |
+
|
120 |
+
class MobileFaceNet(Module):
|
121 |
+
def __init__(self, input_size, embedding_size=512, output_name="GDC"):
|
122 |
+
super(MobileFaceNet, self).__init__()
|
123 |
+
assert output_name in ["GNAP", 'GDC']
|
124 |
+
assert input_size[0] in [112]
|
125 |
+
self.conv1 = Conv_block(3, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1))
|
126 |
+
self.conv2_dw = Conv_block(64, 64, kernel=(3, 3), stride=(1, 1), padding=(1, 1), groups=64)
|
127 |
+
self.conv_23 = Depth_Wise(64, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=128)
|
128 |
+
self.conv_3 = Residual(64, num_block=4, groups=128, kernel=(3, 3), stride=(1, 1), padding=(1, 1))
|
129 |
+
self.conv_34 = Depth_Wise(64, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=256)
|
130 |
+
self.conv_4 = Residual(128, num_block=6, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1))
|
131 |
+
self.conv_45 = Depth_Wise(128, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=512)
|
132 |
+
self.conv_5 = Residual(128, num_block=2, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1))
|
133 |
+
self.conv_6_sep = Conv_block(128, 512, kernel=(1, 1), stride=(1, 1), padding=(0, 0))
|
134 |
+
if output_name == "GNAP":
|
135 |
+
self.output_layer = GNAP(512)
|
136 |
+
else:
|
137 |
+
self.output_layer = GDC(embedding_size)
|
138 |
+
|
139 |
+
self._initialize_weights()
|
140 |
+
|
141 |
+
def _initialize_weights(self):
|
142 |
+
for m in self.modules():
|
143 |
+
if isinstance(m, nn.Conv2d):
|
144 |
+
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
|
145 |
+
if m.bias is not None:
|
146 |
+
m.bias.data.zero_()
|
147 |
+
elif isinstance(m, nn.BatchNorm2d):
|
148 |
+
m.weight.data.fill_(1)
|
149 |
+
m.bias.data.zero_()
|
150 |
+
elif isinstance(m, nn.Linear):
|
151 |
+
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
|
152 |
+
if m.bias is not None:
|
153 |
+
m.bias.data.zero_()
|
154 |
+
|
155 |
+
def forward(self, x):
|
156 |
+
out = self.conv1(x)
|
157 |
+
# print(out.shape)
|
158 |
+
out = self.conv2_dw(out)
|
159 |
+
# print(out.shape)
|
160 |
+
out = self.conv_23(out)
|
161 |
+
# print(out.shape)
|
162 |
+
out3 = self.conv_3(out)
|
163 |
+
# print(out.shape)
|
164 |
+
out = self.conv_34(out3)
|
165 |
+
# print(out.shape)
|
166 |
+
out4 = self.conv_4(out) # [128, 14, 14]
|
167 |
+
# print(out.shape)
|
168 |
+
out = self.conv_45(out4) # [128, 7, 7]
|
169 |
+
# print(out.shape)
|
170 |
+
out = self.conv_5(out) # [128, 7, 7]
|
171 |
+
# print(out.shape)
|
172 |
+
conv_features = self.conv_6_sep(out) ##### [B, 512, 7, 7]
|
173 |
+
out = self.output_layer(conv_features) ##### [B, 136]
|
174 |
+
return out3, out4, conv_features
|
175 |
+
|
176 |
+
|
177 |
+
# model = MobileFaceNet([112, 112],136)
|
178 |
+
# input = torch.ones(8,3,112,112).cuda()
|
179 |
+
# model = model.cuda()
|
180 |
+
# x = model(input)
|
181 |
+
# import numpy as np
|
182 |
+
# parameters = model.parameters()
|
183 |
+
# parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
|
184 |
+
# print('Total Parameters: %.3fM' % parameters)
|
185 |
+
#
|
186 |
+
#
|
187 |
+
# from ptflops import get_model_complexity_info
|
188 |
+
# macs, params = get_model_complexity_info(model, (3, 112, 112), as_strings=True,
|
189 |
+
# print_per_layer_stat=True, verbose=True)
|
190 |
+
# print('{:<30} {:<8}'.format('Computational complexity: ', macs))
|
191 |
+
# print('{:<30} {:<8}'.format('Number of parameters: ', params))
|
192 |
+
#
|
193 |
+
# print(x.shape)
|
models/pretrain/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
models/pretrain/.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
*
|
2 |
+
!.gitignore
|
models/pretrain/ir50.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:62fcfa833776648f818b15fac4f5b760d76847316097e8e046f77ac445defb75
|
3 |
+
size 122022895
|
models/pretrain/mobilefacenet_model_best.pth.tar
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b994af026bfddbafc507a6f1c8737a9896bab20ed2b0cfb6ae90b81736970313
|
3 |
+
size 12281146
|
models/vit_model.py
ADDED
@@ -0,0 +1,828 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
original code from rwightman:
|
3 |
+
https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
|
4 |
+
"""
|
5 |
+
from functools import partial
|
6 |
+
from collections import OrderedDict
|
7 |
+
|
8 |
+
import torch
|
9 |
+
import torch.nn as nn
|
10 |
+
import torch
|
11 |
+
import torch.nn as nn
|
12 |
+
import torch.nn.functional as F
|
13 |
+
import torch.hub
|
14 |
+
from functools import partial
|
15 |
+
# import mat
|
16 |
+
# from vision_transformer.ir50 import Backbone
|
17 |
+
|
18 |
+
|
19 |
+
import torch
|
20 |
+
import torch.nn as nn
|
21 |
+
import torch.nn.functional as F
|
22 |
+
import torch.hub
|
23 |
+
from functools import partial
|
24 |
+
import math
|
25 |
+
|
26 |
+
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
|
27 |
+
from timm.models.registry import register_model
|
28 |
+
from timm.models.vision_transformer import _cfg, Mlp, Block
|
29 |
+
# from .ir50 import Backbone
|
30 |
+
|
31 |
+
|
32 |
+
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
|
33 |
+
"""3x3 convolution with padding"""
|
34 |
+
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
|
35 |
+
padding=dilation, groups=groups, bias=False, dilation=dilation)
|
36 |
+
|
37 |
+
|
38 |
+
def conv1x1(in_planes, out_planes, stride=1):
|
39 |
+
"""1x1 convolution"""
|
40 |
+
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
|
41 |
+
|
42 |
+
|
43 |
+
def drop_path(x, drop_prob: float = 0., training: bool = False):
|
44 |
+
"""
|
45 |
+
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
46 |
+
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
|
47 |
+
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
|
48 |
+
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
|
49 |
+
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
|
50 |
+
'survival rate' as the argument.
|
51 |
+
"""
|
52 |
+
if drop_prob == 0. or not training:
|
53 |
+
return x
|
54 |
+
keep_prob = 1 - drop_prob
|
55 |
+
shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
|
56 |
+
random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
|
57 |
+
random_tensor.floor_() # binarize
|
58 |
+
output = x.div(keep_prob) * random_tensor
|
59 |
+
return output
|
60 |
+
|
61 |
+
|
62 |
+
class BasicBlock(nn.Module):
|
63 |
+
__constants__ = ['downsample']
|
64 |
+
|
65 |
+
def __init__(self, inplanes, planes, stride=1, downsample=None):
|
66 |
+
super(BasicBlock, self).__init__()
|
67 |
+
norm_layer = nn.BatchNorm2d
|
68 |
+
self.conv1 = conv3x3(inplanes, planes, stride)
|
69 |
+
self.bn1 = norm_layer(planes)
|
70 |
+
self.relu = nn.ReLU(inplace=True)
|
71 |
+
self.conv2 = conv3x3(planes, planes)
|
72 |
+
self.bn2 = norm_layer(planes)
|
73 |
+
self.downsample = downsample
|
74 |
+
self.stride = stride
|
75 |
+
|
76 |
+
def forward(self, x):
|
77 |
+
identity = x
|
78 |
+
|
79 |
+
out = self.conv1(x)
|
80 |
+
out = self.bn1(out)
|
81 |
+
out = self.relu(out)
|
82 |
+
out = self.conv2(out)
|
83 |
+
out = self.bn2(out)
|
84 |
+
|
85 |
+
if self.downsample is not None:
|
86 |
+
identity = self.downsample(x)
|
87 |
+
|
88 |
+
out += identity
|
89 |
+
out = self.relu(out)
|
90 |
+
|
91 |
+
return out
|
92 |
+
|
93 |
+
|
94 |
+
class DropPath(nn.Module):
|
95 |
+
"""
|
96 |
+
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
97 |
+
"""
|
98 |
+
|
99 |
+
def __init__(self, drop_prob=None):
|
100 |
+
super(DropPath, self).__init__()
|
101 |
+
self.drop_prob = drop_prob
|
102 |
+
|
103 |
+
def forward(self, x):
|
104 |
+
return drop_path(x, self.drop_prob, self.training)
|
105 |
+
|
106 |
+
|
107 |
+
class PatchEmbed(nn.Module):
|
108 |
+
"""
|
109 |
+
2D Image to Patch Embedding
|
110 |
+
"""
|
111 |
+
|
112 |
+
def __init__(self, img_size=14, patch_size=16, in_c=256, embed_dim=768, norm_layer=None):
|
113 |
+
super().__init__()
|
114 |
+
img_size = (img_size, img_size)
|
115 |
+
patch_size = (patch_size, patch_size)
|
116 |
+
self.img_size = img_size
|
117 |
+
self.patch_size = patch_size
|
118 |
+
self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
|
119 |
+
self.num_patches = self.grid_size[0] * self.grid_size[1]
|
120 |
+
|
121 |
+
self.proj = nn.Conv2d(256, 768, kernel_size=1)
|
122 |
+
self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
|
123 |
+
|
124 |
+
def forward(self, x):
|
125 |
+
B, C, H, W = x.shape
|
126 |
+
# assert H == self.img_size[0] and W == self.img_size[1], \
|
127 |
+
# f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
|
128 |
+
# print(x.shape)
|
129 |
+
|
130 |
+
# flatten: [B, C, H, W] -> [B, C, HW]
|
131 |
+
# transpose: [B, C, HW] -> [B, HW, C]
|
132 |
+
x = self.proj(x).flatten(2).transpose(1, 2)
|
133 |
+
x = self.norm(x)
|
134 |
+
return x
|
135 |
+
|
136 |
+
|
137 |
+
class Attention(nn.Module):
|
138 |
+
def __init__(self,
|
139 |
+
dim, in_chans, # 输入token的dim
|
140 |
+
num_heads=8,
|
141 |
+
qkv_bias=False,
|
142 |
+
qk_scale=None,
|
143 |
+
attn_drop_ratio=0.,
|
144 |
+
proj_drop_ratio=0.):
|
145 |
+
super(Attention, self).__init__()
|
146 |
+
self.num_heads = 8
|
147 |
+
self.img_chanel = in_chans + 1
|
148 |
+
head_dim = dim // num_heads
|
149 |
+
self.scale = head_dim ** -0.5
|
150 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
151 |
+
self.attn_drop = nn.Dropout(attn_drop_ratio)
|
152 |
+
self.proj = nn.Linear(dim, dim)
|
153 |
+
self.proj_drop = nn.Dropout(proj_drop_ratio)
|
154 |
+
|
155 |
+
def forward(self, x):
|
156 |
+
x_img = x[:, :self.img_chanel, :]
|
157 |
+
# [batch_size, num_patches + 1, total_embed_dim]
|
158 |
+
B, N, C = x_img.shape
|
159 |
+
# print(C)
|
160 |
+
qkv = self.qkv(x_img).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
|
161 |
+
q, k, v = qkv[0], qkv[1], qkv[2]
|
162 |
+
# k, v = kv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
|
163 |
+
# q = x_img.reshape(B, -1, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
|
164 |
+
attn = (q @ k.transpose(-2, -1)) * self.scale
|
165 |
+
attn = attn.softmax(dim=-1)
|
166 |
+
attn = self.attn_drop(attn)
|
167 |
+
|
168 |
+
x_img = (attn @ v).transpose(1, 2).reshape(B, N, C)
|
169 |
+
x_img = self.proj(x_img)
|
170 |
+
x_img = self.proj_drop(x_img)
|
171 |
+
#
|
172 |
+
#
|
173 |
+
# # qkv(): -> [batch_size, num_patches + 1, 3 * total_embed_dim]
|
174 |
+
# # reshape: -> [batch_size, num_patches + 1, 3, num_heads, embed_dim_per_head]
|
175 |
+
# # permute: -> [3, batch_size, num_heads, num_patches + 1, embed_dim_per_head]
|
176 |
+
# qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
|
177 |
+
# # [batch_size, num_heads, num_patches + 1, embed_dim_per_head]
|
178 |
+
# q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
|
179 |
+
#
|
180 |
+
# # transpose: -> [batch_size, num_heads, embed_dim_per_head, num_patches + 1]
|
181 |
+
# # @: multiply -> [batch_size, num_heads, num_patches + 1, num_patches + 1]
|
182 |
+
# attn = (q @ k.transpose(-2, -1)) * self.scale
|
183 |
+
# attn = attn.softmax(dim=-1)
|
184 |
+
# attn = self.attn_drop(attn)
|
185 |
+
#
|
186 |
+
# # @: multiply -> [batch_size, num_heads, num_patches + 1, embed_dim_per_head]
|
187 |
+
# # transpose: -> [batch_size, num_patches + 1, num_heads, embed_dim_per_head]
|
188 |
+
# # reshape: -> [batch_size, num_patches + 1, total_embed_dim]
|
189 |
+
# x = (attn @ v).transpose(1, 2).reshape(B, N, C)
|
190 |
+
# x = self.proj(x)
|
191 |
+
# x = self.proj_drop(x)
|
192 |
+
return x_img
|
193 |
+
|
194 |
+
|
195 |
+
class AttentionBlock(nn.Module):
|
196 |
+
__constants__ = ['downsample']
|
197 |
+
|
198 |
+
def __init__(self, inplanes, planes, stride=1, downsample=None):
|
199 |
+
super(AttentionBlock, self).__init__()
|
200 |
+
norm_layer = nn.BatchNorm2d
|
201 |
+
self.conv1 = conv3x3(inplanes, planes, stride)
|
202 |
+
self.bn1 = norm_layer(planes)
|
203 |
+
self.relu = nn.ReLU(inplace=True)
|
204 |
+
self.conv2 = conv3x3(planes, planes)
|
205 |
+
self.bn2 = norm_layer(planes)
|
206 |
+
self.downsample = downsample
|
207 |
+
self.stride = stride
|
208 |
+
# self.cbam = CBAM(planes, 16)
|
209 |
+
self.inplanes = inplanes
|
210 |
+
self.eca_block = eca_block()
|
211 |
+
|
212 |
+
def forward(self, x):
|
213 |
+
identity = x
|
214 |
+
|
215 |
+
out = self.conv1(x)
|
216 |
+
out = self.bn1(out)
|
217 |
+
out = self.relu(out)
|
218 |
+
|
219 |
+
out = self.conv2(out)
|
220 |
+
out = self.bn2(out)
|
221 |
+
inplanes = self.inplanes
|
222 |
+
out = self.eca_block(out)
|
223 |
+
if self.downsample is not None:
|
224 |
+
identity = self.downsample(x)
|
225 |
+
|
226 |
+
out += identity
|
227 |
+
out = self.relu(out)
|
228 |
+
|
229 |
+
return out
|
230 |
+
|
231 |
+
|
232 |
+
class Mlp(nn.Module):
|
233 |
+
"""
|
234 |
+
MLP as used in Vision Transformer, MLP-Mixer and related networks
|
235 |
+
"""
|
236 |
+
|
237 |
+
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
|
238 |
+
super().__init__()
|
239 |
+
out_features = out_features or in_features
|
240 |
+
hidden_features = hidden_features or in_features
|
241 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
242 |
+
self.act = act_layer()
|
243 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
244 |
+
self.drop = nn.Dropout(drop)
|
245 |
+
|
246 |
+
def forward(self, x):
|
247 |
+
x = self.fc1(x)
|
248 |
+
x = self.act(x)
|
249 |
+
x = self.drop(x)
|
250 |
+
x = self.fc2(x)
|
251 |
+
x = self.drop(x)
|
252 |
+
return x
|
253 |
+
|
254 |
+
|
255 |
+
class Block(nn.Module):
|
256 |
+
def __init__(self,
|
257 |
+
dim, in_chans,
|
258 |
+
num_heads,
|
259 |
+
mlp_ratio=4.,
|
260 |
+
qkv_bias=False,
|
261 |
+
qk_scale=None,
|
262 |
+
drop_ratio=0.,
|
263 |
+
attn_drop_ratio=0.,
|
264 |
+
drop_path_ratio=0.,
|
265 |
+
act_layer=nn.GELU,
|
266 |
+
norm_layer=nn.LayerNorm):
|
267 |
+
super(Block, self).__init__()
|
268 |
+
self.norm1 = norm_layer(dim)
|
269 |
+
self.img_chanel = in_chans + 1
|
270 |
+
|
271 |
+
self.conv = nn.Conv1d(self.img_chanel, self.img_chanel, 1)
|
272 |
+
self.attn = Attention(dim, in_chans=in_chans, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
|
273 |
+
attn_drop_ratio=attn_drop_ratio, proj_drop_ratio=drop_ratio)
|
274 |
+
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
|
275 |
+
self.drop_path = DropPath(drop_path_ratio) if drop_path_ratio > 0. else nn.Identity()
|
276 |
+
self.norm2 = norm_layer(dim)
|
277 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
278 |
+
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop_ratio)
|
279 |
+
|
280 |
+
def forward(self, x):
|
281 |
+
# x = x + self.drop_path(self.attn(self.norm1(x)))
|
282 |
+
# x = x + self.drop_path(self.mlp(self.norm2(x)))
|
283 |
+
|
284 |
+
x_img = x
|
285 |
+
# [:, :self.img_chanel, :]
|
286 |
+
# x_lm = x[:, self.img_chanel:, :]
|
287 |
+
x_img = x_img + self.drop_path(self.attn(self.norm1(x)))
|
288 |
+
x = x_img + self.drop_path(self.mlp(self.norm2(x_img)))
|
289 |
+
#
|
290 |
+
# x_lm = x_lm + self.drop_path(self.attn_lm(self.norm3(x)))
|
291 |
+
# x_lm = x_lm + self.drop_path(self.mlp2(self.norm4(x_lm)))
|
292 |
+
# x = torch.cat((x_img, x_lm), dim=1)
|
293 |
+
# x = self.conv(x)
|
294 |
+
|
295 |
+
return x
|
296 |
+
|
297 |
+
|
298 |
+
class ClassificationHead(nn.Module):
|
299 |
+
def __init__(self, input_dim: int, target_dim: int):
|
300 |
+
super().__init__()
|
301 |
+
self.linear = torch.nn.Linear(input_dim, target_dim)
|
302 |
+
|
303 |
+
def forward(self, x):
|
304 |
+
x = x.view(x.size(0), -1)
|
305 |
+
y_hat = self.linear(x)
|
306 |
+
return y_hat
|
307 |
+
|
308 |
+
|
309 |
+
def load_pretrained_weights(model, checkpoint):
|
310 |
+
import collections
|
311 |
+
if 'state_dict' in checkpoint:
|
312 |
+
state_dict = checkpoint['state_dict']
|
313 |
+
else:
|
314 |
+
state_dict = checkpoint
|
315 |
+
model_dict = model.state_dict()
|
316 |
+
new_state_dict = collections.OrderedDict()
|
317 |
+
matched_layers, discarded_layers = [], []
|
318 |
+
for k, v in state_dict.items():
|
319 |
+
# If the pretrained state_dict was saved as nn.DataParallel,
|
320 |
+
# keys would contain "module.", which should be ignored.
|
321 |
+
if k.startswith('module.'):
|
322 |
+
k = k[7:]
|
323 |
+
if k in model_dict and model_dict[k].size() == v.size():
|
324 |
+
new_state_dict[k] = v
|
325 |
+
matched_layers.append(k)
|
326 |
+
else:
|
327 |
+
discarded_layers.append(k)
|
328 |
+
# new_state_dict.requires_grad = False
|
329 |
+
model_dict.update(new_state_dict)
|
330 |
+
|
331 |
+
model.load_state_dict(model_dict)
|
332 |
+
print('load_weight', len(matched_layers))
|
333 |
+
return model
|
334 |
+
|
335 |
+
class eca_block(nn.Module):
|
336 |
+
def __init__(self, channel=128, b=1, gamma=2):
|
337 |
+
super(eca_block, self).__init__()
|
338 |
+
kernel_size = int(abs((math.log(channel, 2) + b) / gamma))
|
339 |
+
kernel_size = kernel_size if kernel_size % 2 else kernel_size + 1
|
340 |
+
|
341 |
+
self.avg_pool = nn.AdaptiveAvgPool2d(1)
|
342 |
+
self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, bias=False)
|
343 |
+
self.sigmoid = nn.Sigmoid()
|
344 |
+
|
345 |
+
def forward(self, x):
|
346 |
+
y = self.avg_pool(x)
|
347 |
+
y = self.conv(y.squeeze(-1).transpose(-1, -2)).transpose(-1, -2).unsqueeze(-1)
|
348 |
+
y = self.sigmoid(y)
|
349 |
+
return x * y.expand_as(x)
|
350 |
+
#
|
351 |
+
#
|
352 |
+
# class IR20(nn.Module):
|
353 |
+
# def __init__(self, img_size_=112, num_classes=7, layers=[2, 2, 2, 2]):
|
354 |
+
# super().__init__()
|
355 |
+
# norm_layer = nn.BatchNorm2d
|
356 |
+
# self.img_size = img_size_
|
357 |
+
# self._norm_layer = norm_layer
|
358 |
+
# self.num_classes = num_classes
|
359 |
+
# self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
|
360 |
+
# self.bn1 = norm_layer(64)
|
361 |
+
# self.relu = nn.ReLU(inplace=True)
|
362 |
+
# self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
|
363 |
+
# # self.face_landback = MobileFaceNet([112, 112],136)
|
364 |
+
# # face_landback_checkpoint = torch.load('./models/pretrain/mobilefacenet_model_best.pth.tar', map_location=lambda storage, loc: storage)
|
365 |
+
# # self.face_landback.load_state_dict(face_landback_checkpoint['state_dict'])
|
366 |
+
# self.layer1 = self._make_layer(BasicBlock, 64, 64, layers[0])
|
367 |
+
# self.layer2 = self._make_layer(BasicBlock, 64, 128, layers[1], stride=2)
|
368 |
+
# self.layer3 = self._make_layer(AttentionBlock, 128, 256, layers[2], stride=2)
|
369 |
+
# self.layer4 = self._make_layer(AttentionBlock, 256, 256, layers[3], stride=1)
|
370 |
+
# self.ir_back = Backbone(50, 51, 52, 0.0, 'ir')
|
371 |
+
# self.ir_layer = nn.Linear(1024, 512)
|
372 |
+
# # ir_checkpoint = torch.load(r'F:\0815crossvit\vision_transformer\models\pretrain\Pretrained_on_MSCeleb.pth.tar',
|
373 |
+
# # map_location=lambda storage, loc: storage)
|
374 |
+
# # ir_checkpoint = ir_checkpoint['state_dict']
|
375 |
+
# # self.face_landback.load_state_dict(face_landback_checkpoint['state_dict'])
|
376 |
+
# # checkpoint = torch.load('./checkpoint/Pretrained_on_MSCeleb.pth.tar')
|
377 |
+
# # pre_trained_dict = checkpoint['state_dict']
|
378 |
+
# # IR20.load_state_dict(ir_checkpoint, strict=False)
|
379 |
+
# # self.IR = load_pretrained_weights(IR, ir_checkpoint)
|
380 |
+
#
|
381 |
+
# def _make_layer(self, block, inplanes, planes, blocks, stride=1):
|
382 |
+
# norm_layer = self._norm_layer
|
383 |
+
# downsample = None
|
384 |
+
# if stride != 1 or inplanes != planes:
|
385 |
+
# downsample = nn.Sequential(conv1x1(inplanes, planes, stride), norm_layer(planes))
|
386 |
+
# layers = []
|
387 |
+
# layers.append(block(inplanes, planes, stride, downsample))
|
388 |
+
# inplanes = planes
|
389 |
+
# for _ in range(1, blocks):
|
390 |
+
# layers.append(block(inplanes, planes))
|
391 |
+
# return nn.Sequential(*layers)
|
392 |
+
#
|
393 |
+
# def forward(self, x):
|
394 |
+
# x_ir = self.ir_back(x)
|
395 |
+
# # x_ir = self.ir_layer(x_ir)
|
396 |
+
# # print(x_ir.shape)
|
397 |
+
# # x = F.interpolate(x, size=112)
|
398 |
+
# # x = self.conv1(x)
|
399 |
+
# # x = self.bn1(x)
|
400 |
+
# # x = self.relu(x)
|
401 |
+
# # x = self.maxpool(x)
|
402 |
+
# #
|
403 |
+
# # x = self.layer1(x)
|
404 |
+
# # x = self.layer2(x)
|
405 |
+
# # x = self.layer3(x)
|
406 |
+
# # x = self.layer4(x)
|
407 |
+
# # print(x.shape)
|
408 |
+
# # print(x)
|
409 |
+
# out = x_ir
|
410 |
+
#
|
411 |
+
# return out
|
412 |
+
#
|
413 |
+
#
|
414 |
+
# class IR(nn.Module):
|
415 |
+
# def __init__(self, img_size_=112, num_classes=7):
|
416 |
+
# super().__init__()
|
417 |
+
# depth = 8
|
418 |
+
# # if type == "small":
|
419 |
+
# # depth = 4
|
420 |
+
# # if type == "base":
|
421 |
+
# # depth = 6
|
422 |
+
# # if type == "large":
|
423 |
+
# # depth = 8
|
424 |
+
#
|
425 |
+
# self.img_size = img_size_
|
426 |
+
# self.num_classes = num_classes
|
427 |
+
# self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
|
428 |
+
# # self.bn1 = norm_layer(64)
|
429 |
+
# self.relu = nn.ReLU(inplace=True)
|
430 |
+
# self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
|
431 |
+
# # self.face_landback = MobileFaceNet([112, 112],136)
|
432 |
+
# # face_landback_checkpoint = torch.load('./models/pretrain/mobilefacenet_model_best.pth.tar', map_location=lambda storage, loc: storage)
|
433 |
+
# # self.face_landback.load_state_dict(face_landback_checkpoint['state_dict'])
|
434 |
+
#
|
435 |
+
# # for param in self.face_landback.parameters():
|
436 |
+
# # param.requires_grad = False
|
437 |
+
#
|
438 |
+
# ###########################################################################333
|
439 |
+
#
|
440 |
+
# self.ir_back = IR20()
|
441 |
+
#
|
442 |
+
# # ir_checkpoint = torch.load(r'F:\0815crossvit\vision_transformer\models\pretrain\ir50.pth',
|
443 |
+
# # map_location=lambda storage, loc: storage)
|
444 |
+
# # # ir_checkpoint = ir_checkpoint["model"]
|
445 |
+
# # self.ir_back = load_pretrained_weights(self.ir_back, ir_checkpoint)
|
446 |
+
# # load_state_dict(checkpoint_model, strict=False)
|
447 |
+
# # self.ir_layer = nn.Linear(1024,512)
|
448 |
+
#
|
449 |
+
# #############################################################3
|
450 |
+
# #
|
451 |
+
# # self.pyramid_fuse = HyVisionTransformer(in_chans=49, q_chanel = 49, embed_dim=512,
|
452 |
+
# # depth=depth, num_heads=8, mlp_ratio=2.,
|
453 |
+
# # drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1)
|
454 |
+
#
|
455 |
+
# # self.se_block = SE_block(input_dim=512)
|
456 |
+
# self.head = ClassificationHead(input_dim=768, target_dim=self.num_classes)
|
457 |
+
#
|
458 |
+
# def forward(self, x):
|
459 |
+
# B_ = x.shape[0]
|
460 |
+
# # x_face = F.interpolate(x, size=112)
|
461 |
+
# # _, x_face = self.face_landback(x_face)
|
462 |
+
# # x_face = x_face.view(B_, -1, 49).transpose(1,2)
|
463 |
+
# ############### landmark x_face ([B, 49, 512])
|
464 |
+
# x_ir = self.ir_back(x)
|
465 |
+
# # print(x_ir.shape)
|
466 |
+
# # x_ir = self.ir_layer(x_ir)
|
467 |
+
# # print(x_ir.shape)
|
468 |
+
# ############### image x_ir ([B, 49, 512])
|
469 |
+
#
|
470 |
+
# # y_hat = self.pyramid_fuse(x_ir, x_face)
|
471 |
+
# # y_hat = self.se_block(y_hat)
|
472 |
+
# # y_feat = y_hat
|
473 |
+
#
|
474 |
+
# # out = self.head(x_ir)
|
475 |
+
#
|
476 |
+
# out = x_ir
|
477 |
+
# return out
|
478 |
+
|
479 |
+
|
480 |
+
class eca_block(nn.Module):
|
481 |
+
def __init__(self, channel=196, b=1, gamma=2):
|
482 |
+
super(eca_block, self).__init__()
|
483 |
+
kernel_size = int(abs((math.log(channel, 2) + b) / gamma))
|
484 |
+
kernel_size = kernel_size if kernel_size % 2 else kernel_size + 1
|
485 |
+
|
486 |
+
self.avg_pool = nn.AdaptiveAvgPool2d(1)
|
487 |
+
self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, bias=False)
|
488 |
+
self.sigmoid = nn.Sigmoid()
|
489 |
+
|
490 |
+
def forward(self, x):
|
491 |
+
y = self.avg_pool(x)
|
492 |
+
y = self.conv(y.squeeze(-1).transpose(-1, -2)).transpose(-1, -2).unsqueeze(-1)
|
493 |
+
y = self.sigmoid(y)
|
494 |
+
return x * y.expand_as(x)
|
495 |
+
|
496 |
+
class SE_block(nn.Module):
|
497 |
+
def __init__(self, input_dim: int):
|
498 |
+
super().__init__()
|
499 |
+
self.linear1 = torch.nn.Linear(input_dim, input_dim)
|
500 |
+
self.relu = nn.ReLU()
|
501 |
+
self.linear2 = torch.nn.Linear(input_dim, input_dim)
|
502 |
+
self.sigmod = nn.Sigmoid()
|
503 |
+
|
504 |
+
def forward(self, x):
|
505 |
+
x1 = self.linear1(x)
|
506 |
+
x1 = self.relu(x1)
|
507 |
+
x1 = self.linear2(x1)
|
508 |
+
x1 = self.sigmod(x1)
|
509 |
+
x = x * x1
|
510 |
+
return x
|
511 |
+
|
512 |
+
|
class VisionTransformer(nn.Module):
    def __init__(self, img_size=14, patch_size=14, in_c=147, num_classes=7,
                 embed_dim=768, depth=6, num_heads=8, mlp_ratio=4.0, qkv_bias=True,
                 qk_scale=None, representation_size=None, distilled=False, drop_ratio=0.,
                 attn_drop_ratio=0., drop_path_ratio=0., embed_layer=PatchEmbed, norm_layer=None,
                 act_layer=None):
        """
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_c (int): number of input channels
            num_classes (int): number of classes for classification head
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            qk_scale (float): override default qk scale of head_dim ** -0.5 if set
            representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
            distilled (bool): model includes a distillation token and head as in DeiT models
            drop_ratio (float): dropout rate
            attn_drop_ratio (float): attention dropout rate
            drop_path_ratio (float): stochastic depth rate
            embed_layer (nn.Module): patch embedding layer
            norm_layer: (nn.Module): normalization layer
        """
        super(VisionTransformer, self).__init__()
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.num_tokens = 2 if distilled else 1
        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        act_layer = act_layer or nn.GELU

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, in_c + 1, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_ratio)

        self.se_block = SE_block(input_dim=embed_dim)


        self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_c=256, embed_dim=768)
        num_patches = self.patch_embed.num_patches
        self.head = ClassificationHead(input_dim=embed_dim, target_dim=self.num_classes)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.dist_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if distilled else None
        # self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_ratio)
        # self.IR = IR()
        self.eca_block = eca_block()


        # self.ir_back = Backbone(50, 0.0, 'ir')
        # ir_checkpoint = torch.load('./models/pretrain/ir50.pth', map_location=lambda storage, loc: storage)
        # # ir_checkpoint = ir_checkpoint["model"]
        # self.ir_back = load_pretrained_weights(self.ir_back, ir_checkpoint)

        self.CON1 = nn.Conv2d(256, 768, kernel_size=1, stride=1, bias=False)
        self.IRLinear1 = nn.Linear(1024, 768)
        self.IRLinear2 = nn.Linear(768, 512)
        self.eca_block = eca_block()
        dpr = [x.item() for x in torch.linspace(0, drop_path_ratio, depth)]  # stochastic depth decay rule
        self.blocks = nn.Sequential(*[
            Block(dim=embed_dim, in_chans=in_c, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
                  qk_scale=qk_scale,
                  drop_ratio=drop_ratio, attn_drop_ratio=attn_drop_ratio, drop_path_ratio=dpr[i],
                  norm_layer=norm_layer, act_layer=act_layer)
            for i in range(depth)
        ])
        self.norm = norm_layer(embed_dim)

        # Representation layer
        if representation_size and not distilled:
            self.has_logits = True
            self.num_features = representation_size
            self.pre_logits = nn.Sequential(OrderedDict([
                ("fc", nn.Linear(embed_dim, representation_size)),
                ("act", nn.Tanh())
            ]))
        else:
            self.has_logits = False
            self.pre_logits = nn.Identity()

        # Classifier head(s)
        # self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
        self.head_dist = None
        if distilled:
            self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity()

        # Weight init
        nn.init.trunc_normal_(self.pos_embed, std=0.02)
        if self.dist_token is not None:
            nn.init.trunc_normal_(self.dist_token, std=0.02)

        nn.init.trunc_normal_(self.cls_token, std=0.02)
        self.apply(_init_vit_weights)

    def forward_features(self, x):
        # [B, C, H, W] -> [B, num_patches, embed_dim]
        # x = self.patch_embed(x)  # [B, 196, 768]
        # [1, 1, 768] -> [B, 1, 768]
        # print(x.shape)

        cls_token = self.cls_token.expand(x.shape[0], -1, -1)
        if self.dist_token is None:
            x = torch.cat((cls_token, x), dim=1)  # [B, 197, 768]
        else:
            x = torch.cat((cls_token, self.dist_token.expand(x.shape[0], -1, -1), x), dim=1)
        # print(x.shape)
        x = self.pos_drop(x + self.pos_embed)
        x = self.blocks(x)
        x = self.norm(x)
        if self.dist_token is None:
            return self.pre_logits(x[:, 0])
        else:
            return x[:, 0], x[:, 1]

    def forward(self, x):

        # B = x.shape[0]
        # print(x)
        # x = self.eca_block(x)
        # x = self.IR(x)
        # x = eca_block(x)
        # x = self.ir_back(x)
        # print(x.shape)
        # x = self.CON1(x)
        # x = x.view(-1, 196, 768)
        #
        # # print(x.shape)
        # # x = self.IRLinear1(x)
        # # print(x)
        # x_cls = torch.mean(x, 1).view(B, 1, -1)
        # x = torch.cat((x_cls, x), dim=1)
        # # print(x.shape)
        # x = self.pos_drop(x + self.pos_embed)
        # # print(x.shape)
        # x = self.blocks(x)
        # # print(x)
        # x = self.norm(x)
        # # print(x)
        # # x1 = self.IRLinear2(x)
        # x1 = x[:, 0, :]

        # print(x1)
        # print(x1.shape)

        x = self.forward_features(x)
        # # print(x.shape)
        # if self.head_dist is not None:
        #     x, x_dist = self.head(x[0]), self.head_dist(x[1])
        #     if self.training and not torch.jit.is_scripting():
        #         # during inference, return the average of both classifier predictions
        #         return x, x_dist
        #     else:
        #         return (x + x_dist) / 2
        # else:
        # print(x.shape)
        x = self.se_block(x)

        x1 = self.head(x)

        return x1

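As wired above, forward() never calls patch_embed (that call is commented out), so the model consumes an already-tokenised [B, in_c, embed_dim] sequence and returns [B, num_classes] logits. A rough smoke test under that assumption, with the class imported from models.vit_model:

# Rough smoke test (assumption: VisionTransformer is importable from models.vit_model).
import torch
from models.vit_model import VisionTransformer

model = VisionTransformer()          # defaults: in_c=147, embed_dim=768, depth=6, num_classes=7
tokens = torch.randn(2, 147, 768)    # forward_features prepends a cls token -> [2, 148, 768]

with torch.no_grad():
    logits = model(tokens)

print(logits.shape)  # torch.Size([2, 7])
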
def _init_vit_weights(m):
    """
    ViT weight initialization
    :param m: module
    """
    if isinstance(m, nn.Linear):
        nn.init.trunc_normal_(m.weight, std=.01)
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, mode="fan_out")
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LayerNorm):
        nn.init.zeros_(m.bias)
        nn.init.ones_(m.weight)

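VisionTransformer.__init__ applies this initializer recursively through self.apply(_init_vit_weights); a minimal sketch of the same pattern on a toy module, assuming the helper is importable from models.vit_model:

# Minimal sketch of applying the initializer recursively (assumption: importable from models.vit_model).
import torch.nn as nn
from models.vit_model import _init_vit_weights

toy = nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8), nn.Conv2d(3, 3, 1))
toy.apply(_init_vit_weights)          # same call pattern as self.apply(_init_vit_weights)

print(toy[1].weight.data.unique())    # LayerNorm weights reset to ones, biases to zeros
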
def vit_base_patch16_224(num_classes: int = 7):
    """
    ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-1k weights @ 224x224, source https://github.com/google-research/vision_transformer.
    weights ported from official Google JAX impl:
    link: https://pan.baidu.com/s/1zqb08naP0RPqqfSXfkB2EA  password: eu9f
    """
    model = VisionTransformer(img_size=224,
                              patch_size=16,
                              embed_dim=768,
                              depth=12,
                              num_heads=12,
                              representation_size=None,
                              num_classes=num_classes)

    return model


def vit_base_patch16_224_in21k(num_classes: int = 21843, has_logits: bool = True):
    """
    ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
    weights ported from official Google JAX impl:
    https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_patch16_224_in21k-e5005f0a.pth
    """
    model = VisionTransformer(img_size=224,
                              patch_size=16,
                              embed_dim=768,
                              depth=12,
                              num_heads=12,
                              representation_size=768 if has_logits else None,
                              num_classes=num_classes)
    return model


def vit_base_patch32_224(num_classes: int = 1000):
    """
    ViT-Base model (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-1k weights @ 224x224, source https://github.com/google-research/vision_transformer.
    weights ported from official Google JAX impl:
    link: https://pan.baidu.com/s/1hCv0U8pQomwAtHBYc4hmZg  password: s5hl
    """
    model = VisionTransformer(img_size=224,
                              patch_size=32,
                              embed_dim=768,
                              depth=12,
                              num_heads=12,
                              representation_size=None,
                              num_classes=num_classes)
    return model


def vit_base_patch32_224_in21k(num_classes: int = 21843, has_logits: bool = True):
    """
    ViT-Base model (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
    weights ported from official Google JAX impl:
    https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_patch32_224_in21k-8db57226.pth
    """
    model = VisionTransformer(img_size=224,
                              patch_size=32,
                              embed_dim=768,
                              depth=12,
                              num_heads=12,
                              representation_size=768 if has_logits else None,
                              num_classes=num_classes)
    return model


def vit_large_patch16_224(num_classes: int = 1000):
    """
    ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-1k weights @ 224x224, source https://github.com/google-research/vision_transformer.
    weights ported from official Google JAX impl:
    link: https://pan.baidu.com/s/1cxBgZJJ6qUWPSBNcE4TdRQ  password: qqt8
    """
    model = VisionTransformer(img_size=224,
                              patch_size=16,
                              embed_dim=1024,
                              depth=24,
                              num_heads=16,
                              representation_size=None,
                              num_classes=num_classes)
    return model


def vit_large_patch16_224_in21k(num_classes: int = 21843, has_logits: bool = True):
    """
    ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
    weights ported from official Google JAX impl:
    https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch16_224_in21k-606da67d.pth
    """
    model = VisionTransformer(img_size=224,
                              patch_size=16,
                              embed_dim=1024,
                              depth=24,
                              num_heads=16,
                              representation_size=1024 if has_logits else None,
                              num_classes=num_classes)
    return model


def vit_large_patch32_224_in21k(num_classes: int = 21843, has_logits: bool = True):
    """
    ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
    weights ported from official Google JAX impl:
    https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch32_224_in21k-9046d2e7.pth
    """
    model = VisionTransformer(img_size=224,
                              patch_size=32,
                              embed_dim=1024,
                              depth=24,
                              num_heads=16,
                              representation_size=1024 if has_logits else None,
                              num_classes=num_classes)
    return model


def vit_huge_patch14_224_in21k(num_classes: int = 21843, has_logits: bool = True):
    """
    ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
    NOTE: converted weights not currently available, too large for github release hosting.
    """
    model = VisionTransformer(img_size=224,
                              patch_size=14,
                              embed_dim=1280,
                              depth=32,
                              num_heads=16,
                              representation_size=1280 if has_logits else None,
                              num_classes=num_classes)
    return model
models/vit_model_8.py
ADDED
@@ -0,0 +1,828 @@
1 |
+
"""
|
2 |
+
original code from rwightman:
|
3 |
+
https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
|
4 |
+
"""
|
5 |
+
from functools import partial
|
6 |
+
from collections import OrderedDict
|
7 |
+
|
8 |
+
import torch
|
9 |
+
import torch.nn as nn
|
10 |
+
import torch
|
11 |
+
import torch.nn as nn
|
12 |
+
import torch.nn.functional as F
|
13 |
+
import torch.hub
|
14 |
+
from functools import partial
|
15 |
+
# import mat
|
16 |
+
# from vision_transformer.ir50 import Backbone
|
17 |
+
|
18 |
+
|
19 |
+
import torch
|
20 |
+
import torch.nn as nn
|
21 |
+
import torch.nn.functional as F
|
22 |
+
import torch.hub
|
23 |
+
from functools import partial
|
24 |
+
import math
|
25 |
+
|
26 |
+
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
|
27 |
+
from timm.models.registry import register_model
|
28 |
+
from timm.models.vision_transformer import _cfg, Mlp, Block
|
29 |
+
from .ir50 import Backbone
|
30 |
+
|
31 |
+
|
32 |
+
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
|
33 |
+
"""3x3 convolution with padding"""
|
34 |
+
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
|
35 |
+
padding=dilation, groups=groups, bias=False, dilation=dilation)
|
36 |
+
|
37 |
+
|
38 |
+
def conv1x1(in_planes, out_planes, stride=1):
|
39 |
+
"""1x1 convolution"""
|
40 |
+
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
|
41 |
+
|
42 |
+
|
43 |
+
def drop_path(x, drop_prob: float = 0., training: bool = False):
|
44 |
+
"""
|
45 |
+
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
46 |
+
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
|
47 |
+
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
|
48 |
+
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
|
49 |
+
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
|
50 |
+
'survival rate' as the argument.
|
51 |
+
"""
|
52 |
+
if drop_prob == 0. or not training:
|
53 |
+
return x
|
54 |
+
keep_prob = 1 - drop_prob
|
55 |
+
shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
|
56 |
+
random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
|
57 |
+
random_tensor.floor_() # binarize
|
58 |
+
output = x.div(keep_prob) * random_tensor
|
59 |
+
return output
|
60 |
+
|
61 |
+
|
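drop_path, as defined above, zeroes a whole sample's residual branch with probability drop_prob and rescales the survivors by 1/keep_prob; at eval time it is a no-op. A small sketch of that behaviour, assuming the function is importable from models.vit_model_8:

# Sketch of the stochastic-depth behaviour of drop_path (illustrative values only).
import torch
from models.vit_model_8 import drop_path   # assumption: importable from this module

x = torch.ones(4, 3, 8)                    # 4 samples in the batch
out_train = drop_path(x, drop_prob=0.5, training=True)
out_eval = drop_path(x, drop_prob=0.5, training=False)

# In training, each sample is either dropped (all zeros) or scaled to 1/keep_prob = 2.0.
print(out_train[:, 0, 0])
# With training=False the input passes through unchanged.
print(torch.equal(out_eval, x))  # True
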
62 |
+
class BasicBlock(nn.Module):
|
63 |
+
__constants__ = ['downsample']
|
64 |
+
|
65 |
+
def __init__(self, inplanes, planes, stride=1, downsample=None):
|
66 |
+
super(BasicBlock, self).__init__()
|
67 |
+
norm_layer = nn.BatchNorm2d
|
68 |
+
self.conv1 = conv3x3(inplanes, planes, stride)
|
69 |
+
self.bn1 = norm_layer(planes)
|
70 |
+
self.relu = nn.ReLU(inplace=True)
|
71 |
+
self.conv2 = conv3x3(planes, planes)
|
72 |
+
self.bn2 = norm_layer(planes)
|
73 |
+
self.downsample = downsample
|
74 |
+
self.stride = stride
|
75 |
+
|
76 |
+
def forward(self, x):
|
77 |
+
identity = x
|
78 |
+
|
79 |
+
out = self.conv1(x)
|
80 |
+
out = self.bn1(out)
|
81 |
+
out = self.relu(out)
|
82 |
+
out = self.conv2(out)
|
83 |
+
out = self.bn2(out)
|
84 |
+
|
85 |
+
if self.downsample is not None:
|
86 |
+
identity = self.downsample(x)
|
87 |
+
|
88 |
+
out += identity
|
89 |
+
out = self.relu(out)
|
90 |
+
|
91 |
+
return out
|
92 |
+
|
93 |
+
|
94 |
+
class DropPath(nn.Module):
|
95 |
+
"""
|
96 |
+
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
97 |
+
"""
|
98 |
+
|
99 |
+
def __init__(self, drop_prob=None):
|
100 |
+
super(DropPath, self).__init__()
|
101 |
+
self.drop_prob = drop_prob
|
102 |
+
|
103 |
+
def forward(self, x):
|
104 |
+
return drop_path(x, self.drop_prob, self.training)
|
105 |
+
|
106 |
+
|
107 |
+
class PatchEmbed(nn.Module):
|
108 |
+
"""
|
109 |
+
2D Image to Patch Embedding
|
110 |
+
"""
|
111 |
+
|
112 |
+
def __init__(self, img_size=14, patch_size=16, in_c=256, embed_dim=768, norm_layer=None):
|
113 |
+
super().__init__()
|
114 |
+
img_size = (img_size, img_size)
|
115 |
+
patch_size = (patch_size, patch_size)
|
116 |
+
self.img_size = img_size
|
117 |
+
self.patch_size = patch_size
|
118 |
+
self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
|
119 |
+
self.num_patches = self.grid_size[0] * self.grid_size[1]
|
120 |
+
|
121 |
+
self.proj = nn.Conv2d(256, 768, kernel_size=1)
|
122 |
+
self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
|
123 |
+
|
124 |
+
def forward(self, x):
|
125 |
+
B, C, H, W = x.shape
|
126 |
+
# assert H == self.img_size[0] and W == self.img_size[1], \
|
127 |
+
# f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
|
128 |
+
# print(x.shape)
|
129 |
+
|
130 |
+
# flatten: [B, C, H, W] -> [B, C, HW]
|
131 |
+
# transpose: [B, C, HW] -> [B, HW, C]
|
132 |
+
x = self.proj(x).flatten(2).transpose(1, 2)
|
133 |
+
x = self.norm(x)
|
134 |
+
return x
|
135 |
+
|
136 |
+
|
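Note that this PatchEmbed projects with a 1x1 convolution, so it emits one token per spatial position rather than per patch_size patch; a 14x14, 256-channel feature map therefore becomes 196 tokens of width 768. A quick shape check, assuming the class is importable from models.vit_model_8:

# Shape check for PatchEmbed as defined above (assumption: importable from models.vit_model_8).
import torch
from models.vit_model_8 import PatchEmbed

embed = PatchEmbed(img_size=14, patch_size=16, in_c=256, embed_dim=768)
feature_map = torch.randn(2, 256, 14, 14)   # e.g. a backbone stage output
tokens = embed(feature_map)
print(tokens.shape)                         # torch.Size([2, 196, 768]) -- one token per pixel
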
137 |
+
class Attention(nn.Module):
|
138 |
+
def __init__(self,
|
139 |
+
dim, in_chans,  # dim of the input tokens
|
140 |
+
num_heads=8,
|
141 |
+
qkv_bias=False,
|
142 |
+
qk_scale=None,
|
143 |
+
attn_drop_ratio=0.,
|
144 |
+
proj_drop_ratio=0.):
|
145 |
+
super(Attention, self).__init__()
|
146 |
+
self.num_heads = 8
|
147 |
+
self.img_chanel = in_chans + 1
|
148 |
+
head_dim = dim // num_heads
|
149 |
+
self.scale = head_dim ** -0.5
|
150 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
151 |
+
self.attn_drop = nn.Dropout(attn_drop_ratio)
|
152 |
+
self.proj = nn.Linear(dim, dim)
|
153 |
+
self.proj_drop = nn.Dropout(proj_drop_ratio)
|
154 |
+
|
155 |
+
def forward(self, x):
|
156 |
+
x_img = x[:, :self.img_chanel, :]
|
157 |
+
# [batch_size, num_patches + 1, total_embed_dim]
|
158 |
+
B, N, C = x_img.shape
|
159 |
+
# print(C)
|
160 |
+
qkv = self.qkv(x_img).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
|
161 |
+
q, k, v = qkv[0], qkv[1], qkv[2]
|
162 |
+
# k, v = kv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
|
163 |
+
# q = x_img.reshape(B, -1, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
|
164 |
+
attn = (q @ k.transpose(-2, -1)) * self.scale
|
165 |
+
attn = attn.softmax(dim=-1)
|
166 |
+
attn = self.attn_drop(attn)
|
167 |
+
|
168 |
+
x_img = (attn @ v).transpose(1, 2).reshape(B, N, C)
|
169 |
+
x_img = self.proj(x_img)
|
170 |
+
x_img = self.proj_drop(x_img)
|
171 |
+
#
|
172 |
+
#
|
173 |
+
# # qkv(): -> [batch_size, num_patches + 1, 3 * total_embed_dim]
|
174 |
+
# # reshape: -> [batch_size, num_patches + 1, 3, num_heads, embed_dim_per_head]
|
175 |
+
# # permute: -> [3, batch_size, num_heads, num_patches + 1, embed_dim_per_head]
|
176 |
+
# qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
|
177 |
+
# # [batch_size, num_heads, num_patches + 1, embed_dim_per_head]
|
178 |
+
# q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
|
179 |
+
#
|
180 |
+
# # transpose: -> [batch_size, num_heads, embed_dim_per_head, num_patches + 1]
|
181 |
+
# # @: multiply -> [batch_size, num_heads, num_patches + 1, num_patches + 1]
|
182 |
+
# attn = (q @ k.transpose(-2, -1)) * self.scale
|
183 |
+
# attn = attn.softmax(dim=-1)
|
184 |
+
# attn = self.attn_drop(attn)
|
185 |
+
#
|
186 |
+
# # @: multiply -> [batch_size, num_heads, num_patches + 1, embed_dim_per_head]
|
187 |
+
# # transpose: -> [batch_size, num_patches + 1, num_heads, embed_dim_per_head]
|
188 |
+
# # reshape: -> [batch_size, num_patches + 1, total_embed_dim]
|
189 |
+
# x = (attn @ v).transpose(1, 2).reshape(B, N, C)
|
190 |
+
# x = self.proj(x)
|
191 |
+
# x = self.proj_drop(x)
|
192 |
+
return x_img
|
193 |
+
|
194 |
+
|
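The Attention module above slices the sequence to the first in_chans + 1 tokens and runs standard multi-head self-attention on that slice. A brief shape walk, assuming the class is importable from models.vit_model_8:

# Shape walk for Attention as defined above (assumption: importable from models.vit_model_8).
import torch
from models.vit_model_8 import Attention

attn = Attention(dim=768, in_chans=147, num_heads=8, qkv_bias=True)
x = torch.randn(2, 148, 768)   # cls token + 147 feature tokens
out = attn(x)                  # qkv tensor is [3, 2, 8, 148, 96]; head width = 768 // 8 = 96
print(out.shape)               # torch.Size([2, 148, 768])
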
195 |
+
class AttentionBlock(nn.Module):
|
196 |
+
__constants__ = ['downsample']
|
197 |
+
|
198 |
+
def __init__(self, inplanes, planes, stride=1, downsample=None):
|
199 |
+
super(AttentionBlock, self).__init__()
|
200 |
+
norm_layer = nn.BatchNorm2d
|
201 |
+
self.conv1 = conv3x3(inplanes, planes, stride)
|
202 |
+
self.bn1 = norm_layer(planes)
|
203 |
+
self.relu = nn.ReLU(inplace=True)
|
204 |
+
self.conv2 = conv3x3(planes, planes)
|
205 |
+
self.bn2 = norm_layer(planes)
|
206 |
+
self.downsample = downsample
|
207 |
+
self.stride = stride
|
208 |
+
# self.cbam = CBAM(planes, 16)
|
209 |
+
self.inplanes = inplanes
|
210 |
+
self.eca_block = eca_block()
|
211 |
+
|
212 |
+
def forward(self, x):
|
213 |
+
identity = x
|
214 |
+
|
215 |
+
out = self.conv1(x)
|
216 |
+
out = self.bn1(out)
|
217 |
+
out = self.relu(out)
|
218 |
+
|
219 |
+
out = self.conv2(out)
|
220 |
+
out = self.bn2(out)
|
221 |
+
inplanes = self.inplanes
|
222 |
+
out = self.eca_block(out)
|
223 |
+
if self.downsample is not None:
|
224 |
+
identity = self.downsample(x)
|
225 |
+
|
226 |
+
out += identity
|
227 |
+
out = self.relu(out)
|
228 |
+
|
229 |
+
return out
|
230 |
+
|
231 |
+
|
232 |
+
class Mlp(nn.Module):
|
233 |
+
"""
|
234 |
+
MLP as used in Vision Transformer, MLP-Mixer and related networks
|
235 |
+
"""
|
236 |
+
|
237 |
+
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
|
238 |
+
super().__init__()
|
239 |
+
out_features = out_features or in_features
|
240 |
+
hidden_features = hidden_features or in_features
|
241 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
242 |
+
self.act = act_layer()
|
243 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
244 |
+
self.drop = nn.Dropout(drop)
|
245 |
+
|
246 |
+
def forward(self, x):
|
247 |
+
x = self.fc1(x)
|
248 |
+
x = self.act(x)
|
249 |
+
x = self.drop(x)
|
250 |
+
x = self.fc2(x)
|
251 |
+
x = self.drop(x)
|
252 |
+
return x
|
253 |
+
|
254 |
+
|
255 |
+
class Block(nn.Module):
|
256 |
+
def __init__(self,
|
257 |
+
dim, in_chans,
|
258 |
+
num_heads,
|
259 |
+
mlp_ratio=4.,
|
260 |
+
qkv_bias=False,
|
261 |
+
qk_scale=None,
|
262 |
+
drop_ratio=0.,
|
263 |
+
attn_drop_ratio=0.,
|
264 |
+
drop_path_ratio=0.,
|
265 |
+
act_layer=nn.GELU,
|
266 |
+
norm_layer=nn.LayerNorm):
|
267 |
+
super(Block, self).__init__()
|
268 |
+
self.norm1 = norm_layer(dim)
|
269 |
+
self.img_chanel = in_chans + 1
|
270 |
+
|
271 |
+
self.conv = nn.Conv1d(self.img_chanel, self.img_chanel, 1)
|
272 |
+
self.attn = Attention(dim, in_chans=in_chans, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
|
273 |
+
attn_drop_ratio=attn_drop_ratio, proj_drop_ratio=drop_ratio)
|
274 |
+
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
|
275 |
+
self.drop_path = DropPath(drop_path_ratio) if drop_path_ratio > 0. else nn.Identity()
|
276 |
+
self.norm2 = norm_layer(dim)
|
277 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
278 |
+
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop_ratio)
|
279 |
+
|
280 |
+
def forward(self, x):
|
281 |
+
# x = x + self.drop_path(self.attn(self.norm1(x)))
|
282 |
+
# x = x + self.drop_path(self.mlp(self.norm2(x)))
|
283 |
+
|
284 |
+
x_img = x
|
285 |
+
# [:, :self.img_chanel, :]
|
286 |
+
# x_lm = x[:, self.img_chanel:, :]
|
287 |
+
x_img = x_img + self.drop_path(self.attn(self.norm1(x)))
|
288 |
+
x = x_img + self.drop_path(self.mlp(self.norm2(x_img)))
|
289 |
+
#
|
290 |
+
# x_lm = x_lm + self.drop_path(self.attn_lm(self.norm3(x)))
|
291 |
+
# x_lm = x_lm + self.drop_path(self.mlp2(self.norm4(x_lm)))
|
292 |
+
# x = torch.cat((x_img, x_lm), dim=1)
|
293 |
+
# x = self.conv(x)
|
294 |
+
|
295 |
+
return x
|
296 |
+
|
297 |
+
|
298 |
+
class ClassificationHead(nn.Module):
|
299 |
+
def __init__(self, input_dim: int, target_dim: int):
|
300 |
+
super().__init__()
|
301 |
+
self.linear = torch.nn.Linear(input_dim, target_dim)
|
302 |
+
|
303 |
+
def forward(self, x):
|
304 |
+
x = x.view(x.size(0), -1)
|
305 |
+
y_hat = self.linear(x)
|
306 |
+
return y_hat
|
307 |
+
|
308 |
+
|
309 |
+
def load_pretrained_weights(model, checkpoint):
|
310 |
+
import collections
|
311 |
+
if 'state_dict' in checkpoint:
|
312 |
+
state_dict = checkpoint['state_dict']
|
313 |
+
else:
|
314 |
+
state_dict = checkpoint
|
315 |
+
model_dict = model.state_dict()
|
316 |
+
new_state_dict = collections.OrderedDict()
|
317 |
+
matched_layers, discarded_layers = [], []
|
318 |
+
for k, v in state_dict.items():
|
319 |
+
# If the pretrained state_dict was saved as nn.DataParallel,
|
320 |
+
# keys would contain "module.", which should be ignored.
|
321 |
+
if k.startswith('module.'):
|
322 |
+
k = k[7:]
|
323 |
+
if k in model_dict and model_dict[k].size() == v.size():
|
324 |
+
new_state_dict[k] = v
|
325 |
+
matched_layers.append(k)
|
326 |
+
else:
|
327 |
+
discarded_layers.append(k)
|
328 |
+
# new_state_dict.requires_grad = False
|
329 |
+
model_dict.update(new_state_dict)
|
330 |
+
|
331 |
+
model.load_state_dict(model_dict)
|
332 |
+
print('load_weight', len(matched_layers))
|
333 |
+
return model
|
334 |
+
|
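load_pretrained_weights copies only the checkpoint tensors whose names (after stripping any DataParallel "module." prefix) and shapes match the target model, and discards the rest. A usage sketch mirroring the commented-out IR-50 loading code in this file; the Backbone signature and checkpoint path are taken from those comments and are not verified here:

# Usage sketch for load_pretrained_weights (Backbone args and path follow the comments in this file).
import torch
from models.vit_model_8 import load_pretrained_weights
from models.ir50 import Backbone

backbone = Backbone(50, 0.0, 'ir')
checkpoint = torch.load('./models/pretrain/ir50.pth', map_location='cpu')
backbone = load_pretrained_weights(backbone, checkpoint)   # prints how many layers matched
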
335 |
+
class eca_block(nn.Module):
|
336 |
+
def __init__(self, channel=128, b=1, gamma=2):
|
337 |
+
super(eca_block, self).__init__()
|
338 |
+
kernel_size = int(abs((math.log(channel, 2) + b) / gamma))
|
339 |
+
kernel_size = kernel_size if kernel_size % 2 else kernel_size + 1
|
340 |
+
|
341 |
+
self.avg_pool = nn.AdaptiveAvgPool2d(1)
|
342 |
+
self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, bias=False)
|
343 |
+
self.sigmoid = nn.Sigmoid()
|
344 |
+
|
345 |
+
def forward(self, x):
|
346 |
+
y = self.avg_pool(x)
|
347 |
+
y = self.conv(y.squeeze(-1).transpose(-1, -2)).transpose(-1, -2).unsqueeze(-1)
|
348 |
+
y = self.sigmoid(y)
|
349 |
+
return x * y.expand_as(x)
|
350 |
+
#
|
351 |
+
#
|
352 |
+
# class IR20(nn.Module):
|
353 |
+
# def __init__(self, img_size_=112, num_classes=7, layers=[2, 2, 2, 2]):
|
354 |
+
# super().__init__()
|
355 |
+
# norm_layer = nn.BatchNorm2d
|
356 |
+
# self.img_size = img_size_
|
357 |
+
# self._norm_layer = norm_layer
|
358 |
+
# self.num_classes = num_classes
|
359 |
+
# self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
|
360 |
+
# self.bn1 = norm_layer(64)
|
361 |
+
# self.relu = nn.ReLU(inplace=True)
|
362 |
+
# self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
|
363 |
+
# # self.face_landback = MobileFaceNet([112, 112],136)
|
364 |
+
# # face_landback_checkpoint = torch.load('./models/pretrain/mobilefacenet_model_best.pth.tar', map_location=lambda storage, loc: storage)
|
365 |
+
# # self.face_landback.load_state_dict(face_landback_checkpoint['state_dict'])
|
366 |
+
# self.layer1 = self._make_layer(BasicBlock, 64, 64, layers[0])
|
367 |
+
# self.layer2 = self._make_layer(BasicBlock, 64, 128, layers[1], stride=2)
|
368 |
+
# self.layer3 = self._make_layer(AttentionBlock, 128, 256, layers[2], stride=2)
|
369 |
+
# self.layer4 = self._make_layer(AttentionBlock, 256, 256, layers[3], stride=1)
|
370 |
+
# self.ir_back = Backbone(50, 51, 52, 0.0, 'ir')
|
371 |
+
# self.ir_layer = nn.Linear(1024, 512)
|
372 |
+
# # ir_checkpoint = torch.load(r'F:\0815crossvit\vision_transformer\models\pretrain\Pretrained_on_MSCeleb.pth.tar',
|
373 |
+
# # map_location=lambda storage, loc: storage)
|
374 |
+
# # ir_checkpoint = ir_checkpoint['state_dict']
|
375 |
+
# # self.face_landback.load_state_dict(face_landback_checkpoint['state_dict'])
|
376 |
+
# # checkpoint = torch.load('./checkpoint/Pretrained_on_MSCeleb.pth.tar')
|
377 |
+
# # pre_trained_dict = checkpoint['state_dict']
|
378 |
+
# # IR20.load_state_dict(ir_checkpoint, strict=False)
|
379 |
+
# # self.IR = load_pretrained_weights(IR, ir_checkpoint)
|
380 |
+
#
|
381 |
+
# def _make_layer(self, block, inplanes, planes, blocks, stride=1):
|
382 |
+
# norm_layer = self._norm_layer
|
383 |
+
# downsample = None
|
384 |
+
# if stride != 1 or inplanes != planes:
|
385 |
+
# downsample = nn.Sequential(conv1x1(inplanes, planes, stride), norm_layer(planes))
|
386 |
+
# layers = []
|
387 |
+
# layers.append(block(inplanes, planes, stride, downsample))
|
388 |
+
# inplanes = planes
|
389 |
+
# for _ in range(1, blocks):
|
390 |
+
# layers.append(block(inplanes, planes))
|
391 |
+
# return nn.Sequential(*layers)
|
392 |
+
#
|
393 |
+
# def forward(self, x):
|
394 |
+
# x_ir = self.ir_back(x)
|
395 |
+
# # x_ir = self.ir_layer(x_ir)
|
396 |
+
# # print(x_ir.shape)
|
397 |
+
# # x = F.interpolate(x, size=112)
|
398 |
+
# # x = self.conv1(x)
|
399 |
+
# # x = self.bn1(x)
|
400 |
+
# # x = self.relu(x)
|
401 |
+
# # x = self.maxpool(x)
|
402 |
+
# #
|
403 |
+
# # x = self.layer1(x)
|
404 |
+
# # x = self.layer2(x)
|
405 |
+
# # x = self.layer3(x)
|
406 |
+
# # x = self.layer4(x)
|
407 |
+
# # print(x.shape)
|
408 |
+
# # print(x)
|
409 |
+
# out = x_ir
|
410 |
+
#
|
411 |
+
# return out
|
412 |
+
#
|
413 |
+
#
|
414 |
+
# class IR(nn.Module):
|
415 |
+
# def __init__(self, img_size_=112, num_classes=7):
|
416 |
+
# super().__init__()
|
417 |
+
# depth = 8
|
418 |
+
# # if type == "small":
|
419 |
+
# # depth = 4
|
420 |
+
# # if type == "base":
|
421 |
+
# # depth = 6
|
422 |
+
# # if type == "large":
|
423 |
+
# # depth = 8
|
424 |
+
#
|
425 |
+
# self.img_size = img_size_
|
426 |
+
# self.num_classes = num_classes
|
427 |
+
# self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
|
428 |
+
# # self.bn1 = norm_layer(64)
|
429 |
+
# self.relu = nn.ReLU(inplace=True)
|
430 |
+
# self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
|
431 |
+
# # self.face_landback = MobileFaceNet([112, 112],136)
|
432 |
+
# # face_landback_checkpoint = torch.load('./models/pretrain/mobilefacenet_model_best.pth.tar', map_location=lambda storage, loc: storage)
|
433 |
+
# # self.face_landback.load_state_dict(face_landback_checkpoint['state_dict'])
|
434 |
+
#
|
435 |
+
# # for param in self.face_landback.parameters():
|
436 |
+
# # param.requires_grad = False
|
437 |
+
#
|
438 |
+
# ###########################################################################333
|
439 |
+
#
|
440 |
+
# self.ir_back = IR20()
|
441 |
+
#
|
442 |
+
# # ir_checkpoint = torch.load(r'F:\0815crossvit\vision_transformer\models\pretrain\ir50.pth',
|
443 |
+
# # map_location=lambda storage, loc: storage)
|
444 |
+
# # # ir_checkpoint = ir_checkpoint["model"]
|
445 |
+
# # self.ir_back = load_pretrained_weights(self.ir_back, ir_checkpoint)
|
446 |
+
# # load_state_dict(checkpoint_model, strict=False)
|
447 |
+
# # self.ir_layer = nn.Linear(1024,512)
|
448 |
+
#
|
449 |
+
# #############################################################3
|
450 |
+
# #
|
451 |
+
# # self.pyramid_fuse = HyVisionTransformer(in_chans=49, q_chanel = 49, embed_dim=512,
|
452 |
+
# # depth=depth, num_heads=8, mlp_ratio=2.,
|
453 |
+
# # drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1)
|
454 |
+
#
|
455 |
+
# # self.se_block = SE_block(input_dim=512)
|
456 |
+
# self.head = ClassificationHead(input_dim=768, target_dim=self.num_classes)
|
457 |
+
#
|
458 |
+
# def forward(self, x):
|
459 |
+
# B_ = x.shape[0]
|
460 |
+
# # x_face = F.interpolate(x, size=112)
|
461 |
+
# # _, x_face = self.face_landback(x_face)
|
462 |
+
# # x_face = x_face.view(B_, -1, 49).transpose(1,2)
|
463 |
+
# ############### landmark x_face ([B, 49, 512])
|
464 |
+
# x_ir = self.ir_back(x)
|
465 |
+
# # print(x_ir.shape)
|
466 |
+
# # x_ir = self.ir_layer(x_ir)
|
467 |
+
# # print(x_ir.shape)
|
468 |
+
# ############### image x_ir ([B, 49, 512])
|
469 |
+
#
|
470 |
+
# # y_hat = self.pyramid_fuse(x_ir, x_face)
|
471 |
+
# # y_hat = self.se_block(y_hat)
|
472 |
+
# # y_feat = y_hat
|
473 |
+
#
|
474 |
+
# # out = self.head(x_ir)
|
475 |
+
#
|
476 |
+
# out = x_ir
|
477 |
+
# return out
|
478 |
+
|
479 |
+
|
480 |
+
class eca_block(nn.Module):
|
481 |
+
def __init__(self, channel=196, b=1, gamma=2):
|
482 |
+
super(eca_block, self).__init__()
|
483 |
+
kernel_size = int(abs((math.log(channel, 2) + b) / gamma))
|
484 |
+
kernel_size = kernel_size if kernel_size % 2 else kernel_size + 1
|
485 |
+
|
486 |
+
self.avg_pool = nn.AdaptiveAvgPool2d(1)
|
487 |
+
self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, bias=False)
|
488 |
+
self.sigmoid = nn.Sigmoid()
|
489 |
+
|
490 |
+
def forward(self, x):
|
491 |
+
y = self.avg_pool(x)
|
492 |
+
y = self.conv(y.squeeze(-1).transpose(-1, -2)).transpose(-1, -2).unsqueeze(-1)
|
493 |
+
y = self.sigmoid(y)
|
494 |
+
return x * y.expand_as(x)
|
495 |
+
|
496 |
+
class SE_block(nn.Module):
|
497 |
+
def __init__(self, input_dim: int):
|
498 |
+
super().__init__()
|
499 |
+
self.linear1 = torch.nn.Linear(input_dim, input_dim)
|
500 |
+
self.relu = nn.ReLU()
|
501 |
+
self.linear2 = torch.nn.Linear(input_dim, input_dim)
|
502 |
+
self.sigmod = nn.Sigmoid()
|
503 |
+
|
504 |
+
def forward(self, x):
|
505 |
+
x1 = self.linear1(x)
|
506 |
+
x1 = self.relu(x1)
|
507 |
+
x1 = self.linear2(x1)
|
508 |
+
x1 = self.sigmod(x1)
|
509 |
+
x = x * x1
|
510 |
+
return x
|
511 |
+
|
512 |
+
|
513 |
+
class VisionTransformer(nn.Module):
|
514 |
+
def __init__(self, img_size=14, patch_size=14, in_c=147, num_classes=8,
|
515 |
+
embed_dim=768, depth=6, num_heads=8, mlp_ratio=4.0, qkv_bias=True,
|
516 |
+
qk_scale=None, representation_size=None, distilled=False, drop_ratio=0.,
|
517 |
+
attn_drop_ratio=0., drop_path_ratio=0., embed_layer=PatchEmbed, norm_layer=None,
|
518 |
+
act_layer=None):
|
519 |
+
"""
|
520 |
+
Args:
|
521 |
+
img_size (int, tuple): input image size
|
522 |
+
patch_size (int, tuple): patch size
|
523 |
+
in_c (int): number of input channels
|
524 |
+
num_classes (int): number of classes for classification head
|
525 |
+
embed_dim (int): embedding dimension
|
526 |
+
depth (int): depth of transformer
|
527 |
+
num_heads (int): number of attention heads
|
528 |
+
mlp_ratio (int): ratio of mlp hidden dim to embedding dim
|
529 |
+
qkv_bias (bool): enable bias for qkv if True
|
530 |
+
qk_scale (float): override default qk scale of head_dim ** -0.5 if set
|
531 |
+
representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
|
532 |
+
distilled (bool): model includes a distillation token and head as in DeiT models
|
533 |
+
drop_ratio (float): dropout rate
|
534 |
+
attn_drop_ratio (float): attention dropout rate
|
535 |
+
drop_path_ratio (float): stochastic depth rate
|
536 |
+
embed_layer (nn.Module): patch embedding layer
|
537 |
+
norm_layer: (nn.Module): normalization layer
|
538 |
+
"""
|
539 |
+
super(VisionTransformer, self).__init__()
|
540 |
+
self.num_classes = num_classes
|
541 |
+
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
|
542 |
+
self.num_tokens = 2 if distilled else 1
|
543 |
+
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
|
544 |
+
act_layer = act_layer or nn.GELU
|
545 |
+
|
546 |
+
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
|
547 |
+
self.pos_embed = nn.Parameter(torch.zeros(1, in_c + 1, embed_dim))
|
548 |
+
self.pos_drop = nn.Dropout(p=drop_ratio)
|
549 |
+
|
550 |
+
self.se_block = SE_block(input_dim=embed_dim)
|
551 |
+
|
552 |
+
|
553 |
+
self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_c=256, embed_dim=768)
|
554 |
+
num_patches = self.patch_embed.num_patches
|
555 |
+
self.head = ClassificationHead(input_dim=embed_dim, target_dim=self.num_classes)
|
556 |
+
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
|
557 |
+
self.dist_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if distilled else None
|
558 |
+
# self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
|
559 |
+
self.pos_drop = nn.Dropout(p=drop_ratio)
|
560 |
+
# self.IR = IR()
|
561 |
+
self.eca_block = eca_block()
|
562 |
+
|
563 |
+
|
564 |
+
# self.ir_back = Backbone(50, 0.0, 'ir')
|
565 |
+
# ir_checkpoint = torch.load('./models/pretrain/ir50.pth', map_location=lambda storage, loc: storage)
|
566 |
+
# # ir_checkpoint = ir_checkpoint["model"]
|
567 |
+
# self.ir_back = load_pretrained_weights(self.ir_back, ir_checkpoint)
|
568 |
+
|
569 |
+
self.CON1 = nn.Conv2d(256, 768, kernel_size=1, stride=1, bias=False)
|
570 |
+
self.IRLinear1 = nn.Linear(1024, 768)
|
571 |
+
self.IRLinear2 = nn.Linear(768, 512)
|
572 |
+
self.eca_block = eca_block()
|
573 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_ratio, depth)] # stochastic depth decay rule
|
574 |
+
self.blocks = nn.Sequential(*[
|
575 |
+
Block(dim=embed_dim, in_chans=in_c, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
|
576 |
+
qk_scale=qk_scale,
|
577 |
+
drop_ratio=drop_ratio, attn_drop_ratio=attn_drop_ratio, drop_path_ratio=dpr[i],
|
578 |
+
norm_layer=norm_layer, act_layer=act_layer)
|
579 |
+
for i in range(depth)
|
580 |
+
])
|
581 |
+
self.norm = norm_layer(embed_dim)
|
582 |
+
|
583 |
+
# Representation layer
|
584 |
+
if representation_size and not distilled:
|
585 |
+
self.has_logits = True
|
586 |
+
self.num_features = representation_size
|
587 |
+
self.pre_logits = nn.Sequential(OrderedDict([
|
588 |
+
("fc", nn.Linear(embed_dim, representation_size)),
|
589 |
+
("act", nn.Tanh())
|
590 |
+
]))
|
591 |
+
else:
|
592 |
+
self.has_logits = False
|
593 |
+
self.pre_logits = nn.Identity()
|
594 |
+
|
595 |
+
# Classifier head(s)
|
596 |
+
# self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
|
597 |
+
self.head_dist = None
|
598 |
+
if distilled:
|
599 |
+
self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity()
|
600 |
+
|
601 |
+
# Weight init
|
602 |
+
nn.init.trunc_normal_(self.pos_embed, std=0.02)
|
603 |
+
if self.dist_token is not None:
|
604 |
+
nn.init.trunc_normal_(self.dist_token, std=0.02)
|
605 |
+
|
606 |
+
nn.init.trunc_normal_(self.cls_token, std=0.02)
|
607 |
+
self.apply(_init_vit_weights)
|
608 |
+
|
609 |
+
def forward_features(self, x):
|
610 |
+
# [B, C, H, W] -> [B, num_patches, embed_dim]
|
611 |
+
# x = self.patch_embed(x) # [B, 196, 768]
|
612 |
+
# [1, 1, 768] -> [B, 1, 768]
|
613 |
+
# print(x.shape)
|
614 |
+
|
615 |
+
cls_token = self.cls_token.expand(x.shape[0], -1, -1)
|
616 |
+
if self.dist_token is None:
|
617 |
+
x = torch.cat((cls_token, x), dim=1) # [B, 197, 768]
|
618 |
+
else:
|
619 |
+
x = torch.cat((cls_token, self.dist_token.expand(x.shape[0], -1, -1), x), dim=1)
|
620 |
+
# print(x.shape)
|
621 |
+
x = self.pos_drop(x + self.pos_embed)
|
622 |
+
x = self.blocks(x)
|
623 |
+
x = self.norm(x)
|
624 |
+
if self.dist_token is None:
|
625 |
+
return self.pre_logits(x[:, 0])
|
626 |
+
else:
|
627 |
+
return x[:, 0], x[:, 1]
|
628 |
+
|
629 |
+
def forward(self, x):
|
630 |
+
|
631 |
+
# B = x.shape[0]
|
632 |
+
# print(x)
|
633 |
+
# x = self.eca_block(x)
|
634 |
+
# x = self.IR(x)
|
635 |
+
# x = eca_block(x)
|
636 |
+
# x = self.ir_back(x)
|
637 |
+
# print(x.shape)
|
638 |
+
# x = self.CON1(x)
|
639 |
+
# x = x.view(-1, 196, 768)
|
640 |
+
#
|
641 |
+
# # print(x.shape)
|
642 |
+
# # x = self.IRLinear1(x)
|
643 |
+
# # print(x)
|
644 |
+
# x_cls = torch.mean(x, 1).view(B, 1, -1)
|
645 |
+
# x = torch.cat((x_cls, x), dim=1)
|
646 |
+
# # print(x.shape)
|
647 |
+
# x = self.pos_drop(x + self.pos_embed)
|
648 |
+
# # print(x.shape)
|
649 |
+
# x = self.blocks(x)
|
650 |
+
# # print(x)
|
651 |
+
# x = self.norm(x)
|
652 |
+
# # print(x)
|
653 |
+
# # x1 = self.IRLinear2(x)
|
654 |
+
# x1 = x[:, 0, :]
|
655 |
+
|
656 |
+
# print(x1)
|
657 |
+
# print(x1.shape)
|
658 |
+
|
659 |
+
x = self.forward_features(x)
|
660 |
+
# # print(x.shape)
|
661 |
+
# if self.head_dist is not None:
|
662 |
+
# x, x_dist = self.head(x[0]), self.head_dist(x[1])
|
663 |
+
# if self.training and not torch.jit.is_scripting():
|
664 |
+
# # during inference, return the average of both classifier predictions
|
665 |
+
# return x, x_dist
|
666 |
+
# else:
|
667 |
+
# return (x + x_dist) / 2
|
668 |
+
# else:
|
669 |
+
# print(x.shape)
|
670 |
+
x = self.se_block(x)
|
671 |
+
|
672 |
+
x1 = self.head(x)
|
673 |
+
|
674 |
+
return x1
|
675 |
+
|
676 |
+
|
677 |
+
def _init_vit_weights(m):
|
678 |
+
"""
|
679 |
+
ViT weight initialization
|
680 |
+
:param m: module
|
681 |
+
"""
|
682 |
+
if isinstance(m, nn.Linear):
|
683 |
+
nn.init.trunc_normal_(m.weight, std=.01)
|
684 |
+
if m.bias is not None:
|
685 |
+
nn.init.zeros_(m.bias)
|
686 |
+
elif isinstance(m, nn.Conv2d):
|
687 |
+
nn.init.kaiming_normal_(m.weight, mode="fan_out")
|
688 |
+
if m.bias is not None:
|
689 |
+
nn.init.zeros_(m.bias)
|
690 |
+
elif isinstance(m, nn.LayerNorm):
|
691 |
+
nn.init.zeros_(m.bias)
|
692 |
+
nn.init.ones_(m.weight)
|
693 |
+
|
694 |
+
|
695 |
+
def vit_base_patch16_224(num_classes: int = 7):
|
696 |
+
"""
|
697 |
+
ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
|
698 |
+
ImageNet-1k weights @ 224x224, source https://github.com/google-research/vision_transformer.
|
699 |
+
weights ported from official Google JAX impl:
|
700 |
+
link: https://pan.baidu.com/s/1zqb08naP0RPqqfSXfkB2EA  password: eu9f
|
701 |
+
"""
|
702 |
+
model = VisionTransformer(img_size=224,
|
703 |
+
patch_size=16,
|
704 |
+
embed_dim=768,
|
705 |
+
depth=12,
|
706 |
+
num_heads=12,
|
707 |
+
representation_size=None,
|
708 |
+
num_classes=num_classes)
|
709 |
+
|
710 |
+
return model
|
711 |
+
|
712 |
+
|
713 |
+
def vit_base_patch16_224_in21k(num_classes: int = 21843, has_logits: bool = True):
|
714 |
+
"""
|
715 |
+
ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
|
716 |
+
ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
|
717 |
+
weights ported from official Google JAX impl:
|
718 |
+
https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_patch16_224_in21k-e5005f0a.pth
|
719 |
+
"""
|
720 |
+
model = VisionTransformer(img_size=224,
|
721 |
+
patch_size=16,
|
722 |
+
embed_dim=768,
|
723 |
+
depth=12,
|
724 |
+
num_heads=12,
|
725 |
+
representation_size=768 if has_logits else None,
|
726 |
+
num_classes=num_classes)
|
727 |
+
return model
|
728 |
+
|
729 |
+
|
730 |
+
def vit_base_patch32_224(num_classes: int = 1000):
|
731 |
+
"""
|
732 |
+
ViT-Base model (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929).
|
733 |
+
ImageNet-1k weights @ 224x224, source https://github.com/google-research/vision_transformer.
|
734 |
+
weights ported from official Google JAX impl:
|
735 |
+
link: https://pan.baidu.com/s/1hCv0U8pQomwAtHBYc4hmZg  password: s5hl
|
736 |
+
"""
|
737 |
+
model = VisionTransformer(img_size=224,
|
738 |
+
patch_size=32,
|
739 |
+
embed_dim=768,
|
740 |
+
depth=12,
|
741 |
+
num_heads=12,
|
742 |
+
representation_size=None,
|
743 |
+
num_classes=num_classes)
|
744 |
+
return model
|
745 |
+
|
746 |
+
|
747 |
+
def vit_base_patch32_224_in21k(num_classes: int = 21843, has_logits: bool = True):
|
748 |
+
"""
|
749 |
+
ViT-Base model (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929).
|
750 |
+
ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
|
751 |
+
weights ported from official Google JAX impl:
|
752 |
+
https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_patch32_224_in21k-8db57226.pth
|
753 |
+
"""
|
754 |
+
model = VisionTransformer(img_size=224,
|
755 |
+
patch_size=32,
|
756 |
+
embed_dim=768,
|
757 |
+
depth=12,
|
758 |
+
num_heads=12,
|
759 |
+
representation_size=768 if has_logits else None,
|
760 |
+
num_classes=num_classes)
|
761 |
+
return model
|
762 |
+
|
763 |
+
|
764 |
+
def vit_large_patch16_224(num_classes: int = 1000):
|
765 |
+
"""
|
766 |
+
ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929).
|
767 |
+
ImageNet-1k weights @ 224x224, source https://github.com/google-research/vision_transformer.
|
768 |
+
weights ported from official Google JAX impl:
|
769 |
+
link: https://pan.baidu.com/s/1cxBgZJJ6qUWPSBNcE4TdRQ  password: qqt8
|
770 |
+
"""
|
771 |
+
model = VisionTransformer(img_size=224,
|
772 |
+
patch_size=16,
|
773 |
+
embed_dim=1024,
|
774 |
+
depth=24,
|
775 |
+
num_heads=16,
|
776 |
+
representation_size=None,
|
777 |
+
num_classes=num_classes)
|
778 |
+
return model
|
779 |
+
|
780 |
+
|
781 |
+
def vit_large_patch16_224_in21k(num_classes: int = 21843, has_logits: bool = True):
|
782 |
+
"""
|
783 |
+
ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929).
|
784 |
+
ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
|
785 |
+
weights ported from official Google JAX impl:
|
786 |
+
https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch16_224_in21k-606da67d.pth
|
787 |
+
"""
|
788 |
+
model = VisionTransformer(img_size=224,
|
789 |
+
patch_size=16,
|
790 |
+
embed_dim=1024,
|
791 |
+
depth=24,
|
792 |
+
num_heads=16,
|
793 |
+
representation_size=1024 if has_logits else None,
|
794 |
+
num_classes=num_classes)
|
795 |
+
return model
|
796 |
+
|
797 |
+
|
798 |
+
def vit_large_patch32_224_in21k(num_classes: int = 21843, has_logits: bool = True):
|
799 |
+
"""
|
800 |
+
ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929).
|
801 |
+
ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
|
802 |
+
weights ported from official Google JAX impl:
|
803 |
+
https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch32_224_in21k-9046d2e7.pth
|
804 |
+
"""
|
805 |
+
model = VisionTransformer(img_size=224,
|
806 |
+
patch_size=32,
|
807 |
+
embed_dim=1024,
|
808 |
+
depth=24,
|
809 |
+
num_heads=16,
|
810 |
+
representation_size=1024 if has_logits else None,
|
811 |
+
num_classes=num_classes)
|
812 |
+
return model
|
813 |
+
|
814 |
+
|
815 |
+
def vit_huge_patch14_224_in21k(num_classes: int = 21843, has_logits: bool = True):
|
816 |
+
"""
|
817 |
+
ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929).
|
818 |
+
ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
|
819 |
+
NOTE: converted weights not currently available, too large for github release hosting.
|
820 |
+
"""
|
821 |
+
model = VisionTransformer(img_size=224,
|
822 |
+
patch_size=14,
|
823 |
+
embed_dim=1280,
|
824 |
+
depth=32,
|
825 |
+
num_heads=16,
|
826 |
+
representation_size=1280 if has_logits else None,
|
827 |
+
num_classes=num_classes)
|
828 |
+
return model
|
prediction.py
ADDED
@@ -0,0 +1,103 @@
from main import *
from deepface import DeepFace

# Checking for all types of devices available
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using device: {device}")
# Predicting the model
# def prediction(model, image_path):
model = pyramid_trans_expr2(img_size=224, num_classes=7)

model = torch.nn.DataParallel(model)
model = model.to(device)

model_path = "raf-db-model_best.pth"
image_arr = []
for foldername, subfolders, filenames in os.walk(
    "/Users/futuregadgetlab/Downloads/Testing/"
):
    for filename in filenames:
        # Construct the full path to the file
        file_path = os.path.join(foldername, filename)
        image_arr.append(f"{file_path}")


def main():
    if model_path is not None:
        if os.path.isfile(model_path):
            print("=> loading checkpoint '{}'".format(model_path))
            checkpoint = torch.load(model_path, map_location=device)
            best_acc = checkpoint["best_acc"]
            best_acc = best_acc.to()
            print(f"best_acc:{best_acc}")
            model.load_state_dict(checkpoint["state_dict"])
            print(
                "=> loaded checkpoint '{}' (epoch {})".format(
                    model_path, checkpoint["epoch"]
                )
            )
        else:
            print("=> no checkpoint found at '{}'".format(model_path))
    predict(model, image_path=image_arr)
    return


def predict(model, image_path):
    from face_detection import face_detection

    with torch.no_grad():
        transform = transforms.Compose(
            [
                transforms.Resize((224, 224)),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                ),
                transforms.RandomErasing(p=1, scale=(0.05, 0.05)),
            ]
        )
        face = face_detection(image_path)
        image_tensor = transform(face).unsqueeze(0)
        image_tensor = image_tensor.to(device)

        model.eval()
        img_pred = model(image_tensor)
        topk = (3,)
        with torch.no_grad():
            maxk = max(topk)
            # batch_size = target.size(0)
            _, pred = img_pred.topk(maxk, 1, True, True)
            pred = pred.t()

            img_pred = pred
            img_pred = img_pred.squeeze().cpu().numpy()
            im_pre_label = np.array(img_pred)
            y_pred = im_pre_label.flatten()
            emotions = {
                0: "Surprise",
                1: "Fear",
                2: "Disgust",
                3: "Happy",
                4: "Sad",
                5: "Angry",
                6: "Neutral",
            }
            labels = []
            for i in y_pred:
                labels.append(emotions.get(i))

            print(
                f"-->Image Path {image_path} [!] The predicted labels are {y_pred} and the label is {labels}"
            )
    return


if __name__ == "__main__":
    main()
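predict() keeps the top-3 class indices from the model output and maps them through the 7-class RAF-DB emotion table. The same post-processing on a made-up logit tensor, as a standalone sketch:

# Standalone sketch of the top-k post-processing used in predict() (dummy logits, no model needed).
import torch

emotions = {0: "Surprise", 1: "Fear", 2: "Disgust", 3: "Happy", 4: "Sad", 5: "Angry", 6: "Neutral"}
logits = torch.tensor([[0.1, 0.2, 0.05, 2.5, 0.9, 0.3, 1.1]])   # pretend model output for one face

_, pred = logits.topk(3, 1, True, True)          # top-3 class indices, highest score first
y_pred = pred.t().squeeze().cpu().numpy().flatten()
labels = [emotions.get(int(i)) for i in y_pred]
print(y_pred, labels)                            # [3 6 4] ['Happy', 'Neutral', 'Sad']
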
raf-db-model_best.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d9bf1d0d88238966ce0d1a289a2bb5f927ec2fe635ef1ec4396c323028924701
size 238971279
requirements.txt
ADDED
@@ -0,0 +1,131 @@
appdirs==1.4.4
asgiref==3.7.2
attr==0.3.1
azure-core==1.29.5
azure-storage-blob==12.18.3
bleach==5.0.1
boto==2.49.0
boto3==1.16.63
botocore==1.19.63
boxing==0.1.4
Brotli @ file:///Users/runner/miniforge3/conda-bld/brotli-split_1695989934239/work
certifi==2023.7.22
cffi==1.16.0
charset-normalizer @ file:///home/conda/feedstock_root/build_artifacts/charset-normalizer_1698833585322/work
click==8.1.7
colorama==0.4.6
contourpy @ file:///Users/runner/miniforge3/conda-bld/contourpy_1699041448398/work
coreapi==2.3.3
coreschema==0.0.4
cryptography==41.0.5
cycler @ file:///home/conda/feedstock_root/build_artifacts/cycler_1696677705766/work
defusedxml==0.7.1
Django==3.2.20
django-annoying==0.10.6
django-cors-headers==3.6.0
django-debug-toolbar==3.2.1
django-environ==0.10.0
django-extensions==3.1.0
django-filter==2.4.0
django-model-utils==4.1.1
django-ranged-fileresponse==0.1.2
django-rest-swagger==2.2.0
django-rq==2.5.1
django-storages==1.12.3
django-user-agents==0.4.0
djangorestframework==3.13.1
drf-dynamic-fields==0.3.0
drf-flex-fields==0.9.5
drf-generators==0.3.0
drf-yasg==1.20.0
expiringdict==1.2.2
filelock @ file:///home/conda/feedstock_root/build_artifacts/filelock_1698714947081/work
fonttools @ file:///Users/runner/miniforge3/conda-bld/fonttools_1699023568720/work
fsspec==2023.10.0
gmpy2 @ file:///Users/runner/miniforge3/conda-bld/gmpy2_1666808749046/work
google-api-core==2.11.0
google-cloud-appengine-logging==1.1.0
google-cloud-audit-log==0.2.0
google-cloud-core==2.3.2
google-cloud-logging==2.7.1
google-cloud-storage==2.5.0
google-crc32c==1.5.0
google-resumable-media==2.3.3
googleapis-common-protos==1.56.4
grpc-google-iam-v1==0.12.4
grpcio-status==1.59.2
htmlmin==0.1.12
huggingface-hub==0.18.0
idna @ file:///home/conda/feedstock_root/build_artifacts/idna_1663625384323/work
ijson==3.2.3
inflection==0.5.1
isodate==0.6.1
itypes==1.2.0
Jinja2 @ file:///home/conda/feedstock_root/build_artifacts/jinja2_1654302431367/work
jmespath==0.10.0
joblib==1.3.2
jsonschema==3.2.0
kiwisolver @ file:///Users/runner/miniforge3/conda-bld/kiwisolver_1695380058985/work
label-studio==1.8.2.post1
label-studio-converter==0.0.54rc0
label-studio-tools==0.0.3
launchdarkly-server-sdk==7.5.0
lockfile==0.12.2
lxml==4.9.3
MarkupSafe @ file:///Users/runner/miniforge3/conda-bld/markupsafe_1695367660391/work
matplotlib @ file:///Users/runner/miniforge3/conda-bld/matplotlib-suite_1698868590489/work
mpmath @ file:///home/conda/feedstock_root/build_artifacts/mpmath_1678228039184/work
munkres==1.1.4
networkx @ file:///home/conda/feedstock_root/build_artifacts/networkx_1698504735452/work
nltk==3.6.7
numpy @ file:///Users/runner/miniforge3/conda-bld/numpy_1694920094885/work/dist/numpy-1.26.0-cp311-cp311-macosx_11_0_arm64.whl#sha256=6909902123b8421906e90ad77fb0041d9eb2d95bbdc29f3d09c7d244b0e0e5a5
openapi-codec==1.3.2
ordered-set==4.0.2
packaging @ file:///home/conda/feedstock_root/build_artifacts/packaging_1696202382185/work
pandas==2.1.2
Pillow @ file:///Users/runner/miniforge3/conda-bld/pillow_1697423665652/work
proto-plus==1.22.3
psycopg2-binary==2.9.6
pycparser==2.21
pyparsing @ file:///home/conda/feedstock_root/build_artifacts/pyparsing_1690737849915/work
pyRFC3339==1.1
pyrsistent==0.20.0
PySocks @ file:///home/conda/feedstock_root/build_artifacts/pysocks_1661604839144/work
python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1626286286081/work
python-json-logger==2.0.4
pytz==2023.3.post1
PyYAML @ file:///Users/runner/miniforge3/conda-bld/pyyaml_1695373486380/work
redis==3.5.3
requests @ file:///home/conda/feedstock_root/build_artifacts/requests_1684774241324/work
rq==1.10.1
ruamel.yaml==0.18.5
ruamel.yaml.clib==0.2.8
rules==2.2
s3transfer==0.3.7
safetensors==0.4.0
scikit-learn==1.3.2
scipy==1.11.3
semver==2.13.0
sentry-sdk==1.34.0
simplejson==3.19.2
six @ file:///home/conda/feedstock_root/build_artifacts/six_1620240208055/work
sqlparse==0.4.4
sympy @ file:///home/conda/feedstock_root/build_artifacts/sympy_1684180540116/work
thop==0.1.1.post2209072238
threadpoolctl==3.2.0
timm==0.9.10
torch==2.1.0
torchaudio==2.1.0
torchsampler==0.1.2
torchvision==0.16.0
tornado @ file:///Users/runner/miniforge3/conda-bld/tornado_1695373481350/work
tqdm==4.66.1
typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1695040754690/work
tzdata @ file:///home/conda/feedstock_root/build_artifacts/python-tzdata_1680081134351/work
ua-parser==0.18.0
ujson==5.8.0
uritemplate==4.1.1
urllib3 @ file:///home/conda/feedstock_root/build_artifacts/urllib3_1697720414277/work
user-agents==2.2.0
webencodings==0.5.1
xmljson==0.2.0