Capx
/

WhereAmAt

+from main import *
+def get_satclip(ckpt_path, device, return_all=False):
+    ckpt = torch.load(ckpt_path,map_location=device)
+    ckpt['hyper_parameters'].pop('eval_downstream')
+    ckpt['hyper_parameters'].pop('air_temp_data_path')
+    ckpt['hyper_parameters'].pop('election_data_path')
+    lightning_model = SatCLIPLightningModule(**ckpt['hyper_parameters']).to(device)
+    lightning_model.load_state_dict(ckpt['state_dict'])
+    lightning_model.eval()
+    geo_model = lightning_model.model
+    if return_all:
+        return geo_model
+    else:
+        return geo_model.location

load_lightweight.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import torch
+from location_encoder import get_neural_network, get_positional_encoding, LocationEncoder
+def get_satclip_loc_encoder(ckpt_path, device):
+    ckpt = torch.load(ckpt_path,map_location=device)
+    hp = ckpt['hyper_parameters']
+    posenc = get_positional_encoding(
+        hp['le_type'],
+        hp['legendre_polys'],
+        hp['harmonics_calculation'],
+        hp['min_radius'],
+        hp['max_radius'],
+        hp['frequency_num']
+    )
+    nnet = get_neural_network(
+        hp['pe_type'],
+        posenc.embedding_dim,
+        hp['embed_dim'],
+        hp['capacity'],
+        hp['num_hidden_layers']
+    )
+    # only load nnet params from state dict
+    state_dict = ckpt['state_dict']
+    state_dict = {k[k.index('nnet'):]:state_dict[k]
+                  for k in state_dict.keys() if 'nnet' in k}
+    loc_encoder = LocationEncoder(posenc, nnet).double()
+    loc_encoder.load_state_dict(state_dict)
+    loc_encoder.eval()
+    return loc_encoder

location_encoder.py ADDED Viewed

	@@ -0,0 +1,275 @@

+from torch import nn, optim
+import math
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+import numpy as np
+from datetime import datetime
+import positional_encoding as PE
+"""
+FCNet
+"""
+class ResLayer(nn.Module):
+    def __init__(self, linear_size):
+        super(ResLayer, self).__init__()
+        self.l_size = linear_size
+        self.nonlin1 = nn.ReLU(inplace=True)
+        self.nonlin2 = nn.ReLU(inplace=True)
+        self.dropout1 = nn.Dropout()
+        self.w1 = nn.Linear(self.l_size, self.l_size)
+        self.w2 = nn.Linear(self.l_size, self.l_size)
+    def forward(self, x):
+        y = self.w1(x)
+        y = self.nonlin1(y)
+        y = self.dropout1(y)
+        y = self.w2(y)
+        y = self.nonlin2(y)
+        out = x + y
+        return out
+class FCNet(nn.Module):
+    def __init__(self, num_inputs, num_classes, dim_hidden):
+        super(FCNet, self).__init__()
+        self.inc_bias = False
+        self.class_emb = nn.Linear(dim_hidden, num_classes, bias=self.inc_bias)
+        self.feats = nn.Sequential(nn.Linear(num_inputs, dim_hidden),
+                                    nn.ReLU(inplace=True),
+                                    ResLayer(dim_hidden),
+                                    ResLayer(dim_hidden),
+                                    ResLayer(dim_hidden),
+                                    ResLayer(dim_hidden))
+    def forward(self, x):
+        loc_emb = self.feats(x)
+        class_pred = self.class_emb(loc_emb)
+        return class_pred
+"""A simple Multi Layer Perceptron"""
+class MLP(nn.Module):
+    def __init__(self, input_dim, dim_hidden, num_layers, out_dims):
+        super(MLP, self).__init__()
+        layers = []
+        layers += [nn.Linear(input_dim, dim_hidden, bias=True), nn.ReLU()] # input layer
+        layers += [nn.Linear(dim_hidden, dim_hidden, bias=True), nn.ReLU()] * num_layers # hidden layers
+        layers += [nn.Linear(dim_hidden, out_dims, bias=True)] # output layer
+        self.features = nn.Sequential(*layers)
+    def forward(self, x):
+        return self.features(x)
+def exists(val):
+    return val is not None
+def cast_tuple(val, repeat = 1):
+    return val if isinstance(val, tuple) else ((val,) * repeat)
+"""Sinusoidal Representation Network (SIREN)"""
+class SirenNet(nn.Module):
+    def __init__(self, dim_in, dim_hidden, dim_out, num_layers, w0 = 1., w0_initial = 30., use_bias = True, final_activation = None, degreeinput = False, dropout = True):
+        super().__init__()
+        self.num_layers = num_layers
+        self.dim_hidden = dim_hidden
+        self.degreeinput = degreeinput
+        self.layers = nn.ModuleList([])
+        for ind in range(num_layers):
+            is_first = ind == 0
+            layer_w0 = w0_initial if is_first else w0
+            layer_dim_in = dim_in if is_first else dim_hidden
+            self.layers.append(Siren(
+                dim_in = layer_dim_in,
+                dim_out = dim_hidden,
+                w0 = layer_w0,
+                use_bias = use_bias,
+                is_first = is_first,
+                dropout = dropout
+            ))
+        final_activation = nn.Identity() if not exists(final_activation) else final_activation
+        self.last_layer = Siren(dim_in = dim_hidden, dim_out = dim_out, w0 = w0, use_bias = use_bias, activation = final_activation, dropout = False)
+    def forward(self, x, mods = None):
+        # do some normalization to bring degrees in a -pi to pi range
+        if self.degreeinput:
+            x = torch.deg2rad(x) - torch.pi
+        mods = cast_tuple(mods, self.num_layers)
+        for layer, mod in zip(self.layers, mods):
+            x = layer(x)
+            if exists(mod):
+                x *= rearrange(mod, 'd -> () d')
+        return self.last_layer(x)
+class Sine(nn.Module):
+    def __init__(self, w0 = 1.):
+        super().__init__()
+        self.w0 = w0
+    def forward(self, x):
+        return torch.sin(self.w0 * x)
+class Siren(nn.Module):
+    def __init__(self, dim_in, dim_out, w0 = 1., c = 6., is_first = False, use_bias = True, activation = None, dropout = False):
+        super().__init__()
+        self.dim_in = dim_in
+        self.is_first = is_first
+        self.dim_out = dim_out
+        self.dropout = dropout
+        weight = torch.zeros(dim_out, dim_in)
+        bias = torch.zeros(dim_out) if use_bias else None
+        self.init_(weight, bias, c = c, w0 = w0)
+        self.weight = nn.Parameter(weight)
+        self.bias = nn.Parameter(bias) if use_bias else None
+        self.activation = Sine(w0) if activation is None else activation
+    def init_(self, weight, bias, c, w0):
+        dim = self.dim_in
+        w_std = (1 / dim) if self.is_first else (math.sqrt(c / dim) / w0)
+        weight.uniform_(-w_std, w_std)
+        if exists(bias):
+            bias.uniform_(-w_std, w_std)
+    def forward(self, x):
+        out =  F.linear(x, self.weight, self.bias)
+        if self.dropout:
+            out = F.dropout(out, training=self.training)
+        out = self.activation(out)
+        return out
+class Modulator(nn.Module):
+    def __init__(self, dim_in, dim_hidden, num_layers):
+        super().__init__()
+        self.layers = nn.ModuleList([])
+        for ind in range(num_layers):
+            is_first = ind == 0
+            dim = dim_in if is_first else (dim_hidden + dim_in)
+            self.layers.append(nn.Sequential(
+                nn.Linear(dim, dim_hidden),
+                nn.ReLU()
+            ))
+    def forward(self, z):
+        x = z
+        hiddens = []
+        for layer in self.layers:
+            x = layer(x)
+            hiddens.append(x)
+            x = torch.cat((x, z))
+        return tuple(hiddens)
+class SirenWrapper(nn.Module):
+    def __init__(self, net, image_width, image_height, latent_dim = None):
+        super().__init__()
+        assert isinstance(net, SirenNet), 'SirenWrapper must receive a Siren network'
+        self.net = net
+        self.image_width = image_width
+        self.image_height = image_height
+        self.modulator = None
+        if exists(latent_dim):
+            self.modulator = Modulator(
+                dim_in = latent_dim,
+                dim_hidden = net.dim_hidden,
+                num_layers = net.num_layers
+            )
+        tensors = [torch.linspace(-1, 1, steps = image_height), torch.linspace(-1, 1, steps = image_width)]
+        mgrid = torch.stack(torch.meshgrid(*tensors, indexing = 'ij'), dim=-1)
+        mgrid = rearrange(mgrid, 'h w c -> (h w) c')
+        self.register_buffer('grid', mgrid)
+    def forward(self, img = None, *, latent = None):
+        modulate = exists(self.modulator)
+        assert not (modulate ^ exists(latent)), 'latent vector must be only supplied if `latent_dim` was passed in on instantiation'
+        mods = self.modulator(latent) if modulate else None
+        coords = self.grid.clone().detach().requires_grad_()
+        out = self.net(coords, mods)
+        out = rearrange(out, '(h w) c -> () c h w', h = self.image_height, w = self.image_width)
+        if exists(img):
+            return F.mse_loss(img, out)
+        return out
+def get_positional_encoding(name, legendre_polys=10, harmonics_calculation='analytic', min_radius=1, max_radius=360, frequency_num=10):
+    if name == "direct":
+        return PE.Direct()
+    elif name == "cartesian3d":
+        return PE.Cartesian3D()
+    elif name == "sphericalharmonics":
+        if harmonics_calculation == 'discretized':
+            return PE.DiscretizedSphericalHarmonics(legendre_polys=legendre_polys)
+        else:
+            return PE.SphericalHarmonics(legendre_polys=legendre_polys,
+                                         harmonics_calculation=harmonics_calculation)
+    elif name == "theory":
+        return PE.Theory(min_radius=min_radius,
+                         max_radius=max_radius,
+                         frequency_num=frequency_num)
+    elif name == "wrap":
+        return PE.Wrap()
+    elif name in ["grid", "spherec", "spherecplus", "spherem", "spheremplus"]:
+        return PE.GridAndSphere(min_radius=min_radius,
+                       max_radius=max_radius,
+                       frequency_num=frequency_num,
+                       name=name)
+    else:
+        raise ValueError(f"{name} not a known positional encoding.")
+def get_neural_network(name, input_dim, num_classes=256, dim_hidden=256, num_layers=2):
+    if name == "linear":
+        return nn.Linear(input_dim, num_classes)
+    elif name ==  "mlp":
+        return MLP(
+                input_dim=input_dim,
+                dim_hidden=dim_hidden,
+                num_layers=num_layers,
+                out_dims=num_classes
+        )
+    elif name ==  "siren":
+        return SirenNet(
+                dim_in=input_dim,
+                dim_hidden=dim_hidden,
+                num_layers=num_layers,
+                dim_out=num_classes
+            )
+    elif name ==  "fcnet":
+        return FCNet(
+                num_inputs=input_dim,
+                num_classes=num_classes,
+                dim_hidden=dim_hidden
+            )
+    else:
+        raise ValueError(f"{name} not a known neural networks.")
+class LocationEncoder(nn.Module):
+    def __init__(self, posenc, nnet):
+        super().__init__()
+        self.posenc = posenc
+        self.nnet = nnet
+    def forward(self, x):
+        x = self.posenc(x)
+        return self.nnet(x)

loss.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import torch
+import torch.nn.functional as F
+import torch.nn as nn
+class SatCLIPLoss(nn.Module):
+    def __init__(
+            self,
+            local_loss=False,
+            cache_labels=False,
+            rank=0,
+            world_size=1,
+    ):
+        super().__init__()
+        self.local_loss = local_loss
+        self.cache_labels = cache_labels
+        self.rank = rank
+        self.world_size = world_size
+        # cache state
+        self.prev_num_logits = 0
+        self.labels = {}
+    def get_ground_truth(self, device, num_logits) -> torch.Tensor:
+        # calculated ground-truth and cache if enabled
+        if self.prev_num_logits != num_logits or device not in self.labels:
+            labels = torch.arange(num_logits, device=device, dtype=torch.long)
+            if self.world_size > 1 and self.local_loss:
+                labels = labels + num_logits * self.rank
+            if self.cache_labels:
+                self.labels[device] = labels
+                self.prev_num_logits = num_logits
+        else:
+            labels = self.labels[device]
+        return labels
+    def forward(self, logits_per_image, logits_per_coord, output_dict=False):
+        device = logits_per_image.device
+        labels = self.get_ground_truth(device, logits_per_image.shape[0])
+        total_loss = (
+            F.cross_entropy(logits_per_image, labels) +
+            F.cross_entropy(logits_per_coord, labels)
+        ) / 2
+        return {"contrastive_loss": total_loss} if output_dict else total_loss

main.py ADDED Viewed

	@@ -0,0 +1,159 @@

+import argparse
+import os
+from datetime import datetime
+import lightning.pytorch
+import torch
+from datamodules.s2geo_dataset import S2GeoDataModule
+from lightning.pytorch.callbacks import ModelCheckpoint
+from lightning.pytorch.cli import LightningCLI
+from loss import SatCLIPLoss
+from model import SatCLIP
+torch.set_float32_matmul_precision('high')
+class SatCLIPLightningModule(lightning.pytorch.LightningModule):
+    def __init__(
+        self,
+        embed_dim=512,
+        image_resolution=256,
+        vision_layers=12,
+        vision_width=768,
+        vision_patch_size=32,
+        in_channels=4,
+        le_type="grid",
+        pe_type="siren",
+        frequency_num=16,
+        max_radius=260,
+        min_radius=1,
+        legendre_polys=16,
+        harmonics_calculation="analytic",
+        sh_embedding_dims=32,
+        learning_rate=1e-4,
+        weight_decay=0.01,
+        num_hidden_layers=2,
+        capacity=256,
+    ) -> None:
+        super().__init__()
+        self.model = SatCLIP(
+            embed_dim=embed_dim,
+            image_resolution=image_resolution,
+            vision_layers=vision_layers,
+            vision_width=vision_width,
+            vision_patch_size=vision_patch_size,
+            in_channels=in_channels,
+            le_type=le_type,
+            pe_type=pe_type,
+            frequency_num=frequency_num,
+            max_radius=max_radius,
+            min_radius=min_radius,
+            legendre_polys=legendre_polys,
+            harmonics_calculation=harmonics_calculation,
+            sh_embedding_dims=sh_embedding_dims,
+            num_hidden_layers=num_hidden_layers,
+            capacity=capacity,
+        )
+        self.loss_fun = SatCLIPLoss()
+        self.learning_rate = learning_rate
+        self.weight_decay = weight_decay
+        self.save_hyperparameters()
+    def common_step(self, batch, batch_idx):
+        images = batch["image"]
+        t_points = batch["point"].float()
+        logits_per_image, logits_per_coord = self.model(images, t_points)
+        return self.loss_fun(logits_per_image, logits_per_coord)
+    def training_step(self, batch, batch_idx):
+        loss = self.common_step(batch, batch_idx)
+        self.log("train_loss", loss)
+        return loss
+    def validation_step(self, batch, batch_idx):
+        loss = self.common_step(batch, batch_idx)
+        self.log("val_loss", loss)
+        return loss
+    def configure_optimizers(self):
+        exclude = (
+            lambda n, p: p.ndim < 2
+            or "bn" in n
+            or "ln" in n
+            or "bias" in n
+            or "logit_scale" in n
+        )
+        include = lambda n, p: not exclude(n, p)
+        named_parameters = list(self.model.named_parameters())
+        gain_or_bias_params = [
+            p for n, p in named_parameters if exclude(n, p) and p.requires_grad
+        ]
+        rest_params = [
+            p for n, p in named_parameters if include(n, p) and p.requires_grad
+        ]
+        optimizer = torch.optim.AdamW(
+            [
+                {"params": gain_or_bias_params, "weight_decay": 0.0},
+                {
+                    "params": rest_params,
+                    "weight_decay": self.weight_decay,
+                },  # specify in configs/default.yaml
+            ],
+            lr=self.learning_rate,  # specify in configs/default.yaml
+        )
+        return optimizer
+class MyLightningCLI(LightningCLI):
+    """``LightningCLI`` subclass that registers one extra command line flag."""
+    def add_arguments_to_parser(self, parser):
+        """Add the boolean ``--watchmodel`` flag (store-true) to ``parser``."""
+        parser.add_argument("--watchmodel", action="store_true")
+def cli_main(default_config_filename="/configs/default.yaml"):
+    save_config_fn = default_config_filename.replace(".yaml", "-latest.yaml")
+    # modify configs/default.yaml for learning rate etc.
+    cli = MyLightningCLI(
+        model_class=SatCLIPLightningModule,
+        datamodule_class=S2GeoDataModule,
+        save_config_kwargs=dict(
+            config_filename=save_config_fn,
+            overwrite=True,
+        ),
+        trainer_defaults={
+            "accumulate_grad_batches": 16,
+            "log_every_n_steps": 10,
+        },
+        parser_kwargs={"default_config_files": [default_config_filename]},
+        seed_everything_default=0,
+        run=False,
+    )
+    ts = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
+    run_name = f"SatCLIP_S2_{ts}"
+    if cli.trainer.logger is not None:
+        cli.trainer.logger.experiment.name = run_name
+        # this seems to be necessary to force logging of datamodule hyperparams
+        cli.trainer.logger.log_hyperparams(cli.datamodule.hparams)
+    cli.trainer.fit(
+        model=cli.model,
+        datamodule=cli.datamodule,
+    )
+if __name__ == "__main__":
+    config_fn = "./configs/default.yaml"
+    #A100 go vroom vroom 🚗💨
+    if torch.cuda.get_device_name(device=0)=='NVIDIA A100 80GB PCIe':
+        torch.backends.cuda.matmul.allow_tf32 = True
+        print('Superfastmode! 🚀')
+    else:
+        torch.backends.cuda.matmul.allow_tf32 = False
+    cli_main(config_fn)

model.py ADDED Viewed

	@@ -0,0 +1,400 @@

+from collections import OrderedDict
+from typing import Tuple, Union, Optional
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+import math
+import timm
+import torchgeo.models
+from torchgeo.models import ResNet18_Weights, ResNet50_Weights, ViTSmall16_Weights
+from location_encoder import get_positional_encoding, get_neural_network, LocationEncoder
+from datamodules.s2geo_dataset import S2Geo
+class Bottleneck(nn.Module):
+    """Residual bottleneck block: 1x1 conv, 3x3 conv, optional average pool, 1x1 conv.
+    The output has ``planes * expansion`` channels. Spatial down-sampling is done by average
+    pooling rather than by strided convolutions.
+    """
+    # Channel multiplier between ``planes`` and the block's output channels.
+    expansion = 4
+    def __init__(self, inplanes, planes, stride=1):
+        super().__init__()
+        # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
+        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.relu1 = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.relu2 = nn.ReLU(inplace=True)
+        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
+        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
+        self.relu3 = nn.ReLU(inplace=True)
+        self.downsample = None
+        self.stride = stride
+        # The skip path needs its own projection whenever the main path changes the
+        # spatial size (stride > 1) or the channel count.
+        if stride > 1 or inplanes != planes * Bottleneck.expansion:
+            # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
+            self.downsample = nn.Sequential(OrderedDict([
+                ("-1", nn.AvgPool2d(stride)),
+                ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
+                ("1", nn.BatchNorm2d(planes * self.expansion))
+            ]))
+    def forward(self, x: torch.Tensor):
+        """Return ``relu(main_path(x) + skip(x))``."""
+        identity = x
+        out = self.relu1(self.bn1(self.conv1(x)))
+        out = self.relu2(self.bn2(self.conv2(out)))
+        out = self.avgpool(out)
+        out = self.bn3(self.conv3(out))
+        if self.downsample is not None:
+            identity = self.downsample(x)
+        out += identity
+        out = self.relu3(out)
+        return out
+class AttentionPool2d(nn.Module):
+    """Pools a feature map into one vector per sample with multi-head attention.
+    The mean over all spatial positions is prepended as an extra token and used as the only
+    query; keys and values are all tokens. The output size is ``output_dim`` or, when that is
+    not given, ``embed_dim``.
+    """
+    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
+        super().__init__()
+        # One embedding per spatial position plus one for the prepended mean token.
+        self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
+        self.num_heads = num_heads
+    def forward(self, x):
+        """Return the attention-pooled representation of ``x`` (one row per sample)."""
+        x = x.flatten(start_dim=2).permute(2, 0, 1)  # NCHW -> (HW)NC
+        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (HW+1)NC
+        x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (HW+1)NC
+        # The separate q/k/v Linear layers are passed to the functional attention call;
+        # their biases are concatenated in q, k, v order.
+        x, _ = F.multi_head_attention_forward(
+            query=x[:1], key=x, value=x,
+            embed_dim_to_check=x.shape[-1],
+            num_heads=self.num_heads,
+            q_proj_weight=self.q_proj.weight,
+            k_proj_weight=self.k_proj.weight,
+            v_proj_weight=self.v_proj.weight,
+            in_proj_weight=None,
+            in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
+            bias_k=None,
+            bias_v=None,
+            add_zero_attn=False,
+            dropout_p=0,
+            out_proj_weight=self.c_proj.weight,
+            out_proj_bias=self.c_proj.bias,
+            use_separate_proj_weight=True,
+            training=self.training,
+            need_weights=False
+        )
+        # Drop the length-1 query dimension.
+        return x.squeeze(0)
+class ModifiedResNet(nn.Module):
+    """
+    A ResNet class that is similar to torchvision's but contains the following changes:
+    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
+    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
+    - The final pooling layer is a QKV attention instead of an average pool
+    """
+    def __init__(self, layers, output_dim, heads, input_resolution=224, width=64, in_channels=3):
+        super().__init__()
+        self.output_dim = output_dim
+        self.input_resolution = input_resolution
+        # the 3-layer stem
+        self.conv1 = nn.Conv2d(in_channels, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(width // 2)
+        self.relu1 = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(width // 2)
+        self.relu2 = nn.ReLU(inplace=True)
+        self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(width)
+        self.relu3 = nn.ReLU(inplace=True)
+        self.avgpool = nn.AvgPool2d(2)
+        # residual layers
+        self._inplanes = width  # this is a *mutable* variable used during construction
+        self.layer1 = self._make_layer(width, layers[0])
+        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
+        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
+        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
+        embed_dim = width * 32  # the ResNet feature dimension
+        # The stem (stride-2 conv + 2x avgpool) and the three stride-2 stages divide the
+        # input resolution by 32, which gives the spatial size seen by the attention pool.
+        self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)
+    def _make_layer(self, planes, blocks, stride=1):
+        """Build one stage of ``blocks`` Bottlenecks; only the first one uses ``stride``."""
+        layers = [Bottleneck(self._inplanes, planes, stride)]
+        # Later blocks (and the next stage) take the expanded channel count as input.
+        self._inplanes = planes * Bottleneck.expansion
+        for _ in range(1, blocks):
+            layers.append(Bottleneck(self._inplanes, planes))
+        return nn.Sequential(*layers)
+    def forward(self, x):
+        """Run stem, the four residual stages and the attention pool."""
+        def stem(x):
+            x = self.relu1(self.bn1(self.conv1(x)))
+            x = self.relu2(self.bn2(self.conv2(x)))
+            x = self.relu3(self.bn3(self.conv3(x)))
+            x = self.avgpool(x)
+            return x
+        # Cast the input to the dtype of the first convolution's weights.
+        x = x.type(self.conv1.weight.dtype)
+        x = stem(x)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        x = self.attnpool(x)
+        return x
+class LayerNorm(nn.LayerNorm):
+    """Subclass torch's LayerNorm to handle fp16."""
+    def forward(self, x: torch.Tensor):
+        orig_type = x.dtype
+        ret = super().forward(x.type(torch.float32))
+        return ret.type(orig_type)
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor):
+        return x * torch.sigmoid(1.702 * x)
+class ResidualAttentionBlock(nn.Module):
+    """Transformer block: self-attention and an MLP, each applied to a layer-normed input and added back."""
+    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
+        super().__init__()
+        self.attn = nn.MultiheadAttention(d_model, n_head)
+        self.ln_1 = LayerNorm(d_model)
+        # Feed-forward part: expand to 4 * d_model, QuickGELU, project back to d_model.
+        self.mlp = nn.Sequential(OrderedDict([
+            ("c_fc", nn.Linear(d_model, d_model * 4)),
+            ("gelu", QuickGELU()),
+            ("c_proj", nn.Linear(d_model * 4, d_model))
+        ]))
+        self.ln_2 = LayerNorm(d_model)
+        self.attn_mask = attn_mask
+    def attention(self, x: torch.Tensor):
+        """Self-attend over ``x``; the stored mask (if any) is first moved to ``x``'s dtype and device."""
+        self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
+        return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
+    def forward(self, x: torch.Tensor):
+        """Return ``x`` after the attention and MLP residual updates."""
+        x = x + self.attention(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+class Transformer(nn.Module):
+    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
+        super().__init__()
+        self.width = width
+        self.layers = layers
+        self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])
+    def forward(self, x: torch.Tensor):
+        return self.resblocks(x)
+class VisionTransformer(nn.Module):
+    """Vision transformer: patch convolution, class token, transformer, projection of the class token."""
+    def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, in_channels: int, output_dim: int):
+        super().__init__()
+        self.input_resolution = input_resolution
+        self.output_dim = output_dim
+        # Non-overlapping patches: kernel size and stride are both ``patch_size``.
+        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
+        scale = width ** -0.5
+        self.class_embedding = nn.Parameter(scale * torch.randn(width))
+        # One position per patch plus one for the class token.
+        self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
+        self.ln_pre = LayerNorm(width)
+        self.transformer = Transformer(width, layers, heads)
+        self.ln_post = LayerNorm(width)
+        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
+    def forward(self, x: torch.Tensor):
+        """Return the projected class-token representation of the image batch ``x``."""
+        x = self.conv1(x)  # shape = [*, width, grid, grid]
+        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
+        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+        # Adding the class embedding to a zero tensor broadcasts it to one token per sample.
+        x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
+        x = x + self.positional_embedding.to(x.dtype)
+        x = self.ln_pre(x)
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        # Only the class token (position 0) is kept.
+        x = self.ln_post(x[:, 0, :])
+        if self.proj is not None:
+            x = x @ self.proj
+        return x
+class SatCLIP(nn.Module):
+    """Contrastive model pairing an image encoder (``visual``) with a location encoder (``location``).
+    ``vision_layers`` selects the image encoder: a tuple/list builds a ``ModifiedResNet``, the
+    strings 'moco_resnet18', 'moco_resnet50' and 'moco_vit16' build timm models initialised
+    from torchgeo weights, and any other value builds a ``VisionTransformer``.
+    ``sh_embedding_dims``, ``ffn`` and the extra ``*args``/``**kwargs`` are accepted but not
+    used inside this class.
+    """
+    def __init__(self,
+                 embed_dim: int,
+                 # vision
+                 image_resolution: int,
+                 vision_layers: Union[Tuple[int, int, int, int], int, str],
+                 vision_width: int,
+                 vision_patch_size: int,
+                 in_channels: int,
+                 # location
+                 le_type: str,
+                 pe_type: str,
+                 frequency_num: int,
+                 max_radius: int,
+                 min_radius: int,
+                 harmonics_calculation: str,
+                 legendre_polys: int=10,
+                 sh_embedding_dims: int=16,
+                 ffn: bool=True,
+                 num_hidden_layers: int=2,
+                 capacity: int=256,
+                 *args,
+                 **kwargs
+                 ):
+        super().__init__()
+        if isinstance(vision_layers, (tuple, list)):
+            print('using modified resnet')
+            vision_heads = vision_width * 32 // 64
+            self.visual = ModifiedResNet(
+                layers=vision_layers,
+                output_dim=embed_dim,
+                heads=vision_heads,
+                input_resolution=image_resolution,
+                width=vision_width,
+                in_channels=in_channels
+            )
+        elif vision_layers == 'moco_resnet18':
+            # In the three pretrained branches the channel count comes from the weights'
+            # metadata; the ``in_channels`` argument is not used. The whole backbone is
+            # frozen and only the final layer (fc / head) stays trainable.
+            print('using pretrained moco resnet18')
+            weights = ResNet18_Weights.SENTINEL2_ALL_MOCO
+            in_chans = weights.meta["in_chans"]
+            self.visual = timm.create_model("resnet18", in_chans=in_chans, num_classes=embed_dim)
+            self.visual.load_state_dict(weights.get_state_dict(progress=True), strict=False)
+            self.visual.requires_grad_(False)
+            self.visual.fc.requires_grad_(True)
+        elif vision_layers == 'moco_resnet50':
+            print('using pretrained moco resnet50')
+            weights = ResNet50_Weights.SENTINEL2_ALL_MOCO
+            in_chans = weights.meta["in_chans"]
+            self.visual = timm.create_model("resnet50", in_chans=in_chans, num_classes=embed_dim)
+            self.visual.load_state_dict(weights.get_state_dict(progress=True), strict=False)
+            self.visual.requires_grad_(False)
+            self.visual.fc.requires_grad_(True)
+        elif vision_layers == 'moco_vit16':
+            print('using pretrained moco vit16')
+            weights = ViTSmall16_Weights.SENTINEL2_ALL_MOCO
+            in_chans = weights.meta["in_chans"]
+            self.visual = timm.create_model("vit_small_patch16_224", in_chans=in_chans, num_classes=embed_dim)
+            self.visual.load_state_dict(weights.get_state_dict(progress=True), strict=False)
+            self.visual.requires_grad_(False)
+            self.visual.head.requires_grad_(True)
+        else:
+            print('using vision transformer')
+            vision_heads = vision_width // 64
+            self.visual = VisionTransformer(
+                input_resolution=image_resolution,
+                patch_size=vision_patch_size,
+                width=vision_width,
+                layers=vision_layers,
+                heads=vision_heads,
+                output_dim=embed_dim,
+                in_channels=in_channels
+            )
+        # The location branch is kept in double precision. ``posenc`` and ``nnet`` are
+        # registered both directly on this module and inside ``location``.
+        self.posenc = get_positional_encoding(name=le_type, harmonics_calculation=harmonics_calculation, legendre_polys=legendre_polys, min_radius=min_radius, max_radius=max_radius, frequency_num=frequency_num).double()
+        self.nnet = get_neural_network(name=pe_type, input_dim=self.posenc.embedding_dim, num_classes=embed_dim, dim_hidden=capacity, num_layers=num_hidden_layers).double()
+        self.location = LocationEncoder(self.posenc,
+                                        self.nnet
+        ).double()
+        # Learnable log of the logit scale, initialised to log(1 / 0.07).
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.initialize_parameters()
+    def initialize_parameters(self):
+        """Re-initialise parts of a ``ModifiedResNet`` image encoder; other encoders are left untouched."""
+        if isinstance(self.visual, ModifiedResNet):
+            if self.visual.attnpool is not None:
+                std = self.visual.attnpool.c_proj.in_features ** -0.5
+                nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
+            for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
+                for name, param in resnet_block.named_parameters():
+                    # Zero the scale of the last batch norm of every bottleneck.
+                    if name.endswith("bn3.weight"):
+                        nn.init.zeros_(param)
+    @property
+    def dtype(self):
+        """Dtype of the image encoder's first convolution weights (patch embedding for timm ViTs)."""
+        if isinstance(self.visual, timm.models.vision_transformer.VisionTransformer):
+            return self.visual.patch_embed.proj.weight.dtype
+        else:
+            return self.visual.conv1.weight.dtype
+    def encode_image(self, image):
+        """Encode ``image`` after casting it to the image encoder's dtype."""
+        return self.visual(image.type(self.dtype))
+    def encode_location(self, coords):
+        """Encode ``coords`` in double precision with the location encoder."""
+        return self.location(coords.double())
+    def forward(self, image, coords):
+        """Return ``(logits_per_image, logits_per_location)``; the second is the transpose of the first."""
+        image_features = self.encode_image(image)
+        # The location features are cast back to float32 before the similarity is computed.
+        location_features = self.encode_location(coords).float()
+        # normalized features
+        image_features = image_features / image_features.norm(dim=1, keepdim=True)
+        location_features = location_features / location_features.norm(dim=1, keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ location_features.t()
+        logits_per_location = logits_per_image.t()
+        # shape = [global_batch_size, global_batch_size]
+        return logits_per_image, logits_per_location
+def convert_weights(model: nn.Module):
+    """Convert applicable model parameters to fp16"""
+    def _convert_weights_to_fp16(l):
+        if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
+            l.weight.data = l.weight.data.half()
+            if l.bias is not None:
+                l.bias.data = l.bias.data.half()
+        if isinstance(l, nn.MultiheadAttention):
+            for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
+                tensor = getattr(l, attr)
+                if tensor is not None:
+                    tensor.data = tensor.data.half()
+        for name in ["text_projection", "proj"]:
+            if hasattr(l, name):
+                attr = getattr(l, name)
+                if attr is not None:
+                    attr.data = attr.data.half()
+    model.apply(_convert_weights_to_fp16)