diff --git a/app.py b/app.py index dc042299f4e404ef8a0d146e49de3245072fc22a..e330aebf4955741cf4823806b8e34ccd9184aacf 100644 --- a/app.py +++ b/app.py @@ -1,18 +1,18 @@ -import streamlit as st import io -import sys -import time -import json -sys.path.append("./virtex/") + +import streamlit as st from model import * # # TODO: # - Reformat the model introduction # - Make the iterative text generation -def gen_show_caption(sub_prompt=None, cap_prompt = ""): + +def gen_show_caption(sub_prompt=None, cap_prompt=""): with st.spinner("Generating Caption"): - subreddit, caption = virtexModel.predict(image_dict, sub_prompt=sub_prompt, prompt=cap_prompt) + subreddit, caption = virtexModel.predict( + image_dict, sub_prompt=sub_prompt, prompt=cap_prompt + ) st.markdown( f""" <style> @@ -28,10 +28,12 @@ def gen_show_caption(sub_prompt=None, cap_prompt = ""): </style> ### <red> r/{subreddit} </red> <blue> {cap_prompt} </blue> {caption} - """, - unsafe_allow_html=True) - -_, center, _ = st.columns([1,8,1]) + """, + unsafe_allow_html=True, + ) + + +_, center, _ = st.columns([1, 8, 1]) with center: st.title("Image Captioning Demo from RedCaps") @@ -50,7 +52,7 @@ st.sidebar.markdown( with st.spinner("Loading Model"): virtexModel, imageLoader, sample_images, valid_subs = create_objects() - + select_idx = None @@ -66,9 +68,9 @@ uploaded_image = None # with st.sidebar.form("file-uploader-form", clear_on_submit=True): uploaded_file = st.sidebar.file_uploader("Choose a file") # submitted = st.form_submit_button("Submit") -if uploaded_file is not None:# and submitted: +if uploaded_file is not None: # and submitted: uploaded_image = Image.open(io.BytesIO(uploaded_file.getvalue())) - select_idx = None # set this to help rewrite the cache + select_idx = None # set this to help rewrite the cache # class OnChange(): # def __init__(self, idx): @@ -88,21 +90,26 @@ if uploaded_file is not None:# and submitted: st.sidebar.title("Select a Subreddit") sub = st.sidebar.selectbox( "Type below to condition on a subreddit. Select None for a predicted subreddit", - valid_subs + valid_subs, ) st.sidebar.title("Write a Custom Prompt") -cap_prompt = st.sidebar.text_input( - "Write the start of your caption below", - value="" -) +cap_prompt = st.sidebar.text_input("Write the start of your caption below", value="") _ = st.sidebar.button("Regenerate Caption") st.sidebar.write("Advanced Options:") -num_captions = st.sidebar.select_slider("Number of Captions to Predict", options=[1,2,3,4,5], value=1) -nuc_size = st.sidebar.slider("Nucelus Size:\nLarger values lead to more diverse captions", min_value=0.0, max_value=1.0, value=0.8, step=0.05) +num_captions = st.sidebar.select_slider( + "Number of Captions to Predict", options=[1, 2, 3, 4, 5], value=1 +) +nuc_size = st.sidebar.slider( + "Nucelus Size:\nLarger values lead to more diverse captions", + min_value=0.0, + max_value=1.0, + value=0.8, + step=0.05, +) virtexModel.model.decoder.nucleus_size = nuc_size image_file = sample_image @@ -110,14 +117,14 @@ image_file = sample_image # LOAD AND CACHE THE IMAGE if uploaded_image is not None: image = uploaded_image -elif select_idx is None and 'image' in st.session_state: - image = st.session_state['image'] +elif select_idx is None and "image" in st.session_state: + image = st.session_state["image"] else: image = Image.open(image_file) image = image.convert("RGB") -st.session_state['image'] = image +st.session_state["image"] = image image_dict = imageLoader.transform(image) @@ -141,4 +148,4 @@ This demo accompanies our paper RedCaps. 
Created by Karan Desai, Gaurav Kaul, Zubin Aysola, Justin Johnson """ -) \ No newline at end of file +) diff --git a/model.py b/model.py index f071d7e96a31ac14fb8d84ad4d28da2eaab9c430..a615e1fb9a09443b2040bc3ab1a9e12c60e08f63 100644 --- a/model.py +++ b/model.py @@ -1,18 +1,17 @@ -import streamlit as st -from huggingface_hub import hf_hub_url, cached_download -from PIL import Image import os import json import glob import random -from typing import Any, Dict, List import torch import torchvision +import streamlit as st import wordsegment as ws +from PIL import Image +from huggingface_hub import hf_hub_url, cached_download from virtex.config import Config -from virtex.factories import TokenizerFactory, PretrainingModelFactory, ImageTransformsFactory +from virtex.factories import TokenizerFactory, PretrainingModelFactory from virtex.utils.checkpointing import CheckpointManager CONFIG_PATH = "config.yaml" @@ -20,98 +19,108 @@ MODEL_PATH = "checkpoint_last5.pth" VALID_SUBREDDITS_PATH = "subreddit_list.json" SAMPLES_PATH = "./samples/*.jpg" -class ImageLoader(): + +class ImageLoader: def __init__(self): - self.image_transform = torchvision.transforms.Compose([ - torchvision.transforms.ToTensor(), - torchvision.transforms.Resize(256), - torchvision.transforms.CenterCrop(224), - torchvision.transforms.Normalize((.485, .456, .406), (.229, .224, .225))]) - self.show_size=500 - + self.image_transform = torchvision.transforms.Compose( + [ + torchvision.transforms.ToTensor(), + torchvision.transforms.Resize(256), + torchvision.transforms.CenterCrop(224), + torchvision.transforms.Normalize( + (0.485, 0.456, 0.406), (0.229, 0.224, 0.225) + ), + ] + ) + self.show_size = 500 + def load(self, im_path): im = torch.FloatTensor(self.image_transform(Image.open(im_path))).unsqueeze(0) return {"image": im} - + def raw_load(self, im_path): im = torch.FloatTensor(Image.open(im_path)) return {"image": im} - + def transform(self, image): im = torch.FloatTensor(self.image_transform(image)).unsqueeze(0) return {"image": im} - + def text_transform(self, text): # at present just lowercasing: return text.lower() - + def show_resize(self, image): # ugh we need to do this manually cuz this is pytorch==0.8 not 1.9 lol image = torchvision.transforms.functional.to_tensor(image) - x,y = image.shape[-2:] - ratio = float(self.show_size/max((x,y))) - image = torchvision.transforms.functional.resize(image, [int(x * ratio), int(y * ratio)]) + x, y = image.shape[-2:] + ratio = float(self.show_size / max((x, y))) + image = torchvision.transforms.functional.resize( + image, [int(x * ratio), int(y * ratio)] + ) return torchvision.transforms.functional.to_pil_image(image) - -class VirTexModel(): + +class VirTexModel: + def __init__(self): self.config = Config(CONFIG_PATH) ws.load() - self.device = 'cpu' + self.device = "cpu" self.tokenizer = TokenizerFactory.from_config(self.config) self.model = PretrainingModelFactory.from_config(self.config).to(self.device) CheckpointManager(model=self.model).load(MODEL_PATH) self.model.eval() self.valid_subs = json.load(open(VALID_SUBREDDITS_PATH)) - - def predict(self, image_dict, sub_prompt = None, prompt = ""): + + def predict(self, image_dict, sub_prompt=None, prompt=""): if sub_prompt is None: - subreddit_tokens = torch.tensor([self.model.sos_index], device=self.device).long() + subreddit_tokens = torch.tensor( + [self.model.sos_index], device=self.device + ).long() else: subreddit_tokens = " ".join(ws.segment(ws.clean(sub_prompt))) subreddit_tokens = ( - [self.model.sos_index] + - 
self.tokenizer.encode(subreddit_tokens) + - [self.tokenizer.token_to_id("[SEP]")] - ) + [self.model.sos_index] + + self.tokenizer.encode(subreddit_tokens) + + [self.tokenizer.token_to_id("[SEP]")] + ) subreddit_tokens = torch.tensor(subreddit_tokens, device=self.device).long() - + if prompt is not "": # at present prompts without subreddits will break without this change # TODO FIX cap_tokens = self.tokenizer.encode(prompt) cap_tokens = torch.tensor(cap_tokens, device=self.device).long() - subreddit_tokens = subreddit_tokens if sub_prompt is not None else torch.tensor( - ( - [self.model.sos_index] + - self.tokenizer.encode("pics") + - [self.tokenizer.token_to_id("[SEP]")] - ), device = self.device).long() - - subreddit_tokens = torch.cat( - [ - subreddit_tokens, - cap_tokens - ]) - - - predictions: List[Dict[str, Any]] = [] - + subreddit_tokens = ( + subreddit_tokens + if sub_prompt is not None + else torch.tensor( + ( + [self.model.sos_index] + + self.tokenizer.encode("pics") + + [self.tokenizer.token_to_id("[SEP]")] + ), + device=self.device, + ).long() + ) + + subreddit_tokens = torch.cat([subreddit_tokens, cap_tokens]) + is_valid_subreddit = False subreddit, rest_of_caption = "", "" image_dict["decode_prompt"] = subreddit_tokens while not is_valid_subreddit: - + with torch.no_grad(): caption = self.model(image_dict)["predictions"][0].tolist() - + if self.tokenizer.token_to_id("[SEP]") in caption: sep_index = caption.index(self.tokenizer.token_to_id("[SEP]")) caption[sep_index] = self.tokenizer.token_to_id("://") - + caption = self.tokenizer.decode(caption) - + if "://" in caption: subreddit, rest_of_caption = caption.split("://") subreddit = "".join(subreddit.split()) @@ -122,25 +131,29 @@ class VirTexModel(): # split prompt for coloring: if prompt is not "": _, rest_of_caption = caption.split(prompt.strip()) - + is_valid_subreddit = subreddit in self.valid_subs - + return subreddit, rest_of_caption + def download_files(): - #download model files + # download model files download_files = [CONFIG_PATH, MODEL_PATH, VALID_SUBREDDITS_PATH] for f in download_files: fp = cached_download(hf_hub_url("zamborg/redcaps", filename=f)) os.system(f"cp {fp} ./{f}") + def get_samples(): return glob.glob(SAMPLES_PATH) + def get_rand_idx(samples): - return random.randint(0,len(samples)-1) + return random.randint(0, len(samples) - 1) + -@st.cache(allow_output_mutation=True) # allow mutation to update nucleus size +@st.cache(allow_output_mutation=True) # allow mutation to update nucleus size def create_objects(): sample_images = get_samples() virtexModel = VirTexModel() @@ -149,7 +162,8 @@ def create_objects(): valid_subs.insert(0, None) return virtexModel, imageLoader, sample_images, valid_subs -footer="""<style> + +footer = """<style> a:link , a:visited{ color: blue; background-color: transparent; @@ -181,4 +195,4 @@ This demo accompanies our paper RedCaps. Created by Karan Desai, Gaurav Kaul, Zubin Aysola, Justin Johnson </p> </div> -""" \ No newline at end of file +""" diff --git a/virtex/CHANGELOG.md b/virtex/CHANGELOG.md deleted file mode 100644 index 9e54814cdf13b6404c9da2c41300455be981b9a1..0000000000000000000000000000000000000000 --- a/virtex/CHANGELOG.md +++ /dev/null @@ -1,41 +0,0 @@ -ArXiv v1 -> v2 CHANGELOG -========================= - -[ArXiv v1](https://arxiv.org/abs/2006.06666v1) was our ECCV 2020 submission (reject). [ArXiv v2](https://arxiv.org/abs/2006.06666v2) is out CVPR 2021 submission (accept). 
The repository snapshots for these two versions are tagged at [`v0.9`](https://github.com/kdexd/virtex/releases/tag/v0.9) and [`v1.0`](https://github.com/kdexd/virtex/releases/tag/v1.0). - -While the core motivation and approach is the same, we have made some minor changes in our experiments and evaluation setup. These slightly improve model performances across the board (within decimals). New models are available in [`v1.0` model zoo](http://kdexd.github.io/virtex/virtex/usage/model_zoo.html), however links to old models in `v0.9` will be active till June 30, 2021. We encourage you to use the new models! - -We have updated the experiment config files for all changes described below. - -Experiment Changes ------------------- - -### New Feature: - -Add a new pretraining task for BERT-style _Masked Language Modeling_. Pre-trained model released in Model Zoo. - -### Pre-training: - -- The only change during pre-training is that we do not apply weight decay to LayerNorm and biases in input embedding and transformer layers. We apply weight decay to the biases in output linear layer (before softmax). - -- Other factors that could affect results: - - Use official [albumentations.ColorJitter transform](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.ColorJitter) that mimics torchvision ColorJitter transform. Earlier I implemented [my own ColorJitter](https://github.com/kdexd/virtex/blob/c19e7fc9b98e98af82286ed1537b6f588eaeac44/virtex/data/transforms.py#L156) because albumentations didn't have one. - - Use PyTorch Native AMP (Automatic Mixed Precision) instead of NVIDIA Apex. - -### Downstream Evaluations: - -1. **PASCAL VOC 2007 Linear Classification:** [[diff]](https://github.com/kdexd/virtex/compare/57889ca9829f27b932e92b9e6b51f50f20f2d546..7645cc0d1e3e49f00e347e9873fd020faa2ec62e#diff-b4405dd4879a48ef1e5b1e2801035909584a5f1f32f63d5e793fb50dee077b97) - - Instead of training linear SVMs on 8192-dimensional average pooled features from ResNet-50 (7x7x2048 —> 2x2x2048), like [(Misra et al. 2019)](https://arxiv.org/abs/1905.01235), we directly train SVMs on 2048-dimensional global average pooled features, following recent works like [SwAV (Caron et al. 2020)](https://arxiv.org/abs/2006.09882). - - We change the pre-processing: resize shortest edge to 256 pixels, and take center crop of 224 pixels. - - These improve VOC mAP by 1-2 points everywhere, and makes SVM training faster. Since we select best checkpoint based on this metric, all results on other downstream tasks also change in `ArXiv v2` (But the trends remain same.) - -2. **ImageNet Linear Evaluation:** [[diff]](https://github.com/kdexd/virtex/compare/57889ca9829f27b932e92b9e6b51f50f20f2d546..7645cc0d1e3e49f00e347e9873fd020faa2ec62e#diff-d3dea1e7bf97d0cfca4b59a47c0a9bb81e78b8827654fe0258df9ce2c3f5f41c) - - Changed random resized crop scale from (20-100%) to (8-100%) for consistency with evaluations in SSL works like MoCo and SwAV. - - Use cosine LR decay instead of step decay, following SwAV. Improves accuracy by up to 1%. - -3. **iNaturalist Fine-tuning:** [[diff]](https://github.com/kdexd/virtex/compare/57889ca9829f27b932e92b9e6b51f50f20f2d546..7645cc0d1e3e49f00e347e9873fd020faa2ec62e#diff-09096da78cfcde3a604ce22d80313f0800225d928cce5ef7334b89a382adfe4d) - - This evaluation is left unchanged across ArXiv versions, but we fixd a typo in image pre-processing step, present in publicly released config. - -4. 
**Detectron2 tasks (COCO and LVIS Instance Segmentation, VOC Detection):** - - Heavily simplified the script. Updated Detectron2 uses a more memory-efficient SyncBatchNorm and supports AMP. - diff --git a/virtex/LICENSE b/virtex/LICENSE deleted file mode 100644 index e909de7a0a9528ffc9a95e854842315713a971a3..0000000000000000000000000000000000000000 --- a/virtex/LICENSE +++ /dev/null @@ -1,16 +0,0 @@ -Copyright (c) 2020, Karan Desai. - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and -associated documentation files (the "Software"), to deal in the Software without restriction, -including without limitation the rights to use, copy, modify, merge, publish, distribute, -sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial -portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT -NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES -OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/virtex/README.md b/virtex/README.md deleted file mode 100644 index 720ce5e0559be640430fe8c783b4c7bbf17c1da3..0000000000000000000000000000000000000000 --- a/virtex/README.md +++ /dev/null @@ -1,92 +0,0 @@ -VirTex: Learning Visual Representations from Textual Annotations -================================================================ - -<h4> -Karan Desai and Justin Johnson -</br> -<span style="font-size: 14pt; color: #555555"> -University of Michigan -</span> -</h4> -<hr> - -**CVPR 2021** [arxiv.org/abs/2006.06666][1] - -**Model Zoo, Usage Instructions and API docs:** [kdexd.github.io/virtex](https://kdexd.github.io/virtex) - -VirTex is a pretraining approach which uses semantically dense captions to -learn visual representations. We train CNN + Transformers from scratch on -COCO Captions, and transfer the CNN to downstream vision tasks including -image classification, object detection, and instance segmentation. -VirTex matches or outperforms models which use ImageNet for pretraining -- -both supervised or unsupervised -- despite using up to 10x fewer images. - - - - -Get the pretrained ResNet-50 visual backbone from our best performing VirTex -model in one line *without any installation*! - -```python -import torch - -# That's it, this one line only requires PyTorch. -model = torch.hub.load("kdexd/virtex", "resnet50", pretrained=True) -``` - -### Note (For returning users before January 2021): - -The pretrained models in our model zoo have changed from [`v1.0`](https://github.com/kdexd/virtex/releases/tag/v1.0) onwards. -They are slightly better tuned than older models, and reproduce the results in our -CVPR 2021 accepted paper ([arXiv v2](https://arxiv.org/abs/2006.06666v2)). -Some training and evaluation hyperparams are changed since [`v0.9`](https://github.com/kdexd/virtex/releases/tag/v0.9). -Please refer [`CHANGELOG.md`](https://github.com/kdexd/virtex/blob/master/CHANGELOG.md) - - -Usage Instructions ------------------- - -1. [How to setup this codebase?][2] -2. [VirTex Model Zoo][3] -3. 
[How to train your VirTex model?][4] -4. [How to evaluate on downstream tasks?][5] - -Full documentation is available at [kdexd.github.io/virtex](https://kdexd.github.io/virtex). - - -Citation --------- - -If you find this code useful, please consider citing: - -```text -@inproceedings{desai2021virtex, - title={{VirTex: Learning Visual Representations from Textual Annotations}}, - author={Karan Desai and Justin Johnson}, - booktitle={CVPR}, - year={2021} -} -``` - -Acknowledgments ---------------- - -We thank Harsh Agrawal, Mohamed El Banani, Richard Higgins, Nilesh Kulkarni -and Chris Rockwell for helpful discussions and feedback on the paper. We thank -Ishan Misra for discussions regarding PIRL evaluation protocol; Saining Xie for -discussions about replicating iNaturalist evaluation as MoCo; Ross Girshick and -Yuxin Wu for help with Detectron2 model zoo; Georgia Gkioxari for suggesting -the Instance Segmentation pretraining task ablation; and Stefan Lee for -suggestions on figure aesthetics. We thank Jia Deng for access to extra GPUs -during project development; and UMich ARC-TS team for support with GPU cluster -management. Finally, we thank all the Starbucks outlets in Ann Arbor for many -hours of free WiFi. This work was partially supported by the Toyota Research -Institute (TRI). However, note that this article solely reflects the opinions -and conclusions of its authors and not TRI or any other Toyota entity. - - -[1]: https://arxiv.org/abs/2006.06666 -[2]: https://kdexd.github.io/virtex/virtex/usage/setup_dependencies.html -[3]: https://kdexd.github.io/virtex/virtex/usage/model_zoo.html -[4]: https://kdexd.github.io/virtex/virtex/usage/pretrain.html -[5]: https://kdexd.github.io/virtex/virtex/usage/downstream.html diff --git a/virtex/virtex/__init__.py b/virtex/__init__.py similarity index 100% rename from virtex/virtex/__init__.py rename to virtex/__init__.py diff --git a/virtex/virtex/config.py b/virtex/config.py similarity index 100% rename from virtex/virtex/config.py rename to virtex/config.py diff --git a/virtex/configs/_base_bicaptioning_R_50_L1_H1024.yaml b/virtex/configs/_base_bicaptioning_R_50_L1_H1024.yaml deleted file mode 100644 index ab40b92b6560f88547d3e12952c79a4fa71448f8..0000000000000000000000000000000000000000 --- a/virtex/configs/_base_bicaptioning_R_50_L1_H1024.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# ----------------------------------------------------------------------------- -# Base config: VirTex pretraining for our "base" bicaptioning model: -# ResNet-50 + (L = 1, H = 1024) transformer trained for 500K iterations. 
-# ----------------------------------------------------------------------------- -RANDOM_SEED: 0 -AMP: true -CUDNN_BENCHMARK: true -CUDNN_DETERMINISTIC: false - -DATA: - ROOT: "datasets/coco" - TOKENIZER_MODEL: "datasets/vocab/coco_10k.model" - VOCAB_SIZE: 10000 - UNK_INDEX: 0 - SOS_INDEX: 1 - EOS_INDEX: 2 - MASK_INDEX: 3 - - IMAGE_CROP_SIZE: 224 - MAX_CAPTION_LENGTH: 30 - - IMAGE_TRANSFORM_TRAIN: - - "random_resized_crop" - - "horizontal_flip" - - "color_jitter" - - "normalize" - - IMAGE_TRANSFORM_VAL: - - "smallest_resize" - - "center_crop" - - "normalize" - - USE_PERCENTAGE: 100.0 - USE_SINGLE_CAPTION: false - -MODEL: - NAME: "virtex" - VISUAL: - NAME: "torchvision::resnet50" - PRETRAINED: false - FROZEN: false - TEXTUAL: - NAME: "transdec_postnorm::L1_H1024_A16_F4096" - DROPOUT: 0.1 - -OPTIM: - OPTIMIZER_NAME: "sgd" - SGD_MOMENTUM: 0.9 - WEIGHT_DECAY: 0.0001 - - LOOKAHEAD: - USE: true - ALPHA: 0.5 - STEPS: 5 - - BATCH_SIZE: 256 - CNN_LR: 0.2 - LR: 0.001 - NUM_ITERATIONS: 500000 - - WARMUP_STEPS: 10000 - LR_DECAY_NAME: "cosine" - - NO_DECAY: ".*textual.(embedding|transformer).*(norm.*|bias)" - CLIP_GRAD_NORM: 10.0 - diff --git a/virtex/configs/backbone_ablations/bicaptioning_R_101_L1_H1024.yaml b/virtex/configs/backbone_ablations/bicaptioning_R_101_L1_H1024.yaml deleted file mode 100644 index 3db670cd5b9e4bfa0f1da6c668b7cf90cf80d23d..0000000000000000000000000000000000000000 --- a/virtex/configs/backbone_ablations/bicaptioning_R_101_L1_H1024.yaml +++ /dev/null @@ -1,5 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" - -MODEL: - VISUAL: - NAME: "torchvision::resnet101" diff --git a/virtex/configs/backbone_ablations/bicaptioning_R_50W2X_L1_H1024.yaml b/virtex/configs/backbone_ablations/bicaptioning_R_50W2X_L1_H1024.yaml deleted file mode 100644 index e89bb9e3cdb3ceacbc94ad10829b0b5d4c409d34..0000000000000000000000000000000000000000 --- a/virtex/configs/backbone_ablations/bicaptioning_R_50W2X_L1_H1024.yaml +++ /dev/null @@ -1,5 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" - -MODEL: - VISUAL: - NAME: "torchvision::wide_resnet50_2" diff --git a/virtex/configs/backbone_ablations/bicaptioning_R_50_L1_H1024.yaml b/virtex/configs/backbone_ablations/bicaptioning_R_50_L1_H1024.yaml deleted file mode 100644 index 3d004bb1a991185d067b68d361e854273cb2738a..0000000000000000000000000000000000000000 --- a/virtex/configs/backbone_ablations/bicaptioning_R_50_L1_H1024.yaml +++ /dev/null @@ -1 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" diff --git a/virtex/configs/depth_ablations/bicaptioning_R_50_L1_H1024.yaml b/virtex/configs/depth_ablations/bicaptioning_R_50_L1_H1024.yaml deleted file mode 100644 index 3d004bb1a991185d067b68d361e854273cb2738a..0000000000000000000000000000000000000000 --- a/virtex/configs/depth_ablations/bicaptioning_R_50_L1_H1024.yaml +++ /dev/null @@ -1 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" diff --git a/virtex/configs/depth_ablations/bicaptioning_R_50_L2_H1024.yaml b/virtex/configs/depth_ablations/bicaptioning_R_50_L2_H1024.yaml deleted file mode 100644 index 7c391a26bbce5217484cd41bbadc37ce9a6b0309..0000000000000000000000000000000000000000 --- a/virtex/configs/depth_ablations/bicaptioning_R_50_L2_H1024.yaml +++ /dev/null @@ -1,5 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" - -MODEL: - TEXTUAL: - NAME: "transdec_postnorm::L2_H1024_A16_F4096" diff --git a/virtex/configs/depth_ablations/bicaptioning_R_50_L3_H1024.yaml b/virtex/configs/depth_ablations/bicaptioning_R_50_L3_H1024.yaml deleted file mode 100644 index 
aeb89ca98d97cdff802f1800eb32531357781177..0000000000000000000000000000000000000000 --- a/virtex/configs/depth_ablations/bicaptioning_R_50_L3_H1024.yaml +++ /dev/null @@ -1,5 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" - -MODEL: - TEXTUAL: - NAME: "transdec_postnorm::L3_H1024_A16_F4096" diff --git a/virtex/configs/depth_ablations/bicaptioning_R_50_L4_H1024.yaml b/virtex/configs/depth_ablations/bicaptioning_R_50_L4_H1024.yaml deleted file mode 100644 index 6bde4aca414e76c89243311916bf00bdacbafac2..0000000000000000000000000000000000000000 --- a/virtex/configs/depth_ablations/bicaptioning_R_50_L4_H1024.yaml +++ /dev/null @@ -1,5 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" - -MODEL: - TEXTUAL: - NAME: "transdec_postnorm::L4_H1024_A16_F4096" diff --git a/virtex/configs/detectron2/_base_faster_rcnn_R_50_C4_BN.yaml b/virtex/configs/detectron2/_base_faster_rcnn_R_50_C4_BN.yaml deleted file mode 100644 index 639cb01322588d5f7329d792964847413259e60f..0000000000000000000000000000000000000000 --- a/virtex/configs/detectron2/_base_faster_rcnn_R_50_C4_BN.yaml +++ /dev/null @@ -1,49 +0,0 @@ -# ---------------------------------------------------------------------------- -# Train a Faster R-CNN with ResNet-50 and C4 backbone. This config follows -# Detectron2 format; and is unrelated with our VirTex configs. Params here -# replicate evaluation protocol as per MoCo (https://arxiv.org/abs/1911.05722). -# ---------------------------------------------------------------------------- - -INPUT: - # Input format will always be RGB, consistent with torchvision. - FORMAT: "RGB" - MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) - MIN_SIZE_TEST: 800 - -MODEL: - META_ARCHITECTURE: "GeneralizedRCNN" - - # Train all layers end-to-end by default. - BACKBONE: - NAME: build_resnet_backbone - FREEZE_AT: 0 - - # Fine-tune with SyncBN. - # STRIDE_IN_1X1 is False for torchvision-like models. - RESNETS: - DEPTH: 50 - NORM: SyncBN - STRIDE_IN_1X1: False - - RPN: - PRE_NMS_TOPK_TEST: 6000 - POST_NMS_TOPK_TEST: 1000 - - # ROI head with extra BN layer after res5 stage. - ROI_HEADS: - NAME: "Res5ROIHeadsExtraNorm" - - # ImageNet color mean for torchvision-like models (RGB order). - PIXEL_MEAN: [123.675, 116.280, 103.530] - PIXEL_STD: [58.395, 57.120, 57.375] - -SOLVER: - # This is for 8 GPUs, apply linear scaling for 4 GPUs. - IMS_PER_BATCH: 16 - BASE_LR: 0.02 - -TEST: - PRECISE_BN: - ENABLED: True - -VERSION: 2 diff --git a/virtex/configs/detectron2/_base_mask_rcnn_R_50_FPN.yaml b/virtex/configs/detectron2/_base_mask_rcnn_R_50_FPN.yaml deleted file mode 100644 index efb1f40f6c5c13ea95f4b3cb758bc20ef42983c1..0000000000000000000000000000000000000000 --- a/virtex/configs/detectron2/_base_mask_rcnn_R_50_FPN.yaml +++ /dev/null @@ -1,75 +0,0 @@ -# ---------------------------------------------------------------------------- -# Train a Mask R-CNN with ResNet-50 and FPN backbone. This config follows -# Detectron2 format; and is unrelated with our VirTex configs. Params here -# replicate evaluation protocol as per MoCo (https://arxiv.org/abs/1911.05722). -# ---------------------------------------------------------------------------- - -INPUT: - # Input format will always be RGB, consistent with torchvision. - FORMAT: "RGB" - MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) - MIN_SIZE_TEST: 800 - -MODEL: - META_ARCHITECTURE: "GeneralizedRCNN" - - # Train all layers end-to-end by default. - BACKBONE: - NAME: "build_resnet_fpn_backbone" - FREEZE_AT: 0 - - # Fine-tune with SyncBN. 
- # STRIDE_IN_1X1 is False for torchvision-like models. - RESNETS: - DEPTH: 50 - NORM: "SyncBN" - STRIDE_IN_1X1: False - OUT_FEATURES: ["res2", "res3", "res4", "res5"] - - FPN: - IN_FEATURES: ["res2", "res3", "res4", "res5"] - - ANCHOR_GENERATOR: - # One size for each in feature map - SIZES: [[32], [64], [128], [256], [512]] - # Three aspect ratios (same for all in feature maps) - ASPECT_RATIOS: [[0.5, 1.0, 2.0]] - - RPN: - IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] - PRE_NMS_TOPK_TRAIN: 2000 - PRE_NMS_TOPK_TEST: 1000 - - POST_NMS_TOPK_TRAIN: 1000 - POST_NMS_TOPK_TEST: 1000 - - ROI_HEADS: - NAME: "StandardROIHeads" - IN_FEATURES: ["p2", "p3", "p4", "p5"] - - ROI_BOX_HEAD: - NAME: "FastRCNNConvFCHead" - NUM_FC: 2 - POOLER_RESOLUTION: 7 - - ROI_MASK_HEAD: - NAME: "MaskRCNNConvUpsampleHead" - NUM_CONV: 4 - POOLER_RESOLUTION: 14 - - # ImageNet color mean for torchvision-like models (RGB order). - # These are in [0-255] range as expected by Detectron2. Rest of our codebase - # uses [0-1] range; but both are equivalent and consistent. - PIXEL_MEAN: [123.675, 116.280, 103.530] - PIXEL_STD: [58.395, 57.120, 57.375] - -SOLVER: - # This is for 8 GPUs, apply linear scaling for 4 GPUs. - IMS_PER_BATCH: 16 - BASE_LR: 0.02 - -TEST: - PRECISE_BN: - ENABLED: True - -VERSION: 2 diff --git a/virtex/configs/detectron2/coco_segm_default_init_2x.yaml b/virtex/configs/detectron2/coco_segm_default_init_2x.yaml deleted file mode 100644 index 4d897fa532405d753ee3e9396616831326c89404..0000000000000000000000000000000000000000 --- a/virtex/configs/detectron2/coco_segm_default_init_2x.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# ----------------------------------------------------------------------------- -# Train a Mask R-CNN R50-FPN backbone on LVIS instance segmentation with any of -# these weight init: random, imagenet (torchvision), virtex or MoCo. -# ----------------------------------------------------------------------------- -_BASE_: "_base_mask_rcnn_R_50_FPN.yaml" - -DATASETS: - TRAIN: ("coco_2017_train",) - TEST: ("coco_2017_val",) - -MODEL: - MASK_ON: True - # FPN also has SyncBN, as opposed to no norm (usually). - FPN: - NORM: "SyncBN" - - # This will be ignored, weights will be loaded manually in the script. - WEIGHTS: "" - -SOLVER: - STEPS: (120000, 160000) - MAX_ITER: 180000 - -VERSION: 2 diff --git a/virtex/configs/detectron2/lvis_segm_default_init_2x.yaml b/virtex/configs/detectron2/lvis_segm_default_init_2x.yaml deleted file mode 100644 index f1df4dc373e03aff0ae3fbcbd783329ec485d605..0000000000000000000000000000000000000000 --- a/virtex/configs/detectron2/lvis_segm_default_init_2x.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# ----------------------------------------------------------------------------- -# Train a Mask R-CNN R50-FPN backbone on LVIS instance segmentation with any of -# these weight init: random, virtex or MoCo. (ImageNet init config is separate) -# ----------------------------------------------------------------------------- -_BASE_: "_base_mask_rcnn_R_50_FPN.yaml" - -DATASETS: - TRAIN: ("lvis_v1_train",) - TEST: ("lvis_v1_val",) - -DATALOADER: - SAMPLER_TRAIN: "RepeatFactorTrainingSampler" - REPEAT_THRESHOLD: 0.001 - -TEST: - DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300. - -MODEL: - MASK_ON: True - # FPN also has SyncBN, as opposed to no norm (usually). - FPN: - NORM: "SyncBN" - - ROI_HEADS: - NUM_CLASSES: 1203 - SCORE_THRESH_TEST: 0.0001 - - # This will be ignored, weights will be loaded manually in the script. 
- WEIGHTS: "" - -SOLVER: - STEPS: (120000, 160000) - MAX_ITER: 180000 - -VERSION: 2 - diff --git a/virtex/configs/detectron2/lvis_segm_imagenet_init_2x.yaml b/virtex/configs/detectron2/lvis_segm_imagenet_init_2x.yaml deleted file mode 100644 index 5751f83ddf67bb0bb7b9bc1b6e992b72676fceea..0000000000000000000000000000000000000000 --- a/virtex/configs/detectron2/lvis_segm_imagenet_init_2x.yaml +++ /dev/null @@ -1,38 +0,0 @@ -# ----------------------------------------------------------------------------- -# Train a Mask R-CNN R50-FPN backbone on LVIS instance segmentation -# with weights initialized from supervised ImageNet pretraining (torchvision). -# Key difference is that fine-tuning here happens with BN frozen. -# ----------------------------------------------------------------------------- -_BASE_: "_base_mask_rcnn_R_50_FPN.yaml" - -DATASETS: - TRAIN: ("lvis_v1_train",) - TEST: ("lvis_v1_val",) - -DATALOADER: - SAMPLER_TRAIN: "RepeatFactorTrainingSampler" - REPEAT_THRESHOLD: 0.001 - -TEST: - DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300. - -MODEL: - MASK_ON: True - RESNETS: - NORM: "FrozenBN" - - # Do not tune with SyncBN for ImageNet init from LVIS. - ROI_HEADS: - NUM_CLASSES: 1203 - SCORE_THRESH_TEST: 0.0001 - - # This will be ignored, weights will be loaded manually in the script. - WEIGHTS: "" - -SOLVER: - STEPS: (120000, 160000) - MAX_ITER: 180000 - -VERSION: 2 - - diff --git a/virtex/configs/detectron2/voc_det_default_init_24k.yaml b/virtex/configs/detectron2/voc_det_default_init_24k.yaml deleted file mode 100644 index 97b9fdad1305dec93504f4b288adab44f2239fb9..0000000000000000000000000000000000000000 --- a/virtex/configs/detectron2/voc_det_default_init_24k.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# ----------------------------------------------------------------------------- -# Train a Faster R-CNN with R50-C4 backbone on VOC07+12 detection with any of -# these weight init: random, imagenet (torchvision), virtex or MoCo. -# ----------------------------------------------------------------------------- -_BASE_: "_base_faster_rcnn_R_50_C4_BN.yaml" - -DATASETS: - TRAIN: ("voc_2007_trainval", "voc_2012_trainval") - TEST: ("voc_2007_test",) - -INPUT: - MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) - MIN_SIZE_TEST: 800 - -MODEL: - MASK_ON: False - ROI_HEADS: - NUM_CLASSES: 20 - - # This will be ignored, weights will be loaded manually in the script. - WEIGHTS: "" - -SOLVER: - STEPS: (18000, 22000) - MAX_ITER: 24000 - WARMUP_ITERS: 100 - -VERSION: 2 diff --git a/virtex/configs/downstream/imagenet_clf.yaml b/virtex/configs/downstream/imagenet_clf.yaml deleted file mode 100644 index 895de3f251ea76945fa483f3b75f69a2303b47c2..0000000000000000000000000000000000000000 --- a/virtex/configs/downstream/imagenet_clf.yaml +++ /dev/null @@ -1,33 +0,0 @@ -RANDOM_SEED: 0 -# Don't need AMP to train a tiny linear layer. 
-AMP: false -CUDNN_BENCHMARK: true -CUDNN_DETERMINISTIC: false - -DATA: - ROOT: "datasets/imagenet" - IMAGE_TRANSFORM_TRAIN: - - "random_resized_crop::{'scale': (0.08, 1.0)}" - - "horizontal_flip" - - "normalize" - IMAGE_TRANSFORM_VAL: - - "smallest_resize" - - "center_crop" - - "normalize" - -MODEL: - VISUAL: - FROZEN: true - -OPTIM: - BATCH_SIZE: 256 - SGD_MOMENTUM: 0.9 - WEIGHT_DECAY: 0.0 - NO_DECAY: "none" - LOOKAHEAD: - USE: false - - LR: 0.3 - WARMUP_STEPS: 0 - LR_DECAY_NAME: "cosine" - NUM_ITERATIONS: 500500 # 100 epochs diff --git a/virtex/configs/downstream/inaturalist_clf.yaml b/virtex/configs/downstream/inaturalist_clf.yaml deleted file mode 100644 index eeb5b13ce31e4ba79918a881fcd23528b0f0c905..0000000000000000000000000000000000000000 --- a/virtex/configs/downstream/inaturalist_clf.yaml +++ /dev/null @@ -1,36 +0,0 @@ -RANDOM_SEED: 0 -AMP: true -CUDNN_BENCHMARK: true -CUDNN_DETERMINISTIC: false - -DATA: - ROOT: "datasets/inaturalist" - IMAGE_TRANSFORM_TRAIN: - - "random_resized_crop::{'scale': (0.08, 1.0)}" - - "horizontal_flip" - - "normalize" - IMAGE_TRANSFORM_VAL: - - "smallest_resize" - - "center_crop" - - "normalize" - -MODEL: - VISUAL: - FROZEN: false - -OPTIM: - BATCH_SIZE: 256 - SGD_MOMENTUM: 0.9 - WEIGHT_DECAY: 0.0001 - NO_DECAY: "none" - LOOKAHEAD: - USE: false - - LR: 0.025 - WARMUP_STEPS: 0 - LR_DECAY_NAME: multistep - LR_GAMMA: 0.1 - LR_STEPS: - - 119700 # 70 epochs - - 153900 # 90 epochs - NUM_ITERATIONS: 171000 # 100 epochs diff --git a/virtex/configs/downstream/voc07_clf.yaml b/virtex/configs/downstream/voc07_clf.yaml deleted file mode 100644 index ac3b029e3969662ca0811b87a2224fcfb6bd7ac0..0000000000000000000000000000000000000000 --- a/virtex/configs/downstream/voc07_clf.yaml +++ /dev/null @@ -1,15 +0,0 @@ -RANDOM_SEED: 0 -DATA: - ROOT: datasets/VOC2007 - IMAGE_TRANSFORM_TRAIN: - - smallest_resize - - center_crop - - normalize - IMAGE_TRANSFORM_VAL: - - smallest_resize - - center_crop - - normalize - -OPTIM: - # Only used for feature extraction, doesn't mean much. 
- BATCH_SIZE: 128 diff --git a/virtex/configs/redcaps/gcc_R_50_L6_H512.yaml b/virtex/configs/redcaps/gcc_R_50_L6_H512.yaml deleted file mode 100644 index 2b5d9ae7621dad07390d889ef685d102e387ad3f..0000000000000000000000000000000000000000 --- a/virtex/configs/redcaps/gcc_R_50_L6_H512.yaml +++ /dev/null @@ -1,35 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" - -AMP: True - -DATA: - ROOT: "datasets/gcc/tarfiles/*.tar" - TOKENIZER_MODEL: "datasets/vocab/common_30k.model" - VOCAB_SIZE: 30000 - UNK_INDEX: 0 - SOS_INDEX: 1 - EOS_INDEX: 2 - MASK_INDEX: 3 - - MAX_CAPTION_LENGTH: 50 - -MODEL: - NAME: "virtex_web" - TEXTUAL: - NAME: "transdec_prenorm::L6_H512_A8_F2048" - - LABEL_SMOOTHING: 0.1 - -OPTIM: - OPTIMIZER_NAME: "adamw" - WEIGHT_DECAY: 0.01 - LOOKAHEAD: - USE: false - - BATCH_SIZE: 256 - CNN_LR: 0.0005 - LR: 0.0005 - NUM_ITERATIONS: 1500000 - - WARMUP_STEPS: 10000 - LR_DECAY_NAME: "cosine" diff --git a/virtex/configs/redcaps/miniclip_sbu_R_50_L12_H512.yaml b/virtex/configs/redcaps/miniclip_sbu_R_50_L12_H512.yaml deleted file mode 100644 index 2ebc42885bc1348a9883947076b8db95d7ed4677..0000000000000000000000000000000000000000 --- a/virtex/configs/redcaps/miniclip_sbu_R_50_L12_H512.yaml +++ /dev/null @@ -1,35 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" - -AMP: True - -DATA: - ROOT: "datasets/sbu/tarfiles/*.tar" - TOKENIZER_MODEL: "datasets/vocab/common_30k.model" - VOCAB_SIZE: 30000 - UNK_INDEX: 0 - SOS_INDEX: 1 - EOS_INDEX: 2 - MASK_INDEX: 3 - - MAX_CAPTION_LENGTH: 50 - -MODEL: - NAME: "miniclip_web" - TEXTUAL: - NAME: "transenc_prenorm::L12_H512_A8_F2048" - LABEL_SMOOTHING: 0.1 - -OPTIM: - OPTIMIZER_NAME: "adamw" - WEIGHT_DECAY: 0.01 - - LOOKAHEAD: - USE: false - - BATCH_SIZE: 256 - CNN_LR: 0.0005 - LR: 0.0005 - NUM_ITERATIONS: 1500000 - - WARMUP_STEPS: 10000 - LR_DECAY_NAME: "cosine" diff --git a/virtex/configs/redcaps/redcaps_2020_R_50_L6_H512.yaml b/virtex/configs/redcaps/redcaps_2020_R_50_L6_H512.yaml deleted file mode 100644 index 2c95e9f507df982f448c4c47898c9d2bb70bfb6f..0000000000000000000000000000000000000000 --- a/virtex/configs/redcaps/redcaps_2020_R_50_L6_H512.yaml +++ /dev/null @@ -1,35 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" - -AMP: True - -DATA: - ROOT: "datasets/redcaps/tarfiles/*_2020_*.tar" - TOKENIZER_MODEL: "datasets/vocab/common_30k.model" - VOCAB_SIZE: 30000 - UNK_INDEX: 0 - SOS_INDEX: 1 - EOS_INDEX: 2 - MASK_INDEX: 3 - - MAX_CAPTION_LENGTH: 50 - -MODEL: - NAME: "virtex_web" - TEXTUAL: - NAME: "transdec_prenorm::L6_H512_A8_F2048" - LABEL_SMOOTHING: 0.1 - -OPTIM: - OPTIMIZER_NAME: "adamw" - WEIGHT_DECAY: 0.01 - - LOOKAHEAD: - USE: false - - BATCH_SIZE: 256 - CNN_LR: 0.0005 - LR: 0.0005 - NUM_ITERATIONS: 1500000 - - WARMUP_STEPS: 10000 - LR_DECAY_NAME: "cosine" diff --git a/virtex/configs/redcaps/redcaps_all_R_50_L6_H512.yaml b/virtex/configs/redcaps/redcaps_all_R_50_L6_H512.yaml deleted file mode 100644 index e5249782d69c8ce79d68f15e9a7fd06e83b06ae9..0000000000000000000000000000000000000000 --- a/virtex/configs/redcaps/redcaps_all_R_50_L6_H512.yaml +++ /dev/null @@ -1,35 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" - -AMP: True - -DATA: - ROOT: "datasets/redcaps/tarfiles/*.tar" - TOKENIZER_MODEL: "datasets/vocab/common_30k.model" - VOCAB_SIZE: 30000 - UNK_INDEX: 0 - SOS_INDEX: 1 - EOS_INDEX: 2 - MASK_INDEX: 3 - - MAX_CAPTION_LENGTH: 50 - -MODEL: - NAME: "virtex_web" - TEXTUAL: - NAME: "transdec_prenorm::L6_H512_A8_F2048" - LABEL_SMOOTHING: 0.1 - -OPTIM: - OPTIMIZER_NAME: "adamw" - WEIGHT_DECAY: 0.01 - - 
LOOKAHEAD: - USE: false - - BATCH_SIZE: 256 - CNN_LR: 0.0005 - LR: 0.0005 - NUM_ITERATIONS: 1500000 - - WARMUP_STEPS: 10000 - LR_DECAY_NAME: "cosine" diff --git a/virtex/configs/redcaps/sbu_R_50_L6_H512.yaml b/virtex/configs/redcaps/sbu_R_50_L6_H512.yaml deleted file mode 100644 index 1834f85ebfcfcf7a11361d95e99efc651577f9dd..0000000000000000000000000000000000000000 --- a/virtex/configs/redcaps/sbu_R_50_L6_H512.yaml +++ /dev/null @@ -1,35 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" - -AMP: True - -DATA: - ROOT: "datasets/sbu/tarfiles/*.tar" - TOKENIZER_MODEL: "datasets/vocab/common_30k.model" - VOCAB_SIZE: 30000 - UNK_INDEX: 0 - SOS_INDEX: 1 - EOS_INDEX: 2 - MASK_INDEX: 3 - - MAX_CAPTION_LENGTH: 50 - -MODEL: - NAME: "virtex_web" - TEXTUAL: - NAME: "transdec_prenorm::L6_H512_A8_F2048" - LABEL_SMOOTHING: 0.1 - -OPTIM: - OPTIMIZER_NAME: "adamw" - WEIGHT_DECAY: 0.01 - - LOOKAHEAD: - USE: false - - BATCH_SIZE: 256 - CNN_LR: 0.0005 - LR: 0.0005 - NUM_ITERATIONS: 1500000 - - WARMUP_STEPS: 10000 - LR_DECAY_NAME: "cosine" diff --git a/virtex/configs/task_ablations/bicaptioning_R_50_L1_H2048.yaml b/virtex/configs/task_ablations/bicaptioning_R_50_L1_H2048.yaml deleted file mode 100644 index a132a630675820261f09afcf3128b9684034c630..0000000000000000000000000000000000000000 --- a/virtex/configs/task_ablations/bicaptioning_R_50_L1_H2048.yaml +++ /dev/null @@ -1,5 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" - -MODEL: - TEXTUAL: - NAME: "transdec_postnorm::L1_H2048_A32_F8192" diff --git a/virtex/configs/task_ablations/captioning_R_50_L1_H2048.yaml b/virtex/configs/task_ablations/captioning_R_50_L1_H2048.yaml deleted file mode 100644 index 82f159f9203cbd323c71e183b44630f1b44c558d..0000000000000000000000000000000000000000 --- a/virtex/configs/task_ablations/captioning_R_50_L1_H2048.yaml +++ /dev/null @@ -1,6 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" - -MODEL: - NAME: "captioning" - TEXTUAL: - NAME: "transdec_postnorm::L1_H2048_A32_F8192" diff --git a/virtex/configs/task_ablations/masked_lm_R_50_L1_H2048.yaml b/virtex/configs/task_ablations/masked_lm_R_50_L1_H2048.yaml deleted file mode 100644 index 14a11155ae1d2aadfcc41f56bbf580a890a8e83b..0000000000000000000000000000000000000000 --- a/virtex/configs/task_ablations/masked_lm_R_50_L1_H2048.yaml +++ /dev/null @@ -1,6 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" - -MODEL: - NAME: "masked_lm" - TEXTUAL: - NAME: "transdec_postnorm::L1_H2048_A32_F8192" diff --git a/virtex/configs/task_ablations/multilabel_classification_R_50.yaml b/virtex/configs/task_ablations/multilabel_classification_R_50.yaml deleted file mode 100644 index f8768b9c34bcc6c8078b7324062246f73eb346b0..0000000000000000000000000000000000000000 --- a/virtex/configs/task_ablations/multilabel_classification_R_50.yaml +++ /dev/null @@ -1,12 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" - -DATA: - VOCAB_SIZE: 81 - -MODEL: - NAME: "multilabel_classification" - TEXTUAL: - NAME: "none" - -OPTIM: - NO_DECAY: "none" diff --git a/virtex/configs/task_ablations/token_classification_R_50.yaml b/virtex/configs/task_ablations/token_classification_R_50.yaml deleted file mode 100644 index c31ee4d08300c4033231003c0940bb4940276073..0000000000000000000000000000000000000000 --- a/virtex/configs/task_ablations/token_classification_R_50.yaml +++ /dev/null @@ -1,9 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" - -MODEL: - NAME: "token_classification" - TEXTUAL: - NAME: "none" - -OPTIM: - NO_DECAY: "none" diff --git 
a/virtex/configs/width_ablations/bicaptioning_R_50_L1_H1024.yaml b/virtex/configs/width_ablations/bicaptioning_R_50_L1_H1024.yaml deleted file mode 100644 index 3d004bb1a991185d067b68d361e854273cb2738a..0000000000000000000000000000000000000000 --- a/virtex/configs/width_ablations/bicaptioning_R_50_L1_H1024.yaml +++ /dev/null @@ -1 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" diff --git a/virtex/configs/width_ablations/bicaptioning_R_50_L1_H2048.yaml b/virtex/configs/width_ablations/bicaptioning_R_50_L1_H2048.yaml deleted file mode 100644 index a132a630675820261f09afcf3128b9684034c630..0000000000000000000000000000000000000000 --- a/virtex/configs/width_ablations/bicaptioning_R_50_L1_H2048.yaml +++ /dev/null @@ -1,5 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" - -MODEL: - TEXTUAL: - NAME: "transdec_postnorm::L1_H2048_A32_F8192" diff --git a/virtex/configs/width_ablations/bicaptioning_R_50_L1_H512.yaml b/virtex/configs/width_ablations/bicaptioning_R_50_L1_H512.yaml deleted file mode 100644 index 0b23d0c5ebcc2aae31e599ebd7bd49c923e4fe23..0000000000000000000000000000000000000000 --- a/virtex/configs/width_ablations/bicaptioning_R_50_L1_H512.yaml +++ /dev/null @@ -1,5 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" - -MODEL: - TEXTUAL: - NAME: "transdec_postnorm::L1_H512_A8_F2048" diff --git a/virtex/configs/width_ablations/bicaptioning_R_50_L1_H768.yaml b/virtex/configs/width_ablations/bicaptioning_R_50_L1_H768.yaml deleted file mode 100644 index 7882e204fba9f05febc204d7b053d8cb4dfe344f..0000000000000000000000000000000000000000 --- a/virtex/configs/width_ablations/bicaptioning_R_50_L1_H768.yaml +++ /dev/null @@ -1,5 +0,0 @@ -_BASE_: "../_base_bicaptioning_R_50_L1_H1024.yaml" - -MODEL: - TEXTUAL: - NAME: "transdec_postnorm::L1_H768_A12_F3072" diff --git a/virtex/virtex/data/__init__.py b/virtex/data/__init__.py similarity index 100% rename from virtex/virtex/data/__init__.py rename to virtex/data/__init__.py diff --git a/virtex/virtex/data/datasets/captioning.py b/virtex/data/datasets/captioning.py similarity index 100% rename from virtex/virtex/data/datasets/captioning.py rename to virtex/data/datasets/captioning.py diff --git a/virtex/virtex/data/datasets/classification.py b/virtex/data/datasets/classification.py similarity index 100% rename from virtex/virtex/data/datasets/classification.py rename to virtex/data/datasets/classification.py diff --git a/virtex/virtex/data/datasets/downstream.py b/virtex/data/datasets/downstream.py similarity index 100% rename from virtex/virtex/data/datasets/downstream.py rename to virtex/data/datasets/downstream.py diff --git a/virtex/virtex/data/datasets/masked_lm.py b/virtex/data/datasets/masked_lm.py similarity index 100% rename from virtex/virtex/data/datasets/masked_lm.py rename to virtex/data/datasets/masked_lm.py diff --git a/virtex/virtex/data/datasets/redcaps.py b/virtex/data/datasets/redcaps.py similarity index 100% rename from virtex/virtex/data/datasets/redcaps.py rename to virtex/data/datasets/redcaps.py diff --git a/virtex/virtex/data/datasets/zero_shot.py b/virtex/data/datasets/zero_shot.py similarity index 100% rename from virtex/virtex/data/datasets/zero_shot.py rename to virtex/data/datasets/zero_shot.py diff --git a/virtex/virtex/data/readers.py b/virtex/data/readers.py similarity index 100% rename from virtex/virtex/data/readers.py rename to virtex/data/readers.py diff --git a/virtex/virtex/data/tokenizers.py b/virtex/data/tokenizers.py similarity index 100% rename from 
virtex/virtex/data/tokenizers.py rename to virtex/data/tokenizers.py diff --git a/virtex/virtex/data/transforms.py b/virtex/data/transforms.py similarity index 100% rename from virtex/virtex/data/transforms.py rename to virtex/data/transforms.py diff --git a/virtex/docs/Makefile b/virtex/docs/Makefile deleted file mode 100644 index a33ba2ab28931acc202130e69db1104b883fb578..0000000000000000000000000000000000000000 --- a/virtex/docs/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -SOURCEDIR = . -BUILDDIR = ../../virtex-sphinx - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/virtex/docs/_static/custom.css b/virtex/docs/_static/custom.css deleted file mode 100644 index 02df8f2a58a2a283273083b58a1cb9d0ac10f7f7..0000000000000000000000000000000000000000 --- a/virtex/docs/_static/custom.css +++ /dev/null @@ -1,115 +0,0 @@ -body { - padding: 40px 0 0 0; - font-size: 12pt; - font-family: Inconsolata !important; -} - -/* Monospace everywhere */ -h1, h2, h3, h4, div.sphinxsidebar h1, div.sphinxsidebar h2, -div.sphinxsidebar h3, div.sphinxsidebar h4, div.body h1, -div.body h2, div.body h3, div.body h4, .admonition-title { - font-family: monospace !important; -} - -/* Make main content wider */ -div.document { - margin: auto; - width: 65%; -} - -/* Make sidebar slightly wider. */ -div.sphinxsidebar { - width: 250px; -} - -div.bodywrapper { - margin: 0 0 0 250px; -} - -div.body { - color: black; - max-width: 100% -} - -/* Darker headings */ -h1, h2, h3, h4, div.sphinxsidebar h1, div.sphinxsidebar h2, -div.sphinxsidebar h3, div.sphinxsidebar h4, div.body h1, -div.body h2, div.body h3, div.body h4 { - color: black; -} - -@media screen and (max-width: 875px) { - div.sphinxsidebar { - background-color: white; - } -} - -/* Darker bold words */ -strong { - color: #252525; -} - -/* TOC tree tag, view source link & permalink anchor styling. */ -div.sphinxsidebar a, .viewcode-link, a.reference { - color: darkgreen; - text-decoration: none; - border-bottom: 1px dashed green; - text-underline-position: under; -} -a.headerlink { - color: black; -} - -/* TOC tree tag, view source link & permalink anchor styling. */ -div.sphinxsidebar a:hover, .viewcode-link:hover, a.reference:hover, -a.headerlink:hover { - font-weight: 700; - border-bottom: 1px solid green; -} - -/* Add a light background to class signatures. */ -dl.class > dt:first-of-type, dl.function > dt:first-of-type, -dl.method > dt:first-of-type, dl.classmethod > dt:first-of-type, -dl.attribute > dt:first-of-type, dl.data > dt:first-of-type { - font-size: 14pt; - background-color: #d8f6e9; - padding: 10px 20px 10px 10px; - border: 1px solid #1b5e20; -} - -/* Add lightgrey background to code snippets. */ -pre { - background-color: #eeeeee !important; - border: 1pt solid #999999; - border-radius: 5px; -} - -/* Dark orange-red comments in code snippets. 
*/ -.highlight .c1 { - color: #dd4533; -} - -.admonition, .note { - background-color: #fed8b1 !important; - border: 1pt solid #ff7700; - border-radius: 5px; -} - -/* Make "Parameters" subsection wider - display heading and content vertically. */ -dl.field-list { - display: block; -} - -/* Increase font size of subsection headings ("Parameters", "Examples" etc.) */ -.rubric, dl.field-list > dt.field-odd, dl.field-list > dt.field-even { - color: black; - font-size: 18pt; - font-weight: bold; - padding: 0px; - margin: 20px 0px 20px 0px; -} - -/* Add margins around methods and properties. */ -.py { - margin: 20px 0px 20px 0px; -} diff --git a/virtex/docs/_static/system_figure.jpg b/virtex/docs/_static/system_figure.jpg deleted file mode 100644 index 8ac5d31c263df121e1a005d141390dee0a6b344a..0000000000000000000000000000000000000000 Binary files a/virtex/docs/_static/system_figure.jpg and /dev/null differ diff --git a/virtex/docs/_templates/layout.html b/virtex/docs/_templates/layout.html deleted file mode 100644 index 66497fed98d5eb668d0781cbcc28e23147bd72bb..0000000000000000000000000000000000000000 --- a/virtex/docs/_templates/layout.html +++ /dev/null @@ -1,19 +0,0 @@ -{% extends "!layout.html" %} - -{% block htmltitle %} - - <!-- Global site tag (gtag.js) - Google Analytics --> - <script async src="https://www.googletagmanager.com/gtag/js?id=UA-120523111-2"></script> - <script> - window.dataLayer = window.dataLayer || []; - function gtag(){dataLayer.push(arguments);} - gtag('js', new Date()); - - gtag('config', 'UA-120523111-2'); - </script> - - <link href="https://fonts.googleapis.com/css?family=Inconsolata&display=swap" rel="stylesheet"> - <link href="https://fonts.googleapis.com/css?family=Ubuntu+Mono&display=swap" rel="stylesheet"> - -{{ super() }} -{% endblock %} diff --git a/virtex/docs/conf.py b/virtex/docs/conf.py deleted file mode 100644 index fdd9cafe341546765f5aa38074d79832655da75a..0000000000000000000000000000000000000000 --- a/virtex/docs/conf.py +++ /dev/null @@ -1,173 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# http://www.sphinx-doc.org/en/master/config - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -import inspect -import os -import sys - -sys.path.insert(0, os.path.abspath("../")) - - -# -- Project information ----------------------------------------------------- - -project = "virtex" -copyright = "2021, Karan Desai and Justin Johnson" -author = "Karan Desai" - -# The full version, including alpha/beta/rc tags -release = "1.1" - - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.coverage", - "sphinx.ext.doctest", - "sphinx.ext.linkcode", - "sphinx.ext.autosummary", - "sphinx.ext.coverage", - "sphinx.ext.intersphinx", - "sphinx.ext.mathjax", - "sphinx_copybutton", - "numpydoc", -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] - -# The suffix(es) of source filenames. 
-# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = ".rst" - -# The master toctree document. -master_doc = "index" - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# This version is used underneath the title on the index page. -version = "1.1" -# The following is used if you need to also include a more detailed version. -release = "1.1" - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = "en" - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ["_build"] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = "sphinx" - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False - -numpydoc_show_class_members = False - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = "alabaster" - -# html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# html_theme_options = {"collapse_navigation": False, "display_version": True} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". 
-html_static_path = ["_static"] - - -# -- Autodoc configuration ------------------------------------------------ - -autodoc_default_options = { - "members": True, - "member-order": "bysource", - "private-members": True, - "show-inheritance": True, -} - - -# -- Intersphinx configuration -------------------------------------------- - -intersphinx_mapping = { - "torch": ("https://pytorch.org/docs/stable/", None), - "albumentations": ("https://albumentations.readthedocs.io/en/latest/", None), -} - -# -- Miscellaneous Extra Tweaks ------------------------------------------- - -# make github links resolve -def linkcode_resolve(domain, info): - """ - Determine the URL corresponding to Python object - This code is from - https://github.com/numpy/numpy/blob/master/doc/source/conf.py#L290 - and https://github.com/Lasagne/Lasagne/pull/262 - """ - if domain != "py": - return None - - modname = info["module"] - fullname = info["fullname"] - - submod = sys.modules.get(modname) - if submod is None: - return None - - obj = submod - for part in fullname.split("."): - try: - obj = getattr(obj, part) - except: # noqa: E722 - return None - - try: - fn = inspect.getsourcefile(obj) - except: # noqa: E722 - fn = None - if not fn: - return None - - try: - source, lineno = inspect.getsourcelines(obj) - except: # noqa: E722 - lineno = None - - if lineno: - linespec = "#L%d-L%d" % (lineno, lineno + len(source) - 1) - else: - linespec = "" - - filename = info["module"].replace(".", "/") - return f"https://github.com/kdexd/virtex/blob/master/{filename}.py{linespec}" diff --git a/virtex/docs/index.rst b/virtex/docs/index.rst deleted file mode 100644 index f0866e98c0b79ebdbd962fb3f1ac2e595ddc2397..0000000000000000000000000000000000000000 --- a/virtex/docs/index.rst +++ /dev/null @@ -1,122 +0,0 @@ -.. raw:: html - - <h1 style="text-align: center"> - VirTex: Learning Visual Representations from Textual Annotations - </h1> - <h4 style="text-align: center"> - Karan Desai and Justin Johnson - </br> - <span style="font-size: 14pt; color: #555555"> - University of Michigan - </span> - </h4> - <hr> - - <h4 style="text-align: center"> - Abstract - </h4> - - <p style="text-align: justify"> - The de-facto approach to many vision tasks is to start from pretrained - visual representations, typically learned via supervised training on - ImageNet. Recent methods have explored unsupervised pretraining to scale to - vast quantities of unlabeled images. In contrast, we aim to learn - high-quality visual representations from fewer images. To this end we - revisit supervised pretraining, and seek data-efficient alternatives to - classification-based pretraining. We propose VirTex -- a pretraining - approach using semantically dense captions to learn visual representations. - We train convolutional networks from scratch on COCO Captions, and transfer - them to downstream recognition tasks including image classification, object - detection, and instance segmentation. On all tasks, VirTex yields features - that match or exceed those learned on ImageNet -- supervised or unsupervised - -- despite using up to ten times fewer images. - </p> - -**CVPR 2021. Paper available at:** `arxiv.org/abs/2006.06666 <https://arxiv.org/abs/2006.06666>`_. - -**Code available at:** `github.com/kdexd/virtex <https://github.com/kdexd/virtex>`_. - -.. image:: _static/system_figure.jpg - - -Get the pretrained ResNet-50 visual backbone from our best performing VirTex -model in one line *without any installation*! - -.. 
code-block:: python - - import torch - - # That's it, this one line only requires PyTorch. - model = torch.hub.load("kdexd/virtex", "resnet50", pretrained=True) - - -More details in :doc:`virtex/usage/model_zoo`. Next, dive deeper into our -code with User Guide and API References! - - -User Guide ----------- - -.. toctree:: - :maxdepth: 2 - - virtex/usage/setup_dependencies - virtex/usage/model_zoo - virtex/usage/pretrain - virtex/usage/downstream - - -API Reference -------------- - -.. toctree:: - :maxdepth: 2 - - virtex/config - virtex/factories - virtex/data - virtex/models - virtex/modules - virtex/optim - virtex/utils - virtex/model_zoo - - -Citation --------- - -If you find this code useful, please consider citing: - -.. code-block:: text - - @inproceedings{desai2021virtex, - title={{VirTex: Learning Visual Representations from Textual Annotations}}, - author={Karan Desai and Justin Johnson}, - booktitle={CVPR}, - year={2021} - } - - -Acknowledgments ---------------- - -We thank Harsh Agrawal, Mohamed El Banani, Richard Higgins, Nilesh Kulkarni -and Chris Rockwell for helpful discussions and feedback on the paper. We thank -Ishan Misra for discussions regarding PIRL evaluation protocol; Saining Xie for -discussions about replicating iNaturalist evaluation as MoCo; Ross Girshick and -Yuxin Wu for help with Detectron2 model zoo; Georgia Gkioxari for suggesting -the Instance Segmentation pretraining task ablation; and Stefan Lee for -suggestions on figure aesthetics. We thank Jia Deng for access to extra GPUs -during project development; and UMich ARC-TS team for support with GPU cluster -management. Finally, we thank all the Starbucks outlets in Ann Arbor for many -hours of free WiFi. This work was partially supported by the Toyota Research -Institute (TRI). However, note that this article solely reflects the opinions -and conclusions of its authors and not TRI or any other Toyota entity. - - -Indices and Tables ------------------- - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/virtex/docs/virtex/config.rst b/virtex/docs/virtex/config.rst deleted file mode 100644 index 585a5042aa961964722e4184f56718b7682f16fd..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/config.rst +++ /dev/null @@ -1,18 +0,0 @@ -virtex.config -============= - -.. raw:: html - - <hr> - -.. automodule:: virtex.config - - -Config References ------------------ - -.. literalinclude:: ../../virtex/config.py - :language: python - :linenos: - :lines: 46-206 - :dedent: 8 diff --git a/virtex/docs/virtex/data.datasets.rst b/virtex/docs/virtex/data.datasets.rst deleted file mode 100644 index 686a974d2d5c4db3a937270719053de6df0ade67..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/data.datasets.rst +++ /dev/null @@ -1,20 +0,0 @@ -virtex.data.datasets -==================== - -.. raw:: html - - <hr> - -Pretraining Datasets --------------------- - -.. automodule:: virtex.data.datasets.captioning - -.. automodule:: virtex.data.datasets.classification - ------------------------------------------------------------------------------- - -Downstream Datasets -------------------- - -.. automodule:: virtex.data.datasets.downstream diff --git a/virtex/docs/virtex/data.readers.rst b/virtex/docs/virtex/data.readers.rst deleted file mode 100644 index f65a8327103cc6d0a203838b17960201735f6885..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/data.readers.rst +++ /dev/null @@ -1,8 +0,0 @@ -virtex.data.readers -=================== - -.. raw:: html - - <hr> - -.. 
automodule:: virtex.data.readers diff --git a/virtex/docs/virtex/data.rst b/virtex/docs/virtex/data.rst deleted file mode 100644 index 882d69accc4a25e7275b933788a4b07ca0d964fd..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/data.rst +++ /dev/null @@ -1,14 +0,0 @@ -virtex.data -=========== - -.. raw:: html - - <hr> - - -.. toctree:: - - data.readers - data.datasets - data.tokenizers - data.transforms diff --git a/virtex/docs/virtex/data.tokenizers.rst b/virtex/docs/virtex/data.tokenizers.rst deleted file mode 100644 index 59594dd805010eff4a8201a1797baf3488c0e33d..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/data.tokenizers.rst +++ /dev/null @@ -1,8 +0,0 @@ -virtex.data.tokenizers -====================== - -.. raw:: html - - <hr> - -.. automodule:: virtex.data.tokenizers diff --git a/virtex/docs/virtex/data.transforms.rst b/virtex/docs/virtex/data.transforms.rst deleted file mode 100644 index 7d9b0299f2187112e5ea51b54d55eac18f0717c4..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/data.transforms.rst +++ /dev/null @@ -1,8 +0,0 @@ -virtex.data.transforms -====================== - -.. raw:: html - - <hr> - -.. automodule:: virtex.data.transforms diff --git a/virtex/docs/virtex/factories.rst b/virtex/docs/virtex/factories.rst deleted file mode 100644 index 078afc5bacd486b8b449d76b82624de09679f916..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/factories.rst +++ /dev/null @@ -1,56 +0,0 @@ -virtex.factories -================ - -.. raw:: html - - <hr> - -.. First only include the top-level module, and base class docstrings. - -.. automodule:: virtex.factories - :no-members: - -.. autoclass:: virtex.factories.Factory - - ------------------------------------------------------------------------------- - -Dataloading-related Factories ------------------------------ - -.. autoclass:: virtex.factories.TokenizerFactory - :members: from_config - -.. autoclass:: virtex.factories.ImageTransformsFactory - :members: from_config - -.. autoclass:: virtex.factories.PretrainingDatasetFactory - :members: from_config - -.. autoclass:: virtex.factories.DownstreamDatasetFactory - :members: from_config - ------------------------------------------------------------------------------- - -Modeling-related Factories --------------------------- - -.. autoclass:: virtex.factories.VisualBackboneFactory - :members: from_config - -.. autoclass:: virtex.factories.TextualHeadFactory - :members: from_config - -.. autoclass:: virtex.factories.PretrainingModelFactory - :members: from_config - ------------------------------------------------------------------------------- - -Optimization-related Factories ------------------------------- - -.. autoclass:: virtex.factories.OptimizerFactory - :members: from_config - -.. autoclass:: virtex.factories.LRSchedulerFactory - :members: from_config diff --git a/virtex/docs/virtex/model_zoo.rst b/virtex/docs/virtex/model_zoo.rst deleted file mode 100644 index ebdb81863704d6d4c85d5c1b580240ea317d45c7..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/model_zoo.rst +++ /dev/null @@ -1,8 +0,0 @@ -virtex.model_zoo -================ - -.. raw:: html - - <hr> - -.. automodule:: virtex.model_zoo.model_zoo diff --git a/virtex/docs/virtex/models.rst b/virtex/docs/virtex/models.rst deleted file mode 100644 index 83ab5751e65294b5071a4eebc543b6f70f9566d9..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/models.rst +++ /dev/null @@ -1,16 +0,0 @@ -virtex.models -============= - -.. 
raw:: html - - <hr> - -.. automodule:: virtex.models.classification - -------------------------------------------------------------------------------- - -.. automodule:: virtex.models.captioning - -------------------------------------------------------------------------------- - -.. automodule:: virtex.models.masked_lm diff --git a/virtex/docs/virtex/modules.embedding.rst b/virtex/docs/virtex/modules.embedding.rst deleted file mode 100644 index 6125716a3d2c964f1c8e402a475d24750ee42fa3..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/modules.embedding.rst +++ /dev/null @@ -1,8 +0,0 @@ -virtex.modules.embedding -======================== - -.. raw:: html - - <hr> - -.. automodule:: virtex.modules.embedding diff --git a/virtex/docs/virtex/modules.rst b/virtex/docs/virtex/modules.rst deleted file mode 100644 index f623cfd865184240057f249c4cced5b8d11793c2..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/modules.rst +++ /dev/null @@ -1,12 +0,0 @@ -virtex.modules -============== - -.. raw:: html - - <hr> - -.. toctree:: - - modules.embedding - modules.visual_backbones - modules.textual_heads diff --git a/virtex/docs/virtex/modules.textual_heads.rst b/virtex/docs/virtex/modules.textual_heads.rst deleted file mode 100644 index ddbc68d1c0bd8d1c6b8a997030b48050aec09ea9..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/modules.textual_heads.rst +++ /dev/null @@ -1,8 +0,0 @@ -virtex.modules.textual_heads -============================ - -.. raw:: html - - <hr> - -.. automodule:: virtex.modules.textual_heads diff --git a/virtex/docs/virtex/modules.visual_backbones.rst b/virtex/docs/virtex/modules.visual_backbones.rst deleted file mode 100644 index 8aff72132cf9ddddc5de04d6d68975cc5086e262..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/modules.visual_backbones.rst +++ /dev/null @@ -1,8 +0,0 @@ -virtex.modules.visual_backbones -=============================== - -.. raw:: html - - <hr> - -.. automodule:: virtex.modules.visual_backbones diff --git a/virtex/docs/virtex/optim.lookahead.rst b/virtex/docs/virtex/optim.lookahead.rst deleted file mode 100644 index 63030fd060386bec339f6cc88c7edde6b523ffb5..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/optim.lookahead.rst +++ /dev/null @@ -1,8 +0,0 @@ -virtex.optim.lookahead -====================== - -.. raw:: html - - <hr> - -.. automodule:: virtex.optim.lookahead diff --git a/virtex/docs/virtex/optim.lr_scheduler.rst b/virtex/docs/virtex/optim.lr_scheduler.rst deleted file mode 100644 index 62a0596e86ca1ef624ba0314c4f673f0c7829a66..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/optim.lr_scheduler.rst +++ /dev/null @@ -1,8 +0,0 @@ -virtex.optim.lr_scheduler -========================= - -.. raw:: html - - <hr> - -.. automodule:: virtex.optim.lr_scheduler diff --git a/virtex/docs/virtex/optim.rst b/virtex/docs/virtex/optim.rst deleted file mode 100644 index cf31a85cc8d92ef62e8686d2209e6b1e6f18c172..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/optim.rst +++ /dev/null @@ -1,11 +0,0 @@ -virtex.optim -============ - -.. raw:: html - - <hr> - -.. toctree:: - - optim.lookahead - optim.lr_scheduler diff --git a/virtex/docs/virtex/usage/downstream.rst b/virtex/docs/virtex/usage/downstream.rst deleted file mode 100644 index c7278b0015cd46904c4e02d0d7e4e15cb6ef00f7..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/usage/downstream.rst +++ /dev/null @@ -1,216 +0,0 @@ -How to evaluate on downstream tasks? 
-====================================
-
-In our paper, we evaluate our pretrained VirTex models on seven different
-downstream tasks. Our codebase supports all of these evaluations. Throughout
-this documentation, we consider a specific example of our VirTex pretrained
-model being evaluated, to keep filepaths uniform across the following example
-command snippets. Paths can be trivially adjusted for any other VirTex model;
-evaluating the baselines (MoCo, ImageNet-supervised, Random Init) requires
-additional changes in commands, explained in the last sub-section.
-
-As an example, consider a pretraining job for our best performing VirTex model
-(``width_ablations/bicaptioning_R_50_L1_H2048.yaml``). The serialization
-directory might look something like this:
-
-.. code-block:: text
-
-    /tmp/bicaptioning_R_50_L1_H2048
-        pretrain_config.yaml
-        log-rank0.txt          # stdout/stderr per GPU process
-        log-rank1.txt
-        ...
-        log-rank7.txt
-        checkpoint_2000.pth
-        checkpoint_4000.pth
-        ...
-        checkpoint_498000.pth
-        checkpoint_500000.pth  # serialized checkpoints
-        train_captioning_forward/
-            events.out.* ...   # tensorboard logs
-        ...
-
-We evaluate all checkpoints on **PASCAL VOC 2007 Linear Classification**, and
-then evaluate the best checkpoint (here, it was iteration 500000) on all other
-downstream tasks.
-
-
-PASCAL VOC 2007 Linear Classification
--------------------------------------
-
-Evaluate a single VirTex pretrained checkpoint on VOC 2007 ``trainval`` split:
-
-.. code-block:: shell
-
-    python scripts/clf_voc07.py \
-        --config /tmp/bicaptioning_R_50_L1_H2048/pretrain_config.yaml \
-        --down-config configs/downstream/voc07_clf.yaml \
-        --checkpoint-path /tmp/bicaptioning_R_50_L1_H2048/checkpoint_500000.pth \
-        --weight-init virtex \
-        --num-gpus-per-machine 1 \
-        --cpu-workers 4 \
-        --serialization-dir /tmp/bicaptioning_R_50_L1_H2048
-
-To evaluate the 100 most recent checkpoints in the sub-directory, this command
-can be looped over as follows:
-
-.. code-block:: shell
-
-    for ((iter = 300000; iter <= 500000; iter+=2000)); do
-        # add command with `checkpoint_$iter.pth`
-    done
-
-This script writes metrics to tensorboard logs in the same pretraining
-directory, so all VOC07 mAP curves appear alongside the pretraining loss curves.
-
--------------------------------------------------------------------------------
-
-ImageNet Linear Classification
-------------------------------
-
-We train a linear classifier on 2048-dimensional global average pooled features
-extracted from a frozen visual backbone. Evaluate a checkpoint (for example,
-iteration 500000) on this task as:
-
-.. code-block:: shell
-
-    python scripts/clf_linear.py \
-        --config /tmp/bicaptioning_R_50_L1_H2048/pretrain_config.yaml \
-        --down-config configs/downstream/imagenet_clf.yaml \
-        --checkpoint-path /tmp/bicaptioning_R_50_L1_H2048/checkpoint_500000.pth \
-        --weight-init virtex \
-        --num-gpus-per-machine 8 \
-        --cpu-workers 4 \
-        --serialization-dir /tmp/bicaptioning_R_50_L1_H2048/imagenet_500000 \
-        --checkpoint-every 5005  # 1 epoch of ImageNet
-
--------------------------------------------------------------------------------
-
-Instance Segmentation (and Object Detection) on COCO
-----------------------------------------------------
-
-Train a Mask R-CNN with FPN backbone for COCO Instance Segmentation (and Object
-Detection, because it also has a box head) by initializing the backbone from
-VirTex pretrained weights:
-
-.. 
code-block:: shell - - python scripts/eval_detectron2.py \ - --config /tmp/bicaptioning_R_50_L1_H2048/pretrain_config.yaml \ - --d2-config configs/detectron2/coco_segm_default_init_2x.yaml \ - --checkpoint-path /tmp/bicaptioning_R_50_L1_H2048/checkpoint_500000.pth \ - --weight-init virtex \ - --num-gpus-per-machine 8 \ - --cpu-workers 2 \ - --serialization-dir /tmp/bicaptioning_R_50_L1_H2048/coco_segm_500000 \ - --checkpoint-every 5000 - -.. note:: - - 1. This script periodically serializes checkpoints but skips validation - step during training for saving time; to evaluate a serialized checkpoint - and write results to tensorboard, provide it as ``--checkpoint-path`` and - additional flags ``--resume --eval-only``. - - 2. Note that ``--d2-config`` here is in Detectron2 format, and not our - package :class:`~virtex.config.Config`. - - These points are applicable for all tasks described below. - -------------------------------------------------------------------------------- - -Instance Segmentation on LVIS ------------------------------ - -Train a Mask R-CNN with FPN backbone for LVIS Instance Segmentation by -initializing the backbone from VirTex pretrained weights: - -.. code-block:: shell - - python scripts/eval_detectron2.py \ - --config /tmp/bicaptioning_R_50_L1_H2048/pretrain_config.yaml \ - --d2-config configs/detectron2/lvis_segm_default_init_2x.yaml \ - --checkpoint-path /tmp/bicaptioning_R_50_L1_H2048/checkpoint_500000.pth \ - --weight-init virtex \ - --num-gpus-per-machine 8 \ - --cpu-workers 2 \ - --serialization-dir /tmp/bicaptioning_R_50_L1_H2048/lvis_segm_500000 \ - --checkpoint-every 5000 - -------------------------------------------------------------------------------- - -Object Detection on PASCAL VOC 2007+12 --------------------------------------- - -Train a Faster R-CNN with C4 backbone for PASCAL VOC 2007+12 Object Detection -by initializing the backbone from VirTex pretrained weights: - -.. code-block:: shell - - python scripts/eval_detectron2.py \ - --config /tmp/bicaptioning_R_50_L1_H2048/pretrain_config.yaml \ - --d2-config configs/detectron2/voc_det_default_init_24k.yaml \ - --checkpoint-path /tmp/bicaptioning_R_50_L1_H2048/checkpoint_500000.pth \ - --weight-init virtex \ - --num-gpus-per-machine 8 \ - --cpu-workers 2 \ - --serialization-dir /tmp/bicaptioning_R_50_L1_H2048/voc_det_500000 \ - --checkpoint-every 2500 - -------------------------------------------------------------------------------- - -iNaturalist 2018 Fine-Grained Classification --------------------------------------------- - -Fine-tune the VirTex pretrained visual backbone end-to-end on iNaturalist 2018 -dataset: - -.. code-block:: shell - - python scripts/clf_linear.py \ - --config /tmp/bicaptioning_R_50_L1_H2048/pretrain_config.yaml \ - --down-config configs/downstream/inaturalist_clf.yaml \ - --checkpoint-path /tmp/bicaptioning_R_50_L1_H2048/checkpoint_500000.pth \ - --weight-init virtex \ - --num-gpus-per-machine 8 \ - --cpu-workers 4 \ - --serialization-dir /tmp/bicaptioning_R_50_L1_H2048/inaturalist_500000 \ - --checkpoint-every 1710 # 1 epoch of iNaturalist - -------------------------------------------------------------------------------- - -Image Captioning on COCO Captions val2017 ------------------------------------------ - -Evaluate a pretrained VirTex model on image captioning for COCO Captions val2017 -split (reporting CIDEr and SPICE metics): - -.. 
code-block:: shell
-
-    python scripts/eval_captioning.py \
-        --config /tmp/bicaptioning_R_50_L1_H2048/pretrain_config.yaml \
-        --checkpoint-path /tmp/bicaptioning_R_50_L1_H2048/checkpoint_500000.pth \
-        --calc-metrics \
-        --num-gpus-per-machine 1 \
-        --cpu-workers 4
-
--------------------------------------------------------------------------------
-
-Running Image Captioning Inference on Arbitrary Images
-------------------------------------------------------
-
-The above script can be used to generate captions for any images in a directory.
-Replace certain arguments as follows:
-
-.. code-block:: shell
-
-    python scripts/eval_captioning.py \
-        --config /tmp/bicaptioning_R_50_L1_H2048/pretrain_config.yaml \
-        --checkpoint-path /tmp/bicaptioning_R_50_L1_H2048/checkpoint_500000.pth \
-        --data-root /path/to/images_dir \
-        --output /path/to/save/predictions.json \
-        --num-gpus-per-machine 1 \
-        --cpu-workers 4
-
-This script will save predictions in JSON format. Since our goal is not to
-improve image captioning, these models may not generate the best captions.
diff --git a/virtex/docs/virtex/usage/model_zoo.rst b/virtex/docs/virtex/usage/model_zoo.rst
deleted file mode 100644
index daee9ee44c56ae08f5e2444b552f8233d28b66cb..0000000000000000000000000000000000000000
--- a/virtex/docs/virtex/usage/model_zoo.rst
+++ /dev/null
@@ -1,234 +0,0 @@
-VirTex Model Zoo
-================
-
-We provide a collection of pretrained model weights and corresponding config
-names in this model zoo. The tables contain partial paths to config files for
-each model, download links for pretrained weights, and (for reference) VOC07
-mAP and ImageNet top-1 accuracy.
-
-The simplest way to download and use a *full* pretrained model (including both
-the visual backbone and the textual head) is through the :doc:`../model_zoo` API
-as follows. This code snippet works from anywhere and does not need to be
-executed from the project root.
-
-.. code-block:: python
-
-    # Get our full best performing VirTex model:
-    import virtex.model_zoo as mz
-    model = mz.get("width_ablations/bicaptioning_R_50_L1_H2048.yaml", pretrained=True)
-
-    # Optionally extract the torchvision-like visual backbone (with ``avgpool``
-    # and ``fc`` layers replaced with ``nn.Identity`` module).
-    cnn = model.visual.cnn
-
-Alternatively, weights can be manually downloaded from the links below, and this
-can be executed from the project root:
-
-.. code-block:: python
-
-    from virtex.config import Config
-    from virtex.factories import PretrainingModelFactory
-    from virtex.utils.checkpointing import CheckpointManager
-
-    # Get the best performing VirTex model:
-    _C = Config("configs/width_ablations/bicaptioning_R_50_L1_H2048.yaml")
-    model = PretrainingModelFactory.from_config(_C)
-
-    CheckpointManager(model=model).load("/path/to/downloaded/weights.pth")
-
-    # Optionally extract the torchvision-like visual backbone (with ``avgpool``
-    # and ``fc`` layers replaced with ``nn.Identity`` module).
-    cnn = model.visual.cnn
-
-
-The pretrained ResNet-50 visual backbone of our best performing model
-(``width_ablations/bicaptioning_R_50_L1_H2048.yaml``) can be loaded in a single
-line, *without following any installation steps* (only requires PyTorch v1.5):
-
-.. code-block:: python
-
-    import torch
-
-    model = torch.hub.load("kdexd/virtex", "resnet50", pretrained=True)
-
-    # This is a torchvision-like resnet50 model, with ``avgpool`` and ``fc``
-    # layers replaced with ``nn.Identity`` module.
-    image_batch = torch.randn(1, 3, 224, 224)  # batch tensor of one image.
- features_batch = model(image_batch) # shape: (1, 2048, 7, 7) - -------------------------------------------------------------------------------- - -Pretraining Task Ablations -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. raw:: html - - <style type="text/css"> - .tg {border-collapse:collapse;border-spacing:0;} - .tg td{border-color:black;border-style:solid;border-width:1px; - overflow:hidden;padding:10px 5px;word-break:normal;} - .tg th{border-color:black;border-style:solid;border-width:1px; - font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;} - .tg .tg-zlqz{background-color:#d5d5d5;border-color:inherit;font-weight:bold;text-align:center;vertical-align:center} - .tg .tg-c3ow{border-color:inherit;text-align:center;vertical-align:top} - .tg .tg-c3ow a{color: darkgreen; text-decoration: none; border-bottom: 1px dashed green;text-underline-position: under; - .tg .tg-c3ow a:hover{font-weight: 700;border-bottom: 1px solid green;} - .tg .tg-0pky{border-color:inherit;text-align:left;vertical-align:top} - @media screen and (max-width: 767px) {.tg {width: auto !important;}.tg col {width: auto !important;}.tg-wrap {overflow-x: auto;-webkit-overflow-scrolling: touch;}}</style> - <div class="tg-wrap"><table class="tg"> - <tbody> - <tr> - <td class="tg-zlqz">Model Config Name</td> - <td class="tg-zlqz">VOC07<br>mAP</td> - <td class="tg-zlqz">ImageNet<br>Top-1 Acc.</td> - <td class="tg-zlqz">Model URL</td> - </tr> - <tr> - <td class="tg-0pky">task_ablations/bicaptioning_R_50_L1_H2048.yaml</td> - <td class="tg-c3ow">88.7</td> - <td class="tg-c3ow">53.8</td> - <td class="tg-c3ow"><a href="https://umich.box.com/shared/static/zu8zxtxrron29icd76owgjzojmfcgdk3.pth" target="_blank" rel="noopener noreferrer">model</a></td> - </tr> - <tr> - <td class="tg-0pky">task_ablations/captioning_R_50_L1_H2048.yaml</td> - <td class="tg-c3ow">88.6</td> - <td class="tg-c3ow">50.8</td> - <td class="tg-c3ow"><a href="https://umich.box.com/shared/static/1q9qh1cj2u4r5laj7mefd2mlzwthnga7.pth" target="_blank" rel="noopener noreferrer">model</a></td> - </tr> - <tr> - <td class="tg-0pky">task_ablations/token_classification_R_50.yaml</td> - <td class="tg-c3ow">88.8</td> - <td class="tg-c3ow">48.6</td> - <td class="tg-c3ow"><a href="https://umich.box.com/shared/static/idvoxjl60pzpcllkbvadqgvwazil2mis.pth" target="_blank" rel="noopener noreferrer">model</a></td> - </tr> - <tr> - <td class="tg-0pky">task_ablations/multilabel_classification_R_50.yaml</td> - <td class="tg-c3ow">86.2</td> - <td class="tg-c3ow">46.2</td> - <td class="tg-c3ow"><a href="https://umich.box.com/shared/static/yvlflmo0klqy3m71p6ug06c6aeg282hy.pth" target="_blank" rel="noopener noreferrer">model</a></td> - </tr> - <tr> - <td class="tg-0pky">task_ablations/masked_lm_R_50_L1_H2048.yaml</td> - <td class="tg-c3ow">86.4</td> - <td class="tg-c3ow">46.7</td> - <td class="tg-c3ow"><a href="https://umich.box.com/shared/static/x3eij00eslse9j35t9j9ijyj8zkbkizh.pth" target="_blank" rel="noopener noreferrer">model</a></td> - </tr> - </tbody> - </table></div> - - -Width Ablations -^^^^^^^^^^^^^^^ - -.. 
raw:: html - - <div class="tg-wrap"><table class="tg"> - <tbody> - <tr> - <td class="tg-zlqz">Model Config Name</td> - <td class="tg-zlqz">VOC07<br>mAP</td> - <td class="tg-zlqz">ImageNet<br>Top-1 Acc.</td> - <td class="tg-zlqz">Model URL</td> - </tr> - <tr> - <td class="tg-0pky">width_ablations/bicaptioning_R_50_L1_H512.yaml</td> - <td class="tg-c3ow">88.4</td> - <td class="tg-c3ow">51.8</td> - <td class="tg-c3ow"><a href="https://umich.box.com/shared/static/wtk18v0vffws48u5yrj2qjt94wje1pit.pth" target="_blank" rel="noopener noreferrer">model</a></td> - </tr> - <tr> - <td class="tg-0pky"><span style="font-weight:400;font-style:normal">width_ablations/bicaptioning_R_50_L1_H768.yaml</span></td> - <td class="tg-c3ow">88.3</td> - <td class="tg-c3ow">52.3</td> - <td class="tg-c3ow"><a href="https://umich.box.com/shared/static/e94n0iexdvksi252bn7sm2vqjnyt9okf.pth" target="_blank" rel="noopener noreferrer">model</a></td> - </tr> - <tr> - <td class="tg-0pky"><span style="font-weight:400;font-style:normal">width_ablations/bicaptioning_R_50_L1_H1024.yaml</span></td> - <td class="tg-c3ow">88.3</td> - <td class="tg-c3ow">53.2</td> - <td class="tg-c3ow"><a href="https://umich.box.com/shared/static/1so9cu9y06gy27rqbzwvek4aakfd8opf.pth" target="_blank" rel="noopener noreferrer">model</a></td> - </tr> - <tr> - <td class="tg-0pky"><span style="font-weight:400;font-style:normal">width_ablations/bicaptioning_R_50_L1_H2048.yaml</span></td> - <td class="tg-c3ow">88.7</td> - <td class="tg-c3ow">53.8</td> - <td class="tg-c3ow"><a href="https://umich.box.com/shared/static/zu8zxtxrron29icd76owgjzojmfcgdk3.pth" target="_blank" rel="noopener noreferrer">model</a></td> - </tr> - </tbody> - </table></div> - - -Depth Ablations -^^^^^^^^^^^^^^^ - -.. raw:: html - - <div class="tg-wrap"><table class="tg"> - <tbody> - <tr> - <td class="tg-zlqz">Model Config Name</td> - <td class="tg-zlqz">VOC07<br>mAP</td> - <td class="tg-zlqz">ImageNet<br>Top-1 Acc.</td> - <td class="tg-zlqz">Model URL</td> - </tr> - <tr> - <td class="tg-0pky">depth_ablations/bicaptioning_R_50_L1_H1024.yaml</td> - <td class="tg-c3ow">88.3</td> - <td class="tg-c3ow">53.2</td> - <td class="tg-c3ow"><a href="https://umich.box.com/shared/static/1so9cu9y06gy27rqbzwvek4aakfd8opf.pth" target="_blank" rel="noopener noreferrer">model</a></td> - </tr> - <tr> - <td class="tg-0pky">depth_ablations/bicaptioning_R_50_L2_H1024.yaml</td> - <td class="tg-c3ow">88.8</td> - <td class="tg-c3ow">53.8</td> - <td class="tg-c3ow"><a href="https://umich.box.com/shared/static/9e88f6l13a9r8wq5bbe8qnoh9zenanq3.pth" target="_blank" rel="noopener noreferrer">model</a></td> - </tr> - <tr> - <td class="tg-0pky"><span style="font-weight:400;font-style:normal">depth_ablations/bicaptioning_R_50_L3_H1024.yaml</span></td> - <td class="tg-c3ow">88.7</td> - <td class="tg-c3ow">53.9</td> - <td class="tg-c3ow"><a href="https://umich.box.com/shared/static/4cv8052xiq91h7lyx52cp2a6m7m9qkgo.pth" target="_blank" rel="noopener noreferrer">model</a></td> - </tr> - <tr> - <td class="tg-0pky"><span style="font-weight:400;font-style:normal">depth_ablations/bicaptioning_R_50_L4_H1024.yaml</span></td> - <td class="tg-c3ow">88.7</td> - <td class="tg-c3ow">53.9</td> - <td class="tg-c3ow"><a href="https://umich.box.com/shared/static/bk5w4471mgvwa5mv6e4c7htgsafzmfm0.pth" target="_blank" rel="noopener noreferrer">model</a></td> - </tr> - </tbody> - </table></div> - - -Backbone Ablations -^^^^^^^^^^^^^^^^^^ - -.. 
raw:: html - - <div class="tg-wrap"><table class="tg"> - <tbody> - <tr> - <td class="tg-zlqz">Model Config Name</td> - <td class="tg-zlqz">VOC07<br>mAP</td> - <td class="tg-zlqz">ImageNet<br>Top-1 Acc.</td> - <td class="tg-zlqz">Model URL</td> - </tr> - <tr> - <td class="tg-0pky">backbone_ablations/bicaptioning_R_50_L1_H1024.yaml</td> - <td class="tg-c3ow">88.3</td> - <td class="tg-c3ow">53.2</td> - <td class="tg-c3ow"><a href="https://umich.box.com/shared/static/1so9cu9y06gy27rqbzwvek4aakfd8opf.pth" target="_blank" rel="noopener noreferrer">model</a></td> - </tr> - <tr> - <td class="tg-0pky">backbone_ablations/bicaptioning_R_50W2X_L1_H1024.yaml</td> - <td class="tg-c3ow">88.5</td> - <td class="tg-c3ow">52.9</td> - <td class="tg-c3ow"><a href="https://umich.box.com/shared/static/19vcaf1488945836kir9ebm5itgtugaw.pth" target="_blank" rel="noopener noreferrer">model</a></td> - </tr> - <tr> - <td class="tg-0pky">backbone_ablations/bicaptioning_R_101_L1_H1024.yaml</td> - <td class="tg-c3ow">88.7</td> - <td class="tg-c3ow">52.1</td> - <td class="tg-c3ow"><a href="https://umich.box.com/shared/static/nptbh4jsj0c0kjsnc2hw754fkikpgx9v.pth" target="_blank" rel="noopener noreferrer">model</a></td> - </tr> - </tbody> - </table></div> diff --git a/virtex/docs/virtex/usage/pretrain.rst b/virtex/docs/virtex/usage/pretrain.rst deleted file mode 100644 index 2f14305f2152afdede708d45cbe5b2d165e9246a..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/usage/pretrain.rst +++ /dev/null @@ -1,100 +0,0 @@ -How to train your VirTex model? -=============================== - -We provide training scripts for all type of VirTex models from the paper; -including our best-performing model and other ablations. -Our training jobs are specified by config files (YAML). -Execute all commands from project root to use the provided config files. - - -Training the base VirTex model ------------------------------- - -Train the base VirTex model with ResNet-50 visual backbone; and a textual head -with ``L = 1, H = 1024`` using all default optimization hyperparameters. - -.. code-block:: - - python scripts/pretrain_virtex.py \ - --config configs/_base_bicaptioning_R_50_L1_H1024.yaml \ - --num-gpus-per-machine 8 \ - --cpu-workers 4 \ - --serialization-dir /tmp/VIRTEX_R_50_L1_H1024 - # Default: --checkpoint-every 2000 --log-every 20 - -Training job will save checkpoints, tensorboard logs (loss curves and metrics), -and back up the config in ``--serialization-dir``. Use ``tensorboard --logdir -<serialization_dir>`` to view training curves, validation metrics etc. directly -on tensorboard. - -We recommend training with 8 GPUs on the same machine, although training with -multiple GPUs across machines (see: ``--num-machines`` and ``--machine-rank``), -single GPU (``--num-gpus-per-machine 1``) as well as CPU -(``--num-gpus-per-machine 0``) is also supported. Using multiple GPUs for -interactive debugging with PDB is not supported, as PDB and ``multiprocessing`` -module do not play nice. - -------------------------------------------------------------------------------- - -Reproducing all VirTex ablations --------------------------------- - -To reproduce all ablations from the `paper <https://arxiv.org/abs/2006.06666>`_, -replace the ``--config`` argument in above command with the following (all -assumed to be relative to project root): - -Pretraining Task Ablations -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -1. **Bicaptioning:** configs/task_ablations/bicaptioning_R_50_L1_H2048.yaml -2. 
**Forward Captioning:** configs/task_ablations/captioning_R_50_L1_H2048.yaml
-3. **Token Classification:** configs/task_ablations/token_classification_R_50.yaml
-4. **Multilabel Classification:** configs/task_ablations/multilabel_classification_R_50.yaml
-5. **Masked Language Modeling:** configs/task_ablations/masked_lm_R_50_L1_H2048.yaml
-
-Transformer Size Ablations
-^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-1. **Width (H = 512):** configs/width_ablations/bicaptioning_R_50_L1_H512.yaml
-2. **Width (H = 768):** configs/width_ablations/bicaptioning_R_50_L1_H768.yaml
-3. **Width (H = 1024):** configs/width_ablations/bicaptioning_R_50_L1_H1024.yaml
-4. **Width (H = 2048):** configs/width_ablations/bicaptioning_R_50_L1_H2048.yaml
-5. **Depth (L = 1):** configs/depth_ablations/bicaptioning_R_50_L1_H1024.yaml
-6. **Depth (L = 2):** configs/depth_ablations/bicaptioning_R_50_L2_H1024.yaml
-7. **Depth (L = 3):** configs/depth_ablations/bicaptioning_R_50_L3_H1024.yaml
-8. **Depth (L = 4):** configs/depth_ablations/bicaptioning_R_50_L4_H1024.yaml
-
-Backbone Ablations
-^^^^^^^^^^^^^^^^^^
-
-1. **ResNet-50:** configs/backbone_ablations/bicaptioning_R_50_L1_H1024.yaml
-2. **ResNet-50 w2x:** configs/backbone_ablations/bicaptioning_R_50W2X_L1_H1024.yaml
-3. **ResNet-101:** configs/backbone_ablations/bicaptioning_R_101_L1_H1024.yaml
-
-.. note::
-
-    **Pretraining Task Ablations** (1), **Transformer Size Ablations** (3 and 5)
-    and **Backbone Ablations** (1) are all exactly the same model.
-
-Data Efficiency Experiments
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-These are VirTex models trained on a subset of the COCO Captions dataset. For
-example, train a base VirTex model on a randomly selected ``50%`` of COCO Captions:
-
-.. code-block::
-
-    python scripts/pretrain_virtex.py \
-        --config configs/_base_bicaptioning_R_50_L1_H1024.yaml \
-        --config-override DATA.USE_PERCENTAGE 50.0 \
-        --num-gpus-per-machine 8 \
-        --cpu-workers 4 \
-        --serialization-dir /tmp/VIRTEX_R_50_L1_H1024_PERCENT_50
-    # Default: --checkpoint-every 2000 --log-every 20
-
-COCO Captions provides five captions per image. To train with one fixed caption
-per image, add ``DATA.USE_SINGLE_CAPTION True`` in ``--config-override``.
-
-The randomly selected subset is deterministic across runs based on the random seed
-(``RANDOM_SEED`` in config). When training on less than ``50%`` of the dataset, we
-recommend using multiple random seeds (results will have a variance of ``±1%``).
diff --git a/virtex/docs/virtex/usage/setup_dependencies.rst b/virtex/docs/virtex/usage/setup_dependencies.rst
deleted file mode 100644
index b4ece3964148f977154c367de2dfb84c57a86053..0000000000000000000000000000000000000000
--- a/virtex/docs/virtex/usage/setup_dependencies.rst
+++ /dev/null
@@ -1,153 +0,0 @@
-How to set up this codebase?
-============================
-
-.. raw:: html
-
-    <hr>
-
-This codebase requires Python 3.6 or higher. We recommend using Anaconda or
-Miniconda. We walk through installation and data preprocessing here.
-
-
-Install Dependencies
---------------------
-
-These steps describe installation through Anaconda (or Miniconda).
-
-1. Install Anaconda or Miniconda distribution based on Python 3+ from their
-   `downloads site <https://conda.io/docs/user-guide/install/download.html>`_.
-
-
-2. Clone the repository first.
-
-   .. code-block:: shell
-
-      git clone https://www.github.com/kdexd/virtex
-
-
-3. Create a conda environment and install all the dependencies.
-
-   .. 
code-block:: shell - - cd virtex - conda create -n virtex python=3.6 - conda activate virtex - pip install -r requirements.txt - - -4. Install this codebase as a package in development version. - - .. code-block:: shell - - python setup.py develop - -Now you can ``import virtex`` from anywhere as long as you have this conda -environment activated. - -------------------------------------------------------------------------------- - - -Setup Datasets --------------- - -Datasets are assumed to exist in ``./datasets`` directory (relative to the -project root) following the structure specified below. COCO is used for -pretraining, and rest of the datasets (including COCO) are used for downstream -tasks. This structure is compatible when using -`Detectron2 <https://github.com/facebookresearch/detectron2>`_ for downstream -tasks. - -COCO -^^^^ -.. code-block:: - - datasets/coco/ - annotations/ - captions_{train,val}2017.json - instances_{train,val}2017.json - train2017/ - # images in train2017 split - val2017/ - # images in val2017 split - -LVIS -^^^^ -.. code-block:: - - datasets/coco/ - train2017/ - val2017/ - datasets/lvis/ - lvis_v1.0_{train,val}.json - -PASCAL VOC -^^^^^^^^^^ -.. code-block:: - - datasets/VOC2007/ - Annotations/ - ImageSets/ - Main/ - trainval.txt - test.txt - JPEGImages/ - - datasets/VOC2012/ - # Same as VOC2007 above - -ImageNet -^^^^^^^^ -.. code-block:: - - datasets/imagenet/ - train/ - # One directory per category with images in it - val/ - # One directory per category with images in it - ILSVRC2012_devkit_t12.tar.gz - -iNaturalist 2018 -^^^^^^^^^^^^^^^^ -.. code-block:: - - datasets/inaturalist/ - train_val2018/ - annotations/ - train2018.json - val2018.json - -------------------------------------------------------------------------------- - - -Preprocess Data ---------------- - -1. Build a vocabulary out of COCO Captions ``train2017`` split. - - .. code-block:: shell - - python scripts/preprocess/build_vocabulary.py \ - --captions datasets/coco/annotations/captions_train2017.json \ - --vocab-size 10000 \ - --output-prefix datasets/vocab/coco_10k \ - --do-lower-case - - -2. Serialize COCO Captions (``train2017`` and ``val2017`` splits) into LMDB - files. These are faster for data reading during pretraining. - - .. code-block:: shell - - python scripts/preprocess/preprocess_coco.py \ - --data-root datasets/coco \ - --split train \ - --output datasets/coco/serialized_train.lmdb - - .. code-block:: shell - - python scripts/preprocess/preprocess_coco.py \ - --data-root datasets/coco \ - --split val \ - --output datasets/coco/serialized_val.lmdb - -That's it! You are all set to use this codebase. diff --git a/virtex/docs/virtex/utils.beam_search.rst b/virtex/docs/virtex/utils.beam_search.rst deleted file mode 100644 index a04811e9c89a0c093e1ffb373467eb6ba9b81b87..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/utils.beam_search.rst +++ /dev/null @@ -1,8 +0,0 @@ -virtex.utils.beam_search -======================== - -.. raw:: html - - <hr> - -.. automodule:: virtex.utils.beam_search diff --git a/virtex/docs/virtex/utils.checkpointing.rst b/virtex/docs/virtex/utils.checkpointing.rst deleted file mode 100644 index 1b3719bf7e330c13835dc57457a3bef238c29b0e..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/utils.checkpointing.rst +++ /dev/null @@ -1,8 +0,0 @@ -virtex.utils.checkpointing -========================== - -.. raw:: html - - <hr> - -.. 
automodule:: virtex.utils.checkpointing diff --git a/virtex/docs/virtex/utils.common.rst b/virtex/docs/virtex/utils.common.rst deleted file mode 100644 index cadd36d26a01f03b4457f1caed1c0c03dc58a9ef..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/utils.common.rst +++ /dev/null @@ -1,8 +0,0 @@ -virtex.utils.common -=================== - -.. raw:: html - - <hr> - -.. automodule:: virtex.utils.common diff --git a/virtex/docs/virtex/utils.distributed.rst b/virtex/docs/virtex/utils.distributed.rst deleted file mode 100644 index e6a44d674ecb8a96d2568b1cd4072dd1e38f2a9d..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/utils.distributed.rst +++ /dev/null @@ -1,8 +0,0 @@ -virtex.utils.distributed -======================== - -.. raw:: html - - <hr> - -.. automodule:: virtex.utils.distributed diff --git a/virtex/docs/virtex/utils.metrics.rst b/virtex/docs/virtex/utils.metrics.rst deleted file mode 100644 index 75234d5e4d230adf20192af77849b1a9c3f059d1..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/utils.metrics.rst +++ /dev/null @@ -1,8 +0,0 @@ -virtex.utils.metrics -==================== - -.. raw:: html - - <hr> - -.. automodule:: virtex.utils.metrics diff --git a/virtex/docs/virtex/utils.rst b/virtex/docs/virtex/utils.rst deleted file mode 100644 index 9d021d9c4e1e255554130264d12abad06cc53911..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/utils.rst +++ /dev/null @@ -1,15 +0,0 @@ -virtex.utils -============ - -.. raw:: html - - <hr> - -.. toctree:: - - utils.common - utils.distributed - utils.timer - utils.checkpointing - utils.beam_search - utils.metrics diff --git a/virtex/docs/virtex/utils.timer.rst b/virtex/docs/virtex/utils.timer.rst deleted file mode 100644 index c2ddcdb3459f519d9a98766a6ddbef2adefa072d..0000000000000000000000000000000000000000 --- a/virtex/docs/virtex/utils.timer.rst +++ /dev/null @@ -1,8 +0,0 @@ -virtex.utils.timer -================== - -.. raw:: html - - <hr> - -.. automodule:: virtex.utils.timer diff --git a/virtex/virtex/factories.py b/virtex/factories.py similarity index 100% rename from virtex/virtex/factories.py rename to virtex/factories.py diff --git a/virtex/hubconf.py b/virtex/hubconf.py deleted file mode 100644 index f85d01d371151f0716680397b1c955d3c4dd42d7..0000000000000000000000000000000000000000 --- a/virtex/hubconf.py +++ /dev/null @@ -1,35 +0,0 @@ -dependencies = ["torch"] - -import torch -import torchvision - - -def resnet50(pretrained: bool = False, **kwargs): - r""" - ResNet-50 visual backbone from the best performing VirTex model: pretrained - for bicaptioning on COCO Captions, with textual head ``L = 1, H = 2048``. - - This is a torchvision-like model, with the last ``avgpool`` and `fc`` - modules replaced with ``nn.Identity()`` modules. Given a batch of image - tensors with size ``(B, 3, 224, 224)``, this model computes spatial image - features of size ``(B, 7, 7, 2048)``, where B = batch size. - - pretrained (bool): Whether to load model with pretrained weights. - """ - - # Create a torchvision resnet50 with randomly initialized weights. - model = torchvision.models.resnet50(pretrained=False, **kwargs) - - # Replace global average pooling and fully connected layers with identity - # modules. 
- model.avgpool = torch.nn.Identity() - model.fc = torch.nn.Identity() - - if pretrained: - model.load_state_dict( - torch.hub.load_state_dict_from_url( - "https://umich.box.com/shared/static/gsjqm4i4fm1wpzi947h27wweljd8gcpy.pth", - progress=False, - )["model"] - ) - return model diff --git a/virtex/virtex/model_zoo/__init__.py b/virtex/model_zoo/__init__.py similarity index 100% rename from virtex/virtex/model_zoo/__init__.py rename to virtex/model_zoo/__init__.py diff --git a/virtex/virtex/model_zoo/model_zoo.py b/virtex/model_zoo/model_zoo.py similarity index 100% rename from virtex/virtex/model_zoo/model_zoo.py rename to virtex/model_zoo/model_zoo.py diff --git a/virtex/virtex/models/__init__.py b/virtex/models/__init__.py similarity index 100% rename from virtex/virtex/models/__init__.py rename to virtex/models/__init__.py diff --git a/virtex/virtex/models/captioning.py b/virtex/models/captioning.py similarity index 100% rename from virtex/virtex/models/captioning.py rename to virtex/models/captioning.py diff --git a/virtex/virtex/models/classification.py b/virtex/models/classification.py similarity index 100% rename from virtex/virtex/models/classification.py rename to virtex/models/classification.py diff --git a/virtex/virtex/models/contrastive.py b/virtex/models/contrastive.py similarity index 100% rename from virtex/virtex/models/contrastive.py rename to virtex/models/contrastive.py diff --git a/virtex/virtex/models/masked_lm.py b/virtex/models/masked_lm.py similarity index 100% rename from virtex/virtex/models/masked_lm.py rename to virtex/models/masked_lm.py diff --git a/virtex/virtex/models/zero_shot_classification_eval.py b/virtex/models/zero_shot_classification_eval.py similarity index 100% rename from virtex/virtex/models/zero_shot_classification_eval.py rename to virtex/models/zero_shot_classification_eval.py diff --git a/virtex/virtex/modules/embedding.py b/virtex/modules/embedding.py similarity index 100% rename from virtex/virtex/modules/embedding.py rename to virtex/modules/embedding.py diff --git a/virtex/virtex/modules/label_smoothing.py b/virtex/modules/label_smoothing.py similarity index 100% rename from virtex/virtex/modules/label_smoothing.py rename to virtex/modules/label_smoothing.py diff --git a/virtex/virtex/modules/textual_heads.py b/virtex/modules/textual_heads.py similarity index 100% rename from virtex/virtex/modules/textual_heads.py rename to virtex/modules/textual_heads.py diff --git a/virtex/virtex/modules/transformer.py b/virtex/modules/transformer.py similarity index 100% rename from virtex/virtex/modules/transformer.py rename to virtex/modules/transformer.py diff --git a/virtex/virtex/modules/visual_backbones.py b/virtex/modules/visual_backbones.py similarity index 100% rename from virtex/virtex/modules/visual_backbones.py rename to virtex/modules/visual_backbones.py diff --git a/virtex/virtex/optim/__init__.py b/virtex/optim/__init__.py similarity index 100% rename from virtex/virtex/optim/__init__.py rename to virtex/optim/__init__.py diff --git a/virtex/virtex/optim/lookahead.py b/virtex/optim/lookahead.py similarity index 100% rename from virtex/virtex/optim/lookahead.py rename to virtex/optim/lookahead.py diff --git a/virtex/virtex/optim/lr_scheduler.py b/virtex/optim/lr_scheduler.py similarity index 100% rename from virtex/virtex/optim/lr_scheduler.py rename to virtex/optim/lr_scheduler.py diff --git a/virtex/scripts/clf_linear.py b/virtex/scripts/clf_linear.py deleted file mode 100644 index 
52ab5f22d974cf4e523f174aab09143d7d19b005..0000000000000000000000000000000000000000 --- a/virtex/scripts/clf_linear.py +++ /dev/null @@ -1,302 +0,0 @@ -import argparse -import os - -from loguru import logger -import torch -from torch import nn -from torch.cuda import amp -from torch.utils.data import DataLoader, DistributedSampler -from torch.utils.tensorboard import SummaryWriter - -from virtex.config import Config -from virtex.factories import ( - DownstreamDatasetFactory, - PretrainingModelFactory, - OptimizerFactory, - LRSchedulerFactory, -) -from virtex.utils.checkpointing import CheckpointManager -from virtex.utils.common import common_parser, common_setup, cycle -import virtex.utils.distributed as dist -from virtex.utils.metrics import TopkAccuracy -from virtex.utils.timer import Timer - - -# fmt: off -parser = common_parser( - description="""Do image classification with linear models and frozen - feature extractor, or fine-tune the feature extractor end-to-end.""" -) -group = parser.add_argument_group("Downstream config arguments.") -group.add_argument( - "--down-config", metavar="FILE", help="Path to a downstream config file." -) -group.add_argument( - "--down-config-override", nargs="*", default=[], - help="A list of key-value pairs to modify downstream config params.", -) - -parser.add_argument_group("Checkpointing and Logging") -parser.add_argument( - "--weight-init", choices=["random", "imagenet", "torchvision", "virtex"], - default="virtex", help="""How to initialize weights: - 1. 'random' initializes all weights randomly - 2. 'imagenet' initializes backbone weights from torchvision model zoo - 3. {'torchvision', 'virtex'} load state dict from --checkpoint-path - - with 'torchvision', state dict would be from PyTorch's training - script. - - with 'virtex' it should be for our full pretrained model.""" -) -parser.add_argument( - "--log-every", type=int, default=50, - help="""Log training curves to tensorboard after every these many iterations - only master process logs averaged loss values across processes.""", -) -parser.add_argument( - "--checkpoint-path", - help="""Path to load checkpoint and run downstream task evaluation. The - name of checkpoint file is required to be `model_*.pth`, where * is - iteration number from which the checkpoint was serialized.""" -) -parser.add_argument( - "--checkpoint-every", type=int, default=5000, - help="""Serialize model to a checkpoint after every these many iterations. - For ImageNet, (5005 iterations = 1 epoch); for iNaturalist (1710 iterations - = 1 epoch).""", -) -# fmt: on - - -def main(_A: argparse.Namespace): - - if _A.num_gpus_per_machine == 0: - # Set device as CPU if num_gpus_per_machine = 0. - device = torch.device("cpu") - else: - # Get the current device as set for current distributed process. - # Check `launch` function in `virtex.utils.distributed` module. - device = torch.cuda.current_device() - - # Create a downstream config object (this will be immutable) and perform - # common setup such as logging and setting up serialization directory. - _DOWNC = Config(_A.down_config, _A.down_config_override) - common_setup(_DOWNC, _A, job_type="downstream") - - # Create a (pretraining) config object and backup in serializaion directory. - _C = Config(_A.config, _A.config_override) - _C.dump(os.path.join(_A.serialization_dir, "pretrain_config.yaml")) - - # Get dataset name for tensorboard logging. 
- DATASET = _DOWNC.DATA.ROOT.split("/")[-1] - - # Set number of output classes according to dataset: - NUM_CLASSES_MAPPING = {"imagenet": 1000, "inaturalist": 8142} - NUM_CLASSES = NUM_CLASSES_MAPPING[DATASET] - - # ------------------------------------------------------------------------- - # INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER - # ------------------------------------------------------------------------- - train_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="train") - train_dataloader = DataLoader( - train_dataset, - batch_size=_DOWNC.OPTIM.BATCH_SIZE // dist.get_world_size(), - num_workers=_A.cpu_workers, - sampler=DistributedSampler( - train_dataset, - num_replicas=dist.get_world_size(), - rank=dist.get_rank(), - shuffle=True, - ), - drop_last=False, - pin_memory=True, - collate_fn=train_dataset.collate_fn, - ) - val_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="val") - val_dataloader = DataLoader( - val_dataset, - batch_size=_DOWNC.OPTIM.BATCH_SIZE // dist.get_world_size(), - num_workers=_A.cpu_workers, - sampler=DistributedSampler( - val_dataset, - num_replicas=dist.get_world_size(), - rank=dist.get_rank(), - shuffle=False, - ), - pin_memory=True, - drop_last=False, - collate_fn=val_dataset.collate_fn, - ) - # Initialize model using pretraining config. - pretrained_model = PretrainingModelFactory.from_config(_C) - - # Load weights according to the init method, do nothing for `random`, and - # `imagenet` is already taken care of. - if _A.weight_init == "virtex": - CheckpointManager(model=pretrained_model).load(_A.checkpoint_path) - elif _A.weight_init == "torchvision": - # Keep strict=False because this state dict may have weights for - # last fc layer. - pretrained_model.visual.cnn.load_state_dict( - torch.load(_A.checkpoint_path, map_location="cpu")["state_dict"], - strict=False, - ) - - # Pull out the CNN (torchvision-like) from our pretrained model and add - # back the FC layer - this is exists in torchvision models, and is set to - # `nn.Identity()` during pretraining. - model = pretrained_model.visual.cnn # type: ignore - model.fc = nn.Linear(_DOWNC.MODEL.VISUAL.FEATURE_SIZE, NUM_CLASSES).to(device) - model = model.to(device) - - # Re-initialize the FC layer. - torch.nn.init.normal_(model.fc.weight.data, mean=0.0, std=0.01) - torch.nn.init.constant_(model.fc.bias.data, 0.0) - - # Freeze all layers except FC as per config param. - if _DOWNC.MODEL.VISUAL.FROZEN: - # Set model to eval mode to prevent BatchNorm from updating running - # mean and std. With only a linear layer, being in eval mode when - # training will not matter anyway. - model.eval() - - for name, param in model.named_parameters(): - if "fc" not in name: - param.requires_grad = False - - # Cross entropy loss and accuracy meter. - criterion = nn.CrossEntropyLoss() - top1 = TopkAccuracy(top_k=1) - - optimizer = OptimizerFactory.from_config(_DOWNC, model.named_parameters()) - scheduler = LRSchedulerFactory.from_config(_DOWNC, optimizer) - del pretrained_model - - # ------------------------------------------------------------------------- - # BEFORE TRAINING STARTS - # ------------------------------------------------------------------------- - - # Create a gradient scaler for automatic mixed precision. - scaler = amp.GradScaler(enabled=_DOWNC.AMP) - - # Create an iterator from dataloader to sample batches perpetually. 
- train_dataloader_iter = cycle(train_dataloader, device) - - if dist.get_world_size() > 1: - dist.synchronize() - model = nn.parallel.DistributedDataParallel( - model, device_ids=[device], find_unused_parameters=True - ) - - if dist.is_master_process(): - checkpoint_manager = CheckpointManager( - _A.serialization_dir, - model=model, - optimizer=optimizer, - scheduler=scheduler, - ) - tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir) - - # Keep track of time per iteration and ETA. - timer = Timer(start_from=1, total_iterations=_DOWNC.OPTIM.NUM_ITERATIONS) - - # ------------------------------------------------------------------------- - # TRAINING LOOP - # ------------------------------------------------------------------------- - for iteration in range(1, _DOWNC.OPTIM.NUM_ITERATIONS + 1): - timer.tic() - optimizer.zero_grad() - batch = next(train_dataloader_iter) - - with amp.autocast(enabled=_DOWNC.AMP): - logits = model(batch["image"]) - loss = criterion(logits, batch["label"]) - - scaler.scale(loss).backward() - scaler.step(optimizer) - scaler.update() - - scheduler.step() - timer.toc() - - if iteration % _A.log_every == 0 and dist.is_master_process(): - logger.info( - f"{timer.stats} | Loss: {loss:.3f} | GPU: {dist.gpu_mem_usage()} MB" - ) - tensorboard_writer.add_scalar(f"{DATASET}/train_loss", loss, iteration) - tensorboard_writer.add_scalar( - f"{DATASET}/learning_rate", - optimizer.param_groups[0]["lr"], - iteration, - ) - - # --------------------------------------------------------------------- - # VALIDATION - # --------------------------------------------------------------------- - if iteration % _A.checkpoint_every == 0: - torch.set_grad_enabled(False) - model.eval() - - total_val_loss = torch.tensor(0.0).to(device) - - for val_iteration, batch in enumerate(val_dataloader, start=1): - for key in batch: - batch[key] = batch[key].to(device) - - logits = model(batch["image"]) - loss = criterion(logits, batch["label"]) - top1(logits, batch["label"]) - total_val_loss += loss - - # Divide each loss component by number of val batches per GPU. - total_val_loss = total_val_loss / val_iteration - dist.average_across_processes(total_val_loss) - - # Get accumulated Top-1 accuracy for logging across GPUs. - acc = top1.get_metric(reset=True) - dist.average_across_processes(acc) - - torch.set_grad_enabled(True) - - # Set model back to train mode only when fine-tuning end-to-end. - if not _DOWNC.MODEL.VISUAL.FROZEN: - model.train() - - # Save recent checkpoint and best checkpoint based on accuracy. - if dist.is_master_process(): - checkpoint_manager.step(iteration) - - if iteration % _A.checkpoint_every == 0 and dist.is_master_process(): - logger.info(f"Iter: {iteration} | Top-1 accuracy: {acc})") - tensorboard_writer.add_scalar( - f"{DATASET}/val_loss", total_val_loss, iteration - ) - # This name scoping will result in Tensorboard displaying all metrics - # (VOC07, caption, etc.) together. - tensorboard_writer.add_scalars( - f"metrics/{DATASET}", {"top1": acc}, iteration - ) - - # All processes will wait till master process is done logging. - dist.synchronize() - - -if __name__ == "__main__": - _A = parser.parse_args() - - # Add an arg in config override if `--weight-init` is imagenet. - if _A.weight_init == "imagenet": - _A.config_override.extend(["MODEL.VISUAL.PRETRAINED", True]) - - if _A.num_gpus_per_machine == 0: - main(_A) - else: - # This will launch `main` and set appropriate CUDA device (GPU ID) as - # per process (accessed in the beginning of `main`). 
- dist.launch( - main, - num_machines=_A.num_machines, - num_gpus_per_machine=_A.num_gpus_per_machine, - machine_rank=_A.machine_rank, - dist_url=_A.dist_url, - args=(_A,), - ) diff --git a/virtex/scripts/clf_voc07.py b/virtex/scripts/clf_voc07.py deleted file mode 100644 index 0e382c1ac49a3c9c254ab9c97f14652ed664fbf6..0000000000000000000000000000000000000000 --- a/virtex/scripts/clf_voc07.py +++ /dev/null @@ -1,272 +0,0 @@ -import argparse -import multiprocessing as mp -import os -from typing import Any, List - -from loguru import logger -import numpy as np -from sklearn.svm import LinearSVC -from sklearn.metrics import average_precision_score -from sklearn.model_selection import cross_val_score -import torch -from torch.utils.data import DataLoader -from torch.utils.tensorboard import SummaryWriter -from tqdm import tqdm - -from virtex.config import Config -from virtex.factories import PretrainingModelFactory, DownstreamDatasetFactory -from virtex.utils.checkpointing import CheckpointManager -from virtex.utils.common import common_parser, common_setup - - -parser = common_parser( - description="Train SVMs for VOC2007 classification on a pretrained model." -) -group = parser.add_argument_group("Downstream config arguments.") -group.add_argument( - "--down-config", metavar="FILE", help="Path to a downstream config file." -) -group.add_argument( - "--down-config-override", - nargs="*", - default=[], - help="A list of key-value pairs to modify downstream config params.", -) - -# fmt: off -parser.add_argument_group("Checkpointing") -parser.add_argument( - "--weight-init", choices=["random", "imagenet", "torchvision", "virtex"], - default="virtex", help="""How to initialize weights: - 1. 'random' initializes all weights randomly - 2. 'imagenet' initializes backbone weights from torchvision model zoo - 3. {'torchvision', 'virtex'} load state dict from --checkpoint-path - - with 'torchvision', state dict would be from PyTorch's training - script. - - with 'virtex' it should be for our full pretrained model.""" -) -parser.add_argument( - "--checkpoint-path", - help="Path to load checkpoint and run downstream task evaluation." -) -# fmt: on - - -def train_test_single_svm(args): - - feats_train, tgts_train, feats_test, tgts_test, cls_name = args - SVM_COSTS = [0.01, 0.1, 1.0, 10.0] - - cls_labels = np.copy(tgts_train) - # Meaning of labels in VOC/COCO original loaded target files: - # label 0 = not present, set it to -1 as svm train target - # label 1 = present. Make the svm train target labels as -1, 1. - cls_labels[np.where(cls_labels == 0)] = -1 - - # See which cost maximizes the AP for this class. - best_crossval_ap: float = 0.0 - best_crossval_clf = None - best_cost: float = 0.0 - - # fmt: off - for cost in SVM_COSTS: - clf = LinearSVC( - C=cost, class_weight={1: 2, -1: 1}, penalty="l2", - loss="squared_hinge", max_iter=2000, - ) - ap_scores = cross_val_score( - clf, feats_train, cls_labels, cv=3, scoring="average_precision", - ) - clf.fit(feats_train, cls_labels) - - # Keep track of best SVM (based on cost) for each class. 
- if ap_scores.mean() > best_crossval_ap: - best_crossval_ap = ap_scores.mean() - best_crossval_clf = clf - best_cost = cost - - logger.info(f"Best SVM {cls_name}: cost {best_cost}, mAP {best_crossval_ap * 100}") - # fmt: on - - # ------------------------------------------------------------------------- - # TEST THE TRAINED SVM (PER CLASS) - # ------------------------------------------------------------------------- - predictions = best_crossval_clf.decision_function(feats_test) - evaluate_data_inds = tgts_test != -1 - eval_preds = predictions[evaluate_data_inds] - - cls_labels = np.copy(tgts_test) - eval_cls_labels = cls_labels[evaluate_data_inds] - eval_cls_labels[np.where(eval_cls_labels == 0)] = -1 - - # Binarize class labels to make AP targets. - targets = eval_cls_labels > 0 - return average_precision_score(targets, eval_preds) - - -def main(_A: argparse.Namespace): - - if _A.num_gpus_per_machine == 0: - # Set device as CPU if num_gpus_per_machine = 0. - device = torch.device("cpu") - else: - # Get the current device (this will be zero here by default). - device = torch.cuda.current_device() - - # Create a downstream config object (this will be immutable) and perform - # common setup such as logging and setting up serialization directory. - _DOWNC = Config(_A.down_config, _A.down_config_override) - common_setup(_DOWNC, _A, job_type="downstream") - - # Create a (pretraining) config object and backup in serialization directory. - _C = Config(_A.config, _A.config_override) - _C.dump(os.path.join(_A.serialization_dir, "pretrain_config.yaml")) - - # ------------------------------------------------------------------------- - # INSTANTIATE DATALOADER, MODEL, AND FEATURE EXTRACTOR - # ------------------------------------------------------------------------- - - train_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="trainval") - train_dataloader = DataLoader( - train_dataset, - batch_size=_DOWNC.OPTIM.BATCH_SIZE, - num_workers=_A.cpu_workers, - pin_memory=True, - ) - test_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="test") - test_dataloader = DataLoader( - test_dataset, - batch_size=_DOWNC.OPTIM.BATCH_SIZE, - num_workers=_A.cpu_workers, - pin_memory=True, - ) - NUM_CLASSES = len(train_dataset.class_names) - - # Initialize from a checkpoint, but only keep the visual module. - model = PretrainingModelFactory.from_config(_C) - - # Load weights according to the init method, do nothing for `random`, and - # `imagenet` is already taken care of. - if _A.weight_init == "virtex": - ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path) - elif _A.weight_init == "torchvision": - # Keep strict=False because this state dict may have weights for - # last fc layer. - model.visual.cnn.load_state_dict( - torch.load(_A.checkpoint_path, map_location="cpu")["state_dict"], - strict=False, - ) - # Set ``ITERATION`` to a dummy value. - ITERATION = 0 - - # Transfer model to GPU and set to eval mode. This is a torchvision model - # and it returns features as ``(batch_size, 2048, 7, 7)``. - model = model.visual.cnn.to(device).eval() - - # ------------------------------------------------------------------------- - # EXTRACT FEATURES FOR TRAINING SVMs - # ------------------------------------------------------------------------- - - features_train: List[torch.Tensor] = [] - targets_train: List[torch.Tensor] = [] - - features_test: List[torch.Tensor] = [] - targets_test: List[torch.Tensor] = [] - - # VOC07 is small, extract all features and keep them in memory. 
- with torch.no_grad(): - for batch in tqdm(train_dataloader, desc="Extracting train features:"): - features = model(batch["image"].to(device)) - - # Global average pool features. Assume the tensor is in NCHW format. - if len(features.size()) > 2: - features = features.view(features.size(0), features.size(1), -1) - - # shape: (batch_size, visual_feature_size) - features = features.mean(dim=-1) - - # shape: (batch_size, visual_feature_size) - features = features.view(features.size(0), -1) - - # L2-normalize the global average pooled features. - features = features / torch.norm(features, dim=-1).unsqueeze(-1) - - features_train.append(features.cpu()) - targets_train.append(batch["label"]) - - # Similarly extract test features. - for batch in tqdm(test_dataloader, desc="Extracting test features:"): - features = model(batch["image"].to(device)) - - if len(features.size()) > 2: - features = features.view(features.size(0), features.size(1), -1) - features = features.mean(dim=-1) - - features = features.view(features.size(0), -1) - features = features / torch.norm(features, dim=-1).unsqueeze(-1) - - features_test.append(features.cpu()) - targets_test.append(batch["label"]) - - # Convert batches of features/targets to one large numpy array - features_train = torch.cat(features_train, dim=0).numpy() - targets_train = torch.cat(targets_train, dim=0).numpy().astype(np.int32) - - features_test = torch.cat(features_test, dim=0).numpy() - targets_test = torch.cat(targets_test, dim=0).numpy().astype(np.int32) - - # ------------------------------------------------------------------------- - # TRAIN AND TEST SVMs WITH EXTRACTED FEATURES - # ------------------------------------------------------------------------- - - input_args: List[Any] = [] - - # Iterate over all VOC07 classes and train one-vs-all linear SVMs. - for cls_idx in range(NUM_CLASSES): - # fmt: off - input_args.append(( - features_train, targets_train[:, cls_idx], - features_test, targets_test[:, cls_idx], - train_dataset.class_names[cls_idx], - )) - # fmt: on - - pool = mp.Pool(processes=_A.cpu_workers) - pool_output = pool.map(train_test_single_svm, input_args) - - # ------------------------------------------------------------------------- - # TENSORBOARD LOGGING (RELEVANT MAINLY FOR weight_init=checkpoint) - # ------------------------------------------------------------------------- - - # Tensorboard writer for logging mAP scores. This is useful especially - # when weight_init=checkpoint (which maybe be coming from a training job). - tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir) - - # Test set mAP for each class, for features from every layer. - test_map = torch.tensor(pool_output).mean() - logger.info(f"Iteration: {ITERATION}, mAP: {test_map * 100}") - tensorboard_writer.add_scalars( - "metrics/voc07_clf", {f"voc07_mAP": test_map * 100}, ITERATION - ) - - # NOTE: for copy-pasting to spreadsheet. - logger.info( - f"{_C.DATA.ROOT.split('/')[1]},{_C.DATA.TOKENIZER_MODEL.split('/')[-1][:-6]}," - f"{_C.DATA.VOCAB_SIZE},{_C.MODEL.NAME},{_C.MODEL.VISUAL.NAME},{_C.MODEL.TEXTUAL.NAME}," - f"{_C.MODEL.LABEL_SMOOTHING},{_C.OPTIM.OPTIMIZER_NAME},{_C.OPTIM.BATCH_SIZE}," - f"{_C.OPTIM.NUM_ITERATIONS},{_C.OPTIM.LR},{_C.OPTIM.WEIGHT_DECAY}," - f"{ITERATION},{test_map * 100:.3f}" - ) - -if __name__ == "__main__": - _A = parser.parse_args() - - if _A.num_gpus_per_machine > 1: - raise ValueError("Using multiple GPUs is not supported for this script.") - - # Add an arg in config override if `--weight-init` is imagenet. 
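# [Editor's sketch, not part of the original patch] The pooling and normalization
# applied above, shown on a fake backbone output: (batch, channels, H, W)
# activations are global-average-pooled to (batch, channels) and L2-normalized,
# which is the feature format fed to the linear SVMs.
import torch

features = torch.randn(4, 2048, 7, 7)                   # fake conv5 activations
pooled = features.flatten(2).mean(dim=-1)               # (4, 2048), same as view(...).mean(...)
normalized = pooled / torch.norm(pooled, dim=-1).unsqueeze(-1)
print(pooled.shape, normalized.norm(dim=-1))            # rows now have unit L2 norm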
- if _A.weight_init == "imagenet": - _A.config_override.extend(["MODEL.VISUAL.PRETRAINED", True]) - - # No distributed training here, just a single process. - main(_A) diff --git a/virtex/scripts/eval_captioning.py b/virtex/scripts/eval_captioning.py deleted file mode 100644 index 8da98284f1726027536e38b72b4a82ba04bea396..0000000000000000000000000000000000000000 --- a/virtex/scripts/eval_captioning.py +++ /dev/null @@ -1,114 +0,0 @@ -import argparse -import json -import os -from typing import Any, Dict, List - -from loguru import logger -import torch -from torch.utils.data import DataLoader - -from virtex.config import Config -from virtex.data import ImageDirectoryDataset -from virtex.factories import TokenizerFactory, PretrainingModelFactory -from virtex.utils.checkpointing import CheckpointManager -from virtex.utils.common import common_parser -from virtex.utils.metrics import CocoCaptionsEvaluator - - -# fmt: off -parser = common_parser( - description="""Run image captioning inference on a pretrained model, and/or - evaluate pretrained model on COCO Captions val2017 split.""" -) -parser.add_argument( - "--data-root", default=None, - help="""Path to a directory containing image files to generate captions for. - Default: COCO val2017 image directory as expected relative to project root.""" -) -parser.add_argument( - "--checkpoint-path", required=True, - help="Path to load checkpoint and run captioning evaluation." -) -parser.add_argument( - "--output", default=None, - help="Path to save predictions as a JSON file." -) -parser.add_argument( - "--calc-metrics", action="store_true", - help="""Calculate CIDEr and SPICE metrics using ground truth COCO Captions. - This flag should not be set when running inference on arbitrary images.""" -) -# fmt: on - - -def main(_A: argparse.Namespace): - - if _A.num_gpus_per_machine == 0: - # Set device as CPU if num_gpus_per_machine = 0. - device = torch.device("cpu") - else: - # Get the current device (this will be zero here by default). - device = torch.cuda.current_device() - - _C = Config(_A.config, _A.config_override) - - tokenizer = TokenizerFactory.from_config(_C) - - if _A.data_root is None: - _A.data_root = os.path.join(_C.DATA.ROOT, "val2017") - - val_dataloader = DataLoader( - ImageDirectoryDataset(_A.data_root), - batch_size=_C.OPTIM.BATCH_SIZE, - num_workers=_A.cpu_workers, - pin_memory=True, - ) - # Initialize model from a checkpoint. - model = PretrainingModelFactory.from_config(_C).to(device) - ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path) - model.eval() - - # Make a list of predictions to evaluate. - predictions: List[Dict[str, Any]] = [] - - for val_iteration, val_batch in enumerate(val_dataloader, start=1): - - val_batch["image"] = val_batch["image"].to(device) - with torch.no_grad(): - output_dict = model(val_batch) - - # Make a dictionary of predictions in COCO format. - for image_id, caption in zip( - val_batch["image_id"], output_dict["predictions"] - ): - predictions.append( - { - # Convert image id to int if possible (mainly for COCO eval). - "image_id": int(image_id) if image_id.isdigit() else image_id, - "caption": tokenizer.decode(caption.tolist()), - } - ) - - # Save predictions as a JSON file if specified. - if _A.output is not None: - os.makedirs(os.path.dirname(_A.output), exist_ok=True) - json.dump(predictions, open(_A.output, "w")) - logger.info(f"Saved predictions to {_A.output}") - - # Calculate CIDEr and SPICE metrics using ground truth COCO Captions. 
This - # should be skipped when running inference on arbitrary images. - if _A.calc_metrics: - # Assume ground truth (COCO val2017 annotations) exist. - gt = os.path.join(_C.DATA.ROOT, "annotations", "captions_val2017.json") - - metrics = CocoCaptionsEvaluator(gt).evaluate(predictions) - logger.info(f"Iter: {ITERATION} | Metrics: {metrics}") - - -if __name__ == "__main__": - _A = parser.parse_args() - if _A.num_gpus_per_machine > 1: - raise ValueError("Using multiple GPUs is not supported for this script.") - - # No distributed training here, just a single process. - main(_A) diff --git a/virtex/scripts/eval_detectron2.py b/virtex/scripts/eval_detectron2.py deleted file mode 100644 index b79147080f8c56313e1a809b9f1a791ecd380e11..0000000000000000000000000000000000000000 --- a/virtex/scripts/eval_detectron2.py +++ /dev/null @@ -1,248 +0,0 @@ -""" -Finetune a pre-trained model on a downstream task, one of those available in -Detectron2. -Supported downstream: - - LVIS Instance Segmentation - - COCO Instance Segmentation - - Pascal VOC 2007+12 Object Detection - -Reference: https://github.com/facebookresearch/detectron2/blob/master/tools/train_net.py -Thanks to the developers of Detectron2! -""" -import argparse -import os -import re -from typing import Any, Dict, Union - -import torch -from torch.utils.tensorboard import SummaryWriter - -import detectron2 as d2 -from detectron2.checkpoint import DetectionCheckpointer -from detectron2.engine import DefaultTrainer, default_setup -from detectron2.evaluation import ( - LVISEvaluator, - PascalVOCDetectionEvaluator, - COCOEvaluator, -) -from detectron2.modeling.roi_heads import ROI_HEADS_REGISTRY, Res5ROIHeads - -from virtex.config import Config -from virtex.factories import PretrainingModelFactory -from virtex.utils.checkpointing import CheckpointManager -from virtex.utils.common import common_parser -import virtex.utils.distributed as dist - -# fmt: off -parser = common_parser( - description="Train object detectors from pretrained visual backbone." -) -parser.add_argument( - "--d2-config", required=True, - help="Path to a detectron2 config for downstream task finetuning." -) -parser.add_argument( - "--d2-config-override", nargs="*", default=[], - help="""Key-value pairs from Detectron2 config to override from file. - Some keys will be ignored because they are set from other args: - [DATALOADER.NUM_WORKERS, SOLVER.EVAL_PERIOD, SOLVER.CHECKPOINT_PERIOD, - TEST.EVAL_PERIOD, OUTPUT_DIR]""", -) - -parser.add_argument_group("Checkpointing and Logging") -parser.add_argument( - "--weight-init", choices=["random", "imagenet", "torchvision", "virtex"], - default="virtex", help="""How to initialize weights: - 1. 'random' initializes all weights randomly - 2. 'imagenet' initializes backbone weights from torchvision model zoo - 3. {'torchvision', 'virtex'} load state dict from --checkpoint-path - - with 'torchvision', state dict would be from PyTorch's training - script. - - with 'virtex' it should be for our full pretrained model.""" -) -parser.add_argument( - "--checkpoint-path", - help="Path to load checkpoint and run downstream task evaluation." 
-) -parser.add_argument( - "--resume", action="store_true", help="""Specify this flag when resuming - training from a checkpoint saved by Detectron2.""" -) -parser.add_argument( - "--eval-only", action="store_true", - help="Skip training and evaluate checkpoint provided at --checkpoint-path.", -) -parser.add_argument( - "--checkpoint-every", type=int, default=5000, - help="Serialize model to a checkpoint after every these many iterations.", -) -# fmt: on - - -@ROI_HEADS_REGISTRY.register() -class Res5ROIHeadsExtraNorm(Res5ROIHeads): - r""" - ROI head with ``res5`` stage followed by a BN layer. Used with Faster R-CNN - C4/DC5 backbones for VOC detection. - """ - - def _build_res5_block(self, cfg): - seq, out_channels = super()._build_res5_block(cfg) - norm = d2.layers.get_norm(cfg.MODEL.RESNETS.NORM, out_channels) - seq.add_module("norm", norm) - return seq, out_channels - - -def build_detectron2_config(_C: Config, _A: argparse.Namespace): - r"""Build detectron2 config based on our pre-training config and args.""" - _D2C = d2.config.get_cfg() - - # Override some default values based on our config file. - _D2C.merge_from_file(_A.d2_config) - _D2C.merge_from_list(_A.d2_config_override) - - # Set some config parameters from args. - _D2C.DATALOADER.NUM_WORKERS = _A.cpu_workers - _D2C.SOLVER.CHECKPOINT_PERIOD = _A.checkpoint_every - _D2C.OUTPUT_DIR = _A.serialization_dir - - # Set ResNet depth to override in Detectron2's config. - _D2C.MODEL.RESNETS.DEPTH = int( - re.search(r"resnet(\d+)", _C.MODEL.VISUAL.NAME).group(1) - if "torchvision" in _C.MODEL.VISUAL.NAME - else re.search(r"_R_(\d+)", _C.MODEL.VISUAL.NAME).group(1) - if "detectron2" in _C.MODEL.VISUAL.NAME - else 0 - ) - return _D2C - - -class DownstreamTrainer(DefaultTrainer): - r""" - Extension of detectron2's ``DefaultTrainer``: custom evaluator and hooks. - - Parameters - ---------- - cfg: detectron2.config.CfgNode - Detectron2 config object containing all config params. - weights: Union[str, Dict[str, Any]] - Weights to load in the initialized model. If ``str``, then we assume path - to a checkpoint, or if a ``dict``, we assume a state dict. This will be - an ``str`` only if we resume training from a Detectron2 checkpoint. - """ - - def __init__(self, cfg, weights: Union[str, Dict[str, Any]]): - - super().__init__(cfg) - - # Load pre-trained weights before wrapping to DDP because `ApexDDP` has - # some weird issue with `DetectionCheckpointer`. - # fmt: off - if isinstance(weights, str): - # weights are ``str`` means ImageNet init or resume training. - self.start_iter = ( - DetectionCheckpointer( - self._trainer.model, - optimizer=self._trainer.optimizer, - scheduler=self.scheduler - ).resume_or_load(weights, resume=True).get("iteration", -1) + 1 - ) - elif isinstance(weights, dict): - # weights are a state dict means our pretrain init. 
- DetectionCheckpointer(self._trainer.model)._load_model(weights) - # fmt: on - - @classmethod - def build_evaluator(cls, cfg, dataset_name, output_folder=None): - if output_folder is None: - output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") - evaluator_list = [] - evaluator_type = d2.data.MetadataCatalog.get(dataset_name).evaluator_type - if evaluator_type == "pascal_voc": - return PascalVOCDetectionEvaluator(dataset_name) - elif evaluator_type == "coco": - return COCOEvaluator(dataset_name, cfg, True, output_folder) - elif evaluator_type == "lvis": - return LVISEvaluator(dataset_name, cfg, True, output_folder) - - def test(self, cfg=None, model=None, evaluators=None): - r"""Evaluate the model and log results to stdout and tensorboard.""" - cfg = cfg or self.cfg - model = model or self.model - - tensorboard_writer = SummaryWriter(log_dir=cfg.OUTPUT_DIR) - results = super().test(cfg, model) - flat_results = d2.evaluation.testing.flatten_results_dict(results) - for k, v in flat_results.items(): - tensorboard_writer.add_scalar(k, v, self.start_iter) - - -def main(_A: argparse.Namespace): - - # Get the current device as set for current distributed process. - # Check `launch` function in `virtex.utils.distributed` module. - device = torch.cuda.current_device() - - # Local process group is needed for detectron2. - pg = list(range(dist.get_world_size())) - d2.utils.comm._LOCAL_PROCESS_GROUP = torch.distributed.new_group(pg) - - # Create a config object (this will be immutable) and perform common setup - # such as logging and setting up serialization directory. - if _A.weight_init == "imagenet": - _A.config_override.extend(["MODEL.VISUAL.PRETRAINED", True]) - _C = Config(_A.config, _A.config_override) - - # We use `default_setup` from detectron2 to do some common setup, such as - # logging, setting up serialization etc. For more info, look into source. - _D2C = build_detectron2_config(_C, _A) - default_setup(_D2C, _A) - - # Prepare weights to pass in instantiation call of trainer. - if _A.weight_init in {"virtex", "torchvision"}: - if _A.resume: - # If resuming training, let detectron2 load weights by providing path. - model = None - weights = _A.checkpoint_path - else: - # Load backbone weights from VirTex pretrained checkpoint. - model = PretrainingModelFactory.from_config(_C) - if _A.weight_init == "virtex": - CheckpointManager(model=model).load(_A.checkpoint_path) - else: - model.visual.cnn.load_state_dict( - torch.load(_A.checkpoint_path, map_location="cpu")["state_dict"], - strict=False, - ) - weights = model.visual.detectron2_backbone_state_dict() - else: - # If random or imagenet init, just load weights after initializing model. - model = PretrainingModelFactory.from_config(_C) - weights = model.visual.detectron2_backbone_state_dict() - - # Back up pretrain config and model checkpoint (if provided). - _C.dump(os.path.join(_A.serialization_dir, "pretrain_config.yaml")) - if _A.weight_init == "virtex" and not _A.resume: - torch.save( - model.state_dict(), - os.path.join(_A.serialization_dir, "pretrain_model.pth"), - ) - - del model - trainer = DownstreamTrainer(_D2C, weights) - trainer.test() if _A.eval_only else trainer.train() - - -if __name__ == "__main__": - _A = parser.parse_args() - - # This will launch `main` and set appropriate CUDA device (GPU ID) as - # per process (accessed in the beginning of `main`). 
- dist.launch( - main, - num_machines=_A.num_machines, - num_gpus_per_machine=_A.num_gpus_per_machine, - machine_rank=_A.machine_rank, - dist_url=_A.dist_url, - args=(_A, ), - ) diff --git a/virtex/scripts/preprocess/build_redcaps_vocab.py b/virtex/scripts/preprocess/build_redcaps_vocab.py deleted file mode 100644 index fd28f1b8d72fde036f631032a539c4fe16d169f2..0000000000000000000000000000000000000000 --- a/virtex/scripts/preprocess/build_redcaps_vocab.py +++ /dev/null @@ -1,107 +0,0 @@ -import argparse -import glob -import json -import os -import re -import tempfile -from functools import lru_cache -from typing import List - -import ftfy -import sentencepiece as sp -import wordsegment as ws -from tqdm import tqdm - - -ws.load() - -# fmt: off -parser = argparse.ArgumentParser( - description="""Build a vocabulary out of captions corpus. This vocabulary - would be a file which our tokenizer can understand. - """ -) -parser.add_argument( - "-f", "--files", nargs="+", default="datasets/redcaps/annotations/*.json", - help="Path(s) to SBU, Conceptual, or RedCaps annotation files.", -) -parser.add_argument( - "-s", "--vocab-size", type=int, default=32000, - help="Total desired size of our vocabulary.", -) -parser.add_argument( - "-o", "--output-prefix", default="datasets/vocab/redcaps_32k", - help="Prefix of the files to be saved. Two files will be saved: " - "[prefix].model and [prefix].vocab", -) -# fmt: on - - -def read_captions_from_file(annotations_path: str) -> List[str]: - r""" - Given a path to annotation file, read it and return a list of captions. - - Parameters - ---------- - annotations_path: str - Path to an annotations file containing captions. - - Returns - ------- - List[str] - List of captions from this annotation file. - """ - - _annotations = json.load(open(annotations_path)) - - captions: List[str] = [] - for ann in tqdm(_annotations["annotations"], desc=annotations_path): - - # This field only exists in RedCaps. Perform word segmentation on the - # subreddit name to add appropriae whitespaces. - if "subreddit" in ann: - subreddit_seg = _segment_subreddit(ann["subreddit"].lower()) - caption = f"{subreddit_seg} {ann['caption']}" - else: - caption = ann["caption"] - - captions.append(caption.lower()) - return captions - - -@lru_cache(maxsize=10) -def _segment_subreddit(subreddit): - return " ".join(ws.segment(ws.clean(subreddit))) - - -if __name__ == "__main__": - _A = parser.parse_args() - - all_filepaths: List[str] = [] - for f in _A.files: - all_filepaths.extend(glob.glob(f)) - - captions: List[str] = [] - for path in tqdm(all_filepaths, desc="Reading captions"): - captions.extend(read_captions_from_file(path)) - - # Create a temporary directory and dump the captions corpus as a text file - # with one caption per line. That's how sentencepiece wants its input. - tmpdir_path = tempfile.mkdtemp() - - with open(os.path.join(tmpdir_path, "captions.txt"), "w") as captions_file: - for caption in captions: - captions_file.write(caption + "\n") - - # Padding/out-of-vocab token will be "<unk>" and ID 0 by default. - # Add [SOS],[EOS] and [SEP] tokens. [SEP] will not be used during - # captioning, but good to have to reuse vocabulary across pretext tasks. 
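# [Editor's sketch, not part of the original patch] What the word-segmentation
# step above does to a concatenated subreddit name, using the same `wordsegment`
# package; the segmented name is then prepended to the caption text.
import wordsegment as ws

ws.load()   # one-time load of the unigram/bigram frequency tables
for name in ["itookapicture", "earthporn", "mildlyinteresting"]:
    print(name, "->", " ".join(ws.segment(ws.clean(name))))
# e.g. "itookapicture" -> "i took a picture"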
- sp.SentencePieceTrainer.train( - f" --input={os.path.join(tmpdir_path, 'captions.txt')}" - f" --vocab_size={_A.vocab_size}" - f" --model_prefix={_A.output_prefix}" - " --model_type=bpe --character_coverage=1.0" - " --bos_id=-1 --eos_id=-1" - " --control_symbols=[SOS],[EOS],[SEP]" - " --user_defined_symbols=<usr>" - ) diff --git a/virtex/scripts/preprocess/build_vocabulary.py b/virtex/scripts/preprocess/build_vocabulary.py deleted file mode 100644 index bc7a592b40d8044919279dc8116ca03dce20b5d1..0000000000000000000000000000000000000000 --- a/virtex/scripts/preprocess/build_vocabulary.py +++ /dev/null @@ -1,100 +0,0 @@ -import argparse -import json -import os -import tempfile -import unicodedata -from typing import List - -import sentencepiece as sp - - -# fmt: off -parser = argparse.ArgumentParser( - description="""Build a vocabulary out of captions corpus. This vocabulary - would be a file which our tokenizer can understand. - """ -) -parser.add_argument( - "-c", "--captions", default="datasets/coco/annotations/captions_train2017.json", - help="Path to caption annotations file in COCO format.", -) -parser.add_argument( - "-s", "--vocab-size", type=int, default=10000, - help="Total desired size of our vocabulary.", -) -parser.add_argument( - "-o", "--output-prefix", default="datasets/vocab/coco_10k", - help="Prefix of the files to be saved. Two files will be saved: " - "[prefix].model and [prefix].vocab", -) -parser.add_argument( - "-l", "--do-lower-case", action="store_true", - help="Whether to lower case the captions before forming vocabulary.", -) -parser.add_argument( - "-a", "--keep-accents", action="store_true", - help="Whether to keep accents before forming vocabulary (dropped by default).", -) -# fmt: on - - -def _read_captions(annotations_path: str) -> List[str]: - r""" - Given a path to annotation file, read it and return a list of captions. - These are not processed by any means, returned from the file as-is. - - Parameters - ---------- - annotations_path: str - Path to an annotations file containing captions. - - Returns - ------- - List[str] - List of captions from this annotation file. - """ - - _annotations = json.load(open(annotations_path)) - - captions: List[str] = [] - for ann in _annotations["annotations"]: - captions.append(ann["caption"]) - - return captions - - -if __name__ == "__main__": - _A = parser.parse_args() - captions: List[str] = _read_captions(_A.captions) - - # Lower case the captions and remove accents according to arguments. - for i, caption in enumerate(captions): - caption = caption.lower() if _A.do_lower_case else caption - - if not _A.keep_accents: - caption = unicodedata.normalize("NFKD", caption) - caption = "".join( - [chr for chr in caption if not unicodedata.combining(chr)] - ) - - captions[i] = caption - - # Create a temporary directory and dump the captions corpus as a text file - # with one caption per line. That's how sentencepiece wants its input. - tmpdir_path = tempfile.mkdtemp() - - with open(os.path.join(tmpdir_path, "captions.txt"), "w") as captions_file: - for caption in captions: - captions_file.write(caption + "\n") - - # Padding/out-of-vocab token will be "<unk>" and ID 0 by default. - # Add [SOS],[EOS] and [MASK] tokens. [MASK] will not be used during - # captioning, but good to have to reuse vocabulary across pretext tasks. 
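# [Editor's sketch, not part of the original patch] The accent-stripping step
# above on a single made-up caption: NFKD normalization decomposes accented
# characters into a base character plus combining marks, and the combining
# marks are then dropped.
import unicodedata

caption = "A café near the señor's piñata stand."
decomposed = unicodedata.normalize("NFKD", caption)
stripped = "".join(c for c in decomposed if not unicodedata.combining(c))
print(stripped)   # "A cafe near the senor's pinata stand."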
- sp.SentencePieceTrainer.train( - f" --input={os.path.join(tmpdir_path, 'captions.txt')}" - f" --vocab_size={_A.vocab_size}" - f" --model_prefix={_A.output_prefix}" - " --model_type=bpe --character_coverage=1.0" - " --bos_id=-1 --eos_id=-1" - " --control_symbols=[SOS],[EOS],[MASK]" - ) diff --git a/virtex/scripts/preprocess/preprocess_coco.py b/virtex/scripts/preprocess/preprocess_coco.py deleted file mode 100644 index abf768d94d494a2e8397b596ca7993a638d2d840..0000000000000000000000000000000000000000 --- a/virtex/scripts/preprocess/preprocess_coco.py +++ /dev/null @@ -1,100 +0,0 @@ -import argparse -import os -import pickle -import platform -from typing import Any, List - -import albumentations as alb -import lmdb -from tqdm import tqdm -from torch.utils.data import DataLoader - -from virtex.data.readers import SimpleCocoCaptionsReader - - -# fmt: off -parser = argparse.ArgumentParser("Serialize a COCO Captions split to LMDB.") -parser.add_argument( - "-d", "--data-root", default="datasets/coco", - help="Path to the root directory of COCO dataset.", -) -parser.add_argument( - "-s", "--split", choices=["train", "val"], - help="Which split to process, either `train` or `val`.", -) -parser.add_argument( - "-b", "--batch-size", type=int, default=16, - help="Batch size to process and serialize data. Set as per CPU memory.", -) -parser.add_argument( - "-j", "--cpu-workers", type=int, default=4, - help="Number of CPU workers for data loading.", -) -parser.add_argument( - "-e", "--short-edge-size", type=int, default=None, - help="""Resize shorter edge to this size (keeping aspect ratio constant) - before serializing. Useful for saving disk memory, and faster read. - If None, no images are resized.""" -) -parser.add_argument( - "-o", "--output", default="datasets/serialized/coco_train2017.lmdb", - help="Path to store the file containing serialized dataset.", -) - - -def collate_fn(instances: List[Any]): - r"""Collate function for data loader to return list of instances as-is.""" - return instances - - -if __name__ == "__main__": - - _A = parser.parse_args() - os.makedirs(os.path.dirname(_A.output), exist_ok=True) - - dloader = DataLoader( - SimpleCocoCaptionsReader(_A.data_root, _A.split), - batch_size=_A.batch_size, - num_workers=_A.cpu_workers, - shuffle=False, - drop_last=False, - collate_fn=collate_fn - ) - # Open an LMDB database. - # Set a sufficiently large map size for LMDB (based on platform). - map_size = 1099511627776 * 2 if platform.system() == "Linux" else 1280000 - db = lmdb.open( - _A.output, map_size=map_size, subdir=False, meminit=False, map_async=True - ) - - # Transform to resize shortest edge and keep aspect ratio same. - if _A.short_edge_size is not None: - resize = alb.SmallestMaxSize(max_size=_A.short_edge_size, always_apply=True) - - # Serialize each instance (as a dictionary). Use `pickle.dumps`. Key will - # be an integer (cast as string) starting from `0`. - INSTANCE_COUNTER: int = 0 - - for idx, batch in enumerate(tqdm(dloader)): - - txn = db.begin(write=True) - - for instance in batch: - image = instance["image"] - width, height, channels = image.shape - - # Resize image from instance and convert instance to tuple. 
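# [Editor's sketch, not part of the original patch] How an entry written by the
# serialization loop above can be read back. The LMDB path is the script's
# default output and may differ; keys are ASCII-encoded integers and values are
# pickled (image_id, image, captions) tuples.
import pickle

import lmdb

db = lmdb.open(
    "datasets/serialized/coco_train2017.lmdb",
    subdir=False, readonly=True, lock=False, readahead=False,
)
with db.begin() as txn:
    image_id, image, captions = pickle.loads(txn.get("0".encode("ascii")))
    print(image_id, image.shape, captions[0])
db.close()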
- if _A.short_edge_size is not None and min(width, height) > _A.short_edge_size: - image = resize(image=image)["image"] - - instance = (instance["image_id"], instance["image"], instance["captions"]) - txn.put( - f"{INSTANCE_COUNTER}".encode("ascii"), - pickle.dumps(instance, protocol=-1) - ) - INSTANCE_COUNTER += 1 - - txn.commit() - - db.sync() - db.close() diff --git a/virtex/scripts/preprocess/preprocess_redcaps.py b/virtex/scripts/preprocess/preprocess_redcaps.py deleted file mode 100644 index abb4e9e71e272797e2caf3eed304bc4cdc98f85e..0000000000000000000000000000000000000000 --- a/virtex/scripts/preprocess/preprocess_redcaps.py +++ /dev/null @@ -1,102 +0,0 @@ -import argparse -import json -import os -import tarfile -import tempfile -from typing import Dict, List - -from loguru import logger -from tqdm import tqdm - - -# fmt: off -parser = argparse.ArgumentParser( - description="""Pre-process RedCaps dataset for training VirTex models - make - small shards of TAR files containing images and captions.""" -) -parser.add_argument( - "-a", "--annotations", required=True, help="Path to a RedCaps annotation file." -) -parser.add_argument( - "-i", "--images", default="datasets/redcaps/images", - help="""Path to RedCaps image directory. This directory is expected to have - subreddit specific sub-directories containing images.""", -) -parser.add_argument( - "-z", "--shard-size", type=int, default=1000, - help="Maximum number of RedCaps instances in a single TAR file shard.", -) -parser.add_argument( - "-o", "--output-prefix", required=True, - help="Path prefix for saving TAR file shards. For example, `/tmp/tarfiles` " - "will save as `/tmp/tarfiles_000000.tar`, `/tmp/tarfiles_000001.tar`, ...", -) -# fmt: on - - -def main(_A: argparse.Namespace): - r""" - Make TAR files containing images and annotations from a single RedCaps - annotations file. These TAR files are arranged in a way that - `WebDataset <https://github.com/tmbdev/webdataset>`_ can understand. - """ - - ANNOTATIONS: List[Dict] = json.load(open(_A.annotations))["annotations"] - - # Keep track of the current index of TAR file shard and dataset index. - SHARD_INDEX: int = 0 - DATASET_INDEX: int = 0 - - # Create TAR file handle for the initial shard. - tar_handle = tarfile.open(f"{_A.output_prefix}_{SHARD_INDEX:0>d}.tar", "w") - - # Keep a count of submissions that were skipped because their image was - # not downloaded (not present in image dir). - SKIPPED: int = 0 - - for ann in tqdm(ANNOTATIONS): - - image_path = os.path.join( - _A.images, ann["subreddit"], f"{ann['image_id']}.jpg" - ) - # Add current image in shard if it exists. - if os.path.exists(image_path): - - tar_handle.add(image_path, arcname=f"{ann['image_id']}.jpg") - - # Save subreddit name and caption as a JSON file. - subreddit_and_caption = { - "subreddit": ann["subreddit"], "caption": ann["caption"] - } - tmpfile = tempfile.NamedTemporaryFile("w+") - tmpfile.write(json.dumps(subreddit_and_caption)) - tmpfile.seek(0) - tar_handle.add(tmpfile.name, arcname=f"{ann['image_id']}.json") - tmpfile.close() - - DATASET_INDEX += 1 - - # Create new shard if current shard is full. - if DATASET_INDEX % _A.shard_size == 0 and DATASET_INDEX > 0: - tar_handle.close() - logger.success( - f"Saved shard: {_A.output_prefix}_{SHARD_INDEX:0>6d}.tar" - ) - SHARD_INDEX += 1 - - # Open new TAR file shard. - tar_handle = tarfile.open( - f"{_A.output_prefix}_{SHARD_INDEX:0>6d}.tar", "w" - ) - else: - SKIPPED += 1 - - # Close the file handle to properly save it. 
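# [Editor's sketch, not part of the original patch] Reading back one of the TAR
# shards produced above with plain `tarfile`, just to show the layout WebDataset
# expects: each sample is an <image_id>.jpg / <image_id>.json pair. The shard
# path is a placeholder.
import json
import tarfile

with tarfile.open("tarfiles_000000.tar") as tar:
    json_names = [n for n in tar.getnames() if n.endswith(".json")]
    for name in json_names[:3]:
        ann = json.load(tar.extractfile(name))
        print(name.replace(".json", ".jpg"), ann["subreddit"], ann["caption"][:40])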
- tar_handle.close() - logger.success(f"Saved shard: {_A.output_prefix}_{SHARD_INDEX:0>6d}.tar\n") - logger.info(f"Skipped {SKIPPED} annotations due to missing images.") - - -if __name__ == "__main__": - _A = parser.parse_args() - main(_A) diff --git a/virtex/scripts/pretrain_virtex.py b/virtex/scripts/pretrain_virtex.py deleted file mode 100644 index 73e36ed3428c6899876fca9961dbeb81dcb2bd0c..0000000000000000000000000000000000000000 --- a/virtex/scripts/pretrain_virtex.py +++ /dev/null @@ -1,239 +0,0 @@ -import argparse -from collections import Counter -from typing import Any - -from loguru import logger -import torch -from torch import nn -from torch.cuda import amp -from torch.utils.data import DataLoader, DistributedSampler -from torch.utils.tensorboard import SummaryWriter - -# fmt: off -from virtex.config import Config -from virtex.factories import ( - PretrainingDatasetFactory, PretrainingModelFactory, OptimizerFactory, - LRSchedulerFactory, -) -from virtex.utils.checkpointing import CheckpointManager -from virtex.utils.common import common_parser, common_setup, cycle -import virtex.utils.distributed as dist -from virtex.utils.timer import Timer - - -parser = common_parser( - description="Train a VirTex model (CNN + Transformer) on COCO Captions." -) -group = parser.add_argument_group("Checkpointing and Logging") -group.add_argument( - "--resume-from", default=None, - help="Path to a checkpoint to resume training from (if provided)." -) -group.add_argument( - "--checkpoint-every", type=int, default=2000, - help="Serialize model to a checkpoint after every these many iterations.", -) -group.add_argument( - "--log-every", type=int, default=20, - help="""Log training curves to tensorboard after every these many iterations - only master process logs averaged loss values across processes.""", -) -# fmt: on - - -def main(_A: argparse.Namespace): - - if _A.num_gpus_per_machine == 0: - # Set device as CPU if num_gpus_per_machine = 0. - device: Any = torch.device("cpu") - else: - # Get the current device as set for current distributed process. - # Check `launch` function in `virtex.utils.distributed` module. - device = torch.cuda.current_device() - - # Create a config object (this will be immutable) and perform common setup - # such as logging and setting up serialization directory. - _C = Config(_A.config, _A.config_override) - common_setup(_C, _A) - - # ------------------------------------------------------------------------- - # INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER - # ------------------------------------------------------------------------- - train_dataset = PretrainingDatasetFactory.from_config(_C, split="train") - val_dataset = PretrainingDatasetFactory.from_config(_C, split="val") - - # Make `DistributedSampler`s to shard datasets across GPU processes. - # Skip this if training on CPUs. 
- train_sampler = ( - DistributedSampler(train_dataset, shuffle=True) # type: ignore - if _A.num_gpus_per_machine > 0 - else None - ) - val_sampler = ( - DistributedSampler(val_dataset, shuffle=False) # type: ignore - if _A.num_gpus_per_machine > 0 - else None - ) - train_dataloader = DataLoader( - train_dataset, - batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(), - sampler=train_sampler, - shuffle=train_sampler is None, - num_workers=_A.cpu_workers, - pin_memory=True, - drop_last=True, - collate_fn=train_dataset.collate_fn, - ) - val_dataloader = DataLoader( - val_dataset, - batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(), - sampler=val_sampler, - shuffle=False, - num_workers=_A.cpu_workers, - pin_memory=True, - drop_last=False, - collate_fn=val_dataset.collate_fn, - ) - - model = PretrainingModelFactory.from_config(_C).to(device) - optimizer = OptimizerFactory.from_config(_C, model.named_parameters()) - scheduler = LRSchedulerFactory.from_config(_C, optimizer) - - # ------------------------------------------------------------------------- - # BEFORE TRAINING STARTS - # ------------------------------------------------------------------------- - - # Create a gradient scaler for automatic mixed precision. - scaler = amp.GradScaler(enabled=_C.AMP) - - # Load checkpoint to resume training if specified. - if _A.resume_from is not None: - start_iteration = CheckpointManager( - model=model, optimizer=optimizer, scheduler=scheduler, scaler=scaler, - ).load(_A.resume_from) - else: - start_iteration = 0 - - # Create an iterator from dataloader to sample batches perpetually. - train_dataloader_iter = cycle(train_dataloader, device, start_iteration) - - # Wrap model in DDP if using more than one processes. - if dist.get_world_size() > 1: - dist.synchronize() - model = nn.parallel.DistributedDataParallel( - model, device_ids=[device], find_unused_parameters=True - ) - - # Keep track of time per iteration and ETA. - timer = Timer( - start_from=start_iteration + 1, total_iterations=_C.OPTIM.NUM_ITERATIONS - ) - # Create tensorboard writer and checkpoint manager (only in master process). - if dist.is_master_process(): - tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir) - tensorboard_writer.add_text("config", f"```\n{_C}\n```") - - checkpoint_manager = CheckpointManager( - _A.serialization_dir, - model=model, - optimizer=optimizer, - scheduler=scheduler, - scaler=scaler, - ) - - # ------------------------------------------------------------------------- - # TRAINING LOOP - # ------------------------------------------------------------------------- - for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1): - timer.tic() - optimizer.zero_grad() - batch = next(train_dataloader_iter) - - with amp.autocast(enabled=_C.AMP): - output_dict = model(batch) - loss = output_dict["loss"] - - scaler.scale(loss).backward() - - # First clip norm of gradients, and then perform optimizer step. 
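# [Editor's sketch, not part of the original patch] `cycle` comes from
# virtex.utils.common and its definition is not part of this diff; a minimal
# equivalent matching how it is called above (iterate over the dataloader
# forever and move each tensor in the batch to `device`) could look like this.
# The real helper also accepts a start_iteration argument, which is omitted here.
import torch

def cycle_sketch(dataloader, device):
    while True:
        for batch in dataloader:
            yield {
                key: value.to(device) if isinstance(value, torch.Tensor) else value
                for key, value in batch.items()
            }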
- scaler.unscale_(optimizer) - torch.nn.utils.clip_grad_norm_(model.parameters(), _C.OPTIM.CLIP_GRAD_NORM) - scaler.step(optimizer) - - scaler.update() - scheduler.step() - timer.toc() - - # --------------------------------------------------------------------- - # LOGGING - # --------------------------------------------------------------------- - if iteration % _A.log_every == 0: - logger.info( - f"{timer.stats} [Loss {loss:.3f}] [GPU {dist.gpu_mem_usage()} MB]" - ) - if dist.is_master_process(): - tensorboard_writer.add_scalars( - "learning_rate", - { - "visual": optimizer.param_groups[0]["lr"], - "common": optimizer.param_groups[-1]["lr"], - }, - iteration, - ) - tensorboard_writer.add_scalars( - "train", output_dict["loss_components"], iteration - ) - - # --------------------------------------------------------------------- - # VALIDATION - # --------------------------------------------------------------------- - if iteration % _A.checkpoint_every == 0: - if dist.is_master_process(): - checkpoint_manager.step(iteration) - - # All processes will wait till master process is done serializing. - dist.synchronize() - - torch.set_grad_enabled(False) - model.eval() - - # Accumulate different val loss components according to the type of - # pretraining model. - val_loss_counter: Counter = Counter() - - for val_iteration, val_batch in enumerate(val_dataloader, start=1): - for key in val_batch: - val_batch[key] = val_batch[key].to(device) - output_dict = model(val_batch) - - val_loss_counter.update(output_dict["loss_components"]) - - # Divide each loss component by number of val batches per GPU. - val_loss_dict = { - k: v / val_iteration for k, v in dict(val_loss_counter).items() - } - dist.average_across_processes(val_loss_dict) - torch.set_grad_enabled(True) - model.train() - - logger.info(f"Iteration: {iteration} [Val loss: {val_loss_dict}]") - if dist.is_master_process(): - tensorboard_writer.add_scalars("val", val_loss_dict, iteration) - - -if __name__ == "__main__": - _A = parser.parse_args() - - if _A.num_gpus_per_machine == 0: - main(_A) - else: - # This will launch `main` and set appropriate CUDA device (GPU ID) as - # per process (accessed in the beginning of `main`). - dist.launch( - main, - num_machines=_A.num_machines, - num_gpus_per_machine=_A.num_gpus_per_machine, - machine_rank=_A.machine_rank, - dist_url=_A.dist_url, - args=(_A, ), - ) diff --git a/virtex/scripts/redcaps_caption_decode.py b/virtex/scripts/redcaps_caption_decode.py deleted file mode 100644 index d63b69ac6a13dc235a3ba4980dba582a9cd75be6..0000000000000000000000000000000000000000 --- a/virtex/scripts/redcaps_caption_decode.py +++ /dev/null @@ -1,140 +0,0 @@ -import argparse -import json -import os -from typing import Any, Dict, List - -from loguru import logger -import torch -from torch.utils.data import DataLoader -from tqdm import tqdm - -import wordsegment as ws - -from virtex.config import Config -from virtex.data import ImageDirectoryDataset -from virtex.factories import TokenizerFactory, PretrainingModelFactory -from virtex.utils.checkpointing import CheckpointManager -from virtex.utils.common import common_parser - -ws.load() - -# fmt: off -parser = common_parser( - description="Decode captions using a RedCaps-pretrained VirTex model." -) -parser.add_argument( - "--images", required=True, - help="Path to a directory containing image files to generate captions for." -) -parser.add_argument( - "--checkpoint-path", required=True, - help="Path to load checkpoint and run captioning evaluation." 
-) -parser.add_argument( - "--output", required=True, - help="Path to save predictions as a JSON file." -) -parser.add_argument( - "--subreddit-prompt", default=None, - help="Optional subreddit prompt for controllable subreddit-style captioning." -) -# fmt: on - - -def main(_A: argparse.Namespace): - - if _A.num_gpus_per_machine == 0: - # Set device as CPU if num_gpus_per_machine = 0. - device = torch.device("cpu") - else: - # Get the current device (this will be zero here by default). - device = torch.cuda.current_device() - - _C = Config(_A.config, _A.config_override) - - tokenizer = TokenizerFactory.from_config(_C) - - val_dataloader = DataLoader( - ImageDirectoryDataset(_A.images), - batch_size=_C.OPTIM.BATCH_SIZE, - num_workers=_A.cpu_workers, - pin_memory=True, - ) - # Initialize model from a checkpoint. - model = PretrainingModelFactory.from_config(_C).to(device) - CheckpointManager(model=model).load(_A.checkpoint_path) - model.eval() - - # Prepare subreddit prompt for the model if provided. - if _A.subreddit_prompt is not None: - - # Remove "r/" if provided. - _A.subreddit_prompt = _A.subreddit_prompt.replace("r/", "") - - # Word segmenting (e.g. "itookapicture" -> "i took a picture"). - _segments = " ".join(ws.segment(ws.clean(_A.subreddit_prompt))) - subreddit_tokens = ( - [model.sos_index] - + tokenizer.encode(_segments) - + [tokenizer.token_to_id("[SEP]")] - ) - else: - # Just seed the model with [SOS] - subreddit_tokens = [model.sos_index] - - # Shift the subreddit prompt to appropriate device. - subreddit_tokens = torch.tensor(subreddit_tokens, device=device).long() - - # Make a list of predictions to evaluate. - predictions: List[Dict[str, Any]] = [] - - for val_batch in tqdm(val_dataloader): - val_batch["image"] = val_batch["image"].to(device) - - # Add the subreddit tokens as decoding prompt to batch. - val_batch["decode_prompt"] = subreddit_tokens - - with torch.no_grad(): - output_dict = model(val_batch) - - for idx, (image_id, caption) in enumerate( - zip(val_batch["image_id"], output_dict["predictions"]) - ): - caption = caption.tolist() - - # Replace [SOS] index with "::" temporarily so it gets decoded. - if tokenizer.token_to_id("[SEP]") in caption: - sos_index = caption.index(tokenizer.token_to_id("[SEP]")) - caption[sos_index] = tokenizer.token_to_id("::") - - caption = tokenizer.decode(caption) - - # Separate out subreddit from the rest of caption. - if "::" in caption: - subreddit, rest_of_caption = caption.split("::") - subreddit = "".join(subreddit.split()) - rest_of_caption = rest_of_caption.strip() - else: - subreddit, rest_of_caption = "", caption - - predictions.append( - {"image_id": image_id, "subreddit": subreddit, "caption": rest_of_caption} - ) - - logger.info("Displaying first 25 caption predictions:") - for pred in predictions[:25]: - logger.info(f"{pred['image_id']} - r/{pred['subreddit']}:: {pred['caption']}") - - # Save predictions as a JSON file. - os.makedirs(os.path.dirname(_A.output), exist_ok=True) - json.dump(predictions, open(_A.output, "w")) - logger.info(f"Saved predictions to {_A.output}") - - -if __name__ == "__main__": - _A = parser.parse_args() - if _A.num_gpus_per_machine > 1: - raise ValueError("Using multiple GPUs is not supported for this script.") - - # No distributed training here, just a single process. 
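# [Editor's sketch, not part of the original patch] The string post-processing
# applied to each decoded caption above: the "[SEP]" position is rendered as
# "::", the text before it is squeezed back into a subreddit name, and the rest
# is kept as the caption. The decoded string below is made up.
decoded = "i took a picture :: my dog watching the sunset"
if "::" in decoded:
    subreddit, rest_of_caption = decoded.split("::")
    subreddit = "".join(subreddit.split())      # -> "itookapicture"
    rest_of_caption = rest_of_caption.strip()
else:
    subreddit, rest_of_caption = "", decoded
print(f"r/{subreddit}:: {rest_of_caption}")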
- main(_A) diff --git a/virtex/scripts/redcaps_train.py b/virtex/scripts/redcaps_train.py deleted file mode 100644 index b8c63010361c80ffcdca873a8166448aa2f359ef..0000000000000000000000000000000000000000 --- a/virtex/scripts/redcaps_train.py +++ /dev/null @@ -1,172 +0,0 @@ -import argparse -import os -import tempfile -from typing import Any - -from loguru import logger -import torch -from torch import nn -from torch.cuda import amp -from torch.utils.data import DataLoader -from torch.utils.tensorboard import SummaryWriter - -# fmt: off -from virtex.config import Config -from virtex.factories import ( - PretrainingDatasetFactory, PretrainingModelFactory, OptimizerFactory, - LRSchedulerFactory, -) -from virtex.utils.checkpointing import CheckpointManager -from virtex.utils.common import common_parser, common_setup, cycle -import virtex.utils.distributed as dist -from virtex.utils.timer import Timer - - -parser = common_parser( - description="Train a VirTex model (CNN + Transformer) on COCO Captions." -) -group = parser.add_argument_group("Checkpointing and Logging") -group.add_argument( - "--resume-from", default=None, - help="Path to a checkpoint to resume training from (if provided)." -) -group.add_argument( - "--checkpoint-every", type=int, default=2000, - help="Serialize model to a checkpoint after every these many iterations.", -) -group.add_argument( - "--log-every", type=int, default=50, - help="""Log training curves to tensorboard after every these many iterations - only master process logs averaged loss values across processes.""", -) -# fmt: on - - -def main(_A: argparse.Namespace): - - if _A.num_gpus_per_machine == 0: - # Set device as CPU if num_gpus_per_machine = 0. - device: Any = torch.device("cpu") - else: - # Get the current device as set for current distributed process. - # Check `launch` function in `virtex.utils.distributed` module. - device = torch.cuda.current_device() - - # Create a config object (this will be immutable) and perform common setup - # such as logging and setting up serialization directory. - _C = Config(_A.config, _A.config_override) - common_setup(_C, _A) - - # ------------------------------------------------------------------------- - # INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER - # ------------------------------------------------------------------------- - - # fmt: off - train_dataset = PretrainingDatasetFactory.from_config(_C) - train_dataloader = DataLoader( - train_dataset, batch_size=None, shuffle=False, - num_workers=_A.cpu_workers, pin_memory=True, - ) - # fmt: on - - model = PretrainingModelFactory.from_config(_C).to(device) - optimizer = OptimizerFactory.from_config(_C, model.named_parameters()) - scheduler = LRSchedulerFactory.from_config(_C, optimizer) - - # ------------------------------------------------------------------------- - # BEFORE TRAINING STARTS - # ------------------------------------------------------------------------- - - # Create a gradient scaler for automatic mixed precision. - scaler = amp.GradScaler(enabled=_C.AMP) - - # Load checkpoint to resume training if specified. - if _A.resume_from is not None: - start_iteration = CheckpointManager( - model=model, optimizer=optimizer, scheduler=scheduler, - ).load(_A.resume_from) - else: - start_iteration = 0 - - # Create an iterator from dataloader to sample batches perpetually. - train_dataloader_iter = cycle(train_dataloader, device, start_iteration) - - # Wrap model in DDP if using more than one processes. 
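# [Editor's sketch, not part of the original patch] Why `batch_size=None` above:
# the RedCaps pretraining dataset is an iterable (webdataset-style) pipeline
# that presumably yields ready-made batches, so `batch_size=None` tells the
# DataLoader to pass them through without collating. Toy illustration:
import torch
from torch.utils.data import DataLoader, IterableDataset

class PreBatchedDataset(IterableDataset):
    def __iter__(self):
        for _ in range(3):                       # three ready-made batches
            yield {
                "image": torch.randn(4, 3, 224, 224),
                "caption_tokens": torch.randint(0, 100, (4, 30)),
            }

loader = DataLoader(PreBatchedDataset(), batch_size=None, num_workers=0)
for batch in loader:
    print(batch["image"].shape)                  # torch.Size([4, 3, 224, 224])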
- if dist.get_world_size() > 1: - dist.synchronize() - model = nn.parallel.DistributedDataParallel( - model, device_ids=[device], find_unused_parameters=True - ) - - # Keep track of time per iteration and ETA. - timer = Timer( - start_from=start_iteration + 1, total_iterations=_C.OPTIM.NUM_ITERATIONS - ) - # Create tensorboard writer and checkpoint manager (only in master process). - if dist.is_master_process(): - tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir) - tensorboard_writer.add_text("config", f"```\n{_C}\n```") - - checkpoint_manager = CheckpointManager( - _A.serialization_dir, - model=model, - optimizer=optimizer, - scheduler=scheduler, - scaler=scaler, - ) - - # ------------------------------------------------------------------------- - # TRAINING LOOP - # ------------------------------------------------------------------------- - for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1): - timer.tic() - optimizer.zero_grad() - batch = next(train_dataloader_iter) - - with amp.autocast(enabled=_C.AMP): - output_dict = model(batch) - loss = output_dict["loss"] - - scaler.scale(loss).backward() - - # First clip norm of gradients, and then perform optimizer step. - scaler.unscale_(optimizer) - torch.nn.utils.clip_grad_norm_(model.parameters(), _C.OPTIM.CLIP_GRAD_NORM) - scaler.step(optimizer) - - scaler.update() - scheduler.step() - timer.toc() - - # --------------------------------------------------------------------- - # LOGGING - # --------------------------------------------------------------------- - if iteration % _A.log_every == 0: - logger.info( - f"{timer.stats} [Loss {loss:.3f}] [GPU {dist.gpu_mem_usage()} MB]" - ) - if dist.is_master_process(): - tensorboard_writer.add_scalars( - "train", output_dict["loss_components"], iteration - ) - - if iteration % _A.checkpoint_every == 0 and dist.is_master_process(): - checkpoint_manager.step(iteration) - - -if __name__ == "__main__": - _A = parser.parse_args() - - if _A.num_gpus_per_machine == 0: - main(_A) - else: - # This will launch `main` and set appropriate CUDA device (GPU ID) as - # per process (accessed in the beginning of `main`). 
- dist.launch( - main, - num_machines=_A.num_machines, - num_gpus_per_machine=_A.num_gpus_per_machine, - machine_rank=_A.machine_rank, - dist_url=_A.dist_url, - args=(_A, ), - ) diff --git a/virtex/scripts/zero_shot_classification.py b/virtex/scripts/zero_shot_classification.py deleted file mode 100644 index fc8523f6012aab90a86f77c1ba235ad740848fe2..0000000000000000000000000000000000000000 --- a/virtex/scripts/zero_shot_classification.py +++ /dev/null @@ -1,171 +0,0 @@ -import argparse -import json -import os -import random -from typing import Any, Dict, List - -from loguru import logger -import torch -from torch.utils.data import DataLoader, DistributedSampler -from torch.nn.utils.rnn import pad_sequence -from tqdm import tqdm - -import wordsegment as ws - -from virtex.config import Config -from virtex.data import ZeroShotDataset - -from virtex.data.tokenizers import SentencePieceBPETokenizer - -from virtex.factories import TokenizerFactory, VisualBackboneFactory,TextualHeadFactory -from virtex.utils.checkpointing import CheckpointManager -from virtex.utils.common import common_parser -from virtex.utils.metrics import TopkAccuracy -import virtex.utils.distributed as dist - - -#importing classifier -from virtex.models.zero_shot_classification_eval import ZeroShotClassifier - -ws.load() - -# fmt: off -parser = common_parser( - description="""Run image captioning inference on a pretrained model, and/or - evaluate pretrained model on COCO Captions val2017 split.""" -) -parser.add_argument( - "--data-root", default=None, - help="""Path to a directory containing image files to generate captions for imagenet. - Default: COCO val2017 image directory as expected relative to project root.""" -) -parser.add_argument( - "--checkpoint-path", required=False, - help="Path to load checkpoint and run captioning evaluation." -) -parser.add_argument( - "--output", default=None, - help="Path to save predictions as a JSON file." -) -parser.add_argument( - "--calc-metrics", action="store_true", - help="""Calculate CIDEr and SPICE metrics using ground truth COCO Captions. - This flag should not be set when running inference on arbitrary images.""" -) - -parser.add_argument( - "--idx_label_dict", default=None, required=False, - help="""a dictionary that maps from lable index to label string for classification""" -) -parser.add_argument( - "--is_redcaps", default=None, required=False, - help="""a dictionary that maps from lable index to label string for""" -) -parser.add_argument( - "--prompt_cls_sos", default=None, required=False, - help="""a dictionary that maps from lable index to label string for""" -) -parser.add_argument( - "--prompt_sos_eos", default=None, required=False, - help="""a dictionary that maps from lable index to label string for""" -) -# fmt: on - -print("###########") -print(os.getcwd() ) -print("###########") - -tokenizer = SentencePieceBPETokenizer("datasets_1/vocab/common_32k.model") - -def main(_A: argparse.Namespace): - if _A.num_gpus_per_machine == 0: - # Set device as CPU if num_gpus_per_machine = 0. - device = torch.device("cpu") - else: - # Get the current device (this will be zero here by default). 
- device = torch.cuda.current_device() - - _C = Config(_A.config, _A.config_override) - - #tokenizer = TokenizerFactory.from_config(_C) - - if _A.data_root is None: - _A.data_root = os.path.join(_C.DATA.ROOT, "val2017") - - if _A.is_redcaps == 1: - model_dataset = 'redcaps' - else: - model_dataset = 'gcc or sbu' - - print(_A.idx_label_dict) - - val_dataset = ZeroShotDataset(data_root=_A.data_root, - split="test/", - label_map=_A.idx_label_dict, - tokenizer=tokenizer, - prompt_cls_sos=_A.prompt_cls_sos.replace("_", " "), - prompt_sos_eos=_A.prompt_sos_eos.replace("_", " ")) - - val_dataloader = DataLoader( - val_dataset, - batch_size= _C.OPTIM.BATCH_SIZE // dist.get_world_size(), - num_workers=_A.cpu_workers, - sampler=DistributedSampler( - val_dataset, - num_replicas=dist.get_world_size(), - rank=dist.get_rank(), - ), - pin_memory=True, - drop_last=False, - collate_fn=val_dataset.collate_fn, - ) - - # Initialize model from a checkpoint - visual = VisualBackboneFactory.from_config(_C) - textual = TextualHeadFactory.from_config(_C) - model = ZeroShotClassifier(visual,textual) - ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path) - model.to(device).eval() - - ## setup distributed training - if dist.get_world_size() > 1: - dist.synchronize() - model = nn.parallel.DistributedDataParallel( - model, device_ids=[device], find_unused_parameters=True - ) - - top_1 = TopkAccuracy(top_k=1) - top_5 = TopkAccuracy(top_k=5) - batch_num = 0 - - - for val_iteration, val_batch in tqdm(enumerate(val_dataloader, start=1)): - val_batch["image"] = val_batch["image"].to(device) - val_batch["caption_tokens"] = val_batch["caption_tokens"].to(device) - val_batch["noitpac_tokens"] = val_batch["noitpac_tokens"] .to(device) - val_batch["caption_lengths"] = val_batch["caption_lengths"].to(device) - val_batch["label"] = val_batch["label"].to(device) - - with torch.no_grad(): - classification_losses = model(val_batch) - - batch_num+=1 - top_1(classification_losses, val_batch["label"]) - top_1_acc = top_1.get_metric(reset=False) - dist.average_across_processes(top_1_acc) - - top_5(classification_losses, val_batch["label"]) - top_5_acc = top_5.get_metric(reset=False) - dist.average_across_processes(top_5_acc) - - logger.info(f"Iter: {val_iteration} | Top-1 accuracy: {top_1_acc} | Top-5 accuracy: {top_5_acc}") - - - -if __name__ == "__main__": - _A = parser.parse_args() - #if _A.num_gpus_per_machine > 1: - # raise ValueError("Using multiple GPUs is not supported for this script.") - - # No distributed training here, just a single process. - main(_A) diff --git a/virtex/setup.py b/virtex/setup.py deleted file mode 100644 index fc715695a0b1e6eb83a52205c9fec3224131bb21..0000000000000000000000000000000000000000 --- a/virtex/setup.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python -import glob -import os -from setuptools import setup -import shutil -from typing import List - - -def get_model_zoo_configs() -> List[str]: - """ - Return a list of configs to include in package for model zoo. Copy over - these configs inside virtex/model_zoo. - """ - - # Use absolute paths while symlinking. - source_configs_dir = os.path.join( - os.path.dirname(os.path.realpath(__file__)), "configs" - ) - destination = os.path.join( - os.path.dirname(os.path.realpath(__file__)), "virtex", "model_zoo", "configs" - ) - # Symlink the config directory inside package to have a cleaner pip install. - - # Remove stale symlink/directory from a previous build. 
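# [Editor's sketch, not part of the original patch] Top-1/top-5 accuracy in
# general, computed directly with torch.topk on a fake score matrix (rows =
# images, columns = classes, higher = better). The script above uses the virtex
# TopkAccuracy metric instead, which also accumulates counts across batches.
import torch

scores = torch.randn(8, 1000)                    # per-class scores for 8 images
labels = torch.randint(0, 1000, (8,))

topk = scores.topk(k=5, dim=1).indices           # (8, 5) highest-scoring classes
top1_acc = (topk[:, 0] == labels).float().mean()
top5_acc = (topk == labels.unsqueeze(1)).any(dim=1).float().mean()
print(top1_acc.item(), top5_acc.item())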
- if os.path.exists(source_configs_dir): - if os.path.islink(destination): - os.unlink(destination) - elif os.path.isdir(destination): - shutil.rmtree(destination) - - if not os.path.exists(destination): - try: - os.symlink(source_configs_dir, destination) - except OSError: - # Fall back to copying if symlink fails: ex. on Windows. - shutil.copytree(source_configs_dir, destination) - - config_paths = glob.glob("configs/**/*.yaml", recursive=True) - return config_paths - - -setup( - name="virtex", - version="1.1.0", - author="Karan Desai and Justin Johnson", - description="VirTex: Learning Visual Representations with Textual Annotations", - package_data={"virtex.model_zoo": get_model_zoo_configs()}, - python_requires=">=3.6", - license="MIT", - zip_safe=True, -) diff --git a/virtex/virtex/utils/assets/download_spice.sh b/virtex/utils/assets/download_spice.sh similarity index 100% rename from virtex/virtex/utils/assets/download_spice.sh rename to virtex/utils/assets/download_spice.sh diff --git a/virtex/virtex/utils/beam_search.py b/virtex/utils/beam_search.py similarity index 100% rename from virtex/virtex/utils/beam_search.py rename to virtex/utils/beam_search.py diff --git a/virtex/virtex/utils/checkpointing.py b/virtex/utils/checkpointing.py similarity index 100% rename from virtex/virtex/utils/checkpointing.py rename to virtex/utils/checkpointing.py diff --git a/virtex/virtex/utils/common.py b/virtex/utils/common.py similarity index 100% rename from virtex/virtex/utils/common.py rename to virtex/utils/common.py diff --git a/virtex/virtex/utils/distributed.py b/virtex/utils/distributed.py similarity index 100% rename from virtex/virtex/utils/distributed.py rename to virtex/utils/distributed.py diff --git a/virtex/virtex/utils/metrics.py b/virtex/utils/metrics.py similarity index 100% rename from virtex/virtex/utils/metrics.py rename to virtex/utils/metrics.py diff --git a/virtex/virtex/utils/nucleus_sampling.py b/virtex/utils/nucleus_sampling.py similarity index 100% rename from virtex/virtex/utils/nucleus_sampling.py rename to virtex/utils/nucleus_sampling.py diff --git a/virtex/virtex/utils/timer.py b/virtex/utils/timer.py similarity index 100% rename from virtex/virtex/utils/timer.py rename to virtex/utils/timer.py