Update training scripts
- .gitattributes +1 -0
- data/train_dataset.json +3 -0
- data/valid_dataset.json +3 -0
- requirements.txt +8 -0
- run_medclip.sh +15 -0
- src/hybrid_clip/utils/create_roco_dataset.py +0 -0
- src/hybrid_clip/utils/roco_dataset.ipynb +123 -0
.gitattributes
CHANGED
@@ -14,3 +14,4 @@
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
data/train_dataset.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6f8f9ecea3f4c6f8196f194510159fccde43ee7f2192b259a11d6bc9ad684cb
+size 13426560
data/valid_dataset.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7dbb940f0dee7cb4a85959dc6018aafc824a988b46e3ae8ca2fea6500251ee0a
+size 4132661
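
Both dataset files are stored as Git LFS pointer stubs (matching the *.json rule added to .gitattributes above); the actual payload, produced by the notebook at the end of this commit, is one JSON object per line with image_path and captions fields. A minimal loading sketch, assuming that line format and that the LFS objects have been pulled:

import json

# Sketch only: read a JSON-lines caption file of the shape written by the
# notebook below, i.e. one {"image_path": ..., "captions": ...} object per line.
def load_examples(path):
    with open(path) as f:
        return [json.loads(line) for line in f if line.strip()]

examples = load_examples("data/train_dataset.json")
print(len(examples), examples[0]["image_path"])
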
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+jax>=0.2.8
+jaxlib>=0.1.59
+flax>=0.3.4
+optax>=0.0.8
+-f https://download.pytorch.org/whl/torch_stable.html
+torch==1.9.0+cpu
+-f https://download.pytorch.org/whl/torch_stable.html
+torchvision==0.10.0+cpu
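
After pip install -r requirements.txt, a quick way to confirm the pinned JAX/Flax stack and the CPU-only PyTorch wheels resolved correctly (a convenience sketch, not part of the commit):

# Sketch only: print the installed versions and the devices JAX can see.
import flax
import jax
import optax
import torch
import torchvision

print("jax", jax.__version__, "flax", flax.__version__, "optax", optax.__version__)
print("torch", torch.__version__, "torchvision", torchvision.__version__)
print("jax devices:", jax.devices())
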
run_medclip.sh
ADDED
@@ -0,0 +1,15 @@
+python src/hybrid_clip/run_hybrid_clip.py \
+--output_dir snapshots \
+--text_model_name_or_path="roberta-base" \
+--vision_model_name_or_path="openai/clip-vit-base-patch32" \
+--tokenizer_name="roberta-base" \
+--train_file="data/train_dataset.json" \
+--validation_file="data/valid_dataset.json" \
+--do_train --do_eval \
+--num_train_epochs="40" --max_seq_length 96 \
+--per_device_train_batch_size="64" \
+--per_device_eval_batch_size="64" \
+--learning_rate="5e-5" --warmup_steps="0" --weight_decay 0.1 \
+--overwrite_output_dir \
+--preprocessing_num_workers 32 \
+# --push_to_hub
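
The launch script pairs a roberta-base text encoder with the openai/clip-vit-base-patch32 vision tower and caps captions at 96 tokens. run_hybrid_clip.py handles preprocessing itself; the snippet below is only an illustrative sketch of what those flag values imply, built from standard transformers classes rather than the training script's actual code path:

from PIL import Image
from transformers import AutoTokenizer, CLIPFeatureExtractor

# Sketch only: tokenize one caption and preprocess one image consistently with
# the flags above (max_seq_length 96, CLIP ViT-B/32 image preprocessing).
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
feature_extractor = CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32")

caption = "Chest X-ray showing clear lung fields."  # hypothetical example caption
image = Image.new("RGB", (224, 224))                # stand-in for a ROCO image

text_inputs = tokenizer(caption, max_length=96, padding="max_length",
                        truncation=True, return_tensors="np")
image_inputs = feature_extractor(images=image, return_tensors="np")
print(text_inputs["input_ids"].shape, image_inputs["pixel_values"].shape)
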
src/hybrid_clip/utils/create_roco_dataset.py
ADDED
File without changes
src/hybrid_clip/utils/roco_dataset.ipynb
ADDED
@@ -0,0 +1,123 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "source": [
+    "import json\n",
+    "import os\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "source": [
+    "train_path = '/home/kaumad/roco-dataset/train'"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "source": [
+    "img_dir = os.path.join(train_path, 'radiology', 'images')"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "source": [
+    "train_csv = pd.read_csv(os.path.join(train_path, 'radiology', 'traindata.csv'))"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "source": [
+    "lines = []\n",
+    "for id, row in train_csv.iterrows():\n",
+    "    img_path = os.path.join(img_dir, 'radiology', row['name'])\n",
+    "    line = json.dumps({\"image_path\": img_path, \"captions\": row['caption']})\n",
+    "    lines.append(line)\n",
+    "    # if id>100:\n",
+    "    #     break\n"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "source": [
+    "len(lines)"
+   ],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "65450"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 7
+    }
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "source": [
+    "train_lines = lines[:45000]\n",
+    "val_lines = lines[-45000:]\n",
+    "\n",
+    "json_dir = '../../../data'\n",
+    "with open(os.path.join(json_dir, \"train_dataset.json\"), \"w\") as f:\n",
+    "    f.write(\"\\n\".join(train_lines))\n",
+    "\n",
+    "with open(os.path.join(json_dir, \"valid_dataset.json\"), \"w\") as f:\n",
+    "    f.write(\"\\n\".join(val_lines))"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "source": [
+    "os.listdir('../../../data')"
+   ],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "['train_dataset.json', 'valid_dataset.json']"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 14
+    }
+   ],
+   "metadata": {}
+  }
+ ],
+ "metadata": {
+  "orig_nbformat": 4,
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
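
create_roco_dataset.py is added empty in this commit; a script-style sketch of the notebook logic above could look like the following (the ROCO paths, the CSV column names 'name' and 'caption', the output directory, and the non-overlapping 45000-line split are assumptions carried over from, or adjusted relative to, the notebook cells):

import json
import os

import pandas as pd

# Paths mirror the notebook; adjust to the local ROCO checkout.
ROCO_TRAIN = "/home/kaumad/roco-dataset/train"
IMG_DIR = os.path.join(ROCO_TRAIN, "radiology", "images")
JSON_DIR = "data"  # repo-relative output directory (assumed)


def build_lines(csv_path):
    """Turn each (image name, caption) CSV row into one JSON line."""
    df = pd.read_csv(csv_path)
    lines = []
    for _, row in df.iterrows():
        # Path construction kept as in the notebook.
        img_path = os.path.join(IMG_DIR, "radiology", row["name"])
        lines.append(json.dumps({"image_path": img_path, "captions": row["caption"]}))
    return lines


if __name__ == "__main__":
    lines = build_lines(os.path.join(ROCO_TRAIN, "radiology", "traindata.csv"))
    # Non-overlapping split; the exact sizes are an assumption, not the notebook's slicing.
    train_lines, val_lines = lines[:45000], lines[45000:]
    with open(os.path.join(JSON_DIR, "train_dataset.json"), "w") as f:
        f.write("\n".join(train_lines))
    with open(os.path.join(JSON_DIR, "valid_dataset.json"), "w") as f:
        f.write("\n".join(val_lines))
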