Anuj-Panthri committed
Commit 922c280 · 1 Parent(s): b03b420

added dataset scripts

Makefile CHANGED
@@ -1,144 +1,2 @@
  .PHONY: clean data lint requirements sync_data_to_s3 sync_data_from_s3
 
- #################################################################################
- # GLOBALS #
- #################################################################################
-
- PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
- BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')
- PROFILE = default
- PROJECT_NAME = project_name
- PYTHON_INTERPRETER = python
-
- ifeq (,$(shell which conda))
- HAS_CONDA=False
- else
- HAS_CONDA=True
- endif
-
- #################################################################################
- # COMMANDS #
- #################################################################################
-
- ## Install Python Dependencies
- requirements: test_environment
- $(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel
- $(PYTHON_INTERPRETER) -m pip install -r requirements.txt
-
- ## Make Dataset
- data: requirements
- $(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed
-
- ## Delete all compiled Python files
- clean:
- find . -type f -name "*.py[co]" -delete
- find . -type d -name "__pycache__" -delete
-
- ## Lint using flake8
- lint:
- flake8 src
-
- ## Upload Data to S3
- sync_data_to_s3:
- ifeq (default,$(PROFILE))
- aws s3 sync data/ s3://$(BUCKET)/data/
- else
- aws s3 sync data/ s3://$(BUCKET)/data/ --profile $(PROFILE)
- endif
-
- ## Download Data from S3
- sync_data_from_s3:
- ifeq (default,$(PROFILE))
- aws s3 sync s3://$(BUCKET)/data/ data/
- else
- aws s3 sync s3://$(BUCKET)/data/ data/ --profile $(PROFILE)
- endif
-
- ## Set up python interpreter environment
- create_environment:
- ifeq (True,$(HAS_CONDA))
- @echo ">>> Detected conda, creating conda environment."
- ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER)))
- conda create --name $(PROJECT_NAME) python=3
- else
- conda create --name $(PROJECT_NAME) python=2.7
- endif
- @echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
- else
- $(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper
- @echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\
- export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n"
- @bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)"
- @echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)"
- endif
-
- ## Test python environment is setup correctly
- test_environment:
- $(PYTHON_INTERPRETER) test_environment.py
-
- #################################################################################
- # PROJECT RULES #
- #################################################################################
-
-
-
- #################################################################################
- # Self Documenting Commands #
- #################################################################################
-
- .DEFAULT_GOAL := help
-
- # Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
- # sed script explained:
- # /^##/:
- # * save line in hold space
- # * purge line
- # * Loop:
- # * append newline + line to hold space
- # * go to next line
- # * if line starts with doc comment, strip comment character off and loop
- # * remove target prerequisites
- # * append hold space (+ newline) to line
- # * replace newline plus comments by `---`
- # * print line
- # Separate expressions are necessary because labels cannot be delimited by
- # semicolon; see <http://stackoverflow.com/a/11799865/1968>
- .PHONY: help
- help:
- @echo "$$(tput bold)Available rules:$$(tput sgr0)"
- @echo
- @sed -n -e "/^## / { \
- h; \
- s/.*//; \
- :doc" \
- -e "H; \
- n; \
- s/^## //; \
- t doc" \
- -e "s/:.*//; \
- G; \
- s/\\n## /---/; \
- s/\\n/ /g; \
- p; \
- }" ${MAKEFILE_LIST} \
- | LC_ALL='C' sort --ignore-case \
- | awk -F '---' \
- -v ncol=$$(tput cols) \
- -v indent=19 \
- -v col_on="$$(tput setaf 6)" \
- -v col_off="$$(tput sgr0)" \
- '{ \
- printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
- n = split($$2, words, " "); \
- line_length = ncol - indent; \
- for (i = 1; i <= n; i++) { \
- line_length -= length(words[i]) + 1; \
- if (line_length <= 0) { \
- line_length = ncol - indent - length(words[i]) - 1; \
- printf "\n%*s ", -indent, " "; \
- } \
- printf "%s ", words[i]; \
- } \
- printf "\n"; \
- }' \
- | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
command.py ADDED
@@ -0,0 +1,40 @@
+ import argparse
+ import sys
+ import os
+
+ # parser = argparse.ArgumentParser()
+ # parser.add_argument("category")
+ # parser.add_argument("subcommand-args")
+ # args = parser.parse_args()
+ args = sys.argv
+
+ # remove "command.py"
+ args = args[1:]
+
+ # print(args)
+ subcommand = args[0].lower()
+
+ subcommand_args = " ".join(args[1:])
+ if subcommand=="data":
+     command = "py src/data/make_dataset.py "+subcommand_args
+     # print(command)
+     os.system(command)
+ else:
+     print("subcommand not supported.")
+
+ # os.system("py src/__init__.py")
+ """
+ download the dataset: data download
+ preprocess dataset: data prepare
+ visualize dataset: data show
+ delete raw & interim dataset dir: data delete --cache
+ delete all dataset dir: data delete --all
+
+
+ train model: model train
+ evaluate model: model evaluate
+ inference with model: model predict --image test.jpg --folder images/ -d results/
+
+
+
+ """
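A minimal usage sketch of the dispatcher above, assuming it is run from the repository root and that the Windows `py` launcher used in the hard-coded command string is available:

# $ python command.py data prepare --dataset forests
# command.py drops its own name, takes "data" as the subcommand and forwards the rest,
# so the invocation above effectively runs:
import os
os.system("py src/data/make_dataset.py prepare --dataset forests")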
config.yaml ADDED
@@ -0,0 +1,13 @@
+ raw_dataset_dir: data/raw/
+ interim_dataset_dir: data/interim/
+ processed_dataset_dir: data/processed/
+
+ # forests or pascal-voc
+ dataset: forests
+
+ image_size: 224
+ train_size: 0.8
+ shuffle: False
+ batch_size: 16
+
+ seed: 324
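These keys are read through the Config wrapper in src/utils.py and exposed as the module-level config object in src/__init__.py, so each key becomes an attribute. A minimal access sketch, assuming the repository root as the working directory:

from src import config

print(config.dataset)      # "forests"
print(config.image_size)   # 224
print(config.missing_key)  # None -- Config.__getattr__ falls back to dict.get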
requirements.txt CHANGED
@@ -1,14 +1,3 @@
- # local package
- -e .
-
- # external requirements
- click
- Sphinx
- coverage
- awscli
- flake8
- python-dotenv>=0.5.1
-
-
- # backwards compatibility
- pathlib2
+ huggingface_hub
+ comet_ml
+ scikit-image
src/__init__.py CHANGED
@@ -0,0 +1,9 @@
+ from src.utils import Config
+ from pathlib import Path
+
+ config = Config("config.yaml")
+ # config.raw_dataset_dir = Path(config.raw_dataset_dir)
+ # config.interim_dataset_dir = Path(config.interim_dataset_dir)
+ # config.processed_dataset_dir = Path(config.processed_dataset_dir)
+
+ # print(config)
src/data/load_dataset.py ADDED
@@ -0,0 +1,73 @@
+ import os,sys;sys.path.append(os.getcwd())
+ import tensorflow as tf
+ from src import config
+ from src.utils import *
+ from pathlib import Path
+ from glob import glob
+ import sklearn.model_selection
+ from skimage.color import rgb2lab, lab2rgb
+
+ def get_datasets():
+     trainval_dir = Path(config.processed_dataset_dir) / Path("trainval/")
+     test_dir = Path(config.processed_dataset_dir) / Path("test/")
+
+     trainval_paths = glob(str(trainval_dir/Path("*")))
+     test_paths = glob(str(test_dir/Path("*")))
+
+     print("trainval|test:",len(trainval_paths),"|",len(test_paths))
+
+
+
+     train_paths,val_paths = sklearn.model_selection.train_test_split(trainval_paths,
+                                                                      train_size=0.8,
+                                                                      random_state=324)
+
+     print("train|val split:",len(train_paths),"|",len(val_paths))
+
+     train_ds = get_ds(train_paths,bs=config.batch_size,shuffle=config.shuffle)
+     val_ds = get_ds(val_paths,bs=config.batch_size,shuffle=False,is_val=True)
+     test_ds = get_ds(test_paths,bs=config.batch_size,shuffle=False,is_val=True)
+
+     return train_ds,val_ds,test_ds
+
+
+ # def test_dataset():
+ #     train_ds = get_ds(train_paths,shuffle=False)
+ #     L_batch,AB_batch = next(iter(train_ds))
+ #     L_batch = L_batch.numpy()
+ #     AB_batch = AB_batch.numpy()
+ #     print("L:",L_batch.min(),L_batch.max())
+ #     print("A:",AB_batch[:,:,:,0].min(),AB_batch[:,:,:,0].max())
+ #     print("B:",AB_batch[:,:,:,1].min(),AB_batch[:,:,:,1].max())
+
+
+
+ def tf_RGB_TO_LAB(image):
+     def f(image):
+         image = rgb2lab(image).astype("float32")  # rgb2lab returns float64; cast to match the tf.float32 output declared below
+         return image
+     lab = tf.numpy_function(f,[image],tf.float32)
+     lab.set_shape(image.shape)
+     return lab
+
+
+ # load the image in lab space and split the l and ab channels
+ def load_img(img_path):
+     img_bytes = tf.io.read_file(img_path)
+     image = tf.image.decode_image(img_bytes,3,expand_animations=False)
+     image = tf.image.resize(image,[config.image_size,config.image_size])
+     image = image / 255.0
+     image = tf_RGB_TO_LAB(image)
+
+     L,AB = image[:,:,0:1],image[:,:,1:]
+     L,AB = scale_L(L),scale_AB(AB)
+     return L,AB
+
+ def get_ds(image_paths,bs=8,shuffle=False,is_val=False):
+     ds = tf.data.Dataset.from_tensor_slices(image_paths)
+     if shuffle: ds = ds.shuffle(len(image_paths))
+     ds = ds.map(load_img,num_parallel_calls=tf.data.AUTOTUNE)
+     ds = ds.batch(bs,num_parallel_calls=tf.data.AUTOTUNE,drop_remainder=not is_val)
+
+     return ds
+
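A minimal sketch of consuming these pipelines, assuming data/processed/ has already been populated by make_dataset.py:

from src.data.load_dataset import get_datasets

train_ds, val_ds, test_ds = get_datasets()
L_batch, AB_batch = next(iter(train_ds))   # scaled L and AB channels
print(L_batch.shape, AB_batch.shape)       # e.g. (16, 224, 224, 1) (16, 224, 224, 2)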
src/data/make_dataset.py CHANGED
@@ -1,30 +1,128 @@
- # -*- coding: utf-8 -*-
- import click
- import logging
- from pathlib import Path
- from dotenv import find_dotenv, load_dotenv
-
-
- @click.command()
- @click.argument('input_filepath', type=click.Path(exists=True))
- @click.argument('output_filepath', type=click.Path())
- def main(input_filepath, output_filepath):
-     """ Runs data processing scripts to turn raw data from (../raw) into
-         cleaned data ready to be analyzed (saved in ../processed).
-     """
-     logger = logging.getLogger(__name__)
-     logger.info('making final data set from raw data')
-
-
- if __name__ == '__main__':
-     log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-     logging.basicConfig(level=logging.INFO, format=log_fmt)
-
-     # not used in this stub but often useful for finding various files
-     project_dir = Path(__file__).resolve().parents[2]
-
-     # find .env automagically by walking up directories until it's found, then
-     # load up the .env entries as environment variables
-     load_dotenv(find_dotenv())
-
-     main()
+ from huggingface_hub import snapshot_download
+ import os,sys;sys.path.append(os.getcwd())
+ from src import config
+ from src.utils import *
+ import argparse
+ from pathlib import Path
+ from zipfile import ZipFile
+ from glob import glob
+ import cv2
+ import numpy as np
+ import matplotlib.pyplot as plt
+ from tqdm import tqdm
+ import shutil
+ from src.data.visualize_dataset import visualize_dataset
+
+ def download_dataset():
+     """Used to download dataset from hugging face
+     """
+     print_title(f"Downloading {config.dataset} dataset from hugging face")
+     snapshot_download(repo_id="Anuj-Panthri/Image-Colorization-Datasets",
+                       repo_type="dataset",
+                       local_dir=config.raw_dataset_dir,
+                       allow_patterns=f"{config.dataset}/*")
+
+
+ def unzip_dataset():
+     print_title(f"Unzipping dataset")
+     print("Extracting to :",Path(config.interim_dataset_dir)/Path("trainval/"))
+     with ZipFile(Path(config.raw_dataset_dir)/Path(f"{config.dataset}/trainval.zip"),"r") as zip:
+         zip.extractall(Path(config.interim_dataset_dir)/Path("trainval/"))
+
+     print("Extracting to :",Path(config.interim_dataset_dir)/Path("test/"))
+     with ZipFile(Path(config.raw_dataset_dir)/Path(f"{config.dataset}/test.zip"),"r") as zip:
+         zip.extractall(Path(config.interim_dataset_dir)/Path("test/"))
+
+
+ def clean_dataset():
+     print_title("CLEANING DATASET")
+     trainval_dir = Path(config.interim_dataset_dir) / Path("trainval/")
+     test_dir = Path(config.interim_dataset_dir) / Path("test/")
+
+     trainval_paths = glob(str(trainval_dir/Path("*")))
+     test_paths = glob(str(test_dir/Path("*")))
+
+     print("train,test: ",len(trainval_paths),",",len(test_paths),sep="")
+
+
+     def clean(image_paths,destination_dir):
+         if os.path.exists(destination_dir): shutil.rmtree(destination_dir)
+         os.makedirs(destination_dir)
+         for i in tqdm(range(len(image_paths))):
+             img = cv2.imread(image_paths[i])
+             img = cv2.resize(img,[128,128])
+             if not is_bw(img):
+                 shutil.copy(image_paths[i],  # was trainval_paths[i], which breaks when cleaning the test split
+                             destination_dir)
+         print("saved to:",destination_dir)
+
+     destination_dir = Path(config.processed_dataset_dir)/Path("trainval/")
+     clean(trainval_paths,destination_dir)
+
+     destination_dir = Path(config.processed_dataset_dir)/Path("test/")
+     clean(test_paths,destination_dir)
+
+     trainval_dir = Path(config.processed_dataset_dir) / Path("trainval/")
+     test_dir = Path(config.processed_dataset_dir) / Path("test/")
+
+     trainval_paths = glob(str(trainval_dir/Path("*")))
+     test_paths = glob(str(test_dir/Path("*")))
+
+     print("after cleaning train,test: ",len(trainval_paths),",",len(test_paths),sep="")
+
+
+ def prepare_dataset():
+     print_title(f"Preparing dataset")
+     download_dataset()
+     unzip_dataset()
+     clean_dataset()
+
+ def delete_cache():
+     ## clean old interim and raw datasets
+     print_title("deleting unused raw and interim dataset dirs")
+     if os.path.exists(config.raw_dataset_dir):
+         shutil.rmtree(config.raw_dataset_dir)
+     if os.path.exists(config.interim_dataset_dir):
+         shutil.rmtree(config.interim_dataset_dir)
+
+ def delete_all():
+     ## clean all datasets
+     print_title("deleting all dataset dirs")
+     if os.path.exists(config.raw_dataset_dir):
+         shutil.rmtree(config.raw_dataset_dir)
+     if os.path.exists(config.interim_dataset_dir):
+         shutil.rmtree(config.interim_dataset_dir)
+     if os.path.exists(config.processed_dataset_dir):
+         shutil.rmtree(config.processed_dataset_dir)
+
+
+ if __name__=="__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("command")
+     parser.add_argument("-d","--dataset",default="forests")
+     parser.add_argument("--cache",action="store_true",default=True)  # default True so a plain `delete` clears the cache dirs
+     parser.add_argument("--all",action="store_true")
+
+     """
+     prepare dataset: data prepare
+     visualize dataset: data show
+     delete raw & interim dataset dir: data delete --cache
+     delete all dataset dir: data delete --all
+     """
+
+     args = parser.parse_args()
+     # print(args)
+
+     if args.command=="prepare":
+         prepare_dataset()
+
+     elif args.command=="show":
+         visualize_dataset()
+
+     elif args.command=="delete":
+         if(args.all): delete_all()
+         elif(args.cache): delete_cache()
+
+     else:
+         print("unsupported")
+
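The same steps can also be driven programmatically; a minimal sketch, assuming the repository root as the working directory and a valid config.yaml:

import os, sys; sys.path.append(os.getcwd())
from src.data.make_dataset import prepare_dataset, delete_cache

prepare_dataset()   # download -> unzip -> filter black-and-white images into data/processed/
delete_cache()      # remove data/raw/ and data/interim/ once the processed set exists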
src/data/visualize_dataset.py ADDED
@@ -0,0 +1,52 @@
+ import os,sys;sys.path.append(os.getcwd())
+ from src.data.load_dataset import get_ds,get_datasets
+ from src import config
+ from src.utils import *
+ import matplotlib.pyplot as plt
+ import cv2
+ import math
+
+ def see_batch(L_batch,AB_batch,show_L=False,cols=4,row_size=5,col_size=5,title=None):
+     n = L_batch.shape[0]
+     rows = math.ceil(n/cols)
+     fig = plt.figure(figsize=(col_size*cols,row_size*rows))
+     if title:
+         plt.title(title)
+         plt.axis("off")
+
+     for i in range(n):
+         fig.add_subplot(rows,cols,i+1)
+         L,AB = L_batch[i],AB_batch[i]
+         L,AB = rescale_L(L), rescale_AB(AB)
+         # print(L.shape,AB.shape)
+         img = np.concatenate([L,AB],axis=-1)
+         img = cv2.cvtColor(img,cv2.COLOR_LAB2RGB)*255
+         # print(img.min(),img.max())
+         if show_L:
+             L = np.tile(L,(1,1,3))/100*255
+             img = np.concatenate([L,img],axis=1)
+         plt.imshow(img.astype("uint8"))
+     plt.show()
+
+
+ def visualize_dataset():
+     train_ds,val_ds,test_ds = get_datasets()
+     L_batch,AB_batch = next(iter(train_ds))
+     L_batch,AB_batch = L_batch.numpy(), AB_batch.numpy()
+     see_batch(L_batch,
+               AB_batch,
+               title="training dataset")
+
+     L_batch,AB_batch = next(iter(val_ds))
+     L_batch,AB_batch = L_batch.numpy(), AB_batch.numpy()
+     see_batch(L_batch,
+               AB_batch,
+               title="validation dataset")
+
+     L_batch,AB_batch = next(iter(test_ds))
+     L_batch,AB_batch = L_batch.numpy(), AB_batch.numpy()
+     see_batch(L_batch,
+               AB_batch,
+               title="testing dataset")
+
+
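see_batch can be exercised without a prepared dataset; a minimal sketch with a random batch in the scaled ranges the loader produces (L in [0, 1], AB in [-1, 1]):

import numpy as np
from src.data.visualize_dataset import see_batch

L_batch = np.random.rand(8, 224, 224, 1).astype("float32")            # scaled L channel
AB_batch = np.random.rand(8, 224, 224, 2).astype("float32") * 2 - 1   # scaled AB channels
see_batch(L_batch, AB_batch, show_L=True, title="random LAB batch")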
src/utils.py ADDED
@@ -0,0 +1,39 @@
+ import yaml
+ import numpy as np
+
+ class Config:
+     def __init__(self,path="config.yaml"):
+         with open(path,'r') as f:
+             self.config = yaml.safe_load(f)
+
+     def __str__(self):
+         return str(self.config)
+
+     def __getattr__(self, name: str):
+         return self.config.get(name)
+
+     # def __setattr__(self, name: str, value: any):
+     #     self.config[name]=value
+
+ def is_bw(img):
+     rg,gb,rb = img[:,:,0].astype(int)-img[:,:,1] , img[:,:,1].astype(int)-img[:,:,2] , img[:,:,0].astype(int)-img[:,:,2]  # cast avoids uint8 wrap-around in the differences
+     rg,gb,rb = np.abs(rg).sum(),np.abs(gb).sum(),np.abs(rb).sum()
+     avg = np.mean([rg,gb,rb])
+     # print(rg,gb,rb)
+
+     return avg<10
+
+ def print_title(msg:str,n=30):
+     print("="*n,msg.upper(),"="*n,sep="")
+
+ def scale_L(L):
+     return L/100
+ def rescale_L(L):
+     return L*100
+
+ def scale_AB(AB):
+     return AB/128
+
+ def rescale_AB(AB):
+     return AB*128
+
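A few quick checks of the helpers above, following directly from their definitions:

import numpy as np
from src.utils import is_bw, scale_L, rescale_L, scale_AB

gray = np.full((128, 128, 3), 90, dtype=np.uint8)   # identical channels -> treated as black-and-white
print(is_bw(gray))                                  # True

print(rescale_L(scale_L(np.array([50.0]))))         # [50.] -- L scaling round-trips
print(scale_AB(np.array([64.0])))                   # [0.5]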