saritha5 committed
Commit
fc22863
1 Parent(s): 757e41f

Upload 13 files

lib/.ipynb_checkpoints/mlutil-checkpoint.ipynb ADDED
@@ -0,0 +1,1297 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "2d05ce02",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import sys\n",
12
+ "import numpy as np\n",
13
+ "from sklearn import preprocessing\n",
14
+ "from sklearn import metrics\n",
15
+ "from sklearn.datasets import make_blobs\n",
16
+ "from sklearn.datasets import make_classification\n",
17
+ "import random\n",
18
+ "from math import *\n",
19
+ "from decimal import Decimal\n",
20
+ "import statistics\n",
21
+ "import jprops\n",
22
+ "from Levenshtein import distance as ld\n",
23
+ "from util import *\n",
24
+ "from sampler import *\n",
25
+ "\n",
26
+ "class Configuration:\n",
27
+ " \"\"\"\n",
28
+ " Configuration management. Supports default value, mandatory value and typed value.\n",
29
+ " \"\"\"\n",
30
+ " def __init__(self, configFile, defValues, verbose=False):\n",
31
+ " \"\"\"\n",
32
+ " initializer\n",
33
+ "\n",
34
+ " Parameters\n",
35
+ " configFile : config file path\n",
36
+ " defValues : dictionary of default values\n",
37
+ " verbose : verbosity flag\n",
38
+ " \"\"\"\n",
39
+ " configs = {}\n",
40
+ " with open(configFile) as fp:\n",
41
+ " for key, value in jprops.iter_properties(fp):\n",
42
+ " configs[key] = value\n",
43
+ " self.configs = configs\n",
44
+ " self.defValues = defValues\n",
45
+ " self.verbose = verbose\n",
46
+ "\n",
47
+ " def override(self, configFile):\n",
48
+ " \"\"\"\n",
49
+ " override configuration from file\n",
50
+ "\n",
51
+ " Parameters\n",
52
+ " configFile : override config file path\n",
53
+ " \"\"\"\n",
54
+ " with open(configFile) as fp:\n",
55
+ " for key, value in jprops.iter_properties(fp):\n",
56
+ " self.configs[key] = value\n",
57
+ "\n",
58
+ "\n",
59
+ " def setParam(self, name, value):\n",
60
+ " \"\"\"\n",
61
+ " override individual configuration\n",
62
+ " Parameters\n",
63
+ " name : config param name\n",
64
+ " value : config param value\n",
65
+ " \"\"\"\n",
66
+ " self.configs[name] = value\n",
67
+ "\n",
68
+ "\n",
69
+ " def getStringConfig(self, name):\n",
70
+ " \"\"\"\n",
71
+ " get string param\n",
72
+ " Parameters\n",
73
+ " name : config param name\n",
74
+ " \"\"\"\n",
75
+ " if self.isNone(name):\n",
76
+ " val = (None, False)\n",
77
+ " elif self.isDefault(name):\n",
78
+ " val = (self.handleDefault(name), True)\n",
79
+ " else:\n",
80
+ " val = (self.configs[name], False)\n",
81
+ " if self.verbose:\n",
82
+ " print( \"{} {} {}\".format(name, self.configs[name], val[0]))\n",
83
+ " return val\n",
84
+ "\n",
85
+ "\n",
86
+ " def getIntConfig(self, name):\n",
87
+ " \"\"\"\n",
88
+ " get int param\n",
89
+ " Parameters\n",
90
+ " name : config param name\n",
91
+ " \"\"\"\n",
92
+ " #print \"%s %s\" %(name,self.configs[name])\n",
93
+ " if self.isNone(name):\n",
94
+ " val = (None, False)\n",
95
+ " elif self.isDefault(name):\n",
96
+ " val = (self.handleDefault(name), True)\n",
97
+ " else:\n",
98
+ " val = (int(self.configs[name]), False)\n",
99
+ " if self.verbose:\n",
100
+ " print( \"{} {} {}\".format(name, self.configs[name], val[0]))\n",
101
+ " return val\n",
102
+ "\n",
103
+ "\n",
104
+ " def getFloatConfig(self, name):\n",
105
+ " \"\"\"\n",
106
+ " get float param\n",
107
+ " Parameters\n",
108
+ " name : config param name\n",
109
+ " \"\"\"\n",
110
+ " #print \"%s %s\" %(name,self.configs[name])\n",
111
+ " if self.isNone(name):\n",
112
+ " val = (None, False)\n",
113
+ " elif self.isDefault(name):\n",
114
+ " val = (self.handleDefault(name), True)\n",
115
+ " else:\n",
116
+ " val = (float(self.configs[name]), False)\n",
117
+ " if self.verbose:\n",
118
+ " print( \"{} {} {:06.3f}\".format(name, self.configs[name], val[0]))\n",
119
+ " return val\n",
120
+ "\n",
121
+ "\n",
122
+ " def getBooleanConfig(self, name):\n",
123
+ " \"\"\"\n",
124
+ " get boolean param\n",
125
+ " Parameters\n",
126
+ " name : config param name\n",
127
+ " \"\"\"\n",
128
+ " if self.isNone(name):\n",
129
+ " val = (None, False)\n",
130
+ " elif self.isDefault(name):\n",
131
+ " val = (self.handleDefault(name), True)\n",
132
+ " else:\n",
133
+ " bVal = self.configs[name].lower() == \"true\"\n",
134
+ " val = (bVal, False)\n",
135
+ " if self.verbose:\n",
136
+ " print( \"{} {} {}\".format(name, self.configs[name], val[0]))\n",
137
+ " return val\n",
138
+ "\n",
139
+ "\n",
140
+ " def getIntListConfig(self, name, delim=\",\"):\n",
141
+ " \"\"\"\n",
142
+ " get int list param\n",
143
+ " Parameters\n",
144
+ " name : config param name\n",
145
+ " delim : delimiter\n",
146
+ " \"\"\"\n",
147
+ " if self.isNone(name):\n",
148
+ " val = (None, False)\n",
149
+ " elif self.isDefault(name):\n",
150
+ " val = (self.handleDefault(name), True)\n",
151
+ " else:\n",
152
+ " delSepStr = self.getStringConfig(name)\n",
153
+ "\n",
154
+ " #specified as range\n",
155
+ " intList = strListOrRangeToIntArray(delSepStr[0])\n",
156
+ " val =(intList, delSepStr[1])\n",
157
+ " return val\n",
158
+ "\n",
159
+ " def getFloatListConfig(self, name, delim=\",\"):\n",
160
+ " \"\"\"\n",
161
+ " get float list param\n",
162
+ " Parameters\n",
163
+ " name : config param name\n",
164
+ " delim : delimiter\n",
165
+ " \"\"\"\n",
166
+ " delSepStr = self.getStringConfig(name)\n",
167
+ " if self.isNone(name):\n",
168
+ " val = (None, False)\n",
169
+ " elif self.isDefault(name):\n",
170
+ " val = (self.handleDefault(name), True)\n",
171
+ " else:\n",
172
+ " flList = strToFloatArray(delSepStr[0], delim)\n",
173
+ " val =(flList, delSepStr[1])\n",
174
+ " return val\n",
175
+ "\n",
176
+ "\n",
177
+ " def getStringListConfig(self, name, delim=\",\"):\n",
178
+ " \"\"\"\n",
179
+ " get string list param\n",
180
+ " Parameters\n",
181
+ " name : config param name\n",
182
+ " delim : delimiter\n",
183
+ " \"\"\"\n",
184
+ " delSepStr = self.getStringConfig(name)\n",
185
+ " if self.isNone(name):\n",
186
+ " val = (None, False)\n",
187
+ " elif self.isDefault(name):\n",
188
+ " val = (self.handleDefault(name), True)\n",
189
+ " else:\n",
190
+ " strList = delSepStr[0].split(delim)\n",
191
+ " val = (strList, delSepStr[1])\n",
192
+ " return val\n",
193
+ "\n",
194
+ " def handleDefault(self, name):\n",
195
+ " \"\"\"\n",
196
+ " handles default\n",
197
+ " Parameters\n",
198
+ " name : config param name\n",
199
+ " \"\"\"\n",
200
+ " dVal = self.defValues[name]\n",
201
+ " if (dVal[1] is None):\n",
202
+ " val = dVal[0]\n",
203
+ " else:\n",
204
+ " raise ValueError(dVal[1])\n",
205
+ " return val\n",
206
+ "\n",
207
+ "\n",
208
+ " def isNone(self, name):\n",
209
+ " \"\"\"\n",
210
+ " true if value is None\t\n",
211
+ " Parameters\n",
212
+ " name : config param name\n",
213
+ " \"\"\"\n",
214
+ " return self.configs[name].lower() == \"none\"\n",
215
+ "\n",
216
+ "\n",
217
+ " def isDefault(self, name):\n",
218
+ " \"\"\"\n",
219
+ " true if the value is default\t\n",
220
+ " Parameters\n",
221
+ " name : config param name\n",
222
+ " \"\"\"\n",
223
+ " de = self.configs[name] == \"_\"\n",
224
+ " #print de\n",
225
+ " return de\n",
226
+ "\n",
227
+ "\n",
228
+ " def eitherOrStringConfig(self, firstName, secondName):\n",
229
+ " \"\"\"\n",
230
+ " returns one of two string parameters\t\n",
231
+ " Parameters\n",
232
+ " firstName : first parameter name\n",
233
+ " secondName : second parameter name\t\n",
234
+ " \"\"\"\n",
235
+ " if not self.isNone(firstName):\n",
236
+ " first = self.getStringConfig(firstName)[0]\n",
237
+ " second = None\n",
238
+ " if not self.isNone(secondName):\n",
239
+ " raise ValueError(\"only one of the two parameters should be set and not both \" + firstName + \" \" + secondName)\n",
240
+ " else:\n",
241
+ " if not self.isNone(secondName):\n",
242
+ " second = self.getStringConfig(secondName)[0]\n",
243
+ " first = None\n",
244
+ " else:\n",
245
+ " raise ValueError(\"at least one of the two parameters should be set \" + firstName + \" \" + secondName)\n",
246
+ " return (first, second)\n",
247
+ "\n",
248
+ "\n",
249
+ " def eitherOrIntConfig(self, firstName, secondName):\n",
250
+ " \"\"\"\n",
251
+ " returns one of two int parameters\t\n",
252
+ " Parameters\n",
253
+ " firstName : first parameter name\n",
254
+ " secondName : second parameter name\t\n",
255
+ " \"\"\"\n",
256
+ " if not self.isNone(firstName):\n",
257
+ " first = self.getIntConfig(firstName)[0]\n",
258
+ " second = None\n",
259
+ " if not self.isNone(secondName):\n",
260
+ " raise ValueError(\"only one of the two parameters should be set and not both \" + firstName + \" \" + secondName)\n",
261
+ " else:\n",
262
+ " if not self.isNone(secondName):\n",
263
+ " second = self.getIntConfig(secondName)[0]\n",
264
+ " first = None\n",
265
+ " else:\n",
266
+ " raise ValueError(\"at least one of the two parameters should be set \" + firstName + \" \" + secondName)\n",
267
+ " return (first, second)\n",
268
+ "\n",
269
+ "\n",
270
+ "class CatLabelGenerator:\n",
271
+ " \"\"\"\n",
272
+ " label generator for categorical variables\n",
273
+ " \"\"\"\n",
274
+ " def __init__(self, catValues, delim):\n",
275
+ " \"\"\"\n",
276
+ " initializer\n",
277
+ "\n",
278
+ " Parameters\n",
279
+ " catValues : dictionary of categorical values\n",
280
+ " delim : delimiter\n",
281
+ " \"\"\"\n",
282
+ " self.encoders = {}\n",
283
+ " self.catValues = catValues\n",
284
+ " self.delim = delim\n",
285
+ " for k in self.catValues.keys():\t\n",
286
+ " le = preprocessing.LabelEncoder()\t\n",
287
+ " le.fit(self.catValues[k])\n",
288
+ " self.encoders[k] = le\n",
289
+ "\n",
290
+ " def processRow(self, row):\t\n",
291
+ " \"\"\"\n",
292
+ " encode row categorical values\n",
293
+ "\n",
294
+ " Parameters:\n",
295
+ " row : data row\n",
296
+ " \"\"\"\n",
297
+ " #print row\n",
298
+ " rowArr = row.split(self.delim)\n",
299
+ " for i in range(len(rowArr)):\n",
300
+ " if (i in self.catValues):\n",
301
+ " curVal = rowArr[i]\n",
302
+ " assert curVal in self.catValues[i], \"categorical value invalid\"\n",
303
+ " encVal = self.encoders[i].transform([curVal])\n",
304
+ " rowArr[i] = str(encVal[0])\n",
305
+ " return self.delim.join(rowArr)\t\t\n",
306
+ "\n",
307
+ " def getOrigLabels(self, indx):\n",
308
+ " \"\"\"\n",
309
+ " get original labels\n",
310
+ "\n",
311
+ " Parameters:\n",
312
+ " indx : column index\n",
313
+ " \"\"\"\n",
314
+ " return self.encoders[indx].classes_\t\n",
315
+ "\n",
316
+ "\n",
317
+ "class SupvLearningDataGenerator:\n",
318
+ " \"\"\"\n",
319
+ " data generator for supervised learning\n",
320
+ " \"\"\"\n",
321
+ " def __init__(self, configFile):\n",
322
+ " \"\"\"\n",
323
+ " initializer\n",
324
+ "\n",
325
+ " Parameters\n",
326
+ " configFile : config file path\n",
327
+ " \"\"\"\n",
328
+ " defValues = dict()\n",
329
+ " defValues[\"common.num.samp\"] = (100, None)\n",
330
+ " defValues[\"common.num.feat\"] = (5, None)\n",
331
+ " defValues[\"common.feat.trans\"] = (None, None)\n",
332
+ " defValues[\"common.feat.types\"] = (None, \"missing feature types\")\n",
333
+ " defValues[\"common.cat.feat.distr\"] = (None, None)\n",
334
+ " defValues[\"common.output.precision\"] = (3, None)\n",
335
+ " defValues[\"common.error\"] = (0.01, None)\n",
336
+ " defValues[\"class.gen.technique\"] = (\"blob\", None)\n",
337
+ " defValues[\"class.num.feat.informative\"] = (2, None)\n",
338
+ " defValues[\"class.num.feat.redundant\"] = (2, None)\n",
339
+ " defValues[\"class.num.feat.repeated\"] = (0, None)\n",
340
+ " defValues[\"class.num.feat.cat\"] = (0, None)\n",
341
+ " defValues[\"class.num.class\"] = (2, None)\n",
342
+ "\n",
343
+ " self.config = Configuration(configFile, defValues)\n",
344
+ "\n",
345
+ " def genClassifierData(self):\n",
346
+ " \"\"\"\n",
347
+ " generates classifier data\n",
348
+ " \"\"\"\n",
349
+ " nsamp = self.config.getIntConfig(\"common.num.samp\")[0]\n",
350
+ " nfeat = self.config.getIntConfig(\"common.num.feat\")[0]\n",
351
+ " nclass = self.config.getIntConfig(\"class.num.class\")[0]\n",
352
+ " #transform with shift and scale\n",
353
+ " ftrans = self.config.getFloatListConfig(\"common.feat.trans\")[0]\n",
354
+ " feTrans = dict()\n",
355
+ " for i in range(0, len(ftrans), 2):\n",
356
+ " tr = (ftrans[i], ftrans[i+1])\n",
357
+ " indx = int(i/2)\n",
358
+ " feTrans[indx] = tr\n",
359
+ "\n",
360
+ " ftypes = self.config.getStringListConfig(\"common.feat.types\")[0]\n",
361
+ "\n",
362
+ " # categorical feature distribution\n",
363
+ " feCatDist = dict()\n",
364
+ " fcatdl = self.config.getStringListConfig(\"common.cat.feat.distr\")[0]\n",
365
+ " for fcatds in fcatdl:\n",
366
+ " fcatd = fcatds.split(\":\")\n",
367
+ " feInd = int(fcatd[0])\n",
368
+ " clVal = int(fcatd[1])\n",
369
+ " key = (feInd, clVal)\t\t#feature index and class value\n",
370
+ " dist = list(map(lambda i : (fcatd[i], float(fcatd[i+1])), range(2, len(fcatd), 2)))\n",
371
+ " feCatDist[key] = CategoricalRejectSampler(*dist)\n",
372
+ "\n",
373
+ " #shift and scale\n",
374
+ " genTechnique = self.config.getStringConfig(\"class.gen.technique\")[0]\n",
375
+ " error = self.config.getFloatConfig(\"common.error\")[0]\n",
376
+ " if genTechnique == \"blob\":\n",
377
+ " features, claz = make_blobs(n_samples=nsamp, centers=nclass, n_features=nfeat)\n",
378
+ " for i in range(nsamp):\t\t\t#shift and scale\n",
379
+ " for j in range(nfeat):\n",
380
+ " tr = feTrans[j]\n",
381
+ " features[i,j] = (features[i,j] + tr[0]) * tr[1]\n",
382
+ " claz = np.array(list(map(lambda c : random.randint(0, nclass-1) if random.random() < error else c, claz)))\n",
383
+ " elif genTechnique == \"classify\":\n",
384
+ " nfeatInfo = self.config.getIntConfig(\"class.num.feat.informative\")[0]\n",
385
+ " nfeatRed = self.config.getIntConfig(\"class.num.feat.redundant\")[0]\n",
386
+ " nfeatRep = self.config.getIntConfig(\"class.num.feat.repeated\")[0]\n",
387
+ " shifts = list(map(lambda i : feTrans[i][0], range(nfeat)))\n",
388
+ " scales = list(map(lambda i : feTrans[i][1], range(nfeat)))\n",
389
+ " features, claz = make_classification(n_samples=nsamp, n_features=nfeat, n_informative=nfeatInfo, n_redundant=nfeatRed, \n",
390
+ " n_repeated=nfeatRep, n_classes=nclass, flip_y=error, shift=shifts, scale=scales)\n",
391
+ " else:\n",
392
+ " raise ValueError(\"invalid generation technique\")\n",
393
+ "\n",
394
+ " # add categorical features and format\n",
395
+ " nCatFeat = self.config.getIntConfig(\"class.num.feat.cat\")[0]\n",
396
+ " prec = self.config.getIntConfig(\"common.output.precision\")[0]\n",
397
+ " for f , c in zip(features, claz):\n",
398
+ " nfs = list(map(lambda i : self.numFeToStr(i, f[i], c, ftypes[i], prec), range(nfeat)))\n",
399
+ " if nCatFeat > 0:\n",
400
+ " cfs = list(map(lambda i : self.catFe(i, c, ftypes[i], feCatDist), range(nfeat, nfeat + nCatFeat, 1)))\n",
401
+ " rec = \",\".join(nfs) + \",\" + \",\".join(cfs) + \",\" + str(c)\n",
402
+ " else:\n",
403
+ " rec = \",\".join(nfs) + \",\" + str(c)\n",
404
+ " yield rec\n",
405
+ "\n",
406
+ " def numFeToStr(self, i, fv, cv, ft, prec):\n",
407
+ " \"\"\"\n",
408
+ " numeric feature value to string\n",
409
+ "\n",
410
+ " Parameters\n",
411
+ " i : col index, fv : field value\n",
412
+ " cv : class value, ft : field data type\n",
413
+ " prec : precision\n",
414
+ " \"\"\"\n",
415
+ " if ft == \"float\":\n",
416
+ " s = formatFloat(prec, fv)\n",
417
+ " elif ft ==\"int\":\n",
418
+ " s = str(int(fv))\n",
419
+ " else:\t\t\n",
420
+ " raise ValueError(\"invalid type, expecting float or int\")\n",
421
+ " return s\n",
422
+ "\n",
423
+ " def catFe(self, i, cv, ft, feCatDist):\n",
424
+ " \"\"\"\n",
425
+ " generate categorical feature\n",
426
+ "\n",
427
+ " Parameters\n",
428
+ " i : col index\n",
429
+ " cv : class value\n",
430
+ " ft : field data type\n",
431
+ " feCatDist : cat value distribution\n",
432
+ " \"\"\"\n",
433
+ " if ft == \"cat\":\n",
434
+ " key = (i, cv)\n",
435
+ " s = feCatDist[key].sample()\n",
436
+ " else:\t\t\n",
437
+ " raise ValueError(\"invalid type, expecting categorical\")\n",
438
+ " return s\n",
439
+ "\n",
440
+ "\n",
441
+ "\n",
442
+ "def loadDataFile(file, delim, cols, colIndices):\n",
443
+ " \"\"\"\n",
444
+ " loads delim separated file and extracts columns\n",
445
+ " Parameters\n",
446
+ " file : file path\n",
447
+ " delim : delimiter\n",
448
+ " cols : columns to use from file\n",
449
+ " colIndices : columns to extract\n",
450
+ " \"\"\"\n",
451
+ " data = np.loadtxt(file, delimiter=delim, usecols=cols)\n",
452
+ " extrData = data[:,colIndices]\n",
453
+ " return (data, extrData)\n",
454
+ "\n",
455
+ "def loadFeatDataFile(file, delim, cols):\n",
456
+ " \"\"\"\n",
457
+ " loads delim separated file and extracts columns\n",
458
+ "\n",
459
+ " Parameters\n",
460
+ " file : file path\n",
461
+ " delim : delimiter\n",
462
+ " cols : columns to use from file\n",
463
+ " \"\"\"\n",
464
+ " data = np.loadtxt(file, delimiter=delim, usecols=cols)\n",
465
+ " return data\n",
466
+ "\n",
467
+ "def extrColumns(arr, columns):\n",
468
+ " \"\"\"\n",
469
+ " extracts columns\n",
470
+ "\n",
471
+ " Parameters\n",
472
+ " arr : 2D array\n",
473
+ " columns : columns\n",
474
+ " \"\"\"\n",
475
+ " return arr[:, columns]\n",
476
+ "\n",
477
+ "def subSample(featData, clsData, subSampleRate, withReplacement):\n",
478
+ " \"\"\"\n",
479
+ " subsample feature and class label data\t\n",
480
+ " Parameters\n",
481
+ " featData : 2D array of feature data\n",
482
+ " clsData : array of class labels\n",
483
+ " subSampleRate : fraction to be sampled\n",
484
+ " withReplacement : true if sampling with replacement\n",
485
+ " \"\"\"\n",
486
+ " sampSize = int(featData.shape[0] * subSampleRate)\n",
487
+ " sampledIndx = np.random.choice(featData.shape[0],sampSize, replace=withReplacement)\n",
488
+ " sampFeat = featData[sampledIndx]\n",
489
+ " sampCls = clsData[sampledIndx]\n",
490
+ " return(sampFeat, sampCls)\n",
491
+ "\n",
492
+ "def euclideanDistance(x,y):\n",
493
+ " \"\"\"\n",
494
+ " euclidean distance\n",
495
+ " Parameters\n",
496
+ " x : first vector\n",
497
+ " y : second vector\n",
498
+ " \"\"\"\n",
499
+ " return sqrt(sum(pow(a-b, 2) for a, b in zip(x, y)))\n",
500
+ "\n",
501
+ "def squareRooted(x):\n",
502
+ " \"\"\"\n",
503
+ " square root of sum square\n",
504
+ " Parameters\n",
505
+ " x : data vector\n",
506
+ " \"\"\"\n",
507
+ " return round(sqrt(sum([a*a for a in x])),3)\n",
508
+ "\n",
509
+ "def cosineSimilarity(x,y):\n",
510
+ " \"\"\"\n",
511
+ " cosine similarity\n",
512
+ "\n",
513
+ " Parameters\n",
514
+ " x : first vector\n",
515
+ " y : second vector\n",
516
+ " \"\"\"\n",
517
+ " numerator = sum(a*b for a,b in zip(x,y))\n",
518
+ " denominator = squareRooted(x) * squareRooted(y)\n",
519
+ " return round(numerator / float(denominator), 3)\n",
520
+ "\n",
521
+ "def cosineDistance(x,y):\n",
522
+ " \"\"\"\n",
523
+ " cosine distance\n",
524
+ " Parameters\n",
525
+ " x : first vector\n",
526
+ " y : second vector\n",
527
+ " \"\"\"\n",
528
+ " return 1.0 - cosineSimilarity(x,y)\n",
529
+ "\n",
530
+ "def manhattanDistance(x,y):\n",
531
+ " \"\"\"\n",
532
+ " manhattan distance\n",
533
+ " Parameters\n",
534
+ " x : first vector\n",
535
+ " y : second vector\n",
536
+ " \"\"\"\n",
537
+ " return sum(abs(a-b) for a,b in zip(x,y))\n",
538
+ "\n",
539
+ "def nthRoot(value, nRoot):\n",
540
+ " \"\"\"\n",
541
+ " nth root\n",
542
+ " Parameters\n",
543
+ " value : data value\n",
544
+ " nRoot : root\n",
545
+ " \"\"\"\n",
546
+ " rootValue = 1/float(nRoot)\n",
547
+ " return round (Decimal(value) ** Decimal(rootValue),3)\n",
548
+ "\n",
549
+ "def minkowskiDistance(x,y,pValue):\n",
550
+ " \"\"\"\n",
551
+ " minkowski distance\n",
552
+ " Parameters\n",
553
+ " x : first vector\n",
554
+ " y : second vector\n",
555
+ " pValue : power factor\n",
556
+ " \"\"\"\n",
557
+ " return nthRoot(sum(pow(abs(a-b),pValue) for a,b in zip(x, y)), pValue)\n",
558
+ "\n",
559
+ "def jaccardSimilarityX(x,y):\n",
560
+ " \"\"\"\n",
561
+ " jaccard similarity\n",
562
+ " Parameters\n",
563
+ " x : first vector\n",
564
+ " y : second vector\n",
565
+ " \"\"\"\n",
566
+ " intersectionCardinality = len(set.intersection(*[set(x), set(y)]))\n",
567
+ " unionCardinality = len(set.union(*[set(x), set(y)]))\n",
568
+ " return intersectionCardinality/float(unionCardinality)\n",
569
+ "\n",
570
+ "def jaccardSimilarity(x,y,wx=1.0,wy=1.0):\n",
571
+ " \"\"\"\n",
572
+ " jaccard similarity\n",
573
+ "\n",
574
+ " Parameters\n",
575
+ " x : first vector\n",
576
+ " y : second vector\n",
577
+ " wx : weight for x\n",
578
+ " wy : weight for y\n",
579
+ " \"\"\"\n",
580
+ " sx = set(x)\n",
581
+ " sy = set(y)\n",
582
+ " sxyInt = sx.intersection(sy)\n",
583
+ " intCardinality = len(sxyInt)\n",
584
+ " sxIntDiff = sx.difference(sxyInt)\n",
585
+ " syIntDiff = sy.difference(sxyInt)\n",
586
+ " unionCardinality = len(sx.union(sy))\n",
587
+ " return intCardinality/float(intCardinality + wx * len(sxIntDiff) + wy * len(syIntDiff))\n",
588
+ "\n",
589
+ "def levenshteinSimilarity(s1, s2):\n",
590
+ " \"\"\"\n",
591
+ " Levenshtein similarity for strings\n",
592
+ "\n",
593
+ " Parameters\n",
594
+ " s1 : first string\n",
595
+ " s2 : second string\n",
596
+ " \"\"\"\n",
597
+ " assert type(s1) == str and type(s2) == str, \"Levenshtein similarity is for string only\"\n",
598
+ " d = ld(s1,s2)\n",
599
+ " #print(d)\n",
600
+ " l = max(len(s1),len(s2))\n",
601
+ " d = 1.0 - min(d/l, 1.0)\n",
602
+ " return d\t\n",
603
+ "\n",
604
+ "def norm(values, po=2):\n",
605
+ " \"\"\"\n",
606
+ " norm\n",
607
+ " Parameters\n",
608
+ " values : list of values\n",
609
+ " po : power\n",
610
+ " \"\"\"\n",
611
+ " no = sum(list(map(lambda v: pow(v,po), values)))\n",
612
+ " no = pow(no,1.0/po)\n",
613
+ " return list(map(lambda v: v/no, values))\n",
614
+ "\n",
615
+ "def createOneHotVec(size, indx = -1):\n",
616
+ " \"\"\"\n",
617
+ " random one hot vector\n",
618
+ "\n",
619
+ " Parameters\n",
620
+ " size : vector size\n",
621
+ " indx : one hot position\n",
622
+ " \"\"\"\n",
623
+ " vec = [0] * size\n",
624
+ " s = random.randint(0, size - 1) if indx < 0 else indx\n",
625
+ " vec[s] = 1\n",
626
+ " return vec\n",
627
+ "\n",
628
+ "def createAllOneHotVec(size):\n",
629
+ " \"\"\"\n",
630
+ " create all one hot vectors\n",
631
+ "\n",
632
+ " Parameters\n",
633
+ " size : vector size and no of vectors\n",
634
+ " \"\"\"\n",
635
+ " vecs = list()\n",
636
+ " for i in range(size):\n",
637
+ " vec = [0] * size\n",
638
+ " vec[i] = 1\n",
639
+ " vecs.append(vec)\n",
640
+ " return vecs\n",
641
+ "\n",
642
+ "def blockShuffle(data, blockSize):\n",
643
+ " \"\"\"\n",
644
+ " block shuffle \t\n",
645
+ "\n",
646
+ " Parameters\n",
647
+ " data : list data\n",
648
+ " blockSize : block size\n",
649
+ " \"\"\"\n",
650
+ " numBlock = int(len(data) / blockSize)\n",
651
+ " remain = len(data) % blockSize\n",
652
+ " numBlock += (1 if remain > 0 else 0)\n",
653
+ " shuffled = list()\n",
654
+ " for i in range(numBlock):\n",
655
+ " b = random.randint(0, numBlock-1)\n",
656
+ " beg = b * blockSize\n",
657
+ " if (b < numBlock-1):\n",
658
+ " end = beg + blockSize\n",
659
+ " shuffled.extend(data[beg:end])\t\t\n",
660
+ " else:\n",
661
+ " shuffled.extend(data[beg:])\n",
662
+ " return shuffled\t\n",
663
+ "\n",
664
+ "def shuffle(data, numShuffle):\n",
665
+ " \"\"\"\n",
666
+ " shuffle data by random swapping\n",
667
+ "\n",
668
+ " Parameters\n",
669
+ " data : list data\n",
670
+ " numShuffle : no of pairwise swaps\n",
671
+ " \"\"\"\n",
672
+ " sz = len(data)\n",
673
+ " if numShuffle is None:\n",
674
+ " numShuffle = int(sz / 2)\n",
675
+ " for i in range(numShuffle):\n",
676
+ " fi = random.randint(0, sz -1)\n",
677
+ " se = random.randint(0, sz -1)\n",
678
+ " tmp = data[fi]\n",
679
+ " data[fi] = data[se]\n",
680
+ " data[se] = tmp\t\n",
681
+ "\n",
682
+ "def randomWalk(size, start, lowStep, highStep):\n",
683
+ " \"\"\"\n",
684
+ " random walk\t\n",
685
+ "\n",
686
+ " Parameters\n",
687
+ " size : no of steps\n",
688
+ " start : initial position\n",
689
+ " lowStep : step min\n",
690
+ " highStep : step max\n",
691
+ " \"\"\"\n",
692
+ " cur = start\n",
693
+ " for i in range(size):\n",
694
+ " yield cur\n",
695
+ " cur += randomFloat(lowStep, highStep)\n",
696
+ "\n",
697
+ "def binaryEcodeCategorical(values, value):\n",
698
+ " \"\"\"\n",
699
+ " one hot binary encoding\t\n",
700
+ "\n",
701
+ " Parameters\n",
702
+ " values : list of values\n",
703
+ " value : value to be replaced with 1\n",
704
+ " \"\"\"\n",
705
+ " size = len(values)\n",
706
+ " vec = [0] * size\n",
707
+ " for i in range(size):\n",
708
+ " if (values[i] == value):\n",
709
+ " vec[i] = 1\n",
710
+ " return vec\t\t\n",
711
+ "\n",
712
+ "def createLabeledSeq(inputData, tw):\n",
713
+ " \"\"\"\n",
714
+ " Creates feature, label pair from sequence data, where we have tw number of features followed by output\n",
715
+ "\n",
716
+ " Parameters\n",
717
+ " inputData : list containing feature and label\n",
718
+ " tw : no of features\n",
719
+ " \"\"\"\n",
720
+ " features = list()\n",
721
+ " labels = list()\n",
722
+ " l = len(inputData)\n",
723
+ " for i in range(l - tw):\n",
724
+ " trainSeq = inputData[i:i+tw]\n",
725
+ " trainLabel = inputData[i+tw]\n",
726
+ " features.append(trainSeq)\n",
727
+ " labels.append(trainLabel)\n",
728
+ " return (features, labels)\n",
729
+ "\n",
730
+ "def createLabeledSeqFromFile(filePath, delim, index, tw):\t\t#renamed to avoid shadowing the list based createLabeledSeq\n",
731
+ " \"\"\"\n",
732
+ " Creates feature, label pair from 1D sequence data in file\t\n",
733
+ "\n",
734
+ " Parameters\n",
735
+ " filePath : file path\n",
736
+ " delim : delimiter\n",
737
+ " index : column index\n",
738
+ " tw : no of features\n",
739
+ " \"\"\"\n",
740
+ " seqData = getFileColumnAsFloat(filePath, delim, index)\n",
741
+ " return createLabeledSeq(seqData, tw)\n",
742
+ "\n",
743
+ "def fromMultDimSeqToTabular(data, inpSize, seqLen):\n",
744
+ " \"\"\"\n",
745
+ " Input shape (nrow, inpSize * seqLen) output shape(nrow * seqLen, inpSize)\n",
746
+ "\n",
747
+ " Parameters\n",
748
+ " data : 2D array\n",
749
+ " inpSize : each input size in sequence\n",
750
+ " seqLen : sequence length\n",
751
+ " \"\"\"\t\n",
752
+ " nrow = data.shape[0]\n",
753
+ " assert data.shape[1] == inpSize * seqLen, \"invalid input size or sequence length\"\n",
754
+ " return data.reshape(nrow * seqLen, inpSize)\n",
755
+ "\n",
756
+ "def fromTabularToMultDimSeq(data, inpSize, seqLen):\n",
757
+ " \"\"\"\n",
758
+ " Input shape (nrow * seqLen, inpSize) output shape (nrow, inpSize * seqLen) \n",
759
+ " Parameters\n",
760
+ " data : 2D array\n",
761
+ " inpSize : each input size in sequence\n",
762
+ " seqLen : sequence length\n",
763
+ " \"\"\"\t\n",
764
+ " nrow = int(data.shape[0] / seqLen)\n",
765
+ " assert data.shape[1] == inpSize, \"invalid input size\"\n",
766
+ " return data.reshape(nrow, seqLen * inpSize)\n",
767
+ "\n",
768
+ "def difference(data, interval=1):\n",
769
+ " \"\"\"\n",
770
+ " takes difference in time series data\n",
771
+ " Parameters\n",
772
+ " data :list data\n",
773
+ " interval : interval for difference\n",
774
+ " \"\"\"\n",
775
+ " diff = list()\n",
776
+ " for i in range(interval, len(data)):\n",
777
+ " value = data[i] - data[i - interval]\n",
778
+ " diff.append(value)\n",
779
+ " return diff\n",
780
+ "\n",
781
+ "def normalizeMatrix(data, norm, axis=1):\n",
782
+ " \"\"\"\n",
783
+ " normalizes each row of the matrix\n",
784
+ "\n",
785
+ " Parameters\n",
786
+ " data : 2D data\n",
787
+ " norm : normalization method\n",
788
+ " axis : row or column\n",
789
+ " \"\"\"\n",
790
+ " normalized = preprocessing.normalize(data,norm=norm, axis=axis)\n",
791
+ " return normalized\n",
792
+ "\n",
793
+ "def standardizeMatrix(data, axis=0):\n",
794
+ " \"\"\"\n",
795
+ " standardizes each column of the matrix with mean and std deviation\n",
796
+ " Parameters\n",
797
+ " data : 2D data\n",
798
+ " axis : row or column\n",
799
+ " \"\"\"\n",
800
+ " standardized = preprocessing.scale(data, axis=axis)\n",
801
+ " return standardized\n",
802
+ "\n",
803
+ "def asNumpyArray(data):\n",
804
+ " \"\"\"\n",
805
+ " converts to numpy array\n",
806
+ " Parameters\n",
807
+ " data : array\n",
808
+ " \"\"\"\n",
809
+ " return np.array(data)\n",
810
+ "\n",
811
+ "def perfMetric(metric, yActual, yPred, clabels=None):\n",
812
+ " \"\"\"\n",
813
+ " predictive model accuracy metric\n",
814
+ " Parameters\n",
815
+ " metric : accuracy metric\n",
816
+ " yActual : actual values array\n",
817
+ " yPred : predicted values array\n",
818
+ " clabels : class labels\n",
819
+ " \"\"\"\n",
820
+ " if metric == \"rsquare\":\n",
821
+ " score = metrics.r2_score(yActual, yPred)\n",
822
+ " elif metric == \"mae\":\n",
823
+ " score = metrics.mean_absolute_error(yActual, yPred)\n",
824
+ " elif metric == \"mse\":\n",
825
+ " score = metrics.mean_squared_error(yActual, yPred)\n",
826
+ " elif metric == \"acc\":\n",
827
+ " yPred = np.rint(yPred)\n",
828
+ " score = metrics.accuracy_score(yActual, yPred)\n",
829
+ " elif metric == \"mlAcc\":\n",
830
+ " yPred = np.argmax(yPred, axis=1)\n",
831
+ " score = metrics.accuracy_score(yActual, yPred)\n",
832
+ " elif metric == \"prec\":\n",
833
+ " yPred = np.argmax(yPred, axis=1)\n",
834
+ " score = metrics.precision_score(yActual, yPred)\n",
835
+ " elif metric == \"rec\":\n",
836
+ " yPred = np.argmax(yPred, axis=1)\n",
837
+ " score = metrics.recall_score(yActual, yPred)\n",
838
+ " elif metric == \"fone\":\n",
839
+ " yPred = np.argmax(yPred, axis=1)\n",
840
+ " score = metrics.f1_score(yActual, yPred)\n",
841
+ " elif metric == \"confm\":\n",
842
+ " yPred = np.argmax(yPred, axis=1)\n",
843
+ " score = metrics.confusion_matrix(yActual, yPred)\n",
844
+ " elif metric == \"clarep\":\n",
845
+ " yPred = np.argmax(yPred, axis=1)\n",
846
+ " score = metrics.classification_report(yActual, yPred)\n",
847
+ " elif metric == \"bce\":\n",
848
+ " if clabels is None:\n",
849
+ " clabels = [0, 1]\n",
850
+ " score = metrics.log_loss(yActual, yPred, labels=clabels)\n",
851
+ " elif metric == \"ce\":\n",
852
+ " assert clabels is not None, \"labels must be provided\"\n",
853
+ " score = metrics.log_loss(yActual, yPred, labels=clabels)\n",
854
+ " else:\n",
855
+ " exitWithMsg(\"invalid prediction performance metric \" + metric)\n",
856
+ " return score\n",
857
+ "\n",
858
+ "def scaleData(data, method):\n",
859
+ " \"\"\"\n",
860
+ " scales feature data column wise\n",
861
+ " Parameters\n",
862
+ " data : 2D array\n",
863
+ " method : scaling method\n",
864
+ " \"\"\"\n",
865
+ " if method == \"minmax\":\n",
866
+ " scaler = preprocessing.MinMaxScaler()\n",
867
+ " data = scaler.fit_transform(data)\n",
868
+ " elif method == \"zscale\":\n",
869
+ " data = preprocessing.scale(data)\t\n",
870
+ " else:\n",
871
+ " raise ValueError(\"invalid scaling method\")\t\n",
872
+ " return data\n",
873
+ "\n",
874
+ "def scaleDataWithParams(data, method, scParams):\n",
875
+ " \"\"\"\n",
876
+ " scales feature data column wise\n",
877
+ " Parameters\n",
878
+ " data : 2D array\n",
879
+ " method : scaling method\n",
880
+ " scParams : scaling parameters\n",
881
+ " \"\"\"\n",
882
+ " if method == \"minmax\":\n",
883
+ " data = scaleMinMaxTabData(data, scParams)\n",
884
+ " elif method == \"zscale\":\n",
885
+ " raise ValueError(\"invalid scaling method\")\t\n",
886
+ " else:\n",
887
+ " raise ValueError(\"invalid scaling method\")\t\n",
888
+ " return data\n",
889
+ "\n",
890
+ "\n",
891
+ "def scaleMinMaxTabData(tdata, minMax):\n",
892
+ " \"\"\"\n",
893
+ " for tabular data, scales feature data column wise using min max values for each field\n",
894
+ " Parameters\n",
895
+ " tdata : 2D array\n",
896
+ " minMax : min, max and range for each column\n",
897
+ " \"\"\"\n",
898
+ " stdata = list()\n",
899
+ " for r in tdata:\n",
900
+ " srdata = list()\n",
901
+ " for i, c in enumerate(r):\n",
902
+ " sd = (c - minMax[i][0]) / minMax[i][2]\n",
903
+ " srdata.append(sd)\n",
904
+ " stdata.append(srdata)\n",
905
+ " return stdata\n",
906
+ "\n",
907
+ "def scaleMinMax(rdata, minMax):\n",
908
+ " \"\"\"\n",
909
+ " scales feature data column wise using min max values for each field\n",
910
+ " Parameters\n",
911
+ " rdata : data array\n",
912
+ " minMax : min, max and range for each column\n",
913
+ " \"\"\"\n",
914
+ " srdata = list()\n",
915
+ " for i in range(len(rdata)):\n",
916
+ " d = rdata[i]\n",
917
+ " sd = (d - minMax[i][0]) / minMax[i][2]\n",
918
+ " srdata.append(sd)\n",
919
+ " return srdata\n",
920
+ "\n",
921
+ "def harmonicNum(n):\n",
922
+ " \"\"\"\n",
923
+ " harmonic number\n",
924
+ " Parameters\n",
925
+ " n : number\n",
926
+ " \"\"\"\n",
927
+ " h = 0\n",
928
+ " for i in range(1, n+1, 1):\n",
929
+ " h += 1.0 / i\n",
930
+ " return h\n",
931
+ "\n",
932
+ "def digammaFun(n):\n",
933
+ " \"\"\"\n",
934
+ " digamma function\n",
935
+ " Parameters\n",
936
+ " n : number\n",
937
+ " \"\"\"\n",
938
+ " #Euler Mascheroni constant\n",
939
+ " ec = 0.577216\n",
940
+ " return harmonicNum(n - 1) - ec\n",
941
+ "\n",
942
+ "def getDataPartitions(tdata, types, columns = None):\n",
943
+ " \"\"\"\n",
944
+ " partitions data with the given columns and random split point defined with predicates\n",
945
+ " Parameters\n",
946
+ " tdata : 2D array\n",
947
+ " types : data types\n",
948
+ " columns : column indexes\n",
949
+ " \"\"\"\n",
950
+ " (dtypes, cvalues) = extractTypesFromString(types)\n",
951
+ " if columns is None:\n",
952
+ " ncol = len(tdata[0])\n",
953
+ " columns = list(range(ncol))\n",
954
+ " ncol = len(columns)\n",
955
+ " #print(columns)\n",
956
+ "\n",
957
+ " # partition predicates\n",
958
+ " partitions = None\n",
959
+ " for c in columns:\n",
960
+ " #print(c)\n",
961
+ " dtype = dtypes[c]\n",
962
+ " pred = list()\n",
963
+ " if dtype == \"int\" or dtype == \"float\":\n",
964
+ " (vmin, vmax) = getColMinMax(tdata, c)\n",
965
+ " r = vmax - vmin\n",
966
+ " rmin = vmin + .2 * r\n",
967
+ " rmax = vmax - .2 * r\n",
968
+ " sp = randomFloat(rmin, rmax)\n",
969
+ " if dtype == \"int\":\n",
970
+ " sp = int(sp)\n",
971
+ " else:\n",
972
+ " sp = \"{:.3f}\".format(sp)\n",
973
+ " sp = float(sp)\n",
974
+ " pred.append([c, \"LT\", sp])\n",
975
+ " pred.append([c, \"GE\", sp])\n",
976
+ " elif dtype == \"cat\":\n",
977
+ " cv = cvalues[c]\n",
978
+ " card = len(cv) \n",
979
+ " if card < 3:\n",
980
+ " num = 1\n",
981
+ " else:\n",
982
+ " num = randomInt(1, card - 1)\n",
983
+ " sp = selectRandomSubListFromList(cv, num)\n",
984
+ " sp = \" \".join(sp)\n",
985
+ " pred.append([c, \"IN\", sp])\n",
986
+ " pred.append([c, \"NOTIN\", sp])\n",
987
+ "\n",
988
+ " #print(pred)\n",
989
+ " if partitions is None:\n",
990
+ " partitions = pred.copy()\n",
991
+ " #print(\"initial\")\n",
992
+ " #print(partitions)\n",
993
+ " else:\n",
994
+ " #print(\"extension\")\n",
995
+ " tparts = list()\n",
996
+ " for p in partitions:\n",
997
+ " #print(p)\n",
998
+ " l1 = p.copy()\n",
999
+ " l1.extend(pred[0])\n",
1000
+ " l2 = p.copy()\n",
1001
+ " l2.extend(pred[1])\n",
1002
+ " #print(\"after extension\")\n",
1003
+ " #print(l1)\n",
1004
+ " #print(l2)\n",
1005
+ " tparts.append(l1)\n",
1006
+ " tparts.append(l2)\n",
1007
+ " partitions = tparts\t\n",
1008
+ " #print(\"extending\")\n",
1009
+ " #print(partitions)\n",
1010
+ "\n",
1011
+ " #for p in partitions:\n",
1012
+ " #print(p)\t\n",
1013
+ " return partitions\t\t\t\n",
1014
+ "\n",
1015
+ "def genAlmostUniformDistr(size, nswap=50):\n",
1016
+ " \"\"\"\n",
1017
+ " generate probability distribution\n",
1018
+ "\n",
1019
+ " Parameters\n",
1020
+ " size : distr size\n",
1021
+ " nswap : no of mass swaps\n",
1022
+ " \"\"\"\n",
1023
+ " un = 1.0 / size\n",
1024
+ " distr = [un] * size\n",
1025
+ " distr = mutDistr(distr, 0.1 * un, nswap)\n",
1026
+ " return distr\n",
1027
+ "\n",
1028
+ "def mutDistr(distr, shift, nswap=50):\n",
1029
+ " \"\"\"\n",
1030
+ " mutates a probability distribution\n",
1031
+ "\n",
1032
+ " Parameters\n",
1033
+ " distr : distribution\n",
1034
+ " shift : amount of shift for swap\n",
1035
+ " nswap : no of mass swaps\n",
1036
+ " \"\"\"\n",
1037
+ " size = len(distr)\n",
1038
+ " for _ in range(nswap):\n",
1039
+ " fi = randomInt(0, size -1)\n",
1040
+ " si = randomInt(0, size -1)\n",
1041
+ " while fi == si:\n",
1042
+ " fi = randomInt(0, size -1)\n",
1043
+ " si = randomInt(0, size -1)\n",
1044
+ "\n",
1045
+ " shift = randomFloat(0, shift)\n",
1046
+ " t = distr[fi]\n",
1047
+ " distr[fi] -= shift\n",
1048
+ " if (distr[fi] < 0):\n",
1049
+ " distr[fi] = 0.0\n",
1050
+ " shift = t\n",
1051
+ " distr[si] += shift\n",
1052
+ " return distr\n",
1053
+ "\n",
1054
+ "def generateBinDistribution(size, ntrue):\n",
1055
+ " \"\"\"\n",
1056
+ " generate binary array with some elements set to 1\n",
1057
+ "\n",
1058
+ " Parameters\n",
1059
+ " size : distr size\n",
1060
+ " ntrue : no of true values\n",
1061
+ " \"\"\"\n",
1062
+ " distr = [0] * size\n",
1063
+ " idxs = selectRandomSubListFromList(list(range(size)), ntrue)\n",
1064
+ " for i in idxs:\n",
1065
+ " distr[i] = 1\n",
1066
+ " return distr\n",
1067
+ "\n",
1068
+ "def mutBinaryDistr(distr, nmut):\n",
1069
+ " \"\"\"\n",
1070
+ " mutate binary distribution\n",
1071
+ "\n",
1072
+ " Parameters\n",
1073
+ " distr : distr\n",
1074
+ " nmut : no of mutations\n",
1075
+ " \"\"\"\n",
1076
+ " idxs = selectRandomSubListFromList(list(range(len(distr))), nmut)\n",
1077
+ " for i in idxs:\n",
1078
+ " distr[i] = distr[i] ^ 1\n",
1079
+ "\n",
1080
+ "\n",
1081
+ "def fileSelFieldSubSeqModifierGen(filePath, column, offset, seqLen, modifier, precision, delim=\",\"):\n",
1082
+ " \"\"\"\n",
1083
+ " file record generator that superimposes given data in the specified segment of a column\n",
1084
+ " Parameters\n",
1085
+ " filePath : file path\n",
1086
+ " column : column index \n",
1087
+ " offset : offset into column values\n",
1088
+ " seqLen : length of subseq\n",
1089
+ " modifier : data to be superimposed either list or a sampler object\n",
1090
+ " precision : floating point precision\n",
1091
+ " delim : delimiter\n",
1092
+ " \"\"\"\n",
1093
+ " beg = offset\n",
1094
+ " end = beg + seqLen\n",
1095
+ " isList = type(modifier) == list\n",
1096
+ " i = 0\n",
1097
+ " for rec in fileRecGen(filePath, delim):\n",
1098
+ " if i >= beg and i < end:\n",
1099
+ " va = float(rec[column])\n",
1100
+ " if isList:\n",
1101
+ " va += modifier[i - beg] \n",
1102
+ " else:\n",
1103
+ " va += modifier.sample()\n",
1104
+ " rec[column] = formatFloat(precision, va)\n",
1105
+ " yield delim.join(rec)\n",
1106
+ " i += 1\n",
1107
+ "\n",
1108
+ "class ShiftedDataGenerator:\n",
1109
+ " \"\"\"\n",
1110
+ " transforms data for distribution shift\n",
1111
+ " \"\"\"\n",
1112
+ " def __init__(self, types, tdata, addFact, multFact):\n",
1113
+ " \"\"\"\n",
1114
+ " initializer\n",
1115
+ "\n",
1116
+ " Parameters\n",
1117
+ " types : data types\n",
1118
+ " tdata : 2D array\n",
1119
+ " addFact : factor for data shift\n",
1120
+ " multFact : factor for data scaling\n",
1121
+ " \"\"\"\n",
1122
+ " (self.dtypes, self.cvalues) = extractTypesFromString(types)\n",
1123
+ "\n",
1124
+ " self.limits = dict()\n",
1125
+ " for k,v in self.dtypes.items():\n",
1126
+ " if v == \"int\" or v == \"float\":\n",
1127
+ " (vmin, vmax) = getColMinMax(tdata, k)\n",
1128
+ " self.limits[k] = vmax - vmin\n",
1129
+ " self.addMin = - addFact / 2\n",
1130
+ " self.addMax = addFact / 2\n",
1131
+ " self.multMin = 1.0 - multFact / 2\n",
1132
+ " self.multMax = 1.0 + multFact / 2\n",
1133
+ "\n",
1134
+ "\n",
1135
+ "\n",
1136
+ "\n",
1137
+ " def transform(self, tdata):\n",
1138
+ " \"\"\"\n",
1139
+ " linearly transforms data to create distribution shift with random shift and scale\n",
1140
+ " Parameters\n",
1141
+ " types : data types\n",
1142
+ " \"\"\"\n",
1143
+ " transforms = dict()\n",
1144
+ " for k,v in self.dtypes.items():\n",
1145
+ " if v == \"int\" or v == \"float\":\t\t\t\t\n",
1146
+ " shift = randomFloat(self.addMin, self.addMax) * self.limits[k] \n",
1147
+ " scale = randomFloat(self.multMin, self.multMax)\n",
1148
+ " trns = (shift, scale)\n",
1149
+ " transforms[k] = trns\n",
1150
+ " elif v == \"cat\":\n",
1151
+ " transforms[k] = isEventSampled(50)\n",
1152
+ "\n",
1153
+ " ttdata = list()\n",
1154
+ " for rec in tdata:\n",
1155
+ " nrec = rec.copy()\n",
1156
+ " for c in range(len(rec)):\n",
1157
+ " if c in self.dtypes:\n",
1158
+ " dtype = self.dtypes[c]\n",
1159
+ " if dtype == \"int\" or dtype == \"float\":\n",
1160
+ " (shift, scale) = transforms[c]\n",
1161
+ " nval = shift + rec[c] * scale\n",
1162
+ " if dtype == \"int\":\n",
1163
+ " nrec[c] = int(nval)\n",
1164
+ " else:\n",
1165
+ " nrec[c] = nval\n",
1166
+ " elif dtype == \"cat\":\n",
1167
+ " cv = self.cvalues[c]\n",
1168
+ " if transforms[c]:\n",
1169
+ " nval = selectOtherRandomFromList(cv, rec[c])\n",
1170
+ " nrec[c] = nval\n",
1171
+ "\n",
1172
+ " ttdata.append(nrec)\n",
1173
+ "\n",
1174
+ " return ttdata\n",
1175
+ "\n",
1176
+ " def transformSpecified(self, tdata, sshift, scale):\n",
1177
+ " \"\"\"\n",
1178
+ " linearly transforms data to create distribution shift with specified shift and scale\n",
1179
+ " Parameters\n",
1180
+ " types : data types\n",
1181
+ " sshift : shift factor\n",
1182
+ " scale : scale factor\n",
1183
+ " \"\"\"\n",
1184
+ " transforms = dict()\n",
1185
+ " for k,v in self.dtypes.items():\n",
1186
+ " if v == \"int\" or v == \"float\":\t\t\t\t\n",
1187
+ " shift = sshift * self.limits[k] \n",
1188
+ " trns = (shift, scale)\n",
1189
+ " transforms[k] = trns\n",
1190
+ " elif v == \"cat\":\n",
1191
+ " transforms[k] = isEventSampled(50)\n",
1192
+ "\n",
1193
+ " ttdata = self.__scaleShift(tdata, transforms)\n",
1194
+ " return ttdata\n",
1195
+ "\n",
1196
+ " def __scaleShift(self, tdata, transforms):\n",
1197
+ " \"\"\"\n",
1198
+ " shifts and scales tabular data\n",
1199
+ "\n",
1200
+ " Parameters\n",
1201
+ " tdata : 2D array\n",
1202
+ " transforms : transforms to apply\n",
1203
+ " \"\"\"\n",
1204
+ " ttdata = list()\n",
1205
+ " for rec in tdata:\n",
1206
+ " nrec = rec.copy()\n",
1207
+ " for c in range(len(rec)):\n",
1208
+ " if c in self.dtypes:\n",
1209
+ " dtype = self.dtypes[c]\n",
1210
+ " if dtype == \"int\" or dtype == \"float\":\n",
1211
+ " (shift, scale) = transforms[c]\n",
1212
+ " nval = shift + rec[c] * scale\n",
1213
+ " if dtype == \"int\":\n",
1214
+ " nrec[c] = int(nval)\n",
1215
+ " else:\n",
1216
+ " nrec[c] = nval\n",
1217
+ " elif dtype == \"cat\":\n",
1218
+ " cv = self.cvalues[c]\n",
1219
+ " if transforms[c]:\n",
1220
+ " #nval = selectOtherRandomFromList(cv, rec[c])\n",
1221
+ " #nrec[c] = nval\n",
1222
+ " pass\n",
1223
+ "\n",
1224
+ " ttdata.append(nrec)\n",
1225
+ " return ttdata\n",
1226
+ "\n",
1227
+ "class RollingStat(object):\n",
1228
+ " \"\"\"\n",
1229
+ " stats for rolling window\n",
1230
+ " \"\"\"\n",
1231
+ " def __init__(self, wsize):\n",
1232
+ " \"\"\"\n",
1233
+ " initializer\n",
1234
+ "\n",
1235
+ " Parameters\n",
1236
+ " wsize : window size\n",
1237
+ " \"\"\"\n",
1238
+ " self.window = list()\n",
1239
+ " self.wsize = wsize\n",
1240
+ " self.mean = None\n",
1241
+ " self.sd = None\n",
1242
+ "\n",
1243
+ " def add(self, value):\n",
1244
+ " \"\"\"\n",
1245
+ " add a value\n",
1246
+ "\n",
1247
+ " Parameters\n",
1248
+ " value : value to add\n",
1249
+ " \"\"\"\n",
1250
+ " self.window.append(value)\n",
1251
+ " if len(self.window) > self.wsize:\n",
1252
+ " self.window = self.window[1:]\n",
1253
+ "\n",
1254
+ " def getStat(self):\n",
1255
+ " \"\"\"\n",
1256
+ " get rolling window mean and std deviation\n",
1257
+ " \"\"\"\n",
1258
+ " assertGreater(len(self.window), 0, \"window is empty\")\n",
1259
+ " if len(self.window) == 1:\n",
1260
+ " self.mean = self.window[0]\n",
1261
+ " self.sd = 0\n",
1262
+ " else:\n",
1263
+ " self.mean = statistics.mean(self.window)\n",
1264
+ " self.sd = statistics.stdev(self.window, xbar=self.mean)\n",
1265
+ " re = (self.mean, self.sd)\n",
1266
+ " return re\n",
1267
+ "\n",
1268
+ " def getSize(self):\n",
1269
+ " \"\"\"\n",
1270
+ " return window size\n",
1271
+ " \"\"\"\n",
1272
+ " return len(self.window)\n"
1273
+ ]
1274
+ }
1275
+ ],
1276
+ "metadata": {
1277
+ "kernelspec": {
1278
+ "display_name": "Python 3 (ipykernel)",
1279
+ "language": "python",
1280
+ "name": "python3"
1281
+ },
1282
+ "language_info": {
1283
+ "codemirror_mode": {
1284
+ "name": "ipython",
1285
+ "version": 3
1286
+ },
1287
+ "file_extension": ".py",
1288
+ "mimetype": "text/x-python",
1289
+ "name": "python",
1290
+ "nbconvert_exporter": "python",
1291
+ "pygments_lexer": "ipython3",
1292
+ "version": "3.9.12"
1293
+ }
1294
+ },
1295
+ "nbformat": 4,
1296
+ "nbformat_minor": 5
1297
+ }
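A note on usage: the Configuration class above reads Java-style .properties files through jprops, treating the literal string "none" as a missing value and "_" as "fall back to the default supplied in defValues". A minimal sketch of how it behaves, assuming lib/mlutil.py (the module this notebook checkpoint mirrors) is importable and jprops is installed; the property names and sample.properties file here are hypothetical, made up for illustration:

import os, tempfile
from mlutil import Configuration  # assumption: lib/ is on sys.path

# hypothetical properties file: "_" -> use default, "none" -> absent
path = os.path.join(tempfile.mkdtemp(), "sample.properties")
with open(path, "w") as f:
    f.write("common.num.samp=200\ncommon.num.feat=_\ncommon.verbose=none\n")

defValues = {
    "common.num.samp": (100, None),  # default 100
    "common.num.feat": (5, None),    # default 5
    "common.verbose": (False, None),
}
config = Configuration(path, defValues)
print(config.getIntConfig("common.num.samp"))     # (200, False)  explicit value
print(config.getIntConfig("common.num.feat"))     # (5, True)     default applied
print(config.getBooleanConfig("common.verbose"))  # (None, False) "none" sentinel

Each getter returns a (value, isDefault) tuple, which is why call sites throughout the notebook index the result with [0].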
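The distance and similarity helpers are plain-Python reference implementations. A quick sanity check, again a sketch under the same import assumption (the Levenshtein package must be installed for the string similarity):

from mlutil import euclideanDistance, manhattanDistance, minkowskiDistance
from mlutil import jaccardSimilarity, levenshteinSimilarity

x = [1.0, 2.0, 3.0]
y = [4.0, 6.0, 8.0]
print(euclideanDistance(x, y))      # sqrt(3^2 + 4^2 + 5^2) ~ 7.0711
print(manhattanDistance(x, y))      # 3 + 4 + 5 = 12
print(minkowskiDistance(x, y, 2))   # p=2 reduces to euclidean: 7.071 (rounded)
print(jaccardSimilarity([1, 2, 3], [2, 3, 4]))     # 2 shared / (2 + 1 + 1) = 0.5
print(levenshteinSimilarity("kitten", "sitting"))  # 1 - 3/7 ~ 0.571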
lib/.ipynb_checkpoints/sampler-checkpoint.ipynb ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "cells": [],
3
+ "metadata": {},
4
+ "nbformat": 4,
5
+ "nbformat_minor": 5
6
+ }
lib/.ipynb_checkpoints/stats-checkpoint.ipynb ADDED
@@ -0,0 +1,510 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "f4cbab42",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import sys\n",
11
+ "import random \n",
12
+ "import time\n",
13
+ "import math\n",
14
+ "import numpy as np\n",
15
+ "import statistics \n",
16
+ "from util import *\n",
17
+ "\n",
18
+ "\"\"\"\n",
19
+ "histogram class\n",
20
+ "\"\"\"\n",
21
+ "class Histogram:\n",
22
+ " def __init__(self, xmin, binWidth):\n",
23
+ " \"\"\"\n",
24
+ " initializer\n",
25
+ "\n",
26
+ " Parameters\n",
27
+ " xmin : min x\n",
28
+ " binWidth : bin width\n",
29
+ " \"\"\"\n",
30
+ " self.xmin = xmin\n",
31
+ " self.binWidth = binWidth\n",
32
+ " self.normalized = False\n",
33
+ "\n",
34
+ " @classmethod\n",
35
+ " def createInitialized(cls, xmin, binWidth, values):\n",
36
+ " \"\"\"\n",
37
+ " create histogram instance with min domain, bin width and values\n",
38
+ "\n",
39
+ " Parameters\n",
40
+ " xmin : min x\n",
41
+ " binWidth : bin width\n",
42
+ " values : y values\n",
43
+ " \"\"\"\n",
44
+ " instance = cls(xmin, binWidth)\n",
45
+ " instance.xmax = xmin + binWidth * (len(values) - 1)\n",
46
+ " instance.ymin = 0\n",
47
+ " instance.bins = np.array(values)\n",
48
+ " instance.fmax = 0\n",
49
+ " for v in values:\n",
50
+ " if (v > instance.fmax):\n",
51
+ " instance.fmax = v\n",
52
+ " instance.ymin = 0.0\n",
53
+ " instance.ymax = instance.fmax\n",
54
+ " return instance\n",
55
+ "\n",
56
+ " @classmethod\n",
57
+ " def createWithNumBins(cls, values, numBins=20):\n",
58
+ " \"\"\"\n",
59
+ " create histogram instance with values and no of bins\n",
60
+ "\n",
61
+ " Parameters\n",
62
+ " values : y values\n",
63
+ " numBins : no of bins\n",
64
+ " \"\"\"\n",
65
+ " xmin = min(values)\n",
66
+ " xmax = max(values)\n",
67
+ " binWidth = (xmax + .01 - (xmin - .01)) / numBins\n",
68
+ " instance = cls(xmin, binWidth)\n",
69
+ " instance.xmax = xmax\n",
70
+ " instance.numBin = numBins\n",
71
+ " instance.bins = np.zeros(instance.numBin)\n",
72
+ " for v in values:\n",
73
+ " instance.add(v)\n",
74
+ " return instance\n",
75
+ "\n",
76
+ " @classmethod\n",
77
+ " def createUninitialized(cls, xmin, xmax, binWidth):\n",
78
+ " \"\"\"\n",
79
+ " create histogram instance with no y values using domain min, max and bin width\n",
80
+ "\n",
81
+ " Parameters\n",
82
+ " xmin : min x\n",
83
+ " xmax : max x\n",
84
+ " binWidth : bin width\n",
85
+ " \"\"\"\n",
86
+ " instance = cls(xmin, binWidth)\n",
87
+ " instance.xmax = xmax\n",
88
+ " instance.numBin = int((xmax - xmin) / binWidth) + 1\n",
89
+ " instance.bins = np.zeros(instance.numBin)\n",
90
+ " return instance\n",
91
+ "\n",
92
+ " def initialize(self):\n",
93
+ " \"\"\"\n",
94
+ " set y values to 0\n",
95
+ " \"\"\"\n",
96
+ " self.bins = np.zeros(self.numBin)\n",
97
+ "\n",
98
+ " def add(self, value):\n",
99
+ " \"\"\"\n",
100
+ " adds a value to a bin\n",
101
+ "\n",
102
+ " Parameters\n",
103
+ " value : value\n",
104
+ " \"\"\"\n",
105
+ " bin = int((value - self.xmin) / self.binWidth)\n",
106
+ " if (bin < 0 or bin > self.numBin - 1):\n",
107
+ " print (bin)\n",
108
+ " raise ValueError(\"outside histogram range\")\n",
109
+ " self.bins[bin] += 1.0\n",
110
+ "\n",
111
+ " def normalize(self):\n",
112
+ " \"\"\"\n",
113
+ " normalize bin counts\n",
114
+ " \"\"\"\n",
115
+ " if not self.normalized:\n",
116
+ " total = self.bins.sum()\n",
117
+ " self.bins = np.divide(self.bins, total)\n",
118
+ " self.normalized = True\n",
119
+ "\n",
120
+ " def cumDistr(self):\n",
121
+ " \"\"\"\n",
122
+ " cumulative distribution\n",
123
+ " \"\"\"\n",
124
+ " self.normalize()\n",
125
+ " self.cbins = np.cumsum(self.bins)\n",
126
+ " return self.cbins\n",
127
+ "\n",
128
+ " def distr(self):\n",
129
+ " \"\"\"\n",
130
+ " distr\n",
131
+ " \"\"\"\n",
132
+ " self.normalize()\n",
133
+ " return self.bins\n",
134
+ "\n",
135
+ "\n",
136
+ " def percentile(self, percent):\n",
137
+ " \"\"\"\n",
138
+ " return value corresponding to a percentile\n",
139
+ "\n",
140
+ " Parameters\n",
141
+ " percent : percentile value\n",
142
+ " \"\"\"\n",
143
+ " if getattr(self, \"cbins\", None) is None:\n",
144
+ " raise ValueError(\"cumulative distribution is not available\")\n",
145
+ "\n",
146
+ " for i,cuml in enumerate(self.cbins):\n",
147
+ " if cuml >= percent:\n",
148
+ " value = (i * self.binWidth) - (self.binWidth / 2) + \\\n",
149
+ " (percent - self.cbins[i-1]) * self.binWidth / (self.cbins[i] - self.cbins[i-1]) \n",
150
+ " break\n",
151
+ " return value\n",
152
+ "\n",
153
+ " def max(self):\n",
154
+ " \"\"\"\n",
155
+ " return max bin value \n",
156
+ " \"\"\"\n",
157
+ " return self.bins.max()\n",
158
+ "\n",
159
+ " def value(self, x):\n",
160
+ " \"\"\"\n",
161
+ " return a bin value\t\n",
162
+ "\n",
163
+ " Parameters\n",
164
+ " x : x value\n",
165
+ " \"\"\"\n",
166
+ " bin = int((x - self.xmin) / self.binWidth)\n",
167
+ " f = self.bins[bin]\n",
168
+ " return f\n",
169
+ "\n",
170
+ " def bin(self, x):\n",
171
+ " \"\"\"\n",
172
+ " return a bin index\t\n",
173
+ "\n",
174
+ " Parameters\n",
175
+ " x : x value\n",
176
+ " \"\"\"\n",
177
+ " return int((x - self.xmin) / self.binWidth)\n",
178
+ "\n",
179
+ " def cumValue(self, x):\n",
180
+ " \"\"\"\n",
181
+ " return a cumulative bin value\t\n",
182
+ "\n",
183
+ " Parameters\n",
184
+ " x : x value\n",
185
+ " \"\"\"\n",
186
+ " bin = int((x - self.xmin) / self.binWidth)\n",
187
+ " c = self.cbins[bin]\n",
188
+ " return c\n",
189
+ "\n",
190
+ "\n",
191
+ " def getMinMax(self):\n",
192
+ " \"\"\"\n",
193
+ " returns x min and x max\n",
194
+ " \"\"\"\n",
195
+ " return (self.xmin, self.xmax)\n",
196
+ "\n",
197
+ " def boundedValue(self, x):\n",
198
+ " \"\"\"\n",
199
+ " return x bounded by min and max\t\n",
200
+ "\n",
201
+ " Parameters\n",
202
+ " x : x value\n",
203
+ " \"\"\"\n",
204
+ " if x < self.xmin:\n",
205
+ " x = self.xmin\n",
206
+ " elif x > self.xmax:\n",
207
+ " x = self.xmax\n",
208
+ " return x\n",
209
+ "\n",
210
+ "\"\"\"\n",
211
+ "categorical histogram class\n",
212
+ "\"\"\"\n",
213
+ "class CatHistogram:\n",
214
+ " def __init__(self):\n",
215
+ " \"\"\"\n",
216
+ " initializer\n",
217
+ " \"\"\"\n",
218
+ " self.binCounts = dict()\n",
219
+ " self.counts = 0\n",
220
+ " self.normalized = False\n",
221
+ "\n",
222
+ " def add(self, value):\n",
223
+ " \"\"\"\n",
224
+ " adds a value to a bin\n",
225
+ "\n",
226
+ " Parameters\n",
227
+ " value : value to add\n",
228
+ " \"\"\"\n",
229
+ " addToKeyedCounter(self.binCounts, value)\n",
230
+ " self.counts += 1\t\n",
231
+ "\n",
232
+ " def normalize(self):\n",
233
+ " \"\"\"\n",
234
+ " normalize\n",
235
+ " \"\"\"\n",
236
+ " if not self.normalized:\n",
237
+ " self.binCounts = dict(map(lambda r : (r[0],r[1] / self.counts), self.binCounts.items()))\n",
238
+ " self.normalized = True\n",
239
+ "\n",
240
+ " def getMode(self):\n",
241
+ " \"\"\"\n",
242
+ " get mode\n",
243
+ " \"\"\"\n",
244
+ " maxk = None\n",
245
+ " maxv = 0\n",
246
+ " #print(self.binCounts)\n",
247
+ " for k,v in self.binCounts.items():\n",
248
+ " if v > maxv:\n",
249
+ " maxk = k\n",
250
+ " maxv = v\n",
251
+ " return (maxk, maxv)\t\n",
252
+ "\n",
253
+ " def getEntropy(self):\n",
254
+ " \"\"\"\n",
255
+ " get entropy\n",
256
+ " \"\"\"\n",
257
+ " self.normalize()\n",
258
+ " entr = 0 \n",
259
+ " #print(self.binCounts)\n",
260
+ " for k,v in self.binCounts.items():\n",
261
+ " entr -= v * math.log(v)\n",
262
+ " return entr\n",
263
+ "\n",
264
+ " def getUniqueValues(self):\n",
265
+ " \"\"\"\n",
266
+ " get unique values\n",
267
+ " \"\"\"\t\t\n",
268
+ " return list(self.binCounts.keys())\n",
269
+ "\n",
270
+ " def getDistr(self):\n",
271
+ " \"\"\"\n",
272
+ " get distribution\n",
273
+ " \"\"\"\t\n",
274
+ " self.normalize()\t\n",
275
+ " return self.binCounts.copy()\n",
276
+ "\n",
277
+ "class RunningStat:\n",
278
+ " \"\"\"\n",
279
+ " running stat class\n",
280
+ " \"\"\"\n",
281
+ " def __init__(self):\n",
282
+ " \"\"\"\n",
283
+ " initializer\t\n",
284
+ " \"\"\"\n",
285
+ " self.sum = 0.0\n",
286
+ " self.sumSq = 0.0\n",
287
+ " self.count = 0\n",
288
+ "\n",
289
+ " @staticmethod\n",
290
+ " def create(count, sum, sumSq):\n",
291
+ " \"\"\"\n",
292
+ " creates instance\t\n",
293
+ "\n",
294
+ " Parameters\n",
295
+ " sum : sum of values\n",
296
+ " sumSq : sum of values squared\n",
297
+ " \"\"\"\n",
298
+ " rs = RunningStat()\n",
299
+ " rs.sum = sum\n",
300
+ " rs.sumSq = sumSq\n",
301
+ " rs.count = count\n",
302
+ " return rs\n",
303
+ "\n",
304
+ " def add(self, value):\n",
305
+ " \"\"\"\n",
306
+ " adds new value\n",
307
+ " Parameters\n",
308
+ " value : value to add\n",
309
+ " \"\"\"\n",
310
+ " self.sum += value\n",
311
+ " self.sumSq += (value * value)\n",
312
+ " self.count += 1\n",
313
+ "\n",
314
+ " def getStat(self):\n",
315
+ " \"\"\"\n",
316
+ " return mean and std deviation \n",
317
+ " \"\"\"\n",
318
+ " mean = self.sum /self. count\n",
319
+ " t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)\n",
320
+ " sd = math.sqrt(t)\n",
321
+ " re = (mean, sd)\n",
322
+ " return re\n",
323
+ "\n",
324
+ " def addGetStat(self,value):\n",
325
+ " \"\"\"\n",
326
+ " calculate mean and std deviation with new value added\n",
327
+ " Parameters\n",
328
+ " value : value to add\n",
329
+ " \"\"\"\n",
330
+ " self.add(value)\n",
331
+ " re = self.getStat()\n",
332
+ " return re\n",
333
+ "\n",
334
+ " def getCount(self):\n",
335
+ " \"\"\"\n",
336
+ " return count\n",
337
+ " \"\"\"\n",
338
+ " return self.count\n",
339
+ "\n",
340
+ " def getState(self):\n",
341
+ " \"\"\"\n",
342
+ " return state\n",
343
+ " \"\"\"\n",
344
+ " s = (self.count, self.sum, self.sumSq)\n",
345
+ " return s\n",
346
+ "\n",
347
+ "class SlidingWindowStat:\n",
348
+ " \"\"\"\n",
349
+ " sliding window stats\n",
350
+ " \"\"\"\n",
351
+ " def __init__(self):\n",
352
+ " \"\"\"\n",
353
+ " initializer\n",
354
+ " \"\"\"\n",
355
+ " self.sum = 0.0\n",
356
+ " self.sumSq = 0.0\n",
357
+ " self.count = 0\n",
358
+ " self.values = None\n",
359
+ "\n",
360
+ " @staticmethod\n",
361
+ " def create(values, sum, sumSq):\n",
362
+ " \"\"\"\n",
363
+ " creates iinstance\t\n",
364
+ "\n",
365
+ " Parameters\n",
366
+ " sum : sum of values\n",
367
+ " sumSq : sum of valure squared\n",
368
+ " \"\"\"\n",
369
+ " sws = SlidingWindowStat()\n",
370
+ " sws.sum = sum\n",
371
+ " sws.sumSq = sumSq\n",
372
+ " self.values = values.copy()\n",
373
+ " sws.count = len(self.values)\n",
374
+ " return sws\n",
375
+ "\n",
376
+ " @staticmethod\n",
377
+ " def initialize(values):\n",
378
+ " \"\"\"\n",
379
+ " creates iinstance\t\n",
380
+ "\n",
381
+ " Parameters\n",
382
+ " values : list of values\n",
383
+ " \"\"\"\n",
384
+ " sws = SlidingWindowStat()\n",
385
+ " sws.values = values.copy()\n",
386
+ " for v in sws.values:\n",
387
+ " sws.sum += v\n",
388
+ " sws.sumSq += v * v\t\t\n",
389
+ " sws.count = len(sws.values)\n",
390
+ " return sws\n",
391
+ "\n",
392
+ " @staticmethod\n",
393
+ " def createEmpty(count):\n",
394
+ " \"\"\"\n",
395
+ " creates iinstance\t\n",
396
+ "\n",
397
+ " Parameters\n",
398
+ " count : count of values\n",
399
+ " \"\"\"\n",
400
+ " sws = SlidingWindowStat()\n",
401
+ " sws.count = count\n",
402
+ " sws.values = list()\n",
403
+ " return sws\n",
404
+ "\n",
405
+ " def add(self, value):\n",
406
+ " \"\"\"\n",
407
+ " adds new value\n",
408
+ "\n",
409
+ " Parameters\n",
410
+ " value : value to add\n",
411
+ " \"\"\"\n",
412
+ " self.values.append(value)\t\t\n",
413
+ " if len(self.values) > self.count:\n",
414
+ " self.sum += value - self.values[0]\n",
415
+ " self.sumSq += (value * value) - (self.values[0] * self.values[0])\n",
416
+ " self.values.pop(0)\n",
417
+ " else:\n",
418
+ " self.sum += value\n",
419
+ " self.sumSq += (value * value)\n",
420
+ "\n",
421
+ "\n",
422
+ " def getStat(self):\n",
423
+ " \"\"\"\n",
424
+ " calculate mean and std deviation \n",
425
+ " \"\"\"\n",
426
+ " mean = self.sum /self. count\n",
427
+ " t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)\n",
428
+ " sd = math.sqrt(t)\n",
429
+ " re = (mean, sd)\n",
430
+ " return re\n",
431
+ "\n",
432
+ " def addGetStat(self,value):\n",
433
+ " \"\"\"\n",
434
+ " calculate mean and std deviation with new value added\n",
435
+ " \"\"\"\n",
436
+ " self.add(value)\n",
437
+ " re = self.getStat()\n",
438
+ " return re\n",
439
+ "\n",
440
+ " def getCount(self):\n",
441
+ " \"\"\"\n",
442
+ " return count\n",
443
+ " \"\"\"\n",
444
+ " return self.count\n",
445
+ "\n",
446
+ " def getCurSize(self):\n",
447
+ " \"\"\"\n",
448
+ " return count\n",
449
+ " \"\"\"\n",
450
+ " return len(self.values)\n",
451
+ "\n",
452
+ " def getState(self):\n",
453
+ " \"\"\"\n",
454
+ " return state\n",
455
+ " \"\"\"\n",
456
+ " s = (self.count, self.sum, self.sumSq)\n",
457
+ " return s\n",
458
+ "\n",
459
+ "\n",
460
+ "def basicStat(ldata):\n",
461
+ " \"\"\"\n",
462
+ " mean and std dev\n",
463
+ " Parameters\n",
464
+ " ldata : list of values\n",
465
+ " \"\"\"\n",
466
+ " m = statistics.mean(ldata)\n",
467
+ " s = statistics.stdev(ldata, xbar=m)\n",
468
+ " r = (m, s)\n",
469
+ " return r\n",
470
+ "\n",
471
+ "def getFileColumnStat(filePath, col, delem=\",\"):\n",
472
+ " \"\"\"\n",
473
+ " gets stats for a file column\n",
474
+ "\n",
475
+ " Parameters\n",
476
+ " filePath : file path\n",
477
+ " col : col index\n",
478
+ " delem : field delemter\n",
479
+ " \"\"\"\n",
480
+ " rs = RunningStat()\n",
481
+ " for rec in fileRecGen(filePath, delem):\n",
482
+ " va = float(rec[col])\n",
483
+ " rs.add(va)\n",
484
+ "\n",
485
+ " return rs.getStat()\n"
486
+ ]
487
+ }
488
+ ],
489
+ "metadata": {
490
+ "kernelspec": {
491
+ "display_name": "Python 3 (ipykernel)",
492
+ "language": "python",
493
+ "name": "python3"
494
+ },
495
+ "language_info": {
496
+ "codemirror_mode": {
497
+ "name": "ipython",
498
+ "version": 3
499
+ },
500
+ "file_extension": ".py",
501
+ "mimetype": "text/x-python",
502
+ "name": "python",
503
+ "nbconvert_exporter": "python",
504
+ "pygments_lexer": "ipython3",
505
+ "version": "3.9.12"
506
+ }
507
+ },
508
+ "nbformat": 4,
509
+ "nbformat_minor": 5
510
+ }
lib/.ipynb_checkpoints/tnn-checkpoint.ipynb ADDED
@@ -0,0 +1,800 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "3853095d",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import sys\n",
12
+ "import matplotlib.pyplot as plt\n",
13
+ "import numpy as np\n",
14
+ "import torch\n",
15
+ "from torch.autograd import Variable\n",
16
+ "from torch.utils.data import Dataset, TensorDataset\n",
17
+ "from torch.utils.data import DataLoader\n",
18
+ "import sklearn as sk\n",
19
+ "from sklearn.neighbors import KDTree\n",
20
+ "import matplotlib\n",
21
+ "import random\n",
22
+ "import jprops\n",
23
+ "from random import randint\n",
24
+ "import statistics\n",
25
+ "sys.path.append(os.path.abspath(\"../lib\"))\n",
26
+ "from util import *\n",
27
+ "from mlutil import *\n",
28
+ "\n",
29
+ "\"\"\"\n",
30
+ "forward hook function\n",
31
+ "\"\"\"\n",
32
+ "intermedOut = {}\n",
33
+ "lvalues = list()\n",
34
+ "\n",
35
+ "def hookFn(m, i, o):\n",
36
+ " \"\"\"\n",
37
+ " call back for latent values\n",
38
+ " \"\"\"\n",
39
+ " #intermedOut[m] = o\n",
40
+ " lv = o.data.cpu().numpy()\n",
41
+ " lv = lv[0].tolist()\n",
42
+ " lvalues.append(lv)\n",
43
+ " #print(lv)\n",
44
+ "\n",
45
+ "def getLatValues():\n",
46
+ " \"\"\"\n",
47
+ " \"\"\"\n",
48
+ " return lvalues\n",
49
+ "\n",
50
+ "class FeedForwardNetwork(torch.nn.Module):\n",
51
+ " def __init__(self, configFile, addDefValues=None):\n",
52
+ " \"\"\"\n",
53
+ " In the constructor we instantiate two nn.Linear modules and assign them as\n",
54
+ " member variables.\n",
55
+ "\n",
56
+ " Parameters\n",
57
+ " configFile : config file path\n",
58
+ " addDefValues : dictionary of additional default values\t\n",
59
+ " \"\"\"\n",
60
+ " defValues = dict() if addDefValues is None else addDefValues.copy()\n",
61
+ " defValues[\"common.mode\"] = (\"training\", None)\n",
62
+ " defValues[\"common.model.directory\"] = (\"model\", None)\n",
63
+ " defValues[\"common.model.file\"] = (None, None)\n",
64
+ " defValues[\"common.preprocessing\"] = (None, None)\n",
65
+ " defValues[\"common.scaling.method\"] = (\"zscale\", None)\n",
66
+ " defValues[\"common.scaling.minrows\"] = (50, None)\n",
67
+ " defValues[\"common.scaling.param.file\"] = (None, None)\n",
68
+ " defValues[\"common.verbose\"] = (False, None)\n",
69
+ " defValues[\"common.device\"] = (\"cpu\", None)\n",
70
+ " defValues[\"train.data.file\"] = (None, \"missing training data file\")\n",
71
+ " defValues[\"train.data.fields\"] = (None, \"missing training data field ordinals\")\n",
72
+ " defValues[\"train.data.feature.fields\"] = (None, \"missing training data feature field ordinals\")\n",
73
+ " defValues[\"train.data.out.fields\"] = (None, \"missing training data feature field ordinals\")\n",
74
+ " defValues[\"train.layer.data\"] = (None, \"missing layer data\")\n",
75
+ " defValues[\"train.input.size\"] = (None, None)\n",
76
+ " defValues[\"train.output.size\"] = (None, \"missing output size\")\n",
77
+ " defValues[\"train.batch.size\"] = (10, None)\n",
78
+ " defValues[\"train.loss.reduction\"] = (\"mean\", None)\n",
79
+ " defValues[\"train.num.iterations\"] = (500, None)\n",
80
+ " defValues[\"train.lossFn\"] = (\"mse\", None) \n",
81
+ " defValues[\"train.optimizer\"] = (\"sgd\", None) \n",
82
+ " defValues[\"train.opt.learning.rate\"] = (.0001, None)\n",
83
+ " defValues[\"train.opt.weight.decay\"] = (0, None) \n",
84
+ " defValues[\"train.opt.momentum\"] = (0, None) \n",
85
+ " defValues[\"train.opt.eps\"] = (1e-08, None) \n",
86
+ " defValues[\"train.opt.dampening\"] = (0, None) \n",
87
+ " defValues[\"train.opt.momentum.nesterov\"] = (False, None) \n",
88
+ " defValues[\"train.opt.betas\"] = ([0.9, 0.999], None) \n",
89
+ " defValues[\"train.opt.alpha\"] = (0.99, None) \n",
90
+ " defValues[\"train.save.model\"] = (False, None) \n",
91
+ " defValues[\"train.track.error\"] = (False, None) \n",
92
+ " defValues[\"train.epoch.intv\"] = (5, None) \n",
93
+ " defValues[\"train.batch.intv\"] = (5, None) \n",
94
+ " defValues[\"train.print.weights\"] = (False, None) \n",
95
+ " defValues[\"valid.data.file\"] = (None, None)\n",
96
+ " defValues[\"valid.accuracy.metric\"] = (None, None)\n",
97
+ " defValues[\"predict.data.file\"] = (None, None)\n",
98
+ " defValues[\"predict.use.saved.model\"] = (True, None)\n",
99
+ " defValues[\"predict.output\"] = (\"binary\", None)\n",
100
+ " defValues[\"predict.feat.pad.size\"] = (60, None)\n",
101
+ " defValues[\"predict.print.output\"] = (True, None)\n",
102
+ " defValues[\"calibrate.num.bins\"] = (10, None)\n",
103
+ " defValues[\"calibrate.pred.prob.thresh\"] = (0.5, None)\n",
104
+ " defValues[\"calibrate.num.nearest.neighbors\"] = (10, None)\n",
105
+ " self.config = Configuration(configFile, defValues)\n",
106
+ "\n",
107
+ " super(FeedForwardNetwork, self).__init__()\n",
108
+ "\n",
109
+ " def setConfigParam(self, name, value):\n",
110
+ " \"\"\"\n",
111
+ " set config param\n",
112
+ "\n",
113
+ " Parameters\n",
114
+ " name : config name\n",
115
+ " value : config value\n",
116
+ " \"\"\"\n",
117
+ " self.config.setParam(name, value)\n",
118
+ "\n",
119
+ " def getConfig(self):\n",
120
+ " \"\"\"\n",
121
+ " get config object\n",
122
+ " \"\"\"\n",
123
+ " return self.config\n",
124
+ "\n",
125
+ " def setVerbose(self, verbose):\n",
126
+ " self.verbose = verbose\n",
127
+ "\n",
128
+ " def buildModel(self):\n",
129
+ " \"\"\"\n",
130
+ " Loads configuration and builds the various piecess necessary for the model\n",
131
+ " \"\"\"\n",
132
+ " torch.manual_seed(9999)\n",
133
+ "\n",
134
+ " self.verbose = self.config.getBooleanConfig(\"common.verbose\")[0]\n",
135
+ " numinp = self.config.getIntConfig(\"train.input.size\")[0]\n",
136
+ " if numinp is None:\n",
137
+ " numinp = len(self.config.getIntListConfig(\"train.data.feature.fields\")[0])\n",
138
+ " #numOut = len(self.config.getStringConfig(\"train.data.out.fields\")[0].split(\",\"))\n",
139
+ " self.outputSize = self.config.getIntConfig(\"train.output.size\")[0]\n",
140
+ " self.batchSize = self.config.getIntConfig(\"train.batch.size\")[0]\n",
141
+ " #lossRed = self.config.getStringConfig(\"train.loss.reduction\")[0]\n",
142
+ " #learnRate = self.config.getFloatConfig(\"train.opt.learning.rate\")[0]\n",
143
+ " self.numIter = self.config.getIntConfig(\"train.num.iterations\")[0]\n",
144
+ " optimizer = self.config.getStringConfig(\"train.optimizer\")[0]\n",
145
+ " self.lossFnStr = self.config.getStringConfig(\"train.lossFn\")[0]\n",
146
+ " self.accMetric = self.config.getStringConfig(\"valid.accuracy.metric\")[0]\n",
147
+ " self.trackErr = self.config.getBooleanConfig(\"train.track.error\")[0]\n",
148
+ " self.batchIntv = self.config.getIntConfig(\"train.batch.intv\")[0]\n",
149
+ " self.restored = False\n",
150
+ " self.clabels = list(range(self.outputSize)) if self.outputSize > 1 else None\n",
151
+ "\n",
152
+ " #build network\n",
153
+ " layers = list()\n",
154
+ " ninp = numinp\n",
155
+ " trData = self.config.getStringConfig(\"train.layer.data\")[0].split(\",\")\n",
156
+ " for ld in trData:\n",
157
+ " lde = ld.split(\":\")\n",
158
+ " assert len(lde) == 5, \"expecting 5 items for layer data\"\n",
159
+ "\n",
160
+ " #num of units, activation, whether batch normalize, whether batch normalize after activation, dropout fraction\n",
161
+ " nunit = int(lde[0])\n",
162
+ " actStr = lde[1]\n",
163
+ " act = FeedForwardNetwork.createActivation(actStr) if actStr != \"none\" else None\n",
164
+ " bnorm = lde[2] == \"true\"\n",
165
+ " afterAct = lde[3] == \"true\"\n",
166
+ " dpr = float(lde[4])\n",
167
+ "\n",
168
+ " layers.append(torch.nn.Linear(ninp, nunit))\t\t\t\n",
169
+ " if bnorm:\n",
170
+ " #with batch norm\n",
171
+ " if afterAct:\n",
172
+ " safeAppend(layers, act)\n",
173
+ " layers.append(torch.nn.BatchNorm1d(nunit))\n",
174
+ " else:\n",
175
+ " layers.append(torch.nn.BatchNorm1d(nunit))\n",
176
+ " safeAppend(layers, act)\n",
177
+ " else:\n",
178
+ " #without batch norm\n",
179
+ " safeAppend(layers, act)\n",
180
+ "\n",
181
+ " if dpr > 0:\n",
182
+ " layers.append(torch.nn.Dropout(dpr))\n",
183
+ " ninp = nunit\n",
184
+ "\n",
185
+ " self.layers = torch.nn.Sequential(*layers)\t\n",
186
+ "\n",
187
+ " self.device = FeedForwardNetwork.getDevice(self)\n",
188
+ "\n",
189
+ " #training data\n",
190
+ " dataFile = self.config.getStringConfig(\"train.data.file\")[0]\n",
191
+ " (featData, outData) = FeedForwardNetwork.prepData(self, dataFile)\n",
192
+ " self.featData = torch.from_numpy(featData)\n",
193
+ " self.outData = torch.from_numpy(outData)\n",
194
+ "\n",
195
+ " #validation data\n",
196
+ " dataFile = self.config.getStringConfig(\"valid.data.file\")[0]\n",
197
+ " (featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataFile)\n",
198
+ " self.validFeatData = torch.from_numpy(featDataV)\n",
199
+ " self.validOutData = torch.from_numpy(outDataV)\n",
200
+ "\n",
201
+ " # loss function and optimizer\n",
202
+ " self.lossFn = FeedForwardNetwork.createLossFunction(self, self.lossFnStr)\n",
203
+ " self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizer)\n",
204
+ "\n",
205
+ " self.yPred = None\n",
206
+ " self.restored = False\n",
207
+ "\n",
208
+ " #mode to device\n",
209
+ " self.device = FeedForwardNetwork.getDevice(self)\t\n",
210
+ " self.featData = self.featData.to(self.device)\n",
211
+ " self.outData = self.outData.to(self.device)\n",
212
+ " self.validFeatData = self.validFeatData.to(self.device)\n",
213
+ " self.to(self.device)\n",
214
+ "\n",
215
+ " @staticmethod\n",
216
+ " def getDevice(model):\n",
217
+ " \"\"\"\n",
218
+ " gets device\n",
219
+ "\n",
220
+ " Parameters\n",
221
+ " model : torch model\n",
222
+ " \"\"\"\n",
223
+ " devType = model.config.getStringConfig(\"common.device\")[0]\n",
224
+ " if devType == \"cuda\":\n",
225
+ " if torch.cuda.is_available():\n",
226
+ " device = torch.device(\"cuda\")\n",
227
+ " else:\n",
228
+ " exitWithMsg(\"cuda not available\")\n",
229
+ " else:\n",
230
+ " device = torch.device(\"cpu\")\n",
231
+ " return device\n",
232
+ "\n",
233
+ " def setValidationData(self, dataSource, prep=True):\n",
234
+ " \"\"\"\n",
235
+ " sets validation data\n",
236
+ "\n",
237
+ " Parameters\n",
238
+ " dataSource : data source str if file path or 2D array\n",
239
+ " prep : if True load and prepare \n",
240
+ " \"\"\"\n",
241
+ " if prep:\n",
242
+ " (featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataSource)\n",
243
+ " self.validFeatData = torch.from_numpy(featDataV)\n",
244
+ " self.validOutData = outDataV\n",
245
+ " else:\n",
246
+ " self.validFeatData = torch.from_numpy(dataSource[0])\n",
247
+ " self.validOutData = dataSource[1]\t\t\n",
248
+ "\n",
249
+ " self.validFeatData = self.validFeatData.to(self.device)\n",
250
+ "\n",
251
+ " @staticmethod\n",
252
+ " def createActivation(actName):\n",
253
+ " \"\"\"\n",
254
+ " create activation\n",
255
+ "\n",
256
+ " Parameters\n",
257
+ " actName : activation name\n",
258
+ " \"\"\"\n",
259
+ " if actName is None:\n",
260
+ " activation = None\n",
261
+ " elif actName == \"relu\":\n",
262
+ " activation = torch.nn.ReLU()\n",
263
+ " elif actName == \"tanh\":\n",
264
+ " activation = torch.nn.Tanh()\n",
265
+ " elif actName == \"sigmoid\":\n",
266
+ " activation = torch.nn.Sigmoid()\n",
267
+ " elif actName == \"softmax\":\n",
268
+ " activation = torch.nn.Softmax(dim=1)\n",
269
+ " else:\n",
270
+ " exitWithMsg(\"invalid activation function name \" + actName)\n",
271
+ " return activation\n",
272
+ "\n",
273
+ " @staticmethod\n",
274
+ " def createLossFunction(model, lossFnName):\n",
275
+ " \"\"\"\n",
276
+ " create loss function\n",
277
+ "\n",
278
+ " Parameters\n",
279
+ " lossFnName : loss function name\n",
280
+ " \"\"\"\n",
281
+ " config = model.config\n",
282
+ " lossRed = config.getStringConfig(\"train.loss.reduction\")[0]\n",
283
+ " if lossFnName == \"ltwo\" or lossFnName == \"mse\":\n",
284
+ " lossFunc = torch.nn.MSELoss(reduction=lossRed)\n",
285
+ " elif lossFnName == \"ce\":\n",
286
+ " lossFunc = torch.nn.CrossEntropyLoss(reduction=lossRed)\n",
287
+ " elif lossFnName == \"lone\" or lossFnName == \"mae\":\n",
288
+ " lossFunc = torch.nn.L1Loss(reduction=lossRed)\n",
289
+ " elif lossFnName == \"bce\":\n",
290
+ " lossFunc = torch.nn.BCELoss(reduction=lossRed)\n",
291
+ " elif lossFnName == \"bcel\":\n",
292
+ " lossFunc = torch.nn.BCEWithLogitsLoss(reduction=lossRed)\n",
293
+ " elif lossFnName == \"sm\":\n",
294
+ " lossFunc = torch.nn.SoftMarginLoss(reduction=lossRed)\n",
295
+ " elif lossFnName == \"mlsm\":\n",
296
+ " lossFunc = torch.nn.MultiLabelSoftMarginLoss(reduction=lossRed)\n",
297
+ " else:\n",
298
+ " exitWithMsg(\"invalid loss function name \" + lossFnName)\n",
299
+ " return lossFunc\n",
300
+ "\n",
301
+ " @staticmethod\n",
302
+ " def createOptimizer(model, optName):\n",
303
+ " \"\"\"\n",
304
+ " create optimizer\n",
305
+ "\n",
306
+ " Parameters\n",
307
+ " optName : optimizer name\n",
308
+ " \"\"\"\n",
309
+ " config = model.config\n",
310
+ " learnRate = config.getFloatConfig(\"train.opt.learning.rate\")[0]\n",
311
+ " weightDecay = config.getFloatConfig(\"train.opt.weight.decay\")[0]\n",
312
+ " momentum = config.getFloatConfig(\"train.opt.momentum\")[0]\n",
313
+ " eps = config.getFloatConfig(\"train.opt.eps\")[0]\n",
314
+ " if optName == \"sgd\":\n",
315
+ " dampening = config.getFloatConfig(\"train.opt.dampening\")[0]\n",
316
+ " momentumNesterov = config.getBooleanConfig(\"train.opt.momentum.nesterov\")[0]\n",
317
+ " optimizer = torch.optim.SGD(model.parameters(),lr=learnRate, momentum=momentum, \n",
318
+ " dampening=dampening, weight_decay=weightDecay, nesterov=momentumNesterov)\n",
319
+ " elif optName == \"adam\":\n",
320
+ " betas = config.getFloatListConfig(\"train.opt.betas\")[0]\n",
321
+ " betas = (betas[0], betas[1]) \n",
322
+ " optimizer = torch.optim.Adam(model.parameters(), lr=learnRate,betas=betas, eps = eps,\n",
323
+ " weight_decay=weightDecay)\n",
324
+ " elif optName == \"rmsprop\":\n",
325
+ " alpha = config.getFloatConfig(\"train.opt.alpha\")[0]\n",
326
+ " optimizer = torch.optim.RMSprop(model.parameters(), lr=learnRate, alpha=alpha,\n",
327
+ " eps=eps, weight_decay=weightDecay, momentum=momentum)\n",
328
+ " else:\n",
329
+ " exitWithMsg(\"invalid optimizer name \" + optName)\n",
330
+ " return optimizer\n",
331
+ "\n",
332
+ "\n",
333
+ " def forward(self, x):\n",
334
+ " \"\"\"\n",
335
+ " In the forward function we accept a Tensor of input data and we must return\n",
336
+ " a Tensor of output data. We can use Modules defined in the constructor as\n",
337
+ " well as arbitrary (differentiable) operations on Tensors.\n",
338
+ "\n",
339
+ " Parameters\n",
340
+ " x : data batch\n",
341
+ " \"\"\"\n",
342
+ " y = self.layers(x)\t\n",
343
+ " return y\n",
344
+ "\n",
345
+ " @staticmethod\n",
346
+ " def addForwardHook(model, l, cl = 0):\n",
347
+ " \"\"\"\n",
348
+ " register forward hooks\n",
349
+ "\n",
350
+ " Parameters\n",
351
+ " l : \n",
352
+ " cl :\n",
353
+ " \"\"\"\n",
354
+ " for name, layer in model._modules.items():\n",
355
+ " #If it is a sequential, don't register a hook on it\n",
356
+ " # but recursively register hook on all it's module children\n",
357
+ " print(str(cl) + \" : \" + name)\n",
358
+ " if isinstance(layer, torch.nn.Sequential):\n",
359
+ " FeedForwardNetwork.addForwardHook(layer, l, cl)\n",
360
+ " else:\n",
361
+ " #\t it's a non sequential. Register a hook\n",
362
+ " if cl == l:\n",
363
+ " print(\"setting hook at layer \" + str(l))\n",
364
+ " layer.register_forward_hook(hookFn)\n",
365
+ " cl += 1\n",
366
+ "\n",
367
+ " @staticmethod\n",
368
+ " def prepData(model, dataSource, includeOutFld=True):\n",
369
+ " \"\"\"\n",
370
+ " loads and prepares data\n",
371
+ "\n",
372
+ " Parameters\n",
373
+ " dataSource : data source str if file path or 2D array\n",
374
+ " includeOutFld : True if target freld to be included\n",
375
+ " \"\"\"\n",
376
+ " # parameters\n",
377
+ " fieldIndices = model.config.getIntListConfig(\"train.data.fields\")[0]\n",
378
+ " featFieldIndices = model.config.getIntListConfig(\"train.data.feature.fields\")[0]\n",
379
+ "\n",
380
+ " #all data and feature data\n",
381
+ " isDataFile = isinstance(dataSource, str)\n",
382
+ " selFieldIndices = fieldIndices if includeOutFld else fieldIndices[:-1]\n",
383
+ " if isDataFile: \n",
384
+ " #source file path \n",
385
+ " (data, featData) = loadDataFile(dataSource, \",\", selFieldIndices, featFieldIndices)\n",
386
+ " else:\n",
387
+ " # tabular data\n",
388
+ " data = tableSelFieldsFilter(dataSource, selFieldIndices)\n",
389
+ " featData = tableSelFieldsFilter(data, featFieldIndices)\n",
390
+ " #print(featData)\n",
391
+ " featData = np.array(featData)\n",
392
+ "\n",
393
+ " if (model.config.getStringConfig(\"common.preprocessing\")[0] == \"scale\"):\n",
394
+ " scalingMethod = model.config.getStringConfig(\"common.scaling.method\")[0]\n",
395
+ "\n",
396
+ " #scale only if there are enough rows\n",
397
+ " nrow = featData.shape[0]\n",
398
+ " minrows = model.config.getIntConfig(\"common.scaling.minrows\")[0]\n",
399
+ " if nrow > minrows:\n",
400
+ " #in place scaling\n",
401
+ " featData = scaleData(featData, scalingMethod)\n",
402
+ " else:\n",
403
+ " #use pre computes scaling parameters\n",
404
+ " spFile = model.config.getStringConfig(\"common.scaling.param.file\")[0]\n",
405
+ " if spFile is None:\n",
406
+ " exitWithMsg(\"for small data sets pre computed scaling parameters need to provided\")\n",
407
+ " scParams = restoreObject(spFile)\n",
408
+ " featData = scaleDataWithParams(featData, scalingMethod, scParams)\n",
409
+ " featData = np.array(featData)\n",
410
+ "\n",
411
+ " # target data\n",
412
+ " if includeOutFld:\n",
413
+ " outFieldIndices = model.config.getStringConfig(\"train.data.out.fields\")[0]\n",
414
+ " outFieldIndices = strToIntArray(outFieldIndices, \",\")\n",
415
+ " if isDataFile:\n",
416
+ " outData = data[:,outFieldIndices]\n",
417
+ " else:\n",
418
+ " outData = tableSelFieldsFilter(data, outFieldIndices)\n",
419
+ " outData = np.array(outData)\n",
420
+ " foData = (featData.astype(np.float32), outData.astype(np.float32))\n",
421
+ " else:\n",
422
+ " foData = featData.astype(np.float32)\n",
423
+ " return foData\n",
424
+ "\n",
425
+ " @staticmethod\n",
426
+ " def saveCheckpt(model):\n",
427
+ " \"\"\"\n",
428
+ " checkpoints model\n",
429
+ "\n",
430
+ " Parameters\n",
431
+ " model : torch model\n",
432
+ " \"\"\"\n",
433
+ " print(\"..saving model checkpoint\")\n",
434
+ " modelDirectory = model.config.getStringConfig(\"common.model.directory\")[0]\n",
435
+ " assert os.path.exists(modelDirectory), \"model save directory does not exist\"\n",
436
+ " modelFile = model.config.getStringConfig(\"common.model.file\")[0]\n",
437
+ " filepath = os.path.join(modelDirectory, modelFile)\n",
438
+ " state = {\"state_dict\": model.state_dict(), \"optim_dict\": model.optimizer.state_dict()}\n",
439
+ " torch.save(state, filepath)\n",
440
+ " if model.verbose:\n",
441
+ " print(\"model saved\")\n",
442
+ "\n",
443
+ " @staticmethod\n",
444
+ " def restoreCheckpt(model, loadOpt=False):\n",
445
+ " \"\"\"\n",
446
+ " restored checkpointed model\n",
447
+ "\n",
448
+ " Parameters\n",
449
+ " model : torch model\n",
450
+ " loadOpt : True if optimizer to be loaded\n",
451
+ " \"\"\"\n",
452
+ " if not model.restored:\n",
453
+ " print(\"..restoring model checkpoint\")\n",
454
+ " modelDirectory = model.config.getStringConfig(\"common.model.directory\")[0]\n",
455
+ " modelFile = model.config.getStringConfig(\"common.model.file\")[0]\n",
456
+ " filepath = os.path.join(modelDirectory, modelFile)\n",
457
+ " assert os.path.exists(filepath), \"model save file does not exist\"\n",
458
+ " checkpoint = torch.load(filepath)\n",
459
+ " model.load_state_dict(checkpoint[\"state_dict\"])\n",
460
+ " model.to(model.device)\n",
461
+ " if loadOpt:\n",
462
+ " model.optimizer.load_state_dict(checkpoint[\"optim_dict\"])\n",
463
+ " model.restored = True\n",
464
+ "\n",
465
+ " @staticmethod\n",
466
+ " def processClassifOutput(yPred, config):\n",
467
+ " \"\"\"\n",
468
+ " extracts probability label 1 or label with highest probability\n",
469
+ "\n",
470
+ " Parameters\n",
471
+ " yPred : predicted output\n",
472
+ " config : config object\n",
473
+ " \"\"\"\n",
474
+ " outType = config.getStringConfig(\"predict.output\")[0]\n",
475
+ " if outType == \"prob\":\n",
476
+ " outputSize = config.getIntConfig(\"train.output.size\")[0]\n",
477
+ " if outputSize == 2:\n",
478
+ " #return prob of pos class for binary classifier \n",
479
+ " yPred = yPred[:, 1]\n",
480
+ " else:\n",
481
+ " #return class value and probability for multi classifier \n",
482
+ " yCl = np.argmax(yPred, axis=1)\n",
483
+ " yPred = list(map(lambda y : y[0][y[1]], zip(yPred, yCl)))\n",
484
+ " yPred = zip(yCl, yPred)\n",
485
+ " else:\n",
486
+ " yPred = np.argmax(yPred, axis=1)\n",
487
+ " return yPred\n",
488
+ "\n",
489
+ " @staticmethod\n",
490
+ " def printPrediction(yPred, config, dataSource):\n",
491
+ " \"\"\"\n",
492
+ " prints input feature data and prediction\n",
493
+ "\n",
494
+ " Parameters\n",
495
+ " yPred : predicted output\n",
496
+ " config : config object\n",
497
+ " dataSource : data source str if file path or 2D array\n",
498
+ " \"\"\"\n",
499
+ " #prDataFilePath = config.getStringConfig(\"predict.data.file\")[0]\n",
500
+ " padWidth = config.getIntConfig(\"predict.feat.pad.size\")[0]\n",
501
+ " i = 0\n",
502
+ " if type(dataSource) == str:\n",
503
+ " for rec in fileRecGen(dataSource, \",\"):\n",
504
+ " feat = (\",\".join(rec)).ljust(padWidth, \" \")\n",
505
+ " rec = feat + \"\\t\" + str(yPred[i])\n",
506
+ " print(rec)\n",
507
+ " i += 1\n",
508
+ " else:\n",
509
+ " for rec in dataSource:\n",
510
+ " srec = toStrList(rec, 6)\n",
511
+ " feat = (\",\".join(srec)).ljust(padWidth, \" \")\n",
512
+ " srec = feat + \"\\t\" + str(yPred[i])\n",
513
+ " print(srec)\n",
514
+ " i += 1\n",
515
+ "\n",
516
+ "\n",
517
+ " @staticmethod\n",
518
+ " def allTrain(model):\n",
519
+ " \"\"\"\n",
520
+ " train with all data\n",
521
+ "\n",
522
+ " Parameters\n",
523
+ " model : torch model\n",
524
+ " \"\"\"\n",
525
+ " # train mode\n",
526
+ " model.train()\n",
527
+ " for t in range(model.numIter):\n",
528
+ "\n",
529
+ "\n",
530
+ " # Forward pass: Compute predicted y by passing x to the model\n",
531
+ " yPred = model(model.featData)\n",
532
+ "\n",
533
+ " # Compute and print loss\n",
534
+ " loss = model.lossFn(yPred, model.outData)\n",
535
+ " if model.verbose and t % 50 == 0:\n",
536
+ " print(\"epoch {} loss {:.6f}\".format(t, loss.item()))\n",
537
+ "\n",
538
+ " # Zero gradients, perform a backward pass, and update the weights.\n",
539
+ " model.optimizer.zero_grad()\n",
540
+ " loss.backward()\n",
541
+ " model.optimizer.step() \t\n",
542
+ "\n",
543
+ " #validate\n",
544
+ " model.eval()\n",
545
+ " yPred = model(model.validFeatData)\n",
546
+ " yPred = yPred.data.cpu().numpy()\n",
547
+ " yActual = model.validOutData\n",
548
+ " if model.verbose:\n",
549
+ " result = np.concatenate((yPred, yActual), axis = 1)\n",
550
+ " print(\"predicted actual\")\n",
551
+ " print(result)\n",
552
+ "\n",
553
+ " score = perfMetric(model.accMetric, yActual, yPred)\n",
554
+ " print(formatFloat(3, score, \"perf score\"))\n",
555
+ " return score\n",
556
+ "\n",
557
+ " @staticmethod\n",
558
+ " def batchTrain(model):\n",
559
+ " \"\"\"\n",
560
+ " train with batch data\n",
561
+ "\n",
562
+ " Parameters\n",
563
+ " model : torch model\n",
564
+ " \"\"\"\n",
565
+ " model.restored = False\n",
566
+ " trainData = TensorDataset(model.featData, model.outData)\n",
567
+ " trainDataLoader = DataLoader(dataset=trainData, batch_size=model.batchSize, shuffle=True)\n",
568
+ " epochIntv = model.config.getIntConfig(\"train.epoch.intv\")[0]\n",
569
+ "\n",
570
+ " # train mode\n",
571
+ " model.train()\n",
572
+ "\n",
573
+ " if model.trackErr:\n",
574
+ " trErr = list()\n",
575
+ " vaErr = list()\n",
576
+ " #epoch\n",
577
+ " for t in range(model.numIter):\n",
578
+ " #batch\n",
579
+ " b = 0\n",
580
+ " epochLoss = 0.0\n",
581
+ " for xBatch, yBatch in trainDataLoader:\n",
582
+ "\n",
583
+ " # Forward pass: Compute predicted y by passing x to the model\n",
584
+ " xBatch, yBatch = xBatch.to(model.device), yBatch.to(model.device)\n",
585
+ " yPred = model(xBatch)\n",
586
+ "\n",
587
+ " # Compute and print loss\n",
588
+ " loss = model.lossFn(yPred, yBatch)\n",
589
+ " if model.verbose and t % epochIntv == 0 and b % model.batchIntv == 0:\n",
590
+ " print(\"epoch {} batch {} loss {:.6f}\".format(t, b, loss.item()))\n",
591
+ "\n",
592
+ " if model.trackErr and model.batchIntv == 0:\n",
593
+ " epochLoss += loss.item()\n",
594
+ "\n",
595
+ " #error tracking at batch level\n",
596
+ " if model.trackErr and model.batchIntv > 0 and b % model.batchIntv == 0:\n",
597
+ " trErr.append(loss.item())\n",
598
+ " vloss = FeedForwardNetwork.evaluateModel(model)\n",
599
+ " vaErr.append(vloss)\n",
600
+ "\n",
601
+ " # Zero gradients, perform a backward pass, and update the weights.\n",
602
+ " model.optimizer.zero_grad()\n",
603
+ " loss.backward()\n",
604
+ " model.optimizer.step() \t\n",
605
+ " b += 1\n",
606
+ "\n",
607
+ " #error tracking at epoch level\n",
608
+ " if model.trackErr and model.batchIntv == 0:\n",
609
+ " epochLoss /= len(trainDataLoader)\n",
610
+ " trErr.append(epochLoss)\n",
611
+ " vloss = FeedForwardNetwork.evaluateModel(model)\n",
612
+ " vaErr.append(vloss)\n",
613
+ "\n",
614
+ " #validate\n",
615
+ " model.eval()\n",
616
+ " yPred = model(model.validFeatData)\n",
617
+ " yPred = yPred.data.cpu().numpy()\n",
618
+ " yActual = model.validOutData\n",
619
+ " if model.verbose:\n",
620
+ " vsize = yPred.shape[0]\n",
621
+ " print(\"\\npredicted \\t\\t actual\")\n",
622
+ " for i in range(vsize):\n",
623
+ " print(str(yPred[i]) + \"\\t\" + str(yActual[i]))\n",
624
+ "\n",
625
+ " score = perfMetric(model.accMetric, yActual, yPred)\n",
626
+ " print(yActual)\n",
627
+ " print(yPred)\n",
628
+ " print(formatFloat(3, score, \"perf score\"))\n",
629
+ "\n",
630
+ " #save\n",
631
+ " modelSave = model.config.getBooleanConfig(\"train.model.save\")[0]\n",
632
+ " if modelSave:\n",
633
+ " FeedForwardNetwork.saveCheckpt(model)\n",
634
+ "\n",
635
+ " if model.trackErr:\n",
636
+ " FeedForwardNetwork.errorPlot(model, trErr, vaErr)\n",
637
+ "\n",
638
+ " if model.config.getBooleanConfig(\"train.print.weights\")[0]:\n",
639
+ " print(\"model weights\")\n",
640
+ " for param in model.parameters():\n",
641
+ " print(param.data)\n",
642
+ " return score\n",
643
+ "\n",
644
+ " @staticmethod\n",
645
+ " def errorPlot(model, trErr, vaErr):\n",
646
+ " \"\"\"\n",
647
+ " plot errors\n",
648
+ "\n",
649
+ " Parameters\n",
650
+ " trErr : training error list\t\n",
651
+ " vaErr : validation error list\t\n",
652
+ " \"\"\"\n",
653
+ " x = np.arange(len(trErr))\n",
654
+ " plt.plot(x,trErr,label = \"training error\")\n",
655
+ " plt.plot(x,vaErr,label = \"validation error\")\n",
656
+ " plt.xlabel(\"iteration\")\n",
657
+ " plt.ylabel(\"error\")\n",
658
+ " plt.legend([\"training error\", \"validation error\"], loc='upper left')\n",
659
+ " plt.show()\n",
660
+ "\n",
661
+ " @staticmethod\n",
662
+ " def modelPredict(model, dataSource = None):\n",
663
+ " \"\"\"\n",
664
+ " predict\n",
665
+ "\n",
666
+ " Parameters\n",
667
+ " model : torch model\n",
668
+ " dataSource : data source\n",
669
+ " \"\"\"\n",
670
+ " #train or restore model\n",
671
+ " useSavedModel = model.config.getBooleanConfig(\"predict.use.saved.model\")[0]\n",
672
+ " if useSavedModel:\n",
673
+ " FeedForwardNetwork.restoreCheckpt(model)\n",
674
+ " else:\n",
675
+ " FeedForwardNetwork.batchTrain(model) \n",
676
+ "\n",
677
+ " #predict\n",
678
+ " if dataSource is None:\n",
679
+ " dataSource = model.config.getStringConfig(\"predict.data.file\")[0]\n",
680
+ " featData = FeedForwardNetwork.prepData(model, dataSource, False)\n",
681
+ " #print(featData)\n",
682
+ " featData = torch.from_numpy(featData)\n",
683
+ " featData = featData.to(model.device)\n",
684
+ "\n",
685
+ " model.eval()\n",
686
+ " yPred = model(featData)\n",
687
+ " yPred = yPred.data.cpu().numpy()\n",
688
+ " #print(yPred)\n",
689
+ "\n",
690
+ " if model.outputSize >= 2:\n",
691
+ " #classification\n",
692
+ " yPred = FeedForwardNetwork.processClassifOutput(yPred, model.config)\n",
693
+ "\n",
694
+ " # print prediction\n",
695
+ " if model.config.getBooleanConfig(\"predict.print.output\")[0]:\n",
696
+ " FeedForwardNetwork.printPrediction(yPred, model.config, dataSource)\n",
697
+ "\n",
698
+ " return yPred\n",
699
+ "\n",
700
+ " def predict(self, dataSource = None):\n",
701
+ " \"\"\"\n",
702
+ " predict\n",
703
+ "\n",
704
+ " Parameters\n",
705
+ " dataSource : data source\n",
706
+ " \"\"\"\n",
707
+ " return FeedForwardNetwork.modelPredict(self, dataSource)\n",
708
+ "\n",
709
+ " @staticmethod\n",
710
+ " def evaluateModel(model):\n",
711
+ " \"\"\"\n",
712
+ " evaluate model\n",
713
+ "\n",
714
+ " Parameters\n",
715
+ " model : torch model\n",
716
+ " \"\"\"\n",
717
+ " model.eval()\n",
718
+ " with torch.no_grad():\n",
719
+ " yPred = model(model.validFeatData)\n",
720
+ " #yPred = yPred.data.cpu().numpy()\n",
721
+ " yActual = model.validOutData\n",
722
+ " score = model.lossFn(yPred, yActual).item()\n",
723
+ " model.train()\n",
724
+ " return score\n",
725
+ "\n",
726
+ " @staticmethod\n",
727
+ " def prepValidate(model, dataSource=None):\n",
728
+ " \"\"\"\n",
729
+ " prepare for validation\n",
730
+ "\n",
731
+ " Parameters\n",
732
+ " model : torch model\n",
733
+ " dataSource : data source\n",
734
+ " \"\"\"\n",
735
+ " #train or restore model\n",
736
+ " if not model.restored:\n",
737
+ " useSavedModel = model.config.getBooleanConfig(\"predict.use.saved.model\")[0]\n",
738
+ " if useSavedModel:\n",
739
+ " FeedForwardNetwork.restoreCheckpt(model)\n",
740
+ " else:\n",
741
+ " FeedForwardNetwork.batchTrain(model)\n",
742
+ " model.restored = True\n",
743
+ "\n",
744
+ " if \tdataSource is not None:\n",
745
+ " model.setValidationData(dataSource)\n",
746
+ "\n",
747
+ " @staticmethod\n",
748
+ " def validateModel(model, retPred=False):\n",
749
+ " \"\"\"\n",
750
+ " pmodel validation\n",
751
+ "\n",
752
+ " Parameters\n",
753
+ " model : torch model\n",
754
+ " retPred : if True return prediction\n",
755
+ " \"\"\"\n",
756
+ " model.eval()\n",
757
+ " yPred = model(model.validFeatData)\n",
758
+ " yPred = yPred.data.cpu().numpy()\n",
759
+ " model.yPred = yPred\n",
760
+ " yActual = model.validOutData\n",
761
+ " vsize = yPred.shape[0]\n",
762
+ " if model.verbose:\n",
763
+ " print(\"\\npredicted \\t actual\")\n",
764
+ " for i in range(vsize):\n",
765
+ " print(\"{:.3f}\\t\\t{:.3f}\".format(yPred[i][0], yActual[i][0]))\n",
766
+ "\n",
767
+ " score = perfMetric(model.accMetric, yActual, yPred)\n",
768
+ " print(formatFloat(3, score, \"perf score\"))\n",
769
+ "\n",
770
+ " if retPred:\n",
771
+ " y = list(map(lambda i : (yPred[i][0], yActual[i][0]), range(vsize)))\n",
772
+ " res = (y, score)\n",
773
+ " return res\n",
774
+ " else:\t\n",
775
+ " return score"
776
+ ]
777
+ }
778
+ ],
779
+ "metadata": {
780
+ "kernelspec": {
781
+ "display_name": "Python 3 (ipykernel)",
782
+ "language": "python",
783
+ "name": "python3"
784
+ },
785
+ "language_info": {
786
+ "codemirror_mode": {
787
+ "name": "ipython",
788
+ "version": 3
789
+ },
790
+ "file_extension": ".py",
791
+ "mimetype": "text/x-python",
792
+ "name": "python",
793
+ "nbconvert_exporter": "python",
794
+ "pygments_lexer": "ipython3",
795
+ "version": "3.9.12"
796
+ }
797
+ },
798
+ "nbformat": 4,
799
+ "nbformat_minor": 5
800
+ }
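FeedForwardNetwork is driven entirely by a properties file read through the Configuration class. Below is a minimal sketch with hypothetical file names; the train.layer.data format is units:activation:batch-norm:norm-after-activation:dropout, as parsed in buildModel, and rmse is assumed to be a metric name accepted by perfMetric:

    #ffn.properties (hypothetical); keys taken from the defaults above
    #common.model.directory=model
    #common.model.file=ffn.mod
    #train.data.file=train.csv
    #train.data.fields=0,1,2,3
    #train.data.feature.fields=0,1,2
    #train.data.out.fields=3
    #train.layer.data=16:relu:true:false:0.2,1:none:false:false:0
    #train.output.size=1
    #valid.data.file=valid.csv
    #valid.accuracy.metric=rmse

    model = FeedForwardNetwork("ffn.properties")
    model.buildModel()                            #builds layers, loads and scales data
    score = FeedForwardNetwork.batchTrain(model)  #mini-batch training plus validation
    yPred = model.predict("new.csv")              #restores or retrains, then predicts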
lib/.ipynb_checkpoints/txproc-checkpoint.ipynb ADDED
@@ -0,0 +1,1002 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "f720c141",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import sys\n",
12
+ "from random import randint\n",
13
+ "import random\n",
14
+ "import time\n",
15
+ "from datetime import datetime\n",
16
+ "import re, string, unicodedata\n",
17
+ "import nltk\n",
18
+ "import contractions\n",
19
+ "import inflect\n",
20
+ "from bs4 import BeautifulSoup\n",
21
+ "from nltk import word_tokenize, sent_tokenize\n",
22
+ "from nltk.corpus import stopwords\n",
23
+ "from nltk.stem.isri import ISRIStemmer\n",
24
+ "from nltk.stem.porter import PorterStemmer\n",
25
+ "from nltk.stem.snowball import SnowballStemmer\n",
26
+ "from nltk.stem import LancasterStemmer, WordNetLemmatizer\n",
27
+ "from nltk.tag import StanfordNERTagger\n",
28
+ "from nltk.tokenize import word_tokenize, sent_tokenize\n",
29
+ "import spacy\n",
30
+ "import torch\n",
31
+ "from collections import defaultdict\n",
32
+ "import pickle\n",
33
+ "import numpy as np\n",
34
+ "import re\n",
35
+ "\n",
36
+ "sys.path.append(os.path.abspath(\"../lib\"))\n",
37
+ "from util import *\n",
38
+ "from mlutil import *\n",
39
+ "\n",
40
+ "lcc = [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\", \"k\",\"l\",\"m\",\"n\",\"o\",\n",
41
+ "\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]\n",
42
+ "ucc = [\"A\",\"B\",\"C\",\"D\",\"E\",\"F\",\"G\",\"H\",\"I\",\"J\",\"K\",\"L\",\"M\", \"N\",\"O\",\"P\",\"Q\",\"R\",\"S\",\"T\",\"U\",\"V\",\"W\",\"X\",\"Y\",\"Z\"]\n",
43
+ "dig = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\"]\n",
44
+ "spc = [\"@\",\"#\",\"$\",\"%\",\"^\",\"&\",\"*\",\"(\",\")\",\"_\",\"+\",\"{\",\"}\",\"[\",\"]\",\"|\",\":\",\"<\",\">\",\"?\",\";\",\",\",\".\"]\n",
45
+ "\n",
46
+ "\n",
47
+ "class TextPreProcessor:\n",
48
+ " \"\"\"\n",
49
+ " text preprocessor\n",
50
+ " \"\"\"\n",
51
+ " def __init__(self, stemmer = \"lancaster\", verbose=False):\n",
52
+ " self.verbose = verbose\n",
53
+ " self.lemmatizer = WordNetLemmatizer()\n",
54
+ "\n",
55
+ " def stripHtml(self, text):\n",
56
+ " soup = BeautifulSoup(text, \"html.parser\")\n",
57
+ " return soup.get_text()\n",
58
+ "\n",
59
+ " def removeBetweenSquareBrackets(self, text):\n",
60
+ " return re.sub('\\[[^]]*\\]', '', text)\n",
61
+ "\n",
62
+ " def denoiseText(self, text):\n",
63
+ " text = stripHtml(text)\n",
64
+ " text = removeBetweenSquareBrackets(text)\n",
65
+ " return text\n",
66
+ "\n",
67
+ " def replaceContractions(self, text):\n",
68
+ " \"\"\"Replace contractions in string of text\"\"\"\n",
69
+ " return contractions.fix(text)\n",
70
+ "\n",
71
+ " def tokenize(self, text):\n",
72
+ " words = nltk.word_tokenize(text)\n",
73
+ " return words\n",
74
+ "\n",
75
+ " def removeNonAscii(self, words):\n",
76
+ " \"\"\"Remove non-ASCII characters from list of tokenized words\"\"\"\n",
77
+ " newWords = []\n",
78
+ " for word in words:\n",
79
+ " if isinstance(word, unicode):\n",
80
+ " newWord = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore')\n",
81
+ " else:\n",
82
+ " newWord = word\n",
83
+ " newWords.append(newWord)\n",
84
+ " return newWords\n",
85
+ "\n",
86
+ " def replaceNonAsciiFromText(self, text):\n",
87
+ " \"\"\" replaces non ascii with blank \"\"\"\n",
88
+ " return ''.join([i if ord(i) < 128 else ' ' for i in text])\n",
89
+ "\n",
90
+ " def removeNonAsciiFromText(self, text):\n",
91
+ " \"\"\" replaces non ascii with blank \"\"\"\n",
92
+ " return ''.join([i if ord(i) < 128 else '' for i in text])\n",
93
+ "\n",
94
+ " def allow(self, words):\n",
95
+ " \"\"\" allow only specific charaters \"\"\"\n",
96
+ " allowed = [word for word in words if re.match('^[A-Za-z0-9\\.\\,\\:\\;\\!\\?\\(\\)\\'\\-\\$\\@\\%\\\"]+$', word) is not None]\t\t\n",
97
+ " return allowed\t\t\n",
98
+ "\n",
99
+ " def toLowercase(self, words):\n",
100
+ " \"\"\"Convert all characters to lowercase from list of tokenized words\"\"\"\n",
101
+ " newWords = [word.lower() for word in words]\n",
102
+ " return newWords\n",
103
+ "\n",
104
+ " def removePunctuation(self, words):\n",
105
+ " \"\"\"Remove punctuation from list of tokenized words\"\"\"\n",
106
+ " newWords = []\n",
107
+ " for word in words:\n",
108
+ " newWord = re.sub(r'[^\\w\\s]', '', word)\n",
109
+ " if newWord != '':\n",
110
+ " newWords.append(newWord)\n",
111
+ " return newWords\n",
112
+ "\n",
113
+ " def replaceNumbers(self, words):\n",
114
+ " \"\"\"Replace all interger occurrences in list of tokenized words with textual representation\"\"\"\n",
115
+ " p = inflect.engine()\n",
116
+ " newWords = []\n",
117
+ " for word in words:\n",
118
+ " if word.isdigit():\n",
119
+ " newWord = p.number_to_words(word)\n",
120
+ " newWords.append(newWord)\n",
121
+ " else:\n",
122
+ " newWords.append(word)\n",
123
+ " return newWords\n",
124
+ "\n",
125
+ " def removeStopwords(self, words):\n",
126
+ " \"\"\"Remove stop words from list of tokenized words\"\"\"\n",
127
+ " newWords = []\n",
128
+ " for word in words:\n",
129
+ " if word not in stopwords.words('english'):\n",
130
+ " newWords.append(word)\n",
131
+ " return newWords\n",
132
+ "\n",
133
+ " def removeCustomStopwords(self, words, stopWords):\n",
134
+ " \"\"\"Remove stop words from list of tokenized words\"\"\"\n",
135
+ " removed = [word for word in words if word not in stopWords]\t\t\n",
136
+ " return removed\n",
137
+ "\n",
138
+ " def removeLowFreqWords(self, words, minFreq):\n",
139
+ " \"\"\"Remove low frewquncy words from list of tokenized words\"\"\"\n",
140
+ " frequency = defaultdict(int)\n",
141
+ " for word in words:\n",
142
+ " frequency[word] += 1\n",
143
+ " removed = [word for word in words if frequency[word] > minFreq]\t\t\n",
144
+ " return removed\t\n",
145
+ "\n",
146
+ " def removeNumbers(self, words):\n",
147
+ " \"\"\"Remove numbers\"\"\"\n",
148
+ " removed = [word for word in words if not isNumber(word)]\t\t\n",
149
+ " return removed\t\t\n",
150
+ "\n",
151
+ " def removeShortWords(self, words, minLengh):\n",
152
+ " \"\"\"Remove short words \"\"\"\n",
153
+ " removed = [word for word in words if len(word) >= minLengh]\t\t\n",
154
+ " return removed\t\t\n",
155
+ "\n",
156
+ " def keepAllowedWords(self, words, keepWords):\n",
157
+ " \"\"\"Keep words from the list only\"\"\"\n",
158
+ " kept = [word for word in words if word in keepWords]\t\t\n",
159
+ " return kept\n",
160
+ "\n",
161
+ " def stemWords(self, words):\n",
162
+ " \"\"\"Stem words in list of tokenized words\"\"\"\n",
163
+ " if stemmer == \"lancaster\":\n",
164
+ " stemmer = LancasterStemmer()\n",
165
+ " elif stemmer == \"snowbal\":\n",
166
+ " stemmer = SnowballStemmer()\n",
167
+ " elif stemmer == \"porter\":\n",
168
+ " stemmer = PorterStemmer()\n",
169
+ " stems = [stemmer.stem(word) for word in words]\n",
170
+ " return stems\n",
171
+ "\n",
172
+ " def lemmatizeWords(self, words):\n",
173
+ " \"\"\"Lemmatize tokens in list of tokenized words\"\"\"\n",
174
+ " lemmas = [self.lemmatizer.lemmatize(word) for word in words]\n",
175
+ " return lemmas\n",
176
+ "\n",
177
+ " def lemmatizeVerbs(self, words):\n",
178
+ " \"\"\"Lemmatize verbs in list of tokenized words\"\"\"\n",
179
+ " lemmas = [self.lemmatizer.lemmatize(word, pos='v') for word in words]\n",
180
+ " return lemmas\n",
181
+ "\n",
182
+ " def normalize(self, words):\n",
183
+ " words = self.removeNonAscii(words)\n",
184
+ " words = self.toLowercase(words)\n",
185
+ " words = self.removePunctuation(words)\n",
186
+ " words = self.replaceNumbers(words)\n",
187
+ " words = self.removeStopwords(words)\n",
188
+ " return words\n",
189
+ "\n",
190
+ " def posTag(self, textTokens):\n",
191
+ " tags = nltk.pos_tag(textTokens)\n",
192
+ " return tags\n",
193
+ "\n",
194
+ " def extractEntity(self, textTokens, classifierPath, jarPath):\n",
195
+ " st = StanfordNERTagger(classifierPath, jarPath) \n",
196
+ " entities = st.tag(textTokens)\n",
197
+ " return entities\n",
198
+ "\n",
199
+ " def documentFeatures(self, document, wordFeatures):\n",
200
+ " documentWords = set(document)\n",
201
+ " features = {}\n",
202
+ " for word in wordFeatures:\n",
203
+ " features[word] = (word in documentWords)\n",
204
+ " return features\n",
205
+ "\n",
206
+ "class NGram:\n",
207
+ " \"\"\"\n",
208
+ " word ngram\n",
209
+ " \"\"\"\n",
210
+ " def __init__(self, vocFilt, verbose=False):\n",
211
+ " \"\"\"\n",
212
+ " initialize\n",
213
+ " \"\"\"\n",
214
+ " self.vocFilt = vocFilt\n",
215
+ " self.nGramCounter = dict()\n",
216
+ " self.nGramFreq = dict()\n",
217
+ " self.corpSize = 0\n",
218
+ " self.vocabulary = set()\n",
219
+ " self.freqDone = False\n",
220
+ " self.verbose = verbose\n",
221
+ " self.vecWords = None\n",
222
+ " self.nonZeroCount = 0\n",
223
+ "\n",
224
+ " def countDocNGrams(self, words):\n",
225
+ " \"\"\"\n",
226
+ " count words in a doc\n",
227
+ " \"\"\"\n",
228
+ " if self.verbose:\n",
229
+ " print (\"doc size \" + str(len(words)))\n",
230
+ " nGrams = self.toNGram(words)\n",
231
+ " for nGram in nGrams:\n",
232
+ " count = self.nGramCounter.get(nGram, 0)\n",
233
+ " self.nGramCounter[nGram] = count + 1\n",
234
+ " self.corpSize += 1\n",
235
+ " self.vocabulary.update(words)\t\n",
236
+ "\n",
237
+ " def remLowCount(self, minCount):\n",
238
+ " \"\"\"\n",
239
+ " removes items with count below threshold\n",
240
+ " \"\"\"\n",
241
+ " self.nGramCounter = dict(filter(lambda item: item[1] >= minCount, self.nGramCounter.items()))\n",
242
+ "\n",
243
+ " def getVocabSize(self):\n",
244
+ " \"\"\"\n",
245
+ " get vocabulary size\n",
246
+ " \"\"\"\n",
247
+ " return len(self.nGramCounter)\n",
248
+ "\n",
249
+ " def getNGramFreq(self):\n",
250
+ " \"\"\"\n",
251
+ " get normalized count\n",
252
+ " \"\"\"\n",
253
+ " if self.verbose:\n",
254
+ " print (\"counter size \" + str(len(self.nGramCounter)))\n",
255
+ " if not self.freqDone:\n",
256
+ " for item in self.nGramCounter.items():\n",
257
+ " self.nGramFreq[item[0]] = float(item[1]) / self.corpSize\t\t\t\t\t\n",
258
+ " self.freqDone = True\n",
259
+ " return self.nGramFreq\n",
260
+ "\n",
261
+ " def getNGramIndex(self, show):\n",
262
+ " \"\"\"\n",
263
+ " convert to list\n",
264
+ " \"\"\"\n",
265
+ " if self.vecWords is None:\n",
266
+ " self.vecWords = list(self.nGramCounter)\n",
267
+ " if show:\n",
268
+ " for vw in enumerate(self.vecWords):\n",
269
+ " print(vw)\n",
270
+ "\n",
271
+ " def getVector(self, words, byCount, normalized):\n",
272
+ " \"\"\"\n",
273
+ " convert to vector\n",
274
+ " \"\"\"\n",
275
+ " if self.vecWords is None:\n",
276
+ " self.vecWords = list(self.nGramCounter)\n",
277
+ "\n",
278
+ " nGrams = self.toNGram(words)\n",
279
+ " if self.verbose:\n",
280
+ " print(\"vocabulary size {}\".format(len(self.vecWords)))\n",
281
+ " print(\"ngrams\")\n",
282
+ " print(nGrams)\n",
283
+ " self.nonZeroCount = 0\n",
284
+ " vec = list(map(lambda vw: self.getVecElem(vw, nGrams, byCount, normalized), self.vecWords))\n",
285
+ " return vec\n",
286
+ "\n",
287
+ " def getVecElem(self, vw, nGrams, byCount, normalized):\n",
288
+ " \"\"\"\n",
289
+ " get vector element\n",
290
+ " \"\"\"\n",
291
+ " if vw in nGrams:\n",
292
+ " if byCount:\n",
293
+ " if normalized:\n",
294
+ " el = self.nGramFreq[vw]\n",
295
+ " else:\n",
296
+ " el = self.nGramCounter[vw]\n",
297
+ " else:\n",
298
+ " el = 1\n",
299
+ " self.nonZeroCount += 1\n",
300
+ " else:\n",
301
+ " if (byCount and normalized):\n",
302
+ " el = 0.0\n",
303
+ " else:\n",
304
+ " el = 0\n",
305
+ " return el\n",
306
+ "\n",
307
+ " def getNonZeroCount(self):\n",
308
+ " \"\"\"\n",
309
+ " get non zero vector element count\n",
310
+ " \"\"\"\n",
311
+ " return self.nonZeroCount\n",
312
+ "\n",
313
+ " def toBiGram(self, words):\n",
314
+ " \"\"\"\n",
315
+ " convert to bigram\n",
316
+ " \"\"\"\n",
317
+ " if self.verbose:\n",
318
+ " print (\"doc size \" + str(len(words)))\n",
319
+ " biGrams = list()\n",
320
+ " for i in range(len(words)-1):\n",
321
+ " w1 = words[i]\n",
322
+ " w2 = words[i+1]\n",
323
+ " if self.vocFilt is None or (w1 in self.vocFilt and w2 in self.vocFilt):\n",
324
+ " nGram = (w1, w2)\n",
325
+ " biGrams.append(nGram)\n",
326
+ " return biGrams\n",
327
+ "\n",
328
+ " def toTriGram(self, words):\n",
329
+ " \"\"\"\n",
330
+ " convert to trigram\n",
331
+ " \"\"\"\n",
332
+ " if self.verbose:\n",
333
+ " print (\"doc size \" + str(len(words)))\n",
334
+ " triGrams = list()\n",
335
+ " for i in range(len(words)-2):\n",
336
+ " w1 = words[i]\n",
337
+ " w2 = words[i+1]\n",
338
+ " w3 = words[i+2]\n",
339
+ " if self.vocFilt is None or (w1 in self.vocFilt and w2 in self.vocFilt and w3 in self.vocFilt):\n",
340
+ " nGram = (w1, w2, w3)\n",
341
+ " triGrams.append(nGram)\n",
342
+ " return triGrams\n",
343
+ "\n",
344
+ " def save(self, saveFile):\n",
345
+ " \"\"\"\n",
346
+ " save \n",
347
+ " \"\"\"\n",
348
+ " sf = open(saveFile, \"wb\")\n",
349
+ " pickle.dump(self, sf)\n",
350
+ " sf.close()\n",
351
+ "\n",
352
+ " @staticmethod\n",
353
+ " def load(saveFile):\n",
354
+ " \"\"\"\n",
355
+ " load\n",
356
+ " \"\"\"\n",
357
+ " sf = open(saveFile, \"rb\")\n",
358
+ " nGrams = pickle.load(sf)\n",
359
+ " sf.close()\n",
360
+ " return nGrams\n",
361
+ "\n",
362
+ "class CharNGram:\n",
363
+ " \"\"\"\n",
364
+ " character n gram\n",
365
+ " \"\"\"\n",
366
+ " def __init__(self, domains, ngsize, verbose=False):\n",
367
+ " \"\"\"\n",
368
+ " initialize\n",
369
+ " \"\"\"\n",
370
+ " self.chDomain = list()\n",
371
+ " self.ws = \"#\"\n",
372
+ " self.chDomain.append(self.ws)\n",
373
+ " for d in domains:\n",
374
+ " if d == \"lcc\":\n",
375
+ " self.chDomain.extend(lcc)\n",
376
+ " elif d == \"ucc\":\n",
377
+ " self.chDomain.extend(ucc)\n",
378
+ " elif d == \"dig\":\n",
379
+ " self.chDomain.extend(dig)\n",
380
+ " elif d == \"spc\":\n",
381
+ " self.chDomain.extend(spc)\n",
382
+ " else:\n",
383
+ " raise ValueError(\"invalid character type \" + d)\n",
384
+ "\n",
385
+ " self.ngsize = ngsize\n",
386
+ " self.radixPow = None\n",
387
+ " self.cntVecSize = None\n",
388
+ "\n",
389
+ " def addSpChar(self, spChar):\n",
390
+ " \"\"\"\n",
391
+ " add special characters\n",
392
+ " \"\"\"\n",
393
+ " self.chDomain.extend(spChar)\n",
394
+ "\n",
395
+ " def setWsRepl(self, ws):\n",
396
+ " \"\"\"\n",
397
+ " set white space replacement charater\n",
398
+ " \"\"\"\n",
399
+ " self.ws = ws\n",
400
+ " self.chDomain[0] = self.ws\n",
401
+ "\n",
402
+ " def finalize(self):\n",
403
+ " \"\"\"\n",
404
+ " final setup\n",
405
+ " \"\"\"\t\t\n",
406
+ " domSize = len(self.chDomain)\n",
407
+ " self.cntVecSize = int(math.pow(domSize, self.ngsize))\n",
408
+ " if self.radixPow is None:\n",
409
+ " self.radixPow = list()\n",
410
+ " for i in range(self.ngsize-1, 0, -1):\n",
411
+ " self.radixPow.append(int(math.pow(domSize, i)))\n",
412
+ " self.radixPow.append(1)\n",
413
+ "\n",
414
+ "\n",
415
+ " def toMgramCount(self, text):\n",
416
+ " \"\"\"\n",
417
+ " get ngram count list\n",
418
+ " \"\"\"\n",
419
+ " #print(text)\n",
420
+ " ngCounts = [0] * self.cntVecSize\n",
421
+ "\n",
422
+ " ngram = list()\n",
423
+ " totNgCount = 0\n",
424
+ " for ch in text:\n",
425
+ " if ch.isspace():\n",
426
+ " l = len(ngram)\n",
427
+ " if l == 0 or ngram[l-1] != self.ws:\n",
428
+ " ngram.append(self.ws)\n",
429
+ " else:\n",
430
+ " ngram.append(ch)\n",
431
+ "\n",
432
+ " if len(ngram) == self.ngsize:\n",
433
+ " i = self.__getNgramIndex(ngram)\n",
434
+ " assert i < self.cntVecSize, \"ngram index out of range index \" + str(i) + \" size \" + str(self.cntVecSize) \n",
435
+ " ngCounts[i] += 1\n",
436
+ " ngram.clear()\n",
437
+ " totNgCount += 1\n",
438
+ "\n",
439
+ " return ngCounts\n",
440
+ "\n",
441
+ " def __getNgramIndex(self, ngram):\n",
442
+ " \"\"\"\n",
443
+ " get index of an ngram into a list of size equal total number of possible ngrams\n",
444
+ " \"\"\"\n",
445
+ " assert len(ngram) == len(self.radixPow), \"ngram size mismatch\"\t\t\n",
446
+ " ngi = 0\n",
447
+ " for ch, rp in zip(ngram, self.radixPow):\n",
448
+ " i = self.chDomain.index(ch)\n",
449
+ " ngi += i * rp\n",
450
+ "\n",
451
+ " return ngi\n",
452
+ "\n",
453
+ "\n",
454
+ "class TfIdf:\n",
455
+ " \"\"\"\n",
456
+ " TF IDF\t\n",
457
+ " \"\"\"\n",
458
+ " def __init__(self, vocFilt, doIdf, verbose=False):\n",
459
+ " \"\"\"\n",
460
+ " initialize\n",
461
+ " \"\"\"\n",
462
+ " self.vocFilt = vocFilt\n",
463
+ " self.doIdf = doIdf\n",
464
+ " self.wordCounter = {}\n",
465
+ " self.wordFreq = {}\n",
466
+ " self.wordInDocCount = {}\n",
467
+ " self.docCount = 0\n",
468
+ " self.corpSize = 0\n",
469
+ " self.freqDone = False\n",
470
+ " self.vocabulary = set()\n",
471
+ " self.wordIndex = None\n",
472
+ " self.verbose = verbose\n",
473
+ " self.vecWords = None\n",
474
+ "\n",
475
+ " def countDocWords(self, words):\n",
476
+ " \"\"\"\n",
477
+ " count words in a doc\n",
478
+ " \"\"\"\n",
479
+ " if self.verbose:\n",
480
+ " print (\"doc size \" + str(len(words)))\n",
481
+ " for word in words:\n",
482
+ " if self.vocFilt is None or word in self.vocFilt:\n",
483
+ " count = self.wordCounter.get(word, 0)\n",
484
+ " self.wordCounter[word] = count + 1\n",
485
+ " self.corpSize += len(words)\n",
486
+ " self.vocabulary.update(words)\n",
487
+ "\n",
488
+ " if (self.doIdf):\n",
489
+ " self.docCount += 1\n",
490
+ " for word in set(words):\n",
491
+ " self.wordInDocCount.get(word, 0)\n",
492
+ " self.wordInDocCount[word] = count + 1\n",
493
+ " self.freqDone = False\n",
494
+ "\n",
495
+ "\n",
496
+ " def getWordFreq(self):\n",
497
+ " \"\"\"\n",
498
+ " get tfidf for corpus\n",
499
+ " \"\"\"\n",
500
+ " if self.verbose:\n",
501
+ " print (\"counter size \" + str(len(self.wordCounter)))\n",
502
+ " if not self.freqDone:\n",
503
+ " for item in self.wordCounter.items():\n",
504
+ " self.wordFreq[item[0]] = float(item[1]) / self.corpSize\t\t\t\t\t\n",
505
+ " if self.doIdf:\n",
506
+ " for k in self.wordFreq.keys():\n",
507
+ " self.wordFreq.items[k] *= math.log(self.docCount / self.wordInDocCount.items[k])\t\n",
508
+ " self.freqDone = True\n",
509
+ " return self.wordFreq\n",
510
+ "\n",
511
+ " def getCount(self, word):\n",
512
+ " \"\"\"\n",
513
+ " get counter\n",
514
+ " \"\"\"\n",
515
+ " if word in self.wordCounter:\n",
516
+ " count = self.wordCounter[word]\n",
517
+ " else:\n",
518
+ " raise ValueError(\"word not found in count table \" + word)\n",
519
+ " return count\n",
520
+ "\n",
521
+ " def getFreq(self, word):\n",
522
+ " \"\"\"\n",
523
+ " get normalized frequency\n",
524
+ " \"\"\"\n",
525
+ " if word in self.wordFreq:\n",
526
+ " freq = self.wordFreq[word]\n",
527
+ " else:\n",
528
+ " raise ValueError(\"word not found in count table \" + word)\n",
529
+ " return freq\n",
530
+ "\n",
531
+ " def resetCounter(self):\n",
532
+ " \"\"\"\n",
533
+ " reset counter\n",
534
+ " \"\"\"\n",
535
+ " self.wordCounter = {}\n",
536
+ "\n",
537
+ " def buildVocabulary(self, words):\n",
538
+ " \"\"\"\n",
539
+ " build vocbulary\n",
540
+ " \"\"\"\n",
541
+ " self.vocabulary.update(words)\n",
542
+ "\n",
543
+ " def getVocabulary(self):\n",
544
+ " \"\"\"\n",
545
+ " return vocabulary\n",
546
+ " \"\"\"\n",
547
+ " return self.vocabulary\n",
548
+ "\n",
549
+ " def creatWordIndex(self):\n",
550
+ " \"\"\"\n",
551
+ " index for all words in vcabulary\n",
552
+ " \"\"\"\n",
553
+ " self.wordIndex = {word : idx for idx, word in enumerate(list(self.vocabulary))}\n",
554
+ "\n",
555
+ " def getVector(self, words, byCount, normalized):\n",
556
+ " \"\"\"\n",
557
+ " get vector\n",
558
+ " \"\"\"\n",
559
+ " if self.vecWords is None:\n",
560
+ " self.vecWords = list(self.wordCounter)\n",
561
+ " vec = list(map(lambda vw: self.getVecElem(vw, words, byCount, normalized), self.vecWords))\n",
562
+ " return vec\n",
563
+ "\n",
564
+ " def getVecElem(self, vw, words, byCount, normalized):\n",
565
+ " \"\"\"\n",
566
+ " vector element\n",
567
+ " \"\"\"\n",
568
+ " el = 0\n",
569
+ " if vw in words:\n",
570
+ " if byCount:\n",
571
+ " if normalized:\n",
572
+ " el = self.wordFreq[vw]\n",
573
+ " else:\n",
574
+ " el = self.wordCounter[vw]\n",
575
+ " else:\n",
576
+ " el = 1\n",
577
+ " return el\n",
578
+ "\n",
579
+ " def save(self, saveFile):\n",
580
+ " \"\"\"\n",
581
+ " save\n",
582
+ " \"\"\"\n",
583
+ " sf = open(saveFile, \"wb\")\n",
584
+ " pickle.dump(self, sf)\n",
585
+ " sf.close()\n",
586
+ "\n",
587
+ " # load \n",
588
+ " @staticmethod\n",
589
+ " def load(saveFile):\n",
590
+ " \"\"\"\n",
591
+ " load\n",
592
+ " \"\"\"\n",
593
+ " sf = open(saveFile, \"rb\")\n",
594
+ " tfidf = pickle.load(sf)\n",
595
+ " sf.close()\n",
596
+ " return tfidf\n",
597
+ "\n",
598
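+ "# Usage sketch (illustrative, not part of the original module): `docs` is a\n",
+ "# hypothetical list of tokenized documents; term frequencies are accumulated\n",
+ "# per document and then normalized over the corpus.\n",
+ "#   tfidf = TfIdf(None, False)\n",
+ "#   for words in docs:\n",
+ "#       tfidf.countDocWords(words)\n",
+ "#   freq = tfidf.getWordFreq()\n",
+ "\n",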
+ "# bigram\n",
599
+ "class BiGram(NGram):\n",
600
+ " def __init__(self, vocFilt, verbose=False):\n",
601
+ " \"\"\"\n",
602
+ " initialize\n",
603
+ " \"\"\"\n",
604
+ " super(BiGram, self).__init__(vocFilt, verbose)\n",
605
+ "\n",
606
+ " def toNGram(self, words):\n",
607
+ " \"\"\"\n",
608
+ " convert to Ngrams\n",
609
+ " \"\"\"\n",
610
+ " return self.toBiGram(words)\n",
611
+ "\n",
612
+ "# trigram\n",
613
+ "class TriGram(NGram):\n",
614
+ " def __init__(self, vocFilt, verbose=False):\n",
615
+ " \"\"\"\n",
616
+ " initialize\n",
617
+ " \"\"\"\n",
618
+ " super(TriGram, self).__init__(vocFilt, verbose)\n",
619
+ "\n",
620
+ " def toNGram(self, words):\n",
621
+ " \"\"\"\n",
622
+ " convert to Ngrams\n",
623
+ " \"\"\"\n",
624
+ " return self.toTriGram(words)\n",
625
+ "\n",
626
+ "\n",
627
+ "\n",
628
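+ "# Usage sketch (illustrative, not part of the original module): assuming the\n",
+ "# NGram base class defined earlier in this module, trigrams of a hypothetical\n",
+ "# token list `words` can be extracted as:\n",
+ "#   tg = TriGram(None)\n",
+ "#   trigrams = tg.toNGram(words)\n",
+ "\n",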
+ "class DocSentences:\n",
629
+ " \"\"\"\n",
630
+ " sentence processor\n",
631
+ " \"\"\"\n",
632
+ " def __init__(self, filePath, minLength, verbose, text=None):\n",
633
+ " \"\"\"\n",
634
+ " initialize\n",
635
+ " \"\"\"\n",
636
+ " if filePath:\n",
637
+ " self.filePath = filePath\n",
638
+ " with open(filePath, 'r') as contentFile:\n",
639
+ " content = contentFile.read()\n",
640
+ " elif text:\n",
641
+ " content = text\n",
642
+ " else:\n",
643
+ " raise valueError(\"either file path or text must be provided\")\n",
644
+ "\n",
645
+ " #self.sentences = content.split('.')\n",
646
+ " self.verbose = verbose\n",
647
+ " tp = TextPreProcessor()\n",
648
+ " content = tp.removeNonAsciiFromText(content)\n",
649
+ " sentences = sent_tokenize(content)\n",
650
+ " self.sentences = list(filter(lambda s: len(nltk.word_tokenize(s)) >= minLength, sentences))\n",
651
+ " if self.verbose:\n",
652
+ " print (\"num of senteces after length filter \" + str(len(self.sentences)))\n",
653
+ " self.sentencesAsTokens = [clean(s, tp, verbose) for s in self.sentences]\t\n",
654
+ "\n",
655
+ " # get sentence tokens\n",
656
+ " def getSentencesAsTokens(self):\n",
657
+ " return self.sentencesAsTokens\n",
658
+ "\n",
659
+ " # get sentences\n",
660
+ " def getSentences(self):\n",
661
+ " return self.sentences\n",
662
+ "\n",
663
+ " # build term freq table\n",
664
+ " def getTermFreqTable(self):\n",
665
+ " # term count table for all words\n",
666
+ " termTable = TfIdf(None, False)\n",
667
+ " sentWords = self.getSentencesAsTokens()\n",
668
+ " for seWords in sentWords:\n",
669
+ " termTable.countDocWords(seWords)\n",
670
+ " return termTable\n",
671
+ "\n",
672
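+ "# Usage sketch (illustrative, not part of the original module): sentence\n",
+ "# tokens and a term frequency table for a text file; the path and minimum\n",
+ "# sentence length are hypothetical.\n",
+ "#   ds = DocSentences(\"doc.txt\", 5, False)\n",
+ "#   sentTokens = ds.getSentencesAsTokens()\n",
+ "#   termTable = ds.getTermFreqTable()\n",
+ "\n",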
+ "# sentence processor\n",
673
+ "class WordVectorContainer:\n",
674
+ " def __init__(self, dirPath, verbose):\n",
675
+ " \"\"\"\n",
676
+ " initialize\n",
677
+ " \"\"\"\n",
678
+ " self.docs = list()\n",
679
+ " self.wordVectors = list()\n",
680
+ " self.tp = TextPreProcessor()\n",
681
+ " self.similarityAlgo = \"cosine\"\n",
682
+ " self.simAlgoNormalizer = None\n",
683
+ " self.termTable = None\n",
684
+ "\n",
685
+ "\n",
686
+ " def addDir(self, dirPath):\n",
687
+ " \"\"\"\n",
688
+ " add content of all files ina directory\n",
689
+ " \"\"\"\n",
690
+ " docs, filePaths = getFileContent(dirPath, verbose)\n",
691
+ " self.docs.extend(docs)\n",
692
+ " self.wordVectors.extend([clean(doc, self.tp, verbose) for doc in docs])\n",
693
+ "\n",
694
+ " def addFile(self, filePath):\n",
695
+ " \"\"\"\n",
696
+ " add file content\n",
697
+ " \"\"\"\n",
698
+ " with open(filePath, 'r') as contentFile:\n",
699
+ " content = contentFile.read()\n",
700
+ " self.wordVectors.append(clean(content, self.tp, verbose))\n",
701
+ "\n",
702
+ " def addText(self, text):\n",
703
+ " \"\"\"\n",
704
+ " add text\n",
705
+ " \"\"\"\n",
706
+ " self.wordVectors.append(clean(text, self.tp, verbose))\n",
707
+ "\n",
708
+ " def addWords(self, words):\n",
709
+ " \"\"\"\n",
710
+ " add words\n",
711
+ " \"\"\"\n",
712
+ " self.wordVectors.append(words)\n",
713
+ "\n",
714
+ " def withSimilarityAlgo(self, algo, normalizer=None):\n",
715
+ " \"\"\"\n",
716
+ " set similarity algo\n",
717
+ " \"\"\"\n",
718
+ " self.similarityAlgo = algo\n",
719
+ " self.simAlgoNormalizer = normalizer\n",
720
+ "\n",
721
+ " def getDocsWords(self):\n",
722
+ " \"\"\"\n",
723
+ " get word vectors\n",
724
+ " \"\"\"\n",
725
+ " return self.wordVectors\n",
726
+ "\n",
727
+ " def getDocs(self):\n",
728
+ " \"\"\"\n",
729
+ " get docs\n",
730
+ " \"\"\"\n",
731
+ " return self.docs\n",
732
+ "\n",
733
+ " def getTermFreqTable(self):\n",
734
+ " \"\"\"\n",
735
+ " term count table for all words\n",
736
+ " \"\"\"\n",
737
+ " self.termTable = TfIdf(None, False)\n",
738
+ " for words in self.wordVectors:\n",
739
+ " self.termTable.countDocWords(words)\n",
740
+ " self.termTable.getWordFreq()\n",
741
+ " return self.termTable\n",
742
+ "\n",
743
+ " def getPairWiseSimilarity(self, byCount, normalized):\n",
744
+ " \"\"\"\n",
745
+ " pair wise similarity\n",
746
+ " \"\"\"\n",
747
+ " self.getNumWordVectors()\n",
748
+ "\n",
749
+ " size = len(self.wordVectors)\n",
750
+ " simArray = np.empty(shape=(size,size))\n",
751
+ " for i in range(size):\n",
752
+ " simArray[i][i] = 1.0\n",
753
+ "\n",
754
+ " for i in range(size):\n",
755
+ " for j in range(i+1, size):\n",
756
+ " if self.similarityAlgo == \"cosine\":\n",
757
+ " sim = cosineSimilarity(self.numWordVectors[i], self.numWordVectors[j])\n",
758
+ " elif self.similarityAlgo == \"jaccard\":\n",
759
+ " sim = jaccardSimilarity(self.wordVectors[i], self.wordVectors[j],\\\n",
760
+ " self.simAlgoNormalizer[0], self.simAlgoNormalizer[1])\n",
761
+ " else:\n",
762
+ " raise ValueError(\"invalid similarity algorithms\")\n",
763
+ " simArray[i][j] = sim\n",
764
+ " simArray[j][i] = sim\n",
765
+ " return simArray\n",
766
+ "\n",
767
+ " def getInterSetSimilarity(self, byCount, normalized, split):\n",
768
+ " \"\"\"\n",
769
+ " inter set pair wise similarity\n",
770
+ " \"\"\"\n",
771
+ " self.getNumWordVectors()\n",
772
+ " size = len(self.wordVectors)\n",
773
+ " if not self.similarityAlgo == \"jaccard\":\n",
774
+ " firstNumVec = self.numWordVectors[:split]\n",
775
+ " secNumVec = self.numWordVectors[split:]\n",
776
+ " fiSize = len(firstNumVec)\n",
777
+ " seSize = len(secNumVec)\n",
778
+ " else:\n",
779
+ " firstVec = self.wordVectors[:split]\n",
780
+ " secVec = self.wordVectors[split:]\n",
781
+ " fiSize = len(firstVec)\n",
782
+ " seSize = len(secVec)\n",
783
+ "\n",
784
+ " simArray = np.empty(shape=(fiSize,seSize))\n",
785
+ " for i in range(fiSize):\n",
786
+ " for j in range(seSize):\n",
787
+ " if self.similarityAlgo == \"cosine\":\n",
788
+ " sim = cosineSimilarity(firstNumVec[i], secNumVec[j])\n",
789
+ " elif self.similarityAlgo == \"jaccard\":\n",
790
+ " sim = jaccardSimilarity(firstVec[i], secVec[j],\\\n",
791
+ " self.simAlgoNormalizer[0], self.simAlgoNormalizer[1])\n",
792
+ " else:\n",
793
+ " raise ValueError(\"invalid similarity algorithms\")\n",
794
+ " simArray[i][j] = sim\n",
795
+ " return simArray\n",
796
+ "\n",
797
+ " def getNumWordVectors(self):\n",
798
+ " \"\"\"\n",
799
+ " get vectors\n",
800
+ " \"\"\"\n",
801
+ " if not self.similarityAlgo == \"jaccard\":\n",
802
+ " if self.numWordVectors is None:\n",
803
+ " self.numWordVectors = list(map(lambda wv: self.termTable.getVector(wv, byCount, normalized), self.wordVectors))\n",
804
+ "\n",
805
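+ "# Usage sketch (illustrative, not part of the original module): pair wise\n",
+ "# jaccard similarity between added texts; the normalizer pair passed to\n",
+ "# withSimilarityAlgo is a hypothetical choice.\n",
+ "#   wvc = WordVectorContainer(None, False)\n",
+ "#   wvc.addText(\"first sample text\")\n",
+ "#   wvc.addText(\"second sample text\")\n",
+ "#   wvc.withSimilarityAlgo(\"jaccard\", (0.5, 0.5))\n",
+ "#   sim = wvc.getPairWiseSimilarity(True, True)\n",
+ "\n",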
+ "# fragments documents into whole doc, paragraph or passages\n",
806
+ "class TextFragmentGenerator:\n",
807
+ " def __init__(self, level, minParNl, passSize, verbose=False):\n",
808
+ " \"\"\"\n",
809
+ " initialize\n",
810
+ " \"\"\"\n",
811
+ " self.level = level\n",
812
+ " self.minParNl = minParNl\n",
813
+ " self.passSize = passSize\n",
814
+ " self.fragments = None\n",
815
+ " self.verbose = verbose\n",
816
+ "\n",
817
+ " def loadDocs(self, fpaths):\n",
818
+ " \"\"\"\n",
819
+ " loads documents from one file, multiple files or all files under directory\n",
820
+ " \"\"\"\n",
821
+ " fPaths = fpaths.split(\",\")\n",
822
+ " if len(fPaths) == 1:\n",
823
+ " if os.path.isfile(fPaths[0]):\n",
824
+ " #one file\n",
825
+ " if self.verbose:\n",
826
+ " print(\"got one file from path\")\n",
827
+ " dnames = fPaths\n",
828
+ " docStr = getOneFileContent(fPaths[0])\n",
829
+ " dtexts = [docStr]\n",
830
+ " else:\n",
831
+ " #all files under directory\n",
832
+ " if self.verbose:\n",
833
+ " print(\"got all files under directory from path\")\n",
834
+ " dtexts, dnames = getFileContent(fPaths[0])\n",
835
+ " if self.verbose:\n",
836
+ " print(\"found {} files\".format(len(dtexts)))\n",
837
+ " else:\n",
838
+ " #list of files\n",
839
+ " if self.verbose: \n",
840
+ " print(\"got list of files from path\")\n",
841
+ " dnames = fPaths\n",
842
+ " dtexts = list(map(getOneFileContent, fpaths))\n",
843
+ " if self.verbose:\n",
844
+ " print(\"found {} files\".format(len(dtexts)))\n",
845
+ "\n",
846
+ " ndocs = (dtexts, dnames)\t\n",
847
+ " if self.verbose:\n",
848
+ " print(\"docs\")\n",
849
+ " for dn, dt in zip(dnames, dtexts):\n",
850
+ " print(dn + \"\\t\" + dt[:40])\n",
851
+ "\n",
852
+ " return ndocs\n",
853
+ "\n",
854
+ " def generateFragmentsFromFiles(self, fpaths):\n",
855
+ " \"\"\"\n",
856
+ " fragments documents into whole doc, paragraph or passages\n",
857
+ " \"\"\"\n",
858
+ " dtexts, dnames = self.loadDocs(fpaths)\n",
859
+ " return self.generateFragments(dtexts, dnames)\n",
860
+ "\n",
861
+ "\n",
862
+ " def generateFragmentsFromNamedDocs(self, ndocs):\n",
863
+ " \"\"\"\n",
864
+ " fragments documents into whole doc, paragraph or passages\n",
865
+ " \"\"\"\n",
866
+ " dtexts = list(map(lambda nd : nd[1], ndocs))\n",
867
+ " dnames = list(map(lambda nd : nd[0], ndocs))\n",
868
+ " #for i in range(len(dtexts)):\n",
869
+ " #\tprint(dnames[i])\n",
870
+ " #\tprint(dtexts[i][:40])\n",
871
+ " return self.generateFragments(dtexts, dnames)\n",
872
+ "\n",
873
+ " def generateFragments(self, dtexts, dnames):\n",
874
+ " \"\"\"\n",
875
+ " fragments documents into whole doc, paragraph or passages\n",
876
+ " \"\"\"\n",
877
+ " if self.level == \"para\" or self.level == \"passage\":\n",
878
+ " #split paras\n",
879
+ " dptexts = list()\n",
880
+ " dpnames = list()\n",
881
+ " for dt, dn in zip(dtexts, dnames):\n",
882
+ " paras = getParas(dt, self.minParNl)\n",
883
+ " if self.verbose:\n",
884
+ " print(dn)\n",
885
+ " print(\"no of paras {}\".format(len(paras)))\n",
886
+ " dptexts.extend(paras)\n",
887
+ " pnames = list(map(lambda i : dn + \":\" + str(i), range(len(paras))))\n",
888
+ " dpnames.extend(pnames)\n",
889
+ " dtexts = dptexts\n",
890
+ " dnames = dpnames\n",
891
+ "\n",
892
+ " if self.level == \"passage\":\n",
893
+ " #split each para into passages\n",
894
+ " dptexts = list()\n",
895
+ " dpnames = list()\n",
896
+ " for dt, dn in zip(dtexts, dnames):\n",
897
+ " sents = sent_tokenize(dt.strip())\t\t\t\n",
898
+ " if self.verbose:\n",
899
+ " print(dn)\n",
900
+ " print(\"no of sentences {}\".format(len(sents)))\n",
901
+ " span = self.passSize\n",
902
+ " if len(sents) <= span:\n",
903
+ " pass\n",
904
+ " else:\n",
905
+ " for i in range(0, len(sents) - span, 1):\n",
906
+ " dptext = None\n",
907
+ " for j in range(span):\n",
908
+ " if dptext is None:\n",
909
+ " dptext = sents[i + j] + \". \"\n",
910
+ " else:\n",
911
+ " dptext = dptext + sents[i + j] + \". \" \n",
912
+ " dpname = dn + \":\" + str(i)\n",
913
+ " dptexts.append(dptext)\n",
914
+ " dpnames.append(dpname)\n",
915
+ "\n",
916
+ " dtexts = dptexts\n",
917
+ " dnames = dpnames\n",
918
+ "\n",
919
+ " self.fragments = list(zip(dnames, dtexts))\n",
920
+ " #if self.verbose:\n",
921
+ " #\tprint(\"num fragments {}\".format(len(self.fragments)))\n",
922
+ " return self.fragments\n",
923
+ "\n",
924
+ " def showFragments(self):\n",
925
+ " \"\"\"\n",
926
+ " show fragments\n",
927
+ " \"\"\"\n",
928
+ " print(\"showing all \" + self.level + \" for the first 40 characters\")\n",
929
+ " for dn, dt in self.fragments:\n",
930
+ " print(dn + \"\\t\" + dt[:40])\n",
931
+ "\n",
932
+ " def isDocLevel(self):\n",
933
+ " \"\"\"\n",
934
+ " true if fragment is at doc level\n",
935
+ " \"\"\"\n",
936
+ " return self.level != \"para\" and self.level != \"passage\"\n",
937
+ "\n",
938
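+ "# Usage sketch (illustrative, not part of the original module): splitting all\n",
+ "# files under a hypothetical directory into 3 sentence passages:\n",
+ "#   tfg = TextFragmentGenerator(\"passage\", 2, 3)\n",
+ "#   frags = tfg.generateFragmentsFromFiles(\"./docs\")\n",
+ "#   tfg.showFragments()\n",
+ "\n",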
+ "# clean doc to create term array\n",
939
+ "def clean(doc, preprocessor, verbose):\n",
940
+ " \"\"\"\n",
941
+ " text pre process\n",
942
+ " \"\"\"\n",
943
+ " if verbose:\n",
944
+ " print (\"--raw doc\")\n",
945
+ " print (doc)\n",
946
+ " #print \"next clean\"\n",
947
+ " doc = preprocessor.removeNonAsciiFromText(doc)\n",
948
+ " words = preprocessor.tokenize(doc)\n",
949
+ " words = preprocessor.allow(words)\n",
950
+ " words = preprocessor.toLowercase(words)\n",
951
+ " words = preprocessor.removeStopwords(words)\n",
952
+ " words = preprocessor.removeShortWords(words, 3)\n",
953
+ " words = preprocessor.removePunctuation(words)\n",
954
+ " words = preprocessor.lemmatizeWords(words)\n",
955
+ " #words = preprocessor.removeNonAscii(words)\n",
956
+ " if verbose:\n",
957
+ " print (\"--after pre processing\")\n",
958
+ " print (words)\n",
959
+ " return words\n",
960
+ "\n",
961
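+ "# Usage sketch (illustrative, not part of the original module): cleaning a\n",
+ "# raw document into a term list with a TextPreProcessor instance:\n",
+ "#   tp = TextPreProcessor()\n",
+ "#   terms = clean(\"Some raw document text.\", tp, False)\n",
+ "\n",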
+ "# get sentences\n",
962
+ "def getSentences(filePath):\n",
963
+ " \"\"\"\n",
964
+ " text pre process\n",
965
+ " \"\"\"\n",
966
+ " with open(filePath, 'r') as contentFile:\n",
967
+ " content = contentFile.read()\n",
968
+ " sentences = content.split('.')\n",
969
+ " return sentences\n",
970
+ "\n",
971
+ "def getParas(text, minParNl=2):\n",
972
+ " \"\"\"\n",
973
+ " split into paras\n",
974
+ " \"\"\"\n",
975
+ " regx = \"\\n+\" if minParNl == 1 else \"\\n{2,}\"\n",
976
+ " paras = re.split(regx, text.replace(\"\\r\\n\", \"\\n\"))\n",
977
+ " return paras\n"
978
+ ]
979
+ }
980
+ ],
981
+ "metadata": {
982
+ "kernelspec": {
983
+ "display_name": "Python 3 (ipykernel)",
984
+ "language": "python",
985
+ "name": "python3"
986
+ },
987
+ "language_info": {
988
+ "codemirror_mode": {
989
+ "name": "ipython",
990
+ "version": 3
991
+ },
992
+ "file_extension": ".py",
993
+ "mimetype": "text/x-python",
994
+ "name": "python",
995
+ "nbconvert_exporter": "python",
996
+ "pygments_lexer": "ipython3",
997
+ "version": "3.9.12"
998
+ }
999
+ },
1000
+ "nbformat": 4,
1001
+ "nbformat_minor": 5
1002
+ }
lib/.ipynb_checkpoints/util-checkpoint.ipynb ADDED
@@ -0,0 +1,2141 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "031d69ef",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import sys\n",
12
+ "from random import randint\n",
13
+ "import random\n",
14
+ "import time\n",
15
+ "import uuid\n",
16
+ "from datetime import datetime\n",
17
+ "import math\n",
18
+ "import numpy as np\n",
19
+ "import pandas as pd\n",
20
+ "import matplotlib.pyplot as plt\n",
21
+ "import numpy as np\n",
22
+ "import logging\n",
23
+ "import logging.handlers\n",
24
+ "import pickle\n",
25
+ "from contextlib import contextmanager\n",
26
+ "\n",
27
+ "tokens = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"A\",\"B\",\"C\",\"D\",\"E\",\"F\",\"G\",\"H\",\"I\",\"J\",\"K\",\"L\",\"M\",\n",
28
+ " \"N\",\"O\",\"P\",\"Q\",\"R\",\"S\",\"T\",\"U\",\"V\",\"W\",\"X\",\"Y\",\"Z\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\"]\n",
29
+ "numTokens = tokens[:10]\n",
30
+ "alphaTokens = tokens[10:36]\n",
31
+ "loCaseChars = [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\", \"k\",\"l\",\"m\",\"n\",\"o\",\n",
32
+ "\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]\n",
33
+ "\n",
34
+ "typeInt = \"int\"\n",
35
+ "typeFloat = \"float\"\n",
36
+ "typeString = \"string\"\n",
37
+ "\n",
38
+ "secInMinute = 60\n",
39
+ "secInHour = 60 * 60\n",
40
+ "secInDay = 24 * secInHour\n",
41
+ "secInWeek = 7 * secInDay\n",
42
+ "secInYear = 365 * secInDay\n",
43
+ "secInMonth = secInYear / 12\n",
44
+ "\n",
45
+ "minInHour = 60\n",
46
+ "minInDay = 24 * minInHour\n",
47
+ "\n",
48
+ "ftPerYard = 3\n",
49
+ "ftPerMile = ftPerYard * 1760\n",
50
+ "\n",
51
+ "\n",
52
+ "def genID(size):\n",
53
+ " \"\"\"\n",
54
+ " generates ID\n",
55
+ "\n",
56
+ " Parameters\n",
57
+ " size : size of ID\n",
58
+ " \"\"\"\n",
59
+ " id = \"\"\n",
60
+ " for i in range(size):\n",
61
+ " id = id + selectRandomFromList(tokens)\n",
62
+ " return id\n",
63
+ "\n",
64
+ "def genIdList(numId, idSize):\n",
65
+ " \"\"\"\n",
66
+ " generate list of IDs\n",
67
+ "\n",
68
+ " Parameters:\n",
69
+ " numId: number of Ids\n",
70
+ " idSize: ID size\n",
71
+ " \"\"\"\n",
72
+ " iDs = []\n",
73
+ " for i in range(numId):\n",
74
+ " iDs.append(genID(idSize))\n",
75
+ " return iDs\n",
76
+ "\n",
77
+ "def genNumID(size):\n",
78
+ " \"\"\"\n",
79
+ " generates ID consisting of digits onl\n",
80
+ "\n",
81
+ " Parameters\n",
82
+ " size : size of ID\n",
83
+ " \"\"\"\n",
84
+ " id = \"\"\n",
85
+ " for i in range(size):\n",
86
+ " id = id + selectRandomFromList(numTokens)\n",
87
+ " return id\n",
88
+ "\n",
89
+ "def genLowCaseID(size):\n",
90
+ " \"\"\"\n",
91
+ " generates ID consisting of lower case chars\n",
92
+ "\n",
93
+ " Parameters\n",
94
+ " size : size of ID\n",
95
+ " \"\"\"\n",
96
+ " id = \"\"\n",
97
+ " for i in range(size):\n",
98
+ " id = id + selectRandomFromList(loCaseChars)\n",
99
+ " return id\n",
100
+ "\n",
101
+ "def genNumIdList(numId, idSize):\n",
102
+ " \"\"\"\n",
103
+ " generate list of numeric IDs\n",
104
+ "\n",
105
+ " Parameters:\n",
106
+ " numId: number of Ids\n",
107
+ " idSize: ID size\n",
108
+ " \"\"\"\n",
109
+ " iDs = []\n",
110
+ " for i in range(numId):\n",
111
+ " iDs.append(genNumID(idSize))\n",
112
+ " return iDs\n",
113
+ "\n",
114
+ "def genNameInitial():\n",
115
+ " \"\"\"\n",
116
+ " generate name initial\n",
117
+ " \"\"\"\n",
118
+ " return selectRandomFromList(alphaTokens) + selectRandomFromList(alphaTokens)\n",
119
+ "\n",
120
+ "def genPhoneNum(arCode):\n",
121
+ " \"\"\"\n",
122
+ " generates phone number\n",
123
+ "\n",
124
+ " Parameters\n",
125
+ " arCode: area code\n",
126
+ " \"\"\"\n",
127
+ " phNum = genNumID(7)\n",
128
+ " return arCode + str(phNum)\n",
129
+ "\n",
130
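+ "# Usage sketch (illustrative, not part of the original module): generating\n",
+ "# synthetic identifiers and a phone number; the sizes and area code are\n",
+ "# hypothetical examples.\n",
+ "#   cid = genID(8)\n",
+ "#   custIds = genNumIdList(10, 6)\n",
+ "#   phone = genPhoneNum(\"408\")\n",
+ "\n",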
+ "def selectRandomFromList(ldata):\n",
131
+ " \"\"\"\n",
132
+ " select an element randomly from a lis\n",
133
+ "\n",
134
+ " Parameters\n",
135
+ " ldata : list data\n",
136
+ " \"\"\"\n",
137
+ " return ldata[randint(0, len(ldata)-1)]\n",
138
+ "\n",
139
+ "def selectOtherRandomFromList(ldata, cval):\n",
140
+ " \"\"\"\n",
141
+ " select an element randomly from a list excluding the given one\n",
142
+ "\n",
143
+ " Parameters\n",
144
+ " ldata : list data\n",
145
+ " cval : value to be excluded\n",
146
+ " \"\"\"\n",
147
+ " nval = selectRandomFromList(ldata)\n",
148
+ " while nval == cval:\n",
149
+ " nval = selectRandomFromList(ldata)\n",
150
+ " return nval\n",
151
+ "\n",
152
+ "def selectRandomSubListFromList(ldata, num):\n",
153
+ " \"\"\"\n",
154
+ " generates random sublist from a list without replacemment\n",
155
+ "\n",
156
+ " Parameters\n",
157
+ " ldata : list data\n",
158
+ " num : output list size\n",
159
+ " \"\"\"\n",
160
+ " assertLesser(num, len(ldata), \"size of sublist to be sampled greater than or equal to main list\")\n",
161
+ " i = randint(0, len(ldata)-1)\n",
162
+ " sel = ldata[i]\n",
163
+ " selSet = {i}\n",
164
+ " selList = [sel]\n",
165
+ " while (len(selSet) < num):\n",
166
+ " i = randint(0, len(ldata)-1)\n",
167
+ " if (i not in selSet):\n",
168
+ " sel = ldata[i]\n",
169
+ " selSet.add(i)\n",
170
+ " selList.append(sel)\t\t\n",
171
+ " return selList\n",
172
+ "\n",
173
+ "def selectRandomSubListFromListWithRepl(ldata, num):\n",
174
+ " \"\"\"\n",
175
+ " generates random sublist from a list with replacemment\n",
176
+ "\n",
177
+ " Parameters\n",
178
+ " ldata : list data\n",
179
+ " num : output list size\n",
180
+ " \"\"\"\n",
181
+ " return list(map(lambda i : selectRandomFromList(ldata), range(num)))\n",
182
+ "\n",
183
+ "def selectRandomFromDict(ddata):\n",
184
+ " \"\"\"\n",
185
+ " select an element randomly from a dictionary\n",
186
+ "\n",
187
+ " Parameters\n",
188
+ " ddata : dictionary data\n",
189
+ " \"\"\"\n",
190
+ " dkeys = list(ddata.keys())\n",
191
+ " dk = selectRandomFromList(dkeys)\n",
192
+ " el = (dk, ddata[dk])\n",
193
+ " return el\n",
194
+ "\n",
195
+ "def setListRandomFromList(ldata, ldataRepl):\n",
196
+ " \"\"\"\n",
197
+ " sets some elents in the first list randomly with elements from the second list\n",
198
+ "\n",
199
+ " Parameters\n",
200
+ " ldata : list data\n",
201
+ " ldataRepl : list with replacement data\n",
202
+ " \"\"\"\n",
203
+ " l = len(ldata)\n",
204
+ " selSet = set()\n",
205
+ " for d in ldataRepl:\n",
206
+ " i = randint(0, l-1)\n",
207
+ " while i in selSet:\n",
208
+ " i = randint(0, l-1)\n",
209
+ " ldata[i] = d\n",
210
+ " selSet.add(i)\n",
211
+ "\n",
212
+ "def genIpAddress():\n",
213
+ " \"\"\"\n",
214
+ " generates IP address\n",
215
+ " \"\"\"\n",
216
+ " i1 = randint(0,256)\n",
217
+ " i2 = randint(0,256)\n",
218
+ " i3 = randint(0,256)\n",
219
+ " i4 = randint(0,256)\n",
220
+ " ip = \"%d.%d.%d.%d\" %(i1,i2,i3,i4)\n",
221
+ " return ip\n",
222
+ "\n",
223
+ "def curTimeMs():\n",
224
+ " \"\"\"\n",
225
+ " current time in ms\n",
226
+ " \"\"\"\n",
227
+ " return int((datetime.utcnow() - datetime(1970,1,1)).total_seconds() * 1000)\n",
228
+ "\n",
229
+ "def secDegPolyFit(x1, y1, x2, y2, x3, y3):\n",
230
+ " \"\"\"\n",
231
+ " second deg polynomial \t\n",
232
+ "\n",
233
+ " Parameters\n",
234
+ " x1 : 1st point x\n",
235
+ " y1 : 1st point y\n",
236
+ " x2 : 2nd point x\n",
237
+ " y2 : 2nd point y\n",
238
+ " x3 : 3rd point x\n",
239
+ " y3 : 3rd point y\n",
240
+ " \"\"\"\n",
241
+ " t = (y1 - y2) / (x1 - x2)\n",
242
+ " a = t - (y2 - y3) / (x2 - x3)\n",
243
+ " a = a / (x1 - x3)\n",
244
+ " b = t - a * (x1 + x2)\n",
245
+ " c = y1 - a * x1 * x1 - b * x1\n",
246
+ " return (a, b, c)\n",
247
+ "\n",
248
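+ "# Worked example (illustrative, not part of the original module): fitting\n",
+ "# through the points (0,0), (1,1) and (2,4) on y = x * x recovers the\n",
+ "# coefficients (a, b, c) = (1, 0, 0).\n",
+ "#   a, b, c = secDegPolyFit(0, 0, 1, 1, 2, 4)\n",
+ "\n",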
+ "def range_limit(val, minv, maxv):\n",
249
+ " \"\"\"\n",
250
+ " range limit a value\n",
251
+ "\n",
252
+ " Parameters\n",
253
+ " val : data value\n",
254
+ " minv : minimum\n",
255
+ " maxv : maximum\n",
256
+ " \"\"\"\n",
257
+ " if (val < minv):\n",
258
+ " val = minv\n",
259
+ " elif (val > maxv):\n",
260
+ " val = maxv\n",
261
+ " return val\t\n",
262
+ "\n",
263
+ "def isInRange(val, minv, maxv):\n",
264
+ " \"\"\"\n",
265
+ " checks if within range\n",
266
+ "\n",
267
+ " Parameters\n",
268
+ " val : data value\n",
269
+ " minv : minimum\n",
270
+ " maxv : maximum\n",
271
+ " \"\"\"\n",
272
+ " return val >= minv and val <= maxv\n",
273
+ "\n",
274
+ "def stripFileLines(filePath, offset):\n",
275
+ " \"\"\"\n",
276
+ " strips number of chars from both ends\n",
277
+ "\n",
278
+ " Parameters\n",
279
+ " filePath : file path\n",
280
+ " offset : offset from both ends of line \n",
281
+ " \"\"\"\n",
282
+ " fp = open(filePath, \"r\")\n",
283
+ " for line in fp:\n",
284
+ " stripped = line[offset:len(line) - 1 - offset]\n",
285
+ " print (stripped)\n",
286
+ " fp.close()\n",
287
+ "\n",
288
+ "def genLatLong(lat1, long1, lat2, long2):\n",
289
+ " \"\"\"\n",
290
+ " generate lat log within limits\n",
291
+ "\n",
292
+ " Parameters\n",
293
+ " lat1 : lat of 1st point\n",
294
+ " long1 : long of 1st point\n",
295
+ " lat2 : lat of 2nd point\n",
296
+ " long2 : long of 2nd point\n",
297
+ " \"\"\"\n",
298
+ " lat = lat1 + (lat2 - lat1) * random.random()\n",
299
+ " longg = long1 + (long2 - long1) * random.random()\n",
300
+ " return (lat, longg)\n",
301
+ "\n",
302
+ "def geoDistance(lat1, long1, lat2, long2):\n",
303
+ " \"\"\"\n",
304
+ " find geo distance in ft\n",
305
+ "\n",
306
+ " Parameters\n",
307
+ " lat1 : lat of 1st point\n",
308
+ " long1 : long of 1st point\n",
309
+ " lat2 : lat of 2nd point\n",
310
+ " long2 : long of 2nd point\n",
311
+ " \"\"\"\n",
312
+ " latDiff = math.radians(lat1 - lat2)\n",
313
+ " longDiff = math.radians(long1 - long2)\n",
314
+ " l1 = math.sin(latDiff/2.0)\n",
315
+ " l2 = math.sin(longDiff/2.0)\n",
316
+ " l3 = math.cos(math.radians(lat1))\n",
317
+ " l4 = math.cos(math.radians(lat2))\n",
318
+ " a = l1 * l1 + l3 * l4 * l2 * l2\n",
319
+ " l5 = math.sqrt(a)\n",
320
+ " l6 = math.sqrt(1.0 - a)\n",
321
+ " c = 2.0 * math.atan2(l5, l6)\n",
322
+ " r = 6371008.8 * 3.280840\n",
323
+ " return c * r\n",
324
+ "\n",
325
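+ "# Usage sketch (illustrative, not part of the original module): haversine\n",
+ "# distance in feet between two hypothetical coordinate pairs:\n",
+ "#   dist = geoDistance(37.7749, -122.4194, 37.8044, -122.2712)\n",
+ "\n",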
+ "def minLimit(val, limit):\n",
326
+ " \"\"\"\n",
327
+ " min limit\n",
328
+ " Parameters\n",
329
+ " \"\"\"\n",
330
+ " if (val < limit):\n",
331
+ " val = limit\n",
332
+ " return val;\n",
333
+ "\n",
334
+ "def maxLimit(val, limit):\n",
335
+ " \"\"\"\n",
336
+ " max limit\n",
337
+ " Parameters\n",
338
+ " \"\"\"\n",
339
+ " if (val > limit):\n",
340
+ " val = limit\n",
341
+ " return val;\n",
342
+ "\n",
343
+ "def rangeSample(val, minLim, maxLim):\n",
344
+ " \"\"\"\n",
345
+ " if out side range sample within range\n",
346
+ "\n",
347
+ " Parameters\n",
348
+ " val : value\n",
349
+ " minLim : minimum\n",
350
+ " maxLim : maximum\n",
351
+ " \"\"\"\n",
352
+ " if val < minLim or val > maxLim:\n",
353
+ " val = randint(minLim, maxLim)\n",
354
+ " return val\n",
355
+ "\n",
356
+ "def genRandomIntListWithinRange(size, minLim, maxLim):\n",
357
+ " \"\"\"\n",
358
+ " random unique list of integers within range\n",
359
+ "\n",
360
+ " Parameters\n",
361
+ " size : size of returned list\n",
362
+ " minLim : minimum\n",
363
+ " maxLim : maximum\n",
364
+ " \"\"\"\n",
365
+ " values = set()\n",
366
+ " for i in range(size):\n",
367
+ " val = randint(minLim, maxLim)\n",
368
+ " while val not in values:\n",
369
+ " values.add(val)\n",
370
+ " return list(values)\n",
371
+ "\n",
372
+ "def preturbScalar(value, vrange):\n",
373
+ " \"\"\"\n",
374
+ " preturbs a mutiplicative value within range\n",
375
+ "\n",
376
+ " Parameters\n",
377
+ " value : data value\n",
378
+ " vrange : value delta fraction\n",
379
+ " \"\"\"\n",
380
+ " scale = 1.0 - vrange + 2 * vrange * random.random() \n",
381
+ " return value * scale\n",
382
+ "\n",
383
+ "def preturbScalarAbs(value, vrange):\n",
384
+ " \"\"\"\n",
385
+ " preturbs an absolute value within range\n",
386
+ "\n",
387
+ " Parameters\n",
388
+ " value : data value\n",
389
+ " vrange : value delta absolute\n",
390
+ " \"\"\"\n",
391
+ " delta = - vrange + 2.0 * vrange * random.random() \n",
392
+ " return value + delta\n",
393
+ "\n",
394
+ "def preturbVector(values, vrange):\n",
395
+ " \"\"\"\n",
396
+ " preturbs a list within range\n",
397
+ "\n",
398
+ " Parameters\n",
399
+ " values : list data\n",
400
+ " vrange : value delta fraction\n",
401
+ " \"\"\"\n",
402
+ " nValues = list(map(lambda va: preturbScalar(va, vrange), values))\n",
403
+ " return nValues\n",
404
+ "\n",
405
+ "def randomShiftVector(values, smin, smax):\n",
406
+ " \"\"\"\n",
407
+ " shifts a list by a random quanity with a range\n",
408
+ "\n",
409
+ " Parameters\n",
410
+ " values : list data\n",
411
+ " smin : samplinf minimum\n",
412
+ " smax : sampling maximum\n",
413
+ " \"\"\"\n",
414
+ " shift = np.random.uniform(smin, smax)\n",
415
+ " return list(map(lambda va: va + shift, values))\n",
416
+ "\n",
417
+ "def floatRange(beg, end, incr):\n",
418
+ " \"\"\"\n",
419
+ " generates float range\n",
420
+ "\n",
421
+ " Parameters\n",
422
+ " beg :range begin\n",
423
+ " end: range end\n",
424
+ " incr : range increment\n",
425
+ " \"\"\"\n",
426
+ " return list(np.arange(beg, end, incr))\n",
427
+ "\n",
428
+ "def shuffle(values, *numShuffles):\n",
429
+ " \"\"\"\n",
430
+ " in place shuffling with swap of pairs\n",
431
+ "\n",
432
+ " Parameters\n",
433
+ " values : list data\n",
434
+ " numShuffles : parameter list for number of shuffles\n",
435
+ " \"\"\"\n",
436
+ " size = len(values)\n",
437
+ " if len(numShuffles) == 0:\n",
438
+ " numShuffle = int(size / 2)\n",
439
+ " elif len(numShuffles) == 1:\n",
440
+ " numShuffle = numShuffles[0]\n",
441
+ " else:\n",
442
+ " numShuffle = randint(numShuffles[0], numShuffles[1])\n",
443
+ " print(\"numShuffle {}\".format(numShuffle))\n",
444
+ " for i in range(numShuffle):\n",
445
+ " first = random.randint(0, size - 1)\n",
446
+ " second = random.randint(0, size - 1)\n",
447
+ " while first == second:\n",
448
+ " second = random.randint(0, size - 1)\n",
449
+ " tmp = values[first]\n",
450
+ " values[first] = values[second]\n",
451
+ " values[second] = tmp\n",
452
+ "\n",
453
+ "\n",
454
+ "def splitList(itms, numGr):\n",
455
+ " \"\"\"\n",
456
+ " splits a list into sub lists of approximately equal size, with items in sublists randomly chod=sen\n",
457
+ "\n",
458
+ " Parameters\n",
459
+ " itms ; list of values\t\t\n",
460
+ " numGr : no of groups\n",
461
+ " \"\"\"\n",
462
+ " tcount = len(itms)\n",
463
+ " cItems = list(itms)\n",
464
+ " sz = int(len(cItems) / numGr)\n",
465
+ " groups = list()\n",
466
+ " count = 0\n",
467
+ " for i in range(numGr):\n",
468
+ " if (i == numGr - 1):\n",
469
+ " csz = tcount - count\n",
470
+ " else:\n",
471
+ " csz = sz + randint(-2, 2)\n",
472
+ " count += csz\n",
473
+ " gr = list()\n",
474
+ " for j in range(csz):\n",
475
+ " it = selectRandomFromList(cItems)\n",
476
+ " gr.append(it)\n",
477
+ " cItems.remove(it)\n",
478
+ " groups.append(gr)\n",
479
+ " return groups\n",
480
+ "\n",
481
+ "def multVector(values, vrange):\n",
482
+ " \"\"\"\n",
483
+ " multiplies a list within value range\n",
484
+ "\n",
485
+ " Parameters\n",
486
+ " values : list of values\n",
487
+ " vrange : fraction of vaue to be used to update\n",
488
+ " \"\"\"\n",
489
+ " scale = 1.0 - vrange + 2 * vrange * random.random()\n",
490
+ " nValues = list(map(lambda va: va * scale, values))\n",
491
+ " return nValues\n",
492
+ "\n",
493
+ "def weightedAverage(values, weights):\n",
494
+ " \"\"\"\n",
495
+ " calculates weighted average\n",
496
+ "\n",
497
+ " Parameters\n",
498
+ " values : list of values\n",
499
+ " weights : list of weights\n",
500
+ " \"\"\"\t\t\n",
501
+ " assert len(values) == len(weights), \"values and weights should be same size\"\n",
502
+ " vw = zip(values, weights)\n",
503
+ " wva = list(map(lambda e : e[0] * e[1], vw))\n",
504
+ " #wa = sum(x * y for x, y in vw) / sum(weights)\n",
505
+ " wav = sum(wva) / sum(weights)\n",
506
+ " return wav\n",
507
+ "\n",
508
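+ "# Worked example (illustrative, not part of the original module):\n",
+ "#   weightedAverage([1, 2, 3], [1, 1, 2]) = (1 + 2 + 6) / 4 = 2.25\n",
+ "\n",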
+ "def extractFields(line, delim, keepIndices):\n",
509
+ " \"\"\"\n",
510
+ " breaks a line into fields and keeps only specified fileds and returns new line\n",
511
+ "\n",
512
+ " Parameters\n",
513
+ " line ; deli separated string\n",
514
+ " delim : delemeter\n",
515
+ " keepIndices : list of indexes to fields to be retained\n",
516
+ " \"\"\"\n",
517
+ " items = line.split(delim)\n",
518
+ " newLine = []\n",
519
+ " for i in keepIndices:\n",
520
+ " newLine.append(line[i])\n",
521
+ " return delim.join(newLine)\n",
522
+ "\n",
523
+ "def remFields(line, delim, remIndices):\n",
524
+ " \"\"\"\n",
525
+ " removes fields from delim separated string\n",
526
+ "\n",
527
+ " Parameters\n",
528
+ " line ; delemeter separated string\n",
529
+ " delim : delemeter\n",
530
+ " remIndices : list of indexes to fields to be removed\n",
531
+ " \"\"\"\n",
532
+ " items = line.split(delim)\n",
533
+ " newLine = []\n",
534
+ " for i in range(len(items)):\n",
535
+ " if not arrayContains(remIndices, i):\n",
536
+ " newLine.append(line[i])\n",
537
+ " return delim.join(newLine)\n",
538
+ "\n",
539
+ "def extractList(data, indices):\n",
540
+ " \"\"\"\n",
541
+ " extracts list from another list, given indices\n",
542
+ "\n",
543
+ " Parameters\n",
544
+ " remIndices : list data\n",
545
+ " indices : list of indexes to fields to be retained\n",
546
+ " \"\"\"\n",
547
+ " if areAllFieldsIncluded(data, indices):\n",
548
+ " exList = data.copy()\n",
549
+ " #print(\"all indices\")\n",
550
+ " else:\n",
551
+ " exList = list()\n",
552
+ " le = len(data)\n",
553
+ " for i in indices:\n",
554
+ " assert i < le , \"index {} out of bound {}\".format(i, le)\n",
555
+ " exList.append(data[i])\n",
556
+ "\n",
557
+ " return exList\n",
558
+ "\n",
559
+ "def arrayContains(arr, item):\n",
560
+ " \"\"\"\n",
561
+ " checks if array contains an item \n",
562
+ "\n",
563
+ " Parameters\n",
564
+ " arr : list data\n",
565
+ " item : item to search\n",
566
+ " \"\"\"\n",
567
+ " contains = True\n",
568
+ " try:\n",
569
+ " arr.index(item)\n",
570
+ " except ValueError:\n",
571
+ " contains = False\n",
572
+ " return contains\n",
573
+ "\n",
574
+ "def strToIntArray(line, delim=\",\"):\n",
575
+ " \"\"\"\n",
576
+ " int array from delim separated string\n",
577
+ "\n",
578
+ " Parameters\n",
579
+ " line ; delemeter separated string\n",
580
+ " \"\"\"\n",
581
+ " arr = line.split(delim)\n",
582
+ " return [int(a) for a in arr]\n",
583
+ "\n",
584
+ "def strToFloatArray(line, delim=\",\"):\n",
585
+ " \"\"\"\n",
586
+ " float array from delim separated string\n",
587
+ "\n",
588
+ " Parameters\n",
589
+ " line ; delemeter separated string\n",
590
+ " \"\"\"\n",
591
+ " arr = line.split(delim)\n",
592
+ " return [float(a) for a in arr]\n",
593
+ "\n",
594
+ "def strListOrRangeToIntArray(line):\n",
595
+ " \"\"\"\n",
596
+ " int array from delim separated string or range\n",
597
+ "\n",
598
+ " Parameters\n",
599
+ " line ; delemeter separated string\n",
600
+ " \"\"\"\n",
601
+ " varr = line.split(\",\")\n",
602
+ " if (len(varr) > 1):\n",
603
+ " iarr = list(map(lambda v: int(v), varr))\n",
604
+ " else:\n",
605
+ " vrange = line.split(\":\")\n",
606
+ " if (len(vrange) == 2):\n",
607
+ " lo = int(vrange[0])\n",
608
+ " hi = int(vrange[1])\n",
609
+ " iarr = list(range(lo, hi+1))\n",
610
+ " else:\n",
611
+ " iarr = [int(line)]\n",
612
+ " return iarr\n",
613
+ "\n",
614
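+ "# Worked examples (illustrative, not part of the original module): the parser\n",
+ "# above accepts a comma separated list, a colon range or a single value.\n",
+ "#   strListOrRangeToIntArray(\"3,5,9\")  returns [3, 5, 9]\n",
+ "#   strListOrRangeToIntArray(\"3:6\")    returns [3, 4, 5, 6]\n",
+ "#   strListOrRangeToIntArray(\"7\")      returns [7]\n",
+ "\n",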
+ "def toStr(val, precision):\n",
615
+ " \"\"\"\n",
616
+ " converts any type to string\t\n",
617
+ "\n",
618
+ " Parameters\n",
619
+ " val : value\n",
620
+ " precision ; precision for float value\n",
621
+ " \"\"\"\n",
622
+ " if type(val) == float or type(val) == np.float64 or type(val) == np.float32:\n",
623
+ " format = \"%\" + \".%df\" %(precision)\n",
624
+ " sVal = format %(val)\n",
625
+ " else:\n",
626
+ " sVal = str(val)\n",
627
+ " return sVal\n",
628
+ "\n",
629
+ "def toStrFromList(values, precision, delim=\",\"):\n",
630
+ " \"\"\"\n",
631
+ " converts list of any type to delim separated string\n",
632
+ "\n",
633
+ " Parameters\n",
634
+ " values : list data\n",
635
+ " precision ; precision for float value\n",
636
+ " delim : delemeter\n",
637
+ " \"\"\"\n",
638
+ " sValues = list(map(lambda v: toStr(v, precision), values))\n",
639
+ " return delim.join(sValues)\n",
640
+ "\n",
641
+ "def toIntList(values):\n",
642
+ " \"\"\"\n",
643
+ " convert to int list\n",
644
+ "\n",
645
+ " Parameters\n",
646
+ " values : list data\n",
647
+ " \"\"\"\n",
648
+ " return list(map(lambda va: int(va), values))\n",
649
+ "\n",
650
+ "def toFloatList(values):\n",
651
+ " \"\"\"\n",
652
+ " convert to float list\n",
653
+ "\n",
654
+ " Parameters\n",
655
+ " values : list data\n",
656
+ " \"\"\"\n",
657
+ " return list(map(lambda va: float(va), values))\n",
658
+ "\n",
659
+ "def toStrList(values, precision=None):\n",
660
+ " \"\"\"\n",
661
+ " convert to string list\n",
662
+ "\n",
663
+ " Parameters\n",
664
+ " values : list data\n",
665
+ " precision ; precision for float value\n",
666
+ " \"\"\"\n",
667
+ " return list(map(lambda va: toStr(va, precision), values))\n",
668
+ "\n",
669
+ "def toIntFromBoolean(value):\n",
670
+ " \"\"\"\n",
671
+ " convert to int\n",
672
+ "\n",
673
+ " Parameters\n",
674
+ " value : boolean value\n",
675
+ " \"\"\"\n",
676
+ " ival = 1 if value else 0\n",
677
+ " return ival\n",
678
+ "\n",
679
+ "def typedValue(val, dtype=None):\n",
680
+ " \"\"\"\n",
681
+ " return typed value given string, discovers data type if not specified\n",
682
+ "\n",
683
+ " Parameters\n",
684
+ " val : value\n",
685
+ " dtype : data type\n",
686
+ " \"\"\"\n",
687
+ " tVal = None\n",
688
+ "\n",
689
+ " if dtype is not None:\n",
690
+ " if dtype == \"num\":\n",
691
+ " dtype = \"int\" if dtype.find(\".\") == -1 else \"float\"\n",
692
+ "\n",
693
+ " if dtype == \"int\":\n",
694
+ " tVal = int(val)\n",
695
+ " elif dtype == \"float\":\n",
696
+ " tVal = float(val)\n",
697
+ " elif dtype == \"bool\":\n",
698
+ " tVal = bool(val)\n",
699
+ " else:\n",
700
+ " tVal = val\n",
701
+ " else:\n",
702
+ " if type(val) == str:\n",
703
+ " lVal = val.lower()\n",
704
+ "\n",
705
+ " #int\n",
706
+ " done = True\n",
707
+ " try:\n",
708
+ " tVal = int(val)\n",
709
+ " except ValueError:\n",
710
+ " done = False\n",
711
+ "\n",
712
+ " #float\n",
713
+ " if not done:\n",
714
+ " done = True\n",
715
+ " try:\n",
716
+ " tVal = float(val)\n",
717
+ " except ValueError:\n",
718
+ " done = False\n",
719
+ "\n",
720
+ " #boolean\n",
721
+ " if not done:\n",
722
+ " done = True\n",
723
+ " if lVal == \"true\":\n",
724
+ " tVal = True\n",
725
+ " elif lVal == \"false\":\n",
726
+ " tVal = False\n",
727
+ " else:\n",
728
+ " done = False\n",
729
+ " #None\t\t\n",
730
+ " if not done:\n",
731
+ " if lVal == \"none\":\n",
732
+ " tVal = None\n",
733
+ " else:\n",
734
+ " tVal = val\n",
735
+ " else:\n",
736
+ " tVal = val\n",
737
+ "\n",
738
+ " return tVal\n",
739
+ "\n",
740
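+ "# Worked examples (illustrative, not part of the original module): type\n",
+ "# discovery in typedValue when no data type is given:\n",
+ "#   typedValue(\"42\")    returns the int 42\n",
+ "#   typedValue(\"4.2\")   returns the float 4.2\n",
+ "#   typedValue(\"true\")  returns the boolean True\n",
+ "#   typedValue(\"abc\")   returns the string \"abc\"\n",
+ "\n",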
+ "def getAllFiles(dirPath):\n",
741
+ " \"\"\"\n",
742
+ " get all files recursively\n",
743
+ "\n",
744
+ " Parameters\n",
745
+ " dirPath : directory path\n",
746
+ " \"\"\"\n",
747
+ " filePaths = []\n",
748
+ " for (thisDir, subDirs, fileNames) in os.walk(dirPath):\n",
749
+ " for fileName in fileNames:\n",
750
+ " filePaths.append(os.path.join(thisDir, fileName))\n",
751
+ " filePaths.sort()\n",
752
+ " return filePaths\n",
753
+ "\n",
754
+ "def getFileContent(fpath, verbose=False):\n",
755
+ " \"\"\"\n",
756
+ " get file contents in directory\n",
757
+ "\n",
758
+ " Parameters\n",
759
+ " fpath ; directory path\n",
760
+ " verbose : verbosity flag\n",
761
+ " \"\"\"\n",
762
+ " # dcument list\n",
763
+ " docComplete = []\n",
764
+ " filePaths = getAllFiles(fpath)\n",
765
+ "\n",
766
+ " # read files\n",
767
+ " for filePath in filePaths:\n",
768
+ " if verbose:\n",
769
+ " print(\"next file \" + filePath)\n",
770
+ " with open(filePath, 'r') as contentFile:\n",
771
+ " content = contentFile.read()\n",
772
+ " docComplete.append(content)\n",
773
+ " return (docComplete, filePaths)\n",
774
+ "\n",
775
+ "def getOneFileContent(fpath):\n",
776
+ " \"\"\"\n",
777
+ " get one file contents\n",
778
+ "\n",
779
+ " Parameters\n",
780
+ " fpath : file path\n",
781
+ " \"\"\"\n",
782
+ " with open(fpath, 'r') as contentFile:\n",
783
+ " docStr = contentFile.read()\n",
784
+ " return docStr\n",
785
+ "\n",
786
+ "def getFileLines(dirPath, delim=\",\"):\n",
787
+ " \"\"\"\n",
788
+ " get lines from a file\n",
789
+ "\n",
790
+ " Parameters\n",
791
+ " dirPath : file path\n",
792
+ " delim : delemeter\n",
793
+ " \"\"\"\n",
794
+ " lines = list()\n",
795
+ " for li in fileRecGen(dirPath, delim):\n",
796
+ " lines.append(li)\n",
797
+ " return lines\n",
798
+ "\n",
799
+ "def getFileSampleLines(dirPath, percen, delim=\",\"):\n",
800
+ " \"\"\"\n",
801
+ " get sampled lines from a file\n",
802
+ "\n",
803
+ " Parameters\n",
804
+ " dirPath : file path\n",
805
+ " percen : sampling percentage\n",
806
+ " delim : delemeter\n",
807
+ " \"\"\"\n",
808
+ " lines = list()\n",
809
+ " for li in fileRecGen(dirPath, delim):\n",
810
+ " if randint(0, 100) < percen:\n",
811
+ " lines.append(li)\n",
812
+ " return lines\n",
813
+ "\n",
814
+ "def getFileColumnAsString(dirPath, index, delim=\",\"):\n",
815
+ " \"\"\"\n",
816
+ " get string column from a file\n",
817
+ "\n",
818
+ " Parameters\n",
819
+ " dirPath : file path\n",
820
+ " index : index\n",
821
+ " delim : delemeter\n",
822
+ " \"\"\"\n",
823
+ " fields = list()\n",
824
+ " for rec in fileRecGen(dirPath, delim):\n",
825
+ " fields.append(rec[index])\n",
826
+ " #print(fields)\t\n",
827
+ " return fields\n",
828
+ "\n",
829
+ "def getFileColumnsAsString(dirPath, indexes, delim=\",\"):\n",
830
+ " \"\"\"\n",
831
+ " get multiple string columns from a file\n",
832
+ "\n",
833
+ " Parameters\n",
834
+ " dirPath : file path\n",
835
+ " indexes : indexes of columns\n",
836
+ " delim : delemeter\n",
837
+ " \"\"\"\n",
838
+ " nindex = len(indexes)\n",
839
+ " columns = list(map(lambda i : list(), range(nindex)))\n",
840
+ " for rec in fileRecGen(dirPath, delim):\n",
841
+ " for i in range(nindex):\n",
842
+ " columns[i].append(rec[indexes[i]])\n",
843
+ " return columns\n",
844
+ "\n",
845
+ "def getFileColumnAsFloat(dirPath, index, delim=\",\"):\n",
846
+ " \"\"\"\n",
847
+ " get float fileds from a file\n",
848
+ "\n",
849
+ " Parameters\n",
850
+ " dirPath : file path\n",
851
+ " index : index\n",
852
+ " delim : delemeter\n",
853
+ " \"\"\"\n",
854
+ " #print(\"{} {}\".format(dirPath, index))\n",
855
+ " fields = getFileColumnAsString(dirPath, index, delim)\n",
856
+ " return list(map(lambda v:float(v), fields))\n",
857
+ "\n",
858
+ "def getFileColumnAsInt(dirPath, index, delim=\",\"):\n",
859
+ " \"\"\"\n",
860
+ " get float fileds from a file\n",
861
+ "\n",
862
+ " Parameters\n",
863
+ " dirPath : file path\n",
864
+ " index : index\n",
865
+ " delim : delemeter\n",
866
+ " \"\"\"\n",
867
+ " fields = getFileColumnAsString(dirPath, index, delim)\n",
868
+ " return list(map(lambda v:int(v), fields))\n",
869
+ "\n",
870
+ "def getFileAsIntMatrix(dirPath, columns, delim=\",\"):\n",
871
+ " \"\"\"\n",
872
+ " extracts int matrix from csv file given column indices with each row being concatenation of \n",
873
+ " extracted column values row size = num of columns\n",
874
+ "\n",
875
+ " Parameters\n",
876
+ " dirPath : file path\n",
877
+ " columns : indexes of columns\n",
878
+ " delim : delemeter\n",
879
+ " \"\"\"\n",
880
+ " mat = list()\n",
881
+ " for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
882
+ " mat.append(asIntList(rec))\n",
883
+ " return mat\n",
884
+ "\n",
885
+ "def getFileAsFloatMatrix(dirPath, columns, delim=\",\"):\n",
886
+ " \"\"\"\n",
887
+ " extracts float matrix from csv file given column indices with each row being concatenation of \n",
888
+ " extracted column values row size = num of columns\n",
889
+ " Parameters\n",
890
+ " dirPath : file path\n",
891
+ " columns : indexes of columns\n",
892
+ " delim : delemeter\n",
893
+ " \"\"\"\n",
894
+ " mat = list()\n",
895
+ " for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
896
+ " mat.append(asFloatList(rec))\n",
897
+ " return mat\n",
898
+ "\n",
899
+ "def getFileAsFloatColumn(dirPath):\n",
900
+ " \"\"\"\n",
901
+ " grt float list from a file with one float per row\n",
902
+ " Parameters\n",
903
+ " dirPath : file path\n",
904
+ " \"\"\"\n",
905
+ " flist = list()\n",
906
+ " for rec in fileRecGen(dirPath, None):\n",
907
+ " flist.append(float(rec))\n",
908
+ " return flist\n",
909
+ "\n",
910
+ "def getFileAsFiltFloatMatrix(dirPath, filt, columns, delim=\",\"):\n",
911
+ " \"\"\"\n",
912
+ " extracts float matrix from csv file given row filter and column indices with each row being \n",
913
+ " concatenation of extracted column values row size = num of columns\n",
914
+ " Parameters\n",
915
+ " dirPath : file path\n",
916
+ " columns : indexes of columns\n",
917
+ " filt : row filter lambda\n",
918
+ " delim : delemeter\n",
919
+ " \"\"\"\n",
920
+ " mat = list()\n",
921
+ " for rec in fileFiltSelFieldsRecGen(dirPath, filt, columns, delim):\n",
922
+ " mat.append(asFloatList(rec))\n",
923
+ " return mat\n",
924
+ "\n",
925
+ "def getFileAsTypedRecords(dirPath, types, delim=\",\"):\n",
926
+ " \"\"\"\n",
927
+ " extracts typed records from csv file with each row being concatenation of \n",
928
+ " extracted column values \n",
929
+ " Parameters\n",
930
+ " dirPath : file path\n",
931
+ " types : data types\n",
932
+ " delim : delemeter\n",
933
+ " \"\"\"\n",
934
+ " (dtypes, cvalues) = extractTypesFromString(types)\t\n",
935
+ " tdata = list()\n",
936
+ " for rec in fileRecGen(dirPath, delim):\n",
937
+ " trec = list()\n",
938
+ " for index, value in enumerate(rec):\n",
939
+ " value = __convToTyped(index, value, dtypes)\n",
940
+ " trec.append(value)\n",
941
+ " tdata.append(trec)\n",
942
+ " return tdata\n",
943
+ "\n",
944
+ "\n",
945
+ "def getFileColsAsTypedRecords(dirPath, columns, types, delim=\",\"):\n",
946
+ " \"\"\"\n",
947
+ " extracts typed records from csv file given column indices with each row being concatenation of \n",
948
+ " extracted column values \n",
949
+ " Parameters\n",
950
+ " Parameters\n",
951
+ " dirPath : file path\n",
952
+ " columns : column indexes\n",
953
+ " types : data types\n",
954
+ " delim : delemeter\n",
955
+ " \"\"\"\n",
956
+ " (dtypes, cvalues) = extractTypesFromString(types)\t\n",
957
+ " tdata = list()\n",
958
+ " for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
959
+ " trec = list()\n",
960
+ " for indx, value in enumerate(rec):\n",
961
+ " tindx = columns[indx]\n",
962
+ " value = __convToTyped(tindx, value, dtypes)\n",
963
+ " trec.append(value)\n",
964
+ " tdata.append(trec)\n",
965
+ " return tdata\n",
966
+ "\n",
967
+ "def getFileColumnsMinMax(dirPath, columns, dtype, delim=\",\"):\n",
968
+ " \"\"\"\n",
969
+ " extracts numeric matrix from csv file given column indices. For each column return min and max\n",
970
+ " Parameters\n",
971
+ " dirPath : file path\n",
972
+ " columns : column indexes\n",
973
+ " dtype : data type\n",
974
+ " delim : delemeter\n",
975
+ " \"\"\"\n",
976
+ " dtypes = list(map(lambda c : str(c) + \":\" + dtype, columns))\n",
977
+ " dtypes = \",\".join(dtypes)\n",
978
+ " #print(dtypes)\n",
979
+ "\n",
980
+ " tdata = getFileColsAsTypedRecords(dirPath, columns, dtypes, delim)\n",
981
+ " minMax = list()\n",
982
+ " ncola = len(tdata[0])\n",
983
+ " ncole = len(columns)\n",
984
+ " assertEqual(ncola, ncole, \"actual no of columns different from expected\")\n",
985
+ "\n",
986
+ " for ci in range(ncole):\t\n",
987
+ " vmin = sys.float_info.max\n",
988
+ " vmax = sys.float_info.min\n",
989
+ " for r in tdata:\n",
990
+ " cv = r[ci]\n",
991
+ " vmin = cv if cv < vmin else vmin\n",
992
+ " vmax = cv if cv > vmax else vmax\n",
993
+ " mm = (vmin, vmax, vmax - vmin)\n",
994
+ " minMax.append(mm)\n",
995
+ "\n",
996
+ " return minMax\n",
997
+ "\n",
998
+ "\n",
999
+ "def getRecAsTypedRecord(rec, types, delim=None):\n",
1000
+ " \"\"\"\n",
1001
+ " converts record to typed records \n",
1002
+ " Parameters\n",
1003
+ " rec : delemeter separate string or list of string\n",
1004
+ " types : field data types\n",
1005
+ " delim : delemeter\n",
1006
+ " \"\"\"\t\n",
1007
+ " if delim is not None:\n",
1008
+ " rec = rec.split(delim)\n",
1009
+ " (dtypes, cvalues) = extractTypesFromString(types)\t\n",
1010
+ " #print(types)\n",
1011
+ " #print(dtypes)\n",
1012
+ " trec = list()\n",
1013
+ " for ind, value in enumerate(rec):\n",
1014
+ " tvalue = __convToTyped(ind, value, dtypes)\n",
1015
+ " trec.append(tvalue)\n",
1016
+ " return trec\n",
1017
+ "\n",
1018
+ "def __convToTyped(index, value, dtypes):\n",
1019
+ " \"\"\"\n",
1020
+ " convert to typed value \n",
1021
+ " Parameters\n",
1022
+ " index : index in type list\n",
1023
+ " value : data value\n",
1024
+ " dtypes : data type list\n",
1025
+ " \"\"\"\n",
1026
+ " #print(index, value)\n",
1027
+ " dtype = dtypes[index]\n",
1028
+ " tvalue = value\n",
1029
+ " if dtype == \"int\":\n",
1030
+ " tvalue = int(value)\n",
1031
+ " elif dtype == \"float\":\n",
1032
+ " tvalue = float(value)\n",
1033
+ " return tvalue\n",
1034
+ "\n",
1035
+ "\n",
1036
+ "\n",
1037
+ "def extractTypesFromString(types):\n",
1038
+ " \"\"\"\n",
1039
+ " extracts column data types and set values for categorical variables \n",
1040
+ " Parameters\n",
1041
+ " types : encoded type information\n",
1042
+ " \"\"\"\n",
1043
+ " ftypes = types.split(\",\")\n",
1044
+ " dtypes = dict()\n",
1045
+ " cvalues = dict()\n",
1046
+ " for ftype in ftypes:\n",
1047
+ " items = ftype.split(\":\") \n",
1048
+ " cindex = int(items[0])\n",
1049
+ " dtype = items[1]\n",
1050
+ " dtypes[cindex] = dtype\n",
1051
+ " if len(items) == 3:\n",
1052
+ " sitems = items[2].split()\n",
1053
+ " cvalues[cindex] = sitems\n",
1054
+ " return (dtypes, cvalues)\n",
1055
+ "\n",
1056
+ "def getMultipleFileAsInttMatrix(dirPathWithCol, delim=\",\"):\n",
1057
+ " \"\"\"\n",
1058
+ " extracts int matrix from from csv files given column index for each file. \n",
1059
+ " num of columns = number of rows in each file and num of rows = number of files\n",
1060
+ " Parameters\n",
1061
+ " dirPathWithCol: list of file path and collumn index pair\n",
1062
+ " delim : delemeter\n",
1063
+ " \"\"\"\n",
1064
+ " mat = list()\n",
1065
+ " minLen = -1\n",
1066
+ " for path, col in dirPathWithCol:\n",
1067
+ " colVals = getFileColumnAsInt(path, col, delim)\n",
1068
+ " if minLen < 0 or len(colVals) < minLen:\n",
1069
+ " minLen = len(colVals)\n",
1070
+ " mat.append(colVals)\n",
1071
+ "\n",
1072
+ " #make all same length\n",
1073
+ " mat = list(map(lambda li:li[:minLen], mat))\t\n",
1074
+ " return mat\n",
1075
+ "\n",
1076
+ "def getMultipleFileAsFloatMatrix(dirPathWithCol, delim=\",\"):\n",
1077
+ " \"\"\"\n",
1078
+ " extracts float matrix from from csv files given column index for each file. \n",
1079
+ " num of columns = number of rows in each file and num of rows = number of files\n",
1080
+ " Parameters\n",
1081
+ " dirPathWithCol: list of file path and collumn index pair\n",
1082
+ " delim : delemeter\n",
1083
+ " \"\"\"\n",
1084
+ " mat = list()\n",
1085
+ " minLen = -1\n",
1086
+ " for path, col in dirPathWithCol:\n",
1087
+ " colVals = getFileColumnAsFloat(path, col, delim)\n",
1088
+ " if minLen < 0 or len(colVals) < minLen:\n",
1089
+ " minLen = len(colVals)\n",
1090
+ " mat.append(colVals)\n",
1091
+ "\n",
1092
+ " #make all same length\n",
1093
+ " mat = list(map(lambda li:li[:minLen], mat))\n",
1094
+ " return mat\n",
1095
+ "\n",
1096
+ "def writeStrListToFile(ldata, filePath, delem=\",\"):\n",
1097
+ " \"\"\"\n",
1098
+ " writes list of dlem separated string or list of list of string to afile\n",
1099
+ "\n",
1100
+ " Parameters\n",
1101
+ " ldata : list data\n",
1102
+ " filePath : file path\n",
1103
+ " delim : delemeter\n",
1104
+ " \"\"\"\n",
1105
+ " with open(filePath, \"w\") as fh:\n",
1106
+ " for r in ldata:\n",
1107
+ " if type(r) == list:\n",
1108
+ " r = delem.join(r)\n",
1109
+ " fh.write(r + \"\\n\")\n",
1110
+ "\n",
1111
+ "def writeFloatListToFile(ldata, prec, filePath):\n",
1112
+ " \"\"\"\n",
1113
+ " writes float list to file, one value per line\n",
1114
+ "\n",
1115
+ " Parameters\n",
1116
+ " ldata : list data\n",
1117
+ " prec : precision\n",
1118
+ " filePath : file path\n",
1119
+ " \"\"\"\n",
1120
+ " with open(filePath, \"w\") as fh:\n",
1121
+ " for d in ldata:\n",
1122
+ " fh.write(formatFloat(prec, d) + \"\\n\")\n",
1123
+ "\n",
1124
+ "\n",
1125
+ "def takeFirst(elems):\n",
1126
+ " \"\"\"\n",
1127
+ " return fisrt item\n",
1128
+ " Parameters\n",
1129
+ " elems : list of data \n",
1130
+ " \"\"\"\n",
1131
+ " return elems[0]\n",
1132
+ "\n",
1133
+ "def takeSecond(elems):\n",
1134
+ " \"\"\"\n",
1135
+ " return 2nd element\n",
1136
+ " Parameters\n",
1137
+ " elems : list of data \n",
1138
+ " \"\"\"\n",
1139
+ " return elems[1]\n",
1140
+ "\n",
1141
+ "def takeThird(elems):\n",
1142
+ " \"\"\"\n",
1143
+ " returns 3rd element\n",
1144
+ " Parameters\n",
1145
+ " elems : list of data \n",
1146
+ " \"\"\"\n",
1147
+ " return elems[2]\n",
1148
+ "\n",
1149
+ "def addToKeyedCounter(dCounter, key, count=1):\n",
1150
+ " \"\"\"\n",
1151
+ " add to to keyed counter\n",
1152
+ " Parameters\n",
1153
+ " dCounter : dictionary of counters\n",
1154
+ " key : dictionary key\n",
1155
+ " count : count to add\n",
1156
+ " \"\"\"\n",
1157
+ " curCount = dCounter.get(key, 0)\n",
1158
+ " dCounter[key] = curCount + count\n",
1159
+ "\n",
1160
+ "def incrKeyedCounter(dCounter, key):\n",
1161
+ " \"\"\"\n",
1162
+ " increment keyed counter\n",
1163
+ " Parameters\n",
1164
+ " dCounter : dictionary of counters\n",
1165
+ " key : dictionary key\n",
1166
+ " \"\"\"\n",
1167
+ " addToKeyedCounter(dCounter, key, 1)\n",
1168
+ "\n",
1169
+ "def appendKeyedList(dList, key, elem):\n",
1170
+ " \"\"\"\n",
1171
+ " keyed list\n",
1172
+ " Parameters\n",
1173
+ " dList : dictionary of lists\n",
1174
+ " key : dictionary key\n",
1175
+ " elem : value to append\n",
1176
+ " \"\"\"\n",
1177
+ " curList = dList.get(key, [])\n",
1178
+ " curList.append(elem)\n",
1179
+ " dList[key] = curList\n",
1180
+ "\n",
1181
+ "def isNumber(st):\n",
1182
+ " \"\"\"\n",
1183
+ " Returns True is string is a number\n",
1184
+ " Parameters\n",
1185
+ " st : string value\n",
1186
+ " \"\"\"\n",
1187
+ " return st.replace('.','',1).isdigit()\n",
1188
+ "\n",
1189
+ "def removeNan(values):\n",
1190
+ " \"\"\"\n",
1191
+ " removes nan from list\n",
1192
+ " Parameters\n",
1193
+ " values : list data\n",
1194
+ " \"\"\"\n",
1195
+ " return list(filter(lambda v: not math.isnan(v), values))\n",
1196
+ "\n",
1197
+ "def fileRecGen(filePath, delim = \",\"):\n",
1198
+ " \"\"\"\n",
1199
+ " file record generator\n",
1200
+ " Parameters\n",
1201
+ " filePath ; file path\n",
1202
+ " delim : delemeter\n",
1203
+ " \"\"\"\n",
1204
+ " with open(filePath, \"r\") as fp:\n",
1205
+ " for line in fp:\t\n",
1206
+ " line = line[:-1]\n",
1207
+ " if delim is not None:\n",
1208
+ " line = line.split(delim)\n",
1209
+ " yield line\n",
1210
+ "\n",
1211
+ "def fileSelFieldsRecGen(dirPath, columns, delim=\",\"):\n",
1212
+ " \"\"\"\n",
1213
+ " file record generator given column indices \n",
1214
+ " Parameters\n",
1215
+ " filePath ; file path\n",
1216
+ " columns : column indexes as int array or coma separated string\n",
1217
+ " delim : delemeter\n",
1218
+ " \"\"\"\n",
1219
+ " if type(columns) == str:\n",
1220
+ " columns = strToIntArray(columns, delim)\n",
1221
+ " for rec in fileRecGen(dirPath, delim):\n",
1222
+ " extracted = extractList(rec, columns)\n",
1223
+ " yield extracted\n",
1224
+ "\n",
1225
+ "def fileFiltRecGen(filePath, filt, delim = \",\"):\n",
1226
+ " \"\"\"\n",
1227
+ " file record generator with row filter applied\n",
1228
+ " Parameters\n",
1229
+ " filePath ; file path\n",
1230
+ " filt : row filter\n",
1231
+ " delim : delemeter\n",
1232
+ " \"\"\"\n",
1233
+ " with open(filePath, \"r\") as fp:\n",
1234
+ " for line in fp:\t\n",
1235
+ " line = line[:-1]\n",
1236
+ " if delim is not None:\n",
1237
+ " line = line.split(delim)\n",
1238
+ " if filt(line):\n",
1239
+ " yield line\n",
1240
+ "\n",
1241
+ "def fileFiltSelFieldsRecGen(filePath, filt, columns, delim = \",\"):\n",
1242
+ " \"\"\"\n",
1243
+ " file record generator with row and column filter applied\n",
1244
+ " Parameters\n",
1245
+ " filePath ; file path\n",
1246
+ " filt : row filter\n",
1247
+ " columns : column indexes as int array or coma separated string\n",
1248
+ " delim : delemeter\n",
1249
+ " \"\"\"\n",
1250
+ " columns = strToIntArray(columns, delim)\n",
1251
+ " with open(filePath, \"r\") as fp:\n",
1252
+ " for line in fp:\t\n",
1253
+ " line = line[:-1]\n",
1254
+ " if delim is not None:\n",
1255
+ " line = line.split(delim)\n",
1256
+ " if filt(line):\n",
1257
+ " selected = extractList(line, columns)\n",
1258
+ " yield selected\n",
1259
+ "\n",
1260
+ "def fileTypedRecGen(filePath, ftypes, delim = \",\"):\n",
1261
+ " \"\"\"\n",
1262
+ " file typed record generator\n",
1263
+ " Parameters\n",
1264
+ " filePath ; file path\n",
1265
+ " ftypes : list of field types\n",
1266
+ " delim : delemeter\n",
1267
+ " \"\"\"\n",
1268
+ " with open(filePath, \"r\") as fp:\n",
1269
+ " for line in fp:\t\n",
1270
+ " line = line[:-1]\n",
1271
+ " line = line.split(delim)\n",
1272
+ " for i in range(0, len(ftypes), 2):\n",
1273
+ " ci = ftypes[i]\n",
1274
+ " dtype = ftypes[i+1]\n",
1275
+ " assertLesser(ci, len(line), \"index out of bound\")\n",
1276
+ " if dtype == \"int\":\n",
1277
+ " line[ci] = int(line[ci])\n",
1278
+ " elif dtype == \"float\":\n",
1279
+ " line[ci] = float(line[ci])\n",
1280
+ " else:\n",
1281
+ " exitWithMsg(\"invalid data type\")\n",
1282
+ " yield line\n",
1283
+ "\n",
1284
+ "def fileMutatedFieldsRecGen(dirPath, mutator, delim=\",\"):\n",
1285
+ " \"\"\"\n",
1286
+ " file record generator with some columns mutated \n",
1287
+ " Parameters\n",
1288
+ " dirPath ; file path\n",
1289
+ " mutator : row field mutator\n",
1290
+ " delim : delemeter\n",
1291
+ " \"\"\"\n",
1292
+ " for rec in fileRecGen(dirPath, delim):\n",
1293
+ " mutated = mutator(rec)\n",
1294
+ " yield mutated\n",
1295
+ "\n",
1296
+ "def tableSelFieldsFilter(tdata, columns):\n",
1297
+ " \"\"\"\n",
1298
+ " gets tabular data for selected columns \n",
1299
+ " Parameters\n",
1300
+ " tdata : tabular data\n",
1301
+ " columns : column indexes\n",
1302
+ " \"\"\"\n",
1303
+ " if areAllFieldsIncluded(tdata[0], columns):\n",
1304
+ " ntdata = tdata\n",
1305
+ " else:\n",
1306
+ " ntdata = list()\n",
1307
+ " for rec in tdata:\n",
1308
+ " #print(rec)\n",
1309
+ " #print(columns)\n",
1310
+ " nrec = extractList(rec, columns)\n",
1311
+ " ntdata.append(nrec)\n",
1312
+ " return ntdata\n",
1313
+ "\n",
1314
+ "\n",
1315
+ "def areAllFieldsIncluded(ldata, columns):\n",
1316
+ " \"\"\"\n",
1317
+ " return True id all indexes are in the columns\n",
1318
+ " Parameters\n",
1319
+ " ldata : list data\n",
1320
+ " columns : column indexes\n",
1321
+ " \"\"\"\n",
1322
+ " return list(range(len(ldata))) == columns\n",
1323
+ "\n",
1324
+ "def asIntList(items):\n",
1325
+ " \"\"\"\n",
1326
+ " returns int list\n",
1327
+ " Parameters\n",
1328
+ " items : list data\n",
1329
+ " \"\"\"\n",
1330
+ " return [int(i) for i in items]\n",
1331
+ "\n",
1332
+ "def asFloatList(items):\n",
1333
+ " \"\"\"\n",
1334
+ " returns float list\n",
1335
+ " Parameters\n",
1336
+ " items : list data\n",
1337
+ " \"\"\"\n",
1338
+ " return [float(i) for i in items]\n",
1339
+ "\n",
1340
+ "def pastTime(interval, unit):\n",
1341
+ " \"\"\"\n",
1342
+ " current and past time\n",
1343
+ " Parameters\n",
1344
+ " interval : time interval\n",
1345
+ " unit: time unit\n",
1346
+ " \"\"\"\n",
1347
+ " curTime = int(time.time())\n",
1348
+ " if unit == \"d\":\n",
1349
+ " pastTime = curTime - interval * secInDay\n",
1350
+ " elif unit == \"h\":\n",
1351
+ " pastTime = curTime - interval * secInHour\n",
1352
+ " elif unit == \"m\":\n",
1353
+ " pastTime = curTime - interval * secInMinute\n",
1354
+ " else:\n",
1355
+ " raise ValueError(\"invalid time unit \" + unit)\n",
1356
+ " return (curTime, pastTime)\n",
1357
+ "\n",
1358
+ "def minuteAlign(ts):\n",
1359
+ " \"\"\"\n",
1360
+ " minute aligned time\t\n",
1361
+ " Parameters\n",
1362
+ " ts : time stamp in sec\n",
1363
+ " \"\"\"\n",
1364
+ " return int((ts / secInMinute)) * secInMinute\n",
1365
+ "\n",
1366
+ "def multMinuteAlign(ts, min):\n",
1367
+ " \"\"\"\n",
1368
+ " multi minute aligned time\t\n",
1369
+ " Parameters\n",
1370
+ " ts : time stamp in sec\n",
1371
+ " min : minute value\n",
1372
+ " \"\"\"\n",
1373
+ " intv = secInMinute * min\n",
1374
+ " return int((ts / intv)) * intv\n",
1375
+ "\n",
1376
+ "def hourAlign(ts):\n",
1377
+ " \"\"\"\n",
1378
+ " hour aligned time\n",
1379
+ " Parameters\n",
1380
+ " ts : time stamp in sec\n",
1381
+ " \"\"\"\n",
1382
+ " return int((ts / secInHour)) * secInHour\n",
1383
+ "\n",
1384
+ "def hourOfDayAlign(ts, hour):\n",
1385
+ " \"\"\"\n",
1386
+ " hour of day aligned time\n",
1387
+ " Parameters\n",
1388
+ " ts : time stamp in sec\n",
1389
+ " hour : hour of day\n",
1390
+ " \"\"\"\n",
1391
+ " day = int(ts / secInDay)\n",
1392
+ " return (24 * day + hour) * secInHour\n",
1393
+ "\n",
1394
+ "def dayAlign(ts):\n",
1395
+ " \"\"\"\n",
1396
+ " day aligned time\n",
1397
+ " Parameters\n",
1398
+ " ts : time stamp in sec\n",
1399
+ " \"\"\"\n",
1400
+ " return int(ts / secInDay) * secInDay\n",
1401
+ "\n",
1402
+ "def timeAlign(ts, unit):\n",
1403
+ " \"\"\"\n",
1404
+ " boundary alignment of time\n",
1405
+ " Parameters\n",
1406
+ " ts : time stamp in sec\n",
1407
+ " unit : unit of time\n",
1408
+ " \"\"\"\n",
1409
+ " alignedTs = 0\n",
1410
+ " if unit == \"s\":\n",
1411
+ " alignedTs = ts\n",
1412
+ " elif unit == \"m\":\n",
1413
+ " alignedTs = minuteAlign(ts)\n",
1414
+ " elif unit == \"h\":\n",
1415
+ " alignedTs = hourAlign(ts)\n",
1416
+ " elif unit == \"d\":\n",
1417
+ " alignedTs = dayAlign(ts)\n",
1418
+ " else:\n",
1419
+ " raise ValueError(\"invalid time unit\")\n",
1420
+ " return \talignedTs\n",
1421
+ "\n",
1422
+ "def monthOfYear(ts):\n",
1423
+ " \"\"\"\n",
1424
+ " month of year\n",
1425
+ " Parameters\n",
1426
+ " ts : time stamp in sec\n",
1427
+ " \"\"\"\n",
1428
+ " rem = ts % secInYear\n",
1429
+ " dow = int(rem / secInMonth)\n",
1430
+ " return dow\n",
1431
+ "\n",
1432
+ "def dayOfWeek(ts):\n",
1433
+ " \"\"\"\n",
1434
+ " day of week\n",
1435
+ " Parameters\n",
1436
+ " ts : time stamp in sec\n",
1437
+ " \"\"\"\n",
1438
+ " rem = ts % secInWeek\n",
1439
+ " dow = int(rem / secInDay)\n",
1440
+ " return dow\n",
1441
+ "\n",
1442
+ "def hourOfDay(ts):\n",
1443
+ " \"\"\"\n",
1444
+ " hour of day\n",
1445
+ " Parameters\n",
1446
+ " ts : time stamp in sec\n",
1447
+ " \"\"\"\n",
1448
+ " rem = ts % secInDay\n",
1449
+ " hod = int(rem / secInHour)\n",
1450
+ " return hod\n",
1451
+ "\n",
1452
+ "def processCmdLineArgs(expectedTypes, usage):\n",
1453
+ " \"\"\"\n",
1454
+ " process command line args and returns args as typed values\n",
1455
+ " Parameters\n",
1456
+ " expectedTypes : expected data types of arguments\n",
1457
+ " usage : usage message string\n",
1458
+ " \"\"\"\n",
1459
+ " args = []\n",
1460
+ " numComLineArgs = len(sys.argv)\n",
1461
+ " numExpected = len(expectedTypes)\n",
1462
+ " if (numComLineArgs - 1 == len(expectedTypes)):\n",
1463
+ " try:\n",
1464
+ " for i in range(0, numExpected):\n",
1465
+ " if (expectedTypes[i] == typeInt):\n",
1466
+ " args.append(int(sys.argv[i+1]))\n",
1467
+ " elif (expectedTypes[i] == typeFloat):\n",
1468
+ " args.append(float(sys.argv[i+1]))\n",
1469
+ " elif (expectedTypes[i] == typeString):\n",
1470
+ " args.append(sys.argv[i+1])\n",
1471
+ " except ValueError:\n",
1472
+ " print (\"expected number of command line arguments found but there is type mis match\")\n",
1473
+ " sys.exit(1)\n",
1474
+ " else:\n",
1475
+ " print (\"expected number of command line arguments not found\")\n",
1476
+ " print (usage)\n",
1477
+ " sys.exit(1)\n",
1478
+ " return args\n",
1479
+ "\n",
1480
+ "def mutateString(val, numMutate, ctype):\n",
1481
+ " \"\"\"\n",
1482
+ " mutate string multiple times\n",
1483
+ " Parameters\n",
1484
+ " val : string value\n",
1485
+ " numMutate : num of mutations\n",
1486
+ " ctype : type of character to mutate with\n",
1487
+ " \"\"\"\n",
1488
+ " mutations = set()\n",
1489
+ " count = 0\n",
1490
+ " while count < numMutate:\n",
1491
+ " j = randint(0, len(val)-1)\n",
1492
+ " if j not in mutations:\n",
1493
+ " if ctype == \"alpha\":\n",
1494
+ " ch = selectRandomFromList(alphaTokens)\n",
1495
+ " elif ctype == \"num\":\n",
1496
+ " ch = selectRandomFromList(numTokens)\n",
1497
+ " elif ctype == \"any\":\n",
1498
+ " ch = selectRandomFromList(tokens)\n",
1499
+ " val = val[:j] + ch + val[j+1:]\n",
1500
+ " mutations.add(j)\n",
1501
+ " count += 1\n",
1502
+ " return val\n",
1503
+ "\n",
1504
+ "def mutateList(values, numMutate, vmin, vmax):\n",
1505
+ " \"\"\"\n",
1506
+ " mutate list multiple times\n",
1507
+ " Parameters\n",
1508
+ " values : list value\n",
1509
+ " numMutate : num of mutations\n",
1510
+ " vmin : minimum of value range\n",
1511
+ " vmax : maximum of value range\n",
1512
+ " \"\"\"\n",
1513
+ " mutations = set()\n",
1514
+ " count = 0\n",
1515
+ " while count < numMutate:\n",
1516
+ " j = randint(0, len(values)-1)\n",
1517
+ " if j not in mutations:\n",
1518
+ " values[j] = np.random.uniform(vmin, vmax)\n",
1519
+ " count += 1\n",
1520
+ " return values\n",
1521
+ "\n",
1522
+ "\n",
1523
+ "def swap(values, first, second):\n",
1524
+ " \"\"\"\n",
1525
+ " swap two elements\n",
1526
+ " Parameters\n",
1527
+ " values : list value\n",
1528
+ " first : first swap position\n",
1529
+ " second : second swap position\n",
1530
+ " \"\"\"\n",
1531
+ " t = values[first]\n",
1532
+ " values[first] = values[second]\n",
1533
+ " values[second] = t\n",
1534
+ "\n",
1535
+ "def swapBetweenLists(values1, values2):\n",
1536
+ " \"\"\"\n",
1537
+ " swap two elements between 2 lists\n",
1538
+ " Parameters\n",
1539
+ " values1 : first list of values\n",
1540
+ " values2 : second list of values\n",
1541
+ " \"\"\"\n",
1542
+ " p1 = randint(0, len(values1)-1)\n",
1543
+ " p2 = randint(0, len(values2)-1)\n",
1544
+ " tmp = values1[p1]\t\n",
1545
+ " values1[p1] = values2[p2]\n",
1546
+ " values2[p2] = tmp\n",
1547
+ "\n",
1548
+ "def safeAppend(values, value):\n",
1549
+ " \"\"\"\n",
1550
+ " append only if not None\n",
1551
+ " Parameters\n",
1552
+ " values : list value\n",
1553
+ " value : value to append\n",
1554
+ " \"\"\"\n",
1555
+ " if value is not None:\n",
1556
+ " values.append(value)\n",
1557
+ "\n",
1558
+ "def getAllIndex(ldata, fldata):\n",
1559
+ " \"\"\"\n",
1560
+ " get ALL indexes of list elements\n",
1561
+ " Parameters\n",
1562
+ " ldata : list data to find index in\n",
1563
+ " fldata : list data for values for index look up\n",
1564
+ " \"\"\"\n",
1565
+ " return list(map(lambda e : fldata.index(e), ldata))\n",
1566
+ "\n",
1567
+ "def findIntersection(lOne, lTwo):\n",
1568
+ " \"\"\"\n",
1569
+ " find intersection elements between 2 lists\n",
1570
+ " Parameters\n",
1571
+ " lOne : first list of data\n",
1572
+ " lTwo : second list of data\n",
1573
+ " \"\"\"\n",
1574
+ " sOne = set(lOne)\n",
1575
+ " sTwo = set(lTwo)\n",
1576
+ " sInt = sOne.intersection(sTwo)\n",
1577
+ " return list(sInt)\n",
1578
+ "\n",
1579
+ "def isIntvOverlapped(rOne, rTwo):\n",
1580
+ " \"\"\"\n",
1581
+ " checks overlap between 2 intervals\n",
1582
+ " Parameters\n",
1583
+ " rOne : first interval boundaries\n",
1584
+ " rTwo : second interval boundaries\n",
1585
+ " \"\"\"\n",
1586
+ " clear = rOne[1] <= rTwo[0] or rOne[0] >= rTwo[1] \n",
1587
+ " return not clear\n",
1588
+ "\n",
1589
+ "def isIntvLess(rOne, rTwo):\n",
1590
+ " \"\"\"\n",
1591
+ " checks if first iterval is less than second\n",
1592
+ " Parameters\n",
1593
+ " rOne : first interval boundaries\n",
1594
+ " rTwo : second interval boundaries\n",
1595
+ " \"\"\"\n",
1596
+ " less = rOne[1] <= rTwo[0] \n",
1597
+ " return less\n",
1598
+ "\n",
1599
+ "def findRank(e, values):\n",
1600
+ " \"\"\"\n",
1601
+ " find rank of value in a list\n",
1602
+ " Parameters\n",
1603
+ " e : value to compare with\n",
1604
+ " values : list data\n",
1605
+ " \"\"\"\n",
1606
+ " count = 1\n",
1607
+ " for ve in values:\n",
1608
+ " if ve < e:\n",
1609
+ " count += 1\n",
1610
+ " return count\n",
1611
+ "\n",
1612
+ "def findRanks(toBeRanked, values):\n",
1613
+ " \"\"\"\n",
1614
+ " find ranks of values in one list in another list\n",
1615
+ " Parameters\n",
1616
+ " toBeRanked : list of values for which ranks are found\n",
1617
+ " values : list in which rank is found : \n",
1618
+ " \"\"\"\n",
1619
+ " return list(map(lambda e: findRank(e, values), toBeRanked))\n",
1620
+ "\n",
1621
+ "def formatFloat(prec, value, label = None):\n",
1622
+ " \"\"\"\n",
1623
+ " formats a float with optional label\n",
1624
+ " Parameters\n",
1625
+ " prec : precision\n",
1626
+ " value : data value\n",
1627
+ " label : label for data\n",
1628
+ " \"\"\"\n",
1629
+ " st = (label + \" \") if label else \"\"\n",
1630
+ " formatter = \"{:.\" + str(prec) + \"f}\" \n",
1631
+ " return st + formatter.format(value)\n",
1632
+ "\n",
1633
+ "def formatAny(value, label = None):\n",
1634
+ " \"\"\"\n",
1635
+ " formats any obkect with optional label\n",
1636
+ " Parameters\n",
1637
+ " value : data value\n",
1638
+ " label : label for data\n",
1639
+ " \"\"\"\n",
1640
+ " st = (label + \" \") if label else \"\"\n",
1641
+ " return st + str(value)\n",
1642
+ "\n",
1643
+ "def printList(values):\n",
1644
+ " \"\"\"\n",
1645
+ " pretty print list\n",
1646
+ " Parameters\n",
1647
+ " values : list of values\n",
1648
+ " \"\"\"\n",
1649
+ " for v in values:\n",
1650
+ " print(v)\n",
1651
+ "\n",
1652
+ "def printMap(values, klab, vlab, precision, offset=16):\n",
1653
+ " \"\"\"\n",
1654
+ " pretty print hash map\n",
1655
+ " Parameters\n",
1656
+ " values : dictionary of values\n",
1657
+ " klab : label for key\n",
1658
+ " vlab : label for value\n",
1659
+ " precision : precision\n",
1660
+ " offset : left justify offset\n",
1661
+ " \"\"\"\n",
1662
+ " print(klab.ljust(offset, \" \") + vlab)\n",
1663
+ " for k in values.keys():\n",
1664
+ " v = values[k]\n",
1665
+ " ks = toStr(k, precision).ljust(offset, \" \")\n",
1666
+ " vs = toStr(v, precision)\n",
1667
+ " print(ks + vs)\n",
1668
+ "\n",
1669
+ "def printPairList(values, lab1, lab2, precision, offset=16):\n",
1670
+ " \"\"\"\n",
1671
+ " pretty print list of pairs\n",
1672
+ " Parameters\n",
1673
+ " values : dictionary of values\n",
1674
+ " lab1 : first label\n",
1675
+ " lab2 : second label\n",
1676
+ " precision : precision\n",
1677
+ " offset : left justify offset\n",
1678
+ " \"\"\"\n",
1679
+ " print(lab1.ljust(offset, \" \") + lab2)\n",
1680
+ " for (v1, v2) in values:\n",
1681
+ " sv1 = toStr(v1, precision).ljust(offset, \" \")\n",
1682
+ " sv2 = toStr(v2, precision)\n",
1683
+ " print(sv1 + sv2)\n",
1684
+ "\n",
1685
+ "def createMap(*values):\n",
1686
+ " \"\"\"\n",
1687
+ " create disctionary with results\n",
1688
+ " Parameters\n",
1689
+ " values : sequence of key value pairs\n",
1690
+ " \"\"\"\n",
1691
+ " result = dict()\n",
1692
+ " for i in range(0, len(values), 2):\n",
1693
+ " result[values[i]] = values[i+1]\n",
1694
+ " return result\n",
1695
+ "\n",
1696
+ "def getColMinMax(table, col):\n",
1697
+ " \"\"\"\n",
1698
+ " return min, max values of a column\n",
1699
+ " Parameters\n",
1700
+ " table : tabular data\n",
1701
+ " col : column index\n",
1702
+ " \"\"\"\n",
1703
+ " vmin = None\n",
1704
+ " vmax = None\n",
1705
+ " for rec in table:\n",
1706
+ " value = rec[col]\n",
1707
+ " if vmin is None:\n",
1708
+ " vmin = value\n",
1709
+ " vmax = value\n",
1710
+ " else:\n",
1711
+ " if value < vmin:\n",
1712
+ " vmin = value\n",
1713
+ " elif value > vmax:\n",
1714
+ " vmax = value\n",
1715
+ " return (vmin, vmax, vmax - vmin)\n",
1716
+ "\n",
1717
+ "def createLogger(name, logFilePath, logLevName):\n",
1718
+ " \"\"\"\n",
1719
+ " creates logger\n",
1720
+ " Parameters\n",
1721
+ " name : logger name\n",
1722
+ " logFilePath : log file path\n",
1723
+ " logLevName : log level\n",
1724
+ " \"\"\"\n",
1725
+ " logger = logging.getLogger(name)\n",
1726
+ " fHandler = logging.handlers.RotatingFileHandler(logFilePath, maxBytes=1048576, backupCount=4)\n",
1727
+ " logLev = logLevName.lower()\n",
1728
+ " if logLev == \"debug\":\n",
1729
+ " logLevel = logging.DEBUG\n",
1730
+ " elif logLev == \"info\":\n",
1731
+ " logLevel = logging.INFO\n",
1732
+ " elif logLev == \"warning\":\n",
1733
+ " logLevel = logging.WARNING\n",
1734
+ " elif logLev == \"error\":\n",
1735
+ " logLevel = logging.ERROR\n",
1736
+ " elif logLev == \"critical\":\n",
1737
+ " logLevel = logging.CRITICAL\n",
1738
+ " else:\n",
1739
+ " raise ValueError(\"invalid log level name \" + logLevelName)\n",
1740
+ " fHandler.setLevel(logLevel)\n",
1741
+ " fFormat = logging.Formatter(\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\")\n",
1742
+ " fHandler.setFormatter(fFormat)\n",
1743
+ " logger.addHandler(fHandler)\n",
1744
+ " logger.setLevel(logLevel)\n",
1745
+ " return logger\n",
1746
+ "\n",
1747
+ "@contextmanager\n",
1748
+ "def suppressStdout():\n",
1749
+ " \"\"\"\n",
1750
+ " suppress stdout\n",
1751
+ " Parameters\n",
1752
+ " \"\"\"\n",
1753
+ " with open(os.devnull, \"w\") as devnull:\n",
1754
+ " oldStdout = sys.stdout\n",
1755
+ " sys.stdout = devnull\n",
1756
+ " try: \n",
1757
+ " yield\n",
1758
+ " finally:\n",
1759
+ " sys.stdout = oldStdout\n",
1760
+ "\n",
1761
+ "def exitWithMsg(msg):\n",
1762
+ " \"\"\"\n",
1763
+ " print message and exit\n",
1764
+ " Parameters\n",
1765
+ " msg : message\n",
1766
+ " \"\"\"\n",
1767
+ " print(msg + \" -- quitting\")\n",
1768
+ " sys.exit(0)\n",
1769
+ "\n",
1770
+ "def drawLine(data, yscale=None):\n",
1771
+ " \"\"\"\n",
1772
+ " line plot\n",
1773
+ " Parameters\n",
1774
+ " data : list data\n",
1775
+ " yscale : y axis scale\n",
1776
+ " \"\"\"\n",
1777
+ " plt.plot(data)\n",
1778
+ " if yscale:\n",
1779
+ " step = int(yscale / 10)\n",
1780
+ " step = int(step / 10) * 10\n",
1781
+ " plt.yticks(range(0, yscale, step))\n",
1782
+ " plt.show()\n",
1783
+ "\n",
1784
+ "def drawPlot(x, y, xlabel, ylabel):\n",
1785
+ " \"\"\"\n",
1786
+ " line plot\n",
1787
+ " Parameters\n",
1788
+ " x : x values\n",
1789
+ " y : y values\n",
1790
+ " xlabel : x axis label\n",
1791
+ " ylabel : y axis label\n",
1792
+ " \"\"\"\n",
1793
+ " plt.plot(x,y)\n",
1794
+ " plt.xlabel(xlabel)\n",
1795
+ " plt.ylabel(ylabel)\n",
1796
+ " plt.show()\n",
1797
+ "\n",
1798
+ "def drawPairPlot(x, y1, y2, xlabel,ylabel, y1label, y2label):\n",
1799
+ " \"\"\"\n",
1800
+ " line plot of 2 lines\n",
1801
+ " Parameters\n",
1802
+ " x : x values\n",
1803
+ " y1 : first y values\n",
1804
+ " y2 : second y values\n",
1805
+ " xlabel : x labbel\n",
1806
+ " ylabel : y label\n",
1807
+ " y1label : first plot label\n",
1808
+ " y2label : second plot label\n",
1809
+ " \"\"\"\n",
1810
+ " plt.plot(x, y1, label = y1label)\n",
1811
+ " plt.plot(x, y2, label = y2label)\n",
1812
+ " plt.xlabel(xlabel)\n",
1813
+ " plt.ylabel(ylabel)\n",
1814
+ " plt.legend()\n",
1815
+ " plt.show()\n",
1816
+ "\n",
1817
+ "def drawHist(ldata, myTitle, myXlabel, myYlabel, nbins=10):\n",
1818
+ " \"\"\"\n",
1819
+ " draw histogram\n",
1820
+ " Parameters\n",
1821
+ " ldata : list data\n",
1822
+ " myTitle : title\n",
1823
+ " myXlabel : x label\n",
1824
+ " myYlabel : y label \n",
1825
+ " nbins : num of bins\n",
1826
+ " \"\"\"\n",
1827
+ " plt.hist(ldata, bins=nbins, density=True)\n",
1828
+ " plt.title(myTitle)\n",
1829
+ " plt.xlabel(myXlabel)\n",
1830
+ " plt.ylabel(myYlabel)\n",
1831
+ " plt.show()\t\n",
1832
+ "\n",
1833
+ "def saveObject(obj, filePath):\n",
1834
+ " \"\"\"\n",
1835
+ " saves an object\n",
1836
+ " Parameters\n",
1837
+ " obj : object\n",
1838
+ " filePath : file path for saved object\n",
1839
+ " \"\"\"\n",
1840
+ " with open(filePath, \"wb\") as outfile:\n",
1841
+ " pickle.dump(obj,outfile)\n",
1842
+ "\n",
1843
+ "def restoreObject(filePath):\n",
1844
+ " \"\"\"\n",
1845
+ " restores an object\n",
1846
+ " Parameters\n",
1847
+ " filePath : file path to restore object from\n",
1848
+ " \"\"\"\n",
1849
+ " with open(filePath, \"rb\") as infile:\n",
1850
+ " obj = pickle.load(infile)\n",
1851
+ " return obj\n",
1852
+ "\n",
1853
+ "def isNumeric(data):\n",
1854
+ " \"\"\"\n",
1855
+ " true if all elements int or float\n",
1856
+ " Parameters\n",
1857
+ " data : numeric data list\n",
1858
+ " \"\"\"\n",
1859
+ " if type(data) == list or type(data) == np.ndarray:\n",
1860
+ " col = pd.Series(data)\n",
1861
+ " else:\n",
1862
+ " col = data\n",
1863
+ " return col.dtype == np.int32 or col.dtype == np.int64 or col.dtype == np.float32 or col.dtype == np.float64\n",
1864
+ "\n",
1865
+ "def isInteger(data):\n",
1866
+ " \"\"\"\n",
1867
+ " true if all elements int \n",
1868
+ " Parameters\n",
1869
+ " data : numeric data list\n",
1870
+ " \"\"\"\n",
1871
+ " if type(data) == list or type(data) == np.ndarray:\n",
1872
+ " col = pd.Series(data)\n",
1873
+ " else:\n",
1874
+ " col = data\n",
1875
+ " return col.dtype == np.int32 or col.dtype == np.int64\n",
1876
+ "\n",
1877
+ "def isFloat(data):\n",
1878
+ " \"\"\"\n",
1879
+ " true if all elements float\n",
1880
+ " Parameters\n",
1881
+ " data : numeric data list\n",
1882
+ " \"\"\"\n",
1883
+ " if type(data) == list or type(data) == np.ndarray:\n",
1884
+ " col = pd.Series(data)\n",
1885
+ " else:\n",
1886
+ " col = data\n",
1887
+ " return col.dtype == np.float32 or col.dtype == np.float64\n",
1888
+ "\n",
1889
+ "def isBinary(data):\n",
1890
+ " \"\"\"\n",
1891
+ " true if all elements either 0 or 1\n",
1892
+ " Parameters\n",
1893
+ " data : binary data\n",
1894
+ " \"\"\"\n",
1895
+ " re = next((d for d in data if not (type(d) == int and (d == 0 or d == 1))), None)\n",
1896
+ " return (re is None)\n",
1897
+ "\n",
1898
+ "def isCategorical(data):\n",
1899
+ " \"\"\"\n",
1900
+ " true if all elements int or string\n",
1901
+ " Parameters\n",
1902
+ " data : data value\n",
1903
+ " \"\"\"\n",
1904
+ " re = next((d for d in data if not (type(d) == int or type(d) == str)), None)\n",
1905
+ " return (re is None)\n",
1906
+ "\n",
1907
+ "def assertEqual(value, veq, msg):\n",
1908
+ " \"\"\"\n",
1909
+ " assert equal to\n",
1910
+ " Parameters\n",
1911
+ " value : value\n",
1912
+ " veq : value to be equated with\n",
1913
+ " msg : error msg\n",
1914
+ " \"\"\"\n",
1915
+ " assert value == veq , msg\n",
1916
+ "\n",
1917
+ "def assertGreater(value, vmin, msg):\n",
1918
+ " \"\"\"\n",
1919
+ " assert greater than \n",
1920
+ " Parameters\n",
1921
+ " value : value\n",
1922
+ " vmin : minimum value\n",
1923
+ " msg : error msg\n",
1924
+ " \"\"\"\n",
1925
+ " assert value > vmin , msg\n",
1926
+ "\n",
1927
+ "def assertGreaterEqual(value, vmin, msg):\n",
1928
+ " \"\"\"\n",
1929
+ " assert greater than \n",
1930
+ " Parameters\n",
1931
+ " value : value\n",
1932
+ " vmin : minimum value\n",
1933
+ " msg : error msg\n",
1934
+ " \"\"\"\n",
1935
+ " assert value >= vmin , msg\n",
1936
+ "\n",
1937
+ "def assertLesser(value, vmax, msg):\n",
1938
+ " \"\"\"\n",
1939
+ " assert less than\n",
1940
+ " Parameters\n",
1941
+ " value : value\n",
1942
+ " vmax : maximum value\n",
1943
+ " msg : error msg\n",
1944
+ " \"\"\"\n",
1945
+ " assert value < vmax , msg\n",
1946
+ "\n",
1947
+ "def assertLesserEqual(value, vmax, msg):\n",
1948
+ " \"\"\"\n",
1949
+ " assert less than\n",
1950
+ " Parameters\n",
1951
+ " value : value\n",
1952
+ " vmax : maximum value\n",
1953
+ " msg : error msg\n",
1954
+ " \"\"\"\n",
1955
+ " assert value <= vmax , msg\n",
1956
+ "\n",
1957
+ "def assertWithinRange(value, vmin, vmax, msg):\n",
1958
+ " \"\"\"\n",
1959
+ " assert within range\n",
1960
+ " Parameters\n",
1961
+ " value : value\n",
1962
+ " vmin : minimum value\n",
1963
+ " vmax : maximum value\n",
1964
+ " msg : error msg\n",
1965
+ " \"\"\"\n",
1966
+ " assert value >= vmin and value <= vmax, msg\n",
1967
+ "\n",
1968
+ "def assertInList(value, values, msg):\n",
1969
+ " \"\"\"\n",
1970
+ " assert contains in a list\n",
1971
+ " Parameters\n",
1972
+ " value ; balue to check for inclusion\n",
1973
+ " values : list data\n",
1974
+ " msg : error msg\n",
1975
+ " \"\"\"\n",
1976
+ " assert value in values, msg\n",
1977
+ "\n",
1978
+ "def maxListDist(l1, l2):\n",
1979
+ " \"\"\"\n",
1980
+ " maximum list element difference between 2 lists\n",
1981
+ " Parameters\n",
1982
+ " l1 : first list data\n",
1983
+ " l2 : second list data\n",
1984
+ " \"\"\"\n",
1985
+ " dist = max(list(map(lambda v : abs(v[0] - v[1]), zip(l1, l2))))\t\n",
1986
+ " return dist\n",
1987
+ "\n",
1988
+ "def fileLineCount(fPath):\n",
1989
+ " \"\"\" \n",
1990
+ " number of lines ina file \n",
1991
+ " Parameters\n",
1992
+ " fPath : file path\n",
1993
+ " \"\"\"\n",
1994
+ " with open(fPath) as f:\n",
1995
+ " for i, li in enumerate(f):\n",
1996
+ " pass\n",
1997
+ " return (i + 1)\n",
1998
+ "\n",
1999
+ "def getAlphaNumCharCount(sdata):\n",
2000
+ " \"\"\" \n",
2001
+ " number of alphabetic and numeric charcters in a string \n",
2002
+ " Parameters\n",
2003
+ " sdata : string data\n",
2004
+ " \"\"\"\n",
2005
+ " acount = 0\n",
2006
+ " ncount = 0\n",
2007
+ " scount = 0\n",
2008
+ " ocount = 0\n",
2009
+ " assertEqual(type(sdata), str, \"input must be string\")\n",
2010
+ " for c in sdata:\n",
2011
+ " if c.isnumeric():\n",
2012
+ " ncount += 1\n",
2013
+ " elif c.isalpha():\n",
2014
+ " acount += 1\n",
2015
+ " elif c.isspace():\n",
2016
+ " scount += 1\n",
2017
+ " else:\n",
2018
+ " ocount += 1\n",
2019
+ " r = (acount, ncount, ocount)\n",
2020
+ " return r\n",
2021
+ "\n",
2022
+ "class StepFunction:\n",
2023
+ " \"\"\"\n",
2024
+ " step function\n",
2025
+ " Parameters\n",
2026
+ " \"\"\"\n",
2027
+ " def __init__(self, *values):\n",
2028
+ " \"\"\"\n",
2029
+ " initilizer\n",
2030
+ "\n",
2031
+ " Parameters\n",
2032
+ " values : list of tuples, wich each tuple containing 2 x values and corresponding y value\n",
2033
+ " \"\"\"\n",
2034
+ " self.points = values\n",
2035
+ "\n",
2036
+ " def find(self, x):\n",
2037
+ " \"\"\"\n",
2038
+ " finds step function value\n",
2039
+ "\n",
2040
+ " Parameters\n",
2041
+ " x : x value\n",
2042
+ " \"\"\"\n",
2043
+ " found = False\n",
2044
+ " y = 0\n",
2045
+ " for p in self.points:\n",
2046
+ " if (x >= p[0] and x < p[1]):\n",
2047
+ " y = p[2]\n",
2048
+ " found = True\n",
2049
+ " break\n",
2050
+ "\n",
2051
+ " if not found:\n",
2052
+ " l = len(self.points)\n",
2053
+ " if (x < self.points[0][0]):\n",
2054
+ " y = self.points[0][2]\n",
2055
+ " elif (x > self.points[l-1][1]):\n",
2056
+ " y = self.points[l-1][2]\n",
2057
+ " return y\n",
2058
+ "\n",
2059
+ "\n",
2060
+ "class DummyVarGenerator:\n",
2061
+ " \"\"\"\n",
2062
+ " dummy variable generator for categorical variable\n",
2063
+ " \"\"\"\n",
2064
+ " def __init__(self, rowSize, catValues, trueVal, falseVal, delim=None):\n",
2065
+ " \"\"\"\n",
2066
+ " initilizer\n",
2067
+ "\n",
2068
+ " Parameters\n",
2069
+ " rowSize : row size\n",
2070
+ " catValues : dictionary with field index as key and list of categorical values as value\n",
2071
+ " trueVal : true value, typically \"1\"\n",
2072
+ " falseval : false value , typically \"0\"\n",
2073
+ " delim : field delemeter\n",
2074
+ " \"\"\"\n",
2075
+ " self.rowSize = rowSize\n",
2076
+ " self.catValues = catValues\n",
2077
+ " numCatVar = len(catValues)\n",
2078
+ " colCount = 0\n",
2079
+ " for v in self.catValues.values():\n",
2080
+ " colCount += len(v)\n",
2081
+ " self.newRowSize = rowSize - numCatVar + colCount\n",
2082
+ " #print (\"new row size {}\".format(self.newRowSize))\n",
2083
+ " self.trueVal = trueVal\n",
2084
+ " self.falseVal = falseVal\n",
2085
+ " self.delim = delim\n",
2086
+ "\n",
2087
+ " def processRow(self, row):\n",
2088
+ " \"\"\"\n",
2089
+ " encodes categorical variables, returning as delemeter separate dstring or list\n",
2090
+ "\n",
2091
+ " Parameters\n",
2092
+ " row : row either delemeter separated string or list\n",
2093
+ " \"\"\"\n",
2094
+ " if self.delim is not None:\n",
2095
+ " rowArr = row.split(self.delim)\n",
2096
+ " msg = \"row does not have expected number of columns found \" + str(len(rowArr)) + \" expected \" + str(self.rowSize)\n",
2097
+ " assert len(rowArr) == self.rowSize, msg\n",
2098
+ " else:\n",
2099
+ " rowArr = row\n",
2100
+ "\n",
2101
+ " newRowArr = []\n",
2102
+ " for i in range(len(rowArr)):\n",
2103
+ " curVal = rowArr[i]\n",
2104
+ " if (i in self.catValues):\n",
2105
+ " values = self.catValues[i]\n",
2106
+ " for val in values:\n",
2107
+ " if val == curVal:\n",
2108
+ " newVal = self.trueVal\n",
2109
+ " else:\n",
2110
+ " newVal = self.falseVal\n",
2111
+ " newRowArr.append(newVal)\n",
2112
+ " else:\n",
2113
+ " newRowArr.append(curVal)\n",
2114
+ " assert len(newRowArr) == self.newRowSize, \"invalid new row size \" + str(len(newRowArr)) + \" expected \" + str(self.newRowSize)\n",
2115
+ " encRow = self.delim.join(newRowArr) if self.delim is not None else newRowArr\n",
2116
+ " return encRow\n"
2117
+ ]
2118
+ }
2119
+ ],
2120
+ "metadata": {
2121
+ "kernelspec": {
2122
+ "display_name": "Python 3 (ipykernel)",
2123
+ "language": "python",
2124
+ "name": "python3"
2125
+ },
2126
+ "language_info": {
2127
+ "codemirror_mode": {
2128
+ "name": "ipython",
2129
+ "version": 3
2130
+ },
2131
+ "file_extension": ".py",
2132
+ "mimetype": "text/x-python",
2133
+ "name": "python",
2134
+ "nbconvert_exporter": "python",
2135
+ "pygments_lexer": "ipython3",
2136
+ "version": "3.9.12"
2137
+ }
2138
+ },
2139
+ "nbformat": 4,
2140
+ "nbformat_minor": 5
2141
+ }
lib/mlutil.ipynb ADDED
@@ -0,0 +1,1297 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "2d05ce02",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import sys\n",
12
+ "import numpy as np\n",
13
+ "from sklearn import preprocessing\n",
14
+ "from sklearn import metrics\n",
15
+ "from sklearn.datasets import make_blobs\n",
16
+ "from sklearn.datasets import make_classification\n",
17
+ "import random\n",
18
+ "from math import *\n",
19
+ "from decimal import Decimal\n",
20
+ "import statistics\n",
21
+ "import jprops\n",
22
+ "from Levenshtein import distance as ld\n",
23
+ "from util import *\n",
24
+ "from sampler import *\n",
25
+ "\n",
26
+ "class Configuration:\n",
27
+ " \"\"\"\n",
28
+ " Configuration management. Supports default value, mandatory value and typed value.\n",
29
+ " \"\"\"\n",
30
+ " def __init__(self, configFile, defValues, verbose=False):\n",
31
+ " \"\"\"\n",
32
+ " initializer\n",
33
+ "\n",
34
+ " Parameters\n",
35
+ " configFile : config file path\n",
36
+ " defValues : dictionary of default values\n",
37
+ " verbose : verbosity flag\n",
38
+ " \"\"\"\n",
39
+ " configs = {}\n",
40
+ " with open(configFile) as fp:\n",
41
+ " for key, value in jprops.iter_properties(fp):\n",
42
+ " configs[key] = value\n",
43
+ " self.configs = configs\n",
44
+ " self.defValues = defValues\n",
45
+ " self.verbose = verbose\n",
46
+ "\n",
47
+ " def override(self, configFile):\n",
48
+ " \"\"\"\n",
49
+ " over ride configuration from file\n",
50
+ "\n",
51
+ " Parameters\n",
52
+ " configFile : override config file path\n",
53
+ " \"\"\"\n",
54
+ " with open(configFile) as fp:\n",
55
+ " for key, value in jprops.iter_properties(fp):\n",
56
+ " self.configs[key] = value\n",
57
+ "\n",
58
+ "\n",
59
+ " def setParam(self, name, value):\n",
60
+ " \"\"\"\n",
61
+ " override individual configuration\n",
62
+ " Parameters\n",
63
+ " name : config param name\n",
64
+ " value : config param value\n",
65
+ " \"\"\"\n",
66
+ " self.configs[name] = value\n",
67
+ "\n",
68
+ "\n",
69
+ " def getStringConfig(self, name):\n",
70
+ " \"\"\"\n",
71
+ " get string param\n",
72
+ " Parameters\n",
73
+ " name : config param name\n",
74
+ " \"\"\"\n",
75
+ " if self.isNone(name):\n",
76
+ " val = (None, False)\n",
77
+ " elif self.isDefault(name):\n",
78
+ " val = (self.handleDefault(name), True)\n",
79
+ " else:\n",
80
+ " val = (self.configs[name], False)\n",
81
+ " if self.verbose:\n",
82
+ " print( \"{} {} {}\".format(name, self.configs[name], val[0]))\n",
83
+ " return val\n",
84
+ "\n",
85
+ "\n",
86
+ " def getIntConfig(self, name):\n",
87
+ " \"\"\"\n",
88
+ " get int param\n",
89
+ " Parameters\n",
90
+ " name : config param name\n",
91
+ " \"\"\"\n",
92
+ " #print \"%s %s\" %(name,self.configs[name])\n",
93
+ " if self.isNone(name):\n",
94
+ " val = (None, False)\n",
95
+ " elif self.isDefault(name):\n",
96
+ " val = (self.handleDefault(name), True)\n",
97
+ " else:\n",
98
+ " val = (int(self.configs[name]), False)\n",
99
+ " if self.verbose:\n",
100
+ " print( \"{} {} {}\".format(name, self.configs[name], val[0]))\n",
101
+ " return val\n",
102
+ "\n",
103
+ "\n",
104
+ " def getFloatConfig(self, name):\n",
105
+ " \"\"\"\n",
106
+ " get float param\n",
107
+ " Parameters\n",
108
+ " name : config param name\n",
109
+ " \"\"\"\n",
110
+ " #print \"%s %s\" %(name,self.configs[name])\n",
111
+ " if self.isNone(name):\n",
112
+ " val = (None, False)\n",
113
+ " elif self.isDefault(name):\n",
114
+ " val = (self.handleDefault(name), True)\n",
115
+ " else:\n",
116
+ " val = (float(self.configs[name]), False)\n",
117
+ " if self.verbose:\n",
118
+ " print( \"{} {} {:06.3f}\".format(name, self.configs[name], val[0]))\n",
119
+ " return val\n",
120
+ "\n",
121
+ "\n",
122
+ " def getBooleanConfig(self, name):\n",
123
+ " \"\"\"\n",
124
+ " #get boolean param\n",
125
+ " Parameters\n",
126
+ " name : config param name\n",
127
+ " \"\"\"\n",
128
+ " if self.isNone(name):\n",
129
+ " val = (None, False)\n",
130
+ " elif self.isDefault(name):\n",
131
+ " val = (self.handleDefault(name), True)\n",
132
+ " else:\n",
133
+ " bVal = self.configs[name].lower() == \"true\"\n",
134
+ " val = (bVal, False)\n",
135
+ " if self.verbose:\n",
136
+ " print( \"{} {} {}\".format(name, self.configs[name], val[0]))\n",
137
+ " return val\n",
138
+ "\n",
139
+ "\n",
140
+ " def getIntListConfig(self, name, delim=\",\"):\n",
141
+ " \"\"\"\n",
142
+ " get int list param\n",
143
+ " Parameters\n",
144
+ " name : config param name\n",
145
+ " delim : delemeter\n",
146
+ " \"\"\"\n",
147
+ " if self.isNone(name):\n",
148
+ " val = (None, False)\n",
149
+ " elif self.isDefault(name):\n",
150
+ " val = (self.handleDefault(name), True)\n",
151
+ " else:\n",
152
+ " delSepStr = self.getStringConfig(name)\n",
153
+ "\n",
154
+ " #specified as range\n",
155
+ " intList = strListOrRangeToIntArray(delSepStr[0])\n",
156
+ " val =(intList, delSepStr[1])\n",
157
+ " return val\n",
158
+ "\n",
159
+ " def getFloatListConfig(self, name, delim=\",\"):\n",
160
+ " \"\"\"\n",
161
+ " get float list param\n",
162
+ " Parameters\n",
163
+ " name : config param name\n",
164
+ " delim : delemeter\n",
165
+ " \"\"\"\n",
166
+ " delSepStr = self.getStringConfig(name)\n",
167
+ " if self.isNone(name):\n",
168
+ " val = (None, False)\n",
169
+ " elif self.isDefault(name):\n",
170
+ " val = (self.handleDefault(name), True)\n",
171
+ " else:\n",
172
+ " flList = strToFloatArray(delSepStr[0], delim)\n",
173
+ " val =(flList, delSepStr[1])\n",
174
+ " return val\n",
175
+ "\n",
176
+ "\n",
177
+ " def getStringListConfig(self, name, delim=\",\"):\n",
178
+ " \"\"\"\n",
179
+ " get string list param\n",
180
+ " Parameters\n",
181
+ " name : config param name\n",
182
+ " delim : delemeter\n",
183
+ " \"\"\"\n",
184
+ " delSepStr = self.getStringConfig(name)\n",
185
+ " if self.isNone(name):\n",
186
+ " val = (None, False)\n",
187
+ " elif self.isDefault(name):\n",
188
+ " val = (self.handleDefault(name), True)\n",
189
+ " else:\n",
190
+ " strList = delSepStr[0].split(delim)\n",
191
+ " val = (strList, delSepStr[1])\n",
192
+ " return val\n",
193
+ "\n",
194
+ " def handleDefault(self, name):\n",
195
+ " \"\"\"\n",
196
+ " handles default\n",
197
+ " Parameters\n",
198
+ " name : config param name\n",
199
+ " \"\"\"\n",
200
+ " dVal = self.defValues[name]\n",
201
+ " if (dVal[1] is None):\n",
202
+ " val = dVal[0]\n",
203
+ " else:\n",
204
+ " raise ValueError(dVal[1])\n",
205
+ " return val\n",
206
+ "\n",
207
+ "\n",
208
+ " def isNone(self, name):\n",
209
+ " \"\"\"\n",
210
+ " true is value is None\t\n",
211
+ " Parameters\n",
212
+ " name : config param name\n",
213
+ " \"\"\"\n",
214
+ " return self.configs[name].lower() == \"none\"\n",
215
+ "\n",
216
+ "\n",
217
+ " def isDefault(self, name):\n",
218
+ " \"\"\"\n",
219
+ " true if the value is default\t\n",
220
+ " Parameters\n",
221
+ " name : config param name\n",
222
+ " \"\"\"\n",
223
+ " de = self.configs[name] == \"_\"\n",
224
+ " #print de\n",
225
+ " return de\n",
226
+ "\n",
227
+ "\n",
228
+ " def eitherOrStringConfig(self, firstName, secondName):\n",
229
+ " \"\"\"\n",
230
+ " returns one of two string parameters\t\n",
231
+ " Parameters\n",
232
+ " firstName : first parameter name\n",
233
+ " secondName : second parameter name\t\n",
234
+ " \"\"\"\n",
235
+ " if not self.isNone(firstName):\n",
236
+ " first = self.getStringConfig(firstName)[0]\n",
237
+ " second = None\n",
238
+ " if not self.isNone(secondName):\n",
239
+ " raise ValueError(\"only one of the two parameters should be set and not both \" + firstName + \" \" + secondName)\n",
240
+ " else:\n",
241
+ " if not self.isNone(secondName):\n",
242
+ " second = self.getStringConfig(secondtName)[0]\n",
243
+ " first = None\n",
244
+ " else:\n",
245
+ " raise ValueError(\"at least one of the two parameters should be set \" + firstName + \" \" + secondName)\n",
246
+ " return (first, second)\n",
247
+ "\n",
248
+ "\n",
249
+ " def eitherOrIntConfig(self, firstName, secondName):\n",
250
+ " \"\"\"\n",
251
+ " returns one of two int parameters\t\n",
252
+ " Parameters\n",
253
+ " firstName : first parameter name\n",
254
+ " secondName : second parameter name\t\n",
255
+ " \"\"\"\n",
256
+ " if not self.isNone(firstName):\n",
257
+ " first = self.getIntConfig(firstName)[0]\n",
258
+ " second = None\n",
259
+ " if not self.isNone(secondName):\n",
260
+ " raise ValueError(\"only one of the two parameters should be set and not both \" + firstName + \" \" + secondName)\n",
261
+ " else:\n",
262
+ " if not self.isNone(secondName):\n",
263
+ " second = self.getIntConfig(secondsName)[0]\n",
264
+ " first = None\n",
265
+ " else:\n",
266
+ " raise ValueError(\"at least one of the two parameters should be set \" + firstName + \" \" + secondName)\n",
267
+ " return (first, second)\n",
268
+ "\n",
269
+ "\n",
270
+ "class CatLabelGenerator:\n",
271
+ " \"\"\"\n",
272
+ " label generator for categorical variables\n",
273
+ " \"\"\"\n",
274
+ " def __init__(self, catValues, delim):\n",
275
+ " \"\"\"\n",
276
+ " initilizers\n",
277
+ "\n",
278
+ " Parameters\n",
279
+ " catValues : dictionary of categorical values\n",
280
+ " delim : delemeter\n",
281
+ " \"\"\"\n",
282
+ " self.encoders = {}\n",
283
+ " self.catValues = catValues\n",
284
+ " self.delim = delim\n",
285
+ " for k in self.catValues.keys():\t\n",
286
+ " le = preprocessing.LabelEncoder()\t\n",
287
+ " le.fit(self.catValues[k])\n",
288
+ " self.encoders[k] = le\n",
289
+ "\n",
290
+ " def processRow(self, row):\t\n",
291
+ " \"\"\"\n",
292
+ " encode row categorical values\n",
293
+ "\n",
294
+ " Parameters:\n",
295
+ " row : data row\n",
296
+ " \"\"\"\n",
297
+ " #print row\n",
298
+ " rowArr = row.split(self.delim)\n",
299
+ " for i in range(len(rowArr)):\n",
300
+ " if (i in self.catValues):\n",
301
+ " curVal = rowArr[i]\n",
302
+ " assert curVal in self.catValues[i], \"categorival value invalid\"\n",
303
+ " encVal = self.encoders[i].transform([curVal])\n",
304
+ " rowArr[i] = str(encVal[0])\n",
305
+ " return self.delim.join(rowArr)\t\t\n",
306
+ "\n",
307
+ " def getOrigLabels(self, indx):\n",
308
+ " \"\"\"\n",
309
+ " get original labels\n",
310
+ "\n",
311
+ " Parameters:\n",
312
+ " indx : column index\n",
313
+ " \"\"\"\n",
314
+ " return self.encoders[indx].classes_\t\n",
315
+ "\n",
316
+ "\n",
317
+ "class SupvLearningDataGenerator:\n",
318
+ " \"\"\"\n",
319
+ " data generator for supervised learning\n",
320
+ " \"\"\"\n",
321
+ " def __init__(self, configFile):\n",
322
+ " \"\"\"\n",
323
+ " initilizers\n",
324
+ "\n",
325
+ " Parameters\n",
326
+ " configFile : config file path\n",
327
+ " \"\"\"\n",
328
+ " defValues = dict()\n",
329
+ " defValues[\"common.num.samp\"] = (100, None)\n",
330
+ " defValues[\"common.num.feat\"] = (5, None)\n",
331
+ " defValues[\"common.feat.trans\"] = (None, None)\n",
332
+ " defValues[\"common.feat.types\"] = (None, \"missing feature types\")\n",
333
+ " defValues[\"common.cat.feat.distr\"] = (None, None)\n",
334
+ " defValues[\"common.output.precision\"] = (3, None)\n",
335
+ " defValues[\"common.error\"] = (0.01, None)\n",
336
+ " defValues[\"class.gen.technique\"] = (\"blob\", None)\n",
337
+ " defValues[\"class.num.feat.informative\"] = (2, None)\n",
338
+ " defValues[\"class.num.feat.redundant\"] = (2, None)\n",
339
+ " defValues[\"class.num.feat.repeated\"] = (0, None)\n",
340
+ " defValues[\"class.num.feat.cat\"] = (0, None)\n",
341
+ " defValues[\"class.num.class\"] = (2, None)\n",
342
+ "\n",
343
+ " self.config = Configuration(configFile, defValues)\n",
344
+ "\n",
345
+ " def genClassifierData(self):\n",
346
+ " \"\"\"\n",
347
+ " generates classifier data\n",
348
+ " \"\"\"\n",
349
+ " nsamp = self.config.getIntConfig(\"common.num.samp\")[0]\n",
350
+ " nfeat = self.config.getIntConfig(\"common.num.feat\")[0]\n",
351
+ " nclass = self.config.getIntConfig(\"class.num.class\")[0]\n",
352
+ " #transform with shift and scale\n",
353
+ " ftrans = self.config.getFloatListConfig(\"common.feat.trans\")[0]\n",
354
+ " feTrans = dict()\n",
355
+ " for i in range(0, len(ftrans), 2):\n",
356
+ " tr = (ftrans[i], ftrans[i+1])\n",
357
+ " indx = int(i/2)\n",
358
+ " feTrans[indx] = tr\n",
359
+ "\n",
360
+ " ftypes = self.config.getStringListConfig(\"common.feat.types\")[0]\n",
361
+ "\n",
362
+ " # categorical feature distribution\n",
363
+ " feCatDist = dict()\n",
364
+ " fcatdl = self.config.getStringListConfig(\"common.cat.feat.distr\")[0]\n",
365
+ " for fcatds in fcatdl:\n",
366
+ " fcatd = fcatds.split(\":\")\n",
367
+ " feInd = int(fcatd[0])\n",
368
+ " clVal = int(fcatd[1])\n",
369
+ " key = (feInd, clVal)\t\t#feature index and class value\n",
370
+ " dist = list(map(lambda i : (fcatd[i], float(fcatd[i+1])), range(2, len(fcatd), 2)))\n",
371
+ " feCatDist[key] = CategoricalRejectSampler(*dist)\n",
372
+ "\n",
373
+ " #shift and scale\n",
374
+ " genTechnique = self.config.getStringConfig(\"class.gen.technique\")[0]\n",
375
+ " error = self.config.getFloatConfig(\"common.error\")[0]\n",
376
+ " if genTechnique == \"blob\":\n",
377
+ " features, claz = make_blobs(n_samples=nsamp, centers=nclass, n_features=nfeat)\n",
378
+ " for i in range(nsamp):\t\t\t#shift and scale\n",
379
+ " for j in range(nfeat):\n",
380
+ " tr = feTrans[j]\n",
381
+ " features[i,j] = (features[i,j] + tr[0]) * tr[1]\n",
382
+ " claz = np.array(list(map(lambda c : random.randint(0, nclass-1) if random.random() < error else c, claz)))\n",
383
+ " elif genTechnique == \"classify\":\n",
384
+ " nfeatInfo = self.config.getIntConfig(\"class.num.feat.informative\")[0]\n",
385
+ " nfeatRed = self.config.getIntConfig(\"class.num.feat.redundant\")[0]\n",
386
+ " nfeatRep = self.config.getIntConfig(\"class.num.feat.repeated\")[0]\n",
387
+ " shifts = list(map(lambda i : feTrans[i][0], range(nfeat)))\n",
388
+ " scales = list(map(lambda i : feTrans[i][1], range(nfeat)))\n",
389
+ " features, claz = make_classification(n_samples=nsamp, n_features=nfeat, n_informative=nfeatInfo, n_redundant=nfeatRed, \n",
390
+ " n_repeated=nfeatRep, n_classes=nclass, flip_y=error, shift=shifts, scale=scales)\n",
391
+ " else:\n",
392
+ " raise \"invalid genaration technique\"\n",
393
+ "\n",
394
+ " # add categorical features and format\n",
395
+ " nCatFeat = self.config.getIntConfig(\"class.num.feat.cat\")[0]\n",
396
+ " prec = self.config.getIntConfig(\"common.output.precision\")[0]\n",
397
+ " for f , c in zip(features, claz):\n",
398
+ " nfs = list(map(lambda i : self.numFeToStr(i, f[i], c, ftypes[i], prec), range(nfeat)))\n",
399
+ " if nCatFeat > 0:\n",
400
+ " cfs = list(map(lambda i : self.catFe(i, c, ftypes[i], feCatDist), range(nfeat, nfeat + nCatFeat, 1)))\n",
401
+ " rec = \",\".join(nfs) + \",\" + \",\".join(cfs) + \",\" + str(c)\n",
402
+ " else:\n",
403
+ " rec = \",\".join(nfs) + \",\" + str(c)\n",
404
+ " yield rec\n",
405
+ "\n",
406
+ " def numFeToStr(self, fv, ft, prec):\n",
407
+ " \"\"\"\n",
408
+ " nummeric feature value to string\n",
409
+ "\n",
410
+ " Parameters\n",
411
+ " fv : field value\n",
412
+ " ft : field data type\n",
413
+ " prec : precision\n",
414
+ " \"\"\"\n",
415
+ " if ft == \"float\":\n",
416
+ " s = formatFloat(prec, fv)\n",
417
+ " elif ft ==\"int\":\n",
418
+ " s = str(int(fv))\n",
419
+ " else:\t\t\n",
420
+ " raise \"invalid type expecting float or int\"\n",
421
+ " return s\n",
422
+ "\n",
423
+ " def catFe(self, i, cv, ft, feCatDist):\n",
424
+ " \"\"\"\n",
425
+ " generate categorical feature\n",
426
+ "\n",
427
+ " Parameters\n",
428
+ " i : col index\n",
429
+ " cv : class value\n",
430
+ " ft : field data type\n",
431
+ " feCatDist : cat value distribution\n",
432
+ " \"\"\"\n",
433
+ " if ft == \"cat\":\n",
434
+ " key = (i, cv)\n",
435
+ " s = feCatDist[key].sample()\n",
436
+ " else:\t\t\n",
437
+ " raise \"invalid type expecting categorical\"\n",
438
+ " return s\n",
439
+ "\n",
440
+ "\n",
441
+ "\n",
442
+ "def loadDataFile(file, delim, cols, colIndices):\n",
443
+ " \"\"\"\n",
444
+ " loads delim separated file and extracts columns\n",
445
+ " Parameters\n",
446
+ " file : file path\n",
447
+ " delim : delemeter\n",
448
+ " cols : columns to use from file\n",
449
+ " colIndices ; columns to extract\n",
450
+ " \"\"\"\n",
451
+ " data = np.loadtxt(file, delimiter=delim, usecols=cols)\n",
452
+ " extrData = data[:,colIndices]\n",
453
+ " return (data, extrData)\n",
454
+ "\n",
455
+ "def loadFeatDataFile(file, delim, cols):\n",
456
+ " \"\"\"\n",
457
+ " loads delim separated file and extracts columns\n",
458
+ "\n",
459
+ " Parameters\n",
460
+ " file : file path\n",
461
+ " delim : delemeter\n",
462
+ " cols : columns to use from file\n",
463
+ " \"\"\"\n",
464
+ " data = np.loadtxt(file, delimiter=delim, usecols=cols)\n",
465
+ " return data\n",
466
+ "\n",
467
+ "def extrColumns(arr, columns):\n",
468
+ " \"\"\"\n",
469
+ " extracts columns\n",
470
+ "\n",
471
+ " Parameters\n",
472
+ " arr : 2D array\n",
473
+ " columns : columns\n",
474
+ " \"\"\"\n",
475
+ " return arr[:, columns]\n",
476
+ "\n",
477
+ "def subSample(featData, clsData, subSampleRate, withReplacement):\n",
478
+ " \"\"\"\n",
479
+ " subsample feature and class label data\t\n",
480
+ " Parameters\n",
481
+ " featData : 2D array of feature data\n",
482
+ " clsData : arrray of class labels\n",
483
+ " subSampleRate : fraction to be sampled\n",
484
+ " withReplacement : true if sampling with replacement\n",
485
+ " \"\"\"\n",
486
+ " sampSize = int(featData.shape[0] * subSampleRate)\n",
487
+ " sampledIndx = np.random.choice(featData.shape[0],sampSize, replace=withReplacement)\n",
488
+ " sampFeat = featData[sampledIndx]\n",
489
+ " sampCls = clsData[sampledIndx]\n",
490
+ " return(sampFeat, sampCls)\n",
491
+ "\n",
492
+ "def euclideanDistance(x,y):\n",
493
+ " \"\"\"\n",
494
+ " euclidean distance\n",
495
+ " Parameters\n",
496
+ " x : first vector\n",
497
+ " y : second fvector\n",
498
+ " \"\"\"\n",
499
+ " return sqrt(sum(pow(a-b, 2) for a, b in zip(x, y)))\n",
500
+ "\n",
501
+ "def squareRooted(x):\n",
502
+ " \"\"\"\n",
503
+ " square root of sum square\n",
504
+ " Parameters\n",
505
+ " x : data vector\n",
506
+ " \"\"\"\n",
507
+ " return round(sqrt(sum([a*a for a in x])),3)\n",
508
+ "\n",
509
+ "def cosineSimilarity(x,y):\n",
510
+ " \"\"\"\n",
511
+ " cosine similarity\n",
512
+ "\n",
513
+ " Parameters\n",
514
+ " x : first vector\n",
515
+ " y : second fvector\n",
516
+ " \"\"\"\n",
517
+ " numerator = sum(a*b for a,b in zip(x,y))\n",
518
+ " denominator = squareRooted(x) * squareRooted(y)\n",
519
+ " return round(numerator / float(denominator), 3)\n",
520
+ "\n",
521
+ "def cosineDistance(x,y):\n",
522
+ " \"\"\"\n",
523
+ " cosine distance\n",
524
+ " Parameters\n",
525
+ " x : first vector\n",
526
+ " y : second fvector\n",
527
+ " \"\"\"\n",
528
+ " return 1.0 - cosineSimilarity(x,y)\n",
529
+ "\n",
530
+ "def manhattanDistance(x,y):\n",
531
+ " \"\"\"\n",
532
+ " manhattan distance\n",
533
+ " Parameters\n",
534
+ " x : first vector\n",
535
+ " y : second fvector\n",
536
+ " \"\"\"\n",
537
+ " return sum(abs(a-b) for a,b in zip(x,y))\n",
538
+ "\n",
539
+ "def nthRoot(value, nRoot):\n",
540
+ " \"\"\"\n",
541
+ " nth root\n",
542
+ " Parameters\n",
543
+ " value : data value\n",
544
+ " nRoot : root\n",
545
+ " \"\"\"\n",
546
+ " rootValue = 1/float(nRoot)\n",
547
+ " return round (Decimal(value) ** Decimal(rootValue),3)\n",
548
+ "\n",
549
+ "def minkowskiDistance(x,y,pValue):\n",
550
+ " \"\"\"\n",
551
+ " minkowski distance\n",
552
+ " Parameters\n",
553
+ " x : first vector\n",
554
+ " y : second fvector\n",
555
+ " pValue : power factor\n",
556
+ " \"\"\"\n",
557
+ " return nthRoot(sum(pow(abs(a-b),pValue) for a,b in zip(x, y)), pValue)\n",
558
+ "\n",
559
+ "def jaccardSimilarityX(x,y):\n",
560
+ " \"\"\"\n",
561
+ " jaccard similarity\n",
562
+ " Parameters\n",
563
+ " x : first vector\n",
564
+ " y : second fvector\n",
565
+ " \"\"\"\n",
566
+ " intersectionCardinality = len(set.intersection(*[set(x), set(y)]))\n",
567
+ " unionCardinality = len(set.union(*[set(x), set(y)]))\n",
568
+ " return intersectionCardinality/float(unionCardinality)\n",
569
+ "\n",
570
+ "def jaccardSimilarity(x,y,wx=1.0,wy=1.0):\n",
571
+ " \"\"\"\n",
572
+ " jaccard similarity\n",
573
+ "\n",
574
+ " Parameters\n",
575
+ " x : first vector\n",
576
+ " y : second fvector\n",
577
+ " wx : weight for x\n",
578
+ " wy : weight for y\n",
579
+ " \"\"\"\n",
580
+ " sx = set(x)\n",
581
+ " sy = set(y)\n",
582
+ " sxyInt = sx.intersection(sy)\n",
583
+ " intCardinality = len(sxyInt)\n",
584
+ " sxIntDiff = sx.difference(sxyInt)\n",
585
+ " syIntDiff = sy.difference(sxyInt)\n",
586
+ " unionCardinality = len(sx.union(sy))\n",
587
+ " return intCardinality/float(intCardinality + wx * len(sxIntDiff) + wy * len(syIntDiff))\n",
588
+ "\n",
589
+ "def levenshteinSimilarity(s1, s2):\n",
590
+ " \"\"\"\n",
591
+ " Levenshtein similarity for strings\n",
592
+ "\n",
593
+ " Parameters\n",
594
+ " sx : first string\n",
595
+ " sy : second string\n",
596
+ " \"\"\"\n",
597
+ " assert type(s1) == str and type(s2) == str, \"Levenshtein similarity is for string only\"\n",
598
+ " d = ld(s1,s2)\n",
599
+ " #print(d)\n",
600
+ " l = max(len(s1),len(s2))\n",
601
+ " d = 1.0 - min(d/l, 1.0)\n",
602
+ " return d\t\n",
603
+ "\n",
604
+ "def norm(values, po=2):\n",
605
+ " \"\"\"\n",
606
+ " norm\n",
607
+ " Parameters\n",
608
+ " values : list of values\n",
609
+ " po : power\n",
610
+ " \"\"\"\n",
611
+ " no = sum(list(map(lambda v: pow(v,po), values)))\n",
612
+ " no = pow(no,1.0/po)\n",
613
+ " return list(map(lambda v: v/no, values))\n",
614
+ "\n",
615
+ "def createOneHotVec(size, indx = -1):\n",
616
+ " \"\"\"\n",
617
+ " random one hot vector\n",
618
+ "\n",
619
+ " Parameters\n",
620
+ " size : vector size\n",
621
+ " indx : one hot position\n",
622
+ " \"\"\"\n",
623
+ " vec = [0] * size\n",
624
+ " s = random.randint(0, size - 1) if indx < 0 else indx\n",
625
+ " vec[s] = 1\n",
626
+ " return vec\n",
627
+ "\n",
628
+ "def createAllOneHotVec(size):\n",
629
+ " \"\"\"\n",
630
+ " create all one hot vectors\n",
631
+ "\n",
632
+ " Parameters\n",
633
+ " size : vector size and no of vectors\n",
634
+ " \"\"\"\n",
635
+ " vecs = list()\n",
636
+ " for i in range(size):\n",
637
+ " vec = [0] * size\n",
638
+ " vec[i] = 1\n",
639
+ " vecs.append(vec)\n",
640
+ " return vecs\n",
641
+ "\n",
642
+ "def blockShuffle(data, blockSize):\n",
643
+ " \"\"\"\n",
644
+ " block shuffle \t\n",
645
+ "\n",
646
+ " Parameters\n",
647
+ " data : list data\n",
648
+ " blockSize : block size\n",
649
+ " \"\"\"\n",
650
+ " numBlock = int(len(data) / blockSize)\n",
651
+ " remain = len(data) % blockSize\n",
652
+ " numBlock += (1 if remain > 0 else 0)\n",
653
+ " shuffled = list()\n",
654
+ " for i in range(numBlock):\n",
655
+ " b = random.randint(0, numBlock-1)\n",
656
+ " beg = b * blockSize\n",
657
+ " if (b < numBlock-1):\n",
658
+ " end = beg + blockSize\n",
659
+ " shuffled.extend(data[beg:end])\t\t\n",
660
+ " else:\n",
661
+ " shuffled.extend(data[beg:])\n",
662
+ " return shuffled\t\n",
663
+ "\n",
664
+ "def shuffle(data, numShuffle):\n",
665
+ " \"\"\"\n",
666
+ " shuffle data by randonm swapping\n",
667
+ "\n",
668
+ " Parameters\n",
669
+ " data : list data\n",
670
+ " numShuffle : no of pairwise swaps\n",
671
+ " \"\"\"\n",
672
+ " sz = len(data)\n",
673
+ " if numShuffle is None:\n",
674
+ " numShuffle = int(sz / 2)\n",
675
+ " for i in range(numShuffle):\n",
676
+ " fi = random.randint(0, sz -1)\n",
677
+ " se = random.randint(0, sz -1)\n",
678
+ " tmp = data[fi]\n",
679
+ " data[fi] = data[se]\n",
680
+ " data[se] = tmp\t\n",
681
+ "\n",
682
+ "def randomWalk(size, start, lowStep, highStep):\n",
683
+ " \"\"\"\n",
684
+ " random walk\t\n",
685
+ "\n",
686
+ " Parameters\n",
687
+ " size : list data\n",
688
+ " start : initial position\n",
689
+ " lowStep : step min\n",
690
+ " highStep : step max\n",
691
+ " \"\"\"\n",
692
+ " cur = start\n",
693
+ " for i in range(size):\n",
694
+ " yield cur\n",
695
+ " cur += randomFloat(lowStep, highStep)\n",
696
+ "\n",
697
+ "def binaryEcodeCategorical(values, value):\n",
698
+ " \"\"\"\n",
699
+ " one hot binary encoding\t\n",
700
+ "\n",
701
+ " Parameters\n",
702
+ " values : list of values\n",
703
+ " value : value to be replaced with 1\n",
704
+ " \"\"\"\n",
705
+ " size = len(values)\n",
706
+ " vec = [0] * size\n",
707
+ " for i in range(size):\n",
708
+ " if (values[i] == value):\n",
709
+ " vec[i] = 1\n",
710
+ " return vec\t\t\n",
711
+ "\n",
712
+ "def createLabeledSeq(inputData, tw):\n",
713
+ " \"\"\"\n",
714
+ " Creates feature, label pair from sequence data, where we have tw number of features followed by output\n",
715
+ "\n",
716
+ " Parameters\n",
717
+ " values : list containing feature and label\n",
718
+ " tw : no of features\n",
719
+ " \"\"\"\n",
720
+ " features = list()\n",
721
+ " labels = list()\n",
722
+ " l = len(inputDta)\n",
723
+ " for i in range(l - tw):\n",
724
+ " trainSeq = inputData[i:i+tw]\n",
725
+ " trainLabel = inputData[i+tw]\n",
726
+ " features.append(trainSeq)\n",
727
+ " labels.append(trainLabel)\n",
728
+ " return (features, labels)\n",
729
+ "\n",
730
+ "def createLabeledSeq(filePath, delim, index, tw):\n",
731
+ " \"\"\"\n",
732
+ " Creates feature, label pair from 1D sequence data in file\t\n",
733
+ "\n",
734
+ " Parameters\n",
735
+ " filePath : file path\n",
736
+ " delim : delemeter\n",
737
+ " index : column index\n",
738
+ " tw : no of features\n",
739
+ " \"\"\"\n",
740
+ " seqData = getFileColumnAsFloat(filePath, delim, index)\n",
741
+ " return createLabeledSeq(seqData, tw)\n",
742
+ "\n",
743
+ "def fromMultDimSeqToTabular(data, inpSize, seqLen):\n",
744
+ " \"\"\"\n",
745
+ " Input shape (nrow, inpSize * seqLen) output shape(nrow * seqLen, inpSize)\n",
746
+ "\n",
747
+ " Parameters\n",
748
+ " data : 2D array\n",
749
+ " inpSize : each input size in sequence\n",
750
+ " seqLen : sequence length\n",
751
+ " \"\"\"\t\n",
752
+ " nrow = data.shape[0]\n",
753
+ " assert data.shape[1] == inpSize * seqLen, \"invalid input size or sequence length\"\n",
754
+ " return data.reshape(nrow * seqLen, inpSize)\n",
755
+ "\n",
756
+ "def fromTabularToMultDimSeq(data, inpSize, seqLen):\n",
757
+ " \"\"\"\n",
758
+ " Input shape (nrow * seqLen, inpSize) output shape (nrow, inpSize * seqLen) \n",
759
+ " Parameters\n",
760
+ " data : 2D array\n",
761
+ " inpSize : each input size in sequence\n",
762
+ " seqLen : sequence length\n",
763
+ " \"\"\"\t\n",
764
+ " nrow = int(data.shape[0] / seqLen)\n",
765
+ " assert data.shape[1] == inpSize, \"invalid input size\"\n",
766
+ " return data.reshape(nrow, seqLen * inpSize)\n",
767
+ "\n",
768
+ "def difference(data, interval=1):\n",
769
+ " \"\"\"\n",
770
+ " takes difference in time series data\n",
771
+ " Parameters\n",
772
+ " data :list data\n",
773
+ " interval : interval for difference\n",
774
+ " \"\"\"\n",
775
+ " diff = list()\n",
776
+ " for i in range(interval, len(data)):\n",
777
+ " value = data[i] - data[i - interval]\n",
778
+ " diff.append(value)\n",
779
+ " return diff\n",
780
+ "\n",
781
+ "def normalizeMatrix(data, norm, axis=1):\n",
782
+ " \"\"\"\n",
783
+ " normalized each row of the matrix\n",
784
+ "\n",
785
+ " Parameters\n",
786
+ " data : 2D data\n",
787
+ " nporm : normalization method\n",
788
+ " axis : row or column\n",
789
+ " \"\"\"\n",
790
+ " normalized = preprocessing.normalize(data,norm=norm, axis=axis)\n",
791
+ " return normalized\n",
792
+ "\n",
793
+ "def standardizeMatrix(data, axis=0):\n",
794
+ " \"\"\"\n",
795
+ " standardizes each column of the matrix with mean and std deviation\n",
796
+ " Parameters\n",
797
+ " data : 2D data\n",
798
+ " axis : row or column\n",
799
+ " \"\"\"\n",
800
+ " standardized = preprocessing.scale(data, axis=axis)\n",
801
+ " return standardized\n",
802
+ "\n",
803
+ "def asNumpyArray(data):\n",
804
+ " \"\"\"\n",
805
+ " converts to numpy array\n",
806
+ " Parameters\n",
807
+ " data : array\n",
808
+ " \"\"\"\n",
809
+ " return np.array(data)\n",
810
+ "\n",
811
+ "def perfMetric(metric, yActual, yPred, clabels=None):\n",
812
+ " \"\"\"\n",
813
+ " predictive model accuracy metric\n",
814
+ " Parameters\n",
815
+ " metric : accuracy metric\n",
816
+ " yActual : actual values array\n",
817
+ " yPred : predicted values array\n",
818
+ " clabels : class labels\n",
819
+ " \"\"\"\n",
820
+ " if metric == \"rsquare\":\n",
821
+ " score = metrics.r2_score(yActual, yPred)\n",
822
+ " elif metric == \"mae\":\n",
823
+ " score = metrics.mean_absolute_error(yActual, yPred)\n",
824
+ " elif metric == \"mse\":\n",
825
+ " score = metrics.mean_squared_error(yActual, yPred)\n",
826
+ " elif metric == \"acc\":\n",
827
+ " yPred = np.rint(yPred)\n",
828
+ " score = metrics.accuracy_score(yActual, yPred)\n",
829
+ " elif metric == \"mlAcc\":\n",
830
+ " yPred = np.argmax(yPred, axis=1)\n",
831
+ " score = metrics.accuracy_score(yActual, yPred)\n",
832
+ " elif metric == \"prec\":\n",
833
+ " yPred = np.argmax(yPred, axis=1)\n",
834
+ " score = metrics.precision_score(yActual, yPred)\n",
835
+ " elif metric == \"rec\":\n",
836
+ " yPred = np.argmax(yPred, axis=1)\n",
837
+ " score = metrics.recall_score(yActual, yPred)\n",
838
+ " elif metric == \"fone\":\n",
839
+ " yPred = np.argmax(yPred, axis=1)\n",
840
+ " score = metrics.f1_score(yActual, yPred)\n",
841
+ " elif metric == \"confm\":\n",
842
+ " yPred = np.argmax(yPred, axis=1)\n",
843
+ " score = metrics.confusion_matrix(yActual, yPred)\n",
844
+ " elif metric == \"clarep\":\n",
845
+ " yPred = np.argmax(yPred, axis=1)\n",
846
+ " score = metrics.classification_report(yActual, yPred)\n",
847
+ " elif metric == \"bce\":\n",
848
+ " if clabels is None:\n",
849
+ " clabels = [0, 1]\n",
850
+ " score = metrics.log_loss(yActual, yPred, labels=clabels)\n",
851
+ " elif metric == \"ce\":\n",
852
+ " assert clabels is not None, \"labels must be provided\"\n",
853
+ " score = metrics.log_loss(yActual, yPred, labels=clabels)\n",
854
+ " else:\n",
855
+ " exitWithMsg(\"invalid prediction performance metric \" + metric)\n",
856
+ " return score\n",
857
+ "\n",
858
+ "def scaleData(data, method):\n",
859
+ " \"\"\"\n",
860
+ " scales feature data column wise\n",
861
+ " Parameters\n",
862
+ " data : 2D array\n",
863
+ " method : scaling method\n",
864
+ " \"\"\"\n",
865
+ " if method == \"minmax\":\n",
866
+ " scaler = preprocessing.MinMaxScaler()\n",
867
+ " data = scaler.fit_transform(data)\n",
868
+ " elif method == \"zscale\":\n",
869
+ " data = preprocessing.scale(data)\t\n",
870
+ " else:\n",
871
+ " raise ValueError(\"invalid scaling method\")\t\n",
872
+ " return data\n",
873
+ "\n",
874
+ "def scaleDataWithParams(data, method, scParams):\n",
875
+ " \"\"\"\n",
876
+ " scales feature data column wise\n",
877
+ " Parameters\n",
878
+ " data : 2D array\n",
879
+ " method : scaling method\n",
880
+ " scParams : scaling parameters\n",
881
+ " \"\"\"\n",
882
+ " if method == \"minmax\":\n",
883
+ " data = scaleMinMaxTabData(data, scParams)\n",
884
+ " elif method == \"zscale\":\n",
885
+ " raise ValueError(\"invalid scaling method\")\t\n",
886
+ " else:\n",
887
+ " raise ValueError(\"invalid scaling method\")\t\n",
888
+ " return data\n",
889
+ "\n",
890
+ "\n",
891
+ "def scaleMinMaxTabData(tdata, minMax):\n",
892
+ " \"\"\"\n",
893
+ " for tabular scales feature data column wise using min max values for each field\n",
894
+ " Parameters\n",
895
+ " tdata : 2D array\n",
896
+ " minMax : ni, max and range for each column\n",
897
+ " \"\"\"\n",
898
+ " stdata = list()\n",
899
+ " for r in tdata:\n",
900
+ " srdata = list()\n",
901
+ " for i, c in enumerate(r):\n",
902
+ " sd = (c - minMax[i][0]) / minMax[i][2]\n",
903
+ " srdata.append(sd)\n",
904
+ " stdata.append(srdata)\n",
905
+ " return stdata\n",
906
+ "\n",
907
+ "def scaleMinMax(rdata, minMax):\n",
908
+ " \"\"\"\n",
909
+ " scales feature data column wise using min max values for each field\n",
910
+ " Parameters\n",
911
+ " rdata : data array\n",
912
+ " minMax : ni, max and range for each column\n",
913
+ " \"\"\"\n",
914
+ " srdata = list()\n",
915
+ " for i in range(len(rdata)):\n",
916
+ " d = rdata[i]\n",
917
+ " sd = (d - minMax[i][0]) / minMax[i][2]\n",
918
+ " srdata.append(sd)\n",
919
+ " return srdata\n",
920
+ "\n",
921
+ "def harmonicNum(n):\n",
922
+ " \"\"\"\n",
923
+ " harmonic number\n",
924
+ " Parameters\n",
925
+ " n : number\n",
926
+ " \"\"\"\n",
927
+ " h = 0\n",
928
+ " for i in range(1, n+1, 1):\n",
929
+ " h += 1.0 / i\n",
930
+ " return h\n",
931
+ "\n",
932
+ "def digammaFun(n):\n",
933
+ " \"\"\"\n",
934
+ " figamma function\n",
935
+ " Parameters\n",
936
+ " n : number\n",
937
+ " \"\"\"\n",
938
+ " #Euler Mascheroni constant\n",
939
+ " ec = 0.577216\n",
940
+ " return harmonicNum(n - 1) - ec\n",
941
+ "\n",
942
+ "def getDataPartitions(tdata, types, columns = None):\n",
943
+ " \"\"\"\n",
944
+ " partitions data with the given columns and random split point defined with predicates\n",
945
+ " Parameters\n",
946
+ " tdata : 2D array\n",
947
+ " types : data typers\n",
948
+ " columns : column indexes\n",
949
+ " \"\"\"\n",
950
+ " (dtypes, cvalues) = extractTypesFromString(types)\n",
951
+ " if columns is None:\n",
952
+ " ncol = len(data[0])\n",
953
+ " columns = list(range(ncol))\n",
954
+ " ncol = len(columns)\n",
955
+ " #print(columns)\n",
956
+ "\n",
957
+ " # partition predicates\n",
958
+ " partitions = None\n",
959
+ " for c in columns:\n",
960
+ " #print(c)\n",
961
+ " dtype = dtypes[c]\n",
962
+ " pred = list()\n",
963
+ " if dtype == \"int\" or dtype == \"float\":\n",
964
+ " (vmin, vmax) = getColMinMax(tdata, c)\n",
965
+ " r = vmax - vmin\n",
966
+ " rmin = vmin + .2 * r\n",
967
+ " rmax = vmax - .2 * r\n",
968
+ " sp = randomFloat(rmin, rmax)\n",
969
+ " if dtype == \"int\":\n",
970
+ " sp = int(sp)\n",
971
+ " else:\n",
972
+ " sp = \"{:.3f}\".format(sp)\n",
973
+ " sp = float(sp)\n",
974
+ " pred.append([c, \"LT\", sp])\n",
975
+ " pred.append([c, \"GE\", sp])\n",
976
+ " elif dtype == \"cat\":\n",
977
+ " cv = cvalues[c]\n",
978
+ " card = len(cv) \n",
979
+ " if card < 3:\n",
980
+ " num = 1\n",
981
+ " else:\n",
982
+ " num = randomInt(1, card - 1)\n",
983
+ " sp = selectRandomSubListFromList(cv, num)\n",
984
+ " sp = \" \".join(sp)\n",
985
+ " pred.append([c, \"IN\", sp])\n",
986
+ " pred.append([c, \"NOTIN\", sp])\n",
987
+ "\n",
988
+ " #print(pred)\n",
989
+ " if partitions is None:\n",
990
+ " partitions = pred.copy()\n",
991
+ " #print(\"initial\")\n",
992
+ " #print(partitions)\n",
993
+ " else:\n",
994
+ " #print(\"extension\")\n",
995
+ " tparts = list()\n",
996
+ " for p in partitions:\n",
997
+ " #print(p)\n",
998
+ " l1 = p.copy()\n",
999
+ " l1.extend(pred[0])\n",
1000
+ " l2 = p.copy()\n",
1001
+ " l2.extend(pred[1])\n",
1002
+ " #print(\"after extension\")\n",
1003
+ " #print(l1)\n",
1004
+ " #print(l2)\n",
1005
+ " tparts.append(l1)\n",
1006
+ " tparts.append(l2)\n",
1007
+ " partitions = tparts\t\n",
1008
+ " #print(\"extending\")\n",
1009
+ " #print(partitions)\n",
1010
+ "\n",
1011
+ " #for p in partitions:\n",
1012
+ " #print(p)\t\n",
1013
+ " return partitions\t\t\t\n",
1014
+ "\n",
1015
+ "def genAlmostUniformDistr(size, nswap=50):\n",
1016
+ " \"\"\"\n",
1017
+ " generate probability distribution\n",
1018
+ "\n",
1019
+ " Parameters\n",
1020
+ " size : distr size\n",
1021
+ " nswap : no of mass swaps\n",
1022
+ " \"\"\"\n",
1023
+ " un = 1.0 / size\n",
1024
+ " distr = [un] * size\n",
1025
+ " distr = mutDistr(distr, 0.1 * un, nswap)\n",
1026
+ " return distr\n",
1027
+ "\n",
1028
+ "def mutDistr(distr, shift, nswap=50):\n",
1029
+ " \"\"\"\n",
1030
+ " mutates a probability distribution\n",
1031
+ "\n",
1032
+ " Parameters\n",
1033
+ " distr distribution\n",
1034
+ " shift : amount of shift for swap\n",
1035
+ " nswap : no of mass swaps\n",
1036
+ " \"\"\"\n",
1037
+ " size = len(distr)\n",
1038
+ " for _ in range(nswap):\n",
1039
+ " fi = randomInt(0, size -1)\n",
1040
+ " si = randomInt(0, size -1)\n",
1041
+ " while fi == si:\n",
1042
+ " fi = randomInt(0, size -1)\n",
1043
+ " si = randomInt(0, size -1)\n",
1044
+ "\n",
1045
+ " shift = randomFloat(0, shift)\n",
1046
+ " t = distr[fi]\n",
1047
+ " distr[fi] -= shift\n",
1048
+ " if (distr[fi] < 0):\n",
1049
+ " distr[fi] = 0.0\n",
1050
+ " shift = t\n",
1051
+ " distr[si] += shift\n",
1052
+ " return distr\n",
1053
+ "\n",
1054
+ "def generateBinDistribution(size, ntrue):\n",
1055
+ " \"\"\"\n",
1056
+ " generate binary array with some elements set to 1\n",
1057
+ "\n",
1058
+ " Parameters\n",
1059
+ " size : distr size\n",
1060
+ " ntrue : no of true values\n",
1061
+ " \"\"\"\n",
1062
+ " distr = [0] * size\n",
1063
+ " idxs = selectRandomSubListFromList(list(range(size)), ntrue)\n",
1064
+ " for i in idxs:\n",
1065
+ " distr[i] = 1\n",
1066
+ " return distr\n",
1067
+ "\n",
1068
+ "def mutBinaryDistr(distr, nmut):\n",
1069
+ " \"\"\"\n",
1070
+ " mutate binary distribution\n",
1071
+ "\n",
1072
+ " Parameters\n",
1073
+ " distr : distr\n",
1074
+ " nmut : no of mutations\n",
1075
+ " \"\"\"\n",
1076
+ " idxs = selectRandomSubListFromList(list(range(len(distr))), nmut)\n",
1077
+ " for i in idxs:\n",
1078
+ " distr[i] = distr[i] ^ 1\n",
1079
+ "\n",
1080
+ "\n",
1081
+ "def fileSelFieldSubSeqModifierGen(filePath, column, offset, seqLen, modifier, precision, delim=\",\"):\n",
1082
+ " \"\"\"\n",
1083
+ " file record generator that superimposes given data in the specified segment of a column\n",
1084
+ " Parameters\n",
1085
+ " filePath ; file path\n",
1086
+ " column : column index \n",
1087
+ " offset : offset into column values\n",
1088
+ " seqLen : length of subseq\n",
1089
+ " modifier : data to be superimposed either list or a sampler object\n",
1090
+ " precision : floating point precision\n",
1091
+ " delim : delemeter\n",
1092
+ " \"\"\"\n",
1093
+ " beg = offset\n",
1094
+ " end = beg + seqLen\n",
1095
+ " isList = type(modifier) == list\n",
1096
+ " i = 0\n",
1097
+ " for rec in fileRecGen(filePath, delim):\n",
1098
+ " if i >= beg and i < end:\n",
1099
+ " va = float(rec[column])\n",
1100
+ " if isList:\n",
1101
+ " va += modifier[i - beg] \n",
1102
+ " else:\n",
1103
+ " va += modifier.sample()\n",
1104
+ " rec[column] = formatFloat(precision, va)\n",
1105
+ " yield delim.join(rec)\n",
1106
+ " i += 1\n",
1107
+ "\n",
1108
+ "class ShiftedDataGenerator:\n",
1109
+ " \"\"\"\n",
1110
+ " transforms data for distribution shift\n",
1111
+ " \"\"\"\n",
1112
+ " def __init__(self, types, tdata, addFact, multFact):\n",
1113
+ " \"\"\"\n",
1114
+ " initializer\n",
1115
+ "\n",
1116
+ " Parameters\n",
1117
+ " types data types\n",
1118
+ " tdata : 2D array\n",
1119
+ " addFact ; factor for data shift\n",
1120
+ " multFact ; factor for data scaling\n",
1121
+ " \"\"\"\n",
1122
+ " (self.dtypes, self.cvalues) = extractTypesFromString(types)\n",
1123
+ "\n",
1124
+ " self.limits = dict()\n",
1125
+ " for k,v in self.dtypes.items():\n",
1126
+ " if v == \"int\" or v == \"false\":\n",
1127
+ " (vmax, vmin) = getColMinMax(tdata, k)\n",
1128
+ " self.limits[k] = vmax - vmin\n",
1129
+ " self.addMin = - addFact / 2\n",
1130
+ " self.addMax = addFact / 2\n",
1131
+ " self.multMin = 1.0 - multFact / 2\n",
1132
+ " self.multMax = 1.0 + multFact / 2\n",
1133
+ "\n",
1134
+ "\n",
1135
+ "\n",
1136
+ "\n",
1137
+ " def transform(self, tdata):\n",
1138
+ " \"\"\"\n",
1139
+ " linear transforms data to create distribution shift with random shift and scale\n",
1140
+ " Parameters\n",
1141
+ " types : data types\n",
1142
+ " \"\"\"\n",
1143
+ " transforms = dict()\n",
1144
+ " for k,v in self.dtypes.items():\n",
1145
+ " if v == \"int\" or v == \"false\":\t\t\t\t\n",
1146
+ " shift = randomFloat(self.addMin, self.addMax) * self.limits[k] \n",
1147
+ " scale = randomFloat(self.multMin, self.multMax)\n",
1148
+ " trns = (shift, scale)\n",
1149
+ " transforms[k] = trns\n",
1150
+ " elif v == \"cat\":\n",
1151
+ " transforms[k] = isEventSampled(50)\n",
1152
+ "\n",
1153
+ " ttdata = list()\n",
1154
+ " for rec in tdata:\n",
1155
+ " nrec = rec.copy()\n",
1156
+ " for c in range(len(rec)):\n",
1157
+ " if c in self.dtypes:\n",
1158
+ " dtype = self.dtypes[c]\n",
1159
+ " if dtype == \"int\" or dtype == \"float\":\n",
1160
+ " (shift, scale) = transforms[c]\n",
1161
+ " nval = shift + rec[c] * scale\n",
1162
+ " if dtype == \"int\":\n",
1163
+ " nrec[c] = int(nval)\n",
1164
+ " else:\n",
1165
+ " nrec[c] = nval\n",
1166
+ " elif dtype == \"cat\":\n",
1167
+ " cv = self.cvalues[c]\n",
1168
+ " if transforms[c]:\n",
1169
+ " nval = selectOtherRandomFromList(cv, rec[c])\n",
1170
+ " nrec[c] = nval\n",
1171
+ "\n",
1172
+ " ttdata.append(nrec)\n",
1173
+ "\n",
1174
+ " return ttdata\n",
1175
+ "\n",
1176
+ " def transformSpecified(self, tdata, sshift, scale):\n",
1177
+ " \"\"\"\n",
1178
+ " linear transforms data to create distribution shift shift specified shift and scale\n",
1179
+ " Parameters\n",
1180
+ " types : data types\n",
1181
+ " sshift : shift factor\n",
1182
+ " scale : scale factor\n",
1183
+ " \"\"\"\n",
1184
+ " transforms = dict()\n",
1185
+ " for k,v in self.dtypes.items():\n",
1186
+ " if v == \"int\" or v == \"false\":\t\t\t\t\n",
1187
+ " shift = sshift * self.limits[k] \n",
1188
+ " trns = (shift, scale)\n",
1189
+ " transforms[k] = trns\n",
1190
+ " elif v == \"cat\":\n",
1191
+ " transforms[k] = isEventSampled(50)\n",
1192
+ "\n",
1193
+ " ttdata = self.__scaleShift(tdata, transforms)\n",
1194
+ " return ttdata\n",
1195
+ "\n",
1196
+ " def __scaleShift(self, tdata, transforms):\n",
1197
+ " \"\"\"\n",
1198
+ " shifts and scales tabular data\n",
1199
+ "\n",
1200
+ " Parameters\n",
1201
+ " tdata : 2D array\n",
1202
+ " transforms : transforms to apply\n",
1203
+ " \"\"\"\n",
1204
+ " ttdata = list()\n",
1205
+ " for rec in tdata:\n",
1206
+ " nrec = rec.copy()\n",
1207
+ " for c in range(len(rec)):\n",
1208
+ " if c in self.dtypes:\n",
1209
+ " dtype = self.dtypes[c]\n",
1210
+ " if dtype == \"int\" or dtype == \"float\":\n",
1211
+ " (shift, scale) = transforms[c]\n",
1212
+ " nval = shift + rec[c] * scale\n",
1213
+ " if dtype == \"int\":\n",
1214
+ " nrec[c] = int(nval)\n",
1215
+ " else:\n",
1216
+ " nrec[c] = nval\n",
1217
+ " elif dtype == \"cat\":\n",
1218
+ " cv = self.cvalues[c]\n",
1219
+ " if transforms[c]:\n",
1220
+ " #nval = selectOtherRandomFromList(cv, rec[c])\n",
1221
+ " #nrec[c] = nval\n",
1222
+ " pass\n",
1223
+ "\n",
1224
+ " ttdata.append(nrec)\n",
1225
+ " return ttdata\n",
1226
+ "\n",
1227
+ "class RollingStat(object):\n",
1228
+ " \"\"\"\n",
1229
+ " stats for rolling windowt\n",
1230
+ " \"\"\"\n",
1231
+ " def __init__(self, wsize):\n",
1232
+ " \"\"\"\n",
1233
+ " initializer\n",
1234
+ "\n",
1235
+ " Parameters\n",
1236
+ " wsize : window size\n",
1237
+ " \"\"\"\n",
1238
+ " self.window = list()\n",
1239
+ " self.wsize = wsize\n",
1240
+ " self.mean = None\n",
1241
+ " self.sd = None\n",
1242
+ "\n",
1243
+ " def add(self, value):\n",
1244
+ " \"\"\"\n",
1245
+ " add a value\n",
1246
+ "\n",
1247
+ " Parameters\n",
1248
+ " value : value to add\n",
1249
+ " \"\"\"\n",
1250
+ " self.window.append(value)\n",
1251
+ " if len(self.window) > self.wsize:\n",
1252
+ " self.window = self.window[1:]\n",
1253
+ "\n",
1254
+ " def getStat(self):\n",
1255
+ " \"\"\"\n",
1256
+ " get rolling window mean and std deviation\n",
1257
+ " \"\"\"\n",
1258
+ " assertGreater(len(self.window), 0, \"window is empty\")\n",
1259
+ " if len(self.window) == 1:\n",
1260
+ " self.mean = self.window[0]\n",
1261
+ " self.sd = 0\n",
1262
+ " else:\n",
1263
+ " self.mean = statistics.mean(self.window)\n",
1264
+ " self.sd = statistics.stdev(self.window, xbar=self.mean)\n",
1265
+ " re = (self.mean, self.sd)\n",
1266
+ " return re\n",
1267
+ "\n",
1268
+ " def getSize(self):\n",
1269
+ " \"\"\"\n",
1270
+ " return window size\n",
1271
+ " \"\"\"\n",
1272
+ " return len(self.window)\n"
1273
+ ]
1274
+ }
1275
+ ],
1276
+ "metadata": {
1277
+ "kernelspec": {
1278
+ "display_name": "Python 3 (ipykernel)",
1279
+ "language": "python",
1280
+ "name": "python3"
1281
+ },
1282
+ "language_info": {
1283
+ "codemirror_mode": {
1284
+ "name": "ipython",
1285
+ "version": 3
1286
+ },
1287
+ "file_extension": ".py",
1288
+ "mimetype": "text/x-python",
1289
+ "name": "python",
1290
+ "nbconvert_exporter": "python",
1291
+ "pygments_lexer": "ipython3",
1292
+ "version": "3.9.12"
1293
+ }
1294
+ },
1295
+ "nbformat": 4,
1296
+ "nbformat_minor": 5
1297
+ }
lib/sampler.ipynb ADDED
@@ -0,0 +1,1366 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "c19a2efe",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import sys\n",
11
+ "import random \n",
12
+ "import time\n",
13
+ "import math\n",
14
+ "import random\n",
15
+ "import numpy as np\n",
16
+ "from scipy import stats\n",
17
+ "from random import randint\n",
18
+ "from util import *\n",
19
+ "from stats import Histogram\n",
20
+ "\n",
21
+ "def randomFloat(low, high):\n",
22
+ " \"\"\"\n",
23
+ " sample float within range\n",
24
+ " Parameters\n",
25
+ " low : low valuee\n",
26
+ " high : high valuee\n",
27
+ " \"\"\"\n",
28
+ " return random.random() * (high-low) + low\n",
29
+ "\n",
30
+ "def randomInt(minv, maxv):\n",
31
+ " \"\"\"\n",
32
+ " sample int within range\n",
33
+ " Parameters\n",
34
+ " minv : low valuee\n",
35
+ " maxv : high valuee\n",
36
+ " \"\"\"\n",
37
+ " return randint(minv, maxv)\n",
38
+ "\n",
39
+ "def randIndex(lData):\n",
40
+ " \"\"\"\n",
41
+ " random index of a list\n",
42
+ " Parameters\n",
43
+ " lData : list data\n",
44
+ " \"\"\"\n",
45
+ " return randint(0, len(lData)-1)\n",
46
+ "\n",
47
+ "def randomUniformSampled(low, high):\n",
48
+ " \"\"\"\n",
49
+ " sample float within range\n",
50
+ "\n",
51
+ " Parameters\n",
52
+ " low : low value\n",
53
+ " high : high value\n",
54
+ " \"\"\"\n",
55
+ " return np.random.uniform(low, high)\n",
56
+ "\n",
57
+ "def randomUniformSampledList(low, high, size):\n",
58
+ " \"\"\"\n",
59
+ " sample floats within range to create list\n",
60
+ " Parameters\n",
61
+ " low : low value\n",
62
+ " high : high value\n",
63
+ " size ; size of list to be returned\n",
64
+ " \"\"\"\n",
65
+ " return np.random.uniform(low, high, size)\n",
66
+ "\n",
67
+ "def randomNormSampled(mean, sd):\n",
68
+ " \"\"\"\n",
69
+ " sample float from normal\n",
70
+ " Parameters\n",
71
+ " mean : mean\n",
72
+ " sd : std deviation\n",
73
+ " \"\"\"\n",
74
+ " return np.random.normal(mean, sd)\n",
75
+ "\n",
76
+ "def randomNormSampledList(mean, sd, size):\n",
77
+ " \"\"\"\n",
78
+ " sample float list from normal \n",
79
+ " Parameters\n",
80
+ " mean : mean\n",
81
+ " sd : std deviation\n",
82
+ " size : size of list to be returned\n",
83
+ " \"\"\"\n",
84
+ " return np.random.normal(mean, sd, size)\n",
85
+ "\n",
86
+ "def randomSampledList(sampler, size):\n",
87
+ " \"\"\"\n",
88
+ " sample list from given sampler \n",
89
+ " Parameters\n",
90
+ " sampler : sampler object\n",
91
+ " size : size of list to be returned\n",
92
+ " \"\"\"\n",
93
+ " return list(map(lambda i : sampler.sample(), range(size)))\n",
94
+ "\n",
95
+ "\n",
96
+ "def minLimit(val, minv):\n",
97
+ " \"\"\"\n",
98
+ " min limit\n",
99
+ "\n",
100
+ " Parameters\n",
101
+ " val : value\n",
102
+ " minv : min limit\n",
103
+ " \"\"\"\n",
104
+ " if (val < minv):\n",
105
+ " val = minv\n",
106
+ " return val\n",
107
+ "\n",
108
+ "\n",
109
+ "def rangeLimit(val, minv, maxv):\n",
110
+ " \"\"\"\n",
111
+ " range limit\n",
112
+ " Parameters\n",
113
+ " val : value\n",
114
+ " minv : min limit\n",
115
+ " maxv : max limit\n",
116
+ " \"\"\"\n",
117
+ " if (val < minv):\n",
118
+ " val = minv\n",
119
+ " elif (val > maxv):\n",
120
+ " val = maxv\n",
121
+ " return val\n",
122
+ "\n",
123
+ "\n",
124
+ "def sampleUniform(minv, maxv):\n",
125
+ " \"\"\"\n",
126
+ " sample int within range\n",
127
+ " Parameters\n",
128
+ " minv ; int min limit\n",
129
+ " maxv : int max limit\n",
130
+ " \"\"\"\n",
131
+ " return randint(minv, maxv)\n",
132
+ "\n",
133
+ "\n",
134
+ "def sampleFromBase(value, dev):\n",
135
+ " \"\"\"\n",
136
+ " sample int wrt base\n",
137
+ " Parameters\n",
138
+ " value : base value\n",
139
+ " dev : deviation\n",
140
+ " \"\"\"\n",
141
+ " return randint(value - dev, value + dev)\n",
142
+ "\n",
143
+ "\n",
144
+ "def sampleFloatFromBase(value, dev):\n",
145
+ " \"\"\"\n",
146
+ " sample float wrt base\n",
147
+ " Parameters\n",
148
+ " value : base value\n",
149
+ " dev : deviation\n",
150
+ " \"\"\"\n",
151
+ " return randomFloat(value - dev, value + dev)\n",
152
+ "\n",
153
+ "\n",
154
+ "def distrUniformWithRanndom(total, numItems, noiseLevel):\n",
155
+ " \"\"\"\n",
156
+ " uniformly distribute with some randomness and preserves total\n",
157
+ " Parameters\n",
158
+ " total : total count\n",
159
+ " numItems : no of bins\n",
160
+ " noiseLevel : noise level fraction\n",
161
+ " \"\"\"\n",
162
+ " perItem = total / numItems\n",
163
+ " var = perItem * noiseLevel\n",
164
+ " items = []\n",
165
+ " for i in range(numItems):\n",
166
+ " item = perItem + randomFloat(-var, var)\n",
167
+ " items.append(item)\t\n",
168
+ "\n",
169
+ " #adjust last item\n",
170
+ " sm = sum(items[:-1])\n",
171
+ " items[-1] = total - sm\n",
172
+ " return items\n",
173
+ "\n",
174
+ "\n",
175
+ "def isEventSampled(threshold, maxv=100):\n",
176
+ " \"\"\"\n",
177
+ " sample event which occurs if sampled below threshold\n",
178
+ " Parameters\n",
179
+ " threshold : threshold for sampling\n",
180
+ " maxv : maximum values\n",
181
+ " \"\"\"\n",
182
+ " return randint(0, maxv) < threshold\n",
183
+ "\n",
184
+ "\n",
185
+ "def sampleBinaryEvents(events, probPercent):\n",
186
+ " \"\"\"\n",
187
+ " sample binary events\n",
188
+ " Parameters\n",
189
+ " events : two events\n",
190
+ " probPercent : probability as percentage\n",
191
+ " \"\"\"\n",
192
+ " if (randint(0, 100) < probPercent):\n",
193
+ " event = events[0]\n",
194
+ " else:\n",
195
+ " event = events[1]\n",
196
+ " return event\n",
197
+ "\n",
198
+ "\n",
199
+ "def addNoiseNum(value, sampler):\n",
200
+ " \"\"\"\n",
201
+ " add noise to numeric value\n",
202
+ " Parameters\n",
203
+ " value : base value\n",
204
+ " sampler : sampler for noise\n",
205
+ " \"\"\"\n",
206
+ " return value * (1 + sampler.sample())\n",
207
+ "\n",
208
+ "\n",
209
+ "def addNoiseCat(value, values, noise):\t\n",
210
+ " \"\"\"\n",
211
+ " add noise to categorical value i.e with some probability change value\n",
212
+ " Parameters\n",
213
+ " value : cat value\n",
214
+ " values : cat values\n",
215
+ " noise : noise level fraction\n",
216
+ " \"\"\"\n",
217
+ " newValue = value\n",
218
+ " threshold = int(noise * 100)\n",
219
+ " if (isEventSampled(threshold)):\t\t\n",
220
+ " newValue = selectRandomFromList(values)\n",
221
+ " while newValue == value:\n",
222
+ " newValue = selectRandomFromList(values)\n",
223
+ " return newValue\n",
224
+ "\n",
225
+ "\n",
226
+ "def sampleWithReplace(data, sampSize):\n",
227
+ " \"\"\"\n",
228
+ " sample with replacement\n",
229
+ " Parameters\n",
230
+ " data : array\n",
231
+ " sampSize : sample size\n",
232
+ " \"\"\"\n",
233
+ " sampled = list()\n",
234
+ " le = len(data)\n",
235
+ " if sampSize is None:\n",
236
+ " sampSize = le\n",
237
+ " for i in range(sampSize):\n",
238
+ " j = random.randint(0, le - 1)\n",
239
+ " sampled.append(data[j])\n",
240
+ " return sampled\n",
241
+ "\n",
242
+ "class CumDistr:\n",
243
+ " \"\"\"\n",
244
+ " cumulative distr\n",
245
+ " \"\"\"\n",
246
+ "\n",
247
+ " def __init__(self, data, numBins = None):\n",
248
+ " \"\"\"\n",
249
+ " initializer\n",
250
+ "\n",
251
+ " Parameters\n",
252
+ " data : array\n",
253
+ " numBins : no of bins\n",
254
+ " \"\"\"\n",
255
+ " if not numBins:\n",
256
+ " numBins = int(len(data) / 5)\n",
257
+ " res = stats.cumfreq(data, numbins=numBins)\n",
258
+ " self.cdistr = res.cumcount / len(data)\n",
259
+ " self.loLim = res.lowerlimit\n",
260
+ " self.upLim = res.lowerlimit + res.binsize * res.cumcount.size\n",
261
+ " self.binWidth = res.binsize\n",
262
+ "\n",
263
+ " def getDistr(self, value):\n",
264
+ " \"\"\"\n",
265
+ " get cumulative distribution\n",
266
+ "\n",
267
+ " Parameters\n",
268
+ " value : value\n",
269
+ " \"\"\"\n",
270
+ " if value <= self.loLim:\n",
271
+ " d = 0.0\n",
272
+ " elif value >= self.upLim:\n",
273
+ " d = 1.0\n",
274
+ " else:\n",
275
+ " bin = int((value - self.loLim) / self.binWidth)\n",
276
+ " d = self.cdistr[bin]\n",
277
+ " return d\n",
278
+ "\n",
279
+ "class BernoulliTrialSampler:\n",
280
+ " \"\"\"\n",
281
+ " bernoulli trial sampler return True or False\n",
282
+ " \"\"\"\n",
283
+ "\n",
284
+ " def __init__(self, pr):\n",
285
+ " \"\"\"\n",
286
+ " initializer\n",
287
+ "\n",
288
+ " Parameters\n",
289
+ " pr : probability\n",
290
+ " \"\"\"\n",
291
+ " self.pr = pr\n",
292
+ "\n",
293
+ " def sample(self):\n",
294
+ " \"\"\"\n",
295
+ " samples value\n",
296
+ " \"\"\"\n",
297
+ " return random.random() < self.pr\n",
298
+ "\n",
299
+ "class PoissonSampler:\n",
300
+ " \"\"\"\n",
301
+ " poisson sampler returns number of events\n",
302
+ " \"\"\"\n",
303
+ " def __init__(self, rateOccur, maxSamp):\n",
304
+ " \"\"\"\n",
305
+ " initializer\n",
306
+ "\n",
307
+ " Parameters\n",
308
+ " rateOccur : rate of occurence\n",
309
+ " maxSamp : max limit on no of samples\n",
310
+ " \"\"\"\n",
311
+ " self.rateOccur = rateOccur\n",
312
+ " self.maxSamp = int(maxSamp)\n",
313
+ " self.pmax = self.calculatePr(rateOccur)\n",
314
+ "\n",
315
+ " def calculatePr(self, numOccur):\n",
316
+ " \"\"\"\n",
317
+ " calulates probability\n",
318
+ "\n",
319
+ " Parameters\n",
320
+ " numOccur : no of occurence\n",
321
+ " \"\"\"\n",
322
+ " p = (self.rateOccur ** numOccur) * math.exp(-self.rateOccur) / math.factorial(numOccur)\n",
323
+ " return p\n",
324
+ "\n",
325
+ " def sample(self):\n",
326
+ " \"\"\"\n",
327
+ " samples value\n",
328
+ " \"\"\"\n",
329
+ " done = False\n",
330
+ " samp = 0\n",
331
+ " while not done:\n",
332
+ " no = randint(0, self.maxSamp)\n",
333
+ " sp = randomFloat(0.0, self.pmax)\n",
334
+ " ap = self.calculatePr(no)\n",
335
+ " if sp < ap:\n",
336
+ " done = True\n",
337
+ " samp = no\n",
338
+ " return samp\n",
339
+ "\n",
340
+ "class ExponentialSampler:\n",
341
+ " \"\"\"\n",
342
+ " returns interval between events\n",
343
+ " \"\"\"\n",
344
+ " def __init__(self, rateOccur, maxSamp = None):\n",
345
+ " \"\"\"\n",
346
+ " initializer\n",
347
+ "\n",
348
+ " Parameters\n",
349
+ " rateOccur : rate of occurence\n",
350
+ " maxSamp : max limit on interval\n",
351
+ " \"\"\"\n",
352
+ " self.interval = 1.0 / rateOccur\n",
353
+ " self.maxSamp = int(maxSamp) if maxSamp is not None else None\n",
354
+ "\n",
355
+ " def sample(self):\n",
356
+ " \"\"\"\n",
357
+ " samples value\n",
358
+ " \"\"\"\n",
359
+ " sampled = np.random.exponential(scale=self.interval)\n",
360
+ " if self.maxSamp is not None:\n",
361
+ " while sampled > self.maxSamp:\n",
362
+ " sampled = np.random.exponential(scale=self.interval)\n",
363
+ " return sampled\n",
364
+ "\n",
365
+ "class UniformNumericSampler:\n",
366
+ " \"\"\"\n",
367
+ " uniform sampler for numerical values\n",
368
+ " \"\"\"\n",
369
+ " def __init__(self, minv, maxv):\n",
370
+ " \"\"\"\n",
371
+ " initializer\n",
372
+ "\n",
373
+ " Parameters\n",
374
+ " minv : min value\n",
375
+ " maxv : max value\n",
376
+ " \"\"\"\n",
377
+ " self.minv = minv\n",
378
+ " self.maxv = maxv\n",
379
+ "\n",
380
+ " def isNumeric(self):\n",
381
+ " \"\"\"\n",
382
+ " returns true\n",
383
+ " \"\"\"\n",
384
+ " return True\n",
385
+ "\n",
386
+ " def sample(self):\n",
387
+ " \"\"\"\n",
388
+ " samples value\n",
389
+ " \"\"\"\n",
390
+ " samp =\tsampleUniform(self.minv, self.maxv) if isinstance(self.minv, int) else randomFloat(self.minv, self.maxv)\n",
391
+ " return samp\t\n",
392
+ "\n",
393
+ "class UniformCategoricalSampler:\n",
394
+ " \"\"\"\n",
395
+ " uniform sampler for categorical values\n",
396
+ " \"\"\"\n",
397
+ " def __init__(self, cvalues):\n",
398
+ " \"\"\"\n",
399
+ " initializer\n",
400
+ "\n",
401
+ " Parameters\n",
402
+ " cvalues : categorical value list\n",
403
+ " \"\"\"\n",
404
+ " self.cvalues = cvalues\n",
405
+ "\n",
406
+ " def isNumeric(self):\n",
407
+ " return False\n",
408
+ "\n",
409
+ " def sample(self):\n",
410
+ " \"\"\"\n",
411
+ " samples value\n",
412
+ " \"\"\"\n",
413
+ " return selectRandomFromList(self.cvalues)\t\n",
414
+ "\n",
415
+ "class NormalSampler:\n",
416
+ " \"\"\"\n",
417
+ " normal sampler\n",
418
+ " \"\"\"\n",
419
+ " def __init__(self, mean, stdDev):\n",
420
+ " \"\"\"\n",
421
+ " initializer\n",
422
+ "\n",
423
+ " Parameters\n",
424
+ " mean : mean\n",
425
+ " stdDev : std deviation\n",
426
+ " \"\"\"\n",
427
+ " self.mean = mean\n",
428
+ " self.stdDev = stdDev\n",
429
+ " self.sampleAsInt = False\n",
430
+ "\n",
431
+ " def isNumeric(self):\n",
432
+ " return True\n",
433
+ "\n",
434
+ " def sampleAsIntValue(self):\n",
435
+ " \"\"\"\n",
436
+ " set True to sample as int\n",
437
+ " \"\"\"\n",
438
+ " self.sampleAsInt = True\n",
439
+ "\n",
440
+ " def sample(self):\n",
441
+ " \"\"\"\n",
442
+ " samples value\n",
443
+ " \"\"\"\n",
444
+ " samp = np.random.normal(self.mean, self.stdDev)\n",
445
+ " if self.sampleAsInt:\n",
446
+ " samp = int(samp)\n",
447
+ " return samp\n",
448
+ "\n",
449
+ "class LogNormalSampler:\n",
450
+ " \"\"\"\n",
451
+ " log normal sampler\n",
452
+ " \"\"\"\n",
453
+ " def __init__(self, mean, stdDev):\n",
454
+ " \"\"\"\n",
455
+ " initializer\n",
456
+ "\n",
457
+ " Parameters\n",
458
+ " mean : mean\n",
459
+ " stdDev : std deviation\n",
460
+ " \"\"\"\n",
461
+ " self.mean = mean\n",
462
+ " self.stdDev = stdDev\n",
463
+ "\n",
464
+ " def isNumeric(self):\n",
465
+ " return True\n",
466
+ "\n",
467
+ " def sample(self):\n",
468
+ " \"\"\"\n",
469
+ " samples value\n",
470
+ " \"\"\"\n",
471
+ " return np.random.lognormal(self.mean, self.stdDev)\n",
472
+ "\n",
473
+ "class NormalSamplerWithTrendCycle:\n",
474
+ " \"\"\"\n",
475
+ " normal sampler with cycle and trend\n",
476
+ " \"\"\"\n",
477
+ " def __init__(self, mean, stdDev, dmean, cycle, step=1):\n",
478
+ " \"\"\"\n",
479
+ " initializer\n",
480
+ "\n",
481
+ " Parameters\n",
482
+ " mean : mean\n",
483
+ " stdDev : std deviation\n",
484
+ " dmean : trend delta\n",
485
+ " cycle : cycle values wrt base mean\n",
486
+ " step : adjustment step for cycle and trend\n",
487
+ " \"\"\"\n",
488
+ " self.mean = mean\n",
489
+ " self.cmean = mean\n",
490
+ " self.stdDev = stdDev\n",
491
+ " self.dmean = dmean\n",
492
+ " self.cycle = cycle\n",
493
+ " self.clen = len(cycle) if cycle is not None else 0\n",
494
+ " self.step = step\n",
495
+ " self.count = 0\n",
496
+ "\n",
497
+ " def isNumeric(self):\n",
498
+ " return True\n",
499
+ "\n",
500
+ " def sample(self):\n",
501
+ " \"\"\"\n",
502
+ " samples value\n",
503
+ " \"\"\"\n",
504
+ " s = np.random.normal(self.cmean, self.stdDev)\n",
505
+ " self.count += 1\n",
506
+ " if self.count % self.step == 0:\n",
507
+ " cy = 0\n",
508
+ " if self.clen > 1:\n",
509
+ " coff = self.count % self.clen\n",
510
+ " cy = self.cycle[coff]\n",
511
+ " tr = self.count * self.dmean\n",
512
+ " self.cmean = self.mean + tr + cy\n",
513
+ " return s\n",
514
+ "\n",
515
+ "\n",
516
+ "class ParetoSampler:\n",
517
+ " \"\"\"\n",
518
+ " pareto sampler\n",
519
+ " \"\"\"\n",
520
+ " def __init__(self, mode, shape):\n",
521
+ " \"\"\"\n",
522
+ " initializer\n",
523
+ "\n",
524
+ " Parameters\n",
525
+ " mode : mode\n",
526
+ " shape : shape\n",
527
+ " \"\"\"\n",
528
+ " self.mode = mode\n",
529
+ " self.shape = shape\n",
530
+ "\n",
531
+ " def isNumeric(self):\n",
532
+ " return True\n",
533
+ "\n",
534
+ " def sample(self):\n",
535
+ " \"\"\"\n",
536
+ " samples value\n",
537
+ " \"\"\"\n",
538
+ " return (np.random.pareto(self.shape) + 1) * self.mode\n",
539
+ "\n",
540
+ "class GammaSampler:\n",
541
+ " \"\"\"\n",
542
+ " pareto sampler\n",
543
+ " \"\"\"\n",
544
+ " def __init__(self, shape, scale):\n",
545
+ " \"\"\"\n",
546
+ " initializer\n",
547
+ "\n",
548
+ " Parameters\n",
549
+ " shape : shape\n",
550
+ " scale : scale\n",
551
+ " \"\"\"\n",
552
+ " self.shape = shape\n",
553
+ " self.scale = scale\n",
554
+ "\n",
555
+ " def isNumeric(self):\n",
556
+ " return True\n",
557
+ "\n",
558
+ " def sample(self):\n",
559
+ " \"\"\"\n",
560
+ " samples value\n",
561
+ " \"\"\"\n",
562
+ " return np.random.gamma(self.shape, self.scale)\n",
563
+ "\n",
564
+ "class GaussianRejectSampler:\n",
565
+ " \"\"\"\n",
566
+ " gaussian sampling based on rejection sampling\n",
567
+ " \"\"\"\n",
568
+ " def __init__(self, mean, stdDev):\n",
569
+ " \"\"\"\n",
570
+ " initializer\n",
571
+ "\n",
572
+ " Parameters\n",
573
+ " mean : mean\n",
574
+ " stdDev : std deviation\n",
575
+ " \"\"\"\n",
576
+ " self.mean = mean\n",
577
+ " self.stdDev = stdDev\n",
578
+ " self.xmin = mean - 3 * stdDev\n",
579
+ " self.xmax = mean + 3 * stdDev\n",
580
+ " self.ymin = 0.0\n",
581
+ " self.fmax = 1.0 / (math.sqrt(2.0 * 3.14) * stdDev)\n",
582
+ " self.ymax = 1.05 * self.fmax\n",
583
+ " self.sampleAsInt = False\n",
584
+ "\n",
585
+ " def isNumeric(self):\n",
586
+ " return True\n",
587
+ "\n",
588
+ " def sampleAsIntValue(self):\n",
589
+ " \"\"\"\n",
590
+ " sample as int value\n",
591
+ " \"\"\"\n",
592
+ " self.sampleAsInt = True\n",
593
+ "\n",
594
+ " def sample(self):\n",
595
+ " \"\"\"\n",
596
+ " samples value\n",
597
+ " \"\"\"\n",
598
+ " done = False\n",
599
+ " samp = 0\n",
600
+ " while not done:\n",
601
+ " x = randomFloat(self.xmin, self.xmax)\n",
602
+ " y = randomFloat(self.ymin, self.ymax)\n",
603
+ " f = self.fmax * math.exp(-(x - self.mean) * (x - self.mean) / (2.0 * self.stdDev * self.stdDev))\n",
604
+ " if (y < f):\n",
605
+ " done = True\n",
606
+ " samp = x\n",
607
+ " if self.sampleAsInt:\n",
608
+ " samp = int(samp)\n",
609
+ " return samp\n",
610
+ "\n",
611
+ "class DiscreteRejectSampler:\n",
612
+ " \"\"\"\n",
613
+ " non parametric sampling for discrete values using given distribution based \n",
614
+ " on rejection sampling\t\n",
615
+ " \"\"\"\n",
616
+ " def __init__(self, xmin, xmax, step, *values):\n",
617
+ " \"\"\"\n",
618
+ " initializer\n",
619
+ "\n",
620
+ " Parameters\n",
621
+ " xmin : min value\n",
622
+ " xmax : max value\n",
623
+ " step : discrete step\n",
624
+ " values : distr values\n",
625
+ " \"\"\"\n",
626
+ " self.xmin = xmin\n",
627
+ " self.xmax = xmax\n",
628
+ " self.step = step\n",
629
+ " self.distr = values\n",
630
+ " if (len(self.distr) == 1):\n",
631
+ " self.distr = self.distr[0]\t\n",
632
+ " numSteps = int((self.xmax - self.xmin) / self.step)\n",
633
+ " #print(\"{:.3f} {:.3f} {:.3f} {}\".format(self.xmin, self.xmax, self.step, numSteps))\n",
634
+ " assert len(self.distr)\t== numSteps + 1, \"invalid number of distr values expected {}\".format(numSteps + 1)\n",
635
+ " self.ximin = 0\n",
636
+ " self.ximax = numSteps\n",
637
+ " self.pmax = float(max(self.distr))\n",
638
+ "\n",
639
+ " def isNumeric(self):\n",
640
+ " return True\n",
641
+ "\n",
642
+ " def sample(self):\n",
643
+ " \"\"\"\n",
644
+ " samples value\n",
645
+ " \"\"\"\n",
646
+ " done = False\n",
647
+ " samp = None\n",
648
+ " while not done:\n",
649
+ " xi = randint(self.ximin, self.ximax)\n",
650
+ " #print(formatAny(xi, \"xi\"))\n",
651
+ " ps = randomFloat(0.0, self.pmax)\n",
652
+ " pa = self.distr[xi]\n",
653
+ " if ps < pa:\n",
654
+ " samp = self.xmin + xi * self.step\n",
655
+ " done = True\n",
656
+ " return samp\n",
657
+ "\n",
658
+ "\n",
659
+ "class TriangularRejectSampler:\n",
660
+ " \"\"\"\n",
661
+ " non parametric sampling using triangular distribution based on rejection sampling\t\n",
662
+ " \"\"\"\n",
663
+ " def __init__(self, xmin, xmax, vertexValue, vertexPos=None):\n",
664
+ " \"\"\"\n",
665
+ " initializer\n",
666
+ "\n",
667
+ " Parameters\n",
668
+ " xmin : min value\n",
669
+ " xmax : max value\n",
670
+ " vertexValue : distr value at vertex\n",
671
+ " vertexPos : vertex pposition\n",
672
+ " \"\"\"\n",
673
+ " self.xmin = xmin\n",
674
+ " self.xmax = xmax\n",
675
+ " self.vertexValue = vertexValue\n",
676
+ " if vertexPos: \n",
677
+ " assert vertexPos > xmin and vertexPos < xmax, \"vertex position outside bound\"\n",
678
+ " self.vertexPos = vertexPos\n",
679
+ " else:\n",
680
+ " self.vertexPos = 0.5 * (xmin + xmax)\n",
681
+ " self.s1 = vertexValue / (self.vertexPos - xmin)\n",
682
+ " self.s2 = vertexValue / (xmax - self.vertexPos)\n",
683
+ "\n",
684
+ " def isNumeric(self):\n",
685
+ " return True\n",
686
+ "\n",
687
+ " def sample(self):\n",
688
+ " \"\"\"\n",
689
+ " samples value\n",
690
+ " \"\"\"\n",
691
+ " done = False\n",
692
+ " samp = None\n",
693
+ " while not done:\n",
694
+ " x = randomFloat(self.xmin, self.xmax)\n",
695
+ " y = randomFloat(0.0, self.vertexValue)\n",
696
+ " f = (x - self.xmin) * self.s1 if x < self.vertexPos else (self.xmax - x) * self.s2\n",
697
+ " if (y < f):\n",
698
+ " done = True\n",
699
+ " samp = x\n",
700
+ "\n",
701
+ " return samp;\t\n",
702
+ "\n",
703
+ "class NonParamRejectSampler:\n",
704
+ " \"\"\"\n",
705
+ " non parametric sampling using given distribution based on rejection sampling\t\n",
706
+ " \"\"\"\n",
707
+ " def __init__(self, xmin, binWidth, *values):\n",
708
+ " \"\"\"\n",
709
+ " initializer\n",
710
+ "\n",
711
+ " Parameters\n",
712
+ " xmin : min value\n",
713
+ " binWidth : bin width\n",
714
+ " values : distr values\n",
715
+ " \"\"\"\n",
716
+ " self.values = values\n",
717
+ " if (len(self.values) == 1):\n",
718
+ " self.values = self.values[0]\n",
719
+ " self.xmin = xmin\n",
720
+ " self.xmax = xmin + binWidth * (len(self.values) - 1)\n",
721
+ " #print(self.xmin, self.xmax, binWidth)\n",
722
+ " self.binWidth = binWidth\n",
723
+ " self.fmax = 0\n",
724
+ " for v in self.values:\n",
725
+ " if (v > self.fmax):\n",
726
+ " self.fmax = v\n",
727
+ " self.ymin = 0\n",
728
+ " self.ymax = self.fmax\n",
729
+ " self.sampleAsInt = True\n",
730
+ "\n",
731
+ " def isNumeric(self):\n",
732
+ " return True\n",
733
+ "\n",
734
+ " def sampleAsFloat(self):\n",
735
+ " self.sampleAsInt = False\n",
736
+ "\n",
737
+ " def sample(self):\n",
738
+ " \"\"\"\n",
739
+ " samples value\n",
740
+ " \"\"\"\n",
741
+ " done = False\n",
742
+ " samp = 0\n",
743
+ " while not done:\n",
744
+ " if self.sampleAsInt:\n",
745
+ " x = random.randint(self.xmin, self.xmax)\n",
746
+ " y = random.randint(self.ymin, self.ymax)\n",
747
+ " else:\n",
748
+ " x = randomFloat(self.xmin, self.xmax)\n",
749
+ " y = randomFloat(self.ymin, self.ymax)\n",
750
+ " bin = int((x - self.xmin) / self.binWidth)\n",
751
+ " f = self.values[bin]\n",
752
+ " if (y < f):\n",
753
+ " done = True\n",
754
+ " samp = x\n",
755
+ " return samp\n",
756
+ "\n",
757
+ "class JointNonParamRejectSampler:\n",
758
+ " \"\"\"\n",
759
+ " non parametric sampling using given distribution based on rejection sampling\t\n",
760
+ " \"\"\"\n",
761
+ " def __init__(self, xmin, xbinWidth, xnbin, ymin, ybinWidth, ynbin, *values):\n",
762
+ " \"\"\"\n",
763
+ " initializer\n",
764
+ "\n",
765
+ " Parameters\n",
766
+ " xmin : min value for x\n",
767
+ " xbinWidth : bin width for x\n",
768
+ " xnbin : no of bins for x\n",
769
+ " ymin : min value for y\n",
770
+ " ybinWidth : bin width for y\n",
771
+ " ynbin : no of bins for y\n",
772
+ " values : distr values\n",
773
+ " \"\"\"\n",
774
+ " self.values = values\n",
775
+ " if (len(self.values) == 1):\n",
776
+ " self.values = self.values[0]\n",
777
+ " assert len(self.values) == xnbin * ynbin, \"wrong number of values for joint distr\"\n",
778
+ " self.xmin = xmin\n",
779
+ " self.xmax = xmin + xbinWidth * xnbin\n",
780
+ " self.xbinWidth = xbinWidth\n",
781
+ " self.ymin = ymin\n",
782
+ " self.ymax = ymin + ybinWidth * ynbin\n",
783
+ " self.ybinWidth = ybinWidth\n",
784
+ " self.pmax = max(self.values)\n",
785
+ " self.values = np.array(self.values).reshape(xnbin, ynbin)\n",
786
+ "\n",
787
+ " def isNumeric(self):\n",
788
+ " return True\n",
789
+ "\n",
790
+ " def sample(self):\n",
791
+ " \"\"\"\n",
792
+ " samples value\n",
793
+ " \"\"\"\n",
794
+ " done = False\n",
795
+ " samp = 0\n",
796
+ " while not done:\n",
797
+ " x = randomFloat(self.xmin, self.xmax)\n",
798
+ " y = randomFloat(self.ymin, self.ymax)\n",
799
+ " xbin = int((x - self.xmin) / self.xbinWidth)\n",
800
+ " ybin = int((y - self.ymin) / self.ybinWidth)\n",
801
+ " ap = self.values[xbin][ybin]\n",
802
+ " sp = randomFloat(0.0, self.pmax)\n",
803
+ " if (sp < ap):\n",
804
+ " done = True\n",
805
+ " samp = [x,y]\n",
806
+ " return samp\n",
807
+ "\n",
808
+ "\n",
809
+ "class JointNormalSampler:\n",
810
+ " \"\"\"\n",
811
+ " joint normal sampler\t\n",
812
+ " \"\"\"\n",
813
+ " def __init__(self, *values):\n",
814
+ " \"\"\"\n",
815
+ " initializer\n",
816
+ "\n",
817
+ " Parameters\n",
818
+ " values : 2 mean values followed by 4 values for covar matrix\n",
819
+ " \"\"\"\n",
820
+ " lvalues = list(values)\n",
821
+ " assert len(lvalues) == 6, \"incorrect number of arguments for joint normal sampler\"\n",
822
+ " mean = lvalues[:2]\n",
823
+ " self.mean = np.array(mean)\n",
824
+ " sd = lvalues[2:]\n",
825
+ " self.sd = np.array(sd).reshape(2,2)\n",
826
+ "\n",
827
+ " def isNumeric(self):\n",
828
+ " return True\n",
829
+ "\n",
830
+ " def sample(self):\n",
831
+ " \"\"\"\n",
832
+ " samples value\n",
833
+ " \"\"\"\n",
834
+ " return list(np.random.multivariate_normal(self.mean, self.sd))\n",
835
+ "\n",
836
+ "\n",
837
+ "class MultiVarNormalSampler:\n",
838
+ " \"\"\"\n",
839
+ " muti variate normal sampler\t\n",
840
+ " \"\"\"\n",
841
+ " def __init__(self, numVar, *values):\n",
842
+ " \"\"\"\n",
843
+ " initializer\n",
844
+ "\n",
845
+ " Parameters\n",
846
+ " numVar : no of variables\n",
847
+ " values : numVar mean values followed by numVar x numVar values for covar matrix\n",
848
+ " \"\"\"\n",
849
+ " lvalues = list(values)\n",
850
+ " assert len(lvalues) == numVar + numVar * numVar, \"incorrect number of arguments for multi var normal sampler\"\n",
851
+ " mean = lvalues[:numVar]\n",
852
+ " self.mean = np.array(mean)\n",
853
+ " sd = lvalues[numVar:]\n",
854
+ " self.sd = np.array(sd).reshape(numVar,numVar)\n",
855
+ "\n",
856
+ " def isNumeric(self):\n",
857
+ " return True\n",
858
+ "\n",
859
+ " def sample(self):\n",
860
+ " \"\"\"\n",
861
+ " samples value\n",
862
+ " \"\"\"\n",
863
+ " return list(np.random.multivariate_normal(self.mean, self.sd))\n",
864
+ "\n",
865
+ "class CategoricalRejectSampler:\n",
866
+ " \"\"\"\n",
867
+ " non parametric sampling for categorical attributes using given distribution based \n",
868
+ " on rejection sampling\t\n",
869
+ " \"\"\"\n",
870
+ " def __init__(self, *values):\n",
871
+ " \"\"\"\n",
872
+ " initializer\n",
873
+ "\n",
874
+ " Parameters\n",
875
+ " values : list of tuples which contains a categorical value and the corresponsding distr value\n",
876
+ " \"\"\"\n",
877
+ " self.distr = values\n",
878
+ " if (len(self.distr) == 1):\n",
879
+ " self.distr = self.distr[0]\n",
880
+ " maxv = 0\n",
881
+ " for t in self.distr:\n",
882
+ " if t[1] > maxv:\n",
883
+ " maxv = t[1]\n",
884
+ " self.maxv = maxv\n",
885
+ "\n",
886
+ " def sample(self):\n",
887
+ " \"\"\"\n",
888
+ " samples value\n",
889
+ " \"\"\"\n",
890
+ " done = False\n",
891
+ " samp = \"\"\n",
892
+ " while not done:\n",
893
+ " t = self.distr[randint(0, len(self.distr)-1)]\t\n",
894
+ " d = randomFloat(0, self.maxv)\t\n",
895
+ " if (d <= t[1]):\n",
896
+ " done = True\n",
897
+ " samp = t[0]\n",
898
+ " return samp\n",
899
+ "\n",
900
+ "\n",
901
+ "class DistrMixtureSampler:\n",
902
+ " \"\"\"\n",
903
+ " distr mixture sampler\n",
904
+ " \"\"\"\n",
905
+ " def __init__(self, mixtureWtDistr, *compDistr):\n",
906
+ " \"\"\"\n",
907
+ " initializer\n",
908
+ "\n",
909
+ " Parameters\n",
910
+ " mixtureWtDistr : sampler that returns index into sampler list\n",
911
+ " compDistr : sampler list\n",
912
+ " \"\"\"\n",
913
+ " self.mixtureWtDistr = mixtureWtDistr\n",
914
+ " self.compDistr = compDistr\n",
915
+ " if (len(self.compDistr) == 1):\n",
916
+ " self.compDistr = self.compDistr[0]\n",
917
+ "\n",
918
+ " def isNumeric(self):\n",
919
+ " return True\n",
920
+ "\n",
921
+ " def sample(self):\n",
922
+ " \"\"\"\n",
923
+ " samples value\n",
924
+ " \"\"\"\n",
925
+ " comp = self.mixtureWtDistr.sample()\n",
926
+ "\n",
927
+ " #sample sampled comp distr\n",
928
+ " return self.compDistr[comp].sample()\n",
929
+ "\n",
930
+ "class AncestralSampler:\n",
931
+ " \"\"\"\n",
932
+ " ancestral sampler using conditional distribution\n",
933
+ " \"\"\"\n",
934
+ " def __init__(self, parentDistr, childDistr, numChildren):\n",
935
+ " \"\"\"\n",
936
+ " initializer\n",
937
+ "\n",
938
+ " Parameters\n",
939
+ " parentDistr : parent distr\n",
940
+ " childDistr : childdren distribution dictionary\n",
941
+ " numChildren : no of children\n",
942
+ " \"\"\"\n",
943
+ " self.parentDistr = parentDistr\n",
944
+ " self.childDistr = childDistr\n",
945
+ " self.numChildren = numChildren\n",
946
+ "\n",
947
+ " def sample(self):\n",
948
+ " \"\"\"\n",
949
+ " samples value\n",
950
+ " \"\"\"\n",
951
+ " parent = self.parentDistr.sample()\n",
952
+ "\n",
953
+ " #sample all children conditioned on parent\n",
954
+ " children = []\n",
955
+ " for i in range(self.numChildren):\n",
956
+ " key = (parent, i)\n",
957
+ " child = self.childDistr[key].sample()\n",
958
+ " children.append(child)\n",
959
+ " return (parent, children)\n",
960
+ "\n",
961
+ "class ClusterSampler:\n",
962
+ " \"\"\"\n",
963
+ " sample cluster and then sample member of sampled cluster\n",
964
+ " \"\"\"\n",
965
+ " def __init__(self, clusters, *clustDistr):\n",
966
+ " \"\"\"\n",
967
+ " initializer\n",
968
+ "\n",
969
+ " Parameters\n",
970
+ " clusters : dictionary clusters\n",
971
+ " clustDistr : distr for clusters\n",
972
+ " \"\"\"\n",
973
+ " self.sampler = CategoricalRejectSampler(*clustDistr)\n",
974
+ " self.clusters = clusters\n",
975
+ "\n",
976
+ " def sample(self):\n",
977
+ " \"\"\"\n",
978
+ " samples value\n",
979
+ " \"\"\"\n",
980
+ " cluster = self.sampler.sample()\n",
981
+ " member = random.choice(self.clusters[cluster])\n",
982
+ " return (cluster, member)\n",
983
+ "\n",
984
+ "\n",
985
+ "class MetropolitanSampler:\n",
986
+ " \"\"\"\n",
987
+ " metropolitan sampler\t\n",
988
+ " \"\"\"\n",
989
+ " def __init__(self, propStdDev, min, binWidth, values):\n",
990
+ " \"\"\"\n",
991
+ " initializer\n",
992
+ "\n",
993
+ " Parameters\n",
994
+ " propStdDev : proposal distr std dev\n",
995
+ " min : min domain value for target distr\n",
996
+ " binWidth : bin width\n",
997
+ " values : target distr values\n",
998
+ " \"\"\"\n",
999
+ " self.targetDistr = Histogram.createInitialized(min, binWidth, values)\n",
1000
+ " self.propsalDistr = GaussianRejectSampler(0, propStdDev)\n",
1001
+ " self.proposalMixture = False\n",
1002
+ "\n",
1003
+ " # bootstrap sample\n",
1004
+ " (minv, maxv) = self.targetDistr.getMinMax()\n",
1005
+ " self.curSample = random.randint(minv, maxv)\n",
1006
+ " self.curDistr = self.targetDistr.value(self.curSample)\n",
1007
+ " self.transCount = 0\n",
1008
+ "\n",
1009
+ " def initialize(self):\n",
1010
+ " \"\"\"\n",
1011
+ " initialize\n",
1012
+ " \"\"\"\n",
1013
+ " (minv, maxv) = self.targetDistr.getMinMax()\n",
1014
+ " self.curSample = random.randint(minv, maxv)\n",
1015
+ " self.curDistr = self.targetDistr.value(self.curSample)\n",
1016
+ " self.transCount = 0\n",
1017
+ "\n",
1018
+ " def setProposalDistr(self, propsalDistr):\n",
1019
+ " \"\"\"\n",
1020
+ " set custom proposal distribution\n",
1021
+ " Parameters\n",
1022
+ " propsalDistr : proposal distribution\n",
1023
+ " \"\"\"\n",
1024
+ " self.propsalDistr = propsalDistr\n",
1025
+ "\n",
1026
+ "\n",
1027
+ " def setGlobalProposalDistr(self, globPropStdDev, proposalChoiceThreshold):\n",
1028
+ " \"\"\"\n",
1029
+ " set custom proposal distribution\n",
1030
+ " Parameters\n",
1031
+ " globPropStdDev : global proposal distr std deviation\n",
1032
+ " proposalChoiceThreshold : threshold for using global proposal distribution\n",
1033
+ " \"\"\"\n",
1034
+ " self.globalProposalDistr = GaussianRejectSampler(0, globPropStdDev)\n",
1035
+ " self.proposalChoiceThreshold = proposalChoiceThreshold\n",
1036
+ " self.proposalMixture = True\n",
1037
+ "\n",
1038
+ " def sample(self):\n",
1039
+ " \"\"\"\n",
1040
+ " samples value\n",
1041
+ " \"\"\"\n",
1042
+ " nextSample = self.proposalSample(1)\n",
1043
+ " self.targetSample(nextSample)\n",
1044
+ " return self.curSample;\n",
1045
+ "\n",
1046
+ " def proposalSample(self, skip):\n",
1047
+ " \"\"\"\n",
1048
+ " sample from proposal distribution\n",
1049
+ " Parameters\n",
1050
+ " skip : no of samples to skip\n",
1051
+ " \"\"\"\n",
1052
+ " for i in range(skip):\n",
1053
+ " if not self.proposalMixture:\n",
1054
+ " #one proposal distr\n",
1055
+ " nextSample = self.curSample + self.propsalDistr.sample()\n",
1056
+ " nextSample = self.targetDistr.boundedValue(nextSample)\n",
1057
+ " else:\n",
1058
+ " #mixture of proposal distr\n",
1059
+ " if random.random() < self.proposalChoiceThreshold:\n",
1060
+ " nextSample = self.curSample + self.propsalDistr.sample()\n",
1061
+ " else:\n",
1062
+ " nextSample = self.curSample + self.globalProposalDistr.sample()\n",
1063
+ " nextSample = self.targetDistr.boundedValue(nextSample)\n",
1064
+ "\n",
1065
+ " return nextSample\n",
1066
+ "\n",
1067
+ " def targetSample(self, nextSample):\n",
1068
+ " \"\"\"\n",
1069
+ " target sample\n",
1070
+ " Parameters\n",
1071
+ " nextSample : proposal distr sample\n",
1072
+ " \"\"\"\n",
1073
+ " nextDistr = self.targetDistr.value(nextSample)\n",
1074
+ "\n",
1075
+ " transition = False\n",
1076
+ " if nextDistr > self.curDistr:\n",
1077
+ " transition = True\n",
1078
+ " else:\n",
1079
+ " distrRatio = float(nextDistr) / self.curDistr\n",
1080
+ " if random.random() < distrRatio:\n",
1081
+ " transition = True\n",
1082
+ "\n",
1083
+ " if transition:\n",
1084
+ " self.curSample = nextSample\n",
1085
+ " self.curDistr = nextDistr\n",
1086
+ " self.transCount += 1\n",
1087
+ "\n",
1088
+ "\n",
1089
+ " def subSample(self, skip):\n",
1090
+ " \"\"\"\n",
1091
+ " sub sample\n",
1092
+ " Parameters\n",
1093
+ " skip : no of samples to skip\n",
1094
+ " \"\"\"\n",
1095
+ " nextSample = self.proposalSample(skip)\n",
1096
+ " self.targetSample(nextSample)\n",
1097
+ " return self.curSample;\n",
1098
+ "\n",
1099
+ " def setMixtureProposal(self, globPropStdDev, mixtureThreshold):\n",
1100
+ " \"\"\"\n",
1101
+ " mixture proposal\n",
1102
+ " Parameters\n",
1103
+ " globPropStdDev : global proposal distr std deviation\n",
1104
+ " mixtureThreshold : threshold for using global proposal distribution\n",
1105
+ " \"\"\"\n",
1106
+ " self.globalProposalDistr = GaussianRejectSampler(0, globPropStdDev)\n",
1107
+ " self.mixtureThreshold = mixtureThreshold\n",
1108
+ "\n",
1109
+ " def samplePropsal(self):\n",
1110
+ " \"\"\"\n",
1111
+ " sample from proposal distr\n",
1112
+ " \"\"\"\n",
1113
+ " if self.globalPropsalDistr is None:\n",
1114
+ " proposal = self.propsalDistr.sample()\n",
1115
+ " else:\n",
1116
+ " if random.random() < self.mixtureThreshold:\n",
1117
+ " proposal = self.propsalDistr.sample()\n",
1118
+ " else:\n",
1119
+ " proposal = self.globalProposalDistr.sample()\n",
1120
+ "\n",
1121
+ " return proposal\n",
1122
+ "\n",
1123
+ "class PermutationSampler:\n",
1124
+ " \"\"\"\n",
1125
+ " permutation sampler by shuffling a list\n",
1126
+ " \"\"\"\n",
1127
+ " def __init__(self):\n",
1128
+ " \"\"\"\n",
1129
+ " initialize\n",
1130
+ " \"\"\"\n",
1131
+ " self.values = None\n",
1132
+ " self.numShuffles = None\n",
1133
+ "\n",
1134
+ " @staticmethod\n",
1135
+ " def createSamplerWithValues(values, *numShuffles):\n",
1136
+ " \"\"\"\n",
1137
+ " creator with values\n",
1138
+ " Parameters\n",
1139
+ " values : list data\n",
1140
+ " numShuffles : no of shuffles or range of no of shuffles\n",
1141
+ " \"\"\"\n",
1142
+ " sampler = PermutationSampler()\n",
1143
+ " sampler.values = values\n",
1144
+ " sampler.numShuffles = numShuffles\n",
1145
+ " return sampler\n",
1146
+ "\n",
1147
+ " @staticmethod\n",
1148
+ " def createSamplerWithRange(minv, maxv, *numShuffles):\n",
1149
+ " \"\"\"\n",
1150
+ " creator with ramge min and max\n",
1151
+ "\n",
1152
+ " Parameters\n",
1153
+ " minv : min of range\n",
1154
+ " maxv : max of range\n",
1155
+ " numShuffles : no of shuffles or range of no of shuffles\n",
1156
+ " \"\"\"\n",
1157
+ " sampler = PermutationSampler()\n",
1158
+ " sampler.values = list(range(minv, maxv + 1))\n",
1159
+ " sampler.numShuffles = numShuffles\n",
1160
+ " return sampler\n",
1161
+ "\n",
1162
+ " def sample(self):\n",
1163
+ " \"\"\"\n",
1164
+ " sample new permutation\n",
1165
+ " \"\"\"\n",
1166
+ " cloned = self.values.copy()\n",
1167
+ " shuffle(cloned, *self.numShuffles)\n",
1168
+ " return cloned\n",
1169
+ "\n",
1170
+ "class SpikeyDataSampler:\n",
1171
+ " \"\"\"\n",
1172
+ " samples spikey data\n",
1173
+ " \"\"\"\n",
1174
+ " def __init__(self, intvMean, intvScale, distr, spikeValueMean, spikeValueStd, spikeMaxDuration, baseValue = 0):\n",
1175
+ " \"\"\"\n",
1176
+ " initializer\n",
1177
+ "\n",
1178
+ " Parameters\n",
1179
+ " intvMean : interval mean\n",
1180
+ " intvScale : interval std dev\n",
1181
+ " distr : type of distr for interval\n",
1182
+ " spikeValueMean : spike value mean\n",
1183
+ " spikeValueStd : spike value std dev\n",
1184
+ " spikeMaxDuration : max duration for spike\n",
1185
+ " baseValue : base or offset value\n",
1186
+ " \"\"\"\n",
1187
+ " if distr == \"norm\":\n",
1188
+ " self.intvSampler = NormalSampler(intvMean, intvScale)\n",
1189
+ " elif distr == \"expo\":\n",
1190
+ " rate = 1.0 / intvScale\n",
1191
+ " self.intvSampler = ExponentialSampler(rate)\n",
1192
+ " else:\n",
1193
+ " raise ValueError(\"invalid distribution\")\n",
1194
+ "\n",
1195
+ " self.spikeSampler = NormalSampler(spikeValueMean, spikeValueStd)\n",
1196
+ " self.spikeMaxDuration = spikeMaxDuration\n",
1197
+ " self.baseValue = baseValue\n",
1198
+ " self.inSpike = False\n",
1199
+ " self.spikeCount = 0\n",
1200
+ " self.baseCount = 0\n",
1201
+ " self.baseLength = int(self.intvSampler.sample())\n",
1202
+ " self.spikeValues = list()\n",
1203
+ " self.spikeLength = None\n",
1204
+ "\n",
1205
+ " def sample(self):\n",
1206
+ " \"\"\"\n",
1207
+ " sample new value\n",
1208
+ " \"\"\"\n",
1209
+ " if self.baseCount <= self.baseLength:\n",
1210
+ " sampled = self.baseValue\n",
1211
+ " self.baseCount += 1\n",
1212
+ " else:\n",
1213
+ " if not self.inSpike:\n",
1214
+ " #starting spike\n",
1215
+ " spikeVal = self.spikeSampler.sample()\n",
1216
+ " self.spikeLength = sampleUniform(1, self.spikeMaxDuration)\n",
1217
+ " spikeMaxPos = 0 if self.spikeLength == 1 else sampleUniform(0, self.spikeLength-1)\n",
1218
+ " self.spikeValues.clear()\n",
1219
+ " for i in range(self.spikeLength):\n",
1220
+ " if i < spikeMaxPos:\n",
1221
+ " frac = (i + 1) / (spikeMaxPos + 1)\n",
1222
+ " frac = sampleFloatFromBase(frac, 0.1 * frac)\n",
1223
+ " elif i > spikeMaxPos:\n",
1224
+ " frac = (self.spikeLength - i) / (self.spikeLength - spikeMaxPos)\n",
1225
+ " frac = sampleFloatFromBase(frac, 0.1 * frac)\n",
1226
+ " else:\n",
1227
+ " frac = 1.0\n",
1228
+ " self.spikeValues.append(frac * spikeVal)\n",
1229
+ " self.inSpike = True\n",
1230
+ " self.spikeCount = 0\n",
1231
+ "\n",
1232
+ "\n",
1233
+ " sampled = self.spikeValues[self.spikeCount]\n",
1234
+ " self.spikeCount += 1\n",
1235
+ "\n",
1236
+ " if self.spikeCount == self.spikeLength:\n",
1237
+ " #ending spike\n",
1238
+ " self.baseCount = 0\n",
1239
+ " self.baseLength = int(self.intvSampler.sample())\n",
1240
+ " self.inSpike = False\n",
1241
+ "\n",
1242
+ " return sampled\n",
1243
+ "\n",
1244
+ "\n",
1245
+ "class EventSampler:\n",
1246
+ " \"\"\"\n",
1247
+ " sample event\n",
1248
+ " \"\"\"\n",
1249
+ " def __init__(self, intvSampler, valSampler=None):\n",
1250
+ " \"\"\"\n",
1251
+ " initializer\n",
1252
+ "\n",
1253
+ " Parameters\n",
1254
+ " intvSampler : interval sampler\n",
1255
+ " valSampler : value sampler\n",
1256
+ " \"\"\"\n",
1257
+ " self.intvSampler = intvSampler\n",
1258
+ " self.valSampler = valSampler\n",
1259
+ " self.trigger = int(self.intvSampler.sample())\n",
1260
+ " self.count = 0\n",
1261
+ "\n",
1262
+ " def reset(self):\n",
1263
+ " \"\"\"\n",
1264
+ " reset trigger\n",
1265
+ " \"\"\"\n",
1266
+ " self.trigger = int(self.intvSampler.sample())\n",
1267
+ " self.count = 0\n",
1268
+ "\n",
1269
+ " def sample(self):\n",
1270
+ " \"\"\"\n",
1271
+ " sample event\n",
1272
+ " \"\"\"\n",
1273
+ " if self.count == self.trigger:\n",
1274
+ " sampled = self.valSampler.sample() if self.valSampler is not None else 1.0\n",
1275
+ " self.trigger = int(self.intvSampler.sample())\n",
1276
+ " self.count = 0\n",
1277
+ " else:\n",
1278
+ " sample = 0.0\n",
1279
+ " self.count += 1\n",
1280
+ " return sampled\n",
1281
+ "\n",
1282
+ "\n",
1283
+ "\n",
1284
+ "\n",
1285
+ "def createSampler(data):\n",
1286
+ " \"\"\"\n",
1287
+ " create sampler\n",
1288
+ "\n",
1289
+ " Parameters\n",
1290
+ " data : sampler description\n",
1291
+ " \"\"\"\n",
1292
+ " #print(data)\n",
1293
+ " items = data.split(\":\")\n",
1294
+ " size = len(items)\n",
1295
+ " dtype = items[-1]\n",
1296
+ " stype = items[-2]\n",
1297
+ " sampler = None\n",
1298
+ " if stype == \"uniform\":\n",
1299
+ " if dtype == \"int\":\n",
1300
+ " min = int(items[0])\n",
1301
+ " max = int(items[1])\n",
1302
+ " sampler = UniformNumericSampler(min, max)\n",
1303
+ " elif dtype == \"float\":\n",
1304
+ " min = float(items[0])\n",
1305
+ " max = float(items[1])\n",
1306
+ " sampler = UniformNumericSampler(min, max)\n",
1307
+ " elif dtype == \"categorical\":\n",
1308
+ " values = items[:-2]\n",
1309
+ " sampler = UniformCategoricalSampler(values)\n",
1310
+ " elif stype == \"normal\":\n",
1311
+ " mean = float(items[0])\n",
1312
+ " sd = float(items[1])\n",
1313
+ " sampler = NormalSampler(mean, sd)\n",
1314
+ " if dtype == \"int\":\n",
1315
+ " sampler.sampleAsIntValue()\n",
1316
+ " elif stype == \"nonparam\":\n",
1317
+ " if dtype == \"int\" or dtype == \"float\":\n",
1318
+ " min = int(items[0])\n",
1319
+ " binWidth = int(items[1])\n",
1320
+ " values = items[2:-2]\n",
1321
+ " values = list(map(lambda v: int(v), values))\n",
1322
+ " sampler = NonParamRejectSampler(min, binWidth, values)\n",
1323
+ " if dtype == \"float\":\n",
1324
+ " sampler.sampleAsFloat()\n",
1325
+ " elif dtype == \"categorical\":\n",
1326
+ " values = list()\n",
1327
+ " for i in range(0, size-2, 2):\n",
1328
+ " cval = items[i]\n",
1329
+ " dist = int(items[i+1])\n",
1330
+ " pair = (cval, dist)\n",
1331
+ " values.append(pair)\n",
1332
+ " sampler = CategoricalRejectSampler(values)\n",
1333
+ " elif stype == \"discrete\":\n",
1334
+ " vmin = int(items[0])\n",
1335
+ " vmax = int(items[1])\n",
1336
+ " step = int(items[2])\n",
1337
+ " values = list(map(lambda i : int(items[i]), range(3, len(items)-2)))\n",
1338
+ " sampler = DiscreteRejectSampler(vmin, vmax, step, values)\n",
1339
+ " else:\n",
1340
+ " raise ValueError(\"invalid sampler type \" + dtype)\n",
1341
+ " return sampler\n"
1342
+ ]
1343
+ }
1344
+ ],
1345
+ "metadata": {
1346
+ "kernelspec": {
1347
+ "display_name": "Python 3 (ipykernel)",
1348
+ "language": "python",
1349
+ "name": "python3"
1350
+ },
1351
+ "language_info": {
1352
+ "codemirror_mode": {
1353
+ "name": "ipython",
1354
+ "version": 3
1355
+ },
1356
+ "file_extension": ".py",
1357
+ "mimetype": "text/x-python",
1358
+ "name": "python",
1359
+ "nbconvert_exporter": "python",
1360
+ "pygments_lexer": "ipython3",
1361
+ "version": "3.9.12"
1362
+ }
1363
+ },
1364
+ "nbformat": 4,
1365
+ "nbformat_minor": 5
1366
+ }
lib/stats.ipynb ADDED
@@ -0,0 +1,510 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "f4cbab42",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import sys\n",
11
+ "import random \n",
12
+ "import time\n",
13
+ "import math\n",
14
+ "import numpy as np\n",
15
+ "import statistics \n",
16
+ "from util import *\n",
17
+ "\n",
18
+ "\"\"\"\n",
19
+ "histogram class\n",
20
+ "\"\"\"\n",
21
+ "class Histogram:\n",
22
+ " def __init__(self, min, binWidth):\n",
23
+ " \"\"\"\n",
24
+ " initializer\n",
25
+ "\n",
26
+ " Parameters\n",
27
+ " min : min x\n",
28
+ " binWidth : bin width\n",
29
+ " \"\"\"\n",
30
+ " self.xmin = min\n",
31
+ " self.binWidth = binWidth\n",
32
+ " self.normalized = False\n",
33
+ "\n",
34
+ " @classmethod\n",
35
+ " def createInitialized(cls, xmin, binWidth, values):\n",
36
+ " \"\"\"\n",
37
+ " create histogram instance with min domain, bin width and values\n",
38
+ "\n",
39
+ " Parameters\n",
40
+ " min : min x\n",
41
+ " binWidth : bin width\n",
42
+ " values : y values\n",
43
+ " \"\"\"\n",
44
+ " instance = cls(xmin, binWidth)\n",
45
+ " instance.xmax = xmin + binWidth * (len(values) - 1)\n",
46
+ " instance.ymin = 0\n",
47
+ " instance.bins = np.array(values)\n",
48
+ " instance.fmax = 0\n",
49
+ " for v in values:\n",
50
+ " if (v > instance.fmax):\n",
51
+ " instance.fmax = v\n",
52
+ " instance.ymin = 0.0\n",
53
+ " instance.ymax = instance.fmax\n",
54
+ " return instance\n",
55
+ "\n",
56
+ " @classmethod\n",
57
+ " def createWithNumBins(cls, values, numBins=20):\n",
58
+ " \"\"\"\n",
59
+ " create histogram instance values and no of bins\n",
60
+ "\n",
61
+ " Parameters\n",
62
+ " values : y values\n",
63
+ " numBins : no of bins\n",
64
+ " \"\"\"\n",
65
+ " xmin = min(values)\n",
66
+ " xmax = max(values)\n",
67
+ " binWidth = (xmax + .01 - (xmin - .01)) / numBins\n",
68
+ " instance = cls(xmin, binWidth)\n",
69
+ " instance.xmax = xmax\n",
70
+ " instance.numBin = numBins\n",
71
+ " instance.bins = np.zeros(instance.numBin)\n",
72
+ " for v in values:\n",
73
+ " instance.add(v)\n",
74
+ " return instance\n",
75
+ "\n",
76
+ " @classmethod\n",
77
+ " def createUninitialized(cls, xmin, xmax, binWidth):\n",
78
+ " \"\"\"\n",
79
+ " create histogram instance with no y values using domain min , max and bin width\n",
80
+ "\n",
81
+ " Parameters\n",
82
+ " min : min x\n",
83
+ " max : max x\n",
84
+ " binWidth : bin width\n",
85
+ " \"\"\"\n",
86
+ " instance = cls(xmin, binWidth)\n",
87
+ " instance.xmax = xmax\n",
88
+ " instance.numBin = (xmax - xmin) / binWidth + 1\n",
89
+ " instance.bins = np.zeros(instance.numBin)\n",
90
+ " return instance\n",
91
+ "\n",
92
+ " def initialize(self):\n",
93
+ " \"\"\"\n",
94
+ " set y values to 0\n",
95
+ " \"\"\"\n",
96
+ " self.bins = np.zeros(self.numBin)\n",
97
+ "\n",
98
+ " def add(self, value):\n",
99
+ " \"\"\"\n",
100
+ " adds a value to a bin\n",
101
+ "\n",
102
+ " Parameters\n",
103
+ " value : value\n",
104
+ " \"\"\"\n",
105
+ " bin = int((value - self.xmin) / self.binWidth)\n",
106
+ " if (bin < 0 or bin > self.numBin - 1):\n",
107
+ " print (bin)\n",
108
+ " raise ValueError(\"outside histogram range\")\n",
109
+ " self.bins[bin] += 1.0\n",
110
+ "\n",
111
+ " def normalize(self):\n",
112
+ " \"\"\"\n",
113
+ " normalize bin counts\n",
114
+ " \"\"\"\n",
115
+ " if not self.normalized:\n",
116
+ " total = self.bins.sum()\n",
117
+ " self.bins = np.divide(self.bins, total)\n",
118
+ " self.normalized = True\n",
119
+ "\n",
120
+ " def cumDistr(self):\n",
121
+ " \"\"\"\n",
122
+ " cumulative dists\n",
123
+ " \"\"\"\n",
124
+ " self.normalize()\n",
125
+ " self.cbins = np.cumsum(self.bins)\n",
126
+ " return self.cbins\n",
127
+ "\n",
128
+ " def distr(self):\n",
129
+ " \"\"\"\n",
130
+ " distr\n",
131
+ " \"\"\"\n",
132
+ " self.normalize()\n",
133
+ " return self.bins\n",
134
+ "\n",
135
+ "\n",
136
+ " def percentile(self, percent):\n",
137
+ " \"\"\"\n",
138
+ " return value corresponding to a percentile\n",
139
+ "\n",
140
+ " Parameters\n",
141
+ " percent : percentile value\n",
142
+ " \"\"\"\n",
143
+ " if self.cbins is None:\n",
144
+ " raise ValueError(\"cumulative distribution is not available\")\n",
145
+ "\n",
146
+ " for i,cuml in enumerate(self.cbins):\n",
147
+ " if percent > cuml:\n",
148
+ " value = (i * self.binWidth) - (self.binWidth / 2) + \\\n",
149
+ " (percent - self.cbins[i-1]) * self.binWidth / (self.cbins[i] - self.cbins[i-1]) \n",
150
+ " break\n",
151
+ " return value\n",
152
+ "\n",
153
+ " def max(self):\n",
154
+ " \"\"\"\n",
155
+ " return max bin value \n",
156
+ " \"\"\"\n",
157
+ " return self.bins.max()\n",
158
+ "\n",
159
+ " def value(self, x):\n",
160
+ " \"\"\"\n",
161
+ " return a bin value\t\n",
162
+ "\n",
163
+ " Parameters\n",
164
+ " x : x value\n",
165
+ " \"\"\"\n",
166
+ " bin = int((x - self.xmin) / self.binWidth)\n",
167
+ " f = self.bins[bin]\n",
168
+ " return f\n",
169
+ "\n",
170
+ " def bin(self, x):\n",
171
+ " \"\"\"\n",
172
+ " return a bin index\t\n",
173
+ "\n",
174
+ " Parameters\n",
175
+ " x : x value\n",
176
+ " \"\"\"\n",
177
+ " return int((x - self.xmin) / self.binWidth)\n",
178
+ "\n",
179
+ " def cumValue(self, x):\n",
180
+ " \"\"\"\n",
181
+ " return a cumulative bin value\t\n",
182
+ "\n",
183
+ " Parameters\n",
184
+ " x : x value\n",
185
+ " \"\"\"\n",
186
+ " bin = int((x - self.xmin) / self.binWidth)\n",
187
+ " c = self.cbins[bin]\n",
188
+ " return c\n",
189
+ "\n",
190
+ "\n",
191
+ " def getMinMax(self):\n",
192
+ " \"\"\"\n",
193
+ " returns x min and x max\n",
194
+ " \"\"\"\n",
195
+ " return (self.xmin, self.xmax)\n",
196
+ "\n",
197
+ " def boundedValue(self, x):\n",
198
+ " \"\"\"\n",
199
+ " return x bounde by min and max\t\n",
200
+ "\n",
201
+ " Parameters\n",
202
+ " x : x value\n",
203
+ " \"\"\"\n",
204
+ " if x < self.xmin:\n",
205
+ " x = self.xmin\n",
206
+ " elif x > self.xmax:\n",
207
+ " x = self.xmax\n",
208
+ " return x\n",
209
+ "\n",
210
+ "\"\"\"\n",
211
+ "categorical histogram class\n",
212
+ "\"\"\"\n",
213
+ "class CatHistogram:\n",
214
+ " def __init__(self):\n",
215
+ " \"\"\"\n",
216
+ " initializer\n",
217
+ " \"\"\"\n",
218
+ " self.binCounts = dict()\n",
219
+ " self.counts = 0\n",
220
+ " self.normalized = False\n",
221
+ "\n",
222
+ " def add(self, value):\n",
223
+ " \"\"\"\n",
224
+ " adds a value to a bin\n",
225
+ "\n",
226
+ " Parameters\n",
227
+ " x : x value\n",
228
+ " \"\"\"\n",
229
+ " addToKeyedCounter(self.binCounts, value)\n",
230
+ " self.counts += 1\t\n",
231
+ "\n",
232
+ " def normalize(self):\n",
233
+ " \"\"\"\n",
234
+ " normalize\n",
235
+ " \"\"\"\n",
236
+ " if not self.normalized:\n",
237
+ " self.binCounts = dict(map(lambda r : (r[0],r[1] / self.counts), self.binCounts.items()))\n",
238
+ " self.normalized = True\n",
239
+ "\n",
240
+ " def getMode(self):\n",
241
+ " \"\"\"\n",
242
+ " get mode\n",
243
+ " \"\"\"\n",
244
+ " maxk = None\n",
245
+ " maxv = 0\n",
246
+ " #print(self.binCounts)\n",
247
+ " for k,v in self.binCounts.items():\n",
248
+ " if v > maxv:\n",
249
+ " maxk = k\n",
250
+ " maxv = v\n",
251
+ " return (maxk, maxv)\t\n",
252
+ "\n",
253
+ " def getEntropy(self):\n",
254
+ " \"\"\"\n",
255
+ " get entropy\n",
256
+ " \"\"\"\n",
257
+ " self.normalize()\n",
258
+ " entr = 0 \n",
259
+ " #print(self.binCounts)\n",
260
+ " for k,v in self.binCounts.items():\n",
261
+ " entr -= v * math.log(v)\n",
262
+ " return entr\n",
263
+ "\n",
264
+ " def getUniqueValues(self):\n",
265
+ " \"\"\"\n",
266
+ " get unique values\n",
267
+ " \"\"\"\t\t\n",
268
+ " return list(self.binCounts.keys())\n",
269
+ "\n",
270
+ " def getDistr(self):\n",
271
+ " \"\"\"\n",
272
+ " get distribution\n",
273
+ " \"\"\"\t\n",
274
+ " self.normalize()\t\n",
275
+ " return self.binCounts.copy()\n",
276
+ "\n",
277
+ "class RunningStat:\n",
278
+ " \"\"\"\n",
279
+ " running stat class\n",
280
+ " \"\"\"\n",
281
+ " def __init__(self):\n",
282
+ " \"\"\"\n",
283
+ " initializer\t\n",
284
+ " \"\"\"\n",
285
+ " self.sum = 0.0\n",
286
+ " self.sumSq = 0.0\n",
287
+ " self.count = 0\n",
288
+ "\n",
289
+ " @staticmethod\n",
290
+ " def create(count, sum, sumSq):\n",
291
+ " \"\"\"\n",
292
+ " creates iinstance\t\n",
293
+ "\n",
294
+ " Parameters\n",
295
+ " sum : sum of values\n",
296
+ " sumSq : sum of valure squared\n",
297
+ " \"\"\"\n",
298
+ " rs = RunningStat()\n",
299
+ " rs.sum = sum\n",
300
+ " rs.sumSq = sumSq\n",
301
+ " rs.count = count\n",
302
+ " return rs\n",
303
+ "\n",
304
+ " def add(self, value):\n",
305
+ " \"\"\"\n",
306
+ " adds new value\n",
307
+ " Parameters\n",
308
+ " value : value to add\n",
309
+ " \"\"\"\n",
310
+ " self.sum += value\n",
311
+ " self.sumSq += (value * value)\n",
312
+ " self.count += 1\n",
313
+ "\n",
314
+ " def getStat(self):\n",
315
+ " \"\"\"\n",
316
+ " return mean and std deviation \n",
317
+ " \"\"\"\n",
318
+ " mean = self.sum /self. count\n",
319
+ " t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)\n",
320
+ " sd = math.sqrt(t)\n",
321
+ " re = (mean, sd)\n",
322
+ " return re\n",
323
+ "\n",
324
+ " def addGetStat(self,value):\n",
325
+ " \"\"\"\n",
326
+ " calculate mean and std deviation with new value added\n",
327
+ " Parameters\n",
328
+ " value : value to add\n",
329
+ " \"\"\"\n",
330
+ " self.add(value)\n",
331
+ " re = self.getStat()\n",
332
+ " return re\n",
333
+ "\n",
334
+ " def getCount(self):\n",
335
+ " \"\"\"\n",
336
+ " return count\n",
337
+ " \"\"\"\n",
338
+ " return self.count\n",
339
+ "\n",
340
+ " def getState(self):\n",
341
+ " \"\"\"\n",
342
+ " return state\n",
343
+ " \"\"\"\n",
344
+ " s = (self.count, self.sum, self.sumSq)\n",
345
+ " return s\n",
346
+ "\n",
347
+ "class SlidingWindowStat:\n",
348
+ " \"\"\"\n",
349
+ " sliding window stats\n",
350
+ " \"\"\"\n",
351
+ " def __init__(self):\n",
352
+ " \"\"\"\n",
353
+ " initializer\n",
354
+ " \"\"\"\n",
355
+ " self.sum = 0.0\n",
356
+ " self.sumSq = 0.0\n",
357
+ " self.count = 0\n",
358
+ " self.values = None\n",
359
+ "\n",
360
+ " @staticmethod\n",
361
+ " def create(values, sum, sumSq):\n",
362
+ " \"\"\"\n",
363
+ " creates iinstance\t\n",
364
+ "\n",
365
+ " Parameters\n",
366
+ " sum : sum of values\n",
367
+ " sumSq : sum of valure squared\n",
368
+ " \"\"\"\n",
369
+ " sws = SlidingWindowStat()\n",
370
+ " sws.sum = sum\n",
371
+ " sws.sumSq = sumSq\n",
372
+ " self.values = values.copy()\n",
373
+ " sws.count = len(self.values)\n",
374
+ " return sws\n",
375
+ "\n",
376
+ " @staticmethod\n",
377
+ " def initialize(values):\n",
378
+ " \"\"\"\n",
379
+ " creates iinstance\t\n",
380
+ "\n",
381
+ " Parameters\n",
382
+ " values : list of values\n",
383
+ " \"\"\"\n",
384
+ " sws = SlidingWindowStat()\n",
385
+ " sws.values = values.copy()\n",
386
+ " for v in sws.values:\n",
387
+ " sws.sum += v\n",
388
+ " sws.sumSq += v * v\t\t\n",
389
+ " sws.count = len(sws.values)\n",
390
+ " return sws\n",
391
+ "\n",
392
+ " @staticmethod\n",
393
+ " def createEmpty(count):\n",
394
+ " \"\"\"\n",
395
+ " creates iinstance\t\n",
396
+ "\n",
397
+ " Parameters\n",
398
+ " count : count of values\n",
399
+ " \"\"\"\n",
400
+ " sws = SlidingWindowStat()\n",
401
+ " sws.count = count\n",
402
+ " sws.values = list()\n",
403
+ " return sws\n",
404
+ "\n",
405
+ " def add(self, value):\n",
406
+ " \"\"\"\n",
407
+ " adds new value\n",
408
+ "\n",
409
+ " Parameters\n",
410
+ " value : value to add\n",
411
+ " \"\"\"\n",
412
+ " self.values.append(value)\t\t\n",
413
+ " if len(self.values) > self.count:\n",
414
+ " self.sum += value - self.values[0]\n",
415
+ " self.sumSq += (value * value) - (self.values[0] * self.values[0])\n",
416
+ " self.values.pop(0)\n",
417
+ " else:\n",
418
+ " self.sum += value\n",
419
+ " self.sumSq += (value * value)\n",
420
+ "\n",
421
+ "\n",
422
+ " def getStat(self):\n",
423
+ " \"\"\"\n",
424
+ " calculate mean and std deviation \n",
425
+ " \"\"\"\n",
426
+ " mean = self.sum /self. count\n",
427
+ " t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)\n",
428
+ " sd = math.sqrt(t)\n",
429
+ " re = (mean, sd)\n",
430
+ " return re\n",
431
+ "\n",
432
+ " def addGetStat(self,value):\n",
433
+ " \"\"\"\n",
434
+ " calculate mean and std deviation with new value added\n",
435
+ " \"\"\"\n",
436
+ " self.add(value)\n",
437
+ " re = self.getStat()\n",
438
+ " return re\n",
439
+ "\n",
440
+ " def getCount(self):\n",
441
+ " \"\"\"\n",
442
+ " return count\n",
443
+ " \"\"\"\n",
444
+ " return self.count\n",
445
+ "\n",
446
+ " def getCurSize(self):\n",
447
+ " \"\"\"\n",
448
+ " return count\n",
449
+ " \"\"\"\n",
450
+ " return len(self.values)\n",
451
+ "\n",
452
+ " def getState(self):\n",
453
+ " \"\"\"\n",
454
+ " return state\n",
455
+ " \"\"\"\n",
456
+ " s = (self.count, self.sum, self.sumSq)\n",
457
+ " return s\n",
458
+ "\n",
459
+ "\n",
460
+ "def basicStat(ldata):\n",
461
+ " \"\"\"\n",
462
+ " mean and std dev\n",
463
+ " Parameters\n",
464
+ " ldata : list of values\n",
465
+ " \"\"\"\n",
466
+ " m = statistics.mean(ldata)\n",
467
+ " s = statistics.stdev(ldata, xbar=m)\n",
468
+ " r = (m, s)\n",
469
+ " return r\n",
470
+ "\n",
471
+ "def getFileColumnStat(filePath, col, delem=\",\"):\n",
472
+ " \"\"\"\n",
473
+ " gets stats for a file column\n",
474
+ "\n",
475
+ " Parameters\n",
476
+ " filePath : file path\n",
477
+ " col : col index\n",
478
+ " delem : field delemter\n",
479
+ " \"\"\"\n",
480
+ " rs = RunningStat()\n",
481
+ " for rec in fileRecGen(filePath, delem):\n",
482
+ " va = float(rec[col])\n",
483
+ " rs.add(va)\n",
484
+ "\n",
485
+ " return rs.getStat()\n"
486
+ ]
487
+ }
488
+ ],
489
+ "metadata": {
490
+ "kernelspec": {
491
+ "display_name": "Python 3 (ipykernel)",
492
+ "language": "python",
493
+ "name": "python3"
494
+ },
495
+ "language_info": {
496
+ "codemirror_mode": {
497
+ "name": "ipython",
498
+ "version": 3
499
+ },
500
+ "file_extension": ".py",
501
+ "mimetype": "text/x-python",
502
+ "name": "python",
503
+ "nbconvert_exporter": "python",
504
+ "pygments_lexer": "ipython3",
505
+ "version": "3.9.12"
506
+ }
507
+ },
508
+ "nbformat": 4,
509
+ "nbformat_minor": 5
510
+ }
lib/tnn.ipynb ADDED
@@ -0,0 +1,800 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "3853095d",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import sys\n",
12
+ "import matplotlib.pyplot as plt\n",
13
+ "import numpy as np\n",
14
+ "import torch\n",
15
+ "from torch.autograd import Variable\n",
16
+ "from torch.utils.data import Dataset, TensorDataset\n",
17
+ "from torch.utils.data import DataLoader\n",
18
+ "import sklearn as sk\n",
19
+ "from sklearn.neighbors import KDTree\n",
20
+ "import matplotlib\n",
21
+ "import random\n",
22
+ "import jprops\n",
23
+ "from random import randint\n",
24
+ "import statistics\n",
25
+ "sys.path.append(os.path.abspath(\"../lib\"))\n",
26
+ "from util import *\n",
27
+ "from mlutil import *\n",
28
+ "\n",
29
+ "\"\"\"\n",
30
+ "forward hook function\n",
31
+ "\"\"\"\n",
32
+ "intermedOut = {}\n",
33
+ "lvalues = list()\n",
34
+ "\n",
35
+ "def hookFn(m, i, o):\n",
36
+ " \"\"\"\n",
37
+ " call back for latent values\n",
38
+ " \"\"\"\n",
39
+ " #intermedOut[m] = o\n",
40
+ " lv = o.data.cpu().numpy()\n",
41
+ " lv = lv[0].tolist()\n",
42
+ " lvalues.append(lv)\n",
43
+ " #print(lv)\n",
44
+ "\n",
45
+ "def getLatValues():\n",
46
+ " \"\"\"\n",
47
+ " \"\"\"\n",
48
+ " return lvalues\n",
49
+ "\n",
50
+ "class FeedForwardNetwork(torch.nn.Module):\n",
51
+ " def __init__(self, configFile, addDefValues=None):\n",
52
+ " \"\"\"\n",
53
+ " In the constructor we instantiate two nn.Linear modules and assign them as\n",
54
+ " member variables.\n",
55
+ "\n",
56
+ " Parameters\n",
57
+ " configFile : config file path\n",
58
+ " addDefValues : dictionary of additional default values\t\n",
59
+ " \"\"\"\n",
60
+ " defValues = dict() if addDefValues is None else addDefValues.copy()\n",
61
+ " defValues[\"common.mode\"] = (\"training\", None)\n",
62
+ " defValues[\"common.model.directory\"] = (\"model\", None)\n",
63
+ " defValues[\"common.model.file\"] = (None, None)\n",
64
+ " defValues[\"common.preprocessing\"] = (None, None)\n",
65
+ " defValues[\"common.scaling.method\"] = (\"zscale\", None)\n",
66
+ " defValues[\"common.scaling.minrows\"] = (50, None)\n",
67
+ " defValues[\"common.scaling.param.file\"] = (None, None)\n",
68
+ " defValues[\"common.verbose\"] = (False, None)\n",
69
+ " defValues[\"common.device\"] = (\"cpu\", None)\n",
70
+ " defValues[\"train.data.file\"] = (None, \"missing training data file\")\n",
71
+ " defValues[\"train.data.fields\"] = (None, \"missing training data field ordinals\")\n",
72
+ " defValues[\"train.data.feature.fields\"] = (None, \"missing training data feature field ordinals\")\n",
73
+ " defValues[\"train.data.out.fields\"] = (None, \"missing training data feature field ordinals\")\n",
74
+ " defValues[\"train.layer.data\"] = (None, \"missing layer data\")\n",
75
+ " defValues[\"train.input.size\"] = (None, None)\n",
76
+ " defValues[\"train.output.size\"] = (None, \"missing output size\")\n",
77
+ " defValues[\"train.batch.size\"] = (10, None)\n",
78
+ " defValues[\"train.loss.reduction\"] = (\"mean\", None)\n",
79
+ " defValues[\"train.num.iterations\"] = (500, None)\n",
80
+ " defValues[\"train.lossFn\"] = (\"mse\", None) \n",
81
+ " defValues[\"train.optimizer\"] = (\"sgd\", None) \n",
82
+ " defValues[\"train.opt.learning.rate\"] = (.0001, None)\n",
83
+ " defValues[\"train.opt.weight.decay\"] = (0, None) \n",
84
+ " defValues[\"train.opt.momentum\"] = (0, None) \n",
85
+ " defValues[\"train.opt.eps\"] = (1e-08, None) \n",
86
+ " defValues[\"train.opt.dampening\"] = (0, None) \n",
87
+ " defValues[\"train.opt.momentum.nesterov\"] = (False, None) \n",
88
+ " defValues[\"train.opt.betas\"] = ([0.9, 0.999], None) \n",
89
+ " defValues[\"train.opt.alpha\"] = (0.99, None) \n",
90
+ " defValues[\"train.save.model\"] = (False, None) \n",
91
+ " defValues[\"train.track.error\"] = (False, None) \n",
92
+ " defValues[\"train.epoch.intv\"] = (5, None) \n",
93
+ " defValues[\"train.batch.intv\"] = (5, None) \n",
94
+ " defValues[\"train.print.weights\"] = (False, None) \n",
95
+ " defValues[\"valid.data.file\"] = (None, None)\n",
96
+ " defValues[\"valid.accuracy.metric\"] = (None, None)\n",
97
+ " defValues[\"predict.data.file\"] = (None, None)\n",
98
+ " defValues[\"predict.use.saved.model\"] = (True, None)\n",
99
+ " defValues[\"predict.output\"] = (\"binary\", None)\n",
100
+ " defValues[\"predict.feat.pad.size\"] = (60, None)\n",
101
+ " defValues[\"predict.print.output\"] = (True, None)\n",
102
+ " defValues[\"calibrate.num.bins\"] = (10, None)\n",
103
+ " defValues[\"calibrate.pred.prob.thresh\"] = (0.5, None)\n",
104
+ " defValues[\"calibrate.num.nearest.neighbors\"] = (10, None)\n",
105
+ " self.config = Configuration(configFile, defValues)\n",
106
+ "\n",
107
+ " super(FeedForwardNetwork, self).__init__()\n",
108
+ "\n",
109
+ " def setConfigParam(self, name, value):\n",
110
+ " \"\"\"\n",
111
+ " set config param\n",
112
+ "\n",
113
+ " Parameters\n",
114
+ " name : config name\n",
115
+ " value : config value\n",
116
+ " \"\"\"\n",
117
+ " self.config.setParam(name, value)\n",
118
+ "\n",
119
+ " def getConfig(self):\n",
120
+ " \"\"\"\n",
121
+ " get config object\n",
122
+ " \"\"\"\n",
123
+ " return self.config\n",
124
+ "\n",
125
+ " def setVerbose(self, verbose):\n",
126
+ " self.verbose = verbose\n",
127
+ "\n",
128
+ " def buildModel(self):\n",
129
+ " \"\"\"\n",
130
+ " Loads configuration and builds the various piecess necessary for the model\n",
131
+ " \"\"\"\n",
132
+ " torch.manual_seed(9999)\n",
133
+ "\n",
134
+ " self.verbose = self.config.getBooleanConfig(\"common.verbose\")[0]\n",
135
+ " numinp = self.config.getIntConfig(\"train.input.size\")[0]\n",
136
+ " if numinp is None:\n",
137
+ " numinp = len(self.config.getIntListConfig(\"train.data.feature.fields\")[0])\n",
138
+ " #numOut = len(self.config.getStringConfig(\"train.data.out.fields\")[0].split(\",\"))\n",
139
+ " self.outputSize = self.config.getIntConfig(\"train.output.size\")[0]\n",
140
+ " self.batchSize = self.config.getIntConfig(\"train.batch.size\")[0]\n",
141
+ " #lossRed = self.config.getStringConfig(\"train.loss.reduction\")[0]\n",
142
+ " #learnRate = self.config.getFloatConfig(\"train.opt.learning.rate\")[0]\n",
143
+ " self.numIter = self.config.getIntConfig(\"train.num.iterations\")[0]\n",
144
+ " optimizer = self.config.getStringConfig(\"train.optimizer\")[0]\n",
145
+ " self.lossFnStr = self.config.getStringConfig(\"train.lossFn\")[0]\n",
146
+ " self.accMetric = self.config.getStringConfig(\"valid.accuracy.metric\")[0]\n",
147
+ " self.trackErr = self.config.getBooleanConfig(\"train.track.error\")[0]\n",
148
+ " self.batchIntv = self.config.getIntConfig(\"train.batch.intv\")[0]\n",
149
+ " self.restored = False\n",
150
+ " self.clabels = list(range(self.outputSize)) if self.outputSize > 1 else None\n",
151
+ "\n",
152
+ " #build network\n",
153
+ " layers = list()\n",
154
+ " ninp = numinp\n",
155
+ " trData = self.config.getStringConfig(\"train.layer.data\")[0].split(\",\")\n",
156
+ " for ld in trData:\n",
157
+ " lde = ld.split(\":\")\n",
158
+ " assert len(lde) == 5, \"expecting 5 items for layer data\"\n",
159
+ "\n",
160
+ " #num of units, activation, whether batch normalize, whether batch normalize after activation, dropout fraction\n",
161
+ " nunit = int(lde[0])\n",
162
+ " actStr = lde[1]\n",
163
+ " act = FeedForwardNetwork.createActivation(actStr) if actStr != \"none\" else None\n",
164
+ " bnorm = lde[2] == \"true\"\n",
165
+ " afterAct = lde[3] == \"true\"\n",
166
+ " dpr = float(lde[4])\n",
167
+ "\n",
168
+ " layers.append(torch.nn.Linear(ninp, nunit))\t\t\t\n",
169
+ " if bnorm:\n",
170
+ " #with batch norm\n",
171
+ " if afterAct:\n",
172
+ " safeAppend(layers, act)\n",
173
+ " layers.append(torch.nn.BatchNorm1d(nunit))\n",
174
+ " else:\n",
175
+ " layers.append(torch.nn.BatchNorm1d(nunit))\n",
176
+ " safeAppend(layers, act)\n",
177
+ " else:\n",
178
+ " #without batch norm\n",
179
+ " safeAppend(layers, act)\n",
180
+ "\n",
181
+ " if dpr > 0:\n",
182
+ " layers.append(torch.nn.Dropout(dpr))\n",
183
+ " ninp = nunit\n",
184
+ "\n",
185
+ " self.layers = torch.nn.Sequential(*layers)\t\n",
186
+ "\n",
187
+ " self.device = FeedForwardNetwork.getDevice(self)\n",
188
+ "\n",
189
+ " #training data\n",
190
+ " dataFile = self.config.getStringConfig(\"train.data.file\")[0]\n",
191
+ " (featData, outData) = FeedForwardNetwork.prepData(self, dataFile)\n",
192
+ " self.featData = torch.from_numpy(featData)\n",
193
+ " self.outData = torch.from_numpy(outData)\n",
194
+ "\n",
195
+ " #validation data\n",
196
+ " dataFile = self.config.getStringConfig(\"valid.data.file\")[0]\n",
197
+ " (featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataFile)\n",
198
+ " self.validFeatData = torch.from_numpy(featDataV)\n",
199
+ " self.validOutData = torch.from_numpy(outDataV)\n",
200
+ "\n",
201
+ " # loss function and optimizer\n",
202
+ " self.lossFn = FeedForwardNetwork.createLossFunction(self, self.lossFnStr)\n",
203
+ " self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizer)\n",
204
+ "\n",
205
+ " self.yPred = None\n",
206
+ " self.restored = False\n",
207
+ "\n",
208
+ " #mode to device\n",
209
+ " self.device = FeedForwardNetwork.getDevice(self)\t\n",
210
+ " self.featData = self.featData.to(self.device)\n",
211
+ " self.outData = self.outData.to(self.device)\n",
212
+ " self.validFeatData = self.validFeatData.to(self.device)\n",
213
+ " self.to(self.device)\n",
214
+ "\n",
215
+ " @staticmethod\n",
216
+ " def getDevice(model):\n",
217
+ " \"\"\"\n",
218
+ " gets device\n",
219
+ "\n",
220
+ " Parameters\n",
221
+ " model : torch model\n",
222
+ " \"\"\"\n",
223
+ " devType = model.config.getStringConfig(\"common.device\")[0]\n",
224
+ " if devType == \"cuda\":\n",
225
+ " if torch.cuda.is_available():\n",
226
+ " device = torch.device(\"cuda\")\n",
227
+ " else:\n",
228
+ " exitWithMsg(\"cuda not available\")\n",
229
+ " else:\n",
230
+ " device = torch.device(\"cpu\")\n",
231
+ " return device\n",
232
+ "\n",
233
+ " def setValidationData(self, dataSource, prep=True):\n",
234
+ " \"\"\"\n",
235
+ " sets validation data\n",
236
+ "\n",
237
+ " Parameters\n",
238
+ " dataSource : data source str if file path or 2D array\n",
239
+ " prep : if True load and prepare \n",
240
+ " \"\"\"\n",
241
+ " if prep:\n",
242
+ " (featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataSource)\n",
243
+ " self.validFeatData = torch.from_numpy(featDataV)\n",
244
+ " self.validOutData = outDataV\n",
245
+ " else:\n",
246
+ " self.validFeatData = torch.from_numpy(dataSource[0])\n",
247
+ " self.validOutData = dataSource[1]\t\t\n",
248
+ "\n",
249
+ " self.validFeatData = self.validFeatData.to(self.device)\n",
250
+ "\n",
251
+ " @staticmethod\n",
252
+ " def createActivation(actName):\n",
253
+ " \"\"\"\n",
254
+ " create activation\n",
255
+ "\n",
256
+ " Parameters\n",
257
+ " actName : activation name\n",
258
+ " \"\"\"\n",
259
+ " if actName is None:\n",
260
+ " activation = None\n",
261
+ " elif actName == \"relu\":\n",
262
+ " activation = torch.nn.ReLU()\n",
263
+ " elif actName == \"tanh\":\n",
264
+ " activation = torch.nn.Tanh()\n",
265
+ " elif actName == \"sigmoid\":\n",
266
+ " activation = torch.nn.Sigmoid()\n",
267
+ " elif actName == \"softmax\":\n",
268
+ " activation = torch.nn.Softmax(dim=1)\n",
269
+ " else:\n",
270
+ " exitWithMsg(\"invalid activation function name \" + actName)\n",
271
+ " return activation\n",
272
+ "\n",
273
+ " @staticmethod\n",
274
+ " def createLossFunction(model, lossFnName):\n",
275
+ " \"\"\"\n",
276
+ " create loss function\n",
277
+ "\n",
278
+ " Parameters\n",
279
+ " lossFnName : loss function name\n",
280
+ " \"\"\"\n",
281
+ " config = model.config\n",
282
+ " lossRed = config.getStringConfig(\"train.loss.reduction\")[0]\n",
283
+ " if lossFnName == \"ltwo\" or lossFnName == \"mse\":\n",
284
+ " lossFunc = torch.nn.MSELoss(reduction=lossRed)\n",
285
+ " elif lossFnName == \"ce\":\n",
286
+ " lossFunc = torch.nn.CrossEntropyLoss(reduction=lossRed)\n",
287
+ " elif lossFnName == \"lone\" or lossFnName == \"mae\":\n",
288
+ " lossFunc = torch.nn.L1Loss(reduction=lossRed)\n",
289
+ " elif lossFnName == \"bce\":\n",
290
+ " lossFunc = torch.nn.BCELoss(reduction=lossRed)\n",
291
+ " elif lossFnName == \"bcel\":\n",
292
+ " lossFunc = torch.nn.BCEWithLogitsLoss(reduction=lossRed)\n",
293
+ " elif lossFnName == \"sm\":\n",
294
+ " lossFunc = torch.nn.SoftMarginLoss(reduction=lossRed)\n",
295
+ " elif lossFnName == \"mlsm\":\n",
296
+ " lossFunc = torch.nn.MultiLabelSoftMarginLoss(reduction=lossRed)\n",
297
+ " else:\n",
298
+ " exitWithMsg(\"invalid loss function name \" + lossFnName)\n",
299
+ " return lossFunc\n",
300
+ "\n",
301
+ " @staticmethod\n",
302
+ " def createOptimizer(model, optName):\n",
303
+ " \"\"\"\n",
304
+ " create optimizer\n",
305
+ "\n",
306
+ " Parameters\n",
307
+ " optName : optimizer name\n",
308
+ " \"\"\"\n",
309
+ " config = model.config\n",
310
+ " learnRate = config.getFloatConfig(\"train.opt.learning.rate\")[0]\n",
311
+ " weightDecay = config.getFloatConfig(\"train.opt.weight.decay\")[0]\n",
312
+ " momentum = config.getFloatConfig(\"train.opt.momentum\")[0]\n",
313
+ " eps = config.getFloatConfig(\"train.opt.eps\")[0]\n",
314
+ " if optName == \"sgd\":\n",
315
+ " dampening = config.getFloatConfig(\"train.opt.dampening\")[0]\n",
316
+ " momentumNesterov = config.getBooleanConfig(\"train.opt.momentum.nesterov\")[0]\n",
317
+ " optimizer = torch.optim.SGD(model.parameters(),lr=learnRate, momentum=momentum, \n",
318
+ " dampening=dampening, weight_decay=weightDecay, nesterov=momentumNesterov)\n",
319
+ " elif optName == \"adam\":\n",
320
+ " betas = config.getFloatListConfig(\"train.opt.betas\")[0]\n",
321
+ " betas = (betas[0], betas[1]) \n",
322
+ " optimizer = torch.optim.Adam(model.parameters(), lr=learnRate,betas=betas, eps = eps,\n",
323
+ " weight_decay=weightDecay)\n",
324
+ " elif optName == \"rmsprop\":\n",
325
+ " alpha = config.getFloatConfig(\"train.opt.alpha\")[0]\n",
326
+ " optimizer = torch.optim.RMSprop(model.parameters(), lr=learnRate, alpha=alpha,\n",
327
+ " eps=eps, weight_decay=weightDecay, momentum=momentum)\n",
328
+ " else:\n",
329
+ " exitWithMsg(\"invalid optimizer name \" + optName)\n",
330
+ " return optimizer\n",
331
+ "\n",
332
+ "\n",
333
+ " def forward(self, x):\n",
334
+ " \"\"\"\n",
335
+ " In the forward function we accept a Tensor of input data and we must return\n",
336
+ " a Tensor of output data. We can use Modules defined in the constructor as\n",
337
+ " well as arbitrary (differentiable) operations on Tensors.\n",
338
+ "\n",
339
+ " Parameters\n",
340
+ " x : data batch\n",
341
+ " \"\"\"\n",
342
+ " y = self.layers(x)\t\n",
343
+ " return y\n",
344
+ "\n",
345
+ " @staticmethod\n",
346
+ " def addForwardHook(model, l, cl = 0):\n",
347
+ " \"\"\"\n",
348
+ " register forward hooks\n",
349
+ "\n",
350
+ " Parameters\n",
351
+ " l : \n",
352
+ " cl :\n",
353
+ " \"\"\"\n",
354
+ " for name, layer in model._modules.items():\n",
355
+ " #If it is a sequential, don't register a hook on it\n",
356
+ " # but recursively register hook on all it's module children\n",
357
+ " print(str(cl) + \" : \" + name)\n",
358
+ " if isinstance(layer, torch.nn.Sequential):\n",
359
+ " FeedForwardNetwork.addForwardHook(layer, l, cl)\n",
360
+ " else:\n",
361
+ " #\t it's a non sequential. Register a hook\n",
362
+ " if cl == l:\n",
363
+ " print(\"setting hook at layer \" + str(l))\n",
364
+ " layer.register_forward_hook(hookFn)\n",
365
+ " cl += 1\n",
366
+ "\n",
367
+ " @staticmethod\n",
368
+ " def prepData(model, dataSource, includeOutFld=True):\n",
369
+ " \"\"\"\n",
370
+ " loads and prepares data\n",
371
+ "\n",
372
+ " Parameters\n",
373
+ " dataSource : data source str if file path or 2D array\n",
374
+ " includeOutFld : True if target freld to be included\n",
375
+ " \"\"\"\n",
376
+ " # parameters\n",
377
+ " fieldIndices = model.config.getIntListConfig(\"train.data.fields\")[0]\n",
378
+ " featFieldIndices = model.config.getIntListConfig(\"train.data.feature.fields\")[0]\n",
379
+ "\n",
380
+ " #all data and feature data\n",
381
+ " isDataFile = isinstance(dataSource, str)\n",
382
+ " selFieldIndices = fieldIndices if includeOutFld else fieldIndices[:-1]\n",
383
+ " if isDataFile: \n",
384
+ " #source file path \n",
385
+ " (data, featData) = loadDataFile(dataSource, \",\", selFieldIndices, featFieldIndices)\n",
386
+ " else:\n",
387
+ " # tabular data\n",
388
+ " data = tableSelFieldsFilter(dataSource, selFieldIndices)\n",
389
+ " featData = tableSelFieldsFilter(data, featFieldIndices)\n",
390
+ " #print(featData)\n",
391
+ " featData = np.array(featData)\n",
392
+ "\n",
393
+ " if (model.config.getStringConfig(\"common.preprocessing\")[0] == \"scale\"):\n",
394
+ " scalingMethod = model.config.getStringConfig(\"common.scaling.method\")[0]\n",
395
+ "\n",
396
+ " #scale only if there are enough rows\n",
397
+ " nrow = featData.shape[0]\n",
398
+ " minrows = model.config.getIntConfig(\"common.scaling.minrows\")[0]\n",
399
+ " if nrow > minrows:\n",
400
+ " #in place scaling\n",
401
+ " featData = scaleData(featData, scalingMethod)\n",
402
+ " else:\n",
403
+ " #use pre computes scaling parameters\n",
404
+ " spFile = model.config.getStringConfig(\"common.scaling.param.file\")[0]\n",
405
+ " if spFile is None:\n",
406
+ " exitWithMsg(\"for small data sets pre computed scaling parameters need to provided\")\n",
407
+ " scParams = restoreObject(spFile)\n",
408
+ " featData = scaleDataWithParams(featData, scalingMethod, scParams)\n",
409
+ " featData = np.array(featData)\n",
410
+ "\n",
411
+ " # target data\n",
412
+ " if includeOutFld:\n",
413
+ " outFieldIndices = model.config.getStringConfig(\"train.data.out.fields\")[0]\n",
414
+ " outFieldIndices = strToIntArray(outFieldIndices, \",\")\n",
415
+ " if isDataFile:\n",
416
+ " outData = data[:,outFieldIndices]\n",
417
+ " else:\n",
418
+ " outData = tableSelFieldsFilter(data, outFieldIndices)\n",
419
+ " outData = np.array(outData)\n",
420
+ " foData = (featData.astype(np.float32), outData.astype(np.float32))\n",
421
+ " else:\n",
422
+ " foData = featData.astype(np.float32)\n",
423
+ " return foData\n",
424
+ "\n",
425
+ " @staticmethod\n",
426
+ " def saveCheckpt(model):\n",
427
+ " \"\"\"\n",
428
+ " checkpoints model\n",
429
+ "\n",
430
+ " Parameters\n",
431
+ " model : torch model\n",
432
+ " \"\"\"\n",
433
+ " print(\"..saving model checkpoint\")\n",
434
+ " modelDirectory = model.config.getStringConfig(\"common.model.directory\")[0]\n",
435
+ " assert os.path.exists(modelDirectory), \"model save directory does not exist\"\n",
436
+ " modelFile = model.config.getStringConfig(\"common.model.file\")[0]\n",
437
+ " filepath = os.path.join(modelDirectory, modelFile)\n",
438
+ " state = {\"state_dict\": model.state_dict(), \"optim_dict\": model.optimizer.state_dict()}\n",
439
+ " torch.save(state, filepath)\n",
440
+ " if model.verbose:\n",
441
+ " print(\"model saved\")\n",
442
+ "\n",
443
+ " @staticmethod\n",
444
+ " def restoreCheckpt(model, loadOpt=False):\n",
445
+ " \"\"\"\n",
446
+ " restored checkpointed model\n",
447
+ "\n",
448
+ " Parameters\n",
449
+ " model : torch model\n",
450
+ " loadOpt : True if optimizer to be loaded\n",
451
+ " \"\"\"\n",
452
+ " if not model.restored:\n",
453
+ " print(\"..restoring model checkpoint\")\n",
454
+ " modelDirectory = model.config.getStringConfig(\"common.model.directory\")[0]\n",
455
+ " modelFile = model.config.getStringConfig(\"common.model.file\")[0]\n",
456
+ " filepath = os.path.join(modelDirectory, modelFile)\n",
457
+ " assert os.path.exists(filepath), \"model save file does not exist\"\n",
458
+ " checkpoint = torch.load(filepath)\n",
459
+ " model.load_state_dict(checkpoint[\"state_dict\"])\n",
460
+ " model.to(model.device)\n",
461
+ " if loadOpt:\n",
462
+ " model.optimizer.load_state_dict(checkpoint[\"optim_dict\"])\n",
463
+ " model.restored = True\n",
464
+ "\n",
465
+ " @staticmethod\n",
466
+ " def processClassifOutput(yPred, config):\n",
467
+ " \"\"\"\n",
468
+ " extracts probability label 1 or label with highest probability\n",
469
+ "\n",
470
+ " Parameters\n",
471
+ " yPred : predicted output\n",
472
+ " config : config object\n",
473
+ " \"\"\"\n",
474
+ " outType = config.getStringConfig(\"predict.output\")[0]\n",
475
+ " if outType == \"prob\":\n",
476
+ " outputSize = config.getIntConfig(\"train.output.size\")[0]\n",
477
+ " if outputSize == 2:\n",
478
+ " #return prob of pos class for binary classifier \n",
479
+ " yPred = yPred[:, 1]\n",
480
+ " else:\n",
481
+ " #return class value and probability for multi classifier \n",
482
+ " yCl = np.argmax(yPred, axis=1)\n",
483
+ " yPred = list(map(lambda y : y[0][y[1]], zip(yPred, yCl)))\n",
484
+ " yPred = zip(yCl, yPred)\n",
485
+ " else:\n",
486
+ " yPred = np.argmax(yPred, axis=1)\n",
487
+ " return yPred\n",
488
+ "\n",
489
+ " @staticmethod\n",
490
+ " def printPrediction(yPred, config, dataSource):\n",
491
+ " \"\"\"\n",
492
+ " prints input feature data and prediction\n",
493
+ "\n",
494
+ " Parameters\n",
495
+ " yPred : predicted output\n",
496
+ " config : config object\n",
497
+ " dataSource : data source str if file path or 2D array\n",
498
+ " \"\"\"\n",
499
+ " #prDataFilePath = config.getStringConfig(\"predict.data.file\")[0]\n",
500
+ " padWidth = config.getIntConfig(\"predict.feat.pad.size\")[0]\n",
501
+ " i = 0\n",
502
+ " if type(dataSource) == str:\n",
503
+ " for rec in fileRecGen(dataSource, \",\"):\n",
504
+ " feat = (\",\".join(rec)).ljust(padWidth, \" \")\n",
505
+ " rec = feat + \"\\t\" + str(yPred[i])\n",
506
+ " print(rec)\n",
507
+ " i += 1\n",
508
+ " else:\n",
509
+ " for rec in dataSource:\n",
510
+ " srec = toStrList(rec, 6)\n",
511
+ " feat = (\",\".join(srec)).ljust(padWidth, \" \")\n",
512
+ " srec = feat + \"\\t\" + str(yPred[i])\n",
513
+ " print(srec)\n",
514
+ " i += 1\n",
515
+ "\n",
516
+ "\n",
517
+ " @staticmethod\n",
518
+ " def allTrain(model):\n",
519
+ " \"\"\"\n",
520
+ " train with all data\n",
521
+ "\n",
522
+ " Parameters\n",
523
+ " model : torch model\n",
524
+ " \"\"\"\n",
525
+ " # train mode\n",
526
+ " model.train()\n",
527
+ " for t in range(model.numIter):\n",
528
+ "\n",
529
+ "\n",
530
+ " # Forward pass: Compute predicted y by passing x to the model\n",
531
+ " yPred = model(model.featData)\n",
532
+ "\n",
533
+ " # Compute and print loss\n",
534
+ " loss = model.lossFn(yPred, model.outData)\n",
535
+ " if model.verbose and t % 50 == 0:\n",
536
+ " print(\"epoch {} loss {:.6f}\".format(t, loss.item()))\n",
537
+ "\n",
538
+ " # Zero gradients, perform a backward pass, and update the weights.\n",
539
+ " model.optimizer.zero_grad()\n",
540
+ " loss.backward()\n",
541
+ " model.optimizer.step() \t\n",
542
+ "\n",
543
+ " #validate\n",
544
+ " model.eval()\n",
545
+ " yPred = model(model.validFeatData)\n",
546
+ " yPred = yPred.data.cpu().numpy()\n",
547
+ " yActual = model.validOutData\n",
548
+ " if model.verbose:\n",
549
+ " result = np.concatenate((yPred, yActual), axis = 1)\n",
550
+ " print(\"predicted actual\")\n",
551
+ " print(result)\n",
552
+ "\n",
553
+ " score = perfMetric(model.accMetric, yActual, yPred)\n",
554
+ " print(formatFloat(3, score, \"perf score\"))\n",
555
+ " return score\n",
556
+ "\n",
+ "    @staticmethod\n",
+ "    def batchTrain(model):\n",
+ "        \"\"\"\n",
+ "        train with batch data\n",
+ "\n",
+ "        Parameters\n",
+ "        model : torch model\n",
+ "        \"\"\"\n",
+ "        model.restored = False\n",
+ "        trainData = TensorDataset(model.featData, model.outData)\n",
+ "        trainDataLoader = DataLoader(dataset=trainData, batch_size=model.batchSize, shuffle=True)\n",
+ "        epochIntv = model.config.getIntConfig(\"train.epoch.intv\")[0]\n",
+ "\n",
+ "        # train mode\n",
+ "        model.train()\n",
+ "\n",
+ "        if model.trackErr:\n",
+ "            trErr = list()\n",
+ "            vaErr = list()\n",
+ "        #epoch\n",
+ "        for t in range(model.numIter):\n",
+ "            #batch\n",
+ "            b = 0\n",
+ "            epochLoss = 0.0\n",
+ "            for xBatch, yBatch in trainDataLoader:\n",
+ "\n",
+ "                # Forward pass: Compute predicted y by passing x to the model\n",
+ "                xBatch, yBatch = xBatch.to(model.device), yBatch.to(model.device)\n",
+ "                yPred = model(xBatch)\n",
+ "\n",
+ "                # Compute and print loss (guard against batchIntv == 0)\n",
+ "                loss = model.lossFn(yPred, yBatch)\n",
+ "                if model.verbose and t % epochIntv == 0 and model.batchIntv > 0 and b % model.batchIntv == 0:\n",
+ "                    print(\"epoch {} batch {} loss {:.6f}\".format(t, b, loss.item()))\n",
+ "\n",
+ "                if model.trackErr and model.batchIntv == 0:\n",
+ "                    epochLoss += loss.item()\n",
+ "\n",
+ "                #error tracking at batch level\n",
+ "                if model.trackErr and model.batchIntv > 0 and b % model.batchIntv == 0:\n",
+ "                    trErr.append(loss.item())\n",
+ "                    vloss = FeedForwardNetwork.evaluateModel(model)\n",
+ "                    vaErr.append(vloss)\n",
+ "\n",
+ "                # Zero gradients, perform a backward pass, and update the weights.\n",
+ "                model.optimizer.zero_grad()\n",
+ "                loss.backward()\n",
+ "                model.optimizer.step()\n",
+ "                b += 1\n",
+ "\n",
+ "            #error tracking at epoch level\n",
+ "            if model.trackErr and model.batchIntv == 0:\n",
+ "                epochLoss /= len(trainDataLoader)\n",
+ "                trErr.append(epochLoss)\n",
+ "                vloss = FeedForwardNetwork.evaluateModel(model)\n",
+ "                vaErr.append(vloss)\n",
+ "\n",
+ "        #validate\n",
+ "        model.eval()\n",
+ "        yPred = model(model.validFeatData)\n",
+ "        yPred = yPred.data.cpu().numpy()\n",
+ "        yActual = model.validOutData\n",
+ "        if model.verbose:\n",
+ "            vsize = yPred.shape[0]\n",
+ "            print(\"\\npredicted \\t\\t actual\")\n",
+ "            for i in range(vsize):\n",
+ "                print(str(yPred[i]) + \"\\t\" + str(yActual[i]))\n",
+ "\n",
+ "        score = perfMetric(model.accMetric, yActual, yPred)\n",
+ "        print(yActual)\n",
+ "        print(yPred)\n",
+ "        print(formatFloat(3, score, \"perf score\"))\n",
+ "\n",
+ "        #save\n",
+ "        modelSave = model.config.getBooleanConfig(\"train.model.save\")[0]\n",
+ "        if modelSave:\n",
+ "            FeedForwardNetwork.saveCheckpt(model)\n",
+ "\n",
+ "        if model.trackErr:\n",
+ "            FeedForwardNetwork.errorPlot(model, trErr, vaErr)\n",
+ "\n",
+ "        if model.config.getBooleanConfig(\"train.print.weights\")[0]:\n",
+ "            print(\"model weights\")\n",
+ "            for param in model.parameters():\n",
+ "                print(param.data)\n",
+ "        return score\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def errorPlot(model, trErr, vaErr):\n",
+ "        \"\"\"\n",
+ "        plot errors\n",
+ "\n",
+ "        Parameters\n",
+ "        model : torch model\n",
+ "        trErr : training error list\n",
+ "        vaErr : validation error list\n",
+ "        \"\"\"\n",
+ "        x = np.arange(len(trErr))\n",
+ "        plt.plot(x, trErr, label=\"training error\")\n",
+ "        plt.plot(x, vaErr, label=\"validation error\")\n",
+ "        plt.xlabel(\"iteration\")\n",
+ "        plt.ylabel(\"error\")\n",
+ "        plt.legend([\"training error\", \"validation error\"], loc='upper left')\n",
+ "        plt.show()\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def modelPredict(model, dataSource = None):\n",
+ "        \"\"\"\n",
+ "        predict\n",
+ "\n",
+ "        Parameters\n",
+ "        model : torch model\n",
+ "        dataSource : data source\n",
+ "        \"\"\"\n",
+ "        #train or restore model\n",
+ "        useSavedModel = model.config.getBooleanConfig(\"predict.use.saved.model\")[0]\n",
+ "        if useSavedModel:\n",
+ "            FeedForwardNetwork.restoreCheckpt(model)\n",
+ "        else:\n",
+ "            FeedForwardNetwork.batchTrain(model)\n",
+ "\n",
+ "        #predict\n",
+ "        if dataSource is None:\n",
+ "            dataSource = model.config.getStringConfig(\"predict.data.file\")[0]\n",
+ "        featData = FeedForwardNetwork.prepData(model, dataSource, False)\n",
+ "        #print(featData)\n",
+ "        featData = torch.from_numpy(featData)\n",
+ "        featData = featData.to(model.device)\n",
+ "\n",
+ "        model.eval()\n",
+ "        yPred = model(featData)\n",
+ "        yPred = yPred.data.cpu().numpy()\n",
+ "        #print(yPred)\n",
+ "\n",
+ "        if model.outputSize >= 2:\n",
+ "            #classification\n",
+ "            yPred = FeedForwardNetwork.processClassifOutput(yPred, model.config)\n",
+ "\n",
+ "        # print prediction\n",
+ "        if model.config.getBooleanConfig(\"predict.print.output\")[0]:\n",
+ "            FeedForwardNetwork.printPrediction(yPred, model.config, dataSource)\n",
+ "\n",
+ "        return yPred\n",
+ "\n",
+ "    def predict(self, dataSource = None):\n",
+ "        \"\"\"\n",
+ "        predict\n",
+ "\n",
+ "        Parameters\n",
+ "        dataSource : data source\n",
+ "        \"\"\"\n",
+ "        return FeedForwardNetwork.modelPredict(self, dataSource)\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def evaluateModel(model):\n",
+ "        \"\"\"\n",
+ "        evaluate model\n",
+ "\n",
+ "        Parameters\n",
+ "        model : torch model\n",
+ "        \"\"\"\n",
+ "        model.eval()\n",
+ "        with torch.no_grad():\n",
+ "            yPred = model(model.validFeatData)\n",
+ "            #yPred = yPred.data.cpu().numpy()\n",
+ "            yActual = model.validOutData\n",
+ "            score = model.lossFn(yPred, yActual).item()\n",
+ "        model.train()\n",
+ "        return score\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def prepValidate(model, dataSource=None):\n",
+ "        \"\"\"\n",
+ "        prepare for validation\n",
+ "\n",
+ "        Parameters\n",
+ "        model : torch model\n",
+ "        dataSource : data source\n",
+ "        \"\"\"\n",
+ "        #train or restore model\n",
+ "        if not model.restored:\n",
+ "            useSavedModel = model.config.getBooleanConfig(\"predict.use.saved.model\")[0]\n",
+ "            if useSavedModel:\n",
+ "                FeedForwardNetwork.restoreCheckpt(model)\n",
+ "            else:\n",
+ "                FeedForwardNetwork.batchTrain(model)\n",
+ "            model.restored = True\n",
+ "\n",
+ "        if dataSource is not None:\n",
+ "            model.setValidationData(dataSource)\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def validateModel(model, retPred=False):\n",
+ "        \"\"\"\n",
+ "        model validation\n",
+ "\n",
+ "        Parameters\n",
+ "        model : torch model\n",
+ "        retPred : if True return prediction\n",
+ "        \"\"\"\n",
+ "        model.eval()\n",
+ "        yPred = model(model.validFeatData)\n",
+ "        yPred = yPred.data.cpu().numpy()\n",
+ "        model.yPred = yPred\n",
+ "        yActual = model.validOutData\n",
+ "        vsize = yPred.shape[0]\n",
+ "        if model.verbose:\n",
+ "            print(\"\\npredicted \\t actual\")\n",
+ "            for i in range(vsize):\n",
+ "                print(\"{:.3f}\\t\\t{:.3f}\".format(yPred[i][0], yActual[i][0]))\n",
+ "\n",
+ "        score = perfMetric(model.accMetric, yActual, yPred)\n",
+ "        print(formatFloat(3, score, \"perf score\"))\n",
+ "\n",
+ "        if retPred:\n",
+ "            y = list(map(lambda i : (yPred[i][0], yActual[i][0]), range(vsize)))\n",
+ "            res = (y, score)\n",
+ "            return res\n",
+ "        else:\n",
+ "            return score"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
lib/txproc.ipynb ADDED
@@ -0,0 +1,1002 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f720c141",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "from random import randint\n",
+ "import random\n",
+ "import time\n",
+ "from datetime import datetime\n",
+ "import re, string, unicodedata\n",
+ "import math\n",
+ "import nltk\n",
+ "import contractions\n",
+ "import inflect\n",
+ "from bs4 import BeautifulSoup\n",
+ "from nltk import word_tokenize, sent_tokenize\n",
+ "from nltk.corpus import stopwords\n",
+ "from nltk.stem.isri import ISRIStemmer\n",
+ "from nltk.stem.porter import PorterStemmer\n",
+ "from nltk.stem.snowball import SnowballStemmer\n",
+ "from nltk.stem import LancasterStemmer, WordNetLemmatizer\n",
+ "from nltk.tag import StanfordNERTagger\n",
+ "from nltk.tokenize import word_tokenize, sent_tokenize\n",
+ "import spacy\n",
+ "import torch\n",
+ "from collections import defaultdict\n",
+ "import pickle\n",
+ "import numpy as np\n",
+ "\n",
+ "sys.path.append(os.path.abspath(\"../lib\"))\n",
+ "from util import *\n",
+ "from mlutil import *\n",
+ "\n",
+ "lcc = [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\", \"k\",\"l\",\"m\",\"n\",\"o\",\n",
+ "\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]\n",
+ "ucc = [\"A\",\"B\",\"C\",\"D\",\"E\",\"F\",\"G\",\"H\",\"I\",\"J\",\"K\",\"L\",\"M\", \"N\",\"O\",\"P\",\"Q\",\"R\",\"S\",\"T\",\"U\",\"V\",\"W\",\"X\",\"Y\",\"Z\"]\n",
+ "dig = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\"]\n",
+ "spc = [\"@\",\"#\",\"$\",\"%\",\"^\",\"&\",\"*\",\"(\",\")\",\"_\",\"+\",\"{\",\"}\",\"[\",\"]\",\"|\",\":\",\"<\",\">\",\"?\",\";\",\",\",\".\"]\n",
+ "\n",
+ "\n",
+ "class TextPreProcessor:\n",
+ "    \"\"\"\n",
+ "    text preprocessor\n",
+ "    \"\"\"\n",
+ "    def __init__(self, stemmer = \"lancaster\", verbose=False):\n",
+ "        self.stemmer = stemmer\n",
+ "        self.verbose = verbose\n",
+ "        self.lemmatizer = WordNetLemmatizer()\n",
+ "\n",
+ "    def stripHtml(self, text):\n",
+ "        soup = BeautifulSoup(text, \"html.parser\")\n",
+ "        return soup.get_text()\n",
+ "\n",
+ "    def removeBetweenSquareBrackets(self, text):\n",
+ "        return re.sub('\[[^]]*\]', '', text)\n",
+ "\n",
+ "    def denoiseText(self, text):\n",
+ "        text = self.stripHtml(text)\n",
+ "        text = self.removeBetweenSquareBrackets(text)\n",
+ "        return text\n",
+ "\n",
+ "    def replaceContractions(self, text):\n",
+ "        \"\"\"Replace contractions in string of text\"\"\"\n",
+ "        return contractions.fix(text)\n",
+ "\n",
+ "    def tokenize(self, text):\n",
+ "        words = nltk.word_tokenize(text)\n",
+ "        return words\n",
+ "\n",
+ "    def removeNonAscii(self, words):\n",
+ "        \"\"\"Remove non-ASCII characters from list of tokenized words\"\"\"\n",
+ "        newWords = []\n",
+ "        for word in words:\n",
+ "            # all tokens are str in Python 3; normalize and drop non ASCII chars\n",
+ "            newWord = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('ascii')\n",
+ "            newWords.append(newWord)\n",
+ "        return newWords\n",
+ "\n",
+ "    def replaceNonAsciiFromText(self, text):\n",
+ "        \"\"\" replaces non ascii with blank \"\"\"\n",
+ "        return ''.join([i if ord(i) < 128 else ' ' for i in text])\n",
+ "\n",
+ "    def removeNonAsciiFromText(self, text):\n",
+ "        \"\"\" removes non ascii \"\"\"\n",
+ "        return ''.join([i if ord(i) < 128 else '' for i in text])\n",
+ "\n",
+ "    def allow(self, words):\n",
+ "        \"\"\" allow only specific characters \"\"\"\n",
+ "        allowed = [word for word in words if re.match('^[A-Za-z0-9\.\,\:\;\!\?\(\)\'\-\$\@\%\\\"]+$', word) is not None]\n",
+ "        return allowed\n",
+ "\n",
+ "    def toLowercase(self, words):\n",
+ "        \"\"\"Convert all characters to lowercase from list of tokenized words\"\"\"\n",
+ "        newWords = [word.lower() for word in words]\n",
+ "        return newWords\n",
+ "\n",
+ "    def removePunctuation(self, words):\n",
+ "        \"\"\"Remove punctuation from list of tokenized words\"\"\"\n",
+ "        newWords = []\n",
+ "        for word in words:\n",
+ "            newWord = re.sub(r'[^\w\s]', '', word)\n",
+ "            if newWord != '':\n",
+ "                newWords.append(newWord)\n",
+ "        return newWords\n",
+ "\n",
+ "    def replaceNumbers(self, words):\n",
+ "        \"\"\"Replace all integer occurrences in list of tokenized words with textual representation\"\"\"\n",
+ "        p = inflect.engine()\n",
+ "        newWords = []\n",
+ "        for word in words:\n",
+ "            if word.isdigit():\n",
+ "                newWord = p.number_to_words(word)\n",
+ "                newWords.append(newWord)\n",
+ "            else:\n",
+ "                newWords.append(word)\n",
+ "        return newWords\n",
+ "\n",
+ "    def removeStopwords(self, words):\n",
+ "        \"\"\"Remove stop words from list of tokenized words\"\"\"\n",
+ "        newWords = []\n",
+ "        for word in words:\n",
+ "            if word not in stopwords.words('english'):\n",
+ "                newWords.append(word)\n",
+ "        return newWords\n",
+ "\n",
+ "    def removeCustomStopwords(self, words, stopWords):\n",
+ "        \"\"\"Remove custom stop words from list of tokenized words\"\"\"\n",
+ "        removed = [word for word in words if word not in stopWords]\n",
+ "        return removed\n",
+ "\n",
+ "    def removeLowFreqWords(self, words, minFreq):\n",
+ "        \"\"\"Remove low frequency words from list of tokenized words\"\"\"\n",
+ "        frequency = defaultdict(int)\n",
+ "        for word in words:\n",
+ "            frequency[word] += 1\n",
+ "        removed = [word for word in words if frequency[word] > minFreq]\n",
+ "        return removed\n",
+ "\n",
+ "    def removeNumbers(self, words):\n",
+ "        \"\"\"Remove numbers\"\"\"\n",
+ "        removed = [word for word in words if not isNumber(word)]\n",
+ "        return removed\n",
+ "\n",
+ "    def removeShortWords(self, words, minLength):\n",
+ "        \"\"\"Remove short words\"\"\"\n",
+ "        removed = [word for word in words if len(word) >= minLength]\n",
+ "        return removed\n",
+ "\n",
+ "    def keepAllowedWords(self, words, keepWords):\n",
+ "        \"\"\"Keep words from the list only\"\"\"\n",
+ "        kept = [word for word in words if word in keepWords]\n",
+ "        return kept\n",
+ "\n",
+ "    def stemWords(self, words):\n",
+ "        \"\"\"Stem words in list of tokenized words\"\"\"\n",
+ "        if self.stemmer == \"lancaster\":\n",
+ "            stemmer = LancasterStemmer()\n",
+ "        elif self.stemmer == \"snowball\":\n",
+ "            stemmer = SnowballStemmer(\"english\")\n",
+ "        elif self.stemmer == \"porter\":\n",
+ "            stemmer = PorterStemmer()\n",
+ "        else:\n",
+ "            raise ValueError(\"invalid stemmer \" + self.stemmer)\n",
+ "        stems = [stemmer.stem(word) for word in words]\n",
+ "        return stems\n",
+ "\n",
+ "    def lemmatizeWords(self, words):\n",
+ "        \"\"\"Lemmatize tokens in list of tokenized words\"\"\"\n",
+ "        lemmas = [self.lemmatizer.lemmatize(word) for word in words]\n",
+ "        return lemmas\n",
+ "\n",
+ "    def lemmatizeVerbs(self, words):\n",
+ "        \"\"\"Lemmatize verbs in list of tokenized words\"\"\"\n",
+ "        lemmas = [self.lemmatizer.lemmatize(word, pos='v') for word in words]\n",
+ "        return lemmas\n",
+ "\n",
+ "    def normalize(self, words):\n",
+ "        words = self.removeNonAscii(words)\n",
+ "        words = self.toLowercase(words)\n",
+ "        words = self.removePunctuation(words)\n",
+ "        words = self.replaceNumbers(words)\n",
+ "        words = self.removeStopwords(words)\n",
+ "        return words\n",
+ "\n",
+ "    def posTag(self, textTokens):\n",
+ "        tags = nltk.pos_tag(textTokens)\n",
+ "        return tags\n",
+ "\n",
+ "    def extractEntity(self, textTokens, classifierPath, jarPath):\n",
+ "        st = StanfordNERTagger(classifierPath, jarPath)\n",
+ "        entities = st.tag(textTokens)\n",
+ "        return entities\n",
+ "\n",
+ "    def documentFeatures(self, document, wordFeatures):\n",
+ "        documentWords = set(document)\n",
+ "        features = {}\n",
+ "        for word in wordFeatures:\n",
+ "            features[word] = (word in documentWords)\n",
+ "        return features\n",
+ "\n",
+ "class NGram:\n",
+ "    \"\"\"\n",
+ "    word ngram\n",
+ "    \"\"\"\n",
+ "    def __init__(self, vocFilt, verbose=False):\n",
+ "        \"\"\"\n",
+ "        initialize\n",
+ "        \"\"\"\n",
+ "        self.vocFilt = vocFilt\n",
+ "        self.nGramCounter = dict()\n",
+ "        self.nGramFreq = dict()\n",
+ "        self.corpSize = 0\n",
+ "        self.vocabulary = set()\n",
+ "        self.freqDone = False\n",
+ "        self.verbose = verbose\n",
+ "        self.vecWords = None\n",
+ "        self.nonZeroCount = 0\n",
+ "\n",
+ "    def countDocNGrams(self, words):\n",
+ "        \"\"\"\n",
+ "        count ngrams in a doc\n",
+ "        \"\"\"\n",
+ "        if self.verbose:\n",
+ "            print (\"doc size \" + str(len(words)))\n",
+ "        nGrams = self.toNGram(words)\n",
+ "        for nGram in nGrams:\n",
+ "            count = self.nGramCounter.get(nGram, 0)\n",
+ "            self.nGramCounter[nGram] = count + 1\n",
+ "            self.corpSize += 1\n",
+ "        self.vocabulary.update(words)\n",
+ "\n",
+ "    def remLowCount(self, minCount):\n",
+ "        \"\"\"\n",
+ "        removes items with count below threshold\n",
+ "        \"\"\"\n",
+ "        self.nGramCounter = dict(filter(lambda item: item[1] >= minCount, self.nGramCounter.items()))\n",
+ "\n",
+ "    def getVocabSize(self):\n",
+ "        \"\"\"\n",
+ "        get vocabulary size\n",
+ "        \"\"\"\n",
+ "        return len(self.nGramCounter)\n",
+ "\n",
+ "    def getNGramFreq(self):\n",
+ "        \"\"\"\n",
+ "        get normalized count\n",
+ "        \"\"\"\n",
+ "        if self.verbose:\n",
+ "            print (\"counter size \" + str(len(self.nGramCounter)))\n",
+ "        if not self.freqDone:\n",
+ "            for item in self.nGramCounter.items():\n",
+ "                self.nGramFreq[item[0]] = float(item[1]) / self.corpSize\n",
+ "            self.freqDone = True\n",
+ "        return self.nGramFreq\n",
+ "\n",
+ "    def getNGramIndex(self, show):\n",
+ "        \"\"\"\n",
+ "        materialize ngram list and optionally show it\n",
+ "        \"\"\"\n",
+ "        if self.vecWords is None:\n",
+ "            self.vecWords = list(self.nGramCounter)\n",
+ "        if show:\n",
+ "            for vw in enumerate(self.vecWords):\n",
+ "                print(vw)\n",
+ "\n",
+ "    def getVector(self, words, byCount, normalized):\n",
+ "        \"\"\"\n",
+ "        convert to vector\n",
+ "        \"\"\"\n",
+ "        if self.vecWords is None:\n",
+ "            self.vecWords = list(self.nGramCounter)\n",
+ "\n",
+ "        nGrams = self.toNGram(words)\n",
+ "        if self.verbose:\n",
+ "            print(\"vocabulary size {}\".format(len(self.vecWords)))\n",
+ "            print(\"ngrams\")\n",
+ "            print(nGrams)\n",
+ "        self.nonZeroCount = 0\n",
+ "        vec = list(map(lambda vw: self.getVecElem(vw, nGrams, byCount, normalized), self.vecWords))\n",
+ "        return vec\n",
+ "\n",
+ "    def getVecElem(self, vw, nGrams, byCount, normalized):\n",
+ "        \"\"\"\n",
+ "        get vector element\n",
+ "        \"\"\"\n",
+ "        if vw in nGrams:\n",
+ "            if byCount:\n",
+ "                if normalized:\n",
+ "                    el = self.nGramFreq[vw]\n",
+ "                else:\n",
+ "                    el = self.nGramCounter[vw]\n",
+ "            else:\n",
+ "                el = 1\n",
+ "            self.nonZeroCount += 1\n",
+ "        else:\n",
+ "            if (byCount and normalized):\n",
+ "                el = 0.0\n",
+ "            else:\n",
+ "                el = 0\n",
+ "        return el\n",
+ "\n",
+ "    def getNonZeroCount(self):\n",
+ "        \"\"\"\n",
+ "        get non zero vector element count\n",
+ "        \"\"\"\n",
+ "        return self.nonZeroCount\n",
+ "\n",
+ "    def toBiGram(self, words):\n",
+ "        \"\"\"\n",
+ "        convert to bigram\n",
+ "        \"\"\"\n",
+ "        if self.verbose:\n",
+ "            print (\"doc size \" + str(len(words)))\n",
+ "        biGrams = list()\n",
+ "        for i in range(len(words)-1):\n",
+ "            w1 = words[i]\n",
+ "            w2 = words[i+1]\n",
+ "            if self.vocFilt is None or (w1 in self.vocFilt and w2 in self.vocFilt):\n",
+ "                nGram = (w1, w2)\n",
+ "                biGrams.append(nGram)\n",
+ "        return biGrams\n",
+ "\n",
+ "    def toTriGram(self, words):\n",
+ "        \"\"\"\n",
+ "        convert to trigram\n",
+ "        \"\"\"\n",
+ "        if self.verbose:\n",
+ "            print (\"doc size \" + str(len(words)))\n",
+ "        triGrams = list()\n",
+ "        for i in range(len(words)-2):\n",
+ "            w1 = words[i]\n",
+ "            w2 = words[i+1]\n",
+ "            w3 = words[i+2]\n",
+ "            if self.vocFilt is None or (w1 in self.vocFilt and w2 in self.vocFilt and w3 in self.vocFilt):\n",
+ "                nGram = (w1, w2, w3)\n",
+ "                triGrams.append(nGram)\n",
+ "        return triGrams\n",
+ "\n",
+ "    def save(self, saveFile):\n",
+ "        \"\"\"\n",
+ "        save\n",
+ "        \"\"\"\n",
+ "        sf = open(saveFile, \"wb\")\n",
+ "        pickle.dump(self, sf)\n",
+ "        sf.close()\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def load(saveFile):\n",
+ "        \"\"\"\n",
+ "        load\n",
+ "        \"\"\"\n",
+ "        sf = open(saveFile, \"rb\")\n",
+ "        nGrams = pickle.load(sf)\n",
+ "        sf.close()\n",
+ "        return nGrams\n",
+ "\n",
+ "class CharNGram:\n",
+ "    \"\"\"\n",
+ "    character n gram\n",
+ "    \"\"\"\n",
+ "    def __init__(self, domains, ngsize, verbose=False):\n",
+ "        \"\"\"\n",
+ "        initialize\n",
+ "        \"\"\"\n",
+ "        self.chDomain = list()\n",
+ "        self.ws = \"#\"\n",
+ "        self.chDomain.append(self.ws)\n",
+ "        for d in domains:\n",
+ "            if d == \"lcc\":\n",
+ "                self.chDomain.extend(lcc)\n",
+ "            elif d == \"ucc\":\n",
+ "                self.chDomain.extend(ucc)\n",
+ "            elif d == \"dig\":\n",
+ "                self.chDomain.extend(dig)\n",
+ "            elif d == \"spc\":\n",
+ "                self.chDomain.extend(spc)\n",
+ "            else:\n",
+ "                raise ValueError(\"invalid character type \" + d)\n",
+ "\n",
+ "        self.ngsize = ngsize\n",
+ "        self.radixPow = None\n",
+ "        self.cntVecSize = None\n",
+ "\n",
+ "    def addSpChar(self, spChar):\n",
+ "        \"\"\"\n",
+ "        add special characters\n",
+ "        \"\"\"\n",
+ "        self.chDomain.extend(spChar)\n",
+ "\n",
+ "    def setWsRepl(self, ws):\n",
+ "        \"\"\"\n",
+ "        set white space replacement character\n",
+ "        \"\"\"\n",
+ "        self.ws = ws\n",
+ "        self.chDomain[0] = self.ws\n",
+ "\n",
+ "    def finalize(self):\n",
+ "        \"\"\"\n",
+ "        final setup\n",
+ "        \"\"\"\n",
+ "        domSize = len(self.chDomain)\n",
+ "        self.cntVecSize = int(math.pow(domSize, self.ngsize))\n",
+ "        if self.radixPow is None:\n",
+ "            self.radixPow = list()\n",
+ "            for i in range(self.ngsize-1, 0, -1):\n",
+ "                self.radixPow.append(int(math.pow(domSize, i)))\n",
+ "            self.radixPow.append(1)\n",
+ "\n",
+ "\n",
+ "    def toMgramCount(self, text):\n",
+ "        \"\"\"\n",
+ "        get ngram count list\n",
+ "        \"\"\"\n",
+ "        #print(text)\n",
+ "        ngCounts = [0] * self.cntVecSize\n",
+ "\n",
+ "        ngram = list()\n",
+ "        totNgCount = 0\n",
+ "        for ch in text:\n",
+ "            if ch.isspace():\n",
+ "                l = len(ngram)\n",
+ "                if l == 0 or ngram[l-1] != self.ws:\n",
+ "                    ngram.append(self.ws)\n",
+ "            else:\n",
+ "                ngram.append(ch)\n",
+ "\n",
+ "            if len(ngram) == self.ngsize:\n",
+ "                i = self.__getNgramIndex(ngram)\n",
+ "                assert i < self.cntVecSize, \"ngram index out of range index \" + str(i) + \" size \" + str(self.cntVecSize)\n",
+ "                ngCounts[i] += 1\n",
+ "                ngram.clear()\n",
+ "                totNgCount += 1\n",
+ "\n",
+ "        return ngCounts\n",
+ "\n",
+ "    def __getNgramIndex(self, ngram):\n",
+ "        \"\"\"\n",
+ "        get index of an ngram into a list of size equal to the total number of possible ngrams\n",
+ "        \"\"\"\n",
+ "        assert len(ngram) == len(self.radixPow), \"ngram size mismatch\"\n",
+ "        ngi = 0\n",
+ "        for ch, rp in zip(ngram, self.radixPow):\n",
+ "            i = self.chDomain.index(ch)\n",
+ "            ngi += i * rp\n",
+ "\n",
+ "        return ngi\n",
+ "\n",
+ "\n",
+ "class TfIdf:\n",
+ "    \"\"\"\n",
+ "    TF IDF\n",
+ "    \"\"\"\n",
+ "    def __init__(self, vocFilt, doIdf, verbose=False):\n",
+ "        \"\"\"\n",
+ "        initialize\n",
+ "        \"\"\"\n",
+ "        self.vocFilt = vocFilt\n",
+ "        self.doIdf = doIdf\n",
+ "        self.wordCounter = {}\n",
+ "        self.wordFreq = {}\n",
+ "        self.wordInDocCount = {}\n",
+ "        self.docCount = 0\n",
+ "        self.corpSize = 0\n",
+ "        self.freqDone = False\n",
+ "        self.vocabulary = set()\n",
+ "        self.wordIndex = None\n",
+ "        self.verbose = verbose\n",
+ "        self.vecWords = None\n",
+ "\n",
+ "    def countDocWords(self, words):\n",
+ "        \"\"\"\n",
+ "        count words in a doc\n",
+ "        \"\"\"\n",
+ "        if self.verbose:\n",
+ "            print (\"doc size \" + str(len(words)))\n",
+ "        for word in words:\n",
+ "            if self.vocFilt is None or word in self.vocFilt:\n",
+ "                count = self.wordCounter.get(word, 0)\n",
+ "                self.wordCounter[word] = count + 1\n",
+ "        self.corpSize += len(words)\n",
+ "        self.vocabulary.update(words)\n",
+ "\n",
+ "        if (self.doIdf):\n",
+ "            self.docCount += 1\n",
+ "            for word in set(words):\n",
+ "                count = self.wordInDocCount.get(word, 0)\n",
+ "                self.wordInDocCount[word] = count + 1\n",
+ "        self.freqDone = False\n",
+ "\n",
+ "\n",
+ "    def getWordFreq(self):\n",
+ "        \"\"\"\n",
+ "        get tfidf for corpus\n",
+ "        \"\"\"\n",
+ "        if self.verbose:\n",
+ "            print (\"counter size \" + str(len(self.wordCounter)))\n",
+ "        if not self.freqDone:\n",
+ "            for item in self.wordCounter.items():\n",
+ "                self.wordFreq[item[0]] = float(item[1]) / self.corpSize\n",
+ "            if self.doIdf:\n",
+ "                for k in self.wordFreq.keys():\n",
+ "                    self.wordFreq[k] *= math.log(self.docCount / self.wordInDocCount[k])\n",
+ "            self.freqDone = True\n",
+ "        return self.wordFreq\n",
+ "\n",
+ "    def getCount(self, word):\n",
+ "        \"\"\"\n",
+ "        get count for a word\n",
+ "        \"\"\"\n",
+ "        if word in self.wordCounter:\n",
+ "            count = self.wordCounter[word]\n",
+ "        else:\n",
+ "            raise ValueError(\"word not found in count table \" + word)\n",
+ "        return count\n",
+ "\n",
+ "    def getFreq(self, word):\n",
+ "        \"\"\"\n",
+ "        get normalized frequency\n",
+ "        \"\"\"\n",
+ "        if word in self.wordFreq:\n",
+ "            freq = self.wordFreq[word]\n",
+ "        else:\n",
+ "            raise ValueError(\"word not found in frequency table \" + word)\n",
+ "        return freq\n",
+ "\n",
+ "    def resetCounter(self):\n",
+ "        \"\"\"\n",
+ "        reset counter\n",
+ "        \"\"\"\n",
+ "        self.wordCounter = {}\n",
+ "\n",
+ "    def buildVocabulary(self, words):\n",
+ "        \"\"\"\n",
+ "        build vocabulary\n",
+ "        \"\"\"\n",
+ "        self.vocabulary.update(words)\n",
+ "\n",
+ "    def getVocabulary(self):\n",
+ "        \"\"\"\n",
+ "        return vocabulary\n",
+ "        \"\"\"\n",
+ "        return self.vocabulary\n",
+ "\n",
+ "    def creatWordIndex(self):\n",
+ "        \"\"\"\n",
+ "        index for all words in vocabulary\n",
+ "        \"\"\"\n",
+ "        self.wordIndex = {word : idx for idx, word in enumerate(list(self.vocabulary))}\n",
+ "\n",
+ "    def getVector(self, words, byCount, normalized):\n",
+ "        \"\"\"\n",
+ "        get vector\n",
+ "        \"\"\"\n",
+ "        if self.vecWords is None:\n",
+ "            self.vecWords = list(self.wordCounter)\n",
+ "        vec = list(map(lambda vw: self.getVecElem(vw, words, byCount, normalized), self.vecWords))\n",
+ "        return vec\n",
+ "\n",
+ "    def getVecElem(self, vw, words, byCount, normalized):\n",
+ "        \"\"\"\n",
+ "        vector element\n",
+ "        \"\"\"\n",
+ "        el = 0\n",
+ "        if vw in words:\n",
+ "            if byCount:\n",
+ "                if normalized:\n",
+ "                    el = self.wordFreq[vw]\n",
+ "                else:\n",
+ "                    el = self.wordCounter[vw]\n",
+ "            else:\n",
+ "                el = 1\n",
+ "        return el\n",
+ "\n",
+ "    def save(self, saveFile):\n",
+ "        \"\"\"\n",
+ "        save\n",
+ "        \"\"\"\n",
+ "        sf = open(saveFile, \"wb\")\n",
+ "        pickle.dump(self, sf)\n",
+ "        sf.close()\n",
+ "\n",
+ "    # load\n",
+ "    @staticmethod\n",
+ "    def load(saveFile):\n",
+ "        \"\"\"\n",
+ "        load\n",
+ "        \"\"\"\n",
+ "        sf = open(saveFile, \"rb\")\n",
+ "        tfidf = pickle.load(sf)\n",
+ "        sf.close()\n",
+ "        return tfidf\n",
+ "\n",
+ "# bigram\n",
+ "class BiGram(NGram):\n",
+ "    def __init__(self, vocFilt, verbose=False):\n",
+ "        \"\"\"\n",
+ "        initialize\n",
+ "        \"\"\"\n",
+ "        super(BiGram, self).__init__(vocFilt, verbose)\n",
+ "\n",
+ "    def toNGram(self, words):\n",
+ "        \"\"\"\n",
+ "        convert to ngrams\n",
+ "        \"\"\"\n",
+ "        return self.toBiGram(words)\n",
+ "\n",
+ "# trigram\n",
+ "class TriGram(NGram):\n",
+ "    def __init__(self, vocFilt, verbose=False):\n",
+ "        \"\"\"\n",
+ "        initialize\n",
+ "        \"\"\"\n",
+ "        super(TriGram, self).__init__(vocFilt, verbose)\n",
+ "\n",
+ "    def toNGram(self, words):\n",
+ "        \"\"\"\n",
+ "        convert to ngrams\n",
+ "        \"\"\"\n",
+ "        return self.toTriGram(words)\n",
+ "\n",
+ "\n",
+ "\n",
+ "class DocSentences:\n",
+ "    \"\"\"\n",
+ "    sentence processor\n",
+ "    \"\"\"\n",
+ "    def __init__(self, filePath, minLength, verbose, text=None):\n",
+ "        \"\"\"\n",
+ "        initialize\n",
+ "        \"\"\"\n",
+ "        if filePath:\n",
+ "            self.filePath = filePath\n",
+ "            with open(filePath, 'r') as contentFile:\n",
+ "                content = contentFile.read()\n",
+ "        elif text:\n",
+ "            content = text\n",
+ "        else:\n",
+ "            raise ValueError(\"either file path or text must be provided\")\n",
+ "\n",
+ "        #self.sentences = content.split('.')\n",
+ "        self.verbose = verbose\n",
+ "        tp = TextPreProcessor()\n",
+ "        content = tp.removeNonAsciiFromText(content)\n",
+ "        sentences = sent_tokenize(content)\n",
+ "        self.sentences = list(filter(lambda s: len(nltk.word_tokenize(s)) >= minLength, sentences))\n",
+ "        if self.verbose:\n",
+ "            print (\"num of sentences after length filter \" + str(len(self.sentences)))\n",
+ "        self.sentencesAsTokens = [clean(s, tp, verbose) for s in self.sentences]\n",
+ "\n",
+ "    # get sentence tokens\n",
+ "    def getSentencesAsTokens(self):\n",
+ "        return self.sentencesAsTokens\n",
+ "\n",
+ "    # get sentences\n",
+ "    def getSentences(self):\n",
+ "        return self.sentences\n",
+ "\n",
+ "    # build term freq table\n",
+ "    def getTermFreqTable(self):\n",
+ "        # term count table for all words\n",
+ "        termTable = TfIdf(None, False)\n",
+ "        sentWords = self.getSentencesAsTokens()\n",
+ "        for seWords in sentWords:\n",
+ "            termTable.countDocWords(seWords)\n",
+ "        return termTable\n",
+ "\n",
+ "# word vector container\n",
+ "class WordVectorContainer:\n",
+ "    def __init__(self, dirPath, verbose):\n",
+ "        \"\"\"\n",
+ "        initialize\n",
+ "        \"\"\"\n",
+ "        self.docs = list()\n",
+ "        self.wordVectors = list()\n",
+ "        self.numWordVectors = None\n",
+ "        self.verbose = verbose\n",
+ "        self.tp = TextPreProcessor()\n",
+ "        self.similarityAlgo = \"cosine\"\n",
+ "        self.simAlgoNormalizer = None\n",
+ "        self.termTable = None\n",
+ "        # assumed intent: a dir path passed to the constructor gets loaded\n",
+ "        if dirPath is not None:\n",
+ "            self.addDir(dirPath)\n",
+ "\n",
+ "\n",
+ "    def addDir(self, dirPath):\n",
+ "        \"\"\"\n",
+ "        add content of all files in a directory\n",
+ "        \"\"\"\n",
+ "        docs, filePaths = getFileContent(dirPath, self.verbose)\n",
+ "        self.docs.extend(docs)\n",
+ "        self.wordVectors.extend([clean(doc, self.tp, self.verbose) for doc in docs])\n",
+ "\n",
+ "    def addFile(self, filePath):\n",
+ "        \"\"\"\n",
+ "        add file content\n",
+ "        \"\"\"\n",
+ "        with open(filePath, 'r') as contentFile:\n",
+ "            content = contentFile.read()\n",
+ "        self.wordVectors.append(clean(content, self.tp, self.verbose))\n",
+ "\n",
+ "    def addText(self, text):\n",
+ "        \"\"\"\n",
+ "        add text\n",
+ "        \"\"\"\n",
+ "        self.wordVectors.append(clean(text, self.tp, self.verbose))\n",
+ "\n",
+ "    def addWords(self, words):\n",
+ "        \"\"\"\n",
+ "        add words\n",
+ "        \"\"\"\n",
+ "        self.wordVectors.append(words)\n",
+ "\n",
+ "    def withSimilarityAlgo(self, algo, normalizer=None):\n",
+ "        \"\"\"\n",
+ "        set similarity algo\n",
+ "        \"\"\"\n",
+ "        self.similarityAlgo = algo\n",
+ "        self.simAlgoNormalizer = normalizer\n",
+ "\n",
+ "    def getDocsWords(self):\n",
+ "        \"\"\"\n",
+ "        get word vectors\n",
+ "        \"\"\"\n",
+ "        return self.wordVectors\n",
+ "\n",
+ "    def getDocs(self):\n",
+ "        \"\"\"\n",
+ "        get docs\n",
+ "        \"\"\"\n",
+ "        return self.docs\n",
+ "\n",
+ "    def getTermFreqTable(self):\n",
+ "        \"\"\"\n",
+ "        term count table for all words\n",
+ "        \"\"\"\n",
+ "        self.termTable = TfIdf(None, False)\n",
+ "        for words in self.wordVectors:\n",
+ "            self.termTable.countDocWords(words)\n",
+ "        self.termTable.getWordFreq()\n",
+ "        return self.termTable\n",
+ "\n",
+ "    def getPairWiseSimilarity(self, byCount, normalized):\n",
+ "        \"\"\"\n",
+ "        pair wise similarity\n",
+ "        \"\"\"\n",
+ "        self.getNumWordVectors(byCount, normalized)\n",
+ "\n",
+ "        size = len(self.wordVectors)\n",
+ "        simArray = np.empty(shape=(size,size))\n",
+ "        for i in range(size):\n",
+ "            simArray[i][i] = 1.0\n",
+ "\n",
+ "        for i in range(size):\n",
+ "            for j in range(i+1, size):\n",
+ "                if self.similarityAlgo == \"cosine\":\n",
+ "                    sim = cosineSimilarity(self.numWordVectors[i], self.numWordVectors[j])\n",
+ "                elif self.similarityAlgo == \"jaccard\":\n",
+ "                    sim = jaccardSimilarity(self.wordVectors[i], self.wordVectors[j],\\\n",
+ "                        self.simAlgoNormalizer[0], self.simAlgoNormalizer[1])\n",
+ "                else:\n",
+ "                    raise ValueError(\"invalid similarity algorithm\")\n",
+ "                simArray[i][j] = sim\n",
+ "                simArray[j][i] = sim\n",
+ "        return simArray\n",
+ "\n",
+ "    def getInterSetSimilarity(self, byCount, normalized, split):\n",
+ "        \"\"\"\n",
+ "        inter set pair wise similarity\n",
+ "        \"\"\"\n",
+ "        self.getNumWordVectors(byCount, normalized)\n",
+ "        size = len(self.wordVectors)\n",
+ "        if not self.similarityAlgo == \"jaccard\":\n",
+ "            firstNumVec = self.numWordVectors[:split]\n",
+ "            secNumVec = self.numWordVectors[split:]\n",
+ "            fiSize = len(firstNumVec)\n",
+ "            seSize = len(secNumVec)\n",
+ "        else:\n",
+ "            firstVec = self.wordVectors[:split]\n",
+ "            secVec = self.wordVectors[split:]\n",
+ "            fiSize = len(firstVec)\n",
+ "            seSize = len(secVec)\n",
+ "\n",
+ "        simArray = np.empty(shape=(fiSize,seSize))\n",
+ "        for i in range(fiSize):\n",
+ "            for j in range(seSize):\n",
+ "                if self.similarityAlgo == \"cosine\":\n",
+ "                    sim = cosineSimilarity(firstNumVec[i], secNumVec[j])\n",
+ "                elif self.similarityAlgo == \"jaccard\":\n",
+ "                    sim = jaccardSimilarity(firstVec[i], secVec[j],\\\n",
+ "                        self.simAlgoNormalizer[0], self.simAlgoNormalizer[1])\n",
+ "                else:\n",
+ "                    raise ValueError(\"invalid similarity algorithm\")\n",
+ "                simArray[i][j] = sim\n",
+ "        return simArray\n",
+ "\n",
+ "    def getNumWordVectors(self, byCount, normalized):\n",
+ "        \"\"\"\n",
+ "        build numeric vectors if needed\n",
+ "        \"\"\"\n",
+ "        if not self.similarityAlgo == \"jaccard\":\n",
+ "            if self.numWordVectors is None:\n",
+ "                self.numWordVectors = list(map(lambda wv: self.termTable.getVector(wv, byCount, normalized), self.wordVectors))\n",
+ "\n",
+ "# fragments documents into whole doc, paragraph or passages\n",
+ "class TextFragmentGenerator:\n",
+ "    def __init__(self, level, minParNl, passSize, verbose=False):\n",
+ "        \"\"\"\n",
+ "        initialize\n",
+ "        \"\"\"\n",
+ "        self.level = level\n",
+ "        self.minParNl = minParNl\n",
+ "        self.passSize = passSize\n",
+ "        self.fragments = None\n",
+ "        self.verbose = verbose\n",
+ "\n",
+ "    def loadDocs(self, fpaths):\n",
+ "        \"\"\"\n",
+ "        loads documents from one file, multiple files or all files under a directory\n",
+ "        \"\"\"\n",
+ "        fPaths = fpaths.split(\",\")\n",
+ "        if len(fPaths) == 1:\n",
+ "            if os.path.isfile(fPaths[0]):\n",
+ "                #one file\n",
+ "                if self.verbose:\n",
+ "                    print(\"got one file from path\")\n",
+ "                dnames = fPaths\n",
+ "                docStr = getOneFileContent(fPaths[0])\n",
+ "                dtexts = [docStr]\n",
+ "            else:\n",
+ "                #all files under directory\n",
+ "                if self.verbose:\n",
+ "                    print(\"got all files under directory from path\")\n",
+ "                dtexts, dnames = getFileContent(fPaths[0])\n",
+ "                if self.verbose:\n",
+ "                    print(\"found {} files\".format(len(dtexts)))\n",
+ "        else:\n",
+ "            #list of files\n",
+ "            if self.verbose:\n",
+ "                print(\"got list of files from path\")\n",
+ "            dnames = fPaths\n",
+ "            dtexts = list(map(getOneFileContent, fPaths))\n",
+ "            if self.verbose:\n",
+ "                print(\"found {} files\".format(len(dtexts)))\n",
+ "\n",
+ "        ndocs = (dtexts, dnames)\n",
+ "        if self.verbose:\n",
+ "            print(\"docs\")\n",
+ "            for dn, dt in zip(dnames, dtexts):\n",
+ "                print(dn + \"\\t\" + dt[:40])\n",
+ "\n",
+ "        return ndocs\n",
+ "\n",
+ "    def generateFragmentsFromFiles(self, fpaths):\n",
+ "        \"\"\"\n",
+ "        fragments documents into whole doc, paragraph or passages\n",
+ "        \"\"\"\n",
+ "        dtexts, dnames = self.loadDocs(fpaths)\n",
+ "        return self.generateFragments(dtexts, dnames)\n",
+ "\n",
+ "\n",
+ "    def generateFragmentsFromNamedDocs(self, ndocs):\n",
+ "        \"\"\"\n",
+ "        fragments documents into whole doc, paragraph or passages\n",
+ "        \"\"\"\n",
+ "        dtexts = list(map(lambda nd : nd[1], ndocs))\n",
+ "        dnames = list(map(lambda nd : nd[0], ndocs))\n",
+ "        #for i in range(len(dtexts)):\n",
+ "        #\tprint(dnames[i])\n",
+ "        #\tprint(dtexts[i][:40])\n",
+ "        return self.generateFragments(dtexts, dnames)\n",
+ "\n",
+ "    def generateFragments(self, dtexts, dnames):\n",
+ "        \"\"\"\n",
+ "        fragments documents into whole doc, paragraph or passages\n",
+ "        \"\"\"\n",
+ "        if self.level == \"para\" or self.level == \"passage\":\n",
+ "            #split paras\n",
+ "            dptexts = list()\n",
+ "            dpnames = list()\n",
+ "            for dt, dn in zip(dtexts, dnames):\n",
+ "                paras = getParas(dt, self.minParNl)\n",
+ "                if self.verbose:\n",
+ "                    print(dn)\n",
+ "                    print(\"no of paras {}\".format(len(paras)))\n",
+ "                dptexts.extend(paras)\n",
+ "                pnames = list(map(lambda i : dn + \":\" + str(i), range(len(paras))))\n",
+ "                dpnames.extend(pnames)\n",
+ "            dtexts = dptexts\n",
+ "            dnames = dpnames\n",
+ "\n",
+ "        if self.level == \"passage\":\n",
+ "            #split each para into passages\n",
+ "            dptexts = list()\n",
+ "            dpnames = list()\n",
+ "            for dt, dn in zip(dtexts, dnames):\n",
+ "                sents = sent_tokenize(dt.strip())\n",
+ "                if self.verbose:\n",
+ "                    print(dn)\n",
+ "                    print(\"no of sentences {}\".format(len(sents)))\n",
+ "                span = self.passSize\n",
+ "                if len(sents) > span:\n",
+ "                    for i in range(0, len(sents) - span, 1):\n",
+ "                        dptext = None\n",
+ "                        for j in range(span):\n",
+ "                            if dptext is None:\n",
+ "                                dptext = sents[i + j] + \". \"\n",
+ "                            else:\n",
+ "                                dptext = dptext + sents[i + j] + \". \"\n",
+ "                        dpname = dn + \":\" + str(i)\n",
+ "                        dptexts.append(dptext)\n",
+ "                        dpnames.append(dpname)\n",
+ "\n",
+ "            dtexts = dptexts\n",
+ "            dnames = dpnames\n",
+ "\n",
+ "        self.fragments = list(zip(dnames, dtexts))\n",
+ "        #if self.verbose:\n",
+ "        #\tprint(\"num fragments {}\".format(len(self.fragments)))\n",
+ "        return self.fragments\n",
+ "\n",
+ "    def showFragments(self):\n",
+ "        \"\"\"\n",
+ "        show fragments\n",
+ "        \"\"\"\n",
+ "        print(\"showing all \" + self.level + \" for the first 40 characters\")\n",
+ "        for dn, dt in self.fragments:\n",
+ "            print(dn + \"\\t\" + dt[:40])\n",
+ "\n",
+ "    def isDocLevel(self):\n",
+ "        \"\"\"\n",
+ "        true if fragment is at doc level\n",
+ "        \"\"\"\n",
+ "        return self.level != \"para\" and self.level != \"passage\"\n",
+ "\n",
+ "# clean doc to create term array\n",
+ "def clean(doc, preprocessor, verbose):\n",
+ "    \"\"\"\n",
+ "    text pre process\n",
+ "    \"\"\"\n",
+ "    if verbose:\n",
+ "        print (\"--raw doc\")\n",
+ "        print (doc)\n",
+ "    #print \"next clean\"\n",
+ "    doc = preprocessor.removeNonAsciiFromText(doc)\n",
+ "    words = preprocessor.tokenize(doc)\n",
+ "    words = preprocessor.allow(words)\n",
+ "    words = preprocessor.toLowercase(words)\n",
+ "    words = preprocessor.removeStopwords(words)\n",
+ "    words = preprocessor.removeShortWords(words, 3)\n",
+ "    words = preprocessor.removePunctuation(words)\n",
+ "    words = preprocessor.lemmatizeWords(words)\n",
+ "    #words = preprocessor.removeNonAscii(words)\n",
+ "    if verbose:\n",
+ "        print (\"--after pre processing\")\n",
+ "        print (words)\n",
+ "    return words\n",
+ "\n",
+ "# get sentences\n",
+ "def getSentences(filePath):\n",
+ "    \"\"\"\n",
+ "    get sentences from a file, split on periods\n",
+ "    \"\"\"\n",
+ "    with open(filePath, 'r') as contentFile:\n",
+ "        content = contentFile.read()\n",
+ "    sentences = content.split('.')\n",
+ "    return sentences\n",
+ "\n",
+ "def getParas(text, minParNl=2):\n",
+ "    \"\"\"\n",
+ "    split into paras\n",
+ "    \"\"\"\n",
+ "    regx = \"\\n+\" if minParNl == 1 else \"\\n{2,}\"\n",
+ "    paras = re.split(regx, text.replace(\"\\r\\n\", \"\\n\"))\n",
+ "    return paras\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
lib/util.ipynb ADDED
@@ -0,0 +1,2141 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "21cb09bb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "from random import randint\n",
+ "import random\n",
+ "import time\n",
+ "import uuid\n",
+ "from datetime import datetime\n",
+ "import math\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import logging\n",
+ "import logging.handlers\n",
+ "import pickle\n",
+ "from contextlib import contextmanager\n",
+ "\n",
+ "tokens = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"A\",\"B\",\"C\",\"D\",\"E\",\"F\",\"G\",\"H\",\"I\",\"J\",\"K\",\"L\",\"M\",\n",
+ "    \"N\",\"O\",\"P\",\"Q\",\"R\",\"S\",\"T\",\"U\",\"V\",\"W\",\"X\",\"Y\",\"Z\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\"]\n",
+ "numTokens = tokens[:10]\n",
+ "alphaTokens = tokens[10:36]\n",
+ "loCaseChars = [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\", \"k\",\"l\",\"m\",\"n\",\"o\",\n",
+ "\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]\n",
+ "\n",
+ "typeInt = \"int\"\n",
+ "typeFloat = \"float\"\n",
+ "typeString = \"string\"\n",
+ "\n",
+ "secInMinute = 60\n",
+ "secInHour = 60 * 60\n",
+ "secInDay = 24 * secInHour\n",
+ "secInWeek = 7 * secInDay\n",
+ "secInYear = 365 * secInDay\n",
+ "secInMonth = secInYear / 12\n",
+ "\n",
+ "minInHour = 60\n",
+ "minInDay = 24 * minInHour\n",
+ "\n",
+ "ftPerYard = 3\n",
+ "ftPerMile = ftPerYard * 1760\n",
+ "\n",
+ "\n",
+ "def genID(size):\n",
+ "    \"\"\"\n",
+ "    generates ID\n",
+ "\n",
+ "    Parameters\n",
+ "    size : size of ID\n",
+ "    \"\"\"\n",
+ "    id = \"\"\n",
+ "    for i in range(size):\n",
+ "        id = id + selectRandomFromList(tokens)\n",
+ "    return id\n",
+ "\n",
+ "def genIdList(numId, idSize):\n",
+ "    \"\"\"\n",
+ "    generate list of IDs\n",
+ "\n",
+ "    Parameters:\n",
+ "    numId: number of Ids\n",
+ "    idSize: ID size\n",
+ "    \"\"\"\n",
+ "    iDs = []\n",
+ "    for i in range(numId):\n",
+ "        iDs.append(genID(idSize))\n",
+ "    return iDs\n",
+ "\n",
+ "def genNumID(size):\n",
+ "    \"\"\"\n",
+ "    generates ID consisting of digits only\n",
+ "\n",
+ "    Parameters\n",
+ "    size : size of ID\n",
+ "    \"\"\"\n",
+ "    id = \"\"\n",
+ "    for i in range(size):\n",
+ "        id = id + selectRandomFromList(numTokens)\n",
+ "    return id\n",
+ "\n",
+ "def genLowCaseID(size):\n",
+ "    \"\"\"\n",
+ "    generates ID consisting of lower case chars\n",
+ "\n",
+ "    Parameters\n",
+ "    size : size of ID\n",
+ "    \"\"\"\n",
+ "    id = \"\"\n",
+ "    for i in range(size):\n",
+ "        id = id + selectRandomFromList(loCaseChars)\n",
+ "    return id\n",
+ "\n",
+ "def genNumIdList(numId, idSize):\n",
+ "    \"\"\"\n",
+ "    generate list of numeric IDs\n",
+ "\n",
+ "    Parameters:\n",
+ "    numId: number of Ids\n",
+ "    idSize: ID size\n",
+ "    \"\"\"\n",
+ "    iDs = []\n",
+ "    for i in range(numId):\n",
+ "        iDs.append(genNumID(idSize))\n",
+ "    return iDs\n",
+ "\n",
+ "def genNameInitial():\n",
+ "    \"\"\"\n",
+ "    generate name initial\n",
+ "    \"\"\"\n",
+ "    return selectRandomFromList(alphaTokens) + selectRandomFromList(alphaTokens)\n",
+ "\n",
+ "def genPhoneNum(arCode):\n",
+ "    \"\"\"\n",
+ "    generates phone number\n",
+ "\n",
+ "    Parameters\n",
+ "    arCode: area code\n",
+ "    \"\"\"\n",
+ "    phNum = genNumID(7)\n",
+ "    return arCode + str(phNum)\n",
+ "\n",
+ "def selectRandomFromList(ldata):\n",
+ "    \"\"\"\n",
+ "    select an element randomly from a list\n",
+ "\n",
+ "    Parameters\n",
+ "    ldata : list data\n",
+ "    \"\"\"\n",
+ "    return ldata[randint(0, len(ldata)-1)]\n",
+ "\n",
+ "def selectOtherRandomFromList(ldata, cval):\n",
+ "    \"\"\"\n",
+ "    select an element randomly from a list excluding the given one\n",
+ "\n",
+ "    Parameters\n",
+ "    ldata : list data\n",
+ "    cval : value to be excluded\n",
+ "    \"\"\"\n",
+ "    nval = selectRandomFromList(ldata)\n",
+ "    while nval == cval:\n",
+ "        nval = selectRandomFromList(ldata)\n",
+ "    return nval\n",
+ "\n",
+ "def selectRandomSubListFromList(ldata, num):\n",
+ "    \"\"\"\n",
+ "    generates random sublist from a list without replacement\n",
+ "\n",
+ "    Parameters\n",
+ "    ldata : list data\n",
+ "    num : output list size\n",
+ "    \"\"\"\n",
+ "    assertLesser(num, len(ldata), \"sublist size must be less than main list size\")\n",
+ "    i = randint(0, len(ldata)-1)\n",
+ "    sel = ldata[i]\n",
+ "    selSet = {i}\n",
+ "    selList = [sel]\n",
+ "    while (len(selSet) < num):\n",
+ "        i = randint(0, len(ldata)-1)\n",
+ "        if (i not in selSet):\n",
+ "            sel = ldata[i]\n",
+ "            selSet.add(i)\n",
+ "            selList.append(sel)\n",
+ "    return selList\n",
+ "\n",
+ "def selectRandomSubListFromListWithRepl(ldata, num):\n",
+ "    \"\"\"\n",
+ "    generates random sublist from a list with replacement\n",
+ "\n",
+ "    Parameters\n",
+ "    ldata : list data\n",
+ "    num : output list size\n",
+ "    \"\"\"\n",
+ "    return list(map(lambda i : selectRandomFromList(ldata), range(num)))\n",
+ "\n",
+ "def selectRandomFromDict(ddata):\n",
+ "    \"\"\"\n",
+ "    select an element randomly from a dictionary\n",
+ "\n",
+ "    Parameters\n",
+ "    ddata : dictionary data\n",
+ "    \"\"\"\n",
+ "    dkeys = list(ddata.keys())\n",
+ "    dk = selectRandomFromList(dkeys)\n",
+ "    el = (dk, ddata[dk])\n",
+ "    return el\n",
+ "\n",
+ "def setListRandomFromList(ldata, ldataRepl):\n",
+ "    \"\"\"\n",
+ "    sets some elements in the first list randomly with elements from the second list\n",
+ "\n",
+ "    Parameters\n",
+ "    ldata : list data\n",
+ "    ldataRepl : list with replacement data\n",
+ "    \"\"\"\n",
+ "    l = len(ldata)\n",
+ "    selSet = set()\n",
+ "    for d in ldataRepl:\n",
+ "        i = randint(0, l-1)\n",
+ "        while i in selSet:\n",
+ "            i = randint(0, l-1)\n",
+ "        ldata[i] = d\n",
+ "        selSet.add(i)\n",
+ "\n",
+ "def genIpAddress():\n",
+ "    \"\"\"\n",
+ "    generates IP address\n",
+ "    \"\"\"\n",
+ "    # randint is inclusive on both ends; octets range from 0 to 255\n",
+ "    i1 = randint(0,255)\n",
+ "    i2 = randint(0,255)\n",
+ "    i3 = randint(0,255)\n",
+ "    i4 = randint(0,255)\n",
+ "    ip = \"%d.%d.%d.%d\" %(i1,i2,i3,i4)\n",
+ "    return ip\n",
+ "\n",
+ "def curTimeMs():\n",
+ "    \"\"\"\n",
+ "    current time in ms\n",
+ "    \"\"\"\n",
+ "    return int((datetime.utcnow() - datetime(1970,1,1)).total_seconds() * 1000)\n",
+ "\n",
+ "def secDegPolyFit(x1, y1, x2, y2, x3, y3):\n",
+ "    \"\"\"\n",
+ "    second deg polynomial fit through three points\n",
+ "\n",
+ "    Parameters\n",
+ "    x1 : 1st point x\n",
+ "    y1 : 1st point y\n",
+ "    x2 : 2nd point x\n",
+ "    y2 : 2nd point y\n",
+ "    x3 : 3rd point x\n",
+ "    y3 : 3rd point y\n",
+ "    \"\"\"\n",
+ "    t = (y1 - y2) / (x1 - x2)\n",
+ "    a = t - (y2 - y3) / (x2 - x3)\n",
+ "    a = a / (x1 - x3)\n",
+ "    b = t - a * (x1 + x2)\n",
+ "    c = y1 - a * x1 * x1 - b * x1\n",
+ "    return (a, b, c)\n",
+ "\n",
248
+ "def range_limit(val, minv, maxv):\n",
249
+ " \"\"\"\n",
250
+ " range limit a value\n",
251
+ "\n",
252
+ " Parameters\n",
253
+ " val : data value\n",
254
+ " minv : minimum\n",
255
+ " maxv : maximum\n",
256
+ " \"\"\"\n",
257
+ " if (val < minv):\n",
258
+ " val = minv\n",
259
+ " elif (val > maxv):\n",
260
+ " val = maxv\n",
261
+ " return val\n",
262
+ "\n",
263
+ "def isInRange(val, minv, maxv):\n",
264
+ " \"\"\"\n",
265
+ " checks if within range\n",
266
+ "\n",
267
+ " Parameters\n",
268
+ " val : data value\n",
269
+ " minv : minimum\n",
270
+ " maxv : maximum\n",
271
+ " \"\"\"\n",
272
+ " return val >= minv and val <= maxv\n",
273
+ "\n",
274
+ "def stripFileLines(filePath, offset):\n",
275
+ " \"\"\"\n",
276
+ " strips number of chars from both ends\n",
277
+ "\n",
278
+ " Parameters\n",
279
+ " filePath : file path\n",
280
+ " offset : offset from both ends of line \n",
281
+ " \"\"\"\n",
282
+ " fp = open(filePath, \"r\")\n",
283
+ " for line in fp:\n",
284
+ " stripped = line[offset:len(line) - 1 - offset]\n",
285
+ " print (stripped)\n",
286
+ " fp.close()\n",
287
+ "\n",
288
+ "def genLatLong(lat1, long1, lat2, long2):\n",
289
+ " \"\"\"\n",
290
+ " generate lat log within limits\n",
291
+ "\n",
292
+ " Parameters\n",
293
+ " lat1 : lat of 1st point\n",
294
+ " long1 : long of 1st point\n",
295
+ " lat2 : lat of 2nd point\n",
296
+ " long2 : long of 2nd point\n",
297
+ " \"\"\"\n",
298
+ " lat = lat1 + (lat2 - lat1) * random.random()\n",
299
+ " longg = long1 + (long2 - long1) * random.random()\n",
300
+ " return (lat, longg)\n",
301
+ "\n",
302
+ "def geoDistance(lat1, long1, lat2, long2):\n",
303
+ " \"\"\"\n",
304
+ " find geo distance in ft\n",
305
+ "\n",
306
+ " Parameters\n",
307
+ " lat1 : lat of 1st point\n",
308
+ " long1 : long of 1st point\n",
309
+ " lat2 : lat of 2nd point\n",
310
+ " long2 : long of 2nd point\n",
311
+ " \"\"\"\n",
312
+ " latDiff = math.radians(lat1 - lat2)\n",
313
+ " longDiff = math.radians(long1 - long2)\n",
314
+ " l1 = math.sin(latDiff/2.0)\n",
315
+ " l2 = math.sin(longDiff/2.0)\n",
316
+ " l3 = math.cos(math.radians(lat1))\n",
317
+ " l4 = math.cos(math.radians(lat2))\n",
318
+ " a = l1 * l1 + l3 * l4 * l2 * l2\n",
319
+ " l5 = math.sqrt(a)\n",
320
+ " l6 = math.sqrt(1.0 - a)\n",
321
+ " c = 2.0 * math.atan2(l5, l6)\n",
322
+ " r = 6371008.8 * 3.280840\n",
323
+ " return c * r\n",
324
+ "\n",
325
+ "def minLimit(val, limit):\n",
326
+ " \"\"\"\n",
327
+ " min limit\n",
328
+ " Parameters\n",
329
+ " \"\"\"\n",
330
+ " if (val < limit):\n",
331
+ " val = limit\n",
332
+ " return val;\n",
333
+ "\n",
334
+ "def maxLimit(val, limit):\n",
335
+ " \"\"\"\n",
336
+ " max limit\n",
337
+ " Parameters\n",
338
+ " \"\"\"\n",
339
+ " if (val > limit):\n",
340
+ " val = limit\n",
341
+ " return val;\n",
342
+ "\n",
343
+ "def rangeSample(val, minLim, maxLim):\n",
344
+ " \"\"\"\n",
345
+ " if out side range sample within range\n",
346
+ "\n",
347
+ " Parameters\n",
348
+ " val : value\n",
349
+ " minLim : minimum\n",
350
+ " maxLim : maximum\n",
351
+ " \"\"\"\n",
352
+ " if val < minLim or val > maxLim:\n",
353
+ " val = randint(minLim, maxLim)\n",
354
+ " return val\n",
355
+ "\n",
356
+ "def genRandomIntListWithinRange(size, minLim, maxLim):\n",
357
+ " \"\"\"\n",
358
+ " random unique list of integers within range\n",
359
+ "\n",
360
+ " Parameters\n",
361
+ " size : size of returned list\n",
362
+ " minLim : minimum\n",
363
+ " maxLim : maximum\n",
364
+ " \"\"\"\n",
365
+ " values = set()\n",
366
+ " for i in range(size):\n",
367
+ " val = randint(minLim, maxLim)\n",
368
+ " while val not in values:\n",
369
+ " values.add(val)\n",
370
+ " return list(values)\n",
371
+ "\n",
372
+ "def preturbScalar(value, vrange):\n",
373
+ " \"\"\"\n",
374
+ " preturbs a mutiplicative value within range\n",
375
+ "\n",
376
+ " Parameters\n",
377
+ " value : data value\n",
378
+ " vrange : value delta fraction\n",
379
+ " \"\"\"\n",
380
+ " scale = 1.0 - vrange + 2 * vrange * random.random() \n",
381
+ " return value * scale\n",
382
+ "\n",
383
+ "def preturbScalarAbs(value, vrange):\n",
384
+ " \"\"\"\n",
385
+ " preturbs an absolute value within range\n",
386
+ "\n",
387
+ " Parameters\n",
388
+ " value : data value\n",
389
+ " vrange : value delta absolute\n",
390
+ " \"\"\"\n",
391
+ " delta = - vrange + 2.0 * vrange * random.random() \n",
392
+ " return value + delta\n",
393
+ "\n",
394
+ "def preturbVector(values, vrange):\n",
395
+ " \"\"\"\n",
396
+ " preturbs a list within range\n",
397
+ "\n",
398
+ " Parameters\n",
399
+ " values : list data\n",
400
+ " vrange : value delta fraction\n",
401
+ " \"\"\"\n",
402
+ " nValues = list(map(lambda va: preturbScalar(va, vrange), values))\n",
403
+ " return nValues\n",
404
+ "\n",
405
+ "def randomShiftVector(values, smin, smax):\n",
406
+ " \"\"\"\n",
407
+ " shifts a list by a random quanity with a range\n",
408
+ "\n",
409
+ " Parameters\n",
410
+ " values : list data\n",
411
+ " smin : samplinf minimum\n",
412
+ " smax : sampling maximum\n",
413
+ " \"\"\"\n",
414
+ " shift = np.random.uniform(smin, smax)\n",
415
+ " return list(map(lambda va: va + shift, values))\n",
416
+ "\n",
417
+ "def floatRange(beg, end, incr):\n",
418
+ " \"\"\"\n",
419
+ " generates float range\n",
420
+ "\n",
421
+ " Parameters\n",
422
+ " beg :range begin\n",
423
+ " end: range end\n",
424
+ " incr : range increment\n",
425
+ " \"\"\"\n",
426
+ " return list(np.arange(beg, end, incr))\n",
427
+ "\n",
428
+ "def shuffle(values, *numShuffles):\n",
429
+ " \"\"\"\n",
430
+ " in place shuffling with swap of pairs\n",
431
+ "\n",
432
+ " Parameters\n",
433
+ " values : list data\n",
434
+ " numShuffles : parameter list for number of shuffles\n",
435
+ " \"\"\"\n",
436
+ " size = len(values)\n",
437
+ " if len(numShuffles) == 0:\n",
438
+ " numShuffle = int(size / 2)\n",
439
+ " elif len(numShuffles) == 1:\n",
440
+ " numShuffle = numShuffles[0]\n",
441
+ " else:\n",
442
+ " numShuffle = randint(numShuffles[0], numShuffles[1])\n",
443
+ " print(\"numShuffle {}\".format(numShuffle))\n",
444
+ " for i in range(numShuffle):\n",
445
+ " first = random.randint(0, size - 1)\n",
446
+ " second = random.randint(0, size - 1)\n",
447
+ " while first == second:\n",
448
+ " second = random.randint(0, size - 1)\n",
449
+ " tmp = values[first]\n",
450
+ " values[first] = values[second]\n",
451
+ " values[second] = tmp\n",
452
+ "\n",
453
+ "\n",
454
+ "def splitList(itms, numGr):\n",
455
+ " \"\"\"\n",
456
+ " splits a list into sub lists of approximately equal size, with items in sublists randomly chod=sen\n",
457
+ "\n",
458
+ " Parameters\n",
459
+ " itms ; list of values\t\t\n",
460
+ " numGr : no of groups\n",
461
+ " \"\"\"\n",
462
+ " tcount = len(itms)\n",
463
+ " cItems = list(itms)\n",
464
+ " sz = int(len(cItems) / numGr)\n",
465
+ " groups = list()\n",
466
+ " count = 0\n",
467
+ " for i in range(numGr):\n",
468
+ " if (i == numGr - 1):\n",
469
+ " csz = tcount - count\n",
470
+ " else:\n",
471
+ " csz = sz + randint(-2, 2)\n",
472
+ " count += csz\n",
473
+ " gr = list()\n",
474
+ " for j in range(csz):\n",
475
+ " it = selectRandomFromList(cItems)\n",
476
+ " gr.append(it)\n",
477
+ " cItems.remove(it)\n",
478
+ " groups.append(gr)\n",
479
+ " return groups\n",
480
+ "\n",
481
+ "def multVector(values, vrange):\n",
482
+ " \"\"\"\n",
483
+ " multiplies a list within value range\n",
484
+ "\n",
485
+ " Parameters\n",
486
+ " values : list of values\n",
487
+ " vrange : fraction of vaue to be used to update\n",
488
+ " \"\"\"\n",
489
+ " scale = 1.0 - vrange + 2 * vrange * random.random()\n",
490
+ " nValues = list(map(lambda va: va * scale, values))\n",
491
+ " return nValues\n",
492
+ "\n",
493
+ "def weightedAverage(values, weights):\n",
494
+ " \"\"\"\n",
495
+ " calculates weighted average\n",
496
+ "\n",
497
+ " Parameters\n",
498
+ " values : list of values\n",
499
+ " weights : list of weights\n",
500
+ " \"\"\"\t\t\n",
501
+ " assert len(values) == len(weights), \"values and weights should be same size\"\n",
502
+ " vw = zip(values, weights)\n",
503
+ " wva = list(map(lambda e : e[0] * e[1], vw))\n",
504
+ " #wa = sum(x * y for x, y in vw) / sum(weights)\n",
505
+ " wav = sum(wva) / sum(weights)\n",
506
+ " return wav\n",
507
+ "\n",
508
+ "def extractFields(line, delim, keepIndices):\n",
509
+ " \"\"\"\n",
510
+ " breaks a line into fields and keeps only specified fileds and returns new line\n",
511
+ "\n",
512
+ " Parameters\n",
513
+ " line ; deli separated string\n",
514
+ " delim : delemeter\n",
515
+ " keepIndices : list of indexes to fields to be retained\n",
516
+ " \"\"\"\n",
517
+ " items = line.split(delim)\n",
518
+ " newLine = []\n",
519
+ " for i in keepIndices:\n",
520
+ " newLine.append(line[i])\n",
521
+ " return delim.join(newLine)\n",
522
+ "\n",
523
+ "def remFields(line, delim, remIndices):\n",
524
+ " \"\"\"\n",
525
+ " removes fields from delim separated string\n",
526
+ "\n",
527
+ " Parameters\n",
528
+ " line ; delemeter separated string\n",
529
+ " delim : delemeter\n",
530
+ " remIndices : list of indexes to fields to be removed\n",
531
+ " \"\"\"\n",
532
+ " items = line.split(delim)\n",
533
+ " newLine = []\n",
534
+ " for i in range(len(items)):\n",
535
+ " if not arrayContains(remIndices, i):\n",
536
+ " newLine.append(line[i])\n",
537
+ " return delim.join(newLine)\n",
538
+ "\n",
539
+ "def extractList(data, indices):\n",
540
+ " \"\"\"\n",
541
+ " extracts list from another list, given indices\n",
542
+ "\n",
543
+ " Parameters\n",
544
+ " remIndices : list data\n",
545
+ " indices : list of indexes to fields to be retained\n",
546
+ " \"\"\"\n",
547
+ " if areAllFieldsIncluded(data, indices):\n",
548
+ " exList = data.copy()\n",
549
+ " #print(\"all indices\")\n",
550
+ " else:\n",
551
+ " exList = list()\n",
552
+ " le = len(data)\n",
553
+ " for i in indices:\n",
554
+ " assert i < le , \"index {} out of bound {}\".format(i, le)\n",
555
+ " exList.append(data[i])\n",
556
+ "\n",
557
+ " return exList\n",
558
+ "\n",
559
+ "def arrayContains(arr, item):\n",
560
+ " \"\"\"\n",
561
+ " checks if array contains an item \n",
562
+ "\n",
563
+ " Parameters\n",
564
+ " arr : list data\n",
565
+ " item : item to search\n",
566
+ " \"\"\"\n",
567
+ " contains = True\n",
568
+ " try:\n",
569
+ " arr.index(item)\n",
570
+ " except ValueError:\n",
571
+ " contains = False\n",
572
+ " return contains\n",
573
+ "\n",
574
+ "def strToIntArray(line, delim=\",\"):\n",
575
+ " \"\"\"\n",
576
+ " int array from delim separated string\n",
577
+ "\n",
578
+ " Parameters\n",
579
+ " line ; delemeter separated string\n",
580
+ " \"\"\"\n",
581
+ " arr = line.split(delim)\n",
582
+ " return [int(a) for a in arr]\n",
583
+ "\n",
584
+ "def strToFloatArray(line, delim=\",\"):\n",
585
+ " \"\"\"\n",
586
+ " float array from delim separated string\n",
587
+ "\n",
588
+ " Parameters\n",
589
+ " line ; delemeter separated string\n",
590
+ " \"\"\"\n",
591
+ " arr = line.split(delim)\n",
592
+ " return [float(a) for a in arr]\n",
593
+ "\n",
594
+ "def strListOrRangeToIntArray(line):\n",
595
+ " \"\"\"\n",
596
+ " int array from delim separated string or range\n",
597
+ "\n",
598
+ " Parameters\n",
599
+ " line ; delemeter separated string\n",
600
+ " \"\"\"\n",
601
+ " varr = line.split(\",\")\n",
602
+ " if (len(varr) > 1):\n",
603
+ " iarr = list(map(lambda v: int(v), varr))\n",
604
+ " else:\n",
605
+ " vrange = line.split(\":\")\n",
606
+ " if (len(vrange) == 2):\n",
607
+ " lo = int(vrange[0])\n",
608
+ " hi = int(vrange[1])\n",
609
+ " iarr = list(range(lo, hi+1))\n",
610
+ " else:\n",
611
+ " iarr = [int(line)]\n",
612
+ " return iarr\n",
613
+ "\n",
614
+ "def toStr(val, precision):\n",
615
+ " \"\"\"\n",
616
+ " converts any type to string\t\n",
617
+ "\n",
618
+ " Parameters\n",
619
+ " val : value\n",
620
+ " precision ; precision for float value\n",
621
+ " \"\"\"\n",
622
+ " if type(val) == float or type(val) == np.float64 or type(val) == np.float32:\n",
623
+ " format = \"%\" + \".%df\" %(precision)\n",
624
+ " sVal = format %(val)\n",
625
+ " else:\n",
626
+ " sVal = str(val)\n",
627
+ " return sVal\n",
628
+ "\n",
629
+ "def toStrFromList(values, precision, delim=\",\"):\n",
630
+ " \"\"\"\n",
631
+ " converts list of any type to delim separated string\n",
632
+ "\n",
633
+ " Parameters\n",
634
+ " values : list data\n",
635
+ " precision ; precision for float value\n",
636
+ " delim : delemeter\n",
637
+ " \"\"\"\n",
638
+ " sValues = list(map(lambda v: toStr(v, precision), values))\n",
639
+ " return delim.join(sValues)\n",
640
+ "\n",
641
+ "def toIntList(values):\n",
642
+ " \"\"\"\n",
643
+ " convert to int list\n",
644
+ "\n",
645
+ " Parameters\n",
646
+ " values : list data\n",
647
+ " \"\"\"\n",
648
+ " return list(map(lambda va: int(va), values))\n",
649
+ "\n",
650
+ "def toFloatList(values):\n",
651
+ " \"\"\"\n",
652
+ " convert to float list\n",
653
+ "\n",
654
+ " Parameters\n",
655
+ " values : list data\n",
656
+ " \"\"\"\n",
657
+ " return list(map(lambda va: float(va), values))\n",
658
+ "\n",
659
+ "def toStrList(values, precision=None):\n",
660
+ " \"\"\"\n",
661
+ " convert to string list\n",
662
+ "\n",
663
+ " Parameters\n",
664
+ " values : list data\n",
665
+ " precision ; precision for float value\n",
666
+ " \"\"\"\n",
667
+ " return list(map(lambda va: toStr(va, precision), values))\n",
668
+ "\n",
669
+ "def toIntFromBoolean(value):\n",
670
+ " \"\"\"\n",
671
+ " convert to int\n",
672
+ "\n",
673
+ " Parameters\n",
674
+ " value : boolean value\n",
675
+ " \"\"\"\n",
676
+ " ival = 1 if value else 0\n",
677
+ " return ival\n",
678
+ "\n",
679
+ "def typedValue(val, dtype=None):\n",
680
+ " \"\"\"\n",
681
+ " return typed value given string, discovers data type if not specified\n",
682
+ "\n",
683
+ " Parameters\n",
684
+ " val : value\n",
685
+ " dtype : data type\n",
686
+ " \"\"\"\n",
687
+ " tVal = None\n",
688
+ "\n",
689
+ " if dtype is not None:\n",
690
+ " if dtype == \"num\":\n",
691
+ " dtype = \"int\" if dtype.find(\".\") == -1 else \"float\"\n",
692
+ "\n",
693
+ " if dtype == \"int\":\n",
694
+ " tVal = int(val)\n",
695
+ " elif dtype == \"float\":\n",
696
+ " tVal = float(val)\n",
697
+ " elif dtype == \"bool\":\n",
698
+ " tVal = bool(val)\n",
699
+ " else:\n",
700
+ " tVal = val\n",
701
+ " else:\n",
702
+ " if type(val) == str:\n",
703
+ " lVal = val.lower()\n",
704
+ "\n",
705
+ " #int\n",
706
+ " done = True\n",
707
+ " try:\n",
708
+ " tVal = int(val)\n",
709
+ " except ValueError:\n",
710
+ " done = False\n",
711
+ "\n",
712
+ " #float\n",
713
+ " if not done:\n",
714
+ " done = True\n",
715
+ " try:\n",
716
+ " tVal = float(val)\n",
717
+ " except ValueError:\n",
718
+ " done = False\n",
719
+ "\n",
720
+ " #boolean\n",
721
+ " if not done:\n",
722
+ " done = True\n",
723
+ " if lVal == \"true\":\n",
724
+ " tVal = True\n",
725
+ " elif lVal == \"false\":\n",
726
+ " tVal = False\n",
727
+ " else:\n",
728
+ " done = False\n",
729
+ " #None\t\t\n",
730
+ " if not done:\n",
731
+ " if lVal == \"none\":\n",
732
+ " tVal = None\n",
733
+ " else:\n",
734
+ " tVal = val\n",
735
+ " else:\n",
736
+ " tVal = val\n",
737
+ "\n",
738
+ " return tVal\n",
739
+ "\n",
740
+ "def getAllFiles(dirPath):\n",
741
+ " \"\"\"\n",
742
+ " get all files recursively\n",
743
+ "\n",
744
+ " Parameters\n",
745
+ " dirPath : directory path\n",
746
+ " \"\"\"\n",
747
+ " filePaths = []\n",
748
+ " for (thisDir, subDirs, fileNames) in os.walk(dirPath):\n",
749
+ " for fileName in fileNames:\n",
750
+ " filePaths.append(os.path.join(thisDir, fileName))\n",
751
+ " filePaths.sort()\n",
752
+ " return filePaths\n",
753
+ "\n",
754
+ "def getFileContent(fpath, verbose=False):\n",
755
+ " \"\"\"\n",
756
+ " get file contents in directory\n",
757
+ "\n",
758
+ " Parameters\n",
759
+ " fpath ; directory path\n",
760
+ " verbose : verbosity flag\n",
761
+ " \"\"\"\n",
762
+ " # dcument list\n",
763
+ " docComplete = []\n",
764
+ " filePaths = getAllFiles(fpath)\n",
765
+ "\n",
766
+ " # read files\n",
767
+ " for filePath in filePaths:\n",
768
+ " if verbose:\n",
769
+ " print(\"next file \" + filePath)\n",
770
+ " with open(filePath, 'r') as contentFile:\n",
771
+ " content = contentFile.read()\n",
772
+ " docComplete.append(content)\n",
773
+ " return (docComplete, filePaths)\n",
774
+ "\n",
775
+ "def getOneFileContent(fpath):\n",
776
+ " \"\"\"\n",
777
+ " get one file contents\n",
778
+ "\n",
779
+ " Parameters\n",
780
+ " fpath : file path\n",
781
+ " \"\"\"\n",
782
+ " with open(fpath, 'r') as contentFile:\n",
783
+ " docStr = contentFile.read()\n",
784
+ " return docStr\n",
785
+ "\n",
786
+ "def getFileLines(dirPath, delim=\",\"):\n",
787
+ " \"\"\"\n",
788
+ " get lines from a file\n",
789
+ "\n",
790
+ " Parameters\n",
791
+ " dirPath : file path\n",
792
+ " delim : delemeter\n",
793
+ " \"\"\"\n",
794
+ " lines = list()\n",
795
+ " for li in fileRecGen(dirPath, delim):\n",
796
+ " lines.append(li)\n",
797
+ " return lines\n",
798
+ "\n",
799
+ "def getFileSampleLines(dirPath, percen, delim=\",\"):\n",
800
+ " \"\"\"\n",
801
+ " get sampled lines from a file\n",
802
+ "\n",
803
+ " Parameters\n",
804
+ " dirPath : file path\n",
805
+ " percen : sampling percentage\n",
806
+ " delim : delemeter\n",
807
+ " \"\"\"\n",
808
+ " lines = list()\n",
809
+ " for li in fileRecGen(dirPath, delim):\n",
810
+ " if randint(0, 100) < percen:\n",
811
+ " lines.append(li)\n",
812
+ " return lines\n",
813
+ "\n",
814
+ "def getFileColumnAsString(dirPath, index, delim=\",\"):\n",
815
+ " \"\"\"\n",
816
+ " get string column from a file\n",
817
+ "\n",
818
+ " Parameters\n",
819
+ " dirPath : file path\n",
820
+ " index : index\n",
821
+ " delim : delemeter\n",
822
+ " \"\"\"\n",
823
+ " fields = list()\n",
824
+ " for rec in fileRecGen(dirPath, delim):\n",
825
+ " fields.append(rec[index])\n",
826
+ " #print(fields)\t\n",
827
+ " return fields\n",
828
+ "\n",
829
+ "def getFileColumnsAsString(dirPath, indexes, delim=\",\"):\n",
830
+ " \"\"\"\n",
831
+ " get multiple string columns from a file\n",
832
+ "\n",
833
+ " Parameters\n",
834
+ " dirPath : file path\n",
835
+ " indexes : indexes of columns\n",
836
+ " delim : delemeter\n",
837
+ " \"\"\"\n",
838
+ " nindex = len(indexes)\n",
839
+ " columns = list(map(lambda i : list(), range(nindex)))\n",
840
+ " for rec in fileRecGen(dirPath, delim):\n",
841
+ " for i in range(nindex):\n",
842
+ " columns[i].append(rec[indexes[i]])\n",
843
+ " return columns\n",
844
+ "\n",
845
+ "def getFileColumnAsFloat(dirPath, index, delim=\",\"):\n",
846
+ " \"\"\"\n",
847
+ " get float fileds from a file\n",
848
+ "\n",
849
+ " Parameters\n",
850
+ " dirPath : file path\n",
851
+ " index : index\n",
852
+ " delim : delemeter\n",
853
+ " \"\"\"\n",
854
+ " #print(\"{} {}\".format(dirPath, index))\n",
855
+ " fields = getFileColumnAsString(dirPath, index, delim)\n",
856
+ " return list(map(lambda v:float(v), fields))\n",
857
+ "\n",
858
+ "def getFileColumnAsInt(dirPath, index, delim=\",\"):\n",
859
+ " \"\"\"\n",
860
+ " get float fileds from a file\n",
861
+ "\n",
862
+ " Parameters\n",
863
+ " dirPath : file path\n",
864
+ " index : index\n",
865
+ " delim : delemeter\n",
866
+ " \"\"\"\n",
867
+ " fields = getFileColumnAsString(dirPath, index, delim)\n",
868
+ " return list(map(lambda v:int(v), fields))\n",
869
+ "\n",
870
+ "def getFileAsIntMatrix(dirPath, columns, delim=\",\"):\n",
871
+ " \"\"\"\n",
872
+ " extracts int matrix from csv file given column indices with each row being concatenation of \n",
873
+ " extracted column values row size = num of columns\n",
874
+ "\n",
875
+ " Parameters\n",
876
+ " dirPath : file path\n",
877
+ " columns : indexes of columns\n",
878
+ " delim : delemeter\n",
879
+ " \"\"\"\n",
880
+ " mat = list()\n",
881
+ " for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
882
+ " mat.append(asIntList(rec))\n",
883
+ " return mat\n",
884
+ "\n",
885
+ "def getFileAsFloatMatrix(dirPath, columns, delim=\",\"):\n",
886
+ " \"\"\"\n",
887
+ " extracts float matrix from csv file given column indices with each row being concatenation of \n",
888
+ " extracted column values row size = num of columns\n",
889
+ " Parameters\n",
890
+ " dirPath : file path\n",
891
+ " columns : indexes of columns\n",
892
+ " delim : delemeter\n",
893
+ " \"\"\"\n",
894
+ " mat = list()\n",
895
+ " for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
896
+ " mat.append(asFloatList(rec))\n",
897
+ " return mat\n",
898
+ "\n",
899
+ "def getFileAsFloatColumn(dirPath):\n",
900
+ " \"\"\"\n",
901
+ " grt float list from a file with one float per row\n",
902
+ " Parameters\n",
903
+ " dirPath : file path\n",
904
+ " \"\"\"\n",
905
+ " flist = list()\n",
906
+ " for rec in fileRecGen(dirPath, None):\n",
907
+ " flist.append(float(rec))\n",
908
+ " return flist\n",
909
+ "\n",
910
+ "def getFileAsFiltFloatMatrix(dirPath, filt, columns, delim=\",\"):\n",
911
+ " \"\"\"\n",
912
+ " extracts float matrix from csv file given row filter and column indices with each row being \n",
913
+ " concatenation of extracted column values row size = num of columns\n",
914
+ " Parameters\n",
915
+ " dirPath : file path\n",
916
+ " columns : indexes of columns\n",
917
+ " filt : row filter lambda\n",
918
+ " delim : delemeter\n",
919
+ " \"\"\"\n",
920
+ " mat = list()\n",
921
+ " for rec in fileFiltSelFieldsRecGen(dirPath, filt, columns, delim):\n",
922
+ " mat.append(asFloatList(rec))\n",
923
+ " return mat\n",
924
+ "\n",
925
+ "def getFileAsTypedRecords(dirPath, types, delim=\",\"):\n",
926
+ " \"\"\"\n",
927
+ " extracts typed records from csv file with each row being concatenation of \n",
928
+ " extracted column values \n",
929
+ " Parameters\n",
930
+ " dirPath : file path\n",
931
+ " types : data types\n",
932
+ " delim : delemeter\n",
933
+ " \"\"\"\n",
934
+ " (dtypes, cvalues) = extractTypesFromString(types)\n",
935
+ " tdata = list()\n",
936
+ " for rec in fileRecGen(dirPath, delim):\n",
937
+ " trec = list()\n",
938
+ " for index, value in enumerate(rec):\n",
939
+ " value = __convToTyped(index, value, dtypes)\n",
940
+ " trec.append(value)\n",
941
+ " tdata.append(trec)\n",
942
+ " return tdata\n",
943
+ "\n",
944
+ "\n",
945
+ "def getFileColsAsTypedRecords(dirPath, columns, types, delim=\",\"):\n",
946
+ " \"\"\"\n",
947
+ " extracts typed records from csv file given column indices with each row being concatenation of \n",
948
+ " extracted column values \n",
949
+ " Parameters\n",
950
+ " Parameters\n",
951
+ " dirPath : file path\n",
952
+ " columns : column indexes\n",
953
+ " types : data types\n",
954
+ " delim : delemeter\n",
955
+ " \"\"\"\n",
956
+ " (dtypes, cvalues) = extractTypesFromString(types)\n",
957
+ " tdata = list()\n",
958
+ " for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
959
+ " trec = list()\n",
960
+ " for indx, value in enumerate(rec):\n",
961
+ " tindx = columns[indx]\n",
962
+ " value = __convToTyped(tindx, value, dtypes)\n",
963
+ " trec.append(value)\n",
964
+ " tdata.append(trec)\n",
965
+ " return tdata\n",
966
+ "\n",
967
+ "def getFileColumnsMinMax(dirPath, columns, dtype, delim=\",\"):\n",
968
+ " \"\"\"\n",
969
+ " extracts numeric matrix from csv file given column indices. For each column return min and max\n",
970
+ " Parameters\n",
971
+ " dirPath : file path\n",
972
+ " columns : column indexes\n",
973
+ " dtype : data type\n",
974
+ " delim : delemeter\n",
975
+ " \"\"\"\n",
976
+ " dtypes = list(map(lambda c : str(c) + \":\" + dtype, columns))\n",
977
+ " dtypes = \",\".join(dtypes)\n",
978
+ " #print(dtypes)\n",
979
+ "\n",
980
+ " tdata = getFileColsAsTypedRecords(dirPath, columns, dtypes, delim)\n",
981
+ " minMax = list()\n",
982
+ " ncola = len(tdata[0])\n",
983
+ " ncole = len(columns)\n",
984
+ " assertEqual(ncola, ncole, \"actual no of columns different from expected\")\n",
985
+ "\n",
986
+ " for ci in range(ncole):\t\n",
987
+ " vmin = sys.float_info.max\n",
988
+ " vmax = sys.float_info.min\n",
989
+ " for r in tdata:\n",
990
+ " cv = r[ci]\n",
991
+ " vmin = cv if cv < vmin else vmin\n",
992
+ " vmax = cv if cv > vmax else vmax\n",
993
+ " mm = (vmin, vmax, vmax - vmin)\n",
994
+ " minMax.append(mm)\n",
995
+ "\n",
996
+ " return minMax\n",
997
+ "\n",
998
+ "\n",
999
+ "def getRecAsTypedRecord(rec, types, delim=None):\n",
1000
+ " \"\"\"\n",
1001
+ " converts record to typed records \n",
1002
+ " Parameters\n",
1003
+ " rec : delemeter separate string or list of string\n",
1004
+ " types : field data types\n",
1005
+ " delim : delemeter\n",
1006
+ " \"\"\"\t\n",
1007
+ " if delim is not None:\n",
1008
+ " rec = rec.split(delim)\n",
1009
+ " (dtypes, cvalues) = extractTypesFromString(types)\n",
1010
+ " #print(types)\n",
1011
+ " #print(dtypes)\n",
1012
+ " trec = list()\n",
1013
+ " for ind, value in enumerate(rec):\n",
1014
+ " tvalue = __convToTyped(ind, value, dtypes)\n",
1015
+ " trec.append(tvalue)\n",
1016
+ " return trec\n",
1017
+ "\n",
1018
+ "def __convToTyped(index, value, dtypes):\n",
1019
+ " \"\"\"\n",
1020
+ " convert to typed value \n",
1021
+ " Parameters\n",
1022
+ " index : index in type list\n",
1023
+ " value : data value\n",
1024
+ " dtypes : data type list\n",
1025
+ " \"\"\"\n",
1026
+ " #print(index, value)\n",
1027
+ " dtype = dtypes[index]\n",
1028
+ " tvalue = value\n",
1029
+ " if dtype == \"int\":\n",
1030
+ " tvalue = int(value)\n",
1031
+ " elif dtype == \"float\":\n",
1032
+ " tvalue = float(value)\n",
1033
+ " return tvalue\n",
1034
+ "\n",
1035
+ "\n",
1036
+ "\n",
1037
+ "def extractTypesFromString(types):\n",
1038
+ " \"\"\"\n",
1039
+ " extracts column data types and set values for categorical variables \n",
1040
+ " Parameters\n",
1041
+ " types : encoded type information\n",
1042
+ " \"\"\"\n",
1043
+ " ftypes = types.split(\",\")\n",
1044
+ " dtypes = dict()\n",
1045
+ " cvalues = dict()\n",
1046
+ " for ftype in ftypes:\n",
1047
+ " items = ftype.split(\":\") \n",
1048
+ " cindex = int(items[0])\n",
1049
+ " dtype = items[1]\n",
1050
+ " dtypes[cindex] = dtype\n",
1051
+ " if len(items) == 3:\n",
1052
+ " sitems = items[2].split()\n",
1053
+ " cvalues[cindex] = sitems\n",
1054
+ " return (dtypes, cvalues)\n",
1055
+ "\n",
1056
+ "def getMultipleFileAsInttMatrix(dirPathWithCol, delim=\",\"):\n",
1057
+ " \"\"\"\n",
1058
+ " extracts int matrix from from csv files given column index for each file. \n",
1059
+ " num of columns = number of rows in each file and num of rows = number of files\n",
1060
+ " Parameters\n",
1061
+ " dirPathWithCol: list of file path and collumn index pair\n",
1062
+ " delim : delemeter\n",
1063
+ " \"\"\"\n",
1064
+ " mat = list()\n",
1065
+ " minLen = -1\n",
1066
+ " for path, col in dirPathWithCol:\n",
1067
+ " colVals = getFileColumnAsInt(path, col, delim)\n",
1068
+ " if minLen < 0 or len(colVals) < minLen:\n",
1069
+ " minLen = len(colVals)\n",
1070
+ " mat.append(colVals)\n",
1071
+ "\n",
1072
+ " #make all same length\n",
1073
+ " mat = list(map(lambda li:li[:minLen], mat))\n",
1074
+ " return mat\n",
1075
+ "\n",
1076
+ "def getMultipleFileAsFloatMatrix(dirPathWithCol, delim=\",\"):\n",
1077
+ " \"\"\"\n",
1078
+ " extracts float matrix from from csv files given column index for each file. \n",
1079
+ " num of columns = number of rows in each file and num of rows = number of files\n",
1080
+ " Parameters\n",
1081
+ " dirPathWithCol: list of file path and collumn index pair\n",
1082
+ " delim : delemeter\n",
1083
+ " \"\"\"\n",
1084
+ " mat = list()\n",
1085
+ " minLen = -1\n",
1086
+ " for path, col in dirPathWithCol:\n",
1087
+ " colVals = getFileColumnAsFloat(path, col, delim)\n",
1088
+ " if minLen < 0 or len(colVals) < minLen:\n",
1089
+ " minLen = len(colVals)\n",
1090
+ " mat.append(colVals)\n",
1091
+ "\n",
1092
+ " #make all same length\n",
1093
+ " mat = list(map(lambda li:li[:minLen], mat))\n",
1094
+ " return mat\n",
1095
+ "\n",
1096
+ "def writeStrListToFile(ldata, filePath, delem=\",\"):\n",
1097
+ " \"\"\"\n",
1098
+ " writes list of dlem separated string or list of list of string to afile\n",
1099
+ "\n",
1100
+ " Parameters\n",
1101
+ " ldata : list data\n",
1102
+ " filePath : file path\n",
1103
+ " delim : delemeter\n",
1104
+ " \"\"\"\n",
1105
+ " with open(filePath, \"w\") as fh:\n",
1106
+ " for r in ldata:\n",
1107
+ " if type(r) == list:\n",
1108
+ " r = delem.join(r)\n",
1109
+ " fh.write(r + \"\\n\")\n",
1110
+ "\n",
1111
+ "def writeFloatListToFile(ldata, prec, filePath):\n",
1112
+ " \"\"\"\n",
1113
+ " writes float list to file, one value per line\n",
1114
+ "\n",
1115
+ " Parameters\n",
1116
+ " ldata : list data\n",
1117
+ " prec : precision\n",
1118
+ " filePath : file path\n",
1119
+ " \"\"\"\n",
1120
+ " with open(filePath, \"w\") as fh:\n",
1121
+ " for d in ldata:\n",
1122
+ " fh.write(formatFloat(prec, d) + \"\\n\")\n",
1123
+ "\n",
1124
+ "\n",
1125
+ "def takeFirst(elems):\n",
1126
+ " \"\"\"\n",
1127
+ " return fisrt item\n",
1128
+ " Parameters\n",
1129
+ " elems : list of data \n",
1130
+ " \"\"\"\n",
1131
+ " return elems[0]\n",
1132
+ "\n",
1133
+ "def takeSecond(elems):\n",
1134
+ " \"\"\"\n",
1135
+ " return 2nd element\n",
1136
+ " Parameters\n",
1137
+ " elems : list of data \n",
1138
+ " \"\"\"\n",
1139
+ " return elems[1]\n",
1140
+ "\n",
1141
+ "def takeThird(elems):\n",
1142
+ " \"\"\"\n",
1143
+ " returns 3rd element\n",
1144
+ " Parameters\n",
1145
+ " elems : list of data \n",
1146
+ " \"\"\"\n",
1147
+ " return elems[2]\n",
1148
+ "\n",
1149
+ "def addToKeyedCounter(dCounter, key, count=1):\n",
1150
+ " \"\"\"\n",
1151
+ " add to to keyed counter\n",
1152
+ " Parameters\n",
1153
+ " dCounter : dictionary of counters\n",
1154
+ " key : dictionary key\n",
1155
+ " count : count to add\n",
1156
+ " \"\"\"\n",
1157
+ " curCount = dCounter.get(key, 0)\n",
1158
+ " dCounter[key] = curCount + count\n",
1159
+ "\n",
1160
+ "def incrKeyedCounter(dCounter, key):\n",
1161
+ " \"\"\"\n",
1162
+ " increment keyed counter\n",
1163
+ " Parameters\n",
1164
+ " dCounter : dictionary of counters\n",
1165
+ " key : dictionary key\n",
1166
+ " \"\"\"\n",
1167
+ " addToKeyedCounter(dCounter, key, 1)\n",
1168
+ "\n",
1169
+ "def appendKeyedList(dList, key, elem):\n",
1170
+ " \"\"\"\n",
1171
+ " keyed list\n",
1172
+ " Parameters\n",
1173
+ " dList : dictionary of lists\n",
1174
+ " key : dictionary key\n",
1175
+ " elem : value to append\n",
1176
+ " \"\"\"\n",
1177
+ " curList = dList.get(key, [])\n",
1178
+ " curList.append(elem)\n",
1179
+ " dList[key] = curList\n",
1180
+ "\n",
1181
+ "def isNumber(st):\n",
1182
+ " \"\"\"\n",
1183
+ " Returns True is string is a number\n",
1184
+ " Parameters\n",
1185
+ " st : string value\n",
1186
+ " \"\"\"\n",
1187
+ " return st.replace('.','',1).isdigit()\n",
1188
+ "\n",
1189
+ "def removeNan(values):\n",
1190
+ " \"\"\"\n",
1191
+ " removes nan from list\n",
1192
+ " Parameters\n",
1193
+ " values : list data\n",
1194
+ " \"\"\"\n",
1195
+ " return list(filter(lambda v: not math.isnan(v), values))\n",
1196
+ "\n",
1197
+ "def fileRecGen(filePath, delim = \",\"):\n",
1198
+ " \"\"\"\n",
1199
+ " file record generator\n",
1200
+ " Parameters\n",
1201
+ " filePath ; file path\n",
1202
+ " delim : delemeter\n",
1203
+ " \"\"\"\n",
1204
+ " with open(filePath, \"r\") as fp:\n",
1205
+ " for line in fp:\t\n",
1206
+ " line = line[:-1]\n",
1207
+ " if delim is not None:\n",
1208
+ " line = line.split(delim)\n",
1209
+ " yield line\n",
1210
+ "\n",
1211
+ "def fileSelFieldsRecGen(dirPath, columns, delim=\",\"):\n",
1212
+ " \"\"\"\n",
1213
+ " file record generator given column indices \n",
1214
+ " Parameters\n",
1215
+ " filePath ; file path\n",
1216
+ " columns : column indexes as int array or coma separated string\n",
1217
+ " delim : delemeter\n",
1218
+ " \"\"\"\n",
1219
+ " if type(columns) == str:\n",
1220
+ " columns = strToIntArray(columns, delim)\n",
1221
+ " for rec in fileRecGen(dirPath, delim):\n",
1222
+ " extracted = extractList(rec, columns)\n",
1223
+ " yield extracted\n",
1224
+ "\n",
1225
+ "def fileFiltRecGen(filePath, filt, delim = \",\"):\n",
1226
+ " \"\"\"\n",
1227
+ " file record generator with row filter applied\n",
1228
+ " Parameters\n",
1229
+ " filePath ; file path\n",
1230
+ " filt : row filter\n",
1231
+ " delim : delemeter\n",
1232
+ " \"\"\"\n",
1233
+ " with open(filePath, \"r\") as fp:\n",
1234
+ " for line in fp:\t\n",
1235
+ " line = line[:-1]\n",
1236
+ " if delim is not None:\n",
1237
+ " line = line.split(delim)\n",
1238
+ " if filt(line):\n",
1239
+ " yield line\n",
1240
+ "\n",
1241
+ "def fileFiltSelFieldsRecGen(filePath, filt, columns, delim = \",\"):\n",
1242
+ " \"\"\"\n",
1243
+ " file record generator with row and column filter applied\n",
1244
+ " Parameters\n",
1245
+ " filePath ; file path\n",
1246
+ " filt : row filter\n",
1247
+ " columns : column indexes as int array or coma separated string\n",
1248
+ " delim : delemeter\n",
1249
+ " \"\"\"\n",
1250
+ " columns = strToIntArray(columns, delim)\n",
1251
+ " with open(filePath, \"r\") as fp:\n",
1252
+ " for line in fp:\t\n",
1253
+ " line = line[:-1]\n",
1254
+ " if delim is not None:\n",
1255
+ " line = line.split(delim)\n",
1256
+ " if filt(line):\n",
1257
+ " selected = extractList(line, columns)\n",
1258
+ " yield selected\n",
1259
+ "\n",
1260
+ "def fileTypedRecGen(filePath, ftypes, delim = \",\"):\n",
1261
+ " \"\"\"\n",
1262
+ " file typed record generator\n",
1263
+ " Parameters\n",
1264
+ " filePath ; file path\n",
1265
+ " ftypes : list of field types\n",
1266
+ " delim : delemeter\n",
1267
+ " \"\"\"\n",
1268
+ " with open(filePath, \"r\") as fp:\n",
1269
+ " for line in fp:\t\n",
1270
+ " line = line[:-1]\n",
1271
+ " line = line.split(delim)\n",
1272
+ " for i in range(0, len(ftypes), 2):\n",
1273
+ " ci = ftypes[i]\n",
1274
+ " dtype = ftypes[i+1]\n",
1275
+ " assertLesser(ci, len(line), \"index out of bound\")\n",
1276
+ " if dtype == \"int\":\n",
1277
+ " line[ci] = int(line[ci])\n",
1278
+ " elif dtype == \"float\":\n",
1279
+ " line[ci] = float(line[ci])\n",
1280
+ " else:\n",
1281
+ " exitWithMsg(\"invalid data type\")\n",
1282
+ " yield line\n",
1283
+ "\n",
1284
+ "def fileMutatedFieldsRecGen(dirPath, mutator, delim=\",\"):\n",
1285
+ " \"\"\"\n",
1286
+ " file record generator with some columns mutated \n",
1287
+ " Parameters\n",
1288
+ " dirPath ; file path\n",
1289
+ " mutator : row field mutator\n",
1290
+ " delim : delemeter\n",
1291
+ " \"\"\"\n",
1292
+ " for rec in fileRecGen(dirPath, delim):\n",
1293
+ " mutated = mutator(rec)\n",
1294
+ " yield mutated\n",
1295
+ "\n",
1296
+ "def tableSelFieldsFilter(tdata, columns):\n",
1297
+ " \"\"\"\n",
1298
+ " gets tabular data for selected columns \n",
1299
+ " Parameters\n",
1300
+ " tdata : tabular data\n",
1301
+ " columns : column indexes\n",
1302
+ " \"\"\"\n",
1303
+ " if areAllFieldsIncluded(tdata[0], columns):\n",
1304
+ " ntdata = tdata\n",
1305
+ " else:\n",
1306
+ " ntdata = list()\n",
1307
+ " for rec in tdata:\n",
1308
+ " #print(rec)\n",
1309
+ " #print(columns)\n",
1310
+ " nrec = extractList(rec, columns)\n",
1311
+ " ntdata.append(nrec)\n",
1312
+ " return ntdata\n",
1313
+ "\n",
1314
+ "\n",
1315
+ "def areAllFieldsIncluded(ldata, columns):\n",
1316
+ " \"\"\"\n",
1317
+ " return True id all indexes are in the columns\n",
1318
+ " Parameters\n",
1319
+ " ldata : list data\n",
1320
+ " columns : column indexes\n",
1321
+ " \"\"\"\n",
1322
+ " return list(range(len(ldata))) == columns\n",
1323
+ "\n",
1324
+ "def asIntList(items):\n",
1325
+ " \"\"\"\n",
1326
+ " returns int list\n",
1327
+ " Parameters\n",
1328
+ " items : list data\n",
1329
+ " \"\"\"\n",
1330
+ " return [int(i) for i in items]\n",
1331
+ "\n",
1332
+ "def asFloatList(items):\n",
1333
+ " \"\"\"\n",
1334
+ " returns float list\n",
1335
+ " Parameters\n",
1336
+ " items : list data\n",
1337
+ " \"\"\"\n",
1338
+ " return [float(i) for i in items]\n",
1339
+ "\n",
1340
+ "def pastTime(interval, unit):\n",
1341
+ " \"\"\"\n",
1342
+ " current and past time\n",
1343
+ " Parameters\n",
1344
+ " interval : time interval\n",
1345
+ " unit: time unit\n",
1346
+ " \"\"\"\n",
1347
+ " curTime = int(time.time())\n",
1348
+ " if unit == \"d\":\n",
1349
+ " pastTime = curTime - interval * secInDay\n",
1350
+ " elif unit == \"h\":\n",
1351
+ " pastTime = curTime - interval * secInHour\n",
1352
+ " elif unit == \"m\":\n",
1353
+ " pastTime = curTime - interval * secInMinute\n",
1354
+ " else:\n",
1355
+ " raise ValueError(\"invalid time unit \" + unit)\n",
1356
+ " return (curTime, pastTime)\n",
1357
+ "\n",
1358
+ "def minuteAlign(ts):\n",
1359
+ " \"\"\"\n",
1360
+ " minute aligned time\t\n",
1361
+ " Parameters\n",
1362
+ " ts : time stamp in sec\n",
1363
+ " \"\"\"\n",
1364
+ " return int((ts / secInMinute)) * secInMinute\n",
1365
+ "\n",
1366
+ "def multMinuteAlign(ts, min):\n",
1367
+ " \"\"\"\n",
1368
+ " multi minute aligned time\t\n",
1369
+ " Parameters\n",
1370
+ " ts : time stamp in sec\n",
1371
+ " min : minute value\n",
1372
+ " \"\"\"\n",
1373
+ " intv = secInMinute * min\n",
1374
+ " return int((ts / intv)) * intv\n",
1375
+ "\n",
1376
+ "def hourAlign(ts):\n",
1377
+ " \"\"\"\n",
1378
+ " hour aligned time\n",
1379
+ " Parameters\n",
1380
+ " ts : time stamp in sec\n",
1381
+ " \"\"\"\n",
1382
+ " return int((ts / secInHour)) * secInHour\n",
1383
+ "\n",
1384
+ "def hourOfDayAlign(ts, hour):\n",
1385
+ " \"\"\"\n",
1386
+ " hour of day aligned time\n",
1387
+ " Parameters\n",
1388
+ " ts : time stamp in sec\n",
1389
+ " hour : hour of day\n",
1390
+ " \"\"\"\n",
1391
+ " day = int(ts / secInDay)\n",
1392
+ " return (24 * day + hour) * secInHour\n",
1393
+ "\n",
1394
+ "def dayAlign(ts):\n",
1395
+ " \"\"\"\n",
1396
+ " day aligned time\n",
1397
+ " Parameters\n",
1398
+ " ts : time stamp in sec\n",
1399
+ " \"\"\"\n",
1400
+ " return int(ts / secInDay) * secInDay\n",
1401
+ "\n",
1402
+ "def timeAlign(ts, unit):\n",
1403
+ " \"\"\"\n",
1404
+ " boundary alignment of time\n",
1405
+ " Parameters\n",
1406
+ " ts : time stamp in sec\n",
1407
+ " unit : unit of time\n",
1408
+ " \"\"\"\n",
1409
+ " alignedTs = 0\n",
1410
+ " if unit == \"s\":\n",
1411
+ " alignedTs = ts\n",
1412
+ " elif unit == \"m\":\n",
1413
+ " alignedTs = minuteAlign(ts)\n",
1414
+ " elif unit == \"h\":\n",
1415
+ " alignedTs = hourAlign(ts)\n",
1416
+ " elif unit == \"d\":\n",
1417
+ " alignedTs = dayAlign(ts)\n",
1418
+ " else:\n",
1419
+ " raise ValueError(\"invalid time unit\")\n",
1420
+ " return alignedTs\n",
1421
+ "\n",
1422
+ "def monthOfYear(ts):\n",
1423
+ " \"\"\"\n",
1424
+ " month of year\n",
1425
+ " Parameters\n",
1426
+ " ts : time stamp in sec\n",
1427
+ " \"\"\"\n",
1428
+ " rem = ts % secInYear\n",
1429
+ " dow = int(rem / secInMonth)\n",
1430
+ " return dow\n",
1431
+ "\n",
1432
+ "def dayOfWeek(ts):\n",
1433
+ " \"\"\"\n",
1434
+ " day of week\n",
1435
+ " Parameters\n",
1436
+ " ts : time stamp in sec\n",
1437
+ " \"\"\"\n",
1438
+ " rem = ts % secInWeek\n",
1439
+ " dow = int(rem / secInDay)\n",
1440
+ " return dow\n",
1441
+ "\n",
1442
+ "def hourOfDay(ts):\n",
1443
+ " \"\"\"\n",
1444
+ " hour of day\n",
1445
+ " Parameters\n",
1446
+ " ts : time stamp in sec\n",
1447
+ " \"\"\"\n",
1448
+ " rem = ts % secInDay\n",
1449
+ " hod = int(rem / secInHour)\n",
1450
+ " return hod\n",
1451
+ "\n",
1452
+ "def processCmdLineArgs(expectedTypes, usage):\n",
1453
+ " \"\"\"\n",
1454
+ " process command line args and returns args as typed values\n",
1455
+ " Parameters\n",
1456
+ " expectedTypes : expected data types of arguments\n",
1457
+ " usage : usage message string\n",
1458
+ " \"\"\"\n",
1459
+ " args = []\n",
1460
+ " numComLineArgs = len(sys.argv)\n",
1461
+ " numExpected = len(expectedTypes)\n",
1462
+ " if (numComLineArgs - 1 == len(expectedTypes)):\n",
1463
+ " try:\n",
1464
+ " for i in range(0, numExpected):\n",
1465
+ " if (expectedTypes[i] == typeInt):\n",
1466
+ " args.append(int(sys.argv[i+1]))\n",
1467
+ " elif (expectedTypes[i] == typeFloat):\n",
1468
+ " args.append(float(sys.argv[i+1]))\n",
1469
+ " elif (expectedTypes[i] == typeString):\n",
1470
+ " args.append(sys.argv[i+1])\n",
1471
+ " except ValueError:\n",
1472
+ " print (\"expected number of command line arguments found but there is type mis match\")\n",
1473
+ " sys.exit(1)\n",
1474
+ " else:\n",
1475
+ " print (\"expected number of command line arguments not found\")\n",
1476
+ " print (usage)\n",
1477
+ " sys.exit(1)\n",
1478
+ " return args\n",
1479
+ "\n",
1480
+ "def mutateString(val, numMutate, ctype):\n",
1481
+ " \"\"\"\n",
1482
+ " mutate string multiple times\n",
1483
+ " Parameters\n",
1484
+ " val : string value\n",
1485
+ " numMutate : num of mutations\n",
1486
+ " ctype : type of character to mutate with\n",
1487
+ " \"\"\"\n",
1488
+ " mutations = set()\n",
1489
+ " count = 0\n",
1490
+ " while count < numMutate:\n",
1491
+ " j = randint(0, len(val)-1)\n",
1492
+ " if j not in mutations:\n",
1493
+ " if ctype == \"alpha\":\n",
1494
+ " ch = selectRandomFromList(alphaTokens)\n",
1495
+ " elif ctype == \"num\":\n",
1496
+ " ch = selectRandomFromList(numTokens)\n",
1497
+ " elif ctype == \"any\":\n",
1498
+ " ch = selectRandomFromList(tokens)\n",
1499
+ " val = val[:j] + ch + val[j+1:]\n",
1500
+ " mutations.add(j)\n",
1501
+ " count += 1\n",
1502
+ " return val\n",
1503
+ "\n",
1504
+ "def mutateList(values, numMutate, vmin, vmax):\n",
1505
+ " \"\"\"\n",
1506
+ " mutate list multiple times\n",
1507
+ " Parameters\n",
1508
+ " values : list value\n",
1509
+ " numMutate : num of mutations\n",
1510
+ " vmin : minimum of value range\n",
1511
+ " vmax : maximum of value range\n",
1512
+ " \"\"\"\n",
1513
+ " mutations = set()\n",
1514
+ " count = 0\n",
1515
+ " while count < numMutate:\n",
1516
+ " j = randint(0, len(values)-1)\n",
1517
+ " if j not in mutations:\n",
1518
+ " values[j] = np.random.uniform(vmin, vmax)\n",
1519
+ " count += 1\n",
1520
+ " return values\n",
1521
+ "\n",
1522
+ "\n",
1523
+ "def swap(values, first, second):\n",
1524
+ " \"\"\"\n",
1525
+ " swap two elements\n",
1526
+ " Parameters\n",
1527
+ " values : list value\n",
1528
+ " first : first swap position\n",
1529
+ " second : second swap position\n",
1530
+ " \"\"\"\n",
1531
+ " t = values[first]\n",
1532
+ " values[first] = values[second]\n",
1533
+ " values[second] = t\n",
1534
+ "\n",
1535
+ "def swapBetweenLists(values1, values2):\n",
1536
+ " \"\"\"\n",
1537
+ " swap two elements between 2 lists\n",
1538
+ " Parameters\n",
1539
+ " values1 : first list of values\n",
1540
+ " values2 : second list of values\n",
1541
+ " \"\"\"\n",
1542
+ " p1 = randint(0, len(values1)-1)\n",
1543
+ " p2 = randint(0, len(values2)-1)\n",
1544
+ " tmp = values1[p1]\n",
1545
+ " values1[p1] = values2[p2]\n",
1546
+ " values2[p2] = tmp\n",
1547
+ "\n",
1548
+ "def safeAppend(values, value):\n",
1549
+ " \"\"\"\n",
1550
+ " append only if not None\n",
1551
+ " Parameters\n",
1552
+ " values : list value\n",
1553
+ " value : value to append\n",
1554
+ " \"\"\"\n",
1555
+ " if value is not None:\n",
1556
+ " values.append(value)\n",
1557
+ "\n",
1558
+ "def getAllIndex(ldata, fldata):\n",
1559
+ " \"\"\"\n",
1560
+ " get ALL indexes of list elements\n",
1561
+ " Parameters\n",
1562
+ " ldata : list data to find index in\n",
1563
+ " fldata : list data for values for index look up\n",
1564
+ " \"\"\"\n",
1565
+ " return list(map(lambda e : fldata.index(e), ldata))\n",
1566
+ "\n",
1567
+ "def findIntersection(lOne, lTwo):\n",
1568
+ " \"\"\"\n",
1569
+ " find intersection elements between 2 lists\n",
1570
+ " Parameters\n",
1571
+ " lOne : first list of data\n",
1572
+ " lTwo : second list of data\n",
1573
+ " \"\"\"\n",
1574
+ " sOne = set(lOne)\n",
1575
+ " sTwo = set(lTwo)\n",
1576
+ " sInt = sOne.intersection(sTwo)\n",
1577
+ " return list(sInt)\n",
1578
+ "\n",
1579
+ "def isIntvOverlapped(rOne, rTwo):\n",
1580
+ " \"\"\"\n",
1581
+ " checks overlap between 2 intervals\n",
1582
+ " Parameters\n",
1583
+ " rOne : first interval boundaries\n",
1584
+ " rTwo : second interval boundaries\n",
1585
+ " \"\"\"\n",
1586
+ " clear = rOne[1] <= rTwo[0] or rOne[0] >= rTwo[1] \n",
1587
+ " return not clear\n",
1588
+ "\n",
1589
+ "def isIntvLess(rOne, rTwo):\n",
1590
+ " \"\"\"\n",
1591
+ " checks if first iterval is less than second\n",
1592
+ " Parameters\n",
1593
+ " rOne : first interval boundaries\n",
1594
+ " rTwo : second interval boundaries\n",
1595
+ " \"\"\"\n",
1596
+ " less = rOne[1] <= rTwo[0] \n",
1597
+ " return less\n",
1598
+ "\n",
1599
+ "def findRank(e, values):\n",
1600
+ " \"\"\"\n",
1601
+ " find rank of value in a list\n",
1602
+ " Parameters\n",
1603
+ " e : value to compare with\n",
1604
+ " values : list data\n",
1605
+ " \"\"\"\n",
1606
+ " count = 1\n",
1607
+ " for ve in values:\n",
1608
+ " if ve < e:\n",
1609
+ " count += 1\n",
1610
+ " return count\n",
1611
+ "\n",
1612
+ "def findRanks(toBeRanked, values):\n",
1613
+ " \"\"\"\n",
1614
+ " find ranks of values in one list in another list\n",
1615
+ " Parameters\n",
1616
+ " toBeRanked : list of values for which ranks are found\n",
1617
+ " values : list in which rank is found : \n",
1618
+ " \"\"\"\n",
1619
+ " return list(map(lambda e: findRank(e, values), toBeRanked))\n",
1620
+ "\n",
1621
+ "def formatFloat(prec, value, label = None):\n",
1622
+ " \"\"\"\n",
1623
+ " formats a float with optional label\n",
1624
+ " Parameters\n",
1625
+ " prec : precision\n",
1626
+ " value : data value\n",
1627
+ " label : label for data\n",
1628
+ " \"\"\"\n",
1629
+ " st = (label + \" \") if label else \"\"\n",
1630
+ " formatter = \"{:.\" + str(prec) + \"f}\" \n",
1631
+ " return st + formatter.format(value)\n",
1632
+ "\n",
1633
+ "def formatAny(value, label = None):\n",
1634
+ " \"\"\"\n",
1635
+ " formats any obkect with optional label\n",
1636
+ " Parameters\n",
1637
+ " value : data value\n",
1638
+ " label : label for data\n",
1639
+ " \"\"\"\n",
1640
+ " st = (label + \" \") if label else \"\"\n",
1641
+ " return st + str(value)\n",
1642
+ "\n",
1643
+ "def printList(values):\n",
1644
+ " \"\"\"\n",
1645
+ " pretty print list\n",
1646
+ " Parameters\n",
1647
+ " values : list of values\n",
1648
+ " \"\"\"\n",
1649
+ " for v in values:\n",
1650
+ " print(v)\n",
1651
+ "\n",
1652
+ "def printMap(values, klab, vlab, precision, offset=16):\n",
1653
+ " \"\"\"\n",
1654
+ " pretty print hash map\n",
1655
+ " Parameters\n",
1656
+ " values : dictionary of values\n",
1657
+ " klab : label for key\n",
1658
+ " vlab : label for value\n",
1659
+ " precision : precision\n",
1660
+ " offset : left justify offset\n",
1661
+ " \"\"\"\n",
1662
+ " print(klab.ljust(offset, \" \") + vlab)\n",
1663
+ " for k in values.keys():\n",
1664
+ " v = values[k]\n",
1665
+ " ks = toStr(k, precision).ljust(offset, \" \")\n",
1666
+ " vs = toStr(v, precision)\n",
1667
+ " print(ks + vs)\n",
1668
+ "\n",
1669
+ "def printPairList(values, lab1, lab2, precision, offset=16):\n",
1670
+ " \"\"\"\n",
1671
+ " pretty print list of pairs\n",
1672
+ " Parameters\n",
1673
+ " values : dictionary of values\n",
1674
+ " lab1 : first label\n",
1675
+ " lab2 : second label\n",
1676
+ " precision : precision\n",
1677
+ " offset : left justify offset\n",
1678
+ " \"\"\"\n",
1679
+ " print(lab1.ljust(offset, \" \") + lab2)\n",
1680
+ " for (v1, v2) in values:\n",
1681
+ " sv1 = toStr(v1, precision).ljust(offset, \" \")\n",
1682
+ " sv2 = toStr(v2, precision)\n",
1683
+ " print(sv1 + sv2)\n",
1684
+ "\n",
1685
+ "def createMap(*values):\n",
1686
+ " \"\"\"\n",
1687
+ " create disctionary with results\n",
1688
+ " Parameters\n",
1689
+ " values : sequence of key value pairs\n",
1690
+ " \"\"\"\n",
1691
+ " result = dict()\n",
1692
+ " for i in range(0, len(values), 2):\n",
1693
+ " result[values[i]] = values[i+1]\n",
1694
+ " return result\n",
1695
+ "\n",
1696
+ "def getColMinMax(table, col):\n",
1697
+ " \"\"\"\n",
1698
+ " return min, max values of a column\n",
1699
+ " Parameters\n",
1700
+ " table : tabular data\n",
1701
+ " col : column index\n",
1702
+ " \"\"\"\n",
1703
+ " vmin = None\n",
1704
+ " vmax = None\n",
1705
+ " for rec in table:\n",
1706
+ " value = rec[col]\n",
1707
+ " if vmin is None:\n",
1708
+ " vmin = value\n",
1709
+ " vmax = value\n",
1710
+ " else:\n",
1711
+ " if value < vmin:\n",
1712
+ " vmin = value\n",
1713
+ " elif value > vmax:\n",
1714
+ " vmax = value\n",
1715
+ " return (vmin, vmax, vmax - vmin)\n",
1716
+ "\n",
1717
+ "def createLogger(name, logFilePath, logLevName):\n",
1718
+ " \"\"\"\n",
1719
+ " creates logger\n",
1720
+ " Parameters\n",
1721
+ " name : logger name\n",
1722
+ " logFilePath : log file path\n",
1723
+ " logLevName : log level\n",
1724
+ " \"\"\"\n",
1725
+ " logger = logging.getLogger(name)\n",
1726
+ " fHandler = logging.handlers.RotatingFileHandler(logFilePath, maxBytes=1048576, backupCount=4)\n",
1727
+ " logLev = logLevName.lower()\n",
1728
+ " if logLev == \"debug\":\n",
1729
+ " logLevel = logging.DEBUG\n",
1730
+ " elif logLev == \"info\":\n",
1731
+ " logLevel = logging.INFO\n",
1732
+ " elif logLev == \"warning\":\n",
1733
+ " logLevel = logging.WARNING\n",
1734
+ " elif logLev == \"error\":\n",
1735
+ " logLevel = logging.ERROR\n",
1736
+ " elif logLev == \"critical\":\n",
1737
+ " logLevel = logging.CRITICAL\n",
1738
+ " else:\n",
1739
+ " raise ValueError(\"invalid log level name \" + logLevelName)\n",
1740
+ " fHandler.setLevel(logLevel)\n",
1741
+ " fFormat = logging.Formatter(\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\")\n",
1742
+ " fHandler.setFormatter(fFormat)\n",
1743
+ " logger.addHandler(fHandler)\n",
1744
+ " logger.setLevel(logLevel)\n",
1745
+ " return logger\n",
1746
+ "\n",
1747
+ "@contextmanager\n",
1748
+ "def suppressStdout():\n",
1749
+ " \"\"\"\n",
1750
+ " suppress stdout\n",
1751
+ " Parameters\n",
1752
+ " \"\"\"\n",
1753
+ " with open(os.devnull, \"w\") as devnull:\n",
1754
+ " oldStdout = sys.stdout\n",
1755
+ " sys.stdout = devnull\n",
1756
+ " try: \n",
1757
+ " yield\n",
1758
+ " finally:\n",
1759
+ " sys.stdout = oldStdout\n",
1760
+ "\n",
1761
+ "def exitWithMsg(msg):\n",
1762
+ " \"\"\"\n",
1763
+ " print message and exit\n",
1764
+ " Parameters\n",
1765
+ " msg : message\n",
1766
+ " \"\"\"\n",
1767
+ " print(msg + \" -- quitting\")\n",
1768
+ " sys.exit(0)\n",
1769
+ "\n",
1770
+ "def drawLine(data, yscale=None):\n",
1771
+ " \"\"\"\n",
1772
+ " line plot\n",
1773
+ " Parameters\n",
1774
+ " data : list data\n",
1775
+ " yscale : y axis scale\n",
1776
+ " \"\"\"\n",
1777
+ " plt.plot(data)\n",
1778
+ " if yscale:\n",
1779
+ " step = int(yscale / 10)\n",
1780
+ " step = int(step / 10) * 10\n",
1781
+ " plt.yticks(range(0, yscale, step))\n",
1782
+ " plt.show()\n",
1783
+ "\n",
1784
+ "def drawPlot(x, y, xlabel, ylabel):\n",
1785
+ " \"\"\"\n",
1786
+ " line plot\n",
1787
+ " Parameters\n",
1788
+ " x : x values\n",
1789
+ " y : y values\n",
1790
+ " xlabel : x axis label\n",
1791
+ " ylabel : y axis label\n",
1792
+ " \"\"\"\n",
1793
+ " plt.plot(x,y)\n",
1794
+ " plt.xlabel(xlabel)\n",
1795
+ " plt.ylabel(ylabel)\n",
1796
+ " plt.show()\n",
1797
+ "\n",
1798
+ "def drawPairPlot(x, y1, y2, xlabel,ylabel, y1label, y2label):\n",
1799
+ " \"\"\"\n",
1800
+ " line plot of 2 lines\n",
1801
+ " Parameters\n",
1802
+ " x : x values\n",
1803
+ " y1 : first y values\n",
1804
+ " y2 : second y values\n",
1805
+ " xlabel : x labbel\n",
1806
+ " ylabel : y label\n",
1807
+ " y1label : first plot label\n",
1808
+ " y2label : second plot label\n",
1809
+ " \"\"\"\n",
1810
+ " plt.plot(x, y1, label = y1label)\n",
1811
+ " plt.plot(x, y2, label = y2label)\n",
1812
+ " plt.xlabel(xlabel)\n",
1813
+ " plt.ylabel(ylabel)\n",
1814
+ " plt.legend()\n",
1815
+ " plt.show()\n",
1816
+ "\n",
1817
+ "def drawHist(ldata, myTitle, myXlabel, myYlabel, nbins=10):\n",
1818
+ " \"\"\"\n",
1819
+ " draw histogram\n",
1820
+ " Parameters\n",
1821
+ " ldata : list data\n",
1822
+ " myTitle : title\n",
1823
+ " myXlabel : x label\n",
1824
+ " myYlabel : y label \n",
1825
+ " nbins : num of bins\n",
1826
+ " \"\"\"\n",
1827
+ " plt.hist(ldata, bins=nbins, density=True)\n",
1828
+ " plt.title(myTitle)\n",
1829
+ " plt.xlabel(myXlabel)\n",
1830
+ " plt.ylabel(myYlabel)\n",
1831
+ " plt.show()\n",
1832
+ "\n",
1833
+ "def saveObject(obj, filePath):\n",
1834
+ " \"\"\"\n",
1835
+ " saves an object\n",
1836
+ " Parameters\n",
1837
+ " obj : object\n",
1838
+ " filePath : file path for saved object\n",
1839
+ " \"\"\"\n",
1840
+ " with open(filePath, \"wb\") as outfile:\n",
1841
+ " pickle.dump(obj,outfile)\n",
1842
+ "\n",
1843
+ "def restoreObject(filePath):\n",
1844
+ " \"\"\"\n",
1845
+ " restores an object\n",
1846
+ " Parameters\n",
1847
+ " filePath : file path to restore object from\n",
1848
+ " \"\"\"\n",
1849
+ " with open(filePath, \"rb\") as infile:\n",
1850
+ " obj = pickle.load(infile)\n",
1851
+ " return obj\n",
1852
+ "\n",
1853
+ "def isNumeric(data):\n",
1854
+ " \"\"\"\n",
1855
+ " true if all elements int or float\n",
1856
+ " Parameters\n",
1857
+ " data : numeric data list\n",
1858
+ " \"\"\"\n",
1859
+ " if type(data) == list or type(data) == np.ndarray:\n",
1860
+ " col = pd.Series(data)\n",
1861
+ " else:\n",
1862
+ " col = data\n",
1863
+ " return col.dtype == np.int32 or col.dtype == np.int64 or col.dtype == np.float32 or col.dtype == np.float64\n",
1864
+ "\n",
1865
+ "def isInteger(data):\n",
1866
+ " \"\"\"\n",
1867
+ " true if all elements int \n",
1868
+ " Parameters\n",
1869
+ " data : numeric data list\n",
1870
+ " \"\"\"\n",
1871
+ " if type(data) == list or type(data) == np.ndarray:\n",
1872
+ " col = pd.Series(data)\n",
1873
+ " else:\n",
1874
+ " col = data\n",
1875
+ " return col.dtype == np.int32 or col.dtype == np.int64\n",
1876
+ "\n",
1877
+ "def isFloat(data):\n",
1878
+ " \"\"\"\n",
1879
+ " true if all elements float\n",
1880
+ " Parameters\n",
1881
+ " data : numeric data list\n",
1882
+ " \"\"\"\n",
1883
+ " if type(data) == list or type(data) == np.ndarray:\n",
1884
+ " col = pd.Series(data)\n",
1885
+ " else:\n",
1886
+ " col = data\n",
1887
+ " return col.dtype == np.float32 or col.dtype == np.float64\n",
1888
+ "\n",
1889
+ "def isBinary(data):\n",
1890
+ " \"\"\"\n",
1891
+ " true if all elements either 0 or 1\n",
1892
+ " Parameters\n",
1893
+ " data : binary data\n",
1894
+ " \"\"\"\n",
1895
+ " re = next((d for d in data if not (type(d) == int and (d == 0 or d == 1))), None)\n",
1896
+ " return (re is None)\n",
1897
+ "\n",
1898
+ "def isCategorical(data):\n",
1899
+ " \"\"\"\n",
1900
+ " true if all elements int or string\n",
1901
+ " Parameters\n",
1902
+ " data : data value\n",
1903
+ " \"\"\"\n",
1904
+ " re = next((d for d in data if not (type(d) == int or type(d) == str)), None)\n",
1905
+ " return (re is None)\n",
1906
+ "\n",
1907
+ "def assertEqual(value, veq, msg):\n",
1908
+ " \"\"\"\n",
1909
+ " assert equal to\n",
1910
+ " Parameters\n",
1911
+ " value : value\n",
1912
+ " veq : value to be equated with\n",
1913
+ " msg : error msg\n",
1914
+ " \"\"\"\n",
1915
+ " assert value == veq , msg\n",
1916
+ "\n",
1917
+ "def assertGreater(value, vmin, msg):\n",
1918
+ " \"\"\"\n",
1919
+ " assert greater than \n",
1920
+ " Parameters\n",
1921
+ " value : value\n",
1922
+ " vmin : minimum value\n",
1923
+ " msg : error msg\n",
1924
+ " \"\"\"\n",
1925
+ " assert value > vmin , msg\n",
1926
+ "\n",
1927
+ "def assertGreaterEqual(value, vmin, msg):\n",
1928
+ " \"\"\"\n",
1929
+ " assert greater than \n",
1930
+ " Parameters\n",
1931
+ " value : value\n",
1932
+ " vmin : minimum value\n",
1933
+ " msg : error msg\n",
1934
+ " \"\"\"\n",
1935
+ " assert value >= vmin , msg\n",
1936
+ "\n",
1937
+ "def assertLesser(value, vmax, msg):\n",
1938
+ " \"\"\"\n",
1939
+ " assert less than\n",
1940
+ " Parameters\n",
1941
+ " value : value\n",
1942
+ " vmax : maximum value\n",
1943
+ " msg : error msg\n",
1944
+ " \"\"\"\n",
1945
+ " assert value < vmax , msg\n",
1946
+ "\n",
1947
+ "def assertLesserEqual(value, vmax, msg):\n",
1948
+ " \"\"\"\n",
1949
+ " assert less than\n",
1950
+ " Parameters\n",
1951
+ " value : value\n",
1952
+ " vmax : maximum value\n",
1953
+ " msg : error msg\n",
1954
+ " \"\"\"\n",
1955
+ " assert value <= vmax , msg\n",
1956
+ "\n",
1957
+ "def assertWithinRange(value, vmin, vmax, msg):\n",
1958
+ " \"\"\"\n",
1959
+ " assert within range\n",
1960
+ " Parameters\n",
1961
+ " value : value\n",
1962
+ " vmin : minimum value\n",
1963
+ " vmax : maximum value\n",
1964
+ " msg : error msg\n",
1965
+ " \"\"\"\n",
1966
+ " assert value >= vmin and value <= vmax, msg\n",
1967
+ "\n",
1968
+ "def assertInList(value, values, msg):\n",
1969
+ " \"\"\"\n",
1970
+ " assert contains in a list\n",
1971
+ " Parameters\n",
1972
+ " value ; balue to check for inclusion\n",
1973
+ " values : list data\n",
1974
+ " msg : error msg\n",
1975
+ " \"\"\"\n",
1976
+ " assert value in values, msg\n",
1977
+ "\n",
1978
+ "def maxListDist(l1, l2):\n",
1979
+ " \"\"\"\n",
1980
+ " maximum list element difference between 2 lists\n",
1981
+ " Parameters\n",
1982
+ " l1 : first list data\n",
1983
+ " l2 : second list data\n",
1984
+ " \"\"\"\n",
1985
+ " dist = max(list(map(lambda v : abs(v[0] - v[1]), zip(l1, l2))))\t\n",
1986
+ " return dist\n",
1987
+ "\n",
1988
+ "def fileLineCount(fPath):\n",
1989
+ " \"\"\" \n",
1990
+ " number of lines ina file \n",
1991
+ " Parameters\n",
1992
+ " fPath : file path\n",
1993
+ " \"\"\"\n",
1994
+ " with open(fPath) as f:\n",
1995
+ " for i, li in enumerate(f):\n",
1996
+ " pass\n",
1997
+ " return (i + 1)\n",
1998
+ "\n",
1999
+ "def getAlphaNumCharCount(sdata):\n",
2000
+ " \"\"\" \n",
2001
+ " number of alphabetic and numeric charcters in a string \n",
2002
+ " Parameters\n",
2003
+ " sdata : string data\n",
2004
+ " \"\"\"\n",
2005
+ " acount = 0\n",
2006
+ " ncount = 0\n",
2007
+ " scount = 0\n",
2008
+ " ocount = 0\n",
2009
+ " assertEqual(type(sdata), str, \"input must be string\")\n",
2010
+ " for c in sdata:\n",
2011
+ " if c.isnumeric():\n",
2012
+ " ncount += 1\n",
2013
+ " elif c.isalpha():\n",
2014
+ " acount += 1\n",
2015
+ " elif c.isspace():\n",
2016
+ " scount += 1\n",
2017
+ " else:\n",
2018
+ " ocount += 1\n",
2019
+ " r = (acount, ncount, ocount)\n",
2020
+ " return r\n",
2021
+ "\n",
2022
+ "class StepFunction:\n",
2023
+ " \"\"\"\n",
2024
+ " step function\n",
2025
+ " Parameters\n",
2026
+ " \"\"\"\n",
2027
+ " def __init__(self, *values):\n",
2028
+ " \"\"\"\n",
2029
+ " initilizer\n",
2030
+ "\n",
2031
+ " Parameters\n",
2032
+ " values : list of tuples, wich each tuple containing 2 x values and corresponding y value\n",
2033
+ " \"\"\"\n",
2034
+ " self.points = values\n",
2035
+ "\n",
2036
+ " def find(self, x):\n",
2037
+ " \"\"\"\n",
2038
+ " finds step function value\n",
2039
+ "\n",
2040
+ " Parameters\n",
2041
+ " x : x value\n",
2042
+ " \"\"\"\n",
2043
+ " found = False\n",
2044
+ " y = 0\n",
2045
+ " for p in self.points:\n",
2046
+ " if (x >= p[0] and x < p[1]):\n",
2047
+ " y = p[2]\n",
2048
+ " found = True\n",
2049
+ " break\n",
2050
+ "\n",
2051
+ " if not found:\n",
2052
+ " l = len(self.points)\n",
2053
+ " if (x < self.points[0][0]):\n",
2054
+ " y = self.points[0][2]\n",
2055
+ " elif (x > self.points[l-1][1]):\n",
2056
+ " y = self.points[l-1][2]\n",
2057
+ " return y\n",
2058
+ "\n",
2059
+ "\n",
2060
+ "class DummyVarGenerator:\n",
2061
+ " \"\"\"\n",
2062
+ " dummy variable generator for categorical variable\n",
2063
+ " \"\"\"\n",
2064
+ " def __init__(self, rowSize, catValues, trueVal, falseVal, delim=None):\n",
2065
+ " \"\"\"\n",
2066
+ " initilizer\n",
2067
+ "\n",
2068
+ " Parameters\n",
2069
+ " rowSize : row size\n",
2070
+ " catValues : dictionary with field index as key and list of categorical values as value\n",
2071
+ " trueVal : true value, typically \"1\"\n",
2072
+ " falseval : false value , typically \"0\"\n",
2073
+ " delim : field delemeter\n",
2074
+ " \"\"\"\n",
2075
+ " self.rowSize = rowSize\n",
2076
+ " self.catValues = catValues\n",
2077
+ " numCatVar = len(catValues)\n",
2078
+ " colCount = 0\n",
2079
+ " for v in self.catValues.values():\n",
2080
+ " colCount += len(v)\n",
2081
+ " self.newRowSize = rowSize - numCatVar + colCount\n",
2082
+ " #print (\"new row size {}\".format(self.newRowSize))\n",
2083
+ " self.trueVal = trueVal\n",
2084
+ " self.falseVal = falseVal\n",
2085
+ " self.delim = delim\n",
2086
+ "\n",
2087
+ " def processRow(self, row):\n",
2088
+ " \"\"\"\n",
2089
+ " encodes categorical variables, returning as delemeter separate dstring or list\n",
2090
+ "\n",
2091
+ " Parameters\n",
2092
+ " row : row either delemeter separated string or list\n",
2093
+ " \"\"\"\n",
2094
+ " if self.delim is not None:\n",
2095
+ " rowArr = row.split(self.delim)\n",
2096
+ " msg = \"row does not have expected number of columns found \" + str(len(rowArr)) + \" expected \" + str(self.rowSize)\n",
2097
+ " assert len(rowArr) == self.rowSize, msg\n",
2098
+ " else:\n",
2099
+ " rowArr = row\n",
2100
+ "\n",
2101
+ " newRowArr = []\n",
2102
+ " for i in range(len(rowArr)):\n",
2103
+ " curVal = rowArr[i]\n",
2104
+ " if (i in self.catValues):\n",
2105
+ " values = self.catValues[i]\n",
2106
+ " for val in values:\n",
2107
+ " if val == curVal:\n",
2108
+ " newVal = self.trueVal\n",
2109
+ " else:\n",
2110
+ " newVal = self.falseVal\n",
2111
+ " newRowArr.append(newVal)\n",
2112
+ " else:\n",
2113
+ " newRowArr.append(curVal)\n",
2114
+ " assert len(newRowArr) == self.newRowSize, \"invalid new row size \" + str(len(newRowArr)) + \" expected \" + str(self.newRowSize)\n",
2115
+ " encRow = self.delim.join(newRowArr) if self.delim is not None else newRowArr\n",
2116
+ " return encRow\n"
2117
+ ]
2118
+ }
2119
+ ],
2120
+ "metadata": {
2121
+ "kernelspec": {
2122
+ "display_name": "Python 3 (ipykernel)",
2123
+ "language": "python",
2124
+ "name": "python3"
2125
+ },
2126
+ "language_info": {
2127
+ "codemirror_mode": {
2128
+ "name": "ipython",
2129
+ "version": 3
2130
+ },
2131
+ "file_extension": ".py",
2132
+ "mimetype": "text/x-python",
2133
+ "name": "python",
2134
+ "nbconvert_exporter": "python",
2135
+ "pygments_lexer": "ipython3",
2136
+ "version": "3.9.12"
2137
+ }
2138
+ },
2139
+ "nbformat": 4,
2140
+ "nbformat_minor": 5
2141
+ }
model/tnn/pdamb.mod ADDED
Binary file (1.45 kB). View file