saritha5 committed
Commit
fc22863
1 Parent(s): 757e41f

Upload 13 files

lib/.ipynb_checkpoints/mlutil-checkpoint.ipynb ADDED
@@ -0,0 +1,1297 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "2d05ce02",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import sys\n",
12
+ "import numpy as np\n",
13
+ "from sklearn import preprocessing\n",
14
+ "from sklearn import metrics\n",
15
+ "from sklearn.datasets import make_blobs\n",
16
+ "from sklearn.datasets import make_classification\n",
17
+ "import random\n",
18
+ "from math import *\n",
19
+ "from decimal import Decimal\n",
20
+ "import statistics\n",
21
+ "import jprops\n",
22
+ "from Levenshtein import distance as ld\n",
23
+ "from util import *\n",
24
+ "from sampler import *\n",
25
+ "\n",
26
+ "class Configuration:\n",
27
+ " \"\"\"\n",
28
+ " Configuration management. Supports default value, mandatory value and typed value.\n",
29
+ " \"\"\"\n",
30
+ " def __init__(self, configFile, defValues, verbose=False):\n",
31
+ " \"\"\"\n",
32
+ " initializer\n",
33
+ "\n",
34
+ " Parameters\n",
35
+ " configFile : config file path\n",
36
+ " defValues : dictionary of default values\n",
37
+ " verbose : verbosity flag\n",
38
+ " \"\"\"\n",
39
+ " configs = {}\n",
40
+ " with open(configFile) as fp:\n",
41
+ " for key, value in jprops.iter_properties(fp):\n",
42
+ " configs[key] = value\n",
43
+ " self.configs = configs\n",
44
+ " self.defValues = defValues\n",
45
+ " self.verbose = verbose\n",
46
+ "\n",
47
+ " def override(self, configFile):\n",
48
+ " \"\"\"\n",
49
+ " override configuration from file\n",
50
+ "\n",
51
+ " Parameters\n",
52
+ " configFile : override config file path\n",
53
+ " \"\"\"\n",
54
+ " with open(configFile) as fp:\n",
55
+ " for key, value in jprops.iter_properties(fp):\n",
56
+ " self.configs[key] = value\n",
57
+ "\n",
58
+ "\n",
59
+ " def setParam(self, name, value):\n",
60
+ " \"\"\"\n",
61
+ " override individual configuration\n",
62
+ " Parameters\n",
63
+ " name : config param name\n",
64
+ " value : config param value\n",
65
+ " \"\"\"\n",
66
+ " self.configs[name] = value\n",
67
+ "\n",
68
+ "\n",
69
+ " def getStringConfig(self, name):\n",
70
+ " \"\"\"\n",
71
+ " get string param\n",
72
+ " Parameters\n",
73
+ " name : config param name\n",
74
+ " \"\"\"\n",
75
+ " if self.isNone(name):\n",
76
+ " val = (None, False)\n",
77
+ " elif self.isDefault(name):\n",
78
+ " val = (self.handleDefault(name), True)\n",
79
+ " else:\n",
80
+ " val = (self.configs[name], False)\n",
81
+ " if self.verbose:\n",
82
+ " print( \"{} {} {}\".format(name, self.configs[name], val[0]))\n",
83
+ " return val\n",
84
+ "\n",
85
+ "\n",
86
+ " def getIntConfig(self, name):\n",
87
+ " \"\"\"\n",
88
+ " get int param\n",
89
+ " Parameters\n",
90
+ " name : config param name\n",
91
+ " \"\"\"\n",
92
+ " #print \"%s %s\" %(name,self.configs[name])\n",
93
+ " if self.isNone(name):\n",
94
+ " val = (None, False)\n",
95
+ " elif self.isDefault(name):\n",
96
+ " val = (self.handleDefault(name), True)\n",
97
+ " else:\n",
98
+ " val = (int(self.configs[name]), False)\n",
99
+ " if self.verbose:\n",
100
+ " print( \"{} {} {}\".format(name, self.configs[name], val[0]))\n",
101
+ " return val\n",
102
+ "\n",
103
+ "\n",
104
+ " def getFloatConfig(self, name):\n",
105
+ " \"\"\"\n",
106
+ " get float param\n",
107
+ " Parameters\n",
108
+ " name : config param name\n",
109
+ " \"\"\"\n",
110
+ " #print \"%s %s\" %(name,self.configs[name])\n",
111
+ " if self.isNone(name):\n",
112
+ " val = (None, False)\n",
113
+ " elif self.isDefault(name):\n",
114
+ " val = (self.handleDefault(name), True)\n",
115
+ " else:\n",
116
+ " val = (float(self.configs[name]), False)\n",
117
+ " if self.verbose:\n",
118
+ " print( \"{} {} {:06.3f}\".format(name, self.configs[name], val[0]))\n",
119
+ " return val\n",
120
+ "\n",
121
+ "\n",
122
+ " def getBooleanConfig(self, name):\n",
123
+ " \"\"\"\n",
124
+ " get boolean param\n",
125
+ " Parameters\n",
126
+ " name : config param name\n",
127
+ " \"\"\"\n",
128
+ " if self.isNone(name):\n",
129
+ " val = (None, False)\n",
130
+ " elif self.isDefault(name):\n",
131
+ " val = (self.handleDefault(name), True)\n",
132
+ " else:\n",
133
+ " bVal = self.configs[name].lower() == \"true\"\n",
134
+ " val = (bVal, False)\n",
135
+ " if self.verbose:\n",
136
+ " print( \"{} {} {}\".format(name, self.configs[name], val[0]))\n",
137
+ " return val\n",
138
+ "\n",
139
+ "\n",
140
+ " def getIntListConfig(self, name, delim=\",\"):\n",
141
+ " \"\"\"\n",
142
+ " get int list param\n",
143
+ " Parameters\n",
144
+ " name : config param name\n",
145
+ " delim : delimiter\n",
146
+ " \"\"\"\n",
147
+ " if self.isNone(name):\n",
148
+ " val = (None, False)\n",
149
+ " elif self.isDefault(name):\n",
150
+ " val = (self.handleDefault(name), True)\n",
151
+ " else:\n",
152
+ " delSepStr = self.getStringConfig(name)\n",
153
+ "\n",
154
+ " #specified as range\n",
155
+ " intList = strListOrRangeToIntArray(delSepStr[0])\n",
156
+ " val =(intList, delSepStr[1])\n",
157
+ " return val\n",
158
+ "\n",
159
+ " def getFloatListConfig(self, name, delim=\",\"):\n",
160
+ " \"\"\"\n",
161
+ " get float list param\n",
162
+ " Parameters\n",
163
+ " name : config param name\n",
164
+ " delim : delimiter\n",
165
+ " \"\"\"\n",
166
+ " delSepStr = self.getStringConfig(name)\n",
167
+ " if self.isNone(name):\n",
168
+ " val = (None, False)\n",
169
+ " elif self.isDefault(name):\n",
170
+ " val = (self.handleDefault(name), True)\n",
171
+ " else:\n",
172
+ " flList = strToFloatArray(delSepStr[0], delim)\n",
173
+ " val =(flList, delSepStr[1])\n",
174
+ " return val\n",
175
+ "\n",
176
+ "\n",
177
+ " def getStringListConfig(self, name, delim=\",\"):\n",
178
+ " \"\"\"\n",
179
+ " get string list param\n",
180
+ " Parameters\n",
181
+ " name : config param name\n",
182
+ " delim : delimiter\n",
183
+ " \"\"\"\n",
184
+ " delSepStr = self.getStringConfig(name)\n",
185
+ " if self.isNone(name):\n",
186
+ " val = (None, False)\n",
187
+ " elif self.isDefault(name):\n",
188
+ " val = (self.handleDefault(name), True)\n",
189
+ " else:\n",
190
+ " strList = delSepStr[0].split(delim)\n",
191
+ " val = (strList, delSepStr[1])\n",
192
+ " return val\n",
193
+ "\n",
194
+ " def handleDefault(self, name):\n",
195
+ " \"\"\"\n",
196
+ " handles default\n",
197
+ " Parameters\n",
198
+ " name : config param name\n",
199
+ " \"\"\"\n",
200
+ " dVal = self.defValues[name]\n",
201
+ " if (dVal[1] is None):\n",
202
+ " val = dVal[0]\n",
203
+ " else:\n",
204
+ " raise ValueError(dVal[1])\n",
205
+ " return val\n",
206
+ "\n",
207
+ "\n",
208
+ " def isNone(self, name):\n",
209
+ " \"\"\"\n",
210
+ " true if value is None\t\n",
211
+ " Parameters\n",
212
+ " name : config param name\n",
213
+ " \"\"\"\n",
214
+ " return self.configs[name].lower() == \"none\"\n",
215
+ "\n",
216
+ "\n",
217
+ " def isDefault(self, name):\n",
218
+ " \"\"\"\n",
219
+ " true if the value is default\t\n",
220
+ " Parameters\n",
221
+ " name : config param name\n",
222
+ " \"\"\"\n",
223
+ " de = self.configs[name] == \"_\"\n",
224
+ " #print de\n",
225
+ " return de\n",
226
+ "\n",
227
+ "\n",
228
+ " def eitherOrStringConfig(self, firstName, secondName):\n",
229
+ " \"\"\"\n",
230
+ " returns one of two string parameters\t\n",
231
+ " Parameters\n",
232
+ " firstName : first parameter name\n",
233
+ " secondName : second parameter name\t\n",
234
+ " \"\"\"\n",
235
+ " if not self.isNone(firstName):\n",
236
+ " first = self.getStringConfig(firstName)[0]\n",
237
+ " second = None\n",
238
+ " if not self.isNone(secondName):\n",
239
+ " raise ValueError(\"only one of the two parameters should be set and not both \" + firstName + \" \" + secondName)\n",
240
+ " else:\n",
241
+ " if not self.isNone(secondName):\n",
242
+ " second = self.getStringConfig(secondName)[0]\n",
243
+ " first = None\n",
244
+ " else:\n",
245
+ " raise ValueError(\"at least one of the two parameters should be set \" + firstName + \" \" + secondName)\n",
246
+ " return (first, second)\n",
247
+ "\n",
248
+ "\n",
249
+ " def eitherOrIntConfig(self, firstName, secondName):\n",
250
+ " \"\"\"\n",
251
+ " returns one of two int parameters\t\n",
252
+ " Parameters\n",
253
+ " firstName : first parameter name\n",
254
+ " secondName : second parameter name\t\n",
255
+ " \"\"\"\n",
256
+ " if not self.isNone(firstName):\n",
257
+ " first = self.getIntConfig(firstName)[0]\n",
258
+ " second = None\n",
259
+ " if not self.isNone(secondName):\n",
260
+ " raise ValueError(\"only one of the two parameters should be set and not both \" + firstName + \" \" + secondName)\n",
261
+ " else:\n",
262
+ " if not self.isNone(secondName):\n",
263
+ " second = self.getIntConfig(secondName)[0]\n",
264
+ " first = None\n",
265
+ " else:\n",
266
+ " raise ValueError(\"at least one of the two parameters should be set \" + firstName + \" \" + secondName)\n",
267
+ " return (first, second)\n",
268
+ "\n",
269
+ "\n",
270
+ "class CatLabelGenerator:\n",
271
+ " \"\"\"\n",
272
+ " label generator for categorical variables\n",
273
+ " \"\"\"\n",
274
+ " def __init__(self, catValues, delim):\n",
275
+ " \"\"\"\n",
276
+ " initializer\n",
277
+ "\n",
278
+ " Parameters\n",
279
+ " catValues : dictionary of categorical values\n",
280
+ " delim : delimiter\n",
281
+ " \"\"\"\n",
282
+ " self.encoders = {}\n",
283
+ " self.catValues = catValues\n",
284
+ " self.delim = delim\n",
285
+ " for k in self.catValues.keys():\t\n",
286
+ " le = preprocessing.LabelEncoder()\t\n",
287
+ " le.fit(self.catValues[k])\n",
288
+ " self.encoders[k] = le\n",
289
+ "\n",
290
+ " def processRow(self, row):\t\n",
291
+ " \"\"\"\n",
292
+ " encode row categorical values\n",
293
+ "\n",
294
+ " Parameters:\n",
295
+ " row : data row\n",
296
+ " \"\"\"\n",
297
+ " #print row\n",
298
+ " rowArr = row.split(self.delim)\n",
299
+ " for i in range(len(rowArr)):\n",
300
+ " if (i in self.catValues):\n",
301
+ " curVal = rowArr[i]\n",
302
+ " assert curVal in self.catValues[i], \"categorical value invalid\"\n",
303
+ " encVal = self.encoders[i].transform([curVal])\n",
304
+ " rowArr[i] = str(encVal[0])\n",
305
+ " return self.delim.join(rowArr)\t\t\n",
306
+ "\n",
307
+ " def getOrigLabels(self, indx):\n",
308
+ " \"\"\"\n",
309
+ " get original labels\n",
310
+ "\n",
311
+ " Parameters:\n",
312
+ " indx : column index\n",
313
+ " \"\"\"\n",
314
+ " return self.encoders[indx].classes_\t\n",
315
+ "\n",
316
+ "\n",
317
+ "class SupvLearningDataGenerator:\n",
318
+ " \"\"\"\n",
319
+ " data generator for supervised learning\n",
320
+ " \"\"\"\n",
321
+ " def __init__(self, configFile):\n",
322
+ " \"\"\"\n",
323
+ " initializer\n",
324
+ "\n",
325
+ " Parameters\n",
326
+ " configFile : config file path\n",
327
+ " \"\"\"\n",
328
+ " defValues = dict()\n",
329
+ " defValues[\"common.num.samp\"] = (100, None)\n",
330
+ " defValues[\"common.num.feat\"] = (5, None)\n",
331
+ " defValues[\"common.feat.trans\"] = (None, None)\n",
332
+ " defValues[\"common.feat.types\"] = (None, \"missing feature types\")\n",
333
+ " defValues[\"common.cat.feat.distr\"] = (None, None)\n",
334
+ " defValues[\"common.output.precision\"] = (3, None)\n",
335
+ " defValues[\"common.error\"] = (0.01, None)\n",
336
+ " defValues[\"class.gen.technique\"] = (\"blob\", None)\n",
337
+ " defValues[\"class.num.feat.informative\"] = (2, None)\n",
338
+ " defValues[\"class.num.feat.redundant\"] = (2, None)\n",
339
+ " defValues[\"class.num.feat.repeated\"] = (0, None)\n",
340
+ " defValues[\"class.num.feat.cat\"] = (0, None)\n",
341
+ " defValues[\"class.num.class\"] = (2, None)\n",
342
+ "\n",
343
+ " self.config = Configuration(configFile, defValues)\n",
344
+ "\n",
345
+ " def genClassifierData(self):\n",
346
+ " \"\"\"\n",
347
+ " generates classifier data\n",
348
+ " \"\"\"\n",
349
+ " nsamp = self.config.getIntConfig(\"common.num.samp\")[0]\n",
350
+ " nfeat = self.config.getIntConfig(\"common.num.feat\")[0]\n",
351
+ " nclass = self.config.getIntConfig(\"class.num.class\")[0]\n",
352
+ " #transform with shift and scale\n",
353
+ " ftrans = self.config.getFloatListConfig(\"common.feat.trans\")[0]\n",
354
+ " feTrans = dict()\n",
355
+ " for i in range(0, len(ftrans), 2):\n",
356
+ " tr = (ftrans[i], ftrans[i+1])\n",
357
+ " indx = int(i/2)\n",
358
+ " feTrans[indx] = tr\n",
359
+ "\n",
360
+ " ftypes = self.config.getStringListConfig(\"common.feat.types\")[0]\n",
361
+ "\n",
362
+ " # categorical feature distribution\n",
363
+ " feCatDist = dict()\n",
364
+ " fcatdl = self.config.getStringListConfig(\"common.cat.feat.distr\")[0]\n",
365
+ " for fcatds in fcatdl:\n",
366
+ " fcatd = fcatds.split(\":\")\n",
367
+ " feInd = int(fcatd[0])\n",
368
+ " clVal = int(fcatd[1])\n",
369
+ " key = (feInd, clVal)\t\t#feature index and class value\n",
370
+ " dist = list(map(lambda i : (fcatd[i], float(fcatd[i+1])), range(2, len(fcatd), 2)))\n",
371
+ " feCatDist[key] = CategoricalRejectSampler(*dist)\n",
372
+ "\n",
373
+ " #shift and scale\n",
374
+ " genTechnique = self.config.getStringConfig(\"class.gen.technique\")[0]\n",
375
+ " error = self.config.getFloatConfig(\"common.error\")[0]\n",
376
+ " if genTechnique == \"blob\":\n",
377
+ " features, claz = make_blobs(n_samples=nsamp, centers=nclass, n_features=nfeat)\n",
378
+ " for i in range(nsamp):\t\t\t#shift and scale\n",
379
+ " for j in range(nfeat):\n",
380
+ " tr = feTrans[j]\n",
381
+ " features[i,j] = (features[i,j] + tr[0]) * tr[1]\n",
382
+ " claz = np.array(list(map(lambda c : random.randint(0, nclass-1) if random.random() < error else c, claz)))\n",
383
+ " elif genTechnique == \"classify\":\n",
384
+ " nfeatInfo = self.config.getIntConfig(\"class.num.feat.informative\")[0]\n",
385
+ " nfeatRed = self.config.getIntConfig(\"class.num.feat.redundant\")[0]\n",
386
+ " nfeatRep = self.config.getIntConfig(\"class.num.feat.repeated\")[0]\n",
387
+ " shifts = list(map(lambda i : feTrans[i][0], range(nfeat)))\n",
388
+ " scales = list(map(lambda i : feTrans[i][1], range(nfeat)))\n",
389
+ " features, claz = make_classification(n_samples=nsamp, n_features=nfeat, n_informative=nfeatInfo, n_redundant=nfeatRed, \n",
390
+ " n_repeated=nfeatRep, n_classes=nclass, flip_y=error, shift=shifts, scale=scales)\n",
391
+ " else:\n",
392
+ " raise ValueError(\"invalid generation technique\")\n",
393
+ "\n",
394
+ " # add categorical features and format\n",
395
+ " nCatFeat = self.config.getIntConfig(\"class.num.feat.cat\")[0]\n",
396
+ " prec = self.config.getIntConfig(\"common.output.precision\")[0]\n",
397
+ " for f , c in zip(features, claz):\n",
398
+ " nfs = list(map(lambda i : self.numFeToStr(i, f[i], c, ftypes[i], prec), range(nfeat)))\n",
399
+ " if nCatFeat > 0:\n",
400
+ " cfs = list(map(lambda i : self.catFe(i, c, ftypes[i], feCatDist), range(nfeat, nfeat + nCatFeat, 1)))\n",
401
+ " rec = \",\".join(nfs) + \",\" + \",\".join(cfs) + \",\" + str(c)\n",
402
+ " else:\n",
403
+ " rec = \",\".join(nfs) + \",\" + str(c)\n",
404
+ " yield rec\n",
405
+ "\n",
406
+ " def numFeToStr(self, i, fv, cv, ft, prec):\n",
407
+ " \"\"\"\n",
408
+ " numeric feature value to string\n",
409
+ "\n",
410
+ " Parameters\n",
411
+ " i : col index, fv : field value\n",
412
+ " cv : class value, ft : field data type\n",
413
+ " prec : precision\n",
414
+ " \"\"\"\n",
415
+ " if ft == \"float\":\n",
416
+ " s = formatFloat(prec, fv)\n",
417
+ " elif ft ==\"int\":\n",
418
+ " s = str(int(fv))\n",
419
+ " else:\t\t\n",
420
+ " raise ValueError(\"invalid type, expecting float or int\")\n",
421
+ " return s\n",
422
+ "\n",
423
+ " def catFe(self, i, cv, ft, feCatDist):\n",
424
+ " \"\"\"\n",
425
+ " generate categorical feature\n",
426
+ "\n",
427
+ " Parameters\n",
428
+ " i : col index\n",
429
+ " cv : class value\n",
430
+ " ft : field data type\n",
431
+ " feCatDist : cat value distribution\n",
432
+ " \"\"\"\n",
433
+ " if ft == \"cat\":\n",
434
+ " key = (i, cv)\n",
435
+ " s = feCatDist[key].sample()\n",
436
+ " else:\t\t\n",
437
+ " raise ValueError(\"invalid type, expecting categorical\")\n",
438
+ " return s\n",
439
+ "\n",
440
+ "\n",
441
+ "\n",
442
+ "def loadDataFile(file, delim, cols, colIndices):\n",
443
+ " \"\"\"\n",
444
+ " loads delim separated file and extracts columns\n",
445
+ " Parameters\n",
446
+ " file : file path\n",
447
+ " delim : delimiter\n",
448
+ " cols : columns to use from file\n",
449
+ " colIndices : columns to extract\n",
450
+ " \"\"\"\n",
451
+ " data = np.loadtxt(file, delimiter=delim, usecols=cols)\n",
452
+ " extrData = data[:,colIndices]\n",
453
+ " return (data, extrData)\n",
454
+ "\n",
455
+ "def loadFeatDataFile(file, delim, cols):\n",
456
+ " \"\"\"\n",
457
+ " loads delim separated file and extracts columns\n",
458
+ "\n",
459
+ " Parameters\n",
460
+ " file : file path\n",
461
+ " delim : delimiter\n",
462
+ " cols : columns to use from file\n",
463
+ " \"\"\"\n",
464
+ " data = np.loadtxt(file, delimiter=delim, usecols=cols)\n",
465
+ " return data\n",
466
+ "\n",
467
+ "def extrColumns(arr, columns):\n",
468
+ " \"\"\"\n",
469
+ " extracts columns\n",
470
+ "\n",
471
+ " Parameters\n",
472
+ " arr : 2D array\n",
473
+ " columns : columns\n",
474
+ " \"\"\"\n",
475
+ " return arr[:, columns]\n",
476
+ "\n",
477
+ "def subSample(featData, clsData, subSampleRate, withReplacement):\n",
478
+ " \"\"\"\n",
479
+ " subsample feature and class label data\t\n",
480
+ " Parameters\n",
481
+ " featData : 2D array of feature data\n",
482
+ " clsData : array of class labels\n",
483
+ " subSampleRate : fraction to be sampled\n",
484
+ " withReplacement : true if sampling with replacement\n",
485
+ " \"\"\"\n",
486
+ " sampSize = int(featData.shape[0] * subSampleRate)\n",
487
+ " sampledIndx = np.random.choice(featData.shape[0],sampSize, replace=withReplacement)\n",
488
+ " sampFeat = featData[sampledIndx]\n",
489
+ " sampCls = clsData[sampledIndx]\n",
490
+ " return(sampFeat, sampCls)\n",
491
+ "\n",
492
+ "def euclideanDistance(x,y):\n",
493
+ " \"\"\"\n",
494
+ " euclidean distance\n",
495
+ " Parameters\n",
496
+ " x : first vector\n",
497
+ " y : second vector\n",
498
+ " \"\"\"\n",
499
+ " return sqrt(sum(pow(a-b, 2) for a, b in zip(x, y)))\n",
500
+ "\n",
501
+ "def squareRooted(x):\n",
502
+ " \"\"\"\n",
503
+ " square root of sum square\n",
504
+ " Parameters\n",
505
+ " x : data vector\n",
506
+ " \"\"\"\n",
507
+ " return round(sqrt(sum([a*a for a in x])),3)\n",
508
+ "\n",
509
+ "def cosineSimilarity(x,y):\n",
510
+ " \"\"\"\n",
511
+ " cosine similarity\n",
512
+ "\n",
513
+ " Parameters\n",
514
+ " x : first vector\n",
515
+ " y : second vector\n",
516
+ " \"\"\"\n",
517
+ " numerator = sum(a*b for a,b in zip(x,y))\n",
518
+ " denominator = squareRooted(x) * squareRooted(y)\n",
519
+ " return round(numerator / float(denominator), 3)\n",
520
+ "\n",
521
+ "def cosineDistance(x,y):\n",
522
+ " \"\"\"\n",
523
+ " cosine distance\n",
524
+ " Parameters\n",
525
+ " x : first vector\n",
526
+ " y : second vector\n",
527
+ " \"\"\"\n",
528
+ " return 1.0 - cosineSimilarity(x,y)\n",
529
+ "\n",
530
+ "def manhattanDistance(x,y):\n",
531
+ " \"\"\"\n",
532
+ " manhattan distance\n",
533
+ " Parameters\n",
534
+ " x : first vector\n",
535
+ " y : second vector\n",
536
+ " \"\"\"\n",
537
+ " return sum(abs(a-b) for a,b in zip(x,y))\n",
538
+ "\n",
539
+ "def nthRoot(value, nRoot):\n",
540
+ " \"\"\"\n",
541
+ " nth root\n",
542
+ " Parameters\n",
543
+ " value : data value\n",
544
+ " nRoot : root\n",
545
+ " \"\"\"\n",
546
+ " rootValue = 1/float(nRoot)\n",
547
+ " return round (Decimal(value) ** Decimal(rootValue),3)\n",
548
+ "\n",
549
+ "def minkowskiDistance(x,y,pValue):\n",
550
+ " \"\"\"\n",
551
+ " minkowski distance\n",
552
+ " Parameters\n",
553
+ " x : first vector\n",
554
+ " y : second vector\n",
555
+ " pValue : power factor\n",
556
+ " \"\"\"\n",
557
+ " return nthRoot(sum(pow(abs(a-b),pValue) for a,b in zip(x, y)), pValue)\n",
558
+ "\n",
559
+ "def jaccardSimilarityX(x,y):\n",
560
+ " \"\"\"\n",
561
+ " jaccard similarity\n",
562
+ " Parameters\n",
563
+ " x : first vector\n",
564
+ " y : second vector\n",
565
+ " \"\"\"\n",
566
+ " intersectionCardinality = len(set.intersection(*[set(x), set(y)]))\n",
567
+ " unionCardinality = len(set.union(*[set(x), set(y)]))\n",
568
+ " return intersectionCardinality/float(unionCardinality)\n",
569
+ "\n",
570
+ "def jaccardSimilarity(x,y,wx=1.0,wy=1.0):\n",
571
+ " \"\"\"\n",
572
+ " jaccard similarity\n",
573
+ "\n",
574
+ " Parameters\n",
575
+ " x : first vector\n",
576
+ " y : second vector\n",
577
+ " wx : weight for x\n",
578
+ " wy : weight for y\n",
579
+ " \"\"\"\n",
580
+ " sx = set(x)\n",
581
+ " sy = set(y)\n",
582
+ " sxyInt = sx.intersection(sy)\n",
583
+ " intCardinality = len(sxyInt)\n",
584
+ " sxIntDiff = sx.difference(sxyInt)\n",
585
+ " syIntDiff = sy.difference(sxyInt)\n",
586
+ " unionCardinality = len(sx.union(sy))\n",
587
+ " return intCardinality/float(intCardinality + wx * len(sxIntDiff) + wy * len(syIntDiff))\n",
588
+ "\n",
589
+ "def levenshteinSimilarity(s1, s2):\n",
590
+ " \"\"\"\n",
591
+ " Levenshtein similarity for strings\n",
592
+ "\n",
593
+ " Parameters\n",
594
+ " s1 : first string\n",
595
+ " s2 : second string\n",
596
+ " \"\"\"\n",
597
+ " assert type(s1) == str and type(s2) == str, \"Levenshtein similarity is for string only\"\n",
598
+ " d = ld(s1,s2)\n",
599
+ " #print(d)\n",
600
+ " l = max(len(s1),len(s2))\n",
601
+ " d = 1.0 - min(d/l, 1.0)\n",
602
+ " return d\t\n",
603
+ "\n",
604
+ "def norm(values, po=2):\n",
605
+ " \"\"\"\n",
606
+ " norm\n",
607
+ " Parameters\n",
608
+ " values : list of values\n",
609
+ " po : power\n",
610
+ " \"\"\"\n",
611
+ " no = sum(list(map(lambda v: pow(v,po), values)))\n",
612
+ " no = pow(no,1.0/po)\n",
613
+ " return list(map(lambda v: v/no, values))\n",
614
+ "\n",
615
+ "def createOneHotVec(size, indx = -1):\n",
616
+ " \"\"\"\n",
617
+ " random one hot vector\n",
618
+ "\n",
619
+ " Parameters\n",
620
+ " size : vector size\n",
621
+ " indx : one hot position\n",
622
+ " \"\"\"\n",
623
+ " vec = [0] * size\n",
624
+ " s = random.randint(0, size - 1) if indx < 0 else indx\n",
625
+ " vec[s] = 1\n",
626
+ " return vec\n",
627
+ "\n",
628
+ "def createAllOneHotVec(size):\n",
629
+ " \"\"\"\n",
630
+ " create all one hot vectors\n",
631
+ "\n",
632
+ " Parameters\n",
633
+ " size : vector size and no of vectors\n",
634
+ " \"\"\"\n",
635
+ " vecs = list()\n",
636
+ " for i in range(size):\n",
637
+ " vec = [0] * size\n",
638
+ " vec[i] = 1\n",
639
+ " vecs.append(vec)\n",
640
+ " return vecs\n",
641
+ "\n",
642
+ "def blockShuffle(data, blockSize):\n",
643
+ " \"\"\"\n",
644
+ " block shuffle \t\n",
645
+ "\n",
646
+ " Parameters\n",
647
+ " data : list data\n",
648
+ " blockSize : block size\n",
649
+ " \"\"\"\n",
650
+ " numBlock = int(len(data) / blockSize)\n",
651
+ " remain = len(data) % blockSize\n",
652
+ " numBlock += (1 if remain > 0 else 0)\n",
653
+ " shuffled = list()\n",
654
+ " for i in range(numBlock):\n",
655
+ " b = random.randint(0, numBlock-1)\n",
656
+ " beg = b * blockSize\n",
657
+ " if (b < numBlock-1):\n",
658
+ " end = beg + blockSize\n",
659
+ " shuffled.extend(data[beg:end])\t\t\n",
660
+ " else:\n",
661
+ " shuffled.extend(data[beg:])\n",
662
+ " return shuffled\t\n",
663
+ "\n",
664
+ "def shuffle(data, numShuffle):\n",
665
+ " \"\"\"\n",
666
+ " shuffle data by random swapping\n",
667
+ "\n",
668
+ " Parameters\n",
669
+ " data : list data\n",
670
+ " numShuffle : no of pairwise swaps\n",
671
+ " \"\"\"\n",
672
+ " sz = len(data)\n",
673
+ " if numShuffle is None:\n",
674
+ " numShuffle = int(sz / 2)\n",
675
+ " for i in range(numShuffle):\n",
676
+ " fi = random.randint(0, sz -1)\n",
677
+ " se = random.randint(0, sz -1)\n",
678
+ " tmp = data[fi]\n",
679
+ " data[fi] = data[se]\n",
680
+ " data[se] = tmp\t\n",
681
+ "\n",
682
+ "def randomWalk(size, start, lowStep, highStep):\n",
683
+ " \"\"\"\n",
684
+ " random walk\t\n",
685
+ "\n",
686
+ " Parameters\n",
687
+ " size : no of steps\n",
688
+ " start : initial position\n",
689
+ " lowStep : step min\n",
690
+ " highStep : step max\n",
691
+ " \"\"\"\n",
692
+ " cur = start\n",
693
+ " for i in range(size):\n",
694
+ " yield cur\n",
695
+ " cur += randomFloat(lowStep, highStep)\n",
696
+ "\n",
697
+ "def binaryEcodeCategorical(values, value):\n",
698
+ " \"\"\"\n",
699
+ " one hot binary encoding\t\n",
700
+ "\n",
701
+ " Parameters\n",
702
+ " values : list of values\n",
703
+ " value : value to be replaced with 1\n",
704
+ " \"\"\"\n",
705
+ " size = len(values)\n",
706
+ " vec = [0] * size\n",
707
+ " for i in range(size):\n",
708
+ " if (values[i] == value):\n",
709
+ " vec[i] = 1\n",
710
+ " return vec\t\t\n",
711
+ "\n",
712
+ "def createLabeledSeq(inputData, tw):\n",
713
+ " \"\"\"\n",
714
+ " Creates feature, label pair from sequence data, where we have tw number of features followed by output\n",
715
+ "\n",
716
+ " Parameters\n",
717
+ " inputData : list containing feature and label\n",
718
+ " tw : no of features\n",
719
+ " \"\"\"\n",
720
+ " features = list()\n",
721
+ " labels = list()\n",
722
+ " l = len(inputData)\n",
723
+ " for i in range(l - tw):\n",
724
+ " trainSeq = inputData[i:i+tw]\n",
725
+ " trainLabel = inputData[i+tw]\n",
726
+ " features.append(trainSeq)\n",
727
+ " labels.append(trainLabel)\n",
728
+ " return (features, labels)\n",
729
+ "\n",
730
+ "def createLabeledSeqFromFile(filePath, delim, index, tw):\t\t#renamed to avoid shadowing the list based createLabeledSeq\n",
731
+ " \"\"\"\n",
732
+ " Creates feature, label pair from 1D sequence data in file\t\n",
733
+ "\n",
734
+ " Parameters\n",
735
+ " filePath : file path\n",
736
+ " delim : delimiter\n",
737
+ " index : column index\n",
738
+ " tw : no of features\n",
739
+ " \"\"\"\n",
740
+ " seqData = getFileColumnAsFloat(filePath, delim, index)\n",
741
+ " return createLabeledSeq(seqData, tw)\n",
742
+ "\n",
743
+ "def fromMultDimSeqToTabular(data, inpSize, seqLen):\n",
744
+ " \"\"\"\n",
745
+ " Input shape (nrow, inpSize * seqLen) output shape(nrow * seqLen, inpSize)\n",
746
+ "\n",
747
+ " Parameters\n",
748
+ " data : 2D array\n",
749
+ " inpSize : each input size in sequence\n",
750
+ " seqLen : sequence length\n",
751
+ " \"\"\"\t\n",
752
+ " nrow = data.shape[0]\n",
753
+ " assert data.shape[1] == inpSize * seqLen, \"invalid input size or sequence length\"\n",
754
+ " return data.reshape(nrow * seqLen, inpSize)\n",
755
+ "\n",
756
+ "def fromTabularToMultDimSeq(data, inpSize, seqLen):\n",
757
+ " \"\"\"\n",
758
+ " Input shape (nrow * seqLen, inpSize) output shape (nrow, inpSize * seqLen) \n",
759
+ " Parameters\n",
760
+ " data : 2D array\n",
761
+ " inpSize : each input size in sequence\n",
762
+ " seqLen : sequence length\n",
763
+ " \"\"\"\t\n",
764
+ " nrow = int(data.shape[0] / seqLen)\n",
765
+ " assert data.shape[1] == inpSize, \"invalid input size\"\n",
766
+ " return data.reshape(nrow, seqLen * inpSize)\n",
767
+ "\n",
768
+ "def difference(data, interval=1):\n",
769
+ " \"\"\"\n",
770
+ " takes difference in time series data\n",
771
+ " Parameters\n",
772
+ " data :list data\n",
773
+ " interval : interval for difference\n",
774
+ " \"\"\"\n",
775
+ " diff = list()\n",
776
+ " for i in range(interval, len(data)):\n",
777
+ " value = data[i] - data[i - interval]\n",
778
+ " diff.append(value)\n",
779
+ " return diff\n",
780
+ "\n",
781
+ "def normalizeMatrix(data, norm, axis=1):\n",
782
+ " \"\"\"\n",
783
+ " normalizes each row of the matrix\n",
784
+ "\n",
785
+ " Parameters\n",
786
+ " data : 2D data\n",
787
+ " norm : normalization method\n",
788
+ " axis : row or column\n",
789
+ " \"\"\"\n",
790
+ " normalized = preprocessing.normalize(data,norm=norm, axis=axis)\n",
791
+ " return normalized\n",
792
+ "\n",
793
+ "def standardizeMatrix(data, axis=0):\n",
794
+ " \"\"\"\n",
795
+ " standardizes each column of the matrix with mean and std deviation\n",
796
+ " Parameters\n",
797
+ " data : 2D data\n",
798
+ " axis : row or column\n",
799
+ " \"\"\"\n",
800
+ " standardized = preprocessing.scale(data, axis=axis)\n",
801
+ " return standardized\n",
802
+ "\n",
803
+ "def asNumpyArray(data):\n",
804
+ " \"\"\"\n",
805
+ " converts to numpy array\n",
806
+ " Parameters\n",
807
+ " data : array\n",
808
+ " \"\"\"\n",
809
+ " return np.array(data)\n",
810
+ "\n",
811
+ "def perfMetric(metric, yActual, yPred, clabels=None):\n",
812
+ " \"\"\"\n",
813
+ " predictive model accuracy metric\n",
814
+ " Parameters\n",
815
+ " metric : accuracy metric\n",
816
+ " yActual : actual values array\n",
817
+ " yPred : predicted values array\n",
818
+ " clabels : class labels\n",
819
+ " \"\"\"\n",
820
+ " if metric == \"rsquare\":\n",
821
+ " score = metrics.r2_score(yActual, yPred)\n",
822
+ " elif metric == \"mae\":\n",
823
+ " score = metrics.mean_absolute_error(yActual, yPred)\n",
824
+ " elif metric == \"mse\":\n",
825
+ " score = metrics.mean_squared_error(yActual, yPred)\n",
826
+ " elif metric == \"acc\":\n",
827
+ " yPred = np.rint(yPred)\n",
828
+ " score = metrics.accuracy_score(yActual, yPred)\n",
829
+ " elif metric == \"mlAcc\":\n",
830
+ " yPred = np.argmax(yPred, axis=1)\n",
831
+ " score = metrics.accuracy_score(yActual, yPred)\n",
832
+ " elif metric == \"prec\":\n",
833
+ " yPred = np.argmax(yPred, axis=1)\n",
834
+ " score = metrics.precision_score(yActual, yPred)\n",
835
+ " elif metric == \"rec\":\n",
836
+ " yPred = np.argmax(yPred, axis=1)\n",
837
+ " score = metrics.recall_score(yActual, yPred)\n",
838
+ " elif metric == \"fone\":\n",
839
+ " yPred = np.argmax(yPred, axis=1)\n",
840
+ " score = metrics.f1_score(yActual, yPred)\n",
841
+ " elif metric == \"confm\":\n",
842
+ " yPred = np.argmax(yPred, axis=1)\n",
843
+ " score = metrics.confusion_matrix(yActual, yPred)\n",
844
+ " elif metric == \"clarep\":\n",
845
+ " yPred = np.argmax(yPred, axis=1)\n",
846
+ " score = metrics.classification_report(yActual, yPred)\n",
847
+ " elif metric == \"bce\":\n",
848
+ " if clabels is None:\n",
849
+ " clabels = [0, 1]\n",
850
+ " score = metrics.log_loss(yActual, yPred, labels=clabels)\n",
851
+ " elif metric == \"ce\":\n",
852
+ " assert clabels is not None, \"labels must be provided\"\n",
853
+ " score = metrics.log_loss(yActual, yPred, labels=clabels)\n",
854
+ " else:\n",
855
+ " exitWithMsg(\"invalid prediction performance metric \" + metric)\n",
856
+ " return score\n",
857
+ "\n",
858
+ "def scaleData(data, method):\n",
859
+ " \"\"\"\n",
860
+ " scales feature data column wise\n",
861
+ " Parameters\n",
862
+ " data : 2D array\n",
863
+ " method : scaling method\n",
864
+ " \"\"\"\n",
865
+ " if method == \"minmax\":\n",
866
+ " scaler = preprocessing.MinMaxScaler()\n",
867
+ " data = scaler.fit_transform(data)\n",
868
+ " elif method == \"zscale\":\n",
869
+ " data = preprocessing.scale(data)\t\n",
870
+ " else:\n",
871
+ " raise ValueError(\"invalid scaling method\")\t\n",
872
+ " return data\n",
873
+ "\n",
874
+ "def scaleDataWithParams(data, method, scParams):\n",
875
+ " \"\"\"\n",
876
+ " scales feature data column wise\n",
877
+ " Parameters\n",
878
+ " data : 2D array\n",
879
+ " method : scaling method\n",
880
+ " scParams : scaling parameters\n",
881
+ " \"\"\"\n",
882
+ " if method == \"minmax\":\n",
883
+ " data = scaleMinMaxTabData(data, scParams)\n",
884
+ " elif method == \"zscale\":\n",
885
+ " raise ValueError(\"invalid scaling method\")\t\n",
886
+ " else:\n",
887
+ " raise ValueError(\"invalid scaling method\")\t\n",
888
+ " return data\n",
889
+ "\n",
890
+ "\n",
891
+ "def scaleMinMaxTabData(tdata, minMax):\n",
892
+ " \"\"\"\n",
893
+ " for tabular data, scales feature data column wise using min max values for each field\n",
894
+ " Parameters\n",
895
+ " tdata : 2D array\n",
896
+ " minMax : min, max and range for each column\n",
897
+ " \"\"\"\n",
898
+ " stdata = list()\n",
899
+ " for r in tdata:\n",
900
+ " srdata = list()\n",
901
+ " for i, c in enumerate(r):\n",
902
+ " sd = (c - minMax[i][0]) / minMax[i][2]\n",
903
+ " srdata.append(sd)\n",
904
+ " stdata.append(srdata)\n",
905
+ " return stdata\n",
906
+ "\n",
907
+ "def scaleMinMax(rdata, minMax):\n",
908
+ " \"\"\"\n",
909
+ " scales feature data column wise using min max values for each field\n",
910
+ " Parameters\n",
911
+ " rdata : data array\n",
912
+ " minMax : min, max and range for each column\n",
913
+ " \"\"\"\n",
914
+ " srdata = list()\n",
915
+ " for i in range(len(rdata)):\n",
916
+ " d = rdata[i]\n",
917
+ " sd = (d - minMax[i][0]) / minMax[i][2]\n",
918
+ " srdata.append(sd)\n",
919
+ " return srdata\n",
920
+ "\n",
921
+ "def harmonicNum(n):\n",
922
+ " \"\"\"\n",
923
+ " harmonic number\n",
924
+ " Parameters\n",
925
+ " n : number\n",
926
+ " \"\"\"\n",
927
+ " h = 0\n",
928
+ " for i in range(1, n+1, 1):\n",
929
+ " h += 1.0 / i\n",
930
+ " return h\n",
931
+ "\n",
932
+ "def digammaFun(n):\n",
933
+ " \"\"\"\n",
934
+ " digamma function\n",
935
+ " Parameters\n",
936
+ " n : number\n",
937
+ " \"\"\"\n",
938
+ " #Euler Mascheroni constant\n",
939
+ " ec = 0.577216\n",
940
+ " return harmonicNum(n - 1) - ec\n",
941
+ "\n",
942
+ "def getDataPartitions(tdata, types, columns = None):\n",
943
+ " \"\"\"\n",
944
+ " partitions data with the given columns and random split point defined with predicates\n",
945
+ " Parameters\n",
946
+ " tdata : 2D array\n",
947
+ " types : data types\n",
948
+ " columns : column indexes\n",
949
+ " \"\"\"\n",
950
+ " (dtypes, cvalues) = extractTypesFromString(types)\n",
951
+ " if columns is None:\n",
952
+ " ncol = len(tdata[0])\n",
953
+ " columns = list(range(ncol))\n",
954
+ " ncol = len(columns)\n",
955
+ " #print(columns)\n",
956
+ "\n",
957
+ " # partition predicates\n",
958
+ " partitions = None\n",
959
+ " for c in columns:\n",
960
+ " #print(c)\n",
961
+ " dtype = dtypes[c]\n",
962
+ " pred = list()\n",
963
+ " if dtype == \"int\" or dtype == \"float\":\n",
964
+ " (vmin, vmax) = getColMinMax(tdata, c)\n",
965
+ " r = vmax - vmin\n",
966
+ " rmin = vmin + .2 * r\n",
967
+ " rmax = vmax - .2 * r\n",
968
+ " sp = randomFloat(rmin, rmax)\n",
969
+ " if dtype == \"int\":\n",
970
+ " sp = int(sp)\n",
971
+ " else:\n",
972
+ " sp = \"{:.3f}\".format(sp)\n",
973
+ " sp = float(sp)\n",
974
+ " pred.append([c, \"LT\", sp])\n",
975
+ " pred.append([c, \"GE\", sp])\n",
976
+ " elif dtype == \"cat\":\n",
977
+ " cv = cvalues[c]\n",
978
+ " card = len(cv) \n",
979
+ " if card < 3:\n",
980
+ " num = 1\n",
981
+ " else:\n",
982
+ " num = randomInt(1, card - 1)\n",
983
+ " sp = selectRandomSubListFromList(cv, num)\n",
984
+ " sp = \" \".join(sp)\n",
985
+ " pred.append([c, \"IN\", sp])\n",
986
+ " pred.append([c, \"NOTIN\", sp])\n",
987
+ "\n",
988
+ " #print(pred)\n",
989
+ " if partitions is None:\n",
990
+ " partitions = pred.copy()\n",
991
+ " #print(\"initial\")\n",
992
+ " #print(partitions)\n",
993
+ " else:\n",
994
+ " #print(\"extension\")\n",
995
+ " tparts = list()\n",
996
+ " for p in partitions:\n",
997
+ " #print(p)\n",
998
+ " l1 = p.copy()\n",
999
+ " l1.extend(pred[0])\n",
1000
+ " l2 = p.copy()\n",
1001
+ " l2.extend(pred[1])\n",
1002
+ " #print(\"after extension\")\n",
1003
+ " #print(l1)\n",
1004
+ " #print(l2)\n",
1005
+ " tparts.append(l1)\n",
1006
+ " tparts.append(l2)\n",
1007
+ " partitions = tparts\t\n",
1008
+ " #print(\"extending\")\n",
1009
+ " #print(partitions)\n",
1010
+ "\n",
1011
+ " #for p in partitions:\n",
1012
+ " #print(p)\t\n",
1013
+ " return partitions\t\t\t\n",
1014
+ "\n",
1015
+ "def genAlmostUniformDistr(size, nswap=50):\n",
1016
+ " \"\"\"\n",
1017
+ " generate probability distribution\n",
1018
+ "\n",
1019
+ " Parameters\n",
1020
+ " size : distr size\n",
1021
+ " nswap : no of mass swaps\n",
1022
+ " \"\"\"\n",
1023
+ " un = 1.0 / size\n",
1024
+ " distr = [un] * size\n",
1025
+ " distr = mutDistr(distr, 0.1 * un, nswap)\n",
1026
+ " return distr\n",
1027
+ "\n",
1028
+ "def mutDistr(distr, shift, nswap=50):\n",
1029
+ " \"\"\"\n",
1030
+ " mutates a probability distribution\n",
1031
+ "\n",
1032
+ " Parameters\n",
1033
+ " distr : distribution\n",
1034
+ " shift : amount of shift for swap\n",
1035
+ " nswap : no of mass swaps\n",
1036
+ " \"\"\"\n",
1037
+ " size = len(distr)\n",
1038
+ " for _ in range(nswap):\n",
1039
+ " fi = randomInt(0, size -1)\n",
1040
+ " si = randomInt(0, size -1)\n",
1041
+ " while fi == si:\n",
1042
+ " fi = randomInt(0, size -1)\n",
1043
+ " si = randomInt(0, size -1)\n",
1044
+ "\n",
1045
+ " shift = randomFloat(0, shift)\n",
1046
+ " t = distr[fi]\n",
1047
+ " distr[fi] -= shift\n",
1048
+ " if (distr[fi] < 0):\n",
1049
+ " distr[fi] = 0.0\n",
1050
+ " shift = t\n",
1051
+ " distr[si] += shift\n",
1052
+ " return distr\n",
1053
+ "\n",
1054
+ "def generateBinDistribution(size, ntrue):\n",
1055
+ " \"\"\"\n",
1056
+ " generate binary array with some elements set to 1\n",
1057
+ "\n",
1058
+ " Parameters\n",
1059
+ " size : distr size\n",
1060
+ " ntrue : no of true values\n",
1061
+ " \"\"\"\n",
1062
+ " distr = [0] * size\n",
1063
+ " idxs = selectRandomSubListFromList(list(range(size)), ntrue)\n",
1064
+ " for i in idxs:\n",
1065
+ " distr[i] = 1\n",
1066
+ " return distr\n",
1067
+ "\n",
1068
+ "def mutBinaryDistr(distr, nmut):\n",
1069
+ " \"\"\"\n",
1070
+ " mutate binary distribution\n",
1071
+ "\n",
1072
+ " Parameters\n",
1073
+ " distr : distr\n",
1074
+ " nmut : no of mutations\n",
1075
+ " \"\"\"\n",
1076
+ " idxs = selectRandomSubListFromList(list(range(len(distr))), nmut)\n",
1077
+ " for i in idxs:\n",
1078
+ " distr[i] = distr[i] ^ 1\n",
1079
+ "\n",
1080
+ "\n",
1081
+ "def fileSelFieldSubSeqModifierGen(filePath, column, offset, seqLen, modifier, precision, delim=\",\"):\n",
1082
+ " \"\"\"\n",
1083
+ " file record generator that superimposes given data in the specified segment of a column\n",
1084
+ " Parameters\n",
1085
+ " filePath : file path\n",
1086
+ " column : column index \n",
1087
+ " offset : offset into column values\n",
1088
+ " seqLen : length of subseq\n",
1089
+ " modifier : data to be superimposed either list or a sampler object\n",
1090
+ " precision : floating point precision\n",
1091
+ " delim : delimiter\n",
1092
+ " \"\"\"\n",
1093
+ " beg = offset\n",
1094
+ " end = beg + seqLen\n",
1095
+ " isList = type(modifier) == list\n",
1096
+ " i = 0\n",
1097
+ " for rec in fileRecGen(filePath, delim):\n",
1098
+ " if i >= beg and i < end:\n",
1099
+ " va = float(rec[column])\n",
1100
+ " if isList:\n",
1101
+ " va += modifier[i - beg] \n",
1102
+ " else:\n",
1103
+ " va += modifier.sample()\n",
1104
+ " rec[column] = formatFloat(precision, va)\n",
1105
+ " yield delim.join(rec)\n",
1106
+ " i += 1\n",
1107
+ "\n",
1108
+ "class ShiftedDataGenerator:\n",
1109
+ " \"\"\"\n",
1110
+ " transforms data for distribution shift\n",
1111
+ " \"\"\"\n",
1112
+ " def __init__(self, types, tdata, addFact, multFact):\n",
1113
+ " \"\"\"\n",
1114
+ " initializer\n",
1115
+ "\n",
1116
+ " Parameters\n",
1117
+ " types : data types\n",
1118
+ " tdata : 2D array\n",
1119
+ " addFact : factor for data shift\n",
1120
+ " multFact : factor for data scaling\n",
1121
+ " \"\"\"\n",
1122
+ " (self.dtypes, self.cvalues) = extractTypesFromString(types)\n",
1123
+ "\n",
1124
+ " self.limits = dict()\n",
1125
+ " for k,v in self.dtypes.items():\n",
1126
+ " if v == \"int\" or v == \"float\":\n",
1127
+ " (vmin, vmax) = getColMinMax(tdata, k)\n",
1128
+ " self.limits[k] = vmax - vmin\n",
1129
+ " self.addMin = - addFact / 2\n",
1130
+ " self.addMax = addFact / 2\n",
1131
+ " self.multMin = 1.0 - multFact / 2\n",
1132
+ " self.multMax = 1.0 + multFact / 2\n",
1133
+ "\n",
1134
+ "\n",
1135
+ "\n",
1136
+ "\n",
1137
+ " def transform(self, tdata):\n",
1138
+ " \"\"\"\n",
1139
+ " linearly transforms data to create distribution shift with random shift and scale\n",
1140
+ " Parameters\n",
1141
+ " types : data types\n",
1142
+ " \"\"\"\n",
1143
+ " transforms = dict()\n",
1144
+ " for k,v in self.dtypes.items():\n",
1145
+ " if v == \"int\" or v == \"float\":\t\t\t\t\n",
1146
+ " shift = randomFloat(self.addMin, self.addMax) * self.limits[k] \n",
1147
+ " scale = randomFloat(self.multMin, self.multMax)\n",
1148
+ " trns = (shift, scale)\n",
1149
+ " transforms[k] = trns\n",
1150
+ " elif v == \"cat\":\n",
1151
+ " transforms[k] = isEventSampled(50)\n",
1152
+ "\n",
1153
+ " ttdata = list()\n",
1154
+ " for rec in tdata:\n",
1155
+ " nrec = rec.copy()\n",
1156
+ " for c in range(len(rec)):\n",
1157
+ " if c in self.dtypes:\n",
1158
+ " dtype = self.dtypes[c]\n",
1159
+ " if dtype == \"int\" or dtype == \"float\":\n",
1160
+ " (shift, scale) = transforms[c]\n",
1161
+ " nval = shift + rec[c] * scale\n",
1162
+ " if dtype == \"int\":\n",
1163
+ " nrec[c] = int(nval)\n",
1164
+ " else:\n",
1165
+ " nrec[c] = nval\n",
1166
+ " elif dtype == \"cat\":\n",
1167
+ " cv = self.cvalues[c]\n",
1168
+ " if transforms[c]:\n",
1169
+ " nval = selectOtherRandomFromList(cv, rec[c])\n",
1170
+ " nrec[c] = nval\n",
1171
+ "\n",
1172
+ " ttdata.append(nrec)\n",
1173
+ "\n",
1174
+ " return ttdata\n",
1175
+ "\n",
1176
+ " def transformSpecified(self, tdata, sshift, scale):\n",
1177
+ " \"\"\"\n",
1178
+ " linearly transforms data to create distribution shift with specified shift and scale\n",
1179
+ " Parameters\n",
1180
+ " types : data types\n",
1181
+ " sshift : shift factor\n",
1182
+ " scale : scale factor\n",
1183
+ " \"\"\"\n",
1184
+ " transforms = dict()\n",
1185
+ " for k,v in self.dtypes.items():\n",
1186
+ " if v == \"int\" or v == \"float\":\t\t\t\t\n",
1187
+ " shift = sshift * self.limits[k] \n",
1188
+ " trns = (shift, scale)\n",
1189
+ " transforms[k] = trns\n",
1190
+ " elif v == \"cat\":\n",
1191
+ " transforms[k] = isEventSampled(50)\n",
1192
+ "\n",
1193
+ " ttdata = self.__scaleShift(tdata, transforms)\n",
1194
+ " return ttdata\n",
1195
+ "\n",
1196
+ " def __scaleShift(self, tdata, transforms):\n",
1197
+ " \"\"\"\n",
1198
+ " shifts and scales tabular data\n",
1199
+ "\n",
1200
+ " Parameters\n",
1201
+ " tdata : 2D array\n",
1202
+ " transforms : transforms to apply\n",
1203
+ " \"\"\"\n",
1204
+ " ttdata = list()\n",
1205
+ " for rec in tdata:\n",
1206
+ " nrec = rec.copy()\n",
1207
+ " for c in range(len(rec)):\n",
1208
+ " if c in self.dtypes:\n",
1209
+ " dtype = self.dtypes[c]\n",
1210
+ " if dtype == \"int\" or dtype == \"float\":\n",
1211
+ " (shift, scale) = transforms[c]\n",
1212
+ " nval = shift + rec[c] * scale\n",
1213
+ " if dtype == \"int\":\n",
1214
+ " nrec[c] = int(nval)\n",
1215
+ " else:\n",
1216
+ " nrec[c] = nval\n",
1217
+ " elif dtype == \"cat\":\n",
1218
+ " cv = self.cvalues[c]\n",
1219
+ " if transforms[c]:\n",
1220
+ " #nval = selectOtherRandomFromList(cv, rec[c])\n",
1221
+ " #nrec[c] = nval\n",
1222
+ " pass\n",
1223
+ "\n",
1224
+ " ttdata.append(nrec)\n",
1225
+ " return ttdata\n",
1226
+ "\n",
1227
+ "class RollingStat(object):\n",
1228
+ " \"\"\"\n",
1229
+ " stats for rolling window\n",
1230
+ " \"\"\"\n",
1231
+ " def __init__(self, wsize):\n",
1232
+ " \"\"\"\n",
1233
+ " initializer\n",
1234
+ "\n",
1235
+ " Parameters\n",
1236
+ " wsize : window size\n",
1237
+ " \"\"\"\n",
1238
+ " self.window = list()\n",
1239
+ " self.wsize = wsize\n",
1240
+ " self.mean = None\n",
1241
+ " self.sd = None\n",
1242
+ "\n",
1243
+ " def add(self, value):\n",
1244
+ " \"\"\"\n",
1245
+ " add a value\n",
1246
+ "\n",
1247
+ " Parameters\n",
1248
+ " value : value to add\n",
1249
+ " \"\"\"\n",
1250
+ " self.window.append(value)\n",
1251
+ " if len(self.window) > self.wsize:\n",
1252
+ " self.window = self.window[1:]\n",
1253
+ "\n",
1254
+ " def getStat(self):\n",
1255
+ " \"\"\"\n",
1256
+ " get rolling window mean and std deviation\n",
1257
+ " \"\"\"\n",
1258
+ " assertGreater(len(self.window), 0, \"window is empty\")\n",
1259
+ " if len(self.window) == 1:\n",
1260
+ " self.mean = self.window[0]\n",
1261
+ " self.sd = 0\n",
1262
+ " else:\n",
1263
+ " self.mean = statistics.mean(self.window)\n",
1264
+ " self.sd = statistics.stdev(self.window, xbar=self.mean)\n",
1265
+ " re = (self.mean, self.sd)\n",
1266
+ " return re\n",
1267
+ "\n",
1268
+ " def getSize(self):\n",
1269
+ " \"\"\"\n",
1270
+ " return window size\n",
1271
+ " \"\"\"\n",
1272
+ " return len(self.window)\n"
1273
+ ]
1274
+ }
1275
+ ],
1276
+ "metadata": {
1277
+ "kernelspec": {
1278
+ "display_name": "Python 3 (ipykernel)",
1279
+ "language": "python",
1280
+ "name": "python3"
1281
+ },
1282
+ "language_info": {
1283
+ "codemirror_mode": {
1284
+ "name": "ipython",
1285
+ "version": 3
1286
+ },
1287
+ "file_extension": ".py",
1288
+ "mimetype": "text/x-python",
1289
+ "name": "python",
1290
+ "nbconvert_exporter": "python",
1291
+ "pygments_lexer": "ipython3",
1292
+ "version": "3.9.12"
1293
+ }
1294
+ },
1295
+ "nbformat": 4,
1296
+ "nbformat_minor": 5
1297
+ }
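A note on usage: the Configuration class above reads Java-style .properties files through jprops, treating the literal string "none" as a missing value and "_" as "fall back to the default supplied in defValues". A minimal sketch of how it behaves, assuming lib/mlutil.py (the module this notebook checkpoint mirrors) is importable and jprops is installed; the property names and sample.properties file here are hypothetical, made up for illustration:

import os, tempfile
from mlutil import Configuration  # assumption: lib/ is on sys.path

# hypothetical properties file: "_" -> use default, "none" -> absent
path = os.path.join(tempfile.mkdtemp(), "sample.properties")
with open(path, "w") as f:
    f.write("common.num.samp=200\ncommon.num.feat=_\ncommon.verbose=none\n")

defValues = {
    "common.num.samp": (100, None),  # default 100
    "common.num.feat": (5, None),    # default 5
    "common.verbose": (False, None),
}
config = Configuration(path, defValues)
print(config.getIntConfig("common.num.samp"))     # (200, False)  explicit value
print(config.getIntConfig("common.num.feat"))     # (5, True)     default applied
print(config.getBooleanConfig("common.verbose"))  # (None, False) "none" sentinel

Each getter returns a (value, isDefault) tuple, which is why call sites throughout the notebook index the result with [0].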
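The distance and similarity helpers are plain-Python reference implementations. A quick sanity check, again a sketch under the same import assumption (the Levenshtein package must be installed for the string similarity):

from mlutil import euclideanDistance, manhattanDistance, minkowskiDistance
from mlutil import jaccardSimilarity, levenshteinSimilarity

x = [1.0, 2.0, 3.0]
y = [4.0, 6.0, 8.0]
print(euclideanDistance(x, y))      # sqrt(3^2 + 4^2 + 5^2) ~ 7.0711
print(manhattanDistance(x, y))      # 3 + 4 + 5 = 12
print(minkowskiDistance(x, y, 2))   # p=2 reduces to euclidean: 7.071 (rounded)
print(jaccardSimilarity([1, 2, 3], [2, 3, 4]))     # 2 shared / (2 + 1 + 1) = 0.5
print(levenshteinSimilarity("kitten", "sitting"))  # 1 - 3/7 ~ 0.571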
lib/.ipynb_checkpoints/sampler-checkpoint.ipynb ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "cells": [],
3
+ "metadata": {},
4
+ "nbformat": 4,
5
+ "nbformat_minor": 5
6
+ }
lib/.ipynb_checkpoints/stats-checkpoint.ipynb ADDED
@@ -0,0 +1,510 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "f4cbab42",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import sys\n",
11
+ "import random \n",
12
+ "import time\n",
13
+ "import math\n",
14
+ "import numpy as np\n",
15
+ "import statistics \n",
16
+ "from util import *\n",
17
+ "\n",
18
+ "\"\"\"\n",
19
+ "histogram class\n",
20
+ "\"\"\"\n",
21
+ "class Histogram:\n",
22
+ " def __init__(self, xmin, binWidth):\n",
23
+ " \"\"\"\n",
24
+ " initializer\n",
25
+ "\n",
26
+ " Parameters\n",
27
+ " xmin : min x\n",
28
+ " binWidth : bin width\n",
29
+ " \"\"\"\n",
30
+ " self.xmin = xmin\n",
31
+ " self.binWidth = binWidth\n",
32
+ " self.normalized = False\n",
33
+ "\n",
34
+ " @classmethod\n",
35
+ " def createInitialized(cls, xmin, binWidth, values):\n",
36
+ " \"\"\"\n",
37
+ " create histogram instance with min domain, bin width and values\n",
38
+ "\n",
39
+ " Parameters\n",
40
+ " xmin : min x\n",
41
+ " binWidth : bin width\n",
42
+ " values : y values\n",
43
+ " \"\"\"\n",
44
+ " instance = cls(xmin, binWidth)\n",
45
+ " instance.xmax = xmin + binWidth * (len(values) - 1)\n",
46
+ " instance.ymin = 0\n",
47
+ " instance.bins = np.array(values)\n",
48
+ " instance.fmax = 0\n",
49
+ " for v in values:\n",
50
+ " if (v > instance.fmax):\n",
51
+ " instance.fmax = v\n",
52
+ " instance.ymin = 0.0\n",
53
+ " instance.ymax = instance.fmax\n",
54
+ " return instance\n",
55
+ "\n",
56
+ " @classmethod\n",
57
+ " def createWithNumBins(cls, values, numBins=20):\n",
58
+ " \"\"\"\n",
59
+ " create histogram instance with values and no of bins\n",
60
+ "\n",
61
+ " Parameters\n",
62
+ " values : y values\n",
63
+ " numBins : no of bins\n",
64
+ " \"\"\"\n",
65
+ " xmin = min(values)\n",
66
+ " xmax = max(values)\n",
67
+ " binWidth = (xmax + .01 - (xmin - .01)) / numBins\n",
68
+ " instance = cls(xmin, binWidth)\n",
69
+ " instance.xmax = xmax\n",
70
+ " instance.numBin = numBins\n",
71
+ " instance.bins = np.zeros(instance.numBin)\n",
72
+ " for v in values:\n",
73
+ " instance.add(v)\n",
74
+ " return instance\n",
75
+ "\n",
76
+ " @classmethod\n",
77
+ " def createUninitialized(cls, xmin, xmax, binWidth):\n",
78
+ " \"\"\"\n",
79
+ " create histogram instance with no y values using domain min, max and bin width\n",
80
+ "\n",
81
+ " Parameters\n",
82
+ " xmin : min x\n",
83
+ " xmax : max x\n",
84
+ " binWidth : bin width\n",
85
+ " \"\"\"\n",
86
+ " instance = cls(xmin, binWidth)\n",
87
+ " instance.xmax = xmax\n",
88
+ " instance.numBin = int((xmax - xmin) / binWidth) + 1\n",
89
+ " instance.bins = np.zeros(instance.numBin)\n",
90
+ " return instance\n",
91
+ "\n",
92
+ " def initialize(self):\n",
93
+ " \"\"\"\n",
94
+ " set y values to 0\n",
95
+ " \"\"\"\n",
96
+ " self.bins = np.zeros(self.numBin)\n",
97
+ "\n",
98
+ " def add(self, value):\n",
99
+ " \"\"\"\n",
100
+ " adds a value to a bin\n",
101
+ "\n",
102
+ " Parameters\n",
103
+ " value : value\n",
104
+ " \"\"\"\n",
105
+ " bin = int((value - self.xmin) / self.binWidth)\n",
106
+ " if (bin < 0 or bin > self.numBin - 1):\n",
107
+ " print (bin)\n",
108
+ " raise ValueError(\"outside histogram range\")\n",
109
+ " self.bins[bin] += 1.0\n",
110
+ "\n",
111
+ " def normalize(self):\n",
112
+ " \"\"\"\n",
113
+ " normalize bin counts\n",
114
+ " \"\"\"\n",
115
+ " if not self.normalized:\n",
116
+ " total = self.bins.sum()\n",
117
+ " self.bins = np.divide(self.bins, total)\n",
118
+ " self.normalized = True\n",
119
+ "\n",
120
+ " def cumDistr(self):\n",
121
+ " \"\"\"\n",
122
+ " cumulative distribution\n",
123
+ " \"\"\"\n",
124
+ " self.normalize()\n",
125
+ " self.cbins = np.cumsum(self.bins)\n",
126
+ " return self.cbins\n",
127
+ "\n",
128
+ " def distr(self):\n",
129
+ " \"\"\"\n",
130
+ " distr\n",
131
+ " \"\"\"\n",
132
+ " self.normalize()\n",
133
+ " return self.bins\n",
134
+ "\n",
135
+ "\n",
136
+ " def percentile(self, percent):\n",
137
+ " \"\"\"\n",
138
+ " return value corresponding to a percentile\n",
139
+ "\n",
140
+ " Parameters\n",
141
+ " percent : percentile value\n",
142
+ " \"\"\"\n",
143
+ " if getattr(self, \"cbins\", None) is None:\n",
144
+ " raise ValueError(\"cumulative distribution is not available\")\n",
145
+ "\n",
146
+ " for i,cuml in enumerate(self.cbins):\n",
147
+ " if cuml >= percent:\n",
148
+ " value = (i * self.binWidth) - (self.binWidth / 2) + \\\n",
149
+ " (percent - self.cbins[i-1]) * self.binWidth / (self.cbins[i] - self.cbins[i-1]) \n",
150
+ " break\n",
151
+ " return value\n",
152
+ "\n",
153
+ " def max(self):\n",
154
+ " \"\"\"\n",
155
+ " return max bin value \n",
156
+ " \"\"\"\n",
157
+ " return self.bins.max()\n",
158
+ "\n",
159
+ " def value(self, x):\n",
160
+ " \"\"\"\n",
161
+ " return a bin value\t\n",
162
+ "\n",
163
+ " Parameters\n",
164
+ " x : x value\n",
165
+ " \"\"\"\n",
166
+ " bin = int((x - self.xmin) / self.binWidth)\n",
167
+ " f = self.bins[bin]\n",
168
+ " return f\n",
169
+ "\n",
170
+ " def bin(self, x):\n",
171
+ " \"\"\"\n",
172
+ " return a bin index\t\n",
173
+ "\n",
174
+ " Parameters\n",
175
+ " x : x value\n",
176
+ " \"\"\"\n",
177
+ " return int((x - self.xmin) / self.binWidth)\n",
178
+ "\n",
179
+ " def cumValue(self, x):\n",
180
+ " \"\"\"\n",
181
+ " return a cumulative bin value\t\n",
182
+ "\n",
183
+ " Parameters\n",
184
+ " x : x value\n",
185
+ " \"\"\"\n",
186
+ " bin = int((x - self.xmin) / self.binWidth)\n",
187
+ " c = self.cbins[bin]\n",
188
+ " return c\n",
189
+ "\n",
190
+ "\n",
191
+ " def getMinMax(self):\n",
192
+ " \"\"\"\n",
193
+ " returns x min and x max\n",
194
+ " \"\"\"\n",
195
+ " return (self.xmin, self.xmax)\n",
196
+ "\n",
197
+ " def boundedValue(self, x):\n",
198
+ " \"\"\"\n",
199
+ " return x bounded by min and max\t\n",
200
+ "\n",
201
+ " Parameters\n",
202
+ " x : x value\n",
203
+ " \"\"\"\n",
204
+ " if x < self.xmin:\n",
205
+ " x = self.xmin\n",
206
+ " elif x > self.xmax:\n",
207
+ " x = self.xmax\n",
208
+ " return x\n",
209
+ "\n",
210
+ "\"\"\"\n",
211
+ "categorical histogram class\n",
212
+ "\"\"\"\n",
213
+ "class CatHistogram:\n",
214
+ " def __init__(self):\n",
215
+ " \"\"\"\n",
216
+ " initializer\n",
217
+ " \"\"\"\n",
218
+ " self.binCounts = dict()\n",
219
+ " self.counts = 0\n",
220
+ " self.normalized = False\n",
221
+ "\n",
222
+ " def add(self, value):\n",
223
+ " \"\"\"\n",
224
+ " adds a value to a bin\n",
225
+ "\n",
226
+ " Parameters\n",
227
+ " value : value to add\n",
228
+ " \"\"\"\n",
229
+ " addToKeyedCounter(self.binCounts, value)\n",
230
+ " self.counts += 1\t\n",
231
+ "\n",
232
+ " def normalize(self):\n",
233
+ " \"\"\"\n",
234
+ " normalize\n",
235
+ " \"\"\"\n",
236
+ " if not self.normalized:\n",
237
+ " self.binCounts = dict(map(lambda r : (r[0],r[1] / self.counts), self.binCounts.items()))\n",
238
+ " self.normalized = True\n",
239
+ "\n",
240
+ " def getMode(self):\n",
241
+ " \"\"\"\n",
242
+ " get mode\n",
243
+ " \"\"\"\n",
244
+ " maxk = None\n",
245
+ " maxv = 0\n",
246
+ " #print(self.binCounts)\n",
247
+ " for k,v in self.binCounts.items():\n",
248
+ " if v > maxv:\n",
249
+ " maxk = k\n",
250
+ " maxv = v\n",
251
+ " return (maxk, maxv)\t\n",
252
+ "\n",
253
+ " def getEntropy(self):\n",
254
+ " \"\"\"\n",
255
+ " get entropy\n",
256
+ " \"\"\"\n",
257
+ " self.normalize()\n",
258
+ " entr = 0 \n",
259
+ " #print(self.binCounts)\n",
260
+ " for k,v in self.binCounts.items():\n",
261
+ " entr -= v * math.log(v)\n",
262
+ " return entr\n",
263
+ "\n",
264
+ " def getUniqueValues(self):\n",
265
+ " \"\"\"\n",
266
+ " get unique values\n",
267
+ " \"\"\"\t\t\n",
268
+ " return list(self.binCounts.keys())\n",
269
+ "\n",
270
+ " def getDistr(self):\n",
271
+ " \"\"\"\n",
272
+ " get distribution\n",
273
+ " \"\"\"\t\n",
274
+ " self.normalize()\t\n",
275
+ " return self.binCounts.copy()\n",
276
+ "\n",
277
+ "class RunningStat:\n",
278
+ " \"\"\"\n",
279
+ " running stat class\n",
280
+ " \"\"\"\n",
281
+ " def __init__(self):\n",
282
+ " \"\"\"\n",
283
+ " initializer\t\n",
284
+ " \"\"\"\n",
285
+ " self.sum = 0.0\n",
286
+ " self.sumSq = 0.0\n",
287
+ " self.count = 0\n",
288
+ "\n",
289
+ " @staticmethod\n",
290
+ " def create(count, sum, sumSq):\n",
291
+ " \"\"\"\n",
292
+ " creates instance\t\n",
293
+ "\n",
294
+ " Parameters\n",
295
+ " sum : sum of values\n",
296
+ " sumSq : sum of values squared\n",
297
+ " \"\"\"\n",
298
+ " rs = RunningStat()\n",
299
+ " rs.sum = sum\n",
300
+ " rs.sumSq = sumSq\n",
301
+ " rs.count = count\n",
302
+ " return rs\n",
303
+ "\n",
304
+ " def add(self, value):\n",
305
+ " \"\"\"\n",
306
+ " adds new value\n",
307
+ " Parameters\n",
308
+ " value : value to add\n",
309
+ " \"\"\"\n",
310
+ " self.sum += value\n",
311
+ " self.sumSq += (value * value)\n",
312
+ " self.count += 1\n",
313
+ "\n",
314
+ " def getStat(self):\n",
315
+ " \"\"\"\n",
316
+ " return mean and std deviation \n",
317
+ " \"\"\"\n",
318
+ " mean = self.sum /self. count\n",
319
+ " t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)\n",
320
+ " sd = math.sqrt(t)\n",
321
+ " re = (mean, sd)\n",
322
+ " return re\n",
323
+ "\n",
324
+ " def addGetStat(self,value):\n",
325
+ " \"\"\"\n",
326
+ " calculate mean and std deviation with new value added\n",
327
+ " Parameters\n",
328
+ " value : value to add\n",
329
+ " \"\"\"\n",
330
+ " self.add(value)\n",
331
+ " re = self.getStat()\n",
332
+ " return re\n",
333
+ "\n",
334
+ " def getCount(self):\n",
335
+ " \"\"\"\n",
336
+ " return count\n",
337
+ " \"\"\"\n",
338
+ " return self.count\n",
339
+ "\n",
340
+ " def getState(self):\n",
341
+ " \"\"\"\n",
342
+ " return state\n",
343
+ " \"\"\"\n",
344
+ " s = (self.count, self.sum, self.sumSq)\n",
345
+ " return s\n",
346
+ "\n",
347
+ "class SlidingWindowStat:\n",
348
+ " \"\"\"\n",
349
+ " sliding window stats\n",
350
+ " \"\"\"\n",
351
+ " def __init__(self):\n",
352
+ " \"\"\"\n",
353
+ " initializer\n",
354
+ " \"\"\"\n",
355
+ " self.sum = 0.0\n",
356
+ " self.sumSq = 0.0\n",
357
+ " self.count = 0\n",
358
+ " self.values = None\n",
359
+ "\n",
360
+ " @staticmethod\n",
361
+ " def create(values, sum, sumSq):\n",
362
+ " \"\"\"\n",
363
+ " creates iinstance\t\n",
364
+ "\n",
365
+ " Parameters\n",
366
+ " sum : sum of values\n",
367
+ " sumSq : sum of valure squared\n",
368
+ " \"\"\"\n",
369
+ " sws = SlidingWindowStat()\n",
370
+ " sws.sum = sum\n",
371
+ " sws.sumSq = sumSq\n",
372
+ " self.values = values.copy()\n",
373
+ " sws.count = len(self.values)\n",
374
+ " return sws\n",
375
+ "\n",
376
+ " @staticmethod\n",
377
+ " def initialize(values):\n",
378
+ " \"\"\"\n",
379
+ " creates iinstance\t\n",
380
+ "\n",
381
+ " Parameters\n",
382
+ " values : list of values\n",
383
+ " \"\"\"\n",
384
+ " sws = SlidingWindowStat()\n",
385
+ " sws.values = values.copy()\n",
386
+ " for v in sws.values:\n",
387
+ " sws.sum += v\n",
388
+ " sws.sumSq += v * v\t\t\n",
389
+ " sws.count = len(sws.values)\n",
390
+ " return sws\n",
391
+ "\n",
392
+ " @staticmethod\n",
393
+ " def createEmpty(count):\n",
394
+ " \"\"\"\n",
395
+ " creates iinstance\t\n",
396
+ "\n",
397
+ " Parameters\n",
398
+ " count : count of values\n",
399
+ " \"\"\"\n",
400
+ " sws = SlidingWindowStat()\n",
401
+ " sws.count = count\n",
402
+ " sws.values = list()\n",
403
+ " return sws\n",
404
+ "\n",
405
+ " def add(self, value):\n",
406
+ " \"\"\"\n",
407
+ " adds new value\n",
408
+ "\n",
409
+ " Parameters\n",
410
+ " value : value to add\n",
411
+ " \"\"\"\n",
412
+ " self.values.append(value)\t\t\n",
413
+ " if len(self.values) > self.count:\n",
414
+ " self.sum += value - self.values[0]\n",
415
+ " self.sumSq += (value * value) - (self.values[0] * self.values[0])\n",
416
+ " self.values.pop(0)\n",
417
+ " else:\n",
418
+ " self.sum += value\n",
419
+ " self.sumSq += (value * value)\n",
420
+ "\n",
421
+ "\n",
422
+ " def getStat(self):\n",
423
+ " \"\"\"\n",
424
+ " calculate mean and std deviation \n",
425
+ " \"\"\"\n",
426
+ " mean = self.sum /self. count\n",
427
+ " t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)\n",
428
+ " sd = math.sqrt(t)\n",
429
+ " re = (mean, sd)\n",
430
+ " return re\n",
431
+ "\n",
432
+ " def addGetStat(self,value):\n",
433
+ " \"\"\"\n",
434
+ " calculate mean and std deviation with new value added\n",
435
+ " \"\"\"\n",
436
+ " self.add(value)\n",
437
+ " re = self.getStat()\n",
438
+ " return re\n",
439
+ "\n",
440
+ " def getCount(self):\n",
441
+ " \"\"\"\n",
442
+ " return count\n",
443
+ " \"\"\"\n",
444
+ " return self.count\n",
445
+ "\n",
446
+ " def getCurSize(self):\n",
447
+ " \"\"\"\n",
448
+ " return count\n",
449
+ " \"\"\"\n",
450
+ " return len(self.values)\n",
451
+ "\n",
452
+ " def getState(self):\n",
453
+ " \"\"\"\n",
454
+ " return state\n",
455
+ " \"\"\"\n",
456
+ " s = (self.count, self.sum, self.sumSq)\n",
457
+ " return s\n",
458
+ "\n",
459
+ "\n",
460
+ "def basicStat(ldata):\n",
461
+ " \"\"\"\n",
462
+ " mean and std dev\n",
463
+ " Parameters\n",
464
+ " ldata : list of values\n",
465
+ " \"\"\"\n",
466
+ " m = statistics.mean(ldata)\n",
467
+ " s = statistics.stdev(ldata, xbar=m)\n",
468
+ " r = (m, s)\n",
469
+ " return r\n",
470
+ "\n",
471
+ "def getFileColumnStat(filePath, col, delem=\",\"):\n",
472
+ " \"\"\"\n",
473
+ " gets stats for a file column\n",
474
+ "\n",
475
+ " Parameters\n",
476
+ " filePath : file path\n",
477
+ " col : col index\n",
478
+ " delem : field delemter\n",
479
+ " \"\"\"\n",
480
+ " rs = RunningStat()\n",
481
+ " for rec in fileRecGen(filePath, delem):\n",
482
+ " va = float(rec[col])\n",
483
+ " rs.add(va)\n",
484
+ "\n",
485
+ " return rs.getStat()\n"
486
+ ]
487
+ }
488
+ ],
489
+ "metadata": {
490
+ "kernelspec": {
491
+ "display_name": "Python 3 (ipykernel)",
492
+ "language": "python",
493
+ "name": "python3"
494
+ },
495
+ "language_info": {
496
+ "codemirror_mode": {
497
+ "name": "ipython",
498
+ "version": 3
499
+ },
500
+ "file_extension": ".py",
501
+ "mimetype": "text/x-python",
502
+ "name": "python",
503
+ "nbconvert_exporter": "python",
504
+ "pygments_lexer": "ipython3",
505
+ "version": "3.9.12"
506
+ }
507
+ },
508
+ "nbformat": 4,
509
+ "nbformat_minor": 5
510
+ }
lib/.ipynb_checkpoints/tnn-checkpoint.ipynb ADDED
@@ -0,0 +1,800 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "3853095d",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import sys\n",
12
+ "import matplotlib.pyplot as plt\n",
13
+ "import numpy as np\n",
14
+ "import torch\n",
15
+ "from torch.autograd import Variable\n",
16
+ "from torch.utils.data import Dataset, TensorDataset\n",
17
+ "from torch.utils.data import DataLoader\n",
18
+ "import sklearn as sk\n",
19
+ "from sklearn.neighbors import KDTree\n",
20
+ "import matplotlib\n",
21
+ "import random\n",
22
+ "import jprops\n",
23
+ "from random import randint\n",
24
+ "import statistics\n",
25
+ "sys.path.append(os.path.abspath(\"../lib\"))\n",
26
+ "from util import *\n",
27
+ "from mlutil import *\n",
28
+ "\n",
29
+ "\"\"\"\n",
30
+ "forward hook function\n",
31
+ "\"\"\"\n",
32
+ "intermedOut = {}\n",
33
+ "lvalues = list()\n",
34
+ "\n",
35
+ "def hookFn(m, i, o):\n",
36
+ " \"\"\"\n",
37
+ " call back for latent values\n",
38
+ " \"\"\"\n",
39
+ " #intermedOut[m] = o\n",
40
+ " lv = o.data.cpu().numpy()\n",
41
+ " lv = lv[0].tolist()\n",
42
+ " lvalues.append(lv)\n",
43
+ " #print(lv)\n",
44
+ "\n",
45
+ "def getLatValues():\n",
46
+ " \"\"\"\n",
47
+ " \"\"\"\n",
48
+ " return lvalues\n",
49
+ "\n",
50
+ "class FeedForwardNetwork(torch.nn.Module):\n",
51
+ " def __init__(self, configFile, addDefValues=None):\n",
52
+ " \"\"\"\n",
53
+ " In the constructor we instantiate two nn.Linear modules and assign them as\n",
54
+ " member variables.\n",
55
+ "\n",
56
+ " Parameters\n",
57
+ " configFile : config file path\n",
58
+ " addDefValues : dictionary of additional default values\t\n",
59
+ " \"\"\"\n",
60
+ " defValues = dict() if addDefValues is None else addDefValues.copy()\n",
61
+ " defValues[\"common.mode\"] = (\"training\", None)\n",
62
+ " defValues[\"common.model.directory\"] = (\"model\", None)\n",
63
+ " defValues[\"common.model.file\"] = (None, None)\n",
64
+ " defValues[\"common.preprocessing\"] = (None, None)\n",
65
+ " defValues[\"common.scaling.method\"] = (\"zscale\", None)\n",
66
+ " defValues[\"common.scaling.minrows\"] = (50, None)\n",
67
+ " defValues[\"common.scaling.param.file\"] = (None, None)\n",
68
+ " defValues[\"common.verbose\"] = (False, None)\n",
69
+ " defValues[\"common.device\"] = (\"cpu\", None)\n",
70
+ " defValues[\"train.data.file\"] = (None, \"missing training data file\")\n",
71
+ " defValues[\"train.data.fields\"] = (None, \"missing training data field ordinals\")\n",
72
+ " defValues[\"train.data.feature.fields\"] = (None, \"missing training data feature field ordinals\")\n",
73
+ " defValues[\"train.data.out.fields\"] = (None, \"missing training data feature field ordinals\")\n",
74
+ " defValues[\"train.layer.data\"] = (None, \"missing layer data\")\n",
75
+ " defValues[\"train.input.size\"] = (None, None)\n",
76
+ " defValues[\"train.output.size\"] = (None, \"missing output size\")\n",
77
+ " defValues[\"train.batch.size\"] = (10, None)\n",
78
+ " defValues[\"train.loss.reduction\"] = (\"mean\", None)\n",
79
+ " defValues[\"train.num.iterations\"] = (500, None)\n",
80
+ " defValues[\"train.lossFn\"] = (\"mse\", None) \n",
81
+ " defValues[\"train.optimizer\"] = (\"sgd\", None) \n",
82
+ " defValues[\"train.opt.learning.rate\"] = (.0001, None)\n",
83
+ " defValues[\"train.opt.weight.decay\"] = (0, None) \n",
84
+ " defValues[\"train.opt.momentum\"] = (0, None) \n",
85
+ " defValues[\"train.opt.eps\"] = (1e-08, None) \n",
86
+ " defValues[\"train.opt.dampening\"] = (0, None) \n",
87
+ " defValues[\"train.opt.momentum.nesterov\"] = (False, None) \n",
88
+ " defValues[\"train.opt.betas\"] = ([0.9, 0.999], None) \n",
89
+ " defValues[\"train.opt.alpha\"] = (0.99, None) \n",
90
+ " defValues[\"train.save.model\"] = (False, None) \n",
91
+ " defValues[\"train.track.error\"] = (False, None) \n",
92
+ " defValues[\"train.epoch.intv\"] = (5, None) \n",
93
+ " defValues[\"train.batch.intv\"] = (5, None) \n",
94
+ " defValues[\"train.print.weights\"] = (False, None) \n",
95
+ " defValues[\"valid.data.file\"] = (None, None)\n",
96
+ " defValues[\"valid.accuracy.metric\"] = (None, None)\n",
97
+ " defValues[\"predict.data.file\"] = (None, None)\n",
98
+ " defValues[\"predict.use.saved.model\"] = (True, None)\n",
99
+ " defValues[\"predict.output\"] = (\"binary\", None)\n",
100
+ " defValues[\"predict.feat.pad.size\"] = (60, None)\n",
101
+ " defValues[\"predict.print.output\"] = (True, None)\n",
102
+ " defValues[\"calibrate.num.bins\"] = (10, None)\n",
103
+ " defValues[\"calibrate.pred.prob.thresh\"] = (0.5, None)\n",
104
+ " defValues[\"calibrate.num.nearest.neighbors\"] = (10, None)\n",
105
+ " self.config = Configuration(configFile, defValues)\n",
106
+ "\n",
107
+ " super(FeedForwardNetwork, self).__init__()\n",
108
+ "\n",
109
+ " def setConfigParam(self, name, value):\n",
110
+ " \"\"\"\n",
111
+ " set config param\n",
112
+ "\n",
113
+ " Parameters\n",
114
+ " name : config name\n",
115
+ " value : config value\n",
116
+ " \"\"\"\n",
117
+ " self.config.setParam(name, value)\n",
118
+ "\n",
119
+ " def getConfig(self):\n",
120
+ " \"\"\"\n",
121
+ " get config object\n",
122
+ " \"\"\"\n",
123
+ " return self.config\n",
124
+ "\n",
125
+ " def setVerbose(self, verbose):\n",
126
+ " self.verbose = verbose\n",
127
+ "\n",
128
+ " def buildModel(self):\n",
129
+ " \"\"\"\n",
130
+ " Loads configuration and builds the various piecess necessary for the model\n",
131
+ " \"\"\"\n",
132
+ " torch.manual_seed(9999)\n",
133
+ "\n",
134
+ " self.verbose = self.config.getBooleanConfig(\"common.verbose\")[0]\n",
135
+ " numinp = self.config.getIntConfig(\"train.input.size\")[0]\n",
136
+ " if numinp is None:\n",
137
+ " numinp = len(self.config.getIntListConfig(\"train.data.feature.fields\")[0])\n",
138
+ " #numOut = len(self.config.getStringConfig(\"train.data.out.fields\")[0].split(\",\"))\n",
139
+ " self.outputSize = self.config.getIntConfig(\"train.output.size\")[0]\n",
140
+ " self.batchSize = self.config.getIntConfig(\"train.batch.size\")[0]\n",
141
+ " #lossRed = self.config.getStringConfig(\"train.loss.reduction\")[0]\n",
142
+ " #learnRate = self.config.getFloatConfig(\"train.opt.learning.rate\")[0]\n",
143
+ " self.numIter = self.config.getIntConfig(\"train.num.iterations\")[0]\n",
144
+ " optimizer = self.config.getStringConfig(\"train.optimizer\")[0]\n",
145
+ " self.lossFnStr = self.config.getStringConfig(\"train.lossFn\")[0]\n",
146
+ " self.accMetric = self.config.getStringConfig(\"valid.accuracy.metric\")[0]\n",
147
+ " self.trackErr = self.config.getBooleanConfig(\"train.track.error\")[0]\n",
148
+ " self.batchIntv = self.config.getIntConfig(\"train.batch.intv\")[0]\n",
149
+ " self.restored = False\n",
150
+ " self.clabels = list(range(self.outputSize)) if self.outputSize > 1 else None\n",
151
+ "\n",
152
+ " #build network\n",
153
+ " layers = list()\n",
154
+ " ninp = numinp\n",
155
+ " trData = self.config.getStringConfig(\"train.layer.data\")[0].split(\",\")\n",
156
+ " for ld in trData:\n",
157
+ " lde = ld.split(\":\")\n",
158
+ " assert len(lde) == 5, \"expecting 5 items for layer data\"\n",
159
+ "\n",
160
+ " #num of units, activation, whether batch normalize, whether batch normalize after activation, dropout fraction\n",
161
+ " nunit = int(lde[0])\n",
162
+ " actStr = lde[1]\n",
163
+ " act = FeedForwardNetwork.createActivation(actStr) if actStr != \"none\" else None\n",
164
+ " bnorm = lde[2] == \"true\"\n",
165
+ " afterAct = lde[3] == \"true\"\n",
166
+ " dpr = float(lde[4])\n",
167
+ "\n",
168
+ " layers.append(torch.nn.Linear(ninp, nunit))\t\t\t\n",
169
+ " if bnorm:\n",
170
+ " #with batch norm\n",
171
+ " if afterAct:\n",
172
+ " safeAppend(layers, act)\n",
173
+ " layers.append(torch.nn.BatchNorm1d(nunit))\n",
174
+ " else:\n",
175
+ " layers.append(torch.nn.BatchNorm1d(nunit))\n",
176
+ " safeAppend(layers, act)\n",
177
+ " else:\n",
178
+ " #without batch norm\n",
179
+ " safeAppend(layers, act)\n",
180
+ "\n",
181
+ " if dpr > 0:\n",
182
+ " layers.append(torch.nn.Dropout(dpr))\n",
183
+ " ninp = nunit\n",
184
+ "\n",
185
+ " self.layers = torch.nn.Sequential(*layers)\t\n",
186
+ "\n",
187
+ " self.device = FeedForwardNetwork.getDevice(self)\n",
188
+ "\n",
189
+ " #training data\n",
190
+ " dataFile = self.config.getStringConfig(\"train.data.file\")[0]\n",
191
+ " (featData, outData) = FeedForwardNetwork.prepData(self, dataFile)\n",
192
+ " self.featData = torch.from_numpy(featData)\n",
193
+ " self.outData = torch.from_numpy(outData)\n",
194
+ "\n",
195
+ " #validation data\n",
196
+ " dataFile = self.config.getStringConfig(\"valid.data.file\")[0]\n",
197
+ " (featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataFile)\n",
198
+ " self.validFeatData = torch.from_numpy(featDataV)\n",
199
+ " self.validOutData = torch.from_numpy(outDataV)\n",
200
+ "\n",
201
+ " # loss function and optimizer\n",
202
+ " self.lossFn = FeedForwardNetwork.createLossFunction(self, self.lossFnStr)\n",
203
+ " self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizer)\n",
204
+ "\n",
205
+ " self.yPred = None\n",
206
+ " self.restored = False\n",
207
+ "\n",
208
+ " #mode to device\n",
209
+ " self.device = FeedForwardNetwork.getDevice(self)\t\n",
210
+ " self.featData = self.featData.to(self.device)\n",
211
+ " self.outData = self.outData.to(self.device)\n",
212
+ " self.validFeatData = self.validFeatData.to(self.device)\n",
213
+ " self.to(self.device)\n",
214
+ "\n",
215
+ " @staticmethod\n",
216
+ " def getDevice(model):\n",
217
+ " \"\"\"\n",
218
+ " gets device\n",
219
+ "\n",
220
+ " Parameters\n",
221
+ " model : torch model\n",
222
+ " \"\"\"\n",
223
+ " devType = model.config.getStringConfig(\"common.device\")[0]\n",
224
+ " if devType == \"cuda\":\n",
225
+ " if torch.cuda.is_available():\n",
226
+ " device = torch.device(\"cuda\")\n",
227
+ " else:\n",
228
+ " exitWithMsg(\"cuda not available\")\n",
229
+ " else:\n",
230
+ " device = torch.device(\"cpu\")\n",
231
+ " return device\n",
232
+ "\n",
233
+ " def setValidationData(self, dataSource, prep=True):\n",
234
+ " \"\"\"\n",
235
+ " sets validation data\n",
236
+ "\n",
237
+ " Parameters\n",
238
+ " dataSource : data source str if file path or 2D array\n",
239
+ " prep : if True load and prepare \n",
240
+ " \"\"\"\n",
241
+ " if prep:\n",
242
+ " (featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataSource)\n",
243
+ " self.validFeatData = torch.from_numpy(featDataV)\n",
244
+ " self.validOutData = outDataV\n",
245
+ " else:\n",
246
+ " self.validFeatData = torch.from_numpy(dataSource[0])\n",
247
+ " self.validOutData = dataSource[1]\t\t\n",
248
+ "\n",
249
+ " self.validFeatData = self.validFeatData.to(self.device)\n",
250
+ "\n",
251
+ " @staticmethod\n",
252
+ " def createActivation(actName):\n",
253
+ " \"\"\"\n",
254
+ " create activation\n",
255
+ "\n",
256
+ " Parameters\n",
257
+ " actName : activation name\n",
258
+ " \"\"\"\n",
259
+ " if actName is None:\n",
260
+ " activation = None\n",
261
+ " elif actName == \"relu\":\n",
262
+ " activation = torch.nn.ReLU()\n",
263
+ " elif actName == \"tanh\":\n",
264
+ " activation = torch.nn.Tanh()\n",
265
+ " elif actName == \"sigmoid\":\n",
266
+ " activation = torch.nn.Sigmoid()\n",
267
+ " elif actName == \"softmax\":\n",
268
+ " activation = torch.nn.Softmax(dim=1)\n",
269
+ " else:\n",
270
+ " exitWithMsg(\"invalid activation function name \" + actName)\n",
271
+ " return activation\n",
272
+ "\n",
273
+ " @staticmethod\n",
274
+ " def createLossFunction(model, lossFnName):\n",
275
+ " \"\"\"\n",
276
+ " create loss function\n",
277
+ "\n",
278
+ " Parameters\n",
279
+ " lossFnName : loss function name\n",
280
+ " \"\"\"\n",
281
+ " config = model.config\n",
282
+ " lossRed = config.getStringConfig(\"train.loss.reduction\")[0]\n",
283
+ " if lossFnName == \"ltwo\" or lossFnName == \"mse\":\n",
284
+ " lossFunc = torch.nn.MSELoss(reduction=lossRed)\n",
285
+ " elif lossFnName == \"ce\":\n",
286
+ " lossFunc = torch.nn.CrossEntropyLoss(reduction=lossRed)\n",
287
+ " elif lossFnName == \"lone\" or lossFnName == \"mae\":\n",
288
+ " lossFunc = torch.nn.L1Loss(reduction=lossRed)\n",
289
+ " elif lossFnName == \"bce\":\n",
290
+ " lossFunc = torch.nn.BCELoss(reduction=lossRed)\n",
291
+ " elif lossFnName == \"bcel\":\n",
292
+ " lossFunc = torch.nn.BCEWithLogitsLoss(reduction=lossRed)\n",
293
+ " elif lossFnName == \"sm\":\n",
294
+ " lossFunc = torch.nn.SoftMarginLoss(reduction=lossRed)\n",
295
+ " elif lossFnName == \"mlsm\":\n",
296
+ " lossFunc = torch.nn.MultiLabelSoftMarginLoss(reduction=lossRed)\n",
297
+ " else:\n",
298
+ " exitWithMsg(\"invalid loss function name \" + lossFnName)\n",
299
+ " return lossFunc\n",
300
+ "\n",
301
+ " @staticmethod\n",
302
+ " def createOptimizer(model, optName):\n",
303
+ " \"\"\"\n",
304
+ " create optimizer\n",
305
+ "\n",
306
+ " Parameters\n",
307
+ " optName : optimizer name\n",
308
+ " \"\"\"\n",
309
+ " config = model.config\n",
310
+ " learnRate = config.getFloatConfig(\"train.opt.learning.rate\")[0]\n",
311
+ " weightDecay = config.getFloatConfig(\"train.opt.weight.decay\")[0]\n",
312
+ " momentum = config.getFloatConfig(\"train.opt.momentum\")[0]\n",
313
+ " eps = config.getFloatConfig(\"train.opt.eps\")[0]\n",
314
+ " if optName == \"sgd\":\n",
315
+ " dampening = config.getFloatConfig(\"train.opt.dampening\")[0]\n",
316
+ " momentumNesterov = config.getBooleanConfig(\"train.opt.momentum.nesterov\")[0]\n",
317
+ " optimizer = torch.optim.SGD(model.parameters(),lr=learnRate, momentum=momentum, \n",
318
+ " dampening=dampening, weight_decay=weightDecay, nesterov=momentumNesterov)\n",
319
+ " elif optName == \"adam\":\n",
320
+ " betas = config.getFloatListConfig(\"train.opt.betas\")[0]\n",
321
+ " betas = (betas[0], betas[1]) \n",
322
+ " optimizer = torch.optim.Adam(model.parameters(), lr=learnRate,betas=betas, eps = eps,\n",
323
+ " weight_decay=weightDecay)\n",
324
+ " elif optName == \"rmsprop\":\n",
325
+ " alpha = config.getFloatConfig(\"train.opt.alpha\")[0]\n",
326
+ " optimizer = torch.optim.RMSprop(model.parameters(), lr=learnRate, alpha=alpha,\n",
327
+ " eps=eps, weight_decay=weightDecay, momentum=momentum)\n",
328
+ " else:\n",
329
+ " exitWithMsg(\"invalid optimizer name \" + optName)\n",
330
+ " return optimizer\n",
331
+ "\n",
332
+ "\n",
333
+ " def forward(self, x):\n",
334
+ " \"\"\"\n",
335
+ " In the forward function we accept a Tensor of input data and we must return\n",
336
+ " a Tensor of output data. We can use Modules defined in the constructor as\n",
337
+ " well as arbitrary (differentiable) operations on Tensors.\n",
338
+ "\n",
339
+ " Parameters\n",
340
+ " x : data batch\n",
341
+ " \"\"\"\n",
342
+ " y = self.layers(x)\t\n",
343
+ " return y\n",
344
+ "\n",
345
+ " @staticmethod\n",
346
+ " def addForwardHook(model, l, cl = 0):\n",
347
+ " \"\"\"\n",
348
+ " register forward hooks\n",
349
+ "\n",
350
+ " Parameters\n",
351
+ " l : \n",
352
+ " cl :\n",
353
+ " \"\"\"\n",
354
+ " for name, layer in model._modules.items():\n",
355
+ " #If it is a sequential, don't register a hook on it\n",
356
+ " # but recursively register hook on all it's module children\n",
357
+ " print(str(cl) + \" : \" + name)\n",
358
+ " if isinstance(layer, torch.nn.Sequential):\n",
359
+ " FeedForwardNetwork.addForwardHook(layer, l, cl)\n",
360
+ " else:\n",
361
+ " #\t it's a non sequential. Register a hook\n",
362
+ " if cl == l:\n",
363
+ " print(\"setting hook at layer \" + str(l))\n",
364
+ " layer.register_forward_hook(hookFn)\n",
365
+ " cl += 1\n",
366
+ "\n",
367
+ " @staticmethod\n",
368
+ " def prepData(model, dataSource, includeOutFld=True):\n",
369
+ " \"\"\"\n",
370
+ " loads and prepares data\n",
371
+ "\n",
372
+ " Parameters\n",
373
+ " dataSource : data source str if file path or 2D array\n",
374
+ " includeOutFld : True if target freld to be included\n",
375
+ " \"\"\"\n",
376
+ " # parameters\n",
377
+ " fieldIndices = model.config.getIntListConfig(\"train.data.fields\")[0]\n",
378
+ " featFieldIndices = model.config.getIntListConfig(\"train.data.feature.fields\")[0]\n",
379
+ "\n",
380
+ " #all data and feature data\n",
381
+ " isDataFile = isinstance(dataSource, str)\n",
382
+ " selFieldIndices = fieldIndices if includeOutFld else fieldIndices[:-1]\n",
383
+ " if isDataFile: \n",
384
+ " #source file path \n",
385
+ " (data, featData) = loadDataFile(dataSource, \",\", selFieldIndices, featFieldIndices)\n",
386
+ " else:\n",
387
+ " # tabular data\n",
388
+ " data = tableSelFieldsFilter(dataSource, selFieldIndices)\n",
389
+ " featData = tableSelFieldsFilter(data, featFieldIndices)\n",
390
+ " #print(featData)\n",
391
+ " featData = np.array(featData)\n",
392
+ "\n",
393
+ " if (model.config.getStringConfig(\"common.preprocessing\")[0] == \"scale\"):\n",
394
+ " scalingMethod = model.config.getStringConfig(\"common.scaling.method\")[0]\n",
395
+ "\n",
396
+ " #scale only if there are enough rows\n",
397
+ " nrow = featData.shape[0]\n",
398
+ " minrows = model.config.getIntConfig(\"common.scaling.minrows\")[0]\n",
399
+ " if nrow > minrows:\n",
400
+ " #in place scaling\n",
401
+ " featData = scaleData(featData, scalingMethod)\n",
402
+ " else:\n",
403
+ " #use pre computes scaling parameters\n",
404
+ " spFile = model.config.getStringConfig(\"common.scaling.param.file\")[0]\n",
405
+ " if spFile is None:\n",
406
+ " exitWithMsg(\"for small data sets pre computed scaling parameters need to provided\")\n",
407
+ " scParams = restoreObject(spFile)\n",
408
+ " featData = scaleDataWithParams(featData, scalingMethod, scParams)\n",
409
+ " featData = np.array(featData)\n",
410
+ "\n",
411
+ " # target data\n",
412
+ " if includeOutFld:\n",
413
+ " outFieldIndices = model.config.getStringConfig(\"train.data.out.fields\")[0]\n",
414
+ " outFieldIndices = strToIntArray(outFieldIndices, \",\")\n",
415
+ " if isDataFile:\n",
416
+ " outData = data[:,outFieldIndices]\n",
417
+ " else:\n",
418
+ " outData = tableSelFieldsFilter(data, outFieldIndices)\n",
419
+ " outData = np.array(outData)\n",
420
+ " foData = (featData.astype(np.float32), outData.astype(np.float32))\n",
421
+ " else:\n",
422
+ " foData = featData.astype(np.float32)\n",
423
+ " return foData\n",
424
+ "\n",
425
+ " @staticmethod\n",
426
+ " def saveCheckpt(model):\n",
427
+ " \"\"\"\n",
428
+ " checkpoints model\n",
429
+ "\n",
430
+ " Parameters\n",
431
+ " model : torch model\n",
432
+ " \"\"\"\n",
433
+ " print(\"..saving model checkpoint\")\n",
434
+ " modelDirectory = model.config.getStringConfig(\"common.model.directory\")[0]\n",
435
+ " assert os.path.exists(modelDirectory), \"model save directory does not exist\"\n",
436
+ " modelFile = model.config.getStringConfig(\"common.model.file\")[0]\n",
437
+ " filepath = os.path.join(modelDirectory, modelFile)\n",
438
+ " state = {\"state_dict\": model.state_dict(), \"optim_dict\": model.optimizer.state_dict()}\n",
439
+ " torch.save(state, filepath)\n",
440
+ " if model.verbose:\n",
441
+ " print(\"model saved\")\n",
442
+ "\n",
443
+ " @staticmethod\n",
444
+ " def restoreCheckpt(model, loadOpt=False):\n",
445
+ " \"\"\"\n",
446
+ " restored checkpointed model\n",
447
+ "\n",
448
+ " Parameters\n",
449
+ " model : torch model\n",
450
+ " loadOpt : True if optimizer to be loaded\n",
451
+ " \"\"\"\n",
452
+ " if not model.restored:\n",
453
+ " print(\"..restoring model checkpoint\")\n",
454
+ " modelDirectory = model.config.getStringConfig(\"common.model.directory\")[0]\n",
455
+ " modelFile = model.config.getStringConfig(\"common.model.file\")[0]\n",
456
+ " filepath = os.path.join(modelDirectory, modelFile)\n",
457
+ " assert os.path.exists(filepath), \"model save file does not exist\"\n",
458
+ " checkpoint = torch.load(filepath)\n",
459
+ " model.load_state_dict(checkpoint[\"state_dict\"])\n",
460
+ " model.to(model.device)\n",
461
+ " if loadOpt:\n",
462
+ " model.optimizer.load_state_dict(checkpoint[\"optim_dict\"])\n",
463
+ " model.restored = True\n",
464
+ "\n",
465
+ " @staticmethod\n",
466
+ " def processClassifOutput(yPred, config):\n",
467
+ " \"\"\"\n",
468
+ " extracts probability label 1 or label with highest probability\n",
469
+ "\n",
470
+ " Parameters\n",
471
+ " yPred : predicted output\n",
472
+ " config : config object\n",
473
+ " \"\"\"\n",
474
+ " outType = config.getStringConfig(\"predict.output\")[0]\n",
475
+ " if outType == \"prob\":\n",
476
+ " outputSize = config.getIntConfig(\"train.output.size\")[0]\n",
477
+ " if outputSize == 2:\n",
478
+ " #return prob of pos class for binary classifier \n",
479
+ " yPred = yPred[:, 1]\n",
480
+ " else:\n",
481
+ " #return class value and probability for multi classifier \n",
482
+ " yCl = np.argmax(yPred, axis=1)\n",
483
+ " yPred = list(map(lambda y : y[0][y[1]], zip(yPred, yCl)))\n",
484
+ " yPred = zip(yCl, yPred)\n",
485
+ " else:\n",
486
+ " yPred = np.argmax(yPred, axis=1)\n",
487
+ " return yPred\n",
488
+ "\n",
489
+ " @staticmethod\n",
490
+ " def printPrediction(yPred, config, dataSource):\n",
491
+ " \"\"\"\n",
492
+ " prints input feature data and prediction\n",
493
+ "\n",
494
+ " Parameters\n",
495
+ " yPred : predicted output\n",
496
+ " config : config object\n",
497
+ " dataSource : data source str if file path or 2D array\n",
498
+ " \"\"\"\n",
499
+ " #prDataFilePath = config.getStringConfig(\"predict.data.file\")[0]\n",
500
+ " padWidth = config.getIntConfig(\"predict.feat.pad.size\")[0]\n",
501
+ " i = 0\n",
502
+ " if type(dataSource) == str:\n",
503
+ " for rec in fileRecGen(dataSource, \",\"):\n",
504
+ " feat = (\",\".join(rec)).ljust(padWidth, \" \")\n",
505
+ " rec = feat + \"\\t\" + str(yPred[i])\n",
506
+ " print(rec)\n",
507
+ " i += 1\n",
508
+ " else:\n",
509
+ " for rec in dataSource:\n",
510
+ " srec = toStrList(rec, 6)\n",
511
+ " feat = (\",\".join(srec)).ljust(padWidth, \" \")\n",
512
+ " srec = feat + \"\\t\" + str(yPred[i])\n",
513
+ " print(srec)\n",
514
+ " i += 1\n",
515
+ "\n",
516
+ "\n",
517
+ " @staticmethod\n",
518
+ " def allTrain(model):\n",
519
+ " \"\"\"\n",
520
+ " train with all data\n",
521
+ "\n",
522
+ " Parameters\n",
523
+ " model : torch model\n",
524
+ " \"\"\"\n",
525
+ " # train mode\n",
526
+ " model.train()\n",
527
+ " for t in range(model.numIter):\n",
528
+ "\n",
529
+ "\n",
530
+ " # Forward pass: Compute predicted y by passing x to the model\n",
531
+ " yPred = model(model.featData)\n",
532
+ "\n",
533
+ " # Compute and print loss\n",
534
+ " loss = model.lossFn(yPred, model.outData)\n",
535
+ " if model.verbose and t % 50 == 0:\n",
536
+ " print(\"epoch {} loss {:.6f}\".format(t, loss.item()))\n",
537
+ "\n",
538
+ " # Zero gradients, perform a backward pass, and update the weights.\n",
539
+ " model.optimizer.zero_grad()\n",
540
+ " loss.backward()\n",
541
+ " model.optimizer.step() \t\n",
542
+ "\n",
543
+ " #validate\n",
544
+ " model.eval()\n",
545
+ " yPred = model(model.validFeatData)\n",
546
+ " yPred = yPred.data.cpu().numpy()\n",
547
+ " yActual = model.validOutData\n",
548
+ " if model.verbose:\n",
549
+ " result = np.concatenate((yPred, yActual), axis = 1)\n",
550
+ " print(\"predicted actual\")\n",
551
+ " print(result)\n",
552
+ "\n",
553
+ " score = perfMetric(model.accMetric, yActual, yPred)\n",
554
+ " print(formatFloat(3, score, \"perf score\"))\n",
555
+ " return score\n",
556
+ "\n",
557
+ " @staticmethod\n",
558
+ " def batchTrain(model):\n",
559
+ " \"\"\"\n",
560
+ " train with batch data\n",
561
+ "\n",
562
+ " Parameters\n",
563
+ " model : torch model\n",
564
+ " \"\"\"\n",
565
+ " model.restored = False\n",
566
+ " trainData = TensorDataset(model.featData, model.outData)\n",
567
+ " trainDataLoader = DataLoader(dataset=trainData, batch_size=model.batchSize, shuffle=True)\n",
568
+ " epochIntv = model.config.getIntConfig(\"train.epoch.intv\")[0]\n",
569
+ "\n",
570
+ " # train mode\n",
571
+ " model.train()\n",
572
+ "\n",
573
+ " if model.trackErr:\n",
574
+ " trErr = list()\n",
575
+ " vaErr = list()\n",
576
+ " #epoch\n",
577
+ " for t in range(model.numIter):\n",
578
+ " #batch\n",
579
+ " b = 0\n",
580
+ " epochLoss = 0.0\n",
581
+ " for xBatch, yBatch in trainDataLoader:\n",
582
+ "\n",
583
+ " # Forward pass: Compute predicted y by passing x to the model\n",
584
+ " xBatch, yBatch = xBatch.to(model.device), yBatch.to(model.device)\n",
585
+ " yPred = model(xBatch)\n",
586
+ "\n",
587
+ " # Compute and print loss\n",
588
+ " loss = model.lossFn(yPred, yBatch)\n",
589
+ " if model.verbose and t % epochIntv == 0 and b % model.batchIntv == 0:\n",
590
+ " print(\"epoch {} batch {} loss {:.6f}\".format(t, b, loss.item()))\n",
591
+ "\n",
592
+ " if model.trackErr and model.batchIntv == 0:\n",
593
+ " epochLoss += loss.item()\n",
594
+ "\n",
595
+ " #error tracking at batch level\n",
596
+ " if model.trackErr and model.batchIntv > 0 and b % model.batchIntv == 0:\n",
597
+ " trErr.append(loss.item())\n",
598
+ " vloss = FeedForwardNetwork.evaluateModel(model)\n",
599
+ " vaErr.append(vloss)\n",
600
+ "\n",
601
+ " # Zero gradients, perform a backward pass, and update the weights.\n",
602
+ " model.optimizer.zero_grad()\n",
603
+ " loss.backward()\n",
604
+ " model.optimizer.step() \t\n",
605
+ " b += 1\n",
606
+ "\n",
607
+ " #error tracking at epoch level\n",
608
+ " if model.trackErr and model.batchIntv == 0:\n",
609
+ " epochLoss /= len(trainDataLoader)\n",
610
+ " trErr.append(epochLoss)\n",
611
+ " vloss = FeedForwardNetwork.evaluateModel(model)\n",
612
+ " vaErr.append(vloss)\n",
613
+ "\n",
614
+ " #validate\n",
615
+ " model.eval()\n",
616
+ " yPred = model(model.validFeatData)\n",
617
+ " yPred = yPred.data.cpu().numpy()\n",
618
+ " yActual = model.validOutData\n",
619
+ " if model.verbose:\n",
620
+ " vsize = yPred.shape[0]\n",
621
+ " print(\"\\npredicted \\t\\t actual\")\n",
622
+ " for i in range(vsize):\n",
623
+ " print(str(yPred[i]) + \"\\t\" + str(yActual[i]))\n",
624
+ "\n",
625
+ " score = perfMetric(model.accMetric, yActual, yPred)\n",
626
+ " print(yActual)\n",
627
+ " print(yPred)\n",
628
+ " print(formatFloat(3, score, \"perf score\"))\n",
629
+ "\n",
630
+ " #save\n",
631
+ " modelSave = model.config.getBooleanConfig(\"train.model.save\")[0]\n",
632
+ " if modelSave:\n",
633
+ " FeedForwardNetwork.saveCheckpt(model)\n",
634
+ "\n",
635
+ " if model.trackErr:\n",
636
+ " FeedForwardNetwork.errorPlot(model, trErr, vaErr)\n",
637
+ "\n",
638
+ " if model.config.getBooleanConfig(\"train.print.weights\")[0]:\n",
639
+ " print(\"model weights\")\n",
640
+ " for param in model.parameters():\n",
641
+ " print(param.data)\n",
642
+ " return score\n",
643
+ "\n",
644
+ " @staticmethod\n",
645
+ " def errorPlot(model, trErr, vaErr):\n",
646
+ " \"\"\"\n",
647
+ " plot errors\n",
648
+ "\n",
649
+ " Parameters\n",
650
+ " trErr : training error list\t\n",
651
+ " vaErr : validation error list\t\n",
652
+ " \"\"\"\n",
653
+ " x = np.arange(len(trErr))\n",
654
+ " plt.plot(x,trErr,label = \"training error\")\n",
655
+ " plt.plot(x,vaErr,label = \"validation error\")\n",
656
+ " plt.xlabel(\"iteration\")\n",
657
+ " plt.ylabel(\"error\")\n",
658
+ " plt.legend([\"training error\", \"validation error\"], loc='upper left')\n",
659
+ " plt.show()\n",
660
+ "\n",
661
+ " @staticmethod\n",
662
+ " def modelPredict(model, dataSource = None):\n",
663
+ " \"\"\"\n",
664
+ " predict\n",
665
+ "\n",
666
+ " Parameters\n",
667
+ " model : torch model\n",
668
+ " dataSource : data source\n",
669
+ " \"\"\"\n",
670
+ " #train or restore model\n",
671
+ " useSavedModel = model.config.getBooleanConfig(\"predict.use.saved.model\")[0]\n",
672
+ " if useSavedModel:\n",
673
+ " FeedForwardNetwork.restoreCheckpt(model)\n",
674
+ " else:\n",
675
+ " FeedForwardNetwork.batchTrain(model) \n",
676
+ "\n",
677
+ " #predict\n",
678
+ " if dataSource is None:\n",
679
+ " dataSource = model.config.getStringConfig(\"predict.data.file\")[0]\n",
680
+ " featData = FeedForwardNetwork.prepData(model, dataSource, False)\n",
681
+ " #print(featData)\n",
682
+ " featData = torch.from_numpy(featData)\n",
683
+ " featData = featData.to(model.device)\n",
684
+ "\n",
685
+ " model.eval()\n",
686
+ " yPred = model(featData)\n",
687
+ " yPred = yPred.data.cpu().numpy()\n",
688
+ " #print(yPred)\n",
689
+ "\n",
690
+ " if model.outputSize >= 2:\n",
691
+ " #classification\n",
692
+ " yPred = FeedForwardNetwork.processClassifOutput(yPred, model.config)\n",
693
+ "\n",
694
+ " # print prediction\n",
695
+ " if model.config.getBooleanConfig(\"predict.print.output\")[0]:\n",
696
+ " FeedForwardNetwork.printPrediction(yPred, model.config, dataSource)\n",
697
+ "\n",
698
+ " return yPred\n",
699
+ "\n",
700
+ " def predict(self, dataSource = None):\n",
701
+ " \"\"\"\n",
702
+ " predict\n",
703
+ "\n",
704
+ " Parameters\n",
705
+ " dataSource : data source\n",
706
+ " \"\"\"\n",
707
+ " return FeedForwardNetwork.modelPredict(self, dataSource)\n",
708
+ "\n",
709
+ " @staticmethod\n",
710
+ " def evaluateModel(model):\n",
711
+ " \"\"\"\n",
712
+ " evaluate model\n",
713
+ "\n",
714
+ " Parameters\n",
715
+ " model : torch model\n",
716
+ " \"\"\"\n",
717
+ " model.eval()\n",
718
+ " with torch.no_grad():\n",
719
+ " yPred = model(model.validFeatData)\n",
720
+ " #yPred = yPred.data.cpu().numpy()\n",
721
+ " yActual = model.validOutData\n",
722
+ " score = model.lossFn(yPred, yActual).item()\n",
723
+ " model.train()\n",
724
+ " return score\n",
725
+ "\n",
726
+ " @staticmethod\n",
727
+ " def prepValidate(model, dataSource=None):\n",
728
+ " \"\"\"\n",
729
+ " prepare for validation\n",
730
+ "\n",
731
+ " Parameters\n",
732
+ " model : torch model\n",
733
+ " dataSource : data source\n",
734
+ " \"\"\"\n",
735
+ " #train or restore model\n",
736
+ " if not model.restored:\n",
737
+ " useSavedModel = model.config.getBooleanConfig(\"predict.use.saved.model\")[0]\n",
738
+ " if useSavedModel:\n",
739
+ " FeedForwardNetwork.restoreCheckpt(model)\n",
740
+ " else:\n",
741
+ " FeedForwardNetwork.batchTrain(model)\n",
742
+ " model.restored = True\n",
743
+ "\n",
744
+ " if \tdataSource is not None:\n",
745
+ " model.setValidationData(dataSource)\n",
746
+ "\n",
747
+ " @staticmethod\n",
748
+ " def validateModel(model, retPred=False):\n",
749
+ " \"\"\"\n",
750
+ " pmodel validation\n",
751
+ "\n",
752
+ " Parameters\n",
753
+ " model : torch model\n",
754
+ " retPred : if True return prediction\n",
755
+ " \"\"\"\n",
756
+ " model.eval()\n",
757
+ " yPred = model(model.validFeatData)\n",
758
+ " yPred = yPred.data.cpu().numpy()\n",
759
+ " model.yPred = yPred\n",
760
+ " yActual = model.validOutData\n",
761
+ " vsize = yPred.shape[0]\n",
762
+ " if model.verbose:\n",
763
+ " print(\"\\npredicted \\t actual\")\n",
764
+ " for i in range(vsize):\n",
765
+ " print(\"{:.3f}\\t\\t{:.3f}\".format(yPred[i][0], yActual[i][0]))\n",
766
+ "\n",
767
+ " score = perfMetric(model.accMetric, yActual, yPred)\n",
768
+ " print(formatFloat(3, score, \"perf score\"))\n",
769
+ "\n",
770
+ " if retPred:\n",
771
+ " y = list(map(lambda i : (yPred[i][0], yActual[i][0]), range(vsize)))\n",
772
+ " res = (y, score)\n",
773
+ " return res\n",
774
+ " else:\t\n",
775
+ " return score"
776
+ ]
777
+ }
778
+ ],
779
+ "metadata": {
780
+ "kernelspec": {
781
+ "display_name": "Python 3 (ipykernel)",
782
+ "language": "python",
783
+ "name": "python3"
784
+ },
785
+ "language_info": {
786
+ "codemirror_mode": {
787
+ "name": "ipython",
788
+ "version": 3
789
+ },
790
+ "file_extension": ".py",
791
+ "mimetype": "text/x-python",
792
+ "name": "python",
793
+ "nbconvert_exporter": "python",
794
+ "pygments_lexer": "ipython3",
795
+ "version": "3.9.12"
796
+ }
797
+ },
798
+ "nbformat": 4,
799
+ "nbformat_minor": 5
800
+ }
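FeedForwardNetwork is driven entirely by a properties file read through the Configuration class. Below is a minimal sketch with hypothetical file names; the train.layer.data format is units:activation:batch-norm:norm-after-activation:dropout, as parsed in buildModel, and rmse is assumed to be a metric name accepted by perfMetric:

    #ffn.properties (hypothetical); keys taken from the defaults above
    #common.model.directory=model
    #common.model.file=ffn.mod
    #train.data.file=train.csv
    #train.data.fields=0,1,2,3
    #train.data.feature.fields=0,1,2
    #train.data.out.fields=3
    #train.layer.data=16:relu:true:false:0.2,1:none:false:false:0
    #train.output.size=1
    #valid.data.file=valid.csv
    #valid.accuracy.metric=rmse

    model = FeedForwardNetwork("ffn.properties")
    model.buildModel()                            #builds layers, loads and scales data
    score = FeedForwardNetwork.batchTrain(model)  #mini-batch training plus validation
    yPred = model.predict("new.csv")              #restores or retrains, then predicts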
lib/.ipynb_checkpoints/txproc-checkpoint.ipynb ADDED
@@ -0,0 +1,1002 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "f720c141",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import sys\n",
12
+ "from random import randint\n",
13
+ "import random\n",
14
+ "import time\n",
15
+ "from datetime import datetime\n",
16
+ "import re, string, unicodedata\n",
17
+ "import nltk\n",
18
+ "import contractions\n",
19
+ "import inflect\n",
20
+ "from bs4 import BeautifulSoup\n",
21
+ "from nltk import word_tokenize, sent_tokenize\n",
22
+ "from nltk.corpus import stopwords\n",
23
+ "from nltk.stem.isri import ISRIStemmer\n",
24
+ "from nltk.stem.porter import PorterStemmer\n",
25
+ "from nltk.stem.snowball import SnowballStemmer\n",
26
+ "from nltk.stem import LancasterStemmer, WordNetLemmatizer\n",
27
+ "from nltk.tag import StanfordNERTagger\n",
28
+ "from nltk.tokenize import word_tokenize, sent_tokenize\n",
29
+ "import spacy\n",
30
+ "import torch\n",
31
+ "from collections import defaultdict\n",
32
+ "import pickle\n",
33
+ "import numpy as np\n",
34
+ "import re\n",
35
+ "\n",
36
+ "sys.path.append(os.path.abspath(\"../lib\"))\n",
37
+ "from util import *\n",
38
+ "from mlutil import *\n",
39
+ "\n",
40
+ "lcc = [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\", \"k\",\"l\",\"m\",\"n\",\"o\",\n",
41
+ "\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]\n",
42
+ "ucc = [\"A\",\"B\",\"C\",\"D\",\"E\",\"F\",\"G\",\"H\",\"I\",\"J\",\"K\",\"L\",\"M\", \"N\",\"O\",\"P\",\"Q\",\"R\",\"S\",\"T\",\"U\",\"V\",\"W\",\"X\",\"Y\",\"Z\"]\n",
43
+ "dig = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\"]\n",
44
+ "spc = [\"@\",\"#\",\"$\",\"%\",\"^\",\"&\",\"*\",\"(\",\")\",\"_\",\"+\",\"{\",\"}\",\"[\",\"]\",\"|\",\":\",\"<\",\">\",\"?\",\";\",\",\",\".\"]\n",
45
+ "\n",
46
+ "\n",
47
+ "class TextPreProcessor:\n",
48
+ " \"\"\"\n",
49
+ " text preprocessor\n",
50
+ " \"\"\"\n",
51
+ " def __init__(self, stemmer = \"lancaster\", verbose=False):\n",
52
+ " self.verbose = verbose\n",
53
+ " self.lemmatizer = WordNetLemmatizer()\n",
54
+ "\n",
55
+ " def stripHtml(self, text):\n",
56
+ " soup = BeautifulSoup(text, \"html.parser\")\n",
57
+ " return soup.get_text()\n",
58
+ "\n",
59
+ " def removeBetweenSquareBrackets(self, text):\n",
60
+ " return re.sub('\\[[^]]*\\]', '', text)\n",
61
+ "\n",
62
+ " def denoiseText(self, text):\n",
63
+ " text = stripHtml(text)\n",
64
+ " text = removeBetweenSquareBrackets(text)\n",
65
+ " return text\n",
66
+ "\n",
67
+ " def replaceContractions(self, text):\n",
68
+ " \"\"\"Replace contractions in string of text\"\"\"\n",
69
+ " return contractions.fix(text)\n",
70
+ "\n",
71
+ " def tokenize(self, text):\n",
72
+ " words = nltk.word_tokenize(text)\n",
73
+ " return words\n",
74
+ "\n",
75
+ " def removeNonAscii(self, words):\n",
76
+ " \"\"\"Remove non-ASCII characters from list of tokenized words\"\"\"\n",
77
+ " newWords = []\n",
78
+ " for word in words:\n",
79
+ " if isinstance(word, unicode):\n",
80
+ " newWord = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore')\n",
81
+ " else:\n",
82
+ " newWord = word\n",
83
+ " newWords.append(newWord)\n",
84
+ " return newWords\n",
85
+ "\n",
86
+ " def replaceNonAsciiFromText(self, text):\n",
87
+ " \"\"\" replaces non ascii with blank \"\"\"\n",
88
+ " return ''.join([i if ord(i) < 128 else ' ' for i in text])\n",
89
+ "\n",
90
+ " def removeNonAsciiFromText(self, text):\n",
91
+ " \"\"\" replaces non ascii with blank \"\"\"\n",
92
+ " return ''.join([i if ord(i) < 128 else '' for i in text])\n",
93
+ "\n",
94
+ " def allow(self, words):\n",
95
+ " \"\"\" allow only specific charaters \"\"\"\n",
96
+ " allowed = [word for word in words if re.match('^[A-Za-z0-9\\.\\,\\:\\;\\!\\?\\(\\)\\'\\-\\$\\@\\%\\\"]+$', word) is not None]\t\t\n",
97
+ " return allowed\t\t\n",
98
+ "\n",
99
+ " def toLowercase(self, words):\n",
100
+ " \"\"\"Convert all characters to lowercase from list of tokenized words\"\"\"\n",
101
+ " newWords = [word.lower() for word in words]\n",
102
+ " return newWords\n",
103
+ "\n",
104
+ " def removePunctuation(self, words):\n",
105
+ " \"\"\"Remove punctuation from list of tokenized words\"\"\"\n",
106
+ " newWords = []\n",
107
+ " for word in words:\n",
108
+ " newWord = re.sub(r'[^\\w\\s]', '', word)\n",
109
+ " if newWord != '':\n",
110
+ " newWords.append(newWord)\n",
111
+ " return newWords\n",
112
+ "\n",
113
+ " def replaceNumbers(self, words):\n",
114
+ " \"\"\"Replace all interger occurrences in list of tokenized words with textual representation\"\"\"\n",
115
+ " p = inflect.engine()\n",
116
+ " newWords = []\n",
117
+ " for word in words:\n",
118
+ " if word.isdigit():\n",
119
+ " newWord = p.number_to_words(word)\n",
120
+ " newWords.append(newWord)\n",
121
+ " else:\n",
122
+ " newWords.append(word)\n",
123
+ " return newWords\n",
124
+ "\n",
125
+ " def removeStopwords(self, words):\n",
126
+ " \"\"\"Remove stop words from list of tokenized words\"\"\"\n",
127
+ " newWords = []\n",
128
+ " for word in words:\n",
129
+ " if word not in stopwords.words('english'):\n",
130
+ " newWords.append(word)\n",
131
+ " return newWords\n",
132
+ "\n",
133
+ " def removeCustomStopwords(self, words, stopWords):\n",
134
+ " \"\"\"Remove stop words from list of tokenized words\"\"\"\n",
135
+ " removed = [word for word in words if word not in stopWords]\t\t\n",
136
+ " return removed\n",
137
+ "\n",
138
+ " def removeLowFreqWords(self, words, minFreq):\n",
139
+ " \"\"\"Remove low frewquncy words from list of tokenized words\"\"\"\n",
140
+ " frequency = defaultdict(int)\n",
141
+ " for word in words:\n",
142
+ " frequency[word] += 1\n",
143
+ " removed = [word for word in words if frequency[word] > minFreq]\t\t\n",
144
+ " return removed\t\n",
145
+ "\n",
146
+ " def removeNumbers(self, words):\n",
147
+ " \"\"\"Remove numbers\"\"\"\n",
148
+ " removed = [word for word in words if not isNumber(word)]\t\t\n",
149
+ " return removed\t\t\n",
150
+ "\n",
151
+ " def removeShortWords(self, words, minLengh):\n",
152
+ " \"\"\"Remove short words \"\"\"\n",
153
+ " removed = [word for word in words if len(word) >= minLengh]\t\t\n",
154
+ " return removed\t\t\n",
155
+ "\n",
156
+ " def keepAllowedWords(self, words, keepWords):\n",
157
+ " \"\"\"Keep words from the list only\"\"\"\n",
158
+ " kept = [word for word in words if word in keepWords]\t\t\n",
159
+ " return kept\n",
160
+ "\n",
161
+ " def stemWords(self, words):\n",
162
+ " \"\"\"Stem words in list of tokenized words\"\"\"\n",
163
+ " if stemmer == \"lancaster\":\n",
164
+ " stemmer = LancasterStemmer()\n",
165
+ " elif stemmer == \"snowbal\":\n",
166
+ " stemmer = SnowballStemmer()\n",
167
+ " elif stemmer == \"porter\":\n",
168
+ " stemmer = PorterStemmer()\n",
169
+ " stems = [stemmer.stem(word) for word in words]\n",
170
+ " return stems\n",
171
+ "\n",
172
+ " def lemmatizeWords(self, words):\n",
173
+ " \"\"\"Lemmatize tokens in list of tokenized words\"\"\"\n",
174
+ " lemmas = [self.lemmatizer.lemmatize(word) for word in words]\n",
175
+ " return lemmas\n",
176
+ "\n",
177
+ " def lemmatizeVerbs(self, words):\n",
178
+ " \"\"\"Lemmatize verbs in list of tokenized words\"\"\"\n",
179
+ " lemmas = [self.lemmatizer.lemmatize(word, pos='v') for word in words]\n",
180
+ " return lemmas\n",
181
+ "\n",
182
+ " def normalize(self, words):\n",
183
+ " words = self.removeNonAscii(words)\n",
184
+ " words = self.toLowercase(words)\n",
185
+ " words = self.removePunctuation(words)\n",
186
+ " words = self.replaceNumbers(words)\n",
187
+ " words = self.removeStopwords(words)\n",
188
+ " return words\n",
189
+ "\n",
190
+ " def posTag(self, textTokens):\n",
191
+ " tags = nltk.pos_tag(textTokens)\n",
192
+ " return tags\n",
193
+ "\n",
194
+ " def extractEntity(self, textTokens, classifierPath, jarPath):\n",
195
+ " st = StanfordNERTagger(classifierPath, jarPath) \n",
196
+ " entities = st.tag(textTokens)\n",
197
+ " return entities\n",
198
+ "\n",
199
+ " def documentFeatures(self, document, wordFeatures):\n",
200
+ " documentWords = set(document)\n",
201
+ " features = {}\n",
202
+ " for word in wordFeatures:\n",
203
+ " features[word] = (word in documentWords)\n",
204
+ " return features\n",
205
+ "\n",
206
+ "class NGram:\n",
207
+ " \"\"\"\n",
208
+ " word ngram\n",
209
+ " \"\"\"\n",
210
+ " def __init__(self, vocFilt, verbose=False):\n",
211
+ " \"\"\"\n",
212
+ " initialize\n",
213
+ " \"\"\"\n",
214
+ " self.vocFilt = vocFilt\n",
215
+ " self.nGramCounter = dict()\n",
216
+ " self.nGramFreq = dict()\n",
217
+ " self.corpSize = 0\n",
218
+ " self.vocabulary = set()\n",
219
+ " self.freqDone = False\n",
220
+ " self.verbose = verbose\n",
221
+ " self.vecWords = None\n",
222
+ " self.nonZeroCount = 0\n",
223
+ "\n",
224
+ " def countDocNGrams(self, words):\n",
225
+ " \"\"\"\n",
226
+ " count words in a doc\n",
227
+ " \"\"\"\n",
228
+ " if self.verbose:\n",
229
+ " print (\"doc size \" + str(len(words)))\n",
230
+ " nGrams = self.toNGram(words)\n",
231
+ " for nGram in nGrams:\n",
232
+ " count = self.nGramCounter.get(nGram, 0)\n",
233
+ " self.nGramCounter[nGram] = count + 1\n",
234
+ " self.corpSize += 1\n",
235
+ " self.vocabulary.update(words)\t\n",
236
+ "\n",
237
+ " def remLowCount(self, minCount):\n",
238
+ " \"\"\"\n",
239
+ " removes items with count below threshold\n",
240
+ " \"\"\"\n",
241
+ " self.nGramCounter = dict(filter(lambda item: item[1] >= minCount, self.nGramCounter.items()))\n",
242
+ "\n",
243
+ " def getVocabSize(self):\n",
244
+ " \"\"\"\n",
245
+ " get vocabulary size\n",
246
+ " \"\"\"\n",
247
+ " return len(self.nGramCounter)\n",
248
+ "\n",
249
+ " def getNGramFreq(self):\n",
250
+ " \"\"\"\n",
251
+ " get normalized count\n",
252
+ " \"\"\"\n",
253
+ " if self.verbose:\n",
254
+ " print (\"counter size \" + str(len(self.nGramCounter)))\n",
255
+ " if not self.freqDone:\n",
256
+ " for item in self.nGramCounter.items():\n",
257
+ " self.nGramFreq[item[0]] = float(item[1]) / self.corpSize\t\t\t\t\t\n",
258
+ " self.freqDone = True\n",
259
+ " return self.nGramFreq\n",
260
+ "\n",
261
+ " def getNGramIndex(self, show):\n",
262
+ " \"\"\"\n",
263
+ " convert to list\n",
264
+ " \"\"\"\n",
265
+ " if self.vecWords is None:\n",
266
+ " self.vecWords = list(self.nGramCounter)\n",
267
+ " if show:\n",
268
+ " for vw in enumerate(self.vecWords):\n",
269
+ " print(vw)\n",
270
+ "\n",
271
+ " def getVector(self, words, byCount, normalized):\n",
272
+ " \"\"\"\n",
273
+ " convert to vector\n",
274
+ " \"\"\"\n",
275
+ " if self.vecWords is None:\n",
276
+ " self.vecWords = list(self.nGramCounter)\n",
277
+ "\n",
278
+ " nGrams = self.toNGram(words)\n",
279
+ " if self.verbose:\n",
280
+ " print(\"vocabulary size {}\".format(len(self.vecWords)))\n",
281
+ " print(\"ngrams\")\n",
282
+ " print(nGrams)\n",
283
+ " self.nonZeroCount = 0\n",
284
+ " vec = list(map(lambda vw: self.getVecElem(vw, nGrams, byCount, normalized), self.vecWords))\n",
285
+ " return vec\n",
286
+ "\n",
287
+ " def getVecElem(self, vw, nGrams, byCount, normalized):\n",
288
+ " \"\"\"\n",
289
+ " get vector element\n",
290
+ " \"\"\"\n",
291
+ " if vw in nGrams:\n",
292
+ " if byCount:\n",
293
+ " if normalized:\n",
294
+ " el = self.nGramFreq[vw]\n",
295
+ " else:\n",
296
+ " el = self.nGramCounter[vw]\n",
297
+ " else:\n",
298
+ " el = 1\n",
299
+ " self.nonZeroCount += 1\n",
300
+ " else:\n",
301
+ " if (byCount and normalized):\n",
302
+ " el = 0.0\n",
303
+ " else:\n",
304
+ " el = 0\n",
305
+ " return el\n",
306
+ "\n",
307
+ " def getNonZeroCount(self):\n",
308
+ " \"\"\"\n",
309
+ " get non zero vector element count\n",
310
+ " \"\"\"\n",
311
+ " return self.nonZeroCount\n",
312
+ "\n",
313
+ " def toBiGram(self, words):\n",
314
+ " \"\"\"\n",
315
+ " convert to bigram\n",
316
+ " \"\"\"\n",
317
+ " if self.verbose:\n",
318
+ " print (\"doc size \" + str(len(words)))\n",
319
+ " biGrams = list()\n",
320
+ " for i in range(len(words)-1):\n",
321
+ " w1 = words[i]\n",
322
+ " w2 = words[i+1]\n",
323
+ " if self.vocFilt is None or (w1 in self.vocFilt and w2 in self.vocFilt):\n",
324
+ " nGram = (w1, w2)\n",
325
+ " biGrams.append(nGram)\n",
326
+ " return biGrams\n",
327
+ "\n",
328
+ " def toTriGram(self, words):\n",
329
+ " \"\"\"\n",
330
+ " convert to trigram\n",
331
+ " \"\"\"\n",
332
+ " if self.verbose:\n",
333
+ " print (\"doc size \" + str(len(words)))\n",
334
+ " triGrams = list()\n",
335
+ " for i in range(len(words)-2):\n",
336
+ " w1 = words[i]\n",
337
+ " w2 = words[i+1]\n",
338
+ " w3 = words[i+2]\n",
339
+ " if self.vocFilt is None or (w1 in self.vocFilt and w2 in self.vocFilt and w3 in self.vocFilt):\n",
340
+ " nGram = (w1, w2, w3)\n",
341
+ " triGrams.append(nGram)\n",
342
+ " return triGrams\n",
343
+ "\n",
344
+ " def save(self, saveFile):\n",
345
+ " \"\"\"\n",
346
+ " save \n",
347
+ " \"\"\"\n",
348
+ " sf = open(saveFile, \"wb\")\n",
349
+ " pickle.dump(self, sf)\n",
350
+ " sf.close()\n",
351
+ "\n",
352
+ " @staticmethod\n",
353
+ " def load(saveFile):\n",
354
+ " \"\"\"\n",
355
+ " load\n",
356
+ " \"\"\"\n",
357
+ " sf = open(saveFile, \"rb\")\n",
358
+ " nGrams = pickle.load(sf)\n",
359
+ " sf.close()\n",
360
+ " return nGrams\n",
361
+ "\n",
362
+ "class CharNGram:\n",
363
+ " \"\"\"\n",
364
+ " character n gram\n",
365
+ " \"\"\"\n",
366
+ " def __init__(self, domains, ngsize, verbose=False):\n",
367
+ " \"\"\"\n",
368
+ " initialize\n",
369
+ " \"\"\"\n",
370
+ " self.chDomain = list()\n",
371
+ " self.ws = \"#\"\n",
372
+ " self.chDomain.append(self.ws)\n",
373
+ " for d in domains:\n",
374
+ " if d == \"lcc\":\n",
375
+ " self.chDomain.extend(lcc)\n",
376
+ " elif d == \"ucc\":\n",
377
+ " self.chDomain.extend(ucc)\n",
378
+ " elif d == \"dig\":\n",
379
+ " self.chDomain.extend(dig)\n",
380
+ " elif d == \"spc\":\n",
381
+ " self.chDomain.extend(spc)\n",
382
+ " else:\n",
383
+ " raise ValueError(\"invalid character type \" + d)\n",
384
+ "\n",
385
+ " self.ngsize = ngsize\n",
386
+ " self.radixPow = None\n",
387
+ " self.cntVecSize = None\n",
388
+ "\n",
389
+ " def addSpChar(self, spChar):\n",
390
+ " \"\"\"\n",
391
+ " add special characters\n",
392
+ " \"\"\"\n",
393
+ " self.chDomain.extend(spChar)\n",
394
+ "\n",
395
+ " def setWsRepl(self, ws):\n",
396
+ " \"\"\"\n",
397
+ " set white space replacement charater\n",
398
+ " \"\"\"\n",
399
+ " self.ws = ws\n",
400
+ " self.chDomain[0] = self.ws\n",
401
+ "\n",
402
+ " def finalize(self):\n",
403
+ " \"\"\"\n",
404
+ " final setup\n",
405
+ " \"\"\"\t\t\n",
406
+ " domSize = len(self.chDomain)\n",
407
+ " self.cntVecSize = int(math.pow(domSize, self.ngsize))\n",
408
+ " if self.radixPow is None:\n",
409
+ " self.radixPow = list()\n",
410
+ " for i in range(self.ngsize-1, 0, -1):\n",
411
+ " self.radixPow.append(int(math.pow(domSize, i)))\n",
412
+ " self.radixPow.append(1)\n",
413
+ "\n",
414
+ "\n",
415
+ " def toMgramCount(self, text):\n",
416
+ " \"\"\"\n",
417
+ " get ngram count list\n",
418
+ " \"\"\"\n",
419
+ " #print(text)\n",
420
+ " ngCounts = [0] * self.cntVecSize\n",
421
+ "\n",
422
+ " ngram = list()\n",
423
+ " totNgCount = 0\n",
424
+ " for ch in text:\n",
425
+ " if ch.isspace():\n",
426
+ " l = len(ngram)\n",
427
+ " if l == 0 or ngram[l-1] != self.ws:\n",
428
+ " ngram.append(self.ws)\n",
429
+ " else:\n",
430
+ " ngram.append(ch)\n",
431
+ "\n",
432
+ " if len(ngram) == self.ngsize:\n",
433
+ " i = self.__getNgramIndex(ngram)\n",
434
+ " assert i < self.cntVecSize, \"ngram index out of range index \" + str(i) + \" size \" + str(self.cntVecSize) \n",
435
+ " ngCounts[i] += 1\n",
436
+ " ngram.clear()\n",
437
+ " totNgCount += 1\n",
438
+ "\n",
439
+ " return ngCounts\n",
440
+ "\n",
441
+ " def __getNgramIndex(self, ngram):\n",
442
+ " \"\"\"\n",
443
+ " get index of an ngram into a list of size equal total number of possible ngrams\n",
444
+ " \"\"\"\n",
445
+ " assert len(ngram) == len(self.radixPow), \"ngram size mismatch\"\t\t\n",
446
+ " ngi = 0\n",
447
+ " for ch, rp in zip(ngram, self.radixPow):\n",
448
+ " i = self.chDomain.index(ch)\n",
449
+ " ngi += i * rp\n",
450
+ "\n",
451
+ " return ngi\n",
452
+ "\n",
453
+ "\n",
454
+ "class TfIdf:\n",
455
+ " \"\"\"\n",
456
+ " TF IDF\t\n",
457
+ " \"\"\"\n",
458
+ " def __init__(self, vocFilt, doIdf, verbose=False):\n",
459
+ " \"\"\"\n",
460
+ " initialize\n",
461
+ " \"\"\"\n",
462
+ " self.vocFilt = vocFilt\n",
463
+ " self.doIdf = doIdf\n",
464
+ " self.wordCounter = {}\n",
465
+ " self.wordFreq = {}\n",
466
+ " self.wordInDocCount = {}\n",
467
+ " self.docCount = 0\n",
468
+ " self.corpSize = 0\n",
469
+ " self.freqDone = False\n",
470
+ " self.vocabulary = set()\n",
471
+ " self.wordIndex = None\n",
472
+ " self.verbose = verbose\n",
473
+ " self.vecWords = None\n",
474
+ "\n",
475
+ " def countDocWords(self, words):\n",
476
+ " \"\"\"\n",
477
+ " count words in a doc\n",
478
+ " \"\"\"\n",
479
+ " if self.verbose:\n",
480
+ " print (\"doc size \" + str(len(words)))\n",
481
+ " for word in words:\n",
482
+ " if self.vocFilt is None or word in self.vocFilt:\n",
483
+ " count = self.wordCounter.get(word, 0)\n",
484
+ " self.wordCounter[word] = count + 1\n",
485
+ " self.corpSize += len(words)\n",
486
+ " self.vocabulary.update(words)\n",
487
+ "\n",
488
+ " if (self.doIdf):\n",
489
+ " self.docCount += 1\n",
490
+ " for word in set(words):\n",
491
+ " self.wordInDocCount.get(word, 0)\n",
492
+ " self.wordInDocCount[word] = count + 1\n",
493
+ " self.freqDone = False\n",
494
+ "\n",
495
+ "\n",
496
+ " def getWordFreq(self):\n",
497
+ " \"\"\"\n",
498
+ " get tfidf for corpus\n",
499
+ " \"\"\"\n",
500
+ " if self.verbose:\n",
501
+ " print (\"counter size \" + str(len(self.wordCounter)))\n",
502
+ " if not self.freqDone:\n",
503
+ " for item in self.wordCounter.items():\n",
504
+ " self.wordFreq[item[0]] = float(item[1]) / self.corpSize\t\t\t\t\t\n",
505
+ " if self.doIdf:\n",
506
+ " for k in self.wordFreq.keys():\n",
507
+ " self.wordFreq.items[k] *= math.log(self.docCount / self.wordInDocCount.items[k])\t\n",
508
+ " self.freqDone = True\n",
509
+ " return self.wordFreq\n",
510
+ "\n",
511
+ " def getCount(self, word):\n",
512
+ " \"\"\"\n",
513
+ " get counter\n",
514
+ " \"\"\"\n",
515
+ " if word in self.wordCounter:\n",
516
+ " count = self.wordCounter[word]\n",
517
+ " else:\n",
518
+ " raise ValueError(\"word not found in count table \" + word)\n",
519
+ " return count\n",
520
+ "\n",
521
+ " def getFreq(self, word):\n",
522
+ " \"\"\"\n",
523
+ " get normalized frequency\n",
524
+ " \"\"\"\n",
525
+ " if word in self.wordFreq:\n",
526
+ " freq = self.wordFreq[word]\n",
527
+ " else:\n",
528
+ " raise ValueError(\"word not found in count table \" + word)\n",
529
+ " return freq\n",
530
+ "\n",
531
+ " def resetCounter(self):\n",
532
+ " \"\"\"\n",
533
+ " reset counter\n",
534
+ " \"\"\"\n",
535
+ " self.wordCounter = {}\n",
536
+ "\n",
537
+ " def buildVocabulary(self, words):\n",
538
+ " \"\"\"\n",
539
+ " build vocbulary\n",
540
+ " \"\"\"\n",
541
+ " self.vocabulary.update(words)\n",
542
+ "\n",
543
+ " def getVocabulary(self):\n",
544
+ " \"\"\"\n",
545
+ " return vocabulary\n",
546
+ " \"\"\"\n",
547
+ " return self.vocabulary\n",
548
+ "\n",
549
+ " def creatWordIndex(self):\n",
550
+ " \"\"\"\n",
551
+ " index for all words in vcabulary\n",
552
+ " \"\"\"\n",
553
+ " self.wordIndex = {word : idx for idx, word in enumerate(list(self.vocabulary))}\n",
554
+ "\n",
555
+ " def getVector(self, words, byCount, normalized):\n",
556
+ " \"\"\"\n",
557
+ " get vector\n",
558
+ " \"\"\"\n",
559
+ " if self.vecWords is None:\n",
560
+ " self.vecWords = list(self.wordCounter)\n",
561
+ " vec = list(map(lambda vw: self.getVecElem(vw, words, byCount, normalized), self.vecWords))\n",
562
+ " return vec\n",
563
+ "\n",
564
+ " def getVecElem(self, vw, words, byCount, normalized):\n",
565
+ " \"\"\"\n",
566
+ " vector element\n",
567
+ " \"\"\"\n",
568
+ " el = 0\n",
569
+ " if vw in words:\n",
570
+ " if byCount:\n",
571
+ " if normalized:\n",
572
+ " el = self.wordFreq[vw]\n",
573
+ " else:\n",
574
+ " el = self.wordCounter[vw]\n",
575
+ " else:\n",
576
+ " el = 1\n",
577
+ " return el\n",
578
+ "\n",
579
+ " def save(self, saveFile):\n",
580
+ " \"\"\"\n",
581
+ " save\n",
582
+ " \"\"\"\n",
583
+ " sf = open(saveFile, \"wb\")\n",
584
+ " pickle.dump(self, sf)\n",
585
+ " sf.close()\n",
586
+ "\n",
587
+ " # load \n",
588
+ " @staticmethod\n",
589
+ " def load(saveFile):\n",
590
+ " \"\"\"\n",
591
+ " load\n",
592
+ " \"\"\"\n",
593
+ " sf = open(saveFile, \"rb\")\n",
594
+ " tfidf = pickle.load(sf)\n",
595
+ " sf.close()\n",
596
+ " return tfidf\n",
597
+ "\n",
598
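+ "# Usage sketch (illustrative, not part of the original module): `docs` is a\n",
+ "# hypothetical list of tokenized documents; term frequencies are accumulated\n",
+ "# per document and then normalized over the corpus.\n",
+ "#   tfidf = TfIdf(None, False)\n",
+ "#   for words in docs:\n",
+ "#       tfidf.countDocWords(words)\n",
+ "#   freq = tfidf.getWordFreq()\n",
+ "\n",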
+ "# bigram\n",
599
+ "class BiGram(NGram):\n",
600
+ " def __init__(self, vocFilt, verbose=False):\n",
601
+ " \"\"\"\n",
602
+ " initialize\n",
603
+ " \"\"\"\n",
604
+ " super(BiGram, self).__init__(vocFilt, verbose)\n",
605
+ "\n",
606
+ " def toNGram(self, words):\n",
607
+ " \"\"\"\n",
608
+ " convert to Ngrams\n",
609
+ " \"\"\"\n",
610
+ " return self.toBiGram(words)\n",
611
+ "\n",
612
+ "# trigram\n",
613
+ "class TriGram(NGram):\n",
614
+ " def __init__(self, vocFilt, verbose=False):\n",
615
+ " \"\"\"\n",
616
+ " initialize\n",
617
+ " \"\"\"\n",
618
+ " super(TriGram, self).__init__(vocFilt, verbose)\n",
619
+ "\n",
620
+ " def toNGram(self, words):\n",
621
+ " \"\"\"\n",
622
+ " convert to Ngrams\n",
623
+ " \"\"\"\n",
624
+ " return self.toTriGram(words)\n",
625
+ "\n",
626
+ "\n",
627
+ "\n",
628
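+ "# Usage sketch (illustrative, not part of the original module): assuming the\n",
+ "# NGram base class defined earlier in this module, trigrams of a hypothetical\n",
+ "# token list `words` can be extracted as:\n",
+ "#   tg = TriGram(None)\n",
+ "#   trigrams = tg.toNGram(words)\n",
+ "\n",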
+ "class DocSentences:\n",
629
+ " \"\"\"\n",
630
+ " sentence processor\n",
631
+ " \"\"\"\n",
632
+ " def __init__(self, filePath, minLength, verbose, text=None):\n",
633
+ " \"\"\"\n",
634
+ " initialize\n",
635
+ " \"\"\"\n",
636
+ " if filePath:\n",
637
+ " self.filePath = filePath\n",
638
+ " with open(filePath, 'r') as contentFile:\n",
639
+ " content = contentFile.read()\n",
640
+ " elif text:\n",
641
+ " content = text\n",
642
+ " else:\n",
643
+ " raise valueError(\"either file path or text must be provided\")\n",
644
+ "\n",
645
+ " #self.sentences = content.split('.')\n",
646
+ " self.verbose = verbose\n",
647
+ " tp = TextPreProcessor()\n",
648
+ " content = tp.removeNonAsciiFromText(content)\n",
649
+ " sentences = sent_tokenize(content)\n",
650
+ " self.sentences = list(filter(lambda s: len(nltk.word_tokenize(s)) >= minLength, sentences))\n",
651
+ " if self.verbose:\n",
652
+ " print (\"num of senteces after length filter \" + str(len(self.sentences)))\n",
653
+ " self.sentencesAsTokens = [clean(s, tp, verbose) for s in self.sentences]\t\n",
654
+ "\n",
655
+ " # get sentence tokens\n",
656
+ " def getSentencesAsTokens(self):\n",
657
+ " return self.sentencesAsTokens\n",
658
+ "\n",
659
+ " # get sentences\n",
660
+ " def getSentences(self):\n",
661
+ " return self.sentences\n",
662
+ "\n",
663
+ " # build term freq table\n",
664
+ " def getTermFreqTable(self):\n",
665
+ " # term count table for all words\n",
666
+ " termTable = TfIdf(None, False)\n",
667
+ " sentWords = self.getSentencesAsTokens()\n",
668
+ " for seWords in sentWords:\n",
669
+ " termTable.countDocWords(seWords)\n",
670
+ " return termTable\n",
671
+ "\n",
672
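+ "# Usage sketch (illustrative, not part of the original module): sentence\n",
+ "# tokens and a term frequency table for a text file; the path and minimum\n",
+ "# sentence length are hypothetical.\n",
+ "#   ds = DocSentences(\"doc.txt\", 5, False)\n",
+ "#   sentTokens = ds.getSentencesAsTokens()\n",
+ "#   termTable = ds.getTermFreqTable()\n",
+ "\n",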
+ "# sentence processor\n",
673
+ "class WordVectorContainer:\n",
674
+ " def __init__(self, dirPath, verbose):\n",
675
+ " \"\"\"\n",
676
+ " initialize\n",
677
+ " \"\"\"\n",
678
+ " self.docs = list()\n",
679
+ " self.wordVectors = list()\n",
680
+ " self.tp = TextPreProcessor()\n",
681
+ " self.similarityAlgo = \"cosine\"\n",
682
+ " self.simAlgoNormalizer = None\n",
683
+ " self.termTable = None\n",
684
+ "\n",
685
+ "\n",
686
+ " def addDir(self, dirPath):\n",
687
+ " \"\"\"\n",
688
+ " add content of all files ina directory\n",
689
+ " \"\"\"\n",
690
+ " docs, filePaths = getFileContent(dirPath, verbose)\n",
691
+ " self.docs.extend(docs)\n",
692
+ " self.wordVectors.extend([clean(doc, self.tp, verbose) for doc in docs])\n",
693
+ "\n",
694
+ " def addFile(self, filePath):\n",
695
+ " \"\"\"\n",
696
+ " add file content\n",
697
+ " \"\"\"\n",
698
+ " with open(filePath, 'r') as contentFile:\n",
699
+ " content = contentFile.read()\n",
700
+ " self.wordVectors.append(clean(content, self.tp, verbose))\n",
701
+ "\n",
702
+ " def addText(self, text):\n",
703
+ " \"\"\"\n",
704
+ " add text\n",
705
+ " \"\"\"\n",
706
+ " self.wordVectors.append(clean(text, self.tp, verbose))\n",
707
+ "\n",
708
+ " def addWords(self, words):\n",
709
+ " \"\"\"\n",
710
+ " add words\n",
711
+ " \"\"\"\n",
712
+ " self.wordVectors.append(words)\n",
713
+ "\n",
714
+ " def withSimilarityAlgo(self, algo, normalizer=None):\n",
715
+ " \"\"\"\n",
716
+ " set similarity algo\n",
717
+ " \"\"\"\n",
718
+ " self.similarityAlgo = algo\n",
719
+ " self.simAlgoNormalizer = normalizer\n",
720
+ "\n",
721
+ " def getDocsWords(self):\n",
722
+ " \"\"\"\n",
723
+ " get word vectors\n",
724
+ " \"\"\"\n",
725
+ " return self.wordVectors\n",
726
+ "\n",
727
+ " def getDocs(self):\n",
728
+ " \"\"\"\n",
729
+ " get docs\n",
730
+ " \"\"\"\n",
731
+ " return self.docs\n",
732
+ "\n",
733
+ " def getTermFreqTable(self):\n",
734
+ " \"\"\"\n",
735
+ " term count table for all words\n",
736
+ " \"\"\"\n",
737
+ " self.termTable = TfIdf(None, False)\n",
738
+ " for words in self.wordVectors:\n",
739
+ " self.termTable.countDocWords(words)\n",
740
+ " self.termTable.getWordFreq()\n",
741
+ " return self.termTable\n",
742
+ "\n",
743
+ " def getPairWiseSimilarity(self, byCount, normalized):\n",
744
+ " \"\"\"\n",
745
+ " pair wise similarity\n",
746
+ " \"\"\"\n",
747
+ " self.getNumWordVectors()\n",
748
+ "\n",
749
+ " size = len(self.wordVectors)\n",
750
+ " simArray = np.empty(shape=(size,size))\n",
751
+ " for i in range(size):\n",
752
+ " simArray[i][i] = 1.0\n",
753
+ "\n",
754
+ " for i in range(size):\n",
755
+ " for j in range(i+1, size):\n",
756
+ " if self.similarityAlgo == \"cosine\":\n",
757
+ " sim = cosineSimilarity(self.numWordVectors[i], self.numWordVectors[j])\n",
758
+ " elif self.similarityAlgo == \"jaccard\":\n",
759
+ " sim = jaccardSimilarity(self.wordVectors[i], self.wordVectors[j],\\\n",
760
+ " self.simAlgoNormalizer[0], self.simAlgoNormalizer[1])\n",
761
+ " else:\n",
762
+ " raise ValueError(\"invalid similarity algorithms\")\n",
763
+ " simArray[i][j] = sim\n",
764
+ " simArray[j][i] = sim\n",
765
+ " return simArray\n",
766
+ "\n",
767
+ " def getInterSetSimilarity(self, byCount, normalized, split):\n",
768
+ " \"\"\"\n",
769
+ " inter set pair wise similarity\n",
770
+ " \"\"\"\n",
771
+ " self.getNumWordVectors()\n",
772
+ " size = len(self.wordVectors)\n",
773
+ " if not self.similarityAlgo == \"jaccard\":\n",
774
+ " firstNumVec = self.numWordVectors[:split]\n",
775
+ " secNumVec = self.numWordVectors[split:]\n",
776
+ " fiSize = len(firstNumVec)\n",
777
+ " seSize = len(secNumVec)\n",
778
+ " else:\n",
779
+ " firstVec = self.wordVectors[:split]\n",
780
+ " secVec = self.wordVectors[split:]\n",
781
+ " fiSize = len(firstVec)\n",
782
+ " seSize = len(secVec)\n",
783
+ "\n",
784
+ " simArray = np.empty(shape=(fiSize,seSize))\n",
785
+ " for i in range(fiSize):\n",
786
+ " for j in range(seSize):\n",
787
+ " if self.similarityAlgo == \"cosine\":\n",
788
+ " sim = cosineSimilarity(firstNumVec[i], secNumVec[j])\n",
789
+ " elif self.similarityAlgo == \"jaccard\":\n",
790
+ " sim = jaccardSimilarity(firstVec[i], secVec[j],\\\n",
791
+ " self.simAlgoNormalizer[0], self.simAlgoNormalizer[1])\n",
792
+ " else:\n",
793
+ " raise ValueError(\"invalid similarity algorithms\")\n",
794
+ " simArray[i][j] = sim\n",
795
+ " return simArray\n",
796
+ "\n",
797
+ " def getNumWordVectors(self):\n",
798
+ " \"\"\"\n",
799
+ " get vectors\n",
800
+ " \"\"\"\n",
801
+ " if not self.similarityAlgo == \"jaccard\":\n",
802
+ " if self.numWordVectors is None:\n",
803
+ " self.numWordVectors = list(map(lambda wv: self.termTable.getVector(wv, byCount, normalized), self.wordVectors))\n",
804
+ "\n",
805
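+ "# Usage sketch (illustrative, not part of the original module): pair wise\n",
+ "# jaccard similarity between added texts; the normalizer pair passed to\n",
+ "# withSimilarityAlgo is a hypothetical choice.\n",
+ "#   wvc = WordVectorContainer(None, False)\n",
+ "#   wvc.addText(\"first sample text\")\n",
+ "#   wvc.addText(\"second sample text\")\n",
+ "#   wvc.withSimilarityAlgo(\"jaccard\", (0.5, 0.5))\n",
+ "#   sim = wvc.getPairWiseSimilarity(True, True)\n",
+ "\n",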
+ "# fragments documents into whole doc, paragraph or passages\n",
806
+ "class TextFragmentGenerator:\n",
807
+ " def __init__(self, level, minParNl, passSize, verbose=False):\n",
808
+ " \"\"\"\n",
809
+ " initialize\n",
810
+ " \"\"\"\n",
811
+ " self.level = level\n",
812
+ " self.minParNl = minParNl\n",
813
+ " self.passSize = passSize\n",
814
+ " self.fragments = None\n",
815
+ " self.verbose = verbose\n",
816
+ "\n",
817
+ " def loadDocs(self, fpaths):\n",
818
+ " \"\"\"\n",
819
+ " loads documents from one file, multiple files or all files under directory\n",
820
+ " \"\"\"\n",
821
+ " fPaths = fpaths.split(\",\")\n",
822
+ " if len(fPaths) == 1:\n",
823
+ " if os.path.isfile(fPaths[0]):\n",
824
+ " #one file\n",
825
+ " if self.verbose:\n",
826
+ " print(\"got one file from path\")\n",
827
+ " dnames = fPaths\n",
828
+ " docStr = getOneFileContent(fPaths[0])\n",
829
+ " dtexts = [docStr]\n",
830
+ " else:\n",
831
+ " #all files under directory\n",
832
+ " if self.verbose:\n",
833
+ " print(\"got all files under directory from path\")\n",
834
+ " dtexts, dnames = getFileContent(fPaths[0])\n",
835
+ " if self.verbose:\n",
836
+ " print(\"found {} files\".format(len(dtexts)))\n",
837
+ " else:\n",
838
+ " #list of files\n",
839
+ " if self.verbose: \n",
840
+ " print(\"got list of files from path\")\n",
841
+ " dnames = fPaths\n",
842
+ " dtexts = list(map(getOneFileContent, fpaths))\n",
843
+ " if self.verbose:\n",
844
+ " print(\"found {} files\".format(len(dtexts)))\n",
845
+ "\n",
846
+ " ndocs = (dtexts, dnames)\t\n",
847
+ " if self.verbose:\n",
848
+ " print(\"docs\")\n",
849
+ " for dn, dt in zip(dnames, dtexts):\n",
850
+ " print(dn + \"\\t\" + dt[:40])\n",
851
+ "\n",
852
+ " return ndocs\n",
853
+ "\n",
854
+ " def generateFragmentsFromFiles(self, fpaths):\n",
855
+ " \"\"\"\n",
856
+ " fragments documents into whole doc, paragraph or passages\n",
857
+ " \"\"\"\n",
858
+ " dtexts, dnames = self.loadDocs(fpaths)\n",
859
+ " return self.generateFragments(dtexts, dnames)\n",
860
+ "\n",
861
+ "\n",
862
+ " def generateFragmentsFromNamedDocs(self, ndocs):\n",
863
+ " \"\"\"\n",
864
+ " fragments documents into whole doc, paragraph or passages\n",
865
+ " \"\"\"\n",
866
+ " dtexts = list(map(lambda nd : nd[1], ndocs))\n",
867
+ " dnames = list(map(lambda nd : nd[0], ndocs))\n",
868
+ " #for i in range(len(dtexts)):\n",
869
+ " #\tprint(dnames[i])\n",
870
+ " #\tprint(dtexts[i][:40])\n",
871
+ " return self.generateFragments(dtexts, dnames)\n",
872
+ "\n",
873
+ " def generateFragments(self, dtexts, dnames):\n",
874
+ " \"\"\"\n",
875
+ " fragments documents into whole doc, paragraph or passages\n",
876
+ " \"\"\"\n",
877
+ " if self.level == \"para\" or self.level == \"passage\":\n",
878
+ " #split paras\n",
879
+ " dptexts = list()\n",
880
+ " dpnames = list()\n",
881
+ " for dt, dn in zip(dtexts, dnames):\n",
882
+ " paras = getParas(dt, self.minParNl)\n",
883
+ " if self.verbose:\n",
884
+ " print(dn)\n",
885
+ " print(\"no of paras {}\".format(len(paras)))\n",
886
+ " dptexts.extend(paras)\n",
887
+ " pnames = list(map(lambda i : dn + \":\" + str(i), range(len(paras))))\n",
888
+ " dpnames.extend(pnames)\n",
889
+ " dtexts = dptexts\n",
890
+ " dnames = dpnames\n",
891
+ "\n",
892
+ " if self.level == \"passage\":\n",
893
+ " #split each para into passages\n",
894
+ " dptexts = list()\n",
895
+ " dpnames = list()\n",
896
+ " for dt, dn in zip(dtexts, dnames):\n",
897
+ " sents = sent_tokenize(dt.strip())\t\t\t\n",
898
+ " if self.verbose:\n",
899
+ " print(dn)\n",
900
+ " print(\"no of sentences {}\".format(len(sents)))\n",
901
+ " span = self.passSize\n",
902
+ " if len(sents) <= span:\n",
903
+ " pass\n",
904
+ " else:\n",
905
+ " for i in range(0, len(sents) - span, 1):\n",
906
+ " dptext = None\n",
907
+ " for j in range(span):\n",
908
+ " if dptext is None:\n",
909
+ " dptext = sents[i + j] + \". \"\n",
910
+ " else:\n",
911
+ " dptext = dptext + sents[i + j] + \". \" \n",
912
+ " dpname = dn + \":\" + str(i)\n",
913
+ " dptexts.append(dptext)\n",
914
+ " dpnames.append(dpname)\n",
915
+ "\n",
916
+ " dtexts = dptexts\n",
917
+ " dnames = dpnames\n",
918
+ "\n",
919
+ " self.fragments = list(zip(dnames, dtexts))\n",
920
+ " #if self.verbose:\n",
921
+ " #\tprint(\"num fragments {}\".format(len(self.fragments)))\n",
922
+ " return self.fragments\n",
923
+ "\n",
924
+ " def showFragments(self):\n",
925
+ " \"\"\"\n",
926
+ " show fragments\n",
927
+ " \"\"\"\n",
928
+ " print(\"showing all \" + self.level + \" for the first 40 characters\")\n",
929
+ " for dn, dt in self.fragments:\n",
930
+ " print(dn + \"\\t\" + dt[:40])\n",
931
+ "\n",
932
+ " def isDocLevel(self):\n",
933
+ " \"\"\"\n",
934
+ " true if fragment is at doc level\n",
935
+ " \"\"\"\n",
936
+ " return self.level != \"para\" and self.level != \"passage\"\n",
937
+ "\n",
938
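+ "# Usage sketch (illustrative, not part of the original module): splitting all\n",
+ "# files under a hypothetical directory into 3 sentence passages:\n",
+ "#   tfg = TextFragmentGenerator(\"passage\", 2, 3)\n",
+ "#   frags = tfg.generateFragmentsFromFiles(\"./docs\")\n",
+ "#   tfg.showFragments()\n",
+ "\n",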
+ "# clean doc to create term array\n",
939
+ "def clean(doc, preprocessor, verbose):\n",
940
+ " \"\"\"\n",
941
+ " text pre process\n",
942
+ " \"\"\"\n",
943
+ " if verbose:\n",
944
+ " print (\"--raw doc\")\n",
945
+ " print (doc)\n",
946
+ " #print \"next clean\"\n",
947
+ " doc = preprocessor.removeNonAsciiFromText(doc)\n",
948
+ " words = preprocessor.tokenize(doc)\n",
949
+ " words = preprocessor.allow(words)\n",
950
+ " words = preprocessor.toLowercase(words)\n",
951
+ " words = preprocessor.removeStopwords(words)\n",
952
+ " words = preprocessor.removeShortWords(words, 3)\n",
953
+ " words = preprocessor.removePunctuation(words)\n",
954
+ " words = preprocessor.lemmatizeWords(words)\n",
955
+ " #words = preprocessor.removeNonAscii(words)\n",
956
+ " if verbose:\n",
957
+ " print (\"--after pre processing\")\n",
958
+ " print (words)\n",
959
+ " return words\n",
960
+ "\n",
961
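+ "# Usage sketch (illustrative, not part of the original module): cleaning a\n",
+ "# raw document into a term list with a TextPreProcessor instance:\n",
+ "#   tp = TextPreProcessor()\n",
+ "#   terms = clean(\"Some raw document text.\", tp, False)\n",
+ "\n",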
+ "# get sentences\n",
962
+ "def getSentences(filePath):\n",
963
+ " \"\"\"\n",
964
+ " text pre process\n",
965
+ " \"\"\"\n",
966
+ " with open(filePath, 'r') as contentFile:\n",
967
+ " content = contentFile.read()\n",
968
+ " sentences = content.split('.')\n",
969
+ " return sentences\n",
970
+ "\n",
971
+ "def getParas(text, minParNl=2):\n",
972
+ " \"\"\"\n",
973
+ " split into paras\n",
974
+ " \"\"\"\n",
975
+ " regx = \"\\n+\" if minParNl == 1 else \"\\n{2,}\"\n",
976
+ " paras = re.split(regx, text.replace(\"\\r\\n\", \"\\n\"))\n",
977
+ " return paras\n"
978
+ ]
979
+ }
980
+ ],
981
+ "metadata": {
982
+ "kernelspec": {
983
+ "display_name": "Python 3 (ipykernel)",
984
+ "language": "python",
985
+ "name": "python3"
986
+ },
987
+ "language_info": {
988
+ "codemirror_mode": {
989
+ "name": "ipython",
990
+ "version": 3
991
+ },
992
+ "file_extension": ".py",
993
+ "mimetype": "text/x-python",
994
+ "name": "python",
995
+ "nbconvert_exporter": "python",
996
+ "pygments_lexer": "ipython3",
997
+ "version": "3.9.12"
998
+ }
999
+ },
1000
+ "nbformat": 4,
1001
+ "nbformat_minor": 5
1002
+ }
lib/.ipynb_checkpoints/util-checkpoint.ipynb ADDED
@@ -0,0 +1,2141 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "031d69ef",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import sys\n",
12
+ "from random import randint\n",
13
+ "import random\n",
14
+ "import time\n",
15
+ "import uuid\n",
16
+ "from datetime import datetime\n",
17
+ "import math\n",
18
+ "import numpy as np\n",
19
+ "import pandas as pd\n",
20
+ "import matplotlib.pyplot as plt\n",
21
+ "import numpy as np\n",
22
+ "import logging\n",
23
+ "import logging.handlers\n",
24
+ "import pickle\n",
25
+ "from contextlib import contextmanager\n",
26
+ "\n",
27
+ "tokens = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"A\",\"B\",\"C\",\"D\",\"E\",\"F\",\"G\",\"H\",\"I\",\"J\",\"K\",\"L\",\"M\",\n",
28
+ " \"N\",\"O\",\"P\",\"Q\",\"R\",\"S\",\"T\",\"U\",\"V\",\"W\",\"X\",\"Y\",\"Z\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\"]\n",
29
+ "numTokens = tokens[:10]\n",
30
+ "alphaTokens = tokens[10:36]\n",
31
+ "loCaseChars = [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\", \"k\",\"l\",\"m\",\"n\",\"o\",\n",
32
+ "\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]\n",
33
+ "\n",
34
+ "typeInt = \"int\"\n",
35
+ "typeFloat = \"float\"\n",
36
+ "typeString = \"string\"\n",
37
+ "\n",
38
+ "secInMinute = 60\n",
39
+ "secInHour = 60 * 60\n",
40
+ "secInDay = 24 * secInHour\n",
41
+ "secInWeek = 7 * secInDay\n",
42
+ "secInYear = 365 * secInDay\n",
43
+ "secInMonth = secInYear / 12\n",
44
+ "\n",
45
+ "minInHour = 60\n",
46
+ "minInDay = 24 * minInHour\n",
47
+ "\n",
48
+ "ftPerYard = 3\n",
49
+ "ftPerMile = ftPerYard * 1760\n",
50
+ "\n",
51
+ "\n",
52
+ "def genID(size):\n",
53
+ " \"\"\"\n",
54
+ " generates ID\n",
55
+ "\n",
56
+ " Parameters\n",
57
+ " size : size of ID\n",
58
+ " \"\"\"\n",
59
+ " id = \"\"\n",
60
+ " for i in range(size):\n",
61
+ " id = id + selectRandomFromList(tokens)\n",
62
+ " return id\n",
63
+ "\n",
64
+ "def genIdList(numId, idSize):\n",
65
+ " \"\"\"\n",
66
+ " generate list of IDs\n",
67
+ "\n",
68
+ " Parameters:\n",
69
+ " numId: number of Ids\n",
70
+ " idSize: ID size\n",
71
+ " \"\"\"\n",
72
+ " iDs = []\n",
73
+ " for i in range(numId):\n",
74
+ " iDs.append(genID(idSize))\n",
75
+ " return iDs\n",
76
+ "\n",
77
+ "def genNumID(size):\n",
78
+ " \"\"\"\n",
79
+ " generates ID consisting of digits onl\n",
80
+ "\n",
81
+ " Parameters\n",
82
+ " size : size of ID\n",
83
+ " \"\"\"\n",
84
+ " id = \"\"\n",
85
+ " for i in range(size):\n",
86
+ " id = id + selectRandomFromList(numTokens)\n",
87
+ " return id\n",
88
+ "\n",
89
+ "def genLowCaseID(size):\n",
90
+ " \"\"\"\n",
91
+ " generates ID consisting of lower case chars\n",
92
+ "\n",
93
+ " Parameters\n",
94
+ " size : size of ID\n",
95
+ " \"\"\"\n",
96
+ " id = \"\"\n",
97
+ " for i in range(size):\n",
98
+ " id = id + selectRandomFromList(loCaseChars)\n",
99
+ " return id\n",
100
+ "\n",
101
+ "def genNumIdList(numId, idSize):\n",
102
+ " \"\"\"\n",
103
+ " generate list of numeric IDs\n",
104
+ "\n",
105
+ " Parameters:\n",
106
+ " numId: number of Ids\n",
107
+ " idSize: ID size\n",
108
+ " \"\"\"\n",
109
+ " iDs = []\n",
110
+ " for i in range(numId):\n",
111
+ " iDs.append(genNumID(idSize))\n",
112
+ " return iDs\n",
113
+ "\n",
114
+ "def genNameInitial():\n",
115
+ " \"\"\"\n",
116
+ " generate name initial\n",
117
+ " \"\"\"\n",
118
+ " return selectRandomFromList(alphaTokens) + selectRandomFromList(alphaTokens)\n",
119
+ "\n",
120
+ "def genPhoneNum(arCode):\n",
121
+ " \"\"\"\n",
122
+ " generates phone number\n",
123
+ "\n",
124
+ " Parameters\n",
125
+ " arCode: area code\n",
126
+ " \"\"\"\n",
127
+ " phNum = genNumID(7)\n",
128
+ " return arCode + str(phNum)\n",
129
+ "\n",
130
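+ "# Usage sketch (illustrative, not part of the original module): generating\n",
+ "# synthetic identifiers and a phone number; the sizes and area code are\n",
+ "# hypothetical examples.\n",
+ "#   cid = genID(8)\n",
+ "#   custIds = genNumIdList(10, 6)\n",
+ "#   phone = genPhoneNum(\"408\")\n",
+ "\n",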
+ "def selectRandomFromList(ldata):\n",
131
+ " \"\"\"\n",
132
+ " select an element randomly from a lis\n",
133
+ "\n",
134
+ " Parameters\n",
135
+ " ldata : list data\n",
136
+ " \"\"\"\n",
137
+ " return ldata[randint(0, len(ldata)-1)]\n",
138
+ "\n",
139
+ "def selectOtherRandomFromList(ldata, cval):\n",
140
+ " \"\"\"\n",
141
+ " select an element randomly from a list excluding the given one\n",
142
+ "\n",
143
+ " Parameters\n",
144
+ " ldata : list data\n",
145
+ " cval : value to be excluded\n",
146
+ " \"\"\"\n",
147
+ " nval = selectRandomFromList(ldata)\n",
148
+ " while nval == cval:\n",
149
+ " nval = selectRandomFromList(ldata)\n",
150
+ " return nval\n",
151
+ "\n",
152
+ "def selectRandomSubListFromList(ldata, num):\n",
153
+ " \"\"\"\n",
154
+ " generates random sublist from a list without replacemment\n",
155
+ "\n",
156
+ " Parameters\n",
157
+ " ldata : list data\n",
158
+ " num : output list size\n",
159
+ " \"\"\"\n",
160
+ " assertLesser(num, len(ldata), \"size of sublist to be sampled greater than or equal to main list\")\n",
161
+ " i = randint(0, len(ldata)-1)\n",
162
+ " sel = ldata[i]\n",
163
+ " selSet = {i}\n",
164
+ " selList = [sel]\n",
165
+ " while (len(selSet) < num):\n",
166
+ " i = randint(0, len(ldata)-1)\n",
167
+ " if (i not in selSet):\n",
168
+ " sel = ldata[i]\n",
169
+ " selSet.add(i)\n",
170
+ " selList.append(sel)\t\t\n",
171
+ " return selList\n",
172
+ "\n",
173
+ "def selectRandomSubListFromListWithRepl(ldata, num):\n",
174
+ " \"\"\"\n",
175
+ " generates random sublist from a list with replacemment\n",
176
+ "\n",
177
+ " Parameters\n",
178
+ " ldata : list data\n",
179
+ " num : output list size\n",
180
+ " \"\"\"\n",
181
+ " return list(map(lambda i : selectRandomFromList(ldata), range(num)))\n",
182
+ "\n",
183
+ "def selectRandomFromDict(ddata):\n",
184
+ " \"\"\"\n",
185
+ " select an element randomly from a dictionary\n",
186
+ "\n",
187
+ " Parameters\n",
188
+ " ddata : dictionary data\n",
189
+ " \"\"\"\n",
190
+ " dkeys = list(ddata.keys())\n",
191
+ " dk = selectRandomFromList(dkeys)\n",
192
+ " el = (dk, ddata[dk])\n",
193
+ " return el\n",
194
+ "\n",
195
+ "def setListRandomFromList(ldata, ldataRepl):\n",
196
+ " \"\"\"\n",
197
+ " sets some elents in the first list randomly with elements from the second list\n",
198
+ "\n",
199
+ " Parameters\n",
200
+ " ldata : list data\n",
201
+ " ldataRepl : list with replacement data\n",
202
+ " \"\"\"\n",
203
+ " l = len(ldata)\n",
204
+ " selSet = set()\n",
205
+ " for d in ldataRepl:\n",
206
+ " i = randint(0, l-1)\n",
207
+ " while i in selSet:\n",
208
+ " i = randint(0, l-1)\n",
209
+ " ldata[i] = d\n",
210
+ " selSet.add(i)\n",
211
+ "\n",
212
+ "def genIpAddress():\n",
213
+ " \"\"\"\n",
214
+ " generates IP address\n",
215
+ " \"\"\"\n",
216
+ " i1 = randint(0,256)\n",
217
+ " i2 = randint(0,256)\n",
218
+ " i3 = randint(0,256)\n",
219
+ " i4 = randint(0,256)\n",
220
+ " ip = \"%d.%d.%d.%d\" %(i1,i2,i3,i4)\n",
221
+ " return ip\n",
222
+ "\n",
223
+ "def curTimeMs():\n",
224
+ " \"\"\"\n",
225
+ " current time in ms\n",
226
+ " \"\"\"\n",
227
+ " return int((datetime.utcnow() - datetime(1970,1,1)).total_seconds() * 1000)\n",
228
+ "\n",
229
+ "def secDegPolyFit(x1, y1, x2, y2, x3, y3):\n",
230
+ " \"\"\"\n",
231
+ " second deg polynomial \t\n",
232
+ "\n",
233
+ " Parameters\n",
234
+ " x1 : 1st point x\n",
235
+ " y1 : 1st point y\n",
236
+ " x2 : 2nd point x\n",
237
+ " y2 : 2nd point y\n",
238
+ " x3 : 3rd point x\n",
239
+ " y3 : 3rd point y\n",
240
+ " \"\"\"\n",
241
+ " t = (y1 - y2) / (x1 - x2)\n",
242
+ " a = t - (y2 - y3) / (x2 - x3)\n",
243
+ " a = a / (x1 - x3)\n",
244
+ " b = t - a * (x1 + x2)\n",
245
+ " c = y1 - a * x1 * x1 - b * x1\n",
246
+ " return (a, b, c)\n",
247
+ "\n",
248
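+ "# Worked example (illustrative, not part of the original module): fitting\n",
+ "# through the points (0,0), (1,1) and (2,4) on y = x * x recovers the\n",
+ "# coefficients (a, b, c) = (1, 0, 0).\n",
+ "#   a, b, c = secDegPolyFit(0, 0, 1, 1, 2, 4)\n",
+ "\n",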
+ "def range_limit(val, minv, maxv):\n",
249
+ " \"\"\"\n",
250
+ " range limit a value\n",
251
+ "\n",
252
+ " Parameters\n",
253
+ " val : data value\n",
254
+ " minv : minimum\n",
255
+ " maxv : maximum\n",
256
+ " \"\"\"\n",
257
+ " if (val < minv):\n",
258
+ " val = minv\n",
259
+ " elif (val > maxv):\n",
260
+ " val = maxv\n",
261
+ " return val\t\n",
262
+ "\n",
263
+ "def isInRange(val, minv, maxv):\n",
264
+ " \"\"\"\n",
265
+ " checks if within range\n",
266
+ "\n",
267
+ " Parameters\n",
268
+ " val : data value\n",
269
+ " minv : minimum\n",
270
+ " maxv : maximum\n",
271
+ " \"\"\"\n",
272
+ " return val >= minv and val <= maxv\n",
273
+ "\n",
274
+ "def stripFileLines(filePath, offset):\n",
275
+ " \"\"\"\n",
276
+ " strips number of chars from both ends\n",
277
+ "\n",
278
+ " Parameters\n",
279
+ " filePath : file path\n",
280
+ " offset : offset from both ends of line \n",
281
+ " \"\"\"\n",
282
+ " fp = open(filePath, \"r\")\n",
283
+ " for line in fp:\n",
284
+ " stripped = line[offset:len(line) - 1 - offset]\n",
285
+ " print (stripped)\n",
286
+ " fp.close()\n",
287
+ "\n",
288
+ "def genLatLong(lat1, long1, lat2, long2):\n",
289
+ " \"\"\"\n",
290
+ " generate lat log within limits\n",
291
+ "\n",
292
+ " Parameters\n",
293
+ " lat1 : lat of 1st point\n",
294
+ " long1 : long of 1st point\n",
295
+ " lat2 : lat of 2nd point\n",
296
+ " long2 : long of 2nd point\n",
297
+ " \"\"\"\n",
298
+ " lat = lat1 + (lat2 - lat1) * random.random()\n",
299
+ " longg = long1 + (long2 - long1) * random.random()\n",
300
+ " return (lat, longg)\n",
301
+ "\n",
302
+ "def geoDistance(lat1, long1, lat2, long2):\n",
303
+ " \"\"\"\n",
304
+ " find geo distance in ft\n",
305
+ "\n",
306
+ " Parameters\n",
307
+ " lat1 : lat of 1st point\n",
308
+ " long1 : long of 1st point\n",
309
+ " lat2 : lat of 2nd point\n",
310
+ " long2 : long of 2nd point\n",
311
+ " \"\"\"\n",
312
+ " latDiff = math.radians(lat1 - lat2)\n",
313
+ " longDiff = math.radians(long1 - long2)\n",
314
+ " l1 = math.sin(latDiff/2.0)\n",
315
+ " l2 = math.sin(longDiff/2.0)\n",
316
+ " l3 = math.cos(math.radians(lat1))\n",
317
+ " l4 = math.cos(math.radians(lat2))\n",
318
+ " a = l1 * l1 + l3 * l4 * l2 * l2\n",
319
+ " l5 = math.sqrt(a)\n",
320
+ " l6 = math.sqrt(1.0 - a)\n",
321
+ " c = 2.0 * math.atan2(l5, l6)\n",
322
+ " r = 6371008.8 * 3.280840\n",
323
+ " return c * r\n",
324
+ "\n",
325
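+ "# Usage sketch (illustrative, not part of the original module): haversine\n",
+ "# distance in feet between two hypothetical coordinate pairs:\n",
+ "#   dist = geoDistance(37.7749, -122.4194, 37.8044, -122.2712)\n",
+ "\n",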
+ "def minLimit(val, limit):\n",
326
+ " \"\"\"\n",
327
+ " min limit\n",
328
+ " Parameters\n",
329
+ " \"\"\"\n",
330
+ " if (val < limit):\n",
331
+ " val = limit\n",
332
+ " return val;\n",
333
+ "\n",
334
+ "def maxLimit(val, limit):\n",
335
+ " \"\"\"\n",
336
+ " max limit\n",
337
+ " Parameters\n",
338
+ " \"\"\"\n",
339
+ " if (val > limit):\n",
340
+ " val = limit\n",
341
+ " return val;\n",
342
+ "\n",
343
+ "def rangeSample(val, minLim, maxLim):\n",
344
+ " \"\"\"\n",
345
+ " if out side range sample within range\n",
346
+ "\n",
347
+ " Parameters\n",
348
+ " val : value\n",
349
+ " minLim : minimum\n",
350
+ " maxLim : maximum\n",
351
+ " \"\"\"\n",
352
+ " if val < minLim or val > maxLim:\n",
353
+ " val = randint(minLim, maxLim)\n",
354
+ " return val\n",
355
+ "\n",
356
+ "def genRandomIntListWithinRange(size, minLim, maxLim):\n",
357
+ " \"\"\"\n",
358
+ " random unique list of integers within range\n",
359
+ "\n",
360
+ " Parameters\n",
361
+ " size : size of returned list\n",
362
+ " minLim : minimum\n",
363
+ " maxLim : maximum\n",
364
+ " \"\"\"\n",
365
+ " values = set()\n",
366
+ " for i in range(size):\n",
367
+ " val = randint(minLim, maxLim)\n",
368
+ " while val not in values:\n",
369
+ " values.add(val)\n",
370
+ " return list(values)\n",
371
+ "\n",
372
+ "def preturbScalar(value, vrange):\n",
373
+ " \"\"\"\n",
374
+ " preturbs a mutiplicative value within range\n",
375
+ "\n",
376
+ " Parameters\n",
377
+ " value : data value\n",
378
+ " vrange : value delta fraction\n",
379
+ " \"\"\"\n",
380
+ " scale = 1.0 - vrange + 2 * vrange * random.random() \n",
381
+ " return value * scale\n",
382
+ "\n",
383
+ "def preturbScalarAbs(value, vrange):\n",
384
+ " \"\"\"\n",
385
+ " preturbs an absolute value within range\n",
386
+ "\n",
387
+ " Parameters\n",
388
+ " value : data value\n",
389
+ " vrange : value delta absolute\n",
390
+ " \"\"\"\n",
391
+ " delta = - vrange + 2.0 * vrange * random.random() \n",
392
+ " return value + delta\n",
393
+ "\n",
394
+ "def preturbVector(values, vrange):\n",
395
+ " \"\"\"\n",
396
+ " preturbs a list within range\n",
397
+ "\n",
398
+ " Parameters\n",
399
+ " values : list data\n",
400
+ " vrange : value delta fraction\n",
401
+ " \"\"\"\n",
402
+ " nValues = list(map(lambda va: preturbScalar(va, vrange), values))\n",
403
+ " return nValues\n",
404
+ "\n",
405
+ "def randomShiftVector(values, smin, smax):\n",
406
+ " \"\"\"\n",
407
+ " shifts a list by a random quanity with a range\n",
408
+ "\n",
409
+ " Parameters\n",
410
+ " values : list data\n",
411
+ " smin : samplinf minimum\n",
412
+ " smax : sampling maximum\n",
413
+ " \"\"\"\n",
414
+ " shift = np.random.uniform(smin, smax)\n",
415
+ " return list(map(lambda va: va + shift, values))\n",
416
+ "\n",
417
+ "def floatRange(beg, end, incr):\n",
418
+ " \"\"\"\n",
419
+ " generates float range\n",
420
+ "\n",
421
+ " Parameters\n",
422
+ " beg :range begin\n",
423
+ " end: range end\n",
424
+ " incr : range increment\n",
425
+ " \"\"\"\n",
426
+ " return list(np.arange(beg, end, incr))\n",
427
+ "\n",
428
+ "def shuffle(values, *numShuffles):\n",
429
+ " \"\"\"\n",
430
+ " in place shuffling with swap of pairs\n",
431
+ "\n",
432
+ " Parameters\n",
433
+ " values : list data\n",
434
+ " numShuffles : parameter list for number of shuffles\n",
435
+ " \"\"\"\n",
436
+ " size = len(values)\n",
437
+ " if len(numShuffles) == 0:\n",
438
+ " numShuffle = int(size / 2)\n",
439
+ " elif len(numShuffles) == 1:\n",
440
+ " numShuffle = numShuffles[0]\n",
441
+ " else:\n",
442
+ " numShuffle = randint(numShuffles[0], numShuffles[1])\n",
443
+ " print(\"numShuffle {}\".format(numShuffle))\n",
444
+ " for i in range(numShuffle):\n",
445
+ " first = random.randint(0, size - 1)\n",
446
+ " second = random.randint(0, size - 1)\n",
447
+ " while first == second:\n",
448
+ " second = random.randint(0, size - 1)\n",
449
+ " tmp = values[first]\n",
450
+ " values[first] = values[second]\n",
451
+ " values[second] = tmp\n",
452
+ "\n",
453
+ "\n",
454
+ "def splitList(itms, numGr):\n",
455
+ " \"\"\"\n",
456
+ " splits a list into sub lists of approximately equal size, with items in sublists randomly chod=sen\n",
457
+ "\n",
458
+ " Parameters\n",
459
+ " itms ; list of values\t\t\n",
460
+ " numGr : no of groups\n",
461
+ " \"\"\"\n",
462
+ " tcount = len(itms)\n",
463
+ " cItems = list(itms)\n",
464
+ " sz = int(len(cItems) / numGr)\n",
465
+ " groups = list()\n",
466
+ " count = 0\n",
467
+ " for i in range(numGr):\n",
468
+ " if (i == numGr - 1):\n",
469
+ " csz = tcount - count\n",
470
+ " else:\n",
471
+ " csz = sz + randint(-2, 2)\n",
472
+ " count += csz\n",
473
+ " gr = list()\n",
474
+ " for j in range(csz):\n",
475
+ " it = selectRandomFromList(cItems)\n",
476
+ " gr.append(it)\n",
477
+ " cItems.remove(it)\n",
478
+ " groups.append(gr)\n",
479
+ " return groups\n",
480
+ "\n",
481
+ "def multVector(values, vrange):\n",
482
+ " \"\"\"\n",
483
+ " multiplies a list within value range\n",
484
+ "\n",
485
+ " Parameters\n",
486
+ " values : list of values\n",
487
+ " vrange : fraction of vaue to be used to update\n",
488
+ " \"\"\"\n",
489
+ " scale = 1.0 - vrange + 2 * vrange * random.random()\n",
490
+ " nValues = list(map(lambda va: va * scale, values))\n",
491
+ " return nValues\n",
492
+ "\n",
493
+ "def weightedAverage(values, weights):\n",
494
+ " \"\"\"\n",
495
+ " calculates weighted average\n",
496
+ "\n",
497
+ " Parameters\n",
498
+ " values : list of values\n",
499
+ " weights : list of weights\n",
500
+ " \"\"\"\t\t\n",
501
+ " assert len(values) == len(weights), \"values and weights should be same size\"\n",
502
+ " vw = zip(values, weights)\n",
503
+ " wva = list(map(lambda e : e[0] * e[1], vw))\n",
504
+ " #wa = sum(x * y for x, y in vw) / sum(weights)\n",
505
+ " wav = sum(wva) / sum(weights)\n",
506
+ " return wav\n",
507
+ "\n",
508
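+ "# Worked example (illustrative, not part of the original module):\n",
+ "#   weightedAverage([1, 2, 3], [1, 1, 2]) = (1 + 2 + 6) / 4 = 2.25\n",
+ "\n",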
+ "def extractFields(line, delim, keepIndices):\n",
509
+ " \"\"\"\n",
510
+ " breaks a line into fields and keeps only specified fileds and returns new line\n",
511
+ "\n",
512
+ " Parameters\n",
513
+ " line ; deli separated string\n",
514
+ " delim : delemeter\n",
515
+ " keepIndices : list of indexes to fields to be retained\n",
516
+ " \"\"\"\n",
517
+ " items = line.split(delim)\n",
518
+ " newLine = []\n",
519
+ " for i in keepIndices:\n",
520
+ " newLine.append(line[i])\n",
521
+ " return delim.join(newLine)\n",
522
+ "\n",
523
+ "def remFields(line, delim, remIndices):\n",
524
+ " \"\"\"\n",
525
+ " removes fields from delim separated string\n",
526
+ "\n",
527
+ " Parameters\n",
528
+ " line ; delemeter separated string\n",
529
+ " delim : delemeter\n",
530
+ " remIndices : list of indexes to fields to be removed\n",
531
+ " \"\"\"\n",
532
+ " items = line.split(delim)\n",
533
+ " newLine = []\n",
534
+ " for i in range(len(items)):\n",
535
+ " if not arrayContains(remIndices, i):\n",
536
+ " newLine.append(line[i])\n",
537
+ " return delim.join(newLine)\n",
538
+ "\n",
539
+ "def extractList(data, indices):\n",
540
+ " \"\"\"\n",
541
+ " extracts list from another list, given indices\n",
542
+ "\n",
543
+ " Parameters\n",
544
+ " remIndices : list data\n",
545
+ " indices : list of indexes to fields to be retained\n",
546
+ " \"\"\"\n",
547
+ " if areAllFieldsIncluded(data, indices):\n",
548
+ " exList = data.copy()\n",
549
+ " #print(\"all indices\")\n",
550
+ " else:\n",
551
+ " exList = list()\n",
552
+ " le = len(data)\n",
553
+ " for i in indices:\n",
554
+ " assert i < le , \"index {} out of bound {}\".format(i, le)\n",
555
+ " exList.append(data[i])\n",
556
+ "\n",
557
+ " return exList\n",
558
+ "\n",
559
+ "def arrayContains(arr, item):\n",
560
+ " \"\"\"\n",
561
+ " checks if array contains an item \n",
562
+ "\n",
563
+ " Parameters\n",
564
+ " arr : list data\n",
565
+ " item : item to search\n",
566
+ " \"\"\"\n",
567
+ " contains = True\n",
568
+ " try:\n",
569
+ " arr.index(item)\n",
570
+ " except ValueError:\n",
571
+ " contains = False\n",
572
+ " return contains\n",
573
+ "\n",
574
+ "def strToIntArray(line, delim=\",\"):\n",
575
+ " \"\"\"\n",
576
+ " int array from delim separated string\n",
577
+ "\n",
578
+ " Parameters\n",
579
+ " line ; delemeter separated string\n",
580
+ " \"\"\"\n",
581
+ " arr = line.split(delim)\n",
582
+ " return [int(a) for a in arr]\n",
583
+ "\n",
584
+ "def strToFloatArray(line, delim=\",\"):\n",
585
+ " \"\"\"\n",
586
+ " float array from delim separated string\n",
587
+ "\n",
588
+ " Parameters\n",
589
+ " line ; delemeter separated string\n",
590
+ " \"\"\"\n",
591
+ " arr = line.split(delim)\n",
592
+ " return [float(a) for a in arr]\n",
593
+ "\n",
594
+ "def strListOrRangeToIntArray(line):\n",
595
+ " \"\"\"\n",
596
+ " int array from delim separated string or range\n",
597
+ "\n",
598
+ " Parameters\n",
599
+ " line ; delemeter separated string\n",
600
+ " \"\"\"\n",
601
+ " varr = line.split(\",\")\n",
602
+ " if (len(varr) > 1):\n",
603
+ " iarr = list(map(lambda v: int(v), varr))\n",
604
+ " else:\n",
605
+ " vrange = line.split(\":\")\n",
606
+ " if (len(vrange) == 2):\n",
607
+ " lo = int(vrange[0])\n",
608
+ " hi = int(vrange[1])\n",
609
+ " iarr = list(range(lo, hi+1))\n",
610
+ " else:\n",
611
+ " iarr = [int(line)]\n",
612
+ " return iarr\n",
613
+ "\n",
614
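+ "# Worked examples (illustrative, not part of the original module): the parser\n",
+ "# above accepts a comma separated list, a colon range or a single value.\n",
+ "#   strListOrRangeToIntArray(\"3,5,9\")  returns [3, 5, 9]\n",
+ "#   strListOrRangeToIntArray(\"3:6\")    returns [3, 4, 5, 6]\n",
+ "#   strListOrRangeToIntArray(\"7\")      returns [7]\n",
+ "\n",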
+ "def toStr(val, precision):\n",
615
+ " \"\"\"\n",
616
+ " converts any type to string\t\n",
617
+ "\n",
618
+ " Parameters\n",
619
+ " val : value\n",
620
+ " precision ; precision for float value\n",
621
+ " \"\"\"\n",
622
+ " if type(val) == float or type(val) == np.float64 or type(val) == np.float32:\n",
623
+ " format = \"%\" + \".%df\" %(precision)\n",
624
+ " sVal = format %(val)\n",
625
+ " else:\n",
626
+ " sVal = str(val)\n",
627
+ " return sVal\n",
628
+ "\n",
629
+ "def toStrFromList(values, precision, delim=\",\"):\n",
630
+ " \"\"\"\n",
631
+ " converts list of any type to delim separated string\n",
632
+ "\n",
633
+ " Parameters\n",
634
+ " values : list data\n",
635
+ " precision ; precision for float value\n",
636
+ " delim : delemeter\n",
637
+ " \"\"\"\n",
638
+ " sValues = list(map(lambda v: toStr(v, precision), values))\n",
639
+ " return delim.join(sValues)\n",
640
+ "\n",
641
+ "def toIntList(values):\n",
642
+ " \"\"\"\n",
643
+ " convert to int list\n",
644
+ "\n",
645
+ " Parameters\n",
646
+ " values : list data\n",
647
+ " \"\"\"\n",
648
+ " return list(map(lambda va: int(va), values))\n",
649
+ "\n",
650
+ "def toFloatList(values):\n",
651
+ " \"\"\"\n",
652
+ " convert to float list\n",
653
+ "\n",
654
+ " Parameters\n",
655
+ " values : list data\n",
656
+ " \"\"\"\n",
657
+ " return list(map(lambda va: float(va), values))\n",
658
+ "\n",
659
+ "def toStrList(values, precision=None):\n",
660
+ " \"\"\"\n",
661
+ " convert to string list\n",
662
+ "\n",
663
+ " Parameters\n",
664
+ " values : list data\n",
665
+ " precision ; precision for float value\n",
666
+ " \"\"\"\n",
667
+ " return list(map(lambda va: toStr(va, precision), values))\n",
668
+ "\n",
669
+ "def toIntFromBoolean(value):\n",
670
+ " \"\"\"\n",
671
+ " convert to int\n",
672
+ "\n",
673
+ " Parameters\n",
674
+ " value : boolean value\n",
675
+ " \"\"\"\n",
676
+ " ival = 1 if value else 0\n",
677
+ " return ival\n",
678
+ "\n",
679
+ "def typedValue(val, dtype=None):\n",
680
+ " \"\"\"\n",
681
+ " return typed value given string, discovers data type if not specified\n",
682
+ "\n",
683
+ " Parameters\n",
684
+ " val : value\n",
685
+ " dtype : data type\n",
686
+ " \"\"\"\n",
687
+ " tVal = None\n",
688
+ "\n",
689
+ " if dtype is not None:\n",
690
+ " if dtype == \"num\":\n",
691
+ " dtype = \"int\" if dtype.find(\".\") == -1 else \"float\"\n",
692
+ "\n",
693
+ " if dtype == \"int\":\n",
694
+ " tVal = int(val)\n",
695
+ " elif dtype == \"float\":\n",
696
+ " tVal = float(val)\n",
697
+ " elif dtype == \"bool\":\n",
698
+ " tVal = bool(val)\n",
699
+ " else:\n",
700
+ " tVal = val\n",
701
+ " else:\n",
702
+ " if type(val) == str:\n",
703
+ " lVal = val.lower()\n",
704
+ "\n",
705
+ " #int\n",
706
+ " done = True\n",
707
+ " try:\n",
708
+ " tVal = int(val)\n",
709
+ " except ValueError:\n",
710
+ " done = False\n",
711
+ "\n",
712
+ " #float\n",
713
+ " if not done:\n",
714
+ " done = True\n",
715
+ " try:\n",
716
+ " tVal = float(val)\n",
717
+ " except ValueError:\n",
718
+ " done = False\n",
719
+ "\n",
720
+ " #boolean\n",
721
+ " if not done:\n",
722
+ " done = True\n",
723
+ " if lVal == \"true\":\n",
724
+ " tVal = True\n",
725
+ " elif lVal == \"false\":\n",
726
+ " tVal = False\n",
727
+ " else:\n",
728
+ " done = False\n",
729
+ " #None\t\t\n",
730
+ " if not done:\n",
731
+ " if lVal == \"none\":\n",
732
+ " tVal = None\n",
733
+ " else:\n",
734
+ " tVal = val\n",
735
+ " else:\n",
736
+ " tVal = val\n",
737
+ "\n",
738
+ " return tVal\n",
739
+ "\n",
740
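+ "# Worked examples (illustrative, not part of the original module): type\n",
+ "# discovery in typedValue when no data type is given:\n",
+ "#   typedValue(\"42\")    returns the int 42\n",
+ "#   typedValue(\"4.2\")   returns the float 4.2\n",
+ "#   typedValue(\"true\")  returns the boolean True\n",
+ "#   typedValue(\"abc\")   returns the string \"abc\"\n",
+ "\n",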
+ "def getAllFiles(dirPath):\n",
741
+ " \"\"\"\n",
742
+ " get all files recursively\n",
743
+ "\n",
744
+ " Parameters\n",
745
+ " dirPath : directory path\n",
746
+ " \"\"\"\n",
747
+ " filePaths = []\n",
748
+ " for (thisDir, subDirs, fileNames) in os.walk(dirPath):\n",
749
+ " for fileName in fileNames:\n",
750
+ " filePaths.append(os.path.join(thisDir, fileName))\n",
751
+ " filePaths.sort()\n",
752
+ " return filePaths\n",
753
+ "\n",
754
+ "def getFileContent(fpath, verbose=False):\n",
755
+ " \"\"\"\n",
756
+ " get file contents in directory\n",
757
+ "\n",
758
+ " Parameters\n",
759
+ " fpath ; directory path\n",
760
+ " verbose : verbosity flag\n",
761
+ " \"\"\"\n",
762
+ " # dcument list\n",
763
+ " docComplete = []\n",
764
+ " filePaths = getAllFiles(fpath)\n",
765
+ "\n",
766
+ " # read files\n",
767
+ " for filePath in filePaths:\n",
768
+ " if verbose:\n",
769
+ " print(\"next file \" + filePath)\n",
770
+ " with open(filePath, 'r') as contentFile:\n",
771
+ " content = contentFile.read()\n",
772
+ " docComplete.append(content)\n",
773
+ " return (docComplete, filePaths)\n",
774
+ "\n",
775
+ "def getOneFileContent(fpath):\n",
776
+ " \"\"\"\n",
777
+ " get one file contents\n",
778
+ "\n",
779
+ " Parameters\n",
780
+ " fpath : file path\n",
781
+ " \"\"\"\n",
782
+ " with open(fpath, 'r') as contentFile:\n",
783
+ " docStr = contentFile.read()\n",
784
+ " return docStr\n",
785
+ "\n",
786
+ "def getFileLines(dirPath, delim=\",\"):\n",
787
+ " \"\"\"\n",
788
+ " get lines from a file\n",
789
+ "\n",
790
+ " Parameters\n",
791
+ " dirPath : file path\n",
792
+ " delim : delemeter\n",
793
+ " \"\"\"\n",
794
+ " lines = list()\n",
795
+ " for li in fileRecGen(dirPath, delim):\n",
796
+ " lines.append(li)\n",
797
+ " return lines\n",
798
+ "\n",
799
+ "def getFileSampleLines(dirPath, percen, delim=\",\"):\n",
800
+ " \"\"\"\n",
801
+ " get sampled lines from a file\n",
802
+ "\n",
803
+ " Parameters\n",
804
+ " dirPath : file path\n",
805
+ " percen : sampling percentage\n",
806
+ " delim : delemeter\n",
807
+ " \"\"\"\n",
808
+ " lines = list()\n",
809
+ " for li in fileRecGen(dirPath, delim):\n",
810
+ " if randint(0, 100) < percen:\n",
811
+ " lines.append(li)\n",
812
+ " return lines\n",
813
+ "\n",
814
+ "def getFileColumnAsString(dirPath, index, delim=\",\"):\n",
815
+ " \"\"\"\n",
816
+ " get string column from a file\n",
817
+ "\n",
818
+ " Parameters\n",
819
+ " dirPath : file path\n",
820
+ " index : index\n",
821
+ " delim : delemeter\n",
822
+ " \"\"\"\n",
823
+ " fields = list()\n",
824
+ " for rec in fileRecGen(dirPath, delim):\n",
825
+ " fields.append(rec[index])\n",
826
+ " #print(fields)\t\n",
827
+ " return fields\n",
828
+ "\n",
829
+ "def getFileColumnsAsString(dirPath, indexes, delim=\",\"):\n",
830
+ " \"\"\"\n",
831
+ " get multiple string columns from a file\n",
832
+ "\n",
833
+ " Parameters\n",
834
+ " dirPath : file path\n",
835
+ " indexes : indexes of columns\n",
836
+ " delim : delemeter\n",
837
+ " \"\"\"\n",
838
+ " nindex = len(indexes)\n",
839
+ " columns = list(map(lambda i : list(), range(nindex)))\n",
840
+ " for rec in fileRecGen(dirPath, delim):\n",
841
+ " for i in range(nindex):\n",
842
+ " columns[i].append(rec[indexes[i]])\n",
843
+ " return columns\n",
844
+ "\n",
845
+ "def getFileColumnAsFloat(dirPath, index, delim=\",\"):\n",
846
+ " \"\"\"\n",
847
+ " get float fileds from a file\n",
848
+ "\n",
849
+ " Parameters\n",
850
+ " dirPath : file path\n",
851
+ " index : index\n",
852
+ " delim : delemeter\n",
853
+ " \"\"\"\n",
854
+ " #print(\"{} {}\".format(dirPath, index))\n",
855
+ " fields = getFileColumnAsString(dirPath, index, delim)\n",
856
+ " return list(map(lambda v:float(v), fields))\n",
857
+ "\n",
858
+ "def getFileColumnAsInt(dirPath, index, delim=\",\"):\n",
859
+ " \"\"\"\n",
860
+ " get float fileds from a file\n",
861
+ "\n",
862
+ " Parameters\n",
863
+ " dirPath : file path\n",
864
+ " index : index\n",
865
+ " delim : delemeter\n",
866
+ " \"\"\"\n",
867
+ " fields = getFileColumnAsString(dirPath, index, delim)\n",
868
+ " return list(map(lambda v:int(v), fields))\n",
869
+ "\n",
870
+ "def getFileAsIntMatrix(dirPath, columns, delim=\",\"):\n",
871
+ " \"\"\"\n",
872
+ " extracts int matrix from csv file given column indices with each row being concatenation of \n",
873
+ " extracted column values row size = num of columns\n",
874
+ "\n",
875
+ " Parameters\n",
876
+ " dirPath : file path\n",
877
+ " columns : indexes of columns\n",
878
+ " delim : delemeter\n",
879
+ " \"\"\"\n",
880
+ " mat = list()\n",
881
+ " for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
882
+ " mat.append(asIntList(rec))\n",
883
+ " return mat\n",
884
+ "\n",
885
+ "def getFileAsFloatMatrix(dirPath, columns, delim=\",\"):\n",
886
+ " \"\"\"\n",
887
+ " extracts float matrix from csv file given column indices with each row being concatenation of \n",
888
+ " extracted column values row size = num of columns\n",
889
+ " Parameters\n",
890
+ " dirPath : file path\n",
891
+ " columns : indexes of columns\n",
892
+ " delim : delemeter\n",
893
+ " \"\"\"\n",
894
+ " mat = list()\n",
895
+ " for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
896
+ " mat.append(asFloatList(rec))\n",
897
+ " return mat\n",
898
+ "\n",
899
+ "def getFileAsFloatColumn(dirPath):\n",
900
+ " \"\"\"\n",
901
+ " grt float list from a file with one float per row\n",
902
+ " Parameters\n",
903
+ " dirPath : file path\n",
904
+ " \"\"\"\n",
905
+ " flist = list()\n",
906
+ " for rec in fileRecGen(dirPath, None):\n",
907
+ " flist.append(float(rec))\n",
908
+ " return flist\n",
909
+ "\n",
910
+ "def getFileAsFiltFloatMatrix(dirPath, filt, columns, delim=\",\"):\n",
911
+ " \"\"\"\n",
912
+ " extracts float matrix from csv file given row filter and column indices with each row being \n",
913
+ " concatenation of extracted column values row size = num of columns\n",
914
+ " Parameters\n",
915
+ " dirPath : file path\n",
916
+ " columns : indexes of columns\n",
917
+ " filt : row filter lambda\n",
918
+ " delim : delemeter\n",
919
+ " \"\"\"\n",
920
+ " mat = list()\n",
921
+ " for rec in fileFiltSelFieldsRecGen(dirPath, filt, columns, delim):\n",
922
+ " mat.append(asFloatList(rec))\n",
923
+ " return mat\n",
924
+ "\n",
925
+ "def getFileAsTypedRecords(dirPath, types, delim=\",\"):\n",
926
+ " \"\"\"\n",
927
+ " extracts typed records from csv file with each row being concatenation of \n",
928
+ " extracted column values \n",
929
+ " Parameters\n",
930
+ " dirPath : file path\n",
931
+ " types : data types\n",
932
+ " delim : delemeter\n",
933
+ " \"\"\"\n",
934
+ " (dtypes, cvalues) = extractTypesFromString(types)\t\n",
935
+ " tdata = list()\n",
936
+ " for rec in fileRecGen(dirPath, delim):\n",
937
+ " trec = list()\n",
938
+ " for index, value in enumerate(rec):\n",
939
+ " value = __convToTyped(index, value, dtypes)\n",
940
+ " trec.append(value)\n",
941
+ " tdata.append(trec)\n",
942
+ " return tdata\n",
943
+ "\n",
944
+ "\n",
945
+ "def getFileColsAsTypedRecords(dirPath, columns, types, delim=\",\"):\n",
946
+ " \"\"\"\n",
947
+ " extracts typed records from csv file given column indices with each row being concatenation of \n",
948
+ " extracted column values \n",
949
+ " Parameters\n",
950
+ " Parameters\n",
951
+ " dirPath : file path\n",
952
+ " columns : column indexes\n",
953
+ " types : data types\n",
954
+ " delim : delemeter\n",
955
+ " \"\"\"\n",
956
+ " (dtypes, cvalues) = extractTypesFromString(types)\t\n",
957
+ " tdata = list()\n",
958
+ " for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
959
+ " trec = list()\n",
960
+ " for indx, value in enumerate(rec):\n",
961
+ " tindx = columns[indx]\n",
962
+ " value = __convToTyped(tindx, value, dtypes)\n",
963
+ " trec.append(value)\n",
964
+ " tdata.append(trec)\n",
965
+ " return tdata\n",
966
+ "\n",
967
+ "def getFileColumnsMinMax(dirPath, columns, dtype, delim=\",\"):\n",
968
+ " \"\"\"\n",
969
+ " extracts numeric matrix from csv file given column indices. For each column return min and max\n",
970
+ " Parameters\n",
971
+ " dirPath : file path\n",
972
+ " columns : column indexes\n",
973
+ " dtype : data type\n",
974
+ " delim : delemeter\n",
975
+ " \"\"\"\n",
976
+ " dtypes = list(map(lambda c : str(c) + \":\" + dtype, columns))\n",
977
+ " dtypes = \",\".join(dtypes)\n",
978
+ " #print(dtypes)\n",
979
+ "\n",
980
+ " tdata = getFileColsAsTypedRecords(dirPath, columns, dtypes, delim)\n",
981
+ " minMax = list()\n",
982
+ " ncola = len(tdata[0])\n",
983
+ " ncole = len(columns)\n",
984
+ " assertEqual(ncola, ncole, \"actual no of columns different from expected\")\n",
985
+ "\n",
986
+ " for ci in range(ncole):\t\n",
987
+ " vmin = sys.float_info.max\n",
988
+ " vmax = sys.float_info.min\n",
989
+ " for r in tdata:\n",
990
+ " cv = r[ci]\n",
991
+ " vmin = cv if cv < vmin else vmin\n",
992
+ " vmax = cv if cv > vmax else vmax\n",
993
+ " mm = (vmin, vmax, vmax - vmin)\n",
994
+ " minMax.append(mm)\n",
995
+ "\n",
996
+ " return minMax\n",
997
+ "\n",
998
+ "\n",
999
+ "def getRecAsTypedRecord(rec, types, delim=None):\n",
1000
+ " \"\"\"\n",
1001
+ " converts record to typed records \n",
1002
+ " Parameters\n",
1003
+ " rec : delemeter separate string or list of string\n",
1004
+ " types : field data types\n",
1005
+ " delim : delemeter\n",
1006
+ " \"\"\"\t\n",
1007
+ " if delim is not None:\n",
1008
+ " rec = rec.split(delim)\n",
1009
+ " (dtypes, cvalues) = extractTypesFromString(types)\t\n",
1010
+ " #print(types)\n",
1011
+ " #print(dtypes)\n",
1012
+ " trec = list()\n",
1013
+ " for ind, value in enumerate(rec):\n",
1014
+ " tvalue = __convToTyped(ind, value, dtypes)\n",
1015
+ " trec.append(tvalue)\n",
1016
+ " return trec\n",
1017
+ "\n",
1018
+ "def __convToTyped(index, value, dtypes):\n",
1019
+ " \"\"\"\n",
1020
+ " convert to typed value \n",
1021
+ " Parameters\n",
1022
+ " index : index in type list\n",
1023
+ " value : data value\n",
1024
+ " dtypes : data type list\n",
1025
+ " \"\"\"\n",
1026
+ " #print(index, value)\n",
1027
+ " dtype = dtypes[index]\n",
1028
+ " tvalue = value\n",
1029
+ " if dtype == \"int\":\n",
1030
+ " tvalue = int(value)\n",
1031
+ " elif dtype == \"float\":\n",
1032
+ " tvalue = float(value)\n",
1033
+ " return tvalue\n",
1034
+ "\n",
1035
+ "\n",
1036
+ "\n",
1037
+ "def extractTypesFromString(types):\n",
1038
+ " \"\"\"\n",
1039
+ " extracts column data types and set values for categorical variables \n",
1040
+ " Parameters\n",
1041
+ " types : encoded type information\n",
1042
+ " \"\"\"\n",
1043
+ " ftypes = types.split(\",\")\n",
1044
+ " dtypes = dict()\n",
1045
+ " cvalues = dict()\n",
1046
+ " for ftype in ftypes:\n",
1047
+ " items = ftype.split(\":\") \n",
1048
+ " cindex = int(items[0])\n",
1049
+ " dtype = items[1]\n",
1050
+ " dtypes[cindex] = dtype\n",
1051
+ " if len(items) == 3:\n",
1052
+ " sitems = items[2].split()\n",
1053
+ " cvalues[cindex] = sitems\n",
1054
+ " return (dtypes, cvalues)\n",
1055
+ "\n",
1056
+ "def getMultipleFileAsInttMatrix(dirPathWithCol, delim=\",\"):\n",
1057
+ " \"\"\"\n",
1058
+ " extracts int matrix from from csv files given column index for each file. \n",
1059
+ " num of columns = number of rows in each file and num of rows = number of files\n",
1060
+ " Parameters\n",
1061
+ " dirPathWithCol: list of file path and collumn index pair\n",
1062
+ " delim : delemeter\n",
1063
+ " \"\"\"\n",
1064
+ " mat = list()\n",
1065
+ " minLen = -1\n",
1066
+ " for path, col in dirPathWithCol:\n",
1067
+ " colVals = getFileColumnAsInt(path, col, delim)\n",
1068
+ " if minLen < 0 or len(colVals) < minLen:\n",
1069
+ " minLen = len(colVals)\n",
1070
+ " mat.append(colVals)\n",
1071
+ "\n",
1072
+ " #make all same length\n",
1073
+ " mat = list(map(lambda li:li[:minLen], mat))\t\n",
1074
+ " return mat\n",
1075
+ "\n",
1076
+ "def getMultipleFileAsFloatMatrix(dirPathWithCol, delim=\",\"):\n",
1077
+ " \"\"\"\n",
1078
+ " extracts float matrix from from csv files given column index for each file. \n",
1079
+ " num of columns = number of rows in each file and num of rows = number of files\n",
1080
+ " Parameters\n",
1081
+ " dirPathWithCol: list of file path and collumn index pair\n",
1082
+ " delim : delemeter\n",
1083
+ " \"\"\"\n",
1084
+ " mat = list()\n",
1085
+ " minLen = -1\n",
1086
+ " for path, col in dirPathWithCol:\n",
1087
+ " colVals = getFileColumnAsFloat(path, col, delim)\n",
1088
+ " if minLen < 0 or len(colVals) < minLen:\n",
1089
+ " minLen = len(colVals)\n",
1090
+ " mat.append(colVals)\n",
1091
+ "\n",
1092
+ " #make all same length\n",
1093
+ " mat = list(map(lambda li:li[:minLen], mat))\n",
1094
+ " return mat\n",
1095
+ "\n",
1096
+ "def writeStrListToFile(ldata, filePath, delem=\",\"):\n",
1097
+ " \"\"\"\n",
1098
+ " writes list of dlem separated string or list of list of string to afile\n",
1099
+ "\n",
1100
+ " Parameters\n",
1101
+ " ldata : list data\n",
1102
+ " filePath : file path\n",
1103
+ " delim : delemeter\n",
1104
+ " \"\"\"\n",
1105
+ " with open(filePath, \"w\") as fh:\n",
1106
+ " for r in ldata:\n",
1107
+ " if type(r) == list:\n",
1108
+ " r = delem.join(r)\n",
1109
+ " fh.write(r + \"\\n\")\n",
1110
+ "\n",
1111
+ "def writeFloatListToFile(ldata, prec, filePath):\n",
1112
+ " \"\"\"\n",
1113
+ " writes float list to file, one value per line\n",
1114
+ "\n",
1115
+ " Parameters\n",
1116
+ " ldata : list data\n",
1117
+ " prec : precision\n",
1118
+ " filePath : file path\n",
1119
+ " \"\"\"\n",
1120
+ " with open(filePath, \"w\") as fh:\n",
1121
+ " for d in ldata:\n",
1122
+ " fh.write(formatFloat(prec, d) + \"\\n\")\n",
1123
+ "\n",
1124
+ "\n",
1125
+ "def takeFirst(elems):\n",
1126
+ " \"\"\"\n",
1127
+ " return fisrt item\n",
1128
+ " Parameters\n",
1129
+ " elems : list of data \n",
1130
+ " \"\"\"\n",
1131
+ " return elems[0]\n",
1132
+ "\n",
1133
+ "def takeSecond(elems):\n",
1134
+ " \"\"\"\n",
1135
+ " return 2nd element\n",
1136
+ " Parameters\n",
1137
+ " elems : list of data \n",
1138
+ " \"\"\"\n",
1139
+ " return elems[1]\n",
1140
+ "\n",
1141
+ "def takeThird(elems):\n",
1142
+ " \"\"\"\n",
1143
+ " returns 3rd element\n",
1144
+ " Parameters\n",
1145
+ " elems : list of data \n",
1146
+ " \"\"\"\n",
1147
+ " return elems[2]\n",
1148
+ "\n",
1149
+ "def addToKeyedCounter(dCounter, key, count=1):\n",
1150
+ " \"\"\"\n",
1151
+ " add to to keyed counter\n",
1152
+ " Parameters\n",
1153
+ " dCounter : dictionary of counters\n",
1154
+ " key : dictionary key\n",
1155
+ " count : count to add\n",
1156
+ " \"\"\"\n",
1157
+ " curCount = dCounter.get(key, 0)\n",
1158
+ " dCounter[key] = curCount + count\n",
1159
+ "\n",
1160
+ "def incrKeyedCounter(dCounter, key):\n",
1161
+ " \"\"\"\n",
1162
+ " increment keyed counter\n",
1163
+ " Parameters\n",
1164
+ " dCounter : dictionary of counters\n",
1165
+ " key : dictionary key\n",
1166
+ " \"\"\"\n",
1167
+ " addToKeyedCounter(dCounter, key, 1)\n",
1168
+ "\n",
1169
+ "def appendKeyedList(dList, key, elem):\n",
1170
+ " \"\"\"\n",
1171
+ " keyed list\n",
1172
+ " Parameters\n",
1173
+ " dList : dictionary of lists\n",
1174
+ " key : dictionary key\n",
1175
+ " elem : value to append\n",
1176
+ " \"\"\"\n",
1177
+ " curList = dList.get(key, [])\n",
1178
+ " curList.append(elem)\n",
1179
+ " dList[key] = curList\n",
1180
+ "\n",
1181
+ "def isNumber(st):\n",
1182
+ " \"\"\"\n",
1183
+ " Returns True is string is a number\n",
1184
+ " Parameters\n",
1185
+ " st : string value\n",
1186
+ " \"\"\"\n",
1187
+ " return st.replace('.','',1).isdigit()\n",
1188
+ "\n",
1189
+ "def removeNan(values):\n",
1190
+ " \"\"\"\n",
1191
+ " removes nan from list\n",
1192
+ " Parameters\n",
1193
+ " values : list data\n",
1194
+ " \"\"\"\n",
1195
+ " return list(filter(lambda v: not math.isnan(v), values))\n",
1196
+ "\n",
1197
+ "def fileRecGen(filePath, delim = \",\"):\n",
1198
+ " \"\"\"\n",
1199
+ " file record generator\n",
1200
+ " Parameters\n",
1201
+ " filePath ; file path\n",
1202
+ " delim : delemeter\n",
1203
+ " \"\"\"\n",
1204
+ " with open(filePath, \"r\") as fp:\n",
1205
+ " for line in fp:\t\n",
1206
+ " line = line[:-1]\n",
1207
+ " if delim is not None:\n",
1208
+ " line = line.split(delim)\n",
1209
+ " yield line\n",
1210
+ "\n",
1211
+ "def fileSelFieldsRecGen(dirPath, columns, delim=\",\"):\n",
1212
+ " \"\"\"\n",
1213
+ " file record generator given column indices \n",
1214
+ " Parameters\n",
1215
+ " filePath ; file path\n",
1216
+ " columns : column indexes as int array or coma separated string\n",
1217
+ " delim : delemeter\n",
1218
+ " \"\"\"\n",
1219
+ " if type(columns) == str:\n",
1220
+ " columns = strToIntArray(columns, delim)\n",
1221
+ " for rec in fileRecGen(dirPath, delim):\n",
1222
+ " extracted = extractList(rec, columns)\n",
1223
+ " yield extracted\n",
1224
+ "\n",
1225
+ "def fileFiltRecGen(filePath, filt, delim = \",\"):\n",
1226
+ " \"\"\"\n",
1227
+ " file record generator with row filter applied\n",
1228
+ " Parameters\n",
1229
+ " filePath ; file path\n",
1230
+ " filt : row filter\n",
1231
+ " delim : delemeter\n",
1232
+ " \"\"\"\n",
1233
+ " with open(filePath, \"r\") as fp:\n",
1234
+ " for line in fp:\t\n",
1235
+ " line = line[:-1]\n",
1236
+ " if delim is not None:\n",
1237
+ " line = line.split(delim)\n",
1238
+ " if filt(line):\n",
1239
+ " yield line\n",
1240
+ "\n",
1241
+ "def fileFiltSelFieldsRecGen(filePath, filt, columns, delim = \",\"):\n",
1242
+ " \"\"\"\n",
1243
+ " file record generator with row and column filter applied\n",
1244
+ " Parameters\n",
1245
+ " filePath ; file path\n",
1246
+ " filt : row filter\n",
1247
+ " columns : column indexes as int array or coma separated string\n",
1248
+ " delim : delemeter\n",
1249
+ " \"\"\"\n",
1250
+ " columns = strToIntArray(columns, delim)\n",
1251
+ " with open(filePath, \"r\") as fp:\n",
1252
+ " for line in fp:\t\n",
1253
+ " line = line[:-1]\n",
1254
+ " if delim is not None:\n",
1255
+ " line = line.split(delim)\n",
1256
+ " if filt(line):\n",
1257
+ " selected = extractList(line, columns)\n",
1258
+ " yield selected\n",
1259
+ "\n",
1260
+ "def fileTypedRecGen(filePath, ftypes, delim = \",\"):\n",
1261
+ " \"\"\"\n",
1262
+ " file typed record generator\n",
1263
+ " Parameters\n",
1264
+ " filePath ; file path\n",
1265
+ " ftypes : list of field types\n",
1266
+ " delim : delemeter\n",
1267
+ " \"\"\"\n",
1268
+ " with open(filePath, \"r\") as fp:\n",
1269
+ " for line in fp:\t\n",
1270
+ " line = line[:-1]\n",
1271
+ " line = line.split(delim)\n",
1272
+ " for i in range(0, len(ftypes), 2):\n",
1273
+ " ci = ftypes[i]\n",
1274
+ " dtype = ftypes[i+1]\n",
1275
+ " assertLesser(ci, len(line), \"index out of bound\")\n",
1276
+ " if dtype == \"int\":\n",
1277
+ " line[ci] = int(line[ci])\n",
1278
+ " elif dtype == \"float\":\n",
1279
+ " line[ci] = float(line[ci])\n",
1280
+ " else:\n",
1281
+ " exitWithMsg(\"invalid data type\")\n",
1282
+ " yield line\n",
1283
+ "\n",
1284
+ "def fileMutatedFieldsRecGen(dirPath, mutator, delim=\",\"):\n",
1285
+ " \"\"\"\n",
1286
+ " file record generator with some columns mutated \n",
1287
+ " Parameters\n",
1288
+ " dirPath ; file path\n",
1289
+ " mutator : row field mutator\n",
1290
+ " delim : delemeter\n",
1291
+ " \"\"\"\n",
1292
+ " for rec in fileRecGen(dirPath, delim):\n",
1293
+ " mutated = mutator(rec)\n",
1294
+ " yield mutated\n",
1295
+ "\n",
1296
+ "def tableSelFieldsFilter(tdata, columns):\n",
1297
+ " \"\"\"\n",
1298
+ " gets tabular data for selected columns \n",
1299
+ " Parameters\n",
1300
+ " tdata : tabular data\n",
1301
+ " columns : column indexes\n",
1302
+ " \"\"\"\n",
1303
+ " if areAllFieldsIncluded(tdata[0], columns):\n",
1304
+ " ntdata = tdata\n",
1305
+ " else:\n",
1306
+ " ntdata = list()\n",
1307
+ " for rec in tdata:\n",
1308
+ " #print(rec)\n",
1309
+ " #print(columns)\n",
1310
+ " nrec = extractList(rec, columns)\n",
1311
+ " ntdata.append(nrec)\n",
1312
+ " return ntdata\n",
1313
+ "\n",
1314
+ "\n",
1315
+ "def areAllFieldsIncluded(ldata, columns):\n",
1316
+ " \"\"\"\n",
1317
+ " return True id all indexes are in the columns\n",
1318
+ " Parameters\n",
1319
+ " ldata : list data\n",
1320
+ " columns : column indexes\n",
1321
+ " \"\"\"\n",
1322
+ " return list(range(len(ldata))) == columns\n",
1323
+ "\n",
1324
+ "def asIntList(items):\n",
1325
+ " \"\"\"\n",
1326
+ " returns int list\n",
1327
+ " Parameters\n",
1328
+ " items : list data\n",
1329
+ " \"\"\"\n",
1330
+ " return [int(i) for i in items]\n",
1331
+ "\n",
1332
+ "def asFloatList(items):\n",
1333
+ " \"\"\"\n",
1334
+ " returns float list\n",
1335
+ " Parameters\n",
1336
+ " items : list data\n",
1337
+ " \"\"\"\n",
1338
+ " return [float(i) for i in items]\n",
1339
+ "\n",
1340
+ "def pastTime(interval, unit):\n",
1341
+ " \"\"\"\n",
1342
+ " current and past time\n",
1343
+ " Parameters\n",
1344
+ " interval : time interval\n",
1345
+ " unit: time unit\n",
1346
+ " \"\"\"\n",
1347
+ " curTime = int(time.time())\n",
1348
+ " if unit == \"d\":\n",
1349
+ " pastTime = curTime - interval * secInDay\n",
1350
+ " elif unit == \"h\":\n",
1351
+ " pastTime = curTime - interval * secInHour\n",
1352
+ " elif unit == \"m\":\n",
1353
+ " pastTime = curTime - interval * secInMinute\n",
1354
+ " else:\n",
1355
+ " raise ValueError(\"invalid time unit \" + unit)\n",
1356
+ " return (curTime, pastTime)\n",
1357
+ "\n",
1358
+ "def minuteAlign(ts):\n",
1359
+ " \"\"\"\n",
1360
+ " minute aligned time\t\n",
1361
+ " Parameters\n",
1362
+ " ts : time stamp in sec\n",
1363
+ " \"\"\"\n",
1364
+ " return int((ts / secInMinute)) * secInMinute\n",
1365
+ "\n",
1366
+ "def multMinuteAlign(ts, min):\n",
1367
+ " \"\"\"\n",
1368
+ " multi minute aligned time\t\n",
1369
+ " Parameters\n",
1370
+ " ts : time stamp in sec\n",
1371
+ " min : minute value\n",
1372
+ " \"\"\"\n",
1373
+ " intv = secInMinute * min\n",
1374
+ " return int((ts / intv)) * intv\n",
1375
+ "\n",
1376
+ "def hourAlign(ts):\n",
1377
+ " \"\"\"\n",
1378
+ " hour aligned time\n",
1379
+ " Parameters\n",
1380
+ " ts : time stamp in sec\n",
1381
+ " \"\"\"\n",
1382
+ " return int((ts / secInHour)) * secInHour\n",
1383
+ "\n",
1384
+ "def hourOfDayAlign(ts, hour):\n",
1385
+ " \"\"\"\n",
1386
+ " hour of day aligned time\n",
1387
+ " Parameters\n",
1388
+ " ts : time stamp in sec\n",
1389
+ " hour : hour of day\n",
1390
+ " \"\"\"\n",
1391
+ " day = int(ts / secInDay)\n",
1392
+ " return (24 * day + hour) * secInHour\n",
1393
+ "\n",
1394
+ "def dayAlign(ts):\n",
1395
+ " \"\"\"\n",
1396
+ " day aligned time\n",
1397
+ " Parameters\n",
1398
+ " ts : time stamp in sec\n",
1399
+ " \"\"\"\n",
1400
+ " return int(ts / secInDay) * secInDay\n",
1401
+ "\n",
1402
+ "def timeAlign(ts, unit):\n",
1403
+ " \"\"\"\n",
1404
+ " boundary alignment of time\n",
1405
+ " Parameters\n",
1406
+ " ts : time stamp in sec\n",
1407
+ " unit : unit of time\n",
1408
+ " \"\"\"\n",
1409
+ " alignedTs = 0\n",
1410
+ " if unit == \"s\":\n",
1411
+ " alignedTs = ts\n",
1412
+ " elif unit == \"m\":\n",
1413
+ " alignedTs = minuteAlign(ts)\n",
1414
+ " elif unit == \"h\":\n",
1415
+ " alignedTs = hourAlign(ts)\n",
1416
+ " elif unit == \"d\":\n",
1417
+ " alignedTs = dayAlign(ts)\n",
1418
+ " else:\n",
1419
+ " raise ValueError(\"invalid time unit\")\n",
1420
+ " return \talignedTs\n",
1421
+ "\n",
1422
+ "def monthOfYear(ts):\n",
1423
+ " \"\"\"\n",
1424
+ " month of year\n",
1425
+ " Parameters\n",
1426
+ " ts : time stamp in sec\n",
1427
+ " \"\"\"\n",
1428
+ " rem = ts % secInYear\n",
1429
+ " dow = int(rem / secInMonth)\n",
1430
+ " return dow\n",
1431
+ "\n",
1432
+ "def dayOfWeek(ts):\n",
1433
+ " \"\"\"\n",
1434
+ " day of week\n",
1435
+ " Parameters\n",
1436
+ " ts : time stamp in sec\n",
1437
+ " \"\"\"\n",
1438
+ " rem = ts % secInWeek\n",
1439
+ " dow = int(rem / secInDay)\n",
1440
+ " return dow\n",
1441
+ "\n",
1442
+ "def hourOfDay(ts):\n",
1443
+ " \"\"\"\n",
1444
+ " hour of day\n",
1445
+ " Parameters\n",
1446
+ " ts : time stamp in sec\n",
1447
+ " \"\"\"\n",
1448
+ " rem = ts % secInDay\n",
1449
+ " hod = int(rem / secInHour)\n",
1450
+ " return hod\n",
1451
+ "\n",
1452
+ "def processCmdLineArgs(expectedTypes, usage):\n",
1453
+ " \"\"\"\n",
1454
+ " process command line args and returns args as typed values\n",
1455
+ " Parameters\n",
1456
+ " expectedTypes : expected data types of arguments\n",
1457
+ " usage : usage message string\n",
1458
+ " \"\"\"\n",
1459
+ " args = []\n",
1460
+ " numComLineArgs = len(sys.argv)\n",
1461
+ " numExpected = len(expectedTypes)\n",
1462
+ " if (numComLineArgs - 1 == len(expectedTypes)):\n",
1463
+ " try:\n",
1464
+ " for i in range(0, numExpected):\n",
1465
+ " if (expectedTypes[i] == typeInt):\n",
1466
+ " args.append(int(sys.argv[i+1]))\n",
1467
+ " elif (expectedTypes[i] == typeFloat):\n",
1468
+ " args.append(float(sys.argv[i+1]))\n",
1469
+ " elif (expectedTypes[i] == typeString):\n",
1470
+ " args.append(sys.argv[i+1])\n",
1471
+ " except ValueError:\n",
1472
+ " print (\"expected number of command line arguments found but there is type mis match\")\n",
1473
+ " sys.exit(1)\n",
1474
+ " else:\n",
1475
+ " print (\"expected number of command line arguments not found\")\n",
1476
+ " print (usage)\n",
1477
+ " sys.exit(1)\n",
1478
+ " return args\n",
1479
+ "\n",
1480
+ "def mutateString(val, numMutate, ctype):\n",
1481
+ " \"\"\"\n",
1482
+ " mutate string multiple times\n",
1483
+ " Parameters\n",
1484
+ " val : string value\n",
1485
+ " numMutate : num of mutations\n",
1486
+ " ctype : type of character to mutate with\n",
1487
+ " \"\"\"\n",
1488
+ " mutations = set()\n",
1489
+ " count = 0\n",
1490
+ " while count < numMutate:\n",
1491
+ " j = randint(0, len(val)-1)\n",
1492
+ " if j not in mutations:\n",
1493
+ " if ctype == \"alpha\":\n",
1494
+ " ch = selectRandomFromList(alphaTokens)\n",
1495
+ " elif ctype == \"num\":\n",
1496
+ " ch = selectRandomFromList(numTokens)\n",
1497
+ " elif ctype == \"any\":\n",
1498
+ " ch = selectRandomFromList(tokens)\n",
1499
+ " val = val[:j] + ch + val[j+1:]\n",
1500
+ " mutations.add(j)\n",
1501
+ " count += 1\n",
1502
+ " return val\n",
1503
+ "\n",
1504
+ "def mutateList(values, numMutate, vmin, vmax):\n",
1505
+ " \"\"\"\n",
1506
+ " mutate list multiple times\n",
1507
+ " Parameters\n",
1508
+ " values : list value\n",
1509
+ " numMutate : num of mutations\n",
1510
+ " vmin : minimum of value range\n",
1511
+ " vmax : maximum of value range\n",
1512
+ " \"\"\"\n",
1513
+ " mutations = set()\n",
1514
+ " count = 0\n",
1515
+ " while count < numMutate:\n",
1516
+ " j = randint(0, len(values)-1)\n",
1517
+ " if j not in mutations:\n",
1518
+ " values[j] = np.random.uniform(vmin, vmax)\n",
1519
+ " count += 1\n",
1520
+ " return values\n",
1521
+ "\n",
1522
+ "\n",
1523
+ "def swap(values, first, second):\n",
1524
+ " \"\"\"\n",
1525
+ " swap two elements\n",
1526
+ " Parameters\n",
1527
+ " values : list value\n",
1528
+ " first : first swap position\n",
1529
+ " second : second swap position\n",
1530
+ " \"\"\"\n",
1531
+ " t = values[first]\n",
1532
+ " values[first] = values[second]\n",
1533
+ " values[second] = t\n",
1534
+ "\n",
1535
+ "def swapBetweenLists(values1, values2):\n",
1536
+ " \"\"\"\n",
1537
+ " swap two elements between 2 lists\n",
1538
+ " Parameters\n",
1539
+ " values1 : first list of values\n",
1540
+ " values2 : second list of values\n",
1541
+ " \"\"\"\n",
1542
+ " p1 = randint(0, len(values1)-1)\n",
1543
+ " p2 = randint(0, len(values2)-1)\n",
1544
+ " tmp = values1[p1]\t\n",
1545
+ " values1[p1] = values2[p2]\n",
1546
+ " values2[p2] = tmp\n",
1547
+ "\n",
1548
+ "def safeAppend(values, value):\n",
1549
+ " \"\"\"\n",
1550
+ " append only if not None\n",
1551
+ " Parameters\n",
1552
+ " values : list value\n",
1553
+ " value : value to append\n",
1554
+ " \"\"\"\n",
1555
+ " if value is not None:\n",
1556
+ " values.append(value)\n",
1557
+ "\n",
1558
+ "def getAllIndex(ldata, fldata):\n",
1559
+ " \"\"\"\n",
1560
+ " get ALL indexes of list elements\n",
1561
+ " Parameters\n",
1562
+ " ldata : list data to find index in\n",
1563
+ " fldata : list data for values for index look up\n",
1564
+ " \"\"\"\n",
1565
+ " return list(map(lambda e : fldata.index(e), ldata))\n",
1566
+ "\n",
1567
+ "def findIntersection(lOne, lTwo):\n",
1568
+ " \"\"\"\n",
1569
+ " find intersection elements between 2 lists\n",
1570
+ " Parameters\n",
1571
+ " lOne : first list of data\n",
1572
+ " lTwo : second list of data\n",
1573
+ " \"\"\"\n",
1574
+ " sOne = set(lOne)\n",
1575
+ " sTwo = set(lTwo)\n",
1576
+ " sInt = sOne.intersection(sTwo)\n",
1577
+ " return list(sInt)\n",
1578
+ "\n",
1579
+ "def isIntvOverlapped(rOne, rTwo):\n",
1580
+ " \"\"\"\n",
1581
+ " checks overlap between 2 intervals\n",
1582
+ " Parameters\n",
1583
+ " rOne : first interval boundaries\n",
1584
+ " rTwo : second interval boundaries\n",
1585
+ " \"\"\"\n",
1586
+ " clear = rOne[1] <= rTwo[0] or rOne[0] >= rTwo[1] \n",
1587
+ " return not clear\n",
1588
+ "\n",
1589
+ "def isIntvLess(rOne, rTwo):\n",
1590
+ " \"\"\"\n",
1591
+ " checks if first iterval is less than second\n",
1592
+ " Parameters\n",
1593
+ " rOne : first interval boundaries\n",
1594
+ " rTwo : second interval boundaries\n",
1595
+ " \"\"\"\n",
1596
+ " less = rOne[1] <= rTwo[0] \n",
1597
+ " return less\n",
1598
+ "\n",
1599
+ "def findRank(e, values):\n",
1600
+ " \"\"\"\n",
1601
+ " find rank of value in a list\n",
1602
+ " Parameters\n",
1603
+ " e : value to compare with\n",
1604
+ " values : list data\n",
1605
+ " \"\"\"\n",
1606
+ " count = 1\n",
1607
+ " for ve in values:\n",
1608
+ " if ve < e:\n",
1609
+ " count += 1\n",
1610
+ " return count\n",
1611
+ "\n",
1612
+ "def findRanks(toBeRanked, values):\n",
1613
+ " \"\"\"\n",
1614
+ " find ranks of values in one list in another list\n",
1615
+ " Parameters\n",
1616
+ " toBeRanked : list of values for which ranks are found\n",
1617
+ " values : list in which rank is found : \n",
1618
+ " \"\"\"\n",
1619
+ " return list(map(lambda e: findRank(e, values), toBeRanked))\n",
1620
+ "\n",
1621
+ "def formatFloat(prec, value, label = None):\n",
1622
+ " \"\"\"\n",
1623
+ " formats a float with optional label\n",
1624
+ " Parameters\n",
1625
+ " prec : precision\n",
1626
+ " value : data value\n",
1627
+ " label : label for data\n",
1628
+ " \"\"\"\n",
1629
+ " st = (label + \" \") if label else \"\"\n",
1630
+ " formatter = \"{:.\" + str(prec) + \"f}\" \n",
1631
+ " return st + formatter.format(value)\n",
1632
+ "\n",
1633
+ "def formatAny(value, label = None):\n",
1634
+ " \"\"\"\n",
1635
+ " formats any obkect with optional label\n",
1636
+ " Parameters\n",
1637
+ " value : data value\n",
1638
+ " label : label for data\n",
1639
+ " \"\"\"\n",
1640
+ " st = (label + \" \") if label else \"\"\n",
1641
+ " return st + str(value)\n",
1642
+ "\n",
1643
+ "def printList(values):\n",
1644
+ " \"\"\"\n",
1645
+ " pretty print list\n",
1646
+ " Parameters\n",
1647
+ " values : list of values\n",
1648
+ " \"\"\"\n",
1649
+ " for v in values:\n",
1650
+ " print(v)\n",
1651
+ "\n",
1652
+ "def printMap(values, klab, vlab, precision, offset=16):\n",
1653
+ " \"\"\"\n",
1654
+ " pretty print hash map\n",
1655
+ " Parameters\n",
1656
+ " values : dictionary of values\n",
1657
+ " klab : label for key\n",
1658
+ " vlab : label for value\n",
1659
+ " precision : precision\n",
1660
+ " offset : left justify offset\n",
1661
+ " \"\"\"\n",
1662
+ " print(klab.ljust(offset, \" \") + vlab)\n",
1663
+ " for k in values.keys():\n",
1664
+ " v = values[k]\n",
1665
+ " ks = toStr(k, precision).ljust(offset, \" \")\n",
1666
+ " vs = toStr(v, precision)\n",
1667
+ " print(ks + vs)\n",
1668
+ "\n",
1669
+ "def printPairList(values, lab1, lab2, precision, offset=16):\n",
1670
+ " \"\"\"\n",
1671
+ " pretty print list of pairs\n",
1672
+ " Parameters\n",
1673
+ " values : dictionary of values\n",
1674
+ " lab1 : first label\n",
1675
+ " lab2 : second label\n",
1676
+ " precision : precision\n",
1677
+ " offset : left justify offset\n",
1678
+ " \"\"\"\n",
1679
+ " print(lab1.ljust(offset, \" \") + lab2)\n",
1680
+ " for (v1, v2) in values:\n",
1681
+ " sv1 = toStr(v1, precision).ljust(offset, \" \")\n",
1682
+ " sv2 = toStr(v2, precision)\n",
1683
+ " print(sv1 + sv2)\n",
1684
+ "\n",
1685
+ "def createMap(*values):\n",
1686
+ " \"\"\"\n",
1687
+ " create disctionary with results\n",
1688
+ " Parameters\n",
1689
+ " values : sequence of key value pairs\n",
1690
+ " \"\"\"\n",
1691
+ " result = dict()\n",
1692
+ " for i in range(0, len(values), 2):\n",
1693
+ " result[values[i]] = values[i+1]\n",
1694
+ " return result\n",
1695
+ "\n",
1696
+ "def getColMinMax(table, col):\n",
1697
+ " \"\"\"\n",
1698
+ " return min, max values of a column\n",
1699
+ " Parameters\n",
1700
+ " table : tabular data\n",
1701
+ " col : column index\n",
1702
+ " \"\"\"\n",
1703
+ " vmin = None\n",
1704
+ " vmax = None\n",
1705
+ " for rec in table:\n",
1706
+ " value = rec[col]\n",
1707
+ " if vmin is None:\n",
1708
+ " vmin = value\n",
1709
+ " vmax = value\n",
1710
+ " else:\n",
1711
+ " if value < vmin:\n",
1712
+ " vmin = value\n",
1713
+ " elif value > vmax:\n",
1714
+ " vmax = value\n",
1715
+ " return (vmin, vmax, vmax - vmin)\n",
1716
+ "\n",
1717
+ "def createLogger(name, logFilePath, logLevName):\n",
1718
+ " \"\"\"\n",
1719
+ " creates logger\n",
1720
+ " Parameters\n",
1721
+ " name : logger name\n",
1722
+ " logFilePath : log file path\n",
1723
+ " logLevName : log level\n",
1724
+ " \"\"\"\n",
1725
+ " logger = logging.getLogger(name)\n",
1726
+ " fHandler = logging.handlers.RotatingFileHandler(logFilePath, maxBytes=1048576, backupCount=4)\n",
1727
+ " logLev = logLevName.lower()\n",
1728
+ " if logLev == \"debug\":\n",
1729
+ " logLevel = logging.DEBUG\n",
1730
+ " elif logLev == \"info\":\n",
1731
+ " logLevel = logging.INFO\n",
1732
+ " elif logLev == \"warning\":\n",
1733
+ " logLevel = logging.WARNING\n",
1734
+ " elif logLev == \"error\":\n",
1735
+ " logLevel = logging.ERROR\n",
1736
+ " elif logLev == \"critical\":\n",
1737
+ " logLevel = logging.CRITICAL\n",
1738
+ " else:\n",
1739
+ " raise ValueError(\"invalid log level name \" + logLevelName)\n",
1740
+ " fHandler.setLevel(logLevel)\n",
1741
+ " fFormat = logging.Formatter(\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\")\n",
1742
+ " fHandler.setFormatter(fFormat)\n",
1743
+ " logger.addHandler(fHandler)\n",
1744
+ " logger.setLevel(logLevel)\n",
1745
+ " return logger\n",
1746
+ "\n",
1747
+ "@contextmanager\n",
1748
+ "def suppressStdout():\n",
1749
+ " \"\"\"\n",
1750
+ " suppress stdout\n",
1751
+ " Parameters\n",
1752
+ " \"\"\"\n",
1753
+ " with open(os.devnull, \"w\") as devnull:\n",
1754
+ " oldStdout = sys.stdout\n",
1755
+ " sys.stdout = devnull\n",
1756
+ " try: \n",
1757
+ " yield\n",
1758
+ " finally:\n",
1759
+ " sys.stdout = oldStdout\n",
1760
+ "\n",
1761
+ "def exitWithMsg(msg):\n",
1762
+ " \"\"\"\n",
1763
+ " print message and exit\n",
1764
+ " Parameters\n",
1765
+ " msg : message\n",
1766
+ " \"\"\"\n",
1767
+ " print(msg + \" -- quitting\")\n",
1768
+ " sys.exit(0)\n",
1769
+ "\n",
1770
+ "def drawLine(data, yscale=None):\n",
1771
+ " \"\"\"\n",
1772
+ " line plot\n",
1773
+ " Parameters\n",
1774
+ " data : list data\n",
1775
+ " yscale : y axis scale\n",
1776
+ " \"\"\"\n",
1777
+ " plt.plot(data)\n",
1778
+ " if yscale:\n",
1779
+ " step = int(yscale / 10)\n",
1780
+ " step = int(step / 10) * 10\n",
1781
+ " plt.yticks(range(0, yscale, step))\n",
1782
+ " plt.show()\n",
1783
+ "\n",
1784
+ "def drawPlot(x, y, xlabel, ylabel):\n",
1785
+ " \"\"\"\n",
1786
+ " line plot\n",
1787
+ " Parameters\n",
1788
+ " x : x values\n",
1789
+ " y : y values\n",
1790
+ " xlabel : x axis label\n",
1791
+ " ylabel : y axis label\n",
1792
+ " \"\"\"\n",
1793
+ " plt.plot(x,y)\n",
1794
+ " plt.xlabel(xlabel)\n",
1795
+ " plt.ylabel(ylabel)\n",
1796
+ " plt.show()\n",
1797
+ "\n",
1798
+ "def drawPairPlot(x, y1, y2, xlabel,ylabel, y1label, y2label):\n",
1799
+ " \"\"\"\n",
1800
+ " line plot of 2 lines\n",
1801
+ " Parameters\n",
1802
+ " x : x values\n",
1803
+ " y1 : first y values\n",
1804
+ " y2 : second y values\n",
1805
+ " xlabel : x labbel\n",
1806
+ " ylabel : y label\n",
1807
+ " y1label : first plot label\n",
1808
+ " y2label : second plot label\n",
1809
+ " \"\"\"\n",
1810
+ " plt.plot(x, y1, label = y1label)\n",
1811
+ " plt.plot(x, y2, label = y2label)\n",
1812
+ " plt.xlabel(xlabel)\n",
1813
+ " plt.ylabel(ylabel)\n",
1814
+ " plt.legend()\n",
1815
+ " plt.show()\n",
1816
+ "\n",
1817
+ "def drawHist(ldata, myTitle, myXlabel, myYlabel, nbins=10):\n",
1818
+ " \"\"\"\n",
1819
+ " draw histogram\n",
1820
+ " Parameters\n",
1821
+ " ldata : list data\n",
1822
+ " myTitle : title\n",
1823
+ " myXlabel : x label\n",
1824
+ " myYlabel : y label \n",
1825
+ " nbins : num of bins\n",
1826
+ " \"\"\"\n",
1827
+ " plt.hist(ldata, bins=nbins, density=True)\n",
1828
+ " plt.title(myTitle)\n",
1829
+ " plt.xlabel(myXlabel)\n",
1830
+ " plt.ylabel(myYlabel)\n",
1831
+ " plt.show()\t\n",
1832
+ "\n",
1833
+ "def saveObject(obj, filePath):\n",
1834
+ " \"\"\"\n",
1835
+ " saves an object\n",
1836
+ " Parameters\n",
1837
+ " obj : object\n",
1838
+ " filePath : file path for saved object\n",
1839
+ " \"\"\"\n",
1840
+ " with open(filePath, \"wb\") as outfile:\n",
1841
+ " pickle.dump(obj,outfile)\n",
1842
+ "\n",
1843
+ "def restoreObject(filePath):\n",
1844
+ " \"\"\"\n",
1845
+ " restores an object\n",
1846
+ " Parameters\n",
1847
+ " filePath : file path to restore object from\n",
1848
+ " \"\"\"\n",
1849
+ " with open(filePath, \"rb\") as infile:\n",
1850
+ " obj = pickle.load(infile)\n",
1851
+ " return obj\n",
1852
+ "\n",
1853
+ "def isNumeric(data):\n",
1854
+ " \"\"\"\n",
1855
+ " true if all elements int or float\n",
1856
+ " Parameters\n",
1857
+ " data : numeric data list\n",
1858
+ " \"\"\"\n",
1859
+ " if type(data) == list or type(data) == np.ndarray:\n",
1860
+ " col = pd.Series(data)\n",
1861
+ " else:\n",
1862
+ " col = data\n",
1863
+ " return col.dtype == np.int32 or col.dtype == np.int64 or col.dtype == np.float32 or col.dtype == np.float64\n",
1864
+ "\n",
1865
+ "def isInteger(data):\n",
1866
+ " \"\"\"\n",
1867
+ " true if all elements int \n",
1868
+ " Parameters\n",
1869
+ " data : numeric data list\n",
1870
+ " \"\"\"\n",
1871
+ " if type(data) == list or type(data) == np.ndarray:\n",
1872
+ " col = pd.Series(data)\n",
1873
+ " else:\n",
1874
+ " col = data\n",
1875
+ " return col.dtype == np.int32 or col.dtype == np.int64\n",
1876
+ "\n",
1877
+ "def isFloat(data):\n",
1878
+ " \"\"\"\n",
1879
+ " true if all elements float\n",
1880
+ " Parameters\n",
1881
+ " data : numeric data list\n",
1882
+ " \"\"\"\n",
1883
+ " if type(data) == list or type(data) == np.ndarray:\n",
1884
+ " col = pd.Series(data)\n",
1885
+ " else:\n",
1886
+ " col = data\n",
1887
+ " return col.dtype == np.float32 or col.dtype == np.float64\n",
1888
+ "\n",
1889
+ "def isBinary(data):\n",
1890
+ " \"\"\"\n",
1891
+ " true if all elements either 0 or 1\n",
1892
+ " Parameters\n",
1893
+ " data : binary data\n",
1894
+ " \"\"\"\n",
1895
+ " re = next((d for d in data if not (type(d) == int and (d == 0 or d == 1))), None)\n",
1896
+ " return (re is None)\n",
1897
+ "\n",
1898
+ "def isCategorical(data):\n",
1899
+ " \"\"\"\n",
1900
+ " true if all elements int or string\n",
1901
+ " Parameters\n",
1902
+ " data : data value\n",
1903
+ " \"\"\"\n",
1904
+ " re = next((d for d in data if not (type(d) == int or type(d) == str)), None)\n",
1905
+ " return (re is None)\n",
1906
+ "\n",
1907
+ "def assertEqual(value, veq, msg):\n",
1908
+ " \"\"\"\n",
1909
+ " assert equal to\n",
1910
+ " Parameters\n",
1911
+ " value : value\n",
1912
+ " veq : value to be equated with\n",
1913
+ " msg : error msg\n",
1914
+ " \"\"\"\n",
1915
+ " assert value == veq , msg\n",
1916
+ "\n",
1917
+ "def assertGreater(value, vmin, msg):\n",
1918
+ " \"\"\"\n",
1919
+ " assert greater than \n",
1920
+ " Parameters\n",
1921
+ " value : value\n",
1922
+ " vmin : minimum value\n",
1923
+ " msg : error msg\n",
1924
+ " \"\"\"\n",
1925
+ " assert value > vmin , msg\n",
1926
+ "\n",
1927
+ "def assertGreaterEqual(value, vmin, msg):\n",
1928
+ " \"\"\"\n",
1929
+ " assert greater than \n",
1930
+ " Parameters\n",
1931
+ " value : value\n",
1932
+ " vmin : minimum value\n",
1933
+ " msg : error msg\n",
1934
+ " \"\"\"\n",
1935
+ " assert value >= vmin , msg\n",
1936
+ "\n",
1937
+ "def assertLesser(value, vmax, msg):\n",
1938
+ " \"\"\"\n",
1939
+ " assert less than\n",
1940
+ " Parameters\n",
1941
+ " value : value\n",
1942
+ " vmax : maximum value\n",
1943
+ " msg : error msg\n",
1944
+ " \"\"\"\n",
1945
+ " assert value < vmax , msg\n",
1946
+ "\n",
1947
+ "def assertLesserEqual(value, vmax, msg):\n",
1948
+ " \"\"\"\n",
1949
+ " assert less than\n",
1950
+ " Parameters\n",
1951
+ " value : value\n",
1952
+ " vmax : maximum value\n",
1953
+ " msg : error msg\n",
1954
+ " \"\"\"\n",
1955
+ " assert value <= vmax , msg\n",
1956
+ "\n",
1957
+ "def assertWithinRange(value, vmin, vmax, msg):\n",
1958
+ " \"\"\"\n",
1959
+ " assert within range\n",
1960
+ " Parameters\n",
1961
+ " value : value\n",
1962
+ " vmin : minimum value\n",
1963
+ " vmax : maximum value\n",
1964
+ " msg : error msg\n",
1965
+ " \"\"\"\n",
1966
+ " assert value >= vmin and value <= vmax, msg\n",
1967
+ "\n",
1968
+ "def assertInList(value, values, msg):\n",
1969
+ " \"\"\"\n",
1970
+ " assert contains in a list\n",
1971
+ " Parameters\n",
1972
+ " value ; balue to check for inclusion\n",
1973
+ " values : list data\n",
1974
+ " msg : error msg\n",
1975
+ " \"\"\"\n",
1976
+ " assert value in values, msg\n",
1977
+ "\n",
1978
+ "def maxListDist(l1, l2):\n",
1979
+ " \"\"\"\n",
1980
+ " maximum list element difference between 2 lists\n",
1981
+ " Parameters\n",
1982
+ " l1 : first list data\n",
1983
+ " l2 : second list data\n",
1984
+ " \"\"\"\n",
1985
+ " dist = max(list(map(lambda v : abs(v[0] - v[1]), zip(l1, l2))))\t\n",
1986
+ " return dist\n",
1987
+ "\n",
1988
+ "def fileLineCount(fPath):\n",
1989
+ " \"\"\" \n",
1990
+ " number of lines ina file \n",
1991
+ " Parameters\n",
1992
+ " fPath : file path\n",
1993
+ " \"\"\"\n",
1994
+ " with open(fPath) as f:\n",
1995
+ " for i, li in enumerate(f):\n",
1996
+ " pass\n",
1997
+ " return (i + 1)\n",
1998
+ "\n",
1999
+ "def getAlphaNumCharCount(sdata):\n",
2000
+ " \"\"\" \n",
2001
+ " number of alphabetic and numeric charcters in a string \n",
2002
+ " Parameters\n",
2003
+ " sdata : string data\n",
2004
+ " \"\"\"\n",
2005
+ " acount = 0\n",
2006
+ " ncount = 0\n",
2007
+ " scount = 0\n",
2008
+ " ocount = 0\n",
2009
+ " assertEqual(type(sdata), str, \"input must be string\")\n",
2010
+ " for c in sdata:\n",
2011
+ " if c.isnumeric():\n",
2012
+ " ncount += 1\n",
2013
+ " elif c.isalpha():\n",
2014
+ " acount += 1\n",
2015
+ " elif c.isspace():\n",
2016
+ " scount += 1\n",
2017
+ " else:\n",
2018
+ " ocount += 1\n",
2019
+ " r = (acount, ncount, ocount)\n",
2020
+ " return r\n",
2021
+ "\n",
2022
+ "class StepFunction:\n",
2023
+ " \"\"\"\n",
2024
+ " step function\n",
2025
+ " Parameters\n",
2026
+ " \"\"\"\n",
2027
+ " def __init__(self, *values):\n",
2028
+ " \"\"\"\n",
2029
+ " initilizer\n",
2030
+ "\n",
2031
+ " Parameters\n",
2032
+ " values : list of tuples, wich each tuple containing 2 x values and corresponding y value\n",
2033
+ " \"\"\"\n",
2034
+ " self.points = values\n",
2035
+ "\n",
2036
+ " def find(self, x):\n",
2037
+ " \"\"\"\n",
2038
+ " finds step function value\n",
2039
+ "\n",
2040
+ " Parameters\n",
2041
+ " x : x value\n",
2042
+ " \"\"\"\n",
2043
+ " found = False\n",
2044
+ " y = 0\n",
2045
+ " for p in self.points:\n",
2046
+ " if (x >= p[0] and x < p[1]):\n",
2047
+ " y = p[2]\n",
2048
+ " found = True\n",
2049
+ " break\n",
2050
+ "\n",
2051
+ " if not found:\n",
2052
+ " l = len(self.points)\n",
2053
+ " if (x < self.points[0][0]):\n",
2054
+ " y = self.points[0][2]\n",
2055
+ " elif (x > self.points[l-1][1]):\n",
2056
+ " y = self.points[l-1][2]\n",
2057
+ " return y\n",
2058
+ "\n",
2059
+ "\n",
2060
+ "class DummyVarGenerator:\n",
2061
+ " \"\"\"\n",
2062
+ " dummy variable generator for categorical variable\n",
2063
+ " \"\"\"\n",
2064
+ " def __init__(self, rowSize, catValues, trueVal, falseVal, delim=None):\n",
2065
+ " \"\"\"\n",
2066
+ " initilizer\n",
2067
+ "\n",
2068
+ " Parameters\n",
2069
+ " rowSize : row size\n",
2070
+ " catValues : dictionary with field index as key and list of categorical values as value\n",
2071
+ " trueVal : true value, typically \"1\"\n",
2072
+ " falseval : false value , typically \"0\"\n",
2073
+ " delim : field delemeter\n",
2074
+ " \"\"\"\n",
2075
+ " self.rowSize = rowSize\n",
2076
+ " self.catValues = catValues\n",
2077
+ " numCatVar = len(catValues)\n",
2078
+ " colCount = 0\n",
2079
+ " for v in self.catValues.values():\n",
2080
+ " colCount += len(v)\n",
2081
+ " self.newRowSize = rowSize - numCatVar + colCount\n",
2082
+ " #print (\"new row size {}\".format(self.newRowSize))\n",
2083
+ " self.trueVal = trueVal\n",
2084
+ " self.falseVal = falseVal\n",
2085
+ " self.delim = delim\n",
2086
+ "\n",
2087
+ " def processRow(self, row):\n",
2088
+ " \"\"\"\n",
2089
+ " encodes categorical variables, returning as delemeter separate dstring or list\n",
2090
+ "\n",
2091
+ " Parameters\n",
2092
+ " row : row either delemeter separated string or list\n",
2093
+ " \"\"\"\n",
2094
+ " if self.delim is not None:\n",
2095
+ " rowArr = row.split(self.delim)\n",
2096
+ " msg = \"row does not have expected number of columns found \" + str(len(rowArr)) + \" expected \" + str(self.rowSize)\n",
2097
+ " assert len(rowArr) == self.rowSize, msg\n",
2098
+ " else:\n",
2099
+ " rowArr = row\n",
2100
+ "\n",
2101
+ " newRowArr = []\n",
2102
+ " for i in range(len(rowArr)):\n",
2103
+ " curVal = rowArr[i]\n",
2104
+ " if (i in self.catValues):\n",
2105
+ " values = self.catValues[i]\n",
2106
+ " for val in values:\n",
2107
+ " if val == curVal:\n",
2108
+ " newVal = self.trueVal\n",
2109
+ " else:\n",
2110
+ " newVal = self.falseVal\n",
2111
+ " newRowArr.append(newVal)\n",
2112
+ " else:\n",
2113
+ " newRowArr.append(curVal)\n",
2114
+ " assert len(newRowArr) == self.newRowSize, \"invalid new row size \" + str(len(newRowArr)) + \" expected \" + str(self.newRowSize)\n",
2115
+ " encRow = self.delim.join(newRowArr) if self.delim is not None else newRowArr\n",
2116
+ " return encRow\n"
2117
+ ]
2118
+ }
2119
+ ],
2120
+ "metadata": {
2121
+ "kernelspec": {
2122
+ "display_name": "Python 3 (ipykernel)",
2123
+ "language": "python",
2124
+ "name": "python3"
2125
+ },
2126
+ "language_info": {
2127
+ "codemirror_mode": {
2128
+ "name": "ipython",
2129
+ "version": 3
2130
+ },
2131
+ "file_extension": ".py",
2132
+ "mimetype": "text/x-python",
2133
+ "name": "python",
2134
+ "nbconvert_exporter": "python",
2135
+ "pygments_lexer": "ipython3",
2136
+ "version": "3.9.12"
2137
+ }
2138
+ },
2139
+ "nbformat": 4,
2140
+ "nbformat_minor": 5
2141
+ }
lib/mlutil.ipynb ADDED
@@ -0,0 +1,1297 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "2d05ce02",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import sys\n",
12
+ "import numpy as np\n",
13
+ "from sklearn import preprocessing\n",
14
+ "from sklearn import metrics\n",
15
+ "from sklearn.datasets import make_blobs\n",
16
+ "from sklearn.datasets import make_classification\n",
17
+ "import random\n",
18
+ "from math import *\n",
19
+ "from decimal import Decimal\n",
20
+ "import statistics\n",
21
+ "import jprops\n",
22
+ "from Levenshtein import distance as ld\n",
23
+ "from util import *\n",
24
+ "from sampler import *\n",
25
+ "\n",
26
+ "class Configuration:\n",
27
+ " \"\"\"\n",
28
+ " Configuration management. Supports default value, mandatory value and typed value.\n",
29
+ " \"\"\"\n",
30
+ " def __init__(self, configFile, defValues, verbose=False):\n",
31
+ " \"\"\"\n",
32
+ " initializer\n",
33
+ "\n",
34
+ " Parameters\n",
35
+ " configFile : config file path\n",
36
+ " defValues : dictionary of default values\n",
37
+ " verbose : verbosity flag\n",
38
+ " \"\"\"\n",
39
+ " configs = {}\n",
40
+ " with open(configFile) as fp:\n",
41
+ " for key, value in jprops.iter_properties(fp):\n",
42
+ " configs[key] = value\n",
43
+ " self.configs = configs\n",
44
+ " self.defValues = defValues\n",
45
+ " self.verbose = verbose\n",
46
+ "\n",
47
+ " def override(self, configFile):\n",
48
+ " \"\"\"\n",
49
+ " over ride configuration from file\n",
50
+ "\n",
51
+ " Parameters\n",
52
+ " configFile : override config file path\n",
53
+ " \"\"\"\n",
54
+ " with open(configFile) as fp:\n",
55
+ " for key, value in jprops.iter_properties(fp):\n",
56
+ " self.configs[key] = value\n",
57
+ "\n",
58
+ "\n",
59
+ " def setParam(self, name, value):\n",
60
+ " \"\"\"\n",
61
+ " override individual configuration\n",
62
+ " Parameters\n",
63
+ " name : config param name\n",
64
+ " value : config param value\n",
65
+ " \"\"\"\n",
66
+ " self.configs[name] = value\n",
67
+ "\n",
68
+ "\n",
69
+ " def getStringConfig(self, name):\n",
70
+ " \"\"\"\n",
71
+ " get string param\n",
72
+ " Parameters\n",
73
+ " name : config param name\n",
74
+ " \"\"\"\n",
75
+ " if self.isNone(name):\n",
76
+ " val = (None, False)\n",
77
+ " elif self.isDefault(name):\n",
78
+ " val = (self.handleDefault(name), True)\n",
79
+ " else:\n",
80
+ " val = (self.configs[name], False)\n",
81
+ " if self.verbose:\n",
82
+ " print( \"{} {} {}\".format(name, self.configs[name], val[0]))\n",
83
+ " return val\n",
84
+ "\n",
85
+ "\n",
86
+ " def getIntConfig(self, name):\n",
87
+ " \"\"\"\n",
88
+ " get int param\n",
89
+ " Parameters\n",
90
+ " name : config param name\n",
91
+ " \"\"\"\n",
92
+ " #print \"%s %s\" %(name,self.configs[name])\n",
93
+ " if self.isNone(name):\n",
94
+ " val = (None, False)\n",
95
+ " elif self.isDefault(name):\n",
96
+ " val = (self.handleDefault(name), True)\n",
97
+ " else:\n",
98
+ " val = (int(self.configs[name]), False)\n",
99
+ " if self.verbose:\n",
100
+ " print( \"{} {} {}\".format(name, self.configs[name], val[0]))\n",
101
+ " return val\n",
102
+ "\n",
103
+ "\n",
104
+ " def getFloatConfig(self, name):\n",
105
+ " \"\"\"\n",
106
+ " get float param\n",
107
+ " Parameters\n",
108
+ " name : config param name\n",
109
+ " \"\"\"\n",
110
+ " #print \"%s %s\" %(name,self.configs[name])\n",
111
+ " if self.isNone(name):\n",
112
+ " val = (None, False)\n",
113
+ " elif self.isDefault(name):\n",
114
+ " val = (self.handleDefault(name), True)\n",
115
+ " else:\n",
116
+ " val = (float(self.configs[name]), False)\n",
117
+ " if self.verbose:\n",
118
+ " print( \"{} {} {:06.3f}\".format(name, self.configs[name], val[0]))\n",
119
+ " return val\n",
120
+ "\n",
121
+ "\n",
122
+ " def getBooleanConfig(self, name):\n",
123
+ " \"\"\"\n",
124
+ " #get boolean param\n",
125
+ " Parameters\n",
126
+ " name : config param name\n",
127
+ " \"\"\"\n",
128
+ " if self.isNone(name):\n",
129
+ " val = (None, False)\n",
130
+ " elif self.isDefault(name):\n",
131
+ " val = (self.handleDefault(name), True)\n",
132
+ " else:\n",
133
+ " bVal = self.configs[name].lower() == \"true\"\n",
134
+ " val = (bVal, False)\n",
135
+ " if self.verbose:\n",
136
+ " print( \"{} {} {}\".format(name, self.configs[name], val[0]))\n",
137
+ " return val\n",
138
+ "\n",
139
+ "\n",
140
+ " def getIntListConfig(self, name, delim=\",\"):\n",
141
+ " \"\"\"\n",
142
+ " get int list param\n",
143
+ " Parameters\n",
144
+ " name : config param name\n",
145
+ " delim : delemeter\n",
146
+ " \"\"\"\n",
147
+ " if self.isNone(name):\n",
148
+ " val = (None, False)\n",
149
+ " elif self.isDefault(name):\n",
150
+ " val = (self.handleDefault(name), True)\n",
151
+ " else:\n",
152
+ " delSepStr = self.getStringConfig(name)\n",
153
+ "\n",
154
+ " #specified as range\n",
155
+ " intList = strListOrRangeToIntArray(delSepStr[0])\n",
156
+ " val =(intList, delSepStr[1])\n",
157
+ " return val\n",
158
+ "\n",
159
+ " def getFloatListConfig(self, name, delim=\",\"):\n",
160
+ " \"\"\"\n",
161
+ " get float list param\n",
162
+ " Parameters\n",
163
+ " name : config param name\n",
164
+ " delim : delemeter\n",
165
+ " \"\"\"\n",
166
+ " delSepStr = self.getStringConfig(name)\n",
167
+ " if self.isNone(name):\n",
168
+ " val = (None, False)\n",
169
+ " elif self.isDefault(name):\n",
170
+ " val = (self.handleDefault(name), True)\n",
171
+ " else:\n",
172
+ " flList = strToFloatArray(delSepStr[0], delim)\n",
173
+ " val =(flList, delSepStr[1])\n",
174
+ " return val\n",
175
+ "\n",
176
+ "\n",
177
+ " def getStringListConfig(self, name, delim=\",\"):\n",
178
+ " \"\"\"\n",
179
+ " get string list param\n",
180
+ " Parameters\n",
181
+ " name : config param name\n",
182
+ " delim : delemeter\n",
183
+ " \"\"\"\n",
184
+ " delSepStr = self.getStringConfig(name)\n",
185
+ " if self.isNone(name):\n",
186
+ " val = (None, False)\n",
187
+ " elif self.isDefault(name):\n",
188
+ " val = (self.handleDefault(name), True)\n",
189
+ " else:\n",
190
+ " strList = delSepStr[0].split(delim)\n",
191
+ " val = (strList, delSepStr[1])\n",
192
+ " return val\n",
193
+ "\n",
194
+ " def handleDefault(self, name):\n",
195
+ " \"\"\"\n",
196
+ " handles default\n",
197
+ " Parameters\n",
198
+ " name : config param name\n",
199
+ " \"\"\"\n",
200
+ " dVal = self.defValues[name]\n",
201
+ " if (dVal[1] is None):\n",
202
+ " val = dVal[0]\n",
203
+ " else:\n",
204
+ " raise ValueError(dVal[1])\n",
205
+ " return val\n",
206
+ "\n",
207
+ "\n",
208
+ " def isNone(self, name):\n",
209
+ " \"\"\"\n",
210
+ " true is value is None\t\n",
211
+ " Parameters\n",
212
+ " name : config param name\n",
213
+ " \"\"\"\n",
214
+ " return self.configs[name].lower() == \"none\"\n",
215
+ "\n",
216
+ "\n",
217
+ " def isDefault(self, name):\n",
218
+ " \"\"\"\n",
219
+ " true if the value is default\t\n",
220
+ " Parameters\n",
221
+ " name : config param name\n",
222
+ " \"\"\"\n",
223
+ " de = self.configs[name] == \"_\"\n",
224
+ " #print de\n",
225
+ " return de\n",
226
+ "\n",
227
+ "\n",
228
+ " def eitherOrStringConfig(self, firstName, secondName):\n",
229
+ " \"\"\"\n",
230
+ " returns one of two string parameters\t\n",
231
+ " Parameters\n",
232
+ " firstName : first parameter name\n",
233
+ " secondName : second parameter name\t\n",
234
+ " \"\"\"\n",
235
+ " if not self.isNone(firstName):\n",
236
+ " first = self.getStringConfig(firstName)[0]\n",
237
+ " second = None\n",
238
+ " if not self.isNone(secondName):\n",
239
+ " raise ValueError(\"only one of the two parameters should be set and not both \" + firstName + \" \" + secondName)\n",
240
+ " else:\n",
241
+ " if not self.isNone(secondName):\n",
242
+ " second = self.getStringConfig(secondtName)[0]\n",
243
+ " first = None\n",
244
+ " else:\n",
245
+ " raise ValueError(\"at least one of the two parameters should be set \" + firstName + \" \" + secondName)\n",
246
+ " return (first, second)\n",
247
+ "\n",
248
+ "\n",
249
+ " def eitherOrIntConfig(self, firstName, secondName):\n",
250
+ " \"\"\"\n",
251
+ " returns one of two int parameters\t\n",
252
+ " Parameters\n",
253
+ " firstName : first parameter name\n",
254
+ " secondName : second parameter name\t\n",
255
+ " \"\"\"\n",
256
+ " if not self.isNone(firstName):\n",
257
+ " first = self.getIntConfig(firstName)[0]\n",
258
+ " second = None\n",
259
+ " if not self.isNone(secondName):\n",
260
+ " raise ValueError(\"only one of the two parameters should be set and not both \" + firstName + \" \" + secondName)\n",
261
+ " else:\n",
262
+ " if not self.isNone(secondName):\n",
263
+ " second = self.getIntConfig(secondsName)[0]\n",
264
+ " first = None\n",
265
+ " else:\n",
266
+ " raise ValueError(\"at least one of the two parameters should be set \" + firstName + \" \" + secondName)\n",
267
+ " return (first, second)\n",
268
+ "\n",
269
+ "\n",
270
+ "class CatLabelGenerator:\n",
271
+ " \"\"\"\n",
272
+ " label generator for categorical variables\n",
273
+ " \"\"\"\n",
274
+ " def __init__(self, catValues, delim):\n",
275
+ " \"\"\"\n",
276
+ " initilizers\n",
277
+ "\n",
278
+ " Parameters\n",
279
+ " catValues : dictionary of categorical values\n",
280
+ " delim : delemeter\n",
281
+ " \"\"\"\n",
282
+ " self.encoders = {}\n",
283
+ " self.catValues = catValues\n",
284
+ " self.delim = delim\n",
285
+ " for k in self.catValues.keys():\t\n",
286
+ " le = preprocessing.LabelEncoder()\t\n",
287
+ " le.fit(self.catValues[k])\n",
288
+ " self.encoders[k] = le\n",
289
+ "\n",
290
+ " def processRow(self, row):\t\n",
291
+ " \"\"\"\n",
292
+ " encode row categorical values\n",
293
+ "\n",
294
+ " Parameters:\n",
295
+ " row : data row\n",
296
+ " \"\"\"\n",
297
+ " #print row\n",
298
+ " rowArr = row.split(self.delim)\n",
299
+ " for i in range(len(rowArr)):\n",
300
+ " if (i in self.catValues):\n",
301
+ " curVal = rowArr[i]\n",
302
+ " assert curVal in self.catValues[i], \"categorival value invalid\"\n",
303
+ " encVal = self.encoders[i].transform([curVal])\n",
304
+ " rowArr[i] = str(encVal[0])\n",
305
+ " return self.delim.join(rowArr)\t\t\n",
306
+ "\n",
307
+ " def getOrigLabels(self, indx):\n",
308
+ " \"\"\"\n",
309
+ " get original labels\n",
310
+ "\n",
311
+ " Parameters:\n",
312
+ " indx : column index\n",
313
+ " \"\"\"\n",
314
+ " return self.encoders[indx].classes_\t\n",
315
+ "\n",
316
+ "\n",
317
+ "class SupvLearningDataGenerator:\n",
318
+ " \"\"\"\n",
319
+ " data generator for supervised learning\n",
320
+ " \"\"\"\n",
321
+ " def __init__(self, configFile):\n",
322
+ " \"\"\"\n",
323
+ " initilizers\n",
324
+ "\n",
325
+ " Parameters\n",
326
+ " configFile : config file path\n",
327
+ " \"\"\"\n",
328
+ " defValues = dict()\n",
329
+ " defValues[\"common.num.samp\"] = (100, None)\n",
330
+ " defValues[\"common.num.feat\"] = (5, None)\n",
331
+ " defValues[\"common.feat.trans\"] = (None, None)\n",
332
+ " defValues[\"common.feat.types\"] = (None, \"missing feature types\")\n",
333
+ " defValues[\"common.cat.feat.distr\"] = (None, None)\n",
334
+ " defValues[\"common.output.precision\"] = (3, None)\n",
335
+ " defValues[\"common.error\"] = (0.01, None)\n",
336
+ " defValues[\"class.gen.technique\"] = (\"blob\", None)\n",
337
+ " defValues[\"class.num.feat.informative\"] = (2, None)\n",
338
+ " defValues[\"class.num.feat.redundant\"] = (2, None)\n",
339
+ " defValues[\"class.num.feat.repeated\"] = (0, None)\n",
340
+ " defValues[\"class.num.feat.cat\"] = (0, None)\n",
341
+ " defValues[\"class.num.class\"] = (2, None)\n",
342
+ "\n",
343
+ " self.config = Configuration(configFile, defValues)\n",
344
+ "\n",
345
+ " def genClassifierData(self):\n",
346
+ " \"\"\"\n",
347
+ " generates classifier data\n",
348
+ " \"\"\"\n",
349
+ " nsamp = self.config.getIntConfig(\"common.num.samp\")[0]\n",
350
+ " nfeat = self.config.getIntConfig(\"common.num.feat\")[0]\n",
351
+ " nclass = self.config.getIntConfig(\"class.num.class\")[0]\n",
352
+ " #transform with shift and scale\n",
353
+ " ftrans = self.config.getFloatListConfig(\"common.feat.trans\")[0]\n",
354
+ " feTrans = dict()\n",
355
+ " for i in range(0, len(ftrans), 2):\n",
356
+ " tr = (ftrans[i], ftrans[i+1])\n",
357
+ " indx = int(i/2)\n",
358
+ " feTrans[indx] = tr\n",
359
+ "\n",
360
+ " ftypes = self.config.getStringListConfig(\"common.feat.types\")[0]\n",
361
+ "\n",
362
+ " # categorical feature distribution\n",
363
+ " feCatDist = dict()\n",
364
+ " fcatdl = self.config.getStringListConfig(\"common.cat.feat.distr\")[0]\n",
365
+ " for fcatds in fcatdl:\n",
366
+ " fcatd = fcatds.split(\":\")\n",
367
+ " feInd = int(fcatd[0])\n",
368
+ " clVal = int(fcatd[1])\n",
369
+ " key = (feInd, clVal)\t\t#feature index and class value\n",
370
+ " dist = list(map(lambda i : (fcatd[i], float(fcatd[i+1])), range(2, len(fcatd), 2)))\n",
371
+ " feCatDist[key] = CategoricalRejectSampler(*dist)\n",
372
+ "\n",
373
+ " #shift and scale\n",
374
+ " genTechnique = self.config.getStringConfig(\"class.gen.technique\")[0]\n",
375
+ " error = self.config.getFloatConfig(\"common.error\")[0]\n",
376
+ " if genTechnique == \"blob\":\n",
377
+ " features, claz = make_blobs(n_samples=nsamp, centers=nclass, n_features=nfeat)\n",
378
+ " for i in range(nsamp):\t\t\t#shift and scale\n",
379
+ " for j in range(nfeat):\n",
380
+ " tr = feTrans[j]\n",
381
+ " features[i,j] = (features[i,j] + tr[0]) * tr[1]\n",
382
+ " claz = np.array(list(map(lambda c : random.randint(0, nclass-1) if random.random() < error else c, claz)))\n",
383
+ " elif genTechnique == \"classify\":\n",
384
+ " nfeatInfo = self.config.getIntConfig(\"class.num.feat.informative\")[0]\n",
385
+ " nfeatRed = self.config.getIntConfig(\"class.num.feat.redundant\")[0]\n",
386
+ " nfeatRep = self.config.getIntConfig(\"class.num.feat.repeated\")[0]\n",
387
+ " shifts = list(map(lambda i : feTrans[i][0], range(nfeat)))\n",
388
+ " scales = list(map(lambda i : feTrans[i][1], range(nfeat)))\n",
389
+ " features, claz = make_classification(n_samples=nsamp, n_features=nfeat, n_informative=nfeatInfo, n_redundant=nfeatRed, \n",
390
+ " n_repeated=nfeatRep, n_classes=nclass, flip_y=error, shift=shifts, scale=scales)\n",
391
+ " else:\n",
392
+ " raise \"invalid genaration technique\"\n",
393
+ "\n",
394
+ " # add categorical features and format\n",
395
+ " nCatFeat = self.config.getIntConfig(\"class.num.feat.cat\")[0]\n",
396
+ " prec = self.config.getIntConfig(\"common.output.precision\")[0]\n",
397
+ " for f , c in zip(features, claz):\n",
398
+ " nfs = list(map(lambda i : self.numFeToStr(i, f[i], c, ftypes[i], prec), range(nfeat)))\n",
399
+ " if nCatFeat > 0:\n",
400
+ " cfs = list(map(lambda i : self.catFe(i, c, ftypes[i], feCatDist), range(nfeat, nfeat + nCatFeat, 1)))\n",
401
+ " rec = \",\".join(nfs) + \",\" + \",\".join(cfs) + \",\" + str(c)\n",
402
+ " else:\n",
403
+ " rec = \",\".join(nfs) + \",\" + str(c)\n",
404
+ " yield rec\n",
405
+ "\n",
406
+ " def numFeToStr(self, fv, ft, prec):\n",
407
+ " \"\"\"\n",
408
+ " nummeric feature value to string\n",
409
+ "\n",
410
+ " Parameters\n",
411
+ " fv : field value\n",
412
+ " ft : field data type\n",
413
+ " prec : precision\n",
414
+ " \"\"\"\n",
415
+ " if ft == \"float\":\n",
416
+ " s = formatFloat(prec, fv)\n",
417
+ " elif ft ==\"int\":\n",
418
+ " s = str(int(fv))\n",
419
+ " else:\t\t\n",
420
+ " raise \"invalid type expecting float or int\"\n",
421
+ " return s\n",
422
+ "\n",
423
+ " def catFe(self, i, cv, ft, feCatDist):\n",
424
+ " \"\"\"\n",
425
+ " generate categorical feature\n",
426
+ "\n",
427
+ " Parameters\n",
428
+ " i : col index\n",
429
+ " cv : class value\n",
430
+ " ft : field data type\n",
431
+ " feCatDist : cat value distribution\n",
432
+ " \"\"\"\n",
433
+ " if ft == \"cat\":\n",
434
+ " key = (i, cv)\n",
435
+ " s = feCatDist[key].sample()\n",
436
+ " else:\t\t\n",
437
+ " raise \"invalid type expecting categorical\"\n",
438
+ " return s\n",
439
+ "\n",
440
+ "\n",
441
+ "\n",
442
+ "def loadDataFile(file, delim, cols, colIndices):\n",
443
+ " \"\"\"\n",
444
+ " loads delim separated file and extracts columns\n",
445
+ " Parameters\n",
446
+ " file : file path\n",
447
+ " delim : delemeter\n",
448
+ " cols : columns to use from file\n",
449
+ " colIndices ; columns to extract\n",
450
+ " \"\"\"\n",
451
+ " data = np.loadtxt(file, delimiter=delim, usecols=cols)\n",
452
+ " extrData = data[:,colIndices]\n",
453
+ " return (data, extrData)\n",
454
+ "\n",
455
+ "def loadFeatDataFile(file, delim, cols):\n",
456
+ " \"\"\"\n",
457
+ " loads delim separated file and extracts columns\n",
458
+ "\n",
459
+ " Parameters\n",
460
+ " file : file path\n",
461
+ " delim : delemeter\n",
462
+ " cols : columns to use from file\n",
463
+ " \"\"\"\n",
464
+ " data = np.loadtxt(file, delimiter=delim, usecols=cols)\n",
465
+ " return data\n",
466
+ "\n",
467
+ "def extrColumns(arr, columns):\n",
468
+ " \"\"\"\n",
469
+ " extracts columns\n",
470
+ "\n",
471
+ " Parameters\n",
472
+ " arr : 2D array\n",
473
+ " columns : columns\n",
474
+ " \"\"\"\n",
475
+ " return arr[:, columns]\n",
476
+ "\n",
477
+ "def subSample(featData, clsData, subSampleRate, withReplacement):\n",
478
+ " \"\"\"\n",
479
+ " subsample feature and class label data\t\n",
480
+ " Parameters\n",
481
+ " featData : 2D array of feature data\n",
482
+ " clsData : arrray of class labels\n",
483
+ " subSampleRate : fraction to be sampled\n",
484
+ " withReplacement : true if sampling with replacement\n",
485
+ " \"\"\"\n",
486
+ " sampSize = int(featData.shape[0] * subSampleRate)\n",
487
+ " sampledIndx = np.random.choice(featData.shape[0],sampSize, replace=withReplacement)\n",
488
+ " sampFeat = featData[sampledIndx]\n",
489
+ " sampCls = clsData[sampledIndx]\n",
490
+ " return(sampFeat, sampCls)\n",
491
+ "\n",
492
+ "def euclideanDistance(x,y):\n",
493
+ " \"\"\"\n",
494
+ " euclidean distance\n",
495
+ " Parameters\n",
496
+ " x : first vector\n",
497
+ " y : second fvector\n",
498
+ " \"\"\"\n",
499
+ " return sqrt(sum(pow(a-b, 2) for a, b in zip(x, y)))\n",
500
+ "\n",
501
+ "def squareRooted(x):\n",
502
+ " \"\"\"\n",
503
+ " square root of sum square\n",
504
+ " Parameters\n",
505
+ " x : data vector\n",
506
+ " \"\"\"\n",
507
+ " return round(sqrt(sum([a*a for a in x])),3)\n",
508
+ "\n",
509
+ "def cosineSimilarity(x,y):\n",
510
+ " \"\"\"\n",
511
+ " cosine similarity\n",
512
+ "\n",
513
+ " Parameters\n",
514
+ " x : first vector\n",
515
+ " y : second fvector\n",
516
+ " \"\"\"\n",
517
+ " numerator = sum(a*b for a,b in zip(x,y))\n",
518
+ " denominator = squareRooted(x) * squareRooted(y)\n",
519
+ " return round(numerator / float(denominator), 3)\n",
520
+ "\n",
521
+ "def cosineDistance(x,y):\n",
522
+ " \"\"\"\n",
523
+ " cosine distance\n",
524
+ " Parameters\n",
525
+ " x : first vector\n",
526
+ " y : second fvector\n",
527
+ " \"\"\"\n",
528
+ " return 1.0 - cosineSimilarity(x,y)\n",
529
+ "\n",
530
+ "def manhattanDistance(x,y):\n",
531
+ " \"\"\"\n",
532
+ " manhattan distance\n",
533
+ " Parameters\n",
534
+ " x : first vector\n",
535
+ " y : second fvector\n",
536
+ " \"\"\"\n",
537
+ " return sum(abs(a-b) for a,b in zip(x,y))\n",
538
+ "\n",
539
+ "def nthRoot(value, nRoot):\n",
540
+ " \"\"\"\n",
541
+ " nth root\n",
542
+ " Parameters\n",
543
+ " value : data value\n",
544
+ " nRoot : root\n",
545
+ " \"\"\"\n",
546
+ " rootValue = 1/float(nRoot)\n",
547
+ " return round (Decimal(value) ** Decimal(rootValue),3)\n",
548
+ "\n",
549
+ "def minkowskiDistance(x,y,pValue):\n",
550
+ " \"\"\"\n",
551
+ " minkowski distance\n",
552
+ " Parameters\n",
553
+ " x : first vector\n",
554
+ " y : second fvector\n",
555
+ " pValue : power factor\n",
556
+ " \"\"\"\n",
557
+ " return nthRoot(sum(pow(abs(a-b),pValue) for a,b in zip(x, y)), pValue)\n",
558
+ "\n",
559
+ "def jaccardSimilarityX(x,y):\n",
560
+ " \"\"\"\n",
561
+ " jaccard similarity\n",
562
+ " Parameters\n",
563
+ " x : first vector\n",
564
+ " y : second fvector\n",
565
+ " \"\"\"\n",
566
+ " intersectionCardinality = len(set.intersection(*[set(x), set(y)]))\n",
567
+ " unionCardinality = len(set.union(*[set(x), set(y)]))\n",
568
+ " return intersectionCardinality/float(unionCardinality)\n",
569
+ "\n",
570
+ "def jaccardSimilarity(x,y,wx=1.0,wy=1.0):\n",
571
+ " \"\"\"\n",
572
+ " jaccard similarity\n",
573
+ "\n",
574
+ " Parameters\n",
575
+ " x : first vector\n",
576
+ " y : second fvector\n",
577
+ " wx : weight for x\n",
578
+ " wy : weight for y\n",
579
+ " \"\"\"\n",
580
+ " sx = set(x)\n",
581
+ " sy = set(y)\n",
582
+ " sxyInt = sx.intersection(sy)\n",
583
+ " intCardinality = len(sxyInt)\n",
584
+ " sxIntDiff = sx.difference(sxyInt)\n",
585
+ " syIntDiff = sy.difference(sxyInt)\n",
586
+ " unionCardinality = len(sx.union(sy))\n",
587
+ " return intCardinality/float(intCardinality + wx * len(sxIntDiff) + wy * len(syIntDiff))\n",
588
+ "\n",
589
+ "def levenshteinSimilarity(s1, s2):\n",
590
+ " \"\"\"\n",
591
+ " Levenshtein similarity for strings\n",
592
+ "\n",
593
+ " Parameters\n",
594
+ " sx : first string\n",
595
+ " sy : second string\n",
596
+ " \"\"\"\n",
597
+ " assert type(s1) == str and type(s2) == str, \"Levenshtein similarity is for string only\"\n",
598
+ " d = ld(s1,s2)\n",
599
+ " #print(d)\n",
600
+ " l = max(len(s1),len(s2))\n",
601
+ " d = 1.0 - min(d/l, 1.0)\n",
602
+ " return d\t\n",
603
+ "\n",
604
+ "def norm(values, po=2):\n",
605
+ " \"\"\"\n",
606
+ " norm\n",
607
+ " Parameters\n",
608
+ " values : list of values\n",
609
+ " po : power\n",
610
+ " \"\"\"\n",
611
+ " no = sum(list(map(lambda v: pow(v,po), values)))\n",
612
+ " no = pow(no,1.0/po)\n",
613
+ " return list(map(lambda v: v/no, values))\n",
614
+ "\n",
615
+ "def createOneHotVec(size, indx = -1):\n",
616
+ " \"\"\"\n",
617
+ " random one hot vector\n",
618
+ "\n",
619
+ " Parameters\n",
620
+ " size : vector size\n",
621
+ " indx : one hot position\n",
622
+ " \"\"\"\n",
623
+ " vec = [0] * size\n",
624
+ " s = random.randint(0, size - 1) if indx < 0 else indx\n",
625
+ " vec[s] = 1\n",
626
+ " return vec\n",
627
+ "\n",
628
+ "def createAllOneHotVec(size):\n",
629
+ " \"\"\"\n",
630
+ " create all one hot vectors\n",
631
+ "\n",
632
+ " Parameters\n",
633
+ " size : vector size and no of vectors\n",
634
+ " \"\"\"\n",
635
+ " vecs = list()\n",
636
+ " for i in range(size):\n",
637
+ " vec = [0] * size\n",
638
+ " vec[i] = 1\n",
639
+ " vecs.append(vec)\n",
640
+ " return vecs\n",
641
+ "\n",
642
+ "def blockShuffle(data, blockSize):\n",
643
+ " \"\"\"\n",
644
+ " block shuffle \t\n",
645
+ "\n",
646
+ " Parameters\n",
647
+ " data : list data\n",
648
+ " blockSize : block size\n",
649
+ " \"\"\"\n",
650
+ " numBlock = int(len(data) / blockSize)\n",
651
+ " remain = len(data) % blockSize\n",
652
+ " numBlock += (1 if remain > 0 else 0)\n",
653
+ " shuffled = list()\n",
654
+ " for i in range(numBlock):\n",
655
+ " b = random.randint(0, numBlock-1)\n",
656
+ " beg = b * blockSize\n",
657
+ " if (b < numBlock-1):\n",
658
+ " end = beg + blockSize\n",
659
+ " shuffled.extend(data[beg:end])\t\t\n",
660
+ " else:\n",
661
+ " shuffled.extend(data[beg:])\n",
662
+ " return shuffled\t\n",
663
+ "\n",
664
+ "def shuffle(data, numShuffle):\n",
665
+ " \"\"\"\n",
666
+ " shuffle data by randonm swapping\n",
667
+ "\n",
668
+ " Parameters\n",
669
+ " data : list data\n",
670
+ " numShuffle : no of pairwise swaps\n",
671
+ " \"\"\"\n",
672
+ " sz = len(data)\n",
673
+ " if numShuffle is None:\n",
674
+ " numShuffle = int(sz / 2)\n",
675
+ " for i in range(numShuffle):\n",
676
+ " fi = random.randint(0, sz -1)\n",
677
+ " se = random.randint(0, sz -1)\n",
678
+ " tmp = data[fi]\n",
679
+ " data[fi] = data[se]\n",
680
+ " data[se] = tmp\t\n",
681
+ "\n",
682
+ "def randomWalk(size, start, lowStep, highStep):\n",
683
+ " \"\"\"\n",
684
+ " random walk\t\n",
685
+ "\n",
686
+ " Parameters\n",
687
+ " size : list data\n",
688
+ " start : initial position\n",
689
+ " lowStep : step min\n",
690
+ " highStep : step max\n",
691
+ " \"\"\"\n",
692
+ " cur = start\n",
693
+ " for i in range(size):\n",
694
+ " yield cur\n",
695
+ " cur += randomFloat(lowStep, highStep)\n",
696
+ "\n",
697
+ "def binaryEcodeCategorical(values, value):\n",
698
+ " \"\"\"\n",
699
+ " one hot binary encoding\t\n",
700
+ "\n",
701
+ " Parameters\n",
702
+ " values : list of values\n",
703
+ " value : value to be replaced with 1\n",
704
+ " \"\"\"\n",
705
+ " size = len(values)\n",
706
+ " vec = [0] * size\n",
707
+ " for i in range(size):\n",
708
+ " if (values[i] == value):\n",
709
+ " vec[i] = 1\n",
710
+ " return vec\t\t\n",
711
+ "\n",
712
+ "def createLabeledSeq(inputData, tw):\n",
713
+ " \"\"\"\n",
714
+ " Creates feature, label pair from sequence data, where we have tw number of features followed by output\n",
715
+ "\n",
716
+ " Parameters\n",
717
+ " values : list containing feature and label\n",
718
+ " tw : no of features\n",
719
+ " \"\"\"\n",
720
+ " features = list()\n",
721
+ " labels = list()\n",
722
+ " l = len(inputDta)\n",
723
+ " for i in range(l - tw):\n",
724
+ " trainSeq = inputData[i:i+tw]\n",
725
+ " trainLabel = inputData[i+tw]\n",
726
+ " features.append(trainSeq)\n",
727
+ " labels.append(trainLabel)\n",
728
+ " return (features, labels)\n",
729
+ "\n",
730
+ "def createLabeledSeq(filePath, delim, index, tw):\n",
731
+ " \"\"\"\n",
732
+ " Creates feature, label pair from 1D sequence data in file\t\n",
733
+ "\n",
734
+ " Parameters\n",
735
+ " filePath : file path\n",
736
+ " delim : delemeter\n",
737
+ " index : column index\n",
738
+ " tw : no of features\n",
739
+ " \"\"\"\n",
740
+ " seqData = getFileColumnAsFloat(filePath, delim, index)\n",
741
+ " return createLabeledSeq(seqData, tw)\n",
742
+ "\n",
743
+ "def fromMultDimSeqToTabular(data, inpSize, seqLen):\n",
744
+ " \"\"\"\n",
745
+ " Input shape (nrow, inpSize * seqLen) output shape(nrow * seqLen, inpSize)\n",
746
+ "\n",
747
+ " Parameters\n",
748
+ " data : 2D array\n",
749
+ " inpSize : each input size in sequence\n",
750
+ " seqLen : sequence length\n",
751
+ " \"\"\"\t\n",
752
+ " nrow = data.shape[0]\n",
753
+ " assert data.shape[1] == inpSize * seqLen, \"invalid input size or sequence length\"\n",
754
+ " return data.reshape(nrow * seqLen, inpSize)\n",
755
+ "\n",
756
+ "def fromTabularToMultDimSeq(data, inpSize, seqLen):\n",
757
+ " \"\"\"\n",
758
+ " Input shape (nrow * seqLen, inpSize) output shape (nrow, inpSize * seqLen) \n",
759
+ " Parameters\n",
760
+ " data : 2D array\n",
761
+ " inpSize : each input size in sequence\n",
762
+ " seqLen : sequence length\n",
763
+ " \"\"\"\t\n",
764
+ " nrow = int(data.shape[0] / seqLen)\n",
765
+ " assert data.shape[1] == inpSize, \"invalid input size\"\n",
766
+ " return data.reshape(nrow, seqLen * inpSize)\n",
767
+ "\n",
768
+ "def difference(data, interval=1):\n",
769
+ " \"\"\"\n",
770
+ " takes difference in time series data\n",
771
+ " Parameters\n",
772
+ " data :list data\n",
773
+ " interval : interval for difference\n",
774
+ " \"\"\"\n",
775
+ " diff = list()\n",
776
+ " for i in range(interval, len(data)):\n",
777
+ " value = data[i] - data[i - interval]\n",
778
+ " diff.append(value)\n",
779
+ " return diff\n",
780
+ "\n",
781
+ "def normalizeMatrix(data, norm, axis=1):\n",
782
+ " \"\"\"\n",
783
+ " normalized each row of the matrix\n",
784
+ "\n",
785
+ " Parameters\n",
786
+ " data : 2D data\n",
787
+ " nporm : normalization method\n",
788
+ " axis : row or column\n",
789
+ " \"\"\"\n",
790
+ " normalized = preprocessing.normalize(data,norm=norm, axis=axis)\n",
791
+ " return normalized\n",
792
+ "\n",
793
+ "def standardizeMatrix(data, axis=0):\n",
794
+ " \"\"\"\n",
795
+ " standardizes each column of the matrix with mean and std deviation\n",
796
+ " Parameters\n",
797
+ " data : 2D data\n",
798
+ " axis : row or column\n",
799
+ " \"\"\"\n",
800
+ " standardized = preprocessing.scale(data, axis=axis)\n",
801
+ " return standardized\n",
802
+ "\n",
803
+ "def asNumpyArray(data):\n",
804
+ " \"\"\"\n",
805
+ " converts to numpy array\n",
806
+ " Parameters\n",
807
+ " data : array\n",
808
+ " \"\"\"\n",
809
+ " return np.array(data)\n",
810
+ "\n",
811
+ "def perfMetric(metric, yActual, yPred, clabels=None):\n",
812
+ " \"\"\"\n",
813
+ " predictive model accuracy metric\n",
814
+ " Parameters\n",
815
+ " metric : accuracy metric\n",
816
+ " yActual : actual values array\n",
817
+ " yPred : predicted values array\n",
818
+ " clabels : class labels\n",
819
+ " \"\"\"\n",
820
+ " if metric == \"rsquare\":\n",
821
+ " score = metrics.r2_score(yActual, yPred)\n",
822
+ " elif metric == \"mae\":\n",
823
+ " score = metrics.mean_absolute_error(yActual, yPred)\n",
824
+ " elif metric == \"mse\":\n",
825
+ " score = metrics.mean_squared_error(yActual, yPred)\n",
826
+ " elif metric == \"acc\":\n",
827
+ " yPred = np.rint(yPred)\n",
828
+ " score = metrics.accuracy_score(yActual, yPred)\n",
829
+ " elif metric == \"mlAcc\":\n",
830
+ " yPred = np.argmax(yPred, axis=1)\n",
831
+ " score = metrics.accuracy_score(yActual, yPred)\n",
832
+ " elif metric == \"prec\":\n",
833
+ " yPred = np.argmax(yPred, axis=1)\n",
834
+ " score = metrics.precision_score(yActual, yPred)\n",
835
+ " elif metric == \"rec\":\n",
836
+ " yPred = np.argmax(yPred, axis=1)\n",
837
+ " score = metrics.recall_score(yActual, yPred)\n",
838
+ " elif metric == \"fone\":\n",
839
+ " yPred = np.argmax(yPred, axis=1)\n",
840
+ " score = metrics.f1_score(yActual, yPred)\n",
841
+ " elif metric == \"confm\":\n",
842
+ " yPred = np.argmax(yPred, axis=1)\n",
843
+ " score = metrics.confusion_matrix(yActual, yPred)\n",
844
+ " elif metric == \"clarep\":\n",
845
+ " yPred = np.argmax(yPred, axis=1)\n",
846
+ " score = metrics.classification_report(yActual, yPred)\n",
847
+ " elif metric == \"bce\":\n",
848
+ " if clabels is None:\n",
849
+ " clabels = [0, 1]\n",
850
+ " score = metrics.log_loss(yActual, yPred, labels=clabels)\n",
851
+ " elif metric == \"ce\":\n",
852
+ " assert clabels is not None, \"labels must be provided\"\n",
853
+ " score = metrics.log_loss(yActual, yPred, labels=clabels)\n",
854
+ " else:\n",
855
+ " exitWithMsg(\"invalid prediction performance metric \" + metric)\n",
856
+ " return score\n",
857
+ "\n",
858
+ "def scaleData(data, method):\n",
859
+ " \"\"\"\n",
860
+ " scales feature data column wise\n",
861
+ " Parameters\n",
862
+ " data : 2D array\n",
863
+ " method : scaling method\n",
864
+ " \"\"\"\n",
865
+ " if method == \"minmax\":\n",
866
+ " scaler = preprocessing.MinMaxScaler()\n",
867
+ " data = scaler.fit_transform(data)\n",
868
+ " elif method == \"zscale\":\n",
869
+ " data = preprocessing.scale(data)\t\n",
870
+ " else:\n",
871
+ " raise ValueError(\"invalid scaling method\")\t\n",
872
+ " return data\n",
873
+ "\n",
874
+ "def scaleDataWithParams(data, method, scParams):\n",
875
+ " \"\"\"\n",
876
+ " scales feature data column wise\n",
877
+ " Parameters\n",
878
+ " data : 2D array\n",
879
+ " method : scaling method\n",
880
+ " scParams : scaling parameters\n",
881
+ " \"\"\"\n",
882
+ " if method == \"minmax\":\n",
883
+ " data = scaleMinMaxTabData(data, scParams)\n",
884
+ " elif method == \"zscale\":\n",
885
+ " raise ValueError(\"invalid scaling method\")\t\n",
886
+ " else:\n",
887
+ " raise ValueError(\"invalid scaling method\")\t\n",
888
+ " return data\n",
889
+ "\n",
890
+ "\n",
891
+ "def scaleMinMaxTabData(tdata, minMax):\n",
892
+ " \"\"\"\n",
893
+ " for tabular scales feature data column wise using min max values for each field\n",
894
+ " Parameters\n",
895
+ " tdata : 2D array\n",
896
+ " minMax : ni, max and range for each column\n",
897
+ " \"\"\"\n",
898
+ " stdata = list()\n",
899
+ " for r in tdata:\n",
900
+ " srdata = list()\n",
901
+ " for i, c in enumerate(r):\n",
902
+ " sd = (c - minMax[i][0]) / minMax[i][2]\n",
903
+ " srdata.append(sd)\n",
904
+ " stdata.append(srdata)\n",
905
+ " return stdata\n",
906
+ "\n",
907
+ "def scaleMinMax(rdata, minMax):\n",
908
+ " \"\"\"\n",
909
+ " scales feature data column wise using min max values for each field\n",
910
+ " Parameters\n",
911
+ " rdata : data array\n",
912
+ " minMax : ni, max and range for each column\n",
913
+ " \"\"\"\n",
914
+ " srdata = list()\n",
915
+ " for i in range(len(rdata)):\n",
916
+ " d = rdata[i]\n",
917
+ " sd = (d - minMax[i][0]) / minMax[i][2]\n",
918
+ " srdata.append(sd)\n",
919
+ " return srdata\n",
920
+ "\n",
921
+ "def harmonicNum(n):\n",
922
+ " \"\"\"\n",
923
+ " harmonic number\n",
924
+ " Parameters\n",
925
+ " n : number\n",
926
+ " \"\"\"\n",
927
+ " h = 0\n",
928
+ " for i in range(1, n+1, 1):\n",
929
+ " h += 1.0 / i\n",
930
+ " return h\n",
931
+ "\n",
932
+ "def digammaFun(n):\n",
933
+ " \"\"\"\n",
934
+ " figamma function\n",
935
+ " Parameters\n",
936
+ " n : number\n",
937
+ " \"\"\"\n",
938
+ " #Euler Mascheroni constant\n",
939
+ " ec = 0.577216\n",
940
+ " return harmonicNum(n - 1) - ec\n",
941
+ "\n",
942
+ "def getDataPartitions(tdata, types, columns = None):\n",
943
+ " \"\"\"\n",
944
+ " partitions data with the given columns and random split point defined with predicates\n",
945
+ " Parameters\n",
946
+ " tdata : 2D array\n",
947
+ " types : data typers\n",
948
+ " columns : column indexes\n",
949
+ " \"\"\"\n",
950
+ " (dtypes, cvalues) = extractTypesFromString(types)\n",
951
+ " if columns is None:\n",
952
+ " ncol = len(data[0])\n",
953
+ " columns = list(range(ncol))\n",
954
+ " ncol = len(columns)\n",
955
+ " #print(columns)\n",
956
+ "\n",
957
+ " # partition predicates\n",
958
+ " partitions = None\n",
959
+ " for c in columns:\n",
960
+ " #print(c)\n",
961
+ " dtype = dtypes[c]\n",
962
+ " pred = list()\n",
963
+ " if dtype == \"int\" or dtype == \"float\":\n",
964
+ " (vmin, vmax) = getColMinMax(tdata, c)\n",
965
+ " r = vmax - vmin\n",
966
+ " rmin = vmin + .2 * r\n",
967
+ " rmax = vmax - .2 * r\n",
968
+ " sp = randomFloat(rmin, rmax)\n",
969
+ " if dtype == \"int\":\n",
970
+ " sp = int(sp)\n",
971
+ " else:\n",
972
+ " sp = \"{:.3f}\".format(sp)\n",
973
+ " sp = float(sp)\n",
974
+ " pred.append([c, \"LT\", sp])\n",
975
+ " pred.append([c, \"GE\", sp])\n",
976
+ " elif dtype == \"cat\":\n",
977
+ " cv = cvalues[c]\n",
978
+ " card = len(cv) \n",
979
+ " if card < 3:\n",
980
+ " num = 1\n",
981
+ " else:\n",
982
+ " num = randomInt(1, card - 1)\n",
983
+ " sp = selectRandomSubListFromList(cv, num)\n",
984
+ " sp = \" \".join(sp)\n",
985
+ " pred.append([c, \"IN\", sp])\n",
986
+ " pred.append([c, \"NOTIN\", sp])\n",
987
+ "\n",
988
+ " #print(pred)\n",
989
+ " if partitions is None:\n",
990
+ " partitions = pred.copy()\n",
991
+ " #print(\"initial\")\n",
992
+ " #print(partitions)\n",
993
+ " else:\n",
994
+ " #print(\"extension\")\n",
995
+ " tparts = list()\n",
996
+ " for p in partitions:\n",
997
+ " #print(p)\n",
998
+ " l1 = p.copy()\n",
999
+ " l1.extend(pred[0])\n",
1000
+ " l2 = p.copy()\n",
1001
+ " l2.extend(pred[1])\n",
1002
+ " #print(\"after extension\")\n",
1003
+ " #print(l1)\n",
1004
+ " #print(l2)\n",
1005
+ " tparts.append(l1)\n",
1006
+ " tparts.append(l2)\n",
1007
+ " partitions = tparts\t\n",
1008
+ " #print(\"extending\")\n",
1009
+ " #print(partitions)\n",
1010
+ "\n",
1011
+ " #for p in partitions:\n",
1012
+ " #print(p)\t\n",
1013
+ " return partitions\t\t\t\n",
1014
+ "\n",
1015
+ "def genAlmostUniformDistr(size, nswap=50):\n",
1016
+ " \"\"\"\n",
1017
+ " generate probability distribution\n",
1018
+ "\n",
1019
+ " Parameters\n",
1020
+ " size : distr size\n",
1021
+ " nswap : no of mass swaps\n",
1022
+ " \"\"\"\n",
1023
+ " un = 1.0 / size\n",
1024
+ " distr = [un] * size\n",
1025
+ " distr = mutDistr(distr, 0.1 * un, nswap)\n",
1026
+ " return distr\n",
1027
+ "\n",
1028
+ "def mutDistr(distr, shift, nswap=50):\n",
1029
+ " \"\"\"\n",
1030
+ " mutates a probability distribution\n",
1031
+ "\n",
1032
+ " Parameters\n",
1033
+ " distr distribution\n",
1034
+ " shift : amount of shift for swap\n",
1035
+ " nswap : no of mass swaps\n",
1036
+ " \"\"\"\n",
1037
+ " size = len(distr)\n",
1038
+ " for _ in range(nswap):\n",
1039
+ " fi = randomInt(0, size -1)\n",
1040
+ " si = randomInt(0, size -1)\n",
1041
+ " while fi == si:\n",
1042
+ " fi = randomInt(0, size -1)\n",
1043
+ " si = randomInt(0, size -1)\n",
1044
+ "\n",
1045
+ " shift = randomFloat(0, shift)\n",
1046
+ " t = distr[fi]\n",
1047
+ " distr[fi] -= shift\n",
1048
+ " if (distr[fi] < 0):\n",
1049
+ " distr[fi] = 0.0\n",
1050
+ " shift = t\n",
1051
+ " distr[si] += shift\n",
1052
+ " return distr\n",
1053
+ "\n",
1054
+ "def generateBinDistribution(size, ntrue):\n",
1055
+ " \"\"\"\n",
1056
+ " generate binary array with some elements set to 1\n",
1057
+ "\n",
1058
+ " Parameters\n",
1059
+ " size : distr size\n",
1060
+ " ntrue : no of true values\n",
1061
+ " \"\"\"\n",
1062
+ " distr = [0] * size\n",
1063
+ " idxs = selectRandomSubListFromList(list(range(size)), ntrue)\n",
1064
+ " for i in idxs:\n",
1065
+ " distr[i] = 1\n",
1066
+ " return distr\n",
1067
+ "\n",
1068
+ "def mutBinaryDistr(distr, nmut):\n",
1069
+ " \"\"\"\n",
1070
+ " mutate binary distribution\n",
1071
+ "\n",
1072
+ " Parameters\n",
1073
+ " distr : distr\n",
1074
+ " nmut : no of mutations\n",
1075
+ " \"\"\"\n",
1076
+ " idxs = selectRandomSubListFromList(list(range(len(distr))), nmut)\n",
1077
+ " for i in idxs:\n",
1078
+ " distr[i] = distr[i] ^ 1\n",
1079
+ "\n",
1080
+ "\n",
1081
+ "def fileSelFieldSubSeqModifierGen(filePath, column, offset, seqLen, modifier, precision, delim=\",\"):\n",
1082
+ " \"\"\"\n",
1083
+ " file record generator that superimposes given data in the specified segment of a column\n",
1084
+ " Parameters\n",
1085
+ " filePath ; file path\n",
1086
+ " column : column index \n",
1087
+ " offset : offset into column values\n",
1088
+ " seqLen : length of subseq\n",
1089
+ " modifier : data to be superimposed either list or a sampler object\n",
1090
+ " precision : floating point precision\n",
1091
+ " delim : delemeter\n",
1092
+ " \"\"\"\n",
1093
+ " beg = offset\n",
1094
+ " end = beg + seqLen\n",
1095
+ " isList = type(modifier) == list\n",
1096
+ " i = 0\n",
1097
+ " for rec in fileRecGen(filePath, delim):\n",
1098
+ " if i >= beg and i < end:\n",
1099
+ " va = float(rec[column])\n",
1100
+ " if isList:\n",
1101
+ " va += modifier[i - beg] \n",
1102
+ " else:\n",
1103
+ " va += modifier.sample()\n",
1104
+ " rec[column] = formatFloat(precision, va)\n",
1105
+ " yield delim.join(rec)\n",
1106
+ " i += 1\n",
1107
+ "\n",
1108
+ "class ShiftedDataGenerator:\n",
1109
+ " \"\"\"\n",
1110
+ " transforms data for distribution shift\n",
1111
+ " \"\"\"\n",
1112
+ " def __init__(self, types, tdata, addFact, multFact):\n",
1113
+ " \"\"\"\n",
1114
+ " initializer\n",
1115
+ "\n",
1116
+ " Parameters\n",
1117
+ " types data types\n",
1118
+ " tdata : 2D array\n",
1119
+ " addFact ; factor for data shift\n",
1120
+ " multFact ; factor for data scaling\n",
1121
+ " \"\"\"\n",
1122
+ " (self.dtypes, self.cvalues) = extractTypesFromString(types)\n",
1123
+ "\n",
1124
+ " self.limits = dict()\n",
1125
+ " for k,v in self.dtypes.items():\n",
1126
+ " if v == \"int\" or v == \"false\":\n",
1127
+ " (vmax, vmin) = getColMinMax(tdata, k)\n",
1128
+ " self.limits[k] = vmax - vmin\n",
1129
+ " self.addMin = - addFact / 2\n",
1130
+ " self.addMax = addFact / 2\n",
1131
+ " self.multMin = 1.0 - multFact / 2\n",
1132
+ " self.multMax = 1.0 + multFact / 2\n",
1133
+ "\n",
1134
+ "\n",
1135
+ "\n",
1136
+ "\n",
1137
+ " def transform(self, tdata):\n",
1138
+ " \"\"\"\n",
1139
+ " linear transforms data to create distribution shift with random shift and scale\n",
1140
+ " Parameters\n",
1141
+ " types : data types\n",
1142
+ " \"\"\"\n",
1143
+ " transforms = dict()\n",
1144
+ " for k,v in self.dtypes.items():\n",
1145
+ " if v == \"int\" or v == \"false\":\t\t\t\t\n",
1146
+ " shift = randomFloat(self.addMin, self.addMax) * self.limits[k] \n",
1147
+ " scale = randomFloat(self.multMin, self.multMax)\n",
1148
+ " trns = (shift, scale)\n",
1149
+ " transforms[k] = trns\n",
1150
+ " elif v == \"cat\":\n",
1151
+ " transforms[k] = isEventSampled(50)\n",
1152
+ "\n",
1153
+ " ttdata = list()\n",
1154
+ " for rec in tdata:\n",
1155
+ " nrec = rec.copy()\n",
1156
+ " for c in range(len(rec)):\n",
1157
+ " if c in self.dtypes:\n",
1158
+ " dtype = self.dtypes[c]\n",
1159
+ " if dtype == \"int\" or dtype == \"float\":\n",
1160
+ " (shift, scale) = transforms[c]\n",
1161
+ " nval = shift + rec[c] * scale\n",
1162
+ " if dtype == \"int\":\n",
1163
+ " nrec[c] = int(nval)\n",
1164
+ " else:\n",
1165
+ " nrec[c] = nval\n",
1166
+ " elif dtype == \"cat\":\n",
1167
+ " cv = self.cvalues[c]\n",
1168
+ " if transforms[c]:\n",
1169
+ " nval = selectOtherRandomFromList(cv, rec[c])\n",
1170
+ " nrec[c] = nval\n",
1171
+ "\n",
1172
+ " ttdata.append(nrec)\n",
1173
+ "\n",
1174
+ " return ttdata\n",
1175
+ "\n",
1176
+ " def transformSpecified(self, tdata, sshift, scale):\n",
1177
+ " \"\"\"\n",
1178
+ " linear transforms data to create distribution shift shift specified shift and scale\n",
1179
+ " Parameters\n",
1180
+ " types : data types\n",
1181
+ " sshift : shift factor\n",
1182
+ " scale : scale factor\n",
1183
+ " \"\"\"\n",
1184
+ " transforms = dict()\n",
1185
+ " for k,v in self.dtypes.items():\n",
1186
+ " if v == \"int\" or v == \"false\":\t\t\t\t\n",
1187
+ " shift = sshift * self.limits[k] \n",
1188
+ " trns = (shift, scale)\n",
1189
+ " transforms[k] = trns\n",
1190
+ " elif v == \"cat\":\n",
1191
+ " transforms[k] = isEventSampled(50)\n",
1192
+ "\n",
1193
+ " ttdata = self.__scaleShift(tdata, transforms)\n",
1194
+ " return ttdata\n",
1195
+ "\n",
1196
+ " def __scaleShift(self, tdata, transforms):\n",
1197
+ " \"\"\"\n",
1198
+ " shifts and scales tabular data\n",
1199
+ "\n",
1200
+ " Parameters\n",
1201
+ " tdata : 2D array\n",
1202
+ " transforms : transforms to apply\n",
1203
+ " \"\"\"\n",
1204
+ " ttdata = list()\n",
1205
+ " for rec in tdata:\n",
1206
+ " nrec = rec.copy()\n",
1207
+ " for c in range(len(rec)):\n",
1208
+ " if c in self.dtypes:\n",
1209
+ " dtype = self.dtypes[c]\n",
1210
+ " if dtype == \"int\" or dtype == \"float\":\n",
1211
+ " (shift, scale) = transforms[c]\n",
1212
+ " nval = shift + rec[c] * scale\n",
1213
+ " if dtype == \"int\":\n",
1214
+ " nrec[c] = int(nval)\n",
1215
+ " else:\n",
1216
+ " nrec[c] = nval\n",
1217
+ " elif dtype == \"cat\":\n",
1218
+ " cv = self.cvalues[c]\n",
1219
+ " if transforms[c]:\n",
1220
+ " #nval = selectOtherRandomFromList(cv, rec[c])\n",
1221
+ " #nrec[c] = nval\n",
1222
+ " pass\n",
1223
+ "\n",
1224
+ " ttdata.append(nrec)\n",
1225
+ " return ttdata\n",
1226
+ "\n",
1227
+ "class RollingStat(object):\n",
1228
+ " \"\"\"\n",
1229
+ " stats for rolling windowt\n",
1230
+ " \"\"\"\n",
1231
+ " def __init__(self, wsize):\n",
1232
+ " \"\"\"\n",
1233
+ " initializer\n",
1234
+ "\n",
1235
+ " Parameters\n",
1236
+ " wsize : window size\n",
1237
+ " \"\"\"\n",
1238
+ " self.window = list()\n",
1239
+ " self.wsize = wsize\n",
1240
+ " self.mean = None\n",
1241
+ " self.sd = None\n",
1242
+ "\n",
1243
+ " def add(self, value):\n",
1244
+ " \"\"\"\n",
1245
+ " add a value\n",
1246
+ "\n",
1247
+ " Parameters\n",
1248
+ " value : value to add\n",
1249
+ " \"\"\"\n",
1250
+ " self.window.append(value)\n",
1251
+ " if len(self.window) > self.wsize:\n",
1252
+ " self.window = self.window[1:]\n",
1253
+ "\n",
1254
+ " def getStat(self):\n",
1255
+ " \"\"\"\n",
1256
+ " get rolling window mean and std deviation\n",
1257
+ " \"\"\"\n",
1258
+ " assertGreater(len(self.window), 0, \"window is empty\")\n",
1259
+ " if len(self.window) == 1:\n",
1260
+ " self.mean = self.window[0]\n",
1261
+ " self.sd = 0\n",
1262
+ " else:\n",
1263
+ " self.mean = statistics.mean(self.window)\n",
1264
+ " self.sd = statistics.stdev(self.window, xbar=self.mean)\n",
1265
+ " re = (self.mean, self.sd)\n",
1266
+ " return re\n",
1267
+ "\n",
1268
+ " def getSize(self):\n",
1269
+ " \"\"\"\n",
1270
+ " return window size\n",
1271
+ " \"\"\"\n",
1272
+ " return len(self.window)\n"
1273
+ ]
1274
+ }
1275
+ ],
1276
+ "metadata": {
1277
+ "kernelspec": {
1278
+ "display_name": "Python 3 (ipykernel)",
1279
+ "language": "python",
1280
+ "name": "python3"
1281
+ },
1282
+ "language_info": {
1283
+ "codemirror_mode": {
1284
+ "name": "ipython",
1285
+ "version": 3
1286
+ },
1287
+ "file_extension": ".py",
1288
+ "mimetype": "text/x-python",
1289
+ "name": "python",
1290
+ "nbconvert_exporter": "python",
1291
+ "pygments_lexer": "ipython3",
1292
+ "version": "3.9.12"
1293
+ }
1294
+ },
1295
+ "nbformat": 4,
1296
+ "nbformat_minor": 5
1297
+ }
lib/sampler.ipynb ADDED
@@ -0,0 +1,1366 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "c19a2efe",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import sys\n",
11
+ "import random \n",
12
+ "import time\n",
13
+ "import math\n",
14
+ "import random\n",
15
+ "import numpy as np\n",
16
+ "from scipy import stats\n",
17
+ "from random import randint\n",
18
+ "from util import *\n",
19
+ "from stats import Histogram\n",
20
+ "\n",
21
+ "def randomFloat(low, high):\n",
22
+ " \"\"\"\n",
23
+ " sample float within range\n",
24
+ " Parameters\n",
25
+ " low : low valuee\n",
26
+ " high : high valuee\n",
27
+ " \"\"\"\n",
28
+ " return random.random() * (high-low) + low\n",
29
+ "\n",
30
+ "def randomInt(minv, maxv):\n",
31
+ " \"\"\"\n",
32
+ " sample int within range\n",
33
+ " Parameters\n",
34
+ " minv : low valuee\n",
35
+ " maxv : high valuee\n",
36
+ " \"\"\"\n",
37
+ " return randint(minv, maxv)\n",
38
+ "\n",
39
+ "def randIndex(lData):\n",
40
+ " \"\"\"\n",
41
+ " random index of a list\n",
42
+ " Parameters\n",
43
+ " lData : list data\n",
44
+ " \"\"\"\n",
45
+ " return randint(0, len(lData)-1)\n",
46
+ "\n",
47
+ "def randomUniformSampled(low, high):\n",
48
+ " \"\"\"\n",
49
+ " sample float within range\n",
50
+ "\n",
51
+ " Parameters\n",
52
+ " low : low value\n",
53
+ " high : high value\n",
54
+ " \"\"\"\n",
55
+ " return np.random.uniform(low, high)\n",
56
+ "\n",
57
+ "def randomUniformSampledList(low, high, size):\n",
58
+ " \"\"\"\n",
59
+ " sample floats within range to create list\n",
60
+ " Parameters\n",
61
+ " low : low value\n",
62
+ " high : high value\n",
63
+ " size ; size of list to be returned\n",
64
+ " \"\"\"\n",
65
+ " return np.random.uniform(low, high, size)\n",
66
+ "\n",
67
+ "def randomNormSampled(mean, sd):\n",
68
+ " \"\"\"\n",
69
+ " sample float from normal\n",
70
+ " Parameters\n",
71
+ " mean : mean\n",
72
+ " sd : std deviation\n",
73
+ " \"\"\"\n",
74
+ " return np.random.normal(mean, sd)\n",
75
+ "\n",
76
+ "def randomNormSampledList(mean, sd, size):\n",
77
+ " \"\"\"\n",
78
+ " sample float list from normal \n",
79
+ " Parameters\n",
80
+ " mean : mean\n",
81
+ " sd : std deviation\n",
82
+ " size : size of list to be returned\n",
83
+ " \"\"\"\n",
84
+ " return np.random.normal(mean, sd, size)\n",
85
+ "\n",
86
+ "def randomSampledList(sampler, size):\n",
87
+ " \"\"\"\n",
88
+ " sample list from given sampler \n",
89
+ " Parameters\n",
90
+ " sampler : sampler object\n",
91
+ " size : size of list to be returned\n",
92
+ " \"\"\"\n",
93
+ " return list(map(lambda i : sampler.sample(), range(size)))\n",
94
+ "\n",
95
+ "\n",
96
+ "def minLimit(val, minv):\n",
97
+ " \"\"\"\n",
98
+ " min limit\n",
99
+ "\n",
100
+ " Parameters\n",
101
+ " val : value\n",
102
+ " minv : min limit\n",
103
+ " \"\"\"\n",
104
+ " if (val < minv):\n",
105
+ " val = minv\n",
106
+ " return val\n",
107
+ "\n",
108
+ "\n",
109
+ "def rangeLimit(val, minv, maxv):\n",
110
+ " \"\"\"\n",
111
+ " range limit\n",
112
+ " Parameters\n",
113
+ " val : value\n",
114
+ " minv : min limit\n",
115
+ " maxv : max limit\n",
116
+ " \"\"\"\n",
117
+ " if (val < minv):\n",
118
+ " val = minv\n",
119
+ " elif (val > maxv):\n",
120
+ " val = maxv\n",
121
+ " return val\n",
122
+ "\n",
123
+ "\n",
124
+ "def sampleUniform(minv, maxv):\n",
125
+ " \"\"\"\n",
126
+ " sample int within range\n",
127
+ " Parameters\n",
128
+ " minv ; int min limit\n",
129
+ " maxv : int max limit\n",
130
+ " \"\"\"\n",
131
+ " return randint(minv, maxv)\n",
132
+ "\n",
133
+ "\n",
134
+ "def sampleFromBase(value, dev):\n",
135
+ " \"\"\"\n",
136
+ " sample int wrt base\n",
137
+ " Parameters\n",
138
+ " value : base value\n",
139
+ " dev : deviation\n",
140
+ " \"\"\"\n",
141
+ " return randint(value - dev, value + dev)\n",
142
+ "\n",
143
+ "\n",
144
+ "def sampleFloatFromBase(value, dev):\n",
145
+ " \"\"\"\n",
146
+ " sample float wrt base\n",
147
+ " Parameters\n",
148
+ " value : base value\n",
149
+ " dev : deviation\n",
150
+ " \"\"\"\n",
151
+ " return randomFloat(value - dev, value + dev)\n",
152
+ "\n",
153
+ "\n",
154
+ "def distrUniformWithRanndom(total, numItems, noiseLevel):\n",
155
+ " \"\"\"\n",
156
+ " uniformly distribute with some randomness and preserves total\n",
157
+ " Parameters\n",
158
+ " total : total count\n",
159
+ " numItems : no of bins\n",
160
+ " noiseLevel : noise level fraction\n",
161
+ " \"\"\"\n",
162
+ " perItem = total / numItems\n",
163
+ " var = perItem * noiseLevel\n",
164
+ " items = []\n",
165
+ " for i in range(numItems):\n",
166
+ " item = perItem + randomFloat(-var, var)\n",
167
+ " items.append(item)\t\n",
168
+ "\n",
169
+ " #adjust last item\n",
170
+ " sm = sum(items[:-1])\n",
171
+ " items[-1] = total - sm\n",
172
+ " return items\n",
173
+ "\n",
174
+ "\n",
175
+ "def isEventSampled(threshold, maxv=100):\n",
176
+ " \"\"\"\n",
177
+ " sample event which occurs if sampled below threshold\n",
178
+ " Parameters\n",
179
+ " threshold : threshold for sampling\n",
180
+ " maxv : maximum values\n",
181
+ " \"\"\"\n",
182
+ " return randint(0, maxv) < threshold\n",
183
+ "\n",
184
+ "\n",
185
+ "def sampleBinaryEvents(events, probPercent):\n",
186
+ " \"\"\"\n",
187
+ " sample binary events\n",
188
+ " Parameters\n",
189
+ " events : two events\n",
190
+ " probPercent : probability as percentage\n",
191
+ " \"\"\"\n",
192
+ " if (randint(0, 100) < probPercent):\n",
193
+ " event = events[0]\n",
194
+ " else:\n",
195
+ " event = events[1]\n",
196
+ " return event\n",
197
+ "\n",
198
+ "\n",
199
+ "def addNoiseNum(value, sampler):\n",
200
+ " \"\"\"\n",
201
+ " add noise to numeric value\n",
202
+ " Parameters\n",
203
+ " value : base value\n",
204
+ " sampler : sampler for noise\n",
205
+ " \"\"\"\n",
206
+ " return value * (1 + sampler.sample())\n",
207
+ "\n",
208
+ "\n",
209
+ "def addNoiseCat(value, values, noise):\t\n",
210
+ " \"\"\"\n",
211
+ " add noise to categorical value i.e with some probability change value\n",
212
+ " Parameters\n",
213
+ " value : cat value\n",
214
+ " values : cat values\n",
215
+ " noise : noise level fraction\n",
216
+ " \"\"\"\n",
217
+ " newValue = value\n",
218
+ " threshold = int(noise * 100)\n",
219
+ " if (isEventSampled(threshold)):\t\t\n",
220
+ " newValue = selectRandomFromList(values)\n",
221
+ " while newValue == value:\n",
222
+ " newValue = selectRandomFromList(values)\n",
223
+ " return newValue\n",
224
+ "\n",
225
+ "\n",
226
+ "def sampleWithReplace(data, sampSize):\n",
227
+ " \"\"\"\n",
228
+ " sample with replacement\n",
229
+ " Parameters\n",
230
+ " data : array\n",
231
+ " sampSize : sample size\n",
232
+ " \"\"\"\n",
233
+ " sampled = list()\n",
234
+ " le = len(data)\n",
235
+ " if sampSize is None:\n",
236
+ " sampSize = le\n",
237
+ " for i in range(sampSize):\n",
238
+ " j = random.randint(0, le - 1)\n",
239
+ " sampled.append(data[j])\n",
240
+ " return sampled\n",
241
+ "\n",
242
+ "class CumDistr:\n",
243
+ " \"\"\"\n",
244
+ " cumulative distr\n",
245
+ " \"\"\"\n",
246
+ "\n",
247
+ " def __init__(self, data, numBins = None):\n",
248
+ " \"\"\"\n",
249
+ " initializer\n",
250
+ "\n",
251
+ " Parameters\n",
252
+ " data : array\n",
253
+ " numBins : no of bins\n",
254
+ " \"\"\"\n",
255
+ " if not numBins:\n",
256
+ " numBins = int(len(data) / 5)\n",
257
+ " res = stats.cumfreq(data, numbins=numBins)\n",
258
+ " self.cdistr = res.cumcount / len(data)\n",
259
+ " self.loLim = res.lowerlimit\n",
260
+ " self.upLim = res.lowerlimit + res.binsize * res.cumcount.size\n",
261
+ " self.binWidth = res.binsize\n",
262
+ "\n",
263
+ " def getDistr(self, value):\n",
264
+ " \"\"\"\n",
265
+ " get cumulative distribution\n",
266
+ "\n",
267
+ " Parameters\n",
268
+ " value : value\n",
269
+ " \"\"\"\n",
270
+ " if value <= self.loLim:\n",
271
+ " d = 0.0\n",
272
+ " elif value >= self.upLim:\n",
273
+ " d = 1.0\n",
274
+ " else:\n",
275
+ " bin = int((value - self.loLim) / self.binWidth)\n",
276
+ " d = self.cdistr[bin]\n",
277
+ " return d\n",
278
+ "\n",
279
+ "class BernoulliTrialSampler:\n",
280
+ " \"\"\"\n",
281
+ " bernoulli trial sampler return True or False\n",
282
+ " \"\"\"\n",
283
+ "\n",
284
+ " def __init__(self, pr):\n",
285
+ " \"\"\"\n",
286
+ " initializer\n",
287
+ "\n",
288
+ " Parameters\n",
289
+ " pr : probability\n",
290
+ " \"\"\"\n",
291
+ " self.pr = pr\n",
292
+ "\n",
293
+ " def sample(self):\n",
294
+ " \"\"\"\n",
295
+ " samples value\n",
296
+ " \"\"\"\n",
297
+ " return random.random() < self.pr\n",
298
+ "\n",
299
+ "class PoissonSampler:\n",
300
+ " \"\"\"\n",
301
+ " poisson sampler returns number of events\n",
302
+ " \"\"\"\n",
303
+ " def __init__(self, rateOccur, maxSamp):\n",
304
+ " \"\"\"\n",
305
+ " initializer\n",
306
+ "\n",
307
+ " Parameters\n",
308
+ " rateOccur : rate of occurence\n",
309
+ " maxSamp : max limit on no of samples\n",
310
+ " \"\"\"\n",
311
+ " self.rateOccur = rateOccur\n",
312
+ " self.maxSamp = int(maxSamp)\n",
313
+ " self.pmax = self.calculatePr(rateOccur)\n",
314
+ "\n",
315
+ " def calculatePr(self, numOccur):\n",
316
+ " \"\"\"\n",
317
+ " calulates probability\n",
318
+ "\n",
319
+ " Parameters\n",
320
+ " numOccur : no of occurence\n",
321
+ " \"\"\"\n",
322
+ " p = (self.rateOccur ** numOccur) * math.exp(-self.rateOccur) / math.factorial(numOccur)\n",
323
+ " return p\n",
324
+ "\n",
325
+ " def sample(self):\n",
326
+ " \"\"\"\n",
327
+ " samples value\n",
328
+ " \"\"\"\n",
329
+ " done = False\n",
330
+ " samp = 0\n",
331
+ " while not done:\n",
332
+ " no = randint(0, self.maxSamp)\n",
333
+ " sp = randomFloat(0.0, self.pmax)\n",
334
+ " ap = self.calculatePr(no)\n",
335
+ " if sp < ap:\n",
336
+ " done = True\n",
337
+ " samp = no\n",
338
+ " return samp\n",
339
+ "\n",
340
+ "class ExponentialSampler:\n",
341
+ " \"\"\"\n",
342
+ " returns interval between events\n",
343
+ " \"\"\"\n",
344
+ " def __init__(self, rateOccur, maxSamp = None):\n",
345
+ " \"\"\"\n",
346
+ " initializer\n",
347
+ "\n",
348
+ " Parameters\n",
349
+ " rateOccur : rate of occurence\n",
350
+ " maxSamp : max limit on interval\n",
351
+ " \"\"\"\n",
352
+ " self.interval = 1.0 / rateOccur\n",
353
+ " self.maxSamp = int(maxSamp) if maxSamp is not None else None\n",
354
+ "\n",
355
+ " def sample(self):\n",
356
+ " \"\"\"\n",
357
+ " samples value\n",
358
+ " \"\"\"\n",
359
+ " sampled = np.random.exponential(scale=self.interval)\n",
360
+ " if self.maxSamp is not None:\n",
361
+ " while sampled > self.maxSamp:\n",
362
+ " sampled = np.random.exponential(scale=self.interval)\n",
363
+ " return sampled\n",
364
+ "\n",
365
+ "class UniformNumericSampler:\n",
366
+ " \"\"\"\n",
367
+ " uniform sampler for numerical values\n",
368
+ " \"\"\"\n",
369
+ " def __init__(self, minv, maxv):\n",
370
+ " \"\"\"\n",
371
+ " initializer\n",
372
+ "\n",
373
+ " Parameters\n",
374
+ " minv : min value\n",
375
+ " maxv : max value\n",
376
+ " \"\"\"\n",
377
+ " self.minv = minv\n",
378
+ " self.maxv = maxv\n",
379
+ "\n",
380
+ " def isNumeric(self):\n",
381
+ " \"\"\"\n",
382
+ " returns true\n",
383
+ " \"\"\"\n",
384
+ " return True\n",
385
+ "\n",
386
+ " def sample(self):\n",
387
+ " \"\"\"\n",
388
+ " samples value\n",
389
+ " \"\"\"\n",
390
+ " samp =\tsampleUniform(self.minv, self.maxv) if isinstance(self.minv, int) else randomFloat(self.minv, self.maxv)\n",
391
+ " return samp\t\n",
392
+ "\n",
393
+ "class UniformCategoricalSampler:\n",
394
+ " \"\"\"\n",
395
+ " uniform sampler for categorical values\n",
396
+ " \"\"\"\n",
397
+ " def __init__(self, cvalues):\n",
398
+ " \"\"\"\n",
399
+ " initializer\n",
400
+ "\n",
401
+ " Parameters\n",
402
+ " cvalues : categorical value list\n",
403
+ " \"\"\"\n",
404
+ " self.cvalues = cvalues\n",
405
+ "\n",
406
+ " def isNumeric(self):\n",
407
+ " return False\n",
408
+ "\n",
409
+ " def sample(self):\n",
410
+ " \"\"\"\n",
411
+ " samples value\n",
412
+ " \"\"\"\n",
413
+ " return selectRandomFromList(self.cvalues)\t\n",
414
+ "\n",
415
+ "class NormalSampler:\n",
416
+ " \"\"\"\n",
417
+ " normal sampler\n",
418
+ " \"\"\"\n",
419
+ " def __init__(self, mean, stdDev):\n",
420
+ " \"\"\"\n",
421
+ " initializer\n",
422
+ "\n",
423
+ " Parameters\n",
424
+ " mean : mean\n",
425
+ " stdDev : std deviation\n",
426
+ " \"\"\"\n",
427
+ " self.mean = mean\n",
428
+ " self.stdDev = stdDev\n",
429
+ " self.sampleAsInt = False\n",
430
+ "\n",
431
+ " def isNumeric(self):\n",
432
+ " return True\n",
433
+ "\n",
434
+ " def sampleAsIntValue(self):\n",
435
+ " \"\"\"\n",
436
+ " set True to sample as int\n",
437
+ " \"\"\"\n",
438
+ " self.sampleAsInt = True\n",
439
+ "\n",
440
+ " def sample(self):\n",
441
+ " \"\"\"\n",
442
+ " samples value\n",
443
+ " \"\"\"\n",
444
+ " samp = np.random.normal(self.mean, self.stdDev)\n",
445
+ " if self.sampleAsInt:\n",
446
+ " samp = int(samp)\n",
447
+ " return samp\n",
448
+ "\n",
449
+ "class LogNormalSampler:\n",
450
+ " \"\"\"\n",
451
+ " log normal sampler\n",
452
+ " \"\"\"\n",
453
+ " def __init__(self, mean, stdDev):\n",
454
+ " \"\"\"\n",
455
+ " initializer\n",
456
+ "\n",
457
+ " Parameters\n",
458
+ " mean : mean\n",
459
+ " stdDev : std deviation\n",
460
+ " \"\"\"\n",
461
+ " self.mean = mean\n",
462
+ " self.stdDev = stdDev\n",
463
+ "\n",
464
+ " def isNumeric(self):\n",
465
+ " return True\n",
466
+ "\n",
467
+ " def sample(self):\n",
468
+ " \"\"\"\n",
469
+ " samples value\n",
470
+ " \"\"\"\n",
471
+ " return np.random.lognormal(self.mean, self.stdDev)\n",
472
+ "\n",
473
+ "class NormalSamplerWithTrendCycle:\n",
474
+ " \"\"\"\n",
475
+ " normal sampler with cycle and trend\n",
476
+ " \"\"\"\n",
477
+ " def __init__(self, mean, stdDev, dmean, cycle, step=1):\n",
478
+ " \"\"\"\n",
479
+ " initializer\n",
480
+ "\n",
481
+ " Parameters\n",
482
+ " mean : mean\n",
483
+ " stdDev : std deviation\n",
484
+ " dmean : trend delta\n",
485
+ " cycle : cycle values wrt base mean\n",
486
+ " step : adjustment step for cycle and trend\n",
487
+ " \"\"\"\n",
488
+ " self.mean = mean\n",
489
+ " self.cmean = mean\n",
490
+ " self.stdDev = stdDev\n",
491
+ " self.dmean = dmean\n",
492
+ " self.cycle = cycle\n",
493
+ " self.clen = len(cycle) if cycle is not None else 0\n",
494
+ " self.step = step\n",
495
+ " self.count = 0\n",
496
+ "\n",
497
+ " def isNumeric(self):\n",
498
+ " return True\n",
499
+ "\n",
500
+ " def sample(self):\n",
501
+ " \"\"\"\n",
502
+ " samples value\n",
503
+ " \"\"\"\n",
504
+ " s = np.random.normal(self.cmean, self.stdDev)\n",
505
+ " self.count += 1\n",
506
+ " if self.count % self.step == 0:\n",
507
+ " cy = 0\n",
508
+ " if self.clen > 1:\n",
509
+ " coff = self.count % self.clen\n",
510
+ " cy = self.cycle[coff]\n",
511
+ " tr = self.count * self.dmean\n",
512
+ " self.cmean = self.mean + tr + cy\n",
513
+ " return s\n",
514
+ "\n",
515
+ "\n",
516
+ "class ParetoSampler:\n",
517
+ " \"\"\"\n",
518
+ " pareto sampler\n",
519
+ " \"\"\"\n",
520
+ " def __init__(self, mode, shape):\n",
521
+ " \"\"\"\n",
522
+ " initializer\n",
523
+ "\n",
524
+ " Parameters\n",
525
+ " mode : mode\n",
526
+ " shape : shape\n",
527
+ " \"\"\"\n",
528
+ " self.mode = mode\n",
529
+ " self.shape = shape\n",
530
+ "\n",
531
+ " def isNumeric(self):\n",
532
+ " return True\n",
533
+ "\n",
534
+ " def sample(self):\n",
535
+ " \"\"\"\n",
536
+ " samples value\n",
537
+ " \"\"\"\n",
538
+ " return (np.random.pareto(self.shape) + 1) * self.mode\n",
539
+ "\n",
540
+ "class GammaSampler:\n",
541
+ " \"\"\"\n",
542
+ " pareto sampler\n",
543
+ " \"\"\"\n",
544
+ " def __init__(self, shape, scale):\n",
545
+ " \"\"\"\n",
546
+ " initializer\n",
547
+ "\n",
548
+ " Parameters\n",
549
+ " shape : shape\n",
550
+ " scale : scale\n",
551
+ " \"\"\"\n",
552
+ " self.shape = shape\n",
553
+ " self.scale = scale\n",
554
+ "\n",
555
+ " def isNumeric(self):\n",
556
+ " return True\n",
557
+ "\n",
558
+ " def sample(self):\n",
559
+ " \"\"\"\n",
560
+ " samples value\n",
561
+ " \"\"\"\n",
562
+ " return np.random.gamma(self.shape, self.scale)\n",
563
+ "\n",
564
+ "class GaussianRejectSampler:\n",
565
+ " \"\"\"\n",
566
+ " gaussian sampling based on rejection sampling\n",
567
+ " \"\"\"\n",
568
+ " def __init__(self, mean, stdDev):\n",
569
+ " \"\"\"\n",
570
+ " initializer\n",
571
+ "\n",
572
+ " Parameters\n",
573
+ " mean : mean\n",
574
+ " stdDev : std deviation\n",
575
+ " \"\"\"\n",
576
+ " self.mean = mean\n",
577
+ " self.stdDev = stdDev\n",
578
+ " self.xmin = mean - 3 * stdDev\n",
579
+ " self.xmax = mean + 3 * stdDev\n",
580
+ " self.ymin = 0.0\n",
581
+ " self.fmax = 1.0 / (math.sqrt(2.0 * 3.14) * stdDev)\n",
582
+ " self.ymax = 1.05 * self.fmax\n",
583
+ " self.sampleAsInt = False\n",
584
+ "\n",
585
+ " def isNumeric(self):\n",
586
+ " return True\n",
587
+ "\n",
588
+ " def sampleAsIntValue(self):\n",
589
+ " \"\"\"\n",
590
+ " sample as int value\n",
591
+ " \"\"\"\n",
592
+ " self.sampleAsInt = True\n",
593
+ "\n",
594
+ " def sample(self):\n",
595
+ " \"\"\"\n",
596
+ " samples value\n",
597
+ " \"\"\"\n",
598
+ " done = False\n",
599
+ " samp = 0\n",
600
+ " while not done:\n",
601
+ " x = randomFloat(self.xmin, self.xmax)\n",
602
+ " y = randomFloat(self.ymin, self.ymax)\n",
603
+ " f = self.fmax * math.exp(-(x - self.mean) * (x - self.mean) / (2.0 * self.stdDev * self.stdDev))\n",
604
+ " if (y < f):\n",
605
+ " done = True\n",
606
+ " samp = x\n",
607
+ " if self.sampleAsInt:\n",
608
+ " samp = int(samp)\n",
609
+ " return samp\n",
610
+ "\n",
611
+ "class DiscreteRejectSampler:\n",
612
+ " \"\"\"\n",
613
+ " non parametric sampling for discrete values using given distribution based \n",
614
+ " on rejection sampling\t\n",
615
+ " \"\"\"\n",
616
+ " def __init__(self, xmin, xmax, step, *values):\n",
617
+ " \"\"\"\n",
618
+ " initializer\n",
619
+ "\n",
620
+ " Parameters\n",
621
+ " xmin : min value\n",
622
+ " xmax : max value\n",
623
+ " step : discrete step\n",
624
+ " values : distr values\n",
625
+ " \"\"\"\n",
626
+ " self.xmin = xmin\n",
627
+ " self.xmax = xmax\n",
628
+ " self.step = step\n",
629
+ " self.distr = values\n",
630
+ " if (len(self.distr) == 1):\n",
631
+ " self.distr = self.distr[0]\t\n",
632
+ " numSteps = int((self.xmax - self.xmin) / self.step)\n",
633
+ " #print(\"{:.3f} {:.3f} {:.3f} {}\".format(self.xmin, self.xmax, self.step, numSteps))\n",
634
+ " assert len(self.distr)\t== numSteps + 1, \"invalid number of distr values expected {}\".format(numSteps + 1)\n",
635
+ " self.ximin = 0\n",
636
+ " self.ximax = numSteps\n",
637
+ " self.pmax = float(max(self.distr))\n",
638
+ "\n",
639
+ " def isNumeric(self):\n",
640
+ " return True\n",
641
+ "\n",
642
+ " def sample(self):\n",
643
+ " \"\"\"\n",
644
+ " samples value\n",
645
+ " \"\"\"\n",
646
+ " done = False\n",
647
+ " samp = None\n",
648
+ " while not done:\n",
649
+ " xi = randint(self.ximin, self.ximax)\n",
650
+ " #print(formatAny(xi, \"xi\"))\n",
651
+ " ps = randomFloat(0.0, self.pmax)\n",
652
+ " pa = self.distr[xi]\n",
653
+ " if ps < pa:\n",
654
+ " samp = self.xmin + xi * self.step\n",
655
+ " done = True\n",
656
+ " return samp\n",
657
+ "\n",
658
+ "\n",
659
+ "class TriangularRejectSampler:\n",
660
+ " \"\"\"\n",
661
+ " non parametric sampling using triangular distribution based on rejection sampling\t\n",
662
+ " \"\"\"\n",
663
+ " def __init__(self, xmin, xmax, vertexValue, vertexPos=None):\n",
664
+ " \"\"\"\n",
665
+ " initializer\n",
666
+ "\n",
667
+ " Parameters\n",
668
+ " xmin : min value\n",
669
+ " xmax : max value\n",
670
+ " vertexValue : distr value at vertex\n",
671
+ " vertexPos : vertex pposition\n",
672
+ " \"\"\"\n",
673
+ " self.xmin = xmin\n",
674
+ " self.xmax = xmax\n",
675
+ " self.vertexValue = vertexValue\n",
676
+ " if vertexPos: \n",
677
+ " assert vertexPos > xmin and vertexPos < xmax, \"vertex position outside bound\"\n",
678
+ " self.vertexPos = vertexPos\n",
679
+ " else:\n",
680
+ " self.vertexPos = 0.5 * (xmin + xmax)\n",
681
+ " self.s1 = vertexValue / (self.vertexPos - xmin)\n",
682
+ " self.s2 = vertexValue / (xmax - self.vertexPos)\n",
683
+ "\n",
684
+ " def isNumeric(self):\n",
685
+ " return True\n",
686
+ "\n",
687
+ " def sample(self):\n",
688
+ " \"\"\"\n",
689
+ " samples value\n",
690
+ " \"\"\"\n",
691
+ " done = False\n",
692
+ " samp = None\n",
693
+ " while not done:\n",
694
+ " x = randomFloat(self.xmin, self.xmax)\n",
695
+ " y = randomFloat(0.0, self.vertexValue)\n",
696
+ " f = (x - self.xmin) * self.s1 if x < self.vertexPos else (self.xmax - x) * self.s2\n",
697
+ " if (y < f):\n",
698
+ " done = True\n",
699
+ " samp = x\n",
700
+ "\n",
701
+ " return samp;\t\n",
702
+ "\n",
703
+ "class NonParamRejectSampler:\n",
704
+ " \"\"\"\n",
705
+ " non parametric sampling using given distribution based on rejection sampling\t\n",
706
+ " \"\"\"\n",
707
+ " def __init__(self, xmin, binWidth, *values):\n",
708
+ " \"\"\"\n",
709
+ " initializer\n",
710
+ "\n",
711
+ " Parameters\n",
712
+ " xmin : min value\n",
713
+ " binWidth : bin width\n",
714
+ " values : distr values\n",
715
+ " \"\"\"\n",
716
+ " self.values = values\n",
717
+ " if (len(self.values) == 1):\n",
718
+ " self.values = self.values[0]\n",
719
+ " self.xmin = xmin\n",
720
+ " self.xmax = xmin + binWidth * (len(self.values) - 1)\n",
721
+ " #print(self.xmin, self.xmax, binWidth)\n",
722
+ " self.binWidth = binWidth\n",
723
+ " self.fmax = 0\n",
724
+ " for v in self.values:\n",
725
+ " if (v > self.fmax):\n",
726
+ " self.fmax = v\n",
727
+ " self.ymin = 0\n",
728
+ " self.ymax = self.fmax\n",
729
+ " self.sampleAsInt = True\n",
730
+ "\n",
731
+ " def isNumeric(self):\n",
732
+ " return True\n",
733
+ "\n",
734
+ " def sampleAsFloat(self):\n",
735
+ " self.sampleAsInt = False\n",
736
+ "\n",
737
+ " def sample(self):\n",
738
+ " \"\"\"\n",
739
+ " samples value\n",
740
+ " \"\"\"\n",
741
+ " done = False\n",
742
+ " samp = 0\n",
743
+ " while not done:\n",
744
+ " if self.sampleAsInt:\n",
745
+ " x = random.randint(self.xmin, self.xmax)\n",
746
+ " y = random.randint(self.ymin, self.ymax)\n",
747
+ " else:\n",
748
+ " x = randomFloat(self.xmin, self.xmax)\n",
749
+ " y = randomFloat(self.ymin, self.ymax)\n",
750
+ " bin = int((x - self.xmin) / self.binWidth)\n",
751
+ " f = self.values[bin]\n",
752
+ " if (y < f):\n",
753
+ " done = True\n",
754
+ " samp = x\n",
755
+ " return samp\n",
756
+ "\n",
757
+ "class JointNonParamRejectSampler:\n",
758
+ " \"\"\"\n",
759
+ " non parametric sampling using given distribution based on rejection sampling\t\n",
760
+ " \"\"\"\n",
761
+ " def __init__(self, xmin, xbinWidth, xnbin, ymin, ybinWidth, ynbin, *values):\n",
762
+ " \"\"\"\n",
763
+ " initializer\n",
764
+ "\n",
765
+ " Parameters\n",
766
+ " xmin : min value for x\n",
767
+ " xbinWidth : bin width for x\n",
768
+ " xnbin : no of bins for x\n",
769
+ " ymin : min value for y\n",
770
+ " ybinWidth : bin width for y\n",
771
+ " ynbin : no of bins for y\n",
772
+ " values : distr values\n",
773
+ " \"\"\"\n",
774
+ " self.values = values\n",
775
+ " if (len(self.values) == 1):\n",
776
+ " self.values = self.values[0]\n",
777
+ " assert len(self.values) == xnbin * ynbin, \"wrong number of values for joint distr\"\n",
778
+ " self.xmin = xmin\n",
779
+ " self.xmax = xmin + xbinWidth * xnbin\n",
780
+ " self.xbinWidth = xbinWidth\n",
781
+ " self.ymin = ymin\n",
782
+ " self.ymax = ymin + ybinWidth * ynbin\n",
783
+ " self.ybinWidth = ybinWidth\n",
784
+ " self.pmax = max(self.values)\n",
785
+ " self.values = np.array(self.values).reshape(xnbin, ynbin)\n",
786
+ "\n",
787
+ " def isNumeric(self):\n",
788
+ " return True\n",
789
+ "\n",
790
+ " def sample(self):\n",
791
+ " \"\"\"\n",
792
+ " samples value\n",
793
+ " \"\"\"\n",
794
+ " done = False\n",
795
+ " samp = 0\n",
796
+ " while not done:\n",
797
+ " x = randomFloat(self.xmin, self.xmax)\n",
798
+ " y = randomFloat(self.ymin, self.ymax)\n",
799
+ " xbin = int((x - self.xmin) / self.xbinWidth)\n",
800
+ " ybin = int((y - self.ymin) / self.ybinWidth)\n",
801
+ " ap = self.values[xbin][ybin]\n",
802
+ " sp = randomFloat(0.0, self.pmax)\n",
803
+ " if (sp < ap):\n",
804
+ " done = True\n",
805
+ " samp = [x,y]\n",
806
+ " return samp\n",
807
+ "\n",
808
+ "\n",
809
+ "class JointNormalSampler:\n",
810
+ " \"\"\"\n",
811
+ " joint normal sampler\t\n",
812
+ " \"\"\"\n",
813
+ " def __init__(self, *values):\n",
814
+ " \"\"\"\n",
815
+ " initializer\n",
816
+ "\n",
817
+ " Parameters\n",
818
+ " values : 2 mean values followed by 4 values for covar matrix\n",
819
+ " \"\"\"\n",
820
+ " lvalues = list(values)\n",
821
+ " assert len(lvalues) == 6, \"incorrect number of arguments for joint normal sampler\"\n",
822
+ " mean = lvalues[:2]\n",
823
+ " self.mean = np.array(mean)\n",
824
+ " sd = lvalues[2:]\n",
825
+ " self.sd = np.array(sd).reshape(2,2)\n",
826
+ "\n",
827
+ " def isNumeric(self):\n",
828
+ " return True\n",
829
+ "\n",
830
+ " def sample(self):\n",
831
+ " \"\"\"\n",
832
+ " samples value\n",
833
+ " \"\"\"\n",
834
+ " return list(np.random.multivariate_normal(self.mean, self.sd))\n",
835
+ "\n",
836
+ "\n",
837
+ "class MultiVarNormalSampler:\n",
838
+ " \"\"\"\n",
839
+ " muti variate normal sampler\t\n",
840
+ " \"\"\"\n",
841
+ " def __init__(self, numVar, *values):\n",
842
+ " \"\"\"\n",
843
+ " initializer\n",
844
+ "\n",
845
+ " Parameters\n",
846
+ " numVar : no of variables\n",
847
+ " values : numVar mean values followed by numVar x numVar values for covar matrix\n",
848
+ " \"\"\"\n",
849
+ " lvalues = list(values)\n",
850
+ " assert len(lvalues) == numVar + numVar * numVar, \"incorrect number of arguments for multi var normal sampler\"\n",
851
+ " mean = lvalues[:numVar]\n",
852
+ " self.mean = np.array(mean)\n",
853
+ " sd = lvalues[numVar:]\n",
854
+ " self.sd = np.array(sd).reshape(numVar,numVar)\n",
855
+ "\n",
856
+ " def isNumeric(self):\n",
857
+ " return True\n",
858
+ "\n",
859
+ " def sample(self):\n",
860
+ " \"\"\"\n",
861
+ " samples value\n",
862
+ " \"\"\"\n",
863
+ " return list(np.random.multivariate_normal(self.mean, self.sd))\n",
864
+ "\n",
865
+ "class CategoricalRejectSampler:\n",
866
+ " \"\"\"\n",
867
+ " non parametric sampling for categorical attributes using given distribution based \n",
868
+ " on rejection sampling\t\n",
869
+ " \"\"\"\n",
870
+ " def __init__(self, *values):\n",
871
+ " \"\"\"\n",
872
+ " initializer\n",
873
+ "\n",
874
+ " Parameters\n",
875
+ " values : list of tuples which contains a categorical value and the corresponsding distr value\n",
876
+ " \"\"\"\n",
877
+ " self.distr = values\n",
878
+ " if (len(self.distr) == 1):\n",
879
+ " self.distr = self.distr[0]\n",
880
+ " maxv = 0\n",
881
+ " for t in self.distr:\n",
882
+ " if t[1] > maxv:\n",
883
+ " maxv = t[1]\n",
884
+ " self.maxv = maxv\n",
885
+ "\n",
886
+ " def sample(self):\n",
887
+ " \"\"\"\n",
888
+ " samples value\n",
889
+ " \"\"\"\n",
890
+ " done = False\n",
891
+ " samp = \"\"\n",
892
+ " while not done:\n",
893
+ " t = self.distr[randint(0, len(self.distr)-1)]\t\n",
894
+ " d = randomFloat(0, self.maxv)\t\n",
895
+ " if (d <= t[1]):\n",
896
+ " done = True\n",
897
+ " samp = t[0]\n",
898
+ " return samp\n",
899
+ "\n",
900
+ "\n",
901
+ "class DistrMixtureSampler:\n",
902
+ " \"\"\"\n",
903
+ " distr mixture sampler\n",
904
+ " \"\"\"\n",
905
+ " def __init__(self, mixtureWtDistr, *compDistr):\n",
906
+ " \"\"\"\n",
907
+ " initializer\n",
908
+ "\n",
909
+ " Parameters\n",
910
+ " mixtureWtDistr : sampler that returns index into sampler list\n",
911
+ " compDistr : sampler list\n",
912
+ " \"\"\"\n",
913
+ " self.mixtureWtDistr = mixtureWtDistr\n",
914
+ " self.compDistr = compDistr\n",
915
+ " if (len(self.compDistr) == 1):\n",
916
+ " self.compDistr = self.compDistr[0]\n",
917
+ "\n",
918
+ " def isNumeric(self):\n",
919
+ " return True\n",
920
+ "\n",
921
+ " def sample(self):\n",
922
+ " \"\"\"\n",
923
+ " samples value\n",
924
+ " \"\"\"\n",
925
+ " comp = self.mixtureWtDistr.sample()\n",
926
+ "\n",
927
+ " #sample sampled comp distr\n",
928
+ " return self.compDistr[comp].sample()\n",
929
+ "\n",
930
+ "class AncestralSampler:\n",
931
+ " \"\"\"\n",
932
+ " ancestral sampler using conditional distribution\n",
933
+ " \"\"\"\n",
934
+ " def __init__(self, parentDistr, childDistr, numChildren):\n",
935
+ " \"\"\"\n",
936
+ " initializer\n",
937
+ "\n",
938
+ " Parameters\n",
939
+ " parentDistr : parent distr\n",
940
+ " childDistr : childdren distribution dictionary\n",
941
+ " numChildren : no of children\n",
942
+ " \"\"\"\n",
943
+ " self.parentDistr = parentDistr\n",
944
+ " self.childDistr = childDistr\n",
945
+ " self.numChildren = numChildren\n",
946
+ "\n",
947
+ " def sample(self):\n",
948
+ " \"\"\"\n",
949
+ " samples value\n",
950
+ " \"\"\"\n",
951
+ " parent = self.parentDistr.sample()\n",
952
+ "\n",
953
+ " #sample all children conditioned on parent\n",
954
+ " children = []\n",
955
+ " for i in range(self.numChildren):\n",
956
+ " key = (parent, i)\n",
957
+ " child = self.childDistr[key].sample()\n",
958
+ " children.append(child)\n",
959
+ " return (parent, children)\n",
960
+ "\n",
961
+ "class ClusterSampler:\n",
962
+ " \"\"\"\n",
963
+ " sample cluster and then sample member of sampled cluster\n",
964
+ " \"\"\"\n",
965
+ " def __init__(self, clusters, *clustDistr):\n",
966
+ " \"\"\"\n",
967
+ " initializer\n",
968
+ "\n",
969
+ " Parameters\n",
970
+ " clusters : dictionary clusters\n",
971
+ " clustDistr : distr for clusters\n",
972
+ " \"\"\"\n",
973
+ " self.sampler = CategoricalRejectSampler(*clustDistr)\n",
974
+ " self.clusters = clusters\n",
975
+ "\n",
976
+ " def sample(self):\n",
977
+ " \"\"\"\n",
978
+ " samples value\n",
979
+ " \"\"\"\n",
980
+ " cluster = self.sampler.sample()\n",
981
+ " member = random.choice(self.clusters[cluster])\n",
982
+ " return (cluster, member)\n",
983
+ "\n",
984
+ "\n",
985
+ "class MetropolitanSampler:\n",
986
+ " \"\"\"\n",
987
+ " metropolitan sampler\t\n",
988
+ " \"\"\"\n",
989
+ " def __init__(self, propStdDev, min, binWidth, values):\n",
990
+ " \"\"\"\n",
991
+ " initializer\n",
992
+ "\n",
993
+ " Parameters\n",
994
+ " propStdDev : proposal distr std dev\n",
995
+ " min : min domain value for target distr\n",
996
+ " binWidth : bin width\n",
997
+ " values : target distr values\n",
998
+ " \"\"\"\n",
999
+ " self.targetDistr = Histogram.createInitialized(min, binWidth, values)\n",
1000
+ " self.propsalDistr = GaussianRejectSampler(0, propStdDev)\n",
1001
+ " self.proposalMixture = False\n",
1002
+ "\n",
1003
+ " # bootstrap sample\n",
1004
+ " (minv, maxv) = self.targetDistr.getMinMax()\n",
1005
+ " self.curSample = random.randint(minv, maxv)\n",
1006
+ " self.curDistr = self.targetDistr.value(self.curSample)\n",
1007
+ " self.transCount = 0\n",
1008
+ "\n",
1009
+ " def initialize(self):\n",
1010
+ " \"\"\"\n",
1011
+ " initialize\n",
1012
+ " \"\"\"\n",
1013
+ " (minv, maxv) = self.targetDistr.getMinMax()\n",
1014
+ " self.curSample = random.randint(minv, maxv)\n",
1015
+ " self.curDistr = self.targetDistr.value(self.curSample)\n",
1016
+ " self.transCount = 0\n",
1017
+ "\n",
1018
+ " def setProposalDistr(self, propsalDistr):\n",
1019
+ " \"\"\"\n",
1020
+ " set custom proposal distribution\n",
1021
+ " Parameters\n",
1022
+ " propsalDistr : proposal distribution\n",
1023
+ " \"\"\"\n",
1024
+ " self.propsalDistr = propsalDistr\n",
1025
+ "\n",
1026
+ "\n",
1027
+ " def setGlobalProposalDistr(self, globPropStdDev, proposalChoiceThreshold):\n",
1028
+ " \"\"\"\n",
1029
+ " set custom proposal distribution\n",
1030
+ " Parameters\n",
1031
+ " globPropStdDev : global proposal distr std deviation\n",
1032
+ " proposalChoiceThreshold : threshold for using global proposal distribution\n",
1033
+ " \"\"\"\n",
1034
+ " self.globalProposalDistr = GaussianRejectSampler(0, globPropStdDev)\n",
1035
+ " self.proposalChoiceThreshold = proposalChoiceThreshold\n",
1036
+ " self.proposalMixture = True\n",
1037
+ "\n",
1038
+ " def sample(self):\n",
1039
+ " \"\"\"\n",
1040
+ " samples value\n",
1041
+ " \"\"\"\n",
1042
+ " nextSample = self.proposalSample(1)\n",
1043
+ " self.targetSample(nextSample)\n",
1044
+ " return self.curSample;\n",
1045
+ "\n",
1046
+ " def proposalSample(self, skip):\n",
1047
+ " \"\"\"\n",
1048
+ " sample from proposal distribution\n",
1049
+ " Parameters\n",
1050
+ " skip : no of samples to skip\n",
1051
+ " \"\"\"\n",
1052
+ " for i in range(skip):\n",
1053
+ " if not self.proposalMixture:\n",
1054
+ " #one proposal distr\n",
1055
+ " nextSample = self.curSample + self.propsalDistr.sample()\n",
1056
+ " nextSample = self.targetDistr.boundedValue(nextSample)\n",
1057
+ " else:\n",
1058
+ " #mixture of proposal distr\n",
1059
+ " if random.random() < self.proposalChoiceThreshold:\n",
1060
+ " nextSample = self.curSample + self.propsalDistr.sample()\n",
1061
+ " else:\n",
1062
+ " nextSample = self.curSample + self.globalProposalDistr.sample()\n",
1063
+ " nextSample = self.targetDistr.boundedValue(nextSample)\n",
1064
+ "\n",
1065
+ " return nextSample\n",
1066
+ "\n",
1067
+ " def targetSample(self, nextSample):\n",
1068
+ " \"\"\"\n",
1069
+ " target sample\n",
1070
+ " Parameters\n",
1071
+ " nextSample : proposal distr sample\n",
1072
+ " \"\"\"\n",
1073
+ " nextDistr = self.targetDistr.value(nextSample)\n",
1074
+ "\n",
1075
+ " transition = False\n",
1076
+ " if nextDistr > self.curDistr:\n",
1077
+ " transition = True\n",
1078
+ " else:\n",
1079
+ " distrRatio = float(nextDistr) / self.curDistr\n",
1080
+ " if random.random() < distrRatio:\n",
1081
+ " transition = True\n",
1082
+ "\n",
1083
+ " if transition:\n",
1084
+ " self.curSample = nextSample\n",
1085
+ " self.curDistr = nextDistr\n",
1086
+ " self.transCount += 1\n",
1087
+ "\n",
1088
+ "\n",
1089
+ " def subSample(self, skip):\n",
1090
+ " \"\"\"\n",
1091
+ " sub sample\n",
1092
+ " Parameters\n",
1093
+ " skip : no of samples to skip\n",
1094
+ " \"\"\"\n",
1095
+ " nextSample = self.proposalSample(skip)\n",
1096
+ " self.targetSample(nextSample)\n",
1097
+ " return self.curSample;\n",
1098
+ "\n",
1099
+ " def setMixtureProposal(self, globPropStdDev, mixtureThreshold):\n",
1100
+ " \"\"\"\n",
1101
+ " mixture proposal\n",
1102
+ " Parameters\n",
1103
+ " globPropStdDev : global proposal distr std deviation\n",
1104
+ " mixtureThreshold : threshold for using global proposal distribution\n",
1105
+ " \"\"\"\n",
1106
+ " self.globalProposalDistr = GaussianRejectSampler(0, globPropStdDev)\n",
1107
+ " self.mixtureThreshold = mixtureThreshold\n",
1108
+ "\n",
1109
+ " def samplePropsal(self):\n",
1110
+ " \"\"\"\n",
1111
+ " sample from proposal distr\n",
1112
+ " \"\"\"\n",
1113
+ " if self.globalPropsalDistr is None:\n",
1114
+ " proposal = self.propsalDistr.sample()\n",
1115
+ " else:\n",
1116
+ " if random.random() < self.mixtureThreshold:\n",
1117
+ " proposal = self.propsalDistr.sample()\n",
1118
+ " else:\n",
1119
+ " proposal = self.globalProposalDistr.sample()\n",
1120
+ "\n",
1121
+ " return proposal\n",
1122
+ "\n",
1123
+ "class PermutationSampler:\n",
1124
+ " \"\"\"\n",
1125
+ " permutation sampler by shuffling a list\n",
1126
+ " \"\"\"\n",
1127
+ " def __init__(self):\n",
1128
+ " \"\"\"\n",
1129
+ " initialize\n",
1130
+ " \"\"\"\n",
1131
+ " self.values = None\n",
1132
+ " self.numShuffles = None\n",
1133
+ "\n",
1134
+ " @staticmethod\n",
1135
+ " def createSamplerWithValues(values, *numShuffles):\n",
1136
+ " \"\"\"\n",
1137
+ " creator with values\n",
1138
+ " Parameters\n",
1139
+ " values : list data\n",
1140
+ " numShuffles : no of shuffles or range of no of shuffles\n",
1141
+ " \"\"\"\n",
1142
+ " sampler = PermutationSampler()\n",
1143
+ " sampler.values = values\n",
1144
+ " sampler.numShuffles = numShuffles\n",
1145
+ " return sampler\n",
1146
+ "\n",
1147
+ " @staticmethod\n",
1148
+ " def createSamplerWithRange(minv, maxv, *numShuffles):\n",
1149
+ " \"\"\"\n",
1150
+ " creator with ramge min and max\n",
1151
+ "\n",
1152
+ " Parameters\n",
1153
+ " minv : min of range\n",
1154
+ " maxv : max of range\n",
1155
+ " numShuffles : no of shuffles or range of no of shuffles\n",
1156
+ " \"\"\"\n",
1157
+ " sampler = PermutationSampler()\n",
1158
+ " sampler.values = list(range(minv, maxv + 1))\n",
1159
+ " sampler.numShuffles = numShuffles\n",
1160
+ " return sampler\n",
1161
+ "\n",
1162
+ " def sample(self):\n",
1163
+ " \"\"\"\n",
1164
+ " sample new permutation\n",
1165
+ " \"\"\"\n",
1166
+ " cloned = self.values.copy()\n",
1167
+ " shuffle(cloned, *self.numShuffles)\n",
1168
+ " return cloned\n",
1169
+ "\n",
1170
+ "class SpikeyDataSampler:\n",
1171
+ " \"\"\"\n",
1172
+ " samples spikey data\n",
1173
+ " \"\"\"\n",
1174
+ " def __init__(self, intvMean, intvScale, distr, spikeValueMean, spikeValueStd, spikeMaxDuration, baseValue = 0):\n",
1175
+ " \"\"\"\n",
1176
+ " initializer\n",
1177
+ "\n",
1178
+ " Parameters\n",
1179
+ " intvMean : interval mean\n",
1180
+ " intvScale : interval std dev\n",
1181
+ " distr : type of distr for interval\n",
1182
+ " spikeValueMean : spike value mean\n",
1183
+ " spikeValueStd : spike value std dev\n",
1184
+ " spikeMaxDuration : max duration for spike\n",
1185
+ " baseValue : base or offset value\n",
1186
+ " \"\"\"\n",
1187
+ " if distr == \"norm\":\n",
1188
+ " self.intvSampler = NormalSampler(intvMean, intvScale)\n",
1189
+ " elif distr == \"expo\":\n",
1190
+ " rate = 1.0 / intvScale\n",
1191
+ " self.intvSampler = ExponentialSampler(rate)\n",
1192
+ " else:\n",
1193
+ " raise ValueError(\"invalid distribution\")\n",
1194
+ "\n",
1195
+ " self.spikeSampler = NormalSampler(spikeValueMean, spikeValueStd)\n",
1196
+ " self.spikeMaxDuration = spikeMaxDuration\n",
1197
+ " self.baseValue = baseValue\n",
1198
+ " self.inSpike = False\n",
1199
+ " self.spikeCount = 0\n",
1200
+ " self.baseCount = 0\n",
1201
+ " self.baseLength = int(self.intvSampler.sample())\n",
1202
+ " self.spikeValues = list()\n",
1203
+ " self.spikeLength = None\n",
1204
+ "\n",
1205
+ " def sample(self):\n",
1206
+ " \"\"\"\n",
1207
+ " sample new value\n",
1208
+ " \"\"\"\n",
1209
+ " if self.baseCount <= self.baseLength:\n",
1210
+ " sampled = self.baseValue\n",
1211
+ " self.baseCount += 1\n",
1212
+ " else:\n",
1213
+ " if not self.inSpike:\n",
1214
+ " #starting spike\n",
1215
+ " spikeVal = self.spikeSampler.sample()\n",
1216
+ " self.spikeLength = sampleUniform(1, self.spikeMaxDuration)\n",
1217
+ " spikeMaxPos = 0 if self.spikeLength == 1 else sampleUniform(0, self.spikeLength-1)\n",
1218
+ " self.spikeValues.clear()\n",
1219
+ " for i in range(self.spikeLength):\n",
1220
+ " if i < spikeMaxPos:\n",
1221
+ " frac = (i + 1) / (spikeMaxPos + 1)\n",
1222
+ " frac = sampleFloatFromBase(frac, 0.1 * frac)\n",
1223
+ " elif i > spikeMaxPos:\n",
1224
+ " frac = (self.spikeLength - i) / (self.spikeLength - spikeMaxPos)\n",
1225
+ " frac = sampleFloatFromBase(frac, 0.1 * frac)\n",
1226
+ " else:\n",
1227
+ " frac = 1.0\n",
1228
+ " self.spikeValues.append(frac * spikeVal)\n",
1229
+ " self.inSpike = True\n",
1230
+ " self.spikeCount = 0\n",
1231
+ "\n",
1232
+ "\n",
1233
+ " sampled = self.spikeValues[self.spikeCount]\n",
1234
+ " self.spikeCount += 1\n",
1235
+ "\n",
1236
+ " if self.spikeCount == self.spikeLength:\n",
1237
+ " #ending spike\n",
1238
+ " self.baseCount = 0\n",
1239
+ " self.baseLength = int(self.intvSampler.sample())\n",
1240
+ " self.inSpike = False\n",
1241
+ "\n",
1242
+ " return sampled\n",
1243
+ "\n",
1244
+ "\n",
1245
+ "class EventSampler:\n",
1246
+ " \"\"\"\n",
1247
+ " sample event\n",
1248
+ " \"\"\"\n",
1249
+ " def __init__(self, intvSampler, valSampler=None):\n",
1250
+ " \"\"\"\n",
1251
+ " initializer\n",
1252
+ "\n",
1253
+ " Parameters\n",
1254
+ " intvSampler : interval sampler\n",
1255
+ " valSampler : value sampler\n",
1256
+ " \"\"\"\n",
1257
+ " self.intvSampler = intvSampler\n",
1258
+ " self.valSampler = valSampler\n",
1259
+ " self.trigger = int(self.intvSampler.sample())\n",
1260
+ " self.count = 0\n",
1261
+ "\n",
1262
+ " def reset(self):\n",
1263
+ " \"\"\"\n",
1264
+ " reset trigger\n",
1265
+ " \"\"\"\n",
1266
+ " self.trigger = int(self.intvSampler.sample())\n",
1267
+ " self.count = 0\n",
1268
+ "\n",
1269
+ " def sample(self):\n",
1270
+ " \"\"\"\n",
1271
+ " sample event\n",
1272
+ " \"\"\"\n",
1273
+ " if self.count == self.trigger:\n",
1274
+ " sampled = self.valSampler.sample() if self.valSampler is not None else 1.0\n",
1275
+ " self.trigger = int(self.intvSampler.sample())\n",
1276
+ " self.count = 0\n",
1277
+ " else:\n",
1278
+ " sample = 0.0\n",
1279
+ " self.count += 1\n",
1280
+ " return sampled\n",
1281
+ "\n",
1282
+ "\n",
1283
+ "\n",
1284
+ "\n",
1285
+ "def createSampler(data):\n",
1286
+ " \"\"\"\n",
1287
+ " create sampler\n",
1288
+ "\n",
1289
+ " Parameters\n",
1290
+ " data : sampler description\n",
1291
+ " \"\"\"\n",
1292
+ " #print(data)\n",
1293
+ " items = data.split(\":\")\n",
1294
+ " size = len(items)\n",
1295
+ " dtype = items[-1]\n",
1296
+ " stype = items[-2]\n",
1297
+ " sampler = None\n",
1298
+ " if stype == \"uniform\":\n",
1299
+ " if dtype == \"int\":\n",
1300
+ " min = int(items[0])\n",
1301
+ " max = int(items[1])\n",
1302
+ " sampler = UniformNumericSampler(min, max)\n",
1303
+ " elif dtype == \"float\":\n",
1304
+ " min = float(items[0])\n",
1305
+ " max = float(items[1])\n",
1306
+ " sampler = UniformNumericSampler(min, max)\n",
1307
+ " elif dtype == \"categorical\":\n",
1308
+ " values = items[:-2]\n",
1309
+ " sampler = UniformCategoricalSampler(values)\n",
1310
+ " elif stype == \"normal\":\n",
1311
+ " mean = float(items[0])\n",
1312
+ " sd = float(items[1])\n",
1313
+ " sampler = NormalSampler(mean, sd)\n",
1314
+ " if dtype == \"int\":\n",
1315
+ " sampler.sampleAsIntValue()\n",
1316
+ " elif stype == \"nonparam\":\n",
1317
+ " if dtype == \"int\" or dtype == \"float\":\n",
1318
+ " min = int(items[0])\n",
1319
+ " binWidth = int(items[1])\n",
1320
+ " values = items[2:-2]\n",
1321
+ " values = list(map(lambda v: int(v), values))\n",
1322
+ " sampler = NonParamRejectSampler(min, binWidth, values)\n",
1323
+ " if dtype == \"float\":\n",
1324
+ " sampler.sampleAsFloat()\n",
1325
+ " elif dtype == \"categorical\":\n",
1326
+ " values = list()\n",
1327
+ " for i in range(0, size-2, 2):\n",
1328
+ " cval = items[i]\n",
1329
+ " dist = int(items[i+1])\n",
1330
+ " pair = (cval, dist)\n",
1331
+ " values.append(pair)\n",
1332
+ " sampler = CategoricalRejectSampler(values)\n",
1333
+ " elif stype == \"discrete\":\n",
1334
+ " vmin = int(items[0])\n",
1335
+ " vmax = int(items[1])\n",
1336
+ " step = int(items[2])\n",
1337
+ " values = list(map(lambda i : int(items[i]), range(3, len(items)-2)))\n",
1338
+ " sampler = DiscreteRejectSampler(vmin, vmax, step, values)\n",
1339
+ " else:\n",
1340
+ " raise ValueError(\"invalid sampler type \" + dtype)\n",
1341
+ " return sampler\n"
1342
+ ]
1343
+ }
1344
+ ],
1345
+ "metadata": {
1346
+ "kernelspec": {
1347
+ "display_name": "Python 3 (ipykernel)",
1348
+ "language": "python",
1349
+ "name": "python3"
1350
+ },
1351
+ "language_info": {
1352
+ "codemirror_mode": {
1353
+ "name": "ipython",
1354
+ "version": 3
1355
+ },
1356
+ "file_extension": ".py",
1357
+ "mimetype": "text/x-python",
1358
+ "name": "python",
1359
+ "nbconvert_exporter": "python",
1360
+ "pygments_lexer": "ipython3",
1361
+ "version": "3.9.12"
1362
+ }
1363
+ },
1364
+ "nbformat": 4,
1365
+ "nbformat_minor": 5
1366
+ }
lib/stats.ipynb ADDED
@@ -0,0 +1,510 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "f4cbab42",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import sys\n",
11
+ "import random \n",
12
+ "import time\n",
13
+ "import math\n",
14
+ "import numpy as np\n",
15
+ "import statistics \n",
16
+ "from util import *\n",
17
+ "\n",
18
+ "\"\"\"\n",
19
+ "histogram class\n",
20
+ "\"\"\"\n",
21
+ "class Histogram:\n",
22
+ " def __init__(self, min, binWidth):\n",
23
+ " \"\"\"\n",
24
+ " initializer\n",
25
+ "\n",
26
+ " Parameters\n",
27
+ " min : min x\n",
28
+ " binWidth : bin width\n",
29
+ " \"\"\"\n",
30
+ " self.xmin = min\n",
31
+ " self.binWidth = binWidth\n",
32
+ " self.normalized = False\n",
33
+ "\n",
34
+ " @classmethod\n",
35
+ " def createInitialized(cls, xmin, binWidth, values):\n",
36
+ " \"\"\"\n",
37
+ " create histogram instance with min domain, bin width and values\n",
38
+ "\n",
39
+ " Parameters\n",
40
+ " min : min x\n",
41
+ " binWidth : bin width\n",
42
+ " values : y values\n",
43
+ " \"\"\"\n",
44
+ " instance = cls(xmin, binWidth)\n",
45
+ " instance.xmax = xmin + binWidth * (len(values) - 1)\n",
46
+ " instance.ymin = 0\n",
47
+ " instance.bins = np.array(values)\n",
48
+ " instance.fmax = 0\n",
49
+ " for v in values:\n",
50
+ " if (v > instance.fmax):\n",
51
+ " instance.fmax = v\n",
52
+ " instance.ymin = 0.0\n",
53
+ " instance.ymax = instance.fmax\n",
54
+ " return instance\n",
55
+ "\n",
56
+ " @classmethod\n",
57
+ " def createWithNumBins(cls, values, numBins=20):\n",
58
+ " \"\"\"\n",
59
+ " create histogram instance values and no of bins\n",
60
+ "\n",
61
+ " Parameters\n",
62
+ " values : y values\n",
63
+ " numBins : no of bins\n",
64
+ " \"\"\"\n",
65
+ " xmin = min(values)\n",
66
+ " xmax = max(values)\n",
67
+ " binWidth = (xmax + .01 - (xmin - .01)) / numBins\n",
68
+ " instance = cls(xmin, binWidth)\n",
69
+ " instance.xmax = xmax\n",
70
+ " instance.numBin = numBins\n",
71
+ " instance.bins = np.zeros(instance.numBin)\n",
72
+ " for v in values:\n",
73
+ " instance.add(v)\n",
74
+ " return instance\n",
75
+ "\n",
76
+ " @classmethod\n",
77
+ " def createUninitialized(cls, xmin, xmax, binWidth):\n",
78
+ " \"\"\"\n",
79
+ " create histogram instance with no y values using domain min , max and bin width\n",
80
+ "\n",
81
+ " Parameters\n",
82
+ " min : min x\n",
83
+ " max : max x\n",
84
+ " binWidth : bin width\n",
85
+ " \"\"\"\n",
86
+ " instance = cls(xmin, binWidth)\n",
87
+ " instance.xmax = xmax\n",
88
+ " instance.numBin = (xmax - xmin) / binWidth + 1\n",
89
+ " instance.bins = np.zeros(instance.numBin)\n",
90
+ " return instance\n",
91
+ "\n",
92
+ " def initialize(self):\n",
93
+ " \"\"\"\n",
94
+ " set y values to 0\n",
95
+ " \"\"\"\n",
96
+ " self.bins = np.zeros(self.numBin)\n",
97
+ "\n",
98
+ " def add(self, value):\n",
99
+ " \"\"\"\n",
100
+ " adds a value to a bin\n",
101
+ "\n",
102
+ " Parameters\n",
103
+ " value : value\n",
104
+ " \"\"\"\n",
105
+ " bin = int((value - self.xmin) / self.binWidth)\n",
106
+ " if (bin < 0 or bin > self.numBin - 1):\n",
107
+ " print (bin)\n",
108
+ " raise ValueError(\"outside histogram range\")\n",
109
+ " self.bins[bin] += 1.0\n",
110
+ "\n",
111
+ " def normalize(self):\n",
112
+ " \"\"\"\n",
113
+ " normalize bin counts\n",
114
+ " \"\"\"\n",
115
+ " if not self.normalized:\n",
116
+ " total = self.bins.sum()\n",
117
+ " self.bins = np.divide(self.bins, total)\n",
118
+ " self.normalized = True\n",
119
+ "\n",
120
+ " def cumDistr(self):\n",
121
+ " \"\"\"\n",
122
+ " cumulative dists\n",
123
+ " \"\"\"\n",
124
+ " self.normalize()\n",
125
+ " self.cbins = np.cumsum(self.bins)\n",
126
+ " return self.cbins\n",
127
+ "\n",
128
+ " def distr(self):\n",
129
+ " \"\"\"\n",
130
+ " distr\n",
131
+ " \"\"\"\n",
132
+ " self.normalize()\n",
133
+ " return self.bins\n",
134
+ "\n",
135
+ "\n",
136
+ " def percentile(self, percent):\n",
137
+ " \"\"\"\n",
138
+ " return value corresponding to a percentile\n",
139
+ "\n",
140
+ " Parameters\n",
141
+ " percent : percentile value\n",
142
+ " \"\"\"\n",
143
+ " if self.cbins is None:\n",
144
+ " raise ValueError(\"cumulative distribution is not available\")\n",
145
+ "\n",
146
+ " for i,cuml in enumerate(self.cbins):\n",
147
+ " if percent > cuml:\n",
148
+ " value = (i * self.binWidth) - (self.binWidth / 2) + \\\n",
149
+ " (percent - self.cbins[i-1]) * self.binWidth / (self.cbins[i] - self.cbins[i-1]) \n",
150
+ " break\n",
151
+ " return value\n",
152
+ "\n",
153
+ " def max(self):\n",
154
+ " \"\"\"\n",
155
+ " return max bin value \n",
156
+ " \"\"\"\n",
157
+ " return self.bins.max()\n",
158
+ "\n",
159
+ " def value(self, x):\n",
160
+ " \"\"\"\n",
161
+ " return a bin value\t\n",
162
+ "\n",
163
+ " Parameters\n",
164
+ " x : x value\n",
165
+ " \"\"\"\n",
166
+ " bin = int((x - self.xmin) / self.binWidth)\n",
167
+ " f = self.bins[bin]\n",
168
+ " return f\n",
169
+ "\n",
170
+ " def bin(self, x):\n",
171
+ " \"\"\"\n",
172
+ " return a bin index\t\n",
173
+ "\n",
174
+ " Parameters\n",
175
+ " x : x value\n",
176
+ " \"\"\"\n",
177
+ " return int((x - self.xmin) / self.binWidth)\n",
178
+ "\n",
179
+ " def cumValue(self, x):\n",
180
+ " \"\"\"\n",
181
+ " return a cumulative bin value\t\n",
182
+ "\n",
183
+ " Parameters\n",
184
+ " x : x value\n",
185
+ " \"\"\"\n",
186
+ " bin = int((x - self.xmin) / self.binWidth)\n",
187
+ " c = self.cbins[bin]\n",
188
+ " return c\n",
189
+ "\n",
190
+ "\n",
191
+ " def getMinMax(self):\n",
192
+ " \"\"\"\n",
193
+ " returns x min and x max\n",
194
+ " \"\"\"\n",
195
+ " return (self.xmin, self.xmax)\n",
196
+ "\n",
197
+ " def boundedValue(self, x):\n",
198
+ " \"\"\"\n",
199
+ " return x bounde by min and max\t\n",
200
+ "\n",
201
+ " Parameters\n",
202
+ " x : x value\n",
203
+ " \"\"\"\n",
204
+ " if x < self.xmin:\n",
205
+ " x = self.xmin\n",
206
+ " elif x > self.xmax:\n",
207
+ " x = self.xmax\n",
208
+ " return x\n",
209
+ "\n",
210
+ "\"\"\"\n",
211
+ "categorical histogram class\n",
212
+ "\"\"\"\n",
213
+ "class CatHistogram:\n",
214
+ " def __init__(self):\n",
215
+ " \"\"\"\n",
216
+ " initializer\n",
217
+ " \"\"\"\n",
218
+ " self.binCounts = dict()\n",
219
+ " self.counts = 0\n",
220
+ " self.normalized = False\n",
221
+ "\n",
222
+ " def add(self, value):\n",
223
+ " \"\"\"\n",
224
+ " adds a value to a bin\n",
225
+ "\n",
226
+ " Parameters\n",
227
+ " x : x value\n",
228
+ " \"\"\"\n",
229
+ " addToKeyedCounter(self.binCounts, value)\n",
230
+ " self.counts += 1\t\n",
231
+ "\n",
232
+ " def normalize(self):\n",
233
+ " \"\"\"\n",
234
+ " normalize\n",
235
+ " \"\"\"\n",
236
+ " if not self.normalized:\n",
237
+ " self.binCounts = dict(map(lambda r : (r[0],r[1] / self.counts), self.binCounts.items()))\n",
238
+ " self.normalized = True\n",
239
+ "\n",
240
+ " def getMode(self):\n",
241
+ " \"\"\"\n",
242
+ " get mode\n",
243
+ " \"\"\"\n",
244
+ " maxk = None\n",
245
+ " maxv = 0\n",
246
+ " #print(self.binCounts)\n",
247
+ " for k,v in self.binCounts.items():\n",
248
+ " if v > maxv:\n",
249
+ " maxk = k\n",
250
+ " maxv = v\n",
251
+ " return (maxk, maxv)\t\n",
252
+ "\n",
253
+ " def getEntropy(self):\n",
254
+ " \"\"\"\n",
255
+ " get entropy\n",
256
+ " \"\"\"\n",
257
+ " self.normalize()\n",
258
+ " entr = 0 \n",
259
+ " #print(self.binCounts)\n",
260
+ " for k,v in self.binCounts.items():\n",
261
+ " entr -= v * math.log(v)\n",
262
+ " return entr\n",
263
+ "\n",
264
+ " def getUniqueValues(self):\n",
265
+ " \"\"\"\n",
266
+ " get unique values\n",
267
+ " \"\"\"\t\t\n",
268
+ " return list(self.binCounts.keys())\n",
269
+ "\n",
270
+ " def getDistr(self):\n",
271
+ " \"\"\"\n",
272
+ " get distribution\n",
273
+ " \"\"\"\t\n",
274
+ " self.normalize()\t\n",
275
+ " return self.binCounts.copy()\n",
276
+ "\n",
277
+ "class RunningStat:\n",
278
+ " \"\"\"\n",
279
+ " running stat class\n",
280
+ " \"\"\"\n",
281
+ " def __init__(self):\n",
282
+ " \"\"\"\n",
283
+ " initializer\t\n",
284
+ " \"\"\"\n",
285
+ " self.sum = 0.0\n",
286
+ " self.sumSq = 0.0\n",
287
+ " self.count = 0\n",
288
+ "\n",
289
+ " @staticmethod\n",
290
+ " def create(count, sum, sumSq):\n",
291
+ " \"\"\"\n",
292
+ " creates iinstance\t\n",
293
+ "\n",
294
+ " Parameters\n",
295
+ " sum : sum of values\n",
296
+ " sumSq : sum of valure squared\n",
297
+ " \"\"\"\n",
298
+ " rs = RunningStat()\n",
299
+ " rs.sum = sum\n",
300
+ " rs.sumSq = sumSq\n",
301
+ " rs.count = count\n",
302
+ " return rs\n",
303
+ "\n",
304
+ " def add(self, value):\n",
305
+ " \"\"\"\n",
306
+ " adds new value\n",
307
+ " Parameters\n",
308
+ " value : value to add\n",
309
+ " \"\"\"\n",
310
+ " self.sum += value\n",
311
+ " self.sumSq += (value * value)\n",
312
+ " self.count += 1\n",
313
+ "\n",
314
+ " def getStat(self):\n",
315
+ " \"\"\"\n",
316
+ " return mean and std deviation \n",
317
+ " \"\"\"\n",
318
+ " mean = self.sum /self. count\n",
319
+ " t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)\n",
320
+ " sd = math.sqrt(t)\n",
321
+ " re = (mean, sd)\n",
322
+ " return re\n",
323
+ "\n",
324
+ " def addGetStat(self,value):\n",
325
+ " \"\"\"\n",
326
+ " calculate mean and std deviation with new value added\n",
327
+ " Parameters\n",
328
+ " value : value to add\n",
329
+ " \"\"\"\n",
330
+ " self.add(value)\n",
331
+ " re = self.getStat()\n",
332
+ " return re\n",
333
+ "\n",
334
+ " def getCount(self):\n",
335
+ " \"\"\"\n",
336
+ " return count\n",
337
+ " \"\"\"\n",
338
+ " return self.count\n",
339
+ "\n",
340
+ " def getState(self):\n",
341
+ " \"\"\"\n",
342
+ " return state\n",
343
+ " \"\"\"\n",
344
+ " s = (self.count, self.sum, self.sumSq)\n",
345
+ " return s\n",
346
+ "\n",
347
+ "class SlidingWindowStat:\n",
348
+ " \"\"\"\n",
349
+ " sliding window stats\n",
350
+ " \"\"\"\n",
351
+ " def __init__(self):\n",
352
+ " \"\"\"\n",
353
+ " initializer\n",
354
+ " \"\"\"\n",
355
+ " self.sum = 0.0\n",
356
+ " self.sumSq = 0.0\n",
357
+ " self.count = 0\n",
358
+ " self.values = None\n",
359
+ "\n",
360
+ " @staticmethod\n",
361
+ " def create(values, sum, sumSq):\n",
362
+ " \"\"\"\n",
363
+ " creates iinstance\t\n",
364
+ "\n",
365
+ " Parameters\n",
366
+ " sum : sum of values\n",
367
+ " sumSq : sum of valure squared\n",
368
+ " \"\"\"\n",
369
+ " sws = SlidingWindowStat()\n",
370
+ " sws.sum = sum\n",
371
+ " sws.sumSq = sumSq\n",
372
+ " self.values = values.copy()\n",
373
+ " sws.count = len(self.values)\n",
374
+ " return sws\n",
375
+ "\n",
376
+ " @staticmethod\n",
377
+ " def initialize(values):\n",
378
+ " \"\"\"\n",
379
+ " creates iinstance\t\n",
380
+ "\n",
381
+ " Parameters\n",
382
+ " values : list of values\n",
383
+ " \"\"\"\n",
384
+ " sws = SlidingWindowStat()\n",
385
+ " sws.values = values.copy()\n",
386
+ " for v in sws.values:\n",
387
+ " sws.sum += v\n",
388
+ " sws.sumSq += v * v\t\t\n",
389
+ " sws.count = len(sws.values)\n",
390
+ " return sws\n",
391
+ "\n",
392
+ " @staticmethod\n",
393
+ " def createEmpty(count):\n",
394
+ " \"\"\"\n",
395
+ " creates iinstance\t\n",
396
+ "\n",
397
+ " Parameters\n",
398
+ " count : count of values\n",
399
+ " \"\"\"\n",
400
+ " sws = SlidingWindowStat()\n",
401
+ " sws.count = count\n",
402
+ " sws.values = list()\n",
403
+ " return sws\n",
404
+ "\n",
405
+ " def add(self, value):\n",
406
+ " \"\"\"\n",
407
+ " adds new value\n",
408
+ "\n",
409
+ " Parameters\n",
410
+ " value : value to add\n",
411
+ " \"\"\"\n",
412
+ " self.values.append(value)\t\t\n",
413
+ " if len(self.values) > self.count:\n",
414
+ " self.sum += value - self.values[0]\n",
415
+ " self.sumSq += (value * value) - (self.values[0] * self.values[0])\n",
416
+ " self.values.pop(0)\n",
417
+ " else:\n",
418
+ " self.sum += value\n",
419
+ " self.sumSq += (value * value)\n",
420
+ "\n",
421
+ "\n",
422
+ " def getStat(self):\n",
423
+ " \"\"\"\n",
424
+ " calculate mean and std deviation \n",
425
+ " \"\"\"\n",
426
+ " mean = self.sum /self. count\n",
427
+ " t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)\n",
428
+ " sd = math.sqrt(t)\n",
429
+ " re = (mean, sd)\n",
430
+ " return re\n",
431
+ "\n",
432
+ " def addGetStat(self,value):\n",
433
+ " \"\"\"\n",
434
+ " calculate mean and std deviation with new value added\n",
435
+ " \"\"\"\n",
436
+ " self.add(value)\n",
437
+ " re = self.getStat()\n",
438
+ " return re\n",
439
+ "\n",
440
+ " def getCount(self):\n",
441
+ " \"\"\"\n",
442
+ " return count\n",
443
+ " \"\"\"\n",
444
+ " return self.count\n",
445
+ "\n",
446
+ " def getCurSize(self):\n",
447
+ " \"\"\"\n",
448
+ " return count\n",
449
+ " \"\"\"\n",
450
+ " return len(self.values)\n",
451
+ "\n",
452
+ " def getState(self):\n",
453
+ " \"\"\"\n",
454
+ " return state\n",
455
+ " \"\"\"\n",
456
+ " s = (self.count, self.sum, self.sumSq)\n",
457
+ " return s\n",
458
+ "\n",
459
+ "\n",
460
+ "def basicStat(ldata):\n",
461
+ " \"\"\"\n",
462
+ " mean and std dev\n",
463
+ " Parameters\n",
464
+ " ldata : list of values\n",
465
+ " \"\"\"\n",
466
+ " m = statistics.mean(ldata)\n",
467
+ " s = statistics.stdev(ldata, xbar=m)\n",
468
+ " r = (m, s)\n",
469
+ " return r\n",
470
+ "\n",
471
+ "def getFileColumnStat(filePath, col, delem=\",\"):\n",
472
+ " \"\"\"\n",
473
+ " gets stats for a file column\n",
474
+ "\n",
475
+ " Parameters\n",
476
+ " filePath : file path\n",
477
+ " col : col index\n",
478
+ " delem : field delemter\n",
479
+ " \"\"\"\n",
480
+ " rs = RunningStat()\n",
481
+ " for rec in fileRecGen(filePath, delem):\n",
482
+ " va = float(rec[col])\n",
483
+ " rs.add(va)\n",
484
+ "\n",
485
+ " return rs.getStat()\n"
486
+ ]
487
+ }
488
+ ],
489
+ "metadata": {
490
+ "kernelspec": {
491
+ "display_name": "Python 3 (ipykernel)",
492
+ "language": "python",
493
+ "name": "python3"
494
+ },
495
+ "language_info": {
496
+ "codemirror_mode": {
497
+ "name": "ipython",
498
+ "version": 3
499
+ },
500
+ "file_extension": ".py",
501
+ "mimetype": "text/x-python",
502
+ "name": "python",
503
+ "nbconvert_exporter": "python",
504
+ "pygments_lexer": "ipython3",
505
+ "version": "3.9.12"
506
+ }
507
+ },
508
+ "nbformat": 4,
509
+ "nbformat_minor": 5
510
+ }
lib/tnn.ipynb ADDED
@@ -0,0 +1,800 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "3853095d",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import sys\n",
12
+ "import matplotlib.pyplot as plt\n",
13
+ "import numpy as np\n",
14
+ "import torch\n",
15
+ "from torch.autograd import Variable\n",
16
+ "from torch.utils.data import Dataset, TensorDataset\n",
17
+ "from torch.utils.data import DataLoader\n",
18
+ "import sklearn as sk\n",
19
+ "from sklearn.neighbors import KDTree\n",
20
+ "import matplotlib\n",
21
+ "import random\n",
22
+ "import jprops\n",
23
+ "from random import randint\n",
24
+ "import statistics\n",
25
+ "sys.path.append(os.path.abspath(\"../lib\"))\n",
26
+ "from util import *\n",
27
+ "from mlutil import *\n",
28
+ "\n",
29
+ "\"\"\"\n",
30
+ "forward hook function\n",
31
+ "\"\"\"\n",
32
+ "intermedOut = {}\n",
33
+ "lvalues = list()\n",
34
+ "\n",
35
+ "def hookFn(m, i, o):\n",
36
+ " \"\"\"\n",
37
+ " call back for latent values\n",
38
+ " \"\"\"\n",
39
+ " #intermedOut[m] = o\n",
40
+ " lv = o.data.cpu().numpy()\n",
41
+ " lv = lv[0].tolist()\n",
42
+ " lvalues.append(lv)\n",
43
+ " #print(lv)\n",
44
+ "\n",
45
+ "def getLatValues():\n",
46
+ " \"\"\"\n",
47
+ " \"\"\"\n",
48
+ " return lvalues\n",
49
+ "\n",
50
+ "class FeedForwardNetwork(torch.nn.Module):\n",
51
+ " def __init__(self, configFile, addDefValues=None):\n",
52
+ " \"\"\"\n",
53
+ " In the constructor we instantiate two nn.Linear modules and assign them as\n",
54
+ " member variables.\n",
55
+ "\n",
56
+ " Parameters\n",
57
+ " configFile : config file path\n",
58
+ " addDefValues : dictionary of additional default values\t\n",
59
+ " \"\"\"\n",
60
+ " defValues = dict() if addDefValues is None else addDefValues.copy()\n",
61
+ " defValues[\"common.mode\"] = (\"training\", None)\n",
62
+ " defValues[\"common.model.directory\"] = (\"model\", None)\n",
63
+ " defValues[\"common.model.file\"] = (None, None)\n",
64
+ " defValues[\"common.preprocessing\"] = (None, None)\n",
65
+ " defValues[\"common.scaling.method\"] = (\"zscale\", None)\n",
66
+ " defValues[\"common.scaling.minrows\"] = (50, None)\n",
67
+ " defValues[\"common.scaling.param.file\"] = (None, None)\n",
68
+ " defValues[\"common.verbose\"] = (False, None)\n",
69
+ " defValues[\"common.device\"] = (\"cpu\", None)\n",
70
+ " defValues[\"train.data.file\"] = (None, \"missing training data file\")\n",
71
+ " defValues[\"train.data.fields\"] = (None, \"missing training data field ordinals\")\n",
72
+ " defValues[\"train.data.feature.fields\"] = (None, \"missing training data feature field ordinals\")\n",
73
+ " defValues[\"train.data.out.fields\"] = (None, \"missing training data feature field ordinals\")\n",
74
+ " defValues[\"train.layer.data\"] = (None, \"missing layer data\")\n",
75
+ " defValues[\"train.input.size\"] = (None, None)\n",
76
+ " defValues[\"train.output.size\"] = (None, \"missing output size\")\n",
77
+ " defValues[\"train.batch.size\"] = (10, None)\n",
78
+ " defValues[\"train.loss.reduction\"] = (\"mean\", None)\n",
79
+ " defValues[\"train.num.iterations\"] = (500, None)\n",
80
+ " defValues[\"train.lossFn\"] = (\"mse\", None) \n",
81
+ " defValues[\"train.optimizer\"] = (\"sgd\", None) \n",
82
+ " defValues[\"train.opt.learning.rate\"] = (.0001, None)\n",
83
+ " defValues[\"train.opt.weight.decay\"] = (0, None) \n",
84
+ " defValues[\"train.opt.momentum\"] = (0, None) \n",
85
+ " defValues[\"train.opt.eps\"] = (1e-08, None) \n",
86
+ " defValues[\"train.opt.dampening\"] = (0, None) \n",
87
+ " defValues[\"train.opt.momentum.nesterov\"] = (False, None) \n",
88
+ " defValues[\"train.opt.betas\"] = ([0.9, 0.999], None) \n",
89
+ " defValues[\"train.opt.alpha\"] = (0.99, None) \n",
90
+ " defValues[\"train.save.model\"] = (False, None) \n",
91
+ " defValues[\"train.track.error\"] = (False, None) \n",
92
+ " defValues[\"train.epoch.intv\"] = (5, None) \n",
93
+ " defValues[\"train.batch.intv\"] = (5, None) \n",
94
+ " defValues[\"train.print.weights\"] = (False, None) \n",
95
+ " defValues[\"valid.data.file\"] = (None, None)\n",
96
+ " defValues[\"valid.accuracy.metric\"] = (None, None)\n",
97
+ " defValues[\"predict.data.file\"] = (None, None)\n",
98
+ " defValues[\"predict.use.saved.model\"] = (True, None)\n",
99
+ " defValues[\"predict.output\"] = (\"binary\", None)\n",
100
+ " defValues[\"predict.feat.pad.size\"] = (60, None)\n",
101
+ " defValues[\"predict.print.output\"] = (True, None)\n",
102
+ " defValues[\"calibrate.num.bins\"] = (10, None)\n",
103
+ " defValues[\"calibrate.pred.prob.thresh\"] = (0.5, None)\n",
104
+ " defValues[\"calibrate.num.nearest.neighbors\"] = (10, None)\n",
105
+ " self.config = Configuration(configFile, defValues)\n",
106
+ "\n",
107
+ " super(FeedForwardNetwork, self).__init__()\n",
108
+ "\n",
109
+ " def setConfigParam(self, name, value):\n",
110
+ " \"\"\"\n",
111
+ " set config param\n",
112
+ "\n",
113
+ " Parameters\n",
114
+ " name : config name\n",
115
+ " value : config value\n",
116
+ " \"\"\"\n",
117
+ " self.config.setParam(name, value)\n",
118
+ "\n",
119
+ " def getConfig(self):\n",
120
+ " \"\"\"\n",
121
+ " get config object\n",
122
+ " \"\"\"\n",
123
+ " return self.config\n",
124
+ "\n",
125
+ " def setVerbose(self, verbose):\n",
126
+ " self.verbose = verbose\n",
127
+ "\n",
128
+ " def buildModel(self):\n",
129
+ " \"\"\"\n",
130
+ " Loads configuration and builds the various piecess necessary for the model\n",
131
+ " \"\"\"\n",
132
+ " torch.manual_seed(9999)\n",
133
+ "\n",
134
+ " self.verbose = self.config.getBooleanConfig(\"common.verbose\")[0]\n",
135
+ " numinp = self.config.getIntConfig(\"train.input.size\")[0]\n",
136
+ " if numinp is None:\n",
137
+ " numinp = len(self.config.getIntListConfig(\"train.data.feature.fields\")[0])\n",
138
+ " #numOut = len(self.config.getStringConfig(\"train.data.out.fields\")[0].split(\",\"))\n",
139
+ " self.outputSize = self.config.getIntConfig(\"train.output.size\")[0]\n",
140
+ " self.batchSize = self.config.getIntConfig(\"train.batch.size\")[0]\n",
141
+ " #lossRed = self.config.getStringConfig(\"train.loss.reduction\")[0]\n",
142
+ " #learnRate = self.config.getFloatConfig(\"train.opt.learning.rate\")[0]\n",
143
+ " self.numIter = self.config.getIntConfig(\"train.num.iterations\")[0]\n",
144
+ " optimizer = self.config.getStringConfig(\"train.optimizer\")[0]\n",
145
+ " self.lossFnStr = self.config.getStringConfig(\"train.lossFn\")[0]\n",
146
+ " self.accMetric = self.config.getStringConfig(\"valid.accuracy.metric\")[0]\n",
147
+ " self.trackErr = self.config.getBooleanConfig(\"train.track.error\")[0]\n",
148
+ " self.batchIntv = self.config.getIntConfig(\"train.batch.intv\")[0]\n",
149
+ " self.restored = False\n",
150
+ " self.clabels = list(range(self.outputSize)) if self.outputSize > 1 else None\n",
151
+ "\n",
152
+ " #build network\n",
153
+ " layers = list()\n",
154
+ " ninp = numinp\n",
155
+ " trData = self.config.getStringConfig(\"train.layer.data\")[0].split(\",\")\n",
156
+ " for ld in trData:\n",
157
+ " lde = ld.split(\":\")\n",
158
+ " assert len(lde) == 5, \"expecting 5 items for layer data\"\n",
159
+ "\n",
160
+ " #num of units, activation, whether batch normalize, whether batch normalize after activation, dropout fraction\n",
161
+ " nunit = int(lde[0])\n",
162
+ " actStr = lde[1]\n",
163
+ " act = FeedForwardNetwork.createActivation(actStr) if actStr != \"none\" else None\n",
164
+ " bnorm = lde[2] == \"true\"\n",
165
+ " afterAct = lde[3] == \"true\"\n",
166
+ " dpr = float(lde[4])\n",
167
+ "\n",
168
+ " layers.append(torch.nn.Linear(ninp, nunit))\t\t\t\n",
169
+ " if bnorm:\n",
170
+ " #with batch norm\n",
171
+ " if afterAct:\n",
172
+ " safeAppend(layers, act)\n",
173
+ " layers.append(torch.nn.BatchNorm1d(nunit))\n",
174
+ " else:\n",
175
+ " layers.append(torch.nn.BatchNorm1d(nunit))\n",
176
+ " safeAppend(layers, act)\n",
177
+ " else:\n",
178
+ " #without batch norm\n",
179
+ " safeAppend(layers, act)\n",
180
+ "\n",
181
+ " if dpr > 0:\n",
182
+ " layers.append(torch.nn.Dropout(dpr))\n",
183
+ " ninp = nunit\n",
184
+ "\n",
185
+ " self.layers = torch.nn.Sequential(*layers)\t\n",
186
+ "\n",
187
+ " self.device = FeedForwardNetwork.getDevice(self)\n",
188
+ "\n",
189
+ " #training data\n",
190
+ " dataFile = self.config.getStringConfig(\"train.data.file\")[0]\n",
191
+ " (featData, outData) = FeedForwardNetwork.prepData(self, dataFile)\n",
192
+ " self.featData = torch.from_numpy(featData)\n",
193
+ " self.outData = torch.from_numpy(outData)\n",
194
+ "\n",
195
+ " #validation data\n",
196
+ " dataFile = self.config.getStringConfig(\"valid.data.file\")[0]\n",
197
+ " (featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataFile)\n",
198
+ " self.validFeatData = torch.from_numpy(featDataV)\n",
199
+ " self.validOutData = torch.from_numpy(outDataV)\n",
200
+ "\n",
201
+ " # loss function and optimizer\n",
202
+ " self.lossFn = FeedForwardNetwork.createLossFunction(self, self.lossFnStr)\n",
203
+ " self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizer)\n",
204
+ "\n",
205
+ " self.yPred = None\n",
206
+ " self.restored = False\n",
207
+ "\n",
208
+ " #mode to device\n",
209
+ " self.device = FeedForwardNetwork.getDevice(self)\t\n",
210
+ " self.featData = self.featData.to(self.device)\n",
211
+ " self.outData = self.outData.to(self.device)\n",
212
+ " self.validFeatData = self.validFeatData.to(self.device)\n",
213
+ " self.to(self.device)\n",
214
+ "\n",
215
+ " @staticmethod\n",
216
+ " def getDevice(model):\n",
217
+ " \"\"\"\n",
218
+ " gets device\n",
219
+ "\n",
220
+ " Parameters\n",
221
+ " model : torch model\n",
222
+ " \"\"\"\n",
223
+ " devType = model.config.getStringConfig(\"common.device\")[0]\n",
224
+ " if devType == \"cuda\":\n",
225
+ " if torch.cuda.is_available():\n",
226
+ " device = torch.device(\"cuda\")\n",
227
+ " else:\n",
228
+ " exitWithMsg(\"cuda not available\")\n",
229
+ " else:\n",
230
+ " device = torch.device(\"cpu\")\n",
231
+ " return device\n",
232
+ "\n",
233
+ " def setValidationData(self, dataSource, prep=True):\n",
234
+ " \"\"\"\n",
235
+ " sets validation data\n",
236
+ "\n",
237
+ " Parameters\n",
238
+ " dataSource : data source str if file path or 2D array\n",
239
+ " prep : if True load and prepare \n",
240
+ " \"\"\"\n",
241
+ " if prep:\n",
242
+ " (featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataSource)\n",
243
+ " self.validFeatData = torch.from_numpy(featDataV)\n",
244
+ " self.validOutData = outDataV\n",
245
+ " else:\n",
246
+ " self.validFeatData = torch.from_numpy(dataSource[0])\n",
247
+ " self.validOutData = dataSource[1]\t\t\n",
248
+ "\n",
249
+ " self.validFeatData = self.validFeatData.to(self.device)\n",
250
+ "\n",
251
+ " @staticmethod\n",
252
+ " def createActivation(actName):\n",
253
+ " \"\"\"\n",
254
+ " create activation\n",
255
+ "\n",
256
+ " Parameters\n",
257
+ " actName : activation name\n",
258
+ " \"\"\"\n",
259
+ " if actName is None:\n",
260
+ " activation = None\n",
261
+ " elif actName == \"relu\":\n",
262
+ " activation = torch.nn.ReLU()\n",
263
+ " elif actName == \"tanh\":\n",
264
+ " activation = torch.nn.Tanh()\n",
265
+ " elif actName == \"sigmoid\":\n",
266
+ " activation = torch.nn.Sigmoid()\n",
267
+ " elif actName == \"softmax\":\n",
268
+ " activation = torch.nn.Softmax(dim=1)\n",
269
+ " else:\n",
270
+ " exitWithMsg(\"invalid activation function name \" + actName)\n",
271
+ " return activation\n",
272
+ "\n",
273
+ " @staticmethod\n",
274
+ " def createLossFunction(model, lossFnName):\n",
275
+ " \"\"\"\n",
276
+ " create loss function\n",
277
+ "\n",
278
+ " Parameters\n",
279
+ " lossFnName : loss function name\n",
280
+ " \"\"\"\n",
281
+ " config = model.config\n",
282
+ " lossRed = config.getStringConfig(\"train.loss.reduction\")[0]\n",
283
+ " if lossFnName == \"ltwo\" or lossFnName == \"mse\":\n",
284
+ " lossFunc = torch.nn.MSELoss(reduction=lossRed)\n",
285
+ " elif lossFnName == \"ce\":\n",
286
+ " lossFunc = torch.nn.CrossEntropyLoss(reduction=lossRed)\n",
287
+ " elif lossFnName == \"lone\" or lossFnName == \"mae\":\n",
288
+ " lossFunc = torch.nn.L1Loss(reduction=lossRed)\n",
289
+ " elif lossFnName == \"bce\":\n",
290
+ " lossFunc = torch.nn.BCELoss(reduction=lossRed)\n",
291
+ " elif lossFnName == \"bcel\":\n",
292
+ " lossFunc = torch.nn.BCEWithLogitsLoss(reduction=lossRed)\n",
293
+ " elif lossFnName == \"sm\":\n",
294
+ " lossFunc = torch.nn.SoftMarginLoss(reduction=lossRed)\n",
295
+ " elif lossFnName == \"mlsm\":\n",
296
+ " lossFunc = torch.nn.MultiLabelSoftMarginLoss(reduction=lossRed)\n",
297
+ " else:\n",
298
+ " exitWithMsg(\"invalid loss function name \" + lossFnName)\n",
299
+ " return lossFunc\n",
300
+ "\n",
301
+ " @staticmethod\n",
302
+ " def createOptimizer(model, optName):\n",
303
+ " \"\"\"\n",
304
+ " create optimizer\n",
305
+ "\n",
306
+ " Parameters\n",
307
+ " optName : optimizer name\n",
308
+ " \"\"\"\n",
309
+ " config = model.config\n",
310
+ " learnRate = config.getFloatConfig(\"train.opt.learning.rate\")[0]\n",
311
+ " weightDecay = config.getFloatConfig(\"train.opt.weight.decay\")[0]\n",
312
+ " momentum = config.getFloatConfig(\"train.opt.momentum\")[0]\n",
313
+ " eps = config.getFloatConfig(\"train.opt.eps\")[0]\n",
314
+ " if optName == \"sgd\":\n",
315
+ " dampening = config.getFloatConfig(\"train.opt.dampening\")[0]\n",
316
+ " momentumNesterov = config.getBooleanConfig(\"train.opt.momentum.nesterov\")[0]\n",
317
+ " optimizer = torch.optim.SGD(model.parameters(),lr=learnRate, momentum=momentum, \n",
318
+ " dampening=dampening, weight_decay=weightDecay, nesterov=momentumNesterov)\n",
319
+ " elif optName == \"adam\":\n",
320
+ " betas = config.getFloatListConfig(\"train.opt.betas\")[0]\n",
321
+ " betas = (betas[0], betas[1]) \n",
322
+ " optimizer = torch.optim.Adam(model.parameters(), lr=learnRate,betas=betas, eps = eps,\n",
323
+ " weight_decay=weightDecay)\n",
324
+ " elif optName == \"rmsprop\":\n",
325
+ " alpha = config.getFloatConfig(\"train.opt.alpha\")[0]\n",
326
+ " optimizer = torch.optim.RMSprop(model.parameters(), lr=learnRate, alpha=alpha,\n",
327
+ " eps=eps, weight_decay=weightDecay, momentum=momentum)\n",
328
+ " else:\n",
329
+ " exitWithMsg(\"invalid optimizer name \" + optName)\n",
330
+ " return optimizer\n",
331
+ "\n",
332
+ "\n",
333
+ " def forward(self, x):\n",
334
+ " \"\"\"\n",
335
+ " In the forward function we accept a Tensor of input data and we must return\n",
336
+ " a Tensor of output data. We can use Modules defined in the constructor as\n",
337
+ " well as arbitrary (differentiable) operations on Tensors.\n",
338
+ "\n",
339
+ " Parameters\n",
340
+ " x : data batch\n",
341
+ " \"\"\"\n",
342
+ " y = self.layers(x)\t\n",
343
+ " return y\n",
344
+ "\n",
345
+ " @staticmethod\n",
346
+ " def addForwardHook(model, l, cl = 0):\n",
347
+ " \"\"\"\n",
348
+ " register forward hooks\n",
349
+ "\n",
350
+ " Parameters\n",
351
+ " l : \n",
352
+ " cl :\n",
353
+ " \"\"\"\n",
354
+ " for name, layer in model._modules.items():\n",
355
+ " #If it is a sequential, don't register a hook on it\n",
356
+ " # but recursively register hook on all it's module children\n",
357
+ " print(str(cl) + \" : \" + name)\n",
358
+ " if isinstance(layer, torch.nn.Sequential):\n",
359
+ " FeedForwardNetwork.addForwardHook(layer, l, cl)\n",
360
+ " else:\n",
361
+ " #\t it's a non sequential. Register a hook\n",
362
+ " if cl == l:\n",
363
+ " print(\"setting hook at layer \" + str(l))\n",
364
+ " layer.register_forward_hook(hookFn)\n",
365
+ " cl += 1\n",
366
+ "\n",
367
+ " @staticmethod\n",
368
+ " def prepData(model, dataSource, includeOutFld=True):\n",
369
+ " \"\"\"\n",
370
+ " loads and prepares data\n",
371
+ "\n",
372
+ " Parameters\n",
373
+ " dataSource : data source str if file path or 2D array\n",
374
+ " includeOutFld : True if target freld to be included\n",
375
+ " \"\"\"\n",
376
+ " # parameters\n",
377
+ " fieldIndices = model.config.getIntListConfig(\"train.data.fields\")[0]\n",
378
+ " featFieldIndices = model.config.getIntListConfig(\"train.data.feature.fields\")[0]\n",
379
+ "\n",
380
+ " #all data and feature data\n",
381
+ " isDataFile = isinstance(dataSource, str)\n",
382
+ " selFieldIndices = fieldIndices if includeOutFld else fieldIndices[:-1]\n",
383
+ " if isDataFile: \n",
384
+ " #source file path \n",
385
+ " (data, featData) = loadDataFile(dataSource, \",\", selFieldIndices, featFieldIndices)\n",
386
+ " else:\n",
387
+ " # tabular data\n",
388
+ " data = tableSelFieldsFilter(dataSource, selFieldIndices)\n",
389
+ " featData = tableSelFieldsFilter(data, featFieldIndices)\n",
390
+ " #print(featData)\n",
391
+ " featData = np.array(featData)\n",
392
+ "\n",
393
+ " if (model.config.getStringConfig(\"common.preprocessing\")[0] == \"scale\"):\n",
394
+ " scalingMethod = model.config.getStringConfig(\"common.scaling.method\")[0]\n",
395
+ "\n",
396
+ " #scale only if there are enough rows\n",
397
+ " nrow = featData.shape[0]\n",
398
+ " minrows = model.config.getIntConfig(\"common.scaling.minrows\")[0]\n",
399
+ " if nrow > minrows:\n",
400
+ " #in place scaling\n",
401
+ " featData = scaleData(featData, scalingMethod)\n",
402
+ " else:\n",
403
+ " #use pre computes scaling parameters\n",
404
+ " spFile = model.config.getStringConfig(\"common.scaling.param.file\")[0]\n",
405
+ " if spFile is None:\n",
406
+ " exitWithMsg(\"for small data sets pre computed scaling parameters need to provided\")\n",
407
+ " scParams = restoreObject(spFile)\n",
408
+ " featData = scaleDataWithParams(featData, scalingMethod, scParams)\n",
409
+ " featData = np.array(featData)\n",
410
+ "\n",
411
+ " # target data\n",
412
+ " if includeOutFld:\n",
413
+ " outFieldIndices = model.config.getStringConfig(\"train.data.out.fields\")[0]\n",
414
+ " outFieldIndices = strToIntArray(outFieldIndices, \",\")\n",
415
+ " if isDataFile:\n",
416
+ " outData = data[:,outFieldIndices]\n",
417
+ " else:\n",
418
+ " outData = tableSelFieldsFilter(data, outFieldIndices)\n",
419
+ " outData = np.array(outData)\n",
420
+ " foData = (featData.astype(np.float32), outData.astype(np.float32))\n",
421
+ " else:\n",
422
+ " foData = featData.astype(np.float32)\n",
423
+ " return foData\n",
424
+ "\n",
425
+ " @staticmethod\n",
426
+ " def saveCheckpt(model):\n",
427
+ " \"\"\"\n",
428
+ " checkpoints model\n",
429
+ "\n",
430
+ " Parameters\n",
431
+ " model : torch model\n",
432
+ " \"\"\"\n",
433
+ " print(\"..saving model checkpoint\")\n",
434
+ " modelDirectory = model.config.getStringConfig(\"common.model.directory\")[0]\n",
435
+ " assert os.path.exists(modelDirectory), \"model save directory does not exist\"\n",
436
+ " modelFile = model.config.getStringConfig(\"common.model.file\")[0]\n",
437
+ " filepath = os.path.join(modelDirectory, modelFile)\n",
438
+ " state = {\"state_dict\": model.state_dict(), \"optim_dict\": model.optimizer.state_dict()}\n",
439
+ " torch.save(state, filepath)\n",
440
+ " if model.verbose:\n",
441
+ " print(\"model saved\")\n",
442
+ "\n",
443
+ " @staticmethod\n",
444
+ " def restoreCheckpt(model, loadOpt=False):\n",
445
+ " \"\"\"\n",
446
+ " restored checkpointed model\n",
447
+ "\n",
448
+ " Parameters\n",
449
+ " model : torch model\n",
450
+ " loadOpt : True if optimizer to be loaded\n",
451
+ " \"\"\"\n",
452
+ " if not model.restored:\n",
453
+ " print(\"..restoring model checkpoint\")\n",
454
+ " modelDirectory = model.config.getStringConfig(\"common.model.directory\")[0]\n",
455
+ " modelFile = model.config.getStringConfig(\"common.model.file\")[0]\n",
456
+ " filepath = os.path.join(modelDirectory, modelFile)\n",
457
+ " assert os.path.exists(filepath), \"model save file does not exist\"\n",
458
+ " checkpoint = torch.load(filepath)\n",
459
+ " model.load_state_dict(checkpoint[\"state_dict\"])\n",
460
+ " model.to(model.device)\n",
461
+ " if loadOpt:\n",
462
+ " model.optimizer.load_state_dict(checkpoint[\"optim_dict\"])\n",
463
+ " model.restored = True\n",
464
+ "\n",
465
+ " @staticmethod\n",
466
+ " def processClassifOutput(yPred, config):\n",
467
+ " \"\"\"\n",
468
+ " extracts probability label 1 or label with highest probability\n",
469
+ "\n",
470
+ " Parameters\n",
471
+ " yPred : predicted output\n",
472
+ " config : config object\n",
473
+ " \"\"\"\n",
474
+ " outType = config.getStringConfig(\"predict.output\")[0]\n",
475
+ " if outType == \"prob\":\n",
476
+ " outputSize = config.getIntConfig(\"train.output.size\")[0]\n",
477
+ " if outputSize == 2:\n",
478
+ " #return prob of pos class for binary classifier \n",
479
+ " yPred = yPred[:, 1]\n",
480
+ " else:\n",
481
+ " #return class value and probability for multi classifier \n",
482
+ " yCl = np.argmax(yPred, axis=1)\n",
483
+ " yPred = list(map(lambda y : y[0][y[1]], zip(yPred, yCl)))\n",
484
+ " yPred = zip(yCl, yPred)\n",
485
+ " else:\n",
486
+ " yPred = np.argmax(yPred, axis=1)\n",
487
+ " return yPred\n",
488
+ "\n",
489
+ " @staticmethod\n",
490
+ " def printPrediction(yPred, config, dataSource):\n",
491
+ " \"\"\"\n",
492
+ " prints input feature data and prediction\n",
493
+ "\n",
494
+ " Parameters\n",
495
+ " yPred : predicted output\n",
496
+ " config : config object\n",
497
+ " dataSource : data source str if file path or 2D array\n",
498
+ " \"\"\"\n",
499
+ " #prDataFilePath = config.getStringConfig(\"predict.data.file\")[0]\n",
500
+ " padWidth = config.getIntConfig(\"predict.feat.pad.size\")[0]\n",
501
+ " i = 0\n",
502
+ " if type(dataSource) == str:\n",
503
+ " for rec in fileRecGen(dataSource, \",\"):\n",
504
+ " feat = (\",\".join(rec)).ljust(padWidth, \" \")\n",
505
+ " rec = feat + \"\\t\" + str(yPred[i])\n",
506
+ " print(rec)\n",
507
+ " i += 1\n",
508
+ " else:\n",
509
+ " for rec in dataSource:\n",
510
+ " srec = toStrList(rec, 6)\n",
511
+ " feat = (\",\".join(srec)).ljust(padWidth, \" \")\n",
512
+ " srec = feat + \"\\t\" + str(yPred[i])\n",
513
+ " print(srec)\n",
514
+ " i += 1\n",
515
+ "\n",
516
+ "\n",
517
+ " @staticmethod\n",
518
+ " def allTrain(model):\n",
519
+ " \"\"\"\n",
520
+ " train with all data\n",
521
+ "\n",
522
+ " Parameters\n",
523
+ " model : torch model\n",
524
+ " \"\"\"\n",
525
+ " # train mode\n",
526
+ " model.train()\n",
527
+ " for t in range(model.numIter):\n",
528
+ "\n",
529
+ "\n",
530
+ " # Forward pass: Compute predicted y by passing x to the model\n",
531
+ " yPred = model(model.featData)\n",
532
+ "\n",
533
+ " # Compute and print loss\n",
534
+ " loss = model.lossFn(yPred, model.outData)\n",
535
+ " if model.verbose and t % 50 == 0:\n",
536
+ " print(\"epoch {} loss {:.6f}\".format(t, loss.item()))\n",
537
+ "\n",
538
+ " # Zero gradients, perform a backward pass, and update the weights.\n",
539
+ " model.optimizer.zero_grad()\n",
540
+ " loss.backward()\n",
541
+ " model.optimizer.step() \t\n",
542
+ "\n",
543
+ " #validate\n",
544
+ " model.eval()\n",
545
+ " yPred = model(model.validFeatData)\n",
546
+ " yPred = yPred.data.cpu().numpy()\n",
547
+ " yActual = model.validOutData\n",
548
+ " if model.verbose:\n",
549
+ " result = np.concatenate((yPred, yActual), axis = 1)\n",
550
+ " print(\"predicted actual\")\n",
551
+ " print(result)\n",
552
+ "\n",
553
+ " score = perfMetric(model.accMetric, yActual, yPred)\n",
554
+ " print(formatFloat(3, score, \"perf score\"))\n",
555
+ " return score\n",
556
+ "\n",
+ "    @staticmethod\n",
+ "    def batchTrain(model):\n",
+ "        \"\"\"\n",
+ "        train with batch data\n",
+ "\n",
+ "        Parameters\n",
+ "        model : torch model\n",
+ "        \"\"\"\n",
+ "        model.restored = False\n",
+ "        trainData = TensorDataset(model.featData, model.outData)\n",
+ "        trainDataLoader = DataLoader(dataset=trainData, batch_size=model.batchSize, shuffle=True)\n",
+ "        epochIntv = model.config.getIntConfig(\"train.epoch.intv\")[0]\n",
+ "\n",
+ "        # train mode\n",
+ "        model.train()\n",
+ "\n",
+ "        if model.trackErr:\n",
+ "            trErr = list()\n",
+ "            vaErr = list()\n",
+ "        #epoch\n",
+ "        for t in range(model.numIter):\n",
+ "            #batch\n",
+ "            b = 0\n",
+ "            epochLoss = 0.0\n",
+ "            for xBatch, yBatch in trainDataLoader:\n",
+ "\n",
+ "                # Forward pass: Compute predicted y by passing x to the model\n",
+ "                xBatch, yBatch = xBatch.to(model.device), yBatch.to(model.device)\n",
+ "                yPred = model(xBatch)\n",
+ "\n",
+ "                # Compute and print loss (guard against batchIntv == 0)\n",
+ "                loss = model.lossFn(yPred, yBatch)\n",
+ "                if model.verbose and t % epochIntv == 0 and model.batchIntv > 0 and b % model.batchIntv == 0:\n",
+ "                    print(\"epoch {} batch {} loss {:.6f}\".format(t, b, loss.item()))\n",
+ "\n",
+ "                if model.trackErr and model.batchIntv == 0:\n",
+ "                    epochLoss += loss.item()\n",
+ "\n",
+ "                #error tracking at batch level\n",
+ "                if model.trackErr and model.batchIntv > 0 and b % model.batchIntv == 0:\n",
+ "                    trErr.append(loss.item())\n",
+ "                    vloss = FeedForwardNetwork.evaluateModel(model)\n",
+ "                    vaErr.append(vloss)\n",
+ "\n",
+ "                # Zero gradients, perform a backward pass, and update the weights.\n",
+ "                model.optimizer.zero_grad()\n",
+ "                loss.backward()\n",
+ "                model.optimizer.step()\n",
+ "                b += 1\n",
+ "\n",
+ "            #error tracking at epoch level\n",
+ "            if model.trackErr and model.batchIntv == 0:\n",
+ "                epochLoss /= len(trainDataLoader)\n",
+ "                trErr.append(epochLoss)\n",
+ "                vloss = FeedForwardNetwork.evaluateModel(model)\n",
+ "                vaErr.append(vloss)\n",
+ "\n",
+ "        #validate\n",
+ "        model.eval()\n",
+ "        yPred = model(model.validFeatData)\n",
+ "        yPred = yPred.data.cpu().numpy()\n",
+ "        yActual = model.validOutData\n",
+ "        if model.verbose:\n",
+ "            vsize = yPred.shape[0]\n",
+ "            print(\"\\npredicted \\t\\t actual\")\n",
+ "            for i in range(vsize):\n",
+ "                print(str(yPred[i]) + \"\\t\" + str(yActual[i]))\n",
+ "\n",
+ "        score = perfMetric(model.accMetric, yActual, yPred)\n",
+ "        print(yActual)\n",
+ "        print(yPred)\n",
+ "        print(formatFloat(3, score, \"perf score\"))\n",
+ "\n",
+ "        #save\n",
+ "        modelSave = model.config.getBooleanConfig(\"train.model.save\")[0]\n",
+ "        if modelSave:\n",
+ "            FeedForwardNetwork.saveCheckpt(model)\n",
+ "\n",
+ "        if model.trackErr:\n",
+ "            FeedForwardNetwork.errorPlot(model, trErr, vaErr)\n",
+ "\n",
+ "        if model.config.getBooleanConfig(\"train.print.weights\")[0]:\n",
+ "            print(\"model weights\")\n",
+ "            for param in model.parameters():\n",
+ "                print(param.data)\n",
+ "        return score\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def errorPlot(model, trErr, vaErr):\n",
+ "        \"\"\"\n",
+ "        plot errors\n",
+ "\n",
+ "        Parameters\n",
+ "        model : torch model\n",
+ "        trErr : training error list\n",
+ "        vaErr : validation error list\n",
+ "        \"\"\"\n",
+ "        x = np.arange(len(trErr))\n",
+ "        plt.plot(x, trErr, label=\"training error\")\n",
+ "        plt.plot(x, vaErr, label=\"validation error\")\n",
+ "        plt.xlabel(\"iteration\")\n",
+ "        plt.ylabel(\"error\")\n",
+ "        plt.legend([\"training error\", \"validation error\"], loc='upper left')\n",
+ "        plt.show()\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def modelPredict(model, dataSource = None):\n",
+ "        \"\"\"\n",
+ "        predict\n",
+ "\n",
+ "        Parameters\n",
+ "        model : torch model\n",
+ "        dataSource : data source\n",
+ "        \"\"\"\n",
+ "        #train or restore model\n",
+ "        useSavedModel = model.config.getBooleanConfig(\"predict.use.saved.model\")[0]\n",
+ "        if useSavedModel:\n",
+ "            FeedForwardNetwork.restoreCheckpt(model)\n",
+ "        else:\n",
+ "            FeedForwardNetwork.batchTrain(model)\n",
+ "\n",
+ "        #predict\n",
+ "        if dataSource is None:\n",
+ "            dataSource = model.config.getStringConfig(\"predict.data.file\")[0]\n",
+ "        featData = FeedForwardNetwork.prepData(model, dataSource, False)\n",
+ "        #print(featData)\n",
+ "        featData = torch.from_numpy(featData)\n",
+ "        featData = featData.to(model.device)\n",
+ "\n",
+ "        model.eval()\n",
+ "        yPred = model(featData)\n",
+ "        yPred = yPred.data.cpu().numpy()\n",
+ "        #print(yPred)\n",
+ "\n",
+ "        if model.outputSize >= 2:\n",
+ "            #classification\n",
+ "            yPred = FeedForwardNetwork.processClassifOutput(yPred, model.config)\n",
+ "\n",
+ "        # print prediction\n",
+ "        if model.config.getBooleanConfig(\"predict.print.output\")[0]:\n",
+ "            FeedForwardNetwork.printPrediction(yPred, model.config, dataSource)\n",
+ "\n",
+ "        return yPred\n",
+ "\n",
+ "    def predict(self, dataSource = None):\n",
+ "        \"\"\"\n",
+ "        predict\n",
+ "\n",
+ "        Parameters\n",
+ "        dataSource : data source\n",
+ "        \"\"\"\n",
+ "        return FeedForwardNetwork.modelPredict(self, dataSource)\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def evaluateModel(model):\n",
+ "        \"\"\"\n",
+ "        evaluate model\n",
+ "\n",
+ "        Parameters\n",
+ "        model : torch model\n",
+ "        \"\"\"\n",
+ "        model.eval()\n",
+ "        with torch.no_grad():\n",
+ "            yPred = model(model.validFeatData)\n",
+ "            #yPred = yPred.data.cpu().numpy()\n",
+ "            yActual = model.validOutData\n",
+ "            score = model.lossFn(yPred, yActual).item()\n",
+ "        model.train()\n",
+ "        return score\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def prepValidate(model, dataSource=None):\n",
+ "        \"\"\"\n",
+ "        prepare for validation\n",
+ "\n",
+ "        Parameters\n",
+ "        model : torch model\n",
+ "        dataSource : data source\n",
+ "        \"\"\"\n",
+ "        #train or restore model\n",
+ "        if not model.restored:\n",
+ "            useSavedModel = model.config.getBooleanConfig(\"predict.use.saved.model\")[0]\n",
+ "            if useSavedModel:\n",
+ "                FeedForwardNetwork.restoreCheckpt(model)\n",
+ "            else:\n",
+ "                FeedForwardNetwork.batchTrain(model)\n",
+ "            model.restored = True\n",
+ "\n",
+ "        if dataSource is not None:\n",
+ "            model.setValidationData(dataSource)\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def validateModel(model, retPred=False):\n",
+ "        \"\"\"\n",
+ "        model validation\n",
+ "\n",
+ "        Parameters\n",
+ "        model : torch model\n",
+ "        retPred : if True return prediction\n",
+ "        \"\"\"\n",
+ "        model.eval()\n",
+ "        yPred = model(model.validFeatData)\n",
+ "        yPred = yPred.data.cpu().numpy()\n",
+ "        model.yPred = yPred\n",
+ "        yActual = model.validOutData\n",
+ "        vsize = yPred.shape[0]\n",
+ "        if model.verbose:\n",
+ "            print(\"\\npredicted \\t actual\")\n",
+ "            for i in range(vsize):\n",
+ "                print(\"{:.3f}\\t\\t{:.3f}\".format(yPred[i][0], yActual[i][0]))\n",
+ "\n",
+ "        score = perfMetric(model.accMetric, yActual, yPred)\n",
+ "        print(formatFloat(3, score, \"perf score\"))\n",
+ "\n",
+ "        if retPred:\n",
+ "            y = list(map(lambda i : (yPred[i][0], yActual[i][0]), range(vsize)))\n",
+ "            res = (y, score)\n",
+ "            return res\n",
+ "        else:\n",
+ "            return score"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
lib/txproc.ipynb ADDED
@@ -0,0 +1,1002 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f720c141",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "from random import randint\n",
+ "import random\n",
+ "import time\n",
+ "from datetime import datetime\n",
+ "import re, string, unicodedata\n",
+ "import math\n",
+ "import nltk\n",
+ "import contractions\n",
+ "import inflect\n",
+ "from bs4 import BeautifulSoup\n",
+ "from nltk import word_tokenize, sent_tokenize\n",
+ "from nltk.corpus import stopwords\n",
+ "from nltk.stem.isri import ISRIStemmer\n",
+ "from nltk.stem.porter import PorterStemmer\n",
+ "from nltk.stem.snowball import SnowballStemmer\n",
+ "from nltk.stem import LancasterStemmer, WordNetLemmatizer\n",
+ "from nltk.tag import StanfordNERTagger\n",
+ "from nltk.tokenize import word_tokenize, sent_tokenize\n",
+ "import spacy\n",
+ "import torch\n",
+ "from collections import defaultdict\n",
+ "import pickle\n",
+ "import numpy as np\n",
+ "\n",
+ "sys.path.append(os.path.abspath(\"../lib\"))\n",
+ "from util import *\n",
+ "from mlutil import *\n",
+ "\n",
+ "lcc = [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\", \"k\",\"l\",\"m\",\"n\",\"o\",\n",
+ "\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]\n",
+ "ucc = [\"A\",\"B\",\"C\",\"D\",\"E\",\"F\",\"G\",\"H\",\"I\",\"J\",\"K\",\"L\",\"M\", \"N\",\"O\",\"P\",\"Q\",\"R\",\"S\",\"T\",\"U\",\"V\",\"W\",\"X\",\"Y\",\"Z\"]\n",
+ "dig = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\"]\n",
+ "spc = [\"@\",\"#\",\"$\",\"%\",\"^\",\"&\",\"*\",\"(\",\")\",\"_\",\"+\",\"{\",\"}\",\"[\",\"]\",\"|\",\":\",\"<\",\">\",\"?\",\";\",\",\",\".\"]\n",
+ "\n",
+ "\n",
+ "class TextPreProcessor:\n",
+ "    \"\"\"\n",
+ "    text preprocessor\n",
+ "    \"\"\"\n",
+ "    def __init__(self, stemmer = \"lancaster\", verbose=False):\n",
+ "        self.stemmer = stemmer\n",
+ "        self.verbose = verbose\n",
+ "        self.lemmatizer = WordNetLemmatizer()\n",
+ "\n",
+ "    def stripHtml(self, text):\n",
+ "        soup = BeautifulSoup(text, \"html.parser\")\n",
+ "        return soup.get_text()\n",
+ "\n",
+ "    def removeBetweenSquareBrackets(self, text):\n",
+ "        return re.sub('\[[^]]*\]', '', text)\n",
+ "\n",
+ "    def denoiseText(self, text):\n",
+ "        text = self.stripHtml(text)\n",
+ "        text = self.removeBetweenSquareBrackets(text)\n",
+ "        return text\n",
+ "\n",
+ "    def replaceContractions(self, text):\n",
+ "        \"\"\"Replace contractions in string of text\"\"\"\n",
+ "        return contractions.fix(text)\n",
+ "\n",
+ "    def tokenize(self, text):\n",
+ "        words = nltk.word_tokenize(text)\n",
+ "        return words\n",
+ "\n",
+ "    def removeNonAscii(self, words):\n",
+ "        \"\"\"Remove non-ASCII characters from list of tokenized words\"\"\"\n",
+ "        newWords = []\n",
+ "        for word in words:\n",
+ "            # all tokens are str in Python 3; normalize and drop non ASCII chars\n",
+ "            newWord = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('ascii')\n",
+ "            newWords.append(newWord)\n",
+ "        return newWords\n",
+ "\n",
+ "    def replaceNonAsciiFromText(self, text):\n",
+ "        \"\"\" replaces non ascii with blank \"\"\"\n",
+ "        return ''.join([i if ord(i) < 128 else ' ' for i in text])\n",
+ "\n",
+ "    def removeNonAsciiFromText(self, text):\n",
+ "        \"\"\" removes non ascii \"\"\"\n",
+ "        return ''.join([i if ord(i) < 128 else '' for i in text])\n",
+ "\n",
+ "    def allow(self, words):\n",
+ "        \"\"\" allow only specific characters \"\"\"\n",
+ "        allowed = [word for word in words if re.match('^[A-Za-z0-9\.\,\:\;\!\?\(\)\'\-\$\@\%\\\"]+$', word) is not None]\n",
+ "        return allowed\n",
+ "\n",
+ "    def toLowercase(self, words):\n",
+ "        \"\"\"Convert all characters to lowercase from list of tokenized words\"\"\"\n",
+ "        newWords = [word.lower() for word in words]\n",
+ "        return newWords\n",
+ "\n",
+ "    def removePunctuation(self, words):\n",
+ "        \"\"\"Remove punctuation from list of tokenized words\"\"\"\n",
+ "        newWords = []\n",
+ "        for word in words:\n",
+ "            newWord = re.sub(r'[^\w\s]', '', word)\n",
+ "            if newWord != '':\n",
+ "                newWords.append(newWord)\n",
+ "        return newWords\n",
+ "\n",
+ "    def replaceNumbers(self, words):\n",
+ "        \"\"\"Replace all integer occurrences in list of tokenized words with textual representation\"\"\"\n",
+ "        p = inflect.engine()\n",
+ "        newWords = []\n",
+ "        for word in words:\n",
+ "            if word.isdigit():\n",
+ "                newWord = p.number_to_words(word)\n",
+ "                newWords.append(newWord)\n",
+ "            else:\n",
+ "                newWords.append(word)\n",
+ "        return newWords\n",
+ "\n",
+ "    def removeStopwords(self, words):\n",
+ "        \"\"\"Remove stop words from list of tokenized words\"\"\"\n",
+ "        newWords = []\n",
+ "        for word in words:\n",
+ "            if word not in stopwords.words('english'):\n",
+ "                newWords.append(word)\n",
+ "        return newWords\n",
+ "\n",
+ "    def removeCustomStopwords(self, words, stopWords):\n",
+ "        \"\"\"Remove custom stop words from list of tokenized words\"\"\"\n",
+ "        removed = [word for word in words if word not in stopWords]\n",
+ "        return removed\n",
+ "\n",
+ "    def removeLowFreqWords(self, words, minFreq):\n",
+ "        \"\"\"Remove low frequency words from list of tokenized words\"\"\"\n",
+ "        frequency = defaultdict(int)\n",
+ "        for word in words:\n",
+ "            frequency[word] += 1\n",
+ "        removed = [word for word in words if frequency[word] > minFreq]\n",
+ "        return removed\n",
+ "\n",
+ "    def removeNumbers(self, words):\n",
+ "        \"\"\"Remove numbers\"\"\"\n",
+ "        removed = [word for word in words if not isNumber(word)]\n",
+ "        return removed\n",
+ "\n",
+ "    def removeShortWords(self, words, minLength):\n",
+ "        \"\"\"Remove short words\"\"\"\n",
+ "        removed = [word for word in words if len(word) >= minLength]\n",
+ "        return removed\n",
+ "\n",
+ "    def keepAllowedWords(self, words, keepWords):\n",
+ "        \"\"\"Keep words from the list only\"\"\"\n",
+ "        kept = [word for word in words if word in keepWords]\n",
+ "        return kept\n",
+ "\n",
+ "    def stemWords(self, words):\n",
+ "        \"\"\"Stem words in list of tokenized words\"\"\"\n",
+ "        if self.stemmer == \"lancaster\":\n",
+ "            stemmer = LancasterStemmer()\n",
+ "        elif self.stemmer == \"snowball\":\n",
+ "            stemmer = SnowballStemmer(\"english\")\n",
+ "        elif self.stemmer == \"porter\":\n",
+ "            stemmer = PorterStemmer()\n",
+ "        else:\n",
+ "            raise ValueError(\"invalid stemmer \" + self.stemmer)\n",
+ "        stems = [stemmer.stem(word) for word in words]\n",
+ "        return stems\n",
+ "\n",
+ "    def lemmatizeWords(self, words):\n",
+ "        \"\"\"Lemmatize tokens in list of tokenized words\"\"\"\n",
+ "        lemmas = [self.lemmatizer.lemmatize(word) for word in words]\n",
+ "        return lemmas\n",
+ "\n",
+ "    def lemmatizeVerbs(self, words):\n",
+ "        \"\"\"Lemmatize verbs in list of tokenized words\"\"\"\n",
+ "        lemmas = [self.lemmatizer.lemmatize(word, pos='v') for word in words]\n",
+ "        return lemmas\n",
+ "\n",
+ "    def normalize(self, words):\n",
+ "        words = self.removeNonAscii(words)\n",
+ "        words = self.toLowercase(words)\n",
+ "        words = self.removePunctuation(words)\n",
+ "        words = self.replaceNumbers(words)\n",
+ "        words = self.removeStopwords(words)\n",
+ "        return words\n",
+ "\n",
+ "    def posTag(self, textTokens):\n",
+ "        tags = nltk.pos_tag(textTokens)\n",
+ "        return tags\n",
+ "\n",
+ "    def extractEntity(self, textTokens, classifierPath, jarPath):\n",
+ "        st = StanfordNERTagger(classifierPath, jarPath)\n",
+ "        entities = st.tag(textTokens)\n",
+ "        return entities\n",
+ "\n",
+ "    def documentFeatures(self, document, wordFeatures):\n",
+ "        documentWords = set(document)\n",
+ "        features = {}\n",
+ "        for word in wordFeatures:\n",
+ "            features[word] = (word in documentWords)\n",
+ "        return features\n",
+ "\n",
+ "class NGram:\n",
+ "    \"\"\"\n",
+ "    word ngram\n",
+ "    \"\"\"\n",
+ "    def __init__(self, vocFilt, verbose=False):\n",
+ "        \"\"\"\n",
+ "        initialize\n",
+ "        \"\"\"\n",
+ "        self.vocFilt = vocFilt\n",
+ "        self.nGramCounter = dict()\n",
+ "        self.nGramFreq = dict()\n",
+ "        self.corpSize = 0\n",
+ "        self.vocabulary = set()\n",
+ "        self.freqDone = False\n",
+ "        self.verbose = verbose\n",
+ "        self.vecWords = None\n",
+ "        self.nonZeroCount = 0\n",
+ "\n",
+ "    def countDocNGrams(self, words):\n",
+ "        \"\"\"\n",
+ "        count ngrams in a doc\n",
+ "        \"\"\"\n",
+ "        if self.verbose:\n",
+ "            print (\"doc size \" + str(len(words)))\n",
+ "        nGrams = self.toNGram(words)\n",
+ "        for nGram in nGrams:\n",
+ "            count = self.nGramCounter.get(nGram, 0)\n",
+ "            self.nGramCounter[nGram] = count + 1\n",
+ "            self.corpSize += 1\n",
+ "        self.vocabulary.update(words)\n",
+ "\n",
+ "    def remLowCount(self, minCount):\n",
+ "        \"\"\"\n",
+ "        removes items with count below threshold\n",
+ "        \"\"\"\n",
+ "        self.nGramCounter = dict(filter(lambda item: item[1] >= minCount, self.nGramCounter.items()))\n",
+ "\n",
+ "    def getVocabSize(self):\n",
+ "        \"\"\"\n",
+ "        get vocabulary size\n",
+ "        \"\"\"\n",
+ "        return len(self.nGramCounter)\n",
+ "\n",
+ "    def getNGramFreq(self):\n",
+ "        \"\"\"\n",
+ "        get normalized count\n",
+ "        \"\"\"\n",
+ "        if self.verbose:\n",
+ "            print (\"counter size \" + str(len(self.nGramCounter)))\n",
+ "        if not self.freqDone:\n",
+ "            for item in self.nGramCounter.items():\n",
+ "                self.nGramFreq[item[0]] = float(item[1]) / self.corpSize\n",
+ "            self.freqDone = True\n",
+ "        return self.nGramFreq\n",
+ "\n",
+ "    def getNGramIndex(self, show):\n",
+ "        \"\"\"\n",
+ "        materialize ngram list and optionally show it\n",
+ "        \"\"\"\n",
+ "        if self.vecWords is None:\n",
+ "            self.vecWords = list(self.nGramCounter)\n",
+ "        if show:\n",
+ "            for vw in enumerate(self.vecWords):\n",
+ "                print(vw)\n",
+ "\n",
+ "    def getVector(self, words, byCount, normalized):\n",
+ "        \"\"\"\n",
+ "        convert to vector\n",
+ "        \"\"\"\n",
+ "        if self.vecWords is None:\n",
+ "            self.vecWords = list(self.nGramCounter)\n",
+ "\n",
+ "        nGrams = self.toNGram(words)\n",
+ "        if self.verbose:\n",
+ "            print(\"vocabulary size {}\".format(len(self.vecWords)))\n",
+ "            print(\"ngrams\")\n",
+ "            print(nGrams)\n",
+ "        self.nonZeroCount = 0\n",
+ "        vec = list(map(lambda vw: self.getVecElem(vw, nGrams, byCount, normalized), self.vecWords))\n",
+ "        return vec\n",
+ "\n",
+ "    def getVecElem(self, vw, nGrams, byCount, normalized):\n",
+ "        \"\"\"\n",
+ "        get vector element\n",
+ "        \"\"\"\n",
+ "        if vw in nGrams:\n",
+ "            if byCount:\n",
+ "                if normalized:\n",
+ "                    el = self.nGramFreq[vw]\n",
+ "                else:\n",
+ "                    el = self.nGramCounter[vw]\n",
+ "            else:\n",
+ "                el = 1\n",
+ "            self.nonZeroCount += 1\n",
+ "        else:\n",
+ "            if (byCount and normalized):\n",
+ "                el = 0.0\n",
+ "            else:\n",
+ "                el = 0\n",
+ "        return el\n",
+ "\n",
+ "    def getNonZeroCount(self):\n",
+ "        \"\"\"\n",
+ "        get non zero vector element count\n",
+ "        \"\"\"\n",
+ "        return self.nonZeroCount\n",
+ "\n",
+ "    def toBiGram(self, words):\n",
+ "        \"\"\"\n",
+ "        convert to bigram\n",
+ "        \"\"\"\n",
+ "        if self.verbose:\n",
+ "            print (\"doc size \" + str(len(words)))\n",
+ "        biGrams = list()\n",
+ "        for i in range(len(words)-1):\n",
+ "            w1 = words[i]\n",
+ "            w2 = words[i+1]\n",
+ "            if self.vocFilt is None or (w1 in self.vocFilt and w2 in self.vocFilt):\n",
+ "                nGram = (w1, w2)\n",
+ "                biGrams.append(nGram)\n",
+ "        return biGrams\n",
+ "\n",
+ "    def toTriGram(self, words):\n",
+ "        \"\"\"\n",
+ "        convert to trigram\n",
+ "        \"\"\"\n",
+ "        if self.verbose:\n",
+ "            print (\"doc size \" + str(len(words)))\n",
+ "        triGrams = list()\n",
+ "        for i in range(len(words)-2):\n",
+ "            w1 = words[i]\n",
+ "            w2 = words[i+1]\n",
+ "            w3 = words[i+2]\n",
+ "            if self.vocFilt is None or (w1 in self.vocFilt and w2 in self.vocFilt and w3 in self.vocFilt):\n",
+ "                nGram = (w1, w2, w3)\n",
+ "                triGrams.append(nGram)\n",
+ "        return triGrams\n",
+ "\n",
+ "    def save(self, saveFile):\n",
+ "        \"\"\"\n",
+ "        save\n",
+ "        \"\"\"\n",
+ "        sf = open(saveFile, \"wb\")\n",
+ "        pickle.dump(self, sf)\n",
+ "        sf.close()\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def load(saveFile):\n",
+ "        \"\"\"\n",
+ "        load\n",
+ "        \"\"\"\n",
+ "        sf = open(saveFile, \"rb\")\n",
+ "        nGrams = pickle.load(sf)\n",
+ "        sf.close()\n",
+ "        return nGrams\n",
+ "\n",
+ "class CharNGram:\n",
+ "    \"\"\"\n",
+ "    character n gram\n",
+ "    \"\"\"\n",
+ "    def __init__(self, domains, ngsize, verbose=False):\n",
+ "        \"\"\"\n",
+ "        initialize\n",
+ "        \"\"\"\n",
+ "        self.chDomain = list()\n",
+ "        self.ws = \"#\"\n",
+ "        self.chDomain.append(self.ws)\n",
+ "        for d in domains:\n",
+ "            if d == \"lcc\":\n",
+ "                self.chDomain.extend(lcc)\n",
+ "            elif d == \"ucc\":\n",
+ "                self.chDomain.extend(ucc)\n",
+ "            elif d == \"dig\":\n",
+ "                self.chDomain.extend(dig)\n",
+ "            elif d == \"spc\":\n",
+ "                self.chDomain.extend(spc)\n",
+ "            else:\n",
+ "                raise ValueError(\"invalid character type \" + d)\n",
+ "\n",
+ "        self.ngsize = ngsize\n",
+ "        self.radixPow = None\n",
+ "        self.cntVecSize = None\n",
+ "\n",
+ "    def addSpChar(self, spChar):\n",
+ "        \"\"\"\n",
+ "        add special characters\n",
+ "        \"\"\"\n",
+ "        self.chDomain.extend(spChar)\n",
+ "\n",
+ "    def setWsRepl(self, ws):\n",
+ "        \"\"\"\n",
+ "        set white space replacement character\n",
+ "        \"\"\"\n",
+ "        self.ws = ws\n",
+ "        self.chDomain[0] = self.ws\n",
+ "\n",
+ "    def finalize(self):\n",
+ "        \"\"\"\n",
+ "        final setup\n",
+ "        \"\"\"\n",
+ "        domSize = len(self.chDomain)\n",
+ "        self.cntVecSize = int(math.pow(domSize, self.ngsize))\n",
+ "        if self.radixPow is None:\n",
+ "            self.radixPow = list()\n",
+ "            for i in range(self.ngsize-1, 0, -1):\n",
+ "                self.radixPow.append(int(math.pow(domSize, i)))\n",
+ "            self.radixPow.append(1)\n",
+ "\n",
+ "\n",
+ "    def toMgramCount(self, text):\n",
+ "        \"\"\"\n",
+ "        get ngram count list\n",
+ "        \"\"\"\n",
+ "        #print(text)\n",
+ "        ngCounts = [0] * self.cntVecSize\n",
+ "\n",
+ "        ngram = list()\n",
+ "        totNgCount = 0\n",
+ "        for ch in text:\n",
+ "            if ch.isspace():\n",
+ "                l = len(ngram)\n",
+ "                if l == 0 or ngram[l-1] != self.ws:\n",
+ "                    ngram.append(self.ws)\n",
+ "            else:\n",
+ "                ngram.append(ch)\n",
+ "\n",
+ "            if len(ngram) == self.ngsize:\n",
+ "                i = self.__getNgramIndex(ngram)\n",
+ "                assert i < self.cntVecSize, \"ngram index out of range index \" + str(i) + \" size \" + str(self.cntVecSize)\n",
+ "                ngCounts[i] += 1\n",
+ "                ngram.clear()\n",
+ "                totNgCount += 1\n",
+ "\n",
+ "        return ngCounts\n",
+ "\n",
+ "    def __getNgramIndex(self, ngram):\n",
+ "        \"\"\"\n",
+ "        get index of an ngram into a list of size equal to the total number of possible ngrams\n",
+ "        \"\"\"\n",
+ "        assert len(ngram) == len(self.radixPow), \"ngram size mismatch\"\n",
+ "        ngi = 0\n",
+ "        for ch, rp in zip(ngram, self.radixPow):\n",
+ "            i = self.chDomain.index(ch)\n",
+ "            ngi += i * rp\n",
+ "\n",
+ "        return ngi\n",
+ "\n",
+ "\n",
+ "class TfIdf:\n",
+ "    \"\"\"\n",
+ "    TF IDF\n",
+ "    \"\"\"\n",
+ "    def __init__(self, vocFilt, doIdf, verbose=False):\n",
+ "        \"\"\"\n",
+ "        initialize\n",
+ "        \"\"\"\n",
+ "        self.vocFilt = vocFilt\n",
+ "        self.doIdf = doIdf\n",
+ "        self.wordCounter = {}\n",
+ "        self.wordFreq = {}\n",
+ "        self.wordInDocCount = {}\n",
+ "        self.docCount = 0\n",
+ "        self.corpSize = 0\n",
+ "        self.freqDone = False\n",
+ "        self.vocabulary = set()\n",
+ "        self.wordIndex = None\n",
+ "        self.verbose = verbose\n",
+ "        self.vecWords = None\n",
+ "\n",
+ "    def countDocWords(self, words):\n",
+ "        \"\"\"\n",
+ "        count words in a doc\n",
+ "        \"\"\"\n",
+ "        if self.verbose:\n",
+ "            print (\"doc size \" + str(len(words)))\n",
+ "        for word in words:\n",
+ "            if self.vocFilt is None or word in self.vocFilt:\n",
+ "                count = self.wordCounter.get(word, 0)\n",
+ "                self.wordCounter[word] = count + 1\n",
+ "        self.corpSize += len(words)\n",
+ "        self.vocabulary.update(words)\n",
+ "\n",
+ "        if (self.doIdf):\n",
+ "            self.docCount += 1\n",
+ "            for word in set(words):\n",
+ "                count = self.wordInDocCount.get(word, 0)\n",
+ "                self.wordInDocCount[word] = count + 1\n",
+ "        self.freqDone = False\n",
+ "\n",
+ "\n",
+ "    def getWordFreq(self):\n",
+ "        \"\"\"\n",
+ "        get tfidf for corpus\n",
+ "        \"\"\"\n",
+ "        if self.verbose:\n",
+ "            print (\"counter size \" + str(len(self.wordCounter)))\n",
+ "        if not self.freqDone:\n",
+ "            for item in self.wordCounter.items():\n",
+ "                self.wordFreq[item[0]] = float(item[1]) / self.corpSize\n",
+ "            if self.doIdf:\n",
+ "                for k in self.wordFreq.keys():\n",
+ "                    self.wordFreq[k] *= math.log(self.docCount / self.wordInDocCount[k])\n",
+ "            self.freqDone = True\n",
+ "        return self.wordFreq\n",
+ "\n",
+ "    def getCount(self, word):\n",
+ "        \"\"\"\n",
+ "        get count for a word\n",
+ "        \"\"\"\n",
+ "        if word in self.wordCounter:\n",
+ "            count = self.wordCounter[word]\n",
+ "        else:\n",
+ "            raise ValueError(\"word not found in count table \" + word)\n",
+ "        return count\n",
+ "\n",
+ "    def getFreq(self, word):\n",
+ "        \"\"\"\n",
+ "        get normalized frequency\n",
+ "        \"\"\"\n",
+ "        if word in self.wordFreq:\n",
+ "            freq = self.wordFreq[word]\n",
+ "        else:\n",
+ "            raise ValueError(\"word not found in frequency table \" + word)\n",
+ "        return freq\n",
+ "\n",
+ "    def resetCounter(self):\n",
+ "        \"\"\"\n",
+ "        reset counter\n",
+ "        \"\"\"\n",
+ "        self.wordCounter = {}\n",
+ "\n",
+ "    def buildVocabulary(self, words):\n",
+ "        \"\"\"\n",
+ "        build vocabulary\n",
+ "        \"\"\"\n",
+ "        self.vocabulary.update(words)\n",
+ "\n",
+ "    def getVocabulary(self):\n",
+ "        \"\"\"\n",
+ "        return vocabulary\n",
+ "        \"\"\"\n",
+ "        return self.vocabulary\n",
+ "\n",
+ "    def creatWordIndex(self):\n",
+ "        \"\"\"\n",
+ "        index for all words in vocabulary\n",
+ "        \"\"\"\n",
+ "        self.wordIndex = {word : idx for idx, word in enumerate(list(self.vocabulary))}\n",
+ "\n",
+ "    def getVector(self, words, byCount, normalized):\n",
+ "        \"\"\"\n",
+ "        get vector\n",
+ "        \"\"\"\n",
+ "        if self.vecWords is None:\n",
+ "            self.vecWords = list(self.wordCounter)\n",
+ "        vec = list(map(lambda vw: self.getVecElem(vw, words, byCount, normalized), self.vecWords))\n",
+ "        return vec\n",
+ "\n",
+ "    def getVecElem(self, vw, words, byCount, normalized):\n",
+ "        \"\"\"\n",
+ "        vector element\n",
+ "        \"\"\"\n",
+ "        el = 0\n",
+ "        if vw in words:\n",
+ "            if byCount:\n",
+ "                if normalized:\n",
+ "                    el = self.wordFreq[vw]\n",
+ "                else:\n",
+ "                    el = self.wordCounter[vw]\n",
+ "            else:\n",
+ "                el = 1\n",
+ "        return el\n",
+ "\n",
+ "    def save(self, saveFile):\n",
+ "        \"\"\"\n",
+ "        save\n",
+ "        \"\"\"\n",
+ "        sf = open(saveFile, \"wb\")\n",
+ "        pickle.dump(self, sf)\n",
+ "        sf.close()\n",
+ "\n",
+ "    # load\n",
+ "    @staticmethod\n",
+ "    def load(saveFile):\n",
+ "        \"\"\"\n",
+ "        load\n",
+ "        \"\"\"\n",
+ "        sf = open(saveFile, \"rb\")\n",
+ "        tfidf = pickle.load(sf)\n",
+ "        sf.close()\n",
+ "        return tfidf\n",
+ "\n",
+ "# bigram\n",
+ "class BiGram(NGram):\n",
+ "    def __init__(self, vocFilt, verbose=False):\n",
+ "        \"\"\"\n",
+ "        initialize\n",
+ "        \"\"\"\n",
+ "        super(BiGram, self).__init__(vocFilt, verbose)\n",
+ "\n",
+ "    def toNGram(self, words):\n",
+ "        \"\"\"\n",
+ "        convert to ngrams\n",
+ "        \"\"\"\n",
+ "        return self.toBiGram(words)\n",
+ "\n",
+ "# trigram\n",
+ "class TriGram(NGram):\n",
+ "    def __init__(self, vocFilt, verbose=False):\n",
+ "        \"\"\"\n",
+ "        initialize\n",
+ "        \"\"\"\n",
+ "        super(TriGram, self).__init__(vocFilt, verbose)\n",
+ "\n",
+ "    def toNGram(self, words):\n",
+ "        \"\"\"\n",
+ "        convert to ngrams\n",
+ "        \"\"\"\n",
+ "        return self.toTriGram(words)\n",
+ "\n",
+ "\n",
+ "\n",
+ "class DocSentences:\n",
+ "    \"\"\"\n",
+ "    sentence processor\n",
+ "    \"\"\"\n",
+ "    def __init__(self, filePath, minLength, verbose, text=None):\n",
+ "        \"\"\"\n",
+ "        initialize\n",
+ "        \"\"\"\n",
+ "        if filePath:\n",
+ "            self.filePath = filePath\n",
+ "            with open(filePath, 'r') as contentFile:\n",
+ "                content = contentFile.read()\n",
+ "        elif text:\n",
+ "            content = text\n",
+ "        else:\n",
+ "            raise ValueError(\"either file path or text must be provided\")\n",
+ "\n",
+ "        #self.sentences = content.split('.')\n",
+ "        self.verbose = verbose\n",
+ "        tp = TextPreProcessor()\n",
+ "        content = tp.removeNonAsciiFromText(content)\n",
+ "        sentences = sent_tokenize(content)\n",
+ "        self.sentences = list(filter(lambda s: len(nltk.word_tokenize(s)) >= minLength, sentences))\n",
+ "        if self.verbose:\n",
+ "            print (\"num of sentences after length filter \" + str(len(self.sentences)))\n",
+ "        self.sentencesAsTokens = [clean(s, tp, verbose) for s in self.sentences]\n",
+ "\n",
+ "    # get sentence tokens\n",
+ "    def getSentencesAsTokens(self):\n",
+ "        return self.sentencesAsTokens\n",
+ "\n",
+ "    # get sentences\n",
+ "    def getSentences(self):\n",
+ "        return self.sentences\n",
+ "\n",
+ "    # build term freq table\n",
+ "    def getTermFreqTable(self):\n",
+ "        # term count table for all words\n",
+ "        termTable = TfIdf(None, False)\n",
+ "        sentWords = self.getSentencesAsTokens()\n",
+ "        for seWords in sentWords:\n",
+ "            termTable.countDocWords(seWords)\n",
+ "        return termTable\n",
+ "\n",
+ "# word vector container\n",
+ "class WordVectorContainer:\n",
+ "    def __init__(self, dirPath, verbose):\n",
+ "        \"\"\"\n",
+ "        initialize\n",
+ "        \"\"\"\n",
+ "        self.docs = list()\n",
+ "        self.wordVectors = list()\n",
+ "        self.numWordVectors = None\n",
+ "        self.verbose = verbose\n",
+ "        self.tp = TextPreProcessor()\n",
+ "        self.similarityAlgo = \"cosine\"\n",
+ "        self.simAlgoNormalizer = None\n",
+ "        self.termTable = None\n",
+ "        # assumed intent: a dir path passed to the constructor gets loaded\n",
+ "        if dirPath is not None:\n",
+ "            self.addDir(dirPath)\n",
+ "\n",
+ "\n",
+ "    def addDir(self, dirPath):\n",
+ "        \"\"\"\n",
+ "        add content of all files in a directory\n",
+ "        \"\"\"\n",
+ "        docs, filePaths = getFileContent(dirPath, self.verbose)\n",
+ "        self.docs.extend(docs)\n",
+ "        self.wordVectors.extend([clean(doc, self.tp, self.verbose) for doc in docs])\n",
+ "\n",
+ "    def addFile(self, filePath):\n",
+ "        \"\"\"\n",
+ "        add file content\n",
+ "        \"\"\"\n",
+ "        with open(filePath, 'r') as contentFile:\n",
+ "            content = contentFile.read()\n",
+ "        self.wordVectors.append(clean(content, self.tp, self.verbose))\n",
+ "\n",
+ "    def addText(self, text):\n",
+ "        \"\"\"\n",
+ "        add text\n",
+ "        \"\"\"\n",
+ "        self.wordVectors.append(clean(text, self.tp, self.verbose))\n",
+ "\n",
+ "    def addWords(self, words):\n",
+ "        \"\"\"\n",
+ "        add words\n",
+ "        \"\"\"\n",
+ "        self.wordVectors.append(words)\n",
+ "\n",
+ "    def withSimilarityAlgo(self, algo, normalizer=None):\n",
+ "        \"\"\"\n",
+ "        set similarity algo\n",
+ "        \"\"\"\n",
+ "        self.similarityAlgo = algo\n",
+ "        self.simAlgoNormalizer = normalizer\n",
+ "\n",
+ "    def getDocsWords(self):\n",
+ "        \"\"\"\n",
+ "        get word vectors\n",
+ "        \"\"\"\n",
+ "        return self.wordVectors\n",
+ "\n",
+ "    def getDocs(self):\n",
+ "        \"\"\"\n",
+ "        get docs\n",
+ "        \"\"\"\n",
+ "        return self.docs\n",
+ "\n",
+ "    def getTermFreqTable(self):\n",
+ "        \"\"\"\n",
+ "        term count table for all words\n",
+ "        \"\"\"\n",
+ "        self.termTable = TfIdf(None, False)\n",
+ "        for words in self.wordVectors:\n",
+ "            self.termTable.countDocWords(words)\n",
+ "        self.termTable.getWordFreq()\n",
+ "        return self.termTable\n",
+ "\n",
+ "    def getPairWiseSimilarity(self, byCount, normalized):\n",
+ "        \"\"\"\n",
+ "        pair wise similarity\n",
+ "        \"\"\"\n",
+ "        self.getNumWordVectors(byCount, normalized)\n",
+ "\n",
+ "        size = len(self.wordVectors)\n",
+ "        simArray = np.empty(shape=(size,size))\n",
+ "        for i in range(size):\n",
+ "            simArray[i][i] = 1.0\n",
+ "\n",
+ "        for i in range(size):\n",
+ "            for j in range(i+1, size):\n",
+ "                if self.similarityAlgo == \"cosine\":\n",
+ "                    sim = cosineSimilarity(self.numWordVectors[i], self.numWordVectors[j])\n",
+ "                elif self.similarityAlgo == \"jaccard\":\n",
+ "                    sim = jaccardSimilarity(self.wordVectors[i], self.wordVectors[j],\\\n",
+ "                        self.simAlgoNormalizer[0], self.simAlgoNormalizer[1])\n",
+ "                else:\n",
+ "                    raise ValueError(\"invalid similarity algorithm\")\n",
+ "                simArray[i][j] = sim\n",
+ "                simArray[j][i] = sim\n",
+ "        return simArray\n",
+ "\n",
+ "    def getInterSetSimilarity(self, byCount, normalized, split):\n",
+ "        \"\"\"\n",
+ "        inter set pair wise similarity\n",
+ "        \"\"\"\n",
+ "        self.getNumWordVectors(byCount, normalized)\n",
+ "        size = len(self.wordVectors)\n",
+ "        if not self.similarityAlgo == \"jaccard\":\n",
+ "            firstNumVec = self.numWordVectors[:split]\n",
+ "            secNumVec = self.numWordVectors[split:]\n",
+ "            fiSize = len(firstNumVec)\n",
+ "            seSize = len(secNumVec)\n",
+ "        else:\n",
+ "            firstVec = self.wordVectors[:split]\n",
+ "            secVec = self.wordVectors[split:]\n",
+ "            fiSize = len(firstVec)\n",
+ "            seSize = len(secVec)\n",
+ "\n",
+ "        simArray = np.empty(shape=(fiSize,seSize))\n",
+ "        for i in range(fiSize):\n",
+ "            for j in range(seSize):\n",
+ "                if self.similarityAlgo == \"cosine\":\n",
+ "                    sim = cosineSimilarity(firstNumVec[i], secNumVec[j])\n",
+ "                elif self.similarityAlgo == \"jaccard\":\n",
+ "                    sim = jaccardSimilarity(firstVec[i], secVec[j],\\\n",
+ "                        self.simAlgoNormalizer[0], self.simAlgoNormalizer[1])\n",
+ "                else:\n",
+ "                    raise ValueError(\"invalid similarity algorithm\")\n",
+ "                simArray[i][j] = sim\n",
+ "        return simArray\n",
+ "\n",
+ "    def getNumWordVectors(self, byCount, normalized):\n",
+ "        \"\"\"\n",
+ "        build numeric vectors if needed\n",
+ "        \"\"\"\n",
+ "        if not self.similarityAlgo == \"jaccard\":\n",
+ "            if self.numWordVectors is None:\n",
+ "                self.numWordVectors = list(map(lambda wv: self.termTable.getVector(wv, byCount, normalized), self.wordVectors))\n",
+ "\n",
+ "# fragments documents into whole doc, paragraph or passages\n",
+ "class TextFragmentGenerator:\n",
+ "    def __init__(self, level, minParNl, passSize, verbose=False):\n",
+ "        \"\"\"\n",
+ "        initialize\n",
+ "        \"\"\"\n",
+ "        self.level = level\n",
+ "        self.minParNl = minParNl\n",
+ "        self.passSize = passSize\n",
+ "        self.fragments = None\n",
+ "        self.verbose = verbose\n",
+ "\n",
+ "    def loadDocs(self, fpaths):\n",
+ "        \"\"\"\n",
+ "        loads documents from one file, multiple files or all files under a directory\n",
+ "        \"\"\"\n",
+ "        fPaths = fpaths.split(\",\")\n",
+ "        if len(fPaths) == 1:\n",
+ "            if os.path.isfile(fPaths[0]):\n",
+ "                #one file\n",
+ "                if self.verbose:\n",
+ "                    print(\"got one file from path\")\n",
+ "                dnames = fPaths\n",
+ "                docStr = getOneFileContent(fPaths[0])\n",
+ "                dtexts = [docStr]\n",
+ "            else:\n",
+ "                #all files under directory\n",
+ "                if self.verbose:\n",
+ "                    print(\"got all files under directory from path\")\n",
+ "                dtexts, dnames = getFileContent(fPaths[0])\n",
+ "                if self.verbose:\n",
+ "                    print(\"found {} files\".format(len(dtexts)))\n",
+ "        else:\n",
+ "            #list of files\n",
+ "            if self.verbose:\n",
+ "                print(\"got list of files from path\")\n",
+ "            dnames = fPaths\n",
+ "            dtexts = list(map(getOneFileContent, fPaths))\n",
+ "            if self.verbose:\n",
+ "                print(\"found {} files\".format(len(dtexts)))\n",
+ "\n",
+ "        ndocs = (dtexts, dnames)\n",
+ "        if self.verbose:\n",
+ "            print(\"docs\")\n",
+ "            for dn, dt in zip(dnames, dtexts):\n",
+ "                print(dn + \"\\t\" + dt[:40])\n",
+ "\n",
+ "        return ndocs\n",
+ "\n",
+ "    def generateFragmentsFromFiles(self, fpaths):\n",
+ "        \"\"\"\n",
+ "        fragments documents into whole doc, paragraph or passages\n",
+ "        \"\"\"\n",
+ "        dtexts, dnames = self.loadDocs(fpaths)\n",
+ "        return self.generateFragments(dtexts, dnames)\n",
+ "\n",
+ "\n",
+ "    def generateFragmentsFromNamedDocs(self, ndocs):\n",
+ "        \"\"\"\n",
+ "        fragments documents into whole doc, paragraph or passages\n",
+ "        \"\"\"\n",
+ "        dtexts = list(map(lambda nd : nd[1], ndocs))\n",
+ "        dnames = list(map(lambda nd : nd[0], ndocs))\n",
+ "        #for i in range(len(dtexts)):\n",
+ "        #\tprint(dnames[i])\n",
+ "        #\tprint(dtexts[i][:40])\n",
+ "        return self.generateFragments(dtexts, dnames)\n",
+ "\n",
+ "    def generateFragments(self, dtexts, dnames):\n",
+ "        \"\"\"\n",
+ "        fragments documents into whole doc, paragraph or passages\n",
+ "        \"\"\"\n",
+ "        if self.level == \"para\" or self.level == \"passage\":\n",
+ "            #split paras\n",
+ "            dptexts = list()\n",
+ "            dpnames = list()\n",
+ "            for dt, dn in zip(dtexts, dnames):\n",
+ "                paras = getParas(dt, self.minParNl)\n",
+ "                if self.verbose:\n",
+ "                    print(dn)\n",
+ "                    print(\"no of paras {}\".format(len(paras)))\n",
+ "                dptexts.extend(paras)\n",
+ "                pnames = list(map(lambda i : dn + \":\" + str(i), range(len(paras))))\n",
+ "                dpnames.extend(pnames)\n",
+ "            dtexts = dptexts\n",
+ "            dnames = dpnames\n",
+ "\n",
+ "        if self.level == \"passage\":\n",
+ "            #split each para into passages\n",
+ "            dptexts = list()\n",
+ "            dpnames = list()\n",
+ "            for dt, dn in zip(dtexts, dnames):\n",
+ "                sents = sent_tokenize(dt.strip())\n",
+ "                if self.verbose:\n",
+ "                    print(dn)\n",
+ "                    print(\"no of sentences {}\".format(len(sents)))\n",
+ "                span = self.passSize\n",
+ "                if len(sents) > span:\n",
+ "                    for i in range(0, len(sents) - span, 1):\n",
+ "                        dptext = None\n",
+ "                        for j in range(span):\n",
+ "                            if dptext is None:\n",
+ "                                dptext = sents[i + j] + \". \"\n",
+ "                            else:\n",
+ "                                dptext = dptext + sents[i + j] + \". \"\n",
+ "                        dpname = dn + \":\" + str(i)\n",
+ "                        dptexts.append(dptext)\n",
+ "                        dpnames.append(dpname)\n",
+ "\n",
+ "            dtexts = dptexts\n",
+ "            dnames = dpnames\n",
+ "\n",
+ "        self.fragments = list(zip(dnames, dtexts))\n",
+ "        #if self.verbose:\n",
+ "        #\tprint(\"num fragments {}\".format(len(self.fragments)))\n",
+ "        return self.fragments\n",
+ "\n",
+ "    def showFragments(self):\n",
+ "        \"\"\"\n",
+ "        show fragments\n",
+ "        \"\"\"\n",
+ "        print(\"showing all \" + self.level + \" for the first 40 characters\")\n",
+ "        for dn, dt in self.fragments:\n",
+ "            print(dn + \"\\t\" + dt[:40])\n",
+ "\n",
+ "    def isDocLevel(self):\n",
+ "        \"\"\"\n",
+ "        true if fragment is at doc level\n",
+ "        \"\"\"\n",
+ "        return self.level != \"para\" and self.level != \"passage\"\n",
+ "\n",
+ "# clean doc to create term array\n",
+ "def clean(doc, preprocessor, verbose):\n",
+ "    \"\"\"\n",
+ "    text pre process\n",
+ "    \"\"\"\n",
+ "    if verbose:\n",
+ "        print (\"--raw doc\")\n",
+ "        print (doc)\n",
+ "    #print \"next clean\"\n",
+ "    doc = preprocessor.removeNonAsciiFromText(doc)\n",
+ "    words = preprocessor.tokenize(doc)\n",
+ "    words = preprocessor.allow(words)\n",
+ "    words = preprocessor.toLowercase(words)\n",
+ "    words = preprocessor.removeStopwords(words)\n",
+ "    words = preprocessor.removeShortWords(words, 3)\n",
+ "    words = preprocessor.removePunctuation(words)\n",
+ "    words = preprocessor.lemmatizeWords(words)\n",
+ "    #words = preprocessor.removeNonAscii(words)\n",
+ "    if verbose:\n",
+ "        print (\"--after pre processing\")\n",
+ "        print (words)\n",
+ "    return words\n",
+ "\n",
+ "# get sentences\n",
+ "def getSentences(filePath):\n",
+ "    \"\"\"\n",
+ "    get sentences from a file, split on periods\n",
+ "    \"\"\"\n",
+ "    with open(filePath, 'r') as contentFile:\n",
+ "        content = contentFile.read()\n",
+ "    sentences = content.split('.')\n",
+ "    return sentences\n",
+ "\n",
+ "def getParas(text, minParNl=2):\n",
+ "    \"\"\"\n",
+ "    split into paras\n",
+ "    \"\"\"\n",
+ "    regx = \"\\n+\" if minParNl == 1 else \"\\n{2,}\"\n",
+ "    paras = re.split(regx, text.replace(\"\\r\\n\", \"\\n\"))\n",
+ "    return paras\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
lib/util.ipynb ADDED
@@ -0,0 +1,2141 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "21cb09bb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "from random import randint\n",
+ "import random\n",
+ "import time\n",
+ "import uuid\n",
+ "from datetime import datetime\n",
+ "import math\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import logging\n",
+ "import logging.handlers\n",
+ "import pickle\n",
+ "from contextlib import contextmanager\n",
+ "\n",
+ "tokens = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"A\",\"B\",\"C\",\"D\",\"E\",\"F\",\"G\",\"H\",\"I\",\"J\",\"K\",\"L\",\"M\",\n",
+ "    \"N\",\"O\",\"P\",\"Q\",\"R\",\"S\",\"T\",\"U\",\"V\",\"W\",\"X\",\"Y\",\"Z\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\"]\n",
+ "numTokens = tokens[:10]\n",
+ "alphaTokens = tokens[10:36]\n",
+ "loCaseChars = [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\", \"k\",\"l\",\"m\",\"n\",\"o\",\n",
+ "\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]\n",
+ "\n",
+ "typeInt = \"int\"\n",
+ "typeFloat = \"float\"\n",
+ "typeString = \"string\"\n",
+ "\n",
+ "secInMinute = 60\n",
+ "secInHour = 60 * 60\n",
+ "secInDay = 24 * secInHour\n",
+ "secInWeek = 7 * secInDay\n",
+ "secInYear = 365 * secInDay\n",
+ "secInMonth = secInYear / 12\n",
+ "\n",
+ "minInHour = 60\n",
+ "minInDay = 24 * minInHour\n",
+ "\n",
+ "ftPerYard = 3\n",
+ "ftPerMile = ftPerYard * 1760\n",
+ "\n",
+ "\n",
+ "def genID(size):\n",
+ "    \"\"\"\n",
+ "    generates ID\n",
+ "\n",
+ "    Parameters\n",
+ "    size : size of ID\n",
+ "    \"\"\"\n",
+ "    id = \"\"\n",
+ "    for i in range(size):\n",
+ "        id = id + selectRandomFromList(tokens)\n",
+ "    return id\n",
+ "\n",
+ "def genIdList(numId, idSize):\n",
+ "    \"\"\"\n",
+ "    generate list of IDs\n",
+ "\n",
+ "    Parameters:\n",
+ "    numId: number of Ids\n",
+ "    idSize: ID size\n",
+ "    \"\"\"\n",
+ "    iDs = []\n",
+ "    for i in range(numId):\n",
+ "        iDs.append(genID(idSize))\n",
+ "    return iDs\n",
+ "\n",
+ "def genNumID(size):\n",
+ "    \"\"\"\n",
+ "    generates ID consisting of digits only\n",
+ "\n",
+ "    Parameters\n",
+ "    size : size of ID\n",
+ "    \"\"\"\n",
+ "    id = \"\"\n",
+ "    for i in range(size):\n",
+ "        id = id + selectRandomFromList(numTokens)\n",
+ "    return id\n",
+ "\n",
+ "def genLowCaseID(size):\n",
+ "    \"\"\"\n",
+ "    generates ID consisting of lower case chars\n",
+ "\n",
+ "    Parameters\n",
+ "    size : size of ID\n",
+ "    \"\"\"\n",
+ "    id = \"\"\n",
+ "    for i in range(size):\n",
+ "        id = id + selectRandomFromList(loCaseChars)\n",
+ "    return id\n",
+ "\n",
+ "def genNumIdList(numId, idSize):\n",
+ "    \"\"\"\n",
+ "    generate list of numeric IDs\n",
+ "\n",
+ "    Parameters:\n",
+ "    numId: number of Ids\n",
+ "    idSize: ID size\n",
+ "    \"\"\"\n",
+ "    iDs = []\n",
+ "    for i in range(numId):\n",
+ "        iDs.append(genNumID(idSize))\n",
+ "    return iDs\n",
+ "\n",
+ "def genNameInitial():\n",
+ "    \"\"\"\n",
+ "    generate name initial\n",
+ "    \"\"\"\n",
+ "    return selectRandomFromList(alphaTokens) + selectRandomFromList(alphaTokens)\n",
+ "\n",
+ "def genPhoneNum(arCode):\n",
+ "    \"\"\"\n",
+ "    generates phone number\n",
+ "\n",
+ "    Parameters\n",
+ "    arCode: area code\n",
+ "    \"\"\"\n",
+ "    phNum = genNumID(7)\n",
+ "    return arCode + str(phNum)\n",
+ "\n",
+ "def selectRandomFromList(ldata):\n",
+ "    \"\"\"\n",
+ "    select an element randomly from a list\n",
+ "\n",
+ "    Parameters\n",
+ "    ldata : list data\n",
+ "    \"\"\"\n",
+ "    return ldata[randint(0, len(ldata)-1)]\n",
+ "\n",
+ "def selectOtherRandomFromList(ldata, cval):\n",
+ "    \"\"\"\n",
+ "    select an element randomly from a list excluding the given one\n",
+ "\n",
+ "    Parameters\n",
+ "    ldata : list data\n",
+ "    cval : value to be excluded\n",
+ "    \"\"\"\n",
+ "    nval = selectRandomFromList(ldata)\n",
+ "    while nval == cval:\n",
+ "        nval = selectRandomFromList(ldata)\n",
+ "    return nval\n",
+ "\n",
+ "def selectRandomSubListFromList(ldata, num):\n",
+ "    \"\"\"\n",
+ "    generates random sublist from a list without replacement\n",
+ "\n",
+ "    Parameters\n",
+ "    ldata : list data\n",
+ "    num : output list size\n",
+ "    \"\"\"\n",
+ "    assertLesser(num, len(ldata), \"sublist size must be less than main list size\")\n",
+ "    i = randint(0, len(ldata)-1)\n",
+ "    sel = ldata[i]\n",
+ "    selSet = {i}\n",
+ "    selList = [sel]\n",
+ "    while (len(selSet) < num):\n",
+ "        i = randint(0, len(ldata)-1)\n",
+ "        if (i not in selSet):\n",
+ "            sel = ldata[i]\n",
+ "            selSet.add(i)\n",
+ "            selList.append(sel)\n",
+ "    return selList\n",
+ "\n",
+ "def selectRandomSubListFromListWithRepl(ldata, num):\n",
+ "    \"\"\"\n",
+ "    generates random sublist from a list with replacement\n",
+ "\n",
+ "    Parameters\n",
+ "    ldata : list data\n",
+ "    num : output list size\n",
+ "    \"\"\"\n",
+ "    return list(map(lambda i : selectRandomFromList(ldata), range(num)))\n",
+ "\n",
+ "def selectRandomFromDict(ddata):\n",
+ "    \"\"\"\n",
+ "    select an element randomly from a dictionary\n",
+ "\n",
+ "    Parameters\n",
+ "    ddata : dictionary data\n",
+ "    \"\"\"\n",
+ "    dkeys = list(ddata.keys())\n",
+ "    dk = selectRandomFromList(dkeys)\n",
+ "    el = (dk, ddata[dk])\n",
+ "    return el\n",
+ "\n",
+ "def setListRandomFromList(ldata, ldataRepl):\n",
+ "    \"\"\"\n",
+ "    sets some elements in the first list randomly with elements from the second list\n",
+ "\n",
+ "    Parameters\n",
+ "    ldata : list data\n",
+ "    ldataRepl : list with replacement data\n",
+ "    \"\"\"\n",
+ "    l = len(ldata)\n",
+ "    selSet = set()\n",
+ "    for d in ldataRepl:\n",
+ "        i = randint(0, l-1)\n",
+ "        while i in selSet:\n",
+ "            i = randint(0, l-1)\n",
+ "        ldata[i] = d\n",
+ "        selSet.add(i)\n",
+ "\n",
+ "def genIpAddress():\n",
+ "    \"\"\"\n",
+ "    generates IP address\n",
+ "    \"\"\"\n",
+ "    # randint is inclusive on both ends; octets range from 0 to 255\n",
+ "    i1 = randint(0,255)\n",
+ "    i2 = randint(0,255)\n",
+ "    i3 = randint(0,255)\n",
+ "    i4 = randint(0,255)\n",
+ "    ip = \"%d.%d.%d.%d\" %(i1,i2,i3,i4)\n",
+ "    return ip\n",
+ "\n",
+ "def curTimeMs():\n",
+ "    \"\"\"\n",
+ "    current time in ms\n",
+ "    \"\"\"\n",
+ "    return int((datetime.utcnow() - datetime(1970,1,1)).total_seconds() * 1000)\n",
+ "\n",
+ "def secDegPolyFit(x1, y1, x2, y2, x3, y3):\n",
+ "    \"\"\"\n",
+ "    second deg polynomial fit through three points\n",
+ "\n",
+ "    Parameters\n",
+ "    x1 : 1st point x\n",
+ "    y1 : 1st point y\n",
+ "    x2 : 2nd point x\n",
+ "    y2 : 2nd point y\n",
+ "    x3 : 3rd point x\n",
+ "    y3 : 3rd point y\n",
+ "    \"\"\"\n",
+ "    t = (y1 - y2) / (x1 - x2)\n",
+ "    a = t - (y2 - y3) / (x2 - x3)\n",
+ "    a = a / (x1 - x3)\n",
+ "    b = t - a * (x1 + x2)\n",
+ "    c = y1 - a * x1 * x1 - b * x1\n",
+ "    return (a, b, c)\n",
+ "\n",
248
+ "def range_limit(val, minv, maxv):\n",
249
+ " \"\"\"\n",
250
+ " range limit a value\n",
251
+ "\n",
252
+ " Parameters\n",
253
+ " val : data value\n",
254
+ " minv : minimum\n",
255
+ " maxv : maximum\n",
256
+ " \"\"\"\n",
257
+ " if (val < minv):\n",
258
+ " val = minv\n",
259
+ " elif (val > maxv):\n",
260
+ " val = maxv\n",
261
+ " return val\n",
262
+ "\n",
263
+ "def isInRange(val, minv, maxv):\n",
264
+ " \"\"\"\n",
265
+ " checks if within range\n",
266
+ "\n",
267
+ " Parameters\n",
268
+ " val : data value\n",
269
+ " minv : minimum\n",
270
+ " maxv : maximum\n",
271
+ " \"\"\"\n",
272
+ " return val >= minv and val <= maxv\n",
273
+ "\n",
274
+ "def stripFileLines(filePath, offset):\n",
275
+ " \"\"\"\n",
276
+ " strips number of chars from both ends\n",
277
+ "\n",
278
+ " Parameters\n",
279
+ " filePath : file path\n",
280
+ " offset : offset from both ends of line \n",
281
+ " \"\"\"\n",
282
+ " fp = open(filePath, \"r\")\n",
283
+ " for line in fp:\n",
284
+ " stripped = line[offset:len(line) - 1 - offset]\n",
285
+ " print (stripped)\n",
286
+ " fp.close()\n",
287
+ "\n",
288
+ "def genLatLong(lat1, long1, lat2, long2):\n",
289
+ " \"\"\"\n",
290
+ " generate lat log within limits\n",
291
+ "\n",
292
+ " Parameters\n",
293
+ " lat1 : lat of 1st point\n",
294
+ " long1 : long of 1st point\n",
295
+ " lat2 : lat of 2nd point\n",
296
+ " long2 : long of 2nd point\n",
297
+ " \"\"\"\n",
298
+ " lat = lat1 + (lat2 - lat1) * random.random()\n",
299
+ " longg = long1 + (long2 - long1) * random.random()\n",
300
+ " return (lat, longg)\n",
301
+ "\n",
302
+ "def geoDistance(lat1, long1, lat2, long2):\n",
303
+ " \"\"\"\n",
304
+ " find geo distance in ft\n",
305
+ "\n",
306
+ " Parameters\n",
307
+ " lat1 : lat of 1st point\n",
308
+ " long1 : long of 1st point\n",
309
+ " lat2 : lat of 2nd point\n",
310
+ " long2 : long of 2nd point\n",
311
+ " \"\"\"\n",
312
+ " latDiff = math.radians(lat1 - lat2)\n",
313
+ " longDiff = math.radians(long1 - long2)\n",
314
+ " l1 = math.sin(latDiff/2.0)\n",
315
+ " l2 = math.sin(longDiff/2.0)\n",
316
+ " l3 = math.cos(math.radians(lat1))\n",
317
+ " l4 = math.cos(math.radians(lat2))\n",
318
+ " a = l1 * l1 + l3 * l4 * l2 * l2\n",
319
+ " l5 = math.sqrt(a)\n",
320
+ " l6 = math.sqrt(1.0 - a)\n",
321
+ " c = 2.0 * math.atan2(l5, l6)\n",
322
+ " r = 6371008.8 * 3.280840\n",
323
+ " return c * r\n",
324
+ "\n",
325
+ "def minLimit(val, limit):\n",
326
+ " \"\"\"\n",
327
+ " min limit\n",
328
+ " Parameters\n",
329
+ " \"\"\"\n",
330
+ " if (val < limit):\n",
331
+ " val = limit\n",
332
+ " return val;\n",
333
+ "\n",
334
+ "def maxLimit(val, limit):\n",
335
+ " \"\"\"\n",
336
+ " max limit\n",
337
+ " Parameters\n",
338
+ " \"\"\"\n",
339
+ " if (val > limit):\n",
340
+ " val = limit\n",
341
+ " return val;\n",
342
+ "\n",
343
+ "def rangeSample(val, minLim, maxLim):\n",
344
+ " \"\"\"\n",
345
+ " if out side range sample within range\n",
346
+ "\n",
347
+ " Parameters\n",
348
+ " val : value\n",
349
+ " minLim : minimum\n",
350
+ " maxLim : maximum\n",
351
+ " \"\"\"\n",
352
+ " if val < minLim or val > maxLim:\n",
353
+ " val = randint(minLim, maxLim)\n",
354
+ " return val\n",
355
+ "\n",
356
+ "def genRandomIntListWithinRange(size, minLim, maxLim):\n",
357
+ " \"\"\"\n",
358
+ " random unique list of integers within range\n",
359
+ "\n",
360
+ " Parameters\n",
361
+ " size : size of returned list\n",
362
+ " minLim : minimum\n",
363
+ " maxLim : maximum\n",
364
+ " \"\"\"\n",
365
+ " values = set()\n",
366
+ " for i in range(size):\n",
367
+ " val = randint(minLim, maxLim)\n",
368
+ " while val not in values:\n",
369
+ " values.add(val)\n",
370
+ " return list(values)\n",
371
+ "\n",
372
+ "def preturbScalar(value, vrange):\n",
373
+ " \"\"\"\n",
374
+ " preturbs a mutiplicative value within range\n",
375
+ "\n",
376
+ " Parameters\n",
377
+ " value : data value\n",
378
+ " vrange : value delta fraction\n",
379
+ " \"\"\"\n",
380
+ " scale = 1.0 - vrange + 2 * vrange * random.random() \n",
381
+ " return value * scale\n",
382
+ "\n",
383
+ "def preturbScalarAbs(value, vrange):\n",
384
+ " \"\"\"\n",
385
+ " preturbs an absolute value within range\n",
386
+ "\n",
387
+ " Parameters\n",
388
+ " value : data value\n",
389
+ " vrange : value delta absolute\n",
390
+ " \"\"\"\n",
391
+ " delta = - vrange + 2.0 * vrange * random.random() \n",
392
+ " return value + delta\n",
393
+ "\n",
394
+ "def preturbVector(values, vrange):\n",
395
+ " \"\"\"\n",
396
+ " preturbs a list within range\n",
397
+ "\n",
398
+ " Parameters\n",
399
+ " values : list data\n",
400
+ " vrange : value delta fraction\n",
401
+ " \"\"\"\n",
402
+ " nValues = list(map(lambda va: preturbScalar(va, vrange), values))\n",
403
+ " return nValues\n",
404
+ "\n",
405
+ "def randomShiftVector(values, smin, smax):\n",
406
+ " \"\"\"\n",
407
+ " shifts a list by a random quanity with a range\n",
408
+ "\n",
409
+ " Parameters\n",
410
+ " values : list data\n",
411
+ " smin : samplinf minimum\n",
412
+ " smax : sampling maximum\n",
413
+ " \"\"\"\n",
414
+ " shift = np.random.uniform(smin, smax)\n",
415
+ " return list(map(lambda va: va + shift, values))\n",
416
+ "\n",
417
+ "def floatRange(beg, end, incr):\n",
418
+ " \"\"\"\n",
419
+ " generates float range\n",
420
+ "\n",
421
+ " Parameters\n",
422
+ " beg :range begin\n",
423
+ " end: range end\n",
424
+ " incr : range increment\n",
425
+ " \"\"\"\n",
426
+ " return list(np.arange(beg, end, incr))\n",
427
+ "\n",
428
+ "def shuffle(values, *numShuffles):\n",
429
+ " \"\"\"\n",
430
+ " in place shuffling with swap of pairs\n",
431
+ "\n",
432
+ " Parameters\n",
433
+ " values : list data\n",
434
+ " numShuffles : parameter list for number of shuffles\n",
435
+ " \"\"\"\n",
436
+ " size = len(values)\n",
437
+ " if len(numShuffles) == 0:\n",
438
+ " numShuffle = int(size / 2)\n",
439
+ " elif len(numShuffles) == 1:\n",
440
+ " numShuffle = numShuffles[0]\n",
441
+ " else:\n",
442
+ " numShuffle = randint(numShuffles[0], numShuffles[1])\n",
443
+ " print(\"numShuffle {}\".format(numShuffle))\n",
444
+ " for i in range(numShuffle):\n",
445
+ " first = random.randint(0, size - 1)\n",
446
+ " second = random.randint(0, size - 1)\n",
447
+ " while first == second:\n",
448
+ " second = random.randint(0, size - 1)\n",
449
+ " tmp = values[first]\n",
450
+ " values[first] = values[second]\n",
451
+ " values[second] = tmp\n",
452
+ "\n",
453
+ "\n",
454
+ "def splitList(itms, numGr):\n",
455
+ " \"\"\"\n",
456
+ " splits a list into sub lists of approximately equal size, with items in sublists randomly chod=sen\n",
457
+ "\n",
458
+ " Parameters\n",
459
+ " itms ; list of values\t\t\n",
460
+ " numGr : no of groups\n",
461
+ " \"\"\"\n",
462
+ " tcount = len(itms)\n",
463
+ " cItems = list(itms)\n",
464
+ " sz = int(len(cItems) / numGr)\n",
465
+ " groups = list()\n",
466
+ " count = 0\n",
467
+ " for i in range(numGr):\n",
468
+ " if (i == numGr - 1):\n",
469
+ " csz = tcount - count\n",
470
+ " else:\n",
471
+ " csz = sz + randint(-2, 2)\n",
472
+ " count += csz\n",
473
+ " gr = list()\n",
474
+ " for j in range(csz):\n",
475
+ " it = selectRandomFromList(cItems)\n",
476
+ " gr.append(it)\n",
477
+ " cItems.remove(it)\n",
478
+ " groups.append(gr)\n",
479
+ " return groups\n",
480
+ "\n",
481
+ "def multVector(values, vrange):\n",
482
+ " \"\"\"\n",
483
+ " multiplies a list within value range\n",
484
+ "\n",
485
+ " Parameters\n",
486
+ " values : list of values\n",
487
+ " vrange : fraction of vaue to be used to update\n",
488
+ " \"\"\"\n",
489
+ " scale = 1.0 - vrange + 2 * vrange * random.random()\n",
490
+ " nValues = list(map(lambda va: va * scale, values))\n",
491
+ " return nValues\n",
492
+ "\n",
493
+ "def weightedAverage(values, weights):\n",
494
+ " \"\"\"\n",
495
+ " calculates weighted average\n",
496
+ "\n",
497
+ " Parameters\n",
498
+ " values : list of values\n",
499
+ " weights : list of weights\n",
500
+ " \"\"\"\t\t\n",
501
+ " assert len(values) == len(weights), \"values and weights should be same size\"\n",
502
+ " vw = zip(values, weights)\n",
503
+ " wva = list(map(lambda e : e[0] * e[1], vw))\n",
504
+ " #wa = sum(x * y for x, y in vw) / sum(weights)\n",
505
+ " wav = sum(wva) / sum(weights)\n",
506
+ " return wav\n",
507
+ "\n",
508
+ "def extractFields(line, delim, keepIndices):\n",
509
+ " \"\"\"\n",
510
+ " breaks a line into fields and keeps only specified fileds and returns new line\n",
511
+ "\n",
512
+ " Parameters\n",
513
+ " line ; deli separated string\n",
514
+ " delim : delemeter\n",
515
+ " keepIndices : list of indexes to fields to be retained\n",
516
+ " \"\"\"\n",
517
+ " items = line.split(delim)\n",
518
+ " newLine = []\n",
519
+ " for i in keepIndices:\n",
520
+ " newLine.append(line[i])\n",
521
+ " return delim.join(newLine)\n",
522
+ "\n",
523
+ "def remFields(line, delim, remIndices):\n",
524
+ " \"\"\"\n",
525
+ " removes fields from delim separated string\n",
526
+ "\n",
527
+ " Parameters\n",
528
+ " line ; delemeter separated string\n",
529
+ " delim : delemeter\n",
530
+ " remIndices : list of indexes to fields to be removed\n",
531
+ " \"\"\"\n",
532
+ " items = line.split(delim)\n",
533
+ " newLine = []\n",
534
+ " for i in range(len(items)):\n",
535
+ " if not arrayContains(remIndices, i):\n",
536
+ " newLine.append(line[i])\n",
537
+ " return delim.join(newLine)\n",
538
+ "\n",
539
+ "def extractList(data, indices):\n",
540
+ " \"\"\"\n",
541
+ " extracts list from another list, given indices\n",
542
+ "\n",
543
+ " Parameters\n",
544
+ " remIndices : list data\n",
545
+ " indices : list of indexes to fields to be retained\n",
546
+ " \"\"\"\n",
547
+ " if areAllFieldsIncluded(data, indices):\n",
548
+ " exList = data.copy()\n",
549
+ " #print(\"all indices\")\n",
550
+ " else:\n",
551
+ " exList = list()\n",
552
+ " le = len(data)\n",
553
+ " for i in indices:\n",
554
+ " assert i < le , \"index {} out of bound {}\".format(i, le)\n",
555
+ " exList.append(data[i])\n",
556
+ "\n",
557
+ " return exList\n",
558
+ "\n",
559
+ "def arrayContains(arr, item):\n",
560
+ " \"\"\"\n",
561
+ " checks if array contains an item \n",
562
+ "\n",
563
+ " Parameters\n",
564
+ " arr : list data\n",
565
+ " item : item to search\n",
566
+ " \"\"\"\n",
567
+ " contains = True\n",
568
+ " try:\n",
569
+ " arr.index(item)\n",
570
+ " except ValueError:\n",
571
+ " contains = False\n",
572
+ " return contains\n",
573
+ "\n",
574
+ "def strToIntArray(line, delim=\",\"):\n",
575
+ " \"\"\"\n",
576
+ " int array from delim separated string\n",
577
+ "\n",
578
+ " Parameters\n",
579
+ " line ; delemeter separated string\n",
580
+ " \"\"\"\n",
581
+ " arr = line.split(delim)\n",
582
+ " return [int(a) for a in arr]\n",
583
+ "\n",
584
+ "def strToFloatArray(line, delim=\",\"):\n",
585
+ " \"\"\"\n",
586
+ " float array from delim separated string\n",
587
+ "\n",
588
+ " Parameters\n",
589
+ " line ; delemeter separated string\n",
590
+ " \"\"\"\n",
591
+ " arr = line.split(delim)\n",
592
+ " return [float(a) for a in arr]\n",
593
+ "\n",
594
+ "def strListOrRangeToIntArray(line):\n",
595
+ " \"\"\"\n",
596
+ " int array from delim separated string or range\n",
597
+ "\n",
598
+ " Parameters\n",
599
+ " line ; delemeter separated string\n",
600
+ " \"\"\"\n",
601
+ " varr = line.split(\",\")\n",
602
+ " if (len(varr) > 1):\n",
603
+ " iarr = list(map(lambda v: int(v), varr))\n",
604
+ " else:\n",
605
+ " vrange = line.split(\":\")\n",
606
+ " if (len(vrange) == 2):\n",
607
+ " lo = int(vrange[0])\n",
608
+ " hi = int(vrange[1])\n",
609
+ " iarr = list(range(lo, hi+1))\n",
610
+ " else:\n",
611
+ " iarr = [int(line)]\n",
612
+ " return iarr\n",
613
+ "\n",
614
+ "def toStr(val, precision):\n",
615
+ " \"\"\"\n",
616
+ " converts any type to string\t\n",
617
+ "\n",
618
+ " Parameters\n",
619
+ " val : value\n",
620
+ " precision ; precision for float value\n",
621
+ " \"\"\"\n",
622
+ " if type(val) == float or type(val) == np.float64 or type(val) == np.float32:\n",
623
+ " format = \"%\" + \".%df\" %(precision)\n",
624
+ " sVal = format %(val)\n",
625
+ " else:\n",
626
+ " sVal = str(val)\n",
627
+ " return sVal\n",
628
+ "\n",
629
+ "def toStrFromList(values, precision, delim=\",\"):\n",
630
+ " \"\"\"\n",
631
+ " converts list of any type to delim separated string\n",
632
+ "\n",
633
+ " Parameters\n",
634
+ " values : list data\n",
635
+ " precision ; precision for float value\n",
636
+ " delim : delemeter\n",
637
+ " \"\"\"\n",
638
+ " sValues = list(map(lambda v: toStr(v, precision), values))\n",
639
+ " return delim.join(sValues)\n",
640
+ "\n",
641
+ "def toIntList(values):\n",
642
+ " \"\"\"\n",
643
+ " convert to int list\n",
644
+ "\n",
645
+ " Parameters\n",
646
+ " values : list data\n",
647
+ " \"\"\"\n",
648
+ " return list(map(lambda va: int(va), values))\n",
649
+ "\n",
650
+ "def toFloatList(values):\n",
651
+ " \"\"\"\n",
652
+ " convert to float list\n",
653
+ "\n",
654
+ " Parameters\n",
655
+ " values : list data\n",
656
+ " \"\"\"\n",
657
+ " return list(map(lambda va: float(va), values))\n",
658
+ "\n",
659
+ "def toStrList(values, precision=None):\n",
660
+ " \"\"\"\n",
661
+ " convert to string list\n",
662
+ "\n",
663
+ " Parameters\n",
664
+ " values : list data\n",
665
+ " precision ; precision for float value\n",
666
+ " \"\"\"\n",
667
+ " return list(map(lambda va: toStr(va, precision), values))\n",
668
+ "\n",
669
+ "def toIntFromBoolean(value):\n",
670
+ " \"\"\"\n",
671
+ " convert to int\n",
672
+ "\n",
673
+ " Parameters\n",
674
+ " value : boolean value\n",
675
+ " \"\"\"\n",
676
+ " ival = 1 if value else 0\n",
677
+ " return ival\n",
678
+ "\n",
679
+ "def typedValue(val, dtype=None):\n",
680
+ " \"\"\"\n",
681
+ " return typed value given string, discovers data type if not specified\n",
682
+ "\n",
683
+ " Parameters\n",
684
+ " val : value\n",
685
+ " dtype : data type\n",
686
+ " \"\"\"\n",
687
+ " tVal = None\n",
688
+ "\n",
689
+ " if dtype is not None:\n",
690
+ " if dtype == \"num\":\n",
691
+ " dtype = \"int\" if dtype.find(\".\") == -1 else \"float\"\n",
692
+ "\n",
693
+ " if dtype == \"int\":\n",
694
+ " tVal = int(val)\n",
695
+ " elif dtype == \"float\":\n",
696
+ " tVal = float(val)\n",
697
+ " elif dtype == \"bool\":\n",
698
+ " tVal = bool(val)\n",
699
+ " else:\n",
700
+ " tVal = val\n",
701
+ " else:\n",
702
+ " if type(val) == str:\n",
703
+ " lVal = val.lower()\n",
704
+ "\n",
705
+ " #int\n",
706
+ " done = True\n",
707
+ " try:\n",
708
+ " tVal = int(val)\n",
709
+ " except ValueError:\n",
710
+ " done = False\n",
711
+ "\n",
712
+ " #float\n",
713
+ " if not done:\n",
714
+ " done = True\n",
715
+ " try:\n",
716
+ " tVal = float(val)\n",
717
+ " except ValueError:\n",
718
+ " done = False\n",
719
+ "\n",
720
+ " #boolean\n",
721
+ " if not done:\n",
722
+ " done = True\n",
723
+ " if lVal == \"true\":\n",
724
+ " tVal = True\n",
725
+ " elif lVal == \"false\":\n",
726
+ " tVal = False\n",
727
+ " else:\n",
728
+ " done = False\n",
729
+ " #None\t\t\n",
730
+ " if not done:\n",
731
+ " if lVal == \"none\":\n",
732
+ " tVal = None\n",
733
+ " else:\n",
734
+ " tVal = val\n",
735
+ " else:\n",
736
+ " tVal = val\n",
737
+ "\n",
738
+ " return tVal\n",
739
+ "\n",
740
+ "def getAllFiles(dirPath):\n",
741
+ " \"\"\"\n",
742
+ " get all files recursively\n",
743
+ "\n",
744
+ " Parameters\n",
745
+ " dirPath : directory path\n",
746
+ " \"\"\"\n",
747
+ " filePaths = []\n",
748
+ " for (thisDir, subDirs, fileNames) in os.walk(dirPath):\n",
749
+ " for fileName in fileNames:\n",
750
+ " filePaths.append(os.path.join(thisDir, fileName))\n",
751
+ " filePaths.sort()\n",
752
+ " return filePaths\n",
753
+ "\n",
754
+ "def getFileContent(fpath, verbose=False):\n",
755
+ " \"\"\"\n",
756
+ " get file contents in directory\n",
757
+ "\n",
758
+ " Parameters\n",
759
+ " fpath ; directory path\n",
760
+ " verbose : verbosity flag\n",
761
+ " \"\"\"\n",
762
+ " # dcument list\n",
763
+ " docComplete = []\n",
764
+ " filePaths = getAllFiles(fpath)\n",
765
+ "\n",
766
+ " # read files\n",
767
+ " for filePath in filePaths:\n",
768
+ " if verbose:\n",
769
+ " print(\"next file \" + filePath)\n",
770
+ " with open(filePath, 'r') as contentFile:\n",
771
+ " content = contentFile.read()\n",
772
+ " docComplete.append(content)\n",
773
+ " return (docComplete, filePaths)\n",
774
+ "\n",
775
+ "def getOneFileContent(fpath):\n",
776
+ " \"\"\"\n",
777
+ " get one file contents\n",
778
+ "\n",
779
+ " Parameters\n",
780
+ " fpath : file path\n",
781
+ " \"\"\"\n",
782
+ " with open(fpath, 'r') as contentFile:\n",
783
+ " docStr = contentFile.read()\n",
784
+ " return docStr\n",
785
+ "\n",
786
+ "def getFileLines(dirPath, delim=\",\"):\n",
787
+ " \"\"\"\n",
788
+ " get lines from a file\n",
789
+ "\n",
790
+ " Parameters\n",
791
+ " dirPath : file path\n",
792
+ " delim : delemeter\n",
793
+ " \"\"\"\n",
794
+ " lines = list()\n",
795
+ " for li in fileRecGen(dirPath, delim):\n",
796
+ " lines.append(li)\n",
797
+ " return lines\n",
798
+ "\n",
799
+ "def getFileSampleLines(dirPath, percen, delim=\",\"):\n",
800
+ " \"\"\"\n",
801
+ " get sampled lines from a file\n",
802
+ "\n",
803
+ " Parameters\n",
804
+ " dirPath : file path\n",
805
+ " percen : sampling percentage\n",
806
+ " delim : delemeter\n",
807
+ " \"\"\"\n",
808
+ " lines = list()\n",
809
+ " for li in fileRecGen(dirPath, delim):\n",
810
+ " if randint(0, 100) < percen:\n",
811
+ " lines.append(li)\n",
812
+ " return lines\n",
813
+ "\n",
814
+ "def getFileColumnAsString(dirPath, index, delim=\",\"):\n",
815
+ " \"\"\"\n",
816
+ " get string column from a file\n",
817
+ "\n",
818
+ " Parameters\n",
819
+ " dirPath : file path\n",
820
+ " index : index\n",
821
+ " delim : delemeter\n",
822
+ " \"\"\"\n",
823
+ " fields = list()\n",
824
+ " for rec in fileRecGen(dirPath, delim):\n",
825
+ " fields.append(rec[index])\n",
826
+ " #print(fields)\t\n",
827
+ " return fields\n",
828
+ "\n",
829
+ "def getFileColumnsAsString(dirPath, indexes, delim=\",\"):\n",
830
+ " \"\"\"\n",
831
+ " get multiple string columns from a file\n",
832
+ "\n",
833
+ " Parameters\n",
834
+ " dirPath : file path\n",
835
+ " indexes : indexes of columns\n",
836
+ " delim : delemeter\n",
837
+ " \"\"\"\n",
838
+ " nindex = len(indexes)\n",
839
+ " columns = list(map(lambda i : list(), range(nindex)))\n",
840
+ " for rec in fileRecGen(dirPath, delim):\n",
841
+ " for i in range(nindex):\n",
842
+ " columns[i].append(rec[indexes[i]])\n",
843
+ " return columns\n",
844
+ "\n",
845
+ "def getFileColumnAsFloat(dirPath, index, delim=\",\"):\n",
846
+ " \"\"\"\n",
847
+ " get float fileds from a file\n",
848
+ "\n",
849
+ " Parameters\n",
850
+ " dirPath : file path\n",
851
+ " index : index\n",
852
+ " delim : delemeter\n",
853
+ " \"\"\"\n",
854
+ " #print(\"{} {}\".format(dirPath, index))\n",
855
+ " fields = getFileColumnAsString(dirPath, index, delim)\n",
856
+ " return list(map(lambda v:float(v), fields))\n",
857
+ "\n",
858
+ "def getFileColumnAsInt(dirPath, index, delim=\",\"):\n",
859
+ " \"\"\"\n",
860
+ " get float fileds from a file\n",
861
+ "\n",
862
+ " Parameters\n",
863
+ " dirPath : file path\n",
864
+ " index : index\n",
865
+ " delim : delemeter\n",
866
+ " \"\"\"\n",
867
+ " fields = getFileColumnAsString(dirPath, index, delim)\n",
868
+ " return list(map(lambda v:int(v), fields))\n",
869
+ "\n",
870
+ "def getFileAsIntMatrix(dirPath, columns, delim=\",\"):\n",
871
+ " \"\"\"\n",
872
+ " extracts int matrix from csv file given column indices with each row being concatenation of \n",
873
+ " extracted column values row size = num of columns\n",
874
+ "\n",
875
+ " Parameters\n",
876
+ " dirPath : file path\n",
877
+ " columns : indexes of columns\n",
878
+ " delim : delemeter\n",
879
+ " \"\"\"\n",
880
+ " mat = list()\n",
881
+ " for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
882
+ " mat.append(asIntList(rec))\n",
883
+ " return mat\n",
884
+ "\n",
885
+ "def getFileAsFloatMatrix(dirPath, columns, delim=\",\"):\n",
886
+ " \"\"\"\n",
887
+ " extracts float matrix from csv file given column indices with each row being concatenation of \n",
888
+ " extracted column values row size = num of columns\n",
889
+ " Parameters\n",
890
+ " dirPath : file path\n",
891
+ " columns : indexes of columns\n",
892
+ " delim : delemeter\n",
893
+ " \"\"\"\n",
894
+ " mat = list()\n",
895
+ " for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
896
+ " mat.append(asFloatList(rec))\n",
897
+ " return mat\n",
898
+ "\n",
899
+ "def getFileAsFloatColumn(dirPath):\n",
900
+ " \"\"\"\n",
901
+ " grt float list from a file with one float per row\n",
902
+ " Parameters\n",
903
+ " dirPath : file path\n",
904
+ " \"\"\"\n",
905
+ " flist = list()\n",
906
+ " for rec in fileRecGen(dirPath, None):\n",
907
+ " flist.append(float(rec))\n",
908
+ " return flist\n",
909
+ "\n",
910
+ "def getFileAsFiltFloatMatrix(dirPath, filt, columns, delim=\",\"):\n",
911
+ " \"\"\"\n",
912
+ " extracts float matrix from csv file given row filter and column indices with each row being \n",
913
+ " concatenation of extracted column values row size = num of columns\n",
914
+ " Parameters\n",
915
+ " dirPath : file path\n",
916
+ " columns : indexes of columns\n",
917
+ " filt : row filter lambda\n",
918
+ " delim : delemeter\n",
919
+ " \"\"\"\n",
920
+ " mat = list()\n",
921
+ " for rec in fileFiltSelFieldsRecGen(dirPath, filt, columns, delim):\n",
922
+ " mat.append(asFloatList(rec))\n",
923
+ " return mat\n",
924
+ "\n",
925
+ "def getFileAsTypedRecords(dirPath, types, delim=\",\"):\n",
926
+ " \"\"\"\n",
927
+ " extracts typed records from csv file with each row being concatenation of \n",
928
+ " extracted column values \n",
929
+ " Parameters\n",
930
+ " dirPath : file path\n",
931
+ " types : data types\n",
932
+ " delim : delemeter\n",
933
+ " \"\"\"\n",
934
+ " (dtypes, cvalues) = extractTypesFromString(types)\n",
935
+ " tdata = list()\n",
936
+ " for rec in fileRecGen(dirPath, delim):\n",
937
+ " trec = list()\n",
938
+ " for index, value in enumerate(rec):\n",
939
+ " value = __convToTyped(index, value, dtypes)\n",
940
+ " trec.append(value)\n",
941
+ " tdata.append(trec)\n",
942
+ " return tdata\n",
943
+ "\n",
944
+ "\n",
945
+ "def getFileColsAsTypedRecords(dirPath, columns, types, delim=\",\"):\n",
946
+ " \"\"\"\n",
947
+ " extracts typed records from csv file given column indices with each row being concatenation of \n",
948
+ " extracted column values \n",
949
+ " Parameters\n",
950
+ " Parameters\n",
951
+ " dirPath : file path\n",
952
+ " columns : column indexes\n",
953
+ " types : data types\n",
954
+ " delim : delemeter\n",
955
+ " \"\"\"\n",
956
+ " (dtypes, cvalues) = extractTypesFromString(types)\n",
957
+ " tdata = list()\n",
958
+ " for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
959
+ " trec = list()\n",
960
+ " for indx, value in enumerate(rec):\n",
961
+ " tindx = columns[indx]\n",
962
+ " value = __convToTyped(tindx, value, dtypes)\n",
963
+ " trec.append(value)\n",
964
+ " tdata.append(trec)\n",
965
+ " return tdata\n",
966
+ "\n",
967
+ "def getFileColumnsMinMax(dirPath, columns, dtype, delim=\",\"):\n",
968
+ " \"\"\"\n",
969
+ " extracts numeric matrix from csv file given column indices. For each column return min and max\n",
970
+ " Parameters\n",
971
+ " dirPath : file path\n",
972
+ " columns : column indexes\n",
973
+ " dtype : data type\n",
974
+ " delim : delemeter\n",
975
+ " \"\"\"\n",
976
+ " dtypes = list(map(lambda c : str(c) + \":\" + dtype, columns))\n",
977
+ " dtypes = \",\".join(dtypes)\n",
978
+ " #print(dtypes)\n",
979
+ "\n",
980
+ " tdata = getFileColsAsTypedRecords(dirPath, columns, dtypes, delim)\n",
981
+ " minMax = list()\n",
982
+ " ncola = len(tdata[0])\n",
983
+ " ncole = len(columns)\n",
984
+ " assertEqual(ncola, ncole, \"actual no of columns different from expected\")\n",
985
+ "\n",
986
+ " for ci in range(ncole):\t\n",
987
+ " vmin = sys.float_info.max\n",
988
+ " vmax = sys.float_info.min\n",
989
+ " for r in tdata:\n",
990
+ " cv = r[ci]\n",
991
+ " vmin = cv if cv < vmin else vmin\n",
992
+ " vmax = cv if cv > vmax else vmax\n",
993
+ " mm = (vmin, vmax, vmax - vmin)\n",
994
+ " minMax.append(mm)\n",
995
+ "\n",
996
+ " return minMax\n",
997
+ "\n",
998
+ "\n",
999
+ "def getRecAsTypedRecord(rec, types, delim=None):\n",
1000
+ " \"\"\"\n",
1001
+ " converts record to typed records \n",
1002
+ " Parameters\n",
1003
+ " rec : delemeter separate string or list of string\n",
1004
+ " types : field data types\n",
1005
+ " delim : delemeter\n",
1006
+ " \"\"\"\t\n",
1007
+ " if delim is not None:\n",
1008
+ " rec = rec.split(delim)\n",
1009
+ " (dtypes, cvalues) = extractTypesFromString(types)\n",
1010
+ " #print(types)\n",
1011
+ " #print(dtypes)\n",
1012
+ " trec = list()\n",
1013
+ " for ind, value in enumerate(rec):\n",
1014
+ " tvalue = __convToTyped(ind, value, dtypes)\n",
1015
+ " trec.append(tvalue)\n",
1016
+ " return trec\n",
1017
+ "\n",
1018
+ "def __convToTyped(index, value, dtypes):\n",
1019
+ " \"\"\"\n",
1020
+ " convert to typed value \n",
1021
+ " Parameters\n",
1022
+ " index : index in type list\n",
1023
+ " value : data value\n",
1024
+ " dtypes : data type list\n",
1025
+ " \"\"\"\n",
1026
+ " #print(index, value)\n",
1027
+ " dtype = dtypes[index]\n",
1028
+ " tvalue = value\n",
1029
+ " if dtype == \"int\":\n",
1030
+ " tvalue = int(value)\n",
1031
+ " elif dtype == \"float\":\n",
1032
+ " tvalue = float(value)\n",
1033
+ " return tvalue\n",
1034
+ "\n",
1035
+ "\n",
1036
+ "\n",
1037
+ "def extractTypesFromString(types):\n",
1038
+ " \"\"\"\n",
1039
+ " extracts column data types and set values for categorical variables \n",
1040
+ " Parameters\n",
1041
+ " types : encoded type information\n",
1042
+ " \"\"\"\n",
1043
+ " ftypes = types.split(\",\")\n",
1044
+ " dtypes = dict()\n",
1045
+ " cvalues = dict()\n",
1046
+ " for ftype in ftypes:\n",
1047
+ " items = ftype.split(\":\") \n",
1048
+ " cindex = int(items[0])\n",
1049
+ " dtype = items[1]\n",
1050
+ " dtypes[cindex] = dtype\n",
1051
+ " if len(items) == 3:\n",
1052
+ " sitems = items[2].split()\n",
1053
+ " cvalues[cindex] = sitems\n",
1054
+ " return (dtypes, cvalues)\n",
1055
+ "\n",
1056
+ "def getMultipleFileAsInttMatrix(dirPathWithCol, delim=\",\"):\n",
1057
+ " \"\"\"\n",
1058
+ " extracts int matrix from from csv files given column index for each file. \n",
1059
+ " num of columns = number of rows in each file and num of rows = number of files\n",
1060
+ " Parameters\n",
1061
+ " dirPathWithCol: list of file path and collumn index pair\n",
1062
+ " delim : delemeter\n",
1063
+ " \"\"\"\n",
1064
+ " mat = list()\n",
1065
+ " minLen = -1\n",
1066
+ " for path, col in dirPathWithCol:\n",
1067
+ " colVals = getFileColumnAsInt(path, col, delim)\n",
1068
+ " if minLen < 0 or len(colVals) < minLen:\n",
1069
+ " minLen = len(colVals)\n",
1070
+ " mat.append(colVals)\n",
1071
+ "\n",
1072
+ " #make all same length\n",
1073
+ " mat = list(map(lambda li:li[:minLen], mat))\n",
1074
+ " return mat\n",
1075
+ "\n",
1076
+ "def getMultipleFileAsFloatMatrix(dirPathWithCol, delim=\",\"):\n",
1077
+ " \"\"\"\n",
1078
+ " extracts float matrix from from csv files given column index for each file. \n",
1079
+ " num of columns = number of rows in each file and num of rows = number of files\n",
1080
+ " Parameters\n",
1081
+ " dirPathWithCol: list of file path and collumn index pair\n",
1082
+ " delim : delemeter\n",
1083
+ " \"\"\"\n",
1084
+ " mat = list()\n",
1085
+ " minLen = -1\n",
1086
+ " for path, col in dirPathWithCol:\n",
1087
+ " colVals = getFileColumnAsFloat(path, col, delim)\n",
1088
+ " if minLen < 0 or len(colVals) < minLen:\n",
1089
+ " minLen = len(colVals)\n",
1090
+ " mat.append(colVals)\n",
1091
+ "\n",
1092
+ " #make all same length\n",
1093
+ " mat = list(map(lambda li:li[:minLen], mat))\n",
1094
+ " return mat\n",
1095
+ "\n",
1096
+ "def writeStrListToFile(ldata, filePath, delem=\",\"):\n",
1097
+ " \"\"\"\n",
1098
+ " writes list of dlem separated string or list of list of string to afile\n",
1099
+ "\n",
1100
+ " Parameters\n",
1101
+ " ldata : list data\n",
1102
+ " filePath : file path\n",
1103
+ " delim : delemeter\n",
1104
+ " \"\"\"\n",
1105
+ " with open(filePath, \"w\") as fh:\n",
1106
+ " for r in ldata:\n",
1107
+ " if type(r) == list:\n",
1108
+ " r = delem.join(r)\n",
1109
+ " fh.write(r + \"\\n\")\n",
1110
+ "\n",
1111
+ "def writeFloatListToFile(ldata, prec, filePath):\n",
1112
+ " \"\"\"\n",
1113
+ " writes float list to file, one value per line\n",
1114
+ "\n",
1115
+ " Parameters\n",
1116
+ " ldata : list data\n",
1117
+ " prec : precision\n",
1118
+ " filePath : file path\n",
1119
+ " \"\"\"\n",
1120
+ " with open(filePath, \"w\") as fh:\n",
1121
+ " for d in ldata:\n",
1122
+ " fh.write(formatFloat(prec, d) + \"\\n\")\n",
1123
+ "\n",
1124
+ "\n",
1125
+ "def takeFirst(elems):\n",
1126
+ " \"\"\"\n",
1127
+ " return fisrt item\n",
1128
+ " Parameters\n",
1129
+ " elems : list of data \n",
1130
+ " \"\"\"\n",
1131
+ " return elems[0]\n",
1132
+ "\n",
1133
+ "def takeSecond(elems):\n",
1134
+ " \"\"\"\n",
1135
+ " return 2nd element\n",
1136
+ " Parameters\n",
1137
+ " elems : list of data \n",
1138
+ " \"\"\"\n",
1139
+ " return elems[1]\n",
1140
+ "\n",
1141
+ "def takeThird(elems):\n",
1142
+ " \"\"\"\n",
1143
+ " returns 3rd element\n",
1144
+ " Parameters\n",
1145
+ " elems : list of data \n",
1146
+ " \"\"\"\n",
1147
+ " return elems[2]\n",
1148
+ "\n",
1149
+ "def addToKeyedCounter(dCounter, key, count=1):\n",
1150
+ " \"\"\"\n",
1151
+ " add to to keyed counter\n",
1152
+ " Parameters\n",
1153
+ " dCounter : dictionary of counters\n",
1154
+ " key : dictionary key\n",
1155
+ " count : count to add\n",
1156
+ " \"\"\"\n",
1157
+ " curCount = dCounter.get(key, 0)\n",
1158
+ " dCounter[key] = curCount + count\n",
1159
+ "\n",
1160
+ "def incrKeyedCounter(dCounter, key):\n",
1161
+ " \"\"\"\n",
1162
+ " increment keyed counter\n",
1163
+ " Parameters\n",
1164
+ " dCounter : dictionary of counters\n",
1165
+ " key : dictionary key\n",
1166
+ " \"\"\"\n",
1167
+ " addToKeyedCounter(dCounter, key, 1)\n",
1168
+ "\n",
1169
+ "def appendKeyedList(dList, key, elem):\n",
1170
+ " \"\"\"\n",
1171
+ " keyed list\n",
1172
+ " Parameters\n",
1173
+ " dList : dictionary of lists\n",
1174
+ " key : dictionary key\n",
1175
+ " elem : value to append\n",
1176
+ " \"\"\"\n",
1177
+ " curList = dList.get(key, [])\n",
1178
+ " curList.append(elem)\n",
1179
+ " dList[key] = curList\n",
1180
+ "\n",
1181
+ "def isNumber(st):\n",
1182
+ " \"\"\"\n",
1183
+ " Returns True is string is a number\n",
1184
+ " Parameters\n",
1185
+ " st : string value\n",
1186
+ " \"\"\"\n",
1187
+ " return st.replace('.','',1).isdigit()\n",
1188
+ "\n",
1189
+ "def removeNan(values):\n",
1190
+ " \"\"\"\n",
1191
+ " removes nan from list\n",
1192
+ " Parameters\n",
1193
+ " values : list data\n",
1194
+ " \"\"\"\n",
1195
+ " return list(filter(lambda v: not math.isnan(v), values))\n",
1196
+ "\n",
1197
+ "def fileRecGen(filePath, delim = \",\"):\n",
1198
+ " \"\"\"\n",
1199
+ " file record generator\n",
1200
+ " Parameters\n",
1201
+ " filePath ; file path\n",
1202
+ " delim : delemeter\n",
1203
+ " \"\"\"\n",
1204
+ " with open(filePath, \"r\") as fp:\n",
1205
+ " for line in fp:\t\n",
1206
+ " line = line[:-1]\n",
1207
+ " if delim is not None:\n",
1208
+ " line = line.split(delim)\n",
1209
+ " yield line\n",
1210
+ "\n",
1211
+ "def fileSelFieldsRecGen(dirPath, columns, delim=\",\"):\n",
1212
+ " \"\"\"\n",
1213
+ " file record generator given column indices \n",
1214
+ " Parameters\n",
1215
+ " filePath ; file path\n",
1216
+ " columns : column indexes as int array or coma separated string\n",
1217
+ " delim : delemeter\n",
1218
+ " \"\"\"\n",
1219
+ " if type(columns) == str:\n",
1220
+ " columns = strToIntArray(columns, delim)\n",
1221
+ " for rec in fileRecGen(dirPath, delim):\n",
1222
+ " extracted = extractList(rec, columns)\n",
1223
+ " yield extracted\n",
1224
+ "\n",
1225
+ "def fileFiltRecGen(filePath, filt, delim = \",\"):\n",
1226
+ " \"\"\"\n",
1227
+ " file record generator with row filter applied\n",
1228
+ " Parameters\n",
1229
+ " filePath ; file path\n",
1230
+ " filt : row filter\n",
1231
+ " delim : delemeter\n",
1232
+ " \"\"\"\n",
1233
+ " with open(filePath, \"r\") as fp:\n",
1234
+ " for line in fp:\t\n",
1235
+ " line = line[:-1]\n",
1236
+ " if delim is not None:\n",
1237
+ " line = line.split(delim)\n",
1238
+ " if filt(line):\n",
1239
+ " yield line\n",
1240
+ "\n",
1241
+ "def fileFiltSelFieldsRecGen(filePath, filt, columns, delim = \",\"):\n",
1242
+ " \"\"\"\n",
1243
+ " file record generator with row and column filter applied\n",
1244
+ " Parameters\n",
1245
+ " filePath ; file path\n",
1246
+ " filt : row filter\n",
1247
+ " columns : column indexes as int array or coma separated string\n",
1248
+ " delim : delemeter\n",
1249
+ " \"\"\"\n",
1250
+ " columns = strToIntArray(columns, delim)\n",
1251
+ " with open(filePath, \"r\") as fp:\n",
1252
+ " for line in fp:\t\n",
1253
+ " line = line[:-1]\n",
1254
+ " if delim is not None:\n",
1255
+ " line = line.split(delim)\n",
1256
+ " if filt(line):\n",
1257
+ " selected = extractList(line, columns)\n",
1258
+ " yield selected\n",
1259
+ "\n",
1260
+ "def fileTypedRecGen(filePath, ftypes, delim = \",\"):\n",
1261
+ " \"\"\"\n",
1262
+ " file typed record generator\n",
1263
+ " Parameters\n",
1264
+ " filePath ; file path\n",
1265
+ " ftypes : list of field types\n",
1266
+ " delim : delemeter\n",
1267
+ " \"\"\"\n",
1268
+ " with open(filePath, \"r\") as fp:\n",
1269
+ " for line in fp:\t\n",
1270
+ " line = line[:-1]\n",
1271
+ " line = line.split(delim)\n",
1272
+ " for i in range(0, len(ftypes), 2):\n",
1273
+ " ci = ftypes[i]\n",
1274
+ " dtype = ftypes[i+1]\n",
1275
+ " assertLesser(ci, len(line), \"index out of bound\")\n",
1276
+ " if dtype == \"int\":\n",
1277
+ " line[ci] = int(line[ci])\n",
1278
+ " elif dtype == \"float\":\n",
1279
+ " line[ci] = float(line[ci])\n",
1280
+ " else:\n",
1281
+ " exitWithMsg(\"invalid data type\")\n",
1282
+ " yield line\n",
1283
+ "\n",
1284
+ "def fileMutatedFieldsRecGen(dirPath, mutator, delim=\",\"):\n",
1285
+ " \"\"\"\n",
1286
+ " file record generator with some columns mutated \n",
1287
+ " Parameters\n",
1288
+ " dirPath ; file path\n",
1289
+ " mutator : row field mutator\n",
1290
+ " delim : delemeter\n",
1291
+ " \"\"\"\n",
1292
+ " for rec in fileRecGen(dirPath, delim):\n",
1293
+ " mutated = mutator(rec)\n",
1294
+ " yield mutated\n",
1295
+ "\n",
1296
+ "def tableSelFieldsFilter(tdata, columns):\n",
1297
+ " \"\"\"\n",
1298
+ " gets tabular data for selected columns \n",
1299
+ " Parameters\n",
1300
+ " tdata : tabular data\n",
1301
+ " columns : column indexes\n",
1302
+ " \"\"\"\n",
1303
+ " if areAllFieldsIncluded(tdata[0], columns):\n",
1304
+ " ntdata = tdata\n",
1305
+ " else:\n",
1306
+ " ntdata = list()\n",
1307
+ " for rec in tdata:\n",
1308
+ " #print(rec)\n",
1309
+ " #print(columns)\n",
1310
+ " nrec = extractList(rec, columns)\n",
1311
+ " ntdata.append(nrec)\n",
1312
+ " return ntdata\n",
1313
+ "\n",
1314
+ "\n",
1315
+ "def areAllFieldsIncluded(ldata, columns):\n",
1316
+ " \"\"\"\n",
1317
+ " return True id all indexes are in the columns\n",
1318
+ " Parameters\n",
1319
+ " ldata : list data\n",
1320
+ " columns : column indexes\n",
1321
+ " \"\"\"\n",
1322
+ " return list(range(len(ldata))) == columns\n",
1323
+ "\n",
1324
+ "def asIntList(items):\n",
1325
+ " \"\"\"\n",
1326
+ " returns int list\n",
1327
+ " Parameters\n",
1328
+ " items : list data\n",
1329
+ " \"\"\"\n",
1330
+ " return [int(i) for i in items]\n",
1331
+ "\n",
1332
+ "def asFloatList(items):\n",
1333
+ " \"\"\"\n",
1334
+ " returns float list\n",
1335
+ " Parameters\n",
1336
+ " items : list data\n",
1337
+ " \"\"\"\n",
1338
+ " return [float(i) for i in items]\n",
1339
+ "\n",
1340
+ "def pastTime(interval, unit):\n",
1341
+ " \"\"\"\n",
1342
+ " current and past time\n",
1343
+ " Parameters\n",
1344
+ " interval : time interval\n",
1345
+ " unit: time unit\n",
1346
+ " \"\"\"\n",
1347
+ " curTime = int(time.time())\n",
1348
+ " if unit == \"d\":\n",
1349
+ " pastTime = curTime - interval * secInDay\n",
1350
+ " elif unit == \"h\":\n",
1351
+ " pastTime = curTime - interval * secInHour\n",
1352
+ " elif unit == \"m\":\n",
1353
+ " pastTime = curTime - interval * secInMinute\n",
1354
+ " else:\n",
1355
+ " raise ValueError(\"invalid time unit \" + unit)\n",
1356
+ " return (curTime, pastTime)\n",
1357
+ "\n",
1358
+ "def minuteAlign(ts):\n",
1359
+ " \"\"\"\n",
1360
+ " minute aligned time\t\n",
1361
+ " Parameters\n",
1362
+ " ts : time stamp in sec\n",
1363
+ " \"\"\"\n",
1364
+ " return int((ts / secInMinute)) * secInMinute\n",
1365
+ "\n",
1366
+ "def multMinuteAlign(ts, min):\n",
1367
+ " \"\"\"\n",
1368
+ " multi minute aligned time\t\n",
1369
+ " Parameters\n",
1370
+ " ts : time stamp in sec\n",
1371
+ " min : minute value\n",
1372
+ " \"\"\"\n",
1373
+ " intv = secInMinute * min\n",
1374
+ " return int((ts / intv)) * intv\n",
1375
+ "\n",
1376
+ "def hourAlign(ts):\n",
1377
+ " \"\"\"\n",
1378
+ " hour aligned time\n",
1379
+ " Parameters\n",
1380
+ " ts : time stamp in sec\n",
1381
+ " \"\"\"\n",
1382
+ " return int((ts / secInHour)) * secInHour\n",
1383
+ "\n",
1384
+ "def hourOfDayAlign(ts, hour):\n",
1385
+ " \"\"\"\n",
1386
+ " hour of day aligned time\n",
1387
+ " Parameters\n",
1388
+ " ts : time stamp in sec\n",
1389
+ " hour : hour of day\n",
1390
+ " \"\"\"\n",
1391
+ " day = int(ts / secInDay)\n",
1392
+ " return (24 * day + hour) * secInHour\n",
1393
+ "\n",
1394
+ "def dayAlign(ts):\n",
1395
+ " \"\"\"\n",
1396
+ " day aligned time\n",
1397
+ " Parameters\n",
1398
+ " ts : time stamp in sec\n",
1399
+ " \"\"\"\n",
1400
+ " return int(ts / secInDay) * secInDay\n",
1401
+ "\n",
1402
+ "def timeAlign(ts, unit):\n",
1403
+ " \"\"\"\n",
1404
+ " boundary alignment of time\n",
1405
+ " Parameters\n",
1406
+ " ts : time stamp in sec\n",
1407
+ " unit : unit of time\n",
1408
+ " \"\"\"\n",
1409
+ " alignedTs = 0\n",
1410
+ " if unit == \"s\":\n",
1411
+ " alignedTs = ts\n",
1412
+ " elif unit == \"m\":\n",
1413
+ " alignedTs = minuteAlign(ts)\n",
1414
+ " elif unit == \"h\":\n",
1415
+ " alignedTs = hourAlign(ts)\n",
1416
+ " elif unit == \"d\":\n",
1417
+ " alignedTs = dayAlign(ts)\n",
1418
+ " else:\n",
1419
+ " raise ValueError(\"invalid time unit\")\n",
1420
+ " return alignedTs\n",
1421
+ "\n",
1422
+ "def monthOfYear(ts):\n",
1423
+ " \"\"\"\n",
1424
+ " month of year\n",
1425
+ " Parameters\n",
1426
+ " ts : time stamp in sec\n",
1427
+ " \"\"\"\n",
1428
+ " rem = ts % secInYear\n",
1429
+ " dow = int(rem / secInMonth)\n",
1430
+ " return dow\n",
1431
+ "\n",
1432
+ "def dayOfWeek(ts):\n",
1433
+ " \"\"\"\n",
1434
+ " day of week\n",
1435
+ " Parameters\n",
1436
+ " ts : time stamp in sec\n",
1437
+ " \"\"\"\n",
1438
+ " rem = ts % secInWeek\n",
1439
+ " dow = int(rem / secInDay)\n",
1440
+ " return dow\n",
1441
+ "\n",
1442
+ "def hourOfDay(ts):\n",
1443
+ " \"\"\"\n",
1444
+ " hour of day\n",
1445
+ " Parameters\n",
1446
+ " ts : time stamp in sec\n",
1447
+ " \"\"\"\n",
1448
+ " rem = ts % secInDay\n",
1449
+ " hod = int(rem / secInHour)\n",
1450
+ " return hod\n",
1451
+ "\n",
1452
+ "def processCmdLineArgs(expectedTypes, usage):\n",
1453
+ " \"\"\"\n",
1454
+ " process command line args and returns args as typed values\n",
1455
+ " Parameters\n",
1456
+ " expectedTypes : expected data types of arguments\n",
1457
+ " usage : usage message string\n",
1458
+ " \"\"\"\n",
1459
+ " args = []\n",
1460
+ " numComLineArgs = len(sys.argv)\n",
1461
+ " numExpected = len(expectedTypes)\n",
1462
+ " if (numComLineArgs - 1 == len(expectedTypes)):\n",
1463
+ " try:\n",
1464
+ " for i in range(0, numExpected):\n",
1465
+ " if (expectedTypes[i] == typeInt):\n",
1466
+ " args.append(int(sys.argv[i+1]))\n",
1467
+ " elif (expectedTypes[i] == typeFloat):\n",
1468
+ " args.append(float(sys.argv[i+1]))\n",
1469
+ " elif (expectedTypes[i] == typeString):\n",
1470
+ " args.append(sys.argv[i+1])\n",
1471
+ " except ValueError:\n",
1472
+ " print (\"expected number of command line arguments found but there is type mis match\")\n",
1473
+ " sys.exit(1)\n",
1474
+ " else:\n",
1475
+ " print (\"expected number of command line arguments not found\")\n",
1476
+ " print (usage)\n",
1477
+ " sys.exit(1)\n",
1478
+ " return args\n",
1479
+ "\n",
1480
+ "def mutateString(val, numMutate, ctype):\n",
1481
+ " \"\"\"\n",
1482
+ " mutate string multiple times\n",
1483
+ " Parameters\n",
1484
+ " val : string value\n",
1485
+ " numMutate : num of mutations\n",
1486
+ " ctype : type of character to mutate with\n",
1487
+ " \"\"\"\n",
1488
+ " mutations = set()\n",
1489
+ " count = 0\n",
1490
+ " while count < numMutate:\n",
1491
+ " j = randint(0, len(val)-1)\n",
1492
+ " if j not in mutations:\n",
1493
+ " if ctype == \"alpha\":\n",
1494
+ " ch = selectRandomFromList(alphaTokens)\n",
1495
+ " elif ctype == \"num\":\n",
1496
+ " ch = selectRandomFromList(numTokens)\n",
1497
+ " elif ctype == \"any\":\n",
1498
+ " ch = selectRandomFromList(tokens)\n",
1499
+ " val = val[:j] + ch + val[j+1:]\n",
1500
+ " mutations.add(j)\n",
1501
+ " count += 1\n",
1502
+ " return val\n",
1503
+ "\n",
1504
+ "def mutateList(values, numMutate, vmin, vmax):\n",
1505
+ " \"\"\"\n",
1506
+ " mutate list multiple times\n",
1507
+ " Parameters\n",
1508
+ " values : list value\n",
1509
+ " numMutate : num of mutations\n",
1510
+ " vmin : minimum of value range\n",
1511
+ " vmax : maximum of value range\n",
1512
+ " \"\"\"\n",
1513
+ " mutations = set()\n",
1514
+ " count = 0\n",
1515
+ " while count < numMutate:\n",
1516
+ " j = randint(0, len(values)-1)\n",
1517
+ " if j not in mutations:\n",
1518
+ " values[j] = np.random.uniform(vmin, vmax)\n",
1519
+ " count += 1\n",
1520
+ " return values\n",
1521
+ "\n",
1522
+ "\n",
1523
+ "def swap(values, first, second):\n",
1524
+ " \"\"\"\n",
1525
+ " swap two elements\n",
1526
+ " Parameters\n",
1527
+ " values : list value\n",
1528
+ " first : first swap position\n",
1529
+ " second : second swap position\n",
1530
+ " \"\"\"\n",
1531
+ " t = values[first]\n",
1532
+ " values[first] = values[second]\n",
1533
+ " values[second] = t\n",
1534
+ "\n",
1535
+ "def swapBetweenLists(values1, values2):\n",
1536
+ " \"\"\"\n",
1537
+ " swap two elements between 2 lists\n",
1538
+ " Parameters\n",
1539
+ " values1 : first list of values\n",
1540
+ " values2 : second list of values\n",
1541
+ " \"\"\"\n",
1542
+ " p1 = randint(0, len(values1)-1)\n",
1543
+ " p2 = randint(0, len(values2)-1)\n",
1544
+ " tmp = values1[p1]\n",
1545
+ " values1[p1] = values2[p2]\n",
1546
+ " values2[p2] = tmp\n",
1547
+ "\n",
1548
+ "def safeAppend(values, value):\n",
1549
+ " \"\"\"\n",
1550
+ " append only if not None\n",
1551
+ " Parameters\n",
1552
+ " values : list value\n",
1553
+ " value : value to append\n",
1554
+ " \"\"\"\n",
1555
+ " if value is not None:\n",
1556
+ " values.append(value)\n",
1557
+ "\n",
1558
+ "def getAllIndex(ldata, fldata):\n",
1559
+ " \"\"\"\n",
1560
+ " get ALL indexes of list elements\n",
1561
+ " Parameters\n",
1562
+ " ldata : list data to find index in\n",
1563
+ " fldata : list data for values for index look up\n",
1564
+ " \"\"\"\n",
1565
+ " return list(map(lambda e : fldata.index(e), ldata))\n",
1566
+ "\n",
1567
+ "def findIntersection(lOne, lTwo):\n",
1568
+ " \"\"\"\n",
1569
+ " find intersection elements between 2 lists\n",
1570
+ " Parameters\n",
1571
+ " lOne : first list of data\n",
1572
+ " lTwo : second list of data\n",
1573
+ " \"\"\"\n",
1574
+ " sOne = set(lOne)\n",
1575
+ " sTwo = set(lTwo)\n",
1576
+ " sInt = sOne.intersection(sTwo)\n",
1577
+ " return list(sInt)\n",
1578
+ "\n",
1579
+ "def isIntvOverlapped(rOne, rTwo):\n",
1580
+ " \"\"\"\n",
1581
+ " checks overlap between 2 intervals\n",
1582
+ " Parameters\n",
1583
+ " rOne : first interval boundaries\n",
1584
+ " rTwo : second interval boundaries\n",
1585
+ " \"\"\"\n",
1586
+ " clear = rOne[1] <= rTwo[0] or rOne[0] >= rTwo[1] \n",
1587
+ " return not clear\n",
1588
+ "\n",
1589
+ "def isIntvLess(rOne, rTwo):\n",
1590
+ " \"\"\"\n",
1591
+ " checks if first iterval is less than second\n",
1592
+ " Parameters\n",
1593
+ " rOne : first interval boundaries\n",
1594
+ " rTwo : second interval boundaries\n",
1595
+ " \"\"\"\n",
1596
+ " less = rOne[1] <= rTwo[0] \n",
1597
+ " return less\n",
1598
+ "\n",
1599
+ "def findRank(e, values):\n",
1600
+ " \"\"\"\n",
1601
+ " find rank of value in a list\n",
1602
+ " Parameters\n",
1603
+ " e : value to compare with\n",
1604
+ " values : list data\n",
1605
+ " \"\"\"\n",
1606
+ " count = 1\n",
1607
+ " for ve in values:\n",
1608
+ " if ve < e:\n",
1609
+ " count += 1\n",
1610
+ " return count\n",
1611
+ "\n",
1612
+ "def findRanks(toBeRanked, values):\n",
1613
+ " \"\"\"\n",
1614
+ " find ranks of values in one list in another list\n",
1615
+ " Parameters\n",
1616
+ " toBeRanked : list of values for which ranks are found\n",
1617
+ " values : list in which rank is found : \n",
1618
+ " \"\"\"\n",
1619
+ " return list(map(lambda e: findRank(e, values), toBeRanked))\n",
1620
+ "\n",
1621
+ "def formatFloat(prec, value, label = None):\n",
1622
+ " \"\"\"\n",
1623
+ " formats a float with optional label\n",
1624
+ " Parameters\n",
1625
+ " prec : precision\n",
1626
+ " value : data value\n",
1627
+ " label : label for data\n",
1628
+ " \"\"\"\n",
1629
+ " st = (label + \" \") if label else \"\"\n",
1630
+ " formatter = \"{:.\" + str(prec) + \"f}\" \n",
1631
+ " return st + formatter.format(value)\n",
1632
+ "\n",
1633
+ "def formatAny(value, label = None):\n",
1634
+ " \"\"\"\n",
1635
+ " formats any obkect with optional label\n",
1636
+ " Parameters\n",
1637
+ " value : data value\n",
1638
+ " label : label for data\n",
1639
+ " \"\"\"\n",
1640
+ " st = (label + \" \") if label else \"\"\n",
1641
+ " return st + str(value)\n",
1642
+ "\n",
1643
+ "def printList(values):\n",
1644
+ " \"\"\"\n",
1645
+ " pretty print list\n",
1646
+ " Parameters\n",
1647
+ " values : list of values\n",
1648
+ " \"\"\"\n",
1649
+ " for v in values:\n",
1650
+ " print(v)\n",
1651
+ "\n",
1652
+ "def printMap(values, klab, vlab, precision, offset=16):\n",
1653
+ " \"\"\"\n",
1654
+ " pretty print hash map\n",
1655
+ " Parameters\n",
1656
+ " values : dictionary of values\n",
1657
+ " klab : label for key\n",
1658
+ " vlab : label for value\n",
1659
+ " precision : precision\n",
1660
+ " offset : left justify offset\n",
1661
+ " \"\"\"\n",
1662
+ " print(klab.ljust(offset, \" \") + vlab)\n",
1663
+ " for k in values.keys():\n",
1664
+ " v = values[k]\n",
1665
+ " ks = toStr(k, precision).ljust(offset, \" \")\n",
1666
+ " vs = toStr(v, precision)\n",
1667
+ " print(ks + vs)\n",
1668
+ "\n",
1669
+ "def printPairList(values, lab1, lab2, precision, offset=16):\n",
1670
+ " \"\"\"\n",
1671
+ " pretty print list of pairs\n",
1672
+ " Parameters\n",
1673
+ " values : dictionary of values\n",
1674
+ " lab1 : first label\n",
1675
+ " lab2 : second label\n",
1676
+ " precision : precision\n",
1677
+ " offset : left justify offset\n",
1678
+ " \"\"\"\n",
1679
+ " print(lab1.ljust(offset, \" \") + lab2)\n",
1680
+ " for (v1, v2) in values:\n",
1681
+ " sv1 = toStr(v1, precision).ljust(offset, \" \")\n",
1682
+ " sv2 = toStr(v2, precision)\n",
1683
+ " print(sv1 + sv2)\n",
1684
+ "\n",
1685
+ "def createMap(*values):\n",
1686
+ " \"\"\"\n",
1687
+ " create disctionary with results\n",
1688
+ " Parameters\n",
1689
+ " values : sequence of key value pairs\n",
1690
+ " \"\"\"\n",
1691
+ " result = dict()\n",
1692
+ " for i in range(0, len(values), 2):\n",
1693
+ " result[values[i]] = values[i+1]\n",
1694
+ " return result\n",
1695
+ "\n",
1696
+ "def getColMinMax(table, col):\n",
1697
+ " \"\"\"\n",
1698
+ " return min, max values of a column\n",
1699
+ " Parameters\n",
1700
+ " table : tabular data\n",
1701
+ " col : column index\n",
1702
+ " \"\"\"\n",
1703
+ " vmin = None\n",
1704
+ " vmax = None\n",
1705
+ " for rec in table:\n",
1706
+ " value = rec[col]\n",
1707
+ " if vmin is None:\n",
1708
+ " vmin = value\n",
1709
+ " vmax = value\n",
1710
+ " else:\n",
1711
+ " if value < vmin:\n",
1712
+ " vmin = value\n",
1713
+ " elif value > vmax:\n",
1714
+ " vmax = value\n",
1715
+ " return (vmin, vmax, vmax - vmin)\n",
1716
+ "\n",
1717
+ "def createLogger(name, logFilePath, logLevName):\n",
1718
+ " \"\"\"\n",
1719
+ " creates logger\n",
1720
+ " Parameters\n",
1721
+ " name : logger name\n",
1722
+ " logFilePath : log file path\n",
1723
+ " logLevName : log level\n",
1724
+ " \"\"\"\n",
1725
+ " logger = logging.getLogger(name)\n",
1726
+ " fHandler = logging.handlers.RotatingFileHandler(logFilePath, maxBytes=1048576, backupCount=4)\n",
1727
+ " logLev = logLevName.lower()\n",
1728
+ " if logLev == \"debug\":\n",
1729
+ " logLevel = logging.DEBUG\n",
1730
+ " elif logLev == \"info\":\n",
1731
+ " logLevel = logging.INFO\n",
1732
+ " elif logLev == \"warning\":\n",
1733
+ " logLevel = logging.WARNING\n",
1734
+ " elif logLev == \"error\":\n",
1735
+ " logLevel = logging.ERROR\n",
1736
+ " elif logLev == \"critical\":\n",
1737
+ " logLevel = logging.CRITICAL\n",
1738
+ " else:\n",
1739
+ " raise ValueError(\"invalid log level name \" + logLevelName)\n",
1740
+ " fHandler.setLevel(logLevel)\n",
1741
+ " fFormat = logging.Formatter(\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\")\n",
1742
+ " fHandler.setFormatter(fFormat)\n",
1743
+ " logger.addHandler(fHandler)\n",
1744
+ " logger.setLevel(logLevel)\n",
1745
+ " return logger\n",
1746
+ "\n",
1747
+ "@contextmanager\n",
1748
+ "def suppressStdout():\n",
1749
+ " \"\"\"\n",
1750
+ " suppress stdout\n",
1751
+ " Parameters\n",
1752
+ " \"\"\"\n",
1753
+ " with open(os.devnull, \"w\") as devnull:\n",
1754
+ " oldStdout = sys.stdout\n",
1755
+ " sys.stdout = devnull\n",
1756
+ " try: \n",
1757
+ " yield\n",
1758
+ " finally:\n",
1759
+ " sys.stdout = oldStdout\n",
1760
+ "\n",
1761
+ "def exitWithMsg(msg):\n",
1762
+ " \"\"\"\n",
1763
+ " print message and exit\n",
1764
+ " Parameters\n",
1765
+ " msg : message\n",
1766
+ " \"\"\"\n",
1767
+ " print(msg + \" -- quitting\")\n",
1768
+ " sys.exit(0)\n",
1769
+ "\n",
1770
+ "def drawLine(data, yscale=None):\n",
1771
+ " \"\"\"\n",
1772
+ " line plot\n",
1773
+ " Parameters\n",
1774
+ " data : list data\n",
1775
+ " yscale : y axis scale\n",
1776
+ " \"\"\"\n",
1777
+ " plt.plot(data)\n",
1778
+ " if yscale:\n",
1779
+ " step = int(yscale / 10)\n",
1780
+ " step = int(step / 10) * 10\n",
1781
+ " plt.yticks(range(0, yscale, step))\n",
1782
+ " plt.show()\n",
1783
+ "\n",
1784
+ "def drawPlot(x, y, xlabel, ylabel):\n",
1785
+ " \"\"\"\n",
1786
+ " line plot\n",
1787
+ " Parameters\n",
1788
+ " x : x values\n",
1789
+ " y : y values\n",
1790
+ " xlabel : x axis label\n",
1791
+ " ylabel : y axis label\n",
1792
+ " \"\"\"\n",
1793
+ " plt.plot(x,y)\n",
1794
+ " plt.xlabel(xlabel)\n",
1795
+ " plt.ylabel(ylabel)\n",
1796
+ " plt.show()\n",
1797
+ "\n",
1798
+ "def drawPairPlot(x, y1, y2, xlabel,ylabel, y1label, y2label):\n",
1799
+ " \"\"\"\n",
1800
+ " line plot of 2 lines\n",
1801
+ " Parameters\n",
1802
+ " x : x values\n",
1803
+ " y1 : first y values\n",
1804
+ " y2 : second y values\n",
1805
+ " xlabel : x labbel\n",
1806
+ " ylabel : y label\n",
1807
+ " y1label : first plot label\n",
1808
+ " y2label : second plot label\n",
1809
+ " \"\"\"\n",
1810
+ " plt.plot(x, y1, label = y1label)\n",
1811
+ " plt.plot(x, y2, label = y2label)\n",
1812
+ " plt.xlabel(xlabel)\n",
1813
+ " plt.ylabel(ylabel)\n",
1814
+ " plt.legend()\n",
1815
+ " plt.show()\n",
1816
+ "\n",
1817
+ "def drawHist(ldata, myTitle, myXlabel, myYlabel, nbins=10):\n",
1818
+ " \"\"\"\n",
1819
+ " draw histogram\n",
1820
+ " Parameters\n",
1821
+ " ldata : list data\n",
1822
+ " myTitle : title\n",
1823
+ " myXlabel : x label\n",
1824
+ " myYlabel : y label \n",
1825
+ " nbins : num of bins\n",
1826
+ " \"\"\"\n",
1827
+ " plt.hist(ldata, bins=nbins, density=True)\n",
1828
+ " plt.title(myTitle)\n",
1829
+ " plt.xlabel(myXlabel)\n",
1830
+ " plt.ylabel(myYlabel)\n",
1831
+ " plt.show()\n",
1832
+ "\n",
1833
+ "def saveObject(obj, filePath):\n",
1834
+ " \"\"\"\n",
1835
+ " saves an object\n",
1836
+ " Parameters\n",
1837
+ " obj : object\n",
1838
+ " filePath : file path for saved object\n",
1839
+ " \"\"\"\n",
1840
+ " with open(filePath, \"wb\") as outfile:\n",
1841
+ " pickle.dump(obj,outfile)\n",
1842
+ "\n",
1843
+ "def restoreObject(filePath):\n",
1844
+ " \"\"\"\n",
1845
+ " restores an object\n",
1846
+ " Parameters\n",
1847
+ " filePath : file path to restore object from\n",
1848
+ " \"\"\"\n",
1849
+ " with open(filePath, \"rb\") as infile:\n",
1850
+ " obj = pickle.load(infile)\n",
1851
+ " return obj\n",
1852
+ "\n",
1853
+ "def isNumeric(data):\n",
1854
+ " \"\"\"\n",
1855
+ " true if all elements int or float\n",
1856
+ " Parameters\n",
1857
+ " data : numeric data list\n",
1858
+ " \"\"\"\n",
1859
+ " if type(data) == list or type(data) == np.ndarray:\n",
1860
+ " col = pd.Series(data)\n",
1861
+ " else:\n",
1862
+ " col = data\n",
1863
+ " return col.dtype == np.int32 or col.dtype == np.int64 or col.dtype == np.float32 or col.dtype == np.float64\n",
1864
+ "\n",
1865
+ "def isInteger(data):\n",
1866
+ " \"\"\"\n",
1867
+ " true if all elements int \n",
1868
+ " Parameters\n",
1869
+ " data : numeric data list\n",
1870
+ " \"\"\"\n",
1871
+ " if type(data) == list or type(data) == np.ndarray:\n",
1872
+ " col = pd.Series(data)\n",
1873
+ " else:\n",
1874
+ " col = data\n",
1875
+ " return col.dtype == np.int32 or col.dtype == np.int64\n",
1876
+ "\n",
1877
+ "def isFloat(data):\n",
1878
+ " \"\"\"\n",
1879
+ " true if all elements float\n",
1880
+ " Parameters\n",
1881
+ " data : numeric data list\n",
1882
+ " \"\"\"\n",
1883
+ " if type(data) == list or type(data) == np.ndarray:\n",
1884
+ " col = pd.Series(data)\n",
1885
+ " else:\n",
1886
+ " col = data\n",
1887
+ " return col.dtype == np.float32 or col.dtype == np.float64\n",
1888
+ "\n",
1889
+ "def isBinary(data):\n",
1890
+ " \"\"\"\n",
1891
+ " true if all elements either 0 or 1\n",
1892
+ " Parameters\n",
1893
+ " data : binary data\n",
1894
+ " \"\"\"\n",
1895
+ " re = next((d for d in data if not (type(d) == int and (d == 0 or d == 1))), None)\n",
1896
+ " return (re is None)\n",
1897
+ "\n",
1898
+ "def isCategorical(data):\n",
1899
+ " \"\"\"\n",
1900
+ " true if all elements int or string\n",
1901
+ " Parameters\n",
1902
+ " data : data value\n",
1903
+ " \"\"\"\n",
1904
+ " re = next((d for d in data if not (type(d) == int or type(d) == str)), None)\n",
1905
+ " return (re is None)\n",
1906
+ "\n",
1907
+ "def assertEqual(value, veq, msg):\n",
1908
+ " \"\"\"\n",
1909
+ " assert equal to\n",
1910
+ " Parameters\n",
1911
+ " value : value\n",
1912
+ " veq : value to be equated with\n",
1913
+ " msg : error msg\n",
1914
+ " \"\"\"\n",
1915
+ " assert value == veq , msg\n",
1916
+ "\n",
1917
+ "def assertGreater(value, vmin, msg):\n",
1918
+ " \"\"\"\n",
1919
+ " assert greater than \n",
1920
+ " Parameters\n",
1921
+ " value : value\n",
1922
+ " vmin : minimum value\n",
1923
+ " msg : error msg\n",
1924
+ " \"\"\"\n",
1925
+ " assert value > vmin , msg\n",
1926
+ "\n",
1927
+ "def assertGreaterEqual(value, vmin, msg):\n",
1928
+ " \"\"\"\n",
1929
+ " assert greater than \n",
1930
+ " Parameters\n",
1931
+ " value : value\n",
1932
+ " vmin : minimum value\n",
1933
+ " msg : error msg\n",
1934
+ " \"\"\"\n",
1935
+ " assert value >= vmin , msg\n",
1936
+ "\n",
1937
+ "def assertLesser(value, vmax, msg):\n",
1938
+ " \"\"\"\n",
1939
+ " assert less than\n",
1940
+ " Parameters\n",
1941
+ " value : value\n",
1942
+ " vmax : maximum value\n",
1943
+ " msg : error msg\n",
1944
+ " \"\"\"\n",
1945
+ " assert value < vmax , msg\n",
1946
+ "\n",
1947
+ "def assertLesserEqual(value, vmax, msg):\n",
1948
+ " \"\"\"\n",
1949
+ " assert less than\n",
1950
+ " Parameters\n",
1951
+ " value : value\n",
1952
+ " vmax : maximum value\n",
1953
+ " msg : error msg\n",
1954
+ " \"\"\"\n",
1955
+ " assert value <= vmax , msg\n",
1956
+ "\n",
1957
+ "def assertWithinRange(value, vmin, vmax, msg):\n",
1958
+ " \"\"\"\n",
1959
+ " assert within range\n",
1960
+ " Parameters\n",
1961
+ " value : value\n",
1962
+ " vmin : minimum value\n",
1963
+ " vmax : maximum value\n",
1964
+ " msg : error msg\n",
1965
+ " \"\"\"\n",
1966
+ " assert value >= vmin and value <= vmax, msg\n",
1967
+ "\n",
1968
+ "def assertInList(value, values, msg):\n",
1969
+ " \"\"\"\n",
1970
+ " assert contains in a list\n",
1971
+ " Parameters\n",
1972
+ " value ; balue to check for inclusion\n",
1973
+ " values : list data\n",
1974
+ " msg : error msg\n",
1975
+ " \"\"\"\n",
1976
+ " assert value in values, msg\n",
1977
+ "\n",
1978
+ "def maxListDist(l1, l2):\n",
1979
+ " \"\"\"\n",
1980
+ " maximum list element difference between 2 lists\n",
1981
+ " Parameters\n",
1982
+ " l1 : first list data\n",
1983
+ " l2 : second list data\n",
1984
+ " \"\"\"\n",
1985
+ " dist = max(list(map(lambda v : abs(v[0] - v[1]), zip(l1, l2))))\t\n",
1986
+ " return dist\n",
1987
+ "\n",
1988
+ "def fileLineCount(fPath):\n",
1989
+ " \"\"\" \n",
1990
+ " number of lines ina file \n",
1991
+ " Parameters\n",
1992
+ " fPath : file path\n",
1993
+ " \"\"\"\n",
1994
+ " with open(fPath) as f:\n",
1995
+ " for i, li in enumerate(f):\n",
1996
+ " pass\n",
1997
+ " return (i + 1)\n",
1998
+ "\n",
1999
+ "def getAlphaNumCharCount(sdata):\n",
2000
+ " \"\"\" \n",
2001
+ " number of alphabetic and numeric charcters in a string \n",
2002
+ " Parameters\n",
2003
+ " sdata : string data\n",
2004
+ " \"\"\"\n",
2005
+ " acount = 0\n",
2006
+ " ncount = 0\n",
2007
+ " scount = 0\n",
2008
+ " ocount = 0\n",
2009
+ " assertEqual(type(sdata), str, \"input must be string\")\n",
2010
+ " for c in sdata:\n",
2011
+ " if c.isnumeric():\n",
2012
+ " ncount += 1\n",
2013
+ " elif c.isalpha():\n",
2014
+ " acount += 1\n",
2015
+ " elif c.isspace():\n",
2016
+ " scount += 1\n",
2017
+ " else:\n",
2018
+ " ocount += 1\n",
2019
+ " r = (acount, ncount, ocount)\n",
2020
+ " return r\n",
2021
+ "\n",
2022
+ "class StepFunction:\n",
2023
+ " \"\"\"\n",
2024
+ " step function\n",
2025
+ " Parameters\n",
2026
+ " \"\"\"\n",
2027
+ " def __init__(self, *values):\n",
2028
+ " \"\"\"\n",
2029
+ " initilizer\n",
2030
+ "\n",
2031
+ " Parameters\n",
2032
+ " values : list of tuples, wich each tuple containing 2 x values and corresponding y value\n",
2033
+ " \"\"\"\n",
2034
+ " self.points = values\n",
2035
+ "\n",
2036
+ " def find(self, x):\n",
2037
+ " \"\"\"\n",
2038
+ " finds step function value\n",
2039
+ "\n",
2040
+ " Parameters\n",
2041
+ " x : x value\n",
2042
+ " \"\"\"\n",
2043
+ " found = False\n",
2044
+ " y = 0\n",
2045
+ " for p in self.points:\n",
2046
+ " if (x >= p[0] and x < p[1]):\n",
2047
+ " y = p[2]\n",
2048
+ " found = True\n",
2049
+ " break\n",
2050
+ "\n",
2051
+ " if not found:\n",
2052
+ " l = len(self.points)\n",
2053
+ " if (x < self.points[0][0]):\n",
2054
+ " y = self.points[0][2]\n",
2055
+ " elif (x > self.points[l-1][1]):\n",
2056
+ " y = self.points[l-1][2]\n",
2057
+ " return y\n",
2058
+ "\n",
2059
+ "\n",
2060
+ "class DummyVarGenerator:\n",
2061
+ " \"\"\"\n",
2062
+ " dummy variable generator for categorical variable\n",
2063
+ " \"\"\"\n",
2064
+ " def __init__(self, rowSize, catValues, trueVal, falseVal, delim=None):\n",
2065
+ " \"\"\"\n",
2066
+ " initilizer\n",
2067
+ "\n",
2068
+ " Parameters\n",
2069
+ " rowSize : row size\n",
2070
+ " catValues : dictionary with field index as key and list of categorical values as value\n",
2071
+ " trueVal : true value, typically \"1\"\n",
2072
+ " falseval : false value , typically \"0\"\n",
2073
+ " delim : field delemeter\n",
2074
+ " \"\"\"\n",
2075
+ " self.rowSize = rowSize\n",
2076
+ " self.catValues = catValues\n",
2077
+ " numCatVar = len(catValues)\n",
2078
+ " colCount = 0\n",
2079
+ " for v in self.catValues.values():\n",
2080
+ " colCount += len(v)\n",
2081
+ " self.newRowSize = rowSize - numCatVar + colCount\n",
2082
+ " #print (\"new row size {}\".format(self.newRowSize))\n",
2083
+ " self.trueVal = trueVal\n",
2084
+ " self.falseVal = falseVal\n",
2085
+ " self.delim = delim\n",
2086
+ "\n",
2087
+ " def processRow(self, row):\n",
2088
+ " \"\"\"\n",
2089
+ " encodes categorical variables, returning as delemeter separate dstring or list\n",
2090
+ "\n",
2091
+ " Parameters\n",
2092
+ " row : row either delemeter separated string or list\n",
2093
+ " \"\"\"\n",
2094
+ " if self.delim is not None:\n",
2095
+ " rowArr = row.split(self.delim)\n",
2096
+ " msg = \"row does not have expected number of columns found \" + str(len(rowArr)) + \" expected \" + str(self.rowSize)\n",
2097
+ " assert len(rowArr) == self.rowSize, msg\n",
2098
+ " else:\n",
2099
+ " rowArr = row\n",
2100
+ "\n",
2101
+ " newRowArr = []\n",
2102
+ " for i in range(len(rowArr)):\n",
2103
+ " curVal = rowArr[i]\n",
2104
+ " if (i in self.catValues):\n",
2105
+ " values = self.catValues[i]\n",
2106
+ " for val in values:\n",
2107
+ " if val == curVal:\n",
2108
+ " newVal = self.trueVal\n",
2109
+ " else:\n",
2110
+ " newVal = self.falseVal\n",
2111
+ " newRowArr.append(newVal)\n",
2112
+ " else:\n",
2113
+ " newRowArr.append(curVal)\n",
2114
+ " assert len(newRowArr) == self.newRowSize, \"invalid new row size \" + str(len(newRowArr)) + \" expected \" + str(self.newRowSize)\n",
2115
+ " encRow = self.delim.join(newRowArr) if self.delim is not None else newRowArr\n",
2116
+ " return encRow\n"
2117
+ ]
2118
+ }
2119
+ ],
2120
+ "metadata": {
2121
+ "kernelspec": {
2122
+ "display_name": "Python 3 (ipykernel)",
2123
+ "language": "python",
2124
+ "name": "python3"
2125
+ },
2126
+ "language_info": {
2127
+ "codemirror_mode": {
2128
+ "name": "ipython",
2129
+ "version": 3
2130
+ },
2131
+ "file_extension": ".py",
2132
+ "mimetype": "text/x-python",
2133
+ "name": "python",
2134
+ "nbconvert_exporter": "python",
2135
+ "pygments_lexer": "ipython3",
2136
+ "version": "3.9.12"
2137
+ }
2138
+ },
2139
+ "nbformat": 4,
2140
+ "nbformat_minor": 5
2141
+ }
model/tnn/pdamb.mod ADDED
Binary file (1.45 kB). View file