Joshua Lansford committed on
Commit
9997114
·
1 Parent(s): 370675b

It is now in an object, trains and executes.

Browse files
Files changed (2) hide show
  1. .vscode/launch.json +51 -1
  2. transmorgrify.py +238 -36
.vscode/launch.json CHANGED
@@ -25,8 +25,23 @@
25
  "--a_header", "English",
26
  "--b_header", "Phonetic",
27
  "--device", "0:1",
28
- "--model", "phonetics_forwar.tm"
29
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  },{
31
  "name": "Train short phonetic 4000 gpu",
32
  "type": "python",
@@ -42,6 +57,41 @@
42
  "--device", "0:1",
43
  "--model", "phonetics_small.tm"
44
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  }
46
  ]
47
  }
 
25
  "--a_header", "English",
26
  "--b_header", "Phonetic",
27
  "--device", "0:1",
28
+ "--model", "phonetics_forward.tm"
29
  ]
30
+ },{
31
+ "name": "Train reverse phonetic 4000 gpu",
32
+ "type": "python",
33
+ "request": "launch",
34
+ "program": "transmorgrify.py",
35
+ "console": "integratedTerminal",
36
+ "justMyCode": true,
37
+ "args": [
38
+ "--train",
39
+ "--in_csv", "/home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv",
40
+ "--b_header", "English",
41
+ "--a_header", "Phonetic",
42
+ "--device", "0:1",
43
+ "--model", "phonetics_backwards.tm"
44
+ ]
45
  },{
46
  "name": "Train short phonetic 4000 gpu",
47
  "type": "python",
 
57
  "--device", "0:1",
58
  "--model", "phonetics_small.tm"
59
  ]
60
+ },{
61
+ "name": "Execute phonetic gpu",
62
+ "type": "python",
63
+ "request": "launch",
64
+ "program": "transmorgrify.py",
65
+ "console": "integratedTerminal",
66
+ "justMyCode": true,
67
+ "args": [
68
+ "--execute",
69
+ "--in_csv", "/home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv",
70
+ "--out_csv", "./phonetic_out.csv",
71
+ "--a_header", "English",
72
+ "--b_header", "Phonetic",
73
+ "--device", "0:1",
74
+ "--model", "phonetics_forward.tm",
75
+ "--verbose",
76
+ ]
77
+ },{
78
+ "name": "short Execute phonetic gpu",
79
+ "type": "python",
80
+ "request": "launch",
81
+ "program": "transmorgrify.py",
82
+ "console": "integratedTerminal",
83
+ "justMyCode": true,
84
+ "args": [
85
+ "--execute",
86
+ "--in_csv", "/home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic_short.csv",
87
+ "--out_csv", "./phonetic_out.csv",
88
+ "--a_header", "English",
89
+ "--b_header", "Phonetic",
90
+ "--device", "0:1",
91
+ "--model", "phonetics_forward.tm",
92
+ "--verbose",
93
+ "--include_stats",
94
+ ]
95
  }
96
  ]
97
  }
transmorgrify.py CHANGED
@@ -12,7 +12,73 @@ DELETE_FROM = 1
12
  INSERT_TO = 2
13
  START = 3
14
 
 
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  def _list_trace( trace ):
18
  if trace.parrent is None:
@@ -270,19 +336,7 @@ def _train_catboost( X, y, iterations, device, verbose, model_piece, learning_ra
270
 
271
  return model
272
 
273
- def _train_reconstruct_models( from_sentances, to_sentances, iterations, device, num_pre_context_chars, num_post_context_chars, verbose ):
274
-
275
- X,Y = _parse_for_training( from_sentances, to_sentances, num_pre_context_chars=num_pre_context_chars, num_post_context_chars=num_post_context_chars )
276
-
277
- #train and save the action_model
278
- action_model = _train_catboost( X, Y['action'], iterations, verbose=verbose, device=device, model_piece='action' )
279
 
280
- #and the char model
281
- #slice through where only the action is insert.
282
- insert_indexes = Y['action'] == INSERT_TO
283
- char_model = _train_catboost( X[insert_indexes], Y['char'][insert_indexes], iterations, verbose=verbose, device=device, model_piece='char' )
284
-
285
- return action_model, char_model
286
 
287
  def _mktemp():
288
  #I know mktemp exists in the library but it has been depricated suggesting using
@@ -293,7 +347,103 @@ def _mktemp():
293
  number += 1
294
  return f".temp_{number}~"
295
 
296
- def train( in_csv, a_header, b_header, model, iterations, device, leading_context,trailing_context, train_percentage, verbose ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
  if verbose: print( "loading csv" )
298
  full_data = pd.read_csv( in_csv )
299
 
@@ -302,33 +452,66 @@ def train( in_csv, a_header, b_header, model, iterations, device, leading_contex
302
 
303
  if verbose: print( "parcing data for training" )
304
 
305
- action_model, char_model = _train_reconstruct_models( from_sentances=train_data[a_header],
 
 
 
306
  to_sentances=train_data[b_header],
307
  iterations = iterations,
308
  device = device,
309
- num_pre_context_chars = leading_context,
310
- num_post_context_chars = trailing_context,
311
  verbose=verbose,
312
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
 
314
- temp_action_filename = _mktemp()
315
- action_model.save_model( temp_action_filename )
316
- temp_char_filename = _mktemp()
317
- char_model.save_model( temp_char_filename )
318
-
319
- with zipfile.ZipFile( model, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9 ) as myzip:
320
- with myzip.open( 'params.json', mode='w' ) as out:
321
- out.write( json.dumps({
322
- 'version': 1,
323
- 'leading_context': leading_context,
324
- 'trailing_context': trailing_context,
325
- 'iterations': iterations,
326
- }).encode())
327
- myzip.write( temp_action_filename, "action.cb" )
328
- myzip.write( temp_char_filename, "char.cb" )
329
-
330
- os.unlink( temp_action_filename )
331
- os.unlink( temp_char_filename )
332
 
333
  def main():
334
  parser = argparse.ArgumentParser(
@@ -347,6 +530,7 @@ def main():
347
  parser.add_argument('-p', '--train_percentage', help="The percentage of data to train on, leaving the rest for testing.")
348
  parser.add_argument('-e', '--execute', action='store_true', help='Use an existing trained model.')
349
  parser.add_argument('-v', '--verbose', action='store_true', help='Talks alot?' )
 
350
 
351
 
352
  args = parser.parse_args()
@@ -355,7 +539,6 @@ def main():
355
 
356
 
357
  if args.train:
358
-
359
  train_percentage = args.train_percentage
360
  if train_percentage is None:
361
  if args.execute:
@@ -375,7 +558,26 @@ def main():
375
  verbose=args.verbose,
376
  )
377
 
378
- #print(args)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
379
 
380
 
381
  if __name__ == '__main__':
 
12
  INSERT_TO = 2
13
  START = 3
14
 
15
+ FILE_VERSION = 1
16
 
17
class Transmorgrifyer:
    """Sentence rewriter backed by two CatBoost models.

    One model predicts the next edit action and the other predicts which
    character to insert when that action is INSERT_TO.

    Attributes (set by train() or load()):
        action_model:     model used to pick the next edit action.
        char_model:       model used to pick the inserted character.
        leading_context:  passed on as num_pre_context_chars.
        trailing_context: passed on as num_post_context_chars.
        iterations:       iteration count the models were trained with.
    """

    def train( self, from_sentances, to_sentances, iterations, device, trailing_context, leading_context, verbose ):
        """Fit both models on paired source/target sentences.

        from_sentances / to_sentances are handed unchanged to
        _parse_for_training; iterations, device and verbose are handed to
        _train_catboost.
        """
        X, Y = _parse_for_training(
            from_sentances,
            to_sentances,
            num_pre_context_chars=leading_context,
            num_post_context_chars=trailing_context,
        )

        # The action model is trained on every parsed row.
        self.action_model = _train_catboost( X, Y['action'], iterations, verbose=verbose, device=device, model_piece='action' )

        # The char model only sees rows whose action is an insert.
        # NOTE(review): assumes X and Y support boolean-mask indexing
        # (pandas-like) -- confirm against _parse_for_training.
        insert_indexes = Y['action'] == INSERT_TO
        self.char_model = _train_catboost( X[insert_indexes], Y['char'][insert_indexes], iterations, verbose=verbose, device=device, model_piece='char' )

        self.trailing_context = trailing_context
        self.leading_context = leading_context
        self.iterations = iterations

    def save( self, model ):
        """Write params.json, action.cb and char.cb into the zip file `model`."""
        # Each model is saved to a scratch file first and then copied into
        # the archive; the same scratch path is reused for both.
        temp_filename = _mktemp()
        try:
            with zipfile.ZipFile( model, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9 ) as myzip:
                with myzip.open( 'params.json', mode='w' ) as out:
                    out.write( json.dumps({
                        'version': FILE_VERSION,
                        'leading_context': self.leading_context,
                        'trailing_context': self.trailing_context,
                        'iterations': self.iterations,
                    }).encode() )
                self.action_model.save_model( temp_filename )
                myzip.write( temp_filename, "action.cb" )
                self.char_model.save_model( temp_filename )
                myzip.write( temp_filename, "char.cb" )
        finally:
            # Remove the scratch file even when saving or zipping failed.
            if os.path.exists( temp_filename ):
                os.unlink( temp_filename )

    @staticmethod
    def _load_member( archive, member_name, temp_filename ):
        """Copy one archive member to temp_filename and load it as a CatBoost model."""
        with archive.open( member_name ) as fin:
            with open( temp_filename, "wb" ) as fout:
                fout.write( fin.read() )
        return CatBoostClassifier().load_model( temp_filename )

    def load( self, model ):
        """Restore parameters and both models from the zip file `model`.

        Raises:
            Exception: if the stored version is greater than FILE_VERSION.
        """
        with zipfile.ZipFile( model, mode='r' ) as archive:
            with archive.open( 'params.json' ) as fin:
                params = json.loads( fin.read().decode() )
            if params['version'] > FILE_VERSION:
                raise Exception( f"Version {params['version']} greater than {FILE_VERSION}" )
            self.leading_context = params['leading_context']
            self.trailing_context = params['trailing_context']
            self.iterations = params['iterations']

            temp_filename = _mktemp()
            try:
                self.action_model = self._load_member( archive, 'action.cb', temp_filename )
                self.char_model = self._load_member( archive, 'char.cb', temp_filename )
            finally:
                # Remove the scratch file even when extraction or loading failed.
                if os.path.exists( temp_filename ):
                    os.unlink( temp_filename )

    def execute( self, from_sentances, verbose=False ):
        """Lazily yield one reconstructed sentence per entry of from_sentances.

        This is a generator: nothing is computed until it is iterated.
        With verbose set, a progress line is printed after every tenth
        result has been consumed; that line calls len(from_sentances).
        """
        for i, from_sentance in enumerate( from_sentances ):
            yield _do_reconstruct(
                action_model=self.action_model,
                char_model=self.char_model,
                text=from_sentance,
                num_pre_context_chars=self.leading_context,
                num_post_context_chars=self.trailing_context )
            if verbose and i % 10 == 0:
                print( f"{i} of {len(from_sentances)}" )
82
 
83
  def _list_trace( trace ):
84
  if trace.parrent is None:
 
336
 
337
  return model
338
 
 
 
 
 
 
 
339
 
 
 
 
 
 
 
340
 
341
  def _mktemp():
342
  #I know mktemp exists in the library but it has been depricated suggesting using
 
347
  number += 1
348
  return f".temp_{number}~"
349
 
350
+
351
+ def _do_reconstruct( action_model, char_model, text, num_pre_context_chars, num_post_context_chars ):
352
+ # result = ""
353
+ # for i in range(len(text)):
354
+ # pre_context = ( (" " * num_pre_context_chars) + result[max(0,len(result)-num_pre_context_chars):])[-num_pre_context_chars:]
355
+ # post_context = (text[i:min(len(text),i+num_post_context_chars)] + (" " * num_post_context_chars))[:num_post_context_chars]
356
+ # full_context = pre_context + post_context
357
+ # context_as_dictionary = { 'c'+str(c):[full_context[c]] for c in range(len(full_context)) }
358
+ # context_as_pd = pd.DataFrame( context_as_dictionary )
359
+
360
+ # model_result = model.predict( context_as_pd )[0]
361
+
362
+ # if not quite and len( result ) % 500 == 0: print( "%" + str(i*100/len(text))[:4] + " " + result[-100:])
363
+
364
+ # if model_result: result += " "
365
+ # result += text[i]
366
+
367
+ # pass
368
+ # return result
369
+
370
+ #test for nan.
371
+ if text != text: text = ''
372
+
373
+ working_from = text
374
+ working_to = ""
375
+ used_from = ""
376
+ continuous_added = 0
377
+ continuous_dropped = 0
378
+ while working_from and len(working_to) < 3*len(text) and (len(working_to) < 5 or working_to[-5:] != (working_to[-1] * 5)):
379
+ from_context = (working_from + (" " * num_post_context_chars))[:num_post_context_chars]
380
+ to_context = ((" " * num_pre_context_chars) + working_to )[-num_pre_context_chars:]
381
+ used_context = ((" " * num_pre_context_chars) + used_from )[-num_pre_context_chars:]
382
+
383
+ #construct the context.
384
+ context_as_dictionary = {}
385
+ #from_context
386
+ for i in range( num_post_context_chars ):
387
+ context_as_dictionary[ f"f{i}" ] = [from_context[i]]
388
+ #to_context
389
+ for i in range( num_pre_context_chars ):
390
+ context_as_dictionary[ f"t{i}" ] = [to_context[i]]
391
+ #used_context
392
+ for i in range( num_pre_context_chars ):
393
+ context_as_dictionary[ f"u{i}" ] = [used_context[i]]
394
+ #these two things.
395
+ context_as_dictionary["continuous_added"] = [continuous_added]
396
+ context_as_dictionary["continuous_dropped"] = [continuous_dropped]
397
+
398
+ #make it a pandas.
399
+ context_as_pd = pd.DataFrame( context_as_dictionary )
400
+
401
+ #run the model
402
+ action_model_result = action_model.predict( context_as_pd )[0][0]
403
+
404
+ if action_model_result == START:
405
+ pass
406
+ elif action_model_result == INSERT_TO:
407
+ #for an insert ask the char model what to insert
408
+ char_model_result = char_model.predict( context_as_pd )[0][0]
409
+
410
+ working_to += char_model_result
411
+ continuous_added += 1
412
+ continuous_dropped = 0
413
+ elif action_model_result == DELETE_FROM:
414
+ used_from += working_from[0]
415
+ working_from = working_from[1:]
416
+ continuous_added = 0
417
+ continuous_dropped += 1
418
+ elif action_model_result == MATCH:
419
+ used_from += working_from[0]
420
+ working_to += working_from[0]
421
+ working_from = working_from[1:]
422
+ continuous_added = 0
423
+ continuous_dropped = 0
424
+
425
+ return working_to
426
+
427
+
428
+ #edit distance from https://stackoverflow.com/a/32558749/1419054
429
+ def _levenshteinDistance(s1, s2):
430
+ if s1 != s1: s1 = ''
431
+ if s2 != s2: s2 = ''
432
+ if len(s1) > len(s2):
433
+ s1, s2 = s2, s1
434
+
435
+ distances = range(len(s1) + 1)
436
+ for i2, c2 in enumerate(s2):
437
+ distances_ = [i2+1]
438
+ for i1, c1 in enumerate(s1):
439
+ if c1 == c2:
440
+ distances_.append(distances[i1])
441
+ else:
442
+ distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
443
+ distances = distances_
444
+ return distances[-1]
445
+
446
+ def train( in_csv, a_header, b_header, model, iterations, device, leading_context, trailing_context, train_percentage, verbose ):
447
  if verbose: print( "loading csv" )
448
  full_data = pd.read_csv( in_csv )
449
 
 
452
 
453
  if verbose: print( "parcing data for training" )
454
 
455
+
456
+ tm = Transmorgrifyer()
457
+
458
+ tm.train( from_sentances=train_data[a_header],
459
  to_sentances=train_data[b_header],
460
  iterations = iterations,
461
  device = device,
462
+ leading_context = leading_context,
463
+ trailing_context = trailing_context,
464
  verbose=verbose,
465
  )
466
+ tm.save( model )
467
+
468
def execute( include_stats, in_csv, out_csv, a_header, b_header, model, execute_percentage, verbose ):
    """Run a saved model over the tail of a CSV and write the results to out_csv.

    Parameters:
        include_stats:      when true, also write the input, the reference
                            column and edit-distance statistics.
        in_csv:             path of the CSV to read.
        out_csv:            path of the CSV to write.
        a_header:           name of the column fed to the model.
        b_header:           name of the reference column; only read when
                            include_stats is true.
        model:              path of the saved model file.
        execute_percentage: percentage of rows, counted from the end of
                            the file, to run on.
        verbose:            print progress messages.
    """
    if verbose: print( "loading csv" )

    full_data = pd.read_csv( in_csv )

    # Keep only the last execute_percentage percent of the rows.
    # NOTE(review): presumably the leading rows are the ones train() used --
    # confirm against the split performed in train().
    split_index = int( (100-execute_percentage)/100*len(full_data) )
    execute_data = full_data.iloc[split_index:,:].reset_index(drop=True)

    tm = Transmorgrifyer()
    tm.load( model )

    # tm.execute is a generator; materialise it so results can be reused below.
    results = list(tm.execute( execute_data[a_header], verbose=verbose ))

    if include_stats:
        before_edit_distances = []
        after_edit_distances = []
        percent_improvement = []

        for source, target, generated in zip( execute_data[a_header], execute_data[b_header], results ):
            before = _levenshteinDistance( source, target )
            after = _levenshteinDistance( generated, target )
            before_edit_distances.append( before )
            after_edit_distances.append( after )
            # max(1, ...) avoids dividing by zero when the input already matches.
            percent_improvement.append( 100*(before - after)/max(1, before) )

        pd_results = pd.DataFrame( {
            "in_data": execute_data[a_header],
            "out_data": execute_data[b_header],
            "generated_data": results,
            "before_edit_distance": before_edit_distances,
            "after_edit_distance": after_edit_distances,
            "percent_improvement": percent_improvement,
        })
    else:
        # Write what the model generated. Previously this branch wrote the
        # input file's b_header column and dropped the generated results.
        pd_results = pd.DataFrame( {
            "out_data": results,
        })

    pd_results.to_csv( out_csv )
513
+
514
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
515
 
516
  def main():
517
  parser = argparse.ArgumentParser(
 
530
  parser.add_argument('-p', '--train_percentage', help="The percentage of data to train on, leaving the rest for testing.")
531
  parser.add_argument('-e', '--execute', action='store_true', help='Use an existing trained model.')
532
  parser.add_argument('-v', '--verbose', action='store_true', help='Talks alot?' )
533
+ parser.add_argument('-s', '--include_stats', action='store_true', help='Use b_header to compute stats and add to output csv.')
534
 
535
 
536
  args = parser.parse_args()
 
539
 
540
 
541
  if args.train:
 
542
  train_percentage = args.train_percentage
543
  if train_percentage is None:
544
  if args.execute:
 
558
  verbose=args.verbose,
559
  )
560
 
561
+
562
+ if args.execute:
563
+ if args.train_percentage is None:
564
+ if args.train:
565
+ execute_percentage = 50
566
+ else:
567
+ execute_percentage = 100
568
+ else:
569
+ execute_percentage = 100-args.train_percentage
570
+ execute(
571
+ include_stats=args.include_stats,
572
+ in_csv=args.in_csv,
573
+ out_csv=args.out_csv,
574
+ a_header=args.a_header,
575
+ b_header=args.b_header,
576
+ model=args.model,
577
+ execute_percentage=execute_percentage,
578
+ verbose=args.verbose,
579
+ )
580
+
581
 
582
 
583
  if __name__ == '__main__':