Joshua Lansford committed on
Commit
fbbf27f
·
1 Parent(s): 6f3a3e1

Fixing spelling of sentance to sentence in files.

Browse files
.vscode/launch.json CHANGED
@@ -21,7 +21,7 @@
21
  "justMyCode": true,
22
  "args": [
23
  "--train",
24
- "--in_csv", "/home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv",
25
  "--a_header", "English",
26
  "--b_header", "Phonetic",
27
  "--device", "0:1",
@@ -36,7 +36,7 @@
36
  "justMyCode": true,
37
  "args": [
38
  "--train",
39
- "--in_csv", "/home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv",
40
  "--b_header", "English",
41
  "--a_header", "Phonetic",
42
  "--device", "0:1",
@@ -51,7 +51,7 @@
51
  "justMyCode": true,
52
  "args": [
53
  "--train",
54
- "--in_csv", "/home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic_short.csv",
55
  "--a_header", "English",
56
  "--b_header", "Phonetic",
57
  "--device", "0:1",
@@ -66,7 +66,7 @@
66
  "justMyCode": true,
67
  "args": [
68
  "--execute",
69
- "--in_csv", "/home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv",
70
  "--out_csv", "./phonetic_out.csv",
71
  "--a_header", "English",
72
  "--b_header", "Phonetic",
@@ -83,7 +83,7 @@
83
  "justMyCode": true,
84
  "args": [
85
  "--execute",
86
- "--in_csv", "/home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic_short.csv",
87
  "--out_csv", "./phonetic_out.csv",
88
  "--a_header", "English",
89
  "--b_header", "Phonetic",
@@ -100,7 +100,7 @@
100
  "justMyCode": true,
101
  "args": [
102
  "--execute",
103
- "--in_csv", "/home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic_short.csv",
104
  "--out_csv", "./reverse_phonetic_out.csv",
105
  "--b_header", "English",
106
  "--a_header", "Phonetic",
 
21
  "justMyCode": true,
22
  "args": [
23
  "--train",
24
+ "--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv",
25
  "--a_header", "English",
26
  "--b_header", "Phonetic",
27
  "--device", "0:1",
 
36
  "justMyCode": true,
37
  "args": [
38
  "--train",
39
+ "--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv",
40
  "--b_header", "English",
41
  "--a_header", "Phonetic",
42
  "--device", "0:1",
 
51
  "justMyCode": true,
52
  "args": [
53
  "--train",
54
+ "--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic_short.csv",
55
  "--a_header", "English",
56
  "--b_header", "Phonetic",
57
  "--device", "0:1",
 
66
  "justMyCode": true,
67
  "args": [
68
  "--execute",
69
+ "--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv",
70
  "--out_csv", "./phonetic_out.csv",
71
  "--a_header", "English",
72
  "--b_header", "Phonetic",
 
83
  "justMyCode": true,
84
  "args": [
85
  "--execute",
86
+ "--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic_short.csv",
87
  "--out_csv", "./phonetic_out.csv",
88
  "--a_header", "English",
89
  "--b_header", "Phonetic",
 
100
  "justMyCode": true,
101
  "args": [
102
  "--execute",
103
+ "--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic_short.csv",
104
  "--out_csv", "./reverse_phonetic_out.csv",
105
  "--b_header", "English",
106
  "--a_header", "Phonetic",
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Sentance Transmorgrifier
3
  emoji: s
4
  colorFrom: yellow
5
  colorTo: yellow
@@ -10,24 +10,24 @@ pinned: false
10
  license: apache-2.0
11
  ---
12
 
13
- ## Sentance Transmorgrifier
14
 
15
- # What is the Sentance Transmorgrifier?
16
- - The Sentance Transmorgrifier is a framework to make text to text conversion models which uses a categorical gradiant boost library, [catboost](https://catboost.ai/), as its back end.
17
  - This library does not use neural net or word embeddings but does the transformation on the character level.
18
- - For Sentance Transmorgrifier to work, there has to be some common characters between the from and two conversion.
19
- - The model uses a modified form of the [logest common subsequence algorithm](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem) to transform the sentance conversion into a sequence of three types of operations:
20
  1. Match: Pass the character from input to output
21
  2. Drop: Remove the incoming character from the input.
22
  3. Insert: Generate a character and add it to the output.
23
  - The transformation uses a sliding context window of the next n incoming characters, ``n`` output transformed chars and n output untransformed chars.
24
  - Because the window is sliding, there is no fixed length on the character sequences which can be transformed.
25
 
26
- # Where is the code and a demo of said Sentance Transmorgrifier?
27
- - There is a [Sentance Transmorgrifier HuggingFace space](https://huggingface.co/spaces/JEdward7777/SentanceTransmorgrifier) demoing a couple models created with Sentance Transmorgrifier.
28
- - A branch of the code without the trained example models is checked in at the [Sentance Transmorgrifier Github page](https://github.com/JEdward7777/SentanceTransmogrifier).
29
 
30
- # How can I use the Sentance Transmorgrifier
31
  - The project has been configured to be able to be used in two different ways.
32
 
33
  ## Shell access
@@ -35,7 +35,7 @@ license: apache-2.0
35
 
36
  ```sh
37
  python transmorgrify.py \
38
- --train --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv \
39
  --a_header English \
40
  --b_header Phonetic\
41
  --device 0:1 \
@@ -56,7 +56,7 @@ python transmorgrify.py \
56
  ```sh
57
  python transmorgrify.py \
58
  --execute \
59
- --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv \
60
  --a_header English \
61
  --b_header Phonetic\
62
  --device cpu \
@@ -83,8 +83,8 @@ python transmorgrify.py \
83
  Train the Transmorgrifier model. This does not save it to disk but just trains in memory.
84
 
85
  Keyword arguments:
86
- from_sentances -- An array of strings for the input sentances.
87
- to_sentances -- An array of strings of the same length as from_sentances which the model is to train to convert to.
88
  iterations -- An integer specifying the number of iterations to convert from or to. (default 4000)
89
  device -- The gpu reference which catboost wants or "cpu". (default cpu)
90
  trailing_context -- The number of characters after the action point to include for context. (default 7)
@@ -109,10 +109,10 @@ model -- The filename of the model to load. (default my_model.tm)
109
  ```
110
  Runs the data from from_sentaces. The results are returned
111
  using yield so you need to wrap this in list() if you want
112
- to index it. from_sentances can be an array or a generator.
113
 
114
  Keyword arguments:
115
- from_sentances -- Something iterable which returns strings.
116
  ```
117
  - Here is an example of using object access to train a model
118
  ```python
@@ -125,8 +125,8 @@ train_data = pd.read_csv( "training.csv" )
125
  #do the training
126
  my_model = transmorgrify.Transmorgrifier()
127
  my_model.train(
128
- from_sentances=train_data["from_header"],
129
- to_sentances=train_data["to_header"],
130
  iterations=4000 )
131
 
132
  #save the results
 
1
  ---
2
+ title: Sentence Transmorgrifier
3
  emoji: s
4
  colorFrom: yellow
5
  colorTo: yellow
 
10
  license: apache-2.0
11
  ---
12
 
13
+ ## Sentence Transmorgrifier
14
 
15
+ # What is the Sentence Transmorgrifier?
16
+ - The Sentence Transmorgrifier is a framework to make text to text conversion models which uses a categorical gradient boosting library, [catboost](https://catboost.ai/), as its back end.
17
  - This library does not use neural net or word embeddings but does the transformation on the character level.
18
+ - For Sentence Transmorgrifier to work, there has to be some common characters between the from and to conversion.
19
+ - The model uses a modified form of the [longest common subsequence algorithm](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem) to transform the sentence conversion into a sequence of three types of operations:
20
  1. Match: Pass the character from input to output
21
  2. Drop: Remove the incoming character from the input.
22
  3. Insert: Generate a character and add it to the output.
23
  - The transformation uses a sliding context window of the next n incoming characters, ``n`` output transformed chars and n output untransformed chars.
24
  - Because the window is sliding, there is no fixed length on the character sequences which can be transformed.
25
 
26
+ # Where is the code and a demo of said Sentence Transmorgrifier?
27
+ - There is a [Sentence Transmorgrifier HuggingFace space](https://huggingface.co/spaces/JEdward7777/SentenceTransmorgrifier) demoing a couple models created with Sentence Transmorgrifier.
28
+ - A branch of the code without the trained example models is checked in at the [Sentence Transmorgrifier Github page](https://github.com/JEdward7777/SentenceTransmogrifier).
29
 
30
+ # How can I use the Sentence Transmorgrifier?
31
  - The project has been configured to be able to be used in two different ways.
32
 
33
  ## Shell access
 
35
 
36
  ```sh
37
  python transmorgrify.py \
38
+ --train --in_csv ./examples/phonetic/phonetic.csv \
39
  --a_header English \
40
  --b_header Phonetic\
41
  --device 0:1 \
 
56
  ```sh
57
  python transmorgrify.py \
58
  --execute \
59
+ --in_csv ./examples/phonetic/phonetic.csv \
60
  --a_header English \
61
  --b_header Phonetic\
62
  --device cpu \
 
83
  Train the Transmorgrifier model. This does not save it to disk but just trains in memory.
84
 
85
  Keyword arguments:
86
+ from_sentences -- An array of strings for the input sentences.
87
+ to_sentences -- An array of strings of the same length as from_sentences which the model is to train to convert to.
88
  iterations -- An integer specifying the number of iterations to convert from or to. (default 4000)
89
  device -- The gpu reference which catboost wants or "cpu". (default cpu)
90
  trailing_context -- The number of characters after the action point to include for context. (default 7)
 
109
  ```
110
  Runs the data from from_sentences. The results are returned
111
  using yield so you need to wrap this in list() if you want
112
+ to index it. from_sentences can be an array or a generator.
113
 
114
  Keyword arguments:
115
+ from_sentences -- Something iterable which returns strings.
116
  ```
117
  - Here is an example of using object access to train a model
118
  ```python
 
125
  #do the training
126
  my_model = transmorgrify.Transmorgrifier()
127
  my_model.train(
128
+ from_sentences=train_data["from_header"],
129
+ to_sentences=train_data["to_header"],
130
  iterations=4000 )
131
 
132
  #save the results
app.py CHANGED
@@ -29,7 +29,7 @@ def pig_to_eng( input ):
29
  with gr.Blocks() as demo:
30
  gr.Markdown(
31
  """
32
- # Sentance Transmorgrifier demo
33
  The following demos have been trained on different tasks.
34
  Select the tab below for a demo.
35
  """
 
29
  with gr.Blocks() as demo:
30
  gr.Markdown(
31
  """
32
+ # Sentence Transmorgrifier demo
33
  The following demos have been trained on different tasks.
34
  Select the tab below for a demo.
35
  """
example_train.py CHANGED
@@ -7,8 +7,8 @@ train_data = pd.read_csv( "phonetics_out_gpu_4000.csv" )[0:100]
7
  #do the training
8
  my_model = transmorgrify.Transmorgrifier()
9
  my_model.train(
10
- from_sentances=train_data["in_data"],
11
- to_sentances=train_data["out_data"],
12
  iterations=100, )
13
 
14
  #save the results
 
7
  #do the training
8
  my_model = transmorgrify.Transmorgrifier()
9
  my_model.train(
10
+ from_sentences=train_data["in_data"],
11
+ to_sentences=train_data["out_data"],
12
  iterations=100, )
13
 
14
  #save the results
examples/piglattin/prepare_training_data.py CHANGED
@@ -39,7 +39,7 @@ def english_to_piglattin( english ):
39
  else:
40
  piglattin += "yay" + char
41
 
42
- #end of sentance needs done as well.
43
  if in_word:
44
  if start:
45
  piglattin += start.lower() + "ay"
 
39
  else:
40
  piglattin += "yay" + char
41
 
42
+ #end of sentence needs done as well.
43
  if in_word:
44
  if start:
45
  piglattin += start.lower() + "ay"
run_tests.sh CHANGED
@@ -1,7 +1,7 @@
1
  !/usr/bin/env bash
2
  # echo test 1
3
  # ./venv/bin/python transmorgrify.py \
4
- # --train --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv \
5
  # --a_header English \
6
  # --b_header Phonetic\
7
  # --device 0:1 \
@@ -11,7 +11,7 @@
11
  # --train_percentage 50
12
  # echo test 2
13
  # ./venv/bin/python transmorgrify.py \
14
- # --train --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv \
15
  # --a_header English \
16
  # --b_header Phonetic\
17
  # --device cpu \
@@ -21,7 +21,7 @@
21
  # --train_percentage 50
22
  # echo test 1b
23
  # ./venv/bin/python transmorgrify.py \
24
- # --train --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv \
25
  # --b_header English \
26
  # --a_header Phonetic\
27
  # --device 0:1 \
@@ -31,7 +31,7 @@
31
  # --train_percentage 50
32
  # echo test 3
33
  # ./venv/bin/python transmorgrify.py \
34
- # --train --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv \
35
  # --b_header English \
36
  # --a_header Phonetic\
37
  # --device cpu \
@@ -42,7 +42,7 @@
42
  echo test 4
43
  ./venv/bin/python transmorgrify.py \
44
  --execute \
45
- --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv \
46
  --a_header Phonetic\
47
  --b_header English \
48
  --device cpu \
@@ -54,7 +54,7 @@ echo test 4
54
  echo test 5
55
  ./venv/bin/python transmorgrify.py \
56
  --execute \
57
- --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv \
58
  --a_header English \
59
  --b_header Phonetic\
60
  --device cpu \
@@ -68,7 +68,7 @@ echo test 5
68
  echo test 4
69
  ./venv/bin/python transmorgrify.py \
70
  --execute \
71
- --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/piglattin/pig_lattin.csv \
72
  --a_header Piglattin\
73
  --b_header English \
74
  --device cpu \
@@ -80,7 +80,7 @@ echo test 4
80
  echo test 5
81
  ./venv/bin/python transmorgrify.py \
82
  --execute \
83
- --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/piglattin/pig_lattin.csv \
84
  --a_header English \
85
  --b_header Piglattin\
86
  --device cpu \
 
1
  !/usr/bin/env bash
2
  # echo test 1
3
  # ./venv/bin/python transmorgrify.py \
4
+ # --train --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv \
5
  # --a_header English \
6
  # --b_header Phonetic\
7
  # --device 0:1 \
 
11
  # --train_percentage 50
12
  # echo test 2
13
  # ./venv/bin/python transmorgrify.py \
14
+ # --train --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv \
15
  # --a_header English \
16
  # --b_header Phonetic\
17
  # --device cpu \
 
21
  # --train_percentage 50
22
  # echo test 1b
23
  # ./venv/bin/python transmorgrify.py \
24
+ # --train --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv \
25
  # --b_header English \
26
  # --a_header Phonetic\
27
  # --device 0:1 \
 
31
  # --train_percentage 50
32
  # echo test 3
33
  # ./venv/bin/python transmorgrify.py \
34
+ # --train --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv \
35
  # --b_header English \
36
  # --a_header Phonetic\
37
  # --device cpu \
 
42
  echo test 4
43
  ./venv/bin/python transmorgrify.py \
44
  --execute \
45
+ --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv \
46
  --a_header Phonetic\
47
  --b_header English \
48
  --device cpu \
 
54
  echo test 5
55
  ./venv/bin/python transmorgrify.py \
56
  --execute \
57
+ --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv \
58
  --a_header English \
59
  --b_header Phonetic\
60
  --device cpu \
 
68
  echo test 4
69
  ./venv/bin/python transmorgrify.py \
70
  --execute \
71
+ --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/piglattin/pig_lattin.csv \
72
  --a_header Piglattin\
73
  --b_header English \
74
  --device cpu \
 
80
  echo test 5
81
  ./venv/bin/python transmorgrify.py \
82
  --execute \
83
+ --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/piglattin/pig_lattin.csv \
84
  --a_header English \
85
  --b_header Piglattin\
86
  --device cpu \
run_tests2.sh CHANGED
@@ -1,7 +1,7 @@
1
  # !/usr/bin/env bash
2
  # echo test 1
3
  # ./venv/bin/python transmorgrify.py \
4
- # --train --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/piglattin/pig_lattin.csv \
5
  # --a_header English \
6
  # --b_header Piglattin\
7
  # --device 0:1 \
@@ -11,7 +11,7 @@
11
  # --train_percentage 50
12
  # echo test 1b
13
  # ./venv/bin/python transmorgrify.py \
14
- # --train --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/piglattin/pig_lattin.csv \
15
  # --b_header English \
16
  # --a_header Piglattin\
17
  # --device 0:1 \
@@ -22,7 +22,7 @@
22
  # echo test 4
23
  # ./venv/bin/python transmorgrify.py \
24
  # --execute \
25
- # --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/piglattin/pig_lattin.csv \
26
  # --a_header Piglattin\
27
  # --b_header English \
28
  # --device cpu \
@@ -34,7 +34,7 @@
34
  # echo test 5
35
  # ./venv/bin/python transmorgrify.py \
36
  # --execute \
37
- # --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/piglattin/pig_lattin.csv \
38
  # --a_header English \
39
  # --b_header Piglattin\
40
  # --device cpu \
 
1
  # !/usr/bin/env bash
2
  # echo test 1
3
  # ./venv/bin/python transmorgrify.py \
4
+ # --train --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/piglattin/pig_lattin.csv \
5
  # --a_header English \
6
  # --b_header Piglattin\
7
  # --device 0:1 \
 
11
  # --train_percentage 50
12
  # echo test 1b
13
  # ./venv/bin/python transmorgrify.py \
14
+ # --train --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/piglattin/pig_lattin.csv \
15
  # --b_header English \
16
  # --a_header Piglattin\
17
  # --device 0:1 \
 
22
  # echo test 4
23
  # ./venv/bin/python transmorgrify.py \
24
  # --execute \
25
+ # --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/piglattin/pig_lattin.csv \
26
  # --a_header Piglattin\
27
  # --b_header English \
28
  # --device cpu \
 
34
  # echo test 5
35
  # ./venv/bin/python transmorgrify.py \
36
  # --execute \
37
+ # --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/piglattin/pig_lattin.csv \
38
  # --a_header English \
39
  # --b_header Piglattin\
40
  # --device cpu \
transmorgrify.py CHANGED
@@ -15,20 +15,20 @@ START = 3
15
  FILE_VERSION = 1
16
 
17
  class Transmorgrifier:
18
- def train( self, from_sentances, to_sentances, iterations = 4000, device = 'cpu', trailing_context = 7, leading_context = 7, verbose=True ):
19
  """
20
  Train the Transmorgrifier model. This does not save it to disk but just trains in memory.
21
 
22
  Keyword arguments:
23
- from_sentances -- An array of strings for the input sentances.
24
- to_sentances -- An array of strings of the same length as from_sentances which the model is to train to convert to.
25
  iterations -- An integer specifying the number of iterations to convert from or to. (default 4000)
26
  device -- The gpu reference which catboost wants or "cpu". (default cpu)
27
  trailing_context -- The number of characters after the action point to include for context. (default 7)
28
  leading_context -- The number of characters before the action point to include for context. (default 7)
29
  verbose -- Increased the amount of text output during training. (default True)
30
  """
31
- X,Y = _parse_for_training( from_sentances, to_sentances, num_pre_context_chars=leading_context, num_post_context_chars=trailing_context )
32
 
33
  #train and save the action_model
34
  self.action_model = _train_catboost( X, Y['action'], iterations, verbose=verbose, device=device, model_piece='action' )
@@ -99,25 +99,25 @@ class Transmorgrifier:
99
  return self
100
 
101
 
102
- def execute( self, from_sentances, verbose=False ):
103
  """
104
  Runs the data from from_sentaces. The results are returned
105
  using yield so you need to wrap this in list() if you want
106
- to index it. from_sentances can be an array or a generator.
107
 
108
  Keyword arguments:
109
- from_sentances -- Something iterable which returns strings.
110
  """
111
- for i,from_sentance in enumerate(from_sentances):
112
 
113
  yield _do_reconstruct(
114
  action_model=self.action_model,
115
  char_model=self.char_model,
116
- text=from_sentance,
117
  num_pre_context_chars=self.leading_context,
118
  num_post_context_chars=self.trailing_context )
119
  if verbose and i % 10 == 0:
120
- print( f"{i} of {len(from_sentances)}" )
121
 
122
  def demo( self, share=False ):
123
  import gradio as gr
@@ -162,7 +162,7 @@ class _edit_trace_hop():
162
  def __repr__( self ):
163
  return self.__str__()
164
 
165
- def _trace_edits( from_sentance, to_sentance, print_debug=False ):
166
  #iterating from will be the rows down the left side.
167
  #iterating to will be the columns across the top.
168
  #we will keep one row as we work on the next.
@@ -173,9 +173,9 @@ def _trace_edits( from_sentance, to_sentance, print_debug=False ):
173
  #the index handles one before the index in the string
174
  #to handle the root cases across the top and down the left of the
175
  #match matrix.
176
- for from_row_i in range( len(from_sentance)+1 ):
177
 
178
- for to_column_i in range( len(to_sentance )+1 ):
179
 
180
  best_option = None
181
 
@@ -195,7 +195,7 @@ def _trace_edits( from_sentance, to_sentance, print_debug=False ):
195
  best_option = _edit_trace_hop()
196
  best_option.parrent = current_row[to_column_i-1]
197
  best_option.edit_distance = best_option.parrent.edit_distance + 1
198
- best_option.char = to_sentance[to_column_i-1]
199
  best_option.from_row_i = from_row_i
200
  best_option.to_column_i = to_column_i
201
  best_option.action = INSERT_TO
@@ -206,19 +206,19 @@ def _trace_edits( from_sentance, to_sentance, print_debug=False ):
206
  best_option = _edit_trace_hop()
207
  best_option.parrent = last_row[to_column_i]
208
  best_option.edit_distance = best_option.parrent.edit_distance + 1
209
- best_option.char = from_sentance[from_row_i-1]
210
  best_option.from_row_i = from_row_i
211
  best_option.to_column_i = to_column_i
212
  best_option.action = DELETE_FROM
213
 
214
  #check match
215
  if to_column_i > 0:
216
- if to_sentance[to_column_i-1] == from_sentance[from_row_i-1]:
217
  if best_option is None or last_row[to_column_i-1].edit_distance <= best_option.edit_distance: #prefer match so use <= than <
218
  best_option = _edit_trace_hop()
219
  best_option.parrent = last_row[to_column_i-1]
220
  best_option.edit_distance = best_option.parrent.edit_distance + 1
221
- best_option.char = from_sentance[from_row_i-1]
222
  best_option.from_row_i = from_row_i
223
  best_option.to_column_i = to_column_i
224
  best_option.action = MATCH
@@ -246,8 +246,8 @@ def _trace_edits( from_sentance, to_sentance, print_debug=False ):
246
  return last_row[-1]
247
 
248
 
249
- def _parse_single_for_training( from_sentance, to_sentance, num_pre_context_chars, num_post_context_chars ):
250
- trace = _trace_edits( from_sentance, to_sentance )
251
 
252
  #we will collect a snapshot at each step.
253
  trace_list = _list_trace(trace)
@@ -255,8 +255,8 @@ def _parse_single_for_training( from_sentance, to_sentance, num_pre_context_char
255
 
256
  training_collection = []
257
 
258
- #execute these things on the from_sentance and see if we get the to_sentance.
259
- working_from = from_sentance
260
  working_to = ""
261
  used_from = ""
262
  continuous_added = 0
@@ -298,7 +298,7 @@ def _parse_single_for_training( from_sentance, to_sentance, num_pre_context_char
298
  continuous_dropped = 0
299
 
300
 
301
- if to_sentance != working_to:
302
  print( "Replay failure" )
303
 
304
  #so now I have training_collection which is a list of dictionaries where each dictionary is an action with a context.
@@ -348,18 +348,18 @@ def _parse_single_for_training( from_sentance, to_sentance, num_pre_context_char
348
  return pd.DataFrame( context_split_into_dict ), pd.DataFrame( result_split_into_dict )
349
 
350
 
351
- def _parse_for_training( from_sentances, to_sentances, num_pre_context_chars, num_post_context_chars ):
352
  out_observations_list = []
353
  out_results_list = []
354
 
355
- for index, (from_sentance, to_sentance) in enumerate(zip( from_sentances, to_sentances )):
356
- if type(from_sentance) != float and type(to_sentance) != float: #bad lines are nan which are floats.
357
- specific_observation, specific_result = _parse_single_for_training( from_sentance, to_sentance, num_pre_context_chars=num_pre_context_chars, num_post_context_chars=num_post_context_chars )
358
 
359
  out_observations_list.append( specific_observation )
360
  out_results_list.append( specific_result )
361
  if index % 100 == 0:
362
- print( f"parsing {index} of {len(from_sentances)}")
363
 
364
  return pd.concat( out_observations_list ), pd.concat( out_results_list )
365
 
@@ -507,8 +507,8 @@ def train( in_csv, a_header, b_header, model, iterations, device, leading_contex
507
 
508
  tm = Transmorgrifier()
509
 
510
- tm.train( from_sentances=train_data[a_header],
511
- to_sentances=train_data[b_header],
512
  iterations = iterations,
513
  device = device,
514
  leading_context = leading_context,
 
15
  FILE_VERSION = 1
16
 
17
  class Transmorgrifier:
18
+ def train( self, from_sentences, to_sentences, iterations = 4000, device = 'cpu', trailing_context = 7, leading_context = 7, verbose=True ):
19
  """
20
  Train the Transmorgrifier model. This does not save it to disk but just trains in memory.
21
 
22
  Keyword arguments:
23
+ from_sentences -- An array of strings for the input sentences.
24
+ to_sentences -- An array of strings of the same length as from_sentences which the model is to train to convert to.
25
  iterations -- An integer specifying the number of iterations to convert from or to. (default 4000)
26
  device -- The gpu reference which catboost wants or "cpu". (default cpu)
27
  trailing_context -- The number of characters after the action point to include for context. (default 7)
28
  leading_context -- The number of characters before the action point to include for context. (default 7)
29
  verbose -- Increased the amount of text output during training. (default True)
30
  """
31
+ X,Y = _parse_for_training( from_sentences, to_sentences, num_pre_context_chars=leading_context, num_post_context_chars=trailing_context )
32
 
33
  #train and save the action_model
34
  self.action_model = _train_catboost( X, Y['action'], iterations, verbose=verbose, device=device, model_piece='action' )
 
99
  return self
100
 
101
 
102
+ def execute( self, from_sentences, verbose=False ):
103
  """
104
  Runs the data from from_sentences. The results are returned
105
  using yield so you need to wrap this in list() if you want
106
+ to index it. from_sentences can be an array or a generator.
107
 
108
  Keyword arguments:
109
+ from_sentences -- Something iterable which returns strings.
110
  """
111
+ for i,from_sentence in enumerate(from_sentences):
112
 
113
  yield _do_reconstruct(
114
  action_model=self.action_model,
115
  char_model=self.char_model,
116
+ text=from_sentence,
117
  num_pre_context_chars=self.leading_context,
118
  num_post_context_chars=self.trailing_context )
119
  if verbose and i % 10 == 0:
120
+ print( f"{i} of {len(from_sentences)}" )
121
 
122
  def demo( self, share=False ):
123
  import gradio as gr
 
162
  def __repr__( self ):
163
  return self.__str__()
164
 
165
+ def _trace_edits( from_sentence, to_sentence, print_debug=False ):
166
  #iterating from will be the rows down the left side.
167
  #iterating to will be the columns across the top.
168
  #we will keep one row as we work on the next.
 
173
  #the index handles one before the index in the string
174
  #to handle the root cases across the top and down the left of the
175
  #match matrix.
176
+ for from_row_i in range( len(from_sentence)+1 ):
177
 
178
+ for to_column_i in range( len(to_sentence )+1 ):
179
 
180
  best_option = None
181
 
 
195
  best_option = _edit_trace_hop()
196
  best_option.parrent = current_row[to_column_i-1]
197
  best_option.edit_distance = best_option.parrent.edit_distance + 1
198
+ best_option.char = to_sentence[to_column_i-1]
199
  best_option.from_row_i = from_row_i
200
  best_option.to_column_i = to_column_i
201
  best_option.action = INSERT_TO
 
206
  best_option = _edit_trace_hop()
207
  best_option.parrent = last_row[to_column_i]
208
  best_option.edit_distance = best_option.parrent.edit_distance + 1
209
+ best_option.char = from_sentence[from_row_i-1]
210
  best_option.from_row_i = from_row_i
211
  best_option.to_column_i = to_column_i
212
  best_option.action = DELETE_FROM
213
 
214
  #check match
215
  if to_column_i > 0:
216
+ if to_sentence[to_column_i-1] == from_sentence[from_row_i-1]:
217
  if best_option is None or last_row[to_column_i-1].edit_distance <= best_option.edit_distance: #prefer match so use <= than <
218
  best_option = _edit_trace_hop()
219
  best_option.parrent = last_row[to_column_i-1]
220
  best_option.edit_distance = best_option.parrent.edit_distance + 1
221
+ best_option.char = from_sentence[from_row_i-1]
222
  best_option.from_row_i = from_row_i
223
  best_option.to_column_i = to_column_i
224
  best_option.action = MATCH
 
246
  return last_row[-1]
247
 
248
 
249
+ def _parse_single_for_training( from_sentence, to_sentence, num_pre_context_chars, num_post_context_chars ):
250
+ trace = _trace_edits( from_sentence, to_sentence )
251
 
252
  #we will collect a snapshot at each step.
253
  trace_list = _list_trace(trace)
 
255
 
256
  training_collection = []
257
 
258
+ #execute these things on the from_sentence and see if we get the to_sentence.
259
+ working_from = from_sentence
260
  working_to = ""
261
  used_from = ""
262
  continuous_added = 0
 
298
  continuous_dropped = 0
299
 
300
 
301
+ if to_sentence != working_to:
302
  print( "Replay failure" )
303
 
304
  #so now I have training_collection which is a list of dictionaries where each dictionary is an action with a context.
 
348
  return pd.DataFrame( context_split_into_dict ), pd.DataFrame( result_split_into_dict )
349
 
350
 
351
+ def _parse_for_training( from_sentences, to_sentences, num_pre_context_chars, num_post_context_chars ):
352
  out_observations_list = []
353
  out_results_list = []
354
 
355
+ for index, (from_sentence, to_sentence) in enumerate(zip( from_sentences, to_sentences )):
356
+ if type(from_sentence) != float and type(to_sentence) != float: #bad lines are nan which are floats.
357
+ specific_observation, specific_result = _parse_single_for_training( from_sentence, to_sentence, num_pre_context_chars=num_pre_context_chars, num_post_context_chars=num_post_context_chars )
358
 
359
  out_observations_list.append( specific_observation )
360
  out_results_list.append( specific_result )
361
  if index % 100 == 0:
362
+ print( f"parsing {index} of {len(from_sentences)}")
363
 
364
  return pd.concat( out_observations_list ), pd.concat( out_results_list )
365
 
 
507
 
508
  tm = Transmorgrifier()
509
 
510
+ tm.train( from_sentences=train_data[a_header],
511
+ to_sentences=train_data[b_header],
512
  iterations = iterations,
513
  device = device,
514
  leading_context = leading_context,