Spaces:

JEdward7777
/

SentenceTransmorgrifier

Runtime error

App Files Files Community

Joshua Lansford committed on Dec 4, 2022

Commit

fbbf27f

1 Parent(s): 6f3a3e1

Fixing spelling of sentance to sentence in files.

Browse files

Files changed (8) hide show

.vscode/launch.json +6 -6
README.md +18 -18
app.py +1 -1
example_train.py +2 -2
examples/piglattin/prepare_training_data.py +1 -1
run_tests.sh +8 -8
run_tests2.sh +4 -4
transmorgrify.py +29 -29

.vscode/launch.json CHANGED Viewed

@@ -21,7 +21,7 @@
             "justMyCode": true,
             "args": [
                 "--train",
-                "--in_csv", "/home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv",
                 "--a_header", "English",
                 "--b_header", "Phonetic",
                 "--device", "0:1",
@@ -36,7 +36,7 @@
                 "justMyCode": true,
                 "args": [
                     "--train",
-                    "--in_csv", "/home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv",
                     "--b_header", "English",
                     "--a_header", "Phonetic",
                     "--device", "0:1",
@@ -51,7 +51,7 @@
             "justMyCode": true,
             "args": [
                 "--train",
-                "--in_csv", "/home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic_short.csv",
                 "--a_header", "English",
                 "--b_header", "Phonetic",
                 "--device", "0:1",
@@ -66,7 +66,7 @@
             "justMyCode": true,
             "args": [
                 "--execute",
-                "--in_csv", "/home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv",
                 "--out_csv", "./phonetic_out.csv",
                 "--a_header", "English",
                 "--b_header", "Phonetic",
@@ -83,7 +83,7 @@
             "justMyCode": true,
             "args": [
                 "--execute",
-                "--in_csv", "/home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic_short.csv",
                 "--out_csv", "./phonetic_out.csv",
                 "--a_header", "English",
                 "--b_header", "Phonetic",
@@ -100,7 +100,7 @@
             "justMyCode": true,
             "args": [
                 "--execute",
-                "--in_csv", "/home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic_short.csv",
                 "--out_csv", "./reverse_phonetic_out.csv",
                 "--b_header", "English",
                 "--a_header", "Phonetic",

             "justMyCode": true,
             "args": [
                 "--train",
+                "--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv",
                 "--a_header", "English",
                 "--b_header", "Phonetic",
                 "--device", "0:1",
                 "justMyCode": true,
                 "args": [
                     "--train",
+                    "--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv",
                     "--b_header", "English",
                     "--a_header", "Phonetic",
                     "--device", "0:1",
             "justMyCode": true,
             "args": [
                 "--train",
+                "--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic_short.csv",
                 "--a_header", "English",
                 "--b_header", "Phonetic",
                 "--device", "0:1",
             "justMyCode": true,
             "args": [
                 "--execute",
+                "--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv",
                 "--out_csv", "./phonetic_out.csv",
                 "--a_header", "English",
                 "--b_header", "Phonetic",
             "justMyCode": true,
             "args": [
                 "--execute",
+                "--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic_short.csv",
                 "--out_csv", "./phonetic_out.csv",
                 "--a_header", "English",
                 "--b_header", "Phonetic",
             "justMyCode": true,
             "args": [
                 "--execute",
+                "--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic_short.csv",
                 "--out_csv", "./reverse_phonetic_out.csv",
                 "--b_header", "English",
                 "--a_header", "Phonetic",

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: Sentance Transmorgrifier
 emoji: s
 colorFrom: yellow
 colorTo: yellow
@@ -10,24 +10,24 @@ pinned: false
 license: apache-2.0
 ---
-## Sentance Transmorgrifier
-# What is the Sentance Transmorgrifier?
-- The Sentance Transmorgrifier is a framework to make text to text conversion models which uses a categorical gradiant boost library, [catboost](https://catboost.ai/), as its back end.
 - This library does not use neural net or word embeddings but does the transformation on the character level.
-- For Sentance Transmorgrifier to work, there has to be some common characters between the from and two conversion.
-- The model uses a modified form of the [logest common subsequence algorithm](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem) to transform the sentance conversion into a sequence of three types of operations:
   1. Match: Pass the character from input to output
   2. Drop: Remove the incoming character from the input.
   3. Insert: Generate a character and add it to the output.
 - The transformation uses a sliding context window of the next n incoming characters, ``n`` output transformed chars and n output untransformed chars.
 - Because the window is sliding, there is no fixed length on the character sequences which can be transformed.
-# Where is the code and a demo of said Sentance Transmorgrifier?
-- There is a [Sentance Transmorgrifier HuggingFace space](https://huggingface.co/spaces/JEdward7777/SentanceTransmorgrifier) demoing a couple models created with Sentance Transmorgrifier.
-- A branch of the code without the trained example models is checked in at the [Sentance Transmorgrifier Github page](https://github.com/JEdward7777/SentanceTransmogrifier).
-# How can I use the Sentance Transmorgrifier
 - The project has been configured to be able to be used in two different ways.
 ## Shell access
@@ -35,7 +35,7 @@ license: apache-2.0
 ```sh
 python transmorgrify.py \
-    --train --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv \
      --a_header English \
      --b_header Phonetic\
      --device 0:1 \
@@ -56,7 +56,7 @@ python transmorgrify.py \
 ```sh
 python transmorgrify.py \
     --execute \
-    --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv \
     --a_header English \
     --b_header Phonetic\
     --device cpu \
@@ -83,8 +83,8 @@ python transmorgrify.py \
 Train the Transmorgrifier model.  This does not save it to disk but just trains in memory.
 Keyword arguments:
-from_sentances -- An array of strings for the input sentances.
-to_sentances -- An array of strings of the same length as from_sentances which the model is to train to convert to.
 iterations -- An integer specifying the number of iterations to convert from or to. (default 4000)
 device -- The gpu reference which catboost wants or "cpu". (default cpu)
 trailing_context -- The number of characters after the action point to include for context. (default 7)
@@ -109,10 +109,10 @@ model -- The filename of the model to load. (default my_model.tm)
 ```
 Runs the data from from_sentaces.  The results are returned
 using yield so you need to wrap this in list() if you want
-to index it.  from_sentances can be an array or a generator.
 Keyword arguments:
-from_sentances -- Something iterable which returns strings.
 ```
 - Here is an example of using object access to train a model
 ```python
@@ -125,8 +125,8 @@ train_data = pd.read_csv( "training.csv" )
 #do the training
 my_model = transmorgrify.Transmorgrifier()
 my_model.train(
-    from_sentances=train_data["from_header"],
-    to_sentances=train_data["to_header"],
     iterations=4000 )
 #save the results

 ---
+title: Sentence Transmorgrifier
 emoji: s
 colorFrom: yellow
 colorTo: yellow
 license: apache-2.0
 ---
+## Sentence Transmorgrifier
+# What is the Sentence Transmorgrifier?
+- The Sentence Transmorgrifier is a framework for making text-to-text conversion models which uses a categorical gradient boosting library, [catboost](https://catboost.ai/), as its back end.
 - This library does not use neural net or word embeddings but does the transformation on the character level.
+- For Sentence Transmorgrifier to work, there have to be some common characters between the from and to sides of the conversion.
+- The model uses a modified form of the [longest common subsequence algorithm](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem) to transform the sentence conversion into a sequence of three types of operations:
   1. Match: Pass the character from input to output
   2. Drop: Remove the incoming character from the input.
   3. Insert: Generate a character and add it to the output.
 - The transformation uses a sliding context window of the next n incoming characters, ``n`` output transformed chars and n output untransformed chars.
 - Because the window is sliding, there is no fixed length on the character sequences which can be transformed.
+# Where is the code and a demo of said Sentence Transmorgrifier?
+- There is a [Sentence Transmorgrifier HuggingFace space](https://huggingface.co/spaces/JEdward7777/SentenceTransmorgrifier) demoing a couple models created with Sentence Transmorgrifier.
+- A branch of the code without the trained example models is checked in at the [Sentence Transmorgrifier Github page](https://github.com/JEdward7777/SentenceTransmogrifier).
+# How can I use the Sentence Transmorgrifier
 - The project has been configured to be able to be used in two different ways.
 ## Shell access
 ```sh
 python transmorgrify.py \
+    --train --in_csv ./examples/phonetic/phonetic.csv \
      --a_header English \
      --b_header Phonetic\
      --device 0:1 \
 ```sh
 python transmorgrify.py \
     --execute \
+    --in_csv ./examples/phonetic/phonetic.csv \
     --a_header English \
     --b_header Phonetic\
     --device cpu \
 Train the Transmorgrifier model.  This does not save it to disk but just trains in memory.
 Keyword arguments:
+from_sentences -- An array of strings for the input sentences.
+to_sentences -- An array of strings of the same length as from_sentences which the model is to train to convert to.
 iterations -- An integer specifying the number of iterations to convert from or to. (default 4000)
 device -- The gpu reference which catboost wants or "cpu". (default cpu)
 trailing_context -- The number of characters after the action point to include for context. (default 7)
 ```
 Runs the data from from_sentences.  The results are returned
 using yield so you need to wrap this in list() if you want
+to index it.  from_sentences can be an array or a generator.
 Keyword arguments:
+from_sentences -- Something iterable which returns strings.
 ```
 - Here is an example of using object access to train a model
 ```python
 #do the training
 my_model = transmorgrify.Transmorgrifier()
 my_model.train(
+    from_sentences=train_data["from_header"],
+    to_sentences=train_data["to_header"],
     iterations=4000 )
 #save the results

app.py CHANGED Viewed

@@ -29,7 +29,7 @@ def pig_to_eng( input ):
 with gr.Blocks() as demo:
     gr.Markdown(
 """
-# Sentance Transmorgrifier demo
 The following demos have been trained on different tasks.
 Select the tab below for a demo.
 """

 with gr.Blocks() as demo:
     gr.Markdown(
 """
+# Sentence Transmorgrifier demo
 The following demos have been trained on different tasks.
 Select the tab below for a demo.
 """

example_train.py CHANGED Viewed

@@ -7,8 +7,8 @@ train_data = pd.read_csv( "phonetics_out_gpu_4000.csv" )[0:100]
 #do the training
 my_model = transmorgrify.Transmorgrifier()
 my_model.train(
-    from_sentances=train_data["in_data"],
-    to_sentances=train_data["out_data"],
     iterations=100,  )
 #save the results

 #do the training
 my_model = transmorgrify.Transmorgrifier()
 my_model.train(
+    from_sentences=train_data["in_data"],
+    to_sentences=train_data["out_data"],
     iterations=100,  )
 #save the results

examples/piglattin/prepare_training_data.py CHANGED Viewed

@@ -39,7 +39,7 @@ def english_to_piglattin( english ):
                 else:
                     piglattin += "yay" + char
-    #end of sentance needs done as well.
     if in_word:
         if start:
             piglattin += start.lower() + "ay"

                 else:
                     piglattin += "yay" + char
+    #end of sentence needs done as well.
     if in_word:
         if start:
             piglattin += start.lower() + "ay"

run_tests.sh CHANGED Viewed

@@ -1,7 +1,7 @@
 !/usr/bin/env bash
 # echo test 1
 # ./venv/bin/python transmorgrify.py \
-#     --train --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv \
 #     --a_header English \
 #     --b_header Phonetic\
 #     --device 0:1 \
@@ -11,7 +11,7 @@
 #     --train_percentage 50
 # echo test 2
 # ./venv/bin/python transmorgrify.py \
-#     --train --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv \
 #     --a_header English \
 #     --b_header Phonetic\
 #     --device cpu \
@@ -21,7 +21,7 @@
 #     --train_percentage 50
 # echo test 1b
 # ./venv/bin/python transmorgrify.py \
-#     --train --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv \
 #     --b_header English \
 #     --a_header Phonetic\
 #     --device 0:1 \
@@ -31,7 +31,7 @@
 #     --train_percentage 50
 # echo test 3
 # ./venv/bin/python transmorgrify.py \
-#     --train --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv \
 #     --b_header English \
 #     --a_header Phonetic\
 #     --device cpu \
@@ -42,7 +42,7 @@
 echo test 4
 ./venv/bin/python transmorgrify.py \
     --execute \
-    --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv \
     --a_header Phonetic\
     --b_header English \
     --device cpu \
@@ -54,7 +54,7 @@ echo test 4
 echo test 5
 ./venv/bin/python transmorgrify.py \
     --execute \
-    --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/phonetic/phonetic.csv \
     --a_header English \
     --b_header Phonetic\
     --device cpu \
@@ -68,7 +68,7 @@ echo test 5
 echo test 4
 ./venv/bin/python transmorgrify.py \
     --execute \
-    --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/piglattin/pig_lattin.csv \
     --a_header Piglattin\
     --b_header English \
     --device cpu \
@@ -80,7 +80,7 @@ echo test 4
 echo test 5
 ./venv/bin/python transmorgrify.py \
     --execute \
-    --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/piglattin/pig_lattin.csv \
     --a_header English \
     --b_header Piglattin\
     --device cpu \

 !/usr/bin/env bash
 # echo test 1
 # ./venv/bin/python transmorgrify.py \
+#     --train --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv \
 #     --a_header English \
 #     --b_header Phonetic\
 #     --device 0:1 \
 #     --train_percentage 50
 # echo test 2
 # ./venv/bin/python transmorgrify.py \
+#     --train --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv \
 #     --a_header English \
 #     --b_header Phonetic\
 #     --device cpu \
 #     --train_percentage 50
 # echo test 1b
 # ./venv/bin/python transmorgrify.py \
+#     --train --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv \
 #     --b_header English \
 #     --a_header Phonetic\
 #     --device 0:1 \
 #     --train_percentage 50
 # echo test 3
 # ./venv/bin/python transmorgrify.py \
+#     --train --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv \
 #     --b_header English \
 #     --a_header Phonetic\
 #     --device cpu \
 echo test 4
 ./venv/bin/python transmorgrify.py \
     --execute \
+    --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv \
     --a_header Phonetic\
     --b_header English \
     --device cpu \
 echo test 5
 ./venv/bin/python transmorgrify.py \
     --execute \
+    --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv \
     --a_header English \
     --b_header Phonetic\
     --device cpu \
 echo test 4
 ./venv/bin/python transmorgrify.py \
     --execute \
+    --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/piglattin/pig_lattin.csv \
     --a_header Piglattin\
     --b_header English \
     --device cpu \
 echo test 5
 ./venv/bin/python transmorgrify.py \
     --execute \
+    --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/piglattin/pig_lattin.csv \
     --a_header English \
     --b_header Piglattin\
     --device cpu \

run_tests2.sh CHANGED Viewed

@@ -1,7 +1,7 @@
 # !/usr/bin/env bash
 # echo test 1
 # ./venv/bin/python transmorgrify.py \
-#     --train --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/piglattin/pig_lattin.csv \
 #     --a_header English \
 #     --b_header Piglattin\
 #     --device 0:1 \
@@ -11,7 +11,7 @@
 #     --train_percentage 50
 # echo test 1b
 # ./venv/bin/python transmorgrify.py \
-#     --train --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/piglattin/pig_lattin.csv \
 #     --b_header English \
 #     --a_header Piglattin\
 #     --device 0:1 \
@@ -22,7 +22,7 @@
 # echo test 4
 # ./venv/bin/python transmorgrify.py \
 #     --execute \
-#     --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/piglattin/pig_lattin.csv \
 #     --a_header Piglattin\
 #     --b_header English \
 #     --device cpu \
@@ -34,7 +34,7 @@
 # echo test 5
 # ./venv/bin/python transmorgrify.py \
 #     --execute \
-#     --in_csv /home/lansford/Sync/projects/tf_over/sentance_transmogrifier/examples/piglattin/pig_lattin.csv \
 #     --a_header English \
 #     --b_header Piglattin\
 #     --device cpu \

 # !/usr/bin/env bash
 # echo test 1
 # ./venv/bin/python transmorgrify.py \
+#     --train --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/piglattin/pig_lattin.csv \
 #     --a_header English \
 #     --b_header Piglattin\
 #     --device 0:1 \
 #     --train_percentage 50
 # echo test 1b
 # ./venv/bin/python transmorgrify.py \
+#     --train --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/piglattin/pig_lattin.csv \
 #     --b_header English \
 #     --a_header Piglattin\
 #     --device 0:1 \
 # echo test 4
 # ./venv/bin/python transmorgrify.py \
 #     --execute \
+#     --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/piglattin/pig_lattin.csv \
 #     --a_header Piglattin\
 #     --b_header English \
 #     --device cpu \
 # echo test 5
 # ./venv/bin/python transmorgrify.py \
 #     --execute \
+#     --in_csv /home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/piglattin/pig_lattin.csv \
 #     --a_header English \
 #     --b_header Piglattin\
 #     --device cpu \

transmorgrify.py CHANGED Viewed

@@ -15,20 +15,20 @@ START = 3
 FILE_VERSION = 1
 class Transmorgrifier:
-    def train( self, from_sentances, to_sentances, iterations = 4000, device = 'cpu', trailing_context = 7, leading_context = 7, verbose=True ):
         """
         Train the Transmorgrifier model.  This does not save it to disk but just trains in memory.
         Keyword arguments:
-        from_sentances -- An array of strings for the input sentances.
-        to_sentances -- An array of strings of the same length as from_sentances which the model is to train to convert to.
         iterations -- An integer specifying the number of iterations to convert from or to. (default 4000)
         device -- The gpu reference which catboost wants or "cpu". (default cpu)
         trailing_context -- The number of characters after the action point to include for context. (default 7)
         leading_context -- The number of characters before the action point to include for context. (default 7)
         verbose -- Increased the amount of text output during training. (default True)
         """
-        X,Y = _parse_for_training( from_sentances, to_sentances, num_pre_context_chars=leading_context, num_post_context_chars=trailing_context )
         #train and save the action_model
         self.action_model = _train_catboost( X, Y['action'], iterations, verbose=verbose, device=device, model_piece='action' )
@@ -99,25 +99,25 @@ class Transmorgrifier:
         return self
-    def execute( self, from_sentances, verbose=False ):
         """
         Runs the data from from_sentaces.  The results are returned
         using yield so you need to wrap this in list() if you want
-        to index it.  from_sentances can be an array or a generator.
         Keyword arguments:
-        from_sentances -- Something iterable which returns strings.
         """
-        for i,from_sentance in enumerate(from_sentances):
             yield _do_reconstruct(
                 action_model=self.action_model,
                 char_model=self.char_model,
-                text=from_sentance,
                 num_pre_context_chars=self.leading_context,
                 num_post_context_chars=self.trailing_context  )
             if verbose and i % 10 == 0:
-                print( f"{i} of {len(from_sentances)}" )
     def demo( self, share=False ):
         import gradio as gr
@@ -162,7 +162,7 @@ class _edit_trace_hop():
     def __repr__( self ):
         return self.__str__()
-def _trace_edits( from_sentance, to_sentance, print_debug=False ):
     #iterating from will be the rows down the left side.
     #iterating to will be the columns across the top.
     #we will keep one row as we work on the next.
@@ -173,9 +173,9 @@ def _trace_edits( from_sentance, to_sentance, print_debug=False ):
     #the index handles one before the index in the string
     #to handle the root cases across the top and down the left of the
     #match matrix.
-    for from_row_i in range( len(from_sentance)+1 ):
-        for to_column_i in range( len(to_sentance )+1 ):
             best_option = None
@@ -195,7 +195,7 @@ def _trace_edits( from_sentance, to_sentance, print_debug=False ):
                     best_option = _edit_trace_hop()
                     best_option.parrent = current_row[to_column_i-1]
                     best_option.edit_distance = best_option.parrent.edit_distance + 1
-                    best_option.char = to_sentance[to_column_i-1]
                     best_option.from_row_i = from_row_i
                     best_option.to_column_i = to_column_i
                     best_option.action = INSERT_TO
@@ -206,19 +206,19 @@ def _trace_edits( from_sentance, to_sentance, print_debug=False ):
                     best_option = _edit_trace_hop()
                     best_option.parrent = last_row[to_column_i]
                     best_option.edit_distance = best_option.parrent.edit_distance + 1
-                    best_option.char = from_sentance[from_row_i-1]
                     best_option.from_row_i = from_row_i
                     best_option.to_column_i = to_column_i
                     best_option.action = DELETE_FROM
                 #check match
                 if to_column_i > 0:
-                    if to_sentance[to_column_i-1] == from_sentance[from_row_i-1]:
                         if best_option is None or last_row[to_column_i-1].edit_distance <= best_option.edit_distance: #prefer match so use <= than <
                             best_option = _edit_trace_hop()
                             best_option.parrent = last_row[to_column_i-1]
                             best_option.edit_distance = best_option.parrent.edit_distance + 1
-                            best_option.char = from_sentance[from_row_i-1]
                             best_option.from_row_i = from_row_i
                             best_option.to_column_i = to_column_i
                             best_option.action = MATCH
@@ -246,8 +246,8 @@ def _trace_edits( from_sentance, to_sentance, print_debug=False ):
     return last_row[-1]
-def _parse_single_for_training( from_sentance, to_sentance, num_pre_context_chars, num_post_context_chars ):
-    trace = _trace_edits( from_sentance, to_sentance )
     #we will collect a snapshot at each step.
     trace_list = _list_trace(trace)
@@ -255,8 +255,8 @@ def _parse_single_for_training( from_sentance, to_sentance, num_pre_context_char
     training_collection = []
-    #execute these things on the from_sentance and see if we get the to_sentance.
-    working_from = from_sentance
     working_to = ""
     used_from = ""
     continuous_added = 0
@@ -298,7 +298,7 @@ def _parse_single_for_training( from_sentance, to_sentance, num_pre_context_char
             continuous_dropped = 0
-    if to_sentance != working_to:
         print( "Replay failure" )
     #so now I have training_collection which is a list of dictionaries where each dictionary is an action with a context.
@@ -348,18 +348,18 @@ def _parse_single_for_training( from_sentance, to_sentance, num_pre_context_char
     return pd.DataFrame( context_split_into_dict ), pd.DataFrame( result_split_into_dict )
-def _parse_for_training( from_sentances, to_sentances, num_pre_context_chars, num_post_context_chars ):
     out_observations_list = []
     out_results_list = []
-    for index, (from_sentance, to_sentance) in enumerate(zip( from_sentances, to_sentances )):
-        if type(from_sentance) != float and type(to_sentance) != float: #bad lines are nan which are floats.
-            specific_observation, specific_result = _parse_single_for_training( from_sentance, to_sentance, num_pre_context_chars=num_pre_context_chars, num_post_context_chars=num_post_context_chars )
             out_observations_list.append( specific_observation )
             out_results_list.append( specific_result )
         if index % 100 == 0:
-            print( f"parsing {index} of {len(from_sentances)}")
     return pd.concat( out_observations_list ), pd.concat( out_results_list )
@@ -507,8 +507,8 @@ def train( in_csv, a_header, b_header, model, iterations, device, leading_contex
     tm = Transmorgrifier()
-    tm.train( from_sentances=train_data[a_header],
-            to_sentances=train_data[b_header],
             iterations = iterations,
             device = device,
             leading_context = leading_context,

 FILE_VERSION = 1
 class Transmorgrifier:
+    def train( self, from_sentences, to_sentences, iterations = 4000, device = 'cpu', trailing_context = 7, leading_context = 7, verbose=True ):
         """
         Train the Transmorgrifier model.  This does not save it to disk but just trains in memory.
         Keyword arguments:
+        from_sentences -- An array of strings for the input sentences.
+        to_sentences -- An array of strings of the same length as from_sentences which the model is to train to convert to.
         iterations -- An integer specifying the number of iterations to convert from or to. (default 4000)
         device -- The gpu reference which catboost wants or "cpu". (default cpu)
         trailing_context -- The number of characters after the action point to include for context. (default 7)
         leading_context -- The number of characters before the action point to include for context. (default 7)
         verbose -- Increased the amount of text output during training. (default True)
         """
+        X,Y = _parse_for_training( from_sentences, to_sentences, num_pre_context_chars=leading_context, num_post_context_chars=trailing_context )
         #train and save the action_model
         self.action_model = _train_catboost( X, Y['action'], iterations, verbose=verbose, device=device, model_piece='action' )
         return self
+    def execute( self, from_sentences, verbose=False ):
         """
         Runs the data from from_sentaces.  The results are returned
         using yield so you need to wrap this in list() if you want
+        to index it.  from_sentences can be an array or a generator.
         Keyword arguments:
+        from_sentences -- Something iterable which returns strings.
         """
+        for i,from_sentence in enumerate(from_sentences):
             yield _do_reconstruct(
                 action_model=self.action_model,
                 char_model=self.char_model,
+                text=from_sentence,
                 num_pre_context_chars=self.leading_context,
                 num_post_context_chars=self.trailing_context  )
             if verbose and i % 10 == 0:
+                print( f"{i} of {len(from_sentences)}" )
     def demo( self, share=False ):
         import gradio as gr
     def __repr__( self ):
         return self.__str__()
+def _trace_edits( from_sentence, to_sentence, print_debug=False ):
     #iterating from will be the rows down the left side.
     #iterating to will be the columns across the top.
     #we will keep one row as we work on the next.
     #the index handles one before the index in the string
     #to handle the root cases across the top and down the left of the
     #match matrix.
+    for from_row_i in range( len(from_sentence)+1 ):
+        for to_column_i in range( len(to_sentence )+1 ):
             best_option = None
                     best_option = _edit_trace_hop()
                     best_option.parrent = current_row[to_column_i-1]
                     best_option.edit_distance = best_option.parrent.edit_distance + 1
+                    best_option.char = to_sentence[to_column_i-1]
                     best_option.from_row_i = from_row_i
                     best_option.to_column_i = to_column_i
                     best_option.action = INSERT_TO
                     best_option = _edit_trace_hop()
                     best_option.parrent = last_row[to_column_i]
                     best_option.edit_distance = best_option.parrent.edit_distance + 1
+                    best_option.char = from_sentence[from_row_i-1]
                     best_option.from_row_i = from_row_i
                     best_option.to_column_i = to_column_i
                     best_option.action = DELETE_FROM
                 #check match
                 if to_column_i > 0:
+                    if to_sentence[to_column_i-1] == from_sentence[from_row_i-1]:
                         if best_option is None or last_row[to_column_i-1].edit_distance <= best_option.edit_distance: #prefer match so use <= than <
                             best_option = _edit_trace_hop()
                             best_option.parrent = last_row[to_column_i-1]
                             best_option.edit_distance = best_option.parrent.edit_distance + 1
+                            best_option.char = from_sentence[from_row_i-1]
                             best_option.from_row_i = from_row_i
                             best_option.to_column_i = to_column_i
                             best_option.action = MATCH
     return last_row[-1]
+def _parse_single_for_training( from_sentence, to_sentence, num_pre_context_chars, num_post_context_chars ):
+    trace = _trace_edits( from_sentence, to_sentence )
     #we will collect a snapshot at each step.
     trace_list = _list_trace(trace)
     training_collection = []
+    #execute these things on the from_sentence and see if we get the to_sentence.
+    working_from = from_sentence
     working_to = ""
     used_from = ""
     continuous_added = 0
             continuous_dropped = 0
+    if to_sentence != working_to:
         print( "Replay failure" )
     #so now I have training_collection which is a list of dictionaries where each dictionary is an action with a context.
     return pd.DataFrame( context_split_into_dict ), pd.DataFrame( result_split_into_dict )
+def _parse_for_training( from_sentences, to_sentences, num_pre_context_chars, num_post_context_chars ):
     out_observations_list = []
     out_results_list = []
+    for index, (from_sentence, to_sentence) in enumerate(zip( from_sentences, to_sentences )):
+        if type(from_sentence) != float and type(to_sentence) != float: #bad lines are nan which are floats.
+            specific_observation, specific_result = _parse_single_for_training( from_sentence, to_sentence, num_pre_context_chars=num_pre_context_chars, num_post_context_chars=num_post_context_chars )
             out_observations_list.append( specific_observation )
             out_results_list.append( specific_result )
         if index % 100 == 0:
+            print( f"parsing {index} of {len(from_sentences)}")
     return pd.concat( out_observations_list ), pd.concat( out_results_list )
     tm = Transmorgrifier()
+    tm.train( from_sentences=train_data[a_header],
+            to_sentences=train_data[b_header],
             iterations = iterations,
             device = device,
             leading_context = leading_context,