crackalamoo commited on
Commit
1427339
·
1 Parent(s): 7245849

Upload files for inference

Browse files
Files changed (10) hide show
  1. constants.py +197 -0
  2. lemmas/ed.npy +3 -0
  3. lemmas/er.npy +3 -0
  4. lemmas/est.npy +3 -0
  5. lemmas/ing.npy +3 -0
  6. lemmas/lemmas.npy +3 -0
  7. lemmas/s.npy +3 -0
  8. model.py +433 -0
  9. saved_models/b_model.h5 +3 -0
  10. tokens.py +534 -0
constants.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
def parseArgv(argv):
    """Parse command-line arguments into a settings dict.

    *argv* follows sys.argv conventions: argv[0] is the program name. An
    optional positional model type ('n', 't', or 'b') may appear as
    argv[1]; remaining arguments are ``--flag value`` pairs, except
    ``--kaggle`` which is a bare boolean flag. Unknown arguments are
    ignored.

    Returns a dict mapping flag names to parsed values.

    Raises ValueError when a value-taking flag is the last token (the
    original code raised a bare IndexError on argv[i+1] in that case).
    """
    data = {
        '--model-type': 'b',
        '--vocab-size': 4096,
        '--ngram-n': 4,
        '--transformer-n': 32,
        '--kaggle': False,
        '--rhyme-size': 4,
        '--meter-size': 3,
    }
    startParse = 1
    # Shorthand: a bare leading 'n'/'t'/'b' selects the model type.
    if len(argv) > 1 and argv[1] in ['n', 't', 'b']:
        data['--model-type'] = argv[1]
        startParse = 2
    for i in range(startParse, len(argv)):
        if argv[i] in data:
            if argv[i] == '--kaggle':
                data[argv[i]] = True
            elif i + 1 >= len(argv):
                # Guard against a trailing flag with no value.
                raise ValueError("missing value for argument " + argv[i])
            elif argv[i] == '--model-type':
                data[argv[i]] = argv[i + 1]
            else:
                data[argv[i]] = int(argv[i + 1])
    return data
25
# Parse settings once at import time; other modules import these constants.
sysArgs = parseArgv(sys.argv)

VOCAB_SIZE = sysArgs['--vocab-size']
NGRAM_N = sysArgs['--ngram-n']
TRANSFORMER_N = sysArgs['--transformer-n']
MODEL_TYPE = sysArgs['--model-type'] # n: ngram, t: transformer, b: bard
KAGGLE = sysArgs['--kaggle']
# Stride between training windows: 2 for the ngram model, otherwise one
# less than the transformer context length — presumably used when slicing
# the token stream into examples; confirm against the data-prep code.
TOKEN_SKIP = 2 if MODEL_TYPE == 'n' else TRANSFORMER_N-1
RHYME_STACK_SIZE = sysArgs['--rhyme-size']
METER_STACK_SIZE = sysArgs['--meter-size']
# Cardinalities of the vowel/consonant rhyme categories (one-hot depths
# used by model.rhyme_meter_encoding).
VOWEL_TYPES = 14
CONSONANT_TYPES = 10

# Special markers embedded in the token stream (space-padded so they
# survive naive whitespace splitting).
TITLE = " <TITLE> "
NEWLINE = " <NEWLINE> "
40
+
41
+ BRITISH_OUR = ['neighbor','color','flavor','splendor','labor','favor','fervor','savior','vapor','endeavor','parlor',
42
+ 'clamor','harbor','splendor','behavior','rumor','humor','savor','valor','armor','honor','odor']
43
+
44
+ est_set = set(['for','t','n','liv','b','di','j','r','p','v','w','b','gu',
45
+ 'l','eld','pr','inter','sever','hug','earn','smil',
46
+ 'qu','ch','bl','conqu','pri'])
47
+ ed_set = set(['he','you','they','we','will','mov','w','wretch','fe','wav','gre',
48
+ 'till','far','fell','de','b','f','l','re','hopp','ne','br',
49
+ 'mann','bann','bl','pleas','mark','m','sh','se','spe','ble',
50
+ 'lov','ste','rous','arm','bar','di','unmov','asham','cre'])
51
+ d_set = set(['be', 'she','we','see','re','fe','rowe','fee','le','seale','dee','ne',
52
+ 'reveale','traine','warme','coole','saile','sweate','mowe','cooke',
53
+ 'gree','warne','aire','seate','ree','temp','doome','helpe','feare',
54
+ 'neare','designe','adde','parte','repeate','gaine','parke','mourne',
55
+ 'backe','cleane','raine','charme','climbe','wee','fle','barbe','roote',
56
+ 'waite','fixe','hee','ende','wounde','pointe','earne','cree','matte',
57
+ 'kisse','haire','marke','neede','summe','farme','poure','owne','showe',
58
+ 'crowne','entere','evene','turne','crouche','laye','jade','recorde',
59
+ 'flowe','looke','nee','calle','learne','spe','ble','fille','washe',
60
+ 'boxe','talke','returne','sacre','dreame','pulle','seeme','calle',
61
+ 'prie','forme','ruine','lighte','appeare','adorne','aske','locke',
62
+ 'crosse','misse','arme','towe','shoute','heade','burne','faile','bowe',
63
+ 'rolle','walke','heape','obtaine'])
64
+ c_ed_set = set(['ad','cares','jag','pis','kis','mat','er','mis','cal','pas','fil','wo'])
65
+ y_ed_set = set(['drapery','city','weary'])
66
+ s_set = set(['','a','i','it','his','her',"'",'their','one','will','your','our','down','pant','wa',
67
+ 'god','well','other','saw','good','new','ye','leave','right','wood',
68
+ 'ha','thi','hi','jesu','riche','specie','alway','ala','grasse','glorie',
69
+ 'goe','doe','mas','pis','mi','pi','selve','wherea','prie','masse',
70
+ 'beautie','jame','misse','san','la','lo','politic','u','ga','bu','tos',
71
+ 'len'])
72
+ st_set = set(['be','we','ne','re','tempe','le','mode', 'fore','le','que','riche','cre','pe',
73
+ 'harde','sweete','cleane','je','te','che','highe','earne','deepe','meane','prie',
74
+ 'olde'])
75
+ c_est_set = set(['ful','smal'])
76
+ er_set = set(['with','she','h','quak','curr','hopp','minist','eth','thund','whisp','whit',
77
+ 'fev','rememb','inn','rend','de','beak','wand','port','heath','clos','should',
78
+ 'wrapp','cap','cow','lett','moth','chart','prop','danc','dinn','slumb','tend',
79
+ 'sever','ladd','falt','eld','aft','hind','flatt','murd','show','flow','sob',
80
+ 'pray','s','numb','pond','ev','und','wint','shiv','ang','fin','hov','teach',
81
+ 'clov','ov','oth','riv','barb','post','nev','discov','wat','draw','wait',
82
+ 'suff','deliv','quiv','silv','cov','shelt','los','m','slipp','batt','plast',
83
+ 'bitt','p','be','pe','ti','pi','ve','se','us','ton','min','sew','lit','tig',
84
+ 'lat','inn','out','off','ent','low','pow','less','wond','mann','care','lov',
85
+ 'rath','form','summ','bett','found','quart','tap','pap','record','shudd','pitch',
86
+ 'shatt','tatt','rid','butt','mis','bould','bord','glimm','answ','wav','walk',
87
+ 'glitt','gath','stick','care','temp','fish','corn','flick','dress','feath','met',
88
+ 'broth','both','lock','tow','conqu','che','encount','head','alt','mutt','san'])
89
+ c_er_set = set(['of','in','but','up','man','let','shut','sum','slip','din','flit',
90
+ 'mat','bat','bit','lad','ban','bet','ad','flat','pe','ful','smal','up',
91
+ 'pis','kis','slip','lat','cop','begin','shud','washe','shat','tat','lit',
92
+ 'glim','lay','lad','cal','glit','pas','fil','ham','sup','pep','rub','chat',
93
+ 'skip','alte','flut','mut','scat','dip','stag','wo'])
94
+ r_set = set(['he',"'re",'rule','cottage','quake','cove','clove','warble','prime','lowe',
95
+ 'cape','tempe','late','e','rive','dee','eve','wave','me','rathe','meter',
96
+ 'anothe','mothe','mowe','sweate','saile','leade','hithe','warme','coole',
97
+ 'reaveale','traine','chee','manne','shee','uppe','withe','designe','neare',
98
+ 'barbe','darke','banne','pete','faste','soone','oute','rende','parke',
99
+ 'keepe','lee','rooste','cleane','sweete','bothe','harde','sleepe','poste',
100
+ 'loude','climbe','flowe','drawe','waite','highe','lathe','summe','fathe',
101
+ 'cove','farme','lose','showe','deepe','longe','hove','teache','pe','rule',
102
+ 'freeze','compute','consume','recorde','fille','washe','boxe','talke',
103
+ 'spide','meane','outside','inside','laye','lighte','reade','ladde',
104
+ 'eage','forme','coppe','answe','aske','dinne','wave','glitte','feve',
105
+ 'butte','gathe','pape','broke','matte','time','locke','olde','towe','inne',
106
+ 'shoute','heade','cunne','burne','singe','mutte','rolle','dippe','walke'])
107
+ ing_set = set(['','us','s','st','n','wan','din','k','heav','w','morn','cloth','br','wav',
108
+ 'even','cl','noth','charm','th','spr','bl','p','r','d','tempt','m','s','z',
109
+ 'ch','mean','exact','bless','train','lov','str','build','pleas','slid','light',
110
+ 'stock','feel','bo','gap'])
111
+ c_ing_set = set(['er','wed','ad','ear','begin','pis','kis','er','mis','cal','pas','fil'])
112
+ e_ing_set = set(['the','we','bee','bore','lute','ne','re','please','displease','tide','clothe','ke',
113
+ 'neare','wounde','che','feare','doome','helpe','designe','evene','dye',
114
+ 'adde','parte','repeate','gaine','parke','mourne','backe','cleane','charme',
115
+ 'climbe','waite','fixe','raine','ende','wounde','pointe','earne','neede',
116
+ 'summe','poure','owne','crowne','entere','turne','crouche','ble','laye',
117
+ 'recorde','flowe','calle','morne','learne','fille','washe','boxe','talke',
118
+ 'kisse','returne','dreame','pulle','seeme','matte','forme','meane','ruine',
119
+ 'lighte','reade','appeare','adorne','stocke','aske','locke','calle','crosse',
120
+ 'misse','towe','shoute','feele','heade','burne','singe','faile','bowe',
121
+ 'rolle','walke','heape','obtaine'])
122
+ y_s_set = set(['ry'])
123
+ y_er_set = set(['by'])
124
+ y_est_set = set(['pry'])
125
+
126
+ BANNED_TOKENS = ['1','2','3','y','e','l','maud','olaf','lorenzo','de','oscar',
127
+ 'r','d','f','p','agnes','eulalie','kate','niam','thel','asius',
128
+ 'saadi','\\\\','juanna','johnson','dudù','moore','xanthus',
129
+ 'arjun','pandav','draupadi','bhishma','karna','pandu','bhima',
130
+ 'duryodhan','drona','abhimanyu','yudhishthir','agamemnon','narad',
131
+ 'antilochus','diomed','helen','ulysses','achilles','nestor',
132
+ 'menelaus','patroclus','hector','aeneas','laertes','priam',
133
+ 'penelope','eumaeus','telemachus','euryclea','sarpedon','peleus',
134
+ 'polydamas','glaucus','antenor','idomeneus','rishi','boreas',
135
+ 'phaeacian','savitri','kuru','diana','panchala','ida','ithaca',
136
+ 'matsya','pritha','salya','kripa','hastina','sisupala','vidura',
137
+ 'dhrita','rashtra','jayadratha','lamia','medon','highth','haydée',
138
+ 'haidée', 'edward','ithacus',
139
+ 'lenore','à','negro','juan','harold','etc','allan','adeline',
140
+ '+++++++++++++','c','j','h','4','5','6','7','8','9','10',
141
+ '11','12','*','x','b','/','k','g','ii','s','u','da','el',
142
+ 'le','que','~','000','m','thu','thir','13','14','15','16','17',
143
+ '18','19','20','30','th','bu','ri','w','v','al','iv','wi',
144
+ 'la','las','t','ma','ha','mee','ne','em','ry','di','st',
145
+ 'yr','ful','iii','bo','faire','tos','ai','en','et','sug',
146
+ 'ga','wel','hee','hon','n','wan','ut','te','ad','hym','na']
147
+ PUNCT = set(['.', ',', '!', '?', ':', ';', '-'])
148
+ VOWELS = set(['a','e','i','o','u'])
149
+ SOMETIMES_VOWELS = VOWELS.union(['y','w'])
150
+
151
+ DEFINED_RHYMES = {
152
+ "'ll": [4,1], "=er": [13,0], "the": [4,-1], 'a': [4,-1], 'we': [8,-1], 'ye': [8,-1], 'e': [8,-1],
153
+ 'zimbabwe': [7,-1], 'one': [4,2], 'two': [11,-1], 'oh': [10,-1], 'ah': [12,-1], 'i': [9,-1],
154
+ 'you': [11,-1], 'own': [10,2], 'know': [10,-1], 'do': [11,-1], 'upon': [3,2], 'whereon': [3,2],
155
+ 'world': [13,4], 'learn': [13,2], 'earn': [13,2], 'yearn': [13,2], 'of': [4,5], 'service': [4,6],
156
+ 'practice': [4,6], 'police': [8,6], 'through': [11,-1], 'tough': [4,5], 'enough': [4,5],
157
+ 'thorough': [10,-1], 'dough': [10,-1], 'rough': [4,5], 'cough': [3,5], 'snow': [10,-1],
158
+ 'w': [11,-1], 'walk': [3,7], 'talk': [3,7], 'son':[4,2], 'iron': [13,2], 'anon': [3,2],
159
+ 'full': [11,1], 'pull': [11,1], 'bull': [11,1], 'put': [11,1], 'push': [11,6], 'book': [11,7],
160
+ 'won': [4,2], 'what': [4,4], 'who': [11,-1], 'whose': [11,6], 'where': [7,0], 'there': [7,0],
161
+ 'their': [7,0], 'theirs': [7,6], 'bear': [7,0], 'wear': [7,0], 'show': [10,-1], 'tow': [10,-1],
162
+ 'sow': [10,-1], 'brow': [5,-1], 'prow': [5,-1], 'allow': [5,-1], 'laugh': [0,5],
163
+ 'elbow': [10,-1], 'window': [10,-1], 'rainbow': [10,-1], 'shadow': [10,-1], 'ancient': [1,4],
164
+ 'meant': [1,4], 'dreamt': [1,4], 'learnt': [13,4], 'hymn': [2,2], 'could': [11,4], 'should': [11,4],
165
+ 'to': [11,-1], 'was': [4,6], 'were': [13,0], 'love': [4,5], 'eye': [9,-1], 'bury': [8,-1],
166
+ 'your': [11,0], 'heart': [12,4], 'some': [4,2], 'come': [4,2], 'from': [4,2], 'become': [4,2],
167
+ 'would': [11,4], 'pour': [10,0],'figure': [13,0], 'author': [4,0], 'sure': [11,0], 'rhythm': [4,2],
168
+ 'every': [8,-1], 'very': [8,-1], 'many': [8,-1], 'any': [8,-1], 'busy': [8,-1], 'easy': [8,-1],
169
+ 'happy': [8,-1], 'live': [2,5], 'into': [11,-1], 'soul': [10,2], 'only': [8,-1], 'earth': [13,10],
170
+ 'though': [10,-1], 'thought': [3,4], 'bought': [3,4], 'brought': [3,4], 'ought': [3,4],
171
+ 'said': [1,4], 'dead': [1,4], 'word': [13,4], 'heard': [13,4], 'death': [1,10], 'head': [1,4],
172
+ 'once': [4,6], 'great': [7,4], 'young': [4,2], 'among': [4,2], 'yon': [3,2], 'wh': [-1,-1],
173
+ 'door': [10,0], 'find': [9,4], 'mind': [9,4], 'kind': [9,4], 'behind': [9,4], 'blind': [9,4],
174
+ 'wild': [9,4], 'give': [2,5], 'beauty': [8,-1], 'duty': [8,-1], 'move': [11,5], 'above': [4,5],
175
+ 'prove': [11,5], 'have': [0,5], 'whom': [11,2], 'warm': [10,2], 'done': [4,2], 'gone': [3,2],
176
+ 'behind': [9,4], 'none': [4,2], 'most': [10,4], 'ghost': [10,4], 'host': [10,4], 'post': [10,4],
177
+ 'travel': [4,1], 'broad': [3,4],'veil': [7,1],'tread': [1,4], 'bread': [1,4], 'ocean': [4,2],
178
+ 'truth': [11,10], 'human': [4,2], 'woman': [4,2], 'unto': [11,-1], 'worm': [13,4], 'blood': [4,4],
179
+ 'instead': [1,4], 'spread': [1,4], 'ahead': [1,4], 'breadth': [1,10], 'breath': [1,10],
180
+ 'valley': [8,-1], 'key': [8,-1], 'journey': [8,-1], 'honey': [8,-1], 'money': [8,-1],
181
+ 'chimney': [8,-1], 'monkey': [8,-1], 'donkey': [8,-1], 'alley': [8,-1], 'trolley': [8,-1],
182
+ 'galley': [8,-1], 'silly': [8,-1], 'lily': [8,-1], 'barley': [8,-1], 'quiet': [4,4],
183
+ 'else': [1,1], 'christian': [4,2], 'shadow': [10,-1], 'meadow': [10,-1], 'mow': [10,-1],
184
+ 'bestow': [10,-1], 'widow': [10,-1], 'friend': [1,4], 'source': [10,6], 'course': [10,6],
185
+ 'lyre': [9,0], 'curse': [13,6], 'rehearse': [13,6], 'are': [12,0], 'genuine': [2,2],
186
+ 'fly': [9,-1], 'july': [9,-1], 'reply': [9,-1], 'butterfly': [9,-1], 'ply': [9,-1],
187
+ 'supply': [9,-1], 'folk': [10,7], 'welcome': [4,2], 'wash': [3,6], 'child': [9,4],
188
+ 'deaf': [1,4], 'league': [8,7], 'plague': [7,7], 'vague': [7,7], 'overhead': [1,4]
189
+ }
190
+ DEFINED_METERS = {
191
+ "'re": 0, "'ve": 0, 'shakespeare': 2, 'every': 2, 'leaves': 1, 'evening': 2,
192
+ 'tongue': 1, 'lovely': 2, 'quiet': 2, 'people': 2, 'something': 2,
193
+ 'beautiful': 3, 'lyre': 1, 'hymn': 1, 'forego': 2, 'therefore': 2,
194
+ 'somewhere': 2
195
+ }
196
+ for word in BRITISH_OUR:
197
+ DEFINED_RHYMES[word] = [4,0]
lemmas/ed.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aca3dce224618296c3e20468ecbcc85dbe80643577560ad38428f023fbdede1c
3
+ size 5616
lemmas/er.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1100564b80079fef9223fce5dcabbdef5a76acc75d82bcc2bceb038a8b60380
3
+ size 7715
lemmas/est.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e22c89fd4f66d10760be51319efc98ae4f28b2c78ee08ffc4b2acaa6e251589
3
+ size 2641
lemmas/ing.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2917172bddf0147b993e4aab8383a0b95c2df0fba32dbc94821079ce1cbbe5cf
3
+ size 15719
lemmas/lemmas.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4278d8ab3ff2cc6e234d9257ca360e03cd0b3185d085067e9701e2d8b57033ed
3
+ size 213120
lemmas/s.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:195dc16152a9d6017d298dcba3dda551dda7ee39224fdb784f58b37f38901e5f
3
+ size 4658
model.py ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ from tensorflow import keras
5
+ from keras.layers import Dense, Flatten, Dropout, Embedding,\
6
+ Add, MultiHeadAttention, LayerNormalization, Input, Softmax
7
+ import sys
8
+
9
+ from constants import *
10
+ from tokens import pretty_tokens, rhymeMeterFromTokens
11
+
12
+ EPOCHS = 10
13
+ WARMUP_STEPS = 800
14
+ EMBED_DIM = 512
15
+ TRANSFORMER_LAYERS = 8
16
+ TRANSFORMER_DFF = 1024
17
+ RHYME_METER_DFF = 64
18
+ TRANSFORMER_HEADS = 4
19
+ VAL_SPLIT = 0.2
20
+ BATCH_SIZE = 256
21
+ SAVE_AT_END = False
22
+ VERBOSE = False
23
+ TRAINING = True
24
+
25
+ if '--epochs' in sys.argv:
26
+ EPOCHS = int(sys.argv[sys.argv.index('--epochs')+1])
27
+ if '--warmup-steps' in sys.argv:
28
+ WARMUP_STEPS = int(sys.argv[sys.argv.index('--warmup-steps')+1])
29
+ if '--embed-dim' in sys.argv:
30
+ EMBED_DIM = int(sys.argv[sys.argv.index('--embed-dim')+1])
31
+ if '--transformer-layers' in sys.argv:
32
+ TRANSFORMER_LAYERS = int(sys.argv[sys.argv.index('--transformer-layers')+1])
33
+ if '--transformer-dff' in sys.argv:
34
+ TRANSFORMER_DFF = int(sys.argv[sys.argv.index('--transformer-dff')+1])
35
+ if '--rhyme-meter-dff' in sys.argv:
36
+ RHYME_METER_DFF = int(sys.argv[sys.argv.index('--rhyme-meter-dff')+1])
37
+ if '--transformer-heads' in sys.argv:
38
+ TRANSFORMER_HEADS = int(sys.argv[sys.argv.index('--transformer-heads')+1])
39
+ if '--val-split' in sys.argv:
40
+ VAL_SPLIT = float(sys.argv[sys.argv.index('--val-split')+1])
41
+ if '--batch-size' in sys.argv:
42
+ BATCH_SIZE = int(sys.argv[sys.argv.index('--batch-size')+1])
43
+ if '--save-at-end' in sys.argv:
44
+ SAVE_AT_END = True
45
+ if '--verbose' in sys.argv:
46
+ VERBOSE = True
47
+ if '--load' in sys.argv:
48
+ TRAINING = False
49
+
50
# Context window length depends on the model family.
N = NGRAM_N if MODEL_TYPE == 'n' else TRANSFORMER_N
# Vocabulary of lemma strings; list index == token id.
VOCAB = list(np.load('lemmas/lemmas.npy'))
# Fixed sample prompt (opening of Frost's "Stopping by Woods on a Snowy
# Evening") used when generating from a partially trained model.
TEST_PROMPT = '<title> stop =ing by woods on a snowy evening <newline> '+\
    'whose woods these are i think i know <newline> '+\
    'his house is in the village though <newline> he'
55
+
56
def sampleVocab(dist, temperature):
    """Sample an index from probability distribution *dist*.

    Uses standard temperature semantics: probabilities are raised to the
    power 1/temperature, so low temperatures sharpen the distribution and
    temperature == 0 is greedy (argmax). The original raised *dist* to the
    power `temperature`, which inverted the convention — temperature 0 was
    mapped to an exponent of ~0 and produced a near-uniform distribution
    instead of deterministic decoding.

    Also generalized to any distribution length instead of the hard-coded
    VOCAB_SIZE (callers always pass VOCAB_SIZE-long vectors, so this is
    backward compatible).
    """
    dist = np.asarray(dist, dtype=np.float64)
    if temperature <= 0:
        # Degenerate case: greedy decoding.
        return int(np.argmax(dist))
    dist = np.power(dist, 1.0 / temperature)
    dist /= np.sum(dist)
    return int(np.random.choice(len(dist), p=dist))
62
+
63
def genTokens(model, tokens, temperature=0.7, prompt=None):
    """Autoregressively sample *tokens* ids from *model* and map them back
    to vocabulary strings.

    When *prompt* is given, it is split on spaces and any words found in
    the model's vocabulary seed the generation; otherwise generation
    starts from the <title> marker.
    """
    seed = [model.vocab.index(TITLE.lower()[1:-1])]
    if prompt is not None:
        seed = [model.vocab.index(w) for w in prompt.split(' ') if w in model.vocab]
    for _ in range(tokens):
        nxt = model.generate(seed, temperature)
        assert nxt is not None
        seed.append(nxt)
    return [model.vocab[t] for t in seed]
73
+
74
class LinearModel(keras.Model):
    """Feed-forward n-gram baseline: predicts the next token from the
    one-hot encoding of the previous NGRAM_N-1 tokens."""

    def __init__(self):
        super(LinearModel, self).__init__()
        self.vocab = VOCAB
        layers = [
            Input(shape=(NGRAM_N-1, VOCAB_SIZE)),
            Flatten(),
            Dense(1024, activation='relu'),
            Dense(1024, activation='relu'),
            Dense(2048, activation='relu'),
            Dropout(0.2),
            Dense(VOCAB_SIZE, activation='softmax'),
        ]
        self.seq = keras.Sequential(layers)

    def call(self, input):
        # One-hot encode the token ids, then run the dense stack.
        encoded = tf.one_hot(input, VOCAB_SIZE)
        return self.seq(encoded)

    def generate(self, fullContext, temperature=0.7):
        """Sample the next token id given the full generation history."""
        # Slice copies, so trimming/padding never mutates the caller's list.
        window = fullContext[-(N-1):]
        while len(window) > NGRAM_N-1:
            window.pop(0)
        while len(window) < NGRAM_N-1:
            window.append(-1)  # -1 pads short contexts
        probs = self.call(np.asarray([window]))[0]
        return sampleVocab(probs, temperature)
103
+
104
+
105
def positional_encoding(length, depth):
    """Standard sinusoidal positional encoding.

    Returns a float32 tensor of shape (length, depth) whose first half of
    the feature axis is sines and second half cosines of position-scaled
    frequencies.
    """
    half = depth / 2
    pos = np.arange(length)[:, np.newaxis]       # (length, 1)
    freq = np.arange(half)[np.newaxis, :] / half  # (1, depth/2)
    rads = pos / (10000 ** freq)
    enc = np.concatenate([np.sin(rads), np.cos(rads)], axis=-1)
    return tf.cast(enc, dtype=tf.float32)
115
+
116
class InputEmbedding(keras.layers.Layer):
    """Token embedding plus fixed sinusoidal positional encoding."""

    def __init__(self):
        super().__init__()
        # VOCAB_SIZE+1 rows: callers shift ids by +1 so the padding id
        # (-1) lands on embedding row 0 (see the generate() methods).
        self.embed = Embedding(input_dim=VOCAB_SIZE+1, output_dim=EMBED_DIM)
        self.pos = positional_encoding(length=TRANSFORMER_N, depth=EMBED_DIM)
        self.add = Add()
        self.dropout = Dropout(0.1)

    def call(self, input):
        seq_len = tf.shape(input)[1]
        embedded = self.embed(input)
        # Scale embeddings before adding positions (standard Transformer).
        embedded = embedded * tf.math.sqrt(tf.cast(EMBED_DIM, tf.float32))
        summed = self.add([embedded, self.pos[tf.newaxis, :seq_len, :]])
        return self.dropout(summed)
130
+
131
class AttentionBlock(keras.layers.Layer):
    """Causal self-attention sublayer: MHA + dropout + residual + norm."""

    def __init__(self, **kwargs):
        super().__init__()
        self.mha = MultiHeadAttention(**kwargs)
        self.dropout = Dropout(0.1)
        self.norm = LayerNormalization()
        self.add = Add()

    def call(self, input):
        # Self-attention: query, key, and value are all the same tensor;
        # the causal mask keeps each position from attending forward.
        attended = self.mha(query=input, value=input, key=input,
                            use_causal_mask=True)
        attended = self.dropout(attended)
        residual = self.add([input, attended])
        return self.norm(residual)
144
+
145
class FeedForward(keras.layers.Layer):
    """Position-wise feed-forward sublayer with residual add + layer norm."""

    def __init__(self, dff):
        super().__init__()
        self.seq = keras.Sequential([
            Dense(dff, activation='relu'),
            Dense(EMBED_DIM),
            Dropout(0.1),
        ])
        self.add = Add()
        self.norm = LayerNormalization()

    def call(self, input):
        residual = self.add([input, self.seq(input)])
        return self.norm(residual)
159
+
160
class Decoder(keras.layers.Layer):
    """Stack of causal attention blocks followed by one feed-forward
    sublayer."""

    def __init__(self, *, num_layers, num_heads, dff):
        super(Decoder, self).__init__()
        blocks = [
            AttentionBlock(num_heads=num_heads, key_dim=EMBED_DIM, dropout=0.1)
            for _ in range(num_layers)
        ]
        self.attn_seq = keras.Sequential(blocks)
        self.ffn = FeedForward(dff)

    def call(self, input):
        return self.ffn(self.attn_seq(input))
172
+
173
class TransformerModel(keras.Model):
    """Decoder-only transformer language model over the poem vocabulary."""

    def __init__(self, *, num_layers=TRANSFORMER_LAYERS,
                 num_heads=TRANSFORMER_HEADS, dff=TRANSFORMER_DFF):
        super(TransformerModel, self).__init__()
        self.vocab = VOCAB
        self.embed = InputEmbedding()
        self.decoder = Decoder(num_layers=num_layers, num_heads=num_heads, dff=dff)
        self.out = Dense(VOCAB_SIZE, activation='softmax')

    def call(self, input):
        hidden = self.embed(input)     # batch x context x embedding
        hidden = self.decoder(hidden)  # batch x context x embedding
        probs = self.out(hidden)       # batch x context x vocab
        # Keras may attach a propagated mask to the output; strip it so
        # downstream loss/metrics see plain probabilities.
        try:
            del probs._keras_mask
        except AttributeError:
            pass
        return probs

    def generate(self, fullContext, temperature=0.7):
        """Sample the next token id from the last real context position."""
        window = fullContext[-N:]
        # Index of the final real token, recorded before padding.
        last = len(window) - 1
        while len(window) > TRANSFORMER_N:
            window.pop(0)
        while len(window) < TRANSFORMER_N:
            window.append(-1)
        # Shift ids by +1 so the padding id (-1) maps to embedding row 0.
        batch = np.asarray([window]) + 1
        probs = self.call(batch)[0][last]
        return sampleVocab(probs, temperature)
204
+
205
+
206
def rhyme_meter_encoding(input):
    """Split a packed rhyme/meter feature tensor into a one-hot rhyme
    tensor and a meter tensor.

    Last-axis layout (in chunks of RHYME_STACK_SIZE-1): vowel category
    ids, consonant category ids, rhyme-match flags, then the final
    METER_STACK_SIZE entries are meter features.

    Returns (rhyme, meter), both float32.
    """
    k = RHYME_STACK_SIZE - 1
    vowel_ids = tf.cast(input[:, :, :k], tf.int8)
    consonant_ids = tf.cast(input[:, :, k:2*k], tf.int8)
    match = tf.cast(input[:, :, 2*k:3*k], tf.float32)
    meter = tf.cast(input[:, :, -METER_STACK_SIZE:], tf.float32)

    vowel_1h = tf.one_hot(vowel_ids, depth=VOWEL_TYPES)
    consonant_1h = tf.one_hot(consonant_ids, depth=CONSONANT_TYPES)
    # Collapse the (stack, one-hot) dims into a single feature axis.
    vowel_1h = tf.reshape(
        vowel_1h, shape=(tf.shape(vowel_1h)[0], tf.shape(vowel_1h)[1], -1))
    consonant_1h = tf.reshape(
        consonant_1h, shape=(tf.shape(consonant_1h)[0], tf.shape(consonant_1h)[1], -1))
    vowel_1h = tf.cast(vowel_1h, tf.float32)
    consonant_1h = tf.cast(consonant_1h, tf.float32)

    rhyme = tf.concat([vowel_1h, consonant_1h, match], axis=2)
    return rhyme, meter
223
+
224
class RhymeMeterLayer(keras.layers.Layer):
    """Small MLP that turns rhyme and meter features into per-position
    vocabulary logits (added to the transformer's logits in BardModel)."""

    def __init__(self):
        super().__init__()
        self.dense_r1 = Dense(RHYME_METER_DFF, activation='relu')
        self.dense_m1 = Dense(RHYME_METER_DFF//2, activation='relu')
        self.dense_r2 = Dense(RHYME_METER_DFF, activation='relu')
        self.dense_3 = Dense(RHYME_METER_DFF*2, activation='relu')
        self.dense_final = Dense(VOCAB_SIZE)

    def call(self, input):
        rhyme, meter = rhyme_meter_encoding(input)
        # Separate branches for rhyme and meter, then merge and project.
        rhyme_h = self.dense_r2(self.dense_r1(rhyme))
        meter_h = self.dense_m1(meter)
        merged = tf.concat([rhyme_h, meter_h], axis=2)
        return self.dense_final(self.dense_3(merged))
243
+
244
class BardModel(keras.Model):
    """Transformer decoder combined with a rhyme/meter head; the two logit
    streams are summed before a final softmax."""

    def __init__(self, *, num_layers=TRANSFORMER_LAYERS,
                 num_heads=TRANSFORMER_HEADS, dff=TRANSFORMER_DFF):
        super(BardModel, self).__init__()
        self.vocab = VOCAB
        # Token id of the title marker (TITLE with its space padding and
        # surrounding angle-bracket padding chars stripped to '<title>').
        self.tl = VOCAB.index(TITLE.lower()[1:-1])
        self.rhyme_types = max(VOWEL_TYPES, CONSONANT_TYPES)
        self.embed = InputEmbedding()
        self.decoder = Decoder(num_layers=num_layers, num_heads=num_heads, dff=dff)
        self.transformer_pred = Dense(VOCAB_SIZE)
        self.rhyme_meter_pred = RhymeMeterLayer()
        self.add = Add()
        self.softmax = Softmax()

    def call(self, input):
        # input[0]: token ids; input[1]: packed rhyme/meter features.
        logits = self.transformer_pred(self.decoder(self.embed(input[0])))
        # Strip any propagated Keras mask before combining logit streams.
        try:
            del logits._keras_mask
        except AttributeError:
            pass
        rm_logits = self.rhyme_meter_pred(input[1])
        return self.softmax(self.add([logits, rm_logits]))

    def generate(self, fullContext, temperature=0.7):
        """Sample the next token id, conditioning on rhyme/meter features
        computed from the full history."""
        window = fullContext[-N:]
        # Index of the final real token, recorded before padding.
        last = len(window) - 1
        while len(window) > TRANSFORMER_N:
            window.pop(0)
        while len(window) < TRANSFORMER_N:
            window.append(-1)
        ids = np.asarray([window]) + 1  # shift so padding (-1) -> row 0
        rm = np.asarray([rhymeMeterFromTokens(fullContext, len(fullContext),
                                              self.tl, self.vocab)])
        probs = self.call([ids, rm])[0][last]
        return sampleVocab(probs, temperature)
285
+
286
+
287
+
288
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Transformer learning-rate schedule: linear warmup for
    `warmup_steps`, then inverse-square-root decay, all scaled by
    1/sqrt(d_model)."""

    def __init__(self, d_model, warmup_steps=WARMUP_STEPS):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, dtype=tf.float32)
        decay = tf.math.rsqrt(step)
        warmup = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(decay, warmup)
303
+
304
+
305
# Build the loss object once at import time; the original constructed a
# new SparseCategoricalCrossentropy on every call, i.e. every training
# step and every metric evaluation.
_SPARSE_XENT = keras.losses.SparseCategoricalCrossentropy(
    ignore_class=-1, reduction='none')

def sparse_loss(y_true, y_pred):
    """Per-token cross-entropy; padding positions (class -1) are ignored."""
    return _SPARSE_XENT(y_true, y_pred)

def sparse_perplexity(y_true, y_pred):
    """exp(mean token loss) — the standard perplexity metric."""
    return tf.math.exp(tf.math.reduce_mean(sparse_loss(y_true, y_pred)))
311
+
312
if __name__ == '__main__':
    # ---- Load training data for the selected model type ----
    fname = {'n': 'inputs/ngram_train.npz',
             't': 'inputs/transformer_train.npz',
             'b': 'inputs/bard_train.npz'
             }[MODEL_TYPE]
    print("Loading data from", fname)
    loaded = np.load(fname)
    train_x = loaded['x']
    train_y = loaded['y']
    if MODEL_TYPE == 'b':
        train_x = [tf.convert_to_tensor(train_x), tf.convert_to_tensor(loaded['rm'])] # rhyme and syllables
    if MODEL_TYPE == 'n':
        train_x = tf.convert_to_tensor(train_x, tf.int32)
    del loaded  # release the npz buffers before model construction

    if TRAINING and VERBOSE:
        if MODEL_TYPE != 'b':
            print("X:", train_x[10:14])
        else:
            print("X:", train_x[0][10:14])
            print("RM:", train_x[1][10:14][1])
        print("Y:", train_y[10:14])
        if MODEL_TYPE != 'b':
            print("X shape:", train_x.shape)
        print("Y shape:", train_y.shape)

    print("Initializing model")
    models = {'n': LinearModel, 't': TransformerModel, 'b': BardModel}
    model = models[MODEL_TYPE]()
    # Run one example through the model so Keras builds the weights
    # (required before summary()/load_weights()).
    if MODEL_TYPE != 'b':
        res = model(train_x[:1])
    else:
        x0 = train_x[0][:1]
        x1 = train_x[1][:1]
        res = model([x0, x1])
    if VERBOSE:
        print(model)
        print(res)
        print(model.summary())

    if TRAINING:
        print("Compiling model")
        learning_rate = CustomSchedule(EMBED_DIM)
        model.compile(optimizer=keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9),
                      loss=sparse_loss, metrics=[sparse_perplexity])

        print("Generating sample from baseline")
        print(pretty_tokens(genTokens(model, 25)))

        print("Training model")
        min_perplexity = None
        if not os.path.exists('saved_models'):
            os.mkdir('saved_models')

        class TrainCallback(keras.callbacks.Callback):
            # After each epoch: print a sample and checkpoint the weights
            # whenever (validation) perplexity reaches a new minimum.
            def on_epoch_end(self, epoch, logs=None):
                global min_perplexity
                perplexity = logs['val_sparse_perplexity'] if VAL_SPLIT > 0 else logs['sparse_perplexity']
                print("\rGenerating sample from model in training: "+
                      "epoch "+str(epoch+1)+", perplexity "+str(round(perplexity, 2)), end='')
                print(pretty_tokens(genTokens(model, 75)))
                if (min_perplexity is None or perplexity <= min_perplexity) and not SAVE_AT_END:
                    min_perplexity = perplexity
                    print("Saving model weights")
                    model.save_weights('saved_models/'+MODEL_TYPE+'_model.h5')

        model.fit(train_x, train_y,
                  batch_size=BATCH_SIZE, validation_split=VAL_SPLIT, epochs=EPOCHS,
                  callbacks=[TrainCallback()])

        if SAVE_AT_END:
            print("Saving final model weights")
            model.save_weights('saved_models/'+MODEL_TYPE+'_model.h5')

        print("Generating samples from final model")
        if VERBOSE:
            for i in range(10):
                print(pretty_tokens(genTokens(model, 100)))
        print(pretty_tokens(genTokens(model, 150, prompt=TEST_PROMPT)))
        print(pretty_tokens(genTokens(model, 500)))
        print(pretty_tokens(genTokens(model, 500)))

    else:
        # Inference mode (--load): drop training data and run a REPL.
        del train_x
        del train_y
        print("Loading weights")
        model.load_weights('saved_models/'+MODEL_TYPE+'_model.h5')

        # BUG FIX: `temp = 0.7` used to sit at the top of the while loop,
        # so the value set by the 't' command was discarded on the very
        # next iteration. Initialize it once, before the loop.
        temp = 0.7
        while True:
            print("Commands:\ng: generate sample with 250 tokens\nl: generate sample with custom length\np: generate sample with prompt\nt: set temperature\nq: quit")
            cmd = input("Enter command: ")
            try:
                if cmd == 'g':
                    print("Generating sample...")
                    print(pretty_tokens(genTokens(model, 250, temperature=temp)))
                if cmd == 'l':
                    length = int(input("Enter length: "))
                    print("Generating sample...")
                    print(pretty_tokens(genTokens(model, length, temperature=temp)))
                if cmd == 'p':
                    prompt = ""
                    print("Enter prompt as tokens separated by spaces and newlines.")
                    print("Example: <title> stop =ing by woods on a snowy evening\nwhose woods these are i think i know")
                    print("All tokens not in the vocabulary will be ignored.")
                    # Read lines until three consecutive blank lines are entered.
                    while not prompt.endswith('\n\n\n'):
                        prompt += input("")+'\n'
                    # Strip leading/trailing spaces and newlines.
                    while prompt.startswith(' ') or prompt.startswith('\n'):
                        prompt = prompt[1:]
                    while prompt.endswith(' ') or prompt.endswith('\n'):
                        prompt = prompt[:-1]
                    prompt = prompt.replace('\n', NEWLINE.lower())
                    length = int(input("Enter length: "))
                    print("Generating sample...")
                    print(pretty_tokens(genTokens(model, length, temperature=temp, prompt=prompt)))
                if cmd == 't':
                    print("Current temperature:", temp)
                    temp = float(input("New temperature: "))
                    print("Temperature set to", temp)
                if cmd == 'q':
                    sys.exit(0)
            except Exception as e:
                # Keep the REPL alive on bad input (e.g. non-numeric length).
                print("Error:", e)
saved_models/b_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f03eab3fd3dd13a08aadbe6a03e7b7e0c10ed7d38534484bc58e126634879a6f
3
+ size 157786768
tokens.py ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import numpy as np
import os
import sys
from constants import *
if __name__ == '__main__':
    from threading import Thread

# Number of worker threads for the preprocessing passes below.
N_THREADS = 32
if '--n_threads' in sys.argv:
    N_THREADS = int(sys.argv[sys.argv.index('--n_threads')+1])

if __name__ == '__main__':
    # Training-time path: build the vocabulary from the joined corpus.
    if not os.path.exists('lemmas'):
        os.mkdir('lemmas')

    # Use a context manager so the corpus file is always closed.
    with open("inputs/join.txt" if not KAGGLE else "inputs/join-kaggle.txt", "r") as file:
        text = file.read()

    tokens = text.split(" ")
    tokens = [x for x in tokens if x != '']
    print("Total number of tokens:", len(tokens))
    print("Counting tokens")
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
    # Most frequent first (stable, so ties keep first-occurrence order).
    words = sorted(counts, key=lambda word: counts[word], reverse=True)

    # Push banned tokens to the back so they fall outside the vocab window.
    for token in BANNED_TOKENS:
        if token in words:
            words.remove(token)
            words.append(token)
    # Fold every out-of-vocabulary token's count into <unk>.
    # PERF FIX: membership was previously tested against the list slice
    # words[:VOCAB_SIZE] inside the loop — O(VOCAB_SIZE) per word.
    # A precomputed set makes each test O(1) with identical semantics.
    top_words = set(words[:VOCAB_SIZE])
    counts['<unk>'] = 0
    for word in words:
        if word in top_words:
            continue
        counts['<unk>'] += counts[word]
    # Re-rank now that <unk> has an aggregate count.
    words = sorted(counts, key=lambda word: counts[word], reverse=True)
    for token in BANNED_TOKENS:
        if token in words:
            words.remove(token)
            words.append(token)

    vocab = set(words[:VOCAB_SIZE])
else:
    # Inference-time path: reuse the vocabulary saved by a training run.
    print("Loading vocab")
    vocab = set(list(np.load('lemmas/lemmas.npy')))
def pretty_tokens(tokens, mask=True):
    """Render a list of lemma/suffix tokens as human-readable text.

    Suffix tokens beginning with '=' ('=s', '=ed', '=er', '=est', '=ing',
    '=nt') are merged into the preceding word, preferring the saved
    lemma->inflection dictionaries and falling back to heuristic English
    spelling rules.  Newline/title markers become a line break and a
    decorative title glyph.  When `mask` is True, words outside the
    vocabulary are rendered as '<unk>'.
    """
    # Lemma -> inflected-form tables produced by the preprocessing pipeline.
    s_dict = np.load('lemmas/s.npy', allow_pickle=True).item()
    ed_dict = np.load('lemmas/ed.npy', allow_pickle=True).item()
    er_dict = np.load('lemmas/er.npy', allow_pickle=True).item()
    est_dict = np.load('lemmas/est.npy', allow_pickle=True).item()
    ing_dict = np.load('lemmas/ing.npy', allow_pickle=True).item()
    dicts = {'=s': s_dict, '=ed': ed_dict, '=er': er_dict, '=est': est_dict, '=ing': ing_dict}
    res = []
    i = 0
    def includeSpace(this):
        # Should `this` be preceded by a space, given the output so far?
        # Punctuation, dashes, newlines and clitics ("'s") attach directly.
        nonlocal res
        quote = set(["'", '"'])
        nospace = set(['\n','-'])
        prev = res[len(res)-1] if len(res) > 0 else None
        prev2 = res[len(res)-2] if len(res) > 1 else None
        space = not prev in nospace\
            and not this in PUNCT and not this == '\n'\
            and not (this.startswith("'") and this != "'")
        if prev in quote and not prev2 in PUNCT:
            # An opening quote (one not following punctuation) hugs the next word.
            space = False
        elif this in quote and prev in PUNCT:
            # A quote following punctuation hugs that punctuation.
            space = False
        return space
    while i < len(tokens):
        this = tokens[i]
        if this == NEWLINE.lower()[1:-1]:
            this = '\n'
        elif this == TITLE.lower()[1:-1]:
            this = '\n ༄༅༅ '  # decorative poem-title marker
        elif mask and not this in vocab:
            # Out-of-vocabulary word: emit '<unk>' and skip suffix merging.
            this = " <unk>"
            if not includeSpace(this):
                this = "<unk>"
            res.append(this)
            i += 1
            continue
        if i+1 < len(tokens):
            next = tokens[i+1]
            # Fold any run of '=suffix' tokens into the current word.
            while next.startswith('='):
                if next == "=nt":
                    # Negative contraction: "can" + "=nt" -> "can't", etc.
                    if tokens[i].endswith('n'):
                        this = this[:-1]
                    if tokens[i] == 'will':
                        this = 'wo'   # will + n't -> won't
                    elif tokens[i] == 'shall':
                        this = 'sha'  # shall + n't -> shan't
                    this = this+"n't"
                else:
                    if tokens[i] in dicts[next]:
                        # NOTE(review): membership is tested on tokens[i] but
                        # the lookup key is `this`; after an earlier suffix
                        # merge in this while-loop the two can differ —
                        # confirm this cannot raise KeyError.
                        this = dicts[next][this]
                    else:
                        # Heuristic English spelling rules for the suffix.
                        if next[1] == 'e' or next[1] == 'i':
                            if this.endswith('e'):
                                this = this[:-1]      # drop silent e (bake -> baking)
                            elif this.endswith('c'):
                                this = this+'k'       # picnic -> picnicking
                            if this.endswith('y') and next[1] == 'e' and len(this) > 2 and not this[-2] in VOWELS:
                                this = this[:-1]+'i'  # happy -> happier
                        if next[1] == 's':
                            if this.endswith('s') or this.endswith('sh') or this.endswith('x') or this.endswith('ch'):
                                this = this+'e'       # box -> boxes
                            if this.endswith('y') and len(this) > 2 and not this[-2] in VOWELS:
                                this = this[:-1]+'ie' # fly -> flies

                        this = this+next[1:]
                i += 1
                next = tokens[i+1] if i+1 < len(tokens) else ''
        if this.startswith('='):
            # Orphan suffix with no word to attach to: drop the '=' marker.
            this = this[1:]
        elif includeSpace(this):
            this = " "+this
        res.append(this)
        i += 1
    res = ''.join(res)
    res = res[1:] if res.startswith(' ') else res
    return res
def getRhyme(line):
    """Heuristically classify the rhyme sound of the last word of `line`.

    Returns [vowel_type, consonant_type]; -1 in either slot means unknown
    or not applicable (empty line, title line, no final consonant).
    """
    # rhyme format:
    # final vowel (short AEIO, schwa, long AEIOU, OW, OI, A/schwa before R; total 14)
    # final consonant (R, L, N/M/NG, P/B, T/D, F/V, S/SH/Z/ZH, K/G, CH/J, TH; total 10)
    if line is None or len(line) == 0:
        return [-1, -1]
    nl = NEWLINE.lower()[1:-1]
    tl = TITLE.lower()[1:-1]
    if line[0] == tl:
        return [-1, -1]  # title lines carry no rhyme information
    # Strip trailing newline tokens, punctuation and quotes.
    while line[-1] == nl or line[-1] in PUNCT or line[-1] == '"' or line[-1] == "'" or line[-1] is None:
        line = line[:-1]
        if len(line) == 0:
            return [-1, -1]
    word = line[-1]+''  # copy the token before trimming it below
    long_vowel = False
    vowel_type = None
    # Final-grapheme spelling -> base vowel class (see table below).
    vowel_map = {'a': 0, 'e': 1, 'i': 2, 'o': 3, 'u': 4, 'ow': 5, 'ou': 5, 'oi': 6, 'oy': 6,
                 'ay': 7, 'ai': 7, 'au': 3, 'aw': 3, 'ea': 8, 'ee': 8, 'eu': 11, 'ew': 11,
                 'oa': 10, 'oo': 11, 'y': 9, 'ey': 7, 'ei': 9}

    # vowel type format:
    # 0: A, 1: E, 2: I, 3: O, 4: U, 5: OW, 6: OI
    # short U is schwa
    # OW, OI are always long
    # before R: short E/I become schwa, schwa/short A get their own vowel type, short O becomes long O
    consonant_type = -1
    # Final-grapheme spelling -> consonant class.
    cons_map = {'r': 0, 'l': 1, 'n': 2, 'm': 2, 'ng': 2,
                'p': 3, 'b': 3, 't': 4, 'd': 4, 'f': 5,
                'v': 5, 's': 6, 'sh': 6, 'z': 6, 'zh': 6,
                'th': 9, 'k': 7, 'ch': 8, 'j': 8}
    # consonant type format:
    # 0: R, 1: L, 2: N/M/NG, 3: P/B, 4: T/D, 5: F/V, 6: S/SH/Z/ZH/TH, 7: K/G, 8: CH/J
    # total 9 consonant types

    # full vowel type list: (L=long, S=short, R=before R)
    # 0: AS (bat), 1: ES (bet), 2: IS (bit), 3: OS (bot), 4: US/schwa (but)
    # 5: OW (bout), 6: OI (boil)
    # 7: AL (bait), 8: EL (beat), 9: IL (bite), 10: OL (boat), 11: UL (boot)
    # 12: AR (bar), 13: schwa_R (butter, bird, burn)
    # total 14 vowel types
    def getVowel(type, isLong, beforeR):
        # Map a base vowel class to its final slot given length and a
        # following R (see the table above).
        if beforeR and not isLong:
            if type == 0:
                return 12  # AR as in "bar"
            if type == 1 or type == 2 or type == 4:
                return 13  # schwa+R as in "sir"/"burn"
            if type == 3:
                return 10  # OR merges with long O
        if isLong and 0 <= type <= 4:
            return type+7  # long counterparts occupy slots 7-11
        # NOTE(review): `type` can still be None here when no grapheme
        # matched vowel_map; with isLong True that comparison above would
        # raise TypeError — confirm inputs always hit vowel_map.
        return type

    # Contraction / suffix tokens rhyme on the previous word but force a
    # particular final consonant class.
    lock_consonant = -1
    if len(line) > 1:
        if word == '=ed':
            if line[-2].endswith('t') or line[-2].endswith('d'):
                return [4, 4]  # adds a schwa + T/D syllable ("wanted")
            lock_consonant = 4
            word = line[-2]
        if word == '=s' or word == "'s":
            if line[-2].endswith('s') or line[-2].endswith('z') or line[-2].endswith('ch') or line[-2].endswith('sh') or line[-2].endswith('x'):
                return [4, 6]  # adds a schwa + S syllable ("wishes")
            lock_consonant = 6
            word = line[-2]
        elif word == "'re":
            lock_consonant = 0
            word = line[-2]
        elif word == "'ve":
            lock_consonant = 5
            word = line[-2]
        elif word == "'ll":
            lock_consonant = 1
            word = line[-2]
        elif word == "'d":
            lock_consonant = 4
            word = line[-2]
        elif word == "'m":
            lock_consonant = 2
            word = line[-2]
        elif word == "=nt'":
            lock_consonant = 4
            word = line[-2]
    # Hand-curated overrides for words the heuristics get wrong.
    if word in DEFINED_RHYMES:
        vowel_type = DEFINED_RHYMES[word][0]
        consonant_type = DEFINED_RHYMES[word][1] if lock_consonant == -1 else lock_consonant
        return [vowel_type, consonant_type]

    # Spelling-pattern special cases, checked before the generic scan.
    if word.endswith('o'):
        return [10, lock_consonant]  # final O is long ("go", "hello")
    if word.endswith('bble') or word.endswith('ggle'):
        return [4, 1 if lock_consonant == -1 else lock_consonant]
    if word.endswith('old'):
        return [10, 1 if lock_consonant == -1 else lock_consonant]
    if word.endswith('ance'):
        return [0, 6 if lock_consonant == -1 else lock_consonant]
    if word.endswith('ense') or word.endswith('ence'):
        return [1, 6 if lock_consonant == -1 else lock_consonant]
    if word.endswith('ince'):
        return [2, 6 if lock_consonant == -1 else lock_consonant]
    if word.endswith('ture') or word.endswith('sure'):
        return [13, 0 if lock_consonant == -1 else lock_consonant]
    if word.endswith('all'):
        return [3, 1 if lock_consonant == -1 else lock_consonant]
    if word.endswith('row') or word.endswith('low'):
        return [10, lock_consonant]
    if word.endswith('le') and len(word) >= 3 and not word[-3] in VOWELS:
        return [4, 1 if lock_consonant == -1 else lock_consonant]  # "-ble", "-tle": schwa + L
    if word.endswith('on') and len(word) > 3 and not word.endswith('oon'):
        return [4, 2 if lock_consonant == -1 else lock_consonant]  # unstressed "-on"
    if word.endswith('al') and len(word) > 3 and not word.endswith('eal'):
        return [4, 1 if lock_consonant == -1 else lock_consonant]  # unstressed "-al"
    if word.endswith('ous'):
        return [4, 6 if lock_consonant == -1 else lock_consonant]
    if word.endswith('ly'):
        return [8, -1 if lock_consonant == -1 else lock_consonant]  # "-ly" rhymes on EE
    if word.endswith('ward'):
        return [13, 4 if lock_consonant == -1 else lock_consonant]

    # A final silent e marks the preceding vowel as long.
    if word.endswith('e'):
        long_vowel = True
        word = word[:-1]
    # Determine the final consonant class (two-letter digraphs first),
    # unless a contraction already locked it.
    if lock_consonant == -1:
        if word[-2:] in cons_map:
            consonant_type = cons_map[word[-2:]]
        elif word[-1:] in cons_map:
            consonant_type = cons_map[word[-1:]]
        elif word[-1] == 'c' and long_vowel:
            consonant_type = cons_map['s']  # soft c ("ice")
        elif word[-1] == 'g' and long_vowel:
            consonant_type = cons_map['j']  # soft g ("age")
    else:
        consonant_type = lock_consonant

    # Strip trailing consonant letters to expose the final vowel grapheme,
    # remembering whether an R immediately follows it.
    lock_r = False
    if not word[-1] in SOMETIMES_VOWELS:
        while not word[-1] in SOMETIMES_VOWELS:
            if word.endswith('igh'):
                return [9, consonant_type]  # "-igh" is long I
            if word[-1] == 'r':
                lock_r = True
            elif lock_r:
                lock_r = False  # R no longer adjacent to the vowel
            word = word[:-1]
            if word == '':
                return [8, lock_consonant]
    # Classify the final vowel grapheme (digraphs first).
    if word[-2:] in vowel_map:
        vowel_type = vowel_map[word[-2:]]
    elif word[-1:] in vowel_map:
        vowel_type = vowel_map[word[-1:]]

    vowel_type = getVowel(vowel_type, long_vowel, consonant_type == 0 or lock_r)
    return [vowel_type, consonant_type]
def pretty_rhyme(rhyme):
    """Describe a [vowel_type, consonant_type] pair with example words."""
    # Example word for each of the 14 vowel classes, keyed by index.
    v_map = ['bat', 'bet', 'bit', 'bot', 'but', 'pout', 'boil', 'bait', 'beat', 'bite', 'boat', 'boot', 'bar', 'sir']
    # Label for each of the 10 consonant classes, keyed by index.
    c_map = ['R', 'L', 'N/M/NG', 'P/B', 'T/D', 'F/V', 'S/SH/Z/ZH', 'K/G', 'CH/J', 'TH']
    vowel, cons = rhyme[0], rhyme[1]
    # -1 means "no rhyme information" in either slot.
    vowel_text = '--' if vowel == -1 else v_map[vowel]
    cons_text = 'ø' if cons == -1 else c_map[cons]
    return "Rhyme is " + vowel_text + ' ' + cons_text
def getMeter(line):
    """Estimate the syllable count of `line` by counting vowel groups.

    Applies English-specific adjustments: suffix tokens ('=ed', '=s') only
    add a syllable after certain stems, silent final e is discounted,
    consonant+'le' adds one, y acts as a vowel, and common digraphs are
    collapsed to a single vowel letter before counting.
    """
    if line is None:
        return 0
    res = 0
    nl = NEWLINE.lower()[1:-1]
    tl = TITLE.lower()[1:-1]
    for i in range(len(line)):
        word = line[i]
        # Structural markers and padding contribute no syllables.
        if word == nl or word == tl or word is None:
            continue
        # Hand-curated syllable counts override the heuristics.
        if word in DEFINED_METERS:
            res += DEFINED_METERS[word]
            continue
        # '=ed' is syllabic only after t/d stems ("wanted" vs "walked").
        if word == '=ed' and i > 0:
            if line[i-1].endswith('t') or line[i-1].endswith('d') or line[i-1].endswith('te') or line[i-1].endswith('de'):
                res += 1
            continue
        # '=s' is syllabic only after sibilant stems ("wishes" vs "cats").
        if word == '=s' and i > 0:
            if line[i-1].endswith('s') or line[i-1].endswith('z') or line[i-1].endswith('ch') or line[i-1].endswith('sh') or line[i-1].endswith('x'):
                res += 1
            continue
        # Consonant + "le" ("little", "able") forms its own syllable.
        if word.endswith('le') and len(word) >= 3 and not word[-3] in VOWELS:
            res += 1 # to account for the dropped e
        removed_e = False
        # Silent final e does not count as a vowel.
        if word.endswith('e'):
            word = word[:-1]
            removed_e = True
        # Final y after a consonant acts as a vowel ("happy" -> "happi").
        if word.endswith('y') and len(word) > 2 and not word[-2] in VOWELS:
            word = word[:-1]+'i'
        # Collapse vowel digraphs to a single letter so each counts once.
        word = word.replace('ea','i').replace('ee','i')
        word = word.replace('ai','i').replace('au','o')
        word = word.replace('eu','u')
        word = word.replace('ei','i').replace('ie','i')
        word = word.replace('oa','o').replace('ou','o')
        word = word.replace('oi','o').replace('oo','u')
        # "-tion"/"-sion"/"-tian" is one syllable ("shun"), not two vowels.
        if word.endswith('tion') or word.endswith('sion') or word.endswith('tian'):
            word = word[:-4]+'shun'
        this_count = 0
        for vowel in VOWELS:
            this_count += word.count(vowel)
        # A word whose only vowel was the stripped e still has one syllable.
        if removed_e and this_count == 0:
            this_count = 1
        res += this_count
    return res
def lastLine(tokens, endl):
    """Return the tokens of the line ending just before index `endl`.

    Scans backwards from endl-1 for the most recent newline token and
    returns the slice from it (inclusive) up to endl; if no newline is
    found, returns everything from the start of `tokens`.
    """
    newline_tok = NEWLINE.lower()[1:-1]
    start = 0
    # Walk backwards; index 0 is never tested, matching the original scan.
    for idx in range(endl - 1, 0, -1):
        if tokens[idx] == newline_tok:
            start = idx
            break
    line = tokens[start:endl]
    # Degenerate slice (e.g. endl == 0): fall back to the prefix.
    return line if len(line) > 0 else tokens[:endl]
def processRhymeStack(rhyme_stack):
    """Flatten the rhyme history into a feature vector for the model.

    rhyme_stack is (RHYME_STACK_SIZE, 2): one [vowel, consonant] pair per
    recent line, newest last, -1 meaning unknown.  The output concatenates
    the raw older pairs (column-major) with RHYME_STACK_SIZE-1 match flags
    comparing each older line against the newest one.
    """
    # Older entries, flattened column-major: all vowels then all consonants.
    prev = rhyme_stack[:-1].flatten(order='F')
    lastRhyme = rhyme_stack[-1]
    res = np.zeros(RHYME_STACK_SIZE-1)
    if lastRhyme[0] != -1:
        for i in range(RHYME_STACK_SIZE-1):
            # NOTE(review): nesting reconstructed from a whitespace-stripped
            # source — assumed 2 (full rhyme) requires the vowel match as
            # well as the consonant match; confirm against the original.
            if rhyme_stack[i][0] == lastRhyme[0]:
                res[i] = 1  # vowel matches the newest line
                if rhyme_stack[i][1] == lastRhyme[1]:
                    res[i] = 2  # vowel and consonant both match
    res = np.concatenate([prev, res])
    return res
def processRhymeMeter(split):
    """Compute per-token rhyme and meter feature rows for a token stream.

    Walks `split` token by token, maintaining a rolling stack of the
    rhymes and syllable counts of the most recent completed lines; both
    stacks reset at each poem title.  Returns [rhymes, meter] where each
    list has one entry per input token.
    """
    in_title = False
    meter = []
    rhymes = []
    # meter_stack: syllable counts of recent lines; rhyme_stack: their
    # [vowel, consonant] classes, -1 meaning unknown/empty.
    meter_stack = np.zeros(METER_STACK_SIZE, np.int8)
    rhyme_stack = np.zeros((RHYME_STACK_SIZE, 2), np.int8) - 1
    tl = TITLE.lower()[1:-1]
    nl = NEWLINE.lower()[1:-1]
    for i in range(len(split)):
        # Tokens of the (possibly partial) current line ending at i.
        line = lastLine(split, i)
        if split[i] == tl:
            # New poem: clear all history.
            in_title = True
            meter_stack = np.zeros(METER_STACK_SIZE, np.int8)
            rhyme_stack = np.zeros((RHYME_STACK_SIZE, 2), np.int8) - 1
            meter.append(meter_stack.copy())
            rhymes.append(processRhymeStack(rhyme_stack))
            continue
        elif in_title and split[i] == nl:
            # End of the title line: record its rhyme/meter once, then
            # reset so the poem body starts with a clean history.
            in_title = False
            meter_stack = np.zeros(METER_STACK_SIZE, np.int8)
            meter_stack[-1] = getMeter(line)
            meter.append(meter_stack.copy())
            rhyme_stack = np.zeros((RHYME_STACK_SIZE, 2), np.int8) - 1
            rhyme_stack[-1] = np.array(getRhyme(line), np.int8)
            rhymes.append(processRhymeStack(rhyme_stack))
            meter_stack = np.zeros(METER_STACK_SIZE, np.int8)
            rhyme_stack = np.zeros((RHYME_STACK_SIZE, 2), np.int8) - 1
            continue
        if not in_title and split[i] == nl:
            # Line boundary: emit features first, then push the completed
            # line onto the stacks (skipping blank lines).
            rhymes.append(processRhymeStack(rhyme_stack))
            meter.append(meter_stack.copy())
            if split[i-1] != nl:
                rhyme_stack = np.roll(rhyme_stack, -1, axis=0)
                rhyme_stack[-1] = np.array(getRhyme(line), np.int8)
                meter_stack = np.roll(meter_stack, -1, axis=0)
                meter_stack[-1] = getMeter(line)
        else:
            # Mid-line token: the newest slot tracks the partial line.
            meter_stack[-1] = getMeter(line)
            rhyme_stack[-1] = np.array(getRhyme(line), np.int8)
            rhymes.append(processRhymeStack(rhyme_stack))
            meter.append(meter_stack.copy())
    return [rhymes, meter]
def rhymeMeterFromTokens(tokens, endl, tl, vocab=None):
    # used as input for model
    """Build the rhyme/meter feature matrix for the context ending at `endl`.

    tokens: token stream (integer indices when `vocab` is given, otherwise
    word strings); tl: the title token; vocab: optional index->word list
    used to decode integer tokens (out-of-range / None become None).
    Returns a numpy array of TRANSFORMER_N feature rows.
    """
    res = []
    # Back up to the start of the current poem (its title token).
    start = endl-1
    if len(tokens) >= endl:
        while start > 0 and tokens[start] != tl:
            start -= 1
    lines = tokens[start:endl]
    # Pad with None so processRhymeMeter sees at least TRANSFORMER_N tokens.
    while len(lines) < TRANSFORMER_N:
        lines.append(None)
    # Decode integer tokens to words when a vocabulary is supplied.
    input_lines = lines if vocab is None else [(vocab[x] if (x is not None and 0 <= x < VOCAB_SIZE) else None) for x in lines]
    rhymes, meter = processRhymeMeter(input_lines)
    rhymes = rhymes[-TRANSFORMER_N:] # context x RHYME_STACK_SIZE x 2
    meter = meter[-TRANSFORMER_N:] # context x METER_STACK_SIZE
    rhymes = np.array(rhymes)
    meter = np.array(meter)
    res = np.concatenate([rhymes, meter], axis=1) # context x (RHYME_STACK_SIZE*2 + METER_STACK_SIZE)
    return res
if __name__ == '__main__':
    # Build and save the training arrays for the selected model type.
    # Window length: NGRAM_N for the ngram model, otherwise a transformer
    # context of TRANSFORMER_N plus one target token.
    N = NGRAM_N if MODEL_TYPE == 'n' else TRANSFORMER_N+1
    # Pad the end of the stream so the final windows are complete.
    for i in range(N-1):
        tokens.append(None)
    words.remove('<unk>')
    print({word: counts[word] for word in words[:VOCAB_SIZE]})
    title_token = words.index(TITLE.lower()[1:-1])
    newline_token = words.index(NEWLINE.lower()[1:-1])

    print("Splitting poems with masked dividers")
    # A run of N mask values inserted before each poem title keeps windows
    # from spanning two poems.
    mask_list = [-1]*N
    splits = []
    chunk_size = len(tokens)//N_THREADS
    for i in range(N_THREADS):
        splits.append(
            tokens[i*chunk_size : (i+1)*chunk_size if i < N_THREADS-1 else len(tokens)])

    results = [None] * N_THREADS
    threads = []

    def add_dividers(thread_index, split):
        # Insert mask dividers before every title token in this chunk.
        # NOTE(review): `split` still holds word strings at this stage
        # (masking to indices happens later below), while `title_token` is
        # an integer vocabulary index — this equality test looks like it
        # can never be True; confirm whether it should compare against the
        # title word string instead.
        i = 1
        while i < len(split):
            if split[i] == title_token:
                split = split[:i] + mask_list + split[i:]
                i += N+5
            i += 1
        results[thread_index] = split
        return split
    for i in range(N_THREADS):
        t = Thread(target=add_dividers, args=(i, splits[i],))
        threads.append(t)
        t.start()
    tokens = []
    for i in range(N_THREADS):
        threads[i].join()
        tokens += results[i]

    if MODEL_TYPE == 'b':
        print("Computing rhyme and meter information")
        # Choose split points, then advance each to the next poem title so
        # every thread processes whole poems only.
        split_token_marks = []
        split_size = len(tokens)//N_THREADS
        for i in range(N_THREADS+1):
            split_token_marks.append(split_size*i)
        for i in range(1, N_THREADS):
            while tokens[split_token_marks[i]] != TITLE.lower()[1:-1]:
                split_token_marks[i] += 1
                if split_token_marks[i] >= len(tokens):
                    break
        meter_data = []
        rhymes_data = []
        split_token_marks[-1] = len(tokens)
        split_tokens = [tokens[split_token_marks[i]:split_token_marks[i+1]] for i in range(N_THREADS)]
        rhyme_meter_res = [None] * N_THREADS
        threads = []
        def rhymeMeterThread(thread_index, split):
            # Each thread fills its own slot; no locking needed.
            rhyme_meter_res[thread_index] = processRhymeMeter(split)
        for i in range(N_THREADS):
            t = Thread(target=rhymeMeterThread, args=(i, split_tokens[i]))
            threads.append(t)
            t.start()
        for i in range(N_THREADS):
            threads[i].join()
            rhymes_data += rhyme_meter_res[i][0]
            meter_data += rhyme_meter_res[i][1]

        print("Converting rhyme and meter information")
        rhymes_data = np.asarray(rhymes_data)
        meter_data = np.asarray(meter_data)
        # One feature row per token: rhyme flags followed by meter stack.
        rhyme_meter_data = np.concatenate([rhymes_data, meter_data], axis=1)

    print("Masking unknown tokens")
    # Map words to vocabulary indices; anything out of vocab becomes -1.
    # NOTE: words.index(x) is a linear scan per token — a precomputed
    # word->index dict would make this pass much faster.
    tokens = [(words.index(x) if x in vocab else -1) for x in tokens]

    print("Creating sets of ngrams")
    ngrams = []
    rm_ngrams = []
    for i in range(0, len(tokens)-N, TOKEN_SKIP):
        ngrams.append(tokens[i:i+N])
        if MODEL_TYPE == 'b':
            # Rhyme/meter rows aligned with the N-1 input positions.
            rm_ngrams.append(rhyme_meter_data[i:i+N-1,:])
    train_x = []
    train_y = []
    train_rm = []
    for i in range(len(ngrams)):
        sample = ngrams[i][:N]
        train_x.append(sample[:N-1])
        if MODEL_TYPE == 'b':
            sample_rm = rm_ngrams[i]
            train_rm.append(sample_rm)
        if MODEL_TYPE != 'n':
            # Transformer targets: the input shifted by one position.
            train_y.append(sample[1:])
        else:
            # Ngram target: the single next token.
            train_y.append(sample[N-1])
    print("Converting arrays")
    train_x = np.asarray(train_x)
    train_y = np.asarray(train_y)
    if MODEL_TYPE == 'b':
        train_rm = np.asarray(train_rm, np.int8)
    if MODEL_TYPE != 'n':
        train_x += 1 # x in [0, VOCAB_SIZE] since 0 is for <unk>
        # y in [-1, VOCAB_SIZE-1] with VOCAB_SIZE tokens, one for each vocabulary item, and -1 for <unk>

    print("Saving data")
    fname = {'n': 'inputs/ngram_train.npz',
             't': 'inputs/transformer_train.npz',
             'b': 'inputs/bard_train.npz'
             }[MODEL_TYPE]
    if MODEL_TYPE != 'b':
        np.savez_compressed(fname, x=train_x, y=train_y)
    else:
        np.savez_compressed(fname, x=train_x, rm=train_rm, y=train_y)
    # Persist the vocabulary for inference-time loading.
    np.save('lemmas/lemmas.npy', words[:VOCAB_SIZE])