bardgpt / tokens.py

Upload files for inference

1427339 almost 2 years ago

20.5 kB

	import numpy as np
	import os
	import sys
	from constants import *
	if __name__ == '__main__':
	from threading import Thread

	N_THREADS = 32
	if '--n_threads' in sys.argv:
	N_THREADS = int(sys.argv[sys.argv.index('--n_threads')+1])

	if __name__ == '__main__':
	if not os.path.exists('lemmas'):
	os.mkdir('lemmas')

	file = open("inputs/join.txt" if not KAGGLE else "inputs/join-kaggle.txt", "r")
	text = file.read()
	file.close()

	tokens = text.split(" ")
	tokens = [x for x in tokens if x != '']
	print("Total number of tokens:", len(tokens))
	print("Counting tokens")
	counts = {}
	for token in tokens:
	if not token in counts:
	counts[token] = 0
	counts[token] += 1
	words = list(counts.keys())
	words.sort(reverse=True, key=lambda word: counts[word])

	for token in BANNED_TOKENS:
	if token in words:
	words.remove(token)
	words.append(token)
	counts['<unk>'] = 0
	for word in words:
	if word in words[:VOCAB_SIZE]:
	continue
	counts['<unk>'] += counts[word]
	words = list(counts.keys())
	words.sort(reverse=True, key=lambda word: counts[word])
	for token in BANNED_TOKENS:
	if token in words:
	words.remove(token)
	words.append(token)

	vocab = set(words[:VOCAB_SIZE])
	else:
	print("Loading vocab")
	vocab = set(list(np.load('lemmas/lemmas.npy')))

	def pretty_tokens(tokens, mask=True):
	s_dict = np.load('lemmas/s.npy', allow_pickle=True).item()
	ed_dict = np.load('lemmas/ed.npy', allow_pickle=True).item()
	er_dict = np.load('lemmas/er.npy', allow_pickle=True).item()
	est_dict = np.load('lemmas/est.npy', allow_pickle=True).item()
	ing_dict = np.load('lemmas/ing.npy', allow_pickle=True).item()
	dicts = {'=s': s_dict, '=ed': ed_dict, '=er': er_dict, '=est': est_dict, '=ing': ing_dict}
	res = []
	i = 0
	def includeSpace(this):
	nonlocal res
	quote = set(["'", '"'])
	nospace = set(['\n','-'])
	prev = res[len(res)-1] if len(res) > 0 else None
	prev2 = res[len(res)-2] if len(res) > 1 else None
	space = not prev in nospace\
	and not this in PUNCT and not this == '\n'\
	and not (this.startswith("'") and this != "'")
	if prev in quote and not prev2 in PUNCT:
	space = False
	elif this in quote and prev in PUNCT:
	space = False
	return space
	while i < len(tokens):
	this = tokens[i]
	if this == NEWLINE.lower()[1:-1]:
	this = '\n'
	elif this == TITLE.lower()[1:-1]:
	this = '\n ༄༅༅ '
	elif mask and not this in vocab:
	this = " <unk>"
	if not includeSpace(this):
	this = "<unk>"
	res.append(this)
	i += 1
	continue
	if i+1 < len(tokens):
	next = tokens[i+1]
	while next.startswith('='):
	if next == "=nt":
	if tokens[i].endswith('n'):
	this = this[:-1]
	if tokens[i] == 'will':
	this = 'wo'
	elif tokens[i] == 'shall':
	this = 'sha'
	this = this+"n't"
	else:
	if tokens[i] in dicts[next]:
	this = dicts[next][this]
	else:
	if next[1] == 'e' or next[1] == 'i':
	if this.endswith('e'):
	this = this[:-1]
	elif this.endswith('c'):
	this = this+'k'
	if this.endswith('y') and next[1] == 'e' and len(this) > 2 and not this[-2] in VOWELS:
	this = this[:-1]+'i'
	if next[1] == 's':
	if this.endswith('s') or this.endswith('sh') or this.endswith('x') or this.endswith('ch'):
	this = this+'e'
	if this.endswith('y') and len(this) > 2 and not this[-2] in VOWELS:
	this = this[:-1]+'ie'

	this = this+next[1:]
	i += 1
	next = tokens[i+1] if i+1 < len(tokens) else ''
	if this.startswith('='):
	this = this[1:]
	elif includeSpace(this):
	this = " "+this
	res.append(this)
	i += 1
	res = ''.join(res)
	res = res[1:] if res.startswith(' ') else res
	return res

	def getRhyme(line):
	# rhyme format:
	# final vowel (short AEIO, schwa, long AEIOU, OW, OI, A/schwa before R; total 14)
	# final consonant (R, L, N/M/NG, P/B, T/D, F/V, S/SH/Z/ZH, K/G, CH/J, TH; total 10)
	if line is None or len(line) == 0:
	return [-1, -1]
	nl = NEWLINE.lower()[1:-1]
	tl = TITLE.lower()[1:-1]
	if line[0] == tl:
	return [-1, -1]
	while line[-1] == nl or line[-1] in PUNCT or line[-1] == '"' or line[-1] == "'" or line[-1] is None:
	line = line[:-1]
	if len(line) == 0:
	return [-1, -1]
	word = line[-1]+''
	long_vowel = False
	vowel_type = None
	vowel_map = {'a': 0, 'e': 1, 'i': 2, 'o': 3, 'u': 4, 'ow': 5, 'ou': 5, 'oi': 6, 'oy': 6,
	'ay': 7, 'ai': 7, 'au': 3, 'aw': 3, 'ea': 8, 'ee': 8, 'eu': 11, 'ew': 11,
	'oa': 10, 'oo': 11, 'y': 9, 'ey': 7, 'ei': 9}

	# vowel type format:
	# 0: A, 1: E, 2: I, 3: O, 4: U, 5: OW, 6: OI
	# short U is schwa
	# OW, OI are always long
	# before R: short E/I become schwa, schwa/short A get their own vowel type, short O becomes long O
	consonant_type = -1
	cons_map = {'r': 0, 'l': 1, 'n': 2, 'm': 2, 'ng': 2,
	'p': 3, 'b': 3, 't': 4, 'd': 4, 'f': 5,
	'v': 5, 's': 6, 'sh': 6, 'z': 6, 'zh': 6,
	'th': 9, 'k': 7, 'ch': 8, 'j': 8}
	# consonant type format:
	# 0: R, 1: L, 2: N/M/NG, 3: P/B, 4: T/D, 5: F/V, 6: S/SH/Z/ZH/TH, 7: K/G, 8: CH/J
	# total 9 consonant types

	# full vowel type list: (L=long, S=short, R=before R)
	# 0: AS (bat), 1: ES (bet), 2: IS (bit), 3: OS (bot), 4: US/schwa (but)
	# 5: OW (bout), 6: OI (boil)
	# 7: AL (bait), 8: EL (beat), 9: IL (bite), 10: OL (boat), 11: UL (boot)
	# 12: AR (bar), 13: schwa_R (butter, bird, burn)
	# total 14 vowel types
	def getVowel(type, isLong, beforeR):
	if beforeR and not isLong:
	if type == 0:
	return 12
	if type ==1 or type == 2 or type == 4:
	return 13
	if type == 3:
	return 10
	if isLong and 0 <= type <= 4:
	return type+7
	return type

	lock_consonant = -1
	if len(line) > 1:
	if word == '=ed':
	if line[-2].endswith('t') or line[-2].endswith('d'):
	return [4, 4]
	lock_consonant = 4
	word = line[-2]
	if word == '=s' or word == "'s":
	if line[-2].endswith('s') or line[-2].endswith('z') or line[-2].endswith('ch') or line[-2].endswith('sh') or line[-2].endswith('x'):
	return [4, 6]
	lock_consonant = 6
	word = line[-2]
	elif word == "'re":
	lock_consonant = 0
	word = line[-2]
	elif word == "'ve":
	lock_consonant = 5
	word = line[-2]
	elif word == "'ll":
	lock_consonant = 1
	word = line[-2]
	elif word == "'d":
	lock_consonant = 4
	word = line[-2]
	elif word == "'m":
	lock_consonant = 2
	word = line[-2]
	elif word == "=nt'":
	lock_consonant = 4
	word = line[-2]
	if word in DEFINED_RHYMES:
	vowel_type = DEFINED_RHYMES[word][0]
	consonant_type = DEFINED_RHYMES[word][1] if lock_consonant == -1 else lock_consonant
	return [vowel_type, consonant_type]

	if word.endswith('o'):
	return [10, lock_consonant]
	if word.endswith('bble') or word.endswith('ggle'):
	return [4, 1 if lock_consonant == -1 else lock_consonant]
	if word.endswith('old'):
	return [10, 1 if lock_consonant == -1 else lock_consonant]
	if word.endswith('ance'):
	return [0, 6 if lock_consonant == -1 else lock_consonant]
	if word.endswith('ense') or word.endswith('ence'):
	return [1, 6 if lock_consonant == -1 else lock_consonant]
	if word.endswith('ince'):
	return [2, 6 if lock_consonant == -1 else lock_consonant]
	if word.endswith('ture') or word.endswith('sure'):
	return [13, 0 if lock_consonant == -1 else lock_consonant]
	if word.endswith('all'):
	return [3, 1 if lock_consonant == -1 else lock_consonant]
	if word.endswith('row') or word.endswith('low'):
	return [10, lock_consonant]
	if word.endswith('le') and len(word) >= 3 and not word[-3] in VOWELS:
	return [4, 1 if lock_consonant == -1 else lock_consonant]
	if word.endswith('on') and len(word) > 3 and not word.endswith('oon'):
	return [4, 2 if lock_consonant == -1 else lock_consonant]
	if word.endswith('al') and len(word) > 3 and not word.endswith('eal'):
	return [4, 1 if lock_consonant == -1 else lock_consonant]
	if word.endswith('ous'):
	return [4, 6 if lock_consonant == -1 else lock_consonant]
	if word.endswith('ly'):
	return [8, -1 if lock_consonant == -1 else lock_consonant]
	if word.endswith('ward'):
	return [13, 4 if lock_consonant == -1 else lock_consonant]

	if word.endswith('e'):
	long_vowel = True
	word = word[:-1]
	if lock_consonant == -1:
	if word[-2:] in cons_map:
	consonant_type = cons_map[word[-2:]]
	elif word[-1:] in cons_map:
	consonant_type = cons_map[word[-1:]]
	elif word[-1] == 'c' and long_vowel:
	consonant_type = cons_map['s']
	elif word[-1] == 'g' and long_vowel:
	consonant_type = cons_map['j']
	else:
	consonant_type = lock_consonant

	lock_r = False
	if not word[-1] in SOMETIMES_VOWELS:
	while not word[-1] in SOMETIMES_VOWELS:
	if word.endswith('igh'):
	return [9, consonant_type]
	if word[-1] == 'r':
	lock_r = True
	elif lock_r:
	lock_r = False
	word = word[:-1]
	if word == '':
	return [8, lock_consonant]
	if word[-2:] in vowel_map:
	vowel_type = vowel_map[word[-2:]]
	elif word[-1:] in vowel_map:
	vowel_type = vowel_map[word[-1:]]

	vowel_type = getVowel(vowel_type, long_vowel, consonant_type == 0 or lock_r)
	return [vowel_type, consonant_type]
	def pretty_rhyme(rhyme):
	v_map = ['bat', 'bet', 'bit', 'bot', 'but', 'pout', 'boil', 'bait', 'beat', 'bite', 'boat', 'boot', 'bar', 'sir']
	c_map = ['R', 'L', 'N/M/NG', 'P/B', 'T/D', 'F/V', 'S/SH/Z/ZH', 'K/G', 'CH/J', 'TH']
	return "Rhyme is " +\
	(v_map[rhyme[0]] if rhyme[0] != -1 else '--') + ' ' + (c_map[rhyme[1]] if rhyme[1] != -1 else 'ø')


	def getMeter(line):
	if line is None:
	return 0
	res = 0
	nl = NEWLINE.lower()[1:-1]
	tl = TITLE.lower()[1:-1]
	for i in range(len(line)):
	word = line[i]
	if word == nl or word == tl or word is None:
	continue
	if word in DEFINED_METERS:
	res += DEFINED_METERS[word]
	continue
	if word == '=ed' and i > 0:
	if line[i-1].endswith('t') or line[i-1].endswith('d') or line[i-1].endswith('te') or line[i-1].endswith('de'):
	res += 1
	continue
	if word == '=s' and i > 0:
	if line[i-1].endswith('s') or line[i-1].endswith('z') or line[i-1].endswith('ch') or line[i-1].endswith('sh') or line[i-1].endswith('x'):
	res += 1
	continue
	if word.endswith('le') and len(word) >= 3 and not word[-3] in VOWELS:
	res += 1 # to account for the dropped e
	removed_e = False
	if word.endswith('e'):
	word = word[:-1]
	removed_e = True
	if word.endswith('y') and len(word) > 2 and not word[-2] in VOWELS:
	word = word[:-1]+'i'
	word = word.replace('ea','i').replace('ee','i')
	word = word.replace('ai','i').replace('au','o')
	word = word.replace('eu','u')
	word = word.replace('ei','i').replace('ie','i')
	word = word.replace('oa','o').replace('ou','o')
	word = word.replace('oi','o').replace('oo','u')
	if word.endswith('tion') or word.endswith('sion') or word.endswith('tian'):
	word = word[:-4]+'shun'
	this_count = 0
	for vowel in VOWELS:
	this_count += word.count(vowel)
	if removed_e and this_count == 0:
	this_count = 1
	res += this_count
	return res

	def lastLine(tokens, endl):
	res = []
	nl = NEWLINE.lower()[1:-1]
	i = endl-1
	while i > 0:
	if tokens[i] == nl:
	break
	i -= 1
	res = tokens[i:endl]
	if len(res) == 0:
	res = tokens[:endl]
	return res
	def processRhymeStack(rhyme_stack):
	prev = rhyme_stack[:-1].flatten(order='F')
	lastRhyme = rhyme_stack[-1]
	res = np.zeros(RHYME_STACK_SIZE-1)
	if lastRhyme[0] != -1:
	for i in range(RHYME_STACK_SIZE-1):
	if rhyme_stack[i][0] == lastRhyme[0]:
	res[i] = 1
	if rhyme_stack[i][1] == lastRhyme[1]:
	res[i] = 2
	res = np.concatenate([prev, res])
	return res
	def processRhymeMeter(split):
	in_title = False
	meter = []
	rhymes = []
	meter_stack = np.zeros(METER_STACK_SIZE, np.int8)
	rhyme_stack = np.zeros((RHYME_STACK_SIZE, 2), np.int8) - 1
	tl = TITLE.lower()[1:-1]
	nl = NEWLINE.lower()[1:-1]
	for i in range(len(split)):
	line = lastLine(split, i)
	if split[i] == tl:
	in_title = True
	meter_stack = np.zeros(METER_STACK_SIZE, np.int8)
	rhyme_stack = np.zeros((RHYME_STACK_SIZE, 2), np.int8) - 1
	meter.append(meter_stack.copy())
	rhymes.append(processRhymeStack(rhyme_stack))
	continue
	elif in_title and split[i] == nl:
	in_title = False
	meter_stack = np.zeros(METER_STACK_SIZE, np.int8)
	meter_stack[-1] = getMeter(line)
	meter.append(meter_stack.copy())
	rhyme_stack = np.zeros((RHYME_STACK_SIZE, 2), np.int8) - 1
	rhyme_stack[-1] = np.array(getRhyme(line), np.int8)
	rhymes.append(processRhymeStack(rhyme_stack))
	meter_stack = np.zeros(METER_STACK_SIZE, np.int8)
	rhyme_stack = np.zeros((RHYME_STACK_SIZE, 2), np.int8) - 1
	continue
	if not in_title and split[i] == nl:
	rhymes.append(processRhymeStack(rhyme_stack))
	meter.append(meter_stack.copy())
	if split[i-1] != nl:
	rhyme_stack = np.roll(rhyme_stack, -1, axis=0)
	rhyme_stack[-1] = np.array(getRhyme(line), np.int8)
	meter_stack = np.roll(meter_stack, -1, axis=0)
	meter_stack[-1] = getMeter(line)
	else:
	meter_stack[-1] = getMeter(line)
	rhyme_stack[-1] = np.array(getRhyme(line), np.int8)
	rhymes.append(processRhymeStack(rhyme_stack))
	meter.append(meter_stack.copy())
	return [rhymes, meter]

	def rhymeMeterFromTokens(tokens, endl, tl, vocab=None):
	# used as input for model
	res = []
	start = endl-1
	if len(tokens) >= endl:
	while start > 0 and tokens[start] != tl:
	start -= 1
	lines = tokens[start:endl]
	while len(lines) < TRANSFORMER_N:
	lines.append(None)
	input_lines = lines if vocab is None else [(vocab[x] if (x is not None and 0 <= x < VOCAB_SIZE) else None) for x in lines]
	rhymes, meter = processRhymeMeter(input_lines)
	rhymes = rhymes[-TRANSFORMER_N:] # context x RHYME_STACK_SIZE x 2
	meter = meter[-TRANSFORMER_N:] # context x METER_STACK_SIZE
	rhymes = np.array(rhymes)
	meter = np.array(meter)
	res = np.concatenate([rhymes, meter], axis=1) # context x (RHYME_STACK_SIZE*2 + METER_STACK_SIZE)
	return res

	if __name__ == '__main__':
	N = NGRAM_N if MODEL_TYPE == 'n' else TRANSFORMER_N+1
	for i in range(N-1):
	tokens.append(None)
	words.remove('<unk>')
	print({word: counts[word] for word in words[:VOCAB_SIZE]})
	title_token = words.index(TITLE.lower()[1:-1])
	newline_token = words.index(NEWLINE.lower()[1:-1])

	print("Splitting poems with masked dividers")
	mask_list = [-1]*N
	splits = []
	chunk_size = len(tokens)//N_THREADS
	for i in range(N_THREADS):
	splits.append(
	tokens[ichunk_size : (i+1)chunk_size if i < N_THREADS-1 else len(tokens)])


	results = [None] * N_THREADS
	threads = []

	def add_dividers(thread_index, split):
	i = 1
	while i < len(split):
	if split[i] == title_token:
	split = split[:i] + mask_list + split[i:]
	i += N+5
	i += 1
	results[thread_index] = split
	return split
	for i in range(N_THREADS):
	t = Thread(target=add_dividers, args=(i, splits[i],))
	threads.append(t)
	t.start()
	tokens = []
	for i in range(N_THREADS):
	threads[i].join()
	tokens += results[i]

	if MODEL_TYPE == 'b':
	print("Computing rhyme and meter information")
	split_token_marks = []
	split_size = len(tokens)//N_THREADS
	for i in range(N_THREADS+1):
	split_token_marks.append(split_size*i)
	for i in range(1, N_THREADS):
	while tokens[split_token_marks[i]] != TITLE.lower()[1:-1]:
	split_token_marks[i] += 1
	if split_token_marks[i] >= len(tokens):
	break
	meter_data = []
	rhymes_data = []
	split_token_marks[-1] = len(tokens)
	split_tokens = [tokens[split_token_marks[i]:split_token_marks[i+1]] for i in range(N_THREADS)]
	rhyme_meter_res = [None] * N_THREADS
	threads = []
	def rhymeMeterThread(thread_index, split):
	rhyme_meter_res[thread_index] = processRhymeMeter(split)
	for i in range(N_THREADS):
	t = Thread(target=rhymeMeterThread, args=(i, split_tokens[i]))
	threads.append(t)
	t.start()
	for i in range(N_THREADS):
	threads[i].join()
	rhymes_data += rhyme_meter_res[i][0]
	meter_data += rhyme_meter_res[i][1]

	print("Converting rhyme and meter information")
	rhymes_data = np.asarray(rhymes_data)
	meter_data = np.asarray(meter_data)
	rhyme_meter_data = np.concatenate([rhymes_data, meter_data], axis=1)

	print("Masking unknown tokens")
	tokens = [(words.index(x) if x in vocab else -1) for x in tokens]

	print("Creating sets of ngrams")
	ngrams = []
	rm_ngrams = []
	for i in range(0, len(tokens)-N, TOKEN_SKIP):
	ngrams.append(tokens[i:i+N])
	if MODEL_TYPE == 'b':
	rm_ngrams.append(rhyme_meter_data[i:i+N-1,:])
	train_x = []
	train_y = []
	train_rm = []
	for i in range(len(ngrams)):
	sample = ngrams[i][:N]
	train_x.append(sample[:N-1])
	if MODEL_TYPE == 'b':
	sample_rm = rm_ngrams[i]
	train_rm.append(sample_rm)
	if MODEL_TYPE != 'n':
	train_y.append(sample[1:])
	else:
	train_y.append(sample[N-1])
	print("Converting arrays")
	train_x = np.asarray(train_x)
	train_y = np.asarray(train_y)
	if MODEL_TYPE == 'b':
	train_rm = np.asarray(train_rm, np.int8)
	if MODEL_TYPE != 'n':
	train_x += 1 # x in [0, VOCAB_SIZE] since 0 is for <unk>
	# y in [-1, VOCAB_SIZE-1] with VOCAB_SIZE tokens, one for each vocabulary item, and -1 for <unk>

	print("Saving data")
	fname = {'n': 'inputs/ngram_train.npz',
	't': 'inputs/transformer_train.npz',
	'b': 'inputs/bard_train.npz'
	}[MODEL_TYPE]
	if MODEL_TYPE != 'b':
	np.savez_compressed(fname, x=train_x, y=train_y)
	else:
	np.savez_compressed(fname, x=train_x, rm=train_rm, y=train_y)
	np.save('lemmas/lemmas.npy', words[:VOCAB_SIZE])