|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Utilities for parsing Kaggle baby names files.""" |
|
|
|
from __future__ import absolute_import |
|
from __future__ import division |
|
from __future__ import print_function |
|
|
|
import collections |
|
import os |
|
|
|
import numpy as np |
|
import tensorflow as tf |
|
import pandas as pd |
|
|
|
|
|
_EON = 0 |
|
|
|
|
|
def read_names(names_path): |
|
"""read data from downloaded file. See SmallNames.txt for example format |
|
or go to https://www.kaggle.com/kaggle/us-baby-names for full lists |
|
|
|
Args: |
|
names_path: path to the csv file similar to the example type |
|
Returns: |
|
Dataset: a namedtuple of two elements: deduped names and their associated |
|
counts. The names contain only 26 chars and are all lower case |
|
""" |
|
names_data = pd.read_csv(names_path) |
|
names_data.Name = names_data.Name.str.lower() |
|
|
|
name_data = names_data.groupby(by=["Name"])["Count"].sum() |
|
name_counts = np.array(name_data.tolist()) |
|
names_deduped = np.array(name_data.index.tolist()) |
|
|
|
Dataset = collections.namedtuple('Dataset', ['Name', 'Count']) |
|
return Dataset(names_deduped, name_counts) |
|
|
|
|
|
def _letter_to_number(letter): |
|
"""converts letters to numbers between 1 and 27""" |
|
|
|
return ord(letter) - 96 |
|
|
|
|
|
def namignizer_iterator(names, counts, batch_size, num_steps, epoch_size): |
|
"""Takes a list of names and counts like those output from read_names, and |
|
makes an iterator yielding a batch_size by num_steps array of random names |
|
separated by an end of name token. The names are chosen randomly according |
|
to their counts. The batch may end mid-name |
|
|
|
Args: |
|
names: a set of lowercase names composed of 26 characters |
|
counts: a list of the frequency of those names |
|
batch_size: int |
|
num_steps: int |
|
epoch_size: number of batches to yield |
|
Yields: |
|
(x, y): a batch_size by num_steps array of ints representing letters, where |
|
x will be the input and y will be the target |
|
""" |
|
name_distribution = counts / counts.sum() |
|
|
|
for i in range(epoch_size): |
|
data = np.zeros(batch_size * num_steps + 1) |
|
samples = np.random.choice(names, size=batch_size * num_steps // 2, |
|
replace=True, p=name_distribution) |
|
|
|
data_index = 0 |
|
for sample in samples: |
|
if data_index >= batch_size * num_steps: |
|
break |
|
for letter in map(_letter_to_number, sample) + [_EON]: |
|
if data_index >= batch_size * num_steps: |
|
break |
|
data[data_index] = letter |
|
data_index += 1 |
|
|
|
x = data[:batch_size * num_steps].reshape((batch_size, num_steps)) |
|
y = data[1:batch_size * num_steps + 1].reshape((batch_size, num_steps)) |
|
|
|
yield (x, y) |
|
|
|
|
|
def name_to_batch(name, batch_size, num_steps): |
|
""" Takes a single name and fills a batch with it |
|
|
|
Args: |
|
name: lowercase composed of 26 characters |
|
batch_size: int |
|
num_steps: int |
|
Returns: |
|
x, y: a batch_size by num_steps array of ints representing letters, where |
|
x will be the input and y will be the target. The array is filled up |
|
to the length of the string, the rest is filled with zeros |
|
""" |
|
data = np.zeros(batch_size * num_steps + 1) |
|
|
|
data_index = 0 |
|
for letter in map(_letter_to_number, name) + [_EON]: |
|
data[data_index] = letter |
|
data_index += 1 |
|
|
|
x = data[:batch_size * num_steps].reshape((batch_size, num_steps)) |
|
y = data[1:batch_size * num_steps + 1].reshape((batch_size, num_steps)) |
|
|
|
return x, y |
|
|