#!/bin/bash

##################################################################################
# This script downloads the raw WikiText-103 corpus and performs basic data
# preparation for BPE and training.
##################################################################################

# print usage and exit if no data directory is given
if [ $# -eq 0 ]
then
    echo "usage: $0 <data_dir>"
    exit 1
fi
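
# Example invocation (script and directory names are illustrative):
#   bash ./prepare_wikitext.sh ./wikitext_data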

# Deterministic pseudo-random byte stream: AES-CTR-encrypting /dev/zero with a
# key derived from the seed yields a reproducible stream that can back
# `sort --random-source`.
get_seeded_random()
{
  seed="$1"
  openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \
    </dev/zero 2>/dev/null
}
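
# Illustrative only (not part of the pipeline): with a fixed seed the ordering
# is fully reproducible, e.g.
#   seq 1 10 | sort --random-source=<(get_seeded_random 42) -R
# prints the same permutation on every run with the same coreutils/openssl.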

# set relevant paths
SP_PATH=/usr/local/bin
DATA_PATH=$1
TEST_PATH=$DATA_PATH/test

CUR_DIR=$(pwd)

# Download the default datasets into the $DATA_PATH; mkdir if it doesn't exist
mkdir -p "$DATA_PATH"
cd "$DATA_PATH" || exit 1

echo "Downloading and extracting WikiText-103 (183 MB) for training and inference..."
wget --trust-server-names https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
unzip wikitext-103-raw-v1.zip
rm wikitext-103-raw-v1.zip
cd wikitext-103-raw || exit 1

echo "Removing empty lines and shuffling training data"
sed -r '/^\s*$/d' -i wiki.train.raw
sed -r '/^\s*$/d' -i wiki.valid.raw
sed -r '/^\s*$/d' -i wiki.test.raw
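# `-r` above enables GNU extended regexes; `/^\s*$/d` deletes lines that are
# empty or whitespace-only, leaving all content lines untouched.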
sort --random-source=<(get_seeded_random 42) -R -o wiki.train.raw wiki.train.raw
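
# Optional sanity check (commented out; an assumption, not part of the original
# script): `sort -R` orders lines by a seeded hash of their content, so
# re-sorting the shuffled file with the same seed should leave it unchanged:
#   sort --random-source=<(get_seeded_random 42) -R wiki.train.raw | diff -q - wiki.train.raw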