#!/bin/bash
#
# Download and unpack the raw data sets into ./data/raw_data:
#
#   pretrained_embeddings/  glove.6B.zip (unzipped in place)
#   unlabeled_data/         1-billion-word language modeling benchmark
#                           tarball (extracted in place)
#   chunk/                  CoNLL-2000 chunking train.txt.gz / test.txt.gz
#                           (gunzipped in place)
#
# Usage: run from the directory that should contain ./data.
# Requires: curl, unzip, tar, gunzip.

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline, so a broken download never gets "unpacked".
set -euo pipefail

# Root of the data tree, relative to the directory the script is run from.
readonly TOPDIR='./data'
readonly RAW_DIR="${TOPDIR}/raw_data"

readonly GLOVE_URL='http://nlp.stanford.edu/data/glove.6B.zip'
readonly LM1B_ARCHIVE='1-billion-word-language-modeling-benchmark-r13output.tar.gz'
readonly LM1B_URL="http://www.statmt.org/lm-benchmark/${LM1B_ARCHIVE}"
readonly CHUNK_BASE_URL='https://www.clips.uantwerpen.be/conll2000/chunking'

#######################################
# Download a URL into the current directory, keeping the remote file name.
# Arguments: $1 - URL to fetch
# Returns:   curl's exit status; -f turns HTTP errors (4xx/5xx) into a
#            non-zero status instead of saving the error page as the file.
#######################################
fetch() {
  curl -fOL -- "$1"
}

#######################################
# Create the directory layout and fetch/unpack every data set.
# Each data set is handled inside a subshell so the `cd` is scoped to that
# step and the caller's working directory is never changed.
# Globals: RAW_DIR, GLOVE_URL, LM1B_ARCHIVE, LM1B_URL, CHUNK_BASE_URL (read)
#######################################
main() {
  # -p also creates ${TOPDIR} and ${TOPDIR}/raw_data as needed.
  mkdir -p -- \
    "${RAW_DIR}/pretrained_embeddings" \
    "${RAW_DIR}/unlabeled_data" \
    "${RAW_DIR}/chunk"

  echo "Preparing GloVe embeddings"
  (
    cd -- "${RAW_DIR}/pretrained_embeddings"
    fetch "${GLOVE_URL}"
    # -o: overwrite without prompting so a rerun does not block on stdin.
    unzip -o glove.6B.zip
  )
  echo

  echo "Preparing lm1b corpus"
  (
    cd -- "${RAW_DIR}/unlabeled_data"
    fetch "${LM1B_URL}"
    tar xzf "${LM1B_ARCHIVE}"
  )
  echo

  echo "Preparing chunking corpus"
  (
    cd -- "${RAW_DIR}/chunk"
    fetch "${CHUNK_BASE_URL}/train.txt.gz"
    fetch "${CHUNK_BASE_URL}/test.txt.gz"
    # Name the archives explicitly instead of globbing the whole directory;
    # -f replaces train.txt / test.txt left over from a previous run.
    gunzip -f train.txt.gz test.txt.gz
  )
  echo

  echo "Done with data fetching!"
}

main "$@"