#!/bin/bash
#######################################
# Print the command-line help and terminate the script.
# The help lists, for selected values of n, how many tokens the resulting
# pg.txt file will contain.
# Arguments: none
# Outputs:   writes the help text to stderr (this is an error path)
# Returns:   never returns; exits the script with status 1
#######################################
usage() {
  cat >&2 <<EOF
usage: $0 <n>
note: n is the number of essays to download
for specific n, the resulting pg.txt file will have the following number of tokens:
n | tokens
--- | ---
1 | 6230
2 | 23619
5 | 25859
10 | 36888
15 | 50188
20 | 59094
25 | 88764
30 | 103121
32 | 108338
35 | 113403
40 | 127699
45 | 135896
EOF
  exit 1
}
#######################################
# Abort the script unless the named command resolves to an executable file.
# Arguments: $1 - name of the command to look up
# Outputs:   writes an error message to stderr when the command is missing
# Returns:   0 when the command is available; otherwise exits the script
#            with status 1
#######################################
has_cmd() {
  # "$1" is quoted so a name containing whitespace is looked up as ONE command
  # instead of being split into several independent lookups.
  if ! [ -x "$(command -v "$1")" ]; then
    printf 'error: %s is not available\n' "$1" >&2
    exit 1
  fi
}
# Verify that every external tool used below is installed before doing any
# work.
# check for: curl, grep, head, html2text, tail, sed, fmt
has_cmd curl
has_cmd grep
has_cmd head
has_cmd html2text
has_cmd tail
has_cmd sed
has_cmd fmt

# Exactly one argument is expected: the number of essays to download.
if [ $# -ne 1 ]; then
  usage
fi

n=$1

# n is handed to `head -n`, so anything that is not a positive integer is
# rejected up front.
case "$n" in
  '' | *[!0-9]* | 0) usage ;;
esac

# get urls: keep the feed lines that mention "html", trim each one down to the
# bare http...html link, and keep the first n of them.
feed_url="http://www.aaronsw.com/2002/feeds/pgessays.rss"
urls="$(curl -fsSL "$feed_url" \
  | grep html \
  | sed -e "s/.*http/http/" -e "s/html.*/html/" \
  | head -n "$n")"

# An empty list means the feed could not be fetched or contained no links.
if [ -z "$urls" ]; then
  echo "error: no essay urls could be read from $feed_url" >&2
  exit 1
fi

printf "urls:\n%s\n" "$urls"

# Start from a clean cumulative file.
rm -f pg.txt

c=1
while IFS= read -r url; do
  echo "processing $url"
  cc=$(printf "%03d" "$c")

  # Convert the page to text, drop the first 3 lines of the html2text output
  # (presumably page header lines -- confirm), strip leading whitespace and
  # re-wrap to 80 columns. The per-essay file is truncated with '>' so that
  # re-running the script does not append to a stale copy.
  curl -fsSL "$url" \
    | html2text \
    | tail -n +4 \
    | sed -E "s/^[[:space:]]+//g" \
    | fmt -w 80 > "pg-$cc-one.txt"
  if [ "${PIPESTATUS[0]}" -ne 0 ]; then
    echo "warning: failed to download $url" >&2
  fi

  # pg.txt accumulates all essays so far; pg-NNN-all.txt is a snapshot of it
  # after the NNN-th essay.
  cat "pg-$cc-one.txt" >> pg.txt
  cp -v pg.txt "pg-$cc-all.txt"
  c=$((c + 1))

  # don't flood the server
  sleep 1
done <<< "$urls"

echo "done. data in pg.txt"

exit 0