|
#!/bin/bash |
|
|
|
|
|
|
|
|
|
|
|
# Abort on the first failing command, on any reference to an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail

# Directory holding this script; the mapping table and the Perl normaliser are
# both addressed relative to it. "$0" is quoted (and preceded by --) so that an
# install path containing whitespace or starting with a dash is handled
# correctly instead of being word-split or parsed as an option.
root=$(dirname -- "$0")

# Tab-separated table used to translate the requested language code.
lang_map_path=$root/utils.map_token_lang.tsv
|
|
|
# Print the one-line synopsis on stderr and terminate the script with status 1.
usage() {
    {
        echo "usage: $0 lang"
    } >&2
    exit 1
}
|
|
|
# Exactly one positional argument is accepted: the language code.
# Any other argument count prints the synopsis and exits via usage.
if (( $# != 1 )); then
    usage
fi

lang="$1"
|
|
|
# Associative array filled from the TSV: column 1 is the lookup key,
# column 2 is the value substituted for it.
declare -A lang_map

# Read the table line by line.
#   * -r keeps backslashes literal instead of treating them as escapes.
#   * "|| [[ -n $line ]]" still processes a final line that lacks a trailing
#     newline (read returns non-zero there but has filled $line).
#   * Leading/trailing blanks are trimmed by read (default IFS), as before.
# Fields are split with parameter expansion rather than two cut(1) processes
# per line.
while read -r line || [[ -n $line ]]; do
    # A blank line would produce an empty subscript, which bash rejects and
    # which would abort the whole script under "set -e".
    [[ -n $line ]] || continue

    if [[ $line == *$'\t'* ]]; then
        key=${line%%$'\t'*}      # text before the first tab
        val=${line#*$'\t'}       # text after the first tab ...
        val=${val%%$'\t'*}       # ... cut off at the next tab, if any
    else
        # cut(1) passes a delimiter-less line through unchanged for every
        # field, so such a line maps onto itself; keep that behaviour.
        key=$line
        val=$line
    fi

    lang_map["$key"]=$val
done < "$lang_map_path"
|
|
|
# Translate $lang through lang_map. The full argument is tried first, then its
# first three characters; if neither is a key, warn on stderr and use "en".
#
# Membership is tested with ${lang_map[key]+set} instead of
# [ -v "lang_map[$lang]" ]: the -v form hands the user-supplied text to test,
# which evaluates the subscript a second time (so an argument such as
# '$(cmd)' can be executed, and one containing ']' is mis-parsed). The
# expansion form treats the key as plain data and also works on bash
# versions that lack array-element support in -v.
found=0
if [[ -n $lang ]]; then          # an empty subscript is an error in bash
    for candidate in "$lang" "${lang:0:3}"; do
        if [[ -n ${lang_map[$candidate]+set} ]]; then
            lang=${lang_map[$candidate]}
            found=1
            break
        fi
    done
fi

if (( ! found )); then
    echo "undefined mapping: ${lang}, falling back to: en" >&2
    lang=en
fi
|
|
|
# Run the punctuation normaliser that sits next to this script, passing the
# resolved language code. Both expansions are quoted so that a script path with
# spaces, or an unusual mapped value, reaches perl as a single argument instead
# of being word-split or glob-expanded.
perl "$root/normalize-punctuation.perl" "$lang"
|
|