File size: 1,527 Bytes
87f72f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env bash
set -o errexit

# Usage: print-vocabulary <MODEL_DIR> <VOCAB_DIR>
#
# For every zipped model found under MODEL_DIR, writes that model's vocabulary
# to VOCAB_DIR/<model name>.txt. The vocabulary is taken from the words.txt or
# Gr.fst stored inside the archive.
# Assumes unzip and fstprint are in PATH and ngramfst.so is in LD_LIBRARY_PATH.

# Both arguments are required; an empty second argument counts as missing.
vocab_dir="$2"
if [[ -z "${vocab_dir}" ]]; then
    echo 'Usage: print-vocabulary <MODEL_DIR> <VOCAB_DIR>'
    exit 1
fi
model_dir="$1"

# Create the output directory (and any missing parents) up front.
mkdir -p "${vocab_dir}"

# Cleanup handler: deletes the scratch directory and everything inside it.
finish() {
    rm -rf "${temp_dir}"
}

# Scratch directory that receives the files extracted from each archive.
temp_dir="$(mktemp -d)"

# Run the cleanup handler whenever the script exits.
trap finish EXIT

#######################################
# Print a model's vocabulary to stdout: one unique symbol per line, sorted.
# words.txt is preferred when present (first space-separated field of each
# line); otherwise the third tab-separated field of `fstprint Gr.fst` is used.
# Arguments: $1 - directory holding the extracted words.txt and/or Gr.fst
# Returns:   0 on success; non-zero when neither file exists or when any
#            stage of the pipeline (e.g. fstprint) fails
#######################################
print_model_vocabulary() (
    # The function body is a subshell so that pipefail stays local to it.
    set -o pipefail

    extract_dir="$1"
    if [ -f "${extract_dir}/words.txt" ]; then
        cut -d' ' -f1 < "${extract_dir}/words.txt" | sort | uniq
    elif [ -f "${extract_dir}/Gr.fst" ]; then
        fstprint "${extract_dir}/Gr.fst" | cut -f3 | sort | uniq
    else
        return 1
    fi
)

#######################################
# Write a model's vocabulary to a file, leaving nothing behind on failure.
# The output is staged in "<vocab file>.partial" and renamed only when the
# whole pipeline succeeded, so a failed run can never leave a truncated,
# non-empty vocabulary that later runs would skip.
# Arguments: $1 - directory holding the extracted words.txt and/or Gr.fst
#            $2 - path of the vocabulary file to create
# Returns:   0 on success, non-zero if the vocabulary could not be produced
#######################################
write_vocabulary() {
    local extract_dir="$1"
    local vocab_file="$2"
    local partial_file="${vocab_file}.partial"

    if print_model_vocabulary "${extract_dir}" > "${partial_file}"; then
        mv -f -- "${partial_file}" "${vocab_file}"
    else
        rm -f -- "${partial_file}"
        return 1
    fi
}

# Process every *.zip below the model directory. The loop runs in the
# pipeline's subshell, so variables assigned inside it do not leak out.
# IFS= keeps leading/trailing whitespace in file names intact.
find "${model_dir}" -name '*.zip' -type f | \
    while IFS= read -r zip_file; do
        model_name="$(basename "${zip_file}" .zip)"
        vocab_file="${vocab_dir}/${model_name}.txt"

        # A non-empty vocabulary file means this model was already handled.
        if [ -s "${vocab_file}" ]; then
            echo "Skipping ${model_name} (${vocab_file})"
            continue
        fi

        # ":?" aborts instead of touching "/<model>" if temp_dir is empty.
        extract_dir="${temp_dir:?}/${model_name}"
        mkdir -p "${extract_dir}"

        # Try the known archive layouts in order and stop at the first member
        # that extracts; a model matching none of them is reported below.
        # stdin is /dev/null so unzip can never read from the find pipe.
        for member in graph/Gr.fst Gr.fst words.txt graph/words.txt; do
            if unzip -j "${zip_file}" "${model_name}/${member}" \
                -d "${extract_dir}" < /dev/null; then
                break
            fi
        done

        # Best effort: report the failure on stderr and carry on with the
        # remaining models.
        if ! write_vocabulary "${extract_dir}" "${vocab_file}"; then
            echo "ERROR: can't get vocabulary for ${model_name}" >&2
        fi

        # Drop the extracted files right away so they do not pile up in the
        # scratch directory and cannot collide with a same-named archive.
        rm -rf -- "${extract_dir}"
    done