import logging
import os
import tempfile

import pytest
from dotenv import dotenv_values

from s3prl.dataio.corpus.librispeech import LibriSpeech
from s3prl.dataio.encoder.tokenizer import load_tokenizer
from s3prl.dataio.encoder.vocabulary import generate_vocab

# Sentence used to verify that each tokenizer's encode/decode round-trips.
SAMPLE = "GOOD MORNING MY FRIEND"


def is_same_vocab(vocabs_1, vocabs_2):
    """Return True if both vocabularies hold the same tokens in the same order."""
    if len(vocabs_1) != len(vocabs_2):
        return False

    for v1, v2 in zip(vocabs_1, vocabs_2):
        if v1 != v2:
            return False

    return True


@pytest.mark.corpus
def test_vocabulary():
    config = dotenv_values()
    corpus = LibriSpeech(config["LibriSpeech"])
    text_list = corpus.data_dict["train-clean-100"]["text_list"]

    with tempfile.TemporaryDirectory() as directory:
        logging.info(directory)
        text_file = os.path.join(directory, "text.txt")

        # Dump the transcripts to disk so generate_vocab can be exercised with
        # both in-memory (text_list=) and on-disk (text_file=) inputs.
        with open(text_file, "w") as fp:
            for text in text_list:
                fp.write(text + "\n")

        # Character: vocabularies built from the list and from the file must
        # match, and the tokenizer must round-trip the sample sentence.
        char_vocabs_1 = generate_vocab("character", text_list=text_list)
        char_vocabs_2 = generate_vocab("character", text_file=text_file)

        assert isinstance(char_vocabs_1, list)
        assert isinstance(char_vocabs_2, list)
        assert is_same_vocab(char_vocabs_1, char_vocabs_2)

        char_tokenizer = load_tokenizer("character", vocab_list=char_vocabs_1)
        assert char_tokenizer.decode(char_tokenizer.encode(SAMPLE)) == SAMPLE

        # Word: same checks as above, with the vocabulary capped at 5000 entries.
        word_vocabs_1 = generate_vocab("word", text_list=text_list, vocab_size=5000)
        word_vocabs_2 = generate_vocab("word", text_file=text_file, vocab_size=5000)

        assert isinstance(word_vocabs_1, list)
        assert isinstance(word_vocabs_2, list)
        assert is_same_vocab(word_vocabs_1, word_vocabs_2)

        word_tokenizer = load_tokenizer("word", vocab_list=word_vocabs_1)
        assert word_tokenizer.decode(word_tokenizer.encode(SAMPLE)) == SAMPLE

        # Subword: generate_vocab trains a SentencePiece model and writes it to
        # "<output_file>.model", which load_tokenizer then loads.
        vocab_file_1 = os.path.join(directory, "subword_1")
        vocab_file_2 = os.path.join(directory, "subword_2")

        subword_vocabs_1 = generate_vocab(
            "subword", text_list=text_list, vocab_size=500, output_file=vocab_file_1
        )
        subword_vocabs_2 = generate_vocab(
            "subword", text_file=text_file, vocab_size=500, output_file=vocab_file_2
        )

        subword_tokenizer_1 = load_tokenizer(
            "subword", vocab_file=vocab_file_1 + ".model"
        )
        subword_tokenizer_2 = load_tokenizer(
            "subword", vocab_file=vocab_file_2 + ".model"
        )
        assert subword_tokenizer_1.decode(subword_tokenizer_1.encode(SAMPLE)) == SAMPLE
        assert subword_tokenizer_2.decode(subword_tokenizer_2.encode(SAMPLE)) == SAMPLE
        assert subword_tokenizer_1.encode(SAMPLE) == subword_tokenizer_2.encode(SAMPLE)