File size: 783 Bytes
9e582c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
class Language:
    """Character-level vocabulary mapping characters to integer indices.

    Three special symbols are always present:
    '#' (index 0) is padding, '$' (index 1) stands for an unknown
    character, and '^' (index 2) marks the start of a sequence.
    """

    def __init__(self, name):
        """Create a vocabulary called *name* holding only the special symbols."""
        self.name = name
        self.char2index = {'#': 0, '$': 1, '^': 2}   # '^': start of sequence, '$' : unknown char, '#' : padding
        self.index2char = {0: '#', 1: '$', 2: '^'}
        self.vocab_size = 3  # Number of distinct symbols, special ones included

    def addWord(self, word):
        """Register every character of *word* in the vocabulary."""
        for char in word:
            self.addChar(char)

    def addChar(self, char):
        """Assign the next free index to *char* if it has not been seen yet."""
        if char not in self.char2index:
            self.char2index[char] = self.vocab_size
            self.index2char[self.vocab_size] = char
            self.vocab_size += 1

    def encode(self, s):
        """Return the list of indices for the characters of *s*.

        Characters that were never added are mapped to the index of the
        unknown-character symbol '$' instead of raising ``KeyError``.
        """
        unknown = self.char2index['$']
        return [self.char2index.get(ch, unknown) for ch in s]

    def decode(self, l):
        """Return the string spelled by the indices in *l*.

        Raises ``KeyError`` if an index is not part of the vocabulary.
        """
        return ''.join(self.index2char[i] for i in l)

    def vocab(self):
        """Return a view of all known characters, special symbols included."""
        return self.char2index.keys()