File size: 4,746 Bytes
cb7427c
 
 
29003f1
cb7427c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import os
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from source.config import Config


class Vocab:
    """
    Offers word2index and index2word functionality after counting words in input sentences.
    Allows choosing the size of the vocabulary by taking the most common words. Explicitly reserves four indices:
    <pad>, <sos>, <eos> and <unk>.
    """

    # Number of reserved tokens (<pad>, <sos>, <eos>, <unk>); they always occupy indices 0-3.
    NUM_PREDEFINED_TOKENS = 4

    def __init__(self, sentence_splitter=None):
        """
        Args:
            sentence_splitter: tokenizing function mapping a string to a list of tokens.
                Defaults to a regex tokenizer that keeps angle-bracketed tokens
                (e.g. '<sos>') intact as single tokens.
        """
        self.config = Config()

        self.counter = Counter()  # word -> raw frequency, filled by add_sentence()
        self.word2index = dict()
        self.index2word = dict()
        self.size = 0  # number of entries in word2index (kept in sync by build/load)

        # indices of the predefined tokens
        self.PADDING_INDEX = 0
        self.SOS = 1
        self.EOS = 2
        self.UNKNOWN_WORD_INDEX = 3

        if sentence_splitter is None:
            # matches sequences of word characters, including ones between < >
            word_regex = r'(?:\w+|<\w+>)'
            # tokenize the string into words
            sentence_splitter = RegexpTokenizer(word_regex).tokenize
        self.splitter = sentence_splitter

    def add_sentence(self, sentence: str):
        """Update word counts from ``sentence`` after tokenizing it into words."""
        self.counter.update(self.splitter(sentence))

    def build_vocab(self, vocab_size: int, file_name: str):
        """Build the word2index and index2word dictionaries from a CSV file at config.ROOT path.

        Args:
            vocab_size (int): size of vocabulary (including the 4 predefined tokens:
                <pad>, <sos>, <eos>, <unk>)
            file_name (str): name of the text file from which the vocabulary will be built.
                Note: the lines in the file are assumed to be in the form 'id,caption'
                with a header line (for example: 'captions.txt'). Lines without a
                comma are skipped.
        """
        filepath = os.path.join(self.config.ROOT, file_name)

        try:
            with open(filepath, 'r', encoding='utf-8') as file:
                for i, line in enumerate(file):
                    # ignore header line
                    if i == 0:
                        continue
                    # split off the id; skip malformed lines instead of aborting the build
                    parts = line.strip().lower().split(",", 1)  # id=0, caption=1
                    if len(parts) < 2:
                        continue
                    self.add_sentence(parts[1])
        except OSError as e:
            # best-effort: report and leave the vocabulary unbuilt
            print(f"Error processing file {filepath}: {e}")
            return

        # adding predefined tokens in the vocabulary
        self._add_predefined_tokens()

        # reserve the first NUM_PREDEFINED_TOKENS indices for the predefined tokens
        words = self.counter.most_common(vocab_size - self.NUM_PREDEFINED_TOKENS)
        for index, (word, _) in enumerate(words, start=self.NUM_PREDEFINED_TOKENS):
            self.word2index[word] = index
            self.index2word[index] = word

        self.size = len(self.word2index)

    def _add_predefined_tokens(self):
        """Insert the four reserved tokens at their fixed indices 0-3."""
        predefined_tokens = ['<pad>', '<sos>', '<eos>', '<unk>']
        for index, token in enumerate(predefined_tokens):
            self.word2index[token] = index
            self.index2word[index] = token

    def word_to_index(self, word: str) -> int:
        """ Map word to index from word2index dictionary in vocabulary

        Args:
            word (str): word to be mapped

        Returns:
            int: index matched to the word, or UNKNOWN_WORD_INDEX for out-of-vocabulary words
        """
        try:
            return self.word2index[word]
        except KeyError:
            return self.UNKNOWN_WORD_INDEX

    def index_to_word(self, index: int) -> str:
        """ Map index to word from index2word dictionary in vocabulary

        Args:
            index (int): index to be mapped

        Returns:
            str: word matched to the index, or the '<unk>' token for unknown indices
        """
        try:
            return self.index2word[index]
        except KeyError:
            return self.index2word[self.UNKNOWN_WORD_INDEX]

    def load_vocab(self, file_name: str):
        """ Load the word2index and index2word dictionaries from a text file at config.ROOT path

        Args:
            file_name (str): name of the text file where the vocabulary is saved (i.e 'word2index.txt')
                Note: the lines in the file are assumed to be in the form: 'word SPACE index'.
                A header line or any line whose second field is not an integer is skipped.
        """
        filepath = os.path.join(self.config.ROOT, file_name)

        self.word2index = dict()
        self.index2word = dict()

        try:
            with open(filepath, encoding='utf-8') as file:
                for line in file:
                    fields = line.strip().split(' ')
                    # skip header / malformed lines instead of aborting the whole load
                    if len(fields) < 2 or not fields[1].isdigit():
                        continue
                    word, index = fields[0], int(fields[1])
                    self.word2index[word] = index
                    self.index2word[index] = word
        except OSError as e:
            print(f"Error loading vocabulary from file {filepath}: {e}")

        # keep size consistent after (re)loading; previously it was left stale
        self.size = len(self.word2index)