import os
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from source.config import Config
class Vocab:
    """
    Offers word2index and index2word functionality after counting words in input sentences.
    Allows choosing the size of the vocabulary by taking the most common words. Explicitly reserves four indices:
    <pad>, <sos>, <eos> and <unk>.
    """

    def __init__(self, sentence_splitter=None):
        """
        Args:
            sentence_splitter: tokenizing function
        """
        self.config = Config()
        self.counter = Counter()
        self.word2index = dict()
        self.index2word = dict()
        self.size = 0
        # predefined token indices
        self.PADDING_INDEX = 0
        self.SOS = 1
        self.EOS = 2
        self.UNKNOWN_WORD_INDEX = 3

        if sentence_splitter is None:
            # matches either runs of word characters or runs wrapped in < >,
            # so special tokens such as <sos> are kept as single tokens
            word_regex = r'(?:\w+|<\w+>)'
            # tokenize the string into words
            sentence_splitter = RegexpTokenizer(word_regex).tokenize

        self.splitter = sentence_splitter
    def add_sentence(self, sentence: str):
        """
        Update word counts from sentence after tokenizing it into words
        """
        self.counter.update(self.splitter(sentence))
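        # Illustrative example, assuming the default tokenizer:
        #   self.splitter("a man <sos> rides .") -> ['a', 'man', '<sos>', 'rides']
        # (punctuation is dropped; <...> tokens survive as single items)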
    def build_vocab(self, vocab_size: int, file_name: str):
        """ Build vocabulary dictionaries word2index and index2word from a text file at config.ROOT path

        Args:
            vocab_size (int): size of vocabulary (including 4 predefined tokens: <pad>, <sos>, <eos>, <unk>)
            file_name (str): name of the text file from which the vocabulary will be built.

        Note: each line in the file is assumed to be in the form 'id,caption' and the file
              is assumed to have a header line (for example: 'captions.txt')
        """
        filepath = os.path.join(self.config.ROOT, file_name)
        try:
            with open(filepath, 'r', encoding='utf-8') as file:
                for i, line in enumerate(file):
                    # ignore header line
                    if i == 0:
                        continue
                    caption = line.strip().lower().split(",", 1)[1]  # id=0, caption=1
                    self.add_sentence(caption)
        except Exception as e:
            print(f"Error processing file {filepath}: {e}")
            return

        # adding predefined tokens in the vocabulary
        self._add_predefined_tokens()

        words = self.counter.most_common(vocab_size - 4)
        # (index + 4) because first 4 tokens are the predefined ones
        for index, (word, _) in enumerate(words, start=4):
            self.word2index[word] = index
            self.index2word[index] = word

        self.size = len(self.word2index)
    def _add_predefined_tokens(self):
        predefined_tokens = ['<pad>', '<sos>', '<eos>', '<unk>']
        for index, token in enumerate(predefined_tokens):
            self.word2index[token] = index
            self.index2word[index] = token
    def word_to_index(self, word: str) -> int:
        """ Map a word to its index using the word2index dictionary; unknown words map to <unk>

        Args:
            word (str): word to be mapped
        Returns:
            int: index matched to the word
        """
        try:
            return self.word2index[word]
        except KeyError:
            return self.UNKNOWN_WORD_INDEX
    def index_to_word(self, index: int) -> str:
        """ Map an index to its word using the index2word dictionary; unknown indices map to <unk>

        Args:
            index (int): index to be mapped
        Returns:
            str: word matched to the index
        """
        try:
            return self.index2word[index]
        except KeyError:
            return self.index2word[self.UNKNOWN_WORD_INDEX]
    def load_vocab(self, file_name: str):
        """ Load the word2index and index2word dictionaries from a text file at config.ROOT path

        Args:
            file_name (str): name of the text file where the vocabulary is saved (e.g. 'word2index.txt')

        Note: each line in the file is assumed to be in the form 'word SPACE index' and the file
              is assumed to have a header line
        """
        filepath = os.path.join(self.config.ROOT, file_name)
        self.word2index = dict()
        self.index2word = dict()
        try:
            with open(filepath, 'r', encoding='utf-8') as file:
                for i, line in enumerate(file):
                    # skip the header line (see Note above)
                    if i == 0:
                        continue
                    parts = line.strip().split(' ')
                    word, index = parts[0], parts[1]
                    self.word2index[word] = int(index)
                    self.index2word[int(index)] = word
            # keep self.size consistent with the loaded dictionaries
            self.size = len(self.word2index)
        except Exception as e:
            print(f"Error loading vocabulary from file {filepath}: {e}")