File size: 1,897 Bytes
8044721
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env python3
# coding=utf-8

import torch
from data.field.mini_torchtext.field import NestedField as TorchTextNestedField


class NestedField(TorchTextNestedField):
    def pad(self, example):
        self.nesting_field.include_lengths = self.include_lengths
        if not self.include_lengths:
            return self.nesting_field.pad(example)

        sentence_length = len(example)
        example, word_lengths = self.nesting_field.pad(example)
        return example, sentence_length, word_lengths

    def numericalize(self, arr, device=None):
        numericalized = []
        self.nesting_field.include_lengths = False
        if self.include_lengths:
            arr, sentence_length, word_lengths = arr

        numericalized = self.nesting_field.numericalize(arr, device=device)

        self.nesting_field.include_lengths = True
        if self.include_lengths:
            sentence_length = torch.tensor(sentence_length, dtype=self.dtype, device=device)
            word_lengths = torch.tensor(word_lengths, dtype=self.dtype, device=device)
            return (numericalized, sentence_length, word_lengths)
        return numericalized

    def build_vocab(self, *args, **kwargs):
        sources = []
        for arg in args:
            if isinstance(arg, torch.utils.data.Dataset):
                sources += [arg.get_examples(name) for name, field in arg.fields.items() if field is self]
            else:
                sources.append(arg)

        flattened = []
        for source in sources:
            flattened.extend(source)

        # just build vocab and does not load vector
        self.nesting_field.build_vocab(*flattened, **kwargs)
        super(TorchTextNestedField, self).build_vocab()
        self.vocab.extend(self.nesting_field.vocab)
        self.vocab.freqs = self.nesting_field.vocab.freqs.copy()
        self.nesting_field.vocab = self.vocab