File size: 1,707 Bytes
1244519
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import torch
import random
import numpy as np

# Module-wide settings dictionary. It starts empty; the only key read in this
# part of the file is "use_gpu" (see get_device below).
config = {}


def get_device():
    """Return the torch device selected by the module-level ``config``.

    Returns:
        ``torch.device("cuda:0")`` when ``config["use_gpu"]`` is truthy,
        otherwise ``torch.device("cpu")``. A missing key counts as False.
        CUDA availability is not checked here.
    """
    if config.get("use_gpu", False):
        return torch.device("cuda:0")
    return torch.device("cpu")


# String tags shared across the module.
# NOTE(review): presumably "human-written text" and "machine-generated text"
# class labels — confirm against the callers that use them.
HWT = "HWT"
MGT = "MGT"


def init_random_seeds(seed=0):
    """Seed every random number generator used by this module.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    CUDA generators) with the same value, then switches cuDNN to its
    deterministic mode with auto-tuning disabled.

    Args:
        seed: Integer seed applied to all generators. Defaults to 0, which
            matches the value that used to be hard-coded here.
    """
    print("Init random seeds")
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Explicit CUDA seeding is kept alongside torch.manual_seed so the current
    # device and all other devices are seeded regardless of torch version.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Benchmark mode lets cuDNN pick algorithms per run, which can vary
    # between runs; disable it and request deterministic kernels instead.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


class FeatureExtractor:
    """Turn text into masked hidden states, optionally passed through a net.

    Attributes are set directly from the constructor arguments:
    ``llm_model`` must expose ``tokenizer`` and ``model`` attributes, and
    ``net`` (when given) must expose a callable ``net`` attribute.
    """

    def __init__(self, model, net=None):
        """Store the language-model wrapper and the optional projection net."""
        self.net = net
        self.llm_model = model  # TODO: support different models

    def process(self, text, net_required=True):
        """Extract features for a single text.

        Args:
            text: The input text; it is tokenized as a one-element batch.
            net_required: When True and ``self.net`` is set, the masked
                hidden states are passed through ``self.net.net``.

        Returns:
            The output of ``self.net.net`` if it was applied, otherwise the
            masked last hidden state of the language model.
        """
        device = get_device()
        # Pad/truncate to a fixed length of 100 tokens.
        # NOTE(review): presumably a Hugging Face-style tokenizer — confirm.
        encoded = self.llm_model.tokenizer(
            [text],
            padding="max_length",
            truncation=True,
            max_length=100,
            return_tensors="pt",
        )
        encoded = encoded.to(device)
        model_out = self.llm_model.model(**encoded)
        # Add a trailing axis to the mask so it broadcasts over the hidden
        # dimension; multiplying suppresses the padding positions
        # (assumes the mask is 0 at padding — TODO confirm).
        mask = encoded["attention_mask"].unsqueeze(-1)
        masked_states = model_out.last_hidden_state * mask
        if not net_required or self.net is None:
            return masked_states
        return self.net.net(masked_states)

    def process_sents(self, sents, net_required=True):
        """Extract features for each sentence and join them along dim 0.

        Args:
            sents: Iterable of texts, each handled by ``process`` in order.
            net_required: Forwarded unchanged to ``process``.

        Returns:
            The concatenation of all per-sentence results along dimension 0,
            or an empty tensor when ``sents`` yields nothing.
        """
        per_sentence = [self.process(sentence, net_required) for sentence in sents]
        if len(per_sentence) == 0:
            return torch.tensor([])
        return torch.cat(per_sentence, dim=0)