File size: 7,306 Bytes

a0b398e

from everything import *
from bert import BertModel
from optimizer import AdamW
from tokenizer import BertTokenizer


# Module-level tokenizer, created once when this module is imported; the
# pad_data methods of the dataset classes below call it to encode each batch.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


class SentimentDataset(Dataset):
    '''
    Labelled sentiment examples together with the batching helpers that
    turn a list of them into model inputs.

    Each example is indexed as (sentence, label, sentence id).
    '''

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx]

    def pad_data(self, data):
        '''
        Tokenize one batch of examples with the module-level tokenizer.

        Returns (token_ids, attention_mask, labels, sents, sent_ids): the
        first three are LongTensors, the last two are plain Python lists.
        '''
        texts = [example[0] for example in data]
        targets = [example[1] for example in data]
        ids = [example[2] for example in data]

        # All sentences are encoded in a single call so the tokenizer can
        # apply its padding/truncation across the whole batch.
        encoded = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)

        return (
            torch.LongTensor(encoded['input_ids']),
            torch.LongTensor(encoded['attention_mask']),
            torch.LongTensor(targets),
            texts,
            ids,
        )

    def collate_fn(self, all_data):
        '''
        Build the batch dict for a list of examples; pass as the
        DataLoader's collate_fn.
        '''
        token_ids, attention_mask, labels, sents, sent_ids = self.pad_data(all_data)

        return dict(
            token_ids=token_ids,
            attention_mask=attention_mask,
            labels=labels,
            sents=sents,
            sent_ids=sent_ids,
        )


class SentimentTestDataset(Dataset):
    '''
    Unlabelled sentiment examples together with the batching helpers that
    turn a list of them into model inputs.

    Each example is indexed as (sentence, sentence id).
    '''

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx]

    def pad_data(self, data):
        '''
        Tokenize one batch of examples with the module-level tokenizer.

        Returns (token_ids, attention_mask, sents, sent_ids): two
        LongTensors followed by two plain Python lists.
        '''
        texts = [example[0] for example in data]
        ids = [example[1] for example in data]

        encoded = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
        input_ids = torch.LongTensor(encoded['input_ids'])
        mask = torch.LongTensor(encoded['attention_mask'])

        return input_ids, mask, texts, ids

    def collate_fn(self, all_data):
        '''
        Build the batch dict for a list of examples; pass as the
        DataLoader's collate_fn.
        '''
        token_ids, attention_mask, sents, sent_ids = self.pad_data(all_data)

        return dict(
            token_ids=token_ids,
            attention_mask=attention_mask,
            sents=sents,
            sent_ids=sent_ids,
        )


class AmazonDataset(Dataset):
    '''
    Unlabelled text examples together with the batching helpers that turn
    a list of them into model inputs.

    Each example is indexed as (sentence, sentence id).
    '''

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx]

    def pad_data(self, data):
        '''
        Tokenize one batch of examples with the module-level tokenizer.

        Returns (token_ids, attention_mask, sent_ids): two LongTensors and
        the list of sentence ids. The raw sentences are not returned.
        '''
        texts = [example[0] for example in data]
        ids = [example[1] for example in data]

        encoded = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
        input_ids = torch.LongTensor(encoded['input_ids'])
        mask = torch.LongTensor(encoded['attention_mask'])

        return input_ids, mask, ids

    def collate_fn(self, data):
        '''
        Build the batch dict for a list of examples; pass as the
        DataLoader's collate_fn.
        '''
        token_ids, attention_mask, sent_ids = self.pad_data(data)

        return dict(
            token_ids=token_ids,
            attention_mask=attention_mask,
            sent_ids=sent_ids,
        )


class SemanticDataset(Dataset):
    '''
    Sentence-pair examples with a similarity score, together with the
    batching helpers that turn a list of them into model inputs.

    Each example is indexed as (sentence1, sentence2, score, id).
    '''

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx]

    def pad_data(self, data):
        '''
        Tokenize both sentences of every pair in a single tokenizer call.

        Returns (token_ids, attention_mask, score, sent_ids). The two
        tensors hold all first sentences followed by all second sentences;
        score and sent_ids are plain Python lists.
        '''
        first = [example[0] for example in data]
        second = [example[1] for example in data]
        scores = [example[2] for example in data]
        ids = [example[3] for example in data]

        # Both halves go through the tokenizer together, so they come back
        # stacked in one tensor: rows [0, n) are sentence1, [n, 2n) sentence2.
        encoded = tokenizer(first + second, return_tensors='pt', padding=True, truncation=True)
        input_ids = torch.LongTensor(encoded['input_ids'])
        mask = torch.LongTensor(encoded['attention_mask'])

        return input_ids, mask, scores, ids

    def collate_fn(self, data):
        '''
        Build the batch dict for a list of examples, splitting the stacked
        tensors from pad_data back into their sentence1 / sentence2 halves.
        '''
        stacked_ids, stacked_mask, score, sent_ids = self.pad_data(data)
        half = len(sent_ids)

        return dict(
            token_ids_1=stacked_ids[:half],
            token_ids_2=stacked_ids[half:],
            attention_mask_1=stacked_mask[:half],
            attention_mask_2=stacked_mask[half:],
            score=score,
            sent_ids=sent_ids,
        )

        
class InferenceDataset(Dataset):
    '''
    Triplet examples together with the batching helpers that turn a list
    of them into model inputs.

    Each example is indexed as (anchor, positive, negative, id).
    '''

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx]

    def pad_data(self, data):
        '''
        Tokenize all three sentences of every triplet in a single
        tokenizer call.

        Returns (token_ids, attention_mask, sent_ids). The two tensors hold
        all anchors, then all positives, then all negatives; sent_ids is a
        plain Python list.
        '''
        anchors = [example[0] for example in data]
        positives = [example[1] for example in data]
        negatives = [example[2] for example in data]
        ids = [example[3] for example in data]

        # One call for all three groups; the result is stacked in the
        # order anchors, positives, negatives.
        encoded = tokenizer(
            anchors + positives + negatives,
            return_tensors='pt',
            padding=True,
            truncation=True,
        )
        input_ids = torch.LongTensor(encoded['input_ids'])
        mask = torch.LongTensor(encoded['attention_mask'])

        return input_ids, mask, ids

    def collate_fn(self, data):
        '''
        Build the batch dict for a list of triplets, splitting the stacked
        tensors from pad_data back into anchor / positive / negative parts.
        '''
        stacked_ids, stacked_mask, sent_ids = self.pad_data(data)
        n = len(sent_ids)

        return dict(
            anchor_ids=stacked_ids[:n],
            positive_ids=stacked_ids[n:2*n],
            negative_ids=stacked_ids[2*n:],
            anchor_masks=stacked_mask[:n],
            positive_masks=stacked_mask[n:2*n],
            negative_masks=stacked_mask[2*n:],
            sent_ids=sent_ids,
        )


def load_data(filename, flag='train'):
    '''
    Load one dataset file into a list of example tuples.

    The value of ``flag`` selects both the file format and the tuple layout:

    - 'amazon' (parquet): list of (sent, id)
    - 'nli'    (parquet): list of (anchor, positive, negative, id)
    - 'stsb'   (parquet): list of (sentence1, sentence2, score, id)
    - 'test'   (TSV):     list of (sent, id)
    - anything else, including 'train' (TSV): list of (sent, label, id)

    For the parquet formats the id is the DataFrame index. For the TSV
    formats the file must have 'sentence' and 'id' columns (plus an integer
    'sentiment' column when labelled); sentences and ids are lower-cased
    and stripped.

    Args:
        filename: path of the file to read.
        flag: dataset kind, see above.

    Returns:
        ``(data, num_labels)`` when ``flag == 'train'``, where num_labels is
        the number of distinct labels seen; otherwise just ``data``.

    Raises:
        KeyError: a required TSV column is missing.
        ValueError: a 'sentiment' value is not an integer.
    '''
    # Columns to pull from each parquet-backed dataset, in tuple order.
    parquet_columns = {
        'amazon': ('content',),
        'nli': ('anchor', 'positive', 'negative'),
        'stsb': ('sentence1', 'sentence2', 'score'),
    }
    seen_labels = set()

    if flag in parquet_columns:
        df = pd.read_parquet(filename)
        columns = [df[name] for name in parquet_columns[flag]]
        data = list(zip(*columns, df.index))
    else:
        data = []
        has_labels = flag != 'test'

        # Explicit UTF-8 so decoding does not depend on the platform locale;
        # newline='' is what the csv module asks for on files it reads.
        with open(filename, 'r', encoding='utf-8', newline='') as fp:
            for record in csv.DictReader(fp, delimiter='\t'):
                sent = record['sentence'].lower().strip()
                sent_id = record['id'].lower().strip()
                if has_labels:
                    label = int(record['sentiment'].strip())
                    seen_labels.add(label)
                    data.append((sent, label, sent_id))
                else:
                    data.append((sent, sent_id))

    print(f"load {len(data)} data from {filename}")
    # Only the training split reports the label count; every other flag
    # (including other labelled splits) returns the examples alone.
    if flag == "train":
        return data, len(seen_labels)
    return data


def seed_everything(seed=11711):
    '''
    Seed Python's, NumPy's and PyTorch's (CPU and CUDA) random number
    generators with the same value and switch cuDNN to deterministic mode.
    '''
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Disable cuDNN autotuning and request deterministic kernels.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True