"""
Module which represents to the data validation step.
"""

import os

from datasets import load_from_disk
from transformers import AutoTokenizer

from src.TextSummarizer.entity import entities
from src.TextSummarizer.logger import backend_logger


class DataTransformation:
    """
    Tokenizes a dataset stored on disk so it can be fed to a
    sequence-to-sequence summarization model.
    """

    def __init__(self, config: entities.DataTransformationConfig):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

    def convert_examples_to_features(self, example_batch):
        """
        Tokenize a batch of examples into model features.
        """
        # Tokenize the source documents (encoder inputs).
        input_encodings = self.tokenizer(example_batch['document'], max_length=800, truncation=True)

        # Tokenize the reference summaries as targets (decoder labels).
        with self.tokenizer.as_target_tokenizer():
            target_encodings = self.tokenizer(example_batch['summary'], max_length=128, truncation=True)

        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids']
        }

    def convert(self):
        """
        Tokenize the dataset and store it on disk.
        """
        backend_logger.info("Converting text to tokens....")
        # Skip the step if the tokenized dataset already exists on disk.
        if os.path.exists(os.path.join(self.config.root_dir, "dataset")):
            return
        dataset = load_from_disk(self.config.data_path)
        dataset = dataset.map(self.convert_examples_to_features, batched=True)
        dataset.save_to_disk(os.path.join(self.config.root_dir, "dataset"))
        backend_logger.info("Converted text to tokens.")