In [None]:
# Notebook setup cell: move the kernel's working directory one level up.
import os
# IPython magic: display the working directory before the move.
%pwd
# The path is relative, so every re-run of this cell climbs one more level.
os.chdir("../")

# IPython magic: display the working directory after the move.
%pwd

In [None]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings bundle for the data-transformation stage.

    Instances are frozen: fields cannot be reassigned after construction.
    """

    # Directory under which the transformed dataset is written.
    root_dir: str
    # Location of the dataset that is loaded from disk before tokenization.
    data_path: str
    # Identifier handed to ``AutoTokenizer.from_pretrained``.
    tokenizer_name: str

In [None]:
from box import ConfigBox
from pathlib import Path
from src.TextSummarizer.constants import file_path
from src.TextSummarizer.utils.general import read_yaml, create_directories

class ConfigurationManager:
    """
    Load the project's YAML settings and hand out per-stage config objects.
    """

    def __init__(self) -> None:
        """Read the config and params files, then prepare the artifacts root.

        Both files are parsed with ``read_yaml``; the results are annotated
        as ``ConfigBox`` and accessed by attribute below.
        """
        config_path = Path(file_path.CONFIG_FILE_PATH)
        self.config: ConfigBox = read_yaml(config_path)

        params_path = Path(file_path.PARAMS_FILE_PATH)
        self.params: ConfigBox = read_yaml(params_path)

        # Hand the top-level artifacts directory to the project helper
        # (presumably it creates the directory if missing -- see utils.general).
        create_directories(path_to_directories=[self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config object for the data-transformation stage.

        Reads the ``data_transformation`` section of the loaded config,
        passes its ``root_dir`` to ``create_directories``, and copies the
        section's values into a ``DataTransformationConfig``.

        Returns:
            The populated ``DataTransformationConfig``.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            tokenizer_name=section.tokenizer_name,
        )

In [None]:


import os
from src.TextSummarizer.logger import backend_logger
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk


class DataTransformation:
    """Tokenize a saved dialogue/summary dataset into model-ready features.

    The tokenizer named in the config is loaded once at construction and
    reused for every batch.
    """

    def __init__(
        self,
        config: "DataTransformationConfig",
        *,
        max_input_length: int = 800,
        max_target_length: int = 128,
    ):
        """Store the config and load the tokenizer.

        Args:
            config: Stage settings; ``tokenizer_name``, ``data_path`` and
                ``root_dir`` are read from it.
            max_input_length: Truncation limit (in tokens) for dialogues.
            max_target_length: Truncation limit (in tokens) for summaries.
        """
        self.config = config
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

    def convert_examples_to_features(self, example_batch):
        """Tokenize one batch of examples.

        Args:
            example_batch: Mapping with a ``'dialogue'`` entry (model inputs)
                and a ``'summary'`` entry (targets).

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` taken from
            the tokenized dialogues and ``'labels'`` taken from the
            ``'input_ids'`` of the tokenized summaries.
        """
        input_encodings = self.tokenizer(
            example_batch['dialogue'],
            max_length=self.max_input_length,
            truncation=True,
        )

        # ``text_target=`` is the supported replacement for the deprecated
        # ``as_target_tokenizer()`` context manager: it tokenizes the
        # summaries with the tokenizer's target-side settings.
        target_encodings = self.tokenizer(
            text_target=example_batch['summary'],
            max_length=self.max_target_length,
            truncation=True,
        )

        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids'],
        }

    def convert(self):
        """Load the dataset, tokenize it in batches, and save the result.

        The dataset is read from ``config.data_path`` and the transformed
        copy is written to ``<config.root_dir>/dataset``.
        """
        dataset = load_from_disk(self.config.data_path)
        dataset = dataset.map(self.convert_examples_to_features, batched=True)
        dataset.save_to_disk(os.path.join(self.config.root_dir, "dataset"))

In [None]:
# Run the data-transformation stage end to end: load the settings, build the
# stage config, then tokenize and save the dataset. Any exception propagates
# unchanged, so no try/except wrapper is needed just to re-raise it.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.convert()