{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "%pwd\n", "os.chdir(\"../\")\n", "\n", "%pwd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from dataclasses import dataclass\n", "from pathlib import Path\n", "\n", "\n", "@dataclass(frozen=True)\n", "class DataTransformationConfig:\n", " root_dir: str\n", " data_path: str\n", " tokenizer_name: str" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from box import ConfigBox\n", "from pathlib import Path\n", "from src.TextSummarizer.constants import file_path\n", "from src.TextSummarizer.utils.general import read_yaml, create_directories\n", "\n", "class ConfigurationManager:\n", " \"\"\"\n", " Class to manage the configuration files.\n", " \"\"\"\n", "\n", " def __init__(self) -> None:\n", " self.config: ConfigBox = read_yaml(Path(file_path.CONFIG_FILE_PATH))\n", " self.params: ConfigBox = read_yaml(Path(file_path.PARAMS_FILE_PATH))\n", "\n", " create_directories(path_to_directories=[self.config.artifacts_root])\n", "\n", " def get_data_transformation_config(self) -> DataTransformationConfig:\n", " config = self.config.data_transformation\n", "\n", " create_directories([config.root_dir])\n", "\n", " data_transformation_config = DataTransformationConfig(\n", " root_dir=config.root_dir,\n", " data_path=config.data_path,\n", " tokenizer_name = config.tokenizer_name\n", " )\n", "\n", " return data_transformation_config" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "\n", "import os\n", "from src.TextSummarizer.logger import backend_logger\n", "from transformers import AutoTokenizer\n", "from datasets import load_dataset, load_from_disk\n", "\n", "\n", "class DataTransformation:\n", " def __init__(self, config: DataTransformationConfig):\n", " self.config = config\n", " self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)\n", "\n", "\n", "\n", " def convert_examples_to_features(self,example_batch):\n", " input_encodings = self.tokenizer(example_batch['dialogue'] , max_length = 800, truncation = True )\n", "\n", " with self.tokenizer.as_target_tokenizer():\n", " target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True )\n", "\n", " return {\n", " 'input_ids' : input_encodings['input_ids'],\n", " 'attention_mask': input_encodings['attention_mask'],\n", " 'labels': target_encodings['input_ids']\n", " }\n", "\n", "\n", " def convert(self):\n", " dataset = load_from_disk(self.config.data_path)\n", " dataset = dataset.map(self.convert_examples_to_features, batched = True)\n", " dataset.save_to_disk(os.path.join(self.config.root_dir,\"dataset\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "try:\n", " config = ConfigurationManager()\n", " data_transformation_config = config.get_data_transformation_config()\n", " data_transformation = DataTransformation(config=data_transformation_config)\n", " data_transformation.convert()\n", "except Exception as e:\n", " raise e" ] } ], "metadata": { "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 2 }