{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# NOTE: this is a relative move, so re-running the cell climbs one more\n",
    "# directory each time. Run it once per kernel session.\n",
    "os.chdir('../')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'c:\\\\mlops projects\\\\text-summarization'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dataclasses import dataclass\n",
    "from pathlib import Path\n",
    "\n",
    "\n",
    "@dataclass(frozen=True)\n",
    "class DataTransformationConfig:\n",
    "    \"\"\"Immutable settings consumed by the data-transformation stage.\"\"\"\n",
    "\n",
    "    # Directory under which the tokenized dataset is saved.\n",
    "    root_dir: Path\n",
    "    # Location handed to `load_from_disk` to read the input dataset.\n",
    "    data_path: Path\n",
    "    # Handed unchanged to `AutoTokenizer.from_pretrained`. Presumably a model\n",
    "    # identifier rather than a filesystem path (TODO confirm against\n",
    "    # config.yaml), hence `str` and not `Path`.\n",
    "    tokenizer_name: str"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from textsummarizer.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH\n",
    "from textsummarizer.utils.common import read_yaml, create_directories\n",
    "\n",
    "\n",
    "class ConfigurationManager:\n",
    "    \"\"\"Loads the project YAML files and builds per-stage config objects.\"\"\"\n",
    "\n",
    "    def __init__(\n",
    "        self,\n",
    "        config_filepath=CONFIG_FILE_PATH,\n",
    "        params_filepath=PARAMS_FILE_PATH,\n",
    "    ):\n",
    "        \"\"\"Read both YAML files and create the artifacts root directory.\n",
    "\n",
    "        Args:\n",
    "            config_filepath: YAML file loaded into `self.config`.\n",
    "            params_filepath: YAML file loaded into `self.params`.\n",
    "        \"\"\"\n",
    "        self.config = read_yaml(config_filepath)\n",
    "        self.params = read_yaml(params_filepath)\n",
    "\n",
    "        create_directories([self.config.artifacts_root])\n",
    "\n",
    "    def get_data_transformation_config(self) -> DataTransformationConfig:\n",
    "        \"\"\"Build the settings object for the data-transformation stage.\n",
    "\n",
    "        Also creates the stage's `root_dir` via `create_directories`.\n",
    "\n",
    "        Returns:\n",
    "            A `DataTransformationConfig` filled from the\n",
    "            `data_transformation` section of the loaded config.\n",
    "        \"\"\"\n",
    "        stage = self.config.data_transformation\n",
    "\n",
    "        create_directories([stage.root_dir])\n",
    "\n",
    "        return DataTransformationConfig(\n",
    "            root_dir=stage.root_dir,\n",
    "            data_path=stage.data_path,\n",
    "            tokenizer_name=stage.tokenizer_name,\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-08-11 18:13:05,488: INFO: config: PyTorch version 2.2.2+cu121 available.]\n",
      "[2024-08-11 18:13:05,490: INFO: config: TensorFlow version 2.12.0 available.]\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "from 
textsummarizer.logging import logger\n",
    "from transformers import AutoTokenizer\n",
    "from datasets import load_dataset, load_from_disk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DataTransformation:\n",
    "    \"\"\"Tokenizes the dialogue/summary dataset and writes the result to disk.\"\"\"\n",
    "\n",
    "    # Truncation limits passed as `max_length` for the dialogue inputs and\n",
    "    # the summary targets respectively.\n",
    "    MAX_INPUT_LENGTH = 1024\n",
    "    MAX_TARGET_LENGTH = 128\n",
    "\n",
    "    def __init__(self, config: DataTransformationConfig):\n",
    "        \"\"\"Store the stage config and load the tokenizer it names.\"\"\"\n",
    "        self.config = config\n",
    "        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name)\n",
    "\n",
    "    def convert_examples_to_features(self, example_batch):\n",
    "        \"\"\"Tokenize one batch; used as the callback for `map(..., batched=True)`.\n",
    "\n",
    "        Args:\n",
    "            example_batch: mapping that provides 'dialogue' and 'summary' entries.\n",
    "\n",
    "        Returns:\n",
    "            dict with 'input_ids' and 'attention_mask' from the dialogue\n",
    "            encoding, and 'labels' holding the summary encoding's 'input_ids'.\n",
    "        \"\"\"\n",
    "        input_encodings = self.tokenizer(\n",
    "            example_batch['dialogue'],\n",
    "            max_length=self.MAX_INPUT_LENGTH,\n",
    "            truncation=True,\n",
    "        )\n",
    "\n",
    "        # `text_target=` replaces the deprecated `as_target_tokenizer()`\n",
    "        # context manager for encoding the target side.\n",
    "        target_encodings = self.tokenizer(\n",
    "            text_target=example_batch['summary'],\n",
    "            max_length=self.MAX_TARGET_LENGTH,\n",
    "            truncation=True,\n",
    "        )\n",
    "\n",
    "        return {\n",
    "            'input_ids': input_encodings['input_ids'],\n",
    "            'attention_mask': input_encodings['attention_mask'],\n",
    "            'labels': target_encodings['input_ids'],\n",
    "        }\n",
    "\n",
    "    def convert(self):\n",
    "        \"\"\"Load the dataset from `config.data_path`, tokenize it, and save it.\n",
    "\n",
    "        The result is written to `<config.root_dir>/samsum_dataset`.\n",
    "        \"\"\"\n",
    "        dataset_samsum = load_from_disk(self.config.data_path)\n",
    "        dataset_samsum_pt = dataset_samsum.map(self.convert_examples_to_features, batched=True)\n",
    "        dataset_samsum_pt.save_to_disk(os.path.join(self.config.root_dir, \"samsum_dataset\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-08-11 18:13:44,678: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
      "[2024-08-11 18:13:44,681: INFO: common: yaml file: params.yaml loaded successfully]\n",
      "[2024-08-11 18:13:44,684: INFO: common: created directory at: artifacts]\n",
      "[2024-08-11 18:13:44,686: INFO: common: created directory at: artifacts/data_transformation]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. 
It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n", " warnings.warn(\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "bdedbcfbff63497081e37ad9b20a6c31", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/14732 [00:00