{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "os.chdir('../')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'c:\\\\mlops projects\\\\text-summarization'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%pwd" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from dataclasses import dataclass\n", "from pathlib import Path\n", "\n", "\n", "@dataclass(frozen=True)\n", "class DataTransformationConfig:\n", "    \"\"\"Immutable settings for the data-transformation stage.\"\"\"\n", "\n", "    root_dir: Path   # directory where the tokenized dataset is written\n", "    data_path: Path  # location of the dataset produced by the ingestion stage\n", "    # FIX: this value is a Hugging Face tokenizer *name* read from YAML and fed to\n", "    # AutoTokenizer.from_pretrained, not a filesystem path -- annotate as str\n", "    # (annotation-only change; dataclasses do no runtime conversion either way).\n", "    tokenizer_name: str" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from textsummarizer.constants import *\n", "from textsummarizer.utils.common import read_yaml, create_directories\n", "\n", "\n", "class ConfigurationManager:\n", "    \"\"\"Reads the config/params YAML files and builds per-stage config objects.\"\"\"\n", "\n", "    def __init__(\n", "            self,\n", "            config_filepath = CONFIG_FILE_PATH,\n", "            params_filepath = PARAMS_FILE_PATH):\n", "        # CONFIG_FILE_PATH / PARAMS_FILE_PATH come from the star import above.\n", "        self.config = read_yaml(config_filepath)\n", "        self.params = read_yaml(params_filepath)\n", "\n", "        create_directories([self.config.artifacts_root])\n", "\n", "    def get_data_transformation_config(self) -> DataTransformationConfig:\n", "        \"\"\"Create the stage's root directory and return its typed config.\"\"\"\n", "        config = self.config.data_transformation\n", "\n", "        create_directories([config.root_dir])\n", "\n", "        return DataTransformationConfig(\n", "            root_dir=config.root_dir,\n", "            data_path=config.data_path,\n", "            tokenizer_name=config.tokenizer_name\n", "        )" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-08-11 18:13:05,488: INFO: config: PyTorch version 2.2.2+cu121 available.]\n", "[2024-08-11 18:13:05,490: INFO: config: TensorFlow version 2.12.0 available.]\n" ] } ], "source": [ "import os\n", "from 
textsummarizer.logging import logger\n", "from transformers import AutoTokenizer\n", "from datasets import load_dataset, load_from_disk" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "class DataTransformation:\n", "    \"\"\"Tokenizes the SAMSum-style dataset (dialogue -> summary) into model features.\"\"\"\n", "\n", "    def __init__(self, config: DataTransformationConfig):\n", "        self.config = config\n", "        # Load the tokenizer once; every mapped batch reuses this instance.\n", "        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name)\n", "\n", "    def convert_examples_to_features(self, example_batch):\n", "        \"\"\"Encode one batch: returns input_ids, attention_mask and labels.\"\"\"\n", "        # FIX: the keyword was misspelled 'max_lenght'; HF tokenizers silently ignore\n", "        # unknown kwargs, so the intended 1024-token cap never applied and truncation\n", "        # fell back to the tokenizer's default model_max_length.\n", "        input_encoding = self.tokenizer(example_batch['dialogue'], max_length=1024, truncation=True)\n", "\n", "        # NOTE(review): as_target_tokenizer() is deprecated in newer transformers\n", "        # (use the text_target= argument instead); kept for the pinned version here.\n", "        with self.tokenizer.as_target_tokenizer():\n", "            target_encodings = self.tokenizer(example_batch['summary'], max_length=128, truncation=True)\n", "\n", "        return {\n", "            'input_ids': input_encoding['input_ids'],\n", "            'attention_mask': input_encoding['attention_mask'],\n", "            'labels': target_encodings['input_ids']\n", "        }\n", "\n", "    def convert(self):\n", "        \"\"\"Load the dataset from data_path, tokenize it batch-wise and save to root_dir.\"\"\"\n", "        dataset_samsum = load_from_disk(self.config.data_path)\n", "        dataset_samsum_pt = dataset_samsum.map(self.convert_examples_to_features, batched=True)\n", "        dataset_samsum_pt.save_to_disk(os.path.join(self.config.root_dir, \"samsum_dataset\"))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-08-11 18:13:05,753: INFO: common: yaml file: config\\config.yaml loaded successfully]\n", "[2024-08-11 18:13:05,757: INFO: common: yaml file: params.yaml loaded successfully]\n", "[2024-08-11 18:13:05,758: INFO: common: created directory at: artifacts]\n", "[2024-08-11 18:13:05,760: INFO: common: created directory at: artifacts/data_transformation]\n" ] }, { "ename": "TypeError", "evalue": "DataTransformation() takes no arguments", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mTypeError\u001b[0m Traceback (most 
recent call last)", "Cell \u001b[1;32mIn[7], line 7\u001b[0m\n\u001b[0;32m 5\u001b[0m data_transformation\u001b[38;5;241m.\u001b[39mconvert()\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n", "Cell \u001b[1;32mIn[7], line 4\u001b[0m\n\u001b[0;32m 2\u001b[0m config \u001b[38;5;241m=\u001b[39m ConfigurationManager()\n\u001b[0;32m 3\u001b[0m data_transformation_config \u001b[38;5;241m=\u001b[39m config\u001b[38;5;241m.\u001b[39mget_data_transformation_config()\n\u001b[1;32m----> 4\u001b[0m data_transformation \u001b[38;5;241m=\u001b[39m \u001b[43mDataTransformation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_transformation_config\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 5\u001b[0m data_transformation\u001b[38;5;241m.\u001b[39mconvert()\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n", "\u001b[1;31mTypeError\u001b[0m: DataTransformation() takes no arguments" ] } ], "source": [ "# Driver cell: build the stage config and run the data transformation.\n", "# NOTE(review): the saved TypeError above indicates this run used a stale kernel\n", "# (an out-of-order DataTransformation definition) -- re-run with Restart & Run All.\n", "try:\n", " config = ConfigurationManager()\n", " data_transformation_config = config.get_data_transformation_config()\n", " data_transformation = DataTransformation(config=data_transformation_config)\n", " data_transformation.convert()\n", "except Exception as e:\n", " raise e" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0" } }, "nbformat": 4, "nbformat_minor": 2 }