File size: 3,976 Bytes
34b369f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "%pwd\n",
    "os.chdir(\"../\")\n",
    "\n",
    "%pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dataclasses import dataclass\n",
    "from pathlib import Path\n",
    "\n",
    "\n",
    "@dataclass(frozen=True)\n",
    "class DataTransformationConfig:\n",
    "    root_dir: str\n",
    "    data_path: str\n",
    "    tokenizer_name: str"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from box import ConfigBox\n",
    "from pathlib import Path\n",
    "from src.TextSummarizer.constants import file_path\n",
    "from src.TextSummarizer.utils.general import read_yaml, create_directories\n",
    "\n",
    "class ConfigurationManager:\n",
    "    \"\"\"\n",
    "    Class to manage the configuration files.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self) -> None:\n",
    "        self.config: ConfigBox = read_yaml(Path(file_path.CONFIG_FILE_PATH))\n",
    "        self.params: ConfigBox = read_yaml(Path(file_path.PARAMS_FILE_PATH))\n",
    "\n",
    "        create_directories(path_to_directories=[self.config.artifacts_root])\n",
    "\n",
    "    def get_data_transformation_config(self) -> DataTransformationConfig:\n",
    "        config = self.config.data_transformation\n",
    "\n",
    "        create_directories([config.root_dir])\n",
    "\n",
    "        data_transformation_config = DataTransformationConfig(\n",
    "            root_dir=config.root_dir,\n",
    "            data_path=config.data_path,\n",
    "            tokenizer_name = config.tokenizer_name\n",
    "        )\n",
    "\n",
    "        return data_transformation_config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "import os\n",
    "from src.TextSummarizer.logger import backend_logger\n",
    "from transformers import AutoTokenizer\n",
    "from datasets import load_dataset, load_from_disk\n",
    "\n",
    "\n",
    "class DataTransformation:\n",
    "    def __init__(self, config: DataTransformationConfig):\n",
    "        self.config = config\n",
    "        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)\n",
    "\n",
    "\n",
    "\n",
    "    def convert_examples_to_features(self,example_batch):\n",
    "        input_encodings = self.tokenizer(example_batch['dialogue'] , max_length = 800, truncation = True )\n",
    "\n",
    "        with self.tokenizer.as_target_tokenizer():\n",
    "            target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True )\n",
    "\n",
    "        return {\n",
    "            'input_ids' : input_encodings['input_ids'],\n",
    "            'attention_mask': input_encodings['attention_mask'],\n",
    "            'labels': target_encodings['input_ids']\n",
    "        }\n",
    "\n",
    "\n",
    "    def convert(self):\n",
    "        dataset = load_from_disk(self.config.data_path)\n",
    "        dataset = dataset.map(self.convert_examples_to_features, batched = True)\n",
    "        dataset.save_to_disk(os.path.join(self.config.root_dir,\"dataset\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    config = ConfigurationManager()\n",
    "    data_transformation_config = config.get_data_transformation_config()\n",
    "    data_transformation = DataTransformation(config=data_transformation_config)\n",
    "    data_transformation.convert()\n",
    "except Exception as e:\n",
    "    raise e"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}