Spaces:
Sleeping
Sleeping
File size: 3,976 Bytes
34b369f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Move the working directory one level up so that project-root-relative paths\n",
"# (config files, the src package) resolve correctly from this notebook.\n",
"import os\n",
"%pwd\n",
"os.chdir(\"../\")\n",
"\n",
"%pwd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from dataclasses import dataclass\n",
"from pathlib import Path\n",
"\n",
"\n",
"@dataclass(frozen=True)\n",
"class DataTransformationConfig:\n",
"    \"\"\"Immutable settings for the data-transformation pipeline stage.\"\"\"\n",
"\n",
"    # Directory where this stage writes its output artifacts.\n",
"    root_dir: str\n",
"    # Path of the input dataset (loaded below via datasets.load_from_disk).\n",
"    data_path: str\n",
"    # Hugging Face checkpoint name used to instantiate the tokenizer.\n",
"    tokenizer_name: str"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from box import ConfigBox\n",
"from pathlib import Path\n",
"from src.TextSummarizer.constants import file_path\n",
"from src.TextSummarizer.utils.general import read_yaml, create_directories\n",
"\n",
"class ConfigurationManager:\n",
"    \"\"\"\n",
"    Class to manage the configuration files.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self) -> None:\n",
"        # Load the project's YAML config and params files into dot-accessible\n",
"        # ConfigBox objects, then make sure the artifacts root directory exists.\n",
"        self.config: ConfigBox = read_yaml(Path(file_path.CONFIG_FILE_PATH))\n",
"        self.params: ConfigBox = read_yaml(Path(file_path.PARAMS_FILE_PATH))\n",
"\n",
"        create_directories(path_to_directories=[self.config.artifacts_root])\n",
"\n",
"    def get_data_transformation_config(self) -> DataTransformationConfig:\n",
"        \"\"\"Build a DataTransformationConfig from the data_transformation\n",
"        section of the loaded YAML config, creating its root directory.\n",
"        \"\"\"\n",
"        config = self.config.data_transformation\n",
"\n",
"        # Ensure the stage's output directory exists before it is used.\n",
"        create_directories([config.root_dir])\n",
"\n",
"        data_transformation_config = DataTransformationConfig(\n",
"            root_dir=config.root_dir,\n",
"            data_path=config.data_path,\n",
"            tokenizer_name = config.tokenizer_name\n",
"        )\n",
"\n",
"        return data_transformation_config"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from src.TextSummarizer.logger import backend_logger\n",
"from transformers import AutoTokenizer\n",
"from datasets import load_dataset, load_from_disk\n",
"\n",
"\n",
"class DataTransformation:\n",
"    \"\"\"Tokenize a dialogue/summary dataset into seq2seq model features.\"\"\"\n",
"\n",
"    def __init__(self, config: DataTransformationConfig):\n",
"        self.config = config\n",
"        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)\n",
"\n",
"    def convert_examples_to_features(self, example_batch):\n",
"        \"\"\"Map a batch with 'dialogue' and 'summary' columns to model inputs.\n",
"\n",
"        Returns input_ids/attention_mask for the dialogue and the tokenized\n",
"        summary ids as labels. Truncation limits (800 / 128 tokens) are the\n",
"        stage's fixed budget for source and target sequences.\n",
"        \"\"\"\n",
"        input_encodings = self.tokenizer(\n",
"            example_batch['dialogue'], max_length=800, truncation=True\n",
"        )\n",
"\n",
"        # text_target= replaces the deprecated tokenizer.as_target_tokenizer()\n",
"        # context manager, which has been removed in recent transformers releases.\n",
"        target_encodings = self.tokenizer(\n",
"            text_target=example_batch['summary'], max_length=128, truncation=True\n",
"        )\n",
"\n",
"        return {\n",
"            'input_ids': input_encodings['input_ids'],\n",
"            'attention_mask': input_encodings['attention_mask'],\n",
"            'labels': target_encodings['input_ids']\n",
"        }\n",
"\n",
"    def convert(self):\n",
"        \"\"\"Load the dataset from disk, tokenize it in batches, and save it\n",
"        under <root_dir>/dataset.\n",
"        \"\"\"\n",
"        dataset = load_from_disk(self.config.data_path)\n",
"        dataset = dataset.map(self.convert_examples_to_features, batched=True)\n",
"        dataset.save_to_disk(os.path.join(self.config.root_dir, \"dataset\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Run the data-transformation stage end to end.\n",
"try:\n",
"    config = ConfigurationManager()\n",
"    data_transformation_config = config.get_data_transformation_config()\n",
"    data_transformation = DataTransformation(config=data_transformation_config)\n",
"    data_transformation.convert()\n",
"except Exception:\n",
"    # Bare `raise` re-raises the active exception with its original traceback;\n",
"    # the previous `raise e` made the traceback appear to start at this line.\n",
"    raise"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|