Spaces:
Sleeping
Sleeping
File size: 11,795 Bytes
34b369f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/home/pavithra/projects/Text-summarization-nlp'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"%pwd\n",
"os.chdir(\"../\")\n",
"%pwd"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from dataclasses import dataclass\n",
"from pathlib import Path\n",
"\n",
"@dataclass(frozen=True)\n",
"class DataIngestionConfig:\n",
" dataset_name: str\n",
" arrow_dataset_dir: str\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"from box import ConfigBox\n",
"from pathlib import Path\n",
"from src.TextSummarizer.constants import file_path\n",
"from src.TextSummarizer.utils.general import read_yaml, create_directories\n",
"\n",
"# Create a config manager.\n",
"class ConfigManager:\n",
" \"\"\"\n",
" Class to manage the configuration files.\n",
" \"\"\"\n",
"\n",
" def __init__(self) -> None:\n",
" self.config: ConfigBox = read_yaml(Path(file_path.CONFIG_FILE_PATH))\n",
" self.params: ConfigBox = read_yaml(Path(file_path.PARAMS_FILE_PATH))\n",
"\n",
" create_directories(path_to_directories=[self.config.artifacts_root])\n",
"\n",
" def get_data_ingestion_config(self) -> DataIngestionConfig:\n",
" \"\"\"\n",
" Build the DataIngestionConfig from the `data_ingestion` section of the loaded config.\n",
" \"\"\"\n",
" config: ConfigBox = self.config.data_ingestion\n",
"\n",
" data_ingestion_config: DataIngestionConfig = DataIngestionConfig(\n",
" dataset_name=config.dataset_name,\n",
" arrow_dataset_dir=config.arrow_dataset_dir,\n",
" )\n",
"\n",
" return data_ingestion_config\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# from datasets import load_dataset\n",
"\n",
"# test_dataset = load_dataset(\"alexfabbri/multi_news\")\n",
"\n",
"\n",
"# test_dataset.save_to_disk(\"multi_news_arrow_dataset\")\n",
"# from datasets import load_from_disk\n",
"# arrow_datasets_reloaded = load_from_disk(\"multi_news_arrow_dataset\")\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"class DataIngestionComponent:\n",
" \"\"\"\n",
" A class responsible for data ingestion.\n",
" \"\"\"\n",
"\n",
" def __init__(self, config: DataIngestionConfig) -> None:\n",
" self.config = config\n",
"\n",
" def save_dataset(self):\n",
" \"\"\"\n",
" Load the dataset named in the config and save it to the configured directory.\n",
" \"\"\"\n",
" test_dataset = load_dataset(self.config.dataset_name)\n",
" test_dataset.save_to_disk(self.config.arrow_dataset_dir)\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: 'config/config.yaml'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb Cell 6\u001b[0m line \u001b[0;36m7\n\u001b[1;32m <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m data_ingestion\u001b[39m.\u001b[39msave_dataset()\n\u001b[1;32m <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=5'>6</a>\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m \u001b[39mas\u001b[39;00m err:\n\u001b[0;32m----> <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=6'>7</a>\u001b[0m \u001b[39mraise\u001b[39;00m err\n",
"\u001b[1;32m/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb Cell 6\u001b[0m line \u001b[0;36m2\n\u001b[1;32m <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m----> <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m config: ConfigManager \u001b[39m=\u001b[39m ConfigManager()\n\u001b[1;32m <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m data_ingestion_config: DataIngestionConfig \u001b[39m=\u001b[39m config\u001b[39m.\u001b[39mget_data_ingestion_config()\n\u001b[1;32m <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=3'>4</a>\u001b[0m data_ingestion \u001b[39m=\u001b[39m DataIngestionComponent(config\u001b[39m=\u001b[39mdata_ingestion_config)\n",
"\u001b[1;32m/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb Cell 6\u001b[0m line \u001b[0;36m1\n\u001b[1;32m <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=11'>12</a>\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__init__\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m---> <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=12'>13</a>\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconfig: ConfigBox \u001b[39m=\u001b[39m read_yaml(Path(file_path\u001b[39m.\u001b[39;49mCONFIG_FILE_PATH))\n\u001b[1;32m <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=13'>14</a>\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mparams: ConfigBox \u001b[39m=\u001b[39m read_yaml(Path(file_path\u001b[39m.\u001b[39mPARAMS_FILE_PATH))\n\u001b[1;32m <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=15'>16</a>\u001b[0m create_directories(path_to_directories\u001b[39m=\u001b[39m[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconfig\u001b[39m.\u001b[39martifacts_root])\n",
"File \u001b[0;32m~/.local/lib/python3.8/site-packages/ensure/main.py:872\u001b[0m, in \u001b[0;36mWrappedFunctionReturn.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 869\u001b[0m msg \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mArgument \u001b[39m\u001b[39m{arg}\u001b[39;00m\u001b[39m of type \u001b[39m\u001b[39m{valt}\u001b[39;00m\u001b[39m to \u001b[39m\u001b[39m{f}\u001b[39;00m\u001b[39m \u001b[39m\u001b[39m\"\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mdoes not match annotation type \u001b[39m\u001b[39m{t}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m 870\u001b[0m \u001b[39mraise\u001b[39;00m EnsureError(msg\u001b[39m.\u001b[39mformat(arg\u001b[39m=\u001b[39marg, f\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mf, t\u001b[39m=\u001b[39mtempl, valt\u001b[39m=\u001b[39m\u001b[39mtype\u001b[39m(value)))\n\u001b[0;32m--> 872\u001b[0m return_val \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mf(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 873\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(return_val, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mreturn_templ):\n\u001b[1;32m 874\u001b[0m msg \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mReturn value of \u001b[39m\u001b[39m{f}\u001b[39;00m\u001b[39m of type \u001b[39m\u001b[39m{valt}\u001b[39;00m\u001b[39m \u001b[39m\u001b[39m\"\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mdoes not match annotation type \u001b[39m\u001b[39m{t}\u001b[39;00m\u001b[39m\"\u001b[39m\n",
"File \u001b[0;32m~/projects/Text-summarization-nlp/src/TextSummarizer/utils/general.py:33\u001b[0m, in \u001b[0;36mread_yaml\u001b[0;34m(path_to_yaml)\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39myaml file: \u001b[39m\u001b[39m{\u001b[39;00mpath_to_yaml\u001b[39m}\u001b[39;00m\u001b[39m is empty.\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 32\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m \u001b[39mas\u001b[39;00m exp:\n\u001b[0;32m---> 33\u001b[0m \u001b[39mraise\u001b[39;00m exp\n",
"File \u001b[0;32m~/projects/Text-summarization-nlp/src/TextSummarizer/utils/general.py:26\u001b[0m, in \u001b[0;36mread_yaml\u001b[0;34m(path_to_yaml)\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 20\u001b[0m \u001b[39mRead yaml file and return as Dictionary.\u001b[39;00m\n\u001b[1;32m 21\u001b[0m \n\u001b[1;32m 22\u001b[0m \u001b[39m:param path_to_yaml: Path to yaml file.\u001b[39;00m\n\u001b[1;32m 23\u001b[0m \u001b[39m:return: A ConfigBox dictionary object containing the the yaml file contents.\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m---> 26\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39;49m(path_to_yaml) \u001b[39mas\u001b[39;00m yaml_file:\n\u001b[1;32m 27\u001b[0m content \u001b[39m=\u001b[39m yaml\u001b[39m.\u001b[39msafe_load(yaml_file)\n\u001b[1;32m 28\u001b[0m backend_logger\u001b[39m.\u001b[39minfo(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39myaml file: \u001b[39m\u001b[39m{\u001b[39;00mpath_to_yaml\u001b[39m}\u001b[39;00m\u001b[39m loaded successfully\u001b[39m\u001b[39m\"\u001b[39m)\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'config/config.yaml'"
]
}
],
"source": [
"try:\n",
" config: ConfigManager = ConfigManager()\n",
" data_ingestion_config: DataIngestionConfig = config.get_data_ingestion_config()\n",
" data_ingestion = DataIngestionComponent(config=data_ingestion_config)\n",
" data_ingestion.save_dataset()\n",
"except Exception as err:\n",
" raise err"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|