File size: 11,795 Bytes
34b369f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/home/pavithra/projects/Text-summarization-nlp'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "%pwd\n",
    "os.chdir(\"../\")\n",
    "%pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dataclasses import dataclass\n",
    "from pathlib import Path\n",
    "\n",
    "@dataclass(frozen=True)\n",
    "class DataIngestionConfig:\n",
    "    dataset_name: str\n",
    "    arrow_dataset_dir: str\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from box import ConfigBox\n",
    "from pathlib import Path\n",
    "from src.TextSummarizer.constants import file_path\n",
    "from src.TextSummarizer.utils.general import read_yaml, create_directories\n",
    "\n",
    "# Create a config manager.\n",
    "class ConfigManager:\n",
    "    \"\"\"\n",
    "    Class to manage the configuration files.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self) -> None:\n",
    "        self.config: ConfigBox = read_yaml(Path(file_path.CONFIG_FILE_PATH))\n",
    "        self.params: ConfigBox = read_yaml(Path(file_path.PARAMS_FILE_PATH))\n",
    "\n",
    "        create_directories(path_to_directories=[self.config.artifacts_root])\n",
    "\n",
    "    def get_data_ingestion_config(self) -> DataIngestionConfig:\n",
    "        \"\"\"\n",
    "        Get the config which is needed to download the data files.\n",
    "        \"\"\"\n",
    "        config: ConfigBox = self.config.data_ingestion\n",
    "\n",
    "        data_ingestion_config: DataIngestionConfig = DataIngestionConfig(\n",
    "            dataset_name=config.dataset_name,\n",
    "            arrow_dataset_dir=config.arrow_dataset_dir,\n",
    "        )\n",
    "\n",
    "        return data_ingestion_config\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from datasets import load_dataset\n",
    "\n",
    "# test_dataset = load_dataset(\"alexfabbri/multi_news\")\n",
    "\n",
    "\n",
    "# test_dataset.save_to_disk(\"multi_news_arrow_dataset\")\n",
    "# from datasets import load_from_disk\n",
    "# arrow_datasets_reloaded = load_from_disk(\"multi_news_arrow_dataset\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "class DataIngestionComponent:\n",
    "    \"\"\"\n",
    "    A Class which is responsible for data ingestion.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, config: DataIngestionConfig) -> None:\n",
    "        self.config = config\n",
    "\n",
    "    def save_dataset(self):\n",
    "        \"\"\"\n",
    "        Load the dataset.\n",
    "        \"\"\"\n",
    "        test_dataset = load_dataset(self.config.dataset_name)\n",
    "        test_dataset.save_to_disk(self.config.arrow_dataset_dir)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "[Errno 2] No such file or directory: 'config/config.yaml'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb Cell 6\u001b[0m line \u001b[0;36m7\n\u001b[1;32m      <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m     data_ingestion\u001b[39m.\u001b[39msave_dataset()\n\u001b[1;32m      <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=5'>6</a>\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m \u001b[39mas\u001b[39;00m err:\n\u001b[0;32m----> <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=6'>7</a>\u001b[0m     \u001b[39mraise\u001b[39;00m err\n",
      "\u001b[1;32m/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb Cell 6\u001b[0m line \u001b[0;36m2\n\u001b[1;32m      <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m----> <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m     config: ConfigManager \u001b[39m=\u001b[39m ConfigManager()\n\u001b[1;32m      <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m     data_ingestion_config: DataIngestionConfig \u001b[39m=\u001b[39m config\u001b[39m.\u001b[39mget_data_ingestion_config()\n\u001b[1;32m      <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=3'>4</a>\u001b[0m     data_ingestion \u001b[39m=\u001b[39m DataIngestionComponent(config\u001b[39m=\u001b[39mdata_ingestion_config)\n",
      "\u001b[1;32m/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb Cell 6\u001b[0m line \u001b[0;36m1\n\u001b[1;32m     <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=11'>12</a>\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__init__\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m---> <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=12'>13</a>\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconfig: ConfigBox \u001b[39m=\u001b[39m read_yaml(Path(file_path\u001b[39m.\u001b[39;49mCONFIG_FILE_PATH))\n\u001b[1;32m     <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=13'>14</a>\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mparams: ConfigBox \u001b[39m=\u001b[39m read_yaml(Path(file_path\u001b[39m.\u001b[39mPARAMS_FILE_PATH))\n\u001b[1;32m     <a href='vscode-notebook-cell:/home/pavithra/projects/Text-summarization-nlp/check_code/01_data_dowanloading.ipynb#W5sZmlsZQ%3D%3D?line=15'>16</a>\u001b[0m     create_directories(path_to_directories\u001b[39m=\u001b[39m[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconfig\u001b[39m.\u001b[39martifacts_root])\n",
      "File \u001b[0;32m~/.local/lib/python3.8/site-packages/ensure/main.py:872\u001b[0m, in \u001b[0;36mWrappedFunctionReturn.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    869\u001b[0m         msg \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mArgument \u001b[39m\u001b[39m{arg}\u001b[39;00m\u001b[39m of type \u001b[39m\u001b[39m{valt}\u001b[39;00m\u001b[39m to \u001b[39m\u001b[39m{f}\u001b[39;00m\u001b[39m \u001b[39m\u001b[39m\"\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mdoes not match annotation type \u001b[39m\u001b[39m{t}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m    870\u001b[0m         \u001b[39mraise\u001b[39;00m EnsureError(msg\u001b[39m.\u001b[39mformat(arg\u001b[39m=\u001b[39marg, f\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mf, t\u001b[39m=\u001b[39mtempl, valt\u001b[39m=\u001b[39m\u001b[39mtype\u001b[39m(value)))\n\u001b[0;32m--> 872\u001b[0m return_val \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mf(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m    873\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(return_val, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mreturn_templ):\n\u001b[1;32m    874\u001b[0m     msg \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mReturn value of \u001b[39m\u001b[39m{f}\u001b[39;00m\u001b[39m of type \u001b[39m\u001b[39m{valt}\u001b[39;00m\u001b[39m \u001b[39m\u001b[39m\"\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mdoes not match annotation type \u001b[39m\u001b[39m{t}\u001b[39;00m\u001b[39m\"\u001b[39m\n",
      "File \u001b[0;32m~/projects/Text-summarization-nlp/src/TextSummarizer/utils/general.py:33\u001b[0m, in \u001b[0;36mread_yaml\u001b[0;34m(path_to_yaml)\u001b[0m\n\u001b[1;32m     31\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39myaml file: \u001b[39m\u001b[39m{\u001b[39;00mpath_to_yaml\u001b[39m}\u001b[39;00m\u001b[39m is empty.\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m     32\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m \u001b[39mas\u001b[39;00m exp:\n\u001b[0;32m---> 33\u001b[0m     \u001b[39mraise\u001b[39;00m exp\n",
      "File \u001b[0;32m~/projects/Text-summarization-nlp/src/TextSummarizer/utils/general.py:26\u001b[0m, in \u001b[0;36mread_yaml\u001b[0;34m(path_to_yaml)\u001b[0m\n\u001b[1;32m     19\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m     20\u001b[0m \u001b[39mRead yaml file and return as Dictionary.\u001b[39;00m\n\u001b[1;32m     21\u001b[0m \n\u001b[1;32m     22\u001b[0m \u001b[39m:param path_to_yaml: Path to yaml file.\u001b[39;00m\n\u001b[1;32m     23\u001b[0m \u001b[39m:return: A ConfigBox dictionary object containing the the yaml file contents.\u001b[39;00m\n\u001b[1;32m     24\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m     25\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m---> 26\u001b[0m     \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39;49m(path_to_yaml) \u001b[39mas\u001b[39;00m yaml_file:\n\u001b[1;32m     27\u001b[0m         content \u001b[39m=\u001b[39m yaml\u001b[39m.\u001b[39msafe_load(yaml_file)\n\u001b[1;32m     28\u001b[0m         backend_logger\u001b[39m.\u001b[39minfo(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39myaml file: \u001b[39m\u001b[39m{\u001b[39;00mpath_to_yaml\u001b[39m}\u001b[39;00m\u001b[39m loaded successfully\u001b[39m\u001b[39m\"\u001b[39m)\n",
      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'config/config.yaml'"
     ]
    }
   ],
   "source": [
    "try:\n",
    "    config: ConfigManager = ConfigManager()\n",
    "    data_ingestion_config: DataIngestionConfig = config.get_data_ingestion_config()\n",
    "    data_ingestion = DataIngestionComponent(config=data_ingestion_config)\n",
    "    data_ingestion.save_dataset()\n",
    "except Exception as err:\n",
    "    raise err"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}