{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/home/pavithra/projects/Text-summarization-nlp'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "# Locate the project root (the folder holding src/TextSummarizer) by walking\n",
    "# up from the current directory. Unlike a bare os.chdir(\"../\"), re-running\n",
    "# this cell is idempotent: it never climbs past the project root.\n",
    "search_start = Path.cwd()\n",
    "project_root = next(\n",
    "    (\n",
    "        candidate\n",
    "        for candidate in (search_start, *search_start.parents)\n",
    "        if (candidate / \"src\" / \"TextSummarizer\").is_dir()\n",
    "    ),\n",
    "    None,\n",
    ")\n",
    "if project_root is None:\n",
    "    raise FileNotFoundError(\n",
    "        f\"No src/TextSummarizer directory found in {search_start} or its parents.\"\n",
    "    )\n",
    "os.chdir(project_root)\n",
    "%pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dataclasses import dataclass\n",
    "from pathlib import Path\n",
    "\n",
    "\n",
    "@dataclass(frozen=True)\n",
    "class DataIngestionConfig:\n",
    "    \"\"\"\n",
    "    Immutable settings for the data ingestion step.\n",
    "    \"\"\"\n",
    "\n",
    "    # Dataset identifier handed to `load_dataset`.\n",
    "    dataset_name: str\n",
    "    # Directory the loaded dataset is written to with `save_to_disk`.\n",
    "    arrow_dataset_dir: str\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from box import ConfigBox\n",
    "from pathlib import Path\n",
    "from src.TextSummarizer.constants import file_path\n",
    "from src.TextSummarizer.utils.general import read_yaml, create_directories\n",
    "\n",
    "\n",
    "class ConfigManager:\n",
    "    \"\"\"\n",
    "    Load the project's YAML configuration and build typed config objects from it.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self) -> None:\n",
    "        \"\"\"\n",
    "        Read the config and params YAML files and create the artifacts root directory.\n",
    "        \"\"\"\n",
    "        self.config: ConfigBox = read_yaml(Path(file_path.CONFIG_FILE_PATH))\n",
    "        self.params: ConfigBox = read_yaml(Path(file_path.PARAMS_FILE_PATH))\n",
    "\n",
    "        create_directories(path_to_directories=[self.config.artifacts_root])\n",
    "\n",
    "    def get_data_ingestion_config(self) -> DataIngestionConfig:\n",
    "        \"\"\"\n",
    "        Build the data ingestion settings from the `data_ingestion` config section.\n",
    "\n",
    "        :return: A DataIngestionConfig holding the dataset name and the output directory.\n",
    "        \"\"\"\n",
    "        section: ConfigBox = self.config.data_ingestion\n",
    "\n",
    "        return DataIngestionConfig(\n",
    "            dataset_name=section.dataset_name,\n",
    "            arrow_dataset_dir=section.arrow_dataset_dir,\n",
    "        )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from datasets import load_dataset\n",
    "\n",
    "# test_dataset = load_dataset(\"alexfabbri/multi_news\")\n",
    "\n",
    "\n",
    "# 
test_dataset.save_to_disk(\"multi_news_arrow_dataset\")\n",
    "# from datasets import load_from_disk\n",
    "# arrow_datasets_reloaded = load_from_disk(\"multi_news_arrow_dataset\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "\n",
    "class DataIngestionComponent:\n",
    "    \"\"\"\n",
    "    Load a dataset by name and persist it to disk.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, config: DataIngestionConfig) -> None:\n",
    "        \"\"\"\n",
    "        :param config: Settings naming the dataset to load and the directory to write it to.\n",
    "        \"\"\"\n",
    "        self.config = config\n",
    "\n",
    "    def save_dataset(self) -> None:\n",
    "        \"\"\"\n",
    "        Load the dataset named in the config and save it to the configured directory.\n",
    "        \"\"\"\n",
    "        # No split is requested, so whatever `load_dataset` returns is saved as a whole.\n",
    "        dataset = load_dataset(self.config.dataset_name)\n",
    "        dataset.save_to_disk(self.config.arrow_dataset_dir)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Errors are left to propagate unchanged: catching an exception only to\n",
    "# re-raise it adds a frame to the traceback without handling anything.\n",
    "config_manager = ConfigManager()\n",
    "data_ingestion_config = config_manager.get_data_ingestion_config()\n",
    "\n",
    "data_ingestion = DataIngestionComponent(config=data_ingestion_config)\n",
    "data_ingestion.save_dataset()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}