hakim committed on
Commit
4de2404
·
1 Parent(s): 85bba48

data validation added

Browse files
config/config.yaml CHANGED
@@ -5,4 +5,10 @@ data_ingestion:
5
  root_dir: artifacts/data_ingestion
6
  source_URL: https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip
7
  local_data_file: artifacts/data_ingestion/data.zip
8
- unzip_dir: artifacts/data_ingestion
 
 
 
 
 
 
 
5
  root_dir: artifacts/data_ingestion
6
  source_URL: https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip
7
  local_data_file: artifacts/data_ingestion/data.zip
8
+ unzip_dir: artifacts/data_ingestion
9
+
10
+
11
+ data_validation:
12
+ root_dir: artifacts/data_validation
13
+ STATUS_FILE: artifacts/data_validation/status.txt
14
+ ALL_REQUIRED_FILES: ["train", "test", "validation"]
main.py CHANGED
@@ -1,4 +1,5 @@
1
  from textsummarizer.pipeline.stage_01_data_ingestion import DataIngestionPipeline
 
2
  from textsummarizer.logging import logger
3
 
4
  STAGE_NAME = "Data Ingestion stage"
@@ -7,6 +8,17 @@ try:
7
  data_ingestion = DataIngestionPipeline()
8
  data_ingestion.main()
9
  logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
 
 
 
 
 
 
 
 
 
 
 
10
  except Exception as e:
11
  logger.exception(e)
12
  raise e
 
1
  from textsummarizer.pipeline.stage_01_data_ingestion import DataIngestionPipeline
2
+ from textsummarizer.pipeline.stage_02_data_validation import DataValidationPipeline
3
  from textsummarizer.logging import logger
4
 
5
  STAGE_NAME = "Data Ingestion stage"
 
8
  data_ingestion = DataIngestionPipeline()
9
  data_ingestion.main()
10
  logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
11
+ except Exception as e:
12
+ logger.exception(e)
13
+ raise e
14
+
15
+
16
+ STAGE_NAME = "Data Validation stage"
17
+ try:
18
+ logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
19
+ data_ingestion = DataValidationPipeline()
20
+ data_ingestion.main()
21
+ logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
22
  except Exception as e:
23
  logger.exception(e)
24
  raise e
research/data_ingestoin.ipynb CHANGED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "os.chdir('../')"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "metadata": {},
17
+ "outputs": [
18
+ {
19
+ "data": {
20
+ "text/plain": [
21
+ "'c:\\\\mlops projects\\\\text-summarization'"
22
+ ]
23
+ },
24
+ "execution_count": 2,
25
+ "metadata": {},
26
+ "output_type": "execute_result"
27
+ }
28
+ ],
29
+ "source": [
30
+ "%pwd"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 3,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "from dataclasses import dataclass\n",
40
+ "from pathlib import Path\n",
41
+ "@dataclass(frozen=True)\n",
42
+ "class DataIngestionConfig:\n",
43
+ " root_dir : Path\n",
44
+ " source_URL : str\n",
45
+ " local_data_file : Path\n",
46
+ " unzip_dir : Path"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": 4,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "from textsummarizer.constants import *\n",
56
+ "from textsummarizer.utils.common import read_yaml, create_directories\n",
57
+ "\n",
58
+ "\n",
59
+ "class ConfigurationManager:\n",
60
+ " def __init__(\n",
61
+ " self,\n",
62
+ " config_filepath = CONFIG_FILE_PATH,\n",
63
+ " params_filepath = PARAMS_FILE_PATH):\n",
64
+ "\n",
65
+ " self.config = read_yaml(config_filepath)\n",
66
+ " self.params = read_yaml(params_filepath)\n",
67
+ "\n",
68
+ " create_directories([self.config.artifacts_root])\n",
69
+ "\n",
70
+ " \n",
71
+ "\n",
72
+ " def get_data_ingestion_config(self) -> DataIngestionConfig:\n",
73
+ " config = self.config.data_ingestion\n",
74
+ "\n",
75
+ " create_directories([config.root_dir])\n",
76
+ "\n",
77
+ " data_ingestion_config = DataIngestionConfig(\n",
78
+ " root_dir=config.root_dir,\n",
79
+ " source_URL=config.source_URL,\n",
80
+ " local_data_file=config.local_data_file,\n",
81
+ " unzip_dir=config.unzip_dir \n",
82
+ " )\n",
83
+ "\n",
84
+ " return data_ingestion_config\n",
85
+ " "
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": 5,
91
+ "metadata": {},
92
+ "outputs": [],
93
+ "source": [
94
+ "import os\n",
95
+ "import urllib.request as request\n",
96
+ "import zipfile\n",
97
+ "from textsummarizer.logging import logger\n",
98
+ "from textsummarizer.utils.common import get_size"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": 8,
104
+ "metadata": {},
105
+ "outputs": [],
106
+ "source": [
107
+ "class DataIngestion:\n",
108
+ " def __init__(self, config: DataIngestionConfig):\n",
109
+ " self.config = config\n",
110
+ " \n",
111
+ " \n",
112
+ " def download_file(self):\n",
113
+ " if not os.path.exists(self.config.local_data_file):\n",
114
+ " filename, header = request.urlretrieve(\n",
115
+ " url=self.config.source_URL,\n",
116
+ " filename = self.config.local_data_file\n",
117
+ " )\n",
118
+ " logger.info(f'{filename} download! with following info: \\n{header}')\n",
119
+ " \n",
120
+ " else:\n",
121
+ " logger.info(f\"File already exist size {get_size(Path(self.config.local_data_file))}\")\n",
122
+ " \n",
123
+ " \n",
124
+ " \n",
125
+ " \n",
126
+ " \n",
127
+ " def extract_zip_file(self):\n",
128
+ " \"\"\"\n",
129
+ " zip_file_path: str\n",
130
+ " Extracts the zip file into the data directory\n",
131
+ " Function returns None\n",
132
+ " \"\"\"\n",
133
+ " unzip_path = self.config.unzip_dir\n",
134
+ " os.makedirs(unzip_path, exist_ok=True)\n",
135
+ " with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:\n",
136
+ " zip_ref.extractall(unzip_path)\n",
137
+ " \n",
138
+ " \n",
139
+ " \n",
140
+ " "
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": 10,
146
+ "metadata": {},
147
+ "outputs": [
148
+ {
149
+ "name": "stdout",
150
+ "output_type": "stream",
151
+ "text": [
152
+ "[2024-08-11 15:50:51,008: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
153
+ "[2024-08-11 15:50:51,011: INFO: common: yaml file: params.yaml loaded successfully]\n",
154
+ "[2024-08-11 15:50:51,012: INFO: common: created directory at: artifacts]\n",
155
+ "[2024-08-11 15:50:51,014: INFO: common: created directory at: artifacts/data_ingestion]\n",
156
+ "[2024-08-11 15:50:51,016: INFO: 4172299431: File already exist size ~ 7718 KB]\n"
157
+ ]
158
+ }
159
+ ],
160
+ "source": [
161
+ "try:\n",
162
+ " config = ConfigurationManager()\n",
163
+ " data_ingestion_config = config.get_data_ingestion_config()\n",
164
+ " data_ingestion = DataIngestion(config=data_ingestion_config)\n",
165
+ " data_ingestion.download_file()\n",
166
+ " data_ingestion.extract_zip_file()\n",
167
+ "except Exception as e:\n",
168
+ " raise e"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": null,
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": []
177
+ }
178
+ ],
179
+ "metadata": {
180
+ "kernelspec": {
181
+ "display_name": "Python 3",
182
+ "language": "python",
183
+ "name": "python3"
184
+ },
185
+ "language_info": {
186
+ "codemirror_mode": {
187
+ "name": "ipython",
188
+ "version": 3
189
+ },
190
+ "file_extension": ".py",
191
+ "mimetype": "text/x-python",
192
+ "name": "python",
193
+ "nbconvert_exporter": "python",
194
+ "pygments_lexer": "ipython3",
195
+ "version": "3.11.0"
196
+ }
197
+ },
198
+ "nbformat": 4,
199
+ "nbformat_minor": 2
200
+ }
research/data_validation.ipynb ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "os.chdir('../')"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "metadata": {},
17
+ "outputs": [
18
+ {
19
+ "data": {
20
+ "text/plain": [
21
+ "'c:\\\\mlops projects\\\\text-summarization'"
22
+ ]
23
+ },
24
+ "execution_count": 2,
25
+ "metadata": {},
26
+ "output_type": "execute_result"
27
+ }
28
+ ],
29
+ "source": [
30
+ "%pwd"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": null,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "from dataclasses import dataclass\n",
40
+ "from pathlib import Path\n",
41
+ "@dataclass(frozen=True)\n",
42
+ "class DataValidationConfig:\n",
43
+ " root_dir : Path\n",
44
+ " STATUS_FILE : Path\n",
45
+ " ALL_REQUIRED_FILES : list"
46
+ ]
47
+ }
48
+ ],
49
+ "metadata": {
50
+ "kernelspec": {
51
+ "display_name": "Python 3",
52
+ "language": "python",
53
+ "name": "python3"
54
+ },
55
+ "language_info": {
56
+ "codemirror_mode": {
57
+ "name": "ipython",
58
+ "version": 3
59
+ },
60
+ "file_extension": ".py",
61
+ "mimetype": "text/x-python",
62
+ "name": "python",
63
+ "nbconvert_exporter": "python",
64
+ "pygments_lexer": "ipython3",
65
+ "version": "3.11.0"
66
+ }
67
+ },
68
+ "nbformat": 4,
69
+ "nbformat_minor": 2
70
+ }
src/textsummarizer/config/configuration.py CHANGED
@@ -1,6 +1,7 @@
1
  from textsummarizer.constants import *
2
  from textsummarizer.utils.common import read_yaml, create_directories
3
- from textsummarizer.entity.config_entity import DataIngestionConfig
 
4
 
5
  class ConfigurationManager:
6
  def __init__(
@@ -28,4 +29,18 @@ class ConfigurationManager:
28
  )
29
 
30
  return data_ingestion_config
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
 
1
  from textsummarizer.constants import *
2
  from textsummarizer.utils.common import read_yaml, create_directories
3
+ from textsummarizer.entity.config_entity import (DataIngestionConfig,
4
+ DataValidationConfig)
5
 
6
  class ConfigurationManager:
7
  def __init__(
 
29
  )
30
 
31
  return data_ingestion_config
32
+
33
+
34
+ def get_data_validation_config(self) -> DataValidationConfig:
35
+ config = self.config.data_validation
36
+
37
+ create_directories([config.root_dir])
38
+
39
+ data_validation_config = DataValidationConfig(
40
+ root_dir=config.root_dir,
41
+ STATUS_FILE=config.STATUS_FILE,
42
+ ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES,
43
+ )
44
+
45
+ return data_validation_config
46
 
src/textsummarizer/conponents/data_validation.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from textsummarizer.logging import logger
3
+ from textsummarizer.entity.config_entity import DataValidationConfig
4
+
5
+ class DataValiadtion:
6
+ def __init__(self, config: DataValidationConfig):
7
+ self.config = config
8
+
9
+
10
+
11
+ def validate_all_files_exist(self)-> bool:
12
+ try:
13
+ validation_status = None
14
+
15
+ all_files = os.listdir(os.path.join("artifacts","data_ingestion","samsum_dataset"))
16
+
17
+ for file in all_files:
18
+ if file not in self.config.ALL_REQUIRED_FILES:
19
+ validation_status = False
20
+ with open(self.config.STATUS_FILE, 'w') as f:
21
+ f.write(f"Validation status: {validation_status}")
22
+ else:
23
+ validation_status = True
24
+ with open(self.config.STATUS_FILE, 'w') as f:
25
+ f.write(f"Validation status: {validation_status}")
26
+
27
+ return validation_status
28
+
29
+ except Exception as e:
30
+ raise e
src/textsummarizer/entity/config_entity.py CHANGED
@@ -9,3 +9,9 @@ class DataIngestionConfig:
9
  unzip_dir : Path
10
 
11
 
 
 
 
 
 
 
 
9
  unzip_dir : Path
10
 
11
 
12
+
13
+ @dataclass(frozen=True)
14
+ class DataValidationConfig:
15
+ root_dir : Path
16
+ STATUS_FILE : str
17
+ ALL_REQUIRED_FILES : list
src/textsummarizer/pipeline/stage_02_data_validation.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from textsummarizer.config.configuration import ConfigurationManager
2
+ from textsummarizer.entity.config_entity import DataValidationConfig
3
+ from textsummarizer.conponents.data_validation import DataValiadtion
4
+
5
+
6
+ class DataValidationPipeline:
7
+ def __init__(self):
8
+ pass
9
+
10
+ def main(self):
11
+ config = ConfigurationManager()
12
+ data_validataion_config = config.get_data_validation_config()
13
+ data_validation = DataValiadtion(config=data_validataion_config)
14
+ data_validation.validate_all_files_exist()