hakim committed on
Commit
85bba48
·
1 Parent(s): ca2e56b

data ingestion added

Browse files
.gitignore CHANGED
@@ -160,3 +160,4 @@ cython_debug/
160
  # and can be added to the global gitignore or merged into this file. For a more nuclear
161
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
  #.idea/
 
 
160
  # and can be added to the global gitignore or merged into this file. For a more nuclear
161
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
  #.idea/
163
+ artifacts/*
config/config.yaml CHANGED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ artifacts_root: artifacts
2
+
3
+
4
+ data_ingestion:
5
+ root_dir: artifacts/data_ingestion
6
+ source_URL: https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip
7
+ local_data_file: artifacts/data_ingestion/data.zip
8
+ unzip_dir: artifacts/data_ingestion
main.py CHANGED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from textsummarizer.pipeline.stage_01_data_ingestion import DataIngestionPipeline
2
+ from textsummarizer.logging import logger
3
+
4
# Human-readable stage label used in the log banners below.
STAGE_NAME = "Data Ingestion stage"

# Run the data-ingestion pipeline, logging a banner before and after it.
try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    data_ingestion = DataIngestionPipeline()
    data_ingestion.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
except Exception as e:
    # Write the failure (with traceback) to the log, then let it propagate.
    logger.exception(e)
    # Bare `raise` re-raises the active exception without adding a frame.
    raise
params.yaml CHANGED
@@ -0,0 +1 @@
 
 
1
+ key : val
research/data_ingestoin.ipynb ADDED
File without changes
src/textsummarizer/config/configuration.py CHANGED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from textsummarizer.constants import *
2
+ from textsummarizer.utils.common import read_yaml, create_directories
3
+ from textsummarizer.entity.config_entity import DataIngestionConfig
4
+
5
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: path of the configuration YAML file.
            params_filepath: path of the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        The section's ``root_dir`` is created as a side effect.

        Returns:
            DataIngestionConfig: the values copied from the loaded config.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            source_URL=section.source_URL,
            local_data_file=section.local_data_file,
            unzip_dir=section.unzip_dir,
        )
31
+
src/textsummarizer/conponents/data_ingestion.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import urllib.request as request
3
+ import zipfile
4
+ from textsummarizer.logging import logger
5
+ from textsummarizer.utils.common import get_size
6
+ from textsummarizer.entity.config_entity import DataIngestionConfig
7
+ from pathlib import Path
8
+
9
+
10
class DataIngestion:
    """Downloads the source archive and unpacks it, as described by the config."""

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings.

        Args:
            config (DataIngestionConfig): provides ``source_URL``,
                ``local_data_file`` and ``unzip_dir``.
        """
        self.config = config

    def download_file(self):
        """Download ``source_URL`` to ``local_data_file`` unless it already exists.

        The data is first written to a temporary ``<local_data_file>.part``
        file and only moved into place once the transfer has finished, so an
        interrupted download never leaves a truncated file that a later run
        would mistake for a complete one.
        """
        target = self.config.local_data_file

        if os.path.exists(target):
            logger.info(f'File already exist of size: {get_size(Path(target))}')
            return

        # Make sure the destination folder exists before writing into it.
        parent_dir = os.path.dirname(target)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        partial_path = f"{target}.part"
        try:
            _, headers = request.urlretrieve(
                url=self.config.source_URL,
                filename=partial_path,
            )
            os.replace(partial_path, target)
        finally:
            # Only present if the download or the move failed part-way.
            if os.path.exists(partial_path):
                os.remove(partial_path)

        logger.info(f'Download! with following info: \n{headers}')

    def extract_file(self):
        """Extract every member of ``local_data_file`` into ``unzip_dir``.

        The target directory is created if it does not exist.
        """
        unzip_dir = self.config.unzip_dir
        os.makedirs(unzip_dir, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as archive:
            archive.extractall(unzip_dir)
32
+
src/textsummarizer/constants/__init__.py CHANGED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
# Default locations of the project's YAML files, expressed as relative paths.
CONFIG_FILE_PATH = Path("config") / "config.yaml"
PARAMS_FILE_PATH = Path("params.yaml")
src/textsummarizer/entity/config_entity.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+
4
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage."""

    # Directory belonging to the data-ingestion stage.
    root_dir: Path
    # Address the source archive is fetched from.
    source_URL: str
    # Where the downloaded archive is stored locally.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path
10
+
11
+
src/textsummarizer/logging/__init__.py CHANGED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import logging
4
+
5
# Layout shared by every record: timestamp, level, originating module, message.
logging_str = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"

# The log directory has to exist before the FileHandler below opens its file.
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
log_filepath = os.path.join(log_dir, "running_logs.log")

# Send INFO-and-above records both to the log file and to standard output.
logging.basicConfig(
    format=logging_str,
    level=logging.INFO,
    handlers=[
        logging.FileHandler(log_filepath),
        logging.StreamHandler(sys.stdout),
    ],
)

# Named logger imported by the rest of the package.
logger = logging.getLogger("textsummarizerLogger")
src/textsummarizer/pipeline/stage_01_data_ingestion.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from textsummarizer.config.configuration import ConfigurationManager
2
+ from textsummarizer.entity.config_entity import DataIngestionConfig
3
+ from textsummarizer.conponents.data_ingestion import DataIngestion
4
+
5
+
6
class DataIngestionPipeline:
    """Runs the data-ingestion stage: build config, download, extract."""

    def __init__(self):
        """Nothing to initialise; the stage keeps no state."""

    def main(self):
        """Download the source archive and extract it.

        The settings come from a freshly created ConfigurationManager.
        """
        ingestion_settings = ConfigurationManager().get_data_ingestion_config()
        ingestion = DataIngestion(config=ingestion_settings)

        ingestion.download_file()
        ingestion.extract_file()
16
+
17
+
src/textsummarizer/utils/common.py CHANGED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from box.exceptions import BoxValueError
3
+ import yaml
4
+ from textsummarizer.logging import logger
5
+ from ensure import ensure_annotations
6
+ from box import ConfigBox
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+
11
+
12
@ensure_annotations
def read_yaml(path_to_yaml: Path) -> ConfigBox:
    """Read a YAML file and return its content wrapped in a ConfigBox.

    Args:
        path_to_yaml (Path): location of the YAML file.

    Raises:
        ValueError: if ConfigBox rejects the parsed content with a
            BoxValueError; this is reported as an empty file.

    Returns:
        ConfigBox: the parsed content.
    """
    # Any error from opening or parsing the file propagates unchanged.
    with open(path_to_yaml, encoding="utf-8") as yaml_file:
        content = yaml.safe_load(yaml_file)
        logger.info(f"yaml file: {path_to_yaml} loaded successfully")

    try:
        return ConfigBox(content)
    except BoxValueError as err:
        # Keep the original error attached as the cause for easier debugging.
        raise ValueError("yaml file is empty") from err
35
+
36
+
37
+
38
@ensure_annotations
def create_directories(path_to_directories: list, verbose=True):
    """Create every directory in the given list.

    Directories that already exist are accepted without error.

    Args:
        path_to_directories (list): paths of the directories to create.
        verbose (bool, optional): log a message for each path processed.
            Defaults to True.
    """
    for path in path_to_directories:
        os.makedirs(path, exist_ok=True)
        if verbose:
            logger.info(f"created directory at: {path}")
50
+
51
+
52
+
53
@ensure_annotations
def get_size(path: Path) -> str:
    """Report a file's size, rounded to whole kilobytes.

    Args:
        path (Path): file whose size is measured.

    Returns:
        str: text of the form "~ <n> KB".
    """
    byte_count = os.path.getsize(path)
    kilobytes = round(byte_count / 1024)
    return f"~ {kilobytes} KB"
65
+
66
+