Spaces:
Sleeping
Sleeping
hakim
committed on
Commit
·
85bba48
1
Parent(s):
ca2e56b
data ingestion added
Browse files- .gitignore +1 -0
- config/config.yaml +8 -0
- main.py +12 -0
- params.yaml +1 -0
- research/data_ingestoin.ipynb +0 -0
- src/textsummarizer/config/configuration.py +31 -0
- src/textsummarizer/conponents/data_ingestion.py +32 -0
- src/textsummarizer/constants/__init__.py +4 -0
- src/textsummarizer/entity/config_entity.py +11 -0
- src/textsummarizer/logging/__init__.py +22 -0
- src/textsummarizer/pipeline/stage_01_data_ingestion.py +17 -0
- src/textsummarizer/utils/common.py +66 -0
.gitignore
CHANGED
@@ -160,3 +160,4 @@ cython_debug/
|
|
160 |
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
161 |
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
162 |
#.idea/
|
|
|
|
160 |
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
161 |
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
162 |
#.idea/
|
163 |
+
artifacts/*
|
config/config.yaml
CHANGED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
artifacts_root: artifacts
|
2 |
+
|
3 |
+
|
4 |
+
data_ingestion:
|
5 |
+
root_dir: artifacts/data_ingestion
|
6 |
+
source_URL: https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip
|
7 |
+
local_data_file: artifacts/data_ingestion/data.zip
|
8 |
+
unzip_dir: artifacts/data_ingestion
|
main.py
CHANGED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from textsummarizer.pipeline.stage_01_data_ingestion import DataIngestionPipeline
|
2 |
+
from textsummarizer.logging import logger
|
3 |
+
|
4 |
+
# Label interpolated into the start/completion log lines for this stage.
STAGE_NAME = "Data Ingestion stage"

try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    data_ingestion = DataIngestionPipeline()
    data_ingestion.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
except Exception as e:
    # Log the failure together with its traceback, then re-raise the active
    # exception unchanged so the process still fails.
    logger.exception(e)
    raise
|
params.yaml
CHANGED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
key : val
|
research/data_ingestoin.ipynb
ADDED
File without changes
|
src/textsummarizer/config/configuration.py
CHANGED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from textsummarizer.constants import *
|
2 |
+
from textsummarizer.utils.common import read_yaml, create_directories
|
3 |
+
from textsummarizer.entity.config_entity import DataIngestionConfig
|
4 |
+
|
5 |
+
class ConfigurationManager:
    """Read the config and params YAML files and build stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: path of the config YAML; defaults to
                CONFIG_FILE_PATH.
            params_filepath: path of the params YAML; defaults to
                PARAMS_FILE_PATH.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the loaded config.

        Creates the stage's ``root_dir`` before returning.

        Returns:
            DataIngestionConfig: populated from the ``data_ingestion``
            section of the config file.
        """
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        data_ingestion_config = DataIngestionConfig(
            root_dir=config.root_dir,
            source_URL=config.source_URL,
            local_data_file=config.local_data_file,
            unzip_dir=config.unzip_dir,
        )

        return data_ingestion_config
|
31 |
+
|
src/textsummarizer/conponents/data_ingestion.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import urllib.request as request
|
3 |
+
import zipfile
|
4 |
+
from textsummarizer.logging import logger
|
5 |
+
from textsummarizer.utils.common import get_size
|
6 |
+
from textsummarizer.entity.config_entity import DataIngestionConfig
|
7 |
+
from pathlib import Path
|
8 |
+
|
9 |
+
|
10 |
+
class DataIngestion:
    """Download the source archive and unpack it, driven by a config object."""

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings used by the other methods.

        Args:
            config (DataIngestionConfig): source URL and local paths.
        """
        self.config = config

    def download_file(self):
        """Fetch ``source_URL`` into ``local_data_file`` unless it already exists.

        When the local file is already present, nothing is downloaded and only
        its size is logged.
        """
        target = self.config.local_data_file

        if os.path.exists(target):
            logger.info(f'File already exist of size: {get_size(Path(target))}')
            return

        # Make sure the destination folder exists so the download does not
        # depend on another component having created it first.
        parent_dir = os.path.dirname(target)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        _, headers = request.urlretrieve(
            url=self.config.source_URL,
            filename=target,
        )
        logger.info(f'Download! with following info: \n{headers}')

    def extract_file(self):
        """Extract every member of ``local_data_file`` into ``unzip_dir``.

        The destination directory is created when it is missing.
        """
        unzip_dir = self.config.unzip_dir
        os.makedirs(unzip_dir, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as file:
            file.extractall(unzip_dir)
|
32 |
+
|
src/textsummarizer/constants/__init__.py
CHANGED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
|
3 |
+
CONFIG_FILE_PATH = Path('config/config.yaml')
|
4 |
+
PARAMS_FILE_PATH = Path('params.yaml')
|
src/textsummarizer/entity/config_entity.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage."""

    # Working directory of the ingestion stage.
    root_dir: Path
    # Location the archive is downloaded from.
    source_URL: str
    # Local path the downloaded archive is written to.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path
|
10 |
+
|
11 |
+
|
src/textsummarizer/logging/__init__.py
CHANGED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import logging
|
4 |
+
|
5 |
+
# Record layout: timestamp, level, emitting module, then the message.
logging_str = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"
log_dir = "logs"
log_filepath = os.path.join(log_dir, "running_logs.log")
# The file handler below needs the directory to exist before it opens the file.
os.makedirs(log_dir, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format=logging_str,
    # Send every record both to the log file and to standard output.
    handlers=[
        logging.FileHandler(log_filepath),
        logging.StreamHandler(sys.stdout),
    ],
)

# Named logger imported by the rest of the package.
logger = logging.getLogger("textsummarizerLogger")
|
src/textsummarizer/pipeline/stage_01_data_ingestion.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from textsummarizer.config.configuration import ConfigurationManager
|
2 |
+
from textsummarizer.entity.config_entity import DataIngestionConfig
|
3 |
+
from textsummarizer.conponents.data_ingestion import DataIngestion
|
4 |
+
|
5 |
+
|
6 |
+
class DataIngestionPipeline:
    """Pipeline stage that downloads and extracts the dataset archive."""

    def __init__(self):
        """No state is kept; everything is built inside ``main``."""
        pass

    def main(self):
        """Run the stage: load the config, then download and extract the data."""
        config = ConfigurationManager()
        data_ingestion_config = config.get_data_ingestion_config()
        data_ingestion = DataIngestion(config=data_ingestion_config)
        data_ingestion.download_file()
        data_ingestion.extract_file()
|
16 |
+
|
17 |
+
|
src/textsummarizer/utils/common.py
CHANGED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from box.exceptions import BoxValueError
|
3 |
+
import yaml
|
4 |
+
from textsummarizer.logging import logger
|
5 |
+
from ensure import ensure_annotations
|
6 |
+
from box import ConfigBox
|
7 |
+
from pathlib import Path
|
8 |
+
from typing import Any
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
@ensure_annotations
def read_yaml(path_to_yaml: Path) -> ConfigBox:
    """Read a YAML file and return its content as a ConfigBox.

    Args:
        path_to_yaml (Path): path of the YAML file to load.

    Raises:
        ValueError: if ConfigBox rejects the parsed content with
            BoxValueError (reported as "yaml file is empty").

    Returns:
        ConfigBox: the parsed content, with keys reachable as attributes.
    """
    # Errors from opening or parsing the file propagate to the caller as-is.
    with open(path_to_yaml) as yaml_file:
        content = yaml.safe_load(yaml_file)

    try:
        config = ConfigBox(content)
    except BoxValueError as err:
        raise ValueError("yaml file is empty") from err

    # Only report success once the content is known to be usable.
    logger.info(f"yaml file: {path_to_yaml} loaded successfully")
    return config
|
35 |
+
|
36 |
+
|
37 |
+
|
38 |
+
@ensure_annotations
def create_directories(path_to_directories: list, verbose=True):
    """Create every directory in the given list.

    Directories that already exist are left untouched.

    Args:
        path_to_directories (list): paths of the directories to create.
        verbose (bool, optional): log each directory after creating it.
            Defaults to True.
    """
    for path in path_to_directories:
        os.makedirs(path, exist_ok=True)
        if verbose:
            logger.info(f"created directory at: {path}")
|
50 |
+
|
51 |
+
|
52 |
+
|
53 |
+
@ensure_annotations
def get_size(path: Path) -> str:
    """Return the size of a file as a human-readable KB string.

    Args:
        path (Path): path of the file.

    Returns:
        str: size rounded to the nearest whole KB, formatted as "~ N KB".
    """
    size_in_kb = round(os.path.getsize(path) / 1024)
    return f"~ {size_in_kb} KB"
|
65 |
+
|
66 |
+
|