pavithra-devi committed on
Commit 34b369f · 1 Parent(s): 022c91d

added the application

Files changed (42)
  1. .github/workflows/.gitkeep +0 -0
  2. .gitignore +166 -0
  3. Dockerfile +0 -0
  4. LICENSE +21 -0
  5. app.py +49 -0
  6. check_code/01_data_dowanloading.ipynb +179 -0
  7. check_code/02_data_validation.ipynb +150 -0
  8. check_code/03_data_transformation.ipynb +132 -0
  9. check_code/04_model_training.ipynb +184 -0
  10. check_code/05_model_evaluation.ipynb +177 -0
  11. check_code/predict.ipynb +504 -0
  12. check_code/trials.ipynb +0 -0
  13. main.py +74 -0
  14. project_structure.py +52 -0
  15. requirements.txt +21 -0
  16. setup.py +36 -0
  17. src/TextSummarizer/components/__init__.py +0 -0
  18. src/TextSummarizer/components/data_ingestion.py +25 -0
  19. src/TextSummarizer/components/data_transformation.py +45 -0
  20. src/TextSummarizer/components/data_validation.py +36 -0
  21. src/TextSummarizer/components/model_evaluation.py +87 -0
  22. src/TextSummarizer/components/train_model.py +70 -0
  23. src/TextSummarizer/config/__init__.py +0 -0
  24. src/TextSummarizer/config/config.yaml +34 -0
  25. src/TextSummarizer/config/config_manager.py +117 -0
  26. src/TextSummarizer/config/params.yaml +10 -0
  27. src/TextSummarizer/constants/__init__.py +0 -0
  28. src/TextSummarizer/constants/file_path.py +8 -0
  29. src/TextSummarizer/entity/__init__.py +0 -0
  30. src/TextSummarizer/entity/entities.py +69 -0
  31. src/TextSummarizer/exception/__init__.py +0 -0
  32. src/TextSummarizer/logger/__init__.py +43 -0
  33. src/TextSummarizer/pipeline/__init__.py +0 -0
  34. src/TextSummarizer/pipeline/prediction.py +25 -0
  35. src/TextSummarizer/pipeline/step_01_data_ingestion.py +25 -0
  36. src/TextSummarizer/pipeline/step_02_data_validation.py +25 -0
  37. src/TextSummarizer/pipeline/step_03_data_transformation.py +25 -0
  38. src/TextSummarizer/pipeline/step_04_train_model.py +25 -0
  39. src/TextSummarizer/pipeline/step_05_model_evaluation.py +25 -0
  40. src/TextSummarizer/utils/__init__.py +0 -0
  41. src/TextSummarizer/utils/general.py +56 -0
  42. src/__init__.py +0 -0
.github/workflows/.gitkeep ADDED
File without changes
.gitignore ADDED
@@ -0,0 +1,166 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # leave the dataset file folder
+ artifacts/
+ venv_text_summarizaition/
Dockerfile ADDED
File without changes
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Pavithra Devi M
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,49 @@
+ import os
+
+ import uvicorn
+ from fastapi import FastAPI
+ from fastapi.responses import Response
+ from starlette.responses import RedirectResponse
+ from textSummarizer.pipeline.prediction import PredictionPipeline
+
+ text: str = "What is Text Summarization?"
+
+ app = FastAPI()
+
+ @app.get("/", tags=["authentication"])
+ async def index():
+     """
+     The main page.
+     """
+     return "The API is UP and running."
+
+
+ @app.get("/train")
+ async def training():
+     """
+     The training page.
+     """
+     try:
+         os.system("python main.py")
+         return Response("Training successful !!")
+
+     except Exception as e:
+         return Response(f"Error Occurred! {e}")
+
+
+ @app.post("/predict")
+ async def predict_route(text):
+     """
+     The prediction api call.
+     """
+     try:
+
+         obj = PredictionPipeline()
+         text = obj.predict(text)
+         return text
+     except Exception as e:
+         raise e
+
+
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=8080)
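For reference, a minimal client-side sketch for exercising these endpoints once the server is running. This is not part of the commit; it assumes the app was started locally on port 8080 (as in app.py's uvicorn.run call) and that the `requests` package is available. Because predict_route declares a bare, unannotated `text` parameter, FastAPI treats it as a query parameter:

import requests

BASE_URL = "http://localhost:8080"  # host/port taken from uvicorn.run() in app.py

# Health check: GET / returns a plain status string.
print(requests.get(f"{BASE_URL}/").json())

# Trigger training: GET /train runs main.py on the server side.
print(requests.get(f"{BASE_URL}/train").text)

# Summarize a document: POST /predict with the text passed as a query parameter.
article = "The quick brown fox jumped over the lazy dog. " * 10  # arbitrary example text
response = requests.post(f"{BASE_URL}/predict", params={"text": article})
print(response.text)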
check_code/01_data_dowanloading.ipynb ADDED
@@ -0,0 +1,179 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'/home/pavithra/projects/Text-summarization-nlp'"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import os\n",
+ "%pwd\n",
+ "os.chdir(\"../\")\n",
+ "%pwd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from dataclasses import dataclass\n",
+ "from pathlib import Path\n",
+ "\n",
+ "@dataclass(frozen=True)\n",
+ "class DataIngestionConfig:\n",
+ "    dataset_name: str\n",
+ "    arrow_dataset_dir: str\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from box import ConfigBox\n",
+ "from pathlib import Path\n",
+ "from src.TextSummarizer.constants import file_path\n",
+ "from src.TextSummarizer.utils.general import read_yaml, create_directories\n",
+ "\n",
+ "# Create a config manager.\n",
+ "class ConfigManager:\n",
+ "    \"\"\"\n",
+ "    Class to manage the configuration files.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    def __init__(self) -> None:\n",
+ "        self.config: ConfigBox = read_yaml(Path(file_path.CONFIG_FILE_PATH))\n",
+ "        self.params: ConfigBox = read_yaml(Path(file_path.PARAMS_FILE_PATH))\n",
+ "\n",
+ "        create_directories(path_to_directories=[self.config.artifacts_root])\n",
+ "\n",
+ "    def get_data_ingestion_config(self) -> DataIngestionConfig:\n",
+ "        \"\"\"\n",
+ "        Get the config which is needed to download the data files.\n",
+ "        \"\"\"\n",
+ "        config: ConfigBox = self.config.data_ingestion\n",
+ "\n",
+ "        data_ingestion_config: DataIngestionConfig = DataIngestionConfig(\n",
+ "            dataset_name=config.dataset_name,\n",
+ "            arrow_dataset_dir=config.arrow_dataset_dir,\n",
+ "        )\n",
+ "\n",
+ "        return data_ingestion_config\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# from datasets import load_dataset\n",
+ "\n",
+ "# test_dataset = load_dataset(\"alexfabbri/multi_news\")\n",
+ "\n",
+ "\n",
+ "# test_dataset.save_to_disk(\"multi_news_arrow_dataset\")\n",
+ "# from datasets import load_from_disk\n",
+ "# arrow_datasets_reloaded = load_from_disk(\"multi_news_arrow_dataset\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datasets import load_dataset\n",
+ "\n",
+ "class DataIngestionComponent:\n",
+ "    \"\"\"\n",
+ "    A Class which is responsible for data ingestion.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    def __init__(self, config: DataIngestionConfig) -> None:\n",
+ "        self.config = config\n",
+ "\n",
+ "    def save_dataset(self):\n",
+ "        \"\"\"\n",
+ "        Load the dataset.\n",
+ "        \"\"\"\n",
+ "        test_dataset = load_dataset(self.config.dataset_name)\n",
+ "        test_dataset.save_to_disk(self.config.arrow_dataset_dir)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "FileNotFoundError",
+ "evalue": "[Errno 2] No such file or directory: 'config/config.yaml'",
+ "output_type": "error",
+ "traceback": [
+ "---------------------------------------------------------------------------",
+ "FileNotFoundError                         Traceback (most recent call last)",
+ "Cell 6, line 7: raise err",
+ "Cell 6, line 2: config: ConfigManager = ConfigManager()",
+ "Cell 6, line 13, in __init__: self.config: ConfigBox = read_yaml(Path(file_path.CONFIG_FILE_PATH))",
+ "File ~/.local/lib/python3.8/site-packages/ensure/main.py:872, in WrappedFunctionReturn.__call__: return_val = self.f(*args, **kwargs)",
+ "File ~/projects/Text-summarization-nlp/src/TextSummarizer/utils/general.py:33, in read_yaml: raise exp",
+ "File ~/projects/Text-summarization-nlp/src/TextSummarizer/utils/general.py:26, in read_yaml: with open(path_to_yaml) as yaml_file:",
+ "FileNotFoundError: [Errno 2] No such file or directory: 'config/config.yaml'"
+ ]
+ }
+ ],
+ "source": [
+ "try:\n",
+ "    config: ConfigManager = ConfigManager()\n",
+ "    data_ingestion_config: DataIngestionConfig = config.get_data_ingestion_config()\n",
+ "    data_ingestion = DataIngestionComponent(config=data_ingestion_config)\n",
+ "    data_ingestion.save_dataset()\n",
+ "except Exception as err:\n",
+ "    raise err"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
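The commented-out cell above already hints at how the downstream stages consume the ingested data. A minimal standalone sketch of the same ingestion step (the save path here is illustrative, not read from config.yaml):

from datasets import load_dataset, load_from_disk

# Download multi_news and persist it in arrow format,
# the same two calls that DataIngestionComponent.save_dataset() wraps.
dataset = load_dataset("alexfabbri/multi_news")
dataset.save_to_disk("artifacts/data")  # illustrative path

# Later pipeline steps reload it from disk instead of re-downloading.
reloaded = load_from_disk("artifacts/data")
print(reloaded)  # DatasetDict with train/validation/test splits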
check_code/02_data_validation.ipynb ADDED
@@ -0,0 +1,150 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'/home/pavithra/projects/Text-summarization-nlp'"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import os\n",
+ "%pwd\n",
+ "os.chdir(\"../\")\n",
+ "\n",
+ "%pwd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from dataclasses import dataclass\n",
+ "from pathlib import Path\n",
+ "\n",
+ "@dataclass(frozen=True)\n",
+ "class DataValidationConfig:\n",
+ "    root_dir: str\n",
+ "    status_file: str\n",
+ "    all_required_folders: list\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from src.TextSummarizer.constants import file_path\n",
+ "from src.TextSummarizer.utils.general import read_yaml, create_directories\n",
+ "\n",
+ "class ConfigurationManager:\n",
+ "    def __init__(\n",
+ "        self,\n",
+ "        config_filepath = file_path.CONFIG_FILE_PATH,\n",
+ "        params_filepath = file_path.PARAMS_FILE_PATH):\n",
+ "\n",
+ "        self.config = read_yaml(config_filepath)\n",
+ "        self.params = read_yaml(params_filepath)\n",
+ "\n",
+ "        create_directories([self.config.artifacts_root])\n",
+ "\n",
+ "\n",
+ "\n",
+ "    def get_data_validation_config(self) -> DataValidationConfig:\n",
+ "        config = self.config.data_validation\n",
+ "\n",
+ "        create_directories([config.root_dir])\n",
+ "\n",
+ "        data_validation_config = DataValidationConfig(\n",
+ "            root_dir=config.root_dir,\n",
+ "            status_file=config.status_file,\n",
+ "            all_required_folders=config.all_required_folders,\n",
+ "        )\n",
+ "\n",
+ "        return data_validation_config"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class DataValidation:\n",
+ "    def __init__(self, config: DataValidationConfig):\n",
+ "        self.config = config\n",
+ "\n",
+ "\n",
+ "\n",
+ "    def validate_all_files_exist(self) -> bool:\n",
+ "        try:\n",
+ "            validation_status: bool | None = None\n",
+ "\n",
+ "            all_folder = os.listdir(os.path.join(\"artifacts\",\"data\"))\n",
+ "\n",
+ "            for folder in all_folder:\n",
+ "                if folder not in self.config.all_required_folders:\n",
+ "                    validation_status = False\n",
+ "                    with open(self.config.status_file, \"w\") as f:\n",
+ "                        f.write(f\"Validation status: {validation_status}\")\n",
+ "                else:\n",
+ "                    validation_status = True\n",
+ "                    with open(self.config.status_file, \"w\") as f:\n",
+ "                        f.write(f\"Validation status: {validation_status}\")\n",
+ "\n",
+ "            return validation_status\n",
+ "\n",
+ "        except Exception as exp:\n",
+ "            raise exp"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "try:\n",
+ "    config = ConfigurationManager()\n",
+ "    data_validation_config = config.get_data_validation_config()\n",
+ "    data_validation = DataValidation(config=data_validation_config)\n",
+ "    data_validation.validate_all_files_exist()\n",
+ "except Exception as e:\n",
+ "    raise e"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
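Note that the loop above rewrites the status file on every iteration, so the final status reflects only the last folder seen. A tighter sketch of the same check using a set difference; the folder names and paths below are assumptions that mirror the notebook's artifacts layout, not values taken from config.yaml:

import os

def validate_required_folders(data_dir, required, status_file):
    # Validation passes only if every required folder is actually present.
    present = set(os.listdir(data_dir))
    missing = set(required) - present
    status = len(missing) == 0
    with open(status_file, "w") as f:
        f.write(f"Validation status: {status}")
    return status

# Example call (hypothetical paths):
# validate_required_folders("artifacts/data", ["train", "validation", "test"], "artifacts/status.txt")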
check_code/03_data_transformation.ipynb ADDED
@@ -0,0 +1,132 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "%pwd\n",
+ "os.chdir(\"../\")\n",
+ "\n",
+ "%pwd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from dataclasses import dataclass\n",
+ "from pathlib import Path\n",
+ "\n",
+ "\n",
+ "@dataclass(frozen=True)\n",
+ "class DataTransformationConfig:\n",
+ "    root_dir: str\n",
+ "    data_path: str\n",
+ "    tokenizer_name: str"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from box import ConfigBox\n",
+ "from pathlib import Path\n",
+ "from src.TextSummarizer.constants import file_path\n",
+ "from src.TextSummarizer.utils.general import read_yaml, create_directories\n",
+ "\n",
+ "class ConfigurationManager:\n",
+ "    \"\"\"\n",
+ "    Class to manage the configuration files.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    def __init__(self) -> None:\n",
+ "        self.config: ConfigBox = read_yaml(Path(file_path.CONFIG_FILE_PATH))\n",
+ "        self.params: ConfigBox = read_yaml(Path(file_path.PARAMS_FILE_PATH))\n",
+ "\n",
+ "        create_directories(path_to_directories=[self.config.artifacts_root])\n",
+ "\n",
+ "    def get_data_transformation_config(self) -> DataTransformationConfig:\n",
+ "        config = self.config.data_transformation\n",
+ "\n",
+ "        create_directories([config.root_dir])\n",
+ "\n",
+ "        data_transformation_config = DataTransformationConfig(\n",
+ "            root_dir=config.root_dir,\n",
+ "            data_path=config.data_path,\n",
+ "            tokenizer_name = config.tokenizer_name\n",
+ "        )\n",
+ "\n",
+ "        return data_transformation_config"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "\n",
+ "import os\n",
+ "from src.TextSummarizer.logger import backend_logger\n",
+ "from transformers import AutoTokenizer\n",
+ "from datasets import load_dataset, load_from_disk\n",
+ "\n",
+ "\n",
+ "class DataTransformation:\n",
+ "    def __init__(self, config: DataTransformationConfig):\n",
+ "        self.config = config\n",
+ "        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)\n",
+ "\n",
+ "\n",
+ "\n",
+ "    def convert_examples_to_features(self, example_batch):\n",
+ "        input_encodings = self.tokenizer(example_batch['dialogue'], max_length = 800, truncation = True)\n",
+ "\n",
+ "        with self.tokenizer.as_target_tokenizer():\n",
+ "            target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True)\n",
+ "\n",
+ "        return {\n",
+ "            'input_ids' : input_encodings['input_ids'],\n",
+ "            'attention_mask': input_encodings['attention_mask'],\n",
+ "            'labels': target_encodings['input_ids']\n",
+ "        }\n",
+ "\n",
+ "\n",
+ "    def convert(self):\n",
+ "        dataset = load_from_disk(self.config.data_path)\n",
+ "        dataset = dataset.map(self.convert_examples_to_features, batched = True)\n",
+ "        dataset.save_to_disk(os.path.join(self.config.root_dir, \"dataset\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "try:\n",
+ "    config = ConfigurationManager()\n",
+ "    data_transformation_config = config.get_data_transformation_config()\n",
+ "    data_transformation = DataTransformation(config=data_transformation_config)\n",
+ "    data_transformation.convert()\n",
+ "except Exception as e:\n",
+ "    raise e"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
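For intuition, here is what convert_examples_to_features produces for a single source/target pair. This sketch is not from the commit; the checkpoint name and example strings are placeholders, since the real tokenizer name comes from config.yaml:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")  # illustrative checkpoint

source_text = "Officials announced the new transit policy on Monday after months of debate."
target_text = "New transit policy announced."

# Encode the source document and, in target mode, the reference summary.
input_encodings = tokenizer(source_text, max_length=800, truncation=True)
with tokenizer.as_target_tokenizer():
    target_encodings = tokenizer(target_text, max_length=128, truncation=True)

features = {
    "input_ids": input_encodings["input_ids"],          # token ids of the source document
    "attention_mask": input_encodings["attention_mask"],
    "labels": target_encodings["input_ids"],             # token ids the model is trained to emit
}
print({key: len(value) for key, value in features.items()})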
check_code/04_model_training.ipynb ADDED
@@ -0,0 +1,184 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "%pwd\n",
+ "os.chdir(\"../\")\n",
+ "\n",
+ "%pwd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from dataclasses import dataclass\n",
+ "from pathlib import Path\n",
+ "\n",
+ "\n",
+ "@dataclass(frozen=True)\n",
+ "class ModelTrainerConfig:\n",
+ "    root_dir: str\n",
+ "    data_path: str\n",
+ "    model_ckpt: str\n",
+ "    num_train_epochs: int\n",
+ "    warmup_steps: int\n",
+ "    per_device_train_batch_size: int\n",
+ "    weight_decay: float\n",
+ "    logging_steps: int\n",
+ "    evaluation_strategy: str\n",
+ "    eval_steps: int\n",
+ "    save_steps: float\n",
+ "    gradient_accumulation_steps: int"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from box import ConfigBox\n",
+ "from pathlib import Path\n",
+ "from src.TextSummarizer.constants import file_path\n",
+ "from src.TextSummarizer.utils.general import read_yaml, create_directories\n",
+ "\n",
+ "\n",
+ "class ConfigurationManager:\n",
+ "\n",
+ "    def __init__(self) -> None:\n",
+ "        self.config: ConfigBox = read_yaml(Path(file_path.CONFIG_FILE_PATH))\n",
+ "        self.params: ConfigBox = read_yaml(Path(file_path.PARAMS_FILE_PATH))\n",
+ "\n",
+ "        create_directories(path_to_directories=[self.config.artifacts_root])\n",
+ "\n",
+ "    def get_model_trainer_config(self) -> ModelTrainerConfig:\n",
+ "        config = self.config.model_trainer\n",
+ "        params = self.params.TrainingArguments\n",
+ "\n",
+ "        create_directories([config.root_dir])\n",
+ "\n",
+ "        model_trainer_config = ModelTrainerConfig(\n",
+ "            root_dir=config.root_dir,\n",
+ "            data_path=config.data_path,\n",
+ "            model_ckpt = config.model_ckpt,\n",
+ "            num_train_epochs = params.num_train_epochs,\n",
+ "            warmup_steps = params.warmup_steps,\n",
+ "            per_device_train_batch_size = params.per_device_train_batch_size,\n",
+ "            weight_decay = params.weight_decay,\n",
+ "            logging_steps = params.logging_steps,\n",
+ "            evaluation_strategy = params.evaluation_strategy,\n",
+ "            eval_steps = params.eval_steps,\n",
+ "            save_steps = params.save_steps,\n",
+ "            gradient_accumulation_steps = params.gradient_accumulation_steps\n",
+ "        )\n",
+ "\n",
+ "        return model_trainer_config"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import TrainingArguments, Trainer\n",
+ "from transformers import DataCollatorForSeq2Seq\n",
+ "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
+ "from datasets import load_dataset, load_from_disk\n",
+ "import torch\n",
+ "\n",
+ "\n",
+ "class ModelTrainer:\n",
+ "    def __init__(self, config: ModelTrainerConfig):\n",
+ "        self.config = config\n",
+ "\n",
+ "\n",
+ "\n",
+ "    def train(self):\n",
+ "        device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+ "        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)\n",
+ "        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)\n",
+ "        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)\n",
+ "\n",
+ "        # loading data\n",
+ "        dataset = load_from_disk(self.config.data_path)\n",
+ "\n",
+ "        # trainer_args = TrainingArguments(\n",
+ "        #     output_dir=self.config.root_dir, num_train_epochs=self.config.num_train_epochs, warmup_steps=self.config.warmup_steps,\n",
+ "        #     per_device_train_batch_size=self.config.per_device_train_batch_size, per_device_eval_batch_size=self.config.per_device_train_batch_size,\n",
+ "        #     weight_decay=self.config.weight_decay, logging_steps=self.config.logging_steps,\n",
+ "        #     evaluation_strategy=self.config.evaluation_strategy, eval_steps=self.config.eval_steps, save_steps=1e6,\n",
+ "        #     gradient_accumulation_steps=self.config.gradient_accumulation_steps\n",
+ "        # )\n",
+ "\n",
+ "\n",
+ "        trainer_args = TrainingArguments(\n",
+ "            output_dir=self.config.root_dir,\n",
+ "            num_train_epochs=1,\n",
+ "            warmup_steps=500,\n",
+ "            per_device_train_batch_size=1,\n",
+ "            per_device_eval_batch_size=1,\n",
+ "            weight_decay=0.01,\n",
+ "            logging_steps=10,\n",
+ "            evaluation_strategy='steps',\n",
+ "            eval_steps=500,\n",
+ "            save_steps=1e6,\n",
+ "            gradient_accumulation_steps=16\n",
+ "        )\n",
+ "\n",
+ "        trainer = Trainer(\n",
+ "            model=model_pegasus,\n",
+ "            args=trainer_args,\n",
+ "            tokenizer=tokenizer,\n",
+ "            data_collator=seq2seq_data_collator,\n",
+ "            train_dataset=dataset[\"train\"],\n",
+ "            eval_dataset=dataset[\"validation\"])\n",
+ "\n",
+ "        # trainer.train()\n",
+ "\n",
+ "        ## Save model\n",
+ "        model_pegasus.save_pretrained(\"multi-news-model\")\n",
+ "\n",
+ "        ## Save tokenizer\n",
+ "        tokenizer.save_pretrained(\"tokenizer\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "try:\n",
+ "    config = ConfigurationManager()\n",
+ "    model_trainer_config = config.get_model_trainer_config()\n",
+ "    model_trainer_config = ModelTrainer(config=model_trainer_config)\n",
+ "    model_trainer_config.train()\n",
+ "except Exception as e:\n",
+ "    raise e"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
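The directories written at the end of train() ("multi-news-model" and "tokenizer") are what predict.ipynb later loads by name from the Hugging Face Hub. A minimal sketch of that upload step, not part of this commit: the repo id mirrors the one used in predict.ipynb and a prior `huggingface-cli login` (or notebook_login()) is assumed.

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Reload the artifacts saved by ModelTrainer.train().
model = AutoModelForSeq2SeqLM.from_pretrained("multi-news-model")
tokenizer = AutoTokenizer.from_pretrained("tokenizer")

# Push both to the Hub so they can be loaded by name elsewhere.
model.push_to_hub("pavithra-devi/pegasus-multi-news")
tokenizer.push_to_hub("pavithra-devi/pegasus-multi-news")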
check_code/05_model_evaluation.ipynb ADDED
@@ -0,0 +1,177 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "%pwd\n",
+ "os.chdir(\"../\")\n",
+ "\n",
+ "%pwd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from dataclasses import dataclass\n",
+ "from pathlib import Path\n",
+ "\n",
+ "\n",
+ "@dataclass(frozen=True)\n",
+ "class ModelEvaluationConfig:\n",
+ "    root_dir: str\n",
+ "    data_path: str\n",
+ "    model_path: str\n",
+ "    tokenizer_path: str\n",
+ "    metric_file_name: str"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from box import ConfigBox\n",
+ "from pathlib import Path\n",
+ "from src.TextSummarizer.constants import file_path\n",
+ "from src.TextSummarizer.utils.general import read_yaml, create_directories\n",
+ "\n",
+ "\n",
+ "class ConfigurationManager:\n",
+ "\n",
+ "    def __init__(self) -> None:\n",
+ "        self.config: ConfigBox = read_yaml(Path(file_path.CONFIG_FILE_PATH))\n",
+ "        self.params: ConfigBox = read_yaml(Path(file_path.PARAMS_FILE_PATH))\n",
+ "\n",
+ "        create_directories(path_to_directories=[self.config.artifacts_root])\n",
+ "\n",
+ "    def get_model_evaluation_config(self) -> ModelEvaluationConfig:\n",
+ "        config = self.config.model_evaluation\n",
+ "\n",
+ "        create_directories([config.root_dir])\n",
+ "\n",
+ "        model_evaluation_config = ModelEvaluationConfig(\n",
+ "            root_dir=config.root_dir,\n",
+ "            data_path=config.data_path,\n",
+ "            model_path = config.model_path,\n",
+ "            tokenizer_path = config.tokenizer_path,\n",
+ "            metric_file_name = config.metric_file_name\n",
+ "\n",
+ "        )\n",
+ "\n",
+ "        return model_evaluation_config"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
+ "from datasets import load_dataset, load_from_disk, load_metric\n",
+ "import torch\n",
+ "import pandas as pd\n",
+ "from tqdm import tqdm\n",
+ "\n",
+ "class ModelEvaluation:\n",
+ "    def __init__(self, config: ModelEvaluationConfig):\n",
+ "        self.config = config\n",
+ "\n",
+ "\n",
+ "\n",
+ "    def generate_batch_sized_chunks(self, list_of_elements, batch_size):\n",
+ "        \"\"\"Split the dataset into smaller batches that we can process simultaneously.\n",
+ "        Yield successive batch-sized chunks from list_of_elements.\"\"\"\n",
+ "        for i in range(0, len(list_of_elements), batch_size):\n",
+ "            yield list_of_elements[i : i + batch_size]\n",
+ "\n",
+ "\n",
+ "    def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer,\n",
+ "                                    batch_size=16, device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
+ "                                    column_text=\"article\",\n",
+ "                                    column_summary=\"highlights\"):\n",
+ "        article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))\n",
+ "        target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))\n",
+ "\n",
+ "        for article_batch, target_batch in tqdm(\n",
+ "                zip(article_batches, target_batches), total=len(article_batches)):\n",
+ "\n",
+ "            inputs = tokenizer(article_batch, max_length=1024, truncation=True,\n",
+ "                               padding=\"max_length\", return_tensors=\"pt\")\n",
+ "\n",
+ "            summaries = model.generate(input_ids=inputs[\"input_ids\"].to(device),\n",
+ "                                       attention_mask=inputs[\"attention_mask\"].to(device),\n",
+ "                                       length_penalty=0.8, num_beams=8, max_length=128)\n",
+ "            # The length_penalty parameter ensures that the model does not generate sequences that are too long.\n",
+ "\n",
+ "            # Finally, we decode the generated texts,\n",
+ "            # replace the <n> token, and add the decoded texts with the references to the metric.\n",
+ "            decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,\n",
+ "                                                  clean_up_tokenization_spaces=True)\n",
+ "                                 for s in summaries]\n",
+ "\n",
+ "            decoded_summaries = [d.replace(\"<n>\", \" \") for d in decoded_summaries]\n",
+ "\n",
+ "\n",
+ "            metric.add_batch(predictions=decoded_summaries, references=target_batch)\n",
+ "\n",
+ "        # Finally compute and return the ROUGE scores.\n",
+ "        score = metric.compute()\n",
+ "        return score\n",
+ "\n",
+ "\n",
+ "    def evaluate(self):\n",
+ "        device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+ "        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)\n",
+ "        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)\n",
+ "\n",
+ "        # loading data\n",
+ "        dataset_samsum_pt = load_from_disk(self.config.data_path)\n",
+ "\n",
+ "\n",
+ "        rouge_names = [\"rouge1\", \"rouge2\", \"rougeL\", \"rougeLsum\"]\n",
+ "\n",
+ "        rouge_metric = load_metric('rouge')\n",
+ "\n",
+ "        score = self.calculate_metric_on_test_ds(\n",
+ "            dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, batch_size = 2, column_text = 'dialogue', column_summary= 'summary'\n",
+ "        )\n",
+ "\n",
+ "        rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)\n",
+ "\n",
+ "        df = pd.DataFrame(rouge_dict, index = ['pegasus'])\n",
+ "        df.to_csv(self.config.metric_file_name, index=False)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "try:\n",
+ "    config = ConfigurationManager()\n",
+ "    model_evaluation_config = config.get_model_evaluation_config()\n",
+ "    model_evaluation_config = ModelEvaluation(config=model_evaluation_config)\n",
+ "    model_evaluation_config.evaluate()\n",
+ "except Exception as e:\n",
+ "    raise e"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
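Note that datasets.load_metric has since been deprecated in favor of the standalone evaluate package. An equivalent sketch of the ROUGE computation with that package; the evaluate dependency is an assumption and is not in requirements.txt, and its rouge metric returns plain f-measure floats rather than objects with a .mid.fmeasure attribute:

import evaluate

rouge = evaluate.load("rouge")

# Toy predictions/references just to show the call shape.
predictions = ["the cat sat on the mat"]
references = ["the cat was sitting on the mat"]

# Returns a dict keyed by rouge1 / rouge2 / rougeL / rougeLsum with float scores.
scores = rouge.compute(predictions=predictions, references=references)
print(scores)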
check_code/predict.ipynb ADDED
@@ -0,0 +1,504 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "6fe4263eb5e743e7968cf6f1b140a744",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# pavithra-devi/google_pegasus_multi_news_model\n",
+ "\n",
+ "\n",
+ "from huggingface_hub import notebook_login\n",
+ "\n",
+ "notebook_login()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "df05cc7954324819a2ba5645e881ea49",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "tokenizer_config.json: 0%| | 0.00/20.1k [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "9ca03f3fecbe48f994a272a75a06e779",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "spiece.model: 0%| | 0.00/1.91M [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "ba543a91b4b9494e8009f60e2cd80fc2",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "tokenizer.json: 0%| | 0.00/6.60M [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7911d674ade7471e95a8c9041d69adb1",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "special_tokens_map.json: 0%| | 0.00/1.77k [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "6d1b306a35f24cd8a76415bb399a74c5",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "config.json: 0%| | 0.00/1.32k [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "04440e9bef144c4197f8e11b00a21b61",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "model.safetensors: 0%| | 0.00/2.28G [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "2e5f5ed6ff8644cd92f3690374361820",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "generation_config.json: 0%| | 0.00/275 [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from transformers import AutoModelForSeq2SeqLM\n",
+ "model_name = \"pavithra-devi/pegasus-multi-news\"\n",
+ "\n",
+ "# Load model directly\n",
+ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"pavithra-devi/pegasus-multi-news\")\n",
+ "model = AutoModelForSeq2SeqLM.from_pretrained(\"pavithra-devi/pegasus-multi-news\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import pipeline\n",
+ "\n",
+ "\n",
+ "summarizer = pipeline(\"summarization\", model=\"pavithra-devi/pegasus-multi-news\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'summary_text': '– A recent Supreme Court decision gives states the ability to opt out of the law\\'s expansion of Medicaid, the federal insurance program for poor, disabled and elderly Americans, confers \"incredible power\" on the states and their governors, says political scientist Thad Kousser, co-author of The Power of American Governors. \"No matter who wins the presidency, national politics is going to be stalemated on the Affordable Care Act,\" says Kousser, of the University of California-Berkeley. Just look at what happened when the Obama administration in 2010 offered federal stimulus money to states to begin building a high-speed rail network. Three'}]"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "text = \"\"\" While the occupant of the governor's office is historically far less important than the party that controls the state legislature, top state officials in coming years are expected to wield significant influence in at least one major area.\n",
+ "\n",
+ " And that's health care, says political scientist Thad Kousser, co-author of The Power of American Governors.\n",
+ "\n",
+ " \"No matter who wins the presidency, national politics is going to be stalemated on the Affordable Care Act,\" says Kousser, of the University of California-Berkeley.\n",
+ "\n",
+ " A recent U.S. Supreme Court decision giving states the ability to opt out of the law's expansion of Medicaid, the federal insurance program for poor, disabled and elderly Americans, confers \"incredible power\" on the states and their governors, Kousser says.\n",
+ "\n",
+ " Just look at what happened when the Obama administration in 2010 offered federal stimulus money to states to begin building a high-speed rail network. Three Republican governors, including Rick Scott of Florida and Scott Walker of Wisconsin, rejected a share of the money citing debt and deficit concerns.\n",
+ "\n",
+ " \"A [Mitt] Romney victory would dramatically empower Republican governors,\" Kousser says. \"\"\"\n",
+ "\n",
+ "summarizer(text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "[{'summary_text': '– A recent Supreme Court decision gives states the ability to opt out of the law\\'s expansion of Medicaid,\n",
+ " the federal insurance program for poor, disabled and elderly Americans, confers \"incredible power\" on the states and their governors,\n",
+ " says political scientist Thad Kousser, co-author of The Power of American Governors. \"No matter who wins the presidency, national politics\n",
+ " is going to be stalemated on the Affordable Care Act,\" says Kousser, of the University of California-Berkeley. Just look at what happened when the\n",
+ " Obama administration in 2010 offered federal stimulus money to states to begin building a high-speed rail network. Three'}]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['T_destination',\n",
+ " '__annotations__',\n",
+ " '__call__',\n",
+ " '__class__',\n",
+ " '__delattr__',\n",
+ " '__dict__',\n",
+ " '__dir__',\n",
+ " '__doc__',\n",
+ " '__eq__',\n",
+ " '__format__',\n",
+ " '__ge__',\n",
+ " '__getattr__',\n",
+ " '__getattribute__',\n",
+ " '__getstate__',\n",
+ " '__gt__',\n",
+ " '__hash__',\n",
+ " '__init__',\n",
+ " '__init_subclass__',\n",
+ " '__le__',\n",
+ " '__lt__',\n",
+ " '__module__',\n",
+ " '__ne__',\n",
+ " '__new__',\n",
+ " '__reduce__',\n",
+ " '__reduce_ex__',\n",
+ " '__repr__',\n",
+ " '__setattr__',\n",
+ " '__setstate__',\n",
+ " '__sizeof__',\n",
+ " '__str__',\n",
+ " '__subclasshook__',\n",
+ " '__weakref__',\n",
+ " '_apply',\n",
+ " '_assisted_decoding',\n",
+ " '_auto_class',\n",
+ " '_autoset_attn_implementation',\n",
+ " '_backward_compatibility_gradient_checkpointing',\n",
+ " '_backward_hooks',\n",
+ " '_backward_pre_hooks',\n",
+ " '_beam_sample',\n",
+ " '_beam_search',\n",
+ " '_buffers',\n",
+ " '_call_impl',\n",
+ " '_check_and_enable_flash_attn_2',\n",
+ " '_check_and_enable_sdpa',\n",
+ " '_compiled_call_impl',\n",
+ " '_constrained_beam_search',\n",
+ " '_contrastive_search',\n",
+ " '_convert_head_mask_to_5d',\n",
+ " '_copy_lm_head_original_to_resized',\n",
+ " '_create_repo',\n",
+ " '_dispatch_accelerate_model',\n",
+ " '_expand_inputs_for_generation',\n",
+ " '_extract_past_from_model_output',\n",
+ " '_forward_hooks',\n",
+ " '_forward_hooks_always_called',\n",
+ " '_forward_hooks_with_kwargs',\n",
+ " '_forward_pre_hooks',\n",
+ " '_forward_pre_hooks_with_kwargs',\n",
+ " '_from_config',\n",
+ " '_get_backward_hooks',\n",
+ " '_get_backward_pre_hooks',\n",
+ " '_get_cache',\n",
+ " '_get_candidate_generator',\n",
+ " '_get_decoder_start_token_id',\n",
+ " '_get_files_timestamps',\n",
+ " '_get_initial_cache_position',\n",
+ " '_get_logits_processor',\n",
+ " '_get_logits_warper',\n",
+ " '_get_name',\n",
+ " '_get_no_split_modules',\n",
+ " '_get_resized_embeddings',\n",
+ " '_get_resized_lm_head',\n",
+ " '_get_stopping_criteria',\n",
+ " '_greedy_search',\n",
+ " '_group_beam_search',\n",
+ " '_has_unfinished_sequences',\n",
+ " '_hf_peft_config_loaded',\n",
+ " '_hook_rss_memory_post_forward',\n",
+ " '_hook_rss_memory_pre_forward',\n",
+ " '_init_weights',\n",
+ " '_initialize_weights',\n",
+ " '_is_full_backward_hook',\n",
+ " '_is_hf_initialized',\n",
+ " '_is_quantized_training_enabled',\n",
+ " '_keep_in_fp32_modules',\n",
+ " '_keep_in_fp32_modules',\n",
+ " '_keys_to_ignore_on_load_missing',\n",
+ " '_keys_to_ignore_on_load_unexpected',\n",
+ " '_keys_to_ignore_on_save',\n",
+ " '_load_from_state_dict',\n",
+ " '_load_pretrained_model',\n",
+ " '_load_pretrained_model_low_mem',\n",
+ " '_load_state_dict_post_hooks',\n",
+ " '_load_state_dict_pre_hooks',\n",
+ " '_maybe_initialize_input_ids_for_generation',\n",
+ " '_maybe_warn_non_full_backward_hook',\n",
+ " '_merge_criteria_processor_list',\n",
+ " '_modules',\n",
+ " '_named_members',\n",
+ " '_no_split_modules',\n",
+ " '_non_persistent_buffers_set',\n",
+ " '_parameters',\n",
+ " '_prepare_attention_mask_for_generation',\n",
+ " '_prepare_decoder_input_ids_for_generation',\n",
+ " '_prepare_encoder_decoder_kwargs_for_generation',\n",
+ " '_prepare_generated_length',\n",
+ " '_prepare_generation_config',\n",
+ " '_prepare_model_inputs',\n",
+ " '_prepare_special_tokens',\n",
+ " '_register_load_state_dict_pre_hook',\n",
+ " '_register_state_dict_hook',\n",
+ " '_reorder_cache',\n",
+ " '_replicate_for_data_parallel',\n",
+ " '_resize_final_logits_bias',\n",
+ " '_resize_token_embeddings',\n",
+ " '_sample',\n",
+ " '_save_to_state_dict',\n",
+ " '_set_default_torch_dtype',\n",
+ " '_set_gradient_checkpointing',\n",
+ " '_skip_keys_device_placement',\n",
+ " '_slow_forward',\n",
+ " '_state_dict_hooks',\n",
+ " '_state_dict_pre_hooks',\n",
+ " '_supports_cache_class',\n",
+ " '_supports_flash_attn_2',\n",
+ " '_supports_quantized_cache',\n",
+ " '_supports_sdpa',\n",
+ " '_supports_static_cache',\n",
+ " '_temporary_reorder_cache',\n",
+ " '_tie_encoder_decoder_weights',\n",
+ " '_tie_or_clone_weights',\n",
+ " '_tied_weights_keys',\n",
+ " '_update_model_kwargs_for_generation',\n",
+ " '_upload_modified_files',\n",
+ " '_validate_assistant',\n",
+ " '_validate_generated_length',\n",
+ " '_validate_model_class',\n",
+ " '_validate_model_kwargs',\n",
+ " '_version',\n",
+ " '_wrapped_call_impl',\n",
+ " 'active_adapter',\n",
+ " 'active_adapters',\n",
+ " 'add_adapter',\n",
+ " 'add_memory_hooks',\n",
+ " 'add_model_tags',\n",
+ " 'add_module',\n",
+ " 'apply',\n",
+ " 'base_model',\n",
+ " 'base_model_prefix',\n",
+ " 'bfloat16',\n",
+ " 'buffers',\n",
+ " 'call_super_init',\n",
+ " 'can_generate',\n",
+ " 'children',\n",
+ " 'compile',\n",
+ " 'compute_transition_scores',\n",
+ " 'config',\n",
+ " 'config_class',\n",
+ " 'cpu',\n",
+ " 'create_extended_attention_mask_for_decoder',\n",
+ " 'cuda',\n",
+ " 'dequantize',\n",
+ " 'device',\n",
+ " 'disable_adapters',\n",
+ " 'disable_input_require_grads',\n",
+ " 'double',\n",
+ " 'dtype',\n",
+ " 'dummy_inputs',\n",
+ " 'dump_patches',\n",
+ " 'enable_adapters',\n",
+ " 'enable_input_require_grads',\n",
+ " 'estimate_tokens',\n",
+ " 'eval',\n",
+ " 'extra_repr',\n",
+ " 'final_logits_bias',\n",
+ " 'float',\n",
+ " 'floating_point_ops',\n",
+ " 'forward',\n",
+ " 'framework',\n",
+ " 'from_pretrained',\n",
+ " 'generate',\n",
+ " 'generation_config',\n",
+ " 'get_adapter_state_dict',\n",
+ " 'get_buffer',\n",
+ " 'get_decoder',\n",
+ " 'get_encoder',\n",
+ " 'get_extended_attention_mask',\n",
+ " 'get_extra_state',\n",
+ " 'get_head_mask',\n",
+ " 'get_input_embeddings',\n",
+ " 'get_memory_footprint',\n",
+ " 'get_output_embeddings',\n",
+ " 'get_parameter',\n",
+ " 'get_position_embeddings',\n",
+ " 'get_submodule',\n",
+ " 'gradient_checkpointing_disable',\n",
+ " 'gradient_checkpointing_enable',\n",
+ " 'half',\n",
+ " 'init_weights',\n",
+ " 'invert_attention_mask',\n",
+ " 'ipu',\n",
+ " 'is_gradient_checkpointing',\n",
+ " 'is_parallelizable',\n",
+ " 'lm_head',\n",
+ " 'load_adapter',\n",
+ " 'load_state_dict',\n",
+ " 'main_input_name',\n",
+ " 'model',\n",
+ " 'model_tags',\n",
+ " 'modules',\n",
+ " 'name_or_path',\n",
+ " 'named_buffers',\n",
+ " 'named_children',\n",
+ " 'named_modules',\n",
+ " 'named_parameters',\n",
+ " 'num_parameters',\n",
+ " 'parameters',\n",
+ " 'post_init',\n",
+ " 'prepare_decoder_input_ids_from_labels',\n",
+ " 'prepare_inputs_for_generation',\n",
+ " 'prune_heads',\n",
+ " 'push_to_hub',\n",
+ " 'register_backward_hook',\n",
+ " 'register_buffer',\n",
+ " 'register_for_auto_class',\n",
+ " 'register_forward_hook',\n",
+ " 'register_forward_pre_hook',\n",
+ " 'register_full_backward_hook',\n",
+ " 'register_full_backward_pre_hook',\n",
+ " 'register_load_state_dict_post_hook',\n",
+ " 'register_module',\n",
+ " 'register_parameter',\n",
+ " 'register_state_dict_pre_hook',\n",
+ " 'requires_grad_',\n",
+ " 'reset_memory_hooks_state',\n",
+ " 'resize_position_embeddings',\n",
+ " 'resize_token_embeddings',\n",
+ " 'retrieve_modules_from_names',\n",
+ " 'reverse_bettertransformer',\n",
+ " 'save_pretrained',\n",
+ " 'set_adapter',\n",
+ " 'set_extra_state',\n",
+ " 'set_input_embeddings',\n",
+ " 'set_output_embeddings',\n",
+ " 'share_memory',\n",
+ " 'state_dict',\n",
+ " 'supports_gradient_checkpointing',\n",
+ " 'tie_weights',\n",
+ " 'to',\n",
+ " 'to_bettertransformer',\n",
+ " 'to_empty',\n",
+ " 'train',\n",
+ " 'training',\n",
+ " 'type',\n",
+ " 'warn_if_padding_and_no_attention_mask',\n",
+ " 'warnings_issued',\n",
+ " 'xpu',\n",
+ " 'zero_grad']"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
479
+ "tokenized_data ="
480
+ ]
481
+ }
482
+ ],
483
+ "metadata": {
484
+ "kernelspec": {
485
+ "display_name": "venv_text_summarizaition",
486
+ "language": "python",
487
+ "name": "python3"
488
+ },
489
+ "language_info": {
490
+ "codemirror_mode": {
491
+ "name": "ipython",
492
+ "version": 3
493
+ },
494
+ "file_extension": ".py",
495
+ "mimetype": "text/x-python",
496
+ "name": "python",
497
+ "nbconvert_exporter": "python",
498
+ "pygments_lexer": "ipython3",
499
+ "version": "3.8.10"
500
+ }
501
+ },
502
+ "nbformat": 4,
503
+ "nbformat_minor": 2
504
+ }
check_code/trials.ipynb ADDED
File without changes
main.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The entry point which does all the work: it calls every pipeline stage to run the training workflow.
3
+ """
4
+
5
+ from src.TextSummarizer.logger import backend_logger
6
+ from src.TextSummarizer.pipeline.step_01_data_ingestion import DataIngestionPipeline
7
+ from src.TextSummarizer.pipeline.step_02_data_validation import DataValidationPipeline
8
+ from src.TextSummarizer.pipeline.step_03_data_transformation import (
9
+ DataTransformationPipeline,
10
+ )
11
+ from src.TextSummarizer.pipeline.step_04_train_model import ModelTrainerPipeline
12
+ from src.TextSummarizer.pipeline.step_05_model_evaluation import ModelEvaluationPipeline
13
+
14
+ stage_name_01: str = "Stage 1: Data Integration Stage"
15
+ stage_name_02: str = "Stage 2: Data Validation Stage"
16
+ stage_name_03: str = "Stage 3: Data Transformation Stage"
17
+ stage_name_04: str = "Stage 4: Model training Stage"
18
+ stage_name_05: str = "Stage 5: Model Evaluation Stage"
19
+
20
+
21
+ line_msg: str = "="*100
22
+
23
+ try:
24
+ backend_logger.info(line_msg)
25
+ backend_logger.info(f"Stage {stage_name_01} started")
26
+ DataIngestionPipeline().run()
27
+ backend_logger.info(f"Stage {stage_name_01} completed.")
28
+ backend_logger.info(line_msg)
29
+ except Exception as err:
30
+ backend_logger.error(f"Data ingestion pipeline failed. Reason: {err}")
31
+
32
+
33
+ try:
34
+ backend_logger.info(line_msg)
35
+ backend_logger.info(f"Stage {stage_name_02} started")
36
+ DataValidationPipeline().run()
37
+ backend_logger.info(f"Stage {stage_name_02} completed.")
38
+ backend_logger.info(line_msg)
39
+ except Exception as err:
40
+ backend_logger.error(f"Data validation pipeline failed. Reason: {err}")
41
+
42
+
43
+ try:
44
+ backend_logger.info(line_msg)
45
+ backend_logger.info(f"Stage {stage_name_03} started")
46
+ DataTransformationPipeline().run()
47
+ backend_logger.info(f"Stage {stage_name_03} completed.")
48
+ backend_logger.info(line_msg)
49
+ except Exception as err:
50
+ backend_logger.error(f"Data Transformation pipeline failed. Reason: {err}")
51
+
52
+
53
+ # Due to device limitations, I trained the model online and stored it in my Hugging Face profile.
54
+ # We can skip the training and model evaluation steps while running locally.
55
+
56
+
57
+ try:
58
+ backend_logger.info(line_msg)
59
+ backend_logger.info(f"Stage {stage_name_04} started")
60
+ ModelTrainerPipeline().run()
61
+ backend_logger.info(f"Stage {stage_name_04} completed.")
62
+ backend_logger.info(line_msg)
63
+ except Exception as err:
64
+ backend_logger.error(f"Data data training pipeline failed. Reason: {err}")
65
+
66
+
67
+ try:
68
+ backend_logger.info(line_msg)
69
+ backend_logger.info(f"Stage {stage_name_05} started")
70
+ ModelEvaluationPipeline().run()
71
+ backend_logger.info(f"Stage {stage_name_05} completed.")
72
+ backend_logger.info(line_msg)
73
+ except Exception as err:
74
+ backend_logger.error(f"Model evaluation pipeline failed. Reason: {err}")
project_structure.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Used to create the whole project structure.
3
+ """
4
+
5
+ import logging
6
+ import os
7
+ from pathlib import Path
8
+
9
+ logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s]: %(message)s:')
10
+
11
+ project_name: str = "TextSummarizer"
12
+
13
+ list_of_files = [
14
+ ".github/workflows/.gitkeep",
15
+ f"src/{project_name}/__init__.py",
16
+ f"src/{project_name}/components/__init__.py",
17
+ f"src/{project_name}/utils/__init__.py",
18
+ f"src/{project_name}/utils/general.py",
19
+ f"src/{project_name}/logger/__init__.py",
20
+ f"src/{project_name}/config/__init__.py",
21
+ f"src/{project_name}/config/configuration.py",
22
+ f"src/{project_name}/pipeline/__init__.py",
23
+ f"src/{project_name}/entity/__init__.py",
24
+ f"src/{project_name}/constants/__init__.py",
25
+ f"src/{project_name}/exception/__init__.py",
26
+ "config/config.yaml",
27
+ "params/params.yaml",
28
+ "params.yaml",
29
+ "app.py",
30
+ "main.py",
31
+ "Dockerfile",
32
+ "requirements.txt",
33
+ "setup.py",
34
+ "check_code/trials.ipynb",
35
+
36
+ ]
37
+
38
+
39
+ for filepath in list_of_files:
40
+ filepath = Path(filepath)
41
+ filedir, filename = os.path.split(filepath)
42
+
43
+ if filedir != "":
44
+ os.makedirs(filedir, exist_ok=True)
45
+ logging.info("Creating directory:{filedir} for the file {filename}")
46
+
47
+ if (not os.path.exists(filepath)) or (os.path.getsize(filepath) == 0):
48
+ with open(filepath,'w') as f:
49
+ logging.info("Creating empty file: {filepath}")
50
+
51
+ else:
52
+ logging.info("{filename} is already exists")
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ transformers[sentencepiece]
3
+ datasets
4
+ sacrebleu
5
+ rouge_score
6
+ py7zr
7
+ pandas
8
+ nltk
9
+ tqdm
10
+ PyYAML
11
+ matplotlib
12
+ torch
13
+ notebook
14
+ boto3
15
+ mypy-boto3-s3
16
+ python-box==6.0.2
17
+ ensure==1.0.2
18
+ fastapi==0.78.0
19
+ uvicorn==0.18.3
20
+ Jinja2==3.1.2
21
+ -e .
setup.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Make the project installable via pip. The project metadata is defined here.
3
+ """
4
+
5
+ import setuptools
6
+ from setuptools import setup
7
+
8
+ # Read the readme file, which will be our long description.
9
+ with open("README.md", "r", encoding="utf-8") as file:
10
+ long_description = file.read()
11
+
12
+ version: str = "1.0.0"
13
+ repo_name: str = "Text-summarization-nlp"
14
+ git_hub_username: str = "pavi-ninjaac"
15
+ author_name: str = "Pavithra Devi M"
16
+ author_email = "[email protected]"
17
+ package_name: str = "TextSummarizer"
18
+
19
+
20
+ # setup the project.
21
+ setup(
22
+ name=package_name,
23
+ version=version,
24
+ author=author_name,
25
+ author_email=author_email,
26
+ description="A small package for text summarization",
27
+ long_description=long_description,
28
+ long_description_content="text/markdown",
29
+ url=f"https://github.com/{git_hub_username}/{repo_name}",
30
+ project_urls={
31
+ "Bug Tracker": f"https://github.com/{git_hub_username}/{repo_name}/issues",
32
+ },
33
+ package_dir={"": "src"},
34
+ packages=setuptools.find_packages(where="src"),
35
+ python_requires=">=3.8"
36
+ )
src/TextSummarizer/components/__init__.py ADDED
File without changes
src/TextSummarizer/components/data_ingestion.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from datasets import load_dataset
4
+
5
+ from src.TextSummarizer.entity import entities
6
+
7
+
8
+ class DataIngestionComponent:
9
+ """
10
+ A Class which is responsible for data ingestion.
11
+ """
12
+
13
+ def __init__(self, config: entities.DataIngestionConfig) -> None:
14
+ self.config = config
15
+
16
+ def save_dataset(self):
17
+ """
18
+ Download the dataset and save it to disk.
19
+ """
20
+ # If the dataset has already been downloaded, skip the download.
21
+ if os.path.exists(self.config.arrow_dataset_dir):
22
+ return
23
+
24
+ dataset = load_dataset(self.config.dataset_name)
25
+ dataset.save_to_disk(self.config.arrow_dataset_dir)
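For reference, the `datasets` round trip this component relies on looks roughly like this (a minimal sketch; the dataset name matches config.yaml and the local path is illustrative):

```python
# Sketch of the save/load round trip used by DataIngestionComponent.
# "alexfabbri/multi_news" comes from config.yaml; the local path is illustrative.
from datasets import load_dataset, load_from_disk

dataset = load_dataset("alexfabbri/multi_news")   # DatasetDict with train/validation/test splits
dataset.save_to_disk("artifacts/data")            # writes Arrow files plus dataset_dict.json
reloaded = load_from_disk("artifacts/data")       # later stages read it back from disk
print(reloaded)
```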
src/TextSummarizer/components/data_transformation.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Module which corresponds to the data transformation step.
3
+ """
4
+
5
+ import os
6
+
7
+ from datasets import load_from_disk
8
+ from transformers import AutoTokenizer
9
+
10
+ from src.TextSummarizer.entity import entities
11
+ from src.TextSummarizer.logger import backend_logger
12
+
13
+
14
+ class DataTransformation:
15
+ def __init__(self, config: entities.DataTransformationConfig):
16
+ self.config = config
17
+ self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)
18
+
19
+ def convert_examples_to_features(self,example_batch):
20
+ """
21
+ Convert the examples to features.
22
+ """
23
+ input_encodings = self.tokenizer(example_batch['document'] , max_length = 800, truncation = True )
24
+
25
+ with self.tokenizer.as_target_tokenizer():
26
+ target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True )
27
+
28
+ return {
29
+ 'input_ids' : input_encodings['input_ids'],
30
+ 'attention_mask': input_encodings['attention_mask'],
31
+ 'labels': target_encodings['input_ids']
32
+ }
33
+
34
+ def convert(self):
35
+ """
36
+ Tokenize the dataset and store it on disk.
37
+ """
38
+ backend_logger.info("Converting text to tokens....")
39
+ # Check if the dataset folder already exists.
40
+ if os.path.exists(os.path.join(self.config.root_dir,"dataset")):
41
+ return
42
+ dataset = load_from_disk(self.config.data_path)
43
+ dataset = dataset.map(self.convert_examples_to_features, batched = True)
44
+ dataset.save_to_disk(os.path.join(self.config.root_dir,"dataset"))
45
+ backend_logger.info("Converted text to tokens.")
src/TextSummarizer/components/data_validation.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from src.TextSummarizer.entity import entities
4
+ from src.TextSummarizer.logger import backend_logger
5
+
6
+
7
+ class DataValidation:
8
+ def __init__(self, config: entities.DataValidationConfig):
9
+ self.config = config
10
+
11
+ def validate_all_files_exist(self) -> bool:
12
+ """
13
+ Check if all the required folders are present.
14
+ """
15
+ try:
16
+ validation_status: bool | None = None
17
+
18
+ all_folder = os.listdir(os.path.join("artifacts","data"))
19
+
20
+ for folder in all_folder:
21
+ backend_logger.debug(f"Checking folder: {folder}")
22
+ if folder not in self.config.all_required_folders:
23
+ validation_status = False
24
+ with open(self.config.status_file, "w") as f:
25
+ backend_logger.info("Writing the data validation status as False")
26
+ f.write(f"Validation status: {validation_status}")
27
+ else:
28
+ validation_status = True
29
+ with open(self.config.status_file, "w") as f:
30
+ backend_logger.info("Writing the data validation status as True")
31
+ f.write(f"Validation status: {validation_status}")
32
+
33
+ return validation_status
34
+
35
+ except Exception as exp:
36
+ raise exp
src/TextSummarizer/components/model_evaluation.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import pandas as pd
4
+ import torch
5
+ from datasets import load_dataset, load_from_disk, load_metric
6
+ from tqdm import tqdm
7
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
8
+
9
+ from src.TextSummarizer.entity import entities
10
+ from src.TextSummarizer.logger import backend_logger
11
+
12
+
13
+ class ModelEvaluation:
14
+ def __init__(self, config: entities.ModelEvaluationConfig):
15
+ self.config = config
16
+
17
+ def generate_batch_sized_chunks(self,list_of_elements, batch_size):
18
+ """split the dataset into smaller batches that we can process simultaneously
19
+ Yield successive batch-sized chunks from list_of_elements."""
20
+ for i in range(0, len(list_of_elements), batch_size):
21
+ yield list_of_elements[i : i + batch_size]
22
+
23
+ def calculate_metric_on_test_ds(self,
24
+ dataset,
25
+ metric,
26
+ model,
27
+ tokenizer,
28
+ batch_size=16,
29
+ device="cuda" if torch.cuda.is_available() else "cpu",
30
+ column_text="article",
31
+ column_summary="highlights"):
32
+ """
33
+ Calculate the metrics.
34
+ """
35
+ article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))
36
+ target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))
37
+
38
+ for article_batch, target_batch in tqdm(
39
+ zip(article_batches, target_batches), total=len(article_batches)):
40
+
41
+ inputs = tokenizer(article_batch, max_length=1024, truncation=True,
42
+ padding="max_length", return_tensors="pt")
43
+
44
+ summaries = model.generate(input_ids=inputs["input_ids"].to(device),
45
+ attention_mask=inputs["attention_mask"].to(device),
46
+ length_penalty=0.8, num_beams=8, max_length=128)
47
+ # The length_penalty parameter ensures that the model does not generate sequences that are too long.
48
+
49
+ # Finally, we decode the generated texts,
50
+ # replace the <n> token, and add the decoded texts with the references to the metric.
51
+ decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
52
+ clean_up_tokenization_spaces=True)
53
+ for s in summaries]
54
+
55
+ decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
56
+
57
+
58
+ metric.add_batch(predictions=decoded_summaries, references=target_batch)
59
+
60
+ # Finally compute and return the ROUGE scores.
61
+ score = metric.compute()
62
+ return score
63
+
64
+ def run(self):
65
+ """
66
+ Run the model evaluation step.
67
+ """
68
+ device = "cuda" if torch.cuda.is_available() else "cpu"
69
+ tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
70
+ model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)
71
+
72
+ #loading data
73
+ dataset_samsum_pt = load_from_disk(self.config.data_path)
74
+
75
+
76
+ rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
77
+
78
+ rouge_metric = load_metric('rouge')
79
+
80
+ score = self.calculate_metric_on_test_ds(
81
+ dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, batch_size = 2, column_text = 'dialogue', column_summary= 'summary'
82
+ )
83
+
84
+ rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )
85
+
86
+ df = pd.DataFrame(rouge_dict, index = ['pegasus'] )
87
+ df.to_csv(self.config.metric_file_name, index=False)
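As a quick sanity check of the metric plumbing, the same load_metric('rouge') call can be exercised on a toy prediction/reference pair (a sketch; the strings are invented, and note that on newer `datasets` releases the metric has moved to the separate `evaluate` library):

```python
# Sketch: computing ROUGE on one invented prediction/reference pair,
# mirroring the metric calls in ModelEvaluation. Requires rouge_score (in requirements.txt).
from datasets import load_metric

rouge_metric = load_metric("rouge")
rouge_metric.add_batch(
    predictions=["the council approved the budget"],
    references=["the city council approved the new budget"],
)
score = rouge_metric.compute()

rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
print(rouge_dict)  # f-measure per ROUGE variant; exact values depend on the strings
```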
src/TextSummarizer/components/train_model.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from datasets import load_from_disk
3
+ from transformers import (
4
+ AutoModelForSeq2SeqLM,
5
+ AutoTokenizer,
6
+ DataCollatorForSeq2Seq,
7
+ Trainer,
8
+ TrainingArguments,
9
+ )
10
+
11
+ from src.TextSummarizer.entity import entities
12
+
13
+
14
+ class ModelTrainer:
15
+ """
16
+ Train a model.
17
+ """
18
+ def __init__(self, config: entities.ModelTrainerConfig):
19
+ self.config = config
20
+
21
+ def train(self):
22
+ """
23
+ Train the model.
24
+ """
25
+ device = "cuda" if torch.cuda.is_available() else "cpu"
26
+ tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
27
+ model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
28
+ seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)
29
+
30
+ #loading data
31
+ dataset = load_from_disk(self.config.data_path)
32
+
33
+ # trainer_args = TrainingArguments(
34
+ # output_dir=self.config.root_dir, num_train_epochs=self.config.num_train_epochs, warmup_steps=self.config.warmup_steps,
35
+ # per_device_train_batch_size=self.config.per_device_train_batch_size, per_device_eval_batch_size=self.config.per_device_train_batch_size,
36
+ # weight_decay=self.config.weight_decay, logging_steps=self.config.logging_steps,
37
+ # evaluation_strategy=self.config.evaluation_strategy, eval_steps=self.config.eval_steps, save_steps=1e6,
38
+ # gradient_accumulation_steps=self.config.gradient_accumulation_steps
39
+ # )
40
+
41
+
42
+ trainer_args = TrainingArguments(
43
+ output_dir=self.config.root_dir,
44
+ num_train_epochs=1,
45
+ warmup_steps=500,
46
+ per_device_train_batch_size=1,
47
+ per_device_eval_batch_size=1,
48
+ weight_decay=0.01,
49
+ logging_steps=10,
50
+ evaluation_strategy='steps',
51
+ eval_steps=500,
52
+ save_steps=1e6,
53
+ gradient_accumulation_steps=16
54
+ )
55
+
56
+ trainer = Trainer(
57
+ model=model_pegasus,
58
+ args=trainer_args,
59
+ tokenizer=tokenizer,
60
+ data_collator=seq2seq_data_collator,
61
+ train_dataset=dataset["train"],
62
+ eval_dataset=dataset["validation"])
63
+
64
+ # trainer.train()
65
+
66
+ ## Save model
67
+ model_pegasus.save_pretrained(self.config.model_path)
68
+
69
+ ## Save tokenizer
70
+ tokenizer.save_pretrained(self.config.tokenizer_path)
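One implication of the hard-coded arguments above: with a per-device batch of 1 and 16 gradient-accumulation steps, each optimiser update still sees 16 examples per device. A tiny worked sketch (assuming a single device):

```python
# Effective optimisation batch implied by the TrainingArguments above.
per_device_train_batch_size = 1
gradient_accumulation_steps = 16
num_devices = 1  # assumption: training on a single GPU/CPU
effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_devices
print(effective_batch_size)  # 16
```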
src/TextSummarizer/config/__init__.py ADDED
File without changes
src/TextSummarizer/config/config.yaml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ artifacts_root: artifacts
2
+
3
+
4
+ data_ingestion:
5
+ dataset_name: "alexfabbri/multi_news"
6
+ arrow_dataset_dir: artifacts/data
7
+
8
+
9
+ data_validation:
10
+ root_dir: artifacts/data_validation
11
+ status_file: artifacts/data_validation/status.txt
12
+ all_required_folders: ["train", "test", "validation", "dataset_dict.json"]
13
+
14
+
15
+ data_transformation:
16
+ root_dir: artifacts/data_transformation
17
+ data_path: artifacts/data
18
+ tokenizer_name: google/pegasus-cnn_dailymail
19
+
20
+
21
+ model_trainer:
22
+ root_dir: artifacts/model_trainer
23
+ data_path: artifacts/data_transformation/dataset
24
+ model_ckpt: google/pegasus-cnn_dailymail
25
+ model_path: artifacts/model_trainer/pegasus-samsum-model
26
+ tokenizer_path: artifacts/model_trainer/tokenizer
27
+
28
+ model_evaluation:
29
+ root_dir: artifacts/model_evaluation
30
+ data_path: artifacts/data_transformation/dataset
31
+ model_path: artifacts/model_trainer/pegasus-samsum-model
32
+ tokenizer_path: artifacts/model_trainer/tokenizer
33
+ metric_file_name: artifacts/model_evaluation/metrics.csv
34
+ hub_model_name: pavithra-devi/pegasus-multi-news
src/TextSummarizer/config/config_manager.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration manager to get and set all the configuration.
3
+ """
4
+
5
+ from pathlib import Path
6
+
7
+ from box import ConfigBox
8
+
9
+ from src.TextSummarizer.constants import file_path
10
+ from src.TextSummarizer.entity import entities
11
+ from src.TextSummarizer.utils.general import create_directories, read_yaml
12
+
13
+
14
+ # Create a config manager.
15
+ class ConfigManager:
16
+ """
17
+ Class to manage the configuration files.
18
+ """
19
+
20
+ def __init__(self) -> None:
21
+ self.config: ConfigBox = read_yaml(Path(file_path.CONFIG_FILE_PATH))
22
+ self.params: ConfigBox = read_yaml(Path(file_path.PARAMS_FILE_PATH))
23
+
24
+ create_directories(path_to_directories=[self.config.artifacts_root])
25
+
26
+ def get_data_ingestion_config(self) -> entities.DataIngestionConfig:
27
+ """
28
+ Get the config which is needed to download the data files.
29
+ """
30
+ config: ConfigBox = self.config.data_ingestion
31
+
32
+ data_ingestion_config: entities.DataIngestionConfig = entities.DataIngestionConfig(
33
+ dataset_name=config.dataset_name,
34
+ arrow_dataset_dir=config.arrow_dataset_dir,
35
+ )
36
+
37
+ return data_ingestion_config
38
+
39
+ def get_data_validation_config(self) -> entities.DataValidationConfig:
40
+ """
41
+ Get the config which is needed to validate the data files.
42
+ """
43
+ config = self.config.data_validation
44
+
45
+ create_directories([config.root_dir])
46
+
47
+ data_validation_config = entities.DataValidationConfig(
48
+ root_dir=config.root_dir,
49
+ status_file=config.status_file,
50
+ all_required_folders=config.all_required_folders,
51
+ )
52
+
53
+ return data_validation_config
54
+
55
+ def get_data_transformation_config(self) -> entities.DataTransformationConfig:
56
+ """
57
+ Get the data transformation configuration.
58
+ """
59
+ config = self.config.data_transformation
60
+
61
+ create_directories([config.root_dir])
62
+
63
+ data_transformation_config = entities.DataTransformationConfig(
64
+ root_dir=config.root_dir,
65
+ data_path=config.data_path,
66
+ tokenizer_name = config.tokenizer_name
67
+ )
68
+
69
+ return data_transformation_config
70
+
71
+ def get_model_trainer_config(self) -> entities.ModelTrainerConfig:
72
+ """
73
+ Get the configuration which is needed to train the model.
74
+ """
75
+ config = self.config.model_trainer
76
+ params = self.params.TrainingArguments
77
+
78
+ create_directories([config.root_dir])
79
+
80
+ model_trainer_config = entities.ModelTrainerConfig(
81
+ root_dir=config.root_dir,
82
+ data_path=config.data_path,
83
+ model_path= config.model_path,
84
+ tokenizer_path= config.tokenizer_path,
85
+ model_ckpt = config.model_ckpt,
86
+ num_train_epochs = params.num_train_epochs,
87
+ warmup_steps = params.warmup_steps,
88
+ per_device_train_batch_size = params.per_device_train_batch_size,
89
+ weight_decay = params.weight_decay,
90
+ logging_steps = params.logging_steps,
91
+ evaluation_strategy = params.evaluation_strategy,
92
+ eval_steps = params.eval_steps,
93
+ save_steps = params.save_steps,
94
+ gradient_accumulation_steps = params.gradient_accumulation_steps
95
+ )
96
+
97
+ return model_trainer_config
98
+
99
+ def get_model_evaluation_config(self) -> entities.ModelEvaluationConfig:
100
+ """
101
+ Get the model evaluation configuration.
102
+ """
103
+ config = self.config.model_evaluation
104
+
105
+ create_directories([config.root_dir])
106
+
107
+ model_evaluation_config = entities.ModelEvaluationConfig(
108
+ root_dir=config.root_dir,
109
+ data_path=config.data_path,
110
+ model_path = config.model_path,
111
+ tokenizer_path = config.tokenizer_path,
112
+ metric_file_name = config.metric_file_name,
113
+ hub_model_name=config.hub_model_name
114
+
115
+ )
116
+
117
+ return model_evaluation_config
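The dotted attribute access used throughout this class (for example self.config.data_ingestion.dataset_name) comes from the ConfigBox that read_yaml returns; a minimal sketch, with an inline dict standing in for the parsed YAML:

```python
# Sketch: ConfigBox wraps a nested dict (written inline here, values taken from config.yaml)
# and exposes both attribute-style and key-style access.
from box import ConfigBox

config = ConfigBox({
    "artifacts_root": "artifacts",
    "data_ingestion": {
        "dataset_name": "alexfabbri/multi_news",
        "arrow_dataset_dir": "artifacts/data",
    },
})

print(config.data_ingestion.dataset_name)        # alexfabbri/multi_news
print(config["data_ingestion"]["dataset_name"])  # plain dictionary access still works
```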
src/TextSummarizer/config/params.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ TrainingArguments:
2
+ num_train_epochs: 1
3
+ warmup_steps: 500
4
+ per_device_train_batch_size: 1
5
+ weight_decay: 0.01
6
+ logging_steps: 10
7
+ evaluation_strategy: steps
8
+ eval_steps: 500
9
+ save_steps: 1000000
10
+ gradient_accumulation_steps: 16
src/TextSummarizer/constants/__init__.py ADDED
File without changes
src/TextSummarizer/constants/file_path.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Constants related to the file paths are defined here.
3
+ """
4
+
5
+ from typing import Final
6
+
7
+ CONFIG_FILE_PATH: Final[str] = "src/TextSummarizer/config/config.yaml"
8
+ PARAMS_FILE_PATH: Final[str] = "src/TextSummarizer/config/params.yaml"
src/TextSummarizer/entity/__init__.py ADDED
File without changes
src/TextSummarizer/entity/entities.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ All the configuration dataclasses returned by the config manager are defined here.
3
+ """
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+
8
+
9
+ @dataclass()
10
+ class DataIngestionConfig:
11
+ """
12
+ The return type of the data ingestion config function.
13
+ """
14
+ dataset_name: str
15
+ arrow_dataset_dir: str
16
+
17
+
18
+ @dataclass()
19
+ class DataValidationConfig:
20
+ """
21
+ Return type of the data validation config function.
22
+ """
23
+ root_dir: str
24
+ status_file: str
25
+ all_required_folders: list
26
+
27
+
28
+ @dataclass(frozen=True)
29
+ class DataTransformationConfig:
30
+ """
31
+ Return type of the data transformation config function.
32
+ """
33
+ root_dir: str
34
+ data_path: str
35
+ tokenizer_name: str
36
+
37
+
38
+ @dataclass(frozen=True)
39
+ class ModelTrainerConfig:
40
+ """
41
+ Return type of the model trainer config function.
42
+ """
43
+ root_dir: str
44
+ data_path: str
45
+ model_ckpt: str
46
+ model_path: str
47
+ tokenizer_path: str
48
+ num_train_epochs: int
49
+ warmup_steps: int
50
+ per_device_train_batch_size: int
51
+ weight_decay: float
52
+ logging_steps: int
53
+ evaluation_strategy: str
54
+ eval_steps: int
55
+ save_steps: float
56
+ gradient_accumulation_steps: int
57
+
58
+
59
+ @dataclass(frozen=True)
60
+ class ModelEvaluationConfig:
61
+ """
62
+ Return type of the model evaluation config function.
63
+ """
64
+ root_dir: str
65
+ data_path: str
66
+ model_path: str
67
+ tokenizer_path: str
68
+ metric_file_name: str
69
+ hub_model_name: str
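For illustration, one of these frozen dataclasses can be built by hand (a sketch; values taken from config.yaml, whereas in practice ConfigManager constructs them):

```python
# Sketch: constructing a DataTransformationConfig directly, outside ConfigManager.
from src.TextSummarizer.entity import entities

cfg = entities.DataTransformationConfig(
    root_dir="artifacts/data_transformation",
    data_path="artifacts/data",
    tokenizer_name="google/pegasus-cnn_dailymail",
)
print(cfg.tokenizer_name)
# cfg.tokenizer_name = "other"  # frozen=True, so this would raise FrozenInstanceError
```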
src/TextSummarizer/exception/__init__.py ADDED
File without changes
src/TextSummarizer/logger/__init__.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Creating the loggers needed for the project.
3
+ """
4
+
5
+ import logging
6
+ import os
7
+
8
+ logging_str: str = "[%(asctime)s - %(levelname)s - %(module)s.py - %(name)s]: %(message)s"
9
+
10
+ # Create the log directory.
11
+ log_dir: str = "logs"
12
+ backend_log_filename: str = "back_end.log"
13
+ frontend_log_filename: str = "front_end.log"
14
+ backend_log_filepath: str = os.path.join(log_dir, backend_log_filename)
15
+ frontend_log_filepath: str = os.path.join(log_dir, frontend_log_filename)
16
+
17
+ os.makedirs(log_dir, exist_ok=True)
18
+
19
+ # Set the logging config.
20
+ logging.basicConfig(
21
+ level=logging.DEBUG,
22
+ format=logging_str,
23
+ # handlers=[
24
+ # logging.FileHandler(log_filepath),
25
+ # logging.StreamHandler(sys.stdout)
26
+ # ]
27
+ )
28
+
29
+ # Get the logger.
30
+ backend_file_handler = logging.FileHandler(backend_log_filepath)
31
+ frontend_file_handler = logging.FileHandler(frontend_log_filepath)
32
+
33
+ # Add the formatter to the handlers.
34
+ backend_file_handler.setFormatter(logging.Formatter(logging_str))
35
+ frontend_file_handler.setFormatter(logging.Formatter(logging_str))
36
+
37
+ backend_logger = logging.getLogger("backend")
38
+ frontend_logger = logging.getLogger("frontend")
39
+
40
+
41
+ # add the handlers.
42
+ backend_logger.addHandler(backend_file_handler)
43
+ frontend_logger.addHandler(frontend_file_handler)
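Elsewhere in the project these loggers are simply imported and used (a sketch; the messages are invented):

```python
# Sketch: using the two project loggers. Records land in logs/back_end.log and
# logs/front_end.log via the file handlers attached above.
from src.TextSummarizer.logger import backend_logger, frontend_logger

backend_logger.info("Training pipeline starting")      # -> logs/back_end.log
frontend_logger.warning("No input text was provided")  # -> logs/front_end.log
```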
src/TextSummarizer/pipeline/__init__.py ADDED
File without changes
src/TextSummarizer/pipeline/prediction.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+
3
+ from src.TextSummarizer.config.config_manager import ConfigManager
4
+
5
+
6
+ class PredictionPipeline:
7
+ def __init__(self):
8
+ self.config = ConfigManager().get_model_evaluation_config()
9
+
10
+ def predict(self,text):
11
+ """
12
+ Generate the summary for the given text.
13
+ """
14
+ gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}
15
+
16
+ summarizer = pipeline("summarization", model=self.config.hub_model_name)
17
+
18
+ print("document:")
19
+ print(text)
20
+
21
+ output = summarizer(text, **gen_kwargs)[0]["summary_text"]
22
+ print("\nModel Summary:")
23
+ print(output)
24
+
25
+ return output
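End to end, the prediction pipeline can be driven like this (a sketch; the document text is invented, and the model is pulled from the hub_model_name configured in config.yaml):

```python
# Sketch: summarizing one invented document with the prediction pipeline.
# The first call downloads pavithra-devi/pegasus-multi-news from the Hugging Face Hub.
from src.TextSummarizer.pipeline.prediction import PredictionPipeline

document = (
    "The city council met on Tuesday to discuss the proposed budget. "
    "After a long debate, members voted to increase funding for public transport."
)
summary = PredictionPipeline().predict(document)
print(summary)
```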
src/TextSummarizer/pipeline/step_01_data_ingestion.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The data ingestion pipeline.
3
+ """
4
+
5
+ from src.TextSummarizer.components.data_ingestion import DataIngestionComponent
6
+ from src.TextSummarizer.config.config_manager import ConfigManager
7
+ from src.TextSummarizer.entity import entities
8
+ from src.TextSummarizer.logger import backend_logger
9
+
10
+
11
+ class DataIngestionPipeline:
12
+ """
13
+ The data ingestion pipeline.
14
+ """
15
+
16
+ def run(self):
17
+ """
18
+ The main function of the data ingestion pipeline.
19
+ """
20
+ backend_logger.info("Starting the data ingestion pipeline.")
21
+ config: ConfigManager = ConfigManager()
22
+ data_ingestion_config: entities.DataIngestionConfig = config.get_data_ingestion_config()
23
+ data_ingestion = DataIngestionComponent(config=data_ingestion_config)
24
+ data_ingestion.save_dataset()
25
+ backend_logger.info("Finished the data ingestion pipeline.")
src/TextSummarizer/pipeline/step_02_data_validation.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The data validation pipeline.
3
+ """
4
+
5
+ from src.TextSummarizer.components.data_validation import DataValidation
6
+ from src.TextSummarizer.config.config_manager import ConfigManager
7
+ from src.TextSummarizer.entity import entities
8
+ from src.TextSummarizer.logger import backend_logger
9
+
10
+
11
+ class DataValidationPipeline:
12
+ """
13
+ The data validation pipeline.
14
+ """
15
+
16
+ def run(self):
17
+ """
18
+ The main function of the data validation pipeline.
19
+ """
20
+ backend_logger.info("Starting the data validation pipeline.")
21
+ config: ConfigManager = ConfigManager()
22
+ data_validation_config: entities.DataValidationConfig = config.get_data_validation_config()
23
+ data_validation = DataValidation(config=data_validation_config)
24
+ data_validation.validate_all_files_exist()
25
+ backend_logger.info("Finished the data validation pipeline.")
src/TextSummarizer/pipeline/step_03_data_transformation.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The data Transformation pipeline.
3
+ """
4
+
5
+ from src.TextSummarizer.components.data_transformation import DataTransformation
6
+ from src.TextSummarizer.config.config_manager import ConfigManager
7
+ from src.TextSummarizer.entity import entities
8
+ from src.TextSummarizer.logger import backend_logger
9
+
10
+
11
+ class DataTransformationPipeline:
12
+ """
13
+ The data Transformation pipeline.
14
+ """
15
+
16
+ def run(self):
17
+ """
18
+ The main function of the data Transformation pipeline.
19
+ """
20
+ backend_logger.info("Starting the data Transformation pipeline.")
21
+ config: ConfigManager = ConfigManager()
22
+ data_transformation_config: entities.DataTransformationConfig = config.get_data_transformation_config()
23
+ data_transformation = DataTransformation(config=data_transformation_config)
24
+ data_transformation.convert()
25
+ backend_logger.info("Finished the data Transformation pipeline.")
src/TextSummarizer/pipeline/step_04_train_model.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The model training pipeline.
3
+ """
4
+
5
+ from src.TextSummarizer.components.train_model import ModelTrainer
6
+ from src.TextSummarizer.config.config_manager import ConfigManager
7
+ from src.TextSummarizer.entity import entities
8
+ from src.TextSummarizer.logger import backend_logger
9
+
10
+
11
+ class ModelTrainerPipeline:
12
+ """
13
+ The model training pipeline.
14
+ """
15
+
16
+ def run(self):
17
+ """
18
+ The main function of the train model pipeline.
19
+ """
20
+ backend_logger.info("Starting the train model pipeline.")
21
+ config: ConfigManager = ConfigManager()
22
+ train_model_config: entities.ModelTrainerConfig = config.get_model_trainer_config()
23
+ train_model = ModelTrainer(config=train_model_config)
24
+ train_model.train()
25
+ backend_logger.info("Finished the train model pipeline.")
src/TextSummarizer/pipeline/step_05_model_evaluation.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The model evaluation pipeline.
3
+ """
4
+
5
+ from src.TextSummarizer.components.model_evaluation import ModelEvaluation
6
+ from src.TextSummarizer.config.config_manager import ConfigManager
7
+ from src.TextSummarizer.entity import entities
8
+ from src.TextSummarizer.logger import backend_logger
9
+
10
+
11
+ class ModelEvaluationPipeline:
12
+ """
13
+ The model evaluation pipeline.
14
+ """
15
+
16
+ def run(self):
17
+ """
18
+ The main function of the model evaluation pipeline.
19
+ """
20
+ backend_logger.info("Starting the model evaluation pipeline.")
21
+ config = ConfigManager()
22
+ model_evaluation_config = config.get_model_evaluation_config()
23
+ model_evaluation = ModelEvaluation(config=model_evaluation_config)
24
+ model_evaluation.run()
25
+ backend_logger.info("Finished the model evaluation pipeline.")
src/TextSummarizer/utils/__init__.py ADDED
File without changes
src/TextSummarizer/utils/general.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The functions used throughout the project are defined here.
3
+ """
4
+
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import yaml
10
+ from box import ConfigBox
11
+ from box.exceptions import BoxValueError
12
+ from ensure import ensure_annotations
13
+
14
+ from src.TextSummarizer.logger import backend_logger
15
+
16
+
17
+ @ensure_annotations
18
+ def read_yaml(path_to_yaml: Path) -> ConfigBox:
19
+ """
20
+ Read yaml file and return as Dictionary.
21
+
22
+ :param path_to_yaml: Path to yaml file.
23
+ :return: A ConfigBox dictionary object containing the yaml file contents.
24
+ """
25
+ try:
26
+ with open(path_to_yaml) as yaml_file:
27
+ content = yaml.safe_load(yaml_file)
28
+ backend_logger.info(f"yaml file: {path_to_yaml} loaded successfully")
29
+ return ConfigBox(content)
30
+ except BoxValueError:
31
+ raise ValueError(f"yaml file: {path_to_yaml} is empty.")
32
+ except Exception as exp:
33
+ raise exp
34
+
35
+
36
+ def create_directories(path_to_directories: list) -> None:
37
+ """
38
+ Create the given list of directories.
39
+
40
+ :param path_to_directories: list of directory paths to create.
41
+ """
42
+ for path in path_to_directories:
43
+ os.makedirs(path, exist_ok=True)
44
+ backend_logger.info(f"created directory at: {path}")
45
+
46
+
47
+ @ensure_annotations
48
+ def get_size(path: Path) -> str:
49
+ """
50
+ Get the file size in KB.
51
+
52
+ :param path: Path of the file.
53
+ :returns: Size in KB.
54
+ """
55
+ size_in_kb = round(os.path.getsize(path)/1024)
56
+ return f"~ {size_in_kb} KB"
src/__init__.py ADDED
File without changes