Commit 34b369f · Parent: 022c91d
added the application
Files changed:
- .github/workflows/.gitkeep +0 -0
- .gitignore +166 -0
- Dockerfile +0 -0
- LICENSE +21 -0
- app.py +49 -0
- check_code/01_data_dowanloading.ipynb +179 -0
- check_code/02_data_validation.ipynb +150 -0
- check_code/03_data_transformation.ipynb +132 -0
- check_code/04_model_training.ipynb +184 -0
- check_code/05_model_evaluation.ipynb +177 -0
- check_code/predict.ipynb +504 -0
- check_code/trials.ipynb +0 -0
- main.py +74 -0
- project_structure.py +52 -0
- requirements.txt +21 -0
- setup.py +36 -0
- src/TextSummarizer/components/__init__.py +0 -0
- src/TextSummarizer/components/data_ingestion.py +25 -0
- src/TextSummarizer/components/data_transformation.py +45 -0
- src/TextSummarizer/components/data_validation.py +36 -0
- src/TextSummarizer/components/model_evaluation.py +87 -0
- src/TextSummarizer/components/train_model.py +70 -0
- src/TextSummarizer/config/__init__.py +0 -0
- src/TextSummarizer/config/config.yaml +34 -0
- src/TextSummarizer/config/config_manager.py +117 -0
- src/TextSummarizer/config/params.yaml +10 -0
- src/TextSummarizer/constants/__init__.py +0 -0
- src/TextSummarizer/constants/file_path.py +8 -0
- src/TextSummarizer/entity/__init__.py +0 -0
- src/TextSummarizer/entity/entities.py +69 -0
- src/TextSummarizer/exception/__init__.py +0 -0
- src/TextSummarizer/logger/__init__.py +43 -0
- src/TextSummarizer/pipeline/__init__.py +0 -0
- src/TextSummarizer/pipeline/prediction.py +25 -0
- src/TextSummarizer/pipeline/step_01_data_ingestion.py +25 -0
- src/TextSummarizer/pipeline/step_02_data_validation.py +25 -0
- src/TextSummarizer/pipeline/step_03_data_transformation.py +25 -0
- src/TextSummarizer/pipeline/step_04_train_model.py +25 -0
- src/TextSummarizer/pipeline/step_05_model_evaluation.py +25 -0
- src/TextSummarizer/utils/__init__.py +0 -0
- src/TextSummarizer/utils/general.py +56 -0
- src/__init__.py +0 -0
.github/workflows/.gitkeep
ADDED
File without changes
.gitignore
ADDED
@@ -0,0 +1,166 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# leave the dataset file folder
+artifacts/
+venv_text_summarizaition/
Dockerfile
ADDED
File without changes
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Pavithra Devi M
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
app.py
ADDED
@@ -0,0 +1,49 @@
+import os
+
+import uvicorn
+from fastapi import FastAPI
+from fastapi.responses import Response
+from starlette.responses import RedirectResponse
+from src.TextSummarizer.pipeline.prediction import PredictionPipeline
+
+text: str = "What is Text Summarization?"
+
+app = FastAPI()
+
+@app.get("/", tags=["authentication"])
+async def index():
+    """
+    The main page.
+    """
+    return "The API is UP and running."
+
+
+@app.get("/train")
+async def training():
+    """
+    The training page.
+    """
+    try:
+        os.system("python main.py")
+        return Response("Training successful !!")
+
+    except Exception as e:
+        return Response(f"Error Occurred! {e}")
+
+
+@app.post("/predict")
+async def predict_route(text):
+    """
+    The prediction api call.
+    """
+    try:
+
+        obj = PredictionPipeline()
+        text = obj.predict(text)
+        return text
+    except Exception as e:
+        raise e
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8080)
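A quick way to exercise these endpoints without starting uvicorn is FastAPI's test client. A minimal sketch, assuming the app imports cleanly and the trained model artifacts exist (the sample text is a placeholder):

# Smoke-test the routes above with FastAPI's TestClient (needs httpx installed).
from fastapi.testclient import TestClient

from app import app

client = TestClient(app)

# Health check: should return "The API is UP and running."
print(client.get("/").json())

# predict_route declares `text` as a bare parameter, so FastAPI reads it
# from the query string, not the request body.
resp = client.post("/predict", params={"text": "Some long article text ..."})
print(resp.json())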
check_code/01_data_dowanloading.ipynb
ADDED
@@ -0,0 +1,179 @@
(Notebook JSON; the six code cells are shown below, with outputs summarized in brackets.)

# Cell 1: move the kernel to the repo root
import os
%pwd
os.chdir("../")
%pwd
# [Output: '/home/pavithra/projects/Text-summarization-nlp']

# Cell 2: config entity for data ingestion
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    dataset_name: str
    arrow_dataset_dir: str

# Cell 3: configuration manager
from box import ConfigBox
from pathlib import Path
from src.TextSummarizer.constants import file_path
from src.TextSummarizer.utils.general import read_yaml, create_directories

# Create a config manager.
class ConfigManager:
    """
    Class to manage the configuration files.
    """

    def __init__(self) -> None:
        self.config: ConfigBox = read_yaml(Path(file_path.CONFIG_FILE_PATH))
        self.params: ConfigBox = read_yaml(Path(file_path.PARAMS_FILE_PATH))

        create_directories(path_to_directories=[self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Get the config which is needed to download the data files.
        """
        config: ConfigBox = self.config.data_ingestion

        data_ingestion_config: DataIngestionConfig = DataIngestionConfig(
            dataset_name=config.dataset_name,
            arrow_dataset_dir=config.arrow_dataset_dir,
        )

        return data_ingestion_config

# Cell 4: one-off download experiment (kept commented out)
# from datasets import load_dataset
# test_dataset = load_dataset("alexfabbri/multi_news")
# test_dataset.save_to_disk("multi_news_arrow_dataset")
# from datasets import load_from_disk
# arrow_datasets_reloaded = load_from_disk("multi_news_arrow_dataset")

# Cell 5: ingestion component
from datasets import load_dataset

class DataIngestionComponent:
    """
    A class which is responsible for data ingestion.
    """

    def __init__(self, config: DataIngestionConfig) -> None:
        self.config = config

    def save_dataset(self):
        """
        Load the dataset.
        """
        test_dataset = load_dataset(self.config.dataset_name)
        test_dataset.save_to_disk(self.config.arrow_dataset_dir)

# Cell 6: run the pipeline step
try:
    config: ConfigManager = ConfigManager()
    data_ingestion_config: DataIngestionConfig = config.get_data_ingestion_config()
    data_ingestion = DataIngestionComponent(config=data_ingestion_config)
    data_ingestion.save_dataset()
except Exception as err:
    raise err
# [Output: FileNotFoundError: [Errno 2] No such file or directory: 'config/config.yaml']
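The FileNotFoundError in the last cell is a path mismatch: this commit stores the config at src/TextSummarizer/config/config.yaml, while file_path.CONFIG_FILE_PATH evidently resolves to the relative path config/config.yaml. A hedged sketch of one fix inside the constants module, anchoring the lookup to the repo root instead of the working directory (the marker-based search is an illustration, not the repo's actual code):

# Sketch for src/TextSummarizer/constants/file_path.py: compute absolute
# config paths. The setup.py marker and the yaml locations are assumptions
# based on this commit's file list.
from pathlib import Path

def find_repo_root(start: Path, marker: str = "setup.py") -> Path:
    """Walk upward from `start` until a directory containing `marker` is found."""
    for candidate in [start, *start.parents]:
        if (candidate / marker).exists():
            return candidate
    raise FileNotFoundError(f"no {marker} found above {start}")

ROOT = find_repo_root(Path(__file__).resolve())
CONFIG_FILE_PATH = ROOT / "src" / "TextSummarizer" / "config" / "config.yaml"
PARAMS_FILE_PATH = ROOT / "src" / "TextSummarizer" / "config" / "params.yaml"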
check_code/02_data_validation.ipynb
ADDED
@@ -0,0 +1,150 @@
(Notebook JSON; the five code cells are shown below.)

# Cell 1: move the kernel to the repo root
import os
%pwd
os.chdir("../")

%pwd
# [Output: '/home/pavithra/projects/Text-summarization-nlp']

# Cell 2: config entity for data validation
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    root_dir: str
    status_file: str
    all_required_folders: list

# Cell 3: configuration manager
from src.TextSummarizer.constants import file_path
from src.TextSummarizer.utils.general import read_yaml, create_directories

class ConfigurationManager:
    def __init__(
        self,
        config_filepath = file_path.CONFIG_FILE_PATH,
        params_filepath = file_path.PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        config = self.config.data_validation

        create_directories([config.root_dir])

        data_validation_config = DataValidationConfig(
            root_dir=config.root_dir,
            status_file=config.status_file,
            all_required_folders=config.all_required_folders,
        )

        return data_validation_config

# Cell 4: validation component
class DataValidation:
    def __init__(self, config: DataValidationConfig):
        self.config = config

    def validate_all_files_exist(self) -> bool:
        try:
            validation_status: bool | None = None

            all_folder = os.listdir(os.path.join("artifacts", "data"))

            for folder in all_folder:
                if folder not in self.config.all_required_folders:
                    validation_status = False
                    with open(self.config.status_file, "w") as f:
                        f.write(f"Validation status: {validation_status}")
                else:
                    validation_status = True
                    with open(self.config.status_file, "w") as f:
                        f.write(f"Validation status: {validation_status}")

            return validation_status

        except Exception as exp:
            raise exp

# Cell 5: run the pipeline step
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)
    data_validation.validate_all_files_exist()
except Exception as e:
    raise e
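One caveat in validate_all_files_exist above: validation_status is overwritten on every loop iteration, so an unexpected folder found early in the listing is masked if the last folder checked is valid. A hedged corrected version of the method (same config fields; the early exit and the single status write are the only behavioral changes):

# Sketch: fail as soon as an unexpected folder appears, and write the
# status file once after the scan instead of once per folder.
def validate_all_files_exist(self) -> bool:
    validation_status = True
    for folder in os.listdir(os.path.join("artifacts", "data")):
        if folder not in self.config.all_required_folders:
            validation_status = False
            break
    with open(self.config.status_file, "w") as f:
        f.write(f"Validation status: {validation_status}")
    return validation_status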
check_code/03_data_transformation.ipynb
ADDED
@@ -0,0 +1,132 @@
(Notebook JSON; the five code cells are shown below.)

# Cell 1: move the kernel to the repo root
import os
%pwd
os.chdir("../")

%pwd

# Cell 2: config entity for data transformation
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: str
    data_path: str
    tokenizer_name: str

# Cell 3: configuration manager
from box import ConfigBox
from pathlib import Path
from src.TextSummarizer.constants import file_path
from src.TextSummarizer.utils.general import read_yaml, create_directories

class ConfigurationManager:
    """
    Class to manage the configuration files.
    """

    def __init__(self) -> None:
        self.config: ConfigBox = read_yaml(Path(file_path.CONFIG_FILE_PATH))
        self.params: ConfigBox = read_yaml(Path(file_path.PARAMS_FILE_PATH))

        create_directories(path_to_directories=[self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            tokenizer_name=config.tokenizer_name
        )

        return data_transformation_config

# Cell 4: transformation component
import os
from src.TextSummarizer.logger import backend_logger
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk


class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

    def convert_examples_to_features(self, example_batch):
        input_encodings = self.tokenizer(example_batch['dialogue'], max_length=800, truncation=True)

        with self.tokenizer.as_target_tokenizer():
            target_encodings = self.tokenizer(example_batch['summary'], max_length=128, truncation=True)

        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids']
        }

    def convert(self):
        dataset = load_from_disk(self.config.data_path)
        dataset = dataset.map(self.convert_examples_to_features, batched=True)
        dataset.save_to_disk(os.path.join(self.config.root_dir, "dataset"))

# Cell 5: run the pipeline step
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.convert()
except Exception as e:
    raise e
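as_target_tokenizer() works here but has since been deprecated in transformers in favor of passing the summaries via text_target. A hedged equivalent of convert_examples_to_features (the exact version cutoff, roughly transformers >= 4.22, is an assumption worth pinning):

# Same feature conversion without the deprecated context manager.
def convert_examples_to_features(self, example_batch):
    model_inputs = self.tokenizer(
        example_batch["dialogue"], max_length=800, truncation=True
    )
    # text_target tokenizes the summaries as decoder labels in one call.
    labels = self.tokenizer(
        text_target=example_batch["summary"], max_length=128, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs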
check_code/04_model_training.ipynb
ADDED
@@ -0,0 +1,184 @@
(Notebook JSON; the code cells are shown below.)

# Cell 1: move the kernel to the repo root
import os
%pwd
os.chdir("../")

%pwd

# Cell 2: config entity for model training
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: str
    data_path: str
    model_ckpt: str
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int

# Cell 3: configuration manager
from box import ConfigBox
from pathlib import Path
from src.TextSummarizer.constants import file_path
from src.TextSummarizer.utils.general import read_yaml, create_directories


class ConfigurationManager:

    def __init__(self) -> None:
        self.config: ConfigBox = read_yaml(Path(file_path.CONFIG_FILE_PATH))
        self.params: ConfigBox = read_yaml(Path(file_path.PARAMS_FILE_PATH))

        create_directories(path_to_directories=[self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_ckpt=config.model_ckpt,
            num_train_epochs=params.num_train_epochs,
            warmup_steps=params.warmup_steps,
            per_device_train_batch_size=params.per_device_train_batch_size,
            weight_decay=params.weight_decay,
            logging_steps=params.logging_steps,
            evaluation_strategy=params.evaluation_strategy,
            eval_steps=params.eval_steps,
            save_steps=params.save_steps,
            gradient_accumulation_steps=params.gradient_accumulation_steps
        )

        return model_trainer_config

# Cell 4: trainer component
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch


class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def train(self):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

        # loading data
        dataset = load_from_disk(self.config.data_path)

        # trainer_args = TrainingArguments(
        #     output_dir=self.config.root_dir, num_train_epochs=self.config.num_train_epochs, warmup_steps=self.config.warmup_steps,
        #     per_device_train_batch_size=self.config.per_device_train_batch_size, per_device_eval_batch_size=self.config.per_device_train_batch_size,
        #     weight_decay=self.config.weight_decay, logging_steps=self.config.logging_steps,
        #     evaluation_strategy=self.config.evaluation_strategy, eval_steps=self.config.eval_steps, save_steps=1e6,
        #     gradient_accumulation_steps=self.config.gradient_accumulation_steps
        # )

        trainer_args = TrainingArguments(
            output_dir=self.config.root_dir,
            num_train_epochs=1,
            warmup_steps=500,
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            weight_decay=0.01,
            logging_steps=10,
            evaluation_strategy='steps',
            eval_steps=500,
            save_steps=1e6,
            gradient_accumulation_steps=16
        )

        trainer = Trainer(
            model=model_pegasus,
            args=trainer_args,
            tokenizer=tokenizer,
            data_collator=seq2seq_data_collator,
            train_dataset=dataset["train"],
            eval_dataset=dataset["validation"])

        # trainer.train()

        ## Save model
        model_pegasus.save_pretrained("multi-news-model")

        ## Save tokenizer
        tokenizer.save_pretrained("tokenizer")

# Cell 5: run the pipeline step
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    model_trainer.train()
except Exception as e:
    raise e
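Note that with trainer.train() left commented out, the save_pretrained calls write the unmodified base checkpoint, so "multi-news-model" contains no fine-tuned weights. To actually fine-tune before saving (long-running, and realistically a GPU job for a Pegasus-sized model):

# Re-enable training before the save calls at the end of train():
trainer.train()
model_pegasus.save_pretrained("multi-news-model")
tokenizer.save_pretrained("tokenizer")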
check_code/05_model_evaluation.ipynb
ADDED
@@ -0,0 +1,177 @@
(Notebook JSON; the code cells are shown below.)

# Cell 1: move the kernel to the repo root
import os
%pwd
os.chdir("../")

%pwd

# Cell 2: config entity for model evaluation
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir: str
    data_path: str
    model_path: str
    tokenizer_path: str
    metric_file_name: str

# Cell 3: configuration manager
from box import ConfigBox
from pathlib import Path
from src.TextSummarizer.constants import file_path
from src.TextSummarizer.utils.general import read_yaml, create_directories


class ConfigurationManager:

    def __init__(self) -> None:
        self.config: ConfigBox = read_yaml(Path(file_path.CONFIG_FILE_PATH))
        self.params: ConfigBox = read_yaml(Path(file_path.PARAMS_FILE_PATH))

        create_directories(path_to_directories=[self.config.artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        config = self.config.model_evaluation

        create_directories([config.root_dir])

        model_evaluation_config = ModelEvaluationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_path=config.model_path,
            tokenizer_path=config.tokenizer_path,
            metric_file_name=config.metric_file_name
        )

        return model_evaluation_config

# Cell 4: evaluation component
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk, load_metric
import torch
import pandas as pd
from tqdm import tqdm

class ModelEvaluation:
    def __init__(self, config: ModelEvaluationConfig):
        self.config = config

    def generate_batch_sized_chunks(self, list_of_elements, batch_size):
        """Split the dataset into smaller batches that we can process simultaneously.
        Yield successive batch-sized chunks from list_of_elements."""
        for i in range(0, len(list_of_elements), batch_size):
            yield list_of_elements[i : i + batch_size]

    def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer,
                                    batch_size=16, device="cuda" if torch.cuda.is_available() else "cpu",
                                    column_text="article",
                                    column_summary="highlights"):
        article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))
        target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))

        for article_batch, target_batch in tqdm(
            zip(article_batches, target_batches), total=len(article_batches)):

            inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                               padding="max_length", return_tensors="pt")

            # The length_penalty parameter ensures that the model does not
            # generate sequences that are too long.
            summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                       attention_mask=inputs["attention_mask"].to(device),
                                       length_penalty=0.8, num_beams=8, max_length=128)

            # Finally, we decode the generated texts, replace the <n> token,
            # and add the decoded texts with the references to the metric.
            decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                                  clean_up_tokenization_spaces=True)
                                 for s in summaries]

            decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

            metric.add_batch(predictions=decoded_summaries, references=target_batch)

        # Finally compute and return the ROUGE scores.
        score = metric.compute()
        return score

    def evaluate(self):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)

        # loading data
        dataset_samsum_pt = load_from_disk(self.config.data_path)

        rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

        rouge_metric = load_metric('rouge')

        score = self.calculate_metric_on_test_ds(
            dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, batch_size=2, column_text='dialogue', column_summary='summary'
        )

        rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)

        df = pd.DataFrame(rouge_dict, index=['pegasus'])
        df.to_csv(self.config.metric_file_name, index=False)

# Cell 5: run the pipeline step
try:
    config = ConfigurationManager()
    model_evaluation_config = config.get_model_evaluation_config()
    model_evaluation = ModelEvaluation(config=model_evaluation_config)
    model_evaluation.evaluate()
except Exception as e:
    raise e
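datasets.load_metric is deprecated and has been removed in recent datasets releases; its replacement lives in the standalone evaluate package. A hedged drop-in for the metric plumbing above (the evaluate and rouge_score package requirements are assumptions to pin):

# pip install evaluate rouge_score
import evaluate

rouge_metric = evaluate.load("rouge")
# metric.add_batch(...) works unchanged inside the loop; compute() then
# returns plain floats per metric, so score[rn].mid.fmeasure becomes:
# rouge_dict = {rn: score[rn] for rn in rouge_names}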
check_code/predict.ipynb
ADDED
@@ -0,0 +1,504 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 8,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"data": {
|
10 |
+
"application/vnd.jupyter.widget-view+json": {
|
11 |
+
"model_id": "6fe4263eb5e743e7968cf6f1b140a744",
|
12 |
+
"version_major": 2,
|
13 |
+
"version_minor": 0
|
14 |
+
},
|
15 |
+
"text/plain": [
|
16 |
+
"VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
|
17 |
+
]
|
18 |
+
},
|
19 |
+
"metadata": {},
|
20 |
+
"output_type": "display_data"
|
21 |
+
}
|
22 |
+
],
|
23 |
+
"source": [
|
24 |
+
"# pavithra-devi/google_pegasus_multi_news_model\n",
|
25 |
+
"\n",
|
26 |
+
"\n",
|
27 |
+
"from huggingface_hub import notebook_login\n",
|
28 |
+
"\n",
|
29 |
+
"notebook_login()\n"
|
30 |
+
]
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"cell_type": "code",
|
34 |
+
"execution_count": 10,
|
35 |
+
"metadata": {},
|
36 |
+
"outputs": [
|
37 |
+
{
|
38 |
+
"data": {
|
39 |
+
"application/vnd.jupyter.widget-view+json": {
|
40 |
+
"model_id": "df05cc7954324819a2ba5645e881ea49",
|
41 |
+
"version_major": 2,
|
42 |
+
"version_minor": 0
|
43 |
+
},
|
44 |
+
"text/plain": [
|
45 |
+
"tokenizer_config.json: 0%| | 0.00/20.1k [00:00<?, ?B/s]"
|
46 |
+
]
|
47 |
+
},
|
48 |
+
"metadata": {},
|
49 |
+
"output_type": "display_data"
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"data": {
|
53 |
+
"application/vnd.jupyter.widget-view+json": {
|
54 |
+
"model_id": "9ca03f3fecbe48f994a272a75a06e779",
|
55 |
+
"version_major": 2,
|
56 |
+
"version_minor": 0
|
57 |
+
},
|
58 |
+
"text/plain": [
|
59 |
+
"spiece.model: 0%| | 0.00/1.91M [00:00<?, ?B/s]"
|
60 |
+
]
|
61 |
+
},
|
62 |
+
"metadata": {},
|
63 |
+
"output_type": "display_data"
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"data": {
|
67 |
+
"application/vnd.jupyter.widget-view+json": {
|
68 |
+
"model_id": "ba543a91b4b9494e8009f60e2cd80fc2",
|
69 |
+
"version_major": 2,
|
70 |
+
"version_minor": 0
|
71 |
+
},
|
72 |
+
"text/plain": [
|
73 |
+
"tokenizer.json: 0%| | 0.00/6.60M [00:00<?, ?B/s]"
|
74 |
+
]
|
75 |
+
},
|
76 |
+
"metadata": {},
|
77 |
+
"output_type": "display_data"
|
78 |
+
},
|
79 |
+
{
|
80 |
+
"data": {
|
81 |
+
"application/vnd.jupyter.widget-view+json": {
|
82 |
+
"model_id": "7911d674ade7471e95a8c9041d69adb1",
|
83 |
+
"version_major": 2,
|
84 |
+
"version_minor": 0
|
85 |
+
},
|
86 |
+
"text/plain": [
|
87 |
+
"special_tokens_map.json: 0%| | 0.00/1.77k [00:00<?, ?B/s]"
|
88 |
+
]
|
89 |
+
},
|
90 |
+
"metadata": {},
|
91 |
+
"output_type": "display_data"
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"data": {
|
95 |
+
"application/vnd.jupyter.widget-view+json": {
|
96 |
+
"model_id": "6d1b306a35f24cd8a76415bb399a74c5",
|
97 |
+
"version_major": 2,
|
98 |
+
"version_minor": 0
|
99 |
+
},
|
100 |
+
"text/plain": [
|
101 |
+
"config.json: 0%| | 0.00/1.32k [00:00<?, ?B/s]"
|
102 |
+
]
|
103 |
+
},
|
104 |
+
"metadata": {},
|
105 |
+
"output_type": "display_data"
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"data": {
|
109 |
+
"application/vnd.jupyter.widget-view+json": {
|
110 |
+
"model_id": "04440e9bef144c4197f8e11b00a21b61",
|
111 |
+
"version_major": 2,
|
112 |
+
"version_minor": 0
|
113 |
+
},
|
114 |
+
"text/plain": [
|
115 |
+
"model.safetensors: 0%| | 0.00/2.28G [00:00<?, ?B/s]"
|
116 |
+
]
|
117 |
+
},
|
118 |
+
"metadata": {},
|
119 |
+
"output_type": "display_data"
|
120 |
+
},
|
121 |
+
{
|
122 |
+
"data": {
|
123 |
+
"application/vnd.jupyter.widget-view+json": {
|
124 |
+
"model_id": "2e5f5ed6ff8644cd92f3690374361820",
|
125 |
+
"version_major": 2,
|
126 |
+
"version_minor": 0
|
127 |
+
},
|
128 |
+
"text/plain": [
|
129 |
+
"generation_config.json: 0%| | 0.00/275 [00:00<?, ?B/s]"
|
130 |
+
]
|
131 |
+
},
|
132 |
+
"metadata": {},
|
133 |
+
"output_type": "display_data"
|
134 |
+
}
|
135 |
+
],
|
136 |
+
"source": [
|
137 |
+
"from transformers import AutoModelForSeq2SeqLM\n",
|
138 |
+
"model_name = \"pavithra-devi/pegasus-multi-news\"\n",
|
139 |
+
"\n",
|
140 |
+
"# Load model directly\n",
|
141 |
+
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
|
142 |
+
"\n",
|
143 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"pavithra-devi/pegasus-multi-news\")\n",
|
144 |
+
"model = AutoModelForSeq2SeqLM.from_pretrained(\"pavithra-devi/pegasus-multi-news\")"
|
145 |
+
]
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"cell_type": "code",
|
149 |
+
"execution_count": 12,
|
150 |
+
"metadata": {},
|
151 |
+
"outputs": [],
|
152 |
+
"source": [
|
153 |
+
"from transformers import pipeline\n",
|
154 |
+
"\n",
|
155 |
+
"\n",
|
156 |
+
"summarizer = pipeline(\"summarization\", model=\"pavithra-devi/pegasus-multi-news\")\n"
|
157 |
+
]
|
158 |
+
},
|
159 |
+
{
|
160 |
+
"cell_type": "code",
|
161 |
+
"execution_count": 14,
|
162 |
+
"metadata": {},
|
163 |
+
"outputs": [
|
164 |
+
{
|
165 |
+
"data": {
|
166 |
+
"text/plain": [
|
167 |
+
"[{'summary_text': '– A recent Supreme Court decision gives states the ability to opt out of the law\\'s expansion of Medicaid, the federal insurance program for poor, disabled and elderly Americans, confers \"incredible power\" on the states and their governors, says political scientist Thad Kousser, co-author of The Power of American Governors. \"No matter who wins the presidency, national politics is going to be stalemated on the Affordable Care Act,\" says Kousser, of the University of California-Berkeley. Just look at what happened when the Obama administration in 2010 offered federal stimulus money to states to begin building a high-speed rail network. Three'}]"
|
168 |
+
]
|
169 |
+
},
|
170 |
+
"execution_count": 14,
|
171 |
+
"metadata": {},
|
172 |
+
"output_type": "execute_result"
|
173 |
+
}
|
174 |
+
],
|
175 |
+
"source": [
|
176 |
+
"text = \"\"\" While the occupant of the governor's office is historically far less important than the party that controls the state legislature, top state officials in coming years are expected to wield significant influence in at least one major area.\n",
|
177 |
+
"\n",
|
178 |
+
" And that's health care, says political scientist Thad Kousser, co-author of The Power of American Governors.\n",
|
179 |
+
"\n",
|
180 |
+
" \"No matter who wins the presidency, national politics is going to be stalemated on the Affordable Care Act,\" says Kousser, of the University of California-Berkeley.\n",
|
181 |
+
"\n",
|
182 |
+
" A recent U.S. Supreme Court decision giving states the ability to opt out of the law's expansion of Medicaid, the federal insurance program for poor, disabled and elderly Americans, confers \"incredible power\" on the states and their governors, Kousser says.\n",
|
183 |
+
"\n",
|
184 |
+
" Just look at what happened when the Obama administration in 2010 offered federal stimulus money to states to begin building a high-speed rail network. Three Republican governors, including Rick Scott of Florida and Scott Walker of Wisconsin, rejected a share of the money citing debt and deficit concerns.\n",
|
185 |
+
"\n",
|
186 |
+
" \"A [Mitt] Romney victory would dramatically empower Republican governors,\" Kousser says. \"\"\"\n",
|
187 |
+
"\n",
|
188 |
+
"summarizer(text)"
|
189 |
+
]
|
190 |
+
},
|
191 |
+
{
|
192 |
+
"cell_type": "code",
|
193 |
+
"execution_count": null,
|
194 |
+
"metadata": {},
|
195 |
+
"outputs": [],
|
196 |
+
"source": [
|
197 |
+
"[{'summary_text': '– A recent Supreme Court decision gives states the ability to opt out of the law\\'s expansion of Medicaid,\n",
|
198 |
+
" the federal insurance program for poor, disabled and elderly Americans, confers \"incredible power\" on the states and their governors,\n",
|
199 |
+
" says political scientist Thad Kousser, co-author of The Power of American Governors. \"No matter who wins the presidency, national politics\n",
|
200 |
+
" is going to be stalemated on the Affordable Care Act,\" says Kousser, of the University of California-Berkeley. Just look at what happened when the\n",
|
201 |
+
" Obama administration in 2010 offered federal stimulus money to states to begin building a high-speed rail network. Three'}]"
|
202 |
+
]
|
203 |
+
},
|
204 |
+
{
|
205 |
+
"cell_type": "code",
|
206 |
+
"execution_count": 11,
|
207 |
+
"metadata": {},
|
208 |
+
"outputs": [
|
209 |
+
{
|
210 |
+
"data": {
|
211 |
+
"text/plain": [
|
212 |
+
"['T_destination',\n",
|
213 |
+
" '__annotations__',\n",
|
214 |
+
" '__call__',\n",
|
215 |
+
" '__class__',\n",
|
216 |
+
" '__delattr__',\n",
|
217 |
+
" '__dict__',\n",
|
218 |
+
" '__dir__',\n",
|
219 |
+
" '__doc__',\n",
|
220 |
+
" '__eq__',\n",
|
221 |
+
" '__format__',\n",
|
222 |
+
" '__ge__',\n",
|
223 |
+
" '__getattr__',\n",
|
224 |
+
" '__getattribute__',\n",
|
225 |
+
" '__getstate__',\n",
|
226 |
+
" '__gt__',\n",
|
227 |
+
" '__hash__',\n",
|
228 |
+
" '__init__',\n",
|
229 |
+
" '__init_subclass__',\n",
|
230 |
+
" '__le__',\n",
|
231 |
+
" '__lt__',\n",
|
232 |
+
" '__module__',\n",
|
233 |
+
" '__ne__',\n",
|
234 |
+
" '__new__',\n",
|
235 |
+
" '__reduce__',\n",
|
236 |
+
" '__reduce_ex__',\n",
|
237 |
+
" '__repr__',\n",
|
238 |
+
" '__setattr__',\n",
|
239 |
+
" '__setstate__',\n",
|
240 |
+
" '__sizeof__',\n",
|
241 |
+
" '__str__',\n",
|
242 |
+
" '__subclasshook__',\n",
|
243 |
+
" '__weakref__',\n",
|
244 |
+
" '_apply',\n",
|
245 |
+
" '_assisted_decoding',\n",
|
246 |
+
" '_auto_class',\n",
|
247 |
+
" '_autoset_attn_implementation',\n",
|
248 |
+
" '_backward_compatibility_gradient_checkpointing',\n",
|
249 |
+
" '_backward_hooks',\n",
|
250 |
+
" '_backward_pre_hooks',\n",
|
251 |
+
" '_beam_sample',\n",
|
252 |
+
" '_beam_search',\n",
|
253 |
+
" '_buffers',\n",
|
254 |
+
" '_call_impl',\n",
|
255 |
+
" '_check_and_enable_flash_attn_2',\n",
|
256 |
+
" '_check_and_enable_sdpa',\n",
|
257 |
+
" '_compiled_call_impl',\n",
|
258 |
+
" '_constrained_beam_search',\n",
|
259 |
+
" '_contrastive_search',\n",
|
260 |
+
" '_convert_head_mask_to_5d',\n",
|
261 |
+
" '_copy_lm_head_original_to_resized',\n",
|
262 |
+
" '_create_repo',\n",
|
263 |
+
" '_dispatch_accelerate_model',\n",
|
264 |
+
" '_expand_inputs_for_generation',\n",
|
265 |
+
" '_extract_past_from_model_output',\n",
|
266 |
+
" '_forward_hooks',\n",
|
267 |
+
" '_forward_hooks_always_called',\n",
|
268 |
+
" '_forward_hooks_with_kwargs',\n",
|
269 |
+
" '_forward_pre_hooks',\n",
|
270 |
+
" '_forward_pre_hooks_with_kwargs',\n",
|
271 |
+
" '_from_config',\n",
|
272 |
+
" '_get_backward_hooks',\n",
|
273 |
+
" '_get_backward_pre_hooks',\n",
|
274 |
+
" '_get_cache',\n",
|
275 |
+
" '_get_candidate_generator',\n",
|
276 |
+
" '_get_decoder_start_token_id',\n",
|
277 |
+
" '_get_files_timestamps',\n",
|
278 |
+
" '_get_initial_cache_position',\n",
|
279 |
+
" '_get_logits_processor',\n",
|
280 |
+
" '_get_logits_warper',\n",
|
281 |
+
" '_get_name',\n",
|
282 |
+
" '_get_no_split_modules',\n",
|
283 |
+
" '_get_resized_embeddings',\n",
|
284 |
+
" '_get_resized_lm_head',\n",
|
285 |
+
" '_get_stopping_criteria',\n",
|
286 |
+
" '_greedy_search',\n",
|
287 |
+
" '_group_beam_search',\n",
|
288 |
+
" '_has_unfinished_sequences',\n",
|
289 |
+
" '_hf_peft_config_loaded',\n",
|
290 |
+
" '_hook_rss_memory_post_forward',\n",
|
291 |
+
" '_hook_rss_memory_pre_forward',\n",
|
292 |
+
" '_init_weights',\n",
|
293 |
+
" '_initialize_weights',\n",
|
294 |
+
" '_is_full_backward_hook',\n",
|
295 |
+
" '_is_hf_initialized',\n",
|
296 |
+
" '_is_quantized_training_enabled',\n",
|
297 |
+
" '_keep_in_fp32_modules',\n",
|
298 |
+
" '_keep_in_fp32_modules',\n",
|
299 |
+
" '_keys_to_ignore_on_load_missing',\n",
|
300 |
+
" '_keys_to_ignore_on_load_unexpected',\n",
|
301 |
+
" '_keys_to_ignore_on_save',\n",
|
302 |
+
" '_load_from_state_dict',\n",
|
303 |
+
" '_load_pretrained_model',\n",
|
304 |
+
" '_load_pretrained_model_low_mem',\n",
|
305 |
+
" '_load_state_dict_post_hooks',\n",
|
306 |
+
" '_load_state_dict_pre_hooks',\n",
|
307 |
+
" '_maybe_initialize_input_ids_for_generation',\n",
|
308 |
+
" '_maybe_warn_non_full_backward_hook',\n",
|
309 |
+
" '_merge_criteria_processor_list',\n",
|
310 |
+
" '_modules',\n",
|
311 |
+
" '_named_members',\n",
|
312 |
+
" '_no_split_modules',\n",
|
313 |
+
" '_non_persistent_buffers_set',\n",
|
314 |
+
" '_parameters',\n",
|
315 |
+
" '_prepare_attention_mask_for_generation',\n",
|
316 |
+
" '_prepare_decoder_input_ids_for_generation',\n",
|
317 |
+
" '_prepare_encoder_decoder_kwargs_for_generation',\n",
|
318 |
+
" '_prepare_generated_length',\n",
|
319 |
+
" '_prepare_generation_config',\n",
|
320 |
+
" '_prepare_model_inputs',\n",
|
321 |
+
" '_prepare_special_tokens',\n",
|
322 |
+
" '_register_load_state_dict_pre_hook',\n",
|
323 |
+
" '_register_state_dict_hook',\n",
|
324 |
+
" '_reorder_cache',\n",
|
325 |
+
" '_replicate_for_data_parallel',\n",
|
326 |
+
" '_resize_final_logits_bias',\n",
|
327 |
+
" '_resize_token_embeddings',\n",
|
328 |
+
" '_sample',\n",
|
329 |
+
" '_save_to_state_dict',\n",
|
330 |
+
" '_set_default_torch_dtype',\n",
|
331 |
+
" '_set_gradient_checkpointing',\n",
|
332 |
+
" '_skip_keys_device_placement',\n",
|
333 |
+
" '_slow_forward',\n",
|
334 |
+
" '_state_dict_hooks',\n",
|
335 |
+
" '_state_dict_pre_hooks',\n",
|
336 |
+
" '_supports_cache_class',\n",
|
337 |
+
" '_supports_flash_attn_2',\n",
|
338 |
+
" '_supports_quantized_cache',\n",
|
339 |
+
" '_supports_sdpa',\n",
|
340 |
+
" '_supports_static_cache',\n",
|
341 |
+
" '_temporary_reorder_cache',\n",
|
342 |
+
" '_tie_encoder_decoder_weights',\n",
|
343 |
+
" '_tie_or_clone_weights',\n",
|
344 |
+
" '_tied_weights_keys',\n",
|
345 |
+
" '_update_model_kwargs_for_generation',\n",
|
346 |
+
" '_upload_modified_files',\n",
|
347 |
+
" '_validate_assistant',\n",
|
348 |
+
" '_validate_generated_length',\n",
|
349 |
+
" '_validate_model_class',\n",
|
350 |
+
" '_validate_model_kwargs',\n",
|
351 |
+
" '_version',\n",
|
352 |
+
" '_wrapped_call_impl',\n",
|
353 |
+
" 'active_adapter',\n",
|
354 |
+
" 'active_adapters',\n",
|
355 |
+
" 'add_adapter',\n",
|
356 |
+
" 'add_memory_hooks',\n",
|
357 |
+
" 'add_model_tags',\n",
|
358 |
+
" 'add_module',\n",
|
359 |
+
" 'apply',\n",
|
360 |
+
" 'base_model',\n",
|
361 |
+
" 'base_model_prefix',\n",
|
362 |
+
" 'bfloat16',\n",
|
363 |
+
" 'buffers',\n",
|
364 |
+
" 'call_super_init',\n",
|
365 |
+
" 'can_generate',\n",
|
366 |
+
" 'children',\n",
|
367 |
+
" 'compile',\n",
|
368 |
+
" 'compute_transition_scores',\n",
|
369 |
+
" 'config',\n",
|
370 |
+
" 'config_class',\n",
|
371 |
+
" 'cpu',\n",
|
372 |
+
" 'create_extended_attention_mask_for_decoder',\n",
|
373 |
+
" 'cuda',\n",
|
374 |
+
" 'dequantize',\n",
|
375 |
+
" 'device',\n",
|
376 |
+
" 'disable_adapters',\n",
|
377 |
+
" 'disable_input_require_grads',\n",
|
378 |
+
" 'double',\n",
|
379 |
+
" 'dtype',\n",
|
380 |
+
" 'dummy_inputs',\n",
|
381 |
+
" 'dump_patches',\n",
|
382 |
+
" 'enable_adapters',\n",
|
383 |
+
" 'enable_input_require_grads',\n",
|
384 |
+
" 'estimate_tokens',\n",
|
385 |
+
" 'eval',\n",
|
386 |
+
" 'extra_repr',\n",
|
387 |
+
" 'final_logits_bias',\n",
|
388 |
+
" 'float',\n",
|
389 |
+
" 'floating_point_ops',\n",
|
390 |
+
" 'forward',\n",
|
391 |
+
" 'framework',\n",
|
392 |
+
" 'from_pretrained',\n",
|
393 |
+
" 'generate',\n",
|
394 |
+
" 'generation_config',\n",
|
395 |
+
" 'get_adapter_state_dict',\n",
|
396 |
+
" 'get_buffer',\n",
|
397 |
+
" 'get_decoder',\n",
|
398 |
+
" 'get_encoder',\n",
|
399 |
+
" 'get_extended_attention_mask',\n",
|
400 |
+
" 'get_extra_state',\n",
|
401 |
+
" 'get_head_mask',\n",
|
402 |
+
" 'get_input_embeddings',\n",
|
403 |
+
" 'get_memory_footprint',\n",
|
404 |
+
" 'get_output_embeddings',\n",
|
405 |
+
" 'get_parameter',\n",
|
406 |
+
" 'get_position_embeddings',\n",
|
407 |
+
" 'get_submodule',\n",
|
408 |
+
" 'gradient_checkpointing_disable',\n",
|
409 |
+
" 'gradient_checkpointing_enable',\n",
|
410 |
+
" 'half',\n",
|
411 |
+
" 'init_weights',\n",
|
412 |
+
" 'invert_attention_mask',\n",
|
413 |
+
" 'ipu',\n",
|
414 |
+
" 'is_gradient_checkpointing',\n",
|
415 |
+
" 'is_parallelizable',\n",
|
416 |
+
" 'lm_head',\n",
|
417 |
+
" 'load_adapter',\n",
|
418 |
+
" 'load_state_dict',\n",
|
419 |
+
" 'main_input_name',\n",
|
420 |
+
" 'model',\n",
|
421 |
+
" 'model_tags',\n",
|
422 |
+
" 'modules',\n",
|
423 |
+
" 'name_or_path',\n",
|
424 |
+
" 'named_buffers',\n",
|
425 |
+
" 'named_children',\n",
|
426 |
+
" 'named_modules',\n",
|
427 |
+
" 'named_parameters',\n",
|
428 |
+
" 'num_parameters',\n",
|
429 |
+
" 'parameters',\n",
|
430 |
+
" 'post_init',\n",
|
431 |
+
" 'prepare_decoder_input_ids_from_labels',\n",
|
432 |
+
" 'prepare_inputs_for_generation',\n",
|
433 |
+
" 'prune_heads',\n",
|
434 |
+
" 'push_to_hub',\n",
|
435 |
+
" 'register_backward_hook',\n",
|
436 |
+
" 'register_buffer',\n",
|
437 |
+
" 'register_for_auto_class',\n",
|
438 |
+
" 'register_forward_hook',\n",
|
439 |
+
" 'register_forward_pre_hook',\n",
|
440 |
+
" 'register_full_backward_hook',\n",
|
441 |
+
" 'register_full_backward_pre_hook',\n",
|
442 |
+
" 'register_load_state_dict_post_hook',\n",
|
443 |
+
" 'register_module',\n",
|
444 |
+
" 'register_parameter',\n",
|
445 |
+
" 'register_state_dict_pre_hook',\n",
|
446 |
+
" 'requires_grad_',\n",
|
447 |
+
" 'reset_memory_hooks_state',\n",
|
448 |
+
" 'resize_position_embeddings',\n",
|
449 |
+
" 'resize_token_embeddings',\n",
|
450 |
+
" 'retrieve_modules_from_names',\n",
|
451 |
+
" 'reverse_bettertransformer',\n",
|
452 |
+
" 'save_pretrained',\n",
|
453 |
+
" 'set_adapter',\n",
|
454 |
+
" 'set_extra_state',\n",
|
455 |
+
" 'set_input_embeddings',\n",
|
456 |
+
" 'set_output_embeddings',\n",
|
457 |
+
" 'share_memory',\n",
|
458 |
+
" 'state_dict',\n",
|
459 |
+
" 'supports_gradient_checkpointing',\n",
|
460 |
+
" 'tie_weights',\n",
|
461 |
+
" 'to',\n",
|
462 |
+
" 'to_bettertransformer',\n",
|
463 |
+
" 'to_empty',\n",
|
464 |
+
" 'train',\n",
|
465 |
+
" 'training',\n",
|
466 |
+
" 'type',\n",
|
467 |
+
" 'warn_if_padding_and_no_attention_mask',\n",
|
468 |
+
" 'warnings_issued',\n",
|
469 |
+
" 'xpu',\n",
|
470 |
+
" 'zero_grad']"
|
471 |
+
]
|
472 |
+
},
|
473 |
+
"execution_count": 11,
|
474 |
+
"metadata": {},
|
475 |
+
"output_type": "execute_result"
|
476 |
+
}
|
477 |
+
],
|
478 |
+
"source": [
|
479 |
+
"tokenized_data ="
|
480 |
+
]
|
481 |
+
}
|
482 |
+
],
|
483 |
+
"metadata": {
|
484 |
+
"kernelspec": {
|
485 |
+
"display_name": "venv_text_summarizaition",
|
486 |
+
"language": "python",
|
487 |
+
"name": "python3"
|
488 |
+
},
|
489 |
+
"language_info": {
|
490 |
+
"codemirror_mode": {
|
491 |
+
"name": "ipython",
|
492 |
+
"version": 3
|
493 |
+
},
|
494 |
+
"file_extension": ".py",
|
495 |
+
"mimetype": "text/x-python",
|
496 |
+
"name": "python",
|
497 |
+
"nbconvert_exporter": "python",
|
498 |
+
"pygments_lexer": "ipython3",
|
499 |
+
"version": "3.8.10"
|
500 |
+
}
|
501 |
+
},
|
502 |
+
"nbformat": 4,
|
503 |
+
"nbformat_minor": 2
|
504 |
+
}
|
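For reference, a minimal sketch of the same inference flow as the notebook above, with the generation settings made explicit (the gen_kwargs values mirror prediction.py below; they are not tuned here):

# Sketch: summarize one document with the fine-tuned checkpoint.
from transformers import pipeline

summarizer = pipeline("summarization", model="pavithra-devi/pegasus-multi-news")
gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 128}

summary = summarizer("A long news article to summarize ...", **gen_kwargs)[0]["summary_text"]
print(summary)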
check_code/trials.ipynb
ADDED
File without changes
|
main.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
The entry point that does all the work: it runs every pipeline stage for training.
|
3 |
+
"""
|
4 |
+
|
5 |
+
from src.TextSummarizer.logger import backend_logger
|
6 |
+
from src.TextSummarizer.pipeline.step_01_data_ingestion import DataIngestionPipeline
|
7 |
+
from src.TextSummarizer.pipeline.step_02_data_validation import DataValidationPipeline
|
8 |
+
from src.TextSummarizer.pipeline.step_03_data_transformation import (
|
9 |
+
DataTransformationPipeline,
|
10 |
+
)
|
11 |
+
from src.TextSummarizer.pipeline.step_04_train_model import ModelTrainerPipeline
|
12 |
+
from src.TextSummarizer.pipeline.step_05_model_evaluation import ModelEvaluationPipeline
|
13 |
+
|
14 |
+
stage_name_01: str = "Stage 1: Data Integration Stage"
|
15 |
+
stage_name_02: str = "Stage 2: Data Validation Stage"
|
16 |
+
stage_name_03: str = "Stage 3: Data Transformation Stage"
|
17 |
+
stage_name_04: str = "Stage 4: Model training Stage"
|
18 |
+
stage_name_05: str = "Stage 5: Model Evaluation Stage"
|
19 |
+
|
20 |
+
|
21 |
+
line_msg: str = "="*100
|
22 |
+
|
23 |
+
try:
|
24 |
+
backend_logger.info(line_msg)
|
25 |
+
backend_logger.info(f"Stage {stage_name_01} started")
|
26 |
+
DataIngestionPipeline().run()
|
27 |
+
backend_logger.info(f"Stage {stage_name_01} completed.")
|
28 |
+
backend_logger.info(line_msg)
|
29 |
+
except Exception as err:
|
30 |
+
backend_logger.error(f"Data ingestion pipeline failed. Reason: {err}")
|
31 |
+
|
32 |
+
|
33 |
+
try:
|
34 |
+
backend_logger.info(line_msg)
|
35 |
+
backend_logger.info(f"Stage {stage_name_02} started")
|
36 |
+
DataValidationPipeline().run()
|
37 |
+
backend_logger.info(f"Stage {stage_name_02} completed.")
|
38 |
+
backend_logger.info(line_msg)
|
39 |
+
except Exception as err:
|
40 |
+
backend_logger.error(f"Data validation pipeline failed. Reason: {err}")
|
41 |
+
|
42 |
+
|
43 |
+
try:
|
44 |
+
backend_logger.info(line_msg)
|
45 |
+
backend_logger.info(f"Stage {stage_name_03} started")
|
46 |
+
DataTransformationPipeline().run()
|
47 |
+
backend_logger.info(f"Stage {stage_name_03} completed.")
|
48 |
+
backend_logger.info(line_msg)
|
49 |
+
except Exception as err:
|
50 |
+
backend_logger.error(f"Data Transformation pipeline failed. Reason: {err}")
|
51 |
+
|
52 |
+
|
53 |
+
# Due to local hardware limitations, the model was trained online and the result was pushed to a Hugging Face profile.
|
54 |
+
# The training and model evaluation steps can therefore be skipped when running locally.
|
55 |
+
|
56 |
+
|
57 |
+
try:
|
58 |
+
backend_logger.info(line_msg)
|
59 |
+
backend_logger.info(f"Stage {stage_name_04} started")
|
60 |
+
ModelTrainerPipeline().run()
|
61 |
+
backend_logger.info(f"Stage {stage_name_04} completed.")
|
62 |
+
backend_logger.info(line_msg)
|
63 |
+
except Exception as err:
|
64 |
+
backend_logger.error(f"Data data training pipeline failed. Reason: {err}")
|
65 |
+
|
66 |
+
|
67 |
+
try:
|
68 |
+
backend_logger.info(line_msg)
|
69 |
+
backend_logger.info(f"Stage {stage_name_05} started")
|
70 |
+
ModelEvaluationPipeline().run()
|
71 |
+
backend_logger.info(f"Stage {stage_name_05} completed.")
|
72 |
+
backend_logger.info(line_msg)
|
73 |
+
except Exception as err:
|
74 |
+
backend_logger.error(f"Model evaluation pipeline failed. Reason: {err}")
|
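The five try/except blocks above differ only in the stage name and pipeline class. A small helper like the following (a sketch, not part of the repo) would remove the repetition:

# Hypothetical stage runner with uniform logging.
def run_stage(stage_name: str, pipeline_cls) -> None:
    try:
        backend_logger.info(line_msg)
        backend_logger.info(f"{stage_name} started")
        pipeline_cls().run()
        backend_logger.info(f"{stage_name} completed.")
        backend_logger.info(line_msg)
    except Exception as err:
        backend_logger.error(f"{stage_name} failed. Reason: {err}")

# Example: run_stage(stage_name_01, DataIngestionPipeline)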
project_structure.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Used to create the whole project structure.
|
3 |
+
"""
|
4 |
+
|
5 |
+
import logging
|
6 |
+
import os
|
7 |
+
from pathlib import Path
|
8 |
+
|
9 |
+
logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s]: %(message)s:')
|
10 |
+
|
11 |
+
project_name: str = "TextSummarizer"
|
12 |
+
|
13 |
+
list_of_files = [
|
14 |
+
".github/workflows/.gitkeep",
|
15 |
+
f"src/{project_name}/__init__.py",
|
16 |
+
f"src/{project_name}/components/__init__.py",
|
17 |
+
f"src/{project_name}/utils/__init__.py",
|
18 |
+
f"src/{project_name}/utils/general.py",
|
19 |
+
f"src/{project_name}/logger/__init__.py",
|
20 |
+
f"src/{project_name}/config/__init__.py",
|
21 |
+
f"src/{project_name}/config/configuration.py",
|
22 |
+
f"src/{project_name}/pipeline/__init__.py",
|
23 |
+
f"src/{project_name}/entity/__init__.py",
|
24 |
+
f"src/{project_name}/constants/__init__.py",
|
25 |
+
f"src/{project_name}/exception/__init__.py",
|
26 |
+
"config/config.yaml",
|
27 |
+
"params/params.yaml",
|
28 |
+
"params.yaml",
|
29 |
+
"app.py",
|
30 |
+
"main.py",
|
31 |
+
"Dockerfile",
|
32 |
+
"requirements.txt",
|
33 |
+
"setup.py",
|
34 |
+
"check_code/trials.ipynb",
|
35 |
+
|
36 |
+
]
|
37 |
+
|
38 |
+
|
39 |
+
for filepath in list_of_files:
|
40 |
+
filepath = Path(filepath)
|
41 |
+
filedir, filename = os.path.split(filepath)
|
42 |
+
|
43 |
+
if filedir != "":
|
44 |
+
os.makedirs(filedir, exist_ok=True)
|
45 |
+
logging.info("Creating directory:{filedir} for the file {filename}")
|
46 |
+
|
47 |
+
if (not os.path.exists(filepath)) or (os.path.getsize(filepath) == 0):
|
48 |
+
        with open(filepath, 'w'):
|
49 |
+
            logging.info(f"Creating empty file: {filepath}")
|
50 |
+
|
51 |
+
else:
|
52 |
+
logging.info("{filename} is already exists")
|
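A quick way to verify the generated skeleton (a sketch; run from the repository root after the script):

# Report any paths from list_of_files that were not created.
from pathlib import Path

missing = [p for p in list_of_files if not Path(p).exists()]
print("missing:", missing or "none")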
requirements.txt
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers
|
2 |
+
transformers[sentencepiece]
|
3 |
+
datasets
|
4 |
+
sacrebleu
|
5 |
+
rouge_score
|
6 |
+
py7zr
|
7 |
+
pandas
|
8 |
+
nltk
|
9 |
+
tqdm
|
10 |
+
PyYAML
|
11 |
+
matplotlib
|
12 |
+
torch
|
13 |
+
notebook
|
14 |
+
boto3
|
15 |
+
mypy-boto3-s3
|
16 |
+
python-box==6.0.2
|
17 |
+
ensure==1.0.2
|
18 |
+
fastapi==0.78.0
|
19 |
+
uvicorn==0.18.3
|
20 |
+
Jinja2==3.1.2
|
21 |
+
-e .
|
setup.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Makes the project installable via pip; the project metadata is defined here.
|
3 |
+
"""
|
4 |
+
|
5 |
+
import setuptools
|
6 |
+
from setuptools import setup
|
7 |
+
|
8 |
+
# Read the readme file, which will be our long description.
|
9 |
+
with open("README.md", "r", encoding="utf-8") as file:
|
10 |
+
long_description = file.read()
|
11 |
+
|
12 |
+
version: str = "1.0.0"
|
13 |
+
repo_name: str = "Text-summarization-nlp"
|
14 |
+
git_hub_username: str = "pavi-ninjaac"
|
15 |
+
author_name: str = "Pavithra Devi M"
|
16 |
+
author_email = "[email protected]"
|
17 |
+
package_name: str = "TextSummarizer"
|
18 |
+
|
19 |
+
|
20 |
+
# setup the project.
|
21 |
+
setup(
|
22 |
+
name=package_name,
|
23 |
+
version=version,
|
24 |
+
author=author_name,
|
25 |
+
author_email=author_email,
|
26 |
+
description="A small package for text summarization",
|
27 |
+
long_description=long_description,
|
28 |
+
long_description_content="text/markdown",
|
29 |
+
url=f"https://github.com/{git_hub_username}/{repo_name}",
|
30 |
+
project_urls={
|
31 |
+
"Bug Tracker": f"https://github.com/{git_hub_username}/{repo_name}/issues",
|
32 |
+
},
|
33 |
+
package_dir={"": "src"},
|
34 |
+
packages=setuptools.find_packages(where="src"),
|
35 |
+
python_requires=">=3.8"
|
36 |
+
)
|
src/TextSummarizer/components/__init__.py
ADDED
File without changes
|
src/TextSummarizer/components/data_ingestion.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from datasets import load_dataset
|
4 |
+
|
5 |
+
from src.TextSummarizer.entity import entities
|
6 |
+
|
7 |
+
|
8 |
+
class DataIngestionComponent:
|
9 |
+
"""
|
10 |
+
A Class which is responsible for data ingestion.
|
11 |
+
"""
|
12 |
+
|
13 |
+
def __init__(self, config: entities.DataIngestionConfig) -> None:
|
14 |
+
self.config = config
|
15 |
+
|
16 |
+
def save_dataset(self):
|
17 |
+
"""
|
18 |
+
Download the dataset and save it to disk.
|
19 |
+
"""
|
20 |
+
        # If the dataset has already been downloaded, skip re-downloading it.
|
21 |
+
if os.path.exists(self.config.arrow_dataset_dir):
|
22 |
+
return
|
23 |
+
|
24 |
+
        dataset = load_dataset(self.config.dataset_name)
|
25 |
+
        dataset.save_to_disk(self.config.arrow_dataset_dir)
|
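A minimal usage sketch; the values shown come from config.yaml and would normally be supplied via ConfigManager rather than hard-coded:

from src.TextSummarizer.components.data_ingestion import DataIngestionComponent
from src.TextSummarizer.entity import entities

# Values taken from config.yaml's data_ingestion section.
config = entities.DataIngestionConfig(
    dataset_name="alexfabbri/multi_news",
    arrow_dataset_dir="artifacts/data",
)
DataIngestionComponent(config=config).save_dataset()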
src/TextSummarizer/components/data_transformation.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Module which implements the data transformation step.
|
3 |
+
"""
|
4 |
+
|
5 |
+
import os
|
6 |
+
|
7 |
+
from datasets import load_from_disk
|
8 |
+
from transformers import AutoTokenizer
|
9 |
+
|
10 |
+
from src.TextSummarizer.entity import entities
|
11 |
+
from src.TextSummarizer.logger import backend_logger
|
12 |
+
|
13 |
+
|
14 |
+
class DataTransformation:
|
15 |
+
def __init__(self, config: entities.DataTransformationConfig):
|
16 |
+
self.config = config
|
17 |
+
self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)
|
18 |
+
|
19 |
+
def convert_examples_to_features(self,example_batch):
|
20 |
+
"""
|
21 |
+
Convert the examples to features.
|
22 |
+
"""
|
23 |
+
input_encodings = self.tokenizer(example_batch['document'] , max_length = 800, truncation = True )
|
24 |
+
|
25 |
+
with self.tokenizer.as_target_tokenizer():
|
26 |
+
target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True )
|
27 |
+
|
28 |
+
return {
|
29 |
+
'input_ids' : input_encodings['input_ids'],
|
30 |
+
'attention_mask': input_encodings['attention_mask'],
|
31 |
+
'labels': target_encodings['input_ids']
|
32 |
+
}
|
33 |
+
|
34 |
+
def convert(self):
|
35 |
+
"""
|
36 |
+
Tokenize the dataset and store it on disk.
|
37 |
+
"""
|
38 |
+
backend_logger.info("Converting text to tokens....")
|
39 |
+
# Check if the dataset folder already exists.
|
40 |
+
if os.path.exists(os.path.join(self.config.root_dir,"dataset")):
|
41 |
+
return
|
42 |
+
dataset = load_from_disk(self.config.data_path)
|
43 |
+
dataset = dataset.map(self.convert_examples_to_features, batched = True)
|
44 |
+
dataset.save_to_disk(os.path.join(self.config.root_dir,"dataset"))
|
45 |
+
backend_logger.info("Converted text to tokens.")
|
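A toy batch makes the feature conversion concrete (a sketch; the config would come from ConfigManager, and the text is made up):

from src.TextSummarizer.components.data_transformation import DataTransformation
from src.TextSummarizer.config.config_manager import ConfigManager

dt = DataTransformation(config=ConfigManager().get_data_transformation_config())
batch = {
    "document": ["A long news article to be summarized ..."],
    "summary": ["A short summary."],
}
features = dt.convert_examples_to_features(batch)
print(features.keys())  # dict_keys(['input_ids', 'attention_mask', 'labels'])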
src/TextSummarizer/components/data_validation.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from src.TextSummarizer.entity import entities
|
4 |
+
from src.TextSummarizer.logger import backend_logger
|
5 |
+
|
6 |
+
|
7 |
+
class DataValidation:
|
8 |
+
def __init__(self, config: entities.DataValidationConfig):
|
9 |
+
self.config = config
|
10 |
+
|
11 |
+
def validate_all_files_exist(self) -> bool:
|
12 |
+
"""
|
13 |
+
Check if all the required folders are present.
|
14 |
+
"""
|
15 |
+
try:
|
16 |
+
            validation_status = None  # note: a "bool | None" annotation needs Python 3.10+, but setup.py targets >=3.8
|
17 |
+
|
18 |
+
all_folder = os.listdir(os.path.join("artifacts","data"))
|
19 |
+
|
20 |
+
            for folder in all_folder:
|
21 |
+
                # Any unexpected entry makes the dataset directory invalid.
|
22 |
+
                if folder not in self.config.all_required_folders:
|
23 |
+
                    validation_status = False
|
24 |
+
                    break
|
25 |
+
            else:
|
26 |
+
                validation_status = True
|
27 |
+
|
28 |
+
            # Persist the status so later stages can read it.
|
29 |
+
            with open(self.config.status_file, "w") as f:
|
30 |
+
                backend_logger.info(f"Writing the data validation status as {validation_status}")
|
31 |
+
                f.write(f"Validation status: {validation_status}")
|
32 |
+
|
33 |
+
return validation_status
|
34 |
+
|
35 |
+
except Exception as exp:
|
36 |
+
raise exp
|
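With the default config this compares the entries of artifacts/data against the four expected names and writes a one-line status file. A usage sketch:

from src.TextSummarizer.components.data_validation import DataValidation
from src.TextSummarizer.config.config_manager import ConfigManager

status = DataValidation(config=ConfigManager().get_data_validation_config()).validate_all_files_exist()
print(status)  # True, and artifacts/data_validation/status.txt reads "Validation status: True"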
src/TextSummarizer/components/model_evaluation.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
import torch
|
5 |
+
from datasets import load_from_disk, load_metric
|
6 |
+
from tqdm import tqdm
|
7 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
8 |
+
|
9 |
+
from src.TextSummarizer.entity import entities
|
10 |
+
from src.TextSummarizer.logger import backend_logger
|
11 |
+
|
12 |
+
|
13 |
+
class ModelEvaluation:
|
14 |
+
def __init__(self, config: entities.ModelEvaluationConfig):
|
15 |
+
self.config = config
|
16 |
+
|
17 |
+
def generate_batch_sized_chunks(self,list_of_elements, batch_size):
|
18 |
+
"""split the dataset into smaller batches that we can process simultaneously
|
19 |
+
Yield successive batch-sized chunks from list_of_elements."""
|
20 |
+
for i in range(0, len(list_of_elements), batch_size):
|
21 |
+
yield list_of_elements[i : i + batch_size]
|
22 |
+
|
23 |
+
def calculate_metric_on_test_ds(self,
|
24 |
+
dataset,
|
25 |
+
metric,
|
26 |
+
model,
|
27 |
+
tokenizer,
|
28 |
+
batch_size=16,
|
29 |
+
device="cuda" if torch.cuda.is_available() else "cpu",
|
30 |
+
column_text="article",
|
31 |
+
column_summary="highlights"):
|
32 |
+
"""
|
33 |
+
Calculate the metrics.
|
34 |
+
"""
|
35 |
+
article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))
|
36 |
+
target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))
|
37 |
+
|
38 |
+
for article_batch, target_batch in tqdm(
|
39 |
+
zip(article_batches, target_batches), total=len(article_batches)):
|
40 |
+
|
41 |
+
inputs = tokenizer(article_batch, max_length=1024, truncation=True,
|
42 |
+
padding="max_length", return_tensors="pt")
|
43 |
+
|
44 |
+
summaries = model.generate(input_ids=inputs["input_ids"].to(device),
|
45 |
+
attention_mask=inputs["attention_mask"].to(device),
|
46 |
+
length_penalty=0.8, num_beams=8, max_length=128)
|
47 |
+
            # The length_penalty parameter ensures that the model does not generate sequences that are too long.
|
48 |
+
|
49 |
+
# Finally, we decode the generated texts,
|
50 |
+
# replace the token, and add the decoded texts with the references to the metric.
|
51 |
+
decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
|
52 |
+
clean_up_tokenization_spaces=True)
|
53 |
+
for s in summaries]
|
54 |
+
|
55 |
+
decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
|
56 |
+
|
57 |
+
|
58 |
+
metric.add_batch(predictions=decoded_summaries, references=target_batch)
|
59 |
+
|
60 |
+
# Finally compute and return the ROUGE scores.
|
61 |
+
score = metric.compute()
|
62 |
+
return score
|
63 |
+
|
64 |
+
def run(self):
|
65 |
+
"""
|
66 |
+
Run the model evaluation step.
|
67 |
+
"""
|
68 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
69 |
+
tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
|
70 |
+
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)
|
71 |
+
|
72 |
+
        # Load the tokenized dataset from disk.
|
73 |
+
        dataset_pt = load_from_disk(self.config.data_path)
|
74 |
+
|
75 |
+
|
76 |
+
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
|
77 |
+
|
78 |
+
rouge_metric = load_metric('rouge')
|
79 |
+
|
80 |
+
score = self.calculate_metric_on_test_ds(
|
81 |
+
            dataset_pt["test"][0:10], rouge_metric, model_pegasus, tokenizer, batch_size=2, column_text="document", column_summary="summary"
|
82 |
+
)
|
83 |
+
|
84 |
+
rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )
|
85 |
+
|
86 |
+
df = pd.DataFrame(rouge_dict, index = ['pegasus'] )
|
87 |
+
df.to_csv(self.config.metric_file_name, index=False)
|
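The chunking helper is easiest to see on a toy list (a sketch; the config would come from ConfigManager):

from src.TextSummarizer.components.model_evaluation import ModelEvaluation
from src.TextSummarizer.config.config_manager import ConfigManager

evaluator = ModelEvaluation(config=ConfigManager().get_model_evaluation_config())
chunks = list(evaluator.generate_batch_sized_chunks([1, 2, 3, 4, 5], batch_size=2))
print(chunks)  # [[1, 2], [3, 4], [5]]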
src/TextSummarizer/components/train_model.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from datasets import load_from_disk
|
3 |
+
from transformers import (
|
4 |
+
AutoModelForSeq2SeqLM,
|
5 |
+
AutoTokenizer,
|
6 |
+
DataCollatorForSeq2Seq,
|
7 |
+
Trainer,
|
8 |
+
TrainingArguments,
|
9 |
+
)
|
10 |
+
|
11 |
+
from src.TextSummarizer.entity import entities
|
12 |
+
|
13 |
+
|
14 |
+
class ModelTrainer:
|
15 |
+
"""
|
16 |
+
Train a model.
|
17 |
+
"""
|
18 |
+
def __init__(self, config: entities.ModelTrainerConfig):
|
19 |
+
self.config = config
|
20 |
+
|
21 |
+
def train(self):
|
22 |
+
"""
|
23 |
+
Train the model.
|
24 |
+
"""
|
25 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
26 |
+
tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
|
27 |
+
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
|
28 |
+
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)
|
29 |
+
|
30 |
+
        # Load the tokenized dataset from disk.
|
31 |
+
dataset = load_from_disk(self.config.data_path)
|
32 |
+
|
33 |
+
# trainer_args = TrainingArguments(
|
34 |
+
# output_dir=self.config.root_dir, num_train_epochs=self.config.num_train_epochs, warmup_steps=self.config.warmup_steps,
|
35 |
+
# per_device_train_batch_size=self.config.per_device_train_batch_size, per_device_eval_batch_size=self.config.per_device_train_batch_size,
|
36 |
+
# weight_decay=self.config.weight_decay, logging_steps=self.config.logging_steps,
|
37 |
+
# evaluation_strategy=self.config.evaluation_strategy, eval_steps=self.config.eval_steps, save_steps=1e6,
|
38 |
+
# gradient_accumulation_steps=self.config.gradient_accumulation_steps
|
39 |
+
# )
|
40 |
+
|
41 |
+
|
42 |
+
trainer_args = TrainingArguments(
|
43 |
+
output_dir=self.config.root_dir,
|
44 |
+
num_train_epochs=1,
|
45 |
+
warmup_steps=500,
|
46 |
+
per_device_train_batch_size=1,
|
47 |
+
per_device_eval_batch_size=1,
|
48 |
+
weight_decay=0.01,
|
49 |
+
logging_steps=10,
|
50 |
+
evaluation_strategy='steps',
|
51 |
+
eval_steps=500,
|
52 |
+
save_steps=1e6,
|
53 |
+
gradient_accumulation_steps=16
|
54 |
+
)
|
55 |
+
|
56 |
+
trainer = Trainer(
|
57 |
+
model=model_pegasus,
|
58 |
+
args=trainer_args,
|
59 |
+
tokenizer=tokenizer,
|
60 |
+
data_collator=seq2seq_data_collator,
|
61 |
+
train_dataset=dataset["train"],
|
62 |
+
eval_dataset=dataset["validation"])
|
63 |
+
|
64 |
+
        # trainer.train()  # Training is skipped locally; the model was trained online (see the note in main.py).
|
65 |
+
|
66 |
+
## Save model
|
67 |
+
model_pegasus.save_pretrained(self.config.model_path)
|
68 |
+
|
69 |
+
## Save tokenizer
|
70 |
+
tokenizer.save_pretrained(self.config.tokenizer_path)
|
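One detail worth spelling out: with per_device_train_batch_size=1, gradient accumulation is what produces a usable effective batch size.

# Effective batch size implied by the TrainingArguments above (single device):
per_device_train_batch_size = 1
gradient_accumulation_steps = 16
effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps
print(effective_batch_size)  # 16 samples per optimizer step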
src/TextSummarizer/config/__init__.py
ADDED
File without changes
|
src/TextSummarizer/config/config.yaml
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
artifacts_root: artifacts
|
2 |
+
|
3 |
+
|
4 |
+
data_ingestion:
|
5 |
+
dataset_name: "alexfabbri/multi_news"
|
6 |
+
arrow_dataset_dir: artifacts/data
|
7 |
+
|
8 |
+
|
9 |
+
data_validation:
|
10 |
+
root_dir: artifacts/data_validation
|
11 |
+
status_file: artifacts/data_validation/status.txt
|
12 |
+
all_required_folders: ["train", "test", "validation", "dataset_dict.json"]
|
13 |
+
|
14 |
+
|
15 |
+
data_transformation:
|
16 |
+
root_dir: artifacts/data_transformation
|
17 |
+
data_path: artifacts/data
|
18 |
+
tokenizer_name: google/pegasus-cnn_dailymail
|
19 |
+
|
20 |
+
|
21 |
+
model_trainer:
|
22 |
+
root_dir: artifacts/model_trainer
|
23 |
+
data_path: artifacts/data_transformation/dataset
|
24 |
+
model_ckpt: google/pegasus-cnn_dailymail
|
25 |
+
model_path: artifacts/model_trainer/pegasus-samsum-model
|
26 |
+
tokenizer_path: artifacts/model_trainer/tokenizer
|
27 |
+
|
28 |
+
model_evaluation:
|
29 |
+
root_dir: artifacts/model_evaluation
|
30 |
+
data_path: artifacts/data_transformation/dataset
|
31 |
+
model_path: artifacts/model_trainer/pegasus-samsum-model
|
32 |
+
tokenizer_path: artifacts/model_trainer/tokenizer
|
33 |
+
metric_file_name: artifacts/model_evaluation/metrics.csv
|
34 |
+
hub_model_name: pavithra-devi/pegasus-multi-news
|
src/TextSummarizer/config/config_manager.py
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Configuration manager to get and set all the configuration.
|
3 |
+
"""
|
4 |
+
|
5 |
+
from pathlib import Path
|
6 |
+
|
7 |
+
from box import ConfigBox
|
8 |
+
|
9 |
+
from src.TextSummarizer.constants import file_path
|
10 |
+
from src.TextSummarizer.entity import entities
|
11 |
+
from src.TextSummarizer.utils.general import create_directories, read_yaml
|
12 |
+
|
13 |
+
|
14 |
+
# Create a config manager.
|
15 |
+
class ConfigManager:
|
16 |
+
"""
|
17 |
+
Class to manage the configuration files.
|
18 |
+
"""
|
19 |
+
|
20 |
+
def __init__(self) -> None:
|
21 |
+
self.config: ConfigBox = read_yaml(Path(file_path.CONFIG_FILE_PATH))
|
22 |
+
self.params: ConfigBox = read_yaml(Path(file_path.PARAMS_FILE_PATH))
|
23 |
+
|
24 |
+
create_directories(path_to_directories=[self.config.artifacts_root])
|
25 |
+
|
26 |
+
def get_data_ingestion_config(self) -> entities.DataIngestionConfig:
|
27 |
+
"""
|
28 |
+
Get the config which is needed to download the data files.
|
29 |
+
"""
|
30 |
+
config: ConfigBox = self.config.data_ingestion
|
31 |
+
|
32 |
+
data_ingestion_config: entities.DataIngestionConfig = entities.DataIngestionConfig(
|
33 |
+
dataset_name=config.dataset_name,
|
34 |
+
arrow_dataset_dir=config.arrow_dataset_dir,
|
35 |
+
)
|
36 |
+
|
37 |
+
return data_ingestion_config
|
38 |
+
|
39 |
+
def get_data_validation_config(self) -> entities.DataValidationConfig:
|
40 |
+
"""
|
41 |
+
Get the config which is needed to validate the data files.
|
42 |
+
"""
|
43 |
+
config = self.config.data_validation
|
44 |
+
|
45 |
+
create_directories([config.root_dir])
|
46 |
+
|
47 |
+
data_validation_config = entities.DataValidationConfig(
|
48 |
+
root_dir=config.root_dir,
|
49 |
+
status_file=config.status_file,
|
50 |
+
all_required_folders=config.all_required_folders,
|
51 |
+
)
|
52 |
+
|
53 |
+
return data_validation_config
|
54 |
+
|
55 |
+
def get_data_transformation_config(self) -> entities.DataTransformationConfig:
|
56 |
+
"""
|
57 |
+
Get teh data transformation configurations.
|
58 |
+
"""
|
59 |
+
config = self.config.data_transformation
|
60 |
+
|
61 |
+
create_directories([config.root_dir])
|
62 |
+
|
63 |
+
data_transformation_config = entities.DataTransformationConfig(
|
64 |
+
root_dir=config.root_dir,
|
65 |
+
data_path=config.data_path,
|
66 |
+
tokenizer_name = config.tokenizer_name
|
67 |
+
)
|
68 |
+
|
69 |
+
return data_transformation_config
|
70 |
+
|
71 |
+
def get_model_trainer_config(self) -> entities.ModelTrainerConfig:
|
72 |
+
"""
|
73 |
+
Get the configuration which is needed to train the model.
|
74 |
+
"""
|
75 |
+
config = self.config.model_trainer
|
76 |
+
params = self.params.TrainingArguments
|
77 |
+
|
78 |
+
create_directories([config.root_dir])
|
79 |
+
|
80 |
+
model_trainer_config = entities.ModelTrainerConfig(
|
81 |
+
root_dir=config.root_dir,
|
82 |
+
data_path=config.data_path,
|
83 |
+
model_path= config.model_path,
|
84 |
+
tokenizer_path= config.tokenizer_path,
|
85 |
+
model_ckpt = config.model_ckpt,
|
86 |
+
num_train_epochs = params.num_train_epochs,
|
87 |
+
warmup_steps = params.warmup_steps,
|
88 |
+
per_device_train_batch_size = params.per_device_train_batch_size,
|
89 |
+
weight_decay = params.weight_decay,
|
90 |
+
logging_steps = params.logging_steps,
|
91 |
+
evaluation_strategy = params.evaluation_strategy,
|
92 |
+
            eval_steps = params.eval_steps,
|
93 |
+
save_steps = params.save_steps,
|
94 |
+
gradient_accumulation_steps = params.gradient_accumulation_steps
|
95 |
+
)
|
96 |
+
|
97 |
+
return model_trainer_config
|
98 |
+
|
99 |
+
def get_model_evaluation_config(self) -> entities.ModelEvaluationConfig:
|
100 |
+
"""
|
101 |
+
Get the model evaluation configuration.
|
102 |
+
"""
|
103 |
+
config = self.config.model_evaluation
|
104 |
+
|
105 |
+
create_directories([config.root_dir])
|
106 |
+
|
107 |
+
model_evaluation_config = entities.ModelEvaluationConfig(
|
108 |
+
root_dir=config.root_dir,
|
109 |
+
data_path=config.data_path,
|
110 |
+
model_path = config.model_path,
|
111 |
+
tokenizer_path = config.tokenizer_path,
|
112 |
+
metric_file_name = config.metric_file_name,
|
113 |
+
hub_model_name=config.hub_model_name
|
114 |
+
|
115 |
+
)
|
116 |
+
|
117 |
+
return model_evaluation_config
|
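Typical usage from a pipeline step; with the shipped config.yaml this prints the dataset name and target directory:

from src.TextSummarizer.config.config_manager import ConfigManager

config_manager = ConfigManager()
data_ingestion_config = config_manager.get_data_ingestion_config()
print(data_ingestion_config.dataset_name)       # alexfabbri/multi_news
print(data_ingestion_config.arrow_dataset_dir)  # artifacts/data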
src/TextSummarizer/config/params.yaml
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
TrainingArguments:
|
2 |
+
num_train_epochs: 1
|
3 |
+
warmup_steps: 500
|
4 |
+
per_device_train_batch_size: 1
|
5 |
+
weight_decay: 0.01
|
6 |
+
logging_steps: 10
|
7 |
+
evaluation_strategy: steps
|
8 |
+
eval_steps: 500
|
9 |
+
  save_steps: 1000000  # written as a plain int; PyYAML parses a bare "1e6" as a string
|
10 |
+
gradient_accumulation_steps: 16
|
src/TextSummarizer/constants/__init__.py
ADDED
File without changes
|
src/TextSummarizer/constants/file_path.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Constants which is related to the file paths are present here.
|
3 |
+
"""
|
4 |
+
|
5 |
+
from typing import Final
|
6 |
+
|
7 |
+
CONFIG_FILE_PATH: Final[str] = "src/TextSummarizer/config/config.yaml"
|
8 |
+
PARAMS_FILE_PATH: Final[str] = "src/TextSummarizer/config/params.yaml"
|
src/TextSummarizer/entity/__init__.py
ADDED
File without changes
|
src/TextSummarizer/entity/entities.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
All the class return types are present here.
|
3 |
+
"""
|
4 |
+
|
5 |
+
from dataclasses import dataclass
|
6 |
+
from pathlib import Path
|
7 |
+
|
8 |
+
|
9 |
+
@dataclass()
|
10 |
+
class DataIngestionConfig:
|
11 |
+
"""
|
12 |
+
The return type of the data ingestion config function.
|
13 |
+
"""
|
14 |
+
dataset_name: str
|
15 |
+
arrow_dataset_dir: str
|
16 |
+
|
17 |
+
|
18 |
+
@dataclass()
|
19 |
+
class DataValidationConfig:
|
20 |
+
"""
|
21 |
+
Return type of the data validation config function.
|
22 |
+
"""
|
23 |
+
root_dir: str
|
24 |
+
status_file: str
|
25 |
+
all_required_folders: list
|
26 |
+
|
27 |
+
|
28 |
+
@dataclass(frozen=True)
|
29 |
+
class DataTransformationConfig:
|
30 |
+
"""
|
31 |
+
Return type of the data transformation config function.
|
32 |
+
"""
|
33 |
+
root_dir: str
|
34 |
+
data_path: str
|
35 |
+
tokenizer_name: str
|
36 |
+
|
37 |
+
|
38 |
+
@dataclass(frozen=True)
|
39 |
+
class ModelTrainerConfig:
|
40 |
+
"""
|
41 |
+
Return type of the model trainer config function.
|
42 |
+
"""
|
43 |
+
root_dir: str
|
44 |
+
data_path: str
|
45 |
+
model_ckpt: str
|
46 |
+
model_path: str
|
47 |
+
tokenizer_path: str
|
48 |
+
num_train_epochs: int
|
49 |
+
warmup_steps: int
|
50 |
+
per_device_train_batch_size: int
|
51 |
+
weight_decay: float
|
52 |
+
logging_steps: int
|
53 |
+
evaluation_strategy: str
|
54 |
+
eval_steps: int
|
55 |
+
save_steps: float
|
56 |
+
gradient_accumulation_steps: int
|
57 |
+
|
58 |
+
|
59 |
+
@dataclass(frozen=True)
|
60 |
+
class ModelEvaluationConfig:
|
61 |
+
"""
|
62 |
+
Return type of the model evaluation config function.
|
63 |
+
"""
|
64 |
+
root_dir: str
|
65 |
+
data_path: str
|
66 |
+
model_path: str
|
67 |
+
tokenizer_path: str
|
68 |
+
metric_file_name: str
|
69 |
+
hub_model_name: str
|
src/TextSummarizer/exception/__init__.py
ADDED
File without changes
|
src/TextSummarizer/logger/__init__.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Creating the logger needed for the project.
|
3 |
+
"""
|
4 |
+
|
5 |
+
import logging
|
6 |
+
import os
|
7 |
+
|
8 |
+
logging_str: str = "[%(asctime)s - %(levelname)s - %(module)s.py - %(name)s]: %(message)s"
|
9 |
+
|
10 |
+
# Create the log directory.
|
11 |
+
log_dir: str = "logs"
|
12 |
+
backend_log_filename: str = "back_end.log"
|
13 |
+
frontend_log_filename: str = "front_end.log"
|
14 |
+
backend_log_filepath: str = os.path.join(log_dir, backend_log_filename)
|
15 |
+
frontend_log_filepath: str = os.path.join(log_dir, frontend_log_filename)
|
16 |
+
|
17 |
+
os.makedirs(log_dir, exist_ok=True)
|
18 |
+
|
19 |
+
# Set the logging config.
|
20 |
+
logging.basicConfig(
|
21 |
+
level=logging.DEBUG,
|
22 |
+
format=logging_str,
|
23 |
+
# handlers=[
|
24 |
+
# logging.FileHandler(log_filepath),
|
25 |
+
# logging.StreamHandler(sys.stdout)
|
26 |
+
# ]
|
27 |
+
)
|
28 |
+
|
29 |
+
# Get the logger.
|
30 |
+
backend_file_handler = logging.FileHandler(backend_log_filepath)
|
31 |
+
frontend_file_handler = logging.FileHandler(frontend_log_filepath)
|
32 |
+
|
33 |
+
# Add the formatter to the file handlers.
|
34 |
+
backend_file_handler.setFormatter(logging.Formatter(logging_str))
|
35 |
+
frontend_file_handler.setFormatter(logging.Formatter(logging_str))
|
36 |
+
|
37 |
+
backend_logger = logging.getLogger("backend")
|
38 |
+
frontend_logger = logging.getLogger("frontend")
|
39 |
+
|
40 |
+
|
41 |
+
# add the handlers.
|
42 |
+
backend_logger.addHandler(backend_file_handler)
|
43 |
+
frontend_logger.addHandler(frontend_file_handler)
|
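Usage anywhere in the project:

from src.TextSummarizer.logger import backend_logger

backend_logger.info("hello")  # written to logs/back_end.log and echoed by the root handlers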
src/TextSummarizer/pipeline/__init__.py
ADDED
File without changes
|
src/TextSummarizer/pipeline/prediction.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import pipeline
|
2 |
+
|
3 |
+
from src.TextSummarizer.config.config_manager import ConfigManager
|
4 |
+
|
5 |
+
|
6 |
+
class PredictionPipeline:
|
7 |
+
def __init__(self):
|
8 |
+
self.config = ConfigManager().get_model_evaluation_config()
|
9 |
+
|
10 |
+
def predict(self,text):
|
11 |
+
"""
|
12 |
+
Generate the text summarization for the given text.
|
13 |
+
"""
|
14 |
+
gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}
|
15 |
+
|
16 |
+
summarizer = pipeline("summarization", model=self.config.hub_model_name)
|
17 |
+
|
18 |
+
print("document:")
|
19 |
+
print(text)
|
20 |
+
|
21 |
+
output = summarizer(text, **gen_kwargs)[0]["summary_text"]
|
22 |
+
print("\nModel Summary:")
|
23 |
+
print(output)
|
24 |
+
|
25 |
+
return output
|
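A usage sketch; the first call downloads the checkpoint from the Hub, so it can take a while:

from src.TextSummarizer.pipeline.prediction import PredictionPipeline

summary = PredictionPipeline().predict("A long news article to be summarized ...")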
src/TextSummarizer/pipeline/step_01_data_ingestion.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
The data ingestion pipeline.
|
3 |
+
"""
|
4 |
+
|
5 |
+
from src.TextSummarizer.components.data_ingestion import DataIngestionComponent
|
6 |
+
from src.TextSummarizer.config.config_manager import ConfigManager
|
7 |
+
from src.TextSummarizer.entity import entities
|
8 |
+
from src.TextSummarizer.logger import backend_logger
|
9 |
+
|
10 |
+
|
11 |
+
class DataIngestionPipeline:
|
12 |
+
"""
|
13 |
+
The data ingestion pipeline.
|
14 |
+
"""
|
15 |
+
|
16 |
+
def run(self):
|
17 |
+
"""
|
18 |
+
The main function of the data ingestion pipeline.
|
19 |
+
"""
|
20 |
+
backend_logger.info("Starting the data ingestion pipeline.")
|
21 |
+
config: ConfigManager = ConfigManager()
|
22 |
+
data_ingestion_config: entities.DataIngestionConfig = config.get_data_ingestion_config()
|
23 |
+
data_ingestion = DataIngestionComponent(config=data_ingestion_config)
|
24 |
+
data_ingestion.save_dataset()
|
25 |
+
backend_logger.info("Finished the data ingestion pipeline.")
|
src/TextSummarizer/pipeline/step_02_data_validation.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
The data validation pipeline.
|
3 |
+
"""
|
4 |
+
|
5 |
+
from src.TextSummarizer.components.data_validation import DataValidation
|
6 |
+
from src.TextSummarizer.config.config_manager import ConfigManager
|
7 |
+
from src.TextSummarizer.entity import entities
|
8 |
+
from src.TextSummarizer.logger import backend_logger
|
9 |
+
|
10 |
+
|
11 |
+
class DataValidationPipeline:
|
12 |
+
"""
|
13 |
+
The data validation pipeline.
|
14 |
+
"""
|
15 |
+
|
16 |
+
def run(self):
|
17 |
+
"""
|
18 |
+
The main function of the data validation pipeline.
|
19 |
+
"""
|
20 |
+
backend_logger.info("Starting the data validation pipeline.")
|
21 |
+
config: ConfigManager = ConfigManager()
|
22 |
+
        data_validation_config: entities.DataValidationConfig = config.get_data_validation_config()
|
23 |
+
        data_validation = DataValidation(config=data_validation_config)
|
24 |
+
        data_validation.validate_all_files_exist()
|
25 |
+
backend_logger.info("Finished the data validation pipeline.")
|
src/TextSummarizer/pipeline/step_03_data_transformation.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
The data Transformation pipeline.
|
3 |
+
"""
|
4 |
+
|
5 |
+
from src.TextSummarizer.components.data_transformation import DataTransformation
|
6 |
+
from src.TextSummarizer.config.config_manager import ConfigManager
|
7 |
+
from src.TextSummarizer.entity import entities
|
8 |
+
from src.TextSummarizer.logger import backend_logger
|
9 |
+
|
10 |
+
|
11 |
+
class DataTransformationPipeline:
|
12 |
+
"""
|
13 |
+
The data Transformation pipeline.
|
14 |
+
"""
|
15 |
+
|
16 |
+
def run(self):
|
17 |
+
"""
|
18 |
+
The main function of the data Transformation pipeline.
|
19 |
+
"""
|
20 |
+
backend_logger.info("Starting the data Transformation pipeline.")
|
21 |
+
config: ConfigManager = ConfigManager()
|
22 |
+
data_transformation_config: entities.DataTransformationConfig = config.get_data_transformation_config()
|
23 |
+
data_transformation = DataTransformation(config=data_transformation_config)
|
24 |
+
data_transformation.convert()
|
25 |
+
backend_logger.info("Finished the data Transformation pipeline.")
|
src/TextSummarizer/pipeline/step_04_train_model.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
The model training pipeline.
|
3 |
+
"""
|
4 |
+
|
5 |
+
from src.TextSummarizer.components.train_model import ModelTrainer
|
6 |
+
from src.TextSummarizer.config.config_manager import ConfigManager
|
7 |
+
from src.TextSummarizer.entity import entities
|
8 |
+
from src.TextSummarizer.logger import backend_logger
|
9 |
+
|
10 |
+
|
11 |
+
class ModelTrainerPipeline:
|
12 |
+
"""
|
13 |
+
The model training pipeline.
|
14 |
+
"""
|
15 |
+
|
16 |
+
def run(self):
|
17 |
+
"""
|
18 |
+
The main function of the train model pipeline.
|
19 |
+
"""
|
20 |
+
backend_logger.info("Starting the train model pipeline.")
|
21 |
+
config: ConfigManager = ConfigManager()
|
22 |
+
        train_model_config: entities.ModelTrainerConfig = config.get_model_trainer_config()
|
23 |
+
train_model = ModelTrainer(config=train_model_config)
|
24 |
+
train_model.train()
|
25 |
+
backend_logger.info("Finished the train model pipeline.")
|
src/TextSummarizer/pipeline/step_05_model_evaluation.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
The model evaluation pipeline.
|
3 |
+
"""
|
4 |
+
|
5 |
+
from src.TextSummarizer.components.model_evaluation import ModelEvaluation
|
6 |
+
from src.TextSummarizer.config.config_manager import ConfigManager
|
7 |
+
from src.TextSummarizer.entity import entities
|
8 |
+
from src.TextSummarizer.logger import backend_logger
|
9 |
+
|
10 |
+
|
11 |
+
class ModelEvaluationPipeline:
|
12 |
+
"""
|
13 |
+
The model evaluation pipeline.
|
14 |
+
"""
|
15 |
+
|
16 |
+
def run(self):
|
17 |
+
"""
|
18 |
+
The main function of the model evaluation pipeline.
|
19 |
+
"""
|
20 |
+
backend_logger.info("Starting the model evaluation pipeline.")
|
21 |
+
config = ConfigManager()
|
22 |
+
model_evaluation_config = config.get_model_evaluation_config()
|
23 |
+
        model_evaluation = ModelEvaluation(config=model_evaluation_config)
|
24 |
+
        model_evaluation.run()
|
25 |
+
backend_logger.info("Finished the model evaluation pipeline.")
|
src/TextSummarizer/utils/__init__.py
ADDED
File without changes
|
src/TextSummarizer/utils/general.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
The functions used throughout the project is present here.
|
3 |
+
"""
|
4 |
+
|
5 |
+
import os
|
6 |
+
from pathlib import Path
|
7 |
+
from typing import Any
|
8 |
+
|
9 |
+
import yaml
|
10 |
+
from box import ConfigBox
|
11 |
+
from box.exceptions import BoxValueError
|
12 |
+
from ensure import ensure_annotations
|
13 |
+
|
14 |
+
from src.TextSummarizer.logger import backend_logger
|
15 |
+
|
16 |
+
|
17 |
+
@ensure_annotations
|
18 |
+
def read_yaml(path_to_yaml: Path) -> ConfigBox:
|
19 |
+
"""
|
20 |
+
Read yaml file and return as Dictionary.
|
21 |
+
|
22 |
+
:param path_to_yaml: Path to yaml file.
|
23 |
+
    :return: A ConfigBox dictionary object containing the yaml file contents.
|
24 |
+
"""
|
25 |
+
try:
|
26 |
+
with open(path_to_yaml) as yaml_file:
|
27 |
+
content = yaml.safe_load(yaml_file)
|
28 |
+
backend_logger.info(f"yaml file: {path_to_yaml} loaded successfully")
|
29 |
+
return ConfigBox(content)
|
30 |
+
except BoxValueError:
|
31 |
+
raise ValueError(f"yaml file: {path_to_yaml} is empty.")
|
32 |
+
except Exception as exp:
|
33 |
+
raise exp
|
34 |
+
|
35 |
+
|
36 |
+
def create_directories(path_to_directories: list) -> None:
|
37 |
+
"""
|
38 |
+
    Create a list of directories.
|
39 |
+
|
40 |
+
:params path_to_directories: list of path of directories.
|
41 |
+
"""
|
42 |
+
for path in path_to_directories:
|
43 |
+
os.makedirs(path, exist_ok=True)
|
44 |
+
backend_logger.info(f"created directory at: {path}")
|
45 |
+
|
46 |
+
|
47 |
+
@ensure_annotations
|
48 |
+
def get_size(path: Path) -> str:
|
49 |
+
"""
|
50 |
+
Get the file size in KB.
|
51 |
+
|
52 |
+
:param path: Path of the file.
|
53 |
+
:returns: Size in KB.
|
54 |
+
"""
|
55 |
+
size_in_kb = round(os.path.getsize(path)/1024)
|
56 |
+
return f"~ {size_in_kb} KB"
|
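A usage sketch for read_yaml; ConfigBox allows attribute-style access to the parsed mapping:

from pathlib import Path
from src.TextSummarizer.utils.general import read_yaml

config = read_yaml(Path("src/TextSummarizer/config/config.yaml"))
print(config.data_ingestion.dataset_name)  # alexfabbri/multi_news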
src/__init__.py
ADDED
File without changes
|