hakim commited on
Commit
fd31bf7
·
1 Parent(s): f2492e6

model evaluation added

Browse files
.github/workflows/main.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+
7
+ # to run this workflow manually from the Actions tab
8
+ workflow_dispatch:
9
+
10
+ jobs:
11
+ sync-to-hub:
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - uses: actions/checkout@v2
15
+ with:
16
+ fetch-depth: 0
17
+ - name: Push to hub
18
+ env:
19
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
20
+ run: git push --force https://Md-Hakim:$HF_TOKEN@huggingface.co/spaces/Md-Hakim/text-summarization main
Dockerfile CHANGED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ WORKDIR /code
4
+
5
+ COPY ./requirements.txt /code/requirements.txt
6
+
7
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
8
+
9
+ COPY . /code
10
+
11
+ CMD ["streamlit", "run", "app.py"]
README.md CHANGED
@@ -1 +1,61 @@
1
- # text-summarization
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Text Summarization
3
+ emoji: 🐨
4
+ colorFrom: blue
5
+ colorTo: gray
6
+ sdk: streamlit
7
+ sdk_version: 1.37.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # text-summarization
14
+
15
+
16
+ ## Workflows
17
+
18
+ 1. Update config.yaml
19
+ 2. Update secrets.yaml [Optional]
20
+ 3. Update params.yaml
21
+ 4. Update the entity
22
+ 5. Update the configuration manager in src config
23
+ 6. Update the components
24
+ 7. Update the pipeline
25
+ 8. Update the main.py
26
+ 9. Update the dvc.yaml
27
+ 10. app.py
28
+
29
+ # How to run?
30
+ ### STEPS:
31
+
32
+ Clone the repository
33
+
34
+ ```bash
35
+ git clone https://github.com/HAKIM-ML/text-summarization
36
+ ```
37
+
38
+ ### STEP 01- Create a conda environment after opening the repository
39
+
40
+ ```bash
41
+ conda create -n cnncls python=3.8 -y
42
+ ```
43
+
44
+ ```bash
45
+ conda activate cnncls
46
+ ```
47
+
48
+
49
+ ### STEP 02- install the requirements
50
+ ```bash
51
+ pip install -r requirements.txt
52
+ ```
53
+
54
+ ```bash
55
+ # Finally run the following command
56
+ streamlit run app.py
57
+ ```
58
+
59
+ Now,
60
+ ```bash
61
+ open up your local host and port
62
+ ```
app.py CHANGED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from textsummarizer.pipeline.predict import PredictionPipeline
3
+
4
def main():
    """Render the Streamlit dialogue-summarizer page.

    Lays out a two-column page (dialogue input on the left, summary on the
    right), runs ``PredictionPipeline.predict`` on the entered text when the
    button is pressed, and shows an "About" section below.
    """
    # Local import: only this function needs it, and it keeps the block
    # self-contained without touching the module's import list.
    import html

    # Set page config
    st.set_page_config(page_title="Dialogue Summarizer", page_icon="💬", layout="wide")

    # Custom CSS to improve the appearance
    st.markdown("""
    <style>
    .big-font {
        font-size:20px !important;
        font-weight: bold;
    }
    .result-font {
        font-size:18px !important;
        font-style: italic;
    }
    .stButton>button {
        width: 100%;
        height: 50px;
        font-size: 20px;
    }
    </style>
    """, unsafe_allow_html=True)

    # App title and description
    st.title("🤖 AI Dialogue Summarizer")
    st.markdown("Transform your lengthy conversations into concise summaries with our cutting-edge AI technology.")

    # Create two columns
    col1, col2 = st.columns([2, 1])

    with col1:
        st.markdown('<p class="big-font">Input Dialogue</p>', unsafe_allow_html=True)
        # A non-empty label avoids Streamlit's empty-label warning; it is
        # hidden because the styled heading above already names the field.
        user_input = st.text_area(
            "Input Dialogue",
            height=300,
            placeholder="Paste your dialogue here...",
            label_visibility="collapsed",
        )

    with col2:
        st.markdown('<p class="big-font">Summary</p>', unsafe_allow_html=True)
        summary_placeholder = st.empty()

    # Create an instance of PredictionPipeline
    predictor = PredictionPipeline()

    if st.button("📝 Generate Summary"):
        # Whitespace-only input is treated the same as no input.
        if user_input and user_input.strip():
            with st.spinner('Generating summary...'):
                # Get the summary
                summary = predictor.predict(user_input)
                # The summary is derived from user-supplied text and is
                # rendered with unsafe_allow_html=True, so escape it to
                # prevent markup in the dialogue from being injected.
                safe_summary = html.escape(summary)
                summary_placeholder.markdown(
                    f'<p class="result-font">{safe_summary}</p>',
                    unsafe_allow_html=True,
                )
        else:
            st.warning("⚠️ Please enter some text to summarize.")

    # Add some spacing
    st.markdown("<br><br>", unsafe_allow_html=True)

    # Add a section for app info
    st.markdown("## About This App")
    st.info("""
    This AI-powered dialogue summarizer uses advanced natural language processing to distill the key points from conversations.
    It's perfect for quickly understanding the essence of meetings, chats, or any form of dialogue.

    **How to use:**
    1. Paste your dialogue in the text area on the left.
    2. Click the 'Generate Summary' button.
    3. View the AI-generated summary on the right.

    For best results, ensure your input is a clear dialogue or conversation.
    """)
71
+
72
+ if __name__ == "__main__":
73
+ main()
config/config.yaml CHANGED
@@ -24,4 +24,13 @@ data_transformation:
24
  model_trainer:
25
  root_dir: artifacts/model_trainer
26
  data_path: artifacts/data_transformation/samsum_dataset
27
- model_ckpt: google/pegasus-cnn_dailymail
 
 
 
 
 
 
 
 
 
 
24
  model_trainer:
25
  root_dir: artifacts/model_trainer
26
  data_path: artifacts/data_transformation/samsum_dataset
27
+ model_ckpt: google/pegasus-cnn_dailymail
28
+
29
+
30
+ model_evaluation:
31
+ root_dir: artifacts/model_evaluation
32
+ data_path: artifacts/data_transformation/samsum_dataset
33
+ model_path: artifacts/model_trainer/pegasus-samsum-model
34
+ tokenizer_path: artifacts/model_trainer/tokenizer
35
+ metric_file_name: artifacts/model_evaluation/metrics.json
36
+
main.py CHANGED
@@ -2,6 +2,7 @@ from textsummarizer.pipeline.stage_01_data_ingestion import DataIngestionPipelin
2
  from textsummarizer.pipeline.stage_02_data_validation import DataValidationPipeline
3
  from textsummarizer.pipeline.stage_03_data_transformation import DataTransformationPipeline
4
  from textsummarizer.pipeline.stage_04_model_trainer import ModelTrainerPipeline
 
5
  from textsummarizer.logging import logger
6
 
7
  STAGE_NAME = "Data Ingestion stage"
@@ -38,12 +39,24 @@ except Exception as e:
38
 
39
 
40
 
41
- STAGE_NAME = "Data Traniner stage"
42
  try:
43
  logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
44
  model_tranier = ModelTrainerPipeline()
45
  model_tranier.main()
46
  logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
 
 
 
 
 
 
 
 
 
 
 
 
47
  except Exception as e:
48
  logger.exception(e)
49
  raise e
 
2
  from textsummarizer.pipeline.stage_02_data_validation import DataValidationPipeline
3
  from textsummarizer.pipeline.stage_03_data_transformation import DataTransformationPipeline
4
  from textsummarizer.pipeline.stage_04_model_trainer import ModelTrainerPipeline
5
+ from textsummarizer.pipeline.stage_05_model_evaluation import ModelEvaluationPipeline
6
  from textsummarizer.logging import logger
7
 
8
  STAGE_NAME = "Data Ingestion stage"
 
39
 
40
 
41
 
42
+ STAGE_NAME = "Model Traniner stage"
43
  try:
44
  logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
45
  model_tranier = ModelTrainerPipeline()
46
  model_tranier.main()
47
  logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
48
+ except Exception as e:
49
+ logger.exception(e)
50
+ raise e
51
+
52
+
53
+
54
+ STAGE_NAME = "Model Evaluation stage"
55
+ try:
56
+ logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
57
+ model_evaluation = ModelEvaluationPipeline()
58
+ model_evaluation.main()
59
+ logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
60
  except Exception as e:
61
  logger.exception(e)
62
  raise e
requirements.txt CHANGED
@@ -18,4 +18,4 @@ ensure==1.0.2
18
  fastapi==0.78.0
19
  uvicorn==0.18.3
20
  Jinja2==3.1.2
21
- -e .
 
18
  fastapi==0.78.0
19
  uvicorn==0.18.3
20
  Jinja2==3.1.2
21
+
research/model_evaluatoin.ipynb ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "os.chdir('../')"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "from dataclasses import dataclass\n",
20
+ "from pathlib import Path\n",
21
+ "@dataclass(frozen=True)\n",
22
+ "class ModelEvaluationConfig:\n",
23
+ " root_dir : Path\n",
24
+ " data_path : Path\n",
25
+ " model_path : Path\n",
26
+ " all_params: dict\n",
27
+ " tokenizer_path : Path\n",
28
+ " metric_file_name : Path"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 3,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "from textsummarizer.constants import *\n",
38
+ "from textsummarizer.utils.common import read_yaml, create_directories, save_json, load_json\n",
39
+ "\n",
40
+ "class ConfigurationManager:\n",
41
+ " def __init__(\n",
42
+ " self,\n",
43
+ " config_filepath = CONFIG_FILE_PATH,\n",
44
+ " params_filepath = PARAMS_FILE_PATH):\n",
45
+ "\n",
46
+ " self.config = read_yaml(config_filepath)\n",
47
+ " self.params = read_yaml(params_filepath)\n",
48
+ "\n",
49
+ " create_directories([self.config.artifacts_root])\n",
50
+ "\n",
51
+ "\n",
52
+ " \n",
53
+ " def get_model_evaluation_config(self) -> ModelEvaluationConfig:\n",
54
+ " config = self.config.model_evaluation\n",
55
+ "\n",
56
+ " create_directories([config.root_dir])\n",
57
+ "\n",
58
+ " model_evaluation_config = ModelEvaluationConfig(\n",
59
+ " root_dir=config.root_dir,\n",
60
+ " data_path=config.data_path,\n",
61
+ " model_path = config.model_path,\n",
62
+ " tokenizer_path = config.tokenizer_path,\n",
63
+ " metric_file_name = config.metric_file_name\n",
64
+ " \n",
65
+ " )\n",
66
+ "\n",
67
+ " return model_evaluation_config"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": 4,
73
+ "metadata": {},
74
+ "outputs": [
75
+ {
76
+ "name": "stdout",
77
+ "output_type": "stream",
78
+ "text": [
79
+ "[2024-08-11 20:23:00,587: INFO: config: PyTorch version 2.2.2+cu121 available.]\n",
80
+ "[2024-08-11 20:23:00,589: INFO: config: TensorFlow version 2.12.0 available.]\n"
81
+ ]
82
+ }
83
+ ],
84
+ "source": [
85
+ "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
86
+ "from datasets import load_dataset, load_from_disk, load_metric\n",
87
+ "import torch\n",
88
+ "import pandas as pd\n",
89
+ "from tqdm import tqdm"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": 6,
95
+ "metadata": {},
96
+ "outputs": [],
97
+ "source": [
98
+ "import mlflow\n",
99
+ "import dagshub\n",
100
+ "import json\n",
101
+ "\n",
102
+ "class ModelEvaluation:\n",
103
+ " def __init__(self, config: ModelEvaluationConfig):\n",
104
+ " self.config = config\n",
105
+ "\n",
106
+ " def generate_batch_sized_chunks(self, list_of_elements, batch_size):\n",
107
+ " \"\"\"split the dataset into smaller batches that we can process simultaneously\n",
108
+ " Yield successive batch-sized chunks from list_of_elements.\"\"\"\n",
109
+ " for i in range(0, len(list_of_elements), batch_size):\n",
110
+ " yield list_of_elements[i : i + batch_size]\n",
111
+ "\n",
112
+ " def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer, \n",
113
+ " batch_size=16, device=\"cuda\" if torch.cuda.is_available() else \"cpu\", \n",
114
+ " column_text=\"article\", \n",
115
+ " column_summary=\"highlights\"):\n",
116
+ " article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))\n",
117
+ " target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))\n",
118
+ "\n",
119
+ " for article_batch, target_batch in tqdm(\n",
120
+ " zip(article_batches, target_batches), total=len(article_batches)):\n",
121
+ " \n",
122
+ " inputs = tokenizer(article_batch, max_length=1024, truncation=True, \n",
123
+ " padding=\"max_length\", return_tensors=\"pt\")\n",
124
+ " \n",
125
+ " summaries = model.generate(input_ids=inputs[\"input_ids\"].to(device),\n",
126
+ " attention_mask=inputs[\"attention_mask\"].to(device), \n",
127
+ " length_penalty=0.8, num_beams=8, max_length=128)\n",
128
+ " \n",
129
+ " decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, \n",
130
+ " clean_up_tokenization_spaces=True) \n",
131
+ " for s in summaries] \n",
132
+ " \n",
133
+ " decoded_summaries = [d.replace(\"\", \" \") for d in decoded_summaries]\n",
134
+ " \n",
135
+ " metric.add_batch(predictions=decoded_summaries, references=target_batch)\n",
136
+ " \n",
137
+ " score = metric.compute()\n",
138
+ " return score\n",
139
+ "\n",
140
+ " def evaluate(self):\n",
141
+ " # Set up MLflow tracking\n",
142
+ " dagshub.init(repo_owner='azizulhakim8291', repo_name='text-summarization', mlflow=True)\n",
143
+ " mlflow.set_tracking_uri(\"https://dagshub.com/azizulhakim8291/text-summarization.mlflow\")\n",
144
+ " mlflow.set_experiment(\"text-summarization-evaluation\")\n",
145
+ "\n",
146
+ " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
147
+ " tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)\n",
148
+ " model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)\n",
149
+ " \n",
150
+ " dataset_samsum_pt = load_from_disk(self.config.data_path)\n",
151
+ "\n",
152
+ " rouge_names = [\"rouge1\", \"rouge2\", \"rougeL\", \"rougeLsum\"]\n",
153
+ " rouge_metric = load_metric('rouge')\n",
154
+ "\n",
155
+ " with mlflow.start_run():\n",
156
+ " mlflow.log_param(\"model_name\", \"pegasus\")\n",
157
+ " mlflow.log_param(\"dataset\", \"samsum\")\n",
158
+ "\n",
159
+ " score = self.calculate_metric_on_test_ds(\n",
160
+ " dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, \n",
161
+ " batch_size = 2, column_text = 'dialogue', column_summary= 'summary'\n",
162
+ " )\n",
163
+ "\n",
164
+ " rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)\n",
165
+ "\n",
166
+ " # Log metrics to MLflow\n",
167
+ " for rouge_name, rouge_score in rouge_dict.items():\n",
168
+ " mlflow.log_metric(rouge_name, rouge_score)\n",
169
+ "\n",
170
+ " # Save results as JSON\n",
171
+ " with open(self.config.metric_file_name, 'w') as f:\n",
172
+ " json.dump(rouge_dict, f, indent=4)\n",
173
+ "\n",
174
+ " # Log the JSON file as an artifact\n",
175
+ " mlflow.log_artifact(self.config.metric_file_name)"
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "execution_count": 8,
181
+ "metadata": {},
182
+ "outputs": [
183
+ {
184
+ "name": "stdout",
185
+ "output_type": "stream",
186
+ "text": [
187
+ "[2024-08-11 22:27:18,954: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
188
+ "[2024-08-11 22:27:18,967: INFO: common: yaml file: params.yaml loaded successfully]\n",
189
+ "[2024-08-11 22:27:18,971: INFO: common: created directory at: artifacts]\n",
190
+ "[2024-08-11 22:27:18,973: INFO: common: created directory at: artifacts/model_evaluation]\n",
191
+ "[2024-08-11 22:27:19,619: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/repos/azizulhakim8291/text-summarization \"HTTP/1.1 200 OK\"]\n"
192
+ ]
193
+ },
194
+ {
195
+ "data": {
196
+ "text/html": [
197
+ "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Initialized MLflow to track repo <span style=\"color: #008000; text-decoration-color: #008000\">\"azizulhakim8291/text-summarization\"</span>\n",
198
+ "</pre>\n"
199
+ ],
200
+ "text/plain": [
201
+ "Initialized MLflow to track repo \u001b[32m\"azizulhakim8291/text-summarization\"\u001b[0m\n"
202
+ ]
203
+ },
204
+ "metadata": {},
205
+ "output_type": "display_data"
206
+ },
207
+ {
208
+ "name": "stdout",
209
+ "output_type": "stream",
210
+ "text": [
211
+ "[2024-08-11 22:27:20,037: INFO: helpers: Initialized MLflow to track repo \"azizulhakim8291/text-summarization\"]\n"
212
+ ]
213
+ },
214
+ {
215
+ "data": {
216
+ "text/html": [
217
+ "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Repository azizulhakim8291/text-summarization initialized!\n",
218
+ "</pre>\n"
219
+ ],
220
+ "text/plain": [
221
+ "Repository azizulhakim8291/text-summarization initialized!\n"
222
+ ]
223
+ },
224
+ "metadata": {},
225
+ "output_type": "display_data"
226
+ },
227
+ {
228
+ "name": "stdout",
229
+ "output_type": "stream",
230
+ "text": [
231
+ "[2024-08-11 22:27:20,040: INFO: helpers: Repository azizulhakim8291/text-summarization initialized!]\n",
232
+ "[2024-08-11 22:27:20,119: WARNING: connectionpool: Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'RemoteDisconnected('Remote end closed connection without response')': /azizulhakim8291/text-summarization.mlflow/api/2.0/mlflow/experiments/get-by-name?experiment_name=text-summarization-evaluation]\n"
233
+ ]
234
+ },
235
+ {
236
+ "name": "stderr",
237
+ "output_type": "stream",
238
+ "text": [
239
+ "c:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\datasets\\load.py:756: FutureWarning: The repository for rouge contains custom code which must be executed to correctly load the metric. You can inspect the repository content at https://raw.githubusercontent.com/huggingface/datasets/2.18.0/metrics/rouge/rouge.py\n",
240
+ "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
241
+ "Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.\n",
242
+ " warnings.warn(\n",
243
+ "100%|██████████| 5/5 [00:21<00:00, 4.26s/it]"
244
+ ]
245
+ },
246
+ {
247
+ "name": "stdout",
248
+ "output_type": "stream",
249
+ "text": [
250
+ "[2024-08-11 22:28:20,351: INFO: rouge_scorer: Using default tokenizer.]\n"
251
+ ]
252
+ },
253
+ {
254
+ "name": "stderr",
255
+ "output_type": "stream",
256
+ "text": [
257
+ "\n"
258
+ ]
259
+ }
260
+ ],
261
+ "source": [
262
+ "try:\n",
263
+ " config = ConfigurationManager()\n",
264
+ " model_evaluation_config = config.get_model_evaluation_config()\n",
265
+ " model_evaluation_config = ModelEvaluation(config=model_evaluation_config)\n",
266
+ " model_evaluation_config.evaluate()\n",
267
+ "except Exception as e:\n",
268
+ " raise e"
269
+ ]
270
+ },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": null,
274
+ "metadata": {},
275
+ "outputs": [],
276
+ "source": []
277
+ }
278
+ ],
279
+ "metadata": {
280
+ "kernelspec": {
281
+ "display_name": "Python 3",
282
+ "language": "python",
283
+ "name": "python3"
284
+ },
285
+ "language_info": {
286
+ "codemirror_mode": {
287
+ "name": "ipython",
288
+ "version": 3
289
+ },
290
+ "file_extension": ".py",
291
+ "mimetype": "text/x-python",
292
+ "name": "python",
293
+ "nbconvert_exporter": "python",
294
+ "pygments_lexer": "ipython3",
295
+ "version": "3.11.0"
296
+ }
297
+ },
298
+ "nbformat": 4,
299
+ "nbformat_minor": 2
300
+ }
research/model_trainer.ipynb CHANGED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "os.chdir('../')"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "metadata": {},
17
+ "outputs": [
18
+ {
19
+ "data": {
20
+ "text/plain": [
21
+ "'c:\\\\mlops projects\\\\text-summarization'"
22
+ ]
23
+ },
24
+ "execution_count": 2,
25
+ "metadata": {},
26
+ "output_type": "execute_result"
27
+ }
28
+ ],
29
+ "source": [
30
+ "%pwd"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 3,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "from dataclasses import dataclass\n",
40
+ "from pathlib import Path\n",
41
+ "@dataclass(frozen=True)\n",
42
+ "class ModelTrainerConfig:\n",
43
+ " root_dir : Path\n",
44
+ " data_path : Path\n",
45
+ " model_ckpt : Path\n",
46
+ " num_train_epochs : int\n",
47
+ " warmup_steps : int\n",
48
+ " per_device_train_batch_size : int\n",
49
+ " weight_decay : float\n",
50
+ " logging_steps : int\n",
51
+ " evaluation_strategy: str\n",
52
+ " eval_steps: int\n",
53
+ " save_steps: float\n",
54
+ " gradient_accumulation_steps: int"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 4,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "from textsummarizer.constants import *\n",
64
+ "from textsummarizer.utils.common import read_yaml,create_directories\n",
65
+ "\n",
66
+ "class ConfigurationManager:\n",
67
+ " def __init__(\n",
68
+ " self,\n",
69
+ " config_filepath = CONFIG_FILE_PATH,\n",
70
+ " params_filepath = PARAMS_FILE_PATH):\n",
71
+ "\n",
72
+ " self.config = read_yaml(config_filepath)\n",
73
+ " self.params = read_yaml(params_filepath)\n",
74
+ "\n",
75
+ " create_directories([self.config.artifacts_root])\n",
76
+ " \n",
77
+ " \n",
78
+ " def get_model_trainer_config(self) -> ModelTrainerConfig:\n",
79
+ " config = self.config.model_trainer\n",
80
+ " params = self.params.TrainingArguments\n",
81
+ "\n",
82
+ " create_directories([config.root_dir])\n",
83
+ " \n",
84
+ " \n",
85
+ " model_trainer_config = ModelTrainerConfig(\n",
86
+ " root_dir = config.root_dir,\n",
87
+ " data_path = config.data_path,\n",
88
+ " model_ckpt = config.model_ckpt,\n",
89
+ " num_train_epochs =params.num_train_epochs,\n",
90
+ " warmup_steps =params.warmup_steps,\n",
91
+ " per_device_train_batch_size = params.per_device_train_batch_size,\n",
92
+ " weight_decay = params.weight_decay,\n",
93
+ " logging_steps = params.logging_steps,\n",
94
+ " evaluation_strategy =params.evaluation_strategy,\n",
95
+ " eval_steps =params.eval_steps,\n",
96
+ " save_steps = params.save_steps,\n",
97
+ " gradient_accumulation_steps = params.gradient_accumulation_steps\n",
98
+ " )\n",
99
+ " \n",
100
+ " return model_trainer_config\n",
101
+ " "
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 6,
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "from transformers import TrainingArguments, Trainer\n",
111
+ "from transformers import DataCollatorForSeq2Seq\n",
112
+ "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
113
+ "from datasets import load_dataset, load_from_disk\n",
114
+ "import torch\n",
115
+ "import os"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": 7,
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": [
124
+ "class ModelTrainer:\n",
125
+ " def __init__(self, config : ModelTrainerConfig):\n",
126
+ " self.config = config\n",
127
+ " os.environ[\"WANDB_DISABLED\"] = \"true\"\n",
128
+ " \n",
129
+ " \n",
130
+ " def train(self):\n",
131
+ " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
132
+ " tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)\n",
133
+ " model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)\n",
134
+ " seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)\n",
135
+ " \n",
136
+ " #loading data \n",
137
+ " dataset_samsum_pt = load_from_disk(self.config.data_path)\n",
138
+ " \n",
139
+ " \n",
140
+ " trainer_args = TrainingArguments(\n",
141
+ " output_dir=self.config.root_dir, num_train_epochs=self.config.num_train_epochs, warmup_steps=self.config.warmup_steps,\n",
142
+ " per_device_train_batch_size=self.config.per_device_train_batch_size, per_device_eval_batch_size=self.config.per_device_train_batch_size,\n",
143
+ " weight_decay=self.config.weight_decay, logging_steps=self.config.logging_steps,\n",
144
+ " evaluation_strategy=self.config.evaluation_strategy, eval_steps=self.config.eval_steps, save_steps=1e6,\n",
145
+ " gradient_accumulation_steps=self.config.gradient_accumulation_steps,\n",
146
+ " report_to=\"none\"\n",
147
+ " \n",
148
+ " ) \n",
149
+ " \n",
150
+ " \n",
151
+ " trainer = Trainer(model=model_pegasus, args=trainer_args,\n",
152
+ " tokenizer=tokenizer, data_collator=seq2seq_data_collator,\n",
153
+ " train_dataset=dataset_samsum_pt[\"test\"], \n",
154
+ " eval_dataset=dataset_samsum_pt[\"validation\"])\n",
155
+ " \n",
156
+ " \n",
157
+ " trainer.train()\n",
158
+ "\n",
159
+ " ## Save model\n",
160
+ " model_pegasus.save_pretrained(os.path.join(self.config.root_dir,\"pegasus-samsum-model\"))\n",
161
+ " ## Save tokenizer\n",
162
+ " tokenizer.save_pretrained(os.path.join(self.config.root_dir,\"tokenizer\"))\n",
163
+ " \n",
164
+ " \n",
165
+ " "
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": null,
171
+ "metadata": {},
172
+ "outputs": [],
173
+ "source": [
174
+ "try:\n",
175
+ " config = ConfigurationManager()\n",
176
+ " model_trainer_config = config.get_model_trainer_config()\n",
177
+ " model_trainer_config = ModelTrainer(config=model_trainer_config)\n",
178
+ " model_trainer_config.train()\n",
179
+ "except Exception as e:\n",
180
+ " raise e"
181
+ ]
182
+ },
183
+ {
184
+ "cell_type": "code",
185
+ "execution_count": null,
186
+ "metadata": {},
187
+ "outputs": [],
188
+ "source": []
189
+ }
190
+ ],
191
+ "metadata": {
192
+ "kernelspec": {
193
+ "display_name": "Python 3",
194
+ "language": "python",
195
+ "name": "python3"
196
+ },
197
+ "language_info": {
198
+ "codemirror_mode": {
199
+ "name": "ipython",
200
+ "version": 3
201
+ },
202
+ "file_extension": ".py",
203
+ "mimetype": "text/x-python",
204
+ "name": "python",
205
+ "nbconvert_exporter": "python",
206
+ "pygments_lexer": "ipython3",
207
+ "version": "3.11.0"
208
+ }
209
+ },
210
+ "nbformat": 4,
211
+ "nbformat_minor": 2
212
+ }
src/textsummarizer/config/configuration.py CHANGED
@@ -3,7 +3,8 @@ from textsummarizer.utils.common import read_yaml, create_directories
3
  from textsummarizer.entity.config_entity import (DataIngestionConfig,
4
  DataValidationConfig,
5
  DataTransformationConfig,
6
- ModelTrainerConfig)
 
7
 
8
  class ConfigurationManager:
9
  def __init__(
@@ -84,5 +85,24 @@ class ConfigurationManager:
84
  )
85
 
86
  return model_trainer_config
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
 
 
3
  from textsummarizer.entity.config_entity import (DataIngestionConfig,
4
  DataValidationConfig,
5
  DataTransformationConfig,
6
+ ModelTrainerConfig,
7
+ ModelEvaluationConfig)
8
 
9
  class ConfigurationManager:
10
  def __init__(
 
85
  )
86
 
87
  return model_trainer_config
88
+
89
+
90
+ def get_model_evaluation_config(self) -> ModelEvaluationConfig:
91
+ config = self.config.model_evaluation
92
+ params = self.params.TrainingArguments
93
+
94
+ create_directories([config.root_dir])
95
+
96
+ model_evaluation_config = ModelEvaluationConfig(
97
+ root_dir=config.root_dir,
98
+ data_path=config.data_path,
99
+ model_path = config.model_path,
100
+ tokenizer_path = config.tokenizer_path,
101
+ metric_file_name = config.metric_file_name,
102
+ all_params = params
103
+
104
+ )
105
+
106
+ return model_evaluation_config
107
 
108
 
src/textsummarizer/conponents/model_evaluation.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
2
+ from datasets import load_dataset, load_from_disk, load_metric
3
+ import torch
4
+ import pandas as pd
5
+ from tqdm import tqdm
6
+ from textsummarizer.entity.config_entity import ModelEvaluationConfig
7
+ import mlflow
8
+ import dagshub
9
+ import json
10
+
11
+
12
+
13
+
14
+
15
class ModelEvaluation:
    """Score a fine-tuned summarization model with ROUGE and log to MLflow."""

    def __init__(self, config: ModelEvaluationConfig):
        """Store the evaluation settings (paths and logged parameters)."""
        self.config = config

    @staticmethod
    def _clean_summary(text):
        """Replace the ``<n>`` newline marker in a decoded summary with a space."""
        # The previous code called replace("", " "), which matches the empty
        # string between every character and therefore spaced out the whole
        # summary before it was scored.
        return text.replace("<n>", " ")

    def generate_batch_sized_chunks(self, list_of_elements, batch_size):
        """split the dataset into smaller batches that we can process simultaneously
        Yield successive batch-sized chunks from list_of_elements."""
        for i in range(0, len(list_of_elements), batch_size):
            yield list_of_elements[i : i + batch_size]

    def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer,
                                    batch_size=16, device="cuda" if torch.cuda.is_available() else "cpu",
                                    column_text="article",
                                    column_summary="highlights"):
        """Generate summaries batch by batch and compute ``metric`` over them.

        Args:
            dataset: mapping indexed by ``column_text`` and ``column_summary``,
                each yielding a sliceable sequence of strings.
            metric: object exposing ``add_batch(predictions=, references=)``
                and ``compute()``.
            model: model exposing ``generate``.
            tokenizer: tokenizer used to encode inputs and decode outputs.
            batch_size: number of examples per generation call.
            device: device the input tensors are moved to.
            column_text: key of the source texts in ``dataset``.
            column_summary: key of the reference summaries in ``dataset``.

        Returns:
            Whatever ``metric.compute()`` returns after all batches are added.
        """
        article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))
        target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))

        for article_batch, target_batch in tqdm(
                zip(article_batches, target_batches), total=len(article_batches)):

            # Inputs are truncated and padded to a fixed 1024 tokens.
            inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                               padding="max_length", return_tensors="pt")

            # Beam search; length_penalty is passed through to generate().
            summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                       attention_mask=inputs["attention_mask"].to(device),
                                       length_penalty=0.8, num_beams=8, max_length=128)

            decoded_summaries = [
                self._clean_summary(
                    tokenizer.decode(s, skip_special_tokens=True,
                                     clean_up_tokenization_spaces=True)
                )
                for s in summaries
            ]

            metric.add_batch(predictions=decoded_summaries, references=target_batch)

        score = metric.compute()
        return score

    def evaluate(self):
        """Run ROUGE evaluation on a test slice and record the results.

        Loads the tokenizer, model and dataset from the configured paths,
        scores the first ten test examples, logs parameters and metrics to
        the DagsHub-hosted MLflow server, and writes the ROUGE F-measures to
        ``config.metric_file_name`` (also logged as an MLflow artifact).
        """
        # Set up MLflow tracking
        dagshub.init(repo_owner='azizulhakim8291', repo_name='text-summarization', mlflow=True)
        mlflow.set_tracking_uri("https://dagshub.com/azizulhakim8291/text-summarization.mlflow")
        mlflow.set_experiment("text-summarization-evaluation")

        device = "cuda" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)

        dataset_samsum_pt = load_from_disk(self.config.data_path)

        rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
        # NOTE(review): load_metric is deprecated in recent `datasets`
        # releases - confirm the pinned version still provides it.
        rouge_metric = load_metric('rouge')

        with mlflow.start_run():
            mlflow.log_param("model_name", "pegasus")
            mlflow.log_param("dataset", "samsum")
            # The placeholder log_param('parameter name', 'value') call was
            # removed: it recorded a meaningless parameter on every run.

            # Only the first ten test examples are scored.
            score = self.calculate_metric_on_test_ds(
                dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer,
                batch_size=2, column_text='dialogue', column_summary='summary'
            )

            rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
            mlflow.log_params(self.config.all_params)

            # Log metrics to MLflow
            for rouge_name, rouge_score in rouge_dict.items():
                mlflow.log_metric(rouge_name, rouge_score)

            # Save results as JSON
            with open(self.config.metric_file_name, 'w') as f:
                json.dump(rouge_dict, f, indent=4)

            # Log the JSON file as an artifact
            mlflow.log_artifact(self.config.metric_file_name)
src/textsummarizer/entity/config_entity.py CHANGED
@@ -38,3 +38,13 @@ class ModelTrainerConfig:
38
  eval_steps: int
39
  save_steps: float
40
  gradient_accumulation_steps: int
 
 
 
 
 
 
 
 
 
 
 
38
  eval_steps: int
39
  save_steps: float
40
  gradient_accumulation_steps: int
41
+
42
+
43
@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable settings consumed by the model-evaluation stage."""

    # Working directory of the evaluation stage.
    root_dir: Path
    # Location of the dataset that is loaded for evaluation.
    data_path: Path
    # Location of the trained model to evaluate.
    model_path: Path
    # Parameters that are logged alongside the evaluation run.
    all_params: dict
    # Location of the tokenizer saved with the model.
    tokenizer_path: Path
    # File the computed metrics are written to.
    metric_file_name: Path
src/textsummarizer/pipeline/predict.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from textsummarizer.config.configuration import ConfigurationManager
2
+ from transformers import AutoTokenizer
3
+ from transformers import pipeline
4
+
5
+
6
+
7
class PredictionPipeline:
    """Summarize text with the tokenizer and model named in the evaluation config."""

    def __init__(self):
        """Load the evaluation config; the model itself is loaded on first use."""
        self.config = ConfigurationManager().get_model_evaluation_config()
        self._pipe = None

    def _get_pipe(self):
        """Return the summarization pipeline, building it once per instance."""
        # Previously the tokenizer and pipeline were rebuilt on every
        # predict() call; cache them so repeated calls reuse the loaded model.
        if getattr(self, "_pipe", None) is None:
            tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
            self._pipe = pipeline("summarization", model=self.config.model_path, tokenizer=tokenizer)
        return self._pipe

    def predict(self, text):
        """Return the generated summary for ``text``.

        Args:
            text: the dialogue to summarize.

        Returns:
            The ``summary_text`` of the first pipeline result.
        """
        gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 128}

        pipe = self._get_pipe()

        print("Dialogue:")
        print(text)

        output = pipe(text, **gen_kwargs)[0]["summary_text"]
        print("\nModel Summary:")
        print(output)

        return output
src/textsummarizer/pipeline/stage_05_model_evaluation.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from textsummarizer.conponents.model_evaluation import ModelEvaluation
2
+ from textsummarizer.config.configuration import ConfigurationManager
3
+
4
+
5
class ModelEvaluationPipeline:
    """Pipeline stage that builds the evaluation config and runs the evaluator."""

    def __init__(self):
        pass

    def main(self):
        """Create the evaluation config, wrap it in ``ModelEvaluation`` and evaluate."""
        evaluation_config = ConfigurationManager().get_model_evaluation_config()
        evaluator = ModelEvaluation(config=evaluation_config)
        evaluator.evaluate()
src/textsummarizer/utils/common.py CHANGED
@@ -6,7 +6,7 @@ from ensure import ensure_annotations
6
  from box import ConfigBox
7
  from pathlib import Path
8
  from typing import Any
9
-
10
 
11
 
12
  @ensure_annotations
@@ -63,4 +63,39 @@ def get_size(path: Path) -> str:
63
  size_in_kb = round(os.path.getsize(path)/1024)
64
  return f"~ {size_in_kb} KB"
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
 
6
  from box import ConfigBox
7
  from pathlib import Path
8
  from typing import Any
9
+ import json
10
 
11
 
12
  @ensure_annotations
 
63
  size_in_kb = round(os.path.getsize(path)/1024)
64
  return f"~ {size_in_kb} KB"
65
 
66
+
67
@ensure_annotations
def save_json(path: Path, data: dict):
    """Write ``data`` to ``path`` as JSON and log where it was saved.

    Args:
        path (Path): destination JSON file
        data (dict): content to serialize, indented by four spaces
    """
    with open(path, 'w') as json_file:
        json.dump(data, json_file, indent=4)

    logger.info(f'Json file saved at: {path}')
81
+
82
+
83
+
84
+
85
@ensure_annotations
def load_json(path: Path) -> ConfigBox:
    """load json files data

    Args:
        path (Path): path to json file

    Returns:
        ConfigBox: data as class attributes instead of dict
    """

    with open(path, 'r') as f:
        content = json.load(f)

    logger.info(f"Json file loaded successfully from: {path}")
    # Wrap the parsed content; the previous code returned the ConfigBox class
    # itself, so callers never received the loaded data.
    return ConfigBox(content)
101