{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"%pwd\n",
"os.chdir(\"../\")\n",
"\n",
"%pwd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from dataclasses import dataclass\n",
"from pathlib import Path\n",
"\n",
"\n",
"@dataclass(frozen=True)\n",
"class ModelEvaluationConfig:\n",
" root_dir: str\n",
" data_path: str\n",
" model_path: str\n",
" tokenizer_path: str\n",
" metric_file_name: str"
]
},
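{
"cell_type": "markdown",
"metadata": {},
"source": [
"The dataclass above only describes the shape of the evaluation settings; the actual values are read from the project's YAML configuration by the `ConfigurationManager` defined below. The next cell is a small illustrative sketch of constructing a `ModelEvaluationConfig` by hand; the paths shown are hypothetical placeholders, not the project's real configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch only: in the real pipeline these values come from the\n",
"# YAML config via ConfigurationManager; the paths below are hypothetical.\n",
"example_config = ModelEvaluationConfig(\n",
"    root_dir=\"artifacts/model_evaluation\",\n",
"    data_path=\"artifacts/data_transformation/samsum_dataset\",\n",
"    model_path=\"artifacts/model_trainer/pegasus-samsum-model\",\n",
"    tokenizer_path=\"artifacts/model_trainer/tokenizer\",\n",
"    metric_file_name=\"artifacts/model_evaluation/metrics.csv\",\n",
")\n",
"example_config"
]
},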
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from box import ConfigBox\n",
"from pathlib import Path\n",
"from src.TextSummarizer.constants import file_path\n",
"from src.TextSummarizer.utils.general import read_yaml, create_directories\n",
"\n",
"\n",
"class ConfigurationManager:\n",
"\n",
" def __init__(self) -> None:\n",
" self.config: ConfigBox = read_yaml(Path(file_path.CONFIG_FILE_PATH))\n",
" self.params: ConfigBox = read_yaml(Path(file_path.PARAMS_FILE_PATH))\n",
"\n",
" create_directories(path_to_directories=[self.config.artifacts_root])\n",
"\n",
" def get_model_evaluation_config(self) -> ModelEvaluationConfig:\n",
" config = self.config.model_evaluation\n",
"\n",
" create_directories([config.root_dir])\n",
"\n",
" model_evaluation_config = ModelEvaluationConfig(\n",
" root_dir=config.root_dir,\n",
" data_path=config.data_path,\n",
" model_path = config.model_path,\n",
" tokenizer_path = config.tokenizer_path,\n",
" metric_file_name = config.metric_file_name\n",
"\n",
" )\n",
"\n",
" return model_evaluation_config"
]
},
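{
"cell_type": "markdown",
"metadata": {},
"source": [
"`read_yaml` and `create_directories` are imported from `src.TextSummarizer.utils.general`, which is not shown in this notebook. The cell below is a minimal sketch of what those helpers are assumed to do (parse YAML into an attribute-accessible `ConfigBox`, and create directories if they are missing); the real implementations may differ."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch of the assumed behavior of the project utilities; the real\n",
"# helpers live in src.TextSummarizer.utils.general and may differ in detail.\n",
"import yaml\n",
"from pathlib import Path\n",
"from box import ConfigBox\n",
"\n",
"\n",
"def read_yaml_sketch(path: Path) -> ConfigBox:\n",
"    # Parse a YAML file and wrap the result so keys are attribute-accessible.\n",
"    with open(path) as f:\n",
"        return ConfigBox(yaml.safe_load(f))\n",
"\n",
"\n",
"def create_directories_sketch(path_to_directories: list) -> None:\n",
"    # Create each directory (and any missing parents), ignoring existing ones.\n",
"    for directory in path_to_directories:\n",
"        Path(directory).mkdir(parents=True, exist_ok=True)"
]
},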
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
"from datasets import load_dataset, load_from_disk, load_metric\n",
"import torch\n",
"import pandas as pd\n",
"from tqdm import tqdm\n",
"\n",
"class ModelEvaluation:\n",
" def __init__(self, config: ModelEvaluationConfig):\n",
" self.config = config\n",
"\n",
"\n",
"\n",
" def generate_batch_sized_chunks(self,list_of_elements, batch_size):\n",
" \"\"\"split the dataset into smaller batches that we can process simultaneously\n",
" Yield successive batch-sized chunks from list_of_elements.\"\"\"\n",
" for i in range(0, len(list_of_elements), batch_size):\n",
" yield list_of_elements[i : i + batch_size]\n",
"\n",
"\n",
" def calculate_metric_on_test_ds(self,dataset, metric, model, tokenizer,\n",
" batch_size=16, device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
" column_text=\"article\",\n",
" column_summary=\"highlights\"):\n",
" article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))\n",
" target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))\n",
"\n",
" for article_batch, target_batch in tqdm(\n",
" zip(article_batches, target_batches), total=len(article_batches)):\n",
"\n",
" inputs = tokenizer(article_batch, max_length=1024, truncation=True,\n",
" padding=\"max_length\", return_tensors=\"pt\")\n",
"\n",
" summaries = model.generate(input_ids=inputs[\"input_ids\"].to(device),\n",
" attention_mask=inputs[\"attention_mask\"].to(device),\n",
" length_penalty=0.8, num_beams=8, max_length=128)\n",
" ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''\n",
"\n",
" # Finally, we decode the generated texts,\n",
" # replace the token, and add the decoded texts with the references to the metric.\n",
" decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,\n",
" clean_up_tokenization_spaces=True)\n",
" for s in summaries]\n",
"\n",
" decoded_summaries = [d.replace(\"\", \" \") for d in decoded_summaries]\n",
"\n",
"\n",
" metric.add_batch(predictions=decoded_summaries, references=target_batch)\n",
"\n",
" # Finally compute and return the ROUGE scores.\n",
" score = metric.compute()\n",
" return score\n",
"\n",
"\n",
" def evaluate(self):\n",
" device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
" tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)\n",
" model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)\n",
"\n",
" #loading data\n",
" dataset_samsum_pt = load_from_disk(self.config.data_path)\n",
"\n",
"\n",
" rouge_names = [\"rouge1\", \"rouge2\", \"rougeL\", \"rougeLsum\"]\n",
"\n",
" rouge_metric = load_metric('rouge')\n",
"\n",
" score = self.calculate_metric_on_test_ds(\n",
" dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, batch_size = 2, column_text = 'dialogue', column_summary= 'summary'\n",
" )\n",
"\n",
" rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )\n",
"\n",
" df = pd.DataFrame(rouge_dict, index = ['pegasus'] )\n",
" df.to_csv(self.config.metric_file_name, index=False)\n"
]
},
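{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick way to see what `generate_batch_sized_chunks` does is to run it on a toy list, as in the illustrative cell below (the `config` argument is irrelevant for this helper, so `None` is passed)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: the batching helper splits a sequence into successive\n",
"# chunks of at most batch_size elements; the last chunk may be shorter.\n",
"toy_evaluator = ModelEvaluation(config=None)\n",
"list(toy_evaluator.generate_batch_sized_chunks(list(range(7)), batch_size=3))\n",
"# Expected: [[0, 1, 2], [3, 4, 5], [6]]"
]
},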
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" config = ConfigurationManager()\n",
" model_evaluation_config = config.get_model_evaluation_config()\n",
" model_evaluation_config = ModelEvaluation(config=model_evaluation_config)\n",
" model_evaluation_config.evaluate()\n",
"except Exception as e:\n",
" raise e"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}