|
|
|
import os |
|
import errno |
|
import torch |
|
import sys |
|
import logging |
|
import json |
|
from pathlib import Path |
|
import torch.distributed as dist |
|
import csv |
|
import os.path as osp |
|
from time import time |
|
from numpy import mean |
|
import re |
|
from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup |
|
|
from torch import nn |
|
|
|
|
|
|
|
|
|
def set_optim(opt, model_list, freeze_part=[], accumulation_step=None): |
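    """Build an AdamW optimizer and LR scheduler for the models in `model_list`.

    Parameters whose names contain any substring in `freeze_part` are frozen
    (requires_grad=False) and excluded from the optimizer. Parameters whose
    names match the `ke_part` list below are trained with `opt.ke_lr`; all
    other trainable parameters use `opt.lr` (or the layer-wise rates from
    LLRD() when `opt.LLRD` is set). Judging from how `opt` is used here, it is
    expected to provide at least: lr, ke_lr, weight_decay, adam_epsilon, LLRD,
    scheduler, warmup_steps, total_steps and accumulation_steps.
    Returns (optimizer, scheduler).
    """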
|
|
|
optimizer_list, scheduler_list, named_parameters = [], [], [] |
|
|
|
for model in model_list: |
|
model_para = list(model.named_parameters()) |
|
model_para_train, freeze_layer = [], [] |
|
for n, p in model_para: |
|
if not any(nd in n for nd in freeze_part): |
|
model_para_train.append((n, p)) |
|
else: |
|
p.requires_grad = False |
|
freeze_layer.append((n, p)) |
|
named_parameters.extend(model_para_train) |
|
|
|
|
|
|
|
|
|
|
|
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] |
|
|
|
ke_part = ['ke_model', 'loss_awl', 'numeric_model', 'order'] |
|
if opt.LLRD: |
|
|
|
all_name_orig = [n for n, p in named_parameters if not any(nd in n for nd in ke_part)] |
|
|
|
opt_parameters, all_name = LLRD(opt, named_parameters, no_decay, ke_part) |
|
remain = list(set(all_name_orig) - set(all_name)) |
|
remain_parameters = [ |
|
{'params': [p for n, p in named_parameters if not any(nd in n for nd in no_decay) and n in remain], "lr": opt.lr, 'weight_decay': opt.weight_decay}, |
|
{'params': [p for n, p in named_parameters if any(nd in n for nd in no_decay) and n in remain], "lr": opt.lr, 'weight_decay': 0.0} |
|
] |
|
opt_parameters.extend(remain_parameters) |
|
else: |
|
opt_parameters = [ |
|
{'params': [p for n, p in named_parameters if not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)], "lr": opt.lr, 'weight_decay': opt.weight_decay}, |
|
{'params': [p for n, p in named_parameters if any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)], "lr": opt.lr, 'weight_decay': 0.0} |
|
] |
|
|
|
ke_parameters = [ |
|
{'params': [p for n, p in named_parameters if not any(nd in n for nd in no_decay) and any(nd in n for nd in ke_part)], "lr": opt.ke_lr, 'weight_decay': opt.weight_decay}, |
|
{'params': [p for n, p in named_parameters if any(nd in n for nd in no_decay) and any(nd in n for nd in ke_part)], "lr": opt.ke_lr, 'weight_decay': 0.0} |
|
] |
|
opt_parameters.extend(ke_parameters) |
|
optimizer = AdamW(opt_parameters, lr=opt.lr, eps=opt.adam_epsilon) |
|
if accumulation_step is None: |
|
accumulation_step = opt.accumulation_steps |
|
if opt.scheduler == 'linear': |
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(opt.warmup_steps/accumulation_step), num_training_steps=int(opt.total_steps/accumulation_step)) |
|
else: |
|
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=int(opt.warmup_steps/accumulation_step), num_training_steps=int(opt.total_steps/accumulation_step)) |
|
|
|
|
|
all_para_num = 0 |
|
for paras in opt_parameters: |
|
all_para_num += len(paras['params']) |
|
|
|
assert len(named_parameters) == all_para_num |
|
return optimizer, scheduler |
|
|
|
|
|
|
|
def LLRD(opt, named_parameters, no_decay, ke_part=[]):
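    """Build parameter groups with layer-wise learning rate decay (LLRD).

    Head modules (pooler / regressor / predictions) get a slightly larger LR
    (opt.lr * 1.05), each transformer layer from the top down gets the previous
    layer's LR multiplied by 0.95, and the embeddings get the smallest LR.
    Weight decay for the decayed groups is fixed at 0.01 here rather than
    opt.weight_decay. Returns the parameter groups plus the list of parameter
    names they cover, so the caller can build groups for any remaining
    parameters.
    """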
|
opt_parameters = [] |
|
all_name = [] |
|
head_lr = opt.lr * 1.05 |
|
init_lr = opt.lr |
|
lr = init_lr |
|
|
|
|
|
params_0 = [p for n,p in named_parameters if ("pooler" in n or "regressor" in n or "predictions" in n) |
|
and any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] |
|
params_1 = [p for n,p in named_parameters if ("pooler" in n or "regressor" in n or "predictions" in n) |
|
and not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] |
|
|
|
name_0 = [n for n,p in named_parameters if ("pooler" in n or "regressor" in n or "predictions" in n) |
|
and any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] |
|
name_1 = [n for n,p in named_parameters if ("pooler" in n or "regressor" in n or "predictions" in n) |
|
and not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] |
|
|
|
all_name.extend(name_0) |
|
all_name.extend(name_1) |
|
|
|
head_params = {"params": params_0, "lr": head_lr, "weight_decay": 0.0} |
|
opt_parameters.append(head_params) |
|
|
|
head_params = {"params": params_1, "lr": head_lr, "weight_decay": 0.01} |
|
opt_parameters.append(head_params) |
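    # Transformer blocks, top (11) down to bottom (0); this assumes a 12-layer
    # encoder (e.g. BERT-base) whose parameters are named "encoder.layer.{i}.".
    # Each step down multiplies the learning rate by 0.95.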
|
|
|
|
|
for layer in range(11,-1,-1): |
|
params_0 = [p for n,p in named_parameters if f"encoder.layer.{layer}." in n |
|
and any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] |
|
params_1 = [p for n,p in named_parameters if f"encoder.layer.{layer}." in n |
|
and not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] |
|
|
|
layer_params = {"params": params_0, "lr": lr, "weight_decay": 0.0} |
|
opt_parameters.append(layer_params) |
|
|
|
layer_params = {"params": params_1, "lr": lr, "weight_decay": 0.01} |
|
opt_parameters.append(layer_params) |
|
|
|
name_0 = [n for n,p in named_parameters if f"encoder.layer.{layer}." in n |
|
and any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] |
|
name_1 = [n for n,p in named_parameters if f"encoder.layer.{layer}." in n |
|
and not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] |
|
all_name.extend(name_0) |
|
all_name.extend(name_1) |
|
|
|
lr *= 0.95 |
|
|
|
|
|
params_0 = [p for n,p in named_parameters if ("embeddings" in n ) |
|
and any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] |
|
params_1 = [p for n,p in named_parameters if ("embeddings" in n ) |
|
and not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] |
|
|
|
embed_params = {"params": params_0, "lr": lr, "weight_decay": 0.0} |
|
opt_parameters.append(embed_params) |
|
|
|
embed_params = {"params": params_1, "lr": lr, "weight_decay": 0.01} |
|
opt_parameters.append(embed_params) |
|
|
|
name_0 = [n for n,p in named_parameters if ("embeddings" in n ) |
|
and any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] |
|
name_1 = [n for n,p in named_parameters if ("embeddings" in n ) |
|
and not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] |
|
all_name.extend(name_0) |
|
all_name.extend(name_1) |
|
return opt_parameters, all_name |
|
|
|
class FixedScheduler(torch.optim.lr_scheduler.LambdaLR): |
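    """LambdaLR that keeps the learning rate constant (multiplier fixed at 1.0)."""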
|
def __init__(self, optimizer, last_epoch=-1): |
|
super(FixedScheduler, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) |
|
|
|
def lr_lambda(self, step): |
|
return 1.0 |
|
|
|
|
|
class WarmupLinearScheduler(torch.optim.lr_scheduler.LambdaLR): |
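    """Linear warmup from `min_ratio` up to 1.0 over `warmup_steps`, then linear
    decay that reaches `min_ratio` at `scheduler_steps` and continues down to a
    floor of 0 beyond that."""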
|
def __init__(self, optimizer, warmup_steps, scheduler_steps, min_ratio, last_epoch=-1): |
|
self.warmup_steps = warmup_steps |
|
self.scheduler_steps = scheduler_steps |
|
self.min_ratio = min_ratio |
|
|
|
super(WarmupLinearScheduler, self).__init__( |
|
optimizer, self.lr_lambda, last_epoch=last_epoch |
|
) |
|
|
|
def lr_lambda(self, step): |
|
if step < self.warmup_steps: |
|
return (1 - self.min_ratio) * step / float(max(1, self.warmup_steps)) + self.min_ratio |
|
|
|
|
|
|
|
|
|
return max(0.0, |
|
1.0 + (self.min_ratio - 1) * (step - self.warmup_steps) / float(max(1.0, self.scheduler_steps - self.warmup_steps)), |
|
) |
|
|
|
|
|
class Loss_log(): |
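    """Bookkeeping for training loops: running loss, token-level (top-k)
    accuracy, per-step timing and a simple early-stopping counter.

    Call acc_init(topn=[...]) before using the top-k token-accuracy tracking;
    get_loss() returns a large sentinel value (500.) before any loss is logged.
    """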
|
def __init__(self): |
|
self.loss = [] |
|
self.acc = [0.] |
|
self.flag = 0 |
|
self.token_right_num = [] |
|
self.token_all_num = [] |
|
self.word_right_num = [] |
|
self.word_all_num = [] |
|
|
|
self.use_top_k_acc = 0 |
|
|
|
def acc_init(self, topn=[1]): |
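        # reset the loss / token-accuracy buffers and enable top-k accuracy tracking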
|
self.loss = [] |
|
self.token_right_num = [] |
|
self.token_all_num = [] |
|
self.topn = topn |
|
self.use_top_k_acc = 1 |
|
self.top_k_word_right = {} |
|
for n in topn: |
|
self.top_k_word_right[n] = [] |
|
|
|
def time_init(self): |
|
self.start = time() |
|
self.last = self.start |
|
self.time_used_epoch = [] |
|
|
|
def time_cpt(self, step, total_step): |
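        # total elapsed time and estimated time remaining, each returned as (h, m, s)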
|
|
|
time_used_last_epoch = time() - self.last |
|
self.time_used_epoch.append(time_used_last_epoch) |
|
time_used = time() - self.start |
|
self.last = time() |
|
h, m, s = time_trans(time_used) |
|
time_remain = int(total_step - step) * mean(self.time_used_epoch) |
|
h_r, m_r, s_r = time_trans(time_remain) |
|
|
|
return h, m, s, h_r, m_r, s_r |
|
|
|
def get_token_acc(self): |
|
|
|
if len(self.token_all_num) == 0: |
|
return 0. |
|
elif self.use_top_k_acc == 1: |
|
res = [] |
|
for n in self.topn: |
|
res.append(round((sum(self.top_k_word_right[n]) / sum(self.token_all_num)) * 100 , 3)) |
|
return res |
|
else: |
|
return [sum(self.token_right_num)/sum(self.token_all_num)] |
|
|
|
|
|
def update_token(self, token_num, token_right): |
|
|
|
self.token_all_num.append(token_num) |
|
        if isinstance(token_right, list):
            # top-k mode: token_right carries one correct-count per k in self.topn
            for i, n in enumerate(self.topn):
                self.top_k_word_right[n].append(token_right[i])
        else:
            self.token_right_num.append(token_right)
|
|
|
def update(self, case): |
|
self.loss.append(case) |
|
|
|
def update_acc(self, case): |
|
self.acc.append(case) |
|
|
|
def get_loss(self): |
|
if len(self.loss) == 0: |
|
return 500. |
|
return mean(self.loss) |
|
|
|
def get_acc(self): |
|
return self.acc[-1] |
|
|
|
def get_min_loss(self): |
|
return min(self.loss) |
|
|
|
def early_stop(self): |
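        # patience counter: returns True once the latest loss has failed to improve
        # on the best loss for more than 1000 consecutive updates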
|
|
|
if self.loss[-1] > min(self.loss): |
|
self.flag += 1 |
|
else: |
|
self.flag = 0 |
|
|
|
if self.flag > 1000: |
|
return True |
|
else: |
|
return False |
|
|
|
|
|
def add_special_token(tokenizer, model=None, rank=0, cache_path=None):
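    """Add domain-specific special tokens and vocabulary entries to `tokenizer`.

    If `model` is given, its token embedding matrix is resized to match the new
    vocabulary. If `cache_path` is given, it is expected to point to a
    torch-saved {token: embedding} mapping used to warm-start the newly added
    rows (the model is assumed to expose a BERT backbone at `model.bert`).
    Returns (tokenizer, special_token, norm_token).
    """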
|
|
|
|
|
|
|
|
|
|
|
special_token = ['[SEP]', '[MASK]', '[ALM]', '[KPI]', '[CLS]', '[LOC]', '[EOS]', '[ENT]', '[ATTR]', '[NUM]', '[REL]', '|', '[DOC]'] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
norm_token = ['网元实例', '事件类型', '告警级别', '告警名称', '告警源', '通讯系统', '默认值', '链路故障', '取值范围', '可选必选说明', '数据来源', '用户平面', '配置', '原则', '该参数', '失败次数', '可选参数', 'S1模式', '必选参数', 'IP地址', '响应消息', '成功次数', '测量指标', '用于', '统计周期', '该命令', '上下文', '请求次数', '本端', 'pod', 'amf', 'smf', 'nrf', 'ausf', 'upcf', 'upf', 'udm', 'PDU', 'alias', 'PLMN', 'MML', 'Info_Measure', 'icase', 'Diameter', 'MSISDN', 'RAT', 'RMV', 'PFCP', 'NSSAI', 'CCR', 'HDBNJjs', 'HNGZgd', 'SGSN', '3GPP', 'Bearer', 'sbim', 'FusionSphere', 'IMSI', 'GGSN', 'RETCODE', 'PCRF', 'PDP', 'GTP', 'OCS', 'HLR', 'FFFF', 'VLR', 'DNN', 'PID', 'CSCF', 'PDN', 'SCTP', 'SPGW', 'TAU', 'PCEF', 'NSA', 'ACL', 'BGP', 'USCDB', 'VoLTE', 'RNC', 'GPRS', 'DRA', 'MOC', '告警', '网元', '对端', '信令', '话单', '操作', '风险', '等级', '下发', '流控', '运营商', '寻呼', '漫游', '切片', '报文', '号段', '承载', '批量', '导致', '原因是', '影响', '造成', '引起', '随之', '情况下', '根因', 'trigger'] |
|
|
|
|
|
|
|
|
|
|
|
|
|
norm_token_tobe_added = ['pod', 'amf', 'smf', 'nrf', 'ausf', 'upcf', 'upf', 'udm', 'ALM', '告警', '网元', '对端', '信令', '话单', 'RAN', 'MML', 'PGW', 'MME', 'SGW', 'NF', 'APN', 'LST', 'GW', 'QoS', 'IPv', 'PDU', 'IMS', 'EPS', 'GTP', 'PDP', 'LTE', 'HSS'] |
|
|
|
token_tobe_added = [] |
|
|
|
all_token = norm_token_tobe_added |
|
for i in all_token: |
|
if i not in tokenizer.vocab.keys() and i.lower() not in tokenizer.vocab.keys(): |
|
token_tobe_added.append(i) |
|
|
|
|
|
|
|
tokenizer.add_tokens(token_tobe_added, special_tokens=False) |
|
special_tokens_dict = {"additional_special_tokens": special_token} |
|
special_token_ = tokenizer.add_special_tokens(special_tokens_dict) |
|
if rank == 0: |
|
print("Added tokens:") |
|
print(tokenizer.get_added_vocab()) |
|
|
|
|
|
|
|
if model is not None: |
|
|
|
if rank == 0: |
|
print(f"--------------------------------") |
|
print(f"-------- orig word embedding shape: {model.get_input_embeddings().weight.shape}") |
|
sz = model.resize_token_embeddings(len(tokenizer)) |
|
if cache_path is not None: |
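            # warm-start the newly added embedding rows from a cached {token: embedding} mapping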
|
|
|
token_2_emb = torch.load(cache_path) |
|
|
|
token_dic = tokenizer.get_added_vocab() |
|
id_2_token = {v:k for k,v in token_dic.items()} |
|
with torch.no_grad(): |
|
for key in id_2_token.keys(): |
|
model.bert.embeddings.word_embeddings.weight[key,:] = nn.Parameter(token_2_emb[id_2_token[key]][0]).cuda() |
|
|
|
|
|
model.bert.tie_weights() |
|
if rank == 0: |
|
print(f"-------- resize_token_embeddings into {sz} done!") |
|
print(f"--------------------------------") |
|
|
|
|
|
norm_token = list(set(norm_token).union(set(norm_token_tobe_added))) |
|
return tokenizer, special_token, norm_token |
|
|
|
|
|
def time_trans(sec): |
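    # convert a duration in seconds to (hours, minutes, seconds)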
|
m, s = divmod(sec, 60) |
|
h, m = divmod(m, 60) |
|
return int(h), int(m), int(s) |
|
|
|
def torch_accuracy(output, target, topk=(1,)): |
|
    '''
    Top-k accuracy.
    output: scores/logits of shape (batch_size, num_classes); target: class
    indices of shape (batch_size,). Both should be torch.Tensor.
    Returns (accuracies in percent, raw correct counts), one entry per k in topk.
    '''
|
|
|
|
|
|
|
|
|
topn = max(topk) |
|
batch_size = output.size(0) |
|
|
|
_, pred = output.topk(topn, 1, True, True) |
|
pred = pred.t() |
|
|
|
is_correct = pred.eq(target.view(1, -1).expand_as(pred)) |
|
|
|
ans = [] |
|
ans_num = [] |
|
for i in topk: |
|
|
|
is_correct_i = is_correct[:i].contiguous().view(-1).float().sum(0, keepdim=True) |
|
ans_num.append(int(is_correct_i.item())) |
|
ans.append(is_correct_i.mul_(100.0 / batch_size)) |
|
|
|
return ans, ans_num |
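

# A minimal, illustrative smoke test for the helpers that need no `opt`
# namespace (guarded so importing this module stays side-effect free); the
# tensors and values below are made up purely for demonstration.
if __name__ == "__main__":
    logits = torch.tensor([[0.1, 0.7, 0.2],
                           [0.8, 0.1, 0.1]])
    labels = torch.tensor([1, 0])

    acc, correct = torch_accuracy(logits, labels, topk=(1, 2))
    print("top-1 / top-2 accuracy (%):", [a.item() for a in acc], "correct:", correct)

    log = Loss_log()
    log.acc_init(topn=[1, 2])
    log.update_token(labels.size(0), correct)
    log.update(0.5)
    print("token acc:", log.get_token_acc(), "mean loss:", log.get_loss())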
|
|
|
|