KoichiYasuoka committed
Commit 2696d5a · 1 Parent(s): bcb111b

model improved

Files changed (5):
  1. config.json +1 -1
  2. maker.sh → maker.py +22 -41
  3. oldtokenizer.json +0 -0
  4. pytorch_model.bin +2 -2
  5. ud.py +8 -2
config.json CHANGED
@@ -371,7 +371,7 @@
   "summary_use_proj": true,
   "tokenizer_class": "PreTrainedTokenizerFast",
   "torch_dtype": "float32",
-  "transformers_version": "4.42.4",
+  "transformers_version": "4.44.2",
   "use_cache": true,
   "vocab_size": 32000
 }
maker.sh → maker.py RENAMED
@@ -1,22 +1,17 @@
-#! /bin/sh
-S=abeja/gpt2-large-japanese
-T=KoichiYasuoka/abeja-gpt2-large-japanese-ud-causal
-U=https://github.com/UniversalDependencies/UD_Japanese-GSDLUW
-D=`basename $U`
-test -d $D || git clone --depth=1 $U
-for F in train dev test
-do cp $D/*-$F.conllu $F.conllu
-done
+#! /usr/bin/python3
+src="abeja/gpt2-large-japanese"
+tgt="KoichiYasuoka/abeja-gpt2-large-japanese-ud-causal"
+url="https://github.com/UniversalDependencies/UD_Japanese-GSDLUW"
 
-TMPA=./maker$$a.py
-( echo '#! /usr/bin/python3'
-  echo 'src="'$S'"'
-  cat << 'EOF'
-import json
-from transformers import AutoTokenizer
+import os,json,unicodedata
+from transformers import AutoTokenizer,PreTrainedTokenizerFast,AutoConfig,GPT2ForTokenClassification,DefaultDataCollator,TrainingArguments,Trainer
+d=os.path.basename(url)
+os.system("test -d "+d+" || git clone --depth=1 "+url)
+os.system("for F in train dev test ; do cp "+d+"/*-$F.conllu $F.conllu ; done")
 tkz=AutoTokenizer.from_pretrained(src,add_prefix_space=False,legacy=False,model_max_length=1280)
 tkz.save_pretrained("tmpdir")
 d=json.loads(tkz.backend_tokenizer.to_str())
+tkz.backend_tokenizer.from_str(json.dumps(d)).save("tmpdir/oldtokenizer.json")
 form=set()
 with open("train.conllu","r",encoding="utf-8") as r:
   for s in r:
@@ -27,22 +22,14 @@ for t in d["model"]["vocab"]:
   if t[0] not in form:
     t[1]*=len(t[0])
 tkz.backend_tokenizer.from_str(json.dumps(d)).save("tmpdir/tokenizer.json")
-EOF
-) > $TMPA
-chmod 755 $TMPA
-$TMPA
-
-TMPB=./maker$$b.py
-( echo '#! /usr/bin/env deepspeed'
-  echo 'src="'$S'"'
-  echo 'tgt="'$T'"'
-  cat << 'EOF'
-from transformers import PreTrainedTokenizerFast,AutoConfig,GPT2ForTokenClassification,DefaultDataCollator,TrainingArguments,Trainer
+ntk=PreTrainedTokenizerFast.from_pretrained("tmpdir")
+otk=PreTrainedTokenizerFast.from_pretrained("tmpdir",tokenizer_file="tmpdir/oldtokenizer.json")
 
 class UDCausalDataset(object):
-  def __init__(self,conllu,tokenizer,embeddings=None):
+  def __init__(self,conllu,tokenizer,oldtokenizer=None,embeddings=None):
     self.conllu=open(conllu,"r",encoding="utf-8")
     self.tokenizer=tokenizer
+    self.oldtokenizer=oldtokenizer if oldtokenizer else tokenizer
     self.embeddings=embeddings
     self.max_tokens=3
     self.seeks=[(0,0)]
@@ -87,8 +74,8 @@ class UDCausalDataset(object):
         if w[0].isdecimal():
           upos.append(w[3] if w[5]=="_" else w[3]+"|"+w[5])
           deps.append((int(w[6]),w[7]))
-    v=self.tokenizer(form,add_special_tokens=False)
     if t==0:
+      v=self.tokenizer(form,add_special_tokens=False)
       i,u=[],[]
       for j,(x,y) in enumerate(zip(v["input_ids"],upos)):
         if x!=[]:
@@ -98,6 +85,7 @@ class UDCausalDataset(object):
       pad=self.tokenizer.pad_token_id
     else:
       import torch
+      v=self.oldtokenizer(form,add_special_tokens=False)
       m=[]
       for x in v["input_ids"]:
         if x==[]:
@@ -125,23 +113,16 @@ class UDCausalDataset(object):
       upos=u[0:self.max_tokens]
     return {"inputs_embeds":emb[ids,:],"labels":[self.label2id[p] for p in upos]}
 
-tkz=PreTrainedTokenizerFast.from_pretrained("tmpdir")
-trainDS=UDCausalDataset("train.conllu",tkz)
-devDS=UDCausalDataset("dev.conllu",tkz)
-testDS=UDCausalDataset("test.conllu",tkz)
+trainDS=UDCausalDataset("train.conllu",ntk,otk)
+devDS=UDCausalDataset("dev.conllu",ntk,otk)
+testDS=UDCausalDataset("test.conllu",ntk,otk)
 lid=trainDS(devDS,testDS)
 cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True)
 mdl=GPT2ForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True)
-trainDS.embeddings=mdl.get_input_embeddings().weight.detach().cpu()
+trainDS.embeddings=mdl.get_input_embeddings().weight
 trainDS.max_tokens=min(trainDS.max_tokens,cfg.max_position_embeddings)
-dsp={"fp16":{"enabled":"auto"},"optimizer":{"type":"AdamW"},"scheduler":{"type":"WarmupLR","params":{}},"train_batch_size":"auto","train_micro_batch_size_per_gpu":"auto","zero_optimization":{"stage":3,"offload_optimizer":{"device":"cpu","pin_memory":True},"offload_param":{"device":"cpu","pin_memory":True},"overlap_comm":True,"contiguous_gradients":True,"reduce_bucket_size":"auto","stage3_prefetch_bucket_size":"auto","stage3_param_persistence_threshold":"auto","stage3_gather_16bit_weights_on_model_save":True}}
-arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=16,deepspeed=dsp,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,learning_rate=5e-05,warmup_ratio=0.1,save_safetensors=False)
+arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=16,dataloader_pin_memory=False,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,learning_rate=5e-05,warmup_ratio=0.1,save_safetensors=False)
 trn=Trainer(args=arg,data_collator=DefaultDataCollator(),model=mdl,train_dataset=trainDS)
 trn.train()
 trn.save_model(tgt)
-tkz.save_pretrained(tgt)
-EOF
-) > $TMPB
-chmod 755 $TMPB
-$TMPB
-exit
+ntk.save_pretrained(tgt)
oldtokenizer.json ADDED
The diff for this file is too large to render. See raw diff
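The file added here is the untouched copy of the backend tokenizer that maker.py saves before re-weighting the vocabulary. Assuming the backend is a Unigram-style model whose vocab entries are [piece, log-probability] pairs (which the t[1]*=len(t[0]) update in maker.py suggests), every piece that never occurs as a whole word form in train.conllu has its negative score scaled by its length, so the re-weighted tokenizer.json strongly prefers pieces that coincide with UD word forms, while oldtokenizer.json keeps the original scores and is the one used where inputs_embeds are built (see the else: branch in maker.py and the v=self.oldtokenizer(...) call in ud.py below). A minimal sketch for comparing the two after maker.py has been run; the example word is hypothetical and not taken from the commit:

# Illustrative sketch, not part of the commit: requires the tmpdir/ directory written by maker.py above.
from transformers import PreTrainedTokenizerFast
ntk=PreTrainedTokenizerFast.from_pretrained("tmpdir")  # re-weighted scores (tokenizer.json)
otk=PreTrainedTokenizerFast.from_pretrained("tmpdir",tokenizer_file="tmpdir/oldtokenizer.json")  # original scores
word="国立国会図書館"  # hypothetical example word
print(otk.tokenize(word))  # segmentation under the original scores
print(ntk.tokenize(word))  # pieces unseen as whole UD word forms are now penalized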
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b5771f96d35bf2299e87a19725f670365d927bfb052813568e50f4c60906304d
-size 3003627106
+oid sha256:43a978eb2923908e13abf4f6698881dd6fe29375c2a99d06877883dd31a28014
+size 3003633250
ud.py CHANGED
@@ -1,5 +1,10 @@
 import numpy
-from transformers import TokenClassificationPipeline
+from transformers import TokenClassificationPipeline,AutoTokenizer
+try:
+  from transformers.utils import cached_file
+except:
+  from transformers.file_utils import cached_path,hf_bucket_url
+  cached_file=lambda x,y:os.path.join(x,y) if os.path.isdir(x) else cached_path(hf_bucket_url(x,y))
 
 class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
   def __init__(self,**kwargs):
@@ -42,6 +47,7 @@ class UniversalDependenciesCausalPipeline(BellmanFordTokenClassificationPipeline
   def __init__(self,**kwargs):
     kwargs["aggregation_strategy"]="simple"
     super().__init__(**kwargs)
+    self.oldtokenizer=AutoTokenizer.from_pretrained(self.tokenizer.name_or_path,tokenizer_file=cached_file(self.tokenizer.name_or_path,"oldtokenizer.json"))
     x=self.model.config.label2id
     self.root=numpy.full((len(x)),numpy.nan)
     self.left_arc=numpy.full((len(x)),numpy.nan)
@@ -87,7 +93,7 @@ class UniversalDependenciesCausalPipeline(BellmanFordTokenClassificationPipeline
       if d[i].strip()=="":
         d.pop(i)
         w.pop(i)
-    v=self.tokenizer(d,add_special_tokens=False)
+    v=self.oldtokenizer(d,add_special_tokens=False)
     e=self.model.get_input_embeddings().weight
     m=[]
     for x in v["input_ids"]:
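
With this change the pipeline fetches oldtokenizer.json from the model repository at load time and uses it, rather than the re-weighted tokenizer, when re-tokenizing the words just before the get_input_embeddings() lookup, mirroring the training-side change in maker.py. A hedged usage sketch follows; it assumes the repository's config.json registers a "universal-dependencies" custom pipeline pointing at ud.py, as the author's companion ud-causal models do (that mapping is not shown in this diff), otherwise UniversalDependenciesCausalPipeline can be imported from ud.py and constructed directly:

# Illustrative usage sketch, not part of the commit.
from transformers import pipeline
nlp=pipeline(task="universal-dependencies",model="KoichiYasuoka/abeja-gpt2-large-japanese-ud-causal",trust_remote_code=True)
print(nlp("国境の長いトンネルを抜けると雪国であった。"))  # hypothetical input sentence; expected to print a CoNLL-U style analysis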