KoichiYasuoka
committed on
Commit
·
909aa8f
1
Parent(s):
5d99ddd
model improved for transformers 4.42
Browse files
- config.json +2 -13
- maker.sh +97 -0
- pytorch_model-00001-of-00007.bin +1 -1
- pytorch_model-00002-of-00007.bin +1 -1
- pytorch_model-00003-of-00007.bin +1 -1
- pytorch_model-00004-of-00007.bin +1 -1
- pytorch_model-00005-of-00007.bin +1 -1
- pytorch_model-00006-of-00007.bin +1 -1
- pytorch_model-00007-of-00007.bin +1 -1
- pytorch_model.bin.index.json +3 -3
- tokenizer_config.json +1 -0
- upos.py +1 -40
config.json
CHANGED
@@ -4,22 +4,11 @@
|
|
4 |
],
|
5 |
"attention_bias": false,
|
6 |
"attention_dropout": 0.0,
|
7 |
-
"auto_map": {
|
8 |
-
"AutoModelForTokenClassification": "upos.LlamaForTokenClassification"
|
9 |
-
},
|
10 |
"bos_token_id": 128000,
|
11 |
"custom_pipelines": {
|
12 |
"upos": {
|
13 |
"impl": "upos.BellmanFordTokenClassificationPipeline",
|
14 |
"pt": "AutoModelForTokenClassification"
|
15 |
-
},
|
16 |
-
"token-classification": {
|
17 |
-
"impl": "upos.RawTokenClassificationPipeline",
|
18 |
-
"pt": "AutoModelForTokenClassification"
|
19 |
-
},
|
20 |
-
"ner": {
|
21 |
-
"impl": "upos.RawTokenClassificationPipeline",
|
22 |
-
"pt": "AutoModelForTokenClassification"
|
23 |
}
|
24 |
},
|
25 |
"eos_token_id": 128001,
|
@@ -162,9 +151,9 @@
|
|
162 |
"rope_scaling": null,
|
163 |
"rope_theta": 500000.0,
|
164 |
"tie_word_embeddings": false,
|
165 |
-
"torch_dtype": "float32",
|
166 |
"tokenizer_class": "LlamaTokenizerFast",
|
167 |
-
"
|
|
|
168 |
"use_cache": true,
|
169 |
"vocab_size": 128259
|
170 |
}
|
|
|
4 |
],
|
5 |
"attention_bias": false,
|
6 |
"attention_dropout": 0.0,
|
|
|
|
|
|
|
7 |
"bos_token_id": 128000,
|
8 |
"custom_pipelines": {
|
9 |
"upos": {
|
10 |
"impl": "upos.BellmanFordTokenClassificationPipeline",
|
11 |
"pt": "AutoModelForTokenClassification"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
}
|
13 |
},
|
14 |
"eos_token_id": 128001,
|
|
|
151 |
"rope_scaling": null,
|
152 |
"rope_theta": 500000.0,
|
153 |
"tie_word_embeddings": false,
|
|
|
154 |
"tokenizer_class": "LlamaTokenizerFast",
|
155 |
+
"torch_dtype": "float32",
|
156 |
+
"transformers_version": "4.42.4",
|
157 |
"use_cache": true,
|
158 |
"vocab_size": 128259
|
159 |
}
|
maker.sh
ADDED
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#! /bin/sh
|
2 |
+
test -f ja_gsd_modern.conllu || curl -LO https://github.com/KoichiYasuoka/SuPar-UniDic/raw/main/suparunidic/suparmodels/ja_gsd_modern.conllu
|
3 |
+
curl -L https://huggingface.co/KoichiYasuoka/Llama-3-Swallow-8B-upos/resolve/main/tokenizer.json | egrep -v '"ã(ģ[^ ]|Ĥ[ģĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵ]) ã(ģ[^ ]|Ĥ[ģĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵ])",$' > newtokenizer.json
|
4 |
+
|
5 |
+
TMP=./maker$$.py
|
6 |
+
cat << 'EOF' > $TMP
|
7 |
+
#! /usr/bin/env deepspeed
|
8 |
+
src="KoichiYasuoka/Llama-3-Swallow-8B-upos"
|
9 |
+
tgt="KoichiYasuoka/Llama-3-Swallow-8B-char-upos"
|
10 |
+
from transformers import LlamaTokenizerFast,LlamaForTokenClassification,AutoConfig,DataCollatorForTokenClassification,TrainingArguments,Trainer
|
11 |
+
|
12 |
+
class UPOSFileDataset(object):
|
13 |
+
def __init__(self,conllu,tokenizer):
|
14 |
+
self.conllu=open(conllu,"r",encoding="utf-8")
|
15 |
+
self.tokenizer=tokenizer
|
16 |
+
self.seeks=[0]
|
17 |
+
self.multiword={}
|
18 |
+
label=set(["SYM"])
|
19 |
+
s=self.conllu.readline()
|
20 |
+
while s!="":
|
21 |
+
if s=="\n":
|
22 |
+
self.seeks.append(self.conllu.tell())
|
23 |
+
else:
|
24 |
+
w=s.split("\t")
|
25 |
+
if len(w)==10:
|
26 |
+
if w[0].isdecimal():
|
27 |
+
label.add(w[3] if w[5]=="_" else w[3]+"|"+w[5])
|
28 |
+
elif w[0].find("-")>0:
|
29 |
+
t=w[0].split("-")
|
30 |
+
f,j,k=w[1],[],[]
|
31 |
+
for i in range(int(t[0]),int(t[1])+1):
|
32 |
+
w=self.conllu.readline().split("\t")
|
33 |
+
j.append(w[3] if w[5]=="_" else w[3]+"|"+w[5])
|
34 |
+
k.append(w[1])
|
35 |
+
p="+".join(j)
|
36 |
+
label.add(p)
|
37 |
+
if p in self.multiword:
|
38 |
+
self.multiword[p][f]=list(k)
|
39 |
+
else:
|
40 |
+
self.multiword[p]={f:list(k)}
|
41 |
+
s=self.conllu.readline()
|
42 |
+
lid={}
|
43 |
+
for i,l in enumerate(sorted(label)):
|
44 |
+
lid[l],lid["B-"+l],lid["I-"+l]=i*3,i*3+1,i*3+2
|
45 |
+
self.label2id=lid
|
46 |
+
def __call__(*args):
|
47 |
+
lid={l:i for i,l in enumerate(sorted(set(sum([list(t.label2id) for t in args],[]))))}
|
48 |
+
for t in args:
|
49 |
+
t.label2id=lid
|
50 |
+
return lid
|
51 |
+
def __del__(self):
|
52 |
+
self.conllu.close()
|
53 |
+
__len__=lambda self:len(self.seeks)-1
|
54 |
+
def __getitem__(self,i):
|
55 |
+
self.conllu.seek(self.seeks[i])
|
56 |
+
form,upos=[],[]
|
57 |
+
while self.conllu.tell()<self.seeks[i+1]:
|
58 |
+
w=self.conllu.readline().split("\t")
|
59 |
+
if len(w)==10:
|
60 |
+
form.append(w[1])
|
61 |
+
if w[0].isdecimal():
|
62 |
+
upos.append(w[3] if w[5]=="_" else w[3]+"|"+w[5])
|
63 |
+
elif w[0].find("-")>0:
|
64 |
+
t=w[0].split("-")
|
65 |
+
u=[]
|
66 |
+
for j in range(int(t[0]),int(t[1])+1):
|
67 |
+
k=self.conllu.readline().split("\t")
|
68 |
+
u.append(k[3] if k[5]=="_" else k[3]+"|"+k[5])
|
69 |
+
upos.append("+".join(u))
|
70 |
+
v=self.tokenizer(form,add_special_tokens=False)
|
71 |
+
i,u=[],[]
|
72 |
+
for j,(x,y) in enumerate(zip(v["input_ids"],upos)):
|
73 |
+
if x!=[]:
|
74 |
+
i+=x
|
75 |
+
u+=[y] if len(x)==1 else ["B-"+y]+["I-"+y]*(len(x)-1)
|
76 |
+
if len(i)<self.tokenizer.model_max_length-3:
|
77 |
+
ids=[self.tokenizer.cls_token_id]+i+[self.tokenizer.sep_token_id]
|
78 |
+
upos=["SYM"]+u+["SYM"]
|
79 |
+
else:
|
80 |
+
ids=i[0:self.tokenizer.model_max_length-2]
|
81 |
+
upos=u[0:self.tokenizer.model_max_length-2]
|
82 |
+
return {"input_ids":ids,"labels":[self.label2id[t] for t in upos]}
|
83 |
+
|
84 |
+
tkz=LlamaTokenizerFast.from_pretrained(src,tokenizer_file="newtokenizer.json")
|
85 |
+
trainDS=UPOSFileDataset("ja_gsd_modern.conllu",tkz)
|
86 |
+
lid=trainDS.label2id
|
87 |
+
cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True)
|
88 |
+
dsp={"fp16":{"enabled":"auto"},"optimizer":{"type":"AdamW"},"scheduler":{"type":"WarmupLR","params":{}},"train_batch_size":"auto","train_micro_batch_size_per_gpu":"auto","zero_optimization":{"stage":3,"offload_optimizer":{"device":"cpu","pin_memory":True},"offload_param":{"device":"cpu","pin_memory":True},"overlap_comm":True,"contiguous_gradients":True,"reduce_bucket_size":"auto","stage3_prefetch_bucket_size":"auto","stage3_param_persistence_threshold":"auto","stage3_gather_16bit_weights_on_model_save":True}}
|
89 |
+
arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=8,deepspeed=dsp,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,learning_rate=5e-05,warmup_ratio=0.1,save_safetensors=False)
|
90 |
+
trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),model=LlamaForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True),train_dataset=trainDS)
|
91 |
+
trn.train()
|
92 |
+
trn.save_model(tgt)
|
93 |
+
tkz.save_pretrained(tgt)
|
94 |
+
EOF
|
95 |
+
chmod 755 $TMP
|
96 |
+
$TMP
|
97 |
+
exit
|
pytorch_model-00001-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4886522810
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:46bfedcc1eaa8d42531ddd628c9c94b482dd2a9f20c1c16d11864e8ab70223e6
|
3 |
size 4886522810
|
pytorch_model-00002-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4832018324
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cbb4659f12fa3a9f5f878d9e3811ad00f400592e21ae27aa7334ffbf473ae582
|
3 |
size 4832018324
|
pytorch_model-00003-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4999825256
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:07d570e107da750283b13b47082b1c1378651b8c77b9d37cb9f0b87dbf4cbe48
|
3 |
size 4999825256
|
pytorch_model-00004-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4999825316
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c79d77e8e2ca8e77ac5d14712c07a5c5e8dbaba66e4476f55c561cd1b67b0c0b
|
3 |
size 4999825316
|
pytorch_model-00005-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4832018324
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fbfa6b3c3cb0b1e85e2cff6b63a2a4956c2a59710315f0db489f54f4ae9c006c
|
3 |
size 4832018324
|
pytorch_model-00006-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4999825320
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:94082c38acc2313d196be5de9285a0e4e80a3769d5301860c705091b6e3fea86
|
3 |
size 4999825320
|
pytorch_model-00007-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 470797675
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9eb4ca4e2de739ea97b5774722341c230104b0146aac414349abe0aa76eb8af5
|
3 |
size 470797675
|
pytorch_model.bin.index.json
CHANGED
@@ -3,8 +3,6 @@
|
|
3 |
"total_size": 30020731120
|
4 |
},
|
5 |
"weight_map": {
|
6 |
-
"classifier.bias": "pytorch_model-00007-of-00007.bin",
|
7 |
-
"classifier.weight": "pytorch_model-00007-of-00007.bin",
|
8 |
"model.embed_tokens.weight": "pytorch_model-00001-of-00007.bin",
|
9 |
"model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00007.bin",
|
10 |
"model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00007.bin",
|
@@ -294,6 +292,8 @@
|
|
294 |
"model.layers.9.self_attn.o_proj.weight": "pytorch_model-00003-of-00007.bin",
|
295 |
"model.layers.9.self_attn.q_proj.weight": "pytorch_model-00003-of-00007.bin",
|
296 |
"model.layers.9.self_attn.v_proj.weight": "pytorch_model-00003-of-00007.bin",
|
297 |
-
"model.norm.weight": "pytorch_model-00007-of-00007.bin"
|
|
|
|
|
298 |
}
|
299 |
}
|
|
|
3 |
"total_size": 30020731120
|
4 |
},
|
5 |
"weight_map": {
|
|
|
|
|
6 |
"model.embed_tokens.weight": "pytorch_model-00001-of-00007.bin",
|
7 |
"model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00007.bin",
|
8 |
"model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00007.bin",
|
|
|
292 |
"model.layers.9.self_attn.o_proj.weight": "pytorch_model-00003-of-00007.bin",
|
293 |
"model.layers.9.self_attn.q_proj.weight": "pytorch_model-00003-of-00007.bin",
|
294 |
"model.layers.9.self_attn.v_proj.weight": "pytorch_model-00003-of-00007.bin",
|
295 |
+
"model.norm.weight": "pytorch_model-00007-of-00007.bin",
|
296 |
+
"score.bias": "pytorch_model-00007-of-00007.bin",
|
297 |
+
"score.weight": "pytorch_model-00007-of-00007.bin"
|
298 |
}
|
299 |
}
|
tokenizer_config.json
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
{
|
2 |
"add_bos_token": true,
|
3 |
"add_eos_token": false,
|
|
|
4 |
"added_tokens_decoder": {
|
5 |
"128000": {
|
6 |
"content": "<|begin_of_text|>",
|
|
|
1 |
{
|
2 |
"add_bos_token": true,
|
3 |
"add_eos_token": false,
|
4 |
+
"add_prefix_space": null,
|
5 |
"added_tokens_decoder": {
|
6 |
"128000": {
|
7 |
"content": "<|begin_of_text|>",
|
upos.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
-
from transformers import TokenClassificationPipeline
|
2 |
-
from transformers.modeling_outputs import TokenClassifierOutput
|
3 |
|
4 |
class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
|
5 |
def __init__(self,**kwargs):
|
@@ -40,41 +39,3 @@ class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
|
|
40 |
t["text"]=model_outputs["sentence"][t["start"]:t["end"]]
|
41 |
return w
|
42 |
|
43 |
-
class RawTokenClassificationPipeline(TokenClassificationPipeline):
|
44 |
-
def check_model_type(self,supported_models):
|
45 |
-
pass
|
46 |
-
|
47 |
-
class LlamaForTokenClassification(LlamaPreTrainedModel):
|
48 |
-
def __init__(self,config):
|
49 |
-
from torch import nn
|
50 |
-
super().__init__(config)
|
51 |
-
self.num_labels=config.num_labels
|
52 |
-
self.model=LlamaModel(config)
|
53 |
-
if hasattr(config,"classifier_dropout") and config.classifier_dropout is not None:
|
54 |
-
classifier_dropout=config.classifier_dropout
|
55 |
-
elif hasattr(config,"hidden_dropout") and config.hidden_dropout is not None:
|
56 |
-
classifier_dropout=config.hidden_dropout
|
57 |
-
else:
|
58 |
-
classifier_dropout=0.1
|
59 |
-
self.dropout=nn.Dropout(classifier_dropout)
|
60 |
-
self.classifier=nn.Linear(config.hidden_size,config.num_labels)
|
61 |
-
self.post_init()
|
62 |
-
def get_input_embeddings(self):
|
63 |
-
return self.model.embed_tokens
|
64 |
-
def set_input_embeddings(self,value):
|
65 |
-
self.model.embed_tokens=value
|
66 |
-
def forward(self,input_ids=None,past_key_values=None,attention_mask=None,position_ids=None,inputs_embeds=None,labels=None,use_cache=None,output_attentions=None,output_hidden_states=None,return_dict=None):
|
67 |
-
return_dict=return_dict if return_dict is not None else self.config.use_return_dict
|
68 |
-
transformer_outputs=self.model(input_ids,past_key_values=past_key_values,attention_mask=attention_mask,position_ids=position_ids,inputs_embeds=inputs_embeds,use_cache=use_cache,output_attentions=output_attentions,output_hidden_states=output_hidden_states,return_dict=return_dict)
|
69 |
-
hidden_states=transformer_outputs[0]
|
70 |
-
hidden_states=self.dropout(hidden_states)
|
71 |
-
logits=self.classifier(hidden_states)
|
72 |
-
loss=None
|
73 |
-
if labels is not None:
|
74 |
-
from torch import nn
|
75 |
-
loss_fct=nn.CrossEntropyLoss()
|
76 |
-
loss=loss_fct(logits.view(-1,self.num_labels),labels.view(-1))
|
77 |
-
if not return_dict:
|
78 |
-
output=(logits,)+transformer_outputs[2:]
|
79 |
-
return ((loss,)+output) if loss is not None else output
|
80 |
-
return TokenClassifierOutput(loss=loss,logits=logits,hidden_states=transformer_outputs.hidden_states,attentions=transformer_outputs.attentions)
|
|
|
1 |
+
from transformers import TokenClassificationPipeline
|
|
|
2 |
|
3 |
class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
|
4 |
def __init__(self,**kwargs):
|
|
|
39 |
t["text"]=model_outputs["sentence"][t["start"]:t["end"]]
|
40 |
return w
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|