zhangtao-whu committed
Commit 476ac07 · verified · 1 Parent(s): f6d075a

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. xtuner/__init__.py +25 -0
  2. xtuner/__pycache__/__init__.cpython-310.pyc +0 -0
  3. xtuner/__pycache__/entry_point.cpython-310.pyc +0 -0
  4. xtuner/__pycache__/registry.cpython-310.pyc +0 -0
  5. xtuner/__pycache__/version.cpython-310.pyc +0 -0
  6. xtuner/apis/__init__.py +4 -0
  7. xtuner/apis/datasets/__init__.py +37 -0
  8. xtuner/apis/datasets/alpaca.py +92 -0
  9. xtuner/apis/datasets/arxiv.py +38 -0
  10. xtuner/apis/datasets/code_alpaca.py +34 -0
  11. xtuner/apis/datasets/colorist.py +34 -0
  12. xtuner/apis/datasets/lawyer.py +97 -0
  13. xtuner/apis/datasets/medical.py +34 -0
  14. xtuner/apis/datasets/moss_003_sft.py +72 -0
  15. xtuner/apis/datasets/oasst1.py +34 -0
  16. xtuner/apis/datasets/open_orca.py +34 -0
  17. xtuner/apis/datasets/sql.py +34 -0
  18. xtuner/apis/datasets/tiny_codes.py +34 -0
  19. xtuner/apis/datasets/wizardlm.py +34 -0
  20. xtuner/apis/model.py +89 -0
  21. xtuner/apis/training_args.py +61 -0
  22. xtuner/configs/__init__.py +19 -0
  23. xtuner/configs/__pycache__/__init__.cpython-310.pyc +0 -0
  24. xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_e3.py +212 -0
  25. xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_enzh_e3.py +229 -0
  26. xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_enzh_oasst1_e3.py +244 -0
  27. xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_zh_e3.py +212 -0
  28. xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_arxiv_gentitle_e3.py +247 -0
  29. xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_code_alpaca_e3.py +216 -0
  30. xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_colorist_e5.py +212 -0
  31. xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_lawyer_e3.py +236 -0
  32. xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_oasst1_512_e3.py +212 -0
  33. xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_oasst1_e3.py +212 -0
  34. xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_open_platypus_e3.py +212 -0
  35. xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_sql_e3.py +216 -0
  36. xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_e3.py +212 -0
  37. xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_enzh_e3.py +229 -0
  38. xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_enzh_oasst1_e3.py +244 -0
  39. xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_zh_e3.py +212 -0
  40. xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_code_alpaca_e3.py +216 -0
  41. xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_lawyer_e3.py +236 -0
  42. xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_oasst1_512_e3.py +212 -0
  43. xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_oasst1_e3.py +212 -0
  44. xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_open_platypus_e3.py +212 -0
  45. xtuner/configs/baichuan/baichuan2_7b_base/baichuan2_7b_base_qlora_alpaca_e3.py +212 -0
  46. xtuner/configs/baichuan/baichuan2_7b_base/baichuan2_7b_base_qlora_alpaca_enzh_e3.py +229 -0
  47. xtuner/configs/baichuan/baichuan2_7b_base/baichuan2_7b_base_qlora_alpaca_enzh_oasst1_e3.py +244 -0
  48. xtuner/configs/baichuan/baichuan2_7b_base/baichuan2_7b_base_qlora_alpaca_zh_e3.py +212 -0
  49. xtuner/configs/baichuan/baichuan2_7b_base/baichuan2_7b_base_qlora_arxiv_gentitle_e3.py +247 -0
  50. xtuner/configs/baichuan/baichuan2_7b_base/baichuan2_7b_base_qlora_code_alpaca_e3.py +216 -0
xtuner/__init__.py ADDED
@@ -0,0 +1,25 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os

from mmengine.utils import digit_version

from .entry_point import cli
from .version import __version__, version_info

HF_CEPH_HUB = os.getenv('HF_CEPH_HUB', '')
HF_USE_CEPH = os.getenv('HF_USE_CEPH', 0) or HF_CEPH_HUB != ''
DS_CEPH_DIR = os.getenv('DS_CEPH_DIR', None)
if HF_USE_CEPH:
    from .utils.fileio import (patch_hf_auto_from_pretrained,
                               patch_hf_save_pretrained)
    patch_hf_auto_from_pretrained(HF_CEPH_HUB)
    patch_hf_save_pretrained()

if DS_CEPH_DIR:
    from .utils.fileio import patch_deepspeed_engine
    patch_deepspeed_engine()

__all__ = [
    '__version__', 'version_info', 'digit_version', 'cli', 'HF_USE_CEPH',
    'DS_CEPH_DIR'
]
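The Ceph patches above are applied at import time and are gated entirely by environment variables, so they only take effect if the variables are set before `xtuner` is first imported. A minimal sketch under that assumption (the bucket paths are placeholders, not values defined by this commit):

import os

# Hypothetical values; set them before importing xtuner so the
# patch_hf_* / patch_deepspeed_engine hooks above are applied.
os.environ['HF_CEPH_HUB'] = 's3://my-bucket/hf-cache'   # placeholder
os.environ['DS_CEPH_DIR'] = 's3://my-bucket/ds-ckpts'   # placeholder

import xtuner  # noqa: E402  (import after the env vars are in place)
print(xtuner.HF_USE_CEPH, xtuner.DS_CEPH_DIR)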
xtuner/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (673 Bytes).
 
xtuner/__pycache__/entry_point.cpython-310.pyc ADDED
Binary file (10.1 kB).
 
xtuner/__pycache__/registry.cpython-310.pyc ADDED
Binary file (283 Bytes).
 
xtuner/__pycache__/version.cpython-310.pyc ADDED
Binary file (803 Bytes).
 
xtuner/apis/__init__.py ADDED
@@ -0,0 +1,4 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .datasets import *  # noqa: F401, F403
from .model import *  # noqa: F401, F403
from .training_args import *  # noqa: F401, F403
xtuner/apis/datasets/__init__.py ADDED
@@ -0,0 +1,37 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .alpaca import (alpaca_data_collator, alpaca_dataset,
                     alpaca_enzh_data_collator, alpaca_enzh_dataset,
                     alpaca_zh_data_collator, alpaca_zh_dataset)
from .arxiv import arxiv_data_collator, arxiv_dataset
from .code_alpaca import code_alpaca_data_collator, code_alpaca_dataset
from .colorist import colorist_data_collator, colorist_dataset
from .lawyer import (lawyer_crime_data_collator, lawyer_crime_dataset,
                     lawyer_data_collator, lawyer_dataset,
                     lawyer_reference_data_collator, lawyer_reference_dataset)
from .medical import medical_data_collator, medical_dataset
from .moss_003_sft import (moss_003_sft_data_collator, moss_003_sft_dataset,
                           moss_003_sft_no_plugins_data_collator,
                           moss_003_sft_no_plugins_dataset,
                           moss_003_sft_plugins_data_collator,
                           moss_003_sft_plugins_dataset)
from .oasst1 import oasst1_data_collator, oasst1_dataset
from .open_orca import openorca_data_collator, openorca_dataset
from .sql import sql_data_collator, sql_dataset
from .tiny_codes import tiny_codes_data_collator, tiny_codes_dataset
from .wizardlm import wizardlm_data_collator, wizardlm_dataset

__all__ = [
    'alpaca_data_collator', 'alpaca_dataset', 'alpaca_enzh_data_collator',
    'alpaca_enzh_dataset', 'alpaca_zh_data_collator', 'alpaca_zh_dataset',
    'arxiv_data_collator', 'arxiv_dataset', 'medical_data_collator',
    'medical_dataset', 'moss_003_sft_data_collator', 'moss_003_sft_dataset',
    'moss_003_sft_no_plugins_data_collator', 'moss_003_sft_no_plugins_dataset',
    'moss_003_sft_plugins_data_collator', 'moss_003_sft_plugins_dataset',
    'oasst1_data_collator', 'oasst1_dataset', 'openorca_data_collator',
    'openorca_dataset', 'lawyer_crime_dataset', 'lawyer_crime_data_collator',
    'lawyer_reference_dataset', 'lawyer_reference_data_collator',
    'lawyer_dataset', 'lawyer_data_collator', 'colorist_dataset',
    'colorist_data_collator', 'sql_dataset', 'sql_data_collator',
    'code_alpaca_dataset', 'code_alpaca_data_collator', 'tiny_codes_dataset',
    'tiny_codes_data_collator', 'wizardlm_data_collator', 'wizardlm_dataset'
]
xtuner/apis/datasets/alpaca.py ADDED
@@ -0,0 +1,92 @@
# Copyright (c) OpenMMLab. All rights reserved.
from functools import partial

from datasets import load_dataset
from torch.utils.data import ConcatDataset

from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import (alpaca_map_fn, alpaca_zh_map_fn,
                                    template_map_fn_factory)
from xtuner.utils import PROMPT_TEMPLATE


def alpaca_enzh_dataset(tokenizer,
                        path_en='tatsu-lab/alpaca',
                        path_zh='silk-road/alpaca-data-gpt4-chinese',
                        max_length=2048,
                        prompt_template=PROMPT_TEMPLATE.default,
                        remove_unused_columns=True,
                        pack_to_max_length=True):
    alpaca = alpaca_dataset(
        tokenizer,
        path=path_en,
        max_length=max_length,
        prompt_template=prompt_template,
        shuffle_before_pack=True,
        remove_unused_columns=remove_unused_columns,
        pack_to_max_length=pack_to_max_length)
    alpaca_zh = alpaca_zh_dataset(
        tokenizer,
        path=path_zh,
        max_length=max_length,
        prompt_template=prompt_template,
        shuffle_before_pack=True,
        remove_unused_columns=remove_unused_columns,
        pack_to_max_length=pack_to_max_length)
    dataset = ConcatDataset([alpaca, alpaca_zh])
    return dataset


def alpaca_enzh_data_collator(return_hf_format=False):
    return partial(default_collate_fn, return_hf_format=return_hf_format)


def alpaca_zh_dataset(tokenizer,
                      path='silk-road/alpaca-data-gpt4-chinese',
                      max_length=2048,
                      prompt_template=PROMPT_TEMPLATE.default,
                      remove_unused_columns=True,
                      pack_to_max_length=True):
    template_map_fn = template_map_fn_factory(template=prompt_template)
    dataset_org = load_dataset(path)
    dataset = process_hf_dataset(
        dataset=dataset_org,
        tokenizer=tokenizer,
        max_length=max_length,
        dataset_map_fn=alpaca_zh_map_fn,
        template_map_fn=template_map_fn,
        remove_unused_columns=remove_unused_columns,
        shuffle_before_pack=True,
        pack_to_max_length=pack_to_max_length)

    return dataset


def alpaca_zh_data_collator(return_hf_format=False):
    return partial(default_collate_fn, return_hf_format=return_hf_format)


def alpaca_dataset(tokenizer,
                   path='tatsu-lab/alpaca',
                   max_length=2048,
                   prompt_template=PROMPT_TEMPLATE.default,
                   remove_unused_columns=True,
                   pack_to_max_length=True):
    template_map_fn = template_map_fn_factory(template=prompt_template)
    dataset_org = load_dataset(path)
    dataset = process_hf_dataset(
        dataset=dataset_org,
        tokenizer=tokenizer,
        max_length=max_length,
        dataset_map_fn=alpaca_map_fn,
        template_map_fn=template_map_fn,
        remove_unused_columns=remove_unused_columns,
        shuffle_before_pack=True,
        pack_to_max_length=pack_to_max_length)

    return dataset


def alpaca_data_collator(return_hf_format=False):
    return partial(default_collate_fn, return_hf_format=return_hf_format)
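Each module in xtuner/apis/datasets pairs a `*_dataset` factory with a matching `*_data_collator`. A minimal usage sketch under that assumption (the tokenizer checkpoint is an arbitrary example, not something fixed by this commit):

from transformers import AutoTokenizer

from xtuner.apis.datasets import alpaca_data_collator, alpaca_dataset

# Any Hugging Face tokenizer works here; internlm2 is just a placeholder.
tokenizer = AutoTokenizer.from_pretrained(
    'internlm/internlm2-chat-7b', trust_remote_code=True)

# Build the packed Alpaca SFT dataset and its collate function.
train_dataset = alpaca_dataset(tokenizer, max_length=2048)
collate_fn = alpaca_data_collator(return_hf_format=True)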
xtuner/apis/datasets/arxiv.py ADDED
@@ -0,0 +1,38 @@
# Copyright (c) OpenMMLab. All rights reserved.
from functools import partial

from datasets import load_dataset

from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import arxiv_map_fn, template_map_fn_factory
from xtuner.utils import PROMPT_TEMPLATE


def arxiv_dataset(tokenizer,
                  data_file=None,
                  max_length=2048,
                  prompt_template=PROMPT_TEMPLATE.default,
                  remove_unused_columns=True,
                  pack_to_max_length=True):
    template_map_fn = template_map_fn_factory(template=prompt_template)
    # 1. Download data from https://kaggle.com/datasets/Cornell-University/arxiv  # noqa: E501
    # 2. Process data with `./tools/data_preprocess/arxiv.py`
    if data_file is None:
        data_file = './data/arxiv_postprocess_csAIcsCLcsCV_20200101.json'
    dataset_org = load_dataset(path='json', data_files=dict(train=data_file))
    dataset = process_hf_dataset(
        dataset=dataset_org,
        tokenizer=tokenizer,
        max_length=max_length,
        dataset_map_fn=arxiv_map_fn,
        template_map_fn=template_map_fn,
        remove_unused_columns=remove_unused_columns,
        shuffle_before_pack=True,
        pack_to_max_length=pack_to_max_length)

    return dataset


def arxiv_data_collator(return_hf_format=False):
    return partial(default_collate_fn, return_hf_format=return_hf_format)
xtuner/apis/datasets/code_alpaca.py ADDED
@@ -0,0 +1,34 @@
# Copyright (c) OpenMMLab. All rights reserved.
from functools import partial

from datasets import load_dataset

from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import code_alpaca_map_fn, template_map_fn_factory
from xtuner.utils import PROMPT_TEMPLATE


def code_alpaca_dataset(tokenizer,
                        path='HuggingFaceH4/CodeAlpaca_20K',
                        max_length=2048,
                        prompt_template=PROMPT_TEMPLATE.default,
                        remove_unused_columns=True,
                        pack_to_max_length=True):
    template_map_fn = template_map_fn_factory(template=prompt_template)
    dataset_org = load_dataset(path)
    dataset = process_hf_dataset(
        dataset=dataset_org,
        tokenizer=tokenizer,
        max_length=max_length,
        dataset_map_fn=code_alpaca_map_fn,
        template_map_fn=template_map_fn,
        remove_unused_columns=remove_unused_columns,
        shuffle_before_pack=True,
        pack_to_max_length=pack_to_max_length)

    return dataset


def code_alpaca_data_collator(return_hf_format=False):
    return partial(default_collate_fn, return_hf_format=return_hf_format)
xtuner/apis/datasets/colorist.py ADDED
@@ -0,0 +1,34 @@
# Copyright (c) OpenMMLab. All rights reserved.
from functools import partial

from datasets import load_dataset

from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import colors_map_fn, template_map_fn_factory
from xtuner.utils import PROMPT_TEMPLATE


def colorist_dataset(tokenizer,
                     path='burkelibbey/colors',
                     max_length=2048,
                     prompt_template=PROMPT_TEMPLATE.default,
                     remove_unused_columns=True,
                     pack_to_max_length=True):
    template_map_fn = template_map_fn_factory(template=prompt_template)
    dataset_org = load_dataset(path)
    dataset = process_hf_dataset(
        dataset=dataset_org,
        tokenizer=tokenizer,
        max_length=max_length,
        dataset_map_fn=colors_map_fn,
        template_map_fn=template_map_fn,
        remove_unused_columns=remove_unused_columns,
        shuffle_before_pack=True,
        pack_to_max_length=pack_to_max_length)

    return dataset


def colorist_data_collator(return_hf_format=False):
    return partial(default_collate_fn, return_hf_format=return_hf_format)
xtuner/apis/datasets/lawyer.py ADDED
@@ -0,0 +1,97 @@
# Copyright (c) OpenMMLab. All rights reserved.
from functools import partial

from datasets import load_dataset
from torch.utils.data import ConcatDataset

from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import (crime_kg_assitant_map_fn,
                                    law_reference_map_fn,
                                    template_map_fn_factory)
from xtuner.utils import PROMPT_TEMPLATE


def lawyer_dataset(tokenizer,
                   crime_data_file=None,
                   reference_data_file=None,
                   max_length=2048,
                   prompt_template=PROMPT_TEMPLATE.default,
                   remove_unused_columns=True,
                   pack_to_max_length=True):
    crime_dataset = lawyer_crime_dataset(
        tokenizer,
        data_file=crime_data_file,
        max_length=max_length,
        prompt_template=prompt_template,
        remove_unused_columns=remove_unused_columns,
        pack_to_max_length=pack_to_max_length)
    reference_dataset = lawyer_reference_dataset(
        tokenizer,
        data_file=reference_data_file,
        max_length=max_length,
        prompt_template=prompt_template,
        remove_unused_columns=remove_unused_columns,
        pack_to_max_length=pack_to_max_length)
    dataset = ConcatDataset([crime_dataset, reference_dataset])
    return dataset


def lawyer_data_collator(return_hf_format=False):
    return partial(default_collate_fn, return_hf_format=return_hf_format)


def lawyer_crime_dataset(tokenizer,
                         data_file=None,
                         max_length=2048,
                         prompt_template=PROMPT_TEMPLATE.default,
                         remove_unused_columns=True,
                         pack_to_max_length=True):
    template_map_fn = template_map_fn_factory(template=prompt_template)
    # Download data from https://github.com/LiuHC0428/LAW-GPT  # noqa: E501
    if data_file is None:
        data_file = './data/law/CrimeKgAssitant清洗后_52k.json'
    dataset_org = load_dataset(path='json', data_files=dict(train=data_file))
    dataset = process_hf_dataset(
        dataset=dataset_org,
        tokenizer=tokenizer,
        max_length=max_length,
        dataset_map_fn=crime_kg_assitant_map_fn,
        template_map_fn=template_map_fn,
        remove_unused_columns=remove_unused_columns,
        shuffle_before_pack=True,
        pack_to_max_length=pack_to_max_length)

    return dataset


def lawyer_crime_data_collator(return_hf_format=False):
    return partial(default_collate_fn, return_hf_format=return_hf_format)


def lawyer_reference_dataset(tokenizer,
                             data_file=None,
                             max_length=2048,
                             prompt_template=PROMPT_TEMPLATE.default,
                             remove_unused_columns=True,
                             pack_to_max_length=True):
    template_map_fn = template_map_fn_factory(template=prompt_template)
    # Download data from https://github.com/LiuHC0428/LAW-GPT  # noqa: E501
    if data_file is None:
        data_file = './data/law/训练数据_带法律依据_92k.json'
    dataset_org = load_dataset(path='json', data_files=dict(train=data_file))
    dataset = process_hf_dataset(
        dataset=dataset_org,
        tokenizer=tokenizer,
        max_length=max_length,
        dataset_map_fn=law_reference_map_fn,
        template_map_fn=template_map_fn,
        remove_unused_columns=remove_unused_columns,
        shuffle_before_pack=True,
        pack_to_max_length=pack_to_max_length)

    return dataset


def lawyer_reference_data_collator(return_hf_format=False):
    return partial(default_collate_fn, return_hf_format=return_hf_format)
xtuner/apis/datasets/medical.py ADDED
@@ -0,0 +1,34 @@
# Copyright (c) OpenMMLab. All rights reserved.
from functools import partial

from datasets import load_dataset

from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import medical_map_fn, template_map_fn_factory
from xtuner.utils import PROMPT_TEMPLATE


def medical_dataset(tokenizer,
                    path='shibing624/medical',
                    max_length=2048,
                    prompt_template=PROMPT_TEMPLATE.default,
                    remove_unused_columns=False,
                    pack_to_max_length=True):
    template_map_fn = template_map_fn_factory(template=prompt_template)
    dataset_org = load_dataset(path)
    dataset = process_hf_dataset(
        dataset=dataset_org,
        tokenizer=tokenizer,
        max_length=max_length,
        dataset_map_fn=medical_map_fn,
        template_map_fn=template_map_fn,
        remove_unused_columns=remove_unused_columns,
        shuffle_before_pack=True,
        pack_to_max_length=pack_to_max_length)

    return dataset


def medical_data_collator(return_hf_format=False):
    return partial(default_collate_fn, return_hf_format=return_hf_format)
xtuner/apis/datasets/moss_003_sft.py ADDED
@@ -0,0 +1,72 @@
# Copyright (c) OpenMMLab. All rights reserved.
from functools import partial

from torch.utils.data import ConcatDataset

from xtuner.dataset import MOSSSFTDataset
from xtuner.dataset.collate_fns import default_collate_fn


def moss_003_sft_dataset(tokenizer,
                         plugins_data_file=None,
                         no_plugins_data_file=None,
                         bot_name=None,
                         max_length=2048):
    plugins = moss_003_sft_plugins_dataset(
        tokenizer,
        data_file=plugins_data_file,
        bot_name=bot_name,
        max_length=max_length)
    no_plugins = moss_003_sft_no_plugins_dataset(
        tokenizer,
        data_file=no_plugins_data_file,
        bot_name=bot_name,
        max_length=max_length)
    dataset = ConcatDataset([plugins, no_plugins])
    return dataset


def moss_003_sft_data_collator(return_hf_format=False):
    return partial(default_collate_fn, return_hf_format=return_hf_format)


def moss_003_sft_no_plugins_dataset(tokenizer,
                                    data_file=None,
                                    bot_name=None,
                                    max_length=2048):

    # Download data from https://huggingface.co/datasets/fnlp/moss-003-sft-data
    if data_file is None:
        data_file = './data/moss-003-sft-no-tools.jsonl'
    dataset = MOSSSFTDataset(
        data_file=data_file,
        bot_name=bot_name,
        tokenizer=tokenizer,
        max_length=max_length)

    return dataset


def moss_003_sft_no_plugins_data_collator(return_hf_format=False):
    return partial(default_collate_fn, return_hf_format=return_hf_format)


def moss_003_sft_plugins_dataset(tokenizer,
                                 data_file=None,
                                 bot_name=None,
                                 max_length=2048):

    # Download data from https://huggingface.co/datasets/fnlp/moss-003-sft-data
    if data_file is None:
        data_file = './data/conversations_with_tools_with_inner_instruction_no_text2image_train_all_random_meta0.5_0.1_0.01_moss_0709.jsonl'  # noqa: E501
    dataset = MOSSSFTDataset(
        data_file=data_file,
        bot_name=bot_name,
        tokenizer=tokenizer,
        max_length=max_length)

    return dataset


def moss_003_sft_plugins_data_collator(return_hf_format=False):
    return partial(default_collate_fn, return_hf_format=return_hf_format)
xtuner/apis/datasets/oasst1.py ADDED
@@ -0,0 +1,34 @@
# Copyright (c) OpenMMLab. All rights reserved.
from functools import partial

from datasets import load_dataset

from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import oasst1_map_fn, template_map_fn_factory
from xtuner.utils import PROMPT_TEMPLATE


def oasst1_dataset(tokenizer,
                   path='timdettmers/openassistant-guanaco',
                   max_length=2048,
                   prompt_template=PROMPT_TEMPLATE.default,
                   remove_unused_columns=False,
                   pack_to_max_length=True):
    template_map_fn = template_map_fn_factory(template=prompt_template)
    dataset_org = load_dataset(path)
    dataset = process_hf_dataset(
        dataset=dataset_org,
        tokenizer=tokenizer,
        max_length=max_length,
        dataset_map_fn=oasst1_map_fn,
        template_map_fn=template_map_fn,
        remove_unused_columns=remove_unused_columns,
        shuffle_before_pack=True,
        pack_to_max_length=pack_to_max_length)

    return dataset


def oasst1_data_collator(return_hf_format=False):
    return partial(default_collate_fn, return_hf_format=return_hf_format)
xtuner/apis/datasets/open_orca.py ADDED
@@ -0,0 +1,34 @@
# Copyright (c) OpenMMLab. All rights reserved.
from functools import partial

from datasets import load_dataset

from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import openorca_map_fn, template_map_fn_factory
from xtuner.utils import PROMPT_TEMPLATE


def openorca_dataset(tokenizer,
                     path='Open-Orca/OpenOrca',
                     max_length=2048,
                     prompt_template=PROMPT_TEMPLATE.default,
                     remove_unused_columns=True,
                     pack_to_max_length=True):
    template_map_fn = template_map_fn_factory(template=prompt_template)
    dataset_org = load_dataset(path)
    dataset = process_hf_dataset(
        dataset=dataset_org,
        tokenizer=tokenizer,
        max_length=max_length,
        dataset_map_fn=openorca_map_fn,
        template_map_fn=template_map_fn,
        remove_unused_columns=remove_unused_columns,
        shuffle_before_pack=True,
        pack_to_max_length=pack_to_max_length)

    return dataset


def openorca_data_collator(return_hf_format=False):
    return partial(default_collate_fn, return_hf_format=return_hf_format)
xtuner/apis/datasets/sql.py ADDED
@@ -0,0 +1,34 @@
# Copyright (c) OpenMMLab. All rights reserved.
from functools import partial

from datasets import load_dataset

from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import sql_map_fn, template_map_fn_factory
from xtuner.utils import PROMPT_TEMPLATE


def sql_dataset(tokenizer,
                path='b-mc2/sql-create-context',
                max_length=2048,
                prompt_template=PROMPT_TEMPLATE.default,
                remove_unused_columns=True,
                pack_to_max_length=True):
    template_map_fn = template_map_fn_factory(template=prompt_template)
    dataset_org = load_dataset(path)
    dataset = process_hf_dataset(
        dataset=dataset_org,
        tokenizer=tokenizer,
        max_length=max_length,
        dataset_map_fn=sql_map_fn,
        template_map_fn=template_map_fn,
        remove_unused_columns=remove_unused_columns,
        shuffle_before_pack=True,
        pack_to_max_length=pack_to_max_length)

    return dataset


def sql_data_collator(return_hf_format=False):
    return partial(default_collate_fn, return_hf_format=return_hf_format)
xtuner/apis/datasets/tiny_codes.py ADDED
@@ -0,0 +1,34 @@
# Copyright (c) OpenMMLab. All rights reserved.
from functools import partial

from datasets import load_dataset

from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import template_map_fn_factory, tiny_codes_map_fn
from xtuner.utils import PROMPT_TEMPLATE


def tiny_codes_dataset(tokenizer,
                       path='nampdn-ai/tiny-codes',
                       max_length=2048,
                       prompt_template=PROMPT_TEMPLATE.default,
                       remove_unused_columns=True,
                       pack_to_max_length=True):
    template_map_fn = template_map_fn_factory(template=prompt_template)
    dataset_org = load_dataset(path)
    dataset = process_hf_dataset(
        dataset=dataset_org,
        tokenizer=tokenizer,
        max_length=max_length,
        dataset_map_fn=tiny_codes_map_fn,
        template_map_fn=template_map_fn,
        remove_unused_columns=remove_unused_columns,
        shuffle_before_pack=True,
        pack_to_max_length=pack_to_max_length)

    return dataset


def tiny_codes_data_collator(return_hf_format=False):
    return partial(default_collate_fn, return_hf_format=return_hf_format)
xtuner/apis/datasets/wizardlm.py ADDED
@@ -0,0 +1,34 @@
# Copyright (c) OpenMMLab. All rights reserved.
from functools import partial

from datasets import load_dataset

from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import template_map_fn_factory, wizardlm_map_fn
from xtuner.utils import PROMPT_TEMPLATE


def wizardlm_dataset(tokenizer,
                     path='WizardLM/WizardLM_evol_instruct_V2_196k',
                     max_length=2048,
                     prompt_template=PROMPT_TEMPLATE.default,
                     remove_unused_columns=False,
                     pack_to_max_length=True):
    template_map_fn = template_map_fn_factory(template=prompt_template)
    dataset_org = load_dataset(path)
    dataset = process_hf_dataset(
        dataset=dataset_org,
        tokenizer=tokenizer,
        max_length=max_length,
        dataset_map_fn=wizardlm_map_fn,
        template_map_fn=template_map_fn,
        remove_unused_columns=remove_unused_columns,
        shuffle_before_pack=True,
        pack_to_max_length=pack_to_max_length)

    return dataset


def wizardlm_data_collator(return_hf_format=False):
    return partial(default_collate_fn, return_hf_format=return_hf_format)
xtuner/apis/model.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from peft import LoraConfig
4
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
5
+ BitsAndBytesConfig)
6
+
7
+ from xtuner.model import SupervisedFinetune
8
+
9
+ __all__ = ['build_model', 'build_lora_model', 'build_qlora_model']
10
+
11
+
12
+ def build_qlora_model(model_name_or_path,
13
+ quantization_config=None,
14
+ lora_config=None,
15
+ return_tokenizer=True):
16
+
17
+ if quantization_config is None:
18
+ quantization_config = BitsAndBytesConfig(
19
+ load_in_4bit=True,
20
+ load_in_8bit=False,
21
+ llm_int8_threshold=6.0,
22
+ llm_int8_has_fp16_weight=False,
23
+ bnb_4bit_compute_dtype=torch.float16,
24
+ bnb_4bit_use_double_quant=True,
25
+ bnb_4bit_quant_type='nf4')
26
+ if lora_config is None:
27
+ lora_config = LoraConfig(
28
+ r=64,
29
+ lora_alpha=16,
30
+ lora_dropout=0.1,
31
+ bias='none',
32
+ task_type='CAUSAL_LM')
33
+
34
+ llm = AutoModelForCausalLM.from_pretrained(
35
+ model_name_or_path,
36
+ torch_dtype=torch.float16,
37
+ trust_remote_code=True,
38
+ quantization_config=quantization_config)
39
+
40
+ model = SupervisedFinetune(llm, lora=lora_config)
41
+
42
+ if return_tokenizer:
43
+ tokenizer = AutoTokenizer.from_pretrained(
44
+ model_name_or_path,
45
+ trust_remote_code=True,
46
+ encode_special_tokens=True)
47
+ return model.llm, tokenizer
48
+ else:
49
+ return model.llm
50
+
51
+
52
+ def build_lora_model(model_name_or_path,
53
+ lora_config=None,
54
+ return_tokenizer=True):
55
+ if lora_config is None:
56
+ lora_config = LoraConfig(
57
+ r=64,
58
+ lora_alpha=16,
59
+ lora_dropout=0.1,
60
+ bias='none',
61
+ task_type='CAUSAL_LM')
62
+
63
+ llm = AutoModelForCausalLM.from_pretrained(
64
+ model_name_or_path, torch_dtype=torch.float16, trust_remote_code=True)
65
+
66
+ model = SupervisedFinetune(llm, lora=lora_config)
67
+
68
+ if return_tokenizer:
69
+ tokenizer = AutoTokenizer.from_pretrained(
70
+ model_name_or_path,
71
+ trust_remote_code=True,
72
+ encode_special_tokens=True)
73
+ return model.llm, tokenizer
74
+ else:
75
+ return model.llm
76
+
77
+
78
+ def build_model(model_name_or_path, return_tokenizer=True):
79
+ model = AutoModelForCausalLM.from_pretrained(
80
+ model_name_or_path, torch_dtype=torch.float16, trust_remote_code=True)
81
+
82
+ if return_tokenizer:
83
+ tokenizer = AutoTokenizer.from_pretrained(
84
+ model_name_or_path,
85
+ trust_remote_code=True,
86
+ encode_special_tokens=True)
87
+ return model, tokenizer
88
+ else:
89
+ return model
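A rough usage sketch of the builders above (the Baichuan2 checkpoint is only an example; any causal LM on the Hub should work under the same assumptions, and a CUDA device with bitsandbytes installed is assumed for the 4-bit path):

from xtuner.apis import build_qlora_model

# With return_tokenizer=True the helper returns the PEFT-wrapped LLM
# (model.llm from SupervisedFinetune) plus its tokenizer.
model, tokenizer = build_qlora_model(
    'baichuan-inc/Baichuan2-13B-Base', return_tokenizer=True)
print(type(model).__name__, tokenizer.__class__.__name__)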
xtuner/apis/training_args.py ADDED
@@ -0,0 +1,61 @@
# Copyright (c) OpenMMLab. All rights reserved.
from dataclasses import dataclass, field
from typing import Union

from transformers import TrainingArguments
from transformers.trainer_utils import IntervalStrategy, SchedulerType

__all__ = ['DefaultTrainingArguments']


@dataclass
class DefaultTrainingArguments(TrainingArguments):
    # custom
    model_name_or_path: str = field(
        default=None,
        metadata={'help': 'model name or path.'},
    )
    dataset_name_or_path: str = field(
        default=None,
        metadata={'help': 'dataset name or path.'},
    )

    # huggingface
    default_output_dir = './work_dirs'
    default_do_train = True
    default_per_device_train_batch_size = 1
    default_learning_rate = 2e-5
    default_save_strategy = 'epoch'
    default_lr_scheduler_type = 'cosine'
    default_logging_steps = 5

    output_dir: str = field(
        default=default_output_dir,
        metadata={
            'help': ('The output directory where the model predictions and '
                     'checkpoints will be written.')
        })
    do_train: bool = field(
        default=default_do_train,
        metadata={'help': 'Whether to run training.'})
    per_device_train_batch_size: int = field(
        default=default_per_device_train_batch_size,
        metadata={'help': 'Batch size per GPU/TPU core/CPU for training.'})
    learning_rate: float = field(
        default=default_learning_rate,
        metadata={'help': 'The initial learning rate for AdamW.'})
    save_strategy: Union[IntervalStrategy, str] = field(
        default=default_save_strategy,
        metadata={'help': 'The checkpoint save strategy to use.'},
    )
    lr_scheduler_type: Union[SchedulerType, str] = field(
        default=default_lr_scheduler_type,
        metadata={'help': 'The scheduler type to use.'},
    )
    logging_steps: float = field(
        default=default_logging_steps,
        metadata={
            'help': ('Log every X updates steps. Should be an integer or a '
                     'float in range `[0,1)`. If smaller than 1, will be '
                     'interpreted as ratio of total training steps.')
        })
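These arguments are meant to slot into the standard Hugging Face `Trainer` alongside the model and dataset helpers above. A hedged end-to-end sketch (the model and dataset choices are placeholders, not defaults fixed by this commit):

from transformers import Trainer

from xtuner.apis import (DefaultTrainingArguments, alpaca_data_collator,
                         alpaca_dataset, build_qlora_model)

training_args = DefaultTrainingArguments(
    model_name_or_path='internlm/internlm2-chat-7b',  # placeholder choice
    dataset_name_or_path='tatsu-lab/alpaca',
    output_dir='./work_dirs/alpaca_qlora')

model, tokenizer = build_qlora_model(training_args.model_name_or_path)
train_dataset = alpaca_dataset(
    tokenizer, path=training_args.dataset_name_or_path)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=alpaca_data_collator(return_hf_format=True))
trainer.train()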
xtuner/configs/__init__.py ADDED
@@ -0,0 +1,19 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os


def get_cfgs_name_path():
    path = os.path.dirname(__file__)
    mapping = {}
    for root, dirs, files in os.walk(path):
        for file_ in files:
            if file_.endswith(
                ('.py', '.json')
            ) and not file_.startswith('.') and not file_.startswith('_'):
                mapping[os.path.splitext(file_)[0]] = os.path.join(root, file_)
    return mapping


cfgs_name_path = get_cfgs_name_path()

__all__ = ['cfgs_name_path']
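`cfgs_name_path` maps every bundled config's basename to its on-disk path, presumably so a config can be referred to by name alone. A small sketch under that assumption, using one of the configs added in this commit:

from xtuner.configs import cfgs_name_path

# Resolve the absolute path of a built-in config by its basename.
cfg_file = cfgs_name_path['baichuan2_13b_base_qlora_alpaca_e3']
print(cfg_file)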
xtuner/configs/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (598 Bytes).
 
xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_e3.py ADDED
@@ -0,0 +1,212 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from datasets import load_dataset
from mmengine.dataset import DefaultSampler
from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
                            LoggerHook, ParamSchedulerHook)
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
from peft import LoraConfig
from torch.optim import AdamW
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig)

from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory
from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
                                 VarlenAttnArgsToMessageHubHook)
from xtuner.engine.runner import TrainLoop
from xtuner.model import SupervisedFinetune
from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE

#######################################################################
#                          PART 1  Settings                           #
#######################################################################
# Model
pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base'
use_varlen_attn = False

# Data
alpaca_en_path = 'tatsu-lab/alpaca'
prompt_template = PROMPT_TEMPLATE.default
max_length = 2048
pack_to_max_length = True

# Scheduler & Optimizer
batch_size = 1  # per_device
accumulative_counts = 16
dataloader_num_workers = 0
max_epochs = 3
optim_type = AdamW
lr = 2e-4
betas = (0.9, 0.999)
weight_decay = 0
max_norm = 1  # grad clip
warmup_ratio = 0.03

# Save
save_steps = 500
save_total_limit = 2  # Maximum checkpoints to keep (-1 means unlimited)

# Evaluate the generation performance during the training
evaluation_freq = 500
SYSTEM = SYSTEM_TEMPLATE.alpaca
evaluation_inputs = [
    '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
]

#######################################################################
#                      PART 2  Model & Tokenizer                      #
#######################################################################
tokenizer = dict(
    type=AutoTokenizer.from_pretrained,
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    trust_remote_code=True,
    padding_side='right')

model = dict(
    type=SupervisedFinetune,
    use_varlen_attn=use_varlen_attn,
    llm=dict(
        type=AutoModelForCausalLM.from_pretrained,
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        quantization_config=dict(
            type=BitsAndBytesConfig,
            load_in_4bit=True,
            load_in_8bit=False,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4')),
    lora=dict(
        type=LoraConfig,
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias='none',
        task_type='CAUSAL_LM'))

#######################################################################
#                     PART 3  Dataset & Dataloader                    #
#######################################################################
alpaca_en = dict(
    type=process_hf_dataset,
    dataset=dict(type=load_dataset, path=alpaca_en_path),
    tokenizer=tokenizer,
    max_length=max_length,
    dataset_map_fn=alpaca_map_fn,
    template_map_fn=dict(
        type=template_map_fn_factory, template=prompt_template),
    remove_unused_columns=True,
    shuffle_before_pack=True,
    pack_to_max_length=pack_to_max_length,
    use_varlen_attn=use_varlen_attn)

train_dataloader = dict(
    batch_size=batch_size,
    num_workers=dataloader_num_workers,
    dataset=alpaca_en,
    sampler=dict(type=DefaultSampler, shuffle=True),
    collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))

#######################################################################
#                    PART 4  Scheduler & Optimizer                    #
#######################################################################
# optimizer
optim_wrapper = dict(
    type=AmpOptimWrapper,
    optimizer=dict(
        type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
    clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
    accumulative_counts=accumulative_counts,
    loss_scale='dynamic',
    dtype='float16')

# learning policy
# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md  # noqa: E501
param_scheduler = [
    dict(
        type=LinearLR,
        start_factor=1e-5,
        by_epoch=True,
        begin=0,
        end=warmup_ratio * max_epochs,
        convert_to_iter_based=True),
    dict(
        type=CosineAnnealingLR,
        eta_min=0.0,
        by_epoch=True,
        begin=warmup_ratio * max_epochs,
        end=max_epochs,
        convert_to_iter_based=True)
]

# train, val, test setting
train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)

#######################################################################
#                           PART 5  Runtime                           #
#######################################################################
# Log the dialogue periodically during the training process, optional
custom_hooks = [
    dict(type=DatasetInfoHook, tokenizer=tokenizer),
    dict(
        type=EvaluateChatHook,
        tokenizer=tokenizer,
        every_n_iters=evaluation_freq,
        evaluation_inputs=evaluation_inputs,
        system=SYSTEM,
        prompt_template=prompt_template)
]

if use_varlen_attn:
    custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]

# configure default hooks
default_hooks = dict(
    # record the time of every iteration.
    timer=dict(type=IterTimerHook),
    # print log every 10 iterations.
    logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
    # enable the parameter scheduler.
    param_scheduler=dict(type=ParamSchedulerHook),
    # save checkpoint per `save_steps`.
    checkpoint=dict(
        type=CheckpointHook,
        by_epoch=False,
        interval=save_steps,
        max_keep_ckpts=save_total_limit),
    # set sampler seed in distributed environment.
    sampler_seed=dict(type=DistSamplerSeedHook),
)

# configure environment
env_cfg = dict(
    # whether to enable cudnn benchmark
    cudnn_benchmark=False,
    # set multi process parameters
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    # set distributed parameters
    dist_cfg=dict(backend='nccl'),
)

# set visualizer
visualizer = None

# set log level
log_level = 'INFO'

# load from which checkpoint
load_from = None

# whether to resume training from the loaded checkpoint
resume = False

# Defaults to use random seed and disable `deterministic`
randomness = dict(seed=None, deterministic=False)

# set log processor
log_processor = dict(by_epoch=False)
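These configs are plain mmengine-style Python configs, so they can be loaded and inspected directly; the usual entry point for launching them is the `xtuner` CLI (e.g. `xtuner train baichuan2_13b_base_qlora_alpaca_e3`), assuming an installed xtuner matching this snapshot. A sketch that only inspects a config, relying on the `cfgs_name_path` lookup shown earlier:

from mmengine.config import Config

from xtuner.configs import cfgs_name_path

# Load the config file and read a couple of its plain settings.
cfg = Config.fromfile(cfgs_name_path['baichuan2_13b_base_qlora_alpaca_e3'])
print(cfg.pretrained_model_name_or_path, cfg.max_length)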
xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_enzh_e3.py ADDED
@@ -0,0 +1,229 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from datasets import load_dataset
from mmengine.dataset import DefaultSampler
from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
                            LoggerHook, ParamSchedulerHook)
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
from peft import LoraConfig
from torch.optim import AdamW
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig)

from xtuner.dataset import ConcatDataset, process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import (alpaca_map_fn, alpaca_zh_map_fn,
                                    template_map_fn_factory)
from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
                                 VarlenAttnArgsToMessageHubHook)
from xtuner.engine.runner import TrainLoop
from xtuner.model import SupervisedFinetune
from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE

#######################################################################
#                          PART 1  Settings                           #
#######################################################################
# Model
pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base'
use_varlen_attn = False

# Data
alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese'
alpaca_en_path = 'tatsu-lab/alpaca'
prompt_template = PROMPT_TEMPLATE.default
max_length = 2048
pack_to_max_length = True

# Scheduler & Optimizer
batch_size = 1  # per_device
accumulative_counts = 16
dataloader_num_workers = 0
max_epochs = 3
optim_type = AdamW
lr = 2e-4
betas = (0.9, 0.999)
weight_decay = 0
max_norm = 1  # grad clip
warmup_ratio = 0.03

# Save
save_steps = 500
save_total_limit = 2  # Maximum checkpoints to keep (-1 means unlimited)

# Evaluate the generation performance during the training
evaluation_freq = 500
SYSTEM = SYSTEM_TEMPLATE.alpaca
evaluation_inputs = [
    '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
]

#######################################################################
#                      PART 2  Model & Tokenizer                      #
#######################################################################
tokenizer = dict(
    type=AutoTokenizer.from_pretrained,
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    trust_remote_code=True,
    padding_side='right')

model = dict(
    type=SupervisedFinetune,
    use_varlen_attn=use_varlen_attn,
    llm=dict(
        type=AutoModelForCausalLM.from_pretrained,
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        quantization_config=dict(
            type=BitsAndBytesConfig,
            load_in_4bit=True,
            load_in_8bit=False,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4')),
    lora=dict(
        type=LoraConfig,
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias='none',
        task_type='CAUSAL_LM'))

#######################################################################
#                     PART 3  Dataset & Dataloader                    #
#######################################################################
alpaca_en = dict(
    type=process_hf_dataset,
    dataset=dict(type=load_dataset, path=alpaca_en_path),
    tokenizer=tokenizer,
    max_length=max_length,
    dataset_map_fn=alpaca_map_fn,
    template_map_fn=dict(
        type=template_map_fn_factory, template=prompt_template),
    remove_unused_columns=True,
    shuffle_before_pack=True,
    pack_to_max_length=pack_to_max_length,
    use_varlen_attn=use_varlen_attn)

alpaca_zh = dict(
    type=process_hf_dataset,
    dataset=dict(type=load_dataset, path=alpaca_zh_path),
    tokenizer=tokenizer,
    max_length=max_length,
    dataset_map_fn=alpaca_zh_map_fn,
    template_map_fn=dict(
        type=template_map_fn_factory, template=prompt_template),
    remove_unused_columns=True,
    shuffle_before_pack=True,
    pack_to_max_length=pack_to_max_length,
    use_varlen_attn=use_varlen_attn)

train_dataset = dict(type=ConcatDataset, datasets=[alpaca_en, alpaca_zh])

train_dataloader = dict(
    batch_size=batch_size,
    num_workers=dataloader_num_workers,
    dataset=train_dataset,
    sampler=dict(type=DefaultSampler, shuffle=True),
    collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))

#######################################################################
#                    PART 4  Scheduler & Optimizer                    #
#######################################################################
# optimizer
optim_wrapper = dict(
    type=AmpOptimWrapper,
    optimizer=dict(
        type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
    clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
    accumulative_counts=accumulative_counts,
    loss_scale='dynamic',
    dtype='float16')

# learning policy
# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md  # noqa: E501
param_scheduler = [
    dict(
        type=LinearLR,
        start_factor=1e-5,
        by_epoch=True,
        begin=0,
        end=warmup_ratio * max_epochs,
        convert_to_iter_based=True),
    dict(
        type=CosineAnnealingLR,
        eta_min=0.0,
        by_epoch=True,
        begin=warmup_ratio * max_epochs,
        end=max_epochs,
        convert_to_iter_based=True)
]

# train, val, test setting
train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)

#######################################################################
#                           PART 5  Runtime                           #
#######################################################################
# Log the dialogue periodically during the training process, optional
custom_hooks = [
    dict(type=DatasetInfoHook, tokenizer=tokenizer),
    dict(
        type=EvaluateChatHook,
        tokenizer=tokenizer,
        every_n_iters=evaluation_freq,
        evaluation_inputs=evaluation_inputs,
        system=SYSTEM,
        prompt_template=prompt_template)
]

if use_varlen_attn:
    custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]

# configure default hooks
default_hooks = dict(
    # record the time of every iteration.
    timer=dict(type=IterTimerHook),
    # print log every 10 iterations.
    logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
    # enable the parameter scheduler.
    param_scheduler=dict(type=ParamSchedulerHook),
    # save checkpoint per `save_steps`.
    checkpoint=dict(
        type=CheckpointHook,
        by_epoch=False,
        interval=save_steps,
        max_keep_ckpts=save_total_limit),
    # set sampler seed in distributed environment.
    sampler_seed=dict(type=DistSamplerSeedHook),
)

# configure environment
env_cfg = dict(
    # whether to enable cudnn benchmark
    cudnn_benchmark=False,
    # set multi process parameters
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    # set distributed parameters
    dist_cfg=dict(backend='nccl'),
)

# set visualizer
visualizer = None

# set log level
log_level = 'INFO'

# load from which checkpoint
load_from = None

# whether to resume training from the loaded checkpoint
resume = False

# Defaults to use random seed and disable `deterministic`
randomness = dict(seed=None, deterministic=False)

# set log processor
log_processor = dict(by_epoch=False)
xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_enzh_oasst1_e3.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import ConcatDataset, process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import (alpaca_map_fn, alpaca_zh_map_fn,
16
+ oasst1_map_fn, template_map_fn_factory)
17
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
18
+ VarlenAttnArgsToMessageHubHook)
19
+ from xtuner.engine.runner import TrainLoop
20
+ from xtuner.model import SupervisedFinetune
21
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
22
+
23
+ #######################################################################
24
+ # PART 1 Settings #
25
+ #######################################################################
26
+ # Model
27
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base'
28
+ use_varlen_attn = False
29
+
30
+ # Data
31
+ alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese'
32
+ alpaca_en_path = 'tatsu-lab/alpaca'
33
+ oasst1_path = 'timdettmers/openassistant-guanaco'
34
+ prompt_template = PROMPT_TEMPLATE.default
35
+ max_length = 2048
36
+ pack_to_max_length = True
37
+
38
+ # Scheduler & Optimizer
39
+ batch_size = 1 # per_device
40
+ accumulative_counts = 16
41
+ dataloader_num_workers = 0
42
+ max_epochs = 3
43
+ optim_type = AdamW
44
+ lr = 2e-4
45
+ betas = (0.9, 0.999)
46
+ weight_decay = 0
47
+ max_norm = 1 # grad clip
48
+ warmup_ratio = 0.03
49
+
50
+ # Save
51
+ save_steps = 500
52
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
53
+
54
+ # Evaluate the generation performance during the training
55
+ evaluation_freq = 500
56
+ SYSTEM = SYSTEM_TEMPLATE.alpaca
57
+ evaluation_inputs = [
58
+ '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
59
+ ]
60
+
61
+ #######################################################################
62
+ # PART 2 Model & Tokenizer #
63
+ #######################################################################
64
+ tokenizer = dict(
65
+ type=AutoTokenizer.from_pretrained,
66
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
67
+ trust_remote_code=True,
68
+ padding_side='right')
69
+
70
+ model = dict(
71
+ type=SupervisedFinetune,
72
+ use_varlen_attn=use_varlen_attn,
73
+ llm=dict(
74
+ type=AutoModelForCausalLM.from_pretrained,
75
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
76
+ trust_remote_code=True,
77
+ torch_dtype=torch.float16,
78
+ quantization_config=dict(
79
+ type=BitsAndBytesConfig,
80
+ load_in_4bit=True,
81
+ load_in_8bit=False,
82
+ llm_int8_threshold=6.0,
83
+ llm_int8_has_fp16_weight=False,
84
+ bnb_4bit_compute_dtype=torch.float16,
85
+ bnb_4bit_use_double_quant=True,
86
+ bnb_4bit_quant_type='nf4')),
87
+ lora=dict(
88
+ type=LoraConfig,
89
+ r=64,
90
+ lora_alpha=16,
91
+ lora_dropout=0.1,
92
+ bias='none',
93
+ task_type='CAUSAL_LM'))
94
+
95
+ #######################################################################
96
+ # PART 3 Dataset & Dataloader #
97
+ #######################################################################
98
+ alpaca_en = dict(
99
+ type=process_hf_dataset,
100
+ dataset=dict(type=load_dataset, path=alpaca_en_path),
101
+ tokenizer=tokenizer,
102
+ max_length=max_length,
103
+ dataset_map_fn=alpaca_map_fn,
104
+ template_map_fn=dict(
105
+ type=template_map_fn_factory, template=prompt_template),
106
+ remove_unused_columns=True,
107
+ shuffle_before_pack=True,
108
+ pack_to_max_length=pack_to_max_length,
109
+ use_varlen_attn=use_varlen_attn)
110
+
111
+ alpaca_zh = dict(
112
+ type=process_hf_dataset,
113
+ dataset=dict(type=load_dataset, path=alpaca_zh_path),
114
+ tokenizer=tokenizer,
115
+ max_length=max_length,
116
+ dataset_map_fn=alpaca_zh_map_fn,
117
+ template_map_fn=dict(
118
+ type=template_map_fn_factory, template=prompt_template),
119
+ remove_unused_columns=True,
120
+ shuffle_before_pack=True,
121
+ pack_to_max_length=pack_to_max_length,
122
+ use_varlen_attn=use_varlen_attn)
123
+
124
+ oasst1 = dict(
125
+ type=process_hf_dataset,
126
+ dataset=dict(type=load_dataset, path=oasst1_path),
127
+ tokenizer=tokenizer,
128
+ max_length=max_length,
129
+ dataset_map_fn=oasst1_map_fn,
130
+ template_map_fn=dict(
131
+ type=template_map_fn_factory, template=prompt_template),
132
+ remove_unused_columns=True,
133
+ shuffle_before_pack=True,
134
+ pack_to_max_length=pack_to_max_length,
135
+ use_varlen_attn=use_varlen_attn)
136
+
137
+ train_dataset = dict(
138
+ type=ConcatDataset, datasets=[alpaca_en, alpaca_zh, oasst1])
139
+
140
+ train_dataloader = dict(
141
+ batch_size=batch_size,
142
+ num_workers=dataloader_num_workers,
143
+ dataset=train_dataset,
144
+ sampler=dict(type=DefaultSampler, shuffle=True),
145
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
146
+
147
+ #######################################################################
148
+ # PART 4 Scheduler & Optimizer #
149
+ #######################################################################
150
+ # optimizer
151
+ optim_wrapper = dict(
152
+ type=AmpOptimWrapper,
153
+ optimizer=dict(
154
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
155
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
156
+ accumulative_counts=accumulative_counts,
157
+ loss_scale='dynamic',
158
+ dtype='float16')
159
+
160
+ # learning policy
161
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
162
+ param_scheduler = [
163
+ dict(
164
+ type=LinearLR,
165
+ start_factor=1e-5,
166
+ by_epoch=True,
167
+ begin=0,
168
+ end=warmup_ratio * max_epochs,
169
+ convert_to_iter_based=True),
170
+ dict(
171
+ type=CosineAnnealingLR,
172
+ eta_min=0.0,
173
+ by_epoch=True,
174
+ begin=warmup_ratio * max_epochs,
175
+ end=max_epochs,
176
+ convert_to_iter_based=True)
177
+ ]
178
+
179
+ # train, val, test setting
180
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
181
+
182
+ #######################################################################
183
+ # PART 5 Runtime #
184
+ #######################################################################
185
+ # Log the dialogue periodically during the training process, optional
186
+ custom_hooks = [
187
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
188
+ dict(
189
+ type=EvaluateChatHook,
190
+ tokenizer=tokenizer,
191
+ every_n_iters=evaluation_freq,
192
+ evaluation_inputs=evaluation_inputs,
193
+ system=SYSTEM,
194
+ prompt_template=prompt_template)
195
+ ]
196
+
197
+ if use_varlen_attn:
198
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
199
+
200
+ # configure default hooks
201
+ default_hooks = dict(
202
+ # record the time of every iteration.
203
+ timer=dict(type=IterTimerHook),
204
+ # print log every 10 iterations.
205
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
206
+ # enable the parameter scheduler.
207
+ param_scheduler=dict(type=ParamSchedulerHook),
208
+ # save checkpoint per `save_steps`.
209
+ checkpoint=dict(
210
+ type=CheckpointHook,
211
+ by_epoch=False,
212
+ interval=save_steps,
213
+ max_keep_ckpts=save_total_limit),
214
+ # set sampler seed in distributed environment.
215
+ sampler_seed=dict(type=DistSamplerSeedHook),
216
+ )
217
+
218
+ # configure environment
219
+ env_cfg = dict(
220
+ # whether to enable cudnn benchmark
221
+ cudnn_benchmark=False,
222
+ # set multi process parameters
223
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
224
+ # set distributed parameters
225
+ dist_cfg=dict(backend='nccl'),
226
+ )
227
+
228
+ # set visualizer
229
+ visualizer = None
230
+
231
+ # set log level
232
+ log_level = 'INFO'
233
+
234
+ # load from which checkpoint
235
+ load_from = None
236
+
237
+ # whether to resume training from the loaded checkpoint
238
+ resume = False
239
+
240
+ # Defaults to use random seed and disable `deterministic`
241
+ randomness = dict(seed=None, deterministic=False)
242
+
243
+ # set log processor
244
+ log_processor = dict(by_epoch=False)
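A quick sanity check on the optimizer settings in the config above: each optimizer step consumes batch_size * accumulative_counts samples per device, and with pack_to_max_length=True every sample is already a 2048-token packed block. The sketch below works through the arithmetic; the GPU count is a hypothetical assumption, since the config itself does not fix it.

# Rough effective-batch-size arithmetic for the settings above (a sketch,
# not part of the config; the GPU count is a hypothetical example).
batch_size = 1            # per_device, as in the config
accumulative_counts = 16  # gradient accumulation, as in the config
num_gpus = 8              # assumption for illustration only

per_device_step = batch_size * accumulative_counts   # 16 packed sequences per optimizer step
global_step = per_device_step * num_gpus             # 128 packed sequences across 8 GPUs
print(per_device_step, global_step)                  # 16 128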
xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_zh_e3.py ADDED
@@ -0,0 +1,212 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import alpaca_zh_map_fn, template_map_fn_factory
16
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
17
+ VarlenAttnArgsToMessageHubHook)
18
+ from xtuner.engine.runner import TrainLoop
19
+ from xtuner.model import SupervisedFinetune
20
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
21
+
22
+ #######################################################################
23
+ # PART 1 Settings #
24
+ #######################################################################
25
+ # Model
26
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base'
27
+ use_varlen_attn = False
28
+
29
+ # Data
30
+ alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese'
31
+ prompt_template = PROMPT_TEMPLATE.default
32
+ max_length = 2048
33
+ pack_to_max_length = True
34
+
35
+ # Scheduler & Optimizer
36
+ batch_size = 1 # per_device
37
+ accumulative_counts = 16
38
+ dataloader_num_workers = 0
39
+ max_epochs = 3
40
+ optim_type = AdamW
41
+ lr = 2e-4
42
+ betas = (0.9, 0.999)
43
+ weight_decay = 0
44
+ max_norm = 1 # grad clip
45
+ warmup_ratio = 0.03
46
+
47
+ # Save
48
+ save_steps = 500
49
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
50
+
51
+ # Evaluate the generation performance during the training
52
+ evaluation_freq = 500
53
+ SYSTEM = SYSTEM_TEMPLATE.alpaca
54
+ evaluation_inputs = [
55
+ '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
56
+ ]
57
+
58
+ #######################################################################
59
+ # PART 2 Model & Tokenizer #
60
+ #######################################################################
61
+ tokenizer = dict(
62
+ type=AutoTokenizer.from_pretrained,
63
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
64
+ trust_remote_code=True,
65
+ padding_side='right')
66
+
67
+ model = dict(
68
+ type=SupervisedFinetune,
69
+ use_varlen_attn=use_varlen_attn,
70
+ llm=dict(
71
+ type=AutoModelForCausalLM.from_pretrained,
72
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
73
+ trust_remote_code=True,
74
+ torch_dtype=torch.float16,
75
+ quantization_config=dict(
76
+ type=BitsAndBytesConfig,
77
+ load_in_4bit=True,
78
+ load_in_8bit=False,
79
+ llm_int8_threshold=6.0,
80
+ llm_int8_has_fp16_weight=False,
81
+ bnb_4bit_compute_dtype=torch.float16,
82
+ bnb_4bit_use_double_quant=True,
83
+ bnb_4bit_quant_type='nf4')),
84
+ lora=dict(
85
+ type=LoraConfig,
86
+ r=64,
87
+ lora_alpha=16,
88
+ lora_dropout=0.1,
89
+ bias='none',
90
+ task_type='CAUSAL_LM'))
91
+
92
+ #######################################################################
93
+ # PART 3 Dataset & Dataloader #
94
+ #######################################################################
95
+ alpaca_zh = dict(
96
+ type=process_hf_dataset,
97
+ dataset=dict(type=load_dataset, path=alpaca_zh_path),
98
+ tokenizer=tokenizer,
99
+ max_length=max_length,
100
+ dataset_map_fn=alpaca_zh_map_fn,
101
+ template_map_fn=dict(
102
+ type=template_map_fn_factory, template=prompt_template),
103
+ remove_unused_columns=True,
104
+ shuffle_before_pack=True,
105
+ pack_to_max_length=pack_to_max_length,
106
+ use_varlen_attn=use_varlen_attn)
107
+
108
+ train_dataloader = dict(
109
+ batch_size=batch_size,
110
+ num_workers=dataloader_num_workers,
111
+ dataset=alpaca_zh,
112
+ sampler=dict(type=DefaultSampler, shuffle=True),
113
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
114
+
115
+ #######################################################################
116
+ # PART 4 Scheduler & Optimizer #
117
+ #######################################################################
118
+ # optimizer
119
+ optim_wrapper = dict(
120
+ type=AmpOptimWrapper,
121
+ optimizer=dict(
122
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
123
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
124
+ accumulative_counts=accumulative_counts,
125
+ loss_scale='dynamic',
126
+ dtype='float16')
127
+
128
+ # learning policy
129
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
130
+ param_scheduler = [
131
+ dict(
132
+ type=LinearLR,
133
+ start_factor=1e-5,
134
+ by_epoch=True,
135
+ begin=0,
136
+ end=warmup_ratio * max_epochs,
137
+ convert_to_iter_based=True),
138
+ dict(
139
+ type=CosineAnnealingLR,
140
+ eta_min=0.0,
141
+ by_epoch=True,
142
+ begin=warmup_ratio * max_epochs,
143
+ end=max_epochs,
144
+ convert_to_iter_based=True)
145
+ ]
146
+
147
+ # train, val, test setting
148
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
149
+
150
+ #######################################################################
151
+ # PART 5 Runtime #
152
+ #######################################################################
153
+ # Log the dialogue periodically during the training process, optional
154
+ custom_hooks = [
155
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
156
+ dict(
157
+ type=EvaluateChatHook,
158
+ tokenizer=tokenizer,
159
+ every_n_iters=evaluation_freq,
160
+ evaluation_inputs=evaluation_inputs,
161
+ system=SYSTEM,
162
+ prompt_template=prompt_template)
163
+ ]
164
+
165
+ if use_varlen_attn:
166
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
167
+
168
+ # configure default hooks
169
+ default_hooks = dict(
170
+ # record the time of every iteration.
171
+ timer=dict(type=IterTimerHook),
172
+ # print log every 10 iterations.
173
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
174
+ # enable the parameter scheduler.
175
+ param_scheduler=dict(type=ParamSchedulerHook),
176
+ # save checkpoint per `save_steps`.
177
+ checkpoint=dict(
178
+ type=CheckpointHook,
179
+ by_epoch=False,
180
+ interval=save_steps,
181
+ max_keep_ckpts=save_total_limit),
182
+ # set sampler seed in distributed environment.
183
+ sampler_seed=dict(type=DistSamplerSeedHook),
184
+ )
185
+
186
+ # configure environment
187
+ env_cfg = dict(
188
+ # whether to enable cudnn benchmark
189
+ cudnn_benchmark=False,
190
+ # set multi process parameters
191
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
192
+ # set distributed parameters
193
+ dist_cfg=dict(backend='nccl'),
194
+ )
195
+
196
+ # set visualizer
197
+ visualizer = None
198
+
199
+ # set log level
200
+ log_level = 'INFO'
201
+
202
+ # load from which checkpoint
203
+ load_from = None
204
+
205
+ # whether to resume training from the loaded checkpoint
206
+ resume = False
207
+
208
+ # Defaults to use random seed and disable `deterministic`
209
+ randomness = dict(seed=None, deterministic=False)
210
+
211
+ # set log processor
212
+ log_processor = dict(by_epoch=False)
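The param_scheduler above expresses the warmup window in epochs (end=warmup_ratio * max_epochs) and relies on convert_to_iter_based=True to translate that into iterations at runtime. A minimal sketch of the arithmetic, assuming a hypothetical iteration count per epoch (the real value depends on the packed dataset size):

# Warmup-length arithmetic for the param_scheduler above (a sketch; the
# iterations-per-epoch figure is a made-up example).
warmup_ratio = 0.03
max_epochs = 3
iters_per_epoch = 10_000   # hypothetical; depends on dataset size after packing

warmup_epochs = warmup_ratio * max_epochs                    # 0.09 epochs of LinearLR
warmup_iters = int(warmup_epochs * iters_per_epoch)          # 900 warmup iterations
cosine_iters = max_epochs * iters_per_epoch - warmup_iters   # 29100 CosineAnnealingLR iterations
print(warmup_epochs, warmup_iters, cosine_iters)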
xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_arxiv_gentitle_e3.py ADDED
@@ -0,0 +1,247 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import arxiv_map_fn, template_map_fn_factory
16
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
17
+ VarlenAttnArgsToMessageHubHook)
18
+ from xtuner.engine.runner import TrainLoop
19
+ from xtuner.model import SupervisedFinetune
20
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
21
+
22
+ #######################################################################
23
+ # PART 1 Settings #
24
+ #######################################################################
25
+ # Model
26
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base'
27
+ use_varlen_attn = False
28
+
29
+ # Data
30
+ # 1. Download data from https://kaggle.com/datasets/Cornell-University/arxiv
31
+ # 2. Process data by `xtuner preprocess arxiv ${DOWNLOADED_DATA} ./data/arxiv_data.json [optional arguments]` # noqa: E501
32
+ data_path = './data/arxiv_data.json'
33
+ prompt_template = PROMPT_TEMPLATE.default
34
+ max_length = 2048
35
+ pack_to_max_length = True
36
+
37
+ # Scheduler & Optimizer
38
+ batch_size = 1 # per_device
39
+ accumulative_counts = 16
40
+ dataloader_num_workers = 0
41
+ max_epochs = 3
42
+ optim_type = AdamW
43
+ lr = 2e-4
44
+ betas = (0.9, 0.999)
45
+ weight_decay = 0
46
+ max_norm = 1 # grad clip
47
+ warmup_ratio = 0.03
48
+
49
+ # Save
50
+ save_steps = 500
51
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
52
+
53
+ # Evaluate the generation performance during the training
54
+ evaluation_freq = 500
55
+ SYSTEM = SYSTEM_TEMPLATE.arxiv_gentile
56
+ evaluation_inputs = [
57
+ ('We present InternLM, a multilingual foundational language '
58
+ 'model with 104B parameters. InternLM is pre-trained on a large '
59
+ 'corpora with 1.6T tokens with a multi-phase progressive '
60
+ 'process, and then fine-tuned to align with human preferences. '
61
+ 'We also developed a training system called Uniscale-LLM for '
62
+ 'efficient large language model training. The evaluation on a '
63
+ 'number of benchmarks shows that InternLM achieves '
64
+ 'state-of-the-art performance in multiple aspects, including '
65
+ 'knowledge understanding, reading comprehension, mathematics, '
66
+ 'and coding. With such well-rounded capabilities, InternLM '
67
+ 'achieves outstanding performances on comprehensive exams, '
68
+ 'including MMLU, AGIEval, C-Eval and GAOKAO-Bench, without '
69
+ 'resorting to external tools. On these benchmarks, InternLM '
70
+ 'not only significantly outperforms open-source models, but '
71
+ 'also obtains superior performance compared to ChatGPT. Also, '
72
+ 'InternLM demonstrates excellent capability of understanding '
73
+ 'Chinese language and Chinese culture, which makes it a '
74
+ 'suitable foundation model to support Chinese-oriented language '
75
+ 'applications. This manuscript gives a detailed study of '
76
+ 'our results, with benchmarks and examples across a diverse '
77
+ 'set of knowledge domains and tasks.'),
78
+ ('In this work, we develop and release Llama 2, a collection of '
79
+ 'pretrained and fine-tuned large language models (LLMs) ranging '
80
+ 'in scale from 7 billion to 70 billion parameters.\nOur '
81
+ 'fine-tuned LLMs, called LLAMA 2-CHAT, are optimized for '
82
+ 'dialogue use cases. Our models outperform open-source chat '
83
+ 'models on most benchmarks we tested, and based on our human '
84
+ 'evaluations for helpfulness and safety, may be a suitable '
85
+ 'substitute for closedsource models. We provide a detailed '
86
+ 'description of our approach to fine-tuning and safety '
87
+ 'improvements of LLAMA 2-CHAT in order to enable the community '
88
+ 'to build on our work and contribute to the responsible '
89
+ 'development of LLMs.')
90
+ ]
91
+
92
+ #######################################################################
93
+ # PART 2 Model & Tokenizer #
94
+ #######################################################################
95
+ tokenizer = dict(
96
+ type=AutoTokenizer.from_pretrained,
97
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
98
+ trust_remote_code=True,
99
+ padding_side='right')
100
+
101
+ model = dict(
102
+ type=SupervisedFinetune,
103
+ use_varlen_attn=use_varlen_attn,
104
+ llm=dict(
105
+ type=AutoModelForCausalLM.from_pretrained,
106
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
107
+ trust_remote_code=True,
108
+ torch_dtype=torch.float16,
109
+ quantization_config=dict(
110
+ type=BitsAndBytesConfig,
111
+ load_in_4bit=True,
112
+ load_in_8bit=False,
113
+ llm_int8_threshold=6.0,
114
+ llm_int8_has_fp16_weight=False,
115
+ bnb_4bit_compute_dtype=torch.float16,
116
+ bnb_4bit_use_double_quant=True,
117
+ bnb_4bit_quant_type='nf4')),
118
+ lora=dict(
119
+ type=LoraConfig,
120
+ r=64,
121
+ lora_alpha=16,
122
+ lora_dropout=0.1,
123
+ bias='none',
124
+ task_type='CAUSAL_LM'))
125
+
126
+ #######################################################################
127
+ # PART 3 Dataset & Dataloader #
128
+ #######################################################################
129
+ train_dataset = dict(
130
+ type=process_hf_dataset,
131
+ dataset=dict(
132
+ type=load_dataset, path='json', data_files=dict(train=data_path)),
133
+ tokenizer=tokenizer,
134
+ max_length=max_length,
135
+ dataset_map_fn=arxiv_map_fn,
136
+ template_map_fn=dict(
137
+ type=template_map_fn_factory, template=prompt_template),
138
+ remove_unused_columns=True,
139
+ shuffle_before_pack=True,
140
+ pack_to_max_length=pack_to_max_length,
141
+ use_varlen_attn=use_varlen_attn)
142
+
143
+ train_dataloader = dict(
144
+ batch_size=batch_size,
145
+ num_workers=dataloader_num_workers,
146
+ dataset=train_dataset,
147
+ sampler=dict(type=DefaultSampler, shuffle=True),
148
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
149
+
150
+ #######################################################################
151
+ # PART 4 Scheduler & Optimizer #
152
+ #######################################################################
153
+ # optimizer
154
+ optim_wrapper = dict(
155
+ type=AmpOptimWrapper,
156
+ optimizer=dict(
157
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
158
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
159
+ accumulative_counts=accumulative_counts,
160
+ loss_scale='dynamic',
161
+ dtype='float16')
162
+
163
+ # learning policy
164
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
165
+ param_scheduler = [
166
+ dict(
167
+ type=LinearLR,
168
+ start_factor=1e-5,
169
+ by_epoch=True,
170
+ begin=0,
171
+ end=warmup_ratio * max_epochs,
172
+ convert_to_iter_based=True),
173
+ dict(
174
+ type=CosineAnnealingLR,
175
+ eta_min=0.0,
176
+ by_epoch=True,
177
+ begin=warmup_ratio * max_epochs,
178
+ end=max_epochs,
179
+ convert_to_iter_based=True)
180
+ ]
181
+
182
+ # train, val, test setting
183
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
184
+
185
+ #######################################################################
186
+ # PART 5 Runtime #
187
+ #######################################################################
188
+ # Log the dialogue periodically during the training process, optional
189
+ custom_hooks = [
190
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
191
+ dict(
192
+ type=EvaluateChatHook,
193
+ tokenizer=tokenizer,
194
+ every_n_iters=evaluation_freq,
195
+ evaluation_inputs=evaluation_inputs,
196
+ system=SYSTEM,
197
+ prompt_template=prompt_template)
198
+ ]
199
+
200
+ if use_varlen_attn:
201
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
202
+
203
+ # configure default hooks
204
+ default_hooks = dict(
205
+ # record the time of every iteration.
206
+ timer=dict(type=IterTimerHook),
207
+ # print log every 10 iterations.
208
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
209
+ # enable the parameter scheduler.
210
+ param_scheduler=dict(type=ParamSchedulerHook),
211
+ # save checkpoint per `save_steps`.
212
+ checkpoint=dict(
213
+ type=CheckpointHook,
214
+ by_epoch=False,
215
+ interval=save_steps,
216
+ max_keep_ckpts=save_total_limit),
217
+ # set sampler seed in distributed environment.
218
+ sampler_seed=dict(type=DistSamplerSeedHook),
219
+ )
220
+
221
+ # configure environment
222
+ env_cfg = dict(
223
+ # whether to enable cudnn benchmark
224
+ cudnn_benchmark=False,
225
+ # set multi process parameters
226
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
227
+ # set distributed parameters
228
+ dist_cfg=dict(backend='nccl'),
229
+ )
230
+
231
+ # set visualizer
232
+ visualizer = None
233
+
234
+ # set log level
235
+ log_level = 'INFO'
236
+
237
+ # load from which checkpoint
238
+ load_from = None
239
+
240
+ # whether to resume training from the loaded checkpoint
241
+ resume = False
242
+
243
+ # Defaults to use random seed and disable `deterministic`
244
+ randomness = dict(seed=None, deterministic=False)
245
+
246
+ # set log processor
247
+ log_processor = dict(by_epoch=False)
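The config above reads a locally preprocessed JSON file through load_dataset(path='json', data_files=...). If you want to confirm the `xtuner preprocess arxiv` step worked before launching training, a minimal sketch using the same `datasets` call (the record fields printed are simply whatever preprocessing produced; none are assumed here):

# Inspect the preprocessed arXiv file before training (a sketch; the path
# matches data_path in the config above).
from datasets import load_dataset

ds = load_dataset('json', data_files=dict(train='./data/arxiv_data.json'))
print(ds['train'])      # row count and column names
print(ds['train'][0])   # first record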
xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_code_alpaca_e3.py ADDED
@@ -0,0 +1,216 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import code_alpaca_map_fn, template_map_fn_factory
16
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
17
+ VarlenAttnArgsToMessageHubHook)
18
+ from xtuner.engine.runner import TrainLoop
19
+ from xtuner.model import SupervisedFinetune
20
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
21
+
22
+ #######################################################################
23
+ # PART 1 Settings #
24
+ #######################################################################
25
+ # Model
26
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base'
27
+ use_varlen_attn = False
28
+
29
+ # Data
30
+ data_path = 'HuggingFaceH4/CodeAlpaca_20K'
31
+ prompt_template = PROMPT_TEMPLATE.default
32
+ max_length = 2048
33
+ pack_to_max_length = True
34
+
35
+ # Scheduler & Optimizer
36
+ batch_size = 1 # per_device
37
+ accumulative_counts = 16
38
+ dataloader_num_workers = 0
39
+ max_epochs = 3
40
+ optim_type = AdamW
41
+ lr = 2e-4
42
+ betas = (0.9, 0.999)
43
+ weight_decay = 0
44
+ max_norm = 1 # grad clip
45
+ warmup_ratio = 0.03
46
+
47
+ # Save
48
+ save_steps = 500
49
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
50
+
51
+ # Evaluate the generation performance during the training
52
+ evaluation_freq = 100
53
+ SYSTEM = SYSTEM_TEMPLATE.coder
54
+ evaluation_inputs = [
55
+ ('写一个Python函数,将十六进制颜色代码(如#0066ee)转换为对应的'
56
+ '红、绿、蓝(RGB)三个颜色分量值,并以元组的形式返回。'),
57
+ ('Write a Python function that takes a hexadecimal color code '
58
+ '(e.g., #0066ee) as input and converts it into the corresponding '
59
+ 'red, green, and blue (RGB) color component values.')
60
+ ]
61
+
62
+ #######################################################################
63
+ # PART 2 Model & Tokenizer #
64
+ #######################################################################
65
+ tokenizer = dict(
66
+ type=AutoTokenizer.from_pretrained,
67
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
68
+ trust_remote_code=True,
69
+ padding_side='right')
70
+
71
+ model = dict(
72
+ type=SupervisedFinetune,
73
+ use_varlen_attn=use_varlen_attn,
74
+ llm=dict(
75
+ type=AutoModelForCausalLM.from_pretrained,
76
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
77
+ trust_remote_code=True,
78
+ torch_dtype=torch.float16,
79
+ quantization_config=dict(
80
+ type=BitsAndBytesConfig,
81
+ load_in_4bit=True,
82
+ load_in_8bit=False,
83
+ llm_int8_threshold=6.0,
84
+ llm_int8_has_fp16_weight=False,
85
+ bnb_4bit_compute_dtype=torch.float16,
86
+ bnb_4bit_use_double_quant=True,
87
+ bnb_4bit_quant_type='nf4')),
88
+ lora=dict(
89
+ type=LoraConfig,
90
+ r=64,
91
+ lora_alpha=16,
92
+ lora_dropout=0.1,
93
+ bias='none',
94
+ task_type='CAUSAL_LM'))
95
+
96
+ #######################################################################
97
+ # PART 3 Dataset & Dataloader #
98
+ #######################################################################
99
+ train_dataset = dict(
100
+ type=process_hf_dataset,
101
+ dataset=dict(type=load_dataset, path=data_path),
102
+ tokenizer=tokenizer,
103
+ max_length=max_length,
104
+ dataset_map_fn=code_alpaca_map_fn,
105
+ template_map_fn=dict(
106
+ type=template_map_fn_factory, template=prompt_template),
107
+ remove_unused_columns=True,
108
+ shuffle_before_pack=True,
109
+ pack_to_max_length=pack_to_max_length,
110
+ use_varlen_attn=use_varlen_attn)
111
+
112
+ train_dataloader = dict(
113
+ batch_size=batch_size,
114
+ num_workers=dataloader_num_workers,
115
+ dataset=train_dataset,
116
+ sampler=dict(type=DefaultSampler, shuffle=True),
117
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
118
+
119
+ #######################################################################
120
+ # PART 4 Scheduler & Optimizer #
121
+ #######################################################################
122
+ # optimizer
123
+ optim_wrapper = dict(
124
+ type=AmpOptimWrapper,
125
+ optimizer=dict(
126
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
127
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
128
+ accumulative_counts=accumulative_counts,
129
+ loss_scale='dynamic',
130
+ dtype='float16')
131
+
132
+ # learning policy
133
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
134
+ param_scheduler = [
135
+ dict(
136
+ type=LinearLR,
137
+ start_factor=1e-5,
138
+ by_epoch=True,
139
+ begin=0,
140
+ end=warmup_ratio * max_epochs,
141
+ convert_to_iter_based=True),
142
+ dict(
143
+ type=CosineAnnealingLR,
144
+ eta_min=0.0,
145
+ by_epoch=True,
146
+ begin=warmup_ratio * max_epochs,
147
+ end=max_epochs,
148
+ convert_to_iter_based=True)
149
+ ]
150
+
151
+ # train, val, test setting
152
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
153
+
154
+ #######################################################################
155
+ # PART 5 Runtime #
156
+ #######################################################################
157
+ # Log the dialogue periodically during the training process, optional
158
+ custom_hooks = [
159
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
160
+ dict(
161
+ type=EvaluateChatHook,
162
+ tokenizer=tokenizer,
163
+ every_n_iters=evaluation_freq,
164
+ evaluation_inputs=evaluation_inputs,
165
+ system=SYSTEM,
166
+ prompt_template=prompt_template)
167
+ ]
168
+
169
+ if use_varlen_attn:
170
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
171
+
172
+ # configure default hooks
173
+ default_hooks = dict(
174
+ # record the time of every iteration.
175
+ timer=dict(type=IterTimerHook),
176
+ # print log every 10 iterations.
177
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
178
+ # enable the parameter scheduler.
179
+ param_scheduler=dict(type=ParamSchedulerHook),
180
+ # save checkpoint per `save_steps`.
181
+ checkpoint=dict(
182
+ type=CheckpointHook,
183
+ by_epoch=False,
184
+ interval=save_steps,
185
+ max_keep_ckpts=save_total_limit),
186
+ # set sampler seed in distributed evrionment.
187
+ sampler_seed=dict(type=DistSamplerSeedHook),
188
+ )
189
+
190
+ # configure environment
191
+ env_cfg = dict(
192
+ # whether to enable cudnn benchmark
193
+ cudnn_benchmark=False,
194
+ # set multi process parameters
195
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
196
+ # set distributed parameters
197
+ dist_cfg=dict(backend='nccl'),
198
+ )
199
+
200
+ # set visualizer
201
+ visualizer = None
202
+
203
+ # set log level
204
+ log_level = 'INFO'
205
+
206
+ # load from which checkpoint
207
+ load_from = None
208
+
209
+ # whether to resume training from the loaded checkpoint
210
+ resume = False
211
+
212
+ # Defaults to use random seed and disable `deterministic`
213
+ randomness = dict(seed=None, deterministic=False)
214
+
215
+ # set log processor
216
+ log_processor = dict(by_epoch=False)
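The quantization_config above is a lazily constructed transformers BitsAndBytesConfig, following the type=/kwargs dict style used throughout these configs. For reference, loading the same 4-bit model directly, outside a config file, would look roughly like the sketch below, using exactly the arguments listed above:

# Roughly what the quantization_config above builds at runtime (a sketch,
# written as a direct transformers call rather than a config dict).
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    load_in_8bit=False,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4')

model = AutoModelForCausalLM.from_pretrained(
    'baichuan-inc/Baichuan2-13B-Base',
    trust_remote_code=True,
    torch_dtype=torch.float16,
    quantization_config=bnb_config)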
xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_colorist_e5.py ADDED
@@ -0,0 +1,212 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import colors_map_fn, template_map_fn_factory
16
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
17
+ VarlenAttnArgsToMessageHubHook)
18
+ from xtuner.engine.runner import TrainLoop
19
+ from xtuner.model import SupervisedFinetune
20
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
21
+
22
+ #######################################################################
23
+ # PART 1 Settings #
24
+ #######################################################################
25
+ # Model
26
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base'
27
+ use_varlen_attn = False
28
+
29
+ # Data
30
+ data_path = 'burkelibbey/colors'
31
+ prompt_template = PROMPT_TEMPLATE.default
32
+ max_length = 2048
33
+ pack_to_max_length = True
34
+
35
+ # Scheduler & Optimizer
36
+ batch_size = 1 # per_device
37
+ accumulative_counts = 16
38
+ dataloader_num_workers = 0
39
+ max_epochs = 5
40
+ optim_type = AdamW
41
+ lr = 2e-4
42
+ betas = (0.9, 0.999)
43
+ weight_decay = 0
44
+ max_norm = 1 # grad clip
45
+ warmup_ratio = 0.03
46
+
47
+ # Save
48
+ save_steps = 500
49
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
50
+
51
+ # Evaluate the generation performance during the training
52
+ evaluation_freq = 200
53
+ SYSTEM = SYSTEM_TEMPLATE.colorist
54
+ evaluation_inputs = [
55
+ '请给我一个像天空一样清澈透明的蓝色。', 'Please give me a clear blue like the sky.'
56
+ ]
57
+
58
+ #######################################################################
59
+ # PART 2 Model & Tokenizer #
60
+ #######################################################################
61
+ tokenizer = dict(
62
+ type=AutoTokenizer.from_pretrained,
63
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
64
+ trust_remote_code=True,
65
+ padding_side='right')
66
+
67
+ model = dict(
68
+ type=SupervisedFinetune,
69
+ use_varlen_attn=use_varlen_attn,
70
+ llm=dict(
71
+ type=AutoModelForCausalLM.from_pretrained,
72
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
73
+ trust_remote_code=True,
74
+ torch_dtype=torch.float16,
75
+ quantization_config=dict(
76
+ type=BitsAndBytesConfig,
77
+ load_in_4bit=True,
78
+ load_in_8bit=False,
79
+ llm_int8_threshold=6.0,
80
+ llm_int8_has_fp16_weight=False,
81
+ bnb_4bit_compute_dtype=torch.float16,
82
+ bnb_4bit_use_double_quant=True,
83
+ bnb_4bit_quant_type='nf4')),
84
+ lora=dict(
85
+ type=LoraConfig,
86
+ r=64,
87
+ lora_alpha=16,
88
+ lora_dropout=0.1,
89
+ bias='none',
90
+ task_type='CAUSAL_LM'))
91
+
92
+ #######################################################################
93
+ # PART 3 Dataset & Dataloader #
94
+ #######################################################################
95
+ train_dataset = dict(
96
+ type=process_hf_dataset,
97
+ dataset=dict(type=load_dataset, path=data_path),
98
+ tokenizer=tokenizer,
99
+ max_length=max_length,
100
+ dataset_map_fn=colors_map_fn,
101
+ template_map_fn=dict(
102
+ type=template_map_fn_factory, template=prompt_template),
103
+ remove_unused_columns=True,
104
+ shuffle_before_pack=True,
105
+ pack_to_max_length=pack_to_max_length,
106
+ use_varlen_attn=use_varlen_attn)
107
+
108
+ train_dataloader = dict(
109
+ batch_size=batch_size,
110
+ num_workers=dataloader_num_workers,
111
+ dataset=train_dataset,
112
+ sampler=dict(type=DefaultSampler, shuffle=True),
113
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
114
+
115
+ #######################################################################
116
+ # PART 4 Scheduler & Optimizer #
117
+ #######################################################################
118
+ # optimizer
119
+ optim_wrapper = dict(
120
+ type=AmpOptimWrapper,
121
+ optimizer=dict(
122
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
123
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
124
+ accumulative_counts=accumulative_counts,
125
+ loss_scale='dynamic',
126
+ dtype='float16')
127
+
128
+ # learning policy
129
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
130
+ param_scheduler = [
131
+ dict(
132
+ type=LinearLR,
133
+ start_factor=1e-5,
134
+ by_epoch=True,
135
+ begin=0,
136
+ end=warmup_ratio * max_epochs,
137
+ convert_to_iter_based=True),
138
+ dict(
139
+ type=CosineAnnealingLR,
140
+ eta_min=0.0,
141
+ by_epoch=True,
142
+ begin=warmup_ratio * max_epochs,
143
+ end=max_epochs,
144
+ convert_to_iter_based=True)
145
+ ]
146
+
147
+ # train, val, test setting
148
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
149
+
150
+ #######################################################################
151
+ # PART 5 Runtime #
152
+ #######################################################################
153
+ # Log the dialogue periodically during the training process, optional
154
+ custom_hooks = [
155
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
156
+ dict(
157
+ type=EvaluateChatHook,
158
+ tokenizer=tokenizer,
159
+ every_n_iters=evaluation_freq,
160
+ evaluation_inputs=evaluation_inputs,
161
+ system=SYSTEM,
162
+ prompt_template=prompt_template)
163
+ ]
164
+
165
+ if use_varlen_attn:
166
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
167
+
168
+ # configure default hooks
169
+ default_hooks = dict(
170
+ # record the time of every iteration.
171
+ timer=dict(type=IterTimerHook),
172
+ # print log every 10 iterations.
173
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
174
+ # enable the parameter scheduler.
175
+ param_scheduler=dict(type=ParamSchedulerHook),
176
+ # save checkpoint per `save_steps`.
177
+ checkpoint=dict(
178
+ type=CheckpointHook,
179
+ by_epoch=False,
180
+ interval=save_steps,
181
+ max_keep_ckpts=save_total_limit),
182
+ # set sampler seed in distributed evrionment.
183
+ sampler_seed=dict(type=DistSamplerSeedHook),
184
+ )
185
+
186
+ # configure environment
187
+ env_cfg = dict(
188
+ # whether to enable cudnn benchmark
189
+ cudnn_benchmark=False,
190
+ # set multi process parameters
191
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
192
+ # set distributed parameters
193
+ dist_cfg=dict(backend='nccl'),
194
+ )
195
+
196
+ # set visualizer
197
+ visualizer = None
198
+
199
+ # set log level
200
+ log_level = 'INFO'
201
+
202
+ # load from which checkpoint
203
+ load_from = None
204
+
205
+ # whether to resume training from the loaded checkpoint
206
+ resume = False
207
+
208
+ # Defaults to use random seed and disable `deterministic`
209
+ randomness = dict(seed=None, deterministic=False)
210
+
211
+ # set log processor
212
+ log_processor = dict(by_epoch=False)
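With r=64 and lora_alpha=16, each adapted linear layer gains two low-rank matrices, so its trainable parameters are roughly r * (d_in + d_out), and the LoRA update is scaled by lora_alpha / r = 0.25. A back-of-the-envelope sketch, assuming a hypothetical 5120-dimensional square projection (the hidden size is not stated in the config):

# Back-of-the-envelope LoRA size estimate (a sketch; the layer shape is a
# hypothetical example, not read from the model).
r = 64                   # as in the config
d_in = d_out = 5120      # assumption for illustration only
params_per_layer = r * (d_in + d_out)   # A is (r x d_in), B is (d_out x r)
print(params_per_layer)                  # 655360 trainable parameters for this one layer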
xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_lawyer_e3.py ADDED
@@ -0,0 +1,236 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import ConcatDataset, process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import (crime_kg_assitant_map_fn,
16
+ law_reference_map_fn,
17
+ template_map_fn_factory)
18
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
19
+ VarlenAttnArgsToMessageHubHook)
20
+ from xtuner.engine.runner import TrainLoop
21
+ from xtuner.model import SupervisedFinetune
22
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
23
+
24
+ #######################################################################
25
+ # PART 1 Settings #
26
+ #######################################################################
27
+ # Model
28
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base'
29
+ use_varlen_attn = False
30
+
31
+ # Data
32
+ # download data from https://github.com/LiuHC0428/LAW-GPT
33
+ crime_kg_assitant_path = './data/CrimeKgAssitant清洗后_52k.json'
34
+ law_reference_data_path = './data/训练数据_带法律依据_92k.json'
35
+ prompt_template = PROMPT_TEMPLATE.default
36
+ max_length = 2048
37
+ pack_to_max_length = True
38
+
39
+ # Scheduler & Optimizer
40
+ batch_size = 1 # per_device
41
+ accumulative_counts = 16
42
+ dataloader_num_workers = 0
43
+ max_epochs = 3
44
+ optim_type = AdamW
45
+ lr = 2e-4
46
+ betas = (0.9, 0.999)
47
+ weight_decay = 0
48
+ max_norm = 1 # grad clip
49
+ warmup_ratio = 0.03
50
+
51
+ # Save
52
+ save_steps = 500
53
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
54
+
55
+ # Evaluate the generation performance during the training
56
+ evaluation_freq = 500
57
+ SYSTEM = SYSTEM_TEMPLATE.lawyer
58
+ evaluation_inputs = ['请问离婚需要准备什么材料?', '销售鳄鱼皮包违法吗?']
59
+
60
+ #######################################################################
61
+ # PART 2 Model & Tokenizer #
62
+ #######################################################################
63
+ tokenizer = dict(
64
+ type=AutoTokenizer.from_pretrained,
65
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
66
+ trust_remote_code=True,
67
+ padding_side='right')
68
+
69
+ model = dict(
70
+ type=SupervisedFinetune,
71
+ use_varlen_attn=use_varlen_attn,
72
+ llm=dict(
73
+ type=AutoModelForCausalLM.from_pretrained,
74
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
75
+ trust_remote_code=True,
76
+ torch_dtype=torch.float16,
77
+ quantization_config=dict(
78
+ type=BitsAndBytesConfig,
79
+ load_in_4bit=True,
80
+ load_in_8bit=False,
81
+ llm_int8_threshold=6.0,
82
+ llm_int8_has_fp16_weight=False,
83
+ bnb_4bit_compute_dtype=torch.float16,
84
+ bnb_4bit_use_double_quant=True,
85
+ bnb_4bit_quant_type='nf4')),
86
+ lora=dict(
87
+ type=LoraConfig,
88
+ r=64,
89
+ lora_alpha=16,
90
+ lora_dropout=0.1,
91
+ bias='none',
92
+ task_type='CAUSAL_LM'))
93
+
94
+ #######################################################################
95
+ # PART 3 Dataset & Dataloader #
96
+ #######################################################################
97
+ crime_kg_assitant = dict(
98
+ type=process_hf_dataset,
99
+ dataset=dict(
100
+ type=load_dataset,
101
+ path='json',
102
+ data_files=dict(train=crime_kg_assitant_path)),
103
+ tokenizer=tokenizer,
104
+ max_length=max_length,
105
+ dataset_map_fn=crime_kg_assitant_map_fn,
106
+ template_map_fn=dict(
107
+ type=template_map_fn_factory, template=prompt_template),
108
+ remove_unused_columns=True,
109
+ shuffle_before_pack=True,
110
+ pack_to_max_length=pack_to_max_length,
111
+ use_varlen_attn=use_varlen_attn)
112
+
113
+ law_reference_data = dict(
114
+ type=process_hf_dataset,
115
+ dataset=dict(
116
+ type=load_dataset,
117
+ path='json',
118
+ data_files=dict(train=law_reference_data_path)),
119
+ tokenizer=tokenizer,
120
+ max_length=max_length,
121
+ dataset_map_fn=law_reference_map_fn,
122
+ template_map_fn=dict(
123
+ type=template_map_fn_factory, template=prompt_template),
124
+ remove_unused_columns=True,
125
+ shuffle_before_pack=True,
126
+ pack_to_max_length=pack_to_max_length,
127
+ use_varlen_attn=use_varlen_attn)
128
+
129
+ train_dataset = dict(
130
+ type=ConcatDataset, datasets=[crime_kg_assitant, law_reference_data])
131
+
132
+ train_dataloader = dict(
133
+ batch_size=batch_size,
134
+ num_workers=dataloader_num_workers,
135
+ dataset=train_dataset,
136
+ sampler=dict(type=DefaultSampler, shuffle=True),
137
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
138
+
139
+ #######################################################################
140
+ # PART 4 Scheduler & Optimizer #
141
+ #######################################################################
142
+ # optimizer
143
+ optim_wrapper = dict(
144
+ type=AmpOptimWrapper,
145
+ optimizer=dict(
146
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
147
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
148
+ accumulative_counts=accumulative_counts,
149
+ loss_scale='dynamic',
150
+ dtype='float16')
151
+
152
+ # learning policy
153
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
154
+ param_scheduler = [
155
+ dict(
156
+ type=LinearLR,
157
+ start_factor=1e-5,
158
+ by_epoch=True,
159
+ begin=0,
160
+ end=warmup_ratio * max_epochs,
161
+ convert_to_iter_based=True),
162
+ dict(
163
+ type=CosineAnnealingLR,
164
+ eta_min=0.0,
165
+ by_epoch=True,
166
+ begin=warmup_ratio * max_epochs,
167
+ end=max_epochs,
168
+ convert_to_iter_based=True)
169
+ ]
170
+
171
+ # train, val, test setting
172
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
173
+
174
+ #######################################################################
175
+ # PART 5 Runtime #
176
+ #######################################################################
177
+ # Log the dialogue periodically during the training process, optional
178
+ custom_hooks = [
179
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
180
+ dict(
181
+ type=EvaluateChatHook,
182
+ tokenizer=tokenizer,
183
+ every_n_iters=evaluation_freq,
184
+ evaluation_inputs=evaluation_inputs,
185
+ system=SYSTEM,
186
+ prompt_template=prompt_template)
187
+ ]
188
+
189
+ if use_varlen_attn:
190
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
191
+
192
+ # configure default hooks
193
+ default_hooks = dict(
194
+ # record the time of every iteration.
195
+ timer=dict(type=IterTimerHook),
196
+ # print log every 10 iterations.
197
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
198
+ # enable the parameter scheduler.
199
+ param_scheduler=dict(type=ParamSchedulerHook),
200
+ # save checkpoint per `save_steps`.
201
+ checkpoint=dict(
202
+ type=CheckpointHook,
203
+ by_epoch=False,
204
+ interval=save_steps,
205
+ max_keep_ckpts=save_total_limit),
206
+ # set sampler seed in distributed evrionment.
207
+ sampler_seed=dict(type=DistSamplerSeedHook),
208
+ )
209
+
210
+ # configure environment
211
+ env_cfg = dict(
212
+ # whether to enable cudnn benchmark
213
+ cudnn_benchmark=False,
214
+ # set multi process parameters
215
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
216
+ # set distributed parameters
217
+ dist_cfg=dict(backend='nccl'),
218
+ )
219
+
220
+ # set visualizer
221
+ visualizer = None
222
+
223
+ # set log level
224
+ log_level = 'INFO'
225
+
226
+ # load from which checkpoint
227
+ load_from = None
228
+
229
+ # whether to resume training from the loaded checkpoint
230
+ resume = False
231
+
232
+ # Defaults to use random seed and disable `deterministic`
233
+ randomness = dict(seed=None, deterministic=False)
234
+
235
+ # set log processor
236
+ log_processor = dict(by_epoch=False)
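Both datasets here are local JSON files (see the download comment above); xtuner wraps them in a ConcatDataset so they are sampled as one training set. A small sketch for checking that both files parse and how many raw samples the combined set will contain, using the plain `datasets` API rather than xtuner's ConcatDataset:

# Quick check that the two local JSON files referenced above load correctly
# (a sketch; paths match the config, concatenation itself happens in xtuner).
from datasets import load_dataset

crime = load_dataset('json', data_files=dict(
    train='./data/CrimeKgAssitant清洗后_52k.json'))['train']
law = load_dataset('json', data_files=dict(
    train='./data/训练数据_带法律依据_92k.json'))['train']
print(len(crime), len(law), len(crime) + len(law))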
xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_oasst1_512_e3.py ADDED
@@ -0,0 +1,212 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import oasst1_map_fn, template_map_fn_factory
16
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
17
+ VarlenAttnArgsToMessageHubHook)
18
+ from xtuner.engine.runner import TrainLoop
19
+ from xtuner.model import SupervisedFinetune
20
+ from xtuner.utils import PROMPT_TEMPLATE
21
+
22
+ #######################################################################
23
+ # PART 1 Settings #
24
+ #######################################################################
25
+ # Model
26
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base'
27
+ use_varlen_attn = False
28
+
29
+ # Data
30
+ data_path = 'timdettmers/openassistant-guanaco'
31
+ prompt_template = PROMPT_TEMPLATE.default
32
+ max_length = 512
33
+ pack_to_max_length = False
34
+
35
+ # Scheduler & Optimizer
36
+ batch_size = 1 # per_device
37
+ accumulative_counts = 16
38
+ dataloader_num_workers = 0
39
+ max_epochs = 3
40
+ optim_type = AdamW
41
+ lr = 2e-4
42
+ betas = (0.9, 0.999)
43
+ weight_decay = 0
44
+ max_norm = 1 # grad clip
45
+ warmup_ratio = 0.03
46
+
47
+ # Save
48
+ save_steps = 500
49
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
50
+
51
+ # Evaluate the generation performance during the training
52
+ evaluation_freq = 500
53
+ SYSTEM = ''
54
+ evaluation_inputs = [
55
+ '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
56
+ ]
57
+
58
+ #######################################################################
59
+ # PART 2 Model & Tokenizer #
60
+ #######################################################################
61
+ tokenizer = dict(
62
+ type=AutoTokenizer.from_pretrained,
63
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
64
+ trust_remote_code=True,
65
+ padding_side='right')
66
+
67
+ model = dict(
68
+ type=SupervisedFinetune,
69
+ use_varlen_attn=use_varlen_attn,
70
+ llm=dict(
71
+ type=AutoModelForCausalLM.from_pretrained,
72
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
73
+ trust_remote_code=True,
74
+ torch_dtype=torch.float16,
75
+ quantization_config=dict(
76
+ type=BitsAndBytesConfig,
77
+ load_in_4bit=True,
78
+ load_in_8bit=False,
79
+ llm_int8_threshold=6.0,
80
+ llm_int8_has_fp16_weight=False,
81
+ bnb_4bit_compute_dtype=torch.float16,
82
+ bnb_4bit_use_double_quant=True,
83
+ bnb_4bit_quant_type='nf4')),
84
+ lora=dict(
85
+ type=LoraConfig,
86
+ r=64,
87
+ lora_alpha=16,
88
+ lora_dropout=0.1,
89
+ bias='none',
90
+ task_type='CAUSAL_LM'))
91
+
92
+ #######################################################################
93
+ # PART 3 Dataset & Dataloader #
94
+ #######################################################################
95
+ train_dataset = dict(
96
+ type=process_hf_dataset,
97
+ dataset=dict(type=load_dataset, path=data_path),
98
+ tokenizer=tokenizer,
99
+ max_length=max_length,
100
+ dataset_map_fn=oasst1_map_fn,
101
+ template_map_fn=dict(
102
+ type=template_map_fn_factory, template=prompt_template),
103
+ remove_unused_columns=True,
104
+ shuffle_before_pack=True,
105
+ pack_to_max_length=pack_to_max_length,
106
+ use_varlen_attn=use_varlen_attn)
107
+
108
+ train_dataloader = dict(
109
+ batch_size=batch_size,
110
+ num_workers=dataloader_num_workers,
111
+ dataset=train_dataset,
112
+ sampler=dict(type=DefaultSampler, shuffle=True),
113
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
114
+
115
+ #######################################################################
116
+ # PART 4 Scheduler & Optimizer #
117
+ #######################################################################
118
+ # optimizer
119
+ optim_wrapper = dict(
120
+ type=AmpOptimWrapper,
121
+ optimizer=dict(
122
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
123
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
124
+ accumulative_counts=accumulative_counts,
125
+ loss_scale='dynamic',
126
+ dtype='float16')
127
+
128
+ # learning policy
129
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
130
+ param_scheduler = [
131
+ dict(
132
+ type=LinearLR,
133
+ start_factor=1e-5,
134
+ by_epoch=True,
135
+ begin=0,
136
+ end=warmup_ratio * max_epochs,
137
+ convert_to_iter_based=True),
138
+ dict(
139
+ type=CosineAnnealingLR,
140
+ eta_min=0.0,
141
+ by_epoch=True,
142
+ begin=warmup_ratio * max_epochs,
143
+ end=max_epochs,
144
+ convert_to_iter_based=True)
145
+ ]
146
+
147
+ # train, val, test setting
148
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
149
+
150
+ #######################################################################
151
+ # PART 5 Runtime #
152
+ #######################################################################
153
+ # Log the dialogue periodically during the training process, optional
154
+ custom_hooks = [
155
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
156
+ dict(
157
+ type=EvaluateChatHook,
158
+ tokenizer=tokenizer,
159
+ every_n_iters=evaluation_freq,
160
+ evaluation_inputs=evaluation_inputs,
161
+ system=SYSTEM,
162
+ prompt_template=prompt_template)
163
+ ]
164
+
165
+ if use_varlen_attn:
166
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
167
+
168
+ # configure default hooks
169
+ default_hooks = dict(
170
+ # record the time of every iteration.
171
+ timer=dict(type=IterTimerHook),
172
+ # print log every 10 iterations.
173
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
174
+ # enable the parameter scheduler.
175
+ param_scheduler=dict(type=ParamSchedulerHook),
176
+ # save checkpoint per `save_steps`.
177
+ checkpoint=dict(
178
+ type=CheckpointHook,
179
+ by_epoch=False,
180
+ interval=save_steps,
181
+ max_keep_ckpts=save_total_limit),
182
+ # set sampler seed in distributed environment.
183
+ sampler_seed=dict(type=DistSamplerSeedHook),
184
+ )
185
+
186
+ # configure environment
187
+ env_cfg = dict(
188
+ # whether to enable cudnn benchmark
189
+ cudnn_benchmark=False,
190
+ # set multi process parameters
191
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
192
+ # set distributed parameters
193
+ dist_cfg=dict(backend='nccl'),
194
+ )
195
+
196
+ # set visualizer
197
+ visualizer = None
198
+
199
+ # set log level
200
+ log_level = 'INFO'
201
+
202
+ # load from which checkpoint
203
+ load_from = None
204
+
205
+ # whether to resume training from the loaded checkpoint
206
+ resume = False
207
+
208
+ # Defaults to use random seed and disable `deterministic`
209
+ randomness = dict(seed=None, deterministic=False)
210
+
211
+ # set log processor
212
+ log_processor = dict(by_epoch=False)
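This _512 variant differs from the oasst1 config that follows mainly in max_length (512 instead of 2048) and pack_to_max_length (False instead of True): each sample is a single conversation truncated to 512 tokens rather than a packed 2048-token block, which keeps per-step memory and compute lower. A rough per-device token-budget comparison per optimizer step (a sketch; the unpacked variant usually uses fewer tokens than this upper bound):

# Worst-case tokens per optimizer step per device for the two oasst1 configs.
accumulative_counts = 16
batch_size = 1

packed_2048 = 2048 * batch_size * accumulative_counts   # oasst1_e3: packed blocks
capped_512 = 512 * batch_size * accumulative_counts     # oasst1_512_e3: upper bound only
print(packed_2048, capped_512)                          # 32768 8192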
xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_oasst1_e3.py ADDED
@@ -0,0 +1,212 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import oasst1_map_fn, template_map_fn_factory
16
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
17
+ VarlenAttnArgsToMessageHubHook)
18
+ from xtuner.engine.runner import TrainLoop
19
+ from xtuner.model import SupervisedFinetune
20
+ from xtuner.utils import PROMPT_TEMPLATE
21
+
22
+ #######################################################################
23
+ # PART 1 Settings #
24
+ #######################################################################
25
+ # Model
26
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base'
27
+ use_varlen_attn = False
28
+
29
+ # Data
30
+ data_path = 'timdettmers/openassistant-guanaco'
31
+ prompt_template = PROMPT_TEMPLATE.default
32
+ max_length = 2048
33
+ pack_to_max_length = True
34
+
35
+ # Scheduler & Optimizer
36
+ batch_size = 1 # per_device
37
+ accumulative_counts = 16
38
+ dataloader_num_workers = 0
39
+ max_epochs = 3
40
+ optim_type = AdamW
41
+ lr = 2e-4
42
+ betas = (0.9, 0.999)
43
+ weight_decay = 0
44
+ max_norm = 1 # grad clip
45
+ warmup_ratio = 0.03
46
+
47
+ # Save
48
+ save_steps = 500
49
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
50
+
51
+ # Evaluate the generation performance during the training
52
+ evaluation_freq = 500
53
+ SYSTEM = ''
54
+ evaluation_inputs = [
55
+ '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
56
+ ]
57
+
58
+ #######################################################################
59
+ # PART 2 Model & Tokenizer #
60
+ #######################################################################
61
+ tokenizer = dict(
62
+ type=AutoTokenizer.from_pretrained,
63
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
64
+ trust_remote_code=True,
65
+ padding_side='right')
66
+
67
+ model = dict(
68
+ type=SupervisedFinetune,
69
+ use_varlen_attn=use_varlen_attn,
70
+ llm=dict(
71
+ type=AutoModelForCausalLM.from_pretrained,
72
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
73
+ trust_remote_code=True,
74
+ torch_dtype=torch.float16,
75
+ quantization_config=dict(
76
+ type=BitsAndBytesConfig,
77
+ load_in_4bit=True,
78
+ load_in_8bit=False,
79
+ llm_int8_threshold=6.0,
80
+ llm_int8_has_fp16_weight=False,
81
+ bnb_4bit_compute_dtype=torch.float16,
82
+ bnb_4bit_use_double_quant=True,
83
+ bnb_4bit_quant_type='nf4')),
84
+ lora=dict(
85
+ type=LoraConfig,
86
+ r=64,
87
+ lora_alpha=16,
88
+ lora_dropout=0.1,
89
+ bias='none',
90
+ task_type='CAUSAL_LM'))
91
+
92
+ #######################################################################
93
+ # PART 3 Dataset & Dataloader #
94
+ #######################################################################
95
+ train_dataset = dict(
96
+ type=process_hf_dataset,
97
+ dataset=dict(type=load_dataset, path=data_path),
98
+ tokenizer=tokenizer,
99
+ max_length=max_length,
100
+ dataset_map_fn=oasst1_map_fn,
101
+ template_map_fn=dict(
102
+ type=template_map_fn_factory, template=prompt_template),
103
+ remove_unused_columns=True,
104
+ shuffle_before_pack=True,
105
+ pack_to_max_length=pack_to_max_length,
106
+ use_varlen_attn=use_varlen_attn)
107
+
108
+ train_dataloader = dict(
109
+ batch_size=batch_size,
110
+ num_workers=dataloader_num_workers,
111
+ dataset=train_dataset,
112
+ sampler=dict(type=DefaultSampler, shuffle=True),
113
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
114
+
115
+ #######################################################################
116
+ # PART 4 Scheduler & Optimizer #
117
+ #######################################################################
118
+ # optimizer
119
+ optim_wrapper = dict(
120
+ type=AmpOptimWrapper,
121
+ optimizer=dict(
122
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
123
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
124
+ accumulative_counts=accumulative_counts,
125
+ loss_scale='dynamic',
126
+ dtype='float16')
127
+
128
+ # learning policy
129
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
130
+ param_scheduler = [
131
+ dict(
132
+ type=LinearLR,
133
+ start_factor=1e-5,
134
+ by_epoch=True,
135
+ begin=0,
136
+ end=warmup_ratio * max_epochs,
137
+ convert_to_iter_based=True),
138
+ dict(
139
+ type=CosineAnnealingLR,
140
+ eta_min=0.0,
141
+ by_epoch=True,
142
+ begin=warmup_ratio * max_epochs,
143
+ end=max_epochs,
144
+ convert_to_iter_based=True)
145
+ ]
146
+
147
+ # train, val, test setting
148
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
149
+
150
+ #######################################################################
151
+ # PART 5 Runtime #
152
+ #######################################################################
153
+ # Log the dialogue periodically during the training process, optional
154
+ custom_hooks = [
155
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
156
+ dict(
157
+ type=EvaluateChatHook,
158
+ tokenizer=tokenizer,
159
+ every_n_iters=evaluation_freq,
160
+ evaluation_inputs=evaluation_inputs,
161
+ system=SYSTEM,
162
+ prompt_template=prompt_template)
163
+ ]
164
+
165
+ if use_varlen_attn:
166
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
167
+
168
+ # configure default hooks
169
+ default_hooks = dict(
170
+ # record the time of every iteration.
171
+ timer=dict(type=IterTimerHook),
172
+ # print log every 10 iterations.
173
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
174
+ # enable the parameter scheduler.
175
+ param_scheduler=dict(type=ParamSchedulerHook),
176
+ # save checkpoint per `save_steps`.
177
+ checkpoint=dict(
178
+ type=CheckpointHook,
179
+ by_epoch=False,
180
+ interval=save_steps,
181
+ max_keep_ckpts=save_total_limit),
182
+ # set sampler seed in distributed environment.
183
+ sampler_seed=dict(type=DistSamplerSeedHook),
184
+ )
185
+
186
+ # configure environment
187
+ env_cfg = dict(
188
+ # whether to enable cudnn benchmark
189
+ cudnn_benchmark=False,
190
+ # set multi process parameters
191
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
192
+ # set distributed parameters
193
+ dist_cfg=dict(backend='nccl'),
194
+ )
195
+
196
+ # set visualizer
197
+ visualizer = None
198
+
199
+ # set log level
200
+ log_level = 'INFO'
201
+
202
+ # load from which checkpoint
203
+ load_from = None
204
+
205
+ # whether to resume training from the loaded checkpoint
206
+ resume = False
207
+
208
+ # Defaults to use random seed and disable `deterministic`
209
+ randomness = dict(seed=None, deterministic=False)
210
+
211
+ # set log processor
212
+ log_processor = dict(by_epoch=False)
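
This file is a pure-Python mmengine-style config. A minimal sketch of loading and running it programmatically, assuming the standard mmengine entry points; xtuner's `xtuner train` command wraps an equivalent flow, and the work_dir below is an arbitrary choice:

from mmengine.config import Config
from mmengine.runner import Runner

# Load the config added in this diff and hand it to an mmengine Runner.
cfg = Config.fromfile(
    'xtuner/configs/baichuan/baichuan2_13b_base/'
    'baichuan2_13b_base_qlora_oasst1_e3.py')
cfg.work_dir = './work_dirs/baichuan2_13b_base_qlora_oasst1_e3'  # assumed output dir

runner = Runner.from_cfg(cfg)  # builds model, dataloader, and hooks from the dicts above
runner.train()
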
xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_open_platypus_e3.py ADDED
@@ -0,0 +1,212 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory
16
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
17
+ VarlenAttnArgsToMessageHubHook)
18
+ from xtuner.engine.runner import TrainLoop
19
+ from xtuner.model import SupervisedFinetune
20
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
21
+
22
+ #######################################################################
23
+ # PART 1 Settings #
24
+ #######################################################################
25
+ # Model
26
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base'
27
+ use_varlen_attn = False
28
+
29
+ # Data
30
+ data_path = 'garage-bAInd/Open-Platypus'
31
+ prompt_template = PROMPT_TEMPLATE.default
32
+ max_length = 2048
33
+ pack_to_max_length = True
34
+
35
+ # Scheduler & Optimizer
36
+ batch_size = 1 # per_device
37
+ accumulative_counts = 16
38
+ dataloader_num_workers = 0
39
+ max_epochs = 3
40
+ optim_type = AdamW
41
+ lr = 2e-4
42
+ betas = (0.9, 0.999)
43
+ weight_decay = 0
44
+ max_norm = 1 # grad clip
45
+ warmup_ratio = 0.03
46
+
47
+ # Save
48
+ save_steps = 500
49
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
50
+
51
+ # Evaluate the generation performance during the training
52
+ evaluation_freq = 500
53
+ SYSTEM = SYSTEM_TEMPLATE.alpaca
54
+ evaluation_inputs = [
55
+ '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
56
+ ]
57
+
58
+ #######################################################################
59
+ # PART 2 Model & Tokenizer #
60
+ #######################################################################
61
+ tokenizer = dict(
62
+ type=AutoTokenizer.from_pretrained,
63
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
64
+ trust_remote_code=True,
65
+ padding_side='right')
66
+
67
+ model = dict(
68
+ type=SupervisedFinetune,
69
+ use_varlen_attn=use_varlen_attn,
70
+ llm=dict(
71
+ type=AutoModelForCausalLM.from_pretrained,
72
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
73
+ trust_remote_code=True,
74
+ torch_dtype=torch.float16,
75
+ quantization_config=dict(
76
+ type=BitsAndBytesConfig,
77
+ load_in_4bit=True,
78
+ load_in_8bit=False,
79
+ llm_int8_threshold=6.0,
80
+ llm_int8_has_fp16_weight=False,
81
+ bnb_4bit_compute_dtype=torch.float16,
82
+ bnb_4bit_use_double_quant=True,
83
+ bnb_4bit_quant_type='nf4')),
84
+ lora=dict(
85
+ type=LoraConfig,
86
+ r=64,
87
+ lora_alpha=16,
88
+ lora_dropout=0.1,
89
+ bias='none',
90
+ task_type='CAUSAL_LM'))
91
+
92
+ #######################################################################
93
+ # PART 3 Dataset & Dataloader #
94
+ #######################################################################
95
+ train_dataset = dict(
96
+ type=process_hf_dataset,
97
+ dataset=dict(type=load_dataset, path=data_path),
98
+ tokenizer=tokenizer,
99
+ max_length=max_length,
100
+ dataset_map_fn=alpaca_map_fn,
101
+ template_map_fn=dict(
102
+ type=template_map_fn_factory, template=prompt_template),
103
+ remove_unused_columns=True,
104
+ shuffle_before_pack=True,
105
+ pack_to_max_length=pack_to_max_length,
106
+ use_varlen_attn=use_varlen_attn)
107
+
108
+ train_dataloader = dict(
109
+ batch_size=batch_size,
110
+ num_workers=dataloader_num_workers,
111
+ dataset=train_dataset,
112
+ sampler=dict(type=DefaultSampler, shuffle=True),
113
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
114
+
115
+ #######################################################################
116
+ # PART 4 Scheduler & Optimizer #
117
+ #######################################################################
118
+ # optimizer
119
+ optim_wrapper = dict(
120
+ type=AmpOptimWrapper,
121
+ optimizer=dict(
122
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
123
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
124
+ accumulative_counts=accumulative_counts,
125
+ loss_scale='dynamic',
126
+ dtype='float16')
127
+
128
+ # learning policy
129
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
130
+ param_scheduler = [
131
+ dict(
132
+ type=LinearLR,
133
+ start_factor=1e-5,
134
+ by_epoch=True,
135
+ begin=0,
136
+ end=warmup_ratio * max_epochs,
137
+ convert_to_iter_based=True),
138
+ dict(
139
+ type=CosineAnnealingLR,
140
+ eta_min=0.0,
141
+ by_epoch=True,
142
+ begin=warmup_ratio * max_epochs,
143
+ end=max_epochs,
144
+ convert_to_iter_based=True)
145
+ ]
146
+
147
+ # train, val, test setting
148
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
149
+
150
+ #######################################################################
151
+ # PART 5 Runtime #
152
+ #######################################################################
153
+ # Log the dialogue periodically during the training process, optional
154
+ custom_hooks = [
155
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
156
+ dict(
157
+ type=EvaluateChatHook,
158
+ tokenizer=tokenizer,
159
+ every_n_iters=evaluation_freq,
160
+ evaluation_inputs=evaluation_inputs,
161
+ system=SYSTEM,
162
+ prompt_template=prompt_template)
163
+ ]
164
+
165
+ if use_varlen_attn:
166
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
167
+
168
+ # configure default hooks
169
+ default_hooks = dict(
170
+ # record the time of every iteration.
171
+ timer=dict(type=IterTimerHook),
172
+ # print log every 10 iterations.
173
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
174
+ # enable the parameter scheduler.
175
+ param_scheduler=dict(type=ParamSchedulerHook),
176
+ # save checkpoint per `save_steps`.
177
+ checkpoint=dict(
178
+ type=CheckpointHook,
179
+ by_epoch=False,
180
+ interval=save_steps,
181
+ max_keep_ckpts=save_total_limit),
182
+ # set sampler seed in distributed environment.
183
+ sampler_seed=dict(type=DistSamplerSeedHook),
184
+ )
185
+
186
+ # configure environment
187
+ env_cfg = dict(
188
+ # whether to enable cudnn benchmark
189
+ cudnn_benchmark=False,
190
+ # set multi process parameters
191
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
192
+ # set distributed parameters
193
+ dist_cfg=dict(backend='nccl'),
194
+ )
195
+
196
+ # set visualizer
197
+ visualizer = None
198
+
199
+ # set log level
200
+ log_level = 'INFO'
201
+
202
+ # load from which checkpoint
203
+ load_from = None
204
+
205
+ # whether to resume training from the loaded checkpoint
206
+ resume = False
207
+
208
+ # Defaults to use random seed and disable `deterministic`
209
+ randomness = dict(seed=None, deterministic=False)
210
+
211
+ # set log processor
212
+ log_processor = dict(by_epoch=False)
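
The Open-Platypus data is reused here through alpaca_map_fn, which expects Alpaca-style fields. A quick inspection sketch; the column names are an assumption based on the dataset card and should be verified:

from datasets import load_dataset

# Peek at the raw dataset before process_hf_dataset maps and packs it.
ds = load_dataset('garage-bAInd/Open-Platypus', split='train')
print(ds.column_names)             # expected to include 'instruction', 'input', 'output'
print(ds[0]['instruction'][:200])  # assumes an 'instruction' column exists
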
xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_sql_e3.py ADDED
@@ -0,0 +1,216 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import sql_map_fn, template_map_fn_factory
16
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
17
+ VarlenAttnArgsToMessageHubHook)
18
+ from xtuner.engine.runner import TrainLoop
19
+ from xtuner.model import SupervisedFinetune
20
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
21
+
22
+ #######################################################################
23
+ # PART 1 Settings #
24
+ #######################################################################
25
+ # Model
26
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base'
27
+ use_varlen_attn = False
28
+
29
+ # Data
30
+ data_path = 'b-mc2/sql-create-context'
31
+ prompt_template = PROMPT_TEMPLATE.default
32
+ max_length = 2048
33
+ pack_to_max_length = True
34
+
35
+ # Scheduler & Optimizer
36
+ batch_size = 1 # per_device
37
+ accumulative_counts = 16
38
+ dataloader_num_workers = 0
39
+ max_epochs = 3
40
+ optim_type = AdamW
41
+ lr = 2e-4
42
+ betas = (0.9, 0.999)
43
+ weight_decay = 0
44
+ max_norm = 1 # grad clip
45
+ warmup_ratio = 0.03
46
+
47
+ # Save
48
+ save_steps = 500
49
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
50
+
51
+ # Evaluate the generation performance during the training
52
+ evaluation_freq = 500
53
+ SYSTEM = SYSTEM_TEMPLATE.sql
54
+ evaluation_inputs = [
55
+ ('CREATE TABLE station (name VARCHAR, lat VARCHAR, city VARCHAR)\n'
56
+ 'Find the name, latitude, and city of stations with latitude '
57
+ 'above 50.'),
58
+ ('CREATE TABLE weather (zip_code VARCHAR, mean_visibility_miles '
59
+ 'INTEGER)\n找到mean_visibility_miles最大的zip_code。')
60
+ ]
61
+
62
+ #######################################################################
63
+ # PART 2 Model & Tokenizer #
64
+ #######################################################################
65
+ tokenizer = dict(
66
+ type=AutoTokenizer.from_pretrained,
67
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
68
+ trust_remote_code=True,
69
+ padding_side='right')
70
+
71
+ model = dict(
72
+ type=SupervisedFinetune,
73
+ use_varlen_attn=use_varlen_attn,
74
+ llm=dict(
75
+ type=AutoModelForCausalLM.from_pretrained,
76
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
77
+ trust_remote_code=True,
78
+ torch_dtype=torch.float16,
79
+ quantization_config=dict(
80
+ type=BitsAndBytesConfig,
81
+ load_in_4bit=True,
82
+ load_in_8bit=False,
83
+ llm_int8_threshold=6.0,
84
+ llm_int8_has_fp16_weight=False,
85
+ bnb_4bit_compute_dtype=torch.float16,
86
+ bnb_4bit_use_double_quant=True,
87
+ bnb_4bit_quant_type='nf4')),
88
+ lora=dict(
89
+ type=LoraConfig,
90
+ r=64,
91
+ lora_alpha=16,
92
+ lora_dropout=0.1,
93
+ bias='none',
94
+ task_type='CAUSAL_LM'))
95
+
96
+ #######################################################################
97
+ # PART 3 Dataset & Dataloader #
98
+ #######################################################################
99
+ train_dataset = dict(
100
+ type=process_hf_dataset,
101
+ dataset=dict(type=load_dataset, path=data_path),
102
+ tokenizer=tokenizer,
103
+ max_length=max_length,
104
+ dataset_map_fn=sql_map_fn,
105
+ template_map_fn=dict(
106
+ type=template_map_fn_factory, template=prompt_template),
107
+ remove_unused_columns=True,
108
+ shuffle_before_pack=True,
109
+ pack_to_max_length=pack_to_max_length,
110
+ use_varlen_attn=use_varlen_attn)
111
+
112
+ train_dataloader = dict(
113
+ batch_size=batch_size,
114
+ num_workers=dataloader_num_workers,
115
+ dataset=train_dataset,
116
+ sampler=dict(type=DefaultSampler, shuffle=True),
117
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
118
+
119
+ #######################################################################
120
+ # PART 4 Scheduler & Optimizer #
121
+ #######################################################################
122
+ # optimizer
123
+ optim_wrapper = dict(
124
+ type=AmpOptimWrapper,
125
+ optimizer=dict(
126
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
127
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
128
+ accumulative_counts=accumulative_counts,
129
+ loss_scale='dynamic',
130
+ dtype='float16')
131
+
132
+ # learning policy
133
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
134
+ param_scheduler = [
135
+ dict(
136
+ type=LinearLR,
137
+ start_factor=1e-5,
138
+ by_epoch=True,
139
+ begin=0,
140
+ end=warmup_ratio * max_epochs,
141
+ convert_to_iter_based=True),
142
+ dict(
143
+ type=CosineAnnealingLR,
144
+ eta_min=0.0,
145
+ by_epoch=True,
146
+ begin=warmup_ratio * max_epochs,
147
+ end=max_epochs,
148
+ convert_to_iter_based=True)
149
+ ]
150
+
151
+ # train, val, test setting
152
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
153
+
154
+ #######################################################################
155
+ # PART 5 Runtime #
156
+ #######################################################################
157
+ # Log the dialogue periodically during the training process, optional
158
+ custom_hooks = [
159
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
160
+ dict(
161
+ type=EvaluateChatHook,
162
+ tokenizer=tokenizer,
163
+ every_n_iters=evaluation_freq,
164
+ evaluation_inputs=evaluation_inputs,
165
+ system=SYSTEM,
166
+ prompt_template=prompt_template)
167
+ ]
168
+
169
+ if use_varlen_attn:
170
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
171
+
172
+ # configure default hooks
173
+ default_hooks = dict(
174
+ # record the time of every iteration.
175
+ timer=dict(type=IterTimerHook),
176
+ # print log every 10 iterations.
177
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
178
+ # enable the parameter scheduler.
179
+ param_scheduler=dict(type=ParamSchedulerHook),
180
+ # save checkpoint per `save_steps`.
181
+ checkpoint=dict(
182
+ type=CheckpointHook,
183
+ by_epoch=False,
184
+ interval=save_steps,
185
+ max_keep_ckpts=save_total_limit),
186
+ # set sampler seed in distributed environment.
187
+ sampler_seed=dict(type=DistSamplerSeedHook),
188
+ )
189
+
190
+ # configure environment
191
+ env_cfg = dict(
192
+ # whether to enable cudnn benchmark
193
+ cudnn_benchmark=False,
194
+ # set multi process parameters
195
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
196
+ # set distributed parameters
197
+ dist_cfg=dict(backend='nccl'),
198
+ )
199
+
200
+ # set visualizer
201
+ visualizer = None
202
+
203
+ # set log level
204
+ log_level = 'INFO'
205
+
206
+ # load from which checkpoint
207
+ load_from = None
208
+
209
+ # whether to resume training from the loaded checkpoint
210
+ resume = False
211
+
212
+ # Defaults to use random seed and disable `deterministic`
213
+ randomness = dict(seed=None, deterministic=False)
214
+
215
+ # set log processor
216
+ log_processor = dict(by_epoch=False)
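
The quantization_config dict above is just a deferred BitsAndBytesConfig. A standalone transformers sketch of the same 4-bit (QLoRA-style) load, kept separate from xtuner's config machinery:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Plain-transformers equivalent of the nested quantization_config above.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4')

llm = AutoModelForCausalLM.from_pretrained(
    'baichuan-inc/Baichuan2-13B-Base',
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    trust_remote_code=True)  # weights stored in 4-bit NF4, compute in float16
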
xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_e3.py ADDED
@@ -0,0 +1,212 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory
16
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
17
+ VarlenAttnArgsToMessageHubHook)
18
+ from xtuner.engine.runner import TrainLoop
19
+ from xtuner.model import SupervisedFinetune
20
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
21
+
22
+ #######################################################################
23
+ # PART 1 Settings #
24
+ #######################################################################
25
+ # Model
26
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Chat'
27
+ use_varlen_attn = False
28
+
29
+ # Data
30
+ alpaca_en_path = 'tatsu-lab/alpaca'
31
+ prompt_template = PROMPT_TEMPLATE.baichuan2_chat
32
+ max_length = 2048
33
+ pack_to_max_length = True
34
+
35
+ # Scheduler & Optimizer
36
+ batch_size = 1 # per_device
37
+ accumulative_counts = 16
38
+ dataloader_num_workers = 0
39
+ max_epochs = 3
40
+ optim_type = AdamW
41
+ lr = 2e-4
42
+ betas = (0.9, 0.999)
43
+ weight_decay = 0
44
+ max_norm = 1 # grad clip
45
+ warmup_ratio = 0.03
46
+
47
+ # Save
48
+ save_steps = 500
49
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
50
+
51
+ # Evaluate the generation performance during the training
52
+ evaluation_freq = 500
53
+ SYSTEM = SYSTEM_TEMPLATE.alpaca
54
+ evaluation_inputs = [
55
+ '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
56
+ ]
57
+
58
+ #######################################################################
59
+ # PART 2 Model & Tokenizer #
60
+ #######################################################################
61
+ tokenizer = dict(
62
+ type=AutoTokenizer.from_pretrained,
63
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
64
+ trust_remote_code=True,
65
+ padding_side='right')
66
+
67
+ model = dict(
68
+ type=SupervisedFinetune,
69
+ use_varlen_attn=use_varlen_attn,
70
+ llm=dict(
71
+ type=AutoModelForCausalLM.from_pretrained,
72
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
73
+ trust_remote_code=True,
74
+ torch_dtype=torch.float16,
75
+ quantization_config=dict(
76
+ type=BitsAndBytesConfig,
77
+ load_in_4bit=True,
78
+ load_in_8bit=False,
79
+ llm_int8_threshold=6.0,
80
+ llm_int8_has_fp16_weight=False,
81
+ bnb_4bit_compute_dtype=torch.float16,
82
+ bnb_4bit_use_double_quant=True,
83
+ bnb_4bit_quant_type='nf4')),
84
+ lora=dict(
85
+ type=LoraConfig,
86
+ r=64,
87
+ lora_alpha=16,
88
+ lora_dropout=0.1,
89
+ bias='none',
90
+ task_type='CAUSAL_LM'))
91
+
92
+ #######################################################################
93
+ # PART 3 Dataset & Dataloader #
94
+ #######################################################################
95
+ alpaca_en = dict(
96
+ type=process_hf_dataset,
97
+ dataset=dict(type=load_dataset, path=alpaca_en_path),
98
+ tokenizer=tokenizer,
99
+ max_length=max_length,
100
+ dataset_map_fn=alpaca_map_fn,
101
+ template_map_fn=dict(
102
+ type=template_map_fn_factory, template=prompt_template),
103
+ remove_unused_columns=True,
104
+ shuffle_before_pack=True,
105
+ pack_to_max_length=pack_to_max_length,
106
+ use_varlen_attn=use_varlen_attn)
107
+
108
+ train_dataloader = dict(
109
+ batch_size=batch_size,
110
+ num_workers=dataloader_num_workers,
111
+ dataset=alpaca_en,
112
+ sampler=dict(type=DefaultSampler, shuffle=True),
113
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
114
+
115
+ #######################################################################
116
+ # PART 4 Scheduler & Optimizer #
117
+ #######################################################################
118
+ # optimizer
119
+ optim_wrapper = dict(
120
+ type=AmpOptimWrapper,
121
+ optimizer=dict(
122
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
123
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
124
+ accumulative_counts=accumulative_counts,
125
+ loss_scale='dynamic',
126
+ dtype='float16')
127
+
128
+ # learning policy
129
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
130
+ param_scheduler = [
131
+ dict(
132
+ type=LinearLR,
133
+ start_factor=1e-5,
134
+ by_epoch=True,
135
+ begin=0,
136
+ end=warmup_ratio * max_epochs,
137
+ convert_to_iter_based=True),
138
+ dict(
139
+ type=CosineAnnealingLR,
140
+ eta_min=0.0,
141
+ by_epoch=True,
142
+ begin=warmup_ratio * max_epochs,
143
+ end=max_epochs,
144
+ convert_to_iter_based=True)
145
+ ]
146
+
147
+ # train, val, test setting
148
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
149
+
150
+ #######################################################################
151
+ # PART 5 Runtime #
152
+ #######################################################################
153
+ # Log the dialogue periodically during the training process, optional
154
+ custom_hooks = [
155
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
156
+ dict(
157
+ type=EvaluateChatHook,
158
+ tokenizer=tokenizer,
159
+ every_n_iters=evaluation_freq,
160
+ evaluation_inputs=evaluation_inputs,
161
+ system=SYSTEM,
162
+ prompt_template=prompt_template)
163
+ ]
164
+
165
+ if use_varlen_attn:
166
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
167
+
168
+ # configure default hooks
169
+ default_hooks = dict(
170
+ # record the time of every iteration.
171
+ timer=dict(type=IterTimerHook),
172
+ # print log every 10 iterations.
173
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
174
+ # enable the parameter scheduler.
175
+ param_scheduler=dict(type=ParamSchedulerHook),
176
+ # save checkpoint per `save_steps`.
177
+ checkpoint=dict(
178
+ type=CheckpointHook,
179
+ by_epoch=False,
180
+ interval=save_steps,
181
+ max_keep_ckpts=save_total_limit),
182
+ # set sampler seed in distributed environment.
183
+ sampler_seed=dict(type=DistSamplerSeedHook),
184
+ )
185
+
186
+ # configure environment
187
+ env_cfg = dict(
188
+ # whether to enable cudnn benchmark
189
+ cudnn_benchmark=False,
190
+ # set multi process parameters
191
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
192
+ # set distributed parameters
193
+ dist_cfg=dict(backend='nccl'),
194
+ )
195
+
196
+ # set visualizer
197
+ visualizer = None
198
+
199
+ # set log level
200
+ log_level = 'INFO'
201
+
202
+ # load from which checkpoint
203
+ load_from = None
204
+
205
+ # whether to resume training from the loaded checkpoint
206
+ resume = False
207
+
208
+ # Defaults to use random seed and disable `deterministic`
209
+ randomness = dict(seed=None, deterministic=False)
210
+
211
+ # set log processor
212
+ log_processor = dict(by_epoch=False)
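
The lora dict above becomes a peft LoraConfig at build time. A hedged standalone sketch of the same adapter applied with peft directly, on a tiny stand-in model purely for illustration (xtuner applies it to the quantized Baichuan2 LLM):

from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

# Same LoRA hyper-parameters as the `lora` dict above; facebook/opt-125m is
# only a lightweight stand-in so the sketch runs quickly.
base = AutoModelForCausalLM.from_pretrained('facebook/opt-125m')

lora_config = LoraConfig(
    r=64, lora_alpha=16, lora_dropout=0.1, bias='none', task_type='CAUSAL_LM')

peft_model = get_peft_model(base, lora_config)
peft_model.print_trainable_parameters()  # only the LoRA matrices require grad
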
xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_enzh_e3.py ADDED
@@ -0,0 +1,229 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import ConcatDataset, process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import (alpaca_map_fn, alpaca_zh_map_fn,
16
+ template_map_fn_factory)
17
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
18
+ VarlenAttnArgsToMessageHubHook)
19
+ from xtuner.engine.runner import TrainLoop
20
+ from xtuner.model import SupervisedFinetune
21
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
22
+
23
+ #######################################################################
24
+ # PART 1 Settings #
25
+ #######################################################################
26
+ # Model
27
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Chat'
28
+ use_varlen_attn = False
29
+
30
+ # Data
31
+ alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese'
32
+ alpaca_en_path = 'tatsu-lab/alpaca'
33
+ prompt_template = PROMPT_TEMPLATE.baichuan2_chat
34
+ max_length = 2048
35
+ pack_to_max_length = True
36
+
37
+ # Scheduler & Optimizer
38
+ batch_size = 1 # per_device
39
+ accumulative_counts = 16
40
+ dataloader_num_workers = 0
41
+ max_epochs = 3
42
+ optim_type = AdamW
43
+ lr = 2e-4
44
+ betas = (0.9, 0.999)
45
+ weight_decay = 0
46
+ max_norm = 1 # grad clip
47
+ warmup_ratio = 0.03
48
+
49
+ # Save
50
+ save_steps = 500
51
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
52
+
53
+ # Evaluate the generation performance during the training
54
+ evaluation_freq = 500
55
+ SYSTEM = SYSTEM_TEMPLATE.alpaca
56
+ evaluation_inputs = [
57
+ '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
58
+ ]
59
+
60
+ #######################################################################
61
+ # PART 2 Model & Tokenizer #
62
+ #######################################################################
63
+ tokenizer = dict(
64
+ type=AutoTokenizer.from_pretrained,
65
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
66
+ trust_remote_code=True,
67
+ padding_side='right')
68
+
69
+ model = dict(
70
+ type=SupervisedFinetune,
71
+ use_varlen_attn=use_varlen_attn,
72
+ llm=dict(
73
+ type=AutoModelForCausalLM.from_pretrained,
74
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
75
+ trust_remote_code=True,
76
+ torch_dtype=torch.float16,
77
+ quantization_config=dict(
78
+ type=BitsAndBytesConfig,
79
+ load_in_4bit=True,
80
+ load_in_8bit=False,
81
+ llm_int8_threshold=6.0,
82
+ llm_int8_has_fp16_weight=False,
83
+ bnb_4bit_compute_dtype=torch.float16,
84
+ bnb_4bit_use_double_quant=True,
85
+ bnb_4bit_quant_type='nf4')),
86
+ lora=dict(
87
+ type=LoraConfig,
88
+ r=64,
89
+ lora_alpha=16,
90
+ lora_dropout=0.1,
91
+ bias='none',
92
+ task_type='CAUSAL_LM'))
93
+
94
+ #######################################################################
95
+ # PART 3 Dataset & Dataloader #
96
+ #######################################################################
97
+ alpaca_en = dict(
98
+ type=process_hf_dataset,
99
+ dataset=dict(type=load_dataset, path=alpaca_en_path),
100
+ tokenizer=tokenizer,
101
+ max_length=max_length,
102
+ dataset_map_fn=alpaca_map_fn,
103
+ template_map_fn=dict(
104
+ type=template_map_fn_factory, template=prompt_template),
105
+ remove_unused_columns=True,
106
+ shuffle_before_pack=True,
107
+ pack_to_max_length=pack_to_max_length,
108
+ use_varlen_attn=use_varlen_attn)
109
+
110
+ alpaca_zh = dict(
111
+ type=process_hf_dataset,
112
+ dataset=dict(type=load_dataset, path=alpaca_zh_path),
113
+ tokenizer=tokenizer,
114
+ max_length=max_length,
115
+ dataset_map_fn=alpaca_zh_map_fn,
116
+ template_map_fn=dict(
117
+ type=template_map_fn_factory, template=prompt_template),
118
+ remove_unused_columns=True,
119
+ shuffle_before_pack=True,
120
+ pack_to_max_length=pack_to_max_length,
121
+ use_varlen_attn=use_varlen_attn)
122
+
123
+ train_dataset = dict(type=ConcatDataset, datasets=[alpaca_en, alpaca_zh])
124
+
125
+ train_dataloader = dict(
126
+ batch_size=batch_size,
127
+ num_workers=dataloader_num_workers,
128
+ dataset=train_dataset,
129
+ sampler=dict(type=DefaultSampler, shuffle=True),
130
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
131
+
132
+ #######################################################################
133
+ # PART 4 Scheduler & Optimizer #
134
+ #######################################################################
135
+ # optimizer
136
+ optim_wrapper = dict(
137
+ type=AmpOptimWrapper,
138
+ optimizer=dict(
139
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
140
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
141
+ accumulative_counts=accumulative_counts,
142
+ loss_scale='dynamic',
143
+ dtype='float16')
144
+
145
+ # learning policy
146
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
147
+ param_scheduler = [
148
+ dict(
149
+ type=LinearLR,
150
+ start_factor=1e-5,
151
+ by_epoch=True,
152
+ begin=0,
153
+ end=warmup_ratio * max_epochs,
154
+ convert_to_iter_based=True),
155
+ dict(
156
+ type=CosineAnnealingLR,
157
+ eta_min=0.0,
158
+ by_epoch=True,
159
+ begin=warmup_ratio * max_epochs,
160
+ end=max_epochs,
161
+ convert_to_iter_based=True)
162
+ ]
163
+
164
+ # train, val, test setting
165
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
166
+
167
+ #######################################################################
168
+ # PART 5 Runtime #
169
+ #######################################################################
170
+ # Log the dialogue periodically during the training process, optional
171
+ custom_hooks = [
172
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
173
+ dict(
174
+ type=EvaluateChatHook,
175
+ tokenizer=tokenizer,
176
+ every_n_iters=evaluation_freq,
177
+ evaluation_inputs=evaluation_inputs,
178
+ system=SYSTEM,
179
+ prompt_template=prompt_template)
180
+ ]
181
+
182
+ if use_varlen_attn:
183
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
184
+
185
+ # configure default hooks
186
+ default_hooks = dict(
187
+ # record the time of every iteration.
188
+ timer=dict(type=IterTimerHook),
189
+ # print log every 10 iterations.
190
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
191
+ # enable the parameter scheduler.
192
+ param_scheduler=dict(type=ParamSchedulerHook),
193
+ # save checkpoint per `save_steps`.
194
+ checkpoint=dict(
195
+ type=CheckpointHook,
196
+ by_epoch=False,
197
+ interval=save_steps,
198
+ max_keep_ckpts=save_total_limit),
199
+ # set sampler seed in distributed environment.
200
+ sampler_seed=dict(type=DistSamplerSeedHook),
201
+ )
202
+
203
+ # configure environment
204
+ env_cfg = dict(
205
+ # whether to enable cudnn benchmark
206
+ cudnn_benchmark=False,
207
+ # set multi process parameters
208
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
209
+ # set distributed parameters
210
+ dist_cfg=dict(backend='nccl'),
211
+ )
212
+
213
+ # set visualizer
214
+ visualizer = None
215
+
216
+ # set log level
217
+ log_level = 'INFO'
218
+
219
+ # load from which checkpoint
220
+ load_from = None
221
+
222
+ # whether to resume training from the loaded checkpoint
223
+ resume = False
224
+
225
+ # Defaults to use random seed and disable `deterministic`
226
+ randomness = dict(seed=None, deterministic=False)
227
+
228
+ # set log processor
229
+ log_processor = dict(by_epoch=False)
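
The param_scheduler list above expresses a linear warmup over the first 3% of training followed by cosine decay. A plain-PyTorch sketch of the same schedule shape, with an assumed total iteration count standing in for what mmengine derives from the dataloader:

import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR

total_iters = 10_000                    # assumed total optimizer steps
warmup_iters = int(0.03 * total_iters)  # warmup_ratio = 0.03 from the config

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = AdamW(params, lr=2e-4, betas=(0.9, 0.999), weight_decay=0)

scheduler = SequentialLR(
    optimizer,
    schedulers=[
        LinearLR(optimizer, start_factor=1e-5, total_iters=warmup_iters),
        CosineAnnealingLR(optimizer, T_max=total_iters - warmup_iters, eta_min=0.0),
    ],
    milestones=[warmup_iters])

for _ in range(total_iters):
    optimizer.step()    # placeholder; a real step would follow backward()
    scheduler.step()
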
xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_enzh_oasst1_e3.py ADDED
@@ -0,0 +1,244 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import ConcatDataset, process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import (alpaca_map_fn, alpaca_zh_map_fn,
16
+ oasst1_map_fn, template_map_fn_factory)
17
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
18
+ VarlenAttnArgsToMessageHubHook)
19
+ from xtuner.engine.runner import TrainLoop
20
+ from xtuner.model import SupervisedFinetune
21
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
22
+
23
+ #######################################################################
24
+ # PART 1 Settings #
25
+ #######################################################################
26
+ # Model
27
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Chat'
28
+ use_varlen_attn = False
29
+
30
+ # Data
31
+ alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese'
32
+ alpaca_en_path = 'tatsu-lab/alpaca'
33
+ oasst1_path = 'timdettmers/openassistant-guanaco'
34
+ prompt_template = PROMPT_TEMPLATE.baichuan2_chat
35
+ max_length = 2048
36
+ pack_to_max_length = True
37
+
38
+ # Scheduler & Optimizer
39
+ batch_size = 1 # per_device
40
+ accumulative_counts = 16
41
+ dataloader_num_workers = 0
42
+ max_epochs = 3
43
+ optim_type = AdamW
44
+ lr = 2e-4
45
+ betas = (0.9, 0.999)
46
+ weight_decay = 0
47
+ max_norm = 1 # grad clip
48
+ warmup_ratio = 0.03
49
+
50
+ # Save
51
+ save_steps = 500
52
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
53
+
54
+ # Evaluate the generation performance during the training
55
+ evaluation_freq = 500
56
+ SYSTEM = SYSTEM_TEMPLATE.alpaca
57
+ evaluation_inputs = [
58
+ '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
59
+ ]
60
+
61
+ #######################################################################
62
+ # PART 2 Model & Tokenizer #
63
+ #######################################################################
64
+ tokenizer = dict(
65
+ type=AutoTokenizer.from_pretrained,
66
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
67
+ trust_remote_code=True,
68
+ padding_side='right')
69
+
70
+ model = dict(
71
+ type=SupervisedFinetune,
72
+ use_varlen_attn=use_varlen_attn,
73
+ llm=dict(
74
+ type=AutoModelForCausalLM.from_pretrained,
75
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
76
+ trust_remote_code=True,
77
+ torch_dtype=torch.float16,
78
+ quantization_config=dict(
79
+ type=BitsAndBytesConfig,
80
+ load_in_4bit=True,
81
+ load_in_8bit=False,
82
+ llm_int8_threshold=6.0,
83
+ llm_int8_has_fp16_weight=False,
84
+ bnb_4bit_compute_dtype=torch.float16,
85
+ bnb_4bit_use_double_quant=True,
86
+ bnb_4bit_quant_type='nf4')),
87
+ lora=dict(
88
+ type=LoraConfig,
89
+ r=64,
90
+ lora_alpha=16,
91
+ lora_dropout=0.1,
92
+ bias='none',
93
+ task_type='CAUSAL_LM'))
94
+
95
+ #######################################################################
96
+ # PART 3 Dataset & Dataloader #
97
+ #######################################################################
98
+ alpaca_en = dict(
99
+ type=process_hf_dataset,
100
+ dataset=dict(type=load_dataset, path=alpaca_en_path),
101
+ tokenizer=tokenizer,
102
+ max_length=max_length,
103
+ dataset_map_fn=alpaca_map_fn,
104
+ template_map_fn=dict(
105
+ type=template_map_fn_factory, template=prompt_template),
106
+ remove_unused_columns=True,
107
+ shuffle_before_pack=True,
108
+ pack_to_max_length=pack_to_max_length,
109
+ use_varlen_attn=use_varlen_attn)
110
+
111
+ alpaca_zh = dict(
112
+ type=process_hf_dataset,
113
+ dataset=dict(type=load_dataset, path=alpaca_zh_path),
114
+ tokenizer=tokenizer,
115
+ max_length=max_length,
116
+ dataset_map_fn=alpaca_zh_map_fn,
117
+ template_map_fn=dict(
118
+ type=template_map_fn_factory, template=prompt_template),
119
+ remove_unused_columns=True,
120
+ shuffle_before_pack=True,
121
+ pack_to_max_length=pack_to_max_length,
122
+ use_varlen_attn=use_varlen_attn)
123
+
124
+ oasst1 = dict(
125
+ type=process_hf_dataset,
126
+ dataset=dict(type=load_dataset, path=oasst1_path),
127
+ tokenizer=tokenizer,
128
+ max_length=max_length,
129
+ dataset_map_fn=oasst1_map_fn,
130
+ template_map_fn=dict(
131
+ type=template_map_fn_factory, template=prompt_template),
132
+ remove_unused_columns=True,
133
+ shuffle_before_pack=True,
134
+ pack_to_max_length=pack_to_max_length,
135
+ use_varlen_attn=use_varlen_attn)
136
+
137
+ train_dataset = dict(
138
+ type=ConcatDataset, datasets=[alpaca_en, alpaca_zh, oasst1])
139
+
140
+ train_dataloader = dict(
141
+ batch_size=batch_size,
142
+ num_workers=dataloader_num_workers,
143
+ dataset=train_dataset,
144
+ sampler=dict(type=DefaultSampler, shuffle=True),
145
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
146
+
147
+ #######################################################################
148
+ # PART 4 Scheduler & Optimizer #
149
+ #######################################################################
150
+ # optimizer
151
+ optim_wrapper = dict(
152
+ type=AmpOptimWrapper,
153
+ optimizer=dict(
154
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
155
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
156
+ accumulative_counts=accumulative_counts,
157
+ loss_scale='dynamic',
158
+ dtype='float16')
159
+
160
+ # learning policy
161
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
162
+ param_scheduler = [
163
+ dict(
164
+ type=LinearLR,
165
+ start_factor=1e-5,
166
+ by_epoch=True,
167
+ begin=0,
168
+ end=warmup_ratio * max_epochs,
169
+ convert_to_iter_based=True),
170
+ dict(
171
+ type=CosineAnnealingLR,
172
+ eta_min=0.0,
173
+ by_epoch=True,
174
+ begin=warmup_ratio * max_epochs,
175
+ end=max_epochs,
176
+ convert_to_iter_based=True)
177
+ ]
178
+
179
+ # train, val, test setting
180
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
181
+
182
+ #######################################################################
183
+ # PART 5 Runtime #
184
+ #######################################################################
185
+ # Log the dialogue periodically during the training process, optional
186
+ custom_hooks = [
187
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
188
+ dict(
189
+ type=EvaluateChatHook,
190
+ tokenizer=tokenizer,
191
+ every_n_iters=evaluation_freq,
192
+ evaluation_inputs=evaluation_inputs,
193
+ system=SYSTEM,
194
+ prompt_template=prompt_template)
195
+ ]
196
+
197
+ if use_varlen_attn:
198
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
199
+
200
+ # configure default hooks
201
+ default_hooks = dict(
202
+ # record the time of every iteration.
203
+ timer=dict(type=IterTimerHook),
204
+ # print log every 10 iterations.
205
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
206
+ # enable the parameter scheduler.
207
+ param_scheduler=dict(type=ParamSchedulerHook),
208
+ # save checkpoint per `save_steps`.
209
+ checkpoint=dict(
210
+ type=CheckpointHook,
211
+ by_epoch=False,
212
+ interval=save_steps,
213
+ max_keep_ckpts=save_total_limit),
214
+ # set sampler seed in distributed evrionment.
215
+ sampler_seed=dict(type=DistSamplerSeedHook),
216
+ )
217
+
218
+ # configure environment
219
+ env_cfg = dict(
220
+ # whether to enable cudnn benchmark
221
+ cudnn_benchmark=False,
222
+ # set multi process parameters
223
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
224
+ # set distributed parameters
225
+ dist_cfg=dict(backend='nccl'),
226
+ )
227
+
228
+ # set visualizer
229
+ visualizer = None
230
+
231
+ # set log level
232
+ log_level = 'INFO'
233
+
234
+ # load from which checkpoint
235
+ load_from = None
236
+
237
+ # whether to resume training from the loaded checkpoint
238
+ resume = False
239
+
240
+ # Defaults to use random seed and disable `deterministic`
241
+ randomness = dict(seed=None, deterministic=False)
242
+
243
+ # set log processor
244
+ log_processor = dict(by_epoch=False)
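
Three processed datasets feed one dataloader through xtuner's ConcatDataset. The minimal torch sketch below illustrates the same concatenation pattern with toy stand-in datasets (the real ones come from process_hf_dataset):

from torch.utils.data import ConcatDataset, Dataset

class ToyDataset(Dataset):
    """Stand-in for a processed HF dataset; only for illustration."""

    def __init__(self, n, tag):
        self.items = [f'{tag}-{i}' for i in range(n)]

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        return self.items[idx]

merged = ConcatDataset(
    [ToyDataset(3, 'alpaca_en'), ToyDataset(3, 'alpaca_zh'), ToyDataset(3, 'oasst1')])
print(len(merged))  # 9: one sampler draws from all three sources
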
xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_zh_e3.py ADDED
@@ -0,0 +1,212 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import alpaca_zh_map_fn, template_map_fn_factory
16
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
17
+ VarlenAttnArgsToMessageHubHook)
18
+ from xtuner.engine.runner import TrainLoop
19
+ from xtuner.model import SupervisedFinetune
20
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
21
+
22
+ #######################################################################
23
+ # PART 1 Settings #
24
+ #######################################################################
25
+ # Model
26
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Chat'
27
+ use_varlen_attn = False
28
+
29
+ # Data
30
+ alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese'
31
+ prompt_template = PROMPT_TEMPLATE.baichuan2_chat
32
+ max_length = 2048
33
+ pack_to_max_length = True
34
+
35
+ # Scheduler & Optimizer
36
+ batch_size = 1 # per_device
37
+ accumulative_counts = 16
38
+ dataloader_num_workers = 0
39
+ max_epochs = 3
40
+ optim_type = AdamW
41
+ lr = 2e-4
42
+ betas = (0.9, 0.999)
43
+ weight_decay = 0
44
+ max_norm = 1 # grad clip
45
+ warmup_ratio = 0.03
46
+
47
+ # Save
48
+ save_steps = 500
49
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
50
+
51
+ # Evaluate the generation performance during the training
52
+ evaluation_freq = 500
53
+ SYSTEM = SYSTEM_TEMPLATE.alpaca
54
+ evaluation_inputs = [
55
+ '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
56
+ ]
57
+
58
+ #######################################################################
59
+ # PART 2 Model & Tokenizer #
60
+ #######################################################################
61
+ tokenizer = dict(
62
+ type=AutoTokenizer.from_pretrained,
63
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
64
+ trust_remote_code=True,
65
+ padding_side='right')
66
+
67
+ model = dict(
68
+ type=SupervisedFinetune,
69
+ use_varlen_attn=use_varlen_attn,
70
+ llm=dict(
71
+ type=AutoModelForCausalLM.from_pretrained,
72
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
73
+ trust_remote_code=True,
74
+ torch_dtype=torch.float16,
75
+ quantization_config=dict(
76
+ type=BitsAndBytesConfig,
77
+ load_in_4bit=True,
78
+ load_in_8bit=False,
79
+ llm_int8_threshold=6.0,
80
+ llm_int8_has_fp16_weight=False,
81
+ bnb_4bit_compute_dtype=torch.float16,
82
+ bnb_4bit_use_double_quant=True,
83
+ bnb_4bit_quant_type='nf4')),
84
+ lora=dict(
85
+ type=LoraConfig,
86
+ r=64,
87
+ lora_alpha=16,
88
+ lora_dropout=0.1,
89
+ bias='none',
90
+ task_type='CAUSAL_LM'))
91
+
92
+ #######################################################################
93
+ # PART 3 Dataset & Dataloader #
94
+ #######################################################################
95
+ alpaca_zh = dict(
96
+ type=process_hf_dataset,
97
+ dataset=dict(type=load_dataset, path=alpaca_zh_path),
98
+ tokenizer=tokenizer,
99
+ max_length=max_length,
100
+ dataset_map_fn=alpaca_zh_map_fn,
101
+ template_map_fn=dict(
102
+ type=template_map_fn_factory, template=prompt_template),
103
+ remove_unused_columns=True,
104
+ shuffle_before_pack=True,
105
+ pack_to_max_length=pack_to_max_length,
106
+ use_varlen_attn=use_varlen_attn)
107
+
108
+ train_dataloader = dict(
109
+ batch_size=batch_size,
110
+ num_workers=dataloader_num_workers,
111
+ dataset=alpaca_zh,
112
+ sampler=dict(type=DefaultSampler, shuffle=True),
113
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
114
+
115
+ #######################################################################
116
+ # PART 4 Scheduler & Optimizer #
117
+ #######################################################################
118
+ # optimizer
119
+ optim_wrapper = dict(
120
+ type=AmpOptimWrapper,
121
+ optimizer=dict(
122
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
123
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
124
+ accumulative_counts=accumulative_counts,
125
+ loss_scale='dynamic',
126
+ dtype='float16')
127
+
128
+ # learning policy
129
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
130
+ param_scheduler = [
131
+ dict(
132
+ type=LinearLR,
133
+ start_factor=1e-5,
134
+ by_epoch=True,
135
+ begin=0,
136
+ end=warmup_ratio * max_epochs,
137
+ convert_to_iter_based=True),
138
+ dict(
139
+ type=CosineAnnealingLR,
140
+ eta_min=0.0,
141
+ by_epoch=True,
142
+ begin=warmup_ratio * max_epochs,
143
+ end=max_epochs,
144
+ convert_to_iter_based=True)
145
+ ]
146
+
147
+ # train, val, test setting
148
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
149
+
150
+ #######################################################################
151
+ # PART 5 Runtime #
152
+ #######################################################################
153
+ # Log the dialogue periodically during the training process, optional
154
+ custom_hooks = [
155
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
156
+ dict(
157
+ type=EvaluateChatHook,
158
+ tokenizer=tokenizer,
159
+ every_n_iters=evaluation_freq,
160
+ evaluation_inputs=evaluation_inputs,
161
+ system=SYSTEM,
162
+ prompt_template=prompt_template)
163
+ ]
164
+
165
+ if use_varlen_attn:
166
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
167
+
168
+ # configure default hooks
169
+ default_hooks = dict(
170
+ # record the time of every iteration.
171
+ timer=dict(type=IterTimerHook),
172
+ # print log every 10 iterations.
173
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
174
+ # enable the parameter scheduler.
175
+ param_scheduler=dict(type=ParamSchedulerHook),
176
+ # save checkpoint per `save_steps`.
177
+ checkpoint=dict(
178
+ type=CheckpointHook,
179
+ by_epoch=False,
180
+ interval=save_steps,
181
+ max_keep_ckpts=save_total_limit),
182
+ # set sampler seed in distributed environment.
183
+ sampler_seed=dict(type=DistSamplerSeedHook),
184
+ )
185
+
186
+ # configure environment
187
+ env_cfg = dict(
188
+ # whether to enable cudnn benchmark
189
+ cudnn_benchmark=False,
190
+ # set multi process parameters
191
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
192
+ # set distributed parameters
193
+ dist_cfg=dict(backend='nccl'),
194
+ )
195
+
196
+ # set visualizer
197
+ visualizer = None
198
+
199
+ # set log level
200
+ log_level = 'INFO'
201
+
202
+ # load from which checkpoint
203
+ load_from = None
204
+
205
+ # whether to resume training from the loaded checkpoint
206
+ resume = False
207
+
208
+ # Defaults to use random seed and disable `deterministic`
209
+ randomness = dict(seed=None, deterministic=False)
210
+
211
+ # set log processor
212
+ log_processor = dict(by_epoch=False)
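
After training, the LoRA weights saved during the run (once converted to a Hugging Face adapter folder) can be attached back onto the base model for chat. A hedged sketch with peft; the adapter path is hypothetical, and the prompt_template wrapping that EvaluateChatHook applies is omitted for brevity:

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained(
    'baichuan-inc/Baichuan2-13B-Chat',
    torch_dtype=torch.float16,
    trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(
    'baichuan-inc/Baichuan2-13B-Chat', trust_remote_code=True)

# Hypothetical adapter directory; depends on work_dir and the conversion step.
model = PeftModel.from_pretrained(base, './work_dirs/alpaca_zh_adapter')

inputs = tokenizer('请给我介绍五个上海的景点', return_tensors='pt')
output = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(output[0], skip_special_tokens=True))
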
xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_code_alpaca_e3.py ADDED
@@ -0,0 +1,216 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import code_alpaca_map_fn, template_map_fn_factory
16
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
17
+ VarlenAttnArgsToMessageHubHook)
18
+ from xtuner.engine.runner import TrainLoop
19
+ from xtuner.model import SupervisedFinetune
20
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
21
+
22
+ #######################################################################
23
+ # PART 1 Settings #
24
+ #######################################################################
25
+ # Model
26
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Chat'
27
+ use_varlen_attn = False
28
+
29
+ # Data
30
+ data_path = 'HuggingFaceH4/CodeAlpaca_20K'
31
+ prompt_template = PROMPT_TEMPLATE.baichuan2_chat
32
+ max_length = 2048
33
+ pack_to_max_length = True
34
+
35
+ # Scheduler & Optimizer
36
+ batch_size = 1 # per_device
37
+ accumulative_counts = 16
38
+ dataloader_num_workers = 0
39
+ max_epochs = 3
40
+ optim_type = AdamW
41
+ lr = 2e-4
42
+ betas = (0.9, 0.999)
43
+ weight_decay = 0
44
+ max_norm = 1 # grad clip
45
+ warmup_ratio = 0.03
46
+
47
+ # Save
48
+ save_steps = 500
49
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
50
+
51
+ # Evaluate the generation performance during the training
52
+ evaluation_freq = 100
53
+ SYSTEM = SYSTEM_TEMPLATE.coder
54
+ evaluation_inputs = [
55
+ ('写一个Python函数,将十六进制颜色代码(如#0066ee)转换为对应的'
56
+ '红、绿、蓝(RGB)三个颜色分量值,并以元组的形式返回。'),
57
+ ('Write a Python function that takes a hexadecimal color code '
58
+ '(e.g., #0066ee) as input and converts it into the corresponding '
59
+ 'red, green, and blue (RGB) color component values.')
60
+ ]
61
+
62
+ #######################################################################
63
+ # PART 2 Model & Tokenizer #
64
+ #######################################################################
65
+ tokenizer = dict(
66
+ type=AutoTokenizer.from_pretrained,
67
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
68
+ trust_remote_code=True,
69
+ padding_side='right')
70
+
71
+ model = dict(
72
+ type=SupervisedFinetune,
73
+ use_varlen_attn=use_varlen_attn,
74
+ llm=dict(
75
+ type=AutoModelForCausalLM.from_pretrained,
76
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
77
+ trust_remote_code=True,
78
+ torch_dtype=torch.float16,
79
+ quantization_config=dict(
80
+ type=BitsAndBytesConfig,
81
+ load_in_4bit=True,
82
+ load_in_8bit=False,
83
+ llm_int8_threshold=6.0,
84
+ llm_int8_has_fp16_weight=False,
85
+ bnb_4bit_compute_dtype=torch.float16,
86
+ bnb_4bit_use_double_quant=True,
87
+ bnb_4bit_quant_type='nf4')),
88
+ lora=dict(
89
+ type=LoraConfig,
90
+ r=64,
91
+ lora_alpha=16,
92
+ lora_dropout=0.1,
93
+ bias='none',
94
+ task_type='CAUSAL_LM'))
95
+
96
+ #######################################################################
97
+ # PART 3 Dataset & Dataloader #
98
+ #######################################################################
99
+ train_dataset = dict(
100
+ type=process_hf_dataset,
101
+ dataset=dict(type=load_dataset, path=data_path),
102
+ tokenizer=tokenizer,
103
+ max_length=max_length,
104
+ dataset_map_fn=code_alpaca_map_fn,
105
+ template_map_fn=dict(
106
+ type=template_map_fn_factory, template=prompt_template),
107
+ remove_unused_columns=True,
108
+ shuffle_before_pack=True,
109
+ pack_to_max_length=pack_to_max_length,
110
+ use_varlen_attn=use_varlen_attn)
111
+
112
+ train_dataloader = dict(
113
+ batch_size=batch_size,
114
+ num_workers=dataloader_num_workers,
115
+ dataset=train_dataset,
116
+ sampler=dict(type=DefaultSampler, shuffle=True),
117
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
118
+
119
+ #######################################################################
120
+ # PART 4 Scheduler & Optimizer #
121
+ #######################################################################
122
+ # optimizer
123
+ optim_wrapper = dict(
124
+ type=AmpOptimWrapper,
125
+ optimizer=dict(
126
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
127
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
128
+ accumulative_counts=accumulative_counts,
129
+ loss_scale='dynamic',
130
+ dtype='float16')
131
+
132
+ # learning policy
133
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
134
+ param_scheduler = [
135
+ dict(
136
+ type=LinearLR,
137
+ start_factor=1e-5,
138
+ by_epoch=True,
139
+ begin=0,
140
+ end=warmup_ratio * max_epochs,
141
+ convert_to_iter_based=True),
142
+ dict(
143
+ type=CosineAnnealingLR,
144
+ eta_min=0.0,
145
+ by_epoch=True,
146
+ begin=warmup_ratio * max_epochs,
147
+ end=max_epochs,
148
+ convert_to_iter_based=True)
149
+ ]
150
+
151
+ # train, val, test setting
152
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
153
+
154
+ #######################################################################
155
+ # PART 5 Runtime #
156
+ #######################################################################
157
+ # Log the dialogue periodically during the training process, optional
158
+ custom_hooks = [
159
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
160
+ dict(
161
+ type=EvaluateChatHook,
162
+ tokenizer=tokenizer,
163
+ every_n_iters=evaluation_freq,
164
+ evaluation_inputs=evaluation_inputs,
165
+ system=SYSTEM,
166
+ prompt_template=prompt_template)
167
+ ]
168
+
169
+ if use_varlen_attn:
170
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
171
+
172
+ # configure default hooks
173
+ default_hooks = dict(
174
+ # record the time of every iteration.
175
+ timer=dict(type=IterTimerHook),
176
+ # print log every 10 iterations.
177
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
178
+ # enable the parameter scheduler.
179
+ param_scheduler=dict(type=ParamSchedulerHook),
180
+ # save a checkpoint every `save_steps` iterations.
181
+ checkpoint=dict(
182
+ type=CheckpointHook,
183
+ by_epoch=False,
184
+ interval=save_steps,
185
+ max_keep_ckpts=save_total_limit),
186
+ # set the sampler seed in a distributed environment.
187
+ sampler_seed=dict(type=DistSamplerSeedHook),
188
+ )
189
+
190
+ # configure environment
191
+ env_cfg = dict(
192
+ # whether to enable cudnn benchmark
193
+ cudnn_benchmark=False,
194
+ # set multi process parameters
195
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
196
+ # set distributed parameters
197
+ dist_cfg=dict(backend='nccl'),
198
+ )
199
+
200
+ # set visualizer
201
+ visualizer = None
202
+
203
+ # set log level
204
+ log_level = 'INFO'
205
+
206
+ # load from which checkpoint
207
+ load_from = None
208
+
209
+ # whether to resume training from the loaded checkpoint
210
+ resume = False
211
+
212
+ # Default to a random seed and disable `deterministic`
213
+ randomness = dict(seed=None, deterministic=False)
214
+
215
+ # set log processor
216
+ log_processor = dict(by_epoch=False)
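For readers unfamiliar with the dict-style `model` section above, the following is a rough standalone equivalent written directly against transformers and peft: a 4-bit NF4-quantized base model wrapped with rank-64 LoRA adapters. It is only a sketch of what `SupervisedFinetune` assembles internally, not a drop-in replacement (loss masking, template handling and adapter targeting are handled by xtuner).

import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4')
llm = AutoModelForCausalLM.from_pretrained(
    'baichuan-inc/Baichuan2-13B-Chat',
    trust_remote_code=True,
    torch_dtype=torch.float16,
    quantization_config=quant_cfg)
# LoRA hyper-parameters mirror the config: r=64, alpha=16, dropout=0.1.
llm = get_peft_model(
    llm,
    LoraConfig(r=64, lora_alpha=16, lora_dropout=0.1,
               bias='none', task_type='CAUSAL_LM'))
llm.print_trainable_parameters()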
xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_lawyer_e3.py ADDED
@@ -0,0 +1,236 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import ConcatDataset, process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import (crime_kg_assitant_map_fn,
16
+ law_reference_map_fn,
17
+ template_map_fn_factory)
18
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
19
+ VarlenAttnArgsToMessageHubHook)
20
+ from xtuner.engine.runner import TrainLoop
21
+ from xtuner.model import SupervisedFinetune
22
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
23
+
24
+ #######################################################################
25
+ # PART 1 Settings #
26
+ #######################################################################
27
+ # Model
28
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Chat'
29
+ use_varlen_attn = False
30
+
31
+ # Data
32
+ # download data from https://github.com/LiuHC0428/LAW-GPT
33
+ crime_kg_assitant_path = './data/CrimeKgAssitant清洗后_52k.json'
34
+ law_reference_data_path = './data/训练数据_带法律依据_92k.json'
35
+ prompt_template = PROMPT_TEMPLATE.baichuan2_chat
36
+ max_length = 2048
37
+ pack_to_max_length = True
38
+
39
+ # Scheduler & Optimizer
40
+ batch_size = 1 # per_device
41
+ accumulative_counts = 16
42
+ dataloader_num_workers = 0
43
+ max_epochs = 3
44
+ optim_type = AdamW
45
+ lr = 2e-4
46
+ betas = (0.9, 0.999)
47
+ weight_decay = 0
48
+ max_norm = 1 # grad clip
49
+ warmup_ratio = 0.03
50
+
51
+ # Save
52
+ save_steps = 500
53
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
54
+
55
+ # Evaluate the generation performance during the training
56
+ evaluation_freq = 500
57
+ SYSTEM = SYSTEM_TEMPLATE.lawyer
58
+ evaluation_inputs = ['请问离婚需要准备什么材料?', '销售鳄鱼皮包违法吗?']
59
+
60
+ #######################################################################
61
+ # PART 2 Model & Tokenizer #
62
+ #######################################################################
63
+ tokenizer = dict(
64
+ type=AutoTokenizer.from_pretrained,
65
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
66
+ trust_remote_code=True,
67
+ padding_side='right')
68
+
69
+ model = dict(
70
+ type=SupervisedFinetune,
71
+ use_varlen_attn=use_varlen_attn,
72
+ llm=dict(
73
+ type=AutoModelForCausalLM.from_pretrained,
74
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
75
+ trust_remote_code=True,
76
+ torch_dtype=torch.float16,
77
+ quantization_config=dict(
78
+ type=BitsAndBytesConfig,
79
+ load_in_4bit=True,
80
+ load_in_8bit=False,
81
+ llm_int8_threshold=6.0,
82
+ llm_int8_has_fp16_weight=False,
83
+ bnb_4bit_compute_dtype=torch.float16,
84
+ bnb_4bit_use_double_quant=True,
85
+ bnb_4bit_quant_type='nf4')),
86
+ lora=dict(
87
+ type=LoraConfig,
88
+ r=64,
89
+ lora_alpha=16,
90
+ lora_dropout=0.1,
91
+ bias='none',
92
+ task_type='CAUSAL_LM'))
93
+
94
+ #######################################################################
95
+ # PART 3 Dataset & Dataloader #
96
+ #######################################################################
97
+ crime_kg_assitant = dict(
98
+ type=process_hf_dataset,
99
+ dataset=dict(
100
+ type=load_dataset,
101
+ path='json',
102
+ data_files=dict(train=crime_kg_assitant_path)),
103
+ tokenizer=tokenizer,
104
+ max_length=max_length,
105
+ dataset_map_fn=crime_kg_assitant_map_fn,
106
+ template_map_fn=dict(
107
+ type=template_map_fn_factory, template=prompt_template),
108
+ remove_unused_columns=True,
109
+ shuffle_before_pack=True,
110
+ pack_to_max_length=pack_to_max_length,
111
+ use_varlen_attn=use_varlen_attn)
112
+
113
+ law_reference_data = dict(
114
+ type=process_hf_dataset,
115
+ dataset=dict(
116
+ type=load_dataset,
117
+ path='json',
118
+ data_files=dict(train=law_reference_data_path)),
119
+ tokenizer=tokenizer,
120
+ max_length=max_length,
121
+ dataset_map_fn=law_reference_map_fn,
122
+ template_map_fn=dict(
123
+ type=template_map_fn_factory, template=prompt_template),
124
+ remove_unused_columns=True,
125
+ shuffle_before_pack=True,
126
+ pack_to_max_length=pack_to_max_length,
127
+ use_varlen_attn=use_varlen_attn)
128
+
129
+ train_dataset = dict(
130
+ type=ConcatDataset, datasets=[crime_kg_assitant, law_reference_data])
131
+
132
+ train_dataloader = dict(
133
+ batch_size=batch_size,
134
+ num_workers=dataloader_num_workers,
135
+ dataset=train_dataset,
136
+ sampler=dict(type=DefaultSampler, shuffle=True),
137
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
138
+
139
+ #######################################################################
140
+ # PART 4 Scheduler & Optimizer #
141
+ #######################################################################
142
+ # optimizer
143
+ optim_wrapper = dict(
144
+ type=AmpOptimWrapper,
145
+ optimizer=dict(
146
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
147
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
148
+ accumulative_counts=accumulative_counts,
149
+ loss_scale='dynamic',
150
+ dtype='float16')
151
+
152
+ # learning policy
153
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
154
+ param_scheduler = [
155
+ dict(
156
+ type=LinearLR,
157
+ start_factor=1e-5,
158
+ by_epoch=True,
159
+ begin=0,
160
+ end=warmup_ratio * max_epochs,
161
+ convert_to_iter_based=True),
162
+ dict(
163
+ type=CosineAnnealingLR,
164
+ eta_min=0.0,
165
+ by_epoch=True,
166
+ begin=warmup_ratio * max_epochs,
167
+ end=max_epochs,
168
+ convert_to_iter_based=True)
169
+ ]
170
+
171
+ # train, val, test setting
172
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
173
+
174
+ #######################################################################
175
+ # PART 5 Runtime #
176
+ #######################################################################
177
+ # Log the dialogue periodically during the training process, optional
178
+ custom_hooks = [
179
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
180
+ dict(
181
+ type=EvaluateChatHook,
182
+ tokenizer=tokenizer,
183
+ every_n_iters=evaluation_freq,
184
+ evaluation_inputs=evaluation_inputs,
185
+ system=SYSTEM,
186
+ prompt_template=prompt_template)
187
+ ]
188
+
189
+ if use_varlen_attn:
190
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
191
+
192
+ # configure default hooks
193
+ default_hooks = dict(
194
+ # record the time of every iteration.
195
+ timer=dict(type=IterTimerHook),
196
+ # print log every 10 iterations.
197
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
198
+ # enable the parameter scheduler.
199
+ param_scheduler=dict(type=ParamSchedulerHook),
200
+ # save a checkpoint every `save_steps` iterations.
201
+ checkpoint=dict(
202
+ type=CheckpointHook,
203
+ by_epoch=False,
204
+ interval=save_steps,
205
+ max_keep_ckpts=save_total_limit),
206
+ # set the sampler seed in a distributed environment.
207
+ sampler_seed=dict(type=DistSamplerSeedHook),
208
+ )
209
+
210
+ # configure environment
211
+ env_cfg = dict(
212
+ # whether to enable cudnn benchmark
213
+ cudnn_benchmark=False,
214
+ # set multi process parameters
215
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
216
+ # set distributed parameters
217
+ dist_cfg=dict(backend='nccl'),
218
+ )
219
+
220
+ # set visualizer
221
+ visualizer = None
222
+
223
+ # set log level
224
+ log_level = 'INFO'
225
+
226
+ # load from which checkpoint
227
+ load_from = None
228
+
229
+ # whether to resume training from the loaded checkpoint
230
+ resume = False
231
+
232
+ # Default to a random seed and disable `deterministic`
233
+ randomness = dict(seed=None, deterministic=False)
234
+
235
+ # set log processor
236
+ log_processor = dict(by_epoch=False)
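The two law corpora above are merged with `ConcatDataset`, so a single shuffled `DefaultSampler` draws from both without any re-weighting. Below is a toy illustration of that behaviour using torch's own ConcatDataset as a stand-in; the 52k/92k sizes are taken from the file names and are only nominal.

from torch.utils.data import ConcatDataset, Dataset

class Toy(Dataset):
    def __init__(self, n):
        self.n = n

    def __len__(self):
        return self.n

    def __getitem__(self, idx):
        return idx

merged = ConcatDataset([Toy(52_000), Toy(92_000)])
print(len(merged))     # 144000 examples before packing
print(merged[52_000])  # index 52000 falls through to the second dataset -> 0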
xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_oasst1_512_e3.py ADDED
@@ -0,0 +1,212 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import oasst1_map_fn, template_map_fn_factory
16
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
17
+ VarlenAttnArgsToMessageHubHook)
18
+ from xtuner.engine.runner import TrainLoop
19
+ from xtuner.model import SupervisedFinetune
20
+ from xtuner.utils import PROMPT_TEMPLATE
21
+
22
+ #######################################################################
23
+ # PART 1 Settings #
24
+ #######################################################################
25
+ # Model
26
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Chat'
27
+ use_varlen_attn = False
28
+
29
+ # Data
30
+ data_path = 'timdettmers/openassistant-guanaco'
31
+ prompt_template = PROMPT_TEMPLATE.baichuan2_chat
32
+ max_length = 512
33
+ pack_to_max_length = False
34
+
35
+ # Scheduler & Optimizer
36
+ batch_size = 1 # per_device
37
+ accumulative_counts = 16
38
+ dataloader_num_workers = 0
39
+ max_epochs = 3
40
+ optim_type = AdamW
41
+ lr = 2e-4
42
+ betas = (0.9, 0.999)
43
+ weight_decay = 0
44
+ max_norm = 1 # grad clip
45
+ warmup_ratio = 0.03
46
+
47
+ # Save
48
+ save_steps = 500
49
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
50
+
51
+ # Evaluate the generation performance during the training
52
+ evaluation_freq = 500
53
+ SYSTEM = ''
54
+ evaluation_inputs = [
55
+ '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
56
+ ]
57
+
58
+ #######################################################################
59
+ # PART 2 Model & Tokenizer #
60
+ #######################################################################
61
+ tokenizer = dict(
62
+ type=AutoTokenizer.from_pretrained,
63
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
64
+ trust_remote_code=True,
65
+ padding_side='right')
66
+
67
+ model = dict(
68
+ type=SupervisedFinetune,
69
+ use_varlen_attn=use_varlen_attn,
70
+ llm=dict(
71
+ type=AutoModelForCausalLM.from_pretrained,
72
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
73
+ trust_remote_code=True,
74
+ torch_dtype=torch.float16,
75
+ quantization_config=dict(
76
+ type=BitsAndBytesConfig,
77
+ load_in_4bit=True,
78
+ load_in_8bit=False,
79
+ llm_int8_threshold=6.0,
80
+ llm_int8_has_fp16_weight=False,
81
+ bnb_4bit_compute_dtype=torch.float16,
82
+ bnb_4bit_use_double_quant=True,
83
+ bnb_4bit_quant_type='nf4')),
84
+ lora=dict(
85
+ type=LoraConfig,
86
+ r=64,
87
+ lora_alpha=16,
88
+ lora_dropout=0.1,
89
+ bias='none',
90
+ task_type='CAUSAL_LM'))
91
+
92
+ #######################################################################
93
+ # PART 3 Dataset & Dataloader #
94
+ #######################################################################
95
+ train_dataset = dict(
96
+ type=process_hf_dataset,
97
+ dataset=dict(type=load_dataset, path=data_path),
98
+ tokenizer=tokenizer,
99
+ max_length=max_length,
100
+ dataset_map_fn=oasst1_map_fn,
101
+ template_map_fn=dict(
102
+ type=template_map_fn_factory, template=prompt_template),
103
+ remove_unused_columns=True,
104
+ shuffle_before_pack=True,
105
+ pack_to_max_length=pack_to_max_length,
106
+ use_varlen_attn=use_varlen_attn)
107
+
108
+ train_dataloader = dict(
109
+ batch_size=batch_size,
110
+ num_workers=dataloader_num_workers,
111
+ dataset=train_dataset,
112
+ sampler=dict(type=DefaultSampler, shuffle=True),
113
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
114
+
115
+ #######################################################################
116
+ # PART 4 Scheduler & Optimizer #
117
+ #######################################################################
118
+ # optimizer
119
+ optim_wrapper = dict(
120
+ type=AmpOptimWrapper,
121
+ optimizer=dict(
122
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
123
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
124
+ accumulative_counts=accumulative_counts,
125
+ loss_scale='dynamic',
126
+ dtype='float16')
127
+
128
+ # learning policy
129
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
130
+ param_scheduler = [
131
+ dict(
132
+ type=LinearLR,
133
+ start_factor=1e-5,
134
+ by_epoch=True,
135
+ begin=0,
136
+ end=warmup_ratio * max_epochs,
137
+ convert_to_iter_based=True),
138
+ dict(
139
+ type=CosineAnnealingLR,
140
+ eta_min=0.0,
141
+ by_epoch=True,
142
+ begin=warmup_ratio * max_epochs,
143
+ end=max_epochs,
144
+ convert_to_iter_based=True)
145
+ ]
146
+
147
+ # train, val, test setting
148
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
149
+
150
+ #######################################################################
151
+ # PART 5 Runtime #
152
+ #######################################################################
153
+ # Log the dialogue periodically during the training process, optional
154
+ custom_hooks = [
155
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
156
+ dict(
157
+ type=EvaluateChatHook,
158
+ tokenizer=tokenizer,
159
+ every_n_iters=evaluation_freq,
160
+ evaluation_inputs=evaluation_inputs,
161
+ system=SYSTEM,
162
+ prompt_template=prompt_template)
163
+ ]
164
+
165
+ if use_varlen_attn:
166
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
167
+
168
+ # configure default hooks
169
+ default_hooks = dict(
170
+ # record the time of every iteration.
171
+ timer=dict(type=IterTimerHook),
172
+ # print log every 10 iterations.
173
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
174
+ # enable the parameter scheduler.
175
+ param_scheduler=dict(type=ParamSchedulerHook),
176
+ # save a checkpoint every `save_steps` iterations.
177
+ checkpoint=dict(
178
+ type=CheckpointHook,
179
+ by_epoch=False,
180
+ interval=save_steps,
181
+ max_keep_ckpts=save_total_limit),
182
+ # set the sampler seed in a distributed environment.
183
+ sampler_seed=dict(type=DistSamplerSeedHook),
184
+ )
185
+
186
+ # configure environment
187
+ env_cfg = dict(
188
+ # whether to enable cudnn benchmark
189
+ cudnn_benchmark=False,
190
+ # set multi process parameters
191
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
192
+ # set distributed parameters
193
+ dist_cfg=dict(backend='nccl'),
194
+ )
195
+
196
+ # set visualizer
197
+ visualizer = None
198
+
199
+ # set log level
200
+ log_level = 'INFO'
201
+
202
+ # load from which checkpoint
203
+ load_from = None
204
+
205
+ # whether to resume training from the loaded checkpoint
206
+ resume = False
207
+
208
+ # Default to a random seed and disable `deterministic`
209
+ randomness = dict(seed=None, deterministic=False)
210
+
211
+ # set log processor
212
+ log_processor = dict(by_epoch=False)
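Unlike the 2048-token packed variants, this 512-token config pads each conversation instead of packing (`pack_to_max_length = False`), trading throughput for simpler sequence boundaries. A back-of-the-envelope sketch of the difference, with invented sample lengths:

sample_lens = [180, 350, 90, 512, 60]  # invented token counts per conversation
max_length = 512

padded = len(sample_lens) * max_length                 # tokens processed when padding
useful = sum(min(n, max_length) for n in sample_lens)  # tokens that actually carry loss
print(f'{useful / padded:.0%} of the padded batch is real data')  # roughly 47% here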
xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_oasst1_e3.py ADDED
@@ -0,0 +1,212 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import oasst1_map_fn, template_map_fn_factory
16
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
17
+ VarlenAttnArgsToMessageHubHook)
18
+ from xtuner.engine.runner import TrainLoop
19
+ from xtuner.model import SupervisedFinetune
20
+ from xtuner.utils import PROMPT_TEMPLATE
21
+
22
+ #######################################################################
23
+ # PART 1 Settings #
24
+ #######################################################################
25
+ # Model
26
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Chat'
27
+ use_varlen_attn = False
28
+
29
+ # Data
30
+ data_path = 'timdettmers/openassistant-guanaco'
31
+ prompt_template = PROMPT_TEMPLATE.baichuan2_chat
32
+ max_length = 2048
33
+ pack_to_max_length = True
34
+
35
+ # Scheduler & Optimizer
36
+ batch_size = 1 # per_device
37
+ accumulative_counts = 16
38
+ dataloader_num_workers = 0
39
+ max_epochs = 3
40
+ optim_type = AdamW
41
+ lr = 2e-4
42
+ betas = (0.9, 0.999)
43
+ weight_decay = 0
44
+ max_norm = 1 # grad clip
45
+ warmup_ratio = 0.03
46
+
47
+ # Save
48
+ save_steps = 500
49
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
50
+
51
+ # Evaluate the generation performance during the training
52
+ evaluation_freq = 500
53
+ SYSTEM = ''
54
+ evaluation_inputs = [
55
+ '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
56
+ ]
57
+
58
+ #######################################################################
59
+ # PART 2 Model & Tokenizer #
60
+ #######################################################################
61
+ tokenizer = dict(
62
+ type=AutoTokenizer.from_pretrained,
63
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
64
+ trust_remote_code=True,
65
+ padding_side='right')
66
+
67
+ model = dict(
68
+ type=SupervisedFinetune,
69
+ use_varlen_attn=use_varlen_attn,
70
+ llm=dict(
71
+ type=AutoModelForCausalLM.from_pretrained,
72
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
73
+ trust_remote_code=True,
74
+ torch_dtype=torch.float16,
75
+ quantization_config=dict(
76
+ type=BitsAndBytesConfig,
77
+ load_in_4bit=True,
78
+ load_in_8bit=False,
79
+ llm_int8_threshold=6.0,
80
+ llm_int8_has_fp16_weight=False,
81
+ bnb_4bit_compute_dtype=torch.float16,
82
+ bnb_4bit_use_double_quant=True,
83
+ bnb_4bit_quant_type='nf4')),
84
+ lora=dict(
85
+ type=LoraConfig,
86
+ r=64,
87
+ lora_alpha=16,
88
+ lora_dropout=0.1,
89
+ bias='none',
90
+ task_type='CAUSAL_LM'))
91
+
92
+ #######################################################################
93
+ # PART 3 Dataset & Dataloader #
94
+ #######################################################################
95
+ train_dataset = dict(
96
+ type=process_hf_dataset,
97
+ dataset=dict(type=load_dataset, path=data_path),
98
+ tokenizer=tokenizer,
99
+ max_length=max_length,
100
+ dataset_map_fn=oasst1_map_fn,
101
+ template_map_fn=dict(
102
+ type=template_map_fn_factory, template=prompt_template),
103
+ remove_unused_columns=True,
104
+ shuffle_before_pack=True,
105
+ pack_to_max_length=pack_to_max_length,
106
+ use_varlen_attn=use_varlen_attn)
107
+
108
+ train_dataloader = dict(
109
+ batch_size=batch_size,
110
+ num_workers=dataloader_num_workers,
111
+ dataset=train_dataset,
112
+ sampler=dict(type=DefaultSampler, shuffle=True),
113
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
114
+
115
+ #######################################################################
116
+ # PART 4 Scheduler & Optimizer #
117
+ #######################################################################
118
+ # optimizer
119
+ optim_wrapper = dict(
120
+ type=AmpOptimWrapper,
121
+ optimizer=dict(
122
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
123
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
124
+ accumulative_counts=accumulative_counts,
125
+ loss_scale='dynamic',
126
+ dtype='float16')
127
+
128
+ # learning policy
129
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
130
+ param_scheduler = [
131
+ dict(
132
+ type=LinearLR,
133
+ start_factor=1e-5,
134
+ by_epoch=True,
135
+ begin=0,
136
+ end=warmup_ratio * max_epochs,
137
+ convert_to_iter_based=True),
138
+ dict(
139
+ type=CosineAnnealingLR,
140
+ eta_min=0.0,
141
+ by_epoch=True,
142
+ begin=warmup_ratio * max_epochs,
143
+ end=max_epochs,
144
+ convert_to_iter_based=True)
145
+ ]
146
+
147
+ # train, val, test setting
148
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
149
+
150
+ #######################################################################
151
+ # PART 5 Runtime #
152
+ #######################################################################
153
+ # Log the dialogue periodically during the training process, optional
154
+ custom_hooks = [
155
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
156
+ dict(
157
+ type=EvaluateChatHook,
158
+ tokenizer=tokenizer,
159
+ every_n_iters=evaluation_freq,
160
+ evaluation_inputs=evaluation_inputs,
161
+ system=SYSTEM,
162
+ prompt_template=prompt_template)
163
+ ]
164
+
165
+ if use_varlen_attn:
166
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
167
+
168
+ # configure default hooks
169
+ default_hooks = dict(
170
+ # record the time of every iteration.
171
+ timer=dict(type=IterTimerHook),
172
+ # print log every 10 iterations.
173
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
174
+ # enable the parameter scheduler.
175
+ param_scheduler=dict(type=ParamSchedulerHook),
176
+ # save checkpoint per `save_steps`.
177
+ checkpoint=dict(
178
+ type=CheckpointHook,
179
+ by_epoch=False,
180
+ interval=save_steps,
181
+ max_keep_ckpts=save_total_limit),
182
+ # set sampler seed in distributed evrionment.
183
+ sampler_seed=dict(type=DistSamplerSeedHook),
184
+ )
185
+
186
+ # configure environment
187
+ env_cfg = dict(
188
+ # whether to enable cudnn benchmark
189
+ cudnn_benchmark=False,
190
+ # set multi process parameters
191
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
192
+ # set distributed parameters
193
+ dist_cfg=dict(backend='nccl'),
194
+ )
195
+
196
+ # set visualizer
197
+ visualizer = None
198
+
199
+ # set log level
200
+ log_level = 'INFO'
201
+
202
+ # load from which checkpoint
203
+ load_from = None
204
+
205
+ # whether to resume training from the loaded checkpoint
206
+ resume = False
207
+
208
+ # Defaults to use random seed and disable `deterministic`
209
+ randomness = dict(seed=None, deterministic=False)
210
+
211
+ # set log processor
212
+ log_processor = dict(by_epoch=False)
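The batch size the optimizer effectively sees is the product of the per-device batch size, the gradient-accumulation steps and the data-parallel world size; the GPU count below is only an example and is not fixed by the config.

batch_size = 1            # per device, as set above
accumulative_counts = 16  # gradient accumulation
world_size = 8            # example: an 8-GPU data-parallel launch

print(batch_size * accumulative_counts * world_size)  # 128 packed 2048-token sequences per step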
xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_open_platypus_e3.py ADDED
@@ -0,0 +1,212 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory
16
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
17
+ VarlenAttnArgsToMessageHubHook)
18
+ from xtuner.engine.runner import TrainLoop
19
+ from xtuner.model import SupervisedFinetune
20
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
21
+
22
+ #######################################################################
23
+ # PART 1 Settings #
24
+ #######################################################################
25
+ # Model
26
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Chat'
27
+ use_varlen_attn = False
28
+
29
+ # Data
30
+ data_path = 'garage-bAInd/Open-Platypus'
31
+ prompt_template = PROMPT_TEMPLATE.baichuan2_chat
32
+ max_length = 2048
33
+ pack_to_max_length = True
34
+
35
+ # Scheduler & Optimizer
36
+ batch_size = 1 # per_device
37
+ accumulative_counts = 16
38
+ dataloader_num_workers = 0
39
+ max_epochs = 3
40
+ optim_type = AdamW
41
+ lr = 2e-4
42
+ betas = (0.9, 0.999)
43
+ weight_decay = 0
44
+ max_norm = 1 # grad clip
45
+ warmup_ratio = 0.03
46
+
47
+ # Save
48
+ save_steps = 500
49
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
50
+
51
+ # Evaluate the generation performance during the training
52
+ evaluation_freq = 500
53
+ SYSTEM = SYSTEM_TEMPLATE.alpaca
54
+ evaluation_inputs = [
55
+ '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
56
+ ]
57
+
58
+ #######################################################################
59
+ # PART 2 Model & Tokenizer #
60
+ #######################################################################
61
+ tokenizer = dict(
62
+ type=AutoTokenizer.from_pretrained,
63
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
64
+ trust_remote_code=True,
65
+ padding_side='right')
66
+
67
+ model = dict(
68
+ type=SupervisedFinetune,
69
+ use_varlen_attn=use_varlen_attn,
70
+ llm=dict(
71
+ type=AutoModelForCausalLM.from_pretrained,
72
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
73
+ trust_remote_code=True,
74
+ torch_dtype=torch.float16,
75
+ quantization_config=dict(
76
+ type=BitsAndBytesConfig,
77
+ load_in_4bit=True,
78
+ load_in_8bit=False,
79
+ llm_int8_threshold=6.0,
80
+ llm_int8_has_fp16_weight=False,
81
+ bnb_4bit_compute_dtype=torch.float16,
82
+ bnb_4bit_use_double_quant=True,
83
+ bnb_4bit_quant_type='nf4')),
84
+ lora=dict(
85
+ type=LoraConfig,
86
+ r=64,
87
+ lora_alpha=16,
88
+ lora_dropout=0.1,
89
+ bias='none',
90
+ task_type='CAUSAL_LM'))
91
+
92
+ #######################################################################
93
+ # PART 3 Dataset & Dataloader #
94
+ #######################################################################
95
+ train_dataset = dict(
96
+ type=process_hf_dataset,
97
+ dataset=dict(type=load_dataset, path=data_path),
98
+ tokenizer=tokenizer,
99
+ max_length=max_length,
100
+ dataset_map_fn=alpaca_map_fn,
101
+ template_map_fn=dict(
102
+ type=template_map_fn_factory, template=prompt_template),
103
+ remove_unused_columns=True,
104
+ shuffle_before_pack=True,
105
+ pack_to_max_length=pack_to_max_length,
106
+ use_varlen_attn=use_varlen_attn)
107
+
108
+ train_dataloader = dict(
109
+ batch_size=batch_size,
110
+ num_workers=dataloader_num_workers,
111
+ dataset=train_dataset,
112
+ sampler=dict(type=DefaultSampler, shuffle=True),
113
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
114
+
115
+ #######################################################################
116
+ # PART 4 Scheduler & Optimizer #
117
+ #######################################################################
118
+ # optimizer
119
+ optim_wrapper = dict(
120
+ type=AmpOptimWrapper,
121
+ optimizer=dict(
122
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
123
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
124
+ accumulative_counts=accumulative_counts,
125
+ loss_scale='dynamic',
126
+ dtype='float16')
127
+
128
+ # learning policy
129
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
130
+ param_scheduler = [
131
+ dict(
132
+ type=LinearLR,
133
+ start_factor=1e-5,
134
+ by_epoch=True,
135
+ begin=0,
136
+ end=warmup_ratio * max_epochs,
137
+ convert_to_iter_based=True),
138
+ dict(
139
+ type=CosineAnnealingLR,
140
+ eta_min=0.0,
141
+ by_epoch=True,
142
+ begin=warmup_ratio * max_epochs,
143
+ end=max_epochs,
144
+ convert_to_iter_based=True)
145
+ ]
146
+
147
+ # train, val, test setting
148
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
149
+
150
+ #######################################################################
151
+ # PART 5 Runtime #
152
+ #######################################################################
153
+ # Log the dialogue periodically during the training process, optional
154
+ custom_hooks = [
155
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
156
+ dict(
157
+ type=EvaluateChatHook,
158
+ tokenizer=tokenizer,
159
+ every_n_iters=evaluation_freq,
160
+ evaluation_inputs=evaluation_inputs,
161
+ system=SYSTEM,
162
+ prompt_template=prompt_template)
163
+ ]
164
+
165
+ if use_varlen_attn:
166
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
167
+
168
+ # configure default hooks
169
+ default_hooks = dict(
170
+ # record the time of every iteration.
171
+ timer=dict(type=IterTimerHook),
172
+ # print log every 10 iterations.
173
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
174
+ # enable the parameter scheduler.
175
+ param_scheduler=dict(type=ParamSchedulerHook),
176
+ # save a checkpoint every `save_steps` iterations.
177
+ checkpoint=dict(
178
+ type=CheckpointHook,
179
+ by_epoch=False,
180
+ interval=save_steps,
181
+ max_keep_ckpts=save_total_limit),
182
+ # set the sampler seed in a distributed environment.
183
+ sampler_seed=dict(type=DistSamplerSeedHook),
184
+ )
185
+
186
+ # configure environment
187
+ env_cfg = dict(
188
+ # whether to enable cudnn benchmark
189
+ cudnn_benchmark=False,
190
+ # set multi process parameters
191
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
192
+ # set distributed parameters
193
+ dist_cfg=dict(backend='nccl'),
194
+ )
195
+
196
+ # set visualizer
197
+ visualizer = None
198
+
199
+ # set log level
200
+ log_level = 'INFO'
201
+
202
+ # load from which checkpoint
203
+ load_from = None
204
+
205
+ # whether to resume training from the loaded checkpoint
206
+ resume = False
207
+
208
+ # Default to a random seed and disable `deterministic`
209
+ randomness = dict(seed=None, deterministic=False)
210
+
211
+ # set log processor
212
+ log_processor = dict(by_epoch=False)
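The two entries in `param_scheduler` describe a linear warmup over the first 3% of training followed by cosine decay to zero. The closed-form sketch below reproduces that curve approximately, ignoring the per-iteration discretisation that `convert_to_iter_based=True` introduces.

import math

lr, warmup_ratio, start_factor = 2e-4, 0.03, 1e-5

def lr_at(progress):
    # progress runs from 0 to 1 over the whole training run
    if progress < warmup_ratio:
        start = start_factor * lr
        return start + (lr - start) * (progress / warmup_ratio)
    t = (progress - warmup_ratio) / (1 - warmup_ratio)  # position within the cosine phase
    return 0.5 * lr * (1 + math.cos(math.pi * t))

for p in (0.0, 0.03, 0.5, 1.0):
    print(p, f'{lr_at(p):.2e}')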
xtuner/configs/baichuan/baichuan2_7b_base/baichuan2_7b_base_qlora_alpaca_e3.py ADDED
@@ -0,0 +1,212 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory
16
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
17
+ VarlenAttnArgsToMessageHubHook)
18
+ from xtuner.engine.runner import TrainLoop
19
+ from xtuner.model import SupervisedFinetune
20
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
21
+
22
+ #######################################################################
23
+ # PART 1 Settings #
24
+ #######################################################################
25
+ # Model
26
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-7B-Base'
27
+ use_varlen_attn = False
28
+
29
+ # Data
30
+ alpaca_en_path = 'tatsu-lab/alpaca'
31
+ prompt_template = PROMPT_TEMPLATE.default
32
+ max_length = 2048
33
+ pack_to_max_length = True
34
+
35
+ # Scheduler & Optimizer
36
+ batch_size = 1 # per_device
37
+ accumulative_counts = 16
38
+ dataloader_num_workers = 0
39
+ max_epochs = 3
40
+ optim_type = AdamW
41
+ lr = 2e-4
42
+ betas = (0.9, 0.999)
43
+ weight_decay = 0
44
+ max_norm = 1 # grad clip
45
+ warmup_ratio = 0.03
46
+
47
+ # Save
48
+ save_steps = 500
49
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
50
+
51
+ # Evaluate the generation performance during the training
52
+ evaluation_freq = 500
53
+ SYSTEM = SYSTEM_TEMPLATE.alpaca
54
+ evaluation_inputs = [
55
+ '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
56
+ ]
57
+
58
+ #######################################################################
59
+ # PART 2 Model & Tokenizer #
60
+ #######################################################################
61
+ tokenizer = dict(
62
+ type=AutoTokenizer.from_pretrained,
63
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
64
+ trust_remote_code=True,
65
+ padding_side='right')
66
+
67
+ model = dict(
68
+ type=SupervisedFinetune,
69
+ use_varlen_attn=use_varlen_attn,
70
+ llm=dict(
71
+ type=AutoModelForCausalLM.from_pretrained,
72
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
73
+ trust_remote_code=True,
74
+ torch_dtype=torch.float16,
75
+ quantization_config=dict(
76
+ type=BitsAndBytesConfig,
77
+ load_in_4bit=True,
78
+ load_in_8bit=False,
79
+ llm_int8_threshold=6.0,
80
+ llm_int8_has_fp16_weight=False,
81
+ bnb_4bit_compute_dtype=torch.float16,
82
+ bnb_4bit_use_double_quant=True,
83
+ bnb_4bit_quant_type='nf4')),
84
+ lora=dict(
85
+ type=LoraConfig,
86
+ r=64,
87
+ lora_alpha=16,
88
+ lora_dropout=0.1,
89
+ bias='none',
90
+ task_type='CAUSAL_LM'))
91
+
92
+ #######################################################################
93
+ # PART 3 Dataset & Dataloader #
94
+ #######################################################################
95
+ alpaca_en = dict(
96
+ type=process_hf_dataset,
97
+ dataset=dict(type=load_dataset, path=alpaca_en_path),
98
+ tokenizer=tokenizer,
99
+ max_length=max_length,
100
+ dataset_map_fn=alpaca_map_fn,
101
+ template_map_fn=dict(
102
+ type=template_map_fn_factory, template=prompt_template),
103
+ remove_unused_columns=True,
104
+ shuffle_before_pack=True,
105
+ pack_to_max_length=pack_to_max_length,
106
+ use_varlen_attn=use_varlen_attn)
107
+
108
+ train_dataloader = dict(
109
+ batch_size=batch_size,
110
+ num_workers=dataloader_num_workers,
111
+ dataset=alpaca_en,
112
+ sampler=dict(type=DefaultSampler, shuffle=True),
113
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
114
+
115
+ #######################################################################
116
+ # PART 4 Scheduler & Optimizer #
117
+ #######################################################################
118
+ # optimizer
119
+ optim_wrapper = dict(
120
+ type=AmpOptimWrapper,
121
+ optimizer=dict(
122
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
123
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
124
+ accumulative_counts=accumulative_counts,
125
+ loss_scale='dynamic',
126
+ dtype='float16')
127
+
128
+ # learning policy
129
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
130
+ param_scheduler = [
131
+ dict(
132
+ type=LinearLR,
133
+ start_factor=1e-5,
134
+ by_epoch=True,
135
+ begin=0,
136
+ end=warmup_ratio * max_epochs,
137
+ convert_to_iter_based=True),
138
+ dict(
139
+ type=CosineAnnealingLR,
140
+ eta_min=0.0,
141
+ by_epoch=True,
142
+ begin=warmup_ratio * max_epochs,
143
+ end=max_epochs,
144
+ convert_to_iter_based=True)
145
+ ]
146
+
147
+ # train, val, test setting
148
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
149
+
150
+ #######################################################################
151
+ # PART 5 Runtime #
152
+ #######################################################################
153
+ # Log the dialogue periodically during the training process, optional
154
+ custom_hooks = [
155
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
156
+ dict(
157
+ type=EvaluateChatHook,
158
+ tokenizer=tokenizer,
159
+ every_n_iters=evaluation_freq,
160
+ evaluation_inputs=evaluation_inputs,
161
+ system=SYSTEM,
162
+ prompt_template=prompt_template)
163
+ ]
164
+
165
+ if use_varlen_attn:
166
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
167
+
168
+ # configure default hooks
169
+ default_hooks = dict(
170
+ # record the time of every iteration.
171
+ timer=dict(type=IterTimerHook),
172
+ # print log every 10 iterations.
173
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
174
+ # enable the parameter scheduler.
175
+ param_scheduler=dict(type=ParamSchedulerHook),
176
+ # save a checkpoint every `save_steps` iterations.
177
+ checkpoint=dict(
178
+ type=CheckpointHook,
179
+ by_epoch=False,
180
+ interval=save_steps,
181
+ max_keep_ckpts=save_total_limit),
182
+ # set the sampler seed in a distributed environment.
183
+ sampler_seed=dict(type=DistSamplerSeedHook),
184
+ )
185
+
186
+ # configure environment
187
+ env_cfg = dict(
188
+ # whether to enable cudnn benchmark
189
+ cudnn_benchmark=False,
190
+ # set multi process parameters
191
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
192
+ # set distributed parameters
193
+ dist_cfg=dict(backend='nccl'),
194
+ )
195
+
196
+ # set visualizer
197
+ visualizer = None
198
+
199
+ # set log level
200
+ log_level = 'INFO'
201
+
202
+ # load from which checkpoint
203
+ load_from = None
204
+
205
+ # whether to resume training from the loaded checkpoint
206
+ resume = False
207
+
208
+ # Default to a random seed and disable `deterministic`
209
+ randomness = dict(seed=None, deterministic=False)
210
+
211
+ # set log processor
212
+ log_processor = dict(by_epoch=False)
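With `save_steps = 500` and `max_keep_ckpts = 2`, training leaves at most two `iter_*.pth` checkpoints in the work directory. Exporting one of them as a Hugging Face LoRA adapter is normally done with xtuner's convert tooling; the commented commands are quoted from memory and may differ slightly across versions, and the paths are placeholders.

# xtuner convert pth_to_hf <this_config>.py work_dirs/<run>/iter_500.pth ./hf_adapter
# xtuner convert merge baichuan-inc/Baichuan2-7B-Base ./hf_adapter ./merged  # optional full merge
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained(
    'baichuan-inc/Baichuan2-7B-Base', trust_remote_code=True)
model = PeftModel.from_pretrained(base, './hf_adapter')  # placeholder adapter directory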
xtuner/configs/baichuan/baichuan2_7b_base/baichuan2_7b_base_qlora_alpaca_enzh_e3.py ADDED
@@ -0,0 +1,229 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import ConcatDataset, process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import (alpaca_map_fn, alpaca_zh_map_fn,
16
+ template_map_fn_factory)
17
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
18
+ VarlenAttnArgsToMessageHubHook)
19
+ from xtuner.engine.runner import TrainLoop
20
+ from xtuner.model import SupervisedFinetune
21
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
22
+
23
+ #######################################################################
24
+ # PART 1 Settings #
25
+ #######################################################################
26
+ # Model
27
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-7B-Base'
28
+ use_varlen_attn = False
29
+
30
+ # Data
31
+ alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese'
32
+ alpaca_en_path = 'tatsu-lab/alpaca'
33
+ prompt_template = PROMPT_TEMPLATE.default
34
+ max_length = 2048
35
+ pack_to_max_length = True
36
+
37
+ # Scheduler & Optimizer
38
+ batch_size = 1 # per_device
39
+ accumulative_counts = 16
40
+ dataloader_num_workers = 0
41
+ max_epochs = 3
42
+ optim_type = AdamW
43
+ lr = 2e-4
44
+ betas = (0.9, 0.999)
45
+ weight_decay = 0
46
+ max_norm = 1 # grad clip
47
+ warmup_ratio = 0.03
48
+
49
+ # Save
50
+ save_steps = 500
51
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
52
+
53
+ # Evaluate the generation performance during the training
54
+ evaluation_freq = 500
55
+ SYSTEM = SYSTEM_TEMPLATE.alpaca
56
+ evaluation_inputs = [
57
+ '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
58
+ ]
59
+
60
+ #######################################################################
61
+ # PART 2 Model & Tokenizer #
62
+ #######################################################################
63
+ tokenizer = dict(
64
+ type=AutoTokenizer.from_pretrained,
65
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
66
+ trust_remote_code=True,
67
+ padding_side='right')
68
+
69
+ model = dict(
70
+ type=SupervisedFinetune,
71
+ use_varlen_attn=use_varlen_attn,
72
+ llm=dict(
73
+ type=AutoModelForCausalLM.from_pretrained,
74
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
75
+ trust_remote_code=True,
76
+ torch_dtype=torch.float16,
77
+ quantization_config=dict(
78
+ type=BitsAndBytesConfig,
79
+ load_in_4bit=True,
80
+ load_in_8bit=False,
81
+ llm_int8_threshold=6.0,
82
+ llm_int8_has_fp16_weight=False,
83
+ bnb_4bit_compute_dtype=torch.float16,
84
+ bnb_4bit_use_double_quant=True,
85
+ bnb_4bit_quant_type='nf4')),
86
+ lora=dict(
87
+ type=LoraConfig,
88
+ r=64,
89
+ lora_alpha=16,
90
+ lora_dropout=0.1,
91
+ bias='none',
92
+ task_type='CAUSAL_LM'))
93
+
94
+ #######################################################################
95
+ # PART 3 Dataset & Dataloader #
96
+ #######################################################################
97
+ alpaca_en = dict(
98
+ type=process_hf_dataset,
99
+ dataset=dict(type=load_dataset, path=alpaca_en_path),
100
+ tokenizer=tokenizer,
101
+ max_length=max_length,
102
+ dataset_map_fn=alpaca_map_fn,
103
+ template_map_fn=dict(
104
+ type=template_map_fn_factory, template=prompt_template),
105
+ remove_unused_columns=True,
106
+ shuffle_before_pack=True,
107
+ pack_to_max_length=pack_to_max_length,
108
+ use_varlen_attn=use_varlen_attn)
109
+
110
+ alpaca_zh = dict(
111
+ type=process_hf_dataset,
112
+ dataset=dict(type=load_dataset, path=alpaca_zh_path),
113
+ tokenizer=tokenizer,
114
+ max_length=max_length,
115
+ dataset_map_fn=alpaca_zh_map_fn,
116
+ template_map_fn=dict(
117
+ type=template_map_fn_factory, template=prompt_template),
118
+ remove_unused_columns=True,
119
+ shuffle_before_pack=True,
120
+ pack_to_max_length=pack_to_max_length,
121
+ use_varlen_attn=use_varlen_attn)
122
+
123
+ train_dataset = dict(type=ConcatDataset, datasets=[alpaca_en, alpaca_zh])
124
+
125
+ train_dataloader = dict(
126
+ batch_size=batch_size,
127
+ num_workers=dataloader_num_workers,
128
+ dataset=train_dataset,
129
+ sampler=dict(type=DefaultSampler, shuffle=True),
130
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
131
+
132
+ #######################################################################
133
+ # PART 4 Scheduler & Optimizer #
134
+ #######################################################################
135
+ # optimizer
136
+ optim_wrapper = dict(
137
+ type=AmpOptimWrapper,
138
+ optimizer=dict(
139
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
140
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
141
+ accumulative_counts=accumulative_counts,
142
+ loss_scale='dynamic',
143
+ dtype='float16')
144
+
145
+ # learning policy
146
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
147
+ param_scheduler = [
148
+ dict(
149
+ type=LinearLR,
150
+ start_factor=1e-5,
151
+ by_epoch=True,
152
+ begin=0,
153
+ end=warmup_ratio * max_epochs,
154
+ convert_to_iter_based=True),
155
+ dict(
156
+ type=CosineAnnealingLR,
157
+ eta_min=0.0,
158
+ by_epoch=True,
159
+ begin=warmup_ratio * max_epochs,
160
+ end=max_epochs,
161
+ convert_to_iter_based=True)
162
+ ]
163
+
164
+ # train, val, test setting
165
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
166
+
167
+ #######################################################################
168
+ # PART 5 Runtime #
169
+ #######################################################################
170
+ # Log the dialogue periodically during the training process, optional
171
+ custom_hooks = [
172
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
173
+ dict(
174
+ type=EvaluateChatHook,
175
+ tokenizer=tokenizer,
176
+ every_n_iters=evaluation_freq,
177
+ evaluation_inputs=evaluation_inputs,
178
+ system=SYSTEM,
179
+ prompt_template=prompt_template)
180
+ ]
181
+
182
+ if use_varlen_attn:
183
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
184
+
185
+ # configure default hooks
186
+ default_hooks = dict(
187
+ # record the time of every iteration.
188
+ timer=dict(type=IterTimerHook),
189
+ # print log every 10 iterations.
190
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
191
+ # enable the parameter scheduler.
192
+ param_scheduler=dict(type=ParamSchedulerHook),
193
+ # save a checkpoint every `save_steps` iterations.
194
+ checkpoint=dict(
195
+ type=CheckpointHook,
196
+ by_epoch=False,
197
+ interval=save_steps,
198
+ max_keep_ckpts=save_total_limit),
199
+ # set the sampler seed in a distributed environment.
200
+ sampler_seed=dict(type=DistSamplerSeedHook),
201
+ )
202
+
203
+ # configure environment
204
+ env_cfg = dict(
205
+ # whether to enable cudnn benchmark
206
+ cudnn_benchmark=False,
207
+ # set multi process parameters
208
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
209
+ # set distributed parameters
210
+ dist_cfg=dict(backend='nccl'),
211
+ )
212
+
213
+ # set visualizer
214
+ visualizer = None
215
+
216
+ # set log level
217
+ log_level = 'INFO'
218
+
219
+ # load from which checkpoint
220
+ load_from = None
221
+
222
+ # whether to resume training from the loaded checkpoint
223
+ resume = False
224
+
225
+ # Default to a random seed and disable `deterministic`
226
+ randomness = dict(seed=None, deterministic=False)
227
+
228
+ # set log processor
229
+ log_processor = dict(by_epoch=False)
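The bilingual `evaluation_inputs` are consumed by `EvaluateChatHook`, which every `evaluation_freq` iterations formats each prompt with the system text and the prompt template and logs a generation. The snippet below only illustrates that formatting; the template string is a placeholder, not the actual `PROMPT_TEMPLATE.default` from xtuner.utils.

SYSTEM = 'Below is an instruction that describes a task. ...'  # stand-in for SYSTEM_TEMPLATE.alpaca
template = '<|System|>:{system}\n<|User|>:{input}\n<|Bot|>:'   # placeholder template

for question in ['请给我介绍五个上海的景点',
                 'Please tell me five scenic spots in Shanghai']:
    prompt = template.format(system=SYSTEM, input=question)
    # inputs = tokenizer(prompt, return_tensors='pt')
    # print(tokenizer.decode(model.generate(**inputs, max_new_tokens=256)[0]))
    print(prompt)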
xtuner/configs/baichuan/baichuan2_7b_base/baichuan2_7b_base_qlora_alpaca_enzh_oasst1_e3.py ADDED
@@ -0,0 +1,244 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import ConcatDataset, process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import (alpaca_map_fn, alpaca_zh_map_fn,
16
+ oasst1_map_fn, template_map_fn_factory)
17
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
18
+ VarlenAttnArgsToMessageHubHook)
19
+ from xtuner.engine.runner import TrainLoop
20
+ from xtuner.model import SupervisedFinetune
21
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
22
+
23
+ #######################################################################
24
+ # PART 1 Settings #
25
+ #######################################################################
26
+ # Model
27
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-7B-Base'
28
+ use_varlen_attn = False
29
+
30
+ # Data
31
+ alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese'
32
+ alpaca_en_path = 'tatsu-lab/alpaca'
33
+ oasst1_path = 'timdettmers/openassistant-guanaco'
34
+ prompt_template = PROMPT_TEMPLATE.default
35
+ max_length = 2048
36
+ pack_to_max_length = True
37
+
38
+ # Scheduler & Optimizer
39
+ batch_size = 1 # per_device
40
+ accumulative_counts = 16
41
+ dataloader_num_workers = 0
42
+ max_epochs = 3
43
+ optim_type = AdamW
44
+ lr = 2e-4
45
+ betas = (0.9, 0.999)
46
+ weight_decay = 0
47
+ max_norm = 1 # grad clip
48
+ warmup_ratio = 0.03
49
+
50
+ # Save
51
+ save_steps = 500
52
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
53
+
54
+ # Evaluate the generation performance during the training
55
+ evaluation_freq = 500
56
+ SYSTEM = SYSTEM_TEMPLATE.alpaca
57
+ evaluation_inputs = [
58
+ '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
59
+ ]
60
+
61
+ #######################################################################
62
+ # PART 2 Model & Tokenizer #
63
+ #######################################################################
64
+ tokenizer = dict(
65
+ type=AutoTokenizer.from_pretrained,
66
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
67
+ trust_remote_code=True,
68
+ padding_side='right')
69
+
70
+ model = dict(
71
+ type=SupervisedFinetune,
72
+ use_varlen_attn=use_varlen_attn,
73
+ llm=dict(
74
+ type=AutoModelForCausalLM.from_pretrained,
75
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
76
+ trust_remote_code=True,
77
+ torch_dtype=torch.float16,
78
+ quantization_config=dict(
79
+ type=BitsAndBytesConfig,
80
+ load_in_4bit=True,
81
+ load_in_8bit=False,
82
+ llm_int8_threshold=6.0,
83
+ llm_int8_has_fp16_weight=False,
84
+ bnb_4bit_compute_dtype=torch.float16,
85
+ bnb_4bit_use_double_quant=True,
86
+ bnb_4bit_quant_type='nf4')),
87
+ lora=dict(
88
+ type=LoraConfig,
89
+ r=64,
90
+ lora_alpha=16,
91
+ lora_dropout=0.1,
92
+ bias='none',
93
+ task_type='CAUSAL_LM'))
94
+
95
+ #######################################################################
96
+ # PART 3 Dataset & Dataloader #
97
+ #######################################################################
98
+ alpaca_en = dict(
99
+ type=process_hf_dataset,
100
+ dataset=dict(type=load_dataset, path=alpaca_en_path),
101
+ tokenizer=tokenizer,
102
+ max_length=max_length,
103
+ dataset_map_fn=alpaca_map_fn,
104
+ template_map_fn=dict(
105
+ type=template_map_fn_factory, template=prompt_template),
106
+ remove_unused_columns=True,
107
+ shuffle_before_pack=True,
108
+ pack_to_max_length=pack_to_max_length,
109
+ use_varlen_attn=use_varlen_attn)
110
+
111
+ alpaca_zh = dict(
112
+ type=process_hf_dataset,
113
+ dataset=dict(type=load_dataset, path=alpaca_zh_path),
114
+ tokenizer=tokenizer,
115
+ max_length=max_length,
116
+ dataset_map_fn=alpaca_zh_map_fn,
117
+ template_map_fn=dict(
118
+ type=template_map_fn_factory, template=prompt_template),
119
+ remove_unused_columns=True,
120
+ shuffle_before_pack=True,
121
+ pack_to_max_length=pack_to_max_length,
122
+ use_varlen_attn=use_varlen_attn)
123
+
124
+ oasst1 = dict(
125
+ type=process_hf_dataset,
126
+ dataset=dict(type=load_dataset, path=oasst1_path),
127
+ tokenizer=tokenizer,
128
+ max_length=max_length,
129
+ dataset_map_fn=oasst1_map_fn,
130
+ template_map_fn=dict(
131
+ type=template_map_fn_factory, template=prompt_template),
132
+ remove_unused_columns=True,
133
+ shuffle_before_pack=True,
134
+ pack_to_max_length=pack_to_max_length,
135
+ use_varlen_attn=use_varlen_attn)
136
+
137
+ train_dataset = dict(
138
+ type=ConcatDataset, datasets=[alpaca_en, alpaca_zh, oasst1])
139
+
140
+ train_dataloader = dict(
141
+ batch_size=batch_size,
142
+ num_workers=dataloader_num_workers,
143
+ dataset=train_dataset,
144
+ sampler=dict(type=DefaultSampler, shuffle=True),
145
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
146
+
147
+ #######################################################################
148
+ # PART 4 Scheduler & Optimizer #
149
+ #######################################################################
150
+ # optimizer
151
+ optim_wrapper = dict(
152
+ type=AmpOptimWrapper,
153
+ optimizer=dict(
154
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
155
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
156
+ accumulative_counts=accumulative_counts,
157
+ loss_scale='dynamic',
158
+ dtype='float16')
159
+
160
+ # learning policy
161
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
162
+ param_scheduler = [
163
+ dict(
164
+ type=LinearLR,
165
+ start_factor=1e-5,
166
+ by_epoch=True,
167
+ begin=0,
168
+ end=warmup_ratio * max_epochs,
169
+ convert_to_iter_based=True),
170
+ dict(
171
+ type=CosineAnnealingLR,
172
+ eta_min=0.0,
173
+ by_epoch=True,
174
+ begin=warmup_ratio * max_epochs,
175
+ end=max_epochs,
176
+ convert_to_iter_based=True)
177
+ ]
178
+
179
+ # train, val, test setting
180
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
181
+
182
+ #######################################################################
183
+ # PART 5 Runtime #
184
+ #######################################################################
185
+ # Log the dialogue periodically during the training process, optional
186
+ custom_hooks = [
187
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
188
+ dict(
189
+ type=EvaluateChatHook,
190
+ tokenizer=tokenizer,
191
+ every_n_iters=evaluation_freq,
192
+ evaluation_inputs=evaluation_inputs,
193
+ system=SYSTEM,
194
+ prompt_template=prompt_template)
195
+ ]
196
+
197
+ if use_varlen_attn:
198
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
199
+
200
+ # configure default hooks
201
+ default_hooks = dict(
202
+ # record the time of every iteration.
203
+ timer=dict(type=IterTimerHook),
204
+ # print log every 10 iterations.
205
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
206
+ # enable the parameter scheduler.
207
+ param_scheduler=dict(type=ParamSchedulerHook),
208
+ # save a checkpoint every `save_steps` iterations.
209
+ checkpoint=dict(
210
+ type=CheckpointHook,
211
+ by_epoch=False,
212
+ interval=save_steps,
213
+ max_keep_ckpts=save_total_limit),
214
+ # set sampler seed in distributed environment.
215
+ sampler_seed=dict(type=DistSamplerSeedHook),
216
+ )
217
+
218
+ # configure environment
219
+ env_cfg = dict(
220
+ # whether to enable cudnn benchmark
221
+ cudnn_benchmark=False,
222
+ # set multi process parameters
223
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
224
+ # set distributed parameters
225
+ dist_cfg=dict(backend='nccl'),
226
+ )
227
+
228
+ # set visualizer
229
+ visualizer = None
230
+
231
+ # set log level
232
+ log_level = 'INFO'
233
+
234
+ # load from which checkpoint
235
+ load_from = None
236
+
237
+ # whether to resume training from the loaded checkpoint
238
+ resume = False
239
+
240
+ # Defaults to a random seed and disables `deterministic`
241
+ randomness = dict(seed=None, deterministic=False)
242
+
243
+ # set log processor
244
+ log_processor = dict(by_epoch=False)
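Since `pack_to_max_length = True`, each training sample handed to the model is a packed window of `max_length` tokens, so the data volume per optimizer update follows from `batch_size` and `accumulative_counts` alone. A back-of-the-envelope sketch; the 8-GPU world size below is an assumption, not something the config fixes.

# Back-of-the-envelope throughput check for the settings above.
batch_size = 1            # per-device batch size from the config
accumulative_counts = 16  # gradient accumulation steps from the config
max_length = 2048         # packed sequence length from the config
world_size = 8            # assumed number of GPUs (not set by the config)

samples_per_update = batch_size * accumulative_counts * world_size
tokens_per_update = samples_per_update * max_length
print(f'{samples_per_update} packed samples -> {tokens_per_update} tokens per optimizer update')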
xtuner/configs/baichuan/baichuan2_7b_base/baichuan2_7b_base_qlora_alpaca_zh_e3.py ADDED
@@ -0,0 +1,212 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import alpaca_zh_map_fn, template_map_fn_factory
16
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
17
+ VarlenAttnArgsToMessageHubHook)
18
+ from xtuner.engine.runner import TrainLoop
19
+ from xtuner.model import SupervisedFinetune
20
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
21
+
22
+ #######################################################################
23
+ # PART 1 Settings #
24
+ #######################################################################
25
+ # Model
26
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-7B-Base'
27
+ use_varlen_attn = False
28
+
29
+ # Data
30
+ alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese'
31
+ prompt_template = PROMPT_TEMPLATE.default
32
+ max_length = 2048
33
+ pack_to_max_length = True
34
+
35
+ # Scheduler & Optimizer
36
+ batch_size = 1 # per_device
37
+ accumulative_counts = 16
38
+ dataloader_num_workers = 0
39
+ max_epochs = 3
40
+ optim_type = AdamW
41
+ lr = 2e-4
42
+ betas = (0.9, 0.999)
43
+ weight_decay = 0
44
+ max_norm = 1 # grad clip
45
+ warmup_ratio = 0.03
46
+
47
+ # Save
48
+ save_steps = 500
49
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
50
+
51
+ # Evaluate the generation performance during the training
52
+ evaluation_freq = 500
53
+ SYSTEM = SYSTEM_TEMPLATE.alpaca
54
+ evaluation_inputs = [
55
+ '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai'
56
+ ]
57
+
58
+ #######################################################################
59
+ # PART 2 Model & Tokenizer #
60
+ #######################################################################
61
+ tokenizer = dict(
62
+ type=AutoTokenizer.from_pretrained,
63
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
64
+ trust_remote_code=True,
65
+ padding_side='right')
66
+
67
+ model = dict(
68
+ type=SupervisedFinetune,
69
+ use_varlen_attn=use_varlen_attn,
70
+ llm=dict(
71
+ type=AutoModelForCausalLM.from_pretrained,
72
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
73
+ trust_remote_code=True,
74
+ torch_dtype=torch.float16,
75
+ quantization_config=dict(
76
+ type=BitsAndBytesConfig,
77
+ load_in_4bit=True,
78
+ load_in_8bit=False,
79
+ llm_int8_threshold=6.0,
80
+ llm_int8_has_fp16_weight=False,
81
+ bnb_4bit_compute_dtype=torch.float16,
82
+ bnb_4bit_use_double_quant=True,
83
+ bnb_4bit_quant_type='nf4')),
84
+ lora=dict(
85
+ type=LoraConfig,
86
+ r=64,
87
+ lora_alpha=16,
88
+ lora_dropout=0.1,
89
+ bias='none',
90
+ task_type='CAUSAL_LM'))
91
+
92
+ #######################################################################
93
+ # PART 3 Dataset & Dataloader #
94
+ #######################################################################
95
+ alpaca_zh = dict(
96
+ type=process_hf_dataset,
97
+ dataset=dict(type=load_dataset, path=alpaca_zh_path),
98
+ tokenizer=tokenizer,
99
+ max_length=max_length,
100
+ dataset_map_fn=alpaca_zh_map_fn,
101
+ template_map_fn=dict(
102
+ type=template_map_fn_factory, template=prompt_template),
103
+ remove_unused_columns=True,
104
+ shuffle_before_pack=True,
105
+ pack_to_max_length=pack_to_max_length,
106
+ use_varlen_attn=use_varlen_attn)
107
+
108
+ train_dataloader = dict(
109
+ batch_size=batch_size,
110
+ num_workers=dataloader_num_workers,
111
+ dataset=alpaca_zh,
112
+ sampler=dict(type=DefaultSampler, shuffle=True),
113
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
114
+
115
+ #######################################################################
116
+ # PART 4 Scheduler & Optimizer #
117
+ #######################################################################
118
+ # optimizer
119
+ optim_wrapper = dict(
120
+ type=AmpOptimWrapper,
121
+ optimizer=dict(
122
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
123
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
124
+ accumulative_counts=accumulative_counts,
125
+ loss_scale='dynamic',
126
+ dtype='float16')
127
+
128
+ # learning policy
129
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
130
+ param_scheduler = [
131
+ dict(
132
+ type=LinearLR,
133
+ start_factor=1e-5,
134
+ by_epoch=True,
135
+ begin=0,
136
+ end=warmup_ratio * max_epochs,
137
+ convert_to_iter_based=True),
138
+ dict(
139
+ type=CosineAnnealingLR,
140
+ eta_min=0.0,
141
+ by_epoch=True,
142
+ begin=warmup_ratio * max_epochs,
143
+ end=max_epochs,
144
+ convert_to_iter_based=True)
145
+ ]
146
+
147
+ # train, val, test setting
148
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
149
+
150
+ #######################################################################
151
+ # PART 5 Runtime #
152
+ #######################################################################
153
+ # Log the dialogue periodically during the training process, optional
154
+ custom_hooks = [
155
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
156
+ dict(
157
+ type=EvaluateChatHook,
158
+ tokenizer=tokenizer,
159
+ every_n_iters=evaluation_freq,
160
+ evaluation_inputs=evaluation_inputs,
161
+ system=SYSTEM,
162
+ prompt_template=prompt_template)
163
+ ]
164
+
165
+ if use_varlen_attn:
166
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
167
+
168
+ # configure default hooks
169
+ default_hooks = dict(
170
+ # record the time of every iteration.
171
+ timer=dict(type=IterTimerHook),
172
+ # print log every 10 iterations.
173
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
174
+ # enable the parameter scheduler.
175
+ param_scheduler=dict(type=ParamSchedulerHook),
176
+ # save a checkpoint every `save_steps` iterations.
177
+ checkpoint=dict(
178
+ type=CheckpointHook,
179
+ by_epoch=False,
180
+ interval=save_steps,
181
+ max_keep_ckpts=save_total_limit),
182
+ # set sampler seed in distributed environment.
183
+ sampler_seed=dict(type=DistSamplerSeedHook),
184
+ )
185
+
186
+ # configure environment
187
+ env_cfg = dict(
188
+ # whether to enable cudnn benchmark
189
+ cudnn_benchmark=False,
190
+ # set multi process parameters
191
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
192
+ # set distributed parameters
193
+ dist_cfg=dict(backend='nccl'),
194
+ )
195
+
196
+ # set visualizer
197
+ visualizer = None
198
+
199
+ # set log level
200
+ log_level = 'INFO'
201
+
202
+ # load from which checkpoint
203
+ load_from = None
204
+
205
+ # whether to resume training from the loaded checkpoint
206
+ resume = False
207
+
208
+ # Defaults to a random seed and disables `deterministic`
209
+ randomness = dict(seed=None, deterministic=False)
210
+
211
+ # set log processor
212
+ log_processor = dict(by_epoch=False)
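The `param_scheduler` above is a short linear warmup over the first `warmup_ratio * max_epochs` = 0.09 epochs followed by cosine decay to zero, with `convert_to_iter_based=True` re-expressing the epoch boundaries in iterations. Below is a plain-Python sketch of the implied learning-rate curve, using the standard LinearLR/CosineAnnealingLR formulas evaluated continuously (MMEngine applies them per iteration).

import math

lr, start_factor = 2e-4, 1e-5
warmup_ratio, max_epochs, eta_min = 0.03, 3, 0.0
warmup_end = warmup_ratio * max_epochs  # 0.09 epochs of linear warmup

def lr_at(epoch):
    if epoch <= warmup_end:
        # LinearLR: scale lr by a factor ramping from start_factor to 1
        factor = start_factor + (1 - start_factor) * (epoch / warmup_end)
        return lr * factor
    # CosineAnnealingLR: decay from lr to eta_min over the remaining epochs
    progress = (epoch - warmup_end) / (max_epochs - warmup_end)
    return eta_min + 0.5 * (lr - eta_min) * (1 + math.cos(math.pi * progress))

for epoch in (0.0, warmup_end, 1.0, 2.0, max_epochs):
    print(f'epoch {epoch:.2f}: lr = {lr_at(epoch):.3e}')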
xtuner/configs/baichuan/baichuan2_7b_base/baichuan2_7b_base_qlora_arxiv_gentitle_e3.py ADDED
@@ -0,0 +1,247 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import arxiv_map_fn, template_map_fn_factory
16
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
17
+ VarlenAttnArgsToMessageHubHook)
18
+ from xtuner.engine.runner import TrainLoop
19
+ from xtuner.model import SupervisedFinetune
20
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
21
+
22
+ #######################################################################
23
+ # PART 1 Settings #
24
+ #######################################################################
25
+ # Model
26
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-7B-Base'
27
+ use_varlen_attn = False
28
+
29
+ # Data
30
+ # 1. Download data from https://kaggle.com/datasets/Cornell-University/arxiv
31
+ # 2. Process data by `xtuner preprocess arxiv ${DOWNLOADED_DATA} ./data/arxiv_data.json [optional arguments]` # noqa: E501
32
+ data_path = './data/arxiv_data.json'
33
+ prompt_template = PROMPT_TEMPLATE.default
34
+ max_length = 2048
35
+ pack_to_max_length = True
36
+
37
+ # Scheduler & Optimizer
38
+ batch_size = 1 # per_device
39
+ accumulative_counts = 16
40
+ dataloader_num_workers = 0
41
+ max_epochs = 3
42
+ optim_type = AdamW
43
+ lr = 2e-4
44
+ betas = (0.9, 0.999)
45
+ weight_decay = 0
46
+ max_norm = 1 # grad clip
47
+ warmup_ratio = 0.03
48
+
49
+ # Save
50
+ save_steps = 500
51
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
52
+
53
+ # Evaluate the generation performance during the training
54
+ evaluation_freq = 500
55
+ SYSTEM = SYSTEM_TEMPLATE.arxiv_gentile
56
+ evaluation_inputs = [
57
+ ('We present InternLM, a multilingual foundational language '
58
+ 'model with 104B parameters. InternLM is pre-trained on a large '
59
+ 'corpora with 1.6T tokens with a multi-phase progressive '
60
+ 'process, and then fine-tuned to align with human preferences. '
61
+ 'We also developed a training system called Uniscale-LLM for '
62
+ 'efficient large language model training. The evaluation on a '
63
+ 'number of benchmarks shows that InternLM achieves '
64
+ 'state-of-the-art performance in multiple aspects, including '
65
+ 'knowledge understanding, reading comprehension, mathematics, '
66
+ 'and coding. With such well-rounded capabilities, InternLM '
67
+ 'achieves outstanding performances on comprehensive exams, '
68
+ 'including MMLU, AGIEval, C-Eval and GAOKAO-Bench, without '
69
+ 'resorting to external tools. On these benchmarks, InternLM '
70
+ 'not only significantly outperforms open-source models, but '
71
+ 'also obtains superior performance compared to ChatGPT. Also, '
72
+ 'InternLM demonstrates excellent capability of understanding '
73
+ 'Chinese language and Chinese culture, which makes it a '
74
+ 'suitable foundation model to support Chinese-oriented language '
75
+ 'applications. This manuscript gives a detailed study of '
76
+ 'our results, with benchmarks and examples across a diverse '
77
+ 'set of knowledge domains and tasks.'),
78
+ ('In this work, we develop and release Llama 2, a collection of '
79
+ 'pretrained and fine-tuned large language models (LLMs) ranging '
80
+ 'in scale from 7 billion to 70 billion parameters.\nOur '
81
+ 'fine-tuned LLMs, called LLAMA 2-CHAT, are optimized for '
82
+ 'dialogue use cases. Our models outperform open-source chat '
83
+ 'models on most benchmarks we tested, and based on our human '
84
+ 'evaluations for helpfulness and safety, may be a suitable '
85
+ 'substitute for closedsource models. We provide a detailed '
86
+ 'description of our approach to fine-tuning and safety '
87
+ 'improvements of LLAMA 2-CHAT in order to enable the community '
88
+ 'to build on our work and contribute to the responsible '
89
+ 'development of LLMs.')
90
+ ]
91
+
92
+ #######################################################################
93
+ # PART 2 Model & Tokenizer #
94
+ #######################################################################
95
+ tokenizer = dict(
96
+ type=AutoTokenizer.from_pretrained,
97
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
98
+ trust_remote_code=True,
99
+ padding_side='right')
100
+
101
+ model = dict(
102
+ type=SupervisedFinetune,
103
+ use_varlen_attn=use_varlen_attn,
104
+ llm=dict(
105
+ type=AutoModelForCausalLM.from_pretrained,
106
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
107
+ trust_remote_code=True,
108
+ torch_dtype=torch.float16,
109
+ quantization_config=dict(
110
+ type=BitsAndBytesConfig,
111
+ load_in_4bit=True,
112
+ load_in_8bit=False,
113
+ llm_int8_threshold=6.0,
114
+ llm_int8_has_fp16_weight=False,
115
+ bnb_4bit_compute_dtype=torch.float16,
116
+ bnb_4bit_use_double_quant=True,
117
+ bnb_4bit_quant_type='nf4')),
118
+ lora=dict(
119
+ type=LoraConfig,
120
+ r=64,
121
+ lora_alpha=16,
122
+ lora_dropout=0.1,
123
+ bias='none',
124
+ task_type='CAUSAL_LM'))
125
+
126
+ #######################################################################
127
+ # PART 3 Dataset & Dataloader #
128
+ #######################################################################
129
+ train_dataset = dict(
130
+ type=process_hf_dataset,
131
+ dataset=dict(
132
+ type=load_dataset, path='json', data_files=dict(train=data_path)),
133
+ tokenizer=tokenizer,
134
+ max_length=max_length,
135
+ dataset_map_fn=arxiv_map_fn,
136
+ template_map_fn=dict(
137
+ type=template_map_fn_factory, template=prompt_template),
138
+ remove_unused_columns=True,
139
+ shuffle_before_pack=True,
140
+ pack_to_max_length=pack_to_max_length,
141
+ use_varlen_attn=use_varlen_attn)
142
+
143
+ train_dataloader = dict(
144
+ batch_size=batch_size,
145
+ num_workers=dataloader_num_workers,
146
+ dataset=train_dataset,
147
+ sampler=dict(type=DefaultSampler, shuffle=True),
148
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
149
+
150
+ #######################################################################
151
+ # PART 4 Scheduler & Optimizer #
152
+ #######################################################################
153
+ # optimizer
154
+ optim_wrapper = dict(
155
+ type=AmpOptimWrapper,
156
+ optimizer=dict(
157
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
158
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
159
+ accumulative_counts=accumulative_counts,
160
+ loss_scale='dynamic',
161
+ dtype='float16')
162
+
163
+ # learning policy
164
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
165
+ param_scheduler = [
166
+ dict(
167
+ type=LinearLR,
168
+ start_factor=1e-5,
169
+ by_epoch=True,
170
+ begin=0,
171
+ end=warmup_ratio * max_epochs,
172
+ convert_to_iter_based=True),
173
+ dict(
174
+ type=CosineAnnealingLR,
175
+ eta_min=0.0,
176
+ by_epoch=True,
177
+ begin=warmup_ratio * max_epochs,
178
+ end=max_epochs,
179
+ convert_to_iter_based=True)
180
+ ]
181
+
182
+ # train, val, test setting
183
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
184
+
185
+ #######################################################################
186
+ # PART 5 Runtime #
187
+ #######################################################################
188
+ # Log the dialogue periodically during the training process, optional
189
+ custom_hooks = [
190
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
191
+ dict(
192
+ type=EvaluateChatHook,
193
+ tokenizer=tokenizer,
194
+ every_n_iters=evaluation_freq,
195
+ evaluation_inputs=evaluation_inputs,
196
+ system=SYSTEM,
197
+ prompt_template=prompt_template)
198
+ ]
199
+
200
+ if use_varlen_attn:
201
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
202
+
203
+ # configure default hooks
204
+ default_hooks = dict(
205
+ # record the time of every iteration.
206
+ timer=dict(type=IterTimerHook),
207
+ # print log every 10 iterations.
208
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
209
+ # enable the parameter scheduler.
210
+ param_scheduler=dict(type=ParamSchedulerHook),
211
+ # save a checkpoint every `save_steps` iterations.
212
+ checkpoint=dict(
213
+ type=CheckpointHook,
214
+ by_epoch=False,
215
+ interval=save_steps,
216
+ max_keep_ckpts=save_total_limit),
217
+ # set sampler seed in distributed environment.
218
+ sampler_seed=dict(type=DistSamplerSeedHook),
219
+ )
220
+
221
+ # configure environment
222
+ env_cfg = dict(
223
+ # whether to enable cudnn benchmark
224
+ cudnn_benchmark=False,
225
+ # set multi process parameters
226
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
227
+ # set distributed parameters
228
+ dist_cfg=dict(backend='nccl'),
229
+ )
230
+
231
+ # set visualizer
232
+ visualizer = None
233
+
234
+ # set log level
235
+ log_level = 'INFO'
236
+
237
+ # load from which checkpoint
238
+ load_from = None
239
+
240
+ # whether to resume training from the loaded checkpoint
241
+ resume = False
242
+
243
+ # Defaults to a random seed and disables `deterministic`
244
+ randomness = dict(seed=None, deterministic=False)
245
+
246
+ # set log processor
247
+ log_processor = dict(by_epoch=False)
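Unlike the Hub datasets used elsewhere in this folder, this config reads a locally pre-processed file: the Kaggle dump has to be converted first with the `xtuner preprocess arxiv` command quoted in the comments above. After that, the `dataset` dict boils down to a plain `datasets.load_dataset` call; a stand-alone sketch for sanity-checking the file, using the same default path as the config:

from datasets import load_dataset

# Load the pre-processed arXiv JSON exactly as the config's dataset dict does.
ds = load_dataset('json', data_files=dict(train='./data/arxiv_data.json'))
print(ds['train'].column_names)  # the fields that arxiv_map_fn will consume
print(ds['train'][0])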
xtuner/configs/baichuan/baichuan2_7b_base/baichuan2_7b_base_qlora_code_alpaca_e3.py ADDED
@@ -0,0 +1,216 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ from datasets import load_dataset
4
+ from mmengine.dataset import DefaultSampler
5
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
6
+ LoggerHook, ParamSchedulerHook)
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from peft import LoraConfig
9
+ from torch.optim import AdamW
10
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
11
+ BitsAndBytesConfig)
12
+
13
+ from xtuner.dataset import process_hf_dataset
14
+ from xtuner.dataset.collate_fns import default_collate_fn
15
+ from xtuner.dataset.map_fns import code_alpaca_map_fn, template_map_fn_factory
16
+ from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
17
+ VarlenAttnArgsToMessageHubHook)
18
+ from xtuner.engine.runner import TrainLoop
19
+ from xtuner.model import SupervisedFinetune
20
+ from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
21
+
22
+ #######################################################################
23
+ # PART 1 Settings #
24
+ #######################################################################
25
+ # Model
26
+ pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-7B-Base'
27
+ use_varlen_attn = False
28
+
29
+ # Data
30
+ data_path = 'HuggingFaceH4/CodeAlpaca_20K'
31
+ prompt_template = PROMPT_TEMPLATE.default
32
+ max_length = 2048
33
+ pack_to_max_length = True
34
+
35
+ # Scheduler & Optimizer
36
+ batch_size = 1 # per_device
37
+ accumulative_counts = 16
38
+ dataloader_num_workers = 0
39
+ max_epochs = 3
40
+ optim_type = AdamW
41
+ lr = 2e-4
42
+ betas = (0.9, 0.999)
43
+ weight_decay = 0
44
+ max_norm = 1 # grad clip
45
+ warmup_ratio = 0.03
46
+
47
+ # Save
48
+ save_steps = 500
49
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
50
+
51
+ # Evaluate the generation performance during the training
52
+ evaluation_freq = 100
53
+ SYSTEM = SYSTEM_TEMPLATE.coder
54
+ evaluation_inputs = [
55
+ ('写一个Python函数,将十六进制颜色代码(如#0066ee)转换为对应的'
56
+ '红、绿、蓝(RGB)三个颜色分量值,并以元组的形式返回。'),
57
+ ('Write a Python function that takes a hexadecimal color code '
58
+ '(e.g., #0066ee) as input and converts it into the corresponding '
59
+ 'red, green, and blue (RGB) color component values.')
60
+ ]
61
+
62
+ #######################################################################
63
+ # PART 2 Model & Tokenizer #
64
+ #######################################################################
65
+ tokenizer = dict(
66
+ type=AutoTokenizer.from_pretrained,
67
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
68
+ trust_remote_code=True,
69
+ padding_side='right')
70
+
71
+ model = dict(
72
+ type=SupervisedFinetune,
73
+ use_varlen_attn=use_varlen_attn,
74
+ llm=dict(
75
+ type=AutoModelForCausalLM.from_pretrained,
76
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
77
+ trust_remote_code=True,
78
+ torch_dtype=torch.float16,
79
+ quantization_config=dict(
80
+ type=BitsAndBytesConfig,
81
+ load_in_4bit=True,
82
+ load_in_8bit=False,
83
+ llm_int8_threshold=6.0,
84
+ llm_int8_has_fp16_weight=False,
85
+ bnb_4bit_compute_dtype=torch.float16,
86
+ bnb_4bit_use_double_quant=True,
87
+ bnb_4bit_quant_type='nf4')),
88
+ lora=dict(
89
+ type=LoraConfig,
90
+ r=64,
91
+ lora_alpha=16,
92
+ lora_dropout=0.1,
93
+ bias='none',
94
+ task_type='CAUSAL_LM'))
95
+
96
+ #######################################################################
97
+ # PART 3 Dataset & Dataloader #
98
+ #######################################################################
99
+ train_dataset = dict(
100
+ type=process_hf_dataset,
101
+ dataset=dict(type=load_dataset, path=data_path),
102
+ tokenizer=tokenizer,
103
+ max_length=max_length,
104
+ dataset_map_fn=code_alpaca_map_fn,
105
+ template_map_fn=dict(
106
+ type=template_map_fn_factory, template=prompt_template),
107
+ remove_unused_columns=True,
108
+ shuffle_before_pack=True,
109
+ pack_to_max_length=pack_to_max_length,
110
+ use_varlen_attn=use_varlen_attn)
111
+
112
+ train_dataloader = dict(
113
+ batch_size=batch_size,
114
+ num_workers=dataloader_num_workers,
115
+ dataset=train_dataset,
116
+ sampler=dict(type=DefaultSampler, shuffle=True),
117
+ collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
118
+
119
+ #######################################################################
120
+ # PART 4 Scheduler & Optimizer #
121
+ #######################################################################
122
+ # optimizer
123
+ optim_wrapper = dict(
124
+ type=AmpOptimWrapper,
125
+ optimizer=dict(
126
+ type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
127
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
128
+ accumulative_counts=accumulative_counts,
129
+ loss_scale='dynamic',
130
+ dtype='float16')
131
+
132
+ # learning policy
133
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
134
+ param_scheduler = [
135
+ dict(
136
+ type=LinearLR,
137
+ start_factor=1e-5,
138
+ by_epoch=True,
139
+ begin=0,
140
+ end=warmup_ratio * max_epochs,
141
+ convert_to_iter_based=True),
142
+ dict(
143
+ type=CosineAnnealingLR,
144
+ eta_min=0.0,
145
+ by_epoch=True,
146
+ begin=warmup_ratio * max_epochs,
147
+ end=max_epochs,
148
+ convert_to_iter_based=True)
149
+ ]
150
+
151
+ # train, val, test setting
152
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
153
+
154
+ #######################################################################
155
+ # PART 5 Runtime #
156
+ #######################################################################
157
+ # Log the dialogue periodically during the training process, optional
158
+ custom_hooks = [
159
+ dict(type=DatasetInfoHook, tokenizer=tokenizer),
160
+ dict(
161
+ type=EvaluateChatHook,
162
+ tokenizer=tokenizer,
163
+ every_n_iters=evaluation_freq,
164
+ evaluation_inputs=evaluation_inputs,
165
+ system=SYSTEM,
166
+ prompt_template=prompt_template)
167
+ ]
168
+
169
+ if use_varlen_attn:
170
+ custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
171
+
172
+ # configure default hooks
173
+ default_hooks = dict(
174
+ # record the time of every iteration.
175
+ timer=dict(type=IterTimerHook),
176
+ # print log every 10 iterations.
177
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
178
+ # enable the parameter scheduler.
179
+ param_scheduler=dict(type=ParamSchedulerHook),
180
+ # save a checkpoint every `save_steps` iterations.
181
+ checkpoint=dict(
182
+ type=CheckpointHook,
183
+ by_epoch=False,
184
+ interval=save_steps,
185
+ max_keep_ckpts=save_total_limit),
186
+ # set sampler seed in distributed environment.
187
+ sampler_seed=dict(type=DistSamplerSeedHook),
188
+ )
189
+
190
+ # configure environment
191
+ env_cfg = dict(
192
+ # whether to enable cudnn benchmark
193
+ cudnn_benchmark=False,
194
+ # set multi process parameters
195
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
196
+ # set distributed parameters
197
+ dist_cfg=dict(backend='nccl'),
198
+ )
199
+
200
+ # set visualizer
201
+ visualizer = None
202
+
203
+ # set log level
204
+ log_level = 'INFO'
205
+
206
+ # load from which checkpoint
207
+ load_from = None
208
+
209
+ # whether to resume training from the loaded checkpoint
210
+ resume = False
211
+
212
+ # Defaults to a random seed and disables `deterministic`
213
+ randomness = dict(seed=None, deterministic=False)
214
+
215
+ # set log processor
216
+ log_processor = dict(by_epoch=False)
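As a last sanity check before training, it can help to confirm that the raw dataset's columns match what the chosen map function (`code_alpaca_map_fn` here) expects. A minimal inspection sketch using only the `datasets` call the config wraps; the split names printed depend on the Hub dataset itself:

from datasets import load_dataset

# Inspect the raw CodeAlpaca dataset referenced by data_path above.
ds = load_dataset('HuggingFaceH4/CodeAlpaca_20K')
print(ds)  # available splits and row counts
split = next(iter(ds))  # whichever split comes first
print(ds[split].column_names)
print(ds[split][0])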