akswelh commited on
Commit
d90b3a8
·
verified ·
1 Parent(s): dbd7285

Upload 251 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .clang-format +155 -0
  2. .dockerignore +1 -0
  3. .gitattributes +1 -0
  4. .github/CODEOWNERS +1 -0
  5. .github/ISSUE_TEMPLATE/bug_report.md +34 -0
  6. .github/ISSUE_TEMPLATE/feature_request.md +20 -0
  7. .github/workflows/.cpu_ci_on_pr.yml +19 -0
  8. .github/workflows/coverity_scan.yml +61 -0
  9. .github/workflows/cpu_ci.yml +34 -0
  10. .github/workflows/cpu_ci_dispatch.yml +20 -0
  11. .github/workflows/docker_build.yml +50 -0
  12. .github/workflows/pull_request.yml +60 -0
  13. .gitignore +157 -0
  14. .pre-commit-config.yaml +40 -0
  15. CITATION.cff +79 -0
  16. CONTRIBUTING.md +86 -0
  17. Dockerfile +90 -0
  18. LICENSE +467 -0
  19. MANIFEST.in +2 -0
  20. README-MUP.md +49 -0
  21. README.md +863 -0
  22. configs/1-3B.yml +93 -0
  23. configs/125M-dmoe.yml +101 -0
  24. configs/125M-json.yml +80 -0
  25. configs/125M-moe.yml +101 -0
  26. configs/125M.yml +96 -0
  27. configs/13B.yml +94 -0
  28. configs/175B.yml +92 -0
  29. configs/19M.yml +97 -0
  30. configs/2-7B.yml +93 -0
  31. configs/20B.yml +113 -0
  32. configs/350M.yml +92 -0
  33. configs/49M.yml +93 -0
  34. configs/6-7B.yml +93 -0
  35. configs/760M.yml +93 -0
  36. configs/800M.yml +86 -0
  37. configs/README.md +368 -0
  38. configs/autotuning_configs/small_tune.json +78 -0
  39. configs/autotuning_configs/tune.json +72 -0
  40. configs/autotuning_configs/tune_1-3B.json +86 -0
  41. configs/autotuning_configs/tune_6-7B.json +77 -0
  42. configs/bf16_125M.yml +80 -0
  43. configs/bnb_125M.yml +87 -0
  44. configs/cpu_mock_config.yml +5 -0
  45. configs/docker/pythia-paths.yml +12 -0
  46. configs/eleutherai_cluster.yml +29 -0
  47. configs/finetuning_configs/6-9B.yml +89 -0
  48. configs/gen_docs.py +96 -0
  49. configs/gmlp_small.yml +72 -0
  50. configs/llama/13B.yml +26 -0
.clang-format ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ # Refer to the following link for the explanation of each params:
3
+ # http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html
4
+ Language: Cpp
5
+ # BasedOnStyle: Google
6
+ AccessModifierOffset: -4
7
+ AlignAfterOpenBracket: Align
8
+ AlignConsecutiveAssignments: false
9
+ AlignConsecutiveDeclarations: false
10
+ AlignEscapedNewlines: Left
11
+ AlignOperands: true
12
+ AlignTrailingComments: true
13
+ AllowAllParametersOfDeclarationOnNextLine: false
14
+ AllowShortBlocksOnASingleLine: true
15
+ AllowShortCaseLabelsOnASingleLine: true
16
+ AllowShortFunctionsOnASingleLine: All
17
+ AllowShortIfStatementsOnASingleLine: true
18
+ AllowShortLoopsOnASingleLine: true
19
+ # This is deprecated
20
+ AlwaysBreakAfterDefinitionReturnType: None
21
+ AlwaysBreakAfterReturnType: None
22
+ AlwaysBreakBeforeMultilineStrings: true
23
+ AlwaysBreakTemplateDeclarations: true
24
+ BinPackArguments: false
25
+ BinPackParameters: false
26
+ BraceWrapping:
27
+ AfterClass: false
28
+ AfterControlStatement: false
29
+ AfterEnum: false
30
+ AfterFunction: false
31
+ AfterNamespace: false
32
+ AfterObjCDeclaration: false
33
+ AfterStruct: false
34
+ AfterUnion: false
35
+ AfterExternBlock: false
36
+ BeforeCatch: false
37
+ BeforeElse: false
38
+ IndentBraces: false
39
+ # disabling the below splits, else, they'll just add to the vertical length of source files!
40
+ SplitEmptyFunction: false
41
+ SplitEmptyRecord: false
42
+ SplitEmptyNamespace: false
43
+ BreakBeforeBinaryOperators: None
44
+ BreakBeforeBraces: WebKit
45
+ BreakBeforeInheritanceComma: false
46
+ BreakInheritanceList: BeforeColon
47
+ BreakBeforeTernaryOperators: true
48
+ BreakConstructorInitializersBeforeComma: false
49
+ BreakConstructorInitializers: BeforeColon
50
+ BreakAfterJavaFieldAnnotations: false
51
+ BreakStringLiterals: true
52
+ ColumnLimit: 100
53
+ CommentPragmas: '^ IWYU pragma:'
54
+ CompactNamespaces: false
55
+ ConstructorInitializerAllOnOneLineOrOnePerLine: true
56
+ # Kept the below 2 to be the same as `IndentWidth` to keep everything uniform
57
+ ConstructorInitializerIndentWidth: 4
58
+ ContinuationIndentWidth: 4
59
+ Cpp11BracedListStyle: true
60
+ DerivePointerAlignment: false
61
+ DisableFormat: false
62
+ ExperimentalAutoDetectBinPacking: false
63
+ FixNamespaceComments: true
64
+ ForEachMacros:
65
+ - foreach
66
+ - Q_FOREACH
67
+ - BOOST_FOREACH
68
+ IncludeBlocks: Preserve
69
+ IncludeCategories:
70
+ - Regex: '^<ext/.*\.h>'
71
+ Priority: 2
72
+ - Regex: '^<.*\.h>'
73
+ Priority: 1
74
+ - Regex: '^<.*'
75
+ Priority: 2
76
+ - Regex: '.*'
77
+ Priority: 3
78
+ IncludeIsMainRegex: '([-_](test|unittest))?$'
79
+ IndentCaseLabels: true
80
+ IndentPPDirectives: None
81
+ IndentWidth: 4
82
+ IndentWrappedFunctionNames: false
83
+ JavaScriptQuotes: Leave
84
+ JavaScriptWrapImports: true
85
+ KeepEmptyLinesAtTheStartOfBlocks: false
86
+ MacroBlockBegin: ''
87
+ MacroBlockEnd: ''
88
+ MaxEmptyLinesToKeep: 1
89
+ NamespaceIndentation: None
90
+ ObjCBinPackProtocolList: Never
91
+ ObjCBlockIndentWidth: 4
92
+ ObjCSpaceAfterProperty: false
93
+ ObjCSpaceBeforeProtocolList: true
94
+ PenaltyBreakAssignment: 4
95
+ PenaltyBreakBeforeFirstCallParameter: 1
96
+ PenaltyBreakComment: 300
97
+ PenaltyBreakFirstLessLess: 120
98
+ PenaltyBreakString: 1000
99
+ PenaltyBreakTemplateDeclaration: 10
100
+ PenaltyExcessCharacter: 1000000
101
+ PenaltyReturnTypeOnItsOwnLine: 200
102
+ PointerAlignment: Left
103
+ RawStringFormats:
104
+ - Language: Cpp
105
+ Delimiters:
106
+ - cc
107
+ - CC
108
+ - cpp
109
+ - Cpp
110
+ - CPP
111
+ - 'c++'
112
+ - 'C++'
113
+ CanonicalDelimiter: ''
114
+ - Language: TextProto
115
+ Delimiters:
116
+ - pb
117
+ - PB
118
+ - proto
119
+ - PROTO
120
+ EnclosingFunctions:
121
+ - EqualsProto
122
+ - EquivToProto
123
+ - PARSE_PARTIAL_TEXT_PROTO
124
+ - PARSE_TEST_PROTO
125
+ - PARSE_TEXT_PROTO
126
+ - ParseTextOrDie
127
+ - ParseTextProtoOrDie
128
+ CanonicalDelimiter: ''
129
+ BasedOnStyle: google
130
+ # Enabling comment reflow causes doxygen comments to be messed up in their formats!
131
+ ReflowComments: true
132
+ SortIncludes: true
133
+ SortUsingDeclarations: true
134
+ SpaceAfterCStyleCast: false
135
+ SpaceAfterTemplateKeyword: true
136
+ SpaceBeforeAssignmentOperators: true
137
+ SpaceBeforeCpp11BracedList: false
138
+ SpaceBeforeCtorInitializerColon: true
139
+ SpaceBeforeInheritanceColon: true
140
+ SpaceBeforeParens: ControlStatements
141
+ SpaceBeforeRangeBasedForLoopColon: true
142
+ SpaceInEmptyParentheses: false
143
+ SpacesBeforeTrailingComments: 2
144
+ SpacesInAngles: false
145
+ SpacesInContainerLiterals: true
146
+ SpacesInCStyleCastParentheses: false
147
+ SpacesInParentheses: false
148
+ SpacesInSquareBrackets: false
149
+ Standard: Cpp11
150
+ StatementMacros:
151
+ - Q_UNUSED
152
+ - QT_REQUIRE_VERSION
153
+ # Be consistent with indent-width, even for people who use tab for indentation!
154
+ TabWidth: 4
155
+ UseTab: Never
.dockerignore ADDED
@@ -0,0 +1 @@
 
 
1
+ 20B_checkpoints/
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ images/memory_profiling.png filter=lfs diff=lfs merge=lfs -text
.github/CODEOWNERS ADDED
@@ -0,0 +1 @@
 
 
1
+ * @Quentin-Anthony
.github/ISSUE_TEMPLATE/bug_report.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: Bug report
3
+ about: Create a report to help us improve
4
+ title: ''
5
+ labels: bug
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Describe the bug**
11
+ A clear and concise description of what the bug is.
12
+
13
+ **To Reproduce**
14
+ Steps to reproduce the behavior:
15
+ 1. Go to '...'
16
+ 2. Click on '....'
17
+ 3. Scroll down to '....'
18
+ 4. See error
19
+
20
+ **Expected behavior**
21
+ A clear and concise description of what you expected to happen.
22
+
23
+ **Proposed solution**
24
+ If you have an idea for how we can fix this problem, describe it here.
25
+
26
+ **Screenshots**
27
+ If applicable, add screenshots to help explain your problem.
28
+
29
+ **Environment (please complete the following information):**
30
+ - GPUs:
31
+ - Configs:
32
+
33
+ **Additional context**
34
+ Add any other context about the problem here.
.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: Feature request
3
+ about: Suggest an idea for this project
4
+ title: ''
5
+ labels: feature request
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Is your feature request related to a problem? Please describe.**
11
+ A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12
+
13
+ **Describe the solution you'd like**
14
+ A clear and concise description of what you want to happen.
15
+
16
+ **Describe alternatives you've considered**
17
+ A clear and concise description of any alternative solutions or features you've considered.
18
+
19
+ **Additional context**
20
+ Add any other context or screenshots about the feature request here.
.github/workflows/.cpu_ci_on_pr.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is hidden (.cpu_ci_on_pr.yml) to minimize the number of runner minutes consumed.
2
+
3
+ name: "Pull Request CPU Tests"
4
+
5
+ on:
6
+ pull_request:
7
+ paths: # job only triggers when the PR changes files under megatron directory
8
+ - "megatron/**"
9
+
10
+ jobs:
11
+ run-tests:
12
+ runs-on: ubuntu-22.04 # ubuntu-latest currently points to ubuntu-22.04 but 24.04 is in beta - recommend testing on 24.04 and then changing instead of using ubuntu-latest
13
+ steps:
14
+ - name: Checkout Repository
15
+ uses: actions/checkout@v4
16
+ - name: Run CPU tests
17
+ uses: ./tests/cpu_tests
18
+ with:
19
+ target_test_ref: ${{ github.event.pull_request.base.sha }}
.github/workflows/coverity_scan.yml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Coverity
2
+ on:
3
+ workflow_dispatch:
4
+ inputs:
5
+ build_version:
6
+ description: "Version of GPT-NeoX being submitted for scan"
7
+ required: false
8
+ default: "GPT-NeoX build version"
9
+ build_description:
10
+ description: "Description of the current build"
11
+ required: false
12
+ default: "Current build of GPT-NeoX"
13
+
14
+ jobs:
15
+ coverity:
16
+
17
+ runs-on: ubuntu-latest
18
+
19
+ env:
20
+ COV_USER: ${{ secrets.COV_USER }} # needs to be an email with access to the Coverity stream - add to secrets/actions
21
+ COVERITY_PROJECT: ${{ secrets.COVERITY_PROJECT }}
22
+ COVERITY_TOKEN: ${{ secrets.COVERITY_TOKEN }} # you can get this token from Coverity stream dashboard:
23
+ # https://scan.coverity.com/projects/<project>?tab=project_settings
24
+
25
+ steps:
26
+ - uses: actions/checkout@v2
27
+ with:
28
+ path: gpt-neox
29
+
30
+ - name: Install utils
31
+ run: |
32
+ sudo apt update -y && sudo apt upgrade -y
33
+ sudo apt install curl jq wget -y
34
+
35
+ - name: Coverity Download
36
+ run: |
37
+ wget https://scan.coverity.com/download/linux64 --post-data "token=$COVERITY_TOKEN&project=$COVERITY_PROJECT" -O coverity_tool.tgz --no-verbose
38
+ mkdir $GITHUB_WORKSPACE/coverity && tar xvf coverity_tool.tgz -C $GITHUB_WORKSPACE/coverity --strip-components=1
39
+ $GITHUB_WORKSPACE/coverity/bin/cov-configure --python
40
+ $GITHUB_WORKSPACE/coverity/bin/cov-configure --gcc
41
+
42
+ - name: Coverity Scan and Upload
43
+ run: |
44
+ set -x
45
+ pushd $GITHUB_WORKSPACE
46
+ cd $GITHUB_WORKSPACE/gpt-neox
47
+ $GITHUB_WORKSPACE/coverity/bin/cov-build --dir $GITHUB_WORKSPACE/cov-int --no-command --fs-capture-search ./
48
+ popd
49
+ tar caf build-results.bz2 cov-int
50
+ curl --form token=$COVERITY_TOKEN \
51
+ --form email=$COV_USER \
52
+ --form file=@build-results.bz2 \
53
+ --form version="${{ inputs.build_version }}" \
54
+ --form description="${{ inputs.build_description }}" \
55
+ https://scan.coverity.com/builds?project=$COVERITY_PROJECT
56
+
57
+ - name: Upload Scan Build as Artifact
58
+ uses: actions/upload-artifact@v3
59
+ with:
60
+ name: coverity-build-${{ github.sha }}
61
+ path: build-results.bz2
.github/workflows/cpu_ci.yml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "Run CPU Tests"
2
+
3
+ on: "push"
4
+
5
+ jobs:
6
+ run-tests:
7
+ #runs-on: ubuntu-latest
8
+ runs-on: ubuntu-22.04
9
+ steps:
10
+ - uses: actions/checkout@v3
11
+
12
+ - name: Install Python
13
+ uses: actions/setup-python@v4
14
+ with:
15
+ python-version: "3.8"
16
+ cache: "pip"
17
+ cache-dependency-path: "**/requirements*.txt"
18
+
19
+ - name: Upgrade Pip
20
+ run: python -m pip install --upgrade pip
21
+
22
+ - name: Install Dependencies
23
+ run: |
24
+ sudo apt-get install libopenmpi-dev -y
25
+ pip install torch==1.8.2 torchvision==0.9.2 torchaudio==0.8.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cpu
26
+ pip install -r requirements/requirements.txt
27
+ pip install -r requirements/requirements-dev.txt
28
+ pip install -r requirements/requirements-wandb.txt
29
+
30
+ - name: Prepare Data
31
+ run: python prepare_data.py
32
+
33
+ - name: Run CPU Tests
34
+ run: PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python pytest tests -m cpu
.github/workflows/cpu_ci_dispatch.yml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "Workflow Dispatch CPU Tests"
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ inputs:
6
+ ref:
7
+ description: 'Target ref / SHA to run tests against'
8
+ required: true
9
+ default: 'main'
10
+
11
+ jobs:
12
+ run-tests:
13
+ runs-on: ubuntu-22.04
14
+ steps:
15
+ - name: Checkout Repository
16
+ uses: actions/checkout@v4
17
+ - name: Run CPU tests
18
+ uses: ./tests/cpu_tests
19
+ with:
20
+ target_test_ref: ${{ inputs.ref }}
.github/workflows/docker_build.yml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: docker_build
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - '**'
7
+
8
+ jobs:
9
+ main:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ -
13
+ name: Checkout
14
+ uses: actions/checkout@v2
15
+
16
+ -
17
+ name: Docker meta
18
+ id: docker_meta
19
+ uses: crazy-max/ghaction-docker-meta@v1
20
+ with:
21
+ images: leogao2/gpt-neox # list of Docker images to use as base name for tags
22
+ tag-sha: true # add git short SHA as Docker tag
23
+
24
+ -
25
+ name: Set up QEMU
26
+ uses: docker/setup-qemu-action@v1
27
+
28
+ -
29
+ name: Set up Docker Buildx
30
+ uses: docker/setup-buildx-action@v1
31
+
32
+ -
33
+ name: Login to DockerHub
34
+ uses: docker/login-action@v1
35
+ with:
36
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
37
+ password: ${{ secrets.DOCKERHUB_TOKEN }}
38
+
39
+ -
40
+ name: Build and push
41
+ id: docker_build
42
+ uses: docker/build-push-action@v2
43
+ with:
44
+ push: ${{ github.event_name != 'pull_request' }}
45
+ tags: ${{ steps.docker_meta.outputs.tags }}
46
+ labels: ${{ steps.docker_meta.outputs.labels }}
47
+
48
+ -
49
+ name: Image digest
50
+ run: echo ${{ steps.docker_build.outputs.digest }}
.github/workflows/pull_request.yml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Pull Request
2
+
3
+ #on: [pull_request, workflow_dispatch]
4
+ on: workflow_dispatch
5
+
6
+ jobs:
7
+ pre-commit:
8
+ runs-on: ubuntu-22.04
9
+ steps:
10
+ - uses: actions/checkout@v2
11
+ - uses: actions/setup-python@v4
12
+ with:
13
+ python-version: "3.10.14"
14
+ cache: "pip"
15
+ cache-dependency-path: "**/requirements*.txt"
16
+ # Need the right version of clang-format
17
+ - run: pip install -r requirements/requirements-dev.txt
18
+ - uses: pre-commit/[email protected]
19
+ -
20
+ name: Set up Docker Buildx
21
+ uses: docker/setup-buildx-action@v1
22
+ -
23
+ name: Docker build
24
+ id: docker_build
25
+ uses: docker/build-push-action@v2
26
+
27
+ update-documentation:
28
+ runs-on: ubuntu-22.04
29
+ steps:
30
+ - uses: actions/checkout@v3
31
+ with:
32
+ ref: ${{ github.event.pull_request.head.ref}}
33
+ - run: |
34
+ rm megatron/__init__.py
35
+ pip install shortuuid
36
+ rm megatron/neox_arguments/__init__.py
37
+ python configs/gen_docs.py
38
+ git config user.name github-actions
39
+ git config user.email [email protected]
40
+ git add configs/neox_arguments.md
41
+ git commit -m "Update NeoXArgs docs automatically"
42
+ git push
43
+ run-tests:
44
+ runs-on: ubuntu-22.04
45
+ steps:
46
+ - uses: actions/checkout@v2
47
+ - uses: actions/setup-python@v4
48
+ with:
49
+ python-version: "3.10.13"
50
+ cache-dependency-path: "**/requirements*.txt"
51
+ - name: prepare data
52
+ run: python3 prepare_data.py
53
+ - name: install pytest
54
+ run: python3 -m pip install pytest pytest-forked pyyaml requests wandb
55
+ - name: install torch
56
+ run: python3 -m pip install torch
57
+ - name: install requirements
58
+ run: pip install -r requirements/requirements.txt
59
+ - name: Run Tests
60
+ run: pytest --forked tests
.gitignore ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+ # wandb logs
132
+ wandb/
133
+
134
+ # data files
135
+ data/**/*.idx
136
+ data/**/*.bin
137
+ data/**/*.json*
138
+ data/**/*.txt
139
+ data/**/*.gz
140
+ data/**/*.zip
141
+ data/**/*.np*
142
+ data/**/*.npy
143
+ checkpoints/
144
+ .vscode/
145
+ *.pt
146
+ *.ckpt
147
+
148
+ #test logs
149
+ test_checkpoint/
150
+ test_logs/
151
+ logs/
152
+ tensorboard/
153
+ src/
154
+
155
+ # test data files
156
+ tests/data/*.bin
157
+ tests/data/*.idx
.pre-commit-config.yaml ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v4.1.0
4
+ hooks:
5
+ - id: check-case-conflict
6
+ - id: check-json
7
+ - id: check-symlinks
8
+ - id: check-yaml
9
+ - id: destroyed-symlinks
10
+ - id: end-of-file-fixer
11
+ exclude: ^(docs/CNAME/|configs/neox_arguments.md)
12
+ - id: fix-byte-order-marker
13
+ - id: fix-encoding-pragma
14
+ args: [--remove]
15
+ - id: mixed-line-ending
16
+ args: [--fix=lf]
17
+ - id: requirements-txt-fixer
18
+ - id: trailing-whitespace
19
+ exclude: ^(docs/CNAME/|configs/neox_arguments.md)
20
+ - repo: https://gitlab.com/daverona/pre-commit/cpp
21
+ rev: 0.8.0
22
+ hooks:
23
+ - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available
24
+ args: []
25
+
26
+ - repo: https://github.com/psf/black
27
+ rev: 22.3.0
28
+ hooks:
29
+ - id: black
30
+ language_version: python3
31
+ - repo: https://github.com/codespell-project/codespell
32
+ rev: v2.1.0
33
+ hooks:
34
+ - id: codespell
35
+ args: [
36
+ '--ignore-words-list=reord,dout,te', # Word used in error messages that need rewording. te --> transformerengine
37
+ --check-filenames,
38
+ --check-hidden,
39
+ ]
40
+ exclude: tests/data/hf_cache/tokenizer/gpt2.json
CITATION.cff ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # YAML 1.2
2
+ ---
3
+ authors:
4
+ - affiliation: EleutherAI
5
+ family-names: Andonian
6
+ given-names: Alex
7
+ - affiliation: EleutherAI
8
+ family-names: Anthony
9
+ given-names: Quentin
10
+ - affiliation: EleutherAI
11
+ family-names: Biderman
12
+ given-names: Stella
13
+ - affiliation: EleutherAI
14
+ family-names: Black
15
+ given-names: Sid
16
+ - affiliation: EleutherAI
17
+ family-names: Gali
18
+ given-names: Preetham
19
+ - affiliation: EleutherAI
20
+ family-names: Gao
21
+ given-names: Leo
22
+ - affiliation: EleutherAI
23
+ family-names: Hallahan
24
+ given-names: Eric
25
+ - affiliation: EleutherAI
26
+ family-names: Levy-Kramer
27
+ given-names: Josh
28
+ - affiliation: EleutherAI
29
+ family-names: Leahy
30
+ given-names: Connor
31
+ - affiliation: EleutherAI
32
+ family-names: Nestler
33
+ given-names: Lucas
34
+ - affiliation: EleutherAI
35
+ family-names: Parker
36
+ given-names: Kip
37
+ - affiliation: EleutherAI
38
+ family-names: Pieler
39
+ given-names: Michael
40
+ - affiliation: EleutherAI
41
+ family-names: Phang
42
+ given-names: Jason
43
+ - affiliation: EleutherAI
44
+ family-names: Purohit
45
+ given-names: Shivanshu
46
+ - affiliation: EleutherAI
47
+ family-names: Schoelkopf
48
+ given-names: Hailey
49
+ - affiliation: EleutherAI
50
+ family-names: Stander
51
+ given-names: Dashiell
52
+ - affiliation: EleutherAI
53
+ family-names: Songz
54
+ given-names: Tri
55
+ - affiliation: EleutherAI
56
+ family-names: Tigges
57
+ given-names: Curt
58
+ - affiliation: EleutherAI
59
+ family-names: Thérien
60
+ given-names: Benjamin
61
+ - affiliation: EleutherAI
62
+ family-names: Wang
63
+ given-names: Phil
64
+ - affiliation: EleutherAI
65
+ family-names: Weinbach
66
+ given-names: Samuel
67
+ cff-version: "1.1.0"
68
+ keywords:
69
+ - "Transformers"
70
+ - "Massive language model"
71
+ - "Autoregressive language model"
72
+ license: "Apache-2.0"
73
+ message: "If you use this software, please cite it using these metadata."
74
+ repository-code: "https://www.github.com/eleutherai/gpt-neox"
75
+ title: "GPT-NeoX: Large Scale Autoregressive Language Modeling in PyTorch"
76
+ version: "2.0.0"
77
+ doi: "10.5281/zenodo.5879544"
78
+ date-released: 2021-08-23
79
+ ...
CONTRIBUTING.md ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing
2
+ GPT-NeoX welcomes your contributions!
3
+
4
+ ## Prerequisites
5
+ GPT-NeoX uses [pre-commit](https://pre-commit.com/) to ensure that formatting is
6
+ consistent across GPT-NeoX. First, ensure that `pre-commit` is installed with
7
+ `pip install pre-commit`. Next, the pre-commit hooks must be installed once
8
+ before commits can be made:
9
+ ```bash
10
+ pre-commit install
11
+ ```
12
+ Please install `clang-format` from Conda:
13
+ ```bash
14
+ conda install clang-format
15
+ ```
16
+
17
+ Afterwards, our suite of formatting tests runs automatically before each `git commit`. You
18
+ can also run these manually:
19
+ ```bash
20
+ pre-commit run --all-files
21
+ ```
22
+ If a formatting test fails, it will fix the modified code in place and abort
23
+ the `git commit`. After looking over the changes, you can `git add <modified files>`
24
+ and then repeat the previous `git commit` command.
25
+
26
+
27
+ ## Testing
28
+ GPT-NeoX tracks two types of tests: unit tests and more costly model convergence tests.
29
+ Unit tests are found in `tests/unit/` and the model convergence tests are found in
30
+ `tests/model/`.
31
+
32
+ ### Unit Tests
33
+ [PyTest](https://docs.pytest.org/en/latest/) is used to execute tests. PyTest can be
34
+ installed from PyPI via `pip install pytest`. Simply invoke `pytest --forked` to run the
35
+ unit tests:
36
+ ```bash
37
+ pytest --forked tests/unit/
38
+ ```
39
+ You can also provide the `-v` flag to `pytest` to see additional information about the
40
+ tests. Note that [pytest-forked](https://github.com/pytest-dev/pytest-forked) and the
41
+ `--forked` flag are required to test CUDA functionality in distributed tests.
42
+
43
+ ### Model Tests
44
+ To execute model tests, first install GPT-NeoX. Next, execute the model test driver:
45
+ ```bash
46
+ cd tests/model/
47
+ pytest run_sanity_check.py
48
+ ```
49
+ Note that the `--forked` flag is not necessary for the model tests.
50
+
51
+ ## Contributor License Agreement
52
+ This project welcomes contributions and suggestions. Most contributions require you to
53
+ agree to a Contributor License Agreement (CLA) declaring that you have the right to, and
54
+ actually do, grant us the rights to use your contribution. For details, visit
55
+ https://cla-assistant.io/EleutherAI/gpt-neox.
56
+
57
+ When you submit a pull request, a CLA bot will automatically determine whether you need
58
+ to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply
59
+ follow the instructions provided by the bot. You will only need to do this once across
60
+ all repos using our CLA.
61
+
62
+ ## New Feature Contribution Guidelines
63
+ Unlike bug fixes or improvements to existing features (where users usually submit a PR directly and we review it), adding a new feature to GPT-NeoX requires several steps: (1) proposal and discussion, (2) implementation and verification, (3) release and maintenance. This general guideline applies to all new feature contributions. Core GPT-NeoX team member contributions may complete step 1 internally.
64
+
65
+ ### Step 1: Proposal and Discussion
66
+ We ask users to first post their intended feature in an issue. This issue needs to include:
67
+
68
+ * A description of the proposed feature.
69
+ * A motivation of why it will be useful to GPT-NeoX users.
70
+ * A rough design of how you would implement the feature inside GPT-NeoX.
71
+ * (Important) Results or planned experiments to demonstrate the effectiveness and correctness of the feature.
72
+ * If the feature only affects performance and does not affect training convergence, we require testing on a fraction of training to demonstrate that the training/validation loss are consistent with baseline, and that the performance is better than baseline.
73
+ * If the feature does affect training convergence, we require testing the whole training to demonstrate that the feature achieves better/on-par final model quality and training performance compared to baseline.
74
+
75
+ Based on the issue we shall discuss the merit of the new feature and decide whether to accept or decline the proposal. Once accepted and after we confirm the design and implementation plan, we are ready for step 2.
76
+
77
+ ### Step 2: Implementation and Verification
78
+ The contributor will proceed and implement the feature, and the GPT-NeoX team will provide guidance/help as needed. The required deliverables include:
79
+
80
+ * A PR to [EleutherAI/GPT-NeoX](https://github.com/EleutherAI/gpt-neox) including (1) the feature implementation (2) unit tests (3) documentation (4) example usage.
81
+ * In the implementation (code, documentation, tutorial), we require the feature author to record their GitHub username as a contact method for future questions/maintenance.
82
+
83
+ After receiving the PRs, we will review them and merge them after necessary tests/fixes.
84
+
85
+ ### Step 3: Release and Maintenance
86
+ After the PRs are merged, we will announce the feature on our website (with credit to the feature author). We ask the feature author to commit to the maintenance of the feature.
Dockerfile ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024, EleutherAI
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ FROM nvcr.io/nvidia/pytorch:24.02-py3
16
+
17
+ ENV DEBIAN_FRONTEND=noninteractive
18
+
19
+ # metainformation
20
+ LABEL org.opencontainers.image.version = "2.0"
21
+ LABEL org.opencontainers.image.authors = "[email protected]"
22
+ LABEL org.opencontainers.image.source = "https://www.github.com/eleutherai/gpt-neox"
23
+ LABEL org.opencontainers.image.licenses = " Apache-2.0"
24
+ LABEL org.opencontainers.image.base.name="nvcr.io/nvidia/pytorch:24.02-py3"
25
+
26
+ #### System package (uses default Python 3 version in Ubuntu 20.04)
27
+ RUN apt-get update -y && \
28
+ apt-get install -y \
29
+ python3-pip sudo pdsh \
30
+ htop tmux zstd software-properties-common \
31
+ nfs-common pdsh cmake htop iftop iotop ssh \
32
+ iputils-ping net-tools libcupti-dev libmlx4-1 infiniband-diags ibutils \
33
+ rdmacm-utils perftest rdma-core && \
34
+ update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
35
+ update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
36
+ python -m pip install --upgrade pip && \
37
+ python -m pip install gpustat
38
+
39
+ ### SSH
40
+ RUN mkdir /var/run/sshd && \
41
+ # Prevent user being kicked off after login
42
+ sed -i 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd && \
43
+ echo 'AuthorizedKeysFile .ssh/authorized_keys' >> /etc/ssh/sshd_config && \
44
+ echo 'PasswordAuthentication yes' >> /etc/ssh/sshd_config && \
45
+ # FIX SUDO BUG: https://github.com/sudo-project/sudo/issues/42
46
+ echo "Set disable_coredump false" >> /etc/sudo.conf
47
+
48
+ # Expose SSH port
49
+ EXPOSE 22
50
+
51
+ # Needs to be in docker PATH if compiling other items & bashrc PATH (later)
52
+ ENV PATH=/usr/local/mpi/bin:${PATH} \
53
+ LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
54
+
55
+ # Create a wrapper for OpenMPI to allow running as root by default
56
+ RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
57
+ echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
58
+ echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
59
+ chmod a+x /usr/local/mpi/bin/mpirun
60
+
61
+ #### User account
62
+ RUN useradd --create-home --uid 1000 --shell /bin/bash mchorse && \
63
+ usermod -aG sudo mchorse && \
64
+ echo "mchorse ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
65
+
66
+ ## SSH config and bashrc
67
+ RUN mkdir -p /home/mchorse/.ssh /job && \
68
+ echo 'Host *' > /home/mchorse/.ssh/config && \
69
+ echo ' StrictHostKeyChecking no' >> /home/mchorse/.ssh/config && \
70
+ echo 'export PDSH_RCMD_TYPE=ssh' >> /home/mchorse/.bashrc && \
71
+ echo 'export PATH=/home/mchorse/.local/bin:$PATH' >> /home/mchorse/.bashrc && \
72
+ echo 'export PATH=/usr/local/mpi/bin:$PATH' >> /home/mchorse/.bashrc && \
73
+ echo 'export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:$LD_LIBRARY_PATH' >> /home/mchorse/.bashrc
74
+
75
+ #### Python packages
76
+ COPY requirements/* ./
77
+ RUN python -m pip install --no-cache-dir -r requirements.txt && pip install -r requirements-onebitadam.txt
78
+ RUN python -m pip install -r requirements-wandb.txt
79
+ RUN python -m pip install protobuf==3.20.*
80
+
81
+ COPY megatron/fused_kernels/ /megatron/fused_kernels
82
+ WORKDIR /megatron/fused_kernels
83
+ RUN python setup.py install
84
+
85
+ # Clear staging
86
+ RUN mkdir -p /tmp && chmod 0777 /tmp
87
+
88
+ #### SWITCH TO mchorse USER
89
+ USER mchorse
90
+ WORKDIR /home/mchorse
LICENSE ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
202
+
203
+ --
204
+
205
+ This repository also contains code from Hugging Face Inc., Google Research,
206
+ and Facebook (from their Fairseq project). Files from these
207
+ organizations have notices at the top of each file. Below are licenses
208
+ used in those files, as indicated.
209
+
210
+
211
+ ------------- LICENSE FOR NVIDIA code --------------
212
+
213
+
214
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
215
+ #
216
+ # Redistribution and use in source and binary forms, with or without
217
+ # modification, are permitted provided that the following conditions
218
+ # are met:
219
+ # * Redistributions of source code must retain the above copyright
220
+ # notice, this list of conditions and the following disclaimer.
221
+ # * Redistributions in binary form must reproduce the above copyright
222
+ # notice, this list of conditions and the following disclaimer in the
223
+ # documentation and/or other materials provided with the distribution.
224
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
225
+ # contributors may be used to endorse or promote products derived
226
+ # from this software without specific prior written permission.
227
+ #
228
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
229
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
230
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
231
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
232
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
233
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
234
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
235
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
236
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
237
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
238
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
239
+
240
+ ------------- LICENSE FOR huggingface and Google Research code --------------
241
+
242
+
243
+ Apache License
244
+ Version 2.0, January 2004
245
+ http://www.apache.org/licenses/
246
+
247
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
248
+
249
+ 1. Definitions.
250
+
251
+ "License" shall mean the terms and conditions for use, reproduction,
252
+ and distribution as defined by Sections 1 through 9 of this document.
253
+
254
+ "Licensor" shall mean the copyright owner or entity authorized by
255
+ the copyright owner that is granting the License.
256
+
257
+ "Legal Entity" shall mean the union of the acting entity and all
258
+ other entities that control, are controlled by, or are under common
259
+ control with that entity. For the purposes of this definition,
260
+ "control" means (i) the power, direct or indirect, to cause the
261
+ direction or management of such entity, whether by contract or
262
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
263
+ outstanding shares, or (iii) beneficial ownership of such entity.
264
+
265
+ "You" (or "Your") shall mean an individual or Legal Entity
266
+ exercising permissions granted by this License.
267
+
268
+ "Source" form shall mean the preferred form for making modifications,
269
+ including but not limited to software source code, documentation
270
+ source, and configuration files.
271
+
272
+ "Object" form shall mean any form resulting from mechanical
273
+ transformation or translation of a Source form, including but
274
+ not limited to compiled object code, generated documentation,
275
+ and conversions to other media types.
276
+
277
+ "Work" shall mean the work of authorship, whether in Source or
278
+ Object form, made available under the License, as indicated by a
279
+ copyright notice that is included in or attached to the work
280
+ (an example is provided in the Appendix below).
281
+
282
+ "Derivative Works" shall mean any work, whether in Source or Object
283
+ form, that is based on (or derived from) the Work and for which the
284
+ editorial revisions, annotations, elaborations, or other modifications
285
+ represent, as a whole, an original work of authorship. For the purposes
286
+ of this License, Derivative Works shall not include works that remain
287
+ separable from, or merely link (or bind by name) to the interfaces of,
288
+ the Work and Derivative Works thereof.
289
+
290
+ "Contribution" shall mean any work of authorship, including
291
+ the original version of the Work and any modifications or additions
292
+ to that Work or Derivative Works thereof, that is intentionally
293
+ submitted to Licensor for inclusion in the Work by the copyright owner
294
+ or by an individual or Legal Entity authorized to submit on behalf of
295
+ the copyright owner. For the purposes of this definition, "submitted"
296
+ means any form of electronic, verbal, or written communication sent
297
+ to the Licensor or its representatives, including but not limited to
298
+ communication on electronic mailing lists, source code control systems,
299
+ and issue tracking systems that are managed by, or on behalf of, the
300
+ Licensor for the purpose of discussing and improving the Work, but
301
+ excluding communication that is conspicuously marked or otherwise
302
+ designated in writing by the copyright owner as "Not a Contribution."
303
+
304
+ "Contributor" shall mean Licensor and any individual or Legal Entity
305
+ on behalf of whom a Contribution has been received by Licensor and
306
+ subsequently incorporated within the Work.
307
+
308
+ 2. Grant of Copyright License. Subject to the terms and conditions of
309
+ this License, each Contributor hereby grants to You a perpetual,
310
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
311
+ copyright license to reproduce, prepare Derivative Works of,
312
+ publicly display, publicly perform, sublicense, and distribute the
313
+ Work and such Derivative Works in Source or Object form.
314
+
315
+ 3. Grant of Patent License. Subject to the terms and conditions of
316
+ this License, each Contributor hereby grants to You a perpetual,
317
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
318
+ (except as stated in this section) patent license to make, have made,
319
+ use, offer to sell, sell, import, and otherwise transfer the Work,
320
+ where such license applies only to those patent claims licensable
321
+ by such Contributor that are necessarily infringed by their
322
+ Contribution(s) alone or by combination of their Contribution(s)
323
+ with the Work to which such Contribution(s) was submitted. If You
324
+ institute patent litigation against any entity (including a
325
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
326
+ or a Contribution incorporated within the Work constitutes direct
327
+ or contributory patent infringement, then any patent licenses
328
+ granted to You under this License for that Work shall terminate
329
+ as of the date such litigation is filed.
330
+
331
+ 4. Redistribution. You may reproduce and distribute copies of the
332
+ Work or Derivative Works thereof in any medium, with or without
333
+ modifications, and in Source or Object form, provided that You
334
+ meet the following conditions:
335
+
336
+ (a) You must give any other recipients of the Work or
337
+ Derivative Works a copy of this License; and
338
+
339
+ (b) You must cause any modified files to carry prominent notices
340
+ stating that You changed the files; and
341
+
342
+ (c) You must retain, in the Source form of any Derivative Works
343
+ that You distribute, all copyright, patent, trademark, and
344
+ attribution notices from the Source form of the Work,
345
+ excluding those notices that do not pertain to any part of
346
+ the Derivative Works; and
347
+
348
+ (d) If the Work includes a "NOTICE" text file as part of its
349
+ distribution, then any Derivative Works that You distribute must
350
+ include a readable copy of the attribution notices contained
351
+ within such NOTICE file, excluding those notices that do not
352
+ pertain to any part of the Derivative Works, in at least one
353
+ of the following places: within a NOTICE text file distributed
354
+ as part of the Derivative Works; within the Source form or
355
+ documentation, if provided along with the Derivative Works; or,
356
+ within a display generated by the Derivative Works, if and
357
+ wherever such third-party notices normally appear. The contents
358
+ of the NOTICE file are for informational purposes only and
359
+ do not modify the License. You may add Your own attribution
360
+ notices within Derivative Works that You distribute, alongside
361
+ or as an addendum to the NOTICE text from the Work, provided
362
+ that such additional attribution notices cannot be construed
363
+ as modifying the License.
364
+
365
+ You may add Your own copyright statement to Your modifications and
366
+ may provide additional or different license terms and conditions
367
+ for use, reproduction, or distribution of Your modifications, or
368
+ for any such Derivative Works as a whole, provided Your use,
369
+ reproduction, and distribution of the Work otherwise complies with
370
+ the conditions stated in this License.
371
+
372
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
373
+ any Contribution intentionally submitted for inclusion in the Work
374
+ by You to the Licensor shall be under the terms and conditions of
375
+ this License, without any additional terms or conditions.
376
+ Notwithstanding the above, nothing herein shall supersede or modify
377
+ the terms of any separate license agreement you may have executed
378
+ with Licensor regarding such Contributions.
379
+
380
+ 6. Trademarks. This License does not grant permission to use the trade
381
+ names, trademarks, service marks, or product names of the Licensor,
382
+ except as required for reasonable and customary use in describing the
383
+ origin of the Work and reproducing the content of the NOTICE file.
384
+
385
+ 7. Disclaimer of Warranty. Unless required by applicable law or
386
+ agreed to in writing, Licensor provides the Work (and each
387
+ Contributor provides its Contributions) on an "AS IS" BASIS,
388
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
389
+ implied, including, without limitation, any warranties or conditions
390
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
391
+ PARTICULAR PURPOSE. You are solely responsible for determining the
392
+ appropriateness of using or redistributing the Work and assume any
393
+ risks associated with Your exercise of permissions under this License.
394
+
395
+ 8. Limitation of Liability. In no event and under no legal theory,
396
+ whether in tort (including negligence), contract, or otherwise,
397
+ unless required by applicable law (such as deliberate and grossly
398
+ negligent acts) or agreed to in writing, shall any Contributor be
399
+ liable to You for damages, including any direct, indirect, special,
400
+ incidental, or consequential damages of any character arising as a
401
+ result of this License or out of the use or inability to use the
402
+ Work (including but not limited to damages for loss of goodwill,
403
+ work stoppage, computer failure or malfunction, or any and all
404
+ other commercial damages or losses), even if such Contributor
405
+ has been advised of the possibility of such damages.
406
+
407
+ 9. Accepting Warranty or Additional Liability. While redistributing
408
+ the Work or Derivative Works thereof, You may choose to offer,
409
+ and charge a fee for, acceptance of support, warranty, indemnity,
410
+ or other liability obligations and/or rights consistent with this
411
+ License. However, in accepting such obligations, You may act only
412
+ on Your own behalf and on Your sole responsibility, not on behalf
413
+ of any other Contributor, and only if You agree to indemnify,
414
+ defend, and hold each Contributor harmless for any liability
415
+ incurred by, or claims asserted against, such Contributor by reason
416
+ of your accepting any such warranty or additional liability.
417
+
418
+ END OF TERMS AND CONDITIONS
419
+
420
+ APPENDIX: How to apply the Apache License to your work.
421
+
422
+ To apply the Apache License to your work, attach the following
423
+ boilerplate notice, with the fields enclosed by brackets "[]"
424
+ replaced with your own identifying information. (Don't include
425
+ the brackets!) The text should be enclosed in the appropriate
426
+ comment syntax for the file format. We also recommend that a
427
+ file or class name and description of purpose be included on the
428
+ same "printed page" as the copyright notice for easier
429
+ identification within third-party archives.
430
+
431
+ Copyright [yyyy] [name of copyright owner]
432
+
433
+ Licensed under the Apache License, Version 2.0 (the "License");
434
+ you may not use this file except in compliance with the License.
435
+ You may obtain a copy of the License at
436
+
437
+ http://www.apache.org/licenses/LICENSE-2.0
438
+
439
+ Unless required by applicable law or agreed to in writing, software
440
+ distributed under the License is distributed on an "AS IS" BASIS,
441
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
442
+ See the License for the specific language governing permissions and
443
+ limitations under the License.
444
+
445
+ ------------- LICENSE FOR Facebook Fairseq code --------------
446
+
447
+ MIT License
448
+
449
+ Copyright (c) Facebook, Inc. and its affiliates.
450
+
451
+ Permission is hereby granted, free of charge, to any person obtaining a copy
452
+ of this software and associated documentation files (the "Software"), to deal
453
+ in the Software without restriction, including without limitation the rights
454
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
455
+ copies of the Software, and to permit persons to whom the Software is
456
+ furnished to do so, subject to the following conditions:
457
+
458
+ The above copyright notice and this permission notice shall be included in all
459
+ copies or substantial portions of the Software.
460
+
461
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
462
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
463
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
464
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
465
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
466
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
467
+ SOFTWARE.
MANIFEST.in ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ include megatron/data/Makefile
2
+ include megatron/data/helpers.cpp
README-MUP.md ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # How to use Mup (https://github.com/microsoft/mup)
2
+
3
+ ## Add mup neox args to your config
4
+
5
+ ```
6
+ # mup
7
+
8
+ "use-mup": true,
9
+
10
+ "save-base-shapes": false, # this only needs to be enabled once in order to generate the base-shapes-file on each rank
11
+
12
+ "base-shapes-file": "base-shapes", # load base shapes from this file
13
+
14
+ "coord-check": false, # generate coord check plots to verify mup's implementation in neox
15
+
16
+ # mup hp search
17
+
18
+ "mup-init-scale": 1.0,
19
+
20
+ "mup-attn-temp": 1.0,
21
+
22
+ "mup-output-temp": 1.0,
23
+
24
+ "mup-embedding-mult": 1.0,
25
+
26
+ "mup-rp-embedding-mult": 1.0,
27
+ ```
28
+
29
+ ## Generate base shapes
30
+
31
+ 1. Set use-mup to true
32
+ 2. Set save-base-shapes to true
33
+ 3. Run once. gpt-neox will instantiate a base model and a delta model, then save one file per rank named `<base-shapes-file>.<rank>`. gpt-neox will exit immediately.
34
+ 4. Set save-base-shapes to false
35
+
36
+ ## Generate coord check plots (optional)
37
+
38
+ 1. Keep use-mup true
39
+ 2. Set coord-check to true
40
+ 3. Run once. gpt-neox will output jpg images similar to https://github.com/microsoft/mutransformers/blob/main/README.md#coord-check. gpt-neox will exit immediately.
41
+ 4. Set coord-check to false
42
+
43
+ ## Tune mup hyperparameters and LR
44
+
45
+ The values under `mup hp search` were added and correspond to appendix F.4 from https://arxiv.org/pdf/2203.03466.pdf. These hyperparameters and the learning rate (LR) are tuned with a random search using the scaled-up config (tested with 6-7B.yml) but with hidden-size set to the value from the scaled-down config (125M.yml).
46
+
47
+ ## Transfer
48
+
49
+ With the best LR set and the best mup HPs set, revert the value of hidden-size in the scaled-up config and run again.
README.md ADDED
@@ -0,0 +1,863 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [![GitHub issues](https://img.shields.io/github/issues/EleutherAI/gpt-neox)](https://github.com/EleutherAI/gpt-neox/issues)
2
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Weights & Biases monitoring" height=20>](https://wandb.ai/eleutherai/neox)
3
+
4
+ # GPT-NeoX
5
+
6
+ This repository records [EleutherAI](https://www.eleuther.ai)'s library for training large-scale language models on GPUs. Our current framework is based on NVIDIA's [Megatron Language Model](https://github.com/NVIDIA/Megatron-LM) and has been augmented with techniques from [DeepSpeed](https://www.deepspeed.ai) as well as some novel optimizations. We aim to make this repo a centralized and accessible place to gather techniques for training large-scale autoregressive language models, and to accelerate research into large-scale training. This library is in widespread use in [academic, industry, and government labs](https://github.com/EleutherAI/gpt-neox#adoption-and-publications), including by researchers at Oak Ridge National Lab, CarperAI, Stability AI, Together.ai, Korea University, Carnegie Mellon University, and the University of Tokyo among others. Uniquely among similar libraries, GPT-NeoX supports a wide variety of systems and hardware, including launching via Slurm, MPI, and the IBM Job Step Manager, and has been run at scale on [AWS](https://aws.amazon.com/), [CoreWeave](https://www.coreweave.com/), [ORNL Summit](https://www.olcf.ornl.gov/summit/), [ORNL Frontier](https://www.olcf.ornl.gov/frontier/), [LUMI](https://www.lumi-supercomputer.eu/), and others.
7
+
8
+ **If you are not looking to train models with billions of parameters from scratch, this is likely the wrong library to use. For generic inference needs, we recommend you use the Hugging Face `transformers` library instead which supports GPT-NeoX models.**
9
+
10
+ ## Why GPT-NeoX?
11
+
12
+ GPT-NeoX leverages many of the same features and technologies as the popular Megatron-DeepSpeed library but with substantially increased usability and novel optimizations. Major features include:
13
+ * Distributed training with ZeRO and 3D parallelism
14
+ * Support for a wide variety of systems and hardware, including launching via Slurm, MPI, and the IBM Job Step Manager. GPT-NeoX has been run at scale on [AWS](https://aws.amazon.com/), [CoreWeave](https://www.coreweave.com/), Oak Ridge's [Summit](https://www.olcf.ornl.gov/summit/) and [Frontier](https://www.olcf.ornl.gov/frontier/), [Pacific Northwest National Laboratory](https://hpc.pnl.gov/index.shtml), Argonne's [Polaris](https://docs.alcf.anl.gov/polaris/data-science-workflows/applications/gpt-neox/), [LUMI](https://www.lumi-supercomputer.eu/), and more.
15
+ * Cutting edge architectural innovations including rotary and alibi positional embeddings, parallel feedforward attention layers, and flash attention.
16
+ * Predefined configurations for popular architectures including Pythia, PaLM, Falcon, and LLaMA 1 \& 2
17
+ * Curriculum Learning
18
+ * Easy connections with the open source ecosystem, including Hugging Face's [tokenizers](https://github.com/huggingface/tokenizers) and [transformers](https://github.com/huggingface/transformers/) libraries, experiment monitoring via [WandB](https://wandb.ai/site)/[Comet](https://www.comet.com/site/)/TensorBoard, and evaluation via our [Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness).
19
+
20
+ ## News
21
+ **[9/9/2024]** We now support preference learning via [DPO](https://arxiv.org/abs/2305.18290), [KTO](https://arxiv.org/abs/2402.01306), and reward modeling
22
+
23
+ **[9/9/2024]** We now support integration with [Comet ML](https://www.comet.com/site/), a machine learning monitoring platform
24
+
25
+ **[5/21/2024]** We now support [RWKV](https://www.rwkv.com/) with pipeline parallelism! See the PRs for [RWKV](https://github.com/EleutherAI/gpt-neox/pull/1198) and [RWKV+pipeline](https://github.com/EleutherAI/gpt-neox/pull/1221)
26
+
27
+ **[3/21/2024]** We now support Mixture-of-Experts (MoE)
28
+
29
+ **[3/17/2024]** We now support AMD MI250X GPUs
30
+
31
+ **[3/15/2024]** We now support [Mamba](https://github.com/state-spaces/mamba) with tensor parallelism! See [the PR](https://github.com/EleutherAI/gpt-neox/pull/1184)
32
+
33
+ **[8/10/2023]** We now support checkpointing with AWS S3! Activate with the `s3_path` config option (for more detail, see [the PR](https://github.com/EleutherAI/gpt-neox/pull/1010))
34
+
35
+ **[9/20/2023]** As of https://github.com/EleutherAI/gpt-neox/pull/1035, we have deprecated Flash Attention 0.x and 1.x, and migrated support to Flash Attention 2.x. We don't believe this will cause problems, but if you have a specific use-case that requires old flash support using the latest GPT-NeoX, please raise an issue.
36
+
37
+ **[8/10/2023]** We have experimental support for LLaMA 2 and Flash Attention v2 in our [math-lm](https://github.com/EleutherAI/math-lm) project that will be upstreamed later this month.
38
+
39
+ **[5/17/2023]** After fixing some miscellaneous bugs we now fully support bf16.
40
+
41
+ **[4/11/2023]** We have upgraded our Flash Attention implementation to now support Alibi positional embeddings.
42
+
43
+ **[3/9/2023]** We have released GPT-NeoX 2.0.0, an upgraded version built on the latest DeepSpeed, with which it will be regularly synced going forward.
44
+
45
+ ## Versions
46
+
47
+ Prior to 3/9/2023, GPT-NeoX relied on [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed), which was based on an old version of DeepSpeed (0.3.15). In order to migrate to the latest upstream DeepSpeed version while allowing users to access the old versions of GPT-NeoX and DeeperSpeed, we have introduced two versioned releases for both libraries:
48
+
49
+ - Version 2.0 of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/releases/tag/v2.0) and [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed/releases/tag/v2.0) are the latest versions built on the latest DeepSpeed, and will be maintained going forward.
50
+ - Version 1.0 of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/releases/tag/v1.0) and [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed/releases/tag/v1.0) maintain snapshots of the old stable versions that [GPT-NeoX-20B](https://arxiv.org/abs/2204.06745) and the [Pythia Suite](https://github.com/EleutherAI/pythia) were trained on.
51
+
52
+ # Contents
53
+
54
+ - [GPT-NeoX](#gpt-neox)
55
+ * [Why GPT-NeoX?](#why-gpt-neox)
56
+ * [News](#news)
57
+ * [Versions](#versions)
58
+ - [Contents](#contents)
59
+ - [Quick Start](#quick-start)
60
+ * [Environment and Dependencies](#environment-and-dependencies)
61
+ + [Host Setup](#host-setup)
62
+ + [Flash Attention](#flash-attention)
63
+ + [Multi-Node Launching](#multi-node-launching)
64
+ + [Containerized Setup](#containerized-setup)
65
+ * [Usage](#usage)
66
+ - [Configuration](#configuration)
67
+ * [Mixture of Experts](#mixture-of-experts)
68
+ - [Datasets](#datasets)
69
+ * [Preconfigured Datasets](#preconfigured-datasets)
70
+ * [Using Custom Data](#using-custom-data)
71
+ - [Training and Finetuning](#training-and-finetuning)
72
+ * [Pretrained Models](#pretrained-models)
73
+ + [GPT-NeoX-20B](#gpt-neox-20b)
74
+ + [Pythia](#pythia)
75
+ + [Polyglot](#polyglot)
76
+ - [Inference](#inference)
77
+ - [Evaluation](#evaluation)
78
+ - [Exporting to Hugging Face](#exporting-to-hugging-face)
79
+ - [Monitoring](#monitoring)
80
+ * [Weights and Biases](#weights-and-biases)
81
+ * [TensorBoard](#tensorboard)
82
+ - [Running on multi-node](#running-on-multi-node)
83
+ - [Profiling](#profiling)
84
+ - [Adoption and Publications](#adoption-and-publications)
85
+ * [Publications](#publications)
86
+ * [Models](#models)
87
+ + [English LLMs](#english-llms)
88
+ + [Non-English LLMs](#non-english-llms)
89
+ + [Code Models](#code-models)
90
+ + [Other Modalities](#other-modalities)
91
+ - [Administrative Notes](#administrative-notes)
92
+ * [Citing GPT-NeoX](#citing-gpt-neox)
93
+ * [Contributing](#contributing)
94
+ * [Licensing](#licensing)
95
+ * [Acknowledgements](#acknowledgements)
96
+
97
+ # Quick Start
98
+
99
+ ## Environment and Dependencies
100
+
101
+ ### Host Setup
102
+
103
+ First make sure you are in an environment with Python 3.8 with an appropriate version of PyTorch 1.8 or later installed. **Note:** Some of the libraries that GPT-NeoX depends on have not been updated to be compatible with Python 3.10+. Python 3.9 appears to work, but this codebase has been developed and tested for Python 3.8.
104
+
105
+ To install the remaining basic dependencies, run:
106
+
107
+ ```bash
108
+ pip install -r requirements/requirements.txt
109
+ pip install -r requirements/requirements-wandb.txt # optional, if logging using WandB
110
+ pip install -r requirements/requirements-tensorboard.txt # optional, if logging via tensorboard
111
+ pip install -r requirements/requirements-comet.txt # optional, if logging via Comet
112
+ ```
113
+
114
+ from the repository root.
115
+
116
+ > [!Warning]
117
+ > Our codebase relies on [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed), our fork of the [DeepSpeed](https://github.com/microsoft/DeepSpeed) library with some added changes. We strongly recommend using Anaconda, a virtual machine, or some other form of environment isolation before continuing. Failure to do so may cause other repositories that rely on DeepSpeed to break.
118
+
119
+
120
+
121
+ ### Fused Kernels
122
+ We now support AMD GPUs (MI100, MI250X) through JIT fused-kernel compilation. Fused kernels will be built and loaded as needed. To avoid waiting during job launching, you can also do the following for manual pre-build:
123
+
124
+ ```python
125
+ python
126
+ from megatron.fused_kernels import load
127
+ load()
128
+ ```
129
+ This will automatically adapt the build process to different GPU vendors (AMD, NVIDIA) without platform-specific code changes. To further test fused kernels using `pytest`, use `pytest tests/model/test_fused_kernels.py`
130
+
131
+ ### Flash Attention
132
+
133
+ To use [Flash-Attention](https://github.com/HazyResearch/flash-attention), install the additional dependencies in `./requirements/requirements-flashattention.txt` and set the attention type in your configuration accordingly (see [configs](./configs/)). This can provide significant speed-ups over regular attention on certain GPU architectures, including Ampere GPUs (such as A100s); see the repository for more details.
134
+
135
+
136
+ ### Multi-Node Launching
137
+
138
+ NeoX and Deep(er)Speed support training on multiple different nodes and you have the option of using a variety of different launchers to orchestrate multi-node jobs.
139
+
140
+ In general there needs to be a "hostfile" somewhere accessible with the format:
141
+
142
+ ```bash
143
+ node1_ip slots=8
144
+ node2_ip slots=8
145
+ ```
146
+
147
+ where the first column contains the IP address for each node in your setup and the number of slots is the number of GPUs that node has access to. In your config you must pass in the path to the hostfile with `"hostfile": "/path/to/hostfile"`. Alternatively the path to the hostfile can be in the environment variable `DLTS_HOSTFILE`.
148
+
149
+ #### pdsh
150
+
151
+ `pdsh` is the default launcher, and if you're using `pdsh` then all you must do (besides ensuring that pdsh is installed in your environment) is set `{"launcher": "pdsh"}` in your config files.
152
+
153
+ #### MPI
154
+
155
+ If using MPI then you must specify the MPI library (DeepSpeed/GPT-NeoX currently supports `mvapich`, `openmpi`, `mpich`, and `impi`, though `openmpi` is the most commonly used and tested) as well as pass the `deepspeed_mpi` flag in your config file:
156
+
157
+ ```json
158
+ {
159
+ "launcher": "openmpi",
160
+ "deepspeed_mpi": true
161
+ }
162
+ ```
163
+
164
+ With your environment properly set up and the correct configuration files you can use `deepy.py` like a normal python script and start (for example) a training job with:
165
+
166
+ `python3 deepy.py train.py /path/to/configs/my_model.yml`
167
+
168
+ #### Slurm
169
+
170
+ Using Slurm can be slightly more involved. Like with MPI, you must add the following to your config:
171
+
172
+ ```json
173
+ {
174
+ "launcher": "slurm",
175
+ "deepspeed_slurm": true
176
+ }
177
+ ```
178
+ If you do not have ssh access to the compute nodes in your Slurm cluster you need to add `{"no_ssh_check": true}`
179
+
180
+ #### (Advanced) Custom Launching
181
+
182
+ There are many cases where the above default launching options are not sufficient:
183
+
184
+ - Many clusters have their own unique job scheduler or specific MPI/Slurm arguments necessary for launching jobs such as [Summit JSRun](https://docs.olcf.ornl.gov/systems/summit_user_guide.html#job-launcher-jsrun) or [LLNL Flux](https://computing.llnl.gov/projects/flux-building-framework-resource-management)
185
+ - While the above Slurm/MPI/pdsh default options are enough for most job runs, advanced users may want to add arguments for optimization or debugging purposes
186
+
187
+ In these cases, you will need to modify the DeepSpeed [multinode runner](https://github.com/microsoft/DeepSpeed/blob/17957728c0362bf8ae70feca308e491e55ef9feb/deepspeed/launcher/multinode_runner.py) utility to support your usecase. Broadly, these enhancements fall under two categories:
188
+
189
+ ##### 1. Adding a Launcher (e.g. [JSRun](https://docs.olcf.ornl.gov/systems/summit_user_guide.html#job-launcher-jsrun), [Flux](https://computing.llnl.gov/projects/flux-building-framework-resource-management), etc)
190
+
191
+ In this case, you must add a new multinode runner class to `deepspeed/launcher/multinode_runner.py` and expose it as a configuration option in GPT-NeoX. Examples on how we did this for [Summit JSRun](https://docs.olcf.ornl.gov/systems/summit_user_guide.html#job-launcher-jsrun) are in [this DeeperSpeed commit](https://github.com/EleutherAI/DeeperSpeed/commit/9aed6c8500d7c492d85c5c88687322dbda70e370) and [this GPT-NeoX commit](https://github.com/EleutherAI/gpt-neox/commit/3782c7ae60f8624e566e3879b89bb09e8b59b869), respectively.
192
+
193
+ ##### 2. Modifying Run Command or Environment Variables
194
+
195
+ We have encountered many cases where we wish to modify the MPI/Slurm run command for an optimization or to debug (e.g. to modify the [Slurm srun CPU binding](https://slurm.schedmd.com/srun.html#OPT_cpu-bind) or to tag MPI logs with the rank). In this case, you must modify the multinode runner class' run command under its `get_cmd` method (e.g. [mpirun_cmd](https://github.com/microsoft/DeepSpeed/blob/17957728c0362bf8ae70feca308e491e55ef9feb/deepspeed/launcher/multinode_runner.py#L135-L147) for OpenMPI). Examples on how we did this to provide optimized and rank-tagged run commands using Slurm and OpenMPI for the Stability cluster are in [this DeeperSpeed branch](https://github.com/microsoft/DeepSpeed/compare/master...EleutherAI:DeeperSpeed:v2.0-stability)
196
+
197
+
198
+ #### Hostfile Generation
199
+
200
+ In general you will not be able to have a single fixed hostfile, so you need to have a script to generate one dynamically when your job starts. An example script to dynamically generate a hostfile using [Slurm](https://slurm.schedmd.com/documentation.html) and 8 GPUs per node is:
201
+
202
+ ```bash
203
+ #!/bin/bash
204
+ GPUS_PER_NODE=8
205
+ mkdir -p /sample/path/to/hostfiles
206
+ # need to add the current slurm jobid to hostfile name so that we don't add to previous hostfile
207
+ hostfile=/sample/path/to/hostfiles/hosts_$SLURM_JOBID
208
+ # be extra sure we aren't appending to a previous hostfile
209
+ rm $hostfile &> /dev/null
210
+ # loop over the node names
211
+ for i in `scontrol show hostnames $SLURM_NODELIST`
212
+ do
213
+ # add a line to the hostfile
214
+ echo $i slots=$GPUS_PER_NODE >>$hostfile
215
+ done
216
+ ```
217
+
218
+ `$SLURM_JOBID` and `$SLURM_NODELIST` being environment variables Slurm will create for you. See the [sbatch documentation](https://slurm.schedmd.com/sbatch.html#SECTION_OUTPUT-ENVIRONMENT-VARIABLES) for a full list of available Slurm environment variables set at job creation time.
219
+
220
+ #### Job Launching
221
+
222
+ Then you can create an [sbatch](https://slurm.schedmd.com/sbatch.html) script from which to kick off your GPT-NeoX job. A bare-bones sbatch script on a Slurm-based cluster with 8 GPUs per node would look like this:
223
+
224
+ ```bash
225
+ #!/bin/bash
226
+ #SBATCH --job-name="neox"
227
+ #SBATCH --partition=your-partition
228
+ #SBATCH --nodes=1
229
+ #SBATCH --ntasks-per-node=8
230
+ #SBATCH --gres=gpu:8
231
+
232
+ # Some potentially useful distributed environment variables
233
+ export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
234
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
235
+ export MASTER_PORT=12802
236
+ export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
237
+
238
+ # Your hostfile creation script from above
239
+ ./write_hostfile.sh
240
+ # Tell DeepSpeed where to find our generated hostfile via DLTS_HOSTFILE
241
+ export DLTS_HOSTFILE=/sample/path/to/hostfiles/hosts_$SLURM_JOBID
242
+
243
+ # Launch training
244
+ python3 deepy.py train.py /sample/path/to/your/configs/my_model.yml
245
+
246
+ ```
247
+
248
+ You can then kick off a training run with `sbatch my_sbatch_script.sh`
249
+
250
+
251
+ ### Containerized Setup
252
+
253
+ We also provide a Dockerfile and docker-compose configuration if you prefer to run NeoX in a container.
254
+
255
+ Requirements to run the container are to have appropriate GPU drivers, an up-to-date installation of Docker, and [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed. To test if your installation is good you can use their "sample workload", which is:
256
+
257
+ ```
258
+ docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi
259
+ ```
260
+
261
+ Provided that will run, you need to export NEOX_DATA_PATH and NEOX_CHECKPOINT_PATH in your environment to specify your data directory and directory for storing and loading checkpoints:
262
+
263
+ ```
264
+ export NEOX_DATA_PATH=/mnt/sda/data/enwiki8 #or wherever your data is stored on your system
265
+ export NEOX_CHECKPOINT_PATH=/mnt/sda/checkpoints
266
+ ```
267
+
268
+ And then, from the gpt-neox directory, you can build the image and run a shell in a container with
269
+
270
+ ```
271
+ docker compose run gpt-neox bash
272
+ ```
273
+
274
+ After the build, you should be able to do this:
275
+ ```
276
+ mchorse@537851ed67de:~$ echo $(pwd)
277
+ /home/mchorse
278
+ mchorse@537851ed67de:~$ ls -al
279
+ total 48
280
+ drwxr-xr-x 1 mchorse mchorse 4096 Jan 8 05:33 .
281
+ drwxr-xr-x 1 root root 4096 Jan 8 04:09 ..
282
+ -rw-r--r-- 1 mchorse mchorse 220 Feb 25 2020 .bash_logout
283
+ -rw-r--r-- 1 mchorse mchorse 3972 Jan 8 04:09 .bashrc
284
+ drwxr-xr-x 4 mchorse mchorse 4096 Jan 8 05:35 .cache
285
+ drwx------ 3 mchorse mchorse 4096 Jan 8 05:33 .nv
286
+ -rw-r--r-- 1 mchorse mchorse 807 Feb 25 2020 .profile
287
+ drwxr-xr-x 2 root root 4096 Jan 8 04:09 .ssh
288
+ drwxrwxr-x 8 mchorse mchorse 4096 Jan 8 05:35 chk
289
+ drwxrwxrwx 6 root root 4096 Jan 7 17:02 data
290
+ drwxr-xr-x 11 mchorse mchorse 4096 Jan 8 03:52 gpt-neox
291
+ ```
292
+
293
+ For a long-running job, you should run
294
+
295
+ ```
296
+ docker compose up -d
297
+ ```
298
+
299
+ to run the container in detached mode, and then, in a separate terminal session, run
300
+
301
+ ```
302
+ docker compose exec gpt-neox bash
303
+ ```
304
+
305
+ You can then run any job you want from inside the container.
306
+
307
+ Concerns when running for a long time or in detached mode include
308
+ - You will have to terminate the container manually when you are no longer using it
309
+ - If you want processes to continue running when your shell session ends, you will need to background them.
310
+ - If you then want logging, you will have to make sure to pipe logs to disk, and set up wandb and/or Comet logging.
311
+
312
+ If you prefer to run the prebuilt container image from dockerhub, you can run the docker compose commands with ```-f docker-compose-dockerhub.yml``` instead, e.g.,
313
+
314
+ ```
315
+ docker compose -f docker-compose-dockerhub.yml run gpt-neox bash
316
+ ```
317
+
318
+ ## Usage
319
+
320
+ All functionality should be launched using `deepy.py`, a wrapper around the `deepspeed` launcher.
321
+
322
+ We currently offer three main functions:
323
+ 1. `train.py` is used for training and finetuning models.
324
+ 2. `eval.py` is used to evaluate a trained model using the [language model evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness).
325
+ 3. `generate.py` is used to sample text from a trained model.
326
+
327
+ which can be launched with:
328
+
329
+ ```bash
330
+ ./deepy.py [script.py] [./path/to/config_1.yml] [./path/to/config_2.yml] ... [./path/to/config_n.yml]
331
+ ```
332
+
333
+ For example, to launch training you can run
334
+ ```bash
335
+ ./deepy.py train.py ./configs/20B.yml ./configs/local_cluster.yml
336
+ ```
337
+
338
+ For more details on each entry point, see the [Training and Finetuning](#training-and-finetuning), [Inference](#inference) and [Evaluation](#evaluation) sections, respectively.
339
+
340
+ # Configuration
341
+
342
+ GPT-NeoX parameters are defined in a YAML configuration file which is passed to the deepy.py launcher. We have provided some example .yml files in [configs](./configs/), showing a diverse array of features and model sizes.
343
+
344
+ These files are generally complete, but non-optimal. For example, depending on your specific GPU configuration, you may need to change some settings such as `pipe-parallel-size`, `model-parallel-size` to increase or decrease the degree of parallelisation, `train_micro_batch_size_per_gpu` or `gradient-accumulation-steps` to modify batch size related settings, or the `zero_optimization` dict to modify how optimizer states are parallelised across workers.
345
+
346
+ For a more detailed guide to the features available and how to configure them, see [the configuration README](configs/README.md), and for documentation of every possible argument, see [configs/neox_arguments.md](configs/neox_arguments.md).
347
+
348
+ ## Mixture of Experts
349
+
350
+ GPT-NeoX includes multiple expert implementations for MoE. To select between them, specify `moe_type` of `megablocks` (default) or `deepspeed`.
351
+
352
+ Both are based on the DeepSpeed MoE parallelism framework, which supports tensor-expert-data parallelism.
353
+ Both allow you to toggle between token-dropping and dropless (default, and this is what Megablocks was designed for).
354
+ Sinkhorn routing to come soon!
355
+
356
+ For an example of a basic complete configuration, see configs/125M-dmoe.yml (for Megablocks dropless) or configs/125M-moe.yml.
357
+
358
+ Most MoE related configuration arguments are prefixed with `moe`. Some common configuration parameters and their defaults are as follows:
359
+
360
+ ```
361
+ moe_type: megablocks
362
+ moe_num_experts: 1 # 1 disables MoE. 8 is a reasonable value.
363
+ moe_loss_coeff: 0.1
364
+ expert_interval: 2 # See details below
365
+ enable_expert_tensor_parallelism: false # See details below
366
+ moe_expert_parallel_size: 1 # See details below
367
+ moe_token_dropping: false
368
+ ```
369
+
370
+ DeepSpeed can be further configured with the following:
371
+
372
+ ```
373
+ moe_top_k: 1
374
+ moe_min_capacity: 4
375
+ moe_train_capacity_factor: 1.0 # Setting to 1.0
376
+ moe_eval_capacity_factor: 1.0 # Setting to 1.0
377
+ ```
378
+
379
+ One MoE layer is present every `expert_interval` transformer layers including the first, so with 12 layers total:
380
+
381
+ ```
382
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
383
+ ```
384
+
385
+ Experts would be in these layers:
386
+
387
+ ```
388
+ 0, 2, 4, 6, 8, 10
389
+ ```
390
+
391
+ By default, we use expert-data parallelism, so any available tensor parallelism (`model_parallel_size`) will be used for expert routing. For instance, given the following:
392
+
393
+ ```
394
+ expert_parallel_size: 4
395
+ model_parallel_size: 2 # aka tensor parallelism
396
+ ```
397
+
398
+ With 32 GPUs, the behavior will look like:
399
+
400
+ - In non-expert layers:
401
+ - Tensor parallelism is 2. (There are 32 / 2 = 16 such tensor parallel groups, each of size 2.)
402
+ - Data parallelism implicitly becomes 32 / 2 = 16.
403
+ - In expert layers:
404
+ - There is no tensor parallelism.
405
+ - Expert parallelism is 4. (There are 32 / 4 = 8 expert parallel groups, each of size 4.)
406
+ - Data parallelism implicitly becomes 32 / 4 = 8. Some cross-node token routing happens as a result of this redivision of data parallelism between 16 and 8. To avoid it, ensure that `expert_parallel_size == model_parallel_size`.
407
+
408
+ Setting `enable_expert_tensor_parallelism` enables tensor-expert-data (TED) parallelism. The way to interpret the above would then be:
409
+
410
+ - In non-expert layers: same as before.
411
+ - In expert layers:
412
+ - Tensor parallelism is 2. (There are 32 / 2 = 16 tensor parallel groups, each of size 2.)
413
+ - Expert parallelism is 4. (There are 32 / 4 = 8 expert parallel groups, each of size 4.)
414
+ - Data parallelism implicitly becomes 32 / (2 * 4) = 4. Again, cross-node token routing happens. To avoid it, ensure `expert_parallel_size == 1` or `model_parallel_size == 1`.
415
+
416
+ So note that DP must be divisible by (MP * EP). For more details, see the [TED paper].
417
+
418
+ Pipeline parallelism is not yet supported - coming soon!
419
+
420
+ [TED paper]: https://arxiv.org/abs/2303.06318
421
+
422
+ # Datasets
423
+
424
+ ## Preconfigured Datasets
425
+
426
+ Several preconfigured datasets are available, including most components from [the Pile](https://arxiv.org/abs/2101.00027), as well as the Pile train set itself, for straightforward tokenization using the `prepare_data.py` entry point.
427
+
428
+ E.g., to download and tokenize the enwik8 dataset with the GPT2 Tokenizer, saving the results to `./data`, you can run:
429
+
430
+ ```
431
+ python prepare_data.py -d ./data
432
+ ```
433
+
434
+ or a single shard of the pile (`pile_subset`) with the GPT-NeoX-20B tokenizer (assuming you have it saved at `./20B_checkpoints/20B_tokenizer.json`):
435
+
436
+ ```
437
+ python prepare_data.py -d ./data -t HFTokenizer --vocab-file ./20B_checkpoints/20B_tokenizer.json pile_subset
438
+ ```
439
+
440
+ The tokenized data will be saved out to two files: `[data-dir]/[dataset-name]/[dataset-name]_text_document.bin` and `[data-dir]/[dataset-name]/[dataset-name]_text_document.idx`. You will need to add the prefix that both these files share to your training configuration file under the `data-path` field. E.g.:
441
+
442
+ ```yaml
443
+ "data-path": "./data/enwik8/enwik8_text_document",
444
+ ```
445
+
446
+ ## Using Custom Data
447
+
448
+ To prepare your own dataset for training with custom data, format it as one large [jsonl](https://jsonlines.org/)-formatted file with each item in the list of dictionaries being a separate document. The document text should be grouped under one JSON key, i.e `"text"`. Any auxiliary data stored in other fields will not be used.
449
+
450
+ Next, make sure to download the GPT2 tokenizer vocab and merge files from the following links:
451
+
452
+ - Vocab: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
453
+ - Merge: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
454
+
455
+ Or use the 20B tokenizer (for which only a single Vocab file is needed):
456
+
457
+ - Vocab: https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/20B_tokenizer.json
458
+
459
+ (alternatively, you can provide any tokenizer file that can be loaded by Hugging Face's tokenizers library with the `Tokenizer.from_pretrained()` command)
460
+
461
+ You can now pretokenize your data using `tools/datasets/preprocess_data.py`, the arguments for which are detailed below:
462
+
463
+ ```
464
+ usage: preprocess_data.py [-h] --input INPUT [--jsonl-keys JSONL_KEYS [JSONL_KEYS ...]] [--num-docs NUM_DOCS] --tokenizer-type {HFGPT2Tokenizer,HFTokenizer,GPT2BPETokenizer,CharLevelTokenizer} [--vocab-file VOCAB_FILE] [--merge-file MERGE_FILE] [--append-eod] [--ftfy] --output-prefix OUTPUT_PREFIX
465
+ [--dataset-impl {lazy,cached,mmap}] [--workers WORKERS] [--log-interval LOG_INTERVAL]
466
+
467
+ optional arguments:
468
+ -h, --help show this help message and exit
469
+
470
+ input data:
471
+ --input INPUT Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma separated list
472
+ --jsonl-keys JSONL_KEYS [JSONL_KEYS ...]
473
+ space separate listed of keys to extract from jsonl. Default: text
474
+ --num-docs NUM_DOCS Optional: Number of documents in the input data (if known) for an accurate progress bar.
475
+
476
+ tokenizer:
477
+ --tokenizer-type {HFGPT2Tokenizer,HFTokenizer,GPT2BPETokenizer,CharLevelTokenizer}
478
+ What type of tokenizer to use.
479
+ --vocab-file VOCAB_FILE
480
+ Path to the vocab file
481
+ --merge-file MERGE_FILE
482
+ Path to the BPE merge file (if necessary).
483
+ --append-eod Append an <eod> token to the end of a document.
484
+ --ftfy Use ftfy to clean text
485
+
486
+ output data:
487
+ --output-prefix OUTPUT_PREFIX
488
+ Path to binary output file without suffix
489
+ --dataset-impl {lazy,cached,mmap}
490
+ Dataset implementation to use. Default: mmap
491
+
492
+ runtime:
493
+ --workers WORKERS Number of worker processes to launch
494
+ --log-interval LOG_INTERVAL
495
+ Interval between progress updates
496
+
497
+ ```
498
+
499
+ For example:
500
+
501
+ ```bash
502
+ python tools/datasets/preprocess_data.py \
503
+ --input ./data/mydataset.jsonl.zst \
504
+ --output-prefix ./data/mydataset \
505
+ --vocab ./data/gpt2-vocab.json \
506
+ --merge-file gpt2-merges.txt \
507
+ --dataset-impl mmap \
508
+ --tokenizer-type GPT2BPETokenizer \
509
+ --append-eod
510
+ ```
511
+
512
+ You would then run training with the following settings added to your configuration file:
513
+
514
+ ```yaml
515
+ "data-path": "data/mydataset_text_document",
516
+ ```
517
+
518
+ # Training and Finetuning
519
+
520
+ Training is launched using `deepy.py`, a wrapper around DeepSpeed's launcher, which launches the same script in parallel across many GPUs / nodes.
521
+
522
+ The general usage pattern is:
523
+
524
+ ```bash
525
+ python ./deepy.py train.py [path/to/config1.yml] [path/to/config2.yml] ...
526
+ ```
527
+
528
+ You can pass in an arbitrary number of configs which will all be merged at runtime.
529
+
530
+ You can also optionally pass in a config prefix, which will assume all your configs are in the same folder and prepend that prefix to their path.
531
+
532
+ For example:
533
+
534
+ ```bash
535
+ python ./deepy.py train.py -d configs 125M.yml local_setup.yml
536
+ ```
537
+
538
+ This will deploy the `train.py` script on all nodes with one process per GPU. The worker nodes and number of GPUs are specified in the `/job/hostfile` file (see [parameter documentation](configs/README.md)), or can simply be passed in as the `num_gpus` arg if running on a single node setup.
539
+
540
+ Although this is not strictly necessary, we find it useful to define the model parameters in one config file (e.g `configs/125M.yml`) and the data path parameters in another (e.g `configs/local_setup.yml`).
541
+
542
+
543
+ ## Pretrained Models
544
+
545
+ ### GPT-NeoX-20B
546
+
547
+ GPT-NeoX-20B is a 20 billion parameter autoregressive language model trained on [the Pile](https://arxiv.org/abs/2101.00027). Technical details about GPT-NeoX-20B can be found in [the associated paper](https://arxiv.org/abs/2204.06745). The configuration file for this model is both available at [`./configs/20B.yml`](./configs/20B.yml) and included in the download links below.
548
+
549
+ [Slim weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/) - (No optimizer states, for inference or finetuning, 39GB)
550
+
551
+ To download from the command line to a folder named `20B_checkpoints`, use the following command:
552
+
553
+ ```bash
554
+ wget --cut-dirs=5 -nH -r --no-parent --reject "index.html*" https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/ -P 20B_checkpoints
555
+ ```
556
+
557
+ [Full weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/full_weights/) - (Including optimizer states, 268GB)
558
+
559
+ To download from the command line to a folder named `20B_checkpoints`, use the following command:
560
+
561
+ ```bash
562
+ wget --cut-dirs=5 -nH -r --no-parent --reject "index.html*" https://the-eye.eu/public/AI/models/GPT-NeoX-20B/full_weights/ -P 20B_checkpoints
563
+ ```
564
+
565
+ Weights can alternatively be downloaded using a BitTorrent client. Torrent files can be downloaded here: [slim weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights.torrent), [full weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/full_weights.torrent).
566
+
567
+ We additionally have 150 checkpoints saved throughout training, one every 1,000 steps. We are working on figuring out how to best serve these at scale, but in the meanwhile people interested in working with the partially trained checkpoints can email us at contact@eleuther.ai to arrange access.
568
+
569
+ ### Pythia
570
+
571
+ The Pythia Scaling Suite is a suite of models ranging from 70M parameters to 12B parameters trained on [the Pile](https://pile.eleuther.ai) intended to promote research on interpretability and training dynamics of large language models. Further details about the project and links to the models can be found [in the paper](https://arxiv.org/abs/2304.01373) and [on the project's GitHub](https://github.com/EleutherAI/pythia).
572
+
573
+ ### Polyglot
574
+
575
+ The Polyglot Project is an effort to train powerful non-English pretrained language models to promote the accessibility of this technology to researchers outside the dominant powerhouses of machine learning. EleutherAI has trained and released 1.3B, 3.8B, and 5.8B parameter Korean language models, the largest of which outperforms all other publicly available language models on Korean language tasks. Further details about the project and links to the models can be found [here](https://github.com/EleutherAI/polyglot).
576
+
577
+ # Inference
578
+
579
+ **For most uses we recommend deploying models trained using the GPT-NeoX library via the Hugging Face Transformers library which is better optimized for inference.**
580
+
581
+ We support three types of generation from a pretrained model:
582
+ 1. Unconditional generation
583
+ 2. Conditional generation based on an input read from a file
584
+ 3. Interactive generation, which allows for multiple rounds of back-and-forth between a user and the language model via a command line interface
585
+
586
+ All three types of text generation can be launched via `python ./deepy.py generate.py -d configs 125M.yml local_setup.yml text_generation.yml` with the appropriate values set in `configs/text_generation.yml`.
587
+
588
+ # Evaluation
589
+
590
+ GPT-NeoX supports evaluation on downstream tasks through the [language model evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness).
591
+
592
+ To evaluate a trained model on the evaluation harness, simply run:
593
+
594
+ ```bash
595
+ python ./deepy.py eval.py -d configs your_configs.yml --eval_tasks task1 task2 ... taskn
596
+ ```
597
+
598
+ where `--eval_tasks` is a list of evaluation tasks followed by spaces, e.g. `--eval_tasks lambada hellaswag piqa sciq`. For details of all tasks available, refer to the [lm-evaluation-harness repo](https://github.com/EleutherAI/lm-evaluation-harness).
599
+
600
+ # Exporting to Hugging Face
601
+
602
+ GPT-NeoX is optimized heavily for training only, and GPT-NeoX model checkpoints are not compatible out of the box with other deep learning libraries. To make models easily loadable and shareable with end users, and for further exporting to various other frameworks, GPT-NeoX supports checkpoint conversion to the [Hugging Face Transformers](https://arxiv.org/abs/1910.03771) format.
603
+
604
+ Though NeoX supports a number of different architectural configurations, including AliBi positional embeddings, not all of these configurations map cleanly onto the supported configurations within Hugging Face Transformers.
605
+
606
+ NeoX supports export of compatible models into the following architectures:
607
+ - GPTNeoXForCausalLM
608
+ - LlamaForCausalLM
609
+ - MistralForCausalLM
610
+
611
+ Training a model which does not fit into one of these Hugging Face Transformers architectures cleanly will require writing custom modeling code for the exported model.
612
+
613
+ To convert a GPT-NeoX library checkpoint to Hugging Face-loadable format, run:
614
+ ```bash
615
+ python ./tools/ckpts/convert_neox_to_hf.py --input_dir /path/to/model/global_stepXXX --config_file your_config.yml --output_dir hf_model/save/location --precision {auto,fp16,bf16,fp32} --architecture {neox,mistral,llama}
616
+ ```
617
+
618
+ Then to upload a model to [the Hugging Face Hub](https://huggingface.co/), run:
619
+ ```bash
620
+ huggingface-cli login
621
+ python ./tools/ckpts/upload.py
622
+ ```
623
+ and input the requested information, including HF hub user token.
624
+
625
+ ### Importing Models Into GPT-NeoX
626
+
627
+ NeoX supplies several utilities for converting a pretrained model checkpoint into a format that can be trained within the library.
628
+
629
+ The following models or model families can be loaded in GPT-NeoX:
630
+ - Llama 1
631
+ - Llama 2
632
+ - CodeLlama
633
+ - Mistral-7b-v0.1
634
+
635
+ We provide two utilities for converting from two different checkpoint formats into a format compatible with GPT-NeoX.
636
+
637
+ To convert a Llama 1 or Llama 2 checkpoint distributed by Meta AI from its original file format (downloadable [here](https://github.com/facebookresearch/llama) or [here](https://huggingface.co/meta-llama/Llama-2-7b)) into the GPT-NeoX library, run
638
+
639
+ ```
640
+ python tools/ckpts/convert_raw_llama_weights_to_neox.py --input_dir /path/to/model/parent/dir/7B --model_size 7B --output_dir /path/to/save/ckpt --num_output_shards <TENSOR_PARALLEL_SIZE> (--pipeline_parallel if pipeline-parallel-size >= 1)
641
+ ```
642
+
643
+
644
+ To convert from a Hugging Face model into a NeoX-loadable format, run `tools/ckpts/convert_hf_to_sequential.py`. See documentation within that file for further options.
645
+
646
+
647
+ # Monitoring
648
+
649
+ In addition to storing logs locally, we provide built-in support for three popular experiment monitoring frameworks: [Weights & Biases](https://wandb.ai/site), [TensorBoard](https://www.tensorflow.org/tensorboard/), and [Comet](https://www.comet.com/site).
650
+
651
+ ## Weights and Biases
652
+
653
+ [Weights & Biases](https://wandb.ai/site) is a machine learning monitoring platform; we use it [to record our experiments](https://wandb.ai/eleutherai/neox). To use wandb to monitor your gpt-neox experiments:
654
+ 1. Create an account at https://wandb.ai/site to generate your API key
655
+ 2. Log into Weights & Biases on your machine&mdash;you can do this by executing `wandb login`&mdash;your runs will automatically be recorded.
656
+ 3. Dependencies required for wandb monitoring can be found in and installed from `./requirements/requirements-wandb.txt`. An example config is provided in `./configs/local_setup_wandb.yml`.
657
+ 4. There are two optional fields associated with Weights & Biases: <code><var>wandb_group</var></code> allows you to name the run group and <code><var>wandb_team</var></code> allows you to assign your runs to an organization or team account. An example config is provided in `./configs/local_setup_wandb.yml`.
658
+
659
+ ## TensorBoard
660
+
661
+ We support using TensorBoard via the <code><var>tensorboard-dir</var></code> field. Dependencies required for TensorBoard monitoring can be found in and installed from `./requirements/requirements-tensorboard.txt`.
662
+
663
+ ## Comet
664
+
665
+ [Comet](https://www.comet.com/site) is a machine learning monitoring platform. To use comet to monitor your gpt-neox experiments:
666
+ 1. Create an account at https://www.comet.com/login to generate your API key.
667
+ 2. Once generated, link your API key at runtime by running `comet login` or passing `export COMET_API_KEY=<your-key-here>`
668
+ 3. Install `comet_ml` and any dependency libraries via `pip install -r requirements/requirements-comet.txt`
669
+ 4. Enable Comet with `use_comet: True`. You can also customize where data is being logged with `comet_workspace` and `comet_project`. A full example config with comet enabled is provided in `configs/local_setup_comet.yml`.
670
+ 5. Run your experiment, and monitor metrics in the Comet workspace that you passed!
671
+
672
+ # Running on multi-node
673
+
674
+ If you need to supply a hostfile for use with the MPI-based DeepSpeed launcher, you can set the environment variable `DLTS_HOSTFILE` to point to the hostfile.
675
+
676
+ # Profiling
677
+
678
+ We support profiling with Nsight Systems, the PyTorch Profiler, and PyTorch Memory Profiling.
679
+
680
+ ## Nsight Systems Profiling
681
+
682
+ To use the Nsight Systems profiling, set config options `profile`, `profile_step_start`, and `profile_step_stop` (see [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/neox_arguments.md) for argument usage, and [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/prof.yml) for a sample config).
683
+
684
+ To populate nsys metrics, launch training with:
685
+
686
+ ```
687
+ nsys profile -s none -t nvtx,cuda -o <path/to/profiling/output> --force-overwrite true \
688
+ --capture-range=cudaProfilerApi --capture-range-end=stop python $TRAIN_PATH/deepy.py \
689
+ $TRAIN_PATH/train.py --conf_dir configs <config files>
690
+ ```
691
+
692
+ The generated output file can then be viewed with the Nsight Systems GUI:
693
+
694
+ ![nsight-prof](images/nsight_profiling.png)
695
+
696
+ ## PyTorch Profiling
697
+
698
+ To use the built-in PyTorch profiler, set config options `profile`, `profile_step_start`, and `profile_step_stop` (see [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/neox_arguments.md) for argument usage, and [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/prof.yml) for a sample config).
699
+
700
+ The PyTorch profiler will save traces to your `tensorboard` log directory. You can view these traces within
701
+ TensorBoard by following the steps [here](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html).
702
+
703
+ ![torch-prof](images/pytorch_profiling.png)
704
+
705
+ ## PyTorch Memory Profiling
706
+
707
+ To use PyTorch Memory Profiling, set config options `memory_profiling` and `memory_profiling_path` (see [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/neox_arguments.md) for argument usage, and [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/prof.yml) for a sample config).
708
+
709
+ ![mem-prof](images/memory_profiling.png)
710
+
711
+ View the generated profile with the [memory_viz.py](https://github.com/pytorch/pytorch/blob/main/torch/cuda/_memory_viz.py) script. Run with:
712
+
713
+ ```
714
+ python _memory_viz.py trace_plot <generated_profile> -o trace.html
715
+ ```
716
+
717
+ # Adoption and Publications
718
+
719
+ The GPT-NeoX library has been widely adopted by academic and industry researchers and ported on to many HPC systems.
720
+
721
+ If you have found this library useful in your research, please reach out and let us know! We would love to add you to our lists.
722
+
723
+ ## Publications
724
+
725
+ EleutherAI and our collaborators have used it in the following publications:
726
+ - **Sid Black**, **Stella Biderman**, **Eric Hallahan**, **Quentin Anthony**, **Leo Gao**, **Laurence Golding**, **Horace He**, **Connor Leahy**, **Kyle McDonell**, **Jason Phang**, **Michael Pieler**, **Shivanshu Purohit**, **Laria Reynolds**, **Jon Tow**, **Ben Wang**, and **Samuel Weinbach**. "[GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745)." In *Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models*, 2022.
727
+ - **Stella Biderman**, **Hailey Schoelkopf**, **Quentin Anthony**, **Herbie Bradley**, **Kyle O'Brien**, **Eric Hallahan**, **Mohammad Aflah Khan**, **Shivanshu Purohit**, **USVSN Sai Prashanth**, Edward Raff, **Aviya Skowron**, **Lintang Sutawika**, **Oskar van der Wal**. "[Pythia: A suite for analyzing large language models across training and scaling](https://arxiv.org/abs/2304.01373)." In _International Conference on Machine Learning_, pp. 2397-2430. _PMLR_, 2023.
728
+ - Zhangir Azerbayev, Bartosz Piotrowski, **Hailey Schoelkopf**, Edward W. Ayers, Dragomir Radev, and Jeremy Avigad. "[Proofnet: Autoformalizing and formally proving undergraduate-level mathematics](https://arxiv.org/abs/2302.12433)." *arXiv preprint arXiv:2302.12433*, 2023.
729
+ - **Stella Biderman**, **USVSN Sai Prashanth**, **Lintang Sutawika**, **Hailey Schoelkopf**, **Quentin Anthony**, **Shivanshu Purohit**, and Edward Raff. "[Emergent and predictable memorization in large language models.](https://arxiv.org/abs/2304.11158)" In _Neural Information Processing Systems_, 2023.
730
+ - **Hyunwoong Ko**, **Kichang Yang**, **Minho Ryu**, **Taekyoon Choi**, **Seungmu Yang,** and Sungho Park. "[A Technical Report for Polyglot-Ko: Open-Source Large-Scale Korean Language Models](https://arxiv.org/abs/2306.02254)." *arXiv preprint arXiv:2306.02254*, 2023.
731
+ - Kshitij Gupta, Benjamin Thérien, Adam Ibrahim, Mats Leon Richter, **Quentin Anthony**, Eugene Belilovsky, Irina Rish, and Timothée Lesort. "[Continual Pre-Training of Large Language Models: How to re-warm your model?](https://arxiv.org/abs/2308.04014)" In _Workshop on Efficient Systems for Foundation Models @ ICML_, 2023.
732
+ - **Zhangir Azerbayev**, **Hailey Schoelkopf**, Keiran Paster, Marco Dos Santos, Stephen McAleer, Albert Q Jiang, Jia Deng, **Stella Biderman**, and Sean Welleck. "[Llemma: An open language model for mathematics](https://arxiv.org/abs/2310.10631)." In _Math-AI Workshop @ NeurIPS_, 2023.
733
+ - Alexander Havrilla, Maksym Zhuravinskyi, Duy Phung, Aman Tiwari, Jonathan Tow, **Stella Biderman**, **Quentin Anthony**, and **Louis Castricato**. "[trlX: A Framework for Large Scale Reinforcement Learning from Human Feedback](https://aclanthology.org/2023.emnlp-main.530/)." In _Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing_, 2023.
734
+ - **Quentin Anthony**, **Jacob Hatef**, Deepak Narayanan, **Stella Biderman**, Stas Bekman, Junqi Yin, Aamir Shafi, Hari Subramoni, and Dhabaleswar Panda. "[The Case for Co-Designing Model Architectures with Hardware](https://arxiv.org/abs/2401.14489)." In _arXiv preprint_, 2024.
735
+ - Adam Ibrahim, Benjamin Thérien, Kshitij Gupta, Mats L. Richter, **Quentin Anthony**, Timothée Lesort, Eugene Belilovsky, Irina Rish. "[Simple and Scalable Strategies to Continually Pre-train Large Language Models](https://arxiv.org/abs/2403.08763)." In _arXiv preprint_, 2024.
736
+ - Junqi Yin, Avishek Bose, Guojing Cong, Isaac Lyngaas, **Quentin Anthony**. "[Comparative Study of Large Language Model Architectures on Frontier](https://arxiv.org/abs/2402.00691)." In _arXiv preprint_, 2024.
737
+
738
+ The following publications by other research groups use this library:
739
+ - Ta-Chung Chi, Ting-Han Fan, Peter J. Ramadge, and Alexander Rudnicky. "[KERPLE: Kernelized Relative Positional Embedding for Length Extrapolation](https://arxiv.org/abs/2205.09921)." In *Advances in Neural Information Processing Systems* 35, 2022.
740
+ - Sameera Horawalavithana, Ellyn Ayton, Shivam Sharma, Scott Howland, Megha Subramanian, Scott Vasquez, Robin Cosbey, Maria Glenski, and Svitlana Volkova. "[Foundation Models of Scientific Knowledge for Chemistry: Opportunities, Challenges and Lessons Learned](https://aclanthology.org/2022.bigscience-1.12/)." In *Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models*, 2022.
741
+ - Sophia Kolak, Ruben Martins, Claire Le Goues, and Vincent J. Hellendoorn. "[Patch Generation with Language Models: Feasibility and Scaling Behavior](https://par.nsf.gov/biblio/10340618)." In *Proceedings of the Deep Learning for Code Workshop at ICLR*, 2022.
742
+ - Frank F. Xu, Uri Alon, Graham Neubig, and Vincent J. Hellendoorn. "[A Systematic Evaluation of Large Language Models of Code](https://arxiv.org/abs/2202.13169)." In *Proceedings of the ICLR Workshop on Deep Learning For Code*, 2022.
743
+ - Byung-Doh Oh and William Schuler. "[Transformer-Based LM Surprisal Predicts Human Reading Times Best with About Two Billion Training Tokens](https://arxiv.org/abs/2304.11389)." In *Findings of the Association for Computational Linguistics*, 2023.
744
+ - Ta-Chung Chi, Ting-Han Fan, Alexander Rudnicky, and Peter Ramadge. "[Dissecting Transformer Length Extrapolation via the Lens of Receptive Field Analysis](https://aclanthology.org/2023.acl-long.756/)." In _Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)_, pp. 13522-13537, 2023.
745
+ - Ta-Chung Chi, Ting-Han Fan, Li-Wei Chen, Alexander Rudnicky, and Peter Ramadge. "[Latent Positional Information is in the Self-Attention Variance of Transformer Language Models Without Positional Embeddings](https://aclanthology.org/2023.acl-short.102/)." In _Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)_, pp. 13522-13537, 2023.
746
+ - Xidong Feng, Yicheng Luo, Ziyan Wang, Hongrui Tang, Mengyue Yang, Kun Shao, David Mguni, Yali Du, and Jun Wang. "[ChessGPT: Bridging Policy Learning and Language Modeling.](https://arxiv.org/abs/2306.09200)" _arXiv preprint arXiv:2306.09200_, 2023.
747
+ - Orion Walker Dollar, Sameera Horawalavithana, Scott Vasquez, W. James Pfaendtner, and Svitlana Volkova. "[MolJET: Multimodal Joint Embedding Transformer for Conditional de novo Molecular Design and Multi-Property Optimization.](https://openreview.net/pdf?id=7UudBVsIrr)" _preprint under review_, 2023.
748
+ - Jean Kaddour and Qi Liu. "[Text Data Augmentation in Low-Resource Settings via Fine-Tuning of Large Language Models](https://arxiv.org/abs/2310.01119)." _arXiv:2310.01119_, 2023.
749
+ - Alon Albalak, Liangming Pan, Colin Raffel, and William Yang Wang. "[Efficient Online Data Mixing For Language Model Pre-Training](https://arxiv.org/abs/2312.02406)." In _NeurIPS Workshop on R0-FoMo: Robustness of Few-shot and Zero-shot Learning in Large Foundation Models_, 2023.
750
+ - Eghbal A. Hosseini and Evelina Fedorenko. "[Large language models implicitly learn to straighten neural sentence trajectories to construct a predictive representation of natural language](https://www.biorxiv.org/content/10.1101/2023.11.05.564832v1)." In _Neural Information Processing Systems_, 2023.
751
+ - Junqi Yin, Sajal Dash, Feiyi Wang, and Mallikarjun Shankar. "[FORGE: Pre-Training Open Foundation Models for Science](https://dl.acm.org/doi/abs/10.1145/3581784.3613215)." In _Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis_, 1-13, 2023.
752
+ - Jean Kaddour and Qi Liu. "[Text Data Augmentation in Low-Resource Settings via Fine-Tuning of Large Language Models](https://arxiv.org/abs/2310.01119)." In _arXiv preprint arXiv:2310.01119_, 2023.
753
+ - Peng Di, Jianguo Li, Hang Yu, Wei Jiang, Wenting Cai, Yang Cao, Chaoyu Chen, Dajun Chen, Hongwei Chen, Liang Chen, Gang Fan, Jie Gong, Zi Gong, Wen Hu, Tingting Guo, Zhichao Lei, Ting Li, Zheng Li, Ming Liang, Cong Liao, Bingchang Liu, Jiachen Liu, Zhiwei Liu, Shaojun Lu, Min Shen, Guangpei Wang, Huan Wang, Zhi Wang, Zhaogui Xu, Jiawei Yang, Qing Ye, Gehao Zhang, Yu Zhang, Zelin Zhao, Xunjin Zheng, Hailian Zhou, Lifu Zhu, and Xianying Zhu. "[CodeFuse-13B: A Pretrained Multi-lingual Code Large Language Model](https://arxiv.org/abs/2310.06266)." In _arXiv preprint arXiv:2310.06266_, 2023.
754
+ - Nikitha Rao, Kush Jain, Uri Alon, Claire Le Goues, and Vincent J Hellendoorn. "[CAT-LM Training Language Models on Aligned Code And Tests](https://arxiv.org/abs/2310.01602)." In _38th IEEE/ACM International Conference on Automated Software Engineering (ASE)_, pp. 409-420. IEEE, 2023.
755
+ - Pratyush Patel, Esha Choukse, Chaojie Zhang, Íñigo Goiri, Brijesh Warrier, Nithish Mahalingam, Ricardo Bianchini. "[POLCA: Power Oversubscription in LLM Cloud Providers](https://arxiv.org/abs/2308.12908)." In _arXiv preprint_, 2023.
756
+ - Junqi Yin, Sajal Dash, John Gounley, Feiyi Wang, and Georgia Tourassi. "[Evaluation of pre-training large language models on leadership-class supercomputers](https://link.springer.com/article/10.1007/s11227-023-05479-7)." In _the Journal of Supercomputing_ 79, no. 18, 2023.
757
+ - Tal Kadosh, Niranjan Hasabnis, Vy A. Vo, Nadav Schneider, Neva Krien, Mihai Capota, Abdul Wasay, Nesreen Ahmed, Ted Willke, Guy Tamir, Yuval Pinter, Timothy Mattson, and Gal Oren. "[Domain-Specific Code Language Models: Unraveling the Potential for HPC Codes and Tasks](https://arxiv.org/abs/2312.13322)." In _arXiv preprint_, 2023.
758
+ - Guobin Shen, Dongcheng Zhao, Yiting Dong, Yang Li, Jindong Li, Kang Sun, and Yi Zeng. "[Astrocyte-Enabled Advancements in Spiking Neural Networks for Large Language Modeling](https://arxiv.org/abs/2312.07625)." In _arXiv preprint_, 2023.
759
+ - Eghbal A. Hosseini, Martin A. Schrimpf, Yian Zhang, Samuel Bowman, Noga Zaslavsky, and Evelina Fedorenko. "[Artificial neural network language models align neurally and behaviorally with humans even after a developmentally realistic amount of training.](https://www.biorxiv.org/content/10.1101/2022.10.04.510681)" In _Neurobiology of Language_, 2024.
760
+ - Xiongye Xiao, Chenyu Zhou, Heng Ping, Defu Cao, Yaxing Li, Yizhuo Zhou, Shixuan Li, and Paul Bogdan. "[Exploring Neuron Interactions and Emergence in LLMs: From the Multifractal Analysis Perspective](https://arxiv.org/abs/2402.09099)." In _arXiv preprint_, 2024.
761
+ - Zhiyuan Zeng, Qipeng Guo, Zhaoye Fei, Zhangyue Yin, Yunhua Zhou, Linyang Li, Tianxiang Sun, Hang Yan, Dahua Lin, and Xipeng Qiu. "[Turn Waste into Worth: Rectifying Top-k Router of MoE](https://arxiv.org/abs/2402.12399)." In _arXiv preprint_, 2024.
762
+
763
+ ## Models
764
+ The following models were trained using this library:
765
+
766
+ ### English LLMs
767
+ - EleutherAI's [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b) and [Pythia (70M through 12B)](https://github.com/EleutherAI/pythia)
768
+ - CarperAI's [FIM-NeoX-1.3B](https://huggingface.co/CarperAI/FIM-NeoX-1.3B)
769
+ - StabilityAI's [StableLM (3B and 7B)](https://github.com/Stability-AI/StableLM)
770
+ - Together.ai's [RedPajama-INCITE (3B and 7B)](https://together.ai/blog/redpajama-models-v1)
771
+ - Carnegie Mellon University's [proofGPT (1.3B and 6.7B)](https://huggingface.co/hoskinson-center/proofGPT-v0.1-6.7B)
772
+ - Dampish's [StellarX (2.8B and 4B)](https://huggingface.co/Dampish/StellarX-4B-V0.2)
773
+ - Chinese Academy of Sciences's [AstroSNN (1.5B)](https://arxiv.org/abs/2312.07625)
774
+
775
+ ### Non-English LLMs
776
+ - EleutherAI's [Polyglot-Ko (1.3B through 12.8B)](https://github.com/EleutherAI/polyglot) (Korean)
777
+ - Korea University's [KULLM-Polyglot (5.8B and 12.8B)](https://github.com/nlpai-lab/KULLM) (Korean)
778
+ - Stability AI's [Japanese Stable LM (7B)](https://huggingface.co/stabilityai/japanese-stablelm-base-alpha-7b) (Japanese)
779
+ - LearnItAnyway's [LLaVA-Polyglot-Ko (1.3B)](https://huggingface.co/LearnItAnyway/llava-polyglot-ko-1.3b-hf) (Korean)
780
+ - Rinna Co.'s [japanese-gpt-neox-3.6b](https://huggingface.co/rinna/japanese-gpt-neox-3.6b) (Japanese) and [bilingual-gpt-neox-4b](https://huggingface.co/rinna/bilingual-gpt-neox-4b) (English / Japanese)
781
+ - CyberAgent's [OpenCALM (125M through 7B)](https://huggingface.co/cyberagent/open-calm-7b) (Japanese)
782
+ - The Hungarian Research Centre for Linguistics's [PULI GPTrio (6.7B)](https://huggingface.co/NYTK/PULI-GPTrio) (Hungarian / English / Chinese)
783
+ - The University of Tokyo's [weblab-10b](https://huggingface.co/Kojima777/weblab-10b) and [weblab-10b-instruct](https://huggingface.co/Kojima777/weblab-10b-instruction-sft) (Japanese)
784
+ - nolano.ai's [Hi-NOLIN (9B)](https://blog.nolano.ai/Hi-NOLIN/) (English, Hindi)
785
+ - Renmin University of China's [YuLan (12B)](https://huggingface.co/yulan-team/YuLan-Base-12b) (English, Chinese)
786
+ - The Basque Center for Language Technology's [Latxa (70B)](https://huggingface.co/HiTZ/latxa-70b-v1.2) (Basque)
787
+
788
+ ### Code Models
789
+ - Carnegie Mellon University's [PolyCoder (160M through 2.7B)](https://github.com/VHellendoorn/Code-LMs) and [CAT-LM (2.7B)](https://huggingface.co/nikitharao/catlm)
790
+ - StabilityAI's [StableCode (1.3B)](https://stability.ai/blog/stablecode-llm-generative-ai-coding) and [StableCode-Completion-Alpha (3B)](https://stability.ai/blog/stablecode-llm-generative-ai-coding)
791
+ - CodeFuse AI's [CodeFuse (13B)](https://huggingface.co/codefuse-ai/CodeFuse-13B)
792
+
793
+ ### AI for Science
794
+ - EleutherAI's [LLeMMA (34B)](https://arxiv.org/abs/2310.10631)
795
+ - Oak Ridge National Lab's [FORGE (26B)](https://github.com/at-aaims/forge)
796
+ - Oak Ridge National Lab's [Unnamed Material Science Domain Models (7B)](https://arxiv.org/abs/2402.00691)
797
+ - Pacific Northwest National Lab's [MolJet (undisclosed size)](https://openreview.net/pdf?id=7UudBVsIrr)
798
+
799
+ ### Other Modalities
800
+ - Rinna Co.'s [PSLM (7B)](https://arxiv.org/abs/2406.12428) (speech / text)
801
+ - University College London's [ChessGPT-3B](https://huggingface.co/Waterhorse/chessgpt-base-v1)
802
+ - Gretel's [Text-to-Table (3B)](https://huggingface.co/gretelai/text2table)
803
+
804
+ # Administrative Notes
805
+
806
+ ## Citing GPT-NeoX
807
+
808
+ If you have found the GPT-NeoX library helpful in your work, you can cite this repository as
809
+
810
+ ```bibtex
811
+ @software{gpt-neox-library,
812
+ title = {{GPT-NeoX: Large Scale Autoregressive Language Modeling in PyTorch}},
813
+ author = {Andonian, Alex and Anthony, Quentin and Biderman, Stella and Black, Sid and Gali, Preetham and Gao, Leo and Hallahan, Eric and Levy-Kramer, Josh and Leahy, Connor and Nestler, Lucas and Parker, Kip and Pieler, Michael and Phang, Jason and Purohit, Shivanshu and Schoelkopf, Hailey and Stander, Dashiell and Songz, Tri and Tigges, Curt and Thérien, Benjamin and Wang, Phil and Weinbach, Samuel},
814
+ url = {https://www.github.com/eleutherai/gpt-neox},
815
+ doi = {10.5281/zenodo.5879544},
816
+ month = {9},
817
+ year = {2023},
818
+ version = {2.0.0},
819
+ }
820
+ ```
821
+
822
+ To cite the 20 billion parameter model named `GPT-NeoX-20B`, please use
823
+
824
+ ```bibtex
825
+ @inproceedings{gpt-neox-20b,
826
+ title={{GPT-NeoX-20B}: An Open-Source Autoregressive Language Model},
827
+ author={Black, Sid and Biderman, Stella and Hallahan, Eric and Anthony, Quentin and Gao, Leo and Golding, Laurence and He, Horace and Leahy, Connor and McDonell, Kyle and Phang, Jason and Pieler, Michael and Prashanth, USVSN Sai and Purohit, Shivanshu and Reynolds, Laria and Tow, Jonathan and Wang, Ben and Weinbach, Samuel},
828
+ booktitle={Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models},
829
+ url={https://arxiv.org/abs/2204.06745},
830
+ year={2022}
831
+ }
832
+ ```
833
+
834
+ ## Contributing
835
+ GPT-NeoX is built by the open-source AI community, and relies on our amazing contributors! Please see our
836
+ [contributing](CONTRIBUTING.md) guide for more details on our CLA, code formatting, testing,
837
+ etc.
838
+
839
+ ## Licensing
840
+
841
+ This repository hosts code that is part of EleutherAI's GPT-NeoX project. Copyright (c) 2024, EleutherAI. Licensed under the Apache License:
842
+
843
+ Licensed under the Apache License, Version 2.0 (the "License");
844
+ you may not use this file except in compliance with the License.
845
+ You may obtain a copy of the License at
846
+
847
+ http://www.apache.org/licenses/LICENSE-2.0
848
+
849
+ Unless required by applicable law or agreed to in writing, software
850
+ distributed under the License is distributed on an "AS IS" BASIS,
851
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
852
+ See the License for the specific language governing permissions and
853
+ limitations under the License.
854
+
855
+ This repository is based on code written by NVIDIA that is licensed under the Apache License, Version 2.0. In accordance with the Apache License, all files that are modifications of code originally written by NVIDIA maintain an NVIDIA copyright header. All files that do not contain such a header are the exclusive copyright of EleutherAI. When the NVIDIA code has been modified from its original version, that fact is noted in the copyright header. All derivative works of this repository must preserve these headers under the terms of the Apache License.
856
+
857
+ This repository also contains code written by a number of other authors. Such contributions are marked and the relevant licensing is included where appropriate.
858
+
859
+ For full terms, see the `LICENSE` file. If you have any questions, comments, or concerns about licensing please email us at contact@eleuther.ai.
860
+
861
+ ## Acknowledgements
862
+
863
+ We run our experiments on a Kubernetes cluster provided by [CoreWeave](https://coreweave.com/) and a Slurm cluster provided by [Stability AI](https://stability.ai). We are thankful to the DeepSpeed team for their advice and consultation.
configs/1-3B.yml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GPT-2 pretraining setup
2
+ {
3
+ # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4
+ # across the node boundaries )
5
+ "pipe_parallel_size": 1,
6
+ "model_parallel_size": 1,
7
+
8
+ # model settings
9
+ "num_layers": 24,
10
+ "hidden_size": 2048,
11
+ "num_attention_heads": 16,
12
+ "seq_length": 2048,
13
+ "max_position_embeddings": 2048,
14
+ "norm": "layernorm",
15
+ "pos_emb": "rotary",
16
+ "no_weight_tying": true,
17
+ "gpt_j_residual": false,
18
+ "output_layer_parallelism": "column",
19
+
20
+ # these should provide some speedup but take a while to build; set to true if desired
21
+ "scaled_upper_triang_masked_softmax_fusion": false,
22
+ "bias_gelu_fusion": false,
23
+ "rope_fusion": false,
24
+ "layernorm_fusion": false,
25
+
26
+ # init methods
27
+ "init_method": "small_init",
28
+ "output_layer_init_method": "wang_init",
29
+
30
+ # optimizer settings
31
+ "optimizer": {
32
+ "type": "Adam",
33
+ "params": {
34
+ "lr": 0.0002,
35
+ "betas": [0.9, 0.95],
36
+ "eps": 1.0e-8,
37
+ }
38
+ },
39
+ "min_lr": 0.00002,
40
+
41
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
42
+ "zero_optimization": {
43
+ "stage": 1,
44
+ "allgather_partitions": True,
45
+ "allgather_bucket_size": 500000000,
46
+ "overlap_comm": True,
47
+ "reduce_scatter": True,
48
+ "reduce_bucket_size": 500000000,
49
+ "contiguous_gradients": True,
50
+ },
51
+
52
+ # batch / data settings
53
+ "train_micro_batch_size_per_gpu": 4,
54
+ "data_impl": "mmap",
55
+
56
+ # activation checkpointing
57
+ "checkpoint_activations": true,
58
+ "checkpoint_num_layers": 1,
59
+ "partition_activations": true,
60
+ "synchronize_each_layer": true,
61
+
62
+ # regularization
63
+ "gradient_clipping": 1.0,
64
+ "weight_decay": 0.1,
65
+ "hidden_dropout": 0,
66
+ "attention_dropout": 0,
67
+
68
+ # precision settings
69
+ "fp16": {
70
+ "fp16": true,
71
+ "enabled": true,
72
+ "loss_scale": 0,
73
+ "loss_scale_window": 1000,
74
+ "hysteresis": 2,
75
+ "min_loss_scale": 1
76
+ },
77
+
78
+ # misc. training settings
79
+ "train_iters": 320000,
80
+ "lr_decay_iters": 320000,
81
+ "distributed_backend": "nccl",
82
+ "lr_decay_style": "cosine",
83
+ "warmup": 0.01,
84
+ "checkpoint_factor": 10000,
85
+ "eval_interval": 1000,
86
+ "eval_iters": 10,
87
+
88
+ # logging
89
+ "log_interval": 100,
90
+ "steps_per_print": 10,
91
+ "keep_last_n_checkpoints": 4,
92
+ "wall_clock_breakdown": true,
93
+ }
configs/125M-dmoe.yml ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GPT-2 pretraining setup
2
+ {
3
+ # See README for MoE config docs!
4
+ "moe_type": "megablocks",
5
+ "moe_token_dropping": false,
6
+ # Have 4 experts per layer (every 2 layers by default)
7
+ "moe_num_experts": 4,
8
+ # parallelism settings
9
+ "enable_expert_tensor_parallelism": true,
10
+ "pipe_parallel_size": 1, # not yet supported for MoE
11
+ "model_parallel_size": 1,
12
+ "moe_expert_parallel_size": 1,
13
+
14
+ # model settings
15
+ "num_layers": 12,
16
+ "hidden_size": 768,
17
+ "num_attention_heads": 12,
18
+ "seq_length": 2048,
19
+ "max_position_embeddings": 2048,
20
+ "norm": "layernorm",
21
+ "pos_emb": "rotary",
22
+ "no_weight_tying": true,
23
+ "gpt_j_residual": false,
24
+ "output_layer_parallelism": "column",
25
+
26
+ # these should provide some speedup but take a while to build; set to true if desired
27
+ "scaled_upper_triang_masked_softmax_fusion": false,
28
+ "bias_gelu_fusion": false,
29
+ "rope_fusion": false,
30
+
31
+ # init methods
32
+ "init_method": "small_init",
33
+ "output_layer_init_method": "wang_init",
34
+
35
+
36
+ # optimizer settings
37
+ "optimizer": {
38
+ "type": "Adam",
39
+ "params": {
40
+ "lr": 0.0006,
41
+ "betas": [0.9, 0.95],
42
+ "eps": 1.0e-8,
43
+ }
44
+ },
45
+ "min_lr": 0.00006,
46
+
47
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
48
+ "zero_optimization": {
49
+ "stage": 0,
50
+ "allgather_partitions": True,
51
+ "allgather_bucket_size": 500000000,
52
+ "overlap_comm": True,
53
+ "reduce_scatter": True,
54
+ "reduce_bucket_size": 500000000,
55
+ "contiguous_gradients": True,
56
+ },
57
+
58
+ # batch / data settings
59
+ "train_micro_batch_size_per_gpu": 4,
60
+ "data_impl": "mmap",
61
+
62
+ # activation checkpointing
63
+ "checkpoint_activations": true,
64
+ "checkpoint_num_layers": 1,
65
+ "partition_activations": true,
66
+ "synchronize_each_layer": true,
67
+
68
+ # regularization
69
+ "gradient_clipping": 1.0,
70
+ "weight_decay": 0.1,
71
+ "hidden_dropout": 0.0,
72
+ "attention_dropout": 0.0,
73
+
74
+ # precision settings
75
+ "fp16": {
76
+ "enabled": true,
77
+ "loss_scale": 0,
78
+ "loss_scale_window": 1000,
79
+ "hysteresis": 2,
80
+ "min_loss_scale": 1
81
+ },
82
+
83
+ # misc. training settings
84
+ "train_iters": 320000,
85
+ "lr_decay_iters": 320000,
86
+ "distributed_backend": "nccl",
87
+ "lr_decay_style": "cosine",
88
+ "warmup": 0.01,
89
+ "checkpoint_factor": 10000,
90
+ "eval_interval": 1000,
91
+ "eval_iters": 10,
92
+
93
+ # logging
94
+ "log_interval": 10,
95
+ "steps_per_print": 10,
96
+ "keep_last_n_checkpoints": 4,
97
+ "wall_clock_breakdown": true,
98
+
99
+ # networking
100
+ "hostfile": "/mock_path"
101
+ }
configs/125M-json.yml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "pipe_parallel_size": 1,
3
+ "model_parallel_size": 1,
4
+
5
+ "num_layers": 12,
6
+ "hidden_size": 768,
7
+ "num_attention_heads": 12,
8
+ "seq_length": 2048,
9
+ "max_position_embeddings": 2048,
10
+ "norm": "layernorm",
11
+ "pos_emb": "rotary",
12
+ "no_weight_tying": true,
13
+ "gpt_j_residual": false,
14
+ "output_layer_parallelism": "column",
15
+
16
+ "scaled_upper_triang_masked_softmax_fusion": false,
17
+ "bias_gelu_fusion": false,
18
+ "rope_fusion": false,
19
+ "layernorm_fusion": false,
20
+
21
+ "init_method": "small_init",
22
+ "output_layer_init_method": "wang_init",
23
+
24
+ "optimizer": {
25
+ "type": "Adam",
26
+ "params": {
27
+ "lr": 0.0006,
28
+ "betas": [0.9, 0.95],
29
+ "eps": 1.0e-8
30
+ }
31
+ },
32
+ "min_lr": 0.00006,
33
+
34
+ "zero_optimization": {
35
+ "stage": 1,
36
+ "allgather_partitions": true,
37
+ "allgather_bucket_size": 500000000,
38
+ "overlap_comm": true,
39
+ "reduce_scatter": true,
40
+ "reduce_bucket_size": 500000000,
41
+ "contiguous_gradients": true
42
+ },
43
+
44
+ "train_micro_batch_size_per_gpu": 4,
45
+ "data_impl": "mmap",
46
+
47
+ "checkpoint_activations": true,
48
+ "checkpoint_num_layers": 1,
49
+ "partition_activations": true,
50
+ "synchronize_each_layer": true,
51
+
52
+ "gradient_clipping": 1.0,
53
+ "weight_decay": 0.1,
54
+ "hidden_dropout": 0.0,
55
+ "attention_dropout": 0.0,
56
+
57
+ "fp16": {
58
+ "enabled": true,
59
+ "loss_scale": 0,
60
+ "loss_scale_window": 1000,
61
+ "hysteresis": 2,
62
+ "min_loss_scale": 1
63
+ },
64
+
65
+ "train_iters": 320000,
66
+ "lr_decay_iters": 320000,
67
+ "distributed_backend": "nccl",
68
+ "lr_decay_style": "cosine",
69
+ "warmup": 0.01,
70
+ "checkpoint_factor": 10000,
71
+ "eval_interval": 1000,
72
+ "eval_iters": 10,
73
+
74
+ "log_interval": 100,
75
+ "steps_per_print": 10,
76
+ "keep_last_n_checkpoints": 4,
77
+ "wall_clock_breakdown": true,
78
+
79
+ "hostfile": "/mock_path"
80
+ }
configs/125M-moe.yml ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GPT-2 pretraining setup
2
+ {
3
+ # See README for MoE config docs!
4
+ "moe_type": "deepspeed",
5
+ "moe_token_dropping": true,
6
+ # Have 4 experts per layer (every 2 layers by default)
7
+ "moe_num_experts": 4,
8
+ # parallelism settings
9
+ "enable_expert_tensor_parallelism": true,
10
+ "pipe_parallel_size": 1, # not yet supported for MoE
11
+ "model_parallel_size": 1,
12
+ "moe_expert_parallel_size": 1,
13
+
14
+ # model settings
15
+ "num_layers": 12,
16
+ "hidden_size": 768,
17
+ "num_attention_heads": 12,
18
+ "seq_length": 2048,
19
+ "max_position_embeddings": 2048,
20
+ "norm": "layernorm",
21
+ "pos_emb": "rotary",
22
+ "no_weight_tying": true,
23
+ "gpt_j_residual": false,
24
+ "output_layer_parallelism": "column",
25
+
26
+ # these should provide some speedup but take a while to build; set to true if desired
27
+ "scaled_upper_triang_masked_softmax_fusion": false,
28
+ "bias_gelu_fusion": false,
29
+ "rope_fusion": false,
30
+
31
+ # init methods
32
+ "init_method": "small_init",
33
+ "output_layer_init_method": "wang_init",
34
+
35
+
36
+ # optimizer settings
37
+ "optimizer": {
38
+ "type": "Adam",
39
+ "params": {
40
+ "lr": 0.0006,
41
+ "betas": [0.9, 0.95],
42
+ "eps": 1.0e-8,
43
+ }
44
+ },
45
+ "min_lr": 0.00006,
46
+
47
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
48
+ "zero_optimization": {
49
+ "stage": 1,
50
+ "allgather_partitions": True,
51
+ "allgather_bucket_size": 500000000,
52
+ "overlap_comm": True,
53
+ "reduce_scatter": True,
54
+ "reduce_bucket_size": 500000000,
55
+ "contiguous_gradients": True,
56
+ },
57
+
58
+ # batch / data settings
59
+ "train_micro_batch_size_per_gpu": 4,
60
+ "data_impl": "mmap",
61
+
62
+ # activation checkpointing
63
+ "checkpoint_activations": true,
64
+ "checkpoint_num_layers": 1,
65
+ "partition_activations": true,
66
+ "synchronize_each_layer": true,
67
+
68
+ # regularization
69
+ "gradient_clipping": 1.0,
70
+ "weight_decay": 0.1,
71
+ "hidden_dropout": 0.0,
72
+ "attention_dropout": 0.0,
73
+
74
+ # precision settings
75
+ "fp16": {
76
+ "enabled": true,
77
+ "loss_scale": 0,
78
+ "loss_scale_window": 1000,
79
+ "hysteresis": 2,
80
+ "min_loss_scale": 1
81
+ },
82
+
83
+ # misc. training settings
84
+ "train_iters": 320000,
85
+ "lr_decay_iters": 320000,
86
+ "distributed_backend": "nccl",
87
+ "lr_decay_style": "cosine",
88
+ "warmup": 0.01,
89
+ "checkpoint_factor": 10000,
90
+ "eval_interval": 1000,
91
+ "eval_iters": 10,
92
+
93
+ # logging
94
+ "log_interval": 10,
95
+ "steps_per_print": 10,
96
+ "keep_last_n_checkpoints": 4,
97
+ "wall_clock_breakdown": true,
98
+
99
+ # networking
100
+ "hostfile": "/mock_path"
101
+ }
configs/125M.yml ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GPT-2 pretraining setup
2
+ {
3
+ # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4
+ # across the node boundaries )
5
+ "pipe_parallel_size": 1,
6
+ "model_parallel_size": 1,
7
+
8
+ # model settings
9
+ "num_layers": 12,
10
+ "hidden_size": 768,
11
+ "num_attention_heads": 12,
12
+ "seq_length": 2048,
13
+ "max_position_embeddings": 2048,
14
+ "norm": "layernorm",
15
+ "pos_emb": "rotary",
16
+ "no_weight_tying": true,
17
+ "gpt_j_residual": false,
18
+ "output_layer_parallelism": "column",
19
+
20
+ # these should provide some speedup but take a while to build; set to true if desired
21
+ "scaled_upper_triang_masked_softmax_fusion": false,
22
+ "bias_gelu_fusion": false,
23
+ "rope_fusion": false,
24
+ "layernorm_fusion": false,
25
+
26
+ # init methods
27
+ "init_method": "small_init",
28
+ "output_layer_init_method": "wang_init",
29
+
30
+
31
+ # optimizer settings
32
+ "optimizer": {
33
+ "type": "Adam",
34
+ "params": {
35
+ "lr": 0.0006,
36
+ "betas": [0.9, 0.95],
37
+ "eps": 1.0e-8,
38
+ }
39
+ },
40
+ "min_lr": 0.00006,
41
+
42
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
43
+ "zero_optimization": {
44
+ "stage": 1,
45
+ "allgather_partitions": True,
46
+ "allgather_bucket_size": 500000000,
47
+ "overlap_comm": True,
48
+ "reduce_scatter": True,
49
+ "reduce_bucket_size": 500000000,
50
+ "contiguous_gradients": True,
51
+ },
52
+
53
+ # batch / data settings
54
+ "train_micro_batch_size_per_gpu": 4,
55
+ "data_impl": "mmap",
56
+
57
+ # activation checkpointing
58
+ "checkpoint_activations": true,
59
+ "checkpoint_num_layers": 1,
60
+ "partition_activations": true,
61
+ "synchronize_each_layer": true,
62
+
63
+ # regularization
64
+ "gradient_clipping": 1.0,
65
+ "weight_decay": 0.1,
66
+ "hidden_dropout": 0.0,
67
+ "attention_dropout": 0.0,
68
+
69
+ # precision settings
70
+ "fp16": {
71
+ "enabled": true,
72
+ "loss_scale": 0,
73
+ "loss_scale_window": 1000,
74
+ "hysteresis": 2,
75
+ "min_loss_scale": 1
76
+ },
77
+
78
+ # misc. training settings
79
+ "train_iters": 320000,
80
+ "lr_decay_iters": 320000,
81
+ "distributed_backend": "nccl",
82
+ "lr_decay_style": "cosine",
83
+ "warmup": 0.01,
84
+ "checkpoint_factor": 10000,
85
+ "eval_interval": 1000,
86
+ "eval_iters": 10,
87
+
88
+ # logging
89
+ "log_interval": 100,
90
+ "steps_per_print": 10,
91
+ "keep_last_n_checkpoints": 4,
92
+ "wall_clock_breakdown": true,
93
+
94
+ # networking
95
+ "hostfile": "/mock_path"
96
+ }
configs/13B.yml ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GPT-2 pretraining setup
2
+ {
3
+ # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4
+ # across the node boundaries )
5
+ "pipe_parallel_size": 1,
6
+ "model_parallel_size": 1,
7
+
8
+ # model settings
9
+ "num_layers": 40,
10
+ "hidden_size": 5120,
11
+ "num_attention_heads": 40,
12
+ "seq_length": 2048,
13
+ "max_position_embeddings": 2048,
14
+ "norm": "layernorm",
15
+ "pos_emb": "rotary",
16
+ "no_weight_tying": true,
17
+ "gpt_j_residual": false,
18
+ "output_layer_parallelism": "column",
19
+
20
+ # these should provide some speedup but take a while to build; set to true if desired
21
+ "scaled_upper_triang_masked_softmax_fusion": false,
22
+ "bias_gelu_fusion": false,
23
+ "rope_fusion": false,
24
+ "layernorm_fusion": false,
25
+
26
+ # init methods
27
+ "init_method": "small_init",
28
+ "output_layer_init_method": "wang_init",
29
+
30
+
31
+ # optimizer settings
32
+ "optimizer": {
33
+ "type": "Adam",
34
+ "params": {
35
+ "lr": 0.0001,
36
+ "betas": [0.9, 0.95],
37
+ "eps": 1.0e-8,
38
+ }
39
+ },
40
+
41
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
42
+ "zero_optimization": {
43
+ "stage": 1,
44
+ "allgather_partitions": True,
45
+ "allgather_bucket_size": 500000000,
46
+ "overlap_comm": True,
47
+ "reduce_scatter": True,
48
+ "reduce_bucket_size": 500000000,
49
+ "contiguous_gradients": True,
50
+ },
51
+ "min_lr": 0.00001,
52
+
53
+ # batch / data settings
54
+ "train_micro_batch_size_per_gpu": 4,
55
+ "data_impl": "mmap",
56
+
57
+ # activation checkpointing
58
+ "checkpoint_activations": true,
59
+ "checkpoint_num_layers": 1,
60
+ "partition_activations": true,
61
+ "synchronize_each_layer": true,
62
+
63
+ # regularization
64
+ "gradient_clipping": 1.0,
65
+ "weight_decay": 0.1,
66
+ "hidden_dropout": 0,
67
+ "attention_dropout": 0,
68
+
69
+ # precision settings
70
+ "fp16": {
71
+ "fp16": true,
72
+ "enabled": true,
73
+ "loss_scale": 0,
74
+ "loss_scale_window": 1000,
75
+ "hysteresis": 2,
76
+ "min_loss_scale": 1
77
+ },
78
+
79
+ # misc. training settings
80
+ "train_iters": 320000,
81
+ "lr_decay_iters": 320000,
82
+ "distributed_backend": "nccl",
83
+ "lr_decay_style": "cosine",
84
+ "warmup": 0.01,
85
+ "checkpoint_factor": 10000,
86
+ "eval_interval": 1000,
87
+ "eval_iters": 10,
88
+
89
+ # logging
90
+ "log_interval": 100,
91
+ "steps_per_print": 10,
92
+ "keep_last_n_checkpoints": 4,
93
+ "wall_clock_breakdown": true,
94
+ }
configs/175B.yml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GPT-2 pretraining setup
2
+ {
3
+ # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4
+ # across the node boundaries )
5
+ "pipe_parallel_size": 1,
6
+ "model_parallel_size": 1,
7
+
8
+ # model settings
9
+ "num_layers": 96,
10
+ "hidden_size": 12288,
11
+ "num_attention_heads": 96,
12
+ "seq_length": 2048,
13
+ "max_position_embeddings": 2048,
14
+ "norm": "layernorm",
15
+ "pos_emb": "rotary",
16
+ "no_weight_tying": true,
17
+ "gpt_j_residual": false,
18
+ "output_layer_parallelism": "column",
19
+
20
+ # these should provide some speedup but take a while to build; set to true if desired
21
+ "scaled_upper_triang_masked_softmax_fusion": false,
22
+ "bias_gelu_fusion": false,
23
+ "rope_fusion": false,
24
+ "layernorm_fusion": false,
25
+
26
+ # init methods
27
+ "init_method": "small_init",
28
+ "output_layer_init_method": "wang_init",
29
+
30
+ # optimizer settings
31
+ "optimizer": {
32
+ "type": "Adam",
33
+ "params": {
34
+ "lr": 0.00006,
35
+ "betas": [0.9, 0.95],
36
+ "eps": 1.0e-8,
37
+ }
38
+ },
39
+ "min_lr": 0.000006,
40
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
41
+ "zero_optimization": {
42
+ "stage": 1,
43
+ "allgather_partitions": True,
44
+ "allgather_bucket_size": 500000000,
45
+ "overlap_comm": True,
46
+ "reduce_scatter": True,
47
+ "reduce_bucket_size": 500000000,
48
+ "contiguous_gradients": True,
49
+ },
50
+
51
+ # batch / data settings
52
+ "train_micro_batch_size_per_gpu": 4,
53
+ "data_impl": "mmap",
54
+
55
+ # activation checkpointing
56
+ "checkpoint_activations": true,
57
+ "checkpoint_num_layers": 1,
58
+ "partition_activations": true,
59
+ "synchronize_each_layer": true,
60
+
61
+ # regularization
62
+ "gradient_clipping": 1.0,
63
+ "weight_decay": 0.1,
64
+ "hidden_dropout": 0,
65
+ "attention_dropout": 0,
66
+
67
+ # precision settings
68
+ "fp16": {
69
+ "fp16": true,
70
+ "enabled": true,
71
+ "loss_scale": 0,
72
+ "loss_scale_window": 1000,
73
+ "hysteresis": 2,
74
+ "min_loss_scale": 1
75
+ },
76
+
77
+ # misc. training settings
78
+ "train_iters": 320000,
79
+ "lr_decay_iters": 320000,
80
+ "distributed_backend": "nccl",
81
+ "lr_decay_style": "cosine",
82
+ "warmup": 0.01,
83
+ "checkpoint_factor": 10000,
84
+ "eval_interval": 1000,
85
+ "eval_iters": 10,
86
+
87
+ # logging
88
+ "log_interval": 100,
89
+ "steps_per_print": 10,
90
+ "keep_last_n_checkpoints": 4,
91
+ "wall_clock_breakdown": true,
92
+ }
configs/19M.yml ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "pipe_parallel_size": 1,
3
+ "model_parallel_size": 1,
4
+
5
+ # model settings
6
+ "num_layers": 6,
7
+ "hidden_size": 512,
8
+ "num_attention_heads": 8,
9
+ "seq_length": 2048,
10
+ "max_position_embeddings": 2048,
11
+ "pos_emb": "rotary",
12
+ "no_weight_tying": true,
13
+ "gpt_j_residual": false,
14
+ "output_layer_parallelism": "column",
15
+
16
+ "scaled_upper_triang_masked_softmax_fusion": false,
17
+ "bias_gelu_fusion": false,
18
+ "rope_fusion": false,
19
+ "layernorm_fusion": false,
20
+
21
+ # init methods
22
+ "init_method": "small_init",
23
+ "output_layer_init_method": "wang_init",
24
+
25
+ "optimizer": {
26
+ "type": "Adam",
27
+ "params": {
28
+ "lr": 0.001,
29
+ "betas": [0.9, 0.95],
30
+ "eps": 1.0e-8,
31
+ }
32
+ },
33
+ "min_lr": 0.0001,
34
+
35
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
36
+ "zero_optimization": {
37
+ "stage": 1,
38
+ "allgather_partitions": True,
39
+ "allgather_bucket_size": 500000000,
40
+ "overlap_comm": True,
41
+ "reduce_scatter": True,
42
+ "reduce_bucket_size": 500000000,
43
+ "contiguous_gradients": True,
44
+ },
45
+
46
+ "train_micro_batch_size_per_gpu": 4, #32,
47
+ "gradient_accumulation_steps": 1,
48
+ "data_impl": "mmap",
49
+ "num_workers": 1,
50
+
51
+ # activation checkpointing
52
+ "checkpoint_activations": true,
53
+ "checkpoint_num_layers": 1,
54
+ "partition_activations": true,
55
+ "synchronize_each_layer": true,
56
+
57
+ # regularization
58
+ "gradient_clipping": 1.0,
59
+ "weight_decay": 0.1,
60
+ "hidden_dropout": 0,
61
+ "attention_dropout": 0,
62
+
63
+ # precision settings
64
+ "fp16": {
65
+ "fp16": true,
66
+ "enabled": true,
67
+ "loss_scale": 0,
68
+ "loss_scale_window": 1000,
69
+ "initial_scale_power": 12,
70
+ "hysteresis": 2,
71
+ "min_loss_scale": 1,
72
+ },
73
+
74
+ "train_iters": 143000,
75
+ "lr_decay_iters": 143000,
76
+ "distributed_backend": "nccl",
77
+ "lr_decay_style": "cosine",
78
+ "warmup": 0.01,
79
+ "checkpoint_factor": 1000,
80
+ "eval_interval": 100000,
81
+ "eval_iters": 10,
82
+
83
+ "log_interval": 10,
84
+ "steps_per_print": 10,
85
+ "wall_clock_breakdown": true,
86
+
87
+ # additional deepspeed args not specified above
88
+ "deepspeed_extra_args": {
89
+ "comms_logger": {
90
+ "enabled": true,
91
+ "verbose": true,
92
+ "prof_all": true,
93
+ "debug": false
94
+ },
95
+ }
96
+
97
+ }
configs/2-7B.yml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GPT-2 pretraining setup
2
+ {
3
+ # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4
+ # across the node boundaries )
5
+ "pipe_parallel_size": 1,
6
+ "model_parallel_size": 1,
7
+
8
+ # model settings
9
+ "num_layers": 32,
10
+ "hidden_size": 2560,
11
+ "num_attention_heads": 32,
12
+ "seq_length": 2048,
13
+ "max_position_embeddings": 2048,
14
+ "norm": "layernorm",
15
+ "pos_emb": "rotary",
16
+ "no_weight_tying": true,
17
+ "gpt_j_residual": false,
18
+ "output_layer_parallelism": "column",
19
+
20
+ # these should provide some speedup but take a while to build; set to true if desired
21
+ "scaled_upper_triang_masked_softmax_fusion": false,
22
+ "bias_gelu_fusion": false,
23
+ "rope_fusion": false,
24
+ "layernorm_fusion": false,
25
+
26
+ # init methods
27
+ "init_method": "small_init",
28
+ "output_layer_init_method": "wang_init",
29
+
30
+ # optimizer settings
31
+ "optimizer": {
32
+ "type": "Adam",
33
+ "params": {
34
+ "lr": 0.00016,
35
+ "betas": [0.9, 0.95],
36
+ "eps": 1.0e-8,
37
+ }
38
+ },
39
+ "min_lr": 0.000016,
40
+
41
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
42
+ "zero_optimization": {
43
+ "stage": 1,
44
+ "allgather_partitions": True,
45
+ "allgather_bucket_size": 500000000,
46
+ "overlap_comm": True,
47
+ "reduce_scatter": True,
48
+ "reduce_bucket_size": 500000000,
49
+ "contiguous_gradients": True,
50
+ },
51
+
52
+ # batch / data settings
53
+ "train_micro_batch_size_per_gpu": 4,
54
+ "data_impl": "mmap",
55
+
56
+ # activation checkpointing
57
+ "checkpoint_activations": true,
58
+ "checkpoint_num_layers": 1,
59
+ "partition_activations": true,
60
+ "synchronize_each_layer": true,
61
+
62
+ # regularization
63
+ "gradient_clipping": 1.0,
64
+ "weight_decay": 0.1,
65
+ "hidden_dropout": 0,
66
+ "attention_dropout": 0,
67
+
68
+ # precision settings
69
+ "fp16": {
70
+ "fp16": true,
71
+ "enabled": true,
72
+ "loss_scale": 0,
73
+ "loss_scale_window": 1000,
74
+ "hysteresis": 2,
75
+ "min_loss_scale": 1
76
+ },
77
+
78
+ # misc. training settings
79
+ "train_iters": 320000,
80
+ "lr_decay_iters": 320000,
81
+ "distributed_backend": "nccl",
82
+ "lr_decay_style": "cosine",
83
+ "warmup": 0.01,
84
+ "checkpoint_factor": 10000,
85
+ "eval_interval": 1000,
86
+ "eval_iters": 10,
87
+
88
+ # logging
89
+ "log_interval": 100,
90
+ "steps_per_print": 10,
91
+ "keep_last_n_checkpoints": 4,
92
+ "wall_clock_breakdown": true,
93
+ }
configs/20B.yml ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DISCLAIMER: This is the configuration file for the GPT-NeoX-20B model as it was trained on 96x 40GB A100
2
+ # GPUs. Depending on your system configuration, you may need to change some parameters in order to fit
3
+ # the model in memory.
4
+
5
+ {
6
+ # Tokenizer / checkpoint settings - you will need to change these to the location you have them saved in
7
+ "vocab_file": "./20B_checkpoints/20B_tokenizer.json",
8
+ "save": "./20B_checkpoints",
9
+ "load": "./20B_checkpoints",
10
+
11
+ # If finetuning, edit the following to the location of your finetuning dataset:
12
+ "data_path": "./data/pile_20B_tokenizer/pile_20B_tokenizer_text_document",
13
+
14
+ # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
15
+ # across the node boundaries )
16
+ "pipe_parallel_size": 4,
17
+ "model_parallel_size": 2,
18
+
19
+ # model settings
20
+ "num_layers": 44,
21
+ "hidden_size": 6144,
22
+ "num_attention_heads": 64,
23
+ "seq_length": 2048,
24
+ "max_position_embeddings": 2048,
25
+ "norm": "layernorm",
26
+ "pos_emb": "rotary",
27
+ "rotary_pct": 0.25,
28
+ "no_weight_tying": true,
29
+ "gpt_j_residual": true,
30
+ "output_layer_parallelism": "column",
31
+ "scaled_upper_triang_masked_softmax_fusion": true,
32
+ "bias_gelu_fusion": true,
33
+ "rope_fusion": false,
34
+ "layernorm_fusion": false,
35
+
36
+ # init methods
37
+ "init_method": "small_init",
38
+ "output_layer_init_method": "wang_init",
39
+
40
+ # optimizer settings
41
+ "optimizer": {
42
+ "type": "Adam",
43
+ "params": {
44
+ "lr": 0.97e-4,
45
+ "betas": [0.9, 0.95],
46
+ "eps": 1.0e-8,
47
+ }
48
+ },
49
+
50
+ "min_lr": 0.97e-5,
51
+
52
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
53
+ "zero_optimization": {
54
+ "stage": 1,
55
+ "allgather_partitions": True,
56
+ "allgather_bucket_size": 1260000000,
57
+ "overlap_comm": True,
58
+ "reduce_scatter": True,
59
+ "reduce_bucket_size": 1260000000,
60
+ "contiguous_gradients": True,
61
+ },
62
+
63
+ # batch / data settings (assuming 96 GPUs)
64
+ "train_micro_batch_size_per_gpu": 4,
65
+ "gradient_accumulation_steps": 32,
66
+ "data_impl": "mmap",
67
+ "split": "995,4,1",
68
+
69
+ # activation checkpointing
70
+ "checkpoint_activations": true,
71
+ "checkpoint_num_layers": 1,
72
+ "partition_activations": false,
73
+ "synchronize_each_layer": true,
74
+
75
+ # regularization
76
+ "gradient_clipping": 1.0,
77
+ "weight_decay": 0.01,
78
+ "hidden_dropout": 0,
79
+ "attention_dropout": 0,
80
+
81
+ # precision settings
82
+ "fp16": {
83
+ "fp16": true,
84
+ "enabled": true,
85
+ "loss_scale": 0,
86
+ "loss_scale_window": 1000,
87
+ "initial_scale_power": 12,
88
+ "hysteresis": 2,
89
+ "min_loss_scale": 1
90
+ },
91
+
92
+ # misc. training settings
93
+ "train_iters": 150000,
94
+ "lr_decay_iters": 150000,
95
+
96
+ "distributed_backend": "nccl",
97
+ "lr_decay_style": "cosine",
98
+ "warmup": 0.01,
99
+ "checkpoint_factor": 500, # this variable previously called `save-interval`
100
+ "eval_interval": 1000,
101
+ "eval_iters": 10,
102
+
103
+ # logging
104
+ "log_interval": 2,
105
+ "steps_per_print": 2,
106
+ "wall_clock_breakdown": false,
107
+
108
+ ### NEW DATA: ####
109
+ "tokenizer_type": "HFTokenizer",
110
+ "tensorboard-dir": "./tensorboard",
111
+ "log_dir": "./logs",
112
+
113
+ }
configs/350M.yml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GPT-2 pretraining setup
2
+ {
3
+ # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4
+ # across the node boundaries )
5
+ "pipe_parallel_size": 1,
6
+ "model_parallel_size": 1,
7
+
8
+ # model settings
9
+ "num_layers": 24,
10
+ "hidden_size": 1024,
11
+ "num_attention_heads": 16,
12
+ "seq_length": 2048,
13
+ "max_position_embeddings": 2048,
14
+ "norm": "layernorm",
15
+ "pos_emb": "rotary",
16
+ "no_weight_tying": true,
17
+ "gpt_j_residual": false,
18
+ "output_layer_parallelism": "column",
19
+
20
+ # these should provide some speedup but take a while to build; set to true if desired
21
+ "scaled_upper_triang_masked_softmax_fusion": false,
22
+ "bias_gelu_fusion": false,
23
+ "rope_fusion": false,
24
+ "layernorm_fusion": false,
25
+
26
+ # init methods
27
+ "init_method": "small_init",
28
+ "output_layer_init_method": "wang_init",
29
+
30
+ # optimizer settings
31
+ "optimizer": {
32
+ "type": "Adam",
33
+ "params": {
34
+ "lr": 0.0003,
35
+ "betas": [0.9, 0.95],
36
+ "eps": 1.0e-8,
37
+ }
38
+ },
39
+ "min_lr": 0.00003,
40
+
41
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
42
+ "zero_optimization": {
43
+ "stage": 1,
44
+ "allgather_partitions": True,
45
+ "allgather_bucket_size": 500000000,
46
+ "overlap_comm": True,
47
+ "reduce_scatter": True,
48
+ "reduce_bucket_size": 500000000,
49
+ "contiguous_gradients": True,
50
+ },
51
+ # batch / data settings
52
+ "train_micro_batch_size_per_gpu": 4,
53
+ "data_impl": "mmap",
54
+
55
+ # activation checkpointing
56
+ "checkpoint_activations": true,
57
+ "checkpoint_num_layers": 1,
58
+ "partition_activations": true,
59
+ "synchronize_each_layer": true,
60
+
61
+ # regularization
62
+ "gradient_clipping": 1.0,
63
+ "weight_decay": 0.1,
64
+ "hidden_dropout": 0,
65
+ "attention_dropout": 0,
66
+
67
+ # precision settings
68
+ "fp16": {
69
+ "fp16": true,
70
+ "enabled": true,
71
+ "loss_scale": 0,
72
+ "loss_scale_window": 1000,
73
+ "hysteresis": 2,
74
+ "min_loss_scale": 1
75
+ },
76
+
77
+ # misc. training settings
78
+ "train_iters": 320000,
79
+ "lr_decay_iters": 320000,
80
+ "distributed_backend": "nccl",
81
+ "lr_decay_style": "cosine",
82
+ "warmup": 0.01,
83
+ "checkpoint_factor": 10000,
84
+ "eval_interval": 1000,
85
+ "eval_iters": 10,
86
+
87
+ # logging
88
+ "log_interval": 100,
89
+ "steps_per_print": 10,
90
+ "keep_last_n_checkpoints": 4,
91
+ "wall_clock_breakdown": true,
92
+ }
configs/49M.yml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ # parallelism settings
3
+ "pipe_parallel_size": 1,
4
+ "model_parallel_size": 1,
5
+
6
+ # model settings
7
+ "num_layers": 10,
8
+ "hidden_size": 640,
9
+ "num_attention_heads": 10,
10
+ "seq_length": 2048,
11
+ "max_position_embeddings": 2048,
12
+ "pos_emb": "rotary",
13
+ "rotary_pct": 0.25,
14
+ "no_weight_tying": true,
15
+ "gpt_j_residual": true,
16
+ "output_layer_parallelism": "column",
17
+
18
+ # these should provide some speedup but take a while to build; set to true if desired
19
+ "scaled_upper_triang_masked_softmax_fusion": false,
20
+ "bias_gelu_fusion": false,
21
+ "rope_fusion": false,
22
+ "layernorm_fusion": false,
23
+
24
+ # init methods
25
+ "init_method": "small_init",
26
+ "output_layer_init_method": "wang_init",
27
+
28
+ # optimizer settings
29
+ "optimizer": {
30
+ "type": "Adam",
31
+ "params": {
32
+ "lr": 0.0008,
33
+ "betas": [0.9, 0.95],
34
+ "eps": 1.0e-8,
35
+ }
36
+ },
37
+ "min_lr": 0.00008,
38
+
39
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
40
+ "zero_optimization": {
41
+ "stage": 1,
42
+ "allgather_partitions": True,
43
+ "allgather_bucket_size": 500000000,
44
+ "overlap_comm": True,
45
+ "reduce_scatter": True,
46
+ "reduce_bucket_size": 500000000,
47
+ "contiguous_gradients": True,
48
+ },
49
+
50
+ # batch / data settings
51
+ "train_micro_batch_size_per_gpu": 32,
52
+ "gradient_accumulation_steps": 1,
53
+ "data_impl": "mmap",
54
+ "num_workers": 1,
55
+
56
+ # activation checkpointing
57
+ "checkpoint_activations": true,
58
+ "checkpoint_num_layers": 1,
59
+ "partition_activations": true,
60
+ "synchronize_each_layer": true,
61
+
62
+ # regularization
63
+ "gradient_clipping": 1.0,
64
+ "weight_decay": 0.1,
65
+ "hidden_dropout": 0,
66
+ "attention_dropout": 0,
67
+
68
+ # precision settings
69
+ "fp16": {
70
+ "fp16": true,
71
+ "enabled": true,
72
+ "loss_scale": 0,
73
+ "loss_scale_window": 1000,
74
+ "initial_scale_power": 12,
75
+ "hysteresis": 2,
76
+ "min_loss_scale": 1,
77
+ },
78
+
79
+ # misc. training settings
80
+ "train_iters": 143000,
81
+ "lr_decay_iters": 143000,
82
+ "distributed_backend": "nccl",
83
+ "lr_decay_style": "cosine",
84
+ "warmup": 0.01,
85
+ "checkpoint_factor": 1000,
86
+ "eval_interval": 100000,
87
+ "eval_iters": 10,
88
+
89
+ # logging
90
+ "log_interval": 10,
91
+ "steps_per_print": 10,
92
+ "wall_clock_breakdown": true,
93
+ }
configs/6-7B.yml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GPT-2 pretraining setup
2
+ {
3
+ # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4
+ # across the node boundaries )
5
+ "pipe_parallel_size": 1,
6
+ "model_parallel_size": 1,
7
+
8
+ # model settings
9
+ "num_layers": 32,
10
+ "hidden_size": 4096,
11
+ "num_attention_heads": 32,
12
+ "seq_length": 2048,
13
+ "max_position_embeddings": 2048,
14
+ "norm": "layernorm",
15
+ "pos_emb": "rotary",
16
+ "no_weight_tying": true,
17
+ "gpt_j_residual": false,
18
+ "output_layer_parallelism": "column",
19
+
20
+ # these should provide some speedup but take a while to build; set to true if desired
21
+ "scaled_upper_triang_masked_softmax_fusion": false,
22
+ "bias_gelu_fusion": false,
23
+ "rope_fusion": false,
24
+ "layernorm_fusion": false,
25
+
26
+ # init methods
27
+ "init_method": "small_init",
28
+ "output_layer_init_method": "wang_init",
29
+
30
+ # optimizer settings
31
+ "optimizer": {
32
+ "type": "Adam",
33
+ "params": {
34
+ "lr": 0.00012,
35
+ "betas": [0.9, 0.95],
36
+ "eps": 1.0e-8,
37
+ }
38
+ },
39
+
40
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
41
+ "zero_optimization": {
42
+ "stage": 1,
43
+ "allgather_partitions": True,
44
+ "allgather_bucket_size": 500000000,
45
+ "overlap_comm": True,
46
+ "reduce_scatter": True,
47
+ "reduce_bucket_size": 500000000,
48
+ "contiguous_gradients": True,
49
+ },
50
+ "min_lr": 0.000012,
51
+
52
+ # batch / data settings
53
+ "train_micro_batch_size_per_gpu": 4,
54
+ "data_impl": "mmap",
55
+
56
+ # activation checkpointing
57
+ "checkpoint_activations": true,
58
+ "checkpoint_num_layers": 1,
59
+ "partition_activations": true,
60
+ "synchronize_each_layer": true,
61
+
62
+ # regularization
63
+ "gradient_clipping": 1.0,
64
+ "weight_decay": 0.1,
65
+ "hidden_dropout": 0,
66
+ "attention_dropout": 0,
67
+
68
+ # precision settings
69
+ "fp16": {
70
+ "fp16": true,
71
+ "enabled": true,
72
+ "loss_scale": 0,
73
+ "loss_scale_window": 1000,
74
+ "hysteresis": 2,
75
+ "min_loss_scale": 1
76
+ },
77
+
78
+ # misc. training settings
79
+ "train_iters": 320000,
80
+ "lr_decay_iters": 320000,
81
+ "distributed_backend": "nccl",
82
+ "lr_decay_style": "cosine",
83
+ "warmup": 0.01,
84
+ "checkpoint_factor": 10000,
85
+ "eval_interval": 1000,
86
+ "eval_iters": 10,
87
+
88
+ # logging
89
+ "log_interval": 100,
90
+ "steps_per_print": 10,
91
+ "keep_last_n_checkpoints": 4,
92
+ "wall_clock_breakdown": true,
93
+ }
configs/760M.yml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GPT-2 pretraining setup
2
+ {
3
+ # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4
+ # across the node boundaries )
5
+ "pipe_parallel_size": 1,
6
+ "model_parallel_size": 1,
7
+
8
+ # model settings
9
+ "num_layers": 24,
10
+ "hidden_size": 1536,
11
+ "num_attention_heads": 16,
12
+ "seq_length": 2048,
13
+ "max_position_embeddings": 2048,
14
+ "norm": "layernorm",
15
+ "pos_emb": "rotary",
16
+ "no_weight_tying": true,
17
+ "gpt_j_residual": false,
18
+ "output_layer_parallelism": "column",
19
+
20
+ # these should provide some speedup but take a while to build; set to true if desired
21
+ "scaled_upper_triang_masked_softmax_fusion": false,
22
+ "bias_gelu_fusion": false,
23
+ "rope_fusion": false,
24
+ "layernorm_fusion": false,
25
+
26
+ # init methods
27
+ "init_method": "small_init",
28
+ "output_layer_init_method": "wang_init",
29
+
30
+ # optimizer settings
31
+ "optimizer": {
32
+ "type": "Adam",
33
+ "params": {
34
+ "lr": 0.00025,
35
+ "betas": [0.9, 0.999],
36
+ "eps": 1.0e-8,
37
+ }
38
+ },
39
+ "min_lr": 0.000025,
40
+
41
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
42
+ "zero_optimization": {
43
+ "stage": 1,
44
+ "allgather_partitions": True,
45
+ "allgather_bucket_size": 500000000,
46
+ "overlap_comm": True,
47
+ "reduce_scatter": True,
48
+ "reduce_bucket_size": 500000000,
49
+ "contiguous_gradients": True,
50
+ },
51
+
52
+ # batch / data settings
53
+ "train_micro_batch_size_per_gpu": 4,
54
+ "data_impl": "mmap",
55
+
56
+ # activation checkpointing
57
+ "checkpoint_activations": true,
58
+ "checkpoint_num_layers": 1,
59
+ "partition_activations": true,
60
+ "synchronize_each_layer": true,
61
+
62
+ # regularization
63
+ "gradient_clipping": 1.0,
64
+ "weight_decay": 0.1,
65
+ "hidden_dropout": 0,
66
+ "attention_dropout": 0,
67
+
68
+ # precision settings
69
+ "fp16": {
70
+ "fp16": true,
71
+ "enabled": true,
72
+ "loss_scale": 0,
73
+ "loss_scale_window": 1000,
74
+ "hysteresis": 2,
75
+ "min_loss_scale": 1
76
+ },
77
+
78
+ # misc. training settings
79
+ "train_iters": 320000,
80
+ "lr_decay_iters": 320000,
81
+ "distributed_backend": "nccl",
82
+ "lr_decay_style": "cosine",
83
+ "warmup": 0.01,
84
+ "checkpoint_factor": 10000,
85
+ "eval_interval": 1000,
86
+ "eval_iters": 10,
87
+
88
+ # logging
89
+ "log_interval": 100,
90
+ "steps_per_print": 10,
91
+ "keep_last_n_checkpoints": 4,
92
+ "wall_clock_breakdown": true,
93
+ }
configs/800M.yml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "pipe_parallel_size": 1,
3
+ "model_parallel_size": 1,
4
+
5
+ # model settings
6
+ "num_layers": 16,
7
+ "hidden_size": 2048,
8
+ "num_attention_heads": 8,
9
+ "seq_length": 2048,
10
+ "max_position_embeddings": 2048,
11
+ "pos_emb": "rotary",
12
+ "no_weight_tying": true,
13
+ "gpt_j_residual": false,
14
+ "output_layer_parallelism": "column",
15
+
16
+ "scaled_upper_triang_masked_softmax_fusion": false,
17
+ "bias_gelu_fusion": false,
18
+ "rope_fusion": false,
19
+ "layernorm_fusion": false,
20
+
21
+ # init methods
22
+ "init_method": "small_init",
23
+ "output_layer_init_method": "wang_init",
24
+
25
+ "optimizer": {
26
+ "type": "Adam",
27
+ "params": {
28
+ "lr": 0.00025,
29
+ "betas": [0.9, 0.95],
30
+ "eps": 1.0e-8,
31
+ }
32
+ },
33
+ "min_lr": 0.000025,
34
+
35
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
36
+ "zero_optimization": {
37
+ "stage": 1,
38
+ "allgather_partitions": True,
39
+ "allgather_bucket_size": 500000000,
40
+ "overlap_comm": True,
41
+ "reduce_scatter": True,
42
+ "reduce_bucket_size": 500000000,
43
+ "contiguous_gradients": True,
44
+ },
45
+
46
+ "train_micro_batch_size_per_gpu": 16,
47
+ "gradient_accumulation_steps": 1,
48
+ "data_impl": "mmap",
49
+ "num_workers": 1,
50
+
51
+ # activation checkpointing
52
+ "checkpoint_activations": true,
53
+ "checkpoint_num_layers": 1,
54
+ "partition_activations": true,
55
+ "synchronize_each_layer": true,
56
+
57
+ # regularization
58
+ "gradient_clipping": 1.0,
59
+ "weight_decay": 0.1,
60
+ "hidden_dropout": 0,
61
+ "attention_dropout": 0,
62
+
63
+ # precision settings
64
+ "fp16": {
65
+ "fp16": true,
66
+ "enabled": true,
67
+ "loss_scale": 0,
68
+ "loss_scale_window": 1000,
69
+ "initial_scale_power": 12,
70
+ "hysteresis": 2,
71
+ "min_loss_scale": 1,
72
+ },
73
+
74
+ "train_iters": 143000,
75
+ "lr_decay_iters": 143000,
76
+ "distributed_backend": "nccl",
77
+ "lr_decay_style": "cosine",
78
+ "warmup": 0.01,
79
+ "checkpoint_factor": 1000,
80
+ "eval_interval": 40000,
81
+ "eval_iters": 10,
82
+
83
+ "log_interval": 10,
84
+ "steps_per_print": 10,
85
+ "wall_clock_breakdown": true,
86
+ }
configs/README.md ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration and parameters
2
+
3
+ GPT-NeoX parameters are defined in a YAML configuration file which is passed to the `deepy.py` launcher - for examples see the files contained in this folder.
4
+ Parameters originate from either the [DeepSpeed runner CLI (DSL)](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/launcher/runner.py#L33), [DeepSpeed configuration file (DSC)](https://www.deepspeed.ai/docs/config-json/), [Megatron-LM CLI (Meg)](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/arguments.py#L224) or are GPT-NeoX (NeoX) modifications.
5
+
6
+ ## Example Configuration (GPT3 Small):
7
+
8
+ Below is an example configuration `.yaml` to train a ~160M parameter GPT model. This readme will go through each section in the configuration and the options available.
9
+
10
+ For a detailed list of all the arguments available for neox, see [neox_arguments.md](neox_arguments.md)
11
+
12
+ Note: yaml arguments may be formatted with either '-' or '\_'. The standard separator used is a '\_' as shown in the example configurations below. However, the use of '-' as a separator may be deprecated in the future.
13
+ ```yaml
14
+ # GPT-3 pretraining setup
15
+ {
16
+ # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
17
+ # across the node boundaries )
18
+ "pipe_parallel_size": 1,
19
+ "model_parallel_size": 1,
20
+
21
+ # model settings
22
+ "num_layers": 12,
23
+ "hidden_size": 768,
24
+ "num_attention_heads": 12,
25
+ "seq_length": 2048,
26
+ "max_position_embeddings": 2048,
27
+ "norm": "rmsnorm",
28
+ "pos_emb": "none",
29
+ "no_weight_tying": true,
30
+ # this should provide some speedup but takes a while to build, set to true if desired
31
+ "scaled_upper_triang_masked_softmax_fusion": false,
32
+ "train_iters": 320000,
33
+
34
+ # optimizer settings
35
+ "optimizer": {
36
+ "type": "Adam",
37
+ "params": {
38
+ "lr": 0.0006,
39
+ "max_grad_norm": 1.0,
40
+ "betas": [0.9, 0.95]
41
+ }
42
+ },
43
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
44
+ "zero_optimization": {
45
+ "stage": 0,
46
+ "allgather_partitions": True,
47
+ "allgather_bucket_size": 500000000,
48
+ "overlap_comm": True,
49
+ "reduce_scatter": True,
50
+ "reduce_bucket_size": 500000000,
51
+ "contiguous_gradients": True,
52
+ },
53
+
54
+ # batch / data settings
55
+ "train_micro_batch_size_per_gpu": 4,
56
+ "gradient_accumulation_steps": 1,
57
+ "data_impl": "mmap",
58
+ "split": "949,50,1",
59
+
60
+ # activation checkpointing
61
+ "checkpoint_activations": true,
62
+ "checkpoint_num_layers": 1,
63
+ "partition_activations": true,
64
+ "synchronize_each_layer": true,
65
+
66
+ # regularization
67
+ "gradient_clipping": 1.0,
68
+ "weight_decay": 0,
69
+ "hidden_dropout": 0,
70
+ "attention_dropout": 0,
71
+
72
+ # precision settings
73
+ "fp16": {
74
+ "enabled": true,
75
+ "loss_scale": 0,
76
+ "loss_scale_window": 1000,
77
+ "hysteresis": 2,
78
+ "min_loss_scale": 1
79
+ },
80
+
81
+ # lr decay settings
82
+ "lr_decay_iters": 320000,
83
+ "lr_decay_style": "cosine",
84
+ "warmup": 0.01,
85
+
86
+ # misc. training settings
87
+ "distributed_backend": "nccl",
88
+ "checkpoint_factor": 10000,
89
+ "eval_interval": 1000,
90
+ "eval_iters": 10,
91
+
92
+ # logging
93
+ "log_interval": 100,
94
+ "steps_per_print": 10,
95
+ "keep_last_n_checkpoints": 4,
96
+ "wall_clock_breakdown": true,
97
+ }
98
+ ```
99
+
100
+ ### Parallelism Settings:
101
+
102
+ The parallelism settings are left at 1 in all configs, as the settings you want will be highly dependent on your compute setup and network topology.
103
+ We have found it best to do model parallelism within a node, and schedule pipeline stages across node boundaries.
104
+
105
+ ```yaml
106
+ "pipe_parallel_size": 1,
107
+ "model_parallel_size": 1,
108
+ ```
109
+
110
+ These can be set to any integer between `0` and `num_gpus`, and `num_gpus` must be divisible by `pipe_parallel_size` * `model_parallel_size`.
111
+
112
+
113
+ ### Model Settings:
114
+ ```yaml
115
+ # model settings
116
+ "num_layers": 12,
117
+ "hidden_size": 768,
118
+ "num_attention_heads": 12,
119
+ "seq_length": 2048,
120
+ "max_position_embeddings": 2048,
121
+ "norm": "rmsnorm",
122
+ "pos_emb": "none",
123
+ "no_weight_tying": true,
124
+ # this should provide some speedup but takes a while to build, set to true if desired
125
+ "scaled_upper_triang_masked_softmax_fusion": false,
126
+ "train_iters": 320000,
127
+ # alternatively, use train_epochs to automatically determine the number of training iterations
128
+ #"train_epochs": 1,
129
+ ```
130
+ An example of some basic settings used to configure your model's architecture and number of training steps.
131
+
132
+ ### Optimizer Settings:
133
+
134
+ Our optimizer configuration has a similar syntax to deepspeed's. Different optimizers will have different arguments for "params".
135
+ Learning rate should be configured from here using the `"lr"` field of `optimizer["params"]`.
136
+
137
+ ```yaml
138
+ # optimizer settings
139
+ "optimizer": {
140
+ "type": "Adam",
141
+ "params": {
142
+ "lr": 0.0006,
143
+ "max_grad_norm": 1.0,
144
+ "betas": [0.9, 0.95]
145
+ }
146
+ }
147
+ ```
148
+ Available optimizer types are:
149
+
150
+ - `"Adam"`: regular Adam optimizer
151
+ - `"OneBitAdam"`: Deepspeed's [OneBitAdam optimizer](https://www.deepspeed.ai/docs/config-json/#optimizer-parameters). To use 1-bit adam, you'll also need to add the `freeze_step`, `cuda_aware`, and `comm_backend_name` fields, like so:
152
+ ```yaml
153
+ "optimizer": {
154
+ "type": "OneBitAdam",
155
+ "params": {
156
+ "lr": 0.0001,
157
+ "freeze_step": 23000,
158
+ "betas": [0.9, 0.95],
159
+ "cuda_aware": false,
160
+ "comm_backend_name": "nccl"
161
+ }
162
+ ```
163
+
164
+ - `"CPU_Adam"`/`"CPU_torch_adam"`: Adam optimizer on CPU. Either megatron's version ("CPU_Adam") or torch's ("CPU_torch_adam")
165
+ - `"SM3"`: SM3, the [Memory-Efficient Adaptive Optimization](https://arxiv.org/pdf/1901.11150.pdf) optimizer. We have found this doesn't work well with fp16 training.
166
+ - `"madgrad_wd"`: MADGRAD or [A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic
167
+ Optimization](https://arxiv.org/abs/2101.11075). Weight decay has been implemented AdamW-style instead of the original MADGRAD Adam-style.
168
+
169
+ ### ZeRO Optimization:
170
+
171
+ ```yaml
172
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
173
+ "zero_optimization": {
174
+ "stage": 0,
175
+ "allgather_partitions": True,
176
+ "allgather_bucket_size": 500000000,
177
+ "overlap_comm": True,
178
+ "reduce_scatter": True,
179
+ "reduce_bucket_size": 500000000,
180
+ "contiguous_gradients": True,
181
+ },
182
+ "zero_allow_untested_optimizer": false,
183
+
184
+ ```
185
+
186
+ ZeRO optimization in NeoX is currently configured identically to how deepspeed configures it, please see [the deepspeed docs](https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training) for more information.
187
+
188
+ If you want to combine an optimizer untested by DeepSpeed with ZeRO (i.e, not ADAM or LAMB), you must pass `"zero_allow_untested_optimizer": true` *outside* of the `"zero_optimization"` dictionary (see above).
189
+
190
+ N.B - ZeRO stages 2+ are incompatible with pipeline parallelism. Please set `"pipe-parallel-size"` to 0 if you want to use ZeRO stage 2 or more.
191
+
192
+ ### Batch Size Settings:
193
+
194
+ ```yaml
195
+ # batch / data settings
196
+ "train_micro_batch_size_per_gpu": 4,
197
+ "gradient_accumulation_steps": 1,
198
+ ```
199
+ Our global batch size configuration follows deepspeed's and can be configured in a number of ways. At least one of `"train_batch_size"` and `"train_micro_batch_size_per_gpu"` must be specified.
200
+ - `"train_batch_size"`: The effective training batch size. This is the amount of data samples that leads to one step of model update. train_batch_size is aggregated by the batch size that a single GPU processes in one forward/backward pass (a.k.a., train_step_batch_size), the gradient accumulation steps (a.k.a., gradient_accumulation_steps), and the number of GPUs.
201
+ - `"train_micro_batch_size_per_gpu"`: Batch size to be processed by one GPU in one step (without gradient accumulation). When specified, `gradient_accumulation_steps` is automatically calculated using train_batch_size and number of GPUs.
202
+ - `"gradient_accumulation_steps"`: Number of training steps to accumulate gradients before averaging and applying them. This feature is sometimes useful to improve scalability since it results in less frequent communication of gradients between steps. Another impact of this feature is the ability to train with larger batch sizes per GPU. When specified, train_step_batch_size is automatically calculated using train_batch_size and number of GPUs.
203
+
204
+ ### Extra DeepSpeed Settings
205
+
206
+ ```yaml
207
+ # additional deepspeed args not specified above
208
+ "deepspeed_extra_args": {
209
+ "comms_logger": {
210
+ "enabled": true,
211
+ "verbose": true,
212
+ "prof_all": true,
213
+ "debug": false
214
+ },
215
+ }
216
+ ```
217
+ Additional DeepSpeed settings besides those mentioned above should be wrapped in the `"deepspeed_extra_args"` argument, as in the example above. This functionality is designed to allow arguments not specified by existing dataclasses to be passed to DeepSpeed (e.g. when new functionalities are implemented). If any settings are duplicated here from elsewhere in the YAML, the system will throw an exception and notify the user.
218
+
219
+ ### Dataset / Tokenizer / Checkpoint / Logging Settings:
220
+
221
+ ```yaml
222
+ "data_impl": "mmap",
223
+ "split": "949,50,1",
224
+ # Suggested data paths when using GPT-NeoX locally
225
+ "data_path": "data/enwik8/enwik8_text_document",
226
+ #"train_data_path": "data/enwik8/enwik8_text_document",
227
+ #"test_data_path": "data/enwik8/enwik8_text_document",
228
+ #"valid_data_path": "data/enwik8/enwik8_text_document",
229
+ "vocab_file": "data/gpt2-vocab.json",
230
+ "merge_file": "data/gpt2-merges.txt",
231
+ "save": "checkpoints",
232
+ "load": "checkpoints",
233
+ "tensorboard_dir": "tensorboard",
234
+ "log_dir": "logs",
235
+ "checkpoint_factor": 10000,
236
+ "eval_interval": 1000,
237
+ "eval_iters": 10,
238
+ ```
239
+
240
+ For KTO style training, you'll need to add the reward & label data path, e.g.:
241
+
242
+ ```yaml
243
+ "data_impl": "mmap",
244
+ # Suggested data paths when using GPT-NeoX locally
245
+ "train_data_path": "data/enwik8/enwik8_text_document",
246
+ "train_label_data_path": "data/enwik8/enwik8_text_label_document",
247
+ "train_reward_data_path": "data/enwik8/enwik8_text_reward_document",
248
+ "test_data_path": "data/enwik8/enwik8_text_document",
249
+ "test_label_data_path": "data/enwik8/enwik8_text_label_document",
250
+ "test_reward_data_path": "data/enwik8/enwik8_text_reward_document",
251
+ "valid_data_path": "data/enwik8/enwik8_text_document",
252
+ "valid_label_data_path": "data/enwik8/enwik8_text_label_document",
253
+ "valid_reward_data_path": "data/enwik8/enwik8_text_reward_document",
254
+ "vocab_file": "data/gpt2-vocab.json",
255
+ "merge_file": "data/gpt2-merges.txt",
256
+ "save": "checkpoints",
257
+ "load": "checkpoints",
258
+ "tensorboard_dir": "tensorboard",
259
+ "log_dir": "logs",
260
+ "checkpoint_factor": 10000,
261
+ "eval_interval": 1000,
262
+ "eval_iters": 10,
263
+ ```
264
+
265
+ For DPO style training, you'll need to set pos/neg data paths instead of a single one, e.g.
266
+
267
+ ```yaml
268
+ "dataset_impl": "pairwise",
269
+ "train_impl": "dpo",
270
+ "pack_impl": "unpacked",
271
+ "dpo_beta": 0.1,
272
+ "dpo_fp32": true,
273
+ "pos_train_data_path": "data/enwik8/enwik8_text_pos_document",
274
+ "pos_valid_data_path": "data/enwik8/enwik8_text_pos_document",
275
+ "pos_test_data_path": "data/enwik8/enwik8_text_pos_document",
276
+ "neg_train_data_path": "data/enwik8/enwik8_text_neg_document",
277
+ "neg_valid_data_path": "data/enwik8/enwik8_text_neg_document",
278
+ "neg_test_data_path": "data/enwik8/enwik8_text_neg_document",
279
+ ## If you have labels... (likely to mask out user turns)
280
+ "pos_train_label_data_path": "data/enwik8/enwik8_text_pos_label_document",
281
+ "pos_valid_label_data_path": "data/enwik8/enwik8_text_pos_label_document",
282
+ "pos_test_label_data_path": "data/enwik8/enwik8_text_pos_label_document",
283
+ "neg_train_label_data_path": "data/enwik8/enwik8_text_neg_label_document",
284
+ "neg_valid_label_data_path": "data/enwik8/enwik8_text_neg_label_document",
285
+ "neg_test_label_data_path": "data/enwik8/enwik8_text_neg_label_document",
286
+ ## If you want to precompute the logits over your dataset...
287
+ "precompute_model_name": "gpt2",
288
+ ## Needed for the generation.py step, if precomputing
289
+ "text_gen_type": "precompute"
290
+ ```
291
+
292
+ ### LR Scheduler settings
293
+
294
+ ```yaml
295
+ "lr_decay_iters": 320000,
296
+ "lr_decay_style": "cosine",
297
+ "warmup": 0.01,
298
+ ```
299
+
300
+ Settings used to modify the learning rate over time.
301
+
302
+ N.B - `OneBitAdam` requires you to use deepspeed's internal lr scheduler because reasons. Currently the lr decay style defaults to deepspeed's `WarmupDecay`.
303
+
304
+ ### Activation Checkpointing Settings:
305
+
306
+ ```yaml
307
+ "checkpoint_activations": true,
308
+ "checkpoint_num_layers": 1,
309
+ "partition_activations": true,
310
+ "synchronize_each_layer": true,
311
+ ```
312
+
313
+ Checkpointing works by trading compute for memory. Rather than storing all intermediate activations of the entire computation graph for computing backward, the checkpointed part does not save intermediate activations, and instead recomputes them in the backward pass.
314
+
315
+ ### Mixed Precision Training Settings:
316
+ gpt-neox's fp16 training is configured identically to DeepSpeed's, please see [their documentation](https://www.deepspeed.ai/docs/config-json/#fp16-training-options) for more information.
317
+ An example config for fp16 training:
318
+
319
+ ```yaml
320
+ "fp16": {
321
+ "enabled": true,
322
+ "loss_scale": 0,
323
+ "loss_scale_window": 1000,
324
+ "hysteresis": 2,
325
+ "min_loss_scale": 1
326
+ },
327
+ ```
328
+
329
+ Alternatively you can use the `precision` config which can be set to `fp16`, `bfloat16`, or `fp32`. If you set `"precision": "fp16"` without adding a `"fp16": {...}` dict, then it will simply use DeepSpeed's defaults for fp16 training.
330
+
331
+
332
+ ### SLURM Settings
333
+
334
+ If you are running GPT-NeoX on a SLURM cluster and wish to use SLURM to coordinate nodes, then you must set the following variables in your config:
335
+
336
+ ```yaml
337
+ "launcher": "slurm",
338
+ "deepspeed_slurm": true
339
+ ```
340
+
341
+ Additionally, you need to modify _all_ of your configs to conform to the JSON format. When launching a GPT-NeoX job you can specify multiple YAML config files. Internally, all of these files are merged into one config and then passed as a single long command line argument to Deep(er)Speed. When using SLURM and its internal command `srun`, python fails to parse this long command line argument unless it is in the more restrictive JSON format. In practice, the example NeoX configs are already very close to JSON. As an example, this is a snippet of a YAML-compatible config, N.B. the comment and the capital-F `False`:
342
+
343
+ ```yaml
344
+ # optimizer settings
345
+ "optimizer": {
346
+ "type": "OneBitAdam",
347
+ "params": {
348
+ "lr": 0.0001,
349
+ "freeze_step": 23000,
350
+ "betas": [0.9, 0.95],
351
+ "cuda_aware": False,
352
+ "comm_backend_name": "nccl"
353
+ }
354
+ ```
355
+
356
+ To make this JSON just remove the comment and use all lowercase for the boolean:
357
+
358
+ ```yaml
359
+ "optimizer": {
360
+ "type": "OneBitAdam",
361
+ "params": {
362
+ "lr": 0.0001,
363
+ "freeze_step": 23000,
364
+ "betas": [0.9, 0.95],
365
+ "cuda_aware": false,
366
+ "comm_backend_name": "nccl"
367
+ }
368
+ ```
configs/autotuning_configs/small_tune.json ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "pipe-parallel-size": 1,
3
+ "model-parallel-size": 1,
4
+
5
+ "num-layers": 12,
6
+ "hidden-size": 768,
7
+ "num-attention-heads": 12,
8
+ "seq-length": 2048,
9
+ "max-position-embeddings": 2048,
10
+ "norm": "layernorm",
11
+ "pos-emb": "rotary",
12
+ "no-weight-tying": true,
13
+
14
+ "scaled-upper-triang-masked-softmax-fusion": false,
15
+ "bias-gelu-fusion": false,
16
+
17
+
18
+ "optimizer": {
19
+ "type": "Adam",
20
+ "params": {
21
+ "lr": 0.0006,
22
+ "betas": [0.9, 0.999],
23
+ "eps": 1.0e-8
24
+ }
25
+ },
26
+
27
+ "train_micro_batch_size_per_gpu": 1,
28
+ "data-impl": "mmap",
29
+ "split": "949,50,1",
30
+
31
+ "checkpoint-activations": true,
32
+ "checkpoint-num-layers": 1,
33
+ "partition-activations": true,
34
+ "synchronize-each-layer": true,
35
+
36
+ "gradient_clipping": 1.0,
37
+ "weight-decay": 0.0,
38
+ "hidden-dropout": 0.0,
39
+ "attention-dropout": 0.0,
40
+
41
+ "fp16": {
42
+ "enabled": true,
43
+ "loss_scale": 0,
44
+ "loss_scale_window": 1000,
45
+ "hysteresis": 2,
46
+ "min_loss_scale": 1
47
+ },
48
+
49
+ "train-iters": 320000,
50
+ "lr-decay-iters": 320000,
51
+ "distributed-backend": "nccl",
52
+ "lr-decay-style": "cosine",
53
+ "warmup": 0.01,
54
+ "save-interval": 10000,
55
+ "eval-interval": 1000,
56
+ "eval-iters": 10,
57
+
58
+ "log-interval": 100,
59
+ "steps_per_print": 10,
60
+ "keep-last-n-checkpoints": 4,
61
+ "wall_clock_breakdown": true,
62
+ "launcher": "slurm",
63
+ "deepspeed_slurm": true,
64
+ "comment": "neox",
65
+ "autotuning": {
66
+ "enabled": true,
67
+ "arg_mappings": {
68
+ "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu",
69
+ "gradient_accumulation_steps ": "--gradient_accumulation_steps"
70
+ }
71
+ },
72
+ "zero_optimization": {
73
+ "stage": [0, 1, 2, 3]
74
+ },
75
+ "train-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"],
76
+ "valid-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"],
77
+ "test-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"]
78
+ }
configs/autotuning_configs/tune.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "pipe-parallel-size": 1,
3
+ "model-parallel-size": 1,
4
+ "num-layers": 12,
5
+ "hidden-size": 768,
6
+ "num-attention-heads": 12,
7
+ "seq-length": 2048,
8
+ "max-position-embeddings": 2048,
9
+ "norm": "layernorm",
10
+ "pos-emb": "rotary",
11
+ "no-weight-tying": true,
12
+ "scaled-upper-triang-masked-softmax-fusion": true,
13
+ "bias-gelu-fusion": true,
14
+ "optimizer": {
15
+ "type": "Adam",
16
+ "params": {
17
+ "lr": 0.0006,
18
+ "betas": [0.9, 0.999],
19
+ "eps": 1.0e-8
20
+ }
21
+ },
22
+ "zero_optimization": {
23
+ "stage": 0,
24
+ "allgather_partitions": true,
25
+ "allgather_bucket_size": 500000000,
26
+ "overlap_comm": true,
27
+ "reduce_scatter": true,
28
+ "reduce_bucket_size": 500000000,
29
+ "contiguous_gradients": true,
30
+ "cpu_offload": false
31
+ },
32
+ "train_micro_batch_size_per_gpu": 1,
33
+ "autotuning_config": {
34
+ "enabled": true,
35
+ "arg_mappings": {
36
+ "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu",
37
+ "gradient_accumulation_steps ": "--gradient_accumulation_steps"
38
+ }
39
+ },
40
+ "data-impl": "mmap",
41
+ "split": "949,50,1",
42
+ "checkpoint-activations": true,
43
+ "checkpoint-num-layers": 1,
44
+ "partition-activations": true,
45
+ "synchronize-each-layer": true,
46
+ "gradient_clipping": 1.0,
47
+ "weight-decay": 0.0,
48
+ "hidden-dropout": 0.0,
49
+ "attention-dropout": 0.0,
50
+ "fp16": {
51
+ "enabled": true,
52
+ "loss_scale": 0,
53
+ "loss_scale_window": 1000,
54
+ "hysteresis": 2,
55
+ "min_loss_scale": 1
56
+ },
57
+ "train-iters": 200,
58
+ "lr-decay-iters": 320000,
59
+ "distributed-backend": "nccl",
60
+ "lr-decay-style": "cosine",
61
+ "warmup": 0.01,
62
+ "save-interval": 10000,
63
+ "eval-interval": 1000,
64
+ "eval-iters": 10,
65
+ "log-interval": 100,
66
+ "steps_per_print": 10,
67
+ "keep-last-n-checkpoints": 4,
68
+ "wall_clock_breakdown": true,
69
+ "launcher": "slurm",
70
+ "deepspeed_slurm": true,
71
+ "comment": "neox"
72
+ }
configs/autotuning_configs/tune_1-3B.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "pipe-parallel-size": 1,
3
+ "model-parallel-size": 1,
4
+
5
+ "num-layers": 24,
6
+ "hidden-size": 2048,
7
+ "num-attention-heads": 16,
8
+ "seq-length": 2048,
9
+ "max-position-embeddings": 2048,
10
+ "norm": "layernorm",
11
+ "pos-emb": "rotary",
12
+ "no-weight-tying": true,
13
+ "gpt_j_residual": false,
14
+ "output_layer_parallelism": "column",
15
+ "attention_config": [[["flash"], 24]],
16
+ "scaled-upper-triang-masked-softmax-fusion": false,
17
+ "bias-gelu-fusion": false,
18
+
19
+ "init_method": "small_init",
20
+ "output_layer_init_method": "wang_init",
21
+
22
+ "optimizer": {
23
+ "type": "Adam",
24
+ "params": {
25
+ "lr": 0.0002,
26
+ "betas": [0.9, 0.95],
27
+ "eps": 1.0e-8
28
+ }
29
+ },
30
+ "min_lr": 0.00002,
31
+
32
+ "zero_optimization": {
33
+ "stage": 1,
34
+ "allgather_partitions": true,
35
+ "allgather_bucket_size": 500000000,
36
+ "overlap_comm": true,
37
+ "reduce_scatter": true,
38
+ "reduce_bucket_size": 500000000,
39
+ "contiguous_gradients": true
40
+ },
41
+ "train_micro_batch_size_per_gpu": 1,
42
+ "autotuning": {
43
+ "enabled": true,
44
+ "arg_mappings": {
45
+ "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu",
46
+ "gradient_accumulation_steps ": "--gradient_accumulation_steps"
47
+ }
48
+ },
49
+ "data-impl": "mmap",
50
+
51
+ "checkpoint-activations": false,
52
+ "checkpoint-num-layers": 1,
53
+ "partition-activations": true,
54
+ "synchronize-each-layer": true,
55
+
56
+ "gradient_clipping": 1.0,
57
+ "weight-decay": 0.1,
58
+ "hidden-dropout": 0,
59
+ "attention-dropout": 0,
60
+
61
+ "fp16": {
62
+ "fp16": true,
63
+ "enabled": true,
64
+ "loss_scale": 0,
65
+ "loss_scale_window": 1000,
66
+ "hysteresis": 2,
67
+ "min_loss_scale": 1
68
+ },
69
+
70
+ "train-iters": 320000,
71
+ "lr-decay-iters": 320000,
72
+ "distributed-backend": "nccl",
73
+ "lr-decay-style": "cosine",
74
+ "warmup": 0.01,
75
+ "checkpoint-factor": 10000,
76
+ "eval-interval": 1000,
77
+ "eval-iters": 10,
78
+ "launcher": "slurm",
79
+ "deepspeed_slurm": true,
80
+ "no_ssh_check": true,
81
+
82
+ "log-interval": 10,
83
+ "steps_per_print": 10,
84
+ "keep-last-n-checkpoints": 1,
85
+ "wall_clock_breakdown": true
86
+ }
configs/autotuning_configs/tune_6-7B.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "pipe-parallel-size": 1,
3
+ "model-parallel-size": 8,
4
+
5
+ "num-layers": 32,
6
+ "hidden-size": 4096,
7
+ "num-attention-heads": 32,
8
+ "seq-length": 2048,
9
+ "max-position-embeddings": 2048,
10
+ "norm": "layernorm",
11
+ "pos-emb": "rotary",
12
+ "no-weight-tying": true,
13
+
14
+ "scaled-upper-triang-masked-softmax-fusion": false,
15
+ "bias-gelu-fusion": false,
16
+
17
+
18
+ "optimizer": {
19
+ "type": "Adam",
20
+ "params": {
21
+ "lr": 0.00012,
22
+ "betas": [0.9, 0.999],
23
+ "eps": 1.0e-8
24
+ }
25
+ },
26
+
27
+ "train_micro_batch_size_per_gpu": 1,
28
+ "zero_optimization": {
29
+ "stage": [0, 1, 2, 3]
30
+ },
31
+ "data-impl": "mmap",
32
+ "split": "949,50,1",
33
+
34
+ "checkpoint-activations": true,
35
+ "checkpoint-num-layers": 1,
36
+ "partition-activations": true,
37
+ "synchronize-each-layer": true,
38
+
39
+ "gradient_clipping": 1.0,
40
+ "weight-decay": 0,
41
+ "hidden-dropout": 0,
42
+ "attention-dropout": 0,
43
+
44
+ "fp16": {
45
+ "fp16": true,
46
+ "enabled": true,
47
+ "loss_scale": 0,
48
+ "loss_scale_window": 1000,
49
+ "hysteresis": 2,
50
+ "min_loss_scale": 1
51
+ },
52
+
53
+ "train-iters": 100,
54
+ "lr-decay-iters": 320000,
55
+ "distributed-backend": "nccl",
56
+ "lr-decay-style": "cosine",
57
+ "warmup": 0.01,
58
+ "checkpoint-factor": 10000,
59
+ "eval-interval": 1000,
60
+ "eval-iters": 10,
61
+ "log-interval": 100,
62
+ "steps_per_print": 10,
63
+ "keep-last-n-checkpoints": 4,
64
+ "wall_clock_breakdown": true,
65
+ "launcher": "slurm",
66
+ "deepspeed_slurm": true,
67
+ "no_ssh_check": true,
68
+ "comment": "neox",
69
+ "autotuning": {
70
+ "enabled": true,
71
+ "mp_size": 8,
72
+ "arg_mappings": {
73
+ "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu",
74
+ "gradient_accumulation_steps ": "--gradient_accumulation_steps"
75
+ }
76
+ }
77
+ }
configs/bf16_125M.yml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GPT-2 pretraining setup
2
+ {
3
+ # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4
+ # across the node boundaries )
5
+ "pipe_parallel_size": 1,
6
+ "model_parallel_size": 1,
7
+
8
+ # model settings
9
+ "num_layers": 12,
10
+ "hidden_size": 768,
11
+ "num_attention_heads": 12,
12
+ "seq_length": 2048,
13
+ "max_position_embeddings": 2048,
14
+ "norm": "layernorm",
15
+ "pos_emb": "rotary",
16
+ "no_weight_tying": true,
17
+
18
+ # these should provide some speedup but take a while to build, set to true if desired
19
+ "scaled_upper_triang_masked_softmax_fusion": false,
20
+ "bias_gelu_fusion": false,
21
+ "rope_fusion": false,
22
+ "layernorm_fusion": false,
23
+
24
+
25
+ # optimizer settings
26
+ "optimizer": {
27
+ "type": "Adam",
28
+ "params": {
29
+ "lr": 0.0006,
30
+ "betas": [0.9, 0.999],
31
+ "eps": 1.0e-8,
32
+ }
33
+ },
34
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
35
+ "zero_optimization": {
36
+ "stage": 0,
37
+ "allgather_partitions": True,
38
+ "allgather_bucket_size": 500000000,
39
+ "overlap_comm": True,
40
+ "reduce_scatter": True,
41
+ "reduce_bucket_size": 500000000,
42
+ "contiguous_gradients": True,
43
+ },
44
+
45
+ # batch / data settings
46
+ "train_micro_batch_size_per_gpu": 4,
47
+ "data_impl": "mmap",
48
+ "split": "949,50,1",
49
+
50
+ # activation checkpointing
51
+ "checkpoint_activations": true,
52
+ "checkpoint_num_layers": 1,
53
+ "partition_activations": true,
54
+ "synchronize_each_layer": true,
55
+
56
+ # regularization
57
+ "gradient_clipping": 1.0,
58
+ "weight_decay": 0.0,
59
+ "hidden_dropout": 0.0,
60
+ "attention_dropout": 0.0,
61
+
62
+ "precision": "bfloat16",
63
+
64
+ "fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32
65
+ # misc. training settings
66
+ "train_iters": 320000,
67
+ "lr_decay_iters": 320000,
68
+ "distributed_backend": "nccl",
69
+ "lr_decay_style": "cosine",
70
+ "warmup": 0.01,
71
+ "checkpoint_factor": 10000,
72
+ "eval_interval": 1000,
73
+ "eval_iters": 10,
74
+
75
+ # logging
76
+ "log_interval": 100,
77
+ "steps_per_print": 10,
78
+ "keep_last_n_checkpoints": 4,
79
+ "wall_clock_breakdown": true,
80
+ }
configs/bnb_125M.yml ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GPT-2 pretraining setup
2
+ {
3
+ # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4
+ # across the node boundaries )
5
+ "pipe_parallel_size": 1,
6
+ "model_parallel_size": 1,
7
+
8
+ # model settings
9
+ "num_layers": 12,
10
+ "hidden_size": 768,
11
+ "num_attention_heads": 12,
12
+ "seq_length": 2048,
13
+ "max_position_embeddings": 2048,
14
+ "norm": "layernorm",
15
+ "pos_emb": "rotary",
16
+ "no_weight_tying": true,
17
+ "use_bnb_optimizer": true,
18
+
19
+ # these should provide some speedup but take a while to build, set to true if desired
20
+ "scaled_upper_triang_masked_softmax_fusion": false,
21
+ "bias_gelu_fusion": false,
22
+ "rope_fusion": false,
23
+ "layernorm_fusion": false,
24
+
25
+
26
+ # optimizer settings
27
+ "optimizer": {
28
+ "type": "Adam",
29
+ "params": {
30
+ "lr": 0.0006,
31
+ "betas": [0.9, 0.999],
32
+ "eps": 1.0e-8,
33
+ }
34
+ },
35
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
36
+ "zero_optimization": {
37
+ "stage": 0,
38
+ "allgather_partitions": True,
39
+ "allgather_bucket_size": 500000000,
40
+ "overlap_comm": True,
41
+ "reduce_scatter": True,
42
+ "reduce_bucket_size": 500000000,
43
+ "contiguous_gradients": True,
44
+ },
45
+
46
+ # batch / data settings
47
+ "train_micro_batch_size_per_gpu": 4,
48
+ "data_impl": "mmap",
49
+ "split": "949,50,1",
50
+
51
+ # activation checkpointing
52
+ "checkpoint_activations": true,
53
+ "checkpoint_num_layers": 1,
54
+ "partition_activations": true,
55
+ "synchronize_each_layer": true,
56
+
57
+ # regularization
58
+ "gradient_clipping": 1.0,
59
+ "weight_decay": 0.0,
60
+ "hidden_dropout": 0.0,
61
+ "attention_dropout": 0.0,
62
+
63
+ # precision settings
64
+ "fp16": {
65
+ "enabled": true,
66
+ "loss_scale": 0,
67
+ "loss_scale_window": 1000,
68
+ "hysteresis": 2,
69
+ "min_loss_scale": 1
70
+ },
71
+
72
+ # misc. training settings
73
+ "train_iters": 320000,
74
+ "lr_decay_iters": 320000,
75
+ "distributed_backend": "nccl",
76
+ "lr_decay_style": "cosine",
77
+ "warmup": 0.01,
78
+ "checkpoint_factor": 10000,
79
+ "eval_interval": 1000,
80
+ "eval_iters": 10,
81
+
82
+ # logging
83
+ "log_interval": 100,
84
+ "steps_per_print": 10,
85
+ "keep_last_n_checkpoints": 4,
86
+ "wall_clock_breakdown": true,
87
+ }
configs/cpu_mock_config.yml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # CPU unit tests should be independent of the presence of GPUs on the test server
2
+ # host. This configuration mocks these GPU resources and other dependencies.
3
+ {
4
+ "global_num_gpus": 1
5
+ }
configs/docker/pythia-paths.yml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train-data-paths": ["/home/mchorse/data/pile_deduped/pile_0.87_deduped_text_document"],
3
+ "valid-data-paths": ["/home/mchorse/data/pile_deduped/pile_0.87_deduped_text_document"],
4
+ "test-data-paths": ["/home/mchorse/data/pile_deduped/pile_0.87_deduped_text_document"],
5
+
6
+ "tokenizer-type": "HFTokenizer",
7
+ "vocab-file": "/home/mchorse/data/tokenizers/20B_tokenizer.json",
8
+
9
+ "save": "/home/mchorse/chk/",
10
+ "load": "/home/mchorse/chk/",
11
+ "checkpoint_validation_with_forward_pass": False
12
+ }
configs/eleutherai_cluster.yml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Data paths and options when using EleutherAI cluster
2
+ {
3
+ # you may include multiple distinct datasets if desired
4
+ "train_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_text_document"],
5
+ "valid_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_val_text_document"],
6
+ "test_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_test_text_document"],
7
+
8
+ # if using multiple datasets, provide weights for them to be sampled with
9
+ # "train_data_weights": [1., 2.],
10
+ # "test_data_weights": [2., 1.],
11
+ # "valid_data_weights": [0.5, 0.4],
12
+
13
+
14
+ # If you would like the code to create val and test datasets from your training set use the following instead
15
+ # "split" determines the relative size of train, val, and test
16
+
17
+ # "split": "995,4,1",
18
+ # "data_path": "/mnt/ssd-1/data/enwik8/enwik8_text_document",
19
+
20
+ "vocab_file": "/mnt/ssd-1/data/gpt2-vocab.json",
21
+ "merge_file": "/mnt/ssd-1/data/gpt2-merges.txt",
22
+ "save": "/mnt/ssd-1/checkpoints",
23
+ "load": "/mnt/ssd-1/checkpoints",
24
+ "tensorboard_dir": "/mnt/ssd-1/tensorboard",
25
+ "log_dir": "/mnt/ssd-1/logs",
26
+ "wandb_team": "eleutherai",
27
+ "wandb_project": "neox",
28
+ "wandb_group": "example"
29
+ }
configs/finetuning_configs/6-9B.yml ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ # finetuning option
3
+ "load": "/path/to/checkpoint",
4
+ "finetune": true,
5
+
6
+ "pipe-parallel-size": 1,
7
+ "model-parallel-size": 2,
8
+
9
+ "num-layers": 32,
10
+ "hidden-size": 4096,
11
+ "num-attention-heads": 32,
12
+ "seq-length": 2048,
13
+ "max-position-embeddings": 2048,
14
+ "norm": "layernorm",
15
+ "pos-emb": "rotary",
16
+ "rotary_pct": 0.25,
17
+ "no-weight-tying": true,
18
+ "gpt_j_residual": true,
19
+ "output_layer_parallelism": "column",
20
+
21
+ "attention-config": [[["flash"], 32]],
22
+
23
+ "scaled-upper-triang-masked-softmax-fusion": true,
24
+ "bias-gelu-fusion": true,
25
+
26
+
27
+ "optimizer": {
28
+ "type": "Adam",
29
+ "params": {
30
+ "lr": 0.00012,
31
+ "betas": [0.9, 0.95],
32
+ "eps": 1.0e-8
33
+ }
34
+ },
35
+
36
+ "min_lr": 0.000012,
37
+
38
+ "zero_optimization": {
39
+ "stage": 1,
40
+ "allgather_partitions": true,
41
+ "allgather_bucket_size": 1260000000,
42
+ "overlap_comm": true,
43
+ "reduce_scatter": true,
44
+ "reduce_bucket_size": 1260000000,
45
+ "contiguous_gradients": true,
46
+ "cpu_offload": false,
47
+ "load_from_fp32_weights": False, # if checkpoint has fp16/bf16 params
48
+ },
49
+
50
+ "train_micro_batch_size_per_gpu": 8,
51
+ "gradient_accumulation_steps": 2,
52
+ "data-impl": "mmap",
53
+
54
+ "checkpoint-activations": true,
55
+ "checkpoint-num-layers": 1,
56
+ "partition-activations": true,
57
+ "synchronize-each-layer": true,
58
+
59
+ "gradient_clipping": 1.0,
60
+ "weight-decay": 0.1,
61
+ "hidden-dropout": 0,
62
+ "attention-dropout": 0,
63
+
64
+ "fp16": {
65
+ "fp16": true,
66
+ "enabled": true,
67
+ "loss_scale": 0,
68
+ "loss_scale_window": 1000,
69
+ "initial_scale_power": 12,
70
+ "hysteresis": 2,
71
+ "min_loss_scale": 1
72
+ },
73
+
74
+ "train-iters": 143000,
75
+ "lr-decay-iters": 143000,
76
+ "distributed-backend": "nccl",
77
+ "lr-decay-style": "cosine",
78
+ "warmup": 0.01,
79
+ "checkpoint-factor": 1000,
80
+ "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512],
81
+ "eval-interval": 143000,
82
+ "eval-iters": 10,
83
+
84
+ "log-interval": 10,
85
+ "steps_per_print": 10,
86
+ "wall_clock_breakdown": true,
87
+
88
+ "tokenizer_type": "HFTokenizer"
89
+ }
configs/gen_docs.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+
4
+ sys.path.append(
5
+ os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))
6
+ )
7
+ from megatron.neox_arguments import neox_args, deepspeed_args
8
+ from inspect import getmembers, getsource
9
+ from dataclasses import field, is_dataclass
10
+ from itertools import tee, zip_longest
11
+ import pathlib
12
+
13
+
14
+ def pairwise(iterable):
15
+ "s -> (s0,s1), (s1,s2), ..., (s_last, None)"
16
+ a, b = tee(iterable)
17
+ next(b, None)
18
+ return zip_longest(a, b)
19
+
20
+
21
+ def get_docs(module):
22
+ ARGS_CLASSES = getmembers(module, is_dataclass)
23
+ results = {}
24
+ for name, dcls in ARGS_CLASSES:
25
+ assert is_dataclass(dcls)
26
+ src = getsource(dcls)
27
+ d = dcls()
28
+ loc = 0
29
+ results[name] = {"doc": d.__doc__.strip(), "attributes": {}}
30
+ for cur, _next in pairwise(d.__dataclass_fields__.items()):
31
+ field_name, field_def = cur
32
+ field_type = field_def.type
33
+ if hasattr(field_type, "__name__"):
34
+ if field_type.__name__ == "Literal" or field_type.__name__ == "Union":
35
+ field_type = field_type
36
+ else:
37
+ field_type = str(field_type.__name__)
38
+ else:
39
+ field_type = str(field_type)
40
+
41
+ field_default = field_def.default
42
+
43
+ # try to find the field definition
44
+ loc = src.find(f" {field_name}:", loc + len(field_name) + 1)
45
+
46
+ if _next is not None:
47
+ next_field_name, _ = _next
48
+ # try to find the next field definition
49
+ next_loc = src.find(f"{next_field_name}:", loc + len(field_name))
50
+ else:
51
+ next_loc = len(src)
52
+
53
+ # try to get the docstring
54
+ _src = src[loc:next_loc].strip()
55
+ if '"""' in _src:
56
+ doc = _src.split('"""')[1].strip()
57
+ elif "'''" in _src:
58
+ doc = _src.split("'''")[1].strip()
59
+ else:
60
+ doc = ""
61
+ results[name]["attributes"][field_name] = {
62
+ "name": field_name,
63
+ "type": field_type,
64
+ "default": field_default,
65
+ "doc": doc,
66
+ }
67
+ return results
68
+
69
+
70
+ def to_md(docs, intro_str=""):
71
+ """
72
+ Writes the docs dictionary to markdown format
73
+ """
74
+ lines = []
75
+ lines.append(intro_str)
76
+ for name, doc in docs.items():
77
+ lines.append(f"## {name}")
78
+ lines.append(f"{doc['doc']}")
79
+ lines.append("")
80
+ for field_name, field_def in doc["attributes"].items():
81
+ # attribute name and type
82
+ lines.append(f"- **{field_name}**: {field_def['type']}")
83
+ # default value
84
+ lines.append(f" Default = {str(field_def['default'])}")
85
+ lines.append(f" {field_def['doc']}")
86
+ lines.append("")
87
+ return "\n\n".join(lines)
88
+
89
+
90
+ if __name__ == "__main__":
91
+ docs = get_docs(neox_args)
92
+ docs.update(get_docs(deepspeed_args))
93
+ intro_str = """Arguments for gpt-neox. All of the following can be specified in your .yml config file(s):\n"""
94
+ md = to_md(docs, intro_str=intro_str)
95
+ with open(f"{pathlib.Path(__file__).parent.resolve()}/neox_arguments.md", "w") as f:
96
+ f.write(md)
configs/gmlp_small.yml ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # gMLP (small) pretraining setup
2
+ {
3
+ # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
4
+ # across the node boundaries )
5
+ "pipe_parallel_size": 1,
6
+ "model_parallel_size": 1,
7
+ "attention_config": [[["gmlp"], "all"]],
8
+
9
+
10
+ # model settings
11
+ "num_layers": 12,
12
+ "hidden_size": 768, # gmlp d_ff defaults to hidden_size * 4
13
+ "gmlp_attn_dim": 64,
14
+ "num_attention_heads": 12, # this has no effect with gmlp - and amlp defaults to single head attention.
15
+ "seq_length": 2048,
16
+ "max_position_embeddings": 2048,
17
+ "norm": "layernorm",
18
+ "pos_emb": "none",
19
+ "no_weight_tying": true,
20
+
21
+ # optimizer settings
22
+ "optimizer": {
23
+ "type": "Adam",
24
+ "params": {
25
+ "lr": 0.0006,
26
+ "betas": [0.9, 0.999],
27
+ "eps": 1.0e-8,
28
+ }
29
+ },
30
+
31
+ # batch / data settings
32
+ "train_micro_batch_size_per_gpu": 4,
33
+ "data_impl": "mmap",
34
+ "split": "949,50,1",
35
+
36
+ # activation checkpointing
37
+ "checkpoint_activations": true,
38
+ "checkpoint_num_layers": 1,
39
+ "partition_activations": false,
40
+ "synchronize_each_layer": true,
41
+
42
+ # regularization
43
+ "gradient_clipping": 1.0,
44
+ "weight_decay": 0.1,
45
+ "hidden_dropout": 0.0,
46
+ "attention_dropout": 0.0,
47
+
48
+ # precision settings
49
+ "fp16": {
50
+ "enabled": true,
51
+ "loss_scale": 0,
52
+ "loss_scale_window": 1000,
53
+ "hysteresis": 2,
54
+ "min_loss_scale": 1
55
+ },
56
+
57
+ # misc. training settings
58
+ "train_iters": 320000,
59
+ "lr_decay_iters": 320000,
60
+ "distributed_backend": "nccl",
61
+ "lr_decay_style": "cosine",
62
+ "warmup": 0.01,
63
+ "checkpoint_factor": 10000,
64
+ "eval_interval": 1000,
65
+ "eval_iters": 10,
66
+
67
+ # logging
68
+ "log_interval": 100,
69
+ "steps_per_print": 10,
70
+ "keep_last_n_checkpoints": 4,
71
+ "wall_clock_breakdown": true,
72
+ }
configs/llama/13B.yml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "pipe_parallel_size": 1,
3
+ "model_parallel_size": 2,
4
+ "make_vocab_size_divisible_by": 1,
5
+
6
+ # model settings
7
+ "num_layers": 40,
8
+ "hidden_size": 5120,
9
+ "num_attention_heads": 40,
10
+ "seq_length": 2048,
11
+ "max_position_embeddings": 2048,
12
+ "pos_emb": "rotary",
13
+ "rotary_pct": 1,
14
+ "no_weight_tying": true,
15
+ "gpt_j_residual": false,
16
+ "output_layer_parallelism": "column",
17
+ "norm": "rmsnorm",
18
+ "rms_norm_epsilon": 1.0e-6,
19
+
20
+ "scaled_upper_triang_masked_softmax_fusion": true,
21
+ "bias_gelu_fusion": false,
22
+ "use_bias_in_norms": false,
23
+ "use_bias_in_attn_linear": false,
24
+ "activation": "swiglu",
25
+ "mlp_multiple_of": 256,
26
+ }