taozi555 committed
Commit 1ea204c (parent: ee6f462)

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+.ipynb_checkpoints/LimaRP-augmented-8k-context-checkpoint.json filter=lfs diff=lfs merge=lfs -text
+11.json filter=lfs diff=lfs merge=lfs -text
+Claude-3-Opus-Instruct-15K/.ipynb_checkpoints/Claude3-Opus-Multi-Instruct-5K-v1-checkpoint.json filter=lfs diff=lfs merge=lfs -text
+Claude-3-Opus-Instruct-15K/Claude3-Opus-Multi-Instruct-5K-v1.json filter=lfs diff=lfs merge=lfs -text
+Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-3.5K-Filtered-v2.json filter=lfs diff=lfs merge=lfs -text
+Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-6.5K-Filtered-v2.json filter=lfs diff=lfs merge=lfs -text
+LimaRP-augmented-8k-context.json filter=lfs diff=lfs merge=lfs -text
+clean-gpt2.json filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/LimaRP-augmented-8k-context-checkpoint.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb7ac99415c0870fd9cfc48d0bce2aa980fb975eaf71e50dde47c39fc1c66648
+size 20503146
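The JSON datasets added in this commit are stored as Git LFS pointers, so the three lines above (`version`, `oid`, `size`) are the entire on-disk file until LFS fetches the real payload. A minimal sketch of inspecting such a pointer in Python (the path is just the file shown above; this snippet is illustrative and not part of the repo):

```python
# Minimal sketch: parse the three key/value lines of a Git LFS pointer file
# and report the object id and payload size it stands in for.
from pathlib import Path

def read_lfs_pointer(path: str) -> dict:
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

ptr = read_lfs_pointer(".ipynb_checkpoints/LimaRP-augmented-8k-context-checkpoint.json")
print(ptr.get("oid"), ptr.get("size"))  # e.g. sha256:fb7ac994... 20503146
```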
.ipynb_checkpoints/config-checkpoint.yaml ADDED
@@ -0,0 +1,119 @@
+base_model: /root/autodl-tmp/c4/Meta-Llama-3-8B-Instruct-abliterated-v3
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+tokenizer_use_fast: false
+
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+model_config:
+
+datasets:
+  - path: /root/autodl-tmp/c4/11.json
+    type: sharegpt
+    conversation: llama3
+    roles:
+      input: user
+      output: assistant
+  - path: /root/autodl-tmp/c4/LimaRP-augmented-8k-context.json
+    type: sharegpt
+    conversation: llama3
+  - path: Sao10K/Short-Storygen-v2
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      field_system: system
+      field_output: response
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/Claude-3-Opus-Instruct-15K/Claude3-Opus-Multi-Instruct-5K-v1.json
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      #field_input: prompt
+      field_output: response
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-3.5K-Filtered-v2.json
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      #field_input: prompt
+      field_output: response
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-6.5K-Filtered-v2.json
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      #field_input: prompt
+      field_output: response
+    conversation: llama3
+  - path: SicariusSicariiStuff/Bluemoon_Top50MB_Sorted_Fixed
+    type: sharegpt
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/clean-gpt2.json
+    type: sharegpt
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/LimaRP-augmented-8k-context.json
+    type: sharegpt
+    conversation: llama3
+
+chat_template: llama3
+
+
+dataset_prepared_path: /root/autodl-tmp/thingy
+val_set_size: 0.005
+output_dir: ./out
+
+sequence_len: 4096
+sample_packing: true
+pad_to_sequence_len: true
+
+gradient_accumulation_steps: 4
+micro_batch_size: 3
+num_epochs: 5
+logging_steps: 1
+optimizer: adamw_8bit
+lr_scheduler: cosine
+learning_rate: 2e-5
+
+wandb_project: llama3-8b-hiwaifu
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+saves_per_epoch: 1
+save_total_limit: 2
+save_steps:
+evals_per_epoch: 4
+eval_sample_packing: false
+debug:
+deepspeed: /root/autodl-tmp/c4/axolotl/deepspeed_configs/zero3_bf16.json
+weight_decay: 0.05
+fsdp:
+fsdp_config:
+special_tokens:
+  eos_token: "<|eot_id|>"
+  pad_token: "<|end_of_text|>"
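The `datasets:` list above mixes local ShareGPT-style JSON files with two Hugging Face datasets (`Sao10K/Short-Storygen-v2`, `SicariusSicariiStuff/Bluemoon_Top50MB_Sorted_Fixed`), all templated as `llama3` conversations. A quick pre-flight check of one local file might look like the sketch below; the `conversations`/`content` keys are an assumption borrowed from the dedup scripts later in this commit, not something the config itself guarantees:

```python
# Hypothetical sanity check: confirm a local dataset from config.yaml is valid
# JSON and that each record carries the fields the sharegpt loader and the
# dedup scripts in this repo appear to expect.
import json

path = "/root/autodl-tmp/c4/11.json"  # first local dataset listed in the config

with open(path, "r", encoding="utf-8") as fh:
    records = json.load(fh)

print(f"{len(records)} records, first record keys: {sorted(records[0].keys())}")

# Assumed shape (see test.py): a 'conversations' list whose turns have 'content'.
assert all("conversations" in rec for rec in records[:100])
assert all("content" in turn for turn in records[0]["conversations"])
```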
.ipynb_checkpoints/mergekit-checkpoint.yaml ADDED
@@ -0,0 +1,10 @@
+merge_method: task_arithmetic
+base_model: /root/autodl-tmp/llama3
+models:
+  - model: /root/autodl-tmp/llama3
+    parameters:
+      weight: 0.3
+  - model: /root/autodl-tmp/out/checkpoint-385
+    parameters:
+      weight: 0.7
+dtype: float16
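For context, `task_arithmetic` adds weighted task vectors (fine-tuned weights minus base weights) back onto the base model. Because the base model itself is listed with weight 0.3, its task vector is zero, so this config effectively applies 0.7 of the checkpoint-385 delta to the base. A rough per-tensor illustration of the idea, not mergekit's actual implementation:

```python
# Toy illustration of task arithmetic on a single tensor; mergekit handles
# sharded checkpoints, dtypes, and tokenizers, all of which are omitted here.
import torch

def task_arithmetic(base: torch.Tensor, tuned: torch.Tensor, weight: float) -> torch.Tensor:
    # merged = base + weight * (tuned - base)
    return base + weight * (tuned - base)

base_w = torch.randn(4, 4)
ckpt_w = base_w + 0.1 * torch.randn(4, 4)  # stand-in for the checkpoint-385 weights
merged = task_arithmetic(base_w, ckpt_w, 0.7)
print(merged.shape)
```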
.ipynb_checkpoints/test-checkpoint.php ADDED
@@ -0,0 +1,176 @@
+<?php
+ini_set('memory_limit', '-1');
+function readJsonFile($filename) {
+    $jsonString = file_get_contents($filename);
+    $data = json_decode($jsonString, true);
+    return $data;
+}
+
+function concatenateConversations($conversations) {
+    $concatenated = array_reduce($conversations, function($carry, $item) {
+        return $carry . ' ' . $item['content'];
+    }, '');
+    return trim($concatenated);
+}
+function cosineSimilarity($tokensA, $tokensB) {
+    $a = $b = $c = 0;
+    $uniqueTokens = array_unique(array_merge(array_keys($tokensA), array_keys($tokensB)));
+    foreach ($uniqueTokens as $token) {
+        $x = isset($tokensA[$token]) ? $tokensA[$token] : 0;
+        $y = isset($tokensB[$token]) ? $tokensB[$token] : 0;
+        $a += $x * $y;
+        $b += $x * $x;
+        $c += $y * $y;
+    }
+    return $b * $c > 0 ? $a / sqrt($b * $c) : 0;
+}
+function filterConversations($data) {
+    $conversationsStr = array_map(function($item) {
+        return ['content' => concatenateConversations($item['conversations']), 'original' => $item];
+    }, $data);
+
+    usort($conversationsStr, function($a, $b) {
+        return strlen($b['content']) <=> strlen($a['content']);
+    });
+
+    $filteredData = [];
+    while (!empty($conversationsStr)) {
+        $last_len = 0;
+        // true while no near-duplicate of $longest has been found
+        $is_martch = true;
+        $last_index = 0;
+        $longest = array_shift($conversationsStr);
+        $newConversationsStr = [];
+        $tokensB = array_count_values(str_word_count($longest['content'], 1));
+        foreach ($conversationsStr as $index => $item) {
+            $tokensA = array_count_values(str_word_count($item['content'], 1));
+            $similarity = cosineSimilarity($tokensA, $tokensB);
+            if ($similarity < 0.95) {
+                $newConversationsStr[] = $item;
+            } else {
+                $is_martch = false;
+                $itemCount = count($item['original']['conversations']);
+                $longestCount = count($longest['original']['conversations']);
+                if ($itemCount > $longestCount) {
+                    if ($itemCount > $last_len) {
+                        $last_len = $itemCount;
+                        $last_index = $index;
+                    }
+                } else {
+                    if ($longestCount > $last_len) {
+                        $last_len = $longestCount;
+                        $last_index = $index;
+                    }
+                }
+            }
+        }
+        if ($is_martch) {
+            $filteredData[] = $longest['original'];
+        } else if ($last_index > 0) {
+            $filteredData[] = $conversationsStr[$last_index]['original'];
+        }
+        $conversationsStr = $newConversationsStr;
+        print_r("\r".count($conversationsStr));
+    }
+
+    return $filteredData;
+}
+
+function writeJsonFile($data, $filename) {
+    $jsonData = json_encode($data, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);
+    file_put_contents($filename, $jsonData);
+}
+
+function main() {
+    $inputFilename = 'merged_data1716636036.json';
+    $outputFilename = 'filtered_data.json';
+
+    $data = readJsonFile($inputFilename);
+    echo "Reading data completed.\n".count($data)." records";
+    // Split the data into two halves, one worker process per chunk
+    $parts = array_chunk($data, intval(count($data) / 2));
+    $processes = [];
+
+    foreach ($parts as $index => $part) {
+        $pid = pcntl_fork();
+        if ($pid == -1) {
+            die('Could not fork');
+        } else if ($pid) {
+            // Parent process
+            $processes[] = $pid;
+        } else {
+            // Child process
+            processPart($part, $index);
+            exit();
+        }
+    }
+    $status = null;
+    foreach ($processes as $process) {
+        pcntl_waitpid($process, $status);
+    }
+
+    echo "All processes completed.\n";
+    // All part files are written to the same directory, e.g. "datasets/"
+    $directory = "datasets/";
+    $allData = [];
+
+    // Open the directory and read its contents
+    if ($handle = opendir($directory)) {
+        while (false !== ($entry = readdir($handle))) {
+            if ($entry != "." && $entry != ".." && pathinfo($entry, PATHINFO_EXTENSION) == 'json') {
+                // Read the JSON file
+                $jsonContent = file_get_contents($directory . $entry);
+                echo $directory . $entry;
+                // Decode the JSON content into an array
+                $data = json_decode($jsonContent, true);
+                // Merge the decoded array into the combined array
+                $allData = array_merge($allData, $data);
+            }
+        }
+        closedir($handle);
+    }
+    shuffle($allData);
+    // Encode the combined array as JSON
+    $finalJson = json_encode($allData, JSON_PRETTY_PRINT);
+
+    // Write the final JSON to a new file
+    file_put_contents("datasets/merged_data.json", $finalJson);
+
+    echo "All JSON files have been merged into merged_data.json\n";
+    $pattern = $directory . '/filtered_data_part_[0-9]*.json';
+    $files = glob($pattern);
+
+    // Delete the intermediate part files one by one
+    foreach ($files as $file) {
+        if (is_file($file)) {
+            if (unlink($file)) {
+                echo "Deleted: $file\n";
+            } else {
+                echo "Error deleting: $file\n";
+            }
+        }
+    }
+    // Source file path
+    $sourceFile = $directory.'merged_data.json';
+
+    // Destination file path
+    $destinationFile = 'merged_data'.time().'.json';
+
+    // Move the file
+    if (rename($sourceFile, $destinationFile)) {
+        echo "File moved to: $destinationFile";
+    } else {
+        echo "Failed to move the file!";
+    }
+}
+
+function processPart($dataPart, $index) {
+    $filteredData = filterConversations($dataPart);
+    $outputFilename = "datasets/filtered_data_part_$index.json";
+    writeJsonFile($filteredData, $outputFilename);
+    echo "Process $index: Writing data completed.\n";
+}
+
+
+main();
+?>
.ipynb_checkpoints/test-checkpoint.py ADDED
@@ -0,0 +1,65 @@
+import json
+import math
+from collections import Counter
+
+def read_json_file(filename):
+    with open(filename, 'r', encoding='utf-8') as file:
+        data = json.load(file)
+    return data
+
+def concatenate_conversations(conversations):
+    concatenated = ' '.join(conv['content'] for conv in conversations)
+    return concatenated.strip()
+
+def cosine_similarity(tokensA, tokensB):
+    a = b = c = 0.0
+    unique_tokens = set(tokensA.keys()).union(set(tokensB.keys()))
+    for token in unique_tokens:
+        x = tokensA.get(token, 0)
+        y = tokensB.get(token, 0)
+        a += x * y
+        b += x ** 2
+        c += y ** 2
+    return a / (math.sqrt(b) * math.sqrt(c)) if b > 0 and c > 0 else 0
+
+def filter_conversations(data):
+    conversations_str = [{'content': concatenate_conversations(item['conversations']), 'original': item} for item in data]
+    conversations_str.sort(key=lambda x: len(x['content']), reverse=True)
+
+    filtered_data = []
+    while conversations_str:
+        longest = conversations_str.pop(0)
+        tokensB = Counter(longest['content'].split())
+        new_conversations_str = []
+        for item in conversations_str:
+            tokensA = Counter(item['content'].split())
+            similarity = cosine_similarity(tokensA, tokensB)
+            if similarity < 0.95:
+                new_conversations_str.append(item)
+            else:
+                longest_count = len(longest['original']['conversations'])
+                item_count = len(item['original']['conversations'])
+                if item_count > longest_count:
+                    longest = item
+        filtered_data.append(longest['original'])
+        conversations_str = new_conversations_str  # keep only the items below the similarity threshold
+        print("\rRemaining items: {}".format(len(conversations_str)), end='')
+
+    return filtered_data
+
+def write_json_file(data, filename):
+    with open(filename, 'w', encoding='utf-8') as file:
+        json.dump(data, file, ensure_ascii=False, indent=4)
+
+def main():
+    input_filename = 'unique_data1.json'
+    output_filename = 'filtered_data.json'
+
+    data = read_json_file(input_filename)
+    print(f"Reading data completed. {len(data)} entries loaded.")
+    filtered_data = filter_conversations(data)
+    print("Filtering completed.")
+    write_json_file(filtered_data, output_filename)
+    print("Writing data completed.")
+
+main()
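Both the PHP and Python versions of this script dedupe records by bag-of-words cosine similarity with a 0.95 cutoff, keeping the transcript with more conversation turns when two records are near-duplicates. A small standalone check of that similarity measure (mathematically equivalent to `cosine_similarity` above, shown here with toy sentences):

```python
# Standalone illustration of the near-duplicate test used in filter_conversations.
import math
from collections import Counter

def cosine_similarity(tokensA, tokensB):
    # Same formula as test.py, written over the key intersection and values directly.
    dot = sum(tokensA[t] * tokensB[t] for t in tokensA.keys() & tokensB.keys())
    norm_a = math.sqrt(sum(x * x for x in tokensA.values()))
    norm_b = math.sqrt(sum(y * y for y in tokensB.values()))
    return dot / (norm_a * norm_b) if norm_a > 0 and norm_b > 0 else 0

doc_a = Counter("the quick brown fox jumps over the lazy dog".split())
doc_b = Counter("the quick brown fox jumped over the lazy dog".split())

sim = cosine_similarity(doc_a, doc_b)
print(f"similarity = {sim:.3f}, near-duplicate: {sim >= 0.95}")
```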
11.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5cb89c31fc384d1e7de36753f3309c35747d912288fbfaaf4c3cc9492e30064c
+size 48701344
Claude-3-Opus-Instruct-15K/.ipynb_checkpoints/Claude3-Opus-Multi-Instruct-5K-v1-checkpoint.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3df81a1d629c2b7601f4fd0c44901f28d5b5892eb2fb8b897d7d05a80ac274d0
+size 14998066
Claude-3-Opus-Instruct-15K/Claude3-Opus-Multi-Instruct-5K-v1.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3df81a1d629c2b7601f4fd0c44901f28d5b5892eb2fb8b897d7d05a80ac274d0
+size 14998066
Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-3.5K-Filtered-v2.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6bd8f6b0c314377f8ac9aab6ab8b93bb9f1d939df129e51103e7d8ac717b2b5
+size 14586329
Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-6.5K-Filtered-v2.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1312a6ba6d347a33609c03a466c02eff8838ed7c5680ba1dacb57218028cbe02
+size 23703566
Claude-3-Opus-Instruct-15K/zero3_bf16.json ADDED
@@ -0,0 +1,31 @@
+{
+    "zero_optimization": {
+        "stage": 3,
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 0,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 0,
+        "stage3_max_reuse_distance": 0,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "bf16": {
+        "enabled": true
+    },
+    "fp16": {
+        "enabled": "auto",
+        "auto_cast": false,
+        "loss_scale": 0,
+        "initial_scale_power": 32,
+        "loss_scale_window": 1000,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
LimaRP-augmented-8k-context.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb7ac99415c0870fd9cfc48d0bce2aa980fb975eaf71e50dde47c39fc1c66648
+size 20503146
clean-gpt2.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cdfbff2b639bc8a9ec55e3093803b87d1b7dbbeb76db2e21092a553d62c8635e
+size 686939966
config.yaml ADDED
@@ -0,0 +1,119 @@
+base_model: /root/autodl-tmp/c4/Meta-Llama-3-8B-Instruct-abliterated-v3
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+tokenizer_use_fast: false
+
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+model_config:
+
+datasets:
+  - path: /root/autodl-tmp/c4/11.json
+    type: sharegpt
+    conversation: llama3
+    roles:
+      input: user
+      output: assistant
+  - path: /root/autodl-tmp/c4/LimaRP-augmented-8k-context.json
+    type: sharegpt
+    conversation: llama3
+  - path: Sao10K/Short-Storygen-v2
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      field_system: system
+      field_output: response
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/Claude-3-Opus-Instruct-15K/Claude3-Opus-Multi-Instruct-5K-v1.json
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      #field_input: prompt
+      field_output: response
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-3.5K-Filtered-v2.json
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      #field_input: prompt
+      field_output: response
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-6.5K-Filtered-v2.json
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      #field_input: prompt
+      field_output: response
+    conversation: llama3
+  - path: SicariusSicariiStuff/Bluemoon_Top50MB_Sorted_Fixed
+    type: sharegpt
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/clean-gpt2.json
+    type: sharegpt
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/LimaRP-augmented-8k-context.json
+    type: sharegpt
+    conversation: llama3
+
+chat_template: llama3
+
+
+dataset_prepared_path: /root/autodl-tmp/thingy
+val_set_size: 0.005
+output_dir: ./out
+
+sequence_len: 4096
+sample_packing: true
+pad_to_sequence_len: true
+
+gradient_accumulation_steps: 4
+micro_batch_size: 3
+num_epochs: 5
+logging_steps: 1
+optimizer: adamw_8bit
+lr_scheduler: cosine
+learning_rate: 2e-5
+
+wandb_project: llama3-8b-hiwaifu
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+saves_per_epoch: 1
+save_total_limit: 2
+save_steps:
+evals_per_epoch: 4
+eval_sample_packing: false
+debug:
+deepspeed: /root/autodl-tmp/c4/axolotl/deepspeed_configs/zero3_bf16.json
+weight_decay: 0.05
+fsdp:
+fsdp_config:
+special_tokens:
+  eos_token: "<|eot_id|>"
+  pad_token: "<|end_of_text|>"
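This is the same training config as the `.ipynb_checkpoints` copy above. One derived number worth keeping in mind: with `micro_batch_size: 3` and `gradient_accumulation_steps: 4`, the effective global batch size is 3 × 4 × world size. The GPU count is not recorded in this commit, so it is a placeholder in the sketch below:

```python
# Back-of-the-envelope effective batch size for the values in config.yaml.
micro_batch_size = 3              # from config.yaml
gradient_accumulation_steps = 4   # from config.yaml
num_gpus = 1                      # assumption: the world size is not stored in this repo

effective_batch_size = micro_batch_size * gradient_accumulation_steps * num_gpus
print(f"effective batch size: {effective_batch_size}")
```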
mergekit.yaml ADDED
@@ -0,0 +1,10 @@
+merge_method: task_arithmetic
+base_model: /root/autodl-tmp/llama3
+models:
+  - model: /root/autodl-tmp/llama3
+    parameters:
+      weight: 0.3
+  - model: /root/autodl-tmp/out/checkpoint-385
+    parameters:
+      weight: 0.7
+dtype: float16
pippa_raw_fix.parquet ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3792ffd85ebb51b05f7636e54f67cb64239d980c6fb29e888be744e286ff997
+size 102788461
test.php ADDED
@@ -0,0 +1,176 @@
+<?php
+ini_set('memory_limit', '-1');
+function readJsonFile($filename) {
+    $jsonString = file_get_contents($filename);
+    $data = json_decode($jsonString, true);
+    return $data;
+}
+
+function concatenateConversations($conversations) {
+    $concatenated = array_reduce($conversations, function($carry, $item) {
+        return $carry . ' ' . $item['content'];
+    }, '');
+    return trim($concatenated);
+}
+function cosineSimilarity($tokensA, $tokensB) {
+    $a = $b = $c = 0;
+    $uniqueTokens = array_unique(array_merge(array_keys($tokensA), array_keys($tokensB)));
+    foreach ($uniqueTokens as $token) {
+        $x = isset($tokensA[$token]) ? $tokensA[$token] : 0;
+        $y = isset($tokensB[$token]) ? $tokensB[$token] : 0;
+        $a += $x * $y;
+        $b += $x * $x;
+        $c += $y * $y;
+    }
+    return $b * $c > 0 ? $a / sqrt($b * $c) : 0;
+}
+function filterConversations($data) {
+    $conversationsStr = array_map(function($item) {
+        return ['content' => concatenateConversations($item['conversations']), 'original' => $item];
+    }, $data);
+
+    usort($conversationsStr, function($a, $b) {
+        return strlen($b['content']) <=> strlen($a['content']);
+    });
+
+    $filteredData = [];
+    while (!empty($conversationsStr)) {
+        $last_len = 0;
+        // true while no near-duplicate of $longest has been found
+        $is_martch = true;
+        $last_index = 0;
+        $longest = array_shift($conversationsStr);
+        $newConversationsStr = [];
+        $tokensB = array_count_values(str_word_count($longest['content'], 1));
+        foreach ($conversationsStr as $index => $item) {
+            $tokensA = array_count_values(str_word_count($item['content'], 1));
+            $similarity = cosineSimilarity($tokensA, $tokensB);
+            if ($similarity < 0.95) {
+                $newConversationsStr[] = $item;
+            } else {
+                $is_martch = false;
+                $itemCount = count($item['original']['conversations']);
+                $longestCount = count($longest['original']['conversations']);
+                if ($itemCount > $longestCount) {
+                    if ($itemCount > $last_len) {
+                        $last_len = $itemCount;
+                        $last_index = $index;
+                    }
+                } else {
+                    if ($longestCount > $last_len) {
+                        $last_len = $longestCount;
+                        $last_index = $index;
+                    }
+                }
+            }
+        }
+        if ($is_martch) {
+            $filteredData[] = $longest['original'];
+        } else if ($last_index > 0) {
+            $filteredData[] = $conversationsStr[$last_index]['original'];
+        }
+        $conversationsStr = $newConversationsStr;
+        print_r("\r".count($conversationsStr));
+    }
+
+    return $filteredData;
+}
+
+function writeJsonFile($data, $filename) {
+    $jsonData = json_encode($data, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);
+    file_put_contents($filename, $jsonData);
+}
+
+function main() {
+    $inputFilename = 'merged_data1716636036.json';
+    $outputFilename = 'filtered_data.json';
+
+    $data = readJsonFile($inputFilename);
+    echo "Reading data completed.\n".count($data)." records";
+    // Split the data into two halves, one worker process per chunk
+    $parts = array_chunk($data, intval(count($data) / 2));
+    $processes = [];
+
+    foreach ($parts as $index => $part) {
+        $pid = pcntl_fork();
+        if ($pid == -1) {
+            die('Could not fork');
+        } else if ($pid) {
+            // Parent process
+            $processes[] = $pid;
+        } else {
+            // Child process
+            processPart($part, $index);
+            exit();
+        }
+    }
+    $status = null;
+    foreach ($processes as $process) {
+        pcntl_waitpid($process, $status);
+    }
+
+    echo "All processes completed.\n";
+    // All part files are written to the same directory, e.g. "datasets/"
+    $directory = "datasets/";
+    $allData = [];
+
+    // Open the directory and read its contents
+    if ($handle = opendir($directory)) {
+        while (false !== ($entry = readdir($handle))) {
+            if ($entry != "." && $entry != ".." && pathinfo($entry, PATHINFO_EXTENSION) == 'json') {
+                // Read the JSON file
+                $jsonContent = file_get_contents($directory . $entry);
+                echo $directory . $entry;
+                // Decode the JSON content into an array
+                $data = json_decode($jsonContent, true);
+                // Merge the decoded array into the combined array
+                $allData = array_merge($allData, $data);
+            }
+        }
+        closedir($handle);
+    }
+    shuffle($allData);
+    // Encode the combined array as JSON
+    $finalJson = json_encode($allData, JSON_PRETTY_PRINT);
+
+    // Write the final JSON to a new file
+    file_put_contents("datasets/merged_data.json", $finalJson);
+
+    echo "All JSON files have been merged into merged_data.json\n";
+    $pattern = $directory . '/filtered_data_part_[0-9]*.json';
+    $files = glob($pattern);
+
+    // Delete the intermediate part files one by one
+    foreach ($files as $file) {
+        if (is_file($file)) {
+            if (unlink($file)) {
+                echo "Deleted: $file\n";
+            } else {
+                echo "Error deleting: $file\n";
+            }
+        }
+    }
+    // Source file path
+    $sourceFile = $directory.'merged_data.json';
+
+    // Destination file path
+    $destinationFile = 'merged_data'.time().'.json';
+
+    // Move the file
+    if (rename($sourceFile, $destinationFile)) {
+        echo "File moved to: $destinationFile";
+    } else {
+        echo "Failed to move the file!";
+    }
+}
+
+function processPart($dataPart, $index) {
+    $filteredData = filterConversations($dataPart);
+    $outputFilename = "datasets/filtered_data_part_$index.json";
+    writeJsonFile($filteredData, $outputFilename);
+    echo "Process $index: Writing data completed.\n";
+}
+
+
+main();
+?>
test.py ADDED
@@ -0,0 +1,65 @@
+import json
+import math
+from collections import Counter
+
+def read_json_file(filename):
+    with open(filename, 'r', encoding='utf-8') as file:
+        data = json.load(file)
+    return data
+
+def concatenate_conversations(conversations):
+    concatenated = ' '.join(conv['content'] for conv in conversations)
+    return concatenated.strip()
+
+def cosine_similarity(tokensA, tokensB):
+    a = b = c = 0.0
+    unique_tokens = set(tokensA.keys()).union(set(tokensB.keys()))
+    for token in unique_tokens:
+        x = tokensA.get(token, 0)
+        y = tokensB.get(token, 0)
+        a += x * y
+        b += x ** 2
+        c += y ** 2
+    return a / (math.sqrt(b) * math.sqrt(c)) if b > 0 and c > 0 else 0
+
+def filter_conversations(data):
+    conversations_str = [{'content': concatenate_conversations(item['conversations']), 'original': item} for item in data]
+    conversations_str.sort(key=lambda x: len(x['content']), reverse=True)
+
+    filtered_data = []
+    while conversations_str:
+        longest = conversations_str.pop(0)
+        tokensB = Counter(longest['content'].split())
+        new_conversations_str = []
+        for item in conversations_str:
+            tokensA = Counter(item['content'].split())
+            similarity = cosine_similarity(tokensA, tokensB)
+            if similarity < 0.95:
+                new_conversations_str.append(item)
+            else:
+                longest_count = len(longest['original']['conversations'])
+                item_count = len(item['original']['conversations'])
+                if item_count > longest_count:
+                    longest = item
+        filtered_data.append(longest['original'])
+        conversations_str = new_conversations_str  # keep only the items below the similarity threshold
+        print("\rRemaining items: {}".format(len(conversations_str)), end='')
+
+    return filtered_data
+
+def write_json_file(data, filename):
+    with open(filename, 'w', encoding='utf-8') as file:
+        json.dump(data, file, ensure_ascii=False, indent=4)
+
+def main():
+    input_filename = 'unique_data1.json'
+    output_filename = 'filtered_data.json'
+
+    data = read_json_file(input_filename)
+    print(f"Reading data completed. {len(data)} entries loaded.")
+    filtered_data = filter_conversations(data)
+    print("Filtering completed.")
+    write_json_file(filtered_data, output_filename)
+    print("Writing data completed.")
+
+main()