Upload folder using huggingface_hub
- .gitattributes +8 -0
- .ipynb_checkpoints/LimaRP-augmented-8k-context-checkpoint.json +3 -0
- .ipynb_checkpoints/config-checkpoint.yaml +119 -0
- .ipynb_checkpoints/mergekit-checkpoint.yaml +10 -0
- .ipynb_checkpoints/test-checkpoint.php +177 -0
- .ipynb_checkpoints/test-checkpoint.py +65 -0
- 11.json +3 -0
- Claude-3-Opus-Instruct-15K/.ipynb_checkpoints/Claude3-Opus-Multi-Instruct-5K-v1-checkpoint.json +3 -0
- Claude-3-Opus-Instruct-15K/Claude3-Opus-Multi-Instruct-5K-v1.json +3 -0
- Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-3.5K-Filtered-v2.json +3 -0
- Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-6.5K-Filtered-v2.json +3 -0
- Claude-3-Opus-Instruct-15K/zero3_bf16.json +31 -0
- LimaRP-augmented-8k-context.json +3 -0
- clean-gpt2.json +3 -0
- config.yaml +119 -0
- mergekit.yaml +10 -0
- pippa_raw_fix.parquet +3 -0
- test.php +177 -0
- test.py +65 -0
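The commit title indicates the folder was pushed with the huggingface_hub client. A minimal sketch of that workflow is below; the repo id, local folder, and repo_type are placeholders for illustration, not values recorded in this commit.

# Minimal sketch of pushing a local folder with huggingface_hub.
# "your-username/your-repo" and the folder path are placeholders, not values from this commit.
from huggingface_hub import HfApi

api = HfApi()  # assumes an auth token is already configured (huggingface-cli login or HF_TOKEN)
api.upload_folder(
    folder_path="/root/autodl-tmp/c4",        # local folder to upload (placeholder)
    repo_id="your-username/your-repo",        # target repo (placeholder)
    repo_type="model",                        # could equally be "dataset" for a data repo
    commit_message="Upload folder using huggingface_hub",
)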
.gitattributes
CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+.ipynb_checkpoints/LimaRP-augmented-8k-context-checkpoint.json filter=lfs diff=lfs merge=lfs -text
+11.json filter=lfs diff=lfs merge=lfs -text
+Claude-3-Opus-Instruct-15K/.ipynb_checkpoints/Claude3-Opus-Multi-Instruct-5K-v1-checkpoint.json filter=lfs diff=lfs merge=lfs -text
+Claude-3-Opus-Instruct-15K/Claude3-Opus-Multi-Instruct-5K-v1.json filter=lfs diff=lfs merge=lfs -text
+Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-3.5K-Filtered-v2.json filter=lfs diff=lfs merge=lfs -text
+Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-6.5K-Filtered-v2.json filter=lfs diff=lfs merge=lfs -text
+LimaRP-augmented-8k-context.json filter=lfs diff=lfs merge=lfs -text
+clean-gpt2.json filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/LimaRP-augmented-8k-context-checkpoint.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb7ac99415c0870fd9cfc48d0bce2aa980fb975eaf71e50dde47c39fc1c66648
+size 20503146
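The three lines above are a Git LFS pointer, not the dataset itself: a spec version, the SHA-256 oid of the real file, and its size in bytes. A small sketch for reading those fields back out of a pointer file (the filename is just the one from this entry):

# Parse a Git LFS pointer file into its version / oid / size fields.
def read_lfs_pointer(path):
    fields = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

pointer = read_lfs_pointer(".ipynb_checkpoints/LimaRP-augmented-8k-context-checkpoint.json")
print(pointer["oid"], int(pointer["size"]))  # sha256:<hash> 20503146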
.ipynb_checkpoints/config-checkpoint.yaml
ADDED
@@ -0,0 +1,119 @@
+base_model: /root/autodl-tmp/c4/Meta-Llama-3-8B-Instruct-abliterated-v3
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+tokenizer_use_fast: false
+
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+model_config:
+
+datasets:
+  - path: /root/autodl-tmp/c4/11.json
+    type: sharegpt
+    conversation: llama3
+    roles:
+      input: user
+      output: assistant
+  - path: /root/autodl-tmp/c4/LimaRP-augmented-8k-context.json
+    type: sharegpt
+    conversation: llama3
+  - path: Sao10K/Short-Storygen-v2
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      field_system: system
+      field_output: response
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/Claude-3-Opus-Instruct-15K/Claude3-Opus-Multi-Instruct-5K-v1.json
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      #field_input: prompt
+      field_output: response
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-3.5K-Filtered-v2.json
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      #field_input: prompt
+      field_output: response
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-6.5K-Filtered-v2.json
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      #field_input: prompt
+      field_output: response
+    conversation: llama3
+  - path: SicariusSicariiStuff/Bluemoon_Top50MB_Sorted_Fixed
+    type: sharegpt
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/clean-gpt2.json
+    type: sharegpt
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/LimaRP-augmented-8k-context.json
+    type: sharegpt
+    conversation: llama3
+
+chat_template: llama3
+
+
+dataset_prepared_path: /root/autodl-tmp/thingy
+val_set_size: 0.005
+output_dir: ./out
+
+sequence_len: 4096
+sample_packing: true
+pad_to_sequence_len: true
+
+gradient_accumulation_steps: 4
+micro_batch_size: 3
+num_epochs: 5
+logging_steps: 1
+optimizer: adamw_8bit
+lr_scheduler: cosine
+learning_rate: 2e-5
+
+wandb_project: llama3-8b-hiwaifu
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+saves_per_epoch: 1
+save_total_limit: 2
+save_steps:
+evals_per_epoch: 4
+eval_sample_packing: false
+debug:
+deepspeed: /root/autodl-tmp/c4/axolotl/deepspeed_configs/zero3_bf16.json
+weight_decay: 0.05
+fsdp:
+fsdp_config:
+special_tokens:
+  eos_token: "<|eot_id|>"
+  pad_token: "<|end_of_text|>"
.ipynb_checkpoints/mergekit-checkpoint.yaml
ADDED
@@ -0,0 +1,10 @@
+merge_method: task_arithmetic
+base_model: /root/autodl-tmp/llama3
+models:
+  - model: /root/autodl-tmp/llama3
+    parameters:
+      weight: 0.3
+  - model: /root/autodl-tmp/out/checkpoint-385
+    parameters:
+      weight: 0.7
+dtype: float16
.ipynb_checkpoints/test-checkpoint.php
ADDED
@@ -0,0 +1,177 @@
+<?php
+ini_set('memory_limit', '-1');
+function readJsonFile($filename) {
+    $jsonString = file_get_contents($filename);
+    $data = json_decode($jsonString, true);
+    return $data;
+}
+
+function concatenateConversations($conversations) {
+    $concatenated = array_reduce($conversations, function($carry, $item) {
+        return $carry . ' ' . $item['content'];
+    }, '');
+    return trim($concatenated);
+}
+function cosineSimilarity($tokensA, $tokensB) {
+    $a = $b = $c = 0;
+    $uniqueTokens = array_unique(array_merge(array_keys($tokensA), array_keys($tokensB)));
+    foreach ($uniqueTokens as $token) {
+        $x = isset($tokensA[$token]) ? $tokensA[$token] : 0;
+        $y = isset($tokensB[$token]) ? $tokensB[$token] : 0;
+        $a += $x * $y;
+        $b += $x * $x;
+        $c += $y * $y;
+    }
+    return $b * $c > 0 ? $a / sqrt($b * $c) : 0;
+}
+function filterConversations($data) {
+    $conversationsStr = array_map(function($item) {
+        return ['content' => concatenateConversations($item['conversations']), 'original' => $item];
+    }, $data);
+
+    usort($conversationsStr, function($a, $b) {
+        return strlen($b['content']) <=> strlen($a['content']);
+    });
+
+    $filteredData = [];
+    while (!empty($conversationsStr)) {
+        $last_len = 0;
+        // true while no near-duplicate of the current longest conversation has been seen
+        $is_match = true;
+        $last_index = 0;
+        $longest = array_shift($conversationsStr);
+        $newConversationsStr = [];
+        $tokensB = array_count_values(str_word_count($longest['content'], 1));
+        foreach ($conversationsStr as $index => $item) {
+            $tokensA = array_count_values(str_word_count($item['content'], 1));
+            $similarity = cosineSimilarity($tokensA, $tokensB);
+            if ($similarity < 0.95) {
+                $newConversationsStr[] = $item;
+            } else {
+                $is_match = false;
+                $itemCount = count($item['original']['conversations']);
+                $longestCount = count($longest['original']['conversations']);
+                if ($itemCount > $longestCount) {
+                    if ($itemCount > $last_len) {
+                        $last_len = $itemCount;
+                        $last_index = $index;
+                    }
+                } else {
+                    if ($longestCount > $last_len) {
+                        $last_len = $longestCount;
+                        $last_index = $index;
+                    }
+                }
+            }
+        }
+        if ($is_match) {
+            $filteredData[] = $longest['original'];
+        } else if ($last_index > 0) {
+            $filteredData[] = $conversationsStr[$last_index]['original'];
+        }
+        $conversationsStr = $newConversationsStr;
+        print_r("\r" . count($conversationsStr));
+    }
+
+    return $filteredData;
+}
+
+function writeJsonFile($data, $filename) {
+    $jsonData = json_encode($data, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);
+    file_put_contents($filename, $jsonData);
+}
+
+function main() {
+    $inputFilename = 'merged_data1716636036.json';
+    $outputFilename = 'filtered_data.json';
+
+    $data = readJsonFile($inputFilename);
+    echo "Reading data completed.\n" . count($data) . " entries";
+    // Split the data into two halves; one child process is forked per part
+    $parts = array_chunk($data, intval(count($data) / 2));
+    $processes = [];
+
+    foreach ($parts as $index => $part) {
+        $pid = pcntl_fork();
+        if ($pid == -1) {
+            die('Could not fork');
+        } else if ($pid) {
+            // Parent process
+            $processes[] = $pid;
+        } else {
+            // Child process
+            processPart($part, $index);
+            exit();
+        }
+    }
+    $status = null;
+    foreach ($processes as $process) {
+        pcntl_waitpid($process, $status);
+    }
+
+    echo "All processes completed.\n";
+    // Assume all JSON files are in the same directory, e.g. "datasets/"
+    $directory = "datasets/";
+    $allData = [];
+
+    // Open the directory and read its contents
+    if ($handle = opendir($directory)) {
+        while (false !== ($entry = readdir($handle))) {
+            if ($entry != "." && $entry != ".." && pathinfo($entry, PATHINFO_EXTENSION) == 'json') {
+                // Read the JSON file
+                $jsonContent = file_get_contents($directory . $entry);
+                echo $directory . $entry;
+                // Decode the JSON content into an array
+                $data = json_decode($jsonContent, true);
+                // Merge the decoded array into the combined array
+                $allData = array_merge($allData, $data);
+            }
+        }
+        closedir($handle);
+    }
+    shuffle($allData);
+    // Encode the combined array as JSON
+    $finalJson = json_encode($allData, JSON_PRETTY_PRINT);
+
+    // Write the final JSON to a new file
+    file_put_contents("datasets/merged_data.json", $finalJson);
+
+    echo "All JSON files have been merged into merged_data.json\n";
+    $pattern = $directory . '/filtered_data_part_[0-9]*.json';
+    $files = glob($pattern);
+
+    // Iterate over the matched part files and delete them one by one
+    foreach ($files as $file) {
+        if (is_file($file)) {
+            if (unlink($file)) {
+                echo "Deleted: $file\n";
+            } else {
+                echo "Error deleting: $file\n";
+            }
+        }
+    }
+    // Source file path
+    $sourceFile = $directory . 'merged_data.json';
+
+    // Destination file path
+    $destinationFile = 'merged_data' . time() . '.json';
+
+    // Move the file
+    if (rename($sourceFile, $destinationFile)) {
+        echo "File successfully moved to: $destinationFile";
+    } else {
+        echo "Failed to move the file!";
+    }
+}
+
+function processPart($dataPart, $index) {
+    $filteredData = filterConversations($dataPart);
+    $outputFilename = "datasets/filtered_data_part_$index.json";
+    writeJsonFile($filteredData, $outputFilename);
+    echo "Process $index: Writing data completed.\n";
+}
+
+
+main();
+?>
.ipynb_checkpoints/test-checkpoint.py
ADDED
@@ -0,0 +1,65 @@
+import json
+import math
+from collections import Counter
+
+def read_json_file(filename):
+    with open(filename, 'r', encoding='utf-8') as file:
+        data = json.load(file)
+    return data
+
+def concatenate_conversations(conversations):
+    concatenated = ' '.join(conv['content'] for conv in conversations)
+    return concatenated.strip()
+
+def cosine_similarity(tokensA, tokensB):
+    a = b = c = 0.0
+    unique_tokens = set(tokensA.keys()).union(set(tokensB.keys()))
+    for token in unique_tokens:
+        x = tokensA.get(token, 0)
+        y = tokensB.get(token, 0)
+        a += x * y
+        b += x ** 2
+        c += y ** 2
+    return a / (math.sqrt(b) * math.sqrt(c)) if b > 0 and c > 0 else 0
+
+def filter_conversations(data):
+    conversations_str = [{'content': concatenate_conversations(item['conversations']), 'original': item} for item in data]
+    conversations_str.sort(key=lambda x: len(x['content']), reverse=True)
+
+    filtered_data = []
+    while conversations_str:
+        longest = conversations_str.pop(0)
+        tokensB = Counter(longest['content'].split())
+        new_conversations_str = []
+        for item in conversations_str:
+            tokensA = Counter(item['content'].split())
+            similarity = cosine_similarity(tokensA, tokensB)
+            if similarity < 0.95:
+                new_conversations_str.append(item)
+            else:
+                longest_count = len(longest['original']['conversations'])
+                item_count = len(item['original']['conversations'])
+                if item_count > longest_count:
+                    longest = item
+        filtered_data.append(longest['original'])
+        # Drop the near-duplicates of the current longest conversation before the next pass.
+        conversations_str = new_conversations_str
+        print("\rRemaining items: {}".format(len(conversations_str)), end='')
+
+    return filtered_data
+
+def write_json_file(data, filename):
+    with open(filename, 'w', encoding='utf-8') as file:
+        json.dump(data, file, ensure_ascii=False, indent=4)
+
+def main():
+    input_filename = 'unique_data1.json'
+    output_filename = 'filtered_data.json'
+
+    data = read_json_file(input_filename)
+    print(f"Reading data completed. {len(data)} entries loaded.")
+    filtered_data = filter_conversations(data)
+    print("Filtering completed.")
+    write_json_file(filtered_data, output_filename)
+    print("Writing data completed.")
+
+main()
11.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5cb89c31fc384d1e7de36753f3309c35747d912288fbfaaf4c3cc9492e30064c
+size 48701344
Claude-3-Opus-Instruct-15K/.ipynb_checkpoints/Claude3-Opus-Multi-Instruct-5K-v1-checkpoint.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3df81a1d629c2b7601f4fd0c44901f28d5b5892eb2fb8b897d7d05a80ac274d0
+size 14998066
Claude-3-Opus-Instruct-15K/Claude3-Opus-Multi-Instruct-5K-v1.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3df81a1d629c2b7601f4fd0c44901f28d5b5892eb2fb8b897d7d05a80ac274d0
+size 14998066
Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-3.5K-Filtered-v2.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6bd8f6b0c314377f8ac9aab6ab8b93bb9f1d939df129e51103e7d8ac717b2b5
+size 14586329
Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-6.5K-Filtered-v2.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1312a6ba6d347a33609c03a466c02eff8838ed7c5680ba1dacb57218028cbe02
+size 23703566
Claude-3-Opus-Instruct-15K/zero3_bf16.json
ADDED
@@ -0,0 +1,31 @@
+{
+    "zero_optimization": {
+        "stage": 3,
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 0,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 0,
+        "stage3_max_reuse_distance": 0,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "bf16": {
+        "enabled": true
+    },
+    "fp16": {
+        "enabled": "auto",
+        "auto_cast": false,
+        "loss_scale": 0,
+        "initial_scale_power": 32,
+        "loss_scale_window": 1000,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
LimaRP-augmented-8k-context.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb7ac99415c0870fd9cfc48d0bce2aa980fb975eaf71e50dde47c39fc1c66648
+size 20503146
clean-gpt2.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cdfbff2b639bc8a9ec55e3093803b87d1b7dbbeb76db2e21092a553d62c8635e
+size 686939966
config.yaml
ADDED
@@ -0,0 +1,119 @@
+base_model: /root/autodl-tmp/c4/Meta-Llama-3-8B-Instruct-abliterated-v3
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+tokenizer_use_fast: false
+
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+model_config:
+
+datasets:
+  - path: /root/autodl-tmp/c4/11.json
+    type: sharegpt
+    conversation: llama3
+    roles:
+      input: user
+      output: assistant
+  - path: /root/autodl-tmp/c4/LimaRP-augmented-8k-context.json
+    type: sharegpt
+    conversation: llama3
+  - path: Sao10K/Short-Storygen-v2
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      field_system: system
+      field_output: response
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/Claude-3-Opus-Instruct-15K/Claude3-Opus-Multi-Instruct-5K-v1.json
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      #field_input: prompt
+      field_output: response
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-3.5K-Filtered-v2.json
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      #field_input: prompt
+      field_output: response
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-6.5K-Filtered-v2.json
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      #field_input: prompt
+      field_output: response
+    conversation: llama3
+  - path: SicariusSicariiStuff/Bluemoon_Top50MB_Sorted_Fixed
+    type: sharegpt
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/clean-gpt2.json
+    type: sharegpt
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/LimaRP-augmented-8k-context.json
+    type: sharegpt
+    conversation: llama3
+
+chat_template: llama3
+
+
+dataset_prepared_path: /root/autodl-tmp/thingy
+val_set_size: 0.005
+output_dir: ./out
+
+sequence_len: 4096
+sample_packing: true
+pad_to_sequence_len: true
+
+gradient_accumulation_steps: 4
+micro_batch_size: 3
+num_epochs: 5
+logging_steps: 1
+optimizer: adamw_8bit
+lr_scheduler: cosine
+learning_rate: 2e-5
+
+wandb_project: llama3-8b-hiwaifu
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+saves_per_epoch: 1
+save_total_limit: 2
+save_steps:
+evals_per_epoch: 4
+eval_sample_packing: false
+debug:
+deepspeed: /root/autodl-tmp/c4/axolotl/deepspeed_configs/zero3_bf16.json
+weight_decay: 0.05
+fsdp:
+fsdp_config:
+special_tokens:
+  eos_token: "<|eot_id|>"
+  pad_token: "<|end_of_text|>"
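Most dataset entries in config.yaml point at local files under /root/autodl-tmp/c4, with two Hub datasets mixed in, and the deepspeed key references a local zero3_bf16.json. A small pre-flight check like the sketch below (PyYAML assumed available, run from the directory containing config.yaml) can confirm the local paths exist before launching the trainer, presumably axolotl given the deepspeed_configs path:

# Pre-flight check: verify every local dataset path referenced by config.yaml exists.
# Hub dataset ids (no leading "/") such as "Sao10K/Short-Storygen-v2" are skipped.
import os
import yaml  # assumes PyYAML is installed

with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

paths = [entry.get("path", "") for entry in config.get("datasets", [])]
paths.append(config.get("deepspeed", "") or "")
for path in paths:
    if path.startswith("/") and not os.path.exists(path):
        print(f"missing local file: {path}")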
mergekit.yaml
ADDED
@@ -0,0 +1,10 @@
+merge_method: task_arithmetic
+base_model: /root/autodl-tmp/llama3
+models:
+  - model: /root/autodl-tmp/llama3
+    parameters:
+      weight: 0.3
+  - model: /root/autodl-tmp/out/checkpoint-385
+    parameters:
+      weight: 0.7
+dtype: float16
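In essence, a task_arithmetic merge adds weighted task vectors (model minus base) back onto the base model. Because the first entry here is the base path itself, its task vector is zero, so effectively only the checkpoint-385 delta contributes, scaled by 0.7. A toy single-tensor illustration (NumPy, made-up values, not mergekit's actual implementation):

# Toy illustration of task arithmetic on one tensor:
#   merged = base + sum_i weight_i * (model_i - base)
import numpy as np

base = np.array([1.0, 2.0, 3.0])        # stands in for a base-model tensor
model_a = base.copy()                    # first entry: the base itself -> zero delta
model_b = np.array([1.5, 2.5, 2.0])      # stands in for the checkpoint-385 tensor

merged = base + 0.3 * (model_a - base) + 0.7 * (model_b - base)
print(merged.astype(np.float16))         # cast mirrors "dtype: float16" in the config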
pippa_raw_fix.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3792ffd85ebb51b05f7636e54f67cb64239d980c6fb29e888be744e286ff997
+size 102788461
test.php
ADDED
@@ -0,0 +1,177 @@
+<?php
+ini_set('memory_limit', '-1');
+function readJsonFile($filename) {
+    $jsonString = file_get_contents($filename);
+    $data = json_decode($jsonString, true);
+    return $data;
+}
+
+function concatenateConversations($conversations) {
+    $concatenated = array_reduce($conversations, function($carry, $item) {
+        return $carry . ' ' . $item['content'];
+    }, '');
+    return trim($concatenated);
+}
+function cosineSimilarity($tokensA, $tokensB) {
+    $a = $b = $c = 0;
+    $uniqueTokens = array_unique(array_merge(array_keys($tokensA), array_keys($tokensB)));
+    foreach ($uniqueTokens as $token) {
+        $x = isset($tokensA[$token]) ? $tokensA[$token] : 0;
+        $y = isset($tokensB[$token]) ? $tokensB[$token] : 0;
+        $a += $x * $y;
+        $b += $x * $x;
+        $c += $y * $y;
+    }
+    return $b * $c > 0 ? $a / sqrt($b * $c) : 0;
+}
+function filterConversations($data) {
+    $conversationsStr = array_map(function($item) {
+        return ['content' => concatenateConversations($item['conversations']), 'original' => $item];
+    }, $data);
+
+    usort($conversationsStr, function($a, $b) {
+        return strlen($b['content']) <=> strlen($a['content']);
+    });
+
+    $filteredData = [];
+    while (!empty($conversationsStr)) {
+        $last_len = 0;
+        // true while no near-duplicate of the current longest conversation has been seen
+        $is_match = true;
+        $last_index = 0;
+        $longest = array_shift($conversationsStr);
+        $newConversationsStr = [];
+        $tokensB = array_count_values(str_word_count($longest['content'], 1));
+        foreach ($conversationsStr as $index => $item) {
+            $tokensA = array_count_values(str_word_count($item['content'], 1));
+            $similarity = cosineSimilarity($tokensA, $tokensB);
+            if ($similarity < 0.95) {
+                $newConversationsStr[] = $item;
+            } else {
+                $is_match = false;
+                $itemCount = count($item['original']['conversations']);
+                $longestCount = count($longest['original']['conversations']);
+                if ($itemCount > $longestCount) {
+                    if ($itemCount > $last_len) {
+                        $last_len = $itemCount;
+                        $last_index = $index;
+                    }
+                } else {
+                    if ($longestCount > $last_len) {
+                        $last_len = $longestCount;
+                        $last_index = $index;
+                    }
+                }
+            }
+        }
+        if ($is_match) {
+            $filteredData[] = $longest['original'];
+        } else if ($last_index > 0) {
+            $filteredData[] = $conversationsStr[$last_index]['original'];
+        }
+        $conversationsStr = $newConversationsStr;
+        print_r("\r" . count($conversationsStr));
+    }
+
+    return $filteredData;
+}
+
+function writeJsonFile($data, $filename) {
+    $jsonData = json_encode($data, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);
+    file_put_contents($filename, $jsonData);
+}
+
+function main() {
+    $inputFilename = 'merged_data1716636036.json';
+    $outputFilename = 'filtered_data.json';
+
+    $data = readJsonFile($inputFilename);
+    echo "Reading data completed.\n" . count($data) . " entries";
+    // Split the data into two halves; one child process is forked per part
+    $parts = array_chunk($data, intval(count($data) / 2));
+    $processes = [];
+
+    foreach ($parts as $index => $part) {
+        $pid = pcntl_fork();
+        if ($pid == -1) {
+            die('Could not fork');
+        } else if ($pid) {
+            // Parent process
+            $processes[] = $pid;
+        } else {
+            // Child process
+            processPart($part, $index);
+            exit();
+        }
+    }
+    $status = null;
+    foreach ($processes as $process) {
+        pcntl_waitpid($process, $status);
+    }
+
+    echo "All processes completed.\n";
+    // Assume all JSON files are in the same directory, e.g. "datasets/"
+    $directory = "datasets/";
+    $allData = [];
+
+    // Open the directory and read its contents
+    if ($handle = opendir($directory)) {
+        while (false !== ($entry = readdir($handle))) {
+            if ($entry != "." && $entry != ".." && pathinfo($entry, PATHINFO_EXTENSION) == 'json') {
+                // Read the JSON file
+                $jsonContent = file_get_contents($directory . $entry);
+                echo $directory . $entry;
+                // Decode the JSON content into an array
+                $data = json_decode($jsonContent, true);
+                // Merge the decoded array into the combined array
+                $allData = array_merge($allData, $data);
+            }
+        }
+        closedir($handle);
+    }
+    shuffle($allData);
+    // Encode the combined array as JSON
+    $finalJson = json_encode($allData, JSON_PRETTY_PRINT);
+
+    // Write the final JSON to a new file
+    file_put_contents("datasets/merged_data.json", $finalJson);
+
+    echo "All JSON files have been merged into merged_data.json\n";
+    $pattern = $directory . '/filtered_data_part_[0-9]*.json';
+    $files = glob($pattern);
+
+    // Iterate over the matched part files and delete them one by one
+    foreach ($files as $file) {
+        if (is_file($file)) {
+            if (unlink($file)) {
+                echo "Deleted: $file\n";
+            } else {
+                echo "Error deleting: $file\n";
+            }
+        }
+    }
+    // Source file path
+    $sourceFile = $directory . 'merged_data.json';
+
+    // Destination file path
+    $destinationFile = 'merged_data' . time() . '.json';
+
+    // Move the file
+    if (rename($sourceFile, $destinationFile)) {
+        echo "File successfully moved to: $destinationFile";
+    } else {
+        echo "Failed to move the file!";
+    }
+}
+
+function processPart($dataPart, $index) {
+    $filteredData = filterConversations($dataPart);
+    $outputFilename = "datasets/filtered_data_part_$index.json";
+    writeJsonFile($filteredData, $outputFilename);
+    echo "Process $index: Writing data completed.\n";
+}
+
+
+main();
+?>
test.py
ADDED
@@ -0,0 +1,65 @@
+import json
+import math
+from collections import Counter
+
+def read_json_file(filename):
+    with open(filename, 'r', encoding='utf-8') as file:
+        data = json.load(file)
+    return data
+
+def concatenate_conversations(conversations):
+    concatenated = ' '.join(conv['content'] for conv in conversations)
+    return concatenated.strip()
+
+def cosine_similarity(tokensA, tokensB):
+    a = b = c = 0.0
+    unique_tokens = set(tokensA.keys()).union(set(tokensB.keys()))
+    for token in unique_tokens:
+        x = tokensA.get(token, 0)
+        y = tokensB.get(token, 0)
+        a += x * y
+        b += x ** 2
+        c += y ** 2
+    return a / (math.sqrt(b) * math.sqrt(c)) if b > 0 and c > 0 else 0
+
+def filter_conversations(data):
+    conversations_str = [{'content': concatenate_conversations(item['conversations']), 'original': item} for item in data]
+    conversations_str.sort(key=lambda x: len(x['content']), reverse=True)
+
+    filtered_data = []
+    while conversations_str:
+        longest = conversations_str.pop(0)
+        tokensB = Counter(longest['content'].split())
+        new_conversations_str = []
+        for item in conversations_str:
+            tokensA = Counter(item['content'].split())
+            similarity = cosine_similarity(tokensA, tokensB)
+            if similarity < 0.95:
+                new_conversations_str.append(item)
+            else:
+                longest_count = len(longest['original']['conversations'])
+                item_count = len(item['original']['conversations'])
+                if item_count > longest_count:
+                    longest = item
+        filtered_data.append(longest['original'])
+        # Drop the near-duplicates of the current longest conversation before the next pass.
+        conversations_str = new_conversations_str
+        print("\rRemaining items: {}".format(len(conversations_str)), end='')
+
+    return filtered_data
+
+def write_json_file(data, filename):
+    with open(filename, 'w', encoding='utf-8') as file:
+        json.dump(data, file, ensure_ascii=False, indent=4)
+
+def main():
+    input_filename = 'unique_data1.json'
+    output_filename = 'filtered_data.json'
+
+    data = read_json_file(input_filename)
+    print(f"Reading data completed. {len(data)} entries loaded.")
+    filtered_data = filter_conversations(data)
+    print("Filtering completed.")
+    write_json_file(filtered_data, output_filename)
+    print("Writing data completed.")
+
+main()
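Both test.php and test.py expect the input JSON to be a list of records, each with a conversations array whose turns carry a content field; records whose concatenated text is nearly identical (token cosine similarity of 0.95 or higher) collapse to one, preferring the record with more turns. A minimal, made-up input illustrating that shape and behavior:

# Made-up sample in the shape the scripts expect; not real data from this repo.
# Assumes filter_conversations from test.py is already defined in the session.
sample = [
    {"conversations": [{"content": "Tell me a short story about a dragon and a knight."},
                       {"content": "Once upon a time a dragon befriended a knight."}]},
    {"conversations": [{"content": "Tell me a short story about a dragon and a knight. "
                                   "Once upon a time a dragon befriended a knight."}]},
    {"conversations": [{"content": "What is the capital of France?"},
                       {"content": "The capital of France is Paris."}]},
]

kept = filter_conversations(sample)
print(len(kept))  # the two near-identical records collapse to one, so this prints 2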