Upload folder using huggingface_hub
- .gitattributes +8 -0
- .ipynb_checkpoints/LimaRP-augmented-8k-context-checkpoint.json +3 -0
- .ipynb_checkpoints/config-checkpoint.yaml +119 -0
- .ipynb_checkpoints/mergekit-checkpoint.yaml +10 -0
- .ipynb_checkpoints/test-checkpoint.php +177 -0
- .ipynb_checkpoints/test-checkpoint.py +65 -0
- 11.json +3 -0
- Claude-3-Opus-Instruct-15K/.ipynb_checkpoints/Claude3-Opus-Multi-Instruct-5K-v1-checkpoint.json +3 -0
- Claude-3-Opus-Instruct-15K/Claude3-Opus-Multi-Instruct-5K-v1.json +3 -0
- Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-3.5K-Filtered-v2.json +3 -0
- Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-6.5K-Filtered-v2.json +3 -0
- Claude-3-Opus-Instruct-15K/zero3_bf16.json +31 -0
- LimaRP-augmented-8k-context.json +3 -0
- clean-gpt2.json +3 -0
- config.yaml +119 -0
- mergekit.yaml +10 -0
- pippa_raw_fix.parquet +3 -0
- test.php +177 -0
- test.py +65 -0
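The commit title indicates the folder was pushed with the huggingface_hub client. A minimal sketch of that workflow is below; the repo id, local folder, and repo_type are placeholders for illustration, not values recorded in this commit.

# Minimal sketch of pushing a local folder with huggingface_hub.
# "your-username/your-repo" and the folder path are placeholders, not values from this commit.
from huggingface_hub import HfApi

api = HfApi()  # assumes an auth token is already configured (huggingface-cli login or HF_TOKEN)
api.upload_folder(
    folder_path="/root/autodl-tmp/c4",        # local folder to upload (placeholder)
    repo_id="your-username/your-repo",        # target repo (placeholder)
    repo_type="model",                        # could equally be "dataset" for a data repo
    commit_message="Upload folder using huggingface_hub",
)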
.gitattributes
CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+.ipynb_checkpoints/LimaRP-augmented-8k-context-checkpoint.json filter=lfs diff=lfs merge=lfs -text
+11.json filter=lfs diff=lfs merge=lfs -text
+Claude-3-Opus-Instruct-15K/.ipynb_checkpoints/Claude3-Opus-Multi-Instruct-5K-v1-checkpoint.json filter=lfs diff=lfs merge=lfs -text
+Claude-3-Opus-Instruct-15K/Claude3-Opus-Multi-Instruct-5K-v1.json filter=lfs diff=lfs merge=lfs -text
+Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-3.5K-Filtered-v2.json filter=lfs diff=lfs merge=lfs -text
+Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-6.5K-Filtered-v2.json filter=lfs diff=lfs merge=lfs -text
+LimaRP-augmented-8k-context.json filter=lfs diff=lfs merge=lfs -text
+clean-gpt2.json filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/LimaRP-augmented-8k-context-checkpoint.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb7ac99415c0870fd9cfc48d0bce2aa980fb975eaf71e50dde47c39fc1c66648
+size 20503146
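The three lines above are a Git LFS pointer, not the dataset itself: a spec version, the SHA-256 oid of the real file, and its size in bytes. A small sketch for reading those fields back out of a pointer file (the filename is just the one from this entry):

# Parse a Git LFS pointer file into its version / oid / size fields.
def read_lfs_pointer(path):
    fields = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

pointer = read_lfs_pointer(".ipynb_checkpoints/LimaRP-augmented-8k-context-checkpoint.json")
print(pointer["oid"], int(pointer["size"]))  # sha256:<hash> 20503146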
.ipynb_checkpoints/config-checkpoint.yaml
ADDED
@@ -0,0 +1,119 @@
+base_model: /root/autodl-tmp/c4/Meta-Llama-3-8B-Instruct-abliterated-v3
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+tokenizer_use_fast: false
+
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+model_config:
+
+datasets:
+  - path: /root/autodl-tmp/c4/11.json
+    type: sharegpt
+    conversation: llama3
+    roles:
+      input: user
+      output: assistant
+  - path: /root/autodl-tmp/c4/LimaRP-augmented-8k-context.json
+    type: sharegpt
+    conversation: llama3
+  - path: Sao10K/Short-Storygen-v2
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      field_system: system
+      field_output: response
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/Claude-3-Opus-Instruct-15K/Claude3-Opus-Multi-Instruct-5K-v1.json
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      #field_input: prompt
+      field_output: response
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-3.5K-Filtered-v2.json
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      #field_input: prompt
+      field_output: response
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-6.5K-Filtered-v2.json
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      #field_input: prompt
+      field_output: response
+    conversation: llama3
+  - path: SicariusSicariiStuff/Bluemoon_Top50MB_Sorted_Fixed
+    type: sharegpt
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/clean-gpt2.json
+    type: sharegpt
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/LimaRP-augmented-8k-context.json
+    type: sharegpt
+    conversation: llama3
+
+chat_template: llama3
+
+
+dataset_prepared_path: /root/autodl-tmp/thingy
+val_set_size: 0.005
+output_dir: ./out
+
+sequence_len: 4096
+sample_packing: true
+pad_to_sequence_len: true
+
+gradient_accumulation_steps: 4
+micro_batch_size: 3
+num_epochs: 5
+logging_steps: 1
+optimizer: adamw_8bit
+lr_scheduler: cosine
+learning_rate: 2e-5
+
+wandb_project: llama3-8b-hiwaifu
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+saves_per_epoch: 1
+save_total_limit: 2
+save_steps:
+evals_per_epoch: 4
+eval_sample_packing: false
+debug:
+deepspeed: /root/autodl-tmp/c4/axolotl/deepspeed_configs/zero3_bf16.json
+weight_decay: 0.05
+fsdp:
+fsdp_config:
+special_tokens:
+  eos_token: "<|eot_id|>"
+  pad_token: "<|end_of_text|>"
.ipynb_checkpoints/mergekit-checkpoint.yaml
ADDED
@@ -0,0 +1,10 @@
+merge_method: task_arithmetic
+base_model: /root/autodl-tmp/llama3
+models:
+  - model: /root/autodl-tmp/llama3
+    parameters:
+      weight: 0.3
+  - model: /root/autodl-tmp/out/checkpoint-385
+    parameters:
+      weight: 0.7
+dtype: float16
.ipynb_checkpoints/test-checkpoint.php
ADDED
@@ -0,0 +1,177 @@
+<?php
+ini_set('memory_limit', '-1');
+function readJsonFile($filename) {
+    $jsonString = file_get_contents($filename);
+    $data = json_decode($jsonString, true);
+    return $data;
+}
+
+function concatenateConversations($conversations) {
+    $concatenated = array_reduce($conversations, function($carry, $item) {
+        return $carry . ' ' . $item['content'];
+    }, '');
+    return trim($concatenated);
+}
+function cosineSimilarity($tokensA, $tokensB) {
+    $a = $b = $c = 0;
+    $uniqueTokens = array_unique(array_merge(array_keys($tokensA), array_keys($tokensB)));
+    foreach ($uniqueTokens as $token) {
+        $x = isset($tokensA[$token]) ? $tokensA[$token] : 0;
+        $y = isset($tokensB[$token]) ? $tokensB[$token] : 0;
+        $a += $x * $y;
+        $b += $x * $x;
+        $c += $y * $y;
+    }
+    return $b * $c > 0 ? $a / sqrt($b * $c) : 0;
+}
+function filterConversations($data) {
+    $conversationsStr = array_map(function($item) {
+        return ['content' => concatenateConversations($item['conversations']), 'original' => $item];
+    }, $data);
+
+    usort($conversationsStr, function($a, $b) {
+        return strlen($b['content']) <=> strlen($a['content']);
+    });
+
+    $filteredData = [];
+    while (!empty($conversationsStr)) {
+        $last_len = 0;
+        // true while no near-duplicate of the current longest conversation has been seen
+        $is_match = true;
+        $last_index = 0;
+        $longest = array_shift($conversationsStr);
+        $newConversationsStr = [];
+        $tokensB = array_count_values(str_word_count($longest['content'], 1));
+        foreach ($conversationsStr as $index => $item) {
+            $tokensA = array_count_values(str_word_count($item['content'], 1));
+            $similarity = cosineSimilarity($tokensA, $tokensB);
+            if ($similarity < 0.95) {
+                $newConversationsStr[] = $item;
+            } else {
+                $is_match = false;
+                $itemCount = count($item['original']['conversations']);
+                $longestCount = count($longest['original']['conversations']);
+                if ($itemCount > $longestCount) {
+                    if ($itemCount > $last_len) {
+                        $last_len = $itemCount;
+                        $last_index = $index;
+                    }
+                } else {
+                    if ($longestCount > $last_len) {
+                        $last_len = $longestCount;
+                        $last_index = $index;
+                    }
+                }
+            }
+        }
+        if ($is_match) {
+            $filteredData[] = $longest['original'];
+        } else if ($last_index > 0) {
+            $filteredData[] = $conversationsStr[$last_index]['original'];
+        }
+        $conversationsStr = $newConversationsStr;
+        print_r("\r" . count($conversationsStr));
+    }
+
+    return $filteredData;
+}
+
+function writeJsonFile($data, $filename) {
+    $jsonData = json_encode($data, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);
+    file_put_contents($filename, $jsonData);
+}
+
+function main() {
+    $inputFilename = 'merged_data1716636036.json';
+    $outputFilename = 'filtered_data.json';
+
+    $data = readJsonFile($inputFilename);
+    echo "Reading data completed.\n" . count($data) . " entries";
+    // Split the data into two halves; one child process is forked per part
+    $parts = array_chunk($data, intval(count($data) / 2));
+    $processes = [];
+
+    foreach ($parts as $index => $part) {
+        $pid = pcntl_fork();
+        if ($pid == -1) {
+            die('Could not fork');
+        } else if ($pid) {
+            // Parent process
+            $processes[] = $pid;
+        } else {
+            // Child process
+            processPart($part, $index);
+            exit();
+        }
+    }
+    $status = null;
+    foreach ($processes as $process) {
+        pcntl_waitpid($process, $status);
+    }
+
+    echo "All processes completed.\n";
+    // Assume all JSON files are in the same directory, e.g. "datasets/"
+    $directory = "datasets/";
+    $allData = [];
+
+    // Open the directory and read its contents
+    if ($handle = opendir($directory)) {
+        while (false !== ($entry = readdir($handle))) {
+            if ($entry != "." && $entry != ".." && pathinfo($entry, PATHINFO_EXTENSION) == 'json') {
+                // Read the JSON file
+                $jsonContent = file_get_contents($directory . $entry);
+                echo $directory . $entry;
+                // Decode the JSON content into an array
+                $data = json_decode($jsonContent, true);
+                // Merge the decoded array into the combined array
+                $allData = array_merge($allData, $data);
+            }
+        }
+        closedir($handle);
+    }
+    shuffle($allData);
+    // Encode the combined array as JSON
+    $finalJson = json_encode($allData, JSON_PRETTY_PRINT);
+
+    // Write the final JSON to a new file
+    file_put_contents("datasets/merged_data.json", $finalJson);
+
+    echo "All JSON files have been merged into merged_data.json\n";
+    $pattern = $directory . '/filtered_data_part_[0-9]*.json';
+    $files = glob($pattern);
+
+    // Iterate over the matched part files and delete them one by one
+    foreach ($files as $file) {
+        if (is_file($file)) {
+            if (unlink($file)) {
+                echo "Deleted: $file\n";
+            } else {
+                echo "Error deleting: $file\n";
+            }
+        }
+    }
+    // Source file path
+    $sourceFile = $directory . 'merged_data.json';
+
+    // Destination file path
+    $destinationFile = 'merged_data' . time() . '.json';
+
+    // Move the file
+    if (rename($sourceFile, $destinationFile)) {
+        echo "File successfully moved to: $destinationFile";
+    } else {
+        echo "Failed to move the file!";
+    }
+}
+
+function processPart($dataPart, $index) {
+    $filteredData = filterConversations($dataPart);
+    $outputFilename = "datasets/filtered_data_part_$index.json";
+    writeJsonFile($filteredData, $outputFilename);
+    echo "Process $index: Writing data completed.\n";
+}
+
+
+main();
+?>
.ipynb_checkpoints/test-checkpoint.py
ADDED
@@ -0,0 +1,65 @@
+import json
+import math
+from collections import Counter
+
+def read_json_file(filename):
+    with open(filename, 'r', encoding='utf-8') as file:
+        data = json.load(file)
+    return data
+
+def concatenate_conversations(conversations):
+    concatenated = ' '.join(conv['content'] for conv in conversations)
+    return concatenated.strip()
+
+def cosine_similarity(tokensA, tokensB):
+    a = b = c = 0.0
+    unique_tokens = set(tokensA.keys()).union(set(tokensB.keys()))
+    for token in unique_tokens:
+        x = tokensA.get(token, 0)
+        y = tokensB.get(token, 0)
+        a += x * y
+        b += x ** 2
+        c += y ** 2
+    return a / (math.sqrt(b) * math.sqrt(c)) if b > 0 and c > 0 else 0
+
+def filter_conversations(data):
+    conversations_str = [{'content': concatenate_conversations(item['conversations']), 'original': item} for item in data]
+    conversations_str.sort(key=lambda x: len(x['content']), reverse=True)
+
+    filtered_data = []
+    while conversations_str:
+        longest = conversations_str.pop(0)
+        tokensB = Counter(longest['content'].split())
+        new_conversations_str = []
+        for item in conversations_str:
+            tokensA = Counter(item['content'].split())
+            similarity = cosine_similarity(tokensA, tokensB)
+            if similarity < 0.95:
+                new_conversations_str.append(item)
+            else:
+                longest_count = len(longest['original']['conversations'])
+                item_count = len(item['original']['conversations'])
+                if item_count > longest_count:
+                    longest = item
+        filtered_data.append(longest['original'])
+        # Drop the near-duplicates of the current longest conversation before the next pass.
+        conversations_str = new_conversations_str
+        print("\rRemaining items: {}".format(len(conversations_str)), end='')
+
+    return filtered_data
+
+def write_json_file(data, filename):
+    with open(filename, 'w', encoding='utf-8') as file:
+        json.dump(data, file, ensure_ascii=False, indent=4)
+
+def main():
+    input_filename = 'unique_data1.json'
+    output_filename = 'filtered_data.json'
+
+    data = read_json_file(input_filename)
+    print(f"Reading data completed. {len(data)} entries loaded.")
+    filtered_data = filter_conversations(data)
+    print("Filtering completed.")
+    write_json_file(filtered_data, output_filename)
+    print("Writing data completed.")
+
+main()
11.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5cb89c31fc384d1e7de36753f3309c35747d912288fbfaaf4c3cc9492e30064c
+size 48701344
Claude-3-Opus-Instruct-15K/.ipynb_checkpoints/Claude3-Opus-Multi-Instruct-5K-v1-checkpoint.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3df81a1d629c2b7601f4fd0c44901f28d5b5892eb2fb8b897d7d05a80ac274d0
+size 14998066
Claude-3-Opus-Instruct-15K/Claude3-Opus-Multi-Instruct-5K-v1.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3df81a1d629c2b7601f4fd0c44901f28d5b5892eb2fb8b897d7d05a80ac274d0
+size 14998066
Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-3.5K-Filtered-v2.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6bd8f6b0c314377f8ac9aab6ab8b93bb9f1d939df129e51103e7d8ac717b2b5
+size 14586329
Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-6.5K-Filtered-v2.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1312a6ba6d347a33609c03a466c02eff8838ed7c5680ba1dacb57218028cbe02
+size 23703566
Claude-3-Opus-Instruct-15K/zero3_bf16.json
ADDED
@@ -0,0 +1,31 @@
+{
+    "zero_optimization": {
+        "stage": 3,
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 0,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 0,
+        "stage3_max_reuse_distance": 0,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "bf16": {
+        "enabled": true
+    },
+    "fp16": {
+        "enabled": "auto",
+        "auto_cast": false,
+        "loss_scale": 0,
+        "initial_scale_power": 32,
+        "loss_scale_window": 1000,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
LimaRP-augmented-8k-context.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb7ac99415c0870fd9cfc48d0bce2aa980fb975eaf71e50dde47c39fc1c66648
+size 20503146
clean-gpt2.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cdfbff2b639bc8a9ec55e3093803b87d1b7dbbeb76db2e21092a553d62c8635e
+size 686939966
config.yaml
ADDED
@@ -0,0 +1,119 @@
+base_model: /root/autodl-tmp/c4/Meta-Llama-3-8B-Instruct-abliterated-v3
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+tokenizer_use_fast: false
+
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+model_config:
+
+datasets:
+  - path: /root/autodl-tmp/c4/11.json
+    type: sharegpt
+    conversation: llama3
+    roles:
+      input: user
+      output: assistant
+  - path: /root/autodl-tmp/c4/LimaRP-augmented-8k-context.json
+    type: sharegpt
+    conversation: llama3
+  - path: Sao10K/Short-Storygen-v2
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      field_system: system
+      field_output: response
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/Claude-3-Opus-Instruct-15K/Claude3-Opus-Multi-Instruct-5K-v1.json
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      #field_input: prompt
+      field_output: response
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-3.5K-Filtered-v2.json
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      #field_input: prompt
+      field_output: response
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/Claude-3-Opus-Instruct-15K/Opus_Instruct-v2-6.5K-Filtered-v2.json
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_instruction: prompt
+      #field_input: prompt
+      field_output: response
+    conversation: llama3
+  - path: SicariusSicariiStuff/Bluemoon_Top50MB_Sorted_Fixed
+    type: sharegpt
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/clean-gpt2.json
+    type: sharegpt
+    conversation: llama3
+  - path: /root/autodl-tmp/c4/LimaRP-augmented-8k-context.json
+    type: sharegpt
+    conversation: llama3
+
+chat_template: llama3
+
+
+dataset_prepared_path: /root/autodl-tmp/thingy
+val_set_size: 0.005
+output_dir: ./out
+
+sequence_len: 4096
+sample_packing: true
+pad_to_sequence_len: true
+
+gradient_accumulation_steps: 4
+micro_batch_size: 3
+num_epochs: 5
+logging_steps: 1
+optimizer: adamw_8bit
+lr_scheduler: cosine
+learning_rate: 2e-5
+
+wandb_project: llama3-8b-hiwaifu
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+saves_per_epoch: 1
+save_total_limit: 2
+save_steps:
+evals_per_epoch: 4
+eval_sample_packing: false
+debug:
+deepspeed: /root/autodl-tmp/c4/axolotl/deepspeed_configs/zero3_bf16.json
+weight_decay: 0.05
+fsdp:
+fsdp_config:
+special_tokens:
+  eos_token: "<|eot_id|>"
+  pad_token: "<|end_of_text|>"
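Most dataset entries in config.yaml point at local files under /root/autodl-tmp/c4, with two Hub datasets mixed in, and the deepspeed key references a local zero3_bf16.json. A small pre-flight check like the sketch below (PyYAML assumed available, run from the directory containing config.yaml) can confirm the local paths exist before launching the trainer, presumably axolotl given the deepspeed_configs path:

# Pre-flight check: verify every local dataset path referenced by config.yaml exists.
# Hub dataset ids (no leading "/") such as "Sao10K/Short-Storygen-v2" are skipped.
import os
import yaml  # assumes PyYAML is installed

with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

paths = [entry.get("path", "") for entry in config.get("datasets", [])]
paths.append(config.get("deepspeed", "") or "")
for path in paths:
    if path.startswith("/") and not os.path.exists(path):
        print(f"missing local file: {path}")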
mergekit.yaml
ADDED
@@ -0,0 +1,10 @@
+merge_method: task_arithmetic
+base_model: /root/autodl-tmp/llama3
+models:
+  - model: /root/autodl-tmp/llama3
+    parameters:
+      weight: 0.3
+  - model: /root/autodl-tmp/out/checkpoint-385
+    parameters:
+      weight: 0.7
+dtype: float16
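In essence, a task_arithmetic merge adds weighted task vectors (model minus base) back onto the base model. Because the first entry here is the base path itself, its task vector is zero, so effectively only the checkpoint-385 delta contributes, scaled by 0.7. A toy single-tensor illustration (NumPy, made-up values, not mergekit's actual implementation):

# Toy illustration of task arithmetic on one tensor:
#   merged = base + sum_i weight_i * (model_i - base)
import numpy as np

base = np.array([1.0, 2.0, 3.0])        # stands in for a base-model tensor
model_a = base.copy()                    # first entry: the base itself -> zero delta
model_b = np.array([1.5, 2.5, 2.0])      # stands in for the checkpoint-385 tensor

merged = base + 0.3 * (model_a - base) + 0.7 * (model_b - base)
print(merged.astype(np.float16))         # cast mirrors "dtype: float16" in the config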
pippa_raw_fix.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3792ffd85ebb51b05f7636e54f67cb64239d980c6fb29e888be744e286ff997
+size 102788461
test.php
ADDED
@@ -0,0 +1,177 @@
+<?php
+ini_set('memory_limit', '-1');
+function readJsonFile($filename) {
+    $jsonString = file_get_contents($filename);
+    $data = json_decode($jsonString, true);
+    return $data;
+}
+
+function concatenateConversations($conversations) {
+    $concatenated = array_reduce($conversations, function($carry, $item) {
+        return $carry . ' ' . $item['content'];
+    }, '');
+    return trim($concatenated);
+}
+function cosineSimilarity($tokensA, $tokensB) {
+    $a = $b = $c = 0;
+    $uniqueTokens = array_unique(array_merge(array_keys($tokensA), array_keys($tokensB)));
+    foreach ($uniqueTokens as $token) {
+        $x = isset($tokensA[$token]) ? $tokensA[$token] : 0;
+        $y = isset($tokensB[$token]) ? $tokensB[$token] : 0;
+        $a += $x * $y;
+        $b += $x * $x;
+        $c += $y * $y;
+    }
+    return $b * $c > 0 ? $a / sqrt($b * $c) : 0;
+}
+function filterConversations($data) {
+    $conversationsStr = array_map(function($item) {
+        return ['content' => concatenateConversations($item['conversations']), 'original' => $item];
+    }, $data);
+
+    usort($conversationsStr, function($a, $b) {
+        return strlen($b['content']) <=> strlen($a['content']);
+    });
+
+    $filteredData = [];
+    while (!empty($conversationsStr)) {
+        $last_len = 0;
+        // true while no near-duplicate of the current longest conversation has been seen
+        $is_match = true;
+        $last_index = 0;
+        $longest = array_shift($conversationsStr);
+        $newConversationsStr = [];
+        $tokensB = array_count_values(str_word_count($longest['content'], 1));
+        foreach ($conversationsStr as $index => $item) {
+            $tokensA = array_count_values(str_word_count($item['content'], 1));
+            $similarity = cosineSimilarity($tokensA, $tokensB);
+            if ($similarity < 0.95) {
+                $newConversationsStr[] = $item;
+            } else {
+                $is_match = false;
+                $itemCount = count($item['original']['conversations']);
+                $longestCount = count($longest['original']['conversations']);
+                if ($itemCount > $longestCount) {
+                    if ($itemCount > $last_len) {
+                        $last_len = $itemCount;
+                        $last_index = $index;
+                    }
+                } else {
+                    if ($longestCount > $last_len) {
+                        $last_len = $longestCount;
+                        $last_index = $index;
+                    }
+                }
+            }
+        }
+        if ($is_match) {
+            $filteredData[] = $longest['original'];
+        } else if ($last_index > 0) {
+            $filteredData[] = $conversationsStr[$last_index]['original'];
+        }
+        $conversationsStr = $newConversationsStr;
+        print_r("\r" . count($conversationsStr));
+    }
+
+    return $filteredData;
+}
+
+function writeJsonFile($data, $filename) {
+    $jsonData = json_encode($data, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);
+    file_put_contents($filename, $jsonData);
+}
+
+function main() {
+    $inputFilename = 'merged_data1716636036.json';
+    $outputFilename = 'filtered_data.json';
+
+    $data = readJsonFile($inputFilename);
+    echo "Reading data completed.\n" . count($data) . " entries";
+    // Split the data into two halves; one child process is forked per part
+    $parts = array_chunk($data, intval(count($data) / 2));
+    $processes = [];
+
+    foreach ($parts as $index => $part) {
+        $pid = pcntl_fork();
+        if ($pid == -1) {
+            die('Could not fork');
+        } else if ($pid) {
+            // Parent process
+            $processes[] = $pid;
+        } else {
+            // Child process
+            processPart($part, $index);
+            exit();
+        }
+    }
+    $status = null;
+    foreach ($processes as $process) {
+        pcntl_waitpid($process, $status);
+    }
+
+    echo "All processes completed.\n";
+    // Assume all JSON files are in the same directory, e.g. "datasets/"
+    $directory = "datasets/";
+    $allData = [];
+
+    // Open the directory and read its contents
+    if ($handle = opendir($directory)) {
+        while (false !== ($entry = readdir($handle))) {
+            if ($entry != "." && $entry != ".." && pathinfo($entry, PATHINFO_EXTENSION) == 'json') {
+                // Read the JSON file
+                $jsonContent = file_get_contents($directory . $entry);
+                echo $directory . $entry;
+                // Decode the JSON content into an array
+                $data = json_decode($jsonContent, true);
+                // Merge the decoded array into the combined array
+                $allData = array_merge($allData, $data);
+            }
+        }
+        closedir($handle);
+    }
+    shuffle($allData);
+    // Encode the combined array as JSON
+    $finalJson = json_encode($allData, JSON_PRETTY_PRINT);
+
+    // Write the final JSON to a new file
+    file_put_contents("datasets/merged_data.json", $finalJson);
+
+    echo "All JSON files have been merged into merged_data.json\n";
+    $pattern = $directory . '/filtered_data_part_[0-9]*.json';
+    $files = glob($pattern);
+
+    // Iterate over the matched part files and delete them one by one
+    foreach ($files as $file) {
+        if (is_file($file)) {
+            if (unlink($file)) {
+                echo "Deleted: $file\n";
+            } else {
+                echo "Error deleting: $file\n";
+            }
+        }
+    }
+    // Source file path
+    $sourceFile = $directory . 'merged_data.json';
+
+    // Destination file path
+    $destinationFile = 'merged_data' . time() . '.json';
+
+    // Move the file
+    if (rename($sourceFile, $destinationFile)) {
+        echo "File successfully moved to: $destinationFile";
+    } else {
+        echo "Failed to move the file!";
+    }
+}
+
+function processPart($dataPart, $index) {
+    $filteredData = filterConversations($dataPart);
+    $outputFilename = "datasets/filtered_data_part_$index.json";
+    writeJsonFile($filteredData, $outputFilename);
+    echo "Process $index: Writing data completed.\n";
+}
+
+
+main();
+?>
test.py
ADDED
@@ -0,0 +1,65 @@
+import json
+import math
+from collections import Counter
+
+def read_json_file(filename):
+    with open(filename, 'r', encoding='utf-8') as file:
+        data = json.load(file)
+    return data
+
+def concatenate_conversations(conversations):
+    concatenated = ' '.join(conv['content'] for conv in conversations)
+    return concatenated.strip()
+
+def cosine_similarity(tokensA, tokensB):
+    a = b = c = 0.0
+    unique_tokens = set(tokensA.keys()).union(set(tokensB.keys()))
+    for token in unique_tokens:
+        x = tokensA.get(token, 0)
+        y = tokensB.get(token, 0)
+        a += x * y
+        b += x ** 2
+        c += y ** 2
+    return a / (math.sqrt(b) * math.sqrt(c)) if b > 0 and c > 0 else 0
+
+def filter_conversations(data):
+    conversations_str = [{'content': concatenate_conversations(item['conversations']), 'original': item} for item in data]
+    conversations_str.sort(key=lambda x: len(x['content']), reverse=True)
+
+    filtered_data = []
+    while conversations_str:
+        longest = conversations_str.pop(0)
+        tokensB = Counter(longest['content'].split())
+        new_conversations_str = []
+        for item in conversations_str:
+            tokensA = Counter(item['content'].split())
+            similarity = cosine_similarity(tokensA, tokensB)
+            if similarity < 0.95:
+                new_conversations_str.append(item)
+            else:
+                longest_count = len(longest['original']['conversations'])
+                item_count = len(item['original']['conversations'])
+                if item_count > longest_count:
+                    longest = item
+        filtered_data.append(longest['original'])
+        # Drop the near-duplicates of the current longest conversation before the next pass.
+        conversations_str = new_conversations_str
+        print("\rRemaining items: {}".format(len(conversations_str)), end='')
+
+    return filtered_data
+
+def write_json_file(data, filename):
+    with open(filename, 'w', encoding='utf-8') as file:
+        json.dump(data, file, ensure_ascii=False, indent=4)
+
+def main():
+    input_filename = 'unique_data1.json'
+    output_filename = 'filtered_data.json'
+
+    data = read_json_file(input_filename)
+    print(f"Reading data completed. {len(data)} entries loaded.")
+    filtered_data = filter_conversations(data)
+    print("Filtering completed.")
+    write_json_file(filtered_data, output_filename)
+    print("Writing data completed.")
+
+main()
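Both test.php and test.py expect the input JSON to be a list of records, each with a conversations array whose turns carry a content field; records whose concatenated text is nearly identical (token cosine similarity of 0.95 or higher) collapse to one, preferring the record with more turns. A minimal, made-up input illustrating that shape and behavior:

# Made-up sample in the shape the scripts expect; not real data from this repo.
# Assumes filter_conversations from test.py is already defined in the session.
sample = [
    {"conversations": [{"content": "Tell me a short story about a dragon and a knight."},
                       {"content": "Once upon a time a dragon befriended a knight."}]},
    {"conversations": [{"content": "Tell me a short story about a dragon and a knight. "
                                   "Once upon a time a dragon befriended a knight."}]},
    {"conversations": [{"content": "What is the capital of France?"},
                       {"content": "The capital of France is Paris."}]},
]

kept = filter_conversations(sample)
print(len(kept))  # the two near-identical records collapse to one, so this prints 2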