<?php

// The script keeps the whole dataset in memory while de-duplicating it,
// so PHP's memory cap is lifted ('-1' means no limit).
ini_set('memory_limit', '-1');

/**
 * Read a file and decode its contents as JSON.
 *
 * @param string $filename Path of the JSON file to read.
 * @return mixed The decoded value; JSON objects are returned as associative arrays.
 * @throws RuntimeException If the file cannot be read or does not contain valid JSON.
 */
function readJsonFile($filename) {
    $jsonString = file_get_contents($filename);
    if ($jsonString === false) {
        throw new RuntimeException("Unable to read JSON file: $filename");
    }

    $data = json_decode($jsonString, true);

    // json_decode() also returns null for a literal JSON `null`, so the error
    // state (not the return value) is what distinguishes malformed input.
    if (json_last_error() !== JSON_ERROR_NONE) {
        throw new RuntimeException("Invalid JSON in $filename: " . json_last_error_msg());
    }

    return $data;
}

/**
 * Join the `content` field of every turn into one space-separated string.
 *
 * Turns without a `content` key contribute an empty string instead of raising
 * an undefined-index warning. Leading and trailing whitespace of the joined
 * result is trimmed.
 *
 * @param array $conversations List of turns, each an array that may carry a 'content' entry.
 * @return string The concatenated text, or '' for an empty list.
 */
function concatenateConversations($conversations) {
    $contents = array_map(function ($item) {
        return $item['content'] ?? '';
    }, $conversations);

    return trim(implode(' ', $contents));
}

/**
 * Cosine similarity between two sparse term-frequency vectors.
 *
 * Only tokens present in $tokensA can contribute to the dot product, so the
 * vectors are walked directly instead of first building a merged key list.
 *
 * @param array $tokensA Map of token => count for the first text.
 * @param array $tokensB Map of token => count for the second text.
 * @return float|int Similarity, or 0 when either vector has zero magnitude
 *                   (for example when one side has no tokens).
 */
function cosineSimilarity($tokensA, $tokensB) {
    $dot = 0;
    $normA = 0;
    foreach ($tokensA as $token => $x) {
        $dot += $x * ($tokensB[$token] ?? 0);
        $normA += $x * $x;
    }

    $normB = 0;
    foreach ($tokensB as $y) {
        $normB += $y * $y;
    }

    $denominator = $normA * $normB;

    // Guard against division by zero for empty vectors.
    return $denominator > 0 ? $dot / sqrt($denominator) : 0;
}

/**
 * Remove near-duplicate conversations, keeping one representative per cluster.
 *
 * Items are processed from longest to shortest concatenated text. In each
 * round the longest remaining item is the pivot; every other remaining item
 * whose cosine similarity to the pivot is >= 0.95 belongs to the pivot's
 * cluster and is removed from further rounds. From each cluster the member
 * with the most conversation turns is kept; when no member has strictly more
 * turns than the pivot, the pivot itself is kept.
 *
 * Exactly one item is emitted per round, so a cluster is never dropped
 * entirely, whichever position its best member holds in the list.
 *
 * Prints the number of items still to be processed after each round
 * (carriage-return prefixed, as a progress indicator).
 *
 * NOTE(review): tokens come from str_word_count(), which only recognises
 * alphabetic words as defined by the current locale; text in other scripts
 * may yield no tokens and therefore never be considered similar — confirm
 * this matches the data being filtered.
 *
 * @param array $data List of items, each with a 'conversations' list of turns.
 * @return array The surviving original items.
 */
function filterConversations($data) {
    // Tokenise every item once up front; the token counts do not change
    // between rounds, so recomputing them inside the loop is wasted work.
    $candidates = array_map(function ($item) {
        $content = concatenateConversations($item['conversations']);

        return [
            'length' => strlen($content),
            'tokens' => array_count_values(str_word_count($content, 1)),
            'original' => $item,
        ];
    }, $data);

    // Longest text first, so each round's pivot is the longest remaining item.
    usort($candidates, function ($a, $b) {
        return $b['length'] <=> $a['length'];
    });

    $filteredData = [];

    while (!empty($candidates)) {
        $longest = array_shift($candidates);

        // The pivot is the default representative of its cluster.
        $best = $longest;
        $bestTurns = count($longest['original']['conversations']);
        $remaining = [];

        foreach ($candidates as $item) {
            $similarity = cosineSimilarity($item['tokens'], $longest['tokens']);

            if ($similarity < 0.95) {
                // Not a duplicate of the pivot: it stays for a later round.
                $remaining[] = $item;
                continue;
            }

            // Near-duplicate: it replaces the representative only when it
            // has strictly more turns.
            $turns = count($item['original']['conversations']);
            if ($turns > $bestTurns) {
                $best = $item;
                $bestTurns = $turns;
            }
        }

        $filteredData[] = $best['original'];
        $candidates = $remaining;

        print_r("\r" . count($candidates));
    }

    return $filteredData;
}

/**
 * Encode a value as pretty-printed JSON (non-ASCII characters left unescaped)
 * and write it to a file, replacing any existing contents.
 *
 * @param mixed  $data     Value to serialise.
 * @param string $filename Destination path.
 * @return void
 * @throws RuntimeException If the value cannot be encoded or the file cannot be written.
 */
function writeJsonFile($data, $filename) {
    $jsonData = json_encode($data, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);

    // json_encode() returns false on failure (e.g. malformed UTF-8); writing
    // that out would silently produce an empty file.
    if ($jsonData === false) {
        throw new RuntimeException("Unable to encode data for $filename: " . json_last_error_msg());
    }

    if (file_put_contents($filename, $jsonData) === false) {
        throw new RuntimeException("Unable to write JSON file: $filename");
    }
}

/**
 * Script entry point.
 *
 * Steps:
 *  1. Load the input dataset.
 *  2. Split it into at most two parts and de-duplicate each part in a forked
 *     worker process (see processPart()); each worker writes a part file
 *     into datasets/.
 *  3. Merge every *.json file found in datasets/, shuffle the records and
 *     write them to datasets/merged_data.json.
 *  4. Delete the part files and move the merged file to
 *     merged_data<unix-timestamp>.json in the working directory.
 *
 * De-duplication happens inside each part only; records in different parts
 * are never compared with each other.
 *
 * NOTE(review): step 3 merges every JSON file present in datasets/, including
 * any left over from an earlier run (e.g. a merged_data.json whose move
 * failed) — confirm that is intended.
 *
 * @return void
 */
function main() {
    $inputFilename = 'merged_data1716636036.json';
    $directory = "datasets/";

    $data = readJsonFile($inputFilename);
    if (!is_array($data)) {
        die("Could not load an array of records from $inputFilename\n");
    }
    echo "Reading data completed.\n".count($data)."数据";

    // Workers write their part files here, so it must exist before forking.
    if (!is_dir($directory) && !mkdir($directory, 0777, true) && !is_dir($directory)) {
        die("Could not create directory: $directory\n");
    }

    // Split into two halves. Rounding up keeps odd counts at two parts, and
    // max() keeps the chunk size valid when there are fewer than two records
    // (array_chunk() rejects a size below 1).
    $chunkSize = max(1, (int) ceil(count($data) / 2));
    $parts = array_chunk($data, $chunkSize);
    $processes = [];

    foreach ($parts as $index => $part) {
        $pid = pcntl_fork();
        if ($pid == -1) {
            die('Could not fork');
        } else if ($pid) {
            // Parent: remember the child so it can be awaited below.
            $processes[] = $pid;
        } else {
            // Child: process one part, then stop so it does not run the
            // parent's merge phase.
            processPart($part, $index);
            exit();
        }
    }

    // The parent no longer needs the input once the workers hold their copies.
    unset($data, $parts);

    foreach ($processes as $process) {
        $status = 0;
        $waited = pcntl_waitpid($process, $status);
        if ($waited === -1 || !pcntl_wifexited($status) || pcntl_wexitstatus($status) !== 0) {
            echo "Warning: worker process $process did not exit cleanly; its part may be missing.\n";
        }
    }

    echo "All processes completed.\n";

    // Collect the records of every JSON file in the directory, then merge
    // once at the end rather than re-copying the accumulated array per file.
    $chunks = [];
    if ($handle = opendir($directory)) {
        while (false !== ($entry = readdir($handle))) {
            if ($entry != "." && $entry != ".." && pathinfo($entry, PATHINFO_EXTENSION) == 'json') {
                $jsonContent = file_get_contents($directory . $entry);
                echo $directory . $entry;

                $decoded = json_decode($jsonContent, true);
                if (!is_array($decoded)) {
                    // Unreadable or malformed file: array_merge() cannot take it.
                    echo " (skipped: not a JSON array)\n";
                    continue;
                }
                $chunks[] = $decoded;
            }
        }
        closedir($handle);
    }

    $allData = $chunks ? array_merge(...$chunks) : [];
    unset($chunks);

    shuffle($allData);

    // Written with the same helper (and thus the same encoding flags) as the
    // part files.
    $sourceFile = $directory . 'merged_data.json';
    writeJsonFile($allData, $sourceFile);

    echo "All JSON files have been merged into merged_data.json\n";

    // Remove the per-worker part files now that they have been merged.
    // glob() may return false on error, so fall back to an empty list.
    $pattern = $directory . 'filtered_data_part_[0-9]*.json';
    $files = glob($pattern) ?: [];

    foreach ($files as $file) {
        if (is_file($file)) {
            if (unlink($file)) {
                echo "Deleted: $file\n";
            } else {
                echo "Error deleting: $file\n";
            }
        }
    }

    // Move the merged result out of the working directory under a
    // timestamped name.
    $destinationFile = 'merged_data'.time().'.json';

    if (rename($sourceFile, $destinationFile)) {
        echo "文件成功移动到: $destinationFile";
    } else {
        echo "文件移动失败!";
    }
}

/**
 * De-duplicate one slice of the dataset and write the result to its own
 * part file, datasets/filtered_data_part_<index>.json.
 *
 * @param array      $dataPart Slice of the dataset to filter.
 * @param int|string $index    Identifier of the slice; used in the output
 *                             file name and in the progress message.
 * @return void
 */
function processPart($dataPart, $index) {
    $filteredData = filterConversations($dataPart);

    $outputFilename = "datasets/filtered_data_part_$index.json";
    writeJsonFile($filteredData, $outputFilename);

    echo "Process $index: Writing data completed.\n";
}

// Run the pipeline. The closing PHP tag is intentionally omitted so nothing
// after the code can be emitted as literal output.
main();