import json
import math
from collections import Counter


def read_json_file(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)


def concatenate_conversations(conversations):
    # Join every turn's text into one string so an entry can be compared as a whole.
    concatenated = ' '.join(conv['content'] for conv in conversations)
    return concatenated.strip()


def cosine_similarity(tokensA, tokensB):
    # Treat the two Counters as sparse term-frequency vectors and compute
    # dot(A, B) / (|A| * |B|); returns 0.0 if either vector is empty.
    a = b = c = 0.0
    unique_tokens = set(tokensA.keys()).union(set(tokensB.keys()))
    for token in unique_tokens:
        x = tokensA.get(token, 0)
        y = tokensB.get(token, 0)
        a += x * y
        b += x ** 2
        c += y ** 2
    return a / (math.sqrt(b) * math.sqrt(c)) if b > 0 and c > 0 else 0.0


def filter_conversations(data):
    conversations_str = [
        {'content': concatenate_conversations(item['conversations']), 'original': item}
        for item in data
    ]
    # Process longest entries first so each near-duplicate cluster is anchored
    # on its longest member.
    conversations_str.sort(key=lambda x: len(x['content']), reverse=True)
    filtered_data = []
    while conversations_str:
        longest = conversations_str.pop(0)
        tokensB = Counter(longest['content'].split())
        new_conversations_str = []
        for item in conversations_str:
            tokensA = Counter(item['content'].split())
            similarity = cosine_similarity(tokensA, tokensB)
            if similarity < 0.95:
                # Not a near-duplicate of the anchor; keep it for later rounds.
                new_conversations_str.append(item)
            else:
                # Near-duplicate: keep whichever entry has more conversation
                # turns. Note that tokensB stays anchored on the original
                # longest entry's tokens even if the kept candidate changes.
                longest_count = len(longest['original']['conversations'])
                item_count = len(item['original']['conversations'])
                if item_count > longest_count:
                    longest = item
        filtered_data.append(longest['original'])
        # Carry only the non-duplicates into the next round; assigning the
        # list to itself here would leave the near-duplicates in the pool.
        conversations_str = new_conversations_str
        print("\rRemaining items: {}".format(len(conversations_str)), end='', flush=True)
    print()  # terminate the carriage-return progress line
    return filtered_data


def write_json_file(data, filename):
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)


def main():
    input_filename = 'unique_data1.json'
    output_filename = 'filtered_data.json'
    data = read_json_file(input_filename)
    print(f"Reading data completed. {len(data)} entries loaded.")
    filtered_data = filter_conversations(data)
    print("Filtering completed.")
    write_json_file(filtered_data, output_filename)
    print("Writing data completed.")


if __name__ == '__main__':
    main()
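# Input sketch (inferred from the keys this script accesses, not a confirmed
# schema): a JSON array where each entry has a 'conversations' list and each
# turn carries its text under 'content'. Any other keys an entry may hold pass
# through unchanged, since filtering copies the original entry to the output.
#
#   [
#     {"conversations": [{"content": "first turn"}, {"content": "second turn"}]},
#     {"conversations": [{"content": "another entry"}]}
#   ]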