|
import json |
|
import math |
|
from collections import Counter |
|
|
|
def read_json_file(filename):
    """Load and return the JSON payload stored in *filename* (UTF-8)."""
    with open(filename, 'r', encoding='utf-8') as fp:
        return json.load(fp)
|
|
|
def concatenate_conversations(conversations):
    """Join the 'content' field of every turn into one space-separated string.

    Leading/trailing whitespace of the combined text is stripped.
    """
    parts = [turn['content'] for turn in conversations]
    return ' '.join(parts).strip()
|
|
|
def cosine_similarity(tokensA, tokensB):
    """Cosine similarity between two bag-of-words frequency mappings.

    Parameters
    ----------
    tokensA, tokensB : mapping of token -> count (e.g. collections.Counter)

    Returns
    -------
    float in [0, 1] for non-negative counts; 0 when either vector is
    empty / all-zero (avoids division by zero).
    """
    # Only tokens present in BOTH vectors contribute to the dot product,
    # so iterate the smaller intersection instead of the full union.
    dot = sum(count * tokensB[token]
              for token, count in tokensA.items() if token in tokensB)
    norm_a = math.sqrt(sum(c * c for c in tokensA.values()))
    norm_b = math.sqrt(sum(c * c for c in tokensB.values()))
    return dot / (norm_a * norm_b) if norm_a > 0 and norm_b > 0 else 0
|
|
|
def filter_conversations(data):
    """Greedily remove near-duplicate conversation records.

    Each record's turns are concatenated into one text; any record whose
    bag-of-words cosine similarity to an already-selected record is >= 0.95
    is treated as a duplicate. Within a duplicate group the record with the
    most conversation turns is kept.

    Parameters
    ----------
    data : list of dicts, each with a 'conversations' list of turn dicts.

    Returns
    -------
    New list with the surviving records; *data* itself is not mutated.
    """
    conversations_str = [
        {'content': concatenate_conversations(item['conversations']), 'original': item}
        for item in data
    ]
    # Longest texts first so each kept record absorbs its shorter duplicates.
    conversations_str.sort(key=lambda x: len(x['content']), reverse=True)

    filtered_data = []
    while conversations_str:
        longest = conversations_str.pop(0)
        tokensB = Counter(longest['content'].split())
        new_conversations_str = []
        for item in conversations_str:
            tokensA = Counter(item['content'].split())
            similarity = cosine_similarity(tokensA, tokensB)
            if similarity < 0.95:
                new_conversations_str.append(item)
            else:
                # Near-duplicate of `longest`: keep whichever record has
                # more turns. NOTE(review): tokensB intentionally stays the
                # first pick's tokens even after `longest` is swapped.
                longest_count = len(longest['original']['conversations'])
                item_count = len(item['original']['conversations'])
                if item_count > longest_count:
                    longest = item
        filtered_data.append(longest['original'])
        # BUG FIX: the survivors must replace the working list. The original
        # code assigned `conversations_str` to itself, discarding
        # `new_conversations_str`, so duplicates were never removed.
        conversations_str = new_conversations_str
        print("\rRemaining items: {}".format(len(conversations_str)), end='')

    return filtered_data
|
|
|
def write_json_file(data, filename):
    """Serialize *data* to *filename* as pretty-printed, non-ASCII-safe JSON."""
    with open(filename, 'w', encoding='utf-8') as out:
        out.write(json.dumps(data, ensure_ascii=False, indent=4))
|
|
|
def main():
    """Read the raw dataset, drop near-duplicates, and write the result."""
    source = 'unique_data1.json'
    destination = 'filtered_data.json'

    records = read_json_file(source)
    print(f"Reading data completed. {len(records)} entries loaded.")

    kept = filter_conversations(records)
    print("Filtering completed.")

    write_json_file(kept, destination)
    print("Writing data completed.")
|
|
|
main() |