datas / test.py
taozi555's picture
Upload folder using huggingface_hub
1ea204c verified
raw
history blame
2.33 kB
import json
import math
from collections import Counter
def read_json_file(filename):
    """Parse the UTF-8 encoded JSON file at *filename* and return its content."""
    with open(filename, mode='r', encoding='utf-8') as handle:
        return json.load(handle)
def concatenate_conversations(conversations):
    """Join the 'content' of every turn with single spaces.

    Leading and trailing whitespace of the joined text is removed.
    """
    pieces = [turn['content'] for turn in conversations]
    joined = ' '.join(pieces)
    return joined.strip()
def cosine_similarity(tokensA, tokensB):
    """Return the cosine similarity of two token -> weight mappings.

    Tokens missing from one mapping count as weight 0 there. When either
    mapping has a zero squared norm, the integer 0 is returned instead of
    dividing by zero.
    """
    dot = norm_a_sq = norm_b_sq = 0.0
    vocabulary = set(tokensA.keys()) | set(tokensB.keys())
    for term in vocabulary:
        weight_a = tokensA.get(term, 0)
        weight_b = tokensB.get(term, 0)
        dot += weight_a * weight_b
        norm_a_sq += weight_a ** 2
        norm_b_sq += weight_b ** 2
    if not (norm_a_sq > 0 and norm_b_sq > 0):
        return 0
    return dot / (math.sqrt(norm_a_sq) * math.sqrt(norm_b_sq))
def filter_conversations(data, similarity_threshold=0.95):
    """Remove near-duplicate conversations, keeping one entry per group.

    Each item's turns are concatenated into a single string and tokenised
    on whitespace. Items are processed from the longest concatenated text
    to the shortest: the current longest item is the reference, and every
    remaining item whose token-count cosine similarity to it is at least
    ``similarity_threshold`` is treated as a duplicate of it. From each
    group of duplicates the item with the most turns is kept (the
    reference wins ties).

    Args:
        data: Iterable of dicts, each with a 'conversations' list whose
            elements carry a 'content' string.
        similarity_threshold: Similarity at or above which two items are
            considered duplicates. Defaults to 0.95.

    Returns:
        A list with the original dict of each group's representative, in
        the order the groups were processed.
    """
    candidates = []
    for item in data:
        content = concatenate_conversations(item['conversations'])
        candidates.append({
            'content': content,
            'original': item,
            # Tokenise once here; the counts are reused on every pass
            # instead of being rebuilt for each comparison.
            'tokens': Counter(content.split()),
        })
    candidates.sort(key=lambda entry: len(entry['content']), reverse=True)

    filtered_data = []
    while candidates:
        reference = candidates[0]
        reference_tokens = reference['tokens']
        # The representative may be swapped for a duplicate with more
        # turns, but similarity is always measured against the reference.
        representative = reference
        survivors = []
        for entry in candidates[1:]:
            similarity = cosine_similarity(entry['tokens'], reference_tokens)
            if similarity < similarity_threshold:
                survivors.append(entry)
                continue
            entry_turns = len(entry['original']['conversations'])
            kept_turns = len(representative['original']['conversations'])
            if entry_turns > kept_turns:
                representative = entry
        filtered_data.append(representative['original'])
        # Continue with only the non-duplicates. The original assigned the
        # list to itself here, so duplicates were never actually removed.
        candidates = survivors
        print("\rRemaining items: {}".format(len(candidates)), end='')
    return filtered_data
def write_json_file(data, filename):
    """Serialise *data* as JSON into *filename* (UTF-8).

    Output is indented by 4 spaces and non-ASCII characters are written
    as-is, not escaped.
    """
    with open(filename, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, indent=4, ensure_ascii=False)
def main(input_filename='unique_data1.json', output_filename='filtered_data.json'):
    """Run the de-duplication pipeline: load, filter, write.

    Args:
        input_filename: Path of the JSON file to read. Defaults to the
            originally hard-coded 'unique_data1.json'.
        output_filename: Path the filtered JSON is written to. Defaults to
            the originally hard-coded 'filtered_data.json'.
    """
    data = read_json_file(input_filename)
    print(f"Reading data completed. {len(data)} entries loaded.")
    filtered_data = filter_conversations(data)
    print("Filtering completed.")
    write_json_file(filtered_data, output_filename)
    print("Writing data completed.")
# Run the pipeline only when executed as a script, so that importing this
# module (e.g. to reuse the helpers) does not trigger file I/O.
if __name__ == "__main__":
    main()