File size: 2,121 Bytes
171f55b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from sentence_transformers import util
import json
from safetensors.numpy import load_file
import numpy as np
import torch
from safetensors.torch import save_file


def dedup_similar_sentences(sentences, threshold=0.9, batch_size=64):
    file = load_file("/root/autodl-tmp/dedup_audio_text.safetensors")
    sentence_embeddings = torch.tensor(file['text_embed']).cuda()

    batch_idx = 0
    while batch_idx * batch_size < len(sentences):
        start_idx = batch_idx * batch_size
        end_idx = min((batch_idx + 1) * batch_size, len(sentences))

        batch_embeddings = sentence_embeddings[start_idx:end_idx]
        cosine_scores = torch.matmul(batch_embeddings, sentence_embeddings.T) / (
                torch.norm(batch_embeddings, dim=1)[:, None] *
                torch.norm(sentence_embeddings, dim=1))

        duplicate_indices = torch.where((cosine_scores > threshold) & (cosine_scores < 1.0))
        duplicate_indices_list = duplicate_indices[1].tolist()
        remove_idx = set(duplicate_indices_list)

        # Update sentences and sentence_embeddings to remove duplicates
        keep_indices = [idx for idx in range(len(sentences)) if idx not in remove_idx]
        sentences = [sentences[idx] for idx in keep_indices]
        sentence_embeddings = sentence_embeddings[keep_indices]

        print(len(sentences))
        print(sentence_embeddings.shape)
        # Update batch_idx accordingly
        batch_idx = start_idx // batch_size + 1

    uq_sentences = sentences

    return uq_sentences, sentence_embeddings


def read_default_prompt():
    import json
    with open('/root/autodl-tmp/dedup_audio_text.json', 'r') as f:
        data = json.load(f)
    return data


if __name__ == '__main__':
    all_texts = read_default_prompt()
    unique_sentences, unique_embeddings = dedup_similar_sentences(all_texts, threshold=0.8)
    with open("/root/autodl-tmp/dedup_audio_text_80.json", "w") as outfile:
        json.dump(unique_sentences, outfile)

    tensors = {
        "text_embed": unique_embeddings,
    }
    save_file(tensors, "/root/autodl-tmp/dedup_audio_text_80.safetensors")