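"""Audio-to-prompt pipeline: retrieve the CLAP caption closest to an input audio
clip, then expand it into a Stable Diffusion-style prompt with MagicPrompt (GPT-2)."""
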
import json
import random

import librosa
import laion_clap
import numpy as np
import torch
from safetensors import safe_open
from transformers import pipeline


class AudioPipeline:
    """Map an audio clip to a text prompt via CLAP retrieval plus GPT-2 expansion."""

    def __init__(self, audio_text_path, audio_text_embeddings_path):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = laion_clap.CLAP_Module(enable_fusion=False)
        self.model.load_ckpt()  # Download and load the default pretrained CLAP checkpoint.
        self.audio_text_path = audio_text_path
        self.audio_text_embeddings_path = audio_text_embeddings_path
        # MagicPrompt is a GPT-2 model fine-tuned to expand short seeds into Stable Diffusion prompts.
        self.gpt2_pipe = pipeline('text-generation', model='Gustavosta/MagicPrompt-Stable-Diffusion', tokenizer='gpt2')

    def audio_embedding(self, file_path):
        # Round-trip the waveform through int16 to apply the same quantization
        # the pretrained CLAP checkpoint expects.
        def int16_to_float32(x):
            return (x / 32767.0).astype(np.float32)

        def float32_to_int16(x):
            x = np.clip(x, a_min=-1., a_max=1.)
            return (x * 32767.).astype(np.int16)

        # CLAP expects 48 kHz audio shaped (N, T); reshape the mono clip to (1, T).
        audio_data, _ = librosa.load(file_path, sr=48000)
        audio_data = audio_data.reshape(1, -1)
        audio_data = torch.from_numpy(
            int16_to_float32(float32_to_int16(audio_data))).float()
        audio_embed = self.model.get_audio_embedding_from_data(x=audio_data, use_tensor=True)
        return audio_embed

    def load_candidate_text(self):
        # Candidate captions; row i of the embedding matrix corresponds to texts[i].
        with open(self.audio_text_path, 'r') as f:
            texts = json.load(f)

        # Load the precomputed CLAP text embeddings stored under the "text_embed" key.
        tensors = {}
        with safe_open(self.audio_text_embeddings_path, framework="pt", device=self.device) as f:
            for k in f.keys():
                tensors[k] = f.get_tensor(k)
        text_embed = tensors["text_embed"]
        return texts, text_embed

    def audio2txt(self, filepath):
        audio_embed = self.audio_embedding(filepath)
        texts, text_embed = self.load_candidate_text()
        # Score the audio embedding against every candidate caption (dot product),
        # softmax the scores, and keep the 10 best matches.
        result_tensor = torch.matmul(audio_embed, text_embed.transpose(0, 1))
        similarity_scores = torch.softmax(result_tensor, dim=1)
        topk_scores, topk_indices = torch.topk(similarity_scores, k=10, dim=1)
        print("Top 10 similarity scores:", topk_scores)
        print("Top 10 sentence indices:", topk_indices)

        topk_sentences = [texts[idx].replace("The sounds of", "") for idx in topk_indices[0].tolist()]
        # Expand the best-matching caption into a longer prompt with GPT-2. Note that
        # max_length counts tokens; the seed's character length is only a rough budget.
        starting_text = topk_sentences[0]
        response = self.gpt2_pipe(starting_text, max_length=(len(starting_text) + random.randint(60, 90)),
                                  num_return_sequences=4)
        # Keep generations that actually extend the seed and do not end mid-clause.
        response_list = []
        for x in response:
            resp = x['generated_text'].strip()
            if resp != starting_text and len(resp) > len(starting_text) + 4 and not resp.endswith((":", "-", "—")):
                response_list.append(resp)
        # Fall back to the seed caption if every generation was filtered out.
        return response_list[0] if response_list else starting_text
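

# A minimal sketch (not part of the original pipeline) of how the candidate-text
# embeddings file read by load_candidate_text() could be precomputed. It assumes
# audio_text_path holds a flat JSON list of caption strings; get_text_embedding
# and safetensors.torch.save_file are documented laion_clap / safetensors APIs,
# but the helper itself is hypothetical.
def build_text_embeddings(audio_text_path, out_path):
    from safetensors.torch import save_file

    with open(audio_text_path, 'r') as f:
        texts = json.load(f)
    model = laion_clap.CLAP_Module(enable_fusion=False)
    model.load_ckpt()
    # One embedding row per caption, saved under the "text_embed" key that
    # load_candidate_text() expects.
    text_embed = model.get_text_embedding(texts, use_tensor=True)
    save_file({"text_embed": text_embed.detach().cpu().contiguous()}, out_path)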


if __name__ == "__main__":
    # Renamed from `pipeline` to avoid shadowing transformers.pipeline.
    audio_pipeline = AudioPipeline(audio_text_path='/root/autodl-tmp/dedup_audio_text_80.json',
                                   audio_text_embeddings_path='/root/autodl-tmp/audio_text_embeddings.safetensors')
    prompt = audio_pipeline.audio2txt('/root/autodl-tmp/下载.wav')
    print(prompt)