import json
import random

import librosa
import laion_clap
import numpy as np
import torch
from safetensors import safe_open
from transformers import pipeline


class AudioPipeline(object):
    def __init__(self, audio_text_path, audio_text_embeddings_path):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = laion_clap.CLAP_Module(enable_fusion=False)
        self.model.load_ckpt()  # download the default pretrained checkpoint
        self.audio_text_path = audio_text_path
        self.audio_text_embeddings_path = audio_text_embeddings_path
        self.gpt2_pipe = pipeline('text-generation',
                                  model='Gustavosta/MagicPrompt-Stable-Diffusion',
                                  tokenizer='gpt2')

    def audio_embedding(self, file_path):
        # Quantization helpers: round-tripping through int16 matches the
        # quantization CLAP saw during training.
        def int16_to_float32(x):
            return (x / 32767.0).astype(np.float32)

        def float32_to_int16(x):
            x = np.clip(x, a_min=-1., a_max=1.)
            return (x * 32767.).astype(np.int16)

        # Get audio embeddings from audio data; CLAP expects a 48 kHz sample rate.
        audio_data, _ = librosa.load(file_path, sr=48000)
        audio_data = audio_data.reshape(1, -1)  # make it (1, T); (N, T) also works
        # Quantize before sending it into the model.
        audio_data = torch.from_numpy(
            int16_to_float32(float32_to_int16(audio_data))).float()
        audio_embed = self.model.get_audio_embedding_from_data(x=audio_data, use_tensor=True)
        return audio_embed

    def load_candidate_text(self):
        with open(self.audio_text_path, 'r') as f:
            texts = json.load(f)
        tensors = {}
        with safe_open(self.audio_text_embeddings_path, framework="pt", device=self.device) as f:
            for k in f.keys():
                tensors[k] = f.get_tensor(k)
        text_embed = tensors["text_embed"]
        return texts, text_embed

    def audio2txt(self, filepath):
        audio_embed = self.audio_embedding(filepath)
        texts, text_embed = self.load_candidate_text()
        # Score every candidate caption against the audio embedding.
        result_tensor = torch.matmul(audio_embed, text_embed.transpose(0, 1))
        similarity_scores = torch.softmax(result_tensor, dim=1)
        topk_scores, topk_indices = torch.topk(similarity_scores, k=10, dim=1)
        print("Top 10 similarity scores:", topk_scores)
        print("Top 10 sentence indices:", topk_indices)

        topk_sentences = [texts[idx].replace("The sounds of", "")
                          for idx in topk_indices[0].tolist()]
        starting_text = topk_sentences[0]
        # Expand the best-matching caption into longer prompt candidates.
        response = self.gpt2_pipe(starting_text,
                                  max_length=(len(starting_text) + random.randint(60, 90)),
                                  num_return_sequences=4)

        response_list = []
        for x in response:
            resp = x['generated_text'].strip()
            if (resp != starting_text
                    and len(resp) > (len(starting_text) + 4)
                    and not resp.endswith((":", "-", "—"))):
                response_list.append(resp)
        # Fall back to the raw caption if every generation was filtered out.
        return response_list[0] if response_list else starting_text


if __name__ == "__main__":
    audio_pipeline = AudioPipeline(
        audio_text_path='/root/autodl-tmp/dedup_audio_text_80.json',
        audio_text_embeddings_path='/root/autodl-tmp/audio_text_embeddings.safetensors')
    texts = audio_pipeline.audio2txt('/root/autodl-tmp/下载.wav')
    print(texts)
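

# ---------------------------------------------------------------------------
# Sketch: one plausible way to precompute the safetensors file that
# load_candidate_text() reads. The pipeline above does not include this step,
# so the function name build_text_embeddings and the batching-free call are
# assumptions for illustration; only the "text_embed" key is dictated by the
# reader code above. Uses laion_clap's get_text_embedding and
# safetensors.torch.save_file, both part of the respective public APIs.
# ---------------------------------------------------------------------------
def build_text_embeddings(audio_text_path, out_path):
    from safetensors.torch import save_file

    model = laion_clap.CLAP_Module(enable_fusion=False)
    model.load_ckpt()  # same default pretrained checkpoint as AudioPipeline
    with open(audio_text_path, 'r') as f:
        texts = json.load(f)
    # One embedding per candidate caption, stored under the "text_embed" key
    # that load_candidate_text() expects.
    text_embed = model.get_text_embedding(texts, use_tensor=True)
    save_file({"text_embed": text_embed.cpu().contiguous()}, out_path)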