# NOTE: scraped page banner ("Spaces: Build error") — hosting status text, not part of the program.
import json
import random

import laion_clap
import librosa
import numpy as np
import torch
from safetensors import safe_open
from transformers import pipeline
class AudioPipeline:
    """Turn an audio clip into a text prompt.

    The clip is embedded with a CLAP model, matched against a set of
    pre-computed candidate text embeddings, and the best-matching caption is
    expanded by a GPT-2 text-generation pipeline.
    """

    def __init__(self, audio_text_path, audio_text_embeddings_path):
        """Load the CLAP model and the prompt-generation pipeline.

        Args:
            audio_text_path: Path to a JSON file holding the candidate texts.
            audio_text_embeddings_path: Path to a safetensors file that
                contains a ``"text_embed"`` tensor for those texts.
        """
        # Device used when loading the candidate text embeddings.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = laion_clap.CLAP_Module(enable_fusion=False)
        self.model.load_ckpt()  # download the default pretrained checkpoint.
        self.audio_text_path = audio_text_path
        self.audio_text_embeddings_path = audio_text_embeddings_path
        self.gpt2_pipe = pipeline(
            'text-generation',
            model='Gustavosta/MagicPrompt-Stable-Diffusion',
            tokenizer='gpt2',
        )

    @staticmethod
    def _float32_to_int16(x):
        """Clip *x* to [-1, 1] and scale it to the int16 range."""
        x = np.clip(x, a_min=-1., a_max=1.)
        return (x * 32767.).astype(np.int16)

    @staticmethod
    def _int16_to_float32(x):
        """Scale int16 samples back to float32."""
        return (x / 32767.0).astype(np.float32)

    @staticmethod
    def _rank_candidates(audio_embed, text_embed, texts, k=10):
        """Return up to *k* candidate texts ordered by similarity to the audio.

        Args:
            audio_embed: Audio embedding; row 0 is the clip being ranked.
            text_embed: Candidate embeddings, one row per entry of *texts*
                (assumed to be index-aligned with *texts* — TODO confirm).
            texts: Candidate captions, indexable by integer position.
            k: Maximum number of captions to return.

        Returns:
            The best-matching captions with ``"The sounds of"`` removed.

        Raises:
            ValueError: If there are no candidate embeddings to rank.
        """
        if text_embed.shape[0] == 0:
            raise ValueError("no candidate text embeddings to rank against")
        # The CLAP module chooses its own device, which is not tied to
        # self.device here; align the operands so matmul cannot fail on a
        # device mismatch.
        text_embed = text_embed.to(audio_embed.device)
        # Similarity of the audio embedding against every candidate text.
        result_tensor = torch.matmul(audio_embed, text_embed.transpose(0, 1))
        similarity_scores = torch.softmax(result_tensor, dim=1)
        # topk raises when k exceeds the number of candidates.
        k = min(k, similarity_scores.shape[1])
        topk_scores, topk_indices = torch.topk(similarity_scores, k=k, dim=1)
        print("Top 10 similarity scores:", topk_scores)
        print("Top 10 sentence indices:", topk_indices)
        return [texts[idx].replace("The sounds of", "") for idx in topk_indices[0].tolist()]

    @staticmethod
    def _select_response(responses, starting_text):
        """Pick the first usable generated prompt.

        A generation is usable when it differs from *starting_text*, adds
        more than four characters to it, and does not end in a dangling
        ``:``, ``-`` or ``—``. When none qualifies, the stripped
        *starting_text* is returned instead of raising ``IndexError``.
        """
        for item in responses:
            resp = item['generated_text'].strip()
            if (resp != starting_text
                    and len(resp) > (len(starting_text) + 4)
                    and not resp.endswith((":", "-", "—"))):
                return resp
        return starting_text.strip()

    def audio_embedding(self, file_path):
        """Return the CLAP embedding of the audio file at *file_path*."""
        # Get audio embeddings from audio data
        audio_data, _ = librosa.load(file_path, sr=48000)  # sample rate should be 48000
        audio_data = audio_data.reshape(1, -1)  # Make it (1,T) or (N,T)
        # quantize before send it in to the model
        quantized = self._int16_to_float32(self._float32_to_int16(audio_data))
        audio_tensor = torch.from_numpy(quantized).float()
        return self.model.get_audio_embedding_from_data(x=audio_tensor, use_tensor=True)

    def load_candidate_text(self):
        """Load the candidate texts and their embeddings from disk.

        Returns:
            A ``(texts, text_embed)`` tuple: the parsed JSON content and the
            ``"text_embed"`` tensor loaded onto ``self.device``.

        Raises:
            KeyError: If the safetensors file has no ``"text_embed"`` entry.
        """
        with open(self.audio_text_path, 'r', encoding='utf-8') as f:
            texts = json.load(f)
        with safe_open(self.audio_text_embeddings_path, framework="pt", device=self.device) as f:
            # Only this one tensor is needed; do not materialize the others.
            if "text_embed" not in f.keys():
                raise KeyError("text_embed")
            text_embed = f.get_tensor("text_embed")
        return texts, text_embed

    def audio2txt(self, filepath):
        """Generate a text prompt describing the audio file at *filepath*.

        Returns:
            The first acceptable generated prompt, or the best-matching
            candidate caption when no generation passes the filter.
        """
        # Inference only: no gradients are needed for the ranking step.
        with torch.no_grad():
            audio_embed = self.audio_embedding(filepath)
            texts, text_embed = self.load_candidate_text()
            top_sentences = self._rank_candidates(audio_embed, text_embed, texts)
        starting_text = top_sentences[0]
        # NOTE(review): len() counts characters while generation max_length is
        # normally measured in tokens — confirm this budget is intended.
        response = self.gpt2_pipe(
            starting_text,
            max_length=(len(starting_text) + random.randint(60, 90)),
            num_return_sequences=4,
        )
        return self._select_response(response, starting_text)
if __name__ == "__main__":
    # Demo run against local files. The instance is deliberately not named
    # `pipeline`: that would rebind the module-level `transformers.pipeline`
    # factory that AudioPipeline.__init__ looks up when it is called.
    audio_pipeline = AudioPipeline(
        audio_text_path='/root/autodl-tmp/dedup_audio_text_80.json',
        audio_text_embeddings_path='/root/autodl-tmp/audio_text_embeddings.safetensors',
    )
    prompt = audio_pipeline.audio2txt('/root/autodl-tmp/下载.wav')
    print(prompt)