audio_img / audio_to_text.py
pengdaqian
add more
ca13e69
import random
import librosa
import laion_clap
import torch
import numpy as np
from safetensors import safe_open
from transformers import pipeline
class AudioPipeline(object):
    """Turn an audio clip into a generated text prompt.

    The clip is embedded with a ``laion_clap`` model, compared against a set
    of pre-computed candidate caption embeddings, and the best-matching
    caption is used to seed a ``text-generation`` pipeline.
    """

    # Upper bound on how many best-matching captions are ranked and printed.
    _TOP_K = 10

    def __init__(self, audio_text_path, audio_text_embeddings_path):
        """Load the models and remember where the candidate data lives.

        Args:
            audio_text_path: Path to a JSON file with the candidate captions
                (presumably a list; it is indexed by embedding row number).
            audio_text_embeddings_path: Path to a safetensors file holding
                the caption embeddings under the key ``"text_embed"``.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = laion_clap.CLAP_Module(enable_fusion=False)
        self.model.load_ckpt()  # download the default pretrained checkpoint.
        self.audio_text_path = audio_text_path
        self.audio_text_embeddings_path = audio_text_embeddings_path
        self.gpt2_pipe = pipeline(
            'text-generation',
            model='Gustavosta/MagicPrompt-Stable-Diffusion',
            tokenizer='gpt2',
        )

    @staticmethod
    def _int16_to_float32(x):
        """Scale int16-range samples back to float32."""
        return (x / 32767.0).astype(np.float32)

    @staticmethod
    def _float32_to_int16(x):
        """Clip samples to [-1, 1] and quantize them to int16."""
        x = np.clip(x, a_min=-1., a_max=1.)
        return (x * 32767.).astype(np.int16)

    @staticmethod
    def _pick_response(responses, starting_text):
        """Return the first usable generated text, or the seed as a fallback.

        A generated text is usable when it differs from the seed, is more
        than four characters longer than it, and does not end with a dangling
        ``:``, ``-`` or ``—``.

        Args:
            responses: Iterable of dicts with a ``'generated_text'`` entry.
            starting_text: The seed text given to the generator.

        Returns:
            The first accepted (stripped) generated text. If none qualifies,
            the stripped seed text is returned instead of raising.
        """
        for item in responses:
            resp = item['generated_text'].strip()
            if (
                resp != starting_text
                and len(resp) > (len(starting_text) + 4)
                and not resp.endswith((":", "-", "—"))
            ):
                return resp
        return starting_text.strip()

    def audio_embedding(self, file_path):
        """Compute the model's audio embedding for one audio file.

        Args:
            file_path: Path of the audio file to load.

        Returns:
            The tensor returned by the model's
            ``get_audio_embedding_from_data`` for this single clip.
        """
        # Get audio embeddings from audio data
        audio_data, _ = librosa.load(file_path, sr=48000)  # sample rate should be 48000
        audio_data = audio_data.reshape(1, -1)  # Make it (1,T) or (N,T)
        # quantize before sending it in to the model
        quantized = self._int16_to_float32(self._float32_to_int16(audio_data))
        audio_tensor = torch.from_numpy(quantized).float()
        return self.model.get_audio_embedding_from_data(x=audio_tensor, use_tensor=True)

    def load_candidate_text(self):
        """Load the candidate captions and their embeddings from disk.

        Returns:
            A ``(texts, text_embed)`` tuple: the parsed JSON captions and the
            ``"text_embed"`` tensor loaded onto ``self.device``.

        Raises:
            KeyError: If the safetensors file has no ``"text_embed"`` tensor.
        """
        import json

        with open(self.audio_text_path, 'r', encoding='utf-8') as f:
            texts = json.load(f)
        with safe_open(self.audio_text_embeddings_path, framework="pt", device=self.device) as f:
            # Only this tensor is used, so do not materialize the others.
            if "text_embed" not in f.keys():
                raise KeyError("text_embed")
            text_embed = f.get_tensor("text_embed")
        return texts, text_embed

    def audio2txt(self, filepath):
        """Generate a text prompt describing the audio at ``filepath``.

        Args:
            filepath: Path of the audio file to describe.

        Returns:
            A generated prompt seeded with the best-matching caption, or the
            caption itself when no generated sequence is usable.

        Raises:
            ValueError: If there are no candidate captions to match against.
        """
        audio_embed = self.audio_embedding(filepath)
        texts, text_embed = self.load_candidate_text()
        if not texts:
            raise ValueError("no candidate texts loaded from %r" % (self.audio_text_path,))

        # The caption embeddings are loaded on self.device; make sure they sit
        # on the same device as the audio embedding before multiplying.
        text_embed = text_embed.to(audio_embed.device)
        result_tensor = torch.matmul(audio_embed, text_embed.transpose(0, 1))
        similarity_scores = torch.softmax(result_tensor, dim=1)

        # topk rejects k larger than the number of candidates.
        k = min(self._TOP_K, similarity_scores.shape[1])
        topk_scores, topk_indices = torch.topk(similarity_scores, k=k, dim=1)
        print(f"Top {k} similarity scores:", topk_scores)
        print(f"Top {k} sentence indices:", topk_indices)

        # Only the first row is read: a single clip is embedded per call.
        topk_sentences = [texts[idx].replace("The sounds of", "") for idx in topk_indices[0].tolist()]
        starting_text = topk_sentences[0]
        # NOTE(review): max_length is derived from the character count of the
        # seed, while the generator presumably counts tokens — confirm.
        response = self.gpt2_pipe(
            starting_text,
            max_length=(len(starting_text) + random.randint(60, 90)),
            num_return_sequences=4,
        )
        return self._pick_response(response, starting_text)
if __name__ == "__main__":
    # Do not bind the result to `pipeline`: that module-level name is the
    # transformers factory that AudioPipeline.__init__ calls, so rebinding it
    # would break any AudioPipeline constructed afterwards.
    audio_pipeline = AudioPipeline(
        audio_text_path='/root/autodl-tmp/dedup_audio_text_80.json',
        audio_text_embeddings_path='/root/autodl-tmp/audio_text_embeddings.safetensors',
    )
    # audio2txt returns a single generated prompt string.
    prompt = audio_pipeline.audio2txt('/root/autodl-tmp/下载.wav')
    print(prompt)