Spaces:

next-social
/

audio_img

Build error

audio_img / audio_to_text.py

pengdaqian

add more

ca13e69 over 1 year ago

3.37 kB

	import random

	import librosa
	import laion_clap
	import torch
	import numpy as np
	from safetensors import safe_open
	from transformers import pipeline


	class AudioPipeline(object):
	    """Turn an audio clip into a text prompt.

	    The clip is embedded with a LAION-CLAP model, matched against a set of
	    pre-computed candidate text embeddings, and the best matching sentence is
	    handed to a GPT-2 text-generation pipeline that expands it into a prompt.
	    """

	    def __init__(self, audio_text_path, audio_text_embeddings_path):
	        """Load the CLAP model and the text-generation pipeline.

	        Args:
	            audio_text_path: Path to a JSON file holding the candidate
	                sentences (indexed by integer position in ``audio2txt``).
	            audio_text_embeddings_path: Path to a safetensors file that
	                contains a ``"text_embed"`` tensor.
	        """
	        self.device = "cuda" if torch.cuda.is_available() else "cpu"
	        self.model = laion_clap.CLAP_Module(enable_fusion=False)
	        self.model.load_ckpt()  # download the default pretrained checkpoint.
	        self.audio_text_path = audio_text_path
	        self.audio_text_embeddings_path = audio_text_embeddings_path
	        self.gpt2_pipe = pipeline(
	            'text-generation',
	            model='Gustavosta/MagicPrompt-Stable-Diffusion',
	            tokenizer='gpt2',
	        )

	    def audio_embedding(self, file_path):
	        """Return the CLAP audio embedding for the audio file at ``file_path``.

	        The waveform is loaded at 48 kHz, reshaped to ``(1, T)`` and passed
	        through an int16 round trip before it is given to the model.
	        """

	        # quantization
	        def int16_to_float32(x):
	            return (x / 32767.0).astype(np.float32)

	        def float32_to_int16(x):
	            x = np.clip(x, a_min=-1., a_max=1.)
	            return (x * 32767.).astype(np.int16)

	        # Get audio embeddings from audio data
	        audio_data, _ = librosa.load(file_path, sr=48000)  # sample rate should be 48000
	        audio_data = audio_data.reshape(1, -1)  # Make it (1,T) or (N,T)
	        # quantize before sending it in to the model
	        audio_data = torch.from_numpy(
	            int16_to_float32(float32_to_int16(audio_data))).float()
	        audio_embed = self.model.get_audio_embedding_from_data(x=audio_data, use_tensor=True)
	        return audio_embed

	    def load_candidate_text(self):
	        """Load the candidate sentences and their embeddings from disk.

	        Returns:
	            A ``(texts, text_embed)`` tuple: the parsed JSON content of
	            ``self.audio_text_path`` and the ``"text_embed"`` tensor read
	            from ``self.audio_text_embeddings_path`` onto ``self.device``.

	        Raises:
	            KeyError: If the safetensors file has no ``"text_embed"`` entry.
	        """
	        import json

	        # Explicit encoding so the result does not depend on the platform locale.
	        with open(self.audio_text_path, 'r', encoding='utf-8') as f:
	            texts = json.load(f)

	        with safe_open(self.audio_text_embeddings_path, framework="pt", device=self.device) as f:
	            if "text_embed" not in f.keys():
	                raise KeyError("text_embed")
	            # Only this tensor is used, so do not load the others onto the device.
	            text_embed = f.get_tensor("text_embed")
	        return texts, text_embed

	    def audio2txt(self, filepath):
	        """Generate a text prompt describing the audio file at ``filepath``.

	        The best matching candidate sentence (with ``"The sounds of"``
	        removed) seeds the text-generation pipeline. The first generated
	        sequence that actually extends the seed and does not end in a
	        dangling ``:``, ``-`` or ``—`` is returned; if none qualifies, the
	        seed sentence itself is returned.

	        Raises:
	            ValueError: If there are no candidate texts to match against.
	        """
	        audio_embed = self.audio_embedding(filepath)
	        texts, text_embed = self.load_candidate_text()
	        if not texts:
	            raise ValueError("no candidate texts available to match against")

	        # One row of similarity logits per audio clip, one column per candidate.
	        # NOTE(review): assumes both embeddings are 2-D (N, D) tensors - confirm.
	        result_tensor = torch.matmul(audio_embed, text_embed.transpose(0, 1))
	        similarity_scores = torch.softmax(result_tensor, dim=1)
	        # torch.topk raises when k exceeds the number of candidates, so clamp it.
	        top_k = min(10, similarity_scores.shape[1])
	        topk_scores, topk_indices = torch.topk(similarity_scores, k=top_k, dim=1)
	        print("Top 10 similarity scores:", topk_scores)
	        print("Top 10 sentence indices:", topk_indices)

	        # Strip the whitespace left behind by removing the prefix: generated
	        # texts are stripped below, so the seed must be too for the
	        # "did the model add anything" comparison to be meaningful.
	        topK_sentences = [
	            texts[idx].replace("The sounds of", "").strip()
	            for idx in topk_indices[0].tolist()
	        ]
	        starting_text = topK_sentences[0]
	        response = self.gpt2_pipe(
	            starting_text,
	            max_length=(len(starting_text) + random.randint(60, 90)),
	            num_return_sequences=4,
	        )

	        min_length = len(starting_text) + 4
	        for x in response:
	            resp = x['generated_text'].strip()
	            if resp == starting_text or len(resp) <= min_length:
	                continue  # the model added (almost) nothing to the seed
	            if resp.endswith((":", "-", "—")):
	                continue  # generation stopped on a dangling separator
	            return resp
	        # Every generation was rejected: fall back to the matched sentence
	        # instead of failing with an IndexError.
	        return starting_text


	if __name__ == "__main__":
	    # Example run: build the pipeline from local files and print the prompt
	    # generated for one audio clip.
	    # The instance is deliberately not named `pipeline`: that would rebind the
	    # module-level `transformers.pipeline` function that AudioPipeline.__init__
	    # looks up when it is called.
	    audio_pipeline = AudioPipeline(
	        audio_text_path='/root/autodl-tmp/dedup_audio_text_80.json',
	        audio_text_embeddings_path='/root/autodl-tmp/audio_text_embeddings.safetensors',
	    )
	    prompt = audio_pipeline.audio2txt('/root/autodl-tmp/下载.wav')
	    print(prompt)