# NOTE(review): the lines "Spaces:" / "Build error" above this script were
# web-page scrape artifacts (Hugging Face Spaces status badges), not code.
"""
Example: zero-shot audio classification with CLAP.

Computes text embeddings for a set of class prompts and audio embeddings
for a list of audio files, then ranks the classes for the first file by
softmax-normalized audio-text similarity.
"""
from CLAPWrapper import CLAPWrapper
import torch.nn.functional as F

# Candidate classes for zero-shot classification.
# Should be lower case and may contain more than one word.
classes = ['coughing', 'sneezing', 'drinking sipping', 'breathing', 'brushing teeth']
ground_truth = ['coughing']

# Build natural-language prompts from the class names.
prompt = 'this is a sound of '
class_prompts = [prompt + x for x in classes]

# Audio files to classify (replace 'audio_file' with a real path).
audio_files = ['audio_file']

# Load and initialize CLAP.
# Setting use_cuda=True will load the model on a GPU using CUDA.
weights_path = "weights_path"
clap_model = CLAPWrapper(weights_path, version='2023', use_cuda=False)

# Compute text embeddings from the natural-language prompts.
text_embeddings = clap_model.get_text_embeddings(class_prompts)

# Compute audio embeddings from the audio files (resampled by the wrapper).
audio_embeddings = clap_model.get_audio_embeddings(audio_files, resample=True)

# Similarity between audio and text embeddings, softmax over the classes.
similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
similarity = F.softmax(similarity, dim=1)

# Rank every class (not a hard-coded 5) for the first audio file.
values, indices = similarity[0].topk(len(classes))

# Print the results. Join the ground-truth list so the output matches the
# documented example ("Ground Truth: coughing", not "['coughing']").
print("Ground Truth: {}".format(', '.join(ground_truth)))
print("Top predictions:\n")
for value, index in zip(values, indices):
    print(f"{classes[index]:>16s}: {100 * value.item():.2f}%")
"""
The output (the exact numbers may vary):

Ground Truth: coughing
Top predictions:

        coughing: 98.55%
        sneezing: 1.24%
drinking sipping: 0.15%
       breathing: 0.02%
  brushing teeth: 0.01%
"""