"""
An example of zero-shot audio classification with CLAP.
"""
from CLAPWrapper import CLAPWrapper
import torch.nn.functional as F
# Define the candidate classes for zero-shot classification.
# Class names should be lower case and may contain more than one word.
classes = ['coughing', 'sneezing', 'drinking sipping', 'breathing', 'brushing teeth']
ground_truth = ['coughing']
# Build a text prompt for each class by prepending a natural-language prefix
prompt = 'this is a sound of '
class_prompts = [prompt + x for x in classes]
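# e.g. class_prompts[0] == 'this is a sound of coughing'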
# List the audio files to classify ('audio_file' is a placeholder path)
audio_files = ['audio_file']
# Load and initialize CLAP ("weights_path" is a placeholder; point it at the downloaded weights)
weights_path = "weights_path"
# Setting use_cuda = True will load the model on a GPU using CUDA
clap_model = CLAPWrapper(weights_path, version='2023', use_cuda=False)
# Compute text embeddings from the natural-language class prompts
text_embeddings = clap_model.get_text_embeddings(class_prompts)
# Compute audio embeddings; resample=True resamples inputs to the model's expected sampling rate
audio_embeddings = clap_model.get_audio_embeddings(audio_files, resample=True)
# Compute the similarity between audio and text embeddings
similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
# Softmax over the class dimension turns similarity logits into probabilities
similarity = F.softmax(similarity, dim=1)
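# Rank the classes by probability (k=5 covers all five classes here)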
values, indices = similarity[0].topk(5)
# Print the results
print("Ground Truth: {}".format(ground_truth))
print("Top predictions:\n")
for value, index in zip(values, indices):
print(f"{classes[index]:>16s}: {100 * value.item():.2f}%")
"""
Expected output (the exact numbers may vary):
Ground Truth: coughing
Top predictions:

        coughing: 98.55%
        sneezing: 1.24%
drinking sipping: 0.15%
       breathing: 0.02%
  brushing teeth: 0.01%
"""