# CNPM / app.py
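# Gradio demo for Chinese pentatonic mode recognition: an uploaded recording is
# rendered as a spectrogram image, embedded, and classified into one of the five
# modes (Gong, Shang, Jue, Zhi, Yu) by an EvalNet model.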
import os
import torch
import random
import shutil
import librosa
import warnings
import numpy as np
import gradio as gr
import librosa.display
import matplotlib.pyplot as plt
from utils import get_modelist, find_audio_files, embed_img
from model import EvalNet
CLASSES = ["Gong", "Shang", "Jue", "Zhi", "Yu"]
TEMP_DIR = "./__pycache__/tmp"
SAMPLE_RATE = 44100
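
# Pad a waveform with trailing zeros up to `end` samples, or trim it to its
# last `end` samples if it is longer.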
def zero_padding(y: np.ndarray, end: int):
    size = len(y)
    if size < end:
        return np.concatenate((y, np.zeros(end - size)))
    elif size > end:
        return y[-end:]
    return y
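
# Render a log-Mel spectrogram of the recording and save it as
# {TEMP_DIR}/output.jpg for the classifier to read.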
def audio2mel(audio_path: str, seg_len=20):
    os.makedirs(TEMP_DIR, exist_ok=True)
    try:
        y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
        y = zero_padding(y, seg_len * sr)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        librosa.display.specshow(log_mel_spec)
        plt.axis("off")
        plt.savefig(
            f"{TEMP_DIR}/output.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
        plt.close()
    except Exception as e:
        print(f"Error converting {audio_path} : {e}")
def audio2cqt(audio_path: str, seg_len=20):
    os.makedirs(TEMP_DIR, exist_ok=True)
    try:
        y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
        y = zero_padding(y, seg_len * sr)
        cqt_spec = librosa.cqt(y=y, sr=sr)
        log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
        librosa.display.specshow(log_cqt_spec)
        plt.axis("off")
        plt.savefig(
            f"{TEMP_DIR}/output.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
        plt.close()
    except Exception as e:
        print(f"Error converting {audio_path} : {e}")
def audio2chroma(audio_path: str, seg_len=20):
    os.makedirs(TEMP_DIR, exist_ok=True)
    try:
        y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
        y = zero_padding(y, seg_len * sr)
        chroma_spec = librosa.feature.chroma_stft(y=y, sr=sr)
        log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
        librosa.display.specshow(log_chroma_spec)
        plt.axis("off")
        plt.savefig(
            f"{TEMP_DIR}/output.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
        plt.close()
    except Exception as e:
        print(f"Error converting {audio_path} : {e}")
def infer(wav_path: str, log_name: str, folder_path=TEMP_DIR):
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)

    if not wav_path:
        return None, "Please upload an audio file!"

    try:
        model = EvalNet(log_name, len(CLASSES)).model
    except Exception as e:
        return None, f"{e}"

    # The spectrogram type (mel / cqt / chroma) is encoded in the log name
    spec = log_name.split("_")[-3]
    eval(f"audio2{spec}")(wav_path)
    input_tensor = embed_img(f"{folder_path}/output.jpg")
    output: torch.Tensor = model(input_tensor)
    pred_id = torch.max(output.data, 1)[1]
    return (
        os.path.basename(wav_path),
        CLASSES[pred_id].capitalize(),
    )
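
# Build the Gradio demo: each example pairs a sample audio file with a randomly
# chosen model from the checkpoint list returned by get_modelist().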
if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    models = get_modelist()
    examples = []
    example_audios = find_audio_files()
    model_num = len(models)
    for audio in example_audios:
        examples.append([audio, models[random.randint(0, model_num - 1)]])

    with gr.Blocks() as demo:
        gr.Interface(
            fn=infer,
            inputs=[
                gr.Audio(label="Upload a recording", type="filepath"),
                gr.Dropdown(choices=models, label="Select a model", value=models[0]),
            ],
            outputs=[
                gr.Textbox(label="Audio filename", show_copy_button=True),
                gr.Textbox(
                    label="Chinese pentatonic mode recognition",
                    show_copy_button=True,
                ),
            ],
            examples=examples,
            cache_examples=False,
            flagging_mode="never",
            title="It is recommended to keep the recording length around 20s.",
        )
        gr.Markdown(
            """
# Cite
```bibtex
@dataset{zhaorui_liu_2021_5676893,
  author    = {Monan Zhou, Shenyang Xu, Zhaorui Liu, Zhaowen Wang, Feng Yu, Wei Li and Baoqiang Han},
  title     = {CCMusic: an Open and Diverse Database for Chinese Music Information Retrieval Research},
  month     = {mar},
  year      = {2024},
  publisher = {HuggingFace},
  version   = {1.2},
  url       = {https://huggingface.co/ccmusic-database}
}
```"""
        )

    demo.launch()