# bel_canto/app.py
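"""Gradio demo that classifies a vocal recording as bel canto or folk singing, sung by a male or female voice."""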
import os
import torch
import shutil
import librosa
import warnings
import numpy as np
import gradio as gr
import librosa.display
import matplotlib.pyplot as plt
from collections import Counter
from model import EvalNet
from utils import get_modelist, find_wav_files, embed_img
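
# Four target classes: singing method (bel canto vs. folk singing) crossed with singer gender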
TRANSLATE = {
"m_bel": "Bel Canto, Male",
"f_bel": "Bel Canto, Female",
"m_folk": "Folk Singing, Male",
"f_folk": "Folk Singing, Female",
}
CLASSES = list(TRANSLATE.keys())
TEMP_DIR = "./__pycache__/tmp"  # scratch directory for intermediate spectrogram images
SAMPLE_RATE = 22050  # all audio is resampled to 22.05 kHz before feature extraction


def wav2mel(audio_path: str, width=1.6, topdb=40):
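    """Render a WAV file as log-Mel spectrogram slices saved as JPEGs in TEMP_DIR.

    Passages more than `topdb` dB below the signal peak are trimmed as silence,
    and the spectrogram is cut into windows of roughly `width` seconds,
    one image per window.
    """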
y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
non_silents = librosa.effects.split(y, top_db=topdb)
non_silent = np.concatenate([y[start:end] for start, end in non_silents])
mel_spec = librosa.feature.melspectrogram(y=non_silent, sr=sr)
log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
dur = librosa.get_duration(y=non_silent, sr=sr)
total_frames = log_mel_spec.shape[1]
step = int(width * total_frames / dur)
count = int(total_frames / step)
begin = int(0.5 * (total_frames - count * step))
end = begin + step * count
for i in range(begin, end, step):
librosa.display.specshow(log_mel_spec[:, i : i + step])
plt.axis("off")
plt.savefig(
f"{TEMP_DIR}/mel_{round(dur, 2)}_{i}.jpg",
bbox_inches="tight",
pad_inches=0.0,
)
plt.close()


def wav2cqt(audio_path: str, width=1.6, topdb=40):
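    """Render a WAV file as log-power CQT slices saved as JPEGs in TEMP_DIR.

    Same slicing scheme as wav2mel: silence is trimmed, then the constant-Q
    transform is split into windows of roughly `width` seconds.
    """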
y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
non_silents = librosa.effects.split(y, top_db=topdb)
non_silent = np.concatenate([y[start:end] for start, end in non_silents])
cqt_spec = librosa.cqt(y=non_silent, sr=sr)
log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
dur = librosa.get_duration(y=non_silent, sr=sr)
total_frames = log_cqt_spec.shape[1]
step = int(width * total_frames / dur)
count = int(total_frames / step)
begin = int(0.5 * (total_frames - count * step))
end = begin + step * count
for i in range(begin, end, step):
librosa.display.specshow(log_cqt_spec[:, i : i + step])
plt.axis("off")
plt.savefig(
f"{TEMP_DIR}/cqt_{round(dur, 2)}_{i}.jpg",
bbox_inches="tight",
pad_inches=0.0,
)
plt.close()


def wav2chroma(audio_path: str, width=1.6, topdb=40):
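    """Render a WAV file as chromagram slices saved as JPEGs in TEMP_DIR.

    Same slicing scheme as wav2mel, applied to STFT-based chroma features.
    """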
y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
non_silents = librosa.effects.split(y, top_db=topdb)
non_silent = np.concatenate([y[start:end] for start, end in non_silents])
chroma_spec = librosa.feature.chroma_stft(y=non_silent, sr=sr)
log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
dur = librosa.get_duration(y=non_silent, sr=sr)
total_frames = log_chroma_spec.shape[1]
step = int(width * total_frames / dur)
count = int(total_frames / step)
begin = int(0.5 * (total_frames - count * step))
end = begin + step * count
for i in range(begin, end, step):
librosa.display.specshow(log_chroma_spec[:, i : i + step])
plt.axis("off")
plt.savefig(
f"{TEMP_DIR}/chroma_{round(dur, 2)}_{i}.jpg",
bbox_inches="tight",
pad_inches=0.0,
)
plt.close()


def most_common_element(input_list: list):
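    """Return the most frequent element of input_list."""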
counter = Counter(input_list)
mce, _ = counter.most_common(1)[0]
return mce


def infer(wav_path: str, log_name: str, folder_path=TEMP_DIR):
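    """Classify a recording and return (audio filename, predicted singing method).

    Each spectrogram slice is classified independently and the majority vote
    over all slices gives the final label; on failure, (None, message) is returned.
    """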
if os.path.exists(folder_path):
shutil.rmtree(folder_path)
if not wav_path:
return None, "Please input an audio!"
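    # The spectrogram type (mel / cqt / chroma) is encoded in the model log name
    # and selects the matching wav2* converter below.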
spec = log_name.split("_")[-3]
os.makedirs(folder_path, exist_ok=True)
try:
model = EvalNet(log_name, len(TRANSLATE)).model
eval("wav2%s" % spec)(wav_path)
except Exception as e:
return None, f"{e}"
outputs = []
all_files = os.listdir(folder_path)
for file_name in all_files:
if file_name.lower().endswith(".jpg"):
file_path = os.path.join(folder_path, file_name)
            image = embed_img(file_path)
            output: torch.Tensor = model(image)
pred_id = torch.max(output.data, 1)[1]
outputs.append(int(pred_id))
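    # Majority vote over the per-slice predictions decides the final label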
max_count_item = most_common_element(outputs)
shutil.rmtree(folder_path)
return os.path.basename(wav_path), TRANSLATE[CLASSES[max_count_item]]


if __name__ == "__main__":
warnings.filterwarnings("ignore")
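    # Collect the available checkpoints and pair each bundled example WAV with the default model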
models = get_modelist(assign_model="GoogleNet_mel")
    example_wavs = find_wav_files()
    examples = [[wav, models[0]] for wav in example_wavs]
with gr.Blocks() as demo:
gr.Interface(
fn=infer,
inputs=[
                gr.Audio(label="Upload a recording (silence more than 40 dB below peak is trimmed)", type="filepath"),
gr.Dropdown(choices=models, label="Select a model", value=models[0]),
],
outputs=[
gr.Textbox(label="Audio filename", show_copy_button=True),
gr.Textbox(label="Singing method recognition", show_copy_button=True),
],
examples=examples,
cache_examples=False,
allow_flagging="never",
title="It is recommended to keep the recording length around 5s, too long will affect the recognition efficiency.",
)
gr.Markdown(
"""
# Cite
```bibtex
@article{Zhou-2025,
author = {Monan Zhou and Shenyang Xu and Zhaorui Liu and Zhaowen Wang and Feng Yu and Wei Li and Baoqiang Han},
title = {CCMusic: An Open and Diverse Database for Chinese Music Information Retrieval Research},
journal = {Transactions of the International Society for Music Information Retrieval},
volume = {8},
number = {1},
pages = {22--38},
month = {Mar},
year = {2025},
url = {https://doi.org/10.5334/tismir.194},
doi = {10.5334/tismir.194}
}
```"""
)
demo.launch()