import os
import shutil
import warnings
from collections import Counter

import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import torch

from model import EvalNet
from utils import embed_img, find_wav_files, get_modelist

TRANSLATE = {
    "m_bel": "Bel Canto, Male",
    "f_bel": "Bel Canto, Female",
    "m_folk": "Folk Singing, Male",
    "f_folk": "Folk Singing, Female",
}
CLASSES = list(TRANSLATE.keys())
TEMP_DIR = "./__pycache__/tmp"
SAMPLE_RATE = 22050


def wav2mel(audio_path: str, width=1.6, topdb=40):
    """Slice the log-mel spectrogram of the non-silent audio into fixed-width chunks saved as JPEGs."""
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    non_silents = librosa.effects.split(y, top_db=topdb)
    non_silent = np.concatenate([y[start:end] for start, end in non_silents])
    mel_spec = librosa.feature.melspectrogram(y=non_silent, sr=sr)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    dur = librosa.get_duration(y=non_silent, sr=sr)
    total_frames = log_mel_spec.shape[1]
    # Each chunk covers roughly `width` seconds of audio; chunks are centered within the clip.
    step = int(width * total_frames / dur)
    count = int(total_frames / step)
    begin = int(0.5 * (total_frames - count * step))
    end = begin + step * count
    for i in range(begin, end, step):
        librosa.display.specshow(log_mel_spec[:, i : i + step])
        plt.axis("off")
        plt.savefig(
            f"{TEMP_DIR}/mel_{round(dur, 2)}_{i}.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
        plt.close()


def wav2cqt(audio_path: str, width=1.6, topdb=40):
    """Slice the log-power CQT spectrogram of the non-silent audio into fixed-width chunks saved as JPEGs."""
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    non_silents = librosa.effects.split(y, top_db=topdb)
    non_silent = np.concatenate([y[start:end] for start, end in non_silents])
    cqt_spec = librosa.cqt(y=non_silent, sr=sr)
    log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
    dur = librosa.get_duration(y=non_silent, sr=sr)
    total_frames = log_cqt_spec.shape[1]
    step = int(width * total_frames / dur)
    count = int(total_frames / step)
    begin = int(0.5 * (total_frames - count * step))
    end = begin + step * count
    for i in range(begin, end, step):
        librosa.display.specshow(log_cqt_spec[:, i : i + step])
        plt.axis("off")
        plt.savefig(
            f"{TEMP_DIR}/cqt_{round(dur, 2)}_{i}.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
        plt.close()


def wav2chroma(audio_path: str, width=1.6, topdb=40):
    """Slice the chromagram of the non-silent audio into fixed-width chunks saved as JPEGs."""
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    non_silents = librosa.effects.split(y, top_db=topdb)
    non_silent = np.concatenate([y[start:end] for start, end in non_silents])
    chroma_spec = librosa.feature.chroma_stft(y=non_silent, sr=sr)
    log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
    dur = librosa.get_duration(y=non_silent, sr=sr)
    total_frames = log_chroma_spec.shape[1]
    step = int(width * total_frames / dur)
    count = int(total_frames / step)
    begin = int(0.5 * (total_frames - count * step))
    end = begin + step * count
    for i in range(begin, end, step):
        librosa.display.specshow(log_chroma_spec[:, i : i + step])
        plt.axis("off")
        plt.savefig(
            f"{TEMP_DIR}/chroma_{round(dur, 2)}_{i}.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
        plt.close()


def most_common_element(input_list: list):
    """Return the most frequent element of the list (majority vote)."""
    counter = Counter(input_list)
    mce, _ = counter.most_common(1)[0]
    return mce


def infer(wav_path: str, log_name: str, folder_path=TEMP_DIR):
    """Predict the singing method of an audio file with the selected model."""
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)

    if not wav_path:
        return None, "Please input an audio!"

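    # The spectrogram type (mel / cqt / chroma) is assumed to be encoded in the
    # model log name and is used to pick the matching wav2* preprocessing function.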
spec = log_name.split("_")[-3] os.makedirs(folder_path, exist_ok=True) try: model = EvalNet(log_name, len(TRANSLATE)).model eval("wav2%s" % spec)(wav_path) except Exception as e: return None, f"{e}" outputs = [] all_files = os.listdir(folder_path) for file_name in all_files: if file_name.lower().endswith(".jpg"): file_path = os.path.join(folder_path, file_name) input = embed_img(file_path) output: torch.Tensor = model(input) pred_id = torch.max(output.data, 1)[1] outputs.append(int(pred_id)) max_count_item = most_common_element(outputs) shutil.rmtree(folder_path) return os.path.basename(wav_path), TRANSLATE[CLASSES[max_count_item]] if __name__ == "__main__": warnings.filterwarnings("ignore") models = get_modelist(assign_model="GoogleNet_mel") examples = [] example_wavs = find_wav_files() for wav in example_wavs: examples.append([wav, models[0]]) with gr.Blocks() as demo: gr.Interface( fn=infer, inputs=[ gr.Audio(label="Upload a recording (>40dB)", type="filepath"), gr.Dropdown(choices=models, label="Select a model", value=models[0]), ], outputs=[ gr.Textbox(label="Audio filename", show_copy_button=True), gr.Textbox(label="Singing method recognition", show_copy_button=True), ], examples=examples, cache_examples=False, allow_flagging="never", title="It is recommended to keep the recording length around 5s, too long will affect the recognition efficiency.", ) gr.Markdown( """ # Cite ```bibtex @article{Zhou-2025, author = {Monan Zhou and Shenyang Xu and Zhaorui Liu and Zhaowen Wang and Feng Yu and Wei Li and Baoqiang Han}, title = {CCMusic: An Open and Diverse Database for Chinese Music Information Retrieval Research}, journal = {Transactions of the International Society for Music Information Retrieval}, volume = {8}, number = {1}, pages = {22--38}, month = {Mar}, year = {2025}, url = {https://doi.org/10.5334/tismir.194}, doi = {10.5334/tismir.194} } ```""" ) demo.launch()