# Guzheng_Tech99 / app.py
import os
import torch
import librosa
import warnings
import numpy as np
import pandas as pd
import gradio as gr
from model import EvalNet, t_EvalNet
from utils import get_modelist, find_files, embed, MODEL_DIR
TRANSLATE = {
"chanyin": "Vibrato", # 颤音
"boxian": "Plucks", # 拨弦
"shanghua": "Upward Portamento", # 上滑音
"xiahua": "Downward Portamento", # 下滑音
"huazhi/guazou/lianmo/liantuo": "Glissando", # 花指\刮奏\连抹\连托
"yaozhi": "Tremolo", # 摇指
"dianyin": "Point Note", # 点音
}
CLASSES = list(TRANSLATE.keys())
TEMP_DIR = "./__pycache__/tmp"
SAMPLE_RATE = 44100
HOP_LENGTH = 512
TIME_LENGTH = 3
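# Each input chunk spans TIME_LENGTH = 3 s of audio, i.e.
# SAMPLE_RATE * TIME_LENGTH // HOP_LENGTH = 258 frames at this hop length.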
def logMel(y, sr=SAMPLE_RATE):
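    """Log-scaled Mel spectrogram in dB, referenced to the peak power."""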
    mel = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        hop_length=HOP_LENGTH,
        fmin=27.5,
    )
    return librosa.power_to_db(mel, ref=np.max)
def logCqt(y, sr=SAMPLE_RATE):
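    """88-bin constant-Q transform (A0 = 27.5 Hz upward), with the dB range
    [-80, 0] linearly rescaled to [0, 1]."""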
    cqt = librosa.cqt(
        y=y,
        sr=sr,
        hop_length=HOP_LENGTH,
        fmin=27.5,
        n_bins=88,
        bins_per_octave=12,
    )
    return (1.0 / 80.0) * librosa.amplitude_to_db(np.abs(cqt), ref=np.max) + 1.0
def logChroma(y, sr=SAMPLE_RATE):
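    """STFT-based chromagram, rescaled from the dB range [-80, 0] to [0, 1]
    in the same way as logCqt."""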
    chroma = librosa.feature.chroma_stft(
        y=y,
        sr=sr,
        hop_length=HOP_LENGTH,
    )
    return (1.0 / 80.0) * librosa.amplitude_to_db(np.abs(chroma), ref=np.max) + 1.0
def RoW_norm(data):
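    """Row-wise normalization statistics: per-frequency-bin mean and standard
    deviation computed over all non-silent frames of the chunked data."""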
    common_sum = 0
    square_sum = 0
    tfle = 0
    for i in range(len(data)):
        tfle += (data[i].sum(-1).sum(0) != 0).astype("float").sum()
        common_sum += data[i].sum(-1).sum(-1)
        square_sum += (data[i] ** 2).sum(-1).sum(-1)

    common_avg = common_sum / tfle
    square_avg = square_sum / tfle
    std = np.sqrt(square_avg - common_avg**2)
    return common_avg, std
def norm(data):
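    """Z-score normalize each frequency bin of the 4-D chunk array
    (chunks, bins, frames, 1) using the statistics from RoW_norm."""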
    size = data.shape
    avg, std = RoW_norm(data)
    avg = np.tile(avg.reshape((1, -1, 1, 1)), (size[0], 1, size[2], size[3]))
    std = np.tile(std.reshape((1, -1, 1, 1)), (size[0], 1, size[2], size[3]))
    return (data - avg) / std
def chunk_data(f):
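    """Zero-pad the time axis and split the spectrogram into fixed-length
    chunks of TIME_LENGTH seconds (SAMPLE_RATE * TIME_LENGTH // HOP_LENGTH
    frames each), returning an array of shape (chunks, bins, frames)."""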
    x = []
    xdata = np.transpose(f)
    s = SAMPLE_RATE * TIME_LENGTH // HOP_LENGTH
    length = int(np.ceil((int(len(xdata) / s) + 1) * s))
    app = np.zeros((length - xdata.shape[0], xdata.shape[1]))
    xdata = np.concatenate((xdata, app), 0)
    for i in range(int(length / s)):
        data = xdata[int(i * s) : int(i * s + s)]
        x.append(np.transpose(data[:s, :]))

    return np.array(x)
def load(audio_path: str, converto="mel"):
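    """Load an audio file at 44.1 kHz and turn it into a list of normalized
    spectrogram chunks; `converto` selects "mel", "cqt" or "chroma"."""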
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    # Dispatch table instead of eval(): same three extractors, no string execution
    extractors = {"mel": logMel, "cqt": logCqt, "chroma": logChroma}
    spec = extractors[converto.lower()](y, sr)
    x_spec = chunk_data(spec)
    Xtr_spec = np.expand_dims(x_spec, axis=3)
    return list(norm(Xtr_spec))
def format_second(seconds):
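    """Format a time offset in seconds as HH:MM:SS.mmm."""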
    integer_part = int(seconds)
    # Format the fractional part as zero-padded milliseconds; the previous
    # f"{decimal_part:.3f}" produced strings like "00:00:01.0.500"
    millis = round((seconds - integer_part) * 1000)
    hours, remainder = divmod(integer_part, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{secs:02}.{millis:03}"
def infer(audio_path: str, log_name: str):
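    """Frame-level playing technique detection for an uploaded recording.

    The checkpoint name encodes the backbone and the spectrogram type
    (e.g. "VGG19_mel" -> backbone "VGG19", spectrogram "mel"). Returns the
    audio filename and a DataFrame with one predicted technique per frame.
    """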
    if not audio_path:
        return "Please input an audio!", None

    backbone = "_".join(log_name.split("_")[:-1])
    spec = log_name.split("_")[-1]
    try:
        specs = load(audio_path, converto=spec)
        dur = librosa.get_duration(path=audio_path)
        frames_per_3s = specs[0].shape[1]
        if "vit" in backbone or "swin" in backbone:
            eval_net = t_EvalNet(
                backbone,
                len(TRANSLATE),
                specs[0].shape[1],
                weight_path=f"{MODEL_DIR}/{log_name}.pt",
            )
        else:
            eval_net = EvalNet(
                backbone,
                len(TRANSLATE),
                specs[0].shape[1],
                weight_path=f"{MODEL_DIR}/{log_name}.pt",
            )

        input_size = eval_net.get_input_size()
        embedded_input = embed(specs, input_size)
        output = list(eval_net.forward(embedded_input))
    except Exception as e:
        return f"{e}", None

    index = 0
    outputs = []
    for y in output:
        preds = list(y.T)
        for pred in preds:
            start = index * TIME_LENGTH / frames_per_3s
            if start > dur:
                break

            to = (index + 1) * TIME_LENGTH / frames_per_3s
            outputs.append(
                {
                    "Frame": f"{format_second(start)} - {format_second(to)}",
                    "Tech": TRANSLATE[CLASSES[torch.argmax(pred).item()]],
                }
            )
            index += 1

    return os.path.basename(audio_path), pd.DataFrame(outputs)
if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    models = get_modelist(assign_model="VGG19_mel")
    examples = []
    example_wavs = find_files()
    for wav in example_wavs:
        examples.append([wav, models[0]])

    with gr.Blocks() as demo:
        gr.Interface(
            fn=infer,
            inputs=[
                gr.Audio(label="Upload audio", type="filepath"),
                gr.Dropdown(choices=models, label="Select a model", value=models[0]),
            ],
            outputs=[
                gr.Textbox(label="Audio filename", show_copy_button=True),
                gr.Dataframe(label="Frame-level guzheng playing technique detection"),
            ],
            examples=examples,
            cache_examples=False,
            flagging_mode="never",
            title="Guzheng playing technique detection (shorter recordings are recommended)",
        )
        gr.Markdown(
            """
# Cite
```bibtex
@article{Zhou-2025,
    title   = {CCMusic: an Open and Diverse Database for Chinese Music Information Retrieval Research},
    author  = {Monan Zhou and Shenyang Xu and Zhaorui Liu and Zhaowen Wang and Feng Yu and Wei Li and Baoqiang Han},
    journal = {Transactions of the International Society for Music Information Retrieval},
    year    = {2025}
}
```"""
        )

    demo.launch()