Spaces:

KevinGeng
/

Laronix_voice_quality_checking_system_MICROPHONE

Sleeping

App Files Community

KevinGeng committed on Dec 12, 2023

Commit

73426f6

•

1 Parent(s): 11a6db4

add password protection

Browse files

Files changed (1) hide show

app.py +69 -4

app.py CHANGED Viewed

@@ -48,10 +48,13 @@ def calc_mos(audio_path, ref):
     wav, sr = torchaudio.load(audio_path, channels_first=True)
     if wav.shape[0] > 1:
         wav = wav.mean(dim=0, keepdim=True) # Mono channel
     osr = 16_000
     batch = wav.unsqueeze(0).repeat(10, 1, 1)
     csr = ChangeSampleRate(sr, osr)
     out_wavs = csr(wav)
     # ASR
     trans = p(audio_path)["text"]
     # WER
@@ -82,7 +85,68 @@ def calc_mos(audio_path, ref):
     phone_transcription = processor.batch_decode(phone_predicted_ids)
     lst_phonemes = phone_transcription[0].split(" ")
     wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
     ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
     # pdb.set_trace()
     return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, phone_transcription, ppm
@@ -90,8 +154,9 @@ def calc_mos(audio_path, ref):
 with open("local/description.md") as f:
     description = f.read()
-# calc_mos("audio_2023-11-01_15-57-39.wav", "hello world")
-# pdb.set_trace()
 examples = [
     ["local/Julianna_Set1_Author_01.wav", "Once upon a time, there was a young rat named Arthur who couldn't make up his mind."],
     ["local/Patient_Arthur_set1_002_noisy.wav", "Whenever the other rats asked him if he would like to go hunting with them, he would answer in a soft voice, 'I don't know.'"],
@@ -112,5 +177,5 @@ iface = gr.Interface(
   allow_flagging="auto",
   examples=examples,
 )
-iface.launch()

     wav, sr = torchaudio.load(audio_path, channels_first=True)
     if wav.shape[0] > 1:
         wav = wav.mean(dim=0, keepdim=True) # Mono channel
+    # get decibel
     osr = 16_000
     batch = wav.unsqueeze(0).repeat(10, 1, 1)
     csr = ChangeSampleRate(sr, osr)
     out_wavs = csr(wav)
+    db = torchaudio.transforms.AmplitudeToDB(stype="amplitude", top_db=80)(wav)
     # ASR
     trans = p(audio_path)["text"]
     # WER
     phone_transcription = processor.batch_decode(phone_predicted_ids)
     lst_phonemes = phone_transcription[0].split(" ")
     wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
+    import matplotlib.pyplot as plt
+    fig = plt.figure(figsize=(30, 10))
+    # ax = fig.subplots(1, 1)
+    # pdb.set_trace()
+    # time_x = torch.arange(wav.shape[-1]) / sr
+    # # ax.plot(time_x, wav_vad.squeeze())
+    # pdb.set_trace()
+    # ax.plot(time_x, wav.squeeze(), alpha=0.5)
+    # get f0
+    f0 = torchaudio.functional.compute_kaldi_pitch(wav, frame_length=25, frame_shift=20, min_f0=20, max_f0=600, sample_rate=sr)[0, :, 1]
+    # # get f0 time x axis
+    # time_x_f0 = torch.arange(f0.shape[-1]) * 20 / 1000
+    # plot f0 with x axis as time
+    # spectrogram with x axis as time
+    pdb.set_trace()
+    spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate=sr, n_fft=400, hop_length=160, n_mels=80)(wav)
+    spectrogram = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)(spectrogram)
+    # plot spectrogram with x axis as time, y axis as frequency bins
+    ax2 = fig.add_subplot(212)
+    ax2.set_xlabel("Time (s)")
+    ax2.set_ylabel("Frequency (Hz)")
+    ax2.set_title("Spectrogram")
+    ax2.set_xticks(torch.arange(0, spectrogram.shape[-1], 100))
+    ax2.set_xticklabels(torch.arange(0, spectrogram.shape[-1], 100) * 20 / 1000)
+    ax2.set_yticks(torch.arange(0, spectrogram.shape[1], 10))
+    ax2.set_yticklabels(torch.arange(0, spectrogram.shape[1], 10) * 800 / 80)
+    # add colorbar to spectrogram with limitation from -80 to 0
+    cbar = plt.colorbar(ax2.imshow(spectrogram.squeeze().numpy(), aspect='auto', origin='lower'))
+    cbar.set_label("dB")
+    ax2.grid()
+    # plot f0 with x axis as time, y axis as frequency bins, y is limited from 0 to 600
+    ax1 = fig.add_subplot(211)
+    ax1.set_xlabel("Time (s)")
+    ax1.set_ylabel("Frequency (Hz)")
+    ax1.set_title("F0")
+    ax1.set_xticks(torch.arange(0, f0.shape[-1], 100))
+    ax1.set_xticklabels(torch.arange(0, f0.shape[-1], 100) * 20 / 1000)
+    ax1.set_yticks(torch.arange(0, 600, 50))
+    ax1.set_yticklabels(torch.arange(0, 600, 50))
+    # add colorbar to f0 with limitation from 0 to 600
+    # cbar = plt.colorbar(ax1.imshow(f0.squeeze().numpy(), aspect='auto', origin='lower'))
+    # cbar.set_label("Hz")
+    ax1.grid()
+    # remove unvoiced part based on vad
+    # plot f0 with x axis as time
+    # time_x = torch.arange(f0.shape[-1]) * 20 / 1000
+    # plt.plot(time_x, f0.squeeze())
+    # fig.savefig("vad.png")
+    # pdb.set_trace()
     ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
     # pdb.set_trace()
     return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, phone_transcription, ppm
 with open("local/description.md") as f:
     description = f.read()
+calc_mos("JOHN1.wav", "he would answer in a soft voice, 'I don't know.'")
+pdb.set_trace()
 examples = [
     ["local/Julianna_Set1_Author_01.wav", "Once upon a time, there was a young rat named Arthur who couldn't make up his mind."],
     ["local/Patient_Arthur_set1_002_noisy.wav", "Whenever the other rats asked him if he would like to go hunting with them, he would answer in a soft voice, 'I don't know.'"],
   allow_flagging="auto",
   examples=examples,
 )
+# add password to protect the interface
+iface.launch(share=True, auth=['Laronix', 'LaronixSLP'], auth_message="Authentication Required, ask [email protected] for password,\n Thanks for your cooperation!")