File size: 4,276 Bytes
2f63626 38aa581 ef72cdf 277be2c f68d656 2570281 2f63626 10b61cb 3e9b351 38aa581 3e9b351 38aa581 3e9b351 100ee4d 3e9b351 100ee4d 38aa581 100ee4d 10b61cb ef72cdf 19ec674 100ee4d ef72cdf 277be2c 868f0f3 277be2c 14da95d ef72cdf 277be2c 100ee4d b1e70cf 118d6d2 277be2c 118d6d2 277be2c 118d6d2 c81d6d1 f68d656 5bd7d1d 3cb3592 1360ce0 2f63626 ef72cdf 2f63626 40b6dea 3c81006 ef72cdf 803066e 331a033 ef72cdf 991c5a2 ef72cdf 331a033 3cb3592 3c81006 3cb3592 868f0f3 3cb3592 868f0f3 331a033 868f0f3 6f56357 ef72cdf 97325b8 1360ce0 3c81006 1360ce0 3c81006 1360ce0 3c81006 1360ce0 98178bd 1360ce0 3c81006 1360ce0 97325b8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
import gradio as gr
import subprocess,os
from datasets import load_dataset, Audio
import datas,ctcalign,graph
from numpy import random
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
def _run_captured(cmd, label=None, cwd=None):
    """Run *cmd* with captured output, echo its stdout, and report failures.

    Args:
        cmd: Argument list passed to ``subprocess.run``.
        label: Optional prefix printed before the captured stdout.
        cwd: Optional directory to run the command in.

    Returns:
        The ``subprocess.CompletedProcess`` for the command.
    """
    result = subprocess.run(cmd, capture_output=True, text=True, cwd=cwd)
    if label is None:
        print(result.stdout)
    else:
        print(label, result.stdout)
    # stderr is captured as well, so it has to be printed explicitly;
    # otherwise a failed download or build leaves no trace in the log.
    if result.returncode != 0:
        print(f"{cmd[0]} exited with status {result.returncode}:", result.stderr)
    return result


def setup():
    """Download the REAPER sources and build them under ``./REAPER/build``.

    The steps are: fetch the master zip from GitHub, unzip it, rename the
    extracted ``REAPER-master`` directory to ``REAPER``, delete the zip, then
    run ``cmake ..`` and ``make`` inside ``REAPER/build``. The working
    directory and a final directory listing are printed for debugging.

    The function is best-effort: a command that exits non-zero is reported
    (status and stderr) but does not abort the remaining steps. Commands are
    run with ``cwd=`` rather than ``os.chdir`` so the process working
    directory is never left changed if a step raises.
    """
    reaper_dir = "REAPER"
    build_dir = os.path.join(reaper_dir, "build")

    _run_captured(["pwd"], label='PWD::')
    _run_captured(["wget", "https://github.com/google/REAPER/archive/refs/heads/master.zip"])

    # These write straight to the console, as before.
    subprocess.run(["unzip", "./master.zip"])
    subprocess.run(["mv", "REAPER-master", reaper_dir])
    subprocess.run(["rm", "./master.zip"])
    subprocess.run(["mkdir", "build"], cwd=reaper_dir)

    _run_captured(["cmake", ".."], cwd=build_dir)
    _run_captured(["make"], cwd=build_dir)

    _run_captured(["ls", "-la"], label='LS::')
# Run the REAPER download/build at import time, before the UI is defined.
setup()
def load_lang(langname):
    """Return a preview table and the aligner for the chosen language.

    Args:
        langname: "Icelandic" or "Faroese".

    Returns:
        A ``(preview, lang_aligner)`` tuple. ``preview`` is the first 15 rows
        of the corpus table (converted with ``to_pandas()``) with the
        'audio', 'speaker_id' and 'duration' columns dropped; ``lang_aligner``
        is the matching aligner object taken from ``datas``.

    Raises:
        ValueError: if *langname* is not one of the supported languages
            (previously this surfaced as an UnboundLocalError).
    """
    if langname == "Icelandic":
        corpus, lang_aligner = datas.ds_i, datas.a_i
    elif langname == "Faroese":
        corpus, lang_aligner = datas.ds_f, datas.a_f
    else:
        raise ValueError(
            f"Unsupported language {langname!r}; expected 'Icelandic' or 'Faroese'"
        )

    table = corpus.data.to_pandas()
    table = table.drop(columns=['audio', 'speaker_id', 'duration'])
    return (table[:15], lang_aligner)
def f1(langname, lang_aligner):
    """Pick a random utterance from the chosen corpus and graph its prosody.

    Args:
        langname: "Icelandic" or "Faroese".
        lang_aligner: Aligner object forwarded to ``graph.align_and_graph``.

    Returns:
        A ``(figure, sound_path)`` tuple: whatever
        ``graph.align_and_graph`` returns for the sampled utterance, and the
        path of its audio file.

    Raises:
        ValueError: if *langname* is not one of the supported languages
            (previously this surfaced as an UnboundLocalError).
    """
    if langname == "Icelandic":
        ds = datas.ds_i
    elif langname == "Faroese":
        ds = datas.ds_f
    else:
        raise ValueError(
            f"Unsupported language {langname!r}; expected 'Icelandic' or 'Faroese'"
        )

    maxdat = len(ds)
    # numpy's randint(high) draws from [0, high), so pass the full length;
    # using maxdat-1 would never select the last utterance.
    ds = ds.select([random.randint(maxdat)])
    sound_path = ds['audio'][0]['path']  # audio 0 array is the audio data itself
    transcript = ds['normalized_text'][0]
    return (graph.align_and_graph(sound_path, transcript, lang_aligner), sound_path)
# Build the Gradio UI: instructions, language picker, corpus preview,
# a button that graphs a random utterance, and an "about" section.
# NOTE(review): the row grouping below is the most plausible reading of the
# layout — confirm against the running app.
bl = gr.Blocks()
with bl:
    with gr.Row():
        gr.Markdown(
"""
# Demo under construction
## 1. Choose a language to load
## 2. See a small sample of the selected corpus
## 3. Click the button below to view time-aligned prosody information for a random example (from the whole corpus, not necessarily the shown sample)
Pitch is shown in dark blue and loudness is the light orange line.
The pitch estimation, and the time-alignment of words to audio, are completely automated and there will be some inaccuracy.
More information below.
""" )
    lloadr = gr.Dropdown(["Faroese", "Icelandic"], label="Language")#, info="Loading the dataset takes some time")
    # Holds the aligner returned by load_lang so f1 can receive it on click.
    align_func = gr.State()#value=ctcalign.aligner(model_path="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h",model_word_separator = '|',model_blank_token = '[PAD]'))
    with gr.Row():
        #invisidata = gr.DataFrame(interactive=False, visible=False)
        # Read-only table showing the preview returned by load_lang.
        databrowser = gr.DataFrame(wrap=True, max_rows=50, interactive=False, overflow_row_behaviour='paginate')
    with gr.Row():
        btn1 = gr.Button(value="CLICK HERE")
        btn1.style(full_width=False)
        audio1 = gr.Audio(interactive=False)
    pl1 = gr.Plot()
    # Clicking the button runs f1 on the selected language and stored aligner,
    # sending its two return values to the plot and the audio player.
    btn1.click(f1, [lloadr,align_func], [pl1,audio1])
    # Changing the language runs load_lang, filling the table and the state.
    lloadr.change(load_lang,lloadr,[databrowser,align_func])
    gr.Markdown(
"""
# ABOUT
The Icelandic corpus is [samromur-asr](https://huggingface.co/datasets/language-and-voice-lab/samromur_asr), and Faroese uses [ravnursson-asr](https://huggingface.co/datasets/carlosdanielhernandezmena/ravnursson_asr).
### Pitch tracking (F0 estimation)
Estimated pitch is shown in blue on the graphs, as tracked by [REAPER](https://github.com/google/REAPER).
### Intensity
The orange line is root mean squared energy, which reflects loudness and is also a good indication of syllable placement, as it should line up with vowels and similar sounds.
[ABOUT CTC ALIGNMENT - TODO]
This is a work-in-progress basic demo for automatic prosodic annotation in Faroese and Icelandic.
Contact [email protected] / https://github.com/catiR/ when things break, or with ideas/suggestions about how to apply this.
The source code is available under the Files tab at the top of the Space.
"""
    )
if __name__ == "__main__":
    # Launch the Blocks app only when this file is run as a script.
    bl.launch()