File size: 3,913 Bytes
2f63626 38aa581 ef72cdf 0669708 b1e70cf f68d656 2570281 2f63626 10b61cb 3e9b351 38aa581 3e9b351 38aa581 3e9b351 100ee4d 3e9b351 100ee4d 38aa581 100ee4d 10b61cb ef72cdf 19ec674 100ee4d ef72cdf 7116577 331a033 868f0f3 7116577 42ca8af 331a033 6f56357 14da95d ef72cdf 331a033 100ee4d b1e70cf 118d6d2 f68d656 118d6d2 f68d656 118d6d2 37025f6 118d6d2 f68d656 c81d6d1 f68d656 1360ce0 f68d656 5bd7d1d 19ec674 37025f6 1360ce0 2f63626 ef72cdf 2f63626 40b6dea 803066e ef72cdf 803066e 331a033 ef72cdf 991c5a2 ef72cdf 331a033 868f0f3 6f56357 868f0f3 331a033 868f0f3 6f56357 ef72cdf 97325b8 1360ce0 98178bd 1360ce0 98178bd 1360ce0 98178bd 1360ce0 98178bd 1360ce0 98178bd 1360ce0 97325b8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
import gradio as gr
import subprocess,os
from datasets import load_dataset, Audio
import corpora
import ctcalign,graph
from numpy import random
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
def setup():
    """Download Google's REAPER sources and build them under ``./REAPER/build``.

    Fetches the REAPER master archive from GitHub with ``wget``, unpacks it,
    renames the unpacked ``REAPER-master`` directory to ``REAPER``, removes
    the archive, then runs ``cmake ..`` and ``make`` inside ``REAPER/build``.
    The stdout of the captured steps is printed, as is a directory listing
    before and after.

    The build commands are run with ``cwd=`` rather than ``os.chdir`` so the
    working directory of this process is never changed, even when a step
    raises. A command that exits with a non-zero status is reported (together
    with its captured stderr, when there is any) and the remaining steps
    still run.
    """
    archive_url = "https://github.com/google/REAPER/archive/refs/heads/master.zip"
    reaper_dir = os.path.join('.', 'REAPER')
    build_dir = os.path.join(reaper_dir, 'build')

    def run_step(cmd, cwd=None, capture=False):
        # Run one external command; make a failing exit status visible
        # instead of silently continuing.
        result = subprocess.run(cmd, cwd=cwd, capture_output=capture, text=True)
        if result.returncode != 0:
            print('FAILED::', ' '.join(cmd), '-> exit status', result.returncode)
            if capture and result.stderr:
                print(result.stderr)
        return result

    r0 = run_step(["pwd"], capture=True)
    print('PWD::', r0.stdout)

    # Fetch and unpack the sources into ./REAPER
    r1 = run_step(["wget", archive_url], capture=True)
    print(r1.stdout)
    run_step(["unzip", "./master.zip"])
    run_step(["mv", "REAPER-master", "REAPER"])
    run_step(["rm", "./master.zip"])

    # Out-of-tree build in ./REAPER/build
    run_step(["mkdir", "build"], cwd=reaper_dir)
    r2 = run_step(["cmake", ".."], cwd=build_dir, capture=True)
    print(r2.stdout)
    r3 = run_step(["make"], cwd=build_dir, capture=True)
    print(r3.stdout)

    r9 = run_step(["ls", "-la"], capture=True)
    print('LS::', r9.stdout)
#print('about to setup')
# Runs at import time: REAPER is downloaded and built before the functions
# and the Blocks interface below are defined.
setup()
def load_lang(langname):
    """Load the preview table and the CTC aligner for the selected language.

    Args:
        langname: Name of the language, either "Icelandic" or "Faroese".

    Returns:
        A tuple ``(preview, lang_aligner)`` where ``preview`` is the first
        10 rows of the corpus table (converted with ``to_pandas()``, with
        the 'audio', 'speaker_id' and 'duration' columns dropped) and
        ``lang_aligner`` is the ``ctcalign.aligner`` built for that
        language's wav2vec2 model.

    Raises:
        ValueError: If ``langname`` is not one of the supported languages
            (for example ``None`` when nothing is selected).
    """
    if langname == "Icelandic":
        corpus = corpora.ds_i
        model_path = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
    elif langname == "Faroese":
        corpus = corpora.ds_f
        model_path = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
    else:
        # Without this branch the code below fails with an unrelated
        # unbound-variable error.
        raise ValueError(f"Unsupported language: {langname!r}")

    # Both models are configured with the same separator and blank tokens.
    model_word_separator = '|'
    model_blank_token = '[PAD]'
    lang_aligner = ctcalign.aligner(model_path, model_word_separator, model_blank_token)

    df = corpus.data.to_pandas()
    df = df.drop(columns=['audio', 'speaker_id', 'duration'])
    return (df[:10], lang_aligner)
def f1(langname, lang_aligner):
    """Pick a random sentence from the selected corpus and plot its alignment.

    Args:
        langname: Name of the language, either "Icelandic" or "Faroese".
        lang_aligner: Aligner object passed through unchanged to
            ``graph.align_and_graph``.

    Returns:
        Whatever ``graph.align_and_graph`` returns for the chosen sentence's
        audio path and normalized transcript.

    Raises:
        ValueError: If ``langname`` is not one of the supported languages
            (for example ``None`` when nothing is selected).
    """
    if langname == "Icelandic":
        ds = corpora.ds_i
    elif langname == "Faroese":
        ds = corpora.ds_f
    else:
        # Without this branch ``ds`` would be unbound below.
        raise ValueError(f"Unsupported language: {langname!r}")

    # numpy's randint(high) draws from [0, high), so len(ds) is the correct
    # upper bound; len(ds) - 1 could never select the last row and failed
    # outright on a one-row dataset.
    row = random.randint(len(ds))
    ds = ds.select([row])

    # Only the file path of the audio record is needed here, not its samples.
    sound_path = ds['audio'][0]['path']
    transcript = ds['normalized_text'][0]
    return graph.align_and_graph(sound_path, transcript, lang_aligner)
# Demo page: language picker, corpus preview table, a button that plots a
# random sentence, and an "about" section.
# NOTE(review): the original indentation was lost; it is assumed that the
# Row holds only the preview table and that everything else sits directly
# inside the Blocks context - confirm against the deployed layout.
with gr.Blocks() as bl:
    lloadr = gr.Dropdown(
        ["Faroese", "Icelandic"],
        label="Select a language",
    )
    # Carries the aligner returned by load_lang over to f1.
    align_func = gr.State()

    with gr.Row():
        databrowser = gr.DataFrame(
            wrap=True,
            max_rows=50,
            interactive=False,
            overflow_row_behaviour='paginate',
        )

    btn1 = gr.Button(value="The random prosody button")
    btn1.style(full_width=False, size="sm")
    pl1 = gr.Plot()

    # Button: plot a random sentence for the current language and aligner.
    btn1.click(fn=f1, inputs=[lloadr, align_func], outputs=pl1)
    # Dropdown: refresh the preview table and store the new aligner.
    lloadr.change(fn=load_lang, inputs=lloadr, outputs=[databrowser, align_func])

    gr.Markdown(
        """
# ABOUT
This is a work-in-progress demo.
Icelandic uses the [samromur-asr](https://huggingface.co/datasets/language-and-voice-lab/samromur_asr) corpus, and Faroese uses [ravnursson-asr](https://huggingface.co/datasets/carlosdanielhernandezmena/ravnursson_asr).
After you select a language, a few example sentences from the corpus are displayed.
Click the button to view time-aligned prosody information for a random sentence - this could be any sentence, not only one of the ones shown above.
[ABOUT REAPER PITCH TRACKING - TODO]
[ABOUT RMSE INTENSITY - TODO]
[ABOUT CTC ALIGNMENT - TODO]
[email protected] / https://github.com/catiR/
"""
    )
# Launch the interface only when this file is executed directly as a script.
if __name__ == "__main__":
    bl.launch()