prosalign / app.py
clr's picture
Update app.py
3c81006
raw
history blame
4.28 kB
import gradio as gr
import subprocess,os
from datasets import load_dataset, Audio
import datas,ctcalign,graph
from numpy import random
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
def setup():
    """Download and build Google's REAPER pitch tracker into ./REAPER.

    Fetches the master branch of google/REAPER from GitHub, unzips it,
    and runs cmake + make in REAPER/build. Output of the key steps is
    printed for the Space build logs. Uses ``cwd=`` for the build steps
    instead of os.chdir so the process working directory is never mutated.
    """
    # Debug: show where the Space is running from.
    r0 = subprocess.run(["pwd"], capture_output=True, text=True)
    print('PWD::', r0.stdout)

    # Fetch and unpack the REAPER source tree.
    r1 = subprocess.run(["wget", "https://github.com/google/REAPER/archive/refs/heads/master.zip"], capture_output=True, text=True)
    print(r1.stdout)
    subprocess.run(["unzip", "./master.zip"])
    subprocess.run(["mv", "REAPER-master", "REAPER"])
    subprocess.run(["rm", "./master.zip"])

    # Configure and build out-of-tree in REAPER/build.
    build_dir = './REAPER/build'
    os.makedirs(build_dir, exist_ok=True)
    r2 = subprocess.run(["cmake", ".."], capture_output=True, text=True, cwd=build_dir)
    print(r2.stdout)
    r3 = subprocess.run(["make"], capture_output=True, text=True, cwd=build_dir)
    print(r3.stdout)

    # Debug: confirm the expected layout after the build.
    r9 = subprocess.run(["ls", "-la"], capture_output=True, text=True)
    print('LS::', r9.stdout)

# Build REAPER once at import time, before the UI is constructed.
setup()
def load_lang(langname):
    """Load the corpus and aligner for the selected language.

    Args:
        langname: "Icelandic" or "Faroese" (the dropdown values).

    Returns:
        A tuple ``(sample_df, lang_aligner)`` where ``sample_df`` is the
        first 15 rows of the corpus as a pandas DataFrame (with the
        audio/speaker_id/duration columns dropped, for display only) and
        ``lang_aligner`` is the CTC aligner for that language.

    Raises:
        ValueError: if ``langname`` is not a known language.
    """
    if langname == "Icelandic":
        df = datas.ds_i
        lang_aligner = datas.a_i
    elif langname == "Faroese":
        df = datas.ds_f
        lang_aligner = datas.a_f
    else:
        # Previously an unknown name fell through and crashed later with
        # UnboundLocalError; fail fast with a clear message instead.
        raise ValueError(f"Unknown language: {langname!r}")
    df = df.data.to_pandas()
    df = df.drop(columns=['audio', 'speaker_id', 'duration'])
    return (df[:15], lang_aligner)
def f1(langname, lang_aligner):
    """Plot time-aligned prosody for one random utterance of the corpus.

    Args:
        langname: "Icelandic" or "Faroese".
        lang_aligner: the CTC aligner returned by ``load_lang``.

    Returns:
        A tuple ``(figure, sound_path)``: the matplotlib figure produced
        by ``graph.align_and_graph`` and the path of the chosen audio file.

    Raises:
        ValueError: if ``langname`` is not a known language.
    """
    if langname == "Icelandic":
        ds = datas.ds_i
    elif langname == "Faroese":
        ds = datas.ds_f
    else:
        raise ValueError(f"Unknown language: {langname!r}")
    maxdat = len(ds)
    # numpy's random.randint(high) samples [0, high); the previous
    # randint(maxdat-1) could never select the final example.
    ds = ds.select([random.randint(maxdat)])
    sound_path = ds['audio'][0]['path']  # ['array'] would be the raw audio samples
    transcript = ds['normalized_text'][0]
    return (graph.align_and_graph(sound_path, transcript, lang_aligner), sound_path)
# ---------------------------------------------------------------------------
# Gradio UI.
# Layout: intro markdown, a language dropdown, a read-only table showing a
# small corpus sample, then a button + audio player + plot that display
# time-aligned prosody (pitch and RMS energy) for one random utterance.
# NOTE(review): several calls here (`btn1.style`, `max_rows`,
# `overflow_row_behaviour`) belong to an older Gradio API — confirm against
# the pinned Gradio version before upgrading.
bl = gr.Blocks()
with bl:
    with gr.Row():
        gr.Markdown(
            """
# Demo under construction
## 1. Choose a language to load
## 2. See a small sample of the selected corpus
## 3. Click the button below to view time-aligned prosody information for a random example (from the whole corpus, not necessarily the shown sample)
Pitch is shown in dark blue and loudness is the light orange line.
The pitch estimation, and the time-alignment of words to audio, are completely automated and there will be some inaccuracy.
More information below.
""" )
        # Selecting a language triggers load_lang (see .change below).
        lloadr = gr.Dropdown(["Faroese", "Icelandic"], label="Language")#, info="Loading the dataset takes some time"
        # Holds the per-language CTC aligner between events.
        align_func = gr.State()#value=ctcalign.aligner(model_path="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h",model_word_separator = '|',model_blank_token = '[PAD]')
    with gr.Row():
        #invisidata = gr.DataFrame(interactive=False, visible=False)
        # Read-only preview of the loaded corpus sample.
        databrowser = gr.DataFrame(wrap=True, max_rows=50, interactive=False, overflow_row_behaviour='paginate')
    with gr.Row():
        btn1 = gr.Button(value="CLICK HERE")
        btn1.style(full_width=False)
    # Outputs for the random-example demo: playable audio + prosody plot.
    audio1 = gr.Audio(interactive=False)
    pl1 = gr.Plot()

    # Wire events: button draws a random example; dropdown (re)loads a corpus.
    btn1.click(f1, [lloadr, align_func], [pl1, audio1])
    lloadr.change(load_lang, lloadr, [databrowser, align_func])

    gr.Markdown(
        """
# ABOUT
The Icelandic corpus is [samromur-asr](https://huggingface.co/datasets/language-and-voice-lab/samromur_asr), and Faroese uses [ravnursson-asr](https://huggingface.co/datasets/carlosdanielhernandezmena/ravnursson_asr).
### Pitch tracking (F0 estimation)
Estimated pitch is shown in blue on the graphs, as tracked by [REAPER](https://github.com/google/REAPER).
### Intensity
The orange line is root mean squared energy, which reflects loudness and is also a good indication of syllable placement, as it should line up with vowels and similar sounds.
[ABOUT CTC ALIGNMENT - TODO]
This is a work-in-progress basic demo for automatic prosodic annotation in Faroese and Icelandic.
Contact [email protected] / https://github.com/catiR/ when things break, or with ideas/suggestions about how to apply this.
The source code is available under the Files tab at the top of the Space.
"""
    )

# Launch the Space when run as a script.
if __name__ == "__main__":
    bl.launch()