Spaces:

clr
/

prosalign

Sleeping

File size: 4,276 Bytes

2f63626
38aa581
ef72cdf
277be2c
f68d656
2570281
 
2f63626
 
 
 
 
10b61cb
 
 
 
 
3e9b351
 
 
38aa581
3e9b351
38aa581
3e9b351
100ee4d
3e9b351
 
100ee4d
38aa581
100ee4d
 
10b61cb
 
ef72cdf
19ec674
100ee4d
ef72cdf
 
277be2c
 
868f0f3
277be2c
 
14da95d
 
ef72cdf
277be2c
100ee4d
 
b1e70cf
118d6d2
277be2c
118d6d2
277be2c
118d6d2
c81d6d1
f68d656
 
 
5bd7d1d
3cb3592
1360ce0
 
2f63626
ef72cdf
2f63626
40b6dea
3c81006
 
 
 
 
 
 
 
 
 
 
 
 
ef72cdf
803066e
331a033
ef72cdf
991c5a2
ef72cdf
 
331a033
3cb3592
3c81006
 
3cb3592
868f0f3
 
 
3cb3592
868f0f3
331a033
 
868f0f3
6f56357
ef72cdf
97325b8
1360ce0
 
 
3c81006
 
1360ce0
3c81006
 
1360ce0
3c81006
 
1360ce0
98178bd
1360ce0
3c81006
 
 
1360ce0
 
 
 
97325b8

import gradio as gr
import subprocess,os
from datasets import load_dataset, Audio
import datas,ctcalign,graph
from numpy import random


import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt


def setup():
    """Download and build the REAPER pitch tracker under ``./REAPER``.

    Steps, all relative to the current working directory:

    1. Print the working directory (``pwd``).
    2. Unless ``./REAPER`` already exists, fetch the master-branch archive
       from GitHub with ``wget``, ``unzip`` it, rename ``REAPER-master`` to
       ``REAPER`` and delete the archive.
    3. Run ``cmake ..`` and ``make`` inside ``./REAPER/build``.
    4. Print a directory listing (``ls -la``).

    Commands are run with ``cwd=`` instead of ``os.chdir`` so the process
    working directory is never changed, even if a step raises. A step that
    exits with a non-zero status is reported (with its stderr when captured)
    but does not abort the remaining steps.

    Raises:
        FileNotFoundError: if an external tool is not installed, or if the
            ``REAPER`` directory is still missing after the download step.
    """
    def run_step(cmd, cwd=None, capture=True):
        # Run one external command and surface failures, which would
        # otherwise go unnoticed (wget, for one, logs only to stderr).
        result = subprocess.run(cmd, cwd=cwd, capture_output=capture, text=True)
        if result.returncode != 0:
            print('SETUP:: exit code', result.returncode, 'from:', ' '.join(cmd))
            if result.stderr:
                print(result.stderr)
        return result

    source_dir = 'REAPER'
    build_dir = os.path.join(source_dir, 'build')

    r0 = run_step(["pwd"])
    print('PWD::', r0.stdout)

    # Re-downloading over an existing checkout would make `mv` nest the new
    # tree inside the old one (REAPER/REAPER-master), so only fetch once.
    if not os.path.isdir(source_dir):
        r1 = run_step(["wget", "https://github.com/google/REAPER/archive/refs/heads/master.zip"])
        print(r1.stdout)
        run_step(["unzip", "./master.zip"], capture=False)
        run_step(["mv", "REAPER-master", source_dir], capture=False)
        run_step(["rm", "./master.zip"], capture=False)

    if not os.path.isdir(source_dir):
        # Do not create an empty REAPER/build here: that would make the next
        # run believe the sources are already present.
        raise FileNotFoundError(
            "REAPER sources are missing; download or unpack step failed")

    os.makedirs(build_dir, exist_ok=True)
    r2 = run_step(["cmake", ".."], cwd=build_dir)
    print(r2.stdout)
    r3 = run_step(["make"], cwd=build_dir)
    print(r3.stdout)

    r9 = run_step(["ls", "-la"])
    print('LS::', r9.stdout)

                        
# Runs at import time: REAPER is downloaded and built before the UI below is defined.
setup()

def load_lang(langname):
    """Return a small corpus preview and the aligner for ``langname``.

    Args:
        langname: ``"Icelandic"`` or ``"Faroese"``.

    Returns:
        A ``(preview, lang_aligner)`` tuple. ``preview`` is the first 15 rows
        of the selected dataset converted to a pandas DataFrame, with the
        ``audio``, ``speaker_id`` and ``duration`` columns removed.
        ``lang_aligner`` is the matching aligner object from ``datas``.

    Raises:
        ValueError: if ``langname`` is not one of the supported languages
            (including ``None``). Previously this fell through both branches
            and failed with an ``UnboundLocalError``.
    """
    if langname == "Icelandic":
        ds = datas.ds_i
        lang_aligner = datas.a_i
    elif langname == "Faroese":
        ds = datas.ds_f
        lang_aligner = datas.a_f
    else:
        raise ValueError(f"Unsupported language: {langname!r}")

    df = ds.data.to_pandas()
    df = df.drop(columns=['audio', 'speaker_id', 'duration'])
    return (df[:15], lang_aligner)


def f1(langname, lang_aligner):
    """Pick one random utterance of ``langname`` and graph its alignment.

    Args:
        langname: ``"Icelandic"`` or ``"Faroese"``.
        lang_aligner: aligner object passed through to
            ``graph.align_and_graph``.

    Returns:
        A ``(figure, sound_path)`` tuple: the result of
        ``graph.align_and_graph(sound_path, transcript, lang_aligner)`` and
        the path of the chosen audio file.

    Raises:
        ValueError: if ``langname`` is not a supported language (e.g. the
            button was clicked before a language was chosen).
    """
    if langname == "Icelandic":
        ds = datas.ds_i
    elif langname == "Faroese":
        ds = datas.ds_f
    else:
        raise ValueError(f"Unsupported language: {langname!r}")

    n_items = len(ds)

    # numpy's randint(high) samples from the half-open range [0, high), so
    # the full length is the correct bound; `n_items - 1` would never select
    # the last utterance and would raise for a one-item dataset.
    picked = ds.select([random.randint(n_items)])
    sound_path = picked['audio'][0]['path']  # the 'array' entry holds the audio samples themselves
    transcript = picked['normalized_text'][0]
    return (graph.align_and_graph(sound_path, transcript, lang_aligner), sound_path)


# Gradio UI: a language dropdown, a table showing a sample of the corpus,
# and a button that plots one random utterance together with its audio.
bl = gr.Blocks()

with bl:

    # Header row: usage instructions next to the language dropdown.
    with gr.Row():
        gr.Markdown(
        """
        # Demo under construction
        ## 1. Choose a language to load
        ## 2. See a small sample of the selected corpus
        ## 3. Click the button below to view time-aligned prosody information for a random example (from the whole corpus, not necessarily the shown sample)

        Pitch is shown in dark blue and loudness is the light orange line.
        The pitch estimation, and the time-alignment of words to audio, are completely automated and there will be some inaccuracy.
        More information below.
        """ )
        lloadr = gr.Dropdown(["Faroese", "Icelandic"], label="Language")#, info="Loading the dataset takes some time")

    # State slot for the aligner returned by load_lang; created without a
    # value, so it stays empty until a language has been chosen.
    align_func = gr.State()#value=ctcalign.aligner(model_path="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h",model_word_separator = '|',model_blank_token = '[PAD]'))
    
    # Read-only table that load_lang fills with the corpus sample.
    with gr.Row():
        #invisidata = gr.DataFrame(interactive=False, visible=False)
        databrowser = gr.DataFrame(wrap=True, max_rows=50, interactive=False, overflow_row_behaviour='paginate')


    # Trigger button and the audio output for the selected example.
    with gr.Row():
        btn1 = gr.Button(value="CLICK HERE")
        btn1.style(full_width=False)
        audio1 = gr.Audio(interactive=False)

    # Output for the first element of f1's return value.
    pl1 = gr.Plot()

    # f1 receives the dropdown value and the stored aligner; its two return
    # values go to the plot and the audio component, in that order.
    btn1.click(f1, [lloadr,align_func], [pl1,audio1])



    
    # Changing the dropdown calls load_lang, whose two return values go to
    # the table and the aligner state, in that order.
    lloadr.change(load_lang,lloadr,[databrowser,align_func])


    # Footer: data sources, method notes and contact details.
    gr.Markdown(
        """
    # ABOUT

    The Icelandic corpus is [samromur-asr](https://huggingface.co/datasets/language-and-voice-lab/samromur_asr), and Faroese uses [ravnursson-asr](https://huggingface.co/datasets/carlosdanielhernandezmena/ravnursson_asr).
    
    ### Pitch tracking (F0 estimation)
    Estimated pitch is shown in blue on the graphs, as tracked by [REAPER](https://github.com/google/REAPER).

    ### Intensity
    The orange line is root mean squared energy, which reflects loudness and is also a good indication of syllable placement, as it should line up with vowels and similar sounds.

    [ABOUT CTC ALIGNMENT - TODO]

    This is a work-in-progress basic demo for automatic prosodic annotation in Faroese and Icelandic.
    Contact [email protected] / https://github.com/catiR/ when things break, or with ideas/suggestions about how to apply this.
    The source code is available under the Files tab at the top of the Space.
    """
    )


# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    bl.launch()