File size: 3,913 Bytes
2f63626
38aa581
ef72cdf
0669708
b1e70cf
f68d656
2570281
 
2f63626
 
 
 
 
10b61cb
 
 
 
 
3e9b351
 
 
38aa581
3e9b351
38aa581
3e9b351
100ee4d
3e9b351
 
100ee4d
38aa581
100ee4d
 
10b61cb
 
ef72cdf
19ec674
100ee4d
ef72cdf
 
7116577
331a033
868f0f3
7116577
42ca8af
331a033
 
 
6f56357
14da95d
 
ef72cdf
331a033
100ee4d
 
b1e70cf
118d6d2
f68d656
118d6d2
f68d656
118d6d2
 
37025f6
 
 
 
118d6d2
f68d656
c81d6d1
f68d656
 
1360ce0
f68d656
5bd7d1d
19ec674
37025f6
1360ce0
 
2f63626
ef72cdf
2f63626
40b6dea
803066e
ef72cdf
803066e
331a033
ef72cdf
991c5a2
ef72cdf
 
331a033
868f0f3
 
 
 
 
 
6f56357
868f0f3
331a033
 
868f0f3
6f56357
ef72cdf
97325b8
1360ce0
 
 
 
 
98178bd
1360ce0
 
 
98178bd
1360ce0
98178bd
1360ce0
98178bd
1360ce0
98178bd
1360ce0
 
 
 
 
 
97325b8
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import gradio as gr
import subprocess,os
from datasets import load_dataset, Audio
import corpora
import ctcalign,graph
from numpy import random


import matplotlib
# Select the Agg backend before pyplot is imported; Agg renders off-screen,
# so no display/GUI toolkit is needed where this app runs.
matplotlib.use('Agg')
import matplotlib.pyplot as plt


def _run_step(cmd, cwd=None):
    """Run one external command, echo its stdout, and fail loudly on error.

    Args:
        cmd: Argument list handed to ``subprocess.run`` (no shell involved).
        cwd: Directory to run the command in; ``None`` keeps the current one.

    Returns:
        The ``subprocess.CompletedProcess`` with captured text output.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (its stderr is printed first).
    """
    result = subprocess.run(cmd, capture_output=True, text=True, cwd=cwd)
    if result.stdout:
        print(result.stdout)
    if result.returncode != 0:
        # Surface the real cause here instead of a confusing follow-up error
        # in a later step.
        print(f"Command {cmd} failed with exit code {result.returncode}:")
        print(result.stderr)
        result.check_returncode()
    return result


def setup():
    """Download and build the REAPER pitch tracker under ``./REAPER/build``.

    The source archive is fetched from GitHub with ``wget``, unpacked, and
    renamed to ``REAPER``; the project is then configured with ``cmake`` and
    compiled with ``make`` inside ``REAPER/build``.

    The process working directory is never changed (each build command gets
    an explicit ``cwd``), so a failing step cannot leave the application
    running from inside the build tree. If ``./REAPER`` already exists the
    download is skipped, which makes the function safe to call repeatedly.

    Raises:
        subprocess.CalledProcessError: If any download or build command
            exits with a non-zero status.
    """
    print('PWD::', os.getcwd())

    repo_dir = "REAPER"
    build_dir = os.path.join(repo_dir, "build")

    # Only fetch the sources once: with an existing REAPER/ directory, `mv`
    # would nest REAPER-master inside it rather than replace it.
    if not os.path.isdir(repo_dir):
        _run_step(["wget", "https://github.com/google/REAPER/archive/refs/heads/master.zip"])
        _run_step(["unzip", "./master.zip"])
        _run_step(["mv", "REAPER-master", "REAPER"])
        _run_step(["rm", "./master.zip"])

    os.makedirs(build_dir, exist_ok=True)
    _run_step(["cmake", ".."], cwd=build_dir)
    _run_step(["make"], cwd=build_dir)

    # Diagnostic listing of the application directory after the build.
    listing = subprocess.run(["ls", "-la"], capture_output=True, text=True)
    print('LS::', listing.stdout)

                        
#print('about to setup')
# NOTE: this runs at import time, so REAPER is downloaded and built before
# any of the UI below is defined.
setup()

def load_lang(langname):
    """Load the preview table and the CTC aligner for the chosen language.

    Args:
        langname: Language name as offered by the dropdown, either
            ``"Icelandic"`` or ``"Faroese"``.

    Returns:
        A tuple ``(preview, lang_aligner)``: ``preview`` is the first 10
        entries of the corpus table (converted via ``.data.to_pandas()``)
        with the ``audio``, ``speaker_id`` and ``duration`` columns removed;
        ``lang_aligner`` is the object built by ``ctcalign.aligner`` for the
        language's wav2vec2 model.

    Raises:
        ValueError: If ``langname`` is not one of the supported languages
            (including ``None``, e.g. when nothing is selected).
    """
    if langname == "Icelandic":
        corpus = corpora.ds_i
        model_path = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
    elif langname == "Faroese":
        corpus = corpora.ds_f
        model_path = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
    else:
        # Without this branch the names below would be unbound and the
        # failure would show up as an opaque UnboundLocalError.
        raise ValueError(
            f"Unsupported language: {langname!r} (expected 'Icelandic' or 'Faroese')"
        )

    model_word_separator = '|'
    model_blank_token = '[PAD]'
    lang_aligner = ctcalign.aligner(model_path, model_word_separator, model_blank_token)

    table = corpus.data.to_pandas()
    table = table.drop(columns=['audio', 'speaker_id', 'duration'])
    return (table[:10], lang_aligner)


def f1(langname, lang_aligner):
    """Pick one random utterance of the chosen language and graph it.

    Args:
        langname: Language name as offered by the dropdown, either
            ``"Icelandic"`` or ``"Faroese"``.
        lang_aligner: Aligner object for that language, as returned by
            ``load_lang``; passed through to ``graph.align_and_graph``.

    Returns:
        Whatever ``graph.align_and_graph`` returns for the sampled
        utterance's audio path and normalized transcript.

    Raises:
        ValueError: If ``langname`` is not one of the supported languages
            (including ``None``, e.g. when nothing is selected yet).
    """
    if langname == "Icelandic":
        ds = corpora.ds_i
    elif langname == "Faroese":
        ds = corpora.ds_f
    else:
        # Without this branch `ds` would be unbound below.
        raise ValueError(
            f"Unsupported language: {langname!r} (expected 'Icelandic' or 'Faroese')"
        )

    maxdat = len(ds)

    # numpy's randint excludes the upper bound, so passing the full length
    # yields indices 0 .. maxdat-1 and lets the last utterance be chosen too.
    index = int(random.randint(maxdat))
    sample = ds.select([index])

    sound_path = sample['audio'][0]['path']  # ['array'] would be the audio data itself
    transcript = sample['normalized_text'][0]
    return graph.align_and_graph(sound_path, transcript, lang_aligner)


# Top-level Gradio layout. Components appear on the page in the order they
# are created inside the `with bl:` context.
bl = gr.Blocks()

with bl:

    # Language selector; its value is the `langname` argument of both callbacks.
    lloadr = gr.Dropdown(["Faroese", "Icelandic"], label="Select a language")#, info="Loading the dataset takes some time")

    # Session state holding the aligner returned by load_lang (empty until a
    # language has been selected).
    align_func = gr.State()#value=ctcalign.aligner(model_path="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h",model_word_separator = '|',model_blank_token = '[PAD]'))
    
    with gr.Row():
        #invisidata = gr.DataFrame(interactive=False, visible=False)
        # Read-only table showing the preview rows returned by load_lang.
        databrowser = gr.DataFrame(wrap=True, max_rows=50, interactive=False, overflow_row_behaviour='paginate')


    
    btn1 = gr.Button(value="The random prosody button")
    btn1.style(full_width=False, size="sm")

    # Output area for the figure produced by f1.
    pl1 = gr.Plot()

    # Button click: run f1 with the selected language and the stored aligner,
    # and show the result in the plot.
    btn1.click(f1, [lloadr,align_func], pl1)



    
    # Language change: run load_lang and fill both the table and the aligner state.
    lloadr.change(load_lang,lloadr,[databrowser,align_func])


    # Static "about" text rendered below the controls.
    gr.Markdown(
        """
    # ABOUT
    This is a work-in-progress demo.
    
    Icelandic uses the [samromur-asr](https://huggingface.co/datasets/language-and-voice-lab/samromur_asr) corpus, and Faroese uses [ravnursson-asr](https://huggingface.co/datasets/carlosdanielhernandezmena/ravnursson_asr).
    
    After you select a language, a few example sentences from the corpus are displayed.
    
    Click the button to view time-aligned prosody information for a random sentence - this could be any sentence, not only one of the ones shown above.
    
    [ABOUT REAPER PITCH TRACKING - TODO]

    [ABOUT RMSE INTENSITY - TODO]

    [ABOUT CTC ALIGNMENT - TODO]

    [email protected] / https://github.com/catiR/
    """
    )


# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    bl.launch()