File size: 1,780 Bytes
8a0a2d0
3fb48f1
8a0a2d0
 
3fb48f1
ce200fa
8a0a2d0
 
e8aa27e
3fb48f1
8a0a2d0
 
 
 
 
 
 
 
 
 
 
 
 
 
3fb48f1
8a0a2d0
 
 
 
 
 
 
 
 
1dceb84
8a0a2d0
 
 
 
 
 
 
 
 
3fb48f1
 
 
8a0a2d0
 
3fb48f1
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from transformers import pipeline, WhisperModel
import gradio as gr
import pandas as pd
import string

# Cache of the most recent transcription: ``segments`` holds the last result
# returned by ``pipe`` and ``current_audio`` the audio input it was computed
# from, so repeated queries on the same audio can skip the model.
segments = {}
current_audio = None

# Speech-recognition pipeline, built once at import time. It is configured
# with ``return_timestamps="word"`` so each result chunk carries a timestamp.
pipe = pipeline(
    model="matteocirca/whisper-small-it-2",
    return_timestamps="word",
)

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

_NO_WORD_HTML = "<html><h1>No Word inserted!</h1></html>"
_NO_AUDIO_HTML = "<html><h1>No Audio provided!</h1></html>"


def _normalize_token(text):
    """Return *text* lowercased with ASCII punctuation and spaces removed."""
    return text.translate(_PUNCTUATION_TABLE).replace(" ", "").lower()


def _render_occurrences(timestamps):
    """Build the HTML table with one row per timestamp pair in *timestamps*."""
    rows = "".join(
        f"<tr><td>{index}</td><td>{stamp[0]}</td><td>{stamp[1]}</td></tr>"
        for index, stamp in enumerate(timestamps)
    )
    return (
        "<table><thead><tr><th>Occurrence n°</th><th>Start</th><th>End</th>"
        "</tr></thead><tbody>"
        f"{rows} </tbody></table>"
    )


def audio2segments(audio, word):
    """Transcribe *audio* and list the timestamps at which *word* is spoken.

    The transcription is cached in the module-level ``segments`` /
    ``current_audio`` pair, so the model only runs when *audio* differs from
    the previously transcribed input.

    Args:
        audio: The audio input handed to ``pipe``, or ``None`` when no audio
            was supplied.
        word: The word to look for. It is compared case-insensitively and
            with ASCII punctuation and spaces ignored, the same way the
            transcribed chunks are normalised.

    Returns:
        A ``(text, html)`` tuple. ``text`` is the full transcription (empty
        when there is no audio). ``html`` is an occurrence table, or a
        heading explaining that the word or the audio is missing.
    """
    global segments, current_audio

    # Normalise the query like the chunk text; otherwise a capitalised or
    # punctuated query could never match the lowercased, stripped chunks.
    query = _normalize_token(word) if word else ""

    # Without audio there is nothing to transcribe; never call the model.
    if audio is None:
        return "", (_NO_AUDIO_HTML if query else _NO_WORD_HTML)

    if audio != current_audio:
        segments = pipe(audio)
        current_audio = audio

    if not query:
        return segments["text"], _NO_WORD_HTML

    matches = [
        chunk["timestamp"]
        for chunk in segments["chunks"]
        if _normalize_token(chunk["text"]) == query
    ]
    return segments["text"], _render_occurrences(matches)

def clear():
    """Reset the cached transcription held in the module-level globals.

    Both ``segments`` and ``current_audio`` are reset together so the cache
    never claims to hold a transcription for an audio input it has dropped.
    """
    # Without ``global`` the assignments would only create function locals
    # and leave the module-level cache untouched.
    global segments, current_audio
    segments = {}
    current_audio = None
    

# Audio component accepting an upload or a microphone recording, configured
# with ``type="filepath"``.
_audio_input = gr.Audio(type="filepath", sources=["upload", "microphone"])

# Interface wiring: (audio, word) -> (transcription text, HTML output).
iface = gr.Interface(
    title="Whisper Small Italian",
    description="Realtime demo for Italian speech recognition using a fine-tuned Whisper small model.",
    fn=audio2segments,
    inputs=[_audio_input, "text"],
    outputs=["text", "html"],
)

iface.launch()