Spaces:

matteocirca
/

ASR-app-pro

Sleeping

File size: 1,780 Bytes

8a0a2d0
3fb48f1
8a0a2d0
 
3fb48f1
ce200fa
8a0a2d0
 
e8aa27e
3fb48f1
8a0a2d0
 
 
 
 
 
 
 
 
 
 
 
 
 
3fb48f1
8a0a2d0
 
 
 
 
 
 
 
 
1dceb84
8a0a2d0
 
 
 
 
 
 
 
 
3fb48f1
 
 
8a0a2d0
 
3fb48f1

from transformers import pipeline, WhisperModel
import gradio as gr
import pandas as pd
import string

# Cache of the most recent transcription, shared by every request.
# `segments` holds the last pipeline output; `current_audio` is the input
# that produced it (None until something has been transcribed).
segments = {}
current_audio = None

# Speech-recognition pipeline, built once at import time.
# return_timestamps="word" requests per-word timestamps in the output.
pipe = pipeline(
    model="matteocirca/whisper-small-it-2",
    return_timestamps="word",
)

def _normalize_token(text):
    """Return *text* lower-cased with ASCII punctuation and spaces removed."""
    return text.translate(str.maketrans("", "", string.punctuation + " ")).lower()


def audio2segments(audio, word):
    """Transcribe *audio* and list the timestamps at which *word* occurs.

    The transcription is cached in the module-level ``segments`` /
    ``current_audio`` globals, so asking for another word on the same
    audio value does not run the model again.

    Args:
        audio: The audio input handed to ``pipe`` (the UI passes a file
            path), or ``None`` when no audio was supplied.
        word: The word to look for. It is compared case-insensitively and
            ignoring ASCII punctuation and spaces, the same way each
            transcribed chunk is normalized.

    Returns:
        A ``(transcript, html)`` tuple. ``html`` is a table with one row per
        occurrence (numbered from 0) showing its start and end timestamp,
        or a short message when the audio or the word is missing.
    """
    global segments, current_audio

    if audio is None:
        # Nothing to transcribe; handing None to the pipeline would fail.
        return "", "<html><h1>No Audio inserted!</h1></html>"

    # Only re-run the model when the input differs from the cached one.
    if audio != current_audio:
        segments = pipe(audio)
        current_audio = audio

    transcript = segments["text"]

    # Normalize the query like the chunks, otherwise "Ciao" or " ciao"
    # could never match the lower-cased, punctuation-free chunk text.
    target = _normalize_token(word) if word else ""
    if not target:
        return transcript, "<html><h1>No Word inserted!</h1></html>"

    matches = [
        chunk["timestamp"]
        for chunk in segments["chunks"]
        if _normalize_token(chunk["text"]) == target
    ]

    rows = "".join(
        f"<tr><td>{index}</td><td>{stamp[0]}</td><td>{stamp[1]}</td></tr>"
        for index, stamp in enumerate(matches)
    )
    table = (
        "<table><thead><tr><th>Occurrence n°</th><th>Start</th><th>End</th></tr></thead><tbody>"
        + rows
        + " </tbody></table>"
    )
    return transcript, table

def clear():
    """Drop the cached transcription held in the module-level globals.

    Both ``segments`` and ``current_audio`` are reset together so that an
    emptied cache is never mistaken for a valid transcription of the
    previously seen audio.
    """
    # Without the global declaration the assignments would only create
    # function-local names and leave the module cache untouched.
    global segments, current_audio
    segments = {}
    current_audio = None
    

# Web UI: an audio input (file upload or microphone, delivered to the
# callback as a file path) plus a free-text word, mapped by audio2segments
# to a plain-text transcript and an HTML block.
iface = gr.Interface(
    audio2segments,
    [
        gr.Audio(type="filepath", sources=["upload", "microphone"]),
        "text",
    ],
    ["text", "html"],
    title="Whisper Small Italian",
    description="Realtime demo for Italian speech recognition using a fine-tuned Whisper small model.",
)

# Start serving the interface.
iface.launch()