File size: 1,620 Bytes
483362e
 
 
8dbb96b
 
d311eed
8dbb96b
 
 
1e61fa7
6b4a273
8dbb96b
 
 
 
1e61fa7
6b4a273
8dbb96b
 
483362e
 
0cc869c
 
 
 
d311eed
 
 
 
 
483362e
 
 
84d64d1
0cc869c
d311eed
 
0cc869c
483362e
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from transformers import pipeline
import gradio as gr

# Two Whisper pipelines are loaded side by side so their transcriptions can be
# compared: the Swedish checkpoint and the stock OpenAI small checkpoint.
pipe_fine = pipeline(
    model="zeihers-mart/whisper-small-swedish-basic",
    device_map="auto",
)
pipe_raw = pipeline(model="openai/whisper-small", device_map="auto")

# Sentiment classifier that is run on the transcribed text.
sa = pipeline(
    "sentiment-analysis",
    model="marma/bert-base-swedish-cased-sentiment",
)

# Pin both Whisper pipelines to Swedish transcription by forcing the decoder
# prompt ids, instead of leaving language/task selection to the model.
for _asr in (pipe_fine, pipe_raw):
    _asr.model.config.forced_decoder_ids = _asr.tokenizer.get_decoder_prompt_ids(
        language="sv",
        task="transcribe",
    )
del _asr  # keep the module namespace free of the loop variable

def transcribe(audio):
    """Transcribe audio with both Whisper pipelines and pick a sentiment image.

    Args:
        audio: Audio input handed unchanged to ``pipe_fine`` and ``pipe_raw``
            (the Gradio interface in this file supplies a file path).

    Returns:
        A 3-tuple ``(text_sv, text_raw, path)``: the text produced by
        ``pipe_fine``, the text produced by ``pipe_raw``, and the URL of a
        smiley image. The sad smiley is used when the sentiment label of
        ``text_sv`` is ``"NEGATIVE"``; the happy one is used otherwise.
    """
    text_sv = pipe_fine(audio)["text"]
    print(f"Audio transcribed: {text_sv}")
    text_raw = pipe_raw(audio)["text"]
    # NOTE: the log label says "translated", but pipe_raw is configured for
    # the "transcribe" task elsewhere in this file.
    print(f"Text translated: {text_raw}")

    # A text-classification pipeline called on a single string returns a list
    # holding one prediction dict, so unwrap it before reading the label.
    # Indexing the list directly with "label" raises TypeError.
    prediction = sa(text_sv)
    if isinstance(prediction, (list, tuple)):
        prediction = prediction[0]
    sentiment = prediction["label"]

    # Default to the happy smiley; switch only on an explicit negative label.
    path = "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e0/SNice.svg/1200px-SNice.svg.png"
    if sentiment == "NEGATIVE":
        path = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/42/Sad_smiley_yellow_simple.svg/2048px-Sad_smiley_yellow_simple.svg.png"
    return text_sv, text_raw, path

# Web UI: record from the microphone, hand the recording to `transcribe` as a
# file path, and show both transcriptions plus the sentiment image.
_mic_input = gr.Audio(sources=["microphone"], type="filepath")
_result_outputs = [
    gr.Textbox(label="Fine-tuned transcription"),
    gr.Textbox(label="Whisper transcription"),
    gr.Image(
        label="Sentiment from Fine-tuned transcription",
        width=100,
        height=100,
    ),
]

iface = gr.Interface(
    fn=transcribe,
    inputs=_mic_input,
    outputs=_result_outputs,
    title="Finetuned Whisper Swedish Small",
    description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model.",
)

# Start the Gradio server as soon as the script runs.
iface.launch()