import gradio as gr
# from gradio import ChatMessage
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import numpy as np
import librosa
import random
import json
import os


from huggingface_hub import InferenceClient

hf_token = os.getenv("HF_Token")

# def get_token():
#     with open("credentials.json","r") as f:
#         credentials = json.load(f)
#     return credentials['token']

# hf_token = get_token()


words_to_guess = [
    "elephant",
    "rainbow",
    "mountain",
    "ocean",
    "butterfly",
    "guitar",
    "volcano",
    "chocolate",
    "kangaroo",
    "spaceship",
    "whisper",
    "pyramid",
    "sunflower",
    "unicorn",
    "jungle",
    "diamond",
    "castle",
    "galaxy",
    "wizard",
    "tornado"
]

RANDOM_WORD = random.choice(words_to_guess)
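# Note: the word is picked once at import time, so every session shares the same
# word until the app is restarted.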

client = InferenceClient(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    token=hf_token)

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
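# whisper-tiny.en is the default ASR model loaded at startup; load_model() below
# swaps these globals when a different model is selected in the Settings accordion.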


def chat(audio, chat:list, asr_model:str):
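    """Transcribe the recording, append it to the chat history, and query the LLM.

    Returns the updated message list plus a status banner announcing a win
    (the user said the secret word) or a loss (the bot said it first).
    """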

    status = ""

    if asr_model == "openai/whisper-large-v2":
        transcription = transcribe_whisper_large_v2(audio)
    elif asr_model == "openai/whisper-tiny.en":
        transcription = transcribe_whisper_tiny_en(audio)
    else:
        raise ValueError(f"No Model found with the given choice: {asr_model}")
    
    if RANDOM_WORD in transcription:
        status = f"""# YOU WON !! 🎉🎊
        The Word was: {RANDOM_WORD}
        """
    
    chat.append({'role':'user','content':transcription})
    response = client.chat_completion(
        messages=chat,
        max_tokens=500,
        stream=False,
    ).choices[0].message.content
    chat.append({'role':'assistant','content':response})
    # Case-insensitive check: the model may capitalize the word in its reply.
    if RANDOM_WORD in response.lower():
        status = f"""# YOU LOST !! ❌❌
        The Word was: {RANDOM_WORD}
        """
    return chat, status

def transcribe_whisper_large_v2(audio):
    sr, audio = audio
    # Gradio records int16 PCM by default; scale to float32 in [-1, 1] for Whisper.
    audio = audio.astype(np.float32) / 32768.0
    # Down-mix stereo recordings to mono.
    if audio.ndim > 1 and audio.shape[1] > 1:
        audio = np.mean(audio, axis=1)
    audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
    input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    # Skip special tokens so markers like <|startoftranscript|> don't leak into the text.
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    transcription = processor.tokenizer.normalize(transcription[0])
    return transcription

def transcribe_whisper_tiny_en(audio):
    sr, audio = audio
    # Gradio records int16 PCM by default; scale to float32 in [-1, 1] for Whisper.
    audio = audio.astype(np.float32) / 32768.0
    # Down-mix stereo recordings to mono.
    if audio.ndim > 1 and audio.shape[1] > 1:
        audio = np.mean(audio, axis=1)
    audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
    input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    transcription = processor.tokenizer.normalize(transcription[0])
    return transcription

def load_model(asr_model_choice:str):
    global processor
    global model
    global model_flag

    if asr_model_choice == "openai/whisper-large-v2":
        processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
        model.config.forced_decoder_ids = None
        model_flag = "openai/whisper-large-v2"
    elif asr_model_choice == "openai/whisper-tiny.en":
        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
        model_flag = "openai/whisper-tiny.en"
    else:
        raise ValueError(f"No Model found with the given choice: {asr_model_choice}")

    print("Model Loaded:", model_flag)







# The App
with gr.Blocks() as app:

    gr.Markdown("# VoiceBot Game πŸ•ΉοΈ")
    gr.Markdown("Welcome to VoiceBot πŸ‘‹, here is how it works")
    gr.Markdown("This Bot can only be interacted with through your voice. Press record and say something, after stopping the recoding your audio will be processed directly. You have the option to choose between different models. The model you choose influences the Bot's perfomance to understand what you have said. A better perfomance also comes with longer waiting time. πŸ˜•")
    gr.Markdown("The Game works as follows: The Bot get's an initial word, you have to guess it. You can ask questions. If the bot says the word before you, You Lose! If you say the word first you Win!")
    gr.Markdown("Have fun playing arround πŸŽ‰")
    gr.Markdown("If you have any wishes for models or a general idea, feel free to let me know πŸ™Œ")

    chatbot = gr.Chatbot(
        value=[{
            # lowercase 'system' role so the downstream chat_completion call accepts the message
            'role':'system',
            'content':f"The User tries to guess a word. The User asks you questions about the word and you answer those questions. Try to help the user find the word by giving very short descriptions. THE WORD TO GUESS IS: {RANDOM_WORD}"
        }],
        bubble_full_width=False,
        type="messages"
    )


    audio_input = gr.Audio(
        sources=['microphone'],
        interactive=True,
        scale=8
    )

    status = gr.Markdown()

    with gr.Accordion(label="Settings", open=False):

        asr_model_choice = gr.Radio(
            label="Select ASR Model",
            choices=["openai/whisper-large-v2","openai/whisper-tiny.en"],
            value="openai/whisper-tiny.en"
        )
        asr_model_choice.change(load_model, asr_model_choice)

    # Event listener for when the audio recording stops
    audio_input.stop_recording(fn=chat, inputs=[audio_input, chatbot, asr_model_choice], outputs=[chatbot, status])

app.launch()
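
# To run this locally (a sketch; assumes the dependencies imported above are installed
# and that this file is saved as app.py):
#   export HF_Token=<your Hugging Face access token>
#   python app.py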