aurelben committed on
Commit
f3b0ffc
1 Parent(s): ff78834

initial commit

Browse files
Files changed (2) hide show
  1. app.py +79 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import gradio as gr
4
+ import numpy as np
5
+ import torch
6
+ from groq import Groq
7
+ from transformers import pipeline
8
+ from transformers.utils import is_flash_attn_2_available
9
+ from TTS.api import TTS
10
+
11
# Pick the attention backend once, up front: FlashAttention-2 when the
# transformers helper reports it as available, otherwise PyTorch SDPA.
_attn_kwargs = (
    {"attn_implementation": "flash_attention_2"}
    if is_flash_attn_2_available()
    else {"attn_implementation": "sdpa"}
)

# Speech-to-text pipeline shared by every request: Whisper large-v3 in
# half precision on the first CUDA device.
transcriber = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    torch_dtype=torch.float16,
    device="cuda:0",
    model_kwargs=_attn_kwargs,
)
17
+
18
# Groq API client; the key is read from the GROQ_API_KEY environment variable
# (None is passed through when the variable is not set).
groq_client = Groq(api_key=os.environ.get('GROQ_API_KEY'))
19
+
20
def transcribe(stream, new_chunk):
    """
    Append a new audio chunk to the running stream and transcribe it with Whisper.

    Args:
        stream: Samples accumulated by previous calls, or None on the first call.
        new_chunk: Tuple ``(sampling_rate, samples)`` where ``samples`` is a
            NumPy array (1-D, or 2-D with two channels for stereo).

    Returns:
        Tuple ``(stream, text)``: the updated sample buffer and the
        transcription of the whole buffer.
    """
    sr, y = new_chunk

    # Convert stereo to mono if necessary
    if y.ndim == 2 and y.shape[1] == 2:
        y = y.mean(axis=1)  # Averaging both channels if stereo

    # astype() returns a copy, so the in-place scaling below never touches
    # the caller's array.
    y = y.astype(np.float32)

    # Peak normalization. A silent (all-zero) chunk has a peak of 0 and would
    # turn into NaNs if divided, poisoning the whole accumulated stream; an
    # empty chunk would make np.max raise. Leave such chunks unscaled.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y
    return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"]
40
+
41
def autocomplete(text):
    """
    Send the text to Gemma through the Groq chat API and return the reply.

    Args:
        text: The user message to forward to the model.

    Returns:
        The message content of the first completion choice, or an empty
        string when ``text`` is None, empty or only whitespace (no request
        is sent in that case).
    """
    # Guard clause: nothing to complete, so skip the API call and return a
    # well-defined value instead of falling through with no response.
    if not text or not text.strip():
        return ""

    response = groq_client.chat.completions.create(
        model='gemma-7b-it',
        messages=[{"role": "system", "content": "You are a friendly assistant named Gemma."},
                  {"role": "user", "content": text}]
    )
    return response.choices[0].message.content
53
+
54
# Lazily created TTS model, shared across calls so it is loaded only once.
_tts_model = None


def _get_tts():
    """Return the shared TTS model, loading it onto the GPU on first use."""
    global _tts_model
    if _tts_model is None:
        _tts_model = TTS(model_name="tts_models/fra/fairseq/vits").to("cuda")
    return _tts_model


def process_audio(input_audio, new_chunk):
    """
    Process the audio input by transcribing it and completing the sentences.

    The accumulated stream is returned so the Gradio interface can pass it
    back in as state on the next chunk.

    Args:
        input_audio: Audio samples accumulated so far (None on the first call).
        new_chunk: The newest ``(sampling_rate, samples)`` chunk.

    Returns:
        Tuple ``(stream, text)``: the updated sample buffer and the model's
        reply to the transcription.

    Side effects:
        Writes the synthesized reply to ``output.wav`` when the reply is
        non-empty, and prints the transcription and reply to stdout.
    """
    stream, transcription = transcribe(input_audio, new_chunk)
    text = autocomplete(transcription)

    # Only synthesize when there is something to say; the model itself is
    # reused across calls instead of being rebuilt for every streamed chunk.
    if text:
        _get_tts().tts_to_file(text, file_path="output.wav")

    # NOTE(review): the original built a gr.Audio(interactive=False,
    # autoplay=True) component here and discarded it. It was never returned
    # or wired to an output, so it has been dropped. Playing output.wav
    # would need an audio output on the interface.
    print(transcription, text)
    return stream, text
67
+
68
+
69
# Streaming microphone UI: the "state" input/output pair carries the
# accumulated audio buffer between successive calls to process_audio, and the
# model's reply is rendered as Markdown.
demo = gr.Interface(
    fn=process_audio,
    inputs=["state", gr.Audio(sources=["microphone"], streaming=True)],
    outputs=["state", gr.Markdown()],
    title="Hey Gemma ☎️",
    # The description names the Whisper checkpoint the transcriber pipeline
    # actually loads (openai/whisper-large-v3).
    description="Powered by [whisper-large-v3](https://huggingface.co/openai/whisper-large-v3), and [gemma-7b-it](https://huggingface.co/google/gemma-7b-it) (via [Groq](https://groq.com/))",
    live=True,
    allow_flagging="never",
)

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio==4.19.2
2
+ groq==0.4.2
3
+ numpy==1.24.4
4
+ torchaudio==2.2.1
5
+ transformers==4.37.2
6
+ tts