# voice-ai / app.py
# Streaming speech-to-text demo (Gradio UI + Google Cloud Speech-to-Text).
import os
import json
import tempfile
import gradio as gr
from google.cloud import speech
from microphone import MicrophoneStream
from utils import listen_print_loop
import pyaudio
def list_audio_devices() -> None:
    """Print the name of every audio device visible to PyAudio.

    Debug aid for checking which input device the microphone stream
    will capture from. Prints one line per device; returns None.
    """
    audio = pyaudio.PyAudio()
    try:
        for i in range(audio.get_device_count()):
            device_info = audio.get_device_info_by_index(i)
            print(f"Device {i}: {device_info['name']}")
    finally:
        # PyAudio holds native PortAudio resources; the original leaked
        # the instance — release it even if a device query raises.
        audio.terminate()
def get_credentials() -> str:
    """Write the service-account JSON from the ``GOOGLE`` env var to a temp file.

    Google client libraries expect ``GOOGLE_APPLICATION_CREDENTIALS`` to point
    at a file on disk, so the JSON kept in the environment is materialized into
    a NamedTemporaryFile (``delete=False`` keeps it alive after the handle
    closes).

    Returns:
        Path of the temporary ``.json`` credentials file.

    Raises:
        ValueError: if the ``GOOGLE`` environment variable is not set.
    """
    creds_json_str = os.getenv("GOOGLE")
    if creds_json_str is None:
        # Fixed: the message previously named a different variable
        # (GOOGLE_APPLICATION_CREDENTIALS_JSON) than the one read above.
        raise ValueError("GOOGLE credentials JSON not found in environment")
    # create a temporary file holding the raw JSON payload
    with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".json") as temp:
        temp.write(creds_json_str)  # write in json format
        temp_filename = temp.name
    return temp_filename
# Point the Google client libraries at the materialized credentials file.
# Must run before speech.SpeechClient() is constructed below.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = get_credentials()

# Audio recording parameters
RATE = 16000  # sample rate in Hz, matched by sample_rate_hertz below
CHUNK = int(RATE / 10)  # 100ms of audio per buffer
LANGUAGE = "id-ID"  # recognition language code (Indonesian)

# Client and request configs are built once at import time and shared
# by every call to main().
transcribe_client = speech.SpeechClient()
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=RATE,
    language_code=LANGUAGE,
)
# interim_results=True asks the API to stream partial hypotheses while
# the user is still speaking, not only finalized transcripts.
streaming_config = speech.StreamingRecognitionConfig(
    config=config, interim_results=True
)
async def main(audio) -> None:
    """Gradio handler: stream server-side microphone audio to Google STT.

    NOTE(review): the ``audio`` argument delivered by the Gradio component is
    ignored — audio is captured from the *server's* microphone through
    MicrophoneStream. Confirm this is intended for the deployment target
    (it will not hear the browser user's microphone).
    """
    print("Streaming started ...")
    # Debug aid. list_audio_devices() prints the devices itself and returns
    # None; the original wrapped it in print(), emitting a stray "None" line.
    list_audio_devices()
    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        # Lazily wrap each captured chunk in a streaming request; the
        # generator is consumed by the API client as audio arrives.
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in audio_generator
        )
        responses = transcribe_client.streaming_recognize(streaming_config, requests)
        return await listen_print_loop(responses)
# Gradio UI: live microphone input in, transcription (and audio) out.
# NOTE(review): two output components are declared but main() returns the
# single value of listen_print_loop(...) — confirm that helper yields a
# (text, audio) pair, otherwise the Audio output is never populated.
demo = gr.Interface(
    fn=main,
    inputs=[
        gr.Audio(sources="microphone", streaming=True, label="Input Speech")
    ],
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Audio(label="Audio")
    ],
    live=True)

if __name__ == "__main__":
    demo.launch()