Update app.py
app.py CHANGED
@@ -11,45 +11,45 @@ from datasets import load_dataset
 import torchaudio
 import os
 
+
+# Transcriber pipeline that loads the Whisper model
 transcriber = pipeline(
     "automatic-speech-recognition", model="openai/whisper-small.en", device=device
 )
 
+# Convert audio into text
 def transcribe(audio):
     print("Listening to your query")
     result = transcriber(audio)
     return result['text']
 
-
-
+# Query the hosted Inference API for a Llama-3 model and return its response
 def query(text, model_id="meta-llama/Meta-Llama-3-8B-Instruct"):
     api_url = f"https://api-inference.huggingface.co/models/{model_id}"
     hf_folder = HfFolder()
-    headers = {"Authorization": f"Bearer {
+    headers = {"Authorization": f"Bearer {hf_folder.get_token()}"}
     payload = {"inputs": text}
 
     print(f"Querying...: {text}")
     response = requests.post(api_url, headers=headers, json=payload)
-    print(response.json()[0]['generated_text'])
-    return response.json()[0]['generated_text']
+    print(response.json()[0]['generated_text'][len(text) + 1 :])
+    return response.json()[0]['generated_text'][len(text) + 1 :]
 
 
 
-
+# Load the text-to-speech model, vocoder, and speaker embeddings
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 
-target_dtype = np.int16
-max_range = np.iinfo(target_dtype).max
 
 
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
 vocoder.to(device)
 
+# Convert text to speech
 def tts(text):
     # Process the text
     inputs = processor(text=text, return_tensors="pt")
@@ -69,18 +69,20 @@ def tts(text):
 
     return output_path
 
+# Main function that chains the three functions above
 def STT(audio):
     text = transcribe(audio)
     response = query(text)
     audio = tts(response)
     return audio
-
-demo = gr.Interface(
+
+# Gradio interface that serves as the frontend
+stt_gradio = gr.Interface(
     fn=STT,
     inputs=gr.Audio(sources="microphone", type="filepath", label="Speak your question"),
     outputs=gr.Audio(type="filepath", label="Generated response"),
     live=True,
-    title="Audio Question to Audio Answer",
+    title="Audio Question to Audio Answer (Jugadu GPT4-o)",
     description="Speak a question into the microphone, and the system will generate an audio response.",
     article="""
     This application uses advanced speech processing models to convert spoken questions into spoken answers.
@@ -90,4 +92,5 @@ demo = gr.Interface(
 )
 
 # Launch the interface
-
+stt_gradio.queue()
+stt_gradio.launch(share=True, debug=True)
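The hunk header shows `from datasets import load_dataset`, but the rest of the import block (lines 1-10) sits above the diff and is not visible. For orientation, here is a plausible reconstruction inferred purely from the names the visible code uses; the actual top of app.py may differ:

# Hypothetical import block, inferred from usage; not shown in the diff itself.
import requests
import torch
import gradio as gr
from datasets import load_dataset
from huggingface_hub import HfFolder
from transformers import (
    pipeline,
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
)

# `device` is referenced by the pipeline call even though the commit removes
# the definition that sat inside the hunk, so a line like this must already
# exist above line 11.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")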
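The main functional change is in `query`: the response is now sliced with `[len(text) + 1 :]`. The hosted text-generation endpoint echoes the prompt at the start of `generated_text`, so dropping the first `len(text) + 1` characters keeps only the model's continuation (the `+ 1` skips the separator character right after the prompt). A minimal sketch of the idea, with a response payload fabricated for illustration:

text = "What is the capital of France?"
# Typical shape of a text-generation response from the hosted Inference API;
# this payload is made up for illustration.
response_json = [
    {"generated_text": "What is the capital of France?\nThe capital of France is Paris."}
]

# Drop the echoed prompt plus the character that follows it.
answer = response_json[0]["generated_text"][len(text) + 1 :]
print(answer)  # The capital of France is Paris.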
Rendered as a single listing, the updated section of app.py (lines 11-96 of the new file) reads as follows; regions that the diff viewer collapsed are marked with ellipses:

import torchaudio
import os


# Transcriber pipeline that loads the Whisper model
transcriber = pipeline(
    "automatic-speech-recognition", model="openai/whisper-small.en", device=device
)

# Convert audio into text
def transcribe(audio):
    print("Listening to your query")
    result = transcriber(audio)
    return result['text']

# Query the hosted Inference API for a Llama-3 model and return its response
def query(text, model_id="meta-llama/Meta-Llama-3-8B-Instruct"):
    api_url = f"https://api-inference.huggingface.co/models/{model_id}"
    hf_folder = HfFolder()
    headers = {"Authorization": f"Bearer {hf_folder.get_token()}"}
    payload = {"inputs": text}

    print(f"Querying...: {text}")
    response = requests.post(api_url, headers=headers, json=payload)
    # Keep only the text after the echoed prompt
    print(response.json()[0]['generated_text'][len(text) + 1 :])
    return response.json()[0]['generated_text'][len(text) + 1 :]


# Load the text-to-speech model, vocoder, and speaker embeddings
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

model.to(device)
vocoder.to(device)

# Convert text to speech
def tts(text):
    # Process the text
    inputs = processor(text=text, return_tensors="pt")
    ...  # lines 56-68 are collapsed in the diff view
    return output_path

# Main function that chains the three functions above
def STT(audio):
    text = transcribe(audio)
    response = query(text)
    audio = tts(response)
    return audio

# Gradio interface that serves as the frontend
stt_gradio = gr.Interface(
    fn=STT,
    inputs=gr.Audio(sources="microphone", type="filepath", label="Speak your question"),
    outputs=gr.Audio(type="filepath", label="Generated response"),
    live=True,
    title="Audio Question to Audio Answer (Jugadu GPT4-o)",
    description="Speak a question into the microphone, and the system will generate an audio response.",
    article="""
    This application uses advanced speech processing models to convert spoken questions into spoken answers.
    ...
    """,  # lines 89-91 are collapsed in the diff view
)

# Launch the interface
stt_gradio.queue()
stt_gradio.launch(share=True, debug=True)
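The body of `tts` (lines 56-68) is collapsed in the diff, so only its first and last statements are visible. Assuming it follows the standard SpeechT5 recipe, a sketch of what such a body typically looks like is below; `generate_speech` is SpeechT5's real synthesis call, while the `soundfile` write and the `response.wav` file name are guesses, not taken from app.py:

import soundfile as sf  # assumed here for writing the wav file

def tts(text):
    # Process the text
    inputs = processor(text=text, return_tensors="pt")
    # Synthesize a waveform conditioned on the speaker embedding
    speech = model.generate_speech(
        inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
    )
    # SpeechT5 produces 16 kHz audio; write it where Gradio can play it back
    output_path = "response.wav"  # hypothetical file name
    sf.write(output_path, speech.cpu().numpy(), samplerate=16000)
    return output_path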
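A note on the authentication fix: `HfFolder` comes from `huggingface_hub`, and `get_token()` is a classmethod that returns the token cached by `huggingface-cli login`, or None when no login has happened, so instantiating `HfFolder()` first is unnecessary. A slightly more defensive version of the header construction might look like this:

from huggingface_hub import HfFolder

# Returns the locally cached token, or None when the user has never logged in.
token = HfFolder.get_token()
if token is None:
    raise RuntimeError("No Hugging Face token found; run `huggingface-cli login`.")
headers = {"Authorization": f"Bearer {token}"}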