KrishGoyani committed
Commit d9d5ae9 · verified · 1 Parent(s): 5987173

Update app.py

Files changed (1)
  1. app.py +16 -13
app.py CHANGED
@@ -11,45 +11,45 @@ from datasets import load_dataset
 import torchaudio
 import os
 
+
+# below is the transcriber pipeline that loads the Whisper model
 transcriber = pipeline(
     "automatic-speech-recognition", model="openai/whisper-small.en", device=device
 )
 
+# converts audio into text
 def transcribe(audio):
     print("Listening to your query")
     result = transcriber(audio)
     return result['text']
 
-api_token = os.getenv('HF_API_TOKEN')
-
+# uses the hosted Inference API of the Llama-3 model to generate a response
 def query(text, model_id="meta-llama/Meta-Llama-3-8B-Instruct"):
     api_url = f"https://api-inference.huggingface.co/models/{model_id}"
     hf_folder = HfFolder()
-    headers = {"Authorization": f"Bearer {api_token}"}
+    headers = {"Authorization": f"Bearer {hf_folder.get_token()}"}
     payload = {"inputs": text}
 
     print(f"Querying...: {text}")
     response = requests.post(api_url, headers=headers, json=payload)
-    print(response.json()[0]['generated_text'])
-    return response.json()[0]['generated_text']
+    print(response.json()[0]['generated_text'][len(text) + 1 :])
+    return response.json()[0]['generated_text'][len(text) + 1 :]
 
 
 
-
+# below loads the text-to-speech model and vocoder
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 
-target_dtype = np.int16
-max_range = np.iinfo(target_dtype).max
 
 
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
 vocoder.to(device)
 
+# converts text to speech
 def tts(text):
     # Process the text
     inputs = processor(text=text, return_tensors="pt")
@@ -69,18 +69,20 @@ def tts(text):
 
     return output_path
 
+# main function that chains the other three functions
 def STT(audio):
     text = transcribe(audio)
     response = query(text)
     audio = tts(response)
     return audio
-
-demo = gr.Interface(
+
+# Gradio interface that serves as the frontend
+stt_gradio = gr.Interface(
     fn=STT,
     inputs=gr.Audio(sources="microphone", type="filepath", label="Speak your question"),
     outputs=gr.Audio(type="filepath", label="Generated response"),
     live=True,
-    title="Audio Question to Audio Answer (Jugadu GPT4-o)",
+    title="Audio Question to Audio Answer(Jugadu GPT4-o)",
     description="Speak a question into the microphone, and the system will generate an audio response.",
     article="""
     This application uses advanced speech processing models to convert spoken questions into spoken answers.
@@ -90,4 +92,5 @@ demo = gr.Interface(
 )
 
 # Launch the interface
-demo.launch(share=True, debug=True)
+stt_gradio.queue()
+stt_gradio.launch(share=True, debug=True)
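Review note: the substantive change in query() is twofold: authentication now uses the token cached by huggingface_hub (via HfFolder) rather than the HF_API_TOKEN environment variable, and the prompt that the Inference API echoes back at the start of generated_text is sliced off before printing and returning. A minimal standalone sketch of the same behavior; the environment-variable fallback and the strip_prompt() helper are illustrative assumptions, not part of this commit:

import os

import requests
from huggingface_hub import HfFolder


def strip_prompt(generated_text: str, prompt: str) -> str:
    # Hypothetical helper: removeprefix() drops the echoed prompt without the
    # off-by-one risk of the commit's fixed len(text) + 1 slice, which assumes
    # exactly one separator character between prompt and completion.
    return generated_text.removeprefix(prompt).lstrip()


def query(text, model_id="meta-llama/Meta-Llama-3-8B-Instruct"):
    api_url = f"https://api-inference.huggingface.co/models/{model_id}"
    # Prefer the cached Hub token; fall back to the env var the commit removed.
    token = HfFolder.get_token() or os.getenv("HF_API_TOKEN")
    headers = {"Authorization": f"Bearer {token}"}
    response = requests.post(api_url, headers=headers, json={"inputs": text})
    response.raise_for_status()  # surface auth or model-loading errors explicitly
    return strip_prompt(response.json()[0]["generated_text"], text)

One thing worth flagging: the diff also deletes the module-level device assignment while device is still referenced by the transcriber pipeline and the .to(device) calls, so device presumably has to be defined earlier in app.py (above line 11, outside this hunk).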
 
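Review note: calling queue() before launch() is a sensible addition, since live=True re-triggers the ASR → LLM → TTS chain on every microphone update, and those long-running requests are better queued than run concurrently. A hedged sketch of the launch block under that assumption; the max_size cap is an illustrative example, not something this commit sets:

# Queue long-running requests instead of handling them all at once;
# max_size=16 is an assumed example value, not part of the commit.
stt_gradio.queue(max_size=16)
stt_gradio.launch(share=True, debug=True)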