capradeepgujaran committed on
Commit
b7effce
·
verified ·
1 Parent(s): bb3964e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -34
app.py CHANGED
@@ -1,52 +1,73 @@
1
  import gradio as gr
2
- from transformers import pipeline
3
  import torch
4
  from TTS.api import TTS
5
-
6
- # Initialize Whisper for speech recognition
7
- asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")
8
 
9
  # Initialize TTS model
10
- # Note: We're using a try-except block to handle potential issues with GPU availability
11
  try:
12
  device = "cuda" if torch.cuda.is_available() else "cpu"
13
- tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
14
  except Exception as e:
15
  print(f"Error initializing TTS model: {e}")
16
  tts = None
17
 
18
- def generate_voiceover(audio_file, emotion):
 
 
 
 
 
 
 
 
 
19
  try:
20
- # Transcribe audio using Whisper
21
- result = asr(audio_file)
22
- transcription = result["text"]
23
-
24
- # Generate voice over with selected emotion
25
  if tts is not None:
26
- tts_audio = tts.tts(text=transcription, speaker_wav="path/to/speaker/reference.wav", language="en")
27
- return (gr.Audio(value=tts_audio, type="numpy"), transcription)
 
 
28
  else:
29
  return (None, "TTS model not available. Check logs for initialization error.")
30
  except Exception as e:
31
- return (None, f"Error: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  # Gradio interface
34
- iface = gr.Interface(
35
- fn=generate_voiceover,
36
- inputs=[
37
- gr.Audio(type="filepath", label="Upload Audio"),
38
- gr.Dropdown(["Happy", "Sad", "Angry", "Neutral"], label="Select Emotion")
39
- ],
40
- outputs=[
41
- gr.Audio(label="Generated Voiceover"),
42
- gr.Textbox(label="Transcription/Error Message")
43
- ],
44
- title="Voice Over Generator with Emotion Control",
45
- description="Upload an audio file, select an emotion, and generate a voice over."
46
- )
47
-
48
- # This line is crucial for Hugging Face Spaces
49
- iface.launch()
50
-
51
- if __name__ == "__main__":
52
- iface.launch()
 
 
1
  import gradio as gr
2
+ from transformers import pipeline, AutoProcessor, MusicgenForConditionalGeneration
3
  import torch
4
  from TTS.api import TTS
5
+ import scipy
 
 
6
 
7
  # Initialize TTS model
 
8
  try:
9
  device = "cuda" if torch.cuda.is_available() else "cpu"
10
+ tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(device)
11
  except Exception as e:
12
  print(f"Error initializing TTS model: {e}")
13
  tts = None
14
 
15
# Load the Musicgen processor and model used for text-to-sound generation.
# If either load fails, the error is printed and BOTH names are set to None,
# so the sound callback can report that the model is unavailable.
try:
    processor, model = (
        AutoProcessor.from_pretrained("facebook/musicgen-small"),
        MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small"),
    )
except Exception as e:
    print(f"Error initializing Musicgen model: {e}")
    processor = model = None
23
+
24
def generate_speech(text, emotion):
    """Synthesize speech for ``text`` with the module-level TTS model.

    Args:
        text: Text to speak (value of the "Text-to-Speech" textbox).
        emotion: Emotion label chosen in the dropdown. It is NOT used: the
            model is called with the text only. The parameter is kept so the
            two-input Gradio click wiring keeps working.

    Returns:
        A ``(audio, message)`` tuple matching the two Gradio outputs.
        ``audio`` is a ``gr.Audio`` carrying ``(sample_rate, waveform)`` on
        success and ``None`` on any failure; ``message`` is a status or error
        string for the message textbox.
    """
    # Function-scope import keeps the block self-contained.
    import numpy as np

    if tts is None:
        return (None, "TTS model not available. Check logs for initialization error.")

    # Reject empty / whitespace-only input up front instead of surfacing an
    # opaque synthesizer error.
    if not text or not text.strip():
        return (None, "Please enter some text for speech generation.")

    try:
        speech = tts.tts(text=text)

        # Gradio's numpy audio format is (sample_rate, ndarray).
        # NOTE(review): tts.tts() appears to return a plain sequence of
        # samples rather than an ndarray -- confirm against the TTS API.
        waveform = np.asarray(speech, dtype=np.float32)

        # Prefer the rate reported by the loaded synthesizer; 22050 (the value
        # previously hard-coded here) is only a fallback.
        synthesizer = getattr(tts, "synthesizer", None)
        sample_rate = getattr(synthesizer, "output_sample_rate", None) or 22050

        return (
            gr.Audio(value=(sample_rate, waveform), type="numpy"),
            "Speech generated successfully",
        )
    except Exception as e:
        # UI boundary: report the failure in the message box instead of
        # crashing the callback.
        return (None, f"Error in speech generation: {str(e)}")
35
+
36
def generate_sound(text):
    """Generate a short audio clip from a text description with Musicgen.

    Args:
        text: Prompt describing the sound (value of the "Text-to-Sound"
            textbox).

    Returns:
        A ``(audio, message)`` tuple matching the two Gradio outputs.
        ``audio`` is a ``gr.Audio`` pointing at a freshly written WAV file on
        success and ``None`` on any failure; ``message`` is a status or error
        string for the message textbox.
    """
    # Function-scope imports keep the block self-contained and avoid relying
    # on ``scipy.io.wavfile`` being reachable through a bare ``import scipy``.
    import os
    import tempfile

    from scipy.io import wavfile

    if processor is None or model is None:
        return (None, "Musicgen model not available. Check logs for initialization error.")

    # Reject empty / whitespace-only prompts before running the model.
    if not text or not text.strip():
        return (None, "Please enter a text description for sound generation.")

    try:
        inputs = processor(
            text=[text],
            padding=True,
            return_tensors="pt",
        )
        audio_values = model.generate(**inputs, max_new_tokens=256)
        sampling_rate = model.config.audio_encoder.sampling_rate

        # First batch item, first channel. ``.cpu()`` is a no-op for tensors
        # already on the CPU and makes ``.numpy()` safe otherwise.
        waveform = audio_values[0, 0].cpu().numpy()

        # Write to a unique temp file per call: a fixed "output.wav" would be
        # overwritten by concurrent requests. The file is not deleted here
        # because Gradio still has to read it after this function returns.
        fd, output_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        wavfile.write(output_path, rate=sampling_rate, data=waveform)

        return (
            gr.Audio(value=output_path, type="filepath"),
            "Sound generated successfully",
        )
    except Exception as e:
        # UI boundary: report the failure in the message box instead of
        # crashing the callback.
        return (None, f"Error in sound generation: {str(e)}")
52
 
53
  # Gradio interface
54
# Two-tab UI: one tab per generator. Each tab has its own input(s), a trigger
# button, an audio player and a status/error textbox.
with gr.Blocks() as iface:
    gr.Markdown("# Text-to-Speech and Text-to-Sound Generation Tool")

    # --- Text-to-Speech tab ---
    with gr.Tab("Text-to-Speech"):
        text_input = gr.Textbox(label="Enter text for speech generation")
        emotion_input = gr.Dropdown(
            choices=["Happy", "Sad", "Angry", "Neutral"],
            label="Select Emotion",
        )
        speech_button = gr.Button("Generate Speech")
        speech_output = gr.Audio(label="Generated Speech")
        speech_message = gr.Textbox(label="Message")

    # --- Text-to-Sound tab ---
    with gr.Tab("Text-to-Sound"):
        sound_input = gr.Textbox(label="Enter text description for sound generation")
        sound_button = gr.Button("Generate Sound")
        sound_output = gr.Audio(label="Generated Sound")
        sound_message = gr.Textbox(label="Message")

    # Wire each button to its callback; both callbacks return
    # an (audio, message) pair.
    speech_button.click(
        fn=generate_speech,
        inputs=[text_input, emotion_input],
        outputs=[speech_output, speech_message],
    )
    sound_button.click(
        fn=generate_sound,
        inputs=[sound_input],
        outputs=[sound_output, sound_message],
    )

# Start the app when the module is executed.
iface.launch()