Vinay15 committed on
Commit
47c1a8b
·
verified ·
1 Parent(s): dedcf07

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -36
app.py CHANGED
@@ -4,58 +4,49 @@ import torch
4
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
5
  from datasets import load_dataset
6
  import soundfile as sf
7
- import io
8
 
9
- # Load models and pronunciation dictionary
10
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
11
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
12
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
13
 
 
14
  with open("pronunciation_dict.json", "r") as f:
15
  pronunciation_dict = json.load(f)
16
 
17
- # Preprocess the text
18
  def preprocess_text(text):
19
  for term, phonetic in pronunciation_dict.items():
20
  text = text.replace(term, phonetic)
21
  return text
22
 
23
- # Text-to-Speech function
24
  def text_to_speech(input_text):
 
25
  processed_text = preprocess_text(input_text)
 
 
26
  inputs = processor(text=processed_text, return_tensors="pt")
27
-
 
28
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
29
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
30
-
 
31
  speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
32
-
33
- # Prepare audio as BytesIO
34
- audio_buffer = io.BytesIO()
35
- sf.write(audio_buffer, speech.numpy(), samplerate=16000, format='WAV')
36
- audio_buffer.seek(0)
37
-
38
- return audio_buffer
39
-
40
- # Define examples
41
- examples = [
42
- "We are using API for authentication.",
43
- "CUDA and TensorFlow work together for deep learning models.",
44
- "The database uses NoSQL and supports JSON for data storage.",
45
- "Machine learning and artificial intelligence are advancing fast.",
46
- "Natural language processing techniques are widely adopted."
47
- ]
48
-
49
- # Create Gradio interface
50
- iface = gr.Interface(
51
- fn=text_to_speech,
52
- inputs="text",
53
- outputs="audio",
54
- title="Fine-tuning TTS for English with a Focus on Technical Vocabulary Using SpeechT5",
55
- description="Enter text with technical jargon for TTS conversion.",
56
- examples=examples,
57
- cache_examples=False # Disable caching for now
58
- )
59
-
60
- # Launch interface
61
- iface.launch()
 
4
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
5
  from datasets import load_dataset
6
  import soundfile as sf
 
7
 
8
+ # Step 3: Load the models and the pronunciation dictionary
9
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
10
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
11
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
12
 
13
+ # Load pronunciation dictionary from JSON file
14
  with open("pronunciation_dict.json", "r") as f:
15
  pronunciation_dict = json.load(f)
16
 
17
+ # Function to preprocess the input text
18
  def preprocess_text(text):
19
  for term, phonetic in pronunciation_dict.items():
20
  text = text.replace(term, phonetic)
21
  return text
22
 
23
+ # Step 4: Define the TTS function
24
  def text_to_speech(input_text):
25
+ # Preprocess the text
26
  processed_text = preprocess_text(input_text)
27
+
28
+ # Convert the processed text to model inputs
29
  inputs = processor(text=processed_text, return_tensors="pt")
30
+
31
+ # Load xvector embeddings from dataset for speaker voice characteristics
32
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
33
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
34
+
35
+ # Generate speech using the model and vocoder
36
  speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
37
+
38
+ # Save the generated speech as a .wav file
39
+ output_file = "speech_output.wav"
40
+ sf.write(output_file, speech.numpy(), samplerate=16000)
41
+
42
+ return output_file
43
+
44
+ # Step 5: Create Gradio interface
45
# Web UI: one text box in, one audio player out, backed by text_to_speech().
iface = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
    title="Text-to-Speech (TTS) Application",
    description="Enter text with technical jargon for TTS conversion.",
)

# Start the Gradio server. share=True asks Gradio for a public share link
# (see the Gradio launch() documentation).
iface.launch(share=True)