Vinay15 committed on
Commit
faabe22
·
verified ·
1 Parent(s): cb82b2b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -21
app.py CHANGED
@@ -1,11 +1,13 @@
1
  import gradio as gr
2
  import json
3
  import torch
 
 
4
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
5
  from datasets import load_dataset
6
  import soundfile as sf
7
 
8
- # Step 3: Load the models and the pronunciation dictionary
9
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
10
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
11
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
@@ -14,39 +16,52 @@ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
14
  with open("pronunciation_dict.json", "r") as f:
15
  pronunciation_dict = json.load(f)
16
 
17
- # Function to preprocess the input text
18
  def preprocess_text(text):
 
 
19
  for term, phonetic in pronunciation_dict.items():
20
- text = text.replace(term, phonetic)
 
21
  return text
22
 
23
- # Step 4: Define the TTS function
24
  def text_to_speech(input_text):
25
- # Preprocess the text
26
  processed_text = preprocess_text(input_text)
 
 
27
 
28
- # Convert the processed text to model inputs
29
- inputs = processor(text=processed_text, return_tensors="pt")
30
-
31
- # Load xvector embeddings from dataset for speaker voice characteristics
32
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
33
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 
 
34
 
35
- # Generate speech using the model and vocoder
36
- speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
 
 
 
 
37
 
38
- # Save the generated speech as a .wav file
 
 
 
39
  output_file = "speech_output.wav"
40
- sf.write(output_file, speech.numpy(), samplerate=16000)
41
 
42
  return output_file
43
 
44
- # Step 5: Create Gradio interface
45
- iface = gr.Interface(fn=text_to_speech,
46
- inputs="text",
47
- outputs="audio",
48
- title="Fine-tuning TTS for English with a Focus on Technical Vocabulary Using SpeechT5",
49
- description="Enter text with technical jargon for TTS conversion.")
 
 
50
 
51
- # Step 6: Launch the app
52
- iface.launch(share=True)
 
1
  import gradio as gr
2
  import json
3
  import torch
4
+ import numpy as np
5
+ import re
6
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
7
  from datasets import load_dataset
8
  import soundfile as sf
9
 
10
# Step 1: Load the models and the pronunciation dictionary
# The three SpeechT5 components are loaded once at import time and shared by
# every request handled by the functions below.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Read the term -> phonetic spelling mapping. The encoding is given explicitly
# so the file decodes the same way on every platform instead of depending on
# the locale's default encoding.
with open("pronunciation_dict.json", "r", encoding="utf-8") as f:
    pronunciation_dict = json.load(f)
18
 
19
# Function to preprocess and apply pronunciation dictionary
def preprocess_text(text, pronunciations=None):
    """Upper-case *text* and swap known terms for their phonetic spellings.

    Matching is case-insensitive (both the text and the terms are upper-cased)
    and is done in a single pass over the text, so that:

    * a term only matches as a whole word ("API" no longer fires inside
      "RAPID");
    * text produced by one replacement is never rewritten by another term;
    * when terms overlap, the longest one wins regardless of dictionary order.

    Args:
        text: The raw input string.
        pronunciations: Optional mapping of term -> phonetic spelling. When
            omitted, the module-level ``pronunciation_dict`` is used.

    Returns:
        The upper-cased text with every whole-word term replaced by its
        phonetic spelling (the phonetic spelling is inserted verbatim).
    """
    if pronunciations is None:
        pronunciations = pronunciation_dict

    # Convert text to uppercase for uniformity in matching
    text = text.upper()

    # Normalise the keys the same way as the text. Empty keys are skipped
    # because an empty pattern would match at every position. If two keys
    # collide after upper-casing, the first one listed is kept.
    lookup = {}
    for term, phonetic in pronunciations.items():
        if term:
            lookup.setdefault(term.upper(), phonetic)
    if not lookup:
        return text

    # Longest alternatives first so "MYSQL" is tried before "SQL".
    alternatives = "|".join(
        re.escape(term) for term in sorted(lookup, key=len, reverse=True)
    )
    # Lookarounds are used instead of \b so that terms which begin or end with
    # punctuation (for example "C++") can still match as whole words.
    pattern = re.compile(r"(?<!\w)(?:" + alternatives + r")(?!\w)")

    # A callable replacement keeps backslashes in phonetic strings literal.
    return pattern.sub(lambda match: lookup[match.group(0)], text)
27
 
28
# Step 2: Define the TTS function with sentence segmentation
_SAMPLE_RATE = 16000  # sample rate passed to soundfile when writing the WAV
_SPEAKER_INDEX = 7306  # row of the x-vector dataset used as the voice
_speaker_embeddings_cache = None


def _get_speaker_embeddings():
    """Return the speaker x-vector tensor, loading the dataset only once."""
    global _speaker_embeddings_cache
    if _speaker_embeddings_cache is None:
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        # unsqueeze(0) adds a leading dimension in front of the x-vector.
        _speaker_embeddings_cache = torch.tensor(
            embeddings_dataset[_SPEAKER_INDEX]["xvector"]
        ).unsqueeze(0)
    return _speaker_embeddings_cache


def text_to_speech(input_text):
    """Synthesize *input_text* to speech and write it to a WAV file.

    The text is run through ``preprocess_text``, split into sentence-like
    segments after ``.``, ``!`` or ``?``, and each segment is synthesized
    separately; the resulting waveforms are concatenated in order.

    Args:
        input_text: The text to speak.

    Returns:
        The path of the written WAV file (``"speech_output.wav"``), or
        ``None`` when there is nothing to synthesize (empty or
        whitespace-only input, or no non-blank segment after preprocessing).
    """
    # Nothing to say: bail out before touching the models, rather than
    # failing later when concatenating an empty list of waveforms.
    if not input_text or not input_text.strip():
        return None

    # Preprocess and segment text
    processed_text = preprocess_text(input_text)
    # Split after sentence-ending punctuation on any whitespace run, so that
    # sentences separated by newlines or tabs are segmented as well.
    segments = re.split(r'(?<=[.!?])\s+', processed_text)

    # Load speaker embeddings for consistent voice (cached across calls)
    speaker_embeddings = _get_speaker_embeddings()

    # Generate speech for each non-empty text segment
    audio_outputs = []
    for segment in segments:
        if not segment.strip():
            continue
        inputs = processor(text=segment, return_tensors="pt")
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        audio_outputs.append(speech.numpy())

    # Preprocessing can leave only blank segments (for example when a term
    # maps to an empty phonetic string); np.concatenate rejects an empty list.
    if not audio_outputs:
        return None

    # Concatenate audio from all segments
    complete_speech = np.concatenate(audio_outputs)

    # Save the concatenated speech as a .wav file
    output_file = "speech_output.wav"
    sf.write(output_file, complete_speech, samplerate=_SAMPLE_RATE)

    return output_file
56
 
57
# Step 3: Create Gradio interface
# A single text box feeds text_to_speech; the returned value is shown in an
# audio output component.
_APP_TITLE = "Fine-tuning TTS for Technical Vocabulary"
_APP_DESCRIPTION = (
    "Enter text with technical jargon for TTS conversion. "
    "The model will handle abbreviations and technical terms for better pronunciation."
)

iface = gr.Interface(
    text_to_speech,
    inputs="text",
    outputs="audio",
    title=_APP_TITLE,
    description=_APP_DESCRIPTION,
)

# Step 4: Launch the app
iface.launch(share=True)