Vinay15 committed (verified)
Commit dedcf07 · Parent(s): 64adb3b

Update app.py

Files changed (1): app.py (+14, -22)
app.py CHANGED
@@ -1,7 +1,3 @@
-# Step 1: Install Gradio
-#pip install gradio
-
-# Step 2: Import necessary libraries
 import gradio as gr
 import json
 import torch
@@ -10,44 +6,38 @@ from datasets import load_dataset
 import soundfile as sf
 import io
 
-# Step 3: Load the models and the pronunciation dictionary
+# Load models and pronunciation dictionary
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 
-# Load pronunciation dictionary from JSON file
 with open("pronunciation_dict.json", "r") as f:
     pronunciation_dict = json.load(f)
 
-# Function to preprocess the input text
+# Preprocess the text
 def preprocess_text(text):
     for term, phonetic in pronunciation_dict.items():
         text = text.replace(term, phonetic)
     return text
 
-# Step 4: Define the TTS function
+# Text-to-Speech function
 def text_to_speech(input_text):
-    # Preprocess the text
     processed_text = preprocess_text(input_text)
-
-    # Convert the processed text to model inputs
     inputs = processor(text=processed_text, return_tensors="pt")
-
-    # Load xvector embeddings from dataset for speaker voice characteristics
+
     embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
     speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-
-    # Generate speech using the model and vocoder
+
     speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
-
-    # Convert generated speech to an in-memory buffer
+
+    # Prepare audio as BytesIO
     audio_buffer = io.BytesIO()
     sf.write(audio_buffer, speech.numpy(), samplerate=16000, format='WAV')
    audio_buffer.seek(0)
-
+
     return audio_buffer
 
-# Step 5: Create Gradio interface with examples
+# Define examples
 examples = [
     "We are using API for authentication.",
     "CUDA and TensorFlow work together for deep learning models.",
@@ -56,14 +46,16 @@ examples = [
     "Natural language processing techniques are widely adopted."
 ]
 
+# Create Gradio interface
 iface = gr.Interface(
     fn=text_to_speech,
     inputs="text",
     outputs="audio",
     title="Fine-tuning TTS for English with a Focus on Technical Vocabulary Using SpeechT5",
     description="Enter text with technical jargon for TTS conversion.",
-    examples=examples  # Adding preset examples for users
+    examples=examples,
+    cache_examples=False  # Disable caching for now
 )
 
-# Step 6: Launch the app
-iface.launch(share=True)
+# Launch interface
+iface.launch()
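
For reference, a minimal sketch (not part of this commit) of the dictionary-based substitution that preprocess_text performs, assuming pronunciation_dict.json maps technical terms to phonetic spellings. The entries below are illustrative only, not the repository's actual file:

# Illustrative stand-in for pronunciation_dict.json (assumed contents)
pronunciation_dict = {
    "API": "A P I",
    "CUDA": "KOO-dah"
}

def preprocess_text(text):
    # Same plain string-replacement strategy used in app.py
    for term, phonetic in pronunciation_dict.items():
        text = text.replace(term, phonetic)
    return text

print(preprocess_text("We are using API for authentication."))
# Prints: We are using A P I for authentication.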