JackismyShephard committed on
Commit
ec39417
·
1 Parent(s): 831b161

add postprocessing option

Browse files
Files changed (2) hide show
  1. app.py +23 -3
  2. requirements.txt +2 -1
app.py CHANGED
@@ -4,13 +4,17 @@ import torch
4
 
5
  from transformers import pipeline
6
 
 
 
7
  checkpoint_finetuned = "JackismyShephard/speecht5_tts-finetuned-nst-da"
8
 
 
 
9
  pipe = pipeline(
10
  "text-to-speech",
11
  model=checkpoint_finetuned,
12
  use_fast=True,
13
- device=0 if torch.cuda.is_available() else "cpu",
14
  )
15
 
16
 
@@ -24,7 +28,7 @@ speaker_embeddings = {
24
  }
25
 
26
 
27
- def predict(text, speaker):
28
  if len(text.strip()) == 0:
29
  return (16000, np.zeros(0))
30
 
@@ -41,7 +45,10 @@ def predict(text, speaker):
41
  forward_params = {"speaker_embeddings": speaker_embedding}
42
  speech = pipe(text, forward_params=forward_params)
43
 
44
- return (speech["sampling_rate"], speech["audio"])
 
 
 
45
 
46
 
47
  def replace_danish_letters(text):
@@ -72,6 +79,17 @@ replacements = [
72
  ("ü", "y"),
73
  ]
74
 
 
 
 
 
 
 
 
 
 
 
 
75
  title = "Danish Speech Synthesis"
76
 
77
  description = (
@@ -84,6 +102,7 @@ examples = [
84
  [
85
  "I sin oprindelige før-kristne form blev alferne sandsynligvis opfattet som en personificering af det land og den natur, der omgav menneskene, dvs. den opdyrkede jord, gården og de naturressourcer, som hørte dertil. De var guddommelige eller delvis guddommelige væsener, der besad magiske kræfter, som de brugte både til fordel og ulempe for menneskene.",
86
  "F23 (Female, 23, Vestjylland)",
 
87
  ],
88
  ]
89
 
@@ -103,6 +122,7 @@ demo = gr.Interface(
103
  ],
104
  value="F23 (Female, 23, Vestjylland)",
105
  ),
 
106
  ],
107
  outputs=[
108
  gr.Audio(label="Generated Speech", type="numpy"),
 
4
 
5
  from transformers import pipeline
6
 
7
+ from resemble_enhance.enhancer.inference import denoise, enhance
8
+
9
  checkpoint_finetuned = "JackismyShephard/speecht5_tts-finetuned-nst-da"
10
 
11
+ device = 0 if torch.cuda.is_available() else "cpu"
12
+
13
  pipe = pipeline(
14
  "text-to-speech",
15
  model=checkpoint_finetuned,
16
  use_fast=True,
17
+ device=device,
18
  )
19
 
20
 
 
28
  }
29
 
30
 
31
+ def predict(text, speaker, post_process):
32
  if len(text.strip()) == 0:
33
  return (16000, np.zeros(0))
34
 
 
45
  forward_params = {"speaker_embeddings": speaker_embedding}
46
  speech = pipe(text, forward_params=forward_params)
47
 
48
+ if post_process:
49
+ return enhance_audio(speech["audio"], speech["sampling_rate"], device)
50
+ else:
51
+ return (speech["sampling_rate"], speech["audio"])
52
 
53
 
54
  def replace_danish_letters(text):
 
79
  ("ü", "y"),
80
  ]
81
 
82
+
83
+ def enhance_audio(waveform, sr, device="cuda"):
84
+ tensor = torch.tensor(waveform).float()
85
+ denoised, new_sr = denoise(tensor, sr, device)
86
+ enhanced, new_sr = enhance(
87
+ denoised, new_sr, device, nfe=2, solver="midpoint", lambd=0.9, tau=0.95
88
+ )
89
+ enhanced_cpu = enhanced.cpu().numpy()
90
+ return new_sr, enhanced_cpu
91
+
92
+
93
  title = "Danish Speech Synthesis"
94
 
95
  description = (
 
102
  [
103
  "I sin oprindelige før-kristne form blev alferne sandsynligvis opfattet som en personificering af det land og den natur, der omgav menneskene, dvs. den opdyrkede jord, gården og de naturressourcer, som hørte dertil. De var guddommelige eller delvis guddommelige væsener, der besad magiske kræfter, som de brugte både til fordel og ulempe for menneskene.",
104
  "F23 (Female, 23, Vestjylland)",
105
+ True,
106
  ],
107
  ]
108
 
 
122
  ],
123
  value="F23 (Female, 23, Vestjylland)",
124
  ),
125
+ gr.Checkbox(label="Enhance audio (takes substantially longer)"),
126
  ],
127
  outputs=[
128
  gr.Audio(label="Generated Speech", type="numpy"),
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  --extra-index-url https://download.pytorch.org/whl/cu113
2
  torch
3
  transformers
4
- sentencepiece
 
 
1
  --extra-index-url https://download.pytorch.org/whl/cu113
2
  torch
3
  transformers
4
+ sentencepiece
5
+ resemble-enhance