Barani1-t committed on
Commit
7e3558a
·
1 Parent(s): 9a915aa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -2
app.py CHANGED
@@ -4,6 +4,8 @@ import torch
4
  from datasets import load_dataset
5
  import librosa
6
  from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
 
 
7
 
8
  target_dtype = np.int16
9
  max_range = np.iinfo(target_dtype).max
@@ -23,19 +25,28 @@ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(devic
23
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
24
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
25
 
 
 
 
26
 
27
def translate(audio):
    """Run the module-level ASR pipeline on ``audio`` and return its text.

    The pipeline is called with at most 256 new tokens and with generation
    options that set the language to ``"nl"`` and the task to
    ``"transcribe"``.
    """
    generation_options = {"language": "nl", "task": "transcribe"}
    result = asr_pipe(audio, max_new_tokens=256, generate_kwargs=generation_options)
    return result["text"]
30
 
31
 
32
def synthesise(text):
    """Generate speech for ``text`` with the SpeechT5 model and return it on the CPU.

    The text is encoded by the module-level ``processor`` (padded/truncated to
    600 tokens), the encoding is printed, and ``model.generate_speech`` is
    called with the module-level speaker embeddings and vocoder.
    """
    encoded = processor(
        text=text,
        padding="max_length",
        truncation=True,
        max_length=600,
        return_tensors="pt",
    )
    print(encoded)
    # Move both model inputs to the inference device before generating.
    token_ids = encoded["input_ids"].to(device)
    voice = speaker_embeddings.to(device)
    waveform = model.generate_speech(token_ids, voice, vocoder=vocoder)
    return waveform.cpu()
37
 
38
-
 
 
 
 
 
 
39
  def speech_to_speech_translation(audio):
40
  sampling_rate = 16000
41
  data_array,samplerate = librosa.load(audio)
 
4
  from datasets import load_dataset
5
  import librosa
6
  from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
7
+ from transformers import VitsModel, VitsTokenizer
8
+
9
 
10
  target_dtype = np.int16
11
  max_range = np.iinfo(target_dtype).max
 
25
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
26
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
27
 
28
+ model_mms = VitsModel.from_pretrained("facebook/mms-tts-nld")
29
+ tokenizer_mms = VitsTokenizer.from_pretrained("facebook/mms-tts-nld")
30
+
31
 
32
def translate(audio):
    """Run the module-level ASR pipeline on ``audio`` and return its text.

    The pipeline is called with at most 256 new tokens and with generation
    options that set the language to ``"nl"`` and the task to
    ``"transcribe"``.
    """
    generation_options = {"language": "nl", "task": "transcribe"}
    result = asr_pipe(audio, max_new_tokens=256, generate_kwargs=generation_options)
    return result["text"]
35
 
36
 
37
def synthesise_speechT5(text):
    """Generate speech for ``text`` with the SpeechT5 model and return it on the CPU.

    The text is encoded by the module-level ``processor`` (padded/truncated to
    600 tokens), the encoding is printed, and ``model.generate_speech`` is
    called with the module-level speaker embeddings and vocoder.
    """
    encoded = processor(
        text=text,
        padding="max_length",
        truncation=True,
        max_length=600,
        return_tensors="pt",
    )
    print(encoded)
    # Move both model inputs to the inference device before generating.
    token_ids = encoded["input_ids"].to(device)
    voice = speaker_embeddings.to(device)
    waveform = model.generate_speech(token_ids, voice, vocoder=vocoder)
    return waveform.cpu()
42
 
43
def synthesise(text):
    """Generate speech for ``text`` with the MMS (VITS) text-to-speech model.

    Tokenises ``text`` with the module-level ``tokenizer_mms``, runs
    ``model_mms`` on the resulting ``input_ids`` with gradient tracking
    disabled, and returns the ``"waveform"`` entry of the model output.

    NOTE(review): the waveform presumably keeps a leading batch dimension,
    unlike the 1-D SpeechT5 output - confirm the caller indexes or squeezes
    it before converting to audio samples.
    """
    inputs = tokenizer_mms(text, return_tensors="pt")
    input_ids = inputs["input_ids"]
    with torch.no_grad():
        outputs = model_mms(input_ids)
    # The original ``return speech = outputs["waveform"]`` was a SyntaxError:
    # an assignment cannot appear inside a ``return`` statement.
    return outputs["waveform"]
49
+
50
  def speech_to_speech_translation(audio):
51
  sampling_rate = 16000
52
  data_array,samplerate = librosa.load(audio)