Spaces: Runtime error

Commit 11b23f3 · Update app.py
Parent(s): dc433d9

app.py CHANGED
@@ -3,6 +3,8 @@ import librosa
 from transformers import AutoFeatureExtractor, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
 
 
+
+
 def load_and_fix_data(input_file, model_sampling_rate):
     speech, sample_rate = librosa.load(input_file)
     if len(speech.shape) > 1:
@@ -17,21 +19,90 @@ sampling_rate = feature_extractor.sampling_rate
 
 asr = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-spanish")
 
-
-
-
-
+prefix = ''
+model_checkpoint = "hackathon-pln-es/es_text_neutralizer"
+tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
 
-def predict_and_ctc_lm_decode(input_file):
+def postproc(input_sentence, preds):
+    try:
+        preds = preds.replace('De el', 'Del').replace('de el', 'del').replace('  ', ' ')
+        if preds[0].islower():
+            preds = preds.capitalize()
+        preds = preds.replace(' . ', '. ').replace(' , ', ', ')
+
+        # Capitalized (proper) names
+        prev_letter = ''
+        for word in input_sentence.split(' '):
+            if word:
+                if word[0].isupper():
+                    if word.lower() in preds and word != input_sentence.split(' ')[0]:
+                        if prev_letter == '.':
+                            preds = preds.replace('. ' + word.lower() + ' ', '. ' + word + ' ')
+                        else:
+                            if word[-1] == '.':
+                                preds = preds.replace(word.lower(), word)
+                            else:
+                                preds = preds.replace(word.lower() + ' ', word + ' ')
+                prev_letter = word[-1]
+        preds = preds.strip()  # remove trailing space
+    except:
+        pass
+    return preds
+
+model_name = "es/mai/tacotron2-DDC"
 
+def predict_and_ctc_lm_decode(input_file, speaker_idx: str=None):
     speech = load_and_fix_data(input_file, sampling_rate)
     transcribed_text = asr(speech, chunk_length_s=5, stride_length_s=1)
     transcribed_text = transcribed_text["text"]
-
-
-
-
+    inputs = tokenizer([prefix + transcribed_text], return_tensors="pt", padding=True)
+    with torch.no_grad():
+        if first_generation:
+            output_sequence = model.generate(
+                input_ids=inputs["input_ids"].to(device),
+                attention_mask=inputs["attention_mask"].to(device),
+                do_sample=False,  # disable sampling to test if batching affects output
+            )
+        else:
+
+            output_sequence = model.generate(
+                input_ids=inputs["input_ids"].to(device),
+                attention_mask=inputs["attention_mask"].to(device),
+                do_sample=False,
+                num_beams=2,
+                repetition_penalty=2.5,
+                # length_penalty=1.0,
+                early_stopping=True  # disable sampling to test if batching affects output
+            )
+    preds = postproc(transcribed_text,
+                     preds=tokenizer.decode(output_sequence[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+    if len(preds) > MAX_TXT_LEN:
+        text = preds[:MAX_TXT_LEN]
+        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
+    print(text, model_name)
+    # download model
+    model_path, config_path, model_item = manager.download_model(f"tts_models/{model_name}")
+    vocoder_name: Optional[str] = model_item["default_vocoder"]
+    # download vocoder
+    vocoder_path = None
+    vocoder_config_path = None
+    if vocoder_name is not None:
+        vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name)
+    # init synthesizer
+    synthesizer = Synthesizer(
+        model_path, config_path, None, None, vocoder_path, vocoder_config_path,
+    )
+    # synthesize
+    if synthesizer is None:
+        raise NameError("model not found")
+    wavs = synthesizer.tts(preds, speaker_idx)
+    # return output
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+        synthesizer.save_wav(wavs, fp)
+    return fp.name
+
+
 
 
 gr.Interface(
@@ -39,7 +110,7 @@ gr.Interface(
     inputs=[
         gr.inputs.Audio(source="microphone", type="filepath", label="Record your audio")
     ],
-    outputs=
+    outputs=gr.outputs.Audio(label="Output"),
     examples=[["audio1.wav"], ["travel.wav"]],
     title="Generate-Gender-Neutralized-Audios",
     description = "This is a Gradio demo for generating gender neutralized audios. To use it, simply provide an audio input (via microphone or audio recording), which will then be transcribed and gender-neutralized using a pre-trained models. Finally, with the help of Coqui's TTS model, gender neutralised audio is generated.",
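
Note: the added lines reference several names that are never defined in the hunks shown above (torch, device, first_generation, MAX_TXT_LEN, Optional, tempfile, manager, Synthesizer), which is consistent with the Space's "Runtime error" status. Below is a minimal sketch of the module-level setup those lines appear to assume; it is not part of this commit, and the concrete values (device choice, character limit, beam-search flag) and the Coqui TTS import paths for the model manager and synthesizer are assumptions.

# Hypothetical setup sketch (not in commit 11b23f3): definitions the added code seems to expect.
import tempfile
from typing import Optional

import torch
from TTS.utils.manage import ModelManager        # Coqui TTS model catalog / downloader
from TTS.utils.synthesizer import Synthesizer    # Coqui TTS synthesis wrapper

device = "cuda" if torch.cuda.is_available() else "cpu"
MAX_TXT_LEN = 100          # placeholder limit used to truncate the text sent to TTS
first_generation = False   # placeholder flag selecting greedy vs. beam-search generation
manager = ModelManager()   # resolves names such as f"tts_models/{model_name}" to local files
# model.to(device)         # generate() moves its inputs to `device`, so the seq2seq model must live there too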
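
For reference, the text-neutralization step added in this commit can be exercised on its own. The sketch below is an illustration rather than code from the Space: it loads the same hackathon-pln-es/es_text_neutralizer checkpoint and reuses the same beam-search settings on a made-up Spanish sentence.

# Standalone illustration of the neutralizer; the input sentence is hypothetical.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

checkpoint = "hackathon-pln-es/es_text_neutralizer"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

text = "Los ingenieros presentaron su proyecto."  # hypothetical ASR transcription
inputs = tokenizer([text], return_tensors="pt", padding=True)
output = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    do_sample=False,
    num_beams=2,
    repetition_penalty=2.5,
    early_stopping=True,
)
print(tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))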