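# Gradio demo: translate French text or speech to English, with optional
# English text-to-speech output.
# Chain: Whisper ASR (French) -> Opus-MT translation (fr->en) -> MMS TTS (English).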
import gradio as gr
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
import torch
import numpy as np
# Run on GPU when available; fp16 on GPU, fp32 on CPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device = "cpu"  # uncomment to force CPU
torch_dtype = torch.float16 if device != "cpu" else torch.float32
print("Device:", device)
# Alternative: build the Whisper ASR pipeline by hand (uncomment to use):
# model_id = "openai/whisper-large-v3"  # or "openai/whisper-medium"
# model = AutoModelForSpeechSeq2Seq.from_pretrained(
#     model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
# )
# model.to(device)
# processor = AutoProcessor.from_pretrained(model_id)
# pipe_transcription = pipeline(
#     "automatic-speech-recognition",
#     model=model,
#     tokenizer=processor.tokenizer,
#     feature_extractor=processor.feature_extractor,
#     max_new_tokens=128,
#     chunk_length_s=30,
#     batch_size=16,
#     return_timestamps=True,
#     torch_dtype=torch_dtype,
#     device=device,
# )
# Active pipelines: French transcription, fr->en translation, English TTS
pipe_transcription = pipeline("automatic-speech-recognition", model="pierreguillou/whisper-medium-french")
pipe_translate = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en", device=device)
pipe_tts = pipeline("text-to-speech", model="facebook/mms-tts-eng", device=device)  # better quality and much faster than Bark
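
# Output shapes consumed by the helpers below (as returned by these
# Transformers pipelines; the audio range [-1, 1] is the usual float
# waveform convention assumed here):
#   pipe_transcription(filepath) -> {"text": str, ...}
#   pipe_translate(text)         -> [{"translation_text": str}]
#   pipe_tts(text)               -> {"audio": float ndarray, "sampling_rate": int}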
def get_translation(text):
    # French -> English text translation
    return pipe_translate(text)[0]["translation_text"]

def get_transcript(voice):
    # Transcribe the French recording, then translate the transcript.
    # (Whisper could translate in one step via
    # generate_kwargs={"task": "translate", "language": "french"}.)
    return get_translation(pipe_transcription(voice)["text"])

def get_audio(text):
    # Scale the float waveform to 16-bit PCM, the format gr.Audio expects
    speech = pipe_tts(text)
    return speech["sampling_rate"], (speech["audio"] * 32767).astype(np.int16).T
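
# Quick sanity check outside the UI (hypothetical file path, assuming a
# short French recording):
#   english = get_transcript("bonjour.wav")
#   rate, pcm = get_audio(english)  # pcm: int16 samples for gr.Audio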
with gr.Blocks() as demo:
    with gr.Tab("Texte (rapide)"):
        input_text = gr.Textbox(
            label="Input text",
            info="Your text",
            lines=3,
            placeholder="Écrire le texte à traduire",
        )
        translation_button = gr.Button("Traduire...")
        output_text = gr.Textbox(
            label="Output text",
            info="Your text",
            lines=3,
            placeholder="Votre traduction",
        )
        speech_button = gr.Button("Générer audio...")
        translation_button.click(
            get_translation,
            inputs=[input_text],
            outputs=[output_text],
        )
        speech_button.click(
            get_audio,
            inputs=[output_text],
            outputs=[gr.Audio(label="Output")],
        )

    with gr.Tab("Voix (plus lent)"):
        voice = gr.Audio(sources=["microphone"], type="filepath")
        translation_button = gr.Button("Traduire votre enregistrement !")
        output_text = gr.Textbox(
            label="Texte traduit",
            info="Votre texte",
            lines=3,
            placeholder="Votre traduction",
        )
        speech_button = gr.Button("Générer audio !")
        translation_button.click(
            get_transcript,
            inputs=[voice],
            outputs=[output_text],
        )
        speech_button.click(
            get_audio,
            inputs=[output_text],
            outputs=[gr.Audio(label="Output")],
        )

demo.launch()