File size: 1,701 Bytes
da5250a
05fd694
13b10f1
d7dfa49
 
 
292172d
bda48ea
726d965
292172d
 
 
726d965
05fd694
 
d7dfa49
292172d
 
05fd694
292172d
d7dfa49
726d965
292172d
05fd694
292172d
 
 
 
 
05fd694
292172d
 
 
 
 
05fd694
 
 
292172d
05fd694
292172d
6cfff67
0856a96
b2604a4
05fd694
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import os
import gradio as gr
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS

# Load Whisper STT model
whisper_model = whisper.load_model("base")

# Load translation models
tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")

def translate_speech(audio):
    audio = audio[0]
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
    _, probs = whisper_model.detect_language(mel)
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(whisper_model, mel, options)
    text = result.text

    # Translate text
    tokenizer.src_lang = 'en'  # Assuming the input is always in English
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_text)
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # Text-to-speech (TTS)
    tts = gTTS(text=translated_text, lang='en')  # Assuming the target language is English
    audio_path = "translated_audio.mp3"
    tts.save(audio_path)

    return audio_path

def translate_speech_interface(audio):
    translated_audio = translate_speech(audio)
    translated_audio_bytes = open(translated_audio, "rb").read()

    return translated_audio_bytes

audio_recording = gr.inputs.Audio(source="microphone", type="numpy", label="Record your speech")
output_audio = gr.outputs.Audio(type="numpy", label="Translated Audio")

iface = gr.Interface(fn=translate_speech_interface, inputs=audio_recording, outputs=output_audio, title="Speech Translator")
iface.launch()