# Spaces: Sleeping
import os
import re
import tempfile

import gradio as gr
import torch
import torchaudio
from pydub import AudioSegment
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import MarianMTModel, MarianTokenizer
# Speech-to-text stage: Whisper checkpoint plus its matching processor.
whisper_model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-small"
)
whisper_processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small"
)
# Inference only: switch to eval mode and turn autograd off globally.
whisper_model.eval()
torch.set_grad_enabled(False)

# Text-to-text stage: MarianMT checkpoint for English → Spanish.
translation_model_name = "Helsinki-NLP/opus-mt-en-es"
translator = MarianMTModel.from_pretrained(
    translation_model_name
)
tokenizer = MarianTokenizer.from_pretrained(
    translation_model_name
)
# Split after sentence-ending punctuation that is followed by whitespace.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")
# Number of sentences handed to the translation model per generate() call.
_TRANSLATION_BATCH_SIZE = 8


def translate_text(text):
    """Translate ``text`` with the module-level MarianMT model and tokenizer.

    The text is split into sentences and translated in small batches.
    Tokenizing the whole text as one input with ``truncation=True`` would
    silently drop everything past the tokenizer's maximum length, which
    loses the tail of long transcripts. Text without sentence punctuation
    is still translated as a single (possibly truncated) segment.

    Args:
        text: Source text to translate. ``None``, empty, or whitespace-only
            input is accepted.

    Returns:
        The translated sentences joined by single spaces, or ``""`` when
        there is nothing to translate.
    """
    text = text.strip() if text else ""
    if not text:
        # Nothing to translate; skip the model call entirely.
        return ""

    sentences = [s for s in _SENTENCE_BOUNDARY.split(text) if s]
    translated = []
    for start in range(0, len(sentences), _TRANSLATION_BATCH_SIZE):
        batch = sentences[start:start + _TRANSLATION_BATCH_SIZE]
        # padding=True aligns the sentences of a batch to a common length.
        inputs = tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True
        )
        output_ids = translator.generate(**inputs)
        translated.extend(
            tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        )
    return " ".join(translated)
def transcribe_and_translate(filepath):
    """Transcribe an audio file with Whisper and translate the transcript.

    The audio is converted to mono, cut into 30-second chunks, and each
    chunk is transcribed separately. The chunk transcripts are joined and
    passed to ``translate_text``.

    Args:
        filepath: Path to the audio file, or ``None`` when no audio was
            supplied.

    Returns:
        The translated transcript; ``"No audio file provided."`` when
        ``filepath`` is ``None`` or does not exist; ``""`` when no text
        was transcribed.
    """
    if filepath is None or not os.path.exists(filepath):
        return "No audio file provided."

    target_rate = 16000
    chunk_length_ms = 30 * 1000
    audio = AudioSegment.from_file(filepath).set_channels(1)

    transcriptions = []
    # A private temporary directory keeps concurrent requests from
    # overwriting each other's chunk file, and is removed even when
    # loading or inference raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        chunk_path = os.path.join(tmp_dir, "chunk.wav")
        for start_ms in range(0, len(audio), chunk_length_ms):
            chunk = audio[start_ms:start_ms + chunk_length_ms]
            # export() returns the file object it wrote to; close it
            # explicitly before the file is read back.
            chunk.export(chunk_path, format="wav").close()
            waveform, sr = torchaudio.load(chunk_path)

            if sr != target_rate:
                resample = torchaudio.transforms.Resample(
                    orig_freq=sr, new_freq=target_rate
                )
                waveform = resample(waveform)
            # Collapse the channel dimension to get a 1-D waveform.
            waveform = waveform.mean(dim=0)

            inputs = whisper_processor(
                waveform, sampling_rate=target_rate, return_tensors="pt"
            )
            predicted_ids = whisper_model.generate(inputs["input_features"])
            transcription = whisper_processor.batch_decode(
                predicted_ids, skip_special_tokens=True
            )[0].strip()
            if transcription:
                transcriptions.append(transcription)

    full_transcript = " ".join(transcriptions)
    if not full_transcript:
        # Nothing was transcribed, so there is nothing to translate.
        return ""
    return translate_text(full_transcript)
def _make_audio_tab(source):
    """Build one Interface that feeds audio from ``source`` to the pipeline."""
    audio_input = gr.Audio(sources=source, type="filepath")
    text_output = gr.Textbox(label="Translation (English Speech → Spanish Text)")
    return gr.Interface(
        fn=transcribe_and_translate,
        inputs=audio_input,
        outputs=text_output,
    )


# One tab per audio source; both call the same transcription function.
mic_ui = _make_audio_tab("microphone")
file_ui = _make_audio_tab("upload")

tab_pages = [mic_ui, file_ui]
tab_titles = ["Microphone Input", "Upload File"]
app = gr.TabbedInterface(tab_pages, tab_titles)
app.launch()