Spaces:
Sleeping
Sleeping
File size: 2,563 Bytes
d07df4a 867e47c d07df4a 2aa273f 4d2fe8e cf5e40a d07df4a c97e116 2aa273f d07df4a 2aa273f 34f8d61 2aa273f d07df4a 2aa273f d07df4a 2aa273f d07df4a 2aa273f d07df4a 2aa273f d07df4a 2aa273f d07df4a 2aa273f d07df4a 2aa273f d07df4a 2aa273f d07df4a 2aa273f 1d88b62 2aa273f 1d88b62 dc41b9b d07df4a 2aa273f 1d88b62 2aa273f 1d88b62 d07df4a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import os
import re
import tempfile

import gradio as gr
import torch
import torchaudio
from pydub import AudioSegment
from transformers import MarianMTModel, MarianTokenizer
from transformers import WhisperProcessor, WhisperForConditionalGeneration
# --- Speech recognition: Whisper ("small" checkpoint) ---
# Processor and model are loaded from the same checkpoint name.
whisper_checkpoint = "openai/whisper-small"
whisper_processor = WhisperProcessor.from_pretrained(whisper_checkpoint)
whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_checkpoint)
whisper_model.eval()

# Inference only: turn off autograd tracking.
torch.set_grad_enabled(False)

# --- Translation: MarianMT (English → Spanish) ---
translation_model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
translator = MarianMTModel.from_pretrained(translation_model_name)
def translate_text(text):
    """Translate English text to Spanish with the module-level MarianMT model.

    The text is split on sentence-ending punctuation and translated in small
    batches. Tokenizing a whole transcript as a single sequence with
    ``truncation=True`` silently drops everything past the model's maximum
    input length, so long transcripts would lose their tail.

    Args:
        text: English text to translate. ``None``, empty, or
            whitespace-only input is accepted.

    Returns:
        The Spanish translation as a single string, or ``""`` when there is
        nothing to translate.
    """
    text = text.strip() if text else ""
    if not text:
        # Nothing to translate; do not feed an empty sequence to the model.
        return ""

    # Split after ".", "!" or "?" followed by whitespace. A single sentence
    # that is still too long is truncated by the tokenizer as before.
    sentences = [s for s in re.split(r"(?<=[.!?])\s+", text) if s]

    batch_size = 8  # keeps padded batches small for CPU inference
    translated_parts = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        generated = translator.generate(**inputs)
        translated_parts.extend(
            tokenizer.batch_decode(generated, skip_special_tokens=True)
        )

    return " ".join(part.strip() for part in translated_parts)
def transcribe_and_translate(filepath):
    """Transcribe an audio file with Whisper and translate the text to Spanish.

    The audio is down-mixed to mono, cut into 30-second chunks, and each chunk
    is transcribed separately; the joined transcript is then passed to
    ``translate_text``.

    Args:
        filepath: Path to the audio file, or ``None`` when no audio was given.

    Returns:
        The Spanish translation, or a short message when no file was provided
        or no speech could be transcribed.
    """
    if filepath is None or not os.path.exists(filepath):
        return "No audio file provided."

    audio = AudioSegment.from_file(filepath).set_channels(1)
    chunk_length_ms = 30 * 1000  # pydub slices by milliseconds: 30 s per chunk
    target_rate = 16000  # sampling rate passed to the Whisper processor

    pieces = []
    # A private temporary directory keeps concurrent requests from overwriting
    # each other's chunk files and guarantees cleanup even if a step raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for index, start_ms in enumerate(range(0, len(audio), chunk_length_ms)):
            chunk = audio[start_ms:start_ms + chunk_length_ms]
            chunk_path = os.path.join(tmp_dir, f"chunk_{index}.wav")
            # export() returns the open output file handle; close it so the
            # handle is not leaked and the file can be removed afterwards.
            chunk.export(chunk_path, format="wav").close()

            waveform, sr = torchaudio.load(chunk_path)
            if sr != target_rate:
                resampler = torchaudio.transforms.Resample(
                    orig_freq=sr, new_freq=target_rate
                )
                waveform = resampler(waveform)
            # Collapse the channel dimension to get a 1-D sample array.
            waveform = waveform.mean(dim=0)

            inputs = whisper_processor(
                waveform.numpy(), sampling_rate=target_rate, return_tensors="pt"
            )
            predicted_ids = whisper_model.generate(inputs["input_features"])
            transcription = whisper_processor.batch_decode(
                predicted_ids, skip_special_tokens=True
            )[0].strip()
            if transcription:
                pieces.append(transcription)

    full_transcript = " ".join(pieces)
    if not full_transcript:
        # Empty or silent audio: there is nothing meaningful to translate.
        return "No speech detected in the audio."

    return translate_text(full_transcript)
def _build_tab(source):
    """Return a Gradio interface wired to the pipeline for one audio source."""
    return gr.Interface(
        fn=transcribe_and_translate,
        inputs=gr.Audio(sources=source, type="filepath"),
        outputs=gr.Textbox(label="Translation (English Speech → Spanish Text)"),
    )


# Two tabs share the same pipeline; only the audio source differs.
mic_ui = _build_tab("microphone")
file_ui = _build_tab("upload")

app = gr.TabbedInterface(
    [mic_ui, file_ui],
    ["Microphone Input", "Upload File"],
)
app.launch()
|