import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import gradio as gr
import librosa
MODEL_NAME = "EwoutLagendijk/whisper-small-indonesian"

# Use the first GPU if available (0 is the pipeline device convention), else the CPU
device = 0 if torch.cuda.is_available() else "cpu"

# Load the model and processor, moving the model to the chosen device
model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME).to(device)
processor = AutoProcessor.from_pretrained(MODEL_NAME)

# Force Indonesian transcription: set the decoder prompt ids once on the generation config
model.generation_config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="id", task="transcribe")
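# For reference: get_decoder_prompt_ids returns (position, token_id) pairs for the
# decoder prompt, roughly [(1, <|id|>), (2, <|transcribe|>), (3, <|notimestamps|>)]
# (tokens shown symbolically here; the actual integer ids are model-specific).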
# Translation pipeline for Indonesian -> English (Helsinki-NLP/opus-mt-id-en)
translation_pipeline = pipeline("translation", model="Helsinki-NLP/opus-mt-id-en", device=device)
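# Optional sanity check for the translation pipeline; the sentence and output shown
# are illustrative, and the exact wording depends on the model version:
#   translation_pipeline("Selamat pagi")[0]["translation_text"]  # e.g. "Good morning"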
def transcribe_speech(filepath):
    # Load the audio, resampled to the 16 kHz rate Whisper expects
    audio, sampling_rate = librosa.load(filepath, sr=16000)

    # Process the audio in fixed-size chunks of 5 seconds
    chunk_duration = 5  # in seconds
    chunk_samples = chunk_duration * sampling_rate

    transcription = []
    for i in range(0, len(audio), chunk_samples):
        chunk = audio[i:i + chunk_samples]

        # Convert the chunk into input features on the same device as the model
        inputs = processor(audio=chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)

        # Generate the transcription for the chunk; the decoder prompt ids set on the
        # generation config above already force Indonesian transcription
        generated_ids = model.generate(
            inputs,
            max_new_tokens=444,  # stays under Whisper's 448-token decoder limit
        )

        # Decode the generated token ids back into text
        chunk_transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # Translate the Indonesian transcription into English
        chunk_translation = translation_pipeline(chunk_transcription)[0]["translation_text"]

        # Append both transcription and translation for this chunk
        transcription.append(f"Chunk {i // chunk_samples + 1}:")
        transcription.append(f"Transcription: {chunk_transcription}")
        transcription.append(f"Translation: {chunk_translation}\n")

    # Combine all chunk transcriptions and translations into a single string
    return "\n".join(transcription)
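# Standalone usage sketch (the file path here is illustrative, not part of the app):
#   print(transcribe_speech("sample.wav"))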
demo = gr.Blocks()
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Textbox(lines=10, label="Microphone output"),
)

file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.Textbox(lines=10, label="File output"),
)
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe and Translate: Microphone", "Transcribe and Translate: Audio File"],
    )

# debug=True keeps the process attached and prints errors to the console
demo.launch(debug=True)