# Hugging Face Spaces status when this file was captured: Runtime error
import streamlit as st
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Whisper expects 16 kHz audio and its feature extractor pads/truncates every
# input to a 30-second window, so longer segments would be silently cut off.
TARGET_SAMPLE_RATE = 16000
SEGMENT_SECONDS = 30
MODEL_NAME = "openai/whisper-tiny.en"


@st.cache_resource
def load_whisper():
    """Load the Whisper processor and model once and reuse them across reruns.

    Streamlit re-executes the whole script on every interaction; caching
    avoids reloading the weights each time.

    Returns:
        A ``(processor, model)`` tuple for ``openai/whisper-tiny.en``.
    """
    whisper_processor = WhisperProcessor.from_pretrained(MODEL_NAME)
    whisper_model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
    return whisper_processor, whisper_model


def split_into_segments(waveform, samples_per_segment):
    """Yield consecutive slices of a 1-D waveform.

    The final slice may be shorter than ``samples_per_segment``; it is still
    yielded so that the tail of the recording (or a recording shorter than one
    segment) is not dropped.
    """
    for start in range(0, len(waveform), samples_per_segment):
        yield waveform[start:start + samples_per_segment]


def transcribe_segment(segment):
    """Transcribe one mono 16 kHz segment (at most 30 s) and return its text."""
    input_features = processor(
        segment.numpy(), sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt"
    ).input_features
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]


# Load the Whisper model and processor
processor, model = load_whisper()

# Title
st.title('Audio to Text Transcription')

# Sidebar for file upload
st.sidebar.title("Upload your audio file")
uploaded_file = st.sidebar.file_uploader(
    "Choose an audio file", type=["mp3", "wav", "mp4", "m4a"]
)

if uploaded_file:
    st.sidebar.audio(uploaded_file)

    # The audio widget above may have read from the upload; rewind defensively
    # so decoding starts at the beginning of the file.
    uploaded_file.seek(0)

    # Decode and resample to the rate Whisper expects
    audio_tensor, sampling_rate = torchaudio.load(uploaded_file)
    resampler = torchaudio.transforms.Resample(sampling_rate, TARGET_SAMPLE_RATE)
    resampled_waveform = resampler(audio_tensor)

    # Only the first channel is transcribed
    mono_waveform = resampled_waveform[0]

    # Transcribe each segment, including the trailing partial one
    samples_per_segment = SEGMENT_SECONDS * TARGET_SAMPLE_RATE
    segment_transcriptions = [
        transcribe_segment(segment)
        for segment in split_into_segments(mono_waveform, samples_per_segment)
    ]

    # Combine segment transcriptions into the full transcript
    full_transcript = " ".join(segment_transcriptions)

    # Display the transcript
    st.header("Transcription")
    st.write(full_transcript)