import streamlit as st
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration


# Load the Whisper model and processor once and cache them across Streamlit reruns
@st.cache_resource
def load_whisper():
    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
    return processor, model


processor, model = load_whisper()

# Title
st.title("Audio to Text Transcription")

# Sidebar for file upload
st.sidebar.title("Upload your audio file")
uploaded_file = st.sidebar.file_uploader(
    "Choose an audio file", type=["mp3", "wav", "mp4", "m4a"]
)

if uploaded_file:
    st.sidebar.audio(uploaded_file)

    # Load the uploaded file and resample it to 16 kHz, the rate Whisper expects
    audio_tensor, sampling_rate = torchaudio.load(uploaded_file)
    resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
    resampled_waveform = resampler(audio_tensor)

    # Whisper's feature extractor works on 30-second windows (longer input is
    # truncated), so split the waveform into 30-second segments and keep the
    # final, possibly shorter, segment as well
    segment_duration = 30  # segment duration in seconds
    samples_per_segment = segment_duration * 16000
    total_samples = resampled_waveform.shape[1]
    num_segments = (total_samples + samples_per_segment - 1) // samples_per_segment

    segment_transcriptions = []

    # Transcribe each segment
    for i in range(num_segments):
        start = i * samples_per_segment
        end = min(total_samples, (i + 1) * samples_per_segment)
        segment = resampled_waveform[0][start:end]  # use the first channel

        # Convert the segment to log-mel input features and generate token IDs
        input_features = processor(
            segment.numpy(), sampling_rate=16000, return_tensors="pt"
        ).input_features
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        segment_transcriptions.append(transcription[0])

    # Combine segment transcriptions into the full transcript
    full_transcript = " ".join(segment_transcriptions)

    # Display the transcript
    st.header("Transcription")
    st.write(full_transcript)