import math
import os
import time
from tempfile import NamedTemporaryFile

import openai
import streamlit as st
from docx import Document
from dotenv import load_dotenv
from pydub import AudioSegment
from pydub.silence import split_on_silence

# Load environment variables from .env file
load_dotenv()

# Set your OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")

# Display name -> language code passed to the transcription API as `language`.
# NOTE(review): the original comment states this list mirrors the languages
# supported by the official Whisper model; that cannot be confirmed from this
# file (some codes are three letters, "ny" appears twice) -- verify against
# the API documentation before relying on every entry.
languages = {
    "Afrikaans": "af",
    "Albanian": "sq",
    "Amharic": "am",
    "Arabic": "ar",
    "Armenian": "hy",
    "Assamese": "as",
    "Azerbaijani": "az",
    "Basque": "eu",
    "Belarusian": "be",
    "Bengali": "bn",
    "Bosnian": "bs",
    "Bulgarian": "bg",
    "Burmese": "my",
    "Catalan": "ca",
    "Cebuano": "ceb",
    "Chichewa": "ny",
    "Chinese": "zh",
    "Corsican": "co",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Esperanto": "eo",
    "Estonian": "et",
    "Filipino": "tl",
    "Finnish": "fi",
    "French": "fr",
    "Frisian": "fy",
    "Galician": "gl",
    "Georgian": "ka",
    "German": "de",
    "Greek": "el",
    "Gujarati": "gu",
    "Haitian Creole": "ht",
    "Hausa": "ha",
    "Hawaiian": "haw",
    "Hebrew": "he",
    "Hindi": "hi",
    "Hmong": "hmn",
    "Hungarian": "hu",
    "Icelandic": "is",
    "Igbo": "ig",
    "Indonesian": "id",
    "Irish": "ga",
    "Italian": "it",
    "Japanese": "ja",
    "Javanese": "jw",
    "Kannada": "kn",
    "Kazakh": "kk",
    "Khmer": "km",
    "Kinyarwanda": "rw",
    "Korean": "ko",
    "Kurdish": "ku",
    "Kyrgyz": "ky",
    "Lao": "lo",
    "Latin": "la",
    "Latvian": "lv",
    "Lithuanian": "lt",
    "Luxembourgish": "lb",
    "Macedonian": "mk",
    "Malagasy": "mg",
    "Malay": "ms",
    "Malayalam": "ml",
    "Maltese": "mt",
    "Maori": "mi",
    "Marathi": "mr",
    "Mongolian": "mn",
    "Nepali": "ne",
    "Norwegian": "no",
    "Nyanja": "ny",
    "Odia": "or",
    "Pashto": "ps",
    "Persian": "fa",
    "Polish": "pl",
    "Portuguese": "pt",
    "Punjabi": "pa",
    "Romanian": "ro",
    "Russian": "ru",
    "Samoan": "sm",
    "Scots Gaelic": "gd",
    "Serbian": "sr",
    "Sesotho": "st",
    "Shona": "sn",
    "Sindhi": "sd",
    "Sinhala": "si",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Somali": "so",
    "Spanish": "es",
    "Sundanese": "su",
    "Swahili": "sw",
    "Swedish": "sv",
    "Tajik": "tg",
    "Tamil": "ta",
    "Tatar": "tt",
    "Telugu": "te",
    "Thai": "th",
    "Turkish": "tr",
    "Turkmen": "tk",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uyghur": "ug",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Welsh": "cy",
    "Xhosa": "xh",
    "Yiddish": "yi",
    "Yoruba": "yo",
    "Zulu": "zu",
}

# Create a selectbox for language selection; default is English.
# The sorted name list is built once and reused for both the options and the
# default index.
_language_names = sorted(languages)
selected_lang_name = st.selectbox(
    "Select transcription language",
    _language_names,
    index=_language_names.index("English"),
)
selected_language = languages[selected_lang_name]


def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=-40, keep_silence=250):
    """
    Split an audio file into chunks using silence detection.

    Args:
        audio_file_path (str): Path to the audio (or video) file to load.
        min_silence_len (int): Forwarded to pydub's ``split_on_silence``.
        silence_thresh (int): Forwarded to pydub's ``split_on_silence``.
        keep_silence (int): Forwarded to pydub's ``split_on_silence``.

    Returns:
        list: The AudioSegment chunks returned by ``split_on_silence``.
    """
    audio = AudioSegment.from_file(audio_file_path)
    return split_on_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence,
    )


def transcribe(audio_file, language=None):
    """
    Transcribe an audio file using the OpenAI Whisper model.

    This uses the OpenAI API with the forced language set to the selected
    language.

    Args:
        audio_file (str): Path to the audio file.
        language (str | None): Language code to force. When omitted, the
            language currently chosen in the selectbox is used.

    Returns:
        str: Transcribed text.
    """
    if language is None:
        language = selected_language  # Use the selected language code
    with open(audio_file, "rb") as audio:
        return openai.audio.transcriptions.create(
            model="whisper-1",
            file=audio,
            response_format="text",
            language=language,
        )


def process_audio_chunks(audio_chunks):
    """
    Process and transcribe each audio chunk.

    Each chunk is exported to a temporary WAV file, transcribed, and the
    temporary file is removed again even if the export or the API call fails.

    Args:
        audio_chunks (list): List of AudioSegment chunks.

    Returns:
        str: Combined transcription from all chunks.
    """
    transcriptions = []
    min_length_ms = 100  # Minimum length required by OpenAI API (0.1 seconds)

    for i, chunk in enumerate(audio_chunks):
        if len(chunk) < min_length_ms:
            st.warning(f"Chunk {i} is too short to be processed.")
            continue

        # Reserve a unique path and close the handle before pydub writes to
        # it; an open handle blocks a second writer on some platforms.
        with NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
            temp_audio_file_path = temp_audio_file.name

        try:
            # export() returns the file object it opened; close it explicitly
            # so the file can be reopened and later deleted.
            chunk.export(temp_audio_file_path, format="wav").close()
            transcription = transcribe(temp_audio_file_path)
        finally:
            # delete=False means cleanup is our job, including on failure.
            os.remove(temp_audio_file_path)

        if transcription:
            transcriptions.append(transcription)
            st.write(f"Transcription for chunk {i}: {transcription}")

    return " ".join(transcriptions)


def save_transcription_to_docx(transcription, audio_file_path):
    """
    Save the transcription as a .docx file.

    The document is written to the current working directory and named after
    the audio file's base name.

    Args:
        transcription (str): Text to store as a single paragraph.
        audio_file_path (str): Path or file name of the source audio.

    Returns:
        str: Name of the .docx file that was written.
    """
    base_name = os.path.splitext(os.path.basename(audio_file_path))[0]
    output_file_name = f"{base_name}_full_transcription.docx"
    doc = Document()
    doc.add_paragraph(transcription)
    doc.save(output_file_name)
    return output_file_name


st.header("Audio Transcription with OpenAI's Whisper")

# Allow uploading of audio or video files
uploaded_file = st.file_uploader(
    "Upload an audio or video file",
    type=["wav", "mp3", "ogg", "m4a", "mp4", "mov"],
)

if 'transcription' not in st.session_state:
    st.session_state.transcription = None

# Identify the current upload so that replacing the file triggers a fresh
# transcription instead of re-showing the previous file's result.
upload_key = None
if uploaded_file is not None:
    upload_key = (uploaded_file.name, uploaded_file.size)

needs_transcription = uploaded_file is not None and (
    st.session_state.transcription is None
    or st.session_state.get("transcribed_upload_key") != upload_key
)

if needs_transcription:
    st.audio(uploaded_file)

    # Save uploaded file temporarily under a unique name so concurrent
    # sessions cannot overwrite each other's upload. The extension is kept
    # because pydub may use it when detecting the format.
    file_extension = os.path.splitext(uploaded_file.name)[1]
    with NamedTemporaryFile(delete=False, suffix=file_extension) as temp_upload:
        temp_upload.write(uploaded_file.getbuffer())
        temp_audio_file = temp_upload.name

    processing_start = time.time()
    try:
        with st.spinner('Transcribing...'):
            audio_chunks = split_audio_on_silence(
                temp_audio_file,
                min_silence_len=500,
                silence_thresh=-40,
                keep_silence=250,
            )
            transcription = process_audio_chunks(audio_chunks)
    finally:
        # Remove the temporary upload even when splitting/transcribing fails.
        if os.path.exists(temp_audio_file):
            os.remove(temp_audio_file)

    if transcription:
        st.session_state.transcription = transcription
        st.session_state.transcribed_upload_key = upload_key
        # Drop the text area's stored widget state so it shows the new text.
        st.session_state.pop("transcription_area_final", None)
        st.success('Transcription complete!')
        output_docx_file = save_transcription_to_docx(transcription, uploaded_file.name)
        st.session_state.output_docx_file = output_docx_file
    else:
        # Do not keep showing a previous file's transcription for this upload.
        st.session_state.transcription = None
        st.warning("No speech could be transcribed from this file.")

    processing_duration = time.time() - processing_start
    st.info(f"Total processing time: {processing_duration:.2f} seconds.")

if st.session_state.transcription:
    st.text_area("Transcription", st.session_state.transcription, key="transcription_area_final")
    with open(st.session_state.output_docx_file, "rb") as docx_file:
        st.download_button(
            label="Download Transcription (.docx)",
            data=docx_file,
            file_name=st.session_state.output_docx_file,
            mime='application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        )