# AudioBot / app.py
import os
import time
from tempfile import NamedTemporaryFile

import streamlit as st
import openai
from dotenv import load_dotenv
from docx import Document
from pydub import AudioSegment
from pydub.silence import split_on_silence

# Load environment variables from .env file
load_dotenv()
# Set your OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")
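
# A minimal .env file for local development might look like the following
# (the value shown is a placeholder, not a real credential):
#
#     OPENAI_API_KEY=sk-your-key-here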

# Languages offered for transcription (ISO 639-1 codes, passed to the Whisper
# API's `language` parameter). Note: whisper-1 recognizes roughly 100
# languages; a code it does not support will be rejected by the API.
languages = {
    "Afrikaans": "af",
    "Albanian": "sq",
    "Amharic": "am",
    "Arabic": "ar",
    "Armenian": "hy",
    "Assamese": "as",
    "Azerbaijani": "az",
    "Basque": "eu",
    "Belarusian": "be",
    "Bengali": "bn",
    "Bosnian": "bs",
    "Bulgarian": "bg",
    "Burmese": "my",
    "Catalan": "ca",
    "Cebuano": "ceb",
    "Chichewa": "ny",
    "Chinese": "zh",
    "Corsican": "co",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Esperanto": "eo",
    "Estonian": "et",
    "Filipino": "tl",
    "Finnish": "fi",
    "French": "fr",
    "Frisian": "fy",
    "Galician": "gl",
    "Georgian": "ka",
    "German": "de",
    "Greek": "el",
    "Gujarati": "gu",
    "Haitian Creole": "ht",
    "Hausa": "ha",
    "Hawaiian": "haw",
    "Hebrew": "he",
    "Hindi": "hi",
    "Hmong": "hmn",
    "Hungarian": "hu",
    "Icelandic": "is",
    "Igbo": "ig",
    "Indonesian": "id",
    "Irish": "ga",
    "Italian": "it",
    "Japanese": "ja",
    "Javanese": "jw",
    "Kannada": "kn",
    "Kazakh": "kk",
    "Khmer": "km",
    "Kinyarwanda": "rw",
    "Korean": "ko",
    "Kurdish": "ku",
    "Kyrgyz": "ky",
    "Lao": "lo",
    "Latin": "la",
    "Latvian": "lv",
    "Lithuanian": "lt",
    "Luxembourgish": "lb",
    "Macedonian": "mk",
    "Malagasy": "mg",
    "Malay": "ms",
    "Malayalam": "ml",
    "Maltese": "mt",
    "Maori": "mi",
    "Marathi": "mr",
    "Mongolian": "mn",
    "Nepali": "ne",
    "Norwegian": "no",
    "Nyanja": "ny",
    "Odia": "or",
    "Pashto": "ps",
    "Persian": "fa",
    "Polish": "pl",
    "Portuguese": "pt",
    "Punjabi": "pa",
    "Romanian": "ro",
    "Russian": "ru",
    "Samoan": "sm",
    "Scots Gaelic": "gd",
    "Serbian": "sr",
    "Sesotho": "st",
    "Shona": "sn",
    "Sindhi": "sd",
    "Sinhala": "si",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Somali": "so",
    "Spanish": "es",
    "Sundanese": "su",
    "Swahili": "sw",
    "Swedish": "sv",
    "Tajik": "tg",
    "Tamil": "ta",
    "Tatar": "tt",
    "Telugu": "te",
    "Thai": "th",
    "Turkish": "tr",
    "Turkmen": "tk",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uyghur": "ug",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Welsh": "cy",
    "Xhosa": "xh",
    "Yiddish": "yi",
    "Yoruba": "yo",
    "Zulu": "zu"
}

# Language selector; defaults to English.
language_names = sorted(languages.keys())
selected_lang_name = st.selectbox(
    "Select transcription language",
    language_names,
    index=language_names.index("English")
)
selected_language = languages[selected_lang_name]


def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=-40, keep_silence=250):
    """
    Split an audio file into chunks using silence detection.

    Args:
        audio_file_path (str): Path to the audio file.
        min_silence_len (int): Minimum silence length (ms) treated as a split point.
        silence_thresh (int): Silence threshold in dBFS.
        keep_silence (int): Silence (ms) retained at each chunk boundary.

    Returns:
        list: List of pydub AudioSegment chunks.
    """
    audio = AudioSegment.from_file(audio_file_path)
    chunks = split_on_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence
    )
    return chunks
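
# Tuning note: silence_thresh is in dBFS, so a lower (more negative) value
# treats more audio as speech and yields fewer, longer chunks, while a higher
# value splits more aggressively. A quick way to inspect the result, assuming
# a hypothetical local file "sample.wav":
#
#     chunks = split_audio_on_silence("sample.wav", silence_thresh=-35)
#     print([len(c) for c in chunks])  # chunk lengths in milliseconds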


def transcribe(audio_file):
    """
    Transcribe an audio file using OpenAI's Whisper model.
    This calls the OpenAI API with the language forced to the selected language.

    Args:
        audio_file (str): Path to the audio file.

    Returns:
        str: Transcribed text.
    """
    with open(audio_file, "rb") as audio:
        response = openai.audio.transcriptions.create(
            model="whisper-1",
            file=audio,
            response_format="text",
            language=selected_language  # Use the selected language code
        )
    return response
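
# Example usage (hypothetical), assuming OPENAI_API_KEY is set and
# "sample.wav" exists. With response_format="text", the SDK returns the
# transcript as a plain string rather than a JSON object:
#
#     text = transcribe("sample.wav")
#     print(text)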


def process_audio_chunks(audio_chunks):
    """
    Process and transcribe each audio chunk.

    Args:
        audio_chunks (list): List of AudioSegment chunks.

    Returns:
        str: Combined transcription from all chunks.
    """
    transcriptions = []
    min_length_ms = 100  # Minimum length required by the OpenAI API (0.1 seconds)
    for i, chunk in enumerate(audio_chunks):
        if len(chunk) < min_length_ms:
            st.warning(f"Chunk {i} is too short to be processed.")
            continue
        # Export the chunk to a temporary .wav file for the API call.
        with NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
            chunk.export(temp_audio_file.name, format="wav")
            temp_audio_file_path = temp_audio_file.name
        transcription = transcribe(temp_audio_file_path)
        if transcription:
            transcriptions.append(transcription)
            st.write(f"Transcription for chunk {i}: {transcription}")
        os.remove(temp_audio_file_path)
    return " ".join(transcriptions)


def save_transcription_to_docx(transcription, audio_file_path):
    """
    Save the transcription as a .docx file named after the source audio file.

    Args:
        transcription (str): The transcribed text.
        audio_file_path (str): Original audio path, used to name the output.

    Returns:
        str: Path of the saved .docx file.
    """
    base_name = os.path.splitext(os.path.basename(audio_file_path))[0]
    output_file_name = f"{base_name}_full_transcription.docx"
    doc = Document()
    doc.add_paragraph(transcription)
    doc.save(output_file_name)
    return output_file_name
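
# For example, save_transcription_to_docx(text, "meeting.mp3") would write
# "meeting_full_transcription.docx" to the current working directory.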


st.title("Audio Transcription with OpenAI's Whisper")

# Allow uploading of audio or video files
uploaded_file = st.file_uploader(
    "Upload an audio or video file",
    type=["wav", "mp3", "ogg", "m4a", "mp4", "mov"]
)

if 'transcription' not in st.session_state:
    st.session_state.transcription = None

if uploaded_file is not None and st.session_state.transcription is None:
    st.audio(uploaded_file)

    # Save the upload to a temporary local path so pydub/ffmpeg can read it.
    file_extension = uploaded_file.name.split(".")[-1]
    temp_audio_file = f"temp_audio_file.{file_extension}"
    with open(temp_audio_file, "wb") as f:
        f.write(uploaded_file.getbuffer())

    processing_start = time.time()
    with st.spinner('Transcribing...'):
        audio_chunks = split_audio_on_silence(
            temp_audio_file,
            min_silence_len=500,
            silence_thresh=-40,
            keep_silence=250
        )
        transcription = process_audio_chunks(audio_chunks)
        if transcription:
            st.session_state.transcription = transcription
            st.success('Transcription complete!')
            output_docx_file = save_transcription_to_docx(transcription, uploaded_file.name)
            st.session_state.output_docx_file = output_docx_file
    processing_duration = time.time() - processing_start
    st.info(f"Total processing time: {processing_duration:.2f} seconds.")

    # Clean up the temporary input file.
    if os.path.exists(temp_audio_file):
        os.remove(temp_audio_file)

if st.session_state.transcription:
    st.text_area("Transcription", st.session_state.transcription, key="transcription_area_final")
    with open(st.session_state.output_docx_file, "rb") as docx_file:
        st.download_button(
            label="Download Transcription (.docx)",
            data=docx_file,
            file_name=st.session_state.output_docx_file,
            mime='application/vnd.openxmlformats-officedocument.wordprocessingml.document'
        )