# Hugging Face Space: propilot/ai-speech-recognition (status when captured: Sleeping)
# File size: 4,045 bytes
# Commits listed in the page's per-line blame gutter: eb3cdc8, ff5f53a, 6f58142

import streamlit as st
import os
import tempfile
import whisper
import speech_recognition as sr
from pydub import AudioSegment
from audio_recorder_streamlit import audio_recorder

# Function to convert mp3 file to wav
def convert_mp3_to_wav(mp3_path):
    """Convert an MP3 file to WAV and return the path of the new file.

    The WAV file is written next to the source file, using the same base
    name with a ``.wav`` extension.

    Args:
        mp3_path: Path of the MP3 file to convert.

    Returns:
        Path of the exported WAV file.
    """
    audio = AudioSegment.from_mp3(mp3_path)
    # Swap only the trailing extension. A plain str.replace('.mp3', '.wav')
    # would also rewrite '.mp3' inside directory names, and would leave an
    # upper-case '.MP3' untouched so the export overwrote the source file.
    base_path, _ = os.path.splitext(mp3_path)
    wav_path = base_path + '.wav'
    audio.export(wav_path, format="wav")
    return wav_path

# Function to transcribe audio using OpenAI Whisper
def transcribe_whisper(model_name, file_path):
    """Transcribe an audio file with an OpenAI Whisper model.

    Args:
        model_name: Model name handed to ``whisper.load_model``.
        file_path: Path of the audio file to transcribe.

    Returns:
        The value stored under the ``"text"`` key of the transcription result.
    """
    # The model is loaded anew on every call.
    transcription = whisper.load_model(model_name).transcribe(file_path)
    return transcription["text"]

# Function to transcribe audio using Google Speech API
def transcribe_speech_recognition(file_path):
    """Transcribe a Spanish-language audio file with the Google Speech API.

    Args:
        file_path: Path of an audio file that ``sr.AudioFile`` can open.

    Returns:
        The recognized text, or a Spanish user-facing message when no speech
        could be recognized or the recognition service could not be reached.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(file_path) as source:
        # Read the whole file. No ambient-noise calibration is done here:
        # on a file source it would consume the opening part of the stream,
        # so that audio would be missing from the transcription, and
        # record() reads to the end of the file regardless of the threshold.
        audio = recognizer.record(source)

    try:
        return recognizer.recognize_google(audio, language='es')
    except sr.UnknownValueError:
        return "No se pudo reconocer ningún texto en el audio."
    except sr.RequestError as error:
        # The API was unreachable or rejected the request; report it to the
        # user the same way as an unrecognizable recording instead of
        # letting the exception escape.
        return f"No se pudo contactar con el servicio de reconocimiento de voz: {error}"

# Function to perform transcription based on selected method
def perform_transcription(transcription_method, model_name, audio_path):
    """Transcribe ``audio_path`` with the backend named by ``transcription_method``.

    Args:
        transcription_method: ``'OpenAI Whisper'`` selects Whisper; any other
            value selects the Google Speech API.
        model_name: Whisper model name; only used by the Whisper backend.
        audio_path: Path of the audio file to transcribe.

    Returns:
        Whatever the selected backend function returns.
    """
    use_whisper = transcription_method == 'OpenAI Whisper'
    if not use_whisper:
        return transcribe_speech_recognition(audio_path)
    return transcribe_whisper(model_name, audio_path)

# Function to handle uploaded file transcription
def handle_uploaded_file(uploaded_file, transcription_method, model_name):
    """Show an uploaded file's details, transcribe it and display the text.

    The upload is written to a private temporary directory that is removed
    once transcription finishes, so concurrent uploads with the same name
    cannot overwrite each other and no files are left behind.

    Args:
        uploaded_file: Object returned by ``st.file_uploader``; its ``name``,
            ``type``, ``size`` and ``getbuffer()`` members are used.
        transcription_method: Backend name forwarded to
            ``perform_transcription``.
        model_name: Whisper model name forwarded to ``perform_transcription``.
    """
    file_details = {"FileName": uploaded_file.name, "FileType": uploaded_file.type, "FileSize": uploaded_file.size}
    st.write(file_details)

    # Keep only the (lower-cased) extension of the client-supplied name: the
    # name itself is never used as a path component, and 'SONG.MP3' is
    # treated the same as 'song.mp3'.
    extension = os.path.splitext(uploaded_file.name)[1].lower()

    with tempfile.TemporaryDirectory() as work_dir:
        file_path = os.path.join(work_dir, f"upload{extension}")
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        with st.spinner('Transcribiendo...'):
            if extension == '.mp3' and transcription_method != 'OpenAI Whisper':
                # Convert mp3 to wav when the Google Speech API is selected;
                # Whisper is given the mp3 file directly.
                file_path = convert_mp3_to_wav(file_path)

            # Must run inside the TemporaryDirectory block: the audio files
            # are deleted when the block exits.
            transcript = perform_transcription(transcription_method, model_name, file_path)

    st.text_area('Resultado de la Transcripción:', transcript, height=200)

def main():
    """Render the Streamlit page and run the selected transcription flow.

    The user picks an input mode (upload a file or record audio) and a
    transcription backend. With Whisper selected, a model size is also
    chosen. Uploaded files are delegated to ``handle_uploaded_file``;
    recorded audio is written to a temporary WAV file, transcribed, and the
    temporary file is removed afterwards.
    """
    st.title('Transcriptor de Audio')

    # NOTE(review): the first selectbox is labelled as a model choice but its
    # options select the input mode (upload vs. record) — consider relabelling.
    option = st.selectbox('Escoger Modelo de Transcripción', ('Subir un archivo', 'Grabar audio en tiempo real'))
    transcription_method = st.selectbox('Escoge el método de transcripción', ('OpenAI Whisper', 'Google Speech API'))

    # Stays None for the Google backend, which does not take a model name.
    model_name = None

    if transcription_method == 'OpenAI Whisper':
        model_name = st.selectbox('Escoge el modelo de Whisper', ('base', 'small', 'medium', 'large', 'tiny'))

    if option == 'Subir un archivo':
        uploaded_file = st.file_uploader("Sube tu archivo de audio para transcribir", type=['wav', 'mp3'])

        if uploaded_file is not None:
            handle_uploaded_file(uploaded_file, transcription_method, model_name)

    elif option == 'Grabar audio en tiempo real':
        audio_bytes = audio_recorder(pause_threshold=5, sample_rate=16_000)

        if audio_bytes:
            st.write("Grabación finalizada. Transcribiendo...")
            with st.spinner('Transcribiendo...'):
                # delete=False so the file can be closed and then reopened by
                # path by the transcription backend; it is removed explicitly
                # in the finally clause below.
                temp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
                temp_path = temp_audio.name
                try:
                    with temp_audio:
                        temp_audio.write(audio_bytes)

                    # Perform transcription
                    transcript = perform_transcription(transcription_method, model_name, temp_path)
                finally:
                    # Without this, every recording would leave a file behind
                    # in the system temp directory.
                    os.remove(temp_path)

                st.text_area('Resultado de la Transcripción:', transcript, height=200)


# Build the page only when this file is run as the main module, not when it
# is imported.
if __name__ == "__main__":
    main()