import os

import librosa
import pandas as pd
import streamlit as st

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.ml import Pipeline
from pyspark.sql.types import StructType, StructField, ArrayType, FloatType, LongType

# Page configuration
st.set_page_config(
    layout="wide",
    initial_sidebar_state="auto"
)

# Custom CSS for styling
st.markdown("""
""", unsafe_allow_html=True)

@st.cache_resource
def init_spark():
    """Initialize Spark NLP (cached so the session starts only once)."""
    return sparknlp.start()

@st.cache_resource
def create_pipeline(model):
    """Create a Spark NLP pipeline for audio processing."""
    audio_assembler = AudioAssembler() \
        .setInputCol("audio_content") \
        .setOutputCol("audio_assembler")

    speech_to_text = Wav2Vec2ForCTC \
        .pretrained(model) \
        .setInputCols("audio_assembler") \
        .setOutputCol("text")

    pipeline = Pipeline(stages=[
        audio_assembler,
        speech_to_text
    ])
    return pipeline

def fit_data(pipeline, fed_data):
    """Fit the data into the pipeline and return the transcription."""
    # Wav2Vec2 models expect 16 kHz audio, so resample on load.
    data, sampling_rate = librosa.load(fed_data, sr=16000)
    data = [float(x) for x in data]

    schema = StructType([
        StructField("audio_content", ArrayType(FloatType())),
        StructField("sampling_rate", LongType())
    ])
    df = pd.DataFrame({
        "audio_content": [data],
        "sampling_rate": [sampling_rate]
    })
    spark_df = spark.createDataFrame(df, schema)

    pipeline_df = pipeline.fit(spark_df).transform(spark_df)
    return pipeline_df.select("text.result")

def save_uploadedfile(uploadedfile, path):
    """Save the uploaded file to the specified path."""
    filepath = os.path.join(path, uploadedfile.name)
    with open(filepath, "wb") as f:
        if hasattr(uploadedfile, 'getbuffer'):
            f.write(uploadedfile.getbuffer())
        else:
            f.write(uploadedfile.read())

# Sidebar content
model_list = [
    "asr_wav2vec2_large_xlsr_53_english_by_jonatasgrosman",
    "asr_wav2vec2_base_100h_13K_steps",
    "asr_wav2vec2_base_100h_ngram",
    "asr_wav2vec2_base_100h_by_facebook",
    "asr_wav2vec2_base_100h_test",
    "asr_wav2vec2_base_960h"
]
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    model_list,
    help="For more info about the models visit: https://sparknlp.org/models"
)
# Main content
st.markdown(
    '<div class="main-title">Speech Recognition With Wav2Vec2ForCTC</div>',
    unsafe_allow_html=True
)
st.markdown(
    '<div class="section"><p>This demo transcribes audio files into text using the '
    'Wav2Vec2ForCTC annotator and advanced speech recognition models.</p></div>',
    unsafe_allow_html=True
)

# Reference notebook link in sidebar
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown("""
Open In Colab
""", unsafe_allow_html=True)

# Load example audio files
AUDIO_FILE_PATH = "inputs"
audio_files = sorted(os.listdir(AUDIO_FILE_PATH))
selected_audio = st.selectbox("Select an audio", audio_files)

# Audio file types accepted by the uploader
audio_file_types = ["mp3", "flac", "wav", "aac", "ogg", "aiff", "wma", "m4a",
                    "ape", "dsf", "dff", "midi", "mid", "opus", "amr"]
uploadedfile = st.file_uploader("Try it for yourself!", type=audio_file_types)

if uploadedfile:
    selected_audio = f"{AUDIO_FILE_PATH}/{uploadedfile.name}"
    save_uploadedfile(uploadedfile, AUDIO_FILE_PATH)
elif selected_audio:
    selected_audio = f"{AUDIO_FILE_PATH}/{selected_audio}"

# Audio playback and transcription
st.subheader("Play Audio")

with open(selected_audio, 'rb') as audio_file:
    audio_bytes = audio_file.read()
st.audio(audio_bytes)

st.subheader(f"Transcription for {selected_audio}:")

spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, selected_audio)

# "text.result" is an array of strings; take the first element as the transcript.
st.text(output.first().result[0].strip())
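# A minimal sketch of how to run this demo locally, assuming the script is saved
# as app.py and the "inputs" directory holds at least one sample audio file
# (package names below are the standard PyPI distributions; versions not pinned):
#
#   pip install streamlit spark-nlp pyspark librosa pandas
#   streamlit run app.py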