abdullahmubeen10 commited on
Commit
6b52778
·
verified ·
1 Parent(s): 63c4ad8

Upload 15 files

Browse files
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
# Streamlit UI theme: light base with a custom blue primary accent color.
[theme]
base="light"
primaryColor="#29B4E8"
Demo.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import librosa
import pandas as pd
import sparknlp
import streamlit as st
from pyspark.ml import Pipeline
from pyspark.sql.types import ArrayType, FloatType, LongType, StructField, StructType
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
10
+
11
# Page configuration: wide layout gives the transcription output more room.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="auto"
)

# Custom CSS for styling the title banner and the grey content sections
# rendered via st.markdown(..., unsafe_allow_html=True) below.
st.markdown("""
<style>
.main-title {
    font-size: 36px;
    color: #4A90E2;
    font-weight: bold;
    text-align: center;
}
.section {
    background-color: #f9f9f9;
    padding: 10px;
    border-radius: 10px;
    margin-top: 10px;
}
.section p, .section ul {
    color: #666666;
}
</style>
""", unsafe_allow_html=True)
37
+
38
@st.cache_resource
def init_spark():
    """Start (or reuse) the Spark NLP session; cached across Streamlit reruns."""
    session = sparknlp.start()
    return session
42
+
43
@st.cache_resource
def create_pipeline(model):
    """Build the audio-to-text Spark NLP pipeline for the given pretrained model.

    Stages:
      1. AudioAssembler — wraps the raw float audio ("audio_content" column)
         into Spark NLP's audio annotation type.
      2. Wav2Vec2ForCTC — pretrained speech-to-text annotator writing to "text".

    The result is cached per model name across Streamlit reruns.
    """
    assembler = AudioAssembler()
    assembler.setInputCol("audio_content")
    assembler.setOutputCol("audio_assembler")

    recognizer = Wav2Vec2ForCTC.pretrained(model)
    recognizer.setInputCols("audio_assembler")
    recognizer.setOutputCol("text")

    return Pipeline(stages=[assembler, recognizer])
60
+
61
def fit_data(pipeline, fed_data):
    """Transcribe one audio file with the given pipeline.

    Parameters
    ----------
    pipeline : pyspark.ml.Pipeline
        Pipeline produced by `create_pipeline`.
    fed_data : str
        Path to the audio file to transcribe.

    Returns
    -------
    pyspark.sql.DataFrame
        Single-column ("result") DataFrame holding the transcription.

    Notes
    -----
    Reads the module-level `spark` session created at the bottom of this
    script (`spark = init_spark()`), so it must be called after that line runs.
    """
    # Bug fix: neither `librosa` nor the pyspark types were imported anywhere
    # in this file, so the original raised NameError at runtime. Import them
    # here so the function is self-contained.
    import librosa
    from pyspark.sql.types import (ArrayType, FloatType, LongType,
                                   StructField, StructType)

    # Resample to 16 kHz — the rate the Wav2Vec2 models here are fed with.
    data, sampling_rate = librosa.load(fed_data, sr=16000)
    # Spark's FloatType needs plain Python floats, not numpy scalars.
    data = [float(x) for x in data]

    schema = StructType([
        StructField("audio_content", ArrayType(FloatType())),
        StructField("sampling_rate", LongType())
    ])

    df = pd.DataFrame({
        "audio_content": [data],
        "sampling_rate": [sampling_rate]
    })

    spark_df = spark.createDataFrame(df, schema)
    pipeline_df = pipeline.fit(spark_df).transform(spark_df)
    return pipeline_df.select("text.result")
79
+
80
def save_uploadedfile(uploadedfile, path):
    """Write an uploaded file's bytes into directory `path`, keeping its name."""
    destination = os.path.join(path, uploadedfile.name)
    # Streamlit's UploadedFile exposes getbuffer(); fall back to read()
    # for plain file-like objects.
    if hasattr(uploadedfile, 'getbuffer'):
        payload = uploadedfile.getbuffer()
    else:
        payload = uploadedfile.read()
    with open(destination, "wb") as out:
        out.write(payload)
88
+
89
# Sidebar content
# Pretrained Wav2Vec2 ASR models selectable from the Spark NLP models hub.
model_list = [
    "asr_wav2vec2_large_xlsr_53_english_by_jonatasgrosman",
    "asr_wav2vec2_base_100h_13K_steps",
    "asr_wav2vec2_base_100h_ngram",
    "asr_wav2vec2_base_100h_by_facebook",
    "asr_wav2vec2_base_100h_test",
    "asr_wav2vec2_base_960h"
]

model = st.sidebar.selectbox(
    "Choose the pretrained model",
    model_list,
    help="For more info about the models visit: https://sparknlp.org/models"
)

# Main content
st.markdown('<div class="main-title">Speech Recognition With Wav2Vec2ForCTC</div>', unsafe_allow_html=True)
st.markdown('<div class="section"><p>This demo transcribes audio files into texts using the <code>Wav2Vec2ForCTC</code> Annotator and advanced speech recognition models.</p></div>', unsafe_allow_html=True)

# Reference notebook link in sidebar
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown("""
<a href="https://githubtocolab.com/JohnSnowLabs/spark-nlp-workshop/blob/master/open-source-nlp/17.0.Automatic_Speech_Recognition_Wav2Vec2.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
""", unsafe_allow_html=True)

# Load examples bundled with the app from the local "inputs" directory.
AUDIO_FILE_PATH = "inputs"
audio_files = sorted(os.listdir(AUDIO_FILE_PATH))

selected_audio = st.selectbox("Select an audio", audio_files)

# Creating a simplified Python list of audio file types
audio_file_types = ["mp3", "flac", "wav", "aac", "ogg", "aiff", "wma", "m4a", "ape", "dsf", "dff", "midi", "mid", "opus", "amr"]
uploadedfile = st.file_uploader("Try it for yourself!", type=audio_file_types)

# An upload takes precedence over the example dropdown; the uploaded file is
# saved into the examples directory so it can be opened and replayed below.
if uploadedfile:
    selected_audio = f"{AUDIO_FILE_PATH}/{uploadedfile.name}"
    save_uploadedfile(uploadedfile, AUDIO_FILE_PATH)
elif selected_audio:
    selected_audio = f"{AUDIO_FILE_PATH}/{selected_audio}"

# Audio playback and transcription
st.subheader("Play Audio")

with open(selected_audio, 'rb') as audio_file:
    audio_bytes = audio_file.read()
    st.audio(audio_bytes)

st.subheader(f"Transcription for {selected_audio}:")

# Start Spark, build the pipeline for the chosen model, then transcribe the
# selected file. NOTE: `spark` is also read as a global inside fit_data().
spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, selected_audio)

# `output` is a one-row Spark DataFrame; result[0] holds the transcription.
st.text(output.first().result[0].strip())
Dockerfile ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Download base image ubuntu 18.04
FROM ubuntu:18.04

# Set environment variables for the unprivileged runtime user.
ENV NB_USER jovyan
ENV NB_UID 1000
ENV HOME /home/${NB_USER}

# Install required packages (build tools, Python 3, and OpenJDK 8 for the JVM
# that pyspark/spark-nlp run on).
RUN apt-get update && apt-get install -y \
    tar \
    wget \
    bash \
    rsync \
    gcc \
    libfreetype6-dev \
    libhdf5-serial-dev \
    libpng-dev \
    libzmq3-dev \
    python3 \
    python3-dev \
    python3-pip \
    unzip \
    pkg-config \
    software-properties-common \
    graphviz \
    openjdk-8-jdk \
    ant \
    ca-certificates-java \
    && apt-get clean \
    && update-ca-certificates -f;

# Install Python 3.8 and pip from the deadsnakes PPA (18.04 ships 3.6).
# NOTE(review): python3-pip was already installed above — this repeat is a no-op.
RUN add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y python3.8 python3-pip \
    && apt-get clean;

# Set up JAVA_HOME pointing at the JDK installed above.
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
RUN mkdir -p ${HOME} \
    && echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> ${HOME}/.bashrc \
    && chown -R ${NB_UID}:${NB_UID} ${HOME}

# Create a new user named "jovyan" with user ID 1000
RUN useradd -m -u ${NB_UID} ${NB_USER}

# Switch to the "jovyan" user so the app does not run as root.
USER ${NB_USER}

# Set home and path variables for the user (.local/bin for pip --user scripts).
ENV HOME=/home/${NB_USER} \
    PATH=/home/${NB_USER}/.local/bin:$PATH

# Set the working directory to the user's home directory
WORKDIR ${HOME}

# Upgrade pip and install Python dependencies under Python 3.8 specifically.
RUN python3.8 -m pip install --upgrade pip
COPY requirements.txt /tmp/requirements.txt
RUN python3.8 -m pip install -r /tmp/requirements.txt

# Copy the application code into the container at /home/jovyan
COPY --chown=${NB_USER}:${NB_USER} . ${HOME}

# Expose port for Streamlit (7860 is the Hugging Face Spaces convention).
EXPOSE 7860

# Define the entry point for the container
ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
inputs/audio-1.flac ADDED
Binary file (112 kB). View file
 
inputs/audio-10.flac ADDED
Binary file (76 kB). View file
 
inputs/audio-2.flac ADDED
Binary file (49 kB). View file
 
inputs/audio-3.flac ADDED
Binary file (74 kB). View file
 
inputs/audio-4.flac ADDED
Binary file (113 kB). View file
 
inputs/audio-5.flac ADDED
Binary file (138 kB). View file
 
inputs/audio-6.flac ADDED
Binary file (36.5 kB). View file
 
inputs/audio-7.flac ADDED
Binary file (177 kB). View file
 
inputs/audio-8.flac ADDED
Binary file (94.3 kB). View file
 
inputs/audio-9.flac ADDED
Binary file (129 kB). View file
 
pages/Workflow & Model Overview.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit docs page: background, usage guide, model card, and references
for the Wav2Vec2 speech-recognition demo. Pure static content — every section
is rendered with st.markdown(..., unsafe_allow_html=True)."""
import streamlit as st

# Custom CSS for better styling
st.markdown("""
<style>
.main-title {
    font-size: 36px;
    color: #4A90E2;
    font-weight: bold;
    text-align: center;
}
.sub-title {
    font-size: 24px;
    color: #4A90E2;
    margin-top: 20px;
}
.section {
    background-color: #f9f9f9;
    padding: 15px;
    border-radius: 10px;
    margin-top: 20px;
}
.section p, .section ul {
    color: #666666;
}
.link {
    color: #4A90E2;
    text-decoration: none;
}
.benchmark-table {
    width: 100%;
    border-collapse: collapse;
    margin-top: 20px;
}
.benchmark-table th, .benchmark-table td {
    border: 1px solid #ddd;
    padding: 8px;
    text-align: left;
}
.benchmark-table th {
    background-color: #4A90E2;
    color: white;
}
.benchmark-table td {
    background-color: #f2f2f2;
}
</style>
""", unsafe_allow_html=True)

# Main Title
st.markdown('<div class="main-title">Wav2Vec2 for Speech Recognition</div>', unsafe_allow_html=True)

# Description
st.markdown("""
<div class="section">
<p><strong>Wav2Vec2</strong> is a groundbreaking model in Automatic Speech Recognition (ASR), developed to learn speech representations from raw audio. This model achieves exceptional accuracy with minimal labeled data, making it ideal for low-resource settings. Adapted for Spark NLP, Wav2Vec2 enables scalable, production-ready ASR applications.</p>
</div>
""", unsafe_allow_html=True)

# Why, Where, and When to Use Wav2Vec2
st.markdown('<div class="sub-title">Why, Where, and When to Use Wav2Vec2</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<p>Use <strong>Wav2Vec2</strong> when you need a robust ASR solution that excels in scenarios with limited labeled data. It’s perfect for various speech-to-text applications where scalability and accuracy are critical. Some ideal use cases include:</p>
<ul>
<li><strong>Transcription Services:</strong> Efficiently convert large volumes of speech into text, vital for media, legal, and healthcare industries.</li>
<li><strong>Voice-Activated Assistants:</strong> Enhance the accuracy of voice commands in smart devices and personal assistants.</li>
<li><strong>Meeting Summarization:</strong> Automatically transcribe and summarize meetings, aiding in easy content review and catch-up for absentees.</li>
<li><strong>Language Learning Tools:</strong> Assist learners in improving pronunciation by providing real-time speech-to-text feedback.</li>
<li><strong>Accessibility Enhancements:</strong> Generate real-time captions for videos and live events, making content accessible to the hearing impaired.</li>
<li><strong>Call Center Analytics:</strong> Analyze customer interactions for insights and quality monitoring.</li>
</ul>
</div>
""", unsafe_allow_html=True)

# How to Use the Model
# Example pipeline snippet shown to the user (doubled backslashes render as
# line continuations inside the displayed code).
st.markdown('<div class="sub-title">How to Use the Model</div>', unsafe_allow_html=True)
st.code('''
audio_assembler = AudioAssembler() \\
    .setInputCol("audio_content") \\
    .setOutputCol("audio_assembler")

speech_to_text = Wav2Vec2ForCTC \\
    .pretrained("asr_wav2vec2_large_xlsr_53_english_by_jonatasgrosman", "en")\\
    .setInputCols("audio_assembler") \\
    .setOutputCol("text")

pipeline = Pipeline(stages=[
    audio_assembler,
    speech_to_text,
])

pipelineModel = pipeline.fit(audioDf)

pipelineDF = pipelineModel.transform(audioDf)
''', language='python')

# Best Practices & Tips
st.markdown('<div class="sub-title">Best Practices & Tips</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<ul>
<li><strong>Preprocessing:</strong> Ensure your audio data is clear and well-prepared by removing background noise and normalizing audio levels for the best transcription results.</li>
<li><strong>Fine-tuning:</strong> For specific use cases or languages, consider fine-tuning the model on your own dataset to improve accuracy.</li>
<li><strong>Batch Processing:</strong> Leverage Spark NLP's distributed processing capabilities to handle large-scale audio datasets efficiently.</li>
<li><strong>Model Evaluation:</strong> Regularly evaluate the model's performance on your specific use case using metrics like Word Error Rate (WER) to ensure it meets your accuracy requirements.</li>
<li><strong>Resource Management:</strong> When deploying in production, monitor resource usage, especially for large models, to optimize performance and cost.</li>
</ul>
</div>
""", unsafe_allow_html=True)

# Model Information
st.markdown('<div class="sub-title">Model Information</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<table class="benchmark-table">
<tr>
<th>Attribute</th>
<th>Description</th>
</tr>
<tr>
<td><strong>Model Name</strong></td>
<td>asr_wav2vec2_large_xlsr_53_english_by_jonatasgrosman</td>
</tr>
<tr>
<td><strong>Compatibility</strong></td>
<td>Spark NLP 4.2.0+</td>
</tr>
<tr>
<td><strong>License</strong></td>
<td>Open Source</td>
</tr>
<tr>
<td><strong>Edition</strong></td>
<td>Official</td>
</tr>
<tr>
<td><strong>Input Labels</strong></td>
<td>[audio_assembler]</td>
</tr>
<tr>
<td><strong>Output Labels</strong></td>
<td>[text]</td>
</tr>
<tr>
<td><strong>Language</strong></td>
<td>en</td>
</tr>
<tr>
<td><strong>Size</strong></td>
<td>1.2 GB</td>
</tr>
</table>
</div>
""", unsafe_allow_html=True)

# Data Source Section
st.markdown('<div class="sub-title">Data Source</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<p>The Wav2Vec2 model is available on <a class="link" href="https://huggingface.co/jonatasgrosman/asr_wav2vec2_large_xlsr_53_english" target="_blank">Hugging Face</a>. This model, trained by <em>jonatasgrosman</em>, has been adapted for use with Spark NLP, ensuring it is optimized for large-scale applications.</p>
</div>
""", unsafe_allow_html=True)

# Conclusion
st.markdown('<div class="sub-title">Conclusion</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<p><strong>Wav2Vec2</strong> is a versatile and powerful ASR model that excels in scenarios with limited labeled data, making it a game-changer in the field of speech recognition. Its seamless integration with Spark NLP allows for scalable, efficient, and accurate deployment in various real-world applications, from transcription services to voice-activated systems.</p>
</div>
""", unsafe_allow_html=True)

# References
st.markdown('<div class="sub-title">References</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<ul>
<li><a class="link" href="https://sparknlp.org/2022/09/24/asr_wav2vec2_large_xlsr_53_english_by_jonatasgrosman_en.html" target="_blank">Wav2Vec2 Model on Spark NLP</a></li>
<li><a class="link" href="https://huggingface.co/jonatasgrosman/asr_wav2vec2_large_xlsr_53_english" target="_blank">Wav2Vec2 Model on Hugging Face</a></li>
<li><a class="link" href="https://arxiv.org/abs/2006.11477" target="_blank">wav2vec 2.0 Paper</a></li>
<li><a class="link" href="https://github.com/pytorch/fairseq/tree/master/examples/wav2vec" target="_blank">Wav2Vec2 GitHub Repository</a></li>
</ul>
</div>
""", unsafe_allow_html=True)

# Community & Support
st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<ul>
<li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Comprehensive documentation and examples.</li>
<li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Join the community for live discussions and support.</li>
<li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Report issues, request features, and contribute to the project.</li>
<li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Read articles and tutorials about Spark NLP.</li>
<li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Watch video tutorials and demonstrations.</li>
</ul>
</div>
""", unsafe_allow_html=True)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Python dependencies for the Wav2Vec2 ASR Streamlit demo.
# NOTE(review): versions are unpinned — consider pinning for reproducible builds.
streamlit
spark-nlp
pyspark
librosa
pandas