abdullahmubeen10 commited on
Commit
6b52778
·
verified ·
1 Parent(s): 63c4ad8

Upload 15 files

Browse files
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
# Streamlit UI theme: light base with a custom blue primary accent color.
[theme]
base="light"
primaryColor="#29B4E8"
Demo.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import librosa
import pandas as pd
import sparknlp
import streamlit as st
from pyspark.ml import Pipeline
from pyspark.sql.types import ArrayType, FloatType, LongType, StructField, StructType
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
10
+
11
# Page configuration: wide layout gives the transcription output more room.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="auto"
)

# Custom CSS for styling the title banner and the grey content sections
# rendered via st.markdown(..., unsafe_allow_html=True) below.
st.markdown("""
<style>
.main-title {
    font-size: 36px;
    color: #4A90E2;
    font-weight: bold;
    text-align: center;
}
.section {
    background-color: #f9f9f9;
    padding: 10px;
    border-radius: 10px;
    margin-top: 10px;
}
.section p, .section ul {
    color: #666666;
}
</style>
""", unsafe_allow_html=True)
37
+
38
@st.cache_resource
def init_spark():
    """Start (or reuse) the Spark NLP session; cached across Streamlit reruns."""
    session = sparknlp.start()
    return session
42
+
43
@st.cache_resource
def create_pipeline(model):
    """Build the audio-to-text Spark NLP pipeline for the given pretrained model.

    Stages:
      1. AudioAssembler — wraps the raw float audio ("audio_content" column)
         into Spark NLP's audio annotation type.
      2. Wav2Vec2ForCTC — pretrained speech-to-text annotator writing to "text".

    The result is cached per model name across Streamlit reruns.
    """
    assembler = AudioAssembler()
    assembler.setInputCol("audio_content")
    assembler.setOutputCol("audio_assembler")

    recognizer = Wav2Vec2ForCTC.pretrained(model)
    recognizer.setInputCols("audio_assembler")
    recognizer.setOutputCol("text")

    return Pipeline(stages=[assembler, recognizer])
60
+
61
def fit_data(pipeline, fed_data):
    """Transcribe one audio file with the given pipeline.

    Parameters
    ----------
    pipeline : pyspark.ml.Pipeline
        Pipeline produced by `create_pipeline`.
    fed_data : str
        Path to the audio file to transcribe.

    Returns
    -------
    pyspark.sql.DataFrame
        Single-column ("result") DataFrame holding the transcription.

    Notes
    -----
    Reads the module-level `spark` session created at the bottom of this
    script (`spark = init_spark()`), so it must be called after that line runs.
    """
    # Bug fix: neither `librosa` nor the pyspark types were imported anywhere
    # in this file, so the original raised NameError at runtime. Import them
    # here so the function is self-contained.
    import librosa
    from pyspark.sql.types import (ArrayType, FloatType, LongType,
                                   StructField, StructType)

    # Resample to 16 kHz — the rate the Wav2Vec2 models here are fed with.
    data, sampling_rate = librosa.load(fed_data, sr=16000)
    # Spark's FloatType needs plain Python floats, not numpy scalars.
    data = [float(x) for x in data]

    schema = StructType([
        StructField("audio_content", ArrayType(FloatType())),
        StructField("sampling_rate", LongType())
    ])

    df = pd.DataFrame({
        "audio_content": [data],
        "sampling_rate": [sampling_rate]
    })

    spark_df = spark.createDataFrame(df, schema)
    pipeline_df = pipeline.fit(spark_df).transform(spark_df)
    return pipeline_df.select("text.result")
79
+
80
def save_uploadedfile(uploadedfile, path):
    """Write an uploaded file's bytes into directory `path`, keeping its name."""
    destination = os.path.join(path, uploadedfile.name)
    # Streamlit's UploadedFile exposes getbuffer(); fall back to read()
    # for plain file-like objects.
    if hasattr(uploadedfile, 'getbuffer'):
        payload = uploadedfile.getbuffer()
    else:
        payload = uploadedfile.read()
    with open(destination, "wb") as out:
        out.write(payload)
88
+
89
# Sidebar content
# Pretrained Wav2Vec2 ASR models selectable from the Spark NLP models hub.
model_list = [
    "asr_wav2vec2_large_xlsr_53_english_by_jonatasgrosman",
    "asr_wav2vec2_base_100h_13K_steps",
    "asr_wav2vec2_base_100h_ngram",
    "asr_wav2vec2_base_100h_by_facebook",
    "asr_wav2vec2_base_100h_test",
    "asr_wav2vec2_base_960h"
]

model = st.sidebar.selectbox(
    "Choose the pretrained model",
    model_list,
    help="For more info about the models visit: https://sparknlp.org/models"
)

# Main content
st.markdown('<div class="main-title">Speech Recognition With Wav2Vec2ForCTC</div>', unsafe_allow_html=True)
st.markdown('<div class="section"><p>This demo transcribes audio files into texts using the <code>Wav2Vec2ForCTC</code> Annotator and advanced speech recognition models.</p></div>', unsafe_allow_html=True)

# Reference notebook link in sidebar
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown("""
<a href="https://githubtocolab.com/JohnSnowLabs/spark-nlp-workshop/blob/master/open-source-nlp/17.0.Automatic_Speech_Recognition_Wav2Vec2.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
""", unsafe_allow_html=True)

# Load examples bundled with the app from the local "inputs" directory.
AUDIO_FILE_PATH = "inputs"
audio_files = sorted(os.listdir(AUDIO_FILE_PATH))

selected_audio = st.selectbox("Select an audio", audio_files)

# Creating a simplified Python list of audio file types
audio_file_types = ["mp3", "flac", "wav", "aac", "ogg", "aiff", "wma", "m4a", "ape", "dsf", "dff", "midi", "mid", "opus", "amr"]
uploadedfile = st.file_uploader("Try it for yourself!", type=audio_file_types)

# An upload takes precedence over the example dropdown; the uploaded file is
# saved into the examples directory so it can be opened and replayed below.
if uploadedfile:
    selected_audio = f"{AUDIO_FILE_PATH}/{uploadedfile.name}"
    save_uploadedfile(uploadedfile, AUDIO_FILE_PATH)
elif selected_audio:
    selected_audio = f"{AUDIO_FILE_PATH}/{selected_audio}"

# Audio playback and transcription
st.subheader("Play Audio")

with open(selected_audio, 'rb') as audio_file:
    audio_bytes = audio_file.read()
    st.audio(audio_bytes)

st.subheader(f"Transcription for {selected_audio}:")

# Start Spark, build the pipeline for the chosen model, then transcribe the
# selected file. NOTE: `spark` is also read as a global inside fit_data().
spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, selected_audio)

# `output` is a one-row Spark DataFrame; result[0] holds the transcription.
st.text(output.first().result[0].strip())
Dockerfile ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Download base image ubuntu 18.04
FROM ubuntu:18.04

# Set environment variables for the unprivileged runtime user.
ENV NB_USER jovyan
ENV NB_UID 1000
ENV HOME /home/${NB_USER}

# Install required packages (build tools, Python 3, and OpenJDK 8 for the JVM
# that pyspark/spark-nlp run on).
RUN apt-get update && apt-get install -y \
    tar \
    wget \
    bash \
    rsync \
    gcc \
    libfreetype6-dev \
    libhdf5-serial-dev \
    libpng-dev \
    libzmq3-dev \
    python3 \
    python3-dev \
    python3-pip \
    unzip \
    pkg-config \
    software-properties-common \
    graphviz \
    openjdk-8-jdk \
    ant \
    ca-certificates-java \
    && apt-get clean \
    && update-ca-certificates -f;

# Install Python 3.8 and pip from the deadsnakes PPA (18.04 ships 3.6).
# NOTE(review): python3-pip was already installed above — this repeat is a no-op.
RUN add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y python3.8 python3-pip \
    && apt-get clean;

# Set up JAVA_HOME pointing at the JDK installed above.
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
RUN mkdir -p ${HOME} \
    && echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> ${HOME}/.bashrc \
    && chown -R ${NB_UID}:${NB_UID} ${HOME}

# Create a new user named "jovyan" with user ID 1000
RUN useradd -m -u ${NB_UID} ${NB_USER}

# Switch to the "jovyan" user so the app does not run as root.
USER ${NB_USER}

# Set home and path variables for the user (.local/bin for pip --user scripts).
ENV HOME=/home/${NB_USER} \
    PATH=/home/${NB_USER}/.local/bin:$PATH

# Set the working directory to the user's home directory
WORKDIR ${HOME}

# Upgrade pip and install Python dependencies under Python 3.8 specifically.
RUN python3.8 -m pip install --upgrade pip
COPY requirements.txt /tmp/requirements.txt
RUN python3.8 -m pip install -r /tmp/requirements.txt

# Copy the application code into the container at /home/jovyan
COPY --chown=${NB_USER}:${NB_USER} . ${HOME}

# Expose port for Streamlit (7860 is the Hugging Face Spaces convention).
EXPOSE 7860

# Define the entry point for the container
ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
inputs/audio-1.flac ADDED
Binary file (112 kB). View file
 
inputs/audio-10.flac ADDED
Binary file (76 kB). View file
 
inputs/audio-2.flac ADDED
Binary file (49 kB). View file
 
inputs/audio-3.flac ADDED
Binary file (74 kB). View file
 
inputs/audio-4.flac ADDED
Binary file (113 kB). View file
 
inputs/audio-5.flac ADDED
Binary file (138 kB). View file
 
inputs/audio-6.flac ADDED
Binary file (36.5 kB). View file
 
inputs/audio-7.flac ADDED
Binary file (177 kB). View file
 
inputs/audio-8.flac ADDED
Binary file (94.3 kB). View file
 
inputs/audio-9.flac ADDED
Binary file (129 kB). View file
 
pages/Workflow & Model Overview.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit docs page: background, usage guide, model card, and references
for the Wav2Vec2 speech-recognition demo. Pure static content — every section
is rendered with st.markdown(..., unsafe_allow_html=True)."""
import streamlit as st

# Custom CSS for better styling
st.markdown("""
<style>
.main-title {
    font-size: 36px;
    color: #4A90E2;
    font-weight: bold;
    text-align: center;
}
.sub-title {
    font-size: 24px;
    color: #4A90E2;
    margin-top: 20px;
}
.section {
    background-color: #f9f9f9;
    padding: 15px;
    border-radius: 10px;
    margin-top: 20px;
}
.section p, .section ul {
    color: #666666;
}
.link {
    color: #4A90E2;
    text-decoration: none;
}
.benchmark-table {
    width: 100%;
    border-collapse: collapse;
    margin-top: 20px;
}
.benchmark-table th, .benchmark-table td {
    border: 1px solid #ddd;
    padding: 8px;
    text-align: left;
}
.benchmark-table th {
    background-color: #4A90E2;
    color: white;
}
.benchmark-table td {
    background-color: #f2f2f2;
}
</style>
""", unsafe_allow_html=True)

# Main Title
st.markdown('<div class="main-title">Wav2Vec2 for Speech Recognition</div>', unsafe_allow_html=True)

# Description
st.markdown("""
<div class="section">
<p><strong>Wav2Vec2</strong> is a groundbreaking model in Automatic Speech Recognition (ASR), developed to learn speech representations from raw audio. This model achieves exceptional accuracy with minimal labeled data, making it ideal for low-resource settings. Adapted for Spark NLP, Wav2Vec2 enables scalable, production-ready ASR applications.</p>
</div>
""", unsafe_allow_html=True)

# Why, Where, and When to Use Wav2Vec2
st.markdown('<div class="sub-title">Why, Where, and When to Use Wav2Vec2</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<p>Use <strong>Wav2Vec2</strong> when you need a robust ASR solution that excels in scenarios with limited labeled data. It’s perfect for various speech-to-text applications where scalability and accuracy are critical. Some ideal use cases include:</p>
<ul>
<li><strong>Transcription Services:</strong> Efficiently convert large volumes of speech into text, vital for media, legal, and healthcare industries.</li>
<li><strong>Voice-Activated Assistants:</strong> Enhance the accuracy of voice commands in smart devices and personal assistants.</li>
<li><strong>Meeting Summarization:</strong> Automatically transcribe and summarize meetings, aiding in easy content review and catch-up for absentees.</li>
<li><strong>Language Learning Tools:</strong> Assist learners in improving pronunciation by providing real-time speech-to-text feedback.</li>
<li><strong>Accessibility Enhancements:</strong> Generate real-time captions for videos and live events, making content accessible to the hearing impaired.</li>
<li><strong>Call Center Analytics:</strong> Analyze customer interactions for insights and quality monitoring.</li>
</ul>
</div>
""", unsafe_allow_html=True)

# How to Use the Model
# Example pipeline snippet shown to the user (doubled backslashes render as
# line continuations inside the displayed code).
st.markdown('<div class="sub-title">How to Use the Model</div>', unsafe_allow_html=True)
st.code('''
audio_assembler = AudioAssembler() \\
    .setInputCol("audio_content") \\
    .setOutputCol("audio_assembler")

speech_to_text = Wav2Vec2ForCTC \\
    .pretrained("asr_wav2vec2_large_xlsr_53_english_by_jonatasgrosman", "en")\\
    .setInputCols("audio_assembler") \\
    .setOutputCol("text")

pipeline = Pipeline(stages=[
    audio_assembler,
    speech_to_text,
])

pipelineModel = pipeline.fit(audioDf)

pipelineDF = pipelineModel.transform(audioDf)
''', language='python')

# Best Practices & Tips
st.markdown('<div class="sub-title">Best Practices & Tips</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<ul>
<li><strong>Preprocessing:</strong> Ensure your audio data is clear and well-prepared by removing background noise and normalizing audio levels for the best transcription results.</li>
<li><strong>Fine-tuning:</strong> For specific use cases or languages, consider fine-tuning the model on your own dataset to improve accuracy.</li>
<li><strong>Batch Processing:</strong> Leverage Spark NLP's distributed processing capabilities to handle large-scale audio datasets efficiently.</li>
<li><strong>Model Evaluation:</strong> Regularly evaluate the model's performance on your specific use case using metrics like Word Error Rate (WER) to ensure it meets your accuracy requirements.</li>
<li><strong>Resource Management:</strong> When deploying in production, monitor resource usage, especially for large models, to optimize performance and cost.</li>
</ul>
</div>
""", unsafe_allow_html=True)

# Model Information
st.markdown('<div class="sub-title">Model Information</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<table class="benchmark-table">
<tr>
<th>Attribute</th>
<th>Description</th>
</tr>
<tr>
<td><strong>Model Name</strong></td>
<td>asr_wav2vec2_large_xlsr_53_english_by_jonatasgrosman</td>
</tr>
<tr>
<td><strong>Compatibility</strong></td>
<td>Spark NLP 4.2.0+</td>
</tr>
<tr>
<td><strong>License</strong></td>
<td>Open Source</td>
</tr>
<tr>
<td><strong>Edition</strong></td>
<td>Official</td>
</tr>
<tr>
<td><strong>Input Labels</strong></td>
<td>[audio_assembler]</td>
</tr>
<tr>
<td><strong>Output Labels</strong></td>
<td>[text]</td>
</tr>
<tr>
<td><strong>Language</strong></td>
<td>en</td>
</tr>
<tr>
<td><strong>Size</strong></td>
<td>1.2 GB</td>
</tr>
</table>
</div>
""", unsafe_allow_html=True)

# Data Source Section
st.markdown('<div class="sub-title">Data Source</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<p>The Wav2Vec2 model is available on <a class="link" href="https://huggingface.co/jonatasgrosman/asr_wav2vec2_large_xlsr_53_english" target="_blank">Hugging Face</a>. This model, trained by <em>jonatasgrosman</em>, has been adapted for use with Spark NLP, ensuring it is optimized for large-scale applications.</p>
</div>
""", unsafe_allow_html=True)

# Conclusion
st.markdown('<div class="sub-title">Conclusion</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<p><strong>Wav2Vec2</strong> is a versatile and powerful ASR model that excels in scenarios with limited labeled data, making it a game-changer in the field of speech recognition. Its seamless integration with Spark NLP allows for scalable, efficient, and accurate deployment in various real-world applications, from transcription services to voice-activated systems.</p>
</div>
""", unsafe_allow_html=True)

# References
st.markdown('<div class="sub-title">References</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<ul>
<li><a class="link" href="https://sparknlp.org/2022/09/24/asr_wav2vec2_large_xlsr_53_english_by_jonatasgrosman_en.html" target="_blank">Wav2Vec2 Model on Spark NLP</a></li>
<li><a class="link" href="https://huggingface.co/jonatasgrosman/asr_wav2vec2_large_xlsr_53_english" target="_blank">Wav2Vec2 Model on Hugging Face</a></li>
<li><a class="link" href="https://arxiv.org/abs/2006.11477" target="_blank">wav2vec 2.0 Paper</a></li>
<li><a class="link" href="https://github.com/pytorch/fairseq/tree/master/examples/wav2vec" target="_blank">Wav2Vec2 GitHub Repository</a></li>
</ul>
</div>
""", unsafe_allow_html=True)

# Community & Support
st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
<ul>
<li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Comprehensive documentation and examples.</li>
<li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Join the community for live discussions and support.</li>
<li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Report issues, request features, and contribute to the project.</li>
<li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Read articles and tutorials about Spark NLP.</li>
<li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Watch video tutorials and demonstrations.</li>
</ul>
</div>
""", unsafe_allow_html=True)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Python dependencies for the Wav2Vec2 ASR Streamlit demo.
# NOTE(review): versions are unpinned — consider pinning for reproducible builds.
streamlit
spark-nlp
pyspark
librosa
pandas