# Wav2Vec2ForCTC / Demo.py
# Streamlit demo: transcribe audio to text with Spark NLP's Wav2Vec2ForCTC annotator.
import os

import librosa
import pandas as pd
import sparknlp
import streamlit as st
from pyspark.ml import Pipeline
from pyspark.sql.types import ArrayType, FloatType, LongType, StructField, StructType
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
# ---------------------------------------------------------------------------
# Page setup: wide layout plus a small CSS theme for the title and sections.
# ---------------------------------------------------------------------------
st.set_page_config(layout="wide", initial_sidebar_state="auto")

_CUSTOM_CSS = """
<style>
    .main-title {
        font-size: 36px;
        color: #4A90E2;
        font-weight: bold;
        text-align: center;
    }
    .section {
        background-color: #f9f9f9;
        padding: 10px;
        border-radius: 10px;
        margin-top: 10px;
    }
    .section p, .section ul {
        color: #666666;
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
@st.cache_resource
def init_spark():
    """Start (or attach to) the Spark NLP session, cached across Streamlit reruns."""
    session = sparknlp.start()
    return session
@st.cache_resource
def create_pipeline(model):
    """Build the audio -> text Spark NLP pipeline for the chosen pretrained model."""
    # Wraps the raw float array so the ASR annotator can consume it.
    assembler = (
        AudioAssembler()
        .setInputCol("audio_content")
        .setOutputCol("audio_assembler")
    )
    # Pretrained Wav2Vec2 CTC model performing the actual transcription.
    recognizer = (
        Wav2Vec2ForCTC.pretrained(model)
        .setInputCols("audio_assembler")
        .setOutputCol("text")
    )
    return Pipeline(stages=[assembler, recognizer])
def fit_data(pipeline, fed_data):
    """Transcribe an audio file with the given pipeline.

    Parameters
    ----------
    pipeline : pyspark.ml.Pipeline
        Pipeline produced by ``create_pipeline``.
    fed_data : str
        Path to the audio file to transcribe.

    Returns
    -------
    pyspark.sql.DataFrame
        Single-column DataFrame (``result``) holding the transcription.
    """
    # Wav2Vec2 models expect 16 kHz input; librosa resamples on load.
    data, sampling_rate = librosa.load(fed_data, sr=16000)
    schema = StructType([
        StructField("audio_content", ArrayType(FloatType())),
        StructField("sampling_rate", LongType()),
    ])
    # .tolist() converts the numpy float32 samples to plain Python floats
    # (required by FloatType) in one C-level pass instead of a per-element
    # Python comprehension.
    df = pd.DataFrame({
        "audio_content": [data.tolist()],
        "sampling_rate": [sampling_rate],
    })
    # NOTE(review): relies on the module-level `spark` session created
    # later in this script via init_spark().
    spark_df = spark.createDataFrame(df, schema)
    pipeline_df = pipeline.fit(spark_df).transform(spark_df)
    return pipeline_df.select("text.result")
def save_uploadedfile(uploadedfile, path):
    """Persist an uploaded file under *path* and return the saved file path.

    Accepts both Streamlit's UploadedFile (which exposes ``getbuffer``) and
    any plain file-like object that only supports ``read``.

    Parameters
    ----------
    uploadedfile : object
        Upload with a ``name`` attribute and ``getbuffer()`` or ``read()``.
    path : str
        Directory the file is written into.

    Returns
    -------
    str
        Full path of the written file (previously ``None``; callers had to
        rebuild the path by hand).
    """
    filepath = os.path.join(path, uploadedfile.name)
    with open(filepath, "wb") as f:
        if hasattr(uploadedfile, "getbuffer"):
            # Zero-copy view into Streamlit's in-memory upload buffer.
            f.write(uploadedfile.getbuffer())
        else:
            f.write(uploadedfile.read())
    return filepath
# ---------------------------------------------------------------------------
# Sidebar: pretrained model picker.
# ---------------------------------------------------------------------------
model_list = [
    "asr_wav2vec2_large_xlsr_53_english_by_jonatasgrosman",
    "asr_wav2vec2_base_100h_13K_steps",
    "asr_wav2vec2_base_100h_ngram",
    "asr_wav2vec2_base_100h_by_facebook",
    "asr_wav2vec2_base_100h_test",
    "asr_wav2vec2_base_960h",
]
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    model_list,
    help="For more info about the models visit: https://sparknlp.org/models",
)
# ---------------------------------------------------------------------------
# Main page header and demo description.
# ---------------------------------------------------------------------------
st.markdown(
    '<div class="main-title">Speech Recognition With Wav2Vec2ForCTC</div>',
    unsafe_allow_html=True,
)
st.markdown(
    '<div class="section"><p>This demo transcribes audio files into texts using the <code>Wav2Vec2ForCTC</code> Annotator and advanced speech recognition models.</p></div>',
    unsafe_allow_html=True,
)

# Sidebar: Colab badge linking to the reference notebook.
st.sidebar.markdown('Reference notebook:')
_COLAB_BADGE = """
<a href="https://githubtocolab.com/JohnSnowLabs/spark-nlp-workshop/blob/master/open-source-nlp/17.0.Automatic_Speech_Recognition_Wav2Vec2.ipynb">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
"""
st.sidebar.markdown(_COLAB_BADGE, unsafe_allow_html=True)
# ---------------------------------------------------------------------------
# Audio selection: bundled example files plus an optional user upload.
# ---------------------------------------------------------------------------
AUDIO_FILE_PATH = "inputs"
audio_files = sorted(os.listdir(AUDIO_FILE_PATH))
selected_audio = st.selectbox("Select an audio", audio_files)

# Common audio container/codec extensions accepted by the uploader.
audio_file_types = [
    "mp3", "flac", "wav", "aac", "ogg", "aiff", "wma", "m4a",
    "ape", "dsf", "dff", "midi", "mid", "opus", "amr",
]
uploadedfile = st.file_uploader("Try it for yourself!", type=audio_file_types)

if uploadedfile:
    # Persist the upload next to the bundled examples, then transcribe it.
    save_uploadedfile(uploadedfile, AUDIO_FILE_PATH)
    # os.path.join (not manual "/" concatenation) keeps path handling
    # consistent with save_uploadedfile and portable across platforms.
    selected_audio = os.path.join(AUDIO_FILE_PATH, uploadedfile.name)
elif selected_audio:
    selected_audio = os.path.join(AUDIO_FILE_PATH, selected_audio)
# ---------------------------------------------------------------------------
# Playback and transcription of the chosen audio file.
# ---------------------------------------------------------------------------
st.subheader("Play Audio")
with open(selected_audio, "rb") as audio_file:
    st.audio(audio_file.read())

st.subheader(f"Transcription for {selected_audio}:")

spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, selected_audio)
# First row, first annotation result: the transcribed text.
st.text(output.first().result[0].strip())