# ASR-Arabic / app.py
import io
import os

import gradio as gr
import Levenshtein
import numpy as np
import requests
from scipy.io import wavfile
from transformers import pipeline

# Function to securely load the Hugging Face API token
def load_hf_token():
    # Read the token from the environment rather than hard-coding it;
    # on Hugging Face Spaces, set HF_TOKEN as a repository secret.
    return os.getenv("HF_TOKEN", "")

# Function to query the Hugging Face Inference API
def transcribe_audio_hf(audio):
    """
    Transcribes speech from an audio clip using the Hugging Face Inference API.

    Args:
        audio (tuple): A (sample_rate, numpy.ndarray) pair, as produced by
            gr.Audio(type="numpy").

    Returns:
        str: The transcription of the speech in the audio clip.
    """
    API_URL = "https://api-inference.huggingface.co/models/jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
    headers = {"Authorization": f"Bearer {load_hf_token()}"}
    sr, y = audio
    # The Inference API expects an encoded audio file, not a raw sample
    # buffer, so serialize the array to an in-memory WAV before posting.
    buffer = io.BytesIO()
    wavfile.write(buffer, sr, y)
    response = requests.post(API_URL, headers=headers, data=buffer.getvalue())
    return response.json().get("text", "").strip()
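
# Illustrative only: a small helper (not used by the app) showing how the
# transcriber above could be driven from a WAV file on disk. The default
# file path is a hypothetical example.
def transcribe_wav_file(path="example.wav"):
    sr, y = wavfile.read(path)  # scipy returns a (sample_rate, samples) pair
    return transcribe_audio_hf((sr, y))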

# Function to calculate Levenshtein similarity
def levenshtein_similarity(transcription1, transcription2):
    """
    Calculate the Levenshtein similarity between two transcriptions.

    Args:
        transcription1 (str): The first transcription.
        transcription2 (str): The second transcription.

    Returns:
        float: A normalized similarity score between 0 and 1, where 1
            indicates identical transcriptions.
    """
    max_len = max(len(transcription1), len(transcription2))
    if max_len == 0:
        # Two empty transcriptions are trivially identical; this also
        # avoids dividing by zero below.
        return 1.0
    distance = Levenshtein.distance(transcription1, transcription2)
    return 1 - distance / max_len  # Normalize to get a similarity score
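
# Worked example (hypothetical strings, for illustration):
#   Levenshtein.distance("سلام", "سلم") == 1  (one deletion)
#   max_len == 4, so the similarity is 1 - 1/4 == 0.75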

# Function to evaluate audio similarity
def evaluate_audio_similarity(original_audio, user_audio):
    """
    Compares the transcription of an original audio clip with that of a
    user's audio clip.

    Args:
        original_audio (tuple): (sample_rate, numpy.ndarray) for the original audio.
        user_audio (tuple): (sample_rate, numpy.ndarray) for the user's audio.

    Returns:
        tuple: Both transcriptions and their Levenshtein similarity score.
    """
    transcription_original = transcribe_audio_hf(original_audio)
    transcription_user = transcribe_audio_hf(user_audio)
    similarity_score = levenshtein_similarity(transcription_original, transcription_user)
    return transcription_original, transcription_user, similarity_score

# Set up the Whisper ASR model for full-context and streaming ASR
whisper_transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
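# Note: whisper-base.en is an English-only checkpoint; pipeline() also accepts
# a device argument (e.g. device=0) to run on a GPU when one is available.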

# Full-context ASR function
def full_context_asr(audio):
    sr, y = audio
    y = y.astype(np.float32)
    if y.ndim > 1:
        y = y.mean(axis=1)  # Downmix stereo to mono for the pipeline
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak  # Peak-normalize; skip for silent input to avoid 0/0
    return whisper_transcriber({"sampling_rate": sr, "raw": y})["text"]

# Streaming ASR function
def streaming_asr(stream, new_chunk):
    sr, y = new_chunk
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak  # Peak-normalize; skip for silent chunks to avoid 0/0
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y
    return stream, whisper_transcriber({"sampling_rate": sr, "raw": stream})["text"]
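
# Illustrative trace of the stream/state pattern (hypothetical chunk sizes):
#   call 1: stream is None, chunk of 16000 samples -> stream holds 16000
#   call 2: stream holds 16000, next 16000 samples -> stream holds 32000
# Gradio feeds the returned stream back in as `stream` on the next call, and
# each call re-transcribes the full buffer, so per-chunk cost grows over time.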

# Define Gradio interface for full-context ASR
def gradio_full_context_interface(audio):
    if audio is not None:
        return full_context_asr(audio)
    else:
        return "Please provide an audio file."

# Define Gradio interface for audio similarity checking
def gradio_similarity_interface(original_audio, user_audio):
    if original_audio is not None and user_audio is not None:
        transcription_original, transcription_user, similarity_score = evaluate_audio_similarity(original_audio, user_audio)
        result = {
            "Original Transcription": transcription_original,
            "User Transcription": transcription_user,
            "Levenshtein Similarity Score": similarity_score,
        }
        if similarity_score > 0.8:  # Adjust the threshold as needed
            result["Feedback"] = "The pronunciation is likely correct based on transcription similarity."
        else:
            result["Feedback"] = "The pronunciation may be incorrect based on transcription similarity."
        return result
    else:
        # Return a dict so the JSON output component always receives JSON data
        return {"Error": "Please provide both original and user audio for comparison."}

# Create Gradio app for full-context ASR
full_context_demo = gr.Interface(
    fn=gradio_full_context_interface,
    inputs=gr.Audio(sources=["microphone"], type="numpy"),  # Gradio 4.x API
    outputs="text",
    title="Full-Context ASR Demo",
)

# Create Gradio app for streaming ASR
streaming_demo = gr.Interface(
    fn=streaming_asr,
    inputs=["state", gr.Audio(sources=["microphone"], type="numpy", streaming=True)],
    outputs=["state", "text"],
    live=True,
    title="Streaming ASR Demo",
)

# Create Gradio app for audio similarity checking
similarity_demo = gr.Interface(
    fn=gradio_similarity_interface,
    inputs=[
        gr.Audio(sources=["upload"], type="numpy", label="Original Audio"),
        gr.Audio(sources=["upload"], type="numpy", label="User Audio"),
    ],
    outputs="json",
    title="Audio Transcription and Similarity Checker",
)

# Launch all three demos in a tabbed layout
gr.TabbedInterface(
    [full_context_demo, streaming_demo, similarity_demo],
    ["Full-Context ASR", "Streaming ASR", "Similarity Checker"],
).launch()