import io
import os

import gradio as gr
import Levenshtein
import numpy as np
import requests
import soundfile as sf
from transformers import pipeline

# Function to securely load the Hugging Face API token
def load_hf_token():
    # Prefer an environment variable (e.g., a Space secret) over hard-coding
    # the token in source; the variable name here is illustrative.
    return os.environ.get("HF_API_TOKEN", "your_huggingface_api_token")

# Function to query the Hugging Face Inference API
def transcribe_audio_hf(audio):
    """
    Transcribes speech from an audio clip using the Hugging Face Inference API.

    Args:
        audio (tuple): A (sample_rate, numpy.array) pair, as produced by
            gr.Audio(type="numpy").

    Returns:
        str: The transcription of the speech in the audio clip.
    """
    API_URL = "https://api-inference.huggingface.co/models/jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
    headers = {"Authorization": f"Bearer {load_hf_token()}"}
    # The API expects an encoded audio file rather than raw PCM samples,
    # so serialize the numpy data to WAV in memory before posting.
    sr, y = audio
    buffer = io.BytesIO()
    sf.write(buffer, y, sr, format="WAV")
    buffer.seek(0)
    response = requests.post(API_URL, headers=headers, data=buffer.read())
    return response.json().get("text", "").strip()
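
# Illustrative usage sketch (hypothetical file path): transcribe a local clip
# read with soundfile, passed in the same (sample_rate, samples) form:
#
#     data, rate = sf.read("example.wav", dtype="int16")
#     print(transcribe_audio_hf((rate, data)))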

# Function to calculate Levenshtein similarity
def levenshtein_similarity(transcription1, transcription2):
    """
    Calculate the Levenshtein similarity between two transcriptions.

    Args:
        transcription1 (str): The first transcription.
        transcription2 (str): The second transcription.

    Returns:
        float: A normalized similarity score between 0 and 1, where 1 indicates
            identical transcriptions.
    """
    max_len = max(len(transcription1), len(transcription2))
    if max_len == 0:
        return 1.0  # Two empty transcriptions are trivially identical
    distance = Levenshtein.distance(transcription1, transcription2)
    return 1 - distance / max_len  # Normalize to get a similarity score
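
# Quick sanity check of the normalization, using the classic textbook pair:
# Levenshtein.distance("kitten", "sitting") is 3 and the longer string has
# length 7, so levenshtein_similarity("kitten", "sitting") = 1 - 3/7 ≈ 0.571.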

# Function to evaluate audio similarity
def evaluate_audio_similarity(original_audio, user_audio):
    """
    Compares the transcription of an original audio clip with that of a user's clip.

    Args:
        original_audio (tuple): Original (sample_rate, numpy.array) audio data.
        user_audio (tuple): User's (sample_rate, numpy.array) audio data.

    Returns:
        tuple: Both transcriptions and their Levenshtein similarity score.
    """
    transcription_original = transcribe_audio_hf(original_audio)
    transcription_user = transcribe_audio_hf(user_audio)
    similarity_score = levenshtein_similarity(transcription_original, transcription_user)
    return transcription_original, transcription_user, similarity_score
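
# Illustrative usage (hypothetical clips already in gr.Audio's numpy format):
#
#     orig_text, user_text, score = evaluate_audio_similarity((sr1, y1), (sr2, y2))
#     print(f"Similarity: {score:.2f}")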

# Set up the Whisper ASR model for full-context and streaming ASR
whisper_transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

# Full-context ASR function
def full_context_asr(audio):
    sr, y = audio
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak  # Peak-normalize, guarding against all-zero (silent) input
    return whisper_transcriber({"sampling_rate": sr, "raw": y})["text"]

# Streaming ASR function
def streaming_asr(stream, new_chunk):
    sr, y = new_chunk
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak  # Peak-normalize each chunk, guarding against silence
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y
    return stream, whisper_transcriber({"sampling_rate": sr, "raw": stream})["text"]
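
# Note: each new chunk re-transcribes the entire accumulated buffer, which keeps
# the partial transcript coherent but makes per-chunk cost grow with stream
# length; a windowed variant (transcribing only the last few seconds) is a
# common trade-off for long sessions.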

# Define Gradio interface for full-context ASR
def gradio_full_context_interface(audio):
    if audio is not None:
        transcription = full_context_asr(audio)
        return transcription
    else:
        return "Please provide an audio file."

# Define Gradio interface for audio similarity checking
def gradio_similarity_interface(original_audio, user_audio):
    if original_audio is not None and user_audio is not None:
        transcription_original, transcription_user, similarity_score = evaluate_audio_similarity(original_audio, user_audio)
        result = {
            "Original Transcription": transcription_original,
            "User Transcription": transcription_user,
            "Levenshtein Similarity Score": similarity_score,
        }
        if similarity_score > 0.8:  # Adjust the threshold as needed
            result["Feedback"] = "The pronunciation is likely correct based on transcription similarity."
        else:
            result["Feedback"] = "The pronunciation may be incorrect based on transcription similarity."
        return result
    else:
        return "Please provide both original and user audio for comparison."

# Create Gradio app for full-context ASR
full_context_demo = gr.Interface(
    fn=gradio_full_context_interface,
    inputs=gr.Audio(sources=["microphone"], type="numpy"),
    outputs="text",
    title="Full-Context ASR Demo"
)

# Create Gradio app for streaming ASR
streaming_demo = gr.Interface(
    fn=streaming_asr,
    inputs=["state", gr.Audio(sources=["microphone"], type="numpy", streaming=True)],
    outputs=["state", "text"],
    live=True,
    title="Streaming ASR Demo"
)

# Create Gradio app for audio similarity checking
similarity_demo = gr.Interface(
    fn=gradio_similarity_interface,
    inputs=[
        gr.Audio(sources=["upload"], type="numpy", label="Original Audio"),
        gr.Audio(sources=["upload"], type="numpy", label="User Audio")
    ],
    outputs="json",
    title="Audio Transcription and Similarity Checker"
)

# Launch all three demos
gr.TabbedInterface(
    [full_context_demo, streaming_demo, similarity_demo],
    ["Full-Context ASR", "Streaming ASR", "Similarity Checker"]
).launch()