import io
import os

import gradio as gr
import requests
import Levenshtein
import numpy as np
import soundfile as sf
from transformers import pipeline

# Function to securely load the Hugging Face API token
def load_hf_token():
    # Read the token from the environment rather than hard-coding a secret
    token = os.getenv("HF_TOKEN")
    if not token:
        raise RuntimeError("Set the HF_TOKEN environment variable to your Hugging Face API token.")
    return token

# Function to query the Hugging Face Inference API
def transcribe_audio_hf(audio):
    """
    Transcribes speech from an audio clip using the Hugging Face Inference API.
    Args:
        audio (tuple): A (sample_rate, numpy.ndarray) pair, as produced by
            gr.Audio(type="numpy").
    Returns:
        str: The transcription of the speech in the audio clip.
    """
    API_URL = "https://api-inference.huggingface.co/models/jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
    headers = {"Authorization": f"Bearer {load_hf_token()}"}
    sr, y = audio
    # The API expects an encoded audio file, not raw sample bytes, so wrap
    # the samples in an in-memory WAV container before posting
    buffer = io.BytesIO()
    sf.write(buffer, y, sr, format="WAV")
    buffer.seek(0)
    response = requests.post(API_URL, headers=headers, data=buffer.read())
    response.raise_for_status()
    return response.json().get("text", "").strip()
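
# A minimal sketch of calling the transcriber directly, assuming a mono
# recording loaded with soundfile (the file name is hypothetical):
#
#   samples, rate = sf.read("recording.wav")
#   print(transcribe_audio_hf((rate, samples)))
#
# On success the Inference API responds with JSON like {"text": "..."}; while
# the model is still loading it may return an error status instead, which
# raise_for_status() above surfaces as an exception.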

# Function to calculate Levenshtein similarity
def levenshtein_similarity(transcription1, transcription2):
    """
    Calculate the Levenshtein similarity between two transcriptions.
    Args:
        transcription1 (str): The first transcription.
        transcription2 (str): The second transcription.
    Returns:
        float: A normalized similarity score between 0 and 1, where 1 indicates identical transcriptions.
    """
    distance = Levenshtein.distance(transcription1, transcription2)
    max_len = max(len(transcription1), len(transcription2))
    if max_len == 0:
        return 1.0  # Two empty transcriptions are trivially identical
    return 1 - distance / max_len  # Normalize to get a similarity score
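
# Worked example (hypothetical strings): "kitten" vs "sitting" has an edit
# distance of 3 and a longest length of 7, so the score is 1 - 3/7 ≈ 0.571.
#   assert abs(levenshtein_similarity("kitten", "sitting") - (1 - 3 / 7)) < 1e-9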

# Function to evaluate audio similarity
def evaluate_audio_similarity(original_audio, user_audio):
    """
    Compares the transcription of an original audio clip against a user's attempt.
    Args:
        original_audio (tuple): Original (sample_rate, samples) pair.
        user_audio (tuple): The user's (sample_rate, samples) pair.
    Returns:
        tuple: Both transcriptions and their Levenshtein similarity score.
    """
    transcription_original = transcribe_audio_hf(original_audio)
    transcription_user = transcribe_audio_hf(user_audio)
    similarity_score = levenshtein_similarity(transcription_original, transcription_user)
    return transcription_original, transcription_user, similarity_score

# Set up the Whisper ASR model for full-context and streaming ASR
whisper_transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

# Full-context ASR function
def full_context_asr(audio):
    sr, y = audio
    y = y.astype(np.float32)
    if y.ndim > 1:  # Down-mix stereo recordings to mono
        y = y.mean(axis=1)
    peak = np.max(np.abs(y))
    if peak > 0:  # Avoid dividing by zero on silent input
        y /= peak
    return whisper_transcriber({"sampling_rate": sr, "raw": y})["text"]
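
# Quick plumbing check with a synthetic one-second 440 Hz tone (an assumption
# for illustration; a pure tone yields an empty or meaningless transcription,
# so this only verifies the (sample_rate, samples) input format):
#
#   sr_test = 16000
#   tone = np.sin(2 * np.pi * 440 * np.arange(sr_test) / sr_test).astype(np.float32)
#   print(full_context_asr((sr_test, tone)))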

# Streaming ASR function
def streaming_asr(stream, new_chunk):
    sr, y = new_chunk
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # Avoid dividing by zero on silent chunks
        y /= peak

    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    return stream, whisper_transcriber({"sampling_rate": sr, "raw": stream})["text"]
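
# Gradio invokes streaming_asr once per recorded chunk: `stream` carries the
# audio accumulated so far (None on the first call) and `new_chunk` is the
# latest (sample_rate, samples) pair, so every call re-transcribes the whole
# buffer. For long sessions the buffer grows without bound; capping it with a
# sliding window, e.g. stream = stream[-30 * sr:], is one possible mitigation
# (an assumption, not part of the original code).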

# Define Gradio interface for full-context ASR
def gradio_full_context_interface(audio):
    if audio is not None:
        transcription = full_context_asr(audio)
        return transcription
    else:
        return "Please provide an audio file."

# Define Gradio interface for audio similarity checking
def gradio_similarity_interface(original_audio, user_audio):
    if original_audio is not None and user_audio is not None:
        transcription_original, transcription_user, similarity_score = evaluate_audio_similarity(original_audio, user_audio)
        
        result = {
            "Original Transcription": transcription_original,
            "User Transcription": transcription_user,
            "Levenshtein Similarity Score": similarity_score,
        }
        
        if similarity_score > 0.8:  # Adjust the threshold as needed
            result["Feedback"] = "The pronunciation is likely correct based on transcription similarity."
        else:
            result["Feedback"] = "The pronunciation may be incorrect based on transcription similarity."
        
        return result
    else:
        return "Please provide both original and user audio for comparison."

# Create Gradio app for full-context ASR
full_context_demo = gr.Interface(
    fn=gradio_full_context_interface, 
    inputs=gr.Audio(source="microphone", type="numpy"), 
    outputs="text", 
    title="Full-Context ASR Demo"
)

# Create Gradio app for streaming ASR
streaming_demo = gr.Interface(
    fn=streaming_asr,
    inputs=["state", gr.Audio(source="microphone", type="numpy", streaming=True)],
    outputs=["state", "text"],
    live=True,
    title="Streaming ASR Demo"
)

# Create Gradio app for audio similarity checking
similarity_demo = gr.Interface(
    fn=gradio_similarity_interface, 
    inputs=[
        gr.Audio(source="upload", type="numpy", label="Original Audio"),
        gr.Audio(source="upload", type="numpy", label="User Audio")
    ], 
    outputs="json", 
    title="Audio Transcription and Similarity Checker"
)

# Launch all three demos in a tabbed layout
demo = gr.TabbedInterface(
    [full_context_demo, streaming_demo, similarity_demo],
    ["Full-Context ASR", "Streaming ASR", "Similarity Checker"],
)

if __name__ == "__main__":
    demo.launch()