Hammad712 commited on
Commit
137635a
·
verified ·
1 Parent(s): db4abdc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +135 -0
app.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import Levenshtein
4
+ import numpy as np
5
+ from transformers import pipeline
6
+
7
# Function to securely load the Hugging Face API token
def load_hf_token():
    """Return the Hugging Face API token used for Inference API requests.

    Prefers the HF_TOKEN environment variable so the secret is not
    hard-coded in source control; falls back to the original placeholder
    string so existing behavior is preserved when the variable is unset.

    Returns:
        str: The API token (or the placeholder if HF_TOKEN is not set).
    """
    import os  # local import keeps this helper self-contained
    return os.environ.get("HF_TOKEN", "your_huggingface_api_token")
11
+
12
# Function to query the Hugging Face Inference API
def transcribe_audio_hf(audio):
    """
    Transcribes speech from an audio clip using the Hugging Face Inference API.

    Args:
        audio: Audio data as a numpy array, or a (sample_rate, numpy_array)
            tuple as produced by gr.Audio(type="numpy"). The original code
            assumed a bare array and crashed on tuples (no .tobytes()).

    Returns:
        str: The transcription of the speech, or "" when the API responds
        with an error payload or non-JSON content.
    """
    API_URL = "https://api-inference.huggingface.co/models/jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
    headers = {"Authorization": f"Bearer {load_hf_token()}"}
    # gr.Audio(type="numpy") yields (sample_rate, samples): unwrap to the
    # raw sample array before serializing.
    if isinstance(audio, tuple):
        _, audio = audio
    response = requests.post(API_URL, headers=headers, data=audio.tobytes())
    # The API returns {"error": ...} on failure (model loading, bad token);
    # don't assume a "text" key or even a JSON body.
    try:
        payload = response.json()
    except ValueError:
        return ""
    if not isinstance(payload, dict):
        return ""
    return payload.get("text", "").strip()
25
+
26
# Function to calculate Levenshtein similarity
def levenshtein_similarity(transcription1, transcription2):
    """
    Calculate the Levenshtein similarity between two transcriptions.

    Args:
        transcription1 (str): The first transcription.
        transcription2 (str): The second transcription.

    Returns:
        float: A normalized similarity score in [0, 1], where 1 indicates
        identical transcriptions. Two empty strings count as identical;
        the original code raised ZeroDivisionError in that case.
    """
    max_len = max(len(transcription1), len(transcription2))
    if max_len == 0:
        return 1.0  # both empty: identical by definition, avoid 0/0
    distance = Levenshtein.distance(transcription1, transcription2)
    return 1 - distance / max_len  # normalize to get similarity score
39
+
40
# Function to evaluate audio similarity
def evaluate_audio_similarity(original_audio, user_audio):
    """
    Compares the transcription of an original audio clip against a user's clip.

    Args:
        original_audio (numpy.array): Original audio data.
        user_audio (numpy.array): User's audio data.

    Returns:
        tuple: (original transcription, user transcription, Levenshtein
        similarity score between the two).
    """
    # Transcribe the reference clip first, then the user's attempt.
    reference_text = transcribe_audio_hf(original_audio)
    attempt_text = transcribe_audio_hf(user_audio)
    return (
        reference_text,
        attempt_text,
        levenshtein_similarity(reference_text, attempt_text),
    )
54
+
55
# Set up the Whisper ASR model for full-context and streaming ASR.
# English-only "base" checkpoint, instantiated once at import time
# (downloads model weights on first run).
whisper_transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
57
+
58
# Full-context ASR function
def full_context_asr(audio):
    """
    Transcribe a complete audio clip with the Whisper pipeline.

    Args:
        audio (tuple): (sample_rate, samples) as produced by
            gr.Audio(type="numpy").

    Returns:
        str: The transcription text.
    """
    sr, y = audio
    y = y.astype(np.float32)
    # Stereo input arrives as a 2-D array; downmix to mono, which the
    # ASR pipeline expects.
    if y.ndim > 1:
        y = y.mean(axis=1)
    # Guard the peak normalization: the original divided unconditionally
    # and produced NaNs for silent (all-zero) or empty input.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak
    return whisper_transcriber({"sampling_rate": sr, "raw": y})["text"]
64
+
65
# Streaming ASR function
def streaming_asr(stream, new_chunk):
    """
    Incrementally transcribe microphone audio as chunks arrive.

    Args:
        stream (numpy.ndarray | None): Audio accumulated so far
            (None on the first chunk).
        new_chunk (tuple): (sample_rate, samples) for the newest chunk.

    Returns:
        tuple: (updated accumulated stream, transcription of everything
        received so far).
    """
    sr, y = new_chunk
    y = y.astype(np.float32)
    # Downmix stereo chunks to mono for the ASR pipeline.
    if y.ndim > 1:
        y = y.mean(axis=1)
    # Guard the peak normalization: the original divided unconditionally
    # and produced NaNs on silent (all-zero) chunks.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    # Append the new chunk to the running buffer.
    stream = y if stream is None else np.concatenate([stream, y])

    return stream, whisper_transcriber({"sampling_rate": sr, "raw": stream})["text"]
77
+
78
# Define Gradio interface for full-context ASR
def gradio_full_context_interface(audio):
    """Wrap full_context_asr for Gradio; prompt the user when no audio is given."""
    if audio is None:
        return "Please provide an audio file."
    return full_context_asr(audio)
85
+
86
# Define Gradio interface for audio similarity checking
def gradio_similarity_interface(original_audio, user_audio):
    """Compare two recordings; report transcriptions, similarity score, feedback.

    Returns a dict (rendered as JSON by the UI) when both clips are given,
    otherwise a prompt string asking for the missing input.
    """
    if original_audio is None or user_audio is None:
        return "Please provide both original and user audio for comparison."

    original_text, user_text, score = evaluate_audio_similarity(original_audio, user_audio)

    # Threshold of 0.8 separates "likely correct" from "may be incorrect".
    feedback = (
        "The pronunciation is likely correct based on transcription similarity."
        if score > 0.8
        else "The pronunciation may be incorrect based on transcription similarity."
    )
    return {
        "Original Transcription": original_text,
        "User Transcription": user_text,
        "Levenshtein Similarity Score": score,
        "Feedback": feedback,
    }
105
+
106
# Create Gradio app for full-context ASR: one microphone input, one text output.
# NOTE(review): gr.Audio(source=...) is the Gradio 3.x keyword; Gradio 4+
# renamed it to `sources=[...]` — confirm the pinned gradio version.
full_context_demo = gr.Interface(
    fn=gradio_full_context_interface,
    inputs=gr.Audio(source="microphone", type="numpy"),
    outputs="text",
    title="Full-Context ASR Demo"
)
113
+
114
# Create Gradio app for streaming ASR.
# The "state" input/output pair carries the accumulated audio buffer between
# streamed chunks; live=True re-runs fn on every new microphone chunk.
streaming_demo = gr.Interface(
    fn=streaming_asr,
    inputs=["state", gr.Audio(source="microphone", type="numpy", streaming=True)],
    outputs=["state", "text"],
    live=True,
    title="Streaming ASR Demo"
)
122
+
123
# Create Gradio app for audio similarity checking: two uploaded clips in,
# JSON out (the fn returns a dict, or a plain prompt string when an input
# is missing).
similarity_demo = gr.Interface(
    fn=gradio_similarity_interface,
    inputs=[
        gr.Audio(source="upload", type="numpy", label="Original Audio"),
        gr.Audio(source="upload", type="numpy", label="User Audio")
    ],
    outputs="json",
    title="Audio Transcription and Similarity Checker"
)
133
+
134
# Launch all three demos in one tabbed app; tab labels in the second list
# pair positionally with the interfaces in the first.
gr.TabbedInterface([full_context_demo, streaming_demo, similarity_demo], ["Full-Context ASR", "Streaming ASR", "Similarity Checker"]).launch()