Hammad712 committed on
Commit
09b7ae0
·
verified ·
1 Parent(s): bcc1d68

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -99
app.py CHANGED
@@ -1,28 +1,29 @@
1
- import streamlit as st
2
  import requests
3
  import Levenshtein
4
- from io import BytesIO
5
- from audio_recorder_streamlit import audio_recorder
6
 
7
  # Function to securely load the Hugging Face API token
8
- @st.cache_resource
9
  def load_hf_token():
10
- return st.secrets["HF_API_KEY"]
 
11
 
12
  # Function to query the Hugging Face Inference API
13
- def transcribe_audio_hf(audio_bytes):
14
  """
15
  Transcribes speech from an audio file using the Hugging Face Inference API.
16
  Args:
17
- audio_bytes (bytes): Audio data in bytes.
18
  Returns:
19
  str: The transcription of the speech in the audio file.
20
  """
21
  API_URL = "https://api-inference.huggingface.co/models/jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
22
  headers = {"Authorization": f"Bearer {load_hf_token()}"}
23
- response = requests.post(API_URL, headers=headers, data=audio_bytes)
24
  return response.json().get("text", "").strip()
25
 
 
26
  def levenshtein_similarity(transcription1, transcription2):
27
  """
28
  Calculate the Levenshtein similarity between two transcriptions.
@@ -36,102 +37,99 @@ def levenshtein_similarity(transcription1, transcription2):
36
  max_len = max(len(transcription1), len(transcription2))
37
  return 1 - distance / max_len # Normalize to get similarity score
38
 
39
- def evaluate_audio_similarity(original_audio_bytes, user_audio_bytes):
 
40
  """
41
  Compares the similarity between the transcription of an original audio file and a user's audio file.
42
  Args:
43
- original_audio_bytes (bytes): Bytes of the original audio file.
44
- user_audio_bytes (bytes): Bytes of the user's audio file.
45
  Returns:
46
  tuple: Transcriptions and Levenshtein similarity score.
47
  """
48
- transcription_original = transcribe_audio_hf(original_audio_bytes)
49
- transcription_user = transcribe_audio_hf(user_audio_bytes)
50
- similarity_score_levenshtein = levenshtein_similarity(transcription_original, transcription_user)
51
- return transcription_original, transcription_user, similarity_score_levenshtein
52
-
53
- st.title("Audio Transcription and Similarity Checker")
54
-
55
- # Choose between upload or record
56
- st.sidebar.header("Input Method")
57
- input_method = st.sidebar.selectbox("Choose Input Method", ["Upload", "Record"])
58
-
59
- original_audio_bytes = None
60
- user_audio_bytes = None
61
-
62
- if input_method == "Upload":
63
- # Upload original audio file
64
- original_audio = st.file_uploader("Upload Original Audio", type=["wav", "mp3"])
65
-
66
- if original_audio:
67
- original_audio_bytes = original_audio.read()
68
- st.audio(original_audio_bytes, format="audio/wav")
69
-
70
- # Upload user audio file
71
- user_audio = st.file_uploader("Upload User Audio", type=["wav", "mp3"])
72
-
73
- if user_audio:
74
- user_audio_bytes = user_audio.read()
75
- st.audio(user_audio_bytes, format="audio/wav")
76
-
77
- # Add a button to perform the test
78
- if original_audio_bytes and user_audio_bytes:
79
- if st.button("Perform Testing"):
80
- with st.spinner("Performing transcription and similarity testing..."):
81
- transcription_original, transcription_user, similarity_score = evaluate_audio_similarity(original_audio_bytes, user_audio_bytes)
82
-
83
- # Display results
84
- st.markdown("---")
85
- st.subheader("Transcriptions and Similarity Score")
86
- st.write(f"**Original Transcription:** {transcription_original}")
87
- st.write(f"**User Transcription:** {transcription_user}")
88
- st.write(f"**Levenshtein Similarity Score:** {similarity_score:.2f}")
89
-
90
- if similarity_score > 0.8: # Adjust the threshold as needed
91
- st.success("The pronunciation is likely correct based on transcription similarity.")
92
- else:
93
- st.error("The pronunciation may be incorrect based on transcription similarity.")
94
-
95
- elif input_method == "Record":
96
- st.write("Record or Upload Original Audio")
97
- st.write("Click the button below to start recording. The button will turn green when recording is active.")
98
-
99
- original_audio_bytes = audio_recorder(key="original_audio_recorder")
100
-
101
- if original_audio_bytes and len(original_audio_bytes) > 0:
102
- with st.spinner("Processing original audio..."):
103
- st.audio(original_audio_bytes, format="audio/wav")
104
- st.success("Original audio recorded successfully!")
105
-
106
  else:
107
- st.warning("No original audio recorded. Please record or upload an audio file.")
108
-
109
- st.write("Record or Upload User Audio")
110
- st.write("Click the button below to start recording. The button will turn green when recording is active.")
111
-
112
- user_audio_bytes = audio_recorder(key="user_audio_recorder")
113
-
114
- if user_audio_bytes and len(user_audio_bytes) > 0:
115
- with st.spinner("Processing user audio..."):
116
- st.audio(user_audio_bytes, format="audio/wav")
117
- st.success("User audio recorded successfully!")
118
  else:
119
- st.warning("No user audio recorded. Please record or upload an audio file.")
120
-
121
- # Add a button to perform the test
122
- if original_audio_bytes and user_audio_bytes:
123
- if st.button("Perform Testing"):
124
- with st.spinner("Performing transcription and similarity testing..."):
125
- transcription_original, transcription_user, similarity_score = evaluate_audio_similarity(original_audio_bytes, user_audio_bytes)
126
-
127
- # Display results
128
- st.markdown("---")
129
- st.subheader("Transcriptions and Similarity Score")
130
- st.write(f"**Original Transcription:** {transcription_original}")
131
- st.write(f"**User Transcription:** {transcription_user}")
132
- st.write(f"**Levenshtein Similarity Score:** {similarity_score:.2f}")
133
-
134
- if similarity_score > 0.8: # Adjust the threshold as needed
135
- st.success("The pronunciation is likely correct based on transcription similarity.")
136
- else:
137
- st.error("The pronunciation may be incorrect based on transcription similarity.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
  import requests
3
  import Levenshtein
4
+ import numpy as np
5
+ from transformers import pipeline
6
 
7
  # Function to securely load the Hugging Face API token
 
8
def load_hf_token():
    """Return the Hugging Face API token used for Inference API requests.

    Reads the token from the ``HF_API_KEY`` environment variable instead of
    hard-coding a secret in source. Falls back to the original placeholder
    string so behavior when no token is configured (an unauthorized request)
    is unchanged.

    Returns:
        str: The API token (or the placeholder if ``HF_API_KEY`` is unset).
    """
    import os  # local import keeps this block self-contained
    return os.environ.get("HF_API_KEY", "your_huggingface_api_token")
11
 
12
  # Function to query the Hugging Face Inference API
13
def transcribe_audio_hf(audio):
    """
    Transcribes speech from an audio file using the Hugging Face Inference API.

    Args:
        audio: Audio data. Accepts raw bytes, a numpy array, or the
            ``(sampling_rate, ndarray)`` tuple produced by Gradio's
            ``gr.Audio(type="numpy")`` components.

    Returns:
        str: The transcription of the speech in the audio file, or "" when the
        API does not return a usable result.
    """
    API_URL = "https://api-inference.huggingface.co/models/jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
    headers = {"Authorization": f"Bearer {load_hf_token()}"}
    # Gradio numpy audio arrives as a (sample_rate, ndarray) tuple; the
    # original code called .tobytes() on the tuple itself, which raises
    # AttributeError. Unwrap the sample array first.
    if isinstance(audio, tuple):
        audio = audio[1]
    payload = audio if isinstance(audio, (bytes, bytearray)) else audio.tobytes()
    response = requests.post(API_URL, headers=headers, data=payload)
    # The API returns {"text": ...} on success, but can return an error dict
    # or a list; guard so callers always receive a string.
    try:
        result = response.json()
    except ValueError:
        return ""
    if isinstance(result, dict):
        return result.get("text", "").strip()
    return ""
25
 
26
+ # Function to calculate Levenshtein similarity
27
  def levenshtein_similarity(transcription1, transcription2):
28
  """
29
  Calculate the Levenshtein similarity between two transcriptions.
 
37
  max_len = max(len(transcription1), len(transcription2))
38
  return 1 - distance / max_len # Normalize to get similarity score
39
 
40
+ # Function to evaluate audio similarity
41
def evaluate_audio_similarity(original_audio, user_audio):
    """
    Compares the similarity between the transcription of an original audio
    file and a user's audio file.

    Args:
        original_audio (numpy.array): Original audio data.
        user_audio (numpy.array): User's audio data.

    Returns:
        tuple: (original transcription, user transcription, Levenshtein
        similarity score).
    """
    # Transcribe both recordings through the same ASR endpoint, then score
    # how closely the resulting texts match.
    ref_text, hyp_text = (transcribe_audio_hf(clip) for clip in (original_audio, user_audio))
    return ref_text, hyp_text, levenshtein_similarity(ref_text, hyp_text)
54
+
55
# Set up the Whisper ASR model for full-context and streaming ASR
# NOTE(review): loaded eagerly at import time; the first run downloads the
# "openai/whisper-base.en" checkpoint (an English-only model) — confirm this
# is intended alongside the Arabic wav2vec2 endpoint used elsewhere.
whisper_transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
57
+
58
+ # Full-context ASR function
59
def full_context_asr(audio):
    """Transcribe one complete (non-streaming) recording with Whisper.

    Args:
        audio (tuple): Gradio numpy audio — ``(sampling_rate, samples array)``.

    Returns:
        str: The transcribed text.
    """
    sr, y = audio
    y = y.astype(np.float32)
    # Peak-normalize to [-1, 1]. Guard against empty or all-zero (silent)
    # input, where the original unconditional division produced NaNs.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak
    return whisper_transcriber({"sampling_rate": sr, "raw": y})["text"]
64
+
65
+ # Streaming ASR function
66
def streaming_asr(stream, new_chunk):
    """Incrementally transcribe microphone audio as chunks arrive.

    Args:
        stream (numpy.ndarray | None): Audio accumulated so far (Gradio state;
            ``None`` on the first callback).
        new_chunk (tuple): Latest ``(sampling_rate, samples array)`` chunk.

    Returns:
        tuple: (updated stream array, transcription of all audio so far).
    """
    sr, y = new_chunk
    y = y.astype(np.float32)
    # Peak-normalize the chunk; skip the division for empty/silent chunks,
    # where the original unconditional division produced NaNs.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    # Append the new chunk to the running buffer (state starts as None).
    stream = y if stream is None else np.concatenate([stream, y])

    return stream, whisper_transcriber({"sampling_rate": sr, "raw": stream})["text"]
77
+
78
+ # Define Gradio interface for full-context ASR
79
def gradio_full_context_interface(audio):
    """Gradio wrapper: run full-context ASR, or prompt the user for audio."""
    # Guard clause: nothing has been recorded or uploaded yet.
    if audio is None:
        return "Please provide an audio file."
    return full_context_asr(audio)
85
+
86
+ # Define Gradio interface for audio similarity checking
87
def gradio_similarity_interface(original_audio, user_audio):
    """Gradio wrapper: transcribe both clips and report their similarity.

    Returns a dict with both transcriptions, the similarity score, and a
    feedback message when both inputs are present; otherwise a prompt string.
    """
    # Guard clause: a comparison needs both recordings.
    if original_audio is None or user_audio is None:
        return "Please provide both original and user audio for comparison."

    ref_text, hyp_text, score = evaluate_audio_similarity(original_audio, user_audio)

    # A score above 0.8 is treated as "likely correct" pronunciation
    # (threshold adjustable as needed).
    feedback = (
        "The pronunciation is likely correct based on transcription similarity."
        if score > 0.8
        else "The pronunciation may be incorrect based on transcription similarity."
    )

    return {
        "Original Transcription": ref_text,
        "User Transcription": hyp_text,
        "Levenshtein Similarity Score": score,
        "Feedback": feedback,
    }
105
+
106
# Create Gradio app for full-context ASR
full_context_demo = gr.Interface(
    fn=gradio_full_context_interface,
    # NOTE(review): `source=` was removed in Gradio 4.x (renamed `sources=[...]`);
    # this call assumes a Gradio 3.x runtime — confirm the pinned version.
    inputs=gr.Audio(source="microphone", type="numpy"),
    outputs="text",
    title="Full-Context ASR Demo"
)

# Create Gradio app for streaming ASR
# "state" carries the accumulated audio buffer between streaming callbacks.
streaming_demo = gr.Interface(
    fn=streaming_asr,
    inputs=["state", gr.Audio(source="microphone", type="numpy", streaming=True)],
    outputs=["state", "text"],
    live=True,  # re-run automatically as new audio chunks arrive
    title="Streaming ASR Demo"
)

# Create Gradio app for audio similarity checking
similarity_demo = gr.Interface(
    fn=gradio_similarity_interface,
    inputs=[
        gr.Audio(source="upload", type="numpy", label="Original Audio"),
        gr.Audio(source="upload", type="numpy", label="User Audio")
    ],
    outputs="json",  # the wrapped function returns a dict (or a prompt string)
    title="Audio Transcription and Similarity Checker"
)

# Launch all three demos
# NOTE(review): launches at import time; Hugging Face Spaces executes app.py
# as a script, so this is presumably intentional for that deployment target.
gr.TabbedInterface([full_context_demo, streaming_demo, similarity_demo], ["Full-Context ASR", "Streaming ASR", "Similarity Checker"]).launch()