j-tobias committed
Commit 744c1ac
1 Parent(s): 05c5243
initial Commit
app.py
ADDED
@@ -0,0 +1,140 @@
import gradio as gr
import numpy as np
import librosa
import hashlib
import json
import os

from scipy.io.wavfile import write as wav_write
from check import transcribe, estimate_audio_quality

def visible():
    # reveal the transcription box and the save button once a sample has been checked
    return gr.update(visible=True), gr.update(visible=True)

def check(audio: tuple):
    # gr.Audio hands over a (sample_rate, np.ndarray) tuple
    sr, audio = audio
    audio = audio.astype(np.float32)
    if audio.ndim > 1 and audio.shape[1] > 1:
        audio = np.mean(audio, axis=1)  # downmix stereo to mono
    if np.max(np.abs(audio)) > 1.0:
        audio = audio / 32768.0  # assumes int16 input; scale to [-1, 1] as Whisper and the quality heuristics expect

    quality, quality_score, features = estimate_audio_quality(audio, sr)
    audio_resampled = librosa.resample(audio, orig_sr=sr, target_sr=16000)
    transcription = transcribe(audio_resampled, 16000)

    check_result = f"""
### Results

Quality (0-1): {quality_score} - {quality}

Features:
"""

    for key, value in features.items():
        check_result += f"""
- {key}: {round(value, 3)}
"""

    return transcription, check_result

def generate_sample_id(audio, transcription):
    # Combine the audio bytes and the transcription into a single string
    combined = f"{audio.tobytes()}{transcription}"

    # Hash the combined string and keep a short, stable prefix
    sample_id = hashlib.sha256(combined.encode('utf-8')).hexdigest()

    return sample_id[:10]  # the first 10 hex characters of the hash serve as the ID
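Note: the sample ID is deterministic, so re-saving a byte-identical audio/transcription pair maps back to the same folder instead of creating a duplicate. A minimal sketch with made-up inputs:

import hashlib
import numpy as np

audio = np.zeros(16000, dtype=np.int16)  # placeholder audio
text = "this is a test"                  # placeholder transcription
combined = f"{audio.tobytes()}{text}"
print(hashlib.sha256(combined.encode("utf-8")).hexdigest()[:10])
# prints the same 10-character ID on every run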


def save_sample(audio, transcription, check_result):
    # gr.Audio hands over a (sample_rate, np.ndarray) tuple
    sr, data = audio
    sample_id = generate_sample_id(data, transcription)

    # Create a directory for this sample
    directory = f"{sample_id}_data"
    os.makedirs(directory, exist_ok=True)

    # Save the audio as a .wav file at its original sample rate
    audio_filename = os.path.join(directory, f"{sample_id}.wav")
    wav_write(audio_filename, sr, data)

    # Save the transcription as a .txt file
    transcription_filename = os.path.join(directory, f"{sample_id}_transcription.txt")
    with open(transcription_filename, 'w') as f:
        f.write(transcription)

    # Save the Markdown check report as a JSON file
    check_result_filename = os.path.join(directory, f"{sample_id}_features.json")
    with open(check_result_filename, 'w') as f:
        json.dump(check_result, f)
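Reading a saved sample back is symmetric; a minimal sketch, assuming a hypothetical ID ab12cd34ef and the directory layout produced above:

import json
from scipy.io.wavfile import read as wav_read

sample_id = "ab12cd34ef"  # hypothetical ID
sr, data = wav_read(f"{sample_id}_data/{sample_id}.wav")
with open(f"{sample_id}_data/{sample_id}_transcription.txt") as f:
    transcription = f.read()
with open(f"{sample_id}_data/{sample_id}_features.json") as f:
    report = json.load(f)  # the Markdown quality report is stored as a plain JSON string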

with gr.Blocks() as app:

    gr.Markdown("# Open ASR Dataset")
    gr.Markdown(" ")
    gr.Markdown("This app is an effort to collectively crowdsource a new dataset for the ASR community")
    # gr.Markdown("You can access the dataset [here](LINK)")
    gr.Markdown("The dataset will be updated every 100 new samples")
    gr.Markdown(" ")
    gr.Markdown("Create a New Sample")

    new_audio = gr.Audio(
        label="Add Audio",
        sources=['upload', 'microphone']
    )
    with gr.Row():

        language = gr.Radio(
            label="Spoken Language",
            choices=["English", "French", "Spanish", "German", "Italian", "Chinese", "Japanese", "Arabic", "Russian", "Portuguese"],
            value="English",
            scale=4
        )

        bckgrnd_noise = gr.Radio(
            label="Background Noise Level",
            choices=["Quiet", "Moderate", "Noisy"],
            value="Moderate",
            scale=2
        )

        recording_env = gr.Radio(
            label="Recording Environment",
            choices=["Studio", "Home", "Outdoors", "Office"],
            value="Home",
            scale=2
        )

    check_sample_btn = gr.Button(
        value="Check Sample",
        variant="secondary",
        size="sm"
    )

    with gr.Row():

        check_result = gr.Markdown()

        transcription = gr.TextArea(
            label="Transcription",
            visible=False,
            interactive=True
        )

    save_sample_button = gr.Button(
        value="Save Sample",
        variant="primary",
        size="sm",
        scale=1,
        visible=False
    )

    # first reveal the hidden outputs, then run the (slower) quality check and transcription
    check_sample_btn.click(visible, outputs=[transcription, save_sample_button])
    check_sample_btn.click(check, inputs=[new_audio], outputs=[transcription, check_result])

    save_sample_button.click(save_sample, inputs=[new_audio, transcription, check_result])


app.launch()
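The commit itself ships no requirements file; the imports in app.py and check.py imply roughly the following dependencies for the Space (an unpinned sketch, with torch assumed as the transformers backend):

# requirements.txt (sketch, not part of this commit)
gradio
numpy
librosa
scipy
transformers
torch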
check.py
ADDED
@@ -0,0 +1,109 @@
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import numpy as np
import librosa

# Load Whisper once at import time so every check reuses the same model
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
model.config.forced_decoder_ids = None


def transcribe(audio: np.ndarray, sr: int):
    input_features = processor(audio, sampling_rate=sr, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    transcription = processor.tokenizer.normalize(transcription[0])
    return transcription

def audio_len(audio: np.ndarray, sr: int):
    # duration in seconds
    return len(audio) / sr

def rms_energy(audio: np.ndarray):
    return np.sqrt(np.mean(audio**2))

def zero_crossing_rate(audio: np.ndarray):
    return np.mean(np.abs(np.diff(np.sign(audio))))

def spectral_centroid(audio: np.ndarray, sr: int):
    return librosa.feature.spectral_centroid(y=audio, sr=sr).mean()

def spectral_bandwidth(audio: np.ndarray, sr: int):
    return librosa.feature.spectral_bandwidth(y=audio, sr=sr).mean()

def mfccs(audio: np.ndarray, sr: int, n_mfcc: int = 13):
    return librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc).mean(axis=1)

def chroma_features(audio: np.ndarray, sr: int):
    return librosa.feature.chroma_stft(y=audio, sr=sr).mean(axis=1)

def signal_to_noise_ratio(audio: np.ndarray):
    # rough heuristic: for a zero-mean signal mean(x**2) equals var(x),
    # so this tends toward 0 dB and is best read as a relative indicator
    signal_power = np.mean(audio ** 2)
    noise_power = np.var(audio)
    return 10 * np.log10(signal_power / noise_power)

def tempo(audio: np.ndarray, sr: int):
    onset_env = librosa.onset.onset_strength(y=audio, sr=sr)
    return librosa.beat.tempo(onset_envelope=onset_env, sr=sr)[0]

def silence_ratio(audio: np.ndarray, threshold: float = 0.01):
    # fraction of samples whose absolute amplitude falls below the threshold
    return np.mean(np.abs(audio) < threshold)

def estimate_audio_quality(audio: np.ndarray, sr: int):
    # Compute features
    snr = signal_to_noise_ratio(audio)
    rms = rms_energy(audio)
    silence = silence_ratio(audio)
    centroid = spectral_centroid(audio, sr)
    bandwidth = spectral_bandwidth(audio, sr)
    zcr = zero_crossing_rate(audio)

    # Normalize features (example normalization, adjust as necessary)
    snr_norm = np.clip(snr / 50.0, 0, 1)  # assuming 50 dB is very good
    rms_norm = np.clip(rms / np.max(np.abs(audio)), 0, 1)  # normalizing by max amplitude
    silence_norm = 1 - silence  # less silence is better
    spectral_centroid_norm = np.clip(centroid / sr, 0, 1)
    spectral_bandwidth_norm = np.clip(bandwidth / (sr / 2), 0, 1)
    zcr_norm = np.clip(zcr / 0.1, 0, 1)  # assuming 0.1 as an acceptable ZCR

    features = {
        "snr_norm": snr_norm,
        "rms_norm": rms_norm,
        "silence_norm": silence_norm,
        "spectral_centroid_norm": spectral_centroid_norm,
        "spectral_bandwidth_norm": spectral_bandwidth_norm,
        "zcr_norm": zcr_norm
    }

    # Weighting features
    weights = {
        "snr": 0.25,
        "rms": 0.2,
        "silence": 0.2,
        "spectral_centroid": 0.1,
        "spectral_bandwidth": 0.15,
        "zcr": 0.1
    }

    # Calculate the overall quality score as a weighted sum of the normalized features
    quality_score = (
        weights["snr"] * snr_norm +
        weights["rms"] * rms_norm +
        weights["silence"] * silence_norm +
        weights["spectral_centroid"] * spectral_centroid_norm +
        weights["spectral_bandwidth"] * spectral_bandwidth_norm +
        weights["zcr"] * zcr_norm
    )

    # Interpret the score
    if quality_score > 0.85:
        quality = "Excellent"
    elif quality_score > 0.7:
        quality = "Good"
    elif quality_score > 0.5:
        quality = "Fair"
    else:
        quality = "Poor"

    return quality, round(quality_score, 3), features
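For a quick end-to-end sanity check of this module without the Gradio UI, both entry points can be exercised on a synthetic tone. A minimal sketch; the 440 Hz sine is not speech, so the transcription is expected to be empty or nonsensical, and the first run downloads the Whisper weights:

import numpy as np
import librosa
from check import transcribe, estimate_audio_quality

sr = 44100
t = np.linspace(0, 2.0, 2 * sr, endpoint=False)
tone = (0.1 * np.sin(2 * np.pi * 440.0 * t)).astype(np.float32)

quality, score, features = estimate_audio_quality(tone, sr)
print(quality, score, features)  # exact values depend on the heuristics above

resampled = librosa.resample(tone, orig_sr=sr, target_sr=16000)
print(transcribe(resampled, 16000))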