j-tobias committed
Commit 744c1ac · 1 Parent(s): 05c5243

initial Commit

Files changed (2)
  1. app.py +140 -0
  2. check.py +109 -0
app.py ADDED
@@ -0,0 +1,140 @@
+ import gradio as gr
+ import numpy as np
+ import librosa
+ import hashlib
+ import json
+ import os
+
+ from scipy.io.wavfile import write as wav_write
+ from check import transcribe, estimate_audio_quality
+
+ def visible():
+     return gr.update(visible=True), gr.update(visible=True)
+
+ def check(audio: gr.Audio):
+
+     sr, audio = audio
+     audio = audio.astype(np.float32)
+     # Down-mix stereo recordings to mono
+     if audio.ndim > 1 and audio.shape[1] > 1:
+         audio = np.mean(audio, axis=1)
+
+     quality, quality_score, features = estimate_audio_quality(audio, sr)
+     audio_resampled = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+     transcription = transcribe(audio_resampled, 16000)
+
+     check_result = f"""
+ ### Results
+
+ Quality (0-1): {quality_score} - {quality}
+
+ Features:
+ """
+
+     for key, value in features.items():
+         check_result += f"""
+ - {key}: {round(value, 3)}
+ """
+
+     return transcription, check_result
+
+ def generate_sample_id(audio, transcription):
+     # Combine the audio and transcription into a single string
+     combined = f"{audio.tobytes()}{transcription}"
+
+     # Generate a hash of the combined string
+     sample_id = hashlib.sha256(combined.encode('utf-8')).hexdigest()
+
+     return sample_id[:10]  # Return the first 10 characters of the hash as the ID
+
+
+ def save_sample(audio, transcription, check_result):
+     # gr.Audio delivers a (sample_rate, data) tuple
+     sr, data = audio
+     sample_id = generate_sample_id(data, transcription)
+
+     # Create a directory to save the sample
+     directory = f"{sample_id}_data"
+     if not os.path.exists(directory):
+         os.makedirs(directory)
+
+     # Save the audio as a .wav file at its original sample rate
+     audio_filename = os.path.join(directory, f"{sample_id}.wav")
+     wav_write(audio_filename, sr, data)
+
+     # Save the transcription as a .txt file
+     transcription_filename = os.path.join(directory, f"{sample_id}_transcription.txt")
+     with open(transcription_filename, 'w') as f:
+         f.write(transcription)
+
+     # Save the check_result as a JSON file
+     check_result_filename = os.path.join(directory, f"{sample_id}_features.json")
+     with open(check_result_filename, 'w') as f:
+         json.dump(check_result, f)
+
+ with gr.Blocks() as app:
+
+     gr.Markdown("# Open ASR Dataset")
+     gr.Markdown(" ")
+     gr.Markdown("This app is an effort to collectively crowdsource a new dataset for the ASR community")
+     # gr.Markdown("You can Access the Dataset (here)[LINK]")
+     gr.Markdown("The dataset will be updated every 100 created samples")
+     gr.Markdown(" ")
+     gr.Markdown("Create a New Sample")
+
+
+     new_audio = gr.Audio(
+         label="Add Audio",
+         sources=['upload', 'microphone']
+     )
+     with gr.Row():
+
+         language = gr.Radio(
+             label="Spoken Language",
+             choices=["English", "French", "Spanish", "German", "Italian", "Chinese", "Japanese", "Arabic", "Russian", "Portuguese"],
+             value="English",
+             scale=4
+         )
+
+         bckgrnd_noise = gr.Radio(
+             label="Background Noise Level",
+             choices=["Quiet", "Moderate", "Noisy"],
+             value="Moderate",
+             scale=2
+         )
+
+         recording_env = gr.Radio(
+             label="Recording Environment",
+             choices=["Studio", "Home", "Outdoors", "Office"],
+             value="Home",
+             scale=2
+         )
+
+     check_sample_btn = gr.Button(
+         value="Check Sample",
+         variant="secondary",
+         size="sm"
+     )
+
+     with gr.Row():
+
+         check_result = gr.Markdown()
+
+         transcription = gr.TextArea(
+             label="Transcription",
+             visible=False,
+             interactive=True
+         )
+
+     save_sample_button = gr.Button(
+         value="Save Sample",
+         variant="primary",
+         size="sm",
+         scale=1,
+         visible=False
+     )
+
+     # Reveal the transcription box and save button, then run the quality check
+     check_sample_btn.click(visible, outputs=[transcription, save_sample_button])
+     check_sample_btn.click(check, inputs=[new_audio], outputs=[transcription, check_result])
+
+     save_sample_button.click(save_sample, inputs=[new_audio, transcription, check_result])
+
+
+ app.launch()
check.py ADDED
@@ -0,0 +1,109 @@
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
+ import numpy as np
+ import librosa
+
+ processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
+ model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
+ model.config.forced_decoder_ids = None
+
+
+ def transcribe(audio: np.ndarray, sr: int):
+     input_features = processor(audio, sampling_rate=sr, return_tensors="pt").input_features
+     predicted_ids = model.generate(input_features)
+     # Drop Whisper's special tokens, then normalize the text
+     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+     transcription = processor.tokenizer.normalize(transcription[0])
+     return transcription
+
+ # Low-level signal features used by estimate_audio_quality
+ def audio_len(audio: np.ndarray, sr: int):
+     return len(audio) / sr
+
+ def rms_energy(audio: np.ndarray):
+     return np.sqrt(np.mean(audio**2))
+
+ def zero_crossing_rate(audio: np.ndarray):
+     return np.mean(np.abs(np.diff(np.sign(audio))))
+
+ def spectral_centroid(audio: np.ndarray, sr: int):
+     return librosa.feature.spectral_centroid(y=audio, sr=sr).mean()
+
+ def spectral_bandwidth(audio: np.ndarray, sr: int):
+     return librosa.feature.spectral_bandwidth(y=audio, sr=sr).mean()
+
+ def mfccs(audio: np.ndarray, sr: int, n_mfcc: int = 13):
+     return librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc).mean(axis=1)
+
+ def chroma_features(audio: np.ndarray, sr: int):
+     return librosa.feature.chroma_stft(y=audio, sr=sr).mean(axis=1)
+
+ def signal_to_noise_ratio(audio: np.ndarray):
+     # Rough estimate: signal power vs. overall variance of the waveform
+     signal_power = np.mean(audio ** 2)
+     noise_power = np.var(audio)
+     return 10 * np.log10(signal_power / noise_power)
+
+ def tempo(audio: np.ndarray, sr: int):
+     onset_env = librosa.onset.onset_strength(y=audio, sr=sr)
+     return librosa.beat.tempo(onset_envelope=onset_env, sr=sr)[0]
+
+ def silence_ratio(audio: np.ndarray, threshold: float = 0.01):
+     return np.mean(np.abs(audio) < threshold)
+
+ def estimate_audio_quality(audio: np.ndarray, sr: int):
+     # Compute features
+     snr = signal_to_noise_ratio(audio)
+     rms = rms_energy(audio)
+     silence = silence_ratio(audio)
+     centroid = librosa.feature.spectral_centroid(y=audio, sr=sr).mean()
+     bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sr).mean()
+     zcr = zero_crossing_rate(audio)
+
+     # Normalize features (example normalization, adjust as necessary)
+     snr_norm = np.clip(snr / 50.0, 0, 1)  # Assuming 50 dB is very good
+     rms_norm = np.clip(rms / np.max(np.abs(audio)), 0, 1)  # Normalizing by max amplitude
+     silence_norm = 1 - silence  # Less silence is better
+     spectral_centroid_norm = np.clip(centroid / sr, 0, 1)
+     spectral_bandwidth_norm = np.clip(bandwidth / (sr / 2), 0, 1)
+     zcr_norm = np.clip(zcr / 0.1, 0, 1)  # Assuming 0.1 as an acceptable ZCR
+
+     features = {
+         "snr_norm": snr_norm,
+         "rms_norm": rms_norm,
+         "silence_norm": silence_norm,
+         "spectral_centroid_norm": spectral_centroid_norm,
+         "spectral_bandwidth_norm": spectral_bandwidth_norm,
+         "zcr_norm": zcr_norm
+     }
+
+     # Weighting features
+     weights = {
+         "snr": 0.25,
+         "rms": 0.2,
+         "silence": 0.2,
+         "spectral_centroid": 0.1,
+         "spectral_bandwidth": 0.15,
+         "zcr": 0.1
+     }
+
+     # Calculate overall quality score as a weighted sum of the normalized features
+     quality_score = (
+         weights["snr"] * snr_norm +
+         weights["rms"] * rms_norm +
+         weights["silence"] * silence_norm +
+         weights["spectral_centroid"] * spectral_centroid_norm +
+         weights["spectral_bandwidth"] * spectral_bandwidth_norm +
+         weights["zcr"] * zcr_norm
+     )
+
+     # Interpret the score
+     if quality_score > 0.85:
+         quality = "Excellent"
+     elif quality_score > 0.7:
+         quality = "Good"
+     elif quality_score > 0.5:
+         quality = "Fair"
+     else:
+         quality = "Poor"
+
+     return quality, round(quality_score, 3), features
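
A minimal local usage sketch for the new check.py helpers (illustrative only, not part of the commit; the path "sample.wav" is a placeholder for any short speech clip):

import librosa
from check import transcribe, estimate_audio_quality

# Load a clip as 16 kHz mono, the rate the Whisper model expects
audio, sr = librosa.load("sample.wav", sr=16000, mono=True)

quality, quality_score, features = estimate_audio_quality(audio, sr)
print(quality, quality_score, features)

print(transcribe(audio, sr))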