from transformers import WhisperProcessor, WhisperForConditionalGeneration
import numpy as np
import librosa

# Load the Whisper large-v2 checkpoint and its processor (feature extractor + tokenizer).
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
# Let generate() predict the language/task tokens instead of forcing them.
model.config.forced_decoder_ids = None


def transcribe(audio: np.ndarray, sr: int) -> str:
    # Whisper's feature extractor expects 16 kHz mono input; resample if needed.
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        sr = 16000
    input_features = processor(audio, sampling_rate=sr, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    # Apply Whisper's text normalizer (lowercasing, punctuation stripping, etc.).
    return processor.tokenizer.normalize(transcription[0])


def audio_len(audio: np.ndarray, sr: int) -> float:
    # Clip duration in seconds.
    return len(audio) / sr


def rms_energy(audio: np.ndarray) -> float:
    # Root-mean-square energy of the whole clip.
    return np.sqrt(np.mean(audio ** 2))


def zero_crossing_rate(audio: np.ndarray) -> float:
    # np.diff(np.sign(...)) is +/-2 at each sign flip, so halve to get the
    # per-sample crossing rate in [0, 1].
    return np.mean(np.abs(np.diff(np.sign(audio)))) / 2


def spectral_centroid(audio: np.ndarray, sr: int) -> float:
    # Mean spectral centroid (Hz): the "center of mass" of the spectrum.
    return librosa.feature.spectral_centroid(y=audio, sr=sr).mean()


def spectral_bandwidth(audio: np.ndarray, sr: int) -> float:
    # Mean spectral bandwidth (Hz) around the centroid.
    return librosa.feature.spectral_bandwidth(y=audio, sr=sr).mean()


def mfccs(audio: np.ndarray, sr: int, n_mfcc: int = 13) -> np.ndarray:
    # Per-coefficient mean of the MFCCs over time.
    return librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc).mean(axis=1)


def chroma_features(audio: np.ndarray, sr: int) -> np.ndarray:
    # Mean chroma vector (12 pitch classes) over time.
    return librosa.feature.chroma_stft(y=audio, sr=sr).mean(axis=1)


def signal_to_noise_ratio(audio: np.ndarray, frame_length: int = 2048) -> float:
    # Crude SNR estimate: treat the quietest 10% of frames as the noise floor.
    # (The previous mean-power / variance ratio is ~0 dB for any zero-mean
    # signal, since the two quantities coincide.)
    frame_rms = librosa.feature.rms(y=audio, frame_length=frame_length)[0]
    noise_power = np.percentile(frame_rms, 10) ** 2
    signal_power = np.mean(frame_rms ** 2)
    return 10 * np.log10(signal_power / max(noise_power, 1e-10))


def tempo(audio: np.ndarray, sr: int) -> float:
    # Global tempo estimate (BPM) from the onset-strength envelope.
    onset_env = librosa.onset.onset_strength(y=audio, sr=sr)
    return librosa.beat.tempo(onset_envelope=onset_env, sr=sr)[0]


def silence_ratio(audio: np.ndarray, threshold: float = 0.01) -> float:
    # Fraction of samples whose amplitude falls below the threshold.
    return np.mean(np.abs(audio) < threshold)


def estimate_audio_quality(audio: np.ndarray, sr: int):
    # Raw features, reusing the helpers above (renamed locals avoid shadowing
    # the spectral_centroid / spectral_bandwidth functions).
    snr = signal_to_noise_ratio(audio)
    rms = rms_energy(audio)
    silence = silence_ratio(audio)
    centroid = spectral_centroid(audio, sr)
    bandwidth = spectral_bandwidth(audio, sr)
    zcr = zero_crossing_rate(audio)

    # Normalize each feature into [0, 1].
    snr_norm = np.clip(snr / 50.0, 0, 1)  # ~50 dB treated as ideal
    peak = np.max(np.abs(audio))
    rms_norm = np.clip(rms / peak, 0, 1) if peak > 0 else 0.0  # guard silent input
    silence_norm = 1 - silence  # less silence -> higher score
    spectral_centroid_norm = np.clip(centroid / (sr / 2), 0, 1)  # relative to Nyquist
    spectral_bandwidth_norm = np.clip(bandwidth / (sr / 2), 0, 1)
    zcr_norm = np.clip(zcr / 0.1, 0, 1)  # ~0.1 treated as the ceiling

    features = {
        "snr_norm": snr_norm,
        "rms_norm": rms_norm,
        "silence_norm": silence_norm,
        "spectral_centroid_norm": spectral_centroid_norm,
        "spectral_bandwidth_norm": spectral_bandwidth_norm,
        "zcr_norm": zcr_norm,
    }

    # Weights sum to 1.0, so the final score stays in [0, 1].
    weights = {
        "snr": 0.25,
        "rms": 0.2,
        "silence": 0.2,
        "spectral_centroid": 0.1,
        "spectral_bandwidth": 0.15,
        "zcr": 0.1,
    }

    quality_score = (
        weights["snr"] * snr_norm
        + weights["rms"] * rms_norm
        + weights["silence"] * silence_norm
        + weights["spectral_centroid"] * spectral_centroid_norm
        + weights["spectral_bandwidth"] * spectral_bandwidth_norm
        + weights["zcr"] * zcr_norm
    )

    # Map the weighted score onto a coarse label.
    if quality_score > 0.85:
        quality = "Excellent"
    elif quality_score > 0.7:
        quality = "Good"
    elif quality_score > 0.5:
        quality = "Fair"
    else:
        quality = "Poor"

    return quality, round(quality_score, 3), features
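

if __name__ == "__main__":
    # Minimal usage sketch; "example.wav" is a placeholder path, not a file
    # shipped with this snippet. With sr=None, librosa.load returns the float
    # waveform at its native sample rate.
    audio, sr = librosa.load("example.wav", sr=None, mono=True)

    print(f"duration: {audio_len(audio, sr):.2f}s, tempo: {tempo(audio, sr):.1f} BPM")

    quality, score, features = estimate_audio_quality(audio, sr)
    print(f"quality: {quality} (score={score})")
    print(features)

    # Transcription runs the full Whisper forward pass; expect it to be slow on CPU.
    print(transcribe(audio, sr))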