from transformers import WhisperProcessor, WhisperForConditionalGeneration
import numpy as np
import librosa

# Load the Whisper large-v2 checkpoint once at module import.
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
# Let generate() choose task/language instead of fixed forced decoder ids.
model.config.forced_decoder_ids = None

def transcribe(audio: np.ndarray, sr: int):
    # Whisper's feature extractor expects 16 kHz input; pass audio at that rate.
    input_features = processor(audio, sampling_rate=sr, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    # Drop special tokens so the normalizer sees plain text.
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    transcription = processor.tokenizer.normalize(transcription[0])
    return transcription

def audio_len(audio: np.ndarray, sr: int):
    # Clip duration in seconds.
    return len(audio) / sr

def rms_energy(audio: np.ndarray):
    # Root-mean-square energy of the waveform.
    return np.sqrt(np.mean(audio**2))

def zero_crossing_rate(audio: np.ndarray):
    # Fraction of samples at which the signal changes sign
    # (|diff(sign)| is 2 per crossing, hence the division by 2).
    return np.mean(np.abs(np.diff(np.sign(audio)))) / 2

def spectral_centroid(audio: np.ndarray, sr: int):
    # Mean spectral centroid in Hz ("brightness" of the sound).
    return librosa.feature.spectral_centroid(y=audio, sr=sr).mean()

def spectral_bandwidth(audio: np.ndarray, sr: int):
    # Mean spectral bandwidth in Hz.
    return librosa.feature.spectral_bandwidth(y=audio, sr=sr).mean()

def mfccs(audio: np.ndarray, sr: int, n_mfcc: int = 13):
    # Per-coefficient mean of the MFCC matrix.
    return librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc).mean(axis=1)

def chroma_features(audio: np.ndarray, sr: int):
    # Per-pitch-class mean of the chromagram.
    return librosa.feature.chroma_stft(y=audio, sr=sr).mean(axis=1)

def signal_to_noise_ratio(audio: np.ndarray):
    # Crude SNR proxy: for zero-mean audio, mean(x^2) ~= var(x), so this sits
    # near 0 dB and only rises when the signal carries a DC offset. Treat it
    # as a rough heuristic rather than a true signal-to-noise measurement.
    signal_power = np.mean(audio ** 2)
    noise_power = np.var(audio) + 1e-10  # guard against division by zero
    return 10 * np.log10(signal_power / noise_power)

def tempo(audio: np.ndarray, sr: int):
    # Global tempo estimate in BPM from the onset-strength envelope.
    onset_env = librosa.onset.onset_strength(y=audio, sr=sr)
    return librosa.beat.tempo(onset_envelope=onset_env, sr=sr)[0]

def silence_ratio(audio: np.ndarray, threshold: float = 0.01):
    # Fraction of samples whose absolute amplitude falls below the threshold.
    return np.mean(np.abs(audio) < threshold)
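
# Minimal usage sketch for the standalone descriptors above. The example clip
# comes from librosa's bundled registry (fetched on first use); any mono float
# waveform works the same way:
#
#     y, sr = librosa.load(librosa.example("trumpet"), sr=None)
#     print(audio_len(y, sr), rms_energy(y), zero_crossing_rate(y))
#     print(mfccs(y, sr).shape, chroma_features(y, sr).shape, tempo(y, sr))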

def estimate_audio_quality(audio: np.ndarray, sr: int):
    # Compute raw features, reusing the helpers defined above.
    snr = signal_to_noise_ratio(audio)
    rms = rms_energy(audio)
    silence = silence_ratio(audio)
    centroid = spectral_centroid(audio, sr)
    bandwidth = spectral_bandwidth(audio, sr)
    zcr = zero_crossing_rate(audio)

    # Normalize features to [0, 1] (example normalization, adjust as necessary)
    snr_norm = np.clip(snr / 50.0, 0, 1)  # assuming 50 dB is very good
    rms_norm = np.clip(rms / np.max(np.abs(audio)), 0, 1)  # normalized by peak amplitude
    silence_norm = 1 - silence  # less silence is better
    spectral_centroid_norm = np.clip(centroid / (sr / 2), 0, 1)  # normalized by Nyquist
    spectral_bandwidth_norm = np.clip(bandwidth / (sr / 2), 0, 1)
    zcr_norm = np.clip(zcr / 0.1, 0, 1)  # assuming 0.1 as an acceptable ZCR

    features = {
        "snr_norm": snr_norm,
        "rms_norm": rms_norm,
        "silence_norm": silence_norm,
        "spectral_centroid_norm": spectral_centroid_norm,
        "spectral_bandwidth_norm": spectral_bandwidth_norm,
        "zcr_norm": zcr_norm,
    }
    # Weighting features
    weights = {
        "snr": 0.25,
        "rms": 0.2,
        "silence": 0.2,
        "spectral_centroid": 0.1,
        "spectral_bandwidth": 0.15,
        "zcr": 0.1,
    }

    # Calculate the overall quality score as a weighted sum of normalized features
    quality_score = (
        weights["snr"] * snr_norm +
        weights["rms"] * rms_norm +
        weights["silence"] * silence_norm +
        weights["spectral_centroid"] * spectral_centroid_norm +
        weights["spectral_bandwidth"] * spectral_bandwidth_norm +
        weights["zcr"] * zcr_norm
    )
    # Interpret the score
    if quality_score > 0.85:
        quality = "Excellent"
    elif quality_score > 0.7:
        quality = "Good"
    elif quality_score > 0.5:
        quality = "Fair"
    else:
        quality = "Poor"

    return quality, round(quality_score, 3), features
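
if __name__ == "__main__":
    # Minimal usage sketch. "sample.wav" is a placeholder path, not a file
    # shipped with this repo; audio is resampled to 16 kHz because Whisper's
    # feature extractor expects that rate.
    audio, sr = librosa.load("sample.wav", sr=16000)

    print("Transcription:", transcribe(audio, sr))

    quality, score, features = estimate_audio_quality(audio, sr)
    print(f"Quality: {quality} (score={score})")
    for name, value in features.items():
        print(f"  {name}: {value:.3f}")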