from transformers import WhisperProcessor, WhisperForConditionalGeneration from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor import plotly.graph_objs as go from datasets import load_dataset from datasets import Audio import evaluate import librosa import numpy as np import pandas as pd N_SAMPLES = 30 wer_metric = evaluate.load("wer") def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:str): if data_subset is None: raise ValueError("No Dataset selected") if model_1 is None: raise ValueError("No Model 1 selected") if model_2 is None: raise ValueError("No Model 2 selected") if data_subset == "Common Voice": dataset, text_column = load_Common_Voice() elif data_subset == "VoxPopuli": dataset, text_column = load_Vox_Populi() elif data_subset == "OWN Recoding/Sample": sr, audio = own_audio audio = audio.astype(np.float32) / 32768.0 print("AUDIO: ", type(audio), audio) audio = librosa.resample(audio, orig_sr=sr, target_sr=16000) else: # if data_subset is None then still load load_Common_Voice dataset, text_column = load_Common_Voice() print("Dataset Loaded") model1, processor1 = load_model(model_1) model2, processor2 = load_model(model_2) print("Models Loaded") if data_subset == "OWN Recoding/Sample": sample = {"audio":{"array":audio,"sampling_rate":16000}} transcription1 = model_compute(model1, processor1, sample, model_1) transcription2 = model_compute(model2, processor2, sample, model_2) transcriptions1 = [transcription1] transcriptions2 = [transcription2] references = [own_transcription] wer1 = compute_wer(references, transcriptions1) wer2 = compute_wer(references, transcriptions2) results_md = f""" #### {model_1} - WER Score: {wer1} #### {model_2} - WER Score: {wer2}""" # Create the bar plot fig = go.Figure( data=[ go.Bar(x=[f"{model_1}"], y=[wer1], showlegend=False), go.Bar(x=[f"{model_2}"], y=[wer2], showlegend=False), ] ) # Update the layout for better visualization fig.update_layout( title="Comparison of Two Models", xaxis_title="Models", yaxis_title="Value", barmode="group", ) df = pd.DataFrame({"references":references, "transcriptions 1":transcriptions1,"WER 1":[wer1],"transcriptions 2":transcriptions2,"WER 2":[wer2]}) yield results_md, fig, df else: references = [] transcriptions1 = [] transcriptions2 = [] WER1s = [] WER2s = [] counter = 0 for i, sample in enumerate(dataset, start=1): print(counter) counter += 1 references.append(sample[text_column]) if model_1 == model_2: transcription = model_compute(model1, processor1, sample, model_1) transcriptions1.append(transcription) transcriptions2.append(transcription) else: transcription1 = model_compute(model1, processor1, sample, model_1) transcription2 = model_compute(model2, processor2, sample, model_2) transcriptions1.append(transcription1) transcriptions2.append(transcription2) WER1s.append(compute_wer([sample[text_column]], [transcription1])) WER2s.append(compute_wer([sample[text_column]], [transcription2])) results_md = f""" {i}/{len(dataset)}-{'#'*i}{'_'*(N_SAMPLES-i)} #### {model_1} - WER Score: {sum(WER1s)/N_SAMPLES} #### {model_2} - WER Score: {sum(WER2s)/N_SAMPLES}""" # Create the bar plot fig = go.Figure( data=[ go.Bar(x=[f"{model_1}"], y=[sum(WER1s)/N_SAMPLES], showlegend=False), go.Bar(x=[f"{model_2}"], y=[sum(WER2s)/N_SAMPLES], showlegend=False), ] ) # Update the layout for better visualization fig.update_layout( title="Comparison of Two Models", xaxis_title="Models", yaxis_title="Value", barmode="group", ) df = pd.DataFrame({"references":references, "transcriptions 1":transcriptions1,"WER 1":WER1s,"transcriptions 2":transcriptions2,"WER 2":WER2s}) yield results_md, fig, df # DATASET LOADERS def load_Common_Voice(): dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", revision="streaming", split="test", streaming=True, token=True, trust_remote_code=True) text_column = "sentence" dataset = dataset.take(N_SAMPLES) dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) dataset = list(dataset) return dataset, text_column def load_Vox_Populi(): # Load the dataset in streaming mode dataset = load_dataset("facebook/voxpopuli", "en", split="test", streaming=True, trust_remote_code=True) # Optionally, preview the first item to understand the structure (can be removed in production) print(next(iter(dataset))) # Take the first 120 examples to work with dataset = dataset.take(N_SAMPLES+20) text_column = "normalized_text" # Filter out samples with empty or unwanted 'normalized_text' values and invalid audio dataset = dataset.filter(lambda x: is_valid_sample(x[text_column], x['audio'])) # Take the first 100 examples after filtering dataset = dataset.take(N_SAMPLES) # Cast the 'audio' column to the desired sampling rate dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) # Convert to list and return dataset = list(dataset) return dataset, text_column def is_valid_sample(text, audio): # Check if 'normalized_text' is valid text = text.strip() if text == "" or text == "ignore time segment in scoring": return False # Check if the 'audio' array is valid (not empty and meets length criteria) if len(audio['array']) == 0: # Audio is empty return False # Optionally, check if the audio duration is within a certain range duration = audio['array'].size / audio['sampling_rate'] if duration < 1.0 or duration > 60.0: # Example: Filter out audio shorter than 1 second or longer than 60 seconds return False return True # MODEL LOADERS def load_model(model_id:str): if model_id == "openai/whisper-tiny.en": model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") elif model_id == "facebook/s2t-medium-librispeech-asr": model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-librispeech-asr") processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr") else: model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") return model, processor # MODEL INFERENCE def model_compute(model, processor, sample, model_id): if model_id == "openai/whisper-tiny.en": sample = sample["audio"] input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features predicted_ids = model.generate(input_features) transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) return transcription[0] elif model_id == "facebook/s2t-medium-librispeech-asr": sample = sample["audio"] features = processor(sample["array"], sampling_rate=16000, padding=True, return_tensors="pt") input_features = features.input_features attention_mask = features.attention_mask gen_tokens = model.generate(input_features=input_features, attention_mask=attention_mask) transcription= processor.batch_decode(gen_tokens, skip_special_tokens=True)[0] return transcription else: return model(sample) # UTILS def compute_wer(references, predictions): wer = wer_metric.compute(references=references, predictions=predictions) wer = round(N_SAMPLES * wer, 2) return wer # print(load_Vox_Populi()) # print(run("Common Voice", "openai/whisper-tiny.en", "openai/whisper-tiny.en", None, None))