Spaces:

justus-tobias
/

ASR_Model_Comparison

Paused

File size: 11,165 Bytes

from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import plotly.graph_objs as go
from datasets import load_dataset
from datasets import Audio
import evaluate
import librosa
import torch
import numpy as np
import pandas as pd

# Number of samples each dataset loader takes from its stream.
N_SAMPLES = 30

# Word-error-rate metric from the `evaluate` library; compute_wer() delegates to it.
wer_metric = evaluate.load("wer")

def _comparison_figure(model_1, model_2, wer1, wer2):
    """Build the grouped bar chart showing one WER bar per model."""
    fig = go.Figure(
        data=[
            go.Bar(x=[f"{model_1}"], y=[wer1], showlegend=False),
            go.Bar(x=[f"{model_2}"], y=[wer2], showlegend=False),
        ]
    )
    # Update the layout for better visualization
    fig.update_layout(
        title="Comparison of Two Models",
        xaxis_title="Models",
        yaxis_title="Value",
        barmode="group",
    )
    return fig


def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:str):
    """Compare two ASR models by word error rate (WER).

    This is a generator. For a dataset subset it yields once per processed
    sample with the running results; for an own recording it yields once.

    Args:
        data_subset: Name of the data source. "Common Voice", "VoxPopuli" and
            "Librispeech ASR clean" select a dataset loader,
            "OWN Recoding/Sample" uses ``own_audio``; any other name falls
            back to Common Voice.
        model_1: Model id of the first model (passed to ``load_model``).
        model_2: Model id of the second model.
        own_audio: ``(sampling_rate, samples)`` tuple, only read for
            "OWN Recoding/Sample". Samples are scaled as 16-bit PCM.
        own_transcription: Reference text for ``own_audio``.

    Yields:
        ``(results_md, fig, df)``: a Markdown summary string, a plotly bar
        chart and a pandas DataFrame with references, transcriptions and WERs.

    Raises:
        ValueError: If the dataset or a model is not selected, or if the own
            recording / its reference transcription is missing.
    """
    if data_subset is None:
        raise ValueError("No Dataset selected")
    if model_1 is None:
        raise ValueError("No Model 1 selected")
    if model_2 is None:
        raise ValueError("No Model 2 selected")

    use_own_audio = data_subset == "OWN Recoding/Sample"
    same_model = model_1 == model_2

    if use_own_audio:
        # Fail early with a readable message instead of an unpacking error
        # here or an empty-reference error inside the WER metric later.
        if own_audio is None:
            raise ValueError("No audio provided")
        if not own_transcription or not own_transcription.strip():
            raise ValueError("No transcription provided")

        sr, audio = own_audio
        # Scale to [-1, 1); assumes 16-bit PCM samples -- TODO confirm against the caller.
        audio = audio.astype(np.float32) / 32768.0
        if audio.ndim > 1:
            # librosa.resample works along the last axis, so a multi-channel
            # recording must be mixed down first. Assumes a
            # (samples, channels) layout -- TODO confirm against the caller.
            audio = audio.mean(axis=1)
        print("AUDIO: ", type(audio), audio)
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
    else:
        loaders = {
            "Common Voice": load_Common_Voice,
            "VoxPopuli": load_Vox_Populi,
            "Librispeech ASR clean": load_Librispeech_ASR_clean,
        }
        # Unknown subset names fall back to Common Voice.
        dataset, text_column = loaders.get(data_subset, load_Common_Voice)()
    print("Dataset Loaded")

    model1, processor1 = load_model(model_1)
    if same_model:
        # Same checkpoint selected twice: reuse it instead of loading it again.
        model2, processor2 = model1, processor1
    else:
        model2, processor2 = load_model(model_2)
    print("Models Loaded")

    if use_own_audio:
        sample = {"audio":{"array":audio,"sampling_rate":16000}}
        transcription1 = model_compute(model1, processor1, sample, model_1)
        if same_model:
            transcription2 = transcription1
        else:
            transcription2 = model_compute(model2, processor2, sample, model_2)

        transcriptions1 = [transcription1]
        transcriptions2 = [transcription2]
        references = [own_transcription]

        # WER of the single recording; no scaling by N_SAMPLES, so the score
        # is on the same scale as the dataset branch below.
        wer1 = round(compute_wer(references, transcriptions1), 2)
        wer2 = round(compute_wer(references, transcriptions2), 2)

        results_md = f"""
        #### {model_1} 
        - WER Score: {wer1}
           
        #### {model_2} 
        - WER Score: {wer2}"""

        fig = _comparison_figure(model_1, model_2, wer1, wer2)

        df = pd.DataFrame({"references":references, "transcriptions 1":transcriptions1,"WER 1":[wer1],"transcriptions 2":transcriptions2,"WER 2":[wer2]})

        yield results_md, fig, df

    else:
        references = []
        transcriptions1 = []
        transcriptions2 = []
        WER1s = []
        WER2s = []

        # A loader may return fewer than N_SAMPLES samples, so the progress
        # bar is sized from the actual dataset length.
        total = len(dataset)
        for i, sample in enumerate(dataset, start=1):
            print(i - 1)

            reference = sample[text_column]
            references.append(reference)

            transcription1 = model_compute(model1, processor1, sample, model_1)
            if same_model:
                # Identical models: run inference once and reuse the result.
                transcription2 = transcription1
            else:
                transcription2 = model_compute(model2, processor2, sample, model_2)
            transcriptions1.append(transcription1)
            transcriptions2.append(transcription2)

            WER1s.append(compute_wer([reference], [transcription1]))
            WER2s.append(compute_wer([reference], [transcription2]))

            # Running mean of the per-sample WERs seen so far.
            mean_wer1 = sum(WER1s)/len(WER1s)
            mean_wer2 = sum(WER2s)/len(WER2s)

            results_md = f"""
            {i}/{total}-{'#'*i}{'_'*(total-i)}
            
            #### {model_1} 
            - WER Score: {round(mean_wer1, 2)}
            
            #### {model_2} 
            - WER Score: {round(mean_wer2, 2)}"""

            fig = _comparison_figure(model_1, model_2, mean_wer1, mean_wer2)

            df = pd.DataFrame({"references":references, "transcriptions 1":transcriptions1,"WER 1":WER1s,"transcriptions 2":transcriptions2,"WER 2":WER2s})

            yield results_md, fig, df

    


# DATASET LOADERS
def load_Common_Voice():
    """Load the first N_SAMPLES test samples of English Common Voice 11.0.

    The dataset is streamed, the audio column is cast to 16 kHz, and the
    reference sentences are lower-cased in place.

    Returns:
        ``(dataset, text_column)``: a list of sample dicts and the name of the
        column holding the reference text (``"sentence"``).
    """
    dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", revision="streaming", split="test", streaming=True, token=True, trust_remote_code=True)
    text_column = "sentence"
    dataset = dataset.take(N_SAMPLES)
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
    dataset = list(dataset)
    for sample in dataset:
        # Lower-case the column that is actually returned as text_column;
        # indexing a hard-coded "text" key here raised KeyError.
        sample[text_column] = sample[text_column].lower()
    return dataset, text_column

def load_Vox_Populi():
    """Load up to N_SAMPLES valid English VoxPopuli test samples.

    Returns:
        ``(dataset, text_column)``: a list of sample dicts with audio cast to
        16 kHz, and the name of the reference text column
        (``"normalized_text"``).
    """
    text_column = "normalized_text"

    # Stream the test split rather than downloading it
    stream = load_dataset("facebook/voxpopuli", "en", split="test", streaming=True, trust_remote_code=True)

    # Debug aid: show the structure of the first item
    print(next(iter(stream)))

    # Over-sample by 20 so that dropping invalid samples can still leave
    # N_SAMPLES usable ones
    candidates = stream.take(N_SAMPLES+20)

    # Drop samples whose text or audio fails is_valid_sample
    valid = candidates.filter(lambda row: is_valid_sample(row[text_column], row['audio']))

    # Keep at most N_SAMPLES of the survivors and resample their audio to 16 kHz
    selected = valid.take(N_SAMPLES).cast_column("audio", Audio(sampling_rate=16000))

    # Materialise the stream so callers can take len() and iterate repeatedly
    return list(selected), text_column

def load_Librispeech_ASR_clean():
    """Load the first N_SAMPLES test samples of LibriSpeech ASR (clean).

    Returns:
        ``(dataset, text_column)``: a list of sample dicts with audio cast to
        16 kHz and lower-cased reference text, and the text column name
        (``"text"``).
    """
    text_column = "text"
    stream = load_dataset("librispeech_asr", "clean", split="test", streaming=True, token=True, trust_remote_code=True)
    # Debug aid: show the structure of the first item
    print(next(iter(stream)))
    stream = stream.take(N_SAMPLES).cast_column("audio", Audio(sampling_rate=16000))
    samples = list(stream)
    for entry in samples:
        entry[text_column] = entry[text_column].lower()
    return samples, text_column

def is_valid_sample(text, audio):
    """Return True when a sample has usable text and audio.

    A sample is rejected when its stripped text is empty or equals the
    "ignore time segment in scoring" marker, when its audio array is empty,
    or when its duration is shorter than 1 second or longer than 60 seconds.

    Args:
        text: Reference transcription of the sample.
        audio: Mapping with an ``'array'`` of samples and a
            ``'sampling_rate'``.
    """
    cleaned = text.strip()
    if cleaned in ("", "ignore time segment in scoring"):
        return False

    waveform = audio['array']
    if len(waveform) == 0:
        return False

    # Duration in seconds = number of samples / samples per second
    seconds = waveform.size / audio['sampling_rate']
    return not (seconds < 1.0 or seconds > 60.0)


# MODEL LOADERS
def load_model(model_id:str):
    """Load the pretrained model and processor for ``model_id``.

    Known ids are "openai/whisper-tiny.en",
    "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h" and
    "openai/whisper-large-v2". Any other id falls back to
    "openai/whisper-tiny.en".

    Returns:
        ``(model, processor)`` for the selected checkpoint.
    """
    fallback = "openai/whisper-tiny.en"
    # checkpoint name -> (model class, processor class)
    registry = {
        "openai/whisper-tiny.en": (WhisperForConditionalGeneration, WhisperProcessor),
        "facebook/s2t-medium-librispeech-asr": (Speech2TextForConditionalGeneration, Speech2TextProcessor),
        "facebook/wav2vec2-base-960h": (Wav2Vec2ForCTC, Wav2Vec2Processor),
        "openai/whisper-large-v2": (WhisperForConditionalGeneration, WhisperProcessor),
    }

    checkpoint = model_id if model_id in registry else fallback
    model_cls, processor_cls = registry[checkpoint]

    model = model_cls.from_pretrained(checkpoint)
    processor = processor_cls.from_pretrained(checkpoint)

    if checkpoint == "openai/whisper-large-v2":
        # Clear the forced decoder ids stored in the checkpoint config
        model.config.forced_decoder_ids = None

    return model, processor


# MODEL INFERENCE
def model_compute(model, processor, sample, model_id):
    """Transcribe one audio sample with the given model.

    Args:
        model: Model returned by ``load_model``.
        processor: Matching processor returned by ``load_model``.
        sample: Mapping whose ``"audio"`` entry holds an ``"array"`` of
            samples and a ``"sampling_rate"``.
        model_id: Model id selecting the inference path. Ids other than the
            Speech2Text and Wav2Vec2 ones take the generate-and-decode path;
            of those, only the two known Whisper ids get their text
            normalised by the tokenizer.

    Returns:
        The transcription of the sample as a string.
    """
    audio = sample["audio"]

    if model_id == "facebook/s2t-medium-librispeech-asr":
        features = processor(audio["array"], sampling_rate=16000, padding=True, return_tensors="pt")
        gen_tokens = model.generate(input_features=features.input_features, attention_mask=features.attention_mask)
        return processor.batch_decode(gen_tokens, skip_special_tokens=True)[0]

    if model_id == "facebook/wav2vec2-base-960h":
        input_values = processor(audio["array"], sampling_rate=16000, return_tensors="pt", padding="longest").input_values  # Batch size 1
        # Inference only: skip autograd bookkeeping for the forward pass so
        # no computation graph is built and kept alive.
        with torch.no_grad():
            logits = model(input_values).logits
        # Greedy CTC decoding: most likely token per frame
        predicted_ids = torch.argmax(logits, dim=-1)
        return processor.batch_decode(predicted_ids)[0].lower()

    # Whisper checkpoints and the fallback share the same generate/decode path.
    input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    if model_id in ("openai/whisper-tiny.en", "openai/whisper-large-v2"):
        transcription = processor.tokenizer.normalize(transcription)
    if model_id == "openai/whisper-large-v2":
        print("TRANSCRIPTION Whisper Large v2: ", transcription)
    return transcription

# UTILS
def compute_wer(references, predictions):
    """Return the word error rate of ``predictions`` against ``references``.

    Both arguments are passed straight to the module-level ``wer_metric``.
    """
    return wer_metric.compute(predictions=predictions, references=references)