# File size: 11,916 Bytes

import subprocess
import torch

# if torch.cuda.is_available():
#     process = subprocess.Popen(['pip', 'uninstall', 'onnxruntime'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#     stdout, stderr = process.communicate()
#     process = subprocess.Popen(['pip', 'install', '--force-reinstall', 'onnxruntime-gpu'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#     stdout, stderr = process.communicate()

import base64
import json
import os
import tempfile
import time

import numpy as np
import whisperx

# Write-only handle to the null device, for discarding subprocess output.
# It is opened at import time and never explicitly closed.
DEVNULL = open(os.devnull, 'w')

# from transformers.pipelines.audio_utils import ffmpeg_read
from typing import Dict, List, Any

import logging

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Sample rate handed to ffmpeg's "-ar" option when decoding audio for the models.
SAMPLE_RATE = 16000

def whisper_config():
    """Return the runtime settings used for WhisperX inference.

    Returns
    -------
    tuple
        ``(device, batch_size, compute_type, whisper_model)``. CUDA with
        ``float16`` is selected when torch reports a usable GPU, otherwise
        CPU with ``int8``.
    """
    if torch.cuda.is_available():
        # "int8" can be used here as well when GPU memory is tight
        # (may reduce accuracy).
        device, compute_type = "cuda", "float16"
    else:
        device, compute_type = "cpu", "int8"

    # Lower the batch size when GPU memory is tight; 16 was the initial value.
    batch_size = 16
    whisper_model = "large-v2"
    return device, batch_size, compute_type, whisper_model

# From https://gist.github.com/kylemcdonald/85d70bf53e207bab3775
# load_audio can not detect the input type
def ffmpeg_load_audio(filename, sr=44100, mono=False, normalize=True, in_type=np.int16, out_type=np.float32):
    """Decode an audio file into a NumPy array by piping it through the ffmpeg CLI.

    Parameters
    ----------
    filename : str
        Input handed to ffmpeg after ``-i``.
    sr : int
        Sample rate requested from ffmpeg (``-ar``).
    mono : bool
        Request one channel when true, two channels otherwise (``-ac``).
    normalize : bool
        Only relevant when ``out_type`` is a floating type. When true the
        samples are divided by their peak absolute value (if non-zero).
        When false and ``in_type`` is an integer type, the samples are
        divided by the maximum value of ``in_type`` instead.
    in_type : numpy scalar type
        Raw sample format requested from ffmpeg. Must be one of
        ``float64``, ``float32``, ``int16``, ``int32`` or ``uint32``.
    out_type : numpy scalar type
        dtype of the returned array.

    Returns
    -------
    numpy.ndarray
        A 1-D array for mono, or an array of shape ``(2, n_samples)`` for
        stereo. An empty array is returned when ffmpeg produced no output
        (its stderr is discarded, so failures show up as empty output).

    Raises
    ------
    KeyError
        If ``in_type`` has no matching ffmpeg sample format.
    """
    channels = 1 if mono else 2
    format_strings = {
        np.float64: 'f64le',
        np.float32: 'f32le',
        np.int16: 's16le',
        np.int32: 's32le',
        np.uint32: 'u32le',
    }
    format_string = format_strings[in_type]
    command = [
        'ffmpeg',
        '-i', filename,
        '-f', format_string,
        '-acodec', 'pcm_' + format_string,
        '-ar', str(sr),
        '-ac', str(channels),
        '-']
    # subprocess.run reads stdout to EOF and then waits for ffmpeg to exit, so
    # the child is always reaped and the bytes are collected without repeated
    # concatenation. stderr is discarded.
    completed = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    raw = completed.stdout

    # np.frombuffer replaces the deprecated binary mode of np.fromstring.
    # astype() copies, so the array stays writable for the in-place scaling.
    audio = np.frombuffer(raw, dtype=in_type).astype(out_type)
    if channels > 1:
        audio = audio.reshape((-1, channels)).transpose()
    if audio.size == 0:
        # Same return shape as the non-empty path: the array alone.
        return audio
    if issubclass(out_type, np.floating):
        if normalize:
            peak = np.abs(audio).max()
            if peak > 0:
                audio /= peak
        elif issubclass(in_type, np.integer):
            audio /= np.iinfo(in_type).max
    return audio

# FROM HuggingFace
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
    """
    Decode in-memory audio bytes into a float32 waveform using the ffmpeg CLI.

    The payload is written to ffmpeg's stdin and the decoded samples are read
    back from its stdout, requested as one channel at `sampling_rate` in
    `f32le` format.

    Raises `ValueError` when the ffmpeg executable cannot be found or when
    ffmpeg returns no samples.
    """
    ffmpeg_command = [
        "ffmpeg",
        "-i", "pipe:0",
        "-ac", "1",
        "-ar", f"{sampling_rate}",
        "-f", "f32le",
        "-hide_banner",
        "-loglevel", "quiet",
        "pipe:1",
    ]

    try:
        with subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as ffmpeg_process:
            out_bytes, _ = ffmpeg_process.communicate(bpayload)
    except FileNotFoundError as error:
        raise ValueError("ffmpeg was not found but is required to load audio files from filename") from error

    audio = np.frombuffer(out_bytes, np.float32)
    if audio.shape[0] != 0:
        return audio
    raise ValueError(
        "Soundfile is either not in the correct format or is malformed. Ensure that the soundfile has "
        "a valid audio file extension (e.g. wav, flac or mp3) and is not corrupted. If reading from a remote "
        "URL, ensure that the URL is the full address to **download** the audio file."
    )


# FROM whisperX
def load_audio(file: str, sr: int = SAMPLE_RATE):
    """
    Decode an audio file with the ffmpeg CLI into a mono float32 waveform.

    Parameters
    ----------
    file: str
        The audio file to open

    sr: int
        The sample rate ffmpeg is asked to output

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype, obtained
    by dividing the signed 16-bit samples by 32768.

    Raises
    ------
    RuntimeError
        If ffmpeg exits with a non-zero status; the message carries ffmpeg's
        stderr output.
    """
    # ffmpeg does the down-mixing to one channel and the resampling; the CLI
    # must be installed.
    ffmpeg_args = ["ffmpeg", "-nostdin", "-threads", "0", "-i", file]
    ffmpeg_args += ["-f", "s16le", "-ac", "1", "-acodec", "pcm_s16le"]
    ffmpeg_args += ["-ar", str(sr), "-"]

    try:
        completed = subprocess.run(ffmpeg_args, capture_output=True, check=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    pcm = np.frombuffer(completed.stdout, np.int16).flatten()
    return pcm.astype(np.float32) / 32768.0


def display_gpu_infos():
    """Return a one-line, comma-separated summary of the CUDA state seen by torch.

    Returns the literal string "NO CUDA" when torch reports that CUDA is not
    available.
    """
    if not torch.cuda.is_available():
        return "NO CUDA"

    # (label, value) pairs, emitted in this order as "label" + str(value).
    fields = [
        ("torch.cuda.current_device(): ", torch.cuda.current_device()),
        ("torch.cuda.device(0): ", torch.cuda.device(0)),
        ("torch.cuda.device_count(): ", torch.cuda.device_count()),
        ("torch.cuda.get_device_name(0): ", torch.cuda.get_device_name(0)),
    ]
    return ", ".join(label + str(value) for label, value in fields)

class EndpointHandler():
    """Inference handler: WhisperX transcription with optional alignment and speaker diarization."""

    # Environment variable that supplies the Hugging Face token for the
    # diarization pipeline.
    _HF_TOKEN_ENV = "PYANNOTE_AUTH_TOKEN"

    # SECURITY: this token was committed to the source file and must be
    # treated as leaked. It is kept only as a fallback so existing deployments
    # keep working. Set the environment variable above, then revoke this token
    # and delete this constant.
    _LEGACY_HF_TOKEN = "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"

    def __init__(self, path=""):
        """Load the transcription model and the diarization pipeline.

        Args:
            path: Unused here; kept for the handler interface.
        """
        device, batch_size, compute_type, whisper_model = whisper_config()
        self.model = whisperx.load_model(whisper_model, device=device, compute_type=compute_type)
        logger.info(f"Model {whisper_model} initialized")

        hf_token = os.environ.get(self._HF_TOKEN_ENV)
        if not hf_token:
            logger.warning(
                "%s is not set; falling back to the token embedded in the source file",
                self._HF_TOKEN_ENV,
            )
            hf_token = self._LEGACY_HF_TOKEN

        self.diarize_model = whisperx.DiarizationPipeline(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=hf_token, device=device)

        logger.info(f"Model for diarization initialized")

    @staticmethod
    def _report(message, info):
        """Log `message` at INFO level and, when `info` is set, also print it."""
        logger.info(message)
        if info:
            print(message)

    @classmethod
    def _report_elapsed(cls, label, started, info):
        """Report the time spent since `started` on stage `label`; return a new start timestamp."""
        now = time.time()
        cls._report(f"TIME for {label} : {now - started:.2f} seconds", info)
        return now

    @staticmethod
    def _decode_audio(payload):
        """Decode raw audio file bytes with `load_audio` via a private temporary file."""
        # load_audio takes a file path. Each call gets its own temporary file
        # so that concurrent requests cannot overwrite each other's audio.
        tmp = tempfile.NamedTemporaryFile(suffix=".tmp", delete=False)
        try:
            with tmp:
                tmp.write(payload)
            return load_audio(tmp.name, sr=SAMPLE_RATE)
        finally:
            # Remove the file even when writing or decoding fails.
            os.remove(tmp.name)

    def __call__(self, data: Any) -> Dict[str, Any]:
        """
        Transcribe a base64-encoded audio payload.

        Args:
            data (:obj:`dict`):
                Request payload. The following keys are read and removed:
                ``inputs``: the base64-encoded content of the audio file.
                ``parameters``: optional dict; ``language`` (default ``"fr"``)
                is passed to the transcription model.
                ``options``: optional dict of flags: ``info`` (default off,
                also prints progress to stdout), ``alignment`` (default off)
                and ``diarization`` (default on).
        Return:
            A :obj:`dict` with the single key ``transcription`` holding the
            segments produced by the last stage that ran.
        """
        stage_start = time.time()

        logger.info("--------------- CONFIGURATION ------------------------")
        device, batch_size, compute_type, whisper_model = whisper_config()
        logger.info(f"device: {device}, batch_size: {batch_size}, compute_type:{compute_type}, whisper_model: {whisper_model}")
        logger.info(display_gpu_infos())

        # 1. process input
        inputs_encoded = data.pop("inputs", data)
        parameters = data.pop("parameters", None) or {}
        options = data.pop("options", None) or {}

        info = bool(options.get("info"))
        alignment = bool(options.get("alignment"))
        diarization = bool(options.get("diarization", True))
        language = parameters.get("language", "fr")

        audio_nparray = self._decode_audio(base64.b64decode(inputs_encoded))
        # Restart the timer here so the next stage does not include decode time.
        stage_start = self._report_elapsed("audio processing", stage_start, info)

        # 2. transcribe
        logger.info("--------------- STARTING TRANSCRIPTION ------------------------")
        transcription = self.model.transcribe(audio_nparray, batch_size=batch_size, language=language)
        segments = transcription["segments"]
        self._report(segments[0:10000], info)  # before alignment

        # Nothing recognised: skip alignment and diarization.
        if not segments or "text" not in segments[0]:
            logger.warning("No transcription")
            return {"transcription": segments}

        stage_start = self._report_elapsed("audio transcription", stage_start, info)

        # 3. align
        if alignment:
            logger.info("--------------- STARTING ALIGNMENT ------------------------")
            model_a, metadata = whisperx.load_align_model(
                language_code=transcription["language"], device=device)
            transcription = whisperx.align(
                transcription["segments"], model_a, metadata, audio_nparray, device, return_char_alignments=False)
            self._report(transcription["segments"][0:10000], info)
            stage_start = self._report_elapsed("alignment", stage_start, info)

        # 4. Assign speaker labels
        if diarization:
            logger.info("--------------- STARTING DIARIZATION ------------------------")
            # min_speakers / max_speakers could be passed here if they are known
            diarize_segments = self.diarize_model(audio_nparray)
            self._report(diarize_segments, info)

            transcription = whisperx.assign_word_speakers(diarize_segments, transcription)
            # segments are now assigned speaker IDs
            self._report(transcription["segments"][0:10000], info)
            stage_start = self._report_elapsed("audio diarization", stage_start, info)

        return {"transcription": transcription["segments"]}