Spaces:

linpershey
/

process_mining

Running

File size: 7,343 Bytes
from enum import Enum
from typing import Optional, Dict, Any
from pm4py.util import exec_utils, constants
from tempfile import NamedTemporaryFile
import pm4py
import os
import sys
import subprocess
import importlib.util


class Parameters(Enum):
    """
    Keys that can be set in the ``parameters`` dictionary accepted by
    speech_to_text and text_to_speech
    """
    # API key handed to the OpenAI client
    API_KEY = "api_key"
    # name of the OpenAI model; read by both functions, each with its own default
    MODEL = "openai_model"
    # length of the microphone recording made by speech_to_text when no sound file is given
    RECORDING_DURATION = "recording_duration"
    # voice forwarded to the text-to-speech request
    VOICE = "voice"
    # whether text_to_speech plays the generated file after creating it
    PLAY_SOUND = "play_sound"
    # maximum number of characters of the input string sent by text_to_speech
    MAX_LEN = "max_len"


def check_ffmpeg_installed():
    """
    Checks whether the "ffmpeg" executable can be run from the current environment

    Returns
    -------------------
    installed
        True if the command "ffmpeg -version" exits successfully, False otherwise
    """
    try:
        # the output is irrelevant: only the exit status matters, so it is discarded
        subprocess.run(["ffmpeg", "-version"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
    except (OSError, subprocess.SubprocessError):
        # OSError: the executable is missing or cannot be started
        # SubprocessError: the command ran but returned a non-zero exit status
        # (KeyboardInterrupt / SystemExit are deliberately NOT swallowed)
        return False
    return True


def _remove_if_exists(path: str) -> None:
    """
    Best-effort removal of a temporary file (errors are ignored on purpose)
    """
    try:
        os.remove(path)
    except OSError:
        # a leftover temporary file must never mask the real result or error
        pass


def _record_to_mp3(recording_duration) -> str:
    """
    Records from the default input device for the given number of seconds and stores the result as .mp3

    Parameters
    ------------------
    recording_duration
        Duration (in seconds) of the recording

    Returns
    ------------------
    mp3_path
        Path to a temporary .mp3 file containing the recording (the caller is responsible for deleting it)
    """
    import pyaudio
    from pydub import AudioSegment
    import wave

    # Audio recording parameters
    sample_format = pyaudio.paInt16
    channels = 1
    rate = 44100
    chunk = 1024

    # delete=False keeps the (empty) files on disk after closing, so the names stay reserved
    # and cannot be taken by another process before they are written
    with NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        wav_path = tmp_file.name
    with NamedTemporaryFile(suffix=".mp3", delete=False) as tmp_file:
        mp3_path = tmp_file.name

    try:
        audio = pyaudio.PyAudio()
        try:
            # queried before terminate(), while the PyAudio instance is still alive
            sample_width = audio.get_sample_size(sample_format)

            # Start recording
            stream = audio.open(format=sample_format, channels=channels,
                                rate=rate, input=True,
                                frames_per_buffer=chunk)
            try:
                print("Recording...")
                frames = [stream.read(chunk) for _ in range(int(rate / chunk * recording_duration))]
                print("Finished recording.")
            finally:
                # Stop recording (also when reading from the device fails)
                stream.stop_stream()
                stream.close()
        finally:
            audio.terminate()

        # Save the recorded data as a WAV file
        with wave.open(wav_path, 'wb') as wav_file:
            wav_file.setnchannels(channels)
            wav_file.setsampwidth(sample_width)
            wav_file.setframerate(rate)
            wav_file.writeframes(b''.join(frames))

        sound = AudioSegment.from_wav(wav_path)
        # pydub's export returns the output file handle still open; close it explicitly
        sound.export(mp3_path, format="mp3").close()
    except BaseException:
        # nothing usable was produced: do not leave the .mp3 placeholder behind
        _remove_if_exists(mp3_path)
        raise
    finally:
        # the intermediate .wav file is never needed after the conversion
        _remove_if_exists(wav_path)

    return mp3_path


def speech_to_text(sound_file_path: Optional[str] = None, parameters: Optional[Dict[Any, Any]] = None) -> str:
    """
    Uses an OpenAI speech-to-text model

    Parameters
    ------------------
    sound_file_path
        If provided, path to a .mp3 file containing the voice to be transcribed as text. If not, a recording of the specified duration is started, and provided to the model.
    parameters
        Parameters of the method, including:
        - Parameters.API_KEY => the API key to be used (default: constants.OPENAI_API_KEY)
        - Parameters.MODEL => the speech-to-text model to be used (default: constants.OPENAI_DEFAULT_STT_MODEL)
        - Parameters.RECORDING_DURATION => the duration in seconds of the voice recording (default: 10)

    Returns
    -------------------
    text
        Transcription as text of the sound
    """
    if parameters is None:
        parameters = {}

    api_key = exec_utils.get_param_value(Parameters.API_KEY, parameters, constants.OPENAI_API_KEY)
    model = exec_utils.get_param_value(Parameters.MODEL, parameters, constants.OPENAI_DEFAULT_STT_MODEL)
    recording_duration = exec_utils.get_param_value(Parameters.RECORDING_DURATION, parameters, 10)

    # a file recorded here is temporary and owned by this function; a file provided
    # by the caller is never deleted
    recorded_here = sound_file_path is None
    if recorded_here:
        sound_file_path = _record_to_mp3(recording_duration)

    try:
        from openai import OpenAI

        client = OpenAI(api_key=api_key)

        # the context manager guarantees that the file handle is released after the upload
        with open(sound_file_path, "rb") as sound_file:
            transcript = client.audio.transcriptions.create(
                model=model,
                file=sound_file
            )

        return transcript.text
    finally:
        if recorded_here:
            _remove_if_exists(sound_file_path)


def text_to_speech(stri: str, parameters: Optional[Dict[Any, Any]] = None) -> str:
    """
    Uses an OpenAI text-to-speech model

    Parameters
    ---------------
    stri
        String that needs to be translated to voice
    parameters
        Parameters of the algorithm, including:
        - Parameters.API_KEY => the API key of OpenAI to be used (default: constants.OPENAI_API_KEY)
        - Parameters.MODEL => the TTS model of OpenAI to be used (default: constants.OPENAI_DEFAULT_TTS_MODEL)
        - Parameters.VOICE => the voice of the TTS model to be used (default: constants.OPENAI_DEFAULT_TTS_VOICE)
        - Parameters.MAX_LEN => maximum number of characters sent to the model; longer strings are truncated (default: 4096)
        - Parameters.PLAY_SOUND => boolean that determines if the voice should be played (default: True)

    Returns
    ---------------
    stru
        Path to the .mp3 file containing the synthesized speech
    """
    if parameters is None:
        parameters = {}

    api_key = exec_utils.get_param_value(Parameters.API_KEY, parameters, constants.OPENAI_API_KEY)
    model = exec_utils.get_param_value(Parameters.MODEL, parameters, constants.OPENAI_DEFAULT_TTS_MODEL)
    voice = exec_utils.get_param_value(Parameters.VOICE, parameters, constants.OPENAI_DEFAULT_TTS_VOICE)
    max_len = exec_utils.get_param_value(Parameters.MAX_LEN, parameters, 4096)
    play_sound = exec_utils.get_param_value(Parameters.PLAY_SOUND, parameters, True)

    # delete=False keeps the (empty) file on disk after closing, so the name stays reserved
    # until the audio is written; the file is returned to the caller and therefore not removed here
    with NamedTemporaryFile(suffix=".mp3", delete=False) as tmp_file:
        speech_file_path = tmp_file.name

    from openai import OpenAI

    client = OpenAI(api_key=api_key)

    if len(stri) > max_len:
        # TTS limit
        stri = stri[:max_len]

    response = client.audio.speech.create(
        model=model,
        voice=voice,
        input=stri
    )

    response.stream_to_file(speech_file_path)

    if play_sound:
        if importlib.util.find_spec("pygame") is not None:
            # if the user installed pygame, use that to seamlessly play the .mp3 file
            import pygame

            pygame.mixer.init()
            pygame.mixer.music.load(speech_file_path)
            pygame.mixer.music.play()

            # a single clock throttles the polling loop to at most 10 checks per second
            clock = pygame.time.Clock()
            while pygame.mixer.music.get_busy():
                clock.tick(10)
        else:
            # calls the system .mp3 opener
            if sys.platform.startswith('darwin'):
                subprocess.call(('open', speech_file_path))
            elif os.name == 'nt':  # For Windows
                os.startfile(speech_file_path)
            elif os.name == 'posix':  # For Linux and other POSIX systems
                subprocess.call(('xdg-open', speech_file_path))

    return speech_file_path


if __name__ == "__main__":
    # Example: a spoken question about an event log is transcribed, repeated aloud,
    # sent to an OpenAI model together with the variants abstraction of the log,
    # and the answer is finally vocalized.

    # stop early when the tools needed for the recording are missing
    ffmpeg_available = check_ffmpeg_installed()
    if not ffmpeg_available:
        raise Exception("install ffmpeg and add it to the environment variables!")

    if not (importlib.util.find_spec("pydub") and importlib.util.find_spec("pyaudio")):
        raise Exception("install pydub and pyaudio using pip!")

    # NOTE(review): looks like a placeholder - presumably to be replaced with a real key
    api_key = "sk-"

    event_log = pm4py.read_xes("../../tests/compressed_input_data/15_bpic2020_permit_log_1t_per_variant.xes.gz")
    variants_abstraction = pm4py.llm.abstract_variants(event_log)

    parameters = {
        "api_key": api_key,  # OpenAI key
        "recording_duration": 6,  # 6 seconds recording duration
    }

    # step 1: record the question and transcribe it
    print("Please insert your inquiry:")
    user_inquiry = speech_to_text(None, parameters=parameters)
    print("This is your inquiry:", user_inquiry)

    # step 2: read the transcription back to the user
    print("Now your inquiry is vocalized before execution:")
    text_to_speech(user_inquiry, parameters=parameters)

    # step 3: query the model with the abstraction followed by the question
    prompt = "\n\n".join((variants_abstraction, user_inquiry))
    answer = pm4py.llm.openai_query(prompt, api_key=api_key)
    print("This is the response of the OpenAI model:", answer)

    # step 4: vocalize the answer
    print("Now the response is vocalized:")
    text_to_speech(answer, parameters=parameters)