Spaces:
Sleeping
Sleeping
File size: 7,343 Bytes
8097001 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 |
from enum import Enum
from typing import Optional, Dict, Any
from pm4py.util import exec_utils, constants
from tempfile import NamedTemporaryFile
import pm4py
import os
import sys
import subprocess
import importlib.util
class Parameters(Enum):
    """
    Keys accepted inside the "parameters" dictionary of the functions of this module.
    """

    # OpenAI API key (read by speech_to_text and text_to_speech)
    API_KEY = "api_key"

    # name of the OpenAI model (read by speech_to_text and text_to_speech)
    MODEL = "openai_model"

    # duration of the voice recording (read by speech_to_text)
    RECORDING_DURATION = "recording_duration"

    # voice of the text-to-speech model (read by text_to_speech)
    VOICE = "voice"

    # whether the generated sound should be played (read by text_to_speech)
    PLAY_SOUND = "play_sound"

    # maximum number of characters of the text sent to the model (read by text_to_speech)
    MAX_LEN = "max_len"
def check_ffmpeg_installed() -> bool:
    """
    Checks whether the "ffmpeg" executable can be launched

    The command "ffmpeg -version" is executed; its output is discarded.

    Returns
    -------------------
    installed
        True if the command could be started and exited with status 0, False otherwise
    """
    try:
        # only the ability to start the process and its exit status matter,
        # so the output is sent to DEVNULL instead of being captured
        subprocess.run(
            ["ffmpeg", "-version"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )
    except (OSError, subprocess.SubprocessError):
        # OSError: the executable is missing or cannot be run
        # SubprocessError: the command exited with a non-zero status
        # (other exceptions, e.g. KeyboardInterrupt, are deliberately not swallowed)
        return False
    return True
def _reserve_temp_file(suffix: str) -> str:
    """Creates an empty temporary file with the given suffix and returns its path (the file is kept on disk)."""
    # delete=False keeps the file after closing, so the name stays reserved
    # until the caller overwrites or removes it
    with NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
        return temp_file.name


def _discard_file(path: str) -> None:
    """Removes a temporary file; a failure to remove it is ignored (best-effort cleanup)."""
    try:
        os.remove(path)
    except OSError:
        # the file is only a leftover in the temporary directory: failing to
        # remove it must not hide the result (or the error) of the caller
        pass


def _record_microphone_to_mp3(recording_duration) -> str:
    """Records audio input for the given number of seconds and returns the path of the resulting .mp3 file."""
    import pyaudio
    from pydub import AudioSegment
    import wave

    # audio recording parameters
    sample_format = pyaudio.paInt16
    channels = 1
    rate = 44100
    chunk = 1024

    audio = pyaudio.PyAudio()
    try:
        # read before the PyAudio instance is terminated
        sample_width = audio.get_sample_size(sample_format)
        stream = audio.open(format=sample_format, channels=channels,
                            rate=rate, input=True,
                            frames_per_buffer=chunk)
        try:
            print("Recording...")
            frames = [stream.read(chunk) for _ in range(int(rate / chunk * recording_duration))]
            print("Finished recording.")
            stream.stop_stream()
        finally:
            # the stream is released even if reading from it fails
            stream.close()
    finally:
        audio.terminate()

    wav_path = _reserve_temp_file(".wav")
    try:
        # save the recorded data as a WAV file
        with wave.open(wav_path, 'wb') as wav_file:
            wav_file.setnchannels(channels)
            wav_file.setsampwidth(sample_width)
            wav_file.setframerate(rate)
            wav_file.writeframes(b''.join(frames))

        mp3_path = _reserve_temp_file(".mp3")
        try:
            # export() returns the handle of the written file: close it explicitly
            AudioSegment.from_wav(wav_path).export(mp3_path, format="mp3").close()
        except Exception:
            _discard_file(mp3_path)
            raise
    finally:
        # the WAV file is only an intermediate step towards the .mp3
        _discard_file(wav_path)

    return mp3_path


def speech_to_text(sound_file_path: Optional[str] = None, parameters: Optional[Dict[Any, Any]] = None) -> str:
    """
    Uses an OpenAI speech-to-text model

    Parameters
    ------------------
    sound_file_path
        If provided, path to a .mp3 file containing the voice to be transcribed as text. If not, a recording of the
        specified duration is started, and provided to the model (this requires pyaudio and pydub).
    parameters
        Parameters of the method, including:
        - Parameters.API_KEY => the API key to be used
        - Parameters.MODEL => the speech-to-text model to be used (default: constants.OPENAI_DEFAULT_STT_MODEL)
        - Parameters.RECORDING_DURATION => the duration (in seconds) of the voice recording (default: 10)

    Returns
    -------------------
    text
        Transcription as text of the sound
    """
    if parameters is None:
        parameters = {}

    api_key = exec_utils.get_param_value(Parameters.API_KEY, parameters, constants.OPENAI_API_KEY)
    model = exec_utils.get_param_value(Parameters.MODEL, parameters, constants.OPENAI_DEFAULT_STT_MODEL)
    recording_duration = exec_utils.get_param_value(Parameters.RECORDING_DURATION, parameters, 10)

    # a file recorded here is temporary and is removed at the end;
    # a file provided by the caller is never touched
    recorded_here = sound_file_path is None
    if recorded_here:
        sound_file_path = _record_microphone_to_mp3(recording_duration)

    try:
        from openai import OpenAI

        client = OpenAI(api_key=api_key)

        # the context manager closes the sound file as soon as the request is done
        with open(sound_file_path, "rb") as sound_file:
            transcript = client.audio.transcriptions.create(
                model=model,
                file=sound_file
            )

        return transcript.text
    finally:
        if recorded_here:
            _discard_file(sound_file_path)
def _play_mp3(speech_file_path: str) -> None:
    """Plays the given .mp3 file with pygame if it is installed, otherwise with the system opener."""
    if importlib.util.find_spec("pygame"):
        # if the user installed pygame, use that to seamlessly play the .mp3 file
        import pygame

        pygame.mixer.init()
        pygame.mixer.music.load(speech_file_path)
        pygame.mixer.music.play()

        # a single clock is reused to poll (at most 10 times per second)
        # until the playback is over
        clock = pygame.time.Clock()
        while pygame.mixer.music.get_busy():
            clock.tick(10)
    # otherwise, calls the system .mp3 opener
    elif sys.platform.startswith('darwin'):
        subprocess.call(('open', speech_file_path))
    elif os.name == 'nt':  # For Windows
        os.startfile(speech_file_path)
    elif os.name == 'posix':  # For Linux, etc.
        subprocess.call(('xdg-open', speech_file_path))


def text_to_speech(stri: str, parameters: Optional[Dict[Any, Any]] = None) -> str:
    """
    Uses an OpenAI text-to-speech model

    Parameters
    ---------------
    stri
        String that needs to be translated to voice
    parameters
        Parameters of the algorithm, including:
        - Parameters.API_KEY => the API key of OpenAI to be used
        - Parameters.MODEL => the TTS model of OpenAI to be used (default: constants.OPENAI_DEFAULT_TTS_MODEL)
        - Parameters.VOICE => the voice of the TTS model to be used (default: constants.OPENAI_DEFAULT_TTS_VOICE)
        - Parameters.MAX_LEN => maximum number of characters sent to the model; a longer text is truncated
                                (default: 4096)
        - Parameters.PLAY_SOUND => boolean that determines if the voice should be played (default: True)

    Returns
    ---------------
    speech_file_path
        Path to the .mp3 file containing the generated voice
    """
    if parameters is None:
        parameters = {}

    api_key = exec_utils.get_param_value(Parameters.API_KEY, parameters, constants.OPENAI_API_KEY)
    model = exec_utils.get_param_value(Parameters.MODEL, parameters, constants.OPENAI_DEFAULT_TTS_MODEL)
    voice = exec_utils.get_param_value(Parameters.VOICE, parameters, constants.OPENAI_DEFAULT_TTS_VOICE)
    max_len = exec_utils.get_param_value(Parameters.MAX_LEN, parameters, 4096)
    play_sound = exec_utils.get_param_value(Parameters.PLAY_SOUND, parameters, True)

    from openai import OpenAI

    client = OpenAI(api_key=api_key)

    if len(stri) > max_len:
        # TTS limit
        stri = stri[:max_len]

    response = client.audio.speech.create(
        model=model,
        voice=voice,
        input=stri
    )

    # the output file is created only once the request succeeded; delete=False
    # keeps it on disk (and its name reserved) after the handle is closed
    with NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file:
        speech_file_path = temp_file.name

    response.stream_to_file(speech_file_path)

    if play_sound:
        _play_mp3(speech_file_path)

    return speech_file_path
if __name__ == "__main__":
    # Demo: a spoken inquiry about an event log is transcribed, read back to
    # the user, sent to an OpenAI model along with the abstraction of the
    # variants of the log, and the answer is finally vocalized.

    # prerequisites of the voice recording
    if not check_ffmpeg_installed():
        raise Exception("install ffmpeg and add it to the environment variables!")

    if not (importlib.util.find_spec("pydub") and importlib.util.find_spec("pyaudio")):
        raise Exception("install pydub and pyaudio using pip!")

    api_key = "sk-"

    event_log = pm4py.read_xes("../../tests/compressed_input_data/15_bpic2020_permit_log_1t_per_variant.xes.gz")
    variants_abstraction = pm4py.llm.abstract_variants(event_log)

    parameters = {
        "api_key": api_key,  # OpenAI key
        "recording_duration": 6,  # 6 seconds recording duration
    }

    # step 1: acquire the inquiry by voice
    print("Please insert your inquiry:")
    user_inquiry = speech_to_text(None, parameters=parameters)
    print("This is your inquiry:", user_inquiry)

    # step 2: read the inquiry back to the user
    print("Now your inquiry is vocalized before execution:")
    text_to_speech(user_inquiry, parameters=parameters)

    # step 3: query the model with the abstraction followed by the inquiry
    prompt = "\n\n".join((variants_abstraction, user_inquiry))
    llm_answer = pm4py.llm.openai_query(prompt, api_key=api_key)
    print("This is the response of the OpenAI model:", llm_answer)

    # step 4: vocalize the answer
    print("Now the response is vocalized:")
    text_to_speech(llm_answer, parameters=parameters)
|