# Generation parameters kept constant across requests (fixed seed included).
FIXED_GENERATION_CONFIG = dict(
    max_completion_tokens=1024,
    top_k=50,
    length_penalty=1.0,
    seed=42,
)

# Upper bound on audio length; presumably seconds -- TODO confirm with callers.
MAX_AUDIO_LENGTH = 120


def _text_part(text_input: str) -> dict:
    """Build the text entry of a multimodal content list."""
    return {
        "type": "text",
        "text": f"Text instruction: {text_input}",
    }


def _audio_part(base64_audio_input: str) -> dict:
    """Build the audio entry, embedding the base64 payload in a data URL."""
    return {
        "type": "audio_url",
        "audio_url": {
            "url": f"data:audio/ogg;base64,{base64_audio_input}",
        },
    }


def prepare_multimodal_content(text_input, base64_audio_input):
    """Return a new two-element content list: text part, then audio part.

    Args:
        text_input: Instruction text; emitted with a "Text instruction: "
            prefix.
        base64_audio_input: Base64-encoded audio, embedded in a
            ``data:audio/ogg;base64,`` URL.

    Returns:
        A list ``[text_part, audio_part]`` of dicts.
    """
    return [
        _text_part(text_input),
        _audio_part(base64_audio_input),
    ]


def change_multimodal_content(
        original_content, text_input="", base64_audio_input=""):
    """Replace the text and/or audio part of an existing content list in place.

    The parts are addressed by position: index 0 is the text part and index 1
    is the audio part, i.e. the layout produced by
    ``prepare_multimodal_content``. A falsy argument (the default ``""``)
    leaves the corresponding slot untouched.

    Args:
        original_content: Content list to modify; it is mutated, not copied.
        text_input: New instruction text, or ``""`` to keep the current one.
        base64_audio_input: New base64 audio, or ``""`` to keep the current
            one.

    Returns:
        The same ``original_content`` object, after modification.
    """
    # Slots are list positions, so the replacement relies on the fixed
    # [text, audio] order rather than on any dict key ordering.
    if text_input:
        original_content[0] = _text_part(text_input)
    if base64_audio_input:
        original_content[1] = _audio_part(base64_audio_input)
    return original_content