|
#include "model_adapter.h" |
|
#include "otherarch/utils.h" |
|
|
|
#include "whisper.cpp" |
|
|
|
#define DR_WAV_IMPLEMENTATION |
|
#include "dr_wav.h" |
|
|
|
#include <cmath> |
|
#include <fstream> |
|
#include <cstdio> |
|
#include <regex> |
|
#include <string> |
|
#include <thread> |
|
#include <vector> |
|
#include <cstring> |
|
#include <mutex> |
|
#include <cinttypes> |
|
|
|
#define COMMON_SAMPLE_RATE 16000 |
|
|
|
#if defined(_MSC_VER) |
|
#pragma warning(disable: 4244 4267) |
|
#endif |
|
|
|
static int whisperdebugmode = 0; |
|
static bool whisper_is_quiet = false; |
|
static whisper_context * whisper_ctx = nullptr; |
|
static std::string whisper_output_text = ""; |
|
|
|
int total_transcribe_gens = 0; |
|
|
|
static bool is_wav_buffer(const std::string buf) { |
|
|
|
|
|
if (buf.size() < 12 || buf.substr(0, 4) != "RIFF" || buf.substr(8, 4) != "WAVE") { |
|
return false; |
|
} |
|
uint32_t chunk_size = *reinterpret_cast<const uint32_t*>(buf.data() + 4); |
|
if (chunk_size + 8 != buf.size()) { |
|
return false; |
|
} |
|
return true; |
|
} |
|
|
|
static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) |
|
{ |
|
drwav wav; |
|
std::vector<uint8_t> wav_data = kcpp_base64_decode(b64data); |
|
|
|
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) { |
|
printf("error: failed to open WAV file from stdin\n"); |
|
return false; |
|
} |
|
|
|
if (wav.channels != 1 && wav.channels != 2) { |
|
printf("WAV file must be mono or stereo\n"); |
|
drwav_uninit(&wav); |
|
return false; |
|
} |
|
|
|
if (wav.bitsPerSample != 8 && wav.bitsPerSample != 16 && wav.bitsPerSample != 32) { |
|
printf("WAV file must be 8-bit, 16-bit or 32-bit. Detected: %d\n",wav.bitsPerSample); |
|
drwav_uninit(&wav); |
|
return false; |
|
} |
|
|
|
const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8); |
|
|
|
std::vector<int16_t> pcm16; |
|
pcm16.resize(n*wav.channels); |
|
|
|
if (wav.bitsPerSample == 8) { |
|
|
|
std::vector<uint8_t> pcm8(n * wav.channels); |
|
drwav_read_pcm_frames(&wav, n, pcm8.data()); |
|
drwav_u8_to_s16(pcm16.data(), pcm8.data(), n * wav.channels); |
|
} else if (wav.bitsPerSample == 16) { |
|
|
|
drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); |
|
} else if (wav.bitsPerSample == 32) { |
|
|
|
std::vector<int32_t> pcm32(n * wav.channels); |
|
drwav_read_pcm_frames_s32(&wav, n, pcm32.data()); |
|
for (uint64_t i = 0; i < n * wav.channels; ++i) { |
|
pcm16[i] = static_cast<int16_t>(pcm32[i] >> 16); |
|
} |
|
} |
|
drwav_uninit(&wav); |
|
|
|
std::vector<float> raw_pcm; |
|
raw_pcm.resize(n); |
|
|
|
if(whisperdebugmode==1 && !whisper_is_quiet) |
|
{ |
|
printf("\nwav_data_size: %d, n:%d",wav_data.size(),n); |
|
} |
|
|
|
|
|
if (wav.channels == 1) { |
|
for (uint64_t i = 0; i < n; i++) { |
|
raw_pcm[i] = float(pcm16[i])/32768.0f; |
|
} |
|
} else { |
|
for (uint64_t i = 0; i < n; i++) { |
|
raw_pcm[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f; |
|
} |
|
} |
|
|
|
if (wav.sampleRate != COMMON_SAMPLE_RATE) { |
|
if(whisperdebugmode==1 && !whisper_is_quiet) |
|
{ |
|
printf("\nResample wav from %" PRIu32 " to %" PRIu32 " (in size: %zu)", |
|
wav.sampleRate, COMMON_SAMPLE_RATE, raw_pcm.size()); |
|
} |
|
raw_pcm = resample_wav(raw_pcm, wav.sampleRate, COMMON_SAMPLE_RATE); |
|
} |
|
|
|
uint64_t finalsize = raw_pcm.size(); |
|
pcmf32.resize(finalsize); |
|
for (uint64_t i = 0; i < finalsize; i++) { |
|
pcmf32[i] = raw_pcm[i]; |
|
} |
|
|
|
return true; |
|
} |
|
|
|
static std::string output_txt(struct whisper_context * ctx, std::vector<std::vector<float>> pcmf32s) { |
|
|
|
std::string outtxt = ""; |
|
const int n_segments = whisper_full_n_segments(ctx); |
|
for (int i = 0; i < n_segments; ++i) { |
|
const char * text = whisper_full_get_segment_text(ctx, i); |
|
outtxt += text; |
|
} |
|
return outtxt; |
|
} |
|
|
|
void cb_log_disable(enum ggml_log_level , const char * , void * ) { } |
|
|
|
static std::string whisperplatformenv, whisperdeviceenv, whispervulkandeviceenv; |
|
bool whispertype_load_model(const whisper_load_model_inputs inputs) |
|
{ |
|
whisper_is_quiet = inputs.quiet; |
|
|
|
|
|
int cl_parseinfo = inputs.clblast_info; |
|
std::string usingclblast = "GGML_OPENCL_CONFIGURED="+std::to_string(cl_parseinfo>0?1:0); |
|
putenv((char*)usingclblast.c_str()); |
|
cl_parseinfo = cl_parseinfo%100; |
|
int platform = cl_parseinfo/10; |
|
int devices = cl_parseinfo%10; |
|
whisperplatformenv = "GGML_OPENCL_PLATFORM="+std::to_string(platform); |
|
whisperdeviceenv = "GGML_OPENCL_DEVICE="+std::to_string(devices); |
|
putenv((char*)whisperplatformenv.c_str()); |
|
putenv((char*)whisperdeviceenv.c_str()); |
|
std::string vulkan_info_raw = inputs.vulkan_info; |
|
std::string vulkan_info_str = ""; |
|
for (size_t i = 0; i < vulkan_info_raw.length(); ++i) { |
|
vulkan_info_str += vulkan_info_raw[i]; |
|
if (i < vulkan_info_raw.length() - 1) { |
|
vulkan_info_str += ","; |
|
} |
|
} |
|
if(vulkan_info_str!="") |
|
{ |
|
whispervulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str; |
|
putenv((char*)whispervulkandeviceenv.c_str()); |
|
} |
|
|
|
|
|
std::string modelfile = inputs.model_filename; |
|
printf("\nLoading Whisper Model: %s",modelfile.c_str()); |
|
|
|
whisperdebugmode = inputs.debugmode; |
|
if (whisperdebugmode!=1) { |
|
whisper_log_set(cb_log_disable, NULL); |
|
} |
|
|
|
|
|
struct whisper_context_params cparams = whisper_context_default_params(); |
|
cparams.use_gpu = true; |
|
cparams.flash_attn = false; |
|
|
|
whisper_ctx = whisper_init_from_file_with_params(modelfile.c_str(), cparams); |
|
|
|
if (whisper_ctx == nullptr) { |
|
printf("\nWhisper Load Error: Failed to initialize whisper context!\n"); |
|
return false; |
|
} |
|
|
|
printf("\nWhisper Load Complete.\n"); |
|
|
|
return true; |
|
} |
|
|
|
whisper_generation_outputs whispertype_generate(const whisper_generation_inputs inputs) |
|
{ |
|
whisper_generation_outputs output; |
|
|
|
if(whisper_ctx==nullptr) |
|
{ |
|
printf("\nWarning: KCPP whisper not initialized!\n"); |
|
output.text = ""; |
|
output.status = 0; |
|
return output; |
|
} |
|
|
|
if(!whisper_is_quiet) |
|
{ |
|
printf("\nWhisper Transcribe Generating..."); |
|
} |
|
|
|
const std::string b64data = std::string(inputs.audio_data); |
|
const std::string initprompt = std::string(inputs.prompt); |
|
const std::string langcode = std::string(inputs.langcode); |
|
|
|
std::vector<float> pcmf32; |
|
std::vector<std::vector<float>> pcmf32s; |
|
|
|
if (!::read_wav(b64data, pcmf32, pcmf32s, false)) { |
|
printf("\nWhisper: Failed to read input wav data!\n"); |
|
output.text = ""; |
|
output.status = 0; |
|
return output; |
|
} |
|
|
|
|
|
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); |
|
wparams.strategy = WHISPER_SAMPLING_GREEDY; |
|
wparams.print_realtime = false; |
|
wparams.print_progress = false; |
|
wparams.print_timestamps = false; |
|
wparams.print_special = false; |
|
wparams.translate = false; |
|
wparams.language = langcode.c_str(); |
|
wparams.detect_language = false; |
|
wparams.n_threads = 4; |
|
wparams.n_max_text_ctx = wparams.n_max_text_ctx; |
|
wparams.offset_ms = 0; |
|
wparams.duration_ms = 0; |
|
wparams.token_timestamps = false; |
|
wparams.thold_pt = 0.01f; |
|
wparams.max_len = 100; |
|
wparams.split_on_word = false; |
|
wparams.audio_ctx = 0; |
|
wparams.speed_up = false; |
|
wparams.debug_mode = (whisperdebugmode==1); |
|
wparams.tdrz_enable = false; |
|
wparams.suppress_regex = nullptr; |
|
wparams.suppress_non_speech_tokens = inputs.suppress_non_speech; |
|
wparams.initial_prompt = initprompt.c_str(); |
|
wparams.greedy.best_of = -1; |
|
wparams.beam_search.beam_size = -1; |
|
wparams.temperature_inc = 0.2f; |
|
wparams.temperature = 0.0f; |
|
wparams.entropy_thold = 2.40f; |
|
wparams.logprob_thold = -1.00f; |
|
wparams.no_timestamps = true; |
|
|
|
if (whisper_full_parallel(whisper_ctx, wparams, pcmf32.data(), pcmf32.size(), 1) != 0) { |
|
printf("\nWhisper: Failed to process audio!\n"); |
|
output.text = ""; |
|
output.status = 0; |
|
return output; |
|
} |
|
|
|
if (!whisper_is_quiet && whisperdebugmode==1) { |
|
whisper_print_timings(whisper_ctx); |
|
} |
|
|
|
|
|
whisper_output_text = output_txt(whisper_ctx, pcmf32s); |
|
std::string ts = get_timestamp_str(); |
|
if(!whisper_is_quiet) |
|
{ |
|
printf("\n[%s] Whisper Transcribe Output: %s",ts.c_str(),whisper_output_text.c_str()); |
|
} else { |
|
printf("\n[%s] Whisper Transcribe Done.",ts.c_str()); |
|
} |
|
output.text = whisper_output_text.c_str(); |
|
output.status = 1; |
|
total_transcribe_gens += 1; |
|
return output; |
|
} |
|
|