// Comparative-Analysis-of-Speech-Synthesis-Models/TensorFlowTTS/examples/cppwin/TensorflowTTSCppInference/FastSpeech2.cpp
/// Creates an instance with no model loaded.
/// Call Initialize() before DoInference(); until then DoInference() throws.
FastSpeech2::FastSpeech2()
    : FastSpeech(nullptr)
{
}
FastSpeech2::FastSpeech2(const std::string & SavedModelFolder) | |
{ | |
Initialize(SavedModelFolder); | |
} | |
bool FastSpeech2::Initialize(const std::string & SavedModelFolder) | |
{ | |
try { | |
FastSpeech = new Model(SavedModelFolder); | |
} | |
catch (...) { | |
FastSpeech = nullptr; | |
return false; | |
} | |
return true; | |
} | |
/// Runs the loaded model on one sequence of input IDs and returns a copy of its output.
///
/// @param InputIDs   IDs fed to "serving_default_input_ids" with shape {1, InputIDs.size()}.
/// @param SpeakerID  Single value fed to "serving_default_speaker_ids".
/// @param Speed      Single value fed to "serving_default_speed_ratios".
/// @param Energy     Single value fed to "serving_default_energy_ratios".
/// @param F0         Single value fed to "serving_default_f0_ratios".
/// @param EmotionID  Single value fed to "serving_default_emotion_ids". Pass -1 for a
///                   model that is not multi-emotion; the emotion tensor is then
///                   neither created nor fed.
/// @return Copy of the "StatefulPartitionedCall" output produced by VoxUtil::CopyTensor<float>.
/// @throws std::invalid_argument if no model is loaded.
TFTensor<float> FastSpeech2::DoInference(const std::vector<int32_t>& InputIDs, int32_t SpeakerID, float Speed, float Energy, float F0, int32_t EmotionID)
{
    if (!FastSpeech)
        throw std::invalid_argument("Tried to do inference on unloaded or invalid model!");

    // Convenience reference so that we don't have to constantly dereference the pointer.
    Model& Mdl = *FastSpeech;

    // Define the input tensors
    Tensor input_ids{ Mdl, "serving_default_input_ids" };
    Tensor energy_ratios{ Mdl, "serving_default_energy_ratios" };
    Tensor f0_ratios{ Mdl, "serving_default_f0_ratios" };
    Tensor speaker_ids{ Mdl, "serving_default_speaker_ids" };
    Tensor speed_ratios{ Mdl, "serving_default_speed_ratios" };

    // Fills the inputs, runs the model and copies the result. The optional emotion
    // tensor is passed in by pointer (nullptr = none) so that it can live on the
    // caller's stack and is destroyed even when one of the calls below throws.
    const auto FeedAndRun = [&](Tensor* emotion_ids) -> TFTensor<float>
    {
        // This is the shape of the input IDs, our equivalent to tf.expand_dims.
        std::vector<int64_t> InputIDShape = { 1, static_cast<int64_t>(InputIDs.size()) };

        input_ids.set_data(InputIDs, InputIDShape);
        energy_ratios.set_data(std::vector<float>{ Energy });
        f0_ratios.set_data(std::vector<float>{ F0 });
        speaker_ids.set_data(std::vector<int32_t>{ SpeakerID });
        speed_ratios.set_data(std::vector<float>{ Speed });

        // Define output tensor
        Tensor output{ Mdl, "StatefulPartitionedCall" };

        // Vector of input tensors; the emotion tensor, when present, goes last.
        std::vector<Tensor*> inputs = { &input_ids, &speaker_ids, &speed_ratios, &f0_ratios, &energy_ratios };
        if (emotion_ids)
            inputs.push_back(emotion_ids);

        // Do inference
        Mdl.run(inputs, output);

        return VoxUtil::CopyTensor<float>(output);
    };

    // This is a multi-emotion model
    if (EmotionID != -1)
    {
        Tensor emotion_ids{ Mdl, "serving_default_emotion_ids" };
        emotion_ids.set_data(std::vector<int32_t>{ EmotionID });
        return FeedAndRun(&emotion_ids);
    }

    return FeedAndRun(nullptr);
}
/// Releases the model held by this instance, if any.
FastSpeech2::~FastSpeech2()
{
    // Deleting a null pointer is a no-op, so no explicit check is needed.
    delete FastSpeech;
}