#include "FastSpeech2.h"
#include <stdexcept>

FastSpeech2::FastSpeech2()
{
    FastSpeech = nullptr;
}

FastSpeech2::FastSpeech2(const std::string & SavedModelFolder)
{
    Initialize(SavedModelFolder);
}

bool FastSpeech2::Initialize(const std::string & SavedModelFolder)
{
    try {
        FastSpeech = new Model(SavedModelFolder);
    }
    catch (...) {
        FastSpeech = nullptr;
        return false;
    }
    return true;
}

TFTensor<float> FastSpeech2::DoInference(const std::vector<int32_t>& InputIDs, int32_t SpeakerID, float Speed, float Energy, float F0, int32_t EmotionID)
{
    if (!FastSpeech)
        throw std::invalid_argument("Tried to do inference on unloaded or invalid model!");

    // Convenience reference so that we don't have to constantly dereference the pointer.
    Model& Mdl = *FastSpeech;

    // Define the input tensors
    Tensor input_ids{ Mdl, "serving_default_input_ids" };
    Tensor energy_ratios{ Mdl, "serving_default_energy_ratios" };
    Tensor f0_ratios{ Mdl, "serving_default_f0_ratios" };
    Tensor speaker_ids{ Mdl, "serving_default_speaker_ids" };
    Tensor speed_ratios{ Mdl, "serving_default_speed_ratios" };
    Tensor* emotion_ids = nullptr;

    // EmotionID != -1 means this is a multi-emotion model, which takes an extra input
    if (EmotionID != -1)
    {
        emotion_ids = new Tensor{ Mdl, "serving_default_emotion_ids" };
        emotion_ids->set_data(std::vector<int32_t>{ EmotionID });
    }

    // Shape of the input IDs: a batch dimension of 1, our equivalent to tf.expand_dims.
    std::vector<int64_t> InputIDShape = { 1, (int64_t)InputIDs.size() };

    input_ids.set_data(InputIDs, InputIDShape);
    energy_ratios.set_data(std::vector<float>{ Energy });
    f0_ratios.set_data(std::vector<float>{ F0 });
    speaker_ids.set_data(std::vector<int32_t>{ SpeakerID });
    speed_ratios.set_data(std::vector<float>{ Speed });

    // Define the output tensor
    Tensor output{ Mdl, "StatefulPartitionedCall" };

    // Vector of input tensors
    std::vector<Tensor*> inputs = { &input_ids, &speaker_ids, &speed_ratios, &f0_ratios, &energy_ratios };
    if (EmotionID != -1)
        inputs.push_back(emotion_ids);

    // Do inference
    FastSpeech->run(inputs, output);

    // Copy the result into our own tensor type so it can be returned safely
    TFTensor<float> Output = VoxUtil::CopyTensor<float>(output);

    // We allocated the emotion_ids tensor dynamically, so delete it
    if (emotion_ids)
        delete emotion_ids;

    // We could construct this directly in the return statement, but this reads more clearly.
    return Output;
}

FastSpeech2::~FastSpeech2()
{
    if (FastSpeech)
        delete FastSpeech;
}