File size: 2,453 Bytes
d5ee97c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#include "FastSpeech2.h"

#include <memory>
#include <stdexcept>


// Create an empty synthesizer; a model must be loaded later via Initialize().
FastSpeech2::FastSpeech2() : FastSpeech(nullptr)
{
}

/**
 * @brief Construct and immediately load a TensorFlow SavedModel from disk.
 * @param SavedModelFolder Path to the SavedModel directory.
 * @note Load failure is silent here: the model pointer simply stays null and
 *       DoInference() will throw. Use the default ctor + Initialize() when the
 *       caller needs the success/failure result.
 */
FastSpeech2::FastSpeech2(const std::string & SavedModelFolder)
	: FastSpeech(nullptr) // member must be defined before Initialize() reads it for cleanup
{
	Initialize(SavedModelFolder);
}


/**
 * @brief Load (or reload) the TensorFlow SavedModel backing this synthesizer.
 * @param SavedModelFolder Path to the SavedModel directory.
 * @return true on success; false if model construction threw, in which case
 *         the object is left unloaded (FastSpeech == nullptr).
 */
bool FastSpeech2::Initialize(const std::string & SavedModelFolder)
{
	// Release any previously loaded model so repeated Initialize() calls
	// don't leak it (the original overwrote the pointer directly).
	delete FastSpeech;
	FastSpeech = nullptr;

	try {
		FastSpeech = new Model(SavedModelFolder);
	}
	catch (...) {
		// Model construction failed (bad path, incompatible graph, ...).
		// Report failure instead of propagating the exception.
		FastSpeech = nullptr;
		return false;
	}
	return true;
}

/**
 * @brief Run FastSpeech2 inference on a sequence of input token IDs.
 * @param InputIDs  Token IDs for the utterance (treated as a batch of 1).
 * @param SpeakerID Speaker index for multi-speaker models.
 * @param Speed     Speed ratio fed to "serving_default_speed_ratios".
 * @param Energy    Energy ratio fed to "serving_default_energy_ratios".
 * @param F0        Pitch (F0) ratio fed to "serving_default_f0_ratios".
 * @param EmotionID Emotion index for multi-emotion models; pass -1 to omit
 *                  the emotion input entirely (default single-emotion models).
 * @return The model output copied into an owning TFTensor<float>.
 * @throws std::invalid_argument if no model is loaded.
 */
TFTensor<float> FastSpeech2::DoInference(const std::vector<int32_t>& InputIDs, int32_t SpeakerID, float Speed, float Energy, float F0, int32_t EmotionID)
{
	if (!FastSpeech)
        throw std::invalid_argument("Tried to do inference on unloaded or invalid model!");

	// Convenience reference so that we don't have to constantly dereference pointers.
	Model& Mdl = *FastSpeech;

	// Define the input tensors by their SavedModel serving-signature names.
	Tensor input_ids{ Mdl,"serving_default_input_ids" };
	Tensor energy_ratios{ Mdl,"serving_default_energy_ratios" };
	Tensor f0_ratios{ Mdl,"serving_default_f0_ratios" };
	Tensor speaker_ids{ Mdl,"serving_default_speaker_ids" };
	Tensor speed_ratios{ Mdl,"serving_default_speed_ratios" };

	// Only multi-emotion models expose the emotion input, so it is created
	// on demand. unique_ptr guarantees release even if set_data()/run()/
	// CopyTensor() throws — the original raw new/delete leaked on that path.
	std::unique_ptr<Tensor> emotion_ids;
	if (EmotionID != -1)
	{
		emotion_ids = std::make_unique<Tensor>(Mdl, "serving_default_emotion_ids");
		emotion_ids->set_data(std::vector<int32_t>{EmotionID});
	}

	// This is the shape of the input IDs, our equivalent to tf.expand_dims
	// (prepends a batch dimension of 1).
	std::vector<int64_t> InputIDShape = { 1, (int64_t)InputIDs.size() };

	input_ids.set_data(InputIDs, InputIDShape);
	energy_ratios.set_data(std::vector<float>{ Energy });
	f0_ratios.set_data(std::vector<float>{F0});
	speaker_ids.set_data(std::vector<int32_t>{SpeakerID});
	speed_ratios.set_data(std::vector<float>{Speed});

	// Define output tensor
	Tensor output{ Mdl,"StatefulPartitionedCall" };

	// Vector of input tensors; the emotion input is appended only when present.
	std::vector<Tensor*> inputs = { &input_ids,&speaker_ids,&speed_ratios,&f0_ratios,&energy_ratios };

	if (emotion_ids)
		inputs.push_back(emotion_ids.get());

	// Do inference
	FastSpeech->run(inputs, output);

	// Copy the result into an owning tensor and return it; the unique_ptr
	// cleans up the emotion tensor automatically on every exit path.
	return VoxUtil::CopyTensor<float>(output);
}

// Release the owned model. Deleting a null pointer is a no-op in C++,
// so no guard is required.
FastSpeech2::~FastSpeech2()
{
	delete FastSpeech;
}