import numpy as np
import torch
import ModelInterfaces


class NeuralASR(ModelInterfaces.IASRModel):
    word_locations_in_samples = None
    audio_transcript = None

    def __init__(self, model: torch.nn.Module, decoder) -> None:
        """
        Initialize the NeuralASR (Automatic Speech Recognition) model.

        Args:
            model (torch.nn.Module): The neural network model for ASR.
            decoder: The decoder to convert CTC outputs to transcripts.
        """
        super().__init__()
        self.model = model
        self.decoder = decoder  # Decoder from CTC-outputs to transcripts

    def getTranscript(self) -> str:
        """
        Get the transcript of the processed audio.

        Returns:
            str: The audio transcript.

        Raises:
            AssertionError: If the audio has not been processed.
        """
        assert self.audio_transcript is not None, 'Cannot get the audio transcript without having processed the audio'
        return self.audio_transcript

    def getWordLocations(self) -> list:
        """
        Get the word locations from the processed audio.

        Returns:
            list: A list of word locations in samples.

        Raises:
            AssertionError: If the audio has not been processed.
        """
        assert self.word_locations_in_samples is not None, 'Cannot get word locations without having processed the audio'
        return self.word_locations_in_samples

    def processAudio(self, audio: torch.Tensor) -> None:
        """
        Process the audio to generate transcripts and word locations.

        Args:
            audio (torch.Tensor): The input audio tensor.
        """
        audio_length_in_samples = audio.shape[1]
        with torch.inference_mode():
            nn_output = self.model(audio)

            self.audio_transcript, self.word_locations_in_samples = self.decoder(
                nn_output[0, :, :].detach(), audio_length_in_samples, word_align=True)
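
# A minimal usage sketch (illustrative only, not part of the original module): NeuralASR assumes
# only that `model(audio)` returns a (batch, time_steps, vocab_size) tensor of CTC outputs and
# that `decoder(logits, n_samples, word_align=True)` returns a (transcript, word_locations) pair.
# The dummy model, dummy decoder, and the word-location dictionary keys below are placeholder
# assumptions used to show the wiring; the project's real acoustic model and CTC decoder would
# be passed in instead.
def _neural_asr_usage_sketch() -> None:
    class _DummyAcousticModel(torch.nn.Module):
        def forward(self, audio: torch.Tensor) -> torch.Tensor:
            # Fake CTC output shaped (batch, time_steps, vocab_size)
            return torch.zeros(audio.shape[0], 10, 32)

    def _dummy_decoder(logits, n_samples, word_align=True):
        # One illustrative word location, expressed in samples
        return 'hello', [{'word': 'hello', 'start_ts': 0, 'end_ts': n_samples}]

    asr = NeuralASR(_DummyAcousticModel(), _dummy_decoder)
    asr.processAudio(torch.zeros(1, 16000))  # mono audio shaped (batch, n_samples)
    print(asr.getTranscript())               # -> 'hello'
    print(asr.getWordLocations())            # -> [{'word': 'hello', 'start_ts': 0, 'end_ts': 16000}]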


class NeuralTTS(ModelInterfaces.ITextToSpeechModel):
    def __init__(self, model: torch.nn.Module, sampling_rate: int) -> None:
        """
        Initialize the NeuralTTS (Text to Speech) model.

        Args:
            model (torch.nn.Module): The neural network model for TTS.
            sampling_rate (int): The sampling rate for the audio.
        """
        super().__init__()
        self.model = model
        self.sampling_rate = sampling_rate

    def getAudioFromSentence(self, sentence: str) -> np.ndarray:
        """
        Generate audio from a given sentence.

        Args:
            sentence (str): The input sentence.

        Returns:
            np.ndarray: The generated audio as a numpy array.
        """
        with torch.inference_mode():
            audio = self.model.apply_tts(texts=[sentence],
                                         sample_rate=self.sampling_rate)[0]

        return audio
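
# A minimal usage sketch (illustrative only, not part of the original module): NeuralTTS assumes
# only the `apply_tts(texts=..., sample_rate=...)` call shown above, which returns one waveform
# per input sentence. The dummy model below is a placeholder standing in for a real TTS
# checkpoint and simply emits one second of silence.
def _neural_tts_usage_sketch() -> None:
    class _DummyTTSModel(torch.nn.Module):
        def apply_tts(self, texts, sample_rate):
            # One second of silence per input sentence
            return [np.zeros(sample_rate) for _ in texts]

    tts = NeuralTTS(_DummyTTSModel(), sampling_rate=16000)
    audio = tts.getAudioFromSentence('hello world')
    print(audio.shape)  # -> (16000,)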


class NeuralTranslator(ModelInterfaces.ITranslationModel):
    def __init__(self, model: torch.nn.Module, tokenizer) -> None:
        """
        Initialize the NeuralTranslator model.

        Args:
            model (torch.nn.Module): The neural network model for translation.
            tokenizer: The tokenizer for text processing.
        """
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer

    def translateSentence(self, sentence: str) -> str:
        """
        Translate a given sentence to the target language.

        Args:
            sentence (str): The input sentence.

        Returns:
            str: The translated sentence.
        """
        tokenized_text = self.tokenizer(sentence, return_tensors='pt')
        translation = self.model.generate(**tokenized_text)
        translated_text = self.tokenizer.batch_decode(
            translation, skip_special_tokens=True)[0]

        return translated_text
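

# A minimal end-to-end sketch (illustrative only, assuming the Hugging Face `transformers`
# package and the public Helsinki-NLP/opus-mt-de-en checkpoint; any seq2seq translation
# checkpoint whose tokenizer supports the tokenize/generate/batch_decode flow used by
# NeuralTranslator above would work the same way).
if __name__ == '__main__':
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    checkpoint = 'Helsinki-NLP/opus-mt-de-en'
    translator = NeuralTranslator(AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
                                  AutoTokenizer.from_pretrained(checkpoint))
    print(translator.translateSentence('Guten Morgen'))  # expected: something like 'Good morning'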