File size: 4,028 Bytes
3943768
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import io
import numpy as np
import pydub

from utils import have_pyrubberband


# Keep non-native package imports out of global space


def get_wave_header(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
    """
    Build an in-memory WAV byte string: a header for the given format followed by `frame_input`.

    Per the original author's note, this belongs at the very start of a streamed wav file;
    later frames should be sent without a header, otherwise artifacts are heard at the
    start of each chunk.

    :param frame_input: raw frame bytes written immediately after the header
    :param channels: number of audio channels
    :param sample_width: sample width in bytes
    :param sample_rate: frame rate in Hz
    :return: bytes holding the wav header plus `frame_input`
    """
    import wave

    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as writer:
        writer.setnchannels(channels)
        writer.setsampwidth(sample_width)
        writer.setframerate(sample_rate)
        writer.writeframes(frame_input)

    # The wave writer has finalized the header on close; hand back the whole buffer.
    return buffer.getvalue()


def prepare_speech(sr=24000):
    """
    Return the header-only wav bytes (no frames) for sample rate `sr`.

    Original author's note: autoplay must be set to True first.

    :param sr: sample rate in Hz passed to get_wave_header
    :return: bytes produced by get_wave_header with empty frame input
    """
    header_only = get_wave_header(sample_rate=sr)
    return header_only


def get_no_audio(return_as_byte=True, return_nonbyte_as_file=False, sr=None):
    """
    Return the "no audio" placeholder in the representation requested.

    :param return_as_byte: if True, the placeholder is empty bytes
    :param return_nonbyte_as_file: when not returning bytes, if True the placeholder is None
    :param sr: sample rate; required (asserted) only for the (sr, array) form
    :return: b"", None, or a tuple (sr, empty np.int16 array)
    """
    if return_as_byte:
        return b""
    if return_nonbyte_as_file:
        return None

    # Tuple form carries the sample rate alongside the samples, so it must be known.
    assert sr is not None
    empty_samples = np.array([], dtype=np.int16)
    return sr, empty_samples


def combine_audios(audios, audio=None, channels=1, sample_width=2, sr=24000, expect_bytes=True, verbose=False):
    """
    Concatenate raw audio chunks (`audios` in order, followed by `audio`) into one stream.

    Entries that are None or an empty string are skipped, matching the values the
    up-front "is there any audio at all" check already treats as empty.

    :param audios: sequence of raw audio chunks; None is treated as an empty list
    :param audio: optional trailing chunk; asserted to be None or bytes/bytearray
    :param channels: channel count passed to pydub's AudioSegment.from_raw
    :param sample_width: bytes per sample passed to AudioSegment.from_raw
    :param sr: frame rate passed to AudioSegment.from_raw
    :param expect_bytes: force byte handling even if the inputs are not detected as bytes
    :param verbose: print begin/end markers
    :return: the empty placeholder from get_no_audio when nothing is non-empty; raw bytes of
        the concatenation when handled as bytes; otherwise the combined pydub AudioSegment
    """
    no_audio = get_no_audio(sr=sr)
    if audios is None:
        # Tolerate a missing list rather than failing when iterating below.
        audios = []
    empty_values = [no_audio, None, '']
    have_audio = any(x not in empty_values for x in audios) or audio not in empty_values
    if not have_audio:
        return no_audio

    if audio or audios:
        if verbose:
            print("begin combine audios")
        is_bytes = expect_bytes  # force default as bytes no matter input if know should have been bytes
        if audios:
            is_bytes |= isinstance(audios[0], (bytes, bytearray))
        if audio:
            is_bytes |= isinstance(audio, (bytes, bytearray))
        assert audio is None or isinstance(audio, (bytes, bytearray))
        from pydub import AudioSegment
        combined_wav = AudioSegment.empty()
        # Same order as before: every entry of `audios`, then the trailing `audio`.
        for x in list(audios) + [audio]:
            if x is None or (isinstance(x, str) and not x):
                # '' counts as "no audio" above; feeding it to io.BytesIO would raise TypeError.
                continue
            s = io.BytesIO(x) if is_bytes else x
            combined_wav += AudioSegment.from_raw(s, sample_width=sample_width, frame_rate=sr, channels=channels)
        if is_bytes:
            combined_wav = combined_wav.export(format='raw').read()
        if verbose:
            print("end1 combine audios")
        return combined_wav
    # audio just empty stream, but not None, else would nuke audio
    if verbose:
        print("end2 combine audios")
    return audio


def chunk_speed_change(chunk, sr, tts_speed=1.0):
    """
    Change the playback speed of an audio chunk by the factor `tts_speed`.

    Behavior by path:
      * tts_speed == 1.0: `chunk` is returned untouched.
      * pyrubberband available: `chunk` is time-stretched, scaled by 32767 and returned
        as an np.int16 array. NOTE(review): this presumes `chunk` is a float array
        scaled to [-1, 1] -- confirm against callers.
      * no pyrubberband and tts_speed < 1.0: slow-down is not implemented, `chunk` is
        returned untouched.
      * no pyrubberband and tts_speed > 1.0: `chunk` is read as raw mono 16-bit PCM bytes,
        sped up with pydub, and returned as an np.int16 array via pydub_to_np.

    :param chunk: audio data (type expected depends on the path taken, see above)
    :param sr: sample rate in Hz
    :param tts_speed: speed factor; 1.0 means unchanged
    :return: the original chunk or an np.int16 array, depending on the path
    """
    if tts_speed == 1.0:
        return chunk

    if have_pyrubberband:
        import pyrubberband as pyrb
        stretched = pyrb.time_stretch(chunk, sr, tts_speed)
        # Clip before casting: stretched samples can overshoot +/-1.0, and an unclipped
        # float -> int16 cast would wrap around into loud artifacts.
        return np.clip(stretched * 32767, -32768, 32767).astype(np.int16)

    if tts_speed < 1.0:
        # Slow-down without pyrubberband is not supported; pass the chunk through.
        return chunk

    # speed-up
    from pydub import AudioSegment
    from pydub.effects import speedup

    channels = 1
    sample_width = 2
    segment = AudioSegment.from_raw(io.BytesIO(chunk), sample_width=sample_width, frame_rate=sr,
                                    channels=channels)
    # 150 is the chunk size (in ms) handed to pydub's speedup
    return pydub_to_np(speedup(segment, tts_speed, 150))


def pydub_to_np(audio: "pydub.AudioSegment") -> np.ndarray:
    """
    Converts pydub audio segment into np.int16 of shape [duration_in_seconds*sample_rate, channels].

    Only the sample array is returned (no sample rate). Samples are taken from
    `audio.get_array_of_samples()` and split into one column per channel.
    NOTE(review): values are stored as int16, so this presumably expects 16-bit audio -- confirm.

    :param audio: object exposing `get_array_of_samples()` and `channels` (a pydub AudioSegment)
    :return: np.int16 array with one row per frame and one column per channel
    """
    samples = audio.get_array_of_samples()
    return np.array(samples, dtype=np.int16).reshape((-1, audio.channels))