File size: 5,000 Bytes
67c46fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import io
import base64

import librosa
import numpy as np
import math
import torch
import torchaudio
import torchaudio
import sox
import tempfile


def encode_wav(wav, sr, rep_format="wav"):
    """Serialize a waveform to base64-encoded audio bytes.

    Args:
        wav: audio tensor accepted by ``torchaudio.save``.
        sr: sample rate in Hz.
        rep_format: container format name passed to torchaudio (default "wav").

    Returns:
        str: ASCII base64 encoding of the serialized audio bytes.
    """
    buffer = io.BytesIO()
    try:
        # Write the encoded audio into an in-memory buffer instead of a file.
        torchaudio.save(buffer, wav, sr, format=rep_format)
        raw_bytes = buffer.getvalue()
    finally:
        buffer.close()
    return base64.b64encode(raw_bytes).decode("ascii")


def trim_silence(audio, sr, keep_left_time=0.05, keep_right_time=0.22, hop_size=240):
    """Trim leading/trailing silence while keeping short margins on each side.

    Detects the voiced span with ``librosa.effects.trim`` (top_db=20), then
    keeps ``keep_left_time`` seconds before it and pads/truncates the result to
    a whole number of ``hop_size`` frames plus both margins.

    Args:
        audio (np.ndarray): mono waveform.
        sr (int): sample rate in Hz.
        keep_left_time (float): seconds of leading margin to retain.
        keep_right_time (float): seconds of trailing margin to retain.
        hop_size (int): frame hop used to round the voiced length up.

    Returns:
        np.ndarray: trimmed (and possibly zero-padded) waveform.
    """
    _, (voice_start, voice_end) = librosa.effects.trim(
        audio, top_db=20, frame_length=512, hop_length=128
    )
    # Round the voiced span up to a whole number of hops.
    frame_count = int(math.ceil((voice_end - voice_start) / hop_size))

    left_margin = int(keep_left_time * sr)

    begin = voice_start - left_margin
    if begin > 0:
        result = audio[begin:]
    else:
        # Not enough leading audio: zero-pad on the left to supply the margin.
        result = np.pad(audio, (abs(begin), 0), mode="constant", constant_values=0.0)

    target_len = int(frame_count * hop_size + (keep_left_time + keep_right_time) * sr)
    current_len = len(result)
    if target_len < current_len:
        result = result[:target_len]
    else:
        # Zero-pad on the right up to the target length.
        result = np.pad(
            result, (0, target_len - current_len), mode="constant", constant_values=0.0
        )
    return result


def volumn_adjust(audio16bit_torch, sr, volumn_ratio):
    """Adjust audio volume using sox.

    Args:
        audio16bit_torch (Tensor): input audio tensor [1, samples].
        sr (int): sample rate in Hz.
        volumn_ratio (float): gain factor; >1 increases volume, <1 decreases it.

    Returns:
        Tensor: volume-adjusted audio tensor.
    """
    # sox operates on files, so round-trip through two auto-deleted temp WAVs.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as src, \
            tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as dst:
        torchaudio.save(src.name, audio16bit_torch, sr)
        transformer = sox.Transformer()
        transformer.vol(volumn_ratio)  # apply the gain factor
        transformer.build_file(src.name, dst.name)
        adjusted, _ = torchaudio.load(dst.name)
    return adjusted


def speech_adjust(audio16bit_torch, sr, speed_ratio):
    """Change audio playback speed using sox (tempo, pitch-preserving).

    Args:
        audio16bit_torch (Tensor): input audio tensor [1, samples].
        sr (int): sample rate in Hz.
        speed_ratio (float): speed factor; >1 speeds up, <1 slows down.

    Returns:
        Tensor: speed-adjusted audio tensor.
    """
    # sox operates on files, so round-trip through two auto-deleted temp WAVs.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as src, \
            tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as dst:
        torchaudio.save(src.name, audio16bit_torch, sr)
        transformer = sox.Transformer()
        transformer.tempo(speed_ratio)  # tempo change without pitch shift
        transformer.build_file(src.name, dst.name)
        adjusted, _ = torchaudio.load(dst.name)
    return adjusted


def audio_resample(audio16bit_torch, result_sr, target_sample_rate):
    """Resample audio to ``target_sample_rate``.

    Args:
        audio16bit_torch (Tensor): input audio tensor.
        result_sr (int): current sample rate of the input.
        target_sample_rate (int): desired output sample rate.

    Returns:
        tuple: (resampled audio tensor, target_sample_rate).
    """
    resampler = torchaudio.transforms.Resample(
        orig_freq=result_sr, new_freq=target_sample_rate
    )
    return resampler(audio16bit_torch), target_sample_rate


def norm_audio(audio16bit_torch):
    """Peak-normalize a float waveform to full-scale int16.

    Scales the signal so its loudest sample maps to ±32767, then converts
    to ``torch.int16``.

    Args:
        audio16bit_torch (Tensor): float audio tensor (CPU).

    Returns:
        Tensor: int16 tensor of the same shape.
    """
    samples = audio16bit_torch.numpy()
    peak = np.abs(samples).max()
    # Guard against an all-silent input: the original code divided by zero
    # here, producing NaN/inf before the int16 cast.
    if peak == 0:
        return torch.zeros_like(audio16bit_torch, dtype=torch.int16)
    scaled = (samples / peak * 32767).astype(np.int16)
    return torch.from_numpy(scaled)


def resample_audio(wav, original_sample_rate, target_sample_rate):
    """Downsample ``wav`` to ``target_sample_rate`` when the rates differ.

    Only downsampling is supported: when the rates differ, the source rate
    must be strictly greater than the target (AssertionError otherwise).

    Args:
        wav (Tensor): input waveform tensor.
        original_sample_rate (int): sample rate of ``wav``.
        target_sample_rate (int): desired output sample rate.

    Returns:
        Tensor: resampled waveform, or ``wav`` unchanged if rates match.
    """
    # Guard clause: nothing to do when the rates already agree.
    if original_sample_rate == target_sample_rate:
        return wav
    assert (
        original_sample_rate > target_sample_rate
    ), "wav sample rate {} must be greater than {}".format(
        original_sample_rate, target_sample_rate
    )
    resampler = torchaudio.transforms.Resample(
        orig_freq=original_sample_rate, new_freq=target_sample_rate
    )
    return resampler(wav)


def energy_norm_fn(wav):
    """Peak-normalize a waveform to just below full scale (±0.999).

    Works for both numpy arrays and torch tensors. A floor of 0.01 on the
    peak prevents near-silent input from being amplified to full scale
    (and avoids division by zero on pure silence).

    Args:
        wav (np.ndarray | Tensor): input waveform.

    Returns:
        Same type as ``wav``: normalized waveform.
    """
    # isinstance instead of `type(...) is` so ndarray subclasses also take
    # the numpy path; the two branches were otherwise identical.
    if isinstance(wav, np.ndarray):
        peak = np.max(np.abs(wav))
    else:
        peak = torch.max(torch.abs(wav))
    return wav / max(peak, 0.01) * 0.999


def get_audio_tokens(audio_tokens: str) -> list[int]:
    """Parse a concatenated ``<audio_N>`` string into shifted token ids.

    Each ``<audio_N>`` marker becomes ``N + 65536`` (audio ids are offset
    past the 64K mark).

    Args:
        audio_tokens: string like ``"<audio_0><audio_5>"``.

    Returns:
        list[int]: one shifted id per ``<audio_N>`` marker, in order.
    """
    ids = []
    # Splitting on "><audio_" leaves the markup only on the outermost pieces,
    # which the replace() calls strip off.
    for piece in audio_tokens.split("><audio_"):
        digits = piece.replace("<audio_", "").replace(">", "")
        ids.append(int(digits) + 65536)
    return ids


def load_audio(audio_path: str):
    """Load an audio file and downmix it to mono.

    Args:
        audio_path: path to an audio file readable by torchaudio.

    Returns:
        tuple: (mono waveform tensor of shape [1, samples], sample rate).
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    # Average across channels, keeping a leading channel dim of 1.
    mono = waveform.mean(dim=0, keepdim=True)
    return mono, sample_rate