HoneyTian commited on
Commit
a8c8d73
·
0 Parent(s):

first commit

Browse files
Files changed (49) hide show
  1. .gitattributes +36 -0
  2. .gitignore +13 -0
  3. Dockerfile +24 -0
  4. README.md +11 -0
  5. data/examples/default/audio_0_2.wav +3 -0
  6. data/examples/default/audio_0_3_clone_from_audio_0_2.wav +3 -0
  7. data/examples/default/audio_0_5_clone_from_audio_0_2.wav +3 -0
  8. data/examples/default/audio_1_3_clone_from_audio_0_2.wav +3 -0
  9. data/examples/default/audio_2_3_clone_from_audio_0_2.wav +3 -0
  10. data/examples/default/audio_3_3_clone_from_audio_0_2.wav +3 -0
  11. data/examples/default/audio_4_3_clone_from_audio_0_2.wav +3 -0
  12. data/examples/default/audio_5_3_clone_from_audio_0_2.wav +3 -0
  13. data/examples/default/audio_6_3_clone_from_audio_0_2.wav +3 -0
  14. data/examples/default/audio_7_3_clone_from_audio_0_2.wav +3 -0
  15. data/examples/default/audio_8_3_clone_from_audio_0_2.wav +3 -0
  16. data/examples/default/audio_9_3_clone_from_audio_0_2.wav +3 -0
  17. data/examples/mix/noise/0001f9f2-3626-427f-8ae5-105d81fcb5a3_th-TH_1678772646723.wav +3 -0
  18. data/examples/mix/noise/000e2a2e-43c8-4752-8e26-34207fa6e9e4_th-TH_1678244573769.wav +3 -0
  19. data/examples/mix/noise/000f28d7-2129-49d5-9942-16ebf60e8285_th-TH_1678343313388.wav +3 -0
  20. data/examples/mix/noise/00240453-cd58-4059-9a38-d00583b879c7_th-TH_1678168729318.wav +3 -0
  21. data/examples/mix/speech/0000c655-3a8e-4196-bc31-c01fa8d115cc_th-TH_1678768644585.wav +3 -0
  22. data/examples/mix/speech/000f62f5-5b05-4494-a8db-0eaca3ebd871_th-TH_1678353399860.wav +3 -0
  23. data/examples/mix/speech/001df4d1-9f7a-4e78-adc9-ef26d07eba60_th-TH_1667878032.0303788.wav +3 -0
  24. data/examples/mix/speech/001ef59d-b266-4409-b89c-627e3d7fb27d_th-TH_1678356022482.wav +3 -0
  25. examples/audio_fmt_convert.py +48 -0
  26. examples/batch_audio_fmt_convert.py +61 -0
  27. examples/concat/test1.py +48 -0
  28. examples/concat/test2.py +51 -0
  29. examples/sound_play_speed/by_pydub.py +51 -0
  30. examples/sound_speed/by_audiostretchy.py +39 -0
  31. examples/sound_speed/by_audiotsm.py +48 -0
  32. examples/sound_speed/by_librosa.py +52 -0
  33. examples/sound_speed/by_pydub.py +47 -0
  34. examples/sound_volume/by_ffmpy_by_db.py +56 -0
  35. examples/sound_volume/by_ffmpy_by_radio.py +56 -0
  36. examples/sound_volume/by_numpy_by_db.py +65 -0
  37. examples/sound_volume/by_numpy_by_radio.py +63 -0
  38. examples/sound_volume/by_pydub_by_db.py +40 -0
  39. examples/sound_volume/by_pydub_by_reference.py +47 -0
  40. main.py +473 -0
  41. project_settings.py +12 -0
  42. requirements.txt +7 -0
  43. toolbox/__init__.py +6 -0
  44. toolbox/audio_edit/__init__.py +6 -0
  45. toolbox/audio_edit/augment.py +45 -0
  46. toolbox/audio_edit/convert.py +106 -0
  47. toolbox/audio_edit/info.py +121 -0
  48. toolbox/audio_edit/speech_speed.py +130 -0
  49. toolbox/audio_edit/volume.py +139 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ .git/
3
+ .idea/
4
+
5
+ #/data/
6
+ /dotenv/
7
+ /logs/
8
+ /trained_models
9
+ /temp/
10
+
11
+ **/__pycache__/
12
+
13
+ #**/*.wav
Dockerfile ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.12-slim
2
+
3
+ WORKDIR /code
4
+
5
+ COPY . /code
6
+
7
# Refresh the package index and install ffmpeg in a single layer: with two
# separate RUN steps a cached `apt-get update` layer can be reused with a
# later install step and point at packages that no longer exist.
RUN apt-get update \
    && apt-get install -y ffmpeg \
    && rm -rf /var/lib/apt/lists/*
9
+
10
+ RUN pip install --upgrade pip
11
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
12
+
13
+ RUN useradd -m -u 1000 user
14
+
15
+ USER user
16
+
17
+ ENV HOME=/home/user \
18
+ PATH=/home/user/.local/bin:$PATH
19
+
20
+ WORKDIR $HOME/app
21
+
22
+ COPY --chown=user . $HOME/app
23
+
24
+ CMD ["python3", "main.py"]
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Audio Edit
3
+ emoji: 🐨
4
+ colorFrom: purple
5
+ colorTo: pink
6
+ sdk: docker
7
+ pinned: false
8
+ license: apache-2.0
9
+ ---
10
+
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
data/examples/default/audio_0_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5f5101df0c899f2beebc9de948c80a64e36c2fc9d38420879fb1c093dc5e961
3
+ size 1071894
data/examples/default/audio_0_3_clone_from_audio_0_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bda67fec70dfc965c2b1b52d9fae03ec3f5086eacad0285342c5850b78eb10d
3
+ size 121388
data/examples/default/audio_0_5_clone_from_audio_0_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54598a92c876efa14c108dc83a102a9d200de1a8f7a2df9e0fc1c627acddec12
3
+ size 28204
data/examples/default/audio_1_3_clone_from_audio_0_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e3fe23973ff066845ff6f8d75338f4cb9249e084e3523365868154fa8421075
3
+ size 150572
data/examples/default/audio_2_3_clone_from_audio_0_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d779c52da6bd16fabed43d9ba065f3a16c400f241f6e05511c9becfd1e2695e
3
+ size 107052
data/examples/default/audio_3_3_clone_from_audio_0_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1c20584ae6e2e83af86f1f79d053847fe5db0249e6ced2541ddb914187e468d
3
+ size 153644
data/examples/default/audio_4_3_clone_from_audio_0_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a3944f4f27ea33fddbc13a0aeae2261051a68636316dcebda03e8f1eeb823e1
3
+ size 159276
data/examples/default/audio_5_3_clone_from_audio_0_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e870e9de7d931eb1da038815e63f6094b2a19dede5c808d64a851803471357f9
3
+ size 147500
data/examples/default/audio_6_3_clone_from_audio_0_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:feaaccc3b090dab7f76d798ca77922206111cfd9c852c0213444e1ea6e7541c3
3
+ size 167980
data/examples/default/audio_7_3_clone_from_audio_0_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4dce5a9a5ba5b5f1a05eb095f762238018784bee96fa66e1714be1e819164f3
3
+ size 165420
data/examples/default/audio_8_3_clone_from_audio_0_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3a714c542b99438898740041f86082983c2988c73c113ceea68f6af90f99efc
3
+ size 159276
data/examples/default/audio_9_3_clone_from_audio_0_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dca1e7538e2cd44702b112d59fad5cf6327005059122f6d1e549c9005ce06180
3
+ size 103980
data/examples/mix/noise/0001f9f2-3626-427f-8ae5-105d81fcb5a3_th-TH_1678772646723.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bac05176ef3031404f6a619ad67dbf10bc0ebea09ab8f21ec5095de5fd5562b
3
+ size 32044
data/examples/mix/noise/000e2a2e-43c8-4752-8e26-34207fa6e9e4_th-TH_1678244573769.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d71ed84f9a586b92563a78ec41682479dd5fdc65f5b72b3b5fbd7a7cb54a853c
3
+ size 32044
data/examples/mix/noise/000f28d7-2129-49d5-9942-16ebf60e8285_th-TH_1678343313388.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:447023f56f3f3b345885de15e8dceec8799a465e924209133cd3cdc761bcc297
3
+ size 32044
data/examples/mix/noise/00240453-cd58-4059-9a38-d00583b879c7_th-TH_1678168729318.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50e29bc4c2ea9282efcbcd326dd71905caeb8ceafbd718cd535b1d7e65bec508
3
+ size 32044
data/examples/mix/speech/0000c655-3a8e-4196-bc31-c01fa8d115cc_th-TH_1678768644585.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fab119763be2e0aa39e92e3dbd90958ee88d9321d5a51a40c4fc663b8f673a73
3
+ size 32044
data/examples/mix/speech/000f62f5-5b05-4494-a8db-0eaca3ebd871_th-TH_1678353399860.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f60650ed15d659658bffc77c7ec380be370e78a170b3140906f14b2d89aa85e0
3
+ size 32044
data/examples/mix/speech/001df4d1-9f7a-4e78-adc9-ef26d07eba60_th-TH_1667878032.0303788.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:058ea8a79e4c1328614c2ccacb84b3bb9f9e32def909f654db3da052f137d36e
3
+ size 32044
data/examples/mix/speech/001ef59d-b266-4409-b89c-627e3d7fb27d_th-TH_1678356022482.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fe593e08b014b7fab4f368f7d0f84a45eed7fbc79b9bad4d005b1386cc4904c
3
+ size 32044
examples/audio_fmt_convert.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+
5
+ import librosa
6
+ import numpy as np
7
+ from scipy.io import wavfile
8
+
9
+ from project_settings import project_path
10
+
11
+
12
def get_args():
    """Parse the command-line options of the single-file conversion example.

    Returns:
        argparse.Namespace: carries ``filename`` (audio file to read) and
        ``output_file`` (path of the WAV file to write).
    """
    default_input = project_path / "data/常相伴AI配音9.10/常相伴AI配音9.10/台湾-女声/1.m4a"

    cli = argparse.ArgumentParser()
    cli.add_argument("--filename", type=str, default=default_input.as_posix())
    cli.add_argument("--output_file", type=str, default="temp.wav")
    return cli.parse_args()
26
+
27
+
28
def main():
    """Convert one audio file to an 8000 Hz, 16-bit PCM WAV file.

    The input is decoded with ``librosa.load`` at ``sr=8000``, scaled by
    32768 and written with ``scipy.io.wavfile.write``.
    """
    args = get_args()

    max_wave_value = 32768.0
    int16_info = np.iinfo(np.int16)

    signal, sample_rate = librosa.load(args.filename, sr=8000)
    # Saturate before the cast: a sample at or above +1.0 scales to >= 32768,
    # which does not fit in int16 and would otherwise wrap around.
    signal = np.clip(signal * max_wave_value, int16_info.min, int16_info.max)
    signal = signal.astype(np.int16)
    print(signal.dtype)
    print(sample_rate)

    wavfile.write(
        args.output_file,
        sample_rate,
        signal,
    )
    return
45
+
46
+
47
+ if __name__ == "__main__":
48
+ main()
examples/batch_audio_fmt_convert.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+
5
+ import librosa
6
+ import numpy as np
7
+ from pathlib import Path
8
+ from scipy.io import wavfile
9
+ from tqdm import tqdm
10
+
11
+ from project_settings import project_path
12
+
13
+
14
def get_args():
    """Parse the command-line options of the batch conversion example.

    Returns:
        argparse.Namespace: carries ``audio_dir`` (directory searched for
        input files) and ``output_dir`` (directory that receives the results).
    """
    default_audio_dir = project_path / "data/yd"
    default_output_dir = project_path / "data/temp_wav"

    cli = argparse.ArgumentParser()
    cli.add_argument("--audio_dir", type=str, default=default_audio_dir.as_posix())
    cli.add_argument("--output_dir", type=str, default=default_output_dir.as_posix())
    return cli.parse_args()
28
+
29
+
30
def main():
    """Re-encode every ``*.wav`` below ``--audio_dir`` as 8000 Hz int16 WAV.

    Files are found recursively and written under ``--output_dir`` with the
    same relative directory layout and the same base name.
    """
    args = get_args()

    audio_dir = Path(args.audio_dir)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    max_wave_value = 32768.0
    int16_info = np.iinfo(np.int16)

    for filename in tqdm(audio_dir.glob("**/*.wav")):
        basename = filename.stem
        relative_dir = filename.parent.relative_to(audio_dir)

        signal, sample_rate = librosa.load(filename, sr=8000)
        # Saturate before the cast: a sample at or above +1.0 scales to
        # >= 32768, which does not fit in int16 and would wrap around.
        signal = np.clip(signal * max_wave_value, int16_info.min, int16_info.max)
        signal = signal.astype(np.int16)

        output_filename = output_dir / relative_dir / f"{basename}.wav"
        output_filename.parent.mkdir(parents=True, exist_ok=True)
        wavfile.write(
            output_filename.as_posix(),
            sample_rate,
            signal,
        )
    return
58
+
59
+
60
+ if __name__ == "__main__":
61
+ main()
examples/concat/test1.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+
5
+ import os
6
+ from ffmpy import FFmpeg
7
+
8
+
9
def get_args():
    """Parse the command-line options of this volume-change script.

    Returns:
        argparse.Namespace: carries ``change_by_db``, the gain in decibels
        applied to every clip (default ``-11``).
    """
    parser = argparse.ArgumentParser()
    # float instead of int so fractional gains such as -2.5 dB are accepted;
    # whole-number values given on the command line keep working.
    parser.add_argument("--change_by_db", default=-11, type=float)
    args = parser.parse_args()
    return args
14
+
15
+
16
def change_by_decibel(audio_path: str, output_file: str, decibel):
    """Write a copy of ``audio_path`` with its volume shifted by ``decibel`` dB.

    The work is delegated to ffmpeg's ``volume`` audio filter through ffmpy.

    Args:
        audio_path: input file; its extension must be ``wav`` or ``mp3``
            (compared case-insensitively).
        output_file: destination path; an existing file there is removed first.
        decibel: gain in dB, interpolated into the filter as ``volume={decibel}dB``.

    Returns:
        ``output_file``, unchanged.

    Raises:
        ValueError: if the extension is unsupported, or if ``output_file``
            refers to the same path as ``audio_path``.
    """
    ext = os.path.splitext(os.path.basename(audio_path).strip())[1].lstrip(".").lower()
    if ext not in ("wav", "mp3"):
        raise ValueError(f"format error: unsupported audio extension {ext!r}")
    # The output is deleted before ffmpeg runs, so writing onto the input
    # would destroy the source file before it is ever read.
    if os.path.abspath(audio_path) == os.path.abspath(output_file):
        raise ValueError("format error: output_file must differ from audio_path")
    if os.path.exists(output_file):
        os.remove(output_file)
    ff = FFmpeg(
        inputs={audio_path: None},
        outputs={output_file: f'-filter:a "volume={decibel}dB"'},
    )
    ff.run()
    return output_file
28
+
29
+
30
def main():
    """Apply the configured dB change to the ten numbered clone clips.

    For each index 0-9 the source clip is passed to ``change_by_decibel`` and
    the result is written next to it with a ``_volume`` suffix.
    """
    options = get_args()

    source_template = r"C:\Users\tianx\Desktop\Audio\open_voice_v2\audio_{}_3_clone_from_audio_0_2.wav"
    target_template = r"C:\Users\tianx\Desktop\Audio\open_voice_v2\audio_{}_3_clone_from_audio_0_2_volume.wav"

    for clip_index in range(10):
        change_by_decibel(
            source_template.format(clip_index),
            target_template.format(clip_index),
            options.change_by_db,
        )
45
+
46
+
47
+ if __name__ == "__main__":
48
+ main()
examples/concat/test2.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import librosa
4
+ import numpy as np
5
+ from scipy.io import wavfile
6
+
7
+
8
# Concatenate a fixed first section, one of ten volume-adjusted clips and a
# fixed second section, then save each result as an 8000 Hz int16 WAV.
filename1 = r"C:\Users\tianx\Desktop\Audio\x_tts_v2\audio_0_section_1.wav"
filename3 = r"C:\Users\tianx\Desktop\Audio\x_tts_v2\audio_0_section_2.wav"

# The two outer sections are identical for every clip, so decode them once
# instead of once per loop iteration.
signal1, sample_rate1 = librosa.load(filename1, sr=8000)
signal3, sample_rate3 = librosa.load(filename3, sr=8000)

max_wave_value = 32768.0
int16_info = np.iinfo(np.int16)

for i in range(10):
    filename2 = r"C:\Users\tianx\Desktop\Audio\open_voice_v2\audio_{}_3_clone_from_audio_0_2_volume.wav".format(i)
    output_filename = r"C:\Users\tianx\Desktop\Audio\open_voice_v2\audio_{}_3_clone_from_audio_0_2_concat_volume.wav".format(i)

    print(sample_rate1)
    print(signal1.dtype)
    print(signal1.shape)

    signal2, sample_rate = librosa.load(filename2, sr=8000)

    print(sample_rate)
    print(signal2.dtype)
    print(signal2.shape)

    sample_rate = sample_rate3

    print(sample_rate)
    print(signal3.dtype)
    print(signal3.shape)

    signal = np.concatenate([signal1, signal2, signal3], dtype=np.float32)
    print(signal.dtype)
    print(signal.shape)

    # Saturate before the cast: a sample at or above +1.0 scales to >= 32768,
    # which does not fit in int16 and would wrap around.
    signal = np.clip(signal * max_wave_value, int16_info.min, int16_info.max)
    signal = signal.astype(np.int16)
    print(signal.dtype)
    print(sample_rate)

    wavfile.write(
        output_filename,
        8000,
        signal,
    )
48
+
49
+
50
+ if __name__ == '__main__':
51
+ pass
examples/sound_play_speed/by_pydub.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ 这调整的是播放速度, 会改变声音的频率.
5
+ """
6
+ import argparse
7
+ import os.path
8
+
9
+ from pydub import AudioSegment
10
+ from pydub.playback import play
11
+
12
+ from project_settings import project_path
13
+
14
+
15
def get_args():
    """Parse the command-line options of the playback-speed example.

    Returns:
        argparse.Namespace: carries ``filename`` (input WAV), ``output_file``
        (destination WAV) and ``speed`` (frame-rate multiplier, default 1.3).
    """
    default_input = project_path / "data/voice_clone_audio/e2_tts/audio_0_3_clone_from_audio_0_2.wav"

    cli = argparse.ArgumentParser()
    cli.add_argument("--filename", type=str, default=default_input.as_posix())
    cli.add_argument("--output_file", type=str, default="temp.wav")
    cli.add_argument("--speed", type=float, default=1.3)
    return cli.parse_args()
30
+
31
+
32
def main():
    """Change the playback speed of a WAV file by relabelling its frame rate.

    The raw samples are re-wrapped with a frame rate multiplied by
    ``--speed`` and then converted back to the original frame rate before
    export.
    """
    args = get_args()

    sound = AudioSegment.from_wav(args.filename)
    altered_sound = sound._spawn(sound.raw_data, overrides={
        "frame_rate": int(sound.frame_rate * args.speed)
    })
    # set_frame_rate returns a new segment rather than modifying in place;
    # the original discarded it, so the export kept the scaled frame rate.
    altered_sound = altered_sound.set_frame_rate(sound.frame_rate)

    # play(altered_sound)

    if os.path.exists(args.output_file):
        os.remove(args.output_file)
    altered_sound.export(args.output_file, format="wav")

    return
48
+
49
+
50
+ if __name__ == "__main__":
51
+ main()
examples/sound_speed/by_audiostretchy.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ 这调整的是播放速度, 会改变声音的频率.
5
+ """
6
+ import argparse
7
+
8
+ from audiostretchy.stretch import stretch_audio
9
+
10
+ from project_settings import project_path
11
+
12
+
13
def get_args():
    """Parse the command-line options of the audiostretchy example.

    Returns:
        argparse.Namespace: carries ``filename`` (input audio), ``output_file``
        (destination path) and ``ratio`` (stretch ratio, default 1.5).
    """
    default_input = project_path / "data/voice_clone_audio/e2_tts/audio_0_3_clone_from_audio_0_2.wav"

    cli = argparse.ArgumentParser()
    cli.add_argument("--filename", type=str, default=default_input.as_posix())
    cli.add_argument("--output_file", type=str, default="temp.wav")
    cli.add_argument("--ratio", type=float, default=1.5)
    return cli.parse_args()
28
+
29
+
30
def main():
    """Stretch the input file by ``--ratio`` and write it to ``--output_file``."""
    options = get_args()
    stretch_audio(
        options.filename,
        options.output_file,
        ratio=options.ratio,
    )
36
+
37
+
38
+ if __name__ == "__main__":
39
+ main()
examples/sound_speed/by_audiotsm.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import os.path
5
+
6
+ import audiotsm
7
+ import audiotsm.io.wav
8
+ import audiotsm.io.array
9
+
10
+ from project_settings import project_path
11
+
12
+
13
def get_args():
    """Parse the command-line options of the audiotsm example.

    Returns:
        argparse.Namespace: carries ``filename`` (input WAV), ``output_file``
        (destination WAV) and ``speed`` (speed factor, default 1.1).
    """
    default_input = project_path / "data/voice_clone_audio/e2_tts/audio_0_3_clone_from_audio_0_2.wav"

    cli = argparse.ArgumentParser()
    cli.add_argument("--filename", type=str, default=default_input.as_posix())
    cli.add_argument("--output_file", type=str, default="temp.wav")
    cli.add_argument("--speed", type=float, default=1.1)
    return cli.parse_args()
28
+
29
+
30
def main():
    """Time-scale a WAV file with audiotsm's WSOLA procedure.

    The output is written with the same channel count and sample rate as the
    input; ``--speed`` is passed to ``audiotsm.wsola``.
    """
    args = get_args()

    # Context managers close both files even if the run raises; the writer
    # is closed first and the reader second, as in the original.
    with audiotsm.io.wav.WavReader(args.filename) as reader:
        with audiotsm.io.wav.WavWriter(args.output_file, reader.channels, reader.samplerate) as writer:
            # Time-scale with the WSOLA algorithm.
            wsola = audiotsm.wsola(reader.channels, speed=args.speed)
            wsola.run(reader, writer)
    return
45
+
46
+
47
+ if __name__ == "__main__":
48
+ main()
examples/sound_speed/by_librosa.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import os.path
5
+
6
+ import librosa
7
+ import numpy as np
8
+ from scipy.io import wavfile
9
+
10
+ from project_settings import project_path
11
+
12
+
13
def get_args():
    """Parse the command-line options of the librosa time-stretch example.

    Returns:
        argparse.Namespace: carries ``filename`` (input audio), ``output_file``
        (destination WAV) and ``speed`` (stretch rate, default 1.1).
    """
    default_input = project_path / "data/voice_clone_audio/e2_tts/audio_0_3_clone_from_audio_0_2.wav"

    cli = argparse.ArgumentParser()
    cli.add_argument("--filename", type=str, default=default_input.as_posix())
    cli.add_argument("--output_file", type=str, default="temp.wav")
    cli.add_argument("--speed", type=float, default=1.1)
    return cli.parse_args()
28
+
29
+
30
def main():
    """Time-stretch an audio file with librosa and save it as int16 WAV.

    The file is loaded at its native sample rate (``sr=None``), stretched with
    ``librosa.effects.time_stretch`` at rate ``--speed``, scaled by 32768 and
    written with the same sample rate.
    """
    args = get_args()

    signal, sample_rate = librosa.load(args.filename, sr=None)
    signal_ = librosa.effects.time_stretch(signal, rate=args.speed)

    max_wave_value = 1 << 15
    int16_info = np.iinfo(np.int16)

    # Saturate before the cast: a sample at or above +1.0 scales to >= 32768,
    # which does not fit in int16 and would wrap around.
    signal_ = np.clip(signal_ * max_wave_value, int16_info.min, int16_info.max)
    signal_ = signal_.astype(np.int16)

    wavfile.write(
        filename=args.output_file,
        rate=sample_rate,
        data=signal_,
    )
    return
49
+
50
+
51
+ if __name__ == "__main__":
52
+ main()
examples/sound_speed/by_pydub.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ 这调整的是播放速度, 会改变声音的频率.
5
+ """
6
+ import argparse
7
+ import os.path
8
+
9
+ from pydub import AudioSegment
10
+ from pydub.playback import play
11
+
12
+ from project_settings import project_path
13
+
14
+
15
def get_args():
    """Parse the command-line options of the pydub speed-up example.

    Returns:
        argparse.Namespace: carries ``filename`` (input WAV), ``output_file``
        (destination WAV) and ``speed`` (playback speed, default 1.5).
    """
    default_input = project_path / "data/voice_clone_audio/e2_tts/audio_0_3_clone_from_audio_0_2.wav"

    cli = argparse.ArgumentParser()
    cli.add_argument("--filename", type=str, default=default_input.as_posix())
    cli.add_argument("--output_file", type=str, default="temp.wav")
    cli.add_argument("--speed", type=float, default=1.5)
    return cli.parse_args()
30
+
31
+
32
def main():
    """Speed up a WAV file with ``AudioSegment.speedup`` and export it.

    Any file already present at ``--output_file`` is removed before export.
    """
    options = get_args()
    destination = options.output_file

    original = AudioSegment.from_wav(options.filename)
    faster = original.speedup(playback_speed=options.speed)

    if os.path.exists(destination):
        os.remove(destination)
    faster.export(destination, format="wav")
44
+
45
+
46
+ if __name__ == "__main__":
47
+ main()
examples/sound_volume/by_ffmpy_by_db.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+
5
+ import os
6
+ from ffmpy import FFmpeg
7
+
8
+ from project_settings import project_path
9
+
10
+
11
def get_args():
    """Parse the command-line options of the ffmpeg dB volume example.

    Returns:
        argparse.Namespace: carries ``filename`` (input audio), ``output_file``
        (destination path) and ``change_by_db`` (gain in dB, default ``-10``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--filename",
        default=(project_path / "data/voice_clone_audio/e2_tts/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        type=str,
    )
    parser.add_argument(
        "--output_file",
        default="temp.wav",
        type=str,
    )
    # float instead of int so fractional gains such as -2.5 dB are accepted;
    # whole-number values given on the command line keep working.
    parser.add_argument("--change_by_db", default=-10, type=float)
    args = parser.parse_args()
    return args
26
+
27
+
28
def change_by_decibel(audio_path: str, output_file: str, decibel):
    """Write a copy of ``audio_path`` with its volume shifted by ``decibel`` dB.

    The work is delegated to ffmpeg's ``volume`` audio filter through ffmpy.

    Args:
        audio_path: input file; its extension must be ``wav`` or ``mp3``
            (compared case-insensitively).
        output_file: destination path; an existing file there is removed first.
        decibel: gain in dB, interpolated into the filter as ``volume={decibel}dB``.

    Returns:
        ``output_file``, unchanged.

    Raises:
        ValueError: if the extension is unsupported, or if ``output_file``
            refers to the same path as ``audio_path``.
    """
    ext = os.path.splitext(os.path.basename(audio_path).strip())[1].lstrip(".").lower()
    if ext not in ("wav", "mp3"):
        raise ValueError(f"format error: unsupported audio extension {ext!r}")
    # The output is deleted before ffmpeg runs, so writing onto the input
    # would destroy the source file before it is ever read.
    if os.path.abspath(audio_path) == os.path.abspath(output_file):
        raise ValueError("format error: output_file must differ from audio_path")
    if os.path.exists(output_file):
        os.remove(output_file)
    ff = FFmpeg(
        inputs={audio_path: None},
        outputs={output_file: f'-filter:a "volume={decibel}dB"'},
    )
    ff.run()
    return output_file
40
+
41
+
42
def main():
    """Apply the configured dB change to one file and print the output path."""
    options = get_args()
    written_path = change_by_decibel(
        options.filename,
        options.output_file,
        options.change_by_db,
    )
    print(f"output_file: {written_path}")
53
+
54
+
55
+ if __name__ == "__main__":
56
+ main()
examples/sound_volume/by_ffmpy_by_radio.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+
5
+ import os
6
+ from ffmpy import FFmpeg
7
+
8
+ from project_settings import project_path
9
+
10
+
11
def get_args():
    """Parse the command-line options of the ffmpeg ratio volume example.

    Returns:
        argparse.Namespace: carries ``filename`` (input audio), ``output_file``
        (destination path) and ``change_by_radio`` (volume multiplier,
        default 0.5).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--filename",
        default=(project_path / "data/voice_clone_audio/e2_tts/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        type=str,
    )
    parser.add_argument(
        "--output_file",
        default="temp.wav",
        type=str,
    )
    # "radio" is a misspelling of "ratio". The original flag and attribute
    # name are kept for existing callers; the correct spelling is an alias.
    parser.add_argument(
        "--change_by_radio",
        "--change_by_ratio",
        dest="change_by_radio",
        default=0.5,
        type=float,
    )
    args = parser.parse_args()
    return args
26
+
27
+
28
def change_by_ratio(audio_path: str, output_file: str, ratio):
    """Write a copy of ``audio_path`` with its volume multiplied by ``ratio``.

    The work is delegated to ffmpeg's ``volume`` audio filter through ffmpy.

    Args:
        audio_path: input file; its extension must be ``wav`` or ``mp3``
            (compared case-insensitively).
        output_file: destination path; an existing file there is removed first.
        ratio: multiplier interpolated into the filter as ``volume={ratio}``.

    Returns:
        ``output_file``, unchanged.

    Raises:
        ValueError: if the extension is unsupported, or if ``output_file``
            refers to the same path as ``audio_path``.
    """
    ext = os.path.splitext(os.path.basename(audio_path).strip())[1].lstrip(".").lower()
    if ext not in ("wav", "mp3"):
        raise ValueError(f"format error: unsupported audio extension {ext!r}")
    # The output is deleted before ffmpeg runs, so writing onto the input
    # would destroy the source file before it is ever read.
    if os.path.abspath(audio_path) == os.path.abspath(output_file):
        raise ValueError("format error: output_file must differ from audio_path")
    if os.path.exists(output_file):
        os.remove(output_file)
    ff = FFmpeg(
        inputs={audio_path: None},
        outputs={output_file: f'-filter:a "volume={ratio}"'},
    )
    ff.run()
    return output_file
40
+
41
+
42
def main():
    """Apply the configured volume ratio to one file and print the output path."""
    options = get_args()
    written_path = change_by_ratio(
        options.filename,
        options.output_file,
        options.change_by_radio,
    )
    print(f"output_file: {written_path}")
53
+
54
+
55
+ if __name__ == "__main__":
56
+ main()
examples/sound_volume/by_numpy_by_db.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ 输出的音频有噪音.
5
+ """
6
+ import argparse
7
+
8
+ import librosa
9
+ import numpy as np
10
+ from scipy.io import wavfile
11
+
12
+ from project_settings import project_path
13
+
14
+
15
def get_args():
    """Parse the command-line options of the numpy dB volume example.

    Returns:
        argparse.Namespace: carries ``filename`` (input audio), ``output_file``
        (destination WAV) and ``change_by_db`` (gain in dB, default ``-10``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--filename",
        default=(project_path / "data/voice_clone_audio/e2_tts/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        type=str,
    )
    parser.add_argument(
        "--output_file",
        default="temp.wav",
        type=str,
    )
    # float instead of int so fractional gains such as -2.5 dB are accepted;
    # whole-number values given on the command line keep working.
    parser.add_argument("--change_by_db", default=-10, type=float)
    args = parser.parse_args()
    return args
30
+
31
+
32
def change_by_db(signal: np.ndarray, db: int = -10):
    """Scale ``signal`` by a gain given in decibels.

    Args:
        signal: samples to scale.
            # assumes floats normalised to [-1.0, 1.0] - TODO confirm
        db: gain in dB; converted with ``amplitude = 10 ** (db / 20)``.

    Returns:
        The scaled samples, limited to the range [-1.0, 1.0].
    """
    # dB = 20 * log10(amplitude)  <=>  amplitude = 10 ** (dB / 20)
    signal_ = signal * (10 ** (db / 20))
    # Audio samples are signed. The original lower bound of 0.0 zeroed every
    # negative sample, which distorts the waveform.
    signal_ = np.clip(signal_, a_min=-1.0, a_max=1.0)
    return signal_
38
+
39
+
40
def main():
    """Change the volume of a file by ``--change_by_db`` dB and save as int16 WAV.

    The file is loaded with ``librosa.load``, scaled by ``change_by_db``,
    multiplied by 32768 and written at the sample rate librosa returned.
    """
    args = get_args()

    # NOTE(review): no ``sr`` is passed, so librosa uses its default target
    # sample rate rather than the file's own - confirm this is intended.
    signal, sample_rate = librosa.load(args.filename)

    signal_ = change_by_db(
        signal=signal,
        db=args.change_by_db,
    )
    max_wave_value = 1 << 15
    int16_info = np.iinfo(np.int16)

    # Saturate before the cast: a sample at +1.0 scales to 32768, which does
    # not fit in int16 and would wrap to a large negative value.
    signal_ = np.clip(signal_ * max_wave_value, int16_info.min, int16_info.max)
    signal_ = signal_.astype(np.int16)

    wavfile.write(
        filename=args.output_file,
        rate=sample_rate,
        data=signal_,
    )
    return
62
+
63
+
64
+ if __name__ == "__main__":
65
+ main()
examples/sound_volume/by_numpy_by_radio.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ 输出的音频有噪音.
5
+ """
6
+ import argparse
7
+
8
+ import librosa
9
+ import numpy as np
10
+ from scipy.io import wavfile
11
+
12
+ from project_settings import project_path
13
+
14
+
15
def get_args():
    """Parse the command-line options of the numpy ratio volume example.

    Returns:
        argparse.Namespace: carries ``filename`` (input audio), ``output_file``
        (destination WAV) and ``change_by_radio`` (volume multiplier,
        default 0.5).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--filename",
        default=(project_path / "data/voice_clone_audio/e2_tts/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        type=str,
    )
    parser.add_argument(
        "--output_file",
        default="temp.wav",
        type=str,
    )
    # "radio" is a misspelling of "ratio". The original flag and attribute
    # name are kept for existing callers; the correct spelling is an alias.
    parser.add_argument(
        "--change_by_radio",
        "--change_by_ratio",
        dest="change_by_radio",
        default=0.5,
        type=float,
    )
    args = parser.parse_args()
    return args
30
+
31
+
32
def change_by_ratio(signal: np.ndarray, radio: float = 1.0):
    """Scale ``signal`` by a linear volume factor.

    Args:
        signal: samples to scale.
            # assumes floats normalised to [-1.0, 1.0] - TODO confirm
        radio: multiplier applied to every sample (the parameter name is a
            misspelling of "ratio", kept so keyword callers still work).

    Returns:
        The scaled samples, limited to the range [-1.0, 1.0].
    """
    signal_ = signal * radio
    # Audio samples are signed. The original lower bound of 0.0 zeroed every
    # negative sample, which distorts the waveform.
    signal_ = np.clip(signal_, a_min=-1.0, a_max=1.0)
    return signal_
36
+
37
+
38
def main():
    """Scale the volume of a file by ``--change_by_radio`` and save as int16 WAV.

    The file is loaded with ``librosa.load``, scaled by ``change_by_ratio``,
    multiplied by 32768 and written at the sample rate librosa returned.
    """
    args = get_args()

    # NOTE(review): no ``sr`` is passed, so librosa uses its default target
    # sample rate rather than the file's own - confirm this is intended.
    signal, sample_rate = librosa.load(args.filename)

    signal_ = change_by_ratio(
        signal=signal,
        radio=args.change_by_radio,
    )
    max_wave_value = 1 << 15
    int16_info = np.iinfo(np.int16)

    # Saturate before the cast: a sample at +1.0 scales to 32768, which does
    # not fit in int16 and would wrap to a large negative value.
    signal_ = np.clip(signal_ * max_wave_value, int16_info.min, int16_info.max)
    signal_ = signal_.astype(np.int16)

    wavfile.write(
        filename=args.output_file,
        rate=sample_rate,
        data=signal_,
    )
    return
60
+
61
+
62
+ if __name__ == "__main__":
63
+ main()
examples/sound_volume/by_pydub_by_db.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+
5
+ from pydub import AudioSegment
6
+
7
+ from project_settings import project_path
8
+
9
+
10
def get_args():
    """Parse the command-line options of the pydub dB volume example.

    Returns:
        argparse.Namespace: carries ``filename`` (input WAV), ``output_file``
        (destination WAV) and ``change_by_db`` (gain in dB, default ``-10``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--filename",
        default=(project_path / "data/voice_clone_audio/e2_tts/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        type=str,
    )
    parser.add_argument(
        "--output_file",
        default="temp.wav",
        type=str,
    )
    # float instead of int so fractional gains such as -2.5 dB are accepted;
    # whole-number values given on the command line keep working.
    parser.add_argument("--change_by_db", default=-10, type=float)
    args = parser.parse_args()
    return args
25
+
26
+
27
def main():
    """Add ``--change_by_db`` to a WAV file's ``AudioSegment`` and export it."""
    options = get_args()

    original = AudioSegment.from_wav(options.filename)
    adjusted = original + options.change_by_db
    adjusted.export(options.output_file, format="wav")
37
+
38
+
39
+ if __name__ == "__main__":
40
+ main()
examples/sound_volume/by_pydub_by_reference.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+
5
+ from pydub import AudioSegment
6
+
7
+ from project_settings import project_path
8
+
9
+
10
def get_args():
    """Parse the command-line options of the reference-loudness example.

    Returns:
        argparse.Namespace: carries ``filename`` (WAV to adjust), ``reference``
        (WAV whose loudness is the target) and ``output_file`` (destination).
    """
    parser = argparse.ArgumentParser()
    # The example clips added by this commit live under data/examples/default/;
    # the original defaults omitted the "default" directory.
    parser.add_argument(
        "--filename",
        default=(project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        type=str,
    )
    parser.add_argument(
        "--reference",
        default=(project_path / "data/examples/default/audio_0_2.wav").as_posix(),
        type=str,
    )
    parser.add_argument(
        "--output_file",
        default="temp.wav",
        type=str,
    )
    args = parser.parse_args()
    return args
29
+
30
+
31
def main():
    """Match one WAV file's dBFS level to that of a reference WAV file.

    The gain applied is ``reference.dBFS - target.dBFS``; the result is
    exported to ``--output_file``.
    """
    options = get_args()

    target = AudioSegment.from_wav(options.filename)
    reference = AudioSegment.from_wav(options.reference)

    gain_db = reference.dBFS - target.dBFS
    matched = target.apply_gain(gain_db)
    matched.export(options.output_file, format="wav")
44
+
45
+
46
+ if __name__ == "__main__":
47
+ main()
main.py ADDED
@@ -0,0 +1,473 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ 任意格式转到 wav 8000 int16 格式。
5
+ 多通道转单通道。
6
+ 音频 pad 加长。
7
+
8
+ """
9
+ import argparse
10
+ import json
11
+ from pathlib import Path
12
+ import platform
13
+ from typing import Tuple, List
14
+
15
+ import gradio as gr
16
+ import numpy as np
17
+
18
+ from project_settings import project_path
19
+ from toolbox.audio_edit.info import get_audio_info, engine_to_function as info_engine_to_function
20
+ from toolbox.audio_edit.convert import audio_convert, engine_to_function as cvt_engine_to_function
21
+ from toolbox.audio_edit.speech_speed import change_speech_speed, engine_to_function as speed_engine_to_function
22
+ from toolbox.audio_edit.volume import change_volume, engine_to_function as volume_engine_to_function
23
+ from toolbox.audio_edit.augment import mix_speech_and_noise
24
+
25
+
26
def get_args():
    """Parse the command-line options of the demo app (only ``--examples_dir``)."""
    default_examples_dir = (project_path / "data/examples").as_posix()

    parser = argparse.ArgumentParser()
    parser.add_argument("--examples_dir", default=default_examples_dir, type=str)
    return parser.parse_args()
35
+
36
+
37
def when_click_get_audio_info(filename: str, engine: str) -> Tuple[str, str]:
    """Return the audio metadata of ``filename`` as pretty-printed JSON.

    Args:
        filename: path of the audio file to inspect.
        engine: name of the backend passed through to ``get_audio_info``.

    Returns:
        ``(result, message)``. ``result`` is the JSON text, or ``None`` when
        the lookup failed; ``message`` is ``"success"`` or a ``"failed. ..."``
        description of the exception. Exceptions are never propagated.
    """
    try:
        info: dict = get_audio_info(filename, engine)
        result = json.dumps(info, ensure_ascii=False, indent=4)
    except Exception as e:
        # UI handler: report the error in the log field instead of raising.
        return None, f"failed. error type: {type(e)}, error text: {str(e)}"

    return result, "success"
48
+
49
+
50
def when_click_audio_convert(filename: str,
                             to_sample_rate: int = 8000,
                             sample_width: int = 2,
                             channels: str = "0",
                             engine: str = "librosa",
                             ) -> Tuple[str, str, str, str, str, str]:
    """Convert ``filename`` and report metadata of the input and the output.

    Args:
        filename: path of the audio file to convert.
        to_sample_rate: target sample rate, forwarded to ``audio_convert``.
        sample_width: target sample width in bytes, forwarded to ``audio_convert``.
        channels: comma separated channel indexes, forwarded to ``audio_convert``.
        engine: conversion backend name.

    Returns:
        ``(filename, output_file, output_file, origin_audio_info,
        output_audio_info, message)``. The output path appears twice because
        it feeds both a file widget and an audio widget. When the conversion
        fails every value except ``filename`` and ``message`` is ``None``.
    """
    message = "success"

    try:
        output_file: str = audio_convert(filename,
                                         to_sample_rate=to_sample_rate,
                                         sample_width=sample_width,
                                         channels=channels,
                                         engine=engine,
                                         )
        # The info helper swallows its own errors, so a file the "wave" engine
        # cannot read simply yields ``None`` here.
        origin_audio_info, _ = when_click_get_audio_info(filename, engine="wave")
        output_audio_info, _ = when_click_get_audio_info(output_file, engine="wave")
    except Exception as e:
        output_file = None
        origin_audio_info = None
        output_audio_info = None
        message = f"failed. error type: {type(e)}, error text: {str(e)}"

    return filename, output_file, output_file, origin_audio_info, output_audio_info, message
74
+
75
+
76
def when_click_change_speech_speed(filename: str, speed: float = 1.0, engine: str = "librosa"):
    """Change the speech speed of ``filename`` and describe input and output.

    Returns ``(filename, output_file, output_file, origin_audio_info,
    output_audio_info, message)``; on failure everything except ``filename``
    and ``message`` is ``None``.
    """
    try:
        stretched: str = change_speech_speed(filename, speed, engine)
        info_before, _ = when_click_get_audio_info(filename, engine="pydub")
        info_after, _ = when_click_get_audio_info(stretched, engine="pydub")
    except Exception as e:
        failure = f"failed. error type: {type(e)}, error text: {str(e)}"
        return filename, None, None, None, None, failure

    return filename, stretched, stretched, info_before, info_after, "success"
90
+
91
+
92
def when_click_change_volume(filename: str,
                             radio: float = 1.0,
                             decibel: float = 0.0,
                             reference: str = None,
                             engine: str = "by_ffmpy_by_db",
                             ):
    """Change the volume of ``filename`` through ``change_volume``.

    Returns ``(filename, output_file, output_file, message)``; ``output_file``
    is ``None`` and ``message`` starts with ``"failed."`` when the call raises.
    """
    try:
        adjusted: str = change_volume(filename, radio, decibel, reference, engine)
    except Exception as e:
        failure = f"failed. error type: {type(e)}, error text: {str(e)}"
        return filename, None, None, failure

    return filename, adjusted, adjusted, "success"
105
+
106
+
107
def when_click_pad_audio(audio, pad_seconds: int = 10, pad_mode: str = "zero"):
    """Append ``pad_seconds`` seconds of padding to the end of a mono signal.

    Args:
        audio: ``(sample_rate, signal)`` tuple where ``signal`` is a 1-D numpy
            array, or ``None`` when no audio was supplied.
        pad_seconds: padding length in seconds (fractions are allowed).
        pad_mode: ``"zero"`` appends zeros, ``"repeat"`` appends the signal
            looped from its beginning.

    Returns:
        ``((sample_rate, padded_signal), message)``. On failure the input
        ``audio`` is returned unchanged and ``message`` starts with ``"failed."``.
    """
    message = "success"
    result = audio

    try:
        if audio is None:
            raise AssertionError("audio is required.")
        sample_rate, signal = audio

        if signal.ndim != 1:
            raise AssertionError(f"only mono (1-D) audio is supported, got ndim={signal.ndim}.")

        pad_length = int(pad_seconds * sample_rate)
        if pad_length < 0:
            raise AssertionError(f"pad_seconds must not be negative, got {pad_seconds}.")

        if pad_mode == "zero":
            pad = np.zeros(shape=(pad_length,), dtype=signal.dtype)
        elif pad_mode == "repeat":
            if len(signal) == 0:
                raise AssertionError("cannot repeat an empty signal.")
            # Ceil division: enough whole copies to cover pad_length samples.
            copies = -(-pad_length // len(signal))
            pad = np.tile(signal, copies)[:pad_length]
        else:
            raise NotImplementedError(f"invalid pad_mode: {pad_mode}")

        # np.concatenate instead of np.concat: the latter only exists in NumPy >= 2.0.
        result = (sample_rate, np.concatenate([signal, pad], axis=-1))
    except Exception as e:
        message = f"failed. error type: {type(e)}, error text: {str(e)}"

    return result, message
136
+
137
+
138
def when_click_mix_speech_and_noise(speech_t, noise_t, snr_db: float):
    """Mix a noise recording into a speech recording at the requested SNR.

    Args:
        speech_t: ``(sample_rate, signal)`` tuple of the speech, int16 samples.
        noise_t: ``(sample_rate, signal)`` tuple of the noise, int16 samples.
        snr_db: target signal-to-noise ratio in dB, forwarded to
            ``mix_speech_and_noise``.

    Returns:
        ``((sample_rate, mixed_int16), message)``. On failure ``speech_t`` is
        returned unchanged and ``message`` starts with ``"failed."``.
    """
    message = "success"
    result = speech_t

    try:
        if speech_t is None or noise_t is None:
            raise AssertionError("speech and noise are both required.")
        sample_rate1, speech = speech_t
        sample_rate2, noise = noise_t

        if sample_rate1 != sample_rate2:
            raise AssertionError(
                f"sample rate mismatch: speech {sample_rate1}, noise {sample_rate2}."
            )
        if speech.dtype != np.int16 or noise.dtype != np.int16:
            raise NotImplementedError("only int16 audio is supported.")

        full_scale = 1 << 15
        # Scale int16 samples to float32 in [-1, 1) for the mixer.
        speech_f = speech.astype(np.float32) / full_scale
        noise_f = noise.astype(np.float32) / full_scale

        mix_signal = mix_speech_and_noise(speech_f, noise_f, snr_db)
        # Clip before the cast: speech + noise can exceed full scale and an
        # unclipped float -> int16 conversion wraps around.
        mix_signal = np.clip(mix_signal * full_scale, -full_scale, full_scale - 1)
        result = (sample_rate1, mix_signal.astype(np.int16))
    except Exception as e:
        message = f"failed. error type: {type(e)}, error text: {str(e)}"

    return result, message
168
+
169
+
170
+ audio_convert_examples = [
171
+ [
172
+ (project_path / "data/examples/default/audio_0_2.wav").as_posix(),
173
+ 8000, 2, "0", "librosa"
174
+ ]
175
+ ]
176
+
177
+
178
# Example rows for the "volume" tab:
# [audio file, radio, decibel, reference file, engine].
change_volume_examples = [
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        1.0, -10.0,
        None,
        "by_ffmpy_by_db"
    ],
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        # Was -0.5, which is outside the 0.0-3.0 range of the radio slider.
        0.5, 0.0,
        None,
        "by_ffmpy_by_radio"
    ],
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        1.0, -10.0,
        None,
        "by_pydub_by_db"
    ],
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        1.0, 0.0,
        (project_path / "data/examples/default/audio_0_2.wav").as_posix(),
        "by_pydub_by_reference"
    ]
]
204
+
205
+
206
+ pad_audio_examples = [
207
+ [
208
+ (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
209
+ 10, "zero",
210
+ ],
211
+ ]
212
+
213
+
214
+ mix_speech_and_noise_examples = [
215
+ [
216
+ (project_path / "data/examples/mix/speech/000f62f5-5b05-4494-a8db-0eaca3ebd871_th-TH_1678353399860.wav").as_posix(),
217
+ (project_path / "data/examples/mix/noise/000e2a2e-43c8-4752-8e26-34207fa6e9e4_th-TH_1678244573769.wav").as_posix(),
218
+ -5,
219
+ ],
220
+ [
221
+ (project_path / "data/examples/mix/speech/0000c655-3a8e-4196-bc31-c01fa8d115cc_th-TH_1678768644585.wav").as_posix(),
222
+ (project_path / "data/examples/mix/noise/000f28d7-2129-49d5-9942-16ebf60e8285_th-TH_1678343313388.wav").as_posix(),
223
+ 0,
224
+ ],
225
+ [
226
+ (project_path / "data/examples/mix/speech/001df4d1-9f7a-4e78-adc9-ef26d07eba60_th-TH_1667878032.0303788.wav").as_posix(),
227
+ (project_path / "data/examples/mix/noise/0001f9f2-3626-427f-8ae5-105d81fcb5a3_th-TH_1678772646723.wav").as_posix(),
228
+ 5,
229
+ ],
230
+ [
231
+ (project_path / "data/examples/mix/speech/001ef59d-b266-4409-b89c-627e3d7fb27d_th-TH_1678356022482.wav").as_posix(),
232
+ (project_path / "data/examples/mix/noise/00240453-cd58-4059-9a38-d00583b879c7_th-TH_1678168729318.wav").as_posix(),
233
+ 10,
234
+ ]
235
+
236
+ ]
237
+
238
+
239
+ def main():
240
+ args = get_args()
241
+
242
+ # examples
243
+ examples_dir = Path(args.examples_dir)
244
+
245
+ # choices
246
+ info_choices = list(info_engine_to_function.keys())
247
+ cvt_choices = list(cvt_engine_to_function.keys())
248
+ speed_choices = list(speed_engine_to_function.keys())
249
+ volume_choices = list(volume_engine_to_function.keys())
250
+
251
+ # ui
252
+ with gr.Blocks() as blocks:
253
+ with gr.Tabs():
254
+ with gr.TabItem("info"):
255
+ with gr.Row():
256
+ with gr.Column(variant="panel", scale=5):
257
+ info_audio = gr.File(label="audio")
258
+ info_engine = gr.Dropdown(choices=info_choices, value=info_choices[0], label="engine")
259
+ info_button = gr.Button(variant="primary")
260
+ with gr.Column(variant="panel", scale=5):
261
+ info_output = gr.Text(label="output")
262
+ info_log = gr.Text(label="log")
263
+
264
+ gr.Examples(
265
+ examples=[
266
+ [filename.as_posix(), "wave"]
267
+ for filename in examples_dir.glob("**/*.wav")
268
+ ],
269
+ inputs=[info_audio, info_engine],
270
+ outputs=[info_output, info_log],
271
+ fn=when_click_get_audio_info,
272
+ )
273
+ info_button.click(
274
+ when_click_get_audio_info,
275
+ inputs=[info_audio, info_engine],
276
+ outputs=[info_output, info_log]
277
+ )
278
+ with gr.TabItem("convert"):
279
+ with gr.Row():
280
+ with gr.Column(variant="panel", scale=5):
281
+ cvt_audio_file = gr.File(label="audio_file")
282
+ cvt_audio = gr.Audio(label="audio")
283
+
284
+ with gr.Row():
285
+ cvt_sample_rate = gr.Dropdown(choices=[8000], value=8000, label="sample_rate")
286
+ cvt_sample_width = gr.Dropdown(choices=[2], value=2, label="sample_width")
287
+ cvt_channels = gr.Text(
288
+ value="0", label="channels",
289
+ info = "The channels to be retained, separated by commas, such as `0,1`"
290
+ )
291
+ cvt_engine = gr.Dropdown(choices=cvt_choices, value=cvt_choices[0], label="engine")
292
+ cvt_button = gr.Button(variant="primary")
293
+ with gr.Column(variant="panel", scale=5):
294
+ cvt_output_audio_file = gr.File(label="output_audio_file")
295
+ cvt_output_audio = gr.Audio(label="output_audio")
296
+ cvt_origin_audio_info = gr.Text(label="origin_audio_info")
297
+ cvt_output_audio_info = gr.Text(label="output_audio_info")
298
+ cvt_log = gr.Text(label="log")
299
+ gr.Examples(
300
+ examples=audio_convert_examples,
301
+ inputs=[
302
+ cvt_audio_file,
303
+ cvt_sample_rate, cvt_sample_width, cvt_channels,
304
+ cvt_engine,
305
+ ],
306
+ outputs=[
307
+ cvt_audio,
308
+ cvt_output_audio_file, cvt_output_audio,
309
+ cvt_origin_audio_info, cvt_output_audio_info,
310
+ cvt_log
311
+ ],
312
+ fn=when_click_audio_convert,
313
+ )
314
+ cvt_button.click(
315
+ when_click_audio_convert,
316
+ inputs=[
317
+ cvt_audio_file,
318
+ cvt_sample_rate, cvt_sample_width, cvt_channels,
319
+ cvt_engine,
320
+ ],
321
+ outputs=[
322
+ cvt_audio,
323
+ cvt_output_audio_file, cvt_output_audio,
324
+ cvt_origin_audio_info, cvt_output_audio_info,
325
+ cvt_log
326
+ ],
327
+ )
328
+ with gr.TabItem("speech_speed"):
329
+ with gr.Row():
330
+ with gr.Column(variant="panel", scale=5):
331
+ speech_speed_audio_file = gr.File(label="audio_file")
332
+ speech_speed_audio = gr.Audio(label="audio")
333
+ with gr.Row():
334
+ speech_speed_speed = gr.Slider(minimum=0.0, maximum=4.0, value=1.0, label="speed")
335
+ speech_speed_engine = gr.Dropdown(choices=speed_choices, value=speed_choices[0], label="engine")
336
+ speech_speed_button = gr.Button(variant="primary")
337
+ with gr.Column(variant="panel", scale=5):
338
+ speech_speed_output_audio_file = gr.File(label="output_audio_file")
339
+ speech_speed_output_audio = gr.Audio(label="output_audio")
340
+ speech_speed_origin_audio_info = gr.Text(label="origin_audio_info")
341
+ speech_speed_output_audio_info = gr.Text(label="output_audio_info")
342
+ speech_speed_log = gr.Text(label="log")
343
+ gr.Examples(
344
+ examples=[
345
+ [filename.as_posix(), 0.5]
346
+ for filename in examples_dir.glob("**/*.wav")
347
+ ],
348
+ inputs=[speech_speed_audio_file, speech_speed_speed, speech_speed_engine],
349
+ outputs=[
350
+ speech_speed_audio,
351
+ speech_speed_output_audio_file, speech_speed_output_audio,
352
+ speech_speed_origin_audio_info, speech_speed_output_audio_info,
353
+ speech_speed_log,
354
+ ],
355
+ fn=when_click_change_speech_speed,
356
+ )
357
+ speech_speed_button.click(
358
+ when_click_change_speech_speed,
359
+ inputs=[speech_speed_audio_file, speech_speed_speed, speech_speed_engine],
360
+ outputs=[
361
+ speech_speed_audio,
362
+ speech_speed_output_audio_file, speech_speed_output_audio,
363
+ speech_speed_origin_audio_info, speech_speed_output_audio_info,
364
+ speech_speed_log,
365
+ ]
366
+ )
367
+ with gr.TabItem("volume"):
368
+ with gr.Row():
369
+ with gr.Column(variant="panel", scale=5):
370
+ volume_audio_file = gr.File(label="audio_file")
371
+ volume_speed_audio = gr.Audio(label="audio")
372
+ with gr.Row():
373
+ with gr.Column():
374
+ volume_radio = gr.Slider(minimum=0.0, maximum=3.0, value=1.0, step=0.1, label="radio")
375
+ volume_decibel = gr.Slider(minimum=-30.0, maximum=30.0, value=0.0, step=0.1, label="decibel")
376
+ volume_engine = gr.Dropdown(choices=volume_choices, value=volume_choices[0], label="engine")
377
+ with gr.Column():
378
+ volume_reference = gr.File(label="reference")
379
+
380
+ volume_button = gr.Button(variant="primary")
381
+ with gr.Column(variant="panel", scale=5):
382
+ volume_output_audio_file = gr.File(label="output_audio_file")
383
+ volume_output_audio = gr.Audio(label="output_audio")
384
+ volume_log = gr.Text(label="log")
385
+
386
+ gr.Examples(
387
+ examples=change_volume_examples,
388
+ inputs=[volume_audio_file, volume_radio, volume_decibel, volume_reference, volume_engine],
389
+ outputs=[
390
+ volume_speed_audio,
391
+ volume_output_audio_file, volume_output_audio,
392
+ volume_log,
393
+ ],
394
+ fn=when_click_change_volume,
395
+ )
396
+ volume_button.click(
397
+ when_click_change_volume,
398
+ inputs=[volume_audio_file, volume_radio, volume_decibel, volume_reference, volume_engine],
399
+ outputs=[
400
+ volume_speed_audio,
401
+ volume_output_audio_file, volume_output_audio,
402
+ volume_log,
403
+ ]
404
+ )
405
+ with gr.TabItem("pad"):
406
+ with gr.Row():
407
+ with gr.Column(variant="panel", scale=5):
408
+ pad_audio = gr.Audio(label="audio")
409
+ with gr.Row():
410
+ pad_seconds = gr.Slider(minimum=0, maximum=100, value=20, step=0.1, label="pad_seconds")
411
+ pad_mode = gr.Dropdown(choices=["zero", "repeat"], value="zero", label="pad_mode")
412
+ pad_button = gr.Button(variant="primary")
413
+
414
+ with gr.Column(variant="panel", scale=5):
415
+ pad_output_audio = gr.Audio(label="output_audio")
416
+ pad_log = gr.Text(label="log")
417
+ gr.Examples(
418
+ examples=pad_audio_examples,
419
+ inputs=[pad_audio, pad_seconds, pad_mode],
420
+ outputs=[
421
+ pad_output_audio, pad_log
422
+ ],
423
+ fn=when_click_pad_audio,
424
+ )
425
+ pad_button.click(
426
+ when_click_pad_audio,
427
+ inputs=[pad_audio, pad_seconds, pad_mode],
428
+ outputs=[
429
+ pad_output_audio, pad_log
430
+ ],
431
+ )
432
+
433
+ with gr.TabItem("mix"):
434
+ with gr.Row():
435
+ with gr.Column(variant="panel", scale=5):
436
+ mix_speed_audio = gr.Audio(label="speech")
437
+ mix_noise_audio = gr.Audio(label="noise")
438
+ with gr.Row():
439
+ mix_snr_db = gr.Slider(minimum=-10, maximum=20, value=10, step=0.1, label="snr_db")
440
+ mix_button = gr.Button(variant="primary")
441
+
442
+ with gr.Column(variant="panel", scale=5):
443
+ mix_output_audio = gr.Audio(label="output_audio")
444
+ mix_log = gr.Text(label="log")
445
+
446
+ gr.Examples(
447
+ examples=mix_speech_and_noise_examples,
448
+ inputs=[mix_speed_audio, mix_noise_audio, mix_snr_db],
449
+ outputs=[
450
+ mix_output_audio, mix_log
451
+ ],
452
+ fn=when_click_mix_speech_and_noise,
453
+ )
454
+ mix_button.click(
455
+ when_click_mix_speech_and_noise,
456
+ inputs=[mix_speed_audio, mix_noise_audio, mix_snr_db],
457
+ outputs=[
458
+ mix_output_audio, mix_log
459
+ ],
460
+ )
461
+
462
+ # http://127.0.0.1:7860/
463
+ blocks.queue().launch(
464
+ share=False if platform.system() == "Windows" else False,
465
+ # server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
466
+ server_name="0.0.0.0",
467
+ server_port=7860,
468
+ )
469
+ return
470
+
471
+
472
+ if __name__ == "__main__":
473
+ main()
project_settings.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import os
4
+ from pathlib import Path
5
+
6
+
7
# Absolute directory that contains this settings module; other modules build
# data paths relative to it.
project_path = Path(os.path.abspath(os.path.dirname(__file__)))
9
+
10
+
11
+ if __name__ == '__main__':
12
+ pass
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio==4.44.1
2
+ librosa==0.10.2
3
+ soundfile==0.12.1
4
+ scipy==1.14.1
5
+ audiotsm==0.1.2
6
+ audiostretchy==1.3.5
7
+ tinytag==2.0.0
toolbox/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ if __name__ == '__main__':
6
+ pass
toolbox/audio_edit/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ if __name__ == '__main__':
6
+ pass
toolbox/audio_edit/augment.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import numpy as np
4
+
5
+
6
def mix_speech_and_noise(speech: np.ndarray, noise: np.ndarray, snr_db: float):
    """Add ``noise`` to ``speech`` after scaling it to the requested SNR.

    Args:
        speech: speech samples (the callers pass float values in (-1, 1)).
        noise: noise samples, same length as ``speech``.
        snr_db: desired ratio of speech power to noise power, in dB.

    Returns:
        ``speech + scaled_noise``. When ``noise`` is all zeros it cannot be
        scaled to any SNR, so the speech is returned without added noise.

    Raises:
        AssertionError: if the two signals differ in length.
    """
    if len(speech) != len(noise):
        raise AssertionError(
            f"speech and noise must have the same length, got {len(speech)} and {len(noise)}."
        )

    speech_power = np.mean(np.square(speech))
    noise_rms = np.sqrt(np.mean(np.square(noise)))
    if noise_rms == 0:
        # Avoid 0 / 0 -> NaN for silent noise.
        return speech + noise

    # Noise power that yields the requested SNR for this speech power.
    target_noise_power = speech_power / (10 ** (snr_db / 10))
    noise_adjusted = np.sqrt(target_noise_power) * noise / noise_rms

    return speech + noise_adjusted
19
+
20
+
21
def speech_echo(speech: np.ndarray, ser_db: float, delay_samples: int, num_echoes: int = 1):
    """Add ``num_echoes`` delayed, attenuated copies of ``speech`` to itself.

    Args:
        speech: 1-D array of samples; it is not modified.
        ser_db: gain of the first echo in dB (the i-th echo uses ``i * ser_db``).
        delay_samples: delay between successive echoes, in samples (>= 0).
        num_echoes: number of echoes to add.

    Returns:
        Array of the same length as ``speech`` holding ``speech + echoes``.

    Raises:
        ValueError: if ``delay_samples`` is negative.
    """
    if delay_samples < 0:
        raise ValueError(f"delay_samples must not be negative, got {delay_samples}.")

    ser_linear = 10 ** (ser_db / 20)
    num_samples = len(speech)

    echo = np.zeros_like(speech)
    for i in range(1, num_echoes + 1):
        shift = i * delay_samples
        if shift >= num_samples:
            # This echo, and every later one, starts after the signal ends.
            break
        # Explicit end index: ``speech[:-shift]`` would be empty for shift == 0.
        echo[shift:] += ser_linear ** i * speech[:num_samples - shift]

    return speech + echo
32
+
33
+
34
def main():
    """Smoke-run ``mix_speech_and_noise`` on random signals at 20 dB SNR."""
    num_samples = 10000
    speech = np.random.randn(num_samples)
    noise = np.random.randn(num_samples)

    noisy_signal = mix_speech_and_noise(speech, noise, 20)

    return
42
+
43
+
44
+ if __name__ == '__main__':
45
+ main()
toolbox/audio_edit/convert.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ from pathlib import Path
5
+ import tempfile
6
+ from typing import List
7
+ import uuid
8
+
9
+ import librosa
10
+ import numpy as np
11
+ from scipy.io import wavfile
12
+
13
+ from project_settings import project_path
14
+
15
+
16
def get_args():
    """Parse the command-line options of the conversion script (only ``--filename``)."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--filename",
        # The sample committed with the repository lives under data/examples/default.
        default=(project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        type=str,
    )
    return parser.parse_args()
25
+
26
+
27
def get_channel_list(channels: str = "0") -> List[int]:
    """Parse a comma separated list of channel indexes such as ``"0,1"``.

    Blank entries (for example from a trailing comma) are ignored.

    Raises:
        ValueError: if an entry is not an integer or no index is given at all.
    """
    tokens = (token.strip() for token in channels.split(","))
    indexes = [int(token) for token in tokens if token]
    if not indexes:
        raise ValueError(f"no channel index found in `{channels}`")
    return indexes
30
+
31
+
32
def audio_convert_by_librosa(filename: str,
                             to_sample_rate: int = 8000,
                             sample_width: int = 2,
                             channels: str = "0",
                             ) -> str:
    """Resample ``filename`` with librosa and write it as a 16-bit wav file.

    Args:
        filename: path of the audio file to convert.
        to_sample_rate: sample rate of the output file.
        sample_width: bytes per sample; only ``2`` (int16) is supported.
        channels: comma separated indexes of the channels to keep.

    Returns:
        Path of the wav file written into the system temp directory.

    Raises:
        AssertionError: on an unsupported ``sample_width``, an unexpected
            signal shape, or a channel index outside the file's channels.
    """
    # Validate the cheap arguments before decoding the file.
    if sample_width != 2:
        raise AssertionError(f"invalid sample_width: {sample_width}")

    channels_ = get_channel_list(channels)
    channels_max = max(channels_)

    signal, sample_rate = librosa.load(filename, sr=to_sample_rate, mono=False)

    if signal.ndim > 2:
        raise AssertionError

    if signal.ndim == 2:
        num_channels = signal.shape[0]
        # Sanity check that the first axis is the channel axis.
        if num_channels > signal.shape[1]:
            raise AssertionError
        # Valid indexes are 0 .. num_channels - 1; negative ones would wrap around.
        if min(channels_) < 0 or channels_max >= num_channels:
            raise AssertionError(
                f"channel index `{channels_max}` is out of range for num channels `{num_channels}`"
            )

        # NOTE(review): the selected channels are joined end to end along the
        # time axis (mono output), not stacked as separate channels - confirm
        # this is the intended meaning of "channels to be retained".
        signal = np.concatenate([signal[ch, :] for ch in channels_], axis=-1)

    max_wave_value = 32768.0
    # Clip before the cast: a sample of +1.0 maps to 32768, which overflows int16.
    signal = np.clip(signal * max_wave_value, -max_wave_value, max_wave_value - 1)
    signal = signal.astype(np.int16)

    output_dir = Path(tempfile.gettempdir()) / "audio_edit/fmt_convert"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = (output_dir / f"{uuid.uuid4()}.wav").as_posix()

    wavfile.write(
        output_file,
        to_sample_rate,
        signal,
    )
    return output_file
75
+
76
+
77
+ engine_to_function = {
78
+ "librosa": audio_convert_by_librosa,
79
+ }
80
+
81
+
82
def audio_convert(filename: str,
                  to_sample_rate: int = 8000,
                  sample_width: int = 2,
                  channels: str = "0",
                  engine: str = "librosa"
                  ):
    """Convert ``filename`` with the backend registered under ``engine``.

    Raises AssertionError when ``engine`` is not a key of ``engine_to_function``;
    otherwise returns whatever the selected backend returns.
    """
    convert = engine_to_function.get(engine)
    if convert is None:
        raise AssertionError(f"invalid engine: {engine}")

    return convert(
        filename,
        to_sample_rate=to_sample_rate,
        sample_width=sample_width,
        channels=channels,
    )
95
+
96
+
97
def main():
    """Convert the file given by ``--filename`` with default settings and print the output path."""
    options = get_args()
    print(audio_convert_by_librosa(options.filename))
    return
103
+
104
+
105
+ if __name__ == '__main__':
106
+ main()
toolbox/audio_edit/info.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+
5
+ import librosa
6
+ from pydub import AudioSegment
7
+ import soundfile as sf
8
+ from tinytag import TinyTag
9
+ import wave
10
+
11
+ from project_settings import project_path
12
+
13
+
14
def get_args():
    """Parse the command-line options of the audio-info script (only ``--filename``)."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--filename",
        # The sample committed with the repository lives under data/examples/default.
        default=(project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        type=str,
    )
    return parser.parse_args()
23
+
24
+
25
def get_audio_info_by_wave(filename: str):
    """Read wav header information with the standard ``wave`` module.

    Returns:
        dict with ``channels``, ``sample_width`` (bytes per sample),
        ``sample_rate``, ``num_samples`` (frames) and ``duration`` in seconds
        rounded to 4 decimals.
    """
    with wave.open(filename, 'rb') as wf:
        num_frames = wf.getnframes()
        sample_rate = wf.getframerate()
        audio_info = {
            "channels": wf.getnchannels(),
            "sample_width": wf.getsampwidth(),
            "sample_rate": sample_rate,
            "num_samples": num_frames,
            "duration": round(num_frames / sample_rate, 4)
        }
    return audio_info
36
+
37
+
38
def get_audio_info_by_pydub(filename: str):
    """Collect audio metadata and loudness figures with pydub.

    Returns:
        dict with ``duration`` (seconds), ``sample_rate``, ``channels``,
        ``sample_width`` (bytes), ``num_samples``, ``rms`` and the dBFS level
        rounded to 4 decimals.
    """
    audio = AudioSegment.from_file(filename)
    audio_info = {
        "duration": audio.duration_seconds,
        "sample_rate": audio.frame_rate,
        "channels": audio.channels,
        "sample_width": audio.sample_width,
        # Frames per channel, matching what the other engines report;
        # get_array_of_samples() would count every channel's samples.
        "num_samples": int(audio.frame_count()),
        "rms": audio.rms,
        "Decibels Full Scale (dBFS)": round(audio.dBFS, 4),
    }
    return audio_info
50
+
51
+
52
def get_audio_info_by_librosa(filename: str):
    """Load the file with librosa at its native rate and report duration, sample rate and sample count."""
    samples, native_rate = librosa.load(filename, sr=None)
    return {
        "duration": librosa.get_duration(y=samples, sr=native_rate),
        "sample_rate": native_rate,
        "num_samples": len(samples),
    }
61
+
62
+
63
def get_audio_info_by_soundfile(filename: str):
    """Report duration, sample rate and frame count using soundfile.

    Only the file header is inspected (``sf.info``); the samples themselves
    are not decoded.
    """
    header = sf.info(filename)
    audio_info = {
        "duration": header.frames / header.samplerate,
        "sample_rate": header.samplerate,
        "num_samples": header.frames,
    }
    return audio_info
72
+
73
+
74
def get_audio_info_by_tiny_tag(filename: str):
    """Report duration, sample rate and channel count as read by TinyTag."""
    metadata = TinyTag.get(filename)
    return {
        "duration": metadata.duration,
        "sample_rate": metadata.samplerate,
        "channels": metadata.channels,
    }
83
+
84
+
85
+ engine_to_function = {
86
+ "wave": get_audio_info_by_wave,
87
+ "pydub": get_audio_info_by_pydub,
88
+ "librosa": get_audio_info_by_librosa,
89
+ "soundfile": get_audio_info_by_soundfile,
90
+ "tiny_tag": get_audio_info_by_tiny_tag,
91
+
92
+ }
93
+
94
+
95
def get_audio_info(filename: str, engine: str = "wave"):
    """Return the info dict produced by the backend registered under ``engine``.

    Raises AssertionError when ``engine`` is not a key of ``engine_to_function``.
    """
    reader = engine_to_function.get(engine)
    if reader is not None:
        return reader(filename)

    raise AssertionError(f"invalid engine: {engine}")
101
+
102
+
103
def main():
    """Print the info dict of ``--filename`` as produced by every backend, in turn."""
    options = get_args()

    backends = (
        get_audio_info_by_wave,
        get_audio_info_by_pydub,
        get_audio_info_by_librosa,
        get_audio_info_by_soundfile,
        get_audio_info_by_tiny_tag,
    )
    for backend in backends:
        print(backend(options.filename))

    return
118
+
119
+
120
+ if __name__ == '__main__':
121
+ main()
toolbox/audio_edit/speech_speed.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import os
5
+ from pathlib import Path
6
+ import tempfile
7
+ import uuid
8
+
9
+ from audiostretchy.stretch import stretch_audio
10
+ import audiotsm
11
+ import audiotsm.io.wav
12
+ import audiotsm.io.array
13
+ import librosa
14
+ import numpy as np
15
+ from pydub import AudioSegment
16
+ from scipy.io import wavfile
17
+
18
+ from project_settings import project_path
19
+
20
+
21
def get_args():
    """Parse the command-line options of the speech-speed script (only ``--filename``)."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--filename",
        # The sample committed with the repository lives under data/examples/default.
        default=(project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        type=str,
    )
    return parser.parse_args()
30
+
31
+
32
def change_speech_speed_by_audiostretchy(filename: str, speed: float = 1.0) -> str:
    """Time-stretch ``filename`` with audiostretchy and return the path of the new wav.

    ``speed`` must lie in [0.5, 2.0], otherwise AssertionError is raised.
    The output is written to a uniquely named file in the system temp directory.
    """
    in_range = 0.5 <= speed <= 2.0
    if not in_range:
        raise AssertionError("speed should between 0.5 and 2.0.")

    target_dir = Path(tempfile.gettempdir()) / "audio_edit/speech_speed"
    target_dir.mkdir(parents=True, exist_ok=True)
    target = (target_dir / f"{uuid.uuid4()}.wav").as_posix()

    # The stretch ratio is the inverse of the playback speed.
    stretch_audio(filename, target, ratio=1 / speed)

    return target
44
+
45
+
46
def change_speech_speed_by_audiotsm(filename: str, speed: float = 1.0) -> str:
    """Change the speed of a wav file with audiotsm's WSOLA procedure.

    Args:
        filename: path of the input wav file.
        speed: speed factor passed to ``audiotsm.wsola``.

    Returns:
        Path of the wav file written into the system temp directory.
    """
    output_dir = Path(tempfile.gettempdir()) / "audio_edit/speech_speed"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = (output_dir / f"{uuid.uuid4()}.wav").as_posix()

    # Context managers close both files even when the processing raises.
    with audiotsm.io.wav.WavReader(filename) as reader:
        with audiotsm.io.wav.WavWriter(output_file, reader.channels, reader.samplerate) as writer:
            wsola = audiotsm.wsola(reader.channels, speed=speed)
            wsola.run(reader, writer)

    return output_file
61
+
62
+
63
def change_speech_speed_by_librosa(filename: str, speed: float = 1.0) -> str:
    """Time-stretch ``filename`` with librosa and write the result as a 16-bit wav.

    Args:
        filename: path of the input audio file.
        speed: stretch rate passed to ``librosa.effects.time_stretch``.

    Returns:
        Path of the wav file written into the system temp directory; the
        sample rate of the input is kept.
    """
    signal, sample_rate = librosa.load(filename, sr=None)
    stretched = librosa.effects.time_stretch(signal, rate=speed)

    max_wave_value = 1 << 15
    # Clip before the cast: values at or above +1.0 would overflow int16 and wrap.
    stretched = np.clip(stretched * max_wave_value, -max_wave_value, max_wave_value - 1)
    stretched = stretched.astype(np.int16)

    output_dir = Path(tempfile.gettempdir()) / "audio_edit/speech_speed"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = (output_dir / f"{uuid.uuid4()}.wav").as_posix()

    wavfile.write(
        filename=output_file,
        rate=sample_rate,
        data=stretched,
    )
    return output_file
84
+
85
+
86
def change_speech_speed_by_pydub(filename: str, speed: float = 1.0) -> str:
    """Speed up a wav file with pydub and return the path of the exported wav.

    Raises AssertionError for ``speed`` below 1.0, which this backend rejects.
    """
    if speed < 1.0:
        raise AssertionError("speed cannot less than 1.0 for pydub.")

    faster = AudioSegment.from_wav(filename).speedup(playback_speed=speed)

    target_dir = Path(tempfile.gettempdir()) / "audio_edit/speech_speed"
    target_dir.mkdir(parents=True, exist_ok=True)
    target = (target_dir / f"{uuid.uuid4()}.wav").as_posix()

    # Remove any stale file of the same name before exporting.
    if os.path.exists(target):
        os.remove(target)
    faster.export(target, format="wav")

    return target
103
+
104
+
105
+ engine_to_function = {
106
+ "audiostretchy": change_speech_speed_by_audiostretchy,
107
+ "audiotsm": change_speech_speed_by_audiotsm,
108
+ "librosa": change_speech_speed_by_librosa,
109
+ "pydub": change_speech_speed_by_pydub,
110
+ }
111
+
112
+
113
def change_speech_speed(filename: str, speed: float = 1.0, engine: str = "pydub"):
    """Change the speech speed with the backend registered under ``engine``.

    Raises AssertionError when ``engine`` is not a key of ``engine_to_function``.
    """
    backend = engine_to_function.get(engine)
    if backend is not None:
        return backend(filename, speed=speed)

    raise AssertionError(f"invalid engine: {engine}")
119
+
120
+
121
def main():
    """Slow ``--filename`` down to half speed with audiostretchy and print the output path."""
    options = get_args()
    print(change_speech_speed_by_audiostretchy(options.filename, speed=0.5))
    return
127
+
128
+
129
+ if __name__ == '__main__':
130
+ main()
toolbox/audio_edit/volume.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import os
5
+ from pathlib import Path
6
+ import tempfile
7
+ import uuid
8
+
9
+ from ffmpy import FFmpeg
10
+ from pydub import AudioSegment
11
+
12
+ from project_settings import project_path
13
+
14
+
15
def get_args():
    """
    Parse the command-line options of this script.

    Options:
      --filename       input audio path (defaults to a sample file below ``project_path``).
      --output_file    output path, default ``temp.wav``.
      --change_by_db   integer volume change in dB, default -10.

    :return: the ``argparse.Namespace`` holding the parsed options.
    """
    default_input = project_path / "data/voice_clone_audio/e2_tts/audio_0_3_clone_from_audio_0_2.wav"

    parser = argparse.ArgumentParser()
    parser.add_argument("--filename", default=default_input.as_posix(), type=str)
    parser.add_argument("--output_file", default="temp.wav", type=str)
    parser.add_argument("--change_by_db", default=-10, type=int)
    return parser.parse_args()
30
+
31
+
32
def change_volume_by_ffmpy_by_db(filename: str, decibel: float = 0.0) -> str:
    """
    Change the volume of an audio file by a number of decibels using ffmpeg (via ffmpy).

    :param filename: path of the input file; its extension must be ``wav`` or
        ``mp3`` (compared case-insensitively).
    :param decibel: gain in dB, inserted into ffmpeg's ``volume`` audio filter
        as ``volume=<decibel>dB``.
    :return: posix path of the generated wav file, created under
        ``<system temp dir>/audio_edit/volume`` with a random uuid4 name.
    :raises Exception: with message ``"format error"`` when the file name does
        not end in a ``wav``/``mp3`` extension.
    """
    # rpartition tells "no extension at all" apart from a real extension, so an
    # extension-less file that is literally named "wav" or "mp3" is rejected
    # instead of being mistaken for an audio file.
    name = os.path.basename(filename).strip()
    _, dot, ext = name.rpartition(".")
    if not dot or ext.lower() not in ("wav", "mp3"):
        raise Exception("format error")

    output_dir = Path(tempfile.gettempdir()) / "audio_edit/volume"
    output_dir.mkdir(parents=True, exist_ok=True)
    # The name is a fresh uuid4, so there is no earlier file to clean up.
    output_file = (output_dir / f"{uuid.uuid4()}.wav").as_posix()

    ff = FFmpeg(
        inputs={filename: None},
        outputs={output_file: f'-filter:a "volume={decibel}dB"'},
    )
    ff.run()
    return output_file
50
+
51
+
52
def change_volume_by_ffmpy_by_radio(filename: str, radio: float = 0) -> str:
    """
    Change the volume of an audio file by a multiplier using ffmpeg (via ffmpy).

    :param filename: path of the input file; its extension must be ``wav`` or
        ``mp3`` (compared case-insensitively).
    :param radio: value inserted into ffmpeg's ``volume`` audio filter as
        ``volume=<radio>`` (the name presumably means "ratio").
        NOTE(review): the default of 0 produces ``volume=0``, which presumably
        silences the output -- confirm this default is intended.
    :return: posix path of the generated wav file, created under
        ``<system temp dir>/audio_edit/volume`` with a random uuid4 name.
    :raises Exception: with message ``"format error"`` when the file name does
        not end in a ``wav``/``mp3`` extension.
    """
    # rpartition tells "no extension at all" apart from a real extension, so an
    # extension-less file that is literally named "wav" or "mp3" is rejected
    # instead of being mistaken for an audio file.
    name = os.path.basename(filename).strip()
    _, dot, ext = name.rpartition(".")
    if not dot or ext.lower() not in ("wav", "mp3"):
        raise Exception("format error")

    output_dir = Path(tempfile.gettempdir()) / "audio_edit/volume"
    output_dir.mkdir(parents=True, exist_ok=True)
    # The name is a fresh uuid4, so there is no earlier file to clean up.
    output_file = (output_dir / f"{uuid.uuid4()}.wav").as_posix()

    ff = FFmpeg(
        inputs={filename: None},
        outputs={output_file: f'-filter:a "volume={radio}"'},
    )
    ff.run()
    return output_file
70
+
71
+
72
def change_volume_by_pydub_by_db(filename: str, decibel: float = 0.0) -> str:
    """
    Apply a gain in decibels to a wav file with pydub.

    :param filename: path of the wav file to read.
    :param decibel: gain in dB applied to the whole segment.
    :return: posix path of the exported wav file, created under
        ``<system temp dir>/audio_edit/volume`` with a random uuid4 name.
    """
    # `segment + number` in pydub is the gain operator; apply_gain is the
    # explicit spelling of the same operation.
    adjusted = AudioSegment.from_wav(filename).apply_gain(decibel)

    target_dir = Path(tempfile.gettempdir()) / "audio_edit/volume"
    target_dir.mkdir(parents=True, exist_ok=True)
    target = (target_dir / f"{uuid.uuid4()}.wav").as_posix()

    adjusted.export(target, format="wav")
    return target
86
+
87
+
88
def change_volume_by_pydub_by_reference(filename: str, reference: str) -> str:
    """
    Adjust the loudness of one wav file to match the dBFS of a reference wav file.

    :param filename: path of the wav file whose volume is changed.
    :param reference: path of the wav file whose ``dBFS`` is the target level.
    :return: posix path of the exported wav file, created under
        ``<system temp dir>/audio_edit/volume`` with a random uuid4 name.
    """
    source = AudioSegment.from_wav(filename)
    target_level = AudioSegment.from_wav(reference)

    # Gain needed to move the source's dBFS onto the reference's dBFS.
    gain = target_level.dBFS - source.dBFS
    matched = source.apply_gain(gain)

    target_dir = Path(tempfile.gettempdir()) / "audio_edit/volume"
    target_dir.mkdir(parents=True, exist_ok=True)
    target = (target_dir / f"{uuid.uuid4()}.wav").as_posix()

    matched.export(target, format="wav")
    return target
104
+
105
+
106
# Dispatch table: engine name -> function that changes the volume.
# The key suffix (by_db / by_radio / by_reference) names the keyword argument
# the mapped function takes besides the filename.
engine_to_function = {
    "by_ffmpy_by_db": change_volume_by_ffmpy_by_db,
    "by_ffmpy_by_radio": change_volume_by_ffmpy_by_radio,
    "by_pydub_by_db": change_volume_by_pydub_by_db,
    "by_pydub_by_reference": change_volume_by_pydub_by_reference,
}
112
+
113
+
114
def change_volume(filename: str, radio: float = 1.0, decibel: float = 0.0, reference: str = None, engine: str = "by_ffmpy_by_db"):
    """
    Change the volume of an audio file with the backend selected by ``engine``.

    Only the argument matching the engine's suffix is forwarded:
    ``*by_radio`` receives ``radio``, ``*by_db`` receives ``decibel`` and
    ``*by_reference`` receives ``reference``.

    :param filename: path of the input audio file.
    :param radio: value forwarded to ``by_radio`` engines.
    :param decibel: value forwarded to ``by_db`` engines.
    :param reference: reference file path forwarded to ``by_reference`` engines.
    :param engine: key into ``engine_to_function`` naming the backend to use.
    :return: whatever the selected backend function returns.
    :raises AssertionError: if ``engine`` is not a key of ``engine_to_function``,
        or if its name matches none of the known suffixes.
    """
    handler = engine_to_function.get(engine)
    if handler is None:
        raise AssertionError(f"invalid engine: {engine}")

    if engine.endswith("by_radio"):
        return handler(filename, radio=radio)
    if engine.endswith("by_db"):
        return handler(filename, decibel=decibel)
    if engine.endswith("by_reference"):
        return handler(filename, reference=reference)

    # A registered engine whose name has no known suffix.
    raise AssertionError
128
+
129
+
130
def main():
    """
    Script entry point.

    Applies the ``--change_by_db`` gain to ``--filename`` with the pydub
    backend and prints the path of the generated file.

    The gain comes from the parsed option rather than a hard-coded value; the
    option's default is -10. ``--output_file`` is parsed by ``get_args`` but is
    not used here: the backend picks its own temp-file path.
    """
    args = get_args()

    output_file = change_volume_by_pydub_by_db(args.filename, decibel=args.change_by_db)
    print(output_file)


if __name__ == '__main__':
    main()