HoneyTian committed on
Commit
20323b4
·
1 Parent(s): 45cf916

add trim audio

Browse files
examples/batch_audio_fmt_convert.py CHANGED
@@ -18,13 +18,13 @@ def get_args():
18
  parser.add_argument(
19
  "--audio_dir",
20
  # default=(project_path / "data/yd").as_posix(),
21
- default=r"E:\牛信文档\语音克隆\多语种语音克隆\money_char",
22
  type=str,
23
  )
24
  parser.add_argument(
25
  "--output_dir",
26
  # default=(project_path / "data/temp_wav").as_posix(),
27
- default=r"E:\牛信文档\语音克隆\多语种语音克隆\money_char",
28
  type=str,
29
  )
30
  args = parser.parse_args()
@@ -44,10 +44,14 @@ def main():
44
  basename = filename.stem
45
  relative_dir = filename.parent.relative_to(audio_dir)
46
 
47
- signal, sample_rate = librosa.load(filename, sr=8000)
 
48
  # print(signal.shape)
49
  # print(signal.dtype)
50
  # exit(0)
 
 
 
51
  signal *= max_wave_value
52
  signal = np.array(signal, dtype=np.int16)
53
 
 
18
  parser.add_argument(
19
  "--audio_dir",
20
  # default=(project_path / "data/yd").as_posix(),
21
+ default=r"C:\Users\tianx\Desktop\sample-audio",
22
  type=str,
23
  )
24
  parser.add_argument(
25
  "--output_dir",
26
  # default=(project_path / "data/temp_wav").as_posix(),
27
+ default=r"C:\Users\tianx\Desktop\sample-audio2",
28
  type=str,
29
  )
30
  args = parser.parse_args()
 
44
  basename = filename.stem
45
  relative_dir = filename.parent.relative_to(audio_dir)
46
 
47
+ signal, sample_rate = librosa.load(filename, mono=False, sr=8000)
48
+
49
  # print(signal.shape)
50
  # print(signal.dtype)
51
  # exit(0)
52
+ if not signal.ndim == 2:
53
+ raise AssertionError
54
+ signal = signal[0]
55
  signal *= max_wave_value
56
  signal = np.array(signal, dtype=np.int16)
57
 
main.py CHANGED
@@ -1,14 +1,16 @@
1
  #!/usr/bin/python3
2
  # -*- coding: utf-8 -*-
3
  """
4
- docker build -t audio_edit:v20250116_1917 .
 
 
5
 
6
  docker run -itd \
7
  --name audio_edit_7861 \
8
  --restart=always \
9
  --network host \
10
  -e port=7861 \
11
- audio_edit:v20250116_1917
12
  """
13
  import argparse
14
  import json
@@ -30,6 +32,7 @@ from toolbox.audio_edit.volume import change_volume, engine_to_function as volum
30
  from toolbox.audio_edit.augment import mix_speech_and_noise
31
  from toolbox.audio_edit.reverb import reverb, engine_to_function as reverb_engine_to_function
32
  from toolbox.os.command import Command
 
33
 
34
 
35
  def get_args():
@@ -180,6 +183,27 @@ def when_click_pad_audio(audio_t, pad_seconds: int = 10, pad_mode: str = "zero")
180
  return (sample_rate, pad_signal), message
181
 
182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  def when_click_reverb(audio_t, kwargs: str, engine: str):
184
  sample_rate, signal = audio_t
185
 
@@ -278,6 +302,15 @@ pad_audio_examples = [
278
  ]
279
 
280
 
 
 
 
 
 
 
 
 
 
281
  reverb_examples = [
282
  [
283
  (project_path / "data/examples/default/audio_0_2.wav").as_posix(),
@@ -328,6 +361,7 @@ def main():
328
  cvt_choices = list(cvt_engine_to_function.keys())
329
  speed_choices = list(speed_engine_to_function.keys())
330
  volume_choices = list(volume_engine_to_function.keys())
 
331
  reverb_choices = list(reverb_engine_to_function.keys())
332
 
333
  # ui
@@ -499,6 +533,32 @@ def main():
499
  pad_output_audio, pad_log
500
  ],
501
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
502
  with gr.TabItem("reverb"):
503
  with gr.Row():
504
  with gr.Column(variant="panel", scale=5):
@@ -568,8 +628,8 @@ def main():
568
  # http://10.75.27.247:7861/
569
  blocks.queue().launch(
570
  share=False if platform.system() == "Windows" else False,
571
- # server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
572
- server_name="0.0.0.0",
573
  server_port=environment.get("port", 7860, dtype=int),
574
  )
575
  return
 
1
  #!/usr/bin/python3
2
  # -*- coding: utf-8 -*-
3
  """
4
+ docker build -t audio_edit:v20250314_1357 .
5
+
6
+ docker stop audio_edit_7861 && docker rm audio_edit_7861
7
 
8
  docker run -itd \
9
  --name audio_edit_7861 \
10
  --restart=always \
11
  --network host \
12
  -e port=7861 \
13
+ audio_edit:v20250314_1357
14
  """
15
  import argparse
16
  import json
 
32
  from toolbox.audio_edit.augment import mix_speech_and_noise
33
  from toolbox.audio_edit.reverb import reverb, engine_to_function as reverb_engine_to_function
34
  from toolbox.os.command import Command
35
+ from toolbox.audio_edit.trim import audio_trim, engine_to_function as trim_engine_to_function
36
 
37
 
38
  def get_args():
 
183
  return (sample_rate, pad_signal), message
184
 
185
 
186
def when_click_trim_audio(audio_t, kwargs: str, engine: str):
    """Gradio callback: trim silence from the submitted audio.

    Args:
        audio_t: ``(sample_rate, signal)`` tuple as delivered by the audio
            input component.
        kwargs: JSON object (as text) with engine-specific options. A blank
            string means "use the engine defaults".
        engine: name of the trim engine, forwarded to ``audio_trim``.

    Returns:
        ``(output_file, message)``. ``output_file`` is the path of the trimmed
        audio, or ``None`` on failure. ``message`` is ``"success"`` followed by
        the JSON-formatted trim report, or a description of the error.
    """
    output_file = None
    try:
        # Unpacking and saving live inside the try block so that a missing
        # audio input is reported in the log box instead of escaping the
        # handler as an unhandled exception.
        sample_rate, signal = audio_t
        filename = save_input_audio(sample_rate, signal)

        # An empty textbox is not valid JSON; treat it as "no options".
        options = json.loads(kwargs) if kwargs and kwargs.strip() else {}

        output_file, ext = audio_trim(
            filename=filename,
            engine=engine,
            **options,
        )
        ext = json.dumps(ext, ensure_ascii=False, indent=4)
        message = f"success\n\n{ext}"
    except Exception as e:
        # UI boundary: every failure is surfaced to the user as text.
        output_file = None
        message = f"failed. error type: {type(e)}, error text: {str(e)}"

    return output_file, message
205
+
206
+
207
  def when_click_reverb(audio_t, kwargs: str, engine: str):
208
  sample_rate, signal = audio_t
209
 
 
302
  ]
303
 
304
 
305
+ trim_examples = [
306
+ [
307
+ (project_path / "data/examples/mix/speech/000f62f5-5b05-4494-a8db-0eaca3ebd871_th-TH_1678353399860.wav").as_posix(),
308
+ '{\n "silence_threshold": -40,\n "min_silence_len": 200,\n "min_kept_silence": 200,\n "mode": "trim"\n}',
309
+ "pydub",
310
+ ]
311
+ ]
312
+
313
+
314
  reverb_examples = [
315
  [
316
  (project_path / "data/examples/default/audio_0_2.wav").as_posix(),
 
361
  cvt_choices = list(cvt_engine_to_function.keys())
362
  speed_choices = list(speed_engine_to_function.keys())
363
  volume_choices = list(volume_engine_to_function.keys())
364
+ trim_choices = list(trim_engine_to_function.keys())
365
  reverb_choices = list(reverb_engine_to_function.keys())
366
 
367
  # ui
 
533
  pad_output_audio, pad_log
534
  ],
535
  )
536
+ with gr.TabItem("trim"):
537
+ with gr.Row():
538
+ with gr.Column(variant="panel", scale=5):
539
+ trim_audio = gr.Audio(label="audio")
540
+ trim_kwargs = gr.Textbox(lines=8, label="kwargs")
541
+ trim_engine = gr.Dropdown(choices=trim_choices, value=trim_choices[0], label="engine")
542
+ trim_button = gr.Button(variant="primary")
543
+
544
+ with gr.Column(variant="panel", scale=5):
545
+ trim_output_audio = gr.Audio(label="output_audio")
546
+ trim_log = gr.Text(label="log")
547
+ gr.Examples(
548
+ examples=trim_examples,
549
+ inputs=[trim_audio, trim_kwargs, trim_engine],
550
+ outputs=[
551
+ trim_output_audio, trim_log
552
+ ],
553
+ fn=when_click_trim_audio,
554
+ )
555
+ trim_button.click(
556
+ when_click_trim_audio,
557
+ inputs=[trim_audio, trim_kwargs, trim_engine],
558
+ outputs=[
559
+ trim_output_audio, trim_log
560
+ ],
561
+ )
562
  with gr.TabItem("reverb"):
563
  with gr.Row():
564
  with gr.Column(variant="panel", scale=5):
 
628
  # http://10.75.27.247:7861/
629
  blocks.queue().launch(
630
  share=False if platform.system() == "Windows" else False,
631
+ server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
632
+ # server_name="0.0.0.0",
633
  server_port=environment.get("port", 7860, dtype=int),
634
  )
635
  return
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- gradio==4.44.1
2
  librosa==0.10.2
3
  soundfile==0.12.1
4
  scipy==1.14.1
 
1
+ gradio
2
  librosa==0.10.2
3
  soundfile==0.12.1
4
  scipy==1.14.1
toolbox/audio_edit/trim.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ from pathlib import Path
5
+ import tempfile
6
+ import uuid
7
+
8
+ import librosa
9
+ from pydub import AudioSegment
10
+ from pydub.silence import detect_silence
11
+ from scipy.io import wavfile
12
+
13
+ from project_settings import project_path
14
+
15
+
16
def get_args():
    """Parse the command-line options of the trim demo script.

    Returns:
        The parsed ``argparse.Namespace``. Its only option, ``--filename``,
        defaults to a sample recording located under ``project_path``.
    """
    default_filename = (
        project_path
        / "data/examples/mix/speech/000f62f5-5b05-4494-a8db-0eaca3ebd871_th-TH_1678353399860.wav"
    )

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--filename",
        default=default_filename.as_posix(),
        type=str,
    )
    return arg_parser.parse_args()
25
+
26
+
27
def audio_trim_by_pydub(filename: str,
                        silence_threshold: int = -40,
                        min_silence_len: float = 1000,
                        min_kept_silence: float = 200,
                        mode: str = "trim"
                        ):
    """Trim leading and/or trailing silence from a WAV file with pydub.

    Args:
        filename: path of the input WAV file.
        silence_threshold: level passed to ``detect_silence`` as
            ``silence_thresh``.
        min_silence_len: minimum duration (ms) for a stretch to be reported as
            silence by ``detect_silence``.
        min_kept_silence: amount of silence (ms) left in place next to the
            retained audio at each trimmed edge.
        mode: ``"trim"`` trims both ends, ``"ltrim"`` only the beginning,
            ``"rtrim"`` only the end. Any other value behaves like ``"trim"``.

    Returns:
        ``(output_file, ext)``: the path of the exported WAV file in the
        system temp directory, and a report dict with the keys
        ``"begin (ms)"``, ``"end (ms)"`` and ``"origin_length (ms)"``.
    """
    # Values coming from JSON may be floats (e.g. 200.0); detect_silence
    # builds range() objects from the duration, which requires integers.
    min_silence_len = int(min_silence_len)
    min_kept_silence = int(min_kept_silence)

    audio = AudioSegment.from_file(filename, format="wav")
    length = len(audio)

    silent_ranges = detect_silence(audio, min_silence_len=min_silence_len, silence_thresh=silence_threshold)

    output_dir = Path(tempfile.gettempdir()) / "audio_edit/trim"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{uuid.uuid4()}.wav"
    output_file = output_file.as_posix()

    begin = 0
    end = length
    if silent_ranges:
        # Only a silent range that touches the very start / very end of the
        # recording is trimmed; silence in the middle is left alone.
        first_start, first_end = silent_ranges[0]
        if first_start == 0 and mode != "rtrim":
            begin = max(0, first_end - min_kept_silence)

        last_start, last_end = silent_ranges[-1]
        if last_end == length and mode != "ltrim":
            end = min(length, last_start + min_kept_silence)

        # When one silent range covers the whole recording the two cuts can
        # cross; collapse to an empty range so the report stays ordered.
        begin = min(begin, end)

    # pydub's export() defaults to mp3; the output file is named *.wav, so the
    # container has to be requested explicitly.
    audio[begin:end].export(output_file, format="wav")

    # The same keys are used whether or not any silence was found.
    ext = {
        "begin (ms)": begin,
        "end (ms)": end,
        "origin_length (ms)": length,
    }
    return output_file, ext
78
+
79
+
80
def audio_trim_by_librosa(filename: str,
                          sample_rate: int = None,
                          top_db: float = 60,
                          frame_length: int = 2048,
                          hop_length: int = 512,
                          mode: str = "trim",
                          **kwargs
                          ):
    """Trim leading and/or trailing silence from an audio file with librosa.

    Args:
        filename: path of the input audio file.
        sample_rate: target sample rate passed to ``librosa.load`` as ``sr``
            (``None`` is forwarded unchanged).
        top_db: forwarded to ``librosa.effects.trim``.
        frame_length: forwarded to ``librosa.effects.trim``.
        hop_length: forwarded to ``librosa.effects.trim``.
        mode: ``"trim"`` trims both ends, ``"ltrim"`` only the beginning,
            ``"rtrim"`` only the end. Any other value behaves like ``"trim"``.
        **kwargs: extra keyword arguments forwarded to
            ``librosa.effects.trim``.

    Returns:
        ``(output_file, ext)``: the path of the written WAV file in the
        system temp directory, and a report dict with the keys ``"begin"``,
        ``"end"`` and ``"origin_length"``, all measured in samples.
    """
    signal, sample_rate = librosa.load(filename, sr=sample_rate, mono=False)
    # With mono=False a multi-channel file is loaded as (channels, samples),
    # so the sample count is the size of the LAST axis, not len(signal).
    length = int(signal.shape[-1])

    _, index = librosa.effects.trim(
        signal,
        top_db=top_db, frame_length=frame_length,
        hop_length=hop_length,
        **kwargs
    )

    output_dir = Path(tempfile.gettempdir()) / "audio_edit/trim"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{uuid.uuid4()}.wav"
    output_file = output_file.as_posix()

    # Convert the numpy integers to plain ints so callers can serialize the
    # report with json.dumps.
    begin = int(index[0])
    end = int(index[1])

    if mode == "rtrim":
        begin = 0
    elif mode == "ltrim":
        end = length

    # Slice along the time (last) axis so every channel is kept.
    signal_trimmed = signal[..., begin:end]
    if signal_trimmed.ndim > 1:
        # scipy's WAV writer expects (samples, channels).
        signal_trimmed = signal_trimmed.T

    wavfile.write(
        output_file,
        rate=sample_rate,
        data=signal_trimmed
    )

    ext = {
        "begin": begin,
        "end": end,
        "origin_length": length,
    }
    return output_file, ext
126
+
127
+
128
# Maps each engine name accepted by `audio_trim` to its implementation.
engine_to_function = dict(
    pydub=audio_trim_by_pydub,
    librosa=audio_trim_by_librosa,
)


def audio_trim(filename: str, engine: str = "librosa", **kwargs):
    """Trim silence from ``filename`` using the selected engine.

    Args:
        filename: path of the input audio file.
        engine: key of ``engine_to_function`` selecting the implementation.
        **kwargs: options forwarded unchanged to the engine function.

    Returns:
        Whatever the engine function returns (``(output_file, ext)``).

    Raises:
        AssertionError: if ``engine`` is not a registered engine name.
    """
    if engine not in engine_to_function:
        raise AssertionError(f"invalid engine: {engine}")

    trim_fn = engine_to_function[engine]
    return trim_fn(filename, **kwargs)
140
+
141
+
142
def main():
    """Trim the file named on the command line with pydub and print the output path."""
    cli_args = get_args()

    # Alternative engine, kept for manual experiments:
    # output_file, ext = audio_trim(args.filename, engine="librosa")
    trimmed_path, _report = audio_trim(cli_args.filename, engine="pydub")
    print(trimmed_path)


if __name__ == "__main__":
    main()