haoheliu committed on
Commit
c8989e8
·
verified ·
1 Parent(s): 1f34ab8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -11
app.py CHANGED
@@ -13,12 +13,13 @@ import os
13
  DEFAULT_TOKEN_RATE = 100
14
  DEFAULT_SEMANTIC_VOCAB_SIZE = 16384
15
  DEFAULT_SAMPLE_RATE = 16000
 
16
  device = "cuda" if torch.cuda.is_available() else "cpu"
17
 
18
  # Title and Description
19
  st.title("SemantiCodec: Ultra-Low Bitrate Neural Audio Codec")
20
  st.write("""
21
- Upload your audio file, adjust the codec parameters, and compare the original and reconstructed audio.
22
  SemantiCodec achieves high-quality audio reconstruction with ultra-low bitrates!
23
  """)
24
 
@@ -34,7 +35,7 @@ ddim_steps = st.sidebar.slider("DDIM Sampling Steps", 10, 100, 50, step=5)
34
  guidance_scale = st.sidebar.slider("CFG Guidance Scale", 0.5, 5.0, 2.0, step=0.1)
35
 
36
  # Upload Audio File
37
- uploaded_file = st.file_uploader("Upload an audio file (WAV format)", type=["wav"])
38
 
39
  # Helper function: Plot spectrogram
40
  def plot_spectrogram(waveform, sample_rate, title):
@@ -57,7 +58,7 @@ if uploaded_file and st.button("Run SemantiCodec"):
57
 
58
  # Load audio
59
  waveform, sample_rate = torchaudio.load(input_path)
60
-
61
  # Check if resampling is needed
62
  if sample_rate != DEFAULT_SAMPLE_RATE:
63
  st.write(f"Resampling audio from {sample_rate} Hz to {DEFAULT_SAMPLE_RATE} Hz...")
@@ -65,12 +66,23 @@ if uploaded_file and st.button("Run SemantiCodec"):
65
  waveform = resampler(waveform)
66
  sample_rate = DEFAULT_SAMPLE_RATE # Update sample rate to 16kHz
67
 
 
 
 
 
 
 
 
68
  # Convert to numpy for librosa compatibility
69
- waveform = waveform[0].numpy()
70
 
71
- # Plot Original Spectrogram (16kHz resampled)
72
- st.write("Original Audio Spectrogram (Resampled to 16kHz):")
73
- plot_spectrogram(waveform, sample_rate, "Original Audio Spectrogram (Resampled to 16kHz)")
 
 
 
 
74
 
75
  # Initialize SemantiCodec
76
  st.write("Initializing SemantiCodec...")
@@ -86,7 +98,7 @@ if uploaded_file and st.button("Run SemantiCodec"):
86
 
87
  # Encode and Decode
88
  st.write("Encoding and Decoding Audio...")
89
- tokens = semanticodec.encode(input_path)
90
  reconstructed_waveform = semanticodec.decode(tokens)[0, 0]
91
 
92
  # Save reconstructed audio
@@ -101,8 +113,8 @@ if uploaded_file and st.button("Run SemantiCodec"):
101
  st.write(f"Shape of Latent Code: {tokens.shape}")
102
 
103
  # Audio Players
104
- st.audio(input_path, format="audio/wav")
105
- st.write("Original Audio")
106
  st.audio(reconstructed_path, format="audio/wav")
107
  st.write("Reconstructed Audio")
108
 
@@ -113,6 +125,5 @@ if uploaded_file and st.button("Run SemantiCodec"):
113
  file_name="reconstructed_audio.wav",
114
  )
115
 
116
-
117
  # Footer
118
  st.write("Built with [Streamlit](https://streamlit.io) and SemantiCodec")
 
13
  DEFAULT_TOKEN_RATE = 100
14
  DEFAULT_SEMANTIC_VOCAB_SIZE = 16384
15
  DEFAULT_SAMPLE_RATE = 16000
16
+ MAX_DURATION_SECONDS = 30 # Maximum allowed duration
17
  device = "cuda" if torch.cuda.is_available() else "cpu"
18
 
19
  # Title and Description
20
  st.title("SemantiCodec: Ultra-Low Bitrate Neural Audio Codec")
21
  st.write("""
22
+ Upload your audio file (up to 30 seconds), adjust the codec parameters, and compare the original and reconstructed audio.
23
  SemantiCodec achieves high-quality audio reconstruction with ultra-low bitrates!
24
  """)
25
 
 
35
  guidance_scale = st.sidebar.slider("CFG Guidance Scale", 0.5, 5.0, 2.0, step=0.1)
36
 
37
  # Upload Audio File
38
+ uploaded_file = st.file_uploader("Upload an audio file (WAV format, up to 30 seconds)", type=["wav"])
39
 
40
  # Helper function: Plot spectrogram
41
  def plot_spectrogram(waveform, sample_rate, title):
 
58
 
59
  # Load audio
60
  waveform, sample_rate = torchaudio.load(input_path)
61
+
62
  # Check if resampling is needed
63
  if sample_rate != DEFAULT_SAMPLE_RATE:
64
  st.write(f"Resampling audio from {sample_rate} Hz to {DEFAULT_SAMPLE_RATE} Hz...")
 
66
  waveform = resampler(waveform)
67
  sample_rate = DEFAULT_SAMPLE_RATE # Update sample rate to 16kHz
68
 
69
+ # Check and limit duration
70
+ num_samples = waveform.size(1)
71
+ max_samples = MAX_DURATION_SECONDS * sample_rate # 30 seconds limit
72
+ if num_samples > max_samples:
73
+ st.write(f"Truncating audio to the first {MAX_DURATION_SECONDS} seconds...")
74
+ waveform = waveform[:, :max_samples]
75
+
76
  # Convert to numpy for librosa compatibility
77
+ waveform_np = waveform[0].numpy()
78
 
79
+ # Plot Original Spectrogram (16kHz resampled and truncated)
80
+ st.write(f"Original Audio Spectrogram (Resampled and limited to {MAX_DURATION_SECONDS} seconds):")
81
+ plot_spectrogram(waveform_np, sample_rate, f"Original Audio Spectrogram (Resampled to {DEFAULT_SAMPLE_RATE} Hz)")
82
+
83
+ # Save truncated audio for processing
84
+ truncated_path = os.path.join(temp_dir, "truncated_input.wav")
85
+ torchaudio.save(truncated_path, waveform, sample_rate)
86
 
87
  # Initialize SemantiCodec
88
  st.write("Initializing SemantiCodec...")
 
98
 
99
  # Encode and Decode
100
  st.write("Encoding and Decoding Audio...")
101
+ tokens = semanticodec.encode(truncated_path)
102
  reconstructed_waveform = semanticodec.decode(tokens)[0, 0]
103
 
104
  # Save reconstructed audio
 
113
  st.write(f"Shape of Latent Code: {tokens.shape}")
114
 
115
  # Audio Players
116
+ st.audio(truncated_path, format="audio/wav")
117
+ st.write("Original Audio (Truncated)")
118
  st.audio(reconstructed_path, format="audio/wav")
119
  st.write("Reconstructed Audio")
120
 
 
125
  file_name="reconstructed_audio.wav",
126
  )
127
 
 
128
  # Footer
129
  st.write("Built with [Streamlit](https://streamlit.io) and SemantiCodec")