Spaces:

akhaliq
/

SpecVQGAN_Neural_Audio_Codec

Runtime error

App Files Files Community

Ahsen Khaliq committed on Oct 19, 2021

Commit

7c0b7db

1 Parent(s): 8c22980

Create app.py

Browse files

Files changed (1) hide show

app.py +86 -0

app.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import os
+os.system("git clone https://github.com/v-iashin/SpecVQGAN")
+os.system("pip install pytorch-lightning==1.2.10 omegaconf==2.0.6 streamlit==0.80 matplotlib==3.4.1 albumentations==0.5.2 SoundFile torch")
+from pathlib import Path
+import soundfile
+import torch
+os.chdir("SpecVQGAN")
+from feature_extraction.demo_utils import (calculate_codebook_bitrate,
+                                           extract_melspectrogram,
+                                           get_audio_file_bitrate,
+                                           get_duration,
+                                           load_neural_audio_codec)
+from sample_visualization import tensor_to_plt
+from torch.utils.data.dataloader import default_collate
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model_name = '2021-05-19T22-16-54_vggsound_codebook'
+log_dir = './logs'
+# loading the models might take a few minutes
+config, model, vocoder = load_neural_audio_codec(model_name, log_dir, device)
+def inference(audio):
+  # Select an Audio
+  input_wav = audio.name
+  # Spectrogram Extraction
+  model_sr = config.data.params.sample_rate
+  duration = get_duration(input_wav)
+  spec = extract_melspectrogram(input_wav, sr=model_sr, duration=duration)
+  print(f'Audio Duration: {duration} seconds')
+  print('Original Spectrogram Shape:', spec.shape)
+  # Prepare Input
+  spectrogram = {'input': spec}
+  batch = default_collate([spectrogram])
+  batch['image'] = batch['input'].to(device)
+  x = model.get_input(batch, 'image')
+  with torch.no_grad():
+    quant_z, diff, info = model.encode(x)
+    xrec = model.decode(quant_z)
+  print('Compressed representation (it is all you need to recover the audio):')
+  F, T = quant_z.shape[-2:]
+  print(info[2].reshape(F, T))
+    # Calculate Bitrate
+  bitrate = calculate_codebook_bitrate(duration, quant_z, model.quantize.n_e)
+  orig_bitrate = get_audio_file_bitrate(input_wav)
+  # Save and Display
+  x = x.squeeze(0)
+  xrec = xrec.squeeze(0)
+  # specs are in [-1, 1], making them in [0, 1]
+  wav_x = vocoder((x + 1) / 2).squeeze().detach().cpu().numpy()
+  wav_xrec = vocoder((xrec + 1) / 2).squeeze().detach().cpu().numpy()
+  # Creating a temp folder which will hold the results
+  tmp_dir = os.path.join('./tmp/neural_audio_codec', Path(input_wav).parent.stem)
+  os.makedirs(tmp_dir, exist_ok=True)
+  # Save paths
+  x_save_path = Path(tmp_dir) / 'vocoded_orig_spec.wav'
+  xrec_save_path = Path(tmp_dir) / f'specvqgan_{bitrate:.2f}kbps.wav'
+  # Save
+  soundfile.write(x_save_path, wav_x, model_sr, 'PCM_16')
+  soundfile.write(xrec_save_path, wav_xrec, model_sr, 'PCM_16')
+  return './tmp/neural_audio_codec/vocoded_orig_spec.wav', "./tmp/neural_audio_codec/"+f'specvqgan_{bitrate:.2f}kbps.wav'
+title = "Anime2Sketch"
+description = "demo for Anime2Sketch. To use it, simply upload your image, or click one of the examples to load them. Read more at the links below."
+article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2104.05703'>Adversarial Open Domain Adaption for Sketch-to-Photo Synthesis</a> | <a href='https://github.com/Mukosame/Anime2Sketch'>Github Repo</a></p>"
+gr.Interface(
+    inference,
+    gr.inputs.Audio(type="file", label="Input Audio"),
+    [gr.outputs.Audio(type="file", label="Original audio"),gr.outputs.Audio(type="file", label="Reconstructed audio")],
+    title=title,
+    description=description,
+    article=article,
+    enable_queue=True
+    ).launch(debug=True)