# File size: 15,252 Bytes

# b2a4a92

import io
import tempfile
from typing import Sequence

import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pydub
import soundfile as sf
import torch
import torchaudio
from PIL import Image
from scipy.io import wavfile


# Step 1: Encode Audio to Mel-Spectrogram
def encode_audio_to_mel_spectrogram(audio_file, n_mels=128):
    """
    Turn an audio file into a dB-scaled mel-spectrogram.

    Parameters:
    - audio_file: Path to the audio file.
    - n_mels: Number of mel bands (default: 128).

    Returns:
    - mel_spectrogram_db: Mel power spectrogram converted to dB, using the
      loudest bin as the reference level (ref=np.max).
    - sample_rate: Sample rate returned by the loader.
    """
    # sr=None asks librosa to keep the file's own rate instead of resampling.
    waveform, native_rate = librosa.load(audio_file, sr=None)
    mel_power = librosa.feature.melspectrogram(y=waveform, sr=native_rate, n_mels=n_mels)
    return librosa.power_to_db(mel_power, ref=np.max), native_rate

# Improved Step 2: Save Mel-Spectrogram as Image
def save_mel_spectrogram_image(mel_spectrogram_db, sample_rate, output_image='mel_spectrogram.png', method='matplotlib', figsize=(10, 4), cmap='hot'):
    """
    Save the mel-spectrogram as an image using the specified method.

    Parameters:
    - mel_spectrogram_db: Mel-spectrogram in dB scale.
    - sample_rate: Sample rate of the audio file (used for the matplotlib axes).
    - output_image: Path to save the image.
    - method: Method for saving ('matplotlib' or 'custom').
      'matplotlib' renders a plot with axes, a colorbar and a title, so the
      saved pixels are not a one-to-one copy of the spectrogram bins.
      'custom' converts the spectrogram to power and writes it through
      image_from_spectrogram.
    - figsize: Size of the figure for matplotlib (default: (10, 4)).
    - cmap: Colormap for the spectrogram (default: 'hot').

    Raises:
    - ValueError: If method is neither 'matplotlib' nor 'custom'.
    """
    # Reject a bad method before touching matplotlib or the filesystem.
    if method not in ('matplotlib', 'custom'):
        raise ValueError("Invalid method. Choose 'matplotlib' or 'custom'.")

    if method == 'matplotlib':
        fig = plt.figure(figsize=figsize)
        try:
            librosa.display.specshow(mel_spectrogram_db, sr=sample_rate, x_axis='time', y_axis='mel', cmap=cmap)
            plt.colorbar(format='%+2.0f dB')
            plt.title('Mel-Spectrogram')
            plt.savefig(output_image)
        finally:
            # pyplot keeps every figure alive until it is closed, so close it
            # even when rendering or saving fails.
            plt.close(fig)
        print(f"Mel-spectrogram image saved using matplotlib as '{output_image}'")
        return

    # method == 'custom'
    # Convert dB scale to linear (power) scale for image generation
    mel_spectrogram_linear = librosa.db_to_power(mel_spectrogram_db)
    # Add a leading channel axis: image_from_spectrogram expects (channels, frequency, time)
    image = image_from_spectrogram(mel_spectrogram_linear[np.newaxis, ...])
    image.save(output_image)
    print(f"Mel-spectrogram image saved using custom method as '{output_image}'")


# Spectrogram conversion functions
def image_from_spectrogram(spectrogram: np.ndarray, power: float = 0.25) -> Image.Image:
    """
    Compute a spectrogram image from a spectrogram magnitude array.

    The array is normalised by its maximum, passed through a power curve,
    scaled to 8-bit, inverted (large values become dark) and flipped
    vertically.

    Args:
        spectrogram: (channels, frequency, time) with 1 or 2 channels
        power: A power curve to apply to the spectrogram to preserve contrast

    Returns:
        image: (frequency, time, channels) RGB image. One channel is stored as
        greyscale converted to RGB; two channels go to the green and blue
        planes with red left at zero.

    Raises:
        NotImplementedError: If the channel count is not 1 or 2.
    """
    # Rescale to 0-1. A peak that is zero (silent input) or otherwise not
    # positive would make the division produce NaN/inf, so treat the whole
    # array as zero instead.
    peak = np.max(spectrogram)
    if peak > 0:
        scaled = spectrogram / peak
    else:
        scaled = np.zeros_like(spectrogram, dtype=np.float64)

    # Apply the power curve
    scaled = np.power(scaled, power)

    # Rescale to 0-255 and invert
    pixels = 255 - (scaled * 255).astype(np.uint8)

    # Convert to a PIL image
    channels = pixels.shape[0]
    if channels == 1:
        image = Image.fromarray(pixels[0], mode="L").convert("RGB")
    elif channels == 2:
        rgb = np.stack([np.zeros_like(pixels[0]), pixels[0], pixels[1]], axis=-1)
        image = Image.fromarray(rgb, mode="RGB")
    else:
        raise NotImplementedError(f"Unsupported number of channels: {channels}")

    # Flip Y so that row 0 of the array ends up at the bottom of the image
    return image.transpose(Image.FLIP_TOP_BOTTOM)


# Step 3: Extract Mel-Spectrogram from Image (Direct Pixel Manipulation)
def extract_mel_spectrogram_from_image(image_path, min_db=-80.0):
    """
    Extract a mel-spectrogram from a saved image using pixel manipulation.

    The image is read as 8-bit greyscale and every pixel is mapped linearly
    onto a dB range: pixel 0 becomes 0 dB and pixel 255 becomes min_db.

    NOTE(review): this mapping does not undo the power curve or the vertical
    flip applied by image_from_spectrogram, and it does not strip the axes,
    colorbar or title of a matplotlib-rendered figure - confirm the intended
    round trip with the caller.

    Parameters:
    - image_path: Path to the spectrogram image file.
    - min_db: dB value assigned to a pixel of 255 (default: -80.0).

    Returns:
    - mel_spectrogram_db: The extracted array in dB scale, one value per pixel.
    """
    # The context manager releases the file handle as soon as the pixels are read.
    with Image.open(image_path) as img:
        pixels = np.array(img.convert('L'))
    return pixels / 255.0 * min_db

# Alternative to Step 3: reconstruct audio directly from the in-memory mel-spectrogram (no image round trip)
def extract_spectrogram_with_ifft(mel_spectrogram_db, sample_rate=22050):
    """
    Reconstruct an audio signal directly from a dB-scaled mel-spectrogram.

    Despite the name, the result is a waveform, not a spectrogram. The
    inversion is delegated to librosa's mel_to_audio, which estimates the
    missing phase rather than performing a plain inverse FFT.

    Parameters:
    - mel_spectrogram_db: The mel-spectrogram in dB scale.
    - sample_rate: Sample rate the spectrogram was computed at. Defaults to
      22050, the value mel_to_audio uses when no rate is given, so existing
      one-argument calls behave as before.

    Returns:
    - audio: The reconstructed audio signal.
    """
    # Convert dB mel-spectrogram back to linear (power) scale
    mel_power = librosa.db_to_power(mel_spectrogram_db)

    # The rate must match the one used for encoding, otherwise the mel bands
    # are mapped back to the wrong frequencies.
    return librosa.feature.inverse.mel_to_audio(mel_power, sr=sample_rate)

# Step 4: Decode Mel-Spectrogram with Griffin-Lim
def decode_mel_spectrogram_to_audio(mel_spectrogram_db, sample_rate, output_audio='griffin_reconstructed_audio.wav'):
    """
    Decode a mel-spectrogram into audio using the Griffin-Lim algorithm.

    Parameters:
    - mel_spectrogram_db: The mel-spectrogram in dB scale.
    - sample_rate: The sample rate for the audio file.
    - output_audio: Path to save the reconstructed audio file.

    Returns:
    - audio: The reconstructed audio signal (also written to output_audio).
    """
    # Convert dB mel-spectrogram back to linear (power) scale
    mel_power = librosa.db_to_power(mel_spectrogram_db)
    # griffinlim expects a linear-frequency STFT magnitude, not mel bands.
    # mel_to_audio first maps the mel bands back to an STFT magnitude and then
    # runs Griffin-Lim on that.
    audio = librosa.feature.inverse.mel_to_audio(mel_power, sr=sample_rate)
    # Save the generated audio
    sf.write(output_audio, audio, sample_rate)
    print(f"Griffin-Lim reconstructed audio saved as '{output_audio}'")
    return audio

# Step 5: Load MelGAN Vocoder
def load_melgan_vocoder():
    """
    Build a MelGAN vocoder from torchaudio.models and put it in eval mode.

    No checkpoint is loaded here: the class is instantiated with its default
    arguments, so the returned model only carries whatever weights its
    constructor provides.

    NOTE(review): torchaudio's published model list does not appear to include
    a MelGAN class - confirm which package is meant to supply it.

    Returns:
    - model: The vocoder in evaluation mode.

    Raises:
    - AttributeError: If torchaudio.models exposes no MelGAN class.
    """
    melgan_cls = getattr(torchaudio.models, "MelGAN", None)
    if melgan_cls is None:
        # Same exception type as the bare attribute access would raise, but
        # with a message that tells the user what is actually missing.
        raise AttributeError(
            "torchaudio.models has no 'MelGAN' class in this torchaudio build; "
            "a MelGAN implementation and its weights must be supplied separately."
        )
    model = melgan_cls()
    model.eval()  # Ensure the model is in evaluation mode
    return model

# Step 6: Decode Mel-Spectrogram with MelGAN
def decode_mel_spectrogram_with_melgan(mel_spectrogram_db, sample_rate, output_audio='melgan_reconstructed_audio.wav'):
    """
    Decode a mel-spectrogram into audio using the MelGAN vocoder.

    Parameters:
    - mel_spectrogram_db: The mel-spectrogram in dB scale.
    - sample_rate: The sample rate for the audio file.
    - output_audio: Path to save the reconstructed audio file.

    Returns:
    - audio: The reconstructed audio signal (also written to output_audio).
    """
    # Convert dB mel-spectrogram back to linear (power) scale
    mel_power = librosa.db_to_power(mel_spectrogram_db)
    # Force float32: spectrograms recovered from image pixels are float64, and
    # a float64 input fails against float32 parameters (PyTorch's default
    # dtype; assumed for the vocoder).
    # Shape after unsqueeze: [1, mel_bins, time_frames]
    mel_tensor = torch.as_tensor(mel_power, dtype=torch.float32).unsqueeze(0)

    # Load the MelGAN vocoder model
    melgan = load_melgan_vocoder()

    # Pass the mel-spectrogram through MelGAN to generate audio
    with torch.no_grad():
        # squeeze() drops the size-1 dimensions; cpu() makes the numpy
        # conversion valid wherever the model output lives.
        audio = melgan(mel_tensor).squeeze().cpu().numpy()

    # Save the generated audio
    sf.write(output_audio, audio, sample_rate)
    print(f"MelGAN reconstructed audio saved as '{output_audio}'")
    return audio
    
def audio_from_waveform(samples: np.ndarray, sample_rate: int, normalize: bool = False) -> pydub.AudioSegment:
    """
    Convert a numpy array of samples of a waveform to an audio segment.

    The input array is never modified.

    Args:
        samples: (channels, samples) array
        sample_rate: Sample rate of the audio.
        normalize: Flag to scale the waveform so its peak reaches the int16
            maximum. Silent input (peak of zero) is left unscaled.

    Returns:
        pydub.AudioSegment
    """
    # Normalize volume to fit in int16
    if normalize:
        peak = np.max(np.abs(samples))
        # Skip silent input: dividing by a zero peak would produce inf/NaN.
        if peak > 0:
            # Build a new array instead of `*=`, which would overwrite the
            # caller's data and fail outright on integer arrays.
            samples = samples * (np.iinfo(np.int16).max / peak)

    # Transpose to (samples, channels) and convert to int16
    pcm = samples.transpose(1, 0).astype(np.int16)

    # Write to the bytes of a WAV file
    wav_bytes = io.BytesIO()
    wavfile.write(wav_bytes, sample_rate, pcm)
    wav_bytes.seek(0)

    # Read into pydub
    return pydub.AudioSegment.from_wav(wav_bytes)


def apply_filters(segment: pydub.AudioSegment, compression: bool = False) -> pydub.AudioSegment:
    """
    Post-process an audio segment: optional compression, then levelling.

    With compression enabled the segment is normalized, moved to -10 dBFS and
    run through a dynamic range compressor. In every case the result is then
    moved to -12 dBFS and normalized once more with 0.1 dB of headroom.

    Args:
        segment: The audio segment to filter.
        compression: Flag to apply dynamic range compression.

    Returns:
        pydub.AudioSegment
    """
    processed = segment

    if compression:
        processed = pydub.effects.normalize(processed, headroom=0.1)
        gain_to_minus_ten = -10 - processed.dBFS
        processed = processed.apply_gain(gain_to_minus_ten)
        processed = pydub.effects.compress_dynamic_range(
            processed,
            threshold=-20.0,
            ratio=4.0,
            attack=5.0,
            release=50.0,
        )

    # Bring the segment to the target level, then normalize again
    target_dbfs = -12
    levelled = processed.apply_gain(target_dbfs - processed.dBFS)
    return pydub.effects.normalize(levelled, headroom=0.1)


def stitch_segments(segments: Sequence[pydub.AudioSegment], crossfade_s: float) -> pydub.AudioSegment:
    """
    Join audio segments end to end, crossfading at every seam.

    Args:
        segments: Sequence of audio segments to stitch, in playback order.
        crossfade_s: Duration of crossfade in seconds (truncated to whole
            milliseconds).

    Returns:
        pydub.AudioSegment
    """
    overlap_ms = int(crossfade_s * 1000)

    # Start from the first segment and append each later one in turn.
    stitched = segments[0]
    following = segments[1:]
    for piece in following:
        stitched = stitched.append(piece, crossfade=overlap_ms)
    return stitched


def overlay_segments(segments: Sequence[pydub.AudioSegment]) -> pydub.AudioSegment:
    """
    Mix audio segments by laying each one over the first.

    Args:
        segments: Sequence of audio segments to overlay; must not be empty.

    Returns:
        pydub.AudioSegment
    """
    assert len(segments) > 0

    # The first segment is the base; every later one is overlaid onto the
    # running mix.
    mixed = segments[0]
    layers = segments[1:]
    for layer in layers:
        mixed = mixed.overlay(layer)
    return mixed



# Step 7: Full Pipeline for Audio Processing with Customization
def mel_spectrogram_pipeline(audio_file, output_image='mel_spectrogram.png',
                             output_audio_griffin='griffin_reconstructed_audio.wav',
                             output_audio_melgan='melgan_reconstructed_audio.wav',
                             extraction_method='pixel',  # 'pixel' or 'ifft'
                             decoding_method='griffin'):  # 'griffin' or 'melgan'
    """
    Full pipeline to encode audio to a mel-spectrogram, save it as an image,
    and reconstruct audio again using the selected methods.

    With extraction_method='pixel' the spectrogram is read back from the saved
    image and handed to the selected decoder. With extraction_method='ifft'
    the in-memory spectrogram is inverted straight to a waveform by
    extract_spectrogram_with_ifft; that waveform already is the
    reconstruction, so no further decoder runs and it is written to the output
    path belonging to the selected decoding_method.

    Parameters:
    - audio_file: Path to the audio file to be processed.
    - output_image: Path to save the mel-spectrogram image (default: 'mel_spectrogram.png').
    - output_audio_griffin: Path to save the audio when decoding_method is 'griffin'.
    - output_audio_melgan: Path to save the audio when decoding_method is 'melgan'.
    - extraction_method: Method for extraction ('pixel' or 'ifft').
    - decoding_method: Method for decoding ('griffin' or 'melgan').

    Raises:
    - ValueError: If extraction_method or decoding_method is not one of the
      listed choices. Raised before any file is read or written.
    """
    # Fail fast: validate both choices before loading audio or writing files.
    if extraction_method not in ('pixel', 'ifft'):
        raise ValueError("Invalid extraction method. Choose 'pixel' or 'ifft'.")
    if decoding_method not in ('griffin', 'melgan'):
        raise ValueError("Invalid decoding method. Choose 'griffin' or 'melgan'.")

    # Step 1: Encode (Audio -> Mel-Spectrogram)
    mel_spectrogram_db, sample_rate = encode_audio_to_mel_spectrogram(audio_file)

    # Step 2: Convert Mel-Spectrogram to Image and save it
    # NOTE(review): the default matplotlib rendering includes axes and a
    # colorbar, so a 'pixel' extraction reads those pixels too - confirm
    # whether method='custom' was intended for the round trip.
    save_mel_spectrogram_image(mel_spectrogram_db, sample_rate, output_image)

    if extraction_method == 'ifft':
        # extract_spectrogram_with_ifft returns a waveform, not a dB
        # spectrogram, so passing it to a decoder would crash. Write it out
        # as the final result instead.
        # NOTE(review): the helper is called without a sample rate here -
        # confirm it inverts at the same rate the audio was loaded with.
        audio = extract_spectrogram_with_ifft(mel_spectrogram_db)
        output_audio = output_audio_griffin if decoding_method == 'griffin' else output_audio_melgan
        sf.write(output_audio, audio, sample_rate)
        print(f"Directly inverted audio saved as '{output_audio}'")
        return

    # Step 3: Extract Mel-Spectrogram from the saved image
    extracted_mel_spectrogram_db = extract_mel_spectrogram_from_image(output_image)

    # Step 4: Decode with the chosen decoder
    if decoding_method == 'griffin':
        decode_mel_spectrogram_to_audio(extracted_mel_spectrogram_db, sample_rate, output_audio_griffin)
    else:
        decode_mel_spectrogram_with_melgan(extracted_mel_spectrogram_db, sample_rate, output_audio_melgan)


def _reserve_temp_path(suffix):
    """Create an empty temporary file that outlives this call and return its path."""
    # delete=False keeps the file on disk after the handle is closed, so the
    # path remains valid for whoever receives it.
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
        return handle.name


def process_audio(audio_file, extraction_method, decoding_method):
    """
    Gradio callback: run the mel-spectrogram pipeline on an uploaded file.

    Only the decoder selected by decoding_method is executed. The work is
    delegated to mel_spectrogram_pipeline, which also validates the two
    method names.

    Parameters:
    - audio_file: Path to the uploaded audio file.
    - extraction_method: 'pixel' or 'ifft'.
    - decoding_method: 'griffin' or 'melgan'.

    Returns:
    - (image_path, audio_path): Paths of the saved spectrogram image and the
      reconstructed audio. Both are temporary files that are NOT removed by
      this function, because the caller still has to read them.

    Raises:
    - ValueError: If no audio file was supplied, or (from the pipeline) if a
      method name is invalid.
    """
    if audio_file is None:
        raise ValueError("No audio file provided.")

    # The previous implementation returned names of temporary files that were
    # deleted when its `with` block exited; these paths stay valid.
    image_path = _reserve_temp_path(".png")
    audio_path = _reserve_temp_path(".wav")

    # Exactly one audio file is written per run, so both decoder outputs can
    # share the same destination.
    mel_spectrogram_pipeline(
        audio_file,
        output_image=image_path,
        output_audio_griffin=audio_path,
        output_audio_melgan=audio_path,
        extraction_method=extraction_method,
        decoding_method=decoding_method,
    )

    return (image_path, audio_path)

# Create Gradio interface
# Input widgets, in the order process_audio receives them:
# audio path, extraction method, decoding method.
_interface_inputs = [
    gr.Audio(type="filepath", label="Upload Audio"),
    gr.Radio(["pixel", "ifft"], label="Extraction Method", value="pixel"),
    gr.Radio(["griffin", "melgan"], label="Decoding Method", value="griffin"),
]

# Output widgets, matching the (image path, audio path) tuple process_audio returns.
_interface_outputs = [
    gr.Image(type="filepath", label="Mel-Spectrogram"),
    gr.Audio(type="filepath", label="Reconstructed Audio"),
]

iface = gr.Interface(
    fn=process_audio,
    inputs=_interface_inputs,
    outputs=_interface_outputs,
    title="Audio Encoder-Decoder",
    description="Upload an audio file to encode it to a mel-spectrogram and then decode it back to audio.",
)

# Launch the app
# NOTE(review): this runs at module level, so it also runs whenever the file
# is imported - confirm that is intended.
iface.launch()


# Example usage(TEST)
if __name__ == "__main__":
    # Manual smoke test of the pipeline, run only when the file is executed as
    # a script. It comes after the module-level iface.launch() call.
    audio_file_path = 'your_audio_file.wav'  # Specify the path to your audio file here

    pipeline_options = {
        'output_image': 'mel_spectrogram.png',
        'output_audio_griffin': 'griffin_reconstructed_audio.wav',
        'output_audio_melgan': 'melgan_reconstructed_audio.wav',
        'extraction_method': 'pixel',  # Choose 'pixel' or 'ifft'
        'decoding_method': 'griffin',  # Choose 'griffin' or 'melgan'
    }
    mel_spectrogram_pipeline(audio_file_path, **pipeline_options)