---
tags:
- mergekit
- Merge
- Mistral_Star
- Mistral_Quiet
- Mistral
- Mixtral
- Question-Answer
- Token-Classification
- Sequence-Classification
- SpydazWeb-AI
- chemistry
- biology
- legal
- code
- climate
- medical
- LCARS_AI_StarTrek_Computer
- text-generation-inference
- chain-of-thought
- tree-of-knowledge
- forest-of-thoughts
- visual-spacial-sketchpad
- alpha-mind
- knowledge-graph
- entity-detection
- encyclopedia
- wikipedia
- stack-exchange
- Reddit
- Cyber-series
- MegaMind
- Cybertron
- SpydazWeb
- Spydaz
- LCARS
- star-trek
- mega-transformers
- Mulit-Mega-Merge
- Multi-Lingual
- Afro-Centric
- African-Model
- Ancient-One
---
```python
import io
import tempfile
from typing import Sequence

import numpy as np
import torch
import librosa
import librosa.display
import matplotlib.pyplot as plt
import soundfile as sf
import pydub
import pydub.effects
import gradio as gr
from PIL import Image
from scipy.io import wavfile

# Step 1: Encode Audio to Mel-Spectrogram
def encode_audio_to_mel_spectrogram(audio_file, n_mels=128):
    """
    Encode an audio file to a mel-spectrogram.

    Parameters:
    - audio_file: Path to the audio file.
    - n_mels: Number of mel bands (default: 128).

    Returns:
    - mel_spectrogram_db: Mel-spectrogram in dB scale.
    - sample_rate: Sample rate of the audio file.
    """
    y, sample_rate = librosa.load(audio_file, sr=None)  # Load audio at its native sample rate
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sample_rate, n_mels=n_mels)
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)  # Convert power to dB
    return mel_spectrogram_db, sample_rate


# Step 2: Save Mel-Spectrogram as Image
def save_mel_spectrogram_image(mel_spectrogram_db, sample_rate, output_image='mel_spectrogram.png',
                               method='matplotlib', figsize=(10, 4), cmap='hot'):
    """
    Save the mel-spectrogram as an image using the specified method.

    Parameters:
    - mel_spectrogram_db: Mel-spectrogram in dB scale.
    - sample_rate: Sample rate of the audio file.
    - output_image: Path to save the image.
    - method: Method for saving ('matplotlib' or 'custom').
    - figsize: Size of the figure for matplotlib (default: (10, 4)).
    - cmap: Colormap for the spectrogram (default: 'hot').
    """
    if method == 'matplotlib':
        plt.figure(figsize=figsize)
        librosa.display.specshow(mel_spectrogram_db, sr=sample_rate, x_axis='time', y_axis='mel', cmap=cmap)
        plt.colorbar(format='%+2.0f dB')
        plt.title('Mel-Spectrogram')
        plt.savefig(output_image)
        plt.close()
        print(f"Mel-spectrogram image saved using matplotlib as '{output_image}'")
    elif method == 'custom':
        # Convert dB scale back to linear power for image generation
        mel_spectrogram_linear = librosa.db_to_power(mel_spectrogram_db)
        # Create an image from the mel-spectrogram (add a channel dimension)
        image = image_from_spectrogram(mel_spectrogram_linear[np.newaxis, ...])
        # Save the image
        image.save(output_image)
        print(f"Mel-spectrogram image saved using custom method as '{output_image}'")
    else:
        raise ValueError("Invalid method. Choose 'matplotlib' or 'custom'.")

# Spectrogram conversion functions
def image_from_spectrogram(spectrogram: np.ndarray, power: float = 0.25) -> Image.Image:
    """
    Compute a spectrogram image from a spectrogram magnitude array.

    Args:
        spectrogram: (channels, frequency, time)
        power: A power curve to apply to the spectrogram to preserve contrast

    Returns:
        image: (frequency, time, channels)
    """
    # Rescale to 0-1
    max_value = np.max(spectrogram)
    data = spectrogram / max_value
    # Apply the power curve
    data = np.power(data, power)
    # Rescale to 0-255 and invert (louder = darker)
    data = 255 - (data * 255).astype(np.uint8)
    # Convert to a PIL image
    if data.shape[0] == 1:
        image = Image.fromarray(data[0], mode="L").convert("RGB")
    elif data.shape[0] == 2:
        data = np.array([np.zeros_like(data[0]), data[0], data[1]]).transpose(1, 2, 0)
        image = Image.fromarray(data, mode="RGB")
    else:
        raise NotImplementedError(f"Unsupported number of channels: {data.shape[0]}")
    # Flip Y so low frequencies are at the bottom
    image = image.transpose(Image.FLIP_TOP_BOTTOM)
    return image
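
# The following inverse helper is a sketch, not part of the original pipeline:
# it assumes a single-channel image produced by image_from_spectrogram and undoes
# the Y-flip, the inversion, and the power curve. The absolute magnitude scale
# (max_value) is discarded when the image is written, so the caller must supply
# it; otherwise the result is only correct up to that factor.
def spectrogram_from_image(image: Image.Image, power: float = 0.25, max_value: float = 1.0) -> np.ndarray:
    image = image.transpose(Image.FLIP_TOP_BOTTOM)  # Undo the Y flip
    data = np.array(image.convert("L")).astype(np.float32)
    data = (255.0 - data) / 255.0                   # Undo the inversion and 0-255 scaling
    data = np.power(data, 1.0 / power)              # Undo the power curve
    return data * max_value                         # Restore the (assumed) magnitude scale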

# Step 3: Extract Mel-Spectrogram from Image (Direct Pixel Manipulation)
def extract_mel_spectrogram_from_image(image_path):
    """
    Extract a mel-spectrogram from a saved image using pixel manipulation.
    This is a rough approximation: it maps grayscale values linearly onto the
    [-80, 0] dB range and assumes the inverted convention of image_from_spectrogram
    (louder = darker). Figures saved via matplotlib also contain axes and a
    colorbar, which are not cropped out here.

    Parameters:
    - image_path: Path to the spectrogram image file.

    Returns:
    - mel_spectrogram_db: The extracted mel-spectrogram in dB scale.
    """
    img = Image.open(image_path).convert('L')  # Open image and convert to grayscale
    img_array = np.array(img)  # Convert to NumPy array
    mel_spectrogram_db = img_array / 255.0 * -80  # Scale pixel values to the [-80, 0] dB range
    return mel_spectrogram_db

# Alternative Spectrogram Extraction (IFFT Method)
def extract_spectrogram_with_ifft(mel_spectrogram_db, sample_rate, n_mels=128):
    """
    Round-trip a mel-spectrogram through the audio domain via inverse mel
    transformation. A true inverse FFT would require the phase information
    discarded earlier, so librosa's mel_to_audio estimates it with Griffin-Lim.
    The reconstructed audio is then re-encoded so downstream decoders receive
    a dB-scale mel-spectrogram, matching the pixel-based extraction path.

    Parameters:
    - mel_spectrogram_db: The mel-spectrogram in dB scale.
    - sample_rate: Sample rate of the original audio.
    - n_mels: Number of mel bands (default: 128).

    Returns:
    - mel_spectrogram_db: The re-extracted mel-spectrogram in dB scale.
    """
    # Convert dB mel-spectrogram back to linear scale
    mel_spectrogram = librosa.db_to_power(mel_spectrogram_db)
    # Reconstruct an audio signal (Griffin-Lim supplies the missing phase)
    audio = librosa.feature.inverse.mel_to_audio(mel_spectrogram, sr=sample_rate)
    # Re-encode the reconstructed audio to a dB mel-spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=n_mels)
    return librosa.power_to_db(mel_spectrogram, ref=np.max)

# Step 4: Decode Mel-Spectrogram with Griffin-Lim
def decode_mel_spectrogram_to_audio(mel_spectrogram_db, sample_rate, output_audio='griffin_reconstructed_audio.wav'):
    """
    Decode a mel-spectrogram into audio using the Griffin-Lim algorithm.

    Parameters:
    - mel_spectrogram_db: The mel-spectrogram in dB scale.
    - sample_rate: The sample rate for the audio file.
    - output_audio: Path to save the reconstructed audio file.
    """
    # Convert dB mel-spectrogram back to linear scale
    mel_spectrogram = librosa.db_to_power(mel_spectrogram_db)
    # Invert the mel filterbank and reconstruct audio. Note: librosa.griffinlim
    # expects a linear-frequency STFT magnitude, so for a mel-spectrogram we use
    # mel_to_audio, which inverts the mel basis and then runs Griffin-Lim.
    audio = librosa.feature.inverse.mel_to_audio(mel_spectrogram, sr=sample_rate)
    # Save the generated audio
    sf.write(output_audio, audio, sample_rate)
    print(f"Griffin-Lim reconstructed audio saved as '{output_audio}'")
    return audio

# Step 5: Load MelGAN Vocoder
def load_melgan_vocoder():
    """
    Load a pre-trained MelGAN vocoder for decoding mel-spectrograms.
    Note: torchaudio does not ship a MelGAN model, so this loads the community
    checkpoint from the seungwonpark/melgan repository via torch.hub (an external
    dependency, fetched over the network on first call). Some checkpoints expose
    a dedicated `inference` method instead of plain forward.

    Returns a torch MelGAN vocoder model in evaluation mode.
    """
    model = torch.hub.load('seungwonpark/melgan', 'melgan')
    model.eval()  # Ensure the model is in evaluation mode
    return model

# Step 6: Decode Mel-Spectrogram with MelGAN
def decode_mel_spectrogram_with_melgan(mel_spectrogram_db, sample_rate, output_audio='melgan_reconstructed_audio.wav'):
    """
    Decode a mel-spectrogram into audio using a MelGAN vocoder.

    Parameters:
    - mel_spectrogram_db: The mel-spectrogram in dB scale.
    - sample_rate: The sample rate for the audio file.
    - output_audio: Path to save the reconstructed audio file.

    Returns:
    - audio: The reconstructed audio signal.
    """
    # Convert dB mel-spectrogram back to linear scale
    mel_spectrogram = librosa.db_to_power(mel_spectrogram_db)
    # Convert to a float32 torch tensor with a batch dimension: [1, mel_bins, time_frames]
    mel_spectrogram_tensor = torch.tensor(mel_spectrogram, dtype=torch.float32).unsqueeze(0)
    # Load the MelGAN vocoder model
    melgan = load_melgan_vocoder()
    # Pass the mel-spectrogram through MelGAN to generate audio. Caveat: a
    # pre-trained vocoder expects the exact mel configuration it was trained
    # with (bin count, scaling, sample rate); mismatches degrade output quality.
    with torch.no_grad():
        audio = melgan(mel_spectrogram_tensor).squeeze().cpu().numpy()
    # Save the generated audio
    sf.write(output_audio, audio, sample_rate)
    print(f"MelGAN reconstructed audio saved as '{output_audio}'")
    return audio

def audio_from_waveform(samples: np.ndarray, sample_rate: int, normalize: bool = False) -> pydub.AudioSegment:
    """
    Convert a numpy array of waveform samples to a pydub audio segment.

    Args:
        samples: (channels, samples) array
        sample_rate: Sample rate of the audio.
        normalize: Flag to normalize volume.

    Returns:
        pydub.AudioSegment
    """
    # Normalize volume to fit in int16 (compute out-of-place so integer input
    # arrays are not mutated or hit by an in-place type error)
    if normalize:
        samples = samples * (np.iinfo(np.int16).max / np.max(np.abs(samples)))
    # Transpose to (samples, channels) and convert to int16
    samples = samples.transpose(1, 0).astype(np.int16)
    # Write to the bytes of a WAV file
    wav_bytes = io.BytesIO()
    wavfile.write(wav_bytes, sample_rate, samples)
    wav_bytes.seek(0)
    # Read into pydub
    return pydub.AudioSegment.from_wav(wav_bytes)

def apply_filters(segment: pydub.AudioSegment, compression: bool = False) -> pydub.AudioSegment:
    """
    Apply post-processing filters to the audio segment: optional dynamic range
    compression, gain staging to a consistent dBFS level, and normalization.

    Args:
        segment: The audio segment to filter.
        compression: Flag to apply dynamic range compression.

    Returns:
        pydub.AudioSegment
    """
    if compression:
        segment = pydub.effects.normalize(segment, headroom=0.1)
        segment = segment.apply_gain(-10 - segment.dBFS)
        segment = pydub.effects.compress_dynamic_range(
            segment,
            threshold=-20.0,
            ratio=4.0,
            attack=5.0,
            release=50.0,
        )
    # Apply gain to the desired dB level and normalize again
    desired_db = -12
    segment = segment.apply_gain(desired_db - segment.dBFS)
    return pydub.effects.normalize(segment, headroom=0.1)

def stitch_segments(segments: Sequence[pydub.AudioSegment], crossfade_s: float) -> pydub.AudioSegment:
    """
    Stitch together a sequence of audio segments with a crossfade between each segment.

    Args:
        segments: Sequence of audio segments to stitch.
        crossfade_s: Duration of crossfade in seconds.

    Returns:
        pydub.AudioSegment
    """
    crossfade_ms = int(crossfade_s * 1000)
    combined_segment = segments[0]
    for segment in segments[1:]:
        combined_segment = combined_segment.append(segment, crossfade=crossfade_ms)
    return combined_segment


def overlay_segments(segments: Sequence[pydub.AudioSegment]) -> pydub.AudioSegment:
    """
    Overlay a sequence of audio segments on top of each other.

    Args:
        segments: Sequence of audio segments to overlay.

    Returns:
        pydub.AudioSegment
    """
    assert len(segments) > 0
    output: pydub.AudioSegment = segments[0]
    for segment in segments[1:]:
        output = output.overlay(segment)
    return output
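
# Illustrative usage of the pydub helpers above (the waveforms and file name are
# placeholders, not part of the original pipeline): convert two clips to
# segments, filter them, and stitch with a half-second crossfade.
# seg_a = audio_from_waveform(waveform_a, sample_rate)  # waveform_a: (channels, samples)
# seg_b = audio_from_waveform(waveform_b, sample_rate)
# seg_a = apply_filters(seg_a, compression=True)
# seg_b = apply_filters(seg_b, compression=True)
# stitched = stitch_segments([seg_a, seg_b], crossfade_s=0.5)
# stitched.export("stitched_output.wav", format="wav")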

# Step 7: Full Pipeline for Audio Processing with Customization
def mel_spectrogram_pipeline(audio_file, output_image='mel_spectrogram.png',
                             output_audio_griffin='griffin_reconstructed_audio.wav',
                             output_audio_melgan='melgan_reconstructed_audio.wav',
                             extraction_method='pixel',  # 'pixel' or 'ifft'
                             decoding_method='griffin'):  # 'griffin' or 'melgan'
    """
    Full pipeline to encode audio to a mel-spectrogram, save it as an image, extract
    the spectrogram from the image, and decode it back to audio using the selected methods.

    Parameters:
    - audio_file: Path to the audio file to be processed.
    - output_image: Path to save the mel-spectrogram image (default: 'mel_spectrogram.png').
    - output_audio_griffin: Path to save the Griffin-Lim reconstructed audio.
    - output_audio_melgan: Path to save the MelGAN reconstructed audio.
    - extraction_method: Method for extraction ('pixel' or 'ifft').
    - decoding_method: Method for decoding ('griffin' or 'melgan').
    """
    # Step 1: Encode (Audio -> Mel-Spectrogram)
    mel_spectrogram_db, sample_rate = encode_audio_to_mel_spectrogram(audio_file)
    # Step 2: Convert Mel-Spectrogram to Image and save it
    save_mel_spectrogram_image(mel_spectrogram_db, sample_rate, output_image)
    # Step 3: Extract Mel-Spectrogram using the chosen method
    if extraction_method == 'pixel':
        extracted_mel_spectrogram_db = extract_mel_spectrogram_from_image(output_image)
    elif extraction_method == 'ifft':
        extracted_mel_spectrogram_db = extract_spectrogram_with_ifft(mel_spectrogram_db, sample_rate)
    else:
        raise ValueError("Invalid extraction method. Choose 'pixel' or 'ifft'.")
    # Step 4: Decode using the chosen decoding method
    if decoding_method == 'griffin':
        decode_mel_spectrogram_to_audio(extracted_mel_spectrogram_db, sample_rate, output_audio_griffin)
    elif decoding_method == 'melgan':
        decode_mel_spectrogram_with_melgan(extracted_mel_spectrogram_db, sample_rate, output_audio_melgan)
    else:
        raise ValueError("Invalid decoding method. Choose 'griffin' or 'melgan'.")

def process_audio(audio_file, extraction_method, decoding_method):
    """Gradio handler: returns the spectrogram image and the reconstructed audio."""
    # Create temporary output files. delete=False keeps them on disk after the
    # handles close, so Gradio can still read them when building the response
    # (a context manager would delete them before they are served).
    temp_image = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
    temp_audio_griffin = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    temp_audio_melgan = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    for f in (temp_image, temp_audio_griffin, temp_audio_melgan):
        f.close()
    # Step 1: Encode (Audio -> Mel-Spectrogram)
    mel_spectrogram_db, sample_rate = encode_audio_to_mel_spectrogram(audio_file)
    # Step 2: Convert Mel-Spectrogram to Image and save it
    save_mel_spectrogram_image(mel_spectrogram_db, sample_rate, temp_image.name)
    # Step 3: Extract Mel-Spectrogram using the chosen method
    if extraction_method == 'pixel':
        extracted_mel_spectrogram_db = extract_mel_spectrogram_from_image(temp_image.name)
    elif extraction_method == 'ifft':
        extracted_mel_spectrogram_db = extract_spectrogram_with_ifft(mel_spectrogram_db, sample_rate)
    else:
        raise ValueError("Invalid extraction method. Choose 'pixel' or 'ifft'.")
    # Step 4: Decode with the selected method only
    if decoding_method == 'griffin':
        decode_mel_spectrogram_to_audio(extracted_mel_spectrogram_db, sample_rate, temp_audio_griffin.name)
        output_audio = temp_audio_griffin.name
    else:
        decode_mel_spectrogram_with_melgan(extracted_mel_spectrogram_db, sample_rate, temp_audio_melgan.name)
        output_audio = temp_audio_melgan.name
    # Return results
    return temp_image.name, output_audio

# Create Gradio interface
iface = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio"),
        gr.Radio(["pixel", "ifft"], label="Extraction Method", value="pixel"),
        gr.Radio(["griffin", "melgan"], label="Decoding Method", value="griffin")
    ],
    outputs=[
        gr.Image(type="filepath", label="Mel-Spectrogram"),
        gr.Audio(type="filepath", label="Reconstructed Audio")
    ],
    title="Audio Encoder-Decoder",
    description="Upload an audio file to encode it to a mel-spectrogram and then decode it back to audio."
)

# Example usage (test)
if __name__ == "__main__":
    audio_file_path = 'your_audio_file.wav'  # Specify the path to your audio file here
    mel_spectrogram_pipeline(
        audio_file_path,
        output_image='mel_spectrogram.png',
        output_audio_griffin='griffin_reconstructed_audio.wav',
        output_audio_melgan='melgan_reconstructed_audio.wav',
        extraction_method='pixel',  # Choose 'pixel' or 'ifft'
        decoding_method='griffin'   # Choose 'griffin' or 'melgan'
    )
    # Launch the Gradio app afterwards (iface.launch() blocks until stopped)
    iface.launch()
```
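
If no recording is at hand, the pipeline can be smoke-tested with a synthesized tone. This is a minimal sketch, assuming the script above has been run or imported so that `mel_spectrogram_pipeline` is in scope; the file name `test_tone.wav` is an arbitrary placeholder.

```python
import numpy as np
import soundfile as sf

# Synthesize one second of a 440 Hz sine tone as a stand-in input.
sample_rate = 22050
t = np.linspace(0, 1.0, sample_rate, endpoint=False)
sf.write('test_tone.wav', 0.5 * np.sin(2 * np.pi * 440.0 * t), sample_rate)

# Round-trip it: encode, save the image, re-extract, decode with Griffin-Lim.
mel_spectrogram_pipeline('test_tone.wav',
                         extraction_method='pixel',
                         decoding_method='griffin')
```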