// ngxson (HF staff) — fix filename, add convert to mp3 (commit 5d4d4d4)
// @ts-expect-error this package does not have typing
import TextLineStream from 'textlinestream';
import { Client } from '@gradio/client';
import * as lamejs from '@breezystack/lamejs';
// ponyfill for missing ReadableStream asyncIterator on Safari
import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
import { CONFIG } from '../config';
import { uploadFiles } from '@huggingface/hub';
// True when running under the Vite dev server (MODE === 'development').
export const isDev: boolean = import.meta.env.MODE === 'development';
// Token injected at build time via the VITE_TEST_TOKEN env var;
// presumably a Hugging Face token for local testing — verify against callers.
export const testToken: string = import.meta.env.VITE_TEST_TOKEN;
// Enables blog-demo behavior when the page URL contains "blogmode".
export const isBlogMode: boolean = !!window.location.href.match(/blogmode/);
// Promise-based sleep: `await delay(ms)`.
export const delay = (ms: number) => new Promise((res) => setTimeout(res, ms));
/**
 * Generate speech audio for `content` via the configured TTS space.
 * Retries up to 3 times (reconnecting each attempt) before giving up.
 *
 * @param content text to synthesize
 * @param voice   voice identifier understood by the TTS space
 * @param speed   playback speed multiplier (default 1.1)
 * @returns URL to the generated WAV file
 * @throws the last connection/prediction error if all retries fail
 */
export const generateAudio = async (
  content: string,
  voice: string,
  speed: number = 1.1
): Promise<string> => {
  const maxRetries = 3;
  for (let i = 0; i < maxRetries; i++) {
    try {
      const client = await Client.connect(CONFIG.ttsSpaceId);
      const result = await client.predict('/tts', {
        text: content,
        voice,
        speed,
      });
      console.log(result.data);
      // predict() returns an array; the first element carries the file URL
      // (narrow inline type instead of the previous `as any`).
      return (result.data as { url: string }[])[0].url;
    } catch (e) {
      if (i === maxRetries - 1) {
        throw e; // last retry, throw error
      }
      console.error('Failed to generate audio, retrying...', e);
    }
  }
  // Unreachable: the loop always returns or throws on the last attempt.
  // (The original also had a dead `continue;` at the loop tail — removed.)
  return '';
};
/** Return a uniformly random element of `arr` (undefined behavior for empty input). */
export const pickRand = <T>(arr: T[]): T => {
  const idx = Math.floor(Math.random() * arr.length);
  return arr[idx];
};
/**
 * Async-generator wrapper over a fetch Response carrying an SSE stream.
 * Yields the JSON payload of each `data:` line (skipping the `[DONE]`
 * sentinel) and throws on `error:` lines.
 */
export async function* getSSEStreamAsync(fetchResponse: Response) {
  const body = fetchResponse.body;
  if (!body) throw new Error('Response body is empty');
  const lines: ReadableStream<string> = body
    .pipeThrough(new TextDecoderStream())
    .pipeThrough(new TextLineStream());
  // @ts-expect-error asyncIterator complains about type, but it should work
  for await (const line of asyncIterator(lines)) {
    if (line.startsWith('error:')) {
      const payload = JSON.parse(line.slice(6));
      throw new Error(payload.message || 'Unknown error');
    }
    if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
      yield JSON.parse(line.slice(5));
    }
  }
}
/**
 * Upload an in-memory WAV file to a Hugging Face repo.
 *
 * @param buf      raw file bytes
 * @param filename destination path inside the repo
 * @param repoId   target repo identifier
 * @param hfToken  access token used for the upload
 */
export const uploadFileToHub = async (
  buf: ArrayBuffer,
  filename: string,
  repoId: string,
  hfToken: string
) => {
  const content = new Blob([buf], { type: 'audio/wav' });
  await uploadFiles({
    accessToken: hfToken,
    repo: repoId,
    files: [{ path: filename, content }],
  });
};
/**
 * Note: most of the audio-manipulation helpers below were initially
 * generated with ChatGPT (Reasoning mode).
 */
////////////////////////////////////////
// Audio manipulation utils
/**
 * Trim leading and trailing silence from an AudioBuffer.
 * A sample is "silent" when its amplitude is <= threshold in every channel.
 * Returns a minimal 1-sample buffer when the input is entirely silent.
 */
export const trimSilence = (audioBuffer: AudioBuffer): AudioBuffer => {
  const threshold = 0.01; // Amplitude below which a sample is considered silent.
  const numChannels = audioBuffer.numberOfChannels;
  const totalSamples = audioBuffer.length;
  // Fetch each channel's data once up front; the previous version called
  // getChannelData() for every sample inspected, inside the scan loops.
  const channelData: Float32Array[] = [];
  for (let channel = 0; channel < numChannels; channel++) {
    channelData.push(audioBuffer.getChannelData(channel));
  }
  // True when the sample at `index` is within the threshold in all channels.
  const isSilent = (index: number): boolean =>
    channelData.every((data) => Math.abs(data[index]) <= threshold);
  // Find the first non-silent sample.
  let startSample = 0;
  while (startSample < totalSamples && isSilent(startSample)) {
    startSample++;
  }
  // Find the last non-silent sample.
  let endSample = totalSamples - 1;
  while (endSample >= startSample && isSilent(endSample)) {
    endSample--;
  }
  // If no non-silent samples were found, return an empty AudioBuffer.
  if (startSample >= totalSamples || endSample < startSample) {
    return new AudioBuffer({
      length: 1,
      numberOfChannels: numChannels,
      sampleRate: audioBuffer.sampleRate,
    });
  }
  const newLength = endSample - startSample + 1;
  const newBuffer = new AudioBuffer({
    length: newLength,
    numberOfChannels: numChannels,
    sampleRate: audioBuffer.sampleRate,
  });
  // Bulk-copy the kept span per channel instead of a per-sample loop.
  for (let channel = 0; channel < numChannels; channel++) {
    newBuffer
      .getChannelData(channel)
      .set(channelData[channel].subarray(startSample, startSample + newLength));
  }
  return newBuffer;
};
/**
 * Join two AudioBuffers into one.
 *
 * @param gapMilisecs  > 0: insert that much silence between the clips;
 *                     = 0: concatenate directly;
 *                     < 0: overlap the tail of audio1 with the head of audio2
 *                          by that duration.
 * @param overlap      'cross-fade' linearly fades between the clips inside the
 *                     overlapped region; 'none' simply sums the samples.
 * @throws if the buffers differ in sample rate or channel count.
 */
export const joinAudio = (
  audio1: AudioBuffer,
  audio2: AudioBuffer,
  gapMilisecs: number,
  overlap: 'none' | 'cross-fade' = 'none'
): AudioBuffer => {
  const sampleRate = audio1.sampleRate;
  const numChannels = audio1.numberOfChannels;
  // Ensure both audio buffers are compatible.
  if (audio2.sampleRate !== sampleRate) {
    throw new Error('Audio buffers must have the same sample rate');
  }
  if (audio2.numberOfChannels !== numChannels) {
    throw new Error('Audio buffers must have the same number of channels');
  }
  const gapSeconds = gapMilisecs / 1000;
  // Hoisted: the previous version recomputed these inside the per-channel loop.
  const gapSamples = gapSeconds > 0 ? Math.round(gapSeconds * sampleRate) : 0;
  // A negative gap overlaps the clips; the overlap cannot exceed either length.
  const effectiveOverlap =
    gapSeconds < 0
      ? Math.min(
          Math.round(-gapSeconds * sampleRate),
          audio1.length,
          audio2.length
        )
      : 0;
  const newLength =
    audio1.length + gapSamples + audio2.length - effectiveOverlap;
  // Create a new AudioBuffer for the joined result (zero-initialized).
  const newBuffer = new AudioBuffer({
    length: newLength,
    numberOfChannels: numChannels,
    sampleRate: sampleRate,
  });
  // Process each channel.
  for (let channel = 0; channel < numChannels; channel++) {
    const outputData = newBuffer.getChannelData(channel);
    const data1 = audio1.getChannelData(channel);
    const data2 = audio2.getChannelData(channel);
    if (effectiveOverlap > 0) {
      // Copy audio1 data up to the start of the overlapping section.
      const nonOverlapLength = audio1.length - effectiveOverlap;
      outputData.set(data1.subarray(0, nonOverlapLength), 0);
      if (overlap === 'cross-fade') {
        // Linear crossfade across the overlapped samples.
        for (let i = 0; i < effectiveOverlap; i++) {
          const fadeIn = i / effectiveOverlap;
          outputData[nonOverlapLength + i] =
            data1[nonOverlapLength + i] * (1 - fadeIn) + data2[i] * fadeIn;
        }
      } else {
        // Plain sum of the overlapped samples.
        for (let i = 0; i < effectiveOverlap; i++) {
          outputData[nonOverlapLength + i] =
            data1[nonOverlapLength + i] + data2[i];
        }
      }
      // Append remaining audio2 data.
      outputData.set(data2.subarray(effectiveOverlap), audio1.length);
    } else {
      // Concatenate, leaving gapSamples of (already zeroed) silence between.
      outputData.set(data1, 0);
      outputData.set(data2, audio1.length + gapSamples);
    }
  }
  return newBuffer;
};
/**
 * Return a copy of `audioBuffer` with uniform white noise added.
 * Each sample gets an independent offset drawn from [-magnitude, +magnitude].
 */
export const addNoise = (
  audioBuffer: AudioBuffer,
  magnitude: number
): AudioBuffer => {
  const { numberOfChannels, sampleRate, length } = audioBuffer;
  const noisyBuffer = new AudioBuffer({
    length,
    numberOfChannels,
    sampleRate,
  });
  for (let ch = 0; ch < numberOfChannels; ch++) {
    const src = audioBuffer.getChannelData(ch);
    const dst = noisyBuffer.getChannelData(ch);
    for (let i = 0; i < length; i++) {
      dst[i] = src[i] + (Math.random() * 2 - 1) * magnitude;
    }
  }
  return noisyBuffer;
};
/**
 * Return a copy of `audioBuffer` padded with silence.
 *
 * @param toBeginning     true → silence is prepended; false → appended
 * @param durationMilisecs length of the silence in milliseconds
 */
export const addSilence = (
  audioBuffer: AudioBuffer,
  toBeginning: boolean,
  durationMilisecs: number
): AudioBuffer => {
  const { sampleRate, numberOfChannels } = audioBuffer;
  const padSamples = Math.round((durationMilisecs / 1000) * sampleRate);
  const padded = new AudioBuffer({
    length: audioBuffer.length + padSamples,
    numberOfChannels: numberOfChannels,
    sampleRate: sampleRate,
  });
  // A fresh AudioBuffer is zero-filled, so only the original samples need
  // copying; their placement decides where the silence ends up.
  const destOffset = toBeginning ? padSamples : 0;
  for (let ch = 0; ch < numberOfChannels; ch++) {
    padded.getChannelData(ch).set(audioBuffer.getChannelData(ch), destOffset);
  }
  return padded;
};
////////////////////////////////////////
// Audio formatting utils
/**
 * Fetch an audio file from `url` and decode it into a mono AudioBuffer.
 * Multi-channel input is downmixed by averaging all channels.
 *
 * @throws if the browser supports neither AudioContext nor webkitAudioContext
 */
export const loadWavAndDecode = async (url: string): Promise<AudioBuffer> => {
  const response = await fetch(url);
  const arrayBuffer = await response.arrayBuffer();
  // @ts-expect-error this is fine
  const AudioContext = window.AudioContext || window.webkitAudioContext;
  if (!AudioContext) {
    throw new Error('AudioContext is not supported on this browser');
  }
  const audioCtx = new AudioContext();
  let audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
  // force mono
  if (audioBuffer.numberOfChannels > 1) {
    // Reuse the existing context: the previous code constructed a second
    // AudioContext here, leaking it (browsers cap the number of live contexts).
    const monoBuffer = audioCtx.createBuffer(
      1,
      audioBuffer.length,
      audioBuffer.sampleRate
    );
    // Hoist the channel views out of the per-sample loop.
    const channels: Float32Array[] = [];
    for (let ch = 0; ch < audioBuffer.numberOfChannels; ch++) {
      channels.push(audioBuffer.getChannelData(ch));
    }
    const monoData = monoBuffer.getChannelData(0);
    for (let i = 0; i < audioBuffer.length; i++) {
      let sum = 0;
      for (const data of channels) {
        sum += data[i];
      }
      monoData[i] = sum / channels.length;
    }
    audioBuffer = monoBuffer;
  }
  return audioBuffer;
};
/**
 * Serialize an AudioBuffer into an in-memory WAV (RIFF) file.
 *
 * @param buffer  decoded audio; channels are interleaved in the output
 * @param options float32: true → 32-bit IEEE-float samples (format code 3),
 *                false/omitted → 16-bit signed PCM (format code 1)
 * @returns the complete WAV file: 44-byte header followed by sample data
 */
export function audioBufferToWav(
  buffer: AudioBuffer,
  options: { float32?: boolean } = {}
): ArrayBuffer {
  const numChannels = buffer.numberOfChannels;
  const sampleRate = buffer.sampleRate;
  const format = options.float32 ? 3 : 1; // 3 = IEEE float, 1 = PCM
  const bitDepth = options.float32 ? 32 : 16;
  const numSamples = buffer.length;
  const headerLength = 44; // fixed RIFF + fmt + data header size
  const bytesPerSample = bitDepth / 8;
  const dataLength = numSamples * numChannels * bytesPerSample;
  const bufferLength = headerLength + dataLength;
  const arrayBuffer = new ArrayBuffer(bufferLength);
  const view = new DataView(arrayBuffer);
  // Running write position, shared by writeString and the field writes below.
  let offset = 0;
  // Write an ASCII tag byte-by-byte, advancing the shared offset.
  function writeString(str: string) {
    for (let i = 0; i < str.length; i++) {
      view.setUint8(offset, str.charCodeAt(i));
      offset++;
    }
  }
  // Write WAV header (all multi-byte fields little-endian)
  writeString('RIFF');
  view.setUint32(offset, 36 + dataLength, true); // RIFF chunk size = bytes after this field
  offset += 4;
  writeString('WAVE');
  writeString('fmt ');
  view.setUint32(offset, 16, true); // fmt sub-chunk size (16 for PCM/float)
  offset += 4;
  view.setUint16(offset, format, true); // audio format code
  offset += 2;
  view.setUint16(offset, numChannels, true);
  offset += 2;
  view.setUint32(offset, sampleRate, true);
  offset += 4;
  view.setUint32(offset, sampleRate * numChannels * bytesPerSample, true); // byte rate
  offset += 4;
  view.setUint16(offset, numChannels * bytesPerSample, true); // block align
  offset += 2;
  view.setUint16(offset, bitDepth, true);
  offset += 2;
  writeString('data');
  view.setUint32(offset, dataLength, true); // data sub-chunk size
  offset += 4;
  // Write PCM samples: interleave channels
  const channels: Float32Array[] = [];
  for (let i = 0; i < numChannels; i++) {
    channels.push(buffer.getChannelData(i));
  }
  for (let i = 0; i < numSamples; i++) {
    for (let channel = 0; channel < numChannels; channel++) {
      let sample = channels[channel][i];
      // Clamp the sample to [-1, 1]
      sample = Math.max(-1, Math.min(1, sample));
      if (options.float32) {
        view.setFloat32(offset, sample, true);
        offset += 4;
      } else {
        // Convert to 16-bit PCM sample
        // (asymmetric scale: 0x8000 for negatives, 0x7fff for positives)
        const intSample = sample < 0 ? sample * 0x8000 : sample * 0x7fff;
        view.setInt16(offset, intSample, true);
        offset += 2;
      }
    }
  }
  return arrayBuffer;
}
/** Wrap an AudioBuffer as a WAV Blob (16-bit PCM for compatibility). */
export const blobFromAudioBuffer = (audioBuffer: AudioBuffer): Blob =>
  new Blob([audioBufferToWav(audioBuffer, { float32: false })], {
    type: 'audio/wav',
  });
/**
 * Encode an AudioBuffer as MP3 (128 kbps) using lamejs.
 * If more than 2 channels are present, only the first 2 channels are used.
 */
export function audioBufferToMp3(buffer: AudioBuffer): ArrayBuffer {
  const bitRate = 128; // kbps - adjust as desired
  const frameSize = 1152; // Frame size for MP3 encoding
  const stereo = buffer.numberOfChannels >= 2;
  const mp3encoder = new lamejs.Mp3Encoder(
    stereo ? 2 : 1,
    buffer.sampleRate,
    bitRate
  );
  const totalSamples = buffer.length;
  const mp3Chunks: Uint8Array[] = [];
  // Collect a non-empty encoder output chunk.
  const append = (encoded: Int8Array) => {
    if (encoded.length > 0) {
      mp3Chunks.push(new Uint8Array(encoded));
    }
  };
  const left = buffer.getChannelData(0);
  if (!stereo) {
    // Mono: encode the single channel frame by frame.
    for (let i = 0; i < totalSamples; i += frameSize) {
      append(
        mp3encoder.encodeBuffer(floatTo16BitPCM(left.subarray(i, i + frameSize)))
      );
    }
  } else {
    // Stereo: feed the first two channels in lockstep.
    const right = buffer.getChannelData(1);
    for (let i = 0; i < totalSamples; i += frameSize) {
      append(
        mp3encoder.encodeBuffer(
          floatTo16BitPCM(left.subarray(i, i + frameSize)),
          floatTo16BitPCM(right.subarray(i, i + frameSize))
        )
      );
    }
  }
  // Flush any frames still buffered inside the encoder.
  append(mp3encoder.flush());
  // Stitch all chunks into one contiguous ArrayBuffer.
  const totalBytes = mp3Chunks.reduce((sum, chunk) => sum + chunk.length, 0);
  const result = new Uint8Array(totalBytes);
  let writePos = 0;
  for (const chunk of mp3Chunks) {
    result.set(chunk, writePos);
    writePos += chunk.length;
  }
  return result.buffer;
}
/**
 * Convert a Float32Array of PCM samples (clamped to -1..1) into an
 * Int16Array (range -32768..32767).
 */
function floatTo16BitPCM(input: Float32Array): Int16Array {
  const out = new Int16Array(input.length);
  input.forEach((value, i) => {
    const clamped = Math.min(1, Math.max(-1, value));
    // Asymmetric scale: 0x8000 for negatives, 0x7fff for positives.
    out[i] = clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
  });
  return out;
}
// Sanitize a filename for saving: every character outside [A-Za-z0-9_-]
// (including dots and spaces) becomes an underscore.
export const cleanupFilename = (name: string): string =>
  name.replace(/[^\w-]/g, '_');