// @ts-expect-error this package does not have typings
import TextLineStream from 'textlinestream';
import { Client } from '@gradio/client';
import * as lamejs from '@breezystack/lamejs';
// ponyfill for the ReadableStream async iterator, which is missing on Safari
import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
import { CONFIG } from '../config';
import { uploadFiles } from '@huggingface/hub';

export const isDev: boolean = import.meta.env.MODE === 'development';
export const testToken: string = import.meta.env.VITE_TEST_TOKEN;
export const isBlogMode: boolean = !!window.location.href.match(/blogmode/);

export const delay = (ms: number) => new Promise((res) => setTimeout(res, ms));
// Returns a URL to the generated WAV file.
export const generateAudio = async (
  content: string,
  voice: string,
  speed: number = 1.1
): Promise<string> => {
  const maxRetries = 3;
  for (let i = 0; i < maxRetries; i++) {
    try {
      const client = await Client.connect(CONFIG.ttsSpaceId);
      const result = await client.predict('/tts', {
        text: content,
        voice,
        speed,
      });
      console.log(result.data);
      return (result.data as any)[0].url;
    } catch (e) {
      if (i === maxRetries - 1) {
        throw e; // last retry, re-throw
      }
      console.error('Failed to generate audio, retrying...', e);
    }
  }
  return ''; // unreachable: the loop either returns or throws
};
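
// Usage sketch (illustrative, not part of the original module): synthesize a
// line, decode the returned WAV, and trim leading/trailing silence. The voice
// id below is a placeholder; valid ids depend on the connected TTS Space.
export const exampleSynthesizeTrimmed = async (
  text: string
): Promise<AudioBuffer> => {
  const wavUrl = await generateAudio(text, 'placeholder-voice');
  const decoded = await loadWavAndDecode(wavUrl);
  return trimSilence(decoded);
};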
export const pickRand = <T>(arr: T[]): T => {
  return arr[Math.floor(Math.random() * arr.length)];
};

// Async-generator wrapper over a server-sent events (SSE) response body.
export async function* getSSEStreamAsync(fetchResponse: Response) {
  if (!fetchResponse.body) throw new Error('Response body is empty');
  const lines: ReadableStream<string> = fetchResponse.body
    .pipeThrough(new TextDecoderStream())
    .pipeThrough(new TextLineStream());
  // @ts-expect-error asyncIterator complains about the type, but it works
  for await (const line of asyncIterator(lines)) {
    //if (isDev) console.log({ line });
    if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
      const data = JSON.parse(line.slice(5));
      yield data;
    } else if (line.startsWith('error:')) {
      const data = JSON.parse(line.slice(6));
      throw new Error(data.message || 'Unknown error');
    }
  }
}
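
// Usage sketch (illustrative): consume an SSE stream from an OpenAI-compatible
// chat-completions endpoint and concatenate the streamed text. The URL, body
// shape, and `choices[0].delta.content` path are assumptions about the server,
// not something this module guarantees.
export const exampleCollectSSEText = async (
  url: string,
  body: object
): Promise<string> => {
  const response = await fetch(url, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(body),
  });
  let text = '';
  for await (const chunk of getSSEStreamAsync(response)) {
    // Each chunk is one parsed JSON object from a `data:` line.
    text += chunk.choices?.[0]?.delta?.content ?? '';
  }
  return text;
};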
export const uploadFileToHub = async (
  buf: ArrayBuffer,
  filename: string,
  repoId: string,
  hfToken: string
) => {
  await uploadFiles({
    accessToken: hfToken,
    repo: repoId,
    files: [
      {
        path: filename,
        content: new Blob([buf], { type: 'audio/wav' }),
      },
    ],
  });
};
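
// Usage sketch (repo id and filename are placeholders):
//   const wav = audioBufferToWav(audioBuffer);
//   await uploadFileToHub(wav, 'episode-1.wav', 'username/my-dataset', hfToken);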
/**
 * Note: most of the functions below were written by ChatGPT using Reasoning mode.
 */
////////////////////////////////////////
// Audio manipulation utils
export const trimSilence = (audioBuffer: AudioBuffer): AudioBuffer => {
  const threshold = 0.01; // Amplitude below which a sample is considered silent.
  const numChannels = audioBuffer.numberOfChannels;
  const totalSamples = audioBuffer.length;

  // Returns true if the sample at the given index is silent in all channels.
  const isSilent = (index: number): boolean => {
    for (let channel = 0; channel < numChannels; channel++) {
      const channelData = audioBuffer.getChannelData(channel);
      if (Math.abs(channelData[index]) > threshold) {
        return false;
      }
    }
    return true;
  };

  // Find the first non-silent sample.
  let startSample = 0;
  while (startSample < totalSamples && isSilent(startSample)) {
    startSample++;
  }
  // Find the last non-silent sample.
  let endSample = totalSamples - 1;
  while (endSample >= startSample && isSilent(endSample)) {
    endSample--;
  }
  // If the buffer is entirely silent, return a minimal one-sample buffer
  // (an AudioBuffer cannot have zero length).
  if (startSample >= totalSamples || endSample < startSample) {
    return new AudioBuffer({
      length: 1,
      numberOfChannels: numChannels,
      sampleRate: audioBuffer.sampleRate,
    });
  }

  const newLength = endSample - startSample + 1;
  const newBuffer = new AudioBuffer({
    length: newLength,
    numberOfChannels: numChannels,
    sampleRate: audioBuffer.sampleRate,
  });
  // Copy the trimmed samples from the original buffer into the new buffer.
  for (let channel = 0; channel < numChannels; channel++) {
    const oldData = audioBuffer.getChannelData(channel);
    const newData = newBuffer.getChannelData(channel);
    newData.set(oldData.subarray(startSample, endSample + 1));
  }
  return newBuffer;
};
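
// Usage sketch (the 200 ms pad is an arbitrary example value): normalize a
// clip's lead-in/lead-out by trimming silence and re-adding a fixed pad.
export const exampleNormalizePadding = (buf: AudioBuffer): AudioBuffer => {
  const trimmed = trimSilence(buf);
  const padded = addSilence(trimmed, true, 200); // 200 ms at the start
  return addSilence(padded, false, 200); // 200 ms at the end
};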
export const joinAudio = (
  audio1: AudioBuffer,
  audio2: AudioBuffer,
  gapMillisecs: number,
  overlap: 'none' | 'cross-fade' = 'none'
): AudioBuffer => {
  const sampleRate = audio1.sampleRate;
  const numChannels = audio1.numberOfChannels;
  // Ensure both audio buffers are compatible.
  if (audio2.sampleRate !== sampleRate) {
    throw new Error('Audio buffers must have the same sample rate');
  }
  if (audio2.numberOfChannels !== numChannels) {
    throw new Error('Audio buffers must have the same number of channels');
  }

  const gapSeconds = gapMillisecs / 1000;
  const gapSamples = Math.round(Math.abs(gapSeconds) * sampleRate);
  // A negative gap means the end of audio1 overlaps the start of audio2.
  // Never overlap more than is available in either buffer.
  const effectiveOverlap =
    gapSeconds < 0 ? Math.min(gapSamples, audio1.length, audio2.length) : 0;

  let newLength: number;
  if (gapSeconds > 0) {
    // Pad with gapSamples of silence in between.
    newLength = audio1.length + gapSamples + audio2.length;
  } else if (gapSeconds === 0) {
    // Simply join one after the other.
    newLength = audio1.length + audio2.length;
  } else {
    // Blend (overlap) the end of audio1 with the beginning of audio2.
    newLength = audio1.length + audio2.length - effectiveOverlap;
  }

  // Create a new AudioBuffer for the joined result.
  const newBuffer = new AudioBuffer({
    length: newLength,
    numberOfChannels: numChannels,
    sampleRate: sampleRate,
  });

  // Process each channel.
  for (let channel = 0; channel < numChannels; channel++) {
    const outputData = newBuffer.getChannelData(channel);
    const data1 = audio1.getChannelData(channel);
    const data2 = audio2.getChannelData(channel);
    let offset = 0;
    if (gapSeconds < 0) {
      // Copy audio1 up to the start of the overlapping section.
      const nonOverlapLength = audio1.length - effectiveOverlap;
      outputData.set(data1.subarray(0, nonOverlapLength), offset);
      offset += nonOverlapLength;
      // Blend the overlapping region.
      if (overlap === 'cross-fade') {
        for (let i = 0; i < effectiveOverlap; i++) {
          // Linear cross-fade between the two clips.
          const fadeOut = 1 - i / effectiveOverlap;
          const fadeIn = i / effectiveOverlap;
          outputData[offset + i] =
            data1[nonOverlapLength + i] * fadeOut + data2[i] * fadeIn;
        }
      } else {
        // Plain sum; this can exceed [-1, 1] and is clamped later when
        // exporting to WAV or MP3.
        for (let i = 0; i < effectiveOverlap; i++) {
          outputData[offset + i] = data1[nonOverlapLength + i] + data2[i];
        }
      }
      offset += effectiveOverlap;
      // Append the remaining audio2 data.
      outputData.set(data2.subarray(effectiveOverlap), offset);
    } else if (gapSeconds === 0) {
      // Direct concatenation: copy audio1, then audio2.
      outputData.set(data1, offset);
      offset += audio1.length;
      outputData.set(data2, offset);
    } else {
      // gapSeconds > 0: insert silence between audio1 and audio2.
      outputData.set(data1, offset);
      offset += audio1.length;
      // The buffer is zero-initialized, so skipping ahead leaves silence.
      offset += gapSamples;
      outputData.set(data2, offset);
    }
  }
  return newBuffer;
};
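
// Usage sketch: the same pair of clips joined with a 300 ms pause versus an
// 80 ms cross-fade (both durations are arbitrary example values).
export const exampleJoinVariants = (a: AudioBuffer, b: AudioBuffer) => {
  const withPause = joinAudio(a, b, 300); // 300 ms of silence in between
  const blended = joinAudio(a, b, -80, 'cross-fade'); // 80 ms overlap
  return { withPause, blended };
};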
export const addNoise = (
  audioBuffer: AudioBuffer,
  magnitude: number
): AudioBuffer => {
  const { numberOfChannels, sampleRate, length } = audioBuffer;
  const newBuffer = new AudioBuffer({
    length,
    numberOfChannels,
    sampleRate,
  });
  for (let channel = 0; channel < numberOfChannels; channel++) {
    const inputData = audioBuffer.getChannelData(channel);
    const outputData = newBuffer.getChannelData(channel);
    for (let i = 0; i < length; i++) {
      // Generate white noise in the range [-magnitude, +magnitude].
      const noise = (Math.random() * 2 - 1) * magnitude;
      outputData[i] = inputData[i] + noise;
    }
  }
  return newBuffer;
};
export const addSilence = (
  audioBuffer: AudioBuffer,
  toBeginning: boolean,
  durationMillisecs: number
): AudioBuffer => {
  // Convert the duration from milliseconds to samples.
  const sampleRate = audioBuffer.sampleRate;
  const silenceSamples = Math.round((durationMillisecs / 1000) * sampleRate);
  const numChannels = audioBuffer.numberOfChannels;
  const originalLength = audioBuffer.length;
  const newLength = originalLength + silenceSamples;

  // Create a new AudioBuffer with extra space for the silence.
  const newBuffer = new AudioBuffer({
    length: newLength,
    numberOfChannels: numChannels,
    sampleRate: sampleRate,
  });

  // Copy the original audio into the correct position in each channel.
  for (let channel = 0; channel < numChannels; channel++) {
    const originalData = audioBuffer.getChannelData(channel);
    const newData = newBuffer.getChannelData(channel);
    if (toBeginning) {
      // Leave the first `silenceSamples` samples as zeros, then copy the data.
      newData.set(originalData, silenceSamples);
    } else {
      // Copy the data first; the trailing samples are already zeros.
      newData.set(originalData, 0);
    }
  }
  return newBuffer;
};
////////////////////////////////////////
// Audio formatting utils

export const loadWavAndDecode = async (url: string): Promise<AudioBuffer> => {
  const response = await fetch(url);
  const arrayBuffer = await response.arrayBuffer();
  // @ts-expect-error webkitAudioContext is the prefixed Safari fallback
  const AudioContext = window.AudioContext || window.webkitAudioContext;
  if (!AudioContext) {
    throw new Error('AudioContext is not supported on this browser');
  }
  const audioCtx = new AudioContext();
  let audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
  // Force mono by averaging all channels into one.
  if (audioBuffer.numberOfChannels > 1) {
    // Reuse the existing context instead of leaking a second one.
    const monoBuffer = audioCtx.createBuffer(
      1,
      audioBuffer.length,
      audioBuffer.sampleRate
    );
    const monoData = monoBuffer.getChannelData(0);
    for (let i = 0; i < audioBuffer.length; i++) {
      let sum = 0;
      for (let channel = 0; channel < audioBuffer.numberOfChannels; channel++) {
        sum += audioBuffer.getChannelData(channel)[i];
      }
      monoData[i] = sum / audioBuffer.numberOfChannels;
    }
    audioBuffer = monoBuffer;
  }
  return audioBuffer;
};
export function audioBufferToWav(
  buffer: AudioBuffer,
  options: { float32?: boolean } = {}
): ArrayBuffer {
  const numChannels = buffer.numberOfChannels;
  const sampleRate = buffer.sampleRate;
  const format = options.float32 ? 3 : 1; // 3 = IEEE float, 1 = PCM
  const bitDepth = options.float32 ? 32 : 16;
  const numSamples = buffer.length;
  const headerLength = 44;
  const bytesPerSample = bitDepth / 8;
  const dataLength = numSamples * numChannels * bytesPerSample;
  const bufferLength = headerLength + dataLength;
  const arrayBuffer = new ArrayBuffer(bufferLength);
  const view = new DataView(arrayBuffer);
  let offset = 0;

  function writeString(str: string) {
    for (let i = 0; i < str.length; i++) {
      view.setUint8(offset, str.charCodeAt(i));
      offset++;
    }
  }

  // Write the 44-byte WAV header.
  writeString('RIFF');
  view.setUint32(offset, 36 + dataLength, true); // file size minus the 8-byte RIFF prefix
  offset += 4;
  writeString('WAVE');
  writeString('fmt ');
  view.setUint32(offset, 16, true); // fmt chunk size
  offset += 4;
  view.setUint16(offset, format, true);
  offset += 2;
  view.setUint16(offset, numChannels, true);
  offset += 2;
  view.setUint32(offset, sampleRate, true);
  offset += 4;
  view.setUint32(offset, sampleRate * numChannels * bytesPerSample, true); // byte rate
  offset += 4;
  view.setUint16(offset, numChannels * bytesPerSample, true); // block align
  offset += 2;
  view.setUint16(offset, bitDepth, true);
  offset += 2;
  writeString('data');
  view.setUint32(offset, dataLength, true);
  offset += 4;

  // Write the samples, interleaving channels.
  const channels: Float32Array[] = [];
  for (let i = 0; i < numChannels; i++) {
    channels.push(buffer.getChannelData(i));
  }
  for (let i = 0; i < numSamples; i++) {
    for (let channel = 0; channel < numChannels; channel++) {
      let sample = channels[channel][i];
      // Clamp the sample to [-1, 1].
      sample = Math.max(-1, Math.min(1, sample));
      if (options.float32) {
        view.setFloat32(offset, sample, true);
        offset += 4;
      } else {
        // Convert to a 16-bit PCM sample.
        const intSample = sample < 0 ? sample * 0x8000 : sample * 0x7fff;
        view.setInt16(offset, intSample, true);
        offset += 2;
      }
    }
  }
  return arrayBuffer;
}
export const blobFromAudioBuffer = (audioBuffer: AudioBuffer): Blob => {
  // Use 16-bit PCM for compatibility.
  const wavArrayBuffer = audioBufferToWav(audioBuffer, { float32: false });
  return new Blob([wavArrayBuffer], { type: 'audio/wav' });
};
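
// Usage sketch (browser-only, illustrative): trigger a client-side download
// of an AudioBuffer as a WAV file.
export const exampleDownloadWav = (buf: AudioBuffer, name: string): void => {
  const blobUrl = URL.createObjectURL(blobFromAudioBuffer(buf));
  const a = document.createElement('a');
  a.href = blobUrl;
  a.download = cleanupFilename(name) + '.wav';
  a.click();
  URL.revokeObjectURL(blobUrl);
};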
export function audioBufferToMp3(buffer: AudioBuffer): ArrayBuffer {
  const numChannels = buffer.numberOfChannels;
  const sampleRate = buffer.sampleRate;
  const bitRate = 128; // kbps - adjust as desired
  // Initialize the MP3 encoder.
  // Note: if more than 2 channels are present, only the first 2 are used.
  const mp3encoder = new lamejs.Mp3Encoder(
    numChannels >= 2 ? 2 : 1,
    sampleRate,
    bitRate
  );
  const samples = buffer.length;
  const chunkSize = 1152; // Samples per MP3 frame.

  // Prepare channel data.
  const channels: Float32Array[] = [];
  for (let ch = 0; ch < numChannels; ch++) {
    channels.push(buffer.getChannelData(ch));
  }
  const mp3Data: Uint8Array[] = [];

  // For mono audio, encode directly.
  if (numChannels === 1) {
    for (let i = 0; i < samples; i += chunkSize) {
      const sampleChunk = channels[0].subarray(i, i + chunkSize);
      const int16Buffer = floatTo16BitPCM(sampleChunk);
      const mp3buf = mp3encoder.encodeBuffer(int16Buffer);
      if (mp3buf.length > 0) {
        mp3Data.push(new Uint8Array(mp3buf));
      }
    }
  } else {
    // For stereo (or more channels), use the first two channels.
    const left = channels[0];
    const right = channels[1];
    for (let i = 0; i < samples; i += chunkSize) {
      const leftChunk = left.subarray(i, i + chunkSize);
      const rightChunk = right.subarray(i, i + chunkSize);
      const leftInt16 = floatTo16BitPCM(leftChunk);
      const rightInt16 = floatTo16BitPCM(rightChunk);
      const mp3buf = mp3encoder.encodeBuffer(leftInt16, rightInt16);
      if (mp3buf.length > 0) {
        mp3Data.push(new Uint8Array(mp3buf));
      }
    }
  }

  // Flush the encoder to get any remaining MP3 data.
  const endBuf = mp3encoder.flush();
  if (endBuf.length > 0) {
    mp3Data.push(new Uint8Array(endBuf));
  }

  // Concatenate all MP3 chunks into a single ArrayBuffer.
  const totalLength = mp3Data.reduce((acc, curr) => acc + curr.length, 0);
  const result = new Uint8Array(totalLength);
  let offset = 0;
  for (const chunk of mp3Data) {
    result.set(chunk, offset);
    offset += chunk.length;
  }
  return result.buffer;
}
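
// Usage sketch: wrap the encoded bytes in a Blob for playback or upload.
export const exampleMp3Blob = (buf: AudioBuffer): Blob =>
  new Blob([audioBufferToMp3(buf)], { type: 'audio/mpeg' });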
/**
 * Helper that converts a Float32Array of PCM samples (range -1..1)
 * into an Int16Array (range -32768..32767).
 */
function floatTo16BitPCM(input: Float32Array): Int16Array {
  const output = new Int16Array(input.length);
  for (let i = 0; i < input.length; i++) {
    const s = Math.max(-1, Math.min(1, input[i]));
    output[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
  }
  return output;
}
// Clean up a filename for saving: replace unsafe characters with underscores.
export const cleanupFilename = (name: string): string => {
  return name.replace(/[^a-zA-Z0-9-_]/g, '_');
};
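// e.g. cleanupFilename('My Podcast: Ep #1') === 'My_Podcast__Ep__1'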