// @ts-expect-error this package does not have typings
import TextLineStream from 'textlinestream';
import { Client } from '@gradio/client';
import * as lamejs from '@breezystack/lamejs';
// ponyfill for the ReadableStream async iterator, which is missing on Safari
import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
import { CONFIG } from '../config';
import { uploadFiles } from '@huggingface/hub';

export const isDev: boolean = import.meta.env.MODE === 'development';
export const testToken: string = import.meta.env.VITE_TEST_TOKEN;
export const isBlogMode: boolean = !!window.location.href.match(/blogmode/);

export const delay = (ms: number) => new Promise((res) => setTimeout(res, ms));
// Returns a URL to the generated WAV file.
export const generateAudio = async (
  content: string,
  voice: string,
  speed: number = 1.1
): Promise<string> => {
  const maxRetries = 3;
  for (let i = 0; i < maxRetries; i++) {
    try {
      const client = await Client.connect(CONFIG.ttsSpaceId);
      const result = await client.predict('/tts', {
        text: content,
        voice,
        speed,
      });
      console.log(result.data);
      return (result.data as any)[0].url;
    } catch (e) {
      if (i === maxRetries - 1) {
        throw e; // last retry, re-throw
      }
      console.error('Failed to generate audio, retrying...', e);
    }
  }
  return ''; // unreachable: the loop either returns or throws
};
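
// Usage sketch (illustrative, not part of the original module): synthesize a
// line, decode the returned WAV, and trim leading/trailing silence. The voice
// id below is a placeholder; valid ids depend on the connected TTS Space.
export const exampleSynthesizeTrimmed = async (
  text: string
): Promise<AudioBuffer> => {
  const wavUrl = await generateAudio(text, 'placeholder-voice');
  const decoded = await loadWavAndDecode(wavUrl);
  return trimSilence(decoded);
};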
export const pickRand = <T>(arr: T[]): T => {
  return arr[Math.floor(Math.random() * arr.length)];
};

// Async-generator wrapper over a server-sent events (SSE) response body.
export async function* getSSEStreamAsync(fetchResponse: Response) {
  if (!fetchResponse.body) throw new Error('Response body is empty');
  const lines: ReadableStream<string> = fetchResponse.body
    .pipeThrough(new TextDecoderStream())
    .pipeThrough(new TextLineStream());
  // @ts-expect-error asyncIterator complains about the type, but it works
  for await (const line of asyncIterator(lines)) {
    //if (isDev) console.log({ line });
    if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
      const data = JSON.parse(line.slice(5));
      yield data;
    } else if (line.startsWith('error:')) {
      const data = JSON.parse(line.slice(6));
      throw new Error(data.message || 'Unknown error');
    }
  }
}
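
// Usage sketch (illustrative): consume an SSE stream from an OpenAI-compatible
// chat-completions endpoint and concatenate the streamed text. The URL, body
// shape, and `choices[0].delta.content` path are assumptions about the server,
// not something this module guarantees.
export const exampleCollectSSEText = async (
  url: string,
  body: object
): Promise<string> => {
  const response = await fetch(url, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(body),
  });
  let text = '';
  for await (const chunk of getSSEStreamAsync(response)) {
    // Each chunk is one parsed JSON object from a `data:` line.
    text += chunk.choices?.[0]?.delta?.content ?? '';
  }
  return text;
};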
export const uploadFileToHub = async (
  buf: ArrayBuffer,
  filename: string,
  repoId: string,
  hfToken: string
) => {
  await uploadFiles({
    accessToken: hfToken,
    repo: repoId,
    files: [
      {
        path: filename,
        content: new Blob([buf], { type: 'audio/wav' }),
      },
    ],
  });
};
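
// Usage sketch (repo id and filename are placeholders):
//   const wav = audioBufferToWav(audioBuffer);
//   await uploadFileToHub(wav, 'episode-1.wav', 'username/my-dataset', hfToken);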
/**
 * Note: most of the functions below were written by ChatGPT using Reasoning mode.
 */
////////////////////////////////////////
// Audio manipulation utils
export const trimSilence = (audioBuffer: AudioBuffer): AudioBuffer => {
  const threshold = 0.01; // Amplitude below which a sample is considered silent.
  const numChannels = audioBuffer.numberOfChannels;
  const totalSamples = audioBuffer.length;

  // Returns true if the sample at the given index is silent in all channels.
  const isSilent = (index: number): boolean => {
    for (let channel = 0; channel < numChannels; channel++) {
      const channelData = audioBuffer.getChannelData(channel);
      if (Math.abs(channelData[index]) > threshold) {
        return false;
      }
    }
    return true;
  };

  // Find the first non-silent sample.
  let startSample = 0;
  while (startSample < totalSamples && isSilent(startSample)) {
    startSample++;
  }
  // Find the last non-silent sample.
  let endSample = totalSamples - 1;
  while (endSample >= startSample && isSilent(endSample)) {
    endSample--;
  }
  // If the buffer is entirely silent, return a minimal one-sample buffer
  // (an AudioBuffer cannot have zero length).
  if (startSample >= totalSamples || endSample < startSample) {
    return new AudioBuffer({
      length: 1,
      numberOfChannels: numChannels,
      sampleRate: audioBuffer.sampleRate,
    });
  }

  const newLength = endSample - startSample + 1;
  const newBuffer = new AudioBuffer({
    length: newLength,
    numberOfChannels: numChannels,
    sampleRate: audioBuffer.sampleRate,
  });
  // Copy the trimmed samples from the original buffer into the new buffer.
  for (let channel = 0; channel < numChannels; channel++) {
    const oldData = audioBuffer.getChannelData(channel);
    const newData = newBuffer.getChannelData(channel);
    newData.set(oldData.subarray(startSample, endSample + 1));
  }
  return newBuffer;
};
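
// Usage sketch (the 200 ms pad is an arbitrary example value): normalize a
// clip's lead-in/lead-out by trimming silence and re-adding a fixed pad.
export const exampleNormalizePadding = (buf: AudioBuffer): AudioBuffer => {
  const trimmed = trimSilence(buf);
  const padded = addSilence(trimmed, true, 200); // 200 ms at the start
  return addSilence(padded, false, 200); // 200 ms at the end
};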
export const joinAudio = (
  audio1: AudioBuffer,
  audio2: AudioBuffer,
  gapMillisecs: number,
  overlap: 'none' | 'cross-fade' = 'none'
): AudioBuffer => {
  const sampleRate = audio1.sampleRate;
  const numChannels = audio1.numberOfChannels;
  // Ensure both audio buffers are compatible.
  if (audio2.sampleRate !== sampleRate) {
    throw new Error('Audio buffers must have the same sample rate');
  }
  if (audio2.numberOfChannels !== numChannels) {
    throw new Error('Audio buffers must have the same number of channels');
  }

  const gapSeconds = gapMillisecs / 1000;
  const gapSamples = Math.round(Math.abs(gapSeconds) * sampleRate);
  // A negative gap means the end of audio1 overlaps the start of audio2.
  // Never overlap more than is available in either buffer.
  const effectiveOverlap =
    gapSeconds < 0 ? Math.min(gapSamples, audio1.length, audio2.length) : 0;

  let newLength: number;
  if (gapSeconds > 0) {
    // Pad with gapSamples of silence in between.
    newLength = audio1.length + gapSamples + audio2.length;
  } else if (gapSeconds === 0) {
    // Simply join one after the other.
    newLength = audio1.length + audio2.length;
  } else {
    // Blend (overlap) the end of audio1 with the beginning of audio2.
    newLength = audio1.length + audio2.length - effectiveOverlap;
  }

  // Create a new AudioBuffer for the joined result.
  const newBuffer = new AudioBuffer({
    length: newLength,
    numberOfChannels: numChannels,
    sampleRate: sampleRate,
  });

  // Process each channel.
  for (let channel = 0; channel < numChannels; channel++) {
    const outputData = newBuffer.getChannelData(channel);
    const data1 = audio1.getChannelData(channel);
    const data2 = audio2.getChannelData(channel);
    let offset = 0;
    if (gapSeconds < 0) {
      // Copy audio1 up to the start of the overlapping section.
      const nonOverlapLength = audio1.length - effectiveOverlap;
      outputData.set(data1.subarray(0, nonOverlapLength), offset);
      offset += nonOverlapLength;
      // Blend the overlapping region.
      if (overlap === 'cross-fade') {
        for (let i = 0; i < effectiveOverlap; i++) {
          // Linear cross-fade between the two clips.
          const fadeOut = 1 - i / effectiveOverlap;
          const fadeIn = i / effectiveOverlap;
          outputData[offset + i] =
            data1[nonOverlapLength + i] * fadeOut + data2[i] * fadeIn;
        }
      } else {
        // Plain sum; this can exceed [-1, 1] and is clamped later when
        // exporting to WAV or MP3.
        for (let i = 0; i < effectiveOverlap; i++) {
          outputData[offset + i] = data1[nonOverlapLength + i] + data2[i];
        }
      }
      offset += effectiveOverlap;
      // Append the remaining audio2 data.
      outputData.set(data2.subarray(effectiveOverlap), offset);
    } else if (gapSeconds === 0) {
      // Direct concatenation: copy audio1, then audio2.
      outputData.set(data1, offset);
      offset += audio1.length;
      outputData.set(data2, offset);
    } else {
      // gapSeconds > 0: insert silence between audio1 and audio2.
      outputData.set(data1, offset);
      offset += audio1.length;
      // The buffer is zero-initialized, so skipping ahead leaves silence.
      offset += gapSamples;
      outputData.set(data2, offset);
    }
  }
  return newBuffer;
};
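
// Usage sketch: the same pair of clips joined with a 300 ms pause versus an
// 80 ms cross-fade (both durations are arbitrary example values).
export const exampleJoinVariants = (a: AudioBuffer, b: AudioBuffer) => {
  const withPause = joinAudio(a, b, 300); // 300 ms of silence in between
  const blended = joinAudio(a, b, -80, 'cross-fade'); // 80 ms overlap
  return { withPause, blended };
};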
export const addNoise = (
  audioBuffer: AudioBuffer,
  magnitude: number
): AudioBuffer => {
  const { numberOfChannels, sampleRate, length } = audioBuffer;
  const newBuffer = new AudioBuffer({
    length,
    numberOfChannels,
    sampleRate,
  });
  for (let channel = 0; channel < numberOfChannels; channel++) {
    const inputData = audioBuffer.getChannelData(channel);
    const outputData = newBuffer.getChannelData(channel);
    for (let i = 0; i < length; i++) {
      // Generate white noise in the range [-magnitude, +magnitude].
      const noise = (Math.random() * 2 - 1) * magnitude;
      outputData[i] = inputData[i] + noise;
    }
  }
  return newBuffer;
};
export const addSilence = (
  audioBuffer: AudioBuffer,
  toBeginning: boolean,
  durationMillisecs: number
): AudioBuffer => {
  // Convert the duration from milliseconds to samples.
  const sampleRate = audioBuffer.sampleRate;
  const silenceSamples = Math.round((durationMillisecs / 1000) * sampleRate);
  const numChannels = audioBuffer.numberOfChannels;
  const originalLength = audioBuffer.length;
  const newLength = originalLength + silenceSamples;

  // Create a new AudioBuffer with extra space for the silence.
  const newBuffer = new AudioBuffer({
    length: newLength,
    numberOfChannels: numChannels,
    sampleRate: sampleRate,
  });

  // Copy the original audio into the correct position in each channel.
  for (let channel = 0; channel < numChannels; channel++) {
    const originalData = audioBuffer.getChannelData(channel);
    const newData = newBuffer.getChannelData(channel);
    if (toBeginning) {
      // Leave the first `silenceSamples` samples as zeros, then copy the data.
      newData.set(originalData, silenceSamples);
    } else {
      // Copy the data first; the trailing samples are already zeros.
      newData.set(originalData, 0);
    }
  }
  return newBuffer;
};
////////////////////////////////////////
// Audio formatting utils

export const loadWavAndDecode = async (url: string): Promise<AudioBuffer> => {
  const response = await fetch(url);
  const arrayBuffer = await response.arrayBuffer();
  // @ts-expect-error webkitAudioContext is the prefixed Safari fallback
  const AudioContext = window.AudioContext || window.webkitAudioContext;
  if (!AudioContext) {
    throw new Error('AudioContext is not supported on this browser');
  }
  const audioCtx = new AudioContext();
  let audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
  // Force mono by averaging all channels into one.
  if (audioBuffer.numberOfChannels > 1) {
    // Reuse the existing context instead of leaking a second one.
    const monoBuffer = audioCtx.createBuffer(
      1,
      audioBuffer.length,
      audioBuffer.sampleRate
    );
    const monoData = monoBuffer.getChannelData(0);
    for (let i = 0; i < audioBuffer.length; i++) {
      let sum = 0;
      for (let channel = 0; channel < audioBuffer.numberOfChannels; channel++) {
        sum += audioBuffer.getChannelData(channel)[i];
      }
      monoData[i] = sum / audioBuffer.numberOfChannels;
    }
    audioBuffer = monoBuffer;
  }
  return audioBuffer;
};
export function audioBufferToWav(
  buffer: AudioBuffer,
  options: { float32?: boolean } = {}
): ArrayBuffer {
  const numChannels = buffer.numberOfChannels;
  const sampleRate = buffer.sampleRate;
  const format = options.float32 ? 3 : 1; // 3 = IEEE float, 1 = PCM
  const bitDepth = options.float32 ? 32 : 16;
  const numSamples = buffer.length;
  const headerLength = 44;
  const bytesPerSample = bitDepth / 8;
  const dataLength = numSamples * numChannels * bytesPerSample;
  const bufferLength = headerLength + dataLength;
  const arrayBuffer = new ArrayBuffer(bufferLength);
  const view = new DataView(arrayBuffer);
  let offset = 0;

  function writeString(str: string) {
    for (let i = 0; i < str.length; i++) {
      view.setUint8(offset, str.charCodeAt(i));
      offset++;
    }
  }

  // Write the 44-byte WAV header.
  writeString('RIFF');
  view.setUint32(offset, 36 + dataLength, true); // file size minus the 8-byte RIFF prefix
  offset += 4;
  writeString('WAVE');
  writeString('fmt ');
  view.setUint32(offset, 16, true); // fmt chunk size
  offset += 4;
  view.setUint16(offset, format, true);
  offset += 2;
  view.setUint16(offset, numChannels, true);
  offset += 2;
  view.setUint32(offset, sampleRate, true);
  offset += 4;
  view.setUint32(offset, sampleRate * numChannels * bytesPerSample, true); // byte rate
  offset += 4;
  view.setUint16(offset, numChannels * bytesPerSample, true); // block align
  offset += 2;
  view.setUint16(offset, bitDepth, true);
  offset += 2;
  writeString('data');
  view.setUint32(offset, dataLength, true);
  offset += 4;

  // Write the samples, interleaving channels.
  const channels: Float32Array[] = [];
  for (let i = 0; i < numChannels; i++) {
    channels.push(buffer.getChannelData(i));
  }
  for (let i = 0; i < numSamples; i++) {
    for (let channel = 0; channel < numChannels; channel++) {
      let sample = channels[channel][i];
      // Clamp the sample to [-1, 1].
      sample = Math.max(-1, Math.min(1, sample));
      if (options.float32) {
        view.setFloat32(offset, sample, true);
        offset += 4;
      } else {
        // Convert to a 16-bit PCM sample.
        const intSample = sample < 0 ? sample * 0x8000 : sample * 0x7fff;
        view.setInt16(offset, intSample, true);
        offset += 2;
      }
    }
  }
  return arrayBuffer;
}
export const blobFromAudioBuffer = (audioBuffer: AudioBuffer): Blob => {
  // Use 16-bit PCM for compatibility.
  const wavArrayBuffer = audioBufferToWav(audioBuffer, { float32: false });
  return new Blob([wavArrayBuffer], { type: 'audio/wav' });
};
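
// Usage sketch (browser-only, illustrative): trigger a client-side download
// of an AudioBuffer as a WAV file.
export const exampleDownloadWav = (buf: AudioBuffer, name: string): void => {
  const blobUrl = URL.createObjectURL(blobFromAudioBuffer(buf));
  const a = document.createElement('a');
  a.href = blobUrl;
  a.download = cleanupFilename(name) + '.wav';
  a.click();
  URL.revokeObjectURL(blobUrl);
};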
export function audioBufferToMp3(buffer: AudioBuffer): ArrayBuffer {
  const numChannels = buffer.numberOfChannels;
  const sampleRate = buffer.sampleRate;
  const bitRate = 128; // kbps - adjust as desired
  // Initialize the MP3 encoder.
  // Note: if more than 2 channels are present, only the first 2 are used.
  const mp3encoder = new lamejs.Mp3Encoder(
    numChannels >= 2 ? 2 : 1,
    sampleRate,
    bitRate
  );
  const samples = buffer.length;
  const chunkSize = 1152; // Samples per MP3 frame.

  // Prepare channel data.
  const channels: Float32Array[] = [];
  for (let ch = 0; ch < numChannels; ch++) {
    channels.push(buffer.getChannelData(ch));
  }
  const mp3Data: Uint8Array[] = [];

  // For mono audio, encode directly.
  if (numChannels === 1) {
    for (let i = 0; i < samples; i += chunkSize) {
      const sampleChunk = channels[0].subarray(i, i + chunkSize);
      const int16Buffer = floatTo16BitPCM(sampleChunk);
      const mp3buf = mp3encoder.encodeBuffer(int16Buffer);
      if (mp3buf.length > 0) {
        mp3Data.push(new Uint8Array(mp3buf));
      }
    }
  } else {
    // For stereo (or more channels), use the first two channels.
    const left = channels[0];
    const right = channels[1];
    for (let i = 0; i < samples; i += chunkSize) {
      const leftChunk = left.subarray(i, i + chunkSize);
      const rightChunk = right.subarray(i, i + chunkSize);
      const leftInt16 = floatTo16BitPCM(leftChunk);
      const rightInt16 = floatTo16BitPCM(rightChunk);
      const mp3buf = mp3encoder.encodeBuffer(leftInt16, rightInt16);
      if (mp3buf.length > 0) {
        mp3Data.push(new Uint8Array(mp3buf));
      }
    }
  }

  // Flush the encoder to get any remaining MP3 data.
  const endBuf = mp3encoder.flush();
  if (endBuf.length > 0) {
    mp3Data.push(new Uint8Array(endBuf));
  }

  // Concatenate all MP3 chunks into a single ArrayBuffer.
  const totalLength = mp3Data.reduce((acc, curr) => acc + curr.length, 0);
  const result = new Uint8Array(totalLength);
  let offset = 0;
  for (const chunk of mp3Data) {
    result.set(chunk, offset);
    offset += chunk.length;
  }
  return result.buffer;
}
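
// Usage sketch: wrap the encoded bytes in a Blob for playback or upload.
export const exampleMp3Blob = (buf: AudioBuffer): Blob =>
  new Blob([audioBufferToMp3(buf)], { type: 'audio/mpeg' });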
/**
 * Helper that converts a Float32Array of PCM samples (range -1..1)
 * into an Int16Array (range -32768..32767).
 */
function floatTo16BitPCM(input: Float32Array): Int16Array {
  const output = new Int16Array(input.length);
  for (let i = 0; i < input.length; i++) {
    const s = Math.max(-1, Math.min(1, input[i]));
    output[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
  }
  return output;
}
// Clean up a filename for saving: replace unsafe characters with underscores.
export const cleanupFilename = (name: string): string => {
  return name.replace(/[^a-zA-Z0-9-_]/g, '_');
};
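// e.g. cleanupFilename('My Podcast: Ep #1') === 'My_Podcast__Ep__1'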