// kokoro-podcast-generator
//
// Running
//
// File size: 16,329 Bytes

// @ts-expect-error this package does not have typing
import TextLineStream from 'textlinestream';
import { Client } from '@gradio/client';
import * as lamejs from '@breezystack/lamejs';

// ponyfill for missing ReadableStream asyncIterator on Safari
import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
import { CONFIG } from '../config';
import { uploadFiles } from '@huggingface/hub';

export const isDev: boolean = import.meta.env.MODE === 'development';
export const testToken: string = import.meta.env.VITE_TEST_TOKEN;
export const isBlogMode: boolean = !!window.location.href.match(/blogmode/);

export const delay = (ms: number) => new Promise((res) => setTimeout(res, ms));

// return URL to the WAV file
export const generateAudio = async (
  content: string,
  voice: string,
  speed: number = 1.1
): Promise<string> => {
  const maxRetries = 3;
  for (let i = 0; i < maxRetries; i++) {
    try {
      const client = await Client.connect(CONFIG.ttsSpaceId);
      const result = await client.predict('/tts', {
        text: content,
        voice,
        speed,
      });

      console.log(result.data);
      return (result.data as any)[0].url;
    } catch (e) {
      if (i === maxRetries - 1) {
        throw e; // last retry, throw error
      }
      console.error('Failed to generate audio, retrying...', e);
    }
    continue;
  }
  return ''; // should never reach here
};

export const pickRand = <T>(arr: T[]): T => {
  return arr[Math.floor(Math.random() * arr.length)];
};

// wrapper for SSE
export async function* getSSEStreamAsync(fetchResponse: Response) {
  if (!fetchResponse.body) throw new Error('Response body is empty');
  const lines: ReadableStream<string> = fetchResponse.body
    .pipeThrough(new TextDecoderStream())
    .pipeThrough(new TextLineStream());
  // @ts-expect-error asyncIterator complains about type, but it should work
  for await (const line of asyncIterator(lines)) {
    //if (isDev) console.log({ line });
    if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
      const data = JSON.parse(line.slice(5));
      yield data;
    } else if (line.startsWith('error:')) {
      const data = JSON.parse(line.slice(6));
      throw new Error(data.message || 'Unknown error');
    }
  }
}

export const uploadFileToHub = async (
  buf: ArrayBuffer,
  filename: string,
  repoId: string,
  hfToken: string
) => {
  await uploadFiles({
    accessToken: hfToken,
    repo: repoId,
    files: [
      {
        path: filename,
        content: new Blob([buf], { type: 'audio/wav' }),
      },
    ],
  });
};

/**
 * Ok now, most of the functions below are written by ChatGPT using Reasoning mode.
 */

////////////////////////////////////////
// Audio manipulation utils

export const trimSilence = (audioBuffer: AudioBuffer): AudioBuffer => {
  const threshold = 0.01; // Amplitude below which a sample is considered silent.
  const numChannels = audioBuffer.numberOfChannels;
  const totalSamples = audioBuffer.length;

  // Helper function to check if a sample at the given index is silent in all channels.
  const isSilent = (index: number): boolean => {
    for (let channel = 0; channel < numChannels; channel++) {
      const channelData = audioBuffer.getChannelData(channel);
      if (Math.abs(channelData[index]) > threshold) {
        return false;
      }
    }
    return true;
  };

  // Find the first non-silent sample.
  let startSample = 0;
  while (startSample < totalSamples && isSilent(startSample)) {
    startSample++;
  }

  // Find the last non-silent sample.
  let endSample = totalSamples - 1;
  while (endSample >= startSample && isSilent(endSample)) {
    endSample--;
  }

  // If no non-silent samples were found, return an empty AudioBuffer.
  if (startSample >= totalSamples || endSample < startSample) {
    return new AudioBuffer({
      length: 1,
      numberOfChannels: numChannels,
      sampleRate: audioBuffer.sampleRate,
    });
  }

  const newLength = endSample - startSample + 1;
  const newBuffer = new AudioBuffer({
    length: newLength,
    numberOfChannels: numChannels,
    sampleRate: audioBuffer.sampleRate,
  });

  // Copy the trimmed audio samples from the original buffer to the new buffer.
  for (let channel = 0; channel < numChannels; channel++) {
    const oldData = audioBuffer.getChannelData(channel);
    const newData = newBuffer.getChannelData(channel);
    for (let i = 0; i < newLength; i++) {
      newData[i] = oldData[startSample + i];
    }
  }

  return newBuffer;
};

export const joinAudio = (
  audio1: AudioBuffer,
  audio2: AudioBuffer,
  gapMilisecs: number,
  overlap: 'none' | 'cross-fade' = 'none'
): AudioBuffer => {
  const sampleRate = audio1.sampleRate;
  const numChannels = audio1.numberOfChannels;

  // Ensure both audio buffers are compatible.
  if (audio2.sampleRate !== sampleRate) {
    throw new Error('Audio buffers must have the same sample rate');
  }
  if (audio2.numberOfChannels !== numChannels) {
    throw new Error('Audio buffers must have the same number of channels');
  }

  const gapSeconds = gapMilisecs / 1000;
  let newLength: number;

  if (gapSeconds > 0) {
    // Pad with silence: gapSamples of silence in between.
    const gapSamples = Math.round(gapSeconds * sampleRate);
    newLength = audio1.length + gapSamples + audio2.length;
  } else if (gapSeconds === 0) {
    // Simply join one after the other.
    newLength = audio1.length + audio2.length;
  } else {
    // gapSeconds < 0 means we blend (overlap) the end of audio1 with the beginning of audio2.
    const overlapSamplesRequested = Math.round(-gapSeconds * sampleRate);
    // Ensure we don't overlap more than available in either buffer.
    const effectiveOverlap = Math.min(
      overlapSamplesRequested,
      audio1.length,
      audio2.length
    );
    newLength = audio1.length + audio2.length - effectiveOverlap;
  }

  // Create a new AudioBuffer for the joined result.
  const newBuffer = new AudioBuffer({
    length: newLength,
    numberOfChannels: numChannels,
    sampleRate: sampleRate,
  });

  // Process each channel.
  for (let channel = 0; channel < numChannels; channel++) {
    const outputData = newBuffer.getChannelData(channel);
    const data1 = audio1.getChannelData(channel);
    const data2 = audio2.getChannelData(channel);
    let offset = 0;

    if (gapSeconds < 0) {
      // Blend the join section.
      const overlapSamplesRequested = Math.round(-gapSeconds * sampleRate);
      const effectiveOverlap = Math.min(
        overlapSamplesRequested,
        audio1.length,
        audio2.length
      );

      // Copy audio1 data up to the start of the overlapping section.
      const nonOverlapLength = audio1.length - effectiveOverlap;
      outputData.set(data1.subarray(0, nonOverlapLength), offset);
      offset += nonOverlapLength;

      // Blend overlapping region.
      if (overlap === 'cross-fade') {
        for (let i = 0; i < effectiveOverlap; i++) {
          // Linear crossfade:
          const fadeOut = 1 - i / effectiveOverlap;
          const fadeIn = i / effectiveOverlap;
          outputData[offset + i] =
            data1[nonOverlapLength + i] * fadeOut + data2[i] * fadeIn;
        }
      } else {
        for (let i = 0; i < effectiveOverlap; i++) {
          outputData[offset + i] = data1[nonOverlapLength + i] + data2[i];
        }
      }
      offset += effectiveOverlap;

      // Append remaining audio2 data.
      outputData.set(data2.subarray(effectiveOverlap), offset);
    } else if (gapSeconds === 0) {
      // Directly concatenate: copy audio1 then audio2.
      outputData.set(data1, offset);
      offset += audio1.length;
      outputData.set(data2, offset);
    } else {
      // gapSeconds > 0: insert silence between audio1 and audio2.
      const gapSamples = Math.round(gapSeconds * sampleRate);
      outputData.set(data1, offset);
      offset += audio1.length;

      // Silence: the buffer is initialized with zeros, so we simply move the offset.
      offset += gapSamples;

      outputData.set(data2, offset);
    }
  }

  return newBuffer;
};

export const addNoise = (
  audioBuffer: AudioBuffer,
  magnitude: number
): AudioBuffer => {
  const { numberOfChannels, sampleRate, length } = audioBuffer;
  const newBuffer = new AudioBuffer({
    length,
    numberOfChannels,
    sampleRate,
  });

  for (let channel = 0; channel < numberOfChannels; channel++) {
    const inputData = audioBuffer.getChannelData(channel);
    const outputData = newBuffer.getChannelData(channel);

    for (let i = 0; i < length; i++) {
      // Generate white noise in the range [-magnitude, +magnitude].
      const noise = (Math.random() * 2 - 1) * magnitude;
      outputData[i] = inputData[i] + noise;
    }
  }

  return newBuffer;
};

export const addSilence = (
  audioBuffer: AudioBuffer,
  toBeginning: boolean,
  durationMilisecs: number
): AudioBuffer => {
  // Convert duration from milliseconds to samples.
  const sampleRate = audioBuffer.sampleRate;
  const silenceSamples = Math.round((durationMilisecs / 1000) * sampleRate);
  const numChannels = audioBuffer.numberOfChannels;
  const originalLength = audioBuffer.length;
  const newLength = originalLength + silenceSamples;

  // Create a new AudioBuffer with extra space for the silence.
  const newBuffer = new AudioBuffer({
    length: newLength,
    numberOfChannels: numChannels,
    sampleRate: sampleRate,
  });

  // Process each channel: copy original audio into the correct position.
  for (let channel = 0; channel < numChannels; channel++) {
    const originalData = audioBuffer.getChannelData(channel);
    const newData = newBuffer.getChannelData(channel);

    if (toBeginning) {
      // Leave the first `silenceSamples` as zeros, then copy the original data.
      newData.set(originalData, silenceSamples);
    } else {
      // Copy the original data first; the remaining samples are already zeros.
      newData.set(originalData, 0);
    }
  }

  return newBuffer;
};

////////////////////////////////////////
// Audio formatting utils

export const loadWavAndDecode = async (url: string): Promise<AudioBuffer> => {
  const response = await fetch(url);
  const arrayBuffer = await response.arrayBuffer();
  // @ts-expect-error this is fine
  const AudioContext = window.AudioContext || window.webkitAudioContext;
  if (!AudioContext) {
    throw new Error('AudioContext is not supported on this browser');
  }
  const audioCtx = new AudioContext();
  let audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
  // force mono
  if (audioBuffer.numberOfChannels > 1) {
    const monoBuffer = new AudioContext().createBuffer(
      1,
      audioBuffer.length,
      audioBuffer.sampleRate
    );
    const monoData = monoBuffer.getChannelData(0);
    for (let i = 0; i < audioBuffer.length; i++) {
      let sum = 0;
      for (let channel = 0; channel < audioBuffer.numberOfChannels; channel++) {
        sum += audioBuffer.getChannelData(channel)[i];
      }
      monoData[i] = sum / audioBuffer.numberOfChannels;
    }
    audioBuffer = monoBuffer;
  }
  return audioBuffer;
};

export function audioBufferToWav(
  buffer: AudioBuffer,
  options: { float32?: boolean } = {}
): ArrayBuffer {
  const numChannels = buffer.numberOfChannels;
  const sampleRate = buffer.sampleRate;
  const format = options.float32 ? 3 : 1; // 3 = IEEE float, 1 = PCM
  const bitDepth = options.float32 ? 32 : 16;

  const numSamples = buffer.length;
  const headerLength = 44;
  const bytesPerSample = bitDepth / 8;
  const dataLength = numSamples * numChannels * bytesPerSample;
  const bufferLength = headerLength + dataLength;

  const arrayBuffer = new ArrayBuffer(bufferLength);
  const view = new DataView(arrayBuffer);
  let offset = 0;

  function writeString(str: string) {
    for (let i = 0; i < str.length; i++) {
      view.setUint8(offset, str.charCodeAt(i));
      offset++;
    }
  }

  // Write WAV header
  writeString('RIFF');
  view.setUint32(offset, 36 + dataLength, true);
  offset += 4;
  writeString('WAVE');
  writeString('fmt ');
  view.setUint32(offset, 16, true);
  offset += 4;
  view.setUint16(offset, format, true);
  offset += 2;
  view.setUint16(offset, numChannels, true);
  offset += 2;
  view.setUint32(offset, sampleRate, true);
  offset += 4;
  view.setUint32(offset, sampleRate * numChannels * bytesPerSample, true);
  offset += 4;
  view.setUint16(offset, numChannels * bytesPerSample, true);
  offset += 2;
  view.setUint16(offset, bitDepth, true);
  offset += 2;
  writeString('data');
  view.setUint32(offset, dataLength, true);
  offset += 4;

  // Write PCM samples: interleave channels
  const channels: Float32Array[] = [];
  for (let i = 0; i < numChannels; i++) {
    channels.push(buffer.getChannelData(i));
  }

  for (let i = 0; i < numSamples; i++) {
    for (let channel = 0; channel < numChannels; channel++) {
      let sample = channels[channel][i];
      // Clamp the sample to [-1, 1]
      sample = Math.max(-1, Math.min(1, sample));
      if (options.float32) {
        view.setFloat32(offset, sample, true);
        offset += 4;
      } else {
        // Convert to 16-bit PCM sample
        const intSample = sample < 0 ? sample * 0x8000 : sample * 0x7fff;
        view.setInt16(offset, intSample, true);
        offset += 2;
      }
    }
  }

  return arrayBuffer;
}

export const blobFromAudioBuffer = (audioBuffer: AudioBuffer): Blob => {
  // Using 16-bit PCM for compatibility.
  const wavArrayBuffer = audioBufferToWav(audioBuffer, { float32: false });
  return new Blob([wavArrayBuffer], { type: 'audio/wav' });
};

export function audioBufferToMp3(buffer: AudioBuffer): ArrayBuffer {
  const numChannels = buffer.numberOfChannels;
  const sampleRate = buffer.sampleRate;
  const bitRate = 128; // kbps - adjust as desired

  // Initialize MP3 encoder.
  // Note: If more than 2 channels are present, only the first 2 channels will be used.
  const mp3encoder = new lamejs.Mp3Encoder(
    numChannels >= 2 ? 2 : 1,
    sampleRate,
    bitRate
  );

  const samples = buffer.length;
  const chunkSize = 1152; // Frame size for MP3 encoding

  // Prepare channel data.
  const channels: Float32Array[] = [];
  for (let ch = 0; ch < numChannels; ch++) {
    channels.push(buffer.getChannelData(ch));
  }

  const mp3Data: Uint8Array[] = [];

  // For mono audio, encode directly.
  if (numChannels === 1) {
    for (let i = 0; i < samples; i += chunkSize) {
      const sampleChunk = channels[0].subarray(i, i + chunkSize);
      const int16Buffer = floatTo16BitPCM(sampleChunk);
      const mp3buf = mp3encoder.encodeBuffer(int16Buffer);
      if (mp3buf.length > 0) {
        mp3Data.push(new Uint8Array(mp3buf));
      }
    }
  } else {
    // For stereo (or more channels, use first two channels).
    const left = channels[0];
    const right = channels[1];
    for (let i = 0; i < samples; i += chunkSize) {
      const leftChunk = left.subarray(i, i + chunkSize);
      const rightChunk = right.subarray(i, i + chunkSize);
      const leftInt16 = floatTo16BitPCM(leftChunk);
      const rightInt16 = floatTo16BitPCM(rightChunk);
      const mp3buf = mp3encoder.encodeBuffer(leftInt16, rightInt16);
      if (mp3buf.length > 0) {
        mp3Data.push(new Uint8Array(mp3buf));
      }
    }
  }

  // Flush the encoder to get any remaining MP3 data.
  const endBuf = mp3encoder.flush();
  if (endBuf.length > 0) {
    mp3Data.push(new Uint8Array(endBuf));
  }

  // Concatenate all MP3 chunks into a single ArrayBuffer.
  const totalLength = mp3Data.reduce((acc, curr) => acc + curr.length, 0);
  const result = new Uint8Array(totalLength);
  let offset = 0;
  for (const chunk of mp3Data) {
    result.set(chunk, offset);
    offset += chunk.length;
  }

  return result.buffer;
}

/**
 * Helper function that converts a Float32Array of PCM samples (range -1..1)
 * into an Int16Array (range -32768..32767).
 */
function floatTo16BitPCM(input: Float32Array): Int16Array {
  const output = new Int16Array(input.length);
  for (let i = 0; i < input.length; i++) {
    const s = Math.max(-1, Math.min(1, input[i]));
    output[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
  }
  return output;
}

// clean up filename for saving
export const cleanupFilename = (name: string): string => {
  return name.replace(/[^a-zA-Z0-9-_]/g, '_');
};