import { ClapProject, ClapSegment, getClapAssetSourceType, filterSegments, ClapSegmentFilteringMode } from "@aitube/clap"
import { getSpeechBackgroundAudioPrompt } from "@aitube/engine"
import { generateSpeechWithParlerTTS } from "@/app/api/generators/speech/generateVoiceWithParlerTTS"
import { getMediaInfo } from "@/app/api/utils/getMediaInfo"
import { ClapCompletionMode } from "../types"
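
/**
 * Generate the missing dialogue audio for a single shot.
 *
 * If the shot has a dialogue segment without an assetUrl, the speech is
 * synthesized with Parler-TTS and stored on that segment, and the resulting
 * audio duration is propagated to the other segments. In partial mode the
 * new dialogue segment is also pushed to `newerClap`.
 */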
export async function processShot({
  shotSegment,
  existingClap,
  newerClap,
  mode
}: {
  shotSegment: ClapSegment
  existingClap: ClapProject
  newerClap: ClapProject
  mode: ClapCompletionMode
}): Promise<void> {
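
  // gather the segments associated with this shot (filtered on the shot's start time)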
  const shotSegments: ClapSegment[] = filterSegments(
    ClapSegmentFilteringMode.START,
    shotSegment,
    existingClap.segments
  )

  const shotDialogueSegments: ClapSegment[] = shotSegments.filter(s =>
    s.category === "dialogue"
  )
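
  // we only handle one dialogue per shot here, so take the first one (if any)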
  let shotDialogueSegment: ClapSegment | undefined = shotDialogueSegments.at(0)

  console.log(`[api/edit/dialogues] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotDialogueSegments.length} dialogues)`)

  if (shotDialogueSegment && !shotDialogueSegment.assetUrl) {
    // console.log(`[api/edit/dialogues] generating audio..`)

    try {
      // this generates an mp3
      shotDialogueSegment.assetUrl = await generateSpeechWithParlerTTS({
        text: shotDialogueSegment.prompt,
        audioId: getSpeechBackgroundAudioPrompt(
          shotSegments,
          existingClap.entityIndex,
          ["high quality", "crisp", "detailed"]
        ),
        debug: true,
      })
      shotDialogueSegment.assetSourceType = getClapAssetSourceType(shotDialogueSegment.assetUrl)
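
      // probe the generated file to make sure it contains audio, and to read its duration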
      const { durationInMs, durationInSec, hasAudio } = await getMediaInfo(shotDialogueSegment.assetUrl)

      if (hasAudio && durationInMs > 1000) {
        shotDialogueSegment.assetDurationInMs = durationInMs
        shotSegment.assetDurationInMs = durationInMs

        // we update the duration of all the segments for this shot
        // (it is possible that this makes the two previous lines redundant)
        existingClap.segments.forEach(s => {
          s.assetDurationInMs = durationInMs
        })
      }
    } catch (err) {
      console.log(`[api/edit/dialogues] processShot: failed to generate audio: ${err}`)
      throw err
    }

    console.log(`[api/edit/dialogues] processShot: generated dialogue audio: ${shotDialogueSegment?.assetUrl?.slice?.(0, 50)}...`)

    // in partial mode, we need to manually add the segment to the newer clap
    if (mode === "partial") {
      newerClap.segments.push(shotDialogueSegment)
    }
  } else {
    console.log(`[api/edit/dialogues] processShot: there is already a dialogue audio: ${shotDialogueSegment?.assetUrl?.slice?.(0, 50)}...`)
  }
}
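
// Minimal usage sketch (hypothetical caller, not part of the original file):
// assuming `shotSegments` holds the shot-level segments of `existingClap`,
// each shot could be processed concurrently, e.g.:
//
//   await Promise.all(shotSegments.map(shotSegment =>
//     processShot({ shotSegment, existingClap, newerClap, mode })
//   ))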