// VideoChain-API: src/production/generateShot.mts
import path from "node:path"
import { v4 as uuidv4 } from "uuid"
import tmpDir from "temp-dir"
import { downloadFileToTmp } from "../utils/downloadFileToTmp.mts"
import { generateAudio } from "./generateAudio.mts"
import { generateVideo } from "./generateVideo.mts"
import { upscaleVideo } from "./upscaleVideo.mts"
import { generateVoice } from "./generateVoice.mts"
import { generateSeed } from "../utils/generateSeed.mts"
import { mergeAudio } from "./mergeAudio.mts"
import { addAudioToVideo } from "./addAudioToVideo.mts"
import { interpolateVideo } from "./interpolateVideo.mts"
import { postInterpolation } from "./postInterpolation.mts"
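/**
 * Generates a single video shot: a base video clip, optionally upscaled and
 * interpolated, with optional background/foreground audio and a voice track.
 * Returns the shot id, the resolved file path and the video file name.
 */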
export const generateShot = async ({
seed = 0,
shotId = "",
actorPrompt = "",
shotPrompt = "",
backgroundAudioPrompt = "",
foregroundAudioPrompt = "",
actorDialoguePrompt = "",
actorVoicePrompt = "",
duration = 2,
nbFrames = 24,
resolution = 576,
nbSteps = 35,
upscale = true,
interpolate = true,
noise = true,
}: {
seed?: number;
shotId?: string;
actorPrompt?: string;
shotPrompt?: string;
backgroundAudioPrompt?: string;
foregroundAudioPrompt?: string;
actorDialoguePrompt?: string;
actorVoicePrompt?: string;
duration?: number; // duration in seconds (default: 2)
nbFrames?: number; // frame rate (default: 24 FPS)
resolution?: number; // 256, 320, 512, 576, 720, 1080..
nbSteps?: number;
upscale?: boolean;
interpolate?: boolean;
noise?: boolean;
}) => {
seed = seed || generateSeed()
shotId = shotId || uuidv4()
const shotFileName = `${shotId}.mp4`
console.log("generating video shot:", {
seed,
shotId,
actorPrompt,
shotPrompt,
backgroundAudioPrompt,
foregroundAudioPrompt,
actorDialoguePrompt,
actorVoicePrompt,
duration,
nbFrames,
resolution,
nbSteps,
upscale,
interpolate,
noise,
})
if (actorPrompt) {
console.log("generating actor..")
const actorIdentityFileName = `actor_${Date.now()}.png`
// TODO: actor generation is not wired up yet
// await generateActor(actorPrompt, actorIdentityFileName, seed)
}
console.log("generating base video ..")
let generatedVideoUrl = ""
// currently the base model cannot generate more than 24 frames
// (and the upscaler would have trouble with a different count anyway),
// so for now we fix the shot to 24 frames
// const nbFramesForBaseModel = Math.min(3, Math.max(1, Math.round(duration))) * 8
const nbFramesForBaseModel = 24
try {
generatedVideoUrl = await generateVideo(shotPrompt, {
seed,
nbFrames: nbFramesForBaseModel,
nbSteps
})
} catch (err) {
// video generation can be finicky, so if it fails we try one more time
console.log('- trying again to generate base shot..')
generatedVideoUrl = await generateVideo(shotPrompt, {
seed,
nbFrames: nbFramesForBaseModel,
nbSteps
})
}
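// note: the same "try once, retry once" pattern is used again for upscaling below;
// a small generic helper (hypothetical, not part of this codebase) could factor it out:
//
//   const retryOnce = async <T>(fn: () => Promise<T>): Promise<T> => {
//     try { return await fn() } catch { return await fn() }
//   }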
console.log("downloading video..")
const videoFileName = await downloadFileToTmp(generatedVideoUrl, shotFileName)
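// the shot file now lives in tmpDir; all the steps below refer to it by this name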
if (upscale) {
console.log("upscaling video..")
try {
await upscaleVideo(videoFileName, shotPrompt)
} catch (err) {
// upscaling can be finicky, if it fails we try again
console.log('- trying again to upscale shot..')
await upscaleVideo(videoFileName, shotPrompt)
}
}
if (interpolate) {
console.log("upscaling video..")
// ATTENTION 1:
// the interpolation step always creates a SLOW MOTION video,
// so the result can last significantly longer (eg. 2x, 3x, 4x)
// than the duration produced by the original video model
// ATTENTION 2:
// the interpolation step generates videos in 910x512!
// ATTENTION 3:
// the interpolation step parameters are currently not passed to the space,
// so changing the two variables below will have no effect!
const interpolationSteps = 3
const interpolatedFramesPerSecond = 24
await interpolateVideo(
videoFileName,
interpolationSteps,
interpolatedFramesPerSecond
)
console.log('creating slow-mo video (910x512 @ 24 FPS)')
// with our current interpolation settings, the ~3 second video generated by the model
// becomes a ~7 second slow-motion video at 24 FPS,
// so we want to scale it back to the desired duration
// as a final trick we also upscale it (without AI) and add some FX
console.log('performing final scaling (1280x720 @ 24 FPS)')
await postInterpolation(videoFileName, duration, nbFrames)
}
let backgroundAudioFileName = ''
if (backgroundAudioPrompt) {
console.log("generating background audio..")
backgroundAudioFileName = await generateAudio(backgroundAudioPrompt, `shot_${shotId}_audio_${uuidv4()}.m4a`)
}
let foregroundAudioFileName = ''
if (foregroundAudioPrompt) {
console.log("generating foreground audio..")
foregroundAudioFileName = await generateAudio(foregroundAudioPrompt, `shot_${shotId}_audio_${uuidv4()}.m4a`)
}
let voiceAudioFileName = ''
if (actorDialoguePrompt) {
console.log("configuring dialogue..")
if (actorVoicePrompt) {
console.log("configuring voice..")
// TODO: use actorVoicePrompt to select a voice model;
// for now we always use the same default voice
console.log('TODO: the voice should be defined in the sequence, not the prompt!')
voiceAudioFileName = await generateVoice(actorDialoguePrompt, `shot_${shotId}_voice_${uuidv4()}.m4a`)
}
}
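// note: voiceAudioFileName is generated but not yet merged into the final
// video below; only the background and foreground tracks are (TODO)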
if (backgroundAudioFileName || foregroundAudioFileName) {
console.log('merging audio with video..')
let audioFileName = ''
// we have both background and foreground
if (backgroundAudioFileName && foregroundAudioFileName) {
audioFileName = await mergeAudio({
input1FileName: backgroundAudioFileName,
input1Volume: 0.2, // 20% volume
input2FileName: foregroundAudioFileName,
input2Volume: 0.7, // 70% volume
})
} else if (backgroundAudioFileName) {
audioFileName = backgroundAudioFileName
} else if (foregroundAudioFileName) {
audioFileName = foregroundAudioFileName
}
await addAudioToVideo(videoFileName, audioFileName)
}
console.log("returning result to user..")
const filePath = path.resolve(tmpDir, videoFileName)
return {
shotId,
filePath,
videoFileName
}
}
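// example usage (illustrative only; the prompts and values are made up):
//
//   const { shotId, filePath } = await generateShot({
//     shotPrompt: "medium shot of a castle on a hill, cinematic, 35mm",
//     backgroundAudioPrompt: "wind and distant birds",
//     duration: 2,
//     nbFrames: 24,
//   })
//   console.log(`generated shot ${shotId} at ${filePath}`)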