"use server" import { ClapProject, getValidNumber, newClap, newSegment } from "@aitube/clap" import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace" import { parseRawStringToYAML } from "@/app/api/parsers/parseRawStringToYAML" import { sleep } from "@/lib/utils/sleep" import { systemPrompt } from "./systemPrompt" import { LatentStory } from "./types" // a helper to generate Clap stories from a few sentences // this is mostly used by external apps such as the Stories Factory export async function create(request: { prompt?: string width?: number height?: number }= { prompt: "", width: 1024, height: 576, }): Promise { const prompt = `${request?.prompt || ""}`.trim() console.log("api/v1/create(): request:", request) if (!prompt.length) { throw new Error(`please provide a prompt`) } const width = getValidNumber(request?.width, 256, 8192, 1024) const height = getValidNumber(request?.height, 256, 8192, 576) const userPrompt = `Video story to generate: ${prompt}` const prefix = "```yaml\n" const nbMaxNewTokens = 1400 // TODO use streaming for the Hugging Face prediction // // note that a Clap file is actually a YAML stream of documents // so technically we could stream everything from end-to-end // (but I haven't coded the helpers to do this yet) let rawString = await predict({ systemPrompt, userPrompt, nbMaxNewTokens, prefix, }) console.log("api/v1/create(): rawString: ", rawString) let shots: LatentStory[] = [] let maybeShots = parseRawStringToYAML(rawString, []) if (!Array.isArray(maybeShots) || maybeShots.length === 0) { console.log(`api/v1/create(): failed to generate shots.. trying again`) await sleep(2000) rawString = await predict({ systemPrompt, userPrompt: userPrompt + ".", // we trick the Hugging Face cache nbMaxNewTokens, prefix, }) console.log("api/v1/create(): rawString: ", rawString) maybeShots = parseRawStringToYAML(rawString, []) if (!Array.isArray(maybeShots) || maybeShots.length === 0) { console.log(`api/v1/create(): failed to generate shots for the second time, which indicates an issue with the Hugging Face API`) } else { shots = maybeShots } } else { shots = maybeShots } console.log(`api/v1/create(): generated ${shots.length} shots`) // this is approximate - TTS generation will determine the final duration of each shot const defaultSegmentDurationInMs = 7000 let currentElapsedTimeInMs = 0 const clap: ClapProject = newClap({ meta: { title: "Not needed", // we don't need a title actually description: "This video has been generated using AI", synopsis: "", licence: "Non Commercial", orientation: "vertical", width, height, isInteractive: false, isLoop: false, durationInMs: shots.length * defaultSegmentDurationInMs, defaultVideoModel: "AnimateDiff-Lightning", } }) for (const { title, image, voice } of shots) { console.log(`api/v1/create(): - ${title}`) // note: it would be nice if we could have a convention saying that // track 0 is for videos and track 1 storyboards // // however, that's a bit constraining as people will generate .clap // using all kind of tools and development experience, // and they may not wish to learn the Clap protocol format completely // // TL;DR: // we should fix the Clap file editor to make it able to react videos // from any track number /* we disable it, because we don't generate animated videos yet clap.segments.push(newSegment({ track: 0, category: "video", prompt: image, outputType: "video" })) */ clap.segments.push(newSegment({ track: 1, startTimeInMs: currentElapsedTimeInMs, assetDurationInMs: defaultSegmentDurationInMs, category: "storyboard", prompt: image, outputType: "image" })) clap.segments.push(newSegment({ track: 2, startTimeInMs: currentElapsedTimeInMs, assetDurationInMs: defaultSegmentDurationInMs, category: "interface", prompt: title, // assetUrl: `data:text/plain;base64,${btoa(title)}`, assetUrl: title, outputType: "text" })) clap.segments.push(newSegment({ track: 3, startTimeInMs: currentElapsedTimeInMs, assetDurationInMs: defaultSegmentDurationInMs, category: "dialogue", prompt: voice, outputType: "audio" })) // the presence of a camera is mandatory clap.segments.push(newSegment({ track: 4, startTimeInMs: currentElapsedTimeInMs, assetDurationInMs: defaultSegmentDurationInMs, category: "camera", prompt: "vertical video", outputType: "text" })) currentElapsedTimeInMs += defaultSegmentDurationInMs } return clap }