import { NextResponse, NextRequest } from "next/server"

import { ClapProject, getValidNumber, newClap, newSegment, serializeClap } from "@aitube/clap"

import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace"
import { parseRawStringToYAML } from "@/app/api/utils/parseRawStringToYAML"
import { systemPrompt } from "./systemPrompt"

export type LatentStory = {
  title: string
  image: string
  voice: string
}
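
// illustrative example of the YAML list the model is expected to return
// (the exact shape is dictated by systemPrompt; this sample is hypothetical):
//
// - title: "A cat wakes up"
//   image: "close-up of a ginger cat yawning in warm morning light"
//   voice: "Every morning, Kitty starts the day with a big stretch."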

// a helper to generate Clap stories from a few sentences
// this is mostly used by external apps such as the Stories Factory
export async function POST(req: NextRequest) {
  const request = await req.json() as {
    prompt: string
    width: number
    height: number
    // can add more stuff for the V2 of Stories Factory
  }

  const prompt = `${request?.prompt || ""}`.trim()

  console.log("[api/v1/create] request:", request)

  if (!prompt.length) { throw new Error(`please provide a prompt`) }

  const width = getValidNumber(request?.width, 256, 8192, 1024)
  const height = getValidNumber(request?.height, 256, 8192, 576)
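  // note: getValidNumber(value, min, max, defaultValue) from @aitube/clap
  // clamps the requested dimensions to a sane range, falling back to the
  // default when the input is missing or not a valid number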

  const userPrompt = `Video story to generate: ${prompt}`

  // TODO use streaming for the Hugging Face prediction
  //
  // note that a Clap file is actually a YAML stream of documents,
  // so technically we could stream everything end-to-end
  // (but I haven't coded the helpers to do this yet)
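  //
  // the "```yaml\n" prefix passed to predict() nudges the model to answer
  // directly inside a YAML code block, making the completion easier to parse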
  const rawString = await predict({
    systemPrompt,
    userPrompt,
    nbMaxNewTokens: 1400,
    prefix: "```yaml\n",
  })

  console.log("[api/v1/create] rawString: ", rawString)

  const shots = parseRawStringToYAML<LatentStory[]>(rawString, [])

  console.log(`[api/v1/create] generated ${shots.length} shots`)
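  // note: parseRawStringToYAML is given [] as a fallback, so a completion
  // that fails to parse yields zero shots (an empty clap) rather than a crash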

  // this is approximate: TTS generation will determine the final duration of each shot
  const defaultSegmentDurationInMs = 7000

  let currentElapsedTimeInMs = 0
  let currentSegmentDurationInMs = defaultSegmentDurationInMs

  const clap: ClapProject = newClap({
    meta: {
      title: "Not needed", // we don't need a title actually
      description: "This video has been generated using AI",
      synopsis: "",
      licence: "Non Commercial",
      orientation: "vertical",
      width,
      height,
      isInteractive: false,
      isLoop: false,
      durationInMs: shots.length * defaultSegmentDurationInMs,
      defaultVideoModel: "AnimateDiff-Lightning",
    }
  })

  for (const { title, image, voice } of shots) {
    console.log(`[api/v1/create] - ${title}`)

    // note: it would be nice if we could have a convention saying that
    // track 0 is for videos and track 1 is for storyboards
    //
    // however, that's a bit constraining, as people will generate .clap files
    // using all kinds of tools and levels of experience,
    // and they may not wish to learn the Clap protocol format completely
    //
    // TL;DR:
    // we should fix the Clap file editor to make it able to read videos
    // from any track number

    /*
    we disable this for now, because we don't generate animated videos yet
    clap.segments.push(newSegment({
      track: 0,
      startTimeInMs: currentElapsedTimeInMs,
      assetDurationInMs: currentSegmentDurationInMs,
      category: "video",
      prompt: image,
      outputType: "video"
    }))
    */
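
    // tracks used below (as laid out in this generator): 1 = storyboard image,
    // 2 = title card (the "interface" segment), 3 = dialogue audio, 4 = camera direction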
    clap.segments.push(newSegment({
      track: 1,
      startTimeInMs: currentElapsedTimeInMs,
      assetDurationInMs: currentSegmentDurationInMs,
      category: "storyboard",
      prompt: image,
      outputType: "image"
    }))

    clap.segments.push(newSegment({
      track: 2,
      startTimeInMs: currentElapsedTimeInMs,
      assetDurationInMs: currentSegmentDurationInMs,
      category: "interface",
      prompt: title,
      // assetUrl: `data:text/plain;base64,${btoa(title)}`,
      assetUrl: title,
      outputType: "text"
    }))

    clap.segments.push(newSegment({
      track: 3,
      startTimeInMs: currentElapsedTimeInMs,
      assetDurationInMs: currentSegmentDurationInMs,
      category: "dialogue",
      prompt: voice,
      outputType: "audio"
    }))

    // the presence of a camera is mandatory
    clap.segments.push(newSegment({
      track: 4,
      startTimeInMs: currentElapsedTimeInMs,
      assetDurationInMs: currentSegmentDurationInMs,
      category: "camera",
      prompt: "vertical video",
      outputType: "text"
    }))

    // all segments of a given shot share the same time window
    currentElapsedTimeInMs += currentSegmentDurationInMs
  }

  // TODO replace with Clap file streaming
  return new NextResponse(await serializeClap(clap), {
    status: 200,
    headers: new Headers({ "content-type": "application/x-gzip" }),
  })
}
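
// example call (hypothetical host; assumes this file is app/api/v1/create/route.ts):
//
//   curl -X POST http://localhost:3000/api/v1/create \
//     -H "content-type: application/json" \
//     -d '{ "prompt": "a robot discovers a garden", "width": 1024, "height": 576 }' \
//     -o story.clap
//
// the response body is the serialized .clap file (gzipped YAML)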