import { NextResponse, NextRequest } from "next/server"
import { ClapProject, getValidNumber, newClap, newSegment, serializeClap } from "@aitube/clap"
import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace"
import { parseRawStringToYAML } from "@/app/api/utils/parseRawStringToYAML"
import { systemPrompt } from "./systemPrompt"
export type LatentStory = {
  title: string
  image: string
  voice: string
}
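
// for illustration, a single shot parsed from the LLM output might look like
// this (the values are made up; the actual fields come from the YAML the
// model returns, as requested by `systemPrompt`):
//
//   const exampleShot: LatentStory = {
//     title: "A cat walks into the garden",
//     image: "photo of a tabby cat stepping through a garden gate, golden hour",
//     voice: "Whiskers had never seen the garden before.",
//   }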
// a helper to generate Clap stories from a few sentences
// this is mostly used by external apps such as the Stories Factory
export async function POST(req: NextRequest) {
  const request = await req.json() as {
    prompt: string
    width: number
    height: number
    // can add more stuff for the V2 of Stories Factory
  }

  const prompt = `${request?.prompt || ""}`.trim()

  console.log("[api/v1/create] request:", request)

  if (!prompt.length) { throw new Error(`please provide a prompt`) }

  const width = getValidNumber(request?.width, 256, 8192, 1024)
  const height = getValidNumber(request?.height, 256, 8192, 576)
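  // note: getValidNumber(value, min, max, defaultValue) above is assumed to
  // clamp the input to [min, max] and fall back to the default (1024 / 576)
  // when the value is missing or invalid (check @aitube/clap for the exact
  // behavior)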
  const userPrompt = `Video story to generate: ${prompt}`

  // TODO use streaming for the Hugging Face prediction
  //
  // note that a Clap file is actually a YAML stream of documents,
  // so technically we could stream everything end-to-end
  // (but I haven't coded the helpers to do this yet)
  const rawString = await predict({
    systemPrompt,
    userPrompt,
    nbMaxNewTokens: 1400,
    prefix: "```yaml\n",
  })

  console.log("[api/v1/create] rawString:", rawString)

  const shots = parseRawStringToYAML<LatentStory[]>(rawString, [])

  console.log(`[api/v1/create] generated ${shots.length} shots`)
  // this is approximate: TTS generation will determine the final duration of each shot
  const defaultSegmentDurationInMs = 7000

  let currentElapsedTimeInMs = 0
  let currentSegmentDurationInMs = defaultSegmentDurationInMs
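  // each shot occupies one consecutive slot on the timeline: with 3 shots and
  // the default duration, segments start at 0, 7000 and 14000 ms, and
  // meta.durationInMs is 3 * 7000 = 21000 ms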
  const clap: ClapProject = newClap({
    meta: {
      title: "Not needed", // we don't need a title actually
      description: "This video has been generated using AI",
      synopsis: "",
      licence: "Non Commercial",
      orientation: "vertical",
      width,
      height,
      isInteractive: false,
      isLoop: false,
      durationInMs: shots.length * defaultSegmentDurationInMs,
      defaultVideoModel: "AnimateDiff-Lightning",
    }
  })
  for (const { title, image, voice } of shots) {
    console.log(`[api/v1/create] - ${title}`)

    // note: it would be nice if we could have a convention saying that
    // track 0 is for videos and track 1 for storyboards
    //
    // however, that's a bit constraining, as people will generate .clap files
    // using all kinds of tools and workflows,
    // and they may not wish to learn the Clap protocol format completely
    //
    // TL;DR:
    // we should fix the Clap file editor to make it able to read videos
    // from any track number
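
    // for reference, the tracks used below are:
    //   track 1: storyboard image
    //   track 2: interface text (the shot title)
    //   track 3: dialogue audio (TTS)
    //   track 4: camera direction
    // (track 0 is reserved for the video segments, currently disabled)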
    /*
    disabled for now, because we don't generate animated videos yet

    clap.segments.push(newSegment({
      track: 0,
      startTimeInMs: currentElapsedTimeInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: "video",
      prompt: image,
      outputType: "video"
    }))
    */
    clap.segments.push(newSegment({
      track: 1,
      startTimeInMs: currentElapsedTimeInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: "storyboard",
      prompt: image,
      outputType: "image"
    }))
    clap.segments.push(newSegment({
      track: 2,
      startTimeInMs: currentElapsedTimeInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: "interface",
      prompt: title,
      // assetUrl: `data:text/plain;base64,${btoa(title)}`,
      assetUrl: title,
      outputType: "text"
    }))
    clap.segments.push(newSegment({
      track: 3,
      startTimeInMs: currentElapsedTimeInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: "dialogue",
      prompt: voice,
      outputType: "audio"
    }))
    // a camera segment is mandatory for each shot
    clap.segments.push(newSegment({
      track: 4,
      startTimeInMs: currentElapsedTimeInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: "camera",
      prompt: "vertical video",
      outputType: "text"
    }))
    // advance the timeline cursor (this is what makes startTimeInMs above
    // increase shot by shot; the original incremented the duration variable
    // instead and left currentElapsedTimeInMs unused)
    currentElapsedTimeInMs += currentSegmentDurationInMs
  }
  // TODO: replace with Clap file streaming
  return new NextResponse(await serializeClap(clap), {
    status: 200,
    headers: new Headers({ "content-type": "application/x-gzip" }),
  })
}
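
// a sketch of how a client such as the Stories Factory could call this
// endpoint and read the result back (the endpoint path and the `parseClap`
// call are assumptions; check the @aitube/clap API for the exact signature):
//
//   import { parseClap } from "@aitube/clap"
//
//   const res = await fetch("/api/v1/create", {
//     method: "POST",
//     headers: { "content-type": "application/json" },
//     body: JSON.stringify({
//       prompt: "a cat discovers a secret garden", // made-up example prompt
//       width: 1024,
//       height: 576,
//     }),
//   })
//   // the response body is a gzipped .clap archive
//   const clap = await parseClap(await res.blob())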