import { NextResponse, NextRequest } from "next/server"
import { ClapProject, getValidNumber, newClap, newSegment, serializeClap } from "@aitube/clap"
import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace"
import { parseRawStringToYAML } from "@/app/api/utils/parseRawStringToYAML"
import { systemPrompt } from "./systemPrompt"
export type LatentStory = {
title: string
image: string
voice: string
}
// a helper to generate Clap stories from a few sentences
// this is mostly used by external apps such as the Stories Factory
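//
// example request body (illustrative values; only `prompt` is required):
//
//   { "prompt": "a cat discovers a portal to another world", "width": 1024, "height": 576 }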
export async function POST(req: NextRequest) {
const request = await req.json() as {
prompt: string
width: number
height: number
// can add more stuff for the V2 of Stories Factory
}
const prompt = `${request?.prompt || ""}`.trim()
console.log("[api/v1/create] request:", request)
if (!prompt.length) { throw new Error(`please provide a prompt`) }
const width = getValidNumber(request?.width, 256, 8192, 1024)
const height = getValidNumber(request?.height, 256, 8192, 576)
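// the requested dimensions are clamped to a sane range, falling back to 1024x576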
const userPrompt = `Video story to generate: ${prompt}`
// TODO use streaming for the Hugging Face prediction
//
// note that a Clap file is actually a YAML stream of documents
// so technically we could stream everything from end-to-end
// (but I haven't coded the helpers to do this yet)
const rawString = await predict({
systemPrompt,
userPrompt,
nbMaxNewTokens: 1400,
prefix: "```yaml\n",
})
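// the prefix passed above is meant to steer the model into answering with a YAML block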
console.log("[api/v1/create] rawString: ", rawString)
const shots = parseRawStringToYAML<LatentStory[]>(rawString, [])
console.log(`[api/v1/create] generated ${shots.length} shots`)
// this is approximate - TTS generation will determine the final duration of each shot
const defaultSegmentDurationInMs = 7000
let currentElapsedTimeInMs = 0
let currentSegmentDurationInMs = defaultSegmentDurationInMs
const clap: ClapProject = newClap({
meta: {
title: "Not needed", // we don't need a title actually
description: "This video has been generated using AI",
synopsis: "",
licence: "Non Commercial",
orientation: "vertical",
width,
height,
isInteractive: false,
isLoop: false,
durationInMs: shots.length * defaultSegmentDurationInMs,
defaultVideoModel: "AnimateDiff-Lightning",
}
})
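// each generated shot becomes a set of segments spread across fixed tracks:
// storyboard image (track 1), title overlay (track 2), dialogue (track 3), camera (track 4)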
for (const { title, image, voice } of shots) {
console.log(`[api/v1/create] - ${title}`)
// note: it would be nice if we could have a convention saying that
// track 0 is for videos and track 1 is for storyboards
//
// however, that's a bit constraining, as people will generate .clap files
// using all kinds of tools and levels of development experience,
// and they may not wish to learn the Clap protocol format completely
//
// TL;DR:
// we should fix the Clap file editor to make it able to read videos
// from any track number
/*
this block is disabled because we don't generate animated videos yet
clap.segments.push(newSegment({
track: 0,
category: "video",
prompt: image,
outputType: "video"
}))
*/
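// track 1: storyboard image for this shot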
clap.segments.push(newSegment({
track: 1,
startTimeInMs: currentElapsedTimeInMs,
assetDurationInMs: defaultSegmentDurationInMs,
category: "storyboard",
prompt: image,
outputType: "image"
}))
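// track 2: the shot title, displayed as interface text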
clap.segments.push(newSegment({
track: 2,
startTimeInMs: currentElapsedTimeInMs,
assetDurationInMs: defaultSegmentDurationInMs,
category: "interface",
prompt: title,
// assetUrl: `data:text/plain;base64,${btoa(title)}`,
assetUrl: title,
outputType: "text"
}))
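// track 3: the dialogue line, to be turned into audio later (eg. by the TTS step)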
clap.segments.push(newSegment({
track: 3,
startTimeInMs: currentElapsedTimeInMs,
assetDurationInMs: defaultSegmentDurationInMs,
category: "dialogue",
prompt: voice,
outputType: "audio"
}))
// the presence of a camera is mandatory
clap.segments.push(newSegment({
track: 4,
startTimeInMs: currentElapsedTimeInMs,
assetDurationInMs: defaultSegmentDurationInMs,
category: "camera",
prompt: "vertical video",
outputType: "text"
}))
// advance the timeline by the duration of the shot we just scheduled
currentElapsedTimeInMs += currentSegmentDurationInMs
}
// TODO replace by Clap file streaming
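// the serialized .clap is a gzipped YAML stream, hence the content type below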
return new NextResponse(await serializeClap(clap), {
status: 200,
headers: new Headers({ "content-type": "application/x-gzip" }),
})
}