// Spaces: jbilcke-hf / ai-tube (Running) - File size: 4,498 Bytes

import { NextResponse, NextRequest } from "next/server"
import { ClapProject, getValidNumber, newClap, newSegment, serializeClap } from "@aitube/clap"

import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace"
import { parseRawStringToYAML } from "@/app/api/utils/parseRawStringToYAML"

import { systemPrompt } from "./systemPrompt"

/**
 * One shot of a generated story, as parsed from the YAML returned by the
 * language model.
 */
export type LatentStory = {
  // text used as the prompt (and asset URL) of the shot's "interface" text segment
  title: string
  // text used as the prompt of the shot's "storyboard" image segment
  image: string
  // text used as the prompt of the shot's "dialogue" audio segment
  voice: string
}

/**
 * A helper to generate Clap stories from a few sentences.
 *
 * This is mostly used by external apps such as the Stories Factory.
 *
 * The request body is JSON of the shape `{ prompt, width, height }`.
 * The prompt is passed to the Hugging Face prediction helper, the raw answer
 * is parsed as a YAML list of shots, and each shot is turned into four
 * segments (storyboard, interface, dialogue and camera) that are laid out
 * back-to-back on the timeline, starting at 0 ms.
 *
 * @param req - incoming request carrying the JSON payload
 * @returns a 200 response whose body is the serialized Clap project
 *          (content-type "application/x-gzip"), or a 400 JSON response
 *          when the prompt is missing or blank
 */
export async function POST(req: NextRequest) {

  const request = await req.json() as {
    prompt: string
    width: number
    height: number
    // can add more stuff for the V2 of Stories Factory
  }
  
  const prompt = `${request?.prompt || ""}`.trim()

  console.log("[api/v1/create] request:", request)

  // a missing prompt is a client error: answer with a 400 rather than
  // throwing (which would surface as an opaque 500)
  if (!prompt.length) {
    return NextResponse.json({ error: "please provide a prompt" }, { status: 400 })
  }

  // out-of-range or invalid dimensions fall back to 1024x576
  const width = getValidNumber(request?.width, 256, 8192, 1024)
  const height = getValidNumber(request?.height, 256, 8192, 576)

  const userPrompt = `Video story to generate: ${prompt}`

  // TODO use streaming for the Hugging Face prediction
  //
  // note that a Clap file is actually a YAML stream of documents
  // so technically we could stream everything from end-to-end
  // (but I haven't coded the helpers to do this yet)
  const rawString = await predict({
    systemPrompt,
    userPrompt,
    nbMaxNewTokens: 1400,
    prefix: "```yaml\n",
  })

  console.log("[api/v1/create] rawString: ", rawString)

  // falls back to an empty list of shots when the answer cannot be parsed
  const shots = parseRawStringToYAML<LatentStory[]>(rawString, [])

  console.log(`[api/v1/create] generated ${shots.length} shots`)

  // this is approximate - TTS generation will determine the final duration of each shot
  const defaultSegmentDurationInMs = 7000

  // timeline cursor: start time of the shot currently being laid out.
  // it starts at 0 so that the first shot begins with the video and the last
  // shot ends exactly at `durationInMs`
  let currentElapsedTimeInMs = 0

  const clap: ClapProject = newClap({
    meta: {
      title: "Not needed", // we don't need a title actually
      description: "This video has been generated using AI",
      synopsis: "",
      licence: "Non Commercial",
      // NOTE(review): orientation is always "vertical", even though the default
      // 1024x576 size is landscape - confirm this is intended
      orientation: "vertical",
      width,
      height,
      isInteractive: false,
      isLoop: false,
      durationInMs: shots.length * defaultSegmentDurationInMs,
      defaultVideoModel: "AnimateDiff-Lightning",
    }
  })

  for (const { title, image, voice } of shots) {

    console.log(`[api/v1/create]  - ${title}`)

    // note: it would be nice if we could have a convention saying that
    // track 0 is for videos and track 1 storyboards
    // 
    // however, that's a bit constraining as people will generate .clap
    // using all kind of tools and development experience,
    // and they may not wish to learn the Clap protocol format completely
    //
    // TL;DR: 
    // we should fix the Clap file editor to make it able to read videos
    // from any track number


    /*
    we disable it, because we don't generate animated videos yet
    clap.segments.push(newSegment({
      track: 0,
      category: "video",
      prompt: image,
      outputType: "video"
    }))
    */

    // still image illustrating the shot
    clap.segments.push(newSegment({
      track: 1,
      startTimeInMs: currentElapsedTimeInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: "storyboard",
      prompt: image,
      outputType: "image"
    }))

    // on-screen text: the title is used both as the prompt and as the asset
    clap.segments.push(newSegment({
      track: 2,
      startTimeInMs: currentElapsedTimeInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: "interface",
      prompt: title,
      // assetUrl: `data:text/plain;base64,${btoa(title)}`,
      assetUrl: title,
      outputType: "text"
    }))

    // voice-over for the shot
    clap.segments.push(newSegment({
      track: 3,
      startTimeInMs: currentElapsedTimeInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: "dialogue",
      prompt: voice,
      outputType: "audio"
    }))

    // the presence of a camera is mandatory
    clap.segments.push(newSegment({
      track: 4,
      startTimeInMs: currentElapsedTimeInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: "camera",
      prompt: "vertical video",
      outputType: "text"
    }))

    // move the cursor to the start of the next shot
    currentElapsedTimeInMs += defaultSegmentDurationInMs
  }

  // TODO replace by Clap file streaming
  return new NextResponse(await serializeClap(clap), {
    status: 200,
    headers: new Headers({ "content-type": "application/x-gzip" }),
  })
}