Spaces:

jbilcke-hf
/

ai-tube

Running

App Files Files Community

jbilcke-hf HF staff commited on May 15, 2024

Commit

1e78ba0

1 Parent(s): 3c87951

update

Browse files

Files changed (8) hide show

src/app/api/v1/create/checkCaptions.ts +14 -0
src/app/api/v1/create/index.ts +44 -9
src/app/api/v1/edit/entities/clapToLatentStory.ts +5 -3
src/app/api/v1/edit/music/generateMusic.ts +10 -1
src/app/api/v1/edit/music/generateMusicPrompt.ts +2 -2
src/app/api/v1/edit/sounds/generateSoundPrompt.ts +2 -2
src/app/api/v1/render/animatediff-lcm-svd/cluster.ts +17 -5
src/app/api/v1/render/animatediff-lcm-svd/index.ts +6 -2

src/app/api/v1/create/checkCaptions.ts ADDED Viewed

	@@ -0,0 +1,14 @@

/**
 * Matches a user instruction asking to disable captions / commentary,
 * e.g. ", no captions", "without any subtitles please", "remove the comments".
 *
 * Structure: optional leading ", " + a negating verb + optional article +
 * a caption-like noun (optionally plural) + optional politeness word.
 *
 * Word boundaries (`\b`) are required on both sides so that we never match
 * inside a longer word ("piano subtitles", "no submarine"), and the longer
 * nouns are listed before their prefixes ("commentary" before "comment",
 * "subtitle" before "sub") so the whole word is consumed.
 */
const noCaptionsPattern =
  /,? ?\b(?:no|without|skip|hide|empty|remove|delete) (?:(?:the|any|all) )?(?:commentary|commentarie|comment|caption|subtitle|subtext|sub|title)s?\b(?: (?:pls|plz|please|thanks?)\b)?/gi

/**
 * Detects and strips "no captions"-style instructions from a user prompt.
 *
 * @param input - the raw user prompt
 * @returns `prompt`: the input with every "no captions" instruction removed;
 *          `hasCaptions`: `false` if at least one such instruction was found
 *          (and removed), `true` otherwise.
 */
export function checkCaptions(input: string): {
  prompt: string
  hasCaptions: boolean
} {
  // the pattern carries the global flag, so `replace` removes every occurrence
  const prompt = input.replace(noCaptionsPattern, "")

  return {
    prompt,
    // the rule is that we have captions, unless we did have text like "no captions"
    hasCaptions: prompt === input
  }
}

src/app/api/v1/create/index.ts CHANGED Viewed

@@ -11,6 +11,8 @@ import { systemPromptCompleteStory } from "./systemPromptCompleteStory"
 import { generateMusicPrompts } from "../edit/music/generateMusicPrompt"
 import { clapToLatentStory } from "../edit/entities/clapToLatentStory"
 import { generateRandomStory } from "./generateRandomStory"
 // a helper to generate Clap stories from a few sentences
 // this is mostly used by external apps such as the Stories Factory
@@ -27,9 +29,9 @@ export async function create(request: {
 }): Promise<ClapProject> {
   // we limit to 512 characters
-  let prompt = `${request?.prompt || ""}`.trim().slice(0, 512)
-  console.log("api/v1/create(): request:", request)
   if (!prompt.length) {
     // throw new Error(`please provide a prompt`)
@@ -96,7 +98,7 @@ Output: `
   console.log(`api/v1/create(): generated ${shots.length} shots`)
   // this is approximate - TTS generation will determine the final duration of each shot
-  const defaultSegmentDurationInMs = 7000
   let currentElapsedTimeInMs = 0
@@ -121,7 +123,7 @@ Output: `
   for (const { comment, image, voice } of shots) {
-    console.log(`api/v1/create():  - ${comment}`)
     // note: it would be nice if we could have a convention saying that
     // track 0 is for videos and track 1 storyboards
@@ -156,6 +158,7 @@ Output: `
       status: "to_generate",
     }))
     clap.segments.push(newSegment({
       track: 2,
       startTimeInMs: currentElapsedTimeInMs,
@@ -168,6 +171,7 @@ Output: `
       outputType: ClapOutputType.TEXT,
       status: "to_generate",
     }))
     clap.segments.push(newSegment({
       track: 3,
@@ -195,21 +199,52 @@ Output: `
     currentElapsedTimeInMs += defaultSegmentDurationInMs
   }
-  // one more thing: music!
   let musicPrompts: string[] = []
   try {
     musicPrompts = await generateMusicPrompts({
       prompt,
-      latentStory: await clapToLatentStory(clap)
     })
     const musicPrompt = musicPrompts.at(0)
     if (!musicPrompt) { throw new Error(`not enough music prompts`) }
     // console.log("musicPrompt:", musicPrompt)
     clap.segments.push(newSegment({
-      track: 5,
       startTimeInMs: 0,
       endTimeInMs: currentElapsedTimeInMs,
       assetDurationInMs: currentElapsedTimeInMs,

 import { generateMusicPrompts } from "../edit/music/generateMusicPrompt"
 import { clapToLatentStory } from "../edit/entities/clapToLatentStory"
 import { generateRandomStory } from "./generateRandomStory"
+import { generateSoundPrompts } from "../edit/sounds/generateSoundPrompt"
+import { checkCaptions } from "./checkCaptions"
 // a helper to generate Clap stories from a few sentences
 // this is mostly used by external apps such as the Stories Factory
 }): Promise<ClapProject> {
   // we limit to 512 characters
+  let { prompt, hasCaptions } = checkCaptions(`${request?.prompt || ""}`.trim().slice(0, 512))
+  // console.log("api/v1/create(): request:", request)
   if (!prompt.length) {
     // throw new Error(`please provide a prompt`)
   console.log(`api/v1/create(): generated ${shots.length} shots`)
   // this is approximate - TTS generation will determine the final duration of each shot
+  const defaultSegmentDurationInMs = 3000
   let currentElapsedTimeInMs = 0
   for (const { comment, image, voice } of shots) {
+    // console.log(`api/v1/create():  - ${comment}`)
     // note: it would be nice if we could have a convention saying that
     // track 0 is for videos and track 1 storyboards
       status: "to_generate",
     }))
+    if (hasCaptions) {
     clap.segments.push(newSegment({
       track: 2,
       startTimeInMs: currentElapsedTimeInMs,
       outputType: ClapOutputType.TEXT,
       status: "to_generate",
     }))
+  }
     clap.segments.push(newSegment({
       track: 3,
     currentElapsedTimeInMs += defaultSegmentDurationInMs
   }
+  const latentStory = await clapToLatentStory(clap)
+  let soundPrompts: string[] = []
+  try {
+    soundPrompts = await generateSoundPrompts({
+      prompt,
+      latentStory,
+      turbo,
+    })
+    const soundPrompt = soundPrompts.at(0)
+    if (!soundPrompt) { throw new Error(`not enough sound prompts`) }
+    // console.log("musicPrompt:", musicPrompt)
+    clap.segments.push(newSegment({
+      track: 5,
+      startTimeInMs: 0,
+      endTimeInMs: currentElapsedTimeInMs,
+      assetDurationInMs: currentElapsedTimeInMs,
+      category: ClapSegmentCategory.SOUND,
+      prompt: soundPrompt,
+      outputType: ClapOutputType.AUDIO,
+      status: "to_generate",
+    }))
+  } catch (err) {
+    console.error(`[api/v1/create] failed to generate sound prompts`)
+    // soundPrompts.push("lofi hiphop loop")
+  }
   let musicPrompts: string[] = []
   try {
     musicPrompts = await generateMusicPrompts({
       prompt,
+      latentStory,
+      turbo,
     })
     const musicPrompt = musicPrompts.at(0)
     if (!musicPrompt) { throw new Error(`not enough music prompts`) }
     // console.log("musicPrompt:", musicPrompt)
     clap.segments.push(newSegment({
+      track: 6,
       startTimeInMs: 0,
       endTimeInMs: currentElapsedTimeInMs,
       assetDurationInMs: currentElapsedTimeInMs,

src/app/api/v1/edit/entities/clapToLatentStory.ts CHANGED Viewed

@@ -23,6 +23,8 @@ export async function clapToLatentStory(clap: ClapProject): Promise<LatentStory[
       ClapSegmentCategory.STORYBOARD
     ).at(0)
     const comment = filterSegments(
       ClapSegmentFilteringMode.START,
       shot,
@@ -38,9 +40,9 @@ export async function clapToLatentStory(clap: ClapProject): Promise<LatentStory[
     ).at(0)
     const latentStory: LatentStory = {
-      comment: comment.prompt,
-      image: image.prompt,
-      voice: voice.prompt,
     }
     latentStories.push(latentStory)

       ClapSegmentCategory.STORYBOARD
     ).at(0)
+    // note: the comment might be missing, that's on purpose
+    // this can happen if the user asked for no captions or no commentary
     const comment = filterSegments(
       ClapSegmentFilteringMode.START,
       shot,
     ).at(0)
     const latentStory: LatentStory = {
+      comment: comment?.prompt || "",
+      image: image?.prompt || "",
+      voice: voice?.prompt || "",
     }
     latentStories.push(latentStory)

src/app/api/v1/edit/music/generateMusic.ts CHANGED Viewed

@@ -48,7 +48,16 @@ export async function generateMusic({
   }
-  const durationInSec = 14 // musicSegment.assetDurationInMs / 1000
   console.log(`generateMusic(): generating a music with:\n  duration: ${durationInSec} sec\n  prompt: ${prompt}`)

   }
+  // unconfirmed, I think some durations might make musicgen crash
+  // File "/home/user/app/audiocraft/modules/transformer.py", line 394, in forward
+  //   k, v = self._complete_kv(k, v)
+  // File "/home/user/app/audiocraft/modules/transformer.py", line 286, in _complete_kv
+  // assert nk.shape[time_dim] == nv.shape[time_dim]
+  //
+  // it is also possible that it was because I tried to generate on the prod,
+  // while users were already using the musicgen cluster
+  const durationInSec = 12 // musicSegment.assetDurationInMs / 1000
   console.log(`generateMusic(): generating a music with:\n  duration: ${durationInSec} sec\n  prompt: ${prompt}`)

src/app/api/v1/edit/music/generateMusicPrompt.ts CHANGED Viewed

@@ -21,7 +21,7 @@ export async function generateMusicPrompts({
 }): Promise<string[]> {
   if (!prompt.length) { throw new Error(`please provide a prompt`) }
-  console.log("generateMusicPrompts(): prompt:", prompt)
   if (!latentStory.length) { throw new Error(`please provide a story`) }
@@ -45,7 +45,7 @@ ${YAML.stringify(
   const prefix = "\""
   // we don't need a lot here!
-  const nbMaxNewTokens = 120
   // TODO use streaming for the Hugging Face prediction
   //

 }): Promise<string[]> {
   if (!prompt.length) { throw new Error(`please provide a prompt`) }
+  // console.log("generateMusicPrompts(): prompt:", prompt)
   if (!latentStory.length) { throw new Error(`please provide a story`) }
   const prefix = "\""
   // we don't need a lot here!
+  const nbMaxNewTokens = 80
   // TODO use streaming for the Hugging Face prediction
   //

src/app/api/v1/edit/sounds/generateSoundPrompt.ts CHANGED Viewed

@@ -21,7 +21,7 @@ export async function generateSoundPrompts({
 }): Promise<string[]> {
   if (!prompt.length) { throw new Error(`please provide a prompt`) }
-  console.log("generateSoundPrompts(): prompt:", prompt)
   if (!latentStory.length) { throw new Error(`please provide a story`) }
@@ -53,7 +53,7 @@ ${YAML.stringify(
   const prefix = "\""
   // we don't need a lot here!
-  const nbMaxNewTokens = 120
   // TODO use streaming for the Hugging Face prediction
   //

 }): Promise<string[]> {
   if (!prompt.length) { throw new Error(`please provide a prompt`) }
+  // console.log("generateSoundPrompts(): prompt:", prompt)
   if (!latentStory.length) { throw new Error(`please provide a story`) }
   const prefix = "\""
   // we don't need a lot here!
+  const nbMaxNewTokens = 80
   // TODO use streaming for the Hugging Face prediction
   //

src/app/api/v1/render/animatediff-lcm-svd/cluster.ts CHANGED Viewed

@@ -2,7 +2,9 @@ import { sleep } from "@/lib/utils/sleep"
 import { ClusterMachine } from "../../types"
 export const nbClusterMachines = 8
 // make sure the machines are running!!
@@ -17,18 +19,28 @@ export const nbClusterMachines = 8
 // we maintain a global cluster state
-export const clusterMachines: ClusterMachine[] = []
 for (let i = 0; i < nbClusterMachines; i++) {
   clusterMachines.push({
     id: i,
     url: `https://jbilcke-hf-ai-tube-model-als-${i + 1}.hf.space`,
-    // careful when trying this one (check number of Gradio parameters, fps etc):
-    // url: `https://jbilcke-hf-ai-tube-model-als-experimental.hf.space`,
     busy: false
   })
 }
 export async function getClusterMachine(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
   let clusterMachine: ClusterMachine | undefined = undefined
   let timeSpentWaitingInMs = 0

 import { ClusterMachine } from "../../types"
+// 8 allows us to support about 1 request per minute
+// we will still need to add a Hugging Face login wall
+// to further limit the number of requests people make
 export const nbClusterMachines = 8
 // make sure the machines are running!!
 // we maintain a global cluster state
+export const clusterMachines: ClusterMachine[] = [
+    // careful when trying this one (check number of Gradio parameters, fps etc):
+    // url: `https://jbilcke-hf-ai-tube-model-als-experimental.hf.space`,
+  // { id: 0, url: `https://jbilcke-hf-ai-tube-model-als-1.hf.space`, busy: false },
+  // { id: 1, url: `https://jbilcke-hf-ai-tube-model-als-2.hf.space`, busy: false },
+  // { id: 2, url: `https://jbilcke-hf-ai-tube-model-als-3.hf.space`, busy: false },
+  // { id: 3, url: `https://jbilcke-hf-ai-tube-model-als-4.hf.space`, busy: false },
+  // { id: 4, url: `https://jbilcke-hf-ai-tube-model-als-5.hf.space`, busy: false },
+  // { id: 5, url: `https://jbilcke-hf-ai-tube-model-als-6.hf.space`, busy: false },
+]
 for (let i = 0; i < nbClusterMachines; i++) {
   clusterMachines.push({
     id: i,
     url: `https://jbilcke-hf-ai-tube-model-als-${i + 1}.hf.space`,
     busy: false
   })
 }
 export async function getClusterMachine(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
   let clusterMachine: ClusterMachine | undefined = undefined
   let timeSpentWaitingInMs = 0

src/app/api/v1/render/animatediff-lcm-svd/index.ts CHANGED Viewed

@@ -48,6 +48,9 @@ export async function render(request: {
   const durationInSec = Math.round(nbFrames / nbFPS)
   const framesPerSec = nbFPS
   // vital step: image size must match the output video size
   const resizedImageBase64 = await resizeImage({
     input: imageInputBase64,
@@ -75,6 +78,7 @@ export async function render(request: {
       })
     }
     const res = await fetch(machine.url + (machine.url.endsWith("/") ? "" : "/") + "api/predict", {
       method: "POST",
       headers: {
@@ -88,7 +92,7 @@ export async function render(request: {
           resizedImageBase64,
           0, // seed,
           true,
-          33, // motion_bucket_id,
           // attention: we are experimenting with ffmpeg to change the speed,
           // on the server "als-2"
@@ -103,7 +107,7 @@ export async function render(request: {
           1.0, // min_guidance_scale,
           width,
           height,
-          nbSteps,
         ],
       }),

   const durationInSec = Math.round(nbFrames / nbFPS)
   const framesPerSec = nbFPS
+  // I never know how to pick this
+  const motionBucketId = 30
   // vital step: image size must match the output video size
   const resizedImageBase64 = await resizeImage({
     input: imageInputBase64,
       })
     }
     const res = await fetch(machine.url + (machine.url.endsWith("/") ? "" : "/") + "api/predict", {
       method: "POST",
       headers: {
           resizedImageBase64,
           0, // seed,
           true,
+          motionBucketId, // motion_bucket_id,
           // attention: we are experimenting with ffmpeg to change the speed,
           // on the server "als-2"
           1.0, // min_guidance_scale,
           width,
           height,
+          4, // I don't see a lot of diff between 4 and 6, // nbSteps,
         ],
       }),