jbilcke-hf (HF staff) committed
Commit 1e78ba0 · 1 Parent(s): 3c87951
src/app/api/v1/create/checkCaptions.ts ADDED
@@ -0,0 +1,14 @@
+export function checkCaptions(input: string): {
+  prompt: string
+  hasCaptions: boolean
+} {
+
+  const prompt = input.replaceAll(/,? ?(?:no|without|skip|hide|empty|remove|delete) (?:(?:the|any|all) )?(?:comment|caption|commentary|sub|subtitle|title|subtext|commentarie)s?(?: (?:pls|plz|please|thanks?))?/gi, "")
+
+  return {
+    prompt,
+
+    // the rule is that we have captions, unless the input contained something like "no captions"
+    hasCaptions: prompt === input
+  }
+}
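For context, a minimal usage sketch (not part of the commit) showing the intended behavior of checkCaptions; the example prompts are made up:

// sketch only: expected behavior of the new helper, given the regex above
import { checkCaptions } from "./checkCaptions"

const a = checkCaptions("a cat chasing a laser pointer")
// a.prompt === "a cat chasing a laser pointer"
// a.hasCaptions === true (nothing was stripped)

const b = checkCaptions("a cat chasing a laser pointer, no captions please")
// the ", no captions please" part matches the regex and is stripped,
// so b.prompt differs from the input and b.hasCaptions === false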
src/app/api/v1/create/index.ts CHANGED
@@ -11,6 +11,8 @@ import { systemPromptCompleteStory } from "./systemPromptCompleteStory"
 import { generateMusicPrompts } from "../edit/music/generateMusicPrompt"
 import { clapToLatentStory } from "../edit/entities/clapToLatentStory"
 import { generateRandomStory } from "./generateRandomStory"
+import { generateSoundPrompts } from "../edit/sounds/generateSoundPrompt"
+import { checkCaptions } from "./checkCaptions"
 
 // a helper to generate Clap stories from a few sentences
 // this is mostly used by external apps such as the Stories Factory
@@ -27,9 +29,9 @@ export async function create(request: {
 }): Promise<ClapProject> {
 
   // we limit to 512 characters
-  let prompt = `${request?.prompt || ""}`.trim().slice(0, 512)
+  let { prompt, hasCaptions } = checkCaptions(`${request?.prompt || ""}`.trim().slice(0, 512))
 
-  console.log("api/v1/create(): request:", request)
+  // console.log("api/v1/create(): request:", request)
 
   if (!prompt.length) {
     // throw new Error(`please provide a prompt`)
@@ -96,7 +98,7 @@ Output: `
   console.log(`api/v1/create(): generated ${shots.length} shots`)
 
   // this is approximate - TTS generation will determine the final duration of each shot
-  const defaultSegmentDurationInMs = 7000
+  const defaultSegmentDurationInMs = 3000
 
   let currentElapsedTimeInMs = 0
 
@@ -121,7 +123,7 @@ Output: `
 
   for (const { comment, image, voice } of shots) {
 
-    console.log(`api/v1/create(): - ${comment}`)
+    // console.log(`api/v1/create(): - ${comment}`)
 
     // note: it would be nice if we could have a convention saying that
     // track 0 is for videos and track 1 is for storyboards
@@ -156,6 +158,7 @@ Output: `
       status: "to_generate",
     }))
 
+    if (hasCaptions) {
     clap.segments.push(newSegment({
       track: 2,
       startTimeInMs: currentElapsedTimeInMs,
@@ -168,6 +171,7 @@ Output: `
       outputType: ClapOutputType.TEXT,
       status: "to_generate",
     }))
+    }
 
     clap.segments.push(newSegment({
       track: 3,
@@ -195,21 +199,52 @@ Output: `
     currentElapsedTimeInMs += defaultSegmentDurationInMs
   }
 
-  // one more thing: music!
+  const latentStory = await clapToLatentStory(clap)
+
+  let soundPrompts: string[] = []
+
+  try {
+    soundPrompts = await generateSoundPrompts({
+      prompt,
+      latentStory,
+      turbo,
+    })
+    const soundPrompt = soundPrompts.at(0)
+    if (!soundPrompt) { throw new Error(`not enough sound prompts`) }
+
+    // console.log("soundPrompt:", soundPrompt)
+
+    clap.segments.push(newSegment({
+      track: 5,
+      startTimeInMs: 0,
+      endTimeInMs: currentElapsedTimeInMs,
+      assetDurationInMs: currentElapsedTimeInMs,
+      category: ClapSegmentCategory.SOUND,
+      prompt: soundPrompt,
+      outputType: ClapOutputType.AUDIO,
+      status: "to_generate",
+    }))
+  } catch (err) {
+    console.error(`[api/v1/create] failed to generate sound prompts`)
+    // soundPrompts.push("lofi hiphop loop")
+  }
+
+
   let musicPrompts: string[] = []
-
+
   try {
     musicPrompts = await generateMusicPrompts({
       prompt,
-      latentStory: await clapToLatentStory(clap)
+      latentStory,
+      turbo,
     })
     const musicPrompt = musicPrompts.at(0)
     if (!musicPrompt) { throw new Error(`not enough music prompts`) }
-
+
     // console.log("musicPrompt:", musicPrompt)
 
     clap.segments.push(newSegment({
-      track: 5,
+      track: 6,
      startTimeInMs: 0,
      endTimeInMs: currentElapsedTimeInMs,
      assetDurationInMs: currentElapsedTimeInMs,
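The sound and music blocks above follow the same shape: generate prompts from the latent story, keep the first one, and push a full-length audio segment. A possible refactor is sketched below with a hypothetical helper name (addFullLengthAudioTrack); it is not part of the commit and assumes the same imports already used by src/app/api/v1/create/index.ts (ClapProject, ClapSegmentCategory, ClapOutputType, newSegment).

// hypothetical helper, not in the commit: shared logic for the track 5 (sound)
// and track 6 (music) segments added above
async function addFullLengthAudioTrack({ clap, track, category, endTimeInMs, generate }: {
  clap: ClapProject
  track: number
  category: ClapSegmentCategory
  endTimeInMs: number
  generate: () => Promise<string[]> // e.g. () => generateSoundPrompts({ prompt, latentStory, turbo })
}): Promise<void> {
  try {
    const prompts = await generate()
    const audioPrompt = prompts.at(0)
    if (!audioPrompt) { throw new Error(`not enough prompts`) }

    clap.segments.push(newSegment({
      track,
      startTimeInMs: 0,
      endTimeInMs,
      assetDurationInMs: endTimeInMs,
      category,
      prompt: audioPrompt,
      outputType: ClapOutputType.AUDIO,
      status: "to_generate",
    }))
  } catch (err) {
    // keep the story usable even if this audio track could not be planned
    console.error(`[api/v1/create] failed to generate prompts for track ${track}`)
  }
}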
src/app/api/v1/edit/entities/clapToLatentStory.ts CHANGED
@@ -23,6 +23,8 @@ export async function clapToLatentStory(clap: ClapProject): Promise<LatentStory[
     ClapSegmentCategory.STORYBOARD
   ).at(0)
 
+  // note: the comment might be missing, that's on purpose
+  // this can happen if the user asked for no captions or no commentary
   const comment = filterSegments(
     ClapSegmentFilteringMode.START,
     shot,
@@ -38,9 +40,9 @@ export async function clapToLatentStory(clap: ClapProject): Promise<LatentStory[
   ).at(0)
 
   const latentStory: LatentStory = {
-    comment: comment.prompt,
-    image: image.prompt,
-    voice: voice.prompt,
+    comment: comment?.prompt || "",
+    image: image?.prompt || "",
+    voice: voice?.prompt || "",
   }
 
   latentStories.push(latentStory)
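A small sketch (not part of the commit) of why the optional chaining matters: with captions disabled upstream, no caption segment matches the shot, so comment is undefined and the fallback keeps the story object well-formed. The example values are made up.

// sketch only: `comment` can be undefined when the user asked for no captions
const comment: { prompt: string } | undefined = undefined

const latentStory = {
  comment: comment?.prompt || "", // "" instead of a TypeError on `comment.prompt`
  image: "wide shot of a castle at dawn", // made-up example values
  voice: "Once upon a time...",
}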
src/app/api/v1/edit/music/generateMusic.ts CHANGED
@@ -48,7 +48,16 @@ export async function generateMusic({
   }
 
 
-  const durationInSec = 14 // musicSegment.assetDurationInMs / 1000
+  // unconfirmed, but I think some durations might make musicgen crash:
+  //   File "/home/user/app/audiocraft/modules/transformer.py", line 394, in forward
+  //     k, v = self._complete_kv(k, v)
+  //   File "/home/user/app/audiocraft/modules/transformer.py", line 286, in _complete_kv
+  //     assert nk.shape[time_dim] == nv.shape[time_dim]
+  //
+  // it is also possible that it happened because I tried to generate on prod
+  // while users were already using the musicgen cluster
+
+  const durationInSec = 12 // musicSegment.assetDurationInMs / 1000
 
   console.log(`generateMusic(): generating a music with:\n duration: ${durationInSec} sec\n prompt: ${prompt}`)
 
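Since the crash above is unconfirmed, another option would be to clamp the requested duration rather than hard-coding it; a hypothetical sketch, not part of the commit (the 12 s ceiling matches the value used above, the 4 s floor is an assumption):

// hypothetical: derive the duration from the segment, but keep it in a range
// that has worked so far with musicgen
function clampMusicDurationInSec(assetDurationInMs: number, minSec = 4, maxSec = 12): number {
  const requestedSec = Math.round(assetDurationInMs / 1000)
  return Math.min(maxSec, Math.max(minSec, requestedSec))
}

// usage: const durationInSec = clampMusicDurationInSec(musicSegment.assetDurationInMs)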
 
src/app/api/v1/edit/music/generateMusicPrompt.ts CHANGED
@@ -21,7 +21,7 @@ export async function generateMusicPrompts({
 }): Promise<string[]> {
 
   if (!prompt.length) { throw new Error(`please provide a prompt`) }
-  console.log("generateMusicPrompts(): prompt:", prompt)
+  // console.log("generateMusicPrompts(): prompt:", prompt)
 
 
   if (!latentStory.length) { throw new Error(`please provide a story`) }
@@ -45,7 +45,7 @@ ${YAML.stringify(
   const prefix = "\""
 
   // we don't need a lot here!
-  const nbMaxNewTokens = 120
+  const nbMaxNewTokens = 80
 
   // TODO use streaming for the Hugging Face prediction
   //
src/app/api/v1/edit/sounds/generateSoundPrompt.ts CHANGED
@@ -21,7 +21,7 @@ export async function generateSoundPrompts({
 }): Promise<string[]> {
 
   if (!prompt.length) { throw new Error(`please provide a prompt`) }
-  console.log("generateSoundPrompts(): prompt:", prompt)
+  // console.log("generateSoundPrompts(): prompt:", prompt)
 
 
   if (!latentStory.length) { throw new Error(`please provide a story`) }
@@ -53,7 +53,7 @@ ${YAML.stringify(
   const prefix = "\""
 
   // we don't need a lot here!
-  const nbMaxNewTokens = 120
+  const nbMaxNewTokens = 80
 
   // TODO use streaming for the Hugging Face prediction
   //
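For reference, a rough sanity check (not part of the commit) that 80 new tokens is still plenty for the one-line prompts expected by generateMusicPrompts and generateSoundPrompts; the ~4 characters per token figure is a common rule of thumb, not a measurement:

// rough heuristic: English text averages around 4 characters per token
const approxTokens = (text: string): number => Math.ceil(text.length / 4)

console.log(approxTokens("dark ambient drone with distant thunder and rain on metal"))
// ≈ 15 tokens, well under the nbMaxNewTokens = 80 budget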
src/app/api/v1/render/animatediff-lcm-svd/cluster.ts CHANGED
@@ -2,7 +2,9 @@ import { sleep } from "@/lib/utils/sleep"
 import { ClusterMachine } from "../../types"
 
 
-
+// 8 machines allow us to support about 1 request per minute
+// we will still need to add a Hugging Face login wall
+// to further limit the number of requests people can make
 export const nbClusterMachines = 8
 // make sure the machines are running!!
 
@@ -17,18 +19,28 @@ export const nbClusterMachines = 8
 
 // we maintain a global cluster state
 
-export const clusterMachines: ClusterMachine[] = []
+export const clusterMachines: ClusterMachine[] = [
+
+  // careful when trying this one (check number of Gradio parameters, fps etc):
+  // url: `https://jbilcke-hf-ai-tube-model-als-experimental.hf.space`,
+
+  // { id: 0, url: `https://jbilcke-hf-ai-tube-model-als-1.hf.space`, busy: false },
+  // { id: 1, url: `https://jbilcke-hf-ai-tube-model-als-2.hf.space`, busy: false },
+  // { id: 2, url: `https://jbilcke-hf-ai-tube-model-als-3.hf.space`, busy: false },
+  // { id: 3, url: `https://jbilcke-hf-ai-tube-model-als-4.hf.space`, busy: false },
+  // { id: 4, url: `https://jbilcke-hf-ai-tube-model-als-5.hf.space`, busy: false },
+  // { id: 5, url: `https://jbilcke-hf-ai-tube-model-als-6.hf.space`, busy: false },
+]
+
 for (let i = 0; i < nbClusterMachines; i++) {
   clusterMachines.push({
     id: i,
     url: `https://jbilcke-hf-ai-tube-model-als-${i + 1}.hf.space`,
-
-    // careful when trying this one (check number of Gradio parameters, fps etc):
-    // url: `https://jbilcke-hf-ai-tube-model-als-experimental.hf.space`,
     busy: false
   })
 }
 
+
 export async function getClusterMachine(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
   let clusterMachine: ClusterMachine | undefined = undefined
   let timeSpentWaitingInMs = 0
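The body of getClusterMachine is truncated in this diff; for context, here is a plausible shape of the waiting loop, sketched only from the visible declarations (clusterMachine, timeSpentWaitingInMs, maxWaitTimeInMs, sleep, clusterMachines). The real implementation in cluster.ts may differ.

// sketch only: a polling loop consistent with the declarations above
export async function getClusterMachineSketch(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
  let clusterMachine: ClusterMachine | undefined = undefined
  let timeSpentWaitingInMs = 0
  const pollingIntervalInMs = 500 // assumption

  while (!clusterMachine && timeSpentWaitingInMs < maxWaitTimeInMs) {
    clusterMachine = clusterMachines.find(machine => !machine.busy)
    if (!clusterMachine) {
      await sleep(pollingIntervalInMs)
      timeSpentWaitingInMs += pollingIntervalInMs
    }
  }

  if (!clusterMachine) {
    throw new Error(`no cluster machine became available within ${maxWaitTimeInMs} ms`)
  }

  // the caller is expected to mark the machine as free again once the render is done
  clusterMachine.busy = true
  return clusterMachine
}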
src/app/api/v1/render/animatediff-lcm-svd/index.ts CHANGED
@@ -48,6 +48,9 @@ export async function render(request: {
   const durationInSec = Math.round(nbFrames / nbFPS)
   const framesPerSec = nbFPS
 
+  // I never know how to pick this value
+  const motionBucketId = 30
+
   // vital step: image size must match the output video size
   const resizedImageBase64 = await resizeImage({
     input: imageInputBase64,
@@ -75,6 +78,7 @@ export async function render(request: {
     })
   }
 
+
   const res = await fetch(machine.url + (machine.url.endsWith("/") ? "" : "/") + "api/predict", {
     method: "POST",
     headers: {
@@ -88,7 +92,7 @@ export async function render(request: {
         resizedImageBase64,
         0, // seed,
         true,
-        33, // motion_bucket_id,
+        motionBucketId, // motion_bucket_id,
 
         // attention: we are experimenting with ffmpeg to change the speed,
         // on the server "als-2"
@@ -103,7 +107,7 @@ export async function render(request: {
         1.0, // min_guidance_scale,
         width,
         height,
-        nbSteps,
+        4, // I don't see much difference between 4 and 6 // nbSteps,
       ],
     }),
 
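Since the right motion_bucket_id is admittedly guesswork, one low-risk follow-up would be to make it tunable through an environment variable, keeping 30 as the default; a hypothetical sketch, not part of the commit (the variable name is made up):

// hypothetical: allow overriding the motion bucket id without a redeploy
// (AI_TUBE_SVD_MOTION_BUCKET_ID is an assumed, made-up variable name)
const motionBucketId = Number(process.env.AI_TUBE_SVD_MOTION_BUCKET_ID) > 0
  ? Number(process.env.AI_TUBE_SVD_MOTION_BUCKET_ID)
  : 30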