Spaces:
Running
Running
Commit
·
1e78ba0
1
Parent(s):
3c87951
update
Browse files- src/app/api/v1/create/checkCaptions.ts +14 -0
- src/app/api/v1/create/index.ts +44 -9
- src/app/api/v1/edit/entities/clapToLatentStory.ts +5 -3
- src/app/api/v1/edit/music/generateMusic.ts +10 -1
- src/app/api/v1/edit/music/generateMusicPrompt.ts +2 -2
- src/app/api/v1/edit/sounds/generateSoundPrompt.ts +2 -2
- src/app/api/v1/render/animatediff-lcm-svd/cluster.ts +17 -5
- src/app/api/v1/render/animatediff-lcm-svd/index.ts +6 -2
src/app/api/v1/create/checkCaptions.ts
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/**
 * Detects and strips "no captions"-style instructions from a user prompt.
 *
 * The instruction has the shape
 * `<verb> [the|any|all] <noun>[s] [pls|plz|please|thank(s)]`, optionally
 * preceded by a comma and/or a space, e.g. ", no captions please" or
 * "without any subtitles". Matching is case-insensitive.
 *
 * Word boundaries are enforced around the instruction so that ordinary words
 * which merely contain one of the keywords are left untouched (e.g. the "no"
 * in "volcano title", the "sub" in "no submarines", or the "thanks" in
 * "no captions thanksgiving").
 *
 * @param input - the raw user prompt
 * @returns `prompt`: the input with every such instruction removed;
 *          `hasCaptions`: `false` when at least one instruction was removed,
 *          `true` otherwise
 */
export function checkCaptions(input: string): {
  prompt: string
  hasCaptions: boolean
} {
  // `\b` before the verb, after the noun and after the politeness word keeps
  // the pattern from matching in the middle of a longer word
  const noCaptionsInstruction = /,? ?\b(?:no|without|skip|hide|empty|remove|delete) (?:(?:the|any|all) )?(?:comment|caption|commentary|sub|subtitle|title|subtext|commentarie)s?\b(?: (?:pls|plz|please|thanks?)\b)?/gi

  // the regex is global, so `replace` removes every occurrence
  const prompt = input.replace(noCaptionsInstruction, "")

  return {
    prompt,

    // the rule is that we have captions, unless we did have text like "no captions"
    hasCaptions: prompt === input
  }
}
|
src/app/api/v1/create/index.ts
CHANGED
@@ -11,6 +11,8 @@ import { systemPromptCompleteStory } from "./systemPromptCompleteStory"
|
|
11 |
import { generateMusicPrompts } from "../edit/music/generateMusicPrompt"
|
12 |
import { clapToLatentStory } from "../edit/entities/clapToLatentStory"
|
13 |
import { generateRandomStory } from "./generateRandomStory"
|
|
|
|
|
14 |
|
15 |
// a helper to generate Clap stories from a few sentences
|
16 |
// this is mostly used by external apps such as the Stories Factory
|
@@ -27,9 +29,9 @@ export async function create(request: {
|
|
27 |
}): Promise<ClapProject> {
|
28 |
|
29 |
// we limit to 512 characters
|
30 |
-
let prompt = `${request?.prompt || ""}`.trim().slice(0, 512)
|
31 |
|
32 |
-
console.log("api/v1/create(): request:", request)
|
33 |
|
34 |
if (!prompt.length) {
|
35 |
// throw new Error(`please provide a prompt`)
|
@@ -96,7 +98,7 @@ Output: `
|
|
96 |
console.log(`api/v1/create(): generated ${shots.length} shots`)
|
97 |
|
98 |
// this is approximate - TTS generation will determine the final duration of each shot
|
99 |
-
const defaultSegmentDurationInMs =
|
100 |
|
101 |
let currentElapsedTimeInMs = 0
|
102 |
|
@@ -121,7 +123,7 @@ Output: `
|
|
121 |
|
122 |
for (const { comment, image, voice } of shots) {
|
123 |
|
124 |
-
console.log(`api/v1/create(): - ${comment}`)
|
125 |
|
126 |
// note: it would be nice if we could have a convention saying that
|
127 |
// track 0 is for videos and track 1 storyboards
|
@@ -156,6 +158,7 @@ Output: `
|
|
156 |
status: "to_generate",
|
157 |
}))
|
158 |
|
|
|
159 |
clap.segments.push(newSegment({
|
160 |
track: 2,
|
161 |
startTimeInMs: currentElapsedTimeInMs,
|
@@ -168,6 +171,7 @@ Output: `
|
|
168 |
outputType: ClapOutputType.TEXT,
|
169 |
status: "to_generate",
|
170 |
}))
|
|
|
171 |
|
172 |
clap.segments.push(newSegment({
|
173 |
track: 3,
|
@@ -195,21 +199,52 @@ Output: `
|
|
195 |
currentElapsedTimeInMs += defaultSegmentDurationInMs
|
196 |
}
|
197 |
|
198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
let musicPrompts: string[] = []
|
200 |
-
|
201 |
try {
|
202 |
musicPrompts = await generateMusicPrompts({
|
203 |
prompt,
|
204 |
-
latentStory
|
|
|
205 |
})
|
206 |
const musicPrompt = musicPrompts.at(0)
|
207 |
if (!musicPrompt) { throw new Error(`not enough music prompts`) }
|
208 |
-
|
209 |
// console.log("musicPrompt:", musicPrompt)
|
210 |
|
211 |
clap.segments.push(newSegment({
|
212 |
-
track:
|
213 |
startTimeInMs: 0,
|
214 |
endTimeInMs: currentElapsedTimeInMs,
|
215 |
assetDurationInMs: currentElapsedTimeInMs,
|
|
|
11 |
import { generateMusicPrompts } from "../edit/music/generateMusicPrompt"
|
12 |
import { clapToLatentStory } from "../edit/entities/clapToLatentStory"
|
13 |
import { generateRandomStory } from "./generateRandomStory"
|
14 |
+
import { generateSoundPrompts } from "../edit/sounds/generateSoundPrompt"
|
15 |
+
import { checkCaptions } from "./checkCaptions"
|
16 |
|
17 |
// a helper to generate Clap stories from a few sentences
|
18 |
// this is mostly used by external apps such as the Stories Factory
|
|
|
29 |
}): Promise<ClapProject> {
|
30 |
|
31 |
// we limit to 512 characters
|
32 |
+
let { prompt, hasCaptions } = checkCaptions(`${request?.prompt || ""}`.trim().slice(0, 512))
|
33 |
|
34 |
+
// console.log("api/v1/create(): request:", request)
|
35 |
|
36 |
if (!prompt.length) {
|
37 |
// throw new Error(`please provide a prompt`)
|
|
|
98 |
console.log(`api/v1/create(): generated ${shots.length} shots`)
|
99 |
|
100 |
// this is approximate - TTS generation will determine the final duration of each shot
|
101 |
+
const defaultSegmentDurationInMs = 3000
|
102 |
|
103 |
let currentElapsedTimeInMs = 0
|
104 |
|
|
|
123 |
|
124 |
for (const { comment, image, voice } of shots) {
|
125 |
|
126 |
+
// console.log(`api/v1/create(): - ${comment}`)
|
127 |
|
128 |
// note: it would be nice if we could have a convention saying that
|
129 |
// track 0 is for videos and track 1 storyboards
|
|
|
158 |
status: "to_generate",
|
159 |
}))
|
160 |
|
161 |
+
if (hasCaptions) {
|
162 |
clap.segments.push(newSegment({
|
163 |
track: 2,
|
164 |
startTimeInMs: currentElapsedTimeInMs,
|
|
|
171 |
outputType: ClapOutputType.TEXT,
|
172 |
status: "to_generate",
|
173 |
}))
|
174 |
+
}
|
175 |
|
176 |
clap.segments.push(newSegment({
|
177 |
track: 3,
|
|
|
199 |
currentElapsedTimeInMs += defaultSegmentDurationInMs
|
200 |
}
|
201 |
|
202 |
+
const latentStory = await clapToLatentStory(clap)
|
203 |
+
|
204 |
+
let soundPrompts: string[] = []
|
205 |
+
|
206 |
+
try {
|
207 |
+
soundPrompts = await generateSoundPrompts({
|
208 |
+
prompt,
|
209 |
+
latentStory,
|
210 |
+
turbo,
|
211 |
+
})
|
212 |
+
const soundPrompt = soundPrompts.at(0)
|
213 |
+
if (!soundPrompt) { throw new Error(`not enough sound prompts`) }
|
214 |
+
|
215 |
+
// console.log("musicPrompt:", musicPrompt)
|
216 |
+
|
217 |
+
clap.segments.push(newSegment({
|
218 |
+
track: 5,
|
219 |
+
startTimeInMs: 0,
|
220 |
+
endTimeInMs: currentElapsedTimeInMs,
|
221 |
+
assetDurationInMs: currentElapsedTimeInMs,
|
222 |
+
category: ClapSegmentCategory.SOUND,
|
223 |
+
prompt: soundPrompt,
|
224 |
+
outputType: ClapOutputType.AUDIO,
|
225 |
+
status: "to_generate",
|
226 |
+
}))
|
227 |
+
} catch (err) {
|
228 |
+
console.error(`[api/v1/create] failed to generate sound prompts`)
|
229 |
+
// soundPrompts.push("lofi hiphop loop")
|
230 |
+
}
|
231 |
+
|
232 |
+
|
233 |
let musicPrompts: string[] = []
|
234 |
+
|
235 |
try {
|
236 |
musicPrompts = await generateMusicPrompts({
|
237 |
prompt,
|
238 |
+
latentStory,
|
239 |
+
turbo,
|
240 |
})
|
241 |
const musicPrompt = musicPrompts.at(0)
|
242 |
if (!musicPrompt) { throw new Error(`not enough music prompts`) }
|
243 |
+
|
244 |
// console.log("musicPrompt:", musicPrompt)
|
245 |
|
246 |
clap.segments.push(newSegment({
|
247 |
+
track: 6,
|
248 |
startTimeInMs: 0,
|
249 |
endTimeInMs: currentElapsedTimeInMs,
|
250 |
assetDurationInMs: currentElapsedTimeInMs,
|
src/app/api/v1/edit/entities/clapToLatentStory.ts
CHANGED
@@ -23,6 +23,8 @@ export async function clapToLatentStory(clap: ClapProject): Promise<LatentStory[
|
|
23 |
ClapSegmentCategory.STORYBOARD
|
24 |
).at(0)
|
25 |
|
|
|
|
|
26 |
const comment = filterSegments(
|
27 |
ClapSegmentFilteringMode.START,
|
28 |
shot,
|
@@ -38,9 +40,9 @@ export async function clapToLatentStory(clap: ClapProject): Promise<LatentStory[
|
|
38 |
).at(0)
|
39 |
|
40 |
const latentStory: LatentStory = {
|
41 |
-
comment: comment
|
42 |
-
image: image
|
43 |
-
voice: voice
|
44 |
}
|
45 |
|
46 |
latentStories.push(latentStory)
|
|
|
23 |
ClapSegmentCategory.STORYBOARD
|
24 |
).at(0)
|
25 |
|
26 |
+
// note: the comment might be missing, that's on purpose
|
27 |
+
// this can happen if the user asked for no captions or no commentary
|
28 |
const comment = filterSegments(
|
29 |
ClapSegmentFilteringMode.START,
|
30 |
shot,
|
|
|
40 |
).at(0)
|
41 |
|
42 |
const latentStory: LatentStory = {
|
43 |
+
comment: comment?.prompt || "",
|
44 |
+
image: image?.prompt || "",
|
45 |
+
voice: voice?.prompt || "",
|
46 |
}
|
47 |
|
48 |
latentStories.push(latentStory)
|
src/app/api/v1/edit/music/generateMusic.ts
CHANGED
@@ -48,7 +48,16 @@ export async function generateMusic({
|
|
48 |
}
|
49 |
|
50 |
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
console.log(`generateMusic(): generating a music with:\n duration: ${durationInSec} sec\n prompt: ${prompt}`)
|
54 |
|
|
|
48 |
}
|
49 |
|
50 |
|
51 |
+
// unconfirmed, I think some durations might make musicgen crash
|
52 |
+
// File "/home/user/app/audiocraft/modules/transformer.py", line 394, in forward
|
53 |
+
// k, v = self._complete_kv(k, v)
|
54 |
+
// File "/home/user/app/audiocraft/modules/transformer.py", line 286, in _complete_kv
|
55 |
+
// assert nk.shape[time_dim] == nv.shape[time_dim]
|
56 |
+
//
|
57 |
+
// it is also possible that it was because I tried to generate on the prod,
|
58 |
+
// while users were already using the musicgen cluster
|
59 |
+
|
60 |
+
const durationInSec = 12 // musicSegment.assetDurationInMs / 1000
|
61 |
|
62 |
console.log(`generateMusic(): generating a music with:\n duration: ${durationInSec} sec\n prompt: ${prompt}`)
|
63 |
|
src/app/api/v1/edit/music/generateMusicPrompt.ts
CHANGED
@@ -21,7 +21,7 @@ export async function generateMusicPrompts({
|
|
21 |
}): Promise<string[]> {
|
22 |
|
23 |
if (!prompt.length) { throw new Error(`please provide a prompt`) }
|
24 |
-
console.log("generateMusicPrompts(): prompt:", prompt)
|
25 |
|
26 |
|
27 |
if (!latentStory.length) { throw new Error(`please provide a story`) }
|
@@ -45,7 +45,7 @@ ${YAML.stringify(
|
|
45 |
const prefix = "\""
|
46 |
|
47 |
// we don't need a lot here!
|
48 |
-
const nbMaxNewTokens =
|
49 |
|
50 |
// TODO use streaming for the Hugging Face prediction
|
51 |
//
|
|
|
21 |
}): Promise<string[]> {
|
22 |
|
23 |
if (!prompt.length) { throw new Error(`please provide a prompt`) }
|
24 |
+
// console.log("generateMusicPrompts(): prompt:", prompt)
|
25 |
|
26 |
|
27 |
if (!latentStory.length) { throw new Error(`please provide a story`) }
|
|
|
45 |
const prefix = "\""
|
46 |
|
47 |
// we don't need a lot here!
|
48 |
+
const nbMaxNewTokens = 80
|
49 |
|
50 |
// TODO use streaming for the Hugging Face prediction
|
51 |
//
|
src/app/api/v1/edit/sounds/generateSoundPrompt.ts
CHANGED
@@ -21,7 +21,7 @@ export async function generateSoundPrompts({
|
|
21 |
}): Promise<string[]> {
|
22 |
|
23 |
if (!prompt.length) { throw new Error(`please provide a prompt`) }
|
24 |
-
console.log("generateSoundPrompts(): prompt:", prompt)
|
25 |
|
26 |
|
27 |
if (!latentStory.length) { throw new Error(`please provide a story`) }
|
@@ -53,7 +53,7 @@ ${YAML.stringify(
|
|
53 |
const prefix = "\""
|
54 |
|
55 |
// we don't need a lot here!
|
56 |
-
const nbMaxNewTokens =
|
57 |
|
58 |
// TODO use streaming for the Hugging Face prediction
|
59 |
//
|
|
|
21 |
}): Promise<string[]> {
|
22 |
|
23 |
if (!prompt.length) { throw new Error(`please provide a prompt`) }
|
24 |
+
// console.log("generateSoundPrompts(): prompt:", prompt)
|
25 |
|
26 |
|
27 |
if (!latentStory.length) { throw new Error(`please provide a story`) }
|
|
|
53 |
const prefix = "\""
|
54 |
|
55 |
// we don't need a lot here!
|
56 |
+
const nbMaxNewTokens = 80
|
57 |
|
58 |
// TODO use streaming for the Hugging Face prediction
|
59 |
//
|
src/app/api/v1/render/animatediff-lcm-svd/cluster.ts
CHANGED
@@ -2,7 +2,9 @@ import { sleep } from "@/lib/utils/sleep"
|
|
2 |
import { ClusterMachine } from "../../types"
|
3 |
|
4 |
|
5 |
-
|
|
|
|
|
6 |
export const nbClusterMachines = 8
|
7 |
// make sure the machines are running!!
|
8 |
|
@@ -17,18 +19,28 @@ export const nbClusterMachines = 8
|
|
17 |
|
18 |
// we maintain a global cluster state
|
19 |
|
20 |
-
export const clusterMachines: ClusterMachine[] = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
for (let i = 0; i < nbClusterMachines; i++) {
|
22 |
clusterMachines.push({
|
23 |
id: i,
|
24 |
url: `https://jbilcke-hf-ai-tube-model-als-${i + 1}.hf.space`,
|
25 |
-
|
26 |
-
// careful when trying this one (check number of Gradio parameters, fps etc):
|
27 |
-
// url: `https://jbilcke-hf-ai-tube-model-als-experimental.hf.space`,
|
28 |
busy: false
|
29 |
})
|
30 |
}
|
31 |
|
|
|
32 |
export async function getClusterMachine(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
|
33 |
let clusterMachine: ClusterMachine | undefined = undefined
|
34 |
let timeSpentWaitingInMs = 0
|
|
|
2 |
import { ClusterMachine } from "../../types"
|
3 |
|
4 |
|
5 |
+
// 8 allows us to support about 1 request per minute
|
6 |
+
// we are still gonna need to add a hugging face login wall,
|
7 |
+
// to limit further the amount of requests people do
|
8 |
export const nbClusterMachines = 8
|
9 |
// make sure the machines are running!!
|
10 |
|
|
|
19 |
|
20 |
// we maintain a global cluster state
|
21 |
|
22 |
+
export const clusterMachines: ClusterMachine[] = [
|
23 |
+
|
24 |
+
// careful when trying this one (check number of Gradio parameters, fps etc):
|
25 |
+
// url: `https://jbilcke-hf-ai-tube-model-als-experimental.hf.space`,
|
26 |
+
|
27 |
+
// { id: 0, url: `https://jbilcke-hf-ai-tube-model-als-1.hf.space`, busy: false },
|
28 |
+
// { id: 1, url: `https://jbilcke-hf-ai-tube-model-als-2.hf.space`, busy: false },
|
29 |
+
// { id: 2, url: `https://jbilcke-hf-ai-tube-model-als-3.hf.space`, busy: false },
|
30 |
+
// { id: 3, url: `https://jbilcke-hf-ai-tube-model-als-4.hf.space`, busy: false },
|
31 |
+
// { id: 4, url: `https://jbilcke-hf-ai-tube-model-als-5.hf.space`, busy: false },
|
32 |
+
// { id: 5, url: `https://jbilcke-hf-ai-tube-model-als-6.hf.space`, busy: false },
|
33 |
+
]
|
34 |
+
|
35 |
for (let i = 0; i < nbClusterMachines; i++) {
|
36 |
clusterMachines.push({
|
37 |
id: i,
|
38 |
url: `https://jbilcke-hf-ai-tube-model-als-${i + 1}.hf.space`,
|
|
|
|
|
|
|
39 |
busy: false
|
40 |
})
|
41 |
}
|
42 |
|
43 |
+
|
44 |
export async function getClusterMachine(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
|
45 |
let clusterMachine: ClusterMachine | undefined = undefined
|
46 |
let timeSpentWaitingInMs = 0
|
src/app/api/v1/render/animatediff-lcm-svd/index.ts
CHANGED
@@ -48,6 +48,9 @@ export async function render(request: {
|
|
48 |
const durationInSec = Math.round(nbFrames / nbFPS)
|
49 |
const framesPerSec = nbFPS
|
50 |
|
|
|
|
|
|
|
51 |
// vital step: image size must match the output video size
|
52 |
const resizedImageBase64 = await resizeImage({
|
53 |
input: imageInputBase64,
|
@@ -75,6 +78,7 @@ export async function render(request: {
|
|
75 |
})
|
76 |
}
|
77 |
|
|
|
78 |
const res = await fetch(machine.url + (machine.url.endsWith("/") ? "" : "/") + "api/predict", {
|
79 |
method: "POST",
|
80 |
headers: {
|
@@ -88,7 +92,7 @@ export async function render(request: {
|
|
88 |
resizedImageBase64,
|
89 |
0, // seed,
|
90 |
true,
|
91 |
-
|
92 |
|
93 |
// attention: we are experimenting with ffmpeg to change the speed,
|
94 |
// on the server "als-2"
|
@@ -103,7 +107,7 @@ export async function render(request: {
|
|
103 |
1.0, // min_guidance_scale,
|
104 |
width,
|
105 |
height,
|
106 |
-
nbSteps,
|
107 |
],
|
108 |
}),
|
109 |
|
|
|
48 |
const durationInSec = Math.round(nbFrames / nbFPS)
|
49 |
const framesPerSec = nbFPS
|
50 |
|
51 |
+
// I never know how to pick this
|
52 |
+
const motionBucketId = 30
|
53 |
+
|
54 |
// vital step: image size must match the output video size
|
55 |
const resizedImageBase64 = await resizeImage({
|
56 |
input: imageInputBase64,
|
|
|
78 |
})
|
79 |
}
|
80 |
|
81 |
+
|
82 |
const res = await fetch(machine.url + (machine.url.endsWith("/") ? "" : "/") + "api/predict", {
|
83 |
method: "POST",
|
84 |
headers: {
|
|
|
92 |
resizedImageBase64,
|
93 |
0, // seed,
|
94 |
true,
|
95 |
+
motionBucketId, // motion_bucket_id,
|
96 |
|
97 |
// attention: we are experimenting with ffmpeg to change the speed,
|
98 |
// on the server "als-2"
|
|
|
107 |
1.0, // min_guidance_scale,
|
108 |
width,
|
109 |
height,
|
110 |
+
4, // I don't see a lot of diff between 4 and 6, // nbSteps,
|
111 |
],
|
112 |
}),
|
113 |
|