import { create } from "zustand"
import { ClapProject, ClapSegment, newClap, parseClap } from "@aitube/clap"
import { getVideoPrompt } from "@aitube/engine"
import { InteractiveSegmenterResult, MPMask } from "@mediapipe/tasks-vision"

import { segmentFrame } from "@/lib/on-device-ai/segmentFrameOnClick"

import { LatentEngineStore } from "./types"
import { resolveSegments } from "../resolvers/resolveSegments"
import { fetchLatentClap } from "./generators/fetchLatentClap"
import { drawSegmentation } from "../utils/canvas/drawSegmentation"
import { getZIndexDepth } from "../utils/data/getZIndexDepth"
import { getSegmentStartAt } from "../utils/data/getSegmentStartAt"
import { getElementsSortedByStartAt } from "../utils/data/getElementsSortedByStartAt"
import { getSegmentEndAt } from "../utils/data/getSegmentEndAt"
import { setZIndexDepthId } from "../utils/data/setZIndexDepth"
import { setSegmentStartAt } from "../utils/data/setSegmentStartAt"
import { setSegmentEndAt } from "../utils/data/setSegmentEndAt"
import { setSegmentId } from "../utils/data/setSegmentId"
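
// Zustand store for the latent engine: it owns the .clap project state,
// the playback cursor and the buffered <video> layers, and it schedules
// the async "simulation" loops that keep those buffers filled.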
export const useLatentEngine = create<LatentEngineStore>((set, get) => ({
  jwtToken: "",

  width: 512,
  height: 288,

  clap: newClap(),
  debug: true,

  headless: false, // false by default
  isLoop: false,
  isStatic: false,
  isLive: false,
  isInteractive: false,

  isLoading: false, // true when a .clap is being downloaded and/or generated
  isLoaded: false, // true if a clap is loaded

  isPlaying: false,
  isPaused: true,

  // our "this is AI.. gasp!" disclaimer
  hasDisclaimer: true,
  hasPresentedDisclaimer: false,

  videoSimulationPromise: undefined,
  videoSimulationPending: false,
  videoSimulationStartedAt: performance.now(),
  videoSimulationEndedAt: performance.now(),
  videoSimulationDurationInMs: 0,
  videoSimulationVideoPlaybackFPS: 0,
  videoSimulationRenderingTimeFPS: 0,

  interfaceSimulationPromise: undefined,
  interfaceSimulationPending: false,
  interfaceSimulationStartedAt: performance.now(),
  interfaceSimulationEndedAt: performance.now(),
  interfaceSimulationDurationInMs: 0,

  entitySimulationPromise: undefined,
  entitySimulationPending: false,
  entitySimulationStartedAt: performance.now(),
  entitySimulationEndedAt: performance.now(),
  entitySimulationDurationInMs: 0,

  renderingIntervalId: undefined,
  renderingIntervalDelayInMs: 150, // 0.15s
  renderingLastRenderAt: performance.now(),

  // for our calculations to be correct
  // those need to match the actual output from the API
  // don't trust the parameters you send to the API,
  // instead check the *actual* values with VLC!!
  videoModelFPS: 24,
  videoModelNumOfFrames: 60, // 80,
  videoModelDurationInSec: 2.584,
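  // sanity check: 60 frames at 24 FPS would nominally last 60 / 24 = 2.5s,
  // yet the measured duration is 2.584s - exactly why the comment above
  // insists on checking the actual output rather than the request parameters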
  playbackSpeed: 1,

  positionInMs: 0,
  durationInMs: 0,

  // this is the "buffer size"
  videoLayers: [
    {
      id: "video-buffer-0",
      element: null as unknown as JSX.Element,
    },
    {
      id: "video-buffer-1",
      element: null as unknown as JSX.Element,
    },
    /*
    {
      id: "video-buffer-2",
      element: null as unknown as JSX.Element,
    },
    {
      id: "video-buffer-3",
      element: null as unknown as JSX.Element,
    },
    */
  ],
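  // two layers are enough for double-buffering: at any given time one layer
  // is visible (z-index depth > 0) while the other preloads the next segment;
  // the commented-out entries above would allow a deeper buffer if needed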
  videoElements: [],

  interfaceLayers: [],

  setJwtToken: (jwtToken: string) => {
    set({
      jwtToken
    })
  },

  setContainerDimension: ({ width, height }: { width: number; height: number }) => {
    set({
      width,
      height
    })
  },
  imagine: async (prompt: string): Promise<void> => {
    set({
      isLoaded: false,
      isLoading: true,
    })

    let clap: ClapProject | undefined = undefined

    try {
      clap = await fetchLatentClap(prompt)
    } catch (err) {
      console.error(`imagine failed (${err})`)
      set({
        isLoading: false,
      })
    }

    if (!clap) { return }

    get().open(clap)
  },
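  // accepts a URL string, an already-parsed ClapProject, or a Blob;
  // illustrative usage (the URL below is hypothetical):
  //   useLatentEngine.getState().open("https://example.com/demo.clap")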
  open: async (src?: string | ClapProject | Blob) => {
    const { debug } = get()
    set({
      isLoaded: false,
      isLoading: true,
    })

    let clap: ClapProject | undefined = undefined

    try {
      clap = await parseClap(src, debug)
    } catch (err) {
      console.error(`failed to open the Clap: ${err}`)
      set({
        isLoading: false,
      })
    }

    if (!clap) { return }

    set({
      clap,
      isLoading: false,
      isLoaded: true,
      isLoop: clap.meta.isLoop,
      isStatic: !clap.meta.isInteractive,
      isLive: false,
      isInteractive: clap.meta.isInteractive,
    })
  },
  setVideoElements: (videoElements: HTMLVideoElement[] = []) => { set({ videoElements }) },
  processClickOnSegment: (result: InteractiveSegmenterResult) => {
    console.log(`processClickOnSegment: user clicked on something:`, result)
    const { videoElements, debug } = get()
    if (!result?.categoryMask) {
      if (debug) {
        console.log(`processClickOnSegment: no categoryMask, so we skip the click`)
      }
      return
    }
    try {
      if (debug) {
        console.log(`processClickOnSegment: calling drawSegmentation`)
      }
      const firstVisibleVideo = videoElements.find(element =>
        getZIndexDepth(element) > 0
      )

      const segmentationElements = Array.from(
        document.querySelectorAll('.segmentation-canvas')
      ) as HTMLCanvasElement[]
      const segmentationElement = segmentationElements.at(0)

      const canvasMask: HTMLCanvasElement = drawSegmentation({
        mask: result.categoryMask,
        canvas: segmentationElement,
        backgroundImage: firstVisibleVideo,
        fillStyle: "rgba(255, 255, 255, 1.0)"
      })
      // TODO: read the canvas to determine what the user clicked on
      if (debug) {
        console.log(`processClickOnSegment: filtering the original image`)
      }
      // filterImage(imageElement, canvasMask)
      if (debug) {
        console.log("processClickOnSegment: calling result.close() to free the memory")
      }
      result.close()
    } catch (err) {
      console.error(`processClickOnSegment: something failed ${err}`)
    }
  },
  onClickOnSegmentationLayer: (event) => {
    const { videoElements, debug } = get()
    if (debug) {
      console.log("onClickOnSegmentationLayer")
    }
    const firstVisibleVideo = videoElements.find(element =>
      getZIndexDepth(element) > 0
    )
    if (!firstVisibleVideo) { return }
    const box = event.currentTarget.getBoundingClientRect()

    // clientX/clientY are viewport coordinates, so we first make them
    // relative to the layer before normalizing them to [0, 1]
    const px = event.clientX - box.left
    const py = event.clientY - box.top
    const x = px / box.width
    const y = py / box.height

    console.log(`onClickOnSegmentationLayer: user clicked on `, { x, y, px, py, box, videoElements })
    const fn = async () => {
      // todo julian: this should use the visible element instead
      const results: InteractiveSegmenterResult = await segmentFrame(firstVisibleVideo, x, y)
      get().processClickOnSegment(results)
    }
    fn()
  },
  togglePlayPause: (): boolean => {
    const { isLoaded, isPlaying, playbackSpeed, renderingIntervalId, videoElements } = get()
    if (!isLoaded) { return false }

    const newValue = !isPlaying

    clearInterval(renderingIntervalId)

    const firstVisibleVideo = videoElements.find(element =>
      getZIndexDepth(element) > 0
    )

    // Note Julian: we could also let the background scheduler
    // (runRenderingLoop) do its work of advancing the cursor here

    if (newValue) {
      if (firstVisibleVideo) {
        try {
          firstVisibleVideo.playbackRate = playbackSpeed
          firstVisibleVideo.play()
        } catch (err) {
          console.error(`togglePlayPause: failed to start the video (${err})`)
        }
      }
      set({
        isPlaying: true,
        renderingIntervalId: setTimeout(() => { get().runRenderingLoop() }, 0)
      })
    } else {
      if (firstVisibleVideo) {
        try {
          firstVisibleVideo.playbackRate = playbackSpeed
          firstVisibleVideo.pause()
        } catch (err) {
          console.error(`togglePlayPause: failed to pause the video (${err})`)
        }
      }
      set({ isPlaying: false })
    }

    return newValue
  },

  play: (): boolean => {
    const { isLoaded, isPlaying, renderingIntervalId, renderingIntervalDelayInMs } = get()
    if (!isLoaded) { return false }
    if (isPlaying) { return true }

    clearInterval(renderingIntervalId)
    set({
      isPlaying: true,
      renderingIntervalId: setTimeout(() => { get().runRenderingLoop() }, 0)
    })
    return true
  },

  pause: (): boolean => {
    const { isLoaded, renderingIntervalId } = get()
    if (!isLoaded) { return false }

    clearInterval(renderingIntervalId)
    set({ isPlaying: false })
    return false
  },
  // a slow rendering function (async - might call a third party LLM)
  runVideoSimulationLoop: async () => {
    const {
      isLoaded,
      isPlaying,
      clap,
      playbackSpeed,
      positionInMs,
      videoModelFPS,
      videoModelNumOfFrames,
      videoModelDurationInSec,
      videoElements,
      jwtToken,
    } = get()
    if (!isLoaded || !isPlaying) {
      set({ videoSimulationPending: false })
      return
    }

    set({
      videoSimulationPending: true,
      videoSimulationStartedAt: performance.now(),
    })

    const videosSortedByStartAt = getElementsSortedByStartAt(videoElements)

    // videos whose timestamp is behind the current cursor
    let toRecycle: HTMLVideoElement[] = []
    let toPlay: HTMLVideoElement[] = []
    let toPreload: HTMLVideoElement[] = []

    for (let i = 0; i < videosSortedByStartAt.length; i++) {
      const video = videosSortedByStartAt[i]
      const segmentStartAt = getSegmentStartAt(video)
      const segmentEndAt = getSegmentEndAt(video)
      // this segment has been spent, it should be discarded
      if (segmentEndAt < positionInMs) {
        toRecycle.push(video)
      } else if (segmentStartAt < positionInMs) {
        toPlay.push(video)
        video.play()
        setZIndexDepthId(video, 10)
      } else {
        toPreload.push(video)
        video.pause()
        setZIndexDepthId(video, 0)
      }
    }
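    // at this point each buffered video falls into one of three buckets,
    // relative to the playback cursor:
    //
    //   toRecycle (ended) | toPlay (started) | toPreload (not started yet)
    //                positionInMs
    //
    // spent videos get a fresh src below, turning them back into preloads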
    const videoDurationInMs = videoModelDurationInSec * 1000

    // TODO julian: this is an approximation
    // to grab the max number of segments
    const maxBufferDurationInMs = positionInMs + (videoDurationInMs * 4)

    console.log(`DEBUG: `, {
      positionInMs,
      videoModelDurationInSec,
      videoDurationInMs,
      "(videoDurationInMs * 4)": (videoDurationInMs * 4),
      maxBufferDurationInMs,
      segments: clap.segments
    })

    const prefilterSegmentsForPerformanceReasons: ClapSegment[] = clap.segments.filter(s =>
      s.startTimeInMs >= positionInMs &&
      s.startTimeInMs < maxBufferDurationInMs
    )

    console.log(`prefilterSegmentsForPerformanceReasons: `, prefilterSegmentsForPerformanceReasons)

    // this tells us how much time is left
    let remainingTimeInMs = Math.max(0, clap.meta.durationInMs - positionInMs)
    // to avoid interruptions, we should jump to the beginning of the project
    // as soon as we start playing back the "last" video segment

    // now, we need to recycle spent videos,
    // by discarding their content and replacing it with a fresh one
    //
    // yes: I know the code is complex and not intuitive - sorry about that
    const extraPositivePrompt: string[] = ["high quality", "crisp", "detailed"]

    let bufferAheadOfCurrentPositionInMs = positionInMs

    for (let i = 0; i < toRecycle.length; i++) {
      console.log(`got a spent video to recycle`)

      // we select the segments in the current shot
      const shotSegmentsToPreload: ClapSegment[] = prefilterSegmentsForPerformanceReasons.filter(s =>
        s.startTimeInMs >= bufferAheadOfCurrentPositionInMs &&
        s.startTimeInMs < (bufferAheadOfCurrentPositionInMs + videoDurationInMs)
      )
      bufferAheadOfCurrentPositionInMs += videoDurationInMs

      const prompt = getVideoPrompt(shotSegmentsToPreload, clap.entityIndex, extraPositivePrompt)
      console.log(`video prompt: ${prompt}`)

      // could also be the camera
      // after all, we don't necessarily have a shot,
      // this could also be a gaussian splat
      const shotData = shotSegmentsToPreload.find(s => s.category === "video")
      console.log(`shotData:`, shotData)

      if (!prompt || !shotData) { continue }

      const recycled = toRecycle[i]
      recycled.pause()
      setSegmentId(recycled, shotData.id)
      setSegmentStartAt(recycled, shotData.startTimeInMs)
      setSegmentEndAt(recycled, shotData.endTimeInMs)
      setZIndexDepthId(recycled, 0)
      // this is the best compromise for now in terms of speed
      const width = 512
      const height = 288

      // this is our magic trick: we let the browser do the token-secured,
      // asynchronous and parallel video generation call for us
      //
      // one issue with this approach is that it hopes the video
      // will be downloaded in time, but it's not an exact science
      //
      // first, generation time varies between 4sec and 7sec,
      // then some people will get 300ms latency due to their ISP,
      // and finally the video itself is a 150~200 KB payload
      recycled.src = `/api/resolvers/video?t=${
        // to prevent funny people from using this as a free, open-bar video API
        // we have this system of token with a 24h expiration date
        // we might even make it tighter in the future
        jwtToken
      }&w=${
        width
      }&h=${
        height
      }&p=${
        // let's re-use the best ideas from the Latent Browser:
        // a text uri equals a latent resource
        encodeURIComponent(prompt)
      }`
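      // for illustration, with hypothetical values the resolved URL
      // would look like:
      //   /api/resolvers/video?t=<jwt>&w=512&h=288&p=a%20cat%2C%20high%20quality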
      toPreload.push(recycled)
    }

    const videoSimulationEndedAt = performance.now()
    const videoSimulationDurationInMs = videoSimulationEndedAt - get().videoSimulationStartedAt
    const videoSimulationDurationInSec = videoSimulationDurationInMs / 1000
    const videoSimulationVideoPlaybackFPS = videoModelFPS * playbackSpeed
    const videoSimulationRenderingTimeFPS = videoModelNumOfFrames / videoSimulationDurationInSec
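    // worked example: if the 60 frames took 5s to generate and download,
    // videoSimulationRenderingTimeFPS = 60 / 5 = 12, while playback consumes
    // 24 * 1 = 24 frames per second - i.e. generation runs at half of real time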
    set({
      videoSimulationPending: false,
      videoSimulationEndedAt,
      videoSimulationDurationInMs,
      videoSimulationVideoPlaybackFPS,
      videoSimulationRenderingTimeFPS,
    })
  },

  // a slow rendering function (async - might call a third party LLM)
  runInterfaceSimulationLoop: async () => {
    const {
      isLoaded,
      isPlaying,
      clap,
    } = get()
    if (!isLoaded || !isPlaying) {
      set({ interfaceSimulationPending: false })
      return
    }

    set({
      interfaceSimulationPending: true,
      interfaceSimulationStartedAt: performance.now(),
    })

    try {
      if (get().isPlaying) {
        // console.log(`runSimulationLoop: rendering UI layer..`)
        // note: for now we only display one panel at a time,
        // later we can try to see if we should handle more,
        // e.g. for a nice gradient transition
        const interfaceLayers = await resolveSegments(clap, "interface", 1)

        if (get().isPlaying) {
          set({
            interfaceLayers
          })
          // console.log(`runSimulationLoop: rendered UI layer`)
        }
      }
    } catch (err) {
      console.error(`runInterfaceSimulationLoop failed to render UI layer ${err}`)
    }

    const interfaceSimulationEndedAt = performance.now()
    const interfaceSimulationDurationInMs = interfaceSimulationEndedAt - get().interfaceSimulationStartedAt
    set({
      interfaceSimulationPending: false,
      interfaceSimulationEndedAt,
      interfaceSimulationDurationInMs,
    })
  },
  // a slow rendering function (async - might call a third party LLM)
  runEntitySimulationLoop: async () => {
    const {
      isLoaded,
      isPlaying,
      clap,
    } = get()
    if (!isLoaded || !isPlaying) {
      set({ entitySimulationPending: false })
      return
    }

    set({
      entitySimulationPending: true,
      entitySimulationStartedAt: performance.now(),
    })
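    // note: no entity work is performed yet - this loop is a placeholder
    // that only records its own timing, so the scheduler in runRenderingLoop
    // can treat all three simulation loops uniformly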
    const entitySimulationEndedAt = performance.now()
    const entitySimulationDurationInMs = entitySimulationEndedAt - get().entitySimulationStartedAt
    set({
      entitySimulationPending: false,
      entitySimulationEndedAt,
      entitySimulationDurationInMs,
    })
  },
  // a fast sync rendering function, whose sole role is to filter the component
  // list and put into the buffer the ones that should be displayed
  runRenderingLoop: () => {
    const {
      isLoaded,
      isPlaying,
      renderingIntervalId,
      renderingIntervalDelayInMs,
      renderingLastRenderAt,
      positionInMs,
      videoSimulationPending,
      runVideoSimulationLoop,
      interfaceSimulationPending,
      runInterfaceSimulationLoop,
      entitySimulationPending,
      runEntitySimulationLoop,
    } = get()
    if (!isLoaded || !isPlaying) { return }
    // TODO julian: don't do this here, this is inefficient
    const videoElements = Array.from(
      document.querySelectorAll('.video-buffer')
    ) as HTMLVideoElement[]

    const newRenderingLastRenderAt = performance.now()
    const elapsedInMs = newRenderingLastRenderAt - renderingLastRenderAt

    // let's move inside the Clap file timeline
    const newPositionInMs = positionInMs + elapsedInMs

    clearInterval(renderingIntervalId)
    set({
      isPlaying: true,
      renderingLastRenderAt: newRenderingLastRenderAt,
      positionInMs: newPositionInMs,
      videoElements: videoElements,

      // TODO: use requestAnimationFrame somehow
      // https://developers.google.com/mediapipe/solutions/vision/image_segmenter/web_js
      renderingIntervalId: setTimeout(() => { get().runRenderingLoop() }, renderingIntervalDelayInMs)
    })
    // note that having this second set() also helps us make sure the previous
    // values are properly stored in the state when the simulation loops run
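    // the *SimulationPending flags act as a simple mutex: each kind of
    // simulation loop runs at most once at a time, and a new one is only
    // fired after the previous run has flipped its flag back to false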
    if (!videoSimulationPending) {
      set({ videoSimulationPromise: runVideoSimulationLoop() }) // <-- note: this is a fire-and-forget operation!
    }
    if (!interfaceSimulationPending) {
      set({ interfaceSimulationPromise: runInterfaceSimulationLoop() }) // <-- note: this is a fire-and-forget operation!
    }
    if (!entitySimulationPending) {
      set({ entitySimulationPromise: runEntitySimulationLoop() }) // <-- note: this is a fire-and-forget operation!
    }
  },

  jumpTo: (positionInMs: number) => {
    set({ positionInMs })
  },

  jumpToStart: () => {
    set({ positionInMs: 0 })
  },
}))