Spaces:

jbilcke-hf
/

ai-tube

Running

File size: 18,565 Bytes


import { create } from "zustand"

import { ClapProject, ClapSegment, newClap, parseClap } from "@aitube/clap"
import { getVideoPrompt } from "@aitube/engine"

import { LatentEngineStore } from "./types"
import { resolveSegments } from "../resolvers/resolveSegments"
import { fetchLatentClap } from "./generators/fetchLatentClap"

import { InteractiveSegmenterResult, MPMask } from "@mediapipe/tasks-vision"
import { segmentFrame } from "@/lib/on-device-ai/segmentFrameOnClick"
import { drawSegmentation } from "../utils/canvas/drawSegmentation"
import { getZIndexDepth } from "../utils/data/getZIndexDepth"
import { getSegmentStartAt } from "../utils/data/getSegmentStartAt"
import { getElementsSortedByStartAt } from "../utils/data/getElementsSortedByStartAt"
import { getSegmentEndAt } from "../utils/data/getSegmentEndAt"
import { setZIndexDepthId } from "../utils/data/setZIndexDepth"
import { setSegmentStartAt } from "../utils/data/setSegmentStartAt"
import { setSegmentEndAt } from "../utils/data/setSegmentEndAt"
import { setSegmentId } from "../utils/data/setSegmentId"

export const useLatentEngine = create<LatentEngineStore>((set, get) => ({
  jwtToken: "",

  width: 512,
  height: 288,

  clap: newClap(),
  debug: true,

  headless: false, // false by default

  isLoop: false,
  isStatic: false,
  isLive: false,
  isInteractive: false,

  isLoading: false, // true when a .clap is being downloaded and/or generated
  isLoaded: false, // true if a clap is loaded
  isPlaying: false,
  isPaused: true,

  // our "this is AI.. gasp!" disclaimer
  hasDisclaimer: true,
  hasPresentedDisclaimer: false,

  videoSimulationPromise: undefined,
  videoSimulationPending: false,
  videoSimulationStartedAt: performance.now(),
  videoSimulationEndedAt: performance.now(),
  videoSimulationDurationInMs: 0,
  videoSimulationVideoPlaybackFPS: 0,
  videoSimulationRenderingTimeFPS: 0,

  interfaceSimulationPromise: undefined,
  interfaceSimulationPending: false,
  interfaceSimulationStartedAt: performance.now(),
  interfaceSimulationEndedAt: performance.now(),
  interfaceSimulationDurationInMs: 0,

  entitySimulationPromise: undefined,
  entitySimulationPending: false,
  entitySimulationStartedAt: performance.now(),
  entitySimulationEndedAt: performance.now(),
  entitySimulationDurationInMs: 0,

  renderingIntervalId: undefined,
  renderingIntervalDelayInMs: 150, // 0.2s
  renderingLastRenderAt: performance.now(),

  // for our calculations to be correct
  // those need to match the actual output from the API
  // don't trust the parameters you send to the API,
  // instead check the *actual* values with VLC!!
  videoModelFPS: 24,
  videoModelNumOfFrames: 60, // 80,
  videoModelDurationInSec: 2.584,

  playbackSpeed: 1,

  positionInMs: 0,
  durationInMs: 0,

  // this is the "buffer size"
  videoLayers: [
    {
      id: "video-buffer-0",
      element: null as unknown as JSX.Element,
    },
    {
      id: "video-buffer-1",
      element: null as unknown as JSX.Element,
    },
    /*
    {
      id: "video-buffer-2",
      element: null as unknown as JSX.Element,
    },
    {
      id: "video-buffer-3",
      element: null as unknown as JSX.Element,
    },
    */
  ],
  videoElements: [],

  interfaceLayers: [],

  setJwtToken: (jwtToken: string) => {
    set({
      jwtToken
    })
  },

  setContainerDimension: ({ width, height }: { width: number; height: number }) => {
    set({
      width,
      height
    })
  },

  imagine: async (prompt: string): Promise<void> => {
    set({
      isLoaded: false,
      isLoading: true, 
    })

    let clap: ClapProject | undefined = undefined

    try {
      clap = await fetchLatentClap(prompt)
    } catch (err) {
      console.error(`generateAndLoad failed (${err})`)
      set({
        isLoading: false,
      })
    }

    if (!clap) { return }

    get().open(clap)
  },


  open: async (src?: string | ClapProject | Blob) => {
    const { debug } = get()
    set({
      isLoaded: false,
      isLoading: true, 
    })

    let clap: ClapProject | undefined = undefined

    try {
      clap = await parseClap(src, debug)
    } catch (err) {
      console.error(`failed to open the Clap: ${err}`)
      set({
        isLoading: false,
      })
    }

    if (!clap) { return }
 
    set({
      clap,
      isLoading: false,
      isLoaded: true,
      isLoop: clap.meta.isLoop,
      isStatic: !clap.meta.isInteractive,
      isLive: false,
      isInteractive: clap.meta.isInteractive,
    })
  },

  setVideoElements: (videoElements: HTMLVideoElement[] = []) => { set({ videoElements }) },

  processClickOnSegment: (result: InteractiveSegmenterResult) => {
    console.log(`processClickOnSegment: user clicked on something:`, result)

    const { videoElements, debug } = get()

    if (!result?.categoryMask) {
      if (debug) {
        console.log(`processClickOnSegment: no categoryMask, so we skip the click`)
      }
      return
    }

    try {
      if (debug) {
        console.log(`processClickOnSegment: callling drawSegmentation`)
      }

      const firstVisibleVideo = videoElements.find(element => 
        getZIndexDepth(element) > 0
      )

      const segmentationElements = Array.from(
        document.querySelectorAll('.segmentation-canvas')
      ) as HTMLCanvasElement[]

      const segmentationElement = segmentationElements.at(0)
   
      const canvasMask: HTMLCanvasElement = drawSegmentation({
        mask: result.categoryMask,
        canvas: segmentationElement,
        backgroundImage: firstVisibleVideo,
        fillStyle: "rgba(255, 255, 255, 1.0)"
      })
      // TODO: read the canvas te determine on what the user clicked

      if (debug) {
        console.log(`processClickOnSegment: filtering the original image`)
      }
      // filterImage(imageElement, canvasMask)

      if (debug) {
        console.log("processClickOnSegment: TODO call data.close() to free the memory!")
      }
      result.close()
    } catch (err) {
      console.error(`processClickOnSegment: something failed ${err}`)
    }
  },
  onClickOnSegmentationLayer: (event) => {

    const { videoElements, debug } = get()
    if (debug) {
      console.log("onClickOnSegmentationLayer")
    }

    const firstVisibleVideo = videoElements.find(element => 
      getZIndexDepth(element) > 0
    )
    if (!firstVisibleVideo) { return }

    const box = event.currentTarget.getBoundingClientRect()

    const px = event.clientX
    const py = event.clientY

    const x = px / box.width
    const y = py / box.height
    console.log(`onClickOnSegmentationLayer: user clicked on `, { x, y, px, py, box, videoElements })

    const fn = async () => {
      // todo julian: this should use the visible element instead
      const results: InteractiveSegmenterResult = await segmentFrame(firstVisibleVideo, x, y)
      get().processClickOnSegment(results)
    }
    fn()
  },
  
  togglePlayPause: (): boolean => {
    const { isLoaded, isPlaying, playbackSpeed, renderingIntervalId, videoElements } = get()
    if (!isLoaded) { return false }
    
    const newValue = !isPlaying

    clearInterval(renderingIntervalId)

    const firstVisibleVideo = videoElements.find(element => 
      getZIndexDepth(element) > 0
    )

    // Note Julian: we could also let the background scheduler
    // (runRenderingLoop) do its work of advancing the cursor here

    if (newValue) {
      if (firstVisibleVideo) {
        try {
          firstVisibleVideo.playbackRate = playbackSpeed
          firstVisibleVideo.play()
        } catch (err) {
          console.error(`togglePlayPause: failed to start the video (${err})`)
        }
      }
      set({
        isPlaying: true,
        renderingIntervalId: setTimeout(() => { get().runRenderingLoop() }, 0)
      })
    } else {
      if (firstVisibleVideo) {
        try {
          firstVisibleVideo.playbackRate = playbackSpeed
          firstVisibleVideo.pause()
        } catch (err) {
          console.error(`togglePlayPause: failed to pause the video (${err})`)
        }
      }
      set({ isPlaying: false })
    }

    return newValue
  },


  play: (): boolean => {
    const { isLoaded, isPlaying, renderingIntervalId, renderingIntervalDelayInMs } = get()

    if (!isLoaded) { return false }

    if (isPlaying) { return true }

    clearInterval(renderingIntervalId)
    set({
      isPlaying: true,
      renderingIntervalId: setTimeout(() => { get().runRenderingLoop() }, 0)
    })

    return true
  },

  pause: (): boolean => {
    const { isLoaded, renderingIntervalId } = get()
    if (!isLoaded) { return false }

    clearInterval(renderingIntervalId)

    set({ isPlaying: false })

    return false
  },

  // a slow rendering function (async - might call a third party LLM)
  runVideoSimulationLoop: async () => {
    const {
      isLoaded,
      isPlaying,
      clap,
      playbackSpeed,
      positionInMs,
      videoModelFPS,
      videoModelNumOfFrames,
      videoModelDurationInSec,
      videoElements,
      jwtToken,
    } = get()

    if (!isLoaded || !isPlaying) {
      set({ videoSimulationPending: false })
      return
    }

    set({
      videoSimulationPending: true,
      videoSimulationStartedAt: performance.now(),
    })

    const videosSortedByStartAt = getElementsSortedByStartAt(videoElements)

    // videos whose timestamp is behind the current cursor
    let toRecycle: HTMLVideoElement[] = []
    let toPlay: HTMLVideoElement[] = []
    let toPreload: HTMLVideoElement[] = []
    
    for (let i = 0; i < videosSortedByStartAt.length; i++) {
      const video = videosSortedByStartAt[i]
      
      const segmentStartAt = getSegmentStartAt(video)
      const segmentEndAt = getSegmentEndAt(video)

      // this segment has been spent, it should be discared
      if (segmentEndAt < positionInMs) {
        toRecycle.push(video)
      } else if (segmentStartAt < positionInMs) {
        toPlay.push(video)
        video.play()
        setZIndexDepthId(video, 10)
      } else {
        toPreload.push(video)
        video.pause()
        setZIndexDepthId(video, 0)
      }
    }

    const videoDurationInMs = videoModelDurationInSec * 1000

    // TODO julian: this is an approximation
    // to grab the max number of segments
    const maxBufferDurationInMs = positionInMs + (videoDurationInMs * 4)
    console.log(`DEBUG: `, {
      positionInMs,
      videoModelDurationInSec,
      videoDurationInMs,
      "(videoDurationInMs * 4)":  (videoDurationInMs * 4),
      maxBufferDurationInMs,
      segments: clap.segments
    })
    
    const prefilterSegmentsForPerformanceReasons: ClapSegment[] = clap.segments.filter(s =>
      s.startTimeInMs >= positionInMs &&
      s.startTimeInMs < maxBufferDurationInMs
    )

    console.log(`prefilterSegmentsForPerformanceReasons: `, prefilterSegmentsForPerformanceReasons)

    // this tells us how much time is left
    let remainingTimeInMs = Math.max(0, clap.meta.durationInMs - positionInMs)
    // to avoid interruptions, we should jump to the beginning of the project
    // as soo as we are start playing back the "last" video segment

    // now, we need to recycle spent videos,
    // by discarding their content and replacing it with fresh one
    //
    // yes: I know the code is complex and not intuitive - sorry about that

    const extraPositivePrompt: string[] = ["high quality", "crisp", "detailed"]

    let bufferAheadOfCurrentPositionInMs = positionInMs

    for (let i = 0; i < toRecycle.length; i++) {
      console.log(`got a spent video to recycle`)
      
      // we select the segments in the current shot

      const shotSegmentsToPreload: ClapSegment[] = prefilterSegmentsForPerformanceReasons.filter(s =>
        s.startTimeInMs >= bufferAheadOfCurrentPositionInMs &&
        s.startTimeInMs < (bufferAheadOfCurrentPositionInMs + videoDurationInMs)
      )
      
      bufferAheadOfCurrentPositionInMs += videoDurationInMs

      const prompt = getVideoPrompt(shotSegmentsToPreload, clap.entityIndex, extraPositivePrompt)

      console.log(`video prompt: ${prompt}`)
      // could also be the camera
      // after all, we don't necessarily have a shot,
      // this could also be a gaussian splat
      const shotData = shotSegmentsToPreload.find(s => s.category === "video")

      console.log(`shotData:`, shotData)

      if (!prompt || !shotData) { continue }
  
      const recycled = toRecycle[i]

      recycled.pause()

      setSegmentId(recycled, shotData.id)
      setSegmentStartAt(recycled, shotData.startTimeInMs)
      setSegmentEndAt(recycled, shotData.endTimeInMs)
      setZIndexDepthId(recycled, 0)

      // this is the best compromise for now in term of speed
      const width = 512
      const height = 288

      // this is our magic trick: we let the browser do the token-secured,
      // asynchronous and parallel video generation call for us
      //
      // one issue with this approach is that it hopes the video
      // will be downloaded in time, but it's not an exact science
      //
      // first, generation time varies between 4sec and 7sec,
      // then some people will get 300ms latency due to their ISP,
      // and finally the video itself is a 150~200 Kb payload)
      recycled.src = `/api/resolvers/video?t=${

      // to prevent funny people from using this as a free, open-bar video API
      // we have this system of token with a 24h expiration date
      // we might even make it tighter in the future
        jwtToken
      }&w=${
        width

      }&h=${
        height
      }&p=${
        // let's re-use the best ideas from the Latent Browser:
        // a text uri equals a latent resource
        encodeURIComponent(prompt)
      }`

      toPreload.push(recycled)
    }

    const videoSimulationEndedAt = performance.now()
    const videoSimulationDurationInMs = videoSimulationEndedAt - get().videoSimulationStartedAt 
    const videoSimulationDurationInSec = videoSimulationDurationInMs / 1000

    const videoSimulationVideoPlaybackFPS = videoModelFPS * playbackSpeed
    const videoSimulationRenderingTimeFPS = videoModelNumOfFrames / videoSimulationDurationInSec
    set({
      videoSimulationPending: false,
      videoSimulationEndedAt,
      videoSimulationDurationInMs,
      videoSimulationVideoPlaybackFPS,
      videoSimulationRenderingTimeFPS,
    })
  },


  // a slow rendering function (async - might call a third party LLM)
  runInterfaceSimulationLoop: async () => {
    const {
      isLoaded,
      isPlaying,
      clap,
    } = get()

    if (!isLoaded || !isPlaying) {
      set({ interfaceSimulationPending: false })
      return
    }

    set({
      interfaceSimulationPending: true,
      interfaceSimulationStartedAt: performance.now(),
    })
 
    try {
      if (get().isPlaying) {
        // console.log(`runSimulationLoop: rendering UI layer..`)

        // note: for now we only display one panel at a time,
        // later we can try to see if we should handle more
        // for nice gradient transition,
        const interfaceLayers = await resolveSegments(clap, "interface", 1)

        if (get().isPlaying) {
          set({
            interfaceLayers
          })

          // console.log(`runSimulationLoop: rendered UI layer`)
        }
      }
    } catch (err) {
      console.error(`runInterfaceSimulationLoop failed to render UI layer ${err}`)
    }

    const interfaceSimulationEndedAt = performance.now()
    const interfaceSimulationDurationInMs = interfaceSimulationEndedAt - get().interfaceSimulationStartedAt

    set({
      interfaceSimulationPending: false,
      interfaceSimulationEndedAt,
      interfaceSimulationDurationInMs,
    })
  },


  // a slow rendering function (async - might call a third party LLM)
  runEntitySimulationLoop: async () => {
    const {
      isLoaded,
      isPlaying,
      clap,
    } = get()


    if (!isLoaded || !isPlaying) {
      set({ entitySimulationPending: false })
      return
    }

    set({
      entitySimulationPending: true,
      entitySimulationStartedAt: performance.now(),
    })

    const entitySimulationEndedAt = performance.now()
    const entitySimulationDurationInMs = entitySimulationEndedAt - get().entitySimulationStartedAt 

    set({
      entitySimulationPending: false,
      entitySimulationEndedAt,
      entitySimulationDurationInMs,
    })
  },
  // a fast sync rendering function; whose sole role is to filter the component
  // list to put into the buffer the one that should be displayed
  runRenderingLoop: () => {
    const {
      isLoaded,
      isPlaying,
      renderingIntervalId,
      renderingIntervalDelayInMs,
      renderingLastRenderAt,
      positionInMs,
      videoSimulationPending,
      runVideoSimulationLoop,
      interfaceSimulationPending,
      runInterfaceSimulationLoop,
      entitySimulationPending,
      runEntitySimulationLoop,
    } = get()
    if (!isLoaded || !isPlaying) { return }

    // TODO julian: don't do this here, this is inneficient
    const videoElements = Array.from(
      document.querySelectorAll('.video-buffer')
    ) as HTMLVideoElement[]
 
    const newRenderingLastRenderAt = performance.now()
    const elapsedInMs = newRenderingLastRenderAt - renderingLastRenderAt

    // let's move inside the Clap file timeline
    const newPositionInMs = positionInMs + elapsedInMs

    clearInterval(renderingIntervalId)

    set({
      isPlaying: true,
      renderingLastRenderAt: newRenderingLastRenderAt,
      positionInMs: newPositionInMs,

      videoElements: videoElements,
      // TODO: use requestAnimationFrame somehow
      // https://developers.google.com/mediapipe/solutions/vision/image_segmenter/web_js
      renderingIntervalId: setTimeout(() => { get().runRenderingLoop() }, renderingIntervalDelayInMs)
    })

    // note that having this second set() also helps us to make sure previously values are properly stored
    // in the state when the simulation loop runs
    if (!videoSimulationPending) {
      set({ videoSimulationPromise: runVideoSimulationLoop() }) // <-- note: this is a fire-and-forget operation!
    }
    if (!interfaceSimulationPending) {
      set({ interfaceSimulationPromise: runInterfaceSimulationLoop() }) // <-- note: this is a fire-and-forget operation!
    }
    if (!entitySimulationPending) {
      set({ entitySimulationPromise: runEntitySimulationLoop() }) // <-- note: this is a fire-and-forget operation!
    }
  },

  jumpTo: (positionInMs: number) => {
    set({ positionInMs })
  },
  jumpToStart: () => {
    set({ positionInMs: 0 })
  },
}))