jbilcke-hf HF staff committed on
Commit
8ce416b
·
1 Parent(s): b785e1d

working on image analysis (idefics)

Browse files
Files changed (3) hide show
  1. src/analysis/analyzeImage.mts +43 -0
  2. src/index.mts +52 -2
  3. src/types.mts +11 -0
src/analysis/analyzeImage.mts ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import { client } from "@gradio/client"
3
+
4
+
5
// Pool of Gradio space URLs that can serve image analysis requests.
// Replicas are not used yet, because it is not easy to get their hostname,
// so the pool only holds the single URL configured through the environment.
// Unset or empty values are dropped, which can leave the pool empty.
const instances: string[] = [
  process.env.VC_ANALYSIS_SPACE_API_URL || "",
  // `${process.env.VC_UPSCALING_SPACE_API_URL_2 || ""}`,
  // `${process.env.VC_UPSCALING_SPACE_API_URL_3 || ""}`,
].filter(url => url.length > 0)
11
+
12
+
13
/**
 * Send an image and a text prompt to the configured IDEFICS Gradio space and
 * return the first element of the prediction's `data` array.
 *
 * Instances are consumed in round-robin order: the URL at the head of the
 * pool is used for this call and then moved to the back.
 *
 * @param src - image forwarded as-is to the space's 'Image input' component
 *   (callers in this project pass a base64 string — TODO confirm the format
 *   the space actually expects)
 * @param prompt - text forwarded to the space's 'Text input' component
 * @returns `data[0]` of the prediction response, asserted to be a string
 * @throws Error when no analysis instance is configured; also propagates any
 *   error raised by the Gradio client
 */
export async function analyzeImage(src: string, prompt: string): Promise<string> {

  // round-robin over the pool; bail out early instead of handing `undefined`
  // to the Gradio client (and pushing it back into the pool)
  const instance = instances.shift()
  if (!instance) {
    throw new Error("no image analysis instance configured (VC_ANALYSIS_SPACE_API_URL is empty)")
  }
  instances.push(instance)

  // only send a token when one is configured: interpolating an unset variable
  // would send the literal string "undefined" as the token
  const token = process.env.VC_HF_API_TOKEN
  const api = await client(
    instance,
    token ? { hf_token: token as `hf_${string}` } : {}
  )

  const result = await api.predict(6, [
    "HuggingFaceM4/idefics-80b-instruct", // string (Option from: ['HuggingFaceM4/idefics-80b-instruct']) in 'Model' Dropdown component
    prompt, // string in 'Text input' Textbox component
    "null", // any (any valid json) in 'IDEFICS' Chatbot component
    src, // blob in 'Image input' Image component

    // the following values come from the source code at:
    // https://huggingface.co/spaces/HuggingFaceM4/idefics_playground/blob/main/app_dialogue.py#L416-L472

    "Greedy", // string in 'Decoding strategy' Radio component
    0.4, // number (numeric value between 0.0 and 5.0) in 'Sampling temperature' Slider component
    512, // number (numeric value between 8 and 1024) in 'Maximum number of new tokens to generate' Slider component
    1, // number (numeric value between 0.0 and 5.0) in 'Repetition penalty' Slider component
    0.8, // number (numeric value between 0.01 and 0.99) in 'Top P' Slider component
  ])

  // the client's return type is not precise enough here, so we only assume
  // an optional `data` array
  const rawResponse = result as { data?: unknown[] } | undefined

  console.log("rawResponse:", rawResponse)

  return rawResponse?.data?.[0] as string
}
src/index.mts CHANGED
@@ -4,7 +4,7 @@ import path from "node:path"
4
  import { validate as uuidValidate } from "uuid"
5
  import express from "express"
6
 
7
- import { Video, VideoStatus, VideoAPIRequest, RenderRequest, RenderedScene } from "./types.mts"
8
 
9
  import { parseVideoRequest } from "./utils/parseVideoRequest.mts"
10
  import { savePendingVideo } from "./scheduler/savePendingVideo.mts"
@@ -23,6 +23,7 @@ import { sortVideosByYoungestFirst } from "./utils/sortVideosByYoungestFirst.mts
23
  import { getRenderedScene, renderScene } from "./production/renderScene.mts"
24
  import { parseRenderRequest } from "./utils/parseRenderRequest.mts"
25
  import { loadRenderedSceneFromCache } from "./utils/loadRenderedSceneFromCache.mts"
 
26
 
27
  initFolders()
28
  // to disable all processing (eg. to debug)
@@ -36,6 +37,56 @@ app.use(express.json())
36
 
37
  let isRendering = false
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  // a "fast track" pipeline
40
  app.post("/render", async (req, res) => {
41
 
@@ -227,7 +278,6 @@ app.post("/:ownerId", async (req, res) => {
227
  }
228
  })
229
 
230
-
231
  app.get("/:ownerId/:videoId\.mp4", async (req, res) => {
232
 
233
  /*
 
4
  import { validate as uuidValidate } from "uuid"
5
  import express from "express"
6
 
7
+ import { Video, VideoStatus, VideoAPIRequest, RenderRequest, RenderedScene, ImageAnalysisRequest, ImageAnalysisResponse } from "./types.mts"
8
 
9
  import { parseVideoRequest } from "./utils/parseVideoRequest.mts"
10
  import { savePendingVideo } from "./scheduler/savePendingVideo.mts"
 
23
  import { getRenderedScene, renderScene } from "./production/renderScene.mts"
24
  import { parseRenderRequest } from "./utils/parseRenderRequest.mts"
25
  import { loadRenderedSceneFromCache } from "./utils/loadRenderedSceneFromCache.mts"
26
+ import { analyzeImage } from "./analysis/analyzeImage.mts"
27
 
28
  initFolders()
29
  // to disable all processing (eg. to debug)
 
37
 
38
  let isRendering = false
39
 
40
// an image analyzing pipeline
//
// Expects a JSON body shaped like ImageAnalysisRequest ({ image, prompt }).
// Replies with a JSON-encoded ImageAnalysisResponse:
// - 400 when the prompt or the image is missing
// - 500 when the analysis throws
// - 200 with the analysis result otherwise
app.post("/analyze", async (req, res) => {

  const request = (req.body ?? {}) as Partial<ImageAnalysisRequest>

  // log a summary only: the image is a base64 payload and would flood the logs
  console.log("analyze request:", {
    prompt: request.prompt,
    imageLength: request.image?.length ?? 0,
  })

  // single place where a response is serialized and the request is closed
  const reply = (status: number, payload: ImageAnalysisResponse) => {
    res.status(status)
    res.write(JSON.stringify(payload))
    res.end()
  }

  if (!request.prompt) {
    console.log("Invalid prompt")
    reply(400, { result: "", error: "invalid prompt" })
    return
  }

  if (!request.image) {
    console.log("Invalid image")
    reply(400, { result: "", error: "invalid image" })
    return
  }

  const response: ImageAnalysisResponse = {
    result: "",
    error: ""
  }

  try {
    response.result = await analyzeImage(request.image, request.prompt)
  } catch (err) {
    response.error = `failed to analyze image: ${err}`
  }

  // `error` is optional in ImageAnalysisResponse, so test for truthiness
  // rather than reading `.length` on a possibly undefined value
  reply(response.error ? 500 : 200, response)
})
89
+
90
  // a "fast track" pipeline
91
  app.post("/render", async (req, res) => {
92
 
 
278
  }
279
  })
280
 
 
281
  app.get("/:ownerId/:videoId\.mp4", async (req, res) => {
282
 
283
  /*
src/types.mts CHANGED
@@ -310,6 +310,17 @@ export interface RenderRequest {
310
  wait: boolean // wait until the job is completed
311
  }
312
 
 
 
 
 
 
 
 
 
 
 
 
313
  export interface ImageSegmentationRequest {
314
  image: string // in base64
315
  keywords: string[]
 
310
  wait: boolean // wait until the job is completed
311
  }
312
 
313
/**
 * Payload describing an image analysis job: the image to look at and the
 * question or instruction to apply to it.
 */
export interface ImageAnalysisRequest {
  /** image to analyze, in base64 */
  image: string

  /** text prompt sent along with the image */
  prompt: string
}
317
+
318
/**
 * Outcome of an image analysis job.
 */
export interface ImageAnalysisResponse {
  /** text produced by the analysis */
  result: string

  /** optional error message */
  error?: string
}
322
+
323
+
324
  export interface ImageSegmentationRequest {
325
  image: string // in base64
326
  keywords: string[]