jbilcke committed
Commit 566d763 · 1 Parent(s): 2156c54

update clap spec
src/clap/clap-specification-draft.md ADDED
@@ -0,0 +1,147 @@
# CLAP Format Specification

Status: DRAFT
Document revision: 0.0.1
Last updated: Feb 6th, 2024

## BEFORE YOU READ

The CLAP format spec is experimental and not finished yet!
There might be inconsistencies, unnecessary redundancies or blatant omissions.

## What are CLAP files?

The CLAP format (.clap) is a video project format used to store information about a generative AI video.

It preserves prompts and assets in the same container, making it easier to share a project between different people or applications.

## Structure

A CLAP file is an array of objects serialized into a YAML text string, then compressed with gzip into a binary file.

The file extension is `.clap`.
The mime type is `application/x-yaml`.

There can be 5 different types of objects:

- one HEADER
- one METADATA
- zero, one or more MODEL(s)
- zero, one or more SCENE(s)
- zero, one or more SEGMENT(s)

This can be represented in JavaScript like this:

```javascript
const entries = [
  clapHeader,
  clapMeta,
  ...clapModels,
  ...clapScenes,
  ...clapSegments
]
```
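
For illustration, here is a minimal sketch of this round trip, assuming the `js-yaml` package and Node's built-in `zlib` module (the function names are illustrative, not part of the spec; the actual implementation lives in `parseClap.ts` and `serializeClap.ts` and may differ):

```typescript
// Minimal sketch of the .clap container encoding (not the actual implementation).
// Assumes the "js-yaml" package and Node's built-in zlib module.
import { dump, load } from "js-yaml"
import { gzipSync, gunzipSync } from "zlib"

// serialize the array of objects to a YAML string, then gzip it into a binary buffer
function encodeClap(entries: unknown[]): Buffer {
  return gzipSync(Buffer.from(dump(entries), "utf8"))
}

// reverse the two steps: gunzip, then parse the YAML back into objects
function decodeClap(data: Buffer): unknown[] {
  return load(gunzipSync(data).toString("utf8")) as unknown[]
}
```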

## Header

The HEADER provides information about how to decode a CLAP.

Knowing the number of models, scenes and segments in advance helps the decoder parse the information,
and in some implementations it helps with debugging, logging, and provisioning memory usage.

However, in the future a different scheme may be used in order to support streaming:
either by recognizing the shape of each object (its fields), or by using a dedicated field, e.g. a `_type`.

```typescript
{
  // used to know which format version is used.
  // CLAP is still in development and the format is not fully specified yet,
  // so during this period most .clap files will have the "clap-0" format
  format: "clap-0"

  numberOfModels: number // integer
  numberOfScenes: number // integer
  numberOfSegments: number // integer
}
```
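
For example, a decoder could use these counts to split the flat array of entries back into its parts. This is a hypothetical sketch (reusing the illustrative `decodeClap` from the Structure section, given some gzipped input `clapBlob`), not the actual `parseClap.ts` code:

```typescript
// Hypothetical decoding step: the header counts tell us where each
// group of objects ends inside the flat entries array.
const [clapHeader, clapMeta, ...rest] = decodeClap(clapBlob) as any[]

const clapModels = rest.slice(0, clapHeader.numberOfModels)
const clapScenes = rest.slice(
  clapHeader.numberOfModels,
  clapHeader.numberOfModels + clapHeader.numberOfScenes
)
const clapSegments = rest.slice(
  clapHeader.numberOfModels + clapHeader.numberOfScenes,
  clapHeader.numberOfModels + clapHeader.numberOfScenes + clapHeader.numberOfSegments
)
```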

## Metadata

```typescript
{
  id: string // "<a valid UUID V4>"
  title: string // "project title"
  description: string // "project description"
  licence: string // "information about licensing"

  // this provides information about the image ratio
  // this might be removed in the final spec, as this
  // can be re-computed from width and height
  orientation: "landscape" | "vertical" | "square"

  // the suggested width and height of the video
  // note that this is just an indicator,
  // and might be superseded by the application reading the .clap file
  width: number // integer between 256 and 8192 (value in pixels)
  height: number // integer between 256 and 8192 (value in pixels)

  // name of the suggested video model to use
  // note that this is just an indicator,
  // and might be superseded by the application reading the .clap file
  defaultVideoModel: string

  // additional prompts to use in the video generation
  // this helps adding some magic touch and flair to the videos,
  // but perhaps the field should be renamed
  extraPositivePrompt: string[]

  // the screenplay (script) of the video
  screenplay: string
}
```
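
Since `orientation` can be recomputed from the dimensions, an application might derive it like this (an illustrative helper, not part of the spec):

```typescript
// Illustrative helper: recompute the orientation field from width and height
function computeOrientation(width: number, height: number): "landscape" | "vertical" | "square" {
  if (width === height) { return "square" }
  return width > height ? "landscape" : "vertical"
}
```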

## Models

Before talking about models, we should first describe the concept of an entity:

in a story, an entity is something (a person, place, vehicle, animal, robot, alien, or object) with a name, a description of its appearance, an age, mileage or quality, an origin, and so on.

An example could be "a giant magical school bus, with the appearance of a cat with wheels, and which talks".

A CLAP model is an instance (an interpretation) of this entity, to which we assign an identity:
- a name and age
- a visual style (a photo of the magic school bus cat)
- a voice style
- and maybe other things, e.g. an origin or background story

As you can see, it can be difficult to create clearly separated categories like "vehicle", "character", or "location"
(the magical cat bus could turn into a location in one scene, a speaking character in another, etc.)

This is why there is a common schema for all models (see the example instance after this schema):

```typescript
{
  id: string
  category: ClapSegmentCategory
  triggerName: string
  label: string
  description: string
  author: string
  thumbnailUrl: string
  seed: number

  assetSourceType: ClapAssetSource
  assetUrl: string

  age: number
  gender: ClapModelGender
  region: ClapModelRegion
  appearance: ClapModelAppearance
  voiceVendor: ClapVoiceVendor
  voiceId: string
}
```
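
For instance, the magical school bus entity from earlier might be instantiated as a model along these lines. All field values below are illustrative placeholders; the actual enum values for `category`, `assetSourceType`, `gender`, `region`, `appearance` and `voiceVendor` are defined in `./types.ts` and may differ:

```typescript
// Illustrative placeholder values only: the real enum values
// are defined in ./types.ts and may differ.
const magicBusModel = {
  id: "9a7b3c1e-...", // a valid UUID V4
  category: "character", // placeholder ClapSegmentCategory value
  triggerName: "magic_bus_cat",
  label: "Magic School Bus Cat",
  description: "a giant magical school bus, with the appearance of a cat with wheels, and which talks",
  author: "...",
  thumbnailUrl: "...",
  seed: 42,

  assetSourceType: "DATA", // placeholder ClapAssetSource value
  assetUrl: "data:image/png;base64,...",

  age: 7,
  gender: "object", // placeholder ClapModelGender value
  region: "american", // placeholder ClapModelRegion value
  appearance: "serious", // placeholder ClapModelAppearance value
  voiceVendor: "ElevenLabs", // placeholder ClapVoiceVendor value
  voiceId: "...",
}
```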

TO BE CONTINUED

(you can read "./types.ts" for more information)
src/clap/parseClap.ts CHANGED
@@ -50,7 +50,8 @@ export async function parseClap(inputStringOrBlob: string | Blob): Promise<ClapP
   width: getValidNumber(maybeClapMeta.width, 256, 8192, 1024),
   height: getValidNumber(maybeClapMeta.height, 256, 8192, 576),
   defaultVideoModel: typeof maybeClapMeta.defaultVideoModel === "string" ? maybeClapMeta.defaultVideoModel : "SVD",
-  extraPositivePrompt: Array.isArray(maybeClapMeta.extraPositivePrompt) ? maybeClapMeta.extraPositivePrompt : []
+  extraPositivePrompt: Array.isArray(maybeClapMeta.extraPositivePrompt) ? maybeClapMeta.extraPositivePrompt : [],
+  screenplay: typeof maybeClapMeta.screenplay === "string" ? maybeClapMeta.screenplay : "",
 }

 /*
src/clap/serializeClap.ts CHANGED
@@ -130,6 +130,7 @@ export async function serializeClap({
   height: getValidNumber(meta.height, 256, 8192, 576),
   defaultVideoModel: typeof meta.defaultVideoModel === "string" ? meta.defaultVideoModel : "SVD",
   extraPositivePrompt: Array.isArray(meta.extraPositivePrompt) ? meta.extraPositivePrompt : [],
+  screenplay: typeof meta.screenplay === "string" ? meta.screenplay : "",
 }

 const entries = [
src/clap/types.ts CHANGED
@@ -84,6 +84,7 @@ export type ClapMeta = {
   height: number
   defaultVideoModel: string
   extraPositivePrompt: string[]
+  screenplay: string
 }

 export type ClapSceneEvent = {