jbilcke-hf (HF Staff) committed
Commit 92eacee · 1 Parent(s): 4905fb4

let's work on megadatasets

CLAUDE.md ADDED
@@ -0,0 +1,18 @@
+ # Video Model Studio - Guidelines for Claude
+
+ ## Build & Run Commands
+ - Setup: `./setup.sh` (with flash attention) or `./setup_no_captions.sh` (without)
+ - Run: `./run.sh` or `python3.10 app.py`
+ - Test: `python3 tests/test_dataset.py`
+ - Single model test: `bash tests/scripts/dummy_cogvideox_lora.sh`
+
+ ## Code Style
+ - Python version: 3.10 (required for flash-attention compatibility)
+ - Type hints: Use typing module annotations for all functions
+ - Docstrings: Google style with Args/Returns sections
+ - Error handling: Use try/except with specific exceptions, log errors
+ - Imports: Group standard lib, third-party, and project imports
+ - Naming: snake_case for functions/variables, PascalCase for classes
+ - Use Path objects from pathlib instead of string paths
+ - Format utility functions: Extract reusable logic to separate functions
+ - Environment variables: Use parse_bool_env for boolean env vars
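For illustration, here is a short sketch (not part of this commit) of a helper written to the conventions above: typing annotations, a Google-style docstring, pathlib paths, and a specific exception handled with logging. The function name and behavior are hypothetical.

```python
import logging
from pathlib import Path
from typing import List

logger = logging.getLogger(__name__)


def list_video_files(directory: Path) -> List[Path]:
    """List video files in a directory (hypothetical example helper).

    Args:
        directory: Directory to scan, as a pathlib.Path.

    Returns:
        Sorted list of .mp4 paths, or an empty list if the scan fails.
    """
    try:
        # glob iterates lazily; sorted() forces the filesystem scan here
        return sorted(directory.glob("*.mp4"))
    except OSError as exc:
        logger.error("Could not scan %s: %s", directory, exc)
        return []
```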
README.md CHANGED
@@ -217,3 +217,15 @@ By default `run.sh` will store stuff in `.data/` (located inside the current wor
  ```bash
  ./run.sh
  ```
+
+ ### Environment Variables
+
+ - `STORAGE_PATH`: Specifies the base storage path (default: '.data')
+ - `HF_API_TOKEN`: Your Hugging Face API token for accessing models and publishing
+ - `USE_LARGE_DATASET`: Set to "true" or "1" to enable large dataset mode, which:
+   - Hides the caption list in the caption tab
+   - Disables preview and editing of individual captions
+   - Disables the dataset download button
+   - Use this when working with large datasets that would be too slow to display in the UI
+ - `PRELOAD_CAPTIONING_MODEL`: Preloads the captioning model at startup
+ - `ASK_USER_TO_DUPLICATE_SPACE`: Prompts users to duplicate the space
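As an illustrative sketch (not part of the commit), these variables can also be set from Python before the configuration module is imported; vms/config.py reads the environment at import time, so ordering matters. The import path mirrors the vms/config.py file shown below.

```python
# Illustrative only: set the variables before vms.config is imported,
# since that module evaluates os.getenv(...) at import time.
import os

os.environ["USE_LARGE_DATASET"] = "true"   # "1" also works, per the README
os.environ["STORAGE_PATH"] = ".data"       # the default, shown explicitly here

from vms import config  # noqa: E402  (deliberately after the env setup)

print(config.USE_LARGE_DATASET)  # True
print(config.STORAGE_PATH)       # .data (a pathlib.Path)
```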
vms/config.py CHANGED
@@ -19,6 +19,9 @@ def parse_bool_env(env_value: Optional[str]) -> bool:
  HF_API_TOKEN = os.getenv("HF_API_TOKEN")
  ASK_USER_TO_DUPLICATE_SPACE = parse_bool_env(os.getenv("ASK_USER_TO_DUPLICATE_SPACE"))

+ # For large datasets that would be slow to display or download
+ USE_LARGE_DATASET = parse_bool_env(os.getenv("USE_LARGE_DATASET"))
+
  # Base storage path
  STORAGE_PATH = Path(os.environ.get('STORAGE_PATH', '.data'))

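The new flag reuses parse_bool_env, whose implementation is not part of this diff. A minimal sketch of how such a helper plausibly behaves, assuming it treats "true" and "1" (case-insensitive) as truthy, consistent with the README wording above:

```python
from typing import Optional


def parse_bool_env(env_value: Optional[str]) -> bool:
    """Parse a boolean-like environment variable (assumed behavior, not the actual code).

    Args:
        env_value: Raw value from os.getenv, or None if the variable is unset.

    Returns:
        True for values such as "1" or "true" (case-insensitive), False otherwise.
    """
    if env_value is None:
        return False
    return env_value.strip().lower() in ("1", "true", "yes")
```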
vms/ui/project/tabs/caption_tab.py CHANGED
@@ -11,7 +11,7 @@ from pathlib import Path
  import mimetypes

  from vms.utils import BaseTab, is_image_file, is_video_file, copy_files_to_training_dir
- from vms.config import DEFAULT_CAPTIONING_BOT_INSTRUCTIONS, DEFAULT_PROMPT_PREFIX, STAGING_PATH, TRAINING_VIDEOS_PATH
+ from vms.config import DEFAULT_CAPTIONING_BOT_INSTRUCTIONS, DEFAULT_PROMPT_PREFIX, STAGING_PATH, TRAINING_VIDEOS_PATH, USE_LARGE_DATASET

  logger = logging.getLogger(__name__)

@@ -28,7 +28,10 @@ class CaptionTab(BaseTab):
          """Create the Caption tab UI components"""
          with gr.TabItem(self.title, id=self.id) as tab:
              with gr.Row():
-                 self.components["caption_title"] = gr.Markdown("## Captioning of 0 files (0 bytes)")
+                 if USE_LARGE_DATASET:
+                     self.components["caption_title"] = gr.Markdown("## Captioning (Large Dataset Mode)")
+                 else:
+                     self.components["caption_title"] = gr.Markdown("## Captioning of 0 files (0 bytes)")

              with gr.Row():
                  with gr.Column():
@@ -62,7 +65,7 @@
                      interactive=False
                  )

-             with gr.Row():
+             with gr.Row(visible=not USE_LARGE_DATASET):
                  with gr.Column():
                      self.components["training_dataset"] = gr.Dataframe(
                          headers=["name", "status"],
@@ -95,6 +98,10 @@
                          visible=True
                      )
              self.components["original_file_path"] = gr.State(value=None)
+
+             with gr.Row(visible=USE_LARGE_DATASET):
+                 gr.Markdown("### Large Dataset Mode Active")
+                 gr.Markdown("Caption preview and editing is disabled to improve performance with large datasets.")

          return tab

@@ -174,10 +181,16 @@

      def refresh(self) -> Dict[str, Any]:
          """Refresh the dataset list with current data"""
-         training_dataset = self.list_training_files_to_caption()
-         return {
-             "training_dataset": training_dataset
-         }
+         if USE_LARGE_DATASET:
+             # In large dataset mode, we don't attempt to list files
+             return {
+                 "training_dataset": [["Large dataset mode enabled", "listing skipped"]]
+             }
+         else:
+             training_dataset = self.list_training_files_to_caption()
+             return {
+                 "training_dataset": training_dataset
+             }

      def show_refreshing_status(self) -> List[List[str]]:
          """Show a 'Refreshing...' status in the dataframe"""
@@ -318,6 +331,10 @@

      def list_training_files_to_caption(self) -> List[List[str]]:
          """List all clips and images - both pending and captioned"""
+         # In large dataset mode, return a placeholder message instead of listing all files
+         if USE_LARGE_DATASET:
+             return [["Large dataset mode enabled", "listing skipped"]]
+
          files = []
          already_listed = {}

vms/ui/project/tabs/manage_tab.py CHANGED
@@ -11,7 +11,7 @@ from typing import Dict, Any, List, Optional
  from vms.utils import BaseTab, validate_model_repo
  from vms.config import (
      HF_API_TOKEN, VIDEOS_TO_SPLIT_PATH, STAGING_PATH, TRAINING_VIDEOS_PATH,
-     TRAINING_PATH, MODEL_PATH, OUTPUT_PATH, LOG_FILE_PATH
+     TRAINING_PATH, MODEL_PATH, OUTPUT_PATH, LOG_FILE_PATH, USE_LARGE_DATASET
  )

  logger = logging.getLogger(__name__)
@@ -36,8 +36,13 @@ class ManageTab(BaseTab):
                  self.components["download_dataset_btn"] = gr.DownloadButton(
                      "📦 Download training dataset (.zip)",
                      variant="secondary",
-                     size="lg"
+                     size="lg",
+                     visible=not USE_LARGE_DATASET
                  )
+                 # If we have a large dataset, display a message explaining why download is disabled
+                 if USE_LARGE_DATASET:
+                     gr.Markdown("📦 Training dataset download disabled for large datasets")
+
                  self.components["download_model_btn"] = gr.DownloadButton(
                      "🧠 Download weights (.safetensors)",
                      variant="secondary",
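Both tabs gate visibility at construction time: components are created with visible= derived from USE_LARGE_DATASET, so the layout is fixed when the UI is built and changing the flag requires restarting the app. A minimal standalone sketch of this pattern (illustrative, not the project's actual code):

```python
import gradio as gr

# Stand-in for the flag imported from vms.config in the real code.
USE_LARGE_DATASET = True

with gr.Blocks() as demo:
    # Shown only when the flag is off: the full dataset table and download button.
    with gr.Row(visible=not USE_LARGE_DATASET):
        gr.Dataframe(headers=["name", "status"], interactive=False)
        gr.DownloadButton("📦 Download training dataset (.zip)", variant="secondary")
    # Shown only when the flag is on: a short explanation instead of heavy widgets.
    with gr.Row(visible=USE_LARGE_DATASET):
        gr.Markdown("### Large Dataset Mode Active")

demo.launch()
```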