jbilcke-hf (HF Staff) committed
Commit 92eacee · 1 Parent(s): 4905fb4

let's work on megadatasets

CLAUDE.md ADDED
@@ -0,0 +1,18 @@
+ # Video Model Studio - Guidelines for Claude
+
+ ## Build & Run Commands
+ - Setup: `./setup.sh` (with flash attention) or `./setup_no_captions.sh` (without)
+ - Run: `./run.sh` or `python3.10 app.py`
+ - Test: `python3 tests/test_dataset.py`
+ - Single model test: `bash tests/scripts/dummy_cogvideox_lora.sh`
+
+ ## Code Style
+ - Python version: 3.10 (required for flash-attention compatibility)
+ - Type hints: Use typing module annotations for all functions
+ - Docstrings: Google style with Args/Returns sections
+ - Error handling: Use try/except with specific exceptions, log errors
+ - Imports: Group standard lib, third-party, and project imports
+ - Naming: snake_case for functions/variables, PascalCase for classes
+ - Use Path objects from pathlib instead of string paths
+ - Format utility functions: Extract reusable logic to separate functions
+ - Environment variables: Use parse_bool_env for boolean env vars
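For illustration, here is a short sketch (not part of this commit) of a helper written to the conventions above: typing annotations, a Google-style docstring, pathlib paths, and a specific exception handled with logging. The function name and behavior are hypothetical.

```python
import logging
from pathlib import Path
from typing import List

logger = logging.getLogger(__name__)


def list_video_files(directory: Path) -> List[Path]:
    """List video files in a directory (hypothetical example helper).

    Args:
        directory: Directory to scan, as a pathlib.Path.

    Returns:
        Sorted list of .mp4 paths, or an empty list if the scan fails.
    """
    try:
        # glob iterates lazily; sorted() forces the filesystem scan here
        return sorted(directory.glob("*.mp4"))
    except OSError as exc:
        logger.error("Could not scan %s: %s", directory, exc)
        return []
```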
README.md CHANGED
@@ -217,3 +217,15 @@ By default `run.sh` will store stuff in `.data/` (located inside the current wor
  ```bash
  ./run.sh
  ```
+
+ ### Environment Variables
+
+ - `STORAGE_PATH`: Specifies the base storage path (default: '.data')
+ - `HF_API_TOKEN`: Your Hugging Face API token for accessing models and publishing
+ - `USE_LARGE_DATASET`: Set to "true" or "1" to enable large dataset mode, which:
+   - Hides the caption list in the caption tab
+   - Disables preview and editing of individual captions
+   - Disables the dataset download button
+   - Use this when working with large datasets that would be too slow to display in the UI
+ - `PRELOAD_CAPTIONING_MODEL`: Preloads the captioning model at startup
+ - `ASK_USER_TO_DUPLICATE_SPACE`: Prompts users to duplicate the space
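As an illustrative sketch (not part of the commit), these variables can also be set from Python before the configuration module is imported; vms/config.py reads the environment at import time, so ordering matters. The import path mirrors the vms/config.py file shown below.

```python
# Illustrative only: set the variables before vms.config is imported,
# since that module evaluates os.getenv(...) at import time.
import os

os.environ["USE_LARGE_DATASET"] = "true"   # "1" also works, per the README
os.environ["STORAGE_PATH"] = ".data"       # the default, shown explicitly here

from vms import config  # noqa: E402  (deliberately after the env setup)

print(config.USE_LARGE_DATASET)  # True
print(config.STORAGE_PATH)       # .data (a pathlib.Path)
```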
vms/config.py CHANGED
@@ -19,6 +19,9 @@ def parse_bool_env(env_value: Optional[str]) -> bool:
  HF_API_TOKEN = os.getenv("HF_API_TOKEN")
  ASK_USER_TO_DUPLICATE_SPACE = parse_bool_env(os.getenv("ASK_USER_TO_DUPLICATE_SPACE"))

+ # For large datasets that would be slow to display or download
+ USE_LARGE_DATASET = parse_bool_env(os.getenv("USE_LARGE_DATASET"))
+
  # Base storage path
  STORAGE_PATH = Path(os.environ.get('STORAGE_PATH', '.data'))

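The new flag reuses parse_bool_env, whose implementation is not part of this diff. A minimal sketch of how such a helper plausibly behaves, assuming it treats "true" and "1" (case-insensitive) as truthy, consistent with the README wording above:

```python
from typing import Optional


def parse_bool_env(env_value: Optional[str]) -> bool:
    """Parse a boolean-like environment variable (assumed behavior, not the actual code).

    Args:
        env_value: Raw value from os.getenv, or None if the variable is unset.

    Returns:
        True for values such as "1" or "true" (case-insensitive), False otherwise.
    """
    if env_value is None:
        return False
    return env_value.strip().lower() in ("1", "true", "yes")
```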
vms/ui/project/tabs/caption_tab.py CHANGED
@@ -11,7 +11,7 @@ from pathlib import Path
  import mimetypes

  from vms.utils import BaseTab, is_image_file, is_video_file, copy_files_to_training_dir
- from vms.config import DEFAULT_CAPTIONING_BOT_INSTRUCTIONS, DEFAULT_PROMPT_PREFIX, STAGING_PATH, TRAINING_VIDEOS_PATH
+ from vms.config import DEFAULT_CAPTIONING_BOT_INSTRUCTIONS, DEFAULT_PROMPT_PREFIX, STAGING_PATH, TRAINING_VIDEOS_PATH, USE_LARGE_DATASET

  logger = logging.getLogger(__name__)

@@ -28,7 +28,10 @@ class CaptionTab(BaseTab):
          """Create the Caption tab UI components"""
          with gr.TabItem(self.title, id=self.id) as tab:
              with gr.Row():
-                 self.components["caption_title"] = gr.Markdown("## Captioning of 0 files (0 bytes)")
+                 if USE_LARGE_DATASET:
+                     self.components["caption_title"] = gr.Markdown("## Captioning (Large Dataset Mode)")
+                 else:
+                     self.components["caption_title"] = gr.Markdown("## Captioning of 0 files (0 bytes)")

              with gr.Row():
                  with gr.Column():
@@ -62,7 +65,7 @@
                      interactive=False
                  )

-             with gr.Row():
+             with gr.Row(visible=not USE_LARGE_DATASET):
                  with gr.Column():
                      self.components["training_dataset"] = gr.Dataframe(
                          headers=["name", "status"],
@@ -95,6 +98,10 @@
                          visible=True
                      )
              self.components["original_file_path"] = gr.State(value=None)
+
+             with gr.Row(visible=USE_LARGE_DATASET):
+                 gr.Markdown("### Large Dataset Mode Active")
+                 gr.Markdown("Caption preview and editing is disabled to improve performance with large datasets.")

          return tab

@@ -174,10 +181,16 @@

      def refresh(self) -> Dict[str, Any]:
          """Refresh the dataset list with current data"""
-         training_dataset = self.list_training_files_to_caption()
-         return {
-             "training_dataset": training_dataset
-         }
+         if USE_LARGE_DATASET:
+             # In large dataset mode, we don't attempt to list files
+             return {
+                 "training_dataset": [["Large dataset mode enabled", "listing skipped"]]
+             }
+         else:
+             training_dataset = self.list_training_files_to_caption()
+             return {
+                 "training_dataset": training_dataset
+             }

      def show_refreshing_status(self) -> List[List[str]]:
          """Show a 'Refreshing...' status in the dataframe"""
@@ -318,6 +331,10 @@

      def list_training_files_to_caption(self) -> List[List[str]]:
          """List all clips and images - both pending and captioned"""
+         # In large dataset mode, return a placeholder message instead of listing all files
+         if USE_LARGE_DATASET:
+             return [["Large dataset mode enabled", "listing skipped"]]
+
          files = []
          already_listed = {}

vms/ui/project/tabs/manage_tab.py CHANGED
@@ -11,7 +11,7 @@ from typing import Dict, Any, List, Optional
  from vms.utils import BaseTab, validate_model_repo
  from vms.config import (
      HF_API_TOKEN, VIDEOS_TO_SPLIT_PATH, STAGING_PATH, TRAINING_VIDEOS_PATH,
-     TRAINING_PATH, MODEL_PATH, OUTPUT_PATH, LOG_FILE_PATH
+     TRAINING_PATH, MODEL_PATH, OUTPUT_PATH, LOG_FILE_PATH, USE_LARGE_DATASET
  )

  logger = logging.getLogger(__name__)
@@ -36,8 +36,13 @@ class ManageTab(BaseTab):
                  self.components["download_dataset_btn"] = gr.DownloadButton(
                      "📦 Download training dataset (.zip)",
                      variant="secondary",
-                     size="lg"
+                     size="lg",
+                     visible=not USE_LARGE_DATASET
                  )
+                 # If we have a large dataset, display a message explaining why download is disabled
+                 if USE_LARGE_DATASET:
+                     gr.Markdown("📦 Training dataset download disabled for large datasets")
+
                  self.components["download_model_btn"] = gr.DownloadButton(
                      "🧠 Download weights (.safetensors)",
                      variant="secondary",
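Both tabs gate visibility at construction time: components are created with visible= derived from USE_LARGE_DATASET, so the layout is fixed when the UI is built and changing the flag requires restarting the app. A minimal standalone sketch of this pattern (illustrative, not the project's actual code):

```python
import gradio as gr

# Stand-in for the flag imported from vms.config in the real code.
USE_LARGE_DATASET = True

with gr.Blocks() as demo:
    # Shown only when the flag is off: the full dataset table and download button.
    with gr.Row(visible=not USE_LARGE_DATASET):
        gr.Dataframe(headers=["name", "status"], interactive=False)
        gr.DownloadButton("📦 Download training dataset (.zip)", variant="secondary")
    # Shown only when the flag is on: a short explanation instead of heavy widgets.
    with gr.Row(visible=USE_LARGE_DATASET):
        gr.Markdown("### Large Dataset Mode Active")

demo.launch()
```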