import gradio as gr
from pathlib import Path
import logging
import shutil
from typing import Any, Optional, Dict, List, Union, Tuple
from config import STORAGE_PATH, TRAINING_PATH, STAGING_PATH, TRAINING_VIDEOS_PATH, MODEL_PATH, OUTPUT_PATH, HF_API_TOKEN, MODEL_TYPES
from utils import extract_scene_info, make_archive, is_image_file, is_video_file

logger = logging.getLogger(__name__)

def prepare_finetrainers_dataset() -> Tuple[Path, Path]:
    """make sure we have a Finetrainers-compatible dataset structure
    
    Checks that we have:
        training/
        β”œβ”€β”€ prompt.txt       # All captions, one per line
        β”œβ”€β”€ videos.txt       # All video paths, one per line
        └── videos/          # Directory containing all mp4 files
            β”œβ”€β”€ 00000.mp4
            β”œβ”€β”€ 00001.mp4
            └── ...
    Returns:
        Tuple of (videos_file_path, prompts_file_path)
    """

    # Make sure the videos subdirectory exists
    TRAINING_VIDEOS_PATH.mkdir(parents=True, exist_ok=True)
    
    # Clear out any previously generated training lists
    for f in TRAINING_PATH.glob("*"):
        if f.is_file() and f.name in ("videos.txt", "prompts.txt"):
            f.unlink()
    
    videos_file = TRAINING_PATH / "videos.txt"
    prompts_file = TRAINING_PATH / "prompts.txt"  # Finetrainers' default name is prompt.txt, but we use prompts.txt to match our config
    
    media_files = []
    captions = []
    # Process all video files from the videos subdirectory
    for file in sorted(TRAINING_VIDEOS_PATH.glob("*.mp4")):
        caption_file = file.with_suffix('.txt')
        if caption_file.exists():
            # Normalize caption to single line
            caption = caption_file.read_text().strip()
            caption = ' '.join(caption.split())
            
            # Use relative path from training root
            relative_path = f"videos/{file.name}"
            media_files.append(relative_path)
            captions.append(caption)
            
            # Keep the per-clip caption file on disk: its text is already folded into
            # prompts.txt, but deleting it would make a second run of this function
            # find no caption for the clip and silently drop it.
            # caption_file.unlink()

    # Write the files only if we found at least one valid pair
    if media_files and captions:
        videos_file.write_text('\n'.join(media_files))
        prompts_file.write_text('\n'.join(captions))
    else:
        raise ValueError("No valid video/caption pairs found in training directory")

    # Verify that the two generated files stay in sync
    with open(videos_file) as vf:
        video_lines = [l.strip() for l in vf.readlines() if l.strip()]
    with open(prompts_file) as pf:
        prompt_lines = [l.strip() for l in pf.readlines() if l.strip()]
        
    if len(video_lines) != len(prompt_lines):
        raise ValueError(f"Mismatch in generated files: {len(video_lines)} videos vs {len(prompt_lines)} prompts")
        
    return videos_file, prompts_file
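
# Minimal, illustrative consumer of the files generated above (a sketch, not
# something the app calls): it re-reads videos.txt and prompts.txt and pairs
# each relative video path with its caption. The helper name
# `_iter_training_pairs` is hypothetical.
def _iter_training_pairs() -> List[Tuple[str, str]]:
    """Return (relative video path, caption) pairs from the prepared dataset."""
    videos_file = TRAINING_PATH / "videos.txt"
    prompts_file = TRAINING_PATH / "prompts.txt"
    video_lines = [l.strip() for l in videos_file.read_text().splitlines() if l.strip()]
    prompt_lines = [l.strip() for l in prompts_file.read_text().splitlines() if l.strip()]
    return list(zip(video_lines, prompt_lines))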

def copy_files_to_training_dir(prompt_prefix: str) -> int:
    """Just copy files over, with no destruction"""

    gr.Info("Copying assets to the training dataset..")

    # Collect all staged media files (videos and images)
    video_files = list(STAGING_PATH.glob("*.mp4"))
    image_files = [f for f in STAGING_PATH.glob("*") if is_image_file(f)]
    all_files = video_files + image_files
    
    nb_copied_pairs = 0

    for file_path in all_files:

        caption = ""
        file_caption_path = file_path.with_suffix('.txt')
        if file_caption_path.exists():
            logger.debug(f"Found caption file: {file_caption_path}")
            caption = file_caption_path.read_text().strip()  # strip so the prefix/suffix checks below behave predictably

        # Get the parent caption if this file is a clip cut out of a larger scene
        parent_caption = ""
        if "___" in file_path.stem:
            parent_name, _ = extract_scene_info(file_path.stem)
            parent_caption_path = STAGING_PATH / f"{parent_name}.txt"
            if parent_caption_path.exists():
                logger.debug(f"Found parent caption file: {parent_caption_path}")
                parent_caption = parent_caption_path.read_text().strip()

        target_file_path = TRAINING_VIDEOS_PATH / file_path.name

        target_caption_path = target_file_path.with_suffix('.txt')

        if parent_caption and not caption.endswith(parent_caption):
            caption = f"{caption}\n{parent_caption}"

        if prompt_prefix and not caption.startswith(prompt_prefix):
            caption = f"{prompt_prefix}{caption}"
            
        # Only copy over valid pairs (a media file with a non-empty caption)
        if caption:
            target_caption_path.write_text(caption)
            shutil.copy2(file_path, target_file_path)
            nb_copied_pairs += 1

    prepare_finetrainers_dataset()

    gr.Info(f"Successfully generated the training dataset ({nb_copied_pairs} pairs)")

    return nb_copied_pairs
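
# Minimal manual-run sketch (assumption: in the real app these functions are
# presumably triggered from the Gradio UI, where gr.Info() can actually display;
# outside a UI context the gr.Info calls may have no visible effect). The prefix
# value below is only an example.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    pairs = copy_files_to_training_dir(prompt_prefix="in the style of XYZ, ")
    logger.info(f"Training dataset ready in {TRAINING_PATH} ({pairs} pairs)")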