File size: 9,288 Bytes
91fb4ef
 
a529bb7
91fb4ef
 
 
 
a529bb7
91fb4ef
 
a529bb7
 
91fb4ef
 
 
 
 
 
 
 
 
 
 
 
a529bb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91fb4ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
import os
import shutil
import subprocess
from huggingface_hub import HfApi, create_repo
from pathlib import Path
import json
import re
import logging
from typing import Any, Optional, Dict, List, Union, Tuple

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

def make_archive(source: str | Path, destination: str | Path):
    source = str(source)
    destination = str(destination)
    #print(f"make_archive({source}, {destination})")
    base = os.path.basename(destination)
    name = base.split('.')[0]
    format = base.split('.')[1]
    archive_from = os.path.dirname(source)
    archive_to = os.path.basename(source.strip(os.sep))
    shutil.make_archive(name, format, archive_from, archive_to)
    shutil.move('%s.%s'%(name,format), destination)

def get_video_fps(video_path: Path) -> Optional[str]:
    """Get FPS information from video file using ffprobe

    Runs ffprobe on the first video stream and reads its avg_frame_rate.
    Any failure (ffprobe missing, non-zero exit, empty or unparsable output)
    is logged as a warning and reported as None; this function does not raise.

    Args:
        video_path: Path to video file

    Returns:
        FPS string (e.g. "24 FPS, ") or None if unable to determine
    """
    cmd = [
        'ffprobe',
        '-v', 'error',
        '-select_streams', 'v:0',
        '-show_entries', 'stream=avg_frame_rate',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        str(video_path)
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            logger.warning(f"Error getting FPS for {video_path}: {result.stderr}")
            return None

        # The output may be empty (no video stream found) or span several
        # lines; only the first reported value is used.
        reported = result.stdout.split()
        if not reported:
            logger.warning(f"Failed to get FPS for {video_path}: empty ffprobe output")
            return None

        fps = reported[0]
        if '/' in fps:
            # Convert fraction to a rounded whole number
            num, den = map(int, fps.split('/'))
            if den == 0:
                return None
            fps = str(round(num / den))

        return f"{fps} FPS, "

    except Exception as e:
        # Best-effort helper: never let a probing failure propagate.
        logger.warning(f"Failed to get FPS for {video_path}: {e}")
        return None

def extract_scene_info(filename: str) -> Tuple[str, Optional[int]]:
    """Split a filename into its base name and trailing scene number

    The extension is dropped first; the scene number is the run of digits
    that follows a "___" separator at the very end of what remains.

    Args:
        filename: Input filename like "my_cool_video_1___001.mp4"

    Returns:
        Tuple of (base_name, scene_number), e.g. ("my_cool_video_1", 1).
        When no "___<digits>" suffix is present, the whole stem is returned
        with None as the scene number.
    """
    stem = Path(filename).stem
    found = re.search(r'(.+?)___(\d+)$', stem)
    if found is None:
        return stem, None

    base_name, scene_digits = found.groups()
    return base_name, int(scene_digits)

def is_image_file(file_path: Path) -> bool:
    """Tell whether a path looks like an image, judging by extension only

    Args:
        file_path: Path to check

    Returns:
        bool: True if the lower-cased suffix is a known image extension
    """
    extension = file_path.suffix.lower()
    return extension in ('.jpg', '.jpeg', '.png', '.webp', '.avif', '.heic')

def is_video_file(file_path: Path) -> bool:
    """Tell whether a path looks like a video, judging by extension only

    Args:
        file_path: Path to check

    Returns:
        bool: True if the lower-cased suffix is a known video extension
    """
    extension = file_path.suffix.lower()
    return extension in ('.mp4', '.webm')

def parse_bool_env(env_value: Optional[str]) -> bool:
    """Parse environment variable string to boolean

    Matching is case-insensitive and ignores surrounding whitespace, so a
    value such as "true\\n" (common when read from a file) still counts.

    - True: "true", "1", "t", "y", "yes" (any letter case)
    - False: everything else, including "false", "0", "", None

    Args:
        env_value: Raw value of the environment variable, or None if unset

    Returns:
        bool: True only for the recognised truthy spellings above
    """
    if not env_value:
        return False
    return str(env_value).strip().lower() in ('true', '1', 't', 'y', 'yes')

def validate_model_repo(repo_id: str) -> Dict[str, Optional[str]]:
    """Validate HuggingFace model repository name

    Args:
        repo_id: Repository ID in format "username/model-name"

    Returns:
        Dict with a single "error" key: an error message if the name is
        invalid, or None if it is valid
    """
    if not repo_id:
        return {"error": "Repository name is required"}

    # Exactly one "/" separating two non-empty parts.
    namespace, slash, model_name = repo_id.partition("/")
    if not slash or not namespace or not model_name or "/" in model_name:
        return {"error": "Repository name must be in format username/model-name"}

    # Check characters. "/" is deliberately NOT listed here: it is the
    # required separator validated above, and listing it would reject every
    # well-formed repository name.
    invalid_chars = set('<>:"\\|?*')
    if any(c in invalid_chars for c in repo_id):
        return {"error": "Repository name contains invalid characters"}

    return {"error": None}

def save_to_hub(model_path: Path, repo_id: str, token: str, commit_message: str = "Update model") -> bool:
    """Save model files to Hugging Face Hub

    The repository name is validated first, then the repository is created
    (an existing one is accepted) and the whole folder is uploaded. Failures
    are logged and reported through the return value; nothing is raised.

    Args:
        model_path: Path to model files
        repo_id: Repository ID (username/model-name)
        token: HuggingFace API token
        commit_message: Optional commit message

    Returns:
        bool: True if successful, False if failed
    """
    try:
        # Validate before any client is built, and say why it was rejected
        # instead of failing silently.
        validation = validate_model_repo(repo_id)
        if validation["error"]:
            logger.error(f"Invalid repository name {repo_id!r}: {validation['error']}")
            return False

        api = HfApi(token=token)

        # Create or get repo
        try:
            create_repo(repo_id, token=token, repo_type="model", exist_ok=True)
        except Exception as e:
            logger.error(f"Error creating repo: {e}")
            return False

        # Upload all files
        api.upload_folder(
            folder_path=str(model_path),
            repo_id=repo_id,
            repo_type="model",
            commit_message=commit_message
        )

        return True
    except Exception as e:
        logger.error(f"Error uploading to hub: {e}")
        return False

def parse_training_log(line: str) -> Dict:
    """Parse a training log line for metrics

    Each metric is looked up as "<name>=<value>" and parsed independently,
    so a malformed value for one metric does not discard the others.

    Args:
        line: Log line from training output

    Returns:
        Dict with whichever of "step" (int), "epoch" (int), "loss" (float)
        and "learning_rate" (float, read from "lr=") could be parsed; empty
        if none were found or line is not a string
    """
    metrics = {}

    # Stay best-effort for unexpected input (e.g. None or bytes).
    if not isinstance(line, str):
        return metrics

    # (marker in the log line, key in the result, value converter)
    fields = (
        ("step=", "step", int),
        ("epoch=", "epoch", int),
        ("loss=", "loss", float),
        ("lr=", "learning_rate", float),
    )

    for marker, key, convert in fields:
        if marker not in line:
            continue
        try:
            # First whitespace-delimited token after the marker, minus any
            # separating commas.
            raw_value = line.split(marker)[1].split()[0].strip(",")
            metrics[key] = convert(raw_value)
        except (ValueError, IndexError):
            # Missing or malformed value: skip this metric only.
            continue

    return metrics

def format_size(size_bytes: int) -> str:
    """Render a byte count as a short human readable string

    The value is divided by 1024 until it drops below 1024 or the largest
    unit ("Tb") is reached. Plain byte counts are shown as whole numbers;
    every larger unit is shown with one decimal place.

    Args:
        size_bytes: Size in bytes

    Returns:
        Formatted string (e.g. "1.5 Gb")
    """
    remaining = float(size_bytes)
    *smaller_units, largest_unit = ('bytes', 'Kb', 'Mb', 'Gb', 'Tb')

    for position, label in enumerate(smaller_units):
        if remaining >= 1024:
            # Too big for this unit: scale down and try the next one.
            remaining /= 1024
            continue
        if position == 0:
            # Special case for bytes - no decimal places
            return f"{int(remaining)} {label}"
        return f"{remaining:.1f} {label}"

    return f"{remaining:.1f} {largest_unit}"


def count_media_files(path: Path) -> Tuple[int, int, int]:
    """Tally the videos and images directly inside a directory

    Only the immediate children of the directory are inspected. Entries whose
    name starts with "." and ".txt" files are ignored, as is anything that is
    neither a video nor an image.

    Args:
        path: Directory to scan

    Returns:
        Tuple of (video_count, image_count, total_size), where total_size is
        the summed size in bytes of the counted files
    """
    videos = images = size_on_disk = 0

    for entry in path.glob("*"):
        is_hidden = entry.name.startswith('.')
        if is_hidden or entry.suffix.lower() == '.txt':
            continue

        if is_video_file(entry):
            videos += 1
        elif is_image_file(entry):
            images += 1
        else:
            # Not a media file: contributes to neither count nor size.
            continue

        size_on_disk += entry.stat().st_size

    return videos, images, size_on_disk

def format_media_title(action: str, video_count: int, image_count: int, total_size: int) -> str:
    """Build a "## ..." heading summarising media counts and total size

    Photos are listed before videos, each pluralised as needed and joined
    with " and ". When neither count is positive a fixed "0 files" heading
    is returned instead.

    Args:
        action: Action (eg "split", "caption")
        video_count: Number of videos
        image_count: Number of images
        total_size: Total size in bytes

    Returns:
        Formatted title string
    """
    summary = []

    if image_count > 0:
        plural = '' if image_count == 1 else 's'
        summary.append(f"{image_count:,} photo{plural}")

    if video_count > 0:
        plural = '' if video_count == 1 else 's'
        summary.append(f"{video_count:,} video{plural}")

    if summary:
        listing = ' and '.join(summary)
        return f"## {listing} to {action} ({format_size(total_size)})"

    return f"## 0 files to {action} (0 bytes)"

def add_prefix_to_caption(caption: str, prefix: str) -> str:
    """Prepend prefix to caption unless it already starts with it

    An empty or missing prefix or caption leaves the caption untouched.
    """
    if prefix and caption and not caption.startswith(prefix):
        return f"{prefix}{caption}"
    return caption

def format_time(seconds: float) -> str:
    """Render a duration given in seconds as hours, minutes and seconds

    Components that are zero are left out, except that a duration with no
    hours and no minutes always shows its seconds (so 0 gives "0s").
    Fractional seconds are truncated.

    Args:
        seconds: Time in seconds

    Returns:
        Formatted string (e.g. "2h 30m 45s")
    """
    larger_components = (
        (int(seconds // 3600), "h"),
        (int((seconds % 3600) // 60), "m"),
    )
    leftover = int(seconds % 60)

    pieces = [
        f"{amount}{suffix}"
        for amount, suffix in larger_components
        if amount > 0
    ]
    # Seconds are shown when non-zero, or as the sole component otherwise.
    if leftover > 0 or not pieces:
        pieces.append(f"{leftover}s")

    return " ".join(pieces)