File size: 9,373 Bytes
b5df735 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 |
"""
Storage Configuration Management
Centralizes all storage path configurations for downloads and transcripts
"""
import os
from pathlib import Path
from typing import Optional
from dotenv import load_dotenv
class StorageConfig:
    """Centralized storage configuration for podcast processing.

    Resolves the downloads, transcripts and cache directories (fixed paths
    under ``/root`` when a Modal environment is detected, otherwise values
    taken from environment variables / env files), reads a handful of
    processing settings from the environment, and offers helpers to build
    and enumerate paths inside those directories.
    """

    def __init__(self, config_file: str = "config.env"):
        """
        Initialize storage configuration

        Args:
            config_file: Path to configuration file
        """
        self.config_file = config_file
        self._load_config()
        self._ensure_directories()

    @staticmethod
    def _env_flag(name: str, default: str) -> bool:
        """Return True when environment variable *name* equals "true" (case-insensitive)."""
        return os.getenv(name, default).lower() == "true"

    @staticmethod
    def _dir_size(directory: Path) -> int:
        """Return the total size in bytes of all regular files below *directory*.

        Best-effort: if the tree cannot be walked or a file cannot be
        stat'ed, the bytes counted so far are returned instead of raising.
        """
        total_size = 0
        try:
            for file_path in directory.rglob('*'):
                if file_path.is_file():
                    total_size += file_path.stat().st_size
        except OSError:
            # Deliberately tolerated: size reporting is informational only.
            pass
        return total_size

    def _load_config(self):
        """Load configuration from environment file or Modal environment.

        Raises:
            ValueError: If ``CHUNK_DURATION`` is set to a non-integer value.
        """
        # Check if we're running in Modal environment. Coerced to bool so
        # that ``is_modal_env`` never holds the raw environment string.
        is_modal_env = bool(
            os.getenv("MODAL_TASK_ID")
            or os.getenv("MODAL_IS_INSIDE_CONTAINER")
            or os.getenv("DEPLOYMENT_MODE") == "modal"
        )

        if is_modal_env:
            print("π§ Using Modal environment configuration")
            # Use Modal defaults - don't load config files
            self.downloads_dir = Path("/root/downloads").resolve()
            self.transcripts_dir = Path("/root/transcripts").resolve()
            self.cache_dir = Path("/root/cache").resolve()
        else:
            print("π§ Using local environment configuration")
            # override=False: variables already present in the process
            # environment win over the files, and config_file wins over .env
            # because it is loaded first.
            if os.path.exists(self.config_file):
                load_dotenv(self.config_file, override=False)
                print(f"π Loaded config from {self.config_file}")
            if os.path.exists(".env"):
                load_dotenv(".env", override=False)
                print("π Loaded config from .env")
            # Set defaults for local environment
            self.downloads_dir = Path(os.getenv("DOWNLOADS_DIR", "./downloads")).resolve()
            self.transcripts_dir = Path(os.getenv("TRANSCRIPTS_DIR", "./transcripts")).resolve()
            self.cache_dir = Path(os.getenv("CACHE_DIR", "./cache")).resolve()

        # Common settings (apply to both environments)
        self.download_quality = os.getenv("DOWNLOAD_QUALITY", "highest")
        self.convert_to_mp3 = self._env_flag("CONVERT_TO_MP3", "true")
        self.default_model_size = os.getenv("DEFAULT_MODEL_SIZE", "turbo")
        self.default_output_format = os.getenv("DEFAULT_OUTPUT_FORMAT", "srt")
        self.enable_speaker_diarization = self._env_flag("ENABLE_SPEAKER_DIARIZATION", "false")
        self.use_parallel_processing = self._env_flag("USE_PARALLEL_PROCESSING", "true")

        raw_chunk_duration = os.getenv("CHUNK_DURATION", "60")
        try:
            self.chunk_duration = int(raw_chunk_duration)
        except ValueError as err:
            # Same exception type as before, but name the offending setting.
            raise ValueError(
                f"CHUNK_DURATION must be an integer, got {raw_chunk_duration!r}"
            ) from err

        # Store environment type for reference
        self.is_modal_env = is_modal_env

    def _ensure_directories(self):
        """Ensure all configured directories exist.

        Raises:
            OSError: If a directory cannot be created in a local (non-Modal)
                environment. In a Modal environment the failure is only
                reported, so a directory may still be missing afterwards.
        """
        environment_label = "Modal" if self.is_modal_env else "Local"
        for directory in (self.downloads_dir, self.transcripts_dir, self.cache_dir):
            try:
                directory.mkdir(parents=True, exist_ok=True)
            except OSError as e:
                print(f"β οΈ Failed to create directory {directory}: {e}")
                # In Modal environment, some directories might be managed differently
                if not self.is_modal_env:
                    raise
            else:
                print(f"π {environment_label} storage directory ready: {directory}")

    def get_download_path(self, filename: str) -> Path:
        """
        Get full path for downloaded audio file

        Args:
            filename: Audio filename

        Returns:
            Full path for downloaded file
        """
        return self.downloads_dir / filename

    def get_transcript_path(self, audio_filename: str, output_format: Optional[str] = None) -> Path:
        """
        Get full path for transcript file

        Args:
            audio_filename: Original audio filename
            output_format: Output format (txt, srt, json); defaults to
                ``self.default_output_format`` when None

        Returns:
            Full path for transcript file
        """
        if output_format is None:
            output_format = self.default_output_format
        # Remove audio extension (and any directory part) and add transcript extension
        base_name = Path(audio_filename).stem
        return self.transcripts_dir / f"{base_name}.{output_format}"

    def get_cache_path(self, filename: str) -> Path:
        """
        Get full path for cache file

        Args:
            filename: Cache filename

        Returns:
            Full path for cache file
        """
        return self.cache_dir / filename

    def get_audio_files(self) -> list[Path]:
        """
        Get list of all audio files in downloads directory

        Only direct children are considered; the extension check is
        case-insensitive.

        Returns:
            Sorted list of audio file paths (empty when the downloads
            directory does not exist)
        """
        audio_extensions = {'.mp3', '.wav', '.m4a', '.flac', '.aac', '.ogg'}
        # The directory can legitimately be absent: _ensure_directories
        # tolerates creation failures in Modal environments.
        if not self.downloads_dir.is_dir():
            return []
        return sorted(
            file_path
            for file_path in self.downloads_dir.iterdir()
            if file_path.is_file() and file_path.suffix.lower() in audio_extensions
        )

    def get_transcript_files(self, audio_filename: Optional[str] = None) -> dict:
        """
        Get transcript paths, either for one audio file or for the whole
        transcripts directory

        Args:
            audio_filename: Audio filename (optional)

        Returns:
            With ``audio_filename``: dictionary mapping each format
            ('txt', 'srt', 'json') to the single transcript path for that
            file (the files need not exist).
            Without it: dictionary mapping each format to a sorted list of
            the existing transcript files with that extension (empty lists
            when the transcripts directory does not exist).
        """
        formats = ('txt', 'srt', 'json')
        if audio_filename:
            return {fmt: self.get_transcript_path(audio_filename, fmt) for fmt in formats}

        # Return all transcript files
        transcript_files = {fmt: [] for fmt in formats}
        if not self.transcripts_dir.is_dir():
            return transcript_files
        for file_path in sorted(self.transcripts_dir.iterdir()):
            if file_path.is_file():
                ext = file_path.suffix[1:]  # Remove the dot
                if ext in transcript_files:
                    transcript_files[ext].append(file_path)
        return transcript_files

    def cleanup_temp_files(self, pattern: str = "temp_*"):
        """
        Clean up temporary files in cache directory

        Failures to remove an individual entry are reported and skipped.

        Args:
            pattern: File pattern to match for cleanup
        """
        import glob

        for temp_file in glob.glob(str(self.cache_dir / pattern)):
            try:
                os.remove(temp_file)
            except OSError as e:
                print(f"β οΈ Failed to cleanup {temp_file}: {e}")
            else:
                print(f"ποΈ Cleaned up temp file: {temp_file}")

    def get_storage_info(self) -> dict:
        """
        Get storage configuration information

        Returns:
            Dictionary with the environment name, the three directory paths,
            file counts per category and directory sizes in megabytes
            (rounded to two decimals)
        """
        audio_files = self.get_audio_files()
        transcript_files = self.get_transcript_files()
        bytes_per_mb = 1024 * 1024

        return {
            "environment": "modal" if self.is_modal_env else "local",
            "downloads_dir": str(self.downloads_dir),
            "transcripts_dir": str(self.transcripts_dir),
            "cache_dir": str(self.cache_dir),
            "audio_files_count": len(audio_files),
            "transcript_txt_count": len(transcript_files.get('txt', [])),
            "transcript_srt_count": len(transcript_files.get('srt', [])),
            "transcript_json_count": len(transcript_files.get('json', [])),
            "downloads_size_mb": round(self._dir_size(self.downloads_dir) / bytes_per_mb, 2),
            "transcripts_size_mb": round(self._dir_size(self.transcripts_dir) / bytes_per_mb, 2),
            "cache_size_mb": round(self._dir_size(self.cache_dir) / bytes_per_mb, 2),
        }
# Module-wide StorageConfig singleton; stays None until first requested.
_storage_config: Optional[StorageConfig] = None


def get_storage_config() -> StorageConfig:
    """
    Return the shared StorageConfig, building it on first use

    The instance is created with StorageConfig's default arguments and
    cached in the module-level ``_storage_config`` variable, so later
    calls return the same object.

    Returns:
        StorageConfig instance
    """
    global _storage_config
    if _storage_config is not None:
        return _storage_config
    _storage_config = StorageConfig()
    return _storage_config
def get_downloads_dir() -> Path:
    """Return the downloads directory of the shared storage configuration."""
    config = get_storage_config()
    return config.downloads_dir
def get_transcripts_dir() -> Path:
    """Return the transcripts directory of the shared storage configuration."""
    config = get_storage_config()
    return config.transcripts_dir
def get_cache_dir() -> Path:
    """Return the cache directory of the shared storage configuration."""
    config = get_storage_config()
    return config.cache_dir