""" Unit tests for storage configuration system Tests the new storage configuration functionality including: - Storage config management - Environment detection - Path generation - Storage tools """ import pytest import os import tempfile import shutil from pathlib import Path from unittest.mock import patch, MagicMock import asyncio import sys sys.path.append(str(Path(__file__).parent.parent / "src")) from src.utils.storage_config import StorageConfig, get_storage_config from src.tools.storage_tools import get_storage_info_tool class TestStorageConfig: """Test cases for StorageConfig class""" def setup_method(self): """Setup for each test method""" # Create temporary directory for testing self.temp_dir = Path(tempfile.mkdtemp()) self.config_file = self.temp_dir / "test_config.env" # Create test config file config_content = """ DOWNLOADS_DIR=./test_downloads TRANSCRIPTS_DIR=./test_transcripts CACHE_DIR=./test_cache DEFAULT_MODEL_SIZE=base DEFAULT_OUTPUT_FORMAT=srt USE_PARALLEL_PROCESSING=true CHUNK_DURATION=30 """ with open(self.config_file, 'w') as f: f.write(config_content) def teardown_method(self): """Cleanup after each test method""" # Remove temporary directory if self.temp_dir.exists(): shutil.rmtree(self.temp_dir) def test_local_environment_detection(self): """Test local environment detection and configuration loading""" # Mock environment to ensure local detection with patch.dict(os.environ, {}, clear=True): storage_config = StorageConfig(config_file=str(self.config_file)) assert not storage_config.is_modal_env assert storage_config.default_model_size == "base" assert storage_config.default_output_format == "srt" assert storage_config.use_parallel_processing == True assert storage_config.chunk_duration == 30 def test_modal_environment_detection(self): """Test Modal environment detection""" # Mock Modal environment variables with patch.dict(os.environ, {"MODAL_TASK_ID": "test-task-123"}, clear=True): storage_config = StorageConfig(config_file=str(self.config_file)) assert storage_config.is_modal_env assert str(storage_config.downloads_dir) == "/root/downloads" assert str(storage_config.transcripts_dir) == "/root/transcripts" assert str(storage_config.cache_dir) == "/root/cache" def test_modal_environment_detection_deployment_mode(self): """Test Modal environment detection via DEPLOYMENT_MODE""" with patch.dict(os.environ, {"DEPLOYMENT_MODE": "modal"}, clear=True): storage_config = StorageConfig(config_file=str(self.config_file)) assert storage_config.is_modal_env def test_modal_environment_detection_container_var(self): """Test Modal environment detection via MODAL_IS_INSIDE_CONTAINER""" with patch.dict(os.environ, {"MODAL_IS_INSIDE_CONTAINER": "true"}, clear=True): storage_config = StorageConfig(config_file=str(self.config_file)) assert storage_config.is_modal_env def test_path_generation(self): """Test path generation methods""" with patch.dict(os.environ, {}, clear=True): storage_config = StorageConfig(config_file=str(self.config_file)) # Test download path download_path = storage_config.get_download_path("test.mp3") assert download_path.name == "test.mp3" assert "test_downloads" in str(download_path) # Test transcript paths txt_path = storage_config.get_transcript_path("test.mp3", "txt") assert txt_path.name == "test.txt" assert "test_transcripts" in str(txt_path) srt_path = storage_config.get_transcript_path("test.mp3", "srt") assert srt_path.name == "test.srt" # Test default format default_path = storage_config.get_transcript_path("test.mp3") assert default_path.name == "test.srt" # Should use default format # Test cache path cache_path = storage_config.get_cache_path("temp.dat") assert cache_path.name == "temp.dat" assert "test_cache" in str(cache_path) def test_audio_files_listing(self): """Test audio files listing functionality""" with patch.dict(os.environ, {}, clear=True): # Create a separate test directory for this specific test test_dir = self.temp_dir / "audio_test" test_config_file = test_dir / "config.env" test_dir.mkdir(exist_ok=True) # Create isolated config file config_content = """ DOWNLOADS_DIR=./audio_test_downloads TRANSCRIPTS_DIR=./audio_test_transcripts CACHE_DIR=./audio_test_cache """ with open(test_config_file, 'w') as f: f.write(config_content) storage_config = StorageConfig(config_file=str(test_config_file)) # Create test audio files storage_config.downloads_dir.mkdir(parents=True, exist_ok=True) test_files = ["test1.mp3", "test2.wav", "test3.m4a", "not_audio.txt"] for filename in test_files: (storage_config.downloads_dir / filename).touch() audio_files = storage_config.get_audio_files() audio_names = [f.name for f in audio_files] assert "test1.mp3" in audio_names assert "test2.wav" in audio_names assert "test3.m4a" in audio_names assert "not_audio.txt" not in audio_names assert len(audio_files) == 3 def test_transcript_files_mapping(self): """Test transcript files mapping functionality""" with patch.dict(os.environ, {}, clear=True): storage_config = StorageConfig(config_file=str(self.config_file)) # Test specific audio file mapping transcript_files = storage_config.get_transcript_files("episode123.mp3") assert "txt" in transcript_files assert "srt" in transcript_files assert "json" in transcript_files assert transcript_files["txt"].name == "episode123.txt" assert transcript_files["srt"].name == "episode123.srt" assert transcript_files["json"].name == "episode123.json" def test_storage_info_generation(self): """Test storage information generation""" with patch.dict(os.environ, {}, clear=True): # Create a separate test directory for this specific test test_dir = self.temp_dir / "info_test" test_config_file = test_dir / "config.env" test_dir.mkdir(exist_ok=True) # Create isolated config file config_content = """ DOWNLOADS_DIR=./info_test_downloads TRANSCRIPTS_DIR=./info_test_transcripts CACHE_DIR=./info_test_cache """ with open(test_config_file, 'w') as f: f.write(config_content) storage_config = StorageConfig(config_file=str(test_config_file)) # Create some test files storage_config.downloads_dir.mkdir(parents=True, exist_ok=True) storage_config.transcripts_dir.mkdir(parents=True, exist_ok=True) # Create test audio file test_audio = storage_config.downloads_dir / "test.mp3" test_audio.write_bytes(b"fake audio data" * 100) # Create test transcript files (storage_config.transcripts_dir / "test.txt").write_text("transcript text") (storage_config.transcripts_dir / "test.srt").write_text("srt content") storage_info = storage_config.get_storage_info() assert storage_info["environment"] == "local" assert storage_info["audio_files_count"] == 1 assert storage_info["transcript_txt_count"] == 1 assert storage_info["transcript_srt_count"] == 1 assert storage_info["transcript_json_count"] == 0 # Check that sizes are calculated (should be greater than 0 due to our test files) assert storage_info["downloads_size_mb"] >= 0 assert storage_info["transcripts_size_mb"] >= 0 def test_cleanup_temp_files(self): """Test temporary files cleanup""" with patch.dict(os.environ, {}, clear=True): storage_config = StorageConfig(config_file=str(self.config_file)) # Create cache directory and temp files storage_config.cache_dir.mkdir(parents=True, exist_ok=True) temp_file1 = storage_config.cache_dir / "temp_file1.dat" temp_file2 = storage_config.cache_dir / "temp_file2.dat" normal_file = storage_config.cache_dir / "normal_file.dat" temp_file1.touch() temp_file2.touch() normal_file.touch() # Test cleanup storage_config.cleanup_temp_files("temp_*") assert not temp_file1.exists() assert not temp_file2.exists() assert normal_file.exists() # Should not be deleted def test_config_file_not_exists(self): """Test behavior when config file doesn't exist""" non_existent_config = self.temp_dir / "non_existent.env" with patch.dict(os.environ, {}, clear=True): storage_config = StorageConfig(config_file=str(non_existent_config)) # Should use defaults assert not storage_config.is_modal_env assert storage_config.default_model_size == "turbo" assert storage_config.default_output_format == "srt" class TestStorageConfigGlobalInstance: """Test cases for global storage config instance management""" def test_global_instance_singleton(self): """Test that get_storage_config returns singleton instance""" # Clear any existing global instance import src.utils.storage_config as storage_module storage_module._storage_config = None with patch.dict(os.environ, {}, clear=True): config1 = get_storage_config() config2 = get_storage_config() assert config1 is config2 # Should be the same instance def test_global_instance_reset(self): """Test resetting global instance""" import src.utils.storage_config as storage_module with patch.dict(os.environ, {}, clear=True): config1 = get_storage_config() # Reset global instance storage_module._storage_config = None config2 = get_storage_config() assert config1 is not config2 # Should be different instances class TestStorageTools: """Test cases for storage management tools""" def setup_method(self): """Setup for each test method""" self.temp_dir = Path(tempfile.mkdtemp()) # Mock storage config to use temp directory self.mock_config = MagicMock() self.mock_config.downloads_dir = self.temp_dir / "downloads" self.mock_config.transcripts_dir = self.temp_dir / "transcripts" self.mock_config.cache_dir = self.temp_dir / "cache" self.mock_config.is_modal_env = False # Create directories for directory in [self.mock_config.downloads_dir, self.mock_config.transcripts_dir, self.mock_config.cache_dir]: directory.mkdir(parents=True, exist_ok=True) def teardown_method(self): """Cleanup after each test method""" if self.temp_dir.exists(): shutil.rmtree(self.temp_dir) @pytest.mark.asyncio async def test_get_storage_info_tool_success(self): """Test get_storage_info_tool with successful execution""" # Create test files (self.mock_config.downloads_dir / "test.mp3").write_bytes(b"audio data") (self.mock_config.transcripts_dir / "test.txt").write_text("transcript") # Mock storage config mock_storage_info = { "environment": "local", "downloads_dir": str(self.mock_config.downloads_dir), "transcripts_dir": str(self.mock_config.transcripts_dir), "cache_dir": str(self.mock_config.cache_dir), "audio_files_count": 1, "transcript_txt_count": 1, "transcript_srt_count": 0, "transcript_json_count": 0, "downloads_size_mb": 0.01, "transcripts_size_mb": 0.01, "cache_size_mb": 0.0 } self.mock_config.get_storage_info.return_value = mock_storage_info with patch('src.tools.storage_tools.get_storage_config', return_value=self.mock_config): result = await get_storage_info_tool() assert result["status"] == "success" assert result["environment"] == "local" assert result["audio_files_count"] == 1 assert result["transcript_txt_count"] == 1 @pytest.mark.asyncio async def test_get_storage_info_tool_failure(self): """Test get_storage_info_tool with exception handling""" # Mock config that raises exception self.mock_config.get_storage_info.side_effect = Exception("Test error") with patch('src.tools.storage_tools.get_storage_config', return_value=self.mock_config): result = await get_storage_info_tool() assert result["status"] == "failed" assert "Test error" in result["error_message"] class TestDistributedTranscriptionFixes: """Test cases for distributed transcription speaker information collection fixes""" def test_collect_speaker_information_string_speakers(self): """Test handling of string format speakers_detected""" from src.services.distributed_transcription_service import DistributedTranscriptionService service = DistributedTranscriptionService() # Test with string format speakers_detected successful_chunks = [ { "speakers_detected": "SPEAKER_01", # String instead of list "speaker_summary": { "SPEAKER_01": { "total_duration": 120.5, "segment_count": 5 } } }, { "speakers_detected": ["SPEAKER_02"], # Normal list format "speaker_summary": { "SPEAKER_02": { "total_duration": 95.3, "segment_count": 3 } } } ] result = service._collect_speaker_information(successful_chunks, True) assert result["global_speaker_count"] == 2 assert "SPEAKER_01" in result["speakers_detected"] assert "SPEAKER_02" in result["speakers_detected"] assert result["speaker_summary"]["SPEAKER_01"]["total_duration"] == 120.5 assert result["speaker_summary"]["SPEAKER_02"]["total_duration"] == 95.3 def test_collect_speaker_information_invalid_data(self): """Test handling of invalid speaker data""" from src.services.distributed_transcription_service import DistributedTranscriptionService service = DistributedTranscriptionService() # Test with invalid data formats successful_chunks = [ { "speakers_detected": 123, # Invalid type (number) "speaker_summary": "invalid" # Invalid type (string) }, { "speakers_detected": None, # None value "speaker_summary": { "SPEAKER_01": "invalid_info" # Invalid speaker info format } }, { "speakers_detected": ["SPEAKER_02"], # Valid "speaker_summary": { "SPEAKER_02": { "total_duration": 50.0, "segment_count": 2 } } } ] result = service._collect_speaker_information(successful_chunks, True) # Should handle invalid data gracefully and only process valid chunk assert result["global_speaker_count"] == 1 assert result["speakers_detected"] == ["SPEAKER_02"] assert result["speaker_summary"]["SPEAKER_02"]["total_duration"] == 50.0 def test_collect_speaker_information_disabled(self): """Test when speaker diarization is disabled""" from src.services.distributed_transcription_service import DistributedTranscriptionService service = DistributedTranscriptionService() successful_chunks = [{"speakers_detected": ["SPEAKER_01"]}] result = service._collect_speaker_information(successful_chunks, False) # Should return empty result when disabled assert result == {} if __name__ == "__main__": # Run tests with pytest pytest.main([__file__, "-v"])