ModalTranscriberMCP / tests /test_09_storage_config_unit_tests.py
richard-su's picture
Upload folder using huggingface_hub
76f9cd2 verified
"""
Unit tests for storage configuration system
Tests the new storage configuration functionality including:
- Storage config management
- Environment detection
- Path generation
- Storage tools
"""
import pytest
import os
import tempfile
import shutil
from pathlib import Path
from unittest.mock import patch, MagicMock
import asyncio
import sys
sys.path.append(str(Path(__file__).parent.parent / "src"))
from src.utils.storage_config import StorageConfig, get_storage_config
from src.tools.storage_tools import get_storage_info_tool
class TestStorageConfig:
    """Test cases for the StorageConfig class.

    Each test runs against a throw-away temporary directory that holds a small
    ``KEY=VALUE`` config file.  The storage directories named in that file are
    relative paths, so the working directory is switched to the temporary
    directory while a test runs and restored afterwards; files created
    through those relative paths are then removed together with the sandbox.

    NOTE(review): this assumes StorageConfig resolves relative directories
    against the current working directory -- confirm against its
    implementation.
    """

    @staticmethod
    def _write_config(path, entries):
        """Write *entries* (``KEY=VALUE`` strings) to *path*, one per line.

        The file starts with an empty line and ends with a newline, which is
        the layout these tests have always fed to StorageConfig.
        """
        path.write_text("\n" + "\n".join(entries) + "\n", encoding="utf-8")

    def setup_method(self):
        """Create the sandbox directory, the default config file, and enter the sandbox."""
        # Create temporary directory for testing
        self.temp_dir = Path(tempfile.mkdtemp())
        self.config_file = self.temp_dir / "test_config.env"

        # Create test config file
        self._write_config(
            self.config_file,
            [
                "DOWNLOADS_DIR=./test_downloads",
                "TRANSCRIPTS_DIR=./test_transcripts",
                "CACHE_DIR=./test_cache",
                "DEFAULT_MODEL_SIZE=base",
                "DEFAULT_OUTPUT_FORMAT=srt",
                "USE_PARALLEL_PROCESSING=true",
                "CHUNK_DURATION=30",
            ],
        )

        # The directories above are relative; run inside the sandbox so that
        # nothing is created in (or left behind in) the caller's directory.
        self._original_cwd = os.getcwd()
        os.chdir(str(self.temp_dir))

    def teardown_method(self):
        """Leave the sandbox and delete it."""
        # Step out first: a directory that is still the current working
        # directory cannot be removed on every platform.
        os.chdir(self._original_cwd)

        # Remove temporary directory
        if self.temp_dir.exists():
            shutil.rmtree(self.temp_dir)

    def test_local_environment_detection(self):
        """Test local environment detection and configuration loading"""
        # Mock environment to ensure local detection
        with patch.dict(os.environ, {}, clear=True):
            storage_config = StorageConfig(config_file=str(self.config_file))

            assert not storage_config.is_modal_env
            assert storage_config.default_model_size == "base"
            assert storage_config.default_output_format == "srt"
            # Equality (not identity) is the check this test has always made.
            assert storage_config.use_parallel_processing == True  # noqa: E712
            assert storage_config.chunk_duration == 30

    def test_modal_environment_detection(self):
        """Test Modal environment detection"""
        # Mock Modal environment variables
        with patch.dict(os.environ, {"MODAL_TASK_ID": "test-task-123"}, clear=True):
            storage_config = StorageConfig(config_file=str(self.config_file))

            assert storage_config.is_modal_env
            assert str(storage_config.downloads_dir) == "/root/downloads"
            assert str(storage_config.transcripts_dir) == "/root/transcripts"
            assert str(storage_config.cache_dir) == "/root/cache"

    def test_modal_environment_detection_deployment_mode(self):
        """Test Modal environment detection via DEPLOYMENT_MODE"""
        with patch.dict(os.environ, {"DEPLOYMENT_MODE": "modal"}, clear=True):
            storage_config = StorageConfig(config_file=str(self.config_file))

            assert storage_config.is_modal_env

    def test_modal_environment_detection_container_var(self):
        """Test Modal environment detection via MODAL_IS_INSIDE_CONTAINER"""
        with patch.dict(os.environ, {"MODAL_IS_INSIDE_CONTAINER": "true"}, clear=True):
            storage_config = StorageConfig(config_file=str(self.config_file))

            assert storage_config.is_modal_env

    def test_path_generation(self):
        """Test path generation methods"""
        with patch.dict(os.environ, {}, clear=True):
            storage_config = StorageConfig(config_file=str(self.config_file))

            # Test download path
            download_path = storage_config.get_download_path("test.mp3")
            assert download_path.name == "test.mp3"
            assert "test_downloads" in str(download_path)

            # Test transcript paths
            txt_path = storage_config.get_transcript_path("test.mp3", "txt")
            assert txt_path.name == "test.txt"
            assert "test_transcripts" in str(txt_path)

            srt_path = storage_config.get_transcript_path("test.mp3", "srt")
            assert srt_path.name == "test.srt"

            # Test default format
            default_path = storage_config.get_transcript_path("test.mp3")
            assert default_path.name == "test.srt"  # Should use default format

            # Test cache path
            cache_path = storage_config.get_cache_path("temp.dat")
            assert cache_path.name == "temp.dat"
            assert "test_cache" in str(cache_path)

    def test_audio_files_listing(self):
        """Test audio files listing functionality"""
        with patch.dict(os.environ, {}, clear=True):
            # Create a separate test directory for this specific test
            test_dir = self.temp_dir / "audio_test"
            test_config_file = test_dir / "config.env"
            test_dir.mkdir(exist_ok=True)

            # Create a config file with directory names used by no other test
            self._write_config(
                test_config_file,
                [
                    "DOWNLOADS_DIR=./audio_test_downloads",
                    "TRANSCRIPTS_DIR=./audio_test_transcripts",
                    "CACHE_DIR=./audio_test_cache",
                ],
            )
            storage_config = StorageConfig(config_file=str(test_config_file))

            # Create test audio files (plus one file that is not audio)
            storage_config.downloads_dir.mkdir(parents=True, exist_ok=True)
            test_files = ["test1.mp3", "test2.wav", "test3.m4a", "not_audio.txt"]
            for filename in test_files:
                (storage_config.downloads_dir / filename).touch()

            audio_files = storage_config.get_audio_files()
            audio_names = [f.name for f in audio_files]

            assert "test1.mp3" in audio_names
            assert "test2.wav" in audio_names
            assert "test3.m4a" in audio_names
            assert "not_audio.txt" not in audio_names
            assert len(audio_files) == 3

    def test_transcript_files_mapping(self):
        """Test transcript files mapping functionality"""
        with patch.dict(os.environ, {}, clear=True):
            storage_config = StorageConfig(config_file=str(self.config_file))

            # Test specific audio file mapping
            transcript_files = storage_config.get_transcript_files("episode123.mp3")

            assert "txt" in transcript_files
            assert "srt" in transcript_files
            assert "json" in transcript_files
            assert transcript_files["txt"].name == "episode123.txt"
            assert transcript_files["srt"].name == "episode123.srt"
            assert transcript_files["json"].name == "episode123.json"

    def test_storage_info_generation(self):
        """Test storage information generation"""
        with patch.dict(os.environ, {}, clear=True):
            # Create a separate test directory for this specific test
            test_dir = self.temp_dir / "info_test"
            test_config_file = test_dir / "config.env"
            test_dir.mkdir(exist_ok=True)

            # Create a config file with directory names used by no other test
            self._write_config(
                test_config_file,
                [
                    "DOWNLOADS_DIR=./info_test_downloads",
                    "TRANSCRIPTS_DIR=./info_test_transcripts",
                    "CACHE_DIR=./info_test_cache",
                ],
            )
            storage_config = StorageConfig(config_file=str(test_config_file))

            # Create some test files
            storage_config.downloads_dir.mkdir(parents=True, exist_ok=True)
            storage_config.transcripts_dir.mkdir(parents=True, exist_ok=True)

            # Create test audio file
            test_audio = storage_config.downloads_dir / "test.mp3"
            test_audio.write_bytes(b"fake audio data" * 100)

            # Create test transcript files
            (storage_config.transcripts_dir / "test.txt").write_text("transcript text")
            (storage_config.transcripts_dir / "test.srt").write_text("srt content")

            storage_info = storage_config.get_storage_info()

            assert storage_info["environment"] == "local"
            assert storage_info["audio_files_count"] == 1
            assert storage_info["transcript_txt_count"] == 1
            assert storage_info["transcript_srt_count"] == 1
            assert storage_info["transcript_json_count"] == 0

            # Sizes are only required to be non-negative: the fixtures total
            # well under a megabyte, so a figure reported in MB may be 0
            # (presumably after rounding -- confirm in get_storage_info).
            assert storage_info["downloads_size_mb"] >= 0
            assert storage_info["transcripts_size_mb"] >= 0

    def test_cleanup_temp_files(self):
        """Test temporary files cleanup"""
        with patch.dict(os.environ, {}, clear=True):
            storage_config = StorageConfig(config_file=str(self.config_file))

            # Create cache directory and temp files
            storage_config.cache_dir.mkdir(parents=True, exist_ok=True)
            temp_file1 = storage_config.cache_dir / "temp_file1.dat"
            temp_file2 = storage_config.cache_dir / "temp_file2.dat"
            normal_file = storage_config.cache_dir / "normal_file.dat"
            temp_file1.touch()
            temp_file2.touch()
            normal_file.touch()

            # Test cleanup
            storage_config.cleanup_temp_files("temp_*")

            assert not temp_file1.exists()
            assert not temp_file2.exists()
            assert normal_file.exists()  # Should not be deleted

    def test_config_file_not_exists(self):
        """Test behavior when config file doesn't exist"""
        non_existent_config = self.temp_dir / "non_existent.env"
        with patch.dict(os.environ, {}, clear=True):
            storage_config = StorageConfig(config_file=str(non_existent_config))

            # Should use defaults
            assert not storage_config.is_modal_env
            assert storage_config.default_model_size == "turbo"
            assert storage_config.default_output_format == "srt"
class TestStorageConfigGlobalInstance:
    """Test cases for global storage config instance management.

    These tests overwrite the module-level ``_storage_config`` attribute of
    ``src.utils.storage_config``.  The previous value is saved before each
    test and put back afterwards, so an instance built under the cleared
    environment used here does not leak into tests that run later.
    """

    def setup_method(self):
        """Remember the current global instance and start from a clean slate."""
        import src.utils.storage_config as storage_module

        self._storage_module = storage_module
        self._saved_instance = getattr(storage_module, "_storage_config", None)
        # Every test starts without a cached instance, whatever ran before.
        storage_module._storage_config = None

    def teardown_method(self):
        """Put back the global instance that was in place before the test."""
        self._storage_module._storage_config = self._saved_instance

    def test_global_instance_singleton(self):
        """Test that get_storage_config returns singleton instance"""
        # Clear any existing global instance
        import src.utils.storage_config as storage_module
        storage_module._storage_config = None

        with patch.dict(os.environ, {}, clear=True):
            config1 = get_storage_config()
            config2 = get_storage_config()

            assert config1 is config2  # Should be the same instance

    def test_global_instance_reset(self):
        """Test resetting global instance"""
        import src.utils.storage_config as storage_module

        with patch.dict(os.environ, {}, clear=True):
            config1 = get_storage_config()

            # Reset global instance
            storage_module._storage_config = None
            config2 = get_storage_config()

            assert config1 is not config2  # Should be different instances
class TestStorageTools:
    """Exercise the storage tool coroutines against a mocked storage config."""

    def setup_method(self):
        """Create a scratch directory tree and a mock config that points at it."""
        self.temp_dir = Path(tempfile.mkdtemp())

        # Stand-in for the real storage config, rooted in the scratch directory
        config = MagicMock()
        config.downloads_dir = self.temp_dir / "downloads"
        config.transcripts_dir = self.temp_dir / "transcripts"
        config.cache_dir = self.temp_dir / "cache"
        config.is_modal_env = False
        self.mock_config = config

        # Make sure all three directories exist on disk
        for folder in (config.downloads_dir, config.transcripts_dir, config.cache_dir):
            folder.mkdir(parents=True, exist_ok=True)

    def teardown_method(self):
        """Delete the scratch directory if it is still there."""
        if not self.temp_dir.exists():
            return
        shutil.rmtree(self.temp_dir)

    @pytest.mark.asyncio
    async def test_get_storage_info_tool_success(self):
        """get_storage_info_tool reports success and passes the config's info through."""
        downloads = self.mock_config.downloads_dir
        transcripts = self.mock_config.transcripts_dir

        # Put one audio file and one transcript on disk
        (downloads / "test.mp3").write_bytes(b"audio data")
        (transcripts / "test.txt").write_text("transcript")

        # Canned answer for the mocked get_storage_info()
        canned_info = {
            "environment": "local",
            "downloads_dir": str(downloads),
            "transcripts_dir": str(transcripts),
            "cache_dir": str(self.mock_config.cache_dir),
            "audio_files_count": 1,
            "transcript_txt_count": 1,
            "transcript_srt_count": 0,
            "transcript_json_count": 0,
            "downloads_size_mb": 0.01,
            "transcripts_size_mb": 0.01,
            "cache_size_mb": 0.0
        }
        self.mock_config.get_storage_info.return_value = canned_info

        with patch('src.tools.storage_tools.get_storage_config', return_value=self.mock_config):
            result = await get_storage_info_tool()

        expected = {
            "status": "success",
            "environment": "local",
            "audio_files_count": 1,
            "transcript_txt_count": 1,
        }
        for key, value in expected.items():
            assert result[key] == value

    @pytest.mark.asyncio
    async def test_get_storage_info_tool_failure(self):
        """get_storage_info_tool reports a failure when the config raises."""
        # Make the mocked config blow up when asked for storage info
        self.mock_config.get_storage_info.side_effect = Exception("Test error")

        with patch('src.tools.storage_tools.get_storage_config', return_value=self.mock_config):
            result = await get_storage_info_tool()

        assert result["status"] == "failed"
        assert "Test error" in result["error_message"]
class TestDistributedTranscriptionFixes:
    """Regression tests for speaker information collection in distributed transcription."""

    @staticmethod
    def _collect(chunks, diarization_enabled):
        """Run _collect_speaker_information on a fresh service and return its result."""
        # Imported lazily, exactly as each test used to do on its own
        from src.services.distributed_transcription_service import DistributedTranscriptionService

        service = DistributedTranscriptionService()
        return service._collect_speaker_information(chunks, diarization_enabled)

    def test_collect_speaker_information_string_speakers(self):
        """A chunk whose speakers_detected is a bare string is still counted."""
        chunk_with_string = {
            "speakers_detected": "SPEAKER_01",  # String instead of list
            "speaker_summary": {
                "SPEAKER_01": {"total_duration": 120.5, "segment_count": 5},
            },
        }
        chunk_with_list = {
            "speakers_detected": ["SPEAKER_02"],  # Normal list format
            "speaker_summary": {
                "SPEAKER_02": {"total_duration": 95.3, "segment_count": 3},
            },
        }

        result = self._collect([chunk_with_string, chunk_with_list], True)

        assert result["global_speaker_count"] == 2
        for speaker in ("SPEAKER_01", "SPEAKER_02"):
            assert speaker in result["speakers_detected"]

        summary = result["speaker_summary"]
        assert summary["SPEAKER_01"]["total_duration"] == 120.5
        assert summary["SPEAKER_02"]["total_duration"] == 95.3

    def test_collect_speaker_information_invalid_data(self):
        """Malformed chunks are ignored; only the well-formed chunk contributes."""
        wrong_types = {
            "speakers_detected": 123,  # Invalid type (number)
            "speaker_summary": "invalid",  # Invalid type (string)
        }
        none_and_bad_info = {
            "speakers_detected": None,  # None value
            "speaker_summary": {
                "SPEAKER_01": "invalid_info",  # Invalid speaker info format
            },
        }
        well_formed = {
            "speakers_detected": ["SPEAKER_02"],  # Valid
            "speaker_summary": {
                "SPEAKER_02": {"total_duration": 50.0, "segment_count": 2},
            },
        }

        result = self._collect([wrong_types, none_and_bad_info, well_formed], True)

        # Invalid data must be skipped without error, leaving the valid chunk only
        assert result["global_speaker_count"] == 1
        assert result["speakers_detected"] == ["SPEAKER_02"]
        assert result["speaker_summary"]["SPEAKER_02"]["total_duration"] == 50.0

    def test_collect_speaker_information_disabled(self):
        """With diarization disabled the collected result is an empty dict."""
        chunks = [{"speakers_detected": ["SPEAKER_01"]}]

        result = self._collect(chunks, False)

        # Should return empty result when disabled
        assert result == {}
if __name__ == "__main__":
    # Run this file's tests with pytest and hand pytest's exit code back to
    # the caller, so that running the file as a script fails when a test fails.
    raise SystemExit(pytest.main([__file__, "-v"]))