File size: 3,862 Bytes
a459327
 
 
 
 
 
7dfa01d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a459327
 
 
 
 
 
 
7dfa01d
 
 
 
 
 
 
 
a459327
bc0fbf6
 
 
 
 
 
 
 
db1df57
bc0fbf6
db1df57
 
bc0fbf6
93aef48
db1df57
bc0fbf6
 
 
 
db1df57
bc0fbf6
 
 
 
 
8df3af9
bc0fbf6
 
8df3af9
bc0fbf6
db1df57
8df3af9
bc0fbf6
a459327
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7dfa01d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import torch
import numpy as np
import librosa

def load_audio(audio_file, sr=22050):
    """Load an audio file as a mono waveform.

    librosa is tried first. If it raises, the file is read with soundfile
    instead, down-mixed to mono, and resampled to the requested rate so that
    both code paths return comparable data.

    Args:
        audio_file: Path (or other source) handed to the audio loaders.
        sr: Target sample rate in Hz. ``None`` keeps the file's native rate.

    Returns:
        A ``(y, sr)`` tuple: the mono waveform and the sample rate it is
        actually at.

    Raises:
        ValueError: If neither librosa nor soundfile can read the file.
    """
    try:
        # Try to load audio with librosa (resamples to `sr`, mixes to mono)
        y, loaded_sr = librosa.load(audio_file, sr=sr, mono=True)
        return y, loaded_sr
    except Exception as e:
        print(f"Error loading audio with librosa: {str(e)}")
        # Imported lazily so soundfile is only needed when the fallback runs.
        import soundfile as sf
        try:
            # float32 matches the dtype librosa.load returns by default.
            y, native_sr = sf.read(audio_file, dtype="float32")
        except Exception as e2:
            print(f"Error loading audio with soundfile: {str(e2)}")
            # Chain explicitly so the soundfile error is kept as the cause.
            raise ValueError(f"Could not load audio file: {audio_file}") from e2

        # soundfile returns (frames, channels) for multi-channel audio.
        if y.ndim > 1:
            y = y.mean(axis=1)

        # Honor the requested rate like the librosa path does. This is
        # best-effort: if resampling fails, return the audio at its native
        # rate together with that (accurate) rate.
        if sr is not None and native_sr != sr:
            try:
                y = librosa.resample(y, orig_sr=native_sr, target_sr=sr)
                native_sr = sr
            except Exception as e3:
                print(f"Error resampling audio to {sr} Hz: {str(e3)}")

        return y, native_sr

def extract_audio_duration(y, sr):
    """Return how long the signal lasts, in seconds.

    The duration is the number of samples in ``y`` divided by the sample
    rate ``sr``.
    """
    sample_count = len(y)
    seconds = sample_count / sr
    return seconds

def extract_mfcc_features(y, sr, n_mfcc=20):
    """Summarize an audio signal as a vector of mean MFCC values.

    The MFCC matrix computed by librosa is transposed and averaged along
    axis 0. Any exception raised along the way is printed and an all-zero
    vector of length ``n_mfcc`` is returned instead, so this function does
    not propagate extraction errors.
    """
    try:
        coefficient_matrix = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        return np.mean(coefficient_matrix.T, axis=0)
    except Exception as e:
        print(f"Error extracting MFCCs: {str(e)}")

    # Only reached when extraction failed: hand back a neutral feature vector.
    return np.zeros(n_mfcc)

def calculate_lyrics_length(duration, words_per_minute=90, words_per_line=9):
    """Estimate how many lyric lines suit a track of the given duration.

    The word budget is ``duration`` (in minutes) times ``words_per_minute``,
    truncated to an integer, and is converted to lines by floor division
    with ``words_per_line``. The raw line count is then snapped to a simple
    song structure:

    - fewer than 6 lines: the raw count, but never less than 2
    - 6 to 9 lines: 6 (one verse and chorus)
    - 10 to 14 lines: 10 (two verses and chorus)
    - 15 or more lines: 15 (two verses, chorus, and bridge)

    Args:
        duration: Audio duration in seconds.
        words_per_minute: Assumed lyric density. Defaults to 90.
        words_per_line: Assumed average words per line. Defaults to 9.

    Returns:
        The suggested number of lyric lines (between 2 and 15).

    Raises:
        ValueError: If ``words_per_line`` is not positive.
    """
    if words_per_line <= 0:
        raise ValueError("words_per_line must be a positive number")

    # Convert duration to minutes
    duration_minutes = duration / 60

    # Word budget for the whole track, then how many lines that fills
    total_words = int(duration_minutes * words_per_minute)
    total_lines = total_words // words_per_line

    # Snap to the song-structure tiers. Within each tier the cap is reached
    # by definition, so the tier value is returned directly.
    if total_lines < 6:
        # Very short song - keep it simple
        return max(2, total_lines)
    if total_lines < 10:
        # Short song - one verse and chorus
        return 6
    if total_lines < 15:
        # Medium song - two verses and chorus
        return 10
    # Longer song - two verses, chorus, and bridge
    return 15

def format_genre_results(top_genres):
    """Render genre predictions as a human-readable, multi-line string.

    ``top_genres`` is iterated as ``(genre, confidence)`` pairs. The output
    starts with a header line, followed by one bullet per pair with the
    confidence shown as a percentage with two decimals.
    """
    pieces = ["Top Detected Genres:\n"]
    pieces.extend(
        f"- {genre}: {confidence*100:.2f}%\n" for genre, confidence in top_genres
    )
    return "".join(pieces)

def ensure_cuda_availability():
    """Print whether CUDA can be used and return that answer.

    Nothing is configured here; the function only asks torch whether CUDA is
    available, prints a short status message, and returns the result.
    """
    has_cuda = torch.cuda.is_available()

    if not has_cuda:
        print("CUDA is not available. Using CPU for inference.")
        return has_cuda

    device_count = torch.cuda.device_count()
    if device_count > 0:
        device_name = torch.cuda.get_device_name(0)
    else:
        device_name = "Unknown"
    print(f"CUDA is available with {device_count} device(s). Using: {device_name}")
    return has_cuda

def preprocess_audio_for_model(waveform, sample_rate, target_sample_rate=16000, max_length=16000):
    """Bring a waveform to the sample rate and length a model expects.

    The waveform is resampled with librosa when ``sample_rate`` differs from
    ``target_sample_rate``. It is then cut to its first ``max_length``
    samples if too long, or right-padded with zeros if too short.
    """
    rates_differ = sample_rate != target_sample_rate
    if rates_differ:
        waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=target_sample_rate)

    n_samples = len(waveform)

    # Too long: keep only the leading max_length samples.
    if n_samples > max_length:
        return waveform[:max_length]

    # Too short: append zeros at the end until max_length is reached.
    if n_samples < max_length:
        missing = max_length - n_samples
        return np.pad(waveform, (0, missing), 'constant')

    # Already the expected length.
    return waveform