File size: 4,889 Bytes
381c43b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import librosa
import parselmouth
from parselmouth.praat import call

import numpy as np


class ProsodicFeatureExtractor:
    """
    A class for extracting various prosodic features from audio data.

    The librosa-based features (F0, energy) are computed from ``y`` / ``sr``;
    the Praat-based features (speaking rate, pauses, formants) are computed
    from ``audio_arr`` / ``orig_sr`` through parselmouth.

    Attributes:
        y (numpy.array): Audio time series.
        sr (int): Sampling rate of the audio time series.
        audio_arr (numpy.array): Original audio array for parselmouth processing.
        orig_sr (int): Original sampling rate of the audio array.

    Methods:
        extract(features_to_extract=None): Extracts specified prosodic features from audio.
        extract_f0(): Extracts fundamental frequency (F0) from audio.
        extract_energy(): Extracts energy from audio.
        extract_speaking_rate(): Estimates the speaking rate from audio.
        extract_pauses(): Detects pauses from audio.
        extract_formants(): Extracts formant frequencies from audio.
    """

    # Fraction of the maximum intensity value a frame must exceed to be
    # counted by extract_speaking_rate().
    _INTENSITY_THRESHOLD_RATIO = 0.3

    def __init__(self, y, sr, audio_arr, orig_sr):
        """
        Initializes the ProsodicFeatureExtractor with audio data.

        Args:
            y: Audio time series handed to librosa.
            sr: Sampling rate of ``y``.
            audio_arr: Audio samples handed to parselmouth.
            orig_sr: Sampling rate of ``audio_arr``.
        """
        self.y = y
        self.sr = sr
        self.audio_arr = audio_arr
        self.orig_sr = orig_sr

    def _to_praat_sound(self):
        """Wraps ``audio_arr`` in a parselmouth ``Sound`` sampled at ``orig_sr``."""
        return parselmouth.Sound(self.audio_arr, sampling_frequency=self.orig_sr)

    def extract(self, features_to_extract=None):
        """
        Extracts the specified prosodic features.

        Args:
            features_to_extract (list or str, optional): Feature names to
                extract ('f0', 'energy', 'speaking_rate', 'pauses',
                'formants'). A single name may be given as a plain string.
                Defaults to all available features if None. Names that are
                not recognised are skipped.

        Returns:
            dict: A dictionary mapping each extracted feature name to its value.
        """
        feature_funcs = {
            'f0': self.extract_f0,
            'energy': self.extract_energy,
            'speaking_rate': self.extract_speaking_rate,
            'pauses': self.extract_pauses,
            'formants': self.extract_formants,
        }

        if features_to_extract is None:
            features_to_extract = feature_funcs
        elif isinstance(features_to_extract, str):
            # Iterating a bare string would walk its characters and match
            # nothing; treat it as a single feature name instead.
            features_to_extract = [features_to_extract]

        features = {}
        for feature in features_to_extract:
            extractor = feature_funcs.get(feature)
            if extractor is None:
                # Unknown names are skipped rather than raising.
                continue
            result = extractor()
            if isinstance(result, tuple):
                # NOTE(review): none of the extractors in this class returns a
                # tuple; branch kept as-is in case an overriding extractor
                # returns key/value pairs.
                features.update(result)
            else:
                features[feature] = result

        return features

    def extract_f0(self):
        """
        Extracts the fundamental frequency (F0) using PYIN algorithm.

        The search range is C2..C7. ``self.sr`` is forwarded to librosa so the
        estimates are in Hz for the actual sampling rate of ``y``; without it
        librosa assumes its own default rate. Frames for which librosa reports
        NaN are returned as 0.

        Returns:
            numpy.ndarray: F0 value per analysis frame.
        """
        pyin_kwargs = {}
        if self.sr is not None:
            pyin_kwargs['sr'] = self.sr
        f0, _voiced_flag, _voiced_probs = librosa.pyin(
            self.y,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            **pyin_kwargs,
        )
        return np.nan_to_num(f0)

    def extract_energy(self):
        """
        Extracts the root-mean-square (RMS) energy from the audio.

        Returns:
            numpy.ndarray: First row of ``librosa.feature.rms`` computed on ``y``.
        """
        return librosa.feature.rms(y=self.y)[0]

    def extract_speaking_rate(self):
        """
        Estimates the speaking rate from the Praat intensity contour.

        Counts the intensity frames whose value exceeds 0.3 times the maximum
        intensity value and divides that count by the total duration.

        NOTE(review): this counts frames above the threshold, not distinct
        intensity peaks, so the value scales with the intensity frame rate
        rather than with syllable nuclei. Confirm whether peak counting was
        intended before relying on this as syllables per second.

        Returns:
            float or None: Count per second, or None if extraction failed
            (the error is printed).
        """
        try:
            snd = self._to_praat_sound()
            total_duration = snd.get_total_duration()
            intensity_values = np.asarray(snd.to_intensity().values).ravel()
            threshold = self._INTENSITY_THRESHOLD_RATIO * intensity_values.max()
            frames_above = int(np.count_nonzero(intensity_values > threshold))
            return frames_above / total_duration
        except Exception as e:
            print(f'Error extracting speaking rate: {e}')
            return None

    def extract_pauses(self):
        """
        Identifies and timestamps pauses in the audio.

        Uses Praat's "To TextGrid (silences)" and collects every interval of
        tier 1 that is labelled "silent".

        Returns:
            list or None: ``(start_time, end_time)`` tuples, one per silent
            interval, or None if extraction failed (the error is printed).
        """
        try:
            snd = self._to_praat_sound()
            # Positional arguments per the Praat manual: minimum pitch (Hz),
            # time step (0 = automatic), silence threshold (dB), minimum
            # silent interval (s), minimum sounding interval (s), then the
            # labels for silent and sounding intervals.
            silences = call(snd, "To TextGrid (silences)", 100, 0, -25, 0.1, 0.1, "silent", "sounding")
            interval_count = call(silences, "Get number of intervals", 1)
            pauses = []
            for idx in range(1, interval_count + 1):  # Praat intervals are 1-based
                if call(silences, "Get label of interval", 1, idx) != "silent":
                    continue
                start = call(silences, "Get start time of interval", 1, idx)
                end = call(silences, "Get end time of interval", 1, idx)
                pauses.append((start, end))
            return pauses
        except Exception as e:
            print(f'Error extracting pauses: {e}')
            return None

    def extract_formants(self):
        """
        Extracts the first three formant frequencies using the Burg method.

        Returns:
            dict: ``F{i}_mean`` and ``F{i}_stdev`` (in Hertz) for i in 1..3,
            or an empty dict if extraction failed (the error is printed).
        """
        try:
            snd = self._to_praat_sound()
            # Positional arguments per the Praat manual: time step (s),
            # maximum number of formants, formant ceiling (Hz), window
            # length (s), pre-emphasis from (Hz).
            formant = call(snd, "To Formant (burg)", 0.025, 5, 5500, 0.025, 50)
            formant_values = {}
            for i in range(1, 4):  # Extracting the first three formants
                # The two zeros select the full time range of the sound.
                formant_values[f'F{i}_mean'] = call(formant, "Get mean", i, 0, 0, "Hertz")
                formant_values[f'F{i}_stdev'] = call(formant, "Get standard deviation", i, 0, 0, "Hertz")
            return formant_values
        except Exception as e:
            print(f'Error extracting formants: {e}')
            return {}