# Source: Hugging Face Space "ajaykarthick/audio_deepfake_detection",
# revision 381c43b (file size: 4,889 bytes; Space status when captured:
# "Runtime error").

import librosa
import parselmouth
from parselmouth.praat import call

import numpy as np


class ProsodicFeatureExtractor:
    """
    Extracts prosodic features (pitch, energy, rate, pauses, formants) from audio.

    Two views of the audio are kept: ``y``/``sr`` are handed to librosa
    (F0 and RMS energy), while ``audio_arr``/``orig_sr`` are handed to
    parselmouth/Praat (speaking rate, pauses and formants).

    Attributes:
        y (numpy.array): Audio time series used by the librosa extractors.
        sr (int): Sampling rate of ``y``.
        audio_arr (numpy.array): Original audio array for parselmouth processing.
        orig_sr (int): Original sampling rate of ``audio_arr``.

    Methods:
        extract(features_to_extract=None): Extracts specified prosodic features from audio.
        extract_f0(): Extracts fundamental frequency (F0) from audio.
        extract_energy(): Extracts energy from audio.
        extract_speaking_rate(): Estimates the speaking rate from audio.
        extract_pauses(): Detects pauses from audio.
        extract_formants(): Extracts formant frequencies from audio.
    """

    def __init__(self, y, sr, audio_arr, orig_sr):
        """
        Initializes the ProsodicFeatureExtractor with audio data.

        Args:
            y (numpy.array): Audio time series for the librosa-based features.
            sr (int): Sampling rate of ``y``.
            audio_arr (numpy.array): Audio samples for the Praat-based features.
            orig_sr (int): Sampling rate of ``audio_arr``.
        """
        self.y = y
        self.sr = sr
        self.audio_arr = audio_arr
        self.orig_sr = orig_sr

    def extract(self, features_to_extract=None) -> dict:
        """
        Extracts the specified prosodic features.

        Args:
            features_to_extract (list or str, optional): Feature names to
                extract, chosen from ``'f0'``, ``'energy'``,
                ``'speaking_rate'``, ``'pauses'`` and ``'formants'``. A single
                name may be given as a plain string. Defaults to all
                available features if None. Unknown names are ignored.

        Returns:
            dict: Maps each requested (and known) feature name to the value
                returned by its extractor.
        """
        feature_funcs = {
            'f0': self.extract_f0,
            'energy': self.extract_energy,
            'speaking_rate': self.extract_speaking_rate,
            'pauses': self.extract_pauses,
            'formants': self.extract_formants
        }

        if features_to_extract is None:
            features_to_extract = feature_funcs
        elif isinstance(features_to_extract, str):
            # A bare string would otherwise be iterated character by
            # character and silently match nothing.
            features_to_extract = [features_to_extract]

        features = {}
        for feature in features_to_extract:
            func = feature_funcs.get(feature)
            if func is None:
                continue  # unknown feature names are skipped, not an error
            result = func()
            if isinstance(result, tuple):
                # None of the extractors defined on this class return a
                # tuple; kept so that overriding extractors that return
                # (key, value) pairs keep being merged into the result.
                features.update(result)
            else:
                features[feature] = result

        return features

    def extract_f0(self) -> np.ndarray:
        """
        Extracts the fundamental frequency (F0) using PYIN algorithm.

        The search range is C2..C7. Frames for which pYIN reports no F0
        (NaN) are replaced by 0.

        Returns:
            numpy.ndarray: Per-frame F0 estimates with NaN replaced by 0.
        """
        # sr must be passed explicitly: librosa.pyin otherwise assumes
        # 22050 Hz, which mis-scales F0 whenever self.sr differs.
        f0, _, _ = librosa.pyin(
            self.y,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=self.sr,
        )
        return np.nan_to_num(f0)

    def extract_energy(self):
        """
        Extracts the root-mean-square (RMS) energy from the audio.

        Returns:
            numpy.ndarray: First row of ``librosa.feature.rms`` for ``y``.
        """
        return librosa.feature.rms(y=self.y)[0]

    def _make_sound(self):
        """Builds a parselmouth Sound from ``audio_arr`` at ``orig_sr``."""
        return parselmouth.Sound(self.audio_arr, sampling_frequency=self.orig_sr)

    def extract_speaking_rate(self):
        """
        Estimates a speaking-rate proxy from the Praat intensity contour.

        NOTE: despite the name, no syllable segmentation is performed. The
        value is the number of intensity frames whose value exceeds 30% of
        the maximum intensity, divided by the total duration in seconds.

        Returns:
            float or None: The rate, or None if extraction failed (the error
                is printed).
        """
        try:
            snd = self._make_sound()
            total_duration = snd.get_total_duration()
            intensity_values = np.asarray(snd.to_intensity().values).ravel()
            threshold = 0.3 * intensity_values.max()
            loud_frame_count = int(np.count_nonzero(intensity_values > threshold))
            return loud_frame_count / total_duration
        except Exception as e:
            # Best-effort feature: report the problem and signal "no value".
            print(f'Error extracting speaking rate: {e}')
            return None

    def extract_pauses(self):
        """
        Identifies and timestamps pauses in the audio.

        Uses Praat's "To TextGrid (silences)" and collects every interval on
        tier 1 that is labelled "silent".

        Returns:
            list or None: ``(start_time, end_time)`` tuples, one per silent
                interval, or None if extraction failed (the error is printed).
        """
        try:
            snd = self._make_sound()
            # Praat arguments, in order: minimum pitch (Hz), time step
            # (0 = automatic), silence threshold (dB), minimum silent
            # interval (s), minimum sounding interval (s), silent label,
            # sounding label.
            silences = call(snd, "To TextGrid (silences)", 100, 0, -25, 0.1, 0.1, "silent", "sounding")
            pauses = []
            # Praat interval indices are 1-based.
            for i in range(1, call(silences, "Get number of intervals", 1) + 1):
                if call(silences, "Get label of interval", 1, i) == "silent":
                    start = call(silences, "Get start time of interval", 1, i)
                    end = call(silences, "Get end time of interval", 1, i)
                    pauses.append((start, end))
            return pauses
        except Exception as e:
            # Best-effort feature: report the problem and signal "no value".
            print(f'Error extracting pauses: {e}')
            return None

    def extract_formants(self) -> dict:
        """
        Extracts the first three formant frequencies using the Burg method.

        Returns:
            dict: Keys ``F1_mean``, ``F1_stdev`` ... ``F3_mean``, ``F3_stdev``
                (values in Hertz), or an empty dict if extraction failed (the
                error is printed).
        """
        try:
            snd = self._make_sound()
            # Praat arguments, in order: time step (s), maximum number of
            # formants, maximum formant (Hz), window length (s),
            # pre-emphasis from (Hz).
            formant = call(snd, "To Formant (burg)", 0.025, 5, 5500, 0.025, 50)
            formant_values = {}
            for i in range(1, 4):  # Extracting the first three formants
                # A time range of 0..0 asks Praat for the whole signal.
                formant_values[f'F{i}_mean'] = call(formant, "Get mean", i, 0, 0, "Hertz")
                formant_values[f'F{i}_stdev'] = call(formant, "Get standard deviation", i, 0, 0, "Hertz")
            return formant_values
        except Exception as e:
            # Best-effort feature: report the problem and return no formants.
            print(f'Error extracting formants: {e}')
            return {}