File size: 2,436 Bytes
e701383
a2df844
 
ea19a0e
26ba744
e701383
176ce75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea19a0e
e1503ef
 
 
 
 
 
 
 
 
 
 
fb031e1
26ba744
 
 
 
176ce75
 
 
 
 
e1503ef
 
a01aab6
 
e701383
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import gradio as gr
import parselmouth
from parselmouth.praat import call
import numpy as np
import pandas as pd

def find_rises_and_peaks_gradient(data, threshold=4, min_successive_rise=3):
    """Flag the start and the peak of sustained pitch rises.

    A rise is a run of at least ``min_successive_rise`` consecutive samples
    whose ``np.gradient`` value is ``>= threshold``.  For every such run the
    sample just before the run is flagged in ``rise_point`` and the last
    sample of the run is flagged in ``peak_point``.

    Args:
        data: DataFrame with a numeric ``'pitch'`` column; NaN entries are
            treated as gaps that interrupt any rise.
        threshold: Minimum per-sample gradient that counts as rising.
        min_successive_rise: Minimum number of consecutive rising samples
            required for a run to be reported.

    Returns:
        The same DataFrame object, with integer 0/1 columns ``rise_point``
        and ``peak_point`` added (or overwritten) in place.
    """
    pitch_values = data['pitch'].values
    n_samples = len(pitch_values)

    # Flags are collected positionally and assigned as whole columns, so the
    # result does not depend on the DataFrame having a default RangeIndex.
    rise_flags = np.zeros(n_samples, dtype=np.int64)
    peak_flags = np.zeros(n_samples, dtype=np.int64)

    # np.gradient needs at least two samples; shorter input has no rises.
    if n_samples >= 2:
        gradients = np.gradient(pitch_values)

        in_rise = False
        rise_start = 0
        run_length = 0
        # False until one non-NaN sample has been seen since the start or the
        # last NaN; that first sample is skipped rather than evaluated.
        armed = False

        for i in range(1, n_samples):
            if np.isnan(pitch_values[i]):
                # A gap cancels any rise in progress and disarms detection.
                armed = False
                in_rise = False
                run_length = 0
                continue

            if not armed:
                armed = True
                continue

            if gradients[i] >= threshold:
                if not in_rise:
                    in_rise = True
                    rise_start = i - 1
                run_length += 1
                continue

            # Gradient fell below the threshold (or is NaN): close the run.
            if in_rise and run_length >= min_successive_rise:
                rise_flags[rise_start] = 1
                peak_flags[i - 1] = 1
            in_rise = False
            run_length = 0

        # A rise that is still in progress at the final sample would otherwise
        # be lost; close it with the last sample as its peak.
        if in_rise and run_length >= min_successive_rise:
            rise_flags[rise_start] = 1
            peak_flags[n_samples - 1] = 1

    data['rise_point'] = rise_flags
    data['peak_point'] = peak_flags
    return data

def get_pitch(audio_data):
    """Extract a pitch track from audio and return it as a JSON string.

    Args:
        audio_data: ``(sample_rate, samples)`` pair, or ``None`` when no
            audio was supplied.  ``samples`` is a NumPy array; when it has
            more than one dimension, axis 1 is averaged to produce mono.

    Returns:
        A JSON string (``orient='records'``) with one record per pitch frame
        holding ``time``, ``pitch`` (NaN where the frequency was 0),
        ``rise_point`` and ``peak_point``; or a string starting with
        ``"Error in pitch extraction: "`` when extraction fails.
    """
    # The UI passes None when the audio input is empty; report that the same
    # way as other failures instead of crashing on the tuple unpack below.
    if audio_data is None:
        return "Error in pitch extraction: no audio provided"

    rate, data = audio_data
    if data.ndim > 1:  # Stereo or multi-channel audio
        data = np.mean(data, axis=1)  # Convert to mono by averaging channels

    # Convert data to float64 for compatibility with Parselmouth
    data = data.astype('float64')

    sound = parselmouth.Sound(values=data, sampling_frequency=rate)
    try:
        # Praat "To Pitch" arguments: time step (0.0, presumably "automatic"),
        # then pitch floor and ceiling (75 and 500, presumably in Hz).
        pitch = call(sound, "To Pitch", 0.0, 75, 500)
        pitch_values = pitch.selected_array['frequency']
        # Frames with frequency 0 (presumably unvoiced) become NaN so that
        # they act as gaps during rise detection.
        pitch_values[pitch_values == 0] = np.nan
        df_pitch = pd.DataFrame(
            np.column_stack([pitch.xs(), pitch_values]),
            columns=['time', 'pitch'],
        )

        df_pitch = find_rises_and_peaks_gradient(df_pitch)

        return df_pitch.to_json(orient='records')
    except Exception as e:
        # UI boundary: surface the failure as text rather than raising.
        return "Error in pitch extraction: " + str(e)

# Web UI: one audio input is handed to get_pitch and the string it returns
# is displayed as text.
demo = gr.Interface(
    fn=get_pitch,
    inputs="audio",
    outputs="text",
)
# Start serving the interface as soon as this module is executed.
demo.launch()