"""
Script to read log-loss data of many sentences and characterize the empirical distribution.
We also report the mean log-loss as a function of sentence length
"""
from scipy.interpolate import RectBivariateSpline, interp1d
import numpy as np

def fit_survival_func(xx, log_space=True):
    """
    Returns an estimated survival function fitted to the data in :xx: using
    interpolation.

    Args:
        :xx:  1-D array of data points (e.g. per-sentence log-loss values)
        :log_space:  if True, the returned interpolator gives -log of the
                     survival probability instead of the probability itself.

    Returns:
         univariate function
    """
    assert len(xx) > 0

    eps = 1 / len(xx)   # smallest attainable empirical tail probability
    inf = 1 / eps

    sxx = np.sort(xx)
    # empirical survival probability at every sorted data point: P(X >= x)
    qq = np.mean(np.expand_dims(sxx, 1) >= sxx, 0)

    if log_space:
        qq = -np.log(qq)
        # below the data range -log(S) = 0 (S = 1); above it, cap at log(1/eps)
        return interp1d(sxx, qq, fill_value=(0, np.log(inf)), bounds_error=False)
    else:
        # below the data range S = 1; above it S = 0
        return interp1d(sxx, qq, fill_value=(1, 0), bounds_error=False)
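
# Usage sketch (illustrative; not part of the original script): with log_space=True the
# interpolator returns -log of the empirical survival probability, so a right-tail
# p-value for a new observation x is recovered via np.exp(-f(x)). For example:
#
#   sample = np.random.default_rng(0).normal(loc=3.0, scale=1.0, size=1000)  # synthetic log-losses
#   f = fit_survival_func(sample, log_space=True)
#   pval = np.exp(-f(3.5))   # empirical estimate of P(log-loss >= 3.5)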


def fit_per_length_survival_function(lengths, xx, G=501, log_space=True):
    """
    Returns a survival function for every sentence length in tokens.
    Use 2D interpolation over the empirical survival function of the pairs (length, x)
    
    Args:
        :lengths:, :xx:, 1-D arrays
        :G:  number of grid points to use in the interpolation in the xx dimension
        :log_space:  indicates whether result is in log space or not.

    Returns:
        bivariate function (length, x) -> [0,1]
    """

    assert len(lengths) == len(xx)

    min_tokens_per_sentence = lengths.min()
    max_tokens_per_sentence = lengths.max()
    # one survival function per observed sentence length (upper bound inclusive)
    ll = np.arange(min_tokens_per_sentence, max_tokens_per_sentence + 1)

    ppx_min_val = xx.min()
    ppx_max_val = xx.max()
    xx0 = np.linspace(ppx_min_val, ppx_max_val, G)  # common evaluation grid in the xx dimension

    ll_valid = []
    zz = []
    for l in ll:
        xx1 = xx[lengths == l]
        if len(xx1) > 1:  # need at least two observations to interpolate
            univariate_survival_func = fit_survival_func(xx1, log_space=log_space)
            ll_valid.append(l)
            zz.append(univariate_survival_func(xx0))

    # interpolate the per-length survival curves over the (length, xx) grid
    func = RectBivariateSpline(np.array(ll_valid), xx0, np.vstack(zz))
    if log_space:
        def func2d(x, y):
            # undo the -log transform so the result is a survival probability
            return np.exp(-func(x, y))
        return func2d
    else:
        return func
    

# Commented-out example: fit the per-length survival function on a null data set and
# pickle it for later use as a log-loss -> p-value function.
# import pickle
# import pandas as pd
# df = pd.read_csv(r'D:\.Idan\תואר שני\תזה\detectLM\article_null.csv')
# LOGLOSS_PVAL_FUNC_FILE = r'D:\.Idan\תואר שני\תזה\detectLM\example\logloss_pval_function.pkl'
# LOGLOSS_PVAL_FUNC_FILE_TEST = r'D:\.Idan\תואר שני\תזה\detectLM\example\logloss_pval_function_test.pkl'
# with open(LOGLOSS_PVAL_FUNC_FILE, 'wb') as handle:
#     pickle.dump(fit_per_length_survival_function(df['length'].values, df['response'].values), handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open(LOGLOSS_PVAL_FUNC_FILE, 'rb') as f:
#     data = pickle.load(f)
#     print(data)

# with open(LOGLOSS_PVAL_FUNC_FILE_TEST, 'rb') as f:
#     data = pickle.load(f)
#     print(data)
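
# Minimal self-contained demo on synthetic data (an illustrative sketch; the lengths and
# log-loss values below are made up, not taken from the original pipeline).
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n = 5000
    lengths = rng.integers(5, 40, size=n)                         # sentence lengths in tokens
    responses = rng.normal(loc=3.0 + 0.01 * lengths, scale=1.0)   # synthetic per-sentence log-losses

    # fit the bivariate survival function and query a few (length, log-loss) pairs
    surv2d = fit_per_length_survival_function(lengths, responses, G=101)
    for length, value in [(10, 3.0), (20, 4.0), (35, 5.0)]:
        pval = surv2d(length, value).item()   # empirical estimate of P(log-loss >= value | length)
        print(f"P(log-loss >= {value} | length = {length}) ~= {pval:.3f}")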