# coding=utf-8
# Copyright 2019 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""Generic helper functions used across codebase."""
import warnings
from collections import namedtuple
from datetime import datetime
import os
import math
import pathlib
import torch
import numpy as np
import pandas as pd
from typing import List, Tuple
from sklearn import preprocessing

# silence pandas SettingWithCopyWarning from assignments on DataFrame slices
pd.options.mode.chained_assignment = None

import data_formatter
from data_formatter import types

DataTypes = types.DataTypes
InputTypes = types.InputTypes
MINUTE = 60

# OS related functions.
def create_folder_if_not_exist(directory):
  """Creates folder if it doesn't exist.

  Args:
    directory: Folder path to create.
  """
  # Also creates directories recursively
  pathlib.Path(directory).mkdir(parents=True, exist_ok=True)


def csv_path_to_folder(path: str):
  """Returns the folder containing the given csv file, with a trailing slash."""
  return "/".join(path.split('/')[:-1]) + "/"


def interpolate(data: pd.DataFrame,
                column_definition: List[Tuple[str, DataTypes, InputTypes]],
                gap_threshold: int = 0,
                min_drop_length: int = 0,
                interval_length: int = 0):
  """Interpolates missing values in data.

  Args:
    data: Dataframe to interpolate on. Sorted by id and then time (a DateTime object).
    column_definition: List of tuples describing columns (column_name, data_type, input_type).
    gap_threshold: Maximum allowed gap in minutes; a larger gap between consecutive rows starts a new segment.
    min_drop_length: Minimum number of points a segment must contain; shorter segments are dropped.
    interval_length: Resampling interval between consecutive output points, passed to pandas as a frequency.

  Returns:
    data: DataFrame with missing values interpolated and an
          additional column ('id_segment') indicating continuous segments.
    column_definition: Updated list of tuples (column_name, data_type, input_type).
  """
  # select all real-valued columns that are not id, time, or static
  interpolation_columns = [column_name for column_name, data_type, input_type in column_definition if 
    data_type == DataTypes.REAL_VALUED and 
    input_type not in set([InputTypes.ID, InputTypes.TIME, InputTypes.STATIC_INPUT])]
  # select all other columns except time
  constant_columns = [column_name for column_name, data_type, input_type in column_definition if
    input_type not in set([InputTypes.TIME])]
  constant_columns += ['id_segment']

  # get id and time columns
  id_col = [column_name for column_name, data_type, input_type in column_definition if input_type == InputTypes.ID][0]
  time_col = [column_name for column_name, data_type, input_type in column_definition if input_type == InputTypes.TIME][0]
  
  # round to minute
  data[time_col] = data[time_col].dt.round('1min')
  # count dropped segments
  dropped_segments = 0
  # count number of values that are interpolated
  interpolation_count = 0
  # store final output
  output = []
  for id, id_data in data.groupby(id_col):
    # sort values 
    id_data.sort_values(time_col, inplace=True)
    # get time difference between consecutive rows
    lag = (id_data[time_col].diff().dt.total_seconds().fillna(0) / 60.0).astype(int)
    # start a new segment whenever the gap between consecutive rows exceeds gap_threshold minutes
    id_segment = (lag > gap_threshold).cumsum()
    id_data['id_segment'] = id_segment
    for segment, segment_data in id_data.groupby('id_segment'):
      # if segment is too short, then we don't interpolate
      if len(segment_data) < min_drop_length:
        dropped_segments += 1
        continue
      
      # find and print duplicated times
      duplicates = segment_data.duplicated(time_col, keep=False)
      if duplicates.any():
        print(segment_data[duplicates])
        raise ValueError('Duplicate times in segment {} of id {}'.format(segment, id))

      # reindex at interval_length minute intervals
      segment_data = segment_data.set_index(time_col)
      index_new = pd.date_range(start=segment_data.index[0],
                                end=segment_data.index[-1],
                                freq=interval_length)
      index_union = index_new.union(segment_data.index)
      segment_data = segment_data.reindex(index_union)
      # count nan values in the first interpolation column as a proxy for the number of interpolated rows
      interpolation_count += segment_data[interpolation_columns[0]].isna().sum()
      # interpolate
      segment_data[interpolation_columns] = segment_data[interpolation_columns].interpolate(method='index')
      # fill constant columns with last value
      segment_data[constant_columns] = segment_data[constant_columns].ffill()
      # delete rows not conforming to frequency
      segment_data = segment_data.reindex(index_new)
      # reset index, make the time a column with name time_col
      segment_data = segment_data.reset_index().rename(columns={'index': time_col})
      # set the id_segment to position in output
      segment_data['id_segment'] = len(output)
      # add to output
      output.append(segment_data)
  # print number of dropped segments and number of segments
  print('\tDropped segments: {}'.format(dropped_segments))
  print('\tExtracted segments: {}'.format(len(output)))
  # concat all segments and reset index
  output = pd.concat(output)
  output.reset_index(drop=True, inplace=True)
  # count number of interpolated values
  print('\tInterpolated values: {}'.format(interpolation_count))
  print('\tPercent of values interpolated: {:.2f}%'.format(interpolation_count / len(output) * 100))
  # add id_segment column to column_definition as ID
  column_definition += [('id_segment', DataTypes.CATEGORICAL, InputTypes.SID)]

  return output, column_definition
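
# A minimal usage sketch for `interpolate` (illustrative only, not part of the original
# pipeline): the column names, the toy values, and the '5min' frequency string passed as
# `interval_length` are assumptions. It mainly shows the expected
# (column_name, data_type, input_type) layout of `column_definition`.
def _example_interpolate():
  toy = pd.DataFrame({
      'id': ['a'] * 4,
      'time': pd.to_datetime(['2020-01-01 00:00', '2020-01-01 00:05',
                              '2020-01-01 00:15', '2020-01-01 02:00']),
      'value': [5.1, 5.4, 6.0, 5.2],
  })
  toy_columns = [('id', DataTypes.CATEGORICAL, InputTypes.ID),
                 ('time', DataTypes.DATE, InputTypes.TIME),
                 ('value', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT)]
  # gaps longer than 30 minutes start a new segment; segments shorter than 3 points are dropped
  return interpolate(toy, toy_columns, gap_threshold=30, min_drop_length=3,
                     interval_length='5min')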

def create_index(time_col: pd.Series, interval_length: int):
  """Creates a new index at interval_length minute intervals.

  Args:
    time_col: Series of times.
    interval_length: Number in minutes, length of interpolation.

  Returns:
    index: New index.
  """
  # margin of error
  eps = pd.Timedelta('1min')
  new_time_col = [time_col.iloc[0]]
  for time in time_col.iloc[1:]:
    if time - new_time_col[-1] <= pd.Timedelta(interval_length) + eps:
      new_time_col.append(time)
    else:
      filler = new_time_col[-1] + pd.Timedelta(interval_length)
      while filler < time:
        new_time_col.append(filler)
        filler += pd.Timedelta(interval_length)
      new_time_col.append(time)
  return pd.to_datetime(new_time_col)
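
# Hedged example for `create_index` (illustrative; the timestamps and the '5min'
# spacing are assumptions): gaps in an irregular time column are filled at a fixed
# interval while the original timestamps are kept.
def _example_create_index():
  times = pd.Series(pd.to_datetime(['2020-01-01 00:00', '2020-01-01 00:05',
                                    '2020-01-01 00:20']))
  # yields 00:00, 00:05, 00:10, 00:15, 00:20 as a DatetimeIndex
  return create_index(times, '5min')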

def split(df: pd.DataFrame, 
          column_definition: List[Tuple[str, DataTypes, InputTypes]],
          test_percent_subjects: float, 
          length_segment: int,
          max_length_input: int,
          random_state: int = 42):
  """Splits data into train, validation and test sets.

  Args: 
    df: Dataframe to split.
    column_definition: List of tuples describing columns (column_name, data_type, input_type).
    test_percent_subjects: Float in [0, 1], fraction of subjects to hold out entirely for the OOD test set.
    length_segment: Number of points, length of segments saved for validation / test sets.
    max_length_input: Number of points, maximum length of input sequences for models.
    random_state: Number, Random state for reproducibility.

  Returns:
    train_idx: Training set indices.
    val_idx: Validation set indices.
    test_idx: Test set indices (in-distribution, i.e. seen subjects).
    test_idx_ood: Test set indices for held-out (out-of-distribution) subjects.
  """
  # set random state
  np.random.seed(random_state)
  # get id and id_segment columns
  id_col = [column_name for column_name, data_type, input_type in column_definition if input_type == InputTypes.ID][0]
  id_segment_col = [column_name for column_name, data_type, input_type in column_definition if input_type == InputTypes.SID][0]
  # get unique ids
  ids = df[id_col].unique()

  # select some subjects for test data set
  test_ids = np.random.choice(ids, math.ceil(len(ids) * test_percent_subjects), replace=False)
  test_idx_ood = list(df[df[id_col].isin(test_ids)].index)
  # keep the remaining data for training and validation
  df = df[~df[id_col].isin(test_ids)]

  # iterate through subjects and split into train, val and test
  train_idx = []; val_idx = []; test_idx = []
  for id, id_data in df.groupby(id_col):
    segment_ids = id_data[id_segment_col].unique()
    if len(segment_ids) >= 2:
      train_idx += list(id_data.loc[id_data[id_segment_col].isin(segment_ids[:-2])].index)
      penultimate_segment = id_data[id_data[id_segment_col] == segment_ids[-2]]
      last_segment = id_data[id_data[id_segment_col] == segment_ids[-1]]
      if len(last_segment) >= max_length_input + 3 * length_segment:
        train_idx += list(penultimate_segment.index)
        train_idx += list(last_segment.iloc[:-2*length_segment].index)
        val_idx += list(last_segment.iloc[-2*length_segment-max_length_input:-length_segment].index)
        test_idx += list(last_segment.iloc[-length_segment-max_length_input:].index)
      elif len(last_segment) >= max_length_input + 2 * length_segment:
        train_idx += list(penultimate_segment.index)
        val_idx += list(last_segment.iloc[:-length_segment].index)
        test_idx += list(last_segment.iloc[-length_segment-max_length_input:].index)
      else:
        test_idx += list(last_segment.index)
        if len(penultimate_segment) >= max_length_input + 2 * length_segment:
          val_idx += list(penultimate_segment.iloc[-length_segment-max_length_input:].index)
          train_idx += list(penultimate_segment.iloc[:-length_segment].index)
        else:
          train_idx += list(penultimate_segment.index)
    else:
      if len(id_data) >= max_length_input + 3 * length_segment:
        train_idx += list(id_data.iloc[:-2*length_segment].index)
        val_idx += list(id_data.iloc[-2*length_segment-max_length_input:-length_segment].index)
        test_idx += list(id_data.iloc[-length_segment-max_length_input:].index)
      elif len(id_data) >= max_length_input + 2 * length_segment:
        train_idx += list(id_data.iloc[:-length_segment].index)
        test_idx += list(id_data.iloc[-length_segment-max_length_input:].index)
      else:
        train_idx += list(id_data.index)
  total_len = len(train_idx) + len(val_idx) + len(test_idx) + len(test_idx_ood)
  print('\tTrain: {} ({:.2f}%)'.format(len(train_idx), len(train_idx) / total_len * 100))
  print('\tVal: {} ({:.2f}%)'.format(len(val_idx), len(val_idx) / total_len * 100))
  print('\tTest: {} ({:.2f}%)'.format(len(test_idx), len(test_idx) / total_len * 100))
  print('\tTest OOD: {} ({:.2f}%)'.format(len(test_idx_ood), len(test_idx_ood) / total_len * 100))
  return train_idx, val_idx, test_idx, test_idx_ood
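
# Hedged sketch for `split` (illustrative only): the tiny sizes (length_segment=2,
# max_length_input=1), the 50% subject hold-out, and the column names are assumptions
# chosen to keep the toy DataFrame small; real configs use far longer segments.
def _example_split():
  toy = pd.DataFrame({'id': ['a'] * 10 + ['b'] * 10,
                      'id_segment': [0] * 10 + [1] * 10,
                      'value': np.arange(20.0)})
  toy_columns = [('id', DataTypes.CATEGORICAL, InputTypes.ID),
                 ('id_segment', DataTypes.CATEGORICAL, InputTypes.SID),
                 ('value', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT)]
  # one of the two subjects is held out entirely (test_idx_ood); the other is split along time
  return split(toy, toy_columns, test_percent_subjects=0.5,
               length_segment=2, max_length_input=1)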

def encode(df: pd.DataFrame, 
          column_definition: List[Tuple[str, DataTypes, InputTypes]],
          date: List,):
  """Encodes categorical columns.

  Args: 
    df: Dataframe to encode.
    column_definition: List of tuples describing columns (column_name, data_type, input_type).
    date: List of str, names of datetime attributes to extract from DATE columns via the .dt accessor (e.g. 'year', 'month', 'day', 'hour').

  Returns:
    df: Dataframe with encoded columns.
    column_definition: Updated list of tuples containing column name and types.
    encoders: dictionary containing encoders.
  """ 
  encoders = {}
  new_columns = []
  for i in range(len(column_definition)):
    column, column_type, input_type = column_definition[i]
    if column_type == DataTypes.DATE:
      for extract_col in date:
        df[column + '_' + extract_col] = getattr(df[column].dt, extract_col)
        df[column + '_' + extract_col] = df[column + '_' + extract_col].astype(np.float32)
        new_columns.append((column + '_' + extract_col, DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT))
    elif column_type == DataTypes.CATEGORICAL:
      encoders[column] = preprocessing.LabelEncoder()
      df[column] = encoders[column].fit_transform(df[column]).astype(np.float32)
      column_definition[i] = (column, DataTypes.REAL_VALUED, input_type)
    else:
      continue
  column_definition += new_columns
  # print updated column definition
  print('\tUpdated column definition:')
  for column, column_type, input_type in column_definition:
    print('\t\t{}: {} ({})'.format(column, 
                                   DataTypes(column_type).name, 
                                   InputTypes(input_type).name))
  return df, column_definition, encoders
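
# Hedged sketch for `encode` (illustrative; the column names and the extracted 'hour'
# attribute are assumptions): DATE columns get an extra real-valued column per requested
# datetime attribute, and CATEGORICAL columns are label-encoded in place.
def _example_encode():
  toy = pd.DataFrame({'id': ['a', 'a', 'b'],
                      'time': pd.to_datetime(['2020-01-01 06:00', '2020-01-01 07:00',
                                              '2020-01-02 06:00'])})
  toy_columns = [('id', DataTypes.CATEGORICAL, InputTypes.ID),
                 ('time', DataTypes.DATE, InputTypes.TIME)]
  # adds a real-valued 'time_hour' column and replaces 'id' with float label codes
  return encode(toy, toy_columns, date=['hour'])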
  
def scale(train_data: pd.DataFrame,
          val_data: pd.DataFrame,
          test_data: pd.DataFrame,
          column_definition: List[Tuple[str, DataTypes, InputTypes]],
          scaler: str):
  """Scales numerical data.

  Args:
    train_data: pd.Dataframe, DataFrame of training data.
    val_data: pd.Dataframe, DataFrame of validation data.
    test_data: pd.Dataframe, DataFrame of testing data.
    column_definition: List of tuples describing columns (column_name, data_type, input_type).
    scaler: String, name of the sklearn.preprocessing scaler class to use (e.g. 'StandardScaler'), or 'None' to skip scaling.
  
  Returns:
    train_data: pd.Dataframe, DataFrame of scaled training data.
    val_data: pd.Dataframe, DataFrame of scaled validation data.
    test_data: pd.Dataframe, DataFrame of scaled testing data.
    scalers: Dictionary of fitted scalers indexed by column name (None if no scaling was applied).
  """
  # select all real-valued columns
  columns_to_scale = [column for column, data_type, input_type in column_definition if data_type == DataTypes.REAL_VALUED]
  # handle no scaling case
  if scaler == 'None':
    print('\tNo scaling applied')
    return train_data, val_data, test_data, None
  scalers = {}
  for column in columns_to_scale:
    scaler_column = getattr(preprocessing, scaler)()
    train_data[column] = scaler_column.fit_transform(train_data[column].values.reshape(-1, 1))
    # handle empty validation and test sets
    val_data[column] = scaler_column.transform(val_data[column].values.reshape(-1, 1)) if val_data.shape[0] > 0 else val_data[column]
    test_data[column] = scaler_column.transform(test_data[column].values.reshape(-1, 1)) if test_data.shape[0] > 0 else test_data[column]
    scalers[column] = scaler_column
  # print columns that were scaled
  print('\tScaled columns: {}'.format(columns_to_scale))
  return train_data, val_data, test_data, scalers
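
# Hedged end-of-pipeline sketch for `scale` (illustrative; 'StandardScaler', the column
# name, and the toy values are assumptions): one scaler per real-valued column is fit on
# the training split only and then applied to the validation and test splits.
def _example_scale():
  toy_columns = [('value', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT)]
  train = pd.DataFrame({'value': [5.0, 6.0, 7.0]})
  val = pd.DataFrame({'value': [6.5]})
  test = pd.DataFrame({'value': [8.0]})
  return scale(train, val, test, toy_columns, scaler='StandardScaler')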