"""Functions for doing feature selection during preprocessing."""

from typing import Optional, cast

import numpy as np
from numpy import ndarray
from numpy.typing import NDArray

from .utils import ArrayLike


def run_feature_selection(
    X: ndarray,
    y: ndarray,
    select_k_features: int,
    random_state: Optional[np.random.RandomState] = None,
) -> NDArray[np.bool_]:
    """
    Find most important features.

    Uses a random forest regressor as a proxy for finding
    the k most important features in X, returning a boolean mask
    that selects those features.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.feature_selection import SelectFromModel

    clf = RandomForestRegressor(
        n_estimators=100, max_depth=3, random_state=random_state
    )
    clf.fit(X, y)
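    # With ``threshold=-np.inf``, SelectFromModel applies no importance
    # cutoff, so exactly the top ``select_k_features`` features are kept.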
    selector = SelectFromModel(
        clf, threshold=-np.inf, max_features=select_k_features, prefit=True
    )
    return cast(NDArray[np.bool_], selector.get_support(indices=False))
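
# A minimal usage sketch (illustrative only; the data below is made up):
#
#     rng = np.random.RandomState(0)
#     X = rng.randn(100, 5)
#     y = X[:, 1] + 2.0 * X[:, 3]  # only features 1 and 3 carry signal
#     mask = run_feature_selection(X, y, select_k_features=2, random_state=rng)
#     # ``mask`` is a length-5 boolean array; features 1 and 3 should
#     # typically come out True, and X[:, mask] keeps only those columns.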


# Kept only because it is still used by the module tests.
def _handle_feature_selection(
    X: ndarray,
    select_k_features: Optional[int],
    y: ndarray,
    variable_names: ArrayLike[str],
):
    if select_k_features is not None:
        selection = run_feature_selection(X, y, select_k_features)
        print(f"Using features {[variable_names[i] for i in selection]}")
        X = X[:, selection]
    else:
        selection = None

    return X, selection
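
# Illustrative sketch of the full preprocessing flow (hypothetical data
# and variable names):
#
#     X = np.random.randn(50, 3)
#     y = X[:, 0] ** 2
#     X_small, mask = _handle_feature_selection(
#         X, select_k_features=1, y=y, variable_names=["a", "b", "c"]
#     )
#     # ``X_small`` has shape (50, 1); ``mask`` marks the surviving column.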