"""Unit tests for lime.lime_tabular.LimeTabularExplainer."""
import collections
import unittest

import numpy as np
import sklearn  # noqa
import sklearn.datasets
import sklearn.ensemble
import sklearn.linear_model  # noqa
from numpy.testing import assert_array_equal
from sklearn.datasets import (load_iris, make_classification,
                              make_multilabel_classification)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

from lime.discretize import (QuartileDiscretizer, DecileDiscretizer,
                             EntropyDiscretizer)

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Deprecated in scikit-learn version 0.18, removed in 0.20
    from sklearn.cross_validation import train_test_split

from lime.lime_tabular import LimeTabularExplainer


class TestLimeTabular(unittest.TestCase):
    """End-to-end tests of LimeTabularExplainer on iris and synthetic data."""

    def setUp(self):
        """Build an 80/20 train/test split of the iris dataset."""
        iris = load_iris()
        self.feature_names = iris.feature_names
        self.target_names = iris.target_names
        (self.train,
         self.test,
         self.labels_train,
         self.labels_test) = train_test_split(iris.data,
                                              iris.target,
                                              train_size=0.80)

    def _explain_map(self, X, y, classifier, instance, feature_names,
                     discretizer_cls, discretizer_seed, explainer_seed):
        """Build a fresh discretizer + explainer with the given seeds and
        return the explanation map for ``X[instance]``.

        Args:
            X: training data array.
            y: training labels (used by the entropy discretizer).
            classifier: fitted model exposing ``predict_proba``.
            instance: row index into X to explain.
            feature_names: list of feature name strings.
            discretizer_cls: one of the lime discretizer classes.
            discretizer_seed: random_state for the discretizer.
            explainer_seed: random_state for the explainer.

        Returns:
            ``exp.as_map()`` of the resulting explanation.
        """
        discretizer = discretizer_cls(X, [], feature_names, y,
                                      random_state=discretizer_seed)
        explainer = LimeTabularExplainer(X,
                                         feature_names=feature_names,
                                         discretize_continuous=True,
                                         discretizer=discretizer,
                                         random_state=explainer_seed)
        exp = explainer.explain_instance(X[instance],
                                         classifier.predict_proba,
                                         num_samples=500)
        return exp.as_map()

    def test_lime_explainer_good_regressor(self):
        """A custom model_regressor still recovers the two petal features."""
        np.random.seed(1)
        rf = RandomForestClassifier(n_estimators=500)
        rf.fit(self.train, self.labels_train)
        i = np.random.randint(0, self.test.shape[0])
        explainer = LimeTabularExplainer(self.train,
                                         mode="classification",
                                         feature_names=self.feature_names,
                                         class_names=self.target_names,
                                         discretize_continuous=True)
        exp = explainer.explain_instance(self.test[i],
                                         rf.predict_proba,
                                         num_features=2,
                                         model_regressor=LinearRegression())
        self.assertIsNotNone(exp)
        keys = [x[0] for x in exp.as_list()]
        self.assertEqual(1,
                         sum(1 for x in keys if 'petal width' in x),
                         "Petal Width is a major feature")
        self.assertEqual(1,
                         sum(1 for x in keys if 'petal length' in x),
                         "Petal Length is a major feature")

    def test_lime_explainer_good_regressor_synthetic_data(self):
        """Default explanation on dense synthetic data yields 10 features."""
        X, y = make_classification(n_samples=1000,
                                   n_features=20,
                                   n_informative=2,
                                   n_redundant=2,
                                   random_state=10)
        rf = RandomForestClassifier(n_estimators=500)
        rf.fit(X, y)
        instance = np.random.randint(0, X.shape[0])
        feature_names = ["feature" + str(i) for i in range(20)]
        explainer = LimeTabularExplainer(X,
                                         feature_names=feature_names,
                                         discretize_continuous=True)
        exp = explainer.explain_instance(X[instance], rf.predict_proba)
        self.assertIsNotNone(exp)
        self.assertEqual(10, len(exp.as_list()))

    def test_lime_explainer_sparse_synthetic_data(self):
        """The explainer accepts sparse training data end to end."""
        n_features = 20
        X, y = make_multilabel_classification(n_samples=100,
                                              sparse=True,
                                              n_features=n_features,
                                              n_classes=1,
                                              n_labels=2)
        rf = RandomForestClassifier(n_estimators=500)
        rf.fit(X, y)
        instance = np.random.randint(0, X.shape[0])
        feature_names = ["feature" + str(i) for i in range(n_features)]
        explainer = LimeTabularExplainer(X,
                                         feature_names=feature_names,
                                         discretize_continuous=True)
        exp = explainer.explain_instance(X[instance], rf.predict_proba)
        self.assertIsNotNone(exp)
        self.assertEqual(10, len(exp.as_list()))

    def test_lime_explainer_no_regressor(self):
        """The default local model recovers the two petal features."""
        np.random.seed(1)
        rf = RandomForestClassifier(n_estimators=500)
        rf.fit(self.train, self.labels_train)
        i = np.random.randint(0, self.test.shape[0])
        explainer = LimeTabularExplainer(self.train,
                                         feature_names=self.feature_names,
                                         class_names=self.target_names,
                                         discretize_continuous=True)
        exp = explainer.explain_instance(self.test[i],
                                         rf.predict_proba,
                                         num_features=2)
        self.assertIsNotNone(exp)
        keys = [x[0] for x in exp.as_list()]
        self.assertEqual(1,
                         sum(1 for x in keys if 'petal width' in x),
                         "Petal Width is a major feature")
        self.assertEqual(1,
                         sum(1 for x in keys if 'petal length' in x),
                         "Petal Length is a major feature")

    def test_lime_explainer_entropy_discretizer(self):
        """The entropy discretizer also recovers the two petal features."""
        np.random.seed(1)
        rf = RandomForestClassifier(n_estimators=500)
        rf.fit(self.train, self.labels_train)
        i = np.random.randint(0, self.test.shape[0])
        explainer = LimeTabularExplainer(self.train,
                                         feature_names=self.feature_names,
                                         class_names=self.target_names,
                                         training_labels=self.labels_train,
                                         discretize_continuous=True,
                                         discretizer='entropy')
        exp = explainer.explain_instance(self.test[i],
                                         rf.predict_proba,
                                         num_features=2)
        self.assertIsNotNone(exp)
        keys = [x[0] for x in exp.as_list()]
        self.assertEqual(1,
                         sum(1 for x in keys if 'petal width' in x),
                         "Petal Width is a major feature")
        self.assertEqual(1,
                         sum(1 for x in keys if 'petal length' in x),
                         "Petal Length is a major feature")

    def test_lime_tabular_explainer_equal_random_state(self):
        """Identical seeds must reproduce identical explanations for every
        discretizer type."""
        X, y = make_classification(n_samples=1000,
                                   n_features=20,
                                   n_informative=2,
                                   n_redundant=2,
                                   random_state=10)
        rf = RandomForestClassifier(n_estimators=500, random_state=10)
        rf.fit(X, y)
        instance = np.random.RandomState(10).randint(0, X.shape[0])
        feature_names = ["feature" + str(i) for i in range(20)]

        for discretizer_cls in (QuartileDiscretizer,
                                DecileDiscretizer,
                                EntropyDiscretizer):
            exp_map_1 = self._explain_map(X, y, rf, instance, feature_names,
                                          discretizer_cls, 10, 10)
            exp_map_2 = self._explain_map(X, y, rf, instance, feature_names,
                                          discretizer_cls, 10, 10)
            self.assertDictEqual(exp_map_1, exp_map_2)

    def test_lime_tabular_explainer_not_equal_random_state(self):
        """Differing seeds (discretizer and/or explainer) must change the
        explanation, while matching seeds must not, for every discretizer."""
        X, y = make_classification(n_samples=1000,
                                   n_features=20,
                                   n_informative=2,
                                   n_redundant=2,
                                   random_state=10)
        rf = RandomForestClassifier(n_estimators=500, random_state=10)
        rf.fit(X, y)
        instance = np.random.RandomState(10).randint(0, X.shape[0])
        feature_names = ["feature" + str(i) for i in range(20)]

        # Pairs of (discretizer_seed, explainer_seed) that should disagree:
        # [1] different discretizer seeds, same explainer seed,
        # [2] both seeds different,
        # [3] same discretizer seed, different explainer seeds.
        unequal_seed_pairs = [((20, 10), (10, 10)),
                              ((20, 20), (10, 10)),
                              ((20, 20), (20, 10))]

        for discretizer_cls in (QuartileDiscretizer,
                                DecileDiscretizer,
                                EntropyDiscretizer):
            for (d_seed_1, e_seed_1), (d_seed_2, e_seed_2) in unequal_seed_pairs:
                exp_map_1 = self._explain_map(X, y, rf, instance,
                                              feature_names, discretizer_cls,
                                              d_seed_1, e_seed_1)
                exp_map_2 = self._explain_map(X, y, rf, instance,
                                              feature_names, discretizer_cls,
                                              d_seed_2, e_seed_2)
                self.assertNotEqual(exp_map_1, exp_map_2)
            # [4] Control case: identical seeds reproduce the explanation.
            exp_map_1 = self._explain_map(X, y, rf, instance, feature_names,
                                          discretizer_cls, 20, 20)
            exp_map_2 = self._explain_map(X, y, rf, instance, feature_names,
                                          discretizer_cls, 20, 20)
            self.assertEqual(exp_map_1, exp_map_2)

    def testFeatureNamesAndCategoricalFeats(self):
        """Defaults: stringified column indices as names and every column
        categorical; user-supplied arrays are normalized to lists."""
        training_data = np.array([[0., 1.], [1., 0.]])
        explainer = LimeTabularExplainer(training_data=training_data)
        self.assertEqual(explainer.feature_names, ['0', '1'])
        self.assertEqual(explainer.categorical_features, [0, 1])

        explainer = LimeTabularExplainer(
            training_data=training_data,
            feature_names=np.array(['one', 'two']))
        self.assertEqual(explainer.feature_names, ['one', 'two'])

        explainer = LimeTabularExplainer(
            training_data=training_data,
            categorical_features=np.array([0]),
            discretize_continuous=False)
        self.assertEqual(explainer.categorical_features, [0])

    def testFeatureValues(self):
        """Categorical feature values/frequencies are computed column-wise
        from the training data."""
        training_data = np.array([[0, 0, 2],
                                  [1, 1, 0],
                                  [0, 2, 2],
                                  [1, 3, 0]])
        explainer = LimeTabularExplainer(training_data=training_data,
                                         categorical_features=[0, 1, 2])
        self.assertEqual(set(explainer.feature_values[0]), {0, 1})
        self.assertEqual(set(explainer.feature_values[1]), {0, 1, 2, 3})
        self.assertEqual(set(explainer.feature_values[2]), {0, 2})
        assert_array_equal(explainer.feature_frequencies[0],
                           np.array([.5, .5]))
        assert_array_equal(explainer.feature_frequencies[1],
                           np.array([.25, .25, .25, .25]))
        assert_array_equal(explainer.feature_frequencies[2],
                           np.array([.5, .5]))

    def test_lime_explainer_with_data_stats(self):
        """Precomputed training_data_stats can replace real training data."""
        np.random.seed(1)
        rf = RandomForestClassifier(n_estimators=500)
        rf.fit(self.train, self.labels_train)
        i = np.random.randint(0, self.test.shape[0])

        # Generate stats using a quartile discretizer.
        # NOTE(review): target_names is passed where training labels are
        # expected; presumably quartile binning ignores labels — confirm.
        discretizer = QuartileDiscretizer(self.train, [], self.feature_names,
                                          self.target_names,
                                          random_state=20)
        d_means = discretizer.means
        d_stds = discretizer.stds
        d_mins = discretizer.mins
        d_maxs = discretizer.maxs
        d_bins = discretizer.bins(self.train, self.target_names)

        # Compute feature values and frequencies of all columns.
        cat_features = np.arange(self.train.shape[1])
        discretized_training_data = discretizer.discretize(self.train)

        feature_values = {}
        feature_frequencies = {}
        for feature in cat_features:
            column = discretized_training_data[:, feature]
            feature_count = collections.Counter(column)
            values, frequencies = map(list, zip(*(feature_count.items())))
            feature_values[feature] = values
            feature_frequencies[feature] = frequencies

        # Convert bins from arrays to plain lists, keyed by column index.
        d_bins_revised = {index: column_bins.tolist()
                          for index, column_bins in enumerate(d_bins)}

        # Discretized stats
        data_stats = {
            "means": d_means,
            "stds": d_stds,
            "maxs": d_maxs,
            "mins": d_mins,
            "bins": d_bins_revised,
            "feature_values": feature_values,
            "feature_frequencies": feature_frequencies,
        }

        # Training data itself is a dummy; only the stats matter here.
        data = np.zeros((2, len(self.feature_names)))
        explainer = LimeTabularExplainer(
            data,
            feature_names=self.feature_names,
            random_state=10,
            training_data_stats=data_stats,
            training_labels=self.target_names)
        exp = explainer.explain_instance(self.test[i],
                                         rf.predict_proba,
                                         num_features=2,
                                         model_regressor=LinearRegression())
        self.assertIsNotNone(exp)
        keys = [x[0] for x in exp.as_list()]
        self.assertEqual(1,
                         sum(1 for x in keys if 'petal width' in x),
                         "Petal Width is a major feature")
        self.assertEqual(1,
                         sum(1 for x in keys if 'petal length' in x),
                         "Petal Length is a major feature")


if __name__ == '__main__':
    unittest.main()